kash-shell 0.3.23__py3-none-any.whl → 0.3.25__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (32)
  1. kash/actions/core/combine_docs.py +52 -0
  2. kash/actions/core/concat_docs.py +47 -0
  3. kash/commands/workspace/workspace_commands.py +2 -2
  4. kash/config/logger.py +3 -2
  5. kash/config/settings.py +8 -0
  6. kash/docs/markdown/topics/a2_installation.md +2 -2
  7. kash/embeddings/embeddings.py +4 -6
  8. kash/embeddings/text_similarity.py +2 -5
  9. kash/exec/action_exec.py +1 -1
  10. kash/exec/fetch_url_items.py +36 -8
  11. kash/help/help_embeddings.py +3 -0
  12. kash/llm_utils/llm_completion.py +1 -1
  13. kash/llm_utils/llm_features.py +1 -1
  14. kash/llm_utils/llms.py +5 -7
  15. kash/mcp/mcp_cli.py +2 -2
  16. kash/model/params_model.py +1 -1
  17. kash/utils/api_utils/api_retries.py +84 -76
  18. kash/utils/api_utils/gather_limited.py +227 -89
  19. kash/utils/api_utils/http_utils.py +46 -0
  20. kash/utils/api_utils/progress_protocol.py +49 -56
  21. kash/utils/rich_custom/multitask_status.py +70 -21
  22. kash/utils/text_handling/doc_normalization.py +2 -0
  23. kash/utils/text_handling/markdown_utils.py +14 -3
  24. kash/web_content/web_extract.py +12 -8
  25. kash/web_content/web_fetch.py +289 -60
  26. kash/web_content/web_page_model.py +5 -0
  27. kash/web_gen/templates/base_styles.css.jinja +8 -1
  28. {kash_shell-0.3.23.dist-info → kash_shell-0.3.25.dist-info}/METADATA +6 -4
  29. {kash_shell-0.3.23.dist-info → kash_shell-0.3.25.dist-info}/RECORD +32 -29
  30. {kash_shell-0.3.23.dist-info → kash_shell-0.3.25.dist-info}/WHEEL +0 -0
  31. {kash_shell-0.3.23.dist-info → kash_shell-0.3.25.dist-info}/entry_points.txt +0 -0
  32. {kash_shell-0.3.23.dist-info → kash_shell-0.3.25.dist-info}/licenses/LICENSE +0 -0
kash/web_content/web_fetch.py
@@ -2,11 +2,13 @@ from __future__ import annotations
 
 import logging
 from dataclasses import dataclass
-from functools import cached_property
+from enum import Enum
+from functools import cache, cached_property
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 from urllib.parse import urlparse
 
+from cachetools import TTLCache
 from strif import atomic_output_file, copyfile_atomic
 
 from kash.config.env_settings import KashEnv
@@ -14,59 +16,245 @@ from kash.utils.common.url import Url
 from kash.utils.file_utils.file_formats import MimeType
 
 if TYPE_CHECKING:
-    from httpx import Client, Response
+    from curl_cffi.requests import Response as CurlCffiResponse
+    from curl_cffi.requests import Session as CurlCffiSession
+    from httpx import Client as HttpxClient
+    from httpx import Response as HttpxResponse
 
 log = logging.getLogger(__name__)
 
 
 DEFAULT_TIMEOUT = 30
+CURL_CFFI_IMPERSONATE_VERSION = "chrome120"
 
-
-DEFAULT_USER_AGENT = (
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:126.0) Gecko/20100101 Firefox/126.0"
+# Header helpers
+_DEFAULT_UA = (
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_3) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/126.0.0.0 Safari/537.36"
 )
+_SIMPLE_HEADERS = {"User-Agent": KashEnv.KASH_USER_AGENT.read_str(default=_DEFAULT_UA)}
+
+
+class ClientMode(Enum):
+    """
+    Defines the web client and settings.
+    """
+
+    SIMPLE = "SIMPLE"
+    """httpx with minimal headers"""
+
+    BROWSER_HEADERS = "BROWSER_HEADERS"
+    """httpx with extensive, manually-set headers"""
+
+    CURL_CFFI = "CURL_CFFI"
+    """curl_cffi for full browser impersonation (incl. TLS)"""
+
+    AUTO = "AUTO"
+    """Automatically pick CURL_CFFI if available, otherwise BROWSER_HEADERS"""
+
+
+@cache
+def _have_brotli() -> bool:
+    """
+    Check if brotli compression is available.
+    Warns once if brotli is not installed.
+    """
+    try:
+        import brotli  # noqa: F401
+
+        return True
+    except ImportError:
+        log.warning("web_fetch: brotli package not found; install for better download performance")
+        return False
+
+
+@cache
+def _have_curl_cffi() -> bool:
+    """
+    Check if curl_cffi is available.
+    Warns once if curl_cffi is not installed.
+    """
+    try:
+        import curl_cffi.requests  # noqa: F401
+
+        return True
+    except ImportError:
+        log.warning("web_fetch: curl_cffi package not found; install for browser impersonation")
+        return False
 
 
-def default_headers() -> dict[str, str]:
-    return {"User-Agent": KashEnv.KASH_USER_AGENT.read_str(default=DEFAULT_USER_AGENT)}
+@cache
+def _get_auto_mode() -> ClientMode:
+    """
+    Automatically select the best available client mode.
+    Logs the decision once due to caching.
+    """
+    if _have_curl_cffi():
+        log.info("web_fetch: AUTO mode selected CURL_CFFI (full browser impersonation)")
+        return ClientMode.CURL_CFFI
+    else:
+        log.info("web_fetch: AUTO mode selected BROWSER_HEADERS (httpx with browser headers)")
+        return ClientMode.BROWSER_HEADERS
+
+
+@cache
+def _browser_like_headers() -> dict[str, str]:
+    """
+    Full header set that looks like a 2025-era Chrome GET.
+    """
+    ua = KashEnv.KASH_USER_AGENT.read_str(default=_DEFAULT_UA)
+
+    # Build Accept-Encoding based on available compression support
+    encodings = ["gzip", "deflate"]
+    if _have_brotli():
+        encodings.append("br")
+    accept_encoding = ", ".join(encodings)
+
+    return {
+        "User-Agent": ua,
+        "Accept": (
+            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
+        ),
+        "Accept-Language": "en-US,en;q=0.9",
+        "Accept-Encoding": accept_encoding,
+        "Referer": "https://www.google.com/",
+        "DNT": "1",
+        "Upgrade-Insecure-Requests": "1",
+    }
+
+
+# Cookie priming cache - tracks which hosts have been primed
+_primed_hosts = TTLCache(maxsize=10000, ttl=3600)
+
+
+def _prime_host(host: str, client: HttpxClient | CurlCffiSession, timeout: int, **kwargs) -> bool:
+    """
+    Prime cookies for a host using the provided client and extra arguments.
+    """
+    if host in _primed_hosts:
+        log.debug("Cookie priming for %s skipped (cached)", host)
+        return True
+
+    try:
+        root = f"https://{host}/"
+        # Pass client-specific kwargs like `impersonate`
+        client.get(root, timeout=timeout, **kwargs)
+        log.debug("Cookie priming completed for %s", host)
+    except Exception as exc:
+        log.debug("Cookie priming for %s failed (%s); continuing", host, exc)
+
+    # Mark as primed (both success and failure to avoid immediate retries)
+    _primed_hosts[host] = True
+    return True
+
+
+def _get_req_headers(
+    mode: ClientMode, user_headers: dict[str, str] | None = None
+) -> dict[str, str]:
+    """
+    Build headers based on the selected ClientMode.
+    For CURL_CFFI, curl_cffi handles headers automatically.
+    """
+    if mode is ClientMode.AUTO:
+        mode = _get_auto_mode()
+
+    base_headers = {}
+    if mode is ClientMode.SIMPLE:
+        base_headers = _SIMPLE_HEADERS
+    elif mode is ClientMode.BROWSER_HEADERS:
+        base_headers = _browser_like_headers()
+    elif mode is ClientMode.CURL_CFFI:
+        # curl_cffi handles the important headers (UA, Accept-*, etc.)
+        # We only need to add user-provided ones.
+        return user_headers or {}
+
+    if user_headers:
+        return {**base_headers, **user_headers}
+
+    return base_headers
 
 
 def fetch_url(
     url: Url,
+    *,
     timeout: int = DEFAULT_TIMEOUT,
     auth: Any | None = None,
     headers: dict[str, str] | None = None,
-) -> Response:
+    mode: ClientMode = ClientMode.AUTO,
+) -> HttpxResponse | CurlCffiResponse:
     """
-    Fetch a URL using httpx with logging and reasonable defaults.
-    Raise httpx.HTTPError for non-2xx responses.
+    Fetch a URL, dispatching to httpx or curl_cffi based on the mode.
     """
-    import httpx
+    if mode is ClientMode.AUTO:
+        mode = _get_auto_mode()
+
+    req_headers = _get_req_headers(mode, headers)
+    parsed_url = urlparse(str(url))
+
+    # Handle curl_cffi mode
+    if mode is ClientMode.CURL_CFFI:
+        if not _have_curl_cffi():
+            raise ValueError("Could not find curl_cffi, which is needed for CURL_CFFI mode")
 
-    with httpx.Client(
-        follow_redirects=True,
-        timeout=timeout,
-        auth=auth,
-        headers=headers or default_headers(),
-    ) as client:
-        log.debug("fetch_url: using headers: %s", client.headers)
-        response = client.get(url)
-        log.info("Fetched: %s (%s bytes): %s", response.status_code, len(response.content), url)
-        response.raise_for_status()
-        return response
+        from curl_cffi.requests import Session
+
+        with Session() as client:
+            # Set headers on the session - they will be sent with all requests
+            client.headers.update(req_headers)
+            _prime_host(
+                parsed_url.netloc, client, timeout, impersonate=CURL_CFFI_IMPERSONATE_VERSION
+            )
+            log.debug("fetch_url (curl_cffi): using session headers: %s", client.headers)
+            response = client.get(
                url,
+                impersonate=CURL_CFFI_IMPERSONATE_VERSION,
+                timeout=timeout,
+                auth=auth,
+                allow_redirects=True,
+            )
+            log.info(
+                "Fetched (curl_cffi): %s (%s bytes): %s",
+                response.status_code,
+                len(response.content),
+                url,
+            )
+            response.raise_for_status()
+            return response
+
+    # Handle httpx modes
+    else:
+        import httpx
+
+        with httpx.Client(
+            follow_redirects=True,
+            timeout=timeout,
+            auth=auth,
+            headers=req_headers,
+        ) as client:
+            log.debug("fetch_url (httpx): using headers: %s", client.headers)
+
+            # Cookie priming only makes sense for the browser-like mode
+            if mode is ClientMode.BROWSER_HEADERS:
+                _prime_host(parsed_url.netloc, client, timeout)
+
+            response = client.get(url)
+            log.info(
+                "Fetched (httpx): %s (%s bytes): %s",
+                response.status_code,
+                len(response.content),
+                url,
+            )
+            response.raise_for_status()
+            return response
 
 
 @dataclass(frozen=True)
 class HttpHeaders:
-    """
-    HTTP response headers.
-    """
-
     headers: dict[str, str]
 
     @cached_property
     def mime_type(self) -> MimeType | None:
-        """Get content type header, if available."""
         for key, value in self.headers.items():
             if key.lower() == "content-type":
                 return MimeType(value)
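
Note: `fetch_url` now takes its options as keyword-only arguments and accepts a `mode`. In CURL_CFFI and BROWSER_HEADERS modes, `_prime_host` first hits the site root to collect cookies, memoized per host for an hour via the `TTLCache` above. A minimal usage sketch based on the signature in this diff (the URLs are placeholders):

```python
# Hypothetical usage of the new fetch_url API shown in this diff.
from kash.utils.common.url import Url
from kash.web_content.web_fetch import ClientMode, fetch_url

# AUTO resolves to CURL_CFFI when curl_cffi is installed, else BROWSER_HEADERS.
response = fetch_url(Url("https://example.com/"), timeout=10, mode=ClientMode.AUTO)

# Force plain httpx with minimal headers, e.g. for well-behaved endpoints.
response = fetch_url(Url("https://example.com/robots.txt"), mode=ClientMode.SIMPLE)
print(response.status_code, len(response.content))
```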
@@ -76,11 +264,12 @@ class HttpHeaders:
 def download_url(
     url: Url,
     target_filename: str | Path,
-    session: Client | None = None,
+    *,
     show_progress: bool = False,
     timeout: int = DEFAULT_TIMEOUT,
     auth: Any | None = None,
     headers: dict[str, str] | None = None,
+    mode: ClientMode = ClientMode.AUTO,
 ) -> HttpHeaders | None:
     """
     Download given file, optionally with progress bar, streaming to a target file.
@@ -88,8 +277,8 @@ def download_url(
     Raise httpx.HTTPError for non-2xx responses.
     Returns response headers for HTTP/HTTPS requests, None for other URL types.
     """
-    import httpx
-    from tqdm import tqdm
+    if mode is ClientMode.AUTO:
+        mode = _get_auto_mode()
 
     target_filename = str(target_filename)
     parsed_url = urlparse(url)
@@ -106,39 +295,79 @@ def download_url(
         s3_path = parsed_url.path.lstrip("/")
         s3.Bucket(parsed_url.netloc).download_file(s3_path, target_filename)
         return None
-    else:
-        client = session or httpx.Client(follow_redirects=True, timeout=timeout)
-        response: httpx.Response | None = None
-        response_headers: dict[str, str] | None = None
-        try:
-            headers = headers or default_headers()
-            log.debug("download_url: using headers: %s", headers)
-            with client.stream(
-                "GET",
+
+    req_headers = _get_req_headers(mode, headers)
+    response_headers = None
+
+    def stream_to_file(response_iterator, total_size):
+        with atomic_output_file(target_filename, make_parents=True) as temp_filename:
+            with open(temp_filename, "wb") as f:
+                if not show_progress:
+                    for chunk in response_iterator:
+                        if chunk:  # Skip empty chunks
+                            f.write(chunk)
+                else:
+                    from tqdm import tqdm
+
+                    with tqdm(
+                        total=total_size,
+                        unit="B",
+                        unit_scale=True,
+                        desc=f"Downloading {Path(target_filename).name}",
+                    ) as progress:
+                        for chunk in response_iterator:
+                            if chunk:  # Skip empty chunks
+                                f.write(chunk)
+                                progress.update(len(chunk))
+
+    # Handle curl_cffi mode
+    if mode is ClientMode.CURL_CFFI:
+        if not _have_curl_cffi():
+            raise ValueError("Could not find curl_cffi, which is needed for CURL_CFFI mode")
+
+        from curl_cffi.requests import Session
+
+        with Session() as client:
+            # Set headers on the session; they will be sent with all requests
+            client.headers.update(req_headers)
+            _prime_host(
+                parsed_url.netloc, client, timeout, impersonate=CURL_CFFI_IMPERSONATE_VERSION
+            )
+            log.debug("download_url (curl_cffi): using session headers: %s", client.headers)
+
+            response = client.get(
                 url,
-                follow_redirects=True,
+                impersonate=CURL_CFFI_IMPERSONATE_VERSION,
                 timeout=timeout,
                 auth=auth,
-                headers=headers,
-            ) as response:
+                allow_redirects=True,
+                stream=True,
+            )
+            response.raise_for_status()
+            response_headers = dict(response.headers)
+            total = int(response.headers.get("content-length", "0"))
+
+            # Use iter_content for streaming; this is the standard method for curl_cffi
+            chunk_iterator = response.iter_content(chunk_size=8192)
+            stream_to_file(chunk_iterator, total)
+
+    # Handle httpx modes
+    else:
+        import httpx
+
+        with httpx.Client(follow_redirects=True, timeout=timeout, headers=req_headers) as client:
+            if mode is ClientMode.BROWSER_HEADERS:
+                _prime_host(parsed_url.netloc, client, timeout)
+
+            log.debug("download_url (httpx): using headers: %s", client.headers)
+            with client.stream("GET", url, auth=auth, follow_redirects=True) as response:
                 response.raise_for_status()
                 response_headers = dict(response.headers)
-                total_size = int(response.headers.get("content-length", "0"))
+                total = int(response.headers.get("content-length", "0"))
+                stream_to_file(response.iter_bytes(), total)
 
-                with atomic_output_file(target_filename, make_parents=True) as temp_filename:
-                    with open(temp_filename, "wb") as f:
-                        if not show_progress:
-                            for chunk in response.iter_bytes():
-                                f.write(chunk)
-                        else:
-                            with tqdm(total=total_size, unit="B", unit_scale=True) as progress:
-                                for chunk in response.iter_bytes():
-                                    f.write(chunk)
-                                    progress.update(len(chunk))
-        finally:
-            if not session:  # Only close if we created the client
-                client.close()
-        if response:
-            response.raise_for_status()  # In case of errors during streaming
-
-        return HttpHeaders(response_headers) if response_headers else None
+    # Filter out None values from headers for HttpHeaders type compatibility
+    if response_headers:
+        clean_headers = {k: v for k, v in response_headers.items() if v is not None}
+        return HttpHeaders(clean_headers)
+    return None
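
Likewise for the reworked `download_url`: the `session` parameter is gone, options are keyword-only, and non-HTTP(S) URLs return no headers. A sketch with placeholder URL and paths:

```python
# Hypothetical usage of the reworked download_url shown in this diff.
from kash.utils.common.url import Url
from kash.web_content.web_fetch import ClientMode, download_url

headers = download_url(
    Url("https://example.com/data.bin"),
    "downloads/data.bin",
    show_progress=True,  # streams via stream_to_file with a tqdm progress bar
    mode=ClientMode.BROWSER_HEADERS,  # httpx with browser-like headers and cookie priming
)
if headers is not None:  # None for file:// and s3:// URLs
    print(headers.mime_type)
```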
kash/web_content/web_page_model.py
@@ -6,6 +6,7 @@ from pydantic.dataclasses import dataclass
 
 from kash.utils.common.url import Url
 from kash.utils.file_utils.file_formats_model import FileFormatInfo
+from kash.web_content.local_file_cache import CacheResult
 
 
 @dataclass
@@ -18,6 +19,9 @@ class WebPageData:
     The `clean_html` field should be a clean HTML version of the page, if available.
     The `saved_content` is optional but can be used to reference the original content,
     especially for large or non-text content.
+
+    Optionally exposes the cache result for the content, so the client can have
+    information about headers and whether it was cached.
     """
 
     locator: Url | Path
@@ -29,6 +33,7 @@ class WebPageData:
     saved_content: Path | None = None
     format_info: FileFormatInfo | None = None
     thumbnail_url: Url | None = None
+    cache_result: CacheResult | None = None
 
     def __repr__(self):
         return abbrev_obj(self)
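
The new `cache_result` field defaults to `None`, so existing `WebPageData` call sites are unaffected. A small sketch (the fields of `CacheResult` are not shown in this diff, so only presence is checked):

```python
# Hypothetical helper: detect whether fetch/cache metadata was attached.
def has_cache_info(page) -> bool:
    return page.cache_result is not None
```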
kash/web_gen/templates/base_styles.css.jinja
@@ -22,6 +22,7 @@
   {% endblock root_variables %}
 }
 
+/* CSS color definitions. */
 {{ color_defs|safe }}
 
 {% block selection_styles %}
@@ -145,7 +146,7 @@ h2 + h3 {
 }
 
 h3 {
-  font-size: 1.18rem;
+  font-size: 1.15rem;
   margin-top: 1.4rem;
   margin-bottom: 0.7rem;
 }
@@ -662,6 +663,12 @@ sup {
     max-width: none;
   }
 
+  /* Smaller table text on mobile. */
+  table code,
+  table pre {
+    font-size: var(--font-size-mono-small);
+  }
+
   ul, ol {
     margin-left: 1rem;
   }
{kash_shell-0.3.23.dist-info → kash_shell-0.3.25.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kash-shell
-Version: 0.3.23
+Version: 0.3.25
 Summary: The knowledge agent shell (core)
 Project-URL: Repository, https://github.com/jlevy/kash-shell
 Author-email: Joshua Levy <joshua@cal.berkeley.edu>
@@ -24,12 +24,14 @@ Requires-Dist: chopdiff>=0.2.3
 Requires-Dist: clideps>=0.1.4
 Requires-Dist: colour>=0.1.5
 Requires-Dist: cssselect>=1.2.0
+Requires-Dist: curl-cffi>=0.11.4
 Requires-Dist: deepgram-sdk>=3.10.1
 Requires-Dist: dunamai>=1.23.0
 Requires-Dist: fastapi>=0.115.11
-Requires-Dist: flowmark>=0.4.6
+Requires-Dist: flowmark>=0.4.8
 Requires-Dist: frontmatter-format>=0.2.1
 Requires-Dist: funlog>=0.2.0
+Requires-Dist: httpx[brotli]>=0.28.1
 Requires-Dist: humanfriendly>=10.0
 Requires-Dist: inquirerpy>=0.3.4
 Requires-Dist: jinja2>=3.1.6
@@ -329,7 +331,7 @@ These are for `kash-media` but you can use a `kash-shell` for a more basic setup
 
 You can use kash from your MCP client (such as Anthropic Desktop or Cursor).
 
-You do this by running the `kash_mcp` binary to make kash actions available as MCP
+You do this by running the `kash-mcp` binary to make kash actions available as MCP
 tools.
 
 For Claude Desktop, my config looks like this:
@@ -338,7 +340,7 @@ For Claude Desktop, my config looks like this:
 {
   "mcpServers": {
     "kash": {
-      "command": "/Users/levy/.local/bin/kash_mcp",
+      "command": "/Users/levy/.local/bin/kash-mcp",
       "args": ["--proxy"]
     }
   }