kash-shell 0.3.23__py3-none-any.whl → 0.3.25__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (32)
  1. kash/actions/core/combine_docs.py +52 -0
  2. kash/actions/core/concat_docs.py +47 -0
  3. kash/commands/workspace/workspace_commands.py +2 -2
  4. kash/config/logger.py +3 -2
  5. kash/config/settings.py +8 -0
  6. kash/docs/markdown/topics/a2_installation.md +2 -2
  7. kash/embeddings/embeddings.py +4 -6
  8. kash/embeddings/text_similarity.py +2 -5
  9. kash/exec/action_exec.py +1 -1
  10. kash/exec/fetch_url_items.py +36 -8
  11. kash/help/help_embeddings.py +3 -0
  12. kash/llm_utils/llm_completion.py +1 -1
  13. kash/llm_utils/llm_features.py +1 -1
  14. kash/llm_utils/llms.py +5 -7
  15. kash/mcp/mcp_cli.py +2 -2
  16. kash/model/params_model.py +1 -1
  17. kash/utils/api_utils/api_retries.py +84 -76
  18. kash/utils/api_utils/gather_limited.py +227 -89
  19. kash/utils/api_utils/http_utils.py +46 -0
  20. kash/utils/api_utils/progress_protocol.py +49 -56
  21. kash/utils/rich_custom/multitask_status.py +70 -21
  22. kash/utils/text_handling/doc_normalization.py +2 -0
  23. kash/utils/text_handling/markdown_utils.py +14 -3
  24. kash/web_content/web_extract.py +12 -8
  25. kash/web_content/web_fetch.py +289 -60
  26. kash/web_content/web_page_model.py +5 -0
  27. kash/web_gen/templates/base_styles.css.jinja +8 -1
  28. {kash_shell-0.3.23.dist-info → kash_shell-0.3.25.dist-info}/METADATA +6 -4
  29. {kash_shell-0.3.23.dist-info → kash_shell-0.3.25.dist-info}/RECORD +32 -29
  30. {kash_shell-0.3.23.dist-info → kash_shell-0.3.25.dist-info}/WHEEL +0 -0
  31. {kash_shell-0.3.23.dist-info → kash_shell-0.3.25.dist-info}/entry_points.txt +0 -0
  32. {kash_shell-0.3.23.dist-info → kash_shell-0.3.25.dist-info}/licenses/LICENSE +0 -0
kash/web_content/web_fetch.py
@@ -2,11 +2,13 @@ from __future__ import annotations
 
 import logging
 from dataclasses import dataclass
-from functools import cached_property
+from enum import Enum
+from functools import cache, cached_property
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 from urllib.parse import urlparse
 
+from cachetools import TTLCache
 from strif import atomic_output_file, copyfile_atomic
 
 from kash.config.env_settings import KashEnv
@@ -14,59 +16,245 @@ from kash.utils.common.url import Url
 from kash.utils.file_utils.file_formats import MimeType
 
 if TYPE_CHECKING:
-    from httpx import Client, Response
+    from curl_cffi.requests import Response as CurlCffiResponse
+    from curl_cffi.requests import Session as CurlCffiSession
+    from httpx import Client as HttpxClient
+    from httpx import Response as HttpxResponse
 
 log = logging.getLogger(__name__)
 
 
 DEFAULT_TIMEOUT = 30
+CURL_CFFI_IMPERSONATE_VERSION = "chrome120"
 
-
-DEFAULT_USER_AGENT = (
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:126.0) Gecko/20100101 Firefox/126.0"
+# Header helpers
+_DEFAULT_UA = (
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_3) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/126.0.0.0 Safari/537.36"
 )
+_SIMPLE_HEADERS = {"User-Agent": KashEnv.KASH_USER_AGENT.read_str(default=_DEFAULT_UA)}
+
+
+class ClientMode(Enum):
+    """
+    Defines the web client and settings.
+    """
+
+    SIMPLE = "SIMPLE"
+    """httpx with minimal headers"""
+
+    BROWSER_HEADERS = "BROWSER_HEADERS"
+    """httpx with extensive, manually-set headers"""
+
+    CURL_CFFI = "CURL_CFFI"
+    """curl_cffi for full browser impersonation (incl. TLS)"""
+
+    AUTO = "AUTO"
+    """Automatically pick CURL_CFFI if available, otherwise BROWSER_HEADERS"""
+
+
+@cache
+def _have_brotli() -> bool:
+    """
+    Check if brotli compression is available.
+    Warns once if brotli is not installed.
+    """
+    try:
+        import brotli  # noqa: F401
+
+        return True
+    except ImportError:
+        log.warning("web_fetch: brotli package not found; install for better download performance")
+        return False
+
+
+@cache
+def _have_curl_cffi() -> bool:
+    """
+    Check if curl_cffi is available.
+    Warns once if curl_cffi is not installed.
+    """
+    try:
+        import curl_cffi.requests  # noqa: F401
+
+        return True
+    except ImportError:
+        log.warning("web_fetch: curl_cffi package not found; install for browser impersonation")
+        return False
 
 
-def default_headers() -> dict[str, str]:
-    return {"User-Agent": KashEnv.KASH_USER_AGENT.read_str(default=DEFAULT_USER_AGENT)}
+@cache
+def _get_auto_mode() -> ClientMode:
+    """
+    Automatically select the best available client mode.
+    Logs the decision once due to caching.
+    """
+    if _have_curl_cffi():
+        log.info("web_fetch: AUTO mode selected CURL_CFFI (full browser impersonation)")
+        return ClientMode.CURL_CFFI
+    else:
+        log.info("web_fetch: AUTO mode selected BROWSER_HEADERS (httpx with browser headers)")
+        return ClientMode.BROWSER_HEADERS
+
+
+@cache
+def _browser_like_headers() -> dict[str, str]:
+    """
+    Full header set that looks like a 2025-era Chrome GET.
+    """
+    ua = KashEnv.KASH_USER_AGENT.read_str(default=_DEFAULT_UA)
+
+    # Build Accept-Encoding based on available compression support
+    encodings = ["gzip", "deflate"]
+    if _have_brotli():
+        encodings.append("br")
+    accept_encoding = ", ".join(encodings)
+
+    return {
+        "User-Agent": ua,
+        "Accept": (
+            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
+        ),
+        "Accept-Language": "en-US,en;q=0.9",
+        "Accept-Encoding": accept_encoding,
+        "Referer": "https://www.google.com/",
+        "DNT": "1",
+        "Upgrade-Insecure-Requests": "1",
+    }
+
+
+# Cookie priming cache - tracks which hosts have been primed
+_primed_hosts = TTLCache(maxsize=10000, ttl=3600)
+
+
+def _prime_host(host: str, client: HttpxClient | CurlCffiSession, timeout: int, **kwargs) -> bool:
+    """
+    Prime cookies for a host using the provided client and extra arguments.
+    """
+    if host in _primed_hosts:
+        log.debug("Cookie priming for %s skipped (cached)", host)
+        return True
+
+    try:
+        root = f"https://{host}/"
+        # Pass client-specific kwargs like `impersonate`
+        client.get(root, timeout=timeout, **kwargs)
+        log.debug("Cookie priming completed for %s", host)
+    except Exception as exc:
+        log.debug("Cookie priming for %s failed (%s); continuing", host, exc)
+
+    # Mark as primed (both success and failure to avoid immediate retries)
+    _primed_hosts[host] = True
+    return True
+
+
+def _get_req_headers(
+    mode: ClientMode, user_headers: dict[str, str] | None = None
+) -> dict[str, str]:
+    """
+    Build headers based on the selected ClientMode.
+    For CURL_CFFI, curl_cffi handles headers automatically.
+    """
+    if mode is ClientMode.AUTO:
+        mode = _get_auto_mode()
+
+    base_headers = {}
+    if mode is ClientMode.SIMPLE:
+        base_headers = _SIMPLE_HEADERS
+    elif mode is ClientMode.BROWSER_HEADERS:
+        base_headers = _browser_like_headers()
+    elif mode is ClientMode.CURL_CFFI:
+        # curl_cffi handles the important headers (UA, Accept-*, etc.)
+        # We only need to add user-provided ones.
+        return user_headers or {}
+
+    if user_headers:
+        return {**base_headers, **user_headers}
+
+    return base_headers
 
 
 def fetch_url(
     url: Url,
+    *,
     timeout: int = DEFAULT_TIMEOUT,
     auth: Any | None = None,
     headers: dict[str, str] | None = None,
-) -> Response:
+    mode: ClientMode = ClientMode.AUTO,
+) -> HttpxResponse | CurlCffiResponse:
     """
-    Fetch a URL using httpx with logging and reasonable defaults.
-    Raise httpx.HTTPError for non-2xx responses.
+    Fetch a URL, dispatching to httpx or curl_cffi based on the mode.
     """
-    import httpx
+    if mode is ClientMode.AUTO:
+        mode = _get_auto_mode()
+
+    req_headers = _get_req_headers(mode, headers)
+    parsed_url = urlparse(str(url))
+
+    # Handle curl_cffi mode
+    if mode is ClientMode.CURL_CFFI:
+        if not _have_curl_cffi():
+            raise ValueError("Could not find curl_cffi, which is needed for CURL_CFFI mode")
 
-    with httpx.Client(
-        follow_redirects=True,
-        timeout=timeout,
-        auth=auth,
-        headers=headers or default_headers(),
-    ) as client:
-        log.debug("fetch_url: using headers: %s", client.headers)
-        response = client.get(url)
-        log.info("Fetched: %s (%s bytes): %s", response.status_code, len(response.content), url)
-        response.raise_for_status()
-        return response
+        from curl_cffi.requests import Session
+
+        with Session() as client:
+            # Set headers on the session - they will be sent with all requests
+            client.headers.update(req_headers)
+            _prime_host(
+                parsed_url.netloc, client, timeout, impersonate=CURL_CFFI_IMPERSONATE_VERSION
+            )
+            log.debug("fetch_url (curl_cffi): using session headers: %s", client.headers)
+            response = client.get(
                url,
+                impersonate=CURL_CFFI_IMPERSONATE_VERSION,
+                timeout=timeout,
+                auth=auth,
+                allow_redirects=True,
+            )
+            log.info(
+                "Fetched (curl_cffi): %s (%s bytes): %s",
+                response.status_code,
+                len(response.content),
+                url,
+            )
+            response.raise_for_status()
+            return response
+
+    # Handle httpx modes
+    else:
+        import httpx
+
+        with httpx.Client(
+            follow_redirects=True,
+            timeout=timeout,
+            auth=auth,
+            headers=req_headers,
+        ) as client:
+            log.debug("fetch_url (httpx): using headers: %s", client.headers)
+
+            # Cookie priming only makes sense for the browser-like mode
+            if mode is ClientMode.BROWSER_HEADERS:
+                _prime_host(parsed_url.netloc, client, timeout)
+
+            response = client.get(url)
+            log.info(
+                "Fetched (httpx): %s (%s bytes): %s",
+                response.status_code,
+                len(response.content),
+                url,
+            )
+            response.raise_for_status()
+            return response
 
 
 @dataclass(frozen=True)
 class HttpHeaders:
-    """
-    HTTP response headers.
-    """
-
     headers: dict[str, str]
 
     @cached_property
     def mime_type(self) -> MimeType | None:
-        """Get content type header, if available."""
         for key, value in self.headers.items():
             if key.lower() == "content-type":
                 return MimeType(value)
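
Note: `fetch_url` now takes its options as keyword-only arguments and accepts a `mode`. In CURL_CFFI and BROWSER_HEADERS modes, `_prime_host` first hits the site root to collect cookies, memoized per host for an hour via the `TTLCache` above. A minimal usage sketch based on the signature in this diff (the URLs are placeholders):

```python
# Hypothetical usage of the new fetch_url API shown in this diff.
from kash.utils.common.url import Url
from kash.web_content.web_fetch import ClientMode, fetch_url

# AUTO resolves to CURL_CFFI when curl_cffi is installed, else BROWSER_HEADERS.
response = fetch_url(Url("https://example.com/"), timeout=10, mode=ClientMode.AUTO)

# Force plain httpx with minimal headers, e.g. for well-behaved endpoints.
response = fetch_url(Url("https://example.com/robots.txt"), mode=ClientMode.SIMPLE)
print(response.status_code, len(response.content))
```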
@@ -76,11 +264,12 @@ class HttpHeaders:
 def download_url(
     url: Url,
     target_filename: str | Path,
-    session: Client | None = None,
+    *,
     show_progress: bool = False,
     timeout: int = DEFAULT_TIMEOUT,
     auth: Any | None = None,
     headers: dict[str, str] | None = None,
+    mode: ClientMode = ClientMode.AUTO,
 ) -> HttpHeaders | None:
     """
     Download given file, optionally with progress bar, streaming to a target file.
@@ -88,8 +277,8 @@ def download_url(
     Raise httpx.HTTPError for non-2xx responses.
     Returns response headers for HTTP/HTTPS requests, None for other URL types.
     """
-    import httpx
-    from tqdm import tqdm
+    if mode is ClientMode.AUTO:
+        mode = _get_auto_mode()
 
     target_filename = str(target_filename)
     parsed_url = urlparse(url)
@@ -106,39 +295,79 @@ def download_url(
         s3_path = parsed_url.path.lstrip("/")
         s3.Bucket(parsed_url.netloc).download_file(s3_path, target_filename)
         return None
-    else:
-        client = session or httpx.Client(follow_redirects=True, timeout=timeout)
-        response: httpx.Response | None = None
-        response_headers: dict[str, str] | None = None
-        try:
-            headers = headers or default_headers()
-            log.debug("download_url: using headers: %s", headers)
-            with client.stream(
-                "GET",
+
+    req_headers = _get_req_headers(mode, headers)
+    response_headers = None
+
+    def stream_to_file(response_iterator, total_size):
+        with atomic_output_file(target_filename, make_parents=True) as temp_filename:
+            with open(temp_filename, "wb") as f:
+                if not show_progress:
+                    for chunk in response_iterator:
+                        if chunk:  # Skip empty chunks
+                            f.write(chunk)
+                else:
+                    from tqdm import tqdm
+
+                    with tqdm(
+                        total=total_size,
+                        unit="B",
+                        unit_scale=True,
+                        desc=f"Downloading {Path(target_filename).name}",
+                    ) as progress:
+                        for chunk in response_iterator:
+                            if chunk:  # Skip empty chunks
+                                f.write(chunk)
+                                progress.update(len(chunk))
+
+    # Handle curl_cffi mode
+    if mode is ClientMode.CURL_CFFI:
+        if not _have_curl_cffi():
+            raise ValueError("Could not find curl_cffi, which is needed for CURL_CFFI mode")
+
+        from curl_cffi.requests import Session
+
+        with Session() as client:
+            # Set headers on the session; they will be sent with all requests
+            client.headers.update(req_headers)
+            _prime_host(
+                parsed_url.netloc, client, timeout, impersonate=CURL_CFFI_IMPERSONATE_VERSION
+            )
+            log.debug("download_url (curl_cffi): using session headers: %s", client.headers)
+
+            response = client.get(
                 url,
-                follow_redirects=True,
+                impersonate=CURL_CFFI_IMPERSONATE_VERSION,
                 timeout=timeout,
                 auth=auth,
-                headers=headers,
-            ) as response:
+                allow_redirects=True,
+                stream=True,
+            )
+            response.raise_for_status()
+            response_headers = dict(response.headers)
+            total = int(response.headers.get("content-length", "0"))
+
+            # Use iter_content for streaming; this is the standard method for curl_cffi
+            chunk_iterator = response.iter_content(chunk_size=8192)
+            stream_to_file(chunk_iterator, total)
+
+    # Handle httpx modes
+    else:
+        import httpx
+
+        with httpx.Client(follow_redirects=True, timeout=timeout, headers=req_headers) as client:
+            if mode is ClientMode.BROWSER_HEADERS:
+                _prime_host(parsed_url.netloc, client, timeout)
+
+            log.debug("download_url (httpx): using headers: %s", client.headers)
+            with client.stream("GET", url, auth=auth, follow_redirects=True) as response:
                 response.raise_for_status()
                 response_headers = dict(response.headers)
-                total_size = int(response.headers.get("content-length", "0"))
+                total = int(response.headers.get("content-length", "0"))
+                stream_to_file(response.iter_bytes(), total)
 
-                with atomic_output_file(target_filename, make_parents=True) as temp_filename:
-                    with open(temp_filename, "wb") as f:
-                        if not show_progress:
-                            for chunk in response.iter_bytes():
-                                f.write(chunk)
-                        else:
-                            with tqdm(total=total_size, unit="B", unit_scale=True) as progress:
-                                for chunk in response.iter_bytes():
-                                    f.write(chunk)
-                                    progress.update(len(chunk))
-        finally:
-            if not session:  # Only close if we created the client
-                client.close()
-        if response:
-            response.raise_for_status()  # In case of errors during streaming
-
-        return HttpHeaders(response_headers) if response_headers else None
+    # Filter out None values from headers for HttpHeaders type compatibility
+    if response_headers:
+        clean_headers = {k: v for k, v in response_headers.items() if v is not None}
+        return HttpHeaders(clean_headers)
+    return None
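
Likewise for the reworked `download_url`: the `session` parameter is gone, options are keyword-only, and non-HTTP(S) URLs return no headers. A sketch with placeholder URL and paths:

```python
# Hypothetical usage of the reworked download_url shown in this diff.
from kash.utils.common.url import Url
from kash.web_content.web_fetch import ClientMode, download_url

headers = download_url(
    Url("https://example.com/data.bin"),
    "downloads/data.bin",
    show_progress=True,  # streams via stream_to_file with a tqdm progress bar
    mode=ClientMode.BROWSER_HEADERS,  # httpx with browser-like headers and cookie priming
)
if headers is not None:  # None for file:// and s3:// URLs
    print(headers.mime_type)
```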
kash/web_content/web_page_model.py
@@ -6,6 +6,7 @@ from pydantic.dataclasses import dataclass
 
 from kash.utils.common.url import Url
 from kash.utils.file_utils.file_formats_model import FileFormatInfo
+from kash.web_content.local_file_cache import CacheResult
 
 
 @dataclass
@@ -18,6 +19,9 @@ class WebPageData:
     The `clean_html` field should be a clean HTML version of the page, if available.
     The `saved_content` is optional but can be used to reference the original content,
     especially for large or non-text content.
+
+    Optionally exposes the cache result for the content, so the client can have
+    information about headers and whether it was cached.
     """
 
     locator: Url | Path
@@ -29,6 +33,7 @@ class WebPageData:
     saved_content: Path | None = None
     format_info: FileFormatInfo | None = None
     thumbnail_url: Url | None = None
+    cache_result: CacheResult | None = None
 
     def __repr__(self):
         return abbrev_obj(self)
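
The new `cache_result` field defaults to `None`, so existing `WebPageData` call sites are unaffected. A small sketch (the fields of `CacheResult` are not shown in this diff, so only presence is checked):

```python
# Hypothetical helper: detect whether fetch/cache metadata was attached.
def has_cache_info(page) -> bool:
    return page.cache_result is not None
```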
kash/web_gen/templates/base_styles.css.jinja
@@ -22,6 +22,7 @@
   {% endblock root_variables %}
 }
 
+/* CSS color definitions. */
 {{ color_defs|safe }}
 
 {% block selection_styles %}
@@ -145,7 +146,7 @@ h2 + h3 {
 }
 
 h3 {
-  font-size: 1.18rem;
+  font-size: 1.15rem;
   margin-top: 1.4rem;
   margin-bottom: 0.7rem;
 }
@@ -662,6 +663,12 @@ sup {
     max-width: none;
   }
 
+  /* Smaller table text on mobile. */
+  table code,
+  table pre {
+    font-size: var(--font-size-mono-small);
+  }
+
   ul, ol {
     margin-left: 1rem;
   }
{kash_shell-0.3.23.dist-info → kash_shell-0.3.25.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kash-shell
-Version: 0.3.23
+Version: 0.3.25
 Summary: The knowledge agent shell (core)
 Project-URL: Repository, https://github.com/jlevy/kash-shell
 Author-email: Joshua Levy <joshua@cal.berkeley.edu>
@@ -24,12 +24,14 @@ Requires-Dist: chopdiff>=0.2.3
 Requires-Dist: clideps>=0.1.4
 Requires-Dist: colour>=0.1.5
 Requires-Dist: cssselect>=1.2.0
+Requires-Dist: curl-cffi>=0.11.4
 Requires-Dist: deepgram-sdk>=3.10.1
 Requires-Dist: dunamai>=1.23.0
 Requires-Dist: fastapi>=0.115.11
-Requires-Dist: flowmark>=0.4.6
+Requires-Dist: flowmark>=0.4.8
 Requires-Dist: frontmatter-format>=0.2.1
 Requires-Dist: funlog>=0.2.0
+Requires-Dist: httpx[brotli]>=0.28.1
 Requires-Dist: humanfriendly>=10.0
 Requires-Dist: inquirerpy>=0.3.4
 Requires-Dist: jinja2>=3.1.6
@@ -329,7 +331,7 @@ These are for `kash-media` but you can use a `kash-shell` for a more basic setup
 
 You can use kash from your MCP client (such as Anthropic Desktop or Cursor).
 
-You do this by running the `kash_mcp` binary to make kash actions available as MCP
+You do this by running the `kash-mcp` binary to make kash actions available as MCP
 tools.
 
 For Claude Desktop, my config looks like this:
@@ -338,7 +340,7 @@ For Claude Desktop, my config looks like this:
 {
   "mcpServers": {
     "kash": {
-      "command": "/Users/levy/.local/bin/kash_mcp",
+      "command": "/Users/levy/.local/bin/kash-mcp",
       "args": ["--proxy"]
     }
   }