PyPI - cat-stack - Versions diffs - 1.4.0__tar.gz → 1.5.0__tar.gz - Mend

cat-stack 1.4.0__tar.gz → 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

{cat_stack-1.4.0 → cat_stack-1.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cat-stack
-Version: 1.4.0
+Version: 1.5.0
 Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
 Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
 Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
@@ -20,7 +20,6 @@ Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
 Requires-Python: >=3.8
 Requires-Dist: pandas
-Requires-Dist: regex
 Requires-Dist: requests
 Requires-Dist: tqdm
 Provides-Extra: docx

{cat_stack-1.4.0 → cat_stack-1.5.0}/pyproject.toml RENAMED Viewed

@@ -28,7 +28,6 @@ dependencies = [
   "pandas",
   "tqdm",
   "requests",
-  "regex",
 ]
 [project.optional-dependencies]

{cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/__about__.py RENAMED Viewed

@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
-__version__ = "1.4.0"
+__version__ = "1.5.0"
 __author__ = "Chris Soria"
 __email__ = "chrissoria@berkeley.edu"
 __title__ = "cat-stack"

{cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_batch.py RENAMED Viewed

@@ -818,13 +818,16 @@ def _run_one_sync_model(
             multi_label=prompt_params.get("multi_label", True),
         )
         try:
-            raw = client.complete(
+            raw, err = client.complete(
                 messages=messages,
                 json_schema=json_schema,
                 creativity=creativity,
                 thinking_budget=thinking_budget if thinking_budget and thinking_budget > 0 else None,
             )
-            item_results[idx] = (extract_json(raw), None)
+            if err:
+                item_results[idx] = (None, err)
+            else:
+                item_results[idx] = (extract_json(raw), None)
         except Exception as e:
             item_results[idx] = (None, str(e))

{cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_providers.py RENAMED Viewed

@@ -7,6 +7,7 @@ without requiring provider-specific SDKs.
 """
 import json
+import threading
 import time
 import requests
@@ -68,54 +69,76 @@ _HF_ROUTER_ENDPOINTS = {
 }
-def _detect_huggingface_endpoint(api_key: str, model: str) -> str:
+def _detect_huggingface_endpoint(api_key: str, model: str, skip: set = None) -> str:
     """
-    Test which HuggingFace endpoint works for this model.
-    If the model name has a router suffix (e.g., ":novita"), route directly
-    to that provider's endpoint. Otherwise tries generic router, then Together.
+    Probe HuggingFace endpoints to find one that supports this model.
+    Two call modes:
+      - Legacy (skip=None): probe generic + Together only. Falls back to
+        returning the generic base URL when nothing responds 200 — keeps
+        existing `image_functions` / `pdf_functions` callers behaving as
+        before so they can surface their own error from the eventual request.
+      - Lazy-fallback (skip=non-empty set): probe generic + all five known
+        router endpoints, skipping any in `skip`. Returns None when no
+        candidate responds 200 — caller (e.g., UnifiedLLMClient.complete)
+        should then surface the original error.
     Args:
-        api_key: HuggingFace API key
-        model: Model name to test (may include :router suffix)
+        api_key: HuggingFace API key.
+        model: Model name to test (may include `:router` suffix).
+        skip: optional set of base URLs to skip (typically the URL that
+            just failed at the call site).
     Returns:
-        Base URL for the working endpoint (without /chat/completions)
+        Base URL (without /chat/completions) of a working endpoint, or
+        None when skip is non-empty and nothing worked.
     """
+    skip = skip or set()
     clean_model, router = _parse_hf_model_suffix(model)
-    # If explicit router suffix, use that endpoint directly
+    # If explicit router suffix and the suffix endpoint is not skipped,
+    # route directly without probing.
     if router and router in _HF_ROUTER_ENDPOINTS:
-        return _HF_ROUTER_ENDPOINTS[router]
+        candidate = _HF_ROUTER_ENDPOINTS[router]
+        if candidate not in skip:
+            return candidate
+    generic_base = PROVIDER_CONFIG["huggingface"]["endpoint"].replace("/chat/completions", "")
-    # Otherwise auto-detect
-    endpoints = [
-        "https://router.huggingface.co/v1/chat/completions",
-        "https://router.huggingface.co/together/v1/chat/completions",
-    ]
+    if skip:
+        # Lazy-fallback mode: probe all known routers in priority order.
+        candidates_base = [generic_base] + list(_HF_ROUTER_ENDPOINTS.values())
+    else:
+        # Legacy mode: only generic + Together (preserves prior behavior
+        # and probe count for non-UnifiedLLMClient callers).
+        candidates_base = [generic_base, _HF_ROUTER_ENDPOINTS["together"]]
     headers = {
         "Content-Type": "application/json",
-        "Authorization": f"Bearer {api_key}"
+        "Authorization": f"Bearer {api_key}",
     }
     payload = {
         "model": clean_model,
         "messages": [{"role": "user", "content": "hi"}],
-        "max_tokens": 5
+        "max_tokens": 5,
     }
-    for endpoint in endpoints:
+    for base in candidates_base:
+        if base in skip:
+            continue
         try:
-            response = requests.post(endpoint, headers=headers, json=payload, timeout=30)
+            response = requests.post(f"{base}/chat/completions", headers=headers, json=payload, timeout=30)
             if response.status_code == 200:
-                # Return the base URL (without /chat/completions)
-                return endpoint.replace("/chat/completions", "")
+                return base
         except Exception:
             continue
-    # Default to generic (will fail with informative error)
-    return "https://router.huggingface.co/v1"
+    # Legacy callers expect a base URL even on failure (their HTTP call
+    # surfaces the real error). Lazy-fallback callers prefer None so they
+    # can surface the original error rather than retrying a known-bad URL.
+    if skip:
+        return None
+    return generic_base
 # =============================================================================
@@ -186,14 +209,24 @@ class UnifiedLLMClient:
     def __init__(self, provider: str, api_key: str, model: str):
         self.provider = provider.lower()
         self.api_key = api_key
-        # Keep full model name with router suffix — the generic HF router
-        # uses the suffix (e.g. :novita, :together) for routing.
         self.model = model
-        # Auto-detect HuggingFace endpoint (but always use generic router)
+        # Lazy HuggingFace router fallback — start with None and only
+        # populate when we either (a) have an explicit router suffix, or
+        # (b) the default endpoint returns a "wrong router" 400 on a real
+        # request. Avoids burning two probe POSTs (and leaking the API key
+        # to two endpoints) on every UnifiedLLMClient construction.
+        self._custom_endpoint = None
+        self._endpoint_lock = threading.Lock()
         if self.provider == "huggingface":
-            _detect_huggingface_endpoint(api_key, model)
+            clean_model, router = _parse_hf_model_suffix(model)
+            if router and router in _HF_ROUTER_ENDPOINTS:
+                # User was explicit about the router; honour it directly and
+                # strip the suffix from the model name (specific-router
+                # endpoints expect the clean name, not the suffix).
+                self._custom_endpoint = f"{_HF_ROUTER_ENDPOINTS[router]}/chat/completions"
+                self.model = clean_model
         if self.provider not in PROVIDER_CONFIG:
             raise ValueError(f"Unsupported provider: {provider}. "
@@ -201,6 +234,54 @@ class UnifiedLLMClient:
         self.config = PROVIDER_CONFIG[self.provider]
+    def _is_hf_wrong_router_400(self, body: str) -> bool:
+        """True if a 400 response body indicates the current HF router doesn't
+        carry this model (vs. truly nonexistent or a non-routing problem).
+        Trigger shapes (from a smoke test against the live HF API):
+          - Generic router: `{"error":{"code":"model_not_supported",...}}`
+          - Specific router: `{"error":"Model not supported by provider XYZ"}`
+        Intentionally NOT triggered by `model_not_found` (no router will help
+        a nonexistent model), 401/403 (auth), 5xx/429 (transient), or any
+        other 400 unrelated to router routing.
+        """
+        if self.provider != "huggingface":
+            return False
+        return (
+            '"code":"model_not_supported"' in body
+            or "Model not supported by provider" in body
+        )
+    def _try_hf_router_fallback(self, failed_endpoint: str) -> bool:
+        """Find an HF router that has this model. Cache it on self.
+        Called from `complete()` when an HF request returns a "wrong router"
+        400. Probes all five known specific routers plus the generic router,
+        skipping the one that just failed. Idempotent and thread-safe via
+        the per-instance endpoint lock — if two concurrent callers both hit
+        the fallback path, only one runs the probe.
+        Returns True if a working endpoint was found and cached (caller
+        should refresh and retry). Returns False if every alternative also
+        rejected the model (caller should surface the original error).
+        """
+        failed_base = failed_endpoint.replace("/chat/completions", "")
+        with self._endpoint_lock:
+            # Did another thread already find a different working endpoint?
+            if self._custom_endpoint:
+                current_base = self._custom_endpoint.replace("/chat/completions", "")
+                if current_base != failed_base:
+                    return True
+            new_base = _detect_huggingface_endpoint(
+                self.api_key, self.model, skip={failed_base}
+            )
+            if new_base:
+                self._custom_endpoint = f"{new_base}/chat/completions"
+                return True
+            return False
     def _get_endpoint(self) -> str:
         """Get the API endpoint, substituting model if needed."""
         # Use custom endpoint if set (e.g., for HuggingFace router suffixes)
@@ -555,11 +636,11 @@ class UnifiedLLMClient:
         if self.provider == "claude-code":
             return self._call_claude_cli(messages, max_retries=max_retries, initial_delay=initial_delay)
-        endpoint = self._get_endpoint()
         headers = self._get_headers()
         payload = self._build_payload(messages, json_schema, creativity, thinking_budget=thinking_budget, force_json=force_json)
         for attempt in range(max_retries):
+            endpoint = self._get_endpoint()
             try:
                 response = requests.post(
                     endpoint,
@@ -582,6 +663,12 @@ class UnifiedLLMClient:
                                 self._warned_no_structured = True
                             payload.pop("response_format")
                             continue  # Retry immediately without response_format
+                    # HuggingFace: try other routers when the current one
+                    # rejects the model with a "wrong router" 400.
+                    if self._is_hf_wrong_router_400(response.text):
+                        if self._try_hf_router_fallback(endpoint):
+                            continue  # retry with the newly-cached endpoint
                 if response.status_code == 404 or (response.status_code == 400 and "not found" in response.text.lower() and "model" in response.text.lower()):
                     return None, f"Model '{self.model}' not found for {self.provider}"
                 elif response.status_code in [401, 403]:

{cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_utils.py RENAMED Viewed

@@ -7,7 +7,6 @@ encoding, and other common operations used across the package.
 import json
 import re
-import regex
 __all__ = [
     # JSON utilities
@@ -88,15 +87,55 @@ def build_json_schema(categories: list, include_additional_properties: bool = Tr
     return schema
+def _extract_balanced_json(text: str) -> str | None:
+    """Return the first balanced-brace JSON object substring in text, or None.
+    String-aware: a `{` or `}` inside a JSON string (between unescaped double
+    quotes) doesn't change scan depth. Replaces the prior `regex.findall` with
+    a recursive `(?R)` pattern — same semantics for well-formed input, but
+    correct on inputs like `{"summary": "see Fig {3}"}` (the regex version
+    truncated at the first `}` inside the string).
+    """
+    if text is None:
+        return None
+    depth = 0
+    start = None
+    in_string = False
+    escape = False
+    for i, ch in enumerate(text):
+        if escape:
+            escape = False
+            continue
+        if ch == '\\':
+            escape = True
+            continue
+        if ch == '"':
+            in_string = not in_string
+            continue
+        if in_string:
+            continue
+        if ch == '{':
+            if depth == 0:
+                start = i
+            depth += 1
+        elif ch == '}':
+            if depth == 0:
+                continue
+            depth -= 1
+            if depth == 0 and start is not None:
+                return text[start:i + 1]
+    return None
 def extract_json(reply: str) -> str:
     """Extract JSON from model reply."""
     if reply is None:
         return '{"1":"e"}'
-    extracted = regex.findall(r'\{(?:[^{}]|(?R))*\}', reply, regex.DOTALL)
-    if extracted:
-        # Clean up the JSON string
-        return extracted[0].replace('[', '').replace(']', '').replace('\n', '').replace(" ", '')
+    extracted = _extract_balanced_json(reply)
+    if extracted is not None:
+        return extracted.replace('[', '').replace(']', '').replace('\n', '').replace(" ", '')
     else:
         return '{"1":"e"}'

cat_stack-1.5.0/src/catstack/_web_fetch.py ADDED Viewed

@@ -0,0 +1,265 @@
+"""
+Web content fetching utilities for URL input type.
+Provides URL detection, HTML text extraction, and batch URL fetching
+for use as a preprocessing step before text classification/extraction/summarization.
+"""
+import html as html_lib
+import ipaddress
+import re
+import socket
+from urllib.parse import urlsplit
+import requests
+__all__ = [
+    "is_url",
+    "fetch_url_text",
+    "fetch_urls",
+    "detect_url_input",
+    "strip_html_tags",
+]
+_DEFAULT_TIMEOUT = 30
+_MAX_CONTENT_CHARS = 50000
+# Hard cap on bytes pulled from the response before bailing — guards against
+# OOM on a hostile or accidentally-huge URL. 5x slack over the char cap so
+# HTML markup that gets stripped later still leaves real payload room.
+_MAX_RESPONSE_BYTES = 5 * _MAX_CONTENT_CHARS
+# Schemes fetch_url_text will follow. Anything else (file://, ftp://, data:,
+# javascript:, ...) is rejected at validation time.
+_ALLOWED_SCHEMES = frozenset({"http", "https"})
+_USER_AGENT = (
+    "Mozilla/5.0 (compatible; CatStack/1.0; "
+    "+https://github.com/chrissoria/cat-stack)"
+)
+def is_url(s) -> bool:
+    """
+    Check whether a string is a well-formed http(s) URL.
+    Structural check only — no DNS resolution, no network call. Rejects
+    strings with embedded control characters, non-http(s) schemes, and
+    missing netloc.
+    """
+    if not isinstance(s, str):
+        return False
+    s = s.strip()
+    if any(c in s for c in ("\r", "\n", "\x00")):
+        return False
+    try:
+        parts = urlsplit(s)
+    except Exception:
+        return False
+    return parts.scheme in _ALLOWED_SCHEMES and bool(parts.netloc)
+def detect_url_input(items) -> bool:
+    """
+    Check whether input data is a collection of URLs.
+    Inspects the first non-null item in the iterable. Returns True if it
+    looks like a URL.
+    """
+    import pandas as pd
+    if isinstance(items, str):
+        return is_url(items)
+    if hasattr(items, "__iter__"):
+        for item in items:
+            if item is not None:
+                try:
+                    if pd.isna(item):
+                        continue
+                except (TypeError, ValueError):
+                    pass
+                return is_url(str(item))
+    return False
+def _validate_url_safe(url):
+    """
+    Validate a URL for safe fetching: structure + SSRF host guard.
+    Returns (cleaned_url, error_message). error_message is None on success.
+    The SSRF guard resolves the hostname via socket.getaddrinfo and rejects
+    if ANY returned address is private, loopback, link-local, reserved,
+    multicast, or unspecified. Catches AWS metadata (169.254.169.254),
+    localhost (127.0.0.1, ::1), RFC1918, GCP metadata host, and similar
+    internal targets before any HTTP request goes out.
+    Does NOT defend against DNS rebinding (resolve-once-then-reconnect to
+    a different IP); that requires a custom HTTPAdapter and is out of
+    scope here.
+    """
+    if not isinstance(url, str):
+        return "", "url must be a string"
+    url = url.strip()
+    if any(c in url for c in ("\r", "\n", "\x00")):
+        return "", "url contains control characters"
+    try:
+        parts = urlsplit(url)
+    except Exception as e:
+        return "", f"could not parse url: {e}"
+    if parts.scheme not in _ALLOWED_SCHEMES:
+        return "", f"scheme must be http or https; got {parts.scheme!r}"
+    if not parts.netloc:
+        return "", "url has empty netloc"
+    hostname = parts.hostname
+    if not hostname:
+        return "", "url has empty hostname"
+    try:
+        addrinfo = socket.getaddrinfo(hostname, None)
+    except socket.gaierror as e:
+        return "", f"could not resolve {hostname!r}: {e}"
+    for info in addrinfo:
+        ip_str = info[4][0]
+        try:
+            ip = ipaddress.ip_address(ip_str)
+        except ValueError:
+            return "", f"resolved address {ip_str!r} is not a valid IP"
+        if (
+            ip.is_private
+            or ip.is_loopback
+            or ip.is_link_local
+            or ip.is_reserved
+            or ip.is_multicast
+            or ip.is_unspecified
+        ):
+            return "", (
+                f"{hostname!r} resolves to {ip_str} (private/internal); "
+                f"refusing to fetch as an SSRF guard"
+            )
+    return url, None
+def strip_html_tags(html: str) -> str:
+    """
+    Extract readable text from an HTML string.
+    Removes non-content elements (navigation, headers, footers, sidebars,
+    forms, scripts, styles), strips remaining tags, collapses whitespace,
+    and decodes HTML entities.
+    """
+    text = html
+    _JUNK_TAGS = (
+        "script", "style", "nav", "header", "footer", "aside",
+        "noscript", "iframe", "form", "svg",
+    )
+    for tag in _JUNK_TAGS:
+        text = re.sub(
+            rf"<{tag}[^>]*>.*?</{tag}>",
+            "",
+            text,
+            flags=re.DOTALL | re.IGNORECASE,
+        )
+    for tag in ("input", "meta", "link", "img"):
+        text = re.sub(rf"<{tag}[^>]*/?\s*>", "", text, flags=re.IGNORECASE)
+    text = re.sub(r"<[^>]+>", " ", text)
+    text = re.sub(r"\s+", " ", text).strip()
+    text = html_lib.unescape(text)
+    return text
+def fetch_url_text(url: str, timeout: int = _DEFAULT_TIMEOUT):
+    """
+    Fetch a single URL and extract its text content.
+    Pre-flight: the URL's scheme and hostname are validated, and the
+    hostname is resolved; if it points at a private/internal IP, the
+    fetch is refused (SSRF guard). The response body is streamed and
+    capped to prevent OOM on very large pages. TLS errors are surfaced —
+    there is no silent verify=False fallback.
+    Returns (text, error). error is None on success.
+    """
+    cleaned_url, validation_error = _validate_url_safe(url)
+    if validation_error:
+        return "", f"Error fetching {url}: {validation_error}"
+    headers = {"User-Agent": _USER_AGENT}
+    try:
+        with requests.get(
+            cleaned_url,
+            headers=headers,
+            timeout=timeout,
+            stream=True,
+        ) as response:
+            response.raise_for_status()
+            content_type = response.headers.get("Content-Type", "")
+            encoding = response.encoding
+            chunks = []
+            bytes_read = 0
+            for chunk in response.iter_content(chunk_size=8192):
+                if not chunk:
+                    continue
+                chunks.append(chunk)
+                bytes_read += len(chunk)
+                if bytes_read > _MAX_RESPONSE_BYTES:
+                    break
+            raw = b"".join(chunks)
+        encoding = encoding or "utf-8"
+        try:
+            body = raw.decode(encoding, errors="replace")
+        except (LookupError, TypeError):
+            body = raw.decode("utf-8", errors="replace")
+        if (
+            "text/html" in content_type
+            or "text/plain" in content_type
+            or not content_type
+        ):
+            text = strip_html_tags(body)
+        else:
+            text = body
+        if len(text) > _MAX_CONTENT_CHARS:
+            text = text[:_MAX_CONTENT_CHARS] + (
+                f"\n\n[Content truncated at {_MAX_CONTENT_CHARS} characters]"
+            )
+        return text, None
+    except requests.exceptions.Timeout:
+        return "", f"Timeout after {timeout}s fetching {url}"
+    except requests.exceptions.SSLError as e:
+        return "", f"SSL/TLS error fetching {url}: {e}"
+    except requests.exceptions.HTTPError as e:
+        return "", f"HTTP {e.response.status_code} fetching {url}"
+    except Exception as e:
+        return "", f"Error fetching {url}: {e}"
+def fetch_urls(urls, timeout: int = _DEFAULT_TIMEOUT):
+    """
+    Fetch content from a list of URLs.
+    Returns list of (original_url, fetched_text, error) tuples. On success
+    error is None; on failure fetched_text is "".
+    """
+    results = []
+    for url in urls:
+        url_str = str(url).strip()
+        if not is_url(url_str):
+            results.append((url_str, "", f"Not a valid URL: {url_str}"))
+            continue
+        text, error = fetch_url_text(url_str, timeout=timeout)
+        results.append((url_str, text, error))
+    return results

{cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/__init__.py RENAMED Viewed

@@ -2,15 +2,17 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
-from .all_calls import (
+from .stepback import (
     get_stepback_insight_openai,
     get_stepback_insight_anthropic,
     get_stepback_insight_google,
     get_stepback_insight_mistral,
+)
+from .CoVe import (
     chain_of_verification_openai,
-    chain_of_verification_google,
     chain_of_verification_anthropic,
-    chain_of_verification_mistral
+    chain_of_verification_google,
+    chain_of_verification_mistral,
 )
 __all__ = [
@@ -22,4 +24,4 @@ __all__ = [
     'chain_of_verification_anthropic',
     'chain_of_verification_google',
     'chain_of_verification_mistral',
-]
+]

cat-stack 1.4.0__tar.gz → 1.5.0__tar.gz

cat-stack 1.4.0__tar.gz → 1.5.0__tar.gz