PyPI - cat-stack - Versions diffs - 1.0.3__tar.gz → 1.0.5__tar.gz - Mend

cat-stack 1.0.3.tar.gz → 1.0.5.tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (42) [hide / show]

{cat_stack-1.0.3 → cat_stack-1.0.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cat-stack
-Version: 1.0.3
+Version: 1.0.5
 Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
 Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
 Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/__about__.py RENAMED Viewed

@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
-__version__ = "1.0.3"
+__version__ = "1.0.5"
 __author__ = "Chris Soria"
 __email__ = "chrissoria@berkeley.edu"
 __title__ = "cat-stack"

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_providers.py RENAMED Viewed

@@ -36,18 +36,59 @@ __all__ = [
 # HuggingFace Endpoint Auto-Detection
 # =============================================================================
+def _parse_hf_model_suffix(model: str) -> tuple:
+    """
+    Parse a HuggingFace model name that may have a :router suffix.
+    Examples:
+        "Qwen/Qwen3-VL-235B:novita" -> ("Qwen/Qwen3-VL-235B", "novita")
+        "meta-llama/Llama-3-8B" -> ("meta-llama/Llama-3-8B", None)
+    Returns:
+        (clean_model_name, router_name_or_None)
+    """
+    # Only treat the last segment after ':' as a router suffix if the model
+    # contains a '/' (org/model format) to avoid confusing with Ollama tags
+    if ":" in model and "/" in model:
+        parts = model.rsplit(":", 1)
+        suffix = parts[1].lower()
+        # Known HuggingFace inference provider routers
+        if suffix in ("novita", "together", "sambanova", "cerebras", "fireworks"):
+            return parts[0], suffix
+    return model, None
+# Known router suffix -> endpoint mapping
+_HF_ROUTER_ENDPOINTS = {
+    "novita": "https://router.huggingface.co/novita/v3/openai",
+    "together": "https://router.huggingface.co/together/v1",
+    "sambanova": "https://router.huggingface.co/sambanova/v1",
+    "cerebras": "https://router.huggingface.co/cerebras/v1",
+    "fireworks": "https://router.huggingface.co/fireworks/v1",
+}
 def _detect_huggingface_endpoint(api_key: str, model: str) -> str:
     """
     Test which HuggingFace endpoint works for this model.
-    Tries generic router first, then Together.
+    If the model name has a router suffix (e.g., ":novita"), route directly
+    to that provider's endpoint. Otherwise tries generic router, then Together.
     Args:
         api_key: HuggingFace API key
-        model: Model name to test
+        model: Model name to test (may include :router suffix)
     Returns:
         Base URL for the working endpoint (without /chat/completions)
     """
+    clean_model, router = _parse_hf_model_suffix(model)
+    # If explicit router suffix, use that endpoint directly
+    if router and router in _HF_ROUTER_ENDPOINTS:
+        return _HF_ROUTER_ENDPOINTS[router]
+    # Otherwise auto-detect
     endpoints = [
         "https://router.huggingface.co/v1/chat/completions",
         "https://router.huggingface.co/together/v1/chat/completions",
@@ -59,7 +100,7 @@ def _detect_huggingface_endpoint(api_key: str, model: str) -> str:
     }
     payload = {
-        "model": model,
+        "model": clean_model,
         "messages": [{"role": "user", "content": "hi"}],
         "max_tokens": 5
     }
@@ -145,13 +186,19 @@ class UnifiedLLMClient:
     def __init__(self, provider: str, api_key: str, model: str):
         self.provider = provider.lower()
         self.api_key = api_key
-        self.model = model
+        # Strip router suffix from model name and detect endpoint
+        clean_model, router = _parse_hf_model_suffix(model)
+        self.model = clean_model if self.provider == "huggingface" else model
         # Auto-detect HuggingFace endpoint
         if self.provider == "huggingface":
             detected_url = _detect_huggingface_endpoint(api_key, model)
             if "together" in detected_url:
                 self.provider = "huggingface-together"
+            elif router and router in _HF_ROUTER_ENDPOINTS:
+                # Use the router-specific endpoint as a custom provider config
+                self._custom_endpoint = _HF_ROUTER_ENDPOINTS[router] + "/chat/completions"
         if self.provider not in PROVIDER_CONFIG:
             raise ValueError(f"Unsupported provider: {provider}. "
@@ -161,7 +208,8 @@ class UnifiedLLMClient:
     def _get_endpoint(self) -> str:
         """Get the API endpoint, substituting model if needed."""
-        endpoint = self.config["endpoint"]
+        # Use custom endpoint if set (e.g., for HuggingFace router suffixes)
+        endpoint = getattr(self, "_custom_endpoint", None) or self.config["endpoint"]
         if "{model}" in endpoint:
             endpoint = endpoint.format(model=self.model)
         return endpoint

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/text_functions.py RENAMED Viewed

@@ -762,6 +762,24 @@ def explore_common_categories(
     # Second-pass semantic merge prompt
     seed_list = result["Category"].head(max_categories * 3).tolist()
+    if specificity == "specific":
+        name_instruction = (
+            "Keep category names DETAILED and DESCRIPTIVE with examples. "
+            "Each category name MUST include a brief clarifying phrase using "
+            "'such as' or parenthetical examples. For example:\n"
+            "   - 'Residential Zoning Changes (e.g., rezoning parcels, density adjustments)'\n"
+            "   - 'Construction Contract Extensions (e.g., timeline amendments, scope changes)'\n"
+            "   - 'Environmental Compliance (e.g., stormwater regulations, habitat protections)'\n"
+            "Do NOT use short generic labels like 'Zoning' or 'Contracts'. "
+            "Every category must be specific enough that a reader immediately "
+            "understands what types of documents belong in it."
+        )
+    else:
+        name_instruction = (
+            "Keep category names broad and general. "
+            "Use the most frequent or clearest label when merging."
+        )
     second_prompt = f"""
 You are a data analyst reviewing categorized text data.
@@ -774,9 +792,8 @@ Critical Instructions:
    - "breakup/household conflict" = "relationship problems"
 3) When merging:
    - Combine frequencies mentally
-   - Keep the most frequent OR clearest label
    - Each concept appears ONLY ONCE
-4) Keep category names {specificity}.
+4) {name_instruction}
 5) Return ONLY a numbered list of {max_categories} categories. No extra text.
 Pre-processed Categories (sorted by frequency, top sample):
@@ -820,13 +837,17 @@ Output:
     print("\nTop categories:\n" + "\n".join(f"{i+1}. {c}" for i, c in enumerate(final[:max_categories])))
+    top = final[:max_categories]
     if filename:
-        result.to_csv(filename, index=False)
-        print(f"\nResults saved to {filename}")
+        import pandas as _pd
+        top_df = _pd.DataFrame({"rank": range(1, len(top) + 1), "category": top})
+        top_df.to_csv(filename, index=False)
+        print(f"\nTop {len(top)} categories saved to {filename}")
     return {
         "counts_df": result,
-        "top_categories": final[:max_categories],
+        "top_categories": top,
         "raw_top_text": top_categories_text
     }

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/text_functions_ensemble.py RENAMED Viewed

@@ -1313,6 +1313,38 @@ Provide concise summaries that capture essential information.
     return messages
+def _extract_json_for_summary(reply: str) -> str:
+    """Extract JSON from model reply without destroying freeform text content.
+    Unlike extract_json() (designed for classification 0/1 values), this
+    preserves spaces, brackets, and newlines inside string values.
+    """
+    if reply is None:
+        return '{"summary": ""}'
+    # Strip thinking tags if present (Qwen3, DeepSeek, etc.)
+    import re as _re
+    reply = _re.sub(r'<think>.*?</think>', '', reply, flags=_re.DOTALL).strip()
+    # Find JSON object using recursive regex (regex module imported at top of file)
+    try:
+        extracted = regex.findall(r'\{(?:[^{}]|(?R))*\}', reply, regex.DOTALL)
+        if extracted:
+            return extracted[0]
+    except Exception:
+        pass
+    # Fallback: try simple JSON parse
+    try:
+        import json
+        json.loads(reply)
+        return reply
+    except Exception:
+        pass
+    return '{"summary": ""}'
 def extract_summary_from_json(json_str: str) -> tuple:
     """
     Extract summary from JSON response.
@@ -1329,6 +1361,11 @@ def extract_summary_from_json(json_str: str) -> tuple:
             summary = data["summary"]
             if isinstance(summary, str) and summary.strip():
                 return True, summary.strip()
+            elif isinstance(summary, list):
+                # Model returned summary as a list of strings (e.g., bullet points)
+                joined = "\n".join(str(s) for s in summary if s)
+                if joined.strip():
+                    return True, joined.strip()
         return False, None
     except (json.JSONDecodeError, TypeError):
         return False, None
@@ -1744,6 +1781,117 @@ def _prepare_page_data(
 # Image-Specific Functions
 # =============================================================================
+def build_image_summarization_prompt(
+    image_data: dict,
+    input_description: str = "",
+    summary_instructions: str = "",
+    max_length: int = None,
+    focus: str = None,
+    provider: str = "openai",
+    chain_of_thought: bool = False,
+    context_prompt: bool = False,
+    step_back_prompt: bool = False,
+    stepback_insights: dict = None,
+    model_name: str = None,
+) -> list:
+    """
+    Build the summarization prompt for an image.
+    Parallel to build_pdf_summarization_prompt() but for standalone images.
+    Args:
+        image_data: Dict from _prepare_image_data() containing:
+            - encoded_image: Base64 encoded image
+            - extension: Image file extension (without dot)
+        input_description: Description of what the images contain
+        summary_instructions: Specific instructions (e.g., format/tone)
+        max_length: Maximum summary length in words
+        focus: What to focus on in the summary
+        provider: Provider name for format-specific handling
+        chain_of_thought: Whether to use step-by-step reasoning
+        context_prompt: Whether to add expert context prefix
+        step_back_prompt: Whether step-back prompting is enabled
+        stepback_insights: Dict of step-back insights per model
+        model_name: Current model name (for step-back lookup)
+    Returns:
+        List of message dicts for the LLM (format varies by provider)
+    """
+    focus_instruction = f", focusing on {focus}" if focus else ""
+    length_instruction = f"\n\nKeep the summary under {max_length} words." if max_length else ""
+    custom_instructions = f"\n\nAdditional instructions: {summary_instructions}" if summary_instructions else ""
+    if chain_of_thought:
+        base_text = f"""You are an image summarization assistant.
+Task: Examine the attached image and provide a concise summary{focus_instruction}.
+{f'Image context: {input_description}' if input_description else ''}
+Let's analyze step by step:
+1. First, identify the main subject and visual elements in the image
+2. Then, extract the key information, text, or message conveyed
+3. Finally, synthesize into a concise summary{length_instruction}{custom_instructions}
+Provide your answer in JSON format: {{"summary": "your summary here"}}"""
+    else:
+        base_text = f"""You are an image summarization assistant.
+Task: Examine the attached image and provide a concise summary{focus_instruction}.
+{f'Image context: {input_description}' if input_description else ''}{length_instruction}{custom_instructions}
+Provide your answer in JSON format: {{"summary": "your summary here"}}"""
+    if context_prompt:
+        context = """You are an expert at analyzing and describing visual content.
+Focus on accuracy, key details, and any text visible in the image.
+"""
+        base_text = context + base_text
+    messages = []
+    if step_back_prompt and stepback_insights and model_name in stepback_insights:
+        sb_question, sb_insight = stepback_insights[model_name]
+        messages.append({"role": "user", "content": sb_question})
+        messages.append({"role": "assistant", "content": sb_insight})
+    encoded = image_data.get("encoded_image", "")
+    ext = image_data.get("extension", "png")
+    if provider == "anthropic":
+        content = [
+            {"type": "text", "text": base_text},
+            {
+                "type": "image",
+                "source": {
+                    "type": "base64",
+                    "media_type": f"image/{ext}",
+                    "data": encoded
+                }
+            }
+        ]
+        messages.append({"role": "user", "content": content})
+    elif provider == "google":
+        content = [
+            {"type": "text", "text": base_text},
+            {
+                "type": "inline_data",
+                "mime_type": f"image/{ext}",
+                "data": encoded
+            }
+        ]
+        messages.append({"role": "user", "content": content})
+    else:
+        encoded_url = f"data:image/{ext};base64,{encoded}"
+        content = [
+            {"type": "text", "text": base_text},
+            {"type": "image_url", "image_url": {"url": encoded_url, "detail": "high"}}
+        ]
+        messages.append({"role": "user", "content": content})
+    return messages
 def build_image_classification_prompt(
     image_data: dict,
     categories_str: str,
@@ -3774,7 +3922,7 @@ def summarize_ensemble(
                     return (model_name, '{"summary": ""}', error)
                 # Extract JSON from response
-                json_str = extract_json(response)
+                json_str = _extract_json_for_summary(response)
                 return (model_name, json_str, None)
@@ -3782,6 +3930,65 @@ def summarize_ensemble(
                 error_msg = str(e)
                 return (model_name, '{"summary": ""}', error_msg)
+        elif is_image_mode and isinstance(item, tuple) and len(item) == 2:
+            # IMAGE MODE: item is (image_path, image_label)
+            image_path, image_label = item
+            try:
+                image_data = _prepare_image_data(image_path, image_label)
+                if image_data.get("error"):
+                    return (model_name, '{"summary": ""}', image_data["error"])
+                messages = build_image_summarization_prompt(
+                    image_data=image_data,
+                    input_description=input_description,
+                    summary_instructions=summary_instructions,
+                    max_length=max_length,
+                    focus=focus,
+                    provider=cfg["provider"],
+                    chain_of_thought=chain_of_thought,
+                    context_prompt=context_prompt,
+                    step_back_prompt=step_back_prompt,
+                    stepback_insights=stepback_insights,
+                    model_name=model_name,
+                )
+                client = UnifiedLLMClient(
+                    provider=cfg["provider"],
+                    api_key=cfg["api_key"],
+                    model=cfg["model"],
+                )
+                json_schema = json_schemas[model_name]
+                effective_thinking = thinking_budget if cfg["provider"] in ("google", "openai", "anthropic", "huggingface", "huggingface-together") else None
+                if cfg["provider"] == "google":
+                    response = _call_google_multimodal(
+                        client=client,
+                        messages=messages,
+                        json_schema=json_schema,
+                        creativity=creativity,
+                        thinking_budget=effective_thinking or 0,
+                        max_retries=max_retries,
+                    )
+                else:
+                    response, error = client.complete(
+                        messages=messages,
+                        json_schema=json_schema,
+                        creativity=creativity,
+                        thinking_budget=effective_thinking,
+                        max_retries=max_retries,
+                    )
+                if error:
+                    return (model_name, '{"summary": ""}', error)
+                json_str = _extract_json_for_summary(response)
+                return (model_name, json_str, None)
+            except Exception as e:
+                return (model_name, '{"summary": ""}', str(e))
         else:
             # TEXT MODE: Original text handling
             # Skip empty/null items
@@ -3827,7 +4034,7 @@ def summarize_ensemble(
                     return (model_name, '{"summary": ""}', error)
                 # Extract JSON from response
-                json_str = extract_json(response)
+                json_str = _extract_json_for_summary(response)
                 return (model_name, json_str, None)
@@ -4162,7 +4369,7 @@ Provide your answer in JSON format: {{"summary": "your synthesized summary"}}"""
             max_retries=max_retries,
         )
-        json_str = extract_json(response)
+        json_str = _extract_json_for_summary(response)
         is_valid, summary = extract_summary_from_json(json_str)
         if is_valid:

{cat_stack-1.0.3 → cat_stack-1.0.5}/.gitignore RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/LICENSE RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/README.md RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/pyproject.toml RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/__init__.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_batch.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_category_analysis.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_chunked.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_embeddings.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_formatter.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_pilot_test.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_review_ui.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_tiebreaker.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_utils.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_web_fetch.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/CoVe.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/__init__.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/all_calls.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/image_CoVe.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/image_stepback.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/pdf_CoVe.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/pdf_stepback.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/stepback.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/top_n.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/classify.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/explore.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/extract.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/image_functions.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/images/circle.png RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/images/cube.png RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/images/diamond.png RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/images/overlapping_pentagons.png RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/images/rectangles.png RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/model_reference_list.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/pdf_functions.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/prompt_tune.py RENAMED Viewed

File without changes

{cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/summarize.py RENAMED Viewed

File without changes

cat-stack 1.0.3__tar.gz → 1.0.5__tar.gz

cat-stack 1.0.3.tar.gz → 1.0.5.tar.gz