cat-stack 1.0.3__tar.gz → 1.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. {cat_stack-1.0.3 → cat_stack-1.0.5}/PKG-INFO +1 -1
  2. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/__about__.py +1 -1
  3. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_providers.py +53 -5
  4. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/text_functions.py +26 -5
  5. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/text_functions_ensemble.py +210 -3
  6. {cat_stack-1.0.3 → cat_stack-1.0.5}/.gitignore +0 -0
  7. {cat_stack-1.0.3 → cat_stack-1.0.5}/LICENSE +0 -0
  8. {cat_stack-1.0.3 → cat_stack-1.0.5}/README.md +0 -0
  9. {cat_stack-1.0.3 → cat_stack-1.0.5}/pyproject.toml +0 -0
  10. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/__init__.py +0 -0
  11. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_batch.py +0 -0
  12. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_category_analysis.py +0 -0
  13. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_chunked.py +0 -0
  14. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_embeddings.py +0 -0
  15. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_formatter.py +0 -0
  16. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_pilot_test.py +0 -0
  17. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_review_ui.py +0 -0
  18. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_tiebreaker.py +0 -0
  19. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_utils.py +0 -0
  20. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/_web_fetch.py +0 -0
  21. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/CoVe.py +0 -0
  22. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/__init__.py +0 -0
  23. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/all_calls.py +0 -0
  24. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/image_CoVe.py +0 -0
  25. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/image_stepback.py +0 -0
  26. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/pdf_CoVe.py +0 -0
  27. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/pdf_stepback.py +0 -0
  28. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/stepback.py +0 -0
  29. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/calls/top_n.py +0 -0
  30. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/classify.py +0 -0
  31. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/explore.py +0 -0
  32. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/extract.py +0 -0
  33. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/image_functions.py +0 -0
  34. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/images/circle.png +0 -0
  35. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/images/cube.png +0 -0
  36. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/images/diamond.png +0 -0
  37. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/images/overlapping_pentagons.png +0 -0
  38. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/images/rectangles.png +0 -0
  39. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/model_reference_list.py +0 -0
  40. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/pdf_functions.py +0 -0
  41. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/prompt_tune.py +0 -0
  42. {cat_stack-1.0.3 → cat_stack-1.0.5}/src/cat_stack/summarize.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cat-stack
3
- Version: 1.0.3
3
+ Version: 1.0.5
4
4
  Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
5
5
  Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
6
6
  Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
@@ -1,7 +1,7 @@
1
1
  # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
2
2
  #
3
3
  # SPDX-License-Identifier: GPL-3.0-or-later
4
- __version__ = "1.0.3"
4
+ __version__ = "1.0.5"
5
5
  __author__ = "Chris Soria"
6
6
  __email__ = "chrissoria@berkeley.edu"
7
7
  __title__ = "cat-stack"
@@ -36,18 +36,59 @@ __all__ = [
36
36
  # HuggingFace Endpoint Auto-Detection
37
37
  # =============================================================================
38
38
 
39
# Known HuggingFace inference-provider router suffix -> base URL of that
# provider's OpenAI-compatible endpoint (without the trailing
# "/chat/completions"). This mapping is the single source of truth for which
# ":suffix" values are recognised as routers: adding an entry here is enough
# for _parse_hf_model_suffix() to start accepting it.
_HF_ROUTER_ENDPOINTS = {
    "novita": "https://router.huggingface.co/novita/v3/openai",
    "together": "https://router.huggingface.co/together/v1",
    "sambanova": "https://router.huggingface.co/sambanova/v1",
    "cerebras": "https://router.huggingface.co/cerebras/v1",
    "fireworks": "https://router.huggingface.co/fireworks/v1",
}


def _parse_hf_model_suffix(model: str) -> tuple:
    """
    Split an optional ":router" suffix off a HuggingFace model name.

    Examples:
        "Qwen/Qwen3-VL-235B:novita" -> ("Qwen/Qwen3-VL-235B", "novita")
        "meta-llama/Llama-3-8B"     -> ("meta-llama/Llama-3-8B", None)
        "llama3:8b"                 -> ("llama3:8b", None)

    Args:
        model: Model name, optionally ending in ":<router>".

    Returns:
        (clean_model_name, router_name_or_None). The router name is
        lower-cased. When no known router suffix is present the input is
        returned unchanged with None as the router.
    """
    # Only names in "org/model" form are considered, so that tags such as
    # "llama3:8b" (no slash) are never mistaken for a router suffix.
    if not isinstance(model, str) or "/" not in model:
        return model, None

    # Split on the LAST colon only; everything before it is the model name.
    base, colon, suffix = model.rpartition(":")
    router = suffix.strip().lower()

    # The suffix counts as a router only if it is a key of the endpoint
    # mapping; any other ":something" is left as part of the model name.
    if colon and base and router in _HF_ROUTER_ENDPOINTS:
        return base, router
    return model, None
69
+
70
+
39
71
  def _detect_huggingface_endpoint(api_key: str, model: str) -> str:
40
72
  """
41
73
  Test which HuggingFace endpoint works for this model.
42
- Tries generic router first, then Together.
74
+
75
+ If the model name has a router suffix (e.g., ":novita"), route directly
76
+ to that provider's endpoint. Otherwise tries generic router, then Together.
43
77
 
44
78
  Args:
45
79
  api_key: HuggingFace API key
46
- model: Model name to test
80
+ model: Model name to test (may include :router suffix)
47
81
 
48
82
  Returns:
49
83
  Base URL for the working endpoint (without /chat/completions)
50
84
  """
85
+ clean_model, router = _parse_hf_model_suffix(model)
86
+
87
+ # If explicit router suffix, use that endpoint directly
88
+ if router and router in _HF_ROUTER_ENDPOINTS:
89
+ return _HF_ROUTER_ENDPOINTS[router]
90
+
91
+ # Otherwise auto-detect
51
92
  endpoints = [
52
93
  "https://router.huggingface.co/v1/chat/completions",
53
94
  "https://router.huggingface.co/together/v1/chat/completions",
@@ -59,7 +100,7 @@ def _detect_huggingface_endpoint(api_key: str, model: str) -> str:
59
100
  }
60
101
 
61
102
  payload = {
62
- "model": model,
103
+ "model": clean_model,
63
104
  "messages": [{"role": "user", "content": "hi"}],
64
105
  "max_tokens": 5
65
106
  }
@@ -145,13 +186,19 @@ class UnifiedLLMClient:
145
186
  def __init__(self, provider: str, api_key: str, model: str):
146
187
  self.provider = provider.lower()
147
188
  self.api_key = api_key
148
- self.model = model
189
+
190
+ # Strip router suffix from model name and detect endpoint
191
+ clean_model, router = _parse_hf_model_suffix(model)
192
+ self.model = clean_model if self.provider == "huggingface" else model
149
193
 
150
194
  # Auto-detect HuggingFace endpoint
151
195
  if self.provider == "huggingface":
152
196
  detected_url = _detect_huggingface_endpoint(api_key, model)
153
197
  if "together" in detected_url:
154
198
  self.provider = "huggingface-together"
199
+ elif router and router in _HF_ROUTER_ENDPOINTS:
200
+ # Use the router-specific endpoint as a custom provider config
201
+ self._custom_endpoint = _HF_ROUTER_ENDPOINTS[router] + "/chat/completions"
155
202
 
156
203
  if self.provider not in PROVIDER_CONFIG:
157
204
  raise ValueError(f"Unsupported provider: {provider}. "
@@ -161,7 +208,8 @@ class UnifiedLLMClient:
161
208
 
162
209
  def _get_endpoint(self) -> str:
163
210
  """Get the API endpoint, substituting model if needed."""
164
- endpoint = self.config["endpoint"]
211
+ # Use custom endpoint if set (e.g., for HuggingFace router suffixes)
212
+ endpoint = getattr(self, "_custom_endpoint", None) or self.config["endpoint"]
165
213
  if "{model}" in endpoint:
166
214
  endpoint = endpoint.format(model=self.model)
167
215
  return endpoint
@@ -762,6 +762,24 @@ def explore_common_categories(
762
762
  # Second-pass semantic merge prompt
763
763
  seed_list = result["Category"].head(max_categories * 3).tolist()
764
764
 
765
+ if specificity == "specific":
766
+ name_instruction = (
767
+ "Keep category names DETAILED and DESCRIPTIVE with examples. "
768
+ "Each category name MUST include a brief clarifying phrase using "
769
+ "'such as' or parenthetical examples. For example:\n"
770
+ " - 'Residential Zoning Changes (e.g., rezoning parcels, density adjustments)'\n"
771
+ " - 'Construction Contract Extensions (e.g., timeline amendments, scope changes)'\n"
772
+ " - 'Environmental Compliance (e.g., stormwater regulations, habitat protections)'\n"
773
+ "Do NOT use short generic labels like 'Zoning' or 'Contracts'. "
774
+ "Every category must be specific enough that a reader immediately "
775
+ "understands what types of documents belong in it."
776
+ )
777
+ else:
778
+ name_instruction = (
779
+ "Keep category names broad and general. "
780
+ "Use the most frequent or clearest label when merging."
781
+ )
782
+
765
783
  second_prompt = f"""
766
784
  You are a data analyst reviewing categorized text data.
767
785
 
@@ -774,9 +792,8 @@ Critical Instructions:
774
792
  - "breakup/household conflict" = "relationship problems"
775
793
  3) When merging:
776
794
  - Combine frequencies mentally
777
- - Keep the most frequent OR clearest label
778
795
  - Each concept appears ONLY ONCE
779
- 4) Keep category names {specificity}.
796
+ 4) {name_instruction}
780
797
  5) Return ONLY a numbered list of {max_categories} categories. No extra text.
781
798
 
782
799
  Pre-processed Categories (sorted by frequency, top sample):
@@ -820,13 +837,17 @@ Output:
820
837
 
821
838
  print("\nTop categories:\n" + "\n".join(f"{i+1}. {c}" for i, c in enumerate(final[:max_categories])))
822
839
 
840
+ top = final[:max_categories]
841
+
823
842
  if filename:
824
- result.to_csv(filename, index=False)
825
- print(f"\nResults saved to {filename}")
843
+ import pandas as _pd
844
+ top_df = _pd.DataFrame({"rank": range(1, len(top) + 1), "category": top})
845
+ top_df.to_csv(filename, index=False)
846
+ print(f"\nTop {len(top)} categories saved to {filename}")
826
847
 
827
848
  return {
828
849
  "counts_df": result,
829
- "top_categories": final[:max_categories],
850
+ "top_categories": top,
830
851
  "raw_top_text": top_categories_text
831
852
  }
832
853
 
@@ -1313,6 +1313,38 @@ Provide concise summaries that capture essential information.
1313
1313
  return messages
1314
1314
 
1315
1315
 
1316
def _extract_json_for_summary(reply: str) -> str:
    """Extract the first JSON object from a model reply, leaving its text intact.

    Unlike extract_json() (designed for classification 0/1 values), this
    returns the object exactly as the model wrote it, so spaces, brackets,
    braces and newlines inside string values are preserved.

    Args:
        reply: Raw model output. May be None, may contain <think>...</think>
            blocks, markdown fences or prose around the JSON object.

    Returns:
        The substring of the reply that forms the first JSON object, or
        '{"summary": ""}' when the reply is None or contains no JSON object.
    """
    import json
    import re as _re

    empty_result = '{"summary": ""}'
    if reply is None:
        return empty_result

    # Strip thinking tags if present (Qwen3, DeepSeek, etc.)
    text = _re.sub(r'<think>.*?</think>', '', reply, flags=_re.DOTALL).strip()

    # Let the JSON decoder find where each candidate object ends. It tracks
    # string quoting, so a '{' or '}' inside a summary cannot cut the object
    # short the way plain brace counting would. strict=False accepts raw
    # control characters (e.g. literal newlines) inside string values.
    decoder = json.JSONDecoder(strict=False)
    start = text.find("{")
    while start != -1:
        try:
            value, end = decoder.raw_decode(text, start)
        except ValueError:
            # Not JSON at this brace (e.g. "{note}" in prose); try the next one.
            pass
        else:
            if isinstance(value, dict):
                return text[start:end]
        start = text.find("{", start + 1)

    return empty_result
1346
+
1347
+
1316
1348
  def extract_summary_from_json(json_str: str) -> tuple:
1317
1349
  """
1318
1350
  Extract summary from JSON response.
@@ -1329,6 +1361,11 @@ def extract_summary_from_json(json_str: str) -> tuple:
1329
1361
  summary = data["summary"]
1330
1362
  if isinstance(summary, str) and summary.strip():
1331
1363
  return True, summary.strip()
1364
+ elif isinstance(summary, list):
1365
+ # Model returned summary as a list of strings (e.g., bullet points)
1366
+ joined = "\n".join(str(s) for s in summary if s)
1367
+ if joined.strip():
1368
+ return True, joined.strip()
1332
1369
  return False, None
1333
1370
  except (json.JSONDecodeError, TypeError):
1334
1371
  return False, None
@@ -1744,6 +1781,117 @@ def _prepare_page_data(
1744
1781
  # Image-Specific Functions
1745
1782
  # =============================================================================
1746
1783
 
1784
def build_image_summarization_prompt(
    image_data: dict,
    input_description: str = "",
    summary_instructions: str = "",
    max_length: int = None,
    focus: str = None,
    provider: str = "openai",
    chain_of_thought: bool = False,
    context_prompt: bool = False,
    step_back_prompt: bool = False,
    stepback_insights: dict = None,
    model_name: str = None,
) -> list:
    """
    Build the summarization prompt for an image.

    Parallel to build_pdf_summarization_prompt() but for standalone images.

    Args:
        image_data: Dict from _prepare_image_data() containing:
            - encoded_image: Base64 encoded image
            - extension: Image file extension (without dot)
        input_description: Description of what the images contain
        summary_instructions: Specific instructions (e.g., format/tone)
        max_length: Maximum summary length in words
        focus: What to focus on in the summary
        provider: Provider name for format-specific handling
        chain_of_thought: Whether to use step-by-step reasoning
        context_prompt: Whether to add expert context prefix
        step_back_prompt: Whether step-back prompting is enabled
        stepback_insights: Dict mapping model name to a
            (step-back question, insight) pair
        model_name: Current model name (for step-back lookup)

    Returns:
        List of message dicts for the LLM. The final message is a user
        message whose content is [text part, image part]; the image part
        uses the "anthropic", "google" or OpenAI-style layout depending on
        provider. When step-back insights exist for model_name, a
        user/assistant pair carrying them comes first.
    """
    focus_instruction = f", focusing on {focus}" if focus else ""
    length_instruction = f"\n\nKeep the summary under {max_length} words." if max_length else ""
    custom_instructions = f"\n\nAdditional instructions: {summary_instructions}" if summary_instructions else ""
    context_line = f"Image context: {input_description}" if input_description else ""

    if chain_of_thought:
        base_text = f"""You are an image summarization assistant.
Task: Examine the attached image and provide a concise summary{focus_instruction}.

{context_line}

Let's analyze step by step:
1. First, identify the main subject and visual elements in the image
2. Then, extract the key information, text, or message conveyed
3. Finally, synthesize into a concise summary{length_instruction}{custom_instructions}

Provide your answer in JSON format: {{"summary": "your summary here"}}"""
    else:
        base_text = f"""You are an image summarization assistant.
Task: Examine the attached image and provide a concise summary{focus_instruction}.

{context_line}{length_instruction}{custom_instructions}

Provide your answer in JSON format: {{"summary": "your summary here"}}"""

    if context_prompt:
        context = """You are an expert at analyzing and describing visual content.
Focus on accuracy, key details, and any text visible in the image.

"""
        base_text = context + base_text

    messages = []

    # Replay the step-back exchange first so the model sees its own insight
    # before the actual summarization request.
    if step_back_prompt and stepback_insights and model_name in stepback_insights:
        sb_question, sb_insight = stepback_insights[model_name]
        messages.append({"role": "user", "content": sb_question})
        messages.append({"role": "assistant", "content": sb_insight})

    encoded = image_data.get("encoded_image", "")

    # A file extension is not always a valid MIME subtype: "jpg" must be sent
    # as "image/jpeg" ("image/jpg" is not a registered media type). Normalise
    # case and a stray leading dot as well; unknown extensions pass through.
    raw_ext = str(image_data.get("extension") or "png").lower().lstrip(".")
    subtype = {"jpg": "jpeg", "jpe": "jpeg", "tif": "tiff"}.get(raw_ext, raw_ext)
    mime_type = f"image/{subtype}"

    text_part = {"type": "text", "text": base_text}
    if provider == "anthropic":
        image_part = {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": mime_type,
                "data": encoded,
            },
        }
    elif provider == "google":
        image_part = {
            "type": "inline_data",
            "mime_type": mime_type,
            "data": encoded,
        }
    else:
        # OpenAI-style providers take the image as a data URL.
        encoded_url = f"data:{mime_type};base64,{encoded}"
        image_part = {"type": "image_url", "image_url": {"url": encoded_url, "detail": "high"}}

    messages.append({"role": "user", "content": [text_part, image_part]})
    return messages
1893
+
1894
+
1747
1895
  def build_image_classification_prompt(
1748
1896
  image_data: dict,
1749
1897
  categories_str: str,
@@ -3774,7 +3922,7 @@ def summarize_ensemble(
3774
3922
  return (model_name, '{"summary": ""}', error)
3775
3923
 
3776
3924
  # Extract JSON from response
3777
- json_str = extract_json(response)
3925
+ json_str = _extract_json_for_summary(response)
3778
3926
 
3779
3927
  return (model_name, json_str, None)
3780
3928
 
@@ -3782,6 +3930,65 @@ def summarize_ensemble(
3782
3930
  error_msg = str(e)
3783
3931
  return (model_name, '{"summary": ""}', error_msg)
3784
3932
 
3933
+ elif is_image_mode and isinstance(item, tuple) and len(item) == 2:
3934
+ # IMAGE MODE: item is (image_path, image_label)
3935
+ image_path, image_label = item
3936
+
3937
+ try:
3938
+ image_data = _prepare_image_data(image_path, image_label)
3939
+ if image_data.get("error"):
3940
+ return (model_name, '{"summary": ""}', image_data["error"])
3941
+
3942
+ messages = build_image_summarization_prompt(
3943
+ image_data=image_data,
3944
+ input_description=input_description,
3945
+ summary_instructions=summary_instructions,
3946
+ max_length=max_length,
3947
+ focus=focus,
3948
+ provider=cfg["provider"],
3949
+ chain_of_thought=chain_of_thought,
3950
+ context_prompt=context_prompt,
3951
+ step_back_prompt=step_back_prompt,
3952
+ stepback_insights=stepback_insights,
3953
+ model_name=model_name,
3954
+ )
3955
+
3956
+ client = UnifiedLLMClient(
3957
+ provider=cfg["provider"],
3958
+ api_key=cfg["api_key"],
3959
+ model=cfg["model"],
3960
+ )
3961
+
3962
+ json_schema = json_schemas[model_name]
3963
+ effective_thinking = thinking_budget if cfg["provider"] in ("google", "openai", "anthropic", "huggingface", "huggingface-together") else None
3964
+
3965
+ if cfg["provider"] == "google":
3966
+ response = _call_google_multimodal(
3967
+ client=client,
3968
+ messages=messages,
3969
+ json_schema=json_schema,
3970
+ creativity=creativity,
3971
+ thinking_budget=effective_thinking or 0,
3972
+ max_retries=max_retries,
3973
+ )
3974
+ else:
3975
+ response, error = client.complete(
3976
+ messages=messages,
3977
+ json_schema=json_schema,
3978
+ creativity=creativity,
3979
+ thinking_budget=effective_thinking,
3980
+ max_retries=max_retries,
3981
+ )
3982
+
3983
+ if error:
3984
+ return (model_name, '{"summary": ""}', error)
3985
+
3986
+ json_str = _extract_json_for_summary(response)
3987
+ return (model_name, json_str, None)
3988
+
3989
+ except Exception as e:
3990
+ return (model_name, '{"summary": ""}', str(e))
3991
+
3785
3992
  else:
3786
3993
  # TEXT MODE: Original text handling
3787
3994
  # Skip empty/null items
@@ -3827,7 +4034,7 @@ def summarize_ensemble(
3827
4034
  return (model_name, '{"summary": ""}', error)
3828
4035
 
3829
4036
  # Extract JSON from response
3830
- json_str = extract_json(response)
4037
+ json_str = _extract_json_for_summary(response)
3831
4038
 
3832
4039
  return (model_name, json_str, None)
3833
4040
 
@@ -4162,7 +4369,7 @@ Provide your answer in JSON format: {{"summary": "your synthesized summary"}}"""
4162
4369
  max_retries=max_retries,
4163
4370
  )
4164
4371
 
4165
- json_str = extract_json(response)
4372
+ json_str = _extract_json_for_summary(response)
4166
4373
  is_valid, summary = extract_summary_from_json(json_str)
4167
4374
 
4168
4375
  if is_valid:
File without changes
File without changes
File without changes
File without changes