cat-stack 1.0.7__tar.gz → 1.0.10__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. {cat_stack-1.0.7 → cat_stack-1.0.10}/PKG-INFO +2 -4
  2. {cat_stack-1.0.7 → cat_stack-1.0.10}/pyproject.toml +1 -3
  3. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/__about__.py +1 -1
  4. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_providers.py +5 -10
  5. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_utils.py +24 -4
  6. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/summarize.py +18 -0
  7. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/text_functions.py +23 -4
  8. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/text_functions_ensemble.py +36 -8
  9. {cat_stack-1.0.7 → cat_stack-1.0.10}/.gitignore +0 -0
  10. {cat_stack-1.0.7 → cat_stack-1.0.10}/LICENSE +0 -0
  11. {cat_stack-1.0.7 → cat_stack-1.0.10}/README.md +0 -0
  12. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/__init__.py +0 -0
  13. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_batch.py +0 -0
  14. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_category_analysis.py +0 -0
  15. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_chunked.py +0 -0
  16. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_embeddings.py +0 -0
  17. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_formatter.py +0 -0
  18. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_pilot_test.py +0 -0
  19. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_review_ui.py +0 -0
  20. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_tiebreaker.py +0 -0
  21. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_web_fetch.py +0 -0
  22. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/CoVe.py +0 -0
  23. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/__init__.py +0 -0
  24. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/all_calls.py +0 -0
  25. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/image_CoVe.py +0 -0
  26. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/image_stepback.py +0 -0
  27. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/pdf_CoVe.py +0 -0
  28. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/pdf_stepback.py +0 -0
  29. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/stepback.py +0 -0
  30. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/top_n.py +0 -0
  31. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/classify.py +0 -0
  32. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/explore.py +0 -0
  33. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/extract.py +0 -0
  34. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/image_functions.py +0 -0
  35. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/images/circle.png +0 -0
  36. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/images/cube.png +0 -0
  37. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/images/diamond.png +0 -0
  38. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/images/overlapping_pentagons.png +0 -0
  39. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/images/rectangles.png +0 -0
  40. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/model_reference_list.py +0 -0
  41. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/pdf_functions.py +0 -0
  42. {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/prompt_tune.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cat-stack
3
- Version: 1.0.7
3
+ Version: 1.0.10
4
4
  Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
5
5
  Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
6
6
  Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
@@ -19,10 +19,8 @@ Classifier: Programming Language :: Python :: 3.12
19
19
  Classifier: Programming Language :: Python :: Implementation :: CPython
20
20
  Classifier: Programming Language :: Python :: Implementation :: PyPy
21
21
  Requires-Python: >=3.8
22
- Requires-Dist: anthropic
23
- Requires-Dist: openai
24
22
  Requires-Dist: pandas
25
- Requires-Dist: perplexityai
23
+ Requires-Dist: regex
26
24
  Requires-Dist: requests
27
25
  Requires-Dist: tqdm
28
26
  Provides-Extra: docx
@@ -28,9 +28,7 @@ dependencies = [
28
28
  "pandas",
29
29
  "tqdm",
30
30
  "requests",
31
- "openai",
32
- "anthropic",
33
- "perplexityai"
31
+ "regex",
34
32
  ]
35
33
 
36
34
  [project.optional-dependencies]
@@ -1,7 +1,7 @@
1
1
  # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
2
2
  #
3
3
  # SPDX-License-Identifier: GPL-3.0-or-later
4
- __version__ = "1.0.7"
4
+ __version__ = "1.0.10"
5
5
  __author__ = "Chris Soria"
6
6
  __email__ = "chrissoria@berkeley.edu"
7
7
  __title__ = "cat-stack"
@@ -187,18 +187,13 @@ class UnifiedLLMClient:
187
187
  self.provider = provider.lower()
188
188
  self.api_key = api_key
189
189
 
190
- # Strip router suffix from model name and detect endpoint
191
- clean_model, router = _parse_hf_model_suffix(model)
192
- self.model = clean_model if self.provider == "huggingface" else model
190
+ # Keep full model name with router suffix: the generic HF router
191
+ # uses the suffix (e.g. :novita, :together) for routing.
192
+ self.model = model
193
193
 
194
- # Auto-detect HuggingFace endpoint
194
+ # Auto-detect HuggingFace endpoint (but always use generic router)
195
195
  if self.provider == "huggingface":
196
- detected_url = _detect_huggingface_endpoint(api_key, model)
197
- if "together" in detected_url:
198
- self.provider = "huggingface-together"
199
- elif router and router in _HF_ROUTER_ENDPOINTS:
200
- # Use the router-specific endpoint as a custom provider config
201
- self._custom_endpoint = _HF_ROUTER_ENDPOINTS[router] + "/chat/completions"
196
+ _detect_huggingface_endpoint(api_key, model)
202
197
 
203
198
  if self.provider not in PROVIDER_CONFIG:
204
199
  raise ValueError(f"Unsupported provider: {provider}. "
@@ -92,17 +92,37 @@ def validate_classification_json(json_str: str, num_categories: int) -> tuple[bo
92
92
  if not isinstance(parsed, dict):
93
93
  return False, None
94
94
 
95
+ # Build a mapping from numeric prefix to value, handling keys like
96
+ # "1", "1.", "1. Category name", etc.
97
+ numeric_map = {}
98
+ for key, val in parsed.items():
99
+ # Extract leading number from key
100
+ stripped = str(key).strip()
101
+ num_part = ""
102
+ for ch in stripped:
103
+ if ch.isdigit():
104
+ num_part += ch
105
+ else:
106
+ break
107
+ if num_part:
108
+ numeric_map[num_part] = val
109
+
95
110
  # Check that all expected keys are present and values are "0" or "1"
96
111
  for i in range(1, num_categories + 1):
97
112
  key = str(i)
98
- if key not in parsed:
113
+ if key not in parsed and key not in numeric_map:
99
114
  return False, None
100
- val = str(parsed[key]).strip()
115
+ raw_val = parsed.get(key, numeric_map.get(key))
116
+ val = str(raw_val).strip()
101
117
  if val not in ("0", "1"):
102
118
  return False, None
103
119
 
104
- # Normalize values to strings
105
- normalized = {str(i): str(parsed[str(i)]).strip() for i in range(1, num_categories + 1)}
120
+ # Normalize values to strings, preferring exact key match then numeric prefix
121
+ normalized = {}
122
+ for i in range(1, num_categories + 1):
123
+ key = str(i)
124
+ raw_val = parsed.get(key, numeric_map.get(key))
125
+ normalized[key] = str(raw_val).strip()
106
126
  return True, normalized
107
127
 
108
128
  except (json.JSONDecodeError, KeyError, TypeError):
@@ -91,6 +91,8 @@ def summarize(
91
91
  Background, Key Provisions, Stakeholders/Impact, Implementation)
92
92
  - "detailed-report": Exhaustive report enumerating every provision,
93
93
  with an additional Details section for exceptions and cross-references
94
+ - "threads": Social media post for Threads (strict 500-character limit; the prompt asks for under 400),
95
+ punchy and engaging, plain language, no hashtags/emojis
94
96
  - "alt-text": Factual visual description for blind/visually impaired
95
97
  users — no interpretation, only what is literally shown
96
98
  max_length (int): Maximum summary length in words
@@ -288,6 +290,22 @@ def summarize(
288
290
  ),
289
291
  "max_length": None,
290
292
  },
293
+ "threads": {
294
+ "instructions": (
295
+ "Write a social media post summarizing this content for Threads. "
296
+ "STRICT LIMIT: The entire output must be under 400 characters including spaces. "
297
+ "Structure:\n"
298
+ "- First line: A single standalone sentence that gives the high-level takeaway. "
299
+ "Start with who is acting and what they did, "
300
+ "e.g., 'The Senate just introduced a bill that...', 'House Republicans passed a measure to...', "
301
+ "'Congress is moving to...'. This sentence must make sense completely on its own.\n"
302
+ "- Then leave a blank line (two newlines).\n"
303
+ "- Then 2-3 short sentences with key supporting details — what it does, who it affects, why it matters.\n\n"
304
+ "No hashtags, no emojis, no bullet points — just clean, "
305
+ "compelling text. Use plain language."
306
+ ),
307
+ "max_length": 80, # ~500 chars at ~6 chars/word
308
+ },
291
309
  # Keep "report" as alias for backward compat
292
310
  "report": {
293
311
  "instructions": (
@@ -173,17 +173,36 @@ def validate_classification_json(json_str: str, num_categories: int) -> tuple[bo
173
173
  if not isinstance(parsed, dict):
174
174
  return False, None
175
175
 
176
+ # Build a mapping from numeric prefix to value, handling keys like
177
+ # "1", "1.", "1. Category name", etc.
178
+ numeric_map = {}
179
+ for key, val in parsed.items():
180
+ stripped = str(key).strip()
181
+ num_part = ""
182
+ for ch in stripped:
183
+ if ch.isdigit():
184
+ num_part += ch
185
+ else:
186
+ break
187
+ if num_part:
188
+ numeric_map[num_part] = val
189
+
176
190
  # Check that all expected keys are present and values are "0" or "1"
177
191
  for i in range(1, num_categories + 1):
178
192
  key = str(i)
179
- if key not in parsed:
193
+ if key not in parsed and key not in numeric_map:
180
194
  return False, None
181
- val = str(parsed[key]).strip()
195
+ raw_val = parsed.get(key, numeric_map.get(key))
196
+ val = str(raw_val).strip()
182
197
  if val not in ("0", "1"):
183
198
  return False, None
184
199
 
185
- # Normalize values to strings
186
- normalized = {str(i): str(parsed[str(i)]).strip() for i in range(1, num_categories + 1)}
200
+ # Normalize values to strings, preferring exact key match then numeric prefix
201
+ normalized = {}
202
+ for i in range(1, num_categories + 1):
203
+ key = str(i)
204
+ raw_val = parsed.get(key, numeric_map.get(key))
205
+ normalized[key] = str(raw_val).strip()
187
206
  return True, normalized
188
207
 
189
208
  except (json.JSONDecodeError, KeyError, TypeError):
@@ -688,6 +688,31 @@ def prepare_model_configs(models: list, auto_download: bool = False) -> list:
688
688
  return configs
689
689
 
690
690
 
691
+ def _normalize_json_keys(parsed: dict, expected_keys: set) -> dict:
692
+ """Normalize JSON keys by extracting leading numeric prefix.
693
+
694
+ Handles keys like "1. Category name" -> "1", which some HuggingFace
695
+ models produce when using json_object mode (no strict schema enforcement).
696
+ """
697
+ normalized = {}
698
+ for k, v in parsed.items():
699
+ stripped = str(k).strip()
700
+ # Extract leading digits
701
+ num_part = ""
702
+ for ch in stripped:
703
+ if ch.isdigit():
704
+ num_part += ch
705
+ else:
706
+ break
707
+ norm_key = num_part if num_part else stripped
708
+ # Prefer exact match; only use normalized key if exact not already present
709
+ if stripped in expected_keys:
710
+ normalized[stripped] = v
711
+ elif norm_key in expected_keys and norm_key not in normalized:
712
+ normalized[norm_key] = v
713
+ return normalized
714
+
715
+
691
716
  def aggregate_results(
692
717
  model_results: dict,
693
718
  categories: list,
@@ -725,18 +750,19 @@ def aggregate_results(
725
750
  else:
726
751
  try:
727
752
  parsed = json.loads(json_str)
753
+
754
+ normalized_parsed = _normalize_json_keys(parsed, expected_keys)
755
+
728
756
  # Accept if at least one key is a valid numbered category
729
757
  # with a 0/1 value. Models may only return present categories
730
758
  # (e.g. {"3": "1"}) — missing keys default to 0 downstream.
731
- # Strip out any keys with invalid values so they also
732
- # default to 0 cleanly instead of hitting error paths.
733
759
  valid_count = sum(
734
- 1 for k, v in parsed.items()
760
+ 1 for k, v in normalized_parsed.items()
735
761
  if k in expected_keys and str(v).strip() in ("0", "1")
736
762
  )
737
763
  if valid_count > 0:
738
764
  cleaned = {
739
- k: str(v).strip() for k, v in parsed.items()
765
+ k: str(v).strip() for k, v in normalized_parsed.items()
740
766
  if k in expected_keys and str(v).strip() in ("0", "1")
741
767
  }
742
768
  successful[model_name] = cleaned
@@ -3213,9 +3239,9 @@ Categorize text responses {cove_categorize}:
3213
3239
  # Check JSON parsing AND schema validation
3214
3240
  try:
3215
3241
  parsed = json.loads(json_str)
3216
- # At least one valid numbered key with 0/1 value
3242
+ normalized = _normalize_json_keys(parsed, expected_keys)
3217
3243
  valid_count = sum(
3218
- 1 for k, v in parsed.items()
3244
+ 1 for k, v in normalized.items()
3219
3245
  if k in expected_keys and str(v).strip() in ("0", "1")
3220
3246
  )
3221
3247
  if valid_count == 0:
@@ -3252,8 +3278,9 @@ Categorize text responses {cove_categorize}:
3252
3278
  # Verify JSON is valid and has correct schema
3253
3279
  try:
3254
3280
  parsed = json.loads(json_result)
3281
+ normalized = _normalize_json_keys(parsed, expected_keys)
3255
3282
  valid_count = sum(
3256
- 1 for k, v in parsed.items()
3283
+ 1 for k, v in normalized.items()
3257
3284
  if k in expected_keys and str(v).strip() in ("0", "1")
3258
3285
  )
3259
3286
  if valid_count > 0:
@@ -3267,8 +3294,9 @@ Categorize text responses {cove_categorize}:
3267
3294
  if error is None:
3268
3295
  try:
3269
3296
  parsed = json.loads(json_result)
3297
+ normalized = _normalize_json_keys(parsed, expected_keys)
3270
3298
  valid_count = sum(
3271
- 1 for k, v in parsed.items()
3299
+ 1 for k, v in normalized.items()
3272
3300
  if k in expected_keys and str(v).strip() in ("0", "1")
3273
3301
  )
3274
3302
  if valid_count > 0:
File without changes
File without changes
File without changes