cat-stack 1.0.7__tar.gz → 1.0.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cat_stack-1.0.7 → cat_stack-1.0.10}/PKG-INFO +2 -4
- {cat_stack-1.0.7 → cat_stack-1.0.10}/pyproject.toml +1 -3
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/__about__.py +1 -1
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_providers.py +5 -10
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_utils.py +24 -4
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/summarize.py +18 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/text_functions.py +23 -4
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/text_functions_ensemble.py +36 -8
- {cat_stack-1.0.7 → cat_stack-1.0.10}/.gitignore +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/LICENSE +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/README.md +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/__init__.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_batch.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_category_analysis.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_chunked.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_embeddings.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_formatter.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_pilot_test.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_review_ui.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_tiebreaker.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/_web_fetch.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/CoVe.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/__init__.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/all_calls.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/image_CoVe.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/image_stepback.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/pdf_CoVe.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/pdf_stepback.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/stepback.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/calls/top_n.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/classify.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/explore.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/extract.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/image_functions.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/images/circle.png +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/images/cube.png +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/images/diamond.png +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/images/overlapping_pentagons.png +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/images/rectangles.png +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/model_reference_list.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/pdf_functions.py +0 -0
- {cat_stack-1.0.7 → cat_stack-1.0.10}/src/cat_stack/prompt_tune.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cat-stack
|
|
3
|
-
Version: 1.0.7
|
|
3
|
+
Version: 1.0.10
|
|
4
4
|
Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
|
|
5
5
|
Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
|
|
@@ -19,10 +19,8 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
19
19
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
20
20
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
21
21
|
Requires-Python: >=3.8
|
|
22
|
-
Requires-Dist: anthropic
|
|
23
|
-
Requires-Dist: openai
|
|
24
22
|
Requires-Dist: pandas
|
|
25
|
-
Requires-Dist:
|
|
23
|
+
Requires-Dist: regex
|
|
26
24
|
Requires-Dist: requests
|
|
27
25
|
Requires-Dist: tqdm
|
|
28
26
|
Provides-Extra: docx
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
4
|
-
__version__ = "1.0.7"
|
|
4
|
+
__version__ = "1.0.10"
|
|
5
5
|
__author__ = "Chris Soria"
|
|
6
6
|
__email__ = "chrissoria@berkeley.edu"
|
|
7
7
|
__title__ = "cat-stack"
|
|
@@ -187,18 +187,13 @@ class UnifiedLLMClient:
|
|
|
187
187
|
self.provider = provider.lower()
|
|
188
188
|
self.api_key = api_key
|
|
189
189
|
|
|
190
|
-
#
|
|
191
|
-
|
|
192
|
-
self.model =
|
|
190
|
+
# Keep full model name with router suffix — the generic HF router
|
|
191
|
+
# uses the suffix (e.g. :novita, :together) for routing.
|
|
192
|
+
self.model = model
|
|
193
193
|
|
|
194
|
-
# Auto-detect HuggingFace endpoint
|
|
194
|
+
# Auto-detect HuggingFace endpoint (but always use generic router)
|
|
195
195
|
if self.provider == "huggingface":
|
|
196
|
-
|
|
197
|
-
if "together" in detected_url:
|
|
198
|
-
self.provider = "huggingface-together"
|
|
199
|
-
elif router and router in _HF_ROUTER_ENDPOINTS:
|
|
200
|
-
# Use the router-specific endpoint as a custom provider config
|
|
201
|
-
self._custom_endpoint = _HF_ROUTER_ENDPOINTS[router] + "/chat/completions"
|
|
196
|
+
_detect_huggingface_endpoint(api_key, model)
|
|
202
197
|
|
|
203
198
|
if self.provider not in PROVIDER_CONFIG:
|
|
204
199
|
raise ValueError(f"Unsupported provider: {provider}. "
|
|
@@ -92,17 +92,37 @@ def validate_classification_json(json_str: str, num_categories: int) -> tuple[bo
|
|
|
92
92
|
if not isinstance(parsed, dict):
|
|
93
93
|
return False, None
|
|
94
94
|
|
|
95
|
+
# Build a mapping from numeric prefix to value, handling keys like
|
|
96
|
+
# "1", "1.", "1. Category name", etc.
|
|
97
|
+
numeric_map = {}
|
|
98
|
+
for key, val in parsed.items():
|
|
99
|
+
# Extract leading number from key
|
|
100
|
+
stripped = str(key).strip()
|
|
101
|
+
num_part = ""
|
|
102
|
+
for ch in stripped:
|
|
103
|
+
if ch.isdigit():
|
|
104
|
+
num_part += ch
|
|
105
|
+
else:
|
|
106
|
+
break
|
|
107
|
+
if num_part:
|
|
108
|
+
numeric_map[num_part] = val
|
|
109
|
+
|
|
95
110
|
# Check that all expected keys are present and values are "0" or "1"
|
|
96
111
|
for i in range(1, num_categories + 1):
|
|
97
112
|
key = str(i)
|
|
98
|
-
if key not in parsed:
|
|
113
|
+
if key not in parsed and key not in numeric_map:
|
|
99
114
|
return False, None
|
|
100
|
-
|
|
115
|
+
raw_val = parsed.get(key, numeric_map.get(key))
|
|
116
|
+
val = str(raw_val).strip()
|
|
101
117
|
if val not in ("0", "1"):
|
|
102
118
|
return False, None
|
|
103
119
|
|
|
104
|
-
# Normalize values to strings
|
|
105
|
-
normalized = {
|
|
120
|
+
# Normalize values to strings, preferring exact key match then numeric prefix
|
|
121
|
+
normalized = {}
|
|
122
|
+
for i in range(1, num_categories + 1):
|
|
123
|
+
key = str(i)
|
|
124
|
+
raw_val = parsed.get(key, numeric_map.get(key))
|
|
125
|
+
normalized[key] = str(raw_val).strip()
|
|
106
126
|
return True, normalized
|
|
107
127
|
|
|
108
128
|
except (json.JSONDecodeError, KeyError, TypeError):
|
|
@@ -91,6 +91,8 @@ def summarize(
|
|
|
91
91
|
Background, Key Provisions, Stakeholders/Impact, Implementation)
|
|
92
92
|
- "detailed-report": Exhaustive report enumerating every provision,
|
|
93
93
|
with an additional Details section for exceptions and cross-references
|
|
94
|
+
- "threads": Social media post for Threads (strict 500-character limit),
|
|
95
|
+
punchy and engaging, plain language, no hashtags/emojis
|
|
94
96
|
- "alt-text": Factual visual description for blind/visually impaired
|
|
95
97
|
users — no interpretation, only what is literally shown
|
|
96
98
|
max_length (int): Maximum summary length in words
|
|
@@ -288,6 +290,22 @@ def summarize(
|
|
|
288
290
|
),
|
|
289
291
|
"max_length": None,
|
|
290
292
|
},
|
|
293
|
+
"threads": {
|
|
294
|
+
"instructions": (
|
|
295
|
+
"Write a social media post summarizing this content for Threads. "
|
|
296
|
+
"STRICT LIMIT: The entire output must be under 400 characters including spaces. "
|
|
297
|
+
"Structure:\n"
|
|
298
|
+
"- First line: A single standalone sentence that gives the high-level takeaway. "
|
|
299
|
+
"Start with who is acting and what they did, "
|
|
300
|
+
"e.g., 'The Senate just introduced a bill that...', 'House Republicans passed a measure to...', "
|
|
301
|
+
"'Congress is moving to...'. This sentence must make sense completely on its own.\n"
|
|
302
|
+
"- Then leave a blank line (two newlines).\n"
|
|
303
|
+
"- Then 2-3 short sentences with key supporting details — what it does, who it affects, why it matters.\n\n"
|
|
304
|
+
"No hashtags, no emojis, no bullet points — just clean, "
|
|
305
|
+
"compelling text. Use plain language."
|
|
306
|
+
),
|
|
307
|
+
"max_length": 80, # ~500 chars at ~6 chars/word
|
|
308
|
+
},
|
|
291
309
|
# Keep "report" as alias for backward compat
|
|
292
310
|
"report": {
|
|
293
311
|
"instructions": (
|
|
@@ -173,17 +173,36 @@ def validate_classification_json(json_str: str, num_categories: int) -> tuple[bo
|
|
|
173
173
|
if not isinstance(parsed, dict):
|
|
174
174
|
return False, None
|
|
175
175
|
|
|
176
|
+
# Build a mapping from numeric prefix to value, handling keys like
|
|
177
|
+
# "1", "1.", "1. Category name", etc.
|
|
178
|
+
numeric_map = {}
|
|
179
|
+
for key, val in parsed.items():
|
|
180
|
+
stripped = str(key).strip()
|
|
181
|
+
num_part = ""
|
|
182
|
+
for ch in stripped:
|
|
183
|
+
if ch.isdigit():
|
|
184
|
+
num_part += ch
|
|
185
|
+
else:
|
|
186
|
+
break
|
|
187
|
+
if num_part:
|
|
188
|
+
numeric_map[num_part] = val
|
|
189
|
+
|
|
176
190
|
# Check that all expected keys are present and values are "0" or "1"
|
|
177
191
|
for i in range(1, num_categories + 1):
|
|
178
192
|
key = str(i)
|
|
179
|
-
if key not in parsed:
|
|
193
|
+
if key not in parsed and key not in numeric_map:
|
|
180
194
|
return False, None
|
|
181
|
-
|
|
195
|
+
raw_val = parsed.get(key, numeric_map.get(key))
|
|
196
|
+
val = str(raw_val).strip()
|
|
182
197
|
if val not in ("0", "1"):
|
|
183
198
|
return False, None
|
|
184
199
|
|
|
185
|
-
# Normalize values to strings
|
|
186
|
-
normalized = {
|
|
200
|
+
# Normalize values to strings, preferring exact key match then numeric prefix
|
|
201
|
+
normalized = {}
|
|
202
|
+
for i in range(1, num_categories + 1):
|
|
203
|
+
key = str(i)
|
|
204
|
+
raw_val = parsed.get(key, numeric_map.get(key))
|
|
205
|
+
normalized[key] = str(raw_val).strip()
|
|
187
206
|
return True, normalized
|
|
188
207
|
|
|
189
208
|
except (json.JSONDecodeError, KeyError, TypeError):
|
|
@@ -688,6 +688,31 @@ def prepare_model_configs(models: list, auto_download: bool = False) -> list:
|
|
|
688
688
|
return configs
|
|
689
689
|
|
|
690
690
|
|
|
691
|
+
def _normalize_json_keys(parsed: dict, expected_keys: set) -> dict:
|
|
692
|
+
"""Normalize JSON keys by extracting leading numeric prefix.
|
|
693
|
+
|
|
694
|
+
Handles keys like "1. Category name" -> "1", which some HuggingFace
|
|
695
|
+
models produce when using json_object mode (no strict schema enforcement).
|
|
696
|
+
"""
|
|
697
|
+
normalized = {}
|
|
698
|
+
for k, v in parsed.items():
|
|
699
|
+
stripped = str(k).strip()
|
|
700
|
+
# Extract leading digits
|
|
701
|
+
num_part = ""
|
|
702
|
+
for ch in stripped:
|
|
703
|
+
if ch.isdigit():
|
|
704
|
+
num_part += ch
|
|
705
|
+
else:
|
|
706
|
+
break
|
|
707
|
+
norm_key = num_part if num_part else stripped
|
|
708
|
+
# Prefer exact match; only use normalized key if exact not already present
|
|
709
|
+
if stripped in expected_keys:
|
|
710
|
+
normalized[stripped] = v
|
|
711
|
+
elif norm_key in expected_keys and norm_key not in normalized:
|
|
712
|
+
normalized[norm_key] = v
|
|
713
|
+
return normalized
|
|
714
|
+
|
|
715
|
+
|
|
691
716
|
def aggregate_results(
|
|
692
717
|
model_results: dict,
|
|
693
718
|
categories: list,
|
|
@@ -725,18 +750,19 @@ def aggregate_results(
|
|
|
725
750
|
else:
|
|
726
751
|
try:
|
|
727
752
|
parsed = json.loads(json_str)
|
|
753
|
+
|
|
754
|
+
normalized_parsed = _normalize_json_keys(parsed, expected_keys)
|
|
755
|
+
|
|
728
756
|
# Accept if at least one key is a valid numbered category
|
|
729
757
|
# with a 0/1 value. Models may only return present categories
|
|
730
758
|
# (e.g. {"3": "1"}) — missing keys default to 0 downstream.
|
|
731
|
-
# Strip out any keys with invalid values so they also
|
|
732
|
-
# default to 0 cleanly instead of hitting error paths.
|
|
733
759
|
valid_count = sum(
|
|
734
|
-
1 for k, v in parsed.items()
|
|
760
|
+
1 for k, v in normalized_parsed.items()
|
|
735
761
|
if k in expected_keys and str(v).strip() in ("0", "1")
|
|
736
762
|
)
|
|
737
763
|
if valid_count > 0:
|
|
738
764
|
cleaned = {
|
|
739
|
-
k: str(v).strip() for k, v in parsed.items()
|
|
765
|
+
k: str(v).strip() for k, v in normalized_parsed.items()
|
|
740
766
|
if k in expected_keys and str(v).strip() in ("0", "1")
|
|
741
767
|
}
|
|
742
768
|
successful[model_name] = cleaned
|
|
@@ -3213,9 +3239,9 @@ Categorize text responses {cove_categorize}:
|
|
|
3213
3239
|
# Check JSON parsing AND schema validation
|
|
3214
3240
|
try:
|
|
3215
3241
|
parsed = json.loads(json_str)
|
|
3216
|
-
|
|
3242
|
+
normalized = _normalize_json_keys(parsed, expected_keys)
|
|
3217
3243
|
valid_count = sum(
|
|
3218
|
-
1 for k, v in parsed.items()
|
|
3244
|
+
1 for k, v in normalized.items()
|
|
3219
3245
|
if k in expected_keys and str(v).strip() in ("0", "1")
|
|
3220
3246
|
)
|
|
3221
3247
|
if valid_count == 0:
|
|
@@ -3252,8 +3278,9 @@ Categorize text responses {cove_categorize}:
|
|
|
3252
3278
|
# Verify JSON is valid and has correct schema
|
|
3253
3279
|
try:
|
|
3254
3280
|
parsed = json.loads(json_result)
|
|
3281
|
+
normalized = _normalize_json_keys(parsed, expected_keys)
|
|
3255
3282
|
valid_count = sum(
|
|
3256
|
-
1 for k, v in parsed.items()
|
|
3283
|
+
1 for k, v in normalized.items()
|
|
3257
3284
|
if k in expected_keys and str(v).strip() in ("0", "1")
|
|
3258
3285
|
)
|
|
3259
3286
|
if valid_count > 0:
|
|
@@ -3267,8 +3294,9 @@ Categorize text responses {cove_categorize}:
|
|
|
3267
3294
|
if error is None:
|
|
3268
3295
|
try:
|
|
3269
3296
|
parsed = json.loads(json_result)
|
|
3297
|
+
normalized = _normalize_json_keys(parsed, expected_keys)
|
|
3270
3298
|
valid_count = sum(
|
|
3271
|
-
1 for k, v in parsed.items()
|
|
3299
|
+
1 for k, v in normalized.items()
|
|
3272
3300
|
if k in expected_keys and str(v).strip() in ("0", "1")
|
|
3273
3301
|
)
|
|
3274
3302
|
if valid_count > 0:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|