cat-stack 2.0.0b4__tar.gz → 2.0.0b5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/PKG-INFO +62 -1
  2. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/README.md +61 -0
  3. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/__about__.py +1 -1
  4. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/_providers.py +21 -1
  5. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/.gitignore +0 -0
  6. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/LICENSE +0 -0
  7. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/pyproject.toml +0 -0
  8. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/cat_stack/__init__.py +0 -0
  9. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/__init__.py +0 -0
  10. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/_batch.py +0 -0
  11. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/_category_analysis.py +0 -0
  12. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/_chunked.py +0 -0
  13. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/_embeddings.py +0 -0
  14. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/_formatter.py +0 -0
  15. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/_pilot_test.py +0 -0
  16. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/_prompts.py +0 -0
  17. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/_review_ui.py +0 -0
  18. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/_tiebreaker.py +0 -0
  19. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/_utils.py +0 -0
  20. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/_web_fetch.py +0 -0
  21. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/_wrapper_helpers.py +0 -0
  22. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/calls/CoVe.py +0 -0
  23. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/calls/__init__.py +0 -0
  24. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/calls/image_CoVe.py +0 -0
  25. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/calls/image_stepback.py +0 -0
  26. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/calls/pdf_CoVe.py +0 -0
  27. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/calls/pdf_stepback.py +0 -0
  28. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/calls/stepback.py +0 -0
  29. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/calls/top_n.py +0 -0
  30. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/classify.py +0 -0
  31. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/collapse_themes.py +0 -0
  32. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/explore.py +0 -0
  33. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/extract.py +0 -0
  34. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/image_functions.py +0 -0
  35. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/images/circle.png +0 -0
  36. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/images/cube.png +0 -0
  37. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/images/diamond.png +0 -0
  38. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/images/overlapping_pentagons.png +0 -0
  39. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/images/rectangles.png +0 -0
  40. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/model_reference_list.py +0 -0
  41. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/pdf_functions.py +0 -0
  42. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/prompt_tune.py +0 -0
  43. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/summarize.py +0 -0
  44. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/text_functions.py +0 -0
  45. {cat_stack-2.0.0b4 → cat_stack-2.0.0b5}/src/catstack/text_functions_ensemble.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cat-stack
3
- Version: 2.0.0b4
3
+ Version: 2.0.0b5
4
4
  Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
5
5
  Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
6
6
  Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
@@ -154,6 +154,67 @@ cat.explore(
154
154
  )
155
155
  ```
156
156
 
157
+ ### `collapse_themes()`
158
+ Consolidate a long, redundant list of extracted category labels (e.g. the output of `explore()`) into a smaller, deduplicated taxonomy. Runs the semantic merge iteratively, then applies a single deterministic embedding re-merge over the whole result to collapse cross-batch semantic siblings (e.g. "tension" / "estrangement") that batched passes leave separate. Tuned to err toward over-segmentation (keeping categories) rather than over-merging.
159
+
160
+ ```python
161
+ # Basic: aggressive merge, auto-stop at the quality peak
162
+ cat.collapse_themes(
163
+ input_data=raw_labels, # list[str] or a frequency Series/dict
164
+ api_key=key,
165
+ description="Why did you move?", # the survey question / context
166
+ aggressive=True,
167
+ passes="auto",
168
+ user_model="gpt-4o",
169
+ )
170
+ ```
171
+
172
+ ```python
173
+ # Per-step model assignment: a cheap model thins restatements,
174
+ # a stronger model does the conceptual merge (providers can differ)
175
+ cat.collapse_themes(
176
+ input_data=raw_labels,
177
+ api_key=key,
178
+ description="Why did you move?",
179
+ aggressive=True,
180
+ passes="auto",
181
+ unique_model="Qwen/Qwen2.5-72B-Instruct:together",
182
+ unique_model_source="huggingface",
183
+ unique_passes=1,
184
+ merge_model="Qwen/Qwen3.6-35B-A3B:together",
185
+ merge_model_source="huggingface",
186
+ max_workers=8,
187
+ )
188
+ ```
189
+
190
+ **Parameters**
191
+
192
+ | Parameter | Default | Description |
193
+ | --- | --- | --- |
194
+ | `input_data` | — | List of category labels, or a frequency `Series`/`dict` (`label -> count`). |
195
+ | `api_key` | `None` | API key for the LLM provider (required). |
196
+ | `description` | `""` | The survey question or context, used in the merge prompt. |
197
+ | `passes` | `1` | Number of merge iterations, or `"auto"` to iterate until the embedding-quality benchmark peaks. |
198
+ | `max_passes` | `10` | Cap on iterations when `passes="auto"`. |
199
+ | `batch_size` | `40` | Labels per LLM chunk (`ceil(n / batch_size)` calls per pass). |
200
+ | `aggressive` | `False` | `True` = conceptual-merge prompt (compress related labels); `False` = extract-unique (faithful thinning, removes restatements only). |
201
+ | `dedupe_threshold` | `0.95` | Jaro-Winkler similarity at/above which normalized labels are deduped (`1.0` = exact only). |
202
+ | `embedding_merge_threshold` | `0.92` | Cosine similarity at/above which labels are merged in the pre-LLM embedding step. `None`/`>=1.0` disables it. |
203
+ | `shuffle` | `True` | Randomize order each pass so batch composition varies (improves convergence stability). |
204
+ | `final_consolidation` | `0.82` | Cosine threshold for one greedy global embedding re-merge after all passes, collapsing cross-batch duplicates. Conservative by design (errs toward keeping categories). `False`/`None` skips it. |
205
+ | `user_model` | `"gpt-4o"` | Model for the merge phase when `merge_model` is `None`. Use a capable model — small models can produce degenerate output. |
206
+ | `model_source` | `"auto"` | Provider for `user_model` (`"auto"`, `"openai"`, `"huggingface"`, …). |
207
+ | `unique_model` | `None` | If set, run an initial extract-unique thinning phase on this (typically cheaper) model before the merge phase. `None` skips the phase (backward compatible). |
208
+ | `unique_model_source` | `"auto"` | Provider for `unique_model` — can differ from the merge phase. |
209
+ | `unique_passes` | `1` | Number of thinning passes when `unique_model` is set. |
210
+ | `merge_model` | `None` | Model for the merge phase; falls back to `user_model` when `None`. |
211
+ | `merge_model_source` | `"auto"` | Provider for `merge_model`. |
212
+ | `creativity` | `0` | Temperature (`0` = deterministic). |
213
+ | `max_workers` | `1` | Batches processed concurrently per pass. |
214
+ | `random_state` | `None` | Seed for shuffling (per-pass seed = `random_state + pass`). |
215
+ | `filename` | `None` | Optional CSV path to save the final list. |
216
+ | `progress_callback` | `None` | Optional `callback(pass, passes, label)` for progress reporting. |
217
+
157
218
  ### `summarize()`
158
219
  Summarize text or PDF documents, with optional multi-model ensemble.
159
220
 
@@ -118,6 +118,67 @@ cat.explore(
118
118
  )
119
119
  ```
120
120
 
121
+ ### `collapse_themes()`
122
+ Consolidate a long, redundant list of extracted category labels (e.g. the output of `explore()`) into a smaller, deduplicated taxonomy. Runs the semantic merge iteratively, then applies a single deterministic embedding re-merge over the whole result to collapse cross-batch semantic siblings (e.g. "tension" / "estrangement") that batched passes leave separate. Tuned to err toward over-segmentation (keeping categories) rather than over-merging.
123
+
124
+ ```python
125
+ # Basic: aggressive merge, auto-stop at the quality peak
126
+ cat.collapse_themes(
127
+ input_data=raw_labels, # list[str] or a frequency Series/dict
128
+ api_key=key,
129
+ description="Why did you move?", # the survey question / context
130
+ aggressive=True,
131
+ passes="auto",
132
+ user_model="gpt-4o",
133
+ )
134
+ ```
135
+
136
+ ```python
137
+ # Per-step model assignment: a cheap model thins restatements,
138
+ # a stronger model does the conceptual merge (providers can differ)
139
+ cat.collapse_themes(
140
+ input_data=raw_labels,
141
+ api_key=key,
142
+ description="Why did you move?",
143
+ aggressive=True,
144
+ passes="auto",
145
+ unique_model="Qwen/Qwen2.5-72B-Instruct:together",
146
+ unique_model_source="huggingface",
147
+ unique_passes=1,
148
+ merge_model="Qwen/Qwen3.6-35B-A3B:together",
149
+ merge_model_source="huggingface",
150
+ max_workers=8,
151
+ )
152
+ ```
153
+
154
+ **Parameters**
155
+
156
+ | Parameter | Default | Description |
157
+ | --- | --- | --- |
158
+ | `input_data` | — | List of category labels, or a frequency `Series`/`dict` (`label -> count`). |
159
+ | `api_key` | `None` | API key for the LLM provider (required). |
160
+ | `description` | `""` | The survey question or context, used in the merge prompt. |
161
+ | `passes` | `1` | Number of merge iterations, or `"auto"` to iterate until the embedding-quality benchmark peaks. |
162
+ | `max_passes` | `10` | Cap on iterations when `passes="auto"`. |
163
+ | `batch_size` | `40` | Labels per LLM chunk (`ceil(n / batch_size)` calls per pass). |
164
+ | `aggressive` | `False` | `True` = conceptual-merge prompt (compress related labels); `False` = extract-unique (faithful thinning, removes restatements only). |
165
+ | `dedupe_threshold` | `0.95` | Jaro-Winkler similarity at/above which normalized labels are deduped (`1.0` = exact only). |
166
+ | `embedding_merge_threshold` | `0.92` | Cosine similarity at/above which labels are merged in the pre-LLM embedding step. `None`/`>=1.0` disables it. |
167
+ | `shuffle` | `True` | Randomize order each pass so batch composition varies (improves convergence stability). |
168
+ | `final_consolidation` | `0.82` | Cosine threshold for one greedy global embedding re-merge after all passes, collapsing cross-batch duplicates. Conservative by design (errs toward keeping categories). `False`/`None` skips it. |
169
+ | `user_model` | `"gpt-4o"` | Model for the merge phase when `merge_model` is `None`. Use a capable model — small models can produce degenerate output. |
170
+ | `model_source` | `"auto"` | Provider for `user_model` (`"auto"`, `"openai"`, `"huggingface"`, …). |
171
+ | `unique_model` | `None` | If set, run an initial extract-unique thinning phase on this (typically cheaper) model before the merge phase. `None` skips the phase (backward compatible). |
172
+ | `unique_model_source` | `"auto"` | Provider for `unique_model` — can differ from the merge phase. |
173
+ | `unique_passes` | `1` | Number of thinning passes when `unique_model` is set. |
174
+ | `merge_model` | `None` | Model for the merge phase; falls back to `user_model` when `None`. |
175
+ | `merge_model_source` | `"auto"` | Provider for `merge_model`. |
176
+ | `creativity` | `0` | Temperature (`0` = deterministic). |
177
+ | `max_workers` | `1` | Batches processed concurrently per pass. |
178
+ | `random_state` | `None` | Seed for shuffling (per-pass seed = `random_state + pass`). |
179
+ | `filename` | `None` | Optional CSV path to save the final list. |
180
+ | `progress_callback` | `None` | Optional `callback(pass, passes, label)` for progress reporting. |
181
+
121
182
  ### `summarize()`
122
183
  Summarize text or PDF documents, with optional multi-model ensemble.
123
184
 
@@ -1,7 +1,7 @@
1
1
  # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
2
2
  #
3
3
  # SPDX-License-Identifier: GPL-3.0-or-later
4
- __version__ = "2.0.0b4"
4
+ __version__ = "2.0.0b5"
5
5
  __author__ = "Chris Soria"
6
6
  __email__ = "chrissoria@berkeley.edu"
7
7
  __title__ = "cat-stack"
@@ -333,6 +333,14 @@ def _detect_huggingface_endpoint(api_key: str, model: str, skip: set = None) ->
333
333
  headers = {
334
334
  "Content-Type": "application/json",
335
335
  "Authorization": f"Bearer {api_key}",
336
+ # Match the main request path: featherless's WAF 403s the default
337
+ # python-requests agent, which would make this probe wrongly skip a
338
+ # working endpoint.
339
+ "User-Agent": (
340
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
341
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
342
+ "Chrome/120.0.0.0 Safari/537.36"
343
+ ),
336
344
  }
337
345
  payload = {
338
346
  "model": clean_model,
@@ -526,7 +534,19 @@ class UnifiedLLMClient:
526
534
 
527
535
  def _get_headers(self) -> dict:
528
536
  """Build request headers for the provider."""
529
- headers = {"Content-Type": "application/json"}
537
+ # Send a browser-like User-Agent. Some providers fronted by a WAF
538
+ # (notably the HuggingFace router's featherless-ai backend) intermittently
539
+ # 403 the default `python-requests/x.y` agent via a Cloudflare bot rule,
540
+ # which surfaces as spurious classification failures. A standard UA is
541
+ # accepted everywhere and costs nothing on providers that don't care.
542
+ headers = {
543
+ "Content-Type": "application/json",
544
+ "User-Agent": (
545
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
546
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
547
+ "Chrome/120.0.0.0 Safari/537.36"
548
+ ),
549
+ }
530
550
  auth_header = self.config["auth_header"]
531
551
  auth_prefix = self.config["auth_prefix"]
532
552
 
File without changes
File without changes
File without changes