cat-stack 2.0.0b1__tar.gz → 2.0.0b5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/PKG-INFO +62 -1
  2. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/README.md +61 -0
  3. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/__about__.py +1 -1
  4. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/_providers.py +21 -1
  5. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/collapse_themes.py +142 -27
  6. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/.gitignore +0 -0
  7. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/LICENSE +0 -0
  8. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/pyproject.toml +0 -0
  9. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/cat_stack/__init__.py +0 -0
  10. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/__init__.py +0 -0
  11. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/_batch.py +0 -0
  12. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/_category_analysis.py +0 -0
  13. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/_chunked.py +0 -0
  14. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/_embeddings.py +0 -0
  15. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/_formatter.py +0 -0
  16. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/_pilot_test.py +0 -0
  17. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/_prompts.py +0 -0
  18. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/_review_ui.py +0 -0
  19. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/_tiebreaker.py +0 -0
  20. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/_utils.py +0 -0
  21. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/_web_fetch.py +0 -0
  22. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/_wrapper_helpers.py +0 -0
  23. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/calls/CoVe.py +0 -0
  24. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/calls/__init__.py +0 -0
  25. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/calls/image_CoVe.py +0 -0
  26. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/calls/image_stepback.py +0 -0
  27. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/calls/pdf_CoVe.py +0 -0
  28. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/calls/pdf_stepback.py +0 -0
  29. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/calls/stepback.py +0 -0
  30. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/calls/top_n.py +0 -0
  31. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/classify.py +0 -0
  32. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/explore.py +0 -0
  33. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/extract.py +0 -0
  34. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/image_functions.py +0 -0
  35. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/images/circle.png +0 -0
  36. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/images/cube.png +0 -0
  37. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/images/diamond.png +0 -0
  38. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/images/overlapping_pentagons.png +0 -0
  39. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/images/rectangles.png +0 -0
  40. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/model_reference_list.py +0 -0
  41. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/pdf_functions.py +0 -0
  42. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/prompt_tune.py +0 -0
  43. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/summarize.py +0 -0
  44. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/text_functions.py +0 -0
  45. {cat_stack-2.0.0b1 → cat_stack-2.0.0b5}/src/catstack/text_functions_ensemble.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cat-stack
3
- Version: 2.0.0b1
3
+ Version: 2.0.0b5
4
4
  Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
5
5
  Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
6
6
  Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
@@ -154,6 +154,67 @@ cat.explore(
154
154
  )
155
155
  ```
156
156
 
157
+ ### `collapse_themes()`
158
+ Consolidate a long, redundant list of extracted category labels (e.g. the output of `explore()`) into a smaller, deduplicated taxonomy. Runs the semantic merge iteratively, then applies a single deterministic embedding re-merge over the whole result to collapse cross-batch lexical siblings (e.g. "tension" / "estrangement") that batched passes leave separate. Tuned to err toward over-segmentation (keeping categories) rather than over-merging.
159
+
160
+ ```python
161
+ # Basic: aggressive merge, auto-stop at the quality peak
162
+ cat.collapse_themes(
163
+ input_data=raw_labels, # list[str] or a frequency Series/dict
164
+ api_key=key,
165
+ description="Why did you move?", # the survey question / context
166
+ aggressive=True,
167
+ passes="auto",
168
+ user_model="gpt-4o",
169
+ )
170
+ ```
171
+
172
+ ```python
173
+ # Per-step model assignment: a cheap model thins restatements,
174
+ # a stronger model does the conceptual merge (providers can differ)
175
+ cat.collapse_themes(
176
+ input_data=raw_labels,
177
+ api_key=key,
178
+ description="Why did you move?",
179
+ aggressive=True,
180
+ passes="auto",
181
+ unique_model="Qwen/Qwen2.5-72B-Instruct:together",
182
+ unique_model_source="huggingface",
183
+ unique_passes=1,
184
+ merge_model="Qwen/Qwen3.6-35B-A3B:together",
185
+ merge_model_source="huggingface",
186
+ max_workers=8,
187
+ )
188
+ ```
189
+
190
+ **Parameters**
191
+
192
+ | Parameter | Default | Description |
193
+ | --- | --- | --- |
194
+ | `input_data` | — | List of category labels, or a frequency `Series`/`dict` (`label -> count`). |
195
+ | `api_key` | `None` | API key for the LLM provider. Required despite the `None` default: a `ValueError` is raised when it is missing or empty. |
196
+ | `description` | `""` | The survey question or context, used in the merge prompt. |
197
+ | `passes` | `1` | Number of merge iterations, or `"auto"` to iterate until the embedding-quality benchmark peaks. |
198
+ | `max_passes` | `10` | Cap on iterations when `passes="auto"`. |
199
+ | `batch_size` | `40` | Labels per LLM chunk (`ceil(n / batch_size)` calls per pass). |
200
+ | `aggressive` | `False` | `True` = conceptual-merge prompt (compress related labels); `False` = extract-unique (faithful thinning, removes restatements only). |
201
+ | `dedupe_threshold` | `0.95` | Jaro-Winkler similarity at/above which normalized labels are deduped (`1.0` = exact only). |
202
+ | `embedding_merge_threshold` | `0.92` | Cosine similarity at/above which labels are merged in the pre-LLM embedding step. `None`/`>=1.0` disables it. |
203
+ | `shuffle` | `True` | Randomize order each pass so batch composition varies (improves convergence stability). |
204
+ | `final_consolidation` | `0.82` | Cosine threshold for one greedy global embedding re-merge after all passes, collapsing cross-batch duplicates. Conservative by design (errs toward keeping categories). `False`/`None`, or any value `>= 1.0`, skips it. |
205
+ | `user_model` | `"gpt-4o"` | Model for the main collapse phase, used whenever `merge_model` is `None`. Use a capable model — small models can degenerate. |
206
+ | `model_source` | `"auto"` | Provider for `user_model` (`"auto"`, `"openai"`, `"huggingface"`, …). |
207
+ | `unique_model` | `None` | If set, run an initial extract-unique thinning phase on this (typically cheaper) model before the merge phase. `None` skips the phase (backward compatible). |
208
+ | `unique_model_source` | `"auto"` | Provider for `unique_model` — can differ from the merge phase. |
209
+ | `unique_passes` | `1` | Number of thinning passes when `unique_model` is set. |
210
+ | `merge_model` | `None` | Model for the merge phase; falls back to `user_model` when `None`. |
211
+ | `merge_model_source` | `"auto"` | Provider for `merge_model`. Only consulted when `merge_model` is set; otherwise `model_source` is used. |
212
+ | `creativity` | `0` | Temperature (`0` = deterministic). |
213
+ | `max_workers` | `1` | Batches processed concurrently per pass. |
214
+ | `random_state` | `None` | Seed for shuffling (per-pass seed = `random_state + pass`). |
215
+ | `filename` | `None` | Optional CSV path to save the final list. |
216
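+ | `progress_callback` | `None` | Optional `callback(current_pass, total_passes, label)` for progress reporting. With `passes="auto"`, `total_passes` is `max_passes`; the optional thinning phase reports under the label `"collapse_themes:unique"`. |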
217
+
157
218
  ### `summarize()`
158
219
  Summarize text or PDF documents, with optional multi-model ensemble.
159
220
 
@@ -118,6 +118,67 @@ cat.explore(
118
118
  )
119
119
  ```
120
120
 
121
+ ### `collapse_themes()`
122
+ Consolidate a long, redundant list of extracted category labels (e.g. the output of `explore()`) into a smaller, deduplicated taxonomy. Runs the semantic merge iteratively, then applies a single deterministic embedding re-merge over the whole result to collapse cross-batch lexical siblings (e.g. "tension" / "estrangement") that batched passes leave separate. Tuned to err toward over-segmentation (keeping categories) rather than over-merging.
123
+
124
+ ```python
125
+ # Basic: aggressive merge, auto-stop at the quality peak
126
+ cat.collapse_themes(
127
+ input_data=raw_labels, # list[str] or a frequency Series/dict
128
+ api_key=key,
129
+ description="Why did you move?", # the survey question / context
130
+ aggressive=True,
131
+ passes="auto",
132
+ user_model="gpt-4o",
133
+ )
134
+ ```
135
+
136
+ ```python
137
+ # Per-step model assignment: a cheap model thins restatements,
138
+ # a stronger model does the conceptual merge (providers can differ)
139
+ cat.collapse_themes(
140
+ input_data=raw_labels,
141
+ api_key=key,
142
+ description="Why did you move?",
143
+ aggressive=True,
144
+ passes="auto",
145
+ unique_model="Qwen/Qwen2.5-72B-Instruct:together",
146
+ unique_model_source="huggingface",
147
+ unique_passes=1,
148
+ merge_model="Qwen/Qwen3.6-35B-A3B:together",
149
+ merge_model_source="huggingface",
150
+ max_workers=8,
151
+ )
152
+ ```
153
+
154
+ **Parameters**
155
+
156
+ | Parameter | Default | Description |
157
+ | --- | --- | --- |
158
+ | `input_data` | — | List of category labels, or a frequency `Series`/`dict` (`label -> count`). |
159
+ | `api_key` | `None` | API key for the LLM provider. Required despite the `None` default: a `ValueError` is raised when it is missing or empty. |
160
+ | `description` | `""` | The survey question or context, used in the merge prompt. |
161
+ | `passes` | `1` | Number of merge iterations, or `"auto"` to iterate until the embedding-quality benchmark peaks. |
162
+ | `max_passes` | `10` | Cap on iterations when `passes="auto"`. |
163
+ | `batch_size` | `40` | Labels per LLM chunk (`ceil(n / batch_size)` calls per pass). |
164
+ | `aggressive` | `False` | `True` = conceptual-merge prompt (compress related labels); `False` = extract-unique (faithful thinning, removes restatements only). |
165
+ | `dedupe_threshold` | `0.95` | Jaro-Winkler similarity at/above which normalized labels are deduped (`1.0` = exact only). |
166
+ | `embedding_merge_threshold` | `0.92` | Cosine similarity at/above which labels are merged in the pre-LLM embedding step. `None`/`>=1.0` disables it. |
167
+ | `shuffle` | `True` | Randomize order each pass so batch composition varies (improves convergence stability). |
168
+ | `final_consolidation` | `0.82` | Cosine threshold for one greedy global embedding re-merge after all passes, collapsing cross-batch duplicates. Conservative by design (errs toward keeping categories). `False`/`None`, or any value `>= 1.0`, skips it. |
169
+ | `user_model` | `"gpt-4o"` | Model for the main collapse phase, used whenever `merge_model` is `None`. Use a capable model — small models can degenerate. |
170
+ | `model_source` | `"auto"` | Provider for `user_model` (`"auto"`, `"openai"`, `"huggingface"`, …). |
171
+ | `unique_model` | `None` | If set, run an initial extract-unique thinning phase on this (typically cheaper) model before the merge phase. `None` skips the phase (backward compatible). |
172
+ | `unique_model_source` | `"auto"` | Provider for `unique_model` — can differ from the merge phase. |
173
+ | `unique_passes` | `1` | Number of thinning passes when `unique_model` is set. |
174
+ | `merge_model` | `None` | Model for the merge phase; falls back to `user_model` when `None`. |
175
+ | `merge_model_source` | `"auto"` | Provider for `merge_model`. Only consulted when `merge_model` is set; otherwise `model_source` is used. |
176
+ | `creativity` | `0` | Temperature (`0` = deterministic). |
177
+ | `max_workers` | `1` | Batches processed concurrently per pass. |
178
+ | `random_state` | `None` | Seed for shuffling (per-pass seed = `random_state + pass`). |
179
+ | `filename` | `None` | Optional CSV path to save the final list. |
180
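+ | `progress_callback` | `None` | Optional `callback(current_pass, total_passes, label)` for progress reporting. With `passes="auto"`, `total_passes` is `max_passes`; the optional thinning phase reports under the label `"collapse_themes:unique"`. |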
181
+
121
182
  ### `summarize()`
122
183
  Summarize text or PDF documents, with optional multi-model ensemble.
123
184
 
@@ -1,7 +1,7 @@
1
1
  # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
2
2
  #
3
3
  # SPDX-License-Identifier: GPL-3.0-or-later
4
- __version__ = "2.0.0b1"
4
+ __version__ = "2.0.0b5"
5
5
  __author__ = "Chris Soria"
6
6
  __email__ = "chrissoria@berkeley.edu"
7
7
  __title__ = "cat-stack"
@@ -333,6 +333,14 @@ def _detect_huggingface_endpoint(api_key: str, model: str, skip: set = None) ->
333
333
  headers = {
334
334
  "Content-Type": "application/json",
335
335
  "Authorization": f"Bearer {api_key}",
336
+ # Match the main request path: featherless's WAF 403s the default
337
+ # python-requests agent, which would make this probe wrongly skip a
338
+ # working endpoint.
339
+ "User-Agent": (
340
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
341
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
342
+ "Chrome/120.0.0.0 Safari/537.36"
343
+ ),
336
344
  }
337
345
  payload = {
338
346
  "model": clean_model,
@@ -526,7 +534,19 @@ class UnifiedLLMClient:
526
534
 
527
535
  def _get_headers(self) -> dict:
528
536
  """Build request headers for the provider."""
529
- headers = {"Content-Type": "application/json"}
537
+ # Send a browser-like User-Agent. Some providers fronted by a WAF
538
+ # (notably the HuggingFace router's featherless-ai backend) intermittently
539
+ # 403 the default `python-requests/x.y` agent via a Cloudflare bot rule,
540
+ # which surfaces as spurious classification failures. A standard UA is
541
+ # accepted everywhere and costs nothing on providers that don't care.
542
+ headers = {
543
+ "Content-Type": "application/json",
544
+ "User-Agent": (
545
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
546
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
547
+ "Chrome/120.0.0.0 Safari/537.36"
548
+ ),
549
+ }
530
550
  auth_header = self.config["auth_header"]
531
551
  auth_prefix = self.config["auth_prefix"]
532
552
 
@@ -70,17 +70,21 @@ def _jw_dedupe(items, threshold):
70
70
  return out
71
71
 
72
72
 
73
- def _embedding_merge(items, threshold):
74
- """Greedy embedding clustering: drop labels whose cosine similarity to an
75
- already-kept label is >= threshold. Keeps the first-seen representative.
76
- Uses cat-stack's canonical BAAI/bge-small model (cached)."""
73
+ def _get_emb_model():
74
+ """Load (once) and return cat-stack's canonical BAAI/bge-small embedder."""
77
75
  global _EMB_MODEL
78
- if not threshold or threshold >= 1.0 or len(items) < 2:
79
- return items
80
76
  if _EMB_MODEL is None:
81
77
  from ._embeddings import load_embedding_model
82
78
  _EMB_MODEL = load_embedding_model()
83
- embs = _EMB_MODEL.encode(items, normalize_embeddings=True, show_progress_bar=False)
79
+ return _EMB_MODEL
80
+
81
+
82
+ def _embedding_merge(items, threshold):
83
+ """Greedy embedding clustering: drop labels whose cosine similarity to an
84
+ already-kept label is >= threshold. Keeps the first-seen representative."""
85
+ if not threshold or threshold >= 1.0 or len(items) < 2:
86
+ return items
87
+ embs = _get_emb_model().encode(items, normalize_embeddings=True, show_progress_bar=False)
84
88
  reps, rep_embs = [], []
85
89
  for it, e in zip(items, embs):
86
90
  if rep_embs and float(np.max(np.asarray(rep_embs) @ e)) >= threshold:
@@ -90,6 +94,29 @@ def _embedding_merge(items, threshold):
90
94
  return reps
91
95
 
92
96
 
97
+ def _quality(output, raw_embs, tau_cov=0.70, tau_red=0.85, beta=2.0):
98
+ """Deterministic quality of a candidate taxonomy vs the raw input themes:
99
+ coverage-weighted F-beta of recall=coverage_hard (share of raw within tau_cov
100
+ of some output) and precision=(1 - redundancy_rate), where redundancy_rate is the
101
+ share of outputs with a near-twin >= tau_red. Embedding-only — the convergence signal for passes='auto'.
102
+ """
103
+ if not output:
104
+ return 0.0
105
+ O = _get_emb_model().encode(list(output), normalize_embeddings=True, show_progress_bar=False)
106
+ coverage = float(((raw_embs @ O.T).max(axis=1) >= tau_cov).mean())
107
+ if len(output) > 1:
108
+ OO = O @ O.T
109
+ np.fill_diagonal(OO, -1.0)
110
+ redundancy = float((OO.max(axis=1) >= tau_red).mean())
111
+ else:
112
+ redundancy = 0.0
113
+ precision = 1.0 - redundancy
114
+ if coverage <= 0 or precision <= 0:
115
+ return 0.0
116
+ b2 = beta * beta
117
+ return (1 + b2) * precision * coverage / (b2 * precision + coverage)
118
+
119
+
93
120
  def _collapse_batch(client, batch, description, creativity, mode="unique"):
94
121
  """One LLM call on a single batch -> list[str].
95
122
 
@@ -254,13 +281,20 @@ def collapse_themes(
254
281
  api_key=None,
255
282
  description="",
256
283
  passes=1,
284
+ max_passes=10,
257
285
  batch_size=40,
258
286
  aggressive=False,
259
287
  dedupe_threshold=0.95,
260
288
  embedding_merge_threshold=0.92,
261
289
  shuffle=True,
290
+ final_consolidation=0.82,
262
291
  user_model="gpt-4o",
263
292
  model_source="auto",
293
+ unique_model=None,
294
+ unique_model_source="auto",
295
+ unique_passes=1,
296
+ merge_model=None,
297
+ merge_model_source="auto",
264
298
  creativity=0,
265
299
  max_workers=1,
266
300
  random_state=None,
@@ -296,7 +330,10 @@ def collapse_themes(
296
330
  description (str): Data/question context, injected into the prompt — e.g.
297
331
  the survey question the categories came from. Helps the model judge
298
332
  which distinctions matter.
299
- passes (int): Number of collapse iterations to run. Default 1.
333
+ passes (int | str): Number of collapse iterations, or "auto" to iterate
334
+ until the deterministic quality benchmark peaks (the recommended mode
335
+ for a final taxonomy — pair with aggressive=True). Default 1.
336
+ max_passes (int): Cap on iterations when passes="auto". Default 10.
300
337
  batch_size (int): Themes per LLM chunk (ceil(n / batch_size) calls per
301
338
  pass). Default 40.
302
339
  aggressive (bool): If True, use the conceptual-merge prompt (compress);
@@ -308,9 +345,28 @@ def collapse_themes(
308
345
  0.92. None or >=1.0 skips embeddings.
309
346
  shuffle (bool): Randomize order each pass so batch composition varies.
310
347
  Default True (improves convergence stability).
311
- user_model (str): Model name. Default "gpt-4o". Use a capable model —
312
- small models can degenerate into repetition.
348
+ final_consolidation (float): Cosine threshold for one greedy embedding
349
+ re-merge over the whole result after all passes, collapsing cross-batch
350
+ lexical-sibling duplicates that batched passes (and the auto loop) cannot
351
+ reach. Default 0.82 — deterministic and tuned to land just above the true
352
+ concept count (errs toward keeping categories; over-segmentation is
353
+ preferred over over-consolidation). False/None skips.
354
+ user_model (str): Model name for the merge phase. Default "gpt-4o". Use a
355
+ capable model — small models can degenerate into repetition.
313
356
  model_source (str): Provider — "auto", "openai", "huggingface", etc.
357
+ unique_model (str): If set, run an initial extract-unique thinning phase on
358
+ this (typically cheaper) model before the merge phase, allocating model
359
+ spend by task difficulty: a smaller model handles faithful restatement
360
+ removal, a stronger one handles conceptual merging. None (default) skips
361
+ the phase entirely (backward compatible). Recommended pairing:
362
+ unique_model = a 72B-class model, merge_model = a frontier model.
363
+ unique_model_source (str): Provider for unique_model. Default "auto" — can
364
+ differ from the merge phase, so the two phases may sit on different
365
+ providers.
366
+ unique_passes (int): Number of extract-unique passes in the thinning phase
367
+ when unique_model is set. Default 1.
368
+ merge_model (str): Model for the merge phase. Defaults to user_model when None.
369
+ merge_model_source (str): Provider for merge_model. Default "auto".
314
370
  creativity (float): Temperature. Default 0 (deterministic).
315
371
  max_workers (int): Batches processed concurrently per pass. Default 1.
316
372
  random_state (int): Seed for shuffling (per-pass seed = random_state + p).
@@ -325,37 +381,96 @@ def collapse_themes(
325
381
  >>> import cat_stack as cat
326
382
  >>> themes = cat.explore(df['responses'], description="Why did you move?",
327
383
  ... api_key=key)
328
- >>> # 1) thin faithfully, then 2) compress
329
- >>> thinned = cat.collapse_themes(themes, api_key=key,
330
- ... description="Why did you move?", passes=10, max_workers=8)
331
- >>> final = cat.collapse_themes(thinned, api_key=key,
332
- ... description="Why did you move?", passes=2, aggressive=True)
384
+ >>> # Recommended: aggressive merge, auto-stop at the quality peak
385
+ >>> taxonomy = cat.collapse_themes(
386
+ ... themes, api_key=key, description="Why did you move?",
387
+ ... aggressive=True, passes="auto", max_workers=8,
388
+ ... )
333
389
  """
334
390
  if not api_key:
335
391
  raise ValueError("collapse_themes() needs an api_key for the LLM call.")
336
392
 
337
393
  mode = "merge" if aggressive else "unique"
338
- provider = detect_provider(user_model, model_source)
339
- client = UnifiedLLMClient(provider=provider, api_key=api_key, model=user_model)
340
394
 
341
- current = input_data
342
- for p in range(passes):
343
- seed = None if random_state is None else random_state + p
344
- current = _collapse_once(
345
- client,
346
- current,
395
+ # The main (merge) phase runs on merge_model if given, else user_model. A separate
396
+ # cheaper model can handle the simpler unique-keeping phase via unique_model — per
397
+ # step the work differs in difficulty (faithful thinning is easy, conceptual
398
+ # merging is hard), so model spend can be allocated accordingly. Each phase
399
+ # resolves its own provider, so the two can sit on different providers.
400
+ merge_name = merge_model or user_model
401
+ merge_src = merge_model_source if merge_model else model_source
402
+ merge_provider = detect_provider(merge_name, merge_src)
403
+ client = UnifiedLLMClient(provider=merge_provider, api_key=api_key, model=merge_name)
404
+
405
+ def _run(cl, items, md, p):
406
+ return _collapse_once(
407
+ cl, items,
347
408
  description=description,
348
409
  batch_size=batch_size,
349
410
  dedupe_threshold=dedupe_threshold,
350
411
  embedding_merge_threshold=embedding_merge_threshold,
351
- mode=mode,
412
+ mode=md,
352
413
  shuffle=shuffle,
353
- random_state=seed,
414
+ random_state=(None if random_state is None else random_state + p),
354
415
  creativity=creativity,
355
416
  max_workers=max_workers,
356
417
  )
357
- if progress_callback:
358
- progress_callback(p + 1, passes, "collapse_themes")
418
+
419
+ def _pass(items, p):
420
+ return _run(client, items, mode, p)
421
+
422
+ current = input_data
423
+
424
+ # Phase 1 (optional): cheap unique-keeping thin. When unique_model is set, run
425
+ # `unique_passes` extract-unique passes on a separate (typically smaller, cheaper)
426
+ # model to strip restatement-level duplicates before the expensive merge phase.
427
+ # Skipped entirely when unique_model is None (fully backward compatible).
428
+ if unique_model:
429
+ u_provider = detect_provider(unique_model, unique_model_source)
430
+ u_client = UnifiedLLMClient(provider=u_provider, api_key=api_key, model=unique_model)
431
+ for p in range(int(unique_passes)):
432
+ current = _run(u_client, current, "unique", p)
433
+ if progress_callback:
434
+ progress_callback(p + 1, int(unique_passes), "collapse_themes:unique")
435
+ if passes == "auto":
436
+ # Iterate until the deterministic quality benchmark stops improving (the
437
+ # peak), capped at max_passes. Quality is scored vs the ORIGINAL input
438
+ # themes — embedding-only, model-independent at decision time. The peak is
439
+ # the principled stop (validated across surveys and list sizes).
440
+ raw_embs = _get_emb_model().encode(
441
+ list(_to_counts(input_data).keys()), normalize_embeddings=True,
442
+ show_progress_bar=False,
443
+ )
444
+ best, best_q = None, -1.0
445
+ for p in range(max_passes):
446
+ current = _pass(current, p)
447
+ q = _quality(current, raw_embs)
448
+ if progress_callback:
449
+ progress_callback(p + 1, max_passes, "collapse_themes")
450
+ if q < best_q:
451
+ break # quality dropped -> the previous pass was the peak
452
+ best, best_q = current, q
453
+ current = best if best is not None else current
454
+ else:
455
+ for p in range(int(passes)):
456
+ current = _pass(current, p)
457
+ if progress_callback:
458
+ progress_callback(p + 1, int(passes), "collapse_themes")
459
+
460
+ # Final global consolidation. Batched passes (and the auto loop) can only merge
461
+ # labels that share a batch, so cross-batch lexical siblings — e.g. "tension" vs
462
+ # "estrangement", which restate one concept but embed below the per-pass dedupe
463
+ # threshold — survive as separate themes, inflating the count above the true
464
+ # number of concepts. This applies one greedy embedding re-merge over the WHOLE
465
+ # result at a lower threshold, dropping each label that restates an already-kept
466
+ # one to bring the count closer to truth. Greedy (compares only against kept
467
+ # representatives, no transitive chaining) avoids blobbing related-but-distinct
468
+ # labels. It is deterministic (no extra LLM call, model-independent at decision
469
+ # time) and tuned to land just above the true count, so it errs toward KEEPING
470
+ # categories — over-segmentation is the preferred failure mode, not
471
+ # over-consolidation. Set final_consolidation=False to skip.
472
+ if final_consolidation and len(current) > 1:
473
+ current = _embedding_merge(current, final_consolidation)
359
474
 
360
475
  if filename:
361
476
  pd.DataFrame({"category": current}).to_csv(filename, index=False)
File without changes
File without changes
File without changes