cat-stack 1.6.9__tar.gz → 2.0.0b4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/PKG-INFO +1 -1
  2. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/__about__.py +1 -1
  3. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/__init__.py +2 -0
  4. cat_stack-2.0.0b4/src/catstack/collapse_themes.py +479 -0
  5. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/.gitignore +0 -0
  6. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/LICENSE +0 -0
  7. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/README.md +0 -0
  8. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/pyproject.toml +0 -0
  9. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/cat_stack/__init__.py +0 -0
  10. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/_batch.py +0 -0
  11. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/_category_analysis.py +0 -0
  12. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/_chunked.py +0 -0
  13. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/_embeddings.py +0 -0
  14. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/_formatter.py +0 -0
  15. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/_pilot_test.py +0 -0
  16. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/_prompts.py +0 -0
  17. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/_providers.py +0 -0
  18. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/_review_ui.py +0 -0
  19. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/_tiebreaker.py +0 -0
  20. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/_utils.py +0 -0
  21. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/_web_fetch.py +0 -0
  22. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/_wrapper_helpers.py +0 -0
  23. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/calls/CoVe.py +0 -0
  24. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/calls/__init__.py +0 -0
  25. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/calls/image_CoVe.py +0 -0
  26. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/calls/image_stepback.py +0 -0
  27. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/calls/pdf_CoVe.py +0 -0
  28. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/calls/pdf_stepback.py +0 -0
  29. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/calls/stepback.py +0 -0
  30. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/calls/top_n.py +0 -0
  31. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/classify.py +0 -0
  32. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/explore.py +0 -0
  33. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/extract.py +0 -0
  34. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/image_functions.py +0 -0
  35. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/images/circle.png +0 -0
  36. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/images/cube.png +0 -0
  37. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/images/diamond.png +0 -0
  38. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/images/overlapping_pentagons.png +0 -0
  39. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/images/rectangles.png +0 -0
  40. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/model_reference_list.py +0 -0
  41. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/pdf_functions.py +0 -0
  42. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/prompt_tune.py +0 -0
  43. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/summarize.py +0 -0
  44. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/text_functions.py +0 -0
  45. {cat_stack-1.6.9 → cat_stack-2.0.0b4}/src/catstack/text_functions_ensemble.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cat-stack
3
- Version: 1.6.9
3
+ Version: 2.0.0b4
4
4
  Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
5
5
  Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
6
6
  Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
@@ -1,7 +1,7 @@
1
1
  # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
2
2
  #
3
3
  # SPDX-License-Identifier: GPL-3.0-or-later
4
- __version__ = "1.6.9"
4
+ __version__ = "2.0.0b4"
5
5
  __author__ = "Chris Soria"
6
6
  __email__ = "chrissoria@berkeley.edu"
7
7
  __title__ = "cat-stack"
@@ -18,6 +18,7 @@ from .__about__ import (
18
18
  # Main entry points
19
19
  from .extract import extract
20
20
  from .explore import explore
21
+ from .collapse_themes import collapse_themes
21
22
  from .classify import classify
22
23
  from .summarize import summarize
23
24
  from .prompt_tune import prompt_tune
@@ -103,6 +104,7 @@ __all__ = [
103
104
  # Main entry points
104
105
  "extract",
105
106
  "explore",
107
+ "collapse_themes",
106
108
  "classify",
107
109
  "summarize",
108
110
  "prompt_tune",
@@ -0,0 +1,479 @@
1
+ """
2
+ Theme collapsing for CatLLM.
3
+
4
+ collapse_themes() takes an already-extracted list of category/theme strings (for
5
+ example the output of explore()) and iteratively consolidates near-duplicate /
6
+ synonymous labels into a smaller list. Each pass:
7
+
8
+ A. accept the list,
9
+ B. PRE-CLEAN before the model — normalize + Jaro-Winkler dedup (surface
10
+ variants) then embedding-merge (semantic near-duplicates),
11
+ C. split the cleaned list into batches of `batch_size`,
12
+ D. read every batch with one LLM call (extract-unique, or aggressive merge),
13
+ E. concatenate and dedupe into a single, smaller list.
14
+
15
+ `passes` iterations run in one call, randomizing batch composition each pass so
16
+ labels stranded in separate batches get fresh chances to meet and merge.
17
+ Provider-agnostic via the same dispatch classify()/explore() use.
18
+ """
19
+
20
+ import random
21
+ import re
22
+ import sys
23
+
24
+ import numpy as np
25
+ import pandas as pd
26
+ from jellyfish import jaro_winkler_similarity
27
+
28
+ from ._providers import UnifiedLLMClient, detect_provider
29
+ from ._utils import _clean_label
30
+
31
# Public API of this module.
__all__ = ["collapse_themes"]

# One line of a numbered list: optional leading whitespace, digits, a '.', ')'
# or '-' delimiter, then the label text (captured as group 1).
_LINE_PAT = re.compile(r"^\s*\d+\s*[\.\)\-]\s*(.+)$")

# Cached embedding model (loaded once per process by _get_emb_model()).
_EMB_MODEL = None
37
+
38
+
39
def _strip_parens(label):
    """Return `label` with every parenthetical aside removed.

    Each '(...)' group, together with any whitespace directly before it, is
    deleted and the result is trimmed. Parenthetical examples do not change
    which category a label names, so they are ignored for comparison.
    """
    without_asides = re.sub(r"\s*\([^)]*\)", "", label)
    return without_asides.strip()
42
+
43
+
44
def _norm_key(label):
    """Build the canonical dedup key for a label.

    The label is stripped of parentheticals and lowercased; '&', the word
    'and' (surrounded by whitespace) and '/' are all rewritten to one ' / '
    separator; the resulting parts are trimmed, emptied parts are dropped, and
    the rest are sorted so that "a and b" and "b / a" produce the same key.
    """
    text = _strip_parens(label).lower().strip()
    # Unify every supported separator spelling before splitting.
    text = re.sub(r"\s*&\s*|\s+and\s+|\s*/\s*", " / ", text)
    pieces = []
    for chunk in text.split("/"):
        chunk = chunk.strip()
        if chunk:
            pieces.append(chunk)
    pieces.sort()
    return " / ".join(pieces)
50
+
51
+
52
def _jw_dedupe(items, threshold):
    """Order-preserving dedup of labels by normalized key.

    Each label is reduced to a readable form (parentheticals stripped,
    lowercased, trimmed) and a canonical key (`_norm_key`). A label is dropped
    when its key equals an already-kept key, or — only when `threshold` is
    below 1.0 — when its Jaro-Winkler similarity to any kept key is at or above
    `threshold`. Labels whose readable form or key is empty are skipped.

    Args:
        items: Iterable of label strings, in priority order (first seen wins).
        threshold: Jaro-Winkler similarity cut-off; 1.0 or higher means
            exact-key matching only.

    Returns:
        list[str]: The readable forms of the kept labels, in input order.
    """
    fuzzy = threshold < 1.0  # loop-invariant: is near-match comparison enabled?
    kept_keys = []           # ordered keys, scanned for fuzzy comparison
    exact_keys = set()       # same keys, for O(1) exact-duplicate checks
    out = []
    for c in items:
        disp = _strip_parens(c).lower().strip()
        key = _norm_key(c)
        if not disp or not key:
            continue
        if key in exact_keys:
            continue
        if fuzzy and any(
            jaro_winkler_similarity(key, k) >= threshold for k in kept_keys
        ):
            continue
        exact_keys.add(key)
        kept_keys.append(key)
        out.append(disp)
    return out
71
+
72
+
73
def _get_emb_model():
    """Return the shared embedding model, loading it on first use.

    The model comes from `._embeddings.load_embedding_model()` (described in
    this module as cat-stack's canonical BAAI/bge-small embedder) and is cached
    in the module-level `_EMB_MODEL`, so later calls reuse the same object.
    """
    global _EMB_MODEL
    if _EMB_MODEL is not None:
        return _EMB_MODEL
    # Imported lazily so the embedding stack is only pulled in when needed.
    from ._embeddings import load_embedding_model
    _EMB_MODEL = load_embedding_model()
    return _EMB_MODEL
80
+
81
+
82
def _embedding_merge(items, threshold):
    """Greedy embedding-based dedup that keeps first-seen representatives.

    Labels are embedded (normalized) and visited in order; a label is dropped
    when its highest dot-product similarity to any already-kept label is at or
    above `threshold`. Comparison is only against kept representatives.

    Args:
        items: Sequence of label strings.
        threshold: Similarity cut-off. A falsy value or one >= 1.0 disables
            the step.

    Returns:
        The kept labels in input order. `items` itself is returned unchanged
        when the step is disabled or there are fewer than two labels.
    """
    skip = not threshold or threshold >= 1.0 or len(items) < 2
    if skip:
        return items
    vectors = _get_emb_model().encode(
        items, normalize_embeddings=True, show_progress_bar=False
    )
    kept_labels = []
    kept_vectors = []
    for label, vec in zip(items, vectors):
        if kept_vectors:
            closest = float(np.max(np.asarray(kept_vectors) @ vec))
            if closest >= threshold:
                continue  # restates an already-kept label
        kept_labels.append(label)
        kept_vectors.append(vec)
    return kept_labels
95
+
96
+
97
def _quality(output, raw_embs, tau_cov=0.70, tau_red=0.85, beta=2.0):
    """Score a candidate taxonomy against the raw input themes (embedding-only).

    The score is an F-beta combination of:
      * coverage  — share of raw themes whose best similarity to some output
        label is at least `tau_cov` (plays the role of recall);
      * precision — 1 minus the share of output labels that have another
        output label with similarity at least `tau_red` (a near-twin).

    Used as the convergence signal for passes='auto'.

    Args:
        output: Candidate labels.
        raw_embs: Embedding matrix of the raw themes (one row per theme);
            expected to be normalized the same way as the output embeddings.
        tau_cov: Similarity needed for a raw theme to count as covered.
        tau_red: Similarity at which two output labels count as redundant.
        beta: F-beta weight applied to coverage.

    Returns:
        float: 0.0 for an empty `output` or when coverage or precision is not
        positive; otherwise the F-beta score.
    """
    if not output:
        return 0.0
    out_embs = _get_emb_model().encode(
        list(output), normalize_embeddings=True, show_progress_bar=False
    )
    best_match = (raw_embs @ out_embs.T).max(axis=1)
    coverage = float((best_match >= tau_cov).mean())

    redundancy = 0.0
    if len(output) > 1:
        pairwise = out_embs @ out_embs.T
        # Mask self-similarity so a label is never its own near-twin.
        np.fill_diagonal(pairwise, -1.0)
        redundancy = float((pairwise.max(axis=1) >= tau_red).mean())

    precision = 1.0 - redundancy
    if coverage <= 0 or precision <= 0:
        return 0.0
    beta_sq = beta * beta
    return (1 + beta_sq) * precision * coverage / (beta_sq * precision + coverage)
118
+
119
+
120
def _collapse_batch(client, batch, description, creativity, mode="unique"):
    """Run one LLM call on a single batch of labels and return list[str].

    mode="unique": extract unique categories only (remove restatements, keep
        distinct ones). The result is forced to be a subset of the input.
    mode="merge": aggressively consolidate related labels into broader concepts
        while retaining meaningful distinctions.

    The prompt demands a strict numbered list and only lines matching that
    format are parsed, so stray prose in the reply is ignored.

    Guardrails (no data loss):
      * an empty batch returns [] without calling the model;
      * a failed call returns the batch unchanged (stripped, lowercased);
      * in "unique" mode, outputs are mapped back to input labels by normalized
        key, and the batch is returned unchanged if nothing matches;
      * in "merge" mode, a reply with no parseable labels also returns the
        batch unchanged instead of silently dropping every category in it.

    Args:
        client: Object exposing complete(messages=..., creativity=...,
            force_json=...) and returning a (reply, error) pair.
        batch: List of label strings.
        description: Optional context injected into the prompt.
        creativity: Passed through to client.complete.
        mode: "unique" (default) or "merge".

    Returns:
        list[str]: The labels that survive this batch.
    """
    if not batch:
        # Nothing to consolidate; a call could only return invented labels.
        return []

    # What we fall back to whenever the model output cannot be trusted.
    unchanged = [str(x).strip().lower() for x in batch]

    items_blob = "; ".join(batch)
    context = f' about: "{description}"' if description else ""
    if mode == "merge":
        prompt = (
            f"You are consolidating a list of category labels{context} into a smaller set of "
            "broader categories. Group labels that describe the same underlying concept and give "
            "each group ONE clear representative label — actively merge near-synonyms and closely "
            "related labels into broader themes. BUT retain nuance: do NOT over-merge — keep labels "
            "separate when they capture a genuinely distinct concept, even if related, rather than "
            "collapsing them into one vague catch-all. Prefer fewer, cleaner categories without "
            "losing real distinctions. Labels are separated by semicolons within triple backticks: "
            f"```{items_blob}```.\n\n"
            "Return ONLY a numbered list of the consolidated categories. Each line must follow this "
            "exact format, with no other text before or after the list:\n"
            "N. label\n\n"
            "Example:\n"
            "1. Employment\n"
            "2. Education\n"
            "3. Religion"
        )
    else:
        prompt = (
            f"You are given a list of category labels{context}. "
            "Return the UNIQUE categories. Remove ONLY exact duplicates and labels that "
            "restate the SAME category in different words — when two labels are the same "
            "category, keep one of them exactly as written. KEEP every genuinely distinct "
            "category. Do NOT merge categories that are merely related, do NOT invent or "
            "broaden labels, and do NOT drop a category just to make the list shorter. "
            "If all the labels are already distinct categories, return ALL of them unchanged. "
            f"Labels are separated by semicolons within triple backticks: ```{items_blob}```.\n\n"
            "Return ONLY a numbered list, using the labels exactly as they appear. Each line "
            "must follow this exact format, with no other text before or after the list:\n"
            "N. label\n\n"
            "Example:\n"
            "1. Employment\n"
            "2. Education\n"
            "3. Religion"
        )
    reply, error = client.complete(
        messages=[{"role": "user", "content": prompt}],
        creativity=creativity,
        force_json=False,
    )
    if error:
        # No data loss: keep the batch unchanged so its categories aren't dropped.
        sys.stderr.write(f"[collapse_themes] batch failed: {error} — keeping batch unchanged\n")
        return unchanged

    # Strict parsing: only "N. label" style lines contribute labels.
    out = []
    for line in (reply or "").splitlines():
        m = _LINE_PAT.match(line.strip())
        if m:
            label = _clean_label(m.group(1)).strip(" ;.,")
            if label:
                out.append(label)

    if mode == "unique":
        # Contraction guarantee: extract-unique must only REMOVE, never add or
        # mutate. Keep only outputs that map back to an input label (by
        # normalized key), as the original input string. This makes every pass
        # monotone and immune to model rephrasing/splitting.
        in_by_key = {}
        for x in batch:
            in_by_key.setdefault(_norm_key(x), str(x).strip().lower())
        seen, subset = set(), []
        for o in out:
            k = _norm_key(o)
            if k in in_by_key and k not in seen:
                seen.add(k)
                subset.append(in_by_key[k])
        # If parsing/matching failed entirely, fall back to the batch (no loss).
        out = subset if subset else unchanged
    elif not out:
        # Merge mode with an empty/unparseable reply: a non-empty batch can
        # never legitimately consolidate to zero categories, so keep it.
        sys.stderr.write(
            "[collapse_themes] no labels parsed from reply — keeping batch unchanged\n"
        )
        out = unchanged
    return out
206
+
207
+
208
def _to_counts(input_data):
    """Coerce the accepted input forms into a {category: count} dict.

    Accepted forms:
      * DataFrame with a "category" column (matched case-insensitively) and an
        optional "count" column: counts are summed per category when "count"
        is present, otherwise rows are counted. Rows with a missing category
        are ignored.
      * dict {category: count}.
      * Series, list, or any other sequence of labels: missing values are
        dropped and occurrences are counted.

    Categories are always returned as `str` keys so downstream string handling
    works regardless of the original dtype.

    Raises:
        ValueError: If a DataFrame has no "category" column.
    """
    if isinstance(input_data, pd.DataFrame):
        # str() so non-string column labels (e.g. ints) do not break .lower().
        by_lower = {str(col).lower(): col for col in input_data.columns}
        cat_col = by_lower.get("category")
        cnt_col = by_lower.get("count")
        if cat_col is None:
            raise ValueError("DataFrame input must have a 'category' column.")
        frame = input_data.dropna(subset=[cat_col])
        labels = frame[cat_col].astype(str)
        if cnt_col is not None:
            # Group positionally by the string-coerced labels.
            totals = frame[cnt_col].groupby(labels.to_numpy()).sum()
            return {str(k): int(v) for k, v in totals.items()}
        return {str(k): int(v) for k, v in labels.value_counts().items()}
    if isinstance(input_data, dict):
        return {str(k): int(v) for k, v in input_data.items()}
    series = input_data if isinstance(input_data, pd.Series) else pd.Series(input_data)
    series = series.dropna().astype("string")
    return series.value_counts().to_dict()
224
+
225
+
226
def _collapse_once(
    client,
    items,
    *,
    description,
    batch_size,
    dedupe_threshold,
    embedding_merge_threshold,
    mode,
    shuffle,
    random_state,
    creativity,
    max_workers,
):
    """Run a single collapse pass over `items` and return the reduced list.

    Steps: coerce the input to {category: count}; order by descending count;
    pre-clean (normalize + Jaro-Winkler dedup, then embedding merge);
    optionally shuffle; split into batches of `batch_size`; send every batch
    through `_collapse_batch` (sequentially, or on a thread pool when
    `max_workers` > 1); concatenate the batch results in batch order and
    dedupe them once more.

    Args:
        client: LLM client handed to `_collapse_batch`.
        items: Any input form accepted by `_to_counts`.
        description: Context string for the prompt.
        batch_size: Labels per LLM call; must be >= 1.
        dedupe_threshold: Jaro-Winkler threshold for `_jw_dedupe`.
        embedding_merge_threshold: Threshold for `_embedding_merge`.
        mode: "unique" or "merge", forwarded to `_collapse_batch`.
        shuffle: Whether to randomize label order before batching.
        random_state: Seed for the shuffle (None = nondeterministic).
        creativity: Forwarded to `_collapse_batch`.
        max_workers: Number of batches processed concurrently.

    Returns:
        list[str]: The reduced label list.

    Raises:
        ValueError: If `batch_size` is smaller than 1. A negative value would
            otherwise produce zero batches and silently discard every label.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}.")

    # A. accept -> {category: count}
    counts = _to_counts(items)

    # B. PRE-CLEAN before the model: normalize+JW dedup, then embedding-merge.
    # Most frequent labels come first so they win as representatives.
    ordered = sorted(counts, key=counts.get, reverse=True)
    cleaned = _jw_dedupe(ordered, dedupe_threshold)
    cleaned = _embedding_merge(cleaned, embedding_merge_threshold)

    # Randomize order so batch composition varies across passes — gives near-
    # duplicates split across batches fresh chances to co-occur and merge.
    if shuffle:
        random.Random(random_state).shuffle(cleaned)

    # C. split into batches
    batches = [cleaned[i:i + batch_size] for i in range(0, len(cleaned), batch_size)]

    def _run_batch(batch):
        return _collapse_batch(client, batch, description, creativity, mode)

    # D. one LLM call per batch (sequential or parallel)
    if max_workers and max_workers > 1:
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=max_workers) as ex:
            # Executor.map yields results in submission order, so the
            # concatenation below is independent of completion order.
            results = list(ex.map(_run_batch, batches))
    else:
        results = [_run_batch(batch) for batch in batches]
    out = [label for r in results for label in (r or [])]

    # E. dedupe the concatenated output (surface-level)
    return _jw_dedupe(out, dedupe_threshold)
277
+
278
+
279
def collapse_themes(
    input_data,
    api_key=None,
    description="",
    passes=1,
    max_passes=10,
    batch_size=40,
    aggressive=False,
    dedupe_threshold=0.95,
    embedding_merge_threshold=0.92,
    shuffle=True,
    final_consolidation=0.82,
    user_model="gpt-4o",
    model_source="auto",
    unique_model=None,
    unique_model_source="auto",
    unique_passes=1,
    merge_model=None,
    merge_model_source="auto",
    creativity=0,
    max_workers=1,
    random_state=None,
    filename=None,
    progress_callback=None,
):
    """
    Collapse a list of extracted themes into a smaller, deduplicated list.

    Iteratively consolidates near-duplicate / synonymous category labels (for
    example the output of explore()). Each pass PRE-CLEANS before the model
    (normalize + Jaro-Winkler dedup, then embedding-merge), splits into batches,
    sends each batch to the model, and dedupes the concatenated result. Runs
    `passes` iterations, randomizing batch composition each pass so labels
    stranded in separate batches get fresh chances to merge.

    Two modes:
      - aggressive=False (default): extract-unique — only removes duplicates /
        restatements, never invents or broadens. Each pass is monotone (output
        is a subset of its input). Use to thin a noisy list faithfully.
      - aggressive=True: conceptual merge — actively consolidates related labels
        into broader categories while retaining meaningful distinctions. Use as
        a final compression step.

    Provider-agnostic (model_source: "auto", "openai", "huggingface", ...), via
    the same dispatch classify()/explore() use.

    Args:
        input_data: Themes to collapse. list[str] (duplicates allowed), pandas
            Series, dict {category: count}, or DataFrame with "category"
            [and optional "count"] columns.
        api_key (str): API key for the model provider. The same key is used for
            both the optional unique phase and the main phase.
        description (str): Data/question context, injected into the prompt — e.g.
            the survey question the categories came from. Helps the model judge
            which distinctions matter.
        passes (int | str): Number of collapse iterations, or "auto" (any
            letter case) to iterate until the deterministic quality benchmark
            peaks (the recommended mode for a final taxonomy — pair with
            aggressive=True). Default 1.
        max_passes (int): Cap on iterations when passes="auto". Default 10.
        batch_size (int): Themes per LLM chunk (ceil(n / batch_size) calls per
            pass). Default 40.
        aggressive (bool): If True, use the conceptual-merge prompt (compress);
            if False, extract-unique (faithful thinning). Default False.
        dedupe_threshold (float): Jaro-Winkler similarity at/above which two
            normalized labels are deduped. Default 0.95; 1.0 = exact only.
        embedding_merge_threshold (float): Cosine similarity at/above which
            labels are merged in the pre-LLM embedding step. Default 0.92.
            None or >=1.0 skips embeddings.
        shuffle (bool): Randomize order each pass so batch composition varies.
            Default True.
        final_consolidation (float): Cosine threshold for one greedy embedding
            re-merge over the whole result after all passes, collapsing
            cross-batch duplicates that batched passes (and the auto loop)
            cannot reach. Default 0.82 — deterministic, and chosen to err toward
            keeping categories (over-segmentation is preferred over
            over-consolidation). False/None skips.
        user_model (str): Model name for the main phase when merge_model is not
            given. Default "gpt-4o". Use a capable model — small models can
            degenerate into repetition.
        model_source (str): Provider for user_model — "auto", "openai",
            "huggingface", etc.
        unique_model (str): If set, run an initial extract-unique thinning phase
            on this (typically cheaper) model before the main phase. None
            (default) skips the phase entirely.
        unique_model_source (str): Provider for unique_model. Default "auto" —
            may differ from the main phase's provider.
        unique_passes (int): Number of extract-unique passes in the thinning
            phase when unique_model is set. Default 1.
        merge_model (str): Model for the main phase. Defaults to user_model
            when None.
        merge_model_source (str): Provider for merge_model. Default "auto".
        creativity (float): Temperature. Default 0.
        max_workers (int): Batches processed concurrently per pass. Default 1.
        random_state (int): Seed for shuffling (per-pass seed = random_state + p).
            None = nondeterministic.
        filename (str): Optional CSV path to save the final list.
        progress_callback (callable): Optional callback(pass, passes, label).

    Returns:
        list[str]: The collapsed category list. This is a list even when no
        pass ran (e.g. passes=0), in which case the input is coerced to its
        distinct labels ordered by descending count.

    Raises:
        ValueError: If api_key is missing.

    Examples:
        >>> import cat_stack as cat
        >>> themes = cat.explore(df['responses'], description="Why did you move?",
        ...                      api_key=key)
        >>> # Recommended: aggressive merge, auto-stop at the quality peak
        >>> taxonomy = cat.collapse_themes(
        ...     themes, api_key=key, description="Why did you move?",
        ...     aggressive=True, passes="auto", max_workers=8,
        ... )
    """
    if not api_key:
        raise ValueError("collapse_themes() needs an api_key for the LLM call.")

    mode = "merge" if aggressive else "unique"
    auto = isinstance(passes, str) and passes.strip().lower() == "auto"

    # The main phase runs on merge_model if given, else user_model. A separate
    # cheaper model can handle the simpler unique-keeping phase via unique_model.
    # Each phase resolves its own provider, so the two can sit on different
    # providers.
    merge_name = merge_model or user_model
    merge_src = merge_model_source if merge_model else model_source
    merge_provider = detect_provider(merge_name, merge_src)
    client = UnifiedLLMClient(provider=merge_provider, api_key=api_key, model=merge_name)

    def _run(cl, items, md, p):
        return _collapse_once(
            cl, items,
            description=description,
            batch_size=batch_size,
            dedupe_threshold=dedupe_threshold,
            embedding_merge_threshold=embedding_merge_threshold,
            mode=md,
            shuffle=shuffle,
            # Offset the seed per pass so each pass shuffles differently.
            random_state=(None if random_state is None else random_state + p),
            creativity=creativity,
            max_workers=max_workers,
        )

    def _pass(items, p):
        return _run(client, items, mode, p)

    current = input_data

    # Phase 1 (optional): cheap unique-keeping thin. When unique_model is set,
    # run `unique_passes` extract-unique passes on a separate model to strip
    # restatement-level duplicates before the main phase. Skipped entirely when
    # unique_model is None.
    if unique_model:
        u_provider = detect_provider(unique_model, unique_model_source)
        u_client = UnifiedLLMClient(provider=u_provider, api_key=api_key, model=unique_model)
        n_unique = int(unique_passes)
        for p in range(n_unique):
            current = _run(u_client, current, "unique", p)
            if progress_callback:
                progress_callback(p + 1, n_unique, "collapse_themes:unique")

    if auto:
        # Iterate until the deterministic quality benchmark stops improving
        # (the peak), capped at max_passes. Quality is scored against the
        # ORIGINAL input themes using embeddings only.
        raw_embs = _get_emb_model().encode(
            list(_to_counts(input_data).keys()), normalize_embeddings=True,
            show_progress_bar=False,
        )
        best, best_q = None, -1.0
        for p in range(max_passes):
            current = _pass(current, p)
            q = _quality(current, raw_embs)
            if progress_callback:
                progress_callback(p + 1, max_passes, "collapse_themes")
            if q < best_q:
                break  # quality dropped -> the previous pass was the peak
            best, best_q = current, q
        current = best if best is not None else current
    else:
        n_passes = int(passes)
        for p in range(n_passes):
            current = _pass(current, p)
            if progress_callback:
                progress_callback(p + 1, n_passes, "collapse_themes")

    # Every pass returns a list. If `current` is not one, no pass ran and it is
    # still the caller's raw container (dict / Series / DataFrame / ...), which
    # must not reach the embedding merge or the CSV writer as-is — iterating a
    # DataFrame, for instance, would yield its column names. Coerce it to the
    # distinct labels, most frequent first.
    if not isinstance(current, list):
        counts = _to_counts(current)
        current = sorted(counts, key=counts.get, reverse=True)

    # Final global consolidation. Batched passes (and the auto loop) can only
    # merge labels that share a batch, so cross-batch siblings that restate one
    # concept can survive as separate themes. This applies one greedy embedding
    # re-merge over the WHOLE result at a lower threshold, dropping each label
    # that restates an already-kept one. Greedy comparison against kept
    # representatives only (no transitive chaining) avoids blobbing
    # related-but-distinct labels. No extra LLM call is made. Set
    # final_consolidation=False to skip.
    if final_consolidation and len(current) > 1:
        current = _embedding_merge(current, final_consolidation)

    if filename:
        pd.DataFrame({"category": current}).to_csv(filename, index=False)
        print(f"Collapsed categories saved to {filename}")

    return current
File without changes
File without changes
File without changes
File without changes