cat-stack 2.0.0b1__tar.gz → 2.0.0b4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/PKG-INFO +1 -1
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/__about__.py +1 -1
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/collapse_themes.py +142 -27
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/.gitignore +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/LICENSE +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/README.md +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/pyproject.toml +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/cat_stack/__init__.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/__init__.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/_batch.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/_category_analysis.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/_chunked.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/_embeddings.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/_formatter.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/_pilot_test.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/_prompts.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/_providers.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/_review_ui.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/_tiebreaker.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/_utils.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/_web_fetch.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/_wrapper_helpers.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/calls/CoVe.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/calls/__init__.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/calls/image_CoVe.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/calls/image_stepback.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/calls/pdf_CoVe.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/calls/pdf_stepback.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/calls/stepback.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/calls/top_n.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/classify.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/explore.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/extract.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/image_functions.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/images/circle.png +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/images/cube.png +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/images/diamond.png +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/images/overlapping_pentagons.png +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/images/rectangles.png +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/model_reference_list.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/pdf_functions.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/prompt_tune.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/summarize.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/text_functions.py +0 -0
- {cat_stack-2.0.0b1 → cat_stack-2.0.0b4}/src/catstack/text_functions_ensemble.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cat-stack
|
|
3
|
-
Version: 2.0.0b1
|
|
3
|
+
Version: 2.0.0b4
|
|
4
4
|
Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
|
|
5
5
|
Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
4
|
-
__version__ = "2.0.0b1"
|
|
4
|
+
__version__ = "2.0.0b4"
|
|
5
5
|
__author__ = "Chris Soria"
|
|
6
6
|
__email__ = "chrissoria@berkeley.edu"
|
|
7
7
|
__title__ = "cat-stack"
|
|
@@ -70,17 +70,21 @@ def _jw_dedupe(items, threshold):
|
|
|
70
70
|
return out
|
|
71
71
|
|
|
72
72
|
|
|
73
|
-
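def _embedding_merge(items, threshold):
|
|
74
|
-
"""Greedy embedding clustering: drop labels whose cosine similarity to an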
|
|
75
|
-
already-kept label is >= threshold. Keeps the first-seen representative.
|
|
76
|
-
Uses cat-stack's canonical BAAI/bge-small model (cached)."""
|
|
73
|
+
def _get_emb_model():
|
|
74
|
+
"""Load (once) and return cat-stack's canonical BAAI/bge-small embedder."""
|
|
77
75
|
global _EMB_MODEL
|
|
78
|
-
if not threshold or threshold >= 1.0 or len(items) < 2:
|
|
79
|
-
return items
|
|
80
76
|
if _EMB_MODEL is None:
|
|
81
77
|
from ._embeddings import load_embedding_model
|
|
82
78
|
_EMB_MODEL = load_embedding_model()
|
|
83
|
-
|
|
79
|
+
return _EMB_MODEL
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _embedding_merge(items, threshold):
|
|
83
|
+
"""Greedy embedding clustering: drop labels whose cosine similarity to an
|
|
84
|
+
already-kept label is >= threshold. Keeps the first-seen representative."""
|
|
85
|
+
if not threshold or threshold >= 1.0 or len(items) < 2:
|
|
86
|
+
return items
|
|
87
|
+
embs = _get_emb_model().encode(items, normalize_embeddings=True, show_progress_bar=False)
|
|
84
88
|
reps, rep_embs = [], []
|
|
85
89
|
for it, e in zip(items, embs):
|
|
86
90
|
if rep_embs and float(np.max(np.asarray(rep_embs) @ e)) >= threshold:
|
|
@@ -90,6 +94,29 @@ def _embedding_merge(items, threshold):
|
|
|
90
94
|
return reps
|
|
91
95
|
|
|
92
96
|
|
|
97
|
+
def _quality(output, raw_embs, tau_cov=0.70, tau_red=0.85, beta=2.0):
|
|
98
|
+
"""Deterministic quality of a candidate taxonomy vs the raw input themes:
|
|
99
|
+
coverage-weighted F-beta of recall=coverage_hard (share of raw within tau_cov
|
|
100
|
+
of some output) and precision=(1 - redundancy_rate) (share of outputs with a
|
|
101
|
+
near-twin >= tau_red). Embedding-only — the convergence signal for passes='auto'.
|
|
102
|
+
"""
|
|
103
|
+
if not output:
|
|
104
|
+
return 0.0
|
|
105
|
+
O = _get_emb_model().encode(list(output), normalize_embeddings=True, show_progress_bar=False)
|
|
106
|
+
coverage = float(((raw_embs @ O.T).max(axis=1) >= tau_cov).mean())
|
|
107
|
+
if len(output) > 1:
|
|
108
|
+
OO = O @ O.T
|
|
109
|
+
np.fill_diagonal(OO, -1.0)
|
|
110
|
+
redundancy = float((OO.max(axis=1) >= tau_red).mean())
|
|
111
|
+
else:
|
|
112
|
+
redundancy = 0.0
|
|
113
|
+
precision = 1.0 - redundancy
|
|
114
|
+
if coverage <= 0 or precision <= 0:
|
|
115
|
+
return 0.0
|
|
116
|
+
b2 = beta * beta
|
|
117
|
+
return (1 + b2) * precision * coverage / (b2 * precision + coverage)
|
|
118
|
+
|
|
119
|
+
|
|
93
120
|
def _collapse_batch(client, batch, description, creativity, mode="unique"):
|
|
94
121
|
"""One LLM call on a single batch -> list[str].
|
|
95
122
|
|
|
@@ -254,13 +281,20 @@ def collapse_themes(
|
|
|
254
281
|
api_key=None,
|
|
255
282
|
description="",
|
|
256
283
|
passes=1,
|
|
284
|
+
max_passes=10,
|
|
257
285
|
batch_size=40,
|
|
258
286
|
aggressive=False,
|
|
259
287
|
dedupe_threshold=0.95,
|
|
260
288
|
embedding_merge_threshold=0.92,
|
|
261
289
|
shuffle=True,
|
|
290
|
+
final_consolidation=0.82,
|
|
262
291
|
user_model="gpt-4o",
|
|
263
292
|
model_source="auto",
|
|
293
|
+
unique_model=None,
|
|
294
|
+
unique_model_source="auto",
|
|
295
|
+
unique_passes=1,
|
|
296
|
+
merge_model=None,
|
|
297
|
+
merge_model_source="auto",
|
|
264
298
|
creativity=0,
|
|
265
299
|
max_workers=1,
|
|
266
300
|
random_state=None,
|
|
@@ -296,7 +330,10 @@ def collapse_themes(
|
|
|
296
330
|
description (str): Data/question context, injected into the prompt — e.g.
|
|
297
331
|
the survey question the categories came from. Helps the model judge
|
|
298
332
|
which distinctions matter.
|
|
299
|
-
passes (int): Number of collapse iterations
|
|
333
|
+
passes (int | str): Number of collapse iterations, or "auto" to iterate
|
|
334
|
+
until the deterministic quality benchmark peaks (the recommended mode
|
|
335
|
+
for a final taxonomy — pair with aggressive=True). Default 1.
|
|
336
|
+
max_passes (int): Cap on iterations when passes="auto". Default 10.
|
|
300
337
|
batch_size (int): Themes per LLM chunk (ceil(n / batch_size) calls per
|
|
301
338
|
pass). Default 40.
|
|
302
339
|
aggressive (bool): If True, use the conceptual-merge prompt (compress);
|
|
@@ -308,9 +345,28 @@ def collapse_themes(
|
|
|
308
345
|
0.92. None or >=1.0 skips embeddings.
|
|
309
346
|
shuffle (bool): Randomize order each pass so batch composition varies.
|
|
310
347
|
Default True (improves convergence stability).
|
|
311
|
-
|
|
312
|
-
|
|
348
|
+
final_consolidation (float): Cosine threshold for one greedy embedding
|
|
349
|
+
re-merge over the whole result after all passes, collapsing cross-batch
|
|
350
|
+
lexical-sibling duplicates that batched passes (and the auto loop) cannot
|
|
351
|
+
reach. Default 0.82 — deterministic and tuned to land just above the true
|
|
352
|
+
concept count (errs toward keeping categories; over-segmentation is
|
|
353
|
+
preferred over over-consolidation). False/None skips.
|
|
354
|
+
user_model (str): Model name for the merge phase. Default "gpt-4o". Use a
|
|
355
|
+
capable model — small models can degenerate into repetition.
|
|
313
356
|
model_source (str): Provider — "auto", "openai", "huggingface", etc.
|
|
357
|
+
unique_model (str): If set, run an initial extract-unique thinning phase on
|
|
358
|
+
this (typically cheaper) model before the merge phase, allocating model
|
|
359
|
+
spend by task difficulty: a smaller model handles faithful restatement
|
|
360
|
+
removal, a stronger one handles conceptual merging. None (default) skips
|
|
361
|
+
the phase entirely (backward compatible). Recommended pairing:
|
|
362
|
+
unique_model = a 72B-class model, merge_model = a frontier model.
|
|
363
|
+
unique_model_source (str): Provider for unique_model. Default "auto" — can
|
|
364
|
+
differ from the merge phase, so the two phases may sit on different
|
|
365
|
+
providers.
|
|
366
|
+
unique_passes (int): Number of extract-unique passes in the thinning phase
|
|
367
|
+
when unique_model is set. Default 1.
|
|
368
|
+
merge_model (str): Model for the merge phase. Defaults to user_model when None.
|
|
369
|
+
merge_model_source (str): Provider for merge_model. Default "auto".
|
|
314
370
|
creativity (float): Temperature. Default 0 (deterministic).
|
|
315
371
|
max_workers (int): Batches processed concurrently per pass. Default 1.
|
|
316
372
|
random_state (int): Seed for shuffling (per-pass seed = random_state + p).
|
|
@@ -325,37 +381,96 @@ def collapse_themes(
|
|
|
325
381
|
>>> import cat_stack as cat
|
|
326
382
|
>>> themes = cat.explore(df['responses'], description="Why did you move?",
|
|
327
383
|
... api_key=key)
|
|
328
|
-
>>> #
|
|
329
|
-
>>>
|
|
330
|
-
... description="Why did you move?",
|
|
331
|
-
|
|
332
|
-
...
|
|
384
|
+
>>> # Recommended: aggressive merge, auto-stop at the quality peak
|
|
385
|
+
>>> taxonomy = cat.collapse_themes(
|
|
386
|
+
... themes, api_key=key, description="Why did you move?",
|
|
387
|
+
... aggressive=True, passes="auto", max_workers=8,
|
|
388
|
+
... )
|
|
333
389
|
"""
|
|
334
390
|
if not api_key:
|
|
335
391
|
raise ValueError("collapse_themes() needs an api_key for the LLM call.")
|
|
336
392
|
|
|
337
393
|
mode = "merge" if aggressive else "unique"
|
|
338
|
-
provider = detect_provider(user_model, model_source)
|
|
339
|
-
client = UnifiedLLMClient(provider=provider, api_key=api_key, model=user_model)
|
|
340
394
|
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
395
|
+
# The main (merge) phase runs on merge_model if given, else user_model. A separate
|
|
396
|
+
# cheaper model can handle the simpler unique-keeping phase via unique_model — per
|
|
397
|
+
# step the work differs in difficulty (faithful thinning is easy, conceptual
|
|
398
|
+
# merging is hard), so model spend can be allocated accordingly. Each phase
|
|
399
|
+
# resolves its own provider, so the two can sit on different providers.
|
|
400
|
+
merge_name = merge_model or user_model
|
|
401
|
+
merge_src = merge_model_source if merge_model else model_source
|
|
402
|
+
merge_provider = detect_provider(merge_name, merge_src)
|
|
403
|
+
client = UnifiedLLMClient(provider=merge_provider, api_key=api_key, model=merge_name)
|
|
404
|
+
|
|
405
|
+
def _run(cl, items, md, p):
|
|
406
|
+
return _collapse_once(
|
|
407
|
+
cl, items,
|
|
347
408
|
description=description,
|
|
348
409
|
batch_size=batch_size,
|
|
349
410
|
dedupe_threshold=dedupe_threshold,
|
|
350
411
|
embedding_merge_threshold=embedding_merge_threshold,
|
|
351
|
-
mode=
|
|
412
|
+
mode=md,
|
|
352
413
|
shuffle=shuffle,
|
|
353
|
-
random_state=
|
|
414
|
+
random_state=(None if random_state is None else random_state + p),
|
|
354
415
|
creativity=creativity,
|
|
355
416
|
max_workers=max_workers,
|
|
356
417
|
)
|
|
357
|
-
|
|
358
|
-
|
|
418
|
+
|
|
419
|
+
def _pass(items, p):
|
|
420
|
+
return _run(client, items, mode, p)
|
|
421
|
+
|
|
422
|
+
current = input_data
|
|
423
|
+
|
|
424
|
+
# Phase 1 (optional): cheap unique-keeping thin. When unique_model is set, run
|
|
425
|
+
# `unique_passes` extract-unique passes on a separate (typically smaller, cheaper)
|
|
426
|
+
# model to strip restatement-level duplicates before the expensive merge phase.
|
|
427
|
+
# Skipped entirely when unique_model is None (fully backward compatible).
|
|
428
|
+
if unique_model:
|
|
429
|
+
u_provider = detect_provider(unique_model, unique_model_source)
|
|
430
|
+
u_client = UnifiedLLMClient(provider=u_provider, api_key=api_key, model=unique_model)
|
|
431
|
+
for p in range(int(unique_passes)):
|
|
432
|
+
current = _run(u_client, current, "unique", p)
|
|
433
|
+
if progress_callback:
|
|
434
|
+
progress_callback(p + 1, int(unique_passes), "collapse_themes:unique")
|
|
435
|
+
if passes == "auto":
|
|
436
|
+
# Iterate until the deterministic quality benchmark stops improving (the
|
|
437
|
+
# peak), capped at max_passes. Quality is scored vs the ORIGINAL input
|
|
438
|
+
# themes — embedding-only, model-independent at decision time. The peak is
|
|
439
|
+
# the principled stop (validated across surveys and list sizes).
|
|
440
|
+
raw_embs = _get_emb_model().encode(
|
|
441
|
+
list(_to_counts(input_data).keys()), normalize_embeddings=True,
|
|
442
|
+
show_progress_bar=False,
|
|
443
|
+
)
|
|
444
|
+
best, best_q = None, -1.0
|
|
445
|
+
for p in range(max_passes):
|
|
446
|
+
current = _pass(current, p)
|
|
447
|
+
q = _quality(current, raw_embs)
|
|
448
|
+
if progress_callback:
|
|
449
|
+
progress_callback(p + 1, max_passes, "collapse_themes")
|
|
450
|
+
if q < best_q:
|
|
451
|
+
break # quality dropped -> the previous pass was the peak
|
|
452
|
+
best, best_q = current, q
|
|
453
|
+
current = best if best is not None else current
|
|
454
|
+
else:
|
|
455
|
+
for p in range(int(passes)):
|
|
456
|
+
current = _pass(current, p)
|
|
457
|
+
if progress_callback:
|
|
458
|
+
progress_callback(p + 1, int(passes), "collapse_themes")
|
|
459
|
+
|
|
460
|
+
# Final global consolidation. Batched passes (and the auto loop) can only merge
|
|
461
|
+
# labels that share a batch, so cross-batch lexical siblings — e.g. "tension" vs
|
|
462
|
+
# "estrangement", which restate one concept but embed below the per-pass dedupe
|
|
463
|
+
# threshold — survive as separate themes, inflating the count above the true
|
|
464
|
+
# number of concepts. This applies one greedy embedding re-merge over the WHOLE
|
|
465
|
+
# result at a lower threshold, dropping each label that restates an already-kept
|
|
466
|
+
# one to bring the count closer to truth. Greedy (compares only against kept
|
|
467
|
+
# representatives, no transitive chaining) avoids blobbing related-but-distinct
|
|
468
|
+
# labels. It is deterministic (no extra LLM call, model-independent at decision
|
|
469
|
+
# time) and tuned to land just above the true count, so it errs toward KEEPING
|
|
470
|
+
# categories — over-segmentation is the preferred failure mode, not
|
|
471
|
+
# over-consolidation. Set final_consolidation=False to skip.
|
|
472
|
+
if final_consolidation and len(current) > 1:
|
|
473
|
+
current = _embedding_merge(current, final_consolidation)
|
|
359
474
|
|
|
360
475
|
if filename:
|
|
361
476
|
pd.DataFrame({"category": current}).to_csv(filename, index=False)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|