cat-stack 2.0.0b6__tar.gz → 2.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/PKG-INFO +1 -1
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/__about__.py +1 -1
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_utils.py +38 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/classify.py +111 -15
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/prompt_tune.py +9 -13
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/.gitignore +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/LICENSE +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/README.md +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/pyproject.toml +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/cat_stack/__init__.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/__init__.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_batch.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_category_analysis.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_chunked.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_embeddings.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_formatter.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_pilot_test.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_prompts.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_providers.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_review_ui.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_tiebreaker.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_web_fetch.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_wrapper_helpers.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/calls/CoVe.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/calls/__init__.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/calls/image_CoVe.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/calls/image_stepback.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/calls/pdf_CoVe.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/calls/pdf_stepback.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/calls/stepback.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/calls/top_n.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/collapse_themes.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/explore.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/extract.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/image_functions.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/images/circle.png +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/images/cube.png +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/images/diamond.png +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/images/overlapping_pentagons.png +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/images/rectangles.png +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/model_reference_list.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/pdf_functions.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/summarize.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/text_functions.py +0 -0
- {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/text_functions_ensemble.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cat-stack
|
|
3
|
-
Version: 2.0.0b6
|
|
3
|
+
Version: 2.0.1
|
|
4
4
|
Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
|
|
5
5
|
Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
4
|
-
__version__ = "2.0.0b6"
|
|
4
|
+
__version__ = "2.0.1"
|
|
5
5
|
__author__ = "Chris Soria"
|
|
6
6
|
__email__ = "chrissoria@berkeley.edu"
|
|
7
7
|
__title__ = "cat-stack"
|
|
@@ -9,6 +9,8 @@ import json
|
|
|
9
9
|
import re
|
|
10
10
|
|
|
11
11
|
__all__ = [
|
|
12
|
+
# Param resolution
|
|
13
|
+
"_resolve_description_context",
|
|
12
14
|
# JSON utilities
|
|
13
15
|
"build_json_schema",
|
|
14
16
|
"validate_classification_json",
|
|
@@ -31,6 +33,42 @@ __all__ = [
|
|
|
31
33
|
]
|
|
32
34
|
|
|
33
35
|
|
|
36
|
+
# =============================================================================
|
|
37
|
+
# Param Resolution
|
|
38
|
+
# =============================================================================
|
|
39
|
+
|
|
40
|
+
def _resolve_description_context(description, survey_question, fn_name):
|
|
41
|
+
"""Reconcile the canonical `description=` with its deprecated alias
|
|
42
|
+
`survey_question=` for entry points whose downstream prompt assembly
|
|
43
|
+
still keys the text-prompt "Context:" line (plus step-back and
|
|
44
|
+
categories="auto") off `survey_question`.
|
|
45
|
+
|
|
46
|
+
Returns the reconciled ``(description, survey_question)`` pair:
|
|
47
|
+
- only survey_question given -> DeprecationWarning; mirrored into
|
|
48
|
+
description.
|
|
49
|
+
- only description given -> mirrored into survey_question so the context
|
|
50
|
+
framing isn't silently lost (description-only callers include every
|
|
51
|
+
domain wrapper).
|
|
52
|
+
- both given -> kept distinct (e.g. cat-vader: survey_question= feed
|
|
53
|
+
question for the Context line, description= platform context).
|
|
54
|
+
"""
|
|
55
|
+
import warnings
|
|
56
|
+
|
|
57
|
+
if survey_question:
|
|
58
|
+
warnings.warn(
|
|
59
|
+
f"`survey_question=` is deprecated in {fn_name}(); use "
|
|
60
|
+
"`description=` instead. The value will be mirrored to "
|
|
61
|
+
"`description` for now.",
|
|
62
|
+
DeprecationWarning,
|
|
63
|
+
stacklevel=3,
|
|
64
|
+
)
|
|
65
|
+
if not description:
|
|
66
|
+
description = survey_question
|
|
67
|
+
elif description:
|
|
68
|
+
survey_question = description
|
|
69
|
+
return description, survey_question
|
|
70
|
+
|
|
71
|
+
|
|
34
72
|
# =============================================================================
|
|
35
73
|
# Label Cleaning
|
|
36
74
|
# =============================================================================
|
|
@@ -41,6 +41,86 @@ from .image_functions import image_multi_class
|
|
|
41
41
|
from .pdf_functions import pdf_multi_class
|
|
42
42
|
|
|
43
43
|
|
|
44
|
+
# Minimum estimated API calls (rows x batch-capable models) before the
|
|
45
|
+
# batch-mode cost tip is worth printing. Below this, the absolute savings
|
|
46
|
+
# are small and the async round-trip isn't worth suggesting.
|
|
47
|
+
_BATCH_NUDGE_MIN_REQUESTS = 500
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _maybe_print_batch_nudge(
|
|
51
|
+
input_data,
|
|
52
|
+
models,
|
|
53
|
+
categories_per_call,
|
|
54
|
+
chain_of_verification,
|
|
55
|
+
embedding_tiebreaker,
|
|
56
|
+
progress_callback,
|
|
57
|
+
):
|
|
58
|
+
"""Print a one-line cost tip when a synchronous run qualifies for
|
|
59
|
+
batch_mode=True. Checks the same eligibility rules the batch path
|
|
60
|
+
enforces, so the tip is only shown when opting in would actually work."""
|
|
61
|
+
# Options the batch path rejects or ignores -> no tip.
|
|
62
|
+
if (
|
|
63
|
+
categories_per_call is not None
|
|
64
|
+
or chain_of_verification
|
|
65
|
+
or embedding_tiebreaker
|
|
66
|
+
or progress_callback is not None
|
|
67
|
+
):
|
|
68
|
+
return
|
|
69
|
+
|
|
70
|
+
try:
|
|
71
|
+
n_rows = len(input_data)
|
|
72
|
+
except TypeError:
|
|
73
|
+
return
|
|
74
|
+
if n_rows == 0:
|
|
75
|
+
return
|
|
76
|
+
|
|
77
|
+
# Batch mode is text-only.
|
|
78
|
+
from .text_functions_ensemble import _detect_input_type
|
|
79
|
+
if _detect_input_type(input_data) != "text":
|
|
80
|
+
return
|
|
81
|
+
|
|
82
|
+
# Count models on batch-capable providers (openai/anthropic/google/
|
|
83
|
+
# mistral/xai). `models` is already normalized to a list here; provider
|
|
84
|
+
# may still be "auto"/None in the spec, so resolve it the same way
|
|
85
|
+
# prepare_model_configs will.
|
|
86
|
+
from ._batch import UNSUPPORTED_BATCH_PROVIDERS
|
|
87
|
+
from ._providers import detect_provider
|
|
88
|
+
|
|
89
|
+
n_capable = 0
|
|
90
|
+
for m in models:
|
|
91
|
+
name, provider = None, None
|
|
92
|
+
if isinstance(m, (list, tuple)):
|
|
93
|
+
name = m[0] if len(m) >= 1 else None
|
|
94
|
+
provider = m[1] if len(m) >= 2 else None
|
|
95
|
+
elif isinstance(m, dict):
|
|
96
|
+
name = m.get("model")
|
|
97
|
+
provider = m.get("provider")
|
|
98
|
+
elif isinstance(m, str):
|
|
99
|
+
name = m
|
|
100
|
+
if not provider or provider == "auto":
|
|
101
|
+
if not name:
|
|
102
|
+
continue
|
|
103
|
+
try:
|
|
104
|
+
provider = detect_provider(name)
|
|
105
|
+
except Exception:
|
|
106
|
+
continue
|
|
107
|
+
if provider not in UNSUPPORTED_BATCH_PROVIDERS:
|
|
108
|
+
n_capable += 1
|
|
109
|
+
|
|
110
|
+
est_requests = n_rows * n_capable
|
|
111
|
+
if n_capable == 0 or est_requests < _BATCH_NUDGE_MIN_REQUESTS:
|
|
112
|
+
return
|
|
113
|
+
|
|
114
|
+
print(
|
|
115
|
+
f"\n[CatLLM] Tip: this run (~{est_requests:,} API calls across "
|
|
116
|
+
f"{n_capable} batch-capable model(s)) qualifies for batch_mode=True.\n"
|
|
117
|
+
" The async batch API costs ~50% less with identical prompts and\n"
|
|
118
|
+
" results, and gets higher rate limits. The trade-off is latency:\n"
|
|
119
|
+
" the job completes asynchronously (typically minutes to a few\n"
|
|
120
|
+
" hours; 24h worst case). Add batch_mode=True to opt in.\n"
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
|
|
44
124
|
def classify(
|
|
45
125
|
input_data,
|
|
46
126
|
categories,
|
|
@@ -168,6 +248,8 @@ def classify(
|
|
|
168
248
|
Providers without batch API (HuggingFace, Perplexity, Ollama) fall back to
|
|
169
249
|
synchronous calls and are merged in with the batch results.
|
|
170
250
|
Incompatible with: PDF/image input, progress_callback.
|
|
251
|
+
Large qualifying synchronous runs (>= ~500 estimated API calls)
|
|
252
|
+
print a one-line tip suggesting batch_mode=True.
|
|
171
253
|
batch_poll_interval (float): Seconds between batch job status checks. Default 30.
|
|
172
254
|
batch_timeout (float): Max seconds to wait for batch completion. Default 86400 (24h).
|
|
173
255
|
models (list): For multi-model mode, list of (model, provider, api_key) tuples.
|
|
@@ -355,21 +437,13 @@ def classify(
|
|
|
355
437
|
... consensus_threshold="unanimous", # or "majority", "two-thirds", or 0.75
|
|
356
438
|
... )
|
|
357
439
|
"""
|
|
358
|
-
#
|
|
359
|
-
#
|
|
360
|
-
#
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
"`survey_question=` is deprecated in classify(); use "
|
|
366
|
-
"`description=` instead. The value will be mirrored to "
|
|
367
|
-
"`description` for now.",
|
|
368
|
-
DeprecationWarning,
|
|
369
|
-
stacklevel=2,
|
|
370
|
-
)
|
|
371
|
-
if not description:
|
|
372
|
-
description = survey_question
|
|
440
|
+
# Reconcile the canonical `description=` with the deprecated
|
|
441
|
+
# `survey_question=` (each is mirrored into the other when only one is
|
|
442
|
+
# given — see _resolve_description_context for the full rules).
|
|
443
|
+
from ._utils import _resolve_description_context
|
|
444
|
+
description, survey_question = _resolve_description_context(
|
|
445
|
+
description, survey_question, "classify"
|
|
446
|
+
)
|
|
373
447
|
|
|
374
448
|
# Build models list
|
|
375
449
|
if models is None:
|
|
@@ -620,6 +694,28 @@ def classify(
|
|
|
620
694
|
print("\n\n".join(_strategy_warnings))
|
|
621
695
|
print()
|
|
622
696
|
|
|
697
|
+
# =========================================================================
|
|
698
|
+
# Batch-mode cost nudge
|
|
699
|
+
# =========================================================================
|
|
700
|
+
# One-line tip when a large synchronous run would qualify for the async
|
|
701
|
+
# batch API (~50% cheaper, higher rate limits, identical prompts and
|
|
702
|
+
# results). Fires only when batch_mode=True would actually accept this
|
|
703
|
+
# run — text input, no batch-incompatible options, at least one
|
|
704
|
+
# batch-capable provider — so the tip is never a dead end. Informational
|
|
705
|
+
# only: must never affect or abort the run.
|
|
706
|
+
if not batch_mode:
|
|
707
|
+
try:
|
|
708
|
+
_maybe_print_batch_nudge(
|
|
709
|
+
input_data=input_data,
|
|
710
|
+
models=models,
|
|
711
|
+
categories_per_call=categories_per_call,
|
|
712
|
+
chain_of_verification=chain_of_verification,
|
|
713
|
+
embedding_tiebreaker=embedding_tiebreaker,
|
|
714
|
+
progress_callback=progress_callback,
|
|
715
|
+
)
|
|
716
|
+
except Exception:
|
|
717
|
+
pass
|
|
718
|
+
|
|
623
719
|
# =========================================================================
|
|
624
720
|
# JSON formatter fallback
|
|
625
721
|
# =========================================================================
|
|
@@ -186,19 +186,15 @@ def prompt_tune(
|
|
|
186
186
|
... system_prompt=result["system_prompt"],
|
|
187
187
|
... )
|
|
188
188
|
"""
|
|
189
|
-
#
|
|
190
|
-
#
|
|
191
|
-
#
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
stacklevel=2,
|
|
199
|
-
)
|
|
200
|
-
if not description:
|
|
201
|
-
description = survey_question
|
|
189
|
+
# Reconcile the canonical `description=` with the deprecated
|
|
190
|
+
# `survey_question=` (each is mirrored into the other when only one is
|
|
191
|
+
# given — see _resolve_description_context for the full rules). Without
|
|
192
|
+
# the description->survey_question direction, description-only callers
|
|
193
|
+
# ran the whole tuning loop with no "Context:" line in the prompts.
|
|
194
|
+
from ._utils import _resolve_description_context
|
|
195
|
+
description, survey_question = _resolve_description_context(
|
|
196
|
+
description, survey_question, "prompt_tune"
|
|
197
|
+
)
|
|
202
198
|
|
|
203
199
|
# Build models list
|
|
204
200
|
if models is None:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|