cat-stack 2.0.0b6__tar.gz → 2.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/PKG-INFO +1 -1
  2. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/__about__.py +1 -1
  3. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_utils.py +38 -0
  4. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/classify.py +111 -15
  5. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/prompt_tune.py +9 -13
  6. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/.gitignore +0 -0
  7. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/LICENSE +0 -0
  8. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/README.md +0 -0
  9. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/pyproject.toml +0 -0
  10. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/cat_stack/__init__.py +0 -0
  11. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/__init__.py +0 -0
  12. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_batch.py +0 -0
  13. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_category_analysis.py +0 -0
  14. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_chunked.py +0 -0
  15. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_embeddings.py +0 -0
  16. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_formatter.py +0 -0
  17. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_pilot_test.py +0 -0
  18. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_prompts.py +0 -0
  19. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_providers.py +0 -0
  20. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_review_ui.py +0 -0
  21. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_tiebreaker.py +0 -0
  22. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_web_fetch.py +0 -0
  23. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/_wrapper_helpers.py +0 -0
  24. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/calls/CoVe.py +0 -0
  25. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/calls/__init__.py +0 -0
  26. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/calls/image_CoVe.py +0 -0
  27. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/calls/image_stepback.py +0 -0
  28. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/calls/pdf_CoVe.py +0 -0
  29. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/calls/pdf_stepback.py +0 -0
  30. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/calls/stepback.py +0 -0
  31. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/calls/top_n.py +0 -0
  32. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/collapse_themes.py +0 -0
  33. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/explore.py +0 -0
  34. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/extract.py +0 -0
  35. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/image_functions.py +0 -0
  36. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/images/circle.png +0 -0
  37. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/images/cube.png +0 -0
  38. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/images/diamond.png +0 -0
  39. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/images/overlapping_pentagons.png +0 -0
  40. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/images/rectangles.png +0 -0
  41. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/model_reference_list.py +0 -0
  42. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/pdf_functions.py +0 -0
  43. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/summarize.py +0 -0
  44. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/text_functions.py +0 -0
  45. {cat_stack-2.0.0b6 → cat_stack-2.0.1}/src/catstack/text_functions_ensemble.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cat-stack
3
- Version: 2.0.0b6
3
+ Version: 2.0.1
4
4
  Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
5
5
  Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
6
6
  Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
@@ -1,7 +1,7 @@
1
1
  # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
2
2
  #
3
3
  # SPDX-License-Identifier: GPL-3.0-or-later
4
- __version__ = "2.0.0b6"
4
+ __version__ = "2.0.1"
5
5
  __author__ = "Chris Soria"
6
6
  __email__ = "chrissoria@berkeley.edu"
7
7
  __title__ = "cat-stack"
@@ -9,6 +9,8 @@ import json
9
9
  import re
10
10
 
11
11
  __all__ = [
12
+ # Param resolution
13
+ "_resolve_description_context",
12
14
  # JSON utilities
13
15
  "build_json_schema",
14
16
  "validate_classification_json",
@@ -31,6 +33,42 @@ __all__ = [
31
33
  ]
32
34
 
33
35
 
36
+ # =============================================================================
37
+ # Param Resolution
38
+ # =============================================================================
39
+
40
+ def _resolve_description_context(description, survey_question, fn_name):
41
+ """Reconcile the canonical `description=` with its deprecated alias
42
+ `survey_question=` for entry points whose downstream prompt assembly
43
+ still keys the text-prompt "Context:" line (plus step-back and
44
+ categories="auto") off `survey_question`.
45
+
46
+ Returns the reconciled ``(description, survey_question)`` pair:
47
+ - only survey_question given -> DeprecationWarning; mirrored into
48
+ description.
49
+ - only description given -> mirrored into survey_question so the context
50
+ framing isn't silently lost (description-only callers include every
51
+ domain wrapper).
52
+ - both given -> kept distinct (e.g. cat-vader: survey_question= feed
53
+ question for the Context line, description= platform context).
54
+ """
55
+ import warnings
56
+
57
+ if survey_question:
58
+ warnings.warn(
59
+ f"`survey_question=` is deprecated in {fn_name}(); use "
60
+ "`description=` instead. The value will be mirrored to "
61
+ "`description` for now.",
62
+ DeprecationWarning,
63
+ stacklevel=3,
64
+ )
65
+ if not description:
66
+ description = survey_question
67
+ elif description:
68
+ survey_question = description
69
+ return description, survey_question
70
+
71
+
34
72
  # =============================================================================
35
73
  # Label Cleaning
36
74
  # =============================================================================
@@ -41,6 +41,86 @@ from .image_functions import image_multi_class
41
41
  from .pdf_functions import pdf_multi_class
42
42
 
43
43
 
44
+ # Minimum estimated API calls (rows x batch-capable models) before the
45
+ # batch-mode cost tip is worth printing. Below this, the absolute savings
46
+ # are small and the async round-trip isn't worth suggesting.
47
+ _BATCH_NUDGE_MIN_REQUESTS = 500
48
+
49
+
50
+ def _maybe_print_batch_nudge(
51
+ input_data,
52
+ models,
53
+ categories_per_call,
54
+ chain_of_verification,
55
+ embedding_tiebreaker,
56
+ progress_callback,
57
+ ):
58
+ """Print a one-line cost tip when a synchronous run qualifies for
59
+ batch_mode=True. Checks the same eligibility rules the batch path
60
+ enforces, so the tip is only shown when opting in would actually work."""
61
+ # Options the batch path rejects or ignores -> no tip.
62
+ if (
63
+ categories_per_call is not None
64
+ or chain_of_verification
65
+ or embedding_tiebreaker
66
+ or progress_callback is not None
67
+ ):
68
+ return
69
+
70
+ try:
71
+ n_rows = len(input_data)
72
+ except TypeError:
73
+ return
74
+ if n_rows == 0:
75
+ return
76
+
77
+ # Batch mode is text-only.
78
+ from .text_functions_ensemble import _detect_input_type
79
+ if _detect_input_type(input_data) != "text":
80
+ return
81
+
82
+ # Count models on batch-capable providers (openai/anthropic/google/
83
+ # mistral/xai). `models` is already normalized to a list here; provider
84
+ # may still be "auto"/None in the spec, so resolve it the same way
85
+ # prepare_model_configs will.
86
+ from ._batch import UNSUPPORTED_BATCH_PROVIDERS
87
+ from ._providers import detect_provider
88
+
89
+ n_capable = 0
90
+ for m in models:
91
+ name, provider = None, None
92
+ if isinstance(m, (list, tuple)):
93
+ name = m[0] if len(m) >= 1 else None
94
+ provider = m[1] if len(m) >= 2 else None
95
+ elif isinstance(m, dict):
96
+ name = m.get("model")
97
+ provider = m.get("provider")
98
+ elif isinstance(m, str):
99
+ name = m
100
+ if not provider or provider == "auto":
101
+ if not name:
102
+ continue
103
+ try:
104
+ provider = detect_provider(name)
105
+ except Exception:
106
+ continue
107
+ if provider not in UNSUPPORTED_BATCH_PROVIDERS:
108
+ n_capable += 1
109
+
110
+ est_requests = n_rows * n_capable
111
+ if n_capable == 0 or est_requests < _BATCH_NUDGE_MIN_REQUESTS:
112
+ return
113
+
114
+ print(
115
+ f"\n[CatLLM] Tip: this run (~{est_requests:,} API calls across "
116
+ f"{n_capable} batch-capable model(s)) qualifies for batch_mode=True.\n"
117
+ " The async batch API costs ~50% less with identical prompts and\n"
118
+ " results, and gets higher rate limits. The trade-off is latency:\n"
119
+ " the job completes asynchronously (typically minutes to a few\n"
120
+ " hours; 24h worst case). Add batch_mode=True to opt in.\n"
121
+ )
122
+
123
+
44
124
  def classify(
45
125
  input_data,
46
126
  categories,
@@ -168,6 +248,8 @@ def classify(
168
248
  Providers without batch API (HuggingFace, Perplexity, Ollama) fall back to
169
249
  synchronous calls and are merged in with the batch results.
170
250
  Incompatible with: PDF/image input, progress_callback.
251
+ Large qualifying synchronous runs (>= ~500 estimated API calls)
252
+ print a one-line tip suggesting batch_mode=True.
171
253
  batch_poll_interval (float): Seconds between batch job status checks. Default 30.
172
254
  batch_timeout (float): Max seconds to wait for batch completion. Default 86400 (24h).
173
255
  models (list): For multi-model mode, list of (model, provider, api_key) tuples.
@@ -355,21 +437,13 @@ def classify(
355
437
  ... consensus_threshold="unanimous", # or "majority", "two-thirds", or 0.75
356
438
  ... )
357
439
  """
358
- # `description` is the canonical content-neutral way to describe the
359
- # data; `survey_question` is a soft-deprecated alias kept working for
360
- # legacy callers (cat-survey, pre-rename notebooks, the ecosystem
361
- # docs). Mirror it into `description` if `description` wasn't set, so
362
- # downstream prompt assembly only needs to look in one place.
363
- if survey_question:
364
- warnings.warn(
365
- "`survey_question=` is deprecated in classify(); use "
366
- "`description=` instead. The value will be mirrored to "
367
- "`description` for now.",
368
- DeprecationWarning,
369
- stacklevel=2,
370
- )
371
- if not description:
372
- description = survey_question
440
+ # Reconcile the canonical `description=` with the deprecated
441
+ # `survey_question=` (each is mirrored into the other when only one is
442
+ # given — see _resolve_description_context for the full rules).
443
+ from ._utils import _resolve_description_context
444
+ description, survey_question = _resolve_description_context(
445
+ description, survey_question, "classify"
446
+ )
373
447
 
374
448
  # Build models list
375
449
  if models is None:
@@ -620,6 +694,28 @@ def classify(
620
694
  print("\n\n".join(_strategy_warnings))
621
695
  print()
622
696
 
697
+ # =========================================================================
698
+ # Batch-mode cost nudge
699
+ # =========================================================================
700
+ # One-line tip when a large synchronous run would qualify for the async
701
+ # batch API (~50% cheaper, higher rate limits, identical prompts and
702
+ # results). Fires only when batch_mode=True would actually accept this
703
+ # run — text input, no batch-incompatible options, at least one
704
+ # batch-capable provider — so the tip is never a dead end. Informational
705
+ # only: must never affect or abort the run.
706
+ if not batch_mode:
707
+ try:
708
+ _maybe_print_batch_nudge(
709
+ input_data=input_data,
710
+ models=models,
711
+ categories_per_call=categories_per_call,
712
+ chain_of_verification=chain_of_verification,
713
+ embedding_tiebreaker=embedding_tiebreaker,
714
+ progress_callback=progress_callback,
715
+ )
716
+ except Exception:
717
+ pass
718
+
623
719
  # =========================================================================
624
720
  # JSON formatter fallback
625
721
  # =========================================================================
@@ -186,19 +186,15 @@ def prompt_tune(
186
186
  ... system_prompt=result["system_prompt"],
187
187
  ... )
188
188
  """
189
- # `description` is the canonical content-neutral way to describe the
190
- # data; `survey_question` is a soft-deprecated alias kept working for
191
- # legacy callers.
192
- if survey_question:
193
- warnings.warn(
194
- "`survey_question=` is deprecated in prompt_tune(); use "
195
- "`description=` instead. The value will be mirrored to "
196
- "`description` for now.",
197
- DeprecationWarning,
198
- stacklevel=2,
199
- )
200
- if not description:
201
- description = survey_question
189
+ # Reconcile the canonical `description=` with the deprecated
190
+ # `survey_question=` (each is mirrored into the other when only one is
191
+ # given — see _resolve_description_context for the full rules). Without
192
+ # the description->survey_question direction, description-only callers
193
+ # ran the whole tuning loop with no "Context:" line in the prompts.
194
+ from ._utils import _resolve_description_context
195
+ description, survey_question = _resolve_description_context(
196
+ description, survey_question, "prompt_tune"
197
+ )
202
198
 
203
199
  # Build models list
204
200
  if models is None:
File without changes
File without changes
File without changes
File without changes