cat-stack 1.6.7__tar.gz → 1.6.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {cat_stack-1.6.7 → cat_stack-1.6.9}/.gitignore +8 -0
  2. {cat_stack-1.6.7 → cat_stack-1.6.9}/PKG-INFO +1 -1
  3. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/__about__.py +1 -1
  4. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_formatter.py +90 -8
  5. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_providers.py +127 -6
  6. {cat_stack-1.6.7 → cat_stack-1.6.9}/LICENSE +0 -0
  7. {cat_stack-1.6.7 → cat_stack-1.6.9}/README.md +0 -0
  8. {cat_stack-1.6.7 → cat_stack-1.6.9}/pyproject.toml +0 -0
  9. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/cat_stack/__init__.py +0 -0
  10. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/__init__.py +0 -0
  11. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_batch.py +0 -0
  12. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_category_analysis.py +0 -0
  13. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_chunked.py +0 -0
  14. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_embeddings.py +0 -0
  15. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_pilot_test.py +0 -0
  16. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_prompts.py +0 -0
  17. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_review_ui.py +0 -0
  18. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_tiebreaker.py +0 -0
  19. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_utils.py +0 -0
  20. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_web_fetch.py +0 -0
  21. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_wrapper_helpers.py +0 -0
  22. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/calls/CoVe.py +0 -0
  23. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/calls/__init__.py +0 -0
  24. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/calls/image_CoVe.py +0 -0
  25. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/calls/image_stepback.py +0 -0
  26. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/calls/pdf_CoVe.py +0 -0
  27. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/calls/pdf_stepback.py +0 -0
  28. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/calls/stepback.py +0 -0
  29. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/calls/top_n.py +0 -0
  30. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/classify.py +0 -0
  31. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/explore.py +0 -0
  32. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/extract.py +0 -0
  33. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/image_functions.py +0 -0
  34. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/images/circle.png +0 -0
  35. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/images/cube.png +0 -0
  36. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/images/diamond.png +0 -0
  37. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/images/overlapping_pentagons.png +0 -0
  38. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/images/rectangles.png +0 -0
  39. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/model_reference_list.py +0 -0
  40. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/pdf_functions.py +0 -0
  41. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/prompt_tune.py +0 -0
  42. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/summarize.py +0 -0
  43. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/text_functions.py +0 -0
  44. {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/text_functions_ensemble.py +0 -0
@@ -41,3 +41,11 @@ survey-summarizer
41
41
  # R package build/check artifacts
42
42
  *.Rcheck/
43
43
  *.tar.gz
44
+
45
+ # Local experiment scratch (not part of the package)
46
+ /patches/
47
+ /AGENTS.md
48
+ /test_live_*.py
49
+ /test_smoke_*.py
50
+ /test_stress_*.py
51
+ /test_parallel_*.py
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cat-stack
3
- Version: 1.6.7
3
+ Version: 1.6.9
4
4
  Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
5
5
  Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
6
6
  Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
@@ -1,7 +1,7 @@
1
1
  # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
2
2
  #
3
3
  # SPDX-License-Identifier: GPL-3.0-or-later
4
- __version__ = "1.6.7"
4
+ __version__ = "1.6.9"
5
5
  __author__ = "Chris Soria"
6
6
  __email__ = "chrissoria@berkeley.edu"
7
7
  __title__ = "cat-stack"
@@ -45,6 +45,11 @@ def _check_dependencies():
45
45
  def _check_dependencies_installed() -> bool:
46
46
  """Pure check — returns True if all formatter deps import successfully.
47
47
  No side effects, no install attempts."""
48
+ # If a dep was just pip-installed in this process's lifetime, the import
49
+ # system may have cached its earlier absence; clear that so the re-check
50
+ # actually sees the freshly-installed package.
51
+ import importlib
52
+ importlib.invalidate_caches()
48
53
  try:
49
54
  import torch # noqa: F401
50
55
  import transformers # noqa: F401
@@ -165,7 +170,31 @@ def _ensure_dependencies(verbose: bool = True) -> bool:
165
170
  " To skip this and disable the formatter, pass json_formatter=False."
166
171
  )
167
172
 
168
- return _install_dependencies(verbose=verbose)
173
+ ok = _install_dependencies(verbose=verbose)
174
+ if not ok:
175
+ # Freshly pip-installed packages (esp. compiled ones like torch) often
176
+ # cannot be imported by the SAME running process — but they ARE on disk
177
+ # now. Tell the user to re-run rather than silently degrading every row
178
+ # to an error.
179
+ if verbose and _deps_on_disk():
180
+ print(
181
+ "[CatLLM] Formatter dependencies were just installed but cannot "
182
+ "be imported into the already-running process. Please RE-RUN your "
183
+ "command — they will load on the next start. (Avoid this by "
184
+ "pre-installing: pip install 'cat-stack[formatter]'.)"
185
+ )
186
+ return ok
187
+
188
+
189
+ def _deps_on_disk() -> bool:
190
+ """True if the formatter deps are findable on disk (importable in a FRESH
191
+ process) even if they failed to import in the current one."""
192
+ import importlib.util
193
+ try:
194
+ return all(importlib.util.find_spec(m) is not None
195
+ for m in ("torch", "transformers", "accelerate"))
196
+ except (ImportError, ValueError):
197
+ return False
169
198
 
170
199
 
171
200
  def _is_model_cached() -> bool:
@@ -205,6 +234,51 @@ def ensure_formatter_available() -> bool:
205
234
  return True # actual download happens in load_formatter()
206
235
 
207
236
 
def _load_formatter_tokenizer(AutoTokenizer):
    """Load the formatter tokenizer, defending against a malformed
    `tokenizer_config.json`.

    Some published configs store `extra_special_tokens` as a LIST, but
    transformers 4.56–4.57.x expect a dict and crash in
    `_set_model_specific_special_tokens` with
    `'list' object has no attribute 'keys'`. On that failure we snapshot the
    repo locally, normalize a list-valued `extra_special_tokens` to `{}`
    (the tokens already live in `added_tokens`/`special_tokens_map`, so
    dropping the field is lossless), and load from the patched local copy.

    Args:
        AutoTokenizer: Object whose ``from_pretrained`` is used to build the
            tokenizer (the caller passes ``transformers.AutoTokenizer``).

    Returns:
        The tokenizer returned by ``AutoTokenizer.from_pretrained``.

    Raises:
        AttributeError, TypeError: Re-raised unchanged when the failure does
            not look like the known `extra_special_tokens` crash, or when the
            downloaded config does not actually hold a list in that field.
    """
    try:
        return AutoTokenizer.from_pretrained(
            _MERGED_MODEL_REPO, trust_remote_code=True
        )
    except (AttributeError, TypeError) as e:
        message = str(e)
        if "keys" not in message and "extra_special_tokens" not in message:
            raise

        import atexit
        import json
        import os
        import shutil
        import tempfile

        from huggingface_hub import snapshot_download

        local_dir = snapshot_download(_MERGED_MODEL_REPO)
        cfg_path = os.path.join(local_dir, "tokenizer_config.json")
        # JSON is UTF-8; do not depend on the platform's default encoding.
        with open(cfg_path, encoding="utf-8") as f:
            cfg = json.load(f)

        if not isinstance(cfg.get("extra_special_tokens"), list):
            # Not the malformed-config case this helper knows how to repair:
            # surface the original loading error.
            raise

        cfg["extra_special_tokens"] = {}

        # snapshot dirs are often read-only symlink caches; patch a copy.
        patched = tempfile.mkdtemp(prefix="catllm_formatter_tok_")
        # Remove the scratch copy when the process exits instead of leaking
        # one directory per call. It is kept for the process lifetime because
        # the loaded tokenizer may still reference files inside it.
        atexit.register(shutil.rmtree, patched, ignore_errors=True)

        # The snapshot also holds the model weights. The tokenizer does not
        # need them, and copying them would duplicate gigabytes into the
        # temp directory.
        weight_suffixes = (
            ".safetensors", ".bin", ".pt", ".pth", ".ckpt",
            ".gguf", ".h5", ".msgpack", ".onnx",
        )
        for fn in os.listdir(local_dir):
            src = os.path.join(local_dir, fn)
            if os.path.isfile(src) and not fn.lower().endswith(weight_suffixes):
                shutil.copy(src, os.path.join(patched, fn))

        # Overwrite the copied config with the normalized one.
        with open(
            os.path.join(patched, "tokenizer_config.json"), "w", encoding="utf-8"
        ) as f:
            json.dump(cfg, f)
        print("[CatLLM] Patched malformed extra_special_tokens in the "
              "formatter tokenizer config (list -> {}).")
        return AutoTokenizer.from_pretrained(patched, trust_remote_code=True)
280
+
281
+
208
282
  def load_formatter(device=None):
209
283
  """
210
284
  Load the merged formatter model and tokenizer.
@@ -230,15 +304,21 @@ def load_formatter(device=None):
230
304
  dtype = torch.float16 if device == "cuda" else torch.float32
231
305
 
232
306
  print(f"[CatLLM] Loading JSON formatter on {device}...")
233
- tokenizer = AutoTokenizer.from_pretrained(
234
- _MERGED_MODEL_REPO, trust_remote_code=True
235
- )
307
+ tokenizer = _load_formatter_tokenizer(AutoTokenizer)
236
308
  if tokenizer.pad_token is None:
237
309
  tokenizer.pad_token = tokenizer.eos_token
238
310
 
239
- model = AutoModelForCausalLM.from_pretrained(
240
- _MERGED_MODEL_REPO, dtype=dtype, trust_remote_code=True
241
- )
311
+ # `dtype=` is the transformers >=4.56 kwarg; older versions only accept
312
+ # `torch_dtype=` and crash if `dtype=` leaks into the config. Try the new
313
+ # name, fall back to the old one.
314
+ try:
315
+ model = AutoModelForCausalLM.from_pretrained(
316
+ _MERGED_MODEL_REPO, dtype=dtype, trust_remote_code=True
317
+ )
318
+ except TypeError:
319
+ model = AutoModelForCausalLM.from_pretrained(
320
+ _MERGED_MODEL_REPO, torch_dtype=dtype, trust_remote_code=True
321
+ )
242
322
  model = model.to(device)
243
323
  model.eval()
244
324
 
@@ -281,7 +361,9 @@ def run_formatter(raw_output, categories, model, tokenizer, device):
281
361
  with torch.no_grad():
282
362
  out = model.generate(
283
363
  **inputs,
284
- max_new_tokens=128,
364
+ # 512 (was 128): a large category set produces a long N-key JSON
365
+ # object; 128 tokens truncated it for 28/48-category tasks.
366
+ max_new_tokens=512,
285
367
  do_sample=False,
286
368
  temperature=None,
287
369
  top_p=None,
@@ -126,6 +126,27 @@ def _hf_model_needs_enable_thinking_off(model: str) -> bool:
126
126
  return any(model.startswith(p) for p in _HF_NEEDS_ENABLE_THINKING_OFF)
127
127
 
128
128
 
129
+ # Router-served models measured (2026-06-12 reasoning audit) to reason by
130
+ # default with NO honored off-switch through the OpenAI-compatible router:
131
+ # the router 400-rejects `chat_template_kwargs.enable_thinking` for their
132
+ # templates, and they expose no reasoning_effort. classify() warns once per
133
+ # client so users know the provider default applies.
134
+ _HF_DEFAULT_REASONING_PREFIXES = (
135
+ "openai/gpt-oss",
136
+ "moonshotai/kimi-k2",
137
+ )
138
+
139
+
140
+ def _hf_model_reasons_by_default(model: str) -> bool:
141
+ m = (model or "").lower()
142
+ return any(m.startswith(p) for p in _HF_DEFAULT_REASONING_PREFIXES)
143
+
144
+
145
+ # Module-level: models already warned about uncontrolled reasoning, so the
146
+ # warning fires once per process even though a fresh client is built per row.
147
+ _WARNED_UNCONTROLLED_REASONING: set = set()
148
+
149
+
129
150
  # ---------------------------------------------------------------------------
130
151
  # Anthropic deprecated the `temperature` parameter starting with the Opus 4.7 /
131
152
  # 4.8 generation: these models return 400 "`temperature` is deprecated for this
@@ -545,8 +566,15 @@ class UnifiedLLMClient:
545
566
  # accept booleans). Without this, gpt-oss family models emit long
546
567
  # <think> blocks by default that bloat per-row generation 3-5x.
547
568
  return self._build_openai_payload(messages, json_schema, creativity, force_json, thinking_budget)
569
+ elif self.provider == "xai":
570
+ # v1.6.8: forward the reasoning request. grok-4.3+ hybrids reason
571
+ # by default (2026-06-12 audit: 214 reasoning tokens on a trivial
572
+ # probe with no control sent); non-reasoning variants reject
573
+ # reasoning_effort and are handled by the 400 fallback in
574
+ # complete(), which caches the rejection on the client.
575
+ return self._build_openai_payload(messages, json_schema, creativity, force_json, thinking_budget)
548
576
  else:
549
- # Other OpenAI-compatible providers (xai, mistral, etc.)
577
+ # Other OpenAI-compatible providers (mistral, etc.)
550
578
  return self._build_openai_payload(messages, json_schema, creativity, force_json)
551
579
 
552
580
  def _build_openai_payload(
@@ -620,6 +648,25 @@ class UnifiedLLMClient:
620
648
  elif creativity is not None:
621
649
  payload["temperature"] = creativity
622
650
 
651
+ # xAI (v1.6.8): hybrid grok models accept reasoning_effort alongside
652
+ # temperature. "low" is the lowest tier xAI exposes (no "none" /
653
+ # "minimal"); explicitly non-reasoning variants 400 on the field —
654
+ # complete() pops it and caches `_xai_no_reasoning_effort` so later
655
+ # rows on this client skip the doomed field up front.
656
+ if (
657
+ self.provider == "xai"
658
+ and thinking_budget is not None
659
+ and not getattr(self, "_xai_no_reasoning_effort", False)
660
+ # Variants whose name already encodes "non-reasoning" are off by
661
+ # model choice; sending reasoning_effort to them is not just
662
+ # redundant but HARMFUL — verified 2026-06-13 that
663
+ # grok-4-1-fast-non-reasoning returns 0 reasoning tokens with no
664
+ # field but 207 when sent reasoning_effort="low", i.e. the field
665
+ # turns reasoning back ON. Leave these alone.
666
+ and "non-reasoning" not in (self.model or "").lower()
667
+ ):
668
+ payload["reasoning_effort"] = "low" if thinking_budget == 0 else "high"
669
+
623
670
  # Ollama: per-model-family reasoning control via the top-level
624
671
  # `think` field. gpt-oss expects an enum ("low"/"medium"/"high");
625
672
  # qwen3/deepseek-r1 expect a boolean. Models not in the
@@ -648,6 +695,24 @@ class UnifiedLLMClient:
648
695
  and _hf_model_needs_enable_thinking_off(self.model)
649
696
  ):
650
697
  payload["chat_template_kwargs"] = {"enable_thinking": False}
698
+ elif (
699
+ self.provider in ("huggingface", "huggingface-together")
700
+ and thinking_budget == 0
701
+ and _hf_model_reasons_by_default(self.model)
702
+ and self.model not in _WARNED_UNCONTROLLED_REASONING
703
+ ):
704
+ # v1.6.8: these router-served models reason by default and honor
705
+ # no off-switch through the router (enable_thinking is
706
+ # 400-rejected for their templates). Warn once per process (a
707
+ # fresh client is built per row, so a per-instance flag would
708
+ # warn every row) so the uniform "reasoning off" request isn't
709
+ # silently unmet.
710
+ print(
711
+ f"\n[CatLLM] WARNING: no effective reasoning control delivered "
712
+ f"for '{self.model}'; the provider's default reasoning "
713
+ f"behavior applies. See docs/reasoning-controls.md.\n"
714
+ )
715
+ _WARNED_UNCONTROLLED_REASONING.add(self.model)
651
716
 
652
717
  return payload
653
718
 
@@ -759,11 +824,19 @@ class UnifiedLLMClient:
759
824
  if creativity is not None:
760
825
  payload["generationConfig"]["temperature"] = creativity
761
826
 
762
- # Add thinking budget for extended thinking (Google-specific)
763
- # Must be inside generationConfig, not at top level
764
- # Google requires a reasonable minimum budget (enforce 128 tokens minimum)
765
- if thinking_budget and thinking_budget > 0:
766
- budget = max(thinking_budget, 128)
827
+ # Reasoning control (Google-specific). Must be inside generationConfig.
828
+ # v1.6.8: an explicit zero budget is now SENT at thinking_budget = 0.
829
+ # Previously nothing was sent at 0 and Gemini ran at its provider
830
+ # default, which the 2026-06-12 audit measured as thinking ON
831
+ # (~200+ thought tokens on a trivial classification call). Models
832
+ # that reject 0 (minimum-budget tiers) are handled by the 400
833
+ # fallback in complete(), which caches the discovered floor on the
834
+ # client (`_google_thinking_floor`).
835
+ if thinking_budget is not None:
836
+ if thinking_budget > 0:
837
+ budget = max(thinking_budget, 128)
838
+ else:
839
+ budget = getattr(self, "_google_thinking_floor", 0)
767
840
  payload["generationConfig"]["thinkingConfig"] = {"thinkingBudget": budget}
768
841
 
769
842
  return payload
@@ -946,6 +1019,10 @@ class UnifiedLLMClient:
946
1019
  # transient error this call? Only strip once per call so we don't
947
1020
  # mutate payload on every retry tick.
948
1021
  stripped_response_format = False
1022
+ # v1.6.8: consecutive-timeout counter + one-shot Google schema drop
1023
+ # (see the Timeout handler below).
1024
+ timeout_count = 0
1025
+ dropped_google_schema = False
949
1026
 
950
1027
  for attempt in range(max_retries):
951
1028
  endpoint = self._get_endpoint()
@@ -1019,9 +1096,32 @@ class UnifiedLLMClient:
1019
1096
  payload["reasoning_effort"] = "low"
1020
1097
  continue
1021
1098
  elif current == "low" and "reasoning_effort" in payload:
1099
+ # Model takes no reasoning_effort at all (e.g.
1100
+ # xAI's explicitly non-reasoning variants).
1101
+ # Cache so later rows on this client skip the
1102
+ # doomed field up front (v1.6.8).
1103
+ self._xai_no_reasoning_effort = True
1022
1104
  payload.pop("reasoning_effort")
1023
1105
  continue
1024
1106
 
1107
+ # Google (v1.6.8): minimum-budget thinking tiers reject
1108
+ # thinkingBudget: 0. Fall back to 128 (Google's stated
1109
+ # minimum) and cache on the client.
1110
+ if (
1111
+ self.provider == "google"
1112
+ and "thinking" in error_text
1113
+ and ("budget" in error_text or "invalid" in error_text
1114
+ or "unsupported" in error_text)
1115
+ and payload.get("generationConfig", {})
1116
+ .get("thinkingConfig", {})
1117
+ .get("thinkingBudget") == 0
1118
+ ):
1119
+ self._google_thinking_floor = 128
1120
+ payload["generationConfig"]["thinkingConfig"]["thinkingBudget"] = 128
1121
+ print(f"\n[CatLLM] Model '{self.model}' rejected thinkingBudget=0; "
1122
+ f"falling back to the minimum (128) and caching for this client.\n")
1123
+ continue
1124
+
1025
1125
  # Anthropic deprecated `temperature` for newer models
1026
1126
  # (Opus 4.7+): they 400 with "`temperature` is deprecated
1027
1127
  # for this model." Strip it, cache on the client so the
@@ -1113,6 +1213,27 @@ class UnifiedLLMClient:
1113
1213
  return result, None
1114
1214
 
1115
1215
  except requests.exceptions.Timeout:
1216
+ timeout_count += 1
1217
+ # v1.6.8: Gemini can reproducibly hang on specific inputs
1218
+ # when a strict responseSchema is attached (constrained-
1219
+ # decoding pathology; 2026-06-12 audit — a trivial input
1220
+ # timed out 6/6 attempts WITH the schema and answered
1221
+ # instantly without it). After two consecutive timeouts with
1222
+ # a schema attached, drop the schema once and re-ask: the
1223
+ # prompt still requests JSON and extract_json() parses it
1224
+ # from the free-text response.
1225
+ if (
1226
+ self.provider == "google"
1227
+ and timeout_count >= 2
1228
+ and not dropped_google_schema
1229
+ and "responseSchema" in payload.get("generationConfig", {})
1230
+ ):
1231
+ dropped_google_schema = True
1232
+ payload["generationConfig"].pop("responseSchema", None)
1233
+ print(f"[CatLLM] Repeated timeouts from '{self.model}' with "
1234
+ f"responseSchema attached; retrying schema-less "
1235
+ f"(prompt-based JSON parsing).")
1236
+ continue
1116
1237
  wait_time = _backoff_with_jitter(initial_delay, attempt)
1117
1238
  elapsed = time.monotonic() - start
1118
1239
  if attempt < max_retries - 1 and elapsed + wait_time <= max_total_wait:
File without changes
File without changes
File without changes