cat-stack 1.6.7__tar.gz → 1.6.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cat_stack-1.6.7 → cat_stack-1.6.9}/.gitignore +8 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/PKG-INFO +1 -1
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/__about__.py +1 -1
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_formatter.py +90 -8
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_providers.py +127 -6
- {cat_stack-1.6.7 → cat_stack-1.6.9}/LICENSE +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/README.md +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/pyproject.toml +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/cat_stack/__init__.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/__init__.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_batch.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_category_analysis.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_chunked.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_embeddings.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_pilot_test.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_prompts.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_review_ui.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_tiebreaker.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_utils.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_web_fetch.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/_wrapper_helpers.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/calls/CoVe.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/calls/__init__.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/calls/image_CoVe.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/calls/image_stepback.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/calls/pdf_CoVe.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/calls/pdf_stepback.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/calls/stepback.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/calls/top_n.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/classify.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/explore.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/extract.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/image_functions.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/images/circle.png +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/images/cube.png +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/images/diamond.png +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/images/overlapping_pentagons.png +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/images/rectangles.png +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/model_reference_list.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/pdf_functions.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/prompt_tune.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/summarize.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/text_functions.py +0 -0
- {cat_stack-1.6.7 → cat_stack-1.6.9}/src/catstack/text_functions_ensemble.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cat-stack
|
|
3
|
-
Version: 1.6.7
|
|
3
|
+
Version: 1.6.9
|
|
4
4
|
Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
|
|
5
5
|
Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
4
|
-
__version__ = "1.6.7"
|
|
4
|
+
__version__ = "1.6.9"
|
|
5
5
|
__author__ = "Chris Soria"
|
|
6
6
|
__email__ = "chrissoria@berkeley.edu"
|
|
7
7
|
__title__ = "cat-stack"
|
|
@@ -45,6 +45,11 @@ def _check_dependencies():
|
|
|
45
45
|
def _check_dependencies_installed() -> bool:
|
|
46
46
|
"""Pure check — returns True if all formatter deps import successfully.
|
|
47
47
|
No side effects, no install attempts."""
|
|
48
|
+
# If a dep was just pip-installed in this process's lifetime, the import
|
|
49
|
+
# system may have cached its earlier absence; clear that so the re-check
|
|
50
|
+
# actually sees the freshly-installed package.
|
|
51
|
+
import importlib
|
|
52
|
+
importlib.invalidate_caches()
|
|
48
53
|
try:
|
|
49
54
|
import torch # noqa: F401
|
|
50
55
|
import transformers # noqa: F401
|
|
@@ -165,7 +170,31 @@ def _ensure_dependencies(verbose: bool = True) -> bool:
|
|
|
165
170
|
" To skip this and disable the formatter, pass json_formatter=False."
|
|
166
171
|
)
|
|
167
172
|
|
|
168
|
-
|
|
173
|
+
ok = _install_dependencies(verbose=verbose)
|
|
174
|
+
if not ok:
|
|
175
|
+
# Freshly pip-installed packages (esp. compiled ones like torch) often
|
|
176
|
+
# cannot be imported by the SAME running process — but they ARE on disk
|
|
177
|
+
# now. Tell the user to re-run rather than silently degrading every row
|
|
178
|
+
# to an error.
|
|
179
|
+
if verbose and _deps_on_disk():
|
|
180
|
+
print(
|
|
181
|
+
"[CatLLM] Formatter dependencies were just installed but cannot "
|
|
182
|
+
"be imported into the already-running process. Please RE-RUN your "
|
|
183
|
+
"command — they will load on the next start. (Avoid this by "
|
|
184
|
+
"pre-installing: pip install 'cat-stack[formatter]'.)"
|
|
185
|
+
)
|
|
186
|
+
return ok
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _deps_on_disk() -> bool:
|
|
190
|
+
"""True if the formatter deps are findable on disk (importable in a FRESH
|
|
191
|
+
process) even if they failed to import in the current one."""
|
|
192
|
+
import importlib.util
|
|
193
|
+
try:
|
|
194
|
+
return all(importlib.util.find_spec(m) is not None
|
|
195
|
+
for m in ("torch", "transformers", "accelerate"))
|
|
196
|
+
except (ImportError, ValueError):
|
|
197
|
+
return False
|
|
169
198
|
|
|
170
199
|
|
|
171
200
|
def _is_model_cached() -> bool:
|
|
@@ -205,6 +234,51 @@ def ensure_formatter_available() -> bool:
|
|
|
205
234
|
return True # actual download happens in load_formatter()
|
|
206
235
|
|
|
207
236
|
|
|
237
|
+
def _load_formatter_tokenizer(AutoTokenizer):
|
|
238
|
+
"""Load the formatter tokenizer, defending against a malformed
|
|
239
|
+
`tokenizer_config.json`.
|
|
240
|
+
|
|
241
|
+
Some published configs store `extra_special_tokens` as a LIST, but
|
|
242
|
+
transformers 4.56–4.57.x expect a dict and crash in
|
|
243
|
+
`_set_model_specific_special_tokens` with
|
|
244
|
+
`'list' object has no attribute 'keys'`. On that failure we snapshot the
|
|
245
|
+
repo locally, normalize a list-valued `extra_special_tokens` to `{}`
|
|
246
|
+
(the tokens already live in `added_tokens`/`special_tokens_map`, so
|
|
247
|
+
dropping the field is lossless), and load from the patched local copy.
|
|
248
|
+
"""
|
|
249
|
+
try:
|
|
250
|
+
return AutoTokenizer.from_pretrained(
|
|
251
|
+
_MERGED_MODEL_REPO, trust_remote_code=True
|
|
252
|
+
)
|
|
253
|
+
except (AttributeError, TypeError) as e:
|
|
254
|
+
if "keys" not in str(e) and "extra_special_tokens" not in str(e):
|
|
255
|
+
raise
|
|
256
|
+
import json
|
|
257
|
+
import os
|
|
258
|
+
from huggingface_hub import snapshot_download
|
|
259
|
+
|
|
260
|
+
local_dir = snapshot_download(_MERGED_MODEL_REPO)
|
|
261
|
+
cfg_path = os.path.join(local_dir, "tokenizer_config.json")
|
|
262
|
+
with open(cfg_path) as f:
|
|
263
|
+
cfg = json.load(f)
|
|
264
|
+
if isinstance(cfg.get("extra_special_tokens"), list):
|
|
265
|
+
cfg["extra_special_tokens"] = {}
|
|
266
|
+
# snapshot dirs are often read-only symlink caches; patch a copy.
|
|
267
|
+
import tempfile
|
|
268
|
+
import shutil
|
|
269
|
+
patched = tempfile.mkdtemp(prefix="catllm_formatter_tok_")
|
|
270
|
+
for fn in os.listdir(local_dir):
|
|
271
|
+
src = os.path.join(local_dir, fn)
|
|
272
|
+
if os.path.isfile(src):
|
|
273
|
+
shutil.copy(src, os.path.join(patched, fn))
|
|
274
|
+
with open(os.path.join(patched, "tokenizer_config.json"), "w") as f:
|
|
275
|
+
json.dump(cfg, f)
|
|
276
|
+
print("[CatLLM] Patched malformed extra_special_tokens in the "
|
|
277
|
+
"formatter tokenizer config (list -> {}).")
|
|
278
|
+
return AutoTokenizer.from_pretrained(patched, trust_remote_code=True)
|
|
279
|
+
raise
|
|
280
|
+
|
|
281
|
+
|
|
208
282
|
def load_formatter(device=None):
|
|
209
283
|
"""
|
|
210
284
|
Load the merged formatter model and tokenizer.
|
|
@@ -230,15 +304,21 @@ def load_formatter(device=None):
|
|
|
230
304
|
dtype = torch.float16 if device == "cuda" else torch.float32
|
|
231
305
|
|
|
232
306
|
print(f"[CatLLM] Loading JSON formatter on {device}...")
|
|
233
|
-
tokenizer = AutoTokenizer.from_pretrained(
|
|
234
|
-
_MERGED_MODEL_REPO, trust_remote_code=True
|
|
235
|
-
)
|
|
307
|
+
tokenizer = _load_formatter_tokenizer(AutoTokenizer)
|
|
236
308
|
if tokenizer.pad_token is None:
|
|
237
309
|
tokenizer.pad_token = tokenizer.eos_token
|
|
238
310
|
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
311
|
+
# `dtype=` is the transformers >=4.56 kwarg; older versions only accept
|
|
312
|
+
# `torch_dtype=` and crash if `dtype=` leaks into the config. Try the new
|
|
313
|
+
# name, fall back to the old one.
|
|
314
|
+
try:
|
|
315
|
+
model = AutoModelForCausalLM.from_pretrained(
|
|
316
|
+
_MERGED_MODEL_REPO, dtype=dtype, trust_remote_code=True
|
|
317
|
+
)
|
|
318
|
+
except TypeError:
|
|
319
|
+
model = AutoModelForCausalLM.from_pretrained(
|
|
320
|
+
_MERGED_MODEL_REPO, torch_dtype=dtype, trust_remote_code=True
|
|
321
|
+
)
|
|
242
322
|
model = model.to(device)
|
|
243
323
|
model.eval()
|
|
244
324
|
|
|
@@ -281,7 +361,9 @@ def run_formatter(raw_output, categories, model, tokenizer, device):
|
|
|
281
361
|
with torch.no_grad():
|
|
282
362
|
out = model.generate(
|
|
283
363
|
**inputs,
|
|
284
|
-
max_new_tokens=128,
|
|
364
|
+
# 512 (was 128): a large category set produces a long N-key JSON
|
|
365
|
+
# object; 128 tokens truncated it for 28/48-category tasks.
|
|
366
|
+
max_new_tokens=512,
|
|
285
367
|
do_sample=False,
|
|
286
368
|
temperature=None,
|
|
287
369
|
top_p=None,
|
|
@@ -126,6 +126,27 @@ def _hf_model_needs_enable_thinking_off(model: str) -> bool:
|
|
|
126
126
|
return any(model.startswith(p) for p in _HF_NEEDS_ENABLE_THINKING_OFF)
|
|
127
127
|
|
|
128
128
|
|
|
129
|
+
# Router-served models measured (2026-06-12 reasoning audit) to reason by
|
|
130
|
+
# default with NO honored off-switch through the OpenAI-compatible router:
|
|
131
|
+
# the router 400-rejects `chat_template_kwargs.enable_thinking` for their
|
|
132
|
+
# templates, and they expose no reasoning_effort. classify() warns once per
|
|
133
|
+
# client so users know the provider default applies.
|
|
134
|
+
_HF_DEFAULT_REASONING_PREFIXES = (
|
|
135
|
+
"openai/gpt-oss",
|
|
136
|
+
"moonshotai/kimi-k2",
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _hf_model_reasons_by_default(model: str) -> bool:
|
|
141
|
+
m = (model or "").lower()
|
|
142
|
+
return any(m.startswith(p) for p in _HF_DEFAULT_REASONING_PREFIXES)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# Module-level: models already warned about uncontrolled reasoning, so the
|
|
146
|
+
# warning fires once per process even though a fresh client is built per row.
|
|
147
|
+
_WARNED_UNCONTROLLED_REASONING: set = set()
|
|
148
|
+
|
|
149
|
+
|
|
129
150
|
# ---------------------------------------------------------------------------
|
|
130
151
|
# Anthropic deprecated the `temperature` parameter starting with the Opus 4.7 /
|
|
131
152
|
# 4.8 generation: these models return 400 "`temperature` is deprecated for this
|
|
@@ -545,8 +566,15 @@ class UnifiedLLMClient:
|
|
|
545
566
|
# accept booleans). Without this, gpt-oss family models emit long
|
|
546
567
|
# <think> blocks by default that bloat per-row generation 3-5x.
|
|
547
568
|
return self._build_openai_payload(messages, json_schema, creativity, force_json, thinking_budget)
|
|
569
|
+
elif self.provider == "xai":
|
|
570
|
+
# v1.6.8: forward the reasoning request. grok-4.3+ hybrids reason
|
|
571
|
+
# by default (2026-06-12 audit: 214 reasoning tokens on a trivial
|
|
572
|
+
# probe with no control sent); non-reasoning variants reject
|
|
573
|
+
# reasoning_effort and are handled by the 400 fallback in
|
|
574
|
+
# complete(), which caches the rejection on the client.
|
|
575
|
+
return self._build_openai_payload(messages, json_schema, creativity, force_json, thinking_budget)
|
|
548
576
|
else:
|
|
549
|
-
# Other OpenAI-compatible providers (
|
|
577
|
+
# Other OpenAI-compatible providers (mistral, etc.)
|
|
550
578
|
return self._build_openai_payload(messages, json_schema, creativity, force_json)
|
|
551
579
|
|
|
552
580
|
def _build_openai_payload(
|
|
@@ -620,6 +648,25 @@ class UnifiedLLMClient:
|
|
|
620
648
|
elif creativity is not None:
|
|
621
649
|
payload["temperature"] = creativity
|
|
622
650
|
|
|
651
|
+
# xAI (v1.6.8): hybrid grok models accept reasoning_effort alongside
|
|
652
|
+
# temperature. "low" is the lowest tier xAI exposes (no "none" /
|
|
653
|
+
# "minimal"); explicitly non-reasoning variants 400 on the field —
|
|
654
|
+
# complete() pops it and caches `_xai_no_reasoning_effort` so later
|
|
655
|
+
# rows on this client skip the doomed field up front.
|
|
656
|
+
if (
|
|
657
|
+
self.provider == "xai"
|
|
658
|
+
and thinking_budget is not None
|
|
659
|
+
and not getattr(self, "_xai_no_reasoning_effort", False)
|
|
660
|
+
# Variants whose name already encodes "non-reasoning" are off by
|
|
661
|
+
# model choice; sending reasoning_effort to them is not just
|
|
662
|
+
# redundant but HARMFUL — verified 2026-06-13 that
|
|
663
|
+
# grok-4-1-fast-non-reasoning returns 0 reasoning tokens with no
|
|
664
|
+
# field but 207 when sent reasoning_effort="low", i.e. the field
|
|
665
|
+
# turns reasoning back ON. Leave these alone.
|
|
666
|
+
and "non-reasoning" not in (self.model or "").lower()
|
|
667
|
+
):
|
|
668
|
+
payload["reasoning_effort"] = "low" if thinking_budget == 0 else "high"
|
|
669
|
+
|
|
623
670
|
# Ollama: per-model-family reasoning control via the top-level
|
|
624
671
|
# `think` field. gpt-oss expects an enum ("low"/"medium"/"high");
|
|
625
672
|
# qwen3/deepseek-r1 expect a boolean. Models not in the
|
|
@@ -648,6 +695,24 @@ class UnifiedLLMClient:
|
|
|
648
695
|
and _hf_model_needs_enable_thinking_off(self.model)
|
|
649
696
|
):
|
|
650
697
|
payload["chat_template_kwargs"] = {"enable_thinking": False}
|
|
698
|
+
elif (
|
|
699
|
+
self.provider in ("huggingface", "huggingface-together")
|
|
700
|
+
and thinking_budget == 0
|
|
701
|
+
and _hf_model_reasons_by_default(self.model)
|
|
702
|
+
and self.model not in _WARNED_UNCONTROLLED_REASONING
|
|
703
|
+
):
|
|
704
|
+
# v1.6.8: these router-served models reason by default and honor
|
|
705
|
+
# no off-switch through the router (enable_thinking is
|
|
706
|
+
# 400-rejected for their templates). Warn once per process (a
|
|
707
|
+
# fresh client is built per row, so a per-instance flag would
|
|
708
|
+
# warn every row) so the uniform "reasoning off" request isn't
|
|
709
|
+
# silently unmet.
|
|
710
|
+
print(
|
|
711
|
+
f"\n[CatLLM] WARNING: no effective reasoning control delivered "
|
|
712
|
+
f"for '{self.model}'; the provider's default reasoning "
|
|
713
|
+
f"behavior applies. See docs/reasoning-controls.md.\n"
|
|
714
|
+
)
|
|
715
|
+
_WARNED_UNCONTROLLED_REASONING.add(self.model)
|
|
651
716
|
|
|
652
717
|
return payload
|
|
653
718
|
|
|
@@ -759,11 +824,19 @@ class UnifiedLLMClient:
|
|
|
759
824
|
if creativity is not None:
|
|
760
825
|
payload["generationConfig"]["temperature"] = creativity
|
|
761
826
|
|
|
762
|
-
#
|
|
763
|
-
#
|
|
764
|
-
#
|
|
765
|
-
|
|
766
|
-
|
|
827
|
+
# Reasoning control (Google-specific). Must be inside generationConfig.
|
|
828
|
+
# v1.6.8: an explicit zero budget is now SENT at thinking_budget = 0.
|
|
829
|
+
# Previously nothing was sent at 0 and Gemini ran at its provider
|
|
830
|
+
# default, which the 2026-06-12 audit measured as thinking ON
|
|
831
|
+
# (~200+ thought tokens on a trivial classification call). Models
|
|
832
|
+
# that reject 0 (minimum-budget tiers) are handled by the 400
|
|
833
|
+
# fallback in complete(), which caches the discovered floor on the
|
|
834
|
+
# client (`_google_thinking_floor`).
|
|
835
|
+
if thinking_budget is not None:
|
|
836
|
+
if thinking_budget > 0:
|
|
837
|
+
budget = max(thinking_budget, 128)
|
|
838
|
+
else:
|
|
839
|
+
budget = getattr(self, "_google_thinking_floor", 0)
|
|
767
840
|
payload["generationConfig"]["thinkingConfig"] = {"thinkingBudget": budget}
|
|
768
841
|
|
|
769
842
|
return payload
|
|
@@ -946,6 +1019,10 @@ class UnifiedLLMClient:
|
|
|
946
1019
|
# transient error this call? Only strip once per call so we don't
|
|
947
1020
|
# mutate payload on every retry tick.
|
|
948
1021
|
stripped_response_format = False
|
|
1022
|
+
# v1.6.8: consecutive-timeout counter + one-shot Google schema drop
|
|
1023
|
+
# (see the Timeout handler below).
|
|
1024
|
+
timeout_count = 0
|
|
1025
|
+
dropped_google_schema = False
|
|
949
1026
|
|
|
950
1027
|
for attempt in range(max_retries):
|
|
951
1028
|
endpoint = self._get_endpoint()
|
|
@@ -1019,9 +1096,32 @@ class UnifiedLLMClient:
|
|
|
1019
1096
|
payload["reasoning_effort"] = "low"
|
|
1020
1097
|
continue
|
|
1021
1098
|
elif current == "low" and "reasoning_effort" in payload:
|
|
1099
|
+
# Model takes no reasoning_effort at all (e.g.
|
|
1100
|
+
# xAI's explicitly non-reasoning variants).
|
|
1101
|
+
# Cache so later rows on this client skip the
|
|
1102
|
+
# doomed field up front (v1.6.8).
|
|
1103
|
+
self._xai_no_reasoning_effort = True
|
|
1022
1104
|
payload.pop("reasoning_effort")
|
|
1023
1105
|
continue
|
|
1024
1106
|
|
|
1107
|
+
# Google (v1.6.8): minimum-budget thinking tiers reject
|
|
1108
|
+
# thinkingBudget: 0. Fall back to 128 (Google's stated
|
|
1109
|
+
# minimum) and cache on the client.
|
|
1110
|
+
if (
|
|
1111
|
+
self.provider == "google"
|
|
1112
|
+
and "thinking" in error_text
|
|
1113
|
+
and ("budget" in error_text or "invalid" in error_text
|
|
1114
|
+
or "unsupported" in error_text)
|
|
1115
|
+
and payload.get("generationConfig", {})
|
|
1116
|
+
.get("thinkingConfig", {})
|
|
1117
|
+
.get("thinkingBudget") == 0
|
|
1118
|
+
):
|
|
1119
|
+
self._google_thinking_floor = 128
|
|
1120
|
+
payload["generationConfig"]["thinkingConfig"]["thinkingBudget"] = 128
|
|
1121
|
+
print(f"\n[CatLLM] Model '{self.model}' rejected thinkingBudget=0; "
|
|
1122
|
+
f"falling back to the minimum (128) and caching for this client.\n")
|
|
1123
|
+
continue
|
|
1124
|
+
|
|
1025
1125
|
# Anthropic deprecated `temperature` for newer models
|
|
1026
1126
|
# (Opus 4.7+): they 400 with "`temperature` is deprecated
|
|
1027
1127
|
# for this model." Strip it, cache on the client so the
|
|
@@ -1113,6 +1213,27 @@ class UnifiedLLMClient:
|
|
|
1113
1213
|
return result, None
|
|
1114
1214
|
|
|
1115
1215
|
except requests.exceptions.Timeout:
|
|
1216
|
+
timeout_count += 1
|
|
1217
|
+
# v1.6.8: Gemini can reproducibly hang on specific inputs
|
|
1218
|
+
# when a strict responseSchema is attached (constrained-
|
|
1219
|
+
# decoding pathology; 2026-06-12 audit — a trivial input
|
|
1220
|
+
# timed out 6/6 attempts WITH the schema and answered
|
|
1221
|
+
# instantly without it). After two consecutive timeouts with
|
|
1222
|
+
# a schema attached, drop the schema once and re-ask: the
|
|
1223
|
+
# prompt still requests JSON and extract_json() parses it
|
|
1224
|
+
# from the free-text response.
|
|
1225
|
+
if (
|
|
1226
|
+
self.provider == "google"
|
|
1227
|
+
and timeout_count >= 2
|
|
1228
|
+
and not dropped_google_schema
|
|
1229
|
+
and "responseSchema" in payload.get("generationConfig", {})
|
|
1230
|
+
):
|
|
1231
|
+
dropped_google_schema = True
|
|
1232
|
+
payload["generationConfig"].pop("responseSchema", None)
|
|
1233
|
+
print(f"[CatLLM] Repeated timeouts from '{self.model}' with "
|
|
1234
|
+
f"responseSchema attached; retrying schema-less "
|
|
1235
|
+
f"(prompt-based JSON parsing).")
|
|
1236
|
+
continue
|
|
1116
1237
|
wait_time = _backoff_with_jitter(initial_delay, attempt)
|
|
1117
1238
|
elapsed = time.monotonic() - start
|
|
1118
1239
|
if attempt < max_retries - 1 and elapsed + wait_time <= max_total_wait:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|