cat-stack 1.6.4__tar.gz → 1.6.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. {cat_stack-1.6.4 → cat_stack-1.6.6}/PKG-INFO +10 -1
  2. {cat_stack-1.6.4 → cat_stack-1.6.6}/README.md +9 -0
  3. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/__about__.py +1 -1
  4. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/_providers.py +159 -7
  5. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/text_functions_ensemble.py +3 -3
  6. {cat_stack-1.6.4 → cat_stack-1.6.6}/.gitignore +0 -0
  7. {cat_stack-1.6.4 → cat_stack-1.6.6}/LICENSE +0 -0
  8. {cat_stack-1.6.4 → cat_stack-1.6.6}/pyproject.toml +0 -0
  9. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/cat_stack/__init__.py +0 -0
  10. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/__init__.py +0 -0
  11. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/_batch.py +0 -0
  12. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/_category_analysis.py +0 -0
  13. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/_chunked.py +0 -0
  14. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/_embeddings.py +0 -0
  15. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/_formatter.py +0 -0
  16. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/_pilot_test.py +0 -0
  17. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/_prompts.py +0 -0
  18. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/_review_ui.py +0 -0
  19. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/_tiebreaker.py +0 -0
  20. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/_utils.py +0 -0
  21. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/_web_fetch.py +0 -0
  22. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/_wrapper_helpers.py +0 -0
  23. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/calls/CoVe.py +0 -0
  24. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/calls/__init__.py +0 -0
  25. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/calls/image_CoVe.py +0 -0
  26. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/calls/image_stepback.py +0 -0
  27. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/calls/pdf_CoVe.py +0 -0
  28. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/calls/pdf_stepback.py +0 -0
  29. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/calls/stepback.py +0 -0
  30. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/calls/top_n.py +0 -0
  31. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/classify.py +0 -0
  32. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/explore.py +0 -0
  33. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/extract.py +0 -0
  34. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/image_functions.py +0 -0
  35. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/images/circle.png +0 -0
  36. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/images/cube.png +0 -0
  37. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/images/diamond.png +0 -0
  38. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/images/overlapping_pentagons.png +0 -0
  39. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/images/rectangles.png +0 -0
  40. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/model_reference_list.py +0 -0
  41. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/pdf_functions.py +0 -0
  42. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/prompt_tune.py +0 -0
  43. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/summarize.py +0 -0
  44. {cat_stack-1.6.4 → cat_stack-1.6.6}/src/catstack/text_functions.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cat-stack
3
- Version: 1.6.4
3
+ Version: 1.6.6
4
4
  Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
5
5
  Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
6
6
  Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
@@ -193,6 +193,15 @@ All providers use the same `(model_name, provider, api_key)` tuple format. Provi
193
193
  as `success`)
194
194
  - **Embedding similarity** tiebreaker for ensemble consensus ties
195
195
  - **Pilot test** — validate classifications on a small sample before committing to the full run
196
+ - **Provider-conditional HTTP timeouts** — cloud providers use a tight
197
+ 120 s per-request timeout (catches genuine hangs without waiting too
198
+ long on transient API blips), and the Ollama provider uses a wider
199
+ 600 s per-request / 1200 s cumulative budget (accommodates the long
200
+ per-row tails that emerge when running 14B+ models on memory-
201
+ constrained hardware like 16 GB Macs). Power users can override per
202
+ client: `UnifiedLLMClient(provider, key, model, request_timeout=900,
203
+ max_total_wait=1800)`, or set a process-wide override with
204
+ `catstack._providers.set_session_timeouts(request_timeout=..., max_total_wait=...)`.
196
205
 
197
206
  ## Future work / contributions welcome
198
207
 
@@ -157,6 +157,15 @@ All providers use the same `(model_name, provider, api_key)` tuple format. Provi
157
157
  as `success`)
158
158
  - **Embedding similarity** tiebreaker for ensemble consensus ties
159
159
  - **Pilot test** — validate classifications on a small sample before committing to the full run
160
+ - **Provider-conditional HTTP timeouts** — cloud providers use a tight
161
+ 120 s per-request timeout (catches genuine hangs without waiting too
162
+ long on transient API blips), and the Ollama provider uses a wider
163
+ 600 s per-request / 1200 s cumulative budget (accommodates the long
164
+ per-row tails that emerge when running 14B+ models on memory-
165
+ constrained hardware like 16 GB Macs). Power users can override per
166
+ client: `UnifiedLLMClient(provider, key, model, request_timeout=900,
167
+ max_total_wait=1800)`, or set a process-wide override with
168
+ `catstack._providers.set_session_timeouts(request_timeout=..., max_total_wait=...)`.
160
169
 
161
170
  ## Future work / contributions welcome
162
171
 
@@ -1,7 +1,7 @@
1
1
  # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
2
2
  #
3
3
  # SPDX-License-Identifier: GPL-3.0-or-later
4
- __version__ = "1.6.4"
4
+ __version__ = "1.6.6"
5
5
  __author__ = "Chris Soria"
6
6
  __email__ = "chrissoria@berkeley.edu"
7
7
  __title__ = "cat-stack"
@@ -19,6 +19,52 @@ import requests
19
19
  # short enough that batch ensembles don't stall for half an hour."
20
20
  _MAX_TOTAL_WAIT_SECONDS = 300.0
21
21
 
22
+ # Per-HTTP-request timeout, in seconds. For cloud providers (OpenAI,
23
+ # Anthropic, Google, …) inference is usually 1-10 seconds, so 120 s is
24
+ # a generous ceiling that catches genuine hangs.
25
+ #
26
+ # Local Ollama is a different regime: on memory-constrained hardware
27
+ # (e.g., 16 GB M1 Pro running a 14 B-class model), individual rows can
28
+ # take 2-4+ minutes under thermal/memory pressure. cat-stack 1.6.4
29
+ # logged frequent spurious "Request timeout" failures in those
30
+ # conditions even when Ollama was about to produce valid output.
31
+ # `_OLLAMA_REQUEST_TIMEOUT` and `_OLLAMA_MAX_TOTAL_WAIT_SECONDS` give
32
+ # the Ollama path a much longer window. Surfaced during the small-tier
33
+ # paper run, 2026-06-04.
34
+ _REQUEST_TIMEOUT = 120.0 # cloud providers
35
+ _OLLAMA_REQUEST_TIMEOUT = 600.0 # local Ollama — 5x cloud, accommodates slow-row tails
36
+ _OLLAMA_MAX_TOTAL_WAIT_SECONDS = 1200.0 # 4x cloud, widened because the per-request timeout is 5x
37
+
38
+
39
+ # Session-level user override. Set non-None at the start of a `classify()`
40
+ # call to override the conditional defaults for ALL UnifiedLLMClient
41
+ # instances constructed during that call without per-site arg threading.
42
+ # Single-process scope; safe under cat-stack's intra-call parallelism
43
+ # (per-call sets/resets bracket all workers).
44
+ _session_request_timeout: float = None
45
+ _session_max_total_wait: float = None
46
+
47
+
48
+ def set_session_timeouts(request_timeout: float = None, max_total_wait: float = None):
49
+ """Set the session-level HTTP-timeout overrides. Pass None to clear."""
50
+ global _session_request_timeout, _session_max_total_wait
51
+ _session_request_timeout = request_timeout
52
+ _session_max_total_wait = max_total_wait
53
+
54
+
55
+ def _request_timeout_for(provider: str) -> float:
56
+ """Per-request HTTP timeout. Session override wins over provider default."""
57
+ if _session_request_timeout is not None:
58
+ return _session_request_timeout
59
+ return _OLLAMA_REQUEST_TIMEOUT if provider == "ollama" else _REQUEST_TIMEOUT
60
+
61
+
62
+ def _max_total_wait_for(provider: str) -> float:
63
+ """Per-call cumulative-wait cap. Session override wins."""
64
+ if _session_max_total_wait is not None:
65
+ return _session_max_total_wait
66
+ return _OLLAMA_MAX_TOTAL_WAIT_SECONDS if provider == "ollama" else _MAX_TOTAL_WAIT_SECONDS
67
+
22
68
 
23
69
  # ---------------------------------------------------------------------------
24
70
  # OpenAI reasoning_effort: per-model-family off-equivalent value.
@@ -79,6 +125,64 @@ _HF_NEEDS_ENABLE_THINKING_OFF = (
79
125
  def _hf_model_needs_enable_thinking_off(model: str) -> bool:
80
126
  return any(model.startswith(p) for p in _HF_NEEDS_ENABLE_THINKING_OFF)
81
127
 
128
+
129
+ # ---------------------------------------------------------------------------
130
+ # Ollama reasoning control: per-model-family parameter format for the
131
+ # top-level `think` field on chat / generate requests.
132
+ #
133
+ # Ollama standardized on a single API field name (`think`) but the value
134
+ # type differs per model family — gpt-oss takes an enum, most others take
135
+ # a boolean. See https://docs.ollama.com/capabilities/thinking.
136
+ #
137
+ # Coverage philosophy: list every Ollama reasoning model family we know of
138
+ # AND that uses the `think` field. Reasoning models that gate via other
139
+ # mechanisms (system prompts, chat-template flags) are explicitly noted in
140
+ # the "NOT in registry" comment below and handled elsewhere — adding them
141
+ # here would silently inject a no-op `think` field, which Ollama may
142
+ # accept but won't honor, leading to surprising behavior.
143
+ #
144
+ # Entries are checked in registry order (first match wins) by `_ollama_think_value()`, so
145
+ # put more-specific prefixes earlier when adding (e.g. `qwen3-coder` before
146
+ # `qwen3` if they differ).
147
+ #
148
+ # Registry tuple: (model prefix, value-format, low_value, high_value)
149
+ #
150
+ # Models in registry — `think` field works:
151
+ # gpt-oss — enum: "low" / "medium" / "high" (cannot fully disable)
152
+ # qwen3 / qwen3.* — bool: True / False (covers -thinking variants too)
153
+ # qwq — bool: True / False (Qwen QwQ — preceded Qwen3)
154
+ # deepseek-r1 — bool: True / False (covers -distill variants)
155
+ #
156
+ # Models NOT in registry — different mechanism, do NOT add here:
157
+ # magistral — controlled via system prompt (Mistral Magistral)
158
+ # exaone-deep — uses Modelfile-baked reasoning, no API toggle exposed
159
+ # marco-o1 — uses chat-template wrappers, not `think` field
160
+ #
161
+ # Models with NO reasoning (so `think` should not appear at all):
162
+ # gemma2/3, llama3.x/4.x, mistral, mistral-nemo, qwen2.5 (non-QwQ),
163
+ # phi3/4, granite, olmo, codestral, …
164
+ # These are NOT added; the registry's None-return for unmatched prefixes
165
+ # correctly omits the `think` field for them.
166
+ # ---------------------------------------------------------------------------
167
+ _OLLAMA_REASONING_MODELS = (
168
+ ("gpt-oss", "enum", "low", "high"),
169
+ ("qwen3", "bool", False, True), # covers qwen3.*, qwen3-*, -thinking-* variants
170
+ ("qwq", "bool", False, True),
171
+ ("deepseek-r1", "bool", False, True), # covers -distill-qwen, -distill-llama, etc.
172
+ )
173
+
174
+
175
+ def _ollama_think_value(model: str, thinking_budget):
176
+ """Map cat-stack's thinking_budget to the right Ollama `think` value for
177
+ this model family. Returns None if the model isn't in the
178
+ reasoning-capable registry or `thinking_budget` is None (no `think` field should be set)."""
179
+ if thinking_budget is None:
180
+ return None
181
+ for prefix, fmt, low_val, high_val in _OLLAMA_REASONING_MODELS:
182
+ if model.startswith(prefix):
183
+ return low_val if thinking_budget == 0 else high_val
184
+ return None
185
+
82
186
  __all__ = [
83
187
  # Main client
84
188
  "UnifiedLLMClient",
@@ -274,10 +378,27 @@ PROVIDER_CONFIG = {
274
378
  class UnifiedLLMClient:
275
379
  """A unified client for calling various LLM providers via HTTP."""
276
380
 
277
- def __init__(self, provider: str, api_key: str, model: str):
381
+ def __init__(self, provider: str, api_key: str, model: str,
382
+ request_timeout: float = None,
383
+ max_total_wait: float = None):
384
+ """
385
+ Args:
386
+ request_timeout (float | None): Override the per-HTTP-request
387
+ timeout (seconds). When None, uses the provider-conditional
388
+ default: 120 s for cloud providers, 600 s for Ollama.
389
+ Pass an explicit float to override per call site.
390
+ max_total_wait (float | None): Override the per-call cumulative
391
+ retry budget (seconds). When None, uses provider-conditional
392
+ default: 300 s for cloud, 1200 s for Ollama.
393
+ """
278
394
  self.provider = _normalize_provider(provider)
279
395
  self.api_key = api_key
280
396
  self.model = model
397
+ # User-level overrides for HTTP timeouts. None means "use the
398
+ # provider-conditional default" (see _request_timeout_for /
399
+ # _max_total_wait_for at module level).
400
+ self._request_timeout_override = request_timeout
401
+ self._max_total_wait_override = max_total_wait
281
402
 
282
403
  # Lazy HuggingFace router fallback — start with None and only
283
404
  # populate when we either (a) have an explicit router suffix, or
@@ -394,6 +515,12 @@ class UnifiedLLMClient:
394
515
  elif self.provider in ("huggingface", "huggingface-together"):
395
516
  # HuggingFace needs thinking_budget to disable thinking on models that reason by default
396
517
  return self._build_openai_payload(messages, json_schema, creativity, force_json, thinking_budget)
518
+ elif self.provider == "ollama":
519
+ # Ollama threads thinking_budget to its top-level `think` field for
520
+ # reasoning-capable models (gpt-oss accepts low/medium/high; others
521
+ # accept booleans). Without this, gpt-oss family models emit long
522
+ # <think> blocks by default that bloat per-row generation 3-5x.
523
+ return self._build_openai_payload(messages, json_schema, creativity, force_json, thinking_budget)
397
524
  else:
398
525
  # Other OpenAI-compatible providers (xai, mistral, etc.)
399
526
  return self._build_openai_payload(messages, json_schema, creativity, force_json)
@@ -469,6 +596,19 @@ class UnifiedLLMClient:
469
596
  elif creativity is not None:
470
597
  payload["temperature"] = creativity
471
598
 
599
+ # Ollama: per-model-family reasoning control via the top-level
600
+ # `think` field. gpt-oss expects an enum ("low"/"medium"/"high");
601
+ # qwen3/deepseek-r1 expect a boolean. Models not in the
602
+ # `_OLLAMA_REASONING_MODELS` registry don't honor the `think` field and
603
+ # get no `think` field (would be a no-op at best, validator-
604
+ # confusing at worst). Without this, Ollama-served gpt-oss
605
+ # produces long `<think>` blocks by default that bloat per-row
606
+ # generation 3-5x.
607
+ if self.provider == "ollama":
608
+ think_value = _ollama_think_value(self.model, thinking_budget)
609
+ if think_value is not None:
610
+ payload["think"] = think_value
611
+
472
612
  # HuggingFace: disable thinking on model families whose chat
473
613
  # template honors `enable_thinking` (Qwen3-family). Other HF-routed
474
614
  # models don't need the kwarg, and strict-validator backends
@@ -755,8 +895,20 @@ class UnifiedLLMClient:
755
895
  payload.pop("response_format")
756
896
 
757
897
  # Track cumulative wait so a long string of transient errors can't
758
- # block the call indefinitely. See _MAX_TOTAL_WAIT_SECONDS.
898
+ # block the call indefinitely. Timeouts are provider-conditional by
899
+ # default; user overrides on the client instance (set at __init__)
900
+ # take precedence.
759
901
  start = time.monotonic()
902
+ request_timeout = (
903
+ self._request_timeout_override
904
+ if self._request_timeout_override is not None
905
+ else _request_timeout_for(self.provider)
906
+ )
907
+ max_total_wait = (
908
+ self._max_total_wait_override
909
+ if self._max_total_wait_override is not None
910
+ else _max_total_wait_for(self.provider)
911
+ )
760
912
  # Per-call flag: have we already tried stripping response_format on a
761
913
  # transient error this call? Only strip once per call so we don't
762
914
  # mutate payload on every retry tick.
@@ -769,7 +921,7 @@ class UnifiedLLMClient:
769
921
  endpoint,
770
922
  headers=headers,
771
923
  json=payload,
772
- timeout=120,
924
+ timeout=request_timeout,
773
925
  )
774
926
 
775
927
  # Check for HTTP errors
@@ -854,7 +1006,7 @@ class UnifiedLLMClient:
854
1006
  if wait_time is None:
855
1007
  wait_time = _backoff_with_jitter(initial_delay, attempt, multiplier=5.0)
856
1008
  elapsed = time.monotonic() - start
857
- if attempt < max_retries - 1 and elapsed + wait_time <= _MAX_TOTAL_WAIT_SECONDS:
1009
+ if attempt < max_retries - 1 and elapsed + wait_time <= max_total_wait:
858
1010
  # Name the throttling provider/model so multi-model
859
1011
  # ensemble runs can attribute the slowdown.
860
1012
  print(f"[{self.provider}/{self.model}] Rate limited. Waiting {wait_time:.1f}s...")
@@ -894,7 +1046,7 @@ class UnifiedLLMClient:
894
1046
  if wait_time is None:
895
1047
  wait_time = _backoff_with_jitter(initial_delay, attempt)
896
1048
  elapsed = time.monotonic() - start
897
- if attempt < max_retries - 1 and elapsed + wait_time <= _MAX_TOTAL_WAIT_SECONDS:
1049
+ if attempt < max_retries - 1 and elapsed + wait_time <= max_total_wait:
898
1050
  # Name the failing provider/model — same rationale as
899
1051
  # the 429 handler above.
900
1052
  print(f"[{self.provider}/{self.model}] Server error {response.status_code}. Retrying in {wait_time:.1f}s...")
@@ -911,7 +1063,7 @@ class UnifiedLLMClient:
911
1063
  except requests.exceptions.Timeout:
912
1064
  wait_time = _backoff_with_jitter(initial_delay, attempt)
913
1065
  elapsed = time.monotonic() - start
914
- if attempt < max_retries - 1 and elapsed + wait_time <= _MAX_TOTAL_WAIT_SECONDS:
1066
+ if attempt < max_retries - 1 and elapsed + wait_time <= max_total_wait:
915
1067
  print(f"Request timeout. Retrying in {wait_time:.1f}s...")
916
1068
  time.sleep(wait_time)
917
1069
  else:
@@ -920,7 +1072,7 @@ class UnifiedLLMClient:
920
1072
  except requests.exceptions.RequestException as e:
921
1073
  wait_time = _backoff_with_jitter(initial_delay, attempt)
922
1074
  elapsed = time.monotonic() - start
923
- if attempt < max_retries - 1 and elapsed + wait_time <= _MAX_TOTAL_WAIT_SECONDS:
1075
+ if attempt < max_retries - 1 and elapsed + wait_time <= max_total_wait:
924
1076
  print(f"Request error: {e}. Retrying in {wait_time:.1f}s...")
925
1077
  time.sleep(wait_time)
926
1078
  else:
@@ -3043,7 +3043,7 @@ Categorize text responses {cove_categorize}:
3043
3043
  messages=messages,
3044
3044
  json_schema=json_schemas[cfg["model"]],
3045
3045
  creativity=effective_creativity,
3046
- thinking_budget=thinking_budget if cfg["provider"] in ("google", "openai", "anthropic", "huggingface", "huggingface-together") else None,
3046
+ thinking_budget=thinking_budget if cfg["provider"] in ("google", "openai", "anthropic", "huggingface", "huggingface-together", "ollama") else None,
3047
3047
  max_retries=max_retries,
3048
3048
  )
3049
3049
 
@@ -3100,7 +3100,7 @@ Categorize text responses {cove_categorize}:
3100
3100
  messages=messages,
3101
3101
  json_schema=json_schemas[cfg["model"]],
3102
3102
  creativity=effective_creativity,
3103
- thinking_budget=thinking_budget if cfg["provider"] in ("google", "openai", "anthropic", "huggingface", "huggingface-together") else None,
3103
+ thinking_budget=thinking_budget if cfg["provider"] in ("google", "openai", "anthropic", "huggingface", "huggingface-together", "ollama") else None,
3104
3104
  max_retries=max_retries,
3105
3105
  )
3106
3106
 
@@ -3184,7 +3184,7 @@ Categorize text responses {cove_categorize}:
3184
3184
  messages=_retry_messages,
3185
3185
  json_schema=json_schemas[cfg["model"]],
3186
3186
  creativity=effective_creativity,
3187
- thinking_budget=thinking_budget if cfg["provider"] in ("google", "openai", "anthropic", "huggingface", "huggingface-together") else None,
3187
+ thinking_budget=thinking_budget if cfg["provider"] in ("google", "openai", "anthropic", "huggingface", "huggingface-together", "ollama") else None,
3188
3188
  max_retries=max_retries,
3189
3189
  )
3190
3190
 
File without changes
File without changes
File without changes