cat-stack 1.6.6__tar.gz → 1.6.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cat_stack-1.6.6 → cat_stack-1.6.8}/PKG-INFO +1 -1
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/__about__.py +1 -1
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/_providers.py +181 -8
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/text_functions_ensemble.py +11 -4
- {cat_stack-1.6.6 → cat_stack-1.6.8}/.gitignore +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/LICENSE +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/README.md +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/pyproject.toml +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/cat_stack/__init__.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/__init__.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/_batch.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/_category_analysis.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/_chunked.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/_embeddings.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/_formatter.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/_pilot_test.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/_prompts.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/_review_ui.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/_tiebreaker.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/_utils.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/_web_fetch.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/_wrapper_helpers.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/calls/CoVe.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/calls/__init__.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/calls/image_CoVe.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/calls/image_stepback.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/calls/pdf_CoVe.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/calls/pdf_stepback.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/calls/stepback.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/calls/top_n.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/classify.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/explore.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/extract.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/image_functions.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/images/circle.png +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/images/cube.png +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/images/diamond.png +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/images/overlapping_pentagons.png +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/images/rectangles.png +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/model_reference_list.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/pdf_functions.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/prompt_tune.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/summarize.py +0 -0
- {cat_stack-1.6.6 → cat_stack-1.6.8}/src/catstack/text_functions.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cat-stack
|
|
3
|
-
Version: 1.6.6
|
|
3
|
+
Version: 1.6.8
|
|
4
4
|
Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
|
|
5
5
|
Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
4
|
-
__version__ = "1.6.6"
|
|
4
|
+
__version__ = "1.6.8"
|
|
5
5
|
__author__ = "Chris Soria"
|
|
6
6
|
__email__ = "chrissoria@berkeley.edu"
|
|
7
7
|
__title__ = "cat-stack"
|
|
@@ -126,6 +126,51 @@ def _hf_model_needs_enable_thinking_off(model: str) -> bool:
|
|
|
126
126
|
return any(model.startswith(p) for p in _HF_NEEDS_ENABLE_THINKING_OFF)
|
|
127
127
|
|
|
128
128
|
|
|
129
|
+
# Router-served models measured (2026-06-12 reasoning audit) to reason by
|
|
130
|
+
# default with NO honored off-switch through the OpenAI-compatible router:
|
|
131
|
+
# the router 400-rejects `chat_template_kwargs.enable_thinking` for their
|
|
132
|
+
# templates, and they expose no reasoning_effort. classify() warns once per
|
|
133
|
+
# client so users know the provider default applies.
|
|
134
|
+
_HF_DEFAULT_REASONING_PREFIXES = (
|
|
135
|
+
"openai/gpt-oss",
|
|
136
|
+
"moonshotai/kimi-k2",
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _hf_model_reasons_by_default(model: str) -> bool:
|
|
141
|
+
m = (model or "").lower()
|
|
142
|
+
return any(m.startswith(p) for p in _HF_DEFAULT_REASONING_PREFIXES)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# Module-level: models already warned about uncontrolled reasoning, so the
|
|
146
|
+
# warning fires once per process even though a fresh client is built per row.
|
|
147
|
+
_WARNED_UNCONTROLLED_REASONING: set = set()
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
# ---------------------------------------------------------------------------
|
|
151
|
+
# Anthropic deprecated the `temperature` parameter starting with the Opus 4.7 /
|
|
152
|
+
# 4.8 generation: these models return 400 "`temperature` is deprecated for this
|
|
153
|
+
# model." if it is sent. Older models (opus-4-6, sonnet-4-6, sonnet-4-5, and
|
|
154
|
+
# earlier) still accept it. This mirrors the OpenAI reasoning-model handling
|
|
155
|
+
# above — we skip `temperature` up-front for the known-deprecated prefixes in
|
|
156
|
+
# `_build_anthropic_payload`, and `UnifiedLLMClient.complete()` strips it on a
|
|
157
|
+
# runtime 400 as a safety net for future families not yet in this table.
|
|
158
|
+
#
|
|
159
|
+
# Matched by name prefix; extend the tuple when new temperature-free models
|
|
160
|
+
# ship.
|
|
161
|
+
# ---------------------------------------------------------------------------
|
|
162
|
+
_ANTHROPIC_TEMPERATURE_DEPRECATED = (
|
|
163
|
+
"claude-opus-4-7",
|
|
164
|
+
"claude-opus-4-8",
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _anthropic_supports_temperature(model: str) -> bool:
|
|
169
|
+
"""False for Anthropic models that reject the `temperature` param."""
|
|
170
|
+
m = (model or "").lower()
|
|
171
|
+
return not any(m.startswith(p) for p in _ANTHROPIC_TEMPERATURE_DEPRECATED)
|
|
172
|
+
|
|
173
|
+
|
|
129
174
|
# ---------------------------------------------------------------------------
|
|
130
175
|
# Ollama reasoning control: per-model-family parameter format for the
|
|
131
176
|
# top-level `think` field on chat / generate requests.
|
|
@@ -521,8 +566,15 @@ class UnifiedLLMClient:
|
|
|
521
566
|
# accept booleans). Without this, gpt-oss family models emit long
|
|
522
567
|
# <think> blocks by default that bloat per-row generation 3-5x.
|
|
523
568
|
return self._build_openai_payload(messages, json_schema, creativity, force_json, thinking_budget)
|
|
569
|
+
elif self.provider == "xai":
|
|
570
|
+
# v1.6.8: forward the reasoning request. grok-4.3+ hybrids reason
|
|
571
|
+
# by default (2026-06-12 audit: 214 reasoning tokens on a trivial
|
|
572
|
+
# probe with no control sent); non-reasoning variants reject
|
|
573
|
+
# reasoning_effort and are handled by the 400 fallback in
|
|
574
|
+
# complete(), which caches the rejection on the client.
|
|
575
|
+
return self._build_openai_payload(messages, json_schema, creativity, force_json, thinking_budget)
|
|
524
576
|
else:
|
|
525
|
-
# Other OpenAI-compatible providers (
|
|
577
|
+
# Other OpenAI-compatible providers (mistral, etc.)
|
|
526
578
|
return self._build_openai_payload(messages, json_schema, creativity, force_json)
|
|
527
579
|
|
|
528
580
|
def _build_openai_payload(
|
|
@@ -596,6 +648,25 @@ class UnifiedLLMClient:
|
|
|
596
648
|
elif creativity is not None:
|
|
597
649
|
payload["temperature"] = creativity
|
|
598
650
|
|
|
651
|
+
# xAI (v1.6.8): hybrid grok models accept reasoning_effort alongside
|
|
652
|
+
# temperature. "low" is the lowest tier xAI exposes (no "none" /
|
|
653
|
+
# "minimal"); explicitly non-reasoning variants 400 on the field —
|
|
654
|
+
# complete() pops it and caches `_xai_no_reasoning_effort` so later
|
|
655
|
+
# rows on this client skip the doomed field up front.
|
|
656
|
+
if (
|
|
657
|
+
self.provider == "xai"
|
|
658
|
+
and thinking_budget is not None
|
|
659
|
+
and not getattr(self, "_xai_no_reasoning_effort", False)
|
|
660
|
+
# Variants whose name already encodes "non-reasoning" are off by
|
|
661
|
+
# model choice; sending reasoning_effort to them is not just
|
|
662
|
+
# redundant but HARMFUL — verified 2026-06-13 that
|
|
663
|
+
# grok-4-1-fast-non-reasoning returns 0 reasoning tokens with no
|
|
664
|
+
# field but 207 when sent reasoning_effort="low", i.e. the field
|
|
665
|
+
# turns reasoning back ON. Leave these alone.
|
|
666
|
+
and "non-reasoning" not in (self.model or "").lower()
|
|
667
|
+
):
|
|
668
|
+
payload["reasoning_effort"] = "low" if thinking_budget == 0 else "high"
|
|
669
|
+
|
|
599
670
|
# Ollama: per-model-family reasoning control via the top-level
|
|
600
671
|
# `think` field. gpt-oss expects an enum ("low"/"medium"/"high");
|
|
601
672
|
# qwen3/deepseek-r1 expect a boolean. Models not in the
|
|
@@ -624,6 +695,24 @@ class UnifiedLLMClient:
|
|
|
624
695
|
and _hf_model_needs_enable_thinking_off(self.model)
|
|
625
696
|
):
|
|
626
697
|
payload["chat_template_kwargs"] = {"enable_thinking": False}
|
|
698
|
+
elif (
|
|
699
|
+
self.provider in ("huggingface", "huggingface-together")
|
|
700
|
+
and thinking_budget == 0
|
|
701
|
+
and _hf_model_reasons_by_default(self.model)
|
|
702
|
+
and self.model not in _WARNED_UNCONTROLLED_REASONING
|
|
703
|
+
):
|
|
704
|
+
# v1.6.8: these router-served models reason by default and honor
|
|
705
|
+
# no off-switch through the router (enable_thinking is
|
|
706
|
+
# 400-rejected for their templates). Warn once per process (a
|
|
707
|
+
# fresh client is built per row, so a per-instance flag would
|
|
708
|
+
# warn every row) so the uniform "reasoning off" request isn't
|
|
709
|
+
# silently unmet.
|
|
710
|
+
print(
|
|
711
|
+
f"\n[CatLLM] WARNING: no effective reasoning control delivered "
|
|
712
|
+
f"for '{self.model}'; the provider's default reasoning "
|
|
713
|
+
f"behavior applies. See docs/reasoning-controls.md.\n"
|
|
714
|
+
)
|
|
715
|
+
_WARNED_UNCONTROLLED_REASONING.add(self.model)
|
|
627
716
|
|
|
628
717
|
return payload
|
|
629
718
|
|
|
@@ -660,6 +749,14 @@ class UnifiedLLMClient:
|
|
|
660
749
|
if system_content:
|
|
661
750
|
payload["system"] = system_content
|
|
662
751
|
|
|
752
|
+
# Newer Anthropic models (Opus 4.7+) deprecated `temperature` and 400 if
|
|
753
|
+
# it is sent. Skip it for those known prefixes, and also honor the flag
|
|
754
|
+
# cached by complete()'s runtime 400 fallback for future families.
|
|
755
|
+
_temp_ok = (
|
|
756
|
+
_anthropic_supports_temperature(self.model)
|
|
757
|
+
and not getattr(self, "_anthropic_temperature_unsupported", False)
|
|
758
|
+
)
|
|
759
|
+
|
|
663
760
|
# Extended thinking for Anthropic (minimum 1024 tokens)
|
|
664
761
|
# When thinking is enabled, temperature must be 1 (Anthropic requirement),
|
|
665
762
|
# so we skip setting temperature from creativity in that case
|
|
@@ -669,11 +766,12 @@ class UnifiedLLMClient:
|
|
|
669
766
|
"type": "enabled",
|
|
670
767
|
"budget_tokens": budget,
|
|
671
768
|
}
|
|
672
|
-
|
|
769
|
+
if _temp_ok:
|
|
770
|
+
payload["temperature"] = 1
|
|
673
771
|
# When thinking is enabled, max_tokens must be larger than budget_tokens
|
|
674
772
|
if payload["max_tokens"] <= budget:
|
|
675
773
|
payload["max_tokens"] = budget + 4096
|
|
676
|
-
elif creativity is not None:
|
|
774
|
+
elif creativity is not None and _temp_ok:
|
|
677
775
|
payload["temperature"] = creativity
|
|
678
776
|
|
|
679
777
|
# Use tool calling for structured output (most reliable for Anthropic)
|
|
@@ -726,11 +824,19 @@ class UnifiedLLMClient:
|
|
|
726
824
|
if creativity is not None:
|
|
727
825
|
payload["generationConfig"]["temperature"] = creativity
|
|
728
826
|
|
|
729
|
-
#
|
|
730
|
-
#
|
|
731
|
-
#
|
|
732
|
-
|
|
733
|
-
|
|
827
|
+
# Reasoning control (Google-specific). Must be inside generationConfig.
|
|
828
|
+
# v1.6.8: an explicit zero budget is now SENT at thinking_budget = 0.
|
|
829
|
+
# Previously nothing was sent at 0 and Gemini ran at its provider
|
|
830
|
+
# default, which the 2026-06-12 audit measured as thinking ON
|
|
831
|
+
# (~200+ thought tokens on a trivial classification call). Models
|
|
832
|
+
# that reject 0 (minimum-budget tiers) are handled by the 400
|
|
833
|
+
# fallback in complete(), which caches the discovered floor on the
|
|
834
|
+
# client (`_google_thinking_floor`).
|
|
835
|
+
if thinking_budget is not None:
|
|
836
|
+
if thinking_budget > 0:
|
|
837
|
+
budget = max(thinking_budget, 128)
|
|
838
|
+
else:
|
|
839
|
+
budget = getattr(self, "_google_thinking_floor", 0)
|
|
734
840
|
payload["generationConfig"]["thinkingConfig"] = {"thinkingBudget": budget}
|
|
735
841
|
|
|
736
842
|
return payload
|
|
@@ -913,6 +1019,10 @@ class UnifiedLLMClient:
|
|
|
913
1019
|
# transient error this call? Only strip once per call so we don't
|
|
914
1020
|
# mutate payload on every retry tick.
|
|
915
1021
|
stripped_response_format = False
|
|
1022
|
+
# v1.6.8: consecutive-timeout counter + one-shot Google schema drop
|
|
1023
|
+
# (see the Timeout handler below).
|
|
1024
|
+
timeout_count = 0
|
|
1025
|
+
dropped_google_schema = False
|
|
916
1026
|
|
|
917
1027
|
for attempt in range(max_retries):
|
|
918
1028
|
endpoint = self._get_endpoint()
|
|
@@ -986,9 +1096,51 @@ class UnifiedLLMClient:
|
|
|
986
1096
|
payload["reasoning_effort"] = "low"
|
|
987
1097
|
continue
|
|
988
1098
|
elif current == "low" and "reasoning_effort" in payload:
|
|
1099
|
+
# Model takes no reasoning_effort at all (e.g.
|
|
1100
|
+
# xAI's explicitly non-reasoning variants).
|
|
1101
|
+
# Cache so later rows on this client skip the
|
|
1102
|
+
# doomed field up front (v1.6.8).
|
|
1103
|
+
self._xai_no_reasoning_effort = True
|
|
989
1104
|
payload.pop("reasoning_effort")
|
|
990
1105
|
continue
|
|
991
1106
|
|
|
1107
|
+
# Google (v1.6.8): minimum-budget thinking tiers reject
|
|
1108
|
+
# thinkingBudget: 0. Fall back to 128 (Google's stated
|
|
1109
|
+
# minimum) and cache on the client.
|
|
1110
|
+
if (
|
|
1111
|
+
self.provider == "google"
|
|
1112
|
+
and "thinking" in error_text
|
|
1113
|
+
and ("budget" in error_text or "invalid" in error_text
|
|
1114
|
+
or "unsupported" in error_text)
|
|
1115
|
+
and payload.get("generationConfig", {})
|
|
1116
|
+
.get("thinkingConfig", {})
|
|
1117
|
+
.get("thinkingBudget") == 0
|
|
1118
|
+
):
|
|
1119
|
+
self._google_thinking_floor = 128
|
|
1120
|
+
payload["generationConfig"]["thinkingConfig"]["thinkingBudget"] = 128
|
|
1121
|
+
print(f"\n[CatLLM] Model '{self.model}' rejected thinkingBudget=0; "
|
|
1122
|
+
f"falling back to the minimum (128) and caching for this client.\n")
|
|
1123
|
+
continue
|
|
1124
|
+
|
|
1125
|
+
# Anthropic deprecated `temperature` for newer models
|
|
1126
|
+
# (Opus 4.7+): they 400 with "`temperature` is deprecated
|
|
1127
|
+
# for this model." Strip it, cache on the client so the
|
|
1128
|
+
# payload builder skips it for subsequent rows on this
|
|
1129
|
+
# client, and retry. Safety net for families not yet in
|
|
1130
|
+
# `_ANTHROPIC_TEMPERATURE_DEPRECATED`.
|
|
1131
|
+
if (
|
|
1132
|
+
"temperature" in error_text
|
|
1133
|
+
and "deprecated" in error_text
|
|
1134
|
+
and "temperature" in payload
|
|
1135
|
+
):
|
|
1136
|
+
if not getattr(self, '_warned_temperature_deprecated', False):
|
|
1137
|
+
print(f"\n[CatLLM] Model '{self.model}' deprecated the temperature parameter.")
|
|
1138
|
+
print(f" Dropping it and caching for subsequent calls on this client.\n")
|
|
1139
|
+
self._warned_temperature_deprecated = True
|
|
1140
|
+
self._anthropic_temperature_unsupported = True
|
|
1141
|
+
payload.pop("temperature")
|
|
1142
|
+
continue
|
|
1143
|
+
|
|
992
1144
|
# HuggingFace: try other routers when the current one
|
|
993
1145
|
# rejects the model with a "wrong router" 400.
|
|
994
1146
|
if self._is_hf_wrong_router_400(response.text):
|
|
@@ -1061,6 +1213,27 @@ class UnifiedLLMClient:
|
|
|
1061
1213
|
return result, None
|
|
1062
1214
|
|
|
1063
1215
|
except requests.exceptions.Timeout:
|
|
1216
|
+
timeout_count += 1
|
|
1217
|
+
# v1.6.8: Gemini can reproducibly hang on specific inputs
|
|
1218
|
+
# when a strict responseSchema is attached (constrained-
|
|
1219
|
+
# decoding pathology; 2026-06-12 audit — a trivial input
|
|
1220
|
+
# timed out 6/6 attempts WITH the schema and answered
|
|
1221
|
+
# instantly without it). After two consecutive timeouts with
|
|
1222
|
+
# a schema attached, drop the schema once and re-ask: the
|
|
1223
|
+
# prompt still requests JSON and extract_json() parses it
|
|
1224
|
+
# from the free-text response.
|
|
1225
|
+
if (
|
|
1226
|
+
self.provider == "google"
|
|
1227
|
+
and timeout_count >= 2
|
|
1228
|
+
and not dropped_google_schema
|
|
1229
|
+
and "responseSchema" in payload.get("generationConfig", {})
|
|
1230
|
+
):
|
|
1231
|
+
dropped_google_schema = True
|
|
1232
|
+
payload["generationConfig"].pop("responseSchema", None)
|
|
1233
|
+
print(f"[CatLLM] Repeated timeouts from '{self.model}' with "
|
|
1234
|
+
f"responseSchema attached; retrying schema-less "
|
|
1235
|
+
f"(prompt-based JSON parsing).")
|
|
1236
|
+
continue
|
|
1064
1237
|
wait_time = _backoff_with_jitter(initial_delay, attempt)
|
|
1065
1238
|
elapsed = time.monotonic() - start
|
|
1066
1239
|
if attempt < max_retries - 1 and elapsed + wait_time <= max_total_wait:
|
|
@@ -3638,11 +3638,12 @@ def build_output_dataframes(
|
|
|
3638
3638
|
# Populate data
|
|
3639
3639
|
for idx, result in enumerate(all_results):
|
|
3640
3640
|
combined_data["input_index"].append(idx)
|
|
3641
|
-
#
|
|
3641
|
+
# Full input_data (whitespace-collapsed). Truncating here breaks
|
|
3642
|
+
# downstream joins against gold-standard files and silently feeds
|
|
3643
|
+
# truncated text to any pipeline that reuses input_data as input.
|
|
3642
3644
|
raw = result["response"]
|
|
3643
3645
|
clean = " ".join(str(raw).split()) # collapse whitespace/newlines
|
|
3644
|
-
|
|
3645
|
-
combined_data["input_data"].append(preview)
|
|
3646
|
+
combined_data["input_data"].append(clean)
|
|
3646
3647
|
aggregated = result["aggregated"]
|
|
3647
3648
|
|
|
3648
3649
|
# Add PDF metadata if present
|
|
@@ -4464,7 +4465,13 @@ def summarize_ensemble(
|
|
|
4464
4465
|
# synthesis still has *something* to anchor on (prior behavior).
|
|
4465
4466
|
original_text_for_synthesis = entry.get("page_text") or page_label
|
|
4466
4467
|
else:
|
|
4467
|
-
# Truncate input_data for readability; add input_index for joining
|
|
4468
|
+
# Truncate input_data for readability; add input_index for joining.
|
|
4469
|
+
# Truncation is intentional HERE (summarize): inputs can be whole
|
|
4470
|
+
# documents/PDF pages, and full text would bloat the output and the
|
|
4471
|
+
# synthesis context. classify()'s writer (build_output_dataframes)
|
|
4472
|
+
# deliberately does NOT truncate — survey-length inputs there are
|
|
4473
|
+
# reused for downstream joins. See repo TODO for the permanent
|
|
4474
|
+
# per-function input_data design (preview + stable join key).
|
|
4468
4475
|
clean = " ".join(str(item).split()) # collapse whitespace/newlines
|
|
4469
4476
|
preview = clean[:100] + "..." if len(clean) > 100 else clean
|
|
4470
4477
|
row = {
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|