cat-stack 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cat_stack/__about__.py +10 -0
- cat_stack/__init__.py +128 -0
- cat_stack/_batch.py +1388 -0
- cat_stack/_category_analysis.py +348 -0
- cat_stack/_chunked.py +424 -0
- cat_stack/_embeddings.py +189 -0
- cat_stack/_formatter.py +169 -0
- cat_stack/_providers.py +1048 -0
- cat_stack/_tiebreaker.py +277 -0
- cat_stack/_utils.py +512 -0
- cat_stack/_web_fetch.py +194 -0
- cat_stack/calls/CoVe.py +287 -0
- cat_stack/calls/__init__.py +25 -0
- cat_stack/calls/all_calls.py +622 -0
- cat_stack/calls/image_CoVe.py +386 -0
- cat_stack/calls/image_stepback.py +210 -0
- cat_stack/calls/pdf_CoVe.py +386 -0
- cat_stack/calls/pdf_stepback.py +210 -0
- cat_stack/calls/stepback.py +180 -0
- cat_stack/calls/top_n.py +217 -0
- cat_stack/classify.py +682 -0
- cat_stack/explore.py +111 -0
- cat_stack/extract.py +218 -0
- cat_stack/image_functions.py +2078 -0
- cat_stack/images/circle.png +0 -0
- cat_stack/images/cube.png +0 -0
- cat_stack/images/diamond.png +0 -0
- cat_stack/images/overlapping_pentagons.png +0 -0
- cat_stack/images/rectangles.png +0 -0
- cat_stack/model_reference_list.py +94 -0
- cat_stack/pdf_functions.py +2087 -0
- cat_stack/summarize.py +290 -0
- cat_stack/text_functions.py +1358 -0
- cat_stack/text_functions_ensemble.py +3644 -0
- cat_stack-0.1.0.dist-info/METADATA +150 -0
- cat_stack-0.1.0.dist-info/RECORD +38 -0
- cat_stack-0.1.0.dist-info/WHEEL +4 -0
- cat_stack-0.1.0.dist-info/licenses/LICENSE +672 -0
cat_stack/summarize.py
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Summarization functions for CatLLM.
|
|
3
|
+
|
|
4
|
+
This module provides unified summarization for text and PDF inputs,
|
|
5
|
+
supporting both single-model and multi-model (ensemble) summarization.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import warnings
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
# Main entry point
|
|
12
|
+
"summarize",
|
|
13
|
+
# Ensemble function
|
|
14
|
+
"summarize_ensemble",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
# Import provider infrastructure
|
|
18
|
+
from ._providers import (
|
|
19
|
+
UnifiedLLMClient,
|
|
20
|
+
detect_provider,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
# Import the implementation functions from existing modules
|
|
24
|
+
from .text_functions_ensemble import (
|
|
25
|
+
summarize_ensemble,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def summarize(
    input_data,
    api_key: str = None,
    description: str = "",
    instructions: str = "",
    max_length: int = None,
    focus: str = None,
    user_model: str = "gpt-4o",
    model_source: str = "auto",
    mode: str = "image",
    pdf_dpi: int = 150,
    creativity: float = None,
    thinking_budget: int = 0,
    chain_of_thought: bool = True,
    context_prompt: bool = False,
    step_back_prompt: bool = False,
    filename: str = None,
    save_directory: str = None,
    progress_callback=None,
    models: list = None,
    max_workers: int = None,
    parallel: bool = None,
    auto_download: bool = False,
    # Robustness parameters
    safety: bool = False,
    max_retries: int = 5,
    batch_retries: int = 2,
    retry_delay: float = 1.0,
    row_delay: float = 0.0,
    fail_strategy: str = "partial",
    # Batch mode parameters
    batch_mode: bool = False,
    batch_poll_interval: float = 30.0,
    batch_timeout: float = 86400.0,
):
    """
    Summarize text or PDF data using LLMs.

    Supports single-model and multi-model (ensemble) summarization. In multi-model
    mode, summaries from all models are synthesized into a consensus summary.
    Input type is auto-detected from the data (text strings or PDF paths).

    With ``batch_mode=False`` (default) every argument is forwarded to
    ``summarize_ensemble``. With ``batch_mode=True`` the call is routed to the
    batch helpers in ``._batch`` instead; in that path only the prompt-related
    arguments, ``filename``, ``save_directory``, ``fail_strategy``,
    ``batch_poll_interval`` and ``batch_timeout`` are forwarded (plus
    ``max_retries`` when more than one model is configured).

    Args:
        input_data: Data to summarize. Can be:
            - Text: list of strings, pandas Series, or single string
            - PDF: directory path, single PDF path, or list of PDF paths
            In batch mode a single string is treated as one item.
        api_key (str): API key for the model provider (single-model mode)
        description (str): Description of what the content contains (provides context)
        instructions (str): Specific summarization instructions (e.g., "bullet points")
        max_length (int): Maximum summary length in words
        focus (str): What to focus on (e.g., "main arguments", "emotional content")
        user_model (str): Model to use (default "gpt-4o")
        model_source (str): Provider - "auto", "openai", "anthropic", "google", etc.
        mode (str): PDF processing mode (only used for PDF input). Any value
            other than the three below silently falls back to "image":
            - "image" (default): Render pages as images
            - "text": Extract text only
            - "both": Send both image and extracted text
        pdf_dpi (int): DPI for PDF page rendering (default 150)
        creativity (float): Temperature setting (None uses provider default)
        thinking_budget (int): Token budget for extended thinking/reasoning.
            Provider-specific: Google (thinkingConfig), OpenAI (reasoning_effort),
            Anthropic (extended thinking). Default 0 (disabled).
        chain_of_thought (bool): Enable step-by-step reasoning (default True)
        context_prompt (bool): Add expert context prefix
        step_back_prompt (bool): Enable step-back prompting
        filename (str): Output CSV filename
        save_directory (str): Directory to save results
        progress_callback: Optional callback for progress updates. Ignored
            (with a printed warning) when batch_mode=True.
        models (list): For multi-model mode, list of (model, provider, api_key) tuples
        max_workers (int): Max parallel workers for API calls. None = auto.
        parallel (bool): Controls concurrent vs sequential model execution.
            - None (default): auto-detect (sequential for all-Ollama, parallel otherwise)
            - True: force parallel execution
            - False: force sequential execution
        auto_download (bool): Auto-download missing Ollama models. Default False.
        safety (bool): If True, saves progress after each item. Requires filename.
        max_retries (int): Max retries per API call. Default 5.
        batch_retries (int): Max retries for batch-level failures. Default 2.
        retry_delay (float): Delay between retries in seconds. Default 1.0.
        row_delay (float): Delay in seconds between processing each row. Default 0.0.
        fail_strategy (str): How to handle failures - "partial" (default) or "strict".
        batch_mode (bool): If True, use async batch API (50% cost savings).
            Supported providers: openai, anthropic, google, mistral, xai.
            Not compatible with PDF input.
        batch_poll_interval (float): Seconds between batch status checks. Default 30.
        batch_timeout (float): Max seconds to wait for batch completion. Default 86400 (24h).

    Returns:
        pd.DataFrame: Results with summary column(s):
            - input_data: Original text or page label (for PDFs)
            - summary: Generated summary (or consensus for multi-model)
            - summary_<model>: Per-model summaries (multi-model only)
            - processing_status: "success", "error", "skipped"
            - failed_models: Comma-separated list (multi-model only)
            - pdf_path: Path to source PDF (PDF mode only)
            - page_index: Page number, 0-indexed (PDF mode only)

    Raises:
        ValueError: If batch_mode=True and the input is detected as PDF, or
            if batch_mode=True with a single model whose provider is listed
            in UNSUPPORTED_BATCH_PROVIDERS.

    Examples:
        >>> import cat_stack as cat
        >>>
        >>> # Single model text summarization
        >>> results = cat.summarize(
        ...     input_data=df['responses'],
        ...     description="Customer feedback",
        ...     api_key="your-api-key"
        ... )
        >>>
        >>> # PDF summarization (auto-detected)
        >>> results = cat.summarize(
        ...     input_data="/path/to/pdfs/",
        ...     description="Research papers",
        ...     mode="image",
        ...     api_key="your-api-key"
        ... )
        >>>
        >>> # With safety saves and row delay
        >>> results = cat.summarize(
        ...     input_data=df['responses'],
        ...     description="Customer feedback",
        ...     api_key="your-api-key",
        ...     safety=True,
        ...     filename="results.csv",
        ...     row_delay=1.0,
        ... )
        >>>
        >>> # Batch mode (50% cost savings)
        >>> results = cat.summarize(
        ...     input_data=df['responses'],
        ...     description="Customer feedback",
        ...     api_key="your-api-key",
        ...     batch_mode=True,
        ...     filename="batch_results.csv",
        ... )
        >>>
        >>> # Multi-model with synthesis
        >>> results = cat.summarize(
        ...     input_data=df['responses'],
        ...     models=[
        ...         ("gpt-4o", "openai", "sk-..."),
        ...         ("claude-sonnet-4-5-20250929", "anthropic", "sk-ant-..."),
        ...     ],
        ... )
    """
    # Map mode to pdf_mode; unrecognized values fall back to "image".
    pdf_mode = mode if mode in ("image", "text", "both") else "image"

    # =========================================================================
    # Batch mode — bypass summarize_ensemble entirely
    # =========================================================================
    if batch_mode:
        # Imported lazily so the batch machinery is only loaded when requested.
        from ._batch import UNSUPPORTED_BATCH_PROVIDERS, run_batch_summarize
        from .text_functions_ensemble import _detect_input_type, prepare_model_configs

        # Guard: text input only
        detected_type = _detect_input_type(input_data)
        if detected_type == "pdf":
            raise ValueError(
                "batch_mode=True only supports text input, but detected input type is 'pdf'. "
                "Set batch_mode=False for PDF summarization."
            )

        # Warn if progress_callback was provided (incompatible with batch)
        if progress_callback is not None:
            print(
                "[CatLLM] WARNING: progress_callback is not available in batch_mode "
                "(no per-item progress until the job completes). Ignoring callback."
            )

        # Build models list: fall back to the single-model arguments when no
        # explicit ensemble was supplied.
        if models is None:
            batch_models = [(user_model, model_source, api_key)]
        else:
            batch_models = models

        model_configs = prepare_model_configs(batch_models)

        # Normalize the input to a list of items. A bare string is ONE
        # document: list("abc") would split it into characters and submit a
        # separate batch request per character.
        if isinstance(input_data, str):
            items = [input_data]
        elif isinstance(input_data, list):
            items = input_data
        else:
            items = list(input_data)

        prompt_params = {
            "input_description": description,
            "summary_instructions": instructions,
            "max_length": max_length,
            "focus": focus,
            "chain_of_thought": chain_of_thought,
            "context_prompt": context_prompt,
            "step_back_prompt": step_back_prompt,
            "stepback_insights": {},
            "creativity": creativity,
        }

        # Single-model batch path
        if len(batch_models) == 1:
            cfg = model_configs[0]
            if cfg["provider"] in UNSUPPORTED_BATCH_PROVIDERS:
                raise ValueError(
                    f"batch_mode=True is not supported for provider '{cfg['provider']}'. "
                    "Supported providers: openai, anthropic, google, mistral, xai."
                )
            return run_batch_summarize(
                items=items,
                cfg=cfg,
                prompt_params=prompt_params,
                filename=filename,
                save_directory=save_directory,
                batch_poll_interval=batch_poll_interval,
                batch_timeout=batch_timeout,
                fail_strategy=fail_strategy,
            )

        # Ensemble batch path
        print(
            "[CatLLM] NOTE: batch_mode=True with multiple models is experimental. "
            "Each model submits a separate batch job concurrently."
        )
        from ._batch import run_batch_ensemble_summarize

        # A per-model creativity setting overrides the call-level value;
        # all other prompt parameters are shared across models.
        prompt_params_per_model = {
            cfg["model"]: {
                **prompt_params,
                "creativity": cfg["creativity"] if cfg["creativity"] is not None else creativity,
            }
            for cfg in model_configs
        }
        return run_batch_ensemble_summarize(
            items=items,
            model_configs=model_configs,
            prompt_params_per_model=prompt_params_per_model,
            fail_strategy=fail_strategy,
            filename=filename,
            save_directory=save_directory,
            batch_poll_interval=batch_poll_interval,
            batch_timeout=batch_timeout,
            max_retries=max_retries,
        )

    # Synchronous path: delegate everything to summarize_ensemble.
    return summarize_ensemble(
        input_data=input_data,
        api_key=api_key,
        input_description=description,
        summary_instructions=instructions,
        max_length=max_length,
        focus=focus,
        user_model=user_model,
        model_source=model_source,
        pdf_mode=pdf_mode,
        pdf_dpi=pdf_dpi,
        creativity=creativity,
        thinking_budget=thinking_budget,
        chain_of_thought=chain_of_thought,
        context_prompt=context_prompt,
        step_back_prompt=step_back_prompt,
        max_retries=max_retries,
        batch_retries=batch_retries,
        retry_delay=retry_delay,
        row_delay=row_delay,
        fail_strategy=fail_strategy,
        safety=safety,
        filename=filename,
        save_directory=save_directory,
        progress_callback=progress_callback,
        models=models,
        max_workers=max_workers,
        parallel=parallel,
        auto_download=auto_download,
    )
|