cat-stack 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cat_stack/summarize.py ADDED
@@ -0,0 +1,290 @@
1
+ """
2
+ Summarization functions for CatLLM.
3
+
4
+ This module provides unified summarization for text and PDF inputs,
5
+ supporting both single-model and multi-model (ensemble) summarization.
6
+ """
7
+
8
+ import warnings
9
+
10
# Public API of this module: the unified ``summarize`` entry point and the
# ensemble implementation that it calls for non-batch requests.
__all__ = ["summarize", "summarize_ensemble"]
16
+
17
+ # Import provider infrastructure
18
+ from ._providers import (
19
+ UnifiedLLMClient,
20
+ detect_provider,
21
+ )
22
+
23
+ # Import the implementation functions from existing modules
24
+ from .text_functions_ensemble import (
25
+ summarize_ensemble,
26
+ )
27
+
28
+
29
def summarize(
    input_data,
    api_key: str = None,
    description: str = "",
    instructions: str = "",
    max_length: int = None,
    focus: str = None,
    user_model: str = "gpt-4o",
    model_source: str = "auto",
    mode: str = "image",
    pdf_dpi: int = 150,
    creativity: float = None,
    thinking_budget: int = 0,
    chain_of_thought: bool = True,
    context_prompt: bool = False,
    step_back_prompt: bool = False,
    filename: str = None,
    save_directory: str = None,
    progress_callback=None,
    models: list = None,
    max_workers: int = None,
    parallel: bool = None,
    auto_download: bool = False,
    # Robustness parameters
    safety: bool = False,
    max_retries: int = 5,
    batch_retries: int = 2,
    retry_delay: float = 1.0,
    row_delay: float = 0.0,
    fail_strategy: str = "partial",
    # Batch mode parameters
    batch_mode: bool = False,
    batch_poll_interval: float = 30.0,
    batch_timeout: float = 86400.0,
):
    """
    Summarize text or PDF data using LLMs.

    Supports single-model and multi-model (ensemble) summarization. In multi-model
    mode, summaries from all models are synthesized into a consensus summary.
    Input type is auto-detected from the data (text strings or PDF paths).

    With ``batch_mode=False`` (the default) every argument is forwarded to
    ``summarize_ensemble``. With ``batch_mode=True`` the request is routed to the
    batch runners in ``._batch`` instead and ``summarize_ensemble`` is not called.

    Args:
        input_data: Data to summarize. Can be:
            - Text: list of strings, pandas Series, or single string
            - PDF: directory path, single PDF path, or list of PDF paths
        api_key (str): API key for the model provider (single-model mode)
        description (str): Description of what the content contains (provides context)
        instructions (str): Specific summarization instructions (e.g., "bullet points")
        max_length (int): Maximum summary length in words
        focus (str): What to focus on (e.g., "main arguments", "emotional content")
        user_model (str): Model to use (default "gpt-4o")
        model_source (str): Provider - "auto", "openai", "anthropic", "google", etc.
        mode (str): PDF processing mode (only used for PDF input):
            - "image" (default): Render pages as images
            - "text": Extract text only
            - "both": Send both image and extracted text
            Any other value falls back to "image".
        pdf_dpi (int): DPI for PDF page rendering (default 150)
        creativity (float): Temperature setting (None uses provider default)
        thinking_budget (int): Token budget for extended thinking/reasoning.
            Provider-specific: Google (thinkingConfig), OpenAI (reasoning_effort),
            Anthropic (extended thinking). Default 0 (disabled).
        chain_of_thought (bool): Enable step-by-step reasoning (default True)
        context_prompt (bool): Add expert context prefix
        step_back_prompt (bool): Enable step-back prompting
        filename (str): Output CSV filename
        save_directory (str): Directory to save results
        progress_callback: Optional callback for progress updates
        models (list): For multi-model mode, list of (model, provider, api_key) tuples
        max_workers (int): Max parallel workers for API calls. None = auto.
        parallel (bool): Controls concurrent vs sequential model execution.
            - None (default): auto-detect (sequential for all-Ollama, parallel otherwise)
            - True: force parallel execution
            - False: force sequential execution
        auto_download (bool): Auto-download missing Ollama models. Default False.
        safety (bool): If True, saves progress after each item. Requires filename.
        max_retries (int): Max retries per API call. Default 5.
        batch_retries (int): Max retries for batch-level failures. Default 2.
        retry_delay (float): Delay between retries in seconds. Default 1.0.
        row_delay (float): Delay in seconds between processing each row. Default 0.0.
        fail_strategy (str): How to handle failures - "partial" (default) or "strict".
        batch_mode (bool): If True, use async batch API (50% cost savings).
            Supported providers: openai, anthropic, google, mistral, xai.
            Not compatible with PDF input.
        batch_poll_interval (float): Seconds between batch status checks. Default 30.
        batch_timeout (float): Max seconds to wait for batch completion. Default 86400 (24h).

    Returns:
        pd.DataFrame: Results with summary column(s):
            - input_data: Original text or page label (for PDFs)
            - summary: Generated summary (or consensus for multi-model)
            - summary_<model>: Per-model summaries (multi-model only)
            - processing_status: "success", "error", "skipped"
            - failed_models: Comma-separated list (multi-model only)
            - pdf_path: Path to source PDF (PDF mode only)
            - page_index: Page number, 0-indexed (PDF mode only)

    Raises:
        ValueError: If ``batch_mode=True`` and the input is detected as PDF, or
            if ``batch_mode=True`` is used with a single model whose provider is
            listed in ``UNSUPPORTED_BATCH_PROVIDERS``.

    Note:
        The batch path does not forward ``mode``, ``pdf_dpi``, ``thinking_budget``,
        ``safety``, ``batch_retries``, ``retry_delay``, ``row_delay``,
        ``max_workers``, ``parallel`` or ``auto_download``; ``progress_callback``
        is ignored with a printed warning, and ``max_retries`` is forwarded only
        when several models are given.

    Examples:
        >>> import cat_stack as cat
        >>>
        >>> # Single model text summarization
        >>> results = cat.summarize(
        ...     input_data=df['responses'],
        ...     description="Customer feedback",
        ...     api_key="your-api-key"
        ... )
        >>>
        >>> # PDF summarization (auto-detected)
        >>> results = cat.summarize(
        ...     input_data="/path/to/pdfs/",
        ...     description="Research papers",
        ...     mode="image",
        ...     api_key="your-api-key"
        ... )
        >>>
        >>> # With safety saves and row delay
        >>> results = cat.summarize(
        ...     input_data=df['responses'],
        ...     description="Customer feedback",
        ...     api_key="your-api-key",
        ...     safety=True,
        ...     filename="results.csv",
        ...     row_delay=1.0,
        ... )
        >>>
        >>> # Batch mode (50% cost savings)
        >>> results = cat.summarize(
        ...     input_data=df['responses'],
        ...     description="Customer feedback",
        ...     api_key="your-api-key",
        ...     batch_mode=True,
        ...     filename="batch_results.csv",
        ... )
        >>>
        >>> # Multi-model with synthesis
        >>> results = cat.summarize(
        ...     input_data=df['responses'],
        ...     models=[
        ...         ("gpt-4o", "openai", "sk-..."),
        ...         ("claude-sonnet-4-5-20250929", "anthropic", "sk-ant-..."),
        ...     ],
        ... )
    """
    # Map mode to pdf_mode; unrecognized values fall back to "image".
    pdf_mode = mode if mode in ("image", "text", "both") else "image"

    # =========================================================================
    # Batch mode — bypass summarize_ensemble entirely
    # =========================================================================
    if batch_mode:
        # Imported lazily so the batch machinery is only loaded when requested.
        from ._batch import UNSUPPORTED_BATCH_PROVIDERS, run_batch_summarize
        from .text_functions_ensemble import _detect_input_type, prepare_model_configs

        # Guard: text input only
        detected_type = _detect_input_type(input_data)
        if detected_type == "pdf":
            raise ValueError(
                "batch_mode=True only supports text input, but detected input type is 'pdf'. "
                "Set batch_mode=False for PDF summarization."
            )

        # Warn if progress_callback was provided (incompatible with batch)
        if progress_callback is not None:
            print(
                "[CatLLM] WARNING: progress_callback is not available in batch_mode "
                "(no per-item progress until the job completes). Ignoring callback."
            )

        # Build models list: fall back to the single-model arguments when no
        # explicit ``models`` list is supplied.
        if models is None:
            batch_models = [(user_model, model_source, api_key)]
        else:
            batch_models = models

        model_configs = prepare_model_configs(batch_models)

        # Normalize the input to a list of items. A lone string is one item:
        # calling list() on it would split it into single characters and submit
        # each character as its own batch request.
        if isinstance(input_data, str):
            items = [input_data]
        elif isinstance(input_data, list):
            items = input_data
        else:
            items = list(input_data)

        prompt_params = {
            "input_description": description,
            "summary_instructions": instructions,
            "max_length": max_length,
            "focus": focus,
            "chain_of_thought": chain_of_thought,
            "context_prompt": context_prompt,
            "step_back_prompt": step_back_prompt,
            "stepback_insights": {},
            "creativity": creativity,
        }

        # Single-model batch path
        if len(batch_models) == 1:
            cfg = model_configs[0]
            if cfg["provider"] in UNSUPPORTED_BATCH_PROVIDERS:
                raise ValueError(
                    f"batch_mode=True is not supported for provider '{cfg['provider']}'. "
                    f"Supported providers: openai, anthropic, google, mistral, xai."
                )
            return run_batch_summarize(
                items=items,
                cfg=cfg,
                prompt_params=prompt_params,
                filename=filename,
                save_directory=save_directory,
                batch_poll_interval=batch_poll_interval,
                batch_timeout=batch_timeout,
                fail_strategy=fail_strategy,
            )

        # Ensemble batch path
        print(
            "[CatLLM] NOTE: batch_mode=True with multiple models is experimental. "
            "Each model submits a separate batch job concurrently."
        )
        from ._batch import run_batch_ensemble_summarize

        # A per-model creativity from the model config takes precedence over
        # the function-level ``creativity`` argument.
        prompt_params_per_model = {
            cfg["model"]: {
                **prompt_params,
                "creativity": cfg["creativity"] if cfg["creativity"] is not None else creativity,
            }
            for cfg in model_configs
        }
        return run_batch_ensemble_summarize(
            items=items,
            model_configs=model_configs,
            prompt_params_per_model=prompt_params_per_model,
            fail_strategy=fail_strategy,
            filename=filename,
            save_directory=save_directory,
            batch_poll_interval=batch_poll_interval,
            batch_timeout=batch_timeout,
            max_retries=max_retries,
        )

    # Non-batch path: delegate everything to the ensemble implementation,
    # translating this function's parameter names to the ones it expects.
    return summarize_ensemble(
        input_data=input_data,
        api_key=api_key,
        input_description=description,
        summary_instructions=instructions,
        max_length=max_length,
        focus=focus,
        user_model=user_model,
        model_source=model_source,
        pdf_mode=pdf_mode,
        pdf_dpi=pdf_dpi,
        creativity=creativity,
        thinking_budget=thinking_budget,
        chain_of_thought=chain_of_thought,
        context_prompt=context_prompt,
        step_back_prompt=step_back_prompt,
        max_retries=max_retries,
        batch_retries=batch_retries,
        retry_delay=retry_delay,
        row_delay=row_delay,
        fail_strategy=fail_strategy,
        safety=safety,
        filename=filename,
        save_directory=save_directory,
        progress_callback=progress_callback,
        models=models,
        max_workers=max_workers,
        parallel=parallel,
        auto_download=auto_download,
    )