abstractcore 2.6.9__py3-none-any.whl → 2.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. abstractcore/apps/summarizer.py +69 -27
  2. abstractcore/architectures/detection.py +190 -25
  3. abstractcore/assets/architecture_formats.json +129 -6
  4. abstractcore/assets/model_capabilities.json +803 -141
  5. abstractcore/config/main.py +2 -2
  6. abstractcore/config/manager.py +3 -1
  7. abstractcore/events/__init__.py +7 -1
  8. abstractcore/mcp/__init__.py +30 -0
  9. abstractcore/mcp/client.py +213 -0
  10. abstractcore/mcp/factory.py +64 -0
  11. abstractcore/mcp/naming.py +28 -0
  12. abstractcore/mcp/stdio_client.py +336 -0
  13. abstractcore/mcp/tool_source.py +164 -0
  14. abstractcore/processing/__init__.py +2 -2
  15. abstractcore/processing/basic_deepsearch.py +1 -1
  16. abstractcore/processing/basic_summarizer.py +379 -93
  17. abstractcore/providers/anthropic_provider.py +91 -10
  18. abstractcore/providers/base.py +540 -16
  19. abstractcore/providers/huggingface_provider.py +17 -8
  20. abstractcore/providers/lmstudio_provider.py +170 -25
  21. abstractcore/providers/mlx_provider.py +13 -10
  22. abstractcore/providers/ollama_provider.py +42 -26
  23. abstractcore/providers/openai_compatible_provider.py +87 -22
  24. abstractcore/providers/openai_provider.py +12 -9
  25. abstractcore/providers/streaming.py +201 -39
  26. abstractcore/providers/vllm_provider.py +78 -21
  27. abstractcore/server/app.py +116 -30
  28. abstractcore/structured/retry.py +20 -7
  29. abstractcore/tools/__init__.py +46 -24
  30. abstractcore/tools/abstractignore.py +166 -0
  31. abstractcore/tools/arg_canonicalizer.py +61 -0
  32. abstractcore/tools/common_tools.py +2443 -742
  33. abstractcore/tools/core.py +109 -13
  34. abstractcore/tools/handler.py +17 -3
  35. abstractcore/tools/parser.py +894 -159
  36. abstractcore/tools/registry.py +122 -18
  37. abstractcore/tools/syntax_rewriter.py +68 -6
  38. abstractcore/tools/tag_rewriter.py +186 -1
  39. abstractcore/utils/jsonish.py +111 -0
  40. abstractcore/utils/version.py +1 -1
  41. {abstractcore-2.6.9.dist-info → abstractcore-2.9.1.dist-info}/METADATA +56 -2
  42. {abstractcore-2.6.9.dist-info → abstractcore-2.9.1.dist-info}/RECORD +46 -37
  43. {abstractcore-2.6.9.dist-info → abstractcore-2.9.1.dist-info}/WHEEL +0 -0
  44. {abstractcore-2.6.9.dist-info → abstractcore-2.9.1.dist-info}/entry_points.txt +0 -0
  45. {abstractcore-2.6.9.dist-info → abstractcore-2.9.1.dist-info}/licenses/LICENSE +0 -0
  46. {abstractcore-2.6.9.dist-info → abstractcore-2.9.1.dist-info}/top_level.txt +0 -0
@@ -6,8 +6,10 @@ text processing capabilities with minimal complexity.
  """

  from enum import Enum
- from typing import List, Optional
- from pydantic import BaseModel, Field
+ import json
+ import re
+ from typing import List, Optional, Tuple
+ from pydantic import BaseModel, Field, ValidationError

  from ..core.interface import AbstractCoreInterface
  from ..core.factory import create_llm
@@ -35,6 +37,42 @@ class SummaryLength(Enum):
  COMPREHENSIVE = "comprehensive" # Full analysis with context


+ class CompressionMode(Enum):
+ """Compression aggressiveness for chat history summarization.
+
+ Controls how aggressively the summarizer compresses conversation history:
+ - LIGHT: Keep most information, only remove redundancy
+ - STANDARD: Balanced compression, main points and context
+ - HEAVY: Aggressive compression, only critical information
+ """
+ LIGHT = "light"
+ STANDARD = "standard"
+ HEAVY = "heavy"
+
+
+ # Compression mode-specific instructions for summarization prompts
+ COMPRESSION_INSTRUCTIONS = {
+ CompressionMode.LIGHT: (
+ "Preserve most details from this conversation while removing only redundancy. "
+ "Keep: all key decisions and outcomes, important context and background, "
+ "specific details/names/numbers/technical terms, all tool calls and results, "
+ "error messages and resolutions. Remove only: repetitive greetings, duplicate information."
+ ),
+ CompressionMode.STANDARD: (
+ "Summarize with balanced compression, keeping main points and essential context. "
+ "Keep: key decisions and rationale, important outcomes, critical context for ongoing work, "
+ "unresolved items and pending tasks. Remove: intermediate reasoning steps, "
+ "exploratory tangents, detailed tool outputs (keep only key findings)."
+ ),
+ CompressionMode.HEAVY: (
+ "Extract only the most critical information. Keep ONLY: final decisions made, "
+ "critical outcomes (success/failure), essential context to continue work, "
+ "blocking issues and hard dependencies. Remove: all exploratory discussion, "
+ "all intermediate steps, all detailed outputs, all background explanations."
+ ),
+ }
+
+
  class LLMSummaryOutput(BaseModel):
  """LLM-generated summary output (without word counts)"""
  summary: str = Field(description="The main summary text")
@@ -81,9 +119,10 @@ class BasicSummarizer:
  self,
  llm: Optional[AbstractCoreInterface] = None,
  max_chunk_size: int = 8000,
- max_tokens: int = 32000,
- max_output_tokens: int = 8000,
- timeout: Optional[float] = None
+ max_tokens: int = -1,
+ max_output_tokens: int = -1,
+ timeout: Optional[float] = None,
+ retry_strategy: Optional[FeedbackRetry] = None,
  ):
  """
  Initialize the summarizer
@@ -91,14 +130,26 @@ class BasicSummarizer:
  Args:
  llm: AbstractCore instance (any provider). If None, attempts to create ollama gemma3:1b-it-qat
  max_chunk_size: Maximum characters per chunk for long documents (default 8000)
- max_tokens: Maximum total tokens for LLM context (default 32000)
- max_output_tokens: Maximum tokens for LLM output generation (default 8000)
+ max_tokens: Maximum total tokens for LLM context (default -1 = AUTO).
+ - Use -1 (AUTO): Automatically uses model's context window capability
+ - Use specific value: Hard limit for deployment constraint (GPU/RAM limits)
+ Example: max_tokens=16000 limits to 16K even if model supports 128K
+ max_output_tokens: Maximum tokens for LLM output generation (default -1 = AUTO).
+ - Use -1 (AUTO): Automatically uses model's output capability
+ - Use specific value: Hard limit for output tokens
  timeout: HTTP request timeout in seconds. None for unlimited timeout (default None)
+ retry_strategy: Custom retry strategy for structured output. If None, uses default (3 attempts)
  """
  if llm is None:
  try:
  # Default to gemma3:1b-it-qat with configurable token limits
- self.llm = create_llm("ollama", model="gemma3:1b-it-qat", max_tokens=max_tokens, max_output_tokens=max_output_tokens, timeout=timeout)
+ # Only pass token limits if not using AUTO mode (-1)
+ llm_kwargs = {'timeout': timeout} if timeout is not None else {}
+ if max_tokens != -1:
+ llm_kwargs['max_tokens'] = max_tokens
+ if max_output_tokens != -1:
+ llm_kwargs['max_output_tokens'] = max_output_tokens
+ self.llm = create_llm("ollama", model="gemma3:1b-it-qat", **llm_kwargs)
  except Exception as e:
  error_msg = (
  f"❌ Failed to initialize default Ollama model 'gemma3:1b-it-qat': {e}\n\n"
@@ -126,9 +177,13 @@ class BasicSummarizer:
  else:
  self.llm = llm
  self.max_chunk_size = max_chunk_size
+ # Store token budgets. -1 means AUTO (use model's capability).
+ # In AbstractCore, `max_tokens` is the total (input + output) context budget.
+ self.max_tokens = max_tokens
+ self.max_output_tokens = max_output_tokens

- # Default retry strategy with 3 attempts
- self.retry_strategy = FeedbackRetry(max_attempts=3)
+ # Default retry strategy with 3 attempts (callers may override for latency-sensitive UX).
+ self.retry_strategy = retry_strategy or FeedbackRetry(max_attempts=3)

  def summarize(
  self,
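For context, a minimal usage sketch of the constructor changes above (illustrative only, not part of the diff; it assumes BasicSummarizer is importable from abstractcore.processing.basic_summarizer and FeedbackRetry from abstractcore.structured.retry, matching the file list, and that a local Ollama gemma3:1b-it-qat is available for the default path):

    from abstractcore.processing.basic_summarizer import BasicSummarizer
    from abstractcore.structured.retry import FeedbackRetry  # assumed import location

    # New 2.9.1 defaults: max_tokens=-1 / max_output_tokens=-1 (AUTO) follow the model's capabilities.
    summarizer = BasicSummarizer()

    # Deployment-constrained setup: hard caps apply even if the model advertises a larger context,
    # and the new retry_strategy parameter replaces the hard-coded 3-attempt default.
    constrained = BasicSummarizer(
        max_tokens=16000,
        max_output_tokens=2000,
        timeout=120,
        retry_strategy=FeedbackRetry(max_attempts=1),
    )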
@@ -192,36 +247,29 @@ class BasicSummarizer:
  # Build the prompt based on parameters
  prompt = self._build_prompt(text, focus, style, length)

- # Use AbstractCore's structured output with retry strategy (no word counts in LLM response)
- response = self.llm.generate(prompt, response_model=LLMSummaryOutput, retry_strategy=self.retry_strategy)
-
- # Extract the structured output
- llm_result = None
- if isinstance(response, LLMSummaryOutput):
- # When structured output succeeds, response is the LLMSummaryOutput object directly
- llm_result = response
- elif hasattr(response, 'structured_output') and response.structured_output:
- # Fallback: check for structured_output attribute
- llm_result = response.structured_output
- else:
- # Debug information for troubleshooting
- error_msg = f"Failed to generate structured summary output. Response type: {type(response)}"
- if hasattr(response, 'content'):
- error_msg += f", Content: {response.content[:200]}..."
- if hasattr(response, 'structured_output'):
- error_msg += f", Structured output: {response.structured_output}"
- raise ValueError(error_msg)
+ llm_result: Optional[LLMSummaryOutput] = None
+ try:
+ # Use AbstractCore's structured output with retry strategy (no word counts in LLM response)
+ response = self.llm.generate(prompt, response_model=LLMSummaryOutput, retry_strategy=self.retry_strategy)
+ llm_result = self._extract_summary_structured_output(response, context="summary")
+ except (json.JSONDecodeError, ValidationError) as e:
+ logger.warning(
+ "Structured summary output failed; falling back to marker format",
+ error_type=type(e).__name__,
+ error=str(e),
+ )
+ llm_result = self._summarize_fallback(text=text, focus=focus, style=style, length=length)

  # Compute word counts ourselves (reliable, client-side calculation)
  actual_original_words = len(text.split())
- actual_summary_words = len(llm_result.summary.split())
+ actual_summary_words = len((llm_result.summary if llm_result else "").split())

  # Create complete result with computed word counts
  return SummaryOutput(
- summary=llm_result.summary,
- key_points=llm_result.key_points,
- confidence=llm_result.confidence,
- focus_alignment=llm_result.focus_alignment,
+ summary=(llm_result.summary if llm_result else ""),
+ key_points=(llm_result.key_points if llm_result else []),
+ confidence=(llm_result.confidence if llm_result else 0.5),
+ focus_alignment=(llm_result.focus_alignment if llm_result else 0.5),
  word_count_original=actual_original_words,
  word_count_summary=actual_summary_words
  )
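Illustrative only (not part of the diff): a small sketch of the summarize() flow changed above, using the enums and SummaryOutput fields defined in this module; the import path is assumed from the file layout.

    from abstractcore.processing.basic_summarizer import (
        BasicSummarizer,
        SummaryLength,
        SummaryStyle,
    )

    summarizer = BasicSummarizer()
    report = "Q3 incident review: the outage was traced to a misconfigured cache TTL ..."

    result = summarizer.summarize(
        report,
        focus="root causes and follow-up actions",
        style=SummaryStyle.EXECUTIVE,
        length=SummaryLength.BRIEF,
    )

    # Word counts are computed client-side; confidence and focus_alignment come from the
    # structured output or, after 2.9.1, from the marker-format fallback parser.
    print(result.summary)
    print(result.key_points, result.confidence, result.word_count_summary)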
@@ -261,22 +309,31 @@ class BasicSummarizer:
  summary: str
  key_points: List[str] = Field(max_length=5)

- response = self.llm.generate(chunk_prompt, response_model=ChunkSummary, retry_strategy=self.retry_strategy)
- if isinstance(response, ChunkSummary):
- # When structured output succeeds, response is the ChunkSummary object directly
- chunk_summaries.append(response)
- elif hasattr(response, 'structured_output') and response.structured_output:
- # Fallback: check for structured_output attribute
- chunk_summaries.append(response.structured_output)
- else:
- # If chunk processing fails, create a fallback summary
- logger.warning("Chunk processing failed, creating fallback",
- chunk_number=i+1,
- total_chunks=len(chunks))
- chunk_summaries.append(ChunkSummary(
- summary=f"Section {i+1} content summary unavailable",
- key_points=["Content processing failed"]
- ))
+ try:
+ response = self.llm.generate(chunk_prompt, response_model=ChunkSummary, retry_strategy=self.retry_strategy)
+ if isinstance(response, ChunkSummary):
+ # When structured output succeeds, response is the ChunkSummary object directly
+ chunk_summaries.append(response)
+ elif hasattr(response, 'structured_output') and response.structured_output:
+ # Fallback: check for structured_output attribute
+ chunk_summaries.append(response.structured_output)
+ else:
+ raise ValueError(f"Unexpected chunk response type: {type(response)}")
+ except (json.JSONDecodeError, ValidationError, ValueError) as e:
+ # If chunk processing fails, create a minimal placeholder (do not fail the whole summary).
+ logger.warning(
+ "Chunk processing failed, creating fallback",
+ chunk_number=i + 1,
+ total_chunks=len(chunks),
+ error_type=type(e).__name__,
+ error=str(e),
+ )
+ chunk_summaries.append(
+ ChunkSummary(
+ summary=f"Section {i+1} content summary unavailable",
+ key_points=["Content processing failed"],
+ )
+ )

  # Step 2: Combine chunk summaries (Reduce phase)
  combined_text = "\n\n".join([
@@ -287,35 +344,28 @@ Create a unified summary that represents the entire document effectively."""
  # Generate final summary from combined summaries
  final_prompt = self._build_final_combination_prompt(combined_text, focus, style, length, len(text))

- response = self.llm.generate(final_prompt, response_model=LLMSummaryOutput, retry_strategy=self.retry_strategy)
-
- # Extract the structured output
- llm_result = None
- if isinstance(response, LLMSummaryOutput):
- # When structured output succeeds, response is the LLMSummaryOutput object directly
- llm_result = response
- elif hasattr(response, 'structured_output') and response.structured_output:
- # Fallback: check for structured_output attribute
- llm_result = response.structured_output
- else:
- # Debug information for troubleshooting
- error_msg = f"Failed to generate final structured summary output. Response type: {type(response)}"
- if hasattr(response, 'content'):
- error_msg += f", Content: {response.content[:200]}..."
- if hasattr(response, 'structured_output'):
- error_msg += f", Structured output: {response.structured_output}"
- raise ValueError(error_msg)
+ llm_result: Optional[LLMSummaryOutput] = None
+ try:
+ response = self.llm.generate(final_prompt, response_model=LLMSummaryOutput, retry_strategy=self.retry_strategy)
+ llm_result = self._extract_summary_structured_output(response, context="final_summary")
+ except (json.JSONDecodeError, ValidationError) as e:
+ logger.warning(
+ "Structured final summary output failed; falling back to marker format",
+ error_type=type(e).__name__,
+ error=str(e),
+ )
+ llm_result = self._summarize_fallback(text=combined_text, focus=focus, style=style, length=length)

  # Compute word counts ourselves (reliable, client-side calculation)
  actual_original_words = len(text.split())
- actual_summary_words = len(llm_result.summary.split())
+ actual_summary_words = len((llm_result.summary if llm_result else "").split())

  # Create complete result with computed word counts
  return SummaryOutput(
- summary=llm_result.summary,
- key_points=llm_result.key_points,
- confidence=llm_result.confidence,
- focus_alignment=llm_result.focus_alignment,
+ summary=(llm_result.summary if llm_result else ""),
+ key_points=(llm_result.key_points if llm_result else []),
+ confidence=(llm_result.confidence if llm_result else 0.5),
+ focus_alignment=(llm_result.focus_alignment if llm_result else 0.5),
  word_count_original=actual_original_words,
  word_count_summary=actual_summary_words
  )
@@ -324,6 +374,13 @@ class BasicSummarizer:
  """
  Determine if text should be chunked based on token count.

+ Token budget logic:
+ - max_tokens = -1 (AUTO): Uses model's full context window capability
+ - max_tokens = N: Hard limit (deployment constraint for GPU/RAM)
+
+ This ensures we don't exceed GPU memory constraints even when the model
+ theoretically supports larger contexts.
+
  Uses centralized TokenUtils for accurate token estimation.
  Falls back to character count if model information unavailable.
  """
@@ -334,18 +391,214 @@ class BasicSummarizer:
  if self.llm and hasattr(self.llm, 'model'):
  model_name = self.llm.model

- # Estimate tokens using centralized utility
- estimated_tokens = TokenUtils.estimate_tokens(text, model_name)
+ # Estimate tokens using centralized utility. If estimation fails for any reason,
+ # fall back to character chunking (conservative).
+ try:
+ estimated_tokens = TokenUtils.estimate_tokens(text, model_name)
+ except Exception:
+ return len(text) > self.max_chunk_size

- # Use a conservative token limit (leaving room for prompt overhead)
- # Most models have 32k+ context nowadays, so 8k tokens for input text is safe
- token_limit = 8000
+ # Determine the effective token budget
+ # Get provider's capabilities
+ provider_max_input = getattr(self.llm, "max_input_tokens", None) if self.llm else None
+ if provider_max_input is None:
+ provider_total = getattr(self.llm, "max_tokens", None) if self.llm else None
+ provider_output = getattr(self.llm, "max_output_tokens", None) if self.llm else None
+ if provider_total is not None and provider_output is not None:
+ try:
+ provider_max_input = int(provider_total) - int(provider_output)
+ except Exception:
+ provider_max_input = None

- if estimated_tokens > token_limit:
- return True
+ # Determine effective max_input_tokens based on configuration
+ if self.max_tokens == -1:
+ # AUTO mode: Use model's capability
+ if provider_max_input is not None:
+ max_input_tokens = provider_max_input
+ else:
+ # Fallback to safe default if model info unavailable
+ max_input_tokens = 24000 # Conservative default
+ else:
+ # User-specified limit (deployment constraint)
+ user_max_output = self.max_output_tokens if self.max_output_tokens != -1 else 8000
+ user_max_input = self.max_tokens - user_max_output

- # Fallback to character-based check for very long texts
- return len(text) > self.max_chunk_size
+ if provider_max_input is not None:
+ # Respect BOTH user limit AND model capability (take minimum)
+ max_input_tokens = min(provider_max_input, user_max_input)
+ else:
+ # No model info, use user limit
+ max_input_tokens = user_max_input
+
+ # Reserve prompt/formatting overhead (structured output schemas + instructions).
+ # Keep the historical safety floor (8000) for small-context models.
+ try:
+ token_limit = max(8000, int(max_input_tokens) - 1200)
+ except Exception:
+ token_limit = 8000
+
+ logger.debug(
+ "Chunking decision",
+ estimated_tokens=estimated_tokens,
+ token_limit=token_limit,
+ max_tokens_config=self.max_tokens,
+ is_auto_mode=(self.max_tokens == -1),
+ will_chunk=(estimated_tokens > token_limit)
+ )
+
+ return estimated_tokens > token_limit
+
+ def _extract_summary_structured_output(self, response: object, *, context: str) -> LLMSummaryOutput:
+ """Extract structured summary output from AbstractCore responses."""
+ if isinstance(response, LLMSummaryOutput):
+ return response
+ if hasattr(response, "structured_output") and getattr(response, "structured_output"):
+ return response.structured_output
+
+ error_msg = f"Failed to generate structured {context} output. Response type: {type(response)}"
+ if hasattr(response, "content") and getattr(response, "content"):
+ try:
+ error_msg += f", Content: {str(response.content)[:200]}..."
+ except Exception:
+ pass
+ if hasattr(response, "structured_output"):
+ try:
+ error_msg += f", Structured output: {getattr(response, 'structured_output')}"
+ except Exception:
+ pass
+ raise ValueError(error_msg)
+
+ def _summarize_fallback(
+ self,
+ *,
+ text: str,
+ focus: Optional[str],
+ style: SummaryStyle,
+ length: SummaryLength,
+ ) -> LLMSummaryOutput:
+ """Best-effort summary when structured output cannot be produced reliably."""
+ prompt = self._build_fallback_prompt(text=text, focus=focus, style=style, length=length)
+ response = self.llm.generate(prompt)
+ content = getattr(response, "content", None)
+ if content is None:
+ content = str(response)
+ summary, key_points, confidence, focus_alignment = self._parse_fallback_response(str(content))
+ return LLMSummaryOutput(
+ summary=summary,
+ key_points=key_points[:8],
+ confidence=confidence,
+ focus_alignment=focus_alignment,
+ )
+
+ def _build_fallback_prompt(
+ self,
+ *,
+ text: str,
+ focus: Optional[str],
+ style: SummaryStyle,
+ length: SummaryLength,
+ ) -> str:
+ """Build a non-JSON prompt that is easy to parse deterministically."""
+ style_instructions = {
+ SummaryStyle.STRUCTURED: "Present the summary in a clear, organized format with distinct sections or bullet points.",
+ SummaryStyle.NARRATIVE: "Write the summary as a flowing narrative that tells the story of the content.",
+ SummaryStyle.OBJECTIVE: "Maintain a neutral, factual tone without opinions or interpretations.",
+ SummaryStyle.ANALYTICAL: "Provide critical analysis with insights, implications, and deeper understanding.",
+ SummaryStyle.EXECUTIVE: "Focus on actionable insights, business implications, and key decisions.",
+ SummaryStyle.CONVERSATIONAL: "Preserve conversational context, key decisions, ongoing topics, and user intent. Focus on information needed for conversation continuity.",
+ }
+
+ length_instructions = {
+ SummaryLength.BRIEF: "Keep the summary very concise - 2-3 sentences covering only the most essential points.",
+ SummaryLength.STANDARD: "Provide a balanced summary of 1-2 paragraphs covering the main ideas.",
+ SummaryLength.DETAILED: "Create a comprehensive summary with multiple paragraphs covering all important aspects.",
+ SummaryLength.COMPREHENSIVE: "Provide an extensive analysis covering all significant points, context, and implications.",
+ }
+
+ focus_instruction = ""
+ if focus:
+ focus_instruction = f"\nPay special attention to: {focus}\n"
+
+ return f"""Analyze the following text and produce a summary.
+
+ {style_instructions[style]}
+ {length_instructions[length]}{focus_instruction}
+
+ Text to summarize:
+ {text}
+
+ Return your answer in this EXACT plain-text format (no JSON, no code blocks):
+
+ SUMMARY:
+ <the main summary text>
+
+ KEY POINTS:
+ - <point 1>
+ - <point 2>
+ - <point 3>
+
+ CONFIDENCE: <0-1>
+ FOCUS_ALIGNMENT: <0-1>
+ """
+
+ @staticmethod
+ def _parse_fallback_response(content: str) -> Tuple[str, List[str], float, float]:
+ """Parse marker-format fallback summaries into structured fields."""
+ text = (content or "").strip()
+ if not text:
+ return "", [], 0.5, 0.5
+
+ def _parse_score(label_re: str, default: float) -> float:
+ m = re.search(rf"(?im)^{label_re}\s*:\s*(.+?)\s*$", text)
+ if not m:
+ return default
+ raw = m.group(1).strip()
+ try:
+ if raw.endswith("%"):
+ val = float(raw[:-1].strip()) / 100.0
+ else:
+ val = float(raw)
+ except Exception:
+ return default
+ return max(0.0, min(1.0, val))
+
+ summary = ""
+ m_summary = re.search(r"(?is)summary\s*:\s*(.*?)\n\s*key\s*points\s*:", text)
+ if m_summary:
+ summary = m_summary.group(1).strip()
+ else:
+ # Best-effort: take the first paragraph.
+ summary = text.split("\n\n", 1)[0].strip()
+
+ key_points: List[str] = []
+ m_kp = re.search(
+ r"(?is)key\s*points\s*:\s*(.*?)(?:\n\s*confidence\s*:|\n\s*focus[_ ]alignment\s*:|\Z)",
+ text,
+ )
+ if m_kp:
+ block = m_kp.group(1)
+ for line in block.splitlines():
+ line = line.strip()
+ if not line:
+ continue
+ if line.startswith(("-", "•", "*")):
+ line = line.lstrip("-•*").strip()
+ if line:
+ key_points.append(line)
+ if not key_points:
+ # Fallback: try to extract bullet-like lines anywhere.
+ for line in text.splitlines():
+ line = line.strip()
+ if line.startswith(("-", "•", "*")):
+ cleaned = line.lstrip("-•*").strip()
+ if cleaned:
+ key_points.append(cleaned)
+ key_points = key_points[:8]
+
+ confidence = _parse_score("confidence", 0.6)
+ focus_alignment = _parse_score(r"focus[_ ]alignment", 0.6)
+
+ return summary, key_points, confidence, focus_alignment

  def _split_text_into_chunks(self, text: str, overlap: int = 200) -> List[str]:
  """Split text into overlapping chunks"""
@@ -493,7 +746,8 @@ Create a unified summary that represents the entire document effectively."""
  self,
  messages: List[dict],
  preserve_recent: int = 6,
- focus: Optional[str] = None
+ focus: Optional[str] = None,
+ compression_mode: CompressionMode = CompressionMode.STANDARD
  ) -> SummaryOutput:
  """
  Specialized method for chat history summarization following SOTA 2025 practices
@@ -502,6 +756,7 @@ Create a unified summary that represents the entire document effectively."""
  messages: List of message dicts with 'role' and 'content' keys
  preserve_recent: Number of recent messages to keep intact (default 6)
  focus: Optional focus for summarization (e.g., "key decisions", "technical solutions")
+ compression_mode: How aggressively to compress (LIGHT, STANDARD, HEAVY)

  Returns:
  SummaryOutput: Structured summary optimized for chat history context
@@ -511,36 +766,67 @@ Create a unified summary that represents the entire document effectively."""
  - Focuses on decisions, solutions, and ongoing topics
  - Maintains user intent and assistant responses
  - Optimized for chat continuation rather than standalone summary
+
+ Compression Modes:
+ - LIGHT: Keep most information, only remove redundancy
+ - STANDARD: Balanced compression, main points and context
+ - HEAVY: Aggressive compression, only critical information
  """
+ # Build focus with compression instructions
+ compression_instruction = COMPRESSION_INSTRUCTIONS.get(
+ compression_mode,
+ COMPRESSION_INSTRUCTIONS[CompressionMode.STANDARD]
+ )
+
+ # Combine user focus with compression instruction
+ if focus:
+ effective_focus = f"{compression_instruction} Focus especially on: {focus}"
+ else:
+ effective_focus = compression_instruction
+
+ # Map compression mode to summary length for appropriate output size
+ length_map = {
+ CompressionMode.LIGHT: SummaryLength.DETAILED,
+ CompressionMode.STANDARD: SummaryLength.STANDARD,
+ CompressionMode.HEAVY: SummaryLength.BRIEF,
+ }
+ target_length = length_map.get(compression_mode, SummaryLength.STANDARD)
+
+ logger.debug("Chat history summarization with compression mode",
+ message_count=len(messages),
+ preserve_recent=preserve_recent,
+ compression_mode=compression_mode.value,
+ target_length=target_length.value)
+
  if len(messages) <= preserve_recent:
  # If short enough, just summarize normally
- logger.debug("Chat history is short, using standard summarization",
- message_count=len(messages),
+ logger.debug("Chat history is short, using standard summarization",
+ message_count=len(messages),
  preserve_recent=preserve_recent)
  chat_text = self._format_chat_messages_to_text(messages)
  return self.summarize(
  chat_text,
- focus=focus or "conversational context and key information",
+ focus=effective_focus,
  style=SummaryStyle.CONVERSATIONAL,
- length=SummaryLength.STANDARD
+ length=target_length
  )

  # Split into older messages (to summarize) and recent messages (to preserve)
  older_messages = messages[:-preserve_recent]
  recent_messages = messages[-preserve_recent:]
-
- logger.debug("Splitting chat history for summarization",
+
+ logger.debug("Splitting chat history for summarization",
  total_messages=len(messages),
  older_messages=len(older_messages),
  recent_messages=len(recent_messages))

- # Summarize older messages with conversational focus
+ # Summarize older messages with conversational focus and compression mode
  older_text = self._format_chat_messages_to_text(older_messages)
  older_summary = self.summarize(
  older_text,
- focus=focus or "key decisions, solutions, and ongoing context",
+ focus=effective_focus,
  style=SummaryStyle.CONVERSATIONAL,
- length=SummaryLength.DETAILED
+ length=target_length
  )

  # The summary should ONLY contain the older messages summary
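Illustrative only (not part of the diff): a sketch of the new compression_mode parameter on summarize_chat_history, assuming the same import path as in the earlier sketches.

    from abstractcore.processing.basic_summarizer import BasicSummarizer, CompressionMode

    summarizer = BasicSummarizer()
    messages = [
        {"role": "user", "content": "Can you help me debug the failing deploy?"},
        {"role": "assistant", "content": "Sure - the pipeline fails at the migration step ..."},
        # ... more turns ...
    ]

    # HEAVY keeps only critical decisions/outcomes and maps to SummaryLength.BRIEF;
    # the most recent preserve_recent messages stay outside the summarized span.
    compact = summarizer.summarize_chat_history(
        messages,
        preserve_recent=6,
        focus="open tasks and decisions",
        compression_mode=CompressionMode.HEAVY,
    )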
@@ -581,4 +867,4 @@ Create a unified summary that represents the entire document effectively."""
  else:
  formatted_lines.append(f"[{role.upper()}]: {content}")

- return "\n\n".join(formatted_lines)
+ return "\n\n".join(formatted_lines)