recursive-llm-ts 4.4.1 → 4.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,566 @@
1
+ package rlm
2
+
3
+ import (
4
+ "fmt"
5
+ "strings"
6
+ "sync"
7
+ )
8
+
9
// ContextOverflowConfig groups the settings that control automatic recovery
// when a request is rejected for exceeding the model's context window.
type ContextOverflowConfig struct {
	// Enabled switches automatic overflow detection and recovery on
	// (default: true when config present).
	Enabled bool `json:"enabled"`

	// MaxModelTokens, when positive, overrides the model token limit that is
	// otherwise detected from API errors (0 = auto-detect).
	MaxModelTokens int `json:"max_model_tokens,omitempty"`

	// Strategy names the reduction method: "mapreduce" (default), "truncate",
	// "chunked", "tfidf", "textrank", or "refine".
	Strategy string `json:"strategy,omitempty"`

	// SafetyMargin is the fraction of the token budget held back for prompts
	// and overhead (default: 0.15).
	SafetyMargin float64 `json:"safety_margin,omitempty"`

	// MaxReductionAttempts caps how many times a request is retried with a
	// smaller context (default: 3).
	MaxReductionAttempts int `json:"max_reduction_attempts,omitempty"`
}
22
+
23
+ // DefaultContextOverflowConfig returns sensible defaults for overflow handling.
24
+ func DefaultContextOverflowConfig() ContextOverflowConfig {
25
+ return ContextOverflowConfig{
26
+ Enabled: true,
27
+ Strategy: "mapreduce",
28
+ SafetyMargin: 0.15,
29
+ MaxReductionAttempts: 3,
30
+ }
31
+ }
32
+
33
+ // ─── Token Estimation ────────────────────────────────────────────────────────
34
+
35
// EstimateTokens returns a quick approximation of how many tokens text will
// occupy, using a fixed ratio of 3.5 bytes per token and rounding up. The
// ratio is chosen to err on the high side so callers stay under the limit.
//
// Approximate ratios for common encodings:
//   - English text: ~4 chars/token (cl100k_base)
//   - JSON/code:    ~3.5 chars/token
//   - CJK text:     ~1.5 chars/token
//   - Mixed:        ~3.5 chars/token (safe default)
//
// The length is measured with len, i.e. in bytes. An empty string yields 0.
func EstimateTokens(text string) int {
	size := len(text)
	if size == 0 {
		return 0
	}

	// ceil(size / 3.5) == ceil(10*size / 35), kept in integer arithmetic.
	scaled := size * 10
	tokens := scaled / 35
	if scaled%35 != 0 {
		tokens++
	}
	return tokens
}
51
+
52
+ // EstimateMessagesTokens estimates the total tokens for a set of chat messages.
53
+ // Includes per-message overhead (~4 tokens per message for role + formatting).
54
+ func EstimateMessagesTokens(messages []Message) int {
55
+ total := 3 // Every reply is primed with <|im_start|>assistant<|im_sep|>
56
+ for _, msg := range messages {
57
+ total += 4 // role + formatting overhead
58
+ total += EstimateTokens(msg.Content)
59
+ }
60
+ return total
61
+ }
62
+
63
+ // ─── Context Chunking ────────────────────────────────────────────────────────
64
+
65
// ChunkContext splits context into consecutive chunks that each fit within a
// token budget. Boundaries are placed on paragraph, line, sentence, or word
// breaks when one exists near the end of a chunk, and every chunk after the
// first starts roughly 10% of a chunk before the previous boundary so that
// neighbouring chunks share some text.
//
// Parameters:
//   - context: the text to split.
//   - maxTokensPerChunk: token budget per chunk; values <= 0 fall back to 4000.
//
// The budget is converted to bytes at 3 bytes per token. If the whole context
// fits, it is returned unchanged as the only element. Chunk boundaries and
// overlap starts are kept off the middle of multi-byte UTF-8 sequences
// whenever a rune start lies within three bytes.
func ChunkContext(context string, maxTokensPerChunk int) []string {
	if maxTokensPerChunk <= 0 {
		maxTokensPerChunk = 4000
	}

	// Use 3 chars/token (tighter than the 3.5 used for estimation) to leave room.
	maxCharsPerChunk := maxTokensPerChunk * 3
	if len(context) <= maxCharsPerChunk {
		return []string{context}
	}

	overlapChars := maxCharsPerChunk / 10 // 10% overlap for context continuity

	var chunks []string
	pos := 0
	for pos < len(context) {
		end := pos + maxCharsPerChunk
		if end >= len(context) {
			chunks = append(chunks, context[pos:])
			break
		}

		// findBreakPoint always returns a value greater than pos here.
		breakPoint := findBreakPoint(context, pos, end)
		chunks = append(chunks, context[pos:breakPoint])

		// Step back by the overlap, then further back (at most 3 bytes) to the
		// start of the rune we landed in, so the next chunk does not begin
		// with a stray UTF-8 continuation byte.
		next := breakPoint - overlapChars
		for steps := 0; steps < 3 && next > pos && context[next]&0xC0 == 0x80; steps++ {
			next--
		}
		// Guarantee forward progress: never restart at or before the current
		// chunk's start.
		if next <= pos {
			next = breakPoint
		}
		pos = next
	}

	return chunks
}

// findBreakPoint picks the position at which to end a chunk that starts at
// start and should end no later than targetEnd. It inspects the last 20% of
// the span [start, targetEnd) and returns the index just past, in order of
// preference: the last paragraph break ("\n\n"), the last line break ("\n"),
// the last sentence end (". ", "! " or "? ", whichever occurs latest), or the
// last space. When none is present it returns targetEnd, moved back by up to
// three bytes if needed so that a multi-byte UTF-8 sequence is not split.
// If targetEnd is at or beyond the end of text, len(text) is returned.
func findBreakPoint(text string, start int, targetEnd int) int {
	if targetEnd >= len(text) {
		return len(text)
	}

	// Search window: look back from targetEnd up to 20% of the chunk.
	searchStart := targetEnd - (targetEnd-start)/5
	if searchStart < start {
		searchStart = start
	}
	searchRegion := text[searchStart:targetEnd]

	if idx := strings.LastIndex(searchRegion, "\n\n"); idx >= 0 {
		return searchStart + idx + 2
	}

	if idx := strings.LastIndex(searchRegion, "\n"); idx >= 0 {
		return searchStart + idx + 1
	}

	// Take the latest sentence terminator of any kind, so the chunk ends as
	// close to the target as possible regardless of which punctuation it uses.
	lastSentence := -1
	for _, sep := range []string{". ", "! ", "? "} {
		if idx := strings.LastIndex(searchRegion, sep); idx > lastSentence {
			lastSentence = idx
		}
	}
	if lastSentence >= 0 {
		return searchStart + lastSentence + 2 // every separator is two bytes long
	}

	if idx := strings.LastIndex(searchRegion, " "); idx >= 0 {
		return searchStart + idx + 1
	}

	// No separator: cut at the target, backing off continuation bytes
	// (10xxxxxx) so the cut lands on a rune start. If no rune start is found
	// within three bytes (invalid UTF-8 or a chunk smaller than one rune),
	// fall back to the raw target.
	cut := targetEnd
	for steps := 0; steps < 3 && cut > start && text[cut]&0xC0 == 0x80; steps++ {
		cut--
	}
	if cut > start && text[cut]&0xC0 != 0x80 {
		return cut
	}
	return targetEnd
}
148
+
149
+ // ─── MapReduce Context Reduction ─────────────────────────────────────────────
150
+
151
// MapReduceResult describes the outcome of a MapReduce context reduction.
// NOTE(review): nothing in this part of the file populates it; the field
// meanings are taken from their names and should be confirmed against callers.
type MapReduceResult struct {
	ReducedContext string // the context after reduction
	ChunkCount     int    // number of chunks involved
	OriginalTokens int    // token count before reduction
	ReducedTokens  int    // token count after reduction
}
158
+
159
+ // contextReducer manages context reduction for overflow recovery
160
+ type contextReducer struct {
161
+ rlm *RLM
162
+ config ContextOverflowConfig
163
+ obs *Observer
164
+ }
165
+
166
+ // newContextReducer creates a reducer bound to an RLM engine
167
+ func newContextReducer(rlm *RLM, config ContextOverflowConfig, obs *Observer) *contextReducer {
168
+ return &contextReducer{rlm: rlm, config: config, obs: obs}
169
+ }
170
+
171
+ // ReduceForCompletion handles context overflow for a regular completion.
172
+ // It chunks the context, summarizes each chunk, and combines the summaries.
173
+ func (cr *contextReducer) ReduceForCompletion(query string, context string, modelLimit int) (string, error) {
174
+ cr.obs.Debug("overflow", "Starting MapReduce context reduction: %d estimated tokens, limit %d", EstimateTokens(context), modelLimit)
175
+
176
+ // Calculate safe token budget per chunk
177
+ // Reserve tokens for: system prompt (~500), query, overhead, safety margin
178
+ queryTokens := EstimateTokens(query)
179
+ overhead := 500 + queryTokens + int(float64(modelLimit)*cr.config.SafetyMargin)
180
+ safeTokensPerChunk := modelLimit - overhead
181
+
182
+ if safeTokensPerChunk <= 0 {
183
+ safeTokensPerChunk = modelLimit / 2
184
+ }
185
+
186
+ chunks := ChunkContext(context, safeTokensPerChunk)
187
+ cr.obs.Debug("overflow", "Split context into %d chunks (budget: %d tokens/chunk)", len(chunks), safeTokensPerChunk)
188
+
189
+ if len(chunks) == 1 {
190
+ // Context is already small enough (or couldn't be meaningfully split)
191
+ return context, nil
192
+ }
193
+
194
+ switch cr.config.Strategy {
195
+ case "truncate":
196
+ return cr.reduceByTruncation(context, modelLimit, overhead)
197
+ case "chunked":
198
+ return cr.reduceByChunkedExtraction(query, chunks, modelLimit, overhead)
199
+ case "tfidf":
200
+ return cr.reduceByTFIDF(context, modelLimit, overhead)
201
+ case "textrank":
202
+ return cr.reduceByTextRank(context, modelLimit, overhead)
203
+ case "refine":
204
+ return cr.reduceByRefine(query, chunks, modelLimit, overhead)
205
+ default: // "mapreduce"
206
+ return cr.reduceByMapReduce(query, chunks, modelLimit, overhead)
207
+ }
208
+ }
209
+
210
+ // reduceByMapReduce summarizes each chunk and combines the summaries.
211
+ func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, modelLimit int, overhead int) (string, error) {
212
+ cr.obs.Debug("overflow", "Using MapReduce strategy with %d chunks", len(chunks))
213
+
214
+ summaries := make([]string, len(chunks))
215
+ errs := make([]error, len(chunks))
216
+ var wg sync.WaitGroup
217
+
218
+ // Map phase: summarize each chunk in parallel
219
+ for i, chunk := range chunks {
220
+ wg.Add(1)
221
+ go func(idx int, chunkText string) {
222
+ defer wg.Done()
223
+
224
+ mapPrompt := fmt.Sprintf(
225
+ "Summarize the following text chunk, preserving all key facts, data points, names, numbers, and specific details that would be needed to answer the question: %q\n\n"+
226
+ "IMPORTANT: Be thorough and retain specific data. Do not omit numbers, percentages, dates, or named entities.\n\n"+
227
+ "Text chunk (%d of %d):\n%s",
228
+ query, idx+1, len(chunks), chunkText,
229
+ )
230
+
231
+ messages := []Message{
232
+ {Role: "system", Content: "You are a precise summarization assistant. Preserve all factual details, data points, and specific information."},
233
+ {Role: "user", Content: mapPrompt},
234
+ }
235
+
236
+ result, err := CallChatCompletion(ChatRequest{
237
+ Model: cr.rlm.model,
238
+ Messages: messages,
239
+ APIBase: cr.rlm.apiBase,
240
+ APIKey: cr.rlm.apiKey,
241
+ Timeout: cr.rlm.timeoutSeconds,
242
+ ExtraParams: cr.rlm.extraParams,
243
+ })
244
+ if err != nil {
245
+ errs[idx] = fmt.Errorf("map phase chunk %d: %w", idx+1, err)
246
+ return
247
+ }
248
+
249
+ cr.rlm.stats.LlmCalls++
250
+ summaries[idx] = result
251
+ cr.obs.Debug("overflow", "Chunk %d/%d summarized: %d -> %d chars", idx+1, len(chunks), len(chunkText), len(result))
252
+ }(i, chunk)
253
+ }
254
+
255
+ wg.Wait()
256
+
257
+ // Check for errors
258
+ var mapErrors []string
259
+ for _, err := range errs {
260
+ if err != nil {
261
+ mapErrors = append(mapErrors, err.Error())
262
+ }
263
+ }
264
+ if len(mapErrors) > 0 {
265
+ return "", fmt.Errorf("MapReduce map phase failed: %s", strings.Join(mapErrors, "; "))
266
+ }
267
+
268
+ // Reduce phase: combine summaries
269
+ combined := strings.Join(summaries, "\n\n---\n\n")
270
+
271
+ // Check if combined summaries fit in the budget
272
+ if EstimateTokens(combined)+overhead < modelLimit {
273
+ cr.obs.Debug("overflow", "MapReduce complete: %d -> %d estimated tokens", EstimateTokens(strings.Join(chunks, "")), EstimateTokens(combined))
274
+ return combined, nil
275
+ }
276
+
277
+ // If summaries are still too large, recursively reduce
278
+ cr.obs.Debug("overflow", "Combined summaries still too large (%d tokens), reducing recursively", EstimateTokens(combined))
279
+ return cr.ReduceForCompletion(query, combined, modelLimit)
280
+ }
281
+
282
+ // reduceByTruncation simply truncates context to fit within the limit.
283
+ func (cr *contextReducer) reduceByTruncation(context string, modelLimit int, overhead int) (string, error) {
284
+ cr.obs.Debug("overflow", "Using truncation strategy")
285
+
286
+ availableTokens := modelLimit - overhead
287
+ maxChars := availableTokens * 3 // Conservative chars-to-tokens
288
+
289
+ if maxChars >= len(context) {
290
+ return context, nil
291
+ }
292
+
293
+ // Keep beginning and end, truncate middle (addresses "lost in the middle" problem)
294
+ keepFromStart := maxChars * 2 / 3
295
+ keepFromEnd := maxChars / 3
296
+
297
+ truncated := context[:keepFromStart] +
298
+ "\n\n[... context truncated due to token limit ...]\n\n" +
299
+ context[len(context)-keepFromEnd:]
300
+
301
+ cr.obs.Debug("overflow", "Truncated context: %d -> %d chars", len(context), len(truncated))
302
+ return truncated, nil
303
+ }
304
+
305
+ // reduceByChunkedExtraction processes each chunk independently and returns all extracted content.
306
+ func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []string, modelLimit int, overhead int) (string, error) {
307
+ cr.obs.Debug("overflow", "Using chunked extraction strategy with %d chunks", len(chunks))
308
+
309
+ results := make([]string, len(chunks))
310
+ errs := make([]error, len(chunks))
311
+ var wg sync.WaitGroup
312
+
313
+ for i, chunk := range chunks {
314
+ wg.Add(1)
315
+ go func(idx int, chunkText string) {
316
+ defer wg.Done()
317
+
318
+ extractPrompt := fmt.Sprintf(
319
+ "Extract all information relevant to the following question from this text chunk. "+
320
+ "Include specific data, facts, quotes, and details. If nothing relevant is found, respond with 'NO_RELEVANT_CONTENT'.\n\n"+
321
+ "Question: %s\n\nText chunk (%d of %d):\n%s",
322
+ query, idx+1, len(chunks), chunkText,
323
+ )
324
+
325
+ messages := []Message{
326
+ {Role: "system", Content: "You are a precise information extraction assistant. Extract only relevant information."},
327
+ {Role: "user", Content: extractPrompt},
328
+ }
329
+
330
+ result, err := CallChatCompletion(ChatRequest{
331
+ Model: cr.rlm.model,
332
+ Messages: messages,
333
+ APIBase: cr.rlm.apiBase,
334
+ APIKey: cr.rlm.apiKey,
335
+ Timeout: cr.rlm.timeoutSeconds,
336
+ ExtraParams: cr.rlm.extraParams,
337
+ })
338
+ if err != nil {
339
+ errs[idx] = fmt.Errorf("chunked extraction chunk %d: %w", idx+1, err)
340
+ return
341
+ }
342
+
343
+ cr.rlm.stats.LlmCalls++
344
+ if strings.TrimSpace(result) != "NO_RELEVANT_CONTENT" {
345
+ results[idx] = result
346
+ }
347
+ }(i, chunk)
348
+ }
349
+
350
+ wg.Wait()
351
+
352
+ var extractErrors []string
353
+ for _, err := range errs {
354
+ if err != nil {
355
+ extractErrors = append(extractErrors, err.Error())
356
+ }
357
+ }
358
+ if len(extractErrors) > 0 {
359
+ return "", fmt.Errorf("chunked extraction failed: %s", strings.Join(extractErrors, "; "))
360
+ }
361
+
362
+ // Combine non-empty results
363
+ var parts []string
364
+ for _, r := range results {
365
+ if r != "" {
366
+ parts = append(parts, r)
367
+ }
368
+ }
369
+
370
+ if len(parts) == 0 {
371
+ return "No relevant content found across all chunks.", nil
372
+ }
373
+
374
+ return strings.Join(parts, "\n\n---\n\n"), nil
375
+ }
376
+
377
+ // ─── Refine Sequential Strategy ─────────────────────────────────────────────
378
+
379
+ // reduceByRefine processes chunks sequentially, where the first chunk generates
380
+ // an initial answer and each subsequent chunk refines it. This approach has the
381
+ // highest information fidelity because every chunk is processed in context of
382
+ // the cumulative answer, but is sequential (not parallelizable).
383
+ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLimit int, overhead int) (string, error) {
384
+ cr.obs.Debug("overflow", "Using refine strategy with %d chunks", len(chunks))
385
+
386
+ if len(chunks) == 0 {
387
+ return "", fmt.Errorf("refine strategy: no chunks to process")
388
+ }
389
+
390
+ // Phase 1: Generate initial answer from the first chunk
391
+ initialPrompt := fmt.Sprintf(
392
+ "Using the following context, provide a comprehensive answer to the question.\n"+
393
+ "Preserve all key facts, data points, names, numbers, and specific details.\n\n"+
394
+ "Question: %s\n\nContext:\n%s",
395
+ query, chunks[0],
396
+ )
397
+
398
+ messages := []Message{
399
+ {Role: "system", Content: "You are a precise information synthesis assistant. Preserve all factual details and specific data points."},
400
+ {Role: "user", Content: initialPrompt},
401
+ }
402
+
403
+ currentAnswer, err := CallChatCompletion(ChatRequest{
404
+ Model: cr.rlm.model,
405
+ Messages: messages,
406
+ APIBase: cr.rlm.apiBase,
407
+ APIKey: cr.rlm.apiKey,
408
+ Timeout: cr.rlm.timeoutSeconds,
409
+ ExtraParams: cr.rlm.extraParams,
410
+ })
411
+ if err != nil {
412
+ return "", fmt.Errorf("refine initial chunk: %w", err)
413
+ }
414
+ cr.rlm.stats.LlmCalls++
415
+ cr.obs.Debug("overflow", "Refine: initial answer from chunk 1/%d (%d chars)", len(chunks), len(currentAnswer))
416
+
417
+ // Phase 2: Refine the answer with each subsequent chunk
418
+ for i := 1; i < len(chunks); i++ {
419
+ refinePrompt := fmt.Sprintf(
420
+ "You have an existing answer to the question: %q\n\n"+
421
+ "Existing answer:\n%s\n\n"+
422
+ "Now you have additional context that may contain new information, corrections, or supporting details.\n"+
423
+ "Refine the existing answer by incorporating any relevant new information from this context.\n"+
424
+ "If this context adds nothing new, return the existing answer unchanged.\n"+
425
+ "IMPORTANT: Never remove information from the existing answer unless it is contradicted by the new context.\n\n"+
426
+ "Additional context (chunk %d of %d):\n%s",
427
+ query, currentAnswer, i+1, len(chunks), chunks[i],
428
+ )
429
+
430
+ messages := []Message{
431
+ {Role: "system", Content: "You are a precise information synthesis assistant. Refine answers by incorporating new context without losing existing information."},
432
+ {Role: "user", Content: refinePrompt},
433
+ }
434
+
435
+ refined, err := CallChatCompletion(ChatRequest{
436
+ Model: cr.rlm.model,
437
+ Messages: messages,
438
+ APIBase: cr.rlm.apiBase,
439
+ APIKey: cr.rlm.apiKey,
440
+ Timeout: cr.rlm.timeoutSeconds,
441
+ ExtraParams: cr.rlm.extraParams,
442
+ })
443
+ if err != nil {
444
+ cr.obs.Debug("overflow", "Refine: chunk %d/%d failed: %v, keeping current answer", i+1, len(chunks), err)
445
+ // On error, keep current answer rather than failing entirely
446
+ continue
447
+ }
448
+ cr.rlm.stats.LlmCalls++
449
+ currentAnswer = refined
450
+ cr.obs.Debug("overflow", "Refine: incorporated chunk %d/%d (%d chars)", i+1, len(chunks), len(currentAnswer))
451
+ }
452
+
453
+ // Verify the refined answer fits within budget
454
+ if EstimateTokens(currentAnswer)+overhead < modelLimit {
455
+ cr.obs.Debug("overflow", "Refine complete: answer is %d estimated tokens", EstimateTokens(currentAnswer))
456
+ return currentAnswer, nil
457
+ }
458
+
459
+ // If the refined answer is still too large, truncate it
460
+ cr.obs.Debug("overflow", "Refine answer too large (%d tokens), truncating", EstimateTokens(currentAnswer))
461
+ return cr.reduceByTruncation(currentAnswer, modelLimit, overhead)
462
+ }
463
+
464
+ // ─── TF-IDF Strategy (wrapper for contextReducer) ───────────────────────────
465
+
466
+ // reduceByTFIDF uses TF-IDF extractive compression - pure algorithmic, no API calls.
467
+ func (cr *contextReducer) reduceByTFIDF(context string, modelLimit int, overhead int) (string, error) {
468
+ cr.obs.Debug("overflow", "Using TF-IDF extractive strategy")
469
+
470
+ availableTokens := modelLimit - overhead
471
+ if availableTokens <= 0 {
472
+ availableTokens = modelLimit / 2
473
+ }
474
+
475
+ result := CompressContextTFIDF(context, availableTokens)
476
+ cr.obs.Debug("overflow", "TF-IDF compressed: %d -> %d chars (%d -> %d est. tokens)",
477
+ len(context), len(result), EstimateTokens(context), EstimateTokens(result))
478
+ return result, nil
479
+ }
480
+
481
+ // ─── TextRank Strategy (wrapper for contextReducer) ─────────────────────────
482
+
483
+ // reduceByTextRank uses TextRank graph-based ranking - pure algorithmic, no API calls.
484
+ func (cr *contextReducer) reduceByTextRank(context string, modelLimit int, overhead int) (string, error) {
485
+ cr.obs.Debug("overflow", "Using TextRank graph-based strategy")
486
+
487
+ availableTokens := modelLimit - overhead
488
+ if availableTokens <= 0 {
489
+ availableTokens = modelLimit / 2
490
+ }
491
+
492
+ result := CompressContextTextRank(context, availableTokens)
493
+ cr.obs.Debug("overflow", "TextRank compressed: %d -> %d chars (%d -> %d est. tokens)",
494
+ len(context), len(result), EstimateTokens(context), EstimateTokens(result))
495
+ return result, nil
496
+ }
497
+
498
+ // ─── Adaptive Completion with Overflow Recovery ──────────────────────────────
499
+
500
+ // completionWithOverflowRecovery wraps a completion call with automatic overflow detection and retry.
501
+ // When a context overflow error is detected, it reduces the context and retries.
502
+ func (r *RLM) completionWithOverflowRecovery(query string, context string, overflowConfig ContextOverflowConfig) (string, RLMStats, error) {
503
+ obs := r.observer
504
+ if obs == nil {
505
+ obs = NewNoopObserver()
506
+ }
507
+
508
+ // Try the normal completion first
509
+ result, stats, err := r.Completion(query, context)
510
+ if err == nil {
511
+ return result, stats, nil
512
+ }
513
+
514
+ // Check if it's a context overflow error
515
+ coe, isOverflow := IsContextOverflow(err)
516
+ if !isOverflow {
517
+ return "", stats, err // Not an overflow error, return original error
518
+ }
519
+
520
+ obs.Debug("overflow", "Context overflow detected: model limit %d, request %d tokens (%.1f%% over)",
521
+ coe.ModelLimit, coe.RequestTokens, (coe.OverflowRatio()-1)*100)
522
+
523
+ // Use detected limit or configured limit
524
+ modelLimit := coe.ModelLimit
525
+ if overflowConfig.MaxModelTokens > 0 {
526
+ modelLimit = overflowConfig.MaxModelTokens
527
+ }
528
+
529
+ reducer := newContextReducer(r, overflowConfig, obs)
530
+
531
+ // Attempt context reduction and retry
532
+ for attempt := 0; attempt < overflowConfig.MaxReductionAttempts; attempt++ {
533
+ obs.Debug("overflow", "Reduction attempt %d/%d", attempt+1, overflowConfig.MaxReductionAttempts)
534
+
535
+ reducedContext, reduceErr := reducer.ReduceForCompletion(query, context, modelLimit)
536
+ if reduceErr != nil {
537
+ obs.Error("overflow", "Context reduction failed: %v", reduceErr)
538
+ return "", stats, fmt.Errorf("context overflow recovery failed: %w", reduceErr)
539
+ }
540
+
541
+ obs.Debug("overflow", "Context reduced: %d -> %d chars", len(context), len(reducedContext))
542
+
543
+ // Retry with reduced context
544
+ result, stats, err = r.Completion(query, reducedContext)
545
+ if err == nil {
546
+ obs.Event("overflow.recovery_success", map[string]string{
547
+ "attempt": fmt.Sprintf("%d", attempt+1),
548
+ "original_chars": fmt.Sprintf("%d", len(context)),
549
+ "reduced_chars": fmt.Sprintf("%d", len(reducedContext)),
550
+ "reduction_ratio": fmt.Sprintf("%.2f", float64(len(reducedContext))/float64(len(context))),
551
+ })
552
+ return result, stats, nil
553
+ }
554
+
555
+ // If it overflows again, use the reduced context for the next attempt
556
+ if _, stillOverflow := IsContextOverflow(err); stillOverflow {
557
+ context = reducedContext
558
+ continue
559
+ }
560
+
561
+ // Different error, return it
562
+ return "", stats, err
563
+ }
564
+
565
+ return "", stats, fmt.Errorf("context overflow: exceeded %d reduction attempts, model limit is %d tokens", overflowConfig.MaxReductionAttempts, modelLimit)
566
+ }