recursive-llm-ts 4.6.0 → 4.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/rlm-go CHANGED
Binary file
@@ -65,6 +65,7 @@ export interface RLMConfig {
65
65
  temperature?: number;
66
66
  max_tokens?: number;
67
67
  structured?: any;
68
+ [key: string]: any;
68
69
  }
69
70
  export interface FileStorageConfig {
70
71
  /** Storage type: 'local' or 's3' */
@@ -30,6 +30,155 @@ func DefaultContextOverflowConfig() ContextOverflowConfig {
30
30
  }
31
31
  }
32
32
 
33
+ // ─── Model Token Limits ──────────────────────────────────────────────────────
34
+
35
// modelTokenLimits maps known model-name prefixes to their maximum context
// window size in tokens. It is used for pre-emptive overflow detection so we
// don't need to wait for API errors.
//
// Keys must be lower-case: LookupModelTokenLimit lower-cases the requested
// model name before matching. Matching is by longest prefix, so a short key
// such as "gpt-4" also catches every longer name that starts with it. Model
// families whose window differs from that of their shortest matching prefix
// therefore need their own, longer entry (e.g. "gpt-4.1", "gpt-4-1106-preview").
var modelTokenLimits = map[string]int{
	// OpenAI
	"gpt-4o":             128000,
	"gpt-4o-mini":        128000,
	"gpt-4-turbo":        128000,
	"gpt-4-0125-preview": 128000,
	"gpt-4-1106-preview": 128000,
	"gpt-4":              8192,
	"gpt-4-32k":          32768,
	"gpt-4.1":            1047576,
	"gpt-3.5-turbo":      16385,
	"gpt-3.5-turbo-16k":  16385,
	"o1":                 200000,
	"o1-mini":            128000,
	"o1-preview":         128000,
	"o3-mini":            200000,
	// Anthropic (via LiteLLM/proxy). Both the dotted aliases and the
	// hyphenated API identifiers (claude-3-5-sonnet-20241022, ...) are listed.
	"claude-3-opus":     200000,
	"claude-3-sonnet":   200000,
	"claude-3-haiku":    200000,
	"claude-3.5-sonnet": 200000,
	"claude-3.5-haiku":  200000,
	"claude-3-5-sonnet": 200000,
	"claude-3-5-haiku":  200000,
	"claude-3-7-sonnet": 200000,
	"claude-sonnet-4":   200000,
	"claude-opus-4":     200000,
	// Llama (common vLLM deployments)
	"llama-3":   8192,
	"llama-3.1": 128000,
	"llama-3.2": 128000,
	"llama-3.3": 128000,
	// Mistral
	"mistral-7b":    32768,
	"mixtral-8x7b":  32768,
	"mistral-large": 128000,
	"mistral-small": 128000,
	// Qwen
	"qwen-2":   32768,
	"qwen-2.5": 128000,
}
72
+
73
+ // LookupModelTokenLimit returns the known token limit for a model, or 0 if unknown.
74
+ // Matches by prefix so "gpt-4o-mini-2024-07-18" matches "gpt-4o-mini".
75
+ func LookupModelTokenLimit(model string) int {
76
+ lowerModel := strings.ToLower(model)
77
+
78
+ // Try exact match first
79
+ if limit, ok := modelTokenLimits[lowerModel]; ok {
80
+ return limit
81
+ }
82
+
83
+ // Try prefix matching (longest prefix wins)
84
+ bestMatch := ""
85
+ bestLimit := 0
86
+ for pattern, limit := range modelTokenLimits {
87
+ if strings.HasPrefix(lowerModel, pattern) && len(pattern) > len(bestMatch) {
88
+ bestMatch = pattern
89
+ bestLimit = limit
90
+ }
91
+ }
92
+
93
+ return bestLimit
94
+ }
95
+
96
+ // getModelTokenLimit returns the effective token limit for pre-emptive overflow checks.
97
+ // Priority: config override > model name lookup > 0 (disabled).
98
+ func (r *RLM) getModelTokenLimit() int {
99
+ if r.contextOverflow != nil && r.contextOverflow.MaxModelTokens > 0 {
100
+ return r.contextOverflow.MaxModelTokens
101
+ }
102
+ return LookupModelTokenLimit(r.model)
103
+ }
104
+
105
+ // ─── Pre-emptive Overflow Check ──────────────────────────────────────────────
106
+
107
// structuredPromptOverhead is the approximate token overhead for structured completion prompts
// (instructions, schema constraints, JSON formatting directives).
// It is a fixed estimate rather than a measured value; the test suite only
// requires it to stay within the 200-600 token range.
const structuredPromptOverhead = 350
110
+
111
+ // PreemptiveReduceContext checks if the context would overflow the model's token limit
112
+ // and reduces it proactively BEFORE building the prompt. Returns the (possibly reduced)
113
+ // context, or an error if reduction fails.
114
+ //
115
+ // This is called before the first LLM call, unlike post-hoc overflow recovery which
116
+ // only triggers after an API error. Following the RLM paper's principle that
117
+ // "the context window of the root LM is rarely clogged."
118
+ func (r *RLM) PreemptiveReduceContext(query string, context string, extraOverhead int) (string, bool, error) {
119
+ modelLimit := r.getModelTokenLimit()
120
+ if modelLimit == 0 {
121
+ // No known limit; skip pre-emptive check (will rely on post-hoc recovery)
122
+ return context, false, nil
123
+ }
124
+
125
+ if r.contextOverflow == nil || !r.contextOverflow.Enabled {
126
+ return context, false, nil
127
+ }
128
+
129
+ // Estimate total token budget needed
130
+ contextTokens := EstimateTokens(context)
131
+ queryTokens := EstimateTokens(query)
132
+ responseTokens := r.getResponseTokenBudget()
133
+ safetyMargin := r.contextOverflow.SafetyMargin
134
+ if safetyMargin == 0 {
135
+ safetyMargin = 0.15
136
+ }
137
+
138
+ totalEstimate := contextTokens + queryTokens + extraOverhead + responseTokens +
139
+ int(float64(modelLimit)*safetyMargin)
140
+
141
+ r.observer.Debug("overflow", "Pre-emptive check: context=%d query=%d overhead=%d response=%d safety=%d total=%d limit=%d",
142
+ contextTokens, queryTokens, extraOverhead, responseTokens,
143
+ int(float64(modelLimit)*safetyMargin), totalEstimate, modelLimit)
144
+
145
+ if totalEstimate <= modelLimit {
146
+ return context, false, nil
147
+ }
148
+
149
+ // Context would overflow — reduce it proactively
150
+ r.observer.Debug("overflow", "Pre-emptive reduction needed: estimated %d tokens > limit %d", totalEstimate, modelLimit)
151
+
152
+ reducer := newContextReducer(r, *r.contextOverflow, r.observer)
153
+ reduced, err := reducer.ReduceForCompletion(query, context, modelLimit)
154
+ if err != nil {
155
+ return context, false, fmt.Errorf("pre-emptive context reduction failed: %w", err)
156
+ }
157
+
158
+ r.observer.Debug("overflow", "Pre-emptive reduction: %d -> %d chars", len(context), len(reduced))
159
+ return reduced, true, nil
160
+ }
161
+
162
+ // getResponseTokenBudget extracts max_tokens or max_completion_tokens from ExtraParams.
163
+ func (r *RLM) getResponseTokenBudget() int {
164
+ if r.extraParams == nil {
165
+ return 0
166
+ }
167
+ for _, key := range []string{"max_completion_tokens", "max_tokens"} {
168
+ if v, ok := r.extraParams[key]; ok {
169
+ switch n := v.(type) {
170
+ case float64:
171
+ return int(n)
172
+ case int:
173
+ return n
174
+ case int64:
175
+ return int(n)
176
+ }
177
+ }
178
+ }
179
+ return 0
180
+ }
181
+
33
182
  // ─── Token Estimation ────────────────────────────────────────────────────────
34
183
 
35
184
  // EstimateTokens provides a fast approximation of token count for a string.
@@ -168,21 +317,53 @@ func newContextReducer(rlm *RLM, config ContextOverflowConfig, obs *Observer) *c
168
317
  return &contextReducer{rlm: rlm, config: config, obs: obs}
169
318
  }
170
319
 
320
// getResponseTokenBudget delegates to the RLM engine's method.
// It returns whatever response-token budget the owning engine reports, so the
// reducer and the engine always budget with the same number.
func (cr *contextReducer) getResponseTokenBudget() int {
	return cr.rlm.getResponseTokenBudget()
}
324
+
325
+ // makeMapPhaseParams creates ExtraParams suitable for map-phase LLM calls (summarization).
326
+ // It copies the user's ExtraParams but overrides max_tokens to a smaller value since
327
+ // summaries don't need as many tokens as the original completion.
328
+ func (cr *contextReducer) makeMapPhaseParams(modelLimit int) map[string]interface{} {
329
+ params := make(map[string]interface{})
330
+ // Copy all user params (custom_llm_provider, temperature, etc.)
331
+ for k, v := range cr.rlm.extraParams {
332
+ params[k] = v
333
+ }
334
+ // Override max_tokens for map-phase: use at most 1/4 of model limit or 2000, whichever is smaller
335
+ mapMaxTokens := modelLimit / 4
336
+ if mapMaxTokens > 2000 {
337
+ mapMaxTokens = 2000
338
+ }
339
+ if mapMaxTokens < 256 {
340
+ mapMaxTokens = 256
341
+ }
342
+ params["max_tokens"] = mapMaxTokens
343
+ // Remove max_completion_tokens if present to avoid conflicts
344
+ delete(params, "max_completion_tokens")
345
+ return params
346
+ }
347
+
171
348
  // ReduceForCompletion handles context overflow for a regular completion.
172
349
  // It chunks the context, summarizes each chunk, and combines the summaries.
173
350
  func (cr *contextReducer) ReduceForCompletion(query string, context string, modelLimit int) (string, error) {
174
- cr.obs.Debug("overflow", "Starting MapReduce context reduction: %d estimated tokens, limit %d", EstimateTokens(context), modelLimit)
351
+ cr.obs.Debug("overflow", "Starting context reduction: %d estimated tokens, limit %d", EstimateTokens(context), modelLimit)
175
352
 
176
353
  // Calculate safe token budget per chunk
177
- // Reserve tokens for: system prompt (~500), query, overhead, safety margin
354
+ // Reserve tokens for: system prompt (~500), query, overhead, safety margin, response budget
178
355
  queryTokens := EstimateTokens(query)
179
- overhead := 500 + queryTokens + int(float64(modelLimit)*cr.config.SafetyMargin)
356
+ responseTokens := cr.getResponseTokenBudget()
357
+ overhead := 500 + queryTokens + int(float64(modelLimit)*cr.config.SafetyMargin) + responseTokens
180
358
  safeTokensPerChunk := modelLimit - overhead
181
359
 
182
360
  if safeTokensPerChunk <= 0 {
183
- safeTokensPerChunk = modelLimit / 2
361
+ safeTokensPerChunk = modelLimit / 4
184
362
  }
185
363
 
364
+ cr.obs.Debug("overflow", "Budget: overhead=%d (query=%d, response=%d, safety=%d), chunk budget=%d",
365
+ overhead, queryTokens, responseTokens, int(float64(modelLimit)*cr.config.SafetyMargin), safeTokensPerChunk)
366
+
186
367
  chunks := ChunkContext(context, safeTokensPerChunk)
187
368
  cr.obs.Debug("overflow", "Split context into %d chunks (budget: %d tokens/chunk)", len(chunks), safeTokensPerChunk)
188
369
 
@@ -211,6 +392,9 @@ func (cr *contextReducer) ReduceForCompletion(query string, context string, mode
211
392
  func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, modelLimit int, overhead int) (string, error) {
212
393
  cr.obs.Debug("overflow", "Using MapReduce strategy with %d chunks", len(chunks))
213
394
 
395
+ // Use map-phase-specific params with reduced max_tokens for summarization
396
+ mapPhaseParams := cr.makeMapPhaseParams(modelLimit)
397
+
214
398
  summaries := make([]string, len(chunks))
215
399
  errs := make([]error, len(chunks))
216
400
  var wg sync.WaitGroup
@@ -239,7 +423,7 @@ func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, model
239
423
  APIBase: cr.rlm.apiBase,
240
424
  APIKey: cr.rlm.apiKey,
241
425
  Timeout: cr.rlm.timeoutSeconds,
242
- ExtraParams: cr.rlm.extraParams,
426
+ ExtraParams: mapPhaseParams,
243
427
  })
244
428
  if err != nil {
245
429
  errs[idx] = fmt.Errorf("map phase chunk %d: %w", idx+1, err)
@@ -254,14 +438,22 @@ func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, model
254
438
 
255
439
  wg.Wait()
256
440
 
257
- // Check for errors
441
+ // Check for errors - if map phase overflows, fall back to tfidf
258
442
  var mapErrors []string
443
+ hasOverflow := false
259
444
  for _, err := range errs {
260
445
  if err != nil {
261
446
  mapErrors = append(mapErrors, err.Error())
447
+ if _, isOverflow := IsContextOverflow(err); isOverflow {
448
+ hasOverflow = true
449
+ }
262
450
  }
263
451
  }
264
452
  if len(mapErrors) > 0 {
453
+ if hasOverflow {
454
+ cr.obs.Debug("overflow", "MapReduce map phase hit overflow, falling back to TF-IDF strategy")
455
+ return cr.reduceByTFIDF(strings.Join(chunks, "\n\n"), modelLimit, overhead)
456
+ }
265
457
  return "", fmt.Errorf("MapReduce map phase failed: %s", strings.Join(mapErrors, "; "))
266
458
  }
267
459
 
@@ -306,6 +498,9 @@ func (cr *contextReducer) reduceByTruncation(context string, modelLimit int, ove
306
498
  func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []string, modelLimit int, overhead int) (string, error) {
307
499
  cr.obs.Debug("overflow", "Using chunked extraction strategy with %d chunks", len(chunks))
308
500
 
501
+ // Use map-phase-specific params with reduced max_tokens
502
+ mapPhaseParams := cr.makeMapPhaseParams(modelLimit)
503
+
309
504
  results := make([]string, len(chunks))
310
505
  errs := make([]error, len(chunks))
311
506
  var wg sync.WaitGroup
@@ -333,7 +528,7 @@ func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []strin
333
528
  APIBase: cr.rlm.apiBase,
334
529
  APIKey: cr.rlm.apiKey,
335
530
  Timeout: cr.rlm.timeoutSeconds,
336
- ExtraParams: cr.rlm.extraParams,
531
+ ExtraParams: mapPhaseParams,
337
532
  })
338
533
  if err != nil {
339
534
  errs[idx] = fmt.Errorf("chunked extraction chunk %d: %w", idx+1, err)
@@ -350,12 +545,20 @@ func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []strin
350
545
  wg.Wait()
351
546
 
352
547
  var extractErrors []string
548
+ hasOverflow := false
353
549
  for _, err := range errs {
354
550
  if err != nil {
355
551
  extractErrors = append(extractErrors, err.Error())
552
+ if _, isOverflow := IsContextOverflow(err); isOverflow {
553
+ hasOverflow = true
554
+ }
356
555
  }
357
556
  }
358
557
  if len(extractErrors) > 0 {
558
+ if hasOverflow {
559
+ cr.obs.Debug("overflow", "Chunked extraction hit overflow, falling back to TF-IDF strategy")
560
+ return cr.reduceByTFIDF(strings.Join(chunks, "\n\n"), modelLimit, overhead)
561
+ }
359
562
  return "", fmt.Errorf("chunked extraction failed: %s", strings.Join(extractErrors, "; "))
360
563
  }
361
564
 
@@ -387,6 +590,9 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
387
590
  return "", fmt.Errorf("refine strategy: no chunks to process")
388
591
  }
389
592
 
593
+ // Use map-phase-specific params with reduced max_tokens
594
+ mapPhaseParams := cr.makeMapPhaseParams(modelLimit)
595
+
390
596
  // Phase 1: Generate initial answer from the first chunk
391
597
  initialPrompt := fmt.Sprintf(
392
598
  "Using the following context, provide a comprehensive answer to the question.\n"+
@@ -406,7 +612,7 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
406
612
  APIBase: cr.rlm.apiBase,
407
613
  APIKey: cr.rlm.apiKey,
408
614
  Timeout: cr.rlm.timeoutSeconds,
409
- ExtraParams: cr.rlm.extraParams,
615
+ ExtraParams: mapPhaseParams,
410
616
  })
411
617
  if err != nil {
412
618
  return "", fmt.Errorf("refine initial chunk: %w", err)
@@ -438,7 +644,7 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
438
644
  APIBase: cr.rlm.apiBase,
439
645
  APIKey: cr.rlm.apiKey,
440
646
  Timeout: cr.rlm.timeoutSeconds,
441
- ExtraParams: cr.rlm.extraParams,
647
+ ExtraParams: mapPhaseParams,
442
648
  })
443
649
  if err != nil {
444
650
  cr.obs.Debug("overflow", "Refine: chunk %d/%d failed: %v, keeping current answer", i+1, len(chunks), err)
@@ -495,72 +701,3 @@ func (cr *contextReducer) reduceByTextRank(context string, modelLimit int, overh
495
701
  return result, nil
496
702
  }
497
703
 
498
- // ─── Adaptive Completion with Overflow Recovery ──────────────────────────────
499
-
500
- // completionWithOverflowRecovery wraps a completion call with automatic overflow detection and retry.
501
- // When a context overflow error is detected, it reduces the context and retries.
502
- func (r *RLM) completionWithOverflowRecovery(query string, context string, overflowConfig ContextOverflowConfig) (string, RLMStats, error) {
503
- obs := r.observer
504
- if obs == nil {
505
- obs = NewNoopObserver()
506
- }
507
-
508
- // Try the normal completion first
509
- result, stats, err := r.Completion(query, context)
510
- if err == nil {
511
- return result, stats, nil
512
- }
513
-
514
- // Check if it's a context overflow error
515
- coe, isOverflow := IsContextOverflow(err)
516
- if !isOverflow {
517
- return "", stats, err // Not an overflow error, return original error
518
- }
519
-
520
- obs.Debug("overflow", "Context overflow detected: model limit %d, request %d tokens (%.1f%% over)",
521
- coe.ModelLimit, coe.RequestTokens, (coe.OverflowRatio()-1)*100)
522
-
523
- // Use detected limit or configured limit
524
- modelLimit := coe.ModelLimit
525
- if overflowConfig.MaxModelTokens > 0 {
526
- modelLimit = overflowConfig.MaxModelTokens
527
- }
528
-
529
- reducer := newContextReducer(r, overflowConfig, obs)
530
-
531
- // Attempt context reduction and retry
532
- for attempt := 0; attempt < overflowConfig.MaxReductionAttempts; attempt++ {
533
- obs.Debug("overflow", "Reduction attempt %d/%d", attempt+1, overflowConfig.MaxReductionAttempts)
534
-
535
- reducedContext, reduceErr := reducer.ReduceForCompletion(query, context, modelLimit)
536
- if reduceErr != nil {
537
- obs.Error("overflow", "Context reduction failed: %v", reduceErr)
538
- return "", stats, fmt.Errorf("context overflow recovery failed: %w", reduceErr)
539
- }
540
-
541
- obs.Debug("overflow", "Context reduced: %d -> %d chars", len(context), len(reducedContext))
542
-
543
- // Retry with reduced context
544
- result, stats, err = r.Completion(query, reducedContext)
545
- if err == nil {
546
- obs.Event("overflow.recovery_success", map[string]string{
547
- "attempt": fmt.Sprintf("%d", attempt+1),
548
- "original_chars": fmt.Sprintf("%d", len(context)),
549
- "reduced_chars": fmt.Sprintf("%d", len(reducedContext)),
550
- "reduction_ratio": fmt.Sprintf("%.2f", float64(len(reducedContext))/float64(len(context))),
551
- })
552
- return result, stats, nil
553
- }
554
-
555
- // If it overflows again, use the reduced context for the next attempt
556
- if _, stillOverflow := IsContextOverflow(err); stillOverflow {
557
- context = reducedContext
558
- continue
559
- }
560
-
561
- // Different error, return it
562
- return "", stats, err
563
- }
564
-
565
- return "", stats, fmt.Errorf("context overflow: exceeded %d reduction attempts, model limit is %d tokens", overflowConfig.MaxReductionAttempts, modelLimit)
566
- }
@@ -107,6 +107,124 @@ func TestIsContextOverflow_GenericError(t *testing.T) {
107
107
  }
108
108
  }
109
109
 
110
+ func TestIsContextOverflow_MaxTokensTooLarge_vLLM(t *testing.T) {
111
+ // vLLM/Ray Serve error when max_tokens exceeds remaining capacity
112
+ // This is the exact error from the user's production logs
113
+ response := `{"object":"error","message":"'max_tokens' or 'max_completion_tokens' is too large: 10000. This model's maximum context length is 32768 tokens and your request has 30168 input tokens (10000 > 32768 - 30168)","type":"BadRequestError","param":null,"code":400}`
114
+ apiErr := NewAPIError(400, response)
115
+
116
+ coe, ok := IsContextOverflow(apiErr)
117
+ if !ok {
118
+ t.Fatal("expected IsContextOverflow to detect max_tokens too large error")
119
+ }
120
+ if coe.ModelLimit != 32768 {
121
+ t.Errorf("expected ModelLimit 32768, got %d", coe.ModelLimit)
122
+ }
123
+ // Request tokens should include both input + max_tokens: 30168 + 10000 = 40168
124
+ if coe.RequestTokens != 40168 {
125
+ t.Errorf("expected RequestTokens 40168 (input 30168 + max_tokens 10000), got %d", coe.RequestTokens)
126
+ }
127
+ }
128
+
129
+ func TestIsContextOverflow_MaxCompletionTokensTooLarge(t *testing.T) {
130
+ // OpenAI newer API format with max_completion_tokens
131
+ response := `{"error":{"message":"'max_tokens' or 'max_completion_tokens' is too large: 5000. This model's maximum context length is 16384 tokens and your request has 14000 input tokens","type":"invalid_request_error","code":"invalid_request_error"}}`
132
+ apiErr := NewAPIError(400, response)
133
+
134
+ coe, ok := IsContextOverflow(apiErr)
135
+ if !ok {
136
+ t.Fatal("expected IsContextOverflow to detect max_completion_tokens too large error")
137
+ }
138
+ if coe.ModelLimit != 16384 {
139
+ t.Errorf("expected ModelLimit 16384, got %d", coe.ModelLimit)
140
+ }
141
+ if coe.RequestTokens != 19000 {
142
+ t.Errorf("expected RequestTokens 19000 (input 14000 + max_tokens 5000), got %d", coe.RequestTokens)
143
+ }
144
+ }
145
+
146
+ func TestGetResponseTokenBudget(t *testing.T) {
147
+ rlm := &RLM{
148
+ extraParams: map[string]interface{}{
149
+ "max_tokens": float64(10000),
150
+ },
151
+ }
152
+ obs := NewNoopObserver()
153
+ config := DefaultContextOverflowConfig()
154
+ reducer := newContextReducer(rlm, config, obs)
155
+
156
+ budget := reducer.getResponseTokenBudget()
157
+ if budget != 10000 {
158
+ t.Errorf("expected response token budget 10000, got %d", budget)
159
+ }
160
+ }
161
+
162
+ func TestGetResponseTokenBudget_MaxCompletionTokens(t *testing.T) {
163
+ rlm := &RLM{
164
+ extraParams: map[string]interface{}{
165
+ "max_completion_tokens": float64(5000),
166
+ },
167
+ }
168
+ obs := NewNoopObserver()
169
+ config := DefaultContextOverflowConfig()
170
+ reducer := newContextReducer(rlm, config, obs)
171
+
172
+ budget := reducer.getResponseTokenBudget()
173
+ if budget != 5000 {
174
+ t.Errorf("expected response token budget 5000, got %d", budget)
175
+ }
176
+ }
177
+
178
+ func TestGetResponseTokenBudget_NoMaxTokens(t *testing.T) {
179
+ rlm := &RLM{
180
+ extraParams: map[string]interface{}{
181
+ "temperature": 0.7,
182
+ },
183
+ }
184
+ obs := NewNoopObserver()
185
+ config := DefaultContextOverflowConfig()
186
+ reducer := newContextReducer(rlm, config, obs)
187
+
188
+ budget := reducer.getResponseTokenBudget()
189
+ if budget != 0 {
190
+ t.Errorf("expected response token budget 0, got %d", budget)
191
+ }
192
+ }
193
+
194
+ func TestMakeMapPhaseParams(t *testing.T) {
195
+ rlm := &RLM{
196
+ extraParams: map[string]interface{}{
197
+ "max_tokens": float64(10000),
198
+ "custom_llm_provider": "vllm",
199
+ "temperature": 0.7,
200
+ },
201
+ }
202
+ obs := NewNoopObserver()
203
+ config := DefaultContextOverflowConfig()
204
+ reducer := newContextReducer(rlm, config, obs)
205
+
206
+ params := reducer.makeMapPhaseParams(32768)
207
+
208
+ // max_tokens should be capped (32768/4 = 8192, but cap is 2000)
209
+ maxTokens, ok := params["max_tokens"].(int)
210
+ if !ok {
211
+ t.Fatal("expected max_tokens to be int in map phase params")
212
+ }
213
+ if maxTokens > 2000 {
214
+ t.Errorf("expected map phase max_tokens <= 2000, got %d", maxTokens)
215
+ }
216
+
217
+ // custom_llm_provider should be preserved
218
+ if params["custom_llm_provider"] != "vllm" {
219
+ t.Errorf("expected custom_llm_provider to be preserved, got %v", params["custom_llm_provider"])
220
+ }
221
+
222
+ // temperature should be preserved
223
+ if params["temperature"] != 0.7 {
224
+ t.Errorf("expected temperature to be preserved, got %v", params["temperature"])
225
+ }
226
+ }
227
+
110
228
  func TestContextOverflowError_OverflowRatio(t *testing.T) {
111
229
  tests := []struct {
112
230
  limit int
@@ -526,10 +644,10 @@ func TestContextOverflowError_ErrorChain(t *testing.T) {
526
644
  if coe.APIError == nil {
527
645
  t.Fatal("expected embedded APIError to be non-nil")
528
646
  }
529
- if coe.APIError.StatusCode != 400 {
530
- t.Errorf("expected status 400, got %d", coe.APIError.StatusCode)
647
+ if coe.StatusCode != 400 {
648
+ t.Errorf("expected status 400, got %d", coe.StatusCode)
531
649
  }
532
- if coe.APIError.RLMError == nil {
650
+ if coe.RLMError == nil {
533
651
  t.Fatal("expected embedded RLMError to be non-nil")
534
652
  }
535
653
 
@@ -781,3 +899,373 @@ func TestReduceForCompletion_DispatchesTextRank(t *testing.T) {
781
899
  t.Errorf("expected reduced context for textrank strategy")
782
900
  }
783
901
  }
902
+
903
+ // ─── Model Token Limits Tests ────────────────────────────────────────────────
904
+
905
+ func TestLookupModelTokenLimit_ExactMatch(t *testing.T) {
906
+ tests := []struct {
907
+ model string
908
+ expected int
909
+ }{
910
+ {"gpt-4o", 128000},
911
+ {"gpt-4o-mini", 128000},
912
+ {"gpt-4", 8192},
913
+ {"gpt-4-32k", 32768},
914
+ {"gpt-3.5-turbo", 16385},
915
+ {"claude-3-opus", 200000},
916
+ {"claude-sonnet-4", 200000},
917
+ {"mistral-7b", 32768},
918
+ }
919
+
920
+ for _, tt := range tests {
921
+ limit := LookupModelTokenLimit(tt.model)
922
+ if limit != tt.expected {
923
+ t.Errorf("LookupModelTokenLimit(%q) = %d, expected %d", tt.model, limit, tt.expected)
924
+ }
925
+ }
926
+ }
927
+
928
+ func TestLookupModelTokenLimit_PrefixMatch(t *testing.T) {
929
+ // Versioned model names should match by prefix
930
+ tests := []struct {
931
+ model string
932
+ expected int
933
+ }{
934
+ {"gpt-4o-mini-2024-07-18", 128000},
935
+ {"gpt-4o-2024-05-13", 128000},
936
+ {"claude-3-opus-20240229", 200000},
937
+ {"mistral-7b-instruct-v0.2", 32768},
938
+ }
939
+
940
+ for _, tt := range tests {
941
+ limit := LookupModelTokenLimit(tt.model)
942
+ if limit != tt.expected {
943
+ t.Errorf("LookupModelTokenLimit(%q) = %d, expected %d", tt.model, limit, tt.expected)
944
+ }
945
+ }
946
+ }
947
+
948
+ func TestLookupModelTokenLimit_Unknown(t *testing.T) {
949
+ limit := LookupModelTokenLimit("completely-unknown-model-xyz")
950
+ if limit != 0 {
951
+ t.Errorf("expected 0 for unknown model, got %d", limit)
952
+ }
953
+ }
954
+
955
+ func TestLookupModelTokenLimit_CaseInsensitive(t *testing.T) {
956
+ limit := LookupModelTokenLimit("GPT-4O-MINI")
957
+ if limit != 128000 {
958
+ t.Errorf("expected 128000 for case-insensitive match, got %d", limit)
959
+ }
960
+ }
961
+
962
+ func TestGetModelTokenLimit_ConfigOverride(t *testing.T) {
963
+ engine := New("gpt-4o-mini", Config{
964
+ APIKey: "test",
965
+ ContextOverflow: &ContextOverflowConfig{
966
+ Enabled: true,
967
+ MaxModelTokens: 16384,
968
+ },
969
+ })
970
+
971
+ limit := engine.getModelTokenLimit()
972
+ if limit != 16384 {
973
+ t.Errorf("expected config override 16384, got %d", limit)
974
+ }
975
+ }
976
+
977
+ func TestGetModelTokenLimit_ModelLookup(t *testing.T) {
978
+ engine := New("gpt-4o-mini", Config{
979
+ APIKey: "test",
980
+ })
981
+
982
+ limit := engine.getModelTokenLimit()
983
+ if limit != 128000 {
984
+ t.Errorf("expected model lookup 128000, got %d", limit)
985
+ }
986
+ }
987
+
988
+ func TestGetModelTokenLimit_UnknownModel(t *testing.T) {
989
+ engine := New("custom-local-model", Config{
990
+ APIKey: "test",
991
+ })
992
+
993
+ limit := engine.getModelTokenLimit()
994
+ if limit != 0 {
995
+ t.Errorf("expected 0 for unknown model, got %d", limit)
996
+ }
997
+ }
998
+
999
+ // ─── Pre-emptive Overflow Tests ──────────────────────────────────────────────
1000
+
1001
+ func TestPreemptiveReduceContext_SmallContext(t *testing.T) {
1002
+ engine := New("gpt-4o-mini", Config{
1003
+ APIKey: "test",
1004
+ })
1005
+
1006
+ // Small context should pass through unchanged
1007
+ context := "This is a small context that easily fits."
1008
+ reduced, wasReduced, err := engine.PreemptiveReduceContext("What is this?", context, 500)
1009
+ if err != nil {
1010
+ t.Fatalf("unexpected error: %v", err)
1011
+ }
1012
+ if wasReduced {
1013
+ t.Error("expected no reduction for small context")
1014
+ }
1015
+ if reduced != context {
1016
+ t.Error("expected context to be unchanged")
1017
+ }
1018
+ }
1019
+
1020
+ func TestPreemptiveReduceContext_LargeContext(t *testing.T) {
1021
+ engine := New("gpt-4o-mini", Config{
1022
+ APIKey: "test",
1023
+ ContextOverflow: &ContextOverflowConfig{
1024
+ Enabled: true,
1025
+ MaxModelTokens: 1000, // Very small limit to force overflow
1026
+ Strategy: "truncate",
1027
+ SafetyMargin: 0.15,
1028
+ },
1029
+ })
1030
+
1031
+ // Create large context that exceeds the 1000 token limit
1032
+ context := strings.Repeat("The revenue for Q4 was $4.2 billion, representing 23% year-over-year growth. ", 100)
1033
+
1034
+ reduced, wasReduced, err := engine.PreemptiveReduceContext("Summarize revenue", context, 300)
1035
+ if err != nil {
1036
+ t.Fatalf("unexpected error: %v", err)
1037
+ }
1038
+ if !wasReduced {
1039
+ t.Error("expected context to be reduced")
1040
+ }
1041
+ if len(reduced) >= len(context) {
1042
+ t.Errorf("expected reduced context to be shorter: %d >= %d", len(reduced), len(context))
1043
+ }
1044
+ }
1045
+
1046
+ func TestPreemptiveReduceContext_DisabledOverflow(t *testing.T) {
1047
+ engine := New("gpt-4o-mini", Config{
1048
+ APIKey: "test",
1049
+ ContextOverflow: &ContextOverflowConfig{
1050
+ Enabled: false,
1051
+ },
1052
+ })
1053
+
1054
+ context := strings.Repeat("Large content. ", 10000)
1055
+ reduced, wasReduced, err := engine.PreemptiveReduceContext("query", context, 500)
1056
+ if err != nil {
1057
+ t.Fatalf("unexpected error: %v", err)
1058
+ }
1059
+ if wasReduced {
1060
+ t.Error("expected no reduction when overflow is disabled")
1061
+ }
1062
+ if reduced != context {
1063
+ t.Error("expected context unchanged when overflow is disabled")
1064
+ }
1065
+ }
1066
+
1067
+ func TestPreemptiveReduceContext_UnknownModel(t *testing.T) {
1068
+ engine := New("custom-local-model", Config{
1069
+ APIKey: "test",
1070
+ })
1071
+
1072
+ // Unknown model with no config override → no pre-emptive check
1073
+ context := strings.Repeat("Large content. ", 10000)
1074
+ reduced, wasReduced, err := engine.PreemptiveReduceContext("query", context, 500)
1075
+ if err != nil {
1076
+ t.Fatalf("unexpected error: %v", err)
1077
+ }
1078
+ if wasReduced {
1079
+ t.Error("expected no reduction for unknown model with no config limit")
1080
+ }
1081
+ if reduced != context {
1082
+ t.Error("expected context unchanged")
1083
+ }
1084
+ }
1085
+
1086
+ func TestPreemptiveReduceContext_AccountsForResponseBudget(t *testing.T) {
1087
+ // With a high max_tokens, even moderate context should trigger reduction
1088
+ engine := New("gpt-4o-mini", Config{
1089
+ APIKey: "test",
1090
+ ContextOverflow: &ContextOverflowConfig{
1091
+ Enabled: true,
1092
+ MaxModelTokens: 2000,
1093
+ Strategy: "truncate",
1094
+ SafetyMargin: 0.15,
1095
+ },
1096
+ ExtraParams: map[string]interface{}{
1097
+ "max_tokens": float64(1000), // Large response budget
1098
+ },
1099
+ })
1100
+
1101
+ // Context of ~500 tokens + max_tokens 1000 + overhead = exceeds 2000
1102
+ context := strings.Repeat("Revenue data: the company earned $4.2B in Q4 fiscal year. ", 30)
1103
+
1104
+ reduced, wasReduced, err := engine.PreemptiveReduceContext("Summarize", context, 300)
1105
+ if err != nil {
1106
+ t.Fatalf("unexpected error: %v", err)
1107
+ }
1108
+ if !wasReduced {
1109
+ t.Error("expected reduction when response budget + context exceeds limit")
1110
+ }
1111
+ if len(reduced) >= len(context) {
1112
+ t.Errorf("expected reduced context: %d >= %d", len(reduced), len(context))
1113
+ }
1114
+ }
1115
+
1116
+ func TestPreemptiveReduceContext_TFIDFStrategy(t *testing.T) {
1117
+ engine := New("gpt-4o-mini", Config{
1118
+ APIKey: "test",
1119
+ ContextOverflow: &ContextOverflowConfig{
1120
+ Enabled: true,
1121
+ MaxModelTokens: 500,
1122
+ Strategy: "tfidf",
1123
+ SafetyMargin: 0.15,
1124
+ },
1125
+ })
1126
+
1127
+ context := strings.Repeat("Machine learning models process large datasets effectively. ", 100)
1128
+
1129
+ reduced, wasReduced, err := engine.PreemptiveReduceContext("Tell me about ML", context, 200)
1130
+ if err != nil {
1131
+ t.Fatalf("unexpected error: %v", err)
1132
+ }
1133
+ if !wasReduced {
1134
+ t.Error("expected reduction with tfidf strategy")
1135
+ }
1136
+ if len(reduced) >= len(context) {
1137
+ t.Errorf("expected shorter context: %d >= %d", len(reduced), len(context))
1138
+ }
1139
+ }
1140
+
1141
+ func TestPreemptiveReduceContext_TextRankStrategy(t *testing.T) {
1142
+ engine := New("gpt-4o-mini", Config{
1143
+ APIKey: "test",
1144
+ ContextOverflow: &ContextOverflowConfig{
1145
+ Enabled: true,
1146
+ MaxModelTokens: 500,
1147
+ Strategy: "textrank",
1148
+ SafetyMargin: 0.15,
1149
+ },
1150
+ })
1151
+
1152
+ context := strings.Repeat("Neural networks are powerful computation models. ", 100)
1153
+
1154
+ reduced, wasReduced, err := engine.PreemptiveReduceContext("Explain neural nets", context, 200)
1155
+ if err != nil {
1156
+ t.Fatalf("unexpected error: %v", err)
1157
+ }
1158
+ if !wasReduced {
1159
+ t.Error("expected reduction with textrank strategy")
1160
+ }
1161
+ if len(reduced) >= len(context) {
1162
+ t.Errorf("expected shorter context: %d >= %d", len(reduced), len(context))
1163
+ }
1164
+ }
1165
+
1166
+ func TestGetResponseTokenBudget_RLMMethod(t *testing.T) {
1167
+ engine := &RLM{
1168
+ extraParams: map[string]interface{}{
1169
+ "max_tokens": float64(5000),
1170
+ },
1171
+ }
1172
+ budget := engine.getResponseTokenBudget()
1173
+ if budget != 5000 {
1174
+ t.Errorf("expected 5000, got %d", budget)
1175
+ }
1176
+ }
1177
+
1178
+ func TestGetResponseTokenBudget_MaxCompletionTokensPreferred(t *testing.T) {
1179
+ engine := &RLM{
1180
+ extraParams: map[string]interface{}{
1181
+ "max_tokens": float64(5000),
1182
+ "max_completion_tokens": float64(8000),
1183
+ },
1184
+ }
1185
+ budget := engine.getResponseTokenBudget()
1186
+ if budget != 8000 {
1187
+ t.Errorf("expected max_completion_tokens=8000 preferred, got %d", budget)
1188
+ }
1189
+ }
1190
+
1191
+ func TestGetResponseTokenBudget_NoParams(t *testing.T) {
1192
+ engine := &RLM{
1193
+ extraParams: map[string]interface{}{
1194
+ "temperature": 0.7,
1195
+ },
1196
+ }
1197
+ budget := engine.getResponseTokenBudget()
1198
+ if budget != 0 {
1199
+ t.Errorf("expected 0 when no max_tokens set, got %d", budget)
1200
+ }
1201
+ }
1202
+
1203
+ // ─── Message Pruning Tests ───────────────────────────────────────────────────
1204
+
1205
+ func TestPruneMessages_SmallHistory(t *testing.T) {
1206
+ messages := []Message{
1207
+ {Role: "system", Content: "You are helpful."},
1208
+ {Role: "user", Content: "Hello"},
1209
+ {Role: "assistant", Content: "Hi there!"},
1210
+ }
1211
+
1212
+ result := pruneMessages(messages, 100)
1213
+ if len(result) != 3 {
1214
+ t.Errorf("expected 3 messages (no pruning needed), got %d", len(result))
1215
+ }
1216
+ }
1217
+
1218
+ func TestPruneMessages_PreservesSystemAndLast(t *testing.T) {
1219
+ messages := []Message{
1220
+ {Role: "system", Content: "System prompt"},
1221
+ {Role: "user", Content: "First question"},
1222
+ {Role: "assistant", Content: "First answer"},
1223
+ {Role: "user", Content: "Second question"},
1224
+ {Role: "assistant", Content: "Second answer"},
1225
+ {Role: "user", Content: strings.Repeat("Third question with lots of context. ", 100)},
1226
+ {Role: "assistant", Content: "Third answer"},
1227
+ }
1228
+
1229
+ result := pruneMessages(messages, 50) // Very tight budget
1230
+
1231
+ // Should always keep system prompt (first) and last 2 messages
1232
+ if len(result) < 3 {
1233
+ t.Errorf("expected at least 3 messages, got %d", len(result))
1234
+ }
1235
+ if result[0].Role != "system" {
1236
+ t.Error("first message should be system prompt")
1237
+ }
1238
+ if result[len(result)-1].Content != "Third answer" {
1239
+ t.Error("last message should be the most recent")
1240
+ }
1241
+ if result[len(result)-2].Role != "user" {
1242
+ t.Error("second-to-last should be the most recent user message")
1243
+ }
1244
+ }
1245
+
1246
+ func TestPruneMessages_KeepsRecentMiddleMessages(t *testing.T) {
1247
+ messages := []Message{
1248
+ {Role: "system", Content: "Short."},
1249
+ {Role: "user", Content: "Q1"},
1250
+ {Role: "assistant", Content: "A1"},
1251
+ {Role: "user", Content: "Q2"},
1252
+ {Role: "assistant", Content: "A2"},
1253
+ {Role: "user", Content: "Q3"},
1254
+ {Role: "assistant", Content: "A3"},
1255
+ }
1256
+
1257
+ // Budget large enough for all
1258
+ result := pruneMessages(messages, 10000)
1259
+ if len(result) != 7 {
1260
+ t.Errorf("expected all 7 messages with large budget, got %d", len(result))
1261
+ }
1262
+ }
1263
+
1264
+ // ─── Structured Completion Pre-emptive Integration Tests ─────────────────────
1265
+
1266
+ func TestStructuredPromptOverhead_Constant(t *testing.T) {
1267
+ // Verify the constant is reasonable (300-500 tokens for structured prompt instructions)
1268
+ if structuredPromptOverhead < 200 || structuredPromptOverhead > 600 {
1269
+ t.Errorf("structuredPromptOverhead=%d seems out of range (expected 200-600)", structuredPromptOverhead)
1270
+ }
1271
+ }
package/go/rlm/errors.go CHANGED
@@ -192,7 +192,31 @@ func parseContextOverflowMessage(msg string) (modelLimit int, requestTokens int,
192
192
  }
193
193
  }
194
194
 
195
- // Pattern 3: "max_tokens" / "input too long" generic patterns
195
+ // Pattern 3: "max_tokens is too large" - response budget exceeds remaining capacity
196
+ // vLLM/OpenAI: "'max_tokens' or 'max_completion_tokens' is too large: 10000.
197
+ // This model's maximum context length is 32768 tokens and your request has 30168 input tokens"
198
+ // In this case, input tokens < model limit, but input + max_tokens > model limit.
199
+ // We report the effective total (input + max_tokens) as requestTokens.
200
+ if strings.Contains(lowerMsg, "max_tokens") && strings.Contains(lowerMsg, "too large") {
201
+ limit := extractNumber(msg, "maximum context length is ", " tokens")
202
+ inputTokens := extractNumber(msg, "your request has ", " input tokens")
203
+ if inputTokens == 0 {
204
+ inputTokens = extractNumber(msg, "your request has ", " tokens")
205
+ }
206
+ maxTokens := extractNumber(msg, "too large: ", ".")
207
+ if maxTokens == 0 {
208
+ maxTokens = extractNumber(msg, "too large: ", " ")
209
+ }
210
+ if limit > 0 && inputTokens > 0 && maxTokens > 0 {
211
+ return limit, inputTokens + maxTokens, true
212
+ }
213
+ // Fallback: if we got limit and input tokens, treat input as the overflow
214
+ if limit > 0 && inputTokens > 0 {
215
+ return limit, inputTokens, true
216
+ }
217
+ }
218
+
219
+ // Pattern 4: "input too long" / "too many tokens" generic patterns
196
220
  if strings.Contains(lowerMsg, "input too long") || strings.Contains(lowerMsg, "too many tokens") || strings.Contains(lowerMsg, "too many input tokens") {
197
221
  limit := extractNumber(msg, "limit is ", " tokens")
198
222
  if limit == 0 {
package/go/rlm/rlm.go CHANGED
@@ -109,8 +109,41 @@ func (r *RLM) Completion(query string, context string) (string, RLMStats, error)
109
109
  r.stats.Iterations = iteration + 1
110
110
  r.observer.Debug("rlm", "Iteration %d/%d at depth %d", iteration+1, r.maxIterations, r.currentDepth)
111
111
 
112
+ // Pre-emptive message overflow check: prune older messages if history is growing too large.
113
+ // Regular completion stores context in the REPL env (not messages), but the iterative
114
+ // loop appends assistant+user messages each iteration which can accumulate.
115
+ if modelLimit := r.getModelTokenLimit(); modelLimit > 0 && len(messages) > 4 {
116
+ msgTokens := EstimateMessagesTokens(messages)
117
+ responseTokens := r.getResponseTokenBudget()
118
+ safetyMargin := 0.15
119
+ if r.contextOverflow != nil && r.contextOverflow.SafetyMargin > 0 {
120
+ safetyMargin = r.contextOverflow.SafetyMargin
121
+ }
122
+ available := modelLimit - responseTokens - int(float64(modelLimit)*safetyMargin)
123
+ if msgTokens > available {
124
+ r.observer.Debug("rlm", "Message history overflow: %d tokens > %d available, pruning middle messages", msgTokens, available)
125
+ messages = pruneMessages(messages, available)
126
+ }
127
+ }
128
+
112
129
  response, err := r.callLLM(messages)
113
130
  if err != nil {
131
+ // Check for context overflow and attempt recovery
132
+ if r.contextOverflow != nil && r.contextOverflow.Enabled {
133
+ if _, isOverflow := IsContextOverflow(err); isOverflow && len(messages) > 4 {
134
+ r.observer.Debug("rlm", "Context overflow on iteration %d, pruning messages and retrying", iteration+1)
135
+ modelLimit := r.getModelTokenLimit()
136
+ if modelLimit == 0 {
137
+ modelLimit = 32768 // Reasonable default
138
+ }
139
+ responseTokens := r.getResponseTokenBudget()
140
+ available := modelLimit - responseTokens - int(float64(modelLimit)*0.15)
141
+ messages = pruneMessages(messages, available)
142
+ // Retry this iteration
143
+ iteration--
144
+ continue
145
+ }
146
+ }
114
147
  r.observer.Error("rlm", "LLM call failed on iteration %d: %v", iteration+1, err)
115
148
  return "", r.stats, err
116
149
  }
@@ -214,6 +247,48 @@ func (r *RLM) buildREPLEnv(query string, context string) map[string]interface{}
214
247
  return env
215
248
  }
216
249
 
250
+ // pruneMessages removes older middle messages to fit within a token budget.
251
+ // Preserves the first message (system prompt) and the last 2 messages (most recent exchange).
252
+ func pruneMessages(messages []Message, targetTokens int) []Message {
253
+ if len(messages) <= 3 {
254
+ return messages
255
+ }
256
+
257
+ // Always keep: system prompt (first), last 2 messages (most recent exchange)
258
+ system := messages[0]
259
+ lastN := messages[len(messages)-2:]
260
+
261
+ // Start with the preserved messages
262
+ result := []Message{system}
263
+ currentTokens := EstimateMessagesTokens(append(result, lastN...))
264
+
265
+ if currentTokens >= targetTokens {
266
+ // Even the minimum set exceeds the budget; return it anyway
267
+ return append(result, lastN...)
268
+ }
269
+
270
+ // Add middle messages from most recent to oldest until budget is exceeded
271
+ middle := messages[1 : len(messages)-2]
272
+ for i := len(middle) - 1; i >= 0; i-- {
273
+ msgTokens := 4 + EstimateTokens(middle[i].Content)
274
+ if currentTokens+msgTokens > targetTokens {
275
+ break
276
+ }
277
+ result = append(result, middle[i])
278
+ currentTokens += msgTokens
279
+ }
280
+
281
+ // Reverse the added middle messages (they were added newest-first)
282
+ if len(result) > 1 {
283
+ added := result[1:]
284
+ for i, j := 0, len(added)-1; i < j; i, j = i+1, j-1 {
285
+ added[i], added[j] = added[j], added[i]
286
+ }
287
+ }
288
+
289
+ return append(result, lastN...)
290
+ }
291
+
217
292
  // GetObserver returns the observer for external access to events/traces.
218
293
  func (r *RLM) GetObserver() *Observer {
219
294
  return r.observer
@@ -46,6 +46,20 @@ func (r *RLM) StructuredCompletion(query string, context string, config *Structu
46
46
  subTasks := decomposeSchema(config.Schema)
47
47
  r.observer.Debug("structured", "Schema decomposed into %d subtasks", len(subTasks))
48
48
 
49
+ // Pre-emptive overflow check: reduce context BEFORE building the prompt.
50
+ // Structured completion embeds the full context in the user message, so this is
51
+ // critical to prevent overflow on the first LLM call (following the RLM paper's
52
+ // principle: "the context window of the root LM is rarely clogged").
53
+ schemaJSON, _ := json.Marshal(config.Schema)
54
+ schemaOverhead := EstimateTokens(string(schemaJSON)) + structuredPromptOverhead
55
+ reducedCtx, wasReduced, reduceErr := r.PreemptiveReduceContext(query, context, schemaOverhead)
56
+ if reduceErr != nil {
57
+ r.observer.Error("structured", "Pre-emptive reduction failed: %v (proceeding with original context)", reduceErr)
58
+ } else if wasReduced {
59
+ r.observer.Debug("structured", "Pre-emptive reduction applied: %d -> %d chars", len(context), len(reducedCtx))
60
+ context = reducedCtx
61
+ }
62
+
49
63
  // If simple schema or parallel disabled, use direct method
50
64
  if len(subTasks) <= 2 || !config.ParallelExecution {
51
65
  r.observer.Debug("structured", "Using direct completion method")
@@ -843,7 +857,7 @@ func buildValidationFeedback(validationErr error, schema *JSONSchema, previousRe
843
857
 
844
858
  var feedback strings.Builder
845
859
  feedback.WriteString("VALIDATION ERROR - Your previous response was invalid.\n\n")
846
- feedback.WriteString(fmt.Sprintf("ERROR: %s\n\n", errMsg))
860
+ fmt.Fprintf(&feedback, "ERROR: %s\n\n", errMsg)
847
861
 
848
862
  // Extract what field caused the issue
849
863
  if strings.Contains(errMsg, "missing required field:") {
@@ -852,17 +866,17 @@ func buildValidationFeedback(validationErr error, schema *JSONSchema, previousRe
852
866
  fieldName = strings.TrimSpace(fieldName)
853
867
 
854
868
  feedback.WriteString("SPECIFIC ISSUE:\n")
855
- feedback.WriteString(fmt.Sprintf("The field '%s' is REQUIRED but was not provided.\n\n", fieldName))
869
+ fmt.Fprintf(&feedback, "The field '%s' is REQUIRED but was not provided.\n\n", fieldName)
856
870
 
857
871
  // Find the schema for this field and provide details
858
872
  if schema.Type == "object" && schema.Properties != nil {
859
873
  if fieldSchema, exists := schema.Properties[fieldName]; exists {
860
874
  feedback.WriteString("FIELD REQUIREMENTS:\n")
861
- feedback.WriteString(fmt.Sprintf("- Field name: '%s'\n", fieldName))
862
- feedback.WriteString(fmt.Sprintf("- Type: %s\n", fieldSchema.Type))
875
+ fmt.Fprintf(&feedback, "- Field name: '%s'\n", fieldName)
876
+ fmt.Fprintf(&feedback, "- Type: %s\n", fieldSchema.Type)
863
877
 
864
878
  if fieldSchema.Type == "object" && len(fieldSchema.Required) > 0 {
865
- feedback.WriteString(fmt.Sprintf("- This is an object with required fields: %s\n", strings.Join(fieldSchema.Required, ", ")))
879
+ fmt.Fprintf(&feedback, "- This is an object with required fields: %s\n", strings.Join(fieldSchema.Required, ", "))
866
880
 
867
881
  if fieldSchema.Properties != nil {
868
882
  feedback.WriteString("\nNESTED FIELD DETAILS:\n")
@@ -872,13 +886,13 @@ func buildValidationFeedback(validationErr error, schema *JSONSchema, previousRe
872
886
  if isRequired {
873
887
  requiredMark = " [REQUIRED]"
874
888
  }
875
- feedback.WriteString(fmt.Sprintf(" - %s: %s%s\n", nestedField, nestedSchema.Type, requiredMark))
889
+ fmt.Fprintf(&feedback, " - %s: %s%s\n", nestedField, nestedSchema.Type, requiredMark)
876
890
  }
877
891
  }
878
892
  }
879
893
 
880
894
  if fieldSchema.Type == "array" && fieldSchema.Items != nil {
881
- feedback.WriteString(fmt.Sprintf("- This is an array of: %s\n", fieldSchema.Items.Type))
895
+ fmt.Fprintf(&feedback, "- This is an array of: %s\n", fieldSchema.Items.Type)
882
896
  }
883
897
  }
884
898
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "recursive-llm-ts",
3
- "version": "4.6.0",
3
+ "version": "4.8.0",
4
4
  "description": "TypeScript bridge for recursive-llm: Recursive Language Models for unbounded context processing with structured outputs",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",