recursive-llm-ts 4.6.0 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bridge-interface.d.ts +1 -0
- package/go/rlm/context_overflow.go +84 -78
- package/go/rlm/context_overflow_test.go +121 -3
- package/go/rlm/errors.go +25 -1
- package/go/rlm/structured.go +7 -7
- package/package.json +1 -1
|
@@ -168,21 +168,71 @@ func newContextReducer(rlm *RLM, config ContextOverflowConfig, obs *Observer) *c
|
|
|
168
168
|
return &contextReducer{rlm: rlm, config: config, obs: obs}
|
|
169
169
|
}
|
|
170
170
|
|
|
171
|
+
// getResponseTokenBudget extracts max_tokens or max_completion_tokens from ExtraParams.
|
|
172
|
+
// This represents how many tokens the API will reserve for the response, which must be
|
|
173
|
+
// subtracted from the model's total capacity when sizing input chunks.
|
|
174
|
+
func (cr *contextReducer) getResponseTokenBudget(modelLimit int) int {
|
|
175
|
+
if cr.rlm.extraParams == nil {
|
|
176
|
+
return 0
|
|
177
|
+
}
|
|
178
|
+
// Check max_completion_tokens first (newer API parameter), then max_tokens
|
|
179
|
+
for _, key := range []string{"max_completion_tokens", "max_tokens"} {
|
|
180
|
+
if v, ok := cr.rlm.extraParams[key]; ok {
|
|
181
|
+
switch n := v.(type) {
|
|
182
|
+
case float64:
|
|
183
|
+
return int(n)
|
|
184
|
+
case int:
|
|
185
|
+
return n
|
|
186
|
+
case int64:
|
|
187
|
+
return int(n)
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
return 0
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
// makeMapPhaseParams creates ExtraParams suitable for map-phase LLM calls (summarization).
|
|
195
|
+
// It copies the user's ExtraParams but overrides max_tokens to a smaller value since
|
|
196
|
+
// summaries don't need as many tokens as the original completion.
|
|
197
|
+
func (cr *contextReducer) makeMapPhaseParams(modelLimit int) map[string]interface{} {
|
|
198
|
+
params := make(map[string]interface{})
|
|
199
|
+
// Copy all user params (custom_llm_provider, temperature, etc.)
|
|
200
|
+
for k, v := range cr.rlm.extraParams {
|
|
201
|
+
params[k] = v
|
|
202
|
+
}
|
|
203
|
+
// Override max_tokens for map-phase: use at most 1/4 of model limit or 2000, whichever is smaller
|
|
204
|
+
mapMaxTokens := modelLimit / 4
|
|
205
|
+
if mapMaxTokens > 2000 {
|
|
206
|
+
mapMaxTokens = 2000
|
|
207
|
+
}
|
|
208
|
+
if mapMaxTokens < 256 {
|
|
209
|
+
mapMaxTokens = 256
|
|
210
|
+
}
|
|
211
|
+
params["max_tokens"] = mapMaxTokens
|
|
212
|
+
// Remove max_completion_tokens if present to avoid conflicts
|
|
213
|
+
delete(params, "max_completion_tokens")
|
|
214
|
+
return params
|
|
215
|
+
}
|
|
216
|
+
|
|
171
217
|
// ReduceForCompletion handles context overflow for a regular completion.
|
|
172
218
|
// It chunks the context, summarizes each chunk, and combines the summaries.
|
|
173
219
|
func (cr *contextReducer) ReduceForCompletion(query string, context string, modelLimit int) (string, error) {
|
|
174
|
-
cr.obs.Debug("overflow", "Starting
|
|
220
|
+
cr.obs.Debug("overflow", "Starting context reduction: %d estimated tokens, limit %d", EstimateTokens(context), modelLimit)
|
|
175
221
|
|
|
176
222
|
// Calculate safe token budget per chunk
|
|
177
|
-
// Reserve tokens for: system prompt (~500), query, overhead, safety margin
|
|
223
|
+
// Reserve tokens for: system prompt (~500), query, overhead, safety margin, response budget
|
|
178
224
|
queryTokens := EstimateTokens(query)
|
|
179
|
-
|
|
225
|
+
responseTokens := cr.getResponseTokenBudget(modelLimit)
|
|
226
|
+
overhead := 500 + queryTokens + int(float64(modelLimit)*cr.config.SafetyMargin) + responseTokens
|
|
180
227
|
safeTokensPerChunk := modelLimit - overhead
|
|
181
228
|
|
|
182
229
|
if safeTokensPerChunk <= 0 {
|
|
183
|
-
safeTokensPerChunk = modelLimit /
|
|
230
|
+
safeTokensPerChunk = modelLimit / 4
|
|
184
231
|
}
|
|
185
232
|
|
|
233
|
+
cr.obs.Debug("overflow", "Budget: overhead=%d (query=%d, response=%d, safety=%d), chunk budget=%d",
|
|
234
|
+
overhead, queryTokens, responseTokens, int(float64(modelLimit)*cr.config.SafetyMargin), safeTokensPerChunk)
|
|
235
|
+
|
|
186
236
|
chunks := ChunkContext(context, safeTokensPerChunk)
|
|
187
237
|
cr.obs.Debug("overflow", "Split context into %d chunks (budget: %d tokens/chunk)", len(chunks), safeTokensPerChunk)
|
|
188
238
|
|
|
@@ -211,6 +261,9 @@ func (cr *contextReducer) ReduceForCompletion(query string, context string, mode
|
|
|
211
261
|
func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, modelLimit int, overhead int) (string, error) {
|
|
212
262
|
cr.obs.Debug("overflow", "Using MapReduce strategy with %d chunks", len(chunks))
|
|
213
263
|
|
|
264
|
+
// Use map-phase-specific params with reduced max_tokens for summarization
|
|
265
|
+
mapPhaseParams := cr.makeMapPhaseParams(modelLimit)
|
|
266
|
+
|
|
214
267
|
summaries := make([]string, len(chunks))
|
|
215
268
|
errs := make([]error, len(chunks))
|
|
216
269
|
var wg sync.WaitGroup
|
|
@@ -239,7 +292,7 @@ func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, model
|
|
|
239
292
|
APIBase: cr.rlm.apiBase,
|
|
240
293
|
APIKey: cr.rlm.apiKey,
|
|
241
294
|
Timeout: cr.rlm.timeoutSeconds,
|
|
242
|
-
ExtraParams:
|
|
295
|
+
ExtraParams: mapPhaseParams,
|
|
243
296
|
})
|
|
244
297
|
if err != nil {
|
|
245
298
|
errs[idx] = fmt.Errorf("map phase chunk %d: %w", idx+1, err)
|
|
@@ -254,14 +307,22 @@ func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, model
|
|
|
254
307
|
|
|
255
308
|
wg.Wait()
|
|
256
309
|
|
|
257
|
-
// Check for errors
|
|
310
|
+
// Check for errors - if map phase overflows, fall back to tfidf
|
|
258
311
|
var mapErrors []string
|
|
312
|
+
hasOverflow := false
|
|
259
313
|
for _, err := range errs {
|
|
260
314
|
if err != nil {
|
|
261
315
|
mapErrors = append(mapErrors, err.Error())
|
|
316
|
+
if _, isOverflow := IsContextOverflow(err); isOverflow {
|
|
317
|
+
hasOverflow = true
|
|
318
|
+
}
|
|
262
319
|
}
|
|
263
320
|
}
|
|
264
321
|
if len(mapErrors) > 0 {
|
|
322
|
+
if hasOverflow {
|
|
323
|
+
cr.obs.Debug("overflow", "MapReduce map phase hit overflow, falling back to TF-IDF strategy")
|
|
324
|
+
return cr.reduceByTFIDF(strings.Join(chunks, "\n\n"), modelLimit, overhead)
|
|
325
|
+
}
|
|
265
326
|
return "", fmt.Errorf("MapReduce map phase failed: %s", strings.Join(mapErrors, "; "))
|
|
266
327
|
}
|
|
267
328
|
|
|
@@ -306,6 +367,9 @@ func (cr *contextReducer) reduceByTruncation(context string, modelLimit int, ove
|
|
|
306
367
|
func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []string, modelLimit int, overhead int) (string, error) {
|
|
307
368
|
cr.obs.Debug("overflow", "Using chunked extraction strategy with %d chunks", len(chunks))
|
|
308
369
|
|
|
370
|
+
// Use map-phase-specific params with reduced max_tokens
|
|
371
|
+
mapPhaseParams := cr.makeMapPhaseParams(modelLimit)
|
|
372
|
+
|
|
309
373
|
results := make([]string, len(chunks))
|
|
310
374
|
errs := make([]error, len(chunks))
|
|
311
375
|
var wg sync.WaitGroup
|
|
@@ -333,7 +397,7 @@ func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []strin
|
|
|
333
397
|
APIBase: cr.rlm.apiBase,
|
|
334
398
|
APIKey: cr.rlm.apiKey,
|
|
335
399
|
Timeout: cr.rlm.timeoutSeconds,
|
|
336
|
-
ExtraParams:
|
|
400
|
+
ExtraParams: mapPhaseParams,
|
|
337
401
|
})
|
|
338
402
|
if err != nil {
|
|
339
403
|
errs[idx] = fmt.Errorf("chunked extraction chunk %d: %w", idx+1, err)
|
|
@@ -350,12 +414,20 @@ func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []strin
|
|
|
350
414
|
wg.Wait()
|
|
351
415
|
|
|
352
416
|
var extractErrors []string
|
|
417
|
+
hasOverflow := false
|
|
353
418
|
for _, err := range errs {
|
|
354
419
|
if err != nil {
|
|
355
420
|
extractErrors = append(extractErrors, err.Error())
|
|
421
|
+
if _, isOverflow := IsContextOverflow(err); isOverflow {
|
|
422
|
+
hasOverflow = true
|
|
423
|
+
}
|
|
356
424
|
}
|
|
357
425
|
}
|
|
358
426
|
if len(extractErrors) > 0 {
|
|
427
|
+
if hasOverflow {
|
|
428
|
+
cr.obs.Debug("overflow", "Chunked extraction hit overflow, falling back to TF-IDF strategy")
|
|
429
|
+
return cr.reduceByTFIDF(strings.Join(chunks, "\n\n"), modelLimit, overhead)
|
|
430
|
+
}
|
|
359
431
|
return "", fmt.Errorf("chunked extraction failed: %s", strings.Join(extractErrors, "; "))
|
|
360
432
|
}
|
|
361
433
|
|
|
@@ -387,6 +459,9 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
|
|
|
387
459
|
return "", fmt.Errorf("refine strategy: no chunks to process")
|
|
388
460
|
}
|
|
389
461
|
|
|
462
|
+
// Use map-phase-specific params with reduced max_tokens
|
|
463
|
+
mapPhaseParams := cr.makeMapPhaseParams(modelLimit)
|
|
464
|
+
|
|
390
465
|
// Phase 1: Generate initial answer from the first chunk
|
|
391
466
|
initialPrompt := fmt.Sprintf(
|
|
392
467
|
"Using the following context, provide a comprehensive answer to the question.\n"+
|
|
@@ -406,7 +481,7 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
|
|
|
406
481
|
APIBase: cr.rlm.apiBase,
|
|
407
482
|
APIKey: cr.rlm.apiKey,
|
|
408
483
|
Timeout: cr.rlm.timeoutSeconds,
|
|
409
|
-
ExtraParams:
|
|
484
|
+
ExtraParams: mapPhaseParams,
|
|
410
485
|
})
|
|
411
486
|
if err != nil {
|
|
412
487
|
return "", fmt.Errorf("refine initial chunk: %w", err)
|
|
@@ -438,7 +513,7 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
|
|
|
438
513
|
APIBase: cr.rlm.apiBase,
|
|
439
514
|
APIKey: cr.rlm.apiKey,
|
|
440
515
|
Timeout: cr.rlm.timeoutSeconds,
|
|
441
|
-
ExtraParams:
|
|
516
|
+
ExtraParams: mapPhaseParams,
|
|
442
517
|
})
|
|
443
518
|
if err != nil {
|
|
444
519
|
cr.obs.Debug("overflow", "Refine: chunk %d/%d failed: %v, keeping current answer", i+1, len(chunks), err)
|
|
@@ -495,72 +570,3 @@ func (cr *contextReducer) reduceByTextRank(context string, modelLimit int, overh
|
|
|
495
570
|
return result, nil
|
|
496
571
|
}
|
|
497
572
|
|
|
498
|
-
// ─── Adaptive Completion with Overflow Recovery ──────────────────────────────
|
|
499
|
-
|
|
500
|
-
// completionWithOverflowRecovery wraps a completion call with automatic overflow detection and retry.
|
|
501
|
-
// When a context overflow error is detected, it reduces the context and retries.
|
|
502
|
-
func (r *RLM) completionWithOverflowRecovery(query string, context string, overflowConfig ContextOverflowConfig) (string, RLMStats, error) {
|
|
503
|
-
obs := r.observer
|
|
504
|
-
if obs == nil {
|
|
505
|
-
obs = NewNoopObserver()
|
|
506
|
-
}
|
|
507
|
-
|
|
508
|
-
// Try the normal completion first
|
|
509
|
-
result, stats, err := r.Completion(query, context)
|
|
510
|
-
if err == nil {
|
|
511
|
-
return result, stats, nil
|
|
512
|
-
}
|
|
513
|
-
|
|
514
|
-
// Check if it's a context overflow error
|
|
515
|
-
coe, isOverflow := IsContextOverflow(err)
|
|
516
|
-
if !isOverflow {
|
|
517
|
-
return "", stats, err // Not an overflow error, return original error
|
|
518
|
-
}
|
|
519
|
-
|
|
520
|
-
obs.Debug("overflow", "Context overflow detected: model limit %d, request %d tokens (%.1f%% over)",
|
|
521
|
-
coe.ModelLimit, coe.RequestTokens, (coe.OverflowRatio()-1)*100)
|
|
522
|
-
|
|
523
|
-
// Use detected limit or configured limit
|
|
524
|
-
modelLimit := coe.ModelLimit
|
|
525
|
-
if overflowConfig.MaxModelTokens > 0 {
|
|
526
|
-
modelLimit = overflowConfig.MaxModelTokens
|
|
527
|
-
}
|
|
528
|
-
|
|
529
|
-
reducer := newContextReducer(r, overflowConfig, obs)
|
|
530
|
-
|
|
531
|
-
// Attempt context reduction and retry
|
|
532
|
-
for attempt := 0; attempt < overflowConfig.MaxReductionAttempts; attempt++ {
|
|
533
|
-
obs.Debug("overflow", "Reduction attempt %d/%d", attempt+1, overflowConfig.MaxReductionAttempts)
|
|
534
|
-
|
|
535
|
-
reducedContext, reduceErr := reducer.ReduceForCompletion(query, context, modelLimit)
|
|
536
|
-
if reduceErr != nil {
|
|
537
|
-
obs.Error("overflow", "Context reduction failed: %v", reduceErr)
|
|
538
|
-
return "", stats, fmt.Errorf("context overflow recovery failed: %w", reduceErr)
|
|
539
|
-
}
|
|
540
|
-
|
|
541
|
-
obs.Debug("overflow", "Context reduced: %d -> %d chars", len(context), len(reducedContext))
|
|
542
|
-
|
|
543
|
-
// Retry with reduced context
|
|
544
|
-
result, stats, err = r.Completion(query, reducedContext)
|
|
545
|
-
if err == nil {
|
|
546
|
-
obs.Event("overflow.recovery_success", map[string]string{
|
|
547
|
-
"attempt": fmt.Sprintf("%d", attempt+1),
|
|
548
|
-
"original_chars": fmt.Sprintf("%d", len(context)),
|
|
549
|
-
"reduced_chars": fmt.Sprintf("%d", len(reducedContext)),
|
|
550
|
-
"reduction_ratio": fmt.Sprintf("%.2f", float64(len(reducedContext))/float64(len(context))),
|
|
551
|
-
})
|
|
552
|
-
return result, stats, nil
|
|
553
|
-
}
|
|
554
|
-
|
|
555
|
-
// If it overflows again, use the reduced context for the next attempt
|
|
556
|
-
if _, stillOverflow := IsContextOverflow(err); stillOverflow {
|
|
557
|
-
context = reducedContext
|
|
558
|
-
continue
|
|
559
|
-
}
|
|
560
|
-
|
|
561
|
-
// Different error, return it
|
|
562
|
-
return "", stats, err
|
|
563
|
-
}
|
|
564
|
-
|
|
565
|
-
return "", stats, fmt.Errorf("context overflow: exceeded %d reduction attempts, model limit is %d tokens", overflowConfig.MaxReductionAttempts, modelLimit)
|
|
566
|
-
}
|
|
@@ -107,6 +107,124 @@ func TestIsContextOverflow_GenericError(t *testing.T) {
|
|
|
107
107
|
}
|
|
108
108
|
}
|
|
109
109
|
|
|
110
|
+
func TestIsContextOverflow_MaxTokensTooLarge_vLLM(t *testing.T) {
|
|
111
|
+
// vLLM/Ray Serve error when max_tokens exceeds remaining capacity
|
|
112
|
+
// This is the exact error from the user's production logs
|
|
113
|
+
response := `{"object":"error","message":"'max_tokens' or 'max_completion_tokens' is too large: 10000. This model's maximum context length is 32768 tokens and your request has 30168 input tokens (10000 > 32768 - 30168)","type":"BadRequestError","param":null,"code":400}`
|
|
114
|
+
apiErr := NewAPIError(400, response)
|
|
115
|
+
|
|
116
|
+
coe, ok := IsContextOverflow(apiErr)
|
|
117
|
+
if !ok {
|
|
118
|
+
t.Fatal("expected IsContextOverflow to detect max_tokens too large error")
|
|
119
|
+
}
|
|
120
|
+
if coe.ModelLimit != 32768 {
|
|
121
|
+
t.Errorf("expected ModelLimit 32768, got %d", coe.ModelLimit)
|
|
122
|
+
}
|
|
123
|
+
// Request tokens should include both input + max_tokens: 30168 + 10000 = 40168
|
|
124
|
+
if coe.RequestTokens != 40168 {
|
|
125
|
+
t.Errorf("expected RequestTokens 40168 (input 30168 + max_tokens 10000), got %d", coe.RequestTokens)
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
func TestIsContextOverflow_MaxCompletionTokensTooLarge(t *testing.T) {
|
|
130
|
+
// OpenAI newer API format with max_completion_tokens
|
|
131
|
+
response := `{"error":{"message":"'max_tokens' or 'max_completion_tokens' is too large: 5000. This model's maximum context length is 16384 tokens and your request has 14000 input tokens","type":"invalid_request_error","code":"invalid_request_error"}}`
|
|
132
|
+
apiErr := NewAPIError(400, response)
|
|
133
|
+
|
|
134
|
+
coe, ok := IsContextOverflow(apiErr)
|
|
135
|
+
if !ok {
|
|
136
|
+
t.Fatal("expected IsContextOverflow to detect max_completion_tokens too large error")
|
|
137
|
+
}
|
|
138
|
+
if coe.ModelLimit != 16384 {
|
|
139
|
+
t.Errorf("expected ModelLimit 16384, got %d", coe.ModelLimit)
|
|
140
|
+
}
|
|
141
|
+
if coe.RequestTokens != 19000 {
|
|
142
|
+
t.Errorf("expected RequestTokens 19000 (input 14000 + max_tokens 5000), got %d", coe.RequestTokens)
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
func TestGetResponseTokenBudget(t *testing.T) {
|
|
147
|
+
rlm := &RLM{
|
|
148
|
+
extraParams: map[string]interface{}{
|
|
149
|
+
"max_tokens": float64(10000),
|
|
150
|
+
},
|
|
151
|
+
}
|
|
152
|
+
obs := NewNoopObserver()
|
|
153
|
+
config := DefaultContextOverflowConfig()
|
|
154
|
+
reducer := newContextReducer(rlm, config, obs)
|
|
155
|
+
|
|
156
|
+
budget := reducer.getResponseTokenBudget(32768)
|
|
157
|
+
if budget != 10000 {
|
|
158
|
+
t.Errorf("expected response token budget 10000, got %d", budget)
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
func TestGetResponseTokenBudget_MaxCompletionTokens(t *testing.T) {
|
|
163
|
+
rlm := &RLM{
|
|
164
|
+
extraParams: map[string]interface{}{
|
|
165
|
+
"max_completion_tokens": float64(5000),
|
|
166
|
+
},
|
|
167
|
+
}
|
|
168
|
+
obs := NewNoopObserver()
|
|
169
|
+
config := DefaultContextOverflowConfig()
|
|
170
|
+
reducer := newContextReducer(rlm, config, obs)
|
|
171
|
+
|
|
172
|
+
budget := reducer.getResponseTokenBudget(32768)
|
|
173
|
+
if budget != 5000 {
|
|
174
|
+
t.Errorf("expected response token budget 5000, got %d", budget)
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
func TestGetResponseTokenBudget_NoMaxTokens(t *testing.T) {
|
|
179
|
+
rlm := &RLM{
|
|
180
|
+
extraParams: map[string]interface{}{
|
|
181
|
+
"temperature": 0.7,
|
|
182
|
+
},
|
|
183
|
+
}
|
|
184
|
+
obs := NewNoopObserver()
|
|
185
|
+
config := DefaultContextOverflowConfig()
|
|
186
|
+
reducer := newContextReducer(rlm, config, obs)
|
|
187
|
+
|
|
188
|
+
budget := reducer.getResponseTokenBudget(32768)
|
|
189
|
+
if budget != 0 {
|
|
190
|
+
t.Errorf("expected response token budget 0, got %d", budget)
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
func TestMakeMapPhaseParams(t *testing.T) {
|
|
195
|
+
rlm := &RLM{
|
|
196
|
+
extraParams: map[string]interface{}{
|
|
197
|
+
"max_tokens": float64(10000),
|
|
198
|
+
"custom_llm_provider": "vllm",
|
|
199
|
+
"temperature": 0.7,
|
|
200
|
+
},
|
|
201
|
+
}
|
|
202
|
+
obs := NewNoopObserver()
|
|
203
|
+
config := DefaultContextOverflowConfig()
|
|
204
|
+
reducer := newContextReducer(rlm, config, obs)
|
|
205
|
+
|
|
206
|
+
params := reducer.makeMapPhaseParams(32768)
|
|
207
|
+
|
|
208
|
+
// max_tokens should be capped (32768/4 = 8192, but cap is 2000)
|
|
209
|
+
maxTokens, ok := params["max_tokens"].(int)
|
|
210
|
+
if !ok {
|
|
211
|
+
t.Fatal("expected max_tokens to be int in map phase params")
|
|
212
|
+
}
|
|
213
|
+
if maxTokens > 2000 {
|
|
214
|
+
t.Errorf("expected map phase max_tokens <= 2000, got %d", maxTokens)
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
// custom_llm_provider should be preserved
|
|
218
|
+
if params["custom_llm_provider"] != "vllm" {
|
|
219
|
+
t.Errorf("expected custom_llm_provider to be preserved, got %v", params["custom_llm_provider"])
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
// temperature should be preserved
|
|
223
|
+
if params["temperature"] != 0.7 {
|
|
224
|
+
t.Errorf("expected temperature to be preserved, got %v", params["temperature"])
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
|
|
110
228
|
func TestContextOverflowError_OverflowRatio(t *testing.T) {
|
|
111
229
|
tests := []struct {
|
|
112
230
|
limit int
|
|
@@ -526,10 +644,10 @@ func TestContextOverflowError_ErrorChain(t *testing.T) {
|
|
|
526
644
|
if coe.APIError == nil {
|
|
527
645
|
t.Fatal("expected embedded APIError to be non-nil")
|
|
528
646
|
}
|
|
529
|
-
if coe.
|
|
530
|
-
t.Errorf("expected status 400, got %d", coe.
|
|
647
|
+
if coe.StatusCode != 400 {
|
|
648
|
+
t.Errorf("expected status 400, got %d", coe.StatusCode)
|
|
531
649
|
}
|
|
532
|
-
if coe.
|
|
650
|
+
if coe.RLMError == nil {
|
|
533
651
|
t.Fatal("expected embedded RLMError to be non-nil")
|
|
534
652
|
}
|
|
535
653
|
|
package/go/rlm/errors.go
CHANGED
|
@@ -192,7 +192,31 @@ func parseContextOverflowMessage(msg string) (modelLimit int, requestTokens int,
|
|
|
192
192
|
}
|
|
193
193
|
}
|
|
194
194
|
|
|
195
|
-
// Pattern 3: "max_tokens
|
|
195
|
+
// Pattern 3: "max_tokens is too large" - response budget exceeds remaining capacity
|
|
196
|
+
// vLLM/OpenAI: "max_tokens' or 'max_completion_tokens' is too large: 10000.
|
|
197
|
+
// This model's maximum context length is 32768 tokens and your request has 30168 input tokens"
|
|
198
|
+
// In this case, input tokens < model limit, but input + max_tokens > model limit.
|
|
199
|
+
// We report the effective total (input + max_tokens) as requestTokens.
|
|
200
|
+
if strings.Contains(lowerMsg, "max_tokens") && strings.Contains(lowerMsg, "too large") {
|
|
201
|
+
limit := extractNumber(msg, "maximum context length is ", " tokens")
|
|
202
|
+
inputTokens := extractNumber(msg, "your request has ", " input tokens")
|
|
203
|
+
if inputTokens == 0 {
|
|
204
|
+
inputTokens = extractNumber(msg, "your request has ", " tokens")
|
|
205
|
+
}
|
|
206
|
+
maxTokens := extractNumber(msg, "too large: ", ".")
|
|
207
|
+
if maxTokens == 0 {
|
|
208
|
+
maxTokens = extractNumber(msg, "too large: ", " ")
|
|
209
|
+
}
|
|
210
|
+
if limit > 0 && inputTokens > 0 && maxTokens > 0 {
|
|
211
|
+
return limit, inputTokens + maxTokens, true
|
|
212
|
+
}
|
|
213
|
+
// Fallback: if we got limit and input tokens, treat input as the overflow
|
|
214
|
+
if limit > 0 && inputTokens > 0 {
|
|
215
|
+
return limit, inputTokens, true
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
// Pattern 4: "input too long" / "too many tokens" generic patterns
|
|
196
220
|
if strings.Contains(lowerMsg, "input too long") || strings.Contains(lowerMsg, "too many tokens") || strings.Contains(lowerMsg, "too many input tokens") {
|
|
197
221
|
limit := extractNumber(msg, "limit is ", " tokens")
|
|
198
222
|
if limit == 0 {
|
package/go/rlm/structured.go
CHANGED
|
@@ -843,7 +843,7 @@ func buildValidationFeedback(validationErr error, schema *JSONSchema, previousRe
|
|
|
843
843
|
|
|
844
844
|
var feedback strings.Builder
|
|
845
845
|
feedback.WriteString("VALIDATION ERROR - Your previous response was invalid.\n\n")
|
|
846
|
-
|
|
846
|
+
fmt.Fprintf(&feedback, "ERROR: %s\n\n", errMsg)
|
|
847
847
|
|
|
848
848
|
// Extract what field caused the issue
|
|
849
849
|
if strings.Contains(errMsg, "missing required field:") {
|
|
@@ -852,17 +852,17 @@ func buildValidationFeedback(validationErr error, schema *JSONSchema, previousRe
|
|
|
852
852
|
fieldName = strings.TrimSpace(fieldName)
|
|
853
853
|
|
|
854
854
|
feedback.WriteString("SPECIFIC ISSUE:\n")
|
|
855
|
-
|
|
855
|
+
fmt.Fprintf(&feedback, "The field '%s' is REQUIRED but was not provided.\n\n", fieldName)
|
|
856
856
|
|
|
857
857
|
// Find the schema for this field and provide details
|
|
858
858
|
if schema.Type == "object" && schema.Properties != nil {
|
|
859
859
|
if fieldSchema, exists := schema.Properties[fieldName]; exists {
|
|
860
860
|
feedback.WriteString("FIELD REQUIREMENTS:\n")
|
|
861
|
-
|
|
862
|
-
|
|
861
|
+
fmt.Fprintf(&feedback, "- Field name: '%s'\n", fieldName)
|
|
862
|
+
fmt.Fprintf(&feedback, "- Type: %s\n", fieldSchema.Type)
|
|
863
863
|
|
|
864
864
|
if fieldSchema.Type == "object" && len(fieldSchema.Required) > 0 {
|
|
865
|
-
|
|
865
|
+
fmt.Fprintf(&feedback, "- This is an object with required fields: %s\n", strings.Join(fieldSchema.Required, ", "))
|
|
866
866
|
|
|
867
867
|
if fieldSchema.Properties != nil {
|
|
868
868
|
feedback.WriteString("\nNESTED FIELD DETAILS:\n")
|
|
@@ -872,13 +872,13 @@ func buildValidationFeedback(validationErr error, schema *JSONSchema, previousRe
|
|
|
872
872
|
if isRequired {
|
|
873
873
|
requiredMark = " [REQUIRED]"
|
|
874
874
|
}
|
|
875
|
-
|
|
875
|
+
fmt.Fprintf(&feedback, " - %s: %s%s\n", nestedField, nestedSchema.Type, requiredMark)
|
|
876
876
|
}
|
|
877
877
|
}
|
|
878
878
|
}
|
|
879
879
|
|
|
880
880
|
if fieldSchema.Type == "array" && fieldSchema.Items != nil {
|
|
881
|
-
|
|
881
|
+
fmt.Fprintf(&feedback, "- This is an array of: %s\n", fieldSchema.Items.Type)
|
|
882
882
|
}
|
|
883
883
|
}
|
|
884
884
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "recursive-llm-ts",
|
|
3
|
-
"version": "4.6.0",
|
|
3
|
+
"version": "4.7.0",
|
|
4
4
|
"description": "TypeScript bridge for recursive-llm: Recursive Language Models for unbounded context processing with structured outputs",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|