recursive-llm-ts 4.6.0 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bridge-interface.d.ts +1 -0
- package/go/rlm/context_overflow.go +84 -78
- package/go/rlm/context_overflow_test.go +121 -3
- package/go/rlm/errors.go +25 -1
- package/go/rlm/structured.go +7 -7
- package/package.json +1 -1
|
@@ -168,21 +168,71 @@ func newContextReducer(rlm *RLM, config ContextOverflowConfig, obs *Observer) *c
|
|
|
168
168
|
return &contextReducer{rlm: rlm, config: config, obs: obs}
|
|
169
169
|
}
|
|
170
170
|
|
|
171
|
+
// getResponseTokenBudget extracts max_tokens or max_completion_tokens from ExtraParams.
|
|
172
|
+
// This represents how many tokens the API will reserve for the response, which must be
|
|
173
|
+
// subtracted from the model's total capacity when sizing input chunks.
|
|
174
|
+
func (cr *contextReducer) getResponseTokenBudget(modelLimit int) int {
|
|
175
|
+
if cr.rlm.extraParams == nil {
|
|
176
|
+
return 0
|
|
177
|
+
}
|
|
178
|
+
// Check max_completion_tokens first (newer API parameter), then max_tokens
|
|
179
|
+
for _, key := range []string{"max_completion_tokens", "max_tokens"} {
|
|
180
|
+
if v, ok := cr.rlm.extraParams[key]; ok {
|
|
181
|
+
switch n := v.(type) {
|
|
182
|
+
case float64:
|
|
183
|
+
return int(n)
|
|
184
|
+
case int:
|
|
185
|
+
return n
|
|
186
|
+
case int64:
|
|
187
|
+
return int(n)
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
return 0
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
// makeMapPhaseParams creates ExtraParams suitable for map-phase LLM calls (summarization).
|
|
195
|
+
// It copies the user's ExtraParams but overrides max_tokens to a smaller value since
|
|
196
|
+
// summaries don't need as many tokens as the original completion.
|
|
197
|
+
func (cr *contextReducer) makeMapPhaseParams(modelLimit int) map[string]interface{} {
|
|
198
|
+
params := make(map[string]interface{})
|
|
199
|
+
// Copy all user params (custom_llm_provider, temperature, etc.)
|
|
200
|
+
for k, v := range cr.rlm.extraParams {
|
|
201
|
+
params[k] = v
|
|
202
|
+
}
|
|
203
|
+
// Override max_tokens for map-phase: use at most 1/4 of model limit or 2000, whichever is smaller
|
|
204
|
+
mapMaxTokens := modelLimit / 4
|
|
205
|
+
if mapMaxTokens > 2000 {
|
|
206
|
+
mapMaxTokens = 2000
|
|
207
|
+
}
|
|
208
|
+
if mapMaxTokens < 256 {
|
|
209
|
+
mapMaxTokens = 256
|
|
210
|
+
}
|
|
211
|
+
params["max_tokens"] = mapMaxTokens
|
|
212
|
+
// Remove max_completion_tokens if present to avoid conflicts
|
|
213
|
+
delete(params, "max_completion_tokens")
|
|
214
|
+
return params
|
|
215
|
+
}
|
|
216
|
+
|
|
171
217
|
// ReduceForCompletion handles context overflow for a regular completion.
|
|
172
218
|
// It chunks the context, summarizes each chunk, and combines the summaries.
|
|
173
219
|
func (cr *contextReducer) ReduceForCompletion(query string, context string, modelLimit int) (string, error) {
|
|
174
|
-
cr.obs.Debug("overflow", "Starting
|
|
220
|
+
cr.obs.Debug("overflow", "Starting context reduction: %d estimated tokens, limit %d", EstimateTokens(context), modelLimit)
|
|
175
221
|
|
|
176
222
|
// Calculate safe token budget per chunk
|
|
177
|
-
// Reserve tokens for: system prompt (~500), query, overhead, safety margin
|
|
223
|
+
// Reserve tokens for: system prompt (~500), query, overhead, safety margin, response budget
|
|
178
224
|
queryTokens := EstimateTokens(query)
|
|
179
|
-
|
|
225
|
+
responseTokens := cr.getResponseTokenBudget(modelLimit)
|
|
226
|
+
overhead := 500 + queryTokens + int(float64(modelLimit)*cr.config.SafetyMargin) + responseTokens
|
|
180
227
|
safeTokensPerChunk := modelLimit - overhead
|
|
181
228
|
|
|
182
229
|
if safeTokensPerChunk <= 0 {
|
|
183
|
-
safeTokensPerChunk = modelLimit /
|
|
230
|
+
safeTokensPerChunk = modelLimit / 4
|
|
184
231
|
}
|
|
185
232
|
|
|
233
|
+
cr.obs.Debug("overflow", "Budget: overhead=%d (query=%d, response=%d, safety=%d), chunk budget=%d",
|
|
234
|
+
overhead, queryTokens, responseTokens, int(float64(modelLimit)*cr.config.SafetyMargin), safeTokensPerChunk)
|
|
235
|
+
|
|
186
236
|
chunks := ChunkContext(context, safeTokensPerChunk)
|
|
187
237
|
cr.obs.Debug("overflow", "Split context into %d chunks (budget: %d tokens/chunk)", len(chunks), safeTokensPerChunk)
|
|
188
238
|
|
|
@@ -211,6 +261,9 @@ func (cr *contextReducer) ReduceForCompletion(query string, context string, mode
|
|
|
211
261
|
func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, modelLimit int, overhead int) (string, error) {
|
|
212
262
|
cr.obs.Debug("overflow", "Using MapReduce strategy with %d chunks", len(chunks))
|
|
213
263
|
|
|
264
|
+
// Use map-phase-specific params with reduced max_tokens for summarization
|
|
265
|
+
mapPhaseParams := cr.makeMapPhaseParams(modelLimit)
|
|
266
|
+
|
|
214
267
|
summaries := make([]string, len(chunks))
|
|
215
268
|
errs := make([]error, len(chunks))
|
|
216
269
|
var wg sync.WaitGroup
|
|
@@ -239,7 +292,7 @@ func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, model
|
|
|
239
292
|
APIBase: cr.rlm.apiBase,
|
|
240
293
|
APIKey: cr.rlm.apiKey,
|
|
241
294
|
Timeout: cr.rlm.timeoutSeconds,
|
|
242
|
-
ExtraParams:
|
|
295
|
+
ExtraParams: mapPhaseParams,
|
|
243
296
|
})
|
|
244
297
|
if err != nil {
|
|
245
298
|
errs[idx] = fmt.Errorf("map phase chunk %d: %w", idx+1, err)
|
|
@@ -254,14 +307,22 @@ func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, model
|
|
|
254
307
|
|
|
255
308
|
wg.Wait()
|
|
256
309
|
|
|
257
|
-
// Check for errors
|
|
310
|
+
// Check for errors - if map phase overflows, fall back to tfidf
|
|
258
311
|
var mapErrors []string
|
|
312
|
+
hasOverflow := false
|
|
259
313
|
for _, err := range errs {
|
|
260
314
|
if err != nil {
|
|
261
315
|
mapErrors = append(mapErrors, err.Error())
|
|
316
|
+
if _, isOverflow := IsContextOverflow(err); isOverflow {
|
|
317
|
+
hasOverflow = true
|
|
318
|
+
}
|
|
262
319
|
}
|
|
263
320
|
}
|
|
264
321
|
if len(mapErrors) > 0 {
|
|
322
|
+
if hasOverflow {
|
|
323
|
+
cr.obs.Debug("overflow", "MapReduce map phase hit overflow, falling back to TF-IDF strategy")
|
|
324
|
+
return cr.reduceByTFIDF(strings.Join(chunks, "\n\n"), modelLimit, overhead)
|
|
325
|
+
}
|
|
265
326
|
return "", fmt.Errorf("MapReduce map phase failed: %s", strings.Join(mapErrors, "; "))
|
|
266
327
|
}
|
|
267
328
|
|
|
@@ -306,6 +367,9 @@ func (cr *contextReducer) reduceByTruncation(context string, modelLimit int, ove
|
|
|
306
367
|
func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []string, modelLimit int, overhead int) (string, error) {
|
|
307
368
|
cr.obs.Debug("overflow", "Using chunked extraction strategy with %d chunks", len(chunks))
|
|
308
369
|
|
|
370
|
+
// Use map-phase-specific params with reduced max_tokens
|
|
371
|
+
mapPhaseParams := cr.makeMapPhaseParams(modelLimit)
|
|
372
|
+
|
|
309
373
|
results := make([]string, len(chunks))
|
|
310
374
|
errs := make([]error, len(chunks))
|
|
311
375
|
var wg sync.WaitGroup
|
|
@@ -333,7 +397,7 @@ func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []strin
|
|
|
333
397
|
APIBase: cr.rlm.apiBase,
|
|
334
398
|
APIKey: cr.rlm.apiKey,
|
|
335
399
|
Timeout: cr.rlm.timeoutSeconds,
|
|
336
|
-
ExtraParams:
|
|
400
|
+
ExtraParams: mapPhaseParams,
|
|
337
401
|
})
|
|
338
402
|
if err != nil {
|
|
339
403
|
errs[idx] = fmt.Errorf("chunked extraction chunk %d: %w", idx+1, err)
|
|
@@ -350,12 +414,20 @@ func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []strin
|
|
|
350
414
|
wg.Wait()
|
|
351
415
|
|
|
352
416
|
var extractErrors []string
|
|
417
|
+
hasOverflow := false
|
|
353
418
|
for _, err := range errs {
|
|
354
419
|
if err != nil {
|
|
355
420
|
extractErrors = append(extractErrors, err.Error())
|
|
421
|
+
if _, isOverflow := IsContextOverflow(err); isOverflow {
|
|
422
|
+
hasOverflow = true
|
|
423
|
+
}
|
|
356
424
|
}
|
|
357
425
|
}
|
|
358
426
|
if len(extractErrors) > 0 {
|
|
427
|
+
if hasOverflow {
|
|
428
|
+
cr.obs.Debug("overflow", "Chunked extraction hit overflow, falling back to TF-IDF strategy")
|
|
429
|
+
return cr.reduceByTFIDF(strings.Join(chunks, "\n\n"), modelLimit, overhead)
|
|
430
|
+
}
|
|
359
431
|
return "", fmt.Errorf("chunked extraction failed: %s", strings.Join(extractErrors, "; "))
|
|
360
432
|
}
|
|
361
433
|
|
|
@@ -387,6 +459,9 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
|
|
|
387
459
|
return "", fmt.Errorf("refine strategy: no chunks to process")
|
|
388
460
|
}
|
|
389
461
|
|
|
462
|
+
// Use map-phase-specific params with reduced max_tokens
|
|
463
|
+
mapPhaseParams := cr.makeMapPhaseParams(modelLimit)
|
|
464
|
+
|
|
390
465
|
// Phase 1: Generate initial answer from the first chunk
|
|
391
466
|
initialPrompt := fmt.Sprintf(
|
|
392
467
|
"Using the following context, provide a comprehensive answer to the question.\n"+
|
|
@@ -406,7 +481,7 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
|
|
|
406
481
|
APIBase: cr.rlm.apiBase,
|
|
407
482
|
APIKey: cr.rlm.apiKey,
|
|
408
483
|
Timeout: cr.rlm.timeoutSeconds,
|
|
409
|
-
ExtraParams:
|
|
484
|
+
ExtraParams: mapPhaseParams,
|
|
410
485
|
})
|
|
411
486
|
if err != nil {
|
|
412
487
|
return "", fmt.Errorf("refine initial chunk: %w", err)
|
|
@@ -438,7 +513,7 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
|
|
|
438
513
|
APIBase: cr.rlm.apiBase,
|
|
439
514
|
APIKey: cr.rlm.apiKey,
|
|
440
515
|
Timeout: cr.rlm.timeoutSeconds,
|
|
441
|
-
ExtraParams:
|
|
516
|
+
ExtraParams: mapPhaseParams,
|
|
442
517
|
})
|
|
443
518
|
if err != nil {
|
|
444
519
|
cr.obs.Debug("overflow", "Refine: chunk %d/%d failed: %v, keeping current answer", i+1, len(chunks), err)
|
|
@@ -495,72 +570,3 @@ func (cr *contextReducer) reduceByTextRank(context string, modelLimit int, overh
|
|
|
495
570
|
return result, nil
|
|
496
571
|
}
|
|
497
572
|
|
|
498
|
-
// ─── Adaptive Completion with Overflow Recovery ──────────────────────────────
|
|
499
|
-
|
|
500
|
-
// completionWithOverflowRecovery wraps a completion call with automatic overflow detection and retry.
|
|
501
|
-
// When a context overflow error is detected, it reduces the context and retries.
|
|
502
|
-
func (r *RLM) completionWithOverflowRecovery(query string, context string, overflowConfig ContextOverflowConfig) (string, RLMStats, error) {
|
|
503
|
-
obs := r.observer
|
|
504
|
-
if obs == nil {
|
|
505
|
-
obs = NewNoopObserver()
|
|
506
|
-
}
|
|
507
|
-
|
|
508
|
-
// Try the normal completion first
|
|
509
|
-
result, stats, err := r.Completion(query, context)
|
|
510
|
-
if err == nil {
|
|
511
|
-
return result, stats, nil
|
|
512
|
-
}
|
|
513
|
-
|
|
514
|
-
// Check if it's a context overflow error
|
|
515
|
-
coe, isOverflow := IsContextOverflow(err)
|
|
516
|
-
if !isOverflow {
|
|
517
|
-
return "", stats, err // Not an overflow error, return original error
|
|
518
|
-
}
|
|
519
|
-
|
|
520
|
-
obs.Debug("overflow", "Context overflow detected: model limit %d, request %d tokens (%.1f%% over)",
|
|
521
|
-
coe.ModelLimit, coe.RequestTokens, (coe.OverflowRatio()-1)*100)
|
|
522
|
-
|
|
523
|
-
// Use detected limit or configured limit
|
|
524
|
-
modelLimit := coe.ModelLimit
|
|
525
|
-
if overflowConfig.MaxModelTokens > 0 {
|
|
526
|
-
modelLimit = overflowConfig.MaxModelTokens
|
|
527
|
-
}
|
|
528
|
-
|
|
529
|
-
reducer := newContextReducer(r, overflowConfig, obs)
|
|
530
|
-
|
|
531
|
-
// Attempt context reduction and retry
|
|
532
|
-
for attempt := 0; attempt < overflowConfig.MaxReductionAttempts; attempt++ {
|
|
533
|
-
obs.Debug("overflow", "Reduction attempt %d/%d", attempt+1, overflowConfig.MaxReductionAttempts)
|
|
534
|
-
|
|
535
|
-
reducedContext, reduceErr := reducer.ReduceForCompletion(query, context, modelLimit)
|
|
536
|
-
if reduceErr != nil {
|
|
537
|
-
obs.Error("overflow", "Context reduction failed: %v", reduceErr)
|
|
538
|
-
return "", stats, fmt.Errorf("context overflow recovery failed: %w", reduceErr)
|
|
539
|
-
}
|
|
540
|
-
|
|
541
|
-
obs.Debug("overflow", "Context reduced: %d -> %d chars", len(context), len(reducedContext))
|
|
542
|
-
|
|
543
|
-
// Retry with reduced context
|
|
544
|
-
result, stats, err = r.Completion(query, reducedContext)
|
|
545
|
-
if err == nil {
|
|
546
|
-
obs.Event("overflow.recovery_success", map[string]string{
|
|
547
|
-
"attempt": fmt.Sprintf("%d", attempt+1),
|
|
548
|
-
"original_chars": fmt.Sprintf("%d", len(context)),
|
|
549
|
-
"reduced_chars": fmt.Sprintf("%d", len(reducedContext)),
|
|
550
|
-
"reduction_ratio": fmt.Sprintf("%.2f", float64(len(reducedContext))/float64(len(context))),
|
|
551
|
-
})
|
|
552
|
-
return result, stats, nil
|
|
553
|
-
}
|
|
554
|
-
|
|
555
|
-
// If it overflows again, use the reduced context for the next attempt
|
|
556
|
-
if _, stillOverflow := IsContextOverflow(err); stillOverflow {
|
|
557
|
-
context = reducedContext
|
|
558
|
-
continue
|
|
559
|
-
}
|
|
560
|
-
|
|
561
|
-
// Different error, return it
|
|
562
|
-
return "", stats, err
|
|
563
|
-
}
|
|
564
|
-
|
|
565
|
-
return "", stats, fmt.Errorf("context overflow: exceeded %d reduction attempts, model limit is %d tokens", overflowConfig.MaxReductionAttempts, modelLimit)
|
|
566
|
-
}
|
|
@@ -107,6 +107,124 @@ func TestIsContextOverflow_GenericError(t *testing.T) {
|
|
|
107
107
|
}
|
|
108
108
|
}
|
|
109
109
|
|
|
110
|
+
func TestIsContextOverflow_MaxTokensTooLarge_vLLM(t *testing.T) {
|
|
111
|
+
// vLLM/Ray Serve error when max_tokens exceeds remaining capacity
|
|
112
|
+
// This is the exact error from the user's production logs
|
|
113
|
+
response := `{"object":"error","message":"'max_tokens' or 'max_completion_tokens' is too large: 10000. This model's maximum context length is 32768 tokens and your request has 30168 input tokens (10000 > 32768 - 30168)","type":"BadRequestError","param":null,"code":400}`
|
|
114
|
+
apiErr := NewAPIError(400, response)
|
|
115
|
+
|
|
116
|
+
coe, ok := IsContextOverflow(apiErr)
|
|
117
|
+
if !ok {
|
|
118
|
+
t.Fatal("expected IsContextOverflow to detect max_tokens too large error")
|
|
119
|
+
}
|
|
120
|
+
if coe.ModelLimit != 32768 {
|
|
121
|
+
t.Errorf("expected ModelLimit 32768, got %d", coe.ModelLimit)
|
|
122
|
+
}
|
|
123
|
+
// Request tokens should include both input + max_tokens: 30168 + 10000 = 40168
|
|
124
|
+
if coe.RequestTokens != 40168 {
|
|
125
|
+
t.Errorf("expected RequestTokens 40168 (input 30168 + max_tokens 10000), got %d", coe.RequestTokens)
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
func TestIsContextOverflow_MaxCompletionTokensTooLarge(t *testing.T) {
|
|
130
|
+
// OpenAI newer API format with max_completion_tokens
|
|
131
|
+
response := `{"error":{"message":"'max_tokens' or 'max_completion_tokens' is too large: 5000. This model's maximum context length is 16384 tokens and your request has 14000 input tokens","type":"invalid_request_error","code":"invalid_request_error"}}`
|
|
132
|
+
apiErr := NewAPIError(400, response)
|
|
133
|
+
|
|
134
|
+
coe, ok := IsContextOverflow(apiErr)
|
|
135
|
+
if !ok {
|
|
136
|
+
t.Fatal("expected IsContextOverflow to detect max_completion_tokens too large error")
|
|
137
|
+
}
|
|
138
|
+
if coe.ModelLimit != 16384 {
|
|
139
|
+
t.Errorf("expected ModelLimit 16384, got %d", coe.ModelLimit)
|
|
140
|
+
}
|
|
141
|
+
if coe.RequestTokens != 19000 {
|
|
142
|
+
t.Errorf("expected RequestTokens 19000 (input 14000 + max_tokens 5000), got %d", coe.RequestTokens)
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
func TestGetResponseTokenBudget(t *testing.T) {
|
|
147
|
+
rlm := &RLM{
|
|
148
|
+
extraParams: map[string]interface{}{
|
|
149
|
+
"max_tokens": float64(10000),
|
|
150
|
+
},
|
|
151
|
+
}
|
|
152
|
+
obs := NewNoopObserver()
|
|
153
|
+
config := DefaultContextOverflowConfig()
|
|
154
|
+
reducer := newContextReducer(rlm, config, obs)
|
|
155
|
+
|
|
156
|
+
budget := reducer.getResponseTokenBudget(32768)
|
|
157
|
+
if budget != 10000 {
|
|
158
|
+
t.Errorf("expected response token budget 10000, got %d", budget)
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
func TestGetResponseTokenBudget_MaxCompletionTokens(t *testing.T) {
|
|
163
|
+
rlm := &RLM{
|
|
164
|
+
extraParams: map[string]interface{}{
|
|
165
|
+
"max_completion_tokens": float64(5000),
|
|
166
|
+
},
|
|
167
|
+
}
|
|
168
|
+
obs := NewNoopObserver()
|
|
169
|
+
config := DefaultContextOverflowConfig()
|
|
170
|
+
reducer := newContextReducer(rlm, config, obs)
|
|
171
|
+
|
|
172
|
+
budget := reducer.getResponseTokenBudget(32768)
|
|
173
|
+
if budget != 5000 {
|
|
174
|
+
t.Errorf("expected response token budget 5000, got %d", budget)
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
func TestGetResponseTokenBudget_NoMaxTokens(t *testing.T) {
|
|
179
|
+
rlm := &RLM{
|
|
180
|
+
extraParams: map[string]interface{}{
|
|
181
|
+
"temperature": 0.7,
|
|
182
|
+
},
|
|
183
|
+
}
|
|
184
|
+
obs := NewNoopObserver()
|
|
185
|
+
config := DefaultContextOverflowConfig()
|
|
186
|
+
reducer := newContextReducer(rlm, config, obs)
|
|
187
|
+
|
|
188
|
+
budget := reducer.getResponseTokenBudget(32768)
|
|
189
|
+
if budget != 0 {
|
|
190
|
+
t.Errorf("expected response token budget 0, got %d", budget)
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
func TestMakeMapPhaseParams(t *testing.T) {
|
|
195
|
+
rlm := &RLM{
|
|
196
|
+
extraParams: map[string]interface{}{
|
|
197
|
+
"max_tokens": float64(10000),
|
|
198
|
+
"custom_llm_provider": "vllm",
|
|
199
|
+
"temperature": 0.7,
|
|
200
|
+
},
|
|
201
|
+
}
|
|
202
|
+
obs := NewNoopObserver()
|
|
203
|
+
config := DefaultContextOverflowConfig()
|
|
204
|
+
reducer := newContextReducer(rlm, config, obs)
|
|
205
|
+
|
|
206
|
+
params := reducer.makeMapPhaseParams(32768)
|
|
207
|
+
|
|
208
|
+
// max_tokens should be capped (32768/4 = 8192, but cap is 2000)
|
|
209
|
+
maxTokens, ok := params["max_tokens"].(int)
|
|
210
|
+
if !ok {
|
|
211
|
+
t.Fatal("expected max_tokens to be int in map phase params")
|
|
212
|
+
}
|
|
213
|
+
if maxTokens > 2000 {
|
|
214
|
+
t.Errorf("expected map phase max_tokens <= 2000, got %d", maxTokens)
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
// custom_llm_provider should be preserved
|
|
218
|
+
if params["custom_llm_provider"] != "vllm" {
|
|
219
|
+
t.Errorf("expected custom_llm_provider to be preserved, got %v", params["custom_llm_provider"])
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
// temperature should be preserved
|
|
223
|
+
if params["temperature"] != 0.7 {
|
|
224
|
+
t.Errorf("expected temperature to be preserved, got %v", params["temperature"])
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
|
|
110
228
|
func TestContextOverflowError_OverflowRatio(t *testing.T) {
|
|
111
229
|
tests := []struct {
|
|
112
230
|
limit int
|
|
@@ -526,10 +644,10 @@ func TestContextOverflowError_ErrorChain(t *testing.T) {
|
|
|
526
644
|
if coe.APIError == nil {
|
|
527
645
|
t.Fatal("expected embedded APIError to be non-nil")
|
|
528
646
|
}
|
|
529
|
-
if coe.
|
|
530
|
-
t.Errorf("expected status 400, got %d", coe.
|
|
647
|
+
if coe.StatusCode != 400 {
|
|
648
|
+
t.Errorf("expected status 400, got %d", coe.StatusCode)
|
|
531
649
|
}
|
|
532
|
-
if coe.
|
|
650
|
+
if coe.RLMError == nil {
|
|
533
651
|
t.Fatal("expected embedded RLMError to be non-nil")
|
|
534
652
|
}
|
|
535
653
|
|
package/go/rlm/errors.go
CHANGED
|
@@ -192,7 +192,31 @@ func parseContextOverflowMessage(msg string) (modelLimit int, requestTokens int,
|
|
|
192
192
|
}
|
|
193
193
|
}
|
|
194
194
|
|
|
195
|
-
// Pattern 3: "max_tokens
|
|
195
|
+
// Pattern 3: "max_tokens is too large" - response budget exceeds remaining capacity
|
|
196
|
+
// vLLM/OpenAI: "max_tokens' or 'max_completion_tokens' is too large: 10000.
|
|
197
|
+
// This model's maximum context length is 32768 tokens and your request has 30168 input tokens"
|
|
198
|
+
// In this case, input tokens < model limit, but input + max_tokens > model limit.
|
|
199
|
+
// We report the effective total (input + max_tokens) as requestTokens.
|
|
200
|
+
if strings.Contains(lowerMsg, "max_tokens") && strings.Contains(lowerMsg, "too large") {
|
|
201
|
+
limit := extractNumber(msg, "maximum context length is ", " tokens")
|
|
202
|
+
inputTokens := extractNumber(msg, "your request has ", " input tokens")
|
|
203
|
+
if inputTokens == 0 {
|
|
204
|
+
inputTokens = extractNumber(msg, "your request has ", " tokens")
|
|
205
|
+
}
|
|
206
|
+
maxTokens := extractNumber(msg, "too large: ", ".")
|
|
207
|
+
if maxTokens == 0 {
|
|
208
|
+
maxTokens = extractNumber(msg, "too large: ", " ")
|
|
209
|
+
}
|
|
210
|
+
if limit > 0 && inputTokens > 0 && maxTokens > 0 {
|
|
211
|
+
return limit, inputTokens + maxTokens, true
|
|
212
|
+
}
|
|
213
|
+
// Fallback: if we got limit and input tokens, treat input as the overflow
|
|
214
|
+
if limit > 0 && inputTokens > 0 {
|
|
215
|
+
return limit, inputTokens, true
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
// Pattern 4: "input too long" / "too many tokens" generic patterns
|
|
196
220
|
if strings.Contains(lowerMsg, "input too long") || strings.Contains(lowerMsg, "too many tokens") || strings.Contains(lowerMsg, "too many input tokens") {
|
|
197
221
|
limit := extractNumber(msg, "limit is ", " tokens")
|
|
198
222
|
if limit == 0 {
|
package/go/rlm/structured.go
CHANGED
|
@@ -843,7 +843,7 @@ func buildValidationFeedback(validationErr error, schema *JSONSchema, previousRe
|
|
|
843
843
|
|
|
844
844
|
var feedback strings.Builder
|
|
845
845
|
feedback.WriteString("VALIDATION ERROR - Your previous response was invalid.\n\n")
|
|
846
|
-
|
|
846
|
+
fmt.Fprintf(&feedback, "ERROR: %s\n\n", errMsg)
|
|
847
847
|
|
|
848
848
|
// Extract what field caused the issue
|
|
849
849
|
if strings.Contains(errMsg, "missing required field:") {
|
|
@@ -852,17 +852,17 @@ func buildValidationFeedback(validationErr error, schema *JSONSchema, previousRe
|
|
|
852
852
|
fieldName = strings.TrimSpace(fieldName)
|
|
853
853
|
|
|
854
854
|
feedback.WriteString("SPECIFIC ISSUE:\n")
|
|
855
|
-
|
|
855
|
+
fmt.Fprintf(&feedback, "The field '%s' is REQUIRED but was not provided.\n\n", fieldName)
|
|
856
856
|
|
|
857
857
|
// Find the schema for this field and provide details
|
|
858
858
|
if schema.Type == "object" && schema.Properties != nil {
|
|
859
859
|
if fieldSchema, exists := schema.Properties[fieldName]; exists {
|
|
860
860
|
feedback.WriteString("FIELD REQUIREMENTS:\n")
|
|
861
|
-
|
|
862
|
-
|
|
861
|
+
fmt.Fprintf(&feedback, "- Field name: '%s'\n", fieldName)
|
|
862
|
+
fmt.Fprintf(&feedback, "- Type: %s\n", fieldSchema.Type)
|
|
863
863
|
|
|
864
864
|
if fieldSchema.Type == "object" && len(fieldSchema.Required) > 0 {
|
|
865
|
-
|
|
865
|
+
fmt.Fprintf(&feedback, "- This is an object with required fields: %s\n", strings.Join(fieldSchema.Required, ", "))
|
|
866
866
|
|
|
867
867
|
if fieldSchema.Properties != nil {
|
|
868
868
|
feedback.WriteString("\nNESTED FIELD DETAILS:\n")
|
|
@@ -872,13 +872,13 @@ func buildValidationFeedback(validationErr error, schema *JSONSchema, previousRe
|
|
|
872
872
|
if isRequired {
|
|
873
873
|
requiredMark = " [REQUIRED]"
|
|
874
874
|
}
|
|
875
|
-
|
|
875
|
+
fmt.Fprintf(&feedback, " - %s: %s%s\n", nestedField, nestedSchema.Type, requiredMark)
|
|
876
876
|
}
|
|
877
877
|
}
|
|
878
878
|
}
|
|
879
879
|
|
|
880
880
|
if fieldSchema.Type == "array" && fieldSchema.Items != nil {
|
|
881
|
-
|
|
881
|
+
fmt.Fprintf(&feedback, "- This is an array of: %s\n", fieldSchema.Items.Type)
|
|
882
882
|
}
|
|
883
883
|
}
|
|
884
884
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "recursive-llm-ts",
|
|
3
|
-
"version": "4.6.0",
|
|
3
|
+
"version": "4.7.0",
|
|
4
4
|
"description": "TypeScript bridge for recursive-llm: Recursive Language Models for unbounded context processing with structured outputs",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|