npm - recursive-llm-ts - Versions diffs - 4.6.0 → 4.8.0 - Mend

recursive-llm-ts 4.6.0 → 4.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/bin/rlm-go +0 -0
package/dist/bridge-interface.d.ts +1 -0
package/go/rlm/context_overflow.go +215 -78
package/go/rlm/context_overflow_test.go +491 -3
package/go/rlm/errors.go +25 -1
package/go/rlm/rlm.go +75 -0
package/go/rlm/structured.go +21 -7
package/package.json +1 -1

package/bin/rlm-go CHANGED Viewed

Binary file

package/dist/bridge-interface.d.ts CHANGED Viewed

@@ -65,6 +65,7 @@ export interface RLMConfig {
     temperature?: number;
     max_tokens?: number;
     structured?: any;
+    [key: string]: any;
 }
 export interface FileStorageConfig {
     /** Storage type: 'local' or 's3' */

package/go/rlm/context_overflow.go CHANGED Viewed

@@ -30,6 +30,155 @@ func DefaultContextOverflowConfig() ContextOverflowConfig {
 	}
 }
+// ─── Model Token Limits ──────────────────────────────────────────────────────
+// modelTokenLimits maps known model-name prefixes (lowercase) to the maximum
+// context window, in tokens, of the matching model family.
+// LookupModelTokenLimit consults it for pre-emptive overflow detection so a
+// request can be sized before the API rejects it.
+//
+// Lookup is exact-name first, then longest prefix, so a more specific key
+// (e.g. "gpt-4o-mini") always wins over a shorter one ("gpt-4").
+// Anthropic families are listed in both dotted ("claude-3.5-sonnet") and
+// hyphenated ("claude-3-5-sonnet") spellings because the vendor's API model IDs
+// use hyphens while proxies frequently expose the dotted form.
+var modelTokenLimits = map[string]int{
+	// OpenAI
+	"gpt-4o":            128000,
+	"gpt-4o-mini":       128000,
+	"gpt-4-turbo":       128000,
+	"gpt-4":             8192,
+	"gpt-4-32k":         32768,
+	"gpt-4.1":           1047576,
+	"gpt-3.5-turbo":     16385,
+	"gpt-3.5-turbo-16k": 16385,
+	"o1":                200000,
+	"o1-mini":           128000,
+	"o1-preview":        128000,
+	"o3-mini":           200000,
+	// Anthropic (via LiteLLM/proxy)
+	"claude-3-opus":     200000,
+	"claude-3-sonnet":   200000,
+	"claude-3-haiku":    200000,
+	"claude-3.5-sonnet": 200000,
+	"claude-3.5-haiku":  200000,
+	"claude-3-5-sonnet": 200000,
+	"claude-3-5-haiku":  200000,
+	"claude-3-7-sonnet": 200000,
+	"claude-sonnet-4":   200000,
+	"claude-opus-4":     200000,
+	// Llama (common vLLM deployments)
+	"llama-3":   8192,
+	"llama-3.1": 128000,
+	"llama-3.2": 128000,
+	"llama-3.3": 128000,
+	// Mistral
+	"mistral-7b":    32768,
+	"mixtral-8x7b":  32768,
+	"mistral-large": 128000,
+	"mistral-small": 128000,
+	// Qwen
+	"qwen-2":   32768,
+	"qwen-2.5": 128000,
+}
+// LookupModelTokenLimit returns the known context-window size, in tokens, for
+// model, or 0 when the model is not recognised.
+//
+// Matching is case-insensitive and proceeds from most to least specific:
+//  1. exact match on the full name;
+//  2. exact match after dropping a provider/organisation path prefix, so
+//     "openai/gpt-4o" resolves like "gpt-4o";
+//  3. the longest table key that is a prefix of the name AND is followed by a
+//     separator ('-', ':', '_' or '@'), so "gpt-4o-mini-2024-07-18" matches
+//     "gpt-4o-mini".
+//
+// The separator requirement keeps a short key from claiming a different model
+// family: "gpt-4.1" must not inherit the 8192-token limit of "gpt-4". Returning
+// 0 (unknown) is the safer answer there, because an under-estimated limit would
+// trigger needless, lossy context reduction.
+func LookupModelTokenLimit(model string) int {
+	name := strings.ToLower(model)
+	if limit, ok := modelTokenLimits[name]; ok {
+		return limit
+	}
+	// Strip a routing prefix such as "openai/" or "meta-llama/".
+	if slash := strings.LastIndexByte(name, '/'); slash >= 0 {
+		name = name[slash+1:]
+		if limit, ok := modelTokenLimits[name]; ok {
+			return limit
+		}
+	}
+	const separators = "-:_@"
+	bestLen, bestLimit := 0, 0
+	for pattern, limit := range modelTokenLimits {
+		if len(pattern) <= bestLen || !strings.HasPrefix(name, pattern) {
+			continue
+		}
+		// An exact match was ruled out above, so name is strictly longer than pattern.
+		if strings.IndexByte(separators, name[len(pattern)]) < 0 {
+			continue
+		}
+		bestLen, bestLimit = len(pattern), limit
+	}
+	return bestLimit
+}
+// getModelTokenLimit resolves the token limit used by pre-emptive overflow checks.
+// A positive MaxModelTokens in the overflow config takes precedence; otherwise the
+// limit comes from LookupModelTokenLimit on the engine's model name, which yields
+// 0 for unknown models (callers treat 0 as "check disabled").
+func (r *RLM) getModelTokenLimit() int {
+	if cfg := r.contextOverflow; cfg != nil {
+		if override := cfg.MaxModelTokens; override > 0 {
+			return override
+		}
+	}
+	return LookupModelTokenLimit(r.model)
+}
+// ─── Pre-emptive Overflow Check ──────────────────────────────────────────────
+// structuredPromptOverhead is the approximate token overhead for structured completion prompts
+// (instructions, schema constraints, JSON formatting directives).
+// It is an estimate rather than a measured value. NOTE(review): presumably supplied as the
+// extraOverhead argument of PreemptiveReduceContext by the structured-completion path — confirm.
+const structuredPromptOverhead = 350
+// PreemptiveReduceContext checks whether context would overflow the model's token
+// limit and, if so, reduces it BEFORE the prompt is built.
+//
+// Parameters:
+//   - query: the user query; its estimated tokens count against the budget.
+//   - context: the document text that may be reduced.
+//   - extraOverhead: additional prompt tokens the caller will add (system prompt,
+//     schema instructions, ...).
+//
+// Returns the context to use, whether it was reduced, and an error. On error the
+// ORIGINAL context is returned alongside it, so a caller may still attempt the
+// request and rely on post-hoc overflow recovery.
+//
+// The check is skipped (context returned unchanged, false, nil) when overflow
+// handling is not configured or disabled, or when no token limit is known for
+// the model.
+//
+// This is called before the first LLM call, unlike post-hoc overflow recovery which
+// only triggers after an API error. Following the RLM paper's principle that
+// "the context window of the root LM is rarely clogged."
+func (r *RLM) PreemptiveReduceContext(query string, context string, extraOverhead int) (string, bool, error) {
+	if r.contextOverflow == nil || !r.contextOverflow.Enabled {
+		return context, false, nil
+	}
+	modelLimit := r.getModelTokenLimit()
+	if modelLimit <= 0 {
+		// No known limit; skip pre-emptive check (will rely on post-hoc recovery)
+		return context, false, nil
+	}
+	// The engine may have been built without an observer; fall back to a no-op
+	// one instead of dereferencing nil.
+	obs := r.observer
+	if obs == nil {
+		obs = NewNoopObserver()
+	}
+	// Work on a copy so the defaulted safety margin is the one the reducer uses
+	// too; otherwise the check would reserve 15% while the reduction reserves 0%.
+	cfg := *r.contextOverflow
+	if cfg.SafetyMargin <= 0 {
+		cfg.SafetyMargin = 0.15
+	}
+	// Estimate total token budget needed
+	contextTokens := EstimateTokens(context)
+	queryTokens := EstimateTokens(query)
+	responseTokens := r.getResponseTokenBudget()
+	safetyTokens := int(float64(modelLimit) * cfg.SafetyMargin)
+	totalEstimate := contextTokens + queryTokens + extraOverhead + responseTokens + safetyTokens
+	obs.Debug("overflow", "Pre-emptive check: context=%d query=%d overhead=%d response=%d safety=%d total=%d limit=%d",
+		contextTokens, queryTokens, extraOverhead, responseTokens,
+		safetyTokens, totalEstimate, modelLimit)
+	if totalEstimate <= modelLimit {
+		return context, false, nil
+	}
+	// Context would overflow — reduce it proactively
+	obs.Debug("overflow", "Pre-emptive reduction needed: estimated %d tokens > limit %d", totalEstimate, modelLimit)
+	reducer := newContextReducer(r, cfg, obs)
+	reduced, err := reducer.ReduceForCompletion(query, context, modelLimit)
+	if err != nil {
+		return context, false, fmt.Errorf("pre-emptive context reduction failed: %w", err)
+	}
+	obs.Debug("overflow", "Pre-emptive reduction: %d -> %d chars", len(context), len(reduced))
+	return reduced, true, nil
+}
+// getResponseTokenBudget reports how many tokens the caller reserved for the
+// model's reply, read from ExtraParams. "max_completion_tokens" is consulted
+// before "max_tokens"; a key whose value is not a float64, int or int64 is
+// skipped. Returns 0 when neither key yields a number.
+func (r *RLM) getResponseTokenBudget() int {
+	params := r.extraParams
+	if params == nil {
+		return 0
+	}
+	// Order matters: the first key holding a usable number wins.
+	candidates := [...]string{"max_completion_tokens", "max_tokens"}
+	for i := range candidates {
+		raw, present := params[candidates[i]]
+		if !present {
+			continue
+		}
+		if f, isFloat := raw.(float64); isFloat {
+			return int(f)
+		}
+		if n, isInt := raw.(int); isInt {
+			return n
+		}
+		if n, isInt64 := raw.(int64); isInt64 {
+			return int(n)
+		}
+	}
+	return 0
+}
 // ─── Token Estimation ────────────────────────────────────────────────────────
 // EstimateTokens provides a fast approximation of token count for a string.
@@ -168,21 +317,53 @@ func newContextReducer(rlm *RLM, config ContextOverflowConfig, obs *Observer) *c
 	return &contextReducer{rlm: rlm, config: config, obs: obs}
 }
+// getResponseTokenBudget returns the response-token budget for this reducer by
+// forwarding to the owning RLM engine's getResponseTokenBudget method.
+func (cr *contextReducer) getResponseTokenBudget() int {
+	return cr.rlm.getResponseTokenBudget()
+}
+// makeMapPhaseParams builds the ExtraParams used for map-phase (summarization)
+// LLM calls. The result is a fresh map: every user param is carried over
+// (custom_llm_provider, temperature, etc.) except "max_completion_tokens", which
+// is dropped so it cannot conflict with the "max_tokens" value set here.
+//
+// "max_tokens" is set to modelLimit/4 clamped to the range [256, 2000].
+func (cr *contextReducer) makeMapPhaseParams(modelLimit int) map[string]interface{} {
+	const (
+		mapTokensFloor   = 256
+		mapTokensCeiling = 2000
+	)
+	budget := modelLimit / 4
+	switch {
+	case budget > mapTokensCeiling:
+		budget = mapTokensCeiling
+	case budget < mapTokensFloor:
+		budget = mapTokensFloor
+	}
+	out := make(map[string]interface{})
+	for name, value := range cr.rlm.extraParams {
+		if name == "max_completion_tokens" {
+			continue
+		}
+		out[name] = value
+	}
+	out["max_tokens"] = budget
+	return out
+}
 // ReduceForCompletion handles context overflow for a regular completion.
 // It chunks the context, summarizes each chunk, and combines the summaries.
 func (cr *contextReducer) ReduceForCompletion(query string, context string, modelLimit int) (string, error) {
-	cr.obs.Debug("overflow", "Starting MapReduce context reduction: %d estimated tokens, limit %d", EstimateTokens(context), modelLimit)
+	cr.obs.Debug("overflow", "Starting context reduction: %d estimated tokens, limit %d", EstimateTokens(context), modelLimit)
 	// Calculate safe token budget per chunk
-	// Reserve tokens for: system prompt (~500), query, overhead, safety margin
+	// Reserve tokens for: system prompt (~500), query, overhead, safety margin, response budget
 	queryTokens := EstimateTokens(query)
-	overhead := 500 + queryTokens + int(float64(modelLimit)*cr.config.SafetyMargin)
+	responseTokens := cr.getResponseTokenBudget()
+	overhead := 500 + queryTokens + int(float64(modelLimit)*cr.config.SafetyMargin) + responseTokens
 	safeTokensPerChunk := modelLimit - overhead
 	if safeTokensPerChunk <= 0 {
-		safeTokensPerChunk = modelLimit / 2
+		safeTokensPerChunk = modelLimit / 4
 	}
+	cr.obs.Debug("overflow", "Budget: overhead=%d (query=%d, response=%d, safety=%d), chunk budget=%d",
+		overhead, queryTokens, responseTokens, int(float64(modelLimit)*cr.config.SafetyMargin), safeTokensPerChunk)
 	chunks := ChunkContext(context, safeTokensPerChunk)
 	cr.obs.Debug("overflow", "Split context into %d chunks (budget: %d tokens/chunk)", len(chunks), safeTokensPerChunk)
@@ -211,6 +392,9 @@ func (cr *contextReducer) ReduceForCompletion(query string, context string, mode
 func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, modelLimit int, overhead int) (string, error) {
 	cr.obs.Debug("overflow", "Using MapReduce strategy with %d chunks", len(chunks))
+	// Use map-phase-specific params with reduced max_tokens for summarization
+	mapPhaseParams := cr.makeMapPhaseParams(modelLimit)
 	summaries := make([]string, len(chunks))
 	errs := make([]error, len(chunks))
 	var wg sync.WaitGroup
@@ -239,7 +423,7 @@ func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, model
 				APIBase:     cr.rlm.apiBase,
 				APIKey:      cr.rlm.apiKey,
 				Timeout:     cr.rlm.timeoutSeconds,
-				ExtraParams: cr.rlm.extraParams,
+				ExtraParams: mapPhaseParams,
 			})
 			if err != nil {
 				errs[idx] = fmt.Errorf("map phase chunk %d: %w", idx+1, err)
@@ -254,14 +438,22 @@ func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, model
 	wg.Wait()
-	// Check for errors
+	// Check for errors - if map phase overflows, fall back to tfidf
 	var mapErrors []string
+	hasOverflow := false
 	for _, err := range errs {
 		if err != nil {
 			mapErrors = append(mapErrors, err.Error())
+			if _, isOverflow := IsContextOverflow(err); isOverflow {
+				hasOverflow = true
+			}
 		}
 	}
 	if len(mapErrors) > 0 {
+		if hasOverflow {
+			cr.obs.Debug("overflow", "MapReduce map phase hit overflow, falling back to TF-IDF strategy")
+			return cr.reduceByTFIDF(strings.Join(chunks, "\n\n"), modelLimit, overhead)
+		}
 		return "", fmt.Errorf("MapReduce map phase failed: %s", strings.Join(mapErrors, "; "))
 	}
@@ -306,6 +498,9 @@ func (cr *contextReducer) reduceByTruncation(context string, modelLimit int, ove
 func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []string, modelLimit int, overhead int) (string, error) {
 	cr.obs.Debug("overflow", "Using chunked extraction strategy with %d chunks", len(chunks))
+	// Use map-phase-specific params with reduced max_tokens
+	mapPhaseParams := cr.makeMapPhaseParams(modelLimit)
 	results := make([]string, len(chunks))
 	errs := make([]error, len(chunks))
 	var wg sync.WaitGroup
@@ -333,7 +528,7 @@ func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []strin
 				APIBase:     cr.rlm.apiBase,
 				APIKey:      cr.rlm.apiKey,
 				Timeout:     cr.rlm.timeoutSeconds,
-				ExtraParams: cr.rlm.extraParams,
+				ExtraParams: mapPhaseParams,
 			})
 			if err != nil {
 				errs[idx] = fmt.Errorf("chunked extraction chunk %d: %w", idx+1, err)
@@ -350,12 +545,20 @@ func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []strin
 	wg.Wait()
 	var extractErrors []string
+	hasOverflow := false
 	for _, err := range errs {
 		if err != nil {
 			extractErrors = append(extractErrors, err.Error())
+			if _, isOverflow := IsContextOverflow(err); isOverflow {
+				hasOverflow = true
+			}
 		}
 	}
 	if len(extractErrors) > 0 {
+		if hasOverflow {
+			cr.obs.Debug("overflow", "Chunked extraction hit overflow, falling back to TF-IDF strategy")
+			return cr.reduceByTFIDF(strings.Join(chunks, "\n\n"), modelLimit, overhead)
+		}
 		return "", fmt.Errorf("chunked extraction failed: %s", strings.Join(extractErrors, "; "))
 	}
@@ -387,6 +590,9 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
 		return "", fmt.Errorf("refine strategy: no chunks to process")
 	}
+	// Use map-phase-specific params with reduced max_tokens
+	mapPhaseParams := cr.makeMapPhaseParams(modelLimit)
 	// Phase 1: Generate initial answer from the first chunk
 	initialPrompt := fmt.Sprintf(
 		"Using the following context, provide a comprehensive answer to the question.\n"+
@@ -406,7 +612,7 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
 		APIBase:     cr.rlm.apiBase,
 		APIKey:      cr.rlm.apiKey,
 		Timeout:     cr.rlm.timeoutSeconds,
-		ExtraParams: cr.rlm.extraParams,
+		ExtraParams: mapPhaseParams,
 	})
 	if err != nil {
 		return "", fmt.Errorf("refine initial chunk: %w", err)
@@ -438,7 +644,7 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
 			APIBase:     cr.rlm.apiBase,
 			APIKey:      cr.rlm.apiKey,
 			Timeout:     cr.rlm.timeoutSeconds,
-			ExtraParams: cr.rlm.extraParams,
+			ExtraParams: mapPhaseParams,
 		})
 		if err != nil {
 			cr.obs.Debug("overflow", "Refine: chunk %d/%d failed: %v, keeping current answer", i+1, len(chunks), err)
@@ -495,72 +701,3 @@ func (cr *contextReducer) reduceByTextRank(context string, modelLimit int, overh
 	return result, nil
 }
-// ─── Adaptive Completion with Overflow Recovery ──────────────────────────────
-// completionWithOverflowRecovery wraps a completion call with automatic overflow detection and retry.
-// When a context overflow error is detected, it reduces the context and retries.
-func (r *RLM) completionWithOverflowRecovery(query string, context string, overflowConfig ContextOverflowConfig) (string, RLMStats, error) {
-	obs := r.observer
-	if obs == nil {
-		obs = NewNoopObserver()
-	}
-	// Try the normal completion first
-	result, stats, err := r.Completion(query, context)
-	if err == nil {
-		return result, stats, nil
-	}
-	// Check if it's a context overflow error
-	coe, isOverflow := IsContextOverflow(err)
-	if !isOverflow {
-		return "", stats, err // Not an overflow error, return original error
-	}
-	obs.Debug("overflow", "Context overflow detected: model limit %d, request %d tokens (%.1f%% over)",
-		coe.ModelLimit, coe.RequestTokens, (coe.OverflowRatio()-1)*100)
-	// Use detected limit or configured limit
-	modelLimit := coe.ModelLimit
-	if overflowConfig.MaxModelTokens > 0 {
-		modelLimit = overflowConfig.MaxModelTokens
-	}
-	reducer := newContextReducer(r, overflowConfig, obs)
-	// Attempt context reduction and retry
-	for attempt := 0; attempt < overflowConfig.MaxReductionAttempts; attempt++ {
-		obs.Debug("overflow", "Reduction attempt %d/%d", attempt+1, overflowConfig.MaxReductionAttempts)
-		reducedContext, reduceErr := reducer.ReduceForCompletion(query, context, modelLimit)
-		if reduceErr != nil {
-			obs.Error("overflow", "Context reduction failed: %v", reduceErr)
-			return "", stats, fmt.Errorf("context overflow recovery failed: %w", reduceErr)
-		}
-		obs.Debug("overflow", "Context reduced: %d -> %d chars", len(context), len(reducedContext))
-		// Retry with reduced context
-		result, stats, err = r.Completion(query, reducedContext)
-		if err == nil {
-			obs.Event("overflow.recovery_success", map[string]string{
-				"attempt":          fmt.Sprintf("%d", attempt+1),
-				"original_chars":   fmt.Sprintf("%d", len(context)),
-				"reduced_chars":    fmt.Sprintf("%d", len(reducedContext)),
-				"reduction_ratio":  fmt.Sprintf("%.2f", float64(len(reducedContext))/float64(len(context))),
-			})
-			return result, stats, nil
-		}
-		// If it overflows again, use the reduced context for the next attempt
-		if _, stillOverflow := IsContextOverflow(err); stillOverflow {
-			context = reducedContext
-			continue
-		}
-		// Different error, return it
-		return "", stats, err
-	}
-	return "", stats, fmt.Errorf("context overflow: exceeded %d reduction attempts, model limit is %d tokens", overflowConfig.MaxReductionAttempts, modelLimit)
-}

package/go/rlm/context_overflow_test.go CHANGED Viewed

@@ -107,6 +107,124 @@ func TestIsContextOverflow_GenericError(t *testing.T) {
 	}
 }
+// TestIsContextOverflow_MaxTokensTooLarge_vLLM feeds the vLLM/Ray Serve rejection
+// raised when max_tokens exceeds the remaining capacity and checks that it is
+// classified as a context overflow with the limit and effective request size parsed.
+func TestIsContextOverflow_MaxTokensTooLarge_vLLM(t *testing.T) {
+	// This is the exact error from the user's production logs
+	const body = `{"object":"error","message":"'max_tokens' or 'max_completion_tokens' is too large: 10000. This model's maximum context length is 32768 tokens and your request has 30168 input tokens (10000 > 32768 - 30168)","type":"BadRequestError","param":null,"code":400}`
+	overflow, detected := IsContextOverflow(NewAPIError(400, body))
+	if !detected {
+		t.Fatal("expected IsContextOverflow to detect max_tokens too large error")
+	}
+	if got := overflow.ModelLimit; got != 32768 {
+		t.Errorf("expected ModelLimit 32768, got %d", got)
+	}
+	// Request tokens should include both input + max_tokens: 30168 + 10000 = 40168
+	if got := overflow.RequestTokens; got != 40168 {
+		t.Errorf("expected RequestTokens 40168 (input 30168 + max_tokens 10000), got %d", got)
+	}
+}
+// TestIsContextOverflow_MaxCompletionTokensTooLarge covers the OpenAI-style
+// envelope (nested "error" object) for the same "too large" rejection.
+func TestIsContextOverflow_MaxCompletionTokensTooLarge(t *testing.T) {
+	// OpenAI newer API format with max_completion_tokens
+	const body = `{"error":{"message":"'max_tokens' or 'max_completion_tokens' is too large: 5000. This model's maximum context length is 16384 tokens and your request has 14000 input tokens","type":"invalid_request_error","code":"invalid_request_error"}}`
+	overflow, detected := IsContextOverflow(NewAPIError(400, body))
+	if !detected {
+		t.Fatal("expected IsContextOverflow to detect max_completion_tokens too large error")
+	}
+	if got := overflow.ModelLimit; got != 16384 {
+		t.Errorf("expected ModelLimit 16384, got %d", got)
+	}
+	if got := overflow.RequestTokens; got != 19000 {
+		t.Errorf("expected RequestTokens 19000 (input 14000 + max_tokens 5000), got %d", got)
+	}
+}
+// TestGetResponseTokenBudget checks that the reducer reports a float64
+// "max_tokens" from the engine's ExtraParams as the response budget.
+func TestGetResponseTokenBudget(t *testing.T) {
+	engine := &RLM{extraParams: map[string]interface{}{"max_tokens": float64(10000)}}
+	reducer := newContextReducer(engine, DefaultContextOverflowConfig(), NewNoopObserver())
+	if got := reducer.getResponseTokenBudget(); got != 10000 {
+		t.Errorf("expected response token budget 10000, got %d", got)
+	}
+}
+// TestGetResponseTokenBudget_MaxCompletionTokens checks that
+// "max_completion_tokens" alone is honoured as the response budget.
+func TestGetResponseTokenBudget_MaxCompletionTokens(t *testing.T) {
+	engine := &RLM{extraParams: map[string]interface{}{"max_completion_tokens": float64(5000)}}
+	reducer := newContextReducer(engine, DefaultContextOverflowConfig(), NewNoopObserver())
+	if got := reducer.getResponseTokenBudget(); got != 5000 {
+		t.Errorf("expected response token budget 5000, got %d", got)
+	}
+}
+// TestGetResponseTokenBudget_NoMaxTokens checks that unrelated ExtraParams
+// yield a zero response budget.
+func TestGetResponseTokenBudget_NoMaxTokens(t *testing.T) {
+	engine := &RLM{extraParams: map[string]interface{}{"temperature": 0.7}}
+	reducer := newContextReducer(engine, DefaultContextOverflowConfig(), NewNoopObserver())
+	if got := reducer.getResponseTokenBudget(); got != 0 {
+		t.Errorf("expected response token budget 0, got %d", got)
+	}
+}
+// TestMakeMapPhaseParams verifies the map-phase parameter set: max_tokens is
+// replaced by the capped value, max_completion_tokens is removed, every other
+// user parameter is carried over, and the engine's own ExtraParams are untouched.
+func TestMakeMapPhaseParams(t *testing.T) {
+	rlm := &RLM{
+		extraParams: map[string]interface{}{
+			"max_tokens":            float64(10000),
+			"max_completion_tokens": float64(10000),
+			"custom_llm_provider":   "vllm",
+			"temperature":           0.7,
+		},
+	}
+	obs := NewNoopObserver()
+	config := DefaultContextOverflowConfig()
+	reducer := newContextReducer(rlm, config, obs)
+	params := reducer.makeMapPhaseParams(32768)
+	// 32768/4 = 8192 exceeds the 2000 cap, so the cap itself is expected;
+	// asserting the exact value also rules out a zero or floor result.
+	maxTokens, ok := params["max_tokens"].(int)
+	if !ok {
+		t.Fatal("expected max_tokens to be int in map phase params")
+	}
+	if maxTokens != 2000 {
+		t.Errorf("expected map phase max_tokens == 2000, got %d", maxTokens)
+	}
+	// max_completion_tokens must not survive next to the overridden max_tokens
+	if _, present := params["max_completion_tokens"]; present {
+		t.Error("expected max_completion_tokens to be removed from map phase params")
+	}
+	// custom_llm_provider should be preserved
+	if params["custom_llm_provider"] != "vllm" {
+		t.Errorf("expected custom_llm_provider to be preserved, got %v", params["custom_llm_provider"])
+	}
+	// temperature should be preserved
+	if params["temperature"] != 0.7 {
+		t.Errorf("expected temperature to be preserved, got %v", params["temperature"])
+	}
+	// The user's params are shared with the final completion call and must not be mutated
+	if rlm.extraParams["max_tokens"] != float64(10000) {
+		t.Errorf("expected original max_tokens to be untouched, got %v", rlm.extraParams["max_tokens"])
+	}
+	if _, present := rlm.extraParams["max_completion_tokens"]; !present {
+		t.Error("expected original max_completion_tokens to be untouched")
+	}
+}
 func TestContextOverflowError_OverflowRatio(t *testing.T) {
 	tests := []struct {
 		limit    int
@@ -526,10 +644,10 @@ func TestContextOverflowError_ErrorChain(t *testing.T) {
 	if coe.APIError == nil {
 		t.Fatal("expected embedded APIError to be non-nil")
 	}
-	if coe.APIError.StatusCode != 400 {
-		t.Errorf("expected status 400, got %d", coe.APIError.StatusCode)
+	if coe.StatusCode != 400 {
+		t.Errorf("expected status 400, got %d", coe.StatusCode)
 	}
-	if coe.APIError.RLMError == nil {
+	if coe.RLMError == nil {
 		t.Fatal("expected embedded RLMError to be non-nil")
 	}
@@ -781,3 +899,373 @@ func TestReduceForCompletion_DispatchesTextRank(t *testing.T) {
 		t.Errorf("expected reduced context for textrank strategy")
 	}
 }
+// ─── Model Token Limits Tests ────────────────────────────────────────────────
+// TestLookupModelTokenLimit_ExactMatch checks names that appear verbatim in the
+// limits table.
+func TestLookupModelTokenLimit_ExactMatch(t *testing.T) {
+	type lookupCase struct {
+		name string
+		want int
+	}
+	cases := []lookupCase{
+		{name: "gpt-4o", want: 128000},
+		{name: "gpt-4o-mini", want: 128000},
+		{name: "gpt-4", want: 8192},
+		{name: "gpt-4-32k", want: 32768},
+		{name: "gpt-3.5-turbo", want: 16385},
+		{name: "claude-3-opus", want: 200000},
+		{name: "claude-sonnet-4", want: 200000},
+		{name: "mistral-7b", want: 32768},
+	}
+	for i := range cases {
+		c := cases[i]
+		if got := LookupModelTokenLimit(c.name); got != c.want {
+			t.Errorf("LookupModelTokenLimit(%q) = %d, expected %d", c.name, got, c.want)
+		}
+	}
+}
+// TestLookupModelTokenLimit_PrefixMatch checks that versioned model names
+// resolve through their table prefix.
+func TestLookupModelTokenLimit_PrefixMatch(t *testing.T) {
+	type lookupCase struct {
+		name string
+		want int
+	}
+	cases := []lookupCase{
+		{name: "gpt-4o-mini-2024-07-18", want: 128000},
+		{name: "gpt-4o-2024-05-13", want: 128000},
+		{name: "claude-3-opus-20240229", want: 200000},
+		{name: "mistral-7b-instruct-v0.2", want: 32768},
+	}
+	for i := range cases {
+		c := cases[i]
+		if got := LookupModelTokenLimit(c.name); got != c.want {
+			t.Errorf("LookupModelTokenLimit(%q) = %d, expected %d", c.name, got, c.want)
+		}
+	}
+}
+// TestLookupModelTokenLimit_Unknown checks that an unrecognised name yields 0.
+func TestLookupModelTokenLimit_Unknown(t *testing.T) {
+	if got := LookupModelTokenLimit("completely-unknown-model-xyz"); got != 0 {
+		t.Errorf("expected 0 for unknown model, got %d", got)
+	}
+}
+// TestLookupModelTokenLimit_CaseInsensitive checks that an upper-case name
+// resolves like its lower-case form.
+func TestLookupModelTokenLimit_CaseInsensitive(t *testing.T) {
+	if got := LookupModelTokenLimit("GPT-4O-MINI"); got != 128000 {
+		t.Errorf("expected 128000 for case-insensitive match, got %d", got)
+	}
+}
+// TestGetModelTokenLimit_ConfigOverride checks that MaxModelTokens from the
+// overflow config wins over the table entry for the model name.
+func TestGetModelTokenLimit_ConfigOverride(t *testing.T) {
+	overflowCfg := &ContextOverflowConfig{
+		Enabled:        true,
+		MaxModelTokens: 16384,
+	}
+	engine := New("gpt-4o-mini", Config{APIKey: "test", ContextOverflow: overflowCfg})
+	if got := engine.getModelTokenLimit(); got != 16384 {
+		t.Errorf("expected config override 16384, got %d", got)
+	}
+}
+// TestGetModelTokenLimit_ModelLookup checks that, without an override, the
+// limit comes from the model-name table.
+func TestGetModelTokenLimit_ModelLookup(t *testing.T) {
+	engine := New("gpt-4o-mini", Config{APIKey: "test"})
+	if got := engine.getModelTokenLimit(); got != 128000 {
+		t.Errorf("expected model lookup 128000, got %d", got)
+	}
+}
+// TestGetModelTokenLimit_UnknownModel checks that an unlisted model with no
+// override reports 0.
+func TestGetModelTokenLimit_UnknownModel(t *testing.T) {
+	engine := New("custom-local-model", Config{APIKey: "test"})
+	if got := engine.getModelTokenLimit(); got != 0 {
+		t.Errorf("expected 0 for unknown model, got %d", got)
+	}
+}
+// ─── Pre-emptive Overflow Tests ──────────────────────────────────────────────
+// TestPreemptiveReduceContext_SmallContext checks that a short context is
+// returned untouched and is not flagged as reduced.
+func TestPreemptiveReduceContext_SmallContext(t *testing.T) {
+	const original = "This is a small context that easily fits."
+	engine := New("gpt-4o-mini", Config{APIKey: "test"})
+	got, changed, err := engine.PreemptiveReduceContext("What is this?", original, 500)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if changed {
+		t.Error("expected no reduction for small context")
+	}
+	if got != original {
+		t.Error("expected context to be unchanged")
+	}
+}
+// TestPreemptiveReduceContext_LargeContext forces an overflow with a very small
+// configured limit and expects the "truncate" strategy to shorten the context.
+func TestPreemptiveReduceContext_LargeContext(t *testing.T) {
+	overflowCfg := &ContextOverflowConfig{
+		Enabled:        true,
+		MaxModelTokens: 1000, // Very small limit to force overflow
+		Strategy:       "truncate",
+		SafetyMargin:   0.15,
+	}
+	engine := New("gpt-4o-mini", Config{APIKey: "test", ContextOverflow: overflowCfg})
+	// Large context that exceeds the 1000 token limit
+	original := strings.Repeat("The revenue for Q4 was $4.2 billion, representing 23% year-over-year growth. ", 100)
+	got, changed, err := engine.PreemptiveReduceContext("Summarize revenue", original, 300)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if !changed {
+		t.Error("expected context to be reduced")
+	}
+	if len(got) >= len(original) {
+		t.Errorf("expected reduced context to be shorter: %d >= %d", len(got), len(original))
+	}
+}
+// TestPreemptiveReduceContext_DisabledOverflow checks that a large context is
+// passed through untouched when overflow handling is switched off.
+func TestPreemptiveReduceContext_DisabledOverflow(t *testing.T) {
+	overflowCfg := &ContextOverflowConfig{Enabled: false}
+	engine := New("gpt-4o-mini", Config{APIKey: "test", ContextOverflow: overflowCfg})
+	original := strings.Repeat("Large content. ", 10000)
+	got, changed, err := engine.PreemptiveReduceContext("query", original, 500)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if changed {
+		t.Error("expected no reduction when overflow is disabled")
+	}
+	if got != original {
+		t.Error("expected context unchanged when overflow is disabled")
+	}
+}
+// TestPreemptiveReduceContext_UnknownModel checks that, with no table entry and
+// no configured limit, the pre-emptive check is skipped entirely.
+func TestPreemptiveReduceContext_UnknownModel(t *testing.T) {
+	engine := New("custom-local-model", Config{APIKey: "test"})
+	// Unknown model with no config override → no pre-emptive check
+	original := strings.Repeat("Large content. ", 10000)
+	got, changed, err := engine.PreemptiveReduceContext("query", original, 500)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if changed {
+		t.Error("expected no reduction for unknown model with no config limit")
+	}
+	if got != original {
+		t.Error("expected context unchanged")
+	}
+}
+// TestPreemptiveReduceContext_AccountsForResponseBudget checks that the
+// requested response size counts against the limit: with a high max_tokens,
+// even a moderate context should trigger reduction.
+func TestPreemptiveReduceContext_AccountsForResponseBudget(t *testing.T) {
+	overflowCfg := &ContextOverflowConfig{
+		Enabled:        true,
+		MaxModelTokens: 2000,
+		Strategy:       "truncate",
+		SafetyMargin:   0.15,
+	}
+	extra := map[string]interface{}{
+		"max_tokens": float64(1000), // Large response budget
+	}
+	engine := New("gpt-4o-mini", Config{
+		APIKey:          "test",
+		ContextOverflow: overflowCfg,
+		ExtraParams:     extra,
+	})
+	// Context of ~500 tokens + max_tokens 1000 + overhead = exceeds 2000
+	original := strings.Repeat("Revenue data: the company earned $4.2B in Q4 fiscal year. ", 30)
+	got, changed, err := engine.PreemptiveReduceContext("Summarize", original, 300)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if !changed {
+		t.Error("expected reduction when response budget + context exceeds limit")
+	}
+	if len(got) >= len(original) {
+		t.Errorf("expected reduced context: %d >= %d", len(got), len(original))
+	}
+}
+// TestPreemptiveReduceContext_TFIDFStrategy checks that the "tfidf" strategy
+// shortens an overflowing context.
+func TestPreemptiveReduceContext_TFIDFStrategy(t *testing.T) {
+	overflowCfg := &ContextOverflowConfig{
+		Enabled:        true,
+		MaxModelTokens: 500,
+		Strategy:       "tfidf",
+		SafetyMargin:   0.15,
+	}
+	engine := New("gpt-4o-mini", Config{APIKey: "test", ContextOverflow: overflowCfg})
+	original := strings.Repeat("Machine learning models process large datasets effectively. ", 100)
+	got, changed, err := engine.PreemptiveReduceContext("Tell me about ML", original, 200)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if !changed {
+		t.Error("expected reduction with tfidf strategy")
+	}
+	if len(got) >= len(original) {
+		t.Errorf("expected shorter context: %d >= %d", len(got), len(original))
+	}
+}
+// TestPreemptiveReduceContext_TextRankStrategy checks that the "textrank"
+// strategy shortens an overflowing context.
+func TestPreemptiveReduceContext_TextRankStrategy(t *testing.T) {
+	overflowCfg := &ContextOverflowConfig{
+		Enabled:        true,
+		MaxModelTokens: 500,
+		Strategy:       "textrank",
+		SafetyMargin:   0.15,
+	}
+	engine := New("gpt-4o-mini", Config{APIKey: "test", ContextOverflow: overflowCfg})
+	original := strings.Repeat("Neural networks are powerful computation models. ", 100)
+	got, changed, err := engine.PreemptiveReduceContext("Explain neural nets", original, 200)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if !changed {
+		t.Error("expected reduction with textrank strategy")
+	}
+	if len(got) >= len(original) {
+		t.Errorf("expected shorter context: %d >= %d", len(got), len(original))
+	}
+}
+// TestGetResponseTokenBudget_RLMMethod calls the engine method directly and
+// expects the float64 "max_tokens" value back as an int.
+func TestGetResponseTokenBudget_RLMMethod(t *testing.T) {
+	engine := &RLM{extraParams: map[string]interface{}{"max_tokens": float64(5000)}}
+	if got := engine.getResponseTokenBudget(); got != 5000 {
+		t.Errorf("expected 5000, got %d", got)
+	}
+}
+// TestGetResponseTokenBudget_MaxCompletionTokensPreferred checks that, when both
+// keys are present, "max_completion_tokens" wins over "max_tokens".
+func TestGetResponseTokenBudget_MaxCompletionTokensPreferred(t *testing.T) {
+	params := map[string]interface{}{
+		"max_tokens":            float64(5000),
+		"max_completion_tokens": float64(8000),
+	}
+	engine := &RLM{extraParams: params}
+	if got := engine.getResponseTokenBudget(); got != 8000 {
+		t.Errorf("expected max_completion_tokens=8000 preferred, got %d", got)
+	}
+}
+// TestGetResponseTokenBudget_NoParams checks that the engine method reports 0
+// when ExtraParams carries no token-budget key.
+func TestGetResponseTokenBudget_NoParams(t *testing.T) {
+	engine := &RLM{extraParams: map[string]interface{}{"temperature": 0.7}}
+	if got := engine.getResponseTokenBudget(); got != 0 {
+		t.Errorf("expected 0 when no max_tokens set, got %d", got)
+	}
+}
+// ─── Message Pruning Tests ───────────────────────────────────────────────────
+// TestPruneMessages_SmallHistory checks that a three-message history survives
+// pruning intact with a budget of 100.
+func TestPruneMessages_SmallHistory(t *testing.T) {
+	history := []Message{
+		{Role: "system", Content: "You are helpful."},
+		{Role: "user", Content: "Hello"},
+		{Role: "assistant", Content: "Hi there!"},
+	}
+	if kept := len(pruneMessages(history, 100)); kept != 3 {
+		t.Errorf("expected 3 messages (no pruning needed), got %d", kept)
+	}
+}
+// TestPruneMessages_PreservesSystemAndLast checks that, under a very tight
+// budget, pruning still keeps the system prompt first and the most recent
+// user/assistant pair last.
+func TestPruneMessages_PreservesSystemAndLast(t *testing.T) {
+	longQuestion := strings.Repeat("Third question with lots of context. ", 100)
+	history := []Message{
+		{Role: "system", Content: "System prompt"},
+		{Role: "user", Content: "First question"},
+		{Role: "assistant", Content: "First answer"},
+		{Role: "user", Content: "Second question"},
+		{Role: "assistant", Content: "Second answer"},
+		{Role: "user", Content: longQuestion},
+		{Role: "assistant", Content: "Third answer"},
+	}
+	pruned := pruneMessages(history, 50) // Very tight budget
+	n := len(pruned)
+	// Should always keep system prompt (first) and last 2 messages
+	if n < 3 {
+		t.Errorf("expected at least 3 messages, got %d", n)
+	}
+	if pruned[0].Role != "system" {
+		t.Error("first message should be system prompt")
+	}
+	if pruned[n-1].Content != "Third answer" {
+		t.Error("last message should be the most recent")
+	}
+	if pruned[n-2].Role != "user" {
+		t.Error("second-to-last should be the most recent user message")
+	}
+}
+// TestPruneMessages_KeepsRecentMiddleMessages checks that nothing is dropped
+// when the budget is large enough for the whole seven-message history.
+func TestPruneMessages_KeepsRecentMiddleMessages(t *testing.T) {
+	history := []Message{
+		{Role: "system", Content: "Short."},
+		{Role: "user", Content: "Q1"},
+		{Role: "assistant", Content: "A1"},
+		{Role: "user", Content: "Q2"},
+		{Role: "assistant", Content: "A2"},
+		{Role: "user", Content: "Q3"},
+		{Role: "assistant", Content: "A3"},
+	}
+	// Budget large enough for all
+	if kept := len(pruneMessages(history, 10000)); kept != 7 {
+		t.Errorf("expected all 7 messages with large budget, got %d", kept)
+	}
+}
+// ─── Structured Completion Pre-emptive Integration Tests ─────────────────────
+// TestStructuredPromptOverhead_Constant guards against the overhead estimate
+// drifting outside the accepted 200-600 token window.
+func TestStructuredPromptOverhead_Constant(t *testing.T) {
+	inRange := structuredPromptOverhead >= 200 && structuredPromptOverhead <= 600
+	if !inRange {
+		t.Errorf("structuredPromptOverhead=%d seems out of range (expected 200-600)", structuredPromptOverhead)
+	}
+}

package/go/rlm/errors.go CHANGED Viewed

@@ -192,7 +192,31 @@ func parseContextOverflowMessage(msg string) (modelLimit int, requestTokens int,
 		}
 	}
-	// Pattern 3: "max_tokens" / "input too long" generic patterns
+	// Pattern 3: "max_tokens is too large" - response budget exceeds remaining capacity
+	// vLLM/OpenAI: "max_tokens' or 'max_completion_tokens' is too large: 10000.
+	//   This model's maximum context length is 32768 tokens and your request has 30168 input tokens"
+	// In this case, input tokens < model limit, but input + max_tokens > model limit.
+	// We report the effective total (input + max_tokens) as requestTokens.
+	if strings.Contains(lowerMsg, "max_tokens") && strings.Contains(lowerMsg, "too large") {
+		limit := extractNumber(msg, "maximum context length is ", " tokens")
+		inputTokens := extractNumber(msg, "your request has ", " input tokens")
+		if inputTokens == 0 {
+			inputTokens = extractNumber(msg, "your request has ", " tokens")
+		}
+		maxTokens := extractNumber(msg, "too large: ", ".")
+		if maxTokens == 0 {
+			maxTokens = extractNumber(msg, "too large: ", " ")
+		}
+		if limit > 0 && inputTokens > 0 && maxTokens > 0 {
+			return limit, inputTokens + maxTokens, true
+		}
+		// Fallback: if we got limit and input tokens, treat input as the overflow
+		if limit > 0 && inputTokens > 0 {
+			return limit, inputTokens, true
+		}
+	}
+	// Pattern 4: "input too long" / "too many tokens" generic patterns
 	if strings.Contains(lowerMsg, "input too long") || strings.Contains(lowerMsg, "too many tokens") || strings.Contains(lowerMsg, "too many input tokens") {
 		limit := extractNumber(msg, "limit is ", " tokens")
 		if limit == 0 {

package/go/rlm/rlm.go CHANGED Viewed

@@ -109,8 +109,41 @@ func (r *RLM) Completion(query string, context string) (string, RLMStats, error)
 		r.stats.Iterations = iteration + 1
 		r.observer.Debug("rlm", "Iteration %d/%d at depth %d", iteration+1, r.maxIterations, r.currentDepth)
+		// Pre-emptive message overflow check: prune older messages if history is growing too large.
+		// Regular completion stores context in the REPL env (not messages), but the iterative
+		// loop appends assistant+user messages each iteration which can accumulate.
+		if modelLimit := r.getModelTokenLimit(); modelLimit > 0 && len(messages) > 4 {
+			msgTokens := EstimateMessagesTokens(messages)
+			responseTokens := r.getResponseTokenBudget()
+			safetyMargin := 0.15
+			if r.contextOverflow != nil && r.contextOverflow.SafetyMargin > 0 {
+				safetyMargin = r.contextOverflow.SafetyMargin
+			}
+			available := modelLimit - responseTokens - int(float64(modelLimit)*safetyMargin)
+			if msgTokens > available {
+				r.observer.Debug("rlm", "Message history overflow: %d tokens > %d available, pruning middle messages", msgTokens, available)
+				messages = pruneMessages(messages, available)
+			}
+		}
 		response, err := r.callLLM(messages)
 		if err != nil {
+			// Check for context overflow and attempt recovery
+			if r.contextOverflow != nil && r.contextOverflow.Enabled {
+				if _, isOverflow := IsContextOverflow(err); isOverflow && len(messages) > 4 {
+					r.observer.Debug("rlm", "Context overflow on iteration %d, pruning messages and retrying", iteration+1)
+					modelLimit := r.getModelTokenLimit()
+					if modelLimit == 0 {
+						modelLimit = 32768 // Reasonable default
+					}
+					responseTokens := r.getResponseTokenBudget()
+					available := modelLimit - responseTokens - int(float64(modelLimit)*0.15)
+					messages = pruneMessages(messages, available)
+					// Retry this iteration
+					iteration--
+					continue
+				}
+			}
 			r.observer.Error("rlm", "LLM call failed on iteration %d: %v", iteration+1, err)
 			return "", r.stats, err
 		}
@@ -214,6 +247,48 @@ func (r *RLM) buildREPLEnv(query string, context string) map[string]interface{}
 	return env
 }
+// pruneMessages trims a conversation so that its estimated size fits within
+// targetTokens. The first message (system prompt) and the final two messages
+// (most recent exchange) are always retained; the messages in between are kept
+// from newest to oldest for as long as the running estimate stays within
+// targetTokens. Histories of three or fewer messages are returned unchanged.
+// If the mandatory messages alone reach the budget, only they are returned.
+func pruneMessages(messages []Message, targetTokens int) []Message {
+	n := len(messages)
+	if n <= 3 {
+		return messages
+	}
+	tailStart := n - 2
+	// Mandatory set: system prompt followed by the last two messages.
+	pruned := make([]Message, 0, n)
+	pruned = append(pruned, messages[0])
+	pruned = append(pruned, messages[tailStart:]...)
+	used := EstimateMessagesTokens(pruned)
+	// keepFrom is the index of the oldest message that survives. It starts at
+	// the tail and moves toward the front while each additional middle message
+	// still fits; the first one that does not fit ends the walk.
+	keepFrom := tailStart
+	if used < targetTokens {
+		for keepFrom > 1 {
+			// NOTE(review): the 4 is presumably the per-message overhead used by
+			// EstimateMessagesTokens; confirm the two stay in sync.
+			cost := 4 + EstimateTokens(messages[keepFrom-1].Content)
+			if used+cost > targetTokens {
+				break
+			}
+			used += cost
+			keepFrom--
+		}
+	}
+	// Reassemble in original order: system prompt, then everything retained.
+	return append(pruned[:1], messages[keepFrom:]...)
+}
 // GetObserver returns the observer for external access to events/traces.
 func (r *RLM) GetObserver() *Observer {
 	return r.observer

package/go/rlm/structured.go CHANGED Viewed

@@ -46,6 +46,20 @@ func (r *RLM) StructuredCompletion(query string, context string, config *Structu
 	subTasks := decomposeSchema(config.Schema)
 	r.observer.Debug("structured", "Schema decomposed into %d subtasks", len(subTasks))
+	// Pre-emptive overflow check: reduce context BEFORE building the prompt.
+	// Structured completion embeds the full context in the user message, so this is
+	// critical to prevent overflow on the first LLM call (following the RLM paper's
+	// principle: "the context window of the root LM is rarely clogged").
+	schemaJSON, _ := json.Marshal(config.Schema)
+	schemaOverhead := EstimateTokens(string(schemaJSON)) + structuredPromptOverhead
+	reducedCtx, wasReduced, reduceErr := r.PreemptiveReduceContext(query, context, schemaOverhead)
+	if reduceErr != nil {
+		r.observer.Error("structured", "Pre-emptive reduction failed: %v (proceeding with original context)", reduceErr)
+	} else if wasReduced {
+		r.observer.Debug("structured", "Pre-emptive reduction applied: %d -> %d chars", len(context), len(reducedCtx))
+		context = reducedCtx
+	}
 	// If simple schema or parallel disabled, use direct method
 	if len(subTasks) <= 2 || !config.ParallelExecution {
 		r.observer.Debug("structured", "Using direct completion method")
@@ -843,7 +857,7 @@ func buildValidationFeedback(validationErr error, schema *JSONSchema, previousRe
 	var feedback strings.Builder
 	feedback.WriteString("VALIDATION ERROR - Your previous response was invalid.\n\n")
-	feedback.WriteString(fmt.Sprintf("ERROR: %s\n\n", errMsg))
+	fmt.Fprintf(&feedback, "ERROR: %s\n\n", errMsg)
 	// Extract what field caused the issue
 	if strings.Contains(errMsg, "missing required field:") {
@@ -852,17 +866,17 @@ func buildValidationFeedback(validationErr error, schema *JSONSchema, previousRe
 		fieldName = strings.TrimSpace(fieldName)
 		feedback.WriteString("SPECIFIC ISSUE:\n")
-		feedback.WriteString(fmt.Sprintf("The field '%s' is REQUIRED but was not provided.\n\n", fieldName))
+		fmt.Fprintf(&feedback, "The field '%s' is REQUIRED but was not provided.\n\n", fieldName)
 		// Find the schema for this field and provide details
 		if schema.Type == "object" && schema.Properties != nil {
 			if fieldSchema, exists := schema.Properties[fieldName]; exists {
 				feedback.WriteString("FIELD REQUIREMENTS:\n")
-				feedback.WriteString(fmt.Sprintf("- Field name: '%s'\n", fieldName))
-				feedback.WriteString(fmt.Sprintf("- Type: %s\n", fieldSchema.Type))
+				fmt.Fprintf(&feedback, "- Field name: '%s'\n", fieldName)
+				fmt.Fprintf(&feedback, "- Type: %s\n", fieldSchema.Type)
 				if fieldSchema.Type == "object" && len(fieldSchema.Required) > 0 {
-					feedback.WriteString(fmt.Sprintf("- This is an object with required fields: %s\n", strings.Join(fieldSchema.Required, ", ")))
+					fmt.Fprintf(&feedback, "- This is an object with required fields: %s\n", strings.Join(fieldSchema.Required, ", "))
 					if fieldSchema.Properties != nil {
 						feedback.WriteString("\nNESTED FIELD DETAILS:\n")
@@ -872,13 +886,13 @@ func buildValidationFeedback(validationErr error, schema *JSONSchema, previousRe
 							if isRequired {
 								requiredMark = " [REQUIRED]"
 							}
-							feedback.WriteString(fmt.Sprintf("  - %s: %s%s\n", nestedField, nestedSchema.Type, requiredMark))
+							fmt.Fprintf(&feedback, "  - %s: %s%s\n", nestedField, nestedSchema.Type, requiredMark)
 						}
 					}
 				}
 				if fieldSchema.Type == "array" && fieldSchema.Items != nil {
-					feedback.WriteString(fmt.Sprintf("- This is an array of: %s\n", fieldSchema.Items.Type))
+					fmt.Fprintf(&feedback, "- This is an array of: %s\n", fieldSchema.Items.Type)
 				}
 			}
 		}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "recursive-llm-ts",
-  "version": "4.6.0",
+  "version": "4.8.0",
   "description": "TypeScript bridge for recursive-llm: Recursive Language Models for unbounded context processing with structured outputs",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",