recursive-llm-ts 4.7.0 → 4.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/rlm-go CHANGED
Binary file
@@ -30,6 +30,155 @@ func DefaultContextOverflowConfig() ContextOverflowConfig {
30
30
  }
31
31
  }
32
32
 
33
+ // ─── Model Token Limits ──────────────────────────────────────────────────────
34
+
35
// modelTokenLimits maps known model name patterns to their maximum context window sizes
// (in tokens). Used for pre-emptive overflow detection so we don't need to wait for API
// errors. Keys must be lowercase: LookupModelTokenLimit lowercases the model name before
// consulting this table.
var modelTokenLimits = map[string]int{
	// OpenAI
	"gpt-4o":            128000,
	"gpt-4o-mini":       128000,
	"gpt-4-turbo":       128000,
	"gpt-4":             8192,
	"gpt-4-32k":         32768,
	"gpt-3.5-turbo":     16385,
	"gpt-3.5-turbo-16k": 16385,
	"o1":                200000,
	"o1-mini":           128000,
	"o1-preview":        128000,
	"o3-mini":           200000,
	// Anthropic (via LiteLLM/proxy)
	"claude-3-opus":     200000,
	"claude-3-sonnet":   200000,
	"claude-3-haiku":    200000,
	"claude-3.5-sonnet": 200000,
	"claude-3.5-haiku":  200000,
	"claude-sonnet-4":   200000,
	"claude-opus-4":     200000,
	// Llama (common vLLM deployments)
	"llama-3":   8192,
	"llama-3.1": 128000,
	"llama-3.2": 128000,
	"llama-3.3": 128000,
	// Mistral
	"mistral-7b":    32768,
	"mixtral-8x7b":  32768,
	"mistral-large": 128000,
	"mistral-small": 128000,
	// Qwen
	"qwen-2":   32768,
	"qwen-2.5": 128000,
}

// LookupModelTokenLimit returns the known token limit for a model, or 0 if unknown.
//
// Matching is case-insensitive. The full name is tried first: an exact table hit wins,
// otherwise the longest table key that is a prefix of the name is used, so
// "gpt-4o-mini-2024-07-18" matches "gpt-4o-mini" rather than "gpt-4o" or "gpt-4".
//
// If the full name is unknown and contains a "/", the segment after the last "/" is
// looked up the same way, so provider- or organization-qualified names such as
// "openai/gpt-4o" or "meta-llama/Llama-3.1-8B-Instruct" still resolve.
//
// NOTE: because matching is by prefix, a model whose name merely extends an existing
// key (for example a "gpt-4.x" name extends "gpt-4") inherits that key's limit unless
// it has its own, longer entry in the table.
func LookupModelTokenLimit(model string) int {
	name := strings.ToLower(model)

	if limit := matchModelTokenLimit(name); limit > 0 {
		return limit
	}

	// Fall back to the bare model name when a provider/org path is present.
	if slash := strings.LastIndex(name, "/"); slash >= 0 {
		return matchModelTokenLimit(name[slash+1:])
	}

	return 0
}

// matchModelTokenLimit resolves an already-lowercased name against modelTokenLimits:
// exact match first, then the longest key that is a prefix of name. Returns 0 on no match.
func matchModelTokenLimit(name string) int {
	if limit, ok := modelTokenLimits[name]; ok {
		return limit
	}

	// Map iteration order is random, but the result is deterministic: two distinct
	// keys of equal length cannot both be a prefix of the same name.
	bestLen, bestLimit := 0, 0
	for pattern, limit := range modelTokenLimits {
		if len(pattern) > bestLen && strings.HasPrefix(name, pattern) {
			bestLen, bestLimit = len(pattern), limit
		}
	}
	return bestLimit
}
95
+
96
+ // getModelTokenLimit returns the effective token limit for pre-emptive overflow checks.
97
+ // Priority: config override > model name lookup > 0 (disabled).
98
+ func (r *RLM) getModelTokenLimit() int {
99
+ if r.contextOverflow != nil && r.contextOverflow.MaxModelTokens > 0 {
100
+ return r.contextOverflow.MaxModelTokens
101
+ }
102
+ return LookupModelTokenLimit(r.model)
103
+ }
104
+
105
+ // ─── Pre-emptive Overflow Check ──────────────────────────────────────────────
106
+
107
// structuredPromptOverhead approximates, in tokens, the fixed cost that a structured
// completion prompt adds on top of the query and context: instructions, schema
// constraints, and JSON formatting directives.
const structuredPromptOverhead = 350
110
+
111
+ // PreemptiveReduceContext checks if the context would overflow the model's token limit
112
+ // and reduces it proactively BEFORE building the prompt. Returns the (possibly reduced)
113
+ // context, or an error if reduction fails.
114
+ //
115
+ // This is called before the first LLM call, unlike post-hoc overflow recovery which
116
+ // only triggers after an API error. Following the RLM paper's principle that
117
+ // "the context window of the root LM is rarely clogged."
118
+ func (r *RLM) PreemptiveReduceContext(query string, context string, extraOverhead int) (string, bool, error) {
119
+ modelLimit := r.getModelTokenLimit()
120
+ if modelLimit == 0 {
121
+ // No known limit; skip pre-emptive check (will rely on post-hoc recovery)
122
+ return context, false, nil
123
+ }
124
+
125
+ if r.contextOverflow == nil || !r.contextOverflow.Enabled {
126
+ return context, false, nil
127
+ }
128
+
129
+ // Estimate total token budget needed
130
+ contextTokens := EstimateTokens(context)
131
+ queryTokens := EstimateTokens(query)
132
+ responseTokens := r.getResponseTokenBudget()
133
+ safetyMargin := r.contextOverflow.SafetyMargin
134
+ if safetyMargin == 0 {
135
+ safetyMargin = 0.15
136
+ }
137
+
138
+ totalEstimate := contextTokens + queryTokens + extraOverhead + responseTokens +
139
+ int(float64(modelLimit)*safetyMargin)
140
+
141
+ r.observer.Debug("overflow", "Pre-emptive check: context=%d query=%d overhead=%d response=%d safety=%d total=%d limit=%d",
142
+ contextTokens, queryTokens, extraOverhead, responseTokens,
143
+ int(float64(modelLimit)*safetyMargin), totalEstimate, modelLimit)
144
+
145
+ if totalEstimate <= modelLimit {
146
+ return context, false, nil
147
+ }
148
+
149
+ // Context would overflow — reduce it proactively
150
+ r.observer.Debug("overflow", "Pre-emptive reduction needed: estimated %d tokens > limit %d", totalEstimate, modelLimit)
151
+
152
+ reducer := newContextReducer(r, *r.contextOverflow, r.observer)
153
+ reduced, err := reducer.ReduceForCompletion(query, context, modelLimit)
154
+ if err != nil {
155
+ return context, false, fmt.Errorf("pre-emptive context reduction failed: %w", err)
156
+ }
157
+
158
+ r.observer.Debug("overflow", "Pre-emptive reduction: %d -> %d chars", len(context), len(reduced))
159
+ return reduced, true, nil
160
+ }
161
+
162
+ // getResponseTokenBudget extracts max_tokens or max_completion_tokens from ExtraParams.
163
+ func (r *RLM) getResponseTokenBudget() int {
164
+ if r.extraParams == nil {
165
+ return 0
166
+ }
167
+ for _, key := range []string{"max_completion_tokens", "max_tokens"} {
168
+ if v, ok := r.extraParams[key]; ok {
169
+ switch n := v.(type) {
170
+ case float64:
171
+ return int(n)
172
+ case int:
173
+ return n
174
+ case int64:
175
+ return int(n)
176
+ }
177
+ }
178
+ }
179
+ return 0
180
+ }
181
+
33
182
  // ─── Token Estimation ────────────────────────────────────────────────────────
34
183
 
35
184
  // EstimateTokens provides a fast approximation of token count for a string.
@@ -168,27 +317,9 @@ func newContextReducer(rlm *RLM, config ContextOverflowConfig, obs *Observer) *c
168
317
  return &contextReducer{rlm: rlm, config: config, obs: obs}
169
318
  }
170
319
 
171
- // getResponseTokenBudget extracts max_tokens or max_completion_tokens from ExtraParams.
172
- // This represents how many tokens the API will reserve for the response, which must be
173
- // subtracted from the model's total capacity when sizing input chunks.
174
- func (cr *contextReducer) getResponseTokenBudget(modelLimit int) int {
175
- if cr.rlm.extraParams == nil {
176
- return 0
177
- }
178
- // Check max_completion_tokens first (newer API parameter), then max_tokens
179
- for _, key := range []string{"max_completion_tokens", "max_tokens"} {
180
- if v, ok := cr.rlm.extraParams[key]; ok {
181
- switch n := v.(type) {
182
- case float64:
183
- return int(n)
184
- case int:
185
- return n
186
- case int64:
187
- return int(n)
188
- }
189
- }
190
- }
191
- return 0
320
+ // getResponseTokenBudget delegates to the RLM engine's method.
321
+ func (cr *contextReducer) getResponseTokenBudget() int {
322
+ return cr.rlm.getResponseTokenBudget()
192
323
  }
193
324
 
194
325
  // makeMapPhaseParams creates ExtraParams suitable for map-phase LLM calls (summarization).
@@ -222,7 +353,7 @@ func (cr *contextReducer) ReduceForCompletion(query string, context string, mode
222
353
  // Calculate safe token budget per chunk
223
354
  // Reserve tokens for: system prompt (~500), query, overhead, safety margin, response budget
224
355
  queryTokens := EstimateTokens(query)
225
- responseTokens := cr.getResponseTokenBudget(modelLimit)
356
+ responseTokens := cr.getResponseTokenBudget()
226
357
  overhead := 500 + queryTokens + int(float64(modelLimit)*cr.config.SafetyMargin) + responseTokens
227
358
  safeTokensPerChunk := modelLimit - overhead
228
359
 
@@ -153,7 +153,7 @@ func TestGetResponseTokenBudget(t *testing.T) {
153
153
  config := DefaultContextOverflowConfig()
154
154
  reducer := newContextReducer(rlm, config, obs)
155
155
 
156
- budget := reducer.getResponseTokenBudget(32768)
156
+ budget := reducer.getResponseTokenBudget()
157
157
  if budget != 10000 {
158
158
  t.Errorf("expected response token budget 10000, got %d", budget)
159
159
  }
@@ -169,7 +169,7 @@ func TestGetResponseTokenBudget_MaxCompletionTokens(t *testing.T) {
169
169
  config := DefaultContextOverflowConfig()
170
170
  reducer := newContextReducer(rlm, config, obs)
171
171
 
172
- budget := reducer.getResponseTokenBudget(32768)
172
+ budget := reducer.getResponseTokenBudget()
173
173
  if budget != 5000 {
174
174
  t.Errorf("expected response token budget 5000, got %d", budget)
175
175
  }
@@ -185,7 +185,7 @@ func TestGetResponseTokenBudget_NoMaxTokens(t *testing.T) {
185
185
  config := DefaultContextOverflowConfig()
186
186
  reducer := newContextReducer(rlm, config, obs)
187
187
 
188
- budget := reducer.getResponseTokenBudget(32768)
188
+ budget := reducer.getResponseTokenBudget()
189
189
  if budget != 0 {
190
190
  t.Errorf("expected response token budget 0, got %d", budget)
191
191
  }
@@ -899,3 +899,373 @@ func TestReduceForCompletion_DispatchesTextRank(t *testing.T) {
899
899
  t.Errorf("expected reduced context for textrank strategy")
900
900
  }
901
901
  }
902
+
903
+ // ─── Model Token Limits Tests ────────────────────────────────────────────────
904
+
905
+ func TestLookupModelTokenLimit_ExactMatch(t *testing.T) {
906
+ tests := []struct {
907
+ model string
908
+ expected int
909
+ }{
910
+ {"gpt-4o", 128000},
911
+ {"gpt-4o-mini", 128000},
912
+ {"gpt-4", 8192},
913
+ {"gpt-4-32k", 32768},
914
+ {"gpt-3.5-turbo", 16385},
915
+ {"claude-3-opus", 200000},
916
+ {"claude-sonnet-4", 200000},
917
+ {"mistral-7b", 32768},
918
+ }
919
+
920
+ for _, tt := range tests {
921
+ limit := LookupModelTokenLimit(tt.model)
922
+ if limit != tt.expected {
923
+ t.Errorf("LookupModelTokenLimit(%q) = %d, expected %d", tt.model, limit, tt.expected)
924
+ }
925
+ }
926
+ }
927
+
928
+ func TestLookupModelTokenLimit_PrefixMatch(t *testing.T) {
929
+ // Versioned model names should match by prefix
930
+ tests := []struct {
931
+ model string
932
+ expected int
933
+ }{
934
+ {"gpt-4o-mini-2024-07-18", 128000},
935
+ {"gpt-4o-2024-05-13", 128000},
936
+ {"claude-3-opus-20240229", 200000},
937
+ {"mistral-7b-instruct-v0.2", 32768},
938
+ }
939
+
940
+ for _, tt := range tests {
941
+ limit := LookupModelTokenLimit(tt.model)
942
+ if limit != tt.expected {
943
+ t.Errorf("LookupModelTokenLimit(%q) = %d, expected %d", tt.model, limit, tt.expected)
944
+ }
945
+ }
946
+ }
947
+
948
+ func TestLookupModelTokenLimit_Unknown(t *testing.T) {
949
+ limit := LookupModelTokenLimit("completely-unknown-model-xyz")
950
+ if limit != 0 {
951
+ t.Errorf("expected 0 for unknown model, got %d", limit)
952
+ }
953
+ }
954
+
955
+ func TestLookupModelTokenLimit_CaseInsensitive(t *testing.T) {
956
+ limit := LookupModelTokenLimit("GPT-4O-MINI")
957
+ if limit != 128000 {
958
+ t.Errorf("expected 128000 for case-insensitive match, got %d", limit)
959
+ }
960
+ }
961
+
962
+ func TestGetModelTokenLimit_ConfigOverride(t *testing.T) {
963
+ engine := New("gpt-4o-mini", Config{
964
+ APIKey: "test",
965
+ ContextOverflow: &ContextOverflowConfig{
966
+ Enabled: true,
967
+ MaxModelTokens: 16384,
968
+ },
969
+ })
970
+
971
+ limit := engine.getModelTokenLimit()
972
+ if limit != 16384 {
973
+ t.Errorf("expected config override 16384, got %d", limit)
974
+ }
975
+ }
976
+
977
+ func TestGetModelTokenLimit_ModelLookup(t *testing.T) {
978
+ engine := New("gpt-4o-mini", Config{
979
+ APIKey: "test",
980
+ })
981
+
982
+ limit := engine.getModelTokenLimit()
983
+ if limit != 128000 {
984
+ t.Errorf("expected model lookup 128000, got %d", limit)
985
+ }
986
+ }
987
+
988
+ func TestGetModelTokenLimit_UnknownModel(t *testing.T) {
989
+ engine := New("custom-local-model", Config{
990
+ APIKey: "test",
991
+ })
992
+
993
+ limit := engine.getModelTokenLimit()
994
+ if limit != 0 {
995
+ t.Errorf("expected 0 for unknown model, got %d", limit)
996
+ }
997
+ }
998
+
999
+ // ─── Pre-emptive Overflow Tests ──────────────────────────────────────────────
1000
+
1001
+ func TestPreemptiveReduceContext_SmallContext(t *testing.T) {
1002
+ engine := New("gpt-4o-mini", Config{
1003
+ APIKey: "test",
1004
+ })
1005
+
1006
+ // Small context should pass through unchanged
1007
+ context := "This is a small context that easily fits."
1008
+ reduced, wasReduced, err := engine.PreemptiveReduceContext("What is this?", context, 500)
1009
+ if err != nil {
1010
+ t.Fatalf("unexpected error: %v", err)
1011
+ }
1012
+ if wasReduced {
1013
+ t.Error("expected no reduction for small context")
1014
+ }
1015
+ if reduced != context {
1016
+ t.Error("expected context to be unchanged")
1017
+ }
1018
+ }
1019
+
1020
+ func TestPreemptiveReduceContext_LargeContext(t *testing.T) {
1021
+ engine := New("gpt-4o-mini", Config{
1022
+ APIKey: "test",
1023
+ ContextOverflow: &ContextOverflowConfig{
1024
+ Enabled: true,
1025
+ MaxModelTokens: 1000, // Very small limit to force overflow
1026
+ Strategy: "truncate",
1027
+ SafetyMargin: 0.15,
1028
+ },
1029
+ })
1030
+
1031
+ // Create large context that exceeds the 1000 token limit
1032
+ context := strings.Repeat("The revenue for Q4 was $4.2 billion, representing 23% year-over-year growth. ", 100)
1033
+
1034
+ reduced, wasReduced, err := engine.PreemptiveReduceContext("Summarize revenue", context, 300)
1035
+ if err != nil {
1036
+ t.Fatalf("unexpected error: %v", err)
1037
+ }
1038
+ if !wasReduced {
1039
+ t.Error("expected context to be reduced")
1040
+ }
1041
+ if len(reduced) >= len(context) {
1042
+ t.Errorf("expected reduced context to be shorter: %d >= %d", len(reduced), len(context))
1043
+ }
1044
+ }
1045
+
1046
+ func TestPreemptiveReduceContext_DisabledOverflow(t *testing.T) {
1047
+ engine := New("gpt-4o-mini", Config{
1048
+ APIKey: "test",
1049
+ ContextOverflow: &ContextOverflowConfig{
1050
+ Enabled: false,
1051
+ },
1052
+ })
1053
+
1054
+ context := strings.Repeat("Large content. ", 10000)
1055
+ reduced, wasReduced, err := engine.PreemptiveReduceContext("query", context, 500)
1056
+ if err != nil {
1057
+ t.Fatalf("unexpected error: %v", err)
1058
+ }
1059
+ if wasReduced {
1060
+ t.Error("expected no reduction when overflow is disabled")
1061
+ }
1062
+ if reduced != context {
1063
+ t.Error("expected context unchanged when overflow is disabled")
1064
+ }
1065
+ }
1066
+
1067
+ func TestPreemptiveReduceContext_UnknownModel(t *testing.T) {
1068
+ engine := New("custom-local-model", Config{
1069
+ APIKey: "test",
1070
+ })
1071
+
1072
+ // Unknown model with no config override → no pre-emptive check
1073
+ context := strings.Repeat("Large content. ", 10000)
1074
+ reduced, wasReduced, err := engine.PreemptiveReduceContext("query", context, 500)
1075
+ if err != nil {
1076
+ t.Fatalf("unexpected error: %v", err)
1077
+ }
1078
+ if wasReduced {
1079
+ t.Error("expected no reduction for unknown model with no config limit")
1080
+ }
1081
+ if reduced != context {
1082
+ t.Error("expected context unchanged")
1083
+ }
1084
+ }
1085
+
1086
+ func TestPreemptiveReduceContext_AccountsForResponseBudget(t *testing.T) {
1087
+ // With a high max_tokens, even moderate context should trigger reduction
1088
+ engine := New("gpt-4o-mini", Config{
1089
+ APIKey: "test",
1090
+ ContextOverflow: &ContextOverflowConfig{
1091
+ Enabled: true,
1092
+ MaxModelTokens: 2000,
1093
+ Strategy: "truncate",
1094
+ SafetyMargin: 0.15,
1095
+ },
1096
+ ExtraParams: map[string]interface{}{
1097
+ "max_tokens": float64(1000), // Large response budget
1098
+ },
1099
+ })
1100
+
1101
+ // Context of ~500 tokens + max_tokens 1000 + overhead = exceeds 2000
1102
+ context := strings.Repeat("Revenue data: the company earned $4.2B in Q4 fiscal year. ", 30)
1103
+
1104
+ reduced, wasReduced, err := engine.PreemptiveReduceContext("Summarize", context, 300)
1105
+ if err != nil {
1106
+ t.Fatalf("unexpected error: %v", err)
1107
+ }
1108
+ if !wasReduced {
1109
+ t.Error("expected reduction when response budget + context exceeds limit")
1110
+ }
1111
+ if len(reduced) >= len(context) {
1112
+ t.Errorf("expected reduced context: %d >= %d", len(reduced), len(context))
1113
+ }
1114
+ }
1115
+
1116
+ func TestPreemptiveReduceContext_TFIDFStrategy(t *testing.T) {
1117
+ engine := New("gpt-4o-mini", Config{
1118
+ APIKey: "test",
1119
+ ContextOverflow: &ContextOverflowConfig{
1120
+ Enabled: true,
1121
+ MaxModelTokens: 500,
1122
+ Strategy: "tfidf",
1123
+ SafetyMargin: 0.15,
1124
+ },
1125
+ })
1126
+
1127
+ context := strings.Repeat("Machine learning models process large datasets effectively. ", 100)
1128
+
1129
+ reduced, wasReduced, err := engine.PreemptiveReduceContext("Tell me about ML", context, 200)
1130
+ if err != nil {
1131
+ t.Fatalf("unexpected error: %v", err)
1132
+ }
1133
+ if !wasReduced {
1134
+ t.Error("expected reduction with tfidf strategy")
1135
+ }
1136
+ if len(reduced) >= len(context) {
1137
+ t.Errorf("expected shorter context: %d >= %d", len(reduced), len(context))
1138
+ }
1139
+ }
1140
+
1141
+ func TestPreemptiveReduceContext_TextRankStrategy(t *testing.T) {
1142
+ engine := New("gpt-4o-mini", Config{
1143
+ APIKey: "test",
1144
+ ContextOverflow: &ContextOverflowConfig{
1145
+ Enabled: true,
1146
+ MaxModelTokens: 500,
1147
+ Strategy: "textrank",
1148
+ SafetyMargin: 0.15,
1149
+ },
1150
+ })
1151
+
1152
+ context := strings.Repeat("Neural networks are powerful computation models. ", 100)
1153
+
1154
+ reduced, wasReduced, err := engine.PreemptiveReduceContext("Explain neural nets", context, 200)
1155
+ if err != nil {
1156
+ t.Fatalf("unexpected error: %v", err)
1157
+ }
1158
+ if !wasReduced {
1159
+ t.Error("expected reduction with textrank strategy")
1160
+ }
1161
+ if len(reduced) >= len(context) {
1162
+ t.Errorf("expected shorter context: %d >= %d", len(reduced), len(context))
1163
+ }
1164
+ }
1165
+
1166
+ func TestGetResponseTokenBudget_RLMMethod(t *testing.T) {
1167
+ engine := &RLM{
1168
+ extraParams: map[string]interface{}{
1169
+ "max_tokens": float64(5000),
1170
+ },
1171
+ }
1172
+ budget := engine.getResponseTokenBudget()
1173
+ if budget != 5000 {
1174
+ t.Errorf("expected 5000, got %d", budget)
1175
+ }
1176
+ }
1177
+
1178
+ func TestGetResponseTokenBudget_MaxCompletionTokensPreferred(t *testing.T) {
1179
+ engine := &RLM{
1180
+ extraParams: map[string]interface{}{
1181
+ "max_tokens": float64(5000),
1182
+ "max_completion_tokens": float64(8000),
1183
+ },
1184
+ }
1185
+ budget := engine.getResponseTokenBudget()
1186
+ if budget != 8000 {
1187
+ t.Errorf("expected max_completion_tokens=8000 preferred, got %d", budget)
1188
+ }
1189
+ }
1190
+
1191
+ func TestGetResponseTokenBudget_NoParams(t *testing.T) {
1192
+ engine := &RLM{
1193
+ extraParams: map[string]interface{}{
1194
+ "temperature": 0.7,
1195
+ },
1196
+ }
1197
+ budget := engine.getResponseTokenBudget()
1198
+ if budget != 0 {
1199
+ t.Errorf("expected 0 when no max_tokens set, got %d", budget)
1200
+ }
1201
+ }
1202
+
1203
+ // ─── Message Pruning Tests ───────────────────────────────────────────────────
1204
+
1205
+ func TestPruneMessages_SmallHistory(t *testing.T) {
1206
+ messages := []Message{
1207
+ {Role: "system", Content: "You are helpful."},
1208
+ {Role: "user", Content: "Hello"},
1209
+ {Role: "assistant", Content: "Hi there!"},
1210
+ }
1211
+
1212
+ result := pruneMessages(messages, 100)
1213
+ if len(result) != 3 {
1214
+ t.Errorf("expected 3 messages (no pruning needed), got %d", len(result))
1215
+ }
1216
+ }
1217
+
1218
+ func TestPruneMessages_PreservesSystemAndLast(t *testing.T) {
1219
+ messages := []Message{
1220
+ {Role: "system", Content: "System prompt"},
1221
+ {Role: "user", Content: "First question"},
1222
+ {Role: "assistant", Content: "First answer"},
1223
+ {Role: "user", Content: "Second question"},
1224
+ {Role: "assistant", Content: "Second answer"},
1225
+ {Role: "user", Content: strings.Repeat("Third question with lots of context. ", 100)},
1226
+ {Role: "assistant", Content: "Third answer"},
1227
+ }
1228
+
1229
+ result := pruneMessages(messages, 50) // Very tight budget
1230
+
1231
+ // Should always keep system prompt (first) and last 2 messages
1232
+ if len(result) < 3 {
1233
+ t.Errorf("expected at least 3 messages, got %d", len(result))
1234
+ }
1235
+ if result[0].Role != "system" {
1236
+ t.Error("first message should be system prompt")
1237
+ }
1238
+ if result[len(result)-1].Content != "Third answer" {
1239
+ t.Error("last message should be the most recent")
1240
+ }
1241
+ if result[len(result)-2].Role != "user" {
1242
+ t.Error("second-to-last should be the most recent user message")
1243
+ }
1244
+ }
1245
+
1246
+ func TestPruneMessages_KeepsRecentMiddleMessages(t *testing.T) {
1247
+ messages := []Message{
1248
+ {Role: "system", Content: "Short."},
1249
+ {Role: "user", Content: "Q1"},
1250
+ {Role: "assistant", Content: "A1"},
1251
+ {Role: "user", Content: "Q2"},
1252
+ {Role: "assistant", Content: "A2"},
1253
+ {Role: "user", Content: "Q3"},
1254
+ {Role: "assistant", Content: "A3"},
1255
+ }
1256
+
1257
+ // Budget large enough for all
1258
+ result := pruneMessages(messages, 10000)
1259
+ if len(result) != 7 {
1260
+ t.Errorf("expected all 7 messages with large budget, got %d", len(result))
1261
+ }
1262
+ }
1263
+
1264
+ // ─── Structured Completion Pre-emptive Integration Tests ─────────────────────
1265
+
1266
+ func TestStructuredPromptOverhead_Constant(t *testing.T) {
1267
+ // Verify the constant is reasonable (300-500 tokens for structured prompt instructions)
1268
+ if structuredPromptOverhead < 200 || structuredPromptOverhead > 600 {
1269
+ t.Errorf("structuredPromptOverhead=%d seems out of range (expected 200-600)", structuredPromptOverhead)
1270
+ }
1271
+ }
package/go/rlm/rlm.go CHANGED
@@ -109,8 +109,41 @@ func (r *RLM) Completion(query string, context string) (string, RLMStats, error)
109
109
  r.stats.Iterations = iteration + 1
110
110
  r.observer.Debug("rlm", "Iteration %d/%d at depth %d", iteration+1, r.maxIterations, r.currentDepth)
111
111
 
112
+ // Pre-emptive message overflow check: prune older messages if history is growing too large.
113
+ // Regular completion stores context in the REPL env (not messages), but the iterative
114
+ // loop appends assistant+user messages each iteration which can accumulate.
115
+ if modelLimit := r.getModelTokenLimit(); modelLimit > 0 && len(messages) > 4 {
116
+ msgTokens := EstimateMessagesTokens(messages)
117
+ responseTokens := r.getResponseTokenBudget()
118
+ safetyMargin := 0.15
119
+ if r.contextOverflow != nil && r.contextOverflow.SafetyMargin > 0 {
120
+ safetyMargin = r.contextOverflow.SafetyMargin
121
+ }
122
+ available := modelLimit - responseTokens - int(float64(modelLimit)*safetyMargin)
123
+ if msgTokens > available {
124
+ r.observer.Debug("rlm", "Message history overflow: %d tokens > %d available, pruning middle messages", msgTokens, available)
125
+ messages = pruneMessages(messages, available)
126
+ }
127
+ }
128
+
112
129
  response, err := r.callLLM(messages)
113
130
  if err != nil {
131
+ // Check for context overflow and attempt recovery
132
+ if r.contextOverflow != nil && r.contextOverflow.Enabled {
133
+ if _, isOverflow := IsContextOverflow(err); isOverflow && len(messages) > 4 {
134
+ r.observer.Debug("rlm", "Context overflow on iteration %d, pruning messages and retrying", iteration+1)
135
+ modelLimit := r.getModelTokenLimit()
136
+ if modelLimit == 0 {
137
+ modelLimit = 32768 // Reasonable default
138
+ }
139
+ responseTokens := r.getResponseTokenBudget()
140
+ available := modelLimit - responseTokens - int(float64(modelLimit)*0.15)
141
+ messages = pruneMessages(messages, available)
142
+ // Retry this iteration
143
+ iteration--
144
+ continue
145
+ }
146
+ }
114
147
  r.observer.Error("rlm", "LLM call failed on iteration %d: %v", iteration+1, err)
115
148
  return "", r.stats, err
116
149
  }
@@ -214,6 +247,48 @@ func (r *RLM) buildREPLEnv(query string, context string) map[string]interface{}
214
247
  return env
215
248
  }
216
249
 
250
+ // pruneMessages removes older middle messages to fit within a token budget.
251
+ // Preserves the first message (system prompt) and the last 2 messages (most recent exchange).
252
+ func pruneMessages(messages []Message, targetTokens int) []Message {
253
+ if len(messages) <= 3 {
254
+ return messages
255
+ }
256
+
257
+ // Always keep: system prompt (first), last 2 messages (most recent exchange)
258
+ system := messages[0]
259
+ lastN := messages[len(messages)-2:]
260
+
261
+ // Start with the preserved messages
262
+ result := []Message{system}
263
+ currentTokens := EstimateMessagesTokens(append(result, lastN...))
264
+
265
+ if currentTokens >= targetTokens {
266
+ // Even the minimum set exceeds the budget; return it anyway
267
+ return append(result, lastN...)
268
+ }
269
+
270
+ // Add middle messages from most recent to oldest until budget is exceeded
271
+ middle := messages[1 : len(messages)-2]
272
+ for i := len(middle) - 1; i >= 0; i-- {
273
+ msgTokens := 4 + EstimateTokens(middle[i].Content)
274
+ if currentTokens+msgTokens > targetTokens {
275
+ break
276
+ }
277
+ result = append(result, middle[i])
278
+ currentTokens += msgTokens
279
+ }
280
+
281
+ // Reverse the added middle messages (they were added newest-first)
282
+ if len(result) > 1 {
283
+ added := result[1:]
284
+ for i, j := 0, len(added)-1; i < j; i, j = i+1, j-1 {
285
+ added[i], added[j] = added[j], added[i]
286
+ }
287
+ }
288
+
289
+ return append(result, lastN...)
290
+ }
291
+
217
292
  // GetObserver returns the observer for external access to events/traces.
218
293
  func (r *RLM) GetObserver() *Observer {
219
294
  return r.observer
@@ -46,6 +46,20 @@ func (r *RLM) StructuredCompletion(query string, context string, config *Structu
46
46
  subTasks := decomposeSchema(config.Schema)
47
47
  r.observer.Debug("structured", "Schema decomposed into %d subtasks", len(subTasks))
48
48
 
49
+ // Pre-emptive overflow check: reduce context BEFORE building the prompt.
50
+ // Structured completion embeds the full context in the user message, so this is
51
+ // critical to prevent overflow on the first LLM call (following the RLM paper's
52
+ // principle: "the context window of the root LM is rarely clogged").
53
+ schemaJSON, _ := json.Marshal(config.Schema)
54
+ schemaOverhead := EstimateTokens(string(schemaJSON)) + structuredPromptOverhead
55
+ reducedCtx, wasReduced, reduceErr := r.PreemptiveReduceContext(query, context, schemaOverhead)
56
+ if reduceErr != nil {
57
+ r.observer.Error("structured", "Pre-emptive reduction failed: %v (proceeding with original context)", reduceErr)
58
+ } else if wasReduced {
59
+ r.observer.Debug("structured", "Pre-emptive reduction applied: %d -> %d chars", len(context), len(reducedCtx))
60
+ context = reducedCtx
61
+ }
62
+
49
63
  // If simple schema or parallel disabled, use direct method
50
64
  if len(subTasks) <= 2 || !config.ParallelExecution {
51
65
  r.observer.Debug("structured", "Using direct completion method")
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "recursive-llm-ts",
3
- "version": "4.7.0",
3
+ "version": "4.8.0",
4
4
  "description": "TypeScript bridge for recursive-llm: Recursive Language Models for unbounded context processing with structured outputs",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",