recursive-llm-ts 4.7.0 → 4.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/bin/rlm-go +0 -0
- package/dist/bridge-interface.d.ts +3 -0
- package/dist/rlm.js +10 -0
- package/go/README.md +2 -2
- package/go/cmd/rlm/main.go +1 -1
- package/go/go.mod +1 -1
- package/go/rlm/context_overflow.go +181 -29
- package/go/rlm/context_overflow_test.go +373 -3
- package/go/rlm/doc.go +2 -2
- package/go/rlm/meta_agent.go +18 -2
- package/go/rlm/observability.go +6 -0
- package/go/rlm/openai.go +27 -10
- package/go/rlm/rlm.go +86 -3
- package/go/rlm/structured.go +23 -0
- package/go/rlm/token_tracking_test.go +845 -0
- package/go/rlm/types.go +7 -4
- package/package.json +4 -4
|
@@ -153,7 +153,7 @@ func TestGetResponseTokenBudget(t *testing.T) {
|
|
|
153
153
|
config := DefaultContextOverflowConfig()
|
|
154
154
|
reducer := newContextReducer(rlm, config, obs)
|
|
155
155
|
|
|
156
|
-
budget := reducer.getResponseTokenBudget(
|
|
156
|
+
budget := reducer.getResponseTokenBudget()
|
|
157
157
|
if budget != 10000 {
|
|
158
158
|
t.Errorf("expected response token budget 10000, got %d", budget)
|
|
159
159
|
}
|
|
@@ -169,7 +169,7 @@ func TestGetResponseTokenBudget_MaxCompletionTokens(t *testing.T) {
|
|
|
169
169
|
config := DefaultContextOverflowConfig()
|
|
170
170
|
reducer := newContextReducer(rlm, config, obs)
|
|
171
171
|
|
|
172
|
-
budget := reducer.getResponseTokenBudget(
|
|
172
|
+
budget := reducer.getResponseTokenBudget()
|
|
173
173
|
if budget != 5000 {
|
|
174
174
|
t.Errorf("expected response token budget 5000, got %d", budget)
|
|
175
175
|
}
|
|
@@ -185,7 +185,7 @@ func TestGetResponseTokenBudget_NoMaxTokens(t *testing.T) {
|
|
|
185
185
|
config := DefaultContextOverflowConfig()
|
|
186
186
|
reducer := newContextReducer(rlm, config, obs)
|
|
187
187
|
|
|
188
|
-
budget := reducer.getResponseTokenBudget(
|
|
188
|
+
budget := reducer.getResponseTokenBudget()
|
|
189
189
|
if budget != 0 {
|
|
190
190
|
t.Errorf("expected response token budget 0, got %d", budget)
|
|
191
191
|
}
|
|
@@ -899,3 +899,373 @@ func TestReduceForCompletion_DispatchesTextRank(t *testing.T) {
|
|
|
899
899
|
t.Errorf("expected reduced context for textrank strategy")
|
|
900
900
|
}
|
|
901
901
|
}
|
|
902
|
+
|
|
903
|
+
// ─── Model Token Limits Tests ────────────────────────────────────────────────
|
|
904
|
+
|
|
905
|
+
func TestLookupModelTokenLimit_ExactMatch(t *testing.T) {
|
|
906
|
+
tests := []struct {
|
|
907
|
+
model string
|
|
908
|
+
expected int
|
|
909
|
+
}{
|
|
910
|
+
{"gpt-4o", 128000},
|
|
911
|
+
{"gpt-4o-mini", 128000},
|
|
912
|
+
{"gpt-4", 8192},
|
|
913
|
+
{"gpt-4-32k", 32768},
|
|
914
|
+
{"gpt-3.5-turbo", 16385},
|
|
915
|
+
{"claude-3-opus", 200000},
|
|
916
|
+
{"claude-sonnet-4", 200000},
|
|
917
|
+
{"mistral-7b", 32768},
|
|
918
|
+
}
|
|
919
|
+
|
|
920
|
+
for _, tt := range tests {
|
|
921
|
+
limit := LookupModelTokenLimit(tt.model)
|
|
922
|
+
if limit != tt.expected {
|
|
923
|
+
t.Errorf("LookupModelTokenLimit(%q) = %d, expected %d", tt.model, limit, tt.expected)
|
|
924
|
+
}
|
|
925
|
+
}
|
|
926
|
+
}
|
|
927
|
+
|
|
928
|
+
func TestLookupModelTokenLimit_PrefixMatch(t *testing.T) {
|
|
929
|
+
// Versioned model names should match by prefix
|
|
930
|
+
tests := []struct {
|
|
931
|
+
model string
|
|
932
|
+
expected int
|
|
933
|
+
}{
|
|
934
|
+
{"gpt-4o-mini-2024-07-18", 128000},
|
|
935
|
+
{"gpt-4o-2024-05-13", 128000},
|
|
936
|
+
{"claude-3-opus-20240229", 200000},
|
|
937
|
+
{"mistral-7b-instruct-v0.2", 32768},
|
|
938
|
+
}
|
|
939
|
+
|
|
940
|
+
for _, tt := range tests {
|
|
941
|
+
limit := LookupModelTokenLimit(tt.model)
|
|
942
|
+
if limit != tt.expected {
|
|
943
|
+
t.Errorf("LookupModelTokenLimit(%q) = %d, expected %d", tt.model, limit, tt.expected)
|
|
944
|
+
}
|
|
945
|
+
}
|
|
946
|
+
}
|
|
947
|
+
|
|
948
|
+
func TestLookupModelTokenLimit_Unknown(t *testing.T) {
|
|
949
|
+
limit := LookupModelTokenLimit("completely-unknown-model-xyz")
|
|
950
|
+
if limit != 0 {
|
|
951
|
+
t.Errorf("expected 0 for unknown model, got %d", limit)
|
|
952
|
+
}
|
|
953
|
+
}
|
|
954
|
+
|
|
955
|
+
func TestLookupModelTokenLimit_CaseInsensitive(t *testing.T) {
|
|
956
|
+
limit := LookupModelTokenLimit("GPT-4O-MINI")
|
|
957
|
+
if limit != 128000 {
|
|
958
|
+
t.Errorf("expected 128000 for case-insensitive match, got %d", limit)
|
|
959
|
+
}
|
|
960
|
+
}
|
|
961
|
+
|
|
962
|
+
func TestGetModelTokenLimit_ConfigOverride(t *testing.T) {
|
|
963
|
+
engine := New("gpt-4o-mini", Config{
|
|
964
|
+
APIKey: "test",
|
|
965
|
+
ContextOverflow: &ContextOverflowConfig{
|
|
966
|
+
Enabled: true,
|
|
967
|
+
MaxModelTokens: 16384,
|
|
968
|
+
},
|
|
969
|
+
})
|
|
970
|
+
|
|
971
|
+
limit := engine.getModelTokenLimit()
|
|
972
|
+
if limit != 16384 {
|
|
973
|
+
t.Errorf("expected config override 16384, got %d", limit)
|
|
974
|
+
}
|
|
975
|
+
}
|
|
976
|
+
|
|
977
|
+
func TestGetModelTokenLimit_ModelLookup(t *testing.T) {
|
|
978
|
+
engine := New("gpt-4o-mini", Config{
|
|
979
|
+
APIKey: "test",
|
|
980
|
+
})
|
|
981
|
+
|
|
982
|
+
limit := engine.getModelTokenLimit()
|
|
983
|
+
if limit != 128000 {
|
|
984
|
+
t.Errorf("expected model lookup 128000, got %d", limit)
|
|
985
|
+
}
|
|
986
|
+
}
|
|
987
|
+
|
|
988
|
+
func TestGetModelTokenLimit_UnknownModel(t *testing.T) {
|
|
989
|
+
engine := New("custom-local-model", Config{
|
|
990
|
+
APIKey: "test",
|
|
991
|
+
})
|
|
992
|
+
|
|
993
|
+
limit := engine.getModelTokenLimit()
|
|
994
|
+
if limit != 0 {
|
|
995
|
+
t.Errorf("expected 0 for unknown model, got %d", limit)
|
|
996
|
+
}
|
|
997
|
+
}
|
|
998
|
+
|
|
999
|
+
// ─── Pre-emptive Overflow Tests ──────────────────────────────────────────────
|
|
1000
|
+
|
|
1001
|
+
func TestPreemptiveReduceContext_SmallContext(t *testing.T) {
|
|
1002
|
+
engine := New("gpt-4o-mini", Config{
|
|
1003
|
+
APIKey: "test",
|
|
1004
|
+
})
|
|
1005
|
+
|
|
1006
|
+
// Small context should pass through unchanged
|
|
1007
|
+
context := "This is a small context that easily fits."
|
|
1008
|
+
reduced, wasReduced, err := engine.PreemptiveReduceContext("What is this?", context, 500)
|
|
1009
|
+
if err != nil {
|
|
1010
|
+
t.Fatalf("unexpected error: %v", err)
|
|
1011
|
+
}
|
|
1012
|
+
if wasReduced {
|
|
1013
|
+
t.Error("expected no reduction for small context")
|
|
1014
|
+
}
|
|
1015
|
+
if reduced != context {
|
|
1016
|
+
t.Error("expected context to be unchanged")
|
|
1017
|
+
}
|
|
1018
|
+
}
|
|
1019
|
+
|
|
1020
|
+
func TestPreemptiveReduceContext_LargeContext(t *testing.T) {
|
|
1021
|
+
engine := New("gpt-4o-mini", Config{
|
|
1022
|
+
APIKey: "test",
|
|
1023
|
+
ContextOverflow: &ContextOverflowConfig{
|
|
1024
|
+
Enabled: true,
|
|
1025
|
+
MaxModelTokens: 1000, // Very small limit to force overflow
|
|
1026
|
+
Strategy: "truncate",
|
|
1027
|
+
SafetyMargin: 0.15,
|
|
1028
|
+
},
|
|
1029
|
+
})
|
|
1030
|
+
|
|
1031
|
+
// Create large context that exceeds the 1000 token limit
|
|
1032
|
+
context := strings.Repeat("The revenue for Q4 was $4.2 billion, representing 23% year-over-year growth. ", 100)
|
|
1033
|
+
|
|
1034
|
+
reduced, wasReduced, err := engine.PreemptiveReduceContext("Summarize revenue", context, 300)
|
|
1035
|
+
if err != nil {
|
|
1036
|
+
t.Fatalf("unexpected error: %v", err)
|
|
1037
|
+
}
|
|
1038
|
+
if !wasReduced {
|
|
1039
|
+
t.Error("expected context to be reduced")
|
|
1040
|
+
}
|
|
1041
|
+
if len(reduced) >= len(context) {
|
|
1042
|
+
t.Errorf("expected reduced context to be shorter: %d >= %d", len(reduced), len(context))
|
|
1043
|
+
}
|
|
1044
|
+
}
|
|
1045
|
+
|
|
1046
|
+
func TestPreemptiveReduceContext_DisabledOverflow(t *testing.T) {
|
|
1047
|
+
engine := New("gpt-4o-mini", Config{
|
|
1048
|
+
APIKey: "test",
|
|
1049
|
+
ContextOverflow: &ContextOverflowConfig{
|
|
1050
|
+
Enabled: false,
|
|
1051
|
+
},
|
|
1052
|
+
})
|
|
1053
|
+
|
|
1054
|
+
context := strings.Repeat("Large content. ", 10000)
|
|
1055
|
+
reduced, wasReduced, err := engine.PreemptiveReduceContext("query", context, 500)
|
|
1056
|
+
if err != nil {
|
|
1057
|
+
t.Fatalf("unexpected error: %v", err)
|
|
1058
|
+
}
|
|
1059
|
+
if wasReduced {
|
|
1060
|
+
t.Error("expected no reduction when overflow is disabled")
|
|
1061
|
+
}
|
|
1062
|
+
if reduced != context {
|
|
1063
|
+
t.Error("expected context unchanged when overflow is disabled")
|
|
1064
|
+
}
|
|
1065
|
+
}
|
|
1066
|
+
|
|
1067
|
+
func TestPreemptiveReduceContext_UnknownModel(t *testing.T) {
|
|
1068
|
+
engine := New("custom-local-model", Config{
|
|
1069
|
+
APIKey: "test",
|
|
1070
|
+
})
|
|
1071
|
+
|
|
1072
|
+
// Unknown model with no config override → no pre-emptive check
|
|
1073
|
+
context := strings.Repeat("Large content. ", 10000)
|
|
1074
|
+
reduced, wasReduced, err := engine.PreemptiveReduceContext("query", context, 500)
|
|
1075
|
+
if err != nil {
|
|
1076
|
+
t.Fatalf("unexpected error: %v", err)
|
|
1077
|
+
}
|
|
1078
|
+
if wasReduced {
|
|
1079
|
+
t.Error("expected no reduction for unknown model with no config limit")
|
|
1080
|
+
}
|
|
1081
|
+
if reduced != context {
|
|
1082
|
+
t.Error("expected context unchanged")
|
|
1083
|
+
}
|
|
1084
|
+
}
|
|
1085
|
+
|
|
1086
|
+
func TestPreemptiveReduceContext_AccountsForResponseBudget(t *testing.T) {
|
|
1087
|
+
// With a high max_tokens, even moderate context should trigger reduction
|
|
1088
|
+
engine := New("gpt-4o-mini", Config{
|
|
1089
|
+
APIKey: "test",
|
|
1090
|
+
ContextOverflow: &ContextOverflowConfig{
|
|
1091
|
+
Enabled: true,
|
|
1092
|
+
MaxModelTokens: 2000,
|
|
1093
|
+
Strategy: "truncate",
|
|
1094
|
+
SafetyMargin: 0.15,
|
|
1095
|
+
},
|
|
1096
|
+
ExtraParams: map[string]interface{}{
|
|
1097
|
+
"max_tokens": float64(1000), // Large response budget
|
|
1098
|
+
},
|
|
1099
|
+
})
|
|
1100
|
+
|
|
1101
|
+
// Context of ~500 tokens + max_tokens 1000 + overhead = exceeds 2000
|
|
1102
|
+
context := strings.Repeat("Revenue data: the company earned $4.2B in Q4 fiscal year. ", 30)
|
|
1103
|
+
|
|
1104
|
+
reduced, wasReduced, err := engine.PreemptiveReduceContext("Summarize", context, 300)
|
|
1105
|
+
if err != nil {
|
|
1106
|
+
t.Fatalf("unexpected error: %v", err)
|
|
1107
|
+
}
|
|
1108
|
+
if !wasReduced {
|
|
1109
|
+
t.Error("expected reduction when response budget + context exceeds limit")
|
|
1110
|
+
}
|
|
1111
|
+
if len(reduced) >= len(context) {
|
|
1112
|
+
t.Errorf("expected reduced context: %d >= %d", len(reduced), len(context))
|
|
1113
|
+
}
|
|
1114
|
+
}
|
|
1115
|
+
|
|
1116
|
+
func TestPreemptiveReduceContext_TFIDFStrategy(t *testing.T) {
|
|
1117
|
+
engine := New("gpt-4o-mini", Config{
|
|
1118
|
+
APIKey: "test",
|
|
1119
|
+
ContextOverflow: &ContextOverflowConfig{
|
|
1120
|
+
Enabled: true,
|
|
1121
|
+
MaxModelTokens: 500,
|
|
1122
|
+
Strategy: "tfidf",
|
|
1123
|
+
SafetyMargin: 0.15,
|
|
1124
|
+
},
|
|
1125
|
+
})
|
|
1126
|
+
|
|
1127
|
+
context := strings.Repeat("Machine learning models process large datasets effectively. ", 100)
|
|
1128
|
+
|
|
1129
|
+
reduced, wasReduced, err := engine.PreemptiveReduceContext("Tell me about ML", context, 200)
|
|
1130
|
+
if err != nil {
|
|
1131
|
+
t.Fatalf("unexpected error: %v", err)
|
|
1132
|
+
}
|
|
1133
|
+
if !wasReduced {
|
|
1134
|
+
t.Error("expected reduction with tfidf strategy")
|
|
1135
|
+
}
|
|
1136
|
+
if len(reduced) >= len(context) {
|
|
1137
|
+
t.Errorf("expected shorter context: %d >= %d", len(reduced), len(context))
|
|
1138
|
+
}
|
|
1139
|
+
}
|
|
1140
|
+
|
|
1141
|
+
func TestPreemptiveReduceContext_TextRankStrategy(t *testing.T) {
|
|
1142
|
+
engine := New("gpt-4o-mini", Config{
|
|
1143
|
+
APIKey: "test",
|
|
1144
|
+
ContextOverflow: &ContextOverflowConfig{
|
|
1145
|
+
Enabled: true,
|
|
1146
|
+
MaxModelTokens: 500,
|
|
1147
|
+
Strategy: "textrank",
|
|
1148
|
+
SafetyMargin: 0.15,
|
|
1149
|
+
},
|
|
1150
|
+
})
|
|
1151
|
+
|
|
1152
|
+
context := strings.Repeat("Neural networks are powerful computation models. ", 100)
|
|
1153
|
+
|
|
1154
|
+
reduced, wasReduced, err := engine.PreemptiveReduceContext("Explain neural nets", context, 200)
|
|
1155
|
+
if err != nil {
|
|
1156
|
+
t.Fatalf("unexpected error: %v", err)
|
|
1157
|
+
}
|
|
1158
|
+
if !wasReduced {
|
|
1159
|
+
t.Error("expected reduction with textrank strategy")
|
|
1160
|
+
}
|
|
1161
|
+
if len(reduced) >= len(context) {
|
|
1162
|
+
t.Errorf("expected shorter context: %d >= %d", len(reduced), len(context))
|
|
1163
|
+
}
|
|
1164
|
+
}
|
|
1165
|
+
|
|
1166
|
+
func TestGetResponseTokenBudget_RLMMethod(t *testing.T) {
|
|
1167
|
+
engine := &RLM{
|
|
1168
|
+
extraParams: map[string]interface{}{
|
|
1169
|
+
"max_tokens": float64(5000),
|
|
1170
|
+
},
|
|
1171
|
+
}
|
|
1172
|
+
budget := engine.getResponseTokenBudget()
|
|
1173
|
+
if budget != 5000 {
|
|
1174
|
+
t.Errorf("expected 5000, got %d", budget)
|
|
1175
|
+
}
|
|
1176
|
+
}
|
|
1177
|
+
|
|
1178
|
+
func TestGetResponseTokenBudget_MaxCompletionTokensPreferred(t *testing.T) {
|
|
1179
|
+
engine := &RLM{
|
|
1180
|
+
extraParams: map[string]interface{}{
|
|
1181
|
+
"max_tokens": float64(5000),
|
|
1182
|
+
"max_completion_tokens": float64(8000),
|
|
1183
|
+
},
|
|
1184
|
+
}
|
|
1185
|
+
budget := engine.getResponseTokenBudget()
|
|
1186
|
+
if budget != 8000 {
|
|
1187
|
+
t.Errorf("expected max_completion_tokens=8000 preferred, got %d", budget)
|
|
1188
|
+
}
|
|
1189
|
+
}
|
|
1190
|
+
|
|
1191
|
+
func TestGetResponseTokenBudget_NoParams(t *testing.T) {
|
|
1192
|
+
engine := &RLM{
|
|
1193
|
+
extraParams: map[string]interface{}{
|
|
1194
|
+
"temperature": 0.7,
|
|
1195
|
+
},
|
|
1196
|
+
}
|
|
1197
|
+
budget := engine.getResponseTokenBudget()
|
|
1198
|
+
if budget != 0 {
|
|
1199
|
+
t.Errorf("expected 0 when no max_tokens set, got %d", budget)
|
|
1200
|
+
}
|
|
1201
|
+
}
|
|
1202
|
+
|
|
1203
|
+
// ─── Message Pruning Tests ───────────────────────────────────────────────────
|
|
1204
|
+
|
|
1205
|
+
func TestPruneMessages_SmallHistory(t *testing.T) {
|
|
1206
|
+
messages := []Message{
|
|
1207
|
+
{Role: "system", Content: "You are helpful."},
|
|
1208
|
+
{Role: "user", Content: "Hello"},
|
|
1209
|
+
{Role: "assistant", Content: "Hi there!"},
|
|
1210
|
+
}
|
|
1211
|
+
|
|
1212
|
+
result := pruneMessages(messages, 100)
|
|
1213
|
+
if len(result) != 3 {
|
|
1214
|
+
t.Errorf("expected 3 messages (no pruning needed), got %d", len(result))
|
|
1215
|
+
}
|
|
1216
|
+
}
|
|
1217
|
+
|
|
1218
|
+
func TestPruneMessages_PreservesSystemAndLast(t *testing.T) {
|
|
1219
|
+
messages := []Message{
|
|
1220
|
+
{Role: "system", Content: "System prompt"},
|
|
1221
|
+
{Role: "user", Content: "First question"},
|
|
1222
|
+
{Role: "assistant", Content: "First answer"},
|
|
1223
|
+
{Role: "user", Content: "Second question"},
|
|
1224
|
+
{Role: "assistant", Content: "Second answer"},
|
|
1225
|
+
{Role: "user", Content: strings.Repeat("Third question with lots of context. ", 100)},
|
|
1226
|
+
{Role: "assistant", Content: "Third answer"},
|
|
1227
|
+
}
|
|
1228
|
+
|
|
1229
|
+
result := pruneMessages(messages, 50) // Very tight budget
|
|
1230
|
+
|
|
1231
|
+
// Should always keep system prompt (first) and last 2 messages
|
|
1232
|
+
if len(result) < 3 {
|
|
1233
|
+
t.Errorf("expected at least 3 messages, got %d", len(result))
|
|
1234
|
+
}
|
|
1235
|
+
if result[0].Role != "system" {
|
|
1236
|
+
t.Error("first message should be system prompt")
|
|
1237
|
+
}
|
|
1238
|
+
if result[len(result)-1].Content != "Third answer" {
|
|
1239
|
+
t.Error("last message should be the most recent")
|
|
1240
|
+
}
|
|
1241
|
+
if result[len(result)-2].Role != "user" {
|
|
1242
|
+
t.Error("second-to-last should be the most recent user message")
|
|
1243
|
+
}
|
|
1244
|
+
}
|
|
1245
|
+
|
|
1246
|
+
func TestPruneMessages_KeepsRecentMiddleMessages(t *testing.T) {
|
|
1247
|
+
messages := []Message{
|
|
1248
|
+
{Role: "system", Content: "Short."},
|
|
1249
|
+
{Role: "user", Content: "Q1"},
|
|
1250
|
+
{Role: "assistant", Content: "A1"},
|
|
1251
|
+
{Role: "user", Content: "Q2"},
|
|
1252
|
+
{Role: "assistant", Content: "A2"},
|
|
1253
|
+
{Role: "user", Content: "Q3"},
|
|
1254
|
+
{Role: "assistant", Content: "A3"},
|
|
1255
|
+
}
|
|
1256
|
+
|
|
1257
|
+
// Budget large enough for all
|
|
1258
|
+
result := pruneMessages(messages, 10000)
|
|
1259
|
+
if len(result) != 7 {
|
|
1260
|
+
t.Errorf("expected all 7 messages with large budget, got %d", len(result))
|
|
1261
|
+
}
|
|
1262
|
+
}
|
|
1263
|
+
|
|
1264
|
+
// ─── Structured Completion Pre-emptive Integration Tests ─────────────────────
|
|
1265
|
+
|
|
1266
|
+
func TestStructuredPromptOverhead_Constant(t *testing.T) {
|
|
1267
|
+
// Verify the constant is reasonable (300-500 tokens for structured prompt instructions)
|
|
1268
|
+
if structuredPromptOverhead < 200 || structuredPromptOverhead > 600 {
|
|
1269
|
+
t.Errorf("structuredPromptOverhead=%d seems out of range (expected 200-600)", structuredPromptOverhead)
|
|
1270
|
+
}
|
|
1271
|
+
}
|
package/go/rlm/doc.go
CHANGED
|
@@ -8,13 +8,13 @@
|
|
|
8
8
|
//
|
|
9
9
|
// To use this package in your Go project:
|
|
10
10
|
//
|
|
11
|
-
// go get github.com/
|
|
11
|
+
// go get github.com/howlerops/recursive-llm-ts/go
|
|
12
12
|
//
|
|
13
13
|
// # Basic Usage
|
|
14
14
|
//
|
|
15
15
|
// Create an RLM engine and execute a completion:
|
|
16
16
|
//
|
|
17
|
-
// import "github.com/
|
|
17
|
+
// import "github.com/howlerops/recursive-llm-ts/go/rlm"
|
|
18
18
|
//
|
|
19
19
|
// config := rlm.Config{
|
|
20
20
|
// MaxDepth: 5,
|
package/go/rlm/meta_agent.go
CHANGED
|
@@ -74,7 +74,15 @@ func (ma *MetaAgent) OptimizeQuery(query string, context string) (string, error)
|
|
|
74
74
|
return query, nil
|
|
75
75
|
}
|
|
76
76
|
|
|
77
|
-
|
|
77
|
+
// Track meta-agent token usage in the parent RLM's stats
|
|
78
|
+
if result.Usage != nil {
|
|
79
|
+
ma.rlm.stats.PromptTokens += result.Usage.PromptTokens
|
|
80
|
+
ma.rlm.stats.CompletionTokens += result.Usage.CompletionTokens
|
|
81
|
+
ma.rlm.stats.TotalTokens += result.Usage.TotalTokens
|
|
82
|
+
}
|
|
83
|
+
ma.rlm.stats.LlmCalls++
|
|
84
|
+
|
|
85
|
+
optimized := strings.TrimSpace(result.Content)
|
|
78
86
|
ma.obs.Debug("meta_agent", "Optimized query: %s", truncateStr(optimized, 200))
|
|
79
87
|
ma.obs.Event("meta_agent.query_optimized", map[string]string{
|
|
80
88
|
"original_length": fmt.Sprintf("%d", len(query)),
|
|
@@ -136,7 +144,15 @@ func (ma *MetaAgent) OptimizeForStructured(query string, context string, schema
|
|
|
136
144
|
return query, nil
|
|
137
145
|
}
|
|
138
146
|
|
|
139
|
-
|
|
147
|
+
// Track meta-agent token usage in the parent RLM's stats
|
|
148
|
+
if result.Usage != nil {
|
|
149
|
+
ma.rlm.stats.PromptTokens += result.Usage.PromptTokens
|
|
150
|
+
ma.rlm.stats.CompletionTokens += result.Usage.CompletionTokens
|
|
151
|
+
ma.rlm.stats.TotalTokens += result.Usage.TotalTokens
|
|
152
|
+
}
|
|
153
|
+
ma.rlm.stats.LlmCalls++
|
|
154
|
+
|
|
155
|
+
optimized := strings.TrimSpace(result.Content)
|
|
140
156
|
ma.obs.Debug("meta_agent", "Optimized structured query: %s", truncateStr(optimized, 200))
|
|
141
157
|
ma.obs.Event("meta_agent.structured_query_optimized", map[string]string{
|
|
142
158
|
"original_length": fmt.Sprintf("%d", len(query)),
|
package/go/rlm/observability.go
CHANGED
|
@@ -445,6 +445,12 @@ func FormatStatsWithObservability(stats RLMStats, obs *Observer) map[string]inte
|
|
|
445
445
|
result["parsing_retries"] = stats.ParsingRetries
|
|
446
446
|
}
|
|
447
447
|
|
|
448
|
+
if stats.TotalTokens > 0 {
|
|
449
|
+
result["total_tokens"] = stats.TotalTokens
|
|
450
|
+
result["prompt_tokens"] = stats.PromptTokens
|
|
451
|
+
result["completion_tokens"] = stats.CompletionTokens
|
|
452
|
+
}
|
|
453
|
+
|
|
448
454
|
if obs != nil && obs.config.Debug {
|
|
449
455
|
events := obs.GetEvents()
|
|
450
456
|
if len(events) > 0 {
|
package/go/rlm/openai.go
CHANGED
|
@@ -34,6 +34,20 @@ type chatResponse struct {
|
|
|
34
34
|
Error *struct {
|
|
35
35
|
Message string `json:"message"`
|
|
36
36
|
} `json:"error"`
|
|
37
|
+
Usage *TokenUsage `json:"usage,omitempty"`
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
// TokenUsage represents token consumption from an LLM API response.
|
|
41
|
+
type TokenUsage struct {
|
|
42
|
+
PromptTokens int `json:"prompt_tokens"`
|
|
43
|
+
CompletionTokens int `json:"completion_tokens"`
|
|
44
|
+
TotalTokens int `json:"total_tokens"`
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
// ChatCompletionResult holds the content and token usage from an LLM call.
|
|
48
|
+
type ChatCompletionResult struct {
|
|
49
|
+
Content string
|
|
50
|
+
Usage *TokenUsage
|
|
37
51
|
}
|
|
38
52
|
|
|
39
53
|
var (
|
|
@@ -48,7 +62,7 @@ var (
|
|
|
48
62
|
}
|
|
49
63
|
)
|
|
50
64
|
|
|
51
|
-
func CallChatCompletion(request ChatRequest) (string, error) {
|
|
65
|
+
func CallChatCompletion(request ChatRequest) (ChatCompletionResult, error) {
|
|
52
66
|
endpoint := buildEndpoint(request.APIBase)
|
|
53
67
|
payload := map[string]interface{}{
|
|
54
68
|
"model": request.Model,
|
|
@@ -61,7 +75,7 @@ func CallChatCompletion(request ChatRequest) (string, error) {
|
|
|
61
75
|
|
|
62
76
|
body, err := json.Marshal(payload)
|
|
63
77
|
if err != nil {
|
|
64
|
-
return
|
|
78
|
+
return ChatCompletionResult{}, err
|
|
65
79
|
}
|
|
66
80
|
|
|
67
81
|
// Use shared client with connection pooling
|
|
@@ -76,7 +90,7 @@ func CallChatCompletion(request ChatRequest) (string, error) {
|
|
|
76
90
|
|
|
77
91
|
req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(body))
|
|
78
92
|
if err != nil {
|
|
79
|
-
return
|
|
93
|
+
return ChatCompletionResult{}, err
|
|
80
94
|
}
|
|
81
95
|
req.Header.Set("Content-Type", "application/json")
|
|
82
96
|
if request.APIKey != "" {
|
|
@@ -85,7 +99,7 @@ func CallChatCompletion(request ChatRequest) (string, error) {
|
|
|
85
99
|
|
|
86
100
|
resp, err := client.Do(req)
|
|
87
101
|
if err != nil {
|
|
88
|
-
return
|
|
102
|
+
return ChatCompletionResult{}, err
|
|
89
103
|
}
|
|
90
104
|
defer func() {
|
|
91
105
|
_ = resp.Body.Close()
|
|
@@ -93,27 +107,30 @@ func CallChatCompletion(request ChatRequest) (string, error) {
|
|
|
93
107
|
|
|
94
108
|
responseBody, err := io.ReadAll(resp.Body)
|
|
95
109
|
if err != nil {
|
|
96
|
-
return
|
|
110
|
+
return ChatCompletionResult{}, err
|
|
97
111
|
}
|
|
98
112
|
|
|
99
113
|
if resp.StatusCode >= http.StatusBadRequest {
|
|
100
|
-
return
|
|
114
|
+
return ChatCompletionResult{}, NewAPIError(resp.StatusCode, strings.TrimSpace(string(responseBody)))
|
|
101
115
|
}
|
|
102
116
|
|
|
103
117
|
var parsed chatResponse
|
|
104
118
|
if err := json.Unmarshal(responseBody, &parsed); err != nil {
|
|
105
|
-
return
|
|
119
|
+
return ChatCompletionResult{}, err
|
|
106
120
|
}
|
|
107
121
|
|
|
108
122
|
if parsed.Error != nil && parsed.Error.Message != "" {
|
|
109
|
-
return
|
|
123
|
+
return ChatCompletionResult{}, errors.New(parsed.Error.Message)
|
|
110
124
|
}
|
|
111
125
|
|
|
112
126
|
if len(parsed.Choices) == 0 {
|
|
113
|
-
return
|
|
127
|
+
return ChatCompletionResult{}, errors.New("no choices returned by LLM")
|
|
114
128
|
}
|
|
115
129
|
|
|
116
|
-
return
|
|
130
|
+
return ChatCompletionResult{
|
|
131
|
+
Content: parsed.Choices[0].Message.Content,
|
|
132
|
+
Usage: parsed.Usage,
|
|
133
|
+
}, nil
|
|
117
134
|
}
|
|
118
135
|
|
|
119
136
|
func buildEndpoint(apiBase string) string {
|
package/go/rlm/rlm.go
CHANGED
|
@@ -109,8 +109,41 @@ func (r *RLM) Completion(query string, context string) (string, RLMStats, error)
|
|
|
109
109
|
r.stats.Iterations = iteration + 1
|
|
110
110
|
r.observer.Debug("rlm", "Iteration %d/%d at depth %d", iteration+1, r.maxIterations, r.currentDepth)
|
|
111
111
|
|
|
112
|
+
// Pre-emptive message overflow check: prune older messages if history is growing too large.
|
|
113
|
+
// Regular completion stores context in the REPL env (not messages), but the iterative
|
|
114
|
+
// loop appends assistant+user messages each iteration which can accumulate.
|
|
115
|
+
if modelLimit := r.getModelTokenLimit(); modelLimit > 0 && len(messages) > 4 {
|
|
116
|
+
msgTokens := EstimateMessagesTokens(messages)
|
|
117
|
+
responseTokens := r.getResponseTokenBudget()
|
|
118
|
+
safetyMargin := 0.15
|
|
119
|
+
if r.contextOverflow != nil && r.contextOverflow.SafetyMargin > 0 {
|
|
120
|
+
safetyMargin = r.contextOverflow.SafetyMargin
|
|
121
|
+
}
|
|
122
|
+
available := modelLimit - responseTokens - int(float64(modelLimit)*safetyMargin)
|
|
123
|
+
if msgTokens > available {
|
|
124
|
+
r.observer.Debug("rlm", "Message history overflow: %d tokens > %d available, pruning middle messages", msgTokens, available)
|
|
125
|
+
messages = pruneMessages(messages, available)
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
|
|
112
129
|
response, err := r.callLLM(messages)
|
|
113
130
|
if err != nil {
|
|
131
|
+
// Check for context overflow and attempt recovery
|
|
132
|
+
if r.contextOverflow != nil && r.contextOverflow.Enabled {
|
|
133
|
+
if _, isOverflow := IsContextOverflow(err); isOverflow && len(messages) > 4 {
|
|
134
|
+
r.observer.Debug("rlm", "Context overflow on iteration %d, pruning messages and retrying", iteration+1)
|
|
135
|
+
modelLimit := r.getModelTokenLimit()
|
|
136
|
+
if modelLimit == 0 {
|
|
137
|
+
modelLimit = 32768 // Reasonable default
|
|
138
|
+
}
|
|
139
|
+
responseTokens := r.getResponseTokenBudget()
|
|
140
|
+
available := modelLimit - responseTokens - int(float64(modelLimit)*0.15)
|
|
141
|
+
messages = pruneMessages(messages, available)
|
|
142
|
+
// Retry this iteration
|
|
143
|
+
iteration--
|
|
144
|
+
continue
|
|
145
|
+
}
|
|
146
|
+
}
|
|
114
147
|
r.observer.Error("rlm", "LLM call failed on iteration %d: %v", iteration+1, err)
|
|
115
148
|
return "", r.stats, err
|
|
116
149
|
}
|
|
@@ -165,14 +198,22 @@ func (r *RLM) callLLM(messages []Message) (string, error) {
|
|
|
165
198
|
result, err := CallChatCompletion(request)
|
|
166
199
|
duration := time.Since(start)
|
|
167
200
|
|
|
168
|
-
|
|
201
|
+
tokensUsed := 0
|
|
202
|
+
if result.Usage != nil {
|
|
203
|
+
r.stats.PromptTokens += result.Usage.PromptTokens
|
|
204
|
+
r.stats.CompletionTokens += result.Usage.CompletionTokens
|
|
205
|
+
r.stats.TotalTokens += result.Usage.TotalTokens
|
|
206
|
+
tokensUsed = result.Usage.TotalTokens
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
r.observer.LLMCall(defaultModel, len(messages), tokensUsed, duration, err)
|
|
169
210
|
|
|
170
211
|
if err != nil {
|
|
171
212
|
return "", err
|
|
172
213
|
}
|
|
173
214
|
|
|
174
|
-
r.observer.Debug("llm", "Response received (%d chars) in %s", len(result), duration)
|
|
175
|
-
return result, nil
|
|
215
|
+
r.observer.Debug("llm", "Response received (%d chars, %d tokens) in %s", len(result.Content), tokensUsed, duration)
|
|
216
|
+
return result.Content, nil
|
|
176
217
|
}
|
|
177
218
|
|
|
178
219
|
func (r *RLM) buildREPLEnv(query string, context string) map[string]interface{} {
|
|
@@ -214,6 +255,48 @@ func (r *RLM) buildREPLEnv(query string, context string) map[string]interface{}
|
|
|
214
255
|
return env
|
|
215
256
|
}
|
|
216
257
|
|
|
258
|
+
// pruneMessages removes older middle messages to fit within a token budget.
|
|
259
|
+
// Preserves the first message (system prompt) and the last 2 messages (most recent exchange).
|
|
260
|
+
func pruneMessages(messages []Message, targetTokens int) []Message {
|
|
261
|
+
if len(messages) <= 3 {
|
|
262
|
+
return messages
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
// Always keep: system prompt (first), last 2 messages (most recent exchange)
|
|
266
|
+
system := messages[0]
|
|
267
|
+
lastN := messages[len(messages)-2:]
|
|
268
|
+
|
|
269
|
+
// Start with the preserved messages
|
|
270
|
+
result := []Message{system}
|
|
271
|
+
currentTokens := EstimateMessagesTokens(append(result, lastN...))
|
|
272
|
+
|
|
273
|
+
if currentTokens >= targetTokens {
|
|
274
|
+
// Even the minimum set exceeds the budget; return it anyway
|
|
275
|
+
return append(result, lastN...)
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
// Add middle messages from most recent to oldest until budget is exceeded
|
|
279
|
+
middle := messages[1 : len(messages)-2]
|
|
280
|
+
for i := len(middle) - 1; i >= 0; i-- {
|
|
281
|
+
msgTokens := 4 + EstimateTokens(middle[i].Content)
|
|
282
|
+
if currentTokens+msgTokens > targetTokens {
|
|
283
|
+
break
|
|
284
|
+
}
|
|
285
|
+
result = append(result, middle[i])
|
|
286
|
+
currentTokens += msgTokens
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
// Reverse the added middle messages (they were added newest-first)
|
|
290
|
+
if len(result) > 1 {
|
|
291
|
+
added := result[1:]
|
|
292
|
+
for i, j := 0, len(added)-1; i < j; i, j = i+1, j-1 {
|
|
293
|
+
added[i], added[j] = added[j], added[i]
|
|
294
|
+
}
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
return append(result, lastN...)
|
|
298
|
+
}
|
|
299
|
+
|
|
217
300
|
// GetObserver returns the observer for external access to events/traces.
|
|
218
301
|
func (r *RLM) GetObserver() *Observer {
|
|
219
302
|
return r.observer
|