recursive-llm-ts 4.7.0 → 4.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/bin/rlm-go +0 -0
- package/dist/bridge-interface.d.ts +3 -0
- package/dist/rlm.js +10 -0
- package/go/README.md +2 -2
- package/go/cmd/rlm/main.go +1 -1
- package/go/go.mod +1 -1
- package/go/rlm/context_overflow.go +181 -29
- package/go/rlm/context_overflow_test.go +373 -3
- package/go/rlm/doc.go +2 -2
- package/go/rlm/meta_agent.go +18 -2
- package/go/rlm/observability.go +6 -0
- package/go/rlm/openai.go +27 -10
- package/go/rlm/rlm.go +86 -3
- package/go/rlm/structured.go +23 -0
- package/go/rlm/token_tracking_test.go +845 -0
- package/go/rlm/types.go +7 -4
- package/package.json +4 -4
package/README.md
CHANGED
|
@@ -1120,7 +1120,7 @@ The Go implementation can be used as a standalone library in Go projects.
|
|
|
1120
1120
|
### Installation
|
|
1121
1121
|
|
|
1122
1122
|
```bash
|
|
1123
|
-
go get github.com/
|
|
1123
|
+
go get github.com/howlerops/recursive-llm-ts/go
|
|
1124
1124
|
```
|
|
1125
1125
|
|
|
1126
1126
|
### Usage
|
|
@@ -1132,7 +1132,7 @@ import (
|
|
|
1132
1132
|
"fmt"
|
|
1133
1133
|
"os"
|
|
1134
1134
|
|
|
1135
|
-
"github.com/
|
|
1135
|
+
"github.com/howlerops/recursive-llm-ts/go/rlm"
|
|
1136
1136
|
)
|
|
1137
1137
|
|
|
1138
1138
|
func main() {
|
package/bin/rlm-go
CHANGED
|
Binary file
|
package/dist/rlm.js
CHANGED
|
@@ -44,6 +44,7 @@ class RLMResultFormatter {
|
|
|
44
44
|
}
|
|
45
45
|
/** Format stats as a concise one-liner */
|
|
46
46
|
prettyStats() {
|
|
47
|
+
var _a, _b;
|
|
47
48
|
const parts = [
|
|
48
49
|
`LLM Calls: ${this.stats.llm_calls}`,
|
|
49
50
|
`Iterations: ${this.stats.iterations}`,
|
|
@@ -52,6 +53,9 @@ class RLMResultFormatter {
|
|
|
52
53
|
if (this.stats.parsing_retries) {
|
|
53
54
|
parts.push(`Retries: ${this.stats.parsing_retries}`);
|
|
54
55
|
}
|
|
56
|
+
if (this.stats.total_tokens) {
|
|
57
|
+
parts.push(`Tokens: ${this.stats.total_tokens} (prompt: ${(_a = this.stats.prompt_tokens) !== null && _a !== void 0 ? _a : 0}, completion: ${(_b = this.stats.completion_tokens) !== null && _b !== void 0 ? _b : 0})`);
|
|
58
|
+
}
|
|
55
59
|
if (this.cached) {
|
|
56
60
|
parts.push('(cached)');
|
|
57
61
|
}
|
|
@@ -69,6 +73,7 @@ class RLMResultFormatter {
|
|
|
69
73
|
}
|
|
70
74
|
/** Format as Markdown */
|
|
71
75
|
toMarkdown() {
|
|
76
|
+
var _a, _b;
|
|
72
77
|
const lines = [
|
|
73
78
|
'## Result',
|
|
74
79
|
'',
|
|
@@ -85,6 +90,11 @@ class RLMResultFormatter {
|
|
|
85
90
|
if (this.stats.parsing_retries) {
|
|
86
91
|
lines.push(`| Parsing Retries | ${this.stats.parsing_retries} |`);
|
|
87
92
|
}
|
|
93
|
+
if (this.stats.total_tokens) {
|
|
94
|
+
lines.push(`| Total Tokens | ${this.stats.total_tokens} |`);
|
|
95
|
+
lines.push(`| Prompt Tokens | ${(_a = this.stats.prompt_tokens) !== null && _a !== void 0 ? _a : 0} |`);
|
|
96
|
+
lines.push(`| Completion Tokens | ${(_b = this.stats.completion_tokens) !== null && _b !== void 0 ? _b : 0} |`);
|
|
97
|
+
}
|
|
88
98
|
lines.push(`| Cached | ${this.cached} |`);
|
|
89
99
|
lines.push(`| Model | ${this.model} |`);
|
|
90
100
|
return lines.join('\n');
|
package/go/README.md
CHANGED
|
@@ -13,7 +13,7 @@ This is both a standalone Go library and CLI binary that implements the RLM algo
|
|
|
13
13
|
### As a Go Library
|
|
14
14
|
|
|
15
15
|
```bash
|
|
16
|
-
go get github.com/
|
|
16
|
+
go get github.com/howlerops/recursive-llm-ts/go
|
|
17
17
|
```
|
|
18
18
|
|
|
19
19
|
### Usage as Library
|
|
@@ -25,7 +25,7 @@ import (
|
|
|
25
25
|
"fmt"
|
|
26
26
|
"os"
|
|
27
27
|
|
|
28
|
-
"github.com/
|
|
28
|
+
"github.com/howlerops/recursive-llm-ts/go/rlm"
|
|
29
29
|
)
|
|
30
30
|
|
|
31
31
|
func main() {
|
package/go/cmd/rlm/main.go
CHANGED
package/go/go.mod
CHANGED
package/go/rlm/context_overflow.go
CHANGED
|
@@ -30,6 +30,155 @@ func DefaultContextOverflowConfig() ContextOverflowConfig {
|
|
|
30
30
|
}
|
|
31
31
|
}
|
|
32
32
|
|
|
33
|
+
// ─── Model Token Limits ──────────────────────────────────────────────────────
|
|
34
|
+
|
|
35
|
+
// modelTokenLimits maps known model name patterns to their maximum context window sizes.
|
|
36
|
+
// Used for pre-emptive overflow detection so we don't need to wait for API errors.
|
|
37
|
+
var modelTokenLimits = map[string]int{
|
|
38
|
+
// OpenAI
|
|
39
|
+
"gpt-4o": 128000,
|
|
40
|
+
"gpt-4o-mini": 128000,
|
|
41
|
+
"gpt-4-turbo": 128000,
|
|
42
|
+
"gpt-4": 8192,
|
|
43
|
+
"gpt-4-32k": 32768,
|
|
44
|
+
"gpt-3.5-turbo": 16385,
|
|
45
|
+
"gpt-3.5-turbo-16k": 16385,
|
|
46
|
+
"o1": 200000,
|
|
47
|
+
"o1-mini": 128000,
|
|
48
|
+
"o1-preview": 128000,
|
|
49
|
+
"o3-mini": 200000,
|
|
50
|
+
// Anthropic (via LiteLLM/proxy)
|
|
51
|
+
"claude-3-opus": 200000,
|
|
52
|
+
"claude-3-sonnet": 200000,
|
|
53
|
+
"claude-3-haiku": 200000,
|
|
54
|
+
"claude-3.5-sonnet": 200000,
|
|
55
|
+
"claude-3.5-haiku": 200000,
|
|
56
|
+
"claude-sonnet-4": 200000,
|
|
57
|
+
"claude-opus-4": 200000,
|
|
58
|
+
// Llama (common vLLM deployments)
|
|
59
|
+
"llama-3": 8192,
|
|
60
|
+
"llama-3.1": 128000,
|
|
61
|
+
"llama-3.2": 128000,
|
|
62
|
+
"llama-3.3": 128000,
|
|
63
|
+
// Mistral
|
|
64
|
+
"mistral-7b": 32768,
|
|
65
|
+
"mixtral-8x7b": 32768,
|
|
66
|
+
"mistral-large": 128000,
|
|
67
|
+
"mistral-small": 128000,
|
|
68
|
+
// Qwen
|
|
69
|
+
"qwen-2": 32768,
|
|
70
|
+
"qwen-2.5": 128000,
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
// LookupModelTokenLimit returns the known token limit for a model, or 0 if unknown.
|
|
74
|
+
// Matches by prefix so "gpt-4o-mini-2024-07-18" matches "gpt-4o-mini".
|
|
75
|
+
func LookupModelTokenLimit(model string) int {
|
|
76
|
+
lowerModel := strings.ToLower(model)
|
|
77
|
+
|
|
78
|
+
// Try exact match first
|
|
79
|
+
if limit, ok := modelTokenLimits[lowerModel]; ok {
|
|
80
|
+
return limit
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
// Try prefix matching (longest prefix wins)
|
|
84
|
+
bestMatch := ""
|
|
85
|
+
bestLimit := 0
|
|
86
|
+
for pattern, limit := range modelTokenLimits {
|
|
87
|
+
if strings.HasPrefix(lowerModel, pattern) && len(pattern) > len(bestMatch) {
|
|
88
|
+
bestMatch = pattern
|
|
89
|
+
bestLimit = limit
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
return bestLimit
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
// getModelTokenLimit returns the effective token limit for pre-emptive overflow checks.
|
|
97
|
+
// Priority: config override > model name lookup > 0 (disabled).
|
|
98
|
+
func (r *RLM) getModelTokenLimit() int {
|
|
99
|
+
if r.contextOverflow != nil && r.contextOverflow.MaxModelTokens > 0 {
|
|
100
|
+
return r.contextOverflow.MaxModelTokens
|
|
101
|
+
}
|
|
102
|
+
return LookupModelTokenLimit(r.model)
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
// ─── Pre-emptive Overflow Check ──────────────────────────────────────────────
|
|
106
|
+
|
|
107
|
+
// structuredPromptOverhead is the approximate token overhead for structured completion prompts
|
|
108
|
+
// (instructions, schema constraints, JSON formatting directives).
|
|
109
|
+
const structuredPromptOverhead = 350
|
|
110
|
+
|
|
111
|
+
// PreemptiveReduceContext checks if the context would overflow the model's token limit
|
|
112
|
+
// and reduces it proactively BEFORE building the prompt. Returns the (possibly reduced)
|
|
113
|
+
// context, or an error if reduction fails.
|
|
114
|
+
//
|
|
115
|
+
// This is called before the first LLM call, unlike post-hoc overflow recovery which
|
|
116
|
+
// only triggers after an API error. Following the RLM paper's principle that
|
|
117
|
+
// "the context window of the root LM is rarely clogged."
|
|
118
|
+
func (r *RLM) PreemptiveReduceContext(query string, context string, extraOverhead int) (string, bool, error) {
|
|
119
|
+
modelLimit := r.getModelTokenLimit()
|
|
120
|
+
if modelLimit == 0 {
|
|
121
|
+
// No known limit; skip pre-emptive check (will rely on post-hoc recovery)
|
|
122
|
+
return context, false, nil
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
if r.contextOverflow == nil || !r.contextOverflow.Enabled {
|
|
126
|
+
return context, false, nil
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
// Estimate total token budget needed
|
|
130
|
+
contextTokens := EstimateTokens(context)
|
|
131
|
+
queryTokens := EstimateTokens(query)
|
|
132
|
+
responseTokens := r.getResponseTokenBudget()
|
|
133
|
+
safetyMargin := r.contextOverflow.SafetyMargin
|
|
134
|
+
if safetyMargin == 0 {
|
|
135
|
+
safetyMargin = 0.15
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
totalEstimate := contextTokens + queryTokens + extraOverhead + responseTokens +
|
|
139
|
+
int(float64(modelLimit)*safetyMargin)
|
|
140
|
+
|
|
141
|
+
r.observer.Debug("overflow", "Pre-emptive check: context=%d query=%d overhead=%d response=%d safety=%d total=%d limit=%d",
|
|
142
|
+
contextTokens, queryTokens, extraOverhead, responseTokens,
|
|
143
|
+
int(float64(modelLimit)*safetyMargin), totalEstimate, modelLimit)
|
|
144
|
+
|
|
145
|
+
if totalEstimate <= modelLimit {
|
|
146
|
+
return context, false, nil
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
// Context would overflow — reduce it proactively
|
|
150
|
+
r.observer.Debug("overflow", "Pre-emptive reduction needed: estimated %d tokens > limit %d", totalEstimate, modelLimit)
|
|
151
|
+
|
|
152
|
+
reducer := newContextReducer(r, *r.contextOverflow, r.observer)
|
|
153
|
+
reduced, err := reducer.ReduceForCompletion(query, context, modelLimit)
|
|
154
|
+
if err != nil {
|
|
155
|
+
return context, false, fmt.Errorf("pre-emptive context reduction failed: %w", err)
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
r.observer.Debug("overflow", "Pre-emptive reduction: %d -> %d chars", len(context), len(reduced))
|
|
159
|
+
return reduced, true, nil
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
// getResponseTokenBudget extracts max_tokens or max_completion_tokens from ExtraParams.
|
|
163
|
+
func (r *RLM) getResponseTokenBudget() int {
|
|
164
|
+
if r.extraParams == nil {
|
|
165
|
+
return 0
|
|
166
|
+
}
|
|
167
|
+
for _, key := range []string{"max_completion_tokens", "max_tokens"} {
|
|
168
|
+
if v, ok := r.extraParams[key]; ok {
|
|
169
|
+
switch n := v.(type) {
|
|
170
|
+
case float64:
|
|
171
|
+
return int(n)
|
|
172
|
+
case int:
|
|
173
|
+
return n
|
|
174
|
+
case int64:
|
|
175
|
+
return int(n)
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
return 0
|
|
180
|
+
}
|
|
181
|
+
|
|
33
182
|
// ─── Token Estimation ────────────────────────────────────────────────────────
|
|
34
183
|
|
|
35
184
|
// EstimateTokens provides a fast approximation of token count for a string.
|
|
@@ -168,27 +317,9 @@ func newContextReducer(rlm *RLM, config ContextOverflowConfig, obs *Observer) *c
|
|
|
168
317
|
return &contextReducer{rlm: rlm, config: config, obs: obs}
|
|
169
318
|
}
|
|
170
319
|
|
|
171
|
-
// getResponseTokenBudget
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
func (cr *contextReducer) getResponseTokenBudget(modelLimit int) int {
|
|
175
|
-
if cr.rlm.extraParams == nil {
|
|
176
|
-
return 0
|
|
177
|
-
}
|
|
178
|
-
// Check max_completion_tokens first (newer API parameter), then max_tokens
|
|
179
|
-
for _, key := range []string{"max_completion_tokens", "max_tokens"} {
|
|
180
|
-
if v, ok := cr.rlm.extraParams[key]; ok {
|
|
181
|
-
switch n := v.(type) {
|
|
182
|
-
case float64:
|
|
183
|
-
return int(n)
|
|
184
|
-
case int:
|
|
185
|
-
return n
|
|
186
|
-
case int64:
|
|
187
|
-
return int(n)
|
|
188
|
-
}
|
|
189
|
-
}
|
|
190
|
-
}
|
|
191
|
-
return 0
|
|
320
|
+
// getResponseTokenBudget delegates to the RLM engine's method.
|
|
321
|
+
func (cr *contextReducer) getResponseTokenBudget() int {
|
|
322
|
+
return cr.rlm.getResponseTokenBudget()
|
|
192
323
|
}
|
|
193
324
|
|
|
194
325
|
// makeMapPhaseParams creates ExtraParams suitable for map-phase LLM calls (summarization).
|
|
@@ -222,7 +353,7 @@ func (cr *contextReducer) ReduceForCompletion(query string, context string, mode
|
|
|
222
353
|
// Calculate safe token budget per chunk
|
|
223
354
|
// Reserve tokens for: system prompt (~500), query, overhead, safety margin, response budget
|
|
224
355
|
queryTokens := EstimateTokens(query)
|
|
225
|
-
responseTokens := cr.getResponseTokenBudget(modelLimit)
|
|
356
|
+
responseTokens := cr.getResponseTokenBudget()
|
|
226
357
|
overhead := 500 + queryTokens + int(float64(modelLimit)*cr.config.SafetyMargin) + responseTokens
|
|
227
358
|
safeTokensPerChunk := modelLimit - overhead
|
|
228
359
|
|
|
@@ -300,8 +431,13 @@ func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, model
|
|
|
300
431
|
}
|
|
301
432
|
|
|
302
433
|
cr.rlm.stats.LlmCalls++
|
|
303
|
-
|
|
304
|
-
|
|
434
|
+
if result.Usage != nil {
|
|
435
|
+
cr.rlm.stats.PromptTokens += result.Usage.PromptTokens
|
|
436
|
+
cr.rlm.stats.CompletionTokens += result.Usage.CompletionTokens
|
|
437
|
+
cr.rlm.stats.TotalTokens += result.Usage.TotalTokens
|
|
438
|
+
}
|
|
439
|
+
summaries[idx] = result.Content
|
|
440
|
+
cr.obs.Debug("overflow", "Chunk %d/%d summarized: %d -> %d chars", idx+1, len(chunks), len(chunkText), len(result.Content))
|
|
305
441
|
}(i, chunk)
|
|
306
442
|
}
|
|
307
443
|
|
|
@@ -405,8 +541,13 @@ func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []strin
|
|
|
405
541
|
}
|
|
406
542
|
|
|
407
543
|
cr.rlm.stats.LlmCalls++
|
|
408
|
-
if
|
|
409
|
-
|
|
544
|
+
if result.Usage != nil {
|
|
545
|
+
cr.rlm.stats.PromptTokens += result.Usage.PromptTokens
|
|
546
|
+
cr.rlm.stats.CompletionTokens += result.Usage.CompletionTokens
|
|
547
|
+
cr.rlm.stats.TotalTokens += result.Usage.TotalTokens
|
|
548
|
+
}
|
|
549
|
+
if strings.TrimSpace(result.Content) != "NO_RELEVANT_CONTENT" {
|
|
550
|
+
results[idx] = result.Content
|
|
410
551
|
}
|
|
411
552
|
}(i, chunk)
|
|
412
553
|
}
|
|
@@ -475,7 +616,7 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
|
|
|
475
616
|
{Role: "user", Content: initialPrompt},
|
|
476
617
|
}
|
|
477
618
|
|
|
478
|
-
|
|
619
|
+
initialResult, err := CallChatCompletion(ChatRequest{
|
|
479
620
|
Model: cr.rlm.model,
|
|
480
621
|
Messages: messages,
|
|
481
622
|
APIBase: cr.rlm.apiBase,
|
|
@@ -487,6 +628,12 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
|
|
|
487
628
|
return "", fmt.Errorf("refine initial chunk: %w", err)
|
|
488
629
|
}
|
|
489
630
|
cr.rlm.stats.LlmCalls++
|
|
631
|
+
if initialResult.Usage != nil {
|
|
632
|
+
cr.rlm.stats.PromptTokens += initialResult.Usage.PromptTokens
|
|
633
|
+
cr.rlm.stats.CompletionTokens += initialResult.Usage.CompletionTokens
|
|
634
|
+
cr.rlm.stats.TotalTokens += initialResult.Usage.TotalTokens
|
|
635
|
+
}
|
|
636
|
+
currentAnswer := initialResult.Content
|
|
490
637
|
cr.obs.Debug("overflow", "Refine: initial answer from chunk 1/%d (%d chars)", len(chunks), len(currentAnswer))
|
|
491
638
|
|
|
492
639
|
// Phase 2: Refine the answer with each subsequent chunk
|
|
@@ -507,7 +654,7 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
|
|
|
507
654
|
{Role: "user", Content: refinePrompt},
|
|
508
655
|
}
|
|
509
656
|
|
|
510
|
-
|
|
657
|
+
refineResult, err := CallChatCompletion(ChatRequest{
|
|
511
658
|
Model: cr.rlm.model,
|
|
512
659
|
Messages: messages,
|
|
513
660
|
APIBase: cr.rlm.apiBase,
|
|
@@ -521,7 +668,12 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
|
|
|
521
668
|
continue
|
|
522
669
|
}
|
|
523
670
|
cr.rlm.stats.LlmCalls++
|
|
524
|
-
|
|
671
|
+
if refineResult.Usage != nil {
|
|
672
|
+
cr.rlm.stats.PromptTokens += refineResult.Usage.PromptTokens
|
|
673
|
+
cr.rlm.stats.CompletionTokens += refineResult.Usage.CompletionTokens
|
|
674
|
+
cr.rlm.stats.TotalTokens += refineResult.Usage.TotalTokens
|
|
675
|
+
}
|
|
676
|
+
currentAnswer = refineResult.Content
|
|
525
677
|
cr.obs.Debug("overflow", "Refine: incorporated chunk %d/%d (%d chars)", i+1, len(chunks), len(currentAnswer))
|
|
526
678
|
}
|
|
527
679
|
|