recursive-llm-ts 4.7.0 → 4.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1120,7 +1120,7 @@ The Go implementation can be used as a standalone library in Go projects.
1120
1120
  ### Installation
1121
1121
 
1122
1122
  ```bash
1123
- go get github.com/jbeck018/recursive-llm-ts/go
1123
+ go get github.com/howlerops/recursive-llm-ts/go
1124
1124
  ```
1125
1125
 
1126
1126
  ### Usage
@@ -1132,7 +1132,7 @@ import (
1132
1132
  "fmt"
1133
1133
  "os"
1134
1134
 
1135
- "github.com/jbeck018/recursive-llm-ts/go/rlm"
1135
+ "github.com/howlerops/recursive-llm-ts/go/rlm"
1136
1136
  )
1137
1137
 
1138
1138
  func main() {
package/bin/rlm-go CHANGED
Binary file
@@ -3,6 +3,9 @@ export interface RLMStats {
3
3
  iterations: number;
4
4
  depth: number;
5
5
  parsing_retries?: number;
6
+ total_tokens?: number;
7
+ prompt_tokens?: number;
8
+ completion_tokens?: number;
6
9
  }
7
10
  export interface RLMResult {
8
11
  result: string;
package/dist/rlm.js CHANGED
@@ -44,6 +44,7 @@ class RLMResultFormatter {
44
44
  }
45
45
  /** Format stats as a concise one-liner */
46
46
  prettyStats() {
47
+ var _a, _b;
47
48
  const parts = [
48
49
  `LLM Calls: ${this.stats.llm_calls}`,
49
50
  `Iterations: ${this.stats.iterations}`,
@@ -52,6 +53,9 @@ class RLMResultFormatter {
52
53
  if (this.stats.parsing_retries) {
53
54
  parts.push(`Retries: ${this.stats.parsing_retries}`);
54
55
  }
56
+ if (this.stats.total_tokens) {
57
+ parts.push(`Tokens: ${this.stats.total_tokens} (prompt: ${(_a = this.stats.prompt_tokens) !== null && _a !== void 0 ? _a : 0}, completion: ${(_b = this.stats.completion_tokens) !== null && _b !== void 0 ? _b : 0})`);
58
+ }
55
59
  if (this.cached) {
56
60
  parts.push('(cached)');
57
61
  }
@@ -69,6 +73,7 @@ class RLMResultFormatter {
69
73
  }
70
74
  /** Format as Markdown */
71
75
  toMarkdown() {
76
+ var _a, _b;
72
77
  const lines = [
73
78
  '## Result',
74
79
  '',
@@ -85,6 +90,11 @@ class RLMResultFormatter {
85
90
  if (this.stats.parsing_retries) {
86
91
  lines.push(`| Parsing Retries | ${this.stats.parsing_retries} |`);
87
92
  }
93
+ if (this.stats.total_tokens) {
94
+ lines.push(`| Total Tokens | ${this.stats.total_tokens} |`);
95
+ lines.push(`| Prompt Tokens | ${(_a = this.stats.prompt_tokens) !== null && _a !== void 0 ? _a : 0} |`);
96
+ lines.push(`| Completion Tokens | ${(_b = this.stats.completion_tokens) !== null && _b !== void 0 ? _b : 0} |`);
97
+ }
88
98
  lines.push(`| Cached | ${this.cached} |`);
89
99
  lines.push(`| Model | ${this.model} |`);
90
100
  return lines.join('\n');
package/go/README.md CHANGED
@@ -13,7 +13,7 @@ This is both a standalone Go library and CLI binary that implements the RLM algo
13
13
  ### As a Go Library
14
14
 
15
15
  ```bash
16
- go get github.com/jbeck018/recursive-llm-ts/go
16
+ go get github.com/howlerops/recursive-llm-ts/go
17
17
  ```
18
18
 
19
19
  ### Usage as Library
@@ -25,7 +25,7 @@ import (
25
25
  "fmt"
26
26
  "os"
27
27
 
28
- "github.com/jbeck018/recursive-llm-ts/go/rlm"
28
+ "github.com/howlerops/recursive-llm-ts/go/rlm"
29
29
  )
30
30
 
31
31
  func main() {
@@ -6,7 +6,7 @@ import (
6
6
  "io"
7
7
  "os"
8
8
 
9
- "github.com/jbeck018/recursive-llm-ts/go/rlm"
9
+ "github.com/howlerops/recursive-llm-ts/go/rlm"
10
10
  )
11
11
 
12
12
  type requestPayload struct {
package/go/go.mod CHANGED
@@ -1,4 +1,4 @@
1
- module github.com/jbeck018/recursive-llm-ts/go
1
+ module github.com/howlerops/recursive-llm-ts/go
2
2
 
3
3
  go 1.25.0
4
4
 
@@ -30,6 +30,155 @@ func DefaultContextOverflowConfig() ContextOverflowConfig {
30
30
  }
31
31
  }
32
32
 
33
+ // ─── Model Token Limits ──────────────────────────────────────────────────────
34
+
35
+ // modelTokenLimits maps known model name patterns to their maximum context window sizes.
36
+ // Used for pre-emptive overflow detection so we don't need to wait for API errors.
37
+ var modelTokenLimits = map[string]int{
38
+ // OpenAI
39
+ "gpt-4o": 128000,
40
+ "gpt-4o-mini": 128000,
41
+ "gpt-4-turbo": 128000,
42
+ "gpt-4": 8192,
43
+ "gpt-4-32k": 32768,
44
+ "gpt-3.5-turbo": 16385,
45
+ "gpt-3.5-turbo-16k": 16385,
46
+ "o1": 200000,
47
+ "o1-mini": 128000,
48
+ "o1-preview": 128000,
49
+ "o3-mini": 200000,
50
+ // Anthropic (via LiteLLM/proxy)
51
+ "claude-3-opus": 200000,
52
+ "claude-3-sonnet": 200000,
53
+ "claude-3-haiku": 200000,
54
+ "claude-3.5-sonnet": 200000,
55
+ "claude-3.5-haiku": 200000,
56
+ "claude-sonnet-4": 200000,
57
+ "claude-opus-4": 200000,
58
+ // Llama (common vLLM deployments)
59
+ "llama-3": 8192,
60
+ "llama-3.1": 128000,
61
+ "llama-3.2": 128000,
62
+ "llama-3.3": 128000,
63
+ // Mistral
64
+ "mistral-7b": 32768,
65
+ "mixtral-8x7b": 32768,
66
+ "mistral-large": 128000,
67
+ "mistral-small": 128000,
68
+ // Qwen
69
+ "qwen-2": 32768,
70
+ "qwen-2.5": 128000,
71
+ }
72
+
73
+ // LookupModelTokenLimit returns the known token limit for a model, or 0 if unknown.
74
+ // Matches by prefix so "gpt-4o-mini-2024-07-18" matches "gpt-4o-mini".
75
+ func LookupModelTokenLimit(model string) int {
76
+ lowerModel := strings.ToLower(model)
77
+
78
+ // Try exact match first
79
+ if limit, ok := modelTokenLimits[lowerModel]; ok {
80
+ return limit
81
+ }
82
+
83
+ // Try prefix matching (longest prefix wins)
84
+ bestMatch := ""
85
+ bestLimit := 0
86
+ for pattern, limit := range modelTokenLimits {
87
+ if strings.HasPrefix(lowerModel, pattern) && len(pattern) > len(bestMatch) {
88
+ bestMatch = pattern
89
+ bestLimit = limit
90
+ }
91
+ }
92
+
93
+ return bestLimit
94
+ }
95
+
96
+ // getModelTokenLimit returns the effective token limit for pre-emptive overflow checks.
97
+ // Priority: config override > model name lookup > 0 (disabled).
98
+ func (r *RLM) getModelTokenLimit() int {
99
+ if r.contextOverflow != nil && r.contextOverflow.MaxModelTokens > 0 {
100
+ return r.contextOverflow.MaxModelTokens
101
+ }
102
+ return LookupModelTokenLimit(r.model)
103
+ }
104
+
105
+ // ─── Pre-emptive Overflow Check ──────────────────────────────────────────────
106
+
107
+ // structuredPromptOverhead is the approximate token overhead for structured completion prompts
108
+ // (instructions, schema constraints, JSON formatting directives).
109
+ const structuredPromptOverhead = 350
110
+
111
+ // PreemptiveReduceContext checks if the context would overflow the model's token limit
112
+ // and reduces it proactively BEFORE building the prompt. Returns the (possibly reduced)
113
+ // context, or an error if reduction fails.
114
+ //
115
+ // This is called before the first LLM call, unlike post-hoc overflow recovery which
116
+ // only triggers after an API error. Following the RLM paper's principle that
117
+ // "the context window of the root LM is rarely clogged."
118
+ func (r *RLM) PreemptiveReduceContext(query string, context string, extraOverhead int) (string, bool, error) {
119
+ modelLimit := r.getModelTokenLimit()
120
+ if modelLimit == 0 {
121
+ // No known limit; skip pre-emptive check (will rely on post-hoc recovery)
122
+ return context, false, nil
123
+ }
124
+
125
+ if r.contextOverflow == nil || !r.contextOverflow.Enabled {
126
+ return context, false, nil
127
+ }
128
+
129
+ // Estimate total token budget needed
130
+ contextTokens := EstimateTokens(context)
131
+ queryTokens := EstimateTokens(query)
132
+ responseTokens := r.getResponseTokenBudget()
133
+ safetyMargin := r.contextOverflow.SafetyMargin
134
+ if safetyMargin == 0 {
135
+ safetyMargin = 0.15
136
+ }
137
+
138
+ totalEstimate := contextTokens + queryTokens + extraOverhead + responseTokens +
139
+ int(float64(modelLimit)*safetyMargin)
140
+
141
+ r.observer.Debug("overflow", "Pre-emptive check: context=%d query=%d overhead=%d response=%d safety=%d total=%d limit=%d",
142
+ contextTokens, queryTokens, extraOverhead, responseTokens,
143
+ int(float64(modelLimit)*safetyMargin), totalEstimate, modelLimit)
144
+
145
+ if totalEstimate <= modelLimit {
146
+ return context, false, nil
147
+ }
148
+
149
+ // Context would overflow — reduce it proactively
150
+ r.observer.Debug("overflow", "Pre-emptive reduction needed: estimated %d tokens > limit %d", totalEstimate, modelLimit)
151
+
152
+ reducer := newContextReducer(r, *r.contextOverflow, r.observer)
153
+ reduced, err := reducer.ReduceForCompletion(query, context, modelLimit)
154
+ if err != nil {
155
+ return context, false, fmt.Errorf("pre-emptive context reduction failed: %w", err)
156
+ }
157
+
158
+ r.observer.Debug("overflow", "Pre-emptive reduction: %d -> %d chars", len(context), len(reduced))
159
+ return reduced, true, nil
160
+ }
161
+
162
+ // getResponseTokenBudget extracts max_tokens or max_completion_tokens from ExtraParams.
163
+ func (r *RLM) getResponseTokenBudget() int {
164
+ if r.extraParams == nil {
165
+ return 0
166
+ }
167
+ for _, key := range []string{"max_completion_tokens", "max_tokens"} {
168
+ if v, ok := r.extraParams[key]; ok {
169
+ switch n := v.(type) {
170
+ case float64:
171
+ return int(n)
172
+ case int:
173
+ return n
174
+ case int64:
175
+ return int(n)
176
+ }
177
+ }
178
+ }
179
+ return 0
180
+ }
181
+
33
182
  // ─── Token Estimation ────────────────────────────────────────────────────────
34
183
 
35
184
  // EstimateTokens provides a fast approximation of token count for a string.
@@ -168,27 +317,9 @@ func newContextReducer(rlm *RLM, config ContextOverflowConfig, obs *Observer) *c
168
317
  return &contextReducer{rlm: rlm, config: config, obs: obs}
169
318
  }
170
319
 
171
- // getResponseTokenBudget extracts max_tokens or max_completion_tokens from ExtraParams.
172
- // This represents how many tokens the API will reserve for the response, which must be
173
- // subtracted from the model's total capacity when sizing input chunks.
174
- func (cr *contextReducer) getResponseTokenBudget(modelLimit int) int {
175
- if cr.rlm.extraParams == nil {
176
- return 0
177
- }
178
- // Check max_completion_tokens first (newer API parameter), then max_tokens
179
- for _, key := range []string{"max_completion_tokens", "max_tokens"} {
180
- if v, ok := cr.rlm.extraParams[key]; ok {
181
- switch n := v.(type) {
182
- case float64:
183
- return int(n)
184
- case int:
185
- return n
186
- case int64:
187
- return int(n)
188
- }
189
- }
190
- }
191
- return 0
320
+ // getResponseTokenBudget delegates to the RLM engine's method.
321
+ func (cr *contextReducer) getResponseTokenBudget() int {
322
+ return cr.rlm.getResponseTokenBudget()
192
323
  }
193
324
 
194
325
  // makeMapPhaseParams creates ExtraParams suitable for map-phase LLM calls (summarization).
@@ -222,7 +353,7 @@ func (cr *contextReducer) ReduceForCompletion(query string, context string, mode
222
353
  // Calculate safe token budget per chunk
223
354
  // Reserve tokens for: system prompt (~500), query, overhead, safety margin, response budget
224
355
  queryTokens := EstimateTokens(query)
225
- responseTokens := cr.getResponseTokenBudget(modelLimit)
356
+ responseTokens := cr.getResponseTokenBudget()
226
357
  overhead := 500 + queryTokens + int(float64(modelLimit)*cr.config.SafetyMargin) + responseTokens
227
358
  safeTokensPerChunk := modelLimit - overhead
228
359
 
@@ -300,8 +431,13 @@ func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, model
300
431
  }
301
432
 
302
433
  cr.rlm.stats.LlmCalls++
303
- summaries[idx] = result
304
- cr.obs.Debug("overflow", "Chunk %d/%d summarized: %d -> %d chars", idx+1, len(chunks), len(chunkText), len(result))
434
+ if result.Usage != nil {
435
+ cr.rlm.stats.PromptTokens += result.Usage.PromptTokens
436
+ cr.rlm.stats.CompletionTokens += result.Usage.CompletionTokens
437
+ cr.rlm.stats.TotalTokens += result.Usage.TotalTokens
438
+ }
439
+ summaries[idx] = result.Content
440
+ cr.obs.Debug("overflow", "Chunk %d/%d summarized: %d -> %d chars", idx+1, len(chunks), len(chunkText), len(result.Content))
305
441
  }(i, chunk)
306
442
  }
307
443
 
@@ -405,8 +541,13 @@ func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []strin
405
541
  }
406
542
 
407
543
  cr.rlm.stats.LlmCalls++
408
- if strings.TrimSpace(result) != "NO_RELEVANT_CONTENT" {
409
- results[idx] = result
544
+ if result.Usage != nil {
545
+ cr.rlm.stats.PromptTokens += result.Usage.PromptTokens
546
+ cr.rlm.stats.CompletionTokens += result.Usage.CompletionTokens
547
+ cr.rlm.stats.TotalTokens += result.Usage.TotalTokens
548
+ }
549
+ if strings.TrimSpace(result.Content) != "NO_RELEVANT_CONTENT" {
550
+ results[idx] = result.Content
410
551
  }
411
552
  }(i, chunk)
412
553
  }
@@ -475,7 +616,7 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
475
616
  {Role: "user", Content: initialPrompt},
476
617
  }
477
618
 
478
- currentAnswer, err := CallChatCompletion(ChatRequest{
619
+ initialResult, err := CallChatCompletion(ChatRequest{
479
620
  Model: cr.rlm.model,
480
621
  Messages: messages,
481
622
  APIBase: cr.rlm.apiBase,
@@ -487,6 +628,12 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
487
628
  return "", fmt.Errorf("refine initial chunk: %w", err)
488
629
  }
489
630
  cr.rlm.stats.LlmCalls++
631
+ if initialResult.Usage != nil {
632
+ cr.rlm.stats.PromptTokens += initialResult.Usage.PromptTokens
633
+ cr.rlm.stats.CompletionTokens += initialResult.Usage.CompletionTokens
634
+ cr.rlm.stats.TotalTokens += initialResult.Usage.TotalTokens
635
+ }
636
+ currentAnswer := initialResult.Content
490
637
  cr.obs.Debug("overflow", "Refine: initial answer from chunk 1/%d (%d chars)", len(chunks), len(currentAnswer))
491
638
 
492
639
  // Phase 2: Refine the answer with each subsequent chunk
@@ -507,7 +654,7 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
507
654
  {Role: "user", Content: refinePrompt},
508
655
  }
509
656
 
510
- refined, err := CallChatCompletion(ChatRequest{
657
+ refineResult, err := CallChatCompletion(ChatRequest{
511
658
  Model: cr.rlm.model,
512
659
  Messages: messages,
513
660
  APIBase: cr.rlm.apiBase,
@@ -521,7 +668,12 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
521
668
  continue
522
669
  }
523
670
  cr.rlm.stats.LlmCalls++
524
- currentAnswer = refined
671
+ if refineResult.Usage != nil {
672
+ cr.rlm.stats.PromptTokens += refineResult.Usage.PromptTokens
673
+ cr.rlm.stats.CompletionTokens += refineResult.Usage.CompletionTokens
674
+ cr.rlm.stats.TotalTokens += refineResult.Usage.TotalTokens
675
+ }
676
+ currentAnswer = refineResult.Content
525
677
  cr.obs.Debug("overflow", "Refine: incorporated chunk %d/%d (%d chars)", i+1, len(chunks), len(currentAnswer))
526
678
  }
527
679