recursive-llm-ts 4.8.0 → 4.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1120,7 +1120,7 @@ The Go implementation can be used as a standalone library in Go projects.
1120
1120
  ### Installation
1121
1121
 
1122
1122
  ```bash
1123
- go get github.com/jbeck018/recursive-llm-ts/go
1123
+ go get github.com/howlerops/recursive-llm-ts/go
1124
1124
  ```
1125
1125
 
1126
1126
  ### Usage
@@ -1132,7 +1132,7 @@ import (
1132
1132
  "fmt"
1133
1133
  "os"
1134
1134
 
1135
- "github.com/jbeck018/recursive-llm-ts/go/rlm"
1135
+ "github.com/howlerops/recursive-llm-ts/go/rlm"
1136
1136
  )
1137
1137
 
1138
1138
  func main() {
package/bin/rlm-go CHANGED
Binary file
@@ -3,6 +3,9 @@ export interface RLMStats {
3
3
  iterations: number;
4
4
  depth: number;
5
5
  parsing_retries?: number;
6
+ total_tokens?: number;
7
+ prompt_tokens?: number;
8
+ completion_tokens?: number;
6
9
  }
7
10
  export interface RLMResult {
8
11
  result: string;
package/dist/rlm.js CHANGED
@@ -44,6 +44,7 @@ class RLMResultFormatter {
44
44
  }
45
45
  /** Format stats as a concise one-liner */
46
46
  prettyStats() {
47
+ var _a, _b;
47
48
  const parts = [
48
49
  `LLM Calls: ${this.stats.llm_calls}`,
49
50
  `Iterations: ${this.stats.iterations}`,
@@ -52,6 +53,9 @@ class RLMResultFormatter {
52
53
  if (this.stats.parsing_retries) {
53
54
  parts.push(`Retries: ${this.stats.parsing_retries}`);
54
55
  }
56
+ if (this.stats.total_tokens) {
57
+ parts.push(`Tokens: ${this.stats.total_tokens} (prompt: ${(_a = this.stats.prompt_tokens) !== null && _a !== void 0 ? _a : 0}, completion: ${(_b = this.stats.completion_tokens) !== null && _b !== void 0 ? _b : 0})`);
58
+ }
55
59
  if (this.cached) {
56
60
  parts.push('(cached)');
57
61
  }
@@ -69,6 +73,7 @@ class RLMResultFormatter {
69
73
  }
70
74
  /** Format as Markdown */
71
75
  toMarkdown() {
76
+ var _a, _b;
72
77
  const lines = [
73
78
  '## Result',
74
79
  '',
@@ -85,6 +90,11 @@ class RLMResultFormatter {
85
90
  if (this.stats.parsing_retries) {
86
91
  lines.push(`| Parsing Retries | ${this.stats.parsing_retries} |`);
87
92
  }
93
+ if (this.stats.total_tokens) {
94
+ lines.push(`| Total Tokens | ${this.stats.total_tokens} |`);
95
+ lines.push(`| Prompt Tokens | ${(_a = this.stats.prompt_tokens) !== null && _a !== void 0 ? _a : 0} |`);
96
+ lines.push(`| Completion Tokens | ${(_b = this.stats.completion_tokens) !== null && _b !== void 0 ? _b : 0} |`);
97
+ }
88
98
  lines.push(`| Cached | ${this.cached} |`);
89
99
  lines.push(`| Model | ${this.model} |`);
90
100
  return lines.join('\n');
package/go/README.md CHANGED
@@ -13,7 +13,7 @@ This is both a standalone Go library and CLI binary that implements the RLM algo
13
13
  ### As a Go Library
14
14
 
15
15
  ```bash
16
- go get github.com/jbeck018/recursive-llm-ts/go
16
+ go get github.com/howlerops/recursive-llm-ts/go
17
17
  ```
18
18
 
19
19
  ### Usage as Library
@@ -25,7 +25,7 @@ import (
25
25
  "fmt"
26
26
  "os"
27
27
 
28
- "github.com/jbeck018/recursive-llm-ts/go/rlm"
28
+ "github.com/howlerops/recursive-llm-ts/go/rlm"
29
29
  )
30
30
 
31
31
  func main() {
@@ -6,7 +6,7 @@ import (
6
6
  "io"
7
7
  "os"
8
8
 
9
- "github.com/jbeck018/recursive-llm-ts/go/rlm"
9
+ "github.com/howlerops/recursive-llm-ts/go/rlm"
10
10
  )
11
11
 
12
12
  type requestPayload struct {
package/go/go.mod CHANGED
@@ -1,4 +1,4 @@
1
- module github.com/jbeck018/recursive-llm-ts/go
1
+ module github.com/howlerops/recursive-llm-ts/go
2
2
 
3
3
  go 1.25.0
4
4
 
@@ -431,8 +431,13 @@ func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, model
431
431
  }
432
432
 
433
433
  cr.rlm.stats.LlmCalls++
434
- summaries[idx] = result
435
- cr.obs.Debug("overflow", "Chunk %d/%d summarized: %d -> %d chars", idx+1, len(chunks), len(chunkText), len(result))
434
+ if result.Usage != nil {
435
+ cr.rlm.stats.PromptTokens += result.Usage.PromptTokens
436
+ cr.rlm.stats.CompletionTokens += result.Usage.CompletionTokens
437
+ cr.rlm.stats.TotalTokens += result.Usage.TotalTokens
438
+ }
439
+ summaries[idx] = result.Content
440
+ cr.obs.Debug("overflow", "Chunk %d/%d summarized: %d -> %d chars", idx+1, len(chunks), len(chunkText), len(result.Content))
436
441
  }(i, chunk)
437
442
  }
438
443
 
@@ -536,8 +541,13 @@ func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []strin
536
541
  }
537
542
 
538
543
  cr.rlm.stats.LlmCalls++
539
- if strings.TrimSpace(result) != "NO_RELEVANT_CONTENT" {
540
- results[idx] = result
544
+ if result.Usage != nil {
545
+ cr.rlm.stats.PromptTokens += result.Usage.PromptTokens
546
+ cr.rlm.stats.CompletionTokens += result.Usage.CompletionTokens
547
+ cr.rlm.stats.TotalTokens += result.Usage.TotalTokens
548
+ }
549
+ if strings.TrimSpace(result.Content) != "NO_RELEVANT_CONTENT" {
550
+ results[idx] = result.Content
541
551
  }
542
552
  }(i, chunk)
543
553
  }
@@ -606,7 +616,7 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
606
616
  {Role: "user", Content: initialPrompt},
607
617
  }
608
618
 
609
- currentAnswer, err := CallChatCompletion(ChatRequest{
619
+ initialResult, err := CallChatCompletion(ChatRequest{
610
620
  Model: cr.rlm.model,
611
621
  Messages: messages,
612
622
  APIBase: cr.rlm.apiBase,
@@ -618,6 +628,12 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
618
628
  return "", fmt.Errorf("refine initial chunk: %w", err)
619
629
  }
620
630
  cr.rlm.stats.LlmCalls++
631
+ if initialResult.Usage != nil {
632
+ cr.rlm.stats.PromptTokens += initialResult.Usage.PromptTokens
633
+ cr.rlm.stats.CompletionTokens += initialResult.Usage.CompletionTokens
634
+ cr.rlm.stats.TotalTokens += initialResult.Usage.TotalTokens
635
+ }
636
+ currentAnswer := initialResult.Content
621
637
  cr.obs.Debug("overflow", "Refine: initial answer from chunk 1/%d (%d chars)", len(chunks), len(currentAnswer))
622
638
 
623
639
  // Phase 2: Refine the answer with each subsequent chunk
@@ -638,7 +654,7 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
638
654
  {Role: "user", Content: refinePrompt},
639
655
  }
640
656
 
641
- refined, err := CallChatCompletion(ChatRequest{
657
+ refineResult, err := CallChatCompletion(ChatRequest{
642
658
  Model: cr.rlm.model,
643
659
  Messages: messages,
644
660
  APIBase: cr.rlm.apiBase,
@@ -652,7 +668,12 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
652
668
  continue
653
669
  }
654
670
  cr.rlm.stats.LlmCalls++
655
- currentAnswer = refined
671
+ if refineResult.Usage != nil {
672
+ cr.rlm.stats.PromptTokens += refineResult.Usage.PromptTokens
673
+ cr.rlm.stats.CompletionTokens += refineResult.Usage.CompletionTokens
674
+ cr.rlm.stats.TotalTokens += refineResult.Usage.TotalTokens
675
+ }
676
+ currentAnswer = refineResult.Content
656
677
  cr.obs.Debug("overflow", "Refine: incorporated chunk %d/%d (%d chars)", i+1, len(chunks), len(currentAnswer))
657
678
  }
658
679
 
package/go/rlm/doc.go CHANGED
@@ -8,13 +8,13 @@
8
8
  //
9
9
  // To use this package in your Go project:
10
10
  //
11
- // go get github.com/jbeck018/recursive-llm-ts/go
11
+ // go get github.com/howlerops/recursive-llm-ts/go
12
12
  //
13
13
  // # Basic Usage
14
14
  //
15
15
  // Create an RLM engine and execute a completion:
16
16
  //
17
- // import "github.com/jbeck018/recursive-llm-ts/go/rlm"
17
+ // import "github.com/howlerops/recursive-llm-ts/go/rlm"
18
18
  //
19
19
  // config := rlm.Config{
20
20
  // MaxDepth: 5,
@@ -74,7 +74,15 @@ func (ma *MetaAgent) OptimizeQuery(query string, context string) (string, error)
74
74
  return query, nil
75
75
  }
76
76
 
77
- optimized := strings.TrimSpace(result)
77
+ // Track meta-agent token usage in the parent RLM's stats
78
+ if result.Usage != nil {
79
+ ma.rlm.stats.PromptTokens += result.Usage.PromptTokens
80
+ ma.rlm.stats.CompletionTokens += result.Usage.CompletionTokens
81
+ ma.rlm.stats.TotalTokens += result.Usage.TotalTokens
82
+ }
83
+ ma.rlm.stats.LlmCalls++
84
+
85
+ optimized := strings.TrimSpace(result.Content)
78
86
  ma.obs.Debug("meta_agent", "Optimized query: %s", truncateStr(optimized, 200))
79
87
  ma.obs.Event("meta_agent.query_optimized", map[string]string{
80
88
  "original_length": fmt.Sprintf("%d", len(query)),
@@ -136,7 +144,15 @@ func (ma *MetaAgent) OptimizeForStructured(query string, context string, schema
136
144
  return query, nil
137
145
  }
138
146
 
139
- optimized := strings.TrimSpace(result)
147
+ // Track meta-agent token usage in the parent RLM's stats
148
+ if result.Usage != nil {
149
+ ma.rlm.stats.PromptTokens += result.Usage.PromptTokens
150
+ ma.rlm.stats.CompletionTokens += result.Usage.CompletionTokens
151
+ ma.rlm.stats.TotalTokens += result.Usage.TotalTokens
152
+ }
153
+ ma.rlm.stats.LlmCalls++
154
+
155
+ optimized := strings.TrimSpace(result.Content)
140
156
  ma.obs.Debug("meta_agent", "Optimized structured query: %s", truncateStr(optimized, 200))
141
157
  ma.obs.Event("meta_agent.structured_query_optimized", map[string]string{
142
158
  "original_length": fmt.Sprintf("%d", len(query)),
@@ -445,6 +445,12 @@ func FormatStatsWithObservability(stats RLMStats, obs *Observer) map[string]inte
445
445
  result["parsing_retries"] = stats.ParsingRetries
446
446
  }
447
447
 
448
+ if stats.TotalTokens > 0 {
449
+ result["total_tokens"] = stats.TotalTokens
450
+ result["prompt_tokens"] = stats.PromptTokens
451
+ result["completion_tokens"] = stats.CompletionTokens
452
+ }
453
+
448
454
  if obs != nil && obs.config.Debug {
449
455
  events := obs.GetEvents()
450
456
  if len(events) > 0 {
package/go/rlm/openai.go CHANGED
@@ -34,6 +34,20 @@ type chatResponse struct {
34
34
  Error *struct {
35
35
  Message string `json:"message"`
36
36
  } `json:"error"`
37
+ Usage *TokenUsage `json:"usage,omitempty"`
38
+ }
39
+
40
+ // TokenUsage represents token consumption from an LLM API response.
41
+ type TokenUsage struct {
42
+ PromptTokens int `json:"prompt_tokens"`
43
+ CompletionTokens int `json:"completion_tokens"`
44
+ TotalTokens int `json:"total_tokens"`
45
+ }
46
+
47
+ // ChatCompletionResult holds the content and token usage from an LLM call.
48
+ type ChatCompletionResult struct {
49
+ Content string
50
+ Usage *TokenUsage
37
51
  }
38
52
 
39
53
  var (
@@ -48,7 +62,7 @@ var (
48
62
  }
49
63
  )
50
64
 
51
- func CallChatCompletion(request ChatRequest) (string, error) {
65
+ func CallChatCompletion(request ChatRequest) (ChatCompletionResult, error) {
52
66
  endpoint := buildEndpoint(request.APIBase)
53
67
  payload := map[string]interface{}{
54
68
  "model": request.Model,
@@ -61,7 +75,7 @@ func CallChatCompletion(request ChatRequest) (string, error) {
61
75
 
62
76
  body, err := json.Marshal(payload)
63
77
  if err != nil {
64
- return "", err
78
+ return ChatCompletionResult{}, err
65
79
  }
66
80
 
67
81
  // Use shared client with connection pooling
@@ -76,7 +90,7 @@ func CallChatCompletion(request ChatRequest) (string, error) {
76
90
 
77
91
  req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(body))
78
92
  if err != nil {
79
- return "", err
93
+ return ChatCompletionResult{}, err
80
94
  }
81
95
  req.Header.Set("Content-Type", "application/json")
82
96
  if request.APIKey != "" {
@@ -85,7 +99,7 @@ func CallChatCompletion(request ChatRequest) (string, error) {
85
99
 
86
100
  resp, err := client.Do(req)
87
101
  if err != nil {
88
- return "", err
102
+ return ChatCompletionResult{}, err
89
103
  }
90
104
  defer func() {
91
105
  _ = resp.Body.Close()
@@ -93,27 +107,30 @@ func CallChatCompletion(request ChatRequest) (string, error) {
93
107
 
94
108
  responseBody, err := io.ReadAll(resp.Body)
95
109
  if err != nil {
96
- return "", err
110
+ return ChatCompletionResult{}, err
97
111
  }
98
112
 
99
113
  if resp.StatusCode >= http.StatusBadRequest {
100
- return "", NewAPIError(resp.StatusCode, strings.TrimSpace(string(responseBody)))
114
+ return ChatCompletionResult{}, NewAPIError(resp.StatusCode, strings.TrimSpace(string(responseBody)))
101
115
  }
102
116
 
103
117
  var parsed chatResponse
104
118
  if err := json.Unmarshal(responseBody, &parsed); err != nil {
105
- return "", err
119
+ return ChatCompletionResult{}, err
106
120
  }
107
121
 
108
122
  if parsed.Error != nil && parsed.Error.Message != "" {
109
- return "", errors.New(parsed.Error.Message)
123
+ return ChatCompletionResult{}, errors.New(parsed.Error.Message)
110
124
  }
111
125
 
112
126
  if len(parsed.Choices) == 0 {
113
- return "", errors.New("no choices returned by LLM")
127
+ return ChatCompletionResult{}, errors.New("no choices returned by LLM")
114
128
  }
115
129
 
116
- return parsed.Choices[0].Message.Content, nil
130
+ return ChatCompletionResult{
131
+ Content: parsed.Choices[0].Message.Content,
132
+ Usage: parsed.Usage,
133
+ }, nil
117
134
  }
118
135
 
119
136
  func buildEndpoint(apiBase string) string {
package/go/rlm/rlm.go CHANGED
@@ -198,14 +198,22 @@ func (r *RLM) callLLM(messages []Message) (string, error) {
198
198
  result, err := CallChatCompletion(request)
199
199
  duration := time.Since(start)
200
200
 
201
- r.observer.LLMCall(defaultModel, len(messages), 0, duration, err)
201
+ tokensUsed := 0
202
+ if result.Usage != nil {
203
+ r.stats.PromptTokens += result.Usage.PromptTokens
204
+ r.stats.CompletionTokens += result.Usage.CompletionTokens
205
+ r.stats.TotalTokens += result.Usage.TotalTokens
206
+ tokensUsed = result.Usage.TotalTokens
207
+ }
208
+
209
+ r.observer.LLMCall(defaultModel, len(messages), tokensUsed, duration, err)
202
210
 
203
211
  if err != nil {
204
212
  return "", err
205
213
  }
206
214
 
207
- r.observer.Debug("llm", "Response received (%d chars) in %s", len(result), duration)
208
- return result, nil
215
+ r.observer.Debug("llm", "Response received (%d chars, %d tokens) in %s", len(result.Content), tokensUsed, duration)
216
+ return result.Content, nil
209
217
  }
210
218
 
211
219
  func (r *RLM) buildREPLEnv(query string, context string) map[string]interface{} {
@@ -196,9 +196,15 @@ func (r *RLM) structuredCompletionDirect(query string, context string, config *S
196
196
  }
197
197
 
198
198
  stats.ParsingRetries = attempt
199
+ stats.TotalTokens = r.stats.TotalTokens
200
+ stats.PromptTokens = r.stats.PromptTokens
201
+ stats.CompletionTokens = r.stats.CompletionTokens
199
202
  return parsed, stats, nil
200
203
  }
201
204
 
205
+ stats.TotalTokens = r.stats.TotalTokens
206
+ stats.PromptTokens = r.stats.PromptTokens
207
+ stats.CompletionTokens = r.stats.CompletionTokens
202
208
  return nil, stats, fmt.Errorf("failed to get valid structured output after %d attempts: %v", config.MaxRetries, lastErr)
203
209
  }
204
210
 
@@ -265,6 +271,9 @@ func (r *RLM) structuredCompletionParallel(query string, context string, config
265
271
  totalStats.Depth = stats.Depth
266
272
  }
267
273
  totalStats.ParsingRetries += stats.ParsingRetries
274
+ totalStats.TotalTokens += stats.TotalTokens
275
+ totalStats.PromptTokens += stats.PromptTokens
276
+ totalStats.CompletionTokens += stats.CompletionTokens
268
277
  statsMutex.Unlock()
269
278
  }(i, task)
270
279
  }
@@ -0,0 +1,845 @@
1
+ package rlm
2
+
3
+ import (
4
+ "encoding/json"
5
+ "fmt"
6
+ "math"
7
+ "net/http"
8
+ "net/http/httptest"
9
+ "strings"
10
+ "testing"
11
+ )
12
+
13
+ // ─── Token Tracking Unit Tests ──────────────────────────────────────────────
14
+
15
+ func TestTokenUsage_ParsedFromAPIResponse(t *testing.T) {
16
+ // Verify that CallChatCompletion correctly parses the usage field from API responses
17
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
18
+ resp := map[string]interface{}{
19
+ "choices": []map[string]interface{}{
20
+ {"message": map[string]string{"content": "Hello world"}},
21
+ },
22
+ "usage": map[string]interface{}{
23
+ "prompt_tokens": 150,
24
+ "completion_tokens": 25,
25
+ "total_tokens": 175,
26
+ },
27
+ }
28
+ json.NewEncoder(w).Encode(resp)
29
+ }))
30
+ defer server.Close()
31
+
32
+ result, err := CallChatCompletion(ChatRequest{
33
+ Model: "test-model",
34
+ Messages: []Message{{Role: "user", Content: "test"}},
35
+ APIBase: server.URL,
36
+ })
37
+ if err != nil {
38
+ t.Fatalf("unexpected error: %v", err)
39
+ }
40
+
41
+ if result.Content != "Hello world" {
42
+ t.Errorf("expected content 'Hello world', got %q", result.Content)
43
+ }
44
+ if result.Usage == nil {
45
+ t.Fatal("expected usage to be non-nil")
46
+ }
47
+ if result.Usage.PromptTokens != 150 {
48
+ t.Errorf("expected 150 prompt tokens, got %d", result.Usage.PromptTokens)
49
+ }
50
+ if result.Usage.CompletionTokens != 25 {
51
+ t.Errorf("expected 25 completion tokens, got %d", result.Usage.CompletionTokens)
52
+ }
53
+ if result.Usage.TotalTokens != 175 {
54
+ t.Errorf("expected 175 total tokens, got %d", result.Usage.TotalTokens)
55
+ }
56
+ }
57
+
58
+ func TestTokenUsage_NilWhenAPIDoesNotReturnUsage(t *testing.T) {
59
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
60
+ resp := map[string]interface{}{
61
+ "choices": []map[string]interface{}{
62
+ {"message": map[string]string{"content": "Hello"}},
63
+ },
64
+ }
65
+ json.NewEncoder(w).Encode(resp)
66
+ }))
67
+ defer server.Close()
68
+
69
+ result, err := CallChatCompletion(ChatRequest{
70
+ Model: "test-model",
71
+ Messages: []Message{{Role: "user", Content: "test"}},
72
+ APIBase: server.URL,
73
+ })
74
+ if err != nil {
75
+ t.Fatalf("unexpected error: %v", err)
76
+ }
77
+
78
+ if result.Usage != nil {
79
+ t.Errorf("expected usage to be nil when API doesn't return it, got %+v", result.Usage)
80
+ }
81
+ }
82
+
83
+ func TestRLMStats_TokenAccumulation(t *testing.T) {
84
+ // Test that token usage accumulates across multiple LLM calls
85
+ callCount := 0
86
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
87
+ callCount++
88
+ resp := map[string]interface{}{
89
+ "choices": []map[string]interface{}{
90
+ {"message": map[string]string{"content": fmt.Sprintf(`FINAL("answer from call %d")`, callCount)}},
91
+ },
92
+ "usage": map[string]interface{}{
93
+ "prompt_tokens": 100 * callCount,
94
+ "completion_tokens": 20 * callCount,
95
+ "total_tokens": 120 * callCount,
96
+ },
97
+ }
98
+ json.NewEncoder(w).Encode(resp)
99
+ }))
100
+ defer server.Close()
101
+
102
+ engine := New("test-model", Config{
103
+ APIBase: server.URL,
104
+ MaxDepth: 5,
105
+ MaxIterations: 10,
106
+ })
107
+
108
+ _, stats, err := engine.Completion("test query", "test context")
109
+ if err != nil {
110
+ t.Fatalf("unexpected error: %v", err)
111
+ }
112
+
113
+ // First call should have returned FINAL, so 1 LLM call
114
+ if stats.LlmCalls != 1 {
115
+ t.Errorf("expected 1 LLM call, got %d", stats.LlmCalls)
116
+ }
117
+ if stats.TotalTokens != 120 {
118
+ t.Errorf("expected 120 total tokens, got %d", stats.TotalTokens)
119
+ }
120
+ if stats.PromptTokens != 100 {
121
+ t.Errorf("expected 100 prompt tokens, got %d", stats.PromptTokens)
122
+ }
123
+ if stats.CompletionTokens != 20 {
124
+ t.Errorf("expected 20 completion tokens, got %d", stats.CompletionTokens)
125
+ }
126
+ }
127
+
128
+ func TestRLMStats_TokenAccumulation_MultipleIterations(t *testing.T) {
129
+ // Simulates an RLM completion that takes 3 iterations before producing FINAL
130
+ callCount := 0
131
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
132
+ callCount++
133
+ content := "context.indexOf('test')"
134
+ if callCount >= 3 {
135
+ content = `FINAL("done after 3 calls")`
136
+ }
137
+ resp := map[string]interface{}{
138
+ "choices": []map[string]interface{}{
139
+ {"message": map[string]string{"content": content}},
140
+ },
141
+ "usage": map[string]interface{}{
142
+ "prompt_tokens": 200,
143
+ "completion_tokens": 50,
144
+ "total_tokens": 250,
145
+ },
146
+ }
147
+ json.NewEncoder(w).Encode(resp)
148
+ }))
149
+ defer server.Close()
150
+
151
+ engine := New("test-model", Config{
152
+ APIBase: server.URL,
153
+ MaxDepth: 5,
154
+ MaxIterations: 10,
155
+ })
156
+
157
+ _, stats, err := engine.Completion("test query", "test context for searching")
158
+ if err != nil {
159
+ t.Fatalf("unexpected error: %v", err)
160
+ }
161
+
162
+ if stats.LlmCalls != 3 {
163
+ t.Errorf("expected 3 LLM calls, got %d", stats.LlmCalls)
164
+ }
165
+ // 3 calls * 250 tokens each = 750 total
166
+ if stats.TotalTokens != 750 {
167
+ t.Errorf("expected 750 total tokens (3 calls * 250), got %d", stats.TotalTokens)
168
+ }
169
+ if stats.PromptTokens != 600 {
170
+ t.Errorf("expected 600 prompt tokens (3 calls * 200), got %d", stats.PromptTokens)
171
+ }
172
+ if stats.CompletionTokens != 150 {
173
+ t.Errorf("expected 150 completion tokens (3 calls * 50), got %d", stats.CompletionTokens)
174
+ }
175
+ }
176
+
177
+ func TestRLMStats_TokensInJSONOutput(t *testing.T) {
178
+ // Verify token fields are serialized in the JSON output
179
+ stats := RLMStats{
180
+ LlmCalls: 3,
181
+ Iterations: 2,
182
+ Depth: 0,
183
+ TotalTokens: 750,
184
+ PromptTokens: 600,
185
+ CompletionTokens: 150,
186
+ }
187
+
188
+ data, err := json.Marshal(stats)
189
+ if err != nil {
190
+ t.Fatalf("failed to marshal stats: %v", err)
191
+ }
192
+
193
+ var parsed map[string]interface{}
194
+ if err := json.Unmarshal(data, &parsed); err != nil {
195
+ t.Fatalf("failed to unmarshal stats: %v", err)
196
+ }
197
+
198
+ if v, ok := parsed["total_tokens"].(float64); !ok || int(v) != 750 {
199
+ t.Errorf("expected total_tokens=750 in JSON, got %v", parsed["total_tokens"])
200
+ }
201
+ if v, ok := parsed["prompt_tokens"].(float64); !ok || int(v) != 600 {
202
+ t.Errorf("expected prompt_tokens=600 in JSON, got %v", parsed["prompt_tokens"])
203
+ }
204
+ if v, ok := parsed["completion_tokens"].(float64); !ok || int(v) != 150 {
205
+ t.Errorf("expected completion_tokens=150 in JSON, got %v", parsed["completion_tokens"])
206
+ }
207
+ }
208
+
209
+ func TestRLMStats_ZeroTokensOmittedFromJSON(t *testing.T) {
210
+ // When no tokens are tracked, fields should be omitted (omitempty)
211
+ stats := RLMStats{
212
+ LlmCalls: 1,
213
+ Iterations: 1,
214
+ Depth: 0,
215
+ }
216
+
217
+ data, err := json.Marshal(stats)
218
+ if err != nil {
219
+ t.Fatalf("failed to marshal stats: %v", err)
220
+ }
221
+
222
+ jsonStr := string(data)
223
+ if strings.Contains(jsonStr, "total_tokens") {
224
+ t.Errorf("expected total_tokens to be omitted when zero, got: %s", jsonStr)
225
+ }
226
+ if strings.Contains(jsonStr, "prompt_tokens") {
227
+ t.Errorf("expected prompt_tokens to be omitted when zero, got: %s", jsonStr)
228
+ }
229
+ if strings.Contains(jsonStr, "completion_tokens") {
230
+ t.Errorf("expected completion_tokens to be omitted when zero, got: %s", jsonStr)
231
+ }
232
+ }
233
+
234
+ func TestFormatStatsWithObservability_IncludesTokens(t *testing.T) {
235
+ stats := RLMStats{
236
+ LlmCalls: 2,
237
+ Iterations: 1,
238
+ Depth: 0,
239
+ TotalTokens: 500,
240
+ PromptTokens: 400,
241
+ CompletionTokens: 100,
242
+ }
243
+
244
+ obs := NewNoopObserver()
245
+ formatted := FormatStatsWithObservability(stats, obs)
246
+
247
+ if v, ok := formatted["total_tokens"].(int); !ok || v != 500 {
248
+ t.Errorf("expected total_tokens=500, got %v", formatted["total_tokens"])
249
+ }
250
+ if v, ok := formatted["prompt_tokens"].(int); !ok || v != 400 {
251
+ t.Errorf("expected prompt_tokens=400, got %v", formatted["prompt_tokens"])
252
+ }
253
+ if v, ok := formatted["completion_tokens"].(int); !ok || v != 100 {
254
+ t.Errorf("expected completion_tokens=100, got %v", formatted["completion_tokens"])
255
+ }
256
+ }
257
+
258
+ func TestFormatStatsWithObservability_OmitsZeroTokens(t *testing.T) {
259
+ stats := RLMStats{
260
+ LlmCalls: 1,
261
+ Iterations: 1,
262
+ Depth: 0,
263
+ }
264
+
265
+ obs := NewNoopObserver()
266
+ formatted := FormatStatsWithObservability(stats, obs)
267
+
268
+ if _, exists := formatted["total_tokens"]; exists {
269
+ t.Errorf("expected total_tokens to be absent when zero, got %v", formatted["total_tokens"])
270
+ }
271
+ }
272
+
273
+ // ─── Token Efficiency Tests ─────────────────────────────────────────────────
274
+ //
275
+ // These tests prove that RLM context reduction strategies process fewer tokens
276
+ // than passing an entire large document through as raw context.
277
+
278
+ // generateLargeContext creates a realistic document of approximately targetTokens tokens.
279
+ // It generates structured content with numbered paragraphs to make it easy to verify
280
+ // that reduction strategies preserve key information.
281
+ func generateLargeContext(targetTokens int) string {
282
+ // ~3.5 chars per token is our estimation ratio
283
+ targetChars := int(float64(targetTokens) * 3.5)
284
+
285
+ var sb strings.Builder
286
+ sb.WriteString("# Technical Report: System Performance Analysis\n\n")
287
+ sb.WriteString("## Executive Summary\n\n")
288
+ sb.WriteString("This comprehensive report analyzes the performance characteristics of the distributed system ")
289
+ sb.WriteString("deployed across three data centers. Key findings include a 15% improvement in latency, ")
290
+ sb.WriteString("23% reduction in error rates, and significant cost savings through resource optimization.\n\n")
291
+
292
+ paragraphNum := 1
293
+ for sb.Len() < targetChars {
294
+ // Generate diverse paragraph types to simulate realistic documents
295
+ switch paragraphNum % 5 {
296
+ case 0:
297
+ fmt.Fprintf(&sb, "### Section %d: Database Performance Metrics\n\n", paragraphNum)
298
+ fmt.Fprintf(&sb, "In quarter Q%d, the primary database cluster processed an average of %d,000 queries per second "+
299
+ "with a p99 latency of %d.%d milliseconds. The read-to-write ratio was approximately %d:%d. "+
300
+ "Connection pool utilization peaked at %d%% during high-traffic periods, with %d active connections "+
301
+ "out of a configured maximum of %d. Index hit ratios remained above %d%% for all primary tables, "+
302
+ "though the secondary indexes on the analytics tables showed degradation to %d%% during batch "+
303
+ "processing windows. This resulted in an overall throughput improvement of %d.%d%% compared to "+
304
+ "the previous quarter's baseline measurements.\n\n",
305
+ paragraphNum%4+1, paragraphNum*12+50, paragraphNum%10+1, paragraphNum%99,
306
+ paragraphNum%7+3, 1, paragraphNum%30+70, paragraphNum*3+100, paragraphNum*5+200,
307
+ paragraphNum%5+95, paragraphNum%20+75, paragraphNum%15+5, paragraphNum%99)
308
+ case 1:
309
+ fmt.Fprintf(&sb, "### Section %d: API Gateway Statistics\n\n", paragraphNum)
310
+ fmt.Fprintf(&sb, "The API gateway handled %d.%dM requests during the reporting period. Rate limiting "+
311
+ "was triggered %d times for %d unique clients. The top 5 endpoints by traffic volume were: "+
312
+ "/api/v2/users (%d.%d%%), /api/v2/products (%d.%d%%), /api/v2/orders (%d.%d%%), "+
313
+ "/api/v2/analytics (%d.%d%%), and /api/v2/search (%d.%d%%). Authentication failures "+
314
+ "decreased from %d to %d per day after implementing the new token refresh mechanism. "+
315
+ "The overall API availability was %d.%d%% with %d minutes of total downtime.\n\n",
316
+ paragraphNum*5+10, paragraphNum%99, paragraphNum*7+20, paragraphNum*3+5,
317
+ paragraphNum%20+20, paragraphNum%99, paragraphNum%15+15, paragraphNum%99,
318
+ paragraphNum%10+10, paragraphNum%99, paragraphNum%8+5, paragraphNum%99,
319
+ paragraphNum%5+3, paragraphNum%99, paragraphNum*2+50, paragraphNum+10,
320
+ 99, paragraphNum%10+90, paragraphNum%30+5)
321
+ case 2:
322
+ fmt.Fprintf(&sb, "### Section %d: Memory and CPU Utilization\n\n", paragraphNum)
323
+ fmt.Fprintf(&sb, "Across all %d nodes in the cluster, average memory utilization was %d.%d%%. "+
324
+ "Node %d consistently showed the highest memory consumption at %d.%d%%, primarily due to "+
325
+ "in-memory caching of frequently accessed data structures. CPU utilization averaged %d.%d%% "+
326
+ "with peaks reaching %d.%d%% during the daily ETL batch processing window between "+
327
+ "%d:00 and %d:00 UTC. Garbage collection pauses were reduced from an average of %dms to %dms "+
328
+ "after tuning the JVM parameters. Thread pool saturation events decreased from %d per hour "+
329
+ "to %d per hour following the implementation of adaptive thread pool sizing.\n\n",
330
+ paragraphNum*2+20, paragraphNum%40+50, paragraphNum%99, paragraphNum%20+1,
331
+ paragraphNum%15+80, paragraphNum%99, paragraphNum%30+40, paragraphNum%99,
332
+ paragraphNum%20+75, paragraphNum%99, paragraphNum%6+2, paragraphNum%6+4,
333
+ paragraphNum%50+100, paragraphNum%30+20, paragraphNum%10+5, paragraphNum%5+1)
334
+ case 3:
335
+ fmt.Fprintf(&sb, "### Section %d: Error Analysis and Incident Report\n\n", paragraphNum)
336
+ fmt.Fprintf(&sb, "During the period, %d unique error types were observed across the system. "+
337
+ "The most frequent error (ERR-%04d) was a transient connection timeout to the Redis cluster, "+
338
+ "occurring %d times with a mean time to recovery of %d.%d seconds. Error category breakdown: "+
339
+ "network errors (%d%%), application errors (%d%%), database errors (%d%%), "+
340
+ "authentication errors (%d%%), and other (%d%%). The total error budget consumed was %d.%d%% "+
341
+ "of the allocated %d.%d%% for the quarter. Two P2 incidents were recorded on days %d and %d, "+
342
+ "with root causes traced to upstream provider instability and a misconfigured load balancer "+
343
+ "health check interval respectively.\n\n",
344
+ paragraphNum*3+15, paragraphNum+1000, paragraphNum*50+200, paragraphNum%10+1, paragraphNum%99,
345
+ paragraphNum%30+30, paragraphNum%25+20, paragraphNum%20+15, paragraphNum%10+5,
346
+ paragraphNum%10+5, paragraphNum%3, paragraphNum%99, paragraphNum%5, paragraphNum%99,
347
+ paragraphNum%28+1, paragraphNum%28+15)
348
+ case 4:
349
+ fmt.Fprintf(&sb, "### Section %d: Cost Optimization Results\n\n", paragraphNum)
350
+ fmt.Fprintf(&sb, "Infrastructure costs for the period totaled $%d,%03d.%02d, representing a "+
351
+ "%d.%d%% decrease from the previous quarter. Key savings were achieved through: "+
352
+ "reserved instance utilization (saving $%d,%03d), right-sizing %d underutilized instances "+
353
+ "(saving $%d,%03d), implementing spot instances for batch workloads (saving $%d,%03d), "+
354
+ "and optimizing data transfer routes (saving $%d,%03d). The cost per million API requests "+
355
+ "decreased from $%d.%02d to $%d.%02d. Projected annual savings based on current trends: "+
356
+ "$%d,%03d. Storage costs increased by %d.%d%% due to expanded logging retention requirements.\n\n",
357
+ paragraphNum*100+500, paragraphNum%1000, paragraphNum%100, paragraphNum%15+5, paragraphNum%99,
358
+ paragraphNum*20+100, paragraphNum%1000, paragraphNum*3+10, paragraphNum*10+50, paragraphNum%1000,
359
+ paragraphNum*8+30, paragraphNum%1000, paragraphNum*5+20, paragraphNum%1000,
360
+ paragraphNum%50+10, paragraphNum%100, paragraphNum%40+5, paragraphNum%100,
361
+ paragraphNum*300+1000, paragraphNum%1000, paragraphNum%10+2, paragraphNum%99)
362
+ }
363
+ paragraphNum++
364
+ }
365
+
366
+ return sb.String()
367
+ }
368
+
369
+ func TestTokenEfficiency_TFIDFUsesFewerTokens(t *testing.T) {
370
+ // Generate a large context (~35,000 tokens, well over 32k)
371
+ largeContext := generateLargeContext(35000)
372
+ originalTokens := EstimateTokens(largeContext)
373
+
374
+ if originalTokens < 32000 {
375
+ t.Fatalf("generated context is too small: %d tokens, need at least 32000", originalTokens)
376
+ }
377
+ t.Logf("Original context: %d chars, ~%d estimated tokens", len(largeContext), originalTokens)
378
+
379
+ // Apply TF-IDF compression to fit within a 32k token budget
380
+ modelLimit := 32768
381
+ overhead := 1000 // System prompt + query overhead
382
+ availableTokens := modelLimit - overhead
383
+
384
+ compressed := CompressContextTFIDF(largeContext, availableTokens)
385
+ compressedTokens := EstimateTokens(compressed)
386
+
387
+ t.Logf("TF-IDF compressed: %d chars, ~%d estimated tokens", len(compressed), compressedTokens)
388
+ t.Logf("Token reduction: %d -> %d (%.1f%% reduction)",
389
+ originalTokens, compressedTokens,
390
+ (1.0-float64(compressedTokens)/float64(originalTokens))*100)
391
+
392
+ // Core assertion: TF-IDF MUST produce fewer tokens than the original
393
+ if compressedTokens >= originalTokens {
394
+ t.Errorf("TF-IDF failed to reduce tokens: original=%d, compressed=%d", originalTokens, compressedTokens)
395
+ }
396
+
397
+ // And it must fit within our budget
398
+ if compressedTokens > availableTokens {
399
+ t.Errorf("TF-IDF output exceeds budget: %d tokens > %d available", compressedTokens, availableTokens)
400
+ }
401
+
402
+ // Verify meaningful compression (at least 5% reduction for a context that's over budget)
403
+ reductionPct := (1.0 - float64(compressedTokens)/float64(originalTokens)) * 100
404
+ if reductionPct < 5.0 {
405
+ t.Errorf("TF-IDF compression too weak: only %.1f%% reduction", reductionPct)
406
+ }
407
+ }
408
+
409
+ func TestTokenEfficiency_TextRankUsesFewerTokens(t *testing.T) {
410
+ largeContext := generateLargeContext(35000)
411
+ originalTokens := EstimateTokens(largeContext)
412
+
413
+ if originalTokens < 32000 {
414
+ t.Fatalf("generated context is too small: %d tokens, need at least 32000", originalTokens)
415
+ }
416
+ t.Logf("Original context: %d chars, ~%d estimated tokens", len(largeContext), originalTokens)
417
+
418
+ modelLimit := 32768
419
+ overhead := 1000
420
+ availableTokens := modelLimit - overhead
421
+
422
+ compressed := CompressContextTextRank(largeContext, availableTokens)
423
+ compressedTokens := EstimateTokens(compressed)
424
+
425
+ t.Logf("TextRank compressed: %d chars, ~%d estimated tokens", len(compressed), compressedTokens)
426
+ t.Logf("Token reduction: %d -> %d (%.1f%% reduction)",
427
+ originalTokens, compressedTokens,
428
+ (1.0-float64(compressedTokens)/float64(originalTokens))*100)
429
+
430
+ if compressedTokens >= originalTokens {
431
+ t.Errorf("TextRank failed to reduce tokens: original=%d, compressed=%d", originalTokens, compressedTokens)
432
+ }
433
+
434
+ if compressedTokens > availableTokens {
435
+ t.Errorf("TextRank output exceeds budget: %d tokens > %d available", compressedTokens, availableTokens)
436
+ }
437
+
438
+ reductionPct := (1.0 - float64(compressedTokens)/float64(originalTokens)) * 100
439
+ if reductionPct < 5.0 {
440
+ t.Errorf("TextRank compression too weak: only %.1f%% reduction", reductionPct)
441
+ }
442
+ }
443
+
444
+ func TestTokenEfficiency_TruncateUsesFewerTokens(t *testing.T) {
445
+ largeContext := generateLargeContext(35000)
446
+ originalTokens := EstimateTokens(largeContext)
447
+
448
+ if originalTokens < 32000 {
449
+ t.Fatalf("generated context is too small: %d tokens, need at least 32000", originalTokens)
450
+ }
451
+
452
+ modelLimit := 32768
453
+ overhead := 1000
454
+
455
+ // Create a reducer with truncation strategy
456
+ engine := New("test-model", Config{
457
+ MaxDepth: 5,
458
+ MaxIterations: 10,
459
+ ContextOverflow: &ContextOverflowConfig{
460
+ Enabled: true,
461
+ Strategy: "truncate",
462
+ SafetyMargin: 0.15,
463
+ },
464
+ })
465
+
466
+ reducer := newContextReducer(engine, *engine.contextOverflow, NewNoopObserver())
467
+ truncated, err := reducer.reduceByTruncation(largeContext, modelLimit, overhead)
468
+ if err != nil {
469
+ t.Fatalf("truncation failed: %v", err)
470
+ }
471
+
472
+ truncatedTokens := EstimateTokens(truncated)
473
+
474
+ t.Logf("Truncate: %d -> %d estimated tokens (%.1f%% reduction)",
475
+ originalTokens, truncatedTokens,
476
+ (1.0-float64(truncatedTokens)/float64(originalTokens))*100)
477
+
478
+ if truncatedTokens >= originalTokens {
479
+ t.Errorf("truncation failed to reduce tokens: original=%d, truncated=%d", originalTokens, truncatedTokens)
480
+ }
481
+ }
482
+
483
+ func TestTokenEfficiency_ChunkingProducesSmallChunks(t *testing.T) {
484
+ largeContext := generateLargeContext(35000)
485
+ originalTokens := EstimateTokens(largeContext)
486
+
487
+ if originalTokens < 32000 {
488
+ t.Fatalf("generated context is too small: %d tokens, need at least 32000", originalTokens)
489
+ }
490
+
491
+ // Chunk with an 8k token budget per chunk
492
+ chunkBudget := 8000
493
+ chunks := ChunkContext(largeContext, chunkBudget)
494
+
495
+ t.Logf("Chunked %d tokens into %d chunks (budget: %d tokens/chunk)", originalTokens, len(chunks), chunkBudget)
496
+
497
+ if len(chunks) < 2 {
498
+ t.Errorf("expected multiple chunks for %d token context, got %d", originalTokens, len(chunks))
499
+ }
500
+
501
+ // Each chunk must be smaller than the original
502
+ for i, chunk := range chunks {
503
+ chunkTokens := EstimateTokens(chunk)
504
+ if chunkTokens >= originalTokens {
505
+ t.Errorf("chunk %d is not smaller than original: %d tokens >= %d", i, chunkTokens, originalTokens)
506
+ }
507
+ t.Logf(" Chunk %d: %d estimated tokens", i, chunkTokens)
508
+ }
509
+ }
510
+
511
+ func TestTokenEfficiency_PreemptiveReduction(t *testing.T) {
512
+ // Test that PreemptiveReduceContext actually reduces a large context
513
+ largeContext := generateLargeContext(35000)
514
+ originalTokens := EstimateTokens(largeContext)
515
+
516
+ engine := New("gpt-4o-mini", Config{
517
+ MaxDepth: 5,
518
+ MaxIterations: 10,
519
+ ContextOverflow: &ContextOverflowConfig{
520
+ Enabled: true,
521
+ Strategy: "tfidf",
522
+ SafetyMargin: 0.15,
523
+ },
524
+ })
525
+
526
+ reduced, wasReduced, err := engine.PreemptiveReduceContext("Summarize the key findings", largeContext, 0)
527
+ if err != nil {
528
+ t.Fatalf("preemptive reduction failed: %v", err)
529
+ }
530
+
531
+ // gpt-4o-mini has 128k limit, so 35k should NOT trigger reduction
532
+ if wasReduced {
533
+ t.Logf("context was unexpectedly reduced for 35k input with 128k model limit")
534
+ } else {
535
+ t.Logf("correctly skipped reduction: 35k tokens fits within gpt-4o-mini's 128k limit")
536
+ }
537
+
538
+ // Force a smaller model limit to ensure reduction triggers
539
+ engine2 := New("gpt-4", Config{
540
+ MaxDepth: 5,
541
+ MaxIterations: 10,
542
+ ContextOverflow: &ContextOverflowConfig{
543
+ Enabled: true,
544
+ Strategy: "tfidf",
545
+ SafetyMargin: 0.15,
546
+ MaxModelTokens: 16000, // Force small limit
547
+ },
548
+ })
549
+
550
+ reduced2, wasReduced2, err := engine2.PreemptiveReduceContext("Summarize the key findings", largeContext, 0)
551
+ if err != nil {
552
+ t.Fatalf("preemptive reduction failed: %v", err)
553
+ }
554
+
555
+ if !wasReduced2 {
556
+ t.Error("expected context to be reduced when model limit is 16k and context is 35k tokens")
557
+ }
558
+
559
+ reducedTokens := EstimateTokens(reduced2)
560
+ t.Logf("Preemptive TF-IDF: %d -> %d estimated tokens (%.1f%% reduction)",
561
+ originalTokens, reducedTokens,
562
+ (1.0-float64(reducedTokens)/float64(originalTokens))*100)
563
+
564
+ if reducedTokens >= originalTokens {
565
+ t.Errorf("preemptive reduction failed: original=%d, reduced=%d", originalTokens, reducedTokens)
566
+ }
567
+
568
+ _ = reduced // blank assignment silences the unused-variable check for the first reduction result
569
+ }
570
+
571
+ func TestTokenEfficiency_AllStrategiesCompared(t *testing.T) {
572
+ // Generate a 40k token context (well over 32k limit)
573
+ largeContext := generateLargeContext(40000)
574
+ originalTokens := EstimateTokens(largeContext)
575
+
576
+ if originalTokens < 35000 {
577
+ t.Fatalf("generated context is too small: %d tokens, need at least 35000", originalTokens)
578
+ }
579
+
580
+ modelLimit := 32768
581
+ overhead := 1000
582
+
583
+ t.Logf("Original context: %d chars, ~%d estimated tokens", len(largeContext), originalTokens)
584
+ t.Logf("Model limit: %d tokens, overhead: %d, available: %d", modelLimit, overhead, modelLimit-overhead)
585
+
586
+ // Track results for each strategy
587
+ type strategyResult struct {
588
+ name string
589
+ reducedTokens int
590
+ reductionPct float64
591
+ requiresLLM bool
592
+ }
593
+ var results []strategyResult
594
+
595
+ availableTokens := modelLimit - overhead
596
+
597
+ // TF-IDF (pure algorithmic)
598
+ tfidfResult := CompressContextTFIDF(largeContext, availableTokens)
599
+ tfidfTokens := EstimateTokens(tfidfResult)
600
+ results = append(results, strategyResult{
601
+ name: "tfidf",
602
+ reducedTokens: tfidfTokens,
603
+ reductionPct: (1.0 - float64(tfidfTokens)/float64(originalTokens)) * 100,
604
+ requiresLLM: false,
605
+ })
606
+
607
+ // TextRank (pure algorithmic)
608
+ textRankResult := CompressContextTextRank(largeContext, availableTokens)
609
+ textRankTokens := EstimateTokens(textRankResult)
610
+ results = append(results, strategyResult{
611
+ name: "textrank",
612
+ reducedTokens: textRankTokens,
613
+ reductionPct: (1.0 - float64(textRankTokens)/float64(originalTokens)) * 100,
614
+ requiresLLM: false,
615
+ })
616
+
617
+ // Truncation
618
+ engine := New("test-model", Config{
619
+ MaxDepth: 5,
620
+ MaxIterations: 10,
621
+ ContextOverflow: &ContextOverflowConfig{
622
+ Enabled: true,
623
+ Strategy: "truncate",
624
+ SafetyMargin: 0.15,
625
+ },
626
+ })
627
+ reducer := newContextReducer(engine, *engine.contextOverflow, NewNoopObserver())
628
+ truncResult, _ := reducer.reduceByTruncation(largeContext, modelLimit, overhead)
629
+ truncTokens := EstimateTokens(truncResult)
630
+ results = append(results, strategyResult{
631
+ name: "truncate",
632
+ reducedTokens: truncTokens,
633
+ reductionPct: (1.0 - float64(truncTokens)/float64(originalTokens)) * 100,
634
+ requiresLLM: false,
635
+ })
636
+
637
+ // Print comparison table
638
+ t.Logf("\n--- Token Efficiency Comparison ---")
639
+ t.Logf("%-12s | %12s | %10s | %s", "Strategy", "Tokens Used", "Reduction", "Requires LLM")
640
+ t.Logf("%-12s | %12s | %10s | %s", "------------", "------------", "----------", "------------")
641
+ t.Logf("%-12s | %12d | %9s | %s", "raw (none)", originalTokens, "0.0%", "no")
642
+ for _, r := range results {
643
+ llmStr := "no"
644
+ if r.requiresLLM {
645
+ llmStr = "yes"
646
+ }
647
+ t.Logf("%-12s | %12d | %9.1f%% | %s", r.name, r.reducedTokens, r.reductionPct, llmStr)
648
+ }
649
+
650
+ // Assert ALL strategies use fewer tokens than raw
651
+ for _, r := range results {
652
+ if r.reducedTokens >= originalTokens {
653
+ t.Errorf("strategy %q failed: %d tokens >= original %d tokens", r.name, r.reducedTokens, originalTokens)
654
+ }
655
+ }
656
+
657
+ // Assert all strategies fit within the model limit
658
+ for _, r := range results {
659
+ if r.reducedTokens > availableTokens {
660
+ t.Errorf("strategy %q exceeds budget: %d tokens > %d available", r.name, r.reducedTokens, availableTokens)
661
+ }
662
+ }
663
+ }
664
+
665
+ func TestTokenEfficiency_VeryLargeContext_100kTokens(t *testing.T) {
666
+ // Test with a very large context (~100k tokens) to prove scaling
667
+ largeContext := generateLargeContext(100000)
668
+ originalTokens := EstimateTokens(largeContext)
669
+
670
+ if originalTokens < 90000 {
671
+ t.Fatalf("generated context is too small: %d tokens, need at least 90000", originalTokens)
672
+ }
673
+
674
+ modelLimit := 32768
675
+ overhead := 1000
676
+ availableTokens := modelLimit - overhead
677
+
678
+ t.Logf("Original: ~%d estimated tokens (3x over 32k limit)", originalTokens)
679
+
680
+ // TF-IDF
681
+ tfidfResult := CompressContextTFIDF(largeContext, availableTokens)
682
+ tfidfTokens := EstimateTokens(tfidfResult)
683
+
684
+ // TextRank
685
+ textRankResult := CompressContextTextRank(largeContext, availableTokens)
686
+ textRankTokens := EstimateTokens(textRankResult)
687
+
688
+ t.Logf("TF-IDF: %d tokens (%.1f%% reduction)", tfidfTokens, (1.0-float64(tfidfTokens)/float64(originalTokens))*100)
689
+ t.Logf("TextRank: %d tokens (%.1f%% reduction)", textRankTokens, (1.0-float64(textRankTokens)/float64(originalTokens))*100)
690
+
691
+ // Both must be significantly smaller
692
+ if tfidfTokens >= originalTokens/2 {
693
+ t.Errorf("TF-IDF should reduce 100k context by at least 50%%: got %d tokens", tfidfTokens)
694
+ }
695
+ if textRankTokens >= originalTokens/2 {
696
+ t.Errorf("TextRank should reduce 100k context by at least 50%%: got %d tokens", textRankTokens)
697
+ }
698
+
699
+ // Both must fit within budget
700
+ if tfidfTokens > availableTokens {
701
+ t.Errorf("TF-IDF exceeds budget: %d > %d", tfidfTokens, availableTokens)
702
+ }
703
+ if textRankTokens > availableTokens {
704
+ t.Errorf("TextRank exceeds budget: %d > %d", textRankTokens, availableTokens)
705
+ }
706
+ }
707
+
708
+ func TestTokenEfficiency_MapReduceTracksTokens(t *testing.T) {
709
+ // Test that mapreduce strategy properly accumulates token usage from multiple chunks
710
+ callCount := 0
711
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
712
+ callCount++
713
+ // Simulate summarization - return a short summary for each chunk
714
+ resp := map[string]interface{}{
715
+ "choices": []map[string]interface{}{
716
+ {"message": map[string]string{"content": fmt.Sprintf("Summary of chunk %d: key finding was performance improvement.", callCount)}},
717
+ },
718
+ "usage": map[string]interface{}{
719
+ "prompt_tokens": 500 + callCount*50,
720
+ "completion_tokens": 30,
721
+ "total_tokens": 530 + callCount*50,
722
+ },
723
+ }
724
+ json.NewEncoder(w).Encode(resp)
725
+ }))
726
+ defer server.Close()
727
+
728
+ engine := New("test-model", Config{
729
+ APIBase: server.URL,
730
+ MaxDepth: 5,
731
+ MaxIterations: 10,
732
+ ContextOverflow: &ContextOverflowConfig{
733
+ Enabled: true,
734
+ Strategy: "mapreduce",
735
+ SafetyMargin: 0.15,
736
+ },
737
+ })
738
+
739
+ // Create a large context that will be split into multiple chunks
740
+ largeContext := generateLargeContext(40000)
741
+ query := "Summarize the key findings"
742
+
743
+ reducer := newContextReducer(engine, *engine.contextOverflow, NewNoopObserver())
744
+ reduced, err := reducer.ReduceForCompletion(query, largeContext, 16000)
745
+ if err != nil {
746
+ t.Fatalf("mapreduce reduction failed: %v", err)
747
+ }
748
+
749
+ // Verify that token usage was accumulated
750
+ if engine.stats.TotalTokens == 0 {
751
+ t.Error("expected total_tokens > 0 after mapreduce reduction, got 0")
752
+ }
753
+ if engine.stats.PromptTokens == 0 {
754
+ t.Error("expected prompt_tokens > 0 after mapreduce reduction, got 0")
755
+ }
756
+ if engine.stats.CompletionTokens == 0 {
757
+ t.Error("expected completion_tokens > 0 after mapreduce reduction, got 0")
758
+ }
759
+
760
+ t.Logf("MapReduce token tracking: %d total tokens (%d prompt, %d completion) across %d LLM calls",
761
+ engine.stats.TotalTokens, engine.stats.PromptTokens, engine.stats.CompletionTokens, engine.stats.LlmCalls)
762
+ t.Logf("Reduced context: %d chars", len(reduced))
763
+
764
+ // The reduced context should be much smaller than the original
765
+ if len(reduced) >= len(largeContext) {
766
+ t.Errorf("mapreduce failed to reduce context: %d chars >= original %d chars", len(reduced), len(largeContext))
767
+ }
768
+ }
769
+
770
+ func TestTokenEfficiency_StructuredCompletion_TracksTokens(t *testing.T) {
771
+ // Verify structured completion accumulates tokens
772
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
773
+ resp := map[string]interface{}{
774
+ "choices": []map[string]interface{}{
775
+ {"message": map[string]string{"content": `{"summary": "test result", "score": 8}`}},
776
+ },
777
+ "usage": map[string]interface{}{
778
+ "prompt_tokens": 300,
779
+ "completion_tokens": 15,
780
+ "total_tokens": 315,
781
+ },
782
+ }
783
+ json.NewEncoder(w).Encode(resp)
784
+ }))
785
+ defer server.Close()
786
+
787
+ engine := New("test-model", Config{
788
+ APIBase: server.URL,
789
+ MaxDepth: 5,
790
+ MaxIterations: 10,
791
+ })
792
+
793
+ schema := &StructuredConfig{
794
+ Schema: &JSONSchema{
795
+ Type: "object",
796
+ Properties: map[string]*JSONSchema{
797
+ "summary": {Type: "string"},
798
+ "score": {Type: "number"},
799
+ },
800
+ Required: []string{"summary", "score"},
801
+ },
802
+ MaxRetries: 3,
803
+ }
804
+
805
+ result, stats, err := engine.StructuredCompletion("Analyze this", "Some test context", schema)
806
+ if err != nil {
807
+ t.Fatalf("structured completion failed: %v", err)
808
+ }
809
+
810
+ if result == nil {
811
+ t.Fatal("expected non-nil result")
812
+ }
813
+
814
+ if stats.TotalTokens == 0 {
815
+ t.Error("expected total_tokens > 0 after structured completion, got 0")
816
+ }
817
+ if stats.PromptTokens == 0 {
818
+ t.Error("expected prompt_tokens > 0 after structured completion")
819
+ }
820
+ if stats.CompletionTokens == 0 {
821
+ t.Error("expected completion_tokens > 0 after structured completion")
822
+ }
823
+
824
+ t.Logf("Structured completion: %d total tokens (%d prompt, %d completion)", stats.TotalTokens, stats.PromptTokens, stats.CompletionTokens)
825
+ }
826
+
827
+ // ─── Token Estimation Accuracy Tests ─────────────────────────────────────────
828
+
829
+ func TestEstimateTokens_AccuracyForLargeContent(t *testing.T) {
830
+ // Verify that our estimation stays reasonable for large content
831
+ content := generateLargeContext(32000)
832
+ estimated := EstimateTokens(content)
833
+
834
+ // Real tokenizer would give different results, but our estimation should be
835
+ // within a reasonable range. The key property: conservative (over-estimates slightly)
836
+ charToTokenRatio := float64(len(content)) / float64(estimated)
837
+
838
+ // Our estimator uses 3.5 chars/token, so ratio should be ~3.5
839
+ if math.Abs(charToTokenRatio-3.5) > 0.5 {
840
+ t.Errorf("char-to-token ratio %.2f deviates too far from expected ~3.5", charToTokenRatio)
841
+ }
842
+
843
+ t.Logf("Large content: %d chars, %d estimated tokens, ratio: %.2f chars/token",
844
+ len(content), estimated, charToTokenRatio)
845
+ }
package/go/rlm/types.go CHANGED
@@ -6,10 +6,13 @@ import (
6
6
  )
7
7
 
8
8
  type RLMStats struct {
9
- LlmCalls int `json:"llm_calls"`
10
- Iterations int `json:"iterations"`
11
- Depth int `json:"depth"`
12
- ParsingRetries int `json:"parsing_retries,omitempty"`
9
+ LlmCalls int `json:"llm_calls"`
10
+ Iterations int `json:"iterations"`
11
+ Depth int `json:"depth"`
12
+ ParsingRetries int `json:"parsing_retries,omitempty"`
13
+ TotalTokens int `json:"total_tokens,omitempty"`
14
+ PromptTokens int `json:"prompt_tokens,omitempty"`
15
+ CompletionTokens int `json:"completion_tokens,omitempty"`
13
16
  }
14
17
 
15
18
  type JSONSchema struct {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "recursive-llm-ts",
3
- "version": "4.8.0",
3
+ "version": "4.9.0",
4
4
  "description": "TypeScript bridge for recursive-llm: Recursive Language Models for unbounded context processing with structured outputs",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
@@ -48,12 +48,12 @@
48
48
  "license": "MIT",
49
49
  "repository": {
50
50
  "type": "git",
51
- "url": "git+https://github.com/jbeck018/recursive-llm-ts.git"
51
+ "url": "git+https://github.com/howlerops/recursive-llm-ts.git"
52
52
  },
53
53
  "bugs": {
54
- "url": "https://github.com/jbeck018/recursive-llm-ts/issues"
54
+ "url": "https://github.com/howlerops/recursive-llm-ts/issues"
55
55
  },
56
- "homepage": "https://github.com/jbeck018/recursive-llm-ts#readme",
56
+ "homepage": "https://github.com/howlerops/recursive-llm-ts#readme",
57
57
  "dependencies": {
58
58
  "zod": "^4.3.6"
59
59
  },