recursive-llm-ts 4.8.0 → 4.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/bin/rlm-go +0 -0
- package/dist/bridge-interface.d.ts +3 -0
- package/dist/rlm.js +10 -0
- package/go/README.md +2 -2
- package/go/cmd/rlm/main.go +1 -1
- package/go/go.mod +1 -1
- package/go/rlm/context_overflow.go +28 -7
- package/go/rlm/doc.go +2 -2
- package/go/rlm/meta_agent.go +18 -2
- package/go/rlm/observability.go +6 -0
- package/go/rlm/openai.go +27 -10
- package/go/rlm/rlm.go +11 -3
- package/go/rlm/structured.go +9 -0
- package/go/rlm/token_tracking_test.go +845 -0
- package/go/rlm/types.go +7 -4
- package/package.json +4 -4
package/README.md
CHANGED
|
@@ -1120,7 +1120,7 @@ The Go implementation can be used as a standalone library in Go projects.
|
|
|
1120
1120
|
### Installation
|
|
1121
1121
|
|
|
1122
1122
|
```bash
|
|
1123
|
-
go get github.com/
|
|
1123
|
+
go get github.com/howlerops/recursive-llm-ts/go
|
|
1124
1124
|
```
|
|
1125
1125
|
|
|
1126
1126
|
### Usage
|
|
@@ -1132,7 +1132,7 @@ import (
|
|
|
1132
1132
|
"fmt"
|
|
1133
1133
|
"os"
|
|
1134
1134
|
|
|
1135
|
-
"github.com/
|
|
1135
|
+
"github.com/howlerops/recursive-llm-ts/go/rlm"
|
|
1136
1136
|
)
|
|
1137
1137
|
|
|
1138
1138
|
func main() {
|
package/bin/rlm-go
CHANGED
|
Binary file
|
package/dist/rlm.js
CHANGED
|
@@ -44,6 +44,7 @@ class RLMResultFormatter {
|
|
|
44
44
|
}
|
|
45
45
|
/** Format stats as a concise one-liner */
|
|
46
46
|
prettyStats() {
|
|
47
|
+
var _a, _b;
|
|
47
48
|
const parts = [
|
|
48
49
|
`LLM Calls: ${this.stats.llm_calls}`,
|
|
49
50
|
`Iterations: ${this.stats.iterations}`,
|
|
@@ -52,6 +53,9 @@ class RLMResultFormatter {
|
|
|
52
53
|
if (this.stats.parsing_retries) {
|
|
53
54
|
parts.push(`Retries: ${this.stats.parsing_retries}`);
|
|
54
55
|
}
|
|
56
|
+
if (this.stats.total_tokens) {
|
|
57
|
+
parts.push(`Tokens: ${this.stats.total_tokens} (prompt: ${(_a = this.stats.prompt_tokens) !== null && _a !== void 0 ? _a : 0}, completion: ${(_b = this.stats.completion_tokens) !== null && _b !== void 0 ? _b : 0})`);
|
|
58
|
+
}
|
|
55
59
|
if (this.cached) {
|
|
56
60
|
parts.push('(cached)');
|
|
57
61
|
}
|
|
@@ -69,6 +73,7 @@ class RLMResultFormatter {
|
|
|
69
73
|
}
|
|
70
74
|
/** Format as Markdown */
|
|
71
75
|
toMarkdown() {
|
|
76
|
+
var _a, _b;
|
|
72
77
|
const lines = [
|
|
73
78
|
'## Result',
|
|
74
79
|
'',
|
|
@@ -85,6 +90,11 @@ class RLMResultFormatter {
|
|
|
85
90
|
if (this.stats.parsing_retries) {
|
|
86
91
|
lines.push(`| Parsing Retries | ${this.stats.parsing_retries} |`);
|
|
87
92
|
}
|
|
93
|
+
if (this.stats.total_tokens) {
|
|
94
|
+
lines.push(`| Total Tokens | ${this.stats.total_tokens} |`);
|
|
95
|
+
lines.push(`| Prompt Tokens | ${(_a = this.stats.prompt_tokens) !== null && _a !== void 0 ? _a : 0} |`);
|
|
96
|
+
lines.push(`| Completion Tokens | ${(_b = this.stats.completion_tokens) !== null && _b !== void 0 ? _b : 0} |`);
|
|
97
|
+
}
|
|
88
98
|
lines.push(`| Cached | ${this.cached} |`);
|
|
89
99
|
lines.push(`| Model | ${this.model} |`);
|
|
90
100
|
return lines.join('\n');
|
package/go/README.md
CHANGED
|
@@ -13,7 +13,7 @@ This is both a standalone Go library and CLI binary that implements the RLM algo
|
|
|
13
13
|
### As a Go Library
|
|
14
14
|
|
|
15
15
|
```bash
|
|
16
|
-
go get github.com/
|
|
16
|
+
go get github.com/howlerops/recursive-llm-ts/go
|
|
17
17
|
```
|
|
18
18
|
|
|
19
19
|
### Usage as Library
|
|
@@ -25,7 +25,7 @@ import (
|
|
|
25
25
|
"fmt"
|
|
26
26
|
"os"
|
|
27
27
|
|
|
28
|
-
"github.com/
|
|
28
|
+
"github.com/howlerops/recursive-llm-ts/go/rlm"
|
|
29
29
|
)
|
|
30
30
|
|
|
31
31
|
func main() {
|
package/go/cmd/rlm/main.go
CHANGED
package/go/go.mod
CHANGED
package/go/rlm/context_overflow.go
CHANGED
|
@@ -431,8 +431,13 @@ func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, model
|
|
|
431
431
|
}
|
|
432
432
|
|
|
433
433
|
cr.rlm.stats.LlmCalls++
|
|
434
|
-
|
|
435
|
-
|
|
434
|
+
if result.Usage != nil {
|
|
435
|
+
cr.rlm.stats.PromptTokens += result.Usage.PromptTokens
|
|
436
|
+
cr.rlm.stats.CompletionTokens += result.Usage.CompletionTokens
|
|
437
|
+
cr.rlm.stats.TotalTokens += result.Usage.TotalTokens
|
|
438
|
+
}
|
|
439
|
+
summaries[idx] = result.Content
|
|
440
|
+
cr.obs.Debug("overflow", "Chunk %d/%d summarized: %d -> %d chars", idx+1, len(chunks), len(chunkText), len(result.Content))
|
|
436
441
|
}(i, chunk)
|
|
437
442
|
}
|
|
438
443
|
|
|
@@ -536,8 +541,13 @@ func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []strin
|
|
|
536
541
|
}
|
|
537
542
|
|
|
538
543
|
cr.rlm.stats.LlmCalls++
|
|
539
|
-
if
|
|
540
|
-
|
|
544
|
+
if result.Usage != nil {
|
|
545
|
+
cr.rlm.stats.PromptTokens += result.Usage.PromptTokens
|
|
546
|
+
cr.rlm.stats.CompletionTokens += result.Usage.CompletionTokens
|
|
547
|
+
cr.rlm.stats.TotalTokens += result.Usage.TotalTokens
|
|
548
|
+
}
|
|
549
|
+
if strings.TrimSpace(result.Content) != "NO_RELEVANT_CONTENT" {
|
|
550
|
+
results[idx] = result.Content
|
|
541
551
|
}
|
|
542
552
|
}(i, chunk)
|
|
543
553
|
}
|
|
@@ -606,7 +616,7 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
|
|
|
606
616
|
{Role: "user", Content: initialPrompt},
|
|
607
617
|
}
|
|
608
618
|
|
|
609
|
-
|
|
619
|
+
initialResult, err := CallChatCompletion(ChatRequest{
|
|
610
620
|
Model: cr.rlm.model,
|
|
611
621
|
Messages: messages,
|
|
612
622
|
APIBase: cr.rlm.apiBase,
|
|
@@ -618,6 +628,12 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
|
|
|
618
628
|
return "", fmt.Errorf("refine initial chunk: %w", err)
|
|
619
629
|
}
|
|
620
630
|
cr.rlm.stats.LlmCalls++
|
|
631
|
+
if initialResult.Usage != nil {
|
|
632
|
+
cr.rlm.stats.PromptTokens += initialResult.Usage.PromptTokens
|
|
633
|
+
cr.rlm.stats.CompletionTokens += initialResult.Usage.CompletionTokens
|
|
634
|
+
cr.rlm.stats.TotalTokens += initialResult.Usage.TotalTokens
|
|
635
|
+
}
|
|
636
|
+
currentAnswer := initialResult.Content
|
|
621
637
|
cr.obs.Debug("overflow", "Refine: initial answer from chunk 1/%d (%d chars)", len(chunks), len(currentAnswer))
|
|
622
638
|
|
|
623
639
|
// Phase 2: Refine the answer with each subsequent chunk
|
|
@@ -638,7 +654,7 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
|
|
|
638
654
|
{Role: "user", Content: refinePrompt},
|
|
639
655
|
}
|
|
640
656
|
|
|
641
|
-
|
|
657
|
+
refineResult, err := CallChatCompletion(ChatRequest{
|
|
642
658
|
Model: cr.rlm.model,
|
|
643
659
|
Messages: messages,
|
|
644
660
|
APIBase: cr.rlm.apiBase,
|
|
@@ -652,7 +668,12 @@ func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLim
|
|
|
652
668
|
continue
|
|
653
669
|
}
|
|
654
670
|
cr.rlm.stats.LlmCalls++
|
|
655
|
-
|
|
671
|
+
if refineResult.Usage != nil {
|
|
672
|
+
cr.rlm.stats.PromptTokens += refineResult.Usage.PromptTokens
|
|
673
|
+
cr.rlm.stats.CompletionTokens += refineResult.Usage.CompletionTokens
|
|
674
|
+
cr.rlm.stats.TotalTokens += refineResult.Usage.TotalTokens
|
|
675
|
+
}
|
|
676
|
+
currentAnswer = refineResult.Content
|
|
656
677
|
cr.obs.Debug("overflow", "Refine: incorporated chunk %d/%d (%d chars)", i+1, len(chunks), len(currentAnswer))
|
|
657
678
|
}
|
|
658
679
|
|
package/go/rlm/doc.go
CHANGED
|
@@ -8,13 +8,13 @@
|
|
|
8
8
|
//
|
|
9
9
|
// To use this package in your Go project:
|
|
10
10
|
//
|
|
11
|
-
// go get github.com/
|
|
11
|
+
// go get github.com/howlerops/recursive-llm-ts/go
|
|
12
12
|
//
|
|
13
13
|
// # Basic Usage
|
|
14
14
|
//
|
|
15
15
|
// Create an RLM engine and execute a completion:
|
|
16
16
|
//
|
|
17
|
-
// import "github.com/
|
|
17
|
+
// import "github.com/howlerops/recursive-llm-ts/go/rlm"
|
|
18
18
|
//
|
|
19
19
|
// config := rlm.Config{
|
|
20
20
|
// MaxDepth: 5,
|
package/go/rlm/meta_agent.go
CHANGED
|
@@ -74,7 +74,15 @@ func (ma *MetaAgent) OptimizeQuery(query string, context string) (string, error)
|
|
|
74
74
|
return query, nil
|
|
75
75
|
}
|
|
76
76
|
|
|
77
|
-
|
|
77
|
+
// Track meta-agent token usage in the parent RLM's stats
|
|
78
|
+
if result.Usage != nil {
|
|
79
|
+
ma.rlm.stats.PromptTokens += result.Usage.PromptTokens
|
|
80
|
+
ma.rlm.stats.CompletionTokens += result.Usage.CompletionTokens
|
|
81
|
+
ma.rlm.stats.TotalTokens += result.Usage.TotalTokens
|
|
82
|
+
}
|
|
83
|
+
ma.rlm.stats.LlmCalls++
|
|
84
|
+
|
|
85
|
+
optimized := strings.TrimSpace(result.Content)
|
|
78
86
|
ma.obs.Debug("meta_agent", "Optimized query: %s", truncateStr(optimized, 200))
|
|
79
87
|
ma.obs.Event("meta_agent.query_optimized", map[string]string{
|
|
80
88
|
"original_length": fmt.Sprintf("%d", len(query)),
|
|
@@ -136,7 +144,15 @@ func (ma *MetaAgent) OptimizeForStructured(query string, context string, schema
|
|
|
136
144
|
return query, nil
|
|
137
145
|
}
|
|
138
146
|
|
|
139
|
-
|
|
147
|
+
// Track meta-agent token usage in the parent RLM's stats
|
|
148
|
+
if result.Usage != nil {
|
|
149
|
+
ma.rlm.stats.PromptTokens += result.Usage.PromptTokens
|
|
150
|
+
ma.rlm.stats.CompletionTokens += result.Usage.CompletionTokens
|
|
151
|
+
ma.rlm.stats.TotalTokens += result.Usage.TotalTokens
|
|
152
|
+
}
|
|
153
|
+
ma.rlm.stats.LlmCalls++
|
|
154
|
+
|
|
155
|
+
optimized := strings.TrimSpace(result.Content)
|
|
140
156
|
ma.obs.Debug("meta_agent", "Optimized structured query: %s", truncateStr(optimized, 200))
|
|
141
157
|
ma.obs.Event("meta_agent.structured_query_optimized", map[string]string{
|
|
142
158
|
"original_length": fmt.Sprintf("%d", len(query)),
|
package/go/rlm/observability.go
CHANGED
|
@@ -445,6 +445,12 @@ func FormatStatsWithObservability(stats RLMStats, obs *Observer) map[string]inte
|
|
|
445
445
|
result["parsing_retries"] = stats.ParsingRetries
|
|
446
446
|
}
|
|
447
447
|
|
|
448
|
+
if stats.TotalTokens > 0 {
|
|
449
|
+
result["total_tokens"] = stats.TotalTokens
|
|
450
|
+
result["prompt_tokens"] = stats.PromptTokens
|
|
451
|
+
result["completion_tokens"] = stats.CompletionTokens
|
|
452
|
+
}
|
|
453
|
+
|
|
448
454
|
if obs != nil && obs.config.Debug {
|
|
449
455
|
events := obs.GetEvents()
|
|
450
456
|
if len(events) > 0 {
|
package/go/rlm/openai.go
CHANGED
|
@@ -34,6 +34,20 @@ type chatResponse struct {
|
|
|
34
34
|
Error *struct {
|
|
35
35
|
Message string `json:"message"`
|
|
36
36
|
} `json:"error"`
|
|
37
|
+
Usage *TokenUsage `json:"usage,omitempty"`
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
// TokenUsage represents token consumption from an LLM API response.
|
|
41
|
+
type TokenUsage struct {
|
|
42
|
+
PromptTokens int `json:"prompt_tokens"`
|
|
43
|
+
CompletionTokens int `json:"completion_tokens"`
|
|
44
|
+
TotalTokens int `json:"total_tokens"`
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
// ChatCompletionResult holds the content and token usage from an LLM call.
|
|
48
|
+
type ChatCompletionResult struct {
|
|
49
|
+
Content string
|
|
50
|
+
Usage *TokenUsage
|
|
37
51
|
}
|
|
38
52
|
|
|
39
53
|
var (
|
|
@@ -48,7 +62,7 @@ var (
|
|
|
48
62
|
}
|
|
49
63
|
)
|
|
50
64
|
|
|
51
|
-
func CallChatCompletion(request ChatRequest) (string, error) {
|
|
65
|
+
func CallChatCompletion(request ChatRequest) (ChatCompletionResult, error) {
|
|
52
66
|
endpoint := buildEndpoint(request.APIBase)
|
|
53
67
|
payload := map[string]interface{}{
|
|
54
68
|
"model": request.Model,
|
|
@@ -61,7 +75,7 @@ func CallChatCompletion(request ChatRequest) (string, error) {
|
|
|
61
75
|
|
|
62
76
|
body, err := json.Marshal(payload)
|
|
63
77
|
if err != nil {
|
|
64
|
-
return
|
|
78
|
+
return ChatCompletionResult{}, err
|
|
65
79
|
}
|
|
66
80
|
|
|
67
81
|
// Use shared client with connection pooling
|
|
@@ -76,7 +90,7 @@ func CallChatCompletion(request ChatRequest) (string, error) {
|
|
|
76
90
|
|
|
77
91
|
req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(body))
|
|
78
92
|
if err != nil {
|
|
79
|
-
return
|
|
93
|
+
return ChatCompletionResult{}, err
|
|
80
94
|
}
|
|
81
95
|
req.Header.Set("Content-Type", "application/json")
|
|
82
96
|
if request.APIKey != "" {
|
|
@@ -85,7 +99,7 @@ func CallChatCompletion(request ChatRequest) (string, error) {
|
|
|
85
99
|
|
|
86
100
|
resp, err := client.Do(req)
|
|
87
101
|
if err != nil {
|
|
88
|
-
return
|
|
102
|
+
return ChatCompletionResult{}, err
|
|
89
103
|
}
|
|
90
104
|
defer func() {
|
|
91
105
|
_ = resp.Body.Close()
|
|
@@ -93,27 +107,30 @@ func CallChatCompletion(request ChatRequest) (string, error) {
|
|
|
93
107
|
|
|
94
108
|
responseBody, err := io.ReadAll(resp.Body)
|
|
95
109
|
if err != nil {
|
|
96
|
-
return
|
|
110
|
+
return ChatCompletionResult{}, err
|
|
97
111
|
}
|
|
98
112
|
|
|
99
113
|
if resp.StatusCode >= http.StatusBadRequest {
|
|
100
|
-
return
|
|
114
|
+
return ChatCompletionResult{}, NewAPIError(resp.StatusCode, strings.TrimSpace(string(responseBody)))
|
|
101
115
|
}
|
|
102
116
|
|
|
103
117
|
var parsed chatResponse
|
|
104
118
|
if err := json.Unmarshal(responseBody, &parsed); err != nil {
|
|
105
|
-
return
|
|
119
|
+
return ChatCompletionResult{}, err
|
|
106
120
|
}
|
|
107
121
|
|
|
108
122
|
if parsed.Error != nil && parsed.Error.Message != "" {
|
|
109
|
-
return
|
|
123
|
+
return ChatCompletionResult{}, errors.New(parsed.Error.Message)
|
|
110
124
|
}
|
|
111
125
|
|
|
112
126
|
if len(parsed.Choices) == 0 {
|
|
113
|
-
return
|
|
127
|
+
return ChatCompletionResult{}, errors.New("no choices returned by LLM")
|
|
114
128
|
}
|
|
115
129
|
|
|
116
|
-
return
|
|
130
|
+
return ChatCompletionResult{
|
|
131
|
+
Content: parsed.Choices[0].Message.Content,
|
|
132
|
+
Usage: parsed.Usage,
|
|
133
|
+
}, nil
|
|
117
134
|
}
|
|
118
135
|
|
|
119
136
|
func buildEndpoint(apiBase string) string {
|
package/go/rlm/rlm.go
CHANGED
|
@@ -198,14 +198,22 @@ func (r *RLM) callLLM(messages []Message) (string, error) {
|
|
|
198
198
|
result, err := CallChatCompletion(request)
|
|
199
199
|
duration := time.Since(start)
|
|
200
200
|
|
|
201
|
-
|
|
201
|
+
tokensUsed := 0
|
|
202
|
+
if result.Usage != nil {
|
|
203
|
+
r.stats.PromptTokens += result.Usage.PromptTokens
|
|
204
|
+
r.stats.CompletionTokens += result.Usage.CompletionTokens
|
|
205
|
+
r.stats.TotalTokens += result.Usage.TotalTokens
|
|
206
|
+
tokensUsed = result.Usage.TotalTokens
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
r.observer.LLMCall(defaultModel, len(messages), tokensUsed, duration, err)
|
|
202
210
|
|
|
203
211
|
if err != nil {
|
|
204
212
|
return "", err
|
|
205
213
|
}
|
|
206
214
|
|
|
207
|
-
r.observer.Debug("llm", "Response received (%d chars) in %s", len(result), duration)
|
|
208
|
-
return result, nil
|
|
215
|
+
r.observer.Debug("llm", "Response received (%d chars, %d tokens) in %s", len(result.Content), tokensUsed, duration)
|
|
216
|
+
return result.Content, nil
|
|
209
217
|
}
|
|
210
218
|
|
|
211
219
|
func (r *RLM) buildREPLEnv(query string, context string) map[string]interface{} {
|
package/go/rlm/structured.go
CHANGED
|
@@ -196,9 +196,15 @@ func (r *RLM) structuredCompletionDirect(query string, context string, config *S
|
|
|
196
196
|
}
|
|
197
197
|
|
|
198
198
|
stats.ParsingRetries = attempt
|
|
199
|
+
stats.TotalTokens = r.stats.TotalTokens
|
|
200
|
+
stats.PromptTokens = r.stats.PromptTokens
|
|
201
|
+
stats.CompletionTokens = r.stats.CompletionTokens
|
|
199
202
|
return parsed, stats, nil
|
|
200
203
|
}
|
|
201
204
|
|
|
205
|
+
stats.TotalTokens = r.stats.TotalTokens
|
|
206
|
+
stats.PromptTokens = r.stats.PromptTokens
|
|
207
|
+
stats.CompletionTokens = r.stats.CompletionTokens
|
|
202
208
|
return nil, stats, fmt.Errorf("failed to get valid structured output after %d attempts: %v", config.MaxRetries, lastErr)
|
|
203
209
|
}
|
|
204
210
|
|
|
@@ -265,6 +271,9 @@ func (r *RLM) structuredCompletionParallel(query string, context string, config
|
|
|
265
271
|
totalStats.Depth = stats.Depth
|
|
266
272
|
}
|
|
267
273
|
totalStats.ParsingRetries += stats.ParsingRetries
|
|
274
|
+
totalStats.TotalTokens += stats.TotalTokens
|
|
275
|
+
totalStats.PromptTokens += stats.PromptTokens
|
|
276
|
+
totalStats.CompletionTokens += stats.CompletionTokens
|
|
268
277
|
statsMutex.Unlock()
|
|
269
278
|
}(i, task)
|
|
270
279
|
}
|
package/go/rlm/token_tracking_test.go
ADDED
|
@@ -0,0 +1,845 @@
|
|
|
1
|
+
package rlm
|
|
2
|
+
|
|
3
|
+
import (
|
|
4
|
+
"encoding/json"
|
|
5
|
+
"fmt"
|
|
6
|
+
"math"
|
|
7
|
+
"net/http"
|
|
8
|
+
"net/http/httptest"
|
|
9
|
+
"strings"
|
|
10
|
+
"testing"
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
// ─── Token Tracking Unit Tests ──────────────────────────────────────────────
|
|
14
|
+
|
|
15
|
+
func TestTokenUsage_ParsedFromAPIResponse(t *testing.T) {
|
|
16
|
+
// Verify that CallChatCompletion correctly parses the usage field from API responses
|
|
17
|
+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
18
|
+
resp := map[string]interface{}{
|
|
19
|
+
"choices": []map[string]interface{}{
|
|
20
|
+
{"message": map[string]string{"content": "Hello world"}},
|
|
21
|
+
},
|
|
22
|
+
"usage": map[string]interface{}{
|
|
23
|
+
"prompt_tokens": 150,
|
|
24
|
+
"completion_tokens": 25,
|
|
25
|
+
"total_tokens": 175,
|
|
26
|
+
},
|
|
27
|
+
}
|
|
28
|
+
json.NewEncoder(w).Encode(resp)
|
|
29
|
+
}))
|
|
30
|
+
defer server.Close()
|
|
31
|
+
|
|
32
|
+
result, err := CallChatCompletion(ChatRequest{
|
|
33
|
+
Model: "test-model",
|
|
34
|
+
Messages: []Message{{Role: "user", Content: "test"}},
|
|
35
|
+
APIBase: server.URL,
|
|
36
|
+
})
|
|
37
|
+
if err != nil {
|
|
38
|
+
t.Fatalf("unexpected error: %v", err)
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
if result.Content != "Hello world" {
|
|
42
|
+
t.Errorf("expected content 'Hello world', got %q", result.Content)
|
|
43
|
+
}
|
|
44
|
+
if result.Usage == nil {
|
|
45
|
+
t.Fatal("expected usage to be non-nil")
|
|
46
|
+
}
|
|
47
|
+
if result.Usage.PromptTokens != 150 {
|
|
48
|
+
t.Errorf("expected 150 prompt tokens, got %d", result.Usage.PromptTokens)
|
|
49
|
+
}
|
|
50
|
+
if result.Usage.CompletionTokens != 25 {
|
|
51
|
+
t.Errorf("expected 25 completion tokens, got %d", result.Usage.CompletionTokens)
|
|
52
|
+
}
|
|
53
|
+
if result.Usage.TotalTokens != 175 {
|
|
54
|
+
t.Errorf("expected 175 total tokens, got %d", result.Usage.TotalTokens)
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
func TestTokenUsage_NilWhenAPIDoesNotReturnUsage(t *testing.T) {
|
|
59
|
+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
60
|
+
resp := map[string]interface{}{
|
|
61
|
+
"choices": []map[string]interface{}{
|
|
62
|
+
{"message": map[string]string{"content": "Hello"}},
|
|
63
|
+
},
|
|
64
|
+
}
|
|
65
|
+
json.NewEncoder(w).Encode(resp)
|
|
66
|
+
}))
|
|
67
|
+
defer server.Close()
|
|
68
|
+
|
|
69
|
+
result, err := CallChatCompletion(ChatRequest{
|
|
70
|
+
Model: "test-model",
|
|
71
|
+
Messages: []Message{{Role: "user", Content: "test"}},
|
|
72
|
+
APIBase: server.URL,
|
|
73
|
+
})
|
|
74
|
+
if err != nil {
|
|
75
|
+
t.Fatalf("unexpected error: %v", err)
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
if result.Usage != nil {
|
|
79
|
+
t.Errorf("expected usage to be nil when API doesn't return it, got %+v", result.Usage)
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
func TestRLMStats_TokenAccumulation(t *testing.T) {
|
|
84
|
+
// Test that token usage accumulates across multiple LLM calls
|
|
85
|
+
callCount := 0
|
|
86
|
+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
87
|
+
callCount++
|
|
88
|
+
resp := map[string]interface{}{
|
|
89
|
+
"choices": []map[string]interface{}{
|
|
90
|
+
{"message": map[string]string{"content": fmt.Sprintf(`FINAL("answer from call %d")`, callCount)}},
|
|
91
|
+
},
|
|
92
|
+
"usage": map[string]interface{}{
|
|
93
|
+
"prompt_tokens": 100 * callCount,
|
|
94
|
+
"completion_tokens": 20 * callCount,
|
|
95
|
+
"total_tokens": 120 * callCount,
|
|
96
|
+
},
|
|
97
|
+
}
|
|
98
|
+
json.NewEncoder(w).Encode(resp)
|
|
99
|
+
}))
|
|
100
|
+
defer server.Close()
|
|
101
|
+
|
|
102
|
+
engine := New("test-model", Config{
|
|
103
|
+
APIBase: server.URL,
|
|
104
|
+
MaxDepth: 5,
|
|
105
|
+
MaxIterations: 10,
|
|
106
|
+
})
|
|
107
|
+
|
|
108
|
+
_, stats, err := engine.Completion("test query", "test context")
|
|
109
|
+
if err != nil {
|
|
110
|
+
t.Fatalf("unexpected error: %v", err)
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
// First call should have returned FINAL, so 1 LLM call
|
|
114
|
+
if stats.LlmCalls != 1 {
|
|
115
|
+
t.Errorf("expected 1 LLM call, got %d", stats.LlmCalls)
|
|
116
|
+
}
|
|
117
|
+
if stats.TotalTokens != 120 {
|
|
118
|
+
t.Errorf("expected 120 total tokens, got %d", stats.TotalTokens)
|
|
119
|
+
}
|
|
120
|
+
if stats.PromptTokens != 100 {
|
|
121
|
+
t.Errorf("expected 100 prompt tokens, got %d", stats.PromptTokens)
|
|
122
|
+
}
|
|
123
|
+
if stats.CompletionTokens != 20 {
|
|
124
|
+
t.Errorf("expected 20 completion tokens, got %d", stats.CompletionTokens)
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
func TestRLMStats_TokenAccumulation_MultipleIterations(t *testing.T) {
|
|
129
|
+
// Simulates an RLM completion that takes 3 iterations before producing FINAL
|
|
130
|
+
callCount := 0
|
|
131
|
+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
132
|
+
callCount++
|
|
133
|
+
content := "context.indexOf('test')"
|
|
134
|
+
if callCount >= 3 {
|
|
135
|
+
content = `FINAL("done after 3 calls")`
|
|
136
|
+
}
|
|
137
|
+
resp := map[string]interface{}{
|
|
138
|
+
"choices": []map[string]interface{}{
|
|
139
|
+
{"message": map[string]string{"content": content}},
|
|
140
|
+
},
|
|
141
|
+
"usage": map[string]interface{}{
|
|
142
|
+
"prompt_tokens": 200,
|
|
143
|
+
"completion_tokens": 50,
|
|
144
|
+
"total_tokens": 250,
|
|
145
|
+
},
|
|
146
|
+
}
|
|
147
|
+
json.NewEncoder(w).Encode(resp)
|
|
148
|
+
}))
|
|
149
|
+
defer server.Close()
|
|
150
|
+
|
|
151
|
+
engine := New("test-model", Config{
|
|
152
|
+
APIBase: server.URL,
|
|
153
|
+
MaxDepth: 5,
|
|
154
|
+
MaxIterations: 10,
|
|
155
|
+
})
|
|
156
|
+
|
|
157
|
+
_, stats, err := engine.Completion("test query", "test context for searching")
|
|
158
|
+
if err != nil {
|
|
159
|
+
t.Fatalf("unexpected error: %v", err)
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
if stats.LlmCalls != 3 {
|
|
163
|
+
t.Errorf("expected 3 LLM calls, got %d", stats.LlmCalls)
|
|
164
|
+
}
|
|
165
|
+
// 3 calls * 250 tokens each = 750 total
|
|
166
|
+
if stats.TotalTokens != 750 {
|
|
167
|
+
t.Errorf("expected 750 total tokens (3 calls * 250), got %d", stats.TotalTokens)
|
|
168
|
+
}
|
|
169
|
+
if stats.PromptTokens != 600 {
|
|
170
|
+
t.Errorf("expected 600 prompt tokens (3 calls * 200), got %d", stats.PromptTokens)
|
|
171
|
+
}
|
|
172
|
+
if stats.CompletionTokens != 150 {
|
|
173
|
+
t.Errorf("expected 150 completion tokens (3 calls * 50), got %d", stats.CompletionTokens)
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
func TestRLMStats_TokensInJSONOutput(t *testing.T) {
|
|
178
|
+
// Verify token fields are serialized in the JSON output
|
|
179
|
+
stats := RLMStats{
|
|
180
|
+
LlmCalls: 3,
|
|
181
|
+
Iterations: 2,
|
|
182
|
+
Depth: 0,
|
|
183
|
+
TotalTokens: 750,
|
|
184
|
+
PromptTokens: 600,
|
|
185
|
+
CompletionTokens: 150,
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
data, err := json.Marshal(stats)
|
|
189
|
+
if err != nil {
|
|
190
|
+
t.Fatalf("failed to marshal stats: %v", err)
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
var parsed map[string]interface{}
|
|
194
|
+
if err := json.Unmarshal(data, &parsed); err != nil {
|
|
195
|
+
t.Fatalf("failed to unmarshal stats: %v", err)
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
if v, ok := parsed["total_tokens"].(float64); !ok || int(v) != 750 {
|
|
199
|
+
t.Errorf("expected total_tokens=750 in JSON, got %v", parsed["total_tokens"])
|
|
200
|
+
}
|
|
201
|
+
if v, ok := parsed["prompt_tokens"].(float64); !ok || int(v) != 600 {
|
|
202
|
+
t.Errorf("expected prompt_tokens=600 in JSON, got %v", parsed["prompt_tokens"])
|
|
203
|
+
}
|
|
204
|
+
if v, ok := parsed["completion_tokens"].(float64); !ok || int(v) != 150 {
|
|
205
|
+
t.Errorf("expected completion_tokens=150 in JSON, got %v", parsed["completion_tokens"])
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
func TestRLMStats_ZeroTokensOmittedFromJSON(t *testing.T) {
|
|
210
|
+
// When no tokens are tracked, fields should be omitted (omitempty)
|
|
211
|
+
stats := RLMStats{
|
|
212
|
+
LlmCalls: 1,
|
|
213
|
+
Iterations: 1,
|
|
214
|
+
Depth: 0,
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
data, err := json.Marshal(stats)
|
|
218
|
+
if err != nil {
|
|
219
|
+
t.Fatalf("failed to marshal stats: %v", err)
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
jsonStr := string(data)
|
|
223
|
+
if strings.Contains(jsonStr, "total_tokens") {
|
|
224
|
+
t.Errorf("expected total_tokens to be omitted when zero, got: %s", jsonStr)
|
|
225
|
+
}
|
|
226
|
+
if strings.Contains(jsonStr, "prompt_tokens") {
|
|
227
|
+
t.Errorf("expected prompt_tokens to be omitted when zero, got: %s", jsonStr)
|
|
228
|
+
}
|
|
229
|
+
if strings.Contains(jsonStr, "completion_tokens") {
|
|
230
|
+
t.Errorf("expected completion_tokens to be omitted when zero, got: %s", jsonStr)
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
func TestFormatStatsWithObservability_IncludesTokens(t *testing.T) {
|
|
235
|
+
stats := RLMStats{
|
|
236
|
+
LlmCalls: 2,
|
|
237
|
+
Iterations: 1,
|
|
238
|
+
Depth: 0,
|
|
239
|
+
TotalTokens: 500,
|
|
240
|
+
PromptTokens: 400,
|
|
241
|
+
CompletionTokens: 100,
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
obs := NewNoopObserver()
|
|
245
|
+
formatted := FormatStatsWithObservability(stats, obs)
|
|
246
|
+
|
|
247
|
+
if v, ok := formatted["total_tokens"].(int); !ok || v != 500 {
|
|
248
|
+
t.Errorf("expected total_tokens=500, got %v", formatted["total_tokens"])
|
|
249
|
+
}
|
|
250
|
+
if v, ok := formatted["prompt_tokens"].(int); !ok || v != 400 {
|
|
251
|
+
t.Errorf("expected prompt_tokens=400, got %v", formatted["prompt_tokens"])
|
|
252
|
+
}
|
|
253
|
+
if v, ok := formatted["completion_tokens"].(int); !ok || v != 100 {
|
|
254
|
+
t.Errorf("expected completion_tokens=100, got %v", formatted["completion_tokens"])
|
|
255
|
+
}
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
func TestFormatStatsWithObservability_OmitsZeroTokens(t *testing.T) {
|
|
259
|
+
stats := RLMStats{
|
|
260
|
+
LlmCalls: 1,
|
|
261
|
+
Iterations: 1,
|
|
262
|
+
Depth: 0,
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
obs := NewNoopObserver()
|
|
266
|
+
formatted := FormatStatsWithObservability(stats, obs)
|
|
267
|
+
|
|
268
|
+
if _, exists := formatted["total_tokens"]; exists {
|
|
269
|
+
t.Errorf("expected total_tokens to be absent when zero, got %v", formatted["total_tokens"])
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
// ─── Token Efficiency Tests ─────────────────────────────────────────────────
|
|
274
|
+
//
|
|
275
|
+
// These tests prove that RLM context reduction strategies process fewer tokens
|
|
276
|
+
// than passing an entire large document through as raw context.
|
|
277
|
+
|
|
278
|
+
// generateLargeContext creates a realistic document of approximately targetTokens tokens.
|
|
279
|
+
// It generates structured content with numbered paragraphs to make it easy to verify
|
|
280
|
+
// that reduction strategies preserve key information.
|
|
281
|
+
func generateLargeContext(targetTokens int) string {
|
|
282
|
+
// ~3.5 chars per token is our estimation ratio
|
|
283
|
+
targetChars := int(float64(targetTokens) * 3.5)
|
|
284
|
+
|
|
285
|
+
var sb strings.Builder
|
|
286
|
+
sb.WriteString("# Technical Report: System Performance Analysis\n\n")
|
|
287
|
+
sb.WriteString("## Executive Summary\n\n")
|
|
288
|
+
sb.WriteString("This comprehensive report analyzes the performance characteristics of the distributed system ")
|
|
289
|
+
sb.WriteString("deployed across three data centers. Key findings include a 15% improvement in latency, ")
|
|
290
|
+
sb.WriteString("23% reduction in error rates, and significant cost savings through resource optimization.\n\n")
|
|
291
|
+
|
|
292
|
+
paragraphNum := 1
|
|
293
|
+
for sb.Len() < targetChars {
|
|
294
|
+
// Generate diverse paragraph types to simulate realistic documents
|
|
295
|
+
switch paragraphNum % 5 {
|
|
296
|
+
case 0:
|
|
297
|
+
fmt.Fprintf(&sb, "### Section %d: Database Performance Metrics\n\n", paragraphNum)
|
|
298
|
+
fmt.Fprintf(&sb, "In quarter Q%d, the primary database cluster processed an average of %d,000 queries per second "+
|
|
299
|
+
"with a p99 latency of %d.%d milliseconds. The read-to-write ratio was approximately %d:%d. "+
|
|
300
|
+
"Connection pool utilization peaked at %d%% during high-traffic periods, with %d active connections "+
|
|
301
|
+
"out of a configured maximum of %d. Index hit ratios remained above %d%% for all primary tables, "+
|
|
302
|
+
"though the secondary indexes on the analytics tables showed degradation to %d%% during batch "+
|
|
303
|
+
"processing windows. This resulted in an overall throughput improvement of %d.%d%% compared to "+
|
|
304
|
+
"the previous quarter's baseline measurements.\n\n",
|
|
305
|
+
paragraphNum%4+1, paragraphNum*12+50, paragraphNum%10+1, paragraphNum%99,
|
|
306
|
+
paragraphNum%7+3, 1, paragraphNum%30+70, paragraphNum*3+100, paragraphNum*5+200,
|
|
307
|
+
paragraphNum%5+95, paragraphNum%20+75, paragraphNum%15+5, paragraphNum%99)
|
|
308
|
+
case 1:
|
|
309
|
+
fmt.Fprintf(&sb, "### Section %d: API Gateway Statistics\n\n", paragraphNum)
|
|
310
|
+
fmt.Fprintf(&sb, "The API gateway handled %d.%dM requests during the reporting period. Rate limiting "+
|
|
311
|
+
"was triggered %d times for %d unique clients. The top 5 endpoints by traffic volume were: "+
|
|
312
|
+
"/api/v2/users (%d.%d%%), /api/v2/products (%d.%d%%), /api/v2/orders (%d.%d%%), "+
|
|
313
|
+
"/api/v2/analytics (%d.%d%%), and /api/v2/search (%d.%d%%). Authentication failures "+
|
|
314
|
+
"decreased from %d to %d per day after implementing the new token refresh mechanism. "+
|
|
315
|
+
"The overall API availability was %d.%d%% with %d minutes of total downtime.\n\n",
|
|
316
|
+
paragraphNum*5+10, paragraphNum%99, paragraphNum*7+20, paragraphNum*3+5,
|
|
317
|
+
paragraphNum%20+20, paragraphNum%99, paragraphNum%15+15, paragraphNum%99,
|
|
318
|
+
paragraphNum%10+10, paragraphNum%99, paragraphNum%8+5, paragraphNum%99,
|
|
319
|
+
paragraphNum%5+3, paragraphNum%99, paragraphNum*2+50, paragraphNum+10,
|
|
320
|
+
99, paragraphNum%10+90, paragraphNum%30+5)
|
|
321
|
+
case 2:
|
|
322
|
+
fmt.Fprintf(&sb, "### Section %d: Memory and CPU Utilization\n\n", paragraphNum)
|
|
323
|
+
fmt.Fprintf(&sb, "Across all %d nodes in the cluster, average memory utilization was %d.%d%%. "+
|
|
324
|
+
"Node %d consistently showed the highest memory consumption at %d.%d%%, primarily due to "+
|
|
325
|
+
"in-memory caching of frequently accessed data structures. CPU utilization averaged %d.%d%% "+
|
|
326
|
+
"with peaks reaching %d.%d%% during the daily ETL batch processing window between "+
|
|
327
|
+
"%d:00 and %d:00 UTC. Garbage collection pauses were reduced from an average of %dms to %dms "+
|
|
328
|
+
"after tuning the JVM parameters. Thread pool saturation events decreased from %d per hour "+
|
|
329
|
+
"to %d per hour following the implementation of adaptive thread pool sizing.\n\n",
|
|
330
|
+
paragraphNum*2+20, paragraphNum%40+50, paragraphNum%99, paragraphNum%20+1,
|
|
331
|
+
paragraphNum%15+80, paragraphNum%99, paragraphNum%30+40, paragraphNum%99,
|
|
332
|
+
paragraphNum%20+75, paragraphNum%99, paragraphNum%6+2, paragraphNum%6+4,
|
|
333
|
+
paragraphNum%50+100, paragraphNum%30+20, paragraphNum%10+5, paragraphNum%5+1)
|
|
334
|
+
case 3:
|
|
335
|
+
fmt.Fprintf(&sb, "### Section %d: Error Analysis and Incident Report\n\n", paragraphNum)
|
|
336
|
+
fmt.Fprintf(&sb, "During the period, %d unique error types were observed across the system. "+
|
|
337
|
+
"The most frequent error (ERR-%04d) was a transient connection timeout to the Redis cluster, "+
|
|
338
|
+
"occurring %d times with a mean time to recovery of %d.%d seconds. Error category breakdown: "+
|
|
339
|
+
"network errors (%d%%), application errors (%d%%), database errors (%d%%), "+
|
|
340
|
+
"authentication errors (%d%%), and other (%d%%). The total error budget consumed was %d.%d%% "+
|
|
341
|
+
"of the allocated %d.%d%% for the quarter. Two P2 incidents were recorded on days %d and %d, "+
|
|
342
|
+
"with root causes traced to upstream provider instability and a misconfigured load balancer "+
|
|
343
|
+
"health check interval respectively.\n\n",
|
|
344
|
+
paragraphNum*3+15, paragraphNum+1000, paragraphNum*50+200, paragraphNum%10+1, paragraphNum%99,
|
|
345
|
+
paragraphNum%30+30, paragraphNum%25+20, paragraphNum%20+15, paragraphNum%10+5,
|
|
346
|
+
paragraphNum%10+5, paragraphNum%3, paragraphNum%99, paragraphNum%5, paragraphNum%99,
|
|
347
|
+
paragraphNum%28+1, paragraphNum%28+15)
|
|
348
|
+
case 4:
|
|
349
|
+
fmt.Fprintf(&sb, "### Section %d: Cost Optimization Results\n\n", paragraphNum)
|
|
350
|
+
fmt.Fprintf(&sb, "Infrastructure costs for the period totaled $%d,%03d.%02d, representing a "+
|
|
351
|
+
"%d.%d%% decrease from the previous quarter. Key savings were achieved through: "+
|
|
352
|
+
"reserved instance utilization (saving $%d,%03d), right-sizing %d underutilized instances "+
|
|
353
|
+
"(saving $%d,%03d), implementing spot instances for batch workloads (saving $%d,%03d), "+
|
|
354
|
+
"and optimizing data transfer routes (saving $%d,%03d). The cost per million API requests "+
|
|
355
|
+
"decreased from $%d.%02d to $%d.%02d. Projected annual savings based on current trends: "+
|
|
356
|
+
"$%d,%03d. Storage costs increased by %d.%d%% due to expanded logging retention requirements.\n\n",
|
|
357
|
+
paragraphNum*100+500, paragraphNum%1000, paragraphNum%100, paragraphNum%15+5, paragraphNum%99,
|
|
358
|
+
paragraphNum*20+100, paragraphNum%1000, paragraphNum*3+10, paragraphNum*10+50, paragraphNum%1000,
|
|
359
|
+
paragraphNum*8+30, paragraphNum%1000, paragraphNum*5+20, paragraphNum%1000,
|
|
360
|
+
paragraphNum%50+10, paragraphNum%100, paragraphNum%40+5, paragraphNum%100,
|
|
361
|
+
paragraphNum*300+1000, paragraphNum%1000, paragraphNum%10+2, paragraphNum%99)
|
|
362
|
+
}
|
|
363
|
+
paragraphNum++
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
return sb.String()
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
func TestTokenEfficiency_TFIDFUsesFewerTokens(t *testing.T) {
|
|
370
|
+
// Generate a large context (~35,000 tokens, well over 32k)
|
|
371
|
+
largeContext := generateLargeContext(35000)
|
|
372
|
+
originalTokens := EstimateTokens(largeContext)
|
|
373
|
+
|
|
374
|
+
if originalTokens < 32000 {
|
|
375
|
+
t.Fatalf("generated context is too small: %d tokens, need at least 32000", originalTokens)
|
|
376
|
+
}
|
|
377
|
+
t.Logf("Original context: %d chars, ~%d estimated tokens", len(largeContext), originalTokens)
|
|
378
|
+
|
|
379
|
+
// Apply TF-IDF compression to fit within a 32k token budget
|
|
380
|
+
modelLimit := 32768
|
|
381
|
+
overhead := 1000 // System prompt + query overhead
|
|
382
|
+
availableTokens := modelLimit - overhead
|
|
383
|
+
|
|
384
|
+
compressed := CompressContextTFIDF(largeContext, availableTokens)
|
|
385
|
+
compressedTokens := EstimateTokens(compressed)
|
|
386
|
+
|
|
387
|
+
t.Logf("TF-IDF compressed: %d chars, ~%d estimated tokens", len(compressed), compressedTokens)
|
|
388
|
+
t.Logf("Token reduction: %d -> %d (%.1f%% reduction)",
|
|
389
|
+
originalTokens, compressedTokens,
|
|
390
|
+
(1.0-float64(compressedTokens)/float64(originalTokens))*100)
|
|
391
|
+
|
|
392
|
+
// Core assertion: TF-IDF MUST produce fewer tokens than the original
|
|
393
|
+
if compressedTokens >= originalTokens {
|
|
394
|
+
t.Errorf("TF-IDF failed to reduce tokens: original=%d, compressed=%d", originalTokens, compressedTokens)
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
// And it must fit within our budget
|
|
398
|
+
if compressedTokens > availableTokens {
|
|
399
|
+
t.Errorf("TF-IDF output exceeds budget: %d tokens > %d available", compressedTokens, availableTokens)
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
// Verify meaningful compression (at least 5% reduction for a context that's over budget)
|
|
403
|
+
reductionPct := (1.0 - float64(compressedTokens)/float64(originalTokens)) * 100
|
|
404
|
+
if reductionPct < 5.0 {
|
|
405
|
+
t.Errorf("TF-IDF compression too weak: only %.1f%% reduction", reductionPct)
|
|
406
|
+
}
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
func TestTokenEfficiency_TextRankUsesFewerTokens(t *testing.T) {
|
|
410
|
+
largeContext := generateLargeContext(35000)
|
|
411
|
+
originalTokens := EstimateTokens(largeContext)
|
|
412
|
+
|
|
413
|
+
if originalTokens < 32000 {
|
|
414
|
+
t.Fatalf("generated context is too small: %d tokens, need at least 32000", originalTokens)
|
|
415
|
+
}
|
|
416
|
+
t.Logf("Original context: %d chars, ~%d estimated tokens", len(largeContext), originalTokens)
|
|
417
|
+
|
|
418
|
+
modelLimit := 32768
|
|
419
|
+
overhead := 1000
|
|
420
|
+
availableTokens := modelLimit - overhead
|
|
421
|
+
|
|
422
|
+
compressed := CompressContextTextRank(largeContext, availableTokens)
|
|
423
|
+
compressedTokens := EstimateTokens(compressed)
|
|
424
|
+
|
|
425
|
+
t.Logf("TextRank compressed: %d chars, ~%d estimated tokens", len(compressed), compressedTokens)
|
|
426
|
+
t.Logf("Token reduction: %d -> %d (%.1f%% reduction)",
|
|
427
|
+
originalTokens, compressedTokens,
|
|
428
|
+
(1.0-float64(compressedTokens)/float64(originalTokens))*100)
|
|
429
|
+
|
|
430
|
+
if compressedTokens >= originalTokens {
|
|
431
|
+
t.Errorf("TextRank failed to reduce tokens: original=%d, compressed=%d", originalTokens, compressedTokens)
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
if compressedTokens > availableTokens {
|
|
435
|
+
t.Errorf("TextRank output exceeds budget: %d tokens > %d available", compressedTokens, availableTokens)
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
reductionPct := (1.0 - float64(compressedTokens)/float64(originalTokens)) * 100
|
|
439
|
+
if reductionPct < 5.0 {
|
|
440
|
+
t.Errorf("TextRank compression too weak: only %.1f%% reduction", reductionPct)
|
|
441
|
+
}
|
|
442
|
+
}
|
|
443
|
+
|
|
444
|
+
func TestTokenEfficiency_TruncateUsesFewerTokens(t *testing.T) {
|
|
445
|
+
largeContext := generateLargeContext(35000)
|
|
446
|
+
originalTokens := EstimateTokens(largeContext)
|
|
447
|
+
|
|
448
|
+
if originalTokens < 32000 {
|
|
449
|
+
t.Fatalf("generated context is too small: %d tokens, need at least 32000", originalTokens)
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
modelLimit := 32768
|
|
453
|
+
overhead := 1000
|
|
454
|
+
|
|
455
|
+
// Create a reducer with truncation strategy
|
|
456
|
+
engine := New("test-model", Config{
|
|
457
|
+
MaxDepth: 5,
|
|
458
|
+
MaxIterations: 10,
|
|
459
|
+
ContextOverflow: &ContextOverflowConfig{
|
|
460
|
+
Enabled: true,
|
|
461
|
+
Strategy: "truncate",
|
|
462
|
+
SafetyMargin: 0.15,
|
|
463
|
+
},
|
|
464
|
+
})
|
|
465
|
+
|
|
466
|
+
reducer := newContextReducer(engine, *engine.contextOverflow, NewNoopObserver())
|
|
467
|
+
truncated, err := reducer.reduceByTruncation(largeContext, modelLimit, overhead)
|
|
468
|
+
if err != nil {
|
|
469
|
+
t.Fatalf("truncation failed: %v", err)
|
|
470
|
+
}
|
|
471
|
+
|
|
472
|
+
truncatedTokens := EstimateTokens(truncated)
|
|
473
|
+
|
|
474
|
+
t.Logf("Truncate: %d -> %d estimated tokens (%.1f%% reduction)",
|
|
475
|
+
originalTokens, truncatedTokens,
|
|
476
|
+
(1.0-float64(truncatedTokens)/float64(originalTokens))*100)
|
|
477
|
+
|
|
478
|
+
if truncatedTokens >= originalTokens {
|
|
479
|
+
t.Errorf("truncation failed to reduce tokens: original=%d, truncated=%d", originalTokens, truncatedTokens)
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
func TestTokenEfficiency_ChunkingProducesSmallChunks(t *testing.T) {
|
|
484
|
+
largeContext := generateLargeContext(35000)
|
|
485
|
+
originalTokens := EstimateTokens(largeContext)
|
|
486
|
+
|
|
487
|
+
if originalTokens < 32000 {
|
|
488
|
+
t.Fatalf("generated context is too small: %d tokens, need at least 32000", originalTokens)
|
|
489
|
+
}
|
|
490
|
+
|
|
491
|
+
// Chunk with a 8k token budget per chunk
|
|
492
|
+
chunkBudget := 8000
|
|
493
|
+
chunks := ChunkContext(largeContext, chunkBudget)
|
|
494
|
+
|
|
495
|
+
t.Logf("Chunked %d tokens into %d chunks (budget: %d tokens/chunk)", originalTokens, len(chunks), chunkBudget)
|
|
496
|
+
|
|
497
|
+
if len(chunks) < 2 {
|
|
498
|
+
t.Errorf("expected multiple chunks for %d token context, got %d", originalTokens, len(chunks))
|
|
499
|
+
}
|
|
500
|
+
|
|
501
|
+
// Each chunk must be smaller than the original
|
|
502
|
+
for i, chunk := range chunks {
|
|
503
|
+
chunkTokens := EstimateTokens(chunk)
|
|
504
|
+
if chunkTokens >= originalTokens {
|
|
505
|
+
t.Errorf("chunk %d is not smaller than original: %d tokens >= %d", i, chunkTokens, originalTokens)
|
|
506
|
+
}
|
|
507
|
+
t.Logf(" Chunk %d: %d estimated tokens", i, chunkTokens)
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
func TestTokenEfficiency_PreemptiveReduction(t *testing.T) {
|
|
512
|
+
// Test that PreemptiveReduceContext actually reduces a large context
|
|
513
|
+
largeContext := generateLargeContext(35000)
|
|
514
|
+
originalTokens := EstimateTokens(largeContext)
|
|
515
|
+
|
|
516
|
+
engine := New("gpt-4o-mini", Config{
|
|
517
|
+
MaxDepth: 5,
|
|
518
|
+
MaxIterations: 10,
|
|
519
|
+
ContextOverflow: &ContextOverflowConfig{
|
|
520
|
+
Enabled: true,
|
|
521
|
+
Strategy: "tfidf",
|
|
522
|
+
SafetyMargin: 0.15,
|
|
523
|
+
},
|
|
524
|
+
})
|
|
525
|
+
|
|
526
|
+
reduced, wasReduced, err := engine.PreemptiveReduceContext("Summarize the key findings", largeContext, 0)
|
|
527
|
+
if err != nil {
|
|
528
|
+
t.Fatalf("preemptive reduction failed: %v", err)
|
|
529
|
+
}
|
|
530
|
+
|
|
531
|
+
// gpt-4o-mini has 128k limit, so 35k should NOT trigger reduction
|
|
532
|
+
if wasReduced {
|
|
533
|
+
t.Logf("context was unexpectedly reduced for 35k input with 128k model limit")
|
|
534
|
+
} else {
|
|
535
|
+
t.Logf("correctly skipped reduction: 35k tokens fits within gpt-4o-mini's 128k limit")
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
// Force a smaller model limit to ensure reduction triggers
|
|
539
|
+
engine2 := New("gpt-4", Config{
|
|
540
|
+
MaxDepth: 5,
|
|
541
|
+
MaxIterations: 10,
|
|
542
|
+
ContextOverflow: &ContextOverflowConfig{
|
|
543
|
+
Enabled: true,
|
|
544
|
+
Strategy: "tfidf",
|
|
545
|
+
SafetyMargin: 0.15,
|
|
546
|
+
MaxModelTokens: 16000, // Force small limit
|
|
547
|
+
},
|
|
548
|
+
})
|
|
549
|
+
|
|
550
|
+
reduced2, wasReduced2, err := engine2.PreemptiveReduceContext("Summarize the key findings", largeContext, 0)
|
|
551
|
+
if err != nil {
|
|
552
|
+
t.Fatalf("preemptive reduction failed: %v", err)
|
|
553
|
+
}
|
|
554
|
+
|
|
555
|
+
if !wasReduced2 {
|
|
556
|
+
t.Error("expected context to be reduced when model limit is 16k and context is 35k tokens")
|
|
557
|
+
}
|
|
558
|
+
|
|
559
|
+
reducedTokens := EstimateTokens(reduced2)
|
|
560
|
+
t.Logf("Preemptive TF-IDF: %d -> %d estimated tokens (%.1f%% reduction)",
|
|
561
|
+
originalTokens, reducedTokens,
|
|
562
|
+
(1.0-float64(reducedTokens)/float64(originalTokens))*100)
|
|
563
|
+
|
|
564
|
+
if reducedTokens >= originalTokens {
|
|
565
|
+
t.Errorf("preemptive reduction failed: original=%d, reduced=%d", originalTokens, reducedTokens)
|
|
566
|
+
}
|
|
567
|
+
|
|
568
|
+
_ = reduced // used above
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
func TestTokenEfficiency_AllStrategiesCompared(t *testing.T) {
|
|
572
|
+
// Generate a 40k token context (well over 32k limit)
|
|
573
|
+
largeContext := generateLargeContext(40000)
|
|
574
|
+
originalTokens := EstimateTokens(largeContext)
|
|
575
|
+
|
|
576
|
+
if originalTokens < 35000 {
|
|
577
|
+
t.Fatalf("generated context is too small: %d tokens, need at least 35000", originalTokens)
|
|
578
|
+
}
|
|
579
|
+
|
|
580
|
+
modelLimit := 32768
|
|
581
|
+
overhead := 1000
|
|
582
|
+
|
|
583
|
+
t.Logf("Original context: %d chars, ~%d estimated tokens", len(largeContext), originalTokens)
|
|
584
|
+
t.Logf("Model limit: %d tokens, overhead: %d, available: %d", modelLimit, overhead, modelLimit-overhead)
|
|
585
|
+
|
|
586
|
+
// Track results for each strategy
|
|
587
|
+
type strategyResult struct {
|
|
588
|
+
name string
|
|
589
|
+
reducedTokens int
|
|
590
|
+
reductionPct float64
|
|
591
|
+
requiresLLM bool
|
|
592
|
+
}
|
|
593
|
+
var results []strategyResult
|
|
594
|
+
|
|
595
|
+
availableTokens := modelLimit - overhead
|
|
596
|
+
|
|
597
|
+
// TF-IDF (pure algorithmic)
|
|
598
|
+
tfidfResult := CompressContextTFIDF(largeContext, availableTokens)
|
|
599
|
+
tfidfTokens := EstimateTokens(tfidfResult)
|
|
600
|
+
results = append(results, strategyResult{
|
|
601
|
+
name: "tfidf",
|
|
602
|
+
reducedTokens: tfidfTokens,
|
|
603
|
+
reductionPct: (1.0 - float64(tfidfTokens)/float64(originalTokens)) * 100,
|
|
604
|
+
requiresLLM: false,
|
|
605
|
+
})
|
|
606
|
+
|
|
607
|
+
// TextRank (pure algorithmic)
|
|
608
|
+
textRankResult := CompressContextTextRank(largeContext, availableTokens)
|
|
609
|
+
textRankTokens := EstimateTokens(textRankResult)
|
|
610
|
+
results = append(results, strategyResult{
|
|
611
|
+
name: "textrank",
|
|
612
|
+
reducedTokens: textRankTokens,
|
|
613
|
+
reductionPct: (1.0 - float64(textRankTokens)/float64(originalTokens)) * 100,
|
|
614
|
+
requiresLLM: false,
|
|
615
|
+
})
|
|
616
|
+
|
|
617
|
+
// Truncation
|
|
618
|
+
engine := New("test-model", Config{
|
|
619
|
+
MaxDepth: 5,
|
|
620
|
+
MaxIterations: 10,
|
|
621
|
+
ContextOverflow: &ContextOverflowConfig{
|
|
622
|
+
Enabled: true,
|
|
623
|
+
Strategy: "truncate",
|
|
624
|
+
SafetyMargin: 0.15,
|
|
625
|
+
},
|
|
626
|
+
})
|
|
627
|
+
reducer := newContextReducer(engine, *engine.contextOverflow, NewNoopObserver())
|
|
628
|
+
truncResult, _ := reducer.reduceByTruncation(largeContext, modelLimit, overhead)
|
|
629
|
+
truncTokens := EstimateTokens(truncResult)
|
|
630
|
+
results = append(results, strategyResult{
|
|
631
|
+
name: "truncate",
|
|
632
|
+
reducedTokens: truncTokens,
|
|
633
|
+
reductionPct: (1.0 - float64(truncTokens)/float64(originalTokens)) * 100,
|
|
634
|
+
requiresLLM: false,
|
|
635
|
+
})
|
|
636
|
+
|
|
637
|
+
// Print comparison table
|
|
638
|
+
t.Logf("\n--- Token Efficiency Comparison ---")
|
|
639
|
+
t.Logf("%-12s | %12s | %10s | %s", "Strategy", "Tokens Used", "Reduction", "Requires LLM")
|
|
640
|
+
t.Logf("%-12s | %12s | %10s | %s", "------------", "------------", "----------", "------------")
|
|
641
|
+
t.Logf("%-12s | %12d | %9s | %s", "raw (none)", originalTokens, "0.0%", "no")
|
|
642
|
+
for _, r := range results {
|
|
643
|
+
llmStr := "no"
|
|
644
|
+
if r.requiresLLM {
|
|
645
|
+
llmStr = "yes"
|
|
646
|
+
}
|
|
647
|
+
t.Logf("%-12s | %12d | %9.1f%% | %s", r.name, r.reducedTokens, r.reductionPct, llmStr)
|
|
648
|
+
}
|
|
649
|
+
|
|
650
|
+
// Assert ALL strategies use fewer tokens than raw
|
|
651
|
+
for _, r := range results {
|
|
652
|
+
if r.reducedTokens >= originalTokens {
|
|
653
|
+
t.Errorf("strategy %q failed: %d tokens >= original %d tokens", r.name, r.reducedTokens, originalTokens)
|
|
654
|
+
}
|
|
655
|
+
}
|
|
656
|
+
|
|
657
|
+
// Assert all strategies fit within the model limit
|
|
658
|
+
for _, r := range results {
|
|
659
|
+
if r.reducedTokens > availableTokens {
|
|
660
|
+
t.Errorf("strategy %q exceeds budget: %d tokens > %d available", r.name, r.reducedTokens, availableTokens)
|
|
661
|
+
}
|
|
662
|
+
}
|
|
663
|
+
}
|
|
664
|
+
|
|
665
|
+
func TestTokenEfficiency_VeryLargeContext_100kTokens(t *testing.T) {
|
|
666
|
+
// Test with a very large context (~100k tokens) to prove scaling
|
|
667
|
+
largeContext := generateLargeContext(100000)
|
|
668
|
+
originalTokens := EstimateTokens(largeContext)
|
|
669
|
+
|
|
670
|
+
if originalTokens < 90000 {
|
|
671
|
+
t.Fatalf("generated context is too small: %d tokens, need at least 90000", originalTokens)
|
|
672
|
+
}
|
|
673
|
+
|
|
674
|
+
modelLimit := 32768
|
|
675
|
+
overhead := 1000
|
|
676
|
+
availableTokens := modelLimit - overhead
|
|
677
|
+
|
|
678
|
+
t.Logf("Original: ~%d estimated tokens (3x over 32k limit)", originalTokens)
|
|
679
|
+
|
|
680
|
+
// TF-IDF
|
|
681
|
+
tfidfResult := CompressContextTFIDF(largeContext, availableTokens)
|
|
682
|
+
tfidfTokens := EstimateTokens(tfidfResult)
|
|
683
|
+
|
|
684
|
+
// TextRank
|
|
685
|
+
textRankResult := CompressContextTextRank(largeContext, availableTokens)
|
|
686
|
+
textRankTokens := EstimateTokens(textRankResult)
|
|
687
|
+
|
|
688
|
+
t.Logf("TF-IDF: %d tokens (%.1f%% reduction)", tfidfTokens, (1.0-float64(tfidfTokens)/float64(originalTokens))*100)
|
|
689
|
+
t.Logf("TextRank: %d tokens (%.1f%% reduction)", textRankTokens, (1.0-float64(textRankTokens)/float64(originalTokens))*100)
|
|
690
|
+
|
|
691
|
+
// Both must be significantly smaller
|
|
692
|
+
if tfidfTokens >= originalTokens/2 {
|
|
693
|
+
t.Errorf("TF-IDF should reduce 100k context by at least 50%%: got %d tokens", tfidfTokens)
|
|
694
|
+
}
|
|
695
|
+
if textRankTokens >= originalTokens/2 {
|
|
696
|
+
t.Errorf("TextRank should reduce 100k context by at least 50%%: got %d tokens", textRankTokens)
|
|
697
|
+
}
|
|
698
|
+
|
|
699
|
+
// Both must fit within budget
|
|
700
|
+
if tfidfTokens > availableTokens {
|
|
701
|
+
t.Errorf("TF-IDF exceeds budget: %d > %d", tfidfTokens, availableTokens)
|
|
702
|
+
}
|
|
703
|
+
if textRankTokens > availableTokens {
|
|
704
|
+
t.Errorf("TextRank exceeds budget: %d > %d", textRankTokens, availableTokens)
|
|
705
|
+
}
|
|
706
|
+
}
|
|
707
|
+
|
|
708
|
+
func TestTokenEfficiency_MapReduceTracksTokens(t *testing.T) {
|
|
709
|
+
// Test that mapreduce strategy properly accumulates token usage from multiple chunks
|
|
710
|
+
callCount := 0
|
|
711
|
+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
712
|
+
callCount++
|
|
713
|
+
// Simulate summarization - return a short summary for each chunk
|
|
714
|
+
resp := map[string]interface{}{
|
|
715
|
+
"choices": []map[string]interface{}{
|
|
716
|
+
{"message": map[string]string{"content": fmt.Sprintf("Summary of chunk %d: key finding was performance improvement.", callCount)}},
|
|
717
|
+
},
|
|
718
|
+
"usage": map[string]interface{}{
|
|
719
|
+
"prompt_tokens": 500 + callCount*50,
|
|
720
|
+
"completion_tokens": 30,
|
|
721
|
+
"total_tokens": 530 + callCount*50,
|
|
722
|
+
},
|
|
723
|
+
}
|
|
724
|
+
json.NewEncoder(w).Encode(resp)
|
|
725
|
+
}))
|
|
726
|
+
defer server.Close()
|
|
727
|
+
|
|
728
|
+
engine := New("test-model", Config{
|
|
729
|
+
APIBase: server.URL,
|
|
730
|
+
MaxDepth: 5,
|
|
731
|
+
MaxIterations: 10,
|
|
732
|
+
ContextOverflow: &ContextOverflowConfig{
|
|
733
|
+
Enabled: true,
|
|
734
|
+
Strategy: "mapreduce",
|
|
735
|
+
SafetyMargin: 0.15,
|
|
736
|
+
},
|
|
737
|
+
})
|
|
738
|
+
|
|
739
|
+
// Create a large context that will be split into multiple chunks
|
|
740
|
+
largeContext := generateLargeContext(40000)
|
|
741
|
+
query := "Summarize the key findings"
|
|
742
|
+
|
|
743
|
+
reducer := newContextReducer(engine, *engine.contextOverflow, NewNoopObserver())
|
|
744
|
+
reduced, err := reducer.ReduceForCompletion(query, largeContext, 16000)
|
|
745
|
+
if err != nil {
|
|
746
|
+
t.Fatalf("mapreduce reduction failed: %v", err)
|
|
747
|
+
}
|
|
748
|
+
|
|
749
|
+
// Verify that token usage was accumulated
|
|
750
|
+
if engine.stats.TotalTokens == 0 {
|
|
751
|
+
t.Error("expected total_tokens > 0 after mapreduce reduction, got 0")
|
|
752
|
+
}
|
|
753
|
+
if engine.stats.PromptTokens == 0 {
|
|
754
|
+
t.Error("expected prompt_tokens > 0 after mapreduce reduction, got 0")
|
|
755
|
+
}
|
|
756
|
+
if engine.stats.CompletionTokens == 0 {
|
|
757
|
+
t.Error("expected completion_tokens > 0 after mapreduce reduction, got 0")
|
|
758
|
+
}
|
|
759
|
+
|
|
760
|
+
t.Logf("MapReduce token tracking: %d total tokens (%d prompt, %d completion) across %d LLM calls",
|
|
761
|
+
engine.stats.TotalTokens, engine.stats.PromptTokens, engine.stats.CompletionTokens, engine.stats.LlmCalls)
|
|
762
|
+
t.Logf("Reduced context: %d chars", len(reduced))
|
|
763
|
+
|
|
764
|
+
// The reduced context should be much smaller than the original
|
|
765
|
+
if len(reduced) >= len(largeContext) {
|
|
766
|
+
t.Errorf("mapreduce failed to reduce context: %d chars >= original %d chars", len(reduced), len(largeContext))
|
|
767
|
+
}
|
|
768
|
+
}
|
|
769
|
+
|
|
770
|
+
func TestTokenEfficiency_StructuredCompletion_TracksTokens(t *testing.T) {
|
|
771
|
+
// Verify structured completion accumulates tokens
|
|
772
|
+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
773
|
+
resp := map[string]interface{}{
|
|
774
|
+
"choices": []map[string]interface{}{
|
|
775
|
+
{"message": map[string]string{"content": `{"summary": "test result", "score": 8}`}},
|
|
776
|
+
},
|
|
777
|
+
"usage": map[string]interface{}{
|
|
778
|
+
"prompt_tokens": 300,
|
|
779
|
+
"completion_tokens": 15,
|
|
780
|
+
"total_tokens": 315,
|
|
781
|
+
},
|
|
782
|
+
}
|
|
783
|
+
json.NewEncoder(w).Encode(resp)
|
|
784
|
+
}))
|
|
785
|
+
defer server.Close()
|
|
786
|
+
|
|
787
|
+
engine := New("test-model", Config{
|
|
788
|
+
APIBase: server.URL,
|
|
789
|
+
MaxDepth: 5,
|
|
790
|
+
MaxIterations: 10,
|
|
791
|
+
})
|
|
792
|
+
|
|
793
|
+
schema := &StructuredConfig{
|
|
794
|
+
Schema: &JSONSchema{
|
|
795
|
+
Type: "object",
|
|
796
|
+
Properties: map[string]*JSONSchema{
|
|
797
|
+
"summary": {Type: "string"},
|
|
798
|
+
"score": {Type: "number"},
|
|
799
|
+
},
|
|
800
|
+
Required: []string{"summary", "score"},
|
|
801
|
+
},
|
|
802
|
+
MaxRetries: 3,
|
|
803
|
+
}
|
|
804
|
+
|
|
805
|
+
result, stats, err := engine.StructuredCompletion("Analyze this", "Some test context", schema)
|
|
806
|
+
if err != nil {
|
|
807
|
+
t.Fatalf("structured completion failed: %v", err)
|
|
808
|
+
}
|
|
809
|
+
|
|
810
|
+
if result == nil {
|
|
811
|
+
t.Fatal("expected non-nil result")
|
|
812
|
+
}
|
|
813
|
+
|
|
814
|
+
if stats.TotalTokens == 0 {
|
|
815
|
+
t.Error("expected total_tokens > 0 after structured completion, got 0")
|
|
816
|
+
}
|
|
817
|
+
if stats.PromptTokens == 0 {
|
|
818
|
+
t.Error("expected prompt_tokens > 0 after structured completion")
|
|
819
|
+
}
|
|
820
|
+
if stats.CompletionTokens == 0 {
|
|
821
|
+
t.Error("expected completion_tokens > 0 after structured completion")
|
|
822
|
+
}
|
|
823
|
+
|
|
824
|
+
t.Logf("Structured completion: %d total tokens (%d prompt, %d completion)", stats.TotalTokens, stats.PromptTokens, stats.CompletionTokens)
|
|
825
|
+
}
|
|
826
|
+
|
|
827
|
+
// ─── Token Estimation Accuracy Tests ─────────────────────────────────────────
|
|
828
|
+
|
|
829
|
+
func TestEstimateTokens_AccuracyForLargeContent(t *testing.T) {
|
|
830
|
+
// Verify that our estimation stays reasonable for large content
|
|
831
|
+
content := generateLargeContext(32000)
|
|
832
|
+
estimated := EstimateTokens(content)
|
|
833
|
+
|
|
834
|
+
// Real tokenizer would give different results, but our estimation should be
|
|
835
|
+
// within a reasonable range. The key property: conservative (over-estimates slightly)
|
|
836
|
+
charToTokenRatio := float64(len(content)) / float64(estimated)
|
|
837
|
+
|
|
838
|
+
// Our estimator uses 3.5 chars/token, so ratio should be ~3.5
|
|
839
|
+
if math.Abs(charToTokenRatio-3.5) > 0.5 {
|
|
840
|
+
t.Errorf("char-to-token ratio %.2f deviates too far from expected ~3.5", charToTokenRatio)
|
|
841
|
+
}
|
|
842
|
+
|
|
843
|
+
t.Logf("Large content: %d chars, %d estimated tokens, ratio: %.2f chars/token",
|
|
844
|
+
len(content), estimated, charToTokenRatio)
|
|
845
|
+
}
|
package/go/rlm/types.go
CHANGED
|
@@ -6,10 +6,13 @@ import (
|
|
|
6
6
|
)
|
|
7
7
|
|
|
8
8
|
type RLMStats struct {
|
|
9
|
-
LlmCalls
|
|
10
|
-
Iterations
|
|
11
|
-
Depth
|
|
12
|
-
ParsingRetries
|
|
9
|
+
LlmCalls int `json:"llm_calls"`
|
|
10
|
+
Iterations int `json:"iterations"`
|
|
11
|
+
Depth int `json:"depth"`
|
|
12
|
+
ParsingRetries int `json:"parsing_retries,omitempty"`
|
|
13
|
+
TotalTokens int `json:"total_tokens,omitempty"`
|
|
14
|
+
PromptTokens int `json:"prompt_tokens,omitempty"`
|
|
15
|
+
CompletionTokens int `json:"completion_tokens,omitempty"`
|
|
13
16
|
}
|
|
14
17
|
|
|
15
18
|
type JSONSchema struct {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "recursive-llm-ts",
|
|
3
|
-
"version": "4.8.0",
|
|
3
|
+
"version": "4.9.0",
|
|
4
4
|
"description": "TypeScript bridge for recursive-llm: Recursive Language Models for unbounded context processing with structured outputs",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
@@ -48,12 +48,12 @@
|
|
|
48
48
|
"license": "MIT",
|
|
49
49
|
"repository": {
|
|
50
50
|
"type": "git",
|
|
51
|
-
"url": "git+https://github.com/
|
|
51
|
+
"url": "git+https://github.com/howlerops/recursive-llm-ts.git"
|
|
52
52
|
},
|
|
53
53
|
"bugs": {
|
|
54
|
-
"url": "https://github.com/
|
|
54
|
+
"url": "https://github.com/howlerops/recursive-llm-ts/issues"
|
|
55
55
|
},
|
|
56
|
-
"homepage": "https://github.com/
|
|
56
|
+
"homepage": "https://github.com/howlerops/recursive-llm-ts#readme",
|
|
57
57
|
"dependencies": {
|
|
58
58
|
"zod": "^4.3.6"
|
|
59
59
|
},
|