recursive-llm-ts 4.4.1 → 4.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +375 -12
- package/bin/rlm-go +0 -0
- package/dist/bridge-interface.d.ts +19 -2
- package/dist/cache.d.ts +78 -0
- package/dist/cache.js +246 -0
- package/dist/config.d.ts +37 -0
- package/dist/config.js +162 -0
- package/dist/errors.d.ts +113 -0
- package/dist/errors.js +219 -0
- package/dist/events.d.ts +126 -0
- package/dist/events.js +77 -0
- package/dist/index.d.ts +8 -2
- package/dist/index.js +38 -1
- package/dist/retry.d.ts +56 -0
- package/dist/retry.js +185 -0
- package/dist/rlm.d.ts +391 -13
- package/dist/rlm.js +815 -182
- package/dist/streaming.d.ts +96 -0
- package/dist/streaming.js +210 -0
- package/go/README.md +9 -1
- package/go/rlm/context_overflow.go +566 -0
- package/go/rlm/context_overflow_test.go +783 -0
- package/go/rlm/errors.go +161 -1
- package/go/rlm/rlm.go +10 -0
- package/go/rlm/structured.go +53 -0
- package/go/rlm/textrank.go +273 -0
- package/go/rlm/textrank_test.go +335 -0
- package/go/rlm/tfidf.go +225 -0
- package/go/rlm/tfidf_test.go +272 -0
- package/go/rlm/types.go +25 -2
- package/package.json +16 -4
|
@@ -0,0 +1,566 @@
|
|
|
1
|
+
package rlm
|
|
2
|
+
|
|
3
|
+
import (
|
|
4
|
+
"fmt"
|
|
5
|
+
"strings"
|
|
6
|
+
"sync"
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
// ContextOverflowConfig configures automatic context overflow handling:
// whether recovery is attempted, which reduction strategy is used, and how
// much of the model's token budget is held back for prompts and overhead.
type ContextOverflowConfig struct {
	// Enabled turns on automatic overflow detection and recovery
	// (default: true when config present).
	Enabled bool `json:"enabled"`
	// MaxModelTokens overrides the detected model token limit.
	// 0 means auto-detect the limit from API errors.
	MaxModelTokens int `json:"max_model_tokens,omitempty"`
	// Strategy selects how context is reduced: "mapreduce" (default),
	// "truncate", "chunked", "tfidf", "textrank", or "refine".
	// Any other value (including the empty string) is handled as "mapreduce".
	Strategy string `json:"strategy,omitempty"`
	// SafetyMargin is the fraction of the model token limit reserved for
	// prompts/overhead when sizing chunks (default: 0.15).
	SafetyMargin float64 `json:"safety_margin,omitempty"`
	// MaxReductionAttempts is how many times to retry with a smaller context
	// before giving up (default: 3).
	MaxReductionAttempts int `json:"max_reduction_attempts,omitempty"`
}
|
|
22
|
+
|
|
23
|
+
// DefaultContextOverflowConfig returns sensible defaults for overflow handling.
|
|
24
|
+
func DefaultContextOverflowConfig() ContextOverflowConfig {
|
|
25
|
+
return ContextOverflowConfig{
|
|
26
|
+
Enabled: true,
|
|
27
|
+
Strategy: "mapreduce",
|
|
28
|
+
SafetyMargin: 0.15,
|
|
29
|
+
MaxReductionAttempts: 3,
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
// ─── Token Estimation ────────────────────────────────────────────────────────
|
|
34
|
+
|
|
35
|
+
// EstimateTokens returns a quick, deliberately pessimistic guess at how many
// tokens text will occupy. It assumes 3.5 bytes per token and rounds up, so
// the estimate errs on the high side to help avoid overflow.
//
// Rough ratios for common encodings, for reference:
//   - English text: ~4 chars/token (cl100k_base)
//   - JSON/code:    ~3.5 chars/token
//   - CJK text:     ~1.5 chars/token
//   - Mixed:        ~3.5 chars/token (the value used here)
//
// An empty string yields 0.
func EstimateTokens(text string) int {
	// Work in tenths so the 3.5 divisor becomes the integer 35.
	scaled := len(text) * 10
	tokens := scaled / 35
	if scaled%35 != 0 {
		tokens++ // round up: ceil(len / 3.5)
	}
	return tokens
}
|
|
51
|
+
|
|
52
|
+
// EstimateMessagesTokens estimates the total tokens for a set of chat messages.
|
|
53
|
+
// Includes per-message overhead (~4 tokens per message for role + formatting).
|
|
54
|
+
func EstimateMessagesTokens(messages []Message) int {
|
|
55
|
+
total := 3 // Every reply is primed with <|im_start|>assistant<|im_sep|>
|
|
56
|
+
for _, msg := range messages {
|
|
57
|
+
total += 4 // role + formatting overhead
|
|
58
|
+
total += EstimateTokens(msg.Content)
|
|
59
|
+
}
|
|
60
|
+
return total
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// ─── Context Chunking ────────────────────────────────────────────────────────
|
|
64
|
+
|
|
65
|
+
// ChunkContext splits context into chunks that fit within a token budget.
//
// The budget is converted to bytes at 3 bytes per token (deliberately
// conservative). Chunks end on a paragraph, line, sentence, or word boundary
// when one exists near the end of the window (see findBreakPoint), and each
// chunk after the first starts roughly 10% of a chunk before the previous
// chunk's end so neighbouring chunks share some context.
//
// Cut positions are aligned to UTF-8 rune boundaries, so valid UTF-8 input is
// never split in the middle of a multi-byte character (unless the budget is
// smaller than a single encoded rune).
//
// A non-positive maxTokensPerChunk falls back to 4000. Input that already fits
// is returned as a single-element slice.
func ChunkContext(context string, maxTokensPerChunk int) []string {
	if maxTokensPerChunk <= 0 {
		maxTokensPerChunk = 4000
	}

	// Use 3 chars/token to leave room.
	maxCharsPerChunk := maxTokensPerChunk * 3
	if len(context) <= maxCharsPerChunk {
		return []string{context}
	}

	var chunks []string
	overlapChars := maxCharsPerChunk / 10 // 10% overlap for context continuity

	pos := 0
	for pos < len(context) {
		end := pos + maxCharsPerChunk
		if end >= len(context) {
			chunks = append(chunks, context[pos:])
			break
		}

		// Prefer a paragraph boundary, then line, sentence, word.
		breakPoint := findBreakPoint(context, pos, end)
		chunks = append(chunks, context[pos:breakPoint])

		// Rewind by the overlap, but never into the middle of a rune and
		// never to (or before) the start of the chunk just emitted, so every
		// iteration makes forward progress.
		next := alignChunkCutToRune(context, breakPoint-overlapChars, pos)
		if next <= pos {
			next = breakPoint
		}
		pos = next
	}

	return chunks
}

// findBreakPoint finds the best position to split text near the target end position.
// It searches the last 20% of the window [start, targetEnd) and prefers, in
// order: paragraph breaks (\n\n), line breaks (\n), sentence ends (". ", "! ",
// "? "), then word breaks (a space). The returned index is the position just
// after the separator. If no separator is found, it returns targetEnd moved
// back to the nearest UTF-8 rune boundary (always greater than start).
// If targetEnd is at or beyond the end of text, len(text) is returned.
func findBreakPoint(text string, start int, targetEnd int) int {
	if targetEnd >= len(text) {
		return len(text)
	}

	// Search window: look back from targetEnd up to 20% of the chunk.
	searchStart := targetEnd - (targetEnd-start)/5
	if searchStart < start {
		searchStart = start
	}
	searchRegion := text[searchStart:targetEnd]

	// Separators are pure ASCII, so a match is always on a rune boundary in
	// valid UTF-8 text. Listed from most to least preferred.
	for _, sep := range []string{"\n\n", "\n", ". ", "! ", "? ", " "} {
		if idx := strings.LastIndex(searchRegion, sep); idx >= 0 {
			return searchStart + idx + len(sep)
		}
	}

	// No good break point: split at the target, without cutting a rune in half.
	return alignChunkCutToRune(text, targetEnd, start)
}

// alignChunkCutToRune moves pos back (by at most 3 bytes, the maximum number
// of continuation bytes in a UTF-8 sequence) to the first byte of the rune it
// falls in. The result is always greater than floor; if no rune start is found
// in that range (invalid UTF-8, or the rune begins at or before floor), pos is
// returned unchanged.
func alignChunkCutToRune(text string, pos int, floor int) int {
	if pos >= len(text) {
		return pos
	}
	for i, steps := pos, 0; i > floor && steps < 4; i, steps = i-1, steps+1 {
		// UTF-8 continuation bytes have the bit pattern 10xxxxxx.
		if text[i]&0xC0 != 0x80 {
			return i
		}
	}
	return pos
}
|
|
148
|
+
|
|
149
|
+
// ─── MapReduce Context Reduction ─────────────────────────────────────────────
|
|
150
|
+
|
|
151
|
+
// MapReduceResult holds the result of a MapReduce context reduction.
// NOTE(review): nothing in this part of the file constructs or reads this
// type, so the field descriptions below are inferred from the names; confirm
// against callers.
type MapReduceResult struct {
	ReducedContext string // presumably the context text after reduction
	ChunkCount     int    // presumably the number of chunks the context was split into
	OriginalTokens int    // presumably the estimated token count before reduction
	ReducedTokens  int    // presumably the estimated token count after reduction
}
|
|
158
|
+
|
|
159
|
+
// contextReducer manages context reduction for overflow recovery.
// It bundles what the reduction strategies need: the engine whose model and
// API settings are used for summarization calls, the overflow configuration,
// and an observer for debug output.
type contextReducer struct {
	rlm    *RLM                  // engine supplying model, API base/key, timeout, extra params; its stats.LlmCalls is incremented per successful call
	config ContextOverflowConfig // Strategy and SafetyMargin are read when reducing
	obs    *Observer             // receives "overflow" debug messages
}
|
|
165
|
+
|
|
166
|
+
// newContextReducer creates a reducer bound to an RLM engine
|
|
167
|
+
func newContextReducer(rlm *RLM, config ContextOverflowConfig, obs *Observer) *contextReducer {
|
|
168
|
+
return &contextReducer{rlm: rlm, config: config, obs: obs}
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
// ReduceForCompletion handles context overflow for a regular completion.
|
|
172
|
+
// It chunks the context, summarizes each chunk, and combines the summaries.
|
|
173
|
+
func (cr *contextReducer) ReduceForCompletion(query string, context string, modelLimit int) (string, error) {
|
|
174
|
+
cr.obs.Debug("overflow", "Starting MapReduce context reduction: %d estimated tokens, limit %d", EstimateTokens(context), modelLimit)
|
|
175
|
+
|
|
176
|
+
// Calculate safe token budget per chunk
|
|
177
|
+
// Reserve tokens for: system prompt (~500), query, overhead, safety margin
|
|
178
|
+
queryTokens := EstimateTokens(query)
|
|
179
|
+
overhead := 500 + queryTokens + int(float64(modelLimit)*cr.config.SafetyMargin)
|
|
180
|
+
safeTokensPerChunk := modelLimit - overhead
|
|
181
|
+
|
|
182
|
+
if safeTokensPerChunk <= 0 {
|
|
183
|
+
safeTokensPerChunk = modelLimit / 2
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
chunks := ChunkContext(context, safeTokensPerChunk)
|
|
187
|
+
cr.obs.Debug("overflow", "Split context into %d chunks (budget: %d tokens/chunk)", len(chunks), safeTokensPerChunk)
|
|
188
|
+
|
|
189
|
+
if len(chunks) == 1 {
|
|
190
|
+
// Context is already small enough (or couldn't be meaningfully split)
|
|
191
|
+
return context, nil
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
switch cr.config.Strategy {
|
|
195
|
+
case "truncate":
|
|
196
|
+
return cr.reduceByTruncation(context, modelLimit, overhead)
|
|
197
|
+
case "chunked":
|
|
198
|
+
return cr.reduceByChunkedExtraction(query, chunks, modelLimit, overhead)
|
|
199
|
+
case "tfidf":
|
|
200
|
+
return cr.reduceByTFIDF(context, modelLimit, overhead)
|
|
201
|
+
case "textrank":
|
|
202
|
+
return cr.reduceByTextRank(context, modelLimit, overhead)
|
|
203
|
+
case "refine":
|
|
204
|
+
return cr.reduceByRefine(query, chunks, modelLimit, overhead)
|
|
205
|
+
default: // "mapreduce"
|
|
206
|
+
return cr.reduceByMapReduce(query, chunks, modelLimit, overhead)
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
// reduceByMapReduce summarizes each chunk and combines the summaries.
|
|
211
|
+
func (cr *contextReducer) reduceByMapReduce(query string, chunks []string, modelLimit int, overhead int) (string, error) {
|
|
212
|
+
cr.obs.Debug("overflow", "Using MapReduce strategy with %d chunks", len(chunks))
|
|
213
|
+
|
|
214
|
+
summaries := make([]string, len(chunks))
|
|
215
|
+
errs := make([]error, len(chunks))
|
|
216
|
+
var wg sync.WaitGroup
|
|
217
|
+
|
|
218
|
+
// Map phase: summarize each chunk in parallel
|
|
219
|
+
for i, chunk := range chunks {
|
|
220
|
+
wg.Add(1)
|
|
221
|
+
go func(idx int, chunkText string) {
|
|
222
|
+
defer wg.Done()
|
|
223
|
+
|
|
224
|
+
mapPrompt := fmt.Sprintf(
|
|
225
|
+
"Summarize the following text chunk, preserving all key facts, data points, names, numbers, and specific details that would be needed to answer the question: %q\n\n"+
|
|
226
|
+
"IMPORTANT: Be thorough and retain specific data. Do not omit numbers, percentages, dates, or named entities.\n\n"+
|
|
227
|
+
"Text chunk (%d of %d):\n%s",
|
|
228
|
+
query, idx+1, len(chunks), chunkText,
|
|
229
|
+
)
|
|
230
|
+
|
|
231
|
+
messages := []Message{
|
|
232
|
+
{Role: "system", Content: "You are a precise summarization assistant. Preserve all factual details, data points, and specific information."},
|
|
233
|
+
{Role: "user", Content: mapPrompt},
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
result, err := CallChatCompletion(ChatRequest{
|
|
237
|
+
Model: cr.rlm.model,
|
|
238
|
+
Messages: messages,
|
|
239
|
+
APIBase: cr.rlm.apiBase,
|
|
240
|
+
APIKey: cr.rlm.apiKey,
|
|
241
|
+
Timeout: cr.rlm.timeoutSeconds,
|
|
242
|
+
ExtraParams: cr.rlm.extraParams,
|
|
243
|
+
})
|
|
244
|
+
if err != nil {
|
|
245
|
+
errs[idx] = fmt.Errorf("map phase chunk %d: %w", idx+1, err)
|
|
246
|
+
return
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
cr.rlm.stats.LlmCalls++
|
|
250
|
+
summaries[idx] = result
|
|
251
|
+
cr.obs.Debug("overflow", "Chunk %d/%d summarized: %d -> %d chars", idx+1, len(chunks), len(chunkText), len(result))
|
|
252
|
+
}(i, chunk)
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
wg.Wait()
|
|
256
|
+
|
|
257
|
+
// Check for errors
|
|
258
|
+
var mapErrors []string
|
|
259
|
+
for _, err := range errs {
|
|
260
|
+
if err != nil {
|
|
261
|
+
mapErrors = append(mapErrors, err.Error())
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
if len(mapErrors) > 0 {
|
|
265
|
+
return "", fmt.Errorf("MapReduce map phase failed: %s", strings.Join(mapErrors, "; "))
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
// Reduce phase: combine summaries
|
|
269
|
+
combined := strings.Join(summaries, "\n\n---\n\n")
|
|
270
|
+
|
|
271
|
+
// Check if combined summaries fit in the budget
|
|
272
|
+
if EstimateTokens(combined)+overhead < modelLimit {
|
|
273
|
+
cr.obs.Debug("overflow", "MapReduce complete: %d -> %d estimated tokens", EstimateTokens(strings.Join(chunks, "")), EstimateTokens(combined))
|
|
274
|
+
return combined, nil
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
// If summaries are still too large, recursively reduce
|
|
278
|
+
cr.obs.Debug("overflow", "Combined summaries still too large (%d tokens), reducing recursively", EstimateTokens(combined))
|
|
279
|
+
return cr.ReduceForCompletion(query, combined, modelLimit)
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
// reduceByTruncation simply truncates context to fit within the limit.
|
|
283
|
+
func (cr *contextReducer) reduceByTruncation(context string, modelLimit int, overhead int) (string, error) {
|
|
284
|
+
cr.obs.Debug("overflow", "Using truncation strategy")
|
|
285
|
+
|
|
286
|
+
availableTokens := modelLimit - overhead
|
|
287
|
+
maxChars := availableTokens * 3 // Conservative chars-to-tokens
|
|
288
|
+
|
|
289
|
+
if maxChars >= len(context) {
|
|
290
|
+
return context, nil
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
// Keep beginning and end, truncate middle (addresses "lost in the middle" problem)
|
|
294
|
+
keepFromStart := maxChars * 2 / 3
|
|
295
|
+
keepFromEnd := maxChars / 3
|
|
296
|
+
|
|
297
|
+
truncated := context[:keepFromStart] +
|
|
298
|
+
"\n\n[... context truncated due to token limit ...]\n\n" +
|
|
299
|
+
context[len(context)-keepFromEnd:]
|
|
300
|
+
|
|
301
|
+
cr.obs.Debug("overflow", "Truncated context: %d -> %d chars", len(context), len(truncated))
|
|
302
|
+
return truncated, nil
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
// reduceByChunkedExtraction processes each chunk independently and returns all extracted content.
|
|
306
|
+
func (cr *contextReducer) reduceByChunkedExtraction(query string, chunks []string, modelLimit int, overhead int) (string, error) {
|
|
307
|
+
cr.obs.Debug("overflow", "Using chunked extraction strategy with %d chunks", len(chunks))
|
|
308
|
+
|
|
309
|
+
results := make([]string, len(chunks))
|
|
310
|
+
errs := make([]error, len(chunks))
|
|
311
|
+
var wg sync.WaitGroup
|
|
312
|
+
|
|
313
|
+
for i, chunk := range chunks {
|
|
314
|
+
wg.Add(1)
|
|
315
|
+
go func(idx int, chunkText string) {
|
|
316
|
+
defer wg.Done()
|
|
317
|
+
|
|
318
|
+
extractPrompt := fmt.Sprintf(
|
|
319
|
+
"Extract all information relevant to the following question from this text chunk. "+
|
|
320
|
+
"Include specific data, facts, quotes, and details. If nothing relevant is found, respond with 'NO_RELEVANT_CONTENT'.\n\n"+
|
|
321
|
+
"Question: %s\n\nText chunk (%d of %d):\n%s",
|
|
322
|
+
query, idx+1, len(chunks), chunkText,
|
|
323
|
+
)
|
|
324
|
+
|
|
325
|
+
messages := []Message{
|
|
326
|
+
{Role: "system", Content: "You are a precise information extraction assistant. Extract only relevant information."},
|
|
327
|
+
{Role: "user", Content: extractPrompt},
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
result, err := CallChatCompletion(ChatRequest{
|
|
331
|
+
Model: cr.rlm.model,
|
|
332
|
+
Messages: messages,
|
|
333
|
+
APIBase: cr.rlm.apiBase,
|
|
334
|
+
APIKey: cr.rlm.apiKey,
|
|
335
|
+
Timeout: cr.rlm.timeoutSeconds,
|
|
336
|
+
ExtraParams: cr.rlm.extraParams,
|
|
337
|
+
})
|
|
338
|
+
if err != nil {
|
|
339
|
+
errs[idx] = fmt.Errorf("chunked extraction chunk %d: %w", idx+1, err)
|
|
340
|
+
return
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
cr.rlm.stats.LlmCalls++
|
|
344
|
+
if strings.TrimSpace(result) != "NO_RELEVANT_CONTENT" {
|
|
345
|
+
results[idx] = result
|
|
346
|
+
}
|
|
347
|
+
}(i, chunk)
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
wg.Wait()
|
|
351
|
+
|
|
352
|
+
var extractErrors []string
|
|
353
|
+
for _, err := range errs {
|
|
354
|
+
if err != nil {
|
|
355
|
+
extractErrors = append(extractErrors, err.Error())
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
if len(extractErrors) > 0 {
|
|
359
|
+
return "", fmt.Errorf("chunked extraction failed: %s", strings.Join(extractErrors, "; "))
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
// Combine non-empty results
|
|
363
|
+
var parts []string
|
|
364
|
+
for _, r := range results {
|
|
365
|
+
if r != "" {
|
|
366
|
+
parts = append(parts, r)
|
|
367
|
+
}
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
if len(parts) == 0 {
|
|
371
|
+
return "No relevant content found across all chunks.", nil
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
return strings.Join(parts, "\n\n---\n\n"), nil
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
// ─── Refine Sequential Strategy ─────────────────────────────────────────────
|
|
378
|
+
|
|
379
|
+
// reduceByRefine processes chunks sequentially, where the first chunk generates
|
|
380
|
+
// an initial answer and each subsequent chunk refines it. This approach has the
|
|
381
|
+
// highest information fidelity because every chunk is processed in context of
|
|
382
|
+
// the cumulative answer, but is sequential (not parallelizable).
|
|
383
|
+
func (cr *contextReducer) reduceByRefine(query string, chunks []string, modelLimit int, overhead int) (string, error) {
|
|
384
|
+
cr.obs.Debug("overflow", "Using refine strategy with %d chunks", len(chunks))
|
|
385
|
+
|
|
386
|
+
if len(chunks) == 0 {
|
|
387
|
+
return "", fmt.Errorf("refine strategy: no chunks to process")
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
// Phase 1: Generate initial answer from the first chunk
|
|
391
|
+
initialPrompt := fmt.Sprintf(
|
|
392
|
+
"Using the following context, provide a comprehensive answer to the question.\n"+
|
|
393
|
+
"Preserve all key facts, data points, names, numbers, and specific details.\n\n"+
|
|
394
|
+
"Question: %s\n\nContext:\n%s",
|
|
395
|
+
query, chunks[0],
|
|
396
|
+
)
|
|
397
|
+
|
|
398
|
+
messages := []Message{
|
|
399
|
+
{Role: "system", Content: "You are a precise information synthesis assistant. Preserve all factual details and specific data points."},
|
|
400
|
+
{Role: "user", Content: initialPrompt},
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
currentAnswer, err := CallChatCompletion(ChatRequest{
|
|
404
|
+
Model: cr.rlm.model,
|
|
405
|
+
Messages: messages,
|
|
406
|
+
APIBase: cr.rlm.apiBase,
|
|
407
|
+
APIKey: cr.rlm.apiKey,
|
|
408
|
+
Timeout: cr.rlm.timeoutSeconds,
|
|
409
|
+
ExtraParams: cr.rlm.extraParams,
|
|
410
|
+
})
|
|
411
|
+
if err != nil {
|
|
412
|
+
return "", fmt.Errorf("refine initial chunk: %w", err)
|
|
413
|
+
}
|
|
414
|
+
cr.rlm.stats.LlmCalls++
|
|
415
|
+
cr.obs.Debug("overflow", "Refine: initial answer from chunk 1/%d (%d chars)", len(chunks), len(currentAnswer))
|
|
416
|
+
|
|
417
|
+
// Phase 2: Refine the answer with each subsequent chunk
|
|
418
|
+
for i := 1; i < len(chunks); i++ {
|
|
419
|
+
refinePrompt := fmt.Sprintf(
|
|
420
|
+
"You have an existing answer to the question: %q\n\n"+
|
|
421
|
+
"Existing answer:\n%s\n\n"+
|
|
422
|
+
"Now you have additional context that may contain new information, corrections, or supporting details.\n"+
|
|
423
|
+
"Refine the existing answer by incorporating any relevant new information from this context.\n"+
|
|
424
|
+
"If this context adds nothing new, return the existing answer unchanged.\n"+
|
|
425
|
+
"IMPORTANT: Never remove information from the existing answer unless it is contradicted by the new context.\n\n"+
|
|
426
|
+
"Additional context (chunk %d of %d):\n%s",
|
|
427
|
+
query, currentAnswer, i+1, len(chunks), chunks[i],
|
|
428
|
+
)
|
|
429
|
+
|
|
430
|
+
messages := []Message{
|
|
431
|
+
{Role: "system", Content: "You are a precise information synthesis assistant. Refine answers by incorporating new context without losing existing information."},
|
|
432
|
+
{Role: "user", Content: refinePrompt},
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
refined, err := CallChatCompletion(ChatRequest{
|
|
436
|
+
Model: cr.rlm.model,
|
|
437
|
+
Messages: messages,
|
|
438
|
+
APIBase: cr.rlm.apiBase,
|
|
439
|
+
APIKey: cr.rlm.apiKey,
|
|
440
|
+
Timeout: cr.rlm.timeoutSeconds,
|
|
441
|
+
ExtraParams: cr.rlm.extraParams,
|
|
442
|
+
})
|
|
443
|
+
if err != nil {
|
|
444
|
+
cr.obs.Debug("overflow", "Refine: chunk %d/%d failed: %v, keeping current answer", i+1, len(chunks), err)
|
|
445
|
+
// On error, keep current answer rather than failing entirely
|
|
446
|
+
continue
|
|
447
|
+
}
|
|
448
|
+
cr.rlm.stats.LlmCalls++
|
|
449
|
+
currentAnswer = refined
|
|
450
|
+
cr.obs.Debug("overflow", "Refine: incorporated chunk %d/%d (%d chars)", i+1, len(chunks), len(currentAnswer))
|
|
451
|
+
}
|
|
452
|
+
|
|
453
|
+
// Verify the refined answer fits within budget
|
|
454
|
+
if EstimateTokens(currentAnswer)+overhead < modelLimit {
|
|
455
|
+
cr.obs.Debug("overflow", "Refine complete: answer is %d estimated tokens", EstimateTokens(currentAnswer))
|
|
456
|
+
return currentAnswer, nil
|
|
457
|
+
}
|
|
458
|
+
|
|
459
|
+
// If the refined answer is still too large, truncate it
|
|
460
|
+
cr.obs.Debug("overflow", "Refine answer too large (%d tokens), truncating", EstimateTokens(currentAnswer))
|
|
461
|
+
return cr.reduceByTruncation(currentAnswer, modelLimit, overhead)
|
|
462
|
+
}
|
|
463
|
+
|
|
464
|
+
// ─── TF-IDF Strategy (wrapper for contextReducer) ───────────────────────────
|
|
465
|
+
|
|
466
|
+
// reduceByTFIDF uses TF-IDF extractive compression - pure algorithmic, no API calls.
|
|
467
|
+
func (cr *contextReducer) reduceByTFIDF(context string, modelLimit int, overhead int) (string, error) {
|
|
468
|
+
cr.obs.Debug("overflow", "Using TF-IDF extractive strategy")
|
|
469
|
+
|
|
470
|
+
availableTokens := modelLimit - overhead
|
|
471
|
+
if availableTokens <= 0 {
|
|
472
|
+
availableTokens = modelLimit / 2
|
|
473
|
+
}
|
|
474
|
+
|
|
475
|
+
result := CompressContextTFIDF(context, availableTokens)
|
|
476
|
+
cr.obs.Debug("overflow", "TF-IDF compressed: %d -> %d chars (%d -> %d est. tokens)",
|
|
477
|
+
len(context), len(result), EstimateTokens(context), EstimateTokens(result))
|
|
478
|
+
return result, nil
|
|
479
|
+
}
|
|
480
|
+
|
|
481
|
+
// ─── TextRank Strategy (wrapper for contextReducer) ─────────────────────────
|
|
482
|
+
|
|
483
|
+
// reduceByTextRank uses TextRank graph-based ranking - pure algorithmic, no API calls.
|
|
484
|
+
func (cr *contextReducer) reduceByTextRank(context string, modelLimit int, overhead int) (string, error) {
|
|
485
|
+
cr.obs.Debug("overflow", "Using TextRank graph-based strategy")
|
|
486
|
+
|
|
487
|
+
availableTokens := modelLimit - overhead
|
|
488
|
+
if availableTokens <= 0 {
|
|
489
|
+
availableTokens = modelLimit / 2
|
|
490
|
+
}
|
|
491
|
+
|
|
492
|
+
result := CompressContextTextRank(context, availableTokens)
|
|
493
|
+
cr.obs.Debug("overflow", "TextRank compressed: %d -> %d chars (%d -> %d est. tokens)",
|
|
494
|
+
len(context), len(result), EstimateTokens(context), EstimateTokens(result))
|
|
495
|
+
return result, nil
|
|
496
|
+
}
|
|
497
|
+
|
|
498
|
+
// ─── Adaptive Completion with Overflow Recovery ──────────────────────────────
|
|
499
|
+
|
|
500
|
+
// completionWithOverflowRecovery wraps a completion call with automatic overflow detection and retry.
|
|
501
|
+
// When a context overflow error is detected, it reduces the context and retries.
|
|
502
|
+
func (r *RLM) completionWithOverflowRecovery(query string, context string, overflowConfig ContextOverflowConfig) (string, RLMStats, error) {
|
|
503
|
+
obs := r.observer
|
|
504
|
+
if obs == nil {
|
|
505
|
+
obs = NewNoopObserver()
|
|
506
|
+
}
|
|
507
|
+
|
|
508
|
+
// Try the normal completion first
|
|
509
|
+
result, stats, err := r.Completion(query, context)
|
|
510
|
+
if err == nil {
|
|
511
|
+
return result, stats, nil
|
|
512
|
+
}
|
|
513
|
+
|
|
514
|
+
// Check if it's a context overflow error
|
|
515
|
+
coe, isOverflow := IsContextOverflow(err)
|
|
516
|
+
if !isOverflow {
|
|
517
|
+
return "", stats, err // Not an overflow error, return original error
|
|
518
|
+
}
|
|
519
|
+
|
|
520
|
+
obs.Debug("overflow", "Context overflow detected: model limit %d, request %d tokens (%.1f%% over)",
|
|
521
|
+
coe.ModelLimit, coe.RequestTokens, (coe.OverflowRatio()-1)*100)
|
|
522
|
+
|
|
523
|
+
// Use detected limit or configured limit
|
|
524
|
+
modelLimit := coe.ModelLimit
|
|
525
|
+
if overflowConfig.MaxModelTokens > 0 {
|
|
526
|
+
modelLimit = overflowConfig.MaxModelTokens
|
|
527
|
+
}
|
|
528
|
+
|
|
529
|
+
reducer := newContextReducer(r, overflowConfig, obs)
|
|
530
|
+
|
|
531
|
+
// Attempt context reduction and retry
|
|
532
|
+
for attempt := 0; attempt < overflowConfig.MaxReductionAttempts; attempt++ {
|
|
533
|
+
obs.Debug("overflow", "Reduction attempt %d/%d", attempt+1, overflowConfig.MaxReductionAttempts)
|
|
534
|
+
|
|
535
|
+
reducedContext, reduceErr := reducer.ReduceForCompletion(query, context, modelLimit)
|
|
536
|
+
if reduceErr != nil {
|
|
537
|
+
obs.Error("overflow", "Context reduction failed: %v", reduceErr)
|
|
538
|
+
return "", stats, fmt.Errorf("context overflow recovery failed: %w", reduceErr)
|
|
539
|
+
}
|
|
540
|
+
|
|
541
|
+
obs.Debug("overflow", "Context reduced: %d -> %d chars", len(context), len(reducedContext))
|
|
542
|
+
|
|
543
|
+
// Retry with reduced context
|
|
544
|
+
result, stats, err = r.Completion(query, reducedContext)
|
|
545
|
+
if err == nil {
|
|
546
|
+
obs.Event("overflow.recovery_success", map[string]string{
|
|
547
|
+
"attempt": fmt.Sprintf("%d", attempt+1),
|
|
548
|
+
"original_chars": fmt.Sprintf("%d", len(context)),
|
|
549
|
+
"reduced_chars": fmt.Sprintf("%d", len(reducedContext)),
|
|
550
|
+
"reduction_ratio": fmt.Sprintf("%.2f", float64(len(reducedContext))/float64(len(context))),
|
|
551
|
+
})
|
|
552
|
+
return result, stats, nil
|
|
553
|
+
}
|
|
554
|
+
|
|
555
|
+
// If it overflows again, use the reduced context for the next attempt
|
|
556
|
+
if _, stillOverflow := IsContextOverflow(err); stillOverflow {
|
|
557
|
+
context = reducedContext
|
|
558
|
+
continue
|
|
559
|
+
}
|
|
560
|
+
|
|
561
|
+
// Different error, return it
|
|
562
|
+
return "", stats, err
|
|
563
|
+
}
|
|
564
|
+
|
|
565
|
+
return "", stats, fmt.Errorf("context overflow: exceeded %d reduction attempts, model limit is %d tokens", overflowConfig.MaxReductionAttempts, modelLimit)
|
|
566
|
+
}
|