recursive-llm-ts 4.9.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,424 @@
1
+ package rlm
2
+
3
import (
	"encoding/json"
	"fmt"
	"path/filepath"
	"sort"
	"strings"
)
9
+
10
+ // ─── LCM Large File Handling ────────────────────────────────────────────────
11
+ // Implements the file handling strategy from the LCM paper (Section 2.2):
12
+ // - Files below token threshold: included in context normally
13
+ // - Files above threshold: stored externally with Exploration Summary
14
+ // - Type-aware dispatching for summary generation
15
+
16
// LCMFileConfig holds the settings that govern large file handling.
type LCMFileConfig struct {
	// TokenThreshold is the token count a file must exceed before it is
	// stored externally instead of being included in context.
	// Default: 25000 (25k tokens, matching the LCM paper).
	TokenThreshold int `json:"token_threshold,omitempty"`
}
22
+
23
+ // DefaultLCMFileConfig returns default file handling configuration.
24
+ func DefaultLCMFileConfig() LCMFileConfig {
25
+ return LCMFileConfig{
26
+ TokenThreshold: 25000,
27
+ }
28
+ }
29
+
30
// LCMFileRef is an opaque handle for a file that is kept outside the active
// context. ProcessFile creates one for every file above the token threshold.
type LCMFileRef struct {
	// ID is the handler-assigned identifier ("file_<n>").
	ID string `json:"id"`
	// Path is the file path as given to ProcessFile.
	Path string `json:"path"`
	// MIMEType is derived from the path's extension.
	MIMEType string `json:"mime_type"`
	// Tokens is the estimated token count of the file content.
	Tokens int `json:"tokens"`
	// ExplorationSummary is the generated description that stands in for the
	// file content inside the context.
	ExplorationSummary string `json:"exploration_summary"`
}
38
+
39
+ // LCMFileHandler manages large file references and exploration summaries.
40
+ type LCMFileHandler struct {
41
+ config LCMFileConfig
42
+ files map[string]*LCMFileRef // File refs by ID
43
+ nextID int
44
+ observer *Observer
45
+
46
+ // LLM config for generating exploration summaries
47
+ model string
48
+ apiBase string
49
+ apiKey string
50
+ timeout int
51
+ extraParams map[string]interface{}
52
+ }
53
+
54
+ // NewLCMFileHandler creates a new file handler.
55
+ func NewLCMFileHandler(config LCMFileConfig, model, apiBase, apiKey string, timeout int, extraParams map[string]interface{}, observer *Observer) *LCMFileHandler {
56
+ if config.TokenThreshold == 0 {
57
+ config.TokenThreshold = 25000
58
+ }
59
+ return &LCMFileHandler{
60
+ config: config,
61
+ files: make(map[string]*LCMFileRef),
62
+ observer: observer,
63
+ model: model,
64
+ apiBase: apiBase,
65
+ apiKey: apiKey,
66
+ timeout: timeout,
67
+ extraParams: extraParams,
68
+ }
69
+ }
70
+
71
+ // ProcessFile checks if a file should be included in context or stored externally.
72
+ // Returns (contextContent, fileRef, error).
73
+ // If the file is small enough, contextContent is the file content and fileRef is nil.
74
+ // If the file is too large, contextContent is the exploration summary reference and fileRef is set.
75
+ func (h *LCMFileHandler) ProcessFile(path string, content string) (string, *LCMFileRef, error) {
76
+ tokens := EstimateTokens(content)
77
+
78
+ if tokens <= h.config.TokenThreshold {
79
+ // Small file: include in context normally
80
+ return content, nil, nil
81
+ }
82
+
83
+ // Large file: generate exploration summary
84
+ h.observer.Debug("lcm.files", "File %s (%d tokens) exceeds threshold (%d), generating exploration summary",
85
+ path, tokens, h.config.TokenThreshold)
86
+
87
+ summary, err := h.generateExplorationSummary(path, content)
88
+ if err != nil {
89
+ return "", nil, fmt.Errorf("failed to generate exploration summary for %s: %w", path, err)
90
+ }
91
+
92
+ h.nextID++
93
+ ref := &LCMFileRef{
94
+ ID: fmt.Sprintf("file_%d", h.nextID),
95
+ Path: path,
96
+ MIMEType: detectMIMEType(path),
97
+ Tokens: tokens,
98
+ ExplorationSummary: summary,
99
+ }
100
+ h.files[ref.ID] = ref
101
+
102
+ // Return a compact reference for the active context
103
+ contextRef := fmt.Sprintf("[File %s: %s (%d tokens)]\n%s", ref.ID, path, tokens, summary)
104
+ return contextRef, ref, nil
105
+ }
106
+
107
+ // GetFileRef retrieves a file reference by ID.
108
+ func (h *LCMFileHandler) GetFileRef(id string) (*LCMFileRef, bool) {
109
+ ref, ok := h.files[id]
110
+ return ref, ok
111
+ }
112
+
113
+ // ─── Type-Aware Exploration Summary Generation ──────────────────────────────
114
+
115
+ func (h *LCMFileHandler) generateExplorationSummary(path string, content string) (string, error) {
116
+ ext := strings.ToLower(filepath.Ext(path))
117
+ fileType := classifyFileType(ext)
118
+
119
+ switch fileType {
120
+ case fileTypeStructuredData:
121
+ return h.summarizeStructuredData(path, content, ext)
122
+ case fileTypeCode:
123
+ return h.summarizeCode(path, content, ext)
124
+ default:
125
+ return h.summarizeText(path, content)
126
+ }
127
+ }
128
+
129
// fileType is the coarse category used to choose a summarization strategy.
type fileType int

const (
	// fileTypeText is the catch-all category for unrecognized extensions.
	fileTypeText fileType = iota
	// fileTypeCode marks source code files.
	fileTypeCode
	// fileTypeStructuredData marks data, database and configuration formats.
	fileTypeStructuredData
)

// classifyFileType maps a file extension (including the leading dot) to its
// category. Matching is exact and case-sensitive, so callers that want
// case-insensitive behavior must lower-case the extension first.
func classifyFileType(ext string) fileType {
	kind := fileTypeText

	switch ext {
	// Data and database formats.
	case ".json", ".jsonl", ".csv", ".tsv", ".parquet", ".avro",
		".sql", ".sqlite", ".db",
		// Markup and configuration formats.
		".xml", ".yaml", ".yml", ".toml":
		kind = fileTypeStructuredData

	// Source code.
	case ".go", ".rs", ".c", ".cpp", ".h", ".hpp", ".cs",
		".ts", ".tsx", ".js", ".jsx",
		".py", ".rb", ".php", ".lua", ".r", ".R",
		".java", ".kt", ".scala", ".swift",
		".sh", ".bash", ".zsh":
		kind = fileTypeCode
	}

	return kind
}
150
+
151
+ // summarizeStructuredData extracts schema and shape for JSON, CSV, SQL, etc.
152
+ func (h *LCMFileHandler) summarizeStructuredData(path string, content string, ext string) (string, error) {
153
+ // For structured data, try to extract schema deterministically first
154
+ switch ext {
155
+ case ".json":
156
+ return h.summarizeJSON(content), nil
157
+ case ".jsonl":
158
+ return h.summarizeJSONL(content), nil
159
+ case ".csv", ".tsv":
160
+ return h.summarizeCSV(content, ext), nil
161
+ default:
162
+ // Fall back to LLM summary for other structured formats
163
+ return h.llmSummarize(path, content, "structured data")
164
+ }
165
+ }
166
+
167
+ func (h *LCMFileHandler) summarizeJSON(content string) string {
168
+ var parsed interface{}
169
+ if err := json.Unmarshal([]byte(content), &parsed); err != nil {
170
+ return fmt.Sprintf("JSON file (parse error: %s)", err)
171
+ }
172
+
173
+ var sb strings.Builder
174
+ sb.WriteString("JSON file analysis:\n")
175
+ describeJSONShape(&sb, parsed, "", 0, 3)
176
+ return sb.String()
177
+ }
178
+
179
// describeJSONShape appends a textual outline of the decoded JSON value v to sb.
//
// Each emitted line starts with prefix. Objects list their key count and up
// to ten key names, arrays list their length and describe only their first
// element, and any other value is reported by its Go type (%T). Once depth
// exceeds maxDepth, a single "..." line is written and recursion stops.
// Object members are only expanded while depth < maxDepth-1.
//
// Object keys are sorted before being listed or expanded. Go randomizes map
// iteration order, so without sorting the same document would yield different
// summaries (and a different ten-key sample) from one call to the next.
func describeJSONShape(sb *strings.Builder, v interface{}, prefix string, depth int, maxDepth int) {
	if depth > maxDepth {
		sb.WriteString(prefix + "...\n")
		return
	}

	switch val := v.(type) {
	case map[string]interface{}:
		keys := make([]string, 0, len(val))
		for k := range val {
			keys = append(keys, k)
		}
		sort.Strings(keys)

		fmt.Fprintf(sb, "%sObject with %d keys: ", prefix, len(val))
		if len(keys) > 10 {
			fmt.Fprintf(sb, "%s... (%d more)\n", strings.Join(keys[:10], ", "), len(keys)-10)
		} else {
			sb.WriteString(strings.Join(keys, ", ") + "\n")
		}

		// Members are expanded one level earlier than the generic depth cut-off.
		if depth >= maxDepth-1 {
			return
		}
		for _, k := range keys {
			describeJSONShape(sb, val[k], prefix+" "+k+": ", depth+1, maxDepth)
		}

	case []interface{}:
		fmt.Fprintf(sb, "%sArray with %d items", prefix, len(val))
		if len(val) == 0 {
			sb.WriteString(" (empty)\n")
			return
		}
		// The first element is used as the representative of the whole array.
		fmt.Fprintf(sb, " (first item type: %T)\n", val[0])
		describeJSONShape(sb, val[0], prefix+" [0]: ", depth+1, maxDepth)

	default:
		fmt.Fprintf(sb, "%s%T\n", prefix, v)
	}
}
214
+
215
+ func (h *LCMFileHandler) summarizeJSONL(content string) string {
216
+ lines := strings.Split(strings.TrimSpace(content), "\n")
217
+ var sb strings.Builder
218
+ sb.WriteString(fmt.Sprintf("JSONL file: %d lines\n", len(lines)))
219
+
220
+ // Analyze first line for schema
221
+ if len(lines) > 0 {
222
+ var first interface{}
223
+ if err := json.Unmarshal([]byte(lines[0]), &first); err == nil {
224
+ sb.WriteString("Schema (from first line):\n")
225
+ describeJSONShape(&sb, first, " ", 0, 2)
226
+ }
227
+ }
228
+
229
+ return sb.String()
230
+ }
231
+
232
+ func (h *LCMFileHandler) summarizeCSV(content string, ext string) string {
233
+ lines := strings.Split(strings.TrimSpace(content), "\n")
234
+ var sb strings.Builder
235
+
236
+ delimiter := ","
237
+ if ext == ".tsv" {
238
+ delimiter = "\t"
239
+ }
240
+
241
+ sb.WriteString(fmt.Sprintf("CSV file: %d rows\n", len(lines)))
242
+ if len(lines) > 0 {
243
+ headers := strings.Split(lines[0], delimiter)
244
+ sb.WriteString(fmt.Sprintf("Columns (%d): %s\n", len(headers), strings.Join(headers, ", ")))
245
+ }
246
+ if len(lines) > 1 {
247
+ sb.WriteString(fmt.Sprintf("Sample row: %s\n", lines[1]))
248
+ }
249
+
250
+ return sb.String()
251
+ }
252
+
253
+ // summarizeCode extracts structural analysis for code files.
254
+ func (h *LCMFileHandler) summarizeCode(path string, content string, ext string) (string, error) {
255
+ // Deterministic structural analysis
256
+ summary := extractCodeStructure(content, ext)
257
+ if summary != "" {
258
+ return summary, nil
259
+ }
260
+ // Fall back to LLM
261
+ return h.llmSummarize(path, content, "source code")
262
+ }
263
+
264
+ // extractCodeStructure does basic structural analysis without LLM.
265
+ func extractCodeStructure(content string, ext string) string {
266
+ lines := strings.Split(content, "\n")
267
+ var sb strings.Builder
268
+ sb.WriteString(fmt.Sprintf("Code file: %d lines\n", len(lines)))
269
+
270
+ // Extract function/class/struct definitions
271
+ var defs []string
272
+ for _, line := range lines {
273
+ trimmed := strings.TrimSpace(line)
274
+ if isDefinitionLine(trimmed, ext) {
275
+ defs = append(defs, trimmed)
276
+ }
277
+ }
278
+
279
+ if len(defs) > 0 {
280
+ sb.WriteString(fmt.Sprintf("Definitions (%d):\n", len(defs)))
281
+ for _, d := range defs {
282
+ if len(d) > 120 {
283
+ d = d[:120] + "..."
284
+ }
285
+ sb.WriteString(" " + d + "\n")
286
+ }
287
+ }
288
+
289
+ // Extract imports
290
+ var imports []string
291
+ for _, line := range lines {
292
+ trimmed := strings.TrimSpace(line)
293
+ if isImportLine(trimmed, ext) {
294
+ imports = append(imports, trimmed)
295
+ }
296
+ }
297
+ if len(imports) > 0 {
298
+ sb.WriteString(fmt.Sprintf("Imports (%d):\n", len(imports)))
299
+ max := 20
300
+ if len(imports) < max {
301
+ max = len(imports)
302
+ }
303
+ for _, imp := range imports[:max] {
304
+ sb.WriteString(" " + imp + "\n")
305
+ }
306
+ if len(imports) > max {
307
+ sb.WriteString(fmt.Sprintf(" ... and %d more\n", len(imports)-max))
308
+ }
309
+ }
310
+
311
+ return sb.String()
312
+ }
313
+
314
+ func isDefinitionLine(line string, ext string) bool {
315
+ switch ext {
316
+ case ".go":
317
+ return strings.HasPrefix(line, "func ") || strings.HasPrefix(line, "type ") ||
318
+ strings.HasPrefix(line, "var ") || strings.HasPrefix(line, "const ")
319
+ case ".py":
320
+ return strings.HasPrefix(line, "def ") || strings.HasPrefix(line, "class ") ||
321
+ strings.HasPrefix(line, "async def ")
322
+ case ".ts", ".tsx", ".js", ".jsx":
323
+ return strings.HasPrefix(line, "function ") || strings.HasPrefix(line, "class ") ||
324
+ strings.HasPrefix(line, "export ") || strings.HasPrefix(line, "interface ") ||
325
+ strings.HasPrefix(line, "type ") || strings.HasPrefix(line, "const ")
326
+ case ".rs":
327
+ return strings.HasPrefix(line, "fn ") || strings.HasPrefix(line, "pub fn ") ||
328
+ strings.HasPrefix(line, "struct ") || strings.HasPrefix(line, "enum ") ||
329
+ strings.HasPrefix(line, "impl ") || strings.HasPrefix(line, "trait ")
330
+ case ".java", ".kt", ".scala":
331
+ return strings.HasPrefix(line, "public ") || strings.HasPrefix(line, "private ") ||
332
+ strings.HasPrefix(line, "class ") || strings.HasPrefix(line, "interface ")
333
+ default:
334
+ return strings.HasPrefix(line, "function ") || strings.HasPrefix(line, "class ") ||
335
+ strings.HasPrefix(line, "def ")
336
+ }
337
+ }
338
+
339
// isImportLine reports whether line looks like an import statement in the
// language identified by ext. line is expected to be trimmed of leading and
// trailing whitespace. For Go, a line that both starts and ends with a double
// quote is also accepted, which covers entries inside an import ( ... ) group.
func isImportLine(line string, ext string) bool {
	startsWith := func(prefix string) bool {
		return strings.HasPrefix(line, prefix)
	}

	if ext == ".rs" {
		return startsWith("use ")
	}

	// Every other language accepts a leading "import ".
	if startsWith("import ") {
		return true
	}

	switch ext {
	case ".go":
		return startsWith("\"") && strings.HasSuffix(line, "\"")
	case ".py":
		return startsWith("from ")
	case ".ts", ".tsx", ".js", ".jsx":
		return startsWith("require(")
	}
	return false
}
355
+
356
+ // summarizeText generates an LLM summary for unstructured text.
357
+ func (h *LCMFileHandler) summarizeText(path string, content string) (string, error) {
358
+ return h.llmSummarize(path, content, "text document")
359
+ }
360
+
361
+ func (h *LCMFileHandler) llmSummarize(path string, content string, fileType string) (string, error) {
362
+ // Truncate content for the summary prompt (we don't need the whole file)
363
+ maxChars := 10000
364
+ truncated := content
365
+ if len(content) > maxChars {
366
+ truncated = content[:maxChars/2] + "\n...\n" + content[len(content)-maxChars/2:]
367
+ }
368
+
369
+ prompt := fmt.Sprintf(`Generate a concise exploration summary for this %s file (%s).
370
+ Include: structure, key entities, purpose, and notable patterns.
371
+ Keep it under 200 words.
372
+
373
+ Content:
374
+ %s`, fileType, path, truncated)
375
+
376
+ request := ChatRequest{
377
+ Model: h.model,
378
+ Messages: []Message{
379
+ {Role: "user", Content: prompt},
380
+ },
381
+ APIBase: h.apiBase,
382
+ APIKey: h.apiKey,
383
+ Timeout: h.timeout,
384
+ ExtraParams: h.extraParams,
385
+ }
386
+
387
+ result, err := CallChatCompletion(request)
388
+ if err != nil {
389
+ // Fall back to deterministic summary on LLM failure
390
+ lines := strings.Split(content, "\n")
391
+ return fmt.Sprintf("%s file: %d lines, %d tokens", fileType, len(lines), EstimateTokens(content)), nil
392
+ }
393
+
394
+ return result.Content, nil
395
+ }
396
+
397
// detectMIMEType maps the path's extension (compared case-insensitively) to a
// MIME type. Extensions that are not in the table, including a missing
// extension, yield "application/octet-stream".
func detectMIMEType(path string) string {
	switch strings.ToLower(filepath.Ext(path)) {
	case ".json":
		return "application/json"
	case ".jsonl":
		return "application/x-jsonlines"
	case ".csv":
		return "text/csv"
	case ".tsv":
		return "text/tab-separated-values"
	case ".sql":
		return "application/sql"
	case ".xml":
		return "application/xml"
	case ".yaml", ".yml":
		return "application/yaml"
	case ".go":
		return "text/x-go"
	case ".ts":
		return "text/typescript"
	case ".js":
		return "text/javascript"
	case ".py":
		return "text/x-python"
	case ".rs":
		return "text/x-rust"
	case ".java":
		return "text/x-java"
	case ".md":
		return "text/markdown"
	case ".txt":
		return "text/plain"
	case ".html":
		return "text/html"
	case ".css":
		return "text/css"
	}
	return "application/octet-stream"
}