byterover-cli 1.7.2 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. package/README.md +17 -3
  2. package/dist/agent/core/domain/tools/constants.d.ts +0 -15
  3. package/dist/agent/core/domain/tools/constants.js +0 -15
  4. package/dist/agent/core/interfaces/i-cipher-agent.d.ts +6 -0
  5. package/dist/agent/core/interfaces/i-curate-service.d.ts +12 -0
  6. package/dist/agent/infra/llm/internal-llm-service.d.ts +13 -0
  7. package/dist/agent/infra/llm/internal-llm-service.js +61 -21
  8. package/dist/agent/infra/tools/implementations/curate-tool.d.ts +133 -0
  9. package/dist/agent/infra/tools/implementations/curate-tool.js +14 -0
  10. package/dist/agent/infra/tools/implementations/search-knowledge-service.js +91 -14
  11. package/dist/agent/infra/tools/index.d.ts +0 -4
  12. package/dist/agent/infra/tools/index.js +0 -4
  13. package/dist/agent/infra/tools/tool-registry.js +0 -113
  14. package/dist/agent/resources/prompts/curate-detail-preservation.yml +73 -0
  15. package/dist/agent/resources/prompts/system-prompt.yml +69 -3
  16. package/dist/server/core/domain/knowledge/markdown-writer.d.ts +13 -0
  17. package/dist/server/core/domain/knowledge/markdown-writer.js +116 -8
  18. package/dist/server/infra/executor/curate-executor.js +1 -1
  19. package/dist/server/infra/executor/direct-search-responder.d.ts +45 -0
  20. package/dist/server/infra/executor/direct-search-responder.js +86 -0
  21. package/dist/server/infra/executor/folder-pack-executor.d.ts +13 -5
  22. package/dist/server/infra/executor/folder-pack-executor.js +739 -39
  23. package/dist/server/infra/executor/query-executor.d.ts +49 -3
  24. package/dist/server/infra/executor/query-executor.js +194 -9
  25. package/dist/server/infra/executor/query-result-cache.d.ts +87 -0
  26. package/dist/server/infra/executor/query-result-cache.js +127 -0
  27. package/dist/server/infra/executor/query-similarity.d.ts +28 -0
  28. package/dist/server/infra/executor/query-similarity.js +41 -0
  29. package/dist/server/infra/process/agent-worker.js +9 -2
  30. package/dist/server/infra/process/inline-agent-executor.js +16 -5
  31. package/dist/server/infra/usecase/curate-use-case.js +6 -1
  32. package/dist/server/infra/usecase/query-use-case.js +10 -0
  33. package/dist/server/utils/file-validator.js +78 -1
  34. package/dist/tui/hooks/use-slash-completion.js +25 -4
  35. package/oclif.manifest.json +1 -1
  36. package/package.json +1 -1
  37. package/dist/agent/infra/tools/implementations/bash-exec-tool.d.ts +0 -13
  38. package/dist/agent/infra/tools/implementations/bash-exec-tool.js +0 -110
  39. package/dist/agent/infra/tools/implementations/bash-output-tool.d.ts +0 -12
  40. package/dist/agent/infra/tools/implementations/bash-output-tool.js +0 -43
  41. package/dist/agent/infra/tools/implementations/batch-tool.d.ts +0 -12
  42. package/dist/agent/infra/tools/implementations/batch-tool.js +0 -142
  43. package/dist/agent/infra/tools/implementations/create-knowledge-topic-tool.d.ts +0 -11
  44. package/dist/agent/infra/tools/implementations/create-knowledge-topic-tool.js +0 -149
  45. package/dist/agent/infra/tools/implementations/delete-memory-tool.d.ts +0 -12
  46. package/dist/agent/infra/tools/implementations/delete-memory-tool.js +0 -37
  47. package/dist/agent/infra/tools/implementations/edit-file-tool.d.ts +0 -13
  48. package/dist/agent/infra/tools/implementations/edit-file-tool.js +0 -50
  49. package/dist/agent/infra/tools/implementations/edit-memory-tool.d.ts +0 -13
  50. package/dist/agent/infra/tools/implementations/edit-memory-tool.js +0 -53
  51. package/dist/agent/infra/tools/implementations/kill-process-tool.d.ts +0 -12
  52. package/dist/agent/infra/tools/implementations/kill-process-tool.js +0 -55
  53. package/dist/agent/infra/tools/implementations/list-memories-tool.d.ts +0 -12
  54. package/dist/agent/infra/tools/implementations/list-memories-tool.js +0 -63
  55. package/dist/agent/infra/tools/implementations/read-memory-tool.d.ts +0 -12
  56. package/dist/agent/infra/tools/implementations/read-memory-tool.js +0 -39
  57. package/dist/agent/infra/tools/implementations/read-todos-tool.d.ts +0 -11
  58. package/dist/agent/infra/tools/implementations/read-todos-tool.js +0 -39
  59. package/dist/agent/infra/tools/implementations/search-history-tool.d.ts +0 -10
  60. package/dist/agent/infra/tools/implementations/search-history-tool.js +0 -36
  61. package/dist/agent/infra/tools/implementations/spec-analyze-tool.d.ts +0 -7
  62. package/dist/agent/infra/tools/implementations/spec-analyze-tool.js +0 -78
  63. package/dist/agent/infra/tools/implementations/write-memory-tool.d.ts +0 -13
  64. package/dist/agent/infra/tools/implementations/write-memory-tool.js +0 -52
  65. package/dist/agent/infra/tools/implementations/write-todos-tool.d.ts +0 -13
  66. package/dist/agent/infra/tools/implementations/write-todos-tool.js +0 -121
@@ -1,18 +1,20 @@
1
+ import fs from 'node:fs/promises';
1
2
  import path from 'node:path';
2
3
  /**
3
4
  * FolderPackExecutor - Executes folder pack + curate tasks with an injected CipherAgent.
4
5
  *
5
6
  * This executor:
6
7
  * 1. Packs the folder using FolderPackService
7
- * 2. Generates XML from the pack result
8
- * 3. Builds a prompt for the agent to analyze and curate the folder
9
- * 4. Executes with the agent
8
+ * 2. Stores packed data in sandbox environment as context variable
9
+ * 3. Guides agent to iteratively query and extract knowledge
10
+ * 4. Agent curates extracted pieces using tools.curate()
10
11
  *
11
12
  * Architecture:
12
13
  * - TaskProcessor injects the long-lived CipherAgent
13
14
  * - Event streaming is handled by agent-worker (subscribes to agentEventBus)
14
15
  * - Transport handles task lifecycle (task:started, task:completed, task:error)
15
16
  * - Executor focuses solely on folder pack + curate execution
17
+ * - Uses iterative extraction strategy (inspired by rlm) to avoid token limits
16
18
  */
17
19
  export class FolderPackExecutor {
18
20
  folderPackService;
@@ -30,53 +32,751 @@ export class FolderPackExecutor {
30
32
  extractPdfText: true,
31
33
  maxLinesPerFile: 5000, // Limit lines for large files
32
34
  });
33
- // Generate XML from pack result
34
- const xml = this.folderPackService.generateXml(packResult);
35
- // Build prompt for the agent
36
- const prompt = this.buildAnalysisPrompt(xml, content, packResult.fileCount, absoluteFolderPath);
37
- // Execute with curate commandType
38
- const response = await agent.execute(prompt, {
39
- executionContext: { commandType: 'curate' },
40
- taskId,
41
- });
42
- return response;
35
+ // Use iterative extraction strategy (inspired by rlm)
36
+ // Stores packed folder in sandbox environment and lets agent iteratively query/extract
37
+ // This avoids token limits entirely - works for folders of any size
38
+ return this.executeIterative(agent, packResult, content, absoluteFolderPath, taskId);
43
39
  }
44
40
  /**
45
- * Build the analysis prompt for the agent.
41
+ * Build iterative extraction prompt with file-based access.
42
+ * Folder data is stored in a temporary file to avoid token limits.
46
43
  */
47
- buildAnalysisPrompt(xml, context, fileCount, folderPath) {
48
- const contextSection = context?.trim() ? `\n## User Context\n${context}\n` : '';
49
- return `# Folder Analysis Task
44
+ buildIterativePromptWithFileAccess(userContext, folderPath, tmpFilePath, fileCount, totalLines) {
45
+ const contextSection = userContext?.trim() ? `\n## User Context\n${userContext}\n` : '';
46
+ return `# Iterative Folder Curation Task
50
47
 
51
- You are analyzing a packed folder containing ${fileCount} files from: ${folderPath}
48
+ You are curating knowledge from a folder: ${folderPath}
52
49
  ${contextSection}
53
- ## Packed Folder Content
50
+ ## Folder Overview
51
+
52
+ - **Total Files**: ${fileCount}
53
+ - **Total Lines**: ${totalLines}
54
+
55
+ ## Pre-loaded Data
56
+
57
+ **IMPORTANT**: Folder data is stored in a temporary file at \`${tmpFilePath}\` in **repomix-style XML format**.
58
+
59
+ **File Location**: The file is in the current working directory (accessible to code_exec sandbox).
60
+
61
+ **CRITICAL - Tool Usage**:
62
+ - ✅ **USE code_exec ONLY** - All file operations happen inside code_exec
63
+ - ✅ **tools.readFile() IS available** inside code_exec - Use it to read the XML file
64
+ - ✅ **tools.grep() IS available** inside code_exec - Use it to search the XML file
65
+ - ❌ **DO NOT use fs.readFileSync()** - require() is blocked in the sandbox
66
+ - ❌ **DO NOT use bash_exec** - use code_exec with tools.readFile/tools.grep instead
67
+
68
+ **Why tools.readFile() works inside code_exec:**
69
+ - The ToolsSDK is pre-injected into the code_exec sandbox as a global \`tools\` object
70
+ - You CAN call \`await tools.readFile()\` and \`await tools.grep()\` from inside code_exec
71
+ - You CANNOT use \`require('fs')\` because require() is blocked for security
72
+ - All async tools methods (readFile, grep, glob, curate) are available inside code_exec
73
+
74
+ Data structure:
75
+ \`\`\`xml
76
+ <?xml version="1.0" encoding="UTF-8"?>
77
+ <packed_folder>
78
+ <metadata>
79
+ <file_count>...</file_count>
80
+ <total_lines>...</total_lines>
81
+ ...
82
+ </metadata>
83
+ <directory_structure>
84
+ <![CDATA[
85
+ tree structure here
86
+ ]]>
87
+ </directory_structure>
88
+ <files>
89
+ <file path="src/index.ts" lines="100" size="2048" type="code">
90
+ file content here
91
+ </file>
92
+ <file path="package.json" lines="50" size="1024" type="config">
93
+ file content here
94
+ </file>
95
+ <!-- ... all ${fileCount} files -->
96
+ </files>
97
+ <summary>
98
+ <file_types>code: 10, config: 5, doc: 3</file_types>
99
+ </summary>
100
+ </packed_folder>
101
+ \`\`\`
102
+
103
+ ## Strategy
104
+
105
+ Use **code_exec with tools.readFile/tools.grep** to extract knowledge:
106
+
107
+ 1. **Read XML file**: Use \`await tools.readFile('${tmpFilePath}')\` inside code_exec to read the XML
108
+ - Example: \`const fileContent = await tools.readFile('${tmpFilePath}')\`
109
+ - Returns: \`{ content: string, lines: number, truncated: boolean }\`
110
+ 2. **Search XML file**: Use \`await tools.grep(pattern, options)\` inside code_exec to search
111
+ - Example: \`const matches = await tools.grep('<file[^>]*path=".*README.*">', { path: '${tmpFilePath}' })\`
112
+ - Returns: \`{ matches: [...], totalMatches: number }\`
113
+ 3. **Process data**: Parse and analyze the XML content using regex/string methods
114
+ - Extract metadata, file lists, specific files, etc.
115
+ 4. **Curate knowledge**: Use \`await tools.curate(operations)\` inside code_exec
116
+ - Example: \`await tools.curate([{ type: 'ADD', path: 'overview', data: { concept: '...' } }])\`
117
+ 5. **Process in batches**: Handle 5-10 files at a time to manage output size
118
+
119
+ **Important**: All tools.* methods are async - always use \`await\`!
120
+
121
+ ## Common Mistakes to Avoid
122
+
123
+ **❌ WRONG - Using require() inside code_exec:**
124
+ \`\`\`typescript
125
+ // This will FAIL - require() is blocked!
126
+ const result = await tools.code_exec({
127
+ code: \`
128
+ const fs = require('fs') // ❌ ERROR: require is not defined
129
+ const data = fs.readFileSync('file.xml', 'utf-8')
130
+ return data
131
+ \`
132
+ })
133
+ \`\`\`
134
+
135
+ **✅ CORRECT - Use tools.readFile() inside code_exec:**
136
+ \`\`\`typescript
137
+ // ✅ tools.readFile() IS available inside code_exec
138
+ const result = await tools.code_exec({
139
+ code: \`
140
+ // Read file using pre-injected tools SDK
141
+ const fileContent = await tools.readFile('${tmpFilePath}')
142
+ const xmlData = fileContent.content
143
+
144
+ // Process the data
145
+ const metadata = xmlData.match(/<metadata>[\\\\s\\\\S]*?<\\\\/metadata>/)
146
+ return metadata
147
+ \`
148
+ })
149
+ \`\`\`
150
+
151
+ ## Example: Reading the XML Data
152
+
153
+ **Note**: Use \`tools.readFile()\` inside code_exec to read files and process data.
154
+
155
+ \`\`\`typescript
156
+ // Everything happens inside ONE code_exec call
157
+ const overview = await tools.code_exec({
158
+ code: \`
159
+ // =========================================
160
+ // INSIDE code_exec sandbox:
161
+ // - tools.readFile() IS available
162
+ // - tools.grep() IS available
163
+ // - tools.curate() IS available
164
+ // - require() is NOT available
165
+ // =========================================
166
+
167
+ // Step 1: Read the XML file
168
+ const fileContent = await tools.readFile('${tmpFilePath}')
169
+ const xmlData = fileContent.content
170
+
171
+ // Step 2: Extract metadata section using regex
172
+ const metadataMatch = xmlData.match(/<metadata>([\\\\s\\\\S]*?)<\\\\/metadata>/)
173
+ if (!metadataMatch) {
174
+ return { error: 'Metadata not found' }
175
+ }
176
+
177
+ const metadata = metadataMatch[1]
178
+
179
+ // Step 3: Extract specific fields
180
+ const fileCount = metadata.match(/<file_count>(\\\\d+)<\\\\/file_count>/)?.[1]
181
+ const totalLines = metadata.match(/<total_lines>(\\\\d+)<\\\\/total_lines>/)?.[1]
182
+ const fileTypes = xmlData.match(/<file_types>([^<]+)<\\\\/file_types>/)?.[1]
183
+
184
+ return {
185
+ fileCount: parseInt(fileCount) || 0,
186
+ totalLines: parseInt(totalLines) || 0,
187
+ fileTypes: fileTypes || 'unknown'
188
+ }
189
+ \`
190
+ })
191
+
192
+ console.log('Folder overview:', overview)
193
+ \`\`\`
194
+
195
+ ## Example: Extract README with Full Content Preservation
196
+
197
+ **IMPORTANT: This example shows VERBATIM preservation - copying the ENTIRE README content, not summarizing it.**
198
+
199
+ \`\`\`typescript
200
+ // Everything happens inside ONE code_exec call
201
+ await tools.code_exec({
202
+ code: \`
203
+ // Step 1: Use tools.grep() to find README files
204
+ const grepResult = await tools.grep('<file[^>]*path="[^"]*README[^"]*"', {
205
+ path: '${tmpFilePath}'
206
+ })
207
+
208
+ if (!grepResult.matches || grepResult.matches.length === 0) {
209
+ console.log('No README found')
210
+ return { found: false }
211
+ }
212
+
213
+ // Step 2: Read the full XML to extract README content
214
+ const fileContent = await tools.readFile('${tmpFilePath}')
215
+ const xmlData = fileContent.content
216
+
217
+ // Step 3: Extract README file tag and content (COMPLETE content, not summary)
218
+ const readmeMatch = xmlData.match(/<file[^>]*path="[^"]*README[^"]*"[^>]*>([\\\\s\\\\S]*?)<\\\\/file>/i)
219
+ if (!readmeMatch) {
220
+ return { found: false }
221
+ }
222
+
223
+ const fullTag = readmeMatch[0]
224
+ const readmeContent = readmeMatch[1] // ENTIRE README content
225
+ const pathMatch = fullTag.match(/path="([^"]+)"/)
226
+ const readmePath = pathMatch?.[1] || 'unknown'
227
+
228
+ console.log('Found README:', readmePath, '- preserving COMPLETE content')
229
+
230
+ // Step 4: Curate with VERBATIM preservation
231
+ // KEY: Store the ENTIRE README in snippets array - DO NOT summarize
232
+ await tools.curate([{
233
+ type: 'UPSERT',
234
+ path: 'documentation/project',
235
+ title: 'Project Overview',
236
+ content: {
237
+ // PRIMARY: Complete README content in snippets (verbatim)
238
+ snippets: [readmeContent],
239
+
240
+ // SECONDARY: Extract key metadata if needed
241
+ rawConcept: {
242
+ files: [readmePath],
243
+ timestamp: new Date().toISOString()
244
+ },
245
+
246
+ // Relations: Link to related docs if found
247
+ relations: []
248
+ },
249
+ reason: \`Preserving complete README from \${readmePath} - full content captured verbatim\`
250
+ }])
251
+
252
+ return {
253
+ found: true,
254
+ path: readmePath,
255
+ contentLength: readmeContent.length,
256
+ preserved: 'complete'
257
+ }
258
+ \`
259
+ })
260
+ \`\`\`
261
+
262
+ **Key points demonstrated:**
263
+ - ✅ Entire README content copied to \`snippets\` array
264
+ - ✅ No summarization or truncation
265
+ - ✅ Complete file path preserved in \`rawConcept.files\`
266
+ - ✅ Clear reasoning about verbatim preservation
267
+ - ❌ NOT creating a summary or "key points" version
268
+
269
+ ## Example: Process Code Files with Complete Preservation
270
+
271
+ **IMPORTANT: This example shows preserving COMPLETE code files, not just extracting class/function names.**
272
+
273
+ \`\`\`typescript
274
+ // Everything happens inside ONE code_exec call
275
+ const tsFiles = await tools.code_exec({
276
+ code: \`
277
+ // Step 1: Use tools.grep() to find TypeScript interface/type files (high value)
278
+ const grepResult = await tools.grep('<file[^>]*path="[^"]*types\\\\.ts"', {
279
+ path: '${tmpFilePath}'
280
+ })
281
+
282
+ console.log(\`Found \${grepResult.totalMatches} TypeScript type definition files\`)
283
+
284
+ // Step 2: Read the XML file
285
+ const fileContent = await tools.readFile('${tmpFilePath}')
286
+ const xmlData = fileContent.content
287
+
288
+ // Step 3: Extract type files (process in batches of 5-10 to manage output)
289
+ const fileRegex = /<file[^>]*path="([^"]*types\\\\.ts)"[^>]*>([\\\\s\\\\S]*?)<\\\\/file>/g
290
+ const files = []
291
+ let match
292
+
293
+ while ((match = fileRegex.exec(xmlData)) !== null) {
294
+ files.push({
295
+ path: match[1],
296
+ content: match[2] // COMPLETE file content
297
+ })
298
+
299
+ // Process in batches of 5 to manage output size
300
+ if (files.length >= 5) break
301
+ }
302
+
303
+ console.log(\`Processing \${files.length} type definition files - preserving COMPLETE code\`)
304
+
305
+ // Step 4: Curate each file with VERBATIM preservation
306
+ for (const file of files) {
307
+ console.log(\`Curating: \${file.path} - full file content\`)
308
+
309
+ // Extract structural information (optional metadata)
310
+ const interfaceNames = [...file.content.matchAll(/interface\\s+(\\w+)/g)].map(m => m[1])
311
+ const typeNames = [...file.content.matchAll(/type\\s+(\\w+)/g)].map(m => m[1])
312
+ const enumNames = [...file.content.matchAll(/enum\\s+(\\w+)/g)].map(m => m[1])
313
+
314
+ // Create sanitized path for topic (remove special chars)
315
+ const sanitizedPath = file.path
316
+ .replace(/^src\\\\//, '')
317
+ .replace(/\\\\.ts$/, '')
318
+ .replace(/[\\\\/.]/g, '_')
319
+
320
+ // Curate with COMPLETE file content
321
+ await tools.curate([{
322
+ type: 'UPSERT',
323
+ path: \`code/types\`,
324
+ title: sanitizedPath,
325
+ content: {
326
+ // PRIMARY: Complete file in snippets (verbatim code)
327
+ snippets: [
328
+ \`\`\`typescript
329
+ // File: \${file.path}
330
+ \${file.content}
331
+ \`\`\`
332
+ ],
333
+
334
+ // SECONDARY: Metadata for searchability
335
+ rawConcept: {
336
+ files: [file.path],
337
+ patterns: [
338
+ ...interfaceNames.map(name => ({
339
+ pattern: \`interface \${name}\`,
340
+ description: \`Interface definition for \${name}\`
341
+ })),
342
+ ...typeNames.map(name => ({
343
+ pattern: \`type \${name}\`,
344
+ description: \`Type alias for \${name}\`
345
+ })),
346
+ ...enumNames.map(name => ({
347
+ pattern: \`enum \${name}\`,
348
+ description: \`Enum definition for \${name}\`
349
+ }))
350
+ ].slice(0, 20), // Limit metadata, but keep FULL code in snippets
351
+ timestamp: new Date().toISOString()
352
+ },
353
+
354
+ // TERTIARY: Narrative summary (but code is PRIMARY)
355
+ narrative: {
356
+ structure: \`Defines \${interfaceNames.length} interfaces, \${typeNames.length} type aliases, and \${enumNames.length} enums.\`
357
+ }
358
+ },
359
+ reason: \`Preserving complete type definitions from \${file.path} - full code preserved in snippets\`
360
+ }])
361
+ }
362
+
363
+ return {
364
+ processed: files.length,
365
+ total: grepResult.totalMatches,
366
+ preservation: 'complete',
367
+ note: 'All file contents copied verbatim to snippets'
368
+ }
369
+ \`
370
+ })
371
+
372
+ console.log(\`Processed \${tsFiles.processed} of \${tsFiles.total} type files - FULL code preserved\`)
373
+ \`\`\`
374
+
375
+ **Key points demonstrated:**
376
+ - ✅ Complete file content in \`snippets\` with \`\`\`typescript\`\`\` formatting
377
+ - ✅ File path preserved in comment header
378
+ - ✅ Structural metadata extracted but NOT used as replacement for code
379
+ - ✅ Process in batches (5-10 files) to manage output, but each file is COMPLETE
380
+ - ✅ Sanitized file path used for topic organization
381
+ - ❌ NOT extracting just class/function names - preserving FULL implementations
382
+
383
+ ## Example: Get Directory Structure
384
+
385
+ \`\`\`typescript
386
+ // Everything happens inside ONE code_exec call
387
+ const tree = await tools.code_exec({
388
+ code: \`
389
+ // Step 1: Read the XML file
390
+ const fileContent = await tools.readFile('${tmpFilePath}')
391
+ const xmlData = fileContent.content
54
392
 
55
- <packed_folder_xml>
56
- ${xml}
57
- </packed_folder_xml>
393
+ // Step 2: Extract CDATA section with directory structure
394
+ const treeMatch = xmlData.match(/<directory_structure><!\\\\[CDATA\\\\[([\\\\s\\\\S]*?)\\\\]\\\\]><\\\\/directory_structure>/)
58
395
 
59
- ## Instructions
396
+ if (!treeMatch) {
397
+ return null
398
+ }
399
+
400
+ const directoryTree = treeMatch[1]
401
+
402
+ // Step 3: Curate the directory structure
403
+ await tools.curate([{
404
+ type: 'ADD',
405
+ path: 'project/directory_structure',
406
+ data: {
407
+ concept: 'Project directory structure',
408
+ structure: directoryTree
409
+ }
410
+ }])
411
+
412
+ return directoryTree
413
+ \`
414
+ })
415
+
416
+ if (tree) {
417
+ console.log('Directory structure:', tree)
418
+ }
419
+ \`\`\`
420
+
421
+ ## Content Preservation - CRITICAL INSTRUCTIONS
422
+
423
+ **FUNDAMENTAL PRINCIPLE: PRESERVE, DON'T SUMMARIZE**
424
+
425
+ **YOU MUST COPY CONTENT VERBATIM - NOT SUMMARIZE IT**
426
+
427
+ When curating knowledge from source files, you MUST preserve the exact, complete content. This is NOT a summarization task.
428
+
429
+ ### Required Preservation Approach:
430
+
431
+ 1. **For Documentation/README files:**
432
+ - Copy ENTIRE file content into \`snippets\` array (one snippet = one file)
433
+ - Keep ALL sections, ALL paragraphs, ALL details
434
+ - Preserve exact formatting, code blocks, examples
435
+ - DO NOT summarize or paraphrase
436
+
437
+ 2. **For Code files:**
438
+ - Copy COMPLETE function/class definitions into \`snippets\`
439
+ - Include ALL comments, ALL logic, ALL edge cases
440
+ - Preserve exact variable names, function signatures
441
+ - Keep implementation details - they matter
442
+
443
+ 3. **For Configuration files:**
444
+ - Copy ENTIRE config file content into \`snippets\`
445
+ - Preserve ALL settings, ALL comments, ALL structure
446
+ - Keep exact values and formatting
447
+
448
+ 4. **For Rules/Constraints (from docs or comments):**
449
+ - Use \`narrative.rules\` for exact rule text
450
+ - Copy verbatim from source - no paraphrasing
451
+ - Include ALL constraints, not just "important" ones
452
+
453
+ 5. **For Examples (from docs or code):**
454
+ - Use \`narrative.examples\` for complete examples
455
+ - Include full code blocks with all context
456
+ - Preserve exact formatting and output
457
+
458
+ 6. **For Patterns (validation, regex, etc):**
459
+ - Use \`rawConcept.patterns\` with complete pattern strings
460
+ - Include ALL patterns found, not just samples
461
+ - Add descriptions explaining what each pattern does
462
+
463
+ 7. **For Diagrams (Mermaid, PlantUML, ASCII art):**
464
+ - Use \`narrative.diagrams\` array with {type, content, title?}
465
+ - type: "mermaid" | "plantuml" | "ascii" | "other"
466
+ - Copy ENTIRE diagram content verbatim - character for character
467
+ - NEVER describe a diagram in prose instead of storing the actual diagram
468
+ - Detect: fenced blocks with mermaid/plantuml tags, @startuml/@enduml, box-drawing characters
469
+
470
+ 8. **For Tables:**
471
+ - Copy complete tables with ALL rows into \`narrative.structure\` or \`narrative.features\`
472
+ - Preserve column headers and every data row - do not summarize
473
+
474
+ ### What "Preserve" Means:
475
+
476
+ ✅ **CORRECT - Verbatim preservation:**
477
+ \`\`\`typescript
478
+ content: {
479
+ snippets: [
480
+ \`\`\`markdown
481
+ # Authentication Flow
482
+
483
+ The system uses JWT-based authentication with the following steps:
484
+
485
+ 1. User submits credentials via POST /api/auth/login
486
+ 2. Server validates credentials against database
487
+ 3. On success, generates JWT with user ID and role
488
+ 4. JWT expires after 24 hours
489
+ 5. Client stores JWT in httpOnly cookie
490
+
491
+ ## Token Structure
492
+ - Header: { "alg": "HS256", "typ": "JWT" }
493
+ - Payload: { "userId": string, "role": string, "exp": number }
494
+ - Signature: HMAC-SHA256(header + payload, secret)
495
+
496
+ ## Security Notes
497
+ - Secret key rotated monthly
498
+ - Failed login attempts rate limited (5 per minute)
499
+ - JWT blacklist maintained for logout
500
+ \`\`\`
501
+ ],
502
+ narrative: {
503
+ rules: "Authentication rules:\\n- JWT expires after 24 hours\\n- Secret key rotated monthly\\n- Failed login attempts rate limited (5 per minute)\\n- JWT blacklist maintained for logout"
504
+ }
505
+ }
506
+ \`\`\`
507
+
508
+ ❌ **WRONG - Summarization:**
509
+ \`\`\`typescript
510
+ content: {
511
+ snippets: ["JWT authentication with 24-hour expiry"],
512
+ narrative: {
513
+ rules: "Use JWT for auth, rotate secrets regularly"
514
+ }
515
+ }
516
+ \`\`\`
517
+
518
+ ### Data Structure Usage:
519
+
520
+ **Primary field for verbatim content:**
521
+ - \`snippets: string[]\` - Array of complete file contents or large code sections
522
+ - Each snippet should be a complete, self-contained piece of content
523
+ - Use one snippet per file or per major code section
524
+ - Include full context (imports, dependencies, etc.)
525
+
526
+ **Secondary fields for structured details:**
527
+ - \`narrative.rules\` - Exact rule/constraint text from docs
528
+ - \`narrative.examples\` - Complete example code with full context
529
+ - \`narrative.features\` - Full feature descriptions with all details
530
+ - \`narrative.structure\` - Complete structural documentation
531
+ - \`narrative.dependencies\` - Full dependency information
532
+ - \`rawConcept.patterns\` - All patterns with complete regex/validation strings
533
+ - \`rawConcept.files\` - Complete list of related file paths
534
+ - \`rawConcept.flow\` - Detailed execution flow description
535
+
536
+ ### Batch Processing Strategy:
537
+
538
+ Process in batches to manage output size while preserving completeness:
539
+ 1. **Batch 1 (Priority)**: README, CONTRIBUTING, ARCHITECTURE docs (copy full content)
540
+ 2. **Batch 2**: Core interfaces/types (copy complete definitions)
541
+ 3. **Batch 3**: Main implementation files (copy complete functions/classes)
542
+ 4. **Batch 4**: Configuration files (copy full configs)
543
+ 5. **Batch 5**: Test files (copy representative test suites)
544
+
545
+ **Within each batch**: Copy COMPLETE files, don't truncate or summarize
60
546
 
61
- Analyze this folder and extract knowledge using \`tools.curate()\`. Focus on:
547
+ ## What to Extract
62
548
 
63
- 1. **High-level architecture** - How the codebase is organized
64
- 2. **Key modules and their purposes** - What each major component does
65
- 3. **Configuration patterns** - How the project is configured
66
- 4. **Important dependencies** - Key external libraries and their usage
67
- 5. **Domain concepts** - Business logic and domain-specific patterns
549
+ Extract ALL of the following - COMPLETE and VERBATIM:
68
550
 
69
- For each knowledge topic you identify:
70
- - Use \`tools.curate()\` with appropriate operations (ADD, UPDATE, MERGE)
71
- - Create clear, hierarchical paths in the context tree (e.g., "architecture/overview", "modules/authentication")
72
- - Include relevant code references and examples
73
- - Link related topics using the Relations section
551
+ 1. **Documentation files** - Copy entire README, CONTRIBUTING, ARCHITECTURE files
552
+ 2. **Architectural patterns** - Copy complete code showing organization and design
553
+ 3. **Rules & constraints** - Copy exact text from docs/comments/config (no paraphrasing)
554
+ 4. **Validation patterns** - Copy ALL regex/validation rules with complete patterns
555
+ 5. **Configuration** - Copy entire config files with all settings
556
+ 6. **Domain concepts** - Copy complete implementations showing business logic
557
+ 7. **API definitions** - Copy complete interface/type definitions
558
+ 8. **Examples** - Copy full example code with all context
559
+ 9. **Metadata** - Capture authors, versions, dates from files
560
+ 10. **Diagrams** - Mermaid diagrams, PlantUML, ASCII art flow charts, sequence diagrams (use \`narrative.diagrams\` with type and content - preserve verbatim)
561
+ 11. **Tables** - Data tables with ALL rows preserved (use \`narrative.structure\` or \`narrative.features\`)
562
+ 12. **Procedures** - Step-by-step instructions, numbered workflows (use \`narrative.rules\`)
74
563
 
75
- **IMPORTANT:**
76
- - Focus on extractable knowledge, not just file listings
77
- - Prioritize architectural insights over implementation details
78
- - Create topics that would help a new developer understand the codebase
79
- - Use MERGE operations when updating existing topics to preserve existing content
564
+ ## Curation Process
565
+
566
+ For each knowledge topic:
567
+ 1. **Read complete files** using tools.readFile() or parse from XML
568
+ 2. **Copy verbatim content** into appropriate fields (primarily \`snippets\`)
569
+ 3. **Create UPSERT operations** with complete content (not summaries)
570
+ 4. **Use hierarchical paths** (e.g., "documentation/architecture", "code/interfaces")
571
+ 5. **Preserve completeness** - better to split into multiple topics than truncate
572
+ 6. **Link related topics** using relations field
573
+
574
+ **REMEMBER: Your goal is to PRESERVE knowledge, not summarize it. Future agents need the COMPLETE, EXACT content to understand and work with this codebase.**
575
+
576
+ **Start by parsing the XML to understand the folder structure, then systematically extract and process files by type (docs first, then code, then configs).**
577
+
578
+ ## Tips for XML Parsing
579
+
580
+ **Available tools inside code_exec:**
581
+ - \`await tools.readFile(path)\` - Read file contents (returns { content, lines, truncated })
582
+ - \`await tools.grep(pattern, options)\` - Search file contents (returns { matches, totalMatches })
583
+ - \`await tools.curate(operations)\` - Curate knowledge (operations is an array)
584
+ - \`await tools.glob(pattern)\` - Find files by pattern
585
+ - \`await tools.listDirectory(path)\` - List directory contents
586
+
587
+ **Grep patterns for finding files:**
588
+ - Find specific files: \`await tools.grep('<file[^>]*path="[^"]*README[^"]*"', { path: '${tmpFilePath}' })\`
589
+ - Filter by extension: \`await tools.grep('<file[^>]*path="[^"]*\\\\.ts"', { path: '${tmpFilePath}' })\`
590
+ - Search content: \`await tools.grep('function\\\\s+\\\\w+', { path: '${tmpFilePath}' })\`
591
+
592
+ **Regex patterns for parsing XML:**
593
+ - Extract attributes: \`const path = tag.match(/path="([^"]+)"/)?.[1]\`
594
+ - Extract file content: \`const content = tag.match(/<file[^>]*>([\\\\s\\\\S]*?)<\\\\/file>/)?.[1]\`
595
+ - Extract metadata: \`const metadata = xml.match(/<metadata>([\\\\s\\\\S]*?)<\\\\/metadata>/)?.[1]\`
596
+ - Parse multiple files: Use \`fileRegex.exec(xml)\` in a while loop with global flag
597
+
598
+ **Best practices:**
599
+ - Use ONE code_exec call for entire workflow (read → search → process → curate)
600
+ - Use \`tools.grep()\` first to check if patterns exist (fast)
601
+ - Then use \`tools.readFile()\` to get full content for processing
602
+ - Process files in batches (limit to 5-10 files per iteration)
603
+ - Always use \`await\` with tools.* methods (they're async)
604
+ - NEVER use \`require()\` - it's blocked for security
605
+
606
+ **Note**: The temporary file will be automatically deleted after curation completes.
607
+
608
+ ---
609
+
610
+ ## Anti-Patterns: What NOT to Do
611
+
612
+ **❌ WRONG APPROACH - Summarization:**
613
+ \`\`\`typescript
614
+ // This is INCORRECT - creates summaries instead of preserving content
615
+ await tools.curate([{
616
+ type: 'UPSERT',
617
+ path: 'documentation/api',
618
+ title: 'API Guide',
619
+ content: {
620
+ snippets: ['REST API with CRUD operations'], // ❌ Summary
621
+ narrative: {
622
+ rules: 'Follow REST conventions' // ❌ Paraphrased
623
+ }
624
+ },
625
+ reason: 'API documentation'
626
+ }])
627
+ \`\`\`
628
+
629
+ **✅ CORRECT APPROACH - Verbatim Preservation:**
630
+ \`\`\`typescript
631
+ // This is CORRECT - preserves complete original content
632
+ await tools.curate([{
633
+ type: 'UPSERT',
634
+ path: 'documentation/api',
635
+ title: 'API Guide',
636
+ content: {
637
+ snippets: [
638
+ \`\`\`markdown
639
+ # API Documentation
640
+
641
+ ## Endpoints
642
+
643
+ ### GET /api/users
644
+ Returns a list of all users.
645
+
646
+ **Query Parameters:**
647
+ - \`page\` (optional): Page number (default: 1)
648
+ - \`limit\` (optional): Items per page (default: 20)
649
+
650
+ **Response:**
651
+ \\\`\\\`\\\`json
652
+ {
653
+ "users": [
654
+ { "id": 1, "name": "Alice", "email": "alice@example.com" },
655
+ { "id": 2, "name": "Bob", "email": "bob@example.com" }
656
+ ],
657
+ "total": 100,
658
+ "page": 1,
659
+ "limit": 20
660
+ }
661
+ \\\`\\\`\\\`
662
+
663
+ ### POST /api/users
664
+ Creates a new user.
665
+
666
+ **Request Body:**
667
+ \\\`\\\`\\\`json
668
+ {
669
+ "name": "string",
670
+ "email": "string",
671
+ "password": "string"
672
+ }
673
+ \\\`\\\`\\\`
674
+
675
+ **Validation Rules:**
676
+ - Name: Required, 2-50 characters
677
+ - Email: Required, valid email format
678
+ - Password: Required, minimum 8 characters
679
+
680
+ **Response:**
681
+ \\\`\\\`\\\`json
682
+ {
683
+ "id": 3,
684
+ "name": "Charlie",
685
+ "email": "charlie@example.com",
686
+ "createdAt": "2025-03-18T10:00:00Z"
687
+ }
688
+ \\\`\\\`\\\`
689
+ \`\`\`
690
+ ],
691
+ narrative: {
692
+ rules: \`API Validation Rules (verbatim):
693
+ - Name: Required, 2-50 characters
694
+ - Email: Required, valid email format
695
+ - Password: Required, minimum 8 characters
696
+ - All endpoints require authentication except /api/auth/login
697
+ - Rate limit: 100 requests per minute per IP
698
+ - Response format: JSON with UTF-8 encoding\`
699
+ }
700
+ },
701
+ reason: 'Preserving complete API documentation with all endpoints, parameters, validation rules, and examples'
702
+ }])
703
+ \`\`\`
704
+
705
+ **Common Mistakes:**
706
+
707
+ 1. **❌ Extracting only names/patterns:**
708
+ \`\`\`typescript
709
+ // WRONG: Just listing class names
710
+ patterns: { classes: ['UserService', 'AuthService'] }
711
+ // RIGHT: Include complete class implementations in snippets
712
+ snippets: ['class UserService { ... complete code ... }']
713
+ \`\`\`
714
+
715
+ 2. **❌ Paraphrasing rules:**
716
+ \`\`\`typescript
717
+ // WRONG: "Use proper error handling"
718
+ // RIGHT: Copy exact text from source:
719
+ rules: "All API endpoints must catch exceptions and return standardized error responses with status codes: 400 for validation errors, 401 for authentication errors, 403 for authorization errors, 404 for not found, 500 for server errors."
720
+ \`\`\`
721
+
722
+ 3. **❌ Omitting "less important" details:**
723
+ \`\`\`typescript
724
+ // WRONG: Skipping import statements, type definitions, edge cases
725
+ // RIGHT: Copy EVERYTHING - imports, types, comments, edge cases, all logic
726
+ \`\`\`
727
+
728
+ 4. **❌ Creating abbreviated examples:**
729
+ \`\`\`typescript
730
+ // WRONG: examples: "See code for details"
731
+ // RIGHT: Include the COMPLETE example code
732
+ \`\`\`
733
+
734
+ **Remember:** Future agents will need COMPLETE information to understand and modify this codebase. Summaries are useless - they need the ACTUAL code and documentation.
80
735
  `;
81
736
  }
737
+ /**
738
+ * Execute folder curation using iterative extraction strategy.
739
+ * Pre-loads folder data into REPL environment, then guides agent to iterate and curate.
740
+ * This avoids token limits entirely - data is stored in REPL, not in prompt.
741
+ */
742
+ // eslint-disable-next-line max-params
743
+ async executeIterative(agent, packResult, userContext, folderPath, taskId) {
744
+ // Step 1: Generate repomix-style XML (single string with all file contents)
745
+ const packedXml = this.folderPackService.generateXml(packResult);
746
+ const xmlSizeInMB = (packedXml.length / (1024 * 1024)).toFixed(2);
747
+ console.log(`[FolderPackExecutor] Generated XML: ${xmlSizeInMB} MB for ${packResult.fileCount} files`);
748
+ // Step 2: Write XML to temporary file (avoids token limits, works with any agent)
749
+ // This approach: file path (~50 bytes) sent to LLM, data stays on disk
750
+ // IMPORTANT: Write to CWD (not /tmp) so sandbox can access it
751
+ const tmpFilePath = path.join(process.cwd(), `.byterover-curate-${taskId}.xml`);
752
+ console.log(`[FolderPackExecutor] Writing folder data to temp file: ${tmpFilePath}`);
753
+ try {
754
+ await fs.writeFile(tmpFilePath, packedXml, 'utf8');
755
+ console.log(`[FolderPackExecutor] Successfully wrote ${xmlSizeInMB} MB to temp file`);
756
+ }
757
+ catch (error) {
758
+ throw new Error(`Failed to write temp file: ${error instanceof Error ? error.message : String(error)}`);
759
+ }
760
+ // Step 3: Execute main curation task with file path reference
761
+ const prompt = this.buildIterativePromptWithFileAccess(userContext, folderPath, tmpFilePath, packResult.fileCount, packResult.totalLines);
762
+ let response;
763
+ try {
764
+ response = await agent.execute(prompt, {
765
+ executionContext: { commandType: 'curate' },
766
+ taskId,
767
+ });
768
+ }
769
+ finally {
770
+ // Step 4: Clean up - delete temp file
771
+ console.log(`[FolderPackExecutor] Cleaning up temp file`);
772
+ try {
773
+ await fs.unlink(tmpFilePath);
774
+ console.log(`[FolderPackExecutor] Temp file cleanup successful`);
775
+ }
776
+ catch (error) {
777
+ console.warn(`[FolderPackExecutor] Temp file cleanup warning (non-fatal):`, error);
778
+ }
779
+ }
780
+ return response;
781
+ }
82
782
  }