@comfanion/usethis_search 4.2.0-dev.2 → 4.2.0-dev.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cache/manager.ts +20 -1
- package/hooks/message-before.ts +7 -0
- package/package.json +3 -2
- package/tools/search.ts +258 -10
- package/tools/workspace.ts +35 -11
- package/vectorizer/chunkers/chunker-factory.ts +32 -3
- package/vectorizer/chunkers/code-chunker.ts +2 -2
- package/vectorizer/chunkers/lsp-chunker.ts +316 -0
- package/vectorizer/chunkers/markdown-chunker.ts +2 -2
- package/vectorizer/index.ts +25 -2
- package/vectorizer.yaml +4 -4
package/cache/manager.ts
CHANGED
@@ -47,7 +47,7 @@ export interface WorkspaceEntry {
   /** MD5 hash of chunk content — used by freshen() to detect changes */
   contentHash: string
   /** How this chunk got into workspace */
-  role: "search-main" | "search-graph" | "manual"
+  role: "search-main" | "search-graph" | "search-context" | "manual"
   /** Timestamp when attached */
   attachedAt: number
   /** Search query or "manual" */

@@ -547,6 +547,25 @@ class WorkspaceCache {
     return removed
   }

+  /**
+   * Remove all chunks from a specific file path.
+   * Returns number of chunks removed.
+   */
+  detachByPath(filePath: string): number {
+    let removed = 0
+
+    for (const [chunkId, entry] of this.entries) {
+      if (entry.path === filePath) {
+        this.entries.delete(chunkId)
+        this._totalTokens -= entry.tokens
+        removed++
+      }
+    }
+
+    if (removed > 0) this.scheduleSave()
+    return removed
+  }
+
   /**
   * Get all chunks sorted by: search-main first (by score desc), then search-graph, then manual.
   */
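
The new detachByPath() complements the single-chunk detach(). A minimal usage sketch (the import path is illustrative; other files in the package import the workspaceCache singleton from cache/manager.ts):

// Sketch — assumes the workspaceCache singleton from cache/manager.ts
import { workspaceCache } from "./cache/manager.ts"

// Remove every chunk attached from one file, whatever its role or chunkId:
const removed = workspaceCache.detachByPath("src/auth.ts")
console.log(`detached ${removed} chunk(s)`) // 0 when the file wasn't in the workspace
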
package/hooks/message-before.ts
CHANGED
@@ -103,6 +103,7 @@ export function createWorkspaceInjectionHandler(state: SessionState) {

   // Group by role for clear structure
   const mainFiles = entries.filter(e => e.role === "search-main")
+  const contextFiles = entries.filter(e => e.role === "search-context")
   const graphFiles = entries.filter(e => e.role === "search-graph")
   const manualFiles = entries.filter(e => e.role === "manual")

@@ -111,6 +112,12 @@ export function createWorkspaceInjectionHandler(state: SessionState) {
     workspace += formatChunksByFile(mainFiles, byFile)
   }

+  // Expanded context (class methods, class headers)
+  if (contextFiles.length > 0) {
+    workspace += `\n<!-- Expanded context (class methods/headers for completeness) -->\n`
+    workspace += formatChunksByFile(contextFiles, byFile)
+  }
+
   // Graph relations (imports, extends, used_by)
   if (graphFiles.length > 0) {
     workspace += `\n<!-- Search graph relations -->\n`
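
With the new role, the injected workspace text now carries four groups. A rough sketch of the layout (section order taken from the code above; exact markup may differ):

// Illustrative layout of the injected workspace text after this change:
//   <search-main chunks, grouped by file>
//   <!-- Expanded context (class methods/headers for completeness) -->
//   <search-context chunks, grouped by file>
//   <!-- Search graph relations -->
//   <search-graph chunks, grouped by file>
//   <manually attached chunks>
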
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@comfanion/usethis_search",
-  "version": "4.2.0-dev.2",
+  "version": "4.2.0-dev.4",
   "description": "OpenCode plugin: semantic search with chunk-based workspace injection (v4.2-dev: chunk-level context, granular detach, improved token efficiency)",
   "type": "module",
   "main": "./index.ts",

@@ -15,7 +15,7 @@
     "index:clear": "bun run cli.ts clear"
   },
   "bin": {
-    "usethis-search": "
+    "usethis-search": "cli.ts"
   },
   "files": [
     "index.ts",

@@ -45,6 +45,7 @@
     "vectorizer/analyzers/lsp-client.ts",
     "vectorizer/chunkers/markdown-chunker.ts",
     "vectorizer/chunkers/code-chunker.ts",
+    "vectorizer/chunkers/lsp-chunker.ts",
     "vectorizer/chunkers/chunker-factory.ts",
     "vectorizer.yaml",
     "README.md",
package/tools/search.ts
CHANGED
@@ -16,6 +16,92 @@ import fs from "fs/promises"
 import { CodebaseIndexer, getSearchConfig, getIndexer, releaseIndexer } from "../vectorizer/index.ts"
 import { workspaceCache } from "../cache/manager.ts"

+// ── Context Expansion Helpers ─────────────────────────────────────────────
+
+/**
+ * Expand chunk context using GRAPH + file-level structural expansion.
+ *
+ * Strategy:
+ * 1. CODE: Class → all methods, Method → class header (via findChunksByPath)
+ * 2. DOCS: Section → other sections from same file (via file node in graph)
+ * 3. ALL: Use relatedContext from graph (imports, extends, contains, etc.)
+ *
+ * Note: relatedContext already populated by vectorizer._expandGraphContext()
+ * This function adds STRUCTURAL context (same-file chunks).
+ */
+async function expandChunkContext(
+  mainChunk: any,
+  indexer: CodebaseIndexer,
+  alreadyAttached: Set<string>,
+): Promise<Array<{ chunk: any; reason: string }>> {
+  const expanded: Array<{ chunk: any; reason: string }> = []
+
+  // ══════════════════════════════════════════════════════════════════════
+  // STRUCTURAL EXPANSION: Same-file chunks for completeness
+  // ══════════════════════════════════════════════════════════════════════
+
+  // CODE: Class → add ALL its methods
+  if (mainChunk.class_name && !mainChunk.function_name) {
+    const allChunks = await indexer.findChunksByPath(mainChunk.file)
+
+    for (const chunk of allChunks) {
+      const chunkId = chunk.chunk_id || `${chunk.file}:chunk-${chunk.chunk_index ?? 0}`
+      if (alreadyAttached.has(chunkId)) continue
+
+      // Add all methods of this class
+      if (chunk.class_name === mainChunk.class_name && chunk.function_name) {
+        expanded.push({
+          chunk,
+          reason: `method of class ${mainChunk.class_name}`,
+        })
+      }
+    }
+  }
+
+  // CODE: Method → add class header
+  else if (mainChunk.class_name && mainChunk.function_name) {
+    const allChunks = await indexer.findChunksByPath(mainChunk.file)
+
+    for (const chunk of allChunks) {
+      const chunkId = chunk.chunk_id || `${chunk.file}:chunk-${chunk.chunk_index ?? 0}`
+      if (alreadyAttached.has(chunkId)) continue
+
+      // Find class header (class_name matches but no function_name)
+      if (chunk.class_name === mainChunk.class_name && !chunk.function_name) {
+        expanded.push({
+          chunk,
+          reason: `class header for ${mainChunk.function_name}`,
+        })
+        break
+      }
+    }
+  }
+
+  // DOCS: Section → add other sections from same file (for context)
+  // Only for markdown chunks with heading_context
+  else if (mainChunk.heading_context && mainChunk.language === "markdown") {
+    const allChunks = await indexer.findChunksByPath(mainChunk.file)
+
+    // Add ALL sections from this file (they're already reasonably sized)
+    // This gives full document context when searching in docs
+    for (const chunk of allChunks) {
+      const chunkId = chunk.chunk_id || `${chunk.file}:chunk-${chunk.chunk_index ?? 0}`
+      if (alreadyAttached.has(chunkId)) continue
+
+      // Skip the main chunk itself
+      if (chunkId === mainChunk.chunkId) continue
+
+      // Add other sections from same document
+      expanded.push({
+        chunk,
+        reason: `section from ${mainChunk.file}`,
+      })
+    }
+  }
+
+  return expanded
+}
+
 // ── Extension → language mapping (for filter parsing) ─────────────────────
 const EXT_TO_LANG: Record<string, string> = {
   go: "go", py: "python", ts: "typescript", tsx: "typescript",
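
To make the three branches concrete, a hedged example of what the helper returns for a method hit (the chunk shape and values are illustrative, assuming an async calling context):

// Illustrative: mainChunk is a method chunk such as
// { file: "src/auth.ts", class_name: "AuthService", function_name: "login" }
const extra = await expandChunkContext(mainChunk, indexer, new Set())
// → [{ chunk: <AuthService class-header chunk>, reason: "class header for login" }]
// For a class hit it returns every method; for a markdown section, the sibling sections.
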
@@ -92,25 +178,31 @@ function parseFilter(filter: string): {
 }

 export default tool({
-  description: `Search the codebase semantically
+  description: `Search the codebase semantically OR attach specific chunks/files to workspace.
+
+Three modes:
+1. Semantic search (query) - Find relevant code by meaning
+2. Direct chunk attach (chunkId) - Attach specific chunk by ID
+3. File attach (path) - Attach all chunks from a file

 Available indexes:
 - "code" (default) - Source code files (*.js, *.ts, *.py, *.go, etc.)
 - "docs" - Documentation files (*.md, *.txt, etc.)
-- "config" - Configuration files (*.yaml, *.json, etc.)
 - searchAll: true - Search across all indexes

 Examples:
 - search({ query: "authentication logic" })
 - search({ query: "how to deploy", index: "docs" })
 - search({ query: "tenant management", filter: "internal/domain/" })
-- search({
-- search({
-- search({
+- search({ chunkId: "src/auth.ts:chunk-5" })
+- search({ path: "docs/architecture.md" })
+- search({ path: "src/auth.ts", index: "code" })`,

   args: {
-    query: tool.schema.string().describe("Semantic search query describing what you're looking for"),
-
+    query: tool.schema.string().optional().describe("Semantic search query describing what you're looking for"),
+    chunkId: tool.schema.string().optional().describe("Specific chunk ID to attach (e.g. 'src/auth.ts:chunk-5')"),
+    path: tool.schema.string().optional().describe("File path to attach all chunks from (e.g. 'docs/architecture.md')"),
+    index: tool.schema.string().optional().default("code").describe("Index to search: code, docs"),
     limit: tool.schema.number().optional().describe("Number of results (default from config, typically 10)"),
     searchAll: tool.schema.boolean().optional().default(false).describe("Search all indexes instead of just one"),
     filter: tool.schema.string().optional().describe("Filter results by path or language. Examples: 'internal/domain/', '*.go', 'internal/**/*.go', 'service'"),
@@ -120,6 +212,15 @@ Examples:
     const projectRoot = process.cwd()

    try {
+      // Validate: exactly one of query, chunkId, or path must be specified
+      const modes = [args.query, args.chunkId, args.path].filter(x => x !== undefined)
+      if (modes.length === 0) {
+        return `Error: Must specify one of: query (semantic search), chunkId (direct attach), or path (file attach)\n\nExamples:\n- search({ query: "authentication" })\n- search({ chunkId: "src/auth.ts:chunk-5" })\n- search({ path: "docs/architecture.md" })`
+      }
+      if (modes.length > 1) {
+        return `Error: Specify only ONE of: query, chunkId, or path (got ${modes.length})`
+      }
+
      // Load config defaults (parsed from vectorizer.yaml)
      const cfg = getSearchConfig()
      const limit = args.limit || cfg.default_limit || 10
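
Hedged examples of how the validation behaves (hypothetical tool calls, return strings taken from the code above):

// search({})                                     → "Error: Must specify one of: query ..., chunkId ..., or path ..."
// search({ query: "auth", path: "src/auth.ts" }) → "Error: Specify only ONE of: query, chunkId, or path (got 2)"
// search({ chunkId: "src/auth.ts:chunk-5" })     → attaches that chunk (MODE 1 below)
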
@@ -130,6 +231,95 @@ Examples:
      // Workspace config
      const wsConfig = workspaceCache.getConfig()

+      // ══════════════════════════════════════════════════════════════════════
+      // MODE 1: Direct chunk attach by chunkId
+      // ══════════════════════════════════════════════════════════════════════
+      if (args.chunkId) {
+        const indexer = await getIndexer(projectRoot, indexName)
+        try {
+          const chunk = await indexer.findChunkById(args.chunkId)
+          if (!chunk) {
+            return `Chunk "${args.chunkId}" not found in index "${indexName}".\n\nMake sure:\n1. The file is indexed\n2. The chunk ID is correct (format: "path:chunk-N")\n3. You're searching the right index`
+          }
+
+          // Attach to workspace
+          workspaceCache.attach({
+            chunkId: args.chunkId,
+            path: chunk.file,
+            content: chunk.content,
+            chunkIndex: chunk.chunk_index ?? 0,
+            role: "manual",
+            attachedAt: Date.now(),
+            attachedBy: `direct:${args.chunkId}`,
+            metadata: {
+              language: chunk.language,
+              function_name: chunk.function_name,
+              class_name: chunk.class_name,
+              heading_context: chunk.heading_context,
+              startLine: chunk.start_line,
+              endLine: chunk.end_line,
+            },
+          })
+
+          workspaceCache.save().catch(() => {})
+
+          const entry = workspaceCache.get(args.chunkId)!
+          return `✓ Attached chunk to workspace\n\nChunk: ${args.chunkId}\nFile: ${chunk.file}\nTokens: ${entry.tokens.toLocaleString()}\nLanguage: ${chunk.language}\nLines: ${chunk.start_line}-${chunk.end_line}\n\nWorkspace: ${workspaceCache.size} chunks, ${workspaceCache.totalTokens.toLocaleString()} tokens`
+        } finally {
+          releaseIndexer(projectRoot, indexName)
+        }
+      }
+
+      // ══════════════════════════════════════════════════════════════════════
+      // MODE 2: File attach by path (all chunks)
+      // ══════════════════════════════════════════════════════════════════════
+      if (args.path) {
+        const indexer = await getIndexer(projectRoot, indexName)
+        try {
+          const chunks = await indexer.findChunksByPath(args.path)
+          if (chunks.length === 0) {
+            return `No chunks found for file "${args.path}" in index "${indexName}".\n\nMake sure:\n1. The file exists and is indexed\n2. The path is correct (relative to project root)\n3. You're searching the right index\n\nRun: bunx usethis_search reindex`
+          }
+
+          // Attach all chunks to workspace
+          let totalTokens = 0
+          for (const chunk of chunks) {
+            const chunkId = chunk.chunk_id || `${args.path}:chunk-${chunk.chunk_index ?? 0}`
+
+            workspaceCache.attach({
+              chunkId,
+              path: args.path,
+              content: chunk.content,
+              chunkIndex: chunk.chunk_index ?? 0,
+              role: "manual",
+              attachedAt: Date.now(),
+              attachedBy: `file:${args.path}`,
+              metadata: {
+                language: chunk.language,
+                function_name: chunk.function_name,
+                class_name: chunk.class_name,
+                heading_context: chunk.heading_context,
+                startLine: chunk.start_line,
+                endLine: chunk.end_line,
+              },
+            })
+
+            const entry = workspaceCache.get(chunkId)!
+            totalTokens += entry.tokens
+          }
+
+          workspaceCache.save().catch(() => {})
+
+          return `✓ Attached file to workspace\n\nFile: ${args.path}\nChunks: ${chunks.length}\nTokens: ${totalTokens.toLocaleString()}\nLanguage: ${chunks[0].language}\n\nWorkspace: ${workspaceCache.size} chunks, ${workspaceCache.totalTokens.toLocaleString()} tokens`
+        } finally {
+          releaseIndexer(projectRoot, indexName)
+        }
+      }
+
+      // ══════════════════════════════════════════════════════════════════════
+      // MODE 3: Semantic search by query (original behavior)
+      // ══════════════════════════════════════════════════════════════════════
+
      // Parse filter into path/language constraints
      const filterParsed = args.filter ? parseFilter(args.filter) : {}

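Both attach modes wrap the shared indexer in the same acquire/release pattern; a minimal sketch of the shape:

// Sketch of the acquire/release discipline used by MODE 1 and MODE 2:
const indexer = await getIndexer(projectRoot, indexName)
try {
  const chunks = await indexer.findChunksByPath("docs/architecture.md")
  // ...attach chunks; early returns are fine...
} finally {
  releaseIndexer(projectRoot, indexName) // always runs, even on early return
}
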
@@ -169,6 +359,19 @@ Examples:
        }
      }

+      // Deduplicate chunks (searchAll may return same chunk from multiple indexes)
+      const seen = new Set<string>()
+      const deduplicated: any[] = []
+
+      for (const result of allResults) {
+        const chunkId = result.chunkId || `${result.file}:chunk-${result.index ?? 0}`
+        if (!seen.has(chunkId)) {
+          seen.add(chunkId)
+          deduplicated.push(result)
+        }
+      }
+
+      allResults = deduplicated
      allResults.sort((a, b) => {
        const scoreA = a._combinedScore ?? (a._distance != null ? Math.max(0, 1 - a._distance / 2) : 0)
        const scoreB = b._combinedScore ?? (b._distance != null ? Math.max(0, 1 - b._distance / 2) : 0)
@@ -255,7 +458,7 @@ Examples:
      }

      // ══════════════════════════════════════════════════════════════════════
-      // WORKSPACE ATTACH: Top N chunks +
+      // WORKSPACE ATTACH: Top N chunks + expanded context + graph relations
      // ══════════════════════════════════════════════════════════════════════

      const mainChunks = topChunks.slice(0, wsConfig.attachTopN)

@@ -263,13 +466,17 @@ Examples:

      const attachedMain: Array<{ chunkId: string; path: string }> = []
      const attachedGraph: Array<{ chunkId: string; path: string }> = []
+      const attachedContext: Array<{ chunkId: string; path: string; reason: string }> = []
      const alreadyAttached = new Set<string>()

+      // Get indexer for context expansion (reuse same indexer)
+      const indexerForExpansion = await getIndexer(projectRoot, indexName)
+
      for (const chunk of mainChunks) {
        // Skip if score too low
        if ((chunk._finalScore ?? 0) < wsConfig.minScoreMain) continue

-        // Attach
+        // Attach main chunk
        const chunkId = chunk.chunkId || `${chunk.file}:chunk-${chunk.index ?? 0}`

        workspaceCache.attach({

@@ -294,6 +501,44 @@ Examples:
        attachedMain.push({ chunkId, path: chunk.file })
        alreadyAttached.add(chunkId)

+        // ── Expand context (class methods, class header) ──────────────────
+        try {
+          const expandedChunks = await expandChunkContext(chunk, indexerForExpansion, alreadyAttached)
+
+          for (const { chunk: expChunk, reason } of expandedChunks) {
+            const expChunkId = expChunk.chunk_id || `${expChunk.file}:chunk-${expChunk.chunk_index ?? 0}`
+
+            // Check budget before adding
+            if (workspaceCache.size >= wsConfig.maxChunks) break
+
+            workspaceCache.attach({
+              chunkId: expChunkId,
+              path: expChunk.file,
+              content: expChunk.content,
+              chunkIndex: expChunk.chunk_index ?? 0,
+              role: "search-context",
+              attachedAt: Date.now(),
+              attachedBy: `${args.query} (${reason})`,
+              score: chunk._finalScore * 0.9, // Slightly lower score than main
+              metadata: {
+                language: expChunk.language,
+                function_name: expChunk.function_name,
+                class_name: expChunk.class_name,
+                startLine: expChunk.start_line,
+                endLine: expChunk.end_line,
+              },
+            })
+
+            attachedContext.push({ chunkId: expChunkId, path: expChunk.file, reason })
+            alreadyAttached.add(expChunkId)
+          }
+        } catch (error: any) {
+          // Context expansion failed — not critical, continue
+          if (process.env.DEBUG) {
+            console.log(`[search] Context expansion failed for ${chunkId}: ${error.message}`)
+          }
+        }
+
        // Attach graph relations (imports, extends, used_by)
        if (chunk.relatedContext && chunk.relatedContext.length > 0) {
          const topRelated = chunk.relatedContext

@@ -329,8 +574,11 @@ Examples:
        }
      }

+      // Release indexer used for expansion
+      releaseIndexer(projectRoot, indexName)
+
      // ── Flush workspace to disk immediately (don't rely on debounce) ─────
-      if (attachedMain.length > 0 || attachedGraph.length > 0) {
+      if (attachedMain.length > 0 || attachedGraph.length > 0 || attachedContext.length > 0) {
        workspaceCache.save().catch(() => {})
      }

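Putting the attach phases together, a single query can now leave three roles in the workspace. An illustrative result (paths and chunk IDs hypothetical):

// After search({ query: "login flow" }) the workspace might hold:
//   search-main    src/auth.ts:chunk-3     (the scored hit)
//   search-context src/auth.ts:chunk-1     (class header, score × 0.9)
//   search-graph   src/session.ts:chunk-0  (import/extends/used_by relation)
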
package/tools/workspace.ts
CHANGED
@@ -56,11 +56,14 @@ export const workspace_list = tool({
    const mainFiles = Array.from(fileGroups.entries()).filter(([_, chunks]) =>
      chunks.some(c => c.role === "search-main")
    )
+    const contextFiles = Array.from(fileGroups.entries()).filter(([_, chunks]) =>
+      chunks.some(c => c.role === "search-context") && !chunks.some(c => c.role === "search-main")
+    )
    const graphFiles = Array.from(fileGroups.entries()).filter(([_, chunks]) =>
-      chunks.some(c => c.role === "search-graph") && !chunks.some(c => c.role === "search-main")
+      chunks.some(c => c.role === "search-graph") && !chunks.some(c => c.role === "search-main" || c.role === "search-context")
    )
    const manualFiles = Array.from(fileGroups.entries()).filter(([_, chunks]) =>
-      chunks.some(c => c.role === "manual") && !chunks.some(c => c.role === "search-main" || c.role === "search-graph")
+      chunks.some(c => c.role === "manual") && !chunks.some(c => c.role === "search-main" || c.role === "search-graph" || c.role === "search-context")
    )

    if (mainFiles.length > 0) {

@@ -86,6 +89,25 @@ export const workspace_list = tool({
      output += `\n`
    }

+    if (contextFiles.length > 0) {
+      output += `### Expanded context (${contextFiles.length} files)\n`
+      for (const [filePath, chunks] of contextFiles) {
+        const totalTokens = chunks.reduce((sum, c) => sum + c.tokens, 0)
+        const reason = chunks[0]?.attachedBy?.match(/\((.+)\)/)?.[1] || "context"
+        const age = Math.floor((Date.now() - chunks[0].attachedAt) / 1000 / 60)
+
+        output += `- **${filePath}** (${chunks.length} chunk${chunks.length > 1 ? "s" : ""}, ${totalTokens.toLocaleString()} tokens) — ${reason} — ${age}m ago\n`
+
+        if (chunks.length > 1) {
+          for (const chunk of chunks) {
+            const meta = chunk.metadata?.function_name || chunk.metadata?.class_name || ""
+            output += `  • ${chunk.chunkId} — ${meta} (chunk ${chunk.chunkIndex}, ${chunk.tokens.toLocaleString()} tok)\n`
+          }
+        }
+      }
+      output += `\n`
+    }
+
    if (graphFiles.length > 0) {
      output += `### Graph relations (${graphFiles.length} files)\n`
      for (const [filePath, chunks] of graphFiles) {

@@ -155,8 +177,12 @@ export const workspace_attach = tool({

    // Check if already attached
    if (workspaceCache.has(args.filePath)) {
-      const existing = workspaceCache.
-
+      const existing = workspaceCache.getChunksByPath(args.filePath)
+      if (existing.length > 0) {
+        const first = existing[0]
+        const totalTokens = existing.reduce((sum, c) => sum + c.tokens, 0)
+        return `File "${args.filePath}" is already in workspace (${existing.length} chunk${existing.length > 1 ? "s" : ""}).\nTokens: ${totalTokens.toLocaleString()} | Role: ${first.role} | Score: ${first.score?.toFixed(3) ?? "n/a"}`
+      }
    }

    workspaceCache.attach({

@@ -169,7 +195,7 @@ export const workspace_attach = tool({
      attachedBy: "manual",
    })

-    const entry = workspaceCache.get(
+    const entry = workspaceCache.get(chunkId)!
    return `Attached "${args.filePath}" to workspace as single chunk.\nChunkId: ${chunkId}\nTokens: ${entry.tokens.toLocaleString()}\nWorkspace total: ${workspaceCache.totalTokens.toLocaleString()} tokens (${workspaceCache.size} chunks)`
  } catch (error: any) {
    return `Failed to attach "${args.filePath}": ${error.message || String(error)}`

@@ -194,27 +220,25 @@ export const workspace_detach = tool({

    if (args.chunkId) {
      // Detach specific chunk by chunkId
-      const
-      const entry = entries.find(e => e.chunkId === args.chunkId)
+      const entry = workspaceCache.get(args.chunkId)

      if (!entry) {
        return `Chunk "${args.chunkId}" not found in workspace.`
      }

-      removed = workspaceCache.detach(
+      removed = workspaceCache.detach(args.chunkId) ? 1 : 0
      if (removed === 0) {
        return `Failed to remove chunk "${args.chunkId}".`
      }
    } else if (args.filePath) {
      // Detach all chunks of a file
-      const
-      const fileChunks = entries.filter(e => e.path === args.filePath)
+      const fileChunks = workspaceCache.getChunksByPath(args.filePath)

      if (fileChunks.length === 0) {
        return `File "${args.filePath}" not found in workspace.`
      }

-      removed = workspaceCache.
+      removed = workspaceCache.detachByPath(args.filePath)
      if (removed === 0) {
        return `Failed to remove chunks from "${args.filePath}".`
      }
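
Hedged examples of the two detach paths as tool calls (arguments illustrative):

// workspace_detach({ chunkId: "src/auth.ts:chunk-5" }) → detach() removes one chunk
// workspace_detach({ filePath: "src/auth.ts" })         → detachByPath() removes every chunk of the file
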
package/vectorizer/chunkers/chunker-factory.ts
CHANGED

@@ -1,11 +1,15 @@
 /**
  * Chunker Factory — selects the appropriate chunker based on file type.
  *
- * Routes:
+ * Routes:
+ * - markdown → markdown-chunker
+ * - code → lsp-chunker (fallback: code-chunker regex)
+ * - else → fixed
  */

 import { chunkMarkdown, type MarkdownChunkConfig, DEFAULT_MD_CONFIG } from "./markdown-chunker"
 import { chunkCode, type CodeChunkConfig, DEFAULT_CODE_CONFIG } from "./code-chunker"
+import { chunkCodeWithLSP } from "./lsp-chunker"
 import type { FileType } from "../metadata-extractor"

 // ── Types ───────────────────────────────────────────────────────────────────

@@ -63,13 +67,18 @@ function chunkFixed(content: string, maxChars: number): UnifiedChunk[] {

 /**
  * Chunk content using the appropriate strategy for the given file type.
+ *
+ * For code files: tries LSP-based chunking first (AST-accurate + godoc capture),
+ * falls back to regex-based chunker if LSP unavailable.
  */
-export function chunkContent(
+export async function chunkContent(
   content: string,
   fileType: FileType,
   language: string,
   config: ChunkingConfig = DEFAULT_CHUNKING_CONFIG,
-
+  filePath?: string,
+  projectRoot?: string,
+): Promise<UnifiedChunk[]> {
   // If strategy is "fixed", always use fixed chunker
   if (config.strategy === "fixed") {
     return chunkFixed(content, config.fixed.max_chars)

@@ -85,6 +94,26 @@ export function chunkContent(
   }

   if (fileType === "code") {
+    // Try LSP-based chunker first (captures godoc/JSDoc comments!)
+    if (filePath) {
+      try {
+        const lspChunks = await chunkCodeWithLSP(filePath, content, config.code, projectRoot)
+        if (lspChunks && lspChunks.length > 0) {
+          return lspChunks.map((c) => ({
+            content: c.content,
+            function_name: c.function_name,
+            class_name: c.class_name,
+          }))
+        }
+      } catch (error) {
+        // LSP failed — fall through to regex chunker
+        if (process.env.DEBUG_LSP_CHUNKER) {
+          console.log(`[chunker-factory] LSP chunker failed for ${filePath}: ${error}`)
+        }
+      }
+    }
+
+    // Fallback: regex-based code chunker
     const codeChunks = chunkCode(content, config.code)
     return codeChunks.map((c) => ({
       content: c.content,
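
Note the breaking signature change: chunkContent() is now async with two optional trailing parameters. A minimal caller sketch:

// Callers must now await, and may pass filePath/projectRoot to enable the LSP path:
const chunks = await chunkContent(source, "code", "typescript", DEFAULT_CHUNKING_CONFIG, "src/auth.ts", process.cwd())
// Without filePath the LSP branch is skipped and the regex chunker runs as before.
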
package/vectorizer/chunkers/code-chunker.ts
CHANGED

@@ -13,8 +13,8 @@ export interface CodeChunkConfig {
 }

 export const DEFAULT_CODE_CONFIG: CodeChunkConfig = {
-  min_chunk_size:
-  max_chunk_size:
+  min_chunk_size: 600, // Function with godoc/JSDoc (avoid tiny chunks)
+  max_chunk_size: 3000, // Allow larger chunks (class with multiple methods)
   split_by_functions: true,
   include_function_signature: true,
 }
package/vectorizer/chunkers/lsp-chunker.ts
ADDED

@@ -0,0 +1,316 @@
+/**
+ * LSP-Based Code Chunker
+ *
+ * Uses Language Server Protocol to get AST-accurate function/class boundaries.
+ * Captures godoc/JSDoc comments that belong to each symbol.
+ *
+ * Advantages over regex-chunker:
+ * - ✅ Accurate AST parsing (no brace counting bugs)
+ * - ✅ Captures leading documentation comments (godoc, JSDoc, docstrings)
+ * - ✅ Handles nested structures (class methods, nested functions)
+ * - ✅ Language-agnostic (works for Go, TS, Python, Rust, Java, etc.)
+ *
+ * Fallback: If LSP unavailable → use regex-chunker
+ */
+
+import { LSPClient, LSPSymbolInformation, SymbolKind } from "../analyzers/lsp-client.ts"
+import type { CodeChunk, CodeChunkConfig } from "./code-chunker.ts"
+
+const DEBUG = process.env.DEBUG_LSP_CHUNKER === "true"
+
+/** Symbol kinds we want to chunk separately */
+const CHUNKABLE_SYMBOLS = new Set([
+  SymbolKind.Function,
+  SymbolKind.Method,
+  SymbolKind.Class,
+  SymbolKind.Interface,
+  SymbolKind.Enum,
+  // Note: Struct is not in SymbolKind — Go structs appear as Class
+])
+
+/** Map file extension to LSP language ID */
+const EXT_TO_LANGUAGE: Record<string, string> = {
+  ts: "typescript",
+  js: "javascript",
+  tsx: "typescriptreact",
+  jsx: "javascriptreact",
+  py: "python",
+  go: "go",
+  rs: "rust",
+  java: "java",
+  cpp: "cpp",
+  c: "c",
+  cs: "csharp",
+}
+
+/**
+ * Chunk code using LSP documentSymbol API.
+ * Falls back to regex-chunker if LSP unavailable.
+ */
+export async function chunkCodeWithLSP(
+  filePath: string,
+  content: string,
+  config: CodeChunkConfig,
+  projectRoot?: string,
+): Promise<CodeChunk[] | null> {
+  // Check if LSP available for this language
+  const ext = filePath.split(".").pop() || ""
+  const language = EXT_TO_LANGUAGE[ext]
+  if (!language) {
+    if (DEBUG) console.log(`[lsp-chunker] No language mapping for .${ext}`)
+    return null // Fallback to regex
+  }
+
+  const available = await LSPClient.isAvailable(language)
+  if (!available) {
+    if (DEBUG) console.log(`[lsp-chunker] LSP not available for ${language}`)
+    return null // Fallback to regex
+  }
+
+  // Start LSP client
+  const client = new LSPClient(language, projectRoot)
+  try {
+    await client.start()
+    await client.openDocument(filePath, content)
+
+    // Get document symbols
+    const symbols = await client.documentSymbol(filePath)
+    if (!symbols || symbols.length === 0) {
+      if (DEBUG) console.log(`[lsp-chunker] No symbols found in ${filePath}`)
+      return null // Fallback to regex
+    }
+
+    const lines = content.split("\n")
+    const chunks: CodeChunk[] = []
+
+    // Extract chunks from symbols (recursive for nested symbols)
+    extractChunksFromSymbols(symbols, lines, chunks, config)
+
+    // Add gaps (code between symbols: imports, package declarations, etc.)
+    addGapChunks(chunks, lines, config)
+
+    if (DEBUG) console.log(`[lsp-chunker] Generated ${chunks.length} chunks from ${symbols.length} symbols`)
+
+    await client.closeDocument(filePath)
+    await client.stop()
+
+    return chunks.length > 0 ? chunks : null
+  } catch (error: any) {
+    if (DEBUG) console.log(`[lsp-chunker] Error: ${error.message}`)
+    try {
+      await client.stop()
+    } catch {}
+    return null // Fallback to regex
+  }
+}
+
+/**
+ * Recursively extract chunks from LSP symbols.
+ * Handles nested structures (class methods, nested functions).
+ */
+function extractChunksFromSymbols(
+  symbols: LSPSymbolInformation[],
+  lines: string[],
+  chunks: CodeChunk[],
+  config: CodeChunkConfig,
+  parentClass?: string,
+): void {
+  for (const symbol of symbols) {
+    // Skip non-chunkable symbols (variables, properties, etc.)
+    if (!CHUNKABLE_SYMBOLS.has(symbol.kind)) continue
+
+    const startLine = symbol.range.start.line
+    const endLine = symbol.range.end.line
+
+    // Expand startLine backward to capture leading comments (godoc, JSDoc, docstrings)
+    const commentStartLine = captureLeadingComments(lines, startLine)
+
+    // Extract chunk content
+    const chunkLines = lines.slice(commentStartLine, endLine + 1)
+    const chunkContent = chunkLines.join("\n")
+
+    // Check size constraints
+    if (chunkContent.length < config.min_chunk_size && chunkLines.length < 5) {
+      // Too small — skip (will be captured in gaps)
+      continue
+    }
+
+    // Determine chunk metadata
+    const isClass = symbol.kind === SymbolKind.Class || symbol.kind === SymbolKind.Interface
+    const isFunction = symbol.kind === SymbolKind.Function || symbol.kind === SymbolKind.Method
+
+    const chunk: CodeChunk = {
+      content: chunkContent,
+      start_line: commentStartLine,
+      end_line: endLine,
+    }
+
+    if (isClass) {
+      chunk.class_name = symbol.name
+    }
+    if (isFunction) {
+      chunk.function_name = symbol.name
+      if (parentClass) chunk.class_name = parentClass
+    }
+
+    // If chunk too large → split by children (for classes with many methods)
+    if (chunkContent.length > config.max_chunk_size && symbol.children && symbol.children.length > 0) {
+      if (DEBUG) console.log(`[lsp-chunker] Splitting large ${symbol.kind === SymbolKind.Class ? 'class' : 'symbol'} ${symbol.name}`)
+
+      // For classes: chunk class header + each method separately
+      if (isClass) {
+        // Find first child's start line
+        const firstChildStart = Math.min(...symbol.children.map(c => c.range.start.line))
+
+        // Class header chunk (from comment to first method)
+        const headerLines = lines.slice(commentStartLine, firstChildStart)
+        if (headerLines.join("\n").trim().length > 0) {
+          chunks.push({
+            content: headerLines.join("\n"),
+            class_name: symbol.name,
+            start_line: commentStartLine,
+            end_line: firstChildStart - 1,
+          })
+        }
+
+        // Chunk each method separately (with its comments)
+        extractChunksFromSymbols(symbol.children, lines, chunks, config, symbol.name)
+      } else {
+        // Non-class: chunk children recursively
+        extractChunksFromSymbols(symbol.children, lines, chunks, config, parentClass)
+      }
+    } else {
+      // Chunk fits size limit → add it
+      chunks.push(chunk)
+
+      // Still process children if they exist (nested functions in Go, for example)
+      if (symbol.children && symbol.children.length > 0) {
+        extractChunksFromSymbols(symbol.children, lines, chunks, config, isClass ? symbol.name : parentClass)
+      }
+    }
+  }
+
+  // Sort chunks by start_line
+  chunks.sort((a, b) => (a.start_line ?? 0) - (b.start_line ?? 0))
+}
+
+/**
+ * Capture leading comments above a symbol.
+ * Walks backward from startLine to find godoc, JSDoc, docstrings, etc.
+ *
+ * Handles:
+ * - Go: // comments (consecutive)
+ * - Python: """docstring"""
+ * - JS/TS: /** JSDoc *\/ or // comments
+ * - Rust: /// doc comments
+ * - Java/C#: /** JavaDoc *\/
+ */
+function captureLeadingComments(lines: string[], startLine: number): number {
+  if (startLine <= 0) return startLine
+
+  let commentStart = startLine - 1
+  let foundComment = false
+
+  // Walk backward to find comment block
+  while (commentStart >= 0) {
+    const line = lines[commentStart]
+    const trimmed = line.trim()
+
+    // Empty line
+    if (trimmed === "") {
+      // Allow max 1 blank line between comment and declaration
+      if (foundComment && commentStart > 0) {
+        const prevLine = lines[commentStart - 1].trim()
+        if (isCommentLine(prevLine)) {
+          commentStart--
+          continue
+        }
+      }
+      break
+    }
+
+    // Check if line is a comment
+    if (isCommentLine(trimmed)) {
+      foundComment = true
+      commentStart--
+      continue
+    }
+
+    // Non-comment, non-empty line → stop
+    break
+  }
+
+  return foundComment ? commentStart + 1 : startLine
+}
+
+/**
+ * Check if a line is a documentation comment.
+ */
+function isCommentLine(line: string): boolean {
+  return (
+    line.startsWith("//") || // Go, JS, TS, Rust, C++
+    line.startsWith("///") || // Rust doc comments
+    line.startsWith("#") || // Python
+    line.startsWith("*") || // Inside /** ... */
+    line.startsWith("/**") || // JSDoc/JavaDoc start
+    line.endsWith("*/") || // JSDoc/JavaDoc end
+    line.match(/^("""|''')/) || // Python docstring
+    line.startsWith("<!--") // HTML/Markdown
+  )
+}
+
+/**
+ * Add gap chunks (code between symbols: imports, package decl, constants, etc.)
+ */
+function addGapChunks(chunks: CodeChunk[], lines: string[], config: CodeChunkConfig): void {
+  if (chunks.length === 0) {
+    // No symbols found → chunk entire file
+    chunks.push({
+      content: lines.join("\n"),
+      start_line: 0,
+      end_line: lines.length - 1,
+    })
+    return
+  }
+
+  const gaps: CodeChunk[] = []
+  let lastEnd = -1
+
+  for (const chunk of chunks) {
+    const start = chunk.start_line ?? 0
+
+    // Gap before this chunk
+    if (start > lastEnd + 1) {
+      const gapLines = lines.slice(lastEnd + 1, start)
+      const gapContent = gapLines.join("\n").trim()
+
+      if (gapContent.length >= config.min_chunk_size) {
+        gaps.push({
+          content: gapContent,
+          start_line: lastEnd + 1,
+          end_line: start - 1,
+        })
+      }
+    }
+
+    lastEnd = chunk.end_line ?? start
+  }
+
+  // Trailing gap
+  if (lastEnd < lines.length - 1) {
+    const gapLines = lines.slice(lastEnd + 1)
+    const gapContent = gapLines.join("\n").trim()
+
+    if (gapContent.length >= config.min_chunk_size) {
+      gaps.push({
+        content: gapContent,
+        start_line: lastEnd + 1,
+        end_line: lines.length - 1,
+      })
+    }
+  }
+
+  // Merge gaps into chunks
+  chunks.push(...gaps)
+  chunks.sort((a, b) => (a.start_line ?? 0) - (b.start_line ?? 0))
+}
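
The null return is the fallback signal; a sketch mirroring how chunker-factory.ts consumes it (file path and variables illustrative):

// null → no language mapping, no LSP server, or no symbols: fall back to regex
const lspChunks = await chunkCodeWithLSP("pkg/auth/auth.go", source, DEFAULT_CODE_CONFIG, projectRoot)
const chunks = lspChunks ?? chunkCode(source, DEFAULT_CODE_CONFIG)
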
package/vectorizer/chunkers/markdown-chunker.ts
CHANGED

@@ -13,8 +13,8 @@ export interface MarkdownChunkConfig {
 }

 export const DEFAULT_MD_CONFIG: MarkdownChunkConfig = {
-  min_chunk_size:
-  max_chunk_size:
+  min_chunk_size: 1000, // Merge small sections (headers without content)
+  max_chunk_size: 8000, // Large chunks for docs (SQL schemas, API specs, etc.)
   split_by_headings: true,
   preserve_heading_hierarchy: true,
 }
package/vectorizer/index.ts
CHANGED
@@ -696,8 +696,8 @@ class CodebaseIndexer {
    // Clean content before chunking
    const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);

-    // Semantic chunking
-    const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
+    // Semantic chunking (async for LSP-based chunking)
+    const chunks = await chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG, filePath, this.root);

    // Assign chunk IDs
    const chunksWithIds = this.graphBuilder

@@ -1348,6 +1348,29 @@ class CodebaseIndexer {
    return this._chunkCache.get(chunkId) || null;
  }

+  /**
+   * Find all chunks belonging to a specific file path.
+   * @param {string} filePath - Relative file path (e.g. "src/auth.ts")
+   * @returns {Promise<Array>} Array of chunks from this file
+   */
+  async findChunksByPath(filePath) {
+    // Ensure chunk cache is loaded
+    await this.findChunkById("__force_cache_load__");
+
+    if (!this._chunkCache) return [];
+
+    const chunks = [];
+    for (const chunk of this._chunkCache.values()) {
+      if (chunk.file === filePath) {
+        chunks.push(chunk);
+      }
+    }
+
+    // Sort by chunk_index
+    chunks.sort((a, b) => (a.chunk_index || 0) - (b.chunk_index || 0));
+    return chunks;
+  }
+
  cosineSimilarity(vecA, vecB) {
    let dotProduct = 0;
    let normA = 0;
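
findChunksByPath() is the primitive behind both the path attach mode and expandChunkContext(); a usage sketch:

// Returns the file's chunks sorted by chunk_index, or [] if the file isn't indexed.
const chunks = await indexer.findChunksByPath("src/auth.ts")
// The "__force_cache_load__" lookup above appears to exist only to warm the chunk cache.
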
package/vectorizer.yaml
CHANGED
@@ -23,14 +23,14 @@ vectorizer:
    strategy: "semantic" # fixed | semantic
    markdown:
      split_by_headings: true
-      min_chunk_size:
-      max_chunk_size:
+      min_chunk_size: 1000 # Merge small sections (avoid header-only chunks)
+      max_chunk_size: 8000 # Large chunks for docs (SQL schemas, API specs, etc.)
      preserve_heading_hierarchy: true
    code:
      split_by_functions: true
      include_function_signature: true
-      min_chunk_size:
-      max_chunk_size:
+      min_chunk_size: 600 # Function + godoc/JSDoc (avoid tiny chunks)
+      max_chunk_size: 3000 # Allow larger chunks (class with multiple methods)
    fixed:
      max_chars: 1500
