@comfanion/usethis_search 4.4.0 → 4.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/api.ts +34 -17
- package/cache/manager.ts +30 -19
- package/cli.ts +8 -5
- package/file-indexer.ts +28 -11
- package/hooks/message-before.ts +5 -5
- package/hooks/tool-substitution.ts +4 -120
- package/index.ts +17 -6
- package/package.json +3 -2
- package/tools/codeindex.ts +192 -184
- package/tools/graph.ts +265 -0
- package/tools/read-interceptor.ts +7 -3
- package/tools/search.ts +268 -190
- package/tools/workspace-state.ts +1 -2
- package/tools/workspace.ts +76 -108
- package/vectorizer/analyzers/lsp-client.ts +52 -6
- package/vectorizer/chunkers/chunker-factory.ts +6 -0
- package/vectorizer/chunkers/code-chunker.ts +73 -16
- package/vectorizer/chunkers/lsp-chunker.ts +313 -191
- package/vectorizer/graph-db.ts +6 -4
- package/vectorizer/index.ts +328 -132
- package/vectorizer/usage-tracker.ts +36 -0
- package/vectorizer.yaml +2 -2
|
@@ -2,15 +2,15 @@
|
|
|
2
2
|
* LSP-Based Code Chunker
|
|
3
3
|
*
|
|
4
4
|
* Uses Language Server Protocol to get AST-accurate function/class boundaries.
|
|
5
|
-
* Captures godoc/JSDoc comments that belong to each symbol.
|
|
6
5
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
6
|
+
* Algorithm:
|
|
7
|
+
* 1. Get flat list of symbols from LSP (functions, methods, classes)
|
|
8
|
+
* 2. Sort by start_line
|
|
9
|
+
* 3. For each symbol: everything between previous symbol's end and this symbol's start
|
|
10
|
+
* (comments, blank lines, decorators) → prepend to this symbol's chunk
|
|
11
|
+
* 4. Content before first symbol → separate "header" chunk (imports, package decl)
|
|
12
|
+
* 5. Large classes → split into header + individual methods
|
|
13
|
+
* 6. No duplicate/overlapping chunks — each line belongs to exactly one chunk
|
|
14
14
|
*/
|
|
15
15
|
|
|
16
16
|
import { LSPClient, LSPSymbolInformation, SymbolKind } from "../analyzers/lsp-client.ts"
|
|
@@ -25,7 +25,6 @@ const CHUNKABLE_SYMBOLS = new Set([
|
|
|
25
25
|
SymbolKind.Class,
|
|
26
26
|
SymbolKind.Interface,
|
|
27
27
|
SymbolKind.Enum,
|
|
28
|
-
// Note: Struct is not in SymbolKind — Go structs appear as Class
|
|
29
28
|
])
|
|
30
29
|
|
|
31
30
|
/** Map file extension to LSP language ID */
|
|
@@ -43,9 +42,17 @@ const EXT_TO_LANGUAGE: Record<string, string> = {
|
|
|
43
42
|
cs: "csharp",
|
|
44
43
|
}
|
|
45
44
|
|
|
45
|
+
interface FlatSymbol {
|
|
46
|
+
name: string
|
|
47
|
+
startLine: number
|
|
48
|
+
endLine: number
|
|
49
|
+
functionName?: string
|
|
50
|
+
className?: string
|
|
51
|
+
}
|
|
52
|
+
|
|
46
53
|
/**
|
|
47
54
|
* Chunk code using LSP documentSymbol API.
|
|
48
|
-
*
|
|
55
|
+
* Returns null if LSP unavailable → caller falls back to regex chunker.
|
|
49
56
|
*/
|
|
50
57
|
export async function chunkCodeWithLSP(
|
|
51
58
|
filePath: string,
|
|
@@ -53,43 +60,34 @@ export async function chunkCodeWithLSP(
|
|
|
53
60
|
config: CodeChunkConfig,
|
|
54
61
|
projectRoot?: string,
|
|
55
62
|
): Promise<CodeChunk[] | null> {
|
|
56
|
-
// Check if LSP available for this language
|
|
57
63
|
const ext = filePath.split(".").pop() || ""
|
|
58
64
|
const language = EXT_TO_LANGUAGE[ext]
|
|
59
65
|
if (!language) {
|
|
60
66
|
if (DEBUG) console.log(`[lsp-chunker] No language mapping for .${ext}`)
|
|
61
|
-
return null
|
|
67
|
+
return null
|
|
62
68
|
}
|
|
63
69
|
|
|
64
70
|
const available = await LSPClient.isAvailable(language)
|
|
65
71
|
if (!available) {
|
|
66
72
|
if (DEBUG) console.log(`[lsp-chunker] LSP not available for ${language}`)
|
|
67
|
-
return null
|
|
73
|
+
return null
|
|
68
74
|
}
|
|
69
75
|
|
|
70
|
-
|
|
71
|
-
const client = new LSPClient(language, projectRoot)
|
|
76
|
+
const client = new LSPClient(projectRoot || process.cwd())
|
|
72
77
|
try {
|
|
73
|
-
await client.start()
|
|
78
|
+
await client.start(language)
|
|
74
79
|
await client.openDocument(filePath, content)
|
|
75
80
|
|
|
76
|
-
// Get document symbols
|
|
77
81
|
const symbols = await client.documentSymbol(filePath)
|
|
78
82
|
if (!symbols || symbols.length === 0) {
|
|
79
83
|
if (DEBUG) console.log(`[lsp-chunker] No symbols found in ${filePath}`)
|
|
80
|
-
return null
|
|
84
|
+
return null
|
|
81
85
|
}
|
|
82
86
|
|
|
83
87
|
const lines = content.split("\n")
|
|
84
|
-
const chunks
|
|
85
|
-
|
|
86
|
-
// Extract chunks from symbols (recursive for nested symbols)
|
|
87
|
-
extractChunksFromSymbols(symbols, lines, chunks, config)
|
|
88
|
-
|
|
89
|
-
// Add gaps (code between symbols: imports, package declarations, etc.)
|
|
90
|
-
addGapChunks(chunks, lines, config)
|
|
88
|
+
const chunks = buildChunks(symbols, lines, config)
|
|
91
89
|
|
|
92
|
-
if (DEBUG) console.log(`[lsp-chunker]
|
|
90
|
+
if (DEBUG) console.log(`[lsp-chunker] ${filePath}: ${chunks.length} chunks from ${symbols.length} symbols`)
|
|
93
91
|
|
|
94
92
|
await client.closeDocument(filePath)
|
|
95
93
|
await client.stop()
|
|
@@ -97,220 +95,344 @@ export async function chunkCodeWithLSP(
|
|
|
97
95
|
return chunks.length > 0 ? chunks : null
|
|
98
96
|
} catch (error: any) {
|
|
99
97
|
if (DEBUG) console.log(`[lsp-chunker] Error: ${error.message}`)
|
|
100
|
-
try {
|
|
101
|
-
|
|
102
|
-
} catch {}
|
|
103
|
-
return null // Fallback to regex
|
|
98
|
+
try { await client.stop() } catch {}
|
|
99
|
+
return null
|
|
104
100
|
}
|
|
105
101
|
}
|
|
106
102
|
|
|
107
103
|
/**
|
|
108
|
-
*
|
|
109
|
-
*
|
|
104
|
+
* Build non-overlapping chunks from LSP symbols.
|
|
105
|
+
*
|
|
106
|
+
* Each line in the file belongs to exactly one chunk.
|
|
107
|
+
* Comments/gaps between symbols are prepended to the next symbol.
|
|
108
|
+
* Content before the first symbol becomes a "header" chunk.
|
|
110
109
|
*/
|
|
111
|
-
function
|
|
110
|
+
function buildChunks(
|
|
112
111
|
symbols: LSPSymbolInformation[],
|
|
113
112
|
lines: string[],
|
|
114
|
-
chunks: CodeChunk[],
|
|
115
113
|
config: CodeChunkConfig,
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
// Skip non-chunkable symbols (variables, properties, etc.)
|
|
120
|
-
if (!CHUNKABLE_SYMBOLS.has(symbol.kind)) continue
|
|
114
|
+
): CodeChunk[] {
|
|
115
|
+
// Step 1: Flatten symbols into a sorted list of non-overlapping ranges
|
|
116
|
+
const flat = flattenSymbols(symbols, config)
|
|
121
117
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
const commentStartLine = captureLeadingComments(lines, startLine)
|
|
118
|
+
if (flat.length === 0) {
|
|
119
|
+
// No chunkable symbols — return whole file as one chunk
|
|
120
|
+
return [{ content: lines.join("\n"), start_line: 0, end_line: lines.length - 1 }]
|
|
121
|
+
}
|
|
127
122
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
const chunkContent = chunkLines.join("\n")
|
|
123
|
+
const chunks: CodeChunk[] = []
|
|
124
|
+
let lastEnd = -1
|
|
131
125
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
126
|
+
for (let i = 0; i < flat.length; i++) {
|
|
127
|
+
const sym = flat[i]
|
|
128
|
+
|
|
129
|
+
// Gap between previous symbol end and this symbol start
|
|
130
|
+
// → prepend to this symbol (comments, decorators, blank lines)
|
|
131
|
+
const chunkStart = lastEnd + 1
|
|
132
|
+
const chunkEnd = sym.endLine
|
|
133
|
+
|
|
134
|
+
// But if there's a large gap with real code before first symbol → separate header chunk
|
|
135
|
+
if (i === 0 && chunkStart < sym.startLine) {
|
|
136
|
+
const headerContent = lines.slice(chunkStart, sym.startLine).join("\n").trimEnd()
|
|
137
|
+
if (headerContent.length > 0 && hasRealCode(headerContent)) {
|
|
138
|
+
chunks.push({
|
|
139
|
+
content: headerContent,
|
|
140
|
+
start_line: chunkStart,
|
|
141
|
+
end_line: sym.startLine - 1,
|
|
142
|
+
})
|
|
143
|
+
// Symbol chunk starts at its own startLine (no gap prepended)
|
|
144
|
+
const symContent = lines.slice(sym.startLine, chunkEnd + 1).join("\n")
|
|
145
|
+
chunks.push({
|
|
146
|
+
content: symContent,
|
|
147
|
+
function_name: sym.functionName,
|
|
148
|
+
class_name: sym.className,
|
|
149
|
+
start_line: sym.startLine,
|
|
150
|
+
end_line: chunkEnd,
|
|
151
|
+
})
|
|
152
|
+
lastEnd = chunkEnd
|
|
153
|
+
continue
|
|
154
|
+
}
|
|
136
155
|
}
|
|
137
156
|
|
|
138
|
-
//
|
|
139
|
-
const
|
|
140
|
-
const isFunction = symbol.kind === SymbolKind.Function || symbol.kind === SymbolKind.Method
|
|
157
|
+
// Normal case: gap + symbol → one chunk
|
|
158
|
+
const chunkContent = lines.slice(chunkStart, chunkEnd + 1).join("\n")
|
|
141
159
|
|
|
142
|
-
|
|
160
|
+
chunks.push({
|
|
143
161
|
content: chunkContent,
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
162
|
+
function_name: sym.functionName,
|
|
163
|
+
class_name: sym.className,
|
|
164
|
+
start_line: chunkStart,
|
|
165
|
+
end_line: chunkEnd,
|
|
166
|
+
})
|
|
167
|
+
|
|
168
|
+
lastEnd = chunkEnd
|
|
169
|
+
}
|
|
147
170
|
|
|
148
|
-
|
|
149
|
-
|
|
171
|
+
// Trailing content after last symbol
|
|
172
|
+
if (lastEnd < lines.length - 1) {
|
|
173
|
+
const trailing = lines.slice(lastEnd + 1).join("\n").trimEnd()
|
|
174
|
+
if (trailing.length > 0) {
|
|
175
|
+
// Append to last chunk if small, otherwise separate
|
|
176
|
+
const lastChunk = chunks[chunks.length - 1]
|
|
177
|
+
if (trailing.length < config.min_chunk_size && lastChunk) {
|
|
178
|
+
lastChunk.content += "\n" + trailing
|
|
179
|
+
lastChunk.end_line = lines.length - 1
|
|
180
|
+
} else {
|
|
181
|
+
chunks.push({
|
|
182
|
+
content: trailing,
|
|
183
|
+
start_line: lastEnd + 1,
|
|
184
|
+
end_line: lines.length - 1,
|
|
185
|
+
})
|
|
186
|
+
}
|
|
150
187
|
}
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
// Split any chunk that's still too large
|
|
191
|
+
const result: CodeChunk[] = []
|
|
192
|
+
for (const chunk of chunks) {
|
|
193
|
+
if (chunk.content.length > config.max_chunk_size) {
|
|
194
|
+
result.push(...splitLargeChunk(chunk, lines, config))
|
|
195
|
+
} else {
|
|
196
|
+
result.push(chunk)
|
|
154
197
|
}
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
// Filter out empty/trivial chunks
|
|
201
|
+
return result.filter(c => {
|
|
202
|
+
const trimmed = c.content.trim()
|
|
203
|
+
if (trimmed.length === 0) return false
|
|
204
|
+
if (c.function_name || c.class_name) return true
|
|
205
|
+
if (trimmed.length < 50) return false
|
|
206
|
+
return true
|
|
207
|
+
})
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
/**
|
|
211
|
+
* Flatten LSP symbol tree into a sorted, non-overlapping list.
|
|
212
|
+
*
|
|
213
|
+
* For small classes (< max_chunk_size): one chunk for the whole class.
|
|
214
|
+
* For large classes: class header + individual methods.
|
|
215
|
+
* Nested callbacks/arrow functions inside a function → NOT separate chunks.
|
|
216
|
+
*/
|
|
217
|
+
function flattenSymbols(
|
|
218
|
+
symbols: LSPSymbolInformation[],
|
|
219
|
+
config: CodeChunkConfig,
|
|
220
|
+
parentClass?: string,
|
|
221
|
+
): FlatSymbol[] {
|
|
222
|
+
const result: FlatSymbol[] = []
|
|
223
|
+
|
|
224
|
+
for (const sym of symbols) {
|
|
225
|
+
if (!CHUNKABLE_SYMBOLS.has(sym.kind)) continue
|
|
155
226
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
227
|
+
const startLine = sym.range.start.line
|
|
228
|
+
const endLine = sym.range.end.line
|
|
229
|
+
const isClass = sym.kind === SymbolKind.Class || sym.kind === SymbolKind.Interface || sym.kind === SymbolKind.Enum
|
|
230
|
+
const isFunction = sym.kind === SymbolKind.Function || sym.kind === SymbolKind.Method
|
|
231
|
+
|
|
232
|
+
if (isClass && sym.children && sym.children.length > 0) {
|
|
233
|
+
// Check if class content is too large → split into methods
|
|
234
|
+
// Estimate size: (endLine - startLine) * ~40 chars per line
|
|
235
|
+
const estimatedSize = (endLine - startLine + 1) * 40
|
|
159
236
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
const firstChildStart = Math.min(...symbol.children.map(c => c.range.start.line))
|
|
237
|
+
if (estimatedSize > config.max_chunk_size) {
|
|
238
|
+
// Large class → flatten children (methods) as separate symbols
|
|
239
|
+
const methods = flattenSymbols(sym.children, config, sym.name)
|
|
164
240
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
241
|
+
if (methods.length > 0) {
|
|
242
|
+
// Class header: from class start to first method
|
|
243
|
+
const firstMethodStart = Math.min(...methods.map(m => m.startLine))
|
|
244
|
+
if (firstMethodStart > startLine) {
|
|
245
|
+
result.push({
|
|
246
|
+
name: sym.name,
|
|
247
|
+
startLine,
|
|
248
|
+
endLine: firstMethodStart - 1,
|
|
249
|
+
className: sym.name,
|
|
250
|
+
})
|
|
251
|
+
}
|
|
252
|
+
result.push(...methods)
|
|
253
|
+
|
|
254
|
+
// Class tail: from last method end to class end
|
|
255
|
+
const lastMethodEnd = Math.max(...methods.map(m => m.endLine))
|
|
256
|
+
if (lastMethodEnd < endLine) {
|
|
257
|
+
result.push({
|
|
258
|
+
name: `${sym.name}::tail`,
|
|
259
|
+
startLine: lastMethodEnd + 1,
|
|
260
|
+
endLine,
|
|
261
|
+
className: sym.name,
|
|
262
|
+
})
|
|
263
|
+
}
|
|
264
|
+
} else {
|
|
265
|
+
// No chunkable children → whole class as one chunk
|
|
266
|
+
result.push({
|
|
267
|
+
name: sym.name,
|
|
268
|
+
startLine,
|
|
269
|
+
endLine,
|
|
270
|
+
className: sym.name,
|
|
173
271
|
})
|
|
174
272
|
}
|
|
175
|
-
|
|
176
|
-
// Chunk each method separately (with its comments)
|
|
177
|
-
extractChunksFromSymbols(symbol.children, lines, chunks, config, symbol.name)
|
|
178
273
|
} else {
|
|
179
|
-
//
|
|
180
|
-
|
|
274
|
+
// Small class → one chunk, NO children
|
|
275
|
+
result.push({
|
|
276
|
+
name: sym.name,
|
|
277
|
+
startLine,
|
|
278
|
+
endLine,
|
|
279
|
+
className: sym.name,
|
|
280
|
+
})
|
|
181
281
|
}
|
|
182
|
-
} else {
|
|
183
|
-
//
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
282
|
+
} else if (isFunction) {
|
|
283
|
+
// Check if function is too large AND has chunkable children
|
|
284
|
+
// (e.g. describe() with it() blocks, or large function with nested functions)
|
|
285
|
+
const estimatedSize = (endLine - startLine + 1) * 40
|
|
286
|
+
const chunkableChildren = sym.children?.filter(c => CHUNKABLE_SYMBOLS.has(c.kind)) || []
|
|
287
|
+
|
|
288
|
+
if (estimatedSize > config.max_chunk_size && chunkableChildren.length > 0) {
|
|
289
|
+
// Large function with children → split like a class
|
|
290
|
+
const children = flattenSymbols(sym.children!, config, parentClass)
|
|
291
|
+
|
|
292
|
+
if (children.length > 0) {
|
|
293
|
+
// Function header: from start to first child
|
|
294
|
+
const firstChildStart = Math.min(...children.map(m => m.startLine))
|
|
295
|
+
if (firstChildStart > startLine) {
|
|
296
|
+
result.push({
|
|
297
|
+
name: sym.name,
|
|
298
|
+
startLine,
|
|
299
|
+
endLine: firstChildStart - 1,
|
|
300
|
+
functionName: sym.name,
|
|
301
|
+
className: parentClass,
|
|
302
|
+
})
|
|
303
|
+
}
|
|
304
|
+
result.push(...children)
|
|
305
|
+
|
|
306
|
+
// Function tail: from last child end to function end
|
|
307
|
+
const lastChildEnd = Math.max(...children.map(m => m.endLine))
|
|
308
|
+
if (lastChildEnd < endLine) {
|
|
309
|
+
result.push({
|
|
310
|
+
name: `${sym.name}::tail`,
|
|
311
|
+
startLine: lastChildEnd + 1,
|
|
312
|
+
endLine,
|
|
313
|
+
functionName: sym.name,
|
|
314
|
+
className: parentClass,
|
|
315
|
+
})
|
|
316
|
+
}
|
|
317
|
+
} else {
|
|
318
|
+
// No chunkable children found → keep as one chunk
|
|
319
|
+
result.push({
|
|
320
|
+
name: sym.name,
|
|
321
|
+
startLine,
|
|
322
|
+
endLine,
|
|
323
|
+
functionName: sym.name,
|
|
324
|
+
className: parentClass,
|
|
325
|
+
})
|
|
326
|
+
}
|
|
327
|
+
} else {
|
|
328
|
+
// Small function or no children → one chunk, NO nested callbacks
|
|
329
|
+
result.push({
|
|
330
|
+
name: sym.name,
|
|
331
|
+
startLine,
|
|
332
|
+
endLine,
|
|
333
|
+
functionName: sym.name,
|
|
334
|
+
className: parentClass,
|
|
335
|
+
})
|
|
189
336
|
}
|
|
337
|
+
} else {
|
|
338
|
+
// Interface, Enum without children
|
|
339
|
+
result.push({
|
|
340
|
+
name: sym.name,
|
|
341
|
+
startLine,
|
|
342
|
+
endLine,
|
|
343
|
+
className: sym.name,
|
|
344
|
+
})
|
|
190
345
|
}
|
|
191
346
|
}
|
|
192
347
|
|
|
193
|
-
// Sort
|
|
194
|
-
|
|
348
|
+
// Sort by start line and remove overlaps
|
|
349
|
+
result.sort((a, b) => a.startLine - b.startLine)
|
|
350
|
+
return deduplicateRanges(result)
|
|
195
351
|
}
|
|
196
352
|
|
|
197
353
|
/**
|
|
198
|
-
*
|
|
199
|
-
*
|
|
200
|
-
*
|
|
201
|
-
* Handles:
|
|
202
|
-
* - Go: // comments (consecutive)
|
|
203
|
-
* - Python: """docstring"""
|
|
204
|
-
* - JS/TS: /** JSDoc *\/ or // comments
|
|
205
|
-
* - Rust: /// doc comments
|
|
206
|
-
* - Java/C#: /** JavaDoc *\/
|
|
354
|
+
* Remove overlapping ranges — keep the more specific (smaller) one.
|
|
355
|
+
* After sorting by startLine, if B starts inside A, keep whichever is smaller.
|
|
207
356
|
*/
|
|
208
|
-
function
|
|
209
|
-
if (
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
if (isCommentLine(prevLine)) {
|
|
225
|
-
commentStart--
|
|
226
|
-
continue
|
|
227
|
-
}
|
|
357
|
+
function deduplicateRanges(symbols: FlatSymbol[]): FlatSymbol[] {
|
|
358
|
+
if (symbols.length <= 1) return symbols
|
|
359
|
+
|
|
360
|
+
const result: FlatSymbol[] = [symbols[0]]
|
|
361
|
+
|
|
362
|
+
for (let i = 1; i < symbols.length; i++) {
|
|
363
|
+
const prev = result[result.length - 1]
|
|
364
|
+
const curr = symbols[i]
|
|
365
|
+
|
|
366
|
+
if (curr.startLine <= prev.endLine) {
|
|
367
|
+
// Overlap — keep the one that's NOT a container
|
|
368
|
+
// If prev contains curr entirely → prev is a class stub, skip it and keep curr
|
|
369
|
+
// If curr is inside prev → skip curr (it's a nested callback)
|
|
370
|
+
if (curr.startLine >= prev.startLine && curr.endLine <= prev.endLine) {
|
|
371
|
+
// curr is inside prev — skip curr (nested callback/arrow fn)
|
|
372
|
+
continue
|
|
228
373
|
}
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
// Check if line is a comment
|
|
233
|
-
if (isCommentLine(trimmed)) {
|
|
234
|
-
foundComment = true
|
|
235
|
-
commentStart--
|
|
236
|
-
continue
|
|
374
|
+
// Partial overlap — adjust prev.endLine
|
|
375
|
+
prev.endLine = Math.min(prev.endLine, curr.startLine - 1)
|
|
237
376
|
}
|
|
238
377
|
|
|
239
|
-
|
|
240
|
-
break
|
|
378
|
+
result.push(curr)
|
|
241
379
|
}
|
|
242
380
|
|
|
243
|
-
return
|
|
381
|
+
return result
|
|
244
382
|
}
|
|
245
383
|
|
|
246
384
|
/**
|
|
247
|
-
* Check if
|
|
385
|
+
* Check if content has real code (not just comments/whitespace/braces).
|
|
248
386
|
*/
|
|
249
|
-
function
|
|
250
|
-
return (
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
line.startsWith("<!--") // HTML/Markdown
|
|
259
|
-
)
|
|
387
|
+
function hasRealCode(content: string): boolean {
|
|
388
|
+
return content.split("\n").some(l => {
|
|
389
|
+
const t = l.trim()
|
|
390
|
+
if (t.length === 0) return false
|
|
391
|
+
if (t.startsWith("//") || t.startsWith("/*") || t.startsWith("*") || t.startsWith("*/")) return false
|
|
392
|
+
if (t.startsWith("#") || t.startsWith("<!--")) return false
|
|
393
|
+
if (/^[{}()\[\];,]+$/.test(t)) return false
|
|
394
|
+
return true
|
|
395
|
+
})
|
|
260
396
|
}
|
|
261
397
|
|
|
262
398
|
/**
|
|
263
|
-
*
|
|
399
|
+
* Split a large chunk by line count, preserving metadata.
|
|
264
400
|
*/
|
|
265
|
-
function
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
if (gapContent.length >= config.min_chunk_size) {
|
|
288
|
-
gaps.push({
|
|
289
|
-
content: gapContent,
|
|
290
|
-
start_line: lastEnd + 1,
|
|
291
|
-
end_line: start - 1,
|
|
292
|
-
})
|
|
293
|
-
}
|
|
401
|
+
function splitLargeChunk(chunk: CodeChunk, allLines: string[], config: CodeChunkConfig): CodeChunk[] {
|
|
402
|
+
const chunkLines = chunk.content.split("\n")
|
|
403
|
+
const baseLine = chunk.start_line || 0
|
|
404
|
+
const parts: CodeChunk[] = []
|
|
405
|
+
let current: string[] = []
|
|
406
|
+
let currentLen = 0
|
|
407
|
+
let startLine = baseLine
|
|
408
|
+
|
|
409
|
+
for (let i = 0; i < chunkLines.length; i++) {
|
|
410
|
+
const line = chunkLines[i]
|
|
411
|
+
if (currentLen + line.length + 1 > config.max_chunk_size && current.length > 0) {
|
|
412
|
+
parts.push({
|
|
413
|
+
content: current.join("\n"),
|
|
414
|
+
function_name: chunk.function_name,
|
|
415
|
+
class_name: chunk.class_name,
|
|
416
|
+
start_line: startLine,
|
|
417
|
+
end_line: baseLine + i - 1,
|
|
418
|
+
})
|
|
419
|
+
current = []
|
|
420
|
+
currentLen = 0
|
|
421
|
+
startLine = baseLine + i
|
|
294
422
|
}
|
|
295
|
-
|
|
296
|
-
|
|
423
|
+
current.push(line)
|
|
424
|
+
currentLen += line.length + 1
|
|
297
425
|
}
|
|
298
426
|
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
start_line: lastEnd + 1,
|
|
308
|
-
end_line: lines.length - 1,
|
|
309
|
-
})
|
|
310
|
-
}
|
|
427
|
+
if (current.length > 0) {
|
|
428
|
+
parts.push({
|
|
429
|
+
content: current.join("\n"),
|
|
430
|
+
function_name: chunk.function_name,
|
|
431
|
+
class_name: chunk.class_name,
|
|
432
|
+
start_line: startLine,
|
|
433
|
+
end_line: baseLine + chunkLines.length - 1,
|
|
434
|
+
})
|
|
311
435
|
}
|
|
312
436
|
|
|
313
|
-
|
|
314
|
-
chunks.push(...gaps)
|
|
315
|
-
chunks.sort((a, b) => (a.start_line ?? 0) - (b.start_line ?? 0))
|
|
437
|
+
return parts
|
|
316
438
|
}
|
package/vectorizer/graph-db.ts
CHANGED
|
@@ -24,6 +24,7 @@ export class GraphDB {
|
|
|
24
24
|
private _stmtBySubjectPredicate: any = null
|
|
25
25
|
private _stmtByPredicate: any = null
|
|
26
26
|
private _stmtAll: any = null
|
|
27
|
+
private _stmtDeleteMeta: any = null
|
|
27
28
|
|
|
28
29
|
constructor(private dbPath: string) {}
|
|
29
30
|
|
|
@@ -69,6 +70,7 @@ export class GraphDB {
|
|
|
69
70
|
this._stmtBySubjectPredicate = this.db.prepare("SELECT * FROM triples WHERE subject = ? AND predicate = ?")
|
|
70
71
|
this._stmtByPredicate = this.db.prepare("SELECT * FROM triples WHERE predicate = ?")
|
|
71
72
|
this._stmtAll = this.db.prepare("SELECT * FROM triples")
|
|
73
|
+
this._stmtDeleteMeta = this.db.prepare("DELETE FROM triples WHERE subject = ? AND predicate = ?")
|
|
72
74
|
|
|
73
75
|
this.initialized = true
|
|
74
76
|
return this
|
|
@@ -135,6 +137,7 @@ export class GraphDB {
|
|
|
135
137
|
this._stmtBySubjectPredicate = null
|
|
136
138
|
this._stmtByPredicate = null
|
|
137
139
|
this._stmtAll = null
|
|
140
|
+
this._stmtDeleteMeta = null
|
|
138
141
|
this.initialized = false
|
|
139
142
|
}
|
|
140
143
|
}
|
|
@@ -173,8 +176,7 @@ export class GraphDB {
|
|
|
173
176
|
if (!this.initialized || !this.db) throw new Error("GraphDB not initialized. Call init() first.")
|
|
174
177
|
|
|
175
178
|
try {
|
|
176
|
-
this.
|
|
177
|
-
.run(`meta:${filePath}`, "graph_built")
|
|
179
|
+
this._stmtDeleteMeta.run(`meta:${filePath}`, "graph_built")
|
|
178
180
|
} catch {
|
|
179
181
|
// Silently ignore errors
|
|
180
182
|
}
|
|
@@ -266,8 +268,8 @@ export class GraphDB {
|
|
|
266
268
|
})
|
|
267
269
|
}
|
|
268
270
|
}
|
|
269
|
-
} catch
|
|
270
|
-
|
|
271
|
+
} catch {
|
|
272
|
+
// Non-fatal — skip node on error (corrupted edge, closed DB, etc.)
|
|
271
273
|
}
|
|
272
274
|
}
|
|
273
275
|
|