@comfanion/usethis_search 4.2.0-dev.3 → 4.2.0-dev.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@comfanion/usethis_search",
3
- "version": "4.2.0-dev.3",
3
+ "version": "4.2.0-dev.4",
4
4
  "description": "OpenCode plugin: semantic search with chunk-based workspace injection (v4.2-dev: chunk-level context, granular detach, improved token efficiency)",
5
5
  "type": "module",
6
6
  "main": "./index.ts",
@@ -15,7 +15,7 @@
15
15
  "index:clear": "bun run cli.ts clear"
16
16
  },
17
17
  "bin": {
18
- "usethis-search": "./cli.ts"
18
+ "usethis-search": "cli.ts"
19
19
  },
20
20
  "files": [
21
21
  "index.ts",
@@ -45,6 +45,7 @@
45
45
  "vectorizer/analyzers/lsp-client.ts",
46
46
  "vectorizer/chunkers/markdown-chunker.ts",
47
47
  "vectorizer/chunkers/code-chunker.ts",
48
+ "vectorizer/chunkers/lsp-chunker.ts",
48
49
  "vectorizer/chunkers/chunker-factory.ts",
49
50
  "vectorizer.yaml",
50
51
  "README.md",
@@ -0,0 +1,316 @@
1
+ /**
2
+ * LSP-Based Code Chunker
3
+ *
4
+ * Uses Language Server Protocol to get AST-accurate function/class boundaries.
5
+ * Captures godoc/JSDoc comments that belong to each symbol.
6
+ *
7
+ * Advantages over regex-chunker:
8
+ * - ✅ Accurate AST parsing (no brace counting bugs)
9
+ * - ✅ Captures leading documentation comments (godoc, JSDoc, docstrings)
10
+ * - ✅ Handles nested structures (class methods, nested functions)
11
+ * - ✅ Language-agnostic (works for Go, TS, Python, Rust, Java, etc.)
12
+ *
13
+ * Fallback: If LSP unavailable → use regex-chunker
14
+ */
15
+
16
+ import { LSPClient, LSPSymbolInformation, SymbolKind } from "../analyzers/lsp-client.ts"
17
+ import type { CodeChunk, CodeChunkConfig } from "./code-chunker.ts"
18
+
19
/** Verbose logging switch: on only when DEBUG_LSP_CHUNKER is exactly "true". */
const DEBUG = process.env.DEBUG_LSP_CHUNKER === "true"

/**
 * LSP symbol kinds that become their own chunk.
 * Every other kind (variables, properties, ...) is skipped by the extractor.
 */
const CHUNKABLE_SYMBOLS = new Set([
  // Callables
  SymbolKind.Function,
  SymbolKind.Method,
  // Type-like declarations
  // Note: Struct is not in SymbolKind — Go structs appear as Class
  SymbolKind.Class,
  SymbolKind.Interface,
  SymbolKind.Enum,
])
30
+
31
/**
 * Lookup table: file extension (without the dot) → LSP language ID.
 * An extension that is missing here has no LSP chunking support.
 */
const EXT_TO_LANGUAGE: Record<string, string> = {
  // JavaScript / TypeScript family
  "ts": "typescript",
  "js": "javascript",
  "tsx": "typescriptreact",
  "jsx": "javascriptreact",

  // Scripting
  "py": "python",

  // Compiled / systems languages
  "go": "go",
  "rs": "rust",
  "java": "java",
  "cpp": "cpp",
  "c": "c",
  "cs": "csharp",
}
45
+
46
/**
 * Chunk code using the LSP documentSymbol API.
 *
 * Returns `null` whenever LSP-based chunking is not possible (unknown
 * extension, no server available, no symbols reported, or any error while
 * talking to the server) so the caller can fall back to the regex chunker.
 *
 * The LSP client is always shut down before this function returns, on every
 * exit path, so no language-server session is left behind.
 *
 * @param filePath    Path of the file being chunked; its extension selects the language.
 * @param content     Full text of the file, sent to the server and sliced into chunks.
 * @param config      Chunk size limits, forwarded to the extraction helpers.
 * @param projectRoot Optional project root handed to the LSP client.
 * @returns The generated chunks, or `null` to request the regex fallback.
 */
export async function chunkCodeWithLSP(
  filePath: string,
  content: string,
  config: CodeChunkConfig,
  projectRoot?: string,
): Promise<CodeChunk[] | null> {
  // Check if LSP is available for this language
  const ext = filePath.split(".").pop() || ""
  const language = EXT_TO_LANGUAGE[ext]
  if (!language) {
    if (DEBUG) console.log(`[lsp-chunker] No language mapping for .${ext}`)
    return null // Fallback to regex
  }

  const available = await LSPClient.isAvailable(language)
  if (!available) {
    if (DEBUG) console.log(`[lsp-chunker] LSP not available for ${language}`)
    return null // Fallback to regex
  }

  const client = new LSPClient(language, projectRoot)
  // Tracks whether openDocument succeeded, so cleanup only closes what was opened.
  let documentOpen = false

  try {
    await client.start()
    await client.openDocument(filePath, content)
    documentOpen = true

    const symbols = await client.documentSymbol(filePath)
    if (!symbols || symbols.length === 0) {
      if (DEBUG) console.log(`[lsp-chunker] No symbols found in ${filePath}`)
      return null // Fallback to regex (cleanup happens in `finally`)
    }

    const lines = content.split("\n")
    const chunks: CodeChunk[] = []

    // Extract chunks from symbols (recursive for nested symbols)
    extractChunksFromSymbols(symbols, lines, chunks, config)

    // Add gaps (code between symbols: imports, package declarations, etc.)
    addGapChunks(chunks, lines, config)

    if (DEBUG) console.log(`[lsp-chunker] Generated ${chunks.length} chunks from ${symbols.length} symbols`)

    return chunks.length > 0 ? chunks : null
  } catch (error: unknown) {
    if (DEBUG) {
      const message = error instanceof Error ? error.message : String(error)
      console.log(`[lsp-chunker] Error: ${message}`)
    }
    return null // Fallback to regex
  } finally {
    // Best-effort cleanup: a failure while shutting down must neither leak the
    // client nor throw away chunks that were already produced.
    if (documentOpen) {
      try {
        await client.closeDocument(filePath)
      } catch {
        // Ignored on purpose: the client is stopped right below anyway.
      }
    }
    try {
      await client.stop()
    } catch {
      // Ignored on purpose: nothing more can be done if stop() itself fails.
    }
  }
}
106
+
107
/**
 * Walk an LSP symbol tree and append one chunk per chunkable symbol.
 *
 * For each symbol whose kind is in CHUNKABLE_SYMBOLS:
 * - the chunk spans from its leading comment block to the symbol's end line;
 * - it is skipped when it is both shorter than `config.min_chunk_size`
 *   characters and fewer than 5 lines;
 * - when it exceeds `config.max_chunk_size` and has children, it is replaced by
 *   its children (classes/interfaces additionally emit a header chunk covering
 *   everything before the first child);
 * - otherwise the chunk is emitted as-is, and its children are still visited.
 *
 * `chunks` is mutated in place and re-sorted by `start_line` before returning.
 *
 * @param symbols     Symbols to process at this nesting level.
 * @param lines       File content split into lines.
 * @param chunks      Output array; new chunks are appended to it.
 * @param config      Size limits (`min_chunk_size`, `max_chunk_size`).
 * @param parentClass Name of the enclosing class/interface, if any.
 */
function extractChunksFromSymbols(
  symbols: LSPSymbolInformation[],
  lines: string[],
  chunks: CodeChunk[],
  config: CodeChunkConfig,
  parentClass?: string,
): void {
  for (const symbol of symbols) {
    // Variables, properties, etc. never get a chunk of their own.
    if (!CHUNKABLE_SYMBOLS.has(symbol.kind)) continue

    // Pull any leading doc comment (godoc, JSDoc, ...) into the chunk.
    const firstLine = captureLeadingComments(lines, symbol.range.start.line)
    const lastLine = symbol.range.end.line

    const body = lines.slice(firstLine, lastLine + 1)
    const text = body.join("\n")

    // Too small on both measures — leave it to the gap chunks.
    if (text.length < config.min_chunk_size && body.length < 5) continue

    const kind = symbol.kind
    const isClass = kind === SymbolKind.Class || kind === SymbolKind.Interface
    const isFunction = kind === SymbolKind.Function || kind === SymbolKind.Method

    const children = symbol.children ?? []
    const hasChildren = children.length > 0

    if (text.length > config.max_chunk_size && hasChildren) {
      // Oversized: emit the pieces instead of the whole symbol.
      if (DEBUG) console.log(`[lsp-chunker] Splitting large ${symbol.kind === SymbolKind.Class ? 'class' : 'symbol'} ${symbol.name}`)

      if (!isClass) {
        extractChunksFromSymbols(children, lines, chunks, config, parentClass)
        continue
      }

      // Class header = everything from the comment down to the first child.
      const firstChildStart = Math.min(...children.map(child => child.range.start.line))
      const header = lines.slice(firstLine, firstChildStart).join("\n")
      if (header.trim().length > 0) {
        chunks.push({
          content: header,
          class_name: symbol.name,
          start_line: firstLine,
          end_line: firstChildStart - 1,
        })
      }

      // Members are chunked one by one, tagged with the class name.
      extractChunksFromSymbols(children, lines, chunks, config, symbol.name)
      continue
    }

    // Fits within the size limit: emit the symbol as a single chunk.
    const chunk: CodeChunk = {
      content: text,
      start_line: firstLine,
      end_line: lastLine,
    }
    if (isClass) {
      chunk.class_name = symbol.name
    }
    if (isFunction) {
      chunk.function_name = symbol.name
      if (parentClass) chunk.class_name = parentClass
    }
    chunks.push(chunk)

    // Nested symbols are still visited even though the parent was emitted.
    if (hasChildren) {
      const owner = isClass ? symbol.name : parentClass
      extractChunksFromSymbols(children, lines, chunks, config, owner)
    }
  }

  // Keep the output ordered by starting line.
  chunks.sort((left, right) => (left.start_line ?? 0) - (right.start_line ?? 0))
}
196
+
197
/**
 * Find where the comment block directly above a symbol begins.
 *
 * Scans upward from the line before `startLine` while lines look like comments
 * (as judged by `isCommentLine`). A blank line is stepped over only when a
 * comment line has already been seen below it AND the line above it is also a
 * comment; any other blank line, or any non-comment line, ends the scan.
 *
 * @param lines     File content split into lines.
 * @param startLine Zero-based line where the symbol itself starts.
 * @returns The first line of the attached comment block, or `startLine`
 *          unchanged when no comment line was found.
 */
function captureLeadingComments(lines: string[], startLine: number): number {
  if (startLine <= 0) return startLine

  let sawComment = false
  let cursor = startLine - 1

  for (; cursor >= 0; cursor--) {
    const text = lines[cursor].trim()

    if (text !== "") {
      // First non-comment, non-empty line ends the block.
      if (!isCommentLine(text)) break
      sawComment = true
      continue
    }

    // Blank line: only bridge it when it sits between two comment lines.
    const bridgesComments =
      sawComment && cursor > 0 && isCommentLine(lines[cursor - 1].trim())
    if (!bridgesComments) break
  }

  // `cursor` rests on the line that stopped the scan (or -1), so the block
  // starts one line below it.
  return sawComment ? cursor + 1 : startLine
}
245
+
246
/**
 * Heuristically decide whether an already-trimmed line looks like a comment.
 *
 * The check is purely prefix/suffix based and language-agnostic, so it can
 * also match code that merely starts with `#` or `*`.
 *
 * @param line A single source line with surrounding whitespace removed.
 * @returns `true` when the line matches one of the comment patterns below.
 */
function isCommentLine(line: string): boolean {
  return (
    line.startsWith("//") || // Line comments: Go, JS, TS, C++; also covers Rust `///`
    line.startsWith("#") || // Python
    line.startsWith("/*") || // Block comment / JSDoc / JavaDoc opener
    line.startsWith("*") || // Continuation line inside a block comment
    line.endsWith("*/") || // Block comment terminator
    line.startsWith('"""') || // Python docstring (double quotes)
    line.startsWith("'''") || // Python docstring (single quotes)
    line.startsWith("<!--") // HTML/Markdown
  )
}
261
+
262
/**
 * Add "gap" chunks for the lines not covered by any existing chunk
 * (imports, package declarations, constants, etc.).
 *
 * `chunks` is mutated in place and ends up sorted by `start_line`.
 * When `chunks` is empty, the whole file becomes a single chunk.
 * A gap is emitted only when its trimmed text is at least
 * `config.min_chunk_size` characters long; its `start_line`/`end_line` still
 * describe the full untrimmed gap.
 *
 * Chunks may overlap or nest (a class chunk plus chunks for its methods), so
 * the covered range is tracked as a running maximum of end lines. Lines inside
 * an enclosing chunk are therefore never reported again as a gap.
 *
 * @param chunks Existing chunks; gap chunks are appended to this array.
 * @param lines  File content split into lines.
 * @param config Supplies `min_chunk_size`, the minimum gap length to keep.
 */
function addGapChunks(chunks: CodeChunk[], lines: string[], config: CodeChunkConfig): void {
  if (chunks.length === 0) {
    // No symbols found → chunk entire file
    chunks.push({
      content: lines.join("\n"),
      start_line: 0,
      end_line: lines.length - 1,
    })
    return
  }

  // The sweep below needs chunks in start order; do not rely on the caller.
  chunks.sort((a, b) => (a.start_line ?? 0) - (b.start_line ?? 0))

  const gaps: CodeChunk[] = []

  // Records the inclusive line range [from, to] as a gap if it is big enough.
  const pushGap = (from: number, to: number): void => {
    const gapContent = lines.slice(from, to + 1).join("\n").trim()
    if (gapContent.length >= config.min_chunk_size) {
      gaps.push({
        content: gapContent,
        start_line: from,
        end_line: to,
      })
    }
  }

  // Highest line index covered so far (-1 = nothing covered yet).
  let lastEnd = -1

  for (const chunk of chunks) {
    const start = chunk.start_line ?? 0

    // Gap before this chunk
    if (start > lastEnd + 1) {
      pushGap(lastEnd + 1, start - 1)
    }

    // Never move backward: a nested chunk may end before its parent does.
    lastEnd = Math.max(lastEnd, chunk.end_line ?? start)
  }

  // Trailing gap
  if (lastEnd < lines.length - 1) {
    pushGap(lastEnd + 1, lines.length - 1)
  }

  // Merge gaps into chunks
  chunks.push(...gaps)
  chunks.sort((a, b) => (a.start_line ?? 0) - (b.start_line ?? 0))
}