@comfanion/usethis_search 4.2.0-dev.2 → 4.2.0-dev.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/cache/manager.ts CHANGED
@@ -47,7 +47,7 @@ export interface WorkspaceEntry {
47
47
  /** MD5 hash of chunk content — used by freshen() to detect changes */
48
48
  contentHash: string
49
49
  /** How this chunk got into workspace */
50
- role: "search-main" | "search-graph" | "manual"
50
+ role: "search-main" | "search-graph" | "search-context" | "manual"
51
51
  /** Timestamp when attached */
52
52
  attachedAt: number
53
53
  /** Search query or "manual" */
@@ -547,6 +547,25 @@ class WorkspaceCache {
547
547
  return removed
548
548
  }
549
549
 
550
+ /**
551
+ * Remove all chunks from a specific file path.
552
+ * Returns number of chunks removed.
553
+ */
554
+ detachByPath(filePath: string): number {
555
+ let removed = 0
556
+
557
+ for (const [chunkId, entry] of this.entries) {
558
+ if (entry.path === filePath) {
559
+ this.entries.delete(chunkId)
560
+ this._totalTokens -= entry.tokens
561
+ removed++
562
+ }
563
+ }
564
+
565
+ if (removed > 0) this.scheduleSave()
566
+ return removed
567
+ }
568
+
550
569
  /**
551
570
  * Get all chunks sorted by: search-main first (by score desc), then search-graph, then manual.
552
571
  */
@@ -103,6 +103,7 @@ export function createWorkspaceInjectionHandler(state: SessionState) {
103
103
 
104
104
  // Group by role for clear structure
105
105
  const mainFiles = entries.filter(e => e.role === "search-main")
106
+ const contextFiles = entries.filter(e => e.role === "search-context")
106
107
  const graphFiles = entries.filter(e => e.role === "search-graph")
107
108
  const manualFiles = entries.filter(e => e.role === "manual")
108
109
 
@@ -111,6 +112,12 @@ export function createWorkspaceInjectionHandler(state: SessionState) {
111
112
  workspace += formatChunksByFile(mainFiles, byFile)
112
113
  }
113
114
 
115
+ // Expanded context (class methods, class headers)
116
+ if (contextFiles.length > 0) {
117
+ workspace += `\n<!-- Expanded context (class methods/headers for completeness) -->\n`
118
+ workspace += formatChunksByFile(contextFiles, byFile)
119
+ }
120
+
114
121
  // Graph relations (imports, extends, used_by)
115
122
  if (graphFiles.length > 0) {
116
123
  workspace += `\n<!-- Search graph relations -->\n`
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@comfanion/usethis_search",
3
- "version": "4.2.0-dev.2",
3
+ "version": "4.2.0-dev.3",
4
4
  "description": "OpenCode plugin: semantic search with chunk-based workspace injection (v4.2-dev: chunk-level context, granular detach, improved token efficiency)",
5
5
  "type": "module",
6
6
  "main": "./index.ts",
package/tools/search.ts CHANGED
@@ -16,6 +16,92 @@ import fs from "fs/promises"
16
16
  import { CodebaseIndexer, getSearchConfig, getIndexer, releaseIndexer } from "../vectorizer/index.ts"
17
17
  import { workspaceCache } from "../cache/manager.ts"
18
18
 
19
+ // ── Context Expansion Helpers ─────────────────────────────────────────────
20
+
21
+ /**
22
+ * Expand chunk context using GRAPH + file-level structural expansion.
23
+ *
24
+ * Strategy:
25
+ * 1. CODE: Class → all methods, Method → class header (via findChunksByPath)
26
+ * 2. DOCS: Section → other sections from same file (via file node in graph)
27
+ * 3. ALL: Use relatedContext from graph (imports, extends, contains, etc.)
28
+ *
29
+ * Note: relatedContext already populated by vectorizer._expandGraphContext()
30
+ * This function adds STRUCTURAL context (same-file chunks).
31
+ */
32
+ async function expandChunkContext(
33
+ mainChunk: any,
34
+ indexer: CodebaseIndexer,
35
+ alreadyAttached: Set<string>,
36
+ ): Promise<Array<{ chunk: any; reason: string }>> {
37
+ const expanded: Array<{ chunk: any; reason: string }> = []
38
+
39
+ // ══════════════════════════════════════════════════════════════════════
40
+ // STRUCTURAL EXPANSION: Same-file chunks for completeness
41
+ // ══════════════════════════════════════════════════════════════════════
42
+
43
+ // CODE: Class → add ALL its methods
44
+ if (mainChunk.class_name && !mainChunk.function_name) {
45
+ const allChunks = await indexer.findChunksByPath(mainChunk.file)
46
+
47
+ for (const chunk of allChunks) {
48
+ const chunkId = chunk.chunk_id || `${chunk.file}:chunk-${chunk.chunk_index ?? 0}`
49
+ if (alreadyAttached.has(chunkId)) continue
50
+
51
+ // Add all methods of this class
52
+ if (chunk.class_name === mainChunk.class_name && chunk.function_name) {
53
+ expanded.push({
54
+ chunk,
55
+ reason: `method of class ${mainChunk.class_name}`,
56
+ })
57
+ }
58
+ }
59
+ }
60
+
61
+ // CODE: Method → add class header
62
+ else if (mainChunk.class_name && mainChunk.function_name) {
63
+ const allChunks = await indexer.findChunksByPath(mainChunk.file)
64
+
65
+ for (const chunk of allChunks) {
66
+ const chunkId = chunk.chunk_id || `${chunk.file}:chunk-${chunk.chunk_index ?? 0}`
67
+ if (alreadyAttached.has(chunkId)) continue
68
+
69
+ // Find class header (class_name matches but no function_name)
70
+ if (chunk.class_name === mainChunk.class_name && !chunk.function_name) {
71
+ expanded.push({
72
+ chunk,
73
+ reason: `class header for ${mainChunk.function_name}`,
74
+ })
75
+ break
76
+ }
77
+ }
78
+ }
79
+
80
+ // DOCS: Section → add other sections from same file (for context)
81
+ // Only for markdown chunks with heading_context
82
+ else if (mainChunk.heading_context && mainChunk.language === "markdown") {
83
+ const allChunks = await indexer.findChunksByPath(mainChunk.file)
84
+
85
+ // Add ALL sections from this file (they're already reasonably sized)
86
+ // This gives full document context when searching in docs
87
+ for (const chunk of allChunks) {
88
+ const chunkId = chunk.chunk_id || `${chunk.file}:chunk-${chunk.chunk_index ?? 0}`
89
+ if (alreadyAttached.has(chunkId)) continue
90
+
91
+ // Skip the main chunk itself
92
+ if (chunkId === mainChunk.chunkId) continue
93
+
94
+ // Add other sections from same document
95
+ expanded.push({
96
+ chunk,
97
+ reason: `section from ${mainChunk.file}`,
98
+ })
99
+ }
100
+ }
101
+
102
+ return expanded
103
+ }
104
+
19
105
  // ── Extension → language mapping (for filter parsing) ─────────────────────
20
106
  const EXT_TO_LANG: Record<string, string> = {
21
107
  go: "go", py: "python", ts: "typescript", tsx: "typescript",
@@ -92,25 +178,31 @@ function parseFilter(filter: string): {
92
178
  }
93
179
 
94
180
  export default tool({
95
- description: `Search the codebase semantically. Top results are attached to workspace with full content (visible via context injection). Rest returned as summary.
181
+ description: `Search the codebase semantically OR attach specific chunks/files to workspace.
182
+
183
+ Three modes:
184
+ 1. Semantic search (query) - Find relevant code by meaning
185
+ 2. Direct chunk attach (chunkId) - Attach specific chunk by ID
186
+ 3. File attach (path) - Attach all chunks from a file
96
187
 
97
188
  Available indexes:
98
189
  - "code" (default) - Source code files (*.js, *.ts, *.py, *.go, etc.)
99
190
  - "docs" - Documentation files (*.md, *.txt, etc.)
100
- - "config" - Configuration files (*.yaml, *.json, etc.)
101
191
  - searchAll: true - Search across all indexes
102
192
 
103
193
  Examples:
104
194
  - search({ query: "authentication logic" })
105
195
  - search({ query: "how to deploy", index: "docs" })
106
196
  - search({ query: "tenant management", filter: "internal/domain/" })
107
- - search({ query: "event handling", filter: "*.go" })
108
- - search({ query: "API routes", filter: "internal/**/*.go" })
109
- - search({ query: "metrics", searchAll: true })`,
197
+ - search({ chunkId: "src/auth.ts:chunk-5" })
198
+ - search({ path: "docs/architecture.md" })
199
+ - search({ path: "src/auth.ts", index: "code" })`,
110
200
 
111
201
  args: {
112
- query: tool.schema.string().describe("Semantic search query describing what you're looking for"),
113
- index: tool.schema.string().optional().default("code").describe("Index to search: code, docs, config"),
202
+ query: tool.schema.string().optional().describe("Semantic search query describing what you're looking for"),
203
+ chunkId: tool.schema.string().optional().describe("Specific chunk ID to attach (e.g. 'src/auth.ts:chunk-5')"),
204
+ path: tool.schema.string().optional().describe("File path to attach all chunks from (e.g. 'docs/architecture.md')"),
205
+ index: tool.schema.string().optional().default("code").describe("Index to search: code, docs"),
114
206
  limit: tool.schema.number().optional().describe("Number of results (default from config, typically 10)"),
115
207
  searchAll: tool.schema.boolean().optional().default(false).describe("Search all indexes instead of just one"),
116
208
  filter: tool.schema.string().optional().describe("Filter results by path or language. Examples: 'internal/domain/', '*.go', 'internal/**/*.go', 'service'"),
@@ -120,6 +212,15 @@ Examples:
120
212
  const projectRoot = process.cwd()
121
213
 
122
214
  try {
215
+ // Validate: exactly one of query, chunkId, or path must be specified
216
+ const modes = [args.query, args.chunkId, args.path].filter(x => x !== undefined)
217
+ if (modes.length === 0) {
218
+ return `Error: Must specify one of: query (semantic search), chunkId (direct attach), or path (file attach)\n\nExamples:\n- search({ query: "authentication" })\n- search({ chunkId: "src/auth.ts:chunk-5" })\n- search({ path: "docs/architecture.md" })`
219
+ }
220
+ if (modes.length > 1) {
221
+ return `Error: Specify only ONE of: query, chunkId, or path (got ${modes.length})`
222
+ }
223
+
123
224
  // Load config defaults (parsed from vectorizer.yaml)
124
225
  const cfg = getSearchConfig()
125
226
  const limit = args.limit || cfg.default_limit || 10
@@ -130,6 +231,95 @@ Examples:
130
231
  // Workspace config
131
232
  const wsConfig = workspaceCache.getConfig()
132
233
 
234
+ // ══════════════════════════════════════════════════════════════════════
235
+ // MODE 1: Direct chunk attach by chunkId
236
+ // ══════════════════════════════════════════════════════════════════════
237
+ if (args.chunkId) {
238
+ const indexer = await getIndexer(projectRoot, indexName)
239
+ try {
240
+ const chunk = await indexer.findChunkById(args.chunkId)
241
+ if (!chunk) {
242
+ return `Chunk "${args.chunkId}" not found in index "${indexName}".\n\nMake sure:\n1. The file is indexed\n2. The chunk ID is correct (format: "path:chunk-N")\n3. You're searching the right index`
243
+ }
244
+
245
+ // Attach to workspace
246
+ workspaceCache.attach({
247
+ chunkId: args.chunkId,
248
+ path: chunk.file,
249
+ content: chunk.content,
250
+ chunkIndex: chunk.chunk_index ?? 0,
251
+ role: "manual",
252
+ attachedAt: Date.now(),
253
+ attachedBy: `direct:${args.chunkId}`,
254
+ metadata: {
255
+ language: chunk.language,
256
+ function_name: chunk.function_name,
257
+ class_name: chunk.class_name,
258
+ heading_context: chunk.heading_context,
259
+ startLine: chunk.start_line,
260
+ endLine: chunk.end_line,
261
+ },
262
+ })
263
+
264
+ workspaceCache.save().catch(() => {})
265
+
266
+ const entry = workspaceCache.get(args.chunkId)!
267
+ return `✓ Attached chunk to workspace\n\nChunk: ${args.chunkId}\nFile: ${chunk.file}\nTokens: ${entry.tokens.toLocaleString()}\nLanguage: ${chunk.language}\nLines: ${chunk.start_line}-${chunk.end_line}\n\nWorkspace: ${workspaceCache.size} chunks, ${workspaceCache.totalTokens.toLocaleString()} tokens`
268
+ } finally {
269
+ releaseIndexer(projectRoot, indexName)
270
+ }
271
+ }
272
+
273
+ // ══════════════════════════════════════════════════════════════════════
274
+ // MODE 2: File attach by path (all chunks)
275
+ // ══════════════════════════════════════════════════════════════════════
276
+ if (args.path) {
277
+ const indexer = await getIndexer(projectRoot, indexName)
278
+ try {
279
+ const chunks = await indexer.findChunksByPath(args.path)
280
+ if (chunks.length === 0) {
281
+ return `No chunks found for file "${args.path}" in index "${indexName}".\n\nMake sure:\n1. The file exists and is indexed\n2. The path is correct (relative to project root)\n3. You're searching the right index\n\nRun: bunx usethis_search reindex`
282
+ }
283
+
284
+ // Attach all chunks to workspace
285
+ let totalTokens = 0
286
+ for (const chunk of chunks) {
287
+ const chunkId = chunk.chunk_id || `${args.path}:chunk-${chunk.chunk_index ?? 0}`
288
+
289
+ workspaceCache.attach({
290
+ chunkId,
291
+ path: args.path,
292
+ content: chunk.content,
293
+ chunkIndex: chunk.chunk_index ?? 0,
294
+ role: "manual",
295
+ attachedAt: Date.now(),
296
+ attachedBy: `file:${args.path}`,
297
+ metadata: {
298
+ language: chunk.language,
299
+ function_name: chunk.function_name,
300
+ class_name: chunk.class_name,
301
+ heading_context: chunk.heading_context,
302
+ startLine: chunk.start_line,
303
+ endLine: chunk.end_line,
304
+ },
305
+ })
306
+
307
+ const entry = workspaceCache.get(chunkId)!
308
+ totalTokens += entry.tokens
309
+ }
310
+
311
+ workspaceCache.save().catch(() => {})
312
+
313
+ return `✓ Attached file to workspace\n\nFile: ${args.path}\nChunks: ${chunks.length}\nTokens: ${totalTokens.toLocaleString()}\nLanguage: ${chunks[0].language}\n\nWorkspace: ${workspaceCache.size} chunks, ${workspaceCache.totalTokens.toLocaleString()} tokens`
314
+ } finally {
315
+ releaseIndexer(projectRoot, indexName)
316
+ }
317
+ }
318
+
319
+ // ══════════════════════════════════════════════════════════════════════
320
+ // MODE 3: Semantic search by query (original behavior)
321
+ // ══════════════════════════════════════════════════════════════════════
322
+
133
323
  // Parse filter into path/language constraints
134
324
  const filterParsed = args.filter ? parseFilter(args.filter) : {}
135
325
 
@@ -169,6 +359,19 @@ Examples:
169
359
  }
170
360
  }
171
361
 
362
+ // Deduplicate chunks (searchAll may return same chunk from multiple indexes)
363
+ const seen = new Set<string>()
364
+ const deduplicated: any[] = []
365
+
366
+ for (const result of allResults) {
367
+ const chunkId = result.chunkId || `${result.file}:chunk-${result.index ?? 0}`
368
+ if (!seen.has(chunkId)) {
369
+ seen.add(chunkId)
370
+ deduplicated.push(result)
371
+ }
372
+ }
373
+
374
+ allResults = deduplicated
172
375
  allResults.sort((a, b) => {
173
376
  const scoreA = a._combinedScore ?? (a._distance != null ? Math.max(0, 1 - a._distance / 2) : 0)
174
377
  const scoreB = b._combinedScore ?? (b._distance != null ? Math.max(0, 1 - b._distance / 2) : 0)
@@ -255,7 +458,7 @@ Examples:
255
458
  }
256
459
 
257
460
  // ══════════════════════════════════════════════════════════════════════
258
- // WORKSPACE ATTACH: Top N chunks + graph relations (CHUNK CONTENT ONLY)
461
+ // WORKSPACE ATTACH: Top N chunks + expanded context + graph relations
259
462
  // ══════════════════════════════════════════════════════════════════════
260
463
 
261
464
  const mainChunks = topChunks.slice(0, wsConfig.attachTopN)
@@ -263,13 +466,17 @@ Examples:
263
466
 
264
467
  const attachedMain: Array<{ chunkId: string; path: string }> = []
265
468
  const attachedGraph: Array<{ chunkId: string; path: string }> = []
469
+ const attachedContext: Array<{ chunkId: string; path: string; reason: string }> = []
266
470
  const alreadyAttached = new Set<string>()
267
471
 
472
+ // Get indexer for context expansion (reuse same indexer)
473
+ const indexerForExpansion = await getIndexer(projectRoot, indexName)
474
+
268
475
  for (const chunk of mainChunks) {
269
476
  // Skip if score too low
270
477
  if ((chunk._finalScore ?? 0) < wsConfig.minScoreMain) continue
271
478
 
272
- // Attach chunk directly (no file read needed — chunk.content already has it)
479
+ // Attach main chunk
273
480
  const chunkId = chunk.chunkId || `${chunk.file}:chunk-${chunk.index ?? 0}`
274
481
 
275
482
  workspaceCache.attach({
@@ -294,6 +501,44 @@ Examples:
294
501
  attachedMain.push({ chunkId, path: chunk.file })
295
502
  alreadyAttached.add(chunkId)
296
503
 
504
+ // ── Expand context (class methods, class header) ──────────────────
505
+ try {
506
+ const expandedChunks = await expandChunkContext(chunk, indexerForExpansion, alreadyAttached)
507
+
508
+ for (const { chunk: expChunk, reason } of expandedChunks) {
509
+ const expChunkId = expChunk.chunk_id || `${expChunk.file}:chunk-${expChunk.chunk_index ?? 0}`
510
+
511
+ // Check budget before adding
512
+ if (workspaceCache.size >= wsConfig.maxChunks) break
513
+
514
+ workspaceCache.attach({
515
+ chunkId: expChunkId,
516
+ path: expChunk.file,
517
+ content: expChunk.content,
518
+ chunkIndex: expChunk.chunk_index ?? 0,
519
+ role: "search-context",
520
+ attachedAt: Date.now(),
521
+ attachedBy: `${args.query} (${reason})`,
522
+ score: chunk._finalScore * 0.9, // Slightly lower score than main
523
+ metadata: {
524
+ language: expChunk.language,
525
+ function_name: expChunk.function_name,
526
+ class_name: expChunk.class_name,
527
+ startLine: expChunk.start_line,
528
+ endLine: expChunk.end_line,
529
+ },
530
+ })
531
+
532
+ attachedContext.push({ chunkId: expChunkId, path: expChunk.file, reason })
533
+ alreadyAttached.add(expChunkId)
534
+ }
535
+ } catch (error: any) {
536
+ // Context expansion failed — not critical, continue
537
+ if (process.env.DEBUG) {
538
+ console.log(`[search] Context expansion failed for ${chunkId}: ${error.message}`)
539
+ }
540
+ }
541
+
297
542
  // Attach graph relations (imports, extends, used_by)
298
543
  if (chunk.relatedContext && chunk.relatedContext.length > 0) {
299
544
  const topRelated = chunk.relatedContext
@@ -329,8 +574,11 @@ Examples:
329
574
  }
330
575
  }
331
576
 
577
+ // Release indexer used for expansion
578
+ releaseIndexer(projectRoot, indexName)
579
+
332
580
  // ── Flush workspace to disk immediately (don't rely on debounce) ─────
333
- if (attachedMain.length > 0 || attachedGraph.length > 0) {
581
+ if (attachedMain.length > 0 || attachedGraph.length > 0 || attachedContext.length > 0) {
334
582
  workspaceCache.save().catch(() => {})
335
583
  }
336
584
 
@@ -56,11 +56,14 @@ export const workspace_list = tool({
56
56
  const mainFiles = Array.from(fileGroups.entries()).filter(([_, chunks]) =>
57
57
  chunks.some(c => c.role === "search-main")
58
58
  )
59
+ const contextFiles = Array.from(fileGroups.entries()).filter(([_, chunks]) =>
60
+ chunks.some(c => c.role === "search-context") && !chunks.some(c => c.role === "search-main")
61
+ )
59
62
  const graphFiles = Array.from(fileGroups.entries()).filter(([_, chunks]) =>
60
- chunks.some(c => c.role === "search-graph") && !chunks.some(c => c.role === "search-main")
63
+ chunks.some(c => c.role === "search-graph") && !chunks.some(c => c.role === "search-main" || c.role === "search-context")
61
64
  )
62
65
  const manualFiles = Array.from(fileGroups.entries()).filter(([_, chunks]) =>
63
- chunks.some(c => c.role === "manual") && !chunks.some(c => c.role === "search-main" || c.role === "search-graph")
66
+ chunks.some(c => c.role === "manual") && !chunks.some(c => c.role === "search-main" || c.role === "search-graph" || c.role === "search-context")
64
67
  )
65
68
 
66
69
  if (mainFiles.length > 0) {
@@ -86,6 +89,25 @@ export const workspace_list = tool({
86
89
  output += `\n`
87
90
  }
88
91
 
92
+ if (contextFiles.length > 0) {
93
+ output += `### Expanded context (${contextFiles.length} files)\n`
94
+ for (const [filePath, chunks] of contextFiles) {
95
+ const totalTokens = chunks.reduce((sum, c) => sum + c.tokens, 0)
96
+ const reason = chunks[0]?.attachedBy?.match(/\((.+)\)/)?.[1] || "context"
97
+ const age = Math.floor((Date.now() - chunks[0].attachedAt) / 1000 / 60)
98
+
99
+ output += `- **${filePath}** (${chunks.length} chunk${chunks.length > 1 ? "s" : ""}, ${totalTokens.toLocaleString()} tokens) — ${reason} — ${age}m ago\n`
100
+
101
+ if (chunks.length > 1) {
102
+ for (const chunk of chunks) {
103
+ const meta = chunk.metadata?.function_name || chunk.metadata?.class_name || ""
104
+ output += ` • ${chunk.chunkId} — ${meta} (chunk ${chunk.chunkIndex}, ${chunk.tokens.toLocaleString()} tok)\n`
105
+ }
106
+ }
107
+ }
108
+ output += `\n`
109
+ }
110
+
89
111
  if (graphFiles.length > 0) {
90
112
  output += `### Graph relations (${graphFiles.length} files)\n`
91
113
  for (const [filePath, chunks] of graphFiles) {
@@ -155,8 +177,12 @@ export const workspace_attach = tool({
155
177
 
156
178
  // Check if already attached
157
179
  if (workspaceCache.has(args.filePath)) {
158
- const existing = workspaceCache.get(args.filePath)!
159
- return `File "${args.filePath}" is already in workspace.\nChunkId: ${existing.chunkId} | Role: ${existing.role} | Tokens: ${existing.tokens.toLocaleString()} | Score: ${existing.score?.toFixed(3) ?? "n/a"}`
180
+ const existing = workspaceCache.getChunksByPath(args.filePath)
181
+ if (existing.length > 0) {
182
+ const first = existing[0]
183
+ const totalTokens = existing.reduce((sum, c) => sum + c.tokens, 0)
184
+ return `File "${args.filePath}" is already in workspace (${existing.length} chunk${existing.length > 1 ? "s" : ""}).\nTokens: ${totalTokens.toLocaleString()} | Role: ${first.role} | Score: ${first.score?.toFixed(3) ?? "n/a"}`
185
+ }
160
186
  }
161
187
 
162
188
  workspaceCache.attach({
@@ -169,7 +195,7 @@ export const workspace_attach = tool({
169
195
  attachedBy: "manual",
170
196
  })
171
197
 
172
- const entry = workspaceCache.get(args.filePath)!
198
+ const entry = workspaceCache.get(chunkId)!
173
199
  return `Attached "${args.filePath}" to workspace as single chunk.\nChunkId: ${chunkId}\nTokens: ${entry.tokens.toLocaleString()}\nWorkspace total: ${workspaceCache.totalTokens.toLocaleString()} tokens (${workspaceCache.size} chunks)`
174
200
  } catch (error: any) {
175
201
  return `Failed to attach "${args.filePath}": ${error.message || String(error)}`
@@ -194,27 +220,25 @@ export const workspace_detach = tool({
194
220
 
195
221
  if (args.chunkId) {
196
222
  // Detach specific chunk by chunkId
197
- const entries = workspaceCache.getAll()
198
- const entry = entries.find(e => e.chunkId === args.chunkId)
223
+ const entry = workspaceCache.get(args.chunkId)
199
224
 
200
225
  if (!entry) {
201
226
  return `Chunk "${args.chunkId}" not found in workspace.`
202
227
  }
203
228
 
204
- removed = workspaceCache.detach(entry.path) ? 1 : 0
229
+ removed = workspaceCache.detach(args.chunkId) ? 1 : 0
205
230
  if (removed === 0) {
206
231
  return `Failed to remove chunk "${args.chunkId}".`
207
232
  }
208
233
  } else if (args.filePath) {
209
234
  // Detach all chunks of a file
210
- const entries = workspaceCache.getAll()
211
- const fileChunks = entries.filter(e => e.path === args.filePath)
235
+ const fileChunks = workspaceCache.getChunksByPath(args.filePath)
212
236
 
213
237
  if (fileChunks.length === 0) {
214
238
  return `File "${args.filePath}" not found in workspace.`
215
239
  }
216
240
 
217
- removed = workspaceCache.detach(args.filePath) ? fileChunks.length : 0
241
+ removed = workspaceCache.detachByPath(args.filePath)
218
242
  if (removed === 0) {
219
243
  return `Failed to remove chunks from "${args.filePath}".`
220
244
  }
@@ -1,11 +1,15 @@
1
1
  /**
2
2
  * Chunker Factory — selects the appropriate chunker based on file type.
3
3
  *
4
- * Routes: markdown → markdown-chunker, code → code-chunker, else → fixed.
4
+ * Routes:
5
+ * - markdown → markdown-chunker
6
+ * - code → lsp-chunker (fallback: code-chunker regex)
7
+ * - else → fixed
5
8
  */
6
9
 
7
10
  import { chunkMarkdown, type MarkdownChunkConfig, DEFAULT_MD_CONFIG } from "./markdown-chunker"
8
11
  import { chunkCode, type CodeChunkConfig, DEFAULT_CODE_CONFIG } from "./code-chunker"
12
+ import { chunkCodeWithLSP } from "./lsp-chunker"
9
13
  import type { FileType } from "../metadata-extractor"
10
14
 
11
15
  // ── Types ───────────────────────────────────────────────────────────────────
@@ -63,13 +67,18 @@ function chunkFixed(content: string, maxChars: number): UnifiedChunk[] {
63
67
 
64
68
  /**
65
69
  * Chunk content using the appropriate strategy for the given file type.
70
+ *
71
+ * For code files: tries LSP-based chunking first (AST-accurate + godoc capture),
72
+ * falls back to regex-based chunker if LSP unavailable.
66
73
  */
67
- export function chunkContent(
74
+ export async function chunkContent(
68
75
  content: string,
69
76
  fileType: FileType,
70
77
  language: string,
71
78
  config: ChunkingConfig = DEFAULT_CHUNKING_CONFIG,
72
- ): UnifiedChunk[] {
79
+ filePath?: string,
80
+ projectRoot?: string,
81
+ ): Promise<UnifiedChunk[]> {
73
82
  // If strategy is "fixed", always use fixed chunker
74
83
  if (config.strategy === "fixed") {
75
84
  return chunkFixed(content, config.fixed.max_chars)
@@ -85,6 +94,26 @@ export function chunkContent(
85
94
  }
86
95
 
87
96
  if (fileType === "code") {
97
+ // Try LSP-based chunker first (captures godoc/JSDoc comments!)
98
+ if (filePath) {
99
+ try {
100
+ const lspChunks = await chunkCodeWithLSP(filePath, content, config.code, projectRoot)
101
+ if (lspChunks && lspChunks.length > 0) {
102
+ return lspChunks.map((c) => ({
103
+ content: c.content,
104
+ function_name: c.function_name,
105
+ class_name: c.class_name,
106
+ }))
107
+ }
108
+ } catch (error) {
109
+ // LSP failed — fall through to regex chunker
110
+ if (process.env.DEBUG_LSP_CHUNKER) {
111
+ console.log(`[chunker-factory] LSP chunker failed for ${filePath}: ${error}`)
112
+ }
113
+ }
114
+ }
115
+
116
+ // Fallback: regex-based code chunker
88
117
  const codeChunks = chunkCode(content, config.code)
89
118
  return codeChunks.map((c) => ({
90
119
  content: c.content,
@@ -13,8 +13,8 @@ export interface CodeChunkConfig {
13
13
  }
14
14
 
15
15
  export const DEFAULT_CODE_CONFIG: CodeChunkConfig = {
16
- min_chunk_size: 300,
17
- max_chunk_size: 1500,
16
+ min_chunk_size: 600, // Function with godoc/JSDoc (avoid tiny chunks)
17
+ max_chunk_size: 3000, // Allow larger chunks (class with multiple methods)
18
18
  split_by_functions: true,
19
19
  include_function_signature: true,
20
20
  }
@@ -13,8 +13,8 @@ export interface MarkdownChunkConfig {
13
13
  }
14
14
 
15
15
  export const DEFAULT_MD_CONFIG: MarkdownChunkConfig = {
16
- min_chunk_size: 200,
17
- max_chunk_size: 2000,
16
+ min_chunk_size: 1000, // Merge small sections (headers without content)
17
+ max_chunk_size: 8000, // Large chunks for docs (SQL schemas, API specs, etc.)
18
18
  split_by_headings: true,
19
19
  preserve_heading_hierarchy: true,
20
20
  }
@@ -696,8 +696,8 @@ class CodebaseIndexer {
696
696
  // Clean content before chunking
697
697
  const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);
698
698
 
699
- // Semantic chunking
700
- const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
699
+ // Semantic chunking (async for LSP-based chunking)
700
+ const chunks = await chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG, filePath, this.root);
701
701
 
702
702
  // Assign chunk IDs
703
703
  const chunksWithIds = this.graphBuilder
@@ -1348,6 +1348,29 @@ class CodebaseIndexer {
1348
1348
  return this._chunkCache.get(chunkId) || null;
1349
1349
  }
1350
1350
 
1351
+ /**
1352
+ * Find all chunks belonging to a specific file path.
1353
+ * @param {string} filePath - Relative file path (e.g. "src/auth.ts")
1354
+ * @returns {Promise<Array>} Array of chunks from this file
1355
+ */
1356
+ async findChunksByPath(filePath) {
1357
+ // Ensure chunk cache is loaded
1358
+ await this.findChunkById("__force_cache_load__");
1359
+
1360
+ if (!this._chunkCache) return [];
1361
+
1362
+ const chunks = [];
1363
+ for (const chunk of this._chunkCache.values()) {
1364
+ if (chunk.file === filePath) {
1365
+ chunks.push(chunk);
1366
+ }
1367
+ }
1368
+
1369
+ // Sort by chunk_index
1370
+ chunks.sort((a, b) => (a.chunk_index || 0) - (b.chunk_index || 0));
1371
+ return chunks;
1372
+ }
1373
+
1351
1374
  cosineSimilarity(vecA, vecB) {
1352
1375
  let dotProduct = 0;
1353
1376
  let normA = 0;
package/vectorizer.yaml CHANGED
@@ -23,14 +23,14 @@ vectorizer:
23
23
  strategy: "semantic" # fixed | semantic
24
24
  markdown:
25
25
  split_by_headings: true
26
- min_chunk_size: 200
27
- max_chunk_size: 2000
26
+ min_chunk_size: 1000 # Merge small sections (avoid header-only chunks)
27
+ max_chunk_size: 8000 # Large chunks for docs (SQL schemas, API specs, etc.)
28
28
  preserve_heading_hierarchy: true
29
29
  code:
30
30
  split_by_functions: true
31
31
  include_function_signature: true
32
- min_chunk_size: 300
33
- max_chunk_size: 1500
32
+ min_chunk_size: 600 # Function + godoc/JSDoc (avoid tiny chunks)
33
+ max_chunk_size: 3000 # Allow larger chunks (class with multiple methods)
34
34
  fixed:
35
35
  max_chars: 1500
36
36