@comfanion/usethis_search 4.3.0-dev.4 → 4.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@comfanion/usethis_search",
3
- "version": "4.3.0-dev.4",
4
- "description": "OpenCode plugin: semantic search with context-efficient workspace state (v4.3: no injection, each tool returns full state inline, auto-prune history, auto-detect modes, line numbers, LSP memory leak fixed)",
3
+ "version": "4.4.0",
4
+ "description": "OpenCode plugin: semantic search with query decomposition, RRF merge, and context-efficient workspace (v4.4.0)",
5
5
  "type": "module",
6
6
  "main": "./index.ts",
7
7
  "exports": {
@@ -41,6 +41,7 @@
41
41
  "vectorizer/graph-db.ts",
42
42
  "vectorizer/chunk-store.ts",
43
43
  "vectorizer/usage-tracker.ts",
44
+ "vectorizer/query-decomposer.ts",
44
45
  "vectorizer/graph-builder.ts",
45
46
  "vectorizer/analyzers/regex-analyzer.ts",
46
47
  "vectorizer/analyzers/lsp-analyzer.ts",
package/tools/search.ts CHANGED
@@ -13,9 +13,10 @@ import { tool } from "@opencode-ai/plugin"
13
13
  import path from "path"
14
14
  import fs from "fs/promises"
15
15
 
16
- import { CodebaseIndexer, getSearchConfig, getIndexer, releaseIndexer } from "../vectorizer/index.ts"
16
+ import { CodebaseIndexer, getSearchConfig, getDecomposerConfig, getIndexer, releaseIndexer } from "../vectorizer/index.ts"
17
17
  import { workspaceCache } from "../cache/manager.ts"
18
18
  import { buildWorkspaceOutput } from "./workspace-state.ts"
19
+ import { decomposeQuery } from "../vectorizer/query-decomposer.ts"
19
20
 
20
21
  // ── Context Expansion Helpers ─────────────────────────────────────────────
21
22
 
@@ -179,30 +180,55 @@ function parseFilter(filter: string): {
179
180
  }
180
181
 
181
182
  export default tool({
182
- description: `Search codebase and automatically attach relevant context to workspace.
183
-
184
- Accepts any query - semantic search, file path, or chunk ID:
185
- - "authentication logic" finds relevant code
186
- - "docs/architecture.md" attaches file
187
- - "src/auth.ts:chunk-5" attaches specific chunk
188
-
189
- Results are optimized for context - top chunks auto-attached with expanded context
190
- (related code, imports, class methods). Returns full workspace state inline.
191
- Previous search outputs are automatically pruned from history.
192
-
193
- IMPORTANT: Workspace has limited token budget. Use workspace_forget() to remove
194
- irrelevant files or old searches before adding new context.
195
-
196
- Choose index based on what you're looking for:
197
- - index: "code" search source code
198
- - index: "docs" search documentation
199
- - searchAll: true search everywhere
183
+ description: `Search the codebase semantically. Use this to find relevant code snippets, functions, or files based on meaning, not just text matching.
184
+
185
+ Available indexes:
186
+ - "code" (default) - Source code files (*.js, *.ts, *.py, *.go, etc.)
187
+ - "docs" - Documentation files (*.md, *.txt, etc.)
188
+ - "config" - Configuration files (*.yaml, *.json, etc.)
189
+ - searchAll: true - Search across all indexes
190
+
191
+ Auto-detects query type:
192
+ - Semantic: "authentication logic" → vector search for relevant code
193
+ - File path: "docs/architecture.md" → attaches entire file to workspace
194
+ - Chunk ID: "src/auth.ts:chunk-5" → attaches specific chunk
195
+
196
+ How workspace works:
197
+ - Top results are AUTO-ATTACHED to workspace with expanded context (class methods, imports, related code via graph)
198
+ - Workspace has a TOKEN BUDGET (~50K tokens, ~100 chunks). When full, oldest chunks are evicted
199
+ - Each search call returns full <workspace_state> with all chunk contents inline
200
+ - Only the LATEST search/workspace output is kept in chat history — older ones are auto-pruned
201
+ - Workspace persists across searches — new results ADD to existing workspace
202
+
203
+ IMPORTANT: Chunks contain DIRECT file content dumps (raw code/text from files).
204
+ - You DO NOT need to verify chunk content with grep/read tools
205
+ - Chunks are already the actual file content, not summaries or references
206
+ - Trust the chunk content as the source of truth
207
+ - Use Read tool only if you need content OUTSIDE the indexed chunks
208
+
209
+ Context management (CRITICAL — follow these rules):
210
+ - BEFORE searching a new topic, you MUST call workspace_forget() to remove irrelevant old context
211
+ - Workspace has LIMITED token budget. If budget >60%, evict old chunks with workspace_forget({ what: "5" })
212
+ - Use workspace_clear() when switching to a completely different task
213
+ - After editing files, forget stale chunks: workspace_forget({ what: "edited-file.ts" })
214
+ - The workspace is your working memory — KEEP IT FOCUSED. Stale context degrades search quality
215
+ - Rule of thumb: forget BEFORE you search, not after
216
+
217
+ Filter narrows results by path or language:
218
+ - "internal/domain/" → only files under that path
219
+ - "*.go" → only Go files
220
+ - "internal/**/*.go" → path + language combined
221
+ - "service" → files containing "service" in path
200
222
 
201
223
  Examples:
202
224
  - search({ query: "authentication logic" })
203
225
  - search({ query: "how to deploy", index: "docs" })
204
- - search({ query: "docs/prd.md" }) // attach file
205
- - search({ query: "internal/domain/", filter: "*.go" })`,
226
+ - search({ query: "tenant management", filter: "internal/domain/" })
227
+ - search({ query: "event handling", filter: "*.go" })
228
+ - search({ query: "API routes", filter: "internal/**/*.go" })
229
+ - search({ query: "metrics", searchAll: true })
230
+ - search({ query: "docs/prd.md" })
231
+ - search({ query: "src/auth.ts:chunk-5" })`,
206
232
 
207
233
  args: {
208
234
  query: tool.schema.string().describe("What to search: semantic query, file path, or chunk ID"),
@@ -657,10 +683,17 @@ Examples:
657
683
 
658
684
  const topScore = topChunks[0]?._finalScore ?? 0
659
685
  const hasBM25Only = allResults.some((r: any) => r._bm25Only)
686
+ const hasRRF = allResults.some((r: any) => r._rrfScore != null)
660
687
  const scope = args.searchAll ? "all indexes" : `index "${indexName}"`
661
688
  const filterLabel = args.filter ? ` filter:"${args.filter}"` : ""
662
689
  let output = `## Search: "${semanticQuery}" (${scope}${filterLabel})\n\n`
663
690
 
691
+ // Show decomposition info if query was decomposed
692
+ const decomposition = decomposeQuery(semanticQuery!, getDecomposerConfig())
693
+ if (decomposition.decomposed) {
694
+ output += `> **Query decomposed** (${decomposition.strategy}): ${decomposition.subQueries.map(q => `"${q}"`).join(", ")}\n\n`
695
+ }
696
+
664
697
  if (hasBM25Only) {
665
698
  output += `> **BM25-only mode** -- vector embeddings not yet available. Quality will improve after embedding completes.\n\n`
666
699
  }
@@ -20,7 +20,15 @@ import { buildWorkspaceOutput } from "./workspace-state.ts"
20
20
  // ── workspace.list ──────────────────────────────────────────────────────────
21
21
 
22
22
  export const workspace_list = tool({
23
- description: `Show full workspace state with all chunk content. Returns file listing and inline content for every attached chunk.`,
23
+ description: `Show current workspace contents — all attached code chunks with full source code, line numbers, and metadata.
24
+
25
+ Use this to:
26
+ - Check what context is currently loaded after compaction or session restore
27
+ - Verify workspace contents before starting implementation
28
+ - See token budget usage (how much space is left for new searches)
29
+
30
+ Returns <workspace_state> with every chunk's full content. This is the same state appended to every search() call.
31
+ Only the LATEST workspace tool output is kept in chat — older outputs are auto-pruned.`,
24
32
 
25
33
  args: {},
26
34
 
@@ -37,6 +45,13 @@ export const workspace_forget = tool({
37
45
  IMPORTANT: Regularly clean up workspace by removing irrelevant files or old search results.
38
46
  This keeps context focused and prevents token budget overflow.
39
47
 
48
+ WHEN TO CLEAN UP:
49
+ - BEFORE searching a new topic — forget the previous search results first:
50
+ workspace_forget({ what: "previous search query" }) → then search({ query: "new topic" })
51
+ - AFTER finishing a subtask — forget files you no longer need
52
+ - WHEN budget >60% — evict old chunks: workspace_forget({ what: "5" })
53
+ - AFTER editing files — workspace chunks become stale, forget and re-search
54
+
40
55
  Auto-detects what to remove based on input:
41
56
  - Chunk ID: "src/auth.ts:chunk-5"
42
57
  - File path: "docs/architecture.md" (removes ALL chunks)
@@ -46,7 +61,8 @@ Auto-detects what to remove based on input:
46
61
  Examples:
47
62
  - workspace_forget({ what: "docs/prd.md" })
48
63
  - workspace_forget({ what: "5" }) // older than 5 min
49
- - workspace_forget({ what: "src/auth.ts:chunk-3" })`,
64
+ - workspace_forget({ what: "src/auth.ts:chunk-3" })
65
+ - workspace_forget({ what: "authentication logic" }) // forget previous search`,
50
66
 
51
67
  args: {
52
68
  what: tool.schema.string().describe("What to forget: chunk ID, file path, search query, or age in minutes"),
@@ -110,7 +126,15 @@ Examples:
110
126
  // ── workspace.clear ─────────────────────────────────────────────────────────
111
127
 
112
128
  export const workspace_clear = tool({
113
- description: `Remove ALL chunks from workspace context. Use when switching tasks or starting fresh.`,
129
+ description: `Remove ALL chunks from workspace context. Use when switching tasks or starting fresh.
130
+
131
+ Use when:
132
+ - Switching to a completely different task or topic
133
+ - Workspace is cluttered with irrelevant context from many searches
134
+ - Starting a fresh investigation from scratch
135
+
136
+ Prefer workspace_forget() for selective cleanup. Use workspace_clear() only for full reset.
137
+ Returns empty workspace state.`,
114
138
 
115
139
  args: {},
116
140
 
@@ -126,7 +150,15 @@ export const workspace_clear = tool({
126
150
  // ── workspace.restore ───────────────────────────────────────────────────────
127
151
 
128
152
  export const workspace_restore = tool({
129
- description: `Restore workspace from a saved session snapshot. Use after compaction or to switch context.`,
153
+ description: `Restore workspace from a previously saved session snapshot.
154
+
155
+ Use when:
156
+ - After compaction — restore the workspace context from before compaction
157
+ - Resuming work on a previous task — switch back to that context
158
+ - After workspace_clear() — if you need the old context back
159
+
160
+ Call without sessionId to list available snapshots with their chunk counts and token sizes.
161
+ Call with sessionId to restore a specific snapshot. Replaces current workspace entirely.`,
130
162
 
131
163
  args: {
132
164
  sessionId: tool.schema.string().optional().describe("Session ID to restore. If not provided, lists available snapshots."),
@@ -19,6 +19,8 @@ import { GraphDB } from "./graph-db.ts";
19
19
  import { GraphBuilder, isStructuralPredicate } from "./graph-builder.ts";
20
20
  import { UsageTracker } from "./usage-tracker.ts";
21
21
  import { ChunkStore } from "./chunk-store.ts";
22
+ import { decomposeQuery, rrfMerge, DEFAULT_DECOMPOSER_CONFIG } from "./query-decomposer.ts";
23
+ import type { DecomposerConfig } from "./query-decomposer.ts";
22
24
 
23
25
  // Suppress transformers.js logs unless DEBUG is set
24
26
  const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
@@ -86,6 +88,9 @@ let HYBRID_CONFIG = { ...DEFAULT_HYBRID_CONFIG };
86
88
  let METRICS_ENABLED = false;
87
89
  let CACHE_ENABLED = true;
88
90
 
91
+ // ── Query decomposition config ───────────────────────────────────────────────
92
+ let DECOMPOSER_CONFIG: DecomposerConfig = { ...DEFAULT_DECOMPOSER_CONFIG };
93
+
89
94
  // ── Search defaults (exposed to tool layer) ──────────────────────────────────
90
95
  const DEFAULT_SEARCH_CONFIG = {
91
96
  freshen: false, // Don't freshen on every search — auto_index handles it
@@ -188,6 +193,13 @@ function defaultVectorizerYaml() {
188
193
  ` auto_prune_search: true # Replace old search outputs with compact summaries\n` +
189
194
  ` substitute_tool_outputs: true # Replace tool outputs when files in workspace\n` +
190
195
  `\n` +
196
+ ` # Query decomposition (v4 — improves long query relevance)\n` +
197
+ ` decomposition:\n` +
198
+ ` enabled: true # Split complex queries into focused sub-queries\n` +
199
+ ` min_words: 5 # Min significant words to trigger decomposition\n` +
200
+ ` max_sub_queries: 4 # Max sub-queries (including keyword core)\n` +
201
+ ` min_sub_query_words: 2 # Min words per sub-query\n` +
202
+ `\n` +
191
203
  ` # Quality monitoring\n` +
192
204
  ` quality:\n` +
193
205
  ` enable_metrics: false\n` +
@@ -370,6 +382,17 @@ async function loadConfig(projectRoot) {
370
382
  CACHE_ENABLED = parseBool(qs, "enable_cache", true);
371
383
  }
372
384
 
385
+ // ── Parse query decomposition config ────────────────────────────────────
386
+ const decomposerMatch = section.match(/^\s{2}decomposition:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|(?![\s\S]))/m);
387
+ if (decomposerMatch) {
388
+ const ds = decomposerMatch[1];
389
+ DECOMPOSER_CONFIG.enabled = parseBool(ds, "enabled", DEFAULT_DECOMPOSER_CONFIG.enabled);
390
+ DECOMPOSER_CONFIG.minWords = parseNumber(ds, "min_words", DEFAULT_DECOMPOSER_CONFIG.minWords);
391
+ DECOMPOSER_CONFIG.maxSubQueries = parseNumber(ds, "max_sub_queries", DEFAULT_DECOMPOSER_CONFIG.maxSubQueries);
392
+ DECOMPOSER_CONFIG.minSubQueryWords = parseNumber(ds, "min_sub_query_words", DEFAULT_DECOMPOSER_CONFIG.minSubQueryWords);
393
+ if (DEBUG) console.log("[vectorizer] Decomposer config:", DECOMPOSER_CONFIG);
394
+ }
395
+
373
396
  // ── Parse graph config (v3) ──────────────────────────────────────────────
374
397
  const graphMatch = section.match(/^\s{2}graph:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|(?![\s\S]))/m);
375
398
  if (graphMatch) {
@@ -1121,9 +1144,9 @@ class CodebaseIndexer {
1121
1144
  }
1122
1145
  }
1123
1146
 
1124
- // ── Search (v3: hybrid + BM25-only fallback + metadata filters + metrics) ──
1147
+ // ── Single-query search (internal used by search() for each sub-query) ──
1125
1148
 
1126
- async search(query, limit = 5, includeArchived = false, options = {}) {
1149
+ async _searchSingle(query, limit = 5, includeArchived = false, options = {}) {
1127
1150
  const tableName = "chunks";
1128
1151
  const tables = await this.db.tableNames();
1129
1152
 
@@ -1178,14 +1201,9 @@ class CodebaseIndexer {
1178
1201
  }
1179
1202
  }
1180
1203
 
1181
- // Apply metadata filters then return (graph context added below)
1204
+ // Apply metadata filters then return
1182
1205
  results = this._applyMetadataFilters(results, includeArchived, options);
1183
- const finalResults = results.slice(0, limit);
1184
-
1185
- // Graph context expansion (same as vector path)
1186
- await this._expandGraphContext(finalResults, null, query);
1187
-
1188
- return finalResults;
1206
+ return results.slice(0, limit);
1189
1207
  }
1190
1208
 
1191
1209
  // ── Vector search (Phase 2 complete) ─────────────────────────────────────
@@ -1280,7 +1298,51 @@ class CodebaseIndexer {
1280
1298
 
1281
1299
  // ── Metadata filters ──────────────────────────────────────────────────
1282
1300
  results = this._applyMetadataFilters(results, includeArchived, options);
1283
- const finalResults = results.slice(0, limit);
1301
+ return results.slice(0, limit);
1302
+ }
1303
+
1304
+ // ── Search (v4: query decomposition + RRF merge + hybrid + metrics) ────────
1305
+
1306
+ async search(query, limit = 5, includeArchived = false, options = {}) {
1307
+ // ── Query decomposition ──────────────────────────────────────────────────
1308
+ const decomposition = decomposeQuery(query, DECOMPOSER_CONFIG);
1309
+
1310
+ let finalResults;
1311
+
1312
+ if (decomposition.decomposed && decomposition.subQueries.length > 1) {
1313
+ if (DEBUG) {
1314
+ console.log(`[vectorizer] Query decomposed (${decomposition.strategy}): ${decomposition.subQueries.length} sub-queries`);
1315
+ for (const sq of decomposition.subQueries) {
1316
+ console.log(` → "${sq}"`);
1317
+ }
1318
+ }
1319
+
1320
+ // Run each sub-query independently, over-fetch to give RRF more signal
1321
+ const perQueryLimit = Math.max(limit * 2, 20);
1322
+ const resultSets = [];
1323
+
1324
+ for (const subQuery of decomposition.subQueries) {
1325
+ const results = await this._searchSingle(subQuery, perQueryLimit, includeArchived, options);
1326
+ if (results.length > 0) {
1327
+ resultSets.push(results);
1328
+ }
1329
+ }
1330
+
1331
+ if (resultSets.length === 0) {
1332
+ finalResults = [];
1333
+ } else if (resultSets.length === 1) {
1334
+ finalResults = resultSets[0].slice(0, limit);
1335
+ } else {
1336
+ // RRF merge across sub-query result sets
1337
+ finalResults = rrfMerge(resultSets, 60, limit);
1338
+ if (DEBUG) {
1339
+ console.log(`[vectorizer] RRF merged ${resultSets.length} result sets → ${finalResults.length} results`);
1340
+ }
1341
+ }
1342
+ } else {
1343
+ // Short/simple query — single search (no decomposition overhead)
1344
+ finalResults = await this._searchSingle(query, limit, includeArchived, options);
1345
+ }
1284
1346
 
1285
1347
  // ── Metrics tracking ────────────────────────────────────────────────────
1286
1348
  if (METRICS_ENABLED) {
@@ -1304,6 +1366,8 @@ class CodebaseIndexer {
1304
1366
  }
1305
1367
 
1306
1368
  // ── Graph context expansion (v3) ───────────────────────────────────────
1369
+ // Use original query for graph expansion (most complete context)
1370
+ const queryEmbedding = finalResults.length > 0 ? await this.embedQuery(query).catch(() => null) : null;
1307
1371
  await this._expandGraphContext(finalResults, queryEmbedding, query);
1308
1372
 
1309
1373
  return finalResults;
@@ -1826,4 +1890,8 @@ async function destroyIndexer(projectRoot: string, indexName: string = "code") {
1826
1890
  }
1827
1891
  }
1828
1892
 
1829
- export { CodebaseIndexer, INDEX_PRESETS, getEmbeddingModel, getSearchConfig, getWorkspaceConfig, getIndexer, releaseIndexer, destroyIndexer };
1893
+ function getDecomposerConfig() {
1894
+ return DECOMPOSER_CONFIG;
1895
+ }
1896
+
1897
+ export { CodebaseIndexer, INDEX_PRESETS, getEmbeddingModel, getSearchConfig, getWorkspaceConfig, getDecomposerConfig, getIndexer, releaseIndexer, destroyIndexer };
@@ -0,0 +1,397 @@
1
+ /**
2
+ * Query Decomposer — splits complex queries into focused sub-queries.
3
+ *
4
+ * Problem: Long, multi-concept queries produce "diluted" embeddings
5
+ * because the embedding model (all-MiniLM-L6-v2, 384d) averages all
6
+ * token vectors into one. "JWT authentication middleware that validates
7
+ * permissions" → a blurry vector between auth, JWT, middleware, permissions.
8
+ *
9
+ * Solution: Decompose into focused sub-queries, search each independently,
10
+ * merge results via Reciprocal Rank Fusion (RRF).
11
+ *
12
+ * Strategy (no LLM — pure heuristics):
13
+ * 1. Short queries (≤4 significant words) → pass through unchanged
14
+ * 2. Medium queries (5-8 words) → extract keyword core + original
15
+ * 3. Long queries (9+ words) → split into 2-4 concept clusters + keyword core
16
+ *
17
+ * All decomposition is deterministic and fast (<1ms).
18
+ */
19
+
20
+ // ── Types ───────────────────────────────────────────────────────────────────
21
+
22
+ export interface DecompositionResult {
23
+ /** Original query (always included in sub-queries) */
24
+ original: string
25
+ /** Focused sub-queries (includes original if short enough) */
26
+ subQueries: string[]
27
+ /** Whether decomposition was applied */
28
+ decomposed: boolean
29
+ /** Strategy used */
30
+ strategy: "passthrough" | "keyword-core" | "concept-split"
31
+ }
32
+
33
+ export interface DecomposerConfig {
34
+ /** Enable/disable decomposition */
35
+ enabled: boolean
36
+ /** Min significant words to trigger decomposition */
37
+ minWords: number
38
+ /** Max sub-queries to generate (including original) */
39
+ maxSubQueries: number
40
+ /** Min words per sub-query */
41
+ minSubQueryWords: number
42
+ }
43
+
44
+ export const DEFAULT_DECOMPOSER_CONFIG: DecomposerConfig = {
45
+ enabled: true,
46
+ minWords: 5,
47
+ maxSubQueries: 4,
48
+ minSubQueryWords: 2,
49
+ }
50
+
51
+ // ── Stop words (shared with BM25 + extras for query context) ────────────────
52
+
53
+ const STOP_WORDS = new Set([
54
+ "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
55
+ "have", "has", "had", "do", "does", "did", "will", "would", "could",
56
+ "should", "may", "might", "shall", "can", "need", "must",
57
+ "and", "or", "but", "not", "no", "nor",
58
+ "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
59
+ "into", "about", "between", "through", "during", "before", "after",
60
+ "this", "that", "these", "those", "it", "its",
61
+ "i", "you", "he", "she", "we", "they", "me", "him", "her", "us", "them",
62
+ "my", "your", "his", "our", "their",
63
+ "what", "which", "who", "whom", "where", "when", "how", "why",
64
+ "if", "then", "else", "so", "than", "too", "very",
65
+ // Query-specific stop words (common in agent queries)
66
+ "find", "search", "look", "show", "get", "give", "tell",
67
+ "using", "used", "uses", "use",
68
+ "like", "such", "also", "just", "only",
69
+ "all", "any", "each", "every", "some",
70
+ "code", "file", "files", "function", "class", "method",
71
+ "implement", "implementation", "implements", "implemented",
72
+ "related", "relevant", "similar",
73
+ "please", "help", "want", "need",
74
+ ])
75
+
76
+ // ── Connectors that signal concept boundaries ───────────────────────────────
77
+
78
+ const CONCEPT_CONNECTORS = new Set([
79
+ "and", "or", "that", "which", "where", "when", "while",
80
+ "with", "using", "through", "via", "for", "including",
81
+ "also", "both", "either", "neither",
82
+ ])
83
+
84
+ // ── Domain compound terms (keep together) ───────────────────────────────────
85
+
86
+ const COMPOUND_TERMS: Array<[string, string]> = [
87
+ ["error", "handling"],
88
+ ["event", "sourcing"],
89
+ ["dependency", "injection"],
90
+ ["access", "control"],
91
+ ["rate", "limiting"],
92
+ ["load", "balancing"],
93
+ ["unit", "test"],
94
+ ["integration", "test"],
95
+ ["api", "endpoint"],
96
+ ["api", "gateway"],
97
+ ["data", "model"],
98
+ ["data", "transfer"],
99
+ ["database", "connection"],
100
+ ["file", "system"],
101
+ ["message", "queue"],
102
+ ["state", "management"],
103
+ ["type", "checking"],
104
+ ["code", "review"],
105
+ ["pull", "request"],
106
+ ["design", "pattern"],
107
+ ["repository", "pattern"],
108
+ ["factory", "pattern"],
109
+ ["observer", "pattern"],
110
+ ["middleware", "chain"],
111
+ ["call", "hierarchy"],
112
+ ["graph", "traversal"],
113
+ ]
114
+
115
+ // ── Tokenizer ───────────────────────────────────────────────────────────────
116
+
117
+ /**
118
+ * Tokenize query into lowercase words, preserving compound terms.
119
+ */
120
+ export function tokenizeQuery(query: string): string[] {
121
+ const raw = query
122
+ .toLowerCase()
123
+ .replace(/[^a-z0-9_\-]/g, " ")
124
+ .split(/\s+/)
125
+ .filter(t => t.length > 1)
126
+
127
+ // Merge compound terms
128
+ const merged: string[] = []
129
+ let i = 0
130
+ while (i < raw.length) {
131
+ let found = false
132
+ if (i < raw.length - 1) {
133
+ for (const [a, b] of COMPOUND_TERMS) {
134
+ if (raw[i] === a && raw[i + 1] === b) {
135
+ merged.push(`${a}_${b}`)
136
+ i += 2
137
+ found = true
138
+ break
139
+ }
140
+ }
141
+ }
142
+ if (!found) {
143
+ merged.push(raw[i])
144
+ i++
145
+ }
146
+ }
147
+
148
+ return merged
149
+ }
150
+
151
+ /**
152
+ * Extract significant (non-stop) words from token list.
153
+ */
154
+ export function extractSignificant(tokens: string[]): string[] {
155
+ return tokens.filter(t => !STOP_WORDS.has(t) && t.length > 2)
156
+ }
157
+
158
+ // ── Concept Clustering ──────────────────────────────────────────────────────
159
+
160
+ /**
161
+ * Split tokens into concept groups at connector boundaries.
162
+ *
163
+ * "JWT authentication middleware that validates user permissions for API endpoints"
164
+ * → ["JWT authentication middleware", "validates user permissions", "API endpoints"]
165
+ */
166
+ export function splitByConcepts(tokens: string[]): string[][] {
167
+ const groups: string[][] = []
168
+ let current: string[] = []
169
+
170
+ for (const token of tokens) {
171
+ if (CONCEPT_CONNECTORS.has(token)) {
172
+ if (current.length > 0) {
173
+ groups.push(current)
174
+ current = []
175
+ }
176
+ // Skip the connector itself
177
+ } else {
178
+ current.push(token)
179
+ }
180
+ }
181
+
182
+ if (current.length > 0) {
183
+ groups.push(current)
184
+ }
185
+
186
+ return groups
187
+ }
188
+
189
+ /**
190
+ * Merge small concept groups with neighbors to meet minimum size.
191
+ */
192
+ function mergeSmallGroups(groups: string[][], minSize: number): string[][] {
193
+ if (groups.length <= 1) return groups
194
+
195
+ const merged: string[][] = []
196
+ let buffer: string[] = []
197
+
198
+ for (const group of groups) {
199
+ buffer.push(...group)
200
+ // Extract significant words to check if buffer is "big enough"
201
+ const sig = extractSignificant(buffer)
202
+ if (sig.length >= minSize) {
203
+ merged.push([...buffer])
204
+ buffer = []
205
+ }
206
+ }
207
+
208
+ // Remaining buffer: merge with last group or push as-is
209
+ if (buffer.length > 0) {
210
+ if (merged.length > 0) {
211
+ merged[merged.length - 1].push(...buffer)
212
+ } else {
213
+ merged.push(buffer)
214
+ }
215
+ }
216
+
217
+ return merged
218
+ }
219
+
220
+ // ── Keyword Core Extraction ─────────────────────────────────────────────────
221
+
222
+ /**
223
+ * Extract a "keyword core" — the most important 3-4 words from the query.
224
+ * Uses a simple heuristic: take significant words, prefer longer/rarer ones.
225
+ */
226
+ export function extractKeywordCore(significant: string[], maxWords: number = 3): string {
227
+ // Score words: longer words and compound terms score higher
228
+ const scored = significant.map(w => ({
229
+ word: w,
230
+ score: w.length + (w.includes("_") ? 5 : 0),
231
+ }))
232
+
233
+ scored.sort((a, b) => b.score - a.score)
234
+ const top = scored.slice(0, maxWords).map(s => s.word)
235
+
236
+ // Restore original order
237
+ const ordered = significant.filter(w => top.includes(w))
238
+ return ordered.slice(0, maxWords).join(" ").replace(/_/g, " ")
239
+ }
240
+
241
+ // ── Main Decomposer ─────────────────────────────────────────────────────────
242
+
243
+ /**
244
+ * Decompose a search query into focused sub-queries.
245
+ *
246
+ * @param query The original search query
247
+ * @param config Decomposer configuration
248
+ * @returns DecompositionResult with sub-queries and metadata
249
+ */
250
+ export function decomposeQuery(
251
+ query: string,
252
+ config: DecomposerConfig = DEFAULT_DECOMPOSER_CONFIG,
253
+ ): DecompositionResult {
254
+ if (!config.enabled) {
255
+ return {
256
+ original: query,
257
+ subQueries: [query],
258
+ decomposed: false,
259
+ strategy: "passthrough",
260
+ }
261
+ }
262
+
263
+ const tokens = tokenizeQuery(query)
264
+ const significant = extractSignificant(tokens)
265
+
266
+ // ── Strategy 1: Short query → passthrough ─────────────────────────────────
267
+ if (significant.length < config.minWords) {
268
+ return {
269
+ original: query,
270
+ subQueries: [query],
271
+ decomposed: false,
272
+ strategy: "passthrough",
273
+ }
274
+ }
275
+
276
+ // ── Strategy 2: Medium query (5-8 significant words) → keyword core ───────
277
+ if (significant.length <= 8) {
278
+ const core = extractKeywordCore(significant, 3)
279
+ const subQueries = [query]
280
+
281
+ // Only add core if it's meaningfully different from original
282
+ if (core !== query.toLowerCase().trim() && core.split(" ").length >= config.minSubQueryWords) {
283
+ subQueries.push(core)
284
+ }
285
+
286
+ return {
287
+ original: query,
288
+ subQueries: subQueries.slice(0, config.maxSubQueries),
289
+ decomposed: subQueries.length > 1,
290
+ strategy: subQueries.length > 1 ? "keyword-core" : "passthrough",
291
+ }
292
+ }
293
+
294
+ // ── Strategy 3: Long query (9+ significant words) → concept split ─────────
295
+ const conceptGroups = splitByConcepts(tokens)
296
+ const mergedGroups = mergeSmallGroups(conceptGroups, config.minSubQueryWords)
297
+
298
+ const subQueries: string[] = []
299
+
300
+ // Always include keyword core as first sub-query (highest signal)
301
+ const core = extractKeywordCore(significant, 4)
302
+ if (core.split(" ").length >= config.minSubQueryWords) {
303
+ subQueries.push(core)
304
+ }
305
+
306
+ // Add concept groups as sub-queries
307
+ for (const group of mergedGroups) {
308
+ const groupSig = extractSignificant(group)
309
+ if (groupSig.length >= config.minSubQueryWords) {
310
+ const subQuery = groupSig.join(" ").replace(/_/g, " ")
311
+ // Avoid duplicates
312
+ if (!subQueries.includes(subQuery)) {
313
+ subQueries.push(subQuery)
314
+ }
315
+ }
316
+ }
317
+
318
+ // If we still have room, add the original (truncated to first N significant words)
319
+ if (subQueries.length < config.maxSubQueries) {
320
+ const truncated = significant.slice(0, 6).join(" ").replace(/_/g, " ")
321
+ if (!subQueries.includes(truncated)) {
322
+ subQueries.push(truncated)
323
+ }
324
+ }
325
+
326
+ // Ensure we don't exceed max
327
+ const finalQueries = subQueries.slice(0, config.maxSubQueries)
328
+
329
+ return {
330
+ original: query,
331
+ subQueries: finalQueries.length > 0 ? finalQueries : [query],
332
+ decomposed: finalQueries.length > 1,
333
+ strategy: finalQueries.length > 1 ? "concept-split" : "passthrough",
334
+ }
335
+ }
336
+
337
+ // ── RRF Merge ───────────────────────────────────────────────────────────────
338
+
339
+ /**
340
+ * Reciprocal Rank Fusion — merge ranked result lists from multiple sub-queries.
341
+ *
342
+ * RRF score = sum(1 / (k + rank_i)) for each sub-query where the result appears.
343
+ *
344
+ * @param resultSets Array of result arrays, each sorted by relevance (best first)
345
+ * @param k RRF constant (default: 60, standard value from the paper)
346
+ * @param limit Max results to return
347
+ * @returns Merged results sorted by RRF score, with _rrfScore and _combinedScore set
348
+ */
349
+ export function rrfMerge(
350
+ resultSets: Array<Array<Record<string, any>>>,
351
+ k: number = 60,
352
+ limit: number = 10,
353
+ ): Array<Record<string, any>> {
354
+ if (resultSets.length === 0) return []
355
+ if (resultSets.length === 1) return resultSets[0].slice(0, limit)
356
+
357
+ // Build RRF scores keyed by chunk identity (file:chunk_index)
358
+ const scoreMap = new Map<string, { row: Record<string, any>; rrfScore: number; bestOriginalScore: number }>()
359
+
360
+ for (const results of resultSets) {
361
+ for (let rank = 0; rank < results.length; rank++) {
362
+ const r = results[rank]
363
+ const key = `${r.file}:${r.chunk_index}`
364
+ const rrfContribution = 1 / (k + rank + 1) // rank is 0-based, RRF uses 1-based
365
+
366
+ const existing = scoreMap.get(key)
367
+ const originalScore = r._combinedScore ?? (r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0)
368
+
369
+ if (existing) {
370
+ existing.rrfScore += rrfContribution
371
+ // Keep the row with the best original score (most metadata)
372
+ if (originalScore > existing.bestOriginalScore) {
373
+ existing.row = r
374
+ existing.bestOriginalScore = originalScore
375
+ }
376
+ } else {
377
+ scoreMap.set(key, {
378
+ row: r,
379
+ rrfScore: rrfContribution,
380
+ bestOriginalScore: originalScore,
381
+ })
382
+ }
383
+ }
384
+ }
385
+
386
+ // Sort by RRF score and return
387
+ const merged = Array.from(scoreMap.values())
388
+ .sort((a, b) => b.rrfScore - a.rrfScore)
389
+ .slice(0, limit)
390
+ .map(entry => ({
391
+ ...entry.row,
392
+ _rrfScore: entry.rrfScore,
393
+ _combinedScore: entry.bestOriginalScore, // preserve for downstream compatibility
394
+ }))
395
+
396
+ return merged
397
+ }
package/vectorizer.yaml CHANGED
@@ -68,6 +68,13 @@ vectorizer:
68
68
  auto_prune_search: true # Replace old search outputs with compact summaries
69
69
  substitute_tool_outputs: true # Replace read() outputs when chunks in workspace
70
70
 
71
+ # Query decomposition (v4 — improves long query relevance)
72
+ decomposition:
73
+ enabled: true # Split complex queries into focused sub-queries
74
+ min_words: 5 # Min significant words to trigger decomposition
75
+ max_sub_queries: 4 # Max sub-queries (including keyword core)
76
+ min_sub_query_words: 2 # Min words per sub-query
77
+
71
78
  # Quality monitoring (v2)
72
79
  quality:
73
80
  enable_metrics: false # Track search quality metrics