@comfanion/usethis_search 4.3.0-dev.4 → 4.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +3 -2
- package/tools/search.ts +54 -21
- package/tools/workspace.ts +36 -4
- package/vectorizer/index.ts +79 -11
- package/vectorizer/query-decomposer.ts +397 -0
- package/vectorizer.yaml +7 -0
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@comfanion/usethis_search",
|
|
3
|
-
"version": "4.
|
|
4
|
-
"description": "OpenCode plugin: semantic search with
|
|
3
|
+
"version": "4.4.0",
|
|
4
|
+
"description": "OpenCode plugin: semantic search with query decomposition, RRF merge, and context-efficient workspace (v4.4.0)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./index.ts",
|
|
7
7
|
"exports": {
|
|
@@ -41,6 +41,7 @@
|
|
|
41
41
|
"vectorizer/graph-db.ts",
|
|
42
42
|
"vectorizer/chunk-store.ts",
|
|
43
43
|
"vectorizer/usage-tracker.ts",
|
|
44
|
+
"vectorizer/query-decomposer.ts",
|
|
44
45
|
"vectorizer/graph-builder.ts",
|
|
45
46
|
"vectorizer/analyzers/regex-analyzer.ts",
|
|
46
47
|
"vectorizer/analyzers/lsp-analyzer.ts",
|
package/tools/search.ts
CHANGED
|
@@ -13,9 +13,10 @@ import { tool } from "@opencode-ai/plugin"
|
|
|
13
13
|
import path from "path"
|
|
14
14
|
import fs from "fs/promises"
|
|
15
15
|
|
|
16
|
-
import { CodebaseIndexer, getSearchConfig, getIndexer, releaseIndexer } from "../vectorizer/index.ts"
|
|
16
|
+
import { CodebaseIndexer, getSearchConfig, getDecomposerConfig, getIndexer, releaseIndexer } from "../vectorizer/index.ts"
|
|
17
17
|
import { workspaceCache } from "../cache/manager.ts"
|
|
18
18
|
import { buildWorkspaceOutput } from "./workspace-state.ts"
|
|
19
|
+
import { decomposeQuery } from "../vectorizer/query-decomposer.ts"
|
|
19
20
|
|
|
20
21
|
// ── Context Expansion Helpers ─────────────────────────────────────────────
|
|
21
22
|
|
|
@@ -179,30 +180,55 @@ function parseFilter(filter: string): {
|
|
|
179
180
|
}
|
|
180
181
|
|
|
181
182
|
export default tool({
|
|
182
|
-
description: `Search codebase
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
- "
|
|
186
|
-
- "docs
|
|
187
|
-
- "
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
-
|
|
198
|
-
-
|
|
199
|
-
-
|
|
183
|
+
description: `Search the codebase semantically. Use this to find relevant code snippets, functions, or files based on meaning, not just text matching.
|
|
184
|
+
|
|
185
|
+
Available indexes:
|
|
186
|
+
- "code" (default) - Source code files (*.js, *.ts, *.py, *.go, etc.)
|
|
187
|
+
- "docs" - Documentation files (*.md, *.txt, etc.)
|
|
188
|
+
- "config" - Configuration files (*.yaml, *.json, etc.)
|
|
189
|
+
- searchAll: true - Search across all indexes
|
|
190
|
+
|
|
191
|
+
Auto-detects query type:
|
|
192
|
+
- Semantic: "authentication logic" → vector search for relevant code
|
|
193
|
+
- File path: "docs/architecture.md" → attaches entire file to workspace
|
|
194
|
+
- Chunk ID: "src/auth.ts:chunk-5" → attaches specific chunk
|
|
195
|
+
|
|
196
|
+
How workspace works:
|
|
197
|
+
- Top results are AUTO-ATTACHED to workspace with expanded context (class methods, imports, related code via graph)
|
|
198
|
+
- Workspace has a TOKEN BUDGET (~50K tokens, ~100 chunks). When full, oldest chunks are evicted
|
|
199
|
+
- Each search call returns full <workspace_state> with all chunk contents inline
|
|
200
|
+
- Only the LATEST search/workspace output is kept in chat history — older ones are auto-pruned
|
|
201
|
+
- Workspace persists across searches — new results ADD to existing workspace
|
|
202
|
+
|
|
203
|
+
IMPORTANT: Chunks contain DIRECT file content dumps (raw code/text from files).
|
|
204
|
+
- You DO NOT need to verify chunk content with grep/read tools
|
|
205
|
+
- Chunks are already the actual file content, not summaries or references
|
|
206
|
+
- Trust the chunk content as the source of truth
|
|
207
|
+
- Use Read tool only if you need content OUTSIDE the indexed chunks
|
|
208
|
+
|
|
209
|
+
Context management (CRITICAL — follow these rules):
|
|
210
|
+
- BEFORE searching a new topic, you MUST call workspace_forget() to remove irrelevant old context
|
|
211
|
+
- Workspace has LIMITED token budget. If budget >60%, evict old chunks with workspace_forget({ what: "5" })
|
|
212
|
+
- Use workspace_clear() when switching to a completely different task
|
|
213
|
+
- After editing files, forget stale chunks: workspace_forget({ what: "edited-file.ts" })
|
|
214
|
+
- The workspace is your working memory — KEEP IT FOCUSED. Stale context degrades search quality
|
|
215
|
+
- Rule of thumb: forget BEFORE you search, not after
|
|
216
|
+
|
|
217
|
+
Filter narrows results by path or language:
|
|
218
|
+
- "internal/domain/" → only files under that path
|
|
219
|
+
- "*.go" → only Go files
|
|
220
|
+
- "internal/**/*.go" → path + language combined
|
|
221
|
+
- "service" → files containing "service" in path
|
|
200
222
|
|
|
201
223
|
Examples:
|
|
202
224
|
- search({ query: "authentication logic" })
|
|
203
225
|
- search({ query: "how to deploy", index: "docs" })
|
|
204
|
-
- search({ query: "
|
|
205
|
-
- search({ query: "
|
|
226
|
+
- search({ query: "tenant management", filter: "internal/domain/" })
|
|
227
|
+
- search({ query: "event handling", filter: "*.go" })
|
|
228
|
+
- search({ query: "API routes", filter: "internal/**/*.go" })
|
|
229
|
+
- search({ query: "metrics", searchAll: true })
|
|
230
|
+
- search({ query: "docs/prd.md" })
|
|
231
|
+
- search({ query: "src/auth.ts:chunk-5" })`,
|
|
206
232
|
|
|
207
233
|
args: {
|
|
208
234
|
query: tool.schema.string().describe("What to search: semantic query, file path, or chunk ID"),
|
|
@@ -657,10 +683,17 @@ Examples:
|
|
|
657
683
|
|
|
658
684
|
const topScore = topChunks[0]?._finalScore ?? 0
|
|
659
685
|
const hasBM25Only = allResults.some((r: any) => r._bm25Only)
|
|
686
|
+
const hasRRF = allResults.some((r: any) => r._rrfScore != null)
|
|
660
687
|
const scope = args.searchAll ? "all indexes" : `index "${indexName}"`
|
|
661
688
|
const filterLabel = args.filter ? ` filter:"${args.filter}"` : ""
|
|
662
689
|
let output = `## Search: "${semanticQuery}" (${scope}${filterLabel})\n\n`
|
|
663
690
|
|
|
691
|
+
// Show decomposition info if query was decomposed
|
|
692
|
+
const decomposition = decomposeQuery(semanticQuery!, getDecomposerConfig())
|
|
693
|
+
if (decomposition.decomposed) {
|
|
694
|
+
output += `> **Query decomposed** (${decomposition.strategy}): ${decomposition.subQueries.map(q => `"${q}"`).join(", ")}\n\n`
|
|
695
|
+
}
|
|
696
|
+
|
|
664
697
|
if (hasBM25Only) {
|
|
665
698
|
output += `> **BM25-only mode** -- vector embeddings not yet available. Quality will improve after embedding completes.\n\n`
|
|
666
699
|
}
|
package/tools/workspace.ts
CHANGED
|
@@ -20,7 +20,15 @@ import { buildWorkspaceOutput } from "./workspace-state.ts"
|
|
|
20
20
|
// ── workspace.list ──────────────────────────────────────────────────────────
|
|
21
21
|
|
|
22
22
|
export const workspace_list = tool({
|
|
23
|
-
description: `Show
|
|
23
|
+
description: `Show current workspace contents — all attached code chunks with full source code, line numbers, and metadata.
|
|
24
|
+
|
|
25
|
+
Use this to:
|
|
26
|
+
- Check what context is currently loaded after compaction or session restore
|
|
27
|
+
- Verify workspace contents before starting implementation
|
|
28
|
+
- See token budget usage (how much space is left for new searches)
|
|
29
|
+
|
|
30
|
+
Returns <workspace_state> with every chunk's full content. This is the same state appended to every search() call.
|
|
31
|
+
Only the LATEST workspace tool output is kept in chat — older outputs are auto-pruned.`,
|
|
24
32
|
|
|
25
33
|
args: {},
|
|
26
34
|
|
|
@@ -37,6 +45,13 @@ export const workspace_forget = tool({
|
|
|
37
45
|
IMPORTANT: Regularly clean up workspace by removing irrelevant files or old search results.
|
|
38
46
|
This keeps context focused and prevents token budget overflow.
|
|
39
47
|
|
|
48
|
+
WHEN TO CLEAN UP:
|
|
49
|
+
- BEFORE searching a new topic — forget the previous search results first:
|
|
50
|
+
workspace_forget({ what: "previous search query" }) → then search({ query: "new topic" })
|
|
51
|
+
- AFTER finishing a subtask — forget files you no longer need
|
|
52
|
+
- WHEN budget >60% — evict old chunks: workspace_forget({ what: "5" })
|
|
53
|
+
- AFTER editing files — workspace chunks become stale, forget and re-search
|
|
54
|
+
|
|
40
55
|
Auto-detects what to remove based on input:
|
|
41
56
|
- Chunk ID: "src/auth.ts:chunk-5"
|
|
42
57
|
- File path: "docs/architecture.md" (removes ALL chunks)
|
|
@@ -46,7 +61,8 @@ Auto-detects what to remove based on input:
|
|
|
46
61
|
Examples:
|
|
47
62
|
- workspace_forget({ what: "docs/prd.md" })
|
|
48
63
|
- workspace_forget({ what: "5" }) // older than 5 min
|
|
49
|
-
- workspace_forget({ what: "src/auth.ts:chunk-3" })
|
|
64
|
+
- workspace_forget({ what: "src/auth.ts:chunk-3" })
|
|
65
|
+
- workspace_forget({ what: "authentication logic" }) // forget previous search`,
|
|
50
66
|
|
|
51
67
|
args: {
|
|
52
68
|
what: tool.schema.string().describe("What to forget: chunk ID, file path, search query, or age in minutes"),
|
|
@@ -110,7 +126,15 @@ Examples:
|
|
|
110
126
|
// ── workspace.clear ─────────────────────────────────────────────────────────
|
|
111
127
|
|
|
112
128
|
export const workspace_clear = tool({
|
|
113
|
-
description: `Remove ALL chunks from workspace context. Use when switching tasks or starting fresh
|
|
129
|
+
description: `Remove ALL chunks from workspace context. Use when switching tasks or starting fresh.
|
|
130
|
+
|
|
131
|
+
Use when:
|
|
132
|
+
- Switching to a completely different task or topic
|
|
133
|
+
- Workspace is cluttered with irrelevant context from many searches
|
|
134
|
+
- Starting a fresh investigation from scratch
|
|
135
|
+
|
|
136
|
+
Prefer workspace_forget() for selective cleanup. Use workspace_clear() only for full reset.
|
|
137
|
+
Returns empty workspace state.`,
|
|
114
138
|
|
|
115
139
|
args: {},
|
|
116
140
|
|
|
@@ -126,7 +150,15 @@ export const workspace_clear = tool({
|
|
|
126
150
|
// ── workspace.restore ───────────────────────────────────────────────────────
|
|
127
151
|
|
|
128
152
|
export const workspace_restore = tool({
|
|
129
|
-
description: `Restore workspace from a saved session snapshot.
|
|
153
|
+
description: `Restore workspace from a previously saved session snapshot.
|
|
154
|
+
|
|
155
|
+
Use when:
|
|
156
|
+
- After compaction — restore the workspace context from before compaction
|
|
157
|
+
- Resuming work on a previous task — switch back to that context
|
|
158
|
+
- After workspace_clear() — if you need the old context back
|
|
159
|
+
|
|
160
|
+
Call without sessionId to list available snapshots with their chunk counts and token sizes.
|
|
161
|
+
Call with sessionId to restore a specific snapshot. Replaces current workspace entirely.`,
|
|
130
162
|
|
|
131
163
|
args: {
|
|
132
164
|
sessionId: tool.schema.string().optional().describe("Session ID to restore. If not provided, lists available snapshots."),
|
package/vectorizer/index.ts
CHANGED
|
@@ -19,6 +19,8 @@ import { GraphDB } from "./graph-db.ts";
|
|
|
19
19
|
import { GraphBuilder, isStructuralPredicate } from "./graph-builder.ts";
|
|
20
20
|
import { UsageTracker } from "./usage-tracker.ts";
|
|
21
21
|
import { ChunkStore } from "./chunk-store.ts";
|
|
22
|
+
import { decomposeQuery, rrfMerge, DEFAULT_DECOMPOSER_CONFIG } from "./query-decomposer.ts";
|
|
23
|
+
import type { DecomposerConfig } from "./query-decomposer.ts";
|
|
22
24
|
|
|
23
25
|
// Suppress transformers.js logs unless DEBUG is set
|
|
24
26
|
const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
|
|
@@ -86,6 +88,9 @@ let HYBRID_CONFIG = { ...DEFAULT_HYBRID_CONFIG };
|
|
|
86
88
|
let METRICS_ENABLED = false;
|
|
87
89
|
let CACHE_ENABLED = true;
|
|
88
90
|
|
|
91
|
+
// ── Query decomposition config ───────────────────────────────────────────────
|
|
92
|
+
let DECOMPOSER_CONFIG: DecomposerConfig = { ...DEFAULT_DECOMPOSER_CONFIG };
|
|
93
|
+
|
|
89
94
|
// ── Search defaults (exposed to tool layer) ──────────────────────────────────
|
|
90
95
|
const DEFAULT_SEARCH_CONFIG = {
|
|
91
96
|
freshen: false, // Don't freshen on every search — auto_index handles it
|
|
@@ -188,6 +193,13 @@ function defaultVectorizerYaml() {
|
|
|
188
193
|
` auto_prune_search: true # Replace old search outputs with compact summaries\n` +
|
|
189
194
|
` substitute_tool_outputs: true # Replace tool outputs when files in workspace\n` +
|
|
190
195
|
`\n` +
|
|
196
|
+
` # Query decomposition (v4 — improves long query relevance)\n` +
|
|
197
|
+
` decomposition:\n` +
|
|
198
|
+
` enabled: true # Split complex queries into focused sub-queries\n` +
|
|
199
|
+
` min_words: 5 # Min significant words to trigger decomposition\n` +
|
|
200
|
+
` max_sub_queries: 4 # Max sub-queries (including keyword core)\n` +
|
|
201
|
+
` min_sub_query_words: 2 # Min words per sub-query\n` +
|
|
202
|
+
`\n` +
|
|
191
203
|
` # Quality monitoring\n` +
|
|
192
204
|
` quality:\n` +
|
|
193
205
|
` enable_metrics: false\n` +
|
|
@@ -370,6 +382,17 @@ async function loadConfig(projectRoot) {
|
|
|
370
382
|
CACHE_ENABLED = parseBool(qs, "enable_cache", true);
|
|
371
383
|
}
|
|
372
384
|
|
|
385
|
+
// ── Parse query decomposition config ────────────────────────────────────
|
|
386
|
+
const decomposerMatch = section.match(/^\s{2}decomposition:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|(?![\s\S]))/m);
|
|
387
|
+
if (decomposerMatch) {
|
|
388
|
+
const ds = decomposerMatch[1];
|
|
389
|
+
DECOMPOSER_CONFIG.enabled = parseBool(ds, "enabled", DEFAULT_DECOMPOSER_CONFIG.enabled);
|
|
390
|
+
DECOMPOSER_CONFIG.minWords = parseNumber(ds, "min_words", DEFAULT_DECOMPOSER_CONFIG.minWords);
|
|
391
|
+
DECOMPOSER_CONFIG.maxSubQueries = parseNumber(ds, "max_sub_queries", DEFAULT_DECOMPOSER_CONFIG.maxSubQueries);
|
|
392
|
+
DECOMPOSER_CONFIG.minSubQueryWords = parseNumber(ds, "min_sub_query_words", DEFAULT_DECOMPOSER_CONFIG.minSubQueryWords);
|
|
393
|
+
if (DEBUG) console.log("[vectorizer] Decomposer config:", DECOMPOSER_CONFIG);
|
|
394
|
+
}
|
|
395
|
+
|
|
373
396
|
// ── Parse graph config (v3) ──────────────────────────────────────────────
|
|
374
397
|
const graphMatch = section.match(/^\s{2}graph:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|(?![\s\S]))/m);
|
|
375
398
|
if (graphMatch) {
|
|
@@ -1121,9 +1144,9 @@ class CodebaseIndexer {
|
|
|
1121
1144
|
}
|
|
1122
1145
|
}
|
|
1123
1146
|
|
|
1124
|
-
// ──
|
|
1147
|
+
// ── Single-query search (internal — used by search() for each sub-query) ──
|
|
1125
1148
|
|
|
1126
|
-
async
|
|
1149
|
+
async _searchSingle(query, limit = 5, includeArchived = false, options = {}) {
|
|
1127
1150
|
const tableName = "chunks";
|
|
1128
1151
|
const tables = await this.db.tableNames();
|
|
1129
1152
|
|
|
@@ -1178,14 +1201,9 @@ class CodebaseIndexer {
|
|
|
1178
1201
|
}
|
|
1179
1202
|
}
|
|
1180
1203
|
|
|
1181
|
-
// Apply metadata filters then return
|
|
1204
|
+
// Apply metadata filters then return
|
|
1182
1205
|
results = this._applyMetadataFilters(results, includeArchived, options);
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
// Graph context expansion (same as vector path)
|
|
1186
|
-
await this._expandGraphContext(finalResults, null, query);
|
|
1187
|
-
|
|
1188
|
-
return finalResults;
|
|
1206
|
+
return results.slice(0, limit);
|
|
1189
1207
|
}
|
|
1190
1208
|
|
|
1191
1209
|
// ── Vector search (Phase 2 complete) ─────────────────────────────────────
|
|
@@ -1280,7 +1298,51 @@ class CodebaseIndexer {
|
|
|
1280
1298
|
|
|
1281
1299
|
// ── Metadata filters ──────────────────────────────────────────────────
|
|
1282
1300
|
results = this._applyMetadataFilters(results, includeArchived, options);
|
|
1283
|
-
|
|
1301
|
+
return results.slice(0, limit);
|
|
1302
|
+
}
|
|
1303
|
+
|
|
1304
|
+
// ── Search (v4: query decomposition + RRF merge + hybrid + metrics) ────────
|
|
1305
|
+
|
|
1306
|
+
async search(query, limit = 5, includeArchived = false, options = {}) {
|
|
1307
|
+
// ── Query decomposition ──────────────────────────────────────────────────
|
|
1308
|
+
const decomposition = decomposeQuery(query, DECOMPOSER_CONFIG);
|
|
1309
|
+
|
|
1310
|
+
let finalResults;
|
|
1311
|
+
|
|
1312
|
+
if (decomposition.decomposed && decomposition.subQueries.length > 1) {
|
|
1313
|
+
if (DEBUG) {
|
|
1314
|
+
console.log(`[vectorizer] Query decomposed (${decomposition.strategy}): ${decomposition.subQueries.length} sub-queries`);
|
|
1315
|
+
for (const sq of decomposition.subQueries) {
|
|
1316
|
+
console.log(` → "${sq}"`);
|
|
1317
|
+
}
|
|
1318
|
+
}
|
|
1319
|
+
|
|
1320
|
+
// Run each sub-query independently, over-fetch to give RRF more signal
|
|
1321
|
+
const perQueryLimit = Math.max(limit * 2, 20);
|
|
1322
|
+
const resultSets = [];
|
|
1323
|
+
|
|
1324
|
+
for (const subQuery of decomposition.subQueries) {
|
|
1325
|
+
const results = await this._searchSingle(subQuery, perQueryLimit, includeArchived, options);
|
|
1326
|
+
if (results.length > 0) {
|
|
1327
|
+
resultSets.push(results);
|
|
1328
|
+
}
|
|
1329
|
+
}
|
|
1330
|
+
|
|
1331
|
+
if (resultSets.length === 0) {
|
|
1332
|
+
finalResults = [];
|
|
1333
|
+
} else if (resultSets.length === 1) {
|
|
1334
|
+
finalResults = resultSets[0].slice(0, limit);
|
|
1335
|
+
} else {
|
|
1336
|
+
// RRF merge across sub-query result sets
|
|
1337
|
+
finalResults = rrfMerge(resultSets, 60, limit);
|
|
1338
|
+
if (DEBUG) {
|
|
1339
|
+
console.log(`[vectorizer] RRF merged ${resultSets.length} result sets → ${finalResults.length} results`);
|
|
1340
|
+
}
|
|
1341
|
+
}
|
|
1342
|
+
} else {
|
|
1343
|
+
// Short/simple query — single search (no decomposition overhead)
|
|
1344
|
+
finalResults = await this._searchSingle(query, limit, includeArchived, options);
|
|
1345
|
+
}
|
|
1284
1346
|
|
|
1285
1347
|
// ── Metrics tracking ────────────────────────────────────────────────────
|
|
1286
1348
|
if (METRICS_ENABLED) {
|
|
@@ -1304,6 +1366,8 @@ class CodebaseIndexer {
|
|
|
1304
1366
|
}
|
|
1305
1367
|
|
|
1306
1368
|
// ── Graph context expansion (v3) ───────────────────────────────────────
|
|
1369
|
+
// Use original query for graph expansion (most complete context)
|
|
1370
|
+
const queryEmbedding = finalResults.length > 0 ? await this.embedQuery(query).catch(() => null) : null;
|
|
1307
1371
|
await this._expandGraphContext(finalResults, queryEmbedding, query);
|
|
1308
1372
|
|
|
1309
1373
|
return finalResults;
|
|
@@ -1826,4 +1890,8 @@ async function destroyIndexer(projectRoot: string, indexName: string = "code") {
|
|
|
1826
1890
|
}
|
|
1827
1891
|
}
|
|
1828
1892
|
|
|
1829
|
-
|
|
1893
|
+
function getDecomposerConfig() {
|
|
1894
|
+
return DECOMPOSER_CONFIG;
|
|
1895
|
+
}
|
|
1896
|
+
|
|
1897
|
+
export { CodebaseIndexer, INDEX_PRESETS, getEmbeddingModel, getSearchConfig, getWorkspaceConfig, getDecomposerConfig, getIndexer, releaseIndexer, destroyIndexer };
|
|
@@ -0,0 +1,397 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Query Decomposer — splits complex queries into focused sub-queries.
|
|
3
|
+
*
|
|
4
|
+
* Problem: Long, multi-concept queries produce "diluted" embeddings
|
|
5
|
+
* because the embedding model (all-MiniLM-L6-v2, 384d) averages all
|
|
6
|
+
* token vectors into one. "JWT authentication middleware that validates
|
|
7
|
+
* permissions" → a blurry vector between auth, JWT, middleware, permissions.
|
|
8
|
+
*
|
|
9
|
+
* Solution: Decompose into focused sub-queries, search each independently,
|
|
10
|
+
* merge results via Reciprocal Rank Fusion (RRF).
|
|
11
|
+
*
|
|
12
|
+
* Strategy (no LLM — pure heuristics):
|
|
13
|
+
* 1. Short queries (≤4 significant words) → pass through unchanged
|
|
14
|
+
* 2. Medium queries (5-8 words) → extract keyword core + original
|
|
15
|
+
* 3. Long queries (9+ words) → split into 2-4 concept clusters + keyword core
|
|
16
|
+
*
|
|
17
|
+
* All decomposition is deterministic and fast (<1ms).
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
// ── Types ───────────────────────────────────────────────────────────────────

export interface DecompositionResult {
  /** Original query (always included in sub-queries) */
  original: string
  /** Focused sub-queries (includes original if short enough) */
  subQueries: string[]
  /** Whether decomposition was applied */
  decomposed: boolean
  /**
   * Strategy used:
   * - "passthrough"   — query returned unchanged (short, or decomposition off)
   * - "keyword-core"  — original query plus an extracted keyword core
   * - "concept-split" — connector-based concept clusters plus keyword core
   */
  strategy: "passthrough" | "keyword-core" | "concept-split"
}

export interface DecomposerConfig {
  /** Enable/disable decomposition */
  enabled: boolean
  /** Min significant (non-stop) words to trigger decomposition */
  minWords: number
  /** Max sub-queries to generate (including original) */
  maxSubQueries: number
  /** Min words per sub-query */
  minSubQueryWords: number
}

// Defaults applied when vectorizer.yaml has no `decomposition:` section;
// mirrors the values written by the default config template.
export const DEFAULT_DECOMPOSER_CONFIG: DecomposerConfig = {
  enabled: true,
  minWords: 5,
  maxSubQueries: 4,
  minSubQueryWords: 2,
}
|
|
50
|
+
|
|
51
|
+
// ── Stop words (shared with BM25 + extras for query context) ────────────────

// Words stripped before counting "significant" words and building sub-queries.
// Standard English stop words plus query-phrasing words that agents commonly
// add ("find", "show", "implement", …) which carry no search signal.
const STOP_WORDS = new Set([
  "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
  "have", "has", "had", "do", "does", "did", "will", "would", "could",
  "should", "may", "might", "shall", "can", "need", "must",
  "and", "or", "but", "not", "no", "nor",
  "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
  "into", "about", "between", "through", "during", "before", "after",
  "this", "that", "these", "those", "it", "its",
  "i", "you", "he", "she", "we", "they", "me", "him", "her", "us", "them",
  "my", "your", "his", "our", "their",
  "what", "which", "who", "whom", "where", "when", "how", "why",
  "if", "then", "else", "so", "than", "too", "very",
  // Query-specific stop words (common in agent queries)
  "find", "search", "look", "show", "get", "give", "tell",
  "using", "used", "uses", "use",
  "like", "such", "also", "just", "only",
  "all", "any", "each", "every", "some",
  "code", "file", "files", "function", "class", "method",
  "implement", "implementation", "implements", "implemented",
  "related", "relevant", "similar",
  "please", "help", "want", "need",
])

// ── Connectors that signal concept boundaries ───────────────────────────────

// Tokens that separate distinct concepts in a long query. splitByConcepts()
// closes the current group at each connector and drops the connector itself.
// NOTE(review): deliberately overlaps with STOP_WORDS — connectors never
// contribute words to a sub-query either way.
const CONCEPT_CONNECTORS = new Set([
  "and", "or", "that", "which", "where", "when", "while",
  "with", "using", "through", "via", "for", "including",
  "also", "both", "either", "neither",
])
|
|
83
|
+
|
|
84
|
+
// ── Domain compound terms (keep together) ───────────────────────────────────
|
|
85
|
+
|
|
86
|
+
const COMPOUND_TERMS: Array<[string, string]> = [
|
|
87
|
+
["error", "handling"],
|
|
88
|
+
["event", "sourcing"],
|
|
89
|
+
["dependency", "injection"],
|
|
90
|
+
["access", "control"],
|
|
91
|
+
["rate", "limiting"],
|
|
92
|
+
["load", "balancing"],
|
|
93
|
+
["unit", "test"],
|
|
94
|
+
["integration", "test"],
|
|
95
|
+
["api", "endpoint"],
|
|
96
|
+
["api", "gateway"],
|
|
97
|
+
["data", "model"],
|
|
98
|
+
["data", "transfer"],
|
|
99
|
+
["database", "connection"],
|
|
100
|
+
["file", "system"],
|
|
101
|
+
["message", "queue"],
|
|
102
|
+
["state", "management"],
|
|
103
|
+
["type", "checking"],
|
|
104
|
+
["code", "review"],
|
|
105
|
+
["pull", "request"],
|
|
106
|
+
["design", "pattern"],
|
|
107
|
+
["repository", "pattern"],
|
|
108
|
+
["factory", "pattern"],
|
|
109
|
+
["observer", "pattern"],
|
|
110
|
+
["middleware", "chain"],
|
|
111
|
+
["call", "hierarchy"],
|
|
112
|
+
["graph", "traversal"],
|
|
113
|
+
]
|
|
114
|
+
|
|
115
|
+
// ── Tokenizer ───────────────────────────────────────────────────────────────
|
|
116
|
+
|
|
117
|
+
/**
|
|
118
|
+
* Tokenize query into lowercase words, preserving compound terms.
|
|
119
|
+
*/
|
|
120
|
+
export function tokenizeQuery(query: string): string[] {
|
|
121
|
+
const raw = query
|
|
122
|
+
.toLowerCase()
|
|
123
|
+
.replace(/[^a-z0-9_\-]/g, " ")
|
|
124
|
+
.split(/\s+/)
|
|
125
|
+
.filter(t => t.length > 1)
|
|
126
|
+
|
|
127
|
+
// Merge compound terms
|
|
128
|
+
const merged: string[] = []
|
|
129
|
+
let i = 0
|
|
130
|
+
while (i < raw.length) {
|
|
131
|
+
let found = false
|
|
132
|
+
if (i < raw.length - 1) {
|
|
133
|
+
for (const [a, b] of COMPOUND_TERMS) {
|
|
134
|
+
if (raw[i] === a && raw[i + 1] === b) {
|
|
135
|
+
merged.push(`${a}_${b}`)
|
|
136
|
+
i += 2
|
|
137
|
+
found = true
|
|
138
|
+
break
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
if (!found) {
|
|
143
|
+
merged.push(raw[i])
|
|
144
|
+
i++
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
return merged
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
/**
|
|
152
|
+
* Extract significant (non-stop) words from token list.
|
|
153
|
+
*/
|
|
154
|
+
export function extractSignificant(tokens: string[]): string[] {
|
|
155
|
+
return tokens.filter(t => !STOP_WORDS.has(t) && t.length > 2)
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
// ── Concept Clustering ──────────────────────────────────────────────────────
|
|
159
|
+
|
|
160
|
+
/**
|
|
161
|
+
* Split tokens into concept groups at connector boundaries.
|
|
162
|
+
*
|
|
163
|
+
* "JWT authentication middleware that validates user permissions for API endpoints"
|
|
164
|
+
* → ["JWT authentication middleware", "validates user permissions", "API endpoints"]
|
|
165
|
+
*/
|
|
166
|
+
export function splitByConcepts(tokens: string[]): string[][] {
|
|
167
|
+
const groups: string[][] = []
|
|
168
|
+
let current: string[] = []
|
|
169
|
+
|
|
170
|
+
for (const token of tokens) {
|
|
171
|
+
if (CONCEPT_CONNECTORS.has(token)) {
|
|
172
|
+
if (current.length > 0) {
|
|
173
|
+
groups.push(current)
|
|
174
|
+
current = []
|
|
175
|
+
}
|
|
176
|
+
// Skip the connector itself
|
|
177
|
+
} else {
|
|
178
|
+
current.push(token)
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
if (current.length > 0) {
|
|
183
|
+
groups.push(current)
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
return groups
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
/**
|
|
190
|
+
* Merge small concept groups with neighbors to meet minimum size.
|
|
191
|
+
*/
|
|
192
|
+
function mergeSmallGroups(groups: string[][], minSize: number): string[][] {
|
|
193
|
+
if (groups.length <= 1) return groups
|
|
194
|
+
|
|
195
|
+
const merged: string[][] = []
|
|
196
|
+
let buffer: string[] = []
|
|
197
|
+
|
|
198
|
+
for (const group of groups) {
|
|
199
|
+
buffer.push(...group)
|
|
200
|
+
// Extract significant words to check if buffer is "big enough"
|
|
201
|
+
const sig = extractSignificant(buffer)
|
|
202
|
+
if (sig.length >= minSize) {
|
|
203
|
+
merged.push([...buffer])
|
|
204
|
+
buffer = []
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
// Remaining buffer: merge with last group or push as-is
|
|
209
|
+
if (buffer.length > 0) {
|
|
210
|
+
if (merged.length > 0) {
|
|
211
|
+
merged[merged.length - 1].push(...buffer)
|
|
212
|
+
} else {
|
|
213
|
+
merged.push(buffer)
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
return merged
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
// ── Keyword Core Extraction ─────────────────────────────────────────────────
|
|
221
|
+
|
|
222
|
+
/**
|
|
223
|
+
* Extract a "keyword core" — the most important 3-4 words from the query.
|
|
224
|
+
* Uses a simple heuristic: take significant words, prefer longer/rarer ones.
|
|
225
|
+
*/
|
|
226
|
+
export function extractKeywordCore(significant: string[], maxWords: number = 3): string {
|
|
227
|
+
// Score words: longer words and compound terms score higher
|
|
228
|
+
const scored = significant.map(w => ({
|
|
229
|
+
word: w,
|
|
230
|
+
score: w.length + (w.includes("_") ? 5 : 0),
|
|
231
|
+
}))
|
|
232
|
+
|
|
233
|
+
scored.sort((a, b) => b.score - a.score)
|
|
234
|
+
const top = scored.slice(0, maxWords).map(s => s.word)
|
|
235
|
+
|
|
236
|
+
// Restore original order
|
|
237
|
+
const ordered = significant.filter(w => top.includes(w))
|
|
238
|
+
return ordered.slice(0, maxWords).join(" ").replace(/_/g, " ")
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
// ── Main Decomposer ─────────────────────────────────────────────────────────
|
|
242
|
+
|
|
243
|
+
/**
|
|
244
|
+
* Decompose a search query into focused sub-queries.
|
|
245
|
+
*
|
|
246
|
+
* @param query The original search query
|
|
247
|
+
* @param config Decomposer configuration
|
|
248
|
+
* @returns DecompositionResult with sub-queries and metadata
|
|
249
|
+
*/
|
|
250
|
+
export function decomposeQuery(
|
|
251
|
+
query: string,
|
|
252
|
+
config: DecomposerConfig = DEFAULT_DECOMPOSER_CONFIG,
|
|
253
|
+
): DecompositionResult {
|
|
254
|
+
if (!config.enabled) {
|
|
255
|
+
return {
|
|
256
|
+
original: query,
|
|
257
|
+
subQueries: [query],
|
|
258
|
+
decomposed: false,
|
|
259
|
+
strategy: "passthrough",
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
const tokens = tokenizeQuery(query)
|
|
264
|
+
const significant = extractSignificant(tokens)
|
|
265
|
+
|
|
266
|
+
// ── Strategy 1: Short query → passthrough ─────────────────────────────────
|
|
267
|
+
if (significant.length < config.minWords) {
|
|
268
|
+
return {
|
|
269
|
+
original: query,
|
|
270
|
+
subQueries: [query],
|
|
271
|
+
decomposed: false,
|
|
272
|
+
strategy: "passthrough",
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
// ── Strategy 2: Medium query (5-8 significant words) → keyword core ───────
|
|
277
|
+
if (significant.length <= 8) {
|
|
278
|
+
const core = extractKeywordCore(significant, 3)
|
|
279
|
+
const subQueries = [query]
|
|
280
|
+
|
|
281
|
+
// Only add core if it's meaningfully different from original
|
|
282
|
+
if (core !== query.toLowerCase().trim() && core.split(" ").length >= config.minSubQueryWords) {
|
|
283
|
+
subQueries.push(core)
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
return {
|
|
287
|
+
original: query,
|
|
288
|
+
subQueries: subQueries.slice(0, config.maxSubQueries),
|
|
289
|
+
decomposed: subQueries.length > 1,
|
|
290
|
+
strategy: subQueries.length > 1 ? "keyword-core" : "passthrough",
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
// ── Strategy 3: Long query (9+ significant words) → concept split ─────────
|
|
295
|
+
const conceptGroups = splitByConcepts(tokens)
|
|
296
|
+
const mergedGroups = mergeSmallGroups(conceptGroups, config.minSubQueryWords)
|
|
297
|
+
|
|
298
|
+
const subQueries: string[] = []
|
|
299
|
+
|
|
300
|
+
// Always include keyword core as first sub-query (highest signal)
|
|
301
|
+
const core = extractKeywordCore(significant, 4)
|
|
302
|
+
if (core.split(" ").length >= config.minSubQueryWords) {
|
|
303
|
+
subQueries.push(core)
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
// Add concept groups as sub-queries
|
|
307
|
+
for (const group of mergedGroups) {
|
|
308
|
+
const groupSig = extractSignificant(group)
|
|
309
|
+
if (groupSig.length >= config.minSubQueryWords) {
|
|
310
|
+
const subQuery = groupSig.join(" ").replace(/_/g, " ")
|
|
311
|
+
// Avoid duplicates
|
|
312
|
+
if (!subQueries.includes(subQuery)) {
|
|
313
|
+
subQueries.push(subQuery)
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
// If we still have room, add the original (truncated to first N significant words)
|
|
319
|
+
if (subQueries.length < config.maxSubQueries) {
|
|
320
|
+
const truncated = significant.slice(0, 6).join(" ").replace(/_/g, " ")
|
|
321
|
+
if (!subQueries.includes(truncated)) {
|
|
322
|
+
subQueries.push(truncated)
|
|
323
|
+
}
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
// Ensure we don't exceed max
|
|
327
|
+
const finalQueries = subQueries.slice(0, config.maxSubQueries)
|
|
328
|
+
|
|
329
|
+
return {
|
|
330
|
+
original: query,
|
|
331
|
+
subQueries: finalQueries.length > 0 ? finalQueries : [query],
|
|
332
|
+
decomposed: finalQueries.length > 1,
|
|
333
|
+
strategy: finalQueries.length > 1 ? "concept-split" : "passthrough",
|
|
334
|
+
}
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
// ── RRF Merge ───────────────────────────────────────────────────────────────
|
|
338
|
+
|
|
339
|
+
/**
|
|
340
|
+
* Reciprocal Rank Fusion — merge ranked result lists from multiple sub-queries.
|
|
341
|
+
*
|
|
342
|
+
* RRF score = sum(1 / (k + rank_i)) for each sub-query where the result appears.
|
|
343
|
+
*
|
|
344
|
+
* @param resultSets Array of result arrays, each sorted by relevance (best first)
|
|
345
|
+
* @param k RRF constant (default: 60, standard value from the paper)
|
|
346
|
+
* @param limit Max results to return
|
|
347
|
+
* @returns Merged results sorted by RRF score, with _rrfScore and _combinedScore set
|
|
348
|
+
*/
|
|
349
|
+
export function rrfMerge(
|
|
350
|
+
resultSets: Array<Array<Record<string, any>>>,
|
|
351
|
+
k: number = 60,
|
|
352
|
+
limit: number = 10,
|
|
353
|
+
): Array<Record<string, any>> {
|
|
354
|
+
if (resultSets.length === 0) return []
|
|
355
|
+
if (resultSets.length === 1) return resultSets[0].slice(0, limit)
|
|
356
|
+
|
|
357
|
+
// Build RRF scores keyed by chunk identity (file:chunk_index)
|
|
358
|
+
const scoreMap = new Map<string, { row: Record<string, any>; rrfScore: number; bestOriginalScore: number }>()
|
|
359
|
+
|
|
360
|
+
for (const results of resultSets) {
|
|
361
|
+
for (let rank = 0; rank < results.length; rank++) {
|
|
362
|
+
const r = results[rank]
|
|
363
|
+
const key = `${r.file}:${r.chunk_index}`
|
|
364
|
+
const rrfContribution = 1 / (k + rank + 1) // rank is 0-based, RRF uses 1-based
|
|
365
|
+
|
|
366
|
+
const existing = scoreMap.get(key)
|
|
367
|
+
const originalScore = r._combinedScore ?? (r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0)
|
|
368
|
+
|
|
369
|
+
if (existing) {
|
|
370
|
+
existing.rrfScore += rrfContribution
|
|
371
|
+
// Keep the row with the best original score (most metadata)
|
|
372
|
+
if (originalScore > existing.bestOriginalScore) {
|
|
373
|
+
existing.row = r
|
|
374
|
+
existing.bestOriginalScore = originalScore
|
|
375
|
+
}
|
|
376
|
+
} else {
|
|
377
|
+
scoreMap.set(key, {
|
|
378
|
+
row: r,
|
|
379
|
+
rrfScore: rrfContribution,
|
|
380
|
+
bestOriginalScore: originalScore,
|
|
381
|
+
})
|
|
382
|
+
}
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
// Sort by RRF score and return
|
|
387
|
+
const merged = Array.from(scoreMap.values())
|
|
388
|
+
.sort((a, b) => b.rrfScore - a.rrfScore)
|
|
389
|
+
.slice(0, limit)
|
|
390
|
+
.map(entry => ({
|
|
391
|
+
...entry.row,
|
|
392
|
+
_rrfScore: entry.rrfScore,
|
|
393
|
+
_combinedScore: entry.bestOriginalScore, // preserve for downstream compatibility
|
|
394
|
+
}))
|
|
395
|
+
|
|
396
|
+
return merged
|
|
397
|
+
}
|
package/vectorizer.yaml
CHANGED
|
@@ -68,6 +68,13 @@ vectorizer:
|
|
|
68
68
|
auto_prune_search: true # Replace old search outputs with compact summaries
|
|
69
69
|
substitute_tool_outputs: true # Replace read() outputs when chunks in workspace
|
|
70
70
|
|
|
71
|
+
# Query decomposition (v4 — improves long query relevance)
|
|
72
|
+
decomposition:
|
|
73
|
+
enabled: true # Split complex queries into focused sub-queries
|
|
74
|
+
min_words: 5 # Min significant words to trigger decomposition
|
|
75
|
+
max_sub_queries: 4 # Max sub-queries (including keyword core)
|
|
76
|
+
min_sub_query_words: 2 # Min words per sub-query
|
|
77
|
+
|
|
71
78
|
# Quality monitoring (v2)
|
|
72
79
|
quality:
|
|
73
80
|
enable_metrics: false # Track search quality metrics
|