@comfanion/usethis_search 4.3.1 → 4.5.0

@@ -0,0 +1,397 @@
+ /**
+  * Query Decomposer — splits complex queries into focused sub-queries.
+  *
+  * Problem: Long, multi-concept queries produce "diluted" embeddings
+  * because the embedding model (all-MiniLM-L6-v2, 384d) averages all
+  * token vectors into one. "JWT authentication middleware that validates
+  * permissions" → a blurry vector between auth, JWT, middleware, permissions.
+  *
+  * Solution: Decompose into focused sub-queries, search each independently,
+  * merge results via Reciprocal Rank Fusion (RRF).
+  *
+  * Strategy (no LLM — pure heuristics):
+  *   1. Short queries (≤4 significant words) → pass through unchanged
+  *   2. Medium queries (5-8 significant words) → extract keyword core + original
+  *   3. Long queries (9+ significant words) → keyword core + concept clusters (capped at maxSubQueries)
+  *
+  * All decomposition is deterministic and fast (<1ms).
+  */
+
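A minimal usage sketch of the pipeline this file adds (illustrative only; `searchChunks` is a hypothetical per-query vector-search function, not part of this module):

    const { subQueries } = decomposeQuery("JWT authentication middleware that validates user permissions")
    const resultSets = await Promise.all(subQueries.map(q => searchChunks(q)))  // one ranked list per sub-query
    const merged = rrfMerge(resultSets, 60, 10)                                 // fuse into a single top-10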
+ // ── Types ───────────────────────────────────────────────────────────────────
+
+ export interface DecompositionResult {
+   /** Original query, echoed back verbatim */
+   original: string
+   /** Focused sub-queries (includes the original when it is short enough) */
+   subQueries: string[]
+   /** Whether decomposition was applied */
+   decomposed: boolean
+   /** Strategy used */
+   strategy: "passthrough" | "keyword-core" | "concept-split"
+ }
+
+ export interface DecomposerConfig {
+   /** Enable/disable decomposition */
+   enabled: boolean
+   /** Min significant words to trigger decomposition */
+   minWords: number
+   /** Max sub-queries to generate in total */
+   maxSubQueries: number
+   /** Min words per sub-query */
+   minSubQueryWords: number
+ }
+
+ export const DEFAULT_DECOMPOSER_CONFIG: DecomposerConfig = {
+   enabled: true,
+   minWords: 5,
+   maxSubQueries: 4,
+   minSubQueryWords: 2,
+ }
+
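Callers can tweak individual knobs by spreading the defaults (illustrative):

    decomposeQuery(query, { ...DEFAULT_DECOMPOSER_CONFIG, maxSubQueries: 3 })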
+ // ── Stop words (shared with BM25 + extras for query context) ────────────────
+
+ const STOP_WORDS = new Set([
+   "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
+   "have", "has", "had", "do", "does", "did", "will", "would", "could",
+   "should", "may", "might", "shall", "can", "need", "must",
+   "and", "or", "but", "not", "no", "nor",
+   "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
+   "into", "about", "between", "through", "during", "before", "after",
+   "this", "that", "these", "those", "it", "its",
+   "i", "you", "he", "she", "we", "they", "me", "him", "her", "us", "them",
+   "my", "your", "his", "our", "their",
+   "what", "which", "who", "whom", "where", "when", "how", "why",
+   "if", "then", "else", "so", "than", "too", "very",
+   // Query-specific stop words (common in agent queries)
+   "find", "search", "look", "show", "get", "give", "tell",
+   "using", "used", "uses", "use",
+   "like", "such", "also", "just", "only",
+   "all", "any", "each", "every", "some",
+   "code", "file", "files", "function", "class", "method",
+   "implement", "implementation", "implements", "implemented",
+   "related", "relevant", "similar",
+   "please", "help", "want", "need",
+ ])
+
+ // ── Connectors that signal concept boundaries ───────────────────────────────
+
+ const CONCEPT_CONNECTORS = new Set([
+   "and", "or", "that", "which", "where", "when", "while",
+   "with", "using", "through", "via", "for", "including",
+   "also", "both", "either", "neither",
+ ])
+
+ // ── Domain compound terms (keep together) ───────────────────────────────────
+
+ const COMPOUND_TERMS: Array<[string, string]> = [
+   ["error", "handling"],
+   ["event", "sourcing"],
+   ["dependency", "injection"],
+   ["access", "control"],
+   ["rate", "limiting"],
+   ["load", "balancing"],
+   ["unit", "test"],
+   ["integration", "test"],
+   ["api", "endpoint"],
+   ["api", "gateway"],
+   ["data", "model"],
+   ["data", "transfer"],
+   ["database", "connection"],
+   ["file", "system"],
+   ["message", "queue"],
+   ["state", "management"],
+   ["type", "checking"],
+   ["code", "review"],
+   ["pull", "request"],
+   ["design", "pattern"],
+   ["repository", "pattern"],
+   ["factory", "pattern"],
+   ["observer", "pattern"],
+   ["middleware", "chain"],
+   ["call", "hierarchy"],
+   ["graph", "traversal"],
+ ]
+
+ // ── Tokenizer ───────────────────────────────────────────────────────────────
+
+ /**
+  * Tokenize query into lowercase words, preserving compound terms.
+  */
+ export function tokenizeQuery(query: string): string[] {
+   const raw = query
+     .toLowerCase()
+     .replace(/[^a-z0-9_\-]/g, " ")
+     .split(/\s+/)
+     .filter(t => t.length > 1)
+
+   // Merge compound terms
+   const merged: string[] = []
+   let i = 0
+   while (i < raw.length) {
+     let found = false
+     if (i < raw.length - 1) {
+       for (const [a, b] of COMPOUND_TERMS) {
+         if (raw[i] === a && raw[i + 1] === b) {
+           merged.push(`${a}_${b}`)
+           i += 2
+           found = true
+           break
+         }
+       }
+     }
+     if (!found) {
+       merged.push(raw[i])
+       i++
+     }
+   }
+
+   return merged
+ }
+
+ /**
+  * Extract significant (non-stop) words from token list.
+  */
+ export function extractSignificant(tokens: string[]): string[] {
+   return tokens.filter(t => !STOP_WORDS.has(t) && t.length > 2)
+ }
+
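Traced through the two helpers above (compound merging runs before stop-word filtering, so "rate limiting" survives as one token):

    tokenizeQuery("find code using rate limiting for the api gateway")
    // → ["find", "code", "using", "rate_limiting", "for", "the", "api_gateway"]
    extractSignificant(["find", "code", "using", "rate_limiting", "for", "the", "api_gateway"])
    // → ["rate_limiting", "api_gateway"]   (the rest are stop words)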
+ // ── Concept Clustering ──────────────────────────────────────────────────────
+
+ /**
+  * Split tokens into concept groups at connector boundaries.
+  *
+  * "JWT authentication middleware that validates user permissions for API endpoints"
+  * → ["JWT authentication middleware", "validates user permissions", "API endpoints"]
+  */
+ export function splitByConcepts(tokens: string[]): string[][] {
+   const groups: string[][] = []
+   let current: string[] = []
+
+   for (const token of tokens) {
+     if (CONCEPT_CONNECTORS.has(token)) {
+       if (current.length > 0) {
+         groups.push(current)
+         current = []
+       }
+       // Skip the connector itself
+     } else {
+       current.push(token)
+     }
+   }
+
+   if (current.length > 0) {
+     groups.push(current)
+   }
+
+   return groups
+ }
+
+ /**
+  * Merge small concept groups with neighbors to meet minimum size.
+  */
+ function mergeSmallGroups(groups: string[][], minSize: number): string[][] {
+   if (groups.length <= 1) return groups
+
+   const merged: string[][] = []
+   let buffer: string[] = []
+
+   for (const group of groups) {
+     buffer.push(...group)
+     // Extract significant words to check if buffer is "big enough"
+     const sig = extractSignificant(buffer)
+     if (sig.length >= minSize) {
+       merged.push([...buffer])
+       buffer = []
+     }
+   }
+
+   // Remaining buffer: merge with last group or push as-is
+   if (buffer.length > 0) {
+     if (merged.length > 0) {
+       merged[merged.length - 1].push(...buffer)
+     } else {
+       merged.push(buffer)
+     }
+   }
+
+   return merged
+ }
+
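Traced example of the two functions above: connectors split the token stream, then undersized groups are folded into their neighbors:

    splitByConcepts(["error_handling", "with", "retries", "and", "backoff"])
    // → [["error_handling"], ["retries"], ["backoff"]]   ("with"/"and" are connectors)
    // mergeSmallGroups(groups, 2) then folds the one-word groups together:
    // → [["error_handling", "retries", "backoff"]]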
+ // ── Keyword Core Extraction ─────────────────────────────────────────────────
+
+ /**
+  * Extract a "keyword core" — the most important 3-4 words from the query.
+  * Uses a simple heuristic: take significant words, prefer longer/rarer ones.
+  */
+ export function extractKeywordCore(significant: string[], maxWords: number = 3): string {
+   // Score words: longer words and compound terms score higher
+   const scored = significant.map(w => ({
+     word: w,
+     score: w.length + (w.includes("_") ? 5 : 0),
+   }))
+
+   scored.sort((a, b) => b.score - a.score)
+   const top = scored.slice(0, maxWords).map(s => s.word)
+
+   // Restore original order
+   const ordered = significant.filter(w => top.includes(w))
+   return ordered.slice(0, maxWords).join(" ").replace(/_/g, " ")
+ }
+
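Worked example (scores follow directly from the length-plus-compound-bonus heuristic):

    extractKeywordCore(["jwt", "authentication", "middleware", "validates", "user", "permissions"], 3)
    // scores: authentication=14, permissions=11, middleware=10, validates=9, user=4, jwt=3
    // top 3, restored to query order → "authentication middleware permissions"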
+ // ── Main Decomposer ─────────────────────────────────────────────────────────
+
+ /**
+  * Decompose a search query into focused sub-queries.
+  *
+  * @param query The original search query
+  * @param config Decomposer configuration
+  * @returns DecompositionResult with sub-queries and metadata
+  */
+ export function decomposeQuery(
+   query: string,
+   config: DecomposerConfig = DEFAULT_DECOMPOSER_CONFIG,
+ ): DecompositionResult {
+   if (!config.enabled) {
+     return {
+       original: query,
+       subQueries: [query],
+       decomposed: false,
+       strategy: "passthrough",
+     }
+   }
+
+   const tokens = tokenizeQuery(query)
+   const significant = extractSignificant(tokens)
+
+   // ── Strategy 1: Short query → passthrough ─────────────────────────────────
+   if (significant.length < config.minWords) {
+     return {
+       original: query,
+       subQueries: [query],
+       decomposed: false,
+       strategy: "passthrough",
+     }
+   }
+
+   // ── Strategy 2: Medium query (5-8 significant words) → keyword core ───────
+   if (significant.length <= 8) {
+     const core = extractKeywordCore(significant, 3)
+     const subQueries = [query]
+
+     // Only add core if it's meaningfully different from original
+     if (core !== query.toLowerCase().trim() && core.split(" ").length >= config.minSubQueryWords) {
+       subQueries.push(core)
+     }
+
+     return {
+       original: query,
+       subQueries: subQueries.slice(0, config.maxSubQueries),
+       decomposed: subQueries.length > 1,
+       strategy: subQueries.length > 1 ? "keyword-core" : "passthrough",
+     }
+   }
+
+   // ── Strategy 3: Long query (9+ significant words) → concept split ─────────
+   const conceptGroups = splitByConcepts(tokens)
+   const mergedGroups = mergeSmallGroups(conceptGroups, config.minSubQueryWords)
+
+   const subQueries: string[] = []
+
+   // Always include keyword core as first sub-query (highest signal)
+   const core = extractKeywordCore(significant, 4)
+   if (core.split(" ").length >= config.minSubQueryWords) {
+     subQueries.push(core)
+   }
+
+   // Add concept groups as sub-queries
+   for (const group of mergedGroups) {
+     const groupSig = extractSignificant(group)
+     if (groupSig.length >= config.minSubQueryWords) {
+       const subQuery = groupSig.join(" ").replace(/_/g, " ")
+       // Avoid duplicates
+       if (!subQueries.includes(subQuery)) {
+         subQueries.push(subQuery)
+       }
+     }
+   }
+
+   // If we still have room, add the original (truncated to first N significant words)
+   if (subQueries.length < config.maxSubQueries) {
+     const truncated = significant.slice(0, 6).join(" ").replace(/_/g, " ")
+     if (!subQueries.includes(truncated)) {
+       subQueries.push(truncated)
+     }
+   }
+
+   // Ensure we don't exceed max
+   const finalQueries = subQueries.slice(0, config.maxSubQueries)
+
+   return {
+     original: query,
+     subQueries: finalQueries.length > 0 ? finalQueries : [query],
+     decomposed: finalQueries.length > 1,
+     strategy: finalQueries.length > 1 ? "concept-split" : "passthrough",
+   }
+ }
+
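End-to-end example of the medium-query path (six significant words triggers the keyword-core strategy; traced against the code above):

    decomposeQuery("JWT authentication middleware validates user permissions")
    // → {
    //     original: "JWT authentication middleware validates user permissions",
    //     subQueries: [
    //       "JWT authentication middleware validates user permissions",
    //       "authentication middleware permissions",   // keyword core
    //     ],
    //     decomposed: true,
    //     strategy: "keyword-core",
    //   }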
+ // ── RRF Merge ───────────────────────────────────────────────────────────────
+
+ /**
+  * Reciprocal Rank Fusion — merge ranked result lists from multiple sub-queries.
+  *
+  * RRF score = sum(1 / (k + rank_i)) for each sub-query where the result appears.
+  *
+  * @param resultSets Array of result arrays, each sorted by relevance (best first)
+  * @param k RRF constant (default: 60, standard value from the paper)
+  * @param limit Max results to return
+  * @returns Merged results sorted by RRF score, with _rrfScore and _combinedScore set
+  */
+ export function rrfMerge(
+   resultSets: Array<Array<Record<string, any>>>,
+   k: number = 60,
+   limit: number = 10,
+ ): Array<Record<string, any>> {
+   if (resultSets.length === 0) return []
+   if (resultSets.length === 1) return resultSets[0].slice(0, limit)
+
+   // Build RRF scores keyed by chunk identity (file:chunk_index)
+   const scoreMap = new Map<string, { row: Record<string, any>; rrfScore: number; bestOriginalScore: number }>()
+
+   for (const results of resultSets) {
+     for (let rank = 0; rank < results.length; rank++) {
+       const r = results[rank]
+       const key = `${r.file}:${r.chunk_index}`
+       const rrfContribution = 1 / (k + rank + 1) // rank is 0-based, RRF uses 1-based
+
+       const existing = scoreMap.get(key)
+       const originalScore = r._combinedScore ?? (r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0)
+
+       if (existing) {
+         existing.rrfScore += rrfContribution
+         // Keep the row with the best original score (most metadata)
+         if (originalScore > existing.bestOriginalScore) {
+           existing.row = r
+           existing.bestOriginalScore = originalScore
+         }
+       } else {
+         scoreMap.set(key, {
+           row: r,
+           rrfScore: rrfContribution,
+           bestOriginalScore: originalScore,
+         })
+       }
+     }
+   }
+
+   // Sort by RRF score and return
+   const merged = Array.from(scoreMap.values())
+     .sort((a, b) => b.rrfScore - a.rrfScore)
+     .slice(0, limit)
+     .map(entry => ({
+       ...entry.row,
+       _rrfScore: entry.rrfScore,
+       _combinedScore: entry.bestOriginalScore, // preserve for downstream compatibility
+     }))
+
+   return merged
+ }
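Worked arithmetic for the formula with k = 60 (ranks 1-based, so each list contributes 1 / (60 + rank)):

    // chunk A: rank 1 in two sub-query lists → 1/61 + 1/61 ≈ 0.0328
    // chunk B: rank 1 in one list only       → 1/61        ≈ 0.0164
    // chunk C: rank 3 and rank 5             → 1/63 + 1/65 ≈ 0.0313
    // A > C > B — agreement across sub-queries outweighs a single high rank.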
@@ -46,6 +46,8 @@ export interface UsageData {
  }

  const MAX_PROVENANCE_PER_CHUNK = 20
+ const MAX_TRACKED_CHUNKS = 5000 // Cap total tracked chunks to prevent unbounded growth
+ const STALE_CHUNK_AGE_MS = 7 * 24 * 60 * 60 * 1000 // 7 days — chunks unused for longer are evicted

  // ---------------------------------------------------------------------------
  // UsageTracker
@@ -126,6 +128,40 @@ export class UsageTracker {
       stats.lastUsed = now
     }
     this.dirty = true
+
+    // Evict stale + over-cap entries periodically (every 50 searches)
+    if (this.data.totalSearches % 50 === 0) {
+      this.evictStaleChunks()
+    }
+  }
+
+  /**
+   * Evict chunks not used within STALE_CHUNK_AGE_MS, then cap at MAX_TRACKED_CHUNKS.
+   * Keeps the most recently used chunks.
+   */
+  private evictStaleChunks(): void {
+    if (!this.data) return
+    const now = Date.now()
+    const chunks = this.data.chunks
+
+    // Phase 1: remove stale (not used in 7 days)
+    for (const [id, stats] of Object.entries(chunks)) {
+      if (stats.lastUsed > 0 && now - stats.lastUsed > STALE_CHUNK_AGE_MS) {
+        delete chunks[id]
+        this.dirty = true
+      }
+    }
+
+    // Phase 2: if still over cap, evict least-used
+    const entries = Object.entries(chunks)
+    if (entries.length > MAX_TRACKED_CHUNKS) {
+      entries.sort((a, b) => a[1].lastUsed - b[1].lastUsed)
+      const toRemove = entries.length - MAX_TRACKED_CHUNKS
+      for (let i = 0; i < toRemove; i++) {
+        delete chunks[entries[i][0]]
+      }
+      this.dirty = true
+    }
   }

   // ---- FR-062: "where is chunk X used?" -----------------------------------
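Back-of-envelope on the new bounds (illustrative; assumes a search touches roughly 20 chunks):

    // Eviction runs on every 50th search, so between passes the map can
    // overshoot MAX_TRACKED_CHUNKS by at most ~50 × 20 = 1000 entries.
    // Each pass first drops entries idle > 7 days, then trims to the 5000
    // most recently used, keeping steady-state memory bounded.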
package/vectorizer.yaml CHANGED
@@ -62,12 +62,19 @@ vectorizer:
   max_chunks: 100 # Max number of chunks in workspace
   attach_top_n: 10 # Top N search chunks to attach with full content
   attach_related_per_chunk: 3 # Max graph relation chunks per main chunk
- min_score_main: 0.65 # Min score for main chunks
- min_score_related: 0.5 # Min score for graph relation chunks
+ min_score_main: 0.5 # Min score for main chunks
+ min_score_related: 0.35 # Min score for graph relation chunks
  persist_content: false # Save full chunk content in snapshots (debug mode)
  auto_prune_search: true # Replace old search outputs with compact summaries
  substitute_tool_outputs: true # Replace read() outputs when chunks in workspace

+ # Query decomposition (v4 — improves long query relevance)
+ decomposition:
+   enabled: true # Split complex queries into focused sub-queries
+   min_words: 5 # Min significant words to trigger decomposition
+   max_sub_queries: 4 # Max sub-queries (including keyword core)
+   min_sub_query_words: 2 # Min words per sub-query
+
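These snake_case keys presumably feed the camelCase `DecomposerConfig` fields from the TypeScript module above; a hypothetical loader mapping (`yaml` here is an assumed parsed-config object, not part of this diff):

    const decomposerConfig: DecomposerConfig = {
      enabled: yaml.decomposition.enabled,                      // true
      minWords: yaml.decomposition.min_words,                   // 5
      maxSubQueries: yaml.decomposition.max_sub_queries,        // 4
      minSubQueryWords: yaml.decomposition.min_sub_query_words, // 2
    }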
  # Quality monitoring (v2)
  quality:
  enable_metrics: false # Track search quality metrics