sweet-search 0.0.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. package/LICENSE +190 -0
  2. package/NOTICE +23 -0
  3. package/core/cli.js +51 -0
  4. package/core/config.js +27 -0
  5. package/core/embedding/embedding-cache.js +467 -0
  6. package/core/embedding/embedding-local-model.js +845 -0
  7. package/core/embedding/embedding-remote.js +492 -0
  8. package/core/embedding/embedding-service.js +712 -0
  9. package/core/embedding/embedding-telemetry.js +219 -0
  10. package/core/embedding/index.js +40 -0
  11. package/core/graph/community-detector.js +294 -0
  12. package/core/graph/graph-expansion.js +839 -0
  13. package/core/graph/graph-extractor.js +2304 -0
  14. package/core/graph/graph-search.js +2148 -0
  15. package/core/graph/hcgs-generator.js +666 -0
  16. package/core/graph/index.js +16 -0
  17. package/core/graph/leiden-algorithm.js +547 -0
  18. package/core/graph/relationship-resolver.js +366 -0
  19. package/core/graph/repo-map.js +408 -0
  20. package/core/graph/summary-manager.js +549 -0
  21. package/core/indexing/artifact-builder.js +1054 -0
  22. package/core/indexing/ast-chunker.js +709 -0
  23. package/core/indexing/chunking/chunk-builder.js +170 -0
  24. package/core/indexing/chunking/markdown-chunker.js +503 -0
  25. package/core/indexing/chunking/plaintext-chunker.js +104 -0
  26. package/core/indexing/dedup/dedup-phase.js +159 -0
  27. package/core/indexing/dedup/exemplar-selector.js +65 -0
  28. package/core/indexing/document-chunker.js +56 -0
  29. package/core/indexing/incremental-parser.js +390 -0
  30. package/core/indexing/incremental-tracker.js +761 -0
  31. package/core/indexing/index-codebase-v21.js +472 -0
  32. package/core/indexing/index-maintainer.mjs +1674 -0
  33. package/core/indexing/index.js +90 -0
  34. package/core/indexing/indexer-ann.js +1077 -0
  35. package/core/indexing/indexer-build.js +742 -0
  36. package/core/indexing/indexer-phases.js +800 -0
  37. package/core/indexing/indexer-pool.js +764 -0
  38. package/core/indexing/indexer-sparse-gram.js +98 -0
  39. package/core/indexing/indexer-utils.js +536 -0
  40. package/core/indexing/indexer-worker.js +148 -0
  41. package/core/indexing/li-skip-policy.js +225 -0
  42. package/core/indexing/merkle-tracker.js +244 -0
  43. package/core/indexing/model-pool.js +166 -0
  44. package/core/infrastructure/code-graph-repository.js +120 -0
  45. package/core/infrastructure/codebase-repository.js +131 -0
  46. package/core/infrastructure/config/dedup.js +54 -0
  47. package/core/infrastructure/config/embedding.js +298 -0
  48. package/core/infrastructure/config/graph.js +80 -0
  49. package/core/infrastructure/config/index.js +82 -0
  50. package/core/infrastructure/config/indexing.js +8 -0
  51. package/core/infrastructure/config/platform.js +254 -0
  52. package/core/infrastructure/config/ranking.js +221 -0
  53. package/core/infrastructure/config/search.js +396 -0
  54. package/core/infrastructure/config/translation.js +89 -0
  55. package/core/infrastructure/config/vector-store.js +114 -0
  56. package/core/infrastructure/constants.js +86 -0
  57. package/core/infrastructure/coreml-cascade.js +909 -0
  58. package/core/infrastructure/coreml-cascade.json +46 -0
  59. package/core/infrastructure/coreml-provider.js +81 -0
  60. package/core/infrastructure/db-utils.js +69 -0
  61. package/core/infrastructure/dedup-hashing.js +83 -0
  62. package/core/infrastructure/hardware-capability.js +332 -0
  63. package/core/infrastructure/index.js +104 -0
  64. package/core/infrastructure/language-patterns/maps.js +121 -0
  65. package/core/infrastructure/language-patterns/registry-core.js +323 -0
  66. package/core/infrastructure/language-patterns/registry-data-query.js +155 -0
  67. package/core/infrastructure/language-patterns/registry-object-oriented.js +285 -0
  68. package/core/infrastructure/language-patterns/registry-tooling.js +240 -0
  69. package/core/infrastructure/language-patterns/registry-web-style.js +143 -0
  70. package/core/infrastructure/language-patterns/registry.js +19 -0
  71. package/core/infrastructure/language-patterns.js +141 -0
  72. package/core/infrastructure/llm-provider.js +733 -0
  73. package/core/infrastructure/manifest.json +46 -0
  74. package/core/infrastructure/maxsim.wasm +0 -0
  75. package/core/infrastructure/model-fetcher.js +423 -0
  76. package/core/infrastructure/model-registry.js +214 -0
  77. package/core/infrastructure/native-inference.js +587 -0
  78. package/core/infrastructure/native-resolver.js +187 -0
  79. package/core/infrastructure/native-sparse-gram.js +257 -0
  80. package/core/infrastructure/native-tokenizer.js +160 -0
  81. package/core/infrastructure/onnx-mutex.js +45 -0
  82. package/core/infrastructure/onnx-session-utils.js +261 -0
  83. package/core/infrastructure/ort-pipeline.js +111 -0
  84. package/core/infrastructure/project-detector.js +102 -0
  85. package/core/infrastructure/quantization.js +410 -0
  86. package/core/infrastructure/simd-distance.js +502 -0
  87. package/core/infrastructure/simd-distance.wasm +0 -0
  88. package/core/infrastructure/tree-sitter-provider.js +665 -0
  89. package/core/infrastructure/webgpu-maxsim.js +222 -0
  90. package/core/query/index.js +35 -0
  91. package/core/query/intent-detector.js +201 -0
  92. package/core/query/intent-router.js +156 -0
  93. package/core/query/query-router-catboost.js +222 -0
  94. package/core/query/query-router-ml.js +266 -0
  95. package/core/query/query-router.js +213 -0
  96. package/core/ranking/cascaded-scorer.js +379 -0
  97. package/core/ranking/flashrank.js +810 -0
  98. package/core/ranking/index.js +49 -0
  99. package/core/ranking/late-interaction-index.js +2383 -0
  100. package/core/ranking/late-interaction-model.js +812 -0
  101. package/core/ranking/local-reranker.js +374 -0
  102. package/core/ranking/mmr.js +379 -0
  103. package/core/ranking/quality-scorer.js +363 -0
  104. package/core/search/context-expander.js +1167 -0
  105. package/core/search/dedup/sibling-expander.js +327 -0
  106. package/core/search/index.js +16 -0
  107. package/core/search/search-boost.js +259 -0
  108. package/core/search/search-cli.js +544 -0
  109. package/core/search/search-format.js +282 -0
  110. package/core/search/search-fusion.js +327 -0
  111. package/core/search/search-hybrid.js +204 -0
  112. package/core/search/search-pattern-chunks.js +337 -0
  113. package/core/search/search-pattern-planner.js +439 -0
  114. package/core/search/search-pattern-prefilter.js +412 -0
  115. package/core/search/search-pattern-ripgrep.js +663 -0
  116. package/core/search/search-pattern.js +463 -0
  117. package/core/search/search-postprocess.js +452 -0
  118. package/core/search/search-semantic.js +706 -0
  119. package/core/search/search-server.js +554 -0
  120. package/core/search/session-daemon-prewarm.mjs +164 -0
  121. package/core/search/session-warmup.js +595 -0
  122. package/core/search/sweet-search.js +632 -0
  123. package/core/search/warmup-metrics.js +532 -0
  124. package/core/start-server.js +6 -0
  125. package/core/training/query-router/features/extractor.js +762 -0
  126. package/core/training/query-router/features/multilingual-patterns.js +431 -0
  127. package/core/training/query-router/features/text-segmenter.js +303 -0
  128. package/core/training/query-router/features/unicode-utils.js +383 -0
  129. package/core/training/query-router/output/v45_router_d4.js +11521 -0
  130. package/core/training/query-router/output/v46_router_d4.js +11498 -0
  131. package/core/vector-store/binary-heap.js +227 -0
  132. package/core/vector-store/binary-hnsw-index.js +1004 -0
  133. package/core/vector-store/float-vector-store.js +234 -0
  134. package/core/vector-store/hnsw-index.js +580 -0
  135. package/core/vector-store/index.js +39 -0
  136. package/core/vector-store/seismic-index.js +498 -0
  137. package/core/vocabulary/index.js +84 -0
  138. package/core/vocabulary/vocab-constants.js +20 -0
  139. package/core/vocabulary/vocab-miner-extractors.js +375 -0
  140. package/core/vocabulary/vocab-miner-nl.js +404 -0
  141. package/core/vocabulary/vocab-miner-utils.js +146 -0
  142. package/core/vocabulary/vocab-miner.js +574 -0
  143. package/core/vocabulary/vocab-prewarm-cli.js +110 -0
  144. package/core/vocabulary/vocab-ranker.js +492 -0
  145. package/core/vocabulary/vocab-warmer.js +523 -0
  146. package/core/vocabulary/vocab-warmup-orchestrator.js +425 -0
  147. package/core/vocabulary/vocabulary-utils.js +704 -0
  148. package/crates/wasm-router/pkg/package.json +13 -0
  149. package/crates/wasm-router/pkg/query_router_wasm.d.ts +36 -0
  150. package/crates/wasm-router/pkg/query_router_wasm.js +271 -0
  151. package/crates/wasm-router/pkg/query_router_wasm_bg.wasm +0 -0
  152. package/crates/wasm-router/pkg/query_router_wasm_bg.wasm.d.ts +19 -0
  153. package/mcp/config-gen.js +121 -0
  154. package/mcp/server.js +335 -0
  155. package/mcp/tool-handlers.js +476 -0
  156. package/package.json +131 -9
  157. package/scripts/benchmark-harness.js +794 -0
  158. package/scripts/init.js +1058 -0
  159. package/scripts/smoke-test.js +435 -0
  160. package/scripts/uninstall.js +478 -0
  161. package/scripts/verify-runtime.js +176 -0
@@ -0,0 +1,839 @@
1
+ /**
2
+ * Graph Expansion Module
3
+ *
4
+ * Expands search results by following relationship edges in the entity graph.
5
+ * Given top-k search results, performs 1-hop (or 2-hop) traversal to include
6
+ * related code chunks (imports, extends, implements, uses, calls).
7
+ *
8
+ * This helps with multi-hop coding questions where the answer spans
9
+ * multiple related entities.
10
+ */
11
+
12
// Relationship kinds followed during expansion when the caller supplies no
// `edgeTypes` option of its own.
const DEFAULT_EDGE_TYPES = new Set([
  'imports',
  'extends',
  'implements',
  'uses',
  'calls',
]);
14
+
15
+ // --- Token Estimation Helpers ---
16
+
17
// Average tokens per source line, keyed by language name. The original author
// attributes these figures to a CodeSearchNet analysis. Languages missing
// from this table are handled by the consumer's own default.
const TOKENS_PER_LINE = {
  java: 15,
  kotlin: 14,
  swift: 13,
  go: 12,
  c: 12,
  cpp: 12,
  php: 11,
  javascript: 10,
  typescript: 10,
  jsx: 10,
  tsx: 10,
  ruby: 9,
  python: 8,
};

// File extension (lower-case, no dot) -> key of TOKENS_PER_LINE. Only
// extensions that differ from their language key need an entry here.
const EXT_TO_LANG = {
  js: 'javascript',
  ts: 'typescript',
  py: 'python',
  rb: 'ruby',
  kt: 'kotlin',
  cc: 'cpp',
  cxx: 'cpp',
  h: 'c',
  hpp: 'cpp',
  m: 'c',
};
30
+
31
/**
 * Approximate the token count of a text by counting its runs of
 * non-whitespace characters.
 *
 * The original author quotes an accuracy of roughly ±10-15% against real BPE
 * counts; that figure is not verifiable from this file.
 *
 * @param {string} [text] - Text to measure; any falsy value counts as empty.
 * @returns {number} Number of whitespace-delimited words, 0 for empty input.
 */
function estimateTokenCount(text) {
  if (!text) {
    return 0;
  }
  const words = text.match(/\S+/g);
  return words === null ? 0 : words.length;
}
39
+
40
/**
 * Estimate a result's token count from its line span when its text is not
 * available: (number of lines) x (average tokens per line for its language).
 *
 * The language comes from `metadata.language` when present, otherwise from
 * the file extension (mapped through EXT_TO_LANG, or used as-is). Languages
 * missing from TOKENS_PER_LINE are costed at 10 tokens per line.
 *
 * @param {Object} result - Search result or entity-like record.
 * @returns {number} Estimated tokens; never less than one line's worth.
 */
function fallbackTokenEstimate(result) {
  const meta = result.metadata;
  const path = result.file_path || result.file || meta?.file || meta?.path || '';
  // Text after the last dot; for a path without a dot this is the whole path,
  // which simply misses both lookup tables below.
  const ext = path.split('.').pop()?.toLowerCase();

  const lang = meta?.language || EXT_TO_LANG[ext] || ext;
  const perLine = TOKENS_PER_LINE[lang] || 10;

  const firstLine = result.start_line || result.startLine || 0;
  const lastLine = result.end_line || result.endLine || firstLine;
  const lineCount = lastLine - firstLine + 1;

  // An inverted or missing range still costs one line.
  return Math.max(1, lineCount) * perLine;
}
53
+
54
/**
 * Batch-load chunk texts by vector ID.
 *
 * Accepts either a repository object exposing `getChunkTexts(ids)` (preferred;
 * the call is delegated unchanged) or a raw better-sqlite3 Database (legacy),
 * in which case the `vectors` table is queried directly.
 *
 * On the legacy path the IDs are de-duplicated and queried in fixed-size
 * batches. Binding every ID in a single `IN (...)` list exceeds SQLite's
 * bound-variable limit for large inputs; the statement then throws and the
 * best-effort `catch` would turn that into an empty result.
 *
 * @param {import('../infrastructure/codebase-repository.js').CodebaseRepository|import('better-sqlite3').Database} codebaseDbOrRepo
 * @param {string[]} ids - Vector IDs to look up
 * @returns {Map<string, string>} id → text. Empty when the inputs are missing
 *   or when any legacy query fails (best-effort: errors are not propagated).
 */
export function loadChunkTexts(codebaseDbOrRepo, ids) {
  if (!codebaseDbOrRepo || !ids || ids.length === 0) return new Map();

  // Repository path (preferred)
  if (typeof codebaseDbOrRepo.getChunkTexts === 'function') {
    return codebaseDbOrRepo.getChunkTexts(ids);
  }

  // Legacy raw-DB path (backward compat).
  // 500 stays well under 999, the lowest default SQLITE_MAX_VARIABLE_NUMBER.
  const BATCH_SIZE = 500;
  const uniqueIds = [...new Set(ids)];
  const texts = new Map();

  try {
    for (let offset = 0; offset < uniqueIds.length; offset += BATCH_SIZE) {
      const batch = uniqueIds.slice(offset, offset + BATCH_SIZE);
      const placeholders = batch.map(() => '?').join(',');
      const rows = codebaseDbOrRepo
        .prepare(`SELECT id, text FROM vectors WHERE id IN (${placeholders})`)
        .all(...batch);
      for (const row of rows) {
        texts.set(row.id, row.text);
      }
    }
  } catch {
    // Same all-or-nothing contract as before: a failed lookup yields no texts.
    return new Map();
  }

  return texts;
}
78
+
79
/**
 * Compute text-based token estimates for a mixed list of results.
 *
 * - Results without `is_expanded` that carry an `id` are resolved in one
 *   batch through `loadChunkTexts(codebaseDb, ids)`.
 * - Results flagged `is_expanded` are read through the injected
 *   `readFileLines` callback (injected so this module needs no imports).
 *
 * A result whose text cannot be obtained (or is empty) gets no entry, so the
 * caller has to supply its own fallback for missing indexes.
 *
 * @param {Array} results
 * @param {Object} options
 * @param {import('better-sqlite3').Database} [options.codebaseDb]
 * @param {Function} [options.readFileLines] - (filePath, startLine, endLine) => string|null
 * @returns {Map<number, number>} index → token count
 */
export function computeTokenEstimates(results, options = {}) {
  const { codebaseDb, readFileLines } = options;
  const estimates = new Map();

  // [result index, vector id] pairs awaiting the batched text lookup.
  const pendingOriginals = [];

  for (const [index, entry] of results.entries()) {
    if (!entry.is_expanded) {
      if (entry.id) pendingOriginals.push([index, entry.id]);
      continue;
    }
    if (!readFileLines) continue;

    const path = entry.file_path || entry.file;
    const firstLine = entry.start_line || entry.startLine;
    const lastLine = entry.end_line || entry.endLine;
    if (!path || !firstLine) continue;

    const snippet = readFileLines(path, firstLine, lastLine);
    if (snippet) {
      estimates.set(index, estimateTokenCount(snippet));
    }
  }

  // Batch-load original chunk texts from codebase.db
  const chunkTexts = loadChunkTexts(codebaseDb, pendingOriginals.map(([, id]) => id));
  for (const [index, id] of pendingOriginals) {
    const chunk = chunkTexts.get(id);
    if (chunk) {
      estimates.set(index, estimateTokenCount(chunk));
    }
  }

  return estimates;
}
126
+
127
// Default score multipliers for graph-expanded results, applied to the best
// original score: one value for 1-hop neighbours, a smaller one beyond that.
const HOP_DECAY = 0.6;
const HOP2_DECAY = 0.35;

// Relative importance of each edge type when ranking 2-hop candidates.
const EDGE_PRIORITY = {
  extends: 4,
  implements: 4,
  imports: 3,
  calls: 2,
  uses: 1,
};

// Scoring constants for adaptive 2-hop expansion (described by the original
// author as PathRAG / LEGO-GraphRAG style). The effective alpha of an edge is
// BASE_ALPHA plus the bonus of its type:
//   extends / implements -> 0.80
//   imports              -> 0.65
//   calls                -> 0.60
//   uses                 -> 0.55
const BASE_ALPHA = 0.55;
const EDGE_ALPHA_BONUS = {
  extends: 0.25,
  implements: 0.25,
  imports: 0.10,
  calls: 0.05,
  uses: 0.00,
};

// Adaptive 2-hop candidates scoring below this value are discarded.
const FLOW_THRESHOLD = 0.05;

// Multiplicative boosts per structural entity type, used for reranking.
const TYPE_BOOST = {
  class: 1.3,
  function: 1.2,
  method: 1.2,
  interface: 1.3,
  struct: 1.2,
};
159
+
160
/**
 * Coerce a semantic blending weight into the closed interval [0, 1].
 *
 * @param {*} value - Requested weight.
 * @returns {number} 0.4 when `value` is not a finite number, otherwise
 *   `value` limited to the range 0..1.
 */
function clampSemanticWeight(value) {
  const FALLBACK_WEIGHT = 0.4;
  if (!Number.isFinite(value)) {
    return FALLBACK_WEIGHT;
  }
  const cappedAtOne = Math.min(1, value);
  return Math.max(0, cappedAtOne);
}
164
+
165
/**
 * Min-max normalize a list of numbers into the range [0, 1].
 *
 * The extremes are found with a single pass instead of
 * `Math.min(...values)` / `Math.max(...values)`: spreading a very large array
 * into a call's argument list throws a RangeError once the engine's argument
 * limit is exceeded.
 *
 * @param {number[]} values - Scores to normalize (expected to be finite).
 * @returns {number[]} New array of the same length. Empty input yields `[]`;
 *   when all values are equal every entry is 0.5.
 */
function normalizeMinMax(values) {
  if (values.length === 0) return [];

  let min = Infinity;
  let max = -Infinity;
  for (const v of values) {
    if (v < min) min = v;
    if (v > max) max = v;
  }

  // A flat distribution carries no ranking signal; use the midpoint.
  if (max === min) return values.map(() => 0.5);

  const range = max - min;
  return values.map((v) => (v - min) / range);
}
172
+
173
/**
 * Linear interpolation between a graph score and a similarity score.
 *
 * @param {number} graphScore - Structure-based score.
 * @param {number} cosineSim - Similarity-based score.
 * @param {number} weight - Share given to `cosineSim`; `graphScore` receives
 *   the remaining `1 - weight`.
 * @returns {number} The weighted sum of both scores.
 */
function blendScores(graphScore, cosineSim, weight) {
  const graphPart = (1 - weight) * graphScore;
  const semanticPart = weight * cosineSim;
  return graphPart + semanticPart;
}
176
+
177
/**
 * Expand search results by following relationship edges in the entity graph.
 *
 * Pipeline: collect seed entity IDs → 1-hop neighbours → optional 2nd hop
 * (plain or adaptive) → entity lookup (capped at `maxExpanded`) → decay-based
 * scoring relative to the best original score → rerank → token budgeting.
 *
 * @param {import('better-sqlite3').Database} db - The code graph database
 * @param {Array} results - Initial search results with entity IDs
 * @param {Object} [options]
 * @param {string} [options.expandMode='1hop'] - 'none' | '1hop' | '2hop'
 * @param {number} [options.maxExpanded=10] - Max expanded results to add
 *   (also passed as `maxHop2` to the adaptive second hop)
 * @param {number} [options.tokenBudget=8000] - Max total tokens in expanded set
 * @param {Set<string>} [options.edgeTypes] - Relationship types to follow
 * @param {boolean} [options.adaptiveHop2=false] - Use the adaptive 2nd hop
 * @param {number} [options.hop2TokenBudget=4000] - Token budget of the adaptive 2nd hop
 * @param {number} [options.expandedBudget] - Forwarded untouched to applyTokenBudget
 * @param {*} [options.queryInt8] - Query vector passed to `cosineSimilarity`
 * @param {Object} [options.hnswIndex] - Index forwarded to the hop and rerank helpers
 * @param {number} [options.semanticWeight=0.4] - Clamped to [0, 1] before use
 * @param {Function} [options.cosineSimilarity] - Similarity callback forwarded to the helpers
 * @param {Object} [options.codebaseDb] - Forwarded to applyTokenBudget
 * @param {Function} [options.readFileLines] - Forwarded to applyTokenBudget
 * @returns {Array} `results` itself when nothing was expanded; otherwise the
 *   budgeted array from applyTokenBudget with its stats attached as
 *   `_budgetStats`.
 */
export function expandResults(db, results, options = {}) {
  const {
    expandMode = '1hop',
    maxExpanded = 10,
    tokenBudget = 8000,
    edgeTypes = DEFAULT_EDGE_TYPES,
    adaptiveHop2 = false,
    hop2TokenBudget = 4000,
    expandedBudget,
    queryInt8 = null,
    hnswIndex = null,
    semanticWeight = 0.4,
    cosineSimilarity = null,
    codebaseDb = null,
    readFileLines = null,
  } = options;

  // Semantic settings shared by the second hop and the reranker.
  const semanticOptions = {
    queryInt8,
    hnswIndex,
    semanticWeight: clampSemanticWeight(semanticWeight),
    cosineSimilarity,
  };

  const nothingToExpand = expandMode === 'none' || results.length === 0;
  if (nothingToExpand) return results;

  // Collect entity IDs from results
  const seedIds = collectSeedIds(db, results);
  if (seedIds.size === 0) return results;

  // 1-hop expansion: find neighbors via forward + reverse edges
  const neighbors = expandOneHop(db, seedIds, edgeTypes);

  // 2-hop expansion (if requested); both variants mutate `neighbors` in place
  const wantsSecondHop = expandMode === '2hop' && neighbors.size > 0;
  if (wantsSecondHop && adaptiveHop2) {
    expandSecondHopAdaptive(db, seedIds, neighbors, edgeTypes, {
      maxHop2: maxExpanded,
      tokenBudget: hop2TokenBudget,
      ...semanticOptions,
    });
  } else if (wantsSecondHop) {
    expandSecondHop(db, seedIds, neighbors, edgeTypes, { ...semanticOptions });
  }

  if (neighbors.size === 0) return results;

  // Look up entity details for expanded IDs, respecting maxExpanded.
  // The cut follows the map's insertion order.
  const keptIds = Array.from(neighbors.keys()).slice(0, maxExpanded);
  const expandedResults = lookupEntities(db, keptIds, neighbors);

  // Score expanded results relative to original results; the reference score
  // is never below 1.
  const originalScores = results.map((r) => r.score || 0);
  const topOriginalScore = Math.max(1, ...originalScores);
  for (const item of expandedResults) {
    const meta = item.expansion;
    const hops = meta?.hops || 1;
    const defaultDecay = hops === 1 ? HOP_DECAY : HOP2_DECAY;
    item.score = topOriginalScore * (meta?.decay || defaultDecay);
  }

  // Rerank expanded results using composite scoring (file proximity + entity type + semantic)
  rerankExpanded(expandedResults, results, { ...semanticOptions });

  // Apply token budget
  const budgetOutcome = applyTokenBudget(
    [...results, ...expandedResults],
    tokenBudget,
    { expandedBudget, codebaseDb, readFileLines },
  );

  const budgeted = budgetOutcome.results;
  budgeted._budgetStats = budgetOutcome.stats;
  return budgeted;
}
268
+
269
/**
 * Collect entity IDs from search results.
 *
 * Each result contributes the first truthy value of `entity_id`,
 * `metadata.entity_id` or `id`. Only when no result yields any ID does the
 * function fall back to matching results against non-stale entities by file
 * path and start line.
 *
 * The fallback indexes entities by file path (only for files that appear in
 * `results`) instead of rescanning every entity row for every result, and
 * skips results that have no usable start line. As before, a result matches
 * the first entity, in query order, whose line range contains its start line.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {Array} results
 * @returns {Set<string>} Seed IDs; empty when nothing matched or the entity
 *   query failed (best-effort: the error is not propagated).
 */
function collectSeedIds(db, results) {
  const seedIds = new Set();

  for (const r of results) {
    const id = r.entity_id || r.metadata?.entity_id || r.id;
    if (id) seedIds.add(id);
  }

  if (seedIds.size > 0) return seedIds;

  // Fallback: match results to entities by file_path + line range.
  // Gather the (file, line) pairs worth looking up before touching the DB rows.
  const lookups = [];
  const wantedFiles = new Set();
  for (const r of results) {
    const filePath = r.file_path || r.file || r.metadata?.file || r.metadata?.path;
    const lineStart = r.start_line || r.startLine || r.metadata?.line_start || r.metadata?.startLine;
    if (!filePath || lineStart == null) continue;
    lookups.push({ filePath, lineStart });
    wantedFiles.add(filePath);
  }

  let entityRows;
  try {
    entityRows = db.prepare(`
      SELECT id, file_path, start_line, end_line
      FROM entities WHERE stale_since IS NULL
    `).all();
  } catch {
    return seedIds;
  }

  // file_path -> entities of that file, kept in query order so that the
  // "first match wins" rule is preserved.
  const entitiesByFile = new Map();
  for (const e of entityRows) {
    if (e.start_line == null || !wantedFiles.has(e.file_path)) continue;
    const bucket = entitiesByFile.get(e.file_path);
    if (bucket) {
      bucket.push(e);
    } else {
      entitiesByFile.set(e.file_path, [e]);
    }
  }

  for (const { filePath, lineStart } of lookups) {
    const match = entitiesByFile
      .get(filePath)
      ?.find((e) => e.start_line <= lineStart && e.end_line >= lineStart);
    if (match) seedIds.add(match.id);
  }

  return seedIds;
}
315
+
316
/**
 * Perform 1-hop graph expansion from seed entity IDs.
 *
 * Neighbours are gathered from outgoing edges (seed → neighbour, direction
 * 'forward') and incoming edges (neighbour → seed, direction 'reverse').
 * Edges whose type is not in `edgeTypes`, and neighbours that are themselves
 * seeds, are ignored. Each neighbour keeps the edge with the highest score,
 * where score = BASE_ALPHA + EDGE_ALPHA_BONUS[type]; on a tie the edge seen
 * first wins. A failing query contributes no edges (best-effort).
 *
 * @param {import('better-sqlite3').Database} db
 * @param {Set<string>} seedIds
 * @param {Set<string>} edgeTypes
 * @returns {Map<string, {via: string, direction: string, score: number, hops?: number}>}
 */
export function expandOneHop(db, seedIds, edgeTypes) {
  const seeds = [...seedIds];
  const marks = seeds.map(() => '?').join(',');

  // Run one relationship query bound to the seed IDs; errors yield no rows.
  const fetchEdges = (sql) => {
    try {
      return db.prepare(sql).all(...seeds);
    } catch {
      return [];
    }
  };

  // Forward edges: seed -> neighbor
  const outgoing = fetchEdges(`
    SELECT DISTINCT target_id, type FROM relationships
    WHERE source_id IN (${marks}) AND target_id IS NOT NULL
  `);

  // Reverse edges: neighbor -> seed
  const incoming = fetchEdges(`
    SELECT DISTINCT source_id, type FROM relationships
    WHERE target_id IN (${marks}) AND source_id IS NOT NULL
  `);

  const neighbors = new Map();

  const absorb = (rows, idField, direction) => {
    for (const row of rows) {
      const neighborId = row[idField];
      if (!edgeTypes.has(row.type) || seedIds.has(neighborId)) continue;

      const score = BASE_ALPHA + (EDGE_ALPHA_BONUS[row.type] || 0);
      const current = neighbors.get(neighborId);
      if (current && !(score > current.score)) continue;

      neighbors.set(neighborId, { via: row.type, direction, score });
    }
  };

  absorb(outgoing, 'target_id', 'forward');
  absorb(incoming, 'source_id', 'reverse');

  return neighbors;
}
370
+
371
/**
 * Perform 2nd-hop expansion from the 1-hop neighbors, following only forward
 * edges (hop-1 node → target). New targets are added to `expanded` with
 * `hops: 2`; seeds and existing hop-1 entries are never overwritten.
 *
 * Without semantic inputs, targets are added in query order. With semantic
 * inputs (`queryInt8`, `hnswIndex`, `cosineSimilarity` and a positive
 * `semanticWeight`), every candidate edge gets a graph score
 * (hop-1 score × edge priority × edge weight), the graph scores are min-max
 * normalized and blended with the query/target similarity, and targets are
 * inserted best-first.
 *
 * The edge query selects the `weight` column that the graph score reads; it
 * was previously omitted, so every edge was weighted 1.0. If the query with
 * `weight` fails, the original column list is retried and weights default
 * to 1.0 (NOTE(review): presumably only relevant for schemas without a
 * `weight` column — confirm against the schema definition).
 *
 * @param {import('better-sqlite3').Database} db
 * @param {Set<string>} seedIds - Original seeds
 * @param {Map<string, Object>} expanded - 1-hop expansion map (mutated in place)
 * @param {Set<string>} edgeTypes
 * @param {Object} [options]
 * @param {*} [options.queryInt8] - Query vector passed to `cosineSimilarity`
 * @param {Object} [options.hnswIndex] - Must expose `getInt8Vector(id)`
 * @param {number} [options.semanticWeight=0.4] - Share of the similarity in the blend
 * @param {Function} [options.cosineSimilarity] - (queryInt8, entityInt8) => number
 * @returns {void}
 */
export function expandSecondHop(db, seedIds, expanded, edgeTypes, options = {}) {
  const {
    queryInt8 = null,
    hnswIndex = null,
    semanticWeight = 0.4,
    cosineSimilarity = null,
  } = options;
  const semanticEnabled = !!(queryInt8 && hnswIndex && cosineSimilarity && semanticWeight > 0);

  const hop1Ids = [...expanded.keys()];
  if (hop1Ids.length === 0) return;

  const ph = hop1Ids.map(() => '?').join(',');
  const selectHop2Edges = (columns) => db.prepare(`
    SELECT ${columns} FROM relationships
    WHERE source_id IN (${ph}) AND target_id IS NOT NULL
  `).all(...hop1Ids);

  let hop2Forward;
  try {
    hop2Forward = selectHop2Edges('source_id, target_id, type, weight');
  } catch {
    try {
      hop2Forward = selectHop2Edges('source_id, target_id, type');
    } catch {
      return;
    }
  }

  if (!semanticEnabled) {
    for (const rel of hop2Forward) {
      if (edgeTypes.has(rel.type) && !seedIds.has(rel.target_id) && !expanded.has(rel.target_id)) {
        expanded.set(rel.target_id, { via: rel.type, direction: 'forward', hops: 2 });
      }
    }
    return;
  }

  const excluded = new Set([...seedIds, ...expanded.keys()]);
  // A target reachable from several hop-1 nodes needs its vector only once.
  const vectorCache = new Map();
  const candidates = [];
  for (const rel of hop2Forward) {
    if (!edgeTypes.has(rel.type) || excluded.has(rel.target_id)) continue;

    const hop1Entry = expanded.get(rel.source_id);
    const hop1Score = hop1Entry?.score ?? 1; // identity: preserves old edgePriority × weight
    const graphScore = hop1Score * (EDGE_PRIORITY[rel.type] || 1) * (rel.weight || 1.0);

    if (!vectorCache.has(rel.target_id)) {
      vectorCache.set(rel.target_id, hnswIndex.getInt8Vector(rel.target_id));
    }
    const entityInt8 = vectorCache.get(rel.target_id);

    let normSim = null;
    if (entityInt8) {
      const cosSim = cosineSimilarity(queryInt8, entityInt8);
      normSim = (cosSim + 1) / 2; // map a [-1, 1] similarity onto [0, 1]
    }
    candidates.push({ rel, graphScore, normSim });
  }

  if (candidates.length === 0) return;
  const normalizedGraphScores = normalizeMinMax(candidates.map((c) => c.graphScore));

  // Several hop-1 nodes may reach the same target; keep its best-scoring edge.
  const bestByTarget = new Map();
  for (let i = 0; i < candidates.length; i++) {
    const c = candidates[i];
    const normGraph = normalizedGraphScores[i];
    const score = c.normSim != null
      ? blendScores(normGraph, c.normSim, semanticWeight)
      : normGraph;

    const prev = bestByTarget.get(c.rel.target_id);
    if (!prev || score > prev.score) {
      bestByTarget.set(c.rel.target_id, { rel: c.rel, score });
    }
  }

  // Insert best-first: the map's insertion order is what callers see when
  // they truncate the expansion.
  const ranked = [...bestByTarget.values()].sort((a, b) => b.score - a.score);
  for (const c of ranked) {
    expanded.set(c.rel.target_id, {
      via: c.rel.type,
      direction: 'forward',
      hops: 2,
    });
  }
}
456
+
457
/**
 * Perform adaptive 2nd-hop expansion with per-edge-type alpha decay,
 * degree normalization, and score-threshold pruning (described by the
 * original author as PathRAG-style).
 *
 * For every forward edge hop-1 node → target (targets must be non-stale
 * entities and neither seeds nor hop-1 nodes):
 *   graphScore = hop1Score × effectiveAlpha × weight × edgePriority / sqrt(outDegree)
 * where outDegree counts the hop-1 node's edges of the allowed types. With
 * semantic inputs the graph scores are min-max normalized and blended with
 * the query/target similarity. Candidates below FLOW_THRESHOLD are dropped,
 * the best edge per target is kept, and targets are selected greedily
 * (best first) until `maxHop2` or `tokenBudget` is reached; the best
 * candidate is always admitted, even if it alone exceeds the budget.
 *
 * The allowed edge types are bound as SQL parameters rather than being
 * interpolated into the statement as quoted literals, so a type name that
 * contains a quote can neither break nor alter the query.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {Set<string>} seedIds - Original seed entity IDs
 * @param {Map<string, Object>} hop1Expanded - 1-hop expansion map (mutated in place)
 * @param {Set<string>} edgeTypes - Allowed edge types
 * @param {Object} [options]
 * @param {number} [options.maxHop2=5] - Max 2-hop entities to add
 * @param {number} [options.tokenBudget=4000] - Token budget for 2-hop expansion
 * @param {*} [options.queryInt8] - Query vector passed to `cosineSimilarity`
 * @param {Object} [options.hnswIndex] - Must expose `getInt8Vector(id)`
 * @param {number} [options.semanticWeight=0.4] - Share of the similarity in the blend
 * @param {Function} [options.cosineSimilarity] - (queryInt8, entityInt8) => number
 * @returns {{ added: number, budgetUsed: number, candidates: number }}
 *   `candidates` is the number of distinct targets that survived filtering.
 */
export function expandSecondHopAdaptive(db, seedIds, hop1Expanded, edgeTypes, options = {}) {
  const {
    maxHop2 = 5,
    tokenBudget = 4000,
    queryInt8 = null,
    hnswIndex = null,
    semanticWeight = 0.4,
    cosineSimilarity = null,
  } = options;
  const semanticEnabled = !!(queryInt8 && hnswIndex && cosineSimilarity && semanticWeight > 0);

  const hop1Ids = [...hop1Expanded.keys()];
  if (hop1Ids.length === 0) return { added: 0, budgetUsed: 0, candidates: 0 };

  const ph = hop1Ids.map(() => '?').join(',');

  // Query out-degrees for hop-1 nodes, filtered to active edge types only.
  // Counting all edge types would over-penalize nodes with many irrelevant edges.
  // Positional parameters bind in order of appearance: IDs first, then types.
  const typeArray = [...edgeTypes];
  const typePh = typeArray.map(() => '?').join(',');
  let degreeMap;
  try {
    const degRows = db.prepare(`
      SELECT source_id, COUNT(*) as deg FROM relationships
      WHERE source_id IN (${ph}) AND type IN (${typePh})
      GROUP BY source_id
    `).all(...hop1Ids, ...typeArray);
    degreeMap = new Map(degRows.map((r) => [r.source_id, r.deg]));
  } catch {
    // Best-effort: without degree data every node is treated as degree 1.
    degreeMap = new Map();
  }

  // Query candidate 2-hop targets with source, weights, and line ranges
  let rawCandidates;
  try {
    rawCandidates = db.prepare(`
      SELECT r.source_id, r.target_id, r.type, r.weight, e.file_path, e.start_line, e.end_line
      FROM relationships r
      JOIN entities e ON e.id = r.target_id AND e.stale_since IS NULL
      WHERE r.source_id IN (${ph}) AND r.target_id IS NOT NULL
    `).all(...hop1Ids);
  } catch {
    return { added: 0, budgetUsed: 0, candidates: 0 };
  }

  // Filter by edge types and score all paths.
  const excluded = new Set([...seedIds, ...hop1Expanded.keys()]);
  const vectorCache = semanticEnabled ? new Map() : null;
  const scoredCandidates = [];

  for (const c of rawCandidates) {
    if (!edgeTypes.has(c.type) || excluded.has(c.target_id)) continue;

    const effectiveAlpha = BASE_ALPHA + (EDGE_ALPHA_BONUS[c.type] || 0);
    const edgePriority = EDGE_PRIORITY[c.type] || 1;
    const weight = c.weight || 1.0;
    const outDegree = degreeMap.get(c.source_id) || 1;
    const hop1Entry = hop1Expanded.get(c.source_id);
    const hop1Score = hop1Entry?.score ?? effectiveAlpha;
    const graphScore = (hop1Score * effectiveAlpha * weight * edgePriority) / Math.sqrt(outDegree);

    let normSim = null;
    if (semanticEnabled) {
      if (!vectorCache.has(c.target_id)) {
        vectorCache.set(c.target_id, hnswIndex.getInt8Vector(c.target_id));
      }
      const entityInt8 = vectorCache.get(c.target_id);
      if (entityInt8) {
        const cosSim = cosineSimilarity(queryInt8, entityInt8);
        normSim = (cosSim + 1) / 2; // map a [-1, 1] similarity onto [0, 1]
      }
    }

    const estimatedTokens = fallbackTokenEstimate({
      file_path: c.file_path,
      start_line: c.start_line,
      end_line: c.end_line,
    });

    scoredCandidates.push({
      target_id: c.target_id,
      source_id: c.source_id,
      type: c.type,
      graphScore,
      normSim,
      estimatedTokens,
      effectiveAlpha,
      outDegree,
    });
  }

  if (scoredCandidates.length === 0) return { added: 0, budgetUsed: 0, candidates: 0 };

  // Normalized graph scores are only needed for blending with similarity.
  const normalizedGraphScores = semanticEnabled
    ? normalizeMinMax(scoredCandidates.map((c) => c.graphScore))
    : [];

  // Multiple hop-1 sources may reach the same target — keep the highest score.
  const bestByTarget = new Map(); // target_id -> best scored entry
  for (let i = 0; i < scoredCandidates.length; i++) {
    const c = scoredCandidates[i];
    let score = c.graphScore;

    if (semanticEnabled) {
      const normGraph = normalizedGraphScores[i];
      score = c.normSim != null
        ? blendScores(normGraph, c.normSim, semanticWeight)
        : normGraph;
    }

    // Drop low-scoring paths early
    if (score < FLOW_THRESHOLD) continue;

    const prev = bestByTarget.get(c.target_id);
    if (prev && prev.score >= score) continue;
    bestByTarget.set(c.target_id, { ...c, score });
  }

  // Sort by composite score descending
  const scored = [...bestByTarget.values()].sort((a, b) => b.score - a.score);

  // Greedily select candidates within token budget and maxHop2 limit
  let budgetUsed = 0;
  let count = 0;

  for (const s of scored) {
    if (count >= maxHop2) break;
    // `count > 0` lets the top candidate through even when it exceeds the budget.
    if (budgetUsed + s.estimatedTokens > tokenBudget && count > 0) break;

    const decay = s.effectiveAlpha * s.effectiveAlpha;

    hop1Expanded.set(s.target_id, {
      via: s.type,
      direction: 'forward',
      hops: 2,
      adaptiveScore: s.score,
      decay,
      sourceOutDegree: s.outDegree,
    });

    budgetUsed += s.estimatedTokens;
    count++;
  }

  return { added: count, budgetUsed, candidates: scored.length };
}
621
+
622
/**
 * Look up entity details for expanded IDs and shape them like search results.
 *
 * Only non-stale entities are returned, in whatever order the query yields
 * them. Each result carries both snake_case and camelCase location fields,
 * its expansion metadata under `expansion`, a score of 0 (to be assigned by
 * the caller) and `is_expanded: true`. A failing query yields an empty list.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string[]} expandedIds
 * @param {Map<string, Object>} expansionMeta
 * @returns {Array}
 */
function lookupEntities(db, expandedIds, expansionMeta) {
  if (expandedIds.length === 0) return [];

  const marks = Array.from(expandedIds, () => '?').join(',');

  let rows;
  try {
    const statement = db.prepare(`
      SELECT id, file_path, type, name, signature, start_line, end_line
      FROM entities WHERE id IN (${marks}) AND stale_since IS NULL
    `);
    rows = statement.all(...expandedIds);
  } catch {
    return [];
  }

  const found = [];
  for (const row of rows) {
    const { id, file_path: filePath, start_line: startLine, end_line: endLine } = row;
    found.push({
      entity_id: id,
      id,
      file_path: filePath,
      file: filePath,
      name: row.name,
      type: row.type,
      signature: row.signature,
      startLine,
      endLine,
      start_line: startLine,
      end_line: endLine,
      expansion: expansionMeta.get(id),
      score: 0,
      is_expanded: true,
    });
  }
  return found;
}
661
+
662
/**
 * Rerank expanded results using a composite score that combines each
 * result's existing score with file proximity, entity-type relevance and,
 * optionally, semantic similarity to the query.
 *
 * Score factors:
 *   1. Existing score (`er.score`, already assigned by the caller; missing/falsy counts as 0)
 *   2. File proximity: entities in the same file as a seed result get a 1.5x boost
 *   3. Entity type relevance: types listed in `TYPE_BOOST` are multiplied by
 *      that table's factor
 *   4. Semantic blend (only when `queryInt8`, `hnswIndex`, `cosineSimilarity`
 *      are all provided and the clamped weight is > 0): the boosted scores are
 *      passed through `normalizeMinMax`, and entities that have a stored vector
 *      are blended with their query similarity via `blendScores`. Entities
 *      without a vector keep the normalized graph score.
 *
 * Mutates `expandedResults` in place: updates `.score` and re-sorts descending.
 *
 * @param {Array} expandedResults - Expanded results with scores already assigned
 * @param {Array} seedResults - Original seed results (used to determine file proximity)
 * @param {Object} [options]
 * @param {*} [options.queryInt8=null] - Query vector handed to `cosineSimilarity`
 * @param {Object} [options.hnswIndex=null] - Index exposing `getInt8Vector(entityId)`
 * @param {number} [options.semanticWeight=0.4] - Blend weight, passed through `clampSemanticWeight`
 * @param {Function} [options.cosineSimilarity=null] - `(queryVec, entityVec) => number`
 * @returns {Array} The same array, sorted by reranked score descending
 */
export function rerankExpanded(expandedResults, seedResults, options = {}) {
  const {
    queryInt8 = null,
    hnswIndex = null,
    semanticWeight = 0.4,
    cosineSimilarity = null,
  } = options;
  const clampedSemanticWeight = clampSemanticWeight(semanticWeight);
  // Semantic blending needs all three collaborators and a positive weight.
  const semanticEnabled = !!(queryInt8 && hnswIndex && cosineSimilarity && clampedSemanticWeight > 0);

  if (expandedResults.length === 0) return expandedResults;

  const seedFiles = new Set(
    seedResults.map(r => r.file_path || r.file || r.metadata?.path).filter(Boolean)
  );

  const baseScores = [];

  for (const er of expandedResults) {
    let rerankScore = er.score || 0;

    // File proximity boost: entities in same file as seeds are more relevant
    const erFile = er.file_path || er.file || er.metadata?.path;
    if (erFile && seedFiles.has(erFile)) {
      rerankScore *= 1.5;
    }

    // Entity type relevance. Only a real, non-zero numeric factor is applied:
    // a bare truthiness check would also match keys inherited from
    // Object.prototype (e.g. an entity of type "constructor" or "toString"),
    // multiplying the score by a function and turning it into NaN.
    const entType = er.type || er.metadata?.chunk_type;
    const typeBoost = TYPE_BOOST[entType];
    if (typeof typeBoost === 'number' && typeBoost) {
      rerankScore *= typeBoost;
    }

    baseScores.push(rerankScore);
  }

  if (!semanticEnabled) {
    for (let i = 0; i < expandedResults.length; i++) {
      expandedResults[i].score = baseScores[i];
    }
  } else {
    const normalizedGraphScores = normalizeMinMax(baseScores);
    for (let i = 0; i < expandedResults.length; i++) {
      const er = expandedResults[i];
      const normGraph = normalizedGraphScores[i];
      let rerankScore = normGraph;
      const entityId = er.entity_id || er.id;
      const entityInt8 = hnswIndex.getInt8Vector(entityId);
      if (entityInt8) {
        const cosSim = cosineSimilarity(queryInt8, entityInt8);
        // Shift the similarity by +1 and halve it (maps [-1, 1] onto [0, 1],
        // assuming cosineSimilarity returns a standard cosine value).
        const normSim = (cosSim + 1) / 2;
        rerankScore = blendScores(normGraph, normSim, clampedSemanticWeight);
      }
      er.score = rerankScore;
    }
  }

  // Re-sort by reranked score descending
  expandedResults.sort((a, b) => (b.score || 0) - (a.score || 0));
  return expandedResults;
}
739
+
740
/**
 * Trim a result list so that it fits within a token budget.
 *
 * Results are visited in order. A per-result token cost comes from
 * `computeTokenEstimates` when `codebaseDb` or `readFileLines` is supplied
 * (and the estimate is positive); otherwise `fallbackTokenEstimate` is used.
 *
 * Two limits apply:
 *   - Total budget: the first result that would overflow it stops the scan,
 *     except that the very first kept result is always admitted.
 *   - Expanded budget (optional): an expanded result that would overflow it
 *     is skipped, and scanning continues with the next result.
 *
 * @param {Array} results
 * @param {number} budget - Total token budget
 * @param {Object} [options]
 * @param {number} [options.expandedBudget] - Separate budget for expanded results
 * @param {import('better-sqlite3').Database} [options.codebaseDb] - Forwarded to computeTokenEstimates
 * @param {Function} [options.readFileLines] - Forwarded to computeTokenEstimates
 * @returns {{ results: Array, stats: { original: Object, hop1: Object, hop2: Object, total: Object } }}
 */
export function applyTokenBudget(results, budget, options = {}) {
  const { expandedBudget, codebaseDb, readFileLines } = options;

  // Precise estimates are only computed when there is a source to read text from.
  const hasTextSource = Boolean(codebaseDb || readFileLines);
  const preciseEstimates = hasTextSource
    ? computeTokenEstimates(results, { codebaseDb, readFileLines })
    : new Map();

  const kept = [];
  const stats = {
    original: { count: 0, tokens: 0 },
    hop1: { count: 0, tokens: 0 },
    hop2: { count: 0, tokens: 0 },
    total: { count: 0, tokens: 0 },
  };
  let spentTotal = 0;
  let spentExpanded = 0;

  for (const [index, item] of results.entries()) {
    // Prefer the precise estimate; fall back to the heuristic when it is absent or non-positive.
    const precise = preciseEstimates.get(index);
    const cost = (precise == null || !(precise > 0)) ? fallbackTokenEstimate(item) : precise;
    const expanded = Boolean(item.is_expanded);
    const hopCount = item.expansion?.hops || (expanded ? 1 : 0);

    // Total budget exhausted: stop, unless nothing has been kept yet.
    const overTotal = spentTotal + cost > budget;
    if (overTotal && kept.length > 0) break;

    // Expanded budget exhausted: drop this one but keep scanning.
    const overExpanded = expanded && expandedBudget != null && spentExpanded + cost > expandedBudget;
    if (overExpanded) continue;

    kept.push(item);
    spentTotal += cost;
    if (expanded) spentExpanded += cost;

    // Attribute the cost to the matching category.
    let bucket;
    if (!expanded) {
      bucket = stats.original;
    } else if (hopCount === 2) {
      bucket = stats.hop2;
    } else {
      bucket = stats.hop1;
    }
    bucket.count++;
    bucket.tokens += cost;
  }

  stats.total = { count: kept.length, tokens: spentTotal };
  return { results: kept, stats };
}
807
+
808
/**
 * Count relationships touching a set of entities, grouped by relationship type.
 *
 * A relationship is counted when either its `source_id` or its `target_id`
 * is in `entityIds` and both endpoints are non-NULL. Any error thrown while
 * preparing or running the query is swallowed and reported as zero counts.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string[]} entityIds - Missing or empty input short-circuits to zero counts
 * @returns {{ total: number, byType: Record<string, number> }}
 */
export function getExpansionStats(db, entityIds) {
  const hasIds = Boolean(entityIds) && entityIds.length !== 0;
  if (!hasIds) return { total: 0, byType: {} };

  const placeholders = entityIds.map(() => '?').join(',');
  const sql = `
    SELECT type, COUNT(*) as count FROM relationships
    WHERE (source_id IN (${placeholders}) OR target_id IN (${placeholders}))
    AND source_id IS NOT NULL AND target_id IS NOT NULL
    GROUP BY type
  `;

  let rows;
  try {
    // The ID list is bound twice: once for each IN (...) clause.
    rows = db.prepare(sql).all(...entityIds, ...entityIds);
  } catch {
    return { total: 0, byType: {} };
  }

  const byType = {};
  let total = 0;
  for (const { type, count } of rows) {
    byType[type] = count;
    total += count;
  }

  return { total, byType };
}