sweet-search 0.0.1 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +190 -0
- package/NOTICE +23 -0
- package/core/cli.js +51 -0
- package/core/config.js +27 -0
- package/core/embedding/embedding-cache.js +467 -0
- package/core/embedding/embedding-local-model.js +845 -0
- package/core/embedding/embedding-remote.js +492 -0
- package/core/embedding/embedding-service.js +712 -0
- package/core/embedding/embedding-telemetry.js +219 -0
- package/core/embedding/index.js +40 -0
- package/core/graph/community-detector.js +294 -0
- package/core/graph/graph-expansion.js +839 -0
- package/core/graph/graph-extractor.js +2304 -0
- package/core/graph/graph-search.js +2148 -0
- package/core/graph/hcgs-generator.js +666 -0
- package/core/graph/index.js +16 -0
- package/core/graph/leiden-algorithm.js +547 -0
- package/core/graph/relationship-resolver.js +366 -0
- package/core/graph/repo-map.js +408 -0
- package/core/graph/summary-manager.js +549 -0
- package/core/indexing/artifact-builder.js +1054 -0
- package/core/indexing/ast-chunker.js +709 -0
- package/core/indexing/chunking/chunk-builder.js +170 -0
- package/core/indexing/chunking/markdown-chunker.js +503 -0
- package/core/indexing/chunking/plaintext-chunker.js +104 -0
- package/core/indexing/dedup/dedup-phase.js +159 -0
- package/core/indexing/dedup/exemplar-selector.js +65 -0
- package/core/indexing/document-chunker.js +56 -0
- package/core/indexing/incremental-parser.js +390 -0
- package/core/indexing/incremental-tracker.js +761 -0
- package/core/indexing/index-codebase-v21.js +472 -0
- package/core/indexing/index-maintainer.mjs +1674 -0
- package/core/indexing/index.js +90 -0
- package/core/indexing/indexer-ann.js +1077 -0
- package/core/indexing/indexer-build.js +742 -0
- package/core/indexing/indexer-phases.js +800 -0
- package/core/indexing/indexer-pool.js +764 -0
- package/core/indexing/indexer-sparse-gram.js +98 -0
- package/core/indexing/indexer-utils.js +536 -0
- package/core/indexing/indexer-worker.js +148 -0
- package/core/indexing/li-skip-policy.js +225 -0
- package/core/indexing/merkle-tracker.js +244 -0
- package/core/indexing/model-pool.js +166 -0
- package/core/infrastructure/code-graph-repository.js +120 -0
- package/core/infrastructure/codebase-repository.js +131 -0
- package/core/infrastructure/config/dedup.js +54 -0
- package/core/infrastructure/config/embedding.js +298 -0
- package/core/infrastructure/config/graph.js +80 -0
- package/core/infrastructure/config/index.js +82 -0
- package/core/infrastructure/config/indexing.js +8 -0
- package/core/infrastructure/config/platform.js +254 -0
- package/core/infrastructure/config/ranking.js +221 -0
- package/core/infrastructure/config/search.js +396 -0
- package/core/infrastructure/config/translation.js +89 -0
- package/core/infrastructure/config/vector-store.js +114 -0
- package/core/infrastructure/constants.js +86 -0
- package/core/infrastructure/coreml-cascade.js +909 -0
- package/core/infrastructure/coreml-cascade.json +46 -0
- package/core/infrastructure/coreml-provider.js +81 -0
- package/core/infrastructure/db-utils.js +69 -0
- package/core/infrastructure/dedup-hashing.js +83 -0
- package/core/infrastructure/hardware-capability.js +332 -0
- package/core/infrastructure/index.js +104 -0
- package/core/infrastructure/language-patterns/maps.js +121 -0
- package/core/infrastructure/language-patterns/registry-core.js +323 -0
- package/core/infrastructure/language-patterns/registry-data-query.js +155 -0
- package/core/infrastructure/language-patterns/registry-object-oriented.js +285 -0
- package/core/infrastructure/language-patterns/registry-tooling.js +240 -0
- package/core/infrastructure/language-patterns/registry-web-style.js +143 -0
- package/core/infrastructure/language-patterns/registry.js +19 -0
- package/core/infrastructure/language-patterns.js +141 -0
- package/core/infrastructure/llm-provider.js +733 -0
- package/core/infrastructure/manifest.json +46 -0
- package/core/infrastructure/maxsim.wasm +0 -0
- package/core/infrastructure/model-fetcher.js +423 -0
- package/core/infrastructure/model-registry.js +214 -0
- package/core/infrastructure/native-inference.js +587 -0
- package/core/infrastructure/native-resolver.js +187 -0
- package/core/infrastructure/native-sparse-gram.js +257 -0
- package/core/infrastructure/native-tokenizer.js +160 -0
- package/core/infrastructure/onnx-mutex.js +45 -0
- package/core/infrastructure/onnx-session-utils.js +261 -0
- package/core/infrastructure/ort-pipeline.js +111 -0
- package/core/infrastructure/project-detector.js +102 -0
- package/core/infrastructure/quantization.js +410 -0
- package/core/infrastructure/simd-distance.js +502 -0
- package/core/infrastructure/simd-distance.wasm +0 -0
- package/core/infrastructure/tree-sitter-provider.js +665 -0
- package/core/infrastructure/webgpu-maxsim.js +222 -0
- package/core/query/index.js +35 -0
- package/core/query/intent-detector.js +201 -0
- package/core/query/intent-router.js +156 -0
- package/core/query/query-router-catboost.js +222 -0
- package/core/query/query-router-ml.js +266 -0
- package/core/query/query-router.js +213 -0
- package/core/ranking/cascaded-scorer.js +379 -0
- package/core/ranking/flashrank.js +810 -0
- package/core/ranking/index.js +49 -0
- package/core/ranking/late-interaction-index.js +2383 -0
- package/core/ranking/late-interaction-model.js +812 -0
- package/core/ranking/local-reranker.js +374 -0
- package/core/ranking/mmr.js +379 -0
- package/core/ranking/quality-scorer.js +363 -0
- package/core/search/context-expander.js +1167 -0
- package/core/search/dedup/sibling-expander.js +327 -0
- package/core/search/index.js +16 -0
- package/core/search/search-boost.js +259 -0
- package/core/search/search-cli.js +544 -0
- package/core/search/search-format.js +282 -0
- package/core/search/search-fusion.js +327 -0
- package/core/search/search-hybrid.js +204 -0
- package/core/search/search-pattern-chunks.js +337 -0
- package/core/search/search-pattern-planner.js +439 -0
- package/core/search/search-pattern-prefilter.js +412 -0
- package/core/search/search-pattern-ripgrep.js +663 -0
- package/core/search/search-pattern.js +463 -0
- package/core/search/search-postprocess.js +452 -0
- package/core/search/search-semantic.js +706 -0
- package/core/search/search-server.js +554 -0
- package/core/search/session-daemon-prewarm.mjs +164 -0
- package/core/search/session-warmup.js +595 -0
- package/core/search/sweet-search.js +632 -0
- package/core/search/warmup-metrics.js +532 -0
- package/core/start-server.js +6 -0
- package/core/training/query-router/features/extractor.js +762 -0
- package/core/training/query-router/features/multilingual-patterns.js +431 -0
- package/core/training/query-router/features/text-segmenter.js +303 -0
- package/core/training/query-router/features/unicode-utils.js +383 -0
- package/core/training/query-router/output/v45_router_d4.js +11521 -0
- package/core/training/query-router/output/v46_router_d4.js +11498 -0
- package/core/vector-store/binary-heap.js +227 -0
- package/core/vector-store/binary-hnsw-index.js +1004 -0
- package/core/vector-store/float-vector-store.js +234 -0
- package/core/vector-store/hnsw-index.js +580 -0
- package/core/vector-store/index.js +39 -0
- package/core/vector-store/seismic-index.js +498 -0
- package/core/vocabulary/index.js +84 -0
- package/core/vocabulary/vocab-constants.js +20 -0
- package/core/vocabulary/vocab-miner-extractors.js +375 -0
- package/core/vocabulary/vocab-miner-nl.js +404 -0
- package/core/vocabulary/vocab-miner-utils.js +146 -0
- package/core/vocabulary/vocab-miner.js +574 -0
- package/core/vocabulary/vocab-prewarm-cli.js +110 -0
- package/core/vocabulary/vocab-ranker.js +492 -0
- package/core/vocabulary/vocab-warmer.js +523 -0
- package/core/vocabulary/vocab-warmup-orchestrator.js +425 -0
- package/core/vocabulary/vocabulary-utils.js +704 -0
- package/crates/wasm-router/pkg/package.json +13 -0
- package/crates/wasm-router/pkg/query_router_wasm.d.ts +36 -0
- package/crates/wasm-router/pkg/query_router_wasm.js +271 -0
- package/crates/wasm-router/pkg/query_router_wasm_bg.wasm +0 -0
- package/crates/wasm-router/pkg/query_router_wasm_bg.wasm.d.ts +19 -0
- package/mcp/config-gen.js +121 -0
- package/mcp/server.js +335 -0
- package/mcp/tool-handlers.js +476 -0
- package/package.json +131 -9
- package/scripts/benchmark-harness.js +794 -0
- package/scripts/init.js +1058 -0
- package/scripts/smoke-test.js +435 -0
- package/scripts/uninstall.js +478 -0
- package/scripts/verify-runtime.js +176 -0
|
@@ -0,0 +1,839 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Graph Expansion Module
|
|
3
|
+
*
|
|
4
|
+
* Expands search results by following relationship edges in the entity graph.
|
|
5
|
+
* Given top-k search results, performs 1-hop (or 2-hop) traversal to include
|
|
6
|
+
* related code chunks (imports, extends, implements, uses, calls).
|
|
7
|
+
*
|
|
8
|
+
* This helps with multi-hop coding questions where the answer spans
|
|
9
|
+
* multiple related entities.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
// Relationship kinds that are followed during expansion when the caller
// does not supply its own `edgeTypes` set.
const DEFAULT_EDGE_TYPES = new Set([
  'imports',
  'extends',
  'implements',
  'uses',
  'calls',
]);
|
|
14
|
+
|
|
15
|
+
// --- Token Estimation Helpers ---
|
|
16
|
+
|
|
17
|
+
// Average number of tokens per source line, keyed by language
// (figures taken from a CodeSearchNet analysis).
const TOKENS_PER_LINE = {
  java: 15,
  kotlin: 14,
  swift: 13,
  go: 12,
  c: 12,
  cpp: 12,
  php: 11,
  javascript: 10,
  typescript: 10,
  jsx: 10,
  tsx: 10,
  ruby: 9,
  python: 8,
};

// File extension -> language key used in TOKENS_PER_LINE. Only extensions
// whose text differs from the language key need an entry here.
const EXT_TO_LANG = {
  js: 'javascript',
  ts: 'typescript',
  py: 'python',
  rb: 'ruby',
  kt: 'kotlin',
  cc: 'cpp',
  cxx: 'cpp',
  h: 'c',
  hpp: 'cpp',
  m: 'c',
};
|
|
30
|
+
|
|
31
|
+
/**
 * Approximate the number of tokens in a text by counting its
 * whitespace-delimited words.
 * The original author quotes ±10-15% of real BPE counts and <0.1ms for
 * typical chunks.
 *
 * @param {string} text - Text to measure; any falsy value counts as 0 tokens.
 * @returns {number} Number of runs of non-whitespace characters in `text`.
 */
function estimateTokenCount(text) {
  if (!text) {
    return 0;
  }
  const words = text.match(/\S+/g);
  // `match` yields null (not an empty array) for whitespace-only input.
  return words === null ? 0 : words.length;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Estimate a result's token count from its line span when its text is not
 * available, using a per-language tokens-per-line multiplier.
 *
 * The language comes from `result.metadata.language` when set, otherwise from
 * the extension of the first available path field (`file_path`, `file`,
 * `metadata.file`, `metadata.path`). Languages without an entry in
 * TOKENS_PER_LINE use 10 tokens per line.
 *
 * @param {Object} result - Search result or entity-like record.
 * @returns {number} Line count (at least 1) multiplied by the per-line average.
 */
function fallbackTokenEstimate(result) {
  const meta = result.metadata;
  const sourcePath = result.file_path || result.file || meta?.file || meta?.path || '';
  const pathParts = sourcePath.split('.');
  const ext = pathParts[pathParts.length - 1]?.toLowerCase();

  const lang = meta?.language || EXT_TO_LANG[ext] || ext;
  const tokensPerLine = TOKENS_PER_LINE[lang] || 10;

  const firstLine = result.start_line || result.startLine || 0;
  const lastLine = result.end_line || result.endLine || firstLine;
  const lineCount = lastLine - firstLine + 1;

  // A reversed or missing range still counts as a single line.
  return Math.max(1, lineCount) * tokensPerLine;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Batch-load chunk texts from the codebase.db `vectors` table.
 * Accepts either a CodebaseRepository or a raw better-sqlite3 Database (legacy).
 *
 * On the legacy path the ids are queried in batches so that a large id list
 * cannot exceed SQLite's bound-parameter limit. Previously such a list made
 * the single statement throw, and the error was swallowed into an empty Map.
 *
 * @param {import('../infrastructure/codebase-repository.js').CodebaseRepository|import('better-sqlite3').Database} codebaseDbOrRepo
 * @param {string[]} ids - Vector IDs to look up
 * @returns {Map<string, string>} id → text. Empty when there is no database,
 *   no ids, or the legacy query fails (best-effort lookup).
 */
export function loadChunkTexts(codebaseDbOrRepo, ids) {
  if (!codebaseDbOrRepo || !ids || ids.length === 0) return new Map();

  // Repository path (preferred)
  if (typeof codebaseDbOrRepo.getChunkTexts === 'function') {
    return codebaseDbOrRepo.getChunkTexts(ids);
  }

  // Legacy raw-DB path (backward compat).
  // 900 stays below the historical SQLite default of 999 bound parameters.
  const MAX_IDS_PER_QUERY = 900;
  const texts = new Map();
  try {
    for (let offset = 0; offset < ids.length; offset += MAX_IDS_PER_QUERY) {
      const batch = ids.slice(offset, offset + MAX_IDS_PER_QUERY);
      const ph = batch.map(() => '?').join(',');
      const rows = codebaseDbOrRepo.prepare(
        `SELECT id, text FROM vectors WHERE id IN (${ph})`
      ).all(...batch);
      for (const row of rows) {
        texts.set(row.id, row.text);
      }
    }
  } catch {
    // Deliberately best-effort: callers fall back to other estimates.
    return new Map();
  }
  return texts;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Compute text-based token estimates for a mixed list of results.
 *
 * Non-expanded results that carry an `id` are resolved in one batch through
 * loadChunkTexts(codebaseDb, ids). Expanded results (`is_expanded`) are read
 * through the injected `readFileLines`, which keeps this module import-free.
 * Results whose text cannot be obtained get no entry in the returned map.
 *
 * @param {Array} results
 * @param {Object} options
 * @param {import('better-sqlite3').Database} [options.codebaseDb]
 * @param {Function} [options.readFileLines] - (filePath, startLine, endLine) => string|null
 * @returns {Map<number, number>} index in `results` → token count
 */
export function computeTokenEstimates(results, options = {}) {
  const { codebaseDb, readFileLines } = options;
  const estimates = new Map();
  // Original (non-expanded) results waiting for the batched text lookup.
  const pendingOriginals = [];

  for (const [index, result] of results.entries()) {
    if (!result.is_expanded) {
      if (result.id) {
        pendingOriginals.push({ index, id: result.id });
      }
      continue;
    }
    if (!readFileLines) continue;

    const filePath = result.file_path || result.file;
    const startLine = result.start_line || result.startLine;
    const endLine = result.end_line || result.endLine;
    if (!filePath || !startLine) continue;

    const fileText = readFileLines(filePath, startLine, endLine);
    if (fileText) {
      estimates.set(index, estimateTokenCount(fileText));
    }
  }

  // Batch-load original chunk texts from codebase.db
  const textById = loadChunkTexts(codebaseDb, pendingOriginals.map((p) => p.id));
  for (const { index, id } of pendingOriginals) {
    const chunkText = textById.get(id);
    if (chunkText) {
      estimates.set(index, estimateTokenCount(chunkText));
    }
  }

  return estimates;
}
|
|
126
|
+
|
|
127
|
+
// Per-hop score decay: graph-expanded results are treated as less relevant
// than direct hits, and 2-hop results less relevant again.
const HOP_DECAY = 0.6;
const HOP2_DECAY = 0.35;

// Edge priority scores used when ranking 2-hop candidates.
const EDGE_PRIORITY = { extends: 4, implements: 4, imports: 3, calls: 2, uses: 1 };

// PathRAG/LEGO-GraphRAG SOTA scoring constants.
// Effective alpha of an edge = BASE_ALPHA + EDGE_ALPHA_BONUS[type].
const BASE_ALPHA = 0.55;
const EDGE_ALPHA_BONUS = {
  extends: 0.25,    // 0.55 + 0.25 = 0.80
  implements: 0.25, // 0.55 + 0.25 = 0.80
  imports: 0.10,    // 0.55 + 0.10 = 0.65
  calls: 0.05,      // 0.55 + 0.05 = 0.60
  uses: 0.00,       // 0.55 + 0.00 = 0.55
};
// Adaptive 2-hop candidates scoring below this value are skipped.
const FLOW_THRESHOLD = 0.05;

// Structural entity type boosts for reranking
const TYPE_BOOST = { class: 1.3, function: 1.2, method: 1.2, interface: 1.3, struct: 1.2 };
|
|
159
|
+
|
|
160
|
+
/**
 * Coerce a semantic blend weight into the range [0, 1].
 *
 * @param {number} value - Requested weight.
 * @returns {number} `value` clamped to [0, 1], or 0.4 when `value` is not a
 *   finite number (NaN, ±Infinity, or a non-number).
 */
function clampSemanticWeight(value) {
  if (!Number.isFinite(value)) {
    return 0.4;
  }
  const cappedAtOne = Math.min(1, value);
  return Math.max(0, cappedAtOne);
}
|
|
164
|
+
|
|
165
|
+
/**
 * Min-max normalize a list of numbers into [0, 1].
 *
 * The extremes are found with a single pass instead of spreading the array
 * into Math.min/Math.max, which throws a RangeError once the array is larger
 * than the engine's argument limit. Results are otherwise unchanged: NaN
 * still propagates, because Math.min/Math.max return NaN if any operand is NaN.
 *
 * @param {number[]} values
 * @returns {number[]} New array of the same length. Empty input gives `[]`;
 *   when every value is equal, each entry is 0.5.
 */
function normalizeMinMax(values) {
  if (values.length === 0) return [];

  let min = Infinity;
  let max = -Infinity;
  for (const v of values) {
    min = Math.min(min, v);
    max = Math.max(max, v);
  }

  // No spread to scale by: place every value in the middle of the range.
  if (max === min) return values.map(() => 0.5);

  const span = max - min;
  return values.map((v) => (v - min) / span);
}
|
|
172
|
+
|
|
173
|
+
/**
 * Linearly interpolate between a graph score and a cosine similarity.
 *
 * @param {number} graphScore - Graph-derived score.
 * @param {number} cosineSim - Semantic similarity score.
 * @param {number} weight - Share given to `cosineSim`; `graphScore` gets `1 - weight`.
 * @returns {number} `(1 - weight) * graphScore + weight * cosineSim`
 */
function blendScores(graphScore, cosineSim, weight) {
  const graphPart = (1 - weight) * graphScore;
  const semanticPart = weight * cosineSim;
  return graphPart + semanticPart;
}
|
|
176
|
+
|
|
177
|
+
/**
 * Expand search results using the entity relationship graph.
 *
 * Pipeline: collect seed entity IDs → 1-hop neighbors → optional 2nd hop →
 * look up at most `maxExpanded` expanded entities → score them as the best
 * original score times a hop decay → rerank → apply the token budget to
 * originals plus expansions.
 *
 * The input `results` array itself is returned when `expandMode` is 'none',
 * when there are no results, no seed IDs, or no neighbors. Otherwise the
 * budgeted array is returned with the budget stats attached as `_budgetStats`.
 *
 * @param {import('better-sqlite3').Database} db - The code graph database
 * @param {Array} results - Initial search results with entity IDs
 * @param {Object} options
 * @param {string} options.expandMode - 'none' | '1hop' | '2hop' (default '1hop')
 * @param {number} options.maxExpanded - Max expanded results to add (default 10)
 * @param {number} options.tokenBudget - Max total tokens in expanded set (default 8000)
 * @param {Set<string>} options.edgeTypes - Relationship types to follow
 * @param {boolean} [options.adaptiveHop2] - Use expandSecondHopAdaptive for the 2nd hop
 * @param {number} [options.hop2TokenBudget] - Token budget handed to the adaptive 2nd hop (default 4000)
 * @param {*} [options.expandedBudget] - Forwarded unchanged to applyTokenBudget
 * @param {*} [options.queryInt8] - Forwarded to the 2nd-hop expanders and the reranker
 * @param {*} [options.hnswIndex] - Forwarded to the 2nd-hop expanders and the reranker
 * @param {number} [options.semanticWeight] - Clamped to [0, 1] before use (default 0.4)
 * @param {Function} [options.cosineSimilarity] - Forwarded to the 2nd-hop expanders and the reranker
 * @param {*} [options.codebaseDb] - Forwarded to applyTokenBudget
 * @param {Function} [options.readFileLines] - Forwarded to applyTokenBudget
 * @returns {Array} Expanded and reranked results
 */
export function expandResults(db, results, options = {}) {
  const {
    expandMode = '1hop',
    maxExpanded = 10,
    tokenBudget = 8000,
    edgeTypes = DEFAULT_EDGE_TYPES,
    adaptiveHop2 = false,
    hop2TokenBudget = 4000,
    expandedBudget,
    queryInt8 = null,
    hnswIndex = null,
    semanticWeight = 0.4,
    cosineSimilarity = null,
    codebaseDb = null,
    readFileLines = null,
  } = options;

  // Semantic settings shared by the second-hop expanders and the reranker.
  const semanticOptions = {
    queryInt8,
    hnswIndex,
    semanticWeight: clampSemanticWeight(semanticWeight),
    cosineSimilarity,
  };

  if (expandMode === 'none' || results.length === 0) {
    return results;
  }

  // Seeds: the entities behind the original results.
  const seedIds = collectSeedIds(db, results);
  if (seedIds.size === 0) {
    return results;
  }

  // First hop: neighbors over forward and reverse edges.
  const expanded = expandOneHop(db, seedIds, edgeTypes);

  // Second hop (only on request, and only if the first hop found anything).
  const wantsSecondHop = expandMode === '2hop' && expanded.size > 0;
  if (wantsSecondHop && adaptiveHop2) {
    expandSecondHopAdaptive(db, seedIds, expanded, edgeTypes, {
      maxHop2: maxExpanded,
      tokenBudget: hop2TokenBudget,
      ...semanticOptions,
    });
  } else if (wantsSecondHop) {
    expandSecondHop(db, seedIds, expanded, edgeTypes, { ...semanticOptions });
  }

  if (expanded.size === 0) {
    return results;
  }

  // The map's insertion order decides which IDs survive the maxExpanded cut.
  const expandedIds = Array.from(expanded.keys()).slice(0, maxExpanded);
  const expandedResults = lookupEntities(db, expandedIds, expanded);

  // Score expansions relative to the best original score (floored at 1).
  const originalScores = results.map((r) => r.score || 0);
  const maxOriginalScore = Math.max(...originalScores, 1);
  for (const entity of expandedResults) {
    const info = entity.expansion;
    const hopCount = info?.hops || 1;
    const defaultDecay = hopCount === 1 ? HOP_DECAY : HOP2_DECAY;
    entity.score = maxOriginalScore * (info?.decay || defaultDecay);
  }

  // Composite rerank of the expansions (mutates expandedResults).
  rerankExpanded(expandedResults, results, { ...semanticOptions });

  const budgetOutcome = applyTokenBudget(
    [...results, ...expandedResults],
    tokenBudget,
    { expandedBudget, codebaseDb, readFileLines }
  );

  const budgeted = budgetOutcome.results;
  budgeted._budgetStats = budgetOutcome.stats;
  return budgeted;
}
|
|
268
|
+
|
|
269
|
+
/**
 * Collect the seed entity IDs for a set of search results.
 *
 * Each result contributes the first truthy value of `entity_id`,
 * `metadata.entity_id`, or `id`. Only when no result yields an ID does the
 * fallback run: all non-stale entities are loaded, and each result is matched
 * to the first entity in the same file whose line range contains the
 * result's start line.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {Array} results
 * @returns {Set<string>} Seed IDs; empty if nothing matched or the entity
 *   query failed.
 */
function collectSeedIds(db, results) {
  const seedIds = new Set();

  for (const r of results) {
    const directId = r.entity_id || r.metadata?.entity_id || r.id;
    if (directId) {
      seedIds.add(directId);
    }
  }
  if (seedIds.size > 0) {
    return seedIds;
  }

  // Fallback: match results to entities by file_path + line range
  let entities;
  try {
    entities = db.prepare(`
      SELECT id, file_path, start_line, end_line
      FROM entities WHERE stale_since IS NULL
    `).all();
  } catch {
    return seedIds;
  }

  for (const r of results) {
    const filePath = r.file_path || r.file || r.metadata?.file || r.metadata?.path;
    const lineStart = r.start_line || r.startLine || r.metadata?.line_start || r.metadata?.startLine;
    // Without both a path and a start line no entity can match.
    if (!filePath || lineStart == null) continue;

    const owner = entities.find((e) =>
      e.file_path === filePath &&
      e.start_line != null &&
      e.start_line <= lineStart &&
      e.end_line >= lineStart
    );
    if (owner) {
      seedIds.add(owner.id);
    }
  }

  return seedIds;
}
|
|
315
|
+
|
|
316
|
+
/**
 * Perform 1-hop graph expansion from seed entity IDs.
 *
 * Both edge directions are queried: forward (seed → neighbor) and reverse
 * (neighbor → seed). A neighbor is kept when its edge type is in `edgeTypes`
 * and it is not itself a seed. Each neighbor's score is
 * BASE_ALPHA + EDGE_ALPHA_BONUS[type]; when a neighbor is reachable through
 * several edges, the highest-scoring edge wins. A failing query is treated
 * as returning no rows.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {Set<string>} seedIds
 * @param {Set<string>} edgeTypes
 * @returns {Map<string, {via: string, direction: string, score: number, hops?: number}>}
 */
export function expandOneHop(db, seedIds, edgeTypes) {
  const expanded = new Map();
  const seedArray = [...seedIds];
  const placeholders = seedArray.map(() => '?').join(',');

  // Best-effort query: any database error yields an empty row list.
  const fetchRows = (sql) => {
    try {
      return db.prepare(sql).all(...seedArray);
    } catch {
      return [];
    }
  };

  // Forward edges: seed -> neighbor
  const forwardRels = fetchRows(`
    SELECT DISTINCT target_id, type FROM relationships
    WHERE source_id IN (${placeholders}) AND target_id IS NOT NULL
  `);

  // Reverse edges: neighbor -> seed
  const reverseRels = fetchRows(`
    SELECT DISTINCT source_id, type FROM relationships
    WHERE target_id IN (${placeholders}) AND source_id IS NOT NULL
  `);

  // Record a neighbor, keeping only its best-scoring edge.
  const consider = (neighborId, type, direction) => {
    if (!edgeTypes.has(type) || seedIds.has(neighborId)) return;
    const score = BASE_ALPHA + (EDGE_ALPHA_BONUS[type] || 0);
    const current = expanded.get(neighborId);
    if (!current || score > current.score) {
      expanded.set(neighborId, { via: type, direction, score });
    }
  };

  for (const rel of forwardRels) {
    consider(rel.target_id, rel.type, 'forward');
  }
  for (const rel of reverseRels) {
    consider(rel.source_id, rel.type, 'reverse');
  }

  return expanded;
}
|
|
370
|
+
|
|
371
|
+
/**
 * Perform 2nd-hop expansion from the 1-hop neighbors (forward edges only).
 *
 * New targets are added to `expanded` with `hops: 2`. Without semantic inputs
 * they are added in query row order. With semantic inputs (queryInt8,
 * hnswIndex, cosineSimilarity and a positive semanticWeight) each candidate is
 * scored as hop-1 score × edge priority × edge weight, min-max normalized,
 * blended with the query similarity when the target has a vector, and the
 * targets are inserted best-first. Insertion order matters because callers
 * truncate the map's keys.
 *
 * The query selects the `weight` column so the weight factor is actually
 * applied; without it `rel.weight` was always undefined and every edge
 * counted as weight 1.0. If that query fails (for example on a schema
 * without `weight`), the previous column list is used instead.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {Set<string>} seedIds - Original seeds
 * @param {Map<string, Object>} expanded - 1-hop expansion map (mutated in place)
 * @param {Set<string>} edgeTypes
 * @param {Object} [options]
 * @param {*} [options.queryInt8] - Query vector passed to `cosineSimilarity`
 * @param {Object} [options.hnswIndex] - Must provide `getInt8Vector(id)`
 * @param {number} [options.semanticWeight] - Blend weight for the similarity (default 0.4)
 * @param {Function} [options.cosineSimilarity] - (queryVec, entityVec) => number
 * @returns {void}
 */
export function expandSecondHop(db, seedIds, expanded, edgeTypes, options = {}) {
  const {
    queryInt8 = null,
    hnswIndex = null,
    semanticWeight = 0.4,
    cosineSimilarity = null,
  } = options;
  const semanticEnabled = !!(queryInt8 && hnswIndex && cosineSimilarity && semanticWeight > 0);

  const hop1Ids = [...expanded.keys()];
  if (hop1Ids.length === 0) return;

  const ph = hop1Ids.map(() => '?').join(',');
  const queryEdges = (columns) => db.prepare(`
    SELECT ${columns} FROM relationships
    WHERE source_id IN (${ph}) AND target_id IS NOT NULL
  `).all(...hop1Ids);

  let hop2Forward;
  try {
    hop2Forward = queryEdges('source_id, target_id, type, weight');
  } catch {
    // Fall back to the weight-less query; edges then count as weight 1.0.
    try {
      hop2Forward = queryEdges('source_id, target_id, type');
    } catch {
      return;
    }
  }

  if (!semanticEnabled) {
    for (const rel of hop2Forward) {
      if (edgeTypes.has(rel.type) && !seedIds.has(rel.target_id) && !expanded.has(rel.target_id)) {
        expanded.set(rel.target_id, { via: rel.type, direction: 'forward', hops: 2 });
      }
    }
    return;
  }

  const excluded = new Set([...seedIds, ...expanded.keys()]);
  const candidates = [];
  for (const rel of hop2Forward) {
    if (!edgeTypes.has(rel.type) || excluded.has(rel.target_id)) continue;

    const hop1Entry = expanded.get(rel.source_id);
    const hop1Score = hop1Entry?.score ?? 1; // identity: preserves old edgePriority × weight
    const graphScore = hop1Score * (EDGE_PRIORITY[rel.type] || 1) * (rel.weight || 1.0);

    let normSim = null;
    const entityInt8 = hnswIndex.getInt8Vector(rel.target_id);
    if (entityInt8) {
      const cosSim = cosineSimilarity(queryInt8, entityInt8);
      normSim = (cosSim + 1) / 2; // map [-1, 1] onto [0, 1]
    }
    candidates.push({ rel, graphScore, normSim });
  }

  if (candidates.length === 0) return;
  const normalizedGraphScores = normalizeMinMax(candidates.map(c => c.graphScore));

  // Several hop-1 sources may reach the same target: keep its best path.
  const bestByTarget = new Map();
  for (let i = 0; i < candidates.length; i++) {
    const c = candidates[i];
    const normGraph = normalizedGraphScores[i];
    let score = normGraph;
    if (c.normSim != null) {
      score = blendScores(normGraph, c.normSim, semanticWeight);
    }

    const prev = bestByTarget.get(c.rel.target_id);
    if (!prev || score > prev.score) {
      bestByTarget.set(c.rel.target_id, { rel: c.rel, score });
    }
  }

  // Insert best-first so the ranking is reflected in the map's key order.
  const ranked = [...bestByTarget.values()].sort((a, b) => b.score - a.score);
  for (const c of ranked) {
    expanded.set(c.rel.target_id, {
      via: c.rel.type,
      direction: 'forward',
      hops: 2,
    });
  }
}
|
|
456
|
+
|
|
457
|
+
/**
|
|
458
|
+
* Perform adaptive 2nd-hop expansion with per-edge-type alpha decay,
|
|
459
|
+
* degree normalization, and flow-based early stopping (PathRAG-style).
|
|
460
|
+
*
|
|
461
|
+
* @param {import('better-sqlite3').Database} db
|
|
462
|
+
* @param {Set<string>} seedIds - Original seed entity IDs
|
|
463
|
+
* @param {Map<string, Object>} hop1Expanded - 1-hop expansion map (mutated in place)
|
|
464
|
+
* @param {Set<string>} edgeTypes - Allowed edge types
|
|
465
|
+
* @param {Object} options
|
|
466
|
+
* @param {number} options.maxHop2 - Max 2-hop entities to add
|
|
467
|
+
* @param {number} options.tokenBudget - Token budget for 2-hop expansion
|
|
468
|
+
* @returns {{ added: number, budgetUsed: number, candidates: number }}
|
|
469
|
+
*/
|
|
470
|
+
export function expandSecondHopAdaptive(db, seedIds, hop1Expanded, edgeTypes, options = {}) {
|
|
471
|
+
const {
|
|
472
|
+
maxHop2 = 5,
|
|
473
|
+
tokenBudget = 4000,
|
|
474
|
+
queryInt8 = null,
|
|
475
|
+
hnswIndex = null,
|
|
476
|
+
semanticWeight = 0.4,
|
|
477
|
+
cosineSimilarity = null,
|
|
478
|
+
} = options;
|
|
479
|
+
const semanticEnabled = !!(queryInt8 && hnswIndex && cosineSimilarity && semanticWeight > 0);
|
|
480
|
+
|
|
481
|
+
const hop1Ids = [...hop1Expanded.keys()];
|
|
482
|
+
if (hop1Ids.length === 0) return { added: 0, budgetUsed: 0, candidates: 0 };
|
|
483
|
+
|
|
484
|
+
const ph = hop1Ids.map(() => '?').join(',');
|
|
485
|
+
|
|
486
|
+
// Query out-degrees for hop-1 nodes, filtered to active edge types only.
|
|
487
|
+
// Counting all edge types would over-penalize nodes with many irrelevant edges.
|
|
488
|
+
// Safety: edgeTypes is always code-controlled (DEFAULT_EDGE_TYPES or intent policy
|
|
489
|
+
// constants). Not parameterized because better-sqlite3 doesn't support mixing
|
|
490
|
+
// positional params across two IN clauses cleanly. Never pass user input here.
|
|
491
|
+
const typeList = [...edgeTypes].map(t => `'${t}'`).join(',');
|
|
492
|
+
let degreeMap;
|
|
493
|
+
try {
|
|
494
|
+
const degRows = db.prepare(`
|
|
495
|
+
SELECT source_id, COUNT(*) as deg FROM relationships
|
|
496
|
+
WHERE source_id IN (${ph}) AND type IN (${typeList})
|
|
497
|
+
GROUP BY source_id
|
|
498
|
+
`).all(...hop1Ids);
|
|
499
|
+
degreeMap = new Map(degRows.map(r => [r.source_id, r.deg]));
|
|
500
|
+
} catch {
|
|
501
|
+
degreeMap = new Map();
|
|
502
|
+
}
|
|
503
|
+
|
|
504
|
+
// Query candidate 2-hop targets with source, weights, and line ranges
|
|
505
|
+
let rawCandidates;
|
|
506
|
+
try {
|
|
507
|
+
rawCandidates = db.prepare(`
|
|
508
|
+
SELECT r.source_id, r.target_id, r.type, r.weight, e.file_path, e.start_line, e.end_line
|
|
509
|
+
FROM relationships r
|
|
510
|
+
JOIN entities e ON e.id = r.target_id AND e.stale_since IS NULL
|
|
511
|
+
WHERE r.source_id IN (${ph}) AND r.target_id IS NOT NULL
|
|
512
|
+
`).all(...hop1Ids);
|
|
513
|
+
} catch {
|
|
514
|
+
return { added: 0, budgetUsed: 0, candidates: 0 };
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
// Filter by edge types and score all paths.
|
|
518
|
+
const excluded = new Set([...seedIds, ...hop1Expanded.keys()]);
|
|
519
|
+
const vectorCache = semanticEnabled ? new Map() : null;
|
|
520
|
+
const scoredCandidates = [];
|
|
521
|
+
|
|
522
|
+
for (const c of rawCandidates) {
|
|
523
|
+
if (!edgeTypes.has(c.type) || excluded.has(c.target_id)) continue;
|
|
524
|
+
|
|
525
|
+
const effectiveAlpha = BASE_ALPHA + (EDGE_ALPHA_BONUS[c.type] || 0);
|
|
526
|
+
const edgePriority = EDGE_PRIORITY[c.type] || 1;
|
|
527
|
+
const weight = c.weight || 1.0;
|
|
528
|
+
const outDegree = degreeMap.get(c.source_id) || 1;
|
|
529
|
+
const hop1Entry = hop1Expanded.get(c.source_id);
|
|
530
|
+
const hop1Score = hop1Entry?.score ?? effectiveAlpha;
|
|
531
|
+
const graphScore = (hop1Score * effectiveAlpha * weight * edgePriority) / Math.sqrt(outDegree);
|
|
532
|
+
|
|
533
|
+
let normSim = null;
|
|
534
|
+
if (semanticEnabled) {
|
|
535
|
+
if (!vectorCache.has(c.target_id)) {
|
|
536
|
+
vectorCache.set(c.target_id, hnswIndex.getInt8Vector(c.target_id));
|
|
537
|
+
}
|
|
538
|
+
const entityInt8 = vectorCache.get(c.target_id);
|
|
539
|
+
if (entityInt8) {
|
|
540
|
+
const cosSim = cosineSimilarity(queryInt8, entityInt8);
|
|
541
|
+
normSim = (cosSim + 1) / 2;
|
|
542
|
+
}
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
const estimatedTokens = fallbackTokenEstimate({
|
|
546
|
+
file_path: c.file_path,
|
|
547
|
+
start_line: c.start_line,
|
|
548
|
+
end_line: c.end_line,
|
|
549
|
+
});
|
|
550
|
+
|
|
551
|
+
scoredCandidates.push({
|
|
552
|
+
target_id: c.target_id,
|
|
553
|
+
source_id: c.source_id,
|
|
554
|
+
type: c.type,
|
|
555
|
+
graphScore,
|
|
556
|
+
normSim,
|
|
557
|
+
estimatedTokens,
|
|
558
|
+
effectiveAlpha,
|
|
559
|
+
outDegree,
|
|
560
|
+
});
|
|
561
|
+
}
|
|
562
|
+
|
|
563
|
+
if (scoredCandidates.length === 0) return { added: 0, budgetUsed: 0, candidates: 0 };
|
|
564
|
+
|
|
565
|
+
const normalizedGraphScores = semanticEnabled
|
|
566
|
+
? normalizeMinMax(scoredCandidates.map(c => c.graphScore))
|
|
567
|
+
: [];
|
|
568
|
+
|
|
569
|
+
// Multiple hop-1 sources may reach the same target — keep the highest score.
|
|
570
|
+
const bestByTarget = new Map(); // target_id -> best scored entry
|
|
571
|
+
for (let i = 0; i < scoredCandidates.length; i++) {
|
|
572
|
+
const c = scoredCandidates[i];
|
|
573
|
+
let score = c.graphScore;
|
|
574
|
+
|
|
575
|
+
if (semanticEnabled) {
|
|
576
|
+
const normGraph = normalizedGraphScores[i];
|
|
577
|
+
score = normGraph;
|
|
578
|
+
if (c.normSim != null) {
|
|
579
|
+
score = blendScores(normGraph, c.normSim, semanticWeight);
|
|
580
|
+
}
|
|
581
|
+
}
|
|
582
|
+
|
|
583
|
+
// PathRAG-style early stopping
|
|
584
|
+
if (score < FLOW_THRESHOLD) continue;
|
|
585
|
+
|
|
586
|
+
const prev = bestByTarget.get(c.target_id);
|
|
587
|
+
if (prev && prev.score >= score) continue;
|
|
588
|
+
bestByTarget.set(c.target_id, { ...c, score });
|
|
589
|
+
}
|
|
590
|
+
|
|
591
|
+
const scored = [...bestByTarget.values()];
|
|
592
|
+
|
|
593
|
+
// Sort by composite score descending
|
|
594
|
+
scored.sort((a, b) => b.score - a.score);
|
|
595
|
+
|
|
596
|
+
// Greedily select candidates within token budget and maxHop2 limit
|
|
597
|
+
let budgetUsed = 0;
|
|
598
|
+
let count = 0;
|
|
599
|
+
|
|
600
|
+
for (const s of scored) {
|
|
601
|
+
if (count >= maxHop2) break;
|
|
602
|
+
if (budgetUsed + s.estimatedTokens > tokenBudget && count > 0) break;
|
|
603
|
+
|
|
604
|
+
const decay = s.effectiveAlpha * s.effectiveAlpha;
|
|
605
|
+
|
|
606
|
+
hop1Expanded.set(s.target_id, {
|
|
607
|
+
via: s.type,
|
|
608
|
+
direction: 'forward',
|
|
609
|
+
hops: 2,
|
|
610
|
+
adaptiveScore: s.score,
|
|
611
|
+
decay,
|
|
612
|
+
sourceOutDegree: s.outDegree,
|
|
613
|
+
});
|
|
614
|
+
|
|
615
|
+
budgetUsed += s.estimatedTokens;
|
|
616
|
+
count++;
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
return { added: count, budgetUsed, candidates: scored.length };
|
|
620
|
+
}
|
|
621
|
+
|
|
622
|
+
/**
 * Fetch the entity rows for a set of expanded IDs and shape them as result objects.
 *
 * Only rows whose `stale_since` is NULL are selected. Every returned object carries
 * both spellings of the shared fields (`id`/`entity_id`, `file`/`file_path`,
 * `startLine`/`start_line`, `endLine`/`end_line`), starts with `score: 0`, and is
 * flagged `is_expanded: true`. Rows come back in whatever order the query yields.
 *
 * Any error thrown while preparing or running the query results in an empty array.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string[]} expandedIds - Entity IDs to look up
 * @param {Map<string, Object>} expansionMeta - Per-ID metadata attached as `expansion`
 * @returns {Array} Result objects for the IDs that were found
 */
function lookupEntities(db, expandedIds, expansionMeta) {
  if (expandedIds.length === 0) return [];

  // One bound parameter per requested ID.
  const placeholders = expandedIds.map(() => '?').join(',');

  let rows;
  try {
    const statement = db.prepare(`
      SELECT id, file_path, type, name, signature, start_line, end_line
      FROM entities WHERE id IN (${placeholders}) AND stale_since IS NULL
    `);
    rows = statement.all(...expandedIds);
  } catch {
    // Best effort: a failed lookup simply contributes no expanded entities.
    return [];
  }

  return rows.map((row) => {
    const { id, file_path: filePath, start_line: firstLine, end_line: lastLine } = row;
    return {
      entity_id: id,
      id,
      file_path: filePath,
      file: filePath,
      name: row.name,
      type: row.type,
      signature: row.signature,
      startLine: firstLine,
      endLine: lastLine,
      start_line: firstLine,
      end_line: lastLine,
      expansion: expansionMeta.get(id),
      score: 0,
      is_expanded: true,
    };
  });
}
|
|
661
|
+
|
|
662
|
+
/**
 * Re-score expanded results and order them best-first.
 *
 * Each result's incoming `score` (0 when missing) is multiplied by:
 *   - 1.5 when its file path matches the file path of any seed result, and
 *   - the `TYPE_BOOST` entry for its entity type, when that entry is truthy.
 *
 * When `queryInt8`, `hnswIndex`, `cosineSimilarity` and a positive clamped
 * `semanticWeight` are all supplied, the boosted scores are passed through
 * `normalizeMinMax`; for every entity whose int8 vector is found in the index the
 * normalized score is blended (via `blendScores`) with the query similarity
 * rescaled as `(cos + 1) / 2`. Entities without a vector keep the normalized score.
 *
 * Mutates `expandedResults` in place: overwrites `.score` and sorts descending.
 *
 * @param {Array} expandedResults - Expanded results with scores already assigned
 * @param {Array} seedResults - Original seed results (source of the seed file paths)
 * @param {Object} [options]
 * @param {*} [options.queryInt8=null] - Query vector handed to `cosineSimilarity`
 * @param {Object} [options.hnswIndex=null] - Index exposing `getInt8Vector(id)`
 * @param {number} [options.semanticWeight=0.4] - Blend weight, passed through `clampSemanticWeight`
 * @param {Function} [options.cosineSimilarity=null] - `(queryVector, entityVector) => number`
 * @returns {Array} The same array instance, sorted by reranked score descending
 */
export function rerankExpanded(expandedResults, seedResults, options = {}) {
  const {
    queryInt8 = null,
    hnswIndex = null,
    semanticWeight = 0.4,
    cosineSimilarity = null,
  } = options;
  const blendWeight = clampSemanticWeight(semanticWeight);
  const useSemantic = Boolean(queryInt8 && hnswIndex && cosineSimilarity && blendWeight > 0);

  if (expandedResults.length === 0) return expandedResults;

  // Results may expose their path under any of these three keys.
  const pathOf = (entry) => entry.file_path || entry.file || entry.metadata?.path;
  const seedFiles = new Set(seedResults.map((seed) => pathOf(seed)).filter(Boolean));

  // Graph-side score: incoming score x file-proximity boost x entity-type boost.
  const boostedScores = expandedResults.map((entry) => {
    const entryFile = pathOf(entry);
    const proximityBoost = entryFile && seedFiles.has(entryFile) ? 1.5 : 1;
    const typeBoost = TYPE_BOOST[entry.type || entry.metadata?.chunk_type] || 1;
    return (entry.score || 0) * proximityBoost * typeBoost;
  });

  if (useSemantic) {
    const normalizedScores = normalizeMinMax(boostedScores);
    expandedResults.forEach((entry, index) => {
      const graphPart = normalizedScores[index];
      const entityVector = hnswIndex.getInt8Vector(entry.entity_id || entry.id);
      if (!entityVector) {
        // No stored vector: fall back to the normalized graph score alone.
        entry.score = graphPart;
        return;
      }
      // Map cosine similarity from [-1, 1] onto [0, 1] before blending.
      const similarity = (cosineSimilarity(queryInt8, entityVector) + 1) / 2;
      entry.score = blendScores(graphPart, similarity, blendWeight);
    });
  } else {
    expandedResults.forEach((entry, index) => {
      entry.score = boostedScores[index];
    });
  }

  // Highest reranked score first.
  expandedResults.sort((left, right) => (right.score || 0) - (left.score || 0));
  return expandedResults;
}
|
|
739
|
+
|
|
740
|
+
/**
 * Trim a ranked result list so that it fits inside a token budget.
 *
 * Results are visited in the order given. A result's cost is the positive value
 * that `computeTokenEstimates` reports for its index (only consulted when
 * `codebaseDb` or `readFileLines` is supplied); otherwise `fallbackTokenEstimate`
 * is used. The walk stops at the first result that would push the running total
 * past `budget`, except that the very first kept result is always admitted.
 * Expanded results that would overflow `expandedBudget` are skipped without
 * stopping the walk.
 *
 * @param {Array} results
 * @param {number} budget - Total token budget
 * @param {Object} [options]
 * @param {number} [options.expandedBudget] - Separate budget for expanded results
 * @param {import('better-sqlite3').Database} [options.codebaseDb] - For chunk text lookup
 * @param {Function} [options.readFileLines] - For expanded result text lookup
 * @returns {{ results: Array, stats: { original: Object, hop1: Object, hop2: Object, total: Object } }}
 */
export function applyTokenBudget(results, budget, options = {}) {
  const { expandedBudget, codebaseDb, readFileLines } = options;

  // Precise per-index counts are only computed when a text source was handed in.
  const hasTextSource = Boolean(codebaseDb || readFileLines);
  const preciseCounts = hasTextSource
    ? computeTokenEstimates(results, { codebaseDb, readFileLines })
    : new Map();

  const kept = [];
  const original = { count: 0, tokens: 0 };
  const hop1 = { count: 0, tokens: 0 };
  const hop2 = { count: 0, tokens: 0 };
  let spentTotal = 0;
  let spentExpanded = 0;

  for (const [index, result] of results.entries()) {
    const precise = preciseCounts.get(index);
    let cost;
    if (precise != null && precise > 0) {
      cost = precise;
    } else {
      cost = fallbackTokenEstimate(result);
    }

    const expanded = Boolean(result.is_expanded);
    const hopCount = result.expansion?.hops || (expanded ? 1 : 0);

    // Overall cap: stop here, but never return an empty list.
    if (kept.length > 0 && spentTotal + cost > budget) break;

    // Expanded-only cap: drop this one and keep looking at later results.
    const overExpandedCap =
      expanded && expandedBudget != null && spentExpanded + cost > expandedBudget;
    if (overExpandedCap) continue;

    kept.push(result);
    spentTotal += cost;
    if (expanded) spentExpanded += cost;

    // Attribute the cost to the matching category.
    let bucket = hop1;
    if (!expanded) {
      bucket = original;
    } else if (hopCount === 2) {
      bucket = hop2;
    }
    bucket.count += 1;
    bucket.tokens += cost;
  }

  return {
    results: kept,
    stats: {
      original,
      hop1,
      hop2,
      total: { count: kept.length, tokens: spentTotal },
    },
  };
}
|
|
807
|
+
|
|
808
|
+
/**
 * Count the relationships that touch a set of entities, grouped by relationship type.
 *
 * A relationship is counted when either its `source_id` or its `target_id` is one
 * of the given IDs and both endpoints are non-NULL. A missing or empty ID list,
 * or any error raised by the query, yields zeroed statistics.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string[]} entityIds
 * @returns {{ total: number, byType: Record<string, number> }}
 */
export function getExpansionStats(db, entityIds) {
  const nothingToCount = !entityIds || entityIds.length === 0;
  if (nothingToCount) return { total: 0, byType: {} };

  // The same placeholder list is used for both IN clauses, so IDs are bound twice.
  const placeholders = entityIds.map(() => '?').join(',');

  let rows;
  try {
    const statement = db.prepare(`
      SELECT type, COUNT(*) as count FROM relationships
      WHERE (source_id IN (${placeholders}) OR target_id IN (${placeholders}))
        AND source_id IS NOT NULL AND target_id IS NOT NULL
      GROUP BY type
    `);
    rows = statement.all(...entityIds, ...entityIds);
  } catch {
    // Best effort: report empty statistics when the query cannot run.
    return { total: 0, byType: {} };
  }

  const byType = {};
  let total = 0;
  for (const { type, count } of rows) {
    byType[type] = count;
    total += count;
  }

  return { total, byType };
}
|