spec-gen-cli 1.2.1 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -11
- package/dist/api/generate.d.ts.map +1 -1
- package/dist/api/generate.js +5 -4
- package/dist/api/generate.js.map +1 -1
- package/dist/cli/commands/analyze.d.ts.map +1 -1
- package/dist/cli/commands/analyze.js +101 -41
- package/dist/cli/commands/analyze.js.map +1 -1
- package/dist/cli/commands/generate.d.ts.map +1 -1
- package/dist/cli/commands/generate.js +25 -21
- package/dist/cli/commands/generate.js.map +1 -1
- package/dist/cli/commands/mcp.d.ts +353 -10
- package/dist/cli/commands/mcp.d.ts.map +1 -1
- package/dist/cli/commands/mcp.js +236 -45
- package/dist/cli/commands/mcp.js.map +1 -1
- package/dist/cli/commands/view.d.ts.map +1 -1
- package/dist/cli/commands/view.js +33 -4
- package/dist/cli/commands/view.js.map +1 -1
- package/dist/constants.d.ts +10 -0
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +10 -0
- package/dist/constants.js.map +1 -1
- package/dist/core/analyzer/ast-chunker.d.ts +24 -0
- package/dist/core/analyzer/ast-chunker.d.ts.map +1 -0
- package/dist/core/analyzer/ast-chunker.js +198 -0
- package/dist/core/analyzer/ast-chunker.js.map +1 -0
- package/dist/core/analyzer/call-graph.d.ts +51 -4
- package/dist/core/analyzer/call-graph.d.ts.map +1 -1
- package/dist/core/analyzer/call-graph.js +634 -44
- package/dist/core/analyzer/call-graph.js.map +1 -1
- package/dist/core/analyzer/code-shaper.d.ts.map +1 -1
- package/dist/core/analyzer/code-shaper.js +5 -0
- package/dist/core/analyzer/code-shaper.js.map +1 -1
- package/dist/core/analyzer/codebase-digest.d.ts +40 -0
- package/dist/core/analyzer/codebase-digest.d.ts.map +1 -0
- package/dist/core/analyzer/codebase-digest.js +194 -0
- package/dist/core/analyzer/codebase-digest.js.map +1 -0
- package/dist/core/analyzer/cpp-header-resolver.d.ts +30 -0
- package/dist/core/analyzer/cpp-header-resolver.d.ts.map +1 -0
- package/dist/core/analyzer/cpp-header-resolver.js +71 -0
- package/dist/core/analyzer/cpp-header-resolver.js.map +1 -0
- package/dist/core/analyzer/function-registry-trie.d.ts +21 -0
- package/dist/core/analyzer/function-registry-trie.d.ts.map +1 -0
- package/dist/core/analyzer/function-registry-trie.js +39 -0
- package/dist/core/analyzer/function-registry-trie.js.map +1 -0
- package/dist/core/analyzer/import-resolver-bridge.d.ts +25 -0
- package/dist/core/analyzer/import-resolver-bridge.d.ts.map +1 -0
- package/dist/core/analyzer/import-resolver-bridge.js +99 -0
- package/dist/core/analyzer/import-resolver-bridge.js.map +1 -0
- package/dist/core/analyzer/signature-extractor.d.ts.map +1 -1
- package/dist/core/analyzer/signature-extractor.js +72 -3
- package/dist/core/analyzer/signature-extractor.js.map +1 -1
- package/dist/core/analyzer/subgraph-extractor.d.ts +10 -2
- package/dist/core/analyzer/subgraph-extractor.d.ts.map +1 -1
- package/dist/core/analyzer/subgraph-extractor.js +25 -7
- package/dist/core/analyzer/subgraph-extractor.js.map +1 -1
- package/dist/core/analyzer/type-inference-engine.d.ts +23 -0
- package/dist/core/analyzer/type-inference-engine.d.ts.map +1 -0
- package/dist/core/analyzer/type-inference-engine.js +130 -0
- package/dist/core/analyzer/type-inference-engine.js.map +1 -0
- package/dist/core/analyzer/vector-index.d.ts +35 -6
- package/dist/core/analyzer/vector-index.d.ts.map +1 -1
- package/dist/core/analyzer/vector-index.js +308 -54
- package/dist/core/analyzer/vector-index.js.map +1 -1
- package/dist/core/generator/schemas.d.ts +365 -0
- package/dist/core/generator/schemas.d.ts.map +1 -0
- package/dist/core/generator/schemas.js +190 -0
- package/dist/core/generator/schemas.js.map +1 -0
- package/dist/core/generator/spec-pipeline.d.ts +31 -11
- package/dist/core/generator/spec-pipeline.d.ts.map +1 -1
- package/dist/core/generator/spec-pipeline.js +172 -40
- package/dist/core/generator/spec-pipeline.js.map +1 -1
- package/dist/core/generator/stages/stage2-entities.d.ts.map +1 -1
- package/dist/core/generator/stages/stage2-entities.js +4 -2
- package/dist/core/generator/stages/stage2-entities.js.map +1 -1
- package/dist/core/generator/stages/stage3-services.d.ts.map +1 -1
- package/dist/core/generator/stages/stage3-services.js +4 -2
- package/dist/core/generator/stages/stage3-services.js.map +1 -1
- package/dist/core/generator/stages/stage4-api.d.ts.map +1 -1
- package/dist/core/generator/stages/stage4-api.js +4 -2
- package/dist/core/generator/stages/stage4-api.js.map +1 -1
- package/dist/core/generator/stages/stage5-architecture.d.ts +2 -1
- package/dist/core/generator/stages/stage5-architecture.d.ts.map +1 -1
- package/dist/core/generator/stages/stage5-architecture.js +15 -3
- package/dist/core/generator/stages/stage5-architecture.js.map +1 -1
- package/dist/core/generator/stages/stage6-adr.d.ts.map +1 -1
- package/dist/core/generator/stages/stage6-adr.js +2 -1
- package/dist/core/generator/stages/stage6-adr.js.map +1 -1
- package/dist/core/services/chat-agent.d.ts +5 -0
- package/dist/core/services/chat-agent.d.ts.map +1 -1
- package/dist/core/services/chat-agent.js +14 -0
- package/dist/core/services/chat-agent.js.map +1 -1
- package/dist/core/services/chat-tools.d.ts.map +1 -1
- package/dist/core/services/chat-tools.js +172 -50
- package/dist/core/services/chat-tools.js.map +1 -1
- package/dist/core/services/llm-service.d.ts +2 -0
- package/dist/core/services/llm-service.d.ts.map +1 -1
- package/dist/core/services/llm-service.js +47 -5
- package/dist/core/services/llm-service.js.map +1 -1
- package/dist/core/services/mcp-handlers/analysis.d.ts +12 -0
- package/dist/core/services/mcp-handlers/analysis.d.ts.map +1 -1
- package/dist/core/services/mcp-handlers/analysis.js +138 -2
- package/dist/core/services/mcp-handlers/analysis.js.map +1 -1
- package/dist/core/services/mcp-handlers/graph.d.ts +21 -1
- package/dist/core/services/mcp-handlers/graph.d.ts.map +1 -1
- package/dist/core/services/mcp-handlers/graph.js +142 -2
- package/dist/core/services/mcp-handlers/graph.js.map +1 -1
- package/dist/core/services/mcp-handlers/orient.d.ts +17 -0
- package/dist/core/services/mcp-handlers/orient.d.ts.map +1 -0
- package/dist/core/services/mcp-handlers/orient.js +200 -0
- package/dist/core/services/mcp-handlers/orient.js.map +1 -0
- package/dist/core/services/mcp-handlers/semantic.d.ts +18 -4
- package/dist/core/services/mcp-handlers/semantic.d.ts.map +1 -1
- package/dist/core/services/mcp-handlers/semantic.js +161 -17
- package/dist/core/services/mcp-handlers/semantic.js.map +1 -1
- package/dist/core/services/mcp-handlers/utils.d.ts +43 -0
- package/dist/core/services/mcp-handlers/utils.d.ts.map +1 -1
- package/dist/core/services/mcp-handlers/utils.js +66 -1
- package/dist/core/services/mcp-handlers/utils.js.map +1 -1
- package/dist/core/services/mcp-watcher.d.ts +41 -0
- package/dist/core/services/mcp-watcher.d.ts.map +1 -0
- package/dist/core/services/mcp-watcher.js +177 -0
- package/dist/core/services/mcp-watcher.js.map +1 -0
- package/dist/types/index.d.ts +1 -1
- package/dist/types/index.d.ts.map +1 -1
- package/dist/types/pipeline.d.ts +7 -0
- package/dist/types/pipeline.d.ts.map +1 -1
- package/package.json +3 -2
- package/src/viewer/InteractiveGraphViewer.jsx +33 -8
- package/src/viewer/components/ChatPanel.jsx +8 -5
- package/src/viewer/components/ClassGraph.jsx +699 -0
- package/src/viewer/utils/graph-helpers.js +1 -1
- package/src/viewer/utils/themes.js +36 -0
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TypeInferenceEngine — lightweight regex-based type inference for 7 languages.
|
|
3
|
+
*
|
|
4
|
+
* Given the source text of a single function body, returns a map of
|
|
5
|
+
* { variableName → className } inferred from declarations, annotations,
|
|
6
|
+
* and constructor calls.
|
|
7
|
+
*
|
|
8
|
+
* Intentionally NOT a full type system — false positives are acceptable;
|
|
9
|
+
* false negatives (missing resolutions) are the only cost. Only class names
|
|
10
|
+
* starting with an uppercase letter are tracked (conventional in all supported
|
|
11
|
+
* languages), which eliminates most false positives from primitive types.
|
|
12
|
+
*/
|
|
13
|
+
/**
 * Dispatch to the per-language inference routine for `language`.
 *
 * @param {string} source - Source text to scan (handed unchanged to the language rule set).
 * @param {string} language - Language label; compared with strict equality against
 *   'Python', 'C++', 'TypeScript', 'JavaScript', 'Go', 'Rust', 'Java' and 'Ruby'.
 * @returns {Map<string, string>} Variable name → inferred class name. Any other
 *   language label yields a fresh empty Map.
 */
export function inferTypesFromSource(source, language) {
    if (language === 'Python') {
        return inferPython(source);
    }
    if (language === 'C++') {
        return inferCpp(source);
    }
    // TypeScript and JavaScript share one rule set.
    if (language === 'TypeScript' || language === 'JavaScript') {
        return inferTypeScript(source);
    }
    if (language === 'Go') {
        return inferGo(source);
    }
    if (language === 'Rust') {
        return inferRust(source);
    }
    if (language === 'Java') {
        return inferJava(source);
    }
    if (language === 'Ruby') {
        return inferRuby(source);
    }
    return new Map();
}
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
// Per-language inference rules
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
/**
 * Infer variable → class mappings from Python source text.
 *
 * Rules run in the order listed; a later match for the same variable name
 * overwrites an earlier one.
 *
 * @param {string} source - Python source text.
 * @returns {Map<string, string>} Variable name → class name (uppercase-initial only).
 */
function inferPython(source) {
    const rules = [
        // var = ClassName(...)
        /^\s*(\w+)\s*=\s*([A-Z]\w*)\s*\(/gm,
        // var: ClassName = ...
        /^\s*(\w+)\s*:\s*([A-Z]\w*)\s*=/gm,
        // param: ClassName in signatures
        /\b(\w+)\s*:\s*([A-Z]\w*)\b/g,
    ];
    const types = new Map();
    for (const rule of rules) {
        for (const [, variable, className] of source.matchAll(rule)) {
            types.set(variable, className);
        }
    }
    return types;
}
|
|
42
|
+
/**
 * Infer variable → class mappings from C++ source text.
 *
 * Rules are applied in order; a later match for the same variable name
 * overwrites an earlier one.
 *
 * @param {string} source - C++ source text.
 * @returns {Map<string, string>} Variable name → class name (uppercase-initial only).
 */
function inferCpp(source) {
    const types = new Map();
    // Apply one pattern, storing match[varGroup] → match[typeGroup] for every hit.
    const record = (pattern, varGroup, typeGroup) => {
        for (const match of source.matchAll(pattern)) {
            types.set(match[varGroup], match[typeGroup]);
        }
    };
    // ClassName var;  or  ClassName var(...)
    record(/\b([A-Z]\w*)\s+(\w+)\s*[;({]/g, 2, 1);
    // ClassName* var = new ClassName(...)
    record(/\b([A-Z]\w*)\s*\*\s*(\w+)\s*=\s*new\s+\1/g, 2, 1);
    // auto var = make_shared<ClassName>(...) / make_unique<ClassName>(...)
    record(/auto\s+(\w+)\s*=\s*(?:make_shared|make_unique)<([A-Z]\w*)>/g, 1, 2);
    // shared_ptr<ClassName> var / unique_ptr / weak_ptr
    record(/(?:shared_ptr|unique_ptr|weak_ptr)<([A-Z]\w*)>\s+(\w+)/g, 2, 1);
    return types;
}
|
|
58
|
+
/**
 * Infer variable → class mappings from TypeScript / JavaScript source text.
 *
 * Rules are applied in order; a later match for the same variable name
 * overwrites an earlier one, so explicit annotations take precedence over
 * constructor calls.
 *
 * @param {string} source - TypeScript or JavaScript source text.
 * @returns {Map<string, string>} Variable name → class name (uppercase-initial only).
 */
function inferTypeScript(source) {
    const result = new Map();
    // const/let/var name = new ClassName(...)
    // All three declaration keywords are accepted, matching the annotation
    // rule below; restricting this to `const` silently dropped
    // `let x = new Foo()` and `var x = new Foo()`.
    for (const m of source.matchAll(/\b(?:const|let|var)\s+(\w+)\s*=\s*new\s+([A-Z]\w*)\s*\(/g))
        result.set(m[1], m[2]);
    // let/var/const name: ClassName =
    for (const m of source.matchAll(/\b(?:let|var|const)\s+(\w+)\s*:\s*([A-Z]\w*)\s*=/g))
        result.set(m[1], m[2]);
    // param: ClassName in signatures
    for (const m of source.matchAll(/\b(\w+)\s*:\s*([A-Z]\w*)\b/g))
        result.set(m[1], m[2]);
    return result;
}
|
|
71
|
+
/**
 * Infer variable → type mappings from Go source text.
 *
 * Patterns run in the order listed; a later match for the same name
 * overwrites an earlier one.
 *
 * @param {string} source - Go source text.
 * @returns {Map<string, string>} Variable name → type name (uppercase-initial only).
 */
function inferGo(source) {
    const patterns = [
        // var svc *MyService
        /\bvar\s+(\w+)\s+\*?([A-Z]\w*)\b/g,
        // svc := MyService{...} or NewMyService(...)
        /\b(\w+)\s*:=\s*(?:New)?([A-Z]\w*)[{(]/g,
        // svc := &MyService{...}
        /\b(\w+)\s*:=\s*&([A-Z]\w*)\s*{/g,
        // func f(svc *MyService) — parameter annotations
        /\b(\w+)\s+\*?([A-Z]\w*)\b/g,
    ];
    const types = new Map();
    patterns.forEach((pattern) => {
        for (const hit of source.matchAll(pattern)) {
            const [, variable, typeName] = hit;
            types.set(variable, typeName);
        }
    });
    return types;
}
|
|
87
|
+
/**
 * Infer variable → type mappings from Rust `let` bindings.
 *
 * Patterns run in the order written; a later match for the same binding
 * overwrites an earlier one (so `Box::new(T::new(..))` ends up as `T`).
 *
 * @param {string} source - Rust source text.
 * @returns {Map<string, string>} Binding name → type name (uppercase-initial only).
 */
function inferRust(source) {
    const types = new Map();
    const remember = ([, binding, typeName]) => {
        types.set(binding, typeName);
    };
    // let svc: MyService = ...
    [...source.matchAll(/\blet\s+(?:mut\s+)?(\w+)\s*:\s*([A-Z]\w*)\b/g)].forEach(remember);
    // let svc = MyService::new(...) / MyService::default()
    [...source.matchAll(/\blet\s+(?:mut\s+)?(\w+)\s*=\s*([A-Z]\w*)::(?:new|default)\s*\(/g)].forEach(remember);
    // let svc = Box::new(MyService::new(...))
    [...source.matchAll(/\blet\s+(?:mut\s+)?(\w+)\s*=\s*Box::new\(([A-Z]\w*)::new/g)].forEach(remember);
    return types;
}
|
|
100
|
+
/**
 * Infer variable → class mappings from Java source text.
 *
 * The declared type is recorded first; when the initializer is
 * `new ConcreteClass(...)`, the concrete class then replaces it.
 *
 * @param {string} source - Java source text.
 * @returns {Map<string, string>} Variable name → class name (uppercase-initial only).
 */
function inferJava(source) {
    const types = new Map();
    // ClassName var = ...  or  ClassName var;
    const declaration = /\b([A-Z]\w*)\s+(\w+)\s*(?:=|;)/g;
    for (const [, declaredType, variable] of source.matchAll(declaration)) {
        types.set(variable, declaredType);
    }
    // Interface var = new ConcreteClass(...) — prefer the concrete type
    const construction = /\b([A-Z]\w*)\s+(\w+)\s*=\s*new\s+([A-Z]\w*)\s*\(/g;
    for (const [, , variable, concreteType] of source.matchAll(construction)) {
        types.set(variable, concreteType);
    }
    return types;
}
|
|
110
|
+
/**
 * Infer variable → class mappings from Ruby `var = ClassName.new` assignments.
 *
 * @param {string} source - Ruby source text.
 * @returns {Map<string, string>} Variable name → class name (uppercase-initial only);
 *   when a name is assigned more than once, the last assignment wins.
 */
function inferRuby(source) {
    // svc = MyClass.new(...)
    const instantiation = /\b(\w+)\s*=\s*([A-Z]\w*)\.new\b/g;
    const entries = Array.from(
        source.matchAll(instantiation),
        ([, variable, className]) => [variable, className],
    );
    return new Map(entries);
}
|
|
117
|
+
// ---------------------------------------------------------------------------
|
|
118
|
+
// Common resolution helper
|
|
119
|
+
// ---------------------------------------------------------------------------
|
|
120
|
+
/**
 * Resolve a method call on a receiver variable through its inferred type.
 *
 * Looks up `calleeObject` in `inferredTypes`; when a class name is found,
 * asks the trie for `(className, calleeName)` and returns the first match.
 *
 * @param {string} calleeObject - Receiver variable name.
 * @param {string} calleeName - Method name being called.
 * @param {Map<string, string>} inferredTypes - Variable name → class name.
 * @param {{ findByQualifiedName: Function }} trie - Registry exposing
 *   `findByQualifiedName(className, methodName)`; presumably returns an array
 *   of function nodes — confirm against the trie implementation.
 * @returns {*} The first entry returned by the trie, or `undefined` when the
 *   receiver has no inferred type or the trie returns no entries.
 */
export function resolveViaTypeInference(calleeObject, calleeName, inferredTypes, trie) {
    const receiverType = inferredTypes.get(calleeObject);
    if (receiverType) {
        const matches = trie.findByQualifiedName(receiverType, calleeName);
        return matches[0];
    }
    return undefined;
}
|
|
130
|
+
//# sourceMappingURL=type-inference-engine.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"type-inference-engine.js","sourceRoot":"","sources":["../../../src/core/analyzer/type-inference-engine.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAQH,MAAM,UAAU,oBAAoB,CAAC,MAAc,EAAE,QAAgB;IACnE,QAAQ,QAAQ,EAAE,CAAC;QACjB,KAAK,QAAQ,CAAC,CAAK,OAAO,WAAW,CAAC,MAAM,CAAC,CAAC;QAC9C,KAAK,KAAK,CAAC,CAAQ,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC;QAC3C,KAAK,YAAY,CAAC;QAClB,KAAK,YAAY,CAAC,CAAC,OAAO,eAAe,CAAC,MAAM,CAAC,CAAC;QAClD,KAAK,IAAI,CAAC,CAAS,OAAO,OAAO,CAAC,MAAM,CAAC,CAAC;QAC1C,KAAK,MAAM,CAAC,CAAO,OAAO,SAAS,CAAC,MAAM,CAAC,CAAC;QAC5C,KAAK,MAAM,CAAC,CAAO,OAAO,SAAS,CAAC,MAAM,CAAC,CAAC;QAC5C,KAAK,MAAM,CAAC,CAAO,OAAO,SAAS,CAAC,MAAM,CAAC,CAAC;QAC5C,OAAO,CAAC,CAAW,OAAO,IAAI,GAAG,EAAE,CAAC;IACtC,CAAC;AACH,CAAC;AAED,8EAA8E;AAC9E,+BAA+B;AAC/B,8EAA8E;AAE9E,SAAS,WAAW,CAAC,MAAc;IACjC,MAAM,MAAM,GAAkB,IAAI,GAAG,EAAE,CAAC;IACxC,uBAAuB;IACvB,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,mCAAmC,CAAC;QAClE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,uBAAuB;IACvB,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,kCAAkC,CAAC;QACjE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,iCAAiC;IACjC,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,6BAA6B,CAAC;QAC5D,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,QAAQ,CAAC,MAAc;IAC9B,MAAM,MAAM,GAAkB,IAAI,GAAG,EAAE,CAAC;IACxC,yCAAyC;IACzC,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,+BAA+B,CAAC;QAC9D,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,sCAAsC;IACtC,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,2CAA2C,CAAC;QAC1E,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,yEAAyE;IACzE,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,6DAA6D,CAAC;QAC5F,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,wDAAwD;IACxD,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,yDAAyD,CAAC;QACxF,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,OAAO
,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,eAAe,CAAC,MAAc;IACrC,MAAM,MAAM,GAAkB,IAAI,GAAG,EAAE,CAAC;IACxC,iCAAiC;IACjC,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,8CAA8C,CAAC;QAC7E,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,iCAAiC;IACjC,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,mDAAmD,CAAC;QAClF,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,iCAAiC;IACjC,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,6BAA6B,CAAC;QAC5D,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,OAAO,CAAC,MAAc;IAC7B,MAAM,MAAM,GAAkB,IAAI,GAAG,EAAE,CAAC;IACxC,qBAAqB;IACrB,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,kCAAkC,CAAC;QACjE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,+CAA+C;IAC/C,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,wCAAwC,CAAC;QACvE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,yBAAyB;IACzB,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,iCAAiC,CAAC;QAChE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,iDAAiD;IACjD,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,4BAA4B,CAAC;QAC3D,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,SAAS,CAAC,MAAc;IAC/B,MAAM,MAAM,GAAkB,IAAI,GAAG,EAAE,CAAC;IACxC,2BAA2B;IAC3B,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,8CAA8C,CAAC;QAC7E,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,yDAAyD;IACzD,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,kEAAkE,CAAC;QACjG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,0CAA0C;IAC1C,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,2DAA2D,CAAC;QAC1F,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,SAAS,CAAC,MAAc;IAC/B,MAAM,MAAM,GAAkB,IAAI,GAAG,EAAE,CAAC;IACxC,0CAA0C;IAC1C,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,iCAAiC,CAAC;QAChE,MAAM,CAAC,GAAG,CA
AC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,qEAAqE;IACrE,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,mDAAmD,CAAC;QAClF,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,SAAS,CAAC,MAAc;IAC/B,MAAM,MAAM,GAAkB,IAAI,GAAG,EAAE,CAAC;IACxC,yBAAyB;IACzB,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,kCAAkC,CAAC;QACjE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzB,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,8EAA8E;AAC9E,2BAA2B;AAC3B,8EAA8E;AAE9E;;;GAGG;AACH,MAAM,UAAU,uBAAuB,CACrC,YAAoB,EACpB,UAAkB,EAClB,aAA4B,EAC5B,IAA0B;IAE1B,MAAM,SAAS,GAAG,aAAa,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;IAClD,IAAI,CAAC,SAAS;QAAE,OAAO,SAAS,CAAC;IACjC,OAAO,IAAI,CAAC,mBAAmB,CAAC,SAAS,EAAE,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;AAC5D,CAAC"}
|
|
@@ -37,24 +37,53 @@ export interface FunctionRecord {
|
|
|
37
37
|
}
|
|
38
38
|
export interface SearchResult {
|
|
39
39
|
record: Omit<FunctionRecord, 'vector'>;
|
|
40
|
-
/**
|
|
40
|
+
/**
|
|
41
|
+
* Relevance score. For hybrid search (default): RRF score, higher = more relevant.
|
|
42
|
+
* For dense-only search: cosine distance from LanceDB, lower = more similar.
|
|
43
|
+
*/
|
|
41
44
|
score: number;
|
|
42
45
|
}
|
|
43
46
|
export declare class VectorIndex {
|
|
44
47
|
/**
|
|
45
48
|
* Build (or rebuild) the vector index from call graph nodes + signatures.
|
|
46
|
-
*
|
|
49
|
+
*
|
|
50
|
+
* When `incremental` is true and an existing index is found, only functions
|
|
51
|
+
* whose text has changed since the last build are re-embedded. Unchanged
|
|
52
|
+
* functions reuse their cached vectors. Pass `incremental: false` (or omit
|
|
53
|
+
* when no index exists) to do a full rebuild.
|
|
54
|
+
*
|
|
55
|
+
* Returns a summary of how many functions were embedded vs reused.
|
|
47
56
|
*/
|
|
48
|
-
static build(outputDir: string, nodes: FunctionNode[], signatures: FileSignatureMap[], hubIds: Set<string>, entryPointIds: Set<string>, embedSvc: EmbeddingService
|
|
57
|
+
static build(outputDir: string, nodes: FunctionNode[], signatures: FileSignatureMap[], hubIds: Set<string>, entryPointIds: Set<string>, embedSvc: EmbeddingService,
|
|
58
|
+
/** Optional map of filePath → source content for skeleton-based body indexing */
|
|
59
|
+
fileContents?: Map<string, string>,
|
|
60
|
+
/** When true, reuse cached vectors for unchanged functions */
|
|
61
|
+
incremental?: boolean): Promise<{
|
|
62
|
+
embedded: number;
|
|
63
|
+
reused: number;
|
|
64
|
+
}>;
|
|
49
65
|
/**
|
|
50
|
-
*
|
|
51
|
-
*
|
|
66
|
+
* Hybrid search over the index: dense (ANN) + sparse (BM25) merged via RRF.
|
|
67
|
+
*
|
|
68
|
+
* Dense recall fetches top `limit*5` candidates from the vector index.
|
|
69
|
+
* Sparse recall scores the full corpus with BM25 (cached per session).
|
|
70
|
+
* Reciprocal Rank Fusion (RRF) combines both rankings into a single list.
|
|
71
|
+
*
|
|
72
|
+
* Set `hybrid: false` to use dense-only search (original behaviour).
|
|
73
|
+
* Returns up to `limit` results sorted by relevance (highest first).
|
|
52
74
|
*/
|
|
53
|
-
static search(outputDir: string, query: string, embedSvc: EmbeddingService, opts?: {
|
|
75
|
+
static search(outputDir: string, query: string, embedSvc: EmbeddingService | null | undefined, opts?: {
|
|
54
76
|
limit?: number;
|
|
55
77
|
language?: string;
|
|
56
78
|
minFanIn?: number;
|
|
79
|
+
/** Enable hybrid dense+sparse retrieval via RRF (default: true when embedSvc available) */
|
|
80
|
+
hybrid?: boolean;
|
|
57
81
|
}): Promise<SearchResult[]>;
|
|
82
|
+
/**
|
|
83
|
+
* BM25-only search: used when no embedding service is available.
|
|
84
|
+
* Scores the full corpus with BM25 and returns the top `limit` results.
|
|
85
|
+
*/
|
|
86
|
+
private static _bm25Only;
|
|
58
87
|
/**
|
|
59
88
|
* Returns true if a vector index has been built for this output directory.
|
|
60
89
|
*/
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"vector-index.d.ts","sourceRoot":"","sources":["../../../src/core/analyzer/vector-index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAIH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AACpD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,0BAA0B,CAAC;AACjE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;
|
|
1
|
+
{"version":3,"file":"vector-index.d.ts","sourceRoot":"","sources":["../../../src/core/analyzer/vector-index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAIH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AACpD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,0BAA0B,CAAC;AACjE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAO/D,MAAM,WAAW,cAAc;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,OAAO,CAAC;IACf,YAAY,EAAE,OAAO,CAAC;IACtB,2CAA2C;IAC3C,IAAI,EAAE,MAAM,CAAC;IACb,uBAAuB;IACvB,MAAM,EAAE,MAAM,EAAE,CAAC;CAClB;AAED,MAAM,WAAW,YAAY;IAC3B,MAAM,EAAE,IAAI,CAAC,cAAc,EAAE,QAAQ,CAAC,CAAC;IACvC;;;OAGG;IACH,KAAK,EAAE,MAAM,CAAC;CACf;AAuKD,qBAAa,WAAW;IACtB;;;;;;;;;OASG;WACU,KAAK,CAChB,SAAS,EAAE,MAAM,EACjB,KAAK,EAAE,YAAY,EAAE,EACrB,UAAU,EAAE,gBAAgB,EAAE,EAC9B,MAAM,EAAE,GAAG,CAAC,MAAM,CAAC,EACnB,aAAa,EAAE,GAAG,CAAC,MAAM,CAAC,EAC1B,QAAQ,EAAE,gBAAgB;IAC1B,iFAAiF;IACjF,YAAY,CAAC,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC;IAClC,8DAA8D;IAC9D,WAAW,UAAQ,GAClB,OAAO,CAAC;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;IAsIhD;;;;;;;;;OASG;WACU,MAAM,CACjB,SAAS,EAAE,MAAM,EACjB,KAAK,EAAE,MAAM,EACb,QAAQ,EAAE,gBAAgB,GAAG,IAAI,GAAG,SAAS,EAC7C,IAAI,GAAE;QACJ,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,2FAA2F;QAC3F,MAAM,CAAC,EAAE,OAAO,CAAC;KACb,GACL,OAAO,CAAC,YAAY,EAAE,CAAC;IAuH1B;;;OAGG;mBACkB,SAAS;IAoD9B;;OAEG;IACH,MAAM,CAAC,MAAM,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO;CAG1C"}
|
|
@@ -17,19 +17,84 @@
|
|
|
17
17
|
*/
|
|
18
18
|
import { existsSync } from 'node:fs';
|
|
19
19
|
import { join } from 'node:path';
|
|
20
|
+
import { getSkeletonContent, isSkeletonWorthIncluding } from './code-shaper.js';
|
|
20
21
|
// ============================================================================
|
|
21
22
|
// CONSTANTS
|
|
22
23
|
// ============================================================================
|
|
23
24
|
const DB_FOLDER = 'vector-index';
|
|
24
25
|
const TABLE_NAME = 'functions';
|
|
26
|
+
/**
 * Project a raw table row onto the record fields returned to callers.
 * Only the twelve listed fields are copied, so any other property on the
 * row (such as the vector) is left out.
 *
 * @param {object} row - Raw row object read from the index table.
 * @returns {object} A new plain object holding the copied fields.
 */
function rowToRecord(row) {
    const {
        id,
        name,
        filePath,
        className,
        language,
        signature,
        docstring,
        fanIn,
        fanOut,
        isHub,
        isEntryPoint,
        text,
    } = row;
    return {
        id,
        name,
        filePath,
        className,
        language,
        signature,
        docstring,
        fanIn,
        fanOut,
        isHub,
        isEntryPoint,
        text,
    };
}
|
|
43
|
+
/**
 * Split text into lowercase ASCII alphanumeric tokens.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Tokens in order of appearance; tokens of length 0 or 1 are dropped.
 */
function tokenize(text) {
    const lowered = text.toLowerCase();
    // Any run of characters outside a-z / 0-9 acts as a separator.
    const pieces = lowered.split(/[^a-z0-9]+/);
    const kept = [];
    for (const piece of pieces) {
        if (piece.length > 1) {
            kept.push(piece);
        }
    }
    return kept;
}
|
|
47
|
+
/**
 * Build the statistics needed for BM25 scoring from a set of records.
 *
 * @param {Iterable<{id: *, text: string}>} records - Records whose `text` is tokenized.
 * @returns {{docs: Array<{id: *, tfMap: Map<string, number>, length: number}>,
 *            df: Map<string, number>, avgLength: number, N: number}}
 *   `docs` holds per-record term frequencies and token counts, `df` maps each
 *   token to the number of records containing it, `N` is the record count and
 *   `avgLength` is the mean token count (1 when there are no records).
 */
function buildBm25Corpus(records) {
    const df = new Map();
    const docs = [];
    let tokenTotal = 0;
    for (const record of records) {
        const tokens = tokenize(record.text);
        // Per-document term frequencies.
        const tfMap = tokens.reduce(
            (counts, token) => counts.set(token, (counts.get(token) ?? 0) + 1),
            new Map(),
        );
        docs.push({ id: record.id, tfMap, length: tokens.length });
        tokenTotal += tokens.length;
        // Each distinct token counts once per document towards document frequency.
        tfMap.forEach((_count, token) => {
            df.set(token, (df.get(token) ?? 0) + 1);
        });
    }
    const N = docs.length;
    const avgLength = N > 0 ? tokenTotal / N : 1;
    return { docs, df, avgLength, N };
}
|
|
63
|
+
// BM25 tuning constants used by bm25Score: K1 scales the term-frequency
// component and B weights the document-length ratio in the denominator.
const BM25_K1 = 1.2;
const BM25_B = 0.75;
/**
 * Compute the BM25 score of one corpus document for a tokenized query.
 *
 * @param {{docs: Array<{tfMap: Map<string, number>, length: number}>,
 *          df: Map<string, number>, avgLength: number, N: number}} corpus
 *   Corpus statistics (document list, document frequencies, mean length, size).
 * @param {Iterable<string>} queryTokens - Query tokens; each occurrence contributes.
 * @param {number} docIdx - Index into `corpus.docs` of the document to score.
 * @returns {number} Sum over query tokens of idf × normalized term frequency;
 *   tokens absent from the corpus are skipped.
 */
function bm25Score(corpus, queryTokens, docIdx) {
    const doc = corpus.docs[docIdx];
    let total = 0;
    for (const term of queryTokens) {
        const docFreq = corpus.df.get(term) ?? 0;
        if (docFreq !== 0) {
            const idf = Math.log((corpus.N - docFreq + 0.5) / (docFreq + 0.5) + 1);
            const termFreq = doc.tfMap.get(term) ?? 0;
            const numerator = termFreq * (BM25_K1 + 1);
            const denominator = termFreq + BM25_K1 * (1 - BM25_B + BM25_B * (doc.length / corpus.avgLength));
            total += idf * (numerator / denominator);
        }
    }
    return total;
}
|
|
80
|
+
/**
 * Reciprocal Rank Fusion score for an item ranked in two lists.
 * Each list contributes 1 / (k + rank + 1); the two contributions are summed.
 * k=60 is the standard parameter (Cormack et al., 2009).
 *
 * @param {number} rankDense - Rank of the item in the dense list.
 * @param {number} rankSparse - Rank of the item in the sparse list.
 * @param {number} [k=60] - Smoothing constant added to both ranks.
 * @returns {number} Combined score; higher means more relevant.
 */
function rrfScore(rankDense, rankSparse, k = 60) {
    const denseTerm = 1 / (k + rankDense + 1);
    const sparseTerm = 1 / (k + rankSparse + 1);
    return denseTerm + sparseTerm;
}
|
|
87
|
+
// Module-level BM25 corpus cache: avoids a full table scan on every search call
// when the index hasn't changed. Keyed by dbPath.
// NOTE(review): entries are described as being invalidated when the row count
// changes; the invalidation code is not visible here — confirm at the use site.
const _bm25Cache = new Map();
|
|
25
90
|
// ============================================================================
|
|
26
91
|
// HELPERS
|
|
27
92
|
// ============================================================================
|
|
28
93
|
/**
|
|
29
94
|
* Build the text to embed for a function.
|
|
30
|
-
* Combines language, path, qualified name, signature, and
|
|
95
|
+
* Combines language, path, qualified name, signature, docstring, and skeleton body.
|
|
31
96
|
*/
|
|
32
|
-
function buildText(node, signature, docstring) {
|
|
97
|
+
function buildText(node, signature, docstring, fileContents) {
|
|
33
98
|
const qualifiedName = node.className
|
|
34
99
|
? `${node.className}.${node.name}`
|
|
35
100
|
: node.name;
|
|
@@ -38,6 +103,22 @@ function buildText(node, signature, docstring) {
|
|
|
38
103
|
parts.push(signature);
|
|
39
104
|
if (docstring)
|
|
40
105
|
parts.push(docstring);
|
|
106
|
+
// Append skeleton body when file contents are available.
|
|
107
|
+
// The skeleton strips noise (logs, comments) while preserving business-logic signals
|
|
108
|
+
// (variable names, control flow, calls, return/throw). Only included when it provides
|
|
109
|
+
// meaningful reduction over the raw body (≥20% smaller).
|
|
110
|
+
if (fileContents && node.startIndex < node.endIndex) {
|
|
111
|
+
const src = fileContents.get(node.filePath);
|
|
112
|
+
if (src) {
|
|
113
|
+
const body = src.slice(node.startIndex, node.endIndex);
|
|
114
|
+
if (body.trim()) {
|
|
115
|
+
const skeleton = getSkeletonContent(body, node.language);
|
|
116
|
+
if (isSkeletonWorthIncluding(body, skeleton)) {
|
|
117
|
+
parts.push(skeleton);
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
}
|
|
41
122
|
return parts.join('\n');
|
|
42
123
|
}
|
|
43
124
|
/**
|
|
@@ -69,18 +150,36 @@ function findSignatureEntry(node, sigIndex) {
|
|
|
69
150
|
export class VectorIndex {
|
|
70
151
|
/**
|
|
71
152
|
* Build (or rebuild) the vector index from call graph nodes + signatures.
|
|
72
|
-
*
|
|
153
|
+
*
|
|
154
|
+
* When `incremental` is true and an existing index is found, only functions
|
|
155
|
+
* whose text has changed since the last build are re-embedded. Unchanged
|
|
156
|
+
* functions reuse their cached vectors. Pass `incremental: false` (or omit
|
|
157
|
+
* when no index exists) to do a full rebuild.
|
|
158
|
+
*
|
|
159
|
+
* Returns a summary of how many functions were embedded vs reused.
|
|
73
160
|
*/
|
|
74
|
-
static async build(outputDir, nodes, signatures, hubIds, entryPointIds, embedSvc
|
|
161
|
+
static async build(outputDir, nodes, signatures, hubIds, entryPointIds, embedSvc,
|
|
162
|
+
/** Optional map of filePath → source content for skeleton-based body indexing */
|
|
163
|
+
fileContents,
|
|
164
|
+
/** When true, reuse cached vectors for unchanged functions */
|
|
165
|
+
incremental = false) {
|
|
75
166
|
const { connect } = await import('@lancedb/lancedb');
|
|
76
167
|
if (nodes.length === 0) {
|
|
77
168
|
throw new Error('No functions to index');
|
|
78
169
|
}
|
|
79
170
|
const sigIndex = buildSignatureIndex(signatures);
|
|
80
|
-
// Build records
|
|
171
|
+
// Build candidate records (without vectors)
|
|
81
172
|
const nodeIds = new Set(nodes.map(n => n.id));
|
|
82
|
-
const
|
|
83
|
-
const
|
|
173
|
+
const candidates = nodes.map(node => {
|
|
174
|
+
const cgDoc = node.docstring ?? '';
|
|
175
|
+
const cgSig = node.signature ?? '';
|
|
176
|
+
// Always check regex index as fallback — CG may miss docstrings when
|
|
177
|
+
// startIndex points inside an export_statement (past the `export` keyword),
|
|
178
|
+
// causing extractDocstringBefore to scan into the export keyword instead of
|
|
179
|
+
// reaching the JSDoc block above it.
|
|
180
|
+
const { signature: regexSig, docstring: regexDoc } = findSignatureEntry(node, sigIndex);
|
|
181
|
+
const signature = cgSig || regexSig;
|
|
182
|
+
const docstring = cgDoc || regexDoc;
|
|
84
183
|
return {
|
|
85
184
|
id: node.id,
|
|
86
185
|
name: node.name,
|
|
@@ -93,7 +192,7 @@ export class VectorIndex {
|
|
|
93
192
|
fanOut: node.fanOut,
|
|
94
193
|
isHub: hubIds.has(node.id),
|
|
95
194
|
isEntryPoint: entryPointIds.has(node.id),
|
|
96
|
-
text: buildText(node, signature, docstring),
|
|
195
|
+
text: buildText(node, signature, docstring, fileContents),
|
|
97
196
|
};
|
|
98
197
|
});
|
|
99
198
|
// Also index signature entries that have no call graph node (constants, type aliases, etc.)
|
|
@@ -107,7 +206,7 @@ export class VectorIndex {
|
|
|
107
206
|
continue;
|
|
108
207
|
const sig = entry.signature ?? '';
|
|
109
208
|
const doc = entry.docstring ?? '';
|
|
110
|
-
|
|
209
|
+
candidates.push({
|
|
111
210
|
id: syntheticId,
|
|
112
211
|
name: entry.name,
|
|
113
212
|
filePath: fsm.path,
|
|
@@ -123,69 +222,224 @@ export class VectorIndex {
|
|
|
123
222
|
});
|
|
124
223
|
}
|
|
125
224
|
}
|
|
126
|
-
//
|
|
127
|
-
const texts = records.map(r => r.text);
|
|
128
|
-
const vectors = await embedSvc.embed(texts);
|
|
129
|
-
if (vectors.length !== records.length) {
|
|
130
|
-
throw new Error(`Embedding count mismatch: expected ${records.length}, got ${vectors.length}`);
|
|
131
|
-
}
|
|
132
|
-
// Assemble final records with vectors
|
|
133
|
-
const fullRecords = records.map((r, i) => ({
|
|
134
|
-
...r,
|
|
135
|
-
vector: vectors[i],
|
|
136
|
-
}));
|
|
137
|
-
// Connect to LanceDB and write table
|
|
225
|
+
// ── Incremental cache lookup ─────────────────────────────────────────────
|
|
138
226
|
const dbPath = join(outputDir, DB_FOLDER);
|
|
227
|
+
let cachedVectors = new Map(); // id → vector
|
|
228
|
+
if (incremental && VectorIndex.exists(outputDir)) {
|
|
229
|
+
try {
|
|
230
|
+
const db = await connect(dbPath);
|
|
231
|
+
const table = await db.openTable(TABLE_NAME);
|
|
232
|
+
// Full table scan to load existing vectors
|
|
233
|
+
const existing = await table.query().toArray();
|
|
234
|
+
for (const row of existing) {
|
|
235
|
+
const id = row.id;
|
|
236
|
+
const text = row.text;
|
|
237
|
+
// Convert Arrow typed arrays (Float32Array etc.) to plain number[]
|
|
238
|
+
// so LanceDB can re-infer the schema when writing back
|
|
239
|
+
const vector = Array.from(row.vector);
|
|
240
|
+
// Cache the vector keyed by "id::text" so a text change invalidates it
|
|
241
|
+
cachedVectors.set(`${id}::${text}`, vector);
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
catch {
|
|
245
|
+
// Existing index unreadable — fall back to full build
|
|
246
|
+
cachedVectors = new Map();
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
// ── Split into cached vs needs-embedding ────────────────────────────────
|
|
250
|
+
const toEmbed = [];
|
|
251
|
+
const toEmbedIdx = []; // index into `candidates`
|
|
252
|
+
const cachedIdx = [];
|
|
253
|
+
for (let i = 0; i < candidates.length; i++) {
|
|
254
|
+
const r = candidates[i];
|
|
255
|
+
const cacheKey = `${r.id}::${r.text}`;
|
|
256
|
+
if (cachedVectors.has(cacheKey)) {
|
|
257
|
+
cachedIdx.push(i);
|
|
258
|
+
}
|
|
259
|
+
else {
|
|
260
|
+
toEmbed.push(r);
|
|
261
|
+
toEmbedIdx.push(i);
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
// ── Embed only changed / new functions ───────────────────────────────────
|
|
265
|
+
let newVectors = [];
|
|
266
|
+
if (toEmbed.length > 0) {
|
|
267
|
+
newVectors = await embedSvc.embed(toEmbed.map(r => r.text));
|
|
268
|
+
if (newVectors.length !== toEmbed.length) {
|
|
269
|
+
throw new Error(`Embedding count mismatch: expected ${toEmbed.length}, got ${newVectors.length}`);
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
// ── Assemble final records ───────────────────────────────────────────────
|
|
273
|
+
const fullRecords = new Array(candidates.length);
|
|
274
|
+
for (let i = 0; i < cachedIdx.length; i++) {
|
|
275
|
+
const idx = cachedIdx[i];
|
|
276
|
+
const r = candidates[idx];
|
|
277
|
+
fullRecords[idx] = { ...r, vector: cachedVectors.get(`${r.id}::${r.text}`) };
|
|
278
|
+
}
|
|
279
|
+
for (let i = 0; i < toEmbedIdx.length; i++) {
|
|
280
|
+
const idx = toEmbedIdx[i];
|
|
281
|
+
fullRecords[idx] = { ...candidates[idx], vector: newVectors[i] };
|
|
282
|
+
}
|
|
283
|
+
// ── Write table ──────────────────────────────────────────────────────────
|
|
139
284
|
const db = await connect(dbPath);
|
|
140
285
|
await db.createTable(TABLE_NAME, fullRecords, { mode: 'overwrite' });
|
|
286
|
+
return { embedded: toEmbed.length, reused: cachedIdx.length };
|
|
141
287
|
}
|
|
142
288
|
/**
|
|
143
|
-
*
|
|
144
|
-
*
|
|
289
|
+
* Hybrid search over the index: dense (ANN) + sparse (BM25) merged via RRF.
|
|
290
|
+
*
|
|
291
|
+
* Dense recall fetches top `limit*5` candidates from the vector index.
|
|
292
|
+
* Sparse recall scores the full corpus with BM25 (cached per session).
|
|
293
|
+
* Reciprocal Rank Fusion (RRF) combines both rankings into a single list.
|
|
294
|
+
*
|
|
295
|
+
* Set `hybrid: false` to use dense-only search (original behaviour).
|
|
296
|
+
* Returns up to `limit` results sorted by relevance (highest first).
|
|
145
297
|
*/
|
|
146
298
|
static async search(outputDir, query, embedSvc, opts = {}) {
|
|
147
299
|
const { connect } = await import('@lancedb/lancedb');
|
|
148
|
-
const { limit = 10, language, minFanIn } = opts;
|
|
149
|
-
// Embed the query
|
|
150
|
-
const [queryVector] = await embedSvc.embed([query]);
|
|
151
|
-
if (!queryVector) {
|
|
152
|
-
throw new Error('Failed to embed query');
|
|
153
|
-
}
|
|
154
|
-
const dbPath = join(outputDir, DB_FOLDER);
|
|
300
|
+
const { limit = 10, language, minFanIn, hybrid = true } = opts;
|
|
155
301
|
if (!VectorIndex.exists(outputDir)) {
|
|
156
302
|
throw new Error('Vector index not found. Run "spec-gen analyze --embed" first.');
|
|
157
303
|
}
|
|
304
|
+
const dbPath = join(outputDir, DB_FOLDER);
|
|
158
305
|
const db = await connect(dbPath);
|
|
159
306
|
const table = await db.openTable(TABLE_NAME);
|
|
160
|
-
//
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
//
|
|
165
|
-
|
|
307
|
+
// ── BM25-only path (no embedding service available) ───────────────────────
|
|
308
|
+
if (!embedSvc) {
|
|
309
|
+
return VectorIndex._bm25Only(table, dbPath, query, limit, language, minFanIn);
|
|
310
|
+
}
|
|
311
|
+
// ── Dense recall ──────────────────────────────────────────────────────────
|
|
312
|
+
let queryVector;
|
|
313
|
+
try {
|
|
314
|
+
[queryVector] = await embedSvc.embed([query]);
|
|
315
|
+
}
|
|
316
|
+
catch {
|
|
317
|
+
// Embedding server unreachable — fall back to BM25
|
|
318
|
+
return VectorIndex._bm25Only(table, dbPath, query, limit, language, minFanIn);
|
|
319
|
+
}
|
|
320
|
+
if (!queryVector)
|
|
321
|
+
throw new Error('Failed to embed query');
|
|
322
|
+
const denseFetch = hybrid ? Math.min(limit * 5, 500) : Math.min(limit * 10, 1000);
|
|
323
|
+
const denseRows = await table.query().nearestTo(queryVector).limit(denseFetch).toArray();
|
|
324
|
+
const passesFilters = (row) => {
|
|
325
|
+
if (language && row.language !== language)
|
|
326
|
+
return false;
|
|
327
|
+
if (minFanIn !== undefined && minFanIn > 0 && row.fanIn < minFanIn)
|
|
328
|
+
return false;
|
|
329
|
+
return true;
|
|
330
|
+
};
|
|
331
|
+
// ── Dense-only path ───────────────────────────────────────────────────────
|
|
332
|
+
if (!hybrid) {
|
|
333
|
+
return denseRows
|
|
334
|
+
.filter(passesFilters)
|
|
335
|
+
.slice(0, limit)
|
|
336
|
+
.map(row => ({ record: rowToRecord(row), score: row._distance }));
|
|
337
|
+
}
|
|
338
|
+
// ── Sparse recall (BM25 over full corpus) ─────────────────────────────────
|
|
339
|
+
let cachedEntry = _bm25Cache.get(dbPath);
|
|
340
|
+
let allRows;
|
|
341
|
+
if (!cachedEntry) {
|
|
342
|
+
allRows = await table.query().toArray();
|
|
343
|
+
const corpus = buildBm25Corpus(allRows.map(r => ({ id: r.id, text: r.text })));
|
|
344
|
+
cachedEntry = { corpus, rowCount: allRows.length };
|
|
345
|
+
_bm25Cache.set(dbPath, cachedEntry);
|
|
346
|
+
}
|
|
347
|
+
else {
|
|
348
|
+
// Lightweight cache validation: re-scan only if row count has changed
|
|
349
|
+
allRows = await table.query().toArray();
|
|
350
|
+
if (allRows.length !== cachedEntry.rowCount) {
|
|
351
|
+
const corpus = buildBm25Corpus(allRows.map(r => ({ id: r.id, text: r.text })));
|
|
352
|
+
cachedEntry = { corpus, rowCount: allRows.length };
|
|
353
|
+
_bm25Cache.set(dbPath, cachedEntry);
|
|
354
|
+
}
|
|
355
|
+
}
|
|
356
|
+
const { corpus } = cachedEntry;
|
|
357
|
+
const queryTokens = tokenize(query);
|
|
358
|
+
// Score all corpus documents with BM25
|
|
359
|
+
const sparseScored = corpus.docs
|
|
360
|
+
.map((_, i) => ({ idx: i, score: bm25Score(corpus, queryTokens, i) }))
|
|
361
|
+
.sort((a, b) => b.score - a.score)
|
|
362
|
+
.slice(0, limit * 5);
|
|
363
|
+
// Build id→row map from allRows for sparse candidates
|
|
364
|
+
const rowById = new Map(allRows.map(r => [r.id, r]));
|
|
365
|
+
// ── RRF merge ────────────────────────────────────────────────────────────
|
|
366
|
+
const rrfMap = new Map();
|
|
367
|
+
denseRows.forEach((row, rank) => {
|
|
368
|
+
const id = row.id;
|
|
369
|
+
const entry = rrfMap.get(id) ?? { row, score: 0 };
|
|
370
|
+
entry.score += rrfScore(rank, Infinity); // sparse rank = Infinity if not in sparse list
|
|
371
|
+
rrfMap.set(id, entry);
|
|
372
|
+
});
|
|
373
|
+
sparseScored.forEach(({ idx, score: bm25 }, rank) => {
|
|
374
|
+
if (bm25 === 0)
|
|
375
|
+
return; // no BM25 signal — skip
|
|
376
|
+
const id = corpus.docs[idx].id;
|
|
377
|
+
const row = rowById.get(id);
|
|
378
|
+
if (!row)
|
|
379
|
+
return;
|
|
380
|
+
const entry = rrfMap.get(id) ?? { row, score: 0 };
|
|
381
|
+
entry.score += 1 / (60 + rank + 1);
|
|
382
|
+
rrfMap.set(id, entry);
|
|
383
|
+
});
|
|
384
|
+
// Fix dense ranks now that we know the full picture
|
|
385
|
+
// Re-compute proper RRF scores with both ranks available
|
|
386
|
+
const denseRankById = new Map(denseRows.map((r, i) => [r.id, i]));
|
|
387
|
+
const sparseRankById = new Map(sparseScored.map(({ idx }, i) => [corpus.docs[idx].id, i]));
|
|
388
|
+
const merged = [...rrfMap.values()].map(({ row }) => {
|
|
389
|
+
const id = row.id;
|
|
390
|
+
const dr = denseRankById.get(id) ?? Infinity;
|
|
391
|
+
const sr = sparseRankById.get(id) ?? Infinity;
|
|
392
|
+
return { row, score: rrfScore(dr, sr) };
|
|
393
|
+
});
|
|
394
|
+
return merged
|
|
395
|
+
.sort((a, b) => b.score - a.score)
|
|
396
|
+
.filter(({ row }) => passesFilters(row))
|
|
397
|
+
.slice(0, limit)
|
|
398
|
+
.map(({ row, score }) => ({ record: rowToRecord(row), score }));
|
|
399
|
+
}
|
|
400
|
+
/**
|
|
401
|
+
* BM25-only search: used when no embedding service is available.
|
|
402
|
+
* Scores the full corpus with BM25 and returns the top `limit` results.
|
|
403
|
+
*/
|
|
404
|
+
static async _bm25Only(table, dbPath, query, limit, language, minFanIn) {
|
|
405
|
+
let cachedEntry = _bm25Cache.get(dbPath);
|
|
406
|
+
let allRows;
|
|
407
|
+
if (!cachedEntry) {
|
|
408
|
+
allRows = await table.query().toArray();
|
|
409
|
+
const corpus = buildBm25Corpus(allRows.map(r => ({ id: r.id, text: r.text })));
|
|
410
|
+
cachedEntry = { corpus, rowCount: allRows.length };
|
|
411
|
+
_bm25Cache.set(dbPath, cachedEntry);
|
|
412
|
+
}
|
|
413
|
+
else {
|
|
414
|
+
allRows = await table.query().toArray();
|
|
415
|
+
if (allRows.length !== cachedEntry.rowCount) {
|
|
416
|
+
const corpus = buildBm25Corpus(allRows.map(r => ({ id: r.id, text: r.text })));
|
|
417
|
+
cachedEntry = { corpus, rowCount: allRows.length };
|
|
418
|
+
_bm25Cache.set(dbPath, cachedEntry);
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
const { corpus } = cachedEntry;
|
|
422
|
+
const queryTokens = tokenize(query);
|
|
423
|
+
const rowById = new Map(allRows.map(r => [r.id, r]));
|
|
424
|
+
return corpus.docs
|
|
425
|
+
.map((_, i) => ({ idx: i, score: bm25Score(corpus, queryTokens, i) }))
|
|
426
|
+
.filter(({ score }) => score > 0)
|
|
427
|
+
.sort((a, b) => b.score - a.score)
|
|
428
|
+
.slice(0, limit * 3) // oversample before filtering
|
|
429
|
+
.map(({ idx, score }) => {
|
|
430
|
+
const row = rowById.get(corpus.docs[idx].id);
|
|
431
|
+
return row ? { row, score } : null;
|
|
432
|
+
})
|
|
433
|
+
.filter((x) => x !== null)
|
|
434
|
+
.filter(({ row }) => {
|
|
166
435
|
if (language && row.language !== language)
|
|
167
436
|
return false;
|
|
168
437
|
if (minFanIn !== undefined && minFanIn > 0 && row.fanIn < minFanIn)
|
|
169
438
|
return false;
|
|
170
439
|
return true;
|
|
171
|
-
})
|
|
172
|
-
|
|
173
|
-
record:
|
|
174
|
-
id: row.id,
|
|
175
|
-
name: row.name,
|
|
176
|
-
filePath: row.filePath,
|
|
177
|
-
className: row.className,
|
|
178
|
-
language: row.language,
|
|
179
|
-
signature: row.signature,
|
|
180
|
-
docstring: row.docstring,
|
|
181
|
-
fanIn: row.fanIn,
|
|
182
|
-
fanOut: row.fanOut,
|
|
183
|
-
isHub: row.isHub,
|
|
184
|
-
isEntryPoint: row.isEntryPoint,
|
|
185
|
-
text: row.text,
|
|
186
|
-
},
|
|
187
|
-
score: row._distance,
|
|
188
|
-
}));
|
|
440
|
+
})
|
|
441
|
+
.slice(0, limit)
|
|
442
|
+
.map(({ row, score }) => ({ record: rowToRecord(row), score }));
|
|
189
443
|
}
|
|
190
444
|
/**
|
|
191
445
|
* Returns true if a vector index has been built for this output directory.
|