@getmikk/core 2.0.14 → 2.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/package.json +2 -1
- package/src/analysis/type-flow.ts +1 -1
- package/src/cache/incremental-cache.ts +86 -80
- package/src/contract/contract-reader.ts +1 -0
- package/src/contract/lock-compiler.ts +95 -13
- package/src/contract/schema.ts +2 -0
- package/src/error-handler.ts +2 -1
- package/src/graph/cluster-detector.ts +2 -4
- package/src/graph/dead-code-detector.ts +303 -117
- package/src/graph/graph-builder.ts +21 -161
- package/src/graph/impact-analyzer.ts +1 -0
- package/src/graph/index.ts +2 -0
- package/src/graph/rich-function-index.ts +1080 -0
- package/src/graph/symbol-table.ts +252 -0
- package/src/hash/hash-store.ts +1 -0
- package/src/index.ts +2 -0
- package/src/parser/base-extractor.ts +19 -0
- package/src/parser/boundary-checker.ts +31 -12
- package/src/parser/error-recovery.ts +5 -4
- package/src/parser/function-body-extractor.ts +248 -0
- package/src/parser/go/go-extractor.ts +249 -676
- package/src/parser/index.ts +132 -318
- package/src/parser/language-registry.ts +57 -0
- package/src/parser/oxc-parser.ts +166 -28
- package/src/parser/oxc-resolver.ts +179 -11
- package/src/parser/parser-constants.ts +1 -0
- package/src/parser/rust/rust-extractor.ts +109 -0
- package/src/parser/tree-sitter/parser.ts +369 -62
- package/src/parser/tree-sitter/queries.ts +106 -10
- package/src/parser/types.ts +20 -1
- package/src/search/bm25.ts +21 -8
- package/src/search/direct-search.ts +472 -0
- package/src/search/embedding-provider.ts +249 -0
- package/src/search/index.ts +12 -0
- package/src/search/semantic-search.ts +435 -0
- package/src/utils/artifact-transaction.ts +1 -0
- package/src/utils/atomic-write.ts +1 -0
- package/src/utils/errors.ts +89 -4
- package/src/utils/fs.ts +104 -50
- package/src/utils/json.ts +1 -0
- package/src/utils/language-registry.ts +84 -6
- package/src/utils/path.ts +26 -0
- package/tests/dead-code.test.ts +3 -2
- package/tests/direct-search.test.ts +435 -0
- package/tests/error-recovery.test.ts +143 -0
- package/tests/fixtures/simple-api/src/index.ts +1 -1
- package/tests/go-parser.test.ts +19 -335
- package/tests/js-parser.test.ts +18 -1089
- package/tests/language-registry-all.test.ts +276 -0
- package/tests/language-registry.test.ts +6 -4
- package/tests/parse-diagnostics.test.ts +9 -96
- package/tests/parser.test.ts +42 -771
- package/tests/polyglot-parser.test.ts +117 -0
- package/tests/rich-function-index.test.ts +703 -0
- package/tests/tree-sitter-parser.test.ts +108 -80
- package/tests/ts-parser.test.ts +8 -8
- package/tests/verification.test.ts +175 -0
- package/src/parser/base-parser.ts +0 -16
- package/src/parser/go/go-parser.ts +0 -43
- package/src/parser/javascript/js-extractor.ts +0 -278
- package/src/parser/javascript/js-parser.ts +0 -101
- package/src/parser/typescript/ts-extractor.ts +0 -447
- package/src/parser/typescript/ts-parser.ts +0 -36
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
import * as path from 'node:path';
|
|
2
|
+
|
|
3
|
+
/**
 * Common interface for embedding providers.
 *
 * Implementations in this file: VocabularyEmbedder (always available,
 * TF-IDF over a fixed vocabulary), LocalONNXEmbedder (requires the
 * optional `@xenova/transformers` package) and GeminiEmbedder (requires
 * a Gemini API key).
 */
export interface EmbeddingProvider {
  /** Embed a single text; resolves to a vector of `getDimensions()` numbers. */
  embed(text: string): Promise<number[]>;
  /** Embed many texts; result is parallel to the input array. */
  embedBatch(texts: string[]): Promise<number[][]>;
  /** Length of the vectors produced by embed()/embedBatch(). */
  getDimensions(): number;
  /** Whether this provider can actually run in the current environment. */
  isAvailable(): Promise<boolean>;
}
|
|
12
|
+
|
|
13
|
+
// Programming-focused vocabulary used by VocabularyEmbedder: word i maps to
// vector dimension i (up to the embedder's dimension count).
// NOTE(review): the list contains duplicates ('error', 'session', 'create',
// 'endpoint'); VocabularyEmbedder's Map keeps only the LAST index for a
// duplicate word, so the earlier slot is never populated — confirm whether
// deduplication is intended (it would change existing vector layouts).
const VOCABULARY = [
  'function', 'class', 'method', 'async', 'await', 'return', 'const', 'let', 'var',
  'import', 'export', 'from', 'type', 'interface', 'extends', 'implements',
  'constructor', 'prototype', 'static', 'private', 'public', 'protected',
  'if', 'else', 'for', 'while', 'do', 'switch', 'case', 'break', 'continue',
  'try', 'catch', 'finally', 'throw', 'error', 'exception',
  'parse', 'format', 'validate', 'create', 'update', 'delete', 'remove',
  'get', 'set', 'find', 'search', 'filter', 'map', 'reduce', 'transform',
  'init', 'setup', 'config', 'options', 'settings', 'defaults',
  'data', 'object', 'array', 'string', 'number', 'boolean', 'null', 'undefined',
  'request', 'response', 'http', 'https', 'api', 'endpoint', 'route',
  'auth', 'token', 'jwt', 'session', 'cookie', 'header',
  'database', 'query', 'sql', 'transaction', 'connection', 'pool',
  'file', 'path', 'directory', 'read', 'write', 'stream', 'buffer',
  'event', 'listener', 'handler', 'callback', 'promise', 'observer',
  'log', 'debug', 'info', 'warn', 'error', 'trace',
  'test', 'mock', 'stub', 'assert', 'expect',
  'cache', 'store', 'memory', 'session', 'local',
  'user', 'account', 'profile', 'permission', 'role',
  'create', 'register', 'login', 'logout', 'verify',
  'send', 'receive', 'push', 'pull', 'fetch', 'upload', 'download',
  'process', 'worker', 'thread', 'task', 'job', 'queue',
  'client', 'server', 'service', 'endpoint', 'middleware',
];
|
|
37
|
+
|
|
38
|
+
function tokenize(text: string): string[] {
|
|
39
|
+
return text
|
|
40
|
+
.toLowerCase()
|
|
41
|
+
.replace(/[^a-z0-9\s]/g, ' ')
|
|
42
|
+
.split(/\s+/)
|
|
43
|
+
.filter(t => t.length > 1)
|
|
44
|
+
.filter(t => !['the', 'and', 'for', 'with', 'from', 'this', 'that', 'have', 'has'].includes(t));
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
function computeTF(tokens: string[]): Map<string, number> {
|
|
48
|
+
const tf = new Map<string, number>();
|
|
49
|
+
for (const token of tokens) {
|
|
50
|
+
tf.set(token, (tf.get(token) || 0) + 1);
|
|
51
|
+
}
|
|
52
|
+
const max = Math.max(...tf.values(), 1);
|
|
53
|
+
for (const [key, value] of tf) {
|
|
54
|
+
tf.set(key, value / max);
|
|
55
|
+
}
|
|
56
|
+
return tf;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
/**
|
|
60
|
+
* Fast vocabulary-based embeddings for when ML models aren't available.
|
|
61
|
+
* Uses TF-IDF with a programming-focused vocabulary.
|
|
62
|
+
*/
|
|
63
|
+
export class VocabularyEmbedder implements EmbeddingProvider {
|
|
64
|
+
readonly dimensions: number;
|
|
65
|
+
private vocabMap: Map<string, number>;
|
|
66
|
+
private defaultIDF: number;
|
|
67
|
+
|
|
68
|
+
constructor(vocab: string[] = VOCABULARY, dimensions = 128) {
|
|
69
|
+
this.dimensions = dimensions;
|
|
70
|
+
this.vocabMap = new Map();
|
|
71
|
+
|
|
72
|
+
for (let i = 0; i < Math.min(vocab.length, dimensions); i++) {
|
|
73
|
+
this.vocabMap.set(vocab[i], i);
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
this.defaultIDF = Math.log(vocab.length + 1) + 1;
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
async embed(text: string): Promise<number[]> {
|
|
80
|
+
const tokens = tokenize(text);
|
|
81
|
+
const tf = computeTF(tokens);
|
|
82
|
+
|
|
83
|
+
const vector = new Array(this.dimensions).fill(0);
|
|
84
|
+
const magnitude = Math.sqrt(tokens.length || 1);
|
|
85
|
+
|
|
86
|
+
for (const [token, tfScore] of tf) {
|
|
87
|
+
const idx = this.vocabMap.get(token);
|
|
88
|
+
if (idx !== undefined && idx < this.dimensions) {
|
|
89
|
+
vector[idx] = tfScore * this.defaultIDF / magnitude;
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
const norm = Math.sqrt(vector.reduce((sum, v) => sum + v * v, 0)) || 1;
|
|
94
|
+
return vector.map(v => v / norm);
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
async embedBatch(texts: string[]): Promise<number[][]> {
|
|
98
|
+
return Promise.all(texts.map(t => this.embed(t)));
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
getDimensions(): number {
|
|
102
|
+
return this.dimensions;
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
async isAvailable(): Promise<boolean> {
|
|
106
|
+
return true;
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
/**
|
|
111
|
+
* Local embeddings using ONNX runtime (via @xenova/transformers)
|
|
112
|
+
*/
|
|
113
|
+
export class LocalONNXEmbedder implements EmbeddingProvider {
|
|
114
|
+
private pipeline: unknown = null;
|
|
115
|
+
readonly dimensions = 384;
|
|
116
|
+
|
|
117
|
+
readonly MODEL_NAME = 'Xenova/all-MiniLM-L6-v2';
|
|
118
|
+
|
|
119
|
+
async isAvailable(): Promise<boolean> {
|
|
120
|
+
try {
|
|
121
|
+
await import('@xenova/transformers');
|
|
122
|
+
return true;
|
|
123
|
+
} catch {
|
|
124
|
+
return false;
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
private async ensurePipeline() {
|
|
129
|
+
if (this.pipeline) return;
|
|
130
|
+
const { pipeline } = await import('@xenova/transformers');
|
|
131
|
+
this.pipeline = await pipeline('feature-extraction', this.MODEL_NAME);
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
async embed(text: string): Promise<number[]> {
|
|
135
|
+
await this.ensurePipeline();
|
|
136
|
+
const p = this.pipeline as (texts: string[], options: unknown) => Promise<Array<{ data: Float32Array }>>;
|
|
137
|
+
const output = await p([text], { pooling: 'mean', normalize: true });
|
|
138
|
+
return Array.from(output[0].data);
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
async embedBatch(texts: string[]): Promise<number[][]> {
|
|
142
|
+
await this.ensurePipeline();
|
|
143
|
+
const p = this.pipeline as (texts: string[], options: unknown) => Promise<Array<{ data: Float32Array }>>;
|
|
144
|
+
const output = await p(texts, { pooling: 'mean', normalize: true });
|
|
145
|
+
return output.map(o => Array.from(o.data));
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
getDimensions(): number {
|
|
149
|
+
return this.dimensions;
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
const GEMINI_MODEL_NAME = 'gemini-embedding-001';
const GEMINI_DIMENSIONS = 3072;

/**
 * Gemini-backed embedding provider.
 *
 * Constructed eagerly: SDK/require failures propagate out of the
 * constructor (createEmbeddingProvider in this file relies on that by
 * wrapping `new GeminiEmbedder` in try/catch and falling back).
 */
export class GeminiEmbedder implements EmbeddingProvider {
  // SDK model handle; untyped because the SDK is an optional dependency.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private model: any;
  readonly dimensions = GEMINI_DIMENSIONS;

  constructor(apiKey: string) {
    this.initialize(apiKey);
  }

  // NOTE(review): uses CommonJS `require` while the rest of this file uses
  // ESM dynamic `import()` — confirm this module is not executed as a pure
  // ES module, where `require` would be undefined at runtime.
  private initialize(apiKey: string) {
    const { GoogleGenerativeAI } = require('@google/generative-ai');
    const genAI = new GoogleGenerativeAI(apiKey);
    this.model = genAI.getGenerativeModel({ model: GEMINI_MODEL_NAME });
  }

  /** Embed a single text via the model's embedContent endpoint. */
  async embed(text: string): Promise<number[]> {
    const result = await this.model.embedContent(text);
    return result.embedding.values;
  }

  /** Embed many texts in a single batchEmbedContents request. */
  async embedBatch(texts: string[]): Promise<number[][]> {
    const result = await this.model.batchEmbedContents({
      requests: texts.map((t) => ({ content: { role: 'user', parts: [{ text: t }] } })),
    });
    return result.embeddings.map((e: { values: number[] }) => e.values);
  }

  getDimensions(): number {
    return this.dimensions;
  }

  // NOTE(review): availability is judged by the GEMINI_API_KEY env var, not
  // by the key actually passed to the constructor — verify that callers
  // never construct this with a key from another source.
  async isAvailable(): Promise<boolean> {
    return !!process.env.GEMINI_API_KEY;
  }
}
|
|
194
|
+
|
|
195
|
+
let cachedProvider: EmbeddingProvider | null = null;
|
|
196
|
+
let providerInitPromise: Promise<EmbeddingProvider> | null = null;
|
|
197
|
+
|
|
198
|
+
/**
|
|
199
|
+
* Factory to create the best available provider.
|
|
200
|
+
* Caches the provider for subsequent calls.
|
|
201
|
+
*/
|
|
202
|
+
export async function createEmbeddingProvider(): Promise<EmbeddingProvider> {
|
|
203
|
+
if (cachedProvider) {
|
|
204
|
+
return cachedProvider;
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
if (providerInitPromise) {
|
|
208
|
+
return providerInitPromise;
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
providerInitPromise = (async () => {
|
|
212
|
+
const localONNX = new LocalONNXEmbedder();
|
|
213
|
+
|
|
214
|
+
if (await localONNX.isAvailable()) {
|
|
215
|
+
cachedProvider = localONNX;
|
|
216
|
+
return localONNX;
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
const apiKey = process.env.GEMINI_API_KEY;
|
|
220
|
+
if (apiKey) {
|
|
221
|
+
try {
|
|
222
|
+
cachedProvider = new GeminiEmbedder(apiKey);
|
|
223
|
+
return cachedProvider;
|
|
224
|
+
} catch {
|
|
225
|
+
// Fall through to vocabulary embedder
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
cachedProvider = new VocabularyEmbedder();
|
|
230
|
+
return cachedProvider;
|
|
231
|
+
})();
|
|
232
|
+
|
|
233
|
+
return providerInitPromise;
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
/**
 * Get the cached provider synchronously (may be null if not yet initialized).
 *
 * Returns null until a createEmbeddingProvider() call has resolved, or
 * after clearProviderCache() has been called.
 */
export function getCachedProvider(): EmbeddingProvider | null {
  return cachedProvider;
}
|
|
242
|
+
|
|
243
|
+
/**
 * Clear the cached provider.
 *
 * Resets both the resolved provider and any in-flight initialization, so
 * the next createEmbeddingProvider() call re-runs provider selection
 * (useful in tests or after environment changes).
 */
export function clearProviderCache(): void {
  cachedProvider = null;
  providerInitPromise = null;
}
|
package/src/search/index.ts
CHANGED
|
@@ -1,3 +1,15 @@
|
|
|
1
1
|
// @getmikk/core search module

// Lexical (BM25) search.
export { BM25Index, reciprocalRankFusion, tokenize, buildFunctionTokens } from './bm25.js'
export type { BM25Result } from './bm25.js'

// Direct (structural) search over the function index.
export { DirectSearchEngine, createDirectSearch, extractSignatures, extractNames, extractSignaturesMap, summarizeFunction, formatFunctionList } from './direct-search.js'
export type { DirectQuery, DirectContext } from './direct-search.js'

// Embedding providers (local ONNX, Gemini, vocabulary fallback).
// NOTE(review): SemanticCodeSearch / HybridSearchEngine from
// ./semantic-search.js are not re-exported here — confirm intentional.
export {
  VocabularyEmbedder,
  LocalONNXEmbedder,
  GeminiEmbedder,
  createEmbeddingProvider,
  getCachedProvider,
  clearProviderCache
} from './embedding-provider.js'
export type { EmbeddingProvider } from './embedding-provider.js'
|
|
@@ -0,0 +1,435 @@
|
|
|
1
|
+
/**
 * Semantic Code Search — code embeddings for semantic similarity search
 * Provides natural language code search and code-to-code similarity
 */

import type { MikkLock, MikkLockFunction } from '../contract/schema.js'

// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

/** A stored embedding for one indexed function. */
export interface CodeEmbedding {
  id: string
  vector: number[]
  // Display/filter metadata copied from the lock entry at index time.
  metadata: {
    name: string
    file: string
    moduleId: string
    purpose?: string
    params?: string
    returnType?: string
  }
}

/** One ranked hit returned by search()/findSimilarCode(). */
export interface SemanticSearchResult {
  functionId: string
  name: string
  file: string
  moduleId: string
  // Cosine similarity (or hybrid-weighted blend) — higher is better.
  score: number
  purpose?: string
  snippet?: string
}

export interface SemanticSearchOptions {
  // Maximum number of results (defaults vary per method).
  limit?: number
  // Results scoring below this are dropped.
  minScore?: number
  // Exact moduleId to restrict results to.
  filterModule?: string
  // Substring match against the function's file path.
  filterFile?: string
}

/** In-memory embedding index, built lazily by buildIndex(). */
interface EmbeddingIndex {
  functions: Map<string, CodeEmbedding>
  dimensions: number
  indexedAt: number
}
|
|
47
|
+
|
|
48
|
+
// ---------------------------------------------------------------------------
|
|
49
|
+
// Simple embedding model using TF-IDF-like approach
|
|
50
|
+
// For production, would use transformer-based embeddings
|
|
51
|
+
// ---------------------------------------------------------------------------
|
|
52
|
+
|
|
53
|
+
export class SemanticCodeSearch {
|
|
54
|
+
private lock: MikkLock
|
|
55
|
+
private index: EmbeddingIndex | null = null
|
|
56
|
+
private readonly DIMENSIONS = 128
|
|
57
|
+
|
|
58
|
+
constructor(lock: MikkLock) {
|
|
59
|
+
this.lock = lock
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* Build semantic index from lock file
|
|
64
|
+
*/
|
|
65
|
+
async buildIndex(): Promise<void> {
|
|
66
|
+
const functions = Object.values(this.lock.functions)
|
|
67
|
+
const embeddings = new Map<string, CodeEmbedding>()
|
|
68
|
+
|
|
69
|
+
for (const fn of functions) {
|
|
70
|
+
const vector = this.computeEmbedding(fn)
|
|
71
|
+
embeddings.set(fn.id, {
|
|
72
|
+
id: fn.id,
|
|
73
|
+
vector,
|
|
74
|
+
metadata: {
|
|
75
|
+
name: fn.name,
|
|
76
|
+
file: fn.file,
|
|
77
|
+
moduleId: fn.moduleId,
|
|
78
|
+
purpose: fn.purpose,
|
|
79
|
+
params: fn.params?.map(p => p.name).join(', '),
|
|
80
|
+
returnType: fn.returnType,
|
|
81
|
+
},
|
|
82
|
+
})
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
this.index = {
|
|
86
|
+
functions: embeddings,
|
|
87
|
+
dimensions: this.DIMENSIONS,
|
|
88
|
+
indexedAt: Date.now(),
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
/**
|
|
93
|
+
* Search code using natural language query
|
|
94
|
+
*/
|
|
95
|
+
async search(query: string, options: SemanticSearchOptions = {}): Promise<SemanticSearchResult[]> {
|
|
96
|
+
if (!this.index) {
|
|
97
|
+
await this.buildIndex()
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
const queryVector = this.computeQueryEmbedding(query)
|
|
101
|
+
const results: Array<{ fn: MikkLockFunction; score: number }> = []
|
|
102
|
+
|
|
103
|
+
const functions = Object.values(this.lock.functions)
|
|
104
|
+
for (const fn of functions) {
|
|
105
|
+
const embedding = this.index!.functions.get(fn.id)
|
|
106
|
+
if (!embedding) continue
|
|
107
|
+
|
|
108
|
+
// Filter by module if specified
|
|
109
|
+
if (options.filterModule && fn.moduleId !== options.filterModule) continue
|
|
110
|
+
|
|
111
|
+
// Filter by file if specified
|
|
112
|
+
if (options.filterFile && !fn.file.includes(options.filterFile)) continue
|
|
113
|
+
|
|
114
|
+
const score = this.cosineSimilarity(queryVector, embedding.vector)
|
|
115
|
+
|
|
116
|
+
if (score >= (options.minScore ?? 0)) {
|
|
117
|
+
results.push({ fn, score })
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
// Sort by score descending
|
|
122
|
+
results.sort((a, b) => b.score - a.score)
|
|
123
|
+
|
|
124
|
+
const limit = options.limit ?? 20
|
|
125
|
+
return results.slice(0, limit).map(({ fn, score }) => ({
|
|
126
|
+
functionId: fn.id,
|
|
127
|
+
name: fn.name,
|
|
128
|
+
file: fn.file,
|
|
129
|
+
moduleId: fn.moduleId,
|
|
130
|
+
score,
|
|
131
|
+
purpose: fn.purpose,
|
|
132
|
+
}))
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* Find similar code to given code snippet
|
|
137
|
+
*/
|
|
138
|
+
async findSimilarCode(code: string, options: SemanticSearchOptions = {}): Promise<SemanticSearchResult[]> {
|
|
139
|
+
if (!this.index) {
|
|
140
|
+
await this.buildIndex()
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
const codeVector = this.computeCodeEmbedding(code)
|
|
144
|
+
const results: Array<{ fn: MikkLockFunction; score: number }> = []
|
|
145
|
+
|
|
146
|
+
const functions = Object.values(this.lock.functions)
|
|
147
|
+
for (const fn of functions) {
|
|
148
|
+
const embedding = this.index!.functions.get(fn.id)
|
|
149
|
+
if (!embedding) continue
|
|
150
|
+
|
|
151
|
+
const score = this.cosineSimilarity(codeVector, embedding.vector)
|
|
152
|
+
|
|
153
|
+
if (score >= (options.minScore ?? 0.3)) {
|
|
154
|
+
results.push({ fn, score })
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
results.sort((a, b) => b.score - a.score)
|
|
159
|
+
|
|
160
|
+
const limit = options.limit ?? 10
|
|
161
|
+
return results.slice(0, limit).map(({ fn, score }) => ({
|
|
162
|
+
functionId: fn.id,
|
|
163
|
+
name: fn.name,
|
|
164
|
+
file: fn.file,
|
|
165
|
+
moduleId: fn.moduleId,
|
|
166
|
+
score,
|
|
167
|
+
purpose: fn.purpose,
|
|
168
|
+
}))
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
/**
|
|
172
|
+
* Compute embedding for a function using keyword + structural features
|
|
173
|
+
*/
|
|
174
|
+
private computeEmbedding(fn: MikkLockFunction): number[] {
|
|
175
|
+
const vector = new Array(this.DIMENSIONS).fill(0)
|
|
176
|
+
|
|
177
|
+
// Feature 1: Function name tokens (first 32 dims)
|
|
178
|
+
const nameTokens = this.tokenize(fn.name)
|
|
179
|
+
for (let i = 0; i < Math.min(nameTokens.length, 32); i++) {
|
|
180
|
+
vector[i] = this.hashToken(nameTokens[i], i)
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
// Feature 2: Purpose keywords (next 32 dims)
|
|
184
|
+
if (fn.purpose) {
|
|
185
|
+
const purposeTokens = this.tokenize(fn.purpose)
|
|
186
|
+
for (let i = 0; i < Math.min(purposeTokens.length, 32); i++) {
|
|
187
|
+
vector[32 + i] = this.hashToken(purposeTokens[i], 32 + i)
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
// Feature 3: Module context (next 32 dims)
|
|
192
|
+
if (fn.moduleId) {
|
|
193
|
+
const moduleTokens = this.tokenize(fn.moduleId)
|
|
194
|
+
for (let i = 0; i < Math.min(moduleTokens.length, 32); i++) {
|
|
195
|
+
vector[64 + i] = this.hashToken(moduleTokens[i], 64 + i)
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
// Feature 4: Structural features (last 32 dims)
|
|
200
|
+
vector[96] = fn.isAsync ? 1 : 0
|
|
201
|
+
vector[97] = fn.isExported ? 1 : 0
|
|
202
|
+
vector[98] = fn.params?.length ?? 0
|
|
203
|
+
vector[99] = (fn.endLine - fn.startLine) / 100 // normalized function size
|
|
204
|
+
vector[100] = fn.calls?.length ?? 0 // number of calls
|
|
205
|
+
vector[101] = fn.calledBy?.length ?? 0 // number of callers
|
|
206
|
+
|
|
207
|
+
// Hash additional features
|
|
208
|
+
if (fn.returnType) {
|
|
209
|
+
vector[102] = this.hashToken(fn.returnType, 102) % 1
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
// Normalize vector
|
|
213
|
+
return this.normalizeVector(vector)
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
/**
|
|
217
|
+
* Compute query embedding
|
|
218
|
+
*/
|
|
219
|
+
private computeQueryEmbedding(query: string): number[] {
|
|
220
|
+
const vector = new Array(this.DIMENSIONS).fill(0)
|
|
221
|
+
const tokens = this.tokenize(query)
|
|
222
|
+
|
|
223
|
+
// Weight recent tokens more heavily
|
|
224
|
+
for (let i = 0; i < tokens.length; i++) {
|
|
225
|
+
const weight = 1 - (i / tokens.length) * 0.5 // decreasing weight
|
|
226
|
+
const hash = this.hashToken(tokens[i], i % this.DIMENSIONS)
|
|
227
|
+
vector[i % this.DIMENSIONS] += hash * weight
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
return this.normalizeVector(vector)
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
/**
|
|
234
|
+
* Compute embedding for arbitrary code snippet
|
|
235
|
+
*/
|
|
236
|
+
private computeCodeEmbedding(code: string): number[] {
|
|
237
|
+
const vector = new Array(this.DIMENSIONS).fill(0)
|
|
238
|
+
const tokens = this.tokenize(code)
|
|
239
|
+
|
|
240
|
+
for (let i = 0; i < Math.min(tokens.length, this.DIMENSIONS); i++) {
|
|
241
|
+
vector[i] = this.hashToken(tokens[i], i)
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
return this.normalizeVector(vector)
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
/**
|
|
248
|
+
* Tokenize text into words
|
|
249
|
+
*/
|
|
250
|
+
private tokenize(text: string): string[] {
|
|
251
|
+
return text
|
|
252
|
+
.toLowerCase()
|
|
253
|
+
.replace(/([a-z])([A-Z])/g, '$1 $2')
|
|
254
|
+
.split(/[\s_./\\{}()[]"']+/)
|
|
255
|
+
.filter(Boolean)
|
|
256
|
+
.filter(w => w.length > 1)
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
/**
|
|
260
|
+
* Hash token to 0-1 range for embedding
|
|
261
|
+
*/
|
|
262
|
+
private hashToken(token: string, seed: number): number {
|
|
263
|
+
let hash = 0
|
|
264
|
+
for (let i = 0; i < token.length; i++) {
|
|
265
|
+
hash = ((hash << 5) - hash + token.charCodeAt(i) + seed) >>> 0
|
|
266
|
+
}
|
|
267
|
+
return (hash % 1000) / 1000
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
/**
|
|
271
|
+
* Normalize vector to unit length
|
|
272
|
+
*/
|
|
273
|
+
private normalizeVector(vector: number[]): number[] {
|
|
274
|
+
const magnitude = Math.sqrt(vector.reduce((sum, v) => sum + v * v, 0))
|
|
275
|
+
if (magnitude === 0) return vector
|
|
276
|
+
return vector.map(v => v / magnitude)
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
/**
|
|
280
|
+
* Compute cosine similarity between two vectors
|
|
281
|
+
*/
|
|
282
|
+
private cosineSimilarity(a: number[], b: number[]): number {
|
|
283
|
+
let dotProduct = 0
|
|
284
|
+
let normA = 0
|
|
285
|
+
let normB = 0
|
|
286
|
+
|
|
287
|
+
for (let i = 0; i < a.length; i++) {
|
|
288
|
+
dotProduct += a[i] * b[i]
|
|
289
|
+
normA += a[i] * a[i]
|
|
290
|
+
normB += b[i] * b[i]
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
const denominator = Math.sqrt(normA) * Math.sqrt(normB)
|
|
294
|
+
if (denominator === 0) return 0
|
|
295
|
+
|
|
296
|
+
return dotProduct / denominator
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
/**
|
|
300
|
+
* Get index statistics
|
|
301
|
+
*/
|
|
302
|
+
getIndexStats(): { functionCount: number; dimensions: number; indexedAt: number } | null {
|
|
303
|
+
if (!this.index) return null
|
|
304
|
+
|
|
305
|
+
return {
|
|
306
|
+
functionCount: this.index.functions.size,
|
|
307
|
+
dimensions: this.index.dimensions,
|
|
308
|
+
indexedAt: this.index.indexedAt,
|
|
309
|
+
}
|
|
310
|
+
}
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
/**
|
|
314
|
+
* Hybrid search combining BM25 and semantic search
|
|
315
|
+
*/
|
|
316
|
+
export class HybridSearchEngine {
|
|
317
|
+
private lock: MikkLock
|
|
318
|
+
private semanticSearch: SemanticCodeSearch
|
|
319
|
+
private readonly SEMANTIC_WEIGHT = 0.6
|
|
320
|
+
private readonly BM25_WEIGHT = 0.4
|
|
321
|
+
|
|
322
|
+
constructor(lock: MikkLock) {
|
|
323
|
+
this.lock = lock
|
|
324
|
+
this.semanticSearch = new SemanticCodeSearch(lock)
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
/**
|
|
328
|
+
* Search using both BM25 and semantic search with reranking
|
|
329
|
+
*/
|
|
330
|
+
async search(
|
|
331
|
+
query: string,
|
|
332
|
+
options: SemanticSearchOptions & { useHybrid?: boolean } = {}
|
|
333
|
+
): Promise<SemanticSearchResult[]> {
|
|
334
|
+
const { useHybrid = true, limit = 20, ...filterOptions } = options
|
|
335
|
+
|
|
336
|
+
if (!useHybrid) {
|
|
337
|
+
return this.semanticSearch.search(query, { ...filterOptions, limit })
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
// Run both searches in parallel
|
|
341
|
+
const [semanticResults, bm25Results] = await Promise.all([
|
|
342
|
+
this.semanticSearch.search(query, { ...filterOptions, limit: limit * 2 }),
|
|
343
|
+
this.bm25Search(query, { ...filterOptions, limit: limit * 2 }),
|
|
344
|
+
])
|
|
345
|
+
|
|
346
|
+
// Combine scores using weighted RRF
|
|
347
|
+
const combinedScores = new Map<string, { fn: MikkLockFunction; score: number }>()
|
|
348
|
+
|
|
349
|
+
// Add semantic scores
|
|
350
|
+
for (const result of semanticResults) {
|
|
351
|
+
combinedScores.set(result.functionId, {
|
|
352
|
+
fn: this.lock.functions[result.functionId],
|
|
353
|
+
score: result.score * this.SEMANTIC_WEIGHT,
|
|
354
|
+
})
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
// Add BM25 scores
|
|
358
|
+
for (const result of bm25Results) {
|
|
359
|
+
const existing = combinedScores.get(result.functionId)
|
|
360
|
+
const bm25Score = result.score * this.BM25_WEIGHT
|
|
361
|
+
|
|
362
|
+
if (existing) {
|
|
363
|
+
existing.score += bm25Score
|
|
364
|
+
} else {
|
|
365
|
+
combinedScores.set(result.functionId, {
|
|
366
|
+
fn: this.lock.functions[result.functionId],
|
|
367
|
+
score: bm25Score,
|
|
368
|
+
})
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
// Sort by combined score
|
|
373
|
+
const results = Array.from(combinedScores.values())
|
|
374
|
+
.sort((a, b) => b.score - a.score)
|
|
375
|
+
.slice(0, limit)
|
|
376
|
+
.map(({ fn, score }) => ({
|
|
377
|
+
functionId: fn.id,
|
|
378
|
+
name: fn.name,
|
|
379
|
+
file: fn.file,
|
|
380
|
+
moduleId: fn.moduleId,
|
|
381
|
+
score,
|
|
382
|
+
purpose: fn.purpose,
|
|
383
|
+
}))
|
|
384
|
+
|
|
385
|
+
return results
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
/**
|
|
389
|
+
* Simple BM25 search for hybrid results
|
|
390
|
+
*/
|
|
391
|
+
private async bm25Search(
|
|
392
|
+
query: string,
|
|
393
|
+
options: SemanticSearchOptions
|
|
394
|
+
): Promise<Array<{ functionId: string; score: number }>> {
|
|
395
|
+
const tokens = this.tokenize(query)
|
|
396
|
+
const functions = Object.values(this.lock.functions)
|
|
397
|
+
|
|
398
|
+
const scores: Array<{ fn: MikkLockFunction; score: number }> = []
|
|
399
|
+
|
|
400
|
+
for (const fn of functions) {
|
|
401
|
+
let score = 0
|
|
402
|
+
const fnText = `${fn.name} ${fn.purpose || ''}`.toLowerCase()
|
|
403
|
+
|
|
404
|
+
for (const token of tokens) {
|
|
405
|
+
if (fnText.includes(token)) {
|
|
406
|
+
score += 1
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
if (score > 0) {
|
|
411
|
+
scores.push({ fn, score: score / tokens.length })
|
|
412
|
+
}
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
scores.sort((a, b) => b.score - a.score)
|
|
416
|
+
|
|
417
|
+
return scores.slice(0, options.limit ?? 20).map(({ fn, score }) => ({
|
|
418
|
+
functionId: fn.id,
|
|
419
|
+
score,
|
|
420
|
+
}))
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
private tokenize(text: string): string[] {
|
|
424
|
+
return text
|
|
425
|
+
.toLowerCase()
|
|
426
|
+
.split(/[\s]+/)
|
|
427
|
+
.filter(Boolean)
|
|
428
|
+
}
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
// ---------------------------------------------------------------------------
|
|
432
|
+
// Re-export for compatibility
|
|
433
|
+
// ---------------------------------------------------------------------------
|
|
434
|
+
|
|
435
|
+
// Exported as SemanticCodeSearch above
|