@getmikk/core 2.0.14 → 2.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/README.md +4 -4
  2. package/package.json +2 -1
  3. package/src/analysis/type-flow.ts +1 -1
  4. package/src/cache/incremental-cache.ts +86 -80
  5. package/src/contract/contract-reader.ts +1 -0
  6. package/src/contract/lock-compiler.ts +95 -13
  7. package/src/contract/schema.ts +2 -0
  8. package/src/error-handler.ts +2 -1
  9. package/src/graph/cluster-detector.ts +2 -4
  10. package/src/graph/dead-code-detector.ts +303 -117
  11. package/src/graph/graph-builder.ts +21 -161
  12. package/src/graph/impact-analyzer.ts +1 -0
  13. package/src/graph/index.ts +2 -0
  14. package/src/graph/rich-function-index.ts +1080 -0
  15. package/src/graph/symbol-table.ts +252 -0
  16. package/src/hash/hash-store.ts +1 -0
  17. package/src/index.ts +2 -0
  18. package/src/parser/base-extractor.ts +19 -0
  19. package/src/parser/boundary-checker.ts +31 -12
  20. package/src/parser/error-recovery.ts +5 -4
  21. package/src/parser/function-body-extractor.ts +248 -0
  22. package/src/parser/go/go-extractor.ts +249 -676
  23. package/src/parser/index.ts +132 -318
  24. package/src/parser/language-registry.ts +57 -0
  25. package/src/parser/oxc-parser.ts +166 -28
  26. package/src/parser/oxc-resolver.ts +179 -11
  27. package/src/parser/parser-constants.ts +1 -0
  28. package/src/parser/rust/rust-extractor.ts +109 -0
  29. package/src/parser/tree-sitter/parser.ts +369 -62
  30. package/src/parser/tree-sitter/queries.ts +106 -10
  31. package/src/parser/types.ts +20 -1
  32. package/src/search/bm25.ts +21 -8
  33. package/src/search/direct-search.ts +472 -0
  34. package/src/search/embedding-provider.ts +249 -0
  35. package/src/search/index.ts +12 -0
  36. package/src/search/semantic-search.ts +435 -0
  37. package/src/utils/artifact-transaction.ts +1 -0
  38. package/src/utils/atomic-write.ts +1 -0
  39. package/src/utils/errors.ts +89 -4
  40. package/src/utils/fs.ts +104 -50
  41. package/src/utils/json.ts +1 -0
  42. package/src/utils/language-registry.ts +84 -6
  43. package/src/utils/path.ts +26 -0
  44. package/tests/dead-code.test.ts +3 -2
  45. package/tests/direct-search.test.ts +435 -0
  46. package/tests/error-recovery.test.ts +143 -0
  47. package/tests/fixtures/simple-api/src/index.ts +1 -1
  48. package/tests/go-parser.test.ts +19 -335
  49. package/tests/js-parser.test.ts +18 -1089
  50. package/tests/language-registry-all.test.ts +276 -0
  51. package/tests/language-registry.test.ts +6 -4
  52. package/tests/parse-diagnostics.test.ts +9 -96
  53. package/tests/parser.test.ts +42 -771
  54. package/tests/polyglot-parser.test.ts +117 -0
  55. package/tests/rich-function-index.test.ts +703 -0
  56. package/tests/tree-sitter-parser.test.ts +108 -80
  57. package/tests/ts-parser.test.ts +8 -8
  58. package/tests/verification.test.ts +175 -0
  59. package/src/parser/base-parser.ts +0 -16
  60. package/src/parser/go/go-parser.ts +0 -43
  61. package/src/parser/javascript/js-extractor.ts +0 -278
  62. package/src/parser/javascript/js-parser.ts +0 -101
  63. package/src/parser/typescript/ts-extractor.ts +0 -447
  64. package/src/parser/typescript/ts-parser.ts +0 -36
@@ -0,0 +1,249 @@
1
+ import * as path from 'node:path';
2
+
3
/**
 * Common interface for embedding providers.
 * Implementations turn text into fixed-length dense vectors; vector length
 * is expected to match getDimensions().
 */
export interface EmbeddingProvider {
  /** Embed a single text into a dense vector. */
  embed(text: string): Promise<number[]>;
  /** Embed several texts at once; one vector per input, in input order. */
  embedBatch(texts: string[]): Promise<number[][]>;
  /** Length of the vectors produced by embed()/embedBatch(). */
  getDimensions(): number;
  /** Whether this provider can actually run in the current environment. */
  isAvailable(): Promise<boolean>;
}
12
+
13
+ const VOCABULARY = [
14
+ 'function', 'class', 'method', 'async', 'await', 'return', 'const', 'let', 'var',
15
+ 'import', 'export', 'from', 'type', 'interface', 'extends', 'implements',
16
+ 'constructor', 'prototype', 'static', 'private', 'public', 'protected',
17
+ 'if', 'else', 'for', 'while', 'do', 'switch', 'case', 'break', 'continue',
18
+ 'try', 'catch', 'finally', 'throw', 'error', 'exception',
19
+ 'parse', 'format', 'validate', 'create', 'update', 'delete', 'remove',
20
+ 'get', 'set', 'find', 'search', 'filter', 'map', 'reduce', 'transform',
21
+ 'init', 'setup', 'config', 'options', 'settings', 'defaults',
22
+ 'data', 'object', 'array', 'string', 'number', 'boolean', 'null', 'undefined',
23
+ 'request', 'response', 'http', 'https', 'api', 'endpoint', 'route',
24
+ 'auth', 'token', 'jwt', 'session', 'cookie', 'header',
25
+ 'database', 'query', 'sql', 'transaction', 'connection', 'pool',
26
+ 'file', 'path', 'directory', 'read', 'write', 'stream', 'buffer',
27
+ 'event', 'listener', 'handler', 'callback', 'promise', 'observer',
28
+ 'log', 'debug', 'info', 'warn', 'error', 'trace',
29
+ 'test', 'mock', 'stub', 'assert', 'expect',
30
+ 'cache', 'store', 'memory', 'session', 'local',
31
+ 'user', 'account', 'profile', 'permission', 'role',
32
+ 'create', 'register', 'login', 'logout', 'verify',
33
+ 'send', 'receive', 'push', 'pull', 'fetch', 'upload', 'download',
34
+ 'process', 'worker', 'thread', 'task', 'job', 'queue',
35
+ 'client', 'server', 'service', 'endpoint', 'middleware',
36
+ ];
37
+
38
+ function tokenize(text: string): string[] {
39
+ return text
40
+ .toLowerCase()
41
+ .replace(/[^a-z0-9\s]/g, ' ')
42
+ .split(/\s+/)
43
+ .filter(t => t.length > 1)
44
+ .filter(t => !['the', 'and', 'for', 'with', 'from', 'this', 'that', 'have', 'has'].includes(t));
45
+ }
46
+
47
+ function computeTF(tokens: string[]): Map<string, number> {
48
+ const tf = new Map<string, number>();
49
+ for (const token of tokens) {
50
+ tf.set(token, (tf.get(token) || 0) + 1);
51
+ }
52
+ const max = Math.max(...tf.values(), 1);
53
+ for (const [key, value] of tf) {
54
+ tf.set(key, value / max);
55
+ }
56
+ return tf;
57
+ }
58
+
59
+ /**
60
+ * Fast vocabulary-based embeddings for when ML models aren't available.
61
+ * Uses TF-IDF with a programming-focused vocabulary.
62
+ */
63
+ export class VocabularyEmbedder implements EmbeddingProvider {
64
+ readonly dimensions: number;
65
+ private vocabMap: Map<string, number>;
66
+ private defaultIDF: number;
67
+
68
+ constructor(vocab: string[] = VOCABULARY, dimensions = 128) {
69
+ this.dimensions = dimensions;
70
+ this.vocabMap = new Map();
71
+
72
+ for (let i = 0; i < Math.min(vocab.length, dimensions); i++) {
73
+ this.vocabMap.set(vocab[i], i);
74
+ }
75
+
76
+ this.defaultIDF = Math.log(vocab.length + 1) + 1;
77
+ }
78
+
79
+ async embed(text: string): Promise<number[]> {
80
+ const tokens = tokenize(text);
81
+ const tf = computeTF(tokens);
82
+
83
+ const vector = new Array(this.dimensions).fill(0);
84
+ const magnitude = Math.sqrt(tokens.length || 1);
85
+
86
+ for (const [token, tfScore] of tf) {
87
+ const idx = this.vocabMap.get(token);
88
+ if (idx !== undefined && idx < this.dimensions) {
89
+ vector[idx] = tfScore * this.defaultIDF / magnitude;
90
+ }
91
+ }
92
+
93
+ const norm = Math.sqrt(vector.reduce((sum, v) => sum + v * v, 0)) || 1;
94
+ return vector.map(v => v / norm);
95
+ }
96
+
97
+ async embedBatch(texts: string[]): Promise<number[][]> {
98
+ return Promise.all(texts.map(t => this.embed(t)));
99
+ }
100
+
101
+ getDimensions(): number {
102
+ return this.dimensions;
103
+ }
104
+
105
+ async isAvailable(): Promise<boolean> {
106
+ return true;
107
+ }
108
+ }
109
+
110
+ /**
111
+ * Local embeddings using ONNX runtime (via @xenova/transformers)
112
+ */
113
+ export class LocalONNXEmbedder implements EmbeddingProvider {
114
+ private pipeline: unknown = null;
115
+ readonly dimensions = 384;
116
+
117
+ readonly MODEL_NAME = 'Xenova/all-MiniLM-L6-v2';
118
+
119
+ async isAvailable(): Promise<boolean> {
120
+ try {
121
+ await import('@xenova/transformers');
122
+ return true;
123
+ } catch {
124
+ return false;
125
+ }
126
+ }
127
+
128
+ private async ensurePipeline() {
129
+ if (this.pipeline) return;
130
+ const { pipeline } = await import('@xenova/transformers');
131
+ this.pipeline = await pipeline('feature-extraction', this.MODEL_NAME);
132
+ }
133
+
134
+ async embed(text: string): Promise<number[]> {
135
+ await this.ensurePipeline();
136
+ const p = this.pipeline as (texts: string[], options: unknown) => Promise<Array<{ data: Float32Array }>>;
137
+ const output = await p([text], { pooling: 'mean', normalize: true });
138
+ return Array.from(output[0].data);
139
+ }
140
+
141
+ async embedBatch(texts: string[]): Promise<number[][]> {
142
+ await this.ensurePipeline();
143
+ const p = this.pipeline as (texts: string[], options: unknown) => Promise<Array<{ data: Float32Array }>>;
144
+ const output = await p(texts, { pooling: 'mean', normalize: true });
145
+ return output.map(o => Array.from(o.data));
146
+ }
147
+
148
+ getDimensions(): number {
149
+ return this.dimensions;
150
+ }
151
+ }
152
+
153
// Gemini embedding model identifier and its output dimensionality.
const GEMINI_MODEL_NAME = 'gemini-embedding-001';
const GEMINI_DIMENSIONS = 3072;
155
+
156
+ /**
157
+ * Gemini-backed embedding provider
158
+ */
159
+ export class GeminiEmbedder implements EmbeddingProvider {
160
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
161
+ private model: any;
162
+ readonly dimensions = GEMINI_DIMENSIONS;
163
+
164
+ constructor(apiKey: string) {
165
+ this.initialize(apiKey);
166
+ }
167
+
168
+ private initialize(apiKey: string) {
169
+ const { GoogleGenerativeAI } = require('@google/generative-ai');
170
+ const genAI = new GoogleGenerativeAI(apiKey);
171
+ this.model = genAI.getGenerativeModel({ model: GEMINI_MODEL_NAME });
172
+ }
173
+
174
+ async embed(text: string): Promise<number[]> {
175
+ const result = await this.model.embedContent(text);
176
+ return result.embedding.values;
177
+ }
178
+
179
+ async embedBatch(texts: string[]): Promise<number[][]> {
180
+ const result = await this.model.batchEmbedContents({
181
+ requests: texts.map((t) => ({ content: { role: 'user', parts: [{ text: t }] } })),
182
+ });
183
+ return result.embeddings.map((e: { values: number[] }) => e.values);
184
+ }
185
+
186
+ getDimensions(): number {
187
+ return this.dimensions;
188
+ }
189
+
190
+ async isAvailable(): Promise<boolean> {
191
+ return !!process.env.GEMINI_API_KEY;
192
+ }
193
+ }
194
+
195
// Module-level provider cache shared by createEmbeddingProvider(),
// getCachedProvider() and clearProviderCache(). providerInitPromise holds the
// in-flight initialization so concurrent callers share a single setup.
let cachedProvider: EmbeddingProvider | null = null;
let providerInitPromise: Promise<EmbeddingProvider> | null = null;
197
+
198
+ /**
199
+ * Factory to create the best available provider.
200
+ * Caches the provider for subsequent calls.
201
+ */
202
+ export async function createEmbeddingProvider(): Promise<EmbeddingProvider> {
203
+ if (cachedProvider) {
204
+ return cachedProvider;
205
+ }
206
+
207
+ if (providerInitPromise) {
208
+ return providerInitPromise;
209
+ }
210
+
211
+ providerInitPromise = (async () => {
212
+ const localONNX = new LocalONNXEmbedder();
213
+
214
+ if (await localONNX.isAvailable()) {
215
+ cachedProvider = localONNX;
216
+ return localONNX;
217
+ }
218
+
219
+ const apiKey = process.env.GEMINI_API_KEY;
220
+ if (apiKey) {
221
+ try {
222
+ cachedProvider = new GeminiEmbedder(apiKey);
223
+ return cachedProvider;
224
+ } catch {
225
+ // Fall through to vocabulary embedder
226
+ }
227
+ }
228
+
229
+ cachedProvider = new VocabularyEmbedder();
230
+ return cachedProvider;
231
+ })();
232
+
233
+ return providerInitPromise;
234
+ }
235
+
236
/**
 * Get the cached provider synchronously. Returns null until
 * createEmbeddingProvider() has resolved at least once.
 */
export function getCachedProvider(): EmbeddingProvider | null {
  return cachedProvider;
}
242
+
243
/**
 * Clear the cached provider and any in-flight initialization, forcing the
 * next createEmbeddingProvider() call to re-run provider selection.
 */
export function clearProviderCache(): void {
  cachedProvider = null;
  providerInitPromise = null;
}
@@ -1,3 +1,15 @@
1
1
  // @getmikk/core search module
2
2
  export { BM25Index, reciprocalRankFusion, tokenize, buildFunctionTokens } from './bm25.js'
3
3
  export type { BM25Result } from './bm25.js'
4
+ export { DirectSearchEngine, createDirectSearch, extractSignatures, extractNames, extractSignaturesMap, summarizeFunction, formatFunctionList } from './direct-search.js'
5
+ export type { DirectQuery, DirectContext } from './direct-search.js'
6
+
7
+ export {
8
+ VocabularyEmbedder,
9
+ LocalONNXEmbedder,
10
+ GeminiEmbedder,
11
+ createEmbeddingProvider,
12
+ getCachedProvider,
13
+ clearProviderCache
14
+ } from './embedding-provider.js'
15
+ export type { EmbeddingProvider } from './embedding-provider.js'
@@ -0,0 +1,435 @@
1
+ /**
2
+ * Semantic Code Search — code embeddings for semantic similarity search
3
+ * Provides natural language code search and code-to-code similarity
4
+ */
5
+
6
+ import type { MikkLock, MikkLockFunction } from '../contract/schema.js'
7
+
8
+ // ---------------------------------------------------------------------------
9
+ // Types
10
+ // ---------------------------------------------------------------------------
11
+
12
/** An indexed function's embedding vector plus the metadata used in results. */
export interface CodeEmbedding {
  id: string // function id (matches the lock file's function key)
  vector: number[] // unit-normalized embedding vector
  metadata: {
    name: string
    file: string
    moduleId: string
    purpose?: string // natural-language purpose, when recorded in the lock
    params?: string // comma-joined parameter names
    returnType?: string
  }
}
24
+
25
/** A single hit returned by semantic or hybrid search. */
export interface SemanticSearchResult {
  functionId: string
  name: string
  file: string
  moduleId: string
  score: number // relevance score; higher is better
  purpose?: string
  snippet?: string // NOTE(review): never populated in this module — confirm intended
}
34
+
35
/** Options shared by search(), findSimilarCode() and hybrid search. */
export interface SemanticSearchOptions {
  limit?: number // max results; defaults vary per method
  minScore?: number // minimum score for a result to be included
  filterModule?: string // exact moduleId match
  filterFile?: string // substring match against the file path
}
41
+
42
/** In-memory embedding index keyed by function id. */
interface EmbeddingIndex {
  functions: Map<string, CodeEmbedding>
  dimensions: number // vector dimensionality
  indexedAt: number // Date.now() timestamp of the last buildIndex()
}
47
+
48
+ // ---------------------------------------------------------------------------
49
+ // Simple embedding model using TF-IDF-like approach
50
+ // For production, would use transformer-based embeddings
51
+ // ---------------------------------------------------------------------------
52
+
53
+ export class SemanticCodeSearch {
54
+ private lock: MikkLock
55
+ private index: EmbeddingIndex | null = null
56
+ private readonly DIMENSIONS = 128
57
+
58
+ constructor(lock: MikkLock) {
59
+ this.lock = lock
60
+ }
61
+
62
+ /**
63
+ * Build semantic index from lock file
64
+ */
65
+ async buildIndex(): Promise<void> {
66
+ const functions = Object.values(this.lock.functions)
67
+ const embeddings = new Map<string, CodeEmbedding>()
68
+
69
+ for (const fn of functions) {
70
+ const vector = this.computeEmbedding(fn)
71
+ embeddings.set(fn.id, {
72
+ id: fn.id,
73
+ vector,
74
+ metadata: {
75
+ name: fn.name,
76
+ file: fn.file,
77
+ moduleId: fn.moduleId,
78
+ purpose: fn.purpose,
79
+ params: fn.params?.map(p => p.name).join(', '),
80
+ returnType: fn.returnType,
81
+ },
82
+ })
83
+ }
84
+
85
+ this.index = {
86
+ functions: embeddings,
87
+ dimensions: this.DIMENSIONS,
88
+ indexedAt: Date.now(),
89
+ }
90
+ }
91
+
92
+ /**
93
+ * Search code using natural language query
94
+ */
95
+ async search(query: string, options: SemanticSearchOptions = {}): Promise<SemanticSearchResult[]> {
96
+ if (!this.index) {
97
+ await this.buildIndex()
98
+ }
99
+
100
+ const queryVector = this.computeQueryEmbedding(query)
101
+ const results: Array<{ fn: MikkLockFunction; score: number }> = []
102
+
103
+ const functions = Object.values(this.lock.functions)
104
+ for (const fn of functions) {
105
+ const embedding = this.index!.functions.get(fn.id)
106
+ if (!embedding) continue
107
+
108
+ // Filter by module if specified
109
+ if (options.filterModule && fn.moduleId !== options.filterModule) continue
110
+
111
+ // Filter by file if specified
112
+ if (options.filterFile && !fn.file.includes(options.filterFile)) continue
113
+
114
+ const score = this.cosineSimilarity(queryVector, embedding.vector)
115
+
116
+ if (score >= (options.minScore ?? 0)) {
117
+ results.push({ fn, score })
118
+ }
119
+ }
120
+
121
+ // Sort by score descending
122
+ results.sort((a, b) => b.score - a.score)
123
+
124
+ const limit = options.limit ?? 20
125
+ return results.slice(0, limit).map(({ fn, score }) => ({
126
+ functionId: fn.id,
127
+ name: fn.name,
128
+ file: fn.file,
129
+ moduleId: fn.moduleId,
130
+ score,
131
+ purpose: fn.purpose,
132
+ }))
133
+ }
134
+
135
+ /**
136
+ * Find similar code to given code snippet
137
+ */
138
+ async findSimilarCode(code: string, options: SemanticSearchOptions = {}): Promise<SemanticSearchResult[]> {
139
+ if (!this.index) {
140
+ await this.buildIndex()
141
+ }
142
+
143
+ const codeVector = this.computeCodeEmbedding(code)
144
+ const results: Array<{ fn: MikkLockFunction; score: number }> = []
145
+
146
+ const functions = Object.values(this.lock.functions)
147
+ for (const fn of functions) {
148
+ const embedding = this.index!.functions.get(fn.id)
149
+ if (!embedding) continue
150
+
151
+ const score = this.cosineSimilarity(codeVector, embedding.vector)
152
+
153
+ if (score >= (options.minScore ?? 0.3)) {
154
+ results.push({ fn, score })
155
+ }
156
+ }
157
+
158
+ results.sort((a, b) => b.score - a.score)
159
+
160
+ const limit = options.limit ?? 10
161
+ return results.slice(0, limit).map(({ fn, score }) => ({
162
+ functionId: fn.id,
163
+ name: fn.name,
164
+ file: fn.file,
165
+ moduleId: fn.moduleId,
166
+ score,
167
+ purpose: fn.purpose,
168
+ }))
169
+ }
170
+
171
+ /**
172
+ * Compute embedding for a function using keyword + structural features
173
+ */
174
+ private computeEmbedding(fn: MikkLockFunction): number[] {
175
+ const vector = new Array(this.DIMENSIONS).fill(0)
176
+
177
+ // Feature 1: Function name tokens (first 32 dims)
178
+ const nameTokens = this.tokenize(fn.name)
179
+ for (let i = 0; i < Math.min(nameTokens.length, 32); i++) {
180
+ vector[i] = this.hashToken(nameTokens[i], i)
181
+ }
182
+
183
+ // Feature 2: Purpose keywords (next 32 dims)
184
+ if (fn.purpose) {
185
+ const purposeTokens = this.tokenize(fn.purpose)
186
+ for (let i = 0; i < Math.min(purposeTokens.length, 32); i++) {
187
+ vector[32 + i] = this.hashToken(purposeTokens[i], 32 + i)
188
+ }
189
+ }
190
+
191
+ // Feature 3: Module context (next 32 dims)
192
+ if (fn.moduleId) {
193
+ const moduleTokens = this.tokenize(fn.moduleId)
194
+ for (let i = 0; i < Math.min(moduleTokens.length, 32); i++) {
195
+ vector[64 + i] = this.hashToken(moduleTokens[i], 64 + i)
196
+ }
197
+ }
198
+
199
+ // Feature 4: Structural features (last 32 dims)
200
+ vector[96] = fn.isAsync ? 1 : 0
201
+ vector[97] = fn.isExported ? 1 : 0
202
+ vector[98] = fn.params?.length ?? 0
203
+ vector[99] = (fn.endLine - fn.startLine) / 100 // normalized function size
204
+ vector[100] = fn.calls?.length ?? 0 // number of calls
205
+ vector[101] = fn.calledBy?.length ?? 0 // number of callers
206
+
207
+ // Hash additional features
208
+ if (fn.returnType) {
209
+ vector[102] = this.hashToken(fn.returnType, 102) % 1
210
+ }
211
+
212
+ // Normalize vector
213
+ return this.normalizeVector(vector)
214
+ }
215
+
216
+ /**
217
+ * Compute query embedding
218
+ */
219
+ private computeQueryEmbedding(query: string): number[] {
220
+ const vector = new Array(this.DIMENSIONS).fill(0)
221
+ const tokens = this.tokenize(query)
222
+
223
+ // Weight recent tokens more heavily
224
+ for (let i = 0; i < tokens.length; i++) {
225
+ const weight = 1 - (i / tokens.length) * 0.5 // decreasing weight
226
+ const hash = this.hashToken(tokens[i], i % this.DIMENSIONS)
227
+ vector[i % this.DIMENSIONS] += hash * weight
228
+ }
229
+
230
+ return this.normalizeVector(vector)
231
+ }
232
+
233
+ /**
234
+ * Compute embedding for arbitrary code snippet
235
+ */
236
+ private computeCodeEmbedding(code: string): number[] {
237
+ const vector = new Array(this.DIMENSIONS).fill(0)
238
+ const tokens = this.tokenize(code)
239
+
240
+ for (let i = 0; i < Math.min(tokens.length, this.DIMENSIONS); i++) {
241
+ vector[i] = this.hashToken(tokens[i], i)
242
+ }
243
+
244
+ return this.normalizeVector(vector)
245
+ }
246
+
247
+ /**
248
+ * Tokenize text into words
249
+ */
250
+ private tokenize(text: string): string[] {
251
+ return text
252
+ .toLowerCase()
253
+ .replace(/([a-z])([A-Z])/g, '$1 $2')
254
+ .split(/[\s_./\\{}()[]"']+/)
255
+ .filter(Boolean)
256
+ .filter(w => w.length > 1)
257
+ }
258
+
259
+ /**
260
+ * Hash token to 0-1 range for embedding
261
+ */
262
+ private hashToken(token: string, seed: number): number {
263
+ let hash = 0
264
+ for (let i = 0; i < token.length; i++) {
265
+ hash = ((hash << 5) - hash + token.charCodeAt(i) + seed) >>> 0
266
+ }
267
+ return (hash % 1000) / 1000
268
+ }
269
+
270
+ /**
271
+ * Normalize vector to unit length
272
+ */
273
+ private normalizeVector(vector: number[]): number[] {
274
+ const magnitude = Math.sqrt(vector.reduce((sum, v) => sum + v * v, 0))
275
+ if (magnitude === 0) return vector
276
+ return vector.map(v => v / magnitude)
277
+ }
278
+
279
+ /**
280
+ * Compute cosine similarity between two vectors
281
+ */
282
+ private cosineSimilarity(a: number[], b: number[]): number {
283
+ let dotProduct = 0
284
+ let normA = 0
285
+ let normB = 0
286
+
287
+ for (let i = 0; i < a.length; i++) {
288
+ dotProduct += a[i] * b[i]
289
+ normA += a[i] * a[i]
290
+ normB += b[i] * b[i]
291
+ }
292
+
293
+ const denominator = Math.sqrt(normA) * Math.sqrt(normB)
294
+ if (denominator === 0) return 0
295
+
296
+ return dotProduct / denominator
297
+ }
298
+
299
+ /**
300
+ * Get index statistics
301
+ */
302
+ getIndexStats(): { functionCount: number; dimensions: number; indexedAt: number } | null {
303
+ if (!this.index) return null
304
+
305
+ return {
306
+ functionCount: this.index.functions.size,
307
+ dimensions: this.index.dimensions,
308
+ indexedAt: this.index.indexedAt,
309
+ }
310
+ }
311
+ }
312
+
313
+ /**
314
+ * Hybrid search combining BM25 and semantic search
315
+ */
316
+ export class HybridSearchEngine {
317
+ private lock: MikkLock
318
+ private semanticSearch: SemanticCodeSearch
319
+ private readonly SEMANTIC_WEIGHT = 0.6
320
+ private readonly BM25_WEIGHT = 0.4
321
+
322
+ constructor(lock: MikkLock) {
323
+ this.lock = lock
324
+ this.semanticSearch = new SemanticCodeSearch(lock)
325
+ }
326
+
327
+ /**
328
+ * Search using both BM25 and semantic search with reranking
329
+ */
330
+ async search(
331
+ query: string,
332
+ options: SemanticSearchOptions & { useHybrid?: boolean } = {}
333
+ ): Promise<SemanticSearchResult[]> {
334
+ const { useHybrid = true, limit = 20, ...filterOptions } = options
335
+
336
+ if (!useHybrid) {
337
+ return this.semanticSearch.search(query, { ...filterOptions, limit })
338
+ }
339
+
340
+ // Run both searches in parallel
341
+ const [semanticResults, bm25Results] = await Promise.all([
342
+ this.semanticSearch.search(query, { ...filterOptions, limit: limit * 2 }),
343
+ this.bm25Search(query, { ...filterOptions, limit: limit * 2 }),
344
+ ])
345
+
346
+ // Combine scores using weighted RRF
347
+ const combinedScores = new Map<string, { fn: MikkLockFunction; score: number }>()
348
+
349
+ // Add semantic scores
350
+ for (const result of semanticResults) {
351
+ combinedScores.set(result.functionId, {
352
+ fn: this.lock.functions[result.functionId],
353
+ score: result.score * this.SEMANTIC_WEIGHT,
354
+ })
355
+ }
356
+
357
+ // Add BM25 scores
358
+ for (const result of bm25Results) {
359
+ const existing = combinedScores.get(result.functionId)
360
+ const bm25Score = result.score * this.BM25_WEIGHT
361
+
362
+ if (existing) {
363
+ existing.score += bm25Score
364
+ } else {
365
+ combinedScores.set(result.functionId, {
366
+ fn: this.lock.functions[result.functionId],
367
+ score: bm25Score,
368
+ })
369
+ }
370
+ }
371
+
372
+ // Sort by combined score
373
+ const results = Array.from(combinedScores.values())
374
+ .sort((a, b) => b.score - a.score)
375
+ .slice(0, limit)
376
+ .map(({ fn, score }) => ({
377
+ functionId: fn.id,
378
+ name: fn.name,
379
+ file: fn.file,
380
+ moduleId: fn.moduleId,
381
+ score,
382
+ purpose: fn.purpose,
383
+ }))
384
+
385
+ return results
386
+ }
387
+
388
+ /**
389
+ * Simple BM25 search for hybrid results
390
+ */
391
+ private async bm25Search(
392
+ query: string,
393
+ options: SemanticSearchOptions
394
+ ): Promise<Array<{ functionId: string; score: number }>> {
395
+ const tokens = this.tokenize(query)
396
+ const functions = Object.values(this.lock.functions)
397
+
398
+ const scores: Array<{ fn: MikkLockFunction; score: number }> = []
399
+
400
+ for (const fn of functions) {
401
+ let score = 0
402
+ const fnText = `${fn.name} ${fn.purpose || ''}`.toLowerCase()
403
+
404
+ for (const token of tokens) {
405
+ if (fnText.includes(token)) {
406
+ score += 1
407
+ }
408
+ }
409
+
410
+ if (score > 0) {
411
+ scores.push({ fn, score: score / tokens.length })
412
+ }
413
+ }
414
+
415
+ scores.sort((a, b) => b.score - a.score)
416
+
417
+ return scores.slice(0, options.limit ?? 20).map(({ fn, score }) => ({
418
+ functionId: fn.id,
419
+ score,
420
+ }))
421
+ }
422
+
423
+ private tokenize(text: string): string[] {
424
+ return text
425
+ .toLowerCase()
426
+ .split(/[\s]+/)
427
+ .filter(Boolean)
428
+ }
429
+ }
430
+
431
+ // ---------------------------------------------------------------------------
432
+ // Re-export for compatibility
433
+ // ---------------------------------------------------------------------------
434
+
435
+ // Exported as SemanticCodeSearch above
@@ -1,3 +1,4 @@
1
+ /* eslint-disable @typescript-eslint/no-explicit-any */
1
2
  import * as fs from 'node:fs/promises'
2
3
  import * as path from 'node:path'
3
4
  import { randomUUID } from 'node:crypto'
@@ -1,3 +1,4 @@
1
+ /* eslint-disable @typescript-eslint/no-explicit-any */
1
2
  import * as fs from 'node:fs/promises'
2
3
  import * as path from 'node:path'
3
4
  import { randomUUID } from 'node:crypto'