sweet-search 0.0.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. package/LICENSE +190 -0
  2. package/NOTICE +23 -0
  3. package/core/cli.js +51 -0
  4. package/core/config.js +27 -0
  5. package/core/embedding/embedding-cache.js +467 -0
  6. package/core/embedding/embedding-local-model.js +845 -0
  7. package/core/embedding/embedding-remote.js +492 -0
  8. package/core/embedding/embedding-service.js +712 -0
  9. package/core/embedding/embedding-telemetry.js +219 -0
  10. package/core/embedding/index.js +40 -0
  11. package/core/graph/community-detector.js +294 -0
  12. package/core/graph/graph-expansion.js +839 -0
  13. package/core/graph/graph-extractor.js +2304 -0
  14. package/core/graph/graph-search.js +2148 -0
  15. package/core/graph/hcgs-generator.js +666 -0
  16. package/core/graph/index.js +16 -0
  17. package/core/graph/leiden-algorithm.js +547 -0
  18. package/core/graph/relationship-resolver.js +366 -0
  19. package/core/graph/repo-map.js +408 -0
  20. package/core/graph/summary-manager.js +549 -0
  21. package/core/indexing/artifact-builder.js +1054 -0
  22. package/core/indexing/ast-chunker.js +709 -0
  23. package/core/indexing/chunking/chunk-builder.js +170 -0
  24. package/core/indexing/chunking/markdown-chunker.js +503 -0
  25. package/core/indexing/chunking/plaintext-chunker.js +104 -0
  26. package/core/indexing/dedup/dedup-phase.js +159 -0
  27. package/core/indexing/dedup/exemplar-selector.js +65 -0
  28. package/core/indexing/document-chunker.js +56 -0
  29. package/core/indexing/incremental-parser.js +390 -0
  30. package/core/indexing/incremental-tracker.js +761 -0
  31. package/core/indexing/index-codebase-v21.js +472 -0
  32. package/core/indexing/index-maintainer.mjs +1674 -0
  33. package/core/indexing/index.js +90 -0
  34. package/core/indexing/indexer-ann.js +1077 -0
  35. package/core/indexing/indexer-build.js +742 -0
  36. package/core/indexing/indexer-phases.js +800 -0
  37. package/core/indexing/indexer-pool.js +764 -0
  38. package/core/indexing/indexer-sparse-gram.js +98 -0
  39. package/core/indexing/indexer-utils.js +536 -0
  40. package/core/indexing/indexer-worker.js +148 -0
  41. package/core/indexing/li-skip-policy.js +225 -0
  42. package/core/indexing/merkle-tracker.js +244 -0
  43. package/core/indexing/model-pool.js +166 -0
  44. package/core/infrastructure/code-graph-repository.js +120 -0
  45. package/core/infrastructure/codebase-repository.js +131 -0
  46. package/core/infrastructure/config/dedup.js +54 -0
  47. package/core/infrastructure/config/embedding.js +298 -0
  48. package/core/infrastructure/config/graph.js +80 -0
  49. package/core/infrastructure/config/index.js +82 -0
  50. package/core/infrastructure/config/indexing.js +8 -0
  51. package/core/infrastructure/config/platform.js +254 -0
  52. package/core/infrastructure/config/ranking.js +221 -0
  53. package/core/infrastructure/config/search.js +396 -0
  54. package/core/infrastructure/config/translation.js +89 -0
  55. package/core/infrastructure/config/vector-store.js +114 -0
  56. package/core/infrastructure/constants.js +86 -0
  57. package/core/infrastructure/coreml-cascade.js +909 -0
  58. package/core/infrastructure/coreml-cascade.json +46 -0
  59. package/core/infrastructure/coreml-provider.js +81 -0
  60. package/core/infrastructure/db-utils.js +69 -0
  61. package/core/infrastructure/dedup-hashing.js +83 -0
  62. package/core/infrastructure/hardware-capability.js +332 -0
  63. package/core/infrastructure/index.js +104 -0
  64. package/core/infrastructure/language-patterns/maps.js +121 -0
  65. package/core/infrastructure/language-patterns/registry-core.js +323 -0
  66. package/core/infrastructure/language-patterns/registry-data-query.js +155 -0
  67. package/core/infrastructure/language-patterns/registry-object-oriented.js +285 -0
  68. package/core/infrastructure/language-patterns/registry-tooling.js +240 -0
  69. package/core/infrastructure/language-patterns/registry-web-style.js +143 -0
  70. package/core/infrastructure/language-patterns/registry.js +19 -0
  71. package/core/infrastructure/language-patterns.js +141 -0
  72. package/core/infrastructure/llm-provider.js +733 -0
  73. package/core/infrastructure/manifest.json +46 -0
  74. package/core/infrastructure/maxsim.wasm +0 -0
  75. package/core/infrastructure/model-fetcher.js +423 -0
  76. package/core/infrastructure/model-registry.js +214 -0
  77. package/core/infrastructure/native-inference.js +587 -0
  78. package/core/infrastructure/native-resolver.js +187 -0
  79. package/core/infrastructure/native-sparse-gram.js +257 -0
  80. package/core/infrastructure/native-tokenizer.js +160 -0
  81. package/core/infrastructure/onnx-mutex.js +45 -0
  82. package/core/infrastructure/onnx-session-utils.js +261 -0
  83. package/core/infrastructure/ort-pipeline.js +111 -0
  84. package/core/infrastructure/project-detector.js +102 -0
  85. package/core/infrastructure/quantization.js +410 -0
  86. package/core/infrastructure/simd-distance.js +502 -0
  87. package/core/infrastructure/simd-distance.wasm +0 -0
  88. package/core/infrastructure/tree-sitter-provider.js +665 -0
  89. package/core/infrastructure/webgpu-maxsim.js +222 -0
  90. package/core/query/index.js +35 -0
  91. package/core/query/intent-detector.js +201 -0
  92. package/core/query/intent-router.js +156 -0
  93. package/core/query/query-router-catboost.js +222 -0
  94. package/core/query/query-router-ml.js +266 -0
  95. package/core/query/query-router.js +213 -0
  96. package/core/ranking/cascaded-scorer.js +379 -0
  97. package/core/ranking/flashrank.js +810 -0
  98. package/core/ranking/index.js +49 -0
  99. package/core/ranking/late-interaction-index.js +2383 -0
  100. package/core/ranking/late-interaction-model.js +812 -0
  101. package/core/ranking/local-reranker.js +374 -0
  102. package/core/ranking/mmr.js +379 -0
  103. package/core/ranking/quality-scorer.js +363 -0
  104. package/core/search/context-expander.js +1167 -0
  105. package/core/search/dedup/sibling-expander.js +327 -0
  106. package/core/search/index.js +16 -0
  107. package/core/search/search-boost.js +259 -0
  108. package/core/search/search-cli.js +544 -0
  109. package/core/search/search-format.js +282 -0
  110. package/core/search/search-fusion.js +327 -0
  111. package/core/search/search-hybrid.js +204 -0
  112. package/core/search/search-pattern-chunks.js +337 -0
  113. package/core/search/search-pattern-planner.js +439 -0
  114. package/core/search/search-pattern-prefilter.js +412 -0
  115. package/core/search/search-pattern-ripgrep.js +663 -0
  116. package/core/search/search-pattern.js +463 -0
  117. package/core/search/search-postprocess.js +452 -0
  118. package/core/search/search-semantic.js +706 -0
  119. package/core/search/search-server.js +554 -0
  120. package/core/search/session-daemon-prewarm.mjs +164 -0
  121. package/core/search/session-warmup.js +595 -0
  122. package/core/search/sweet-search.js +632 -0
  123. package/core/search/warmup-metrics.js +532 -0
  124. package/core/start-server.js +6 -0
  125. package/core/training/query-router/features/extractor.js +762 -0
  126. package/core/training/query-router/features/multilingual-patterns.js +431 -0
  127. package/core/training/query-router/features/text-segmenter.js +303 -0
  128. package/core/training/query-router/features/unicode-utils.js +383 -0
  129. package/core/training/query-router/output/v45_router_d4.js +11521 -0
  130. package/core/training/query-router/output/v46_router_d4.js +11498 -0
  131. package/core/vector-store/binary-heap.js +227 -0
  132. package/core/vector-store/binary-hnsw-index.js +1004 -0
  133. package/core/vector-store/float-vector-store.js +234 -0
  134. package/core/vector-store/hnsw-index.js +580 -0
  135. package/core/vector-store/index.js +39 -0
  136. package/core/vector-store/seismic-index.js +498 -0
  137. package/core/vocabulary/index.js +84 -0
  138. package/core/vocabulary/vocab-constants.js +20 -0
  139. package/core/vocabulary/vocab-miner-extractors.js +375 -0
  140. package/core/vocabulary/vocab-miner-nl.js +404 -0
  141. package/core/vocabulary/vocab-miner-utils.js +146 -0
  142. package/core/vocabulary/vocab-miner.js +574 -0
  143. package/core/vocabulary/vocab-prewarm-cli.js +110 -0
  144. package/core/vocabulary/vocab-ranker.js +492 -0
  145. package/core/vocabulary/vocab-warmer.js +523 -0
  146. package/core/vocabulary/vocab-warmup-orchestrator.js +425 -0
  147. package/core/vocabulary/vocabulary-utils.js +704 -0
  148. package/crates/wasm-router/pkg/package.json +13 -0
  149. package/crates/wasm-router/pkg/query_router_wasm.d.ts +36 -0
  150. package/crates/wasm-router/pkg/query_router_wasm.js +271 -0
  151. package/crates/wasm-router/pkg/query_router_wasm_bg.wasm +0 -0
  152. package/crates/wasm-router/pkg/query_router_wasm_bg.wasm.d.ts +19 -0
  153. package/mcp/config-gen.js +121 -0
  154. package/mcp/server.js +335 -0
  155. package/mcp/tool-handlers.js +476 -0
  156. package/package.json +131 -9
  157. package/scripts/benchmark-harness.js +794 -0
  158. package/scripts/init.js +1058 -0
  159. package/scripts/smoke-test.js +435 -0
  160. package/scripts/uninstall.js +478 -0
  161. package/scripts/verify-runtime.js +176 -0
@@ -0,0 +1,704 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * Vocabulary Utilities v2.0 - Binary Format + Query Mining + Graph Templates
5
+ *
6
+ * SOTA December 2025 improvements:
7
+ * 1. Query Log Mining - Auto-learns from your frequent queries
8
+ * 2. Matryoshka Truncation - 256d (75% smaller, 4x faster load)
9
+ * 3. Binary Format - Vocabulary stored as a flat float32 file read into a Buffer (no JSON parsing of embeddings, no mmap)
10
+ * 4. Graph-Aware Templates - Pre-warms structural queries
11
+ *
12
+ * Usage:
13
+ * node vocabulary-utils.js migrate # Migrate JSON to binary (one-time)
14
+ * node vocabulary-utils.js learn # Learn from query logs
15
+ * node vocabulary-utils.js warmup # Full warmup (entities + questions + graph)
16
+ * node vocabulary-utils.js warmup-inc # Incremental warmup (learned queries only)
17
+ * node vocabulary-utils.js stats # Show vocabulary statistics
18
+ *
19
+ * References:
20
+ * - https://blog.voyageai.com/2024/12/04/voyage-code-3/
21
+ * - https://docs.voyageai.com/docs/flexible-dimensions-and-quantization
22
+ */
23
+
24
import fs from 'fs/promises';
import { existsSync, statSync } from 'fs';
import path from 'path';
import { pathToFileURL } from 'url';
import { PROJECT_ROOT } from '../infrastructure/config/index.js';
28
+
29
+ // =============================================================================
30
+ // CONFIGURATION
31
+ // =============================================================================
32
+
33
/**
 * Tunables shared by the warmup, migration and CLI code in this file.
 * Every template contains a `{name}` placeholder that is filled with an entity name.
 */
const CONFIG = {
  // Number of floats kept per stored embedding (Matryoshka truncation target).
  matryoshkaDimension: 256,

  // A logged query is promoted to the vocabulary once its count reaches this value.
  minQueryFrequency: 3,

  // Relationship-style phrasings (callers, dependencies, inheritance, impact).
  structuralTemplates: [
    'what calls {name}',
    'who calls {name}',
    '{name} callers',
    'what does {name} call',
    '{name} dependencies',
    'implementations of {name}',
    '{name} implementors',
    'what extends {name}',
    '{name} subclasses',
    'impact of changing {name}',
  ],

  // General question phrasings.
  questionTemplates: [
    'what is {name}',
    'how does {name} work',
    'where is {name} defined',
    'where is {name} used',
    '{name} implementation',
  ],

  // Substrings that mark an entity as important enough to receive template variants.
  importantTypes: ['class', 'interface', 'service', 'controller', 'repository'],

  // Number of texts sent to the embedding API per request.
  batchSize: 128,

  // Pause between consecutive API requests, in milliseconds (200 ms = 300 requests/minute).
  rateLimitMs: 200,
};
72
+
73
+ // =============================================================================
74
+ // PATH CONFIGURATION
75
+ // =============================================================================
76
+
77
+ // Project root is resolved in config.js (cwd/env-aware for standalone and host repos).
78
+
79
/**
 * Absolute locations of every file this module reads or writes.
 * All of them live in the `.sweet-search` directory under PROJECT_ROOT.
 */
const PATHS = (() => {
  const inDataDir = (fileName) => path.join(PROJECT_ROOT, '.sweet-search', fileName);

  return {
    jsonVocab: inDataDir('query-vocabulary.json'), // legacy JSON vocabulary
    binaryVocab: inDataDir('vocabulary.bin'), // header + float32 embeddings
    binaryMeta: inDataDir('vocabulary.meta.json'), // term list for the binary file
    queryStats: inDataDir('query-vocabulary-stats.json'), // query usage counts
    codeGraph: inDataDir('code-graph.db'), // SQLite code graph
  };
})();
86
+
87
+ // =============================================================================
88
+ // BINARY VOCABULARY CLASS
89
+ // =============================================================================
90
+
91
// Fixed size of the binary file header, in bytes.
const VOCAB_HEADER_BYTES = 32;

/**
 * Binary vocabulary: a flat float32 embedding table plus a JSON sidecar.
 *
 * On-disk layout:
 *   binary file  32-byte header ("VCAB" magic, then uint32 LE version, dimension,
 *                term count) followed by termCount * dimension little-endian
 *                float32 values
 *   meta file    JSON { version, dimension, termCount, created, terms }, where
 *                terms[i] names the i-th embedding row
 *
 * Lookups are case-insensitive and ignore surrounding whitespace.
 */
export class BinaryVocabulary {
  /**
   * @param {object} [options]
   * @param {number} [options.dimension] - Row width used by save(); defaults to CONFIG.matryoshkaDimension.
   * @param {string} [options.binaryPath] - Embedding file; defaults to PATHS.binaryVocab.
   * @param {string} [options.metaPath] - Metadata file; defaults to PATHS.binaryMeta.
   */
  constructor(options = {}) {
    const {
      dimension = CONFIG.matryoshkaDimension,
      binaryPath = PATHS.binaryVocab,
      metaPath = PATHS.binaryMeta,
    } = options;

    this.buffer = null;
    this.metadata = null;
    this.termIndex = null;
    // Width of the rows currently loaded; replaced by the metadata value in load().
    this.dimension = dimension;
    // Width written by save(); deliberately independent of whatever file was loaded.
    this.saveDimension = dimension;
    this.binaryPath = binaryPath;
    this.metaPath = metaPath;
  }

  /**
   * Canonical lookup key for a term.
   * @param {*} term
   * @returns {string}
   */
  static normalizeTerm(term) {
    return String(term).toLowerCase().trim();
  }

  /**
   * Build the normalized term -> row index map. When two terms normalize to the
   * same key, the later row wins.
   * @param {Array} terms
   * @returns {Map<string, number>}
   */
  static buildTermIndex(terms) {
    const index = new Map();
    terms.forEach((term, row) => index.set(BinaryVocabulary.normalizeTerm(term), row));
    return index;
  }

  /**
   * Read the vocabulary from disk (the whole file is read into memory).
   *
   * @returns {Promise<boolean|{ok: boolean, terms: number, dimension: number, ms: number}>}
   *   `true` when already loaded, `false` when either file is missing, otherwise
   *   a summary object.
   * @throws {Error} When the metadata is malformed or the binary file is too
   *   small to hold every row the metadata describes.
   */
  async load() {
    if (this.buffer) return true;

    if (!existsSync(this.binaryPath) || !existsSync(this.metaPath)) {
      return false;
    }

    const start = Date.now();

    const metadata = JSON.parse(await fs.readFile(this.metaPath, 'utf-8'));
    const { dimension, terms } = metadata ?? {};
    if (!Number.isInteger(dimension) || dimension <= 0 || !Array.isArray(terms)) {
      throw new Error(`Invalid vocabulary metadata in ${this.metaPath}`);
    }

    const buffer = await fs.readFile(this.binaryPath);
    const requiredBytes = VOCAB_HEADER_BYTES + terms.length * dimension * 4;
    if (buffer.length < requiredBytes) {
      throw new Error(
        `Vocabulary file ${this.binaryPath} is truncated: expected at least ${requiredBytes} bytes, found ${buffer.length}`
      );
    }

    // Commit state only after both files validated, so a failed load can be retried.
    this.metadata = metadata;
    this.dimension = dimension;
    this.termIndex = BinaryVocabulary.buildTermIndex(terms);
    this.buffer = buffer;

    return { ok: true, terms: this.termIndex.size, dimension: this.dimension, ms: Date.now() - start };
  }

  /**
   * Look up the embedding stored for a term.
   * @param {string} term
   * @returns {number[]|null} A fresh array of `dimension` numbers, or null when
   *   nothing is loaded or the term is unknown.
   */
  get(term) {
    if (!this.buffer || !this.termIndex) return null;

    const row = this.termIndex.get(BinaryVocabulary.normalizeTerm(term));
    if (row === undefined) return null;

    const offset = VOCAB_HEADER_BYTES + row * this.dimension * 4;
    if (offset + this.dimension * 4 > this.buffer.length) return null;

    // readFloatLE mirrors the writeFloatLE used by save(): no dependence on
    // platform endianness or on the buffer being 4-byte aligned.
    const embedding = new Array(this.dimension);
    for (let i = 0; i < this.dimension; i++) {
      embedding[i] = this.buffer.readFloatLE(offset + i * 4);
    }
    return embedding;
  }

  /**
   * @param {string} term
   * @returns {boolean} Whether the loaded vocabulary contains the term.
   */
  has(term) {
    return this.termIndex?.has(BinaryVocabulary.normalizeTerm(term)) ?? false;
  }

  /**
   * @returns {number} Number of distinct normalized terms loaded (0 when nothing is loaded).
   */
  size() {
    return this.termIndex?.size ?? 0;
  }

  /**
   * Write the given embeddings to disk, replacing any existing vocabulary, and
   * refresh this instance so get()/has()/size() reflect what was written.
   *
   * Rows are truncated or zero-padded to the save dimension. Entries whose
   * embedding is missing are skipped instead of aborting the whole write.
   *
   * @param {Map<string, ArrayLike<number>>} embeddings - term -> embedding.
   * @returns {Promise<{terms: number, sizeBytes: number}>}
   */
  async save(embeddings) {
    const dimension = this.saveDimension;

    const terms = [];
    const rows = [];
    for (const term of embeddings.keys()) {
      const embedding = embeddings.get(term);
      if (embedding == null || typeof embedding.length !== 'number') continue;
      terms.push(term);
      rows.push(embedding);
    }

    const header = Buffer.alloc(VOCAB_HEADER_BYTES);
    header.write('VCAB', 0, 'ascii');
    header.writeUInt32LE(2, 4); // Version
    header.writeUInt32LE(dimension, 8);
    header.writeUInt32LE(terms.length, 12);

    // Buffer.alloc zero-fills, so short rows and non-finite values end up as 0.
    const embeddingBuffer = Buffer.alloc(terms.length * dimension * 4);
    rows.forEach((row, i) => {
      const base = i * dimension * 4;
      const width = Math.min(dimension, row.length);
      for (let j = 0; j < width; j++) {
        const value = row[j];
        if (Number.isFinite(value)) {
          embeddingBuffer.writeFloatLE(value, base + j * 4);
        }
      }
    });

    const fullBuffer = Buffer.concat([header, embeddingBuffer]);
    await fs.mkdir(path.dirname(this.binaryPath), { recursive: true });
    await fs.writeFile(this.binaryPath, fullBuffer);

    const metadata = {
      version: 2,
      dimension,
      termCount: terms.length,
      created: new Date().toISOString(),
      terms,
    };
    await fs.writeFile(this.metaPath, JSON.stringify(metadata, null, 2));

    this.metadata = metadata;
    this.dimension = dimension;
    this.termIndex = BinaryVocabulary.buildTermIndex(terms);
    this.buffer = fullBuffer;

    return { terms: terms.length, sizeBytes: fullBuffer.length };
  }
}
214
+
215
+ // =============================================================================
216
+ // QUERY LOG MINING
217
+ // =============================================================================
218
+
219
/**
 * Mine the query statistics file for queries used often enough to pre-warm.
 *
 * Reads PATHS.queryStats, expected to hold `{ queries: { [query]: count } }`,
 * and keeps the queries whose count is at least CONFIG.minQueryFrequency.
 *
 * @returns {Promise<{queries: string[], total: number, top10: Array<{query: string, count: number}>}>}
 *   Queries sorted by descending count. The same shape is returned (with empty
 *   arrays) when the stats file does not exist.
 */
export async function mineQueryLogs() {
  if (!existsSync(PATHS.queryStats)) {
    return { queries: [], total: 0, top10: [] };
  }

  const data = JSON.parse(await fs.readFile(PATHS.queryStats, 'utf-8'));
  const counts = data?.queries || {};

  const frequentQueries = Object.entries(counts)
    .filter(([, count]) => count >= CONFIG.minQueryFrequency)
    .map(([query, count]) => ({ query, count }))
    // Most used first
    .sort((a, b) => b.count - a.count);

  return {
    queries: frequentQueries.map((entry) => entry.query),
    total: frequentQueries.length,
    top10: frequentQueries.slice(0, 10),
  };
}
248
+
249
+ // =============================================================================
250
+ // ENTITY EXTRACTION
251
+ // =============================================================================
252
+
253
/**
 * Read every distinct (name, type) pair from the code graph database.
 *
 * @returns {Promise<{entities: Array<{name: string, type: string, count: number}>, count: number, error?: string}>}
 *   Entities ordered by how many rows share the pair, most frequent first.
 *   When PATHS.codeGraph does not exist, `entities` is empty, `count` is 0 and
 *   `error` explains why.
 */
export async function extractEntities() {
  if (!existsSync(PATHS.codeGraph)) {
    return { entities: [], count: 0, error: 'Code graph not found' };
  }

  const Database = (await import('better-sqlite3')).default;
  const { applyReadPragmas } = await import('../infrastructure/db-utils.js');
  const db = new Database(PATHS.codeGraph, { readonly: true });

  try {
    applyReadPragmas(db);

    const entities = db.prepare(`
      SELECT name, type, COUNT(*) as count
      FROM entities
      GROUP BY name, type
      ORDER BY count DESC
    `).all();

    return { entities, count: entities.length };
  } finally {
    // Release the handle even when the pragma or the query throws.
    db.close();
  }
}
274
+
275
+ // =============================================================================
276
+ // TEMPLATE GENERATION
277
+ // =============================================================================
278
+
279
/**
 * Build the de-duplicated list of texts to embed.
 *
 * Terms are added in four passes, and that order is the order of the result:
 *   1. every entity name
 *   2. question templates, for entities whose type OR name contains an important type
 *   3. structural templates, for entities whose type contains an important type
 *   4. frequently used queries
 *
 * Entities without a non-empty string name are ignored.
 *
 * @param {Array<{name: string, type?: string}>} entities
 * @param {string[]} frequentQueries
 * @param {{importantTypes: string[], questionTemplates: string[], structuralTemplates: string[]}} [config=CONFIG]
 * @returns {{terms: string[], stats: {names: number, questions: number, structural: number, learned: number}}}
 *   `stats` counts additions attempted per pass, so their sum can exceed `terms.length`.
 */
export function generateAllTerms(entities, frequentQueries, config = CONFIG) {
  const terms = new Set();
  const stats = { names: 0, questions: 0, structural: 0, learned: 0 };

  // A function replacement inserts the name verbatim. A string replacement
  // would expand `$$`, `$&`, ... inside names such as `$$typeof`.
  const fill = (template, name) => template.replace('{name}', () => name);

  const named = (entities ?? [])
    .filter((entity) => typeof entity?.name === 'string' && entity.name !== '')
    .map((entity) => ({
      name: entity.name,
      lowerName: entity.name.toLowerCase(),
      type: typeof entity.type === 'string' ? entity.type.toLowerCase() : '',
    }));

  // 1. Entity names
  for (const { name } of named) {
    terms.add(name);
    stats.names++;
  }

  // 2. Question templates
  for (const { name, lowerName, type } of named) {
    const important = config.importantTypes.some((t) => type.includes(t) || lowerName.includes(t));
    if (!important) continue;
    for (const template of config.questionTemplates) {
      terms.add(fill(template, name));
      stats.questions++;
    }
  }

  // 3. Structural templates (matched on type only)
  for (const { name, type } of named) {
    const important = config.importantTypes.some((t) => type.includes(t));
    if (!important) continue;
    for (const template of config.structuralTemplates) {
      terms.add(fill(template, name));
      stats.structural++;
    }
  }

  // 4. Queries learned from log mining
  for (const query of frequentQueries ?? []) {
    terms.add(query);
    stats.learned++;
  }

  return { terms: Array.from(terms), stats };
}
323
+
324
+ // =============================================================================
325
+ // VOYAGE API
326
+ // =============================================================================
327
+
328
/**
 * Resolve the Voyage provider entry from the shared configuration module.
 * The module is imported on demand, at call time.
 *
 * @returns {Promise<object>} The `EMBEDDING_PROVIDERS.voyage` entry.
 */
async function getVoyageConfig() {
  const configModule = await import('../infrastructure/config/index.js');
  return configModule.EMBEDDING_PROVIDERS.voyage;
}
332
+
333
/**
 * Embed a batch of texts with the Voyage API.
 *
 * @param {string[]} texts - Texts to embed, sent with `input_type: 'query'`.
 * @param {number} [outputDimension=CONFIG.matryoshkaDimension] - Requested embedding width.
 * @returns {Promise<number[][]>} One embedding per input, in input order. An
 *   empty input array resolves to `[]` without a network request.
 * @throws {Error} When the provider is not enabled, the HTTP status is not OK,
 *   or the response does not contain one embedding per input.
 */
export async function callVoyageAPI(texts, outputDimension = CONFIG.matryoshkaDimension) {
  const config = await getVoyageConfig();

  if (!config.enabled) {
    throw new Error('Voyage API not configured. Set VOYAGEAI_API_KEY.');
  }

  if (Array.isArray(texts) && texts.length === 0) {
    return [];
  }

  const response = await fetch(config.endpoint, {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${config.apiKey}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: config.model,
      input: texts,
      input_type: 'query',
      output_dimension: outputDimension, // Matryoshka at API level
    }),
  });

  if (!response.ok) {
    const error = await response.text();
    throw new Error(`Voyage API error: ${response.status} - ${error}`);
  }

  const payload = await response.json();
  const items = payload?.data;
  if (!Array.isArray(items)) {
    throw new Error('Voyage API error: response has no data array');
  }

  // Callers pair embeddings with inputs by position, so a short or long
  // response must fail here rather than yield undefined entries later.
  const expected = Array.isArray(texts) ? texts.length : 1;
  if (items.length !== expected) {
    throw new Error(`Voyage API error: expected ${expected} embeddings, received ${items.length}`);
  }

  // Restore input order from the per-item index when the API provides one.
  const ordered = items.every((item) => Number.isInteger(item?.index))
    ? [...items].sort((a, b) => a.index - b.index)
    : items;

  return ordered.map((item) => item.embedding);
}
362
+
363
+ // =============================================================================
364
+ // MIGRATION: JSON → BINARY
365
+ // =============================================================================
366
+
367
/**
 * One-time migration of the legacy JSON vocabulary to the binary format.
 *
 * Reads PATHS.jsonVocab (expected shape `{ terms: { [term]: number[] } }`),
 * truncates each embedding to CONFIG.matryoshkaDimension and writes the result
 * through BinaryVocabulary.save().
 *
 * @returns {Promise<{error: string} | {ok: true, terms: number, jsonSizeMB: string, binSizeMB: string, reduction: string, ms: number}>}
 *   `{ error }` when there is no JSON file, it holds no usable embeddings, or
 *   its embeddings are narrower than the target dimension. Nothing is written
 *   in those cases.
 */
export async function migrateJsonToBinary() {
  if (!existsSync(PATHS.jsonVocab)) {
    return { error: 'No JSON vocabulary found' };
  }

  const start = Date.now();
  const targetDim = CONFIG.matryoshkaDimension;

  const jsonData = JSON.parse(await fs.readFile(PATHS.jsonVocab, 'utf-8'));
  const terms = jsonData?.terms || {};

  // Entries whose value is not an array cannot be truncated; leave them out
  // instead of failing the whole migration.
  const usable = Object.entries(terms).filter(([, embedding]) => Array.isArray(embedding));

  // Saving an empty map would replace an existing binary vocabulary with nothing.
  if (usable.length === 0) {
    return { error: 'JSON vocabulary contains no terms' };
  }

  const originalDim = usable[0][1].length;
  if (originalDim < targetDim) {
    return { error: `Original dimension ${originalDim}d < target ${targetDim}d` };
  }

  const embeddings = new Map();
  for (const [term, embedding] of usable) {
    embeddings.set(term, embedding.slice(0, targetDim));
  }

  const binaryVocab = new BinaryVocabulary();
  const result = await binaryVocab.save(embeddings);

  const jsonSize = statSync(PATHS.jsonVocab).size;
  const binSize = result.sizeBytes;

  return {
    ok: true,
    terms: result.terms,
    jsonSizeMB: (jsonSize / 1024 / 1024).toFixed(1),
    binSizeMB: (binSize / 1024 / 1024).toFixed(1),
    reduction: ((1 - binSize / jsonSize) * 100).toFixed(1) + '%',
    ms: Date.now() - start,
  };
}
406
+
407
+ // =============================================================================
408
+ // WARMUP: FULL
409
+ // =============================================================================
410
+
411
/**
 * Full warmup: embed entity names, template variants and learned queries that
 * are not yet in the binary vocabulary, then rewrite the vocabulary.
 *
 * @param {object} [options]
 * @param {boolean} [options.force=false] - Ignore the existing vocabulary and embed every term again.
 * @param {boolean} [options.verbose=false] - Print progress to stdout.
 * @returns {Promise<{ok: boolean, terms: number, newTerms: number, ms: number, sizeBytes?: number, failedBatches?: number, error?: string}>}
 *   `newTerms` is the number of terms actually embedded. When every batch
 *   fails, nothing is written and the result has `ok: false` and an `error`.
 */
export async function warmupFull(options = {}) {
  const { force = false, verbose = false } = options;
  const start = Date.now();
  const log = verbose ? console.log : () => {};

  log('Step 1: Mining query logs...');
  const { queries: frequentQueries, total: learnedCount } = await mineQueryLogs();
  log(` Found ${learnedCount} frequent queries`);

  log('Step 2: Extracting entities...');
  const extraction = await extractEntities();
  const entities = extraction.entities;
  const entityCount = extraction.count ?? entities.length;
  log(` Found ${entityCount} entities`);

  log('Step 3: Generating terms...');
  const { terms: allTerms, stats } = generateAllTerms(entities, frequentQueries);
  log(` Names: ${stats.names}, Questions: ${stats.questions}, Structural: ${stats.structural}, Learned: ${stats.learned}`);
  log(` Total unique: ${allTerms.length}`);

  log('Step 4: Loading existing vocabulary...');
  const binaryVocab = new BinaryVocabulary();
  const existingLoaded = force ? false : await binaryVocab.load();

  // New embeddings are stored under the normalized key, so terms that differ
  // only by case or surrounding whitespace need a single API request.
  const pending = new Map(); // normalized key -> text sent to the API
  for (const term of allTerms) {
    if (typeof term !== 'string') continue;
    const key = term.toLowerCase().trim();
    if (key === '' || pending.has(key)) continue;
    if (existingLoaded && binaryVocab.has(term)) continue;
    pending.set(key, term);
  }
  if (existingLoaded) {
    log(` Existing: ${binaryVocab.size()}, New: ${pending.size}`);
  }

  if (pending.size === 0) {
    return { ok: true, terms: binaryVocab.size(), newTerms: 0, ms: Date.now() - start };
  }

  log('Step 5: Generating embeddings...');
  const embeddings = new Map();

  // Carry over existing rows, skipping any the vocabulary cannot return.
  if (existingLoaded) {
    for (const term of binaryVocab.metadata.terms) {
      const vector = binaryVocab.get(term);
      if (vector) embeddings.set(term, vector);
    }
  }

  const pendingEntries = Array.from(pending);
  const batchCount = Math.ceil(pendingEntries.length / CONFIG.batchSize);
  let embeddedCount = 0;
  let failedBatches = 0;

  for (let i = 0; i < batchCount; i++) {
    const batch = pendingEntries.slice(i * CONFIG.batchSize, (i + 1) * CONFIG.batchSize);
    try {
      const vectors = await callVoyageAPI(batch.map(([, text]) => text));
      batch.forEach(([key], j) => {
        const vector = vectors?.[j];
        if (vector != null) {
          embeddings.set(key, vector);
          embeddedCount++;
        }
      });
      if (verbose) {
        process.stdout.write(`\r Batch ${i + 1}/${batchCount}`);
      }
    } catch (err) {
      failedBatches++;
      log(` Batch ${i + 1} failed: ${err.message}`);
    }
    // Pause after failures too, so a rate-limited API is not hit again immediately.
    if (i < batchCount - 1) {
      await new Promise((resolve) => setTimeout(resolve, CONFIG.rateLimitMs));
    }
  }
  if (verbose) console.log();

  // Saving now would rewrite the vocabulary without any new term and, when
  // nothing was loaded (e.g. --force), replace it with an empty one.
  if (embeddedCount === 0) {
    return {
      ok: false,
      error: 'No embeddings could be generated',
      terms: binaryVocab.size(),
      newTerms: 0,
      failedBatches,
      ms: Date.now() - start,
    };
  }

  log('Step 6: Saving binary vocabulary...');
  const result = await binaryVocab.save(embeddings);

  return {
    ok: true,
    terms: result.terms,
    newTerms: embeddedCount,
    failedBatches,
    sizeBytes: result.sizeBytes,
    ms: Date.now() - start,
  };
}
490
+
491
+ // =============================================================================
492
+ // WARMUP: LEARN ONLY
493
+ // =============================================================================
494
+
495
/**
 * Learn-only warmup: embed frequently used queries that are missing from the
 * binary vocabulary and append them to it.
 *
 * @param {object} [options]
 * @param {boolean} [options.verbose=false] - Print progress to stdout.
 * @returns {Promise<{ok: boolean, learned: number, ms: number, total?: number, existing?: number, failedBatches?: number, error?: string}>}
 *   `learned` is the number of queries actually embedded. When every batch
 *   fails, nothing is written and the result has `ok: false` and an `error`.
 */
export async function warmupLearn(options = {}) {
  const { verbose = false } = options;
  const start = Date.now();
  const log = verbose ? console.log : () => {};

  log('Step 1: Mining query logs...');
  const { queries: frequentQueries, total: learnedCount, top10 = [] } = await mineQueryLogs();

  if (learnedCount === 0) {
    return { ok: true, learned: 0, ms: Date.now() - start };
  }

  log(` Found ${learnedCount} frequent queries`);
  if (verbose && top10.length > 0) {
    log(' Top 10:');
    for (const { query, count } of top10) {
      log(` "${query.substring(0, 40)}..." (${count}x)`);
    }
  }

  log('Step 2: Loading existing vocabulary...');
  const binaryVocab = new BinaryVocabulary();
  const loaded = await binaryVocab.load();

  // New embeddings are stored under the normalized key, so queries that differ
  // only by case or surrounding whitespace need a single API request.
  const pending = new Map(); // normalized key -> text sent to the API
  for (const query of frequentQueries) {
    if (typeof query !== 'string') continue;
    const key = query.toLowerCase().trim();
    if (key === '' || pending.has(key)) continue;
    if (loaded && binaryVocab.has(query)) continue;
    pending.set(key, query);
  }
  log(` New queries to embed: ${pending.size}`);

  if (pending.size === 0) {
    return { ok: true, learned: 0, existing: binaryVocab.size(), ms: Date.now() - start };
  }

  log('Step 3: Generating embeddings...');
  const embeddings = new Map();

  // Carry over existing rows, skipping any the vocabulary cannot return.
  if (loaded) {
    for (const term of binaryVocab.metadata.terms) {
      const vector = binaryVocab.get(term);
      if (vector) embeddings.set(term, vector);
    }
  }

  const pendingEntries = Array.from(pending);
  const batchCount = Math.ceil(pendingEntries.length / CONFIG.batchSize);
  let embeddedCount = 0;
  let failedBatches = 0;

  for (let i = 0; i < batchCount; i++) {
    const batch = pendingEntries.slice(i * CONFIG.batchSize, (i + 1) * CONFIG.batchSize);
    try {
      const vectors = await callVoyageAPI(batch.map(([, text]) => text));
      batch.forEach(([key], j) => {
        const vector = vectors?.[j];
        if (vector != null) {
          embeddings.set(key, vector);
          embeddedCount++;
        }
      });
    } catch (err) {
      failedBatches++;
      log(` Batch failed: ${err.message}`);
    }
    // Same pacing as the full warmup, so large query logs respect the API rate limit.
    if (i < batchCount - 1) {
      await new Promise((resolve) => setTimeout(resolve, CONFIG.rateLimitMs));
    }
  }

  // Nothing new to store: leave the files on disk untouched.
  if (embeddedCount === 0) {
    return {
      ok: false,
      error: 'No embeddings could be generated',
      learned: 0,
      existing: binaryVocab.size(),
      failedBatches,
      ms: Date.now() - start,
    };
  }

  log('Step 4: Saving...');
  const result = await binaryVocab.save(embeddings);

  return {
    ok: true,
    learned: embeddedCount,
    total: result.terms,
    failedBatches,
    ms: Date.now() - start,
  };
}
564
+
565
+ // =============================================================================
566
+ // STATS
567
+ // =============================================================================
568
+
569
/**
 * Summarize whichever vocabulary artifacts exist on disk.
 *
 * @returns {Promise<{binary?: object, json?: object, queryLog?: object}>}
 *   `binary` - term count, dimension, size and creation time of the binary vocabulary
 *   `json` - size of the legacy JSON vocabulary
 *   `queryLog` - unique, total and frequent query counts
 *   A section is present only when its file(s) exist.
 */
export async function getStats() {
  const toMB = (bytes) => (bytes / 1024 / 1024).toFixed(1);
  const readJson = async (file) => JSON.parse(await fs.readFile(file, 'utf-8'));
  const report = {};

  // Binary vocabulary: needs both the embedding file and its metadata.
  const hasBinary = existsSync(PATHS.binaryVocab) && existsSync(PATHS.binaryMeta);
  if (hasBinary) {
    const sizeMB = toMB(statSync(PATHS.binaryVocab).size);
    const meta = await readJson(PATHS.binaryMeta);
    report.binary = {
      terms: meta.termCount,
      dimension: meta.dimension,
      sizeMB,
      created: meta.created,
    };
  }

  // Legacy JSON vocabulary.
  if (existsSync(PATHS.jsonVocab)) {
    report.json = { sizeMB: toMB(statSync(PATHS.jsonVocab).size) };
  }

  // Query usage counts.
  if (existsSync(PATHS.queryStats)) {
    const logged = (await readJson(PATHS.queryStats)).queries || {};
    let totalQueries = 0;
    let frequent = 0;
    for (const count of Object.values(logged)) {
      totalQueries = totalQueries + count;
      if (count >= CONFIG.minQueryFrequency) {
        frequent += 1;
      }
    }
    report.queryLog = {
      uniqueQueries: Object.keys(logged).length,
      totalQueries,
      frequent,
    };
  }

  return report;
}
606
+
607
+ // =============================================================================
608
+ // CLI
609
+ // =============================================================================
610
+
611
/**
 * Command-line entry point. Dispatches on the first argument (`migrate`,
 * `learn`, `warmup`, `warmup-inc`, `stats`) and prints usage for anything else.
 *
 * Flags: `--verbose` / `-v` for progress output; `--force` (warmup only) to
 * re-embed every term. A failed migration sets a non-zero exit code.
 */
async function main() {
  const args = process.argv.slice(2);
  const [cmd] = args;
  const verbose = args.includes('--verbose') || args.includes('-v');

  // Each case has its own block so its `const` declarations stay local to it.
  switch (cmd) {
    case 'migrate': {
      console.log('Migrating JSON vocabulary to binary format...');
      const migrateResult = await migrateJsonToBinary();
      if (migrateResult.error) {
        console.error('Error:', migrateResult.error);
        process.exitCode = 1;
      } else {
        console.log(`Done in ${migrateResult.ms}ms`);
        console.log(` Terms: ${migrateResult.terms}`);
        console.log(` JSON: ${migrateResult.jsonSizeMB}MB → Binary: ${migrateResult.binSizeMB}MB`);
        console.log(` Reduction: ${migrateResult.reduction}`);
      }
      break;
    }

    case 'learn': {
      console.log('Learning from query logs...');
      const learnResult = await warmupLearn({ verbose });
      console.log(`Done in ${learnResult.ms}ms`);
      console.log(` Learned: ${learnResult.learned} queries`);
      console.log(` Total vocabulary: ${learnResult.total || learnResult.existing || 0} terms`);
      break;
    }

    case 'warmup': {
      console.log('Full vocabulary warmup (entities + questions + graph + learned)...');
      const warmupResult = await warmupFull({ verbose, force: args.includes('--force') });
      console.log(`Done in ${warmupResult.ms}ms`);
      console.log(` Total: ${warmupResult.terms} terms`);
      console.log(` New: ${warmupResult.newTerms} terms`);
      break;
    }

    case 'warmup-inc': {
      console.log('Incremental warmup (learned queries only)...');
      const incResult = await warmupLearn({ verbose });
      console.log(`Done in ${incResult.ms}ms`);
      console.log(` Learned: ${incResult.learned} queries`);
      break;
    }

    case 'stats': {
      const stats = await getStats();
      console.log('Vocabulary Statistics:');
      if (stats.binary) {
        console.log(` Binary: ${stats.binary.terms} terms (${stats.binary.dimension}d, ${stats.binary.sizeMB}MB)`);
      }
      if (stats.json) {
        console.log(` JSON (legacy): ${stats.json.sizeMB}MB`);
      }
      if (stats.queryLog) {
        console.log(` Query log: ${stats.queryLog.uniqueQueries} unique, ${stats.queryLog.frequent} frequent`);
      }
      break;
    }

    default:
      console.log(`
Vocabulary Utilities v2.0

Usage:
node vocabulary-utils.js migrate Migrate JSON to binary (75% smaller)
node vocabulary-utils.js learn Learn from query logs (self-learning)
node vocabulary-utils.js warmup Full warmup (all templates)
node vocabulary-utils.js warmup-inc Incremental warmup (learned only)
node vocabulary-utils.js stats Show vocabulary statistics

Options:
--verbose, -v Show detailed progress
--force Force regenerate all embeddings
`);
  }
}
684
+
685
// Run the CLI only when this file is the script Node was started with.
// pathToFileURL percent-encodes the path and handles Windows drive letters;
// a hand-built `file://${path}` string matches import.meta.url in neither case.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  main().catch((err) => {
    console.error('Error:', err.message);
    process.exit(1);
  });
}
691
+
692
// Aggregate default export: everything this module offers, for consumers that
// prefer a single namespace-style object over the named exports.
const vocabularyUtilsApi = {
  BinaryVocabulary,
  mineQueryLogs,
  extractEntities,
  generateAllTerms,
  callVoyageAPI,
  migrateJsonToBinary,
  warmupFull,
  warmupLearn,
  getStats,
  CONFIG,
  PATHS,
};

export default vocabularyUtilsApi;
@@ -0,0 +1,13 @@
1
+ {
2
+ "name": "query_router_wasm",
3
+ "description": "Ultra-fast query router for code search (<5μs)",
4
+ "version": "1.0.0",
5
+ "license": "MIT",
6
+ "files": [
7
+ "query_router_wasm_bg.wasm",
8
+ "query_router_wasm.js",
9
+ "query_router_wasm.d.ts"
10
+ ],
11
+ "main": "query_router_wasm.js",
12
+ "types": "query_router_wasm.d.ts"
13
+ }