sweet-search 0.0.1 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +190 -0
- package/NOTICE +23 -0
- package/core/cli.js +51 -0
- package/core/config.js +27 -0
- package/core/embedding/embedding-cache.js +467 -0
- package/core/embedding/embedding-local-model.js +845 -0
- package/core/embedding/embedding-remote.js +492 -0
- package/core/embedding/embedding-service.js +712 -0
- package/core/embedding/embedding-telemetry.js +219 -0
- package/core/embedding/index.js +40 -0
- package/core/graph/community-detector.js +294 -0
- package/core/graph/graph-expansion.js +839 -0
- package/core/graph/graph-extractor.js +2304 -0
- package/core/graph/graph-search.js +2148 -0
- package/core/graph/hcgs-generator.js +666 -0
- package/core/graph/index.js +16 -0
- package/core/graph/leiden-algorithm.js +547 -0
- package/core/graph/relationship-resolver.js +366 -0
- package/core/graph/repo-map.js +408 -0
- package/core/graph/summary-manager.js +549 -0
- package/core/indexing/artifact-builder.js +1054 -0
- package/core/indexing/ast-chunker.js +709 -0
- package/core/indexing/chunking/chunk-builder.js +170 -0
- package/core/indexing/chunking/markdown-chunker.js +503 -0
- package/core/indexing/chunking/plaintext-chunker.js +104 -0
- package/core/indexing/dedup/dedup-phase.js +159 -0
- package/core/indexing/dedup/exemplar-selector.js +65 -0
- package/core/indexing/document-chunker.js +56 -0
- package/core/indexing/incremental-parser.js +390 -0
- package/core/indexing/incremental-tracker.js +761 -0
- package/core/indexing/index-codebase-v21.js +472 -0
- package/core/indexing/index-maintainer.mjs +1674 -0
- package/core/indexing/index.js +90 -0
- package/core/indexing/indexer-ann.js +1077 -0
- package/core/indexing/indexer-build.js +742 -0
- package/core/indexing/indexer-phases.js +800 -0
- package/core/indexing/indexer-pool.js +764 -0
- package/core/indexing/indexer-sparse-gram.js +98 -0
- package/core/indexing/indexer-utils.js +536 -0
- package/core/indexing/indexer-worker.js +148 -0
- package/core/indexing/li-skip-policy.js +225 -0
- package/core/indexing/merkle-tracker.js +244 -0
- package/core/indexing/model-pool.js +166 -0
- package/core/infrastructure/code-graph-repository.js +120 -0
- package/core/infrastructure/codebase-repository.js +131 -0
- package/core/infrastructure/config/dedup.js +54 -0
- package/core/infrastructure/config/embedding.js +298 -0
- package/core/infrastructure/config/graph.js +80 -0
- package/core/infrastructure/config/index.js +82 -0
- package/core/infrastructure/config/indexing.js +8 -0
- package/core/infrastructure/config/platform.js +254 -0
- package/core/infrastructure/config/ranking.js +221 -0
- package/core/infrastructure/config/search.js +396 -0
- package/core/infrastructure/config/translation.js +89 -0
- package/core/infrastructure/config/vector-store.js +114 -0
- package/core/infrastructure/constants.js +86 -0
- package/core/infrastructure/coreml-cascade.js +909 -0
- package/core/infrastructure/coreml-cascade.json +46 -0
- package/core/infrastructure/coreml-provider.js +81 -0
- package/core/infrastructure/db-utils.js +69 -0
- package/core/infrastructure/dedup-hashing.js +83 -0
- package/core/infrastructure/hardware-capability.js +332 -0
- package/core/infrastructure/index.js +104 -0
- package/core/infrastructure/language-patterns/maps.js +121 -0
- package/core/infrastructure/language-patterns/registry-core.js +323 -0
- package/core/infrastructure/language-patterns/registry-data-query.js +155 -0
- package/core/infrastructure/language-patterns/registry-object-oriented.js +285 -0
- package/core/infrastructure/language-patterns/registry-tooling.js +240 -0
- package/core/infrastructure/language-patterns/registry-web-style.js +143 -0
- package/core/infrastructure/language-patterns/registry.js +19 -0
- package/core/infrastructure/language-patterns.js +141 -0
- package/core/infrastructure/llm-provider.js +733 -0
- package/core/infrastructure/manifest.json +46 -0
- package/core/infrastructure/maxsim.wasm +0 -0
- package/core/infrastructure/model-fetcher.js +423 -0
- package/core/infrastructure/model-registry.js +214 -0
- package/core/infrastructure/native-inference.js +587 -0
- package/core/infrastructure/native-resolver.js +187 -0
- package/core/infrastructure/native-sparse-gram.js +257 -0
- package/core/infrastructure/native-tokenizer.js +160 -0
- package/core/infrastructure/onnx-mutex.js +45 -0
- package/core/infrastructure/onnx-session-utils.js +261 -0
- package/core/infrastructure/ort-pipeline.js +111 -0
- package/core/infrastructure/project-detector.js +102 -0
- package/core/infrastructure/quantization.js +410 -0
- package/core/infrastructure/simd-distance.js +502 -0
- package/core/infrastructure/simd-distance.wasm +0 -0
- package/core/infrastructure/tree-sitter-provider.js +665 -0
- package/core/infrastructure/webgpu-maxsim.js +222 -0
- package/core/query/index.js +35 -0
- package/core/query/intent-detector.js +201 -0
- package/core/query/intent-router.js +156 -0
- package/core/query/query-router-catboost.js +222 -0
- package/core/query/query-router-ml.js +266 -0
- package/core/query/query-router.js +213 -0
- package/core/ranking/cascaded-scorer.js +379 -0
- package/core/ranking/flashrank.js +810 -0
- package/core/ranking/index.js +49 -0
- package/core/ranking/late-interaction-index.js +2383 -0
- package/core/ranking/late-interaction-model.js +812 -0
- package/core/ranking/local-reranker.js +374 -0
- package/core/ranking/mmr.js +379 -0
- package/core/ranking/quality-scorer.js +363 -0
- package/core/search/context-expander.js +1167 -0
- package/core/search/dedup/sibling-expander.js +327 -0
- package/core/search/index.js +16 -0
- package/core/search/search-boost.js +259 -0
- package/core/search/search-cli.js +544 -0
- package/core/search/search-format.js +282 -0
- package/core/search/search-fusion.js +327 -0
- package/core/search/search-hybrid.js +204 -0
- package/core/search/search-pattern-chunks.js +337 -0
- package/core/search/search-pattern-planner.js +439 -0
- package/core/search/search-pattern-prefilter.js +412 -0
- package/core/search/search-pattern-ripgrep.js +663 -0
- package/core/search/search-pattern.js +463 -0
- package/core/search/search-postprocess.js +452 -0
- package/core/search/search-semantic.js +706 -0
- package/core/search/search-server.js +554 -0
- package/core/search/session-daemon-prewarm.mjs +164 -0
- package/core/search/session-warmup.js +595 -0
- package/core/search/sweet-search.js +632 -0
- package/core/search/warmup-metrics.js +532 -0
- package/core/start-server.js +6 -0
- package/core/training/query-router/features/extractor.js +762 -0
- package/core/training/query-router/features/multilingual-patterns.js +431 -0
- package/core/training/query-router/features/text-segmenter.js +303 -0
- package/core/training/query-router/features/unicode-utils.js +383 -0
- package/core/training/query-router/output/v45_router_d4.js +11521 -0
- package/core/training/query-router/output/v46_router_d4.js +11498 -0
- package/core/vector-store/binary-heap.js +227 -0
- package/core/vector-store/binary-hnsw-index.js +1004 -0
- package/core/vector-store/float-vector-store.js +234 -0
- package/core/vector-store/hnsw-index.js +580 -0
- package/core/vector-store/index.js +39 -0
- package/core/vector-store/seismic-index.js +498 -0
- package/core/vocabulary/index.js +84 -0
- package/core/vocabulary/vocab-constants.js +20 -0
- package/core/vocabulary/vocab-miner-extractors.js +375 -0
- package/core/vocabulary/vocab-miner-nl.js +404 -0
- package/core/vocabulary/vocab-miner-utils.js +146 -0
- package/core/vocabulary/vocab-miner.js +574 -0
- package/core/vocabulary/vocab-prewarm-cli.js +110 -0
- package/core/vocabulary/vocab-ranker.js +492 -0
- package/core/vocabulary/vocab-warmer.js +523 -0
- package/core/vocabulary/vocab-warmup-orchestrator.js +425 -0
- package/core/vocabulary/vocabulary-utils.js +704 -0
- package/crates/wasm-router/pkg/package.json +13 -0
- package/crates/wasm-router/pkg/query_router_wasm.d.ts +36 -0
- package/crates/wasm-router/pkg/query_router_wasm.js +271 -0
- package/crates/wasm-router/pkg/query_router_wasm_bg.wasm +0 -0
- package/crates/wasm-router/pkg/query_router_wasm_bg.wasm.d.ts +19 -0
- package/mcp/config-gen.js +121 -0
- package/mcp/server.js +335 -0
- package/mcp/tool-handlers.js +476 -0
- package/package.json +131 -9
- package/scripts/benchmark-harness.js +794 -0
- package/scripts/init.js +1058 -0
- package/scripts/smoke-test.js +435 -0
- package/scripts/uninstall.js +478 -0
- package/scripts/verify-runtime.js +176 -0
|
@@ -0,0 +1,704 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Vocabulary Utilities v2.0 - Binary Format + Query Mining + Graph Templates
|
|
5
|
+
*
|
|
6
|
+
* SOTA December 2025 improvements:
|
|
7
|
+
* 1. Query Log Mining - Auto-learns from your frequent queries
|
|
8
|
+
* 2. Matryoshka Truncation - 256d (75% smaller, 4x faster load)
|
|
9
|
+
* 3. Binary mmap Format - Zero-load vocabulary via Float32Array
|
|
10
|
+
* 4. Graph-Aware Templates - Pre-warms structural queries
|
|
11
|
+
*
|
|
12
|
+
* Usage:
|
|
13
|
+
* node vocabulary-utils.js migrate # Migrate JSON to binary (one-time)
|
|
14
|
+
* node vocabulary-utils.js learn # Learn from query logs
|
|
15
|
+
* node vocabulary-utils.js warmup # Full warmup (entities + questions + graph)
|
|
16
|
+
* node vocabulary-utils.js warmup-inc # Incremental warmup (changed files only)
|
|
17
|
+
* node vocabulary-utils.js stats # Show vocabulary statistics
|
|
18
|
+
*
|
|
19
|
+
* References:
|
|
20
|
+
* - https://blog.voyageai.com/2024/12/04/voyage-code-3/
|
|
21
|
+
* - https://docs.voyageai.com/docs/flexible-dimensions-and-quantization
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import fs from 'fs/promises';
|
|
25
|
+
import { existsSync, statSync } from 'fs';
|
|
26
|
+
import path from 'path';
|
|
27
|
+
import { PROJECT_ROOT } from '../infrastructure/config/index.js';
|
|
28
|
+
|
|
29
|
+
// =============================================================================
|
|
30
|
+
// CONFIGURATION
|
|
31
|
+
// =============================================================================
|
|
32
|
+
|
|
33
|
+
const CONFIG = {
  // Width every stored embedding is truncated to (Matryoshka prefix).
  // 256 of 1024 dimensions keeps one quarter of the data, i.e. 75% smaller.
  matryoshkaDimension: 256,

  // A logged query is promoted into the vocabulary once its count reaches this value.
  minQueryFrequency: 3,

  // Relationship-style phrasings pre-warmed for graph queries.
  // `{name}` is substituted with an entity name.
  structuralTemplates: [
    'what calls {name}',
    'who calls {name}',
    '{name} callers',
    'what does {name} call',
    '{name} dependencies',
    'implementations of {name}',
    '{name} implementors',
    'what extends {name}',
    '{name} subclasses',
    'impact of changing {name}',
  ],

  // General question phrasings; `{name}` is substituted with an entity name.
  questionTemplates: [
    'what is {name}',
    'how does {name} work',
    'where is {name} defined',
    'where is {name} used',
    '{name} implementation',
  ],

  // Keywords that mark an entity as important enough to receive template variants.
  importantTypes: ['class', 'interface', 'service', 'controller', 'repository'],

  // Number of texts sent to the embedding API per request.
  batchSize: 128,

  // Pause between embedding requests: 200 ms spacing = at most 300 requests per minute.
  rateLimitMs: 200,
};
|
|
72
|
+
|
|
73
|
+
// =============================================================================
|
|
74
|
+
// PATH CONFIGURATION
|
|
75
|
+
// =============================================================================
|
|
76
|
+
|
|
77
|
+
// PROJECT_ROOT is supplied by the imported infrastructure config module.
// Every vocabulary artifact lives in the `.sweet-search` directory beneath it.
const PATHS = (() => {
  // Resolve a file name inside <PROJECT_ROOT>/.sweet-search.
  const inDataDir = (fileName) => path.join(PROJECT_ROOT, '.sweet-search', fileName);

  return {
    jsonVocab: inDataDir('query-vocabulary.json'),
    binaryVocab: inDataDir('vocabulary.bin'),
    binaryMeta: inDataDir('vocabulary.meta.json'),
    queryStats: inDataDir('query-vocabulary-stats.json'),
    codeGraph: inDataDir('code-graph.db'),
  };
})();
|
|
86
|
+
|
|
87
|
+
// =============================================================================
|
|
88
|
+
// BINARY VOCABULARY CLASS
|
|
89
|
+
// =============================================================================
|
|
90
|
+
|
|
91
|
+
/**
 * Binary vocabulary store with Matryoshka-truncated embeddings.
 *
 * On-disk layout (two files):
 *   PATHS.binaryVocab
 *     Header (32 bytes): "VCAB" magic, then uint32 LE version, dimension and
 *                        term count; the remaining header bytes are zero.
 *     Embeddings: termCount * dimension float32 LE values; row i belongs to terms[i].
 *   PATHS.binaryMeta
 *     JSON: { version, dimension, termCount, created, terms }
 *
 * The whole binary file is read into memory by load(); get() decodes one row
 * into a plain number array. Lookups are case-insensitive and ignore
 * surrounding whitespace.
 */
export class BinaryVocabulary {
  constructor() {
    this.buffer = null;
    this.metadata = null;
    this.termIndex = null;
    this.dimension = CONFIG.matryoshkaDimension;
  }

  /**
   * Canonical lookup key for a term. Used both when building the index and
   * when querying it, so the two can never disagree.
   * @param {string} term
   * @returns {string} lower-cased, trimmed term
   */
  static normalizeTerm(term) {
    return String(term).toLowerCase().trim();
  }

  /**
   * Load the vocabulary files into memory (no-op when already loaded).
   *
   * @returns {Promise<boolean|{ok: true, terms: number, dimension: number, ms: number}>}
   *   `true` if already loaded, `false` if the files are missing or do not
   *   describe a consistent vocabulary, otherwise a summary object (truthy).
   * @throws if a file cannot be read or the metadata is not valid JSON
   */
  async load() {
    if (this.buffer) return true;

    if (!existsSync(PATHS.binaryVocab) || !existsSync(PATHS.binaryMeta)) {
      return false;
    }

    const start = Date.now();
    const headerSize = 32;

    const metadata = JSON.parse(await fs.readFile(PATHS.binaryMeta, 'utf-8'));
    const buffer = await fs.readFile(PATHS.binaryVocab);

    // Reject files that cannot hold every row the metadata promises; otherwise
    // get() would fail later with an opaque out-of-range read.
    const terms = Array.isArray(metadata?.terms) ? metadata.terms : null;
    const dimension = metadata?.dimension;
    const isConsistent =
      terms !== null &&
      Number.isInteger(dimension) &&
      dimension > 0 &&
      buffer.length >= headerSize + terms.length * dimension * 4 &&
      buffer.toString('latin1', 0, 4) === 'VCAB';
    if (!isConsistent) {
      return false;
    }

    // Build the normalized term -> row lookup (a later duplicate wins).
    const termIndex = new Map();
    terms.forEach((term, row) => {
      termIndex.set(BinaryVocabulary.normalizeTerm(term), row);
    });

    // Commit state only after everything validated, so a failed load leaves
    // the instance untouched.
    this.metadata = metadata;
    this.dimension = dimension;
    this.termIndex = termIndex;
    this.buffer = buffer;

    return { ok: true, terms: termIndex.size, dimension, ms: Date.now() - start };
  }

  /**
   * Look up the embedding of a term.
   * @param {string} term
   * @returns {number[]|null} a fresh array of `dimension` numbers, or null when
   *   nothing is loaded or the term is unknown
   */
  get(term) {
    if (!this.buffer || !this.termIndex) return null;

    const row = this.termIndex.get(BinaryVocabulary.normalizeTerm(term));
    if (row === undefined) return null;

    // Skip the 32-byte header, then `row` rows of `dimension` 4-byte floats.
    const headerSize = 32;
    const offset = headerSize + row * this.dimension * 4;

    // readFloatLE matches the little-endian layout written by save() and,
    // unlike a Float32Array view, has no 4-byte alignment requirement.
    const embedding = new Array(this.dimension);
    for (let j = 0; j < this.dimension; j++) {
      embedding[j] = this.buffer.readFloatLE(offset + j * 4);
    }
    return embedding;
  }

  /**
   * @param {string} term
   * @returns {boolean} whether the loaded vocabulary contains the term
   */
  has(term) {
    return this.termIndex?.has(BinaryVocabulary.normalizeTerm(term)) || false;
  }

  /** @returns {number} number of indexed terms (0 when nothing is loaded) */
  size() {
    return this.termIndex?.size || 0;
  }

  /**
   * Write a vocabulary to disk, replacing both files.
   *
   * Vectors longer than CONFIG.matryoshkaDimension are truncated, shorter ones
   * are zero-padded. Entries without a vector are skipped instead of aborting
   * the whole save. The in-memory state of this instance is not refreshed.
   *
   * @param {Map<string, ArrayLike<number>>} embeddings term -> vector
   * @returns {Promise<{terms: number, sizeBytes: number}>}
   */
  async save(embeddings) {
    const dimension = CONFIG.matryoshkaDimension;
    const headerSize = 32;

    // Collect only the entries that actually carry a vector.
    const terms = [];
    const vectors = [];
    for (const term of embeddings.keys()) {
      const vector = embeddings.get(term);
      if (vector == null || typeof vector.length !== 'number') continue;
      terms.push(term);
      vectors.push(vector);
    }

    const fullBuffer = Buffer.alloc(headerSize + terms.length * dimension * 4);

    // Header
    fullBuffer.write('VCAB', 0, 'latin1');
    fullBuffer.writeUInt32LE(2, 4); // Version
    fullBuffer.writeUInt32LE(dimension, 8);
    fullBuffer.writeUInt32LE(terms.length, 12);

    // Rows: reading only the first `dimension` components truncates; missing
    // components fall back to 0, which pads.
    vectors.forEach((vector, row) => {
      const rowOffset = headerSize + row * dimension * 4;
      for (let j = 0; j < dimension; j++) {
        fullBuffer.writeFloatLE(vector[j] || 0, rowOffset + j * 4);
      }
    });

    await fs.mkdir(path.dirname(PATHS.binaryVocab), { recursive: true });
    await fs.writeFile(PATHS.binaryVocab, fullBuffer);

    const metadata = {
      version: 2,
      dimension,
      termCount: terms.length,
      created: new Date().toISOString(),
      terms,
    };
    await fs.writeFile(PATHS.binaryMeta, JSON.stringify(metadata, null, 2));

    return { terms: terms.length, sizeBytes: fullBuffer.length };
  }
}
|
|
214
|
+
|
|
215
|
+
// =============================================================================
|
|
216
|
+
// QUERY LOG MINING
|
|
217
|
+
// =============================================================================
|
|
218
|
+
|
|
219
|
+
/**
 * Mine the query statistics file for queries used often enough to pre-warm.
 *
 * Reads PATHS.queryStats, expected to hold `{ queries: { [query]: count } }`,
 * and keeps every query whose count is at least CONFIG.minQueryFrequency.
 *
 * @returns {Promise<{queries: string[], total: number, top10: {query: string, count: number}[]}>}
 *   `queries` is sorted by descending count, `total` is its length and `top10`
 *   holds the ten most frequent entries with their counts. All three keys are
 *   present even when no statistics file exists.
 * @throws if the statistics file exists but is not valid JSON
 */
export async function mineQueryLogs() {
  if (!existsSync(PATHS.queryStats)) {
    return { queries: [], total: 0, top10: [] };
  }

  const data = JSON.parse(await fs.readFile(PATHS.queryStats, 'utf-8'));
  // `data` may be null (a JSON document containing just `null`).
  const counts = data?.queries || {};

  const frequentQueries = Object.entries(counts)
    .filter(([, count]) => count >= CONFIG.minQueryFrequency)
    .map(([query, count]) => ({ query, count }))
    .sort((a, b) => b.count - a.count); // most used first

  return {
    queries: frequentQueries.map(({ query }) => query),
    total: frequentQueries.length,
    top10: frequentQueries.slice(0, 10),
  };
}
|
|
248
|
+
|
|
249
|
+
// =============================================================================
|
|
250
|
+
// ENTITY EXTRACTION
|
|
251
|
+
// =============================================================================
|
|
252
|
+
|
|
253
|
+
/**
 * Read the distinct (name, type) pairs from the code graph database.
 *
 * @returns {Promise<{entities: {name: string, type: string, count: number}[], count: number, error?: string}>}
 *   Rows are ordered by how many times the pair occurs, most frequent first.
 *   When the database file does not exist the result is empty and carries an
 *   `error` message; `count` is always present.
 * @throws if the database cannot be opened or queried
 */
export async function extractEntities() {
  if (!existsSync(PATHS.codeGraph)) {
    return { entities: [], count: 0, error: 'Code graph not found' };
  }

  // Loaded lazily so that importing this module does not require the native
  // SQLite binding.
  const Database = (await import('better-sqlite3')).default;
  const { applyReadPragmas } = await import('../infrastructure/db-utils.js');
  const db = new Database(PATHS.codeGraph, { readonly: true });

  try {
    applyReadPragmas(db);

    const entities = db.prepare(`
      SELECT name, type, COUNT(*) as count
      FROM entities
      GROUP BY name, type
      ORDER BY count DESC
    `).all();

    return { entities, count: entities.length };
  } finally {
    // Release the handle even when the pragma or the query throws.
    db.close();
  }
}
|
|
274
|
+
|
|
275
|
+
// =============================================================================
|
|
276
|
+
// TEMPLATE GENERATION
|
|
277
|
+
// =============================================================================
|
|
278
|
+
|
|
279
|
+
/**
 * Build the full list of terms to pre-warm.
 *
 * Terms are collected in four passes, and the output keeps that order:
 *   1. every entity name;
 *   2. question templates for entities whose type OR name contains one of
 *      CONFIG.importantTypes;
 *   3. structural templates for entities whose type contains one of
 *      CONFIG.importantTypes;
 *   4. the frequent queries learned from the query log.
 *
 * Entities without a non-blank string name are ignored.
 *
 * @param {{name: string, type?: string}[]} entities
 * @param {string[]} [frequentQueries]
 * @returns {{terms: string[], stats: {names: number, questions: number, structural: number, learned: number}}}
 *   `terms` is de-duplicated; `stats` counts every candidate produced per
 *   pass, duplicates included.
 */
export function generateAllTerms(entities, frequentQueries) {
  const terms = new Set();
  const stats = { names: 0, questions: 0, structural: 0, learned: 0 };

  // A replacer function inserts the name verbatim. A plain replacement string
  // would expand `$$`, `$&` etc., mangling identifiers such as `$$typeof`.
  const fill = (template, name) => template.replace('{name}', () => name);

  const named = (entities ?? [])
    .filter((entity) => typeof entity?.name === 'string' && entity.name.trim() !== '')
    .map((entity) => ({
      name: entity.name,
      type: entity.type?.toLowerCase() || '',
      lowerName: entity.name.toLowerCase(),
    }));

  // 1. Add all entity names
  for (const { name } of named) {
    terms.add(name);
    stats.names++;
  }

  // 2. Add question templates for important types (matched on type or name)
  for (const { name, type, lowerName } of named) {
    const isImportant = CONFIG.importantTypes.some((t) => type.includes(t) || lowerName.includes(t));
    if (!isImportant) continue;
    for (const template of CONFIG.questionTemplates) {
      terms.add(fill(template, name));
      stats.questions++;
    }
  }

  // 3. Add graph-aware structural templates (matched on type only)
  for (const { name, type } of named) {
    const isImportant = CONFIG.importantTypes.some((t) => type.includes(t));
    if (!isImportant) continue;
    for (const template of CONFIG.structuralTemplates) {
      terms.add(fill(template, name));
      stats.structural++;
    }
  }

  // 4. Add frequently used queries from log mining
  for (const query of frequentQueries ?? []) {
    terms.add(query);
    stats.learned++;
  }

  return { terms: Array.from(terms), stats };
}
|
|
323
|
+
|
|
324
|
+
// =============================================================================
|
|
325
|
+
// VOYAGE API
|
|
326
|
+
// =============================================================================
|
|
327
|
+
|
|
328
|
+
/**
 * Resolve the Voyage entry of EMBEDDING_PROVIDERS.
 * The config module is imported on demand, at call time.
 * @returns {Promise<object>} the `voyage` provider configuration
 */
async function getVoyageConfig() {
  const configModule = await import('../infrastructure/config/index.js');
  return configModule.EMBEDDING_PROVIDERS.voyage;
}
|
|
332
|
+
|
|
333
|
+
/**
 * Embed a batch of texts through the configured Voyage endpoint.
 *
 * @param {string[]|string} texts texts to embed (sent as `input`)
 * @param {number} [outputDimension=CONFIG.matryoshkaDimension] requested
 *   embedding width (sent as `output_dimension`)
 * @returns {Promise<number[][]>} one embedding per input text, in input order
 * @throws {Error} when the provider is not enabled, the HTTP request fails, or
 *   the response does not contain exactly one embedding per input
 */
export async function callVoyageAPI(texts, outputDimension = CONFIG.matryoshkaDimension) {
  const config = await getVoyageConfig();

  if (!config?.enabled) {
    throw new Error('Voyage API not configured. Set VOYAGEAI_API_KEY.');
  }

  // An empty batch needs no round trip.
  if (Array.isArray(texts) && texts.length === 0) {
    return [];
  }
  const expectedCount = Array.isArray(texts) ? texts.length : 1;

  const response = await fetch(config.endpoint, {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${config.apiKey}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: config.model,
      input: texts,
      input_type: 'query',
      output_dimension: outputDimension, // Matryoshka at API level
    }),
  });

  if (!response.ok) {
    const error = await response.text();
    throw new Error(`Voyage API error: ${response.status} - ${error}`);
  }

  const payload = await response.json();
  const items = payload?.data;
  if (!Array.isArray(items)) {
    throw new Error('Voyage API error: response has no "data" array');
  }

  // Callers pair results with inputs by position, so honour the per-item
  // `index` field when every item carries one.
  const ordered = items.every((item) => Number.isInteger(item?.index))
    ? [...items].sort((a, b) => a.index - b.index)
    : items;
  const vectors = ordered.map((item) => item?.embedding);

  if (vectors.length !== expectedCount) {
    throw new Error(`Voyage API error: expected ${expectedCount} embeddings, received ${vectors.length}`);
  }
  if (vectors.some((vector) => !Array.isArray(vector))) {
    throw new Error('Voyage API error: response item without an embedding array');
  }

  return vectors;
}
|
|
362
|
+
|
|
363
|
+
// =============================================================================
|
|
364
|
+
// MIGRATION: JSON → BINARY
|
|
365
|
+
// =============================================================================
|
|
366
|
+
|
|
367
|
+
/**
 * One-time migration of the legacy JSON vocabulary (PATHS.jsonVocab, shaped
 * `{ terms: { [term]: number[] } }`) to the binary format.
 *
 * Every embedding is truncated to CONFIG.matryoshkaDimension. Entries that are
 * not arrays, or are shorter than the target dimension, are skipped. Nothing
 * is written when no usable embedding remains, so an empty or malformed JSON
 * file cannot replace an existing binary vocabulary with an empty one.
 *
 * @returns {Promise<{error: string} | {ok: true, terms: number, skipped: number,
 *   jsonSizeMB: string, binSizeMB: string, reduction: string, ms: number}>}
 * @throws if the JSON file cannot be read or parsed, or the save fails
 */
export async function migrateJsonToBinary() {
  if (!existsSync(PATHS.jsonVocab)) {
    return { error: 'No JSON vocabulary found' };
  }

  const start = Date.now();
  const targetDim = CONFIG.matryoshkaDimension;

  // Load JSON
  const jsonData = JSON.parse(await fs.readFile(PATHS.jsonVocab, 'utf-8'));
  const entries = Object.entries(jsonData?.terms || {});

  // The first entry is taken as representative of the source dimension.
  const originalDim = entries[0]?.[1]?.length || 1024;
  if (originalDim < targetDim) {
    return { error: `Original dimension ${originalDim}d < target ${targetDim}d` };
  }

  // Truncate and convert to Map, dropping entries that cannot be truncated.
  const embeddings = new Map();
  let skipped = 0;
  for (const [term, embedding] of entries) {
    if (!Array.isArray(embedding) || embedding.length < targetDim) {
      skipped++;
      continue;
    }
    embeddings.set(term, embedding.slice(0, targetDim));
  }

  if (embeddings.size === 0) {
    return { error: 'JSON vocabulary contains no usable embeddings' };
  }

  // Save binary
  const binaryVocab = new BinaryVocabulary();
  const result = await binaryVocab.save(embeddings);

  // Get sizes for comparison
  const jsonSize = statSync(PATHS.jsonVocab).size;
  const binSize = result.sizeBytes;
  const toMB = (bytes) => (bytes / 1024 / 1024).toFixed(1);

  return {
    ok: true,
    terms: result.terms,
    skipped,
    jsonSizeMB: toMB(jsonSize),
    binSizeMB: toMB(binSize),
    reduction: `${((1 - binSize / jsonSize) * 100).toFixed(1)}%`,
    ms: Date.now() - start,
  };
}
|
|
406
|
+
|
|
407
|
+
// =============================================================================
|
|
408
|
+
// WARMUP: FULL
|
|
409
|
+
// =============================================================================
|
|
410
|
+
|
|
411
|
+
/**
 * Full warmup: mine query logs, extract entities, generate all candidate
 * terms, embed the ones not yet in the vocabulary and save the result.
 *
 * Failed batches are counted and reported instead of being silently dropped.
 * The vocabulary files are left untouched when no new embedding could be
 * produced, so an API outage (or a missing API key) can never replace an
 * existing vocabulary with an empty one, even with `force`.
 *
 * @param {{force?: boolean, verbose?: boolean}} [options]
 *   `force` ignores the existing vocabulary and re-embeds every term;
 *   `verbose` prints progress to stdout.
 * @returns {Promise<object>}
 *   nothing to do: `{ ok: true, terms, newTerms: 0, ms }`;
 *   saved:         `{ ok: true, terms, newTerms, failedBatches, sizeBytes, ms }`
 *                  where `newTerms` counts the terms actually embedded;
 *   nothing embedded: `{ ok: false, error, terms, newTerms: 0, failedBatches, ms }`.
 */
export async function warmupFull(options = {}) {
  const { force = false, verbose = false } = options;
  const start = Date.now();
  const log = verbose ? console.log : () => {};

  log('Step 1: Mining query logs...');
  const { queries: frequentQueries, total: learnedCount } = await mineQueryLogs();
  log(` Found ${learnedCount} frequent queries`);

  log('Step 2: Extracting entities...');
  const extraction = await extractEntities();
  const entities = extraction.entities ?? [];
  if (extraction.error) {
    log(` ${extraction.error}`);
  }
  log(` Found ${extraction.count ?? entities.length} entities`);

  log('Step 3: Generating terms...');
  const { terms: allTerms, stats } = generateAllTerms(entities, frequentQueries);
  log(` Names: ${stats.names}, Questions: ${stats.questions}, Structural: ${stats.structural}, Learned: ${stats.learned}`);
  log(` Total unique: ${allTerms.length}`);

  log('Step 4: Loading existing vocabulary...');
  const binaryVocab = new BinaryVocabulary();
  const existingLoaded = force ? false : await binaryVocab.load();

  // Find new terms
  let newTerms = allTerms;
  if (existingLoaded) {
    newTerms = allTerms.filter((t) => !binaryVocab.has(t));
    log(` Existing: ${binaryVocab.size()}, New: ${newTerms.length}`);
  }

  if (newTerms.length === 0) {
    return { ok: true, terms: binaryVocab.size(), newTerms: 0, ms: Date.now() - start };
  }

  log('Step 5: Generating embeddings...');
  const embeddings = new Map();

  // Copy existing entries, skipping any whose vector cannot be read back.
  if (existingLoaded) {
    for (const term of binaryVocab.metadata.terms) {
      const vector = binaryVocab.get(term);
      if (vector) embeddings.set(term, vector);
    }
  }

  // Generate new in batches
  const batchCount = Math.ceil(newTerms.length / CONFIG.batchSize);
  let embeddedCount = 0;
  let failedBatches = 0;

  for (let i = 0; i < batchCount; i++) {
    const batch = newTerms.slice(i * CONFIG.batchSize, (i + 1) * CONFIG.batchSize);
    try {
      const batchEmbeddings = await callVoyageAPI(batch);
      for (let j = 0; j < batch.length; j++) {
        const vector = batchEmbeddings?.[j];
        if (!vector) continue; // short response: never store a hole
        embeddings.set(batch[j].toLowerCase().trim(), vector);
        embeddedCount++;
      }
      if (verbose) {
        process.stdout.write(`\r Batch ${i + 1}/${batchCount}`);
      }
    } catch (err) {
      failedBatches++;
      log(` Batch ${i + 1} failed: ${err.message}`);
    }

    // Pace requests after failures too, so a rate-limit error is not
    // immediately followed by the next request.
    if (i < batchCount - 1) {
      await new Promise((resolve) => setTimeout(resolve, CONFIG.rateLimitMs));
    }
  }
  if (verbose) console.log();

  if (embeddedCount === 0) {
    return {
      ok: false,
      error: `No embeddings were generated (${failedBatches} of ${batchCount} batches failed)`,
      terms: binaryVocab.size(),
      newTerms: 0,
      failedBatches,
      ms: Date.now() - start,
    };
  }

  log('Step 6: Saving binary vocabulary...');
  const result = await binaryVocab.save(embeddings);

  return {
    ok: true,
    terms: result.terms,
    newTerms: embeddedCount,
    failedBatches,
    sizeBytes: result.sizeBytes,
    ms: Date.now() - start,
  };
}
|
|
490
|
+
|
|
491
|
+
// =============================================================================
|
|
492
|
+
// WARMUP: LEARN ONLY
|
|
493
|
+
// =============================================================================
|
|
494
|
+
|
|
495
|
+
/**
 * Learn-only warmup: embed frequently used queries from the query log that
 * are not yet in the vocabulary, and save them alongside the existing terms.
 *
 * Failed batches are counted and reported. The vocabulary files are left
 * untouched when no new embedding could be produced. Requests are spaced by
 * CONFIG.rateLimitMs, the same pacing the full warmup uses.
 *
 * @param {{verbose?: boolean}} [options] `verbose` prints progress to stdout
 * @returns {Promise<object>}
 *   no frequent queries: `{ ok: true, learned: 0, ms }`;
 *   all already known:   `{ ok: true, learned: 0, existing, ms }`;
 *   saved:               `{ ok: true, learned, total, failedBatches, ms }`
 *                        where `learned` counts the queries actually embedded;
 *   nothing embedded:    `{ ok: false, error, learned: 0, existing, failedBatches, ms }`.
 */
export async function warmupLearn(options = {}) {
  const { verbose = false } = options;
  const start = Date.now();
  const log = verbose ? console.log : () => {};

  log('Step 1: Mining query logs...');
  const { queries: frequentQueries, total: learnedCount, top10 = [] } = await mineQueryLogs();

  if (learnedCount === 0) {
    return { ok: true, learned: 0, ms: Date.now() - start };
  }

  log(` Found ${learnedCount} frequent queries`);
  if (top10.length > 0) {
    log(' Top 10:');
    for (const { query, count } of top10) {
      // Only mark the preview as shortened when it really was.
      const preview = query.length > 40 ? `${query.substring(0, 40)}...` : query;
      log(` "${preview}" (${count}x)`);
    }
  }

  log('Step 2: Loading existing vocabulary...');
  const binaryVocab = new BinaryVocabulary();
  const loaded = await binaryVocab.load();

  // Find queries not in vocabulary
  const newQueries = frequentQueries.filter((q) => !loaded || !binaryVocab.has(q));
  log(` New queries to embed: ${newQueries.length}`);

  if (newQueries.length === 0) {
    return { ok: true, learned: 0, existing: binaryVocab.size(), ms: Date.now() - start };
  }

  log('Step 3: Generating embeddings...');
  const embeddings = new Map();

  // Copy existing entries, skipping any whose vector cannot be read back.
  if (loaded) {
    for (const term of binaryVocab.metadata.terms) {
      const vector = binaryVocab.get(term);
      if (vector) embeddings.set(term, vector);
    }
  }

  // Generate new in batches
  const batchCount = Math.ceil(newQueries.length / CONFIG.batchSize);
  let embeddedCount = 0;
  let failedBatches = 0;

  for (let i = 0; i < batchCount; i++) {
    const batch = newQueries.slice(i * CONFIG.batchSize, (i + 1) * CONFIG.batchSize);
    try {
      const batchEmbeddings = await callVoyageAPI(batch);
      for (let j = 0; j < batch.length; j++) {
        const vector = batchEmbeddings?.[j];
        if (!vector) continue; // short response: never store a hole
        embeddings.set(batch[j].toLowerCase().trim(), vector);
        embeddedCount++;
      }
    } catch (err) {
      failedBatches++;
      log(` Batch failed: ${err.message}`);
    }

    if (i < batchCount - 1) {
      await new Promise((resolve) => setTimeout(resolve, CONFIG.rateLimitMs));
    }
  }

  if (embeddedCount === 0) {
    return {
      ok: false,
      error: `No embeddings were generated (${failedBatches} of ${batchCount} batches failed)`,
      learned: 0,
      existing: binaryVocab.size(),
      failedBatches,
      ms: Date.now() - start,
    };
  }

  log('Step 4: Saving...');
  const result = await binaryVocab.save(embeddings);

  return {
    ok: true,
    learned: embeddedCount,
    total: result.terms,
    failedBatches,
    ms: Date.now() - start,
  };
}
|
|
564
|
+
|
|
565
|
+
// =============================================================================
|
|
566
|
+
// STATS
|
|
567
|
+
// =============================================================================
|
|
568
|
+
|
|
569
|
+
/**
 * Collect size and count statistics about the vocabulary artifacts on disk.
 *
 * @returns {Promise<{binary?: object, json?: object, queryLog?: object}>}
 *   A section is present only when its file(s) exist:
 *   `binary`   - terms, dimension, sizeMB, created (from the metadata file);
 *   `json`     - sizeMB of the legacy JSON vocabulary;
 *   `queryLog` - uniqueQueries, totalQueries, frequent (count >= CONFIG.minQueryFrequency).
 */
export async function getStats() {
  const toMB = (bytes) => (bytes / 1024 / 1024).toFixed(1);
  const readJson = async (file) => JSON.parse(await fs.readFile(file, 'utf-8'));
  const report = {};

  // Binary vocabulary
  const hasBinary = existsSync(PATHS.binaryVocab) && existsSync(PATHS.binaryMeta);
  if (hasBinary) {
    const { size } = statSync(PATHS.binaryVocab);
    const meta = await readJson(PATHS.binaryMeta);
    report.binary = {
      terms: meta.termCount,
      dimension: meta.dimension,
      sizeMB: toMB(size),
      created: meta.created,
    };
  }

  // JSON vocabulary (legacy)
  if (existsSync(PATHS.jsonVocab)) {
    report.json = { sizeMB: toMB(statSync(PATHS.jsonVocab).size) };
  }

  // Query stats
  if (existsSync(PATHS.queryStats)) {
    const data = await readJson(PATHS.queryStats);
    const perQuery = data.queries || {};

    let totalQueries = 0;
    let frequent = 0;
    for (const count of Object.values(perQuery)) {
      totalQueries = totalQueries + count;
      if (count >= CONFIG.minQueryFrequency) frequent++;
    }

    report.queryLog = {
      uniqueQueries: Object.keys(perQuery).length,
      totalQueries,
      frequent,
    };
  }

  return report;
}
|
|
606
|
+
|
|
607
|
+
// =============================================================================
|
|
608
|
+
// CLI
|
|
609
|
+
// =============================================================================
|
|
610
|
+
|
|
611
|
+
/**
 * Command-line entry point for the vocabulary utilities.
 *
 * Reads the sub-command from `process.argv[2]` and dispatches to the matching
 * operation (`migrate`, `learn`, `warmup`, `warmup-inc`, `stats`). Any other
 * value, including no command at all, prints the usage text.
 *
 * Flags: `--verbose` / `-v` is forwarded to the warmup functions; `--force`
 * is forwarded to the full warmup only.
 *
 * When an operation returns a result carrying an `error` field, the error is
 * written to stderr and `process.exitCode` is set to 1 so shell callers can
 * detect the failure.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);
  const [cmd] = args;
  const verbose = args.includes('--verbose') || args.includes('-v');

  // Reports a failed result and marks the process as failed.
  // Returns true when the caller should skip its success output.
  const reportFailure = (result) => {
    if (!result?.error) {
      return false;
    }
    console.error('Error:', result.error);
    process.exitCode = 1;
    return true;
  };

  // Each case is wrapped in its own block so its `const` stays scoped to it.
  switch (cmd) {
    case 'migrate': {
      console.log('Migrating JSON vocabulary to binary format...');
      const migrateResult = await migrateJsonToBinary();
      if (reportFailure(migrateResult)) {
        break;
      }
      console.log(`Done in ${migrateResult.ms}ms`);
      console.log(` Terms: ${migrateResult.terms}`);
      console.log(` JSON: ${migrateResult.jsonSizeMB}MB → Binary: ${migrateResult.binSizeMB}MB`);
      console.log(` Reduction: ${migrateResult.reduction}`);
      break;
    }

    case 'learn': {
      console.log('Learning from query logs...');
      const learnResult = await warmupLearn({ verbose });
      // NOTE(review): assumes failures use the same `{ error }` shape as migrate — confirm.
      if (reportFailure(learnResult)) {
        break;
      }
      console.log(`Done in ${learnResult.ms}ms`);
      console.log(` Learned: ${learnResult.learned} queries`);
      // `total` is set after a save; `existing` when there was nothing new to learn.
      console.log(` Total vocabulary: ${learnResult.total || learnResult.existing || 0} terms`);
      break;
    }

    case 'warmup': {
      console.log('Full vocabulary warmup (entities + questions + graph + learned)...');
      const warmupResult = await warmupFull({ verbose, force: args.includes('--force') });
      if (reportFailure(warmupResult)) {
        break;
      }
      console.log(`Done in ${warmupResult.ms}ms`);
      console.log(` Total: ${warmupResult.terms} terms`);
      console.log(` New: ${warmupResult.newTerms} terms`);
      break;
    }

    case 'warmup-inc': {
      console.log('Incremental warmup (learned queries only)...');
      const incResult = await warmupLearn({ verbose });
      if (reportFailure(incResult)) {
        break;
      }
      console.log(`Done in ${incResult.ms}ms`);
      console.log(` Learned: ${incResult.learned} queries`);
      break;
    }

    case 'stats': {
      const stats = await getStats();
      console.log('Vocabulary Statistics:');
      if (stats.binary) {
        console.log(` Binary: ${stats.binary.terms} terms (${stats.binary.dimension}d, ${stats.binary.sizeMB}MB)`);
      }
      if (stats.json) {
        console.log(` JSON (legacy): ${stats.json.sizeMB}MB`);
      }
      if (stats.queryLog) {
        console.log(` Query log: ${stats.queryLog.uniqueQueries} unique, ${stats.queryLog.frequent} frequent`);
      }
      break;
    }

    default:
      console.log(`
Vocabulary Utilities v2.0

Usage:
node vocabulary-utils.js migrate Migrate JSON to binary (75% smaller)
node vocabulary-utils.js learn Learn from query logs (self-learning)
node vocabulary-utils.js warmup Full warmup (all templates)
node vocabulary-utils.js warmup-inc Incremental warmup (learned only)
node vocabulary-utils.js stats Show vocabulary statistics

Options:
--verbose, -v Show detailed progress
--force Force regenerate all embeddings
`);
  }
}
|
|
684
|
+
|
|
685
|
+
// Run the CLI only when this module is the script Node was started with.
// The script path is converted with pathToFileURL so that paths needing
// percent-encoding (spaces, non-ASCII characters) and Windows drive paths
// compare equal to import.meta.url; plain `file://` + path concatenation
// never matches in those cases. `node:url` is loaded dynamically so this
// block does not depend on the file's import list.
if (process.argv[1]) {
  import('node:url')
    .then(({ pathToFileURL }) => {
      if (import.meta.url !== pathToFileURL(process.argv[1]).href) {
        return undefined;
      }
      return main();
    })
    .catch((err) => {
      console.error('Error:', err.message);
      process.exit(1);
    });
}
|
|
691
|
+
|
|
692
|
+
// Default export: the module's public surface bundled into a single object.
export default {
  // Vocabulary store
  BinaryVocabulary,

  // Term discovery and embedding
  mineQueryLogs, extractEntities, generateAllTerms, callVoyageAPI,

  // High-level operations (also reachable through the CLI)
  migrateJsonToBinary, warmupFull, warmupLearn, getStats,

  // Shared settings
  CONFIG, PATHS,
};
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "query_router_wasm",
|
|
3
|
+
"description": "Ultra-fast query router for code search (<5μs)",
|
|
4
|
+
"version": "1.0.0",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"files": [
|
|
7
|
+
"query_router_wasm_bg.wasm",
|
|
8
|
+
"query_router_wasm.js",
|
|
9
|
+
"query_router_wasm.d.ts"
|
|
10
|
+
],
|
|
11
|
+
"main": "query_router_wasm.js",
|
|
12
|
+
"types": "query_router_wasm.d.ts"
|
|
13
|
+
}
|