sweet-search 0.0.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. package/LICENSE +190 -0
  2. package/NOTICE +23 -0
  3. package/core/cli.js +51 -0
  4. package/core/config.js +27 -0
  5. package/core/embedding/embedding-cache.js +467 -0
  6. package/core/embedding/embedding-local-model.js +845 -0
  7. package/core/embedding/embedding-remote.js +492 -0
  8. package/core/embedding/embedding-service.js +712 -0
  9. package/core/embedding/embedding-telemetry.js +219 -0
  10. package/core/embedding/index.js +40 -0
  11. package/core/graph/community-detector.js +294 -0
  12. package/core/graph/graph-expansion.js +839 -0
  13. package/core/graph/graph-extractor.js +2304 -0
  14. package/core/graph/graph-search.js +2148 -0
  15. package/core/graph/hcgs-generator.js +666 -0
  16. package/core/graph/index.js +16 -0
  17. package/core/graph/leiden-algorithm.js +547 -0
  18. package/core/graph/relationship-resolver.js +366 -0
  19. package/core/graph/repo-map.js +408 -0
  20. package/core/graph/summary-manager.js +549 -0
  21. package/core/indexing/artifact-builder.js +1054 -0
  22. package/core/indexing/ast-chunker.js +709 -0
  23. package/core/indexing/chunking/chunk-builder.js +170 -0
  24. package/core/indexing/chunking/markdown-chunker.js +503 -0
  25. package/core/indexing/chunking/plaintext-chunker.js +104 -0
  26. package/core/indexing/dedup/dedup-phase.js +159 -0
  27. package/core/indexing/dedup/exemplar-selector.js +65 -0
  28. package/core/indexing/document-chunker.js +56 -0
  29. package/core/indexing/incremental-parser.js +390 -0
  30. package/core/indexing/incremental-tracker.js +761 -0
  31. package/core/indexing/index-codebase-v21.js +472 -0
  32. package/core/indexing/index-maintainer.mjs +1674 -0
  33. package/core/indexing/index.js +90 -0
  34. package/core/indexing/indexer-ann.js +1077 -0
  35. package/core/indexing/indexer-build.js +742 -0
  36. package/core/indexing/indexer-phases.js +800 -0
  37. package/core/indexing/indexer-pool.js +764 -0
  38. package/core/indexing/indexer-sparse-gram.js +98 -0
  39. package/core/indexing/indexer-utils.js +536 -0
  40. package/core/indexing/indexer-worker.js +148 -0
  41. package/core/indexing/li-skip-policy.js +225 -0
  42. package/core/indexing/merkle-tracker.js +244 -0
  43. package/core/indexing/model-pool.js +166 -0
  44. package/core/infrastructure/code-graph-repository.js +120 -0
  45. package/core/infrastructure/codebase-repository.js +131 -0
  46. package/core/infrastructure/config/dedup.js +54 -0
  47. package/core/infrastructure/config/embedding.js +298 -0
  48. package/core/infrastructure/config/graph.js +80 -0
  49. package/core/infrastructure/config/index.js +82 -0
  50. package/core/infrastructure/config/indexing.js +8 -0
  51. package/core/infrastructure/config/platform.js +254 -0
  52. package/core/infrastructure/config/ranking.js +221 -0
  53. package/core/infrastructure/config/search.js +396 -0
  54. package/core/infrastructure/config/translation.js +89 -0
  55. package/core/infrastructure/config/vector-store.js +114 -0
  56. package/core/infrastructure/constants.js +86 -0
  57. package/core/infrastructure/coreml-cascade.js +909 -0
  58. package/core/infrastructure/coreml-cascade.json +46 -0
  59. package/core/infrastructure/coreml-provider.js +81 -0
  60. package/core/infrastructure/db-utils.js +69 -0
  61. package/core/infrastructure/dedup-hashing.js +83 -0
  62. package/core/infrastructure/hardware-capability.js +332 -0
  63. package/core/infrastructure/index.js +104 -0
  64. package/core/infrastructure/language-patterns/maps.js +121 -0
  65. package/core/infrastructure/language-patterns/registry-core.js +323 -0
  66. package/core/infrastructure/language-patterns/registry-data-query.js +155 -0
  67. package/core/infrastructure/language-patterns/registry-object-oriented.js +285 -0
  68. package/core/infrastructure/language-patterns/registry-tooling.js +240 -0
  69. package/core/infrastructure/language-patterns/registry-web-style.js +143 -0
  70. package/core/infrastructure/language-patterns/registry.js +19 -0
  71. package/core/infrastructure/language-patterns.js +141 -0
  72. package/core/infrastructure/llm-provider.js +733 -0
  73. package/core/infrastructure/manifest.json +46 -0
  74. package/core/infrastructure/maxsim.wasm +0 -0
  75. package/core/infrastructure/model-fetcher.js +423 -0
  76. package/core/infrastructure/model-registry.js +214 -0
  77. package/core/infrastructure/native-inference.js +587 -0
  78. package/core/infrastructure/native-resolver.js +187 -0
  79. package/core/infrastructure/native-sparse-gram.js +257 -0
  80. package/core/infrastructure/native-tokenizer.js +160 -0
  81. package/core/infrastructure/onnx-mutex.js +45 -0
  82. package/core/infrastructure/onnx-session-utils.js +261 -0
  83. package/core/infrastructure/ort-pipeline.js +111 -0
  84. package/core/infrastructure/project-detector.js +102 -0
  85. package/core/infrastructure/quantization.js +410 -0
  86. package/core/infrastructure/simd-distance.js +502 -0
  87. package/core/infrastructure/simd-distance.wasm +0 -0
  88. package/core/infrastructure/tree-sitter-provider.js +665 -0
  89. package/core/infrastructure/webgpu-maxsim.js +222 -0
  90. package/core/query/index.js +35 -0
  91. package/core/query/intent-detector.js +201 -0
  92. package/core/query/intent-router.js +156 -0
  93. package/core/query/query-router-catboost.js +222 -0
  94. package/core/query/query-router-ml.js +266 -0
  95. package/core/query/query-router.js +213 -0
  96. package/core/ranking/cascaded-scorer.js +379 -0
  97. package/core/ranking/flashrank.js +810 -0
  98. package/core/ranking/index.js +49 -0
  99. package/core/ranking/late-interaction-index.js +2383 -0
  100. package/core/ranking/late-interaction-model.js +812 -0
  101. package/core/ranking/local-reranker.js +374 -0
  102. package/core/ranking/mmr.js +379 -0
  103. package/core/ranking/quality-scorer.js +363 -0
  104. package/core/search/context-expander.js +1167 -0
  105. package/core/search/dedup/sibling-expander.js +327 -0
  106. package/core/search/index.js +16 -0
  107. package/core/search/search-boost.js +259 -0
  108. package/core/search/search-cli.js +544 -0
  109. package/core/search/search-format.js +282 -0
  110. package/core/search/search-fusion.js +327 -0
  111. package/core/search/search-hybrid.js +204 -0
  112. package/core/search/search-pattern-chunks.js +337 -0
  113. package/core/search/search-pattern-planner.js +439 -0
  114. package/core/search/search-pattern-prefilter.js +412 -0
  115. package/core/search/search-pattern-ripgrep.js +663 -0
  116. package/core/search/search-pattern.js +463 -0
  117. package/core/search/search-postprocess.js +452 -0
  118. package/core/search/search-semantic.js +706 -0
  119. package/core/search/search-server.js +554 -0
  120. package/core/search/session-daemon-prewarm.mjs +164 -0
  121. package/core/search/session-warmup.js +595 -0
  122. package/core/search/sweet-search.js +632 -0
  123. package/core/search/warmup-metrics.js +532 -0
  124. package/core/start-server.js +6 -0
  125. package/core/training/query-router/features/extractor.js +762 -0
  126. package/core/training/query-router/features/multilingual-patterns.js +431 -0
  127. package/core/training/query-router/features/text-segmenter.js +303 -0
  128. package/core/training/query-router/features/unicode-utils.js +383 -0
  129. package/core/training/query-router/output/v45_router_d4.js +11521 -0
  130. package/core/training/query-router/output/v46_router_d4.js +11498 -0
  131. package/core/vector-store/binary-heap.js +227 -0
  132. package/core/vector-store/binary-hnsw-index.js +1004 -0
  133. package/core/vector-store/float-vector-store.js +234 -0
  134. package/core/vector-store/hnsw-index.js +580 -0
  135. package/core/vector-store/index.js +39 -0
  136. package/core/vector-store/seismic-index.js +498 -0
  137. package/core/vocabulary/index.js +84 -0
  138. package/core/vocabulary/vocab-constants.js +20 -0
  139. package/core/vocabulary/vocab-miner-extractors.js +375 -0
  140. package/core/vocabulary/vocab-miner-nl.js +404 -0
  141. package/core/vocabulary/vocab-miner-utils.js +146 -0
  142. package/core/vocabulary/vocab-miner.js +574 -0
  143. package/core/vocabulary/vocab-prewarm-cli.js +110 -0
  144. package/core/vocabulary/vocab-ranker.js +492 -0
  145. package/core/vocabulary/vocab-warmer.js +523 -0
  146. package/core/vocabulary/vocab-warmup-orchestrator.js +425 -0
  147. package/core/vocabulary/vocabulary-utils.js +704 -0
  148. package/crates/wasm-router/pkg/package.json +13 -0
  149. package/crates/wasm-router/pkg/query_router_wasm.d.ts +36 -0
  150. package/crates/wasm-router/pkg/query_router_wasm.js +271 -0
  151. package/crates/wasm-router/pkg/query_router_wasm_bg.wasm +0 -0
  152. package/crates/wasm-router/pkg/query_router_wasm_bg.wasm.d.ts +19 -0
  153. package/mcp/config-gen.js +121 -0
  154. package/mcp/server.js +335 -0
  155. package/mcp/tool-handlers.js +476 -0
  156. package/package.json +131 -9
  157. package/scripts/benchmark-harness.js +794 -0
  158. package/scripts/init.js +1058 -0
  159. package/scripts/smoke-test.js +435 -0
  160. package/scripts/uninstall.js +478 -0
  161. package/scripts/verify-runtime.js +176 -0
@@ -0,0 +1,845 @@
1
+ /**
2
+ * Embedding Local Model - ONNX-based local embedding inference.
3
+ * Extracted from embedding-service.js for file size compliance (<500 lines).
4
+ */
5
+
6
+ import crypto from 'crypto';
7
+ import { existsSync, readFileSync, writeFileSync, mkdirSync, readdirSync } from 'fs';
8
+ import path from 'path';
9
+ import { join } from 'path';
10
+ import os from 'os';
11
+ import { EMBEDDING_PROVIDERS } from '../infrastructure/config/index.js';
12
+ import { fetchModel, getModelCacheDir } from '../infrastructure/model-fetcher.js';
13
+ import { getModelEntry } from '../infrastructure/model-registry.js';
14
+ import { isAppleSilicon, isCoreMLProviderAvailable, shouldUseCoreML, getCoreMLExecutionProviders } from '../infrastructure/coreml-provider.js';
15
+ import { createTokenizer } from '../infrastructure/native-tokenizer.js';
16
+ import { initOrt, buildFeed } from '../infrastructure/ort-pipeline.js';
17
+ import { isNativeInferenceAvailable, isNativeEmbeddingModelLoaded, nativeEmbed } from '../infrastructure/native-inference.js';
18
+
19
+ // =============================================================================
20
+ // SEQUENCE LENGTH CONSTANTS (L2: configurable via env)
21
+ // =============================================================================
22
+
23
/**
 * Read a positive-integer sequence length from an environment variable.
 *
 * A missing, non-numeric, zero or negative value yields `fallback`, so a
 * malformed setting can never produce `NaN` (or a nonsensical length) as a
 * tokenizer max length.
 *
 * @param {string} name - Environment variable to read.
 * @param {number} fallback - Value used when the variable is unset or invalid.
 * @returns {number} The parsed positive integer, or `fallback`.
 */
function readMaxLengthFromEnv(name, fallback) {
  const parsed = Number.parseInt(process.env[name] || '', 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

/** Max token length used when embedding at index time (default 512). */
export const INDEXING_MAX_LENGTH = readMaxLengthFromEnv('SWEET_SEARCH_INDEXING_MAX_LENGTH', 512);
/** Max token length used when embedding queries (default 512). */
export const QUERY_MAX_LENGTH = readMaxLengthFromEnv('SWEET_SEARCH_QUERY_MAX_LENGTH', 512);
25
+
26
+ // =============================================================================
27
+ // ONNX SESSION HELPERS
28
+ // =============================================================================
29
+
30
+ // Import + re-export from infrastructure (canonical location)
31
+ import {
32
+ bestIntraOpThreads,
33
+ defaultOrtExecutionMode,
34
+ detectLastLevelCacheBytes,
35
+ computeWeightsAwareBatchCap,
36
+ } from '../infrastructure/onnx-session-utils.js';
37
+ export { bestIntraOpThreads };
38
+
39
+ // CodeRankEmbed (the only local embedding model) is a NomicBERT/ModernBERT-
40
+ // family encoder with hidden dim 768. Used by the cache-aware budget below
41
+ // to estimate per-layer transformer-weight footprint vs. activation working
42
+ // set. Update if the local provider ever switches model.
43
+ const LOCAL_EMBEDDING_HIDDEN_DIM = 768;
44
+
45
/**
 * Report whether the first CPU listed by the OS identifies itself as Intel.
 *
 * @returns {boolean} True when the first CPU's model string contains
 *   "intel" (case-insensitive); false when no CPU info is available.
 */
export function isIntelCpu() {
  const cpuList = os.cpus();
  const firstCpuModel = cpuList?.[0]?.model || '';
  const normalizedModel = firstCpuModel.toLowerCase();
  return normalizedModel.includes('intel');
}
49
+
50
// Memoized result of the OpenVINO artifact probe; null until the probe has run.
let openVinoProviderAvailable = null;

/**
 * Build a runtime-config object with every override unset (null).
 * @returns {{intraOpThreads: null, interOpThreads: null, executionMode: null}}
 */
function createEmptyRuntimeConfig() {
  return {
    intraOpThreads: null,
    interOpThreads: null,
    executionMode: null,
  };
}

// Module-wide ORT threading/execution overrides. A null field means "no
// override"; buildLocalSessionOptions then falls through to env/defaults.
let localModelRuntimeConfig = createEmptyRuntimeConfig();

/**
 * Merge overrides into the module-wide local-model runtime configuration.
 * Keys present in `overrides` replace the stored values; others are kept.
 *
 * @param {object} [overrides={}] - Fields to override (e.g. intraOpThreads,
 *   interOpThreads, executionMode).
 * @returns {void}
 */
export function configureLocalModelRuntime(overrides = {}) {
  const merged = { ...localModelRuntimeConfig, ...overrides };
  localModelRuntimeConfig = merged;
}

/**
 * Drop all runtime overrides, restoring every field to null.
 * @returns {void}
 */
export function resetLocalModelRuntime() {
  localModelRuntimeConfig = createEmptyRuntimeConfig();
}
71
+
72
/**
 * Detect whether the onnxruntime-node bundle ships OpenVINO artifacts.
 *
 * Walks `node_modules/onnxruntime-node/bin` (resolved against the current
 * working directory) and reports true as soon as any non-directory entry has
 * "openvino" in its name (case-insensitive). Directory names themselves are
 * never counted. The answer is memoized in `openVinoProviderAvailable`, so
 * the filesystem is scanned at most once per process.
 *
 * @returns {boolean} True when an OpenVINO artifact was found.
 */
export function isOpenVinoProviderAvailable() {
  if (openVinoProviderAvailable !== null) {
    return openVinoProviderAvailable;
  }

  const ortBinDir = path.resolve('node_modules/onnxruntime-node/bin');
  const pendingDirs = existsSync(ortBinDir) ? [ortBinDir] : [];
  let found = false;

  while (!found && pendingDirs.length > 0) {
    const dir = pendingDirs.pop();
    try {
      for (const dirEntry of readdirSync(dir, { withFileTypes: true })) {
        if (dirEntry.isDirectory()) {
          pendingDirs.push(path.join(dir, dirEntry.name));
        } else if (dirEntry.name.toLowerCase().includes('openvino')) {
          found = true;
          break;
        }
      }
    } catch {
      // Ignore unreadable directories.
    }
  }

  openVinoProviderAvailable = found;
  return found;
}
103
+
104
/**
 * Decide whether the OpenVINO execution provider should be requested.
 *
 * Controlled by `SWEET_SEARCH_USE_OPENVINO`:
 * - `0` / `false` / `off`  → never.
 * - unset / empty / `auto` / `1` / `true` / `on` → only on Intel CPUs and
 *   only when OpenVINO artifacts are available.
 * - any other value → never.
 *
 * The availability probe walks the filesystem, so it is only run when the
 * env switch and CPU check have not already decided the answer.
 *
 * @param {boolean} [openVinoAvailable] - Pre-computed availability; when
 *   omitted (undefined) `isOpenVinoProviderAvailable()` is consulted lazily.
 * @returns {boolean} The availability value when OpenVINO is eligible, else false.
 */
export function shouldUseOpenVino(openVinoAvailable) {
  const raw = (process.env.SWEET_SEARCH_USE_OPENVINO ?? '').trim().toLowerCase();
  if (raw === '0' || raw === 'false' || raw === 'off') return false;
  if (!isIntelCpu()) return false;

  const autoMode = raw === '' || raw === 'auto';
  const explicitOn = raw === '1' || raw === 'true' || raw === 'on';
  if (!autoMode && !explicitOn) return false;

  // Enable only when the runtime bundle exposes OpenVINO provider artifacts.
  return openVinoAvailable === undefined ? isOpenVinoProviderAvailable() : openVinoAvailable;
}
116
+
117
/**
 * Pick the local model repo name for the requested quantization mode.
 * - quantized=true  → `quantizedModel` (INT8, ~132 MB, ~2× faster), when configured
 * - quantized=false → `model` (FP32, ~522 MB, baseline)
 *
 * @param {boolean} quantized - Whether the quantized variant is wanted.
 * @returns {string} `EMBEDDING_PROVIDERS.local.quantizedModel` when requested
 *   and set; otherwise `EMBEDDING_PROVIDERS.local.model`.
 */
export function resolveLocalModelName(quantized) {
  const { model, quantizedModel } = EMBEDDING_PROVIDERS.local;
  const useQuantized = quantized && quantizedModel;
  return useQuantized ? quantizedModel : model;
}
128
+
129
/**
 * L3b: Return path for the ORT-optimized model graph cache.
 * Uses the actual model name in the hash so FP32 and INT8 never share a cache file.
 *
 * Side effect: creates `~/.cache/sweet-search` if it does not exist.
 * The ORT version in the file name is read from
 * `node_modules/onnxruntime-node/package.json` relative to the current
 * working directory; when that cannot be read the literal `unknown` is used.
 *
 * @param {string} [quantLabel='q8'] - Quantization label; anything other
 *   than `'fp32'` selects the quantized model name for the hash.
 * @returns {string} Absolute path of the optimized `.onnx` cache file.
 */
export function getOptimizedModelPath(quantLabel = 'q8') {
  const cacheDir = path.join(os.homedir(), '.cache', 'sweet-search');
  mkdirSync(cacheDir, { recursive: true });

  let ortVersion;
  try {
    const manifestPath = path.resolve('node_modules/onnxruntime-node/package.json');
    ortVersion = JSON.parse(readFileSync(manifestPath, 'utf8')).version;
  } catch {
    // ORT pulled in transitively; version unknown is fine
    ortVersion = 'unknown';
  }

  const modelName = resolveLocalModelName(quantLabel !== 'fp32');
  const digest = crypto.createHash('sha256').update(modelName).digest('hex');
  const modelHash = digest.slice(0, 12);

  const fileName = `coderankembed-optimized-ort${ortVersion}-${quantLabel}-${modelHash}.onnx`;
  return path.join(cacheDir, fileName);
}
156
+
157
/**
 * Return the fixed calibration factor.
 * NOTE(review): how callers apply this factor is not visible in this file.
 *
 * @returns {number} Always 4.
 */
export function getCalibrationFactor() {
  const calibrationFactor = 4;
  return calibrationFactor;
}
160
+
161
/**
 * Build the ONNX Runtime session options for the local embedding model.
 *
 * Each threading/execution setting is resolved in this order:
 *   1. the `runtimeConfig` argument,
 *   2. the module-wide overrides set via `configureLocalModelRuntime`,
 *   3. the environment (`SWEET_SEARCH_ORT_EXEC_MODE`,
 *      `SWEET_SEARCH_ORT_INTER_OP_THREADS`),
 *   4. a built-in default.
 *
 * A malformed or negative `SWEET_SEARCH_ORT_INTER_OP_THREADS` falls back to
 * 1 instead of leaking `NaN` into the session options.
 *
 * @param {string} [quantLabel='q8'] - Quantization label, used for the optimized-model cache path.
 * @param {boolean} [coremlAvailable=false] - Whether the CoreML provider was detected.
 * @param {object} [runtimeConfig={}] - Per-call overrides (intraOpThreads, interOpThreads, executionMode).
 * @returns {object} Session options; `executionProviders` is only present
 *   when OpenVINO or CoreML is selected.
 */
export function buildLocalSessionOptions(quantLabel = 'q8', coremlAvailable = false, runtimeConfig = {}) {
  const executionMode = runtimeConfig.executionMode
    ?? localModelRuntimeConfig.executionMode
    ?? process.env.SWEET_SEARCH_ORT_EXEC_MODE
    ?? defaultOrtExecutionMode();

  const envInterOpThreads = Number.parseInt(process.env.SWEET_SEARCH_ORT_INTER_OP_THREADS || '1', 10);
  const safeEnvInterOpThreads = Number.isInteger(envInterOpThreads) && envInterOpThreads >= 0
    ? envInterOpThreads
    : 1;
  const interOpThreads = runtimeConfig.interOpThreads
    ?? localModelRuntimeConfig.interOpThreads
    ?? safeEnvInterOpThreads;

  const intraOpThreads = runtimeConfig.intraOpThreads
    ?? localModelRuntimeConfig.intraOpThreads
    ?? bestIntraOpThreads(runtimeConfig);

  const sessionOptions = {
    graphOptimizationLevel: 'all',
    intraOpNumThreads: intraOpThreads,
    interOpNumThreads: interOpThreads,
    executionMode,
    enableCpuMemArena: true,
    enableMemPattern: true,
    optimizedModelFilePath: getOptimizedModelPath(quantLabel),
    // Thread spinning keeps ORT worker threads hot-looping for work instead of
    // sleeping on OS primitives. Trades idle CPU for lower per-batch latency.
    extra: {
      session: {
        intra_op: { allow_spinning: '1' },
      },
    },
  };

  if (shouldUseOpenVino()) {
    // Note: OpenVINO EP is not bundled in onnxruntime-node 1.24 for macOS.
    // On Intel Linux builds where it is bundled, the lowercase name is required.
    sessionOptions.executionProviders = [
      { name: 'openvino' },
      'cpu',
    ];
  } else if (shouldUseCoreML(coremlAvailable)) {
    sessionOptions.executionProviders = getCoreMLExecutionProviders();
  }

  return sessionOptions;
}
204
+
205
/**
 * Resolve the quantization mode for the local model.
 *
 * The result is always INT8 (`{ quantized: true, label: 'q8' }`). When
 * `SWEET_SEARCH_LOCAL_QUANTIZED` is `0` or `false` (case-insensitive,
 * surrounding whitespace ignored), the FP32 request cannot be honored —
 * per the original note, the FP32 model (jalipalo/CodeRankEmbed-onnx) is not
 * in the managed registry — so a warning is logged and INT8 is used anyway.
 *
 * @returns {{quantized: boolean, label: string}}
 */
export function resolveQuantizationMode() {
  const requested = (process.env.SWEET_SEARCH_LOCAL_QUANTIZED ?? '').trim().toLowerCase();
  const fp32Requested = requested === '0' || requested === 'false';

  if (fp32Requested) {
    console.warn('[L1] SWEET_SEARCH_LOCAL_QUANTIZED=false requested but FP32 model is not in managed cache. Using INT8.');
  }

  return { quantized: true, label: 'q8' };
}
220
+
221
/** Registry key for the managed embedding model. */
const EMBEDDING_REGISTRY_KEY = 'coderankembed-int8';

/**
 * Resolve the ONNX model file path from the managed model cache.
 *
 * Looks up the registry entry, picks the first listed file whose path ends
 * in `.onnx`, and joins it onto the entry's model cache directory.
 *
 * @returns {string} Path of the `.onnx` file inside the model cache.
 * @throws {Error} When the registry entry is missing or lists no `.onnx`
 *   file (previously this surfaced as an opaque TypeError).
 */
function resolveOnnxModelPath() {
  const entry = getModelEntry(EMBEDDING_REGISTRY_KEY);
  const onnxFile = entry?.files?.find((file) => file.path.endsWith('.onnx'));
  if (!onnxFile) {
    throw new Error(`[L1] Model registry entry '${EMBEDDING_REGISTRY_KEY}' does not list an .onnx file`);
  }
  return join(getModelCacheDir(entry.hfId), onnxFile.path);
}
232
+
233
/**
 * Resolve the tokenizer.json path from the managed model cache.
 *
 * @returns {string} `<model cache dir>/tokenizer.json` for the registry
 *   entry identified by `EMBEDDING_REGISTRY_KEY`.
 */
function resolveTokenizerPath() {
  const { hfId } = getModelEntry(EMBEDDING_REGISTRY_KEY);
  const modelDir = getModelCacheDir(hfId);
  return join(modelDir, 'tokenizer.json');
}
240
+
241
+ // =============================================================================
242
+ // POOLING AND NORMALIZATION
243
+ // =============================================================================
244
+
245
/**
 * Tell whether an attention-mask element marks its token as active.
 * Handles both BigInt elements (compared against 0n) and everything else
 * (compared against 0 with strict inequality).
 *
 * @param {bigint|number} maskValue - A single attention-mask element.
 * @returns {boolean} True unless the value is exactly zero.
 */
export function maskIsActive(maskValue) {
  if (typeof maskValue === 'bigint') {
    return maskValue !== 0n;
  }
  return maskValue !== 0;
}
248
+
249
/**
 * L2-normalize each row of a flat row-major matrix, in place.
 * Rows whose norm is not greater than zero (e.g. all-zero rows) are left
 * untouched, so no division by zero occurs.
 *
 * @param {Float32Array|number[]} data - Flat buffer of `rows * cols` values; mutated.
 * @param {number} rows - Number of rows.
 * @param {number} cols - Number of values per row.
 * @returns {void}
 */
export function l2NormalizeRowsInPlace(data, rows, cols) {
  for (let row = 0; row < rows; row++) {
    const start = row * cols;

    let sumOfSquares = 0;
    for (let col = 0; col < cols; col++) {
      const value = data[start + col];
      sumOfSquares += value * value;
    }

    const magnitude = Math.sqrt(sumOfSquares);
    if (!(magnitude > 0)) continue;

    // Multiply by the reciprocal rather than dividing each element.
    const scale = 1 / magnitude;
    for (let col = 0; col < cols; col++) {
      data[start + col] *= scale;
    }
  }
}
267
+
268
/**
 * Mean-pool per-token embeddings into one vector per batch row.
 *
 * `tokenEmbeddings.data` is read as a flat row-major `[batch, seq, hidden]`
 * buffer. Tokens whose attention-mask element is inactive are skipped; when
 * no mask is supplied every token counts. A row with no active tokens
 * pools to all zeros.
 *
 * @param {{dims: number[], data: ArrayLike<number>}} tokenEmbeddings - Rank-3 tensor-like object.
 * @param {{data: ArrayLike<number|bigint>}|null|undefined} attentionMask -
 *   Optional mask laid out as `[batch, seq]`.
 * @param {boolean} [normalize=true] - L2-normalize each pooled row.
 * @returns {{data: Float32Array, batchSize: number, dim: number}}
 * @throws {Error} When `tokenEmbeddings.dims` does not have exactly 3 entries.
 */
export function meanPoolWithAttentionMask(tokenEmbeddings, attentionMask, normalize = true) {
  const shape = tokenEmbeddings?.dims || [];
  if (shape.length !== 3) {
    throw new Error(`[L1] Expected dims [batch, seq, hidden], got [${shape.join(', ')}]`);
  }

  const [batchSize, seqLength, hiddenSize] = shape;
  const values = tokenEmbeddings.data;
  const mask = attentionMask?.data || null;
  const pooled = new Float32Array(batchSize * hiddenSize);

  for (let b = 0; b < batchSize; b++) {
    const outBase = b * hiddenSize;
    let activeTokens = 0;

    for (let t = 0; t < seqLength; t++) {
      const position = b * seqLength + t;
      const isActive = !mask || maskIsActive(mask[position]);
      if (isActive) {
        activeTokens += 1;
        const srcBase = position * hiddenSize;
        for (let h = 0; h < hiddenSize; h++) {
          pooled[outBase + h] += values[srcBase + h];
        }
      }
    }

    // Guard against division by zero for fully masked rows.
    const scale = 1 / (activeTokens > 0 ? activeTokens : 1);
    for (let h = 0; h < hiddenSize; h++) {
      pooled[outBase + h] *= scale;
    }
  }

  if (normalize) {
    l2NormalizeRowsInPlace(pooled, batchSize, hiddenSize);
  }

  return { data: pooled, batchSize, dim: hiddenSize };
}
311
+
312
/**
 * Turn raw model outputs into one embedding row per input.
 *
 * The tensor is taken from the first truthy of `last_hidden_state`,
 * `logits`, `token_embeddings`. A rank-3 tensor is mean-pooled with the
 * attention mask; a rank-2 tensor is treated as already pooled and copied
 * into a fresh Float32Array (the model output is never mutated).
 *
 * @param {object} outputs - Model output map.
 * @param {{data: ArrayLike<number|bigint>}|null|undefined} attentionMask - Mask used for rank-3 pooling.
 * @param {boolean} [normalize=true] - L2-normalize each row.
 * @returns {{data: Float32Array, batchSize: number, dim: number}}
 * @throws {Error} When no usable tensor is present or its rank is not 2 or 3.
 */
export function extractPooledEmbeddings(outputs, attentionMask, normalize = true) {
  const tensor = outputs?.last_hidden_state || outputs?.logits || outputs?.token_embeddings;
  if (!tensor?.dims || !tensor?.data) {
    throw new Error('[L1] Model output missing tensor data for feature extraction');
  }

  switch (tensor.dims.length) {
    case 3:
      return meanPoolWithAttentionMask(tensor, attentionMask, normalize);

    case 2: {
      const [batchSize, dim] = tensor.dims;
      const data = new Float32Array(tensor.data.length);
      data.set(tensor.data);
      if (normalize) {
        l2NormalizeRowsInPlace(data, batchSize, dim);
      }
      return { data, batchSize, dim };
    }

    default:
      throw new Error(`[L1] Unsupported tensor shape: [${tensor.dims.join(', ')}]`);
  }
}
334
+
335
+ // =============================================================================
336
+ // TIMING INSTRUMENTATION (Phase 0 — per-batch profiling)
337
+ // =============================================================================
338
+
339
// Accumulated per-stage timings (microseconds) and call/text counters.
const _embeddingTimings = { tokenize_us: 0, inference_us: 0, pool_us: 0, calls: 0, totalTexts: 0 };

/**
 * Read and reset accumulated embedding timings.
 *
 * @returns {{tokenize_us: number, inference_us: number, pool_us: number, calls: number, totalTexts: number}}
 *   A copy of the counters as they were before being zeroed.
 */
export function getEmbeddingTimings() {
  const snapshot = { ..._embeddingTimings };
  Object.assign(_embeddingTimings, {
    tokenize_us: 0,
    inference_us: 0,
    pool_us: 0,
    calls: 0,
    totalTexts: 0,
  });
  return snapshot;
}
351
+
352
+ // =============================================================================
353
+ // WORKER POOL SLOT (Phase 2 — parallel ORT inference via worker_threads)
354
+ // =============================================================================
355
+ //
356
+ // The embedding layer exposes a SLOT for an external pool implementation.
357
+ // It does NOT own the pool lifecycle or construction — that responsibility
358
+ // lives in `core/indexing/indexer-pool.js` which respects the DDD matrix
359
+ // (indexing may depend on embedding, not the reverse).
360
+ //
361
+ // At embed time (`callLocalModel`), if a pool is installed in this slot, we
362
+ // dispatch batches through `pool.embed(texts, { maxLength })`. The pool
363
+ // contract is duck-typed: any object exposing `embed(texts, options) =>
364
+ // Promise<Float32Array[]>` (and `numWorkers?: number`) satisfies it.
365
+ //
366
+ // See `core/indexing/indexer-pool.js::initEmbeddingPool` for the owner.
367
+
368
// Slot holding the externally owned worker pool; null when none is installed.
let _embeddingPool = null;

/**
 * Install an external embedding worker pool into the slot.
 * @param {object} pool - Pool object; stored as-is without validation.
 * @returns {void}
 */
export function setEmbeddingPool(pool) {
  _embeddingPool = pool;
}

/**
 * Clear the slot. Does NOT shut the pool down — the caller owns lifecycle.
 * @returns {void}
 */
export function clearEmbeddingPool() {
  _embeddingPool = null;
}

/**
 * Get the active pool.
 * @returns {object|null} The installed pool, or null if not installed.
 */
export function getEmbeddingPool() {
  return _embeddingPool;
}
378
+
379
/**
 * Embed pre-built batches through a worker pool, one wave at a time.
 *
 * Batches are dispatched in waves of at most `pool.numWorkers` (minimum 1);
 * each wave is awaited as a whole before the next starts. Progress is
 * reported once per completed wave.
 *
 * @param {{embed: Function, numWorkers?: number}} pool - Pool exposing `embed(texts, { maxLength })`.
 * @param {Array<Array<{text: string}>>} batches - Batches of items carrying a `text` field.
 * @param {number} maxLength - Max sequence length forwarded to the pool.
 * @param {(completed: number, total: number) => void} [onProgress] - Optional progress callback.
 * @param {number} totalTexts - Total forwarded as the second progress argument.
 * @returns {Promise<Array>} Per-batch pool results, in the same order as `batches`.
 */
async function embedBatchesWithPool(pool, batches, maxLength, onProgress, totalTexts) {
  const batchCount = batches.length;
  const results = new Array(batchCount);
  const waveSize = Math.max(1, Math.min(pool.numWorkers || 1, batchCount));
  let textsDone = 0;

  for (let waveStart = 0; waveStart < batchCount; waveStart += waveSize) {
    const wave = batches.slice(waveStart, waveStart + waveSize);
    const pending = wave.map((batch) => {
      const texts = batch.map((item) => item.text);
      return pool.embed(texts, { maxLength });
    });
    const embedded = await Promise.all(pending);

    embedded.forEach((batchResult, offset) => {
      results[waveStart + offset] = batchResult;
      textsDone += wave[offset].length;
    });

    if (onProgress) onProgress(textsDone, totalTexts);
  }

  return results;
}
396
+
397
+ // =============================================================================
398
+ // PIPELINE SINGLETON
399
+ // =============================================================================
400
+
401
// Loaded pipeline ({ session, tokenizer, quantLabel, backend }) once ready.
let localPipeline = null;
// True only while a load is in flight; cleared on success AND on failure.
let isLoadingLocal = false;
// Promise of the in-flight (or most recent) load, shared by concurrent callers.
let loadPromise = null;

/**
 * Create the ORT inference session, degrading to CPU when an accelerated
 * execution provider fails to initialize.
 *
 * Fallback order for CoreML: MLProgram → NeuralNetwork format → CPU only
 * (the last step also writes the failure flag file so later loads skip the
 * CoreML probe). Any other provider falls straight back to CPU only.
 * Without an accelerated provider the original error is rethrown.
 *
 * @returns {Promise<{session: object, backend: string, intraOpNumThreads: number}>}
 */
async function createLocalSession(ort, onnxPath, quantLabel, coremlAvailable, coremlFlagPath) {
  const sessionOptions = buildLocalSessionOptions(quantLabel, coremlAvailable);
  let backend = 'cpu';
  if (sessionOptions.executionProviders) {
    const names = sessionOptions.executionProviders.map(ep => typeof ep === 'string' ? ep : ep.name);
    backend = names.includes('coreml') ? 'coreml+cpu' : 'openvino+cpu';
  }

  const createCpuOnlySession = () => {
    const cpuOnlyOptions = buildLocalSessionOptions(quantLabel);
    delete cpuOnlyOptions.executionProviders;
    return ort.InferenceSession.create(onnxPath, cpuOnlyOptions);
  };

  let session;
  try {
    session = await ort.InferenceSession.create(onnxPath, sessionOptions);
  } catch (err) {
    if (!sessionOptions.executionProviders) {
      throw err;
    }
    const epName = backend.split('+')[0];
    if (epName === 'coreml') {
      console.warn(`[L5] CoreML MLProgram failed (${err.message}), trying NeuralNetwork format`);
      try {
        const nnOptions = buildLocalSessionOptions(quantLabel);
        nnOptions.executionProviders = getCoreMLExecutionProviders(false);
        session = await ort.InferenceSession.create(onnxPath, nnOptions);
        backend = 'coreml-nn+cpu';
      } catch {
        console.warn('[L5] CoreML NeuralNetwork also failed, falling back to CPU only');
        // Cache the failure to avoid ~12s cold-start penalty next time
        try { mkdirSync(path.dirname(coremlFlagPath), { recursive: true }); } catch { /* ok */ }
        try { writeFileSync(coremlFlagPath, new Date().toISOString()); } catch { /* best effort */ }
        session = await createCpuOnlySession();
        backend = 'cpu';
      }
    } else {
      console.warn(`[L5] ${epName} session init failed (${err.message}), retrying with CPU only`);
      session = await createCpuOnlySession();
      backend = 'cpu';
    }
  }

  return { session, backend, intraOpNumThreads: sessionOptions.intraOpNumThreads };
}

/**
 * Run 10 warmup inference passes, cycling short/medium/long synthetic inputs.
 *
 * Warmup: ORT needs 10+ inference passes to stabilize JIT compilation,
 * memory pool sizing, and thread pool scheduling. Warmup at production
 * batch sizes so ORT's memory planner pre-allocates the right arenas.
 * Short texts (batch 16) + medium texts (batch 8) + long texts (batch 4).
 */
async function warmUpLocalSession(session, tokenizer) {
  const warmupShort = Array.from({ length: 16 }, (_, i) =>
    `function f${i}() { return ${i}; }`);
  const warmupMedium = Array.from({ length: 8 }, (_, i) =>
    `export class Service${i} { constructor(private db) {} async find(id) { const row = await this.db.query("SELECT * FROM t WHERE id = ?", [id]); return row; } async update(id, data) { await this.db.run("UPDATE t SET v = ? WHERE id = ?", [data, id]); } }`);
  const warmupLong = Array.from({ length: 4 }, (_, i) =>
    `/**\n * Module ${i}: handles complex business logic including validation,\n * transformation, caching, and event emission across multiple\n * bounded contexts. Each method delegates to specialized services\n * and aggregates results before returning to the caller.\n */\nexport class ComplexModule${i} {\n constructor(private svc, private cache, private events) {}\n async process(input) {\n const validated = this.svc.validate(input);\n const cached = await this.cache.get(validated.key);\n if (cached) return cached;\n const result = await this.svc.transform(validated);\n await this.cache.set(validated.key, result);\n this.events.emit('processed', { module: ${i}, key: validated.key });\n return result;\n }\n}`);
  const warmupSets = [warmupShort, warmupMedium, warmupLong];
  for (let pass = 0; pass < 10; pass++) {
    const texts = warmupSets[pass % warmupSets.length];
    const warmupTokenized = tokenizer(texts, { padding: true, truncation: true, max_length: INDEXING_MAX_LENGTH });
    const warmupFeed = buildFeed(warmupTokenized, session.inputNames);
    await session.run(warmupFeed);
  }
}

/**
 * Load (once) and return the local embedding pipeline.
 *
 * Concurrent callers share a single in-flight load. If a load fails, the
 * loading flag is cleared so the NEXT call starts a fresh attempt instead of
 * receiving the same rejected promise forever.
 *
 * @returns {Promise<{session: object, tokenizer: Function, quantLabel: string, backend: string}>}
 * @throws Whatever model fetch, ORT init, session creation, tokenizer
 *   creation or warmup throws; the failure is not cached.
 */
export async function getLocalPipeline() {
  if (localPipeline) return localPipeline;
  if (isLoadingLocal && loadPromise) return loadPromise;

  isLoadingLocal = true;
  loadPromise = (async () => {
    try {
      const start = Date.now();
      const { quantized: isQuantized, label: quantLabel } = resolveQuantizationMode();
      console.log(`Loading local model: ${resolveLocalModelName(isQuantized)}...`);

      // Fetch model files to managed cache (verifies checksums, respects allowRuntimeModelDownload)
      await fetchModel(EMBEDDING_REGISTRY_KEY);

      const ort = await initOrt();
      const onnxPath = resolveOnnxModelPath();
      const tokenizerPath = resolveTokenizerPath();

      // Phase 1d: CoreML detection with persistent failure cache.
      // ORT 1.24.3 can't serialize models with CoreML compiled nodes, causing a
      // ~12s cold-start penalty from 3 failed session attempts. Cache the failure
      // so subsequent loads skip CoreML probing entirely.
      const coremlFlagPath = path.join(os.homedir(), '.cache', 'sweet-search', '.coreml-embedding-failed');
      let coremlAvailable = false;
      if (isAppleSilicon() && !existsSync(coremlFlagPath)) {
        coremlAvailable = await isCoreMLProviderAvailable();
      }

      const { session, backend, intraOpNumThreads } = await createLocalSession(
        ort, onnxPath, quantLabel, coremlAvailable, coremlFlagPath
      );

      const tokenizer = await createTokenizer(tokenizerPath);
      await warmUpLocalSession(session, tokenizer);

      console.log(`[ORT] Direct session: inputs=[${session.inputNames}], outputs=[${session.outputNames}]`);

      const optimizedPath = getOptimizedModelPath(quantLabel);
      if (!existsSync(optimizedPath)) {
        console.warn(`[L3b] Optimized model file was not materialized at ${optimizedPath}. Session options may not be fully forwarded.`);
      }

      localPipeline = { session, tokenizer, quantLabel, backend };

      console.log(`Local model loaded in ${Date.now() - start}ms (threads: ${intraOpNumThreads}, backend: ${backend}, quantized: ${quantLabel})`);
      return localPipeline;
    } finally {
      // Always release the in-flight marker so a failed load can be retried.
      isLoadingLocal = false;
    }
  })();

  return loadPromise;
}
509
+
510
+ // =============================================================================
511
+ // CORE INFERENCE FUNCTIONS
512
+ // =============================================================================
513
+
514
/**
 * L1: True batch inference for the local model — default dispatcher.
 *
 * Routes to the native GPU path when the native inference addon is
 * available and its embedding model is loaded; otherwise (or when forced)
 * routes to the ORT CPU path. Hybrid CPU+GPU dispatching is done at
 * callLocalModelBucketed (which sees the full batch list and can run both
 * encoders in parallel).
 *
 * SWEET_SEARCH_EMBED_USE_CPU=1 forces the ORT INT8 CPU path even when the
 * native Metal addon is available. This is the symmetric counterpart to
 * SWEET_SEARCH_LI_USE_CPU=1 and is the intended way to run the
 * "ORT embed on CPU ‖ native LI on Metal" pipeline at index time: the
 * indexer's parallel embed + LI phase then actually runs the two on
 * different devices with no Metal queue contention.
 *
 * @param {string[]} texts - Texts to embed; null/empty yields `[]`.
 * @param {object} [options={}] - Forwarded unchanged to the selected path.
 * @returns {Promise<Array>} Embeddings as produced by the selected path.
 */
export async function callLocalModel(texts, options = {}) {
  if (!texts || texts.length === 0) return [];

  const cpuForced = process.env.SWEET_SEARCH_EMBED_USE_CPU === '1';
  const useNativeGpu = !cpuForced && isNativeInferenceAvailable() && isNativeEmbeddingModelLoaded();

  return useNativeGpu
    ? callLocalModelGpu(texts, options)
    : callLocalModelCpu(texts, options);
}
538
+
539
/**
 * Native embedding path: delegates the whole batch to `nativeEmbed` and
 * records the elapsed time in the shared `_embeddingTimings` counters.
 *
 * No availability check is done here. The caller is expected to have verified
 * that the native addon is loaded; whatever `nativeEmbed` throws propagates.
 *
 * @param {string[]} texts - Texts to embed; null/empty yields `[]`.
 * @param {object} [options]
 * @param {number} [options.maxLength=INDEXING_MAX_LENGTH] - Forwarded to
 *   `nativeEmbed` as `maxLength`.
 * @returns {Promise<Array>} Whatever `nativeEmbed` resolves with.
 */
export async function callLocalModelGpu(texts, options = {}) {
  if (!texts || texts.length === 0) {
    return [];
  }

  const maxLength = options.maxLength === undefined
    ? INDEXING_MAX_LENGTH
    : options.maxLength;

  const startedAt = performance.now();
  const vectors = await nativeEmbed(texts, { maxLength });
  const elapsedMs = performance.now() - startedAt;

  // Timings are accumulated in whole microseconds.
  _embeddingTimings.inference_us += Math.round(elapsedMs * 1000);
  _embeddingTimings.calls += 1;
  _embeddingTimings.totalTexts += texts.length;

  return vectors;
}
554
+
555
/**
 * ORT CPU embedding path (the session obtained from `getLocalPipeline()`).
 *
 * Pipeline: tokenize (padded + truncated to `maxLength`) → build the session
 * feed → `session.run` → `extractPooledEmbeddings` → slice the pooled buffer
 * into one view per text. Returns the same shape as the GPU path so a hybrid
 * dispatcher can mix results, and serves as the fallback when the native
 * addon is not available.
 *
 * Each stage's wall time is added to `_embeddingTimings` in microseconds.
 *
 * @param {string[]} texts - Texts to embed; null/empty yields `[]`.
 * @param {object} [options]
 * @param {number} [options.maxLength=INDEXING_MAX_LENGTH] - Tokenizer
 *   truncation length.
 * @returns {Promise<Array>} One `subarray` view per text over the pooled
 *   buffer (no per-row copy). The outer array is frozen unless
 *   NODE_ENV === 'production'.
 * @throws {Error} When the pooled batch size differs from `texts.length`.
 */
export async function callLocalModelCpu(texts, options = {}) {
  if (!texts || texts.length === 0) {
    return [];
  }

  const maxLength = options.maxLength === undefined
    ? INDEXING_MAX_LENGTH
    : options.maxLength;

  const pipeline = await getLocalPipeline();
  const { session, tokenizer } = pipeline;

  // Stage 1: tokenize.
  const tokenizeStart = performance.now();
  const encoded = tokenizer(texts, {
    padding: true,
    truncation: true,
    max_length: maxLength,
  });

  // Stage 2: inference (feed construction is billed to inference).
  const inferenceStart = performance.now();
  const outputs = await session.run(buildFeed(encoded, session.inputNames));

  // Stage 3: pooling.
  const poolStart = performance.now();
  const pooled = extractPooledEmbeddings(outputs, encoded.attention_mask, true);
  const poolEnd = performance.now();

  const toMicros = (ms) => Math.round(ms * 1000);
  _embeddingTimings.tokenize_us += toMicros(inferenceStart - tokenizeStart);
  _embeddingTimings.inference_us += toMicros(poolStart - inferenceStart);
  _embeddingTimings.pool_us += toMicros(poolEnd - poolStart);
  _embeddingTimings.calls += 1;
  _embeddingTimings.totalTexts += texts.length;

  const { data, batchSize, dim } = pooled;

  if (batchSize !== texts.length) {
    throw new Error(`[L1] Output count mismatch: got ${batchSize} embeddings for ${texts.length} texts`);
  }

  // A dimension mismatch is reported but not fatal.
  const expectedDim = EMBEDDING_PROVIDERS.local.dimensions.full;
  if (dim !== expectedDim) {
    console.warn(`[L1] Local embedding dim mismatch: expected ${expectedDim}, got ${dim}`);
  }

  // `data` is handed out as row views rather than copied into a second buffer.
  const rows = Array.from(
    { length: texts.length },
    (_, row) => data.subarray(row * dim, (row + 1) * dim),
  );
  if (process.env.NODE_ENV !== 'production') {
    Object.freeze(rows);
  }
  return rows;
}
607
+
608
/**
 * L0: Length-sorted bucketing for local model batch inference.
 *
 * Texts are sorted by estimated token count and greedily packed into batches
 * that respect three limits at once: a per-batch item cap, a token budget
 * (linear in sequence length, i.e. memory) and an attention budget (quadratic
 * in sequence length, i.e. compute). The batches are then executed through
 * one of three paths, and results are written back in the ORIGINAL text order:
 *
 *   1. Worker pool, concurrent — when `getEmbeddingPool()` returns a pool and
 *      the adaptive memory guard is inactive.
 *   2. Hybrid CPU+GPU (opt-in, experimental) — SWEET_SEARCH_EMBED_HYBRID set
 *      to 1/true/on, no pool, native model loaded and the ORT pipeline loads.
 *   3. Sequential — one batch at a time through the pool (if any) or
 *      `callLocalModel`.
 *
 * Environment knobs read here:
 *   - SWEET_SEARCH_BATCHING_SAFETY: multiplier on the token estimate (default
 *     1.15). Non-numeric, empty or non-positive values fall back to the
 *     default instead of poisoning the estimates with NaN/0.
 *   - SWEET_SEARCH_DISABLE_MEM_GUARD: any non-empty value disables the guard.
 *   - SWEET_SEARCH_EMBED_ATTENTION_BUDGET: explicit attention cap; 0 disables.
 *   - SWEET_SEARCH_EMBED_L2_SAFETY: multiplicative safety for cache sizing.
 *   - SWEET_SEARCH_EMBED_HYBRID: enables hybrid dispatch.
 *
 * @param {string[]} texts - Texts to embed; null/empty yields `[]`.
 * @param {object} [options]
 * @param {number} [options.maxLength=INDEXING_MAX_LENGTH] - Truncation length,
 *   also the clamp for token estimates.
 * @param {number} [options.batchingSafety] - Overrides the env/default safety
 *   multiplier.
 * @param {number} [options.hardCap] - Max items per batch (default 128 when
 *   maxLength <= 256, else 64).
 * @param {(longestTokens: number) => number} [options.resolveHardCap] -
 *   Per-candidate item cap; defaults to a constant `hardCap`.
 * @param {(completed: number, total: number) => void} [options.onProgress]
 * @returns {Promise<Array>} Embeddings indexed like `texts`.
 */
export async function callLocalModelBucketed(texts, options = {}) {
  // Same contract as the single-batch entry points: nothing in, nothing out.
  if (!texts || texts.length === 0) return [];

  const maxLength = options.maxLength ?? INDEXING_MAX_LENGTH;
  const onProgress = options.onProgress;

  // ── 1. Estimate token counts and sort ascending ──
  const DEFAULT_BATCHING_SAFETY = 1.15;
  const charPerToken = getCalibrationFactor();
  const requestedSafety = Number(
    options.batchingSafety
      ?? process.env.SWEET_SEARCH_BATCHING_SAFETY
      ?? DEFAULT_BATCHING_SAFETY,
  );
  // A NaN or zero multiplier would make every budget comparison below
  // vacuous, so batches would only be limited by the item cap.
  const batchingSafety = Number.isFinite(requestedSafety) && requestedSafety > 0
    ? requestedSafety
    : DEFAULT_BATCHING_SAFETY;

  const indexed = texts.map((text, origIdx) => {
    const est = Math.ceil((text.length / charPerToken) * batchingSafety);
    // Unknown length → assume the worst case rather than the best.
    const estTokens = Number.isNaN(est)
      ? maxLength
      : Math.max(1, Math.min(est, maxLength));
    return { text, origIdx, estTokens };
  });
  indexed.sort((a, b) => a.estTokens - b.estTokens);

  // ── 2. Adaptive memory guard ──
  // Only hosts above 32 GB of RAM are exempt. When the guard is active the
  // concurrent pool dispatch below is skipped in favour of sequential batches.
  const totalMemGB = os.totalmem() / 1024 / 1024 / 1024;
  const memGuardActive = !process.env.SWEET_SEARCH_DISABLE_MEM_GUARD
    && !(totalMemGB > 32);

  // ── 3. Per-batch limits (all loop-invariant) ──
  // Attention budget caps per-batch compute work (seq² × batch) on top of the
  // token budget (memory). The long-sequence tail is where it matters: a batch
  // of 64 × 512 tokens does far more attention compute than a head batch of
  // 128 × 50 tokens at the same token budget, and its activation working set
  // can overflow last-level cache.
  //
  // Cache-aware sizing: one resident transformer layer's weights compete with
  // activations for the same cache, so the batch size at maxLength is chosen
  // such that (weights + B × per_item) fits. Weight dtype depends on the path
  // (F32 native, INT8 ORT); activations are F32 on both.
  const envEmbedAttnBudget = Number.parseInt(
    process.env.SWEET_SEARCH_EMBED_ATTENTION_BUDGET || '',
    10,
  );
  const nativeActive = isNativeInferenceAvailable();
  const parsedCacheSafety = Number(process.env.SWEET_SEARCH_EMBED_L2_SAFETY);
  const cacheBoundBatch = computeWeightsAwareBatchCap({
    cacheBytes: detectLastLevelCacheBytes(),
    hiddenDim: LOCAL_EMBEDDING_HIDDEN_DIM,
    maxLength,
    weightBytesPerParam: nativeActive ? 4 : 1, // F32 native | INT8 ORT
    actBytesPerItem: 4, // F32 activations on both paths
    safety: Number.isFinite(parsedCacheSafety) && parsedCacheSafety > 0
      ? parsedCacheSafety
      : 1.0,
  });

  const tokenBudget = 16384;
  const baseHardCap = options.hardCap ?? (maxLength <= 256 ? 128 : 64);
  const resolveHardCap = options.resolveHardCap ?? (() => baseHardCap);

  let attentionBudget;
  if (envEmbedAttnBudget === 0) {
    attentionBudget = Infinity; // explicitly disabled
  } else if (Number.isFinite(envEmbedAttnBudget) && envEmbedAttnBudget > 0) {
    attentionBudget = envEmbedAttnBudget;
  } else {
    const tierLongSeqBatch = Math.max(1, Math.floor(baseHardCap / 2));
    const effectiveLongSeqBatch = cacheBoundBatch != null
      ? Math.min(tierLongSeqBatch, cacheBoundBatch)
      : tierLongSeqBatch;
    attentionBudget = effectiveLongSeqBatch * maxLength * maxLength;
  }

  // ── 4. Greedy batch planning over the sorted list ──
  // Because the list is ascending, the candidate being added is always the
  // longest item of the would-be batch and therefore sets the padded length.
  const batches = [];
  let start = 0;
  while (start < indexed.length) {
    let batchSize = 1;
    while (start + batchSize < indexed.length) {
      const candidateLongest = Math.min(indexed[start + batchSize].estTokens, maxLength);
      const candidateCount = batchSize + 1;
      if (candidateCount > resolveHardCap(candidateLongest)) break;
      // Memory cap — linear in seq_len
      if (candidateLongest * candidateCount > tokenBudget) break;
      // Compute cap — quadratic in seq_len
      if (candidateLongest * candidateLongest * candidateCount > attentionBudget) break;
      batchSize = candidateCount;
    }
    batches.push(indexed.slice(start, start + batchSize));
    start += batchSize;
  }

  // ── 5. Shared result plumbing ──
  const embeddings = new Array(texts.length);
  let completed = 0;
  // Writes a batch's vectors back to the slots of the original input order.
  const storeBatch = (batch, vectors) => {
    for (let j = 0; j < batch.length; j++) {
      embeddings[batch[j].origIdx] = vectors[j];
    }
  };
  const reportBatchDone = (batch) => {
    completed += batch.length;
    if (onProgress) onProgress(completed, texts.length);
  };
  // Runs one planned batch through `embed` and records its results.
  const runBatch = async (batch, embed) => {
    const vectors = await embed(batch.map((entry) => entry.text), { maxLength });
    storeBatch(batch, vectors);
    reportBatchDone(batch);
  };

  // ── 6a. Concurrent pool dispatch ──
  // Progress reporting is delegated to embedBatchesWithPool on this path.
  const pool = getEmbeddingPool();
  if (pool && !memGuardActive) {
    const batchResults = await embedBatchesWithPool(pool, batches, maxLength, onProgress, texts.length);
    batches.forEach((batch, b) => storeBatch(batch, batchResults[b]));
    return embeddings;
  }

  // ── 6b. Hybrid CPU+GPU dispatch (OPT-IN, experimental) ──
  // Why opt-in: in the default pipeline the GPU device queue is shared with
  // the parallel LI phase, and on unified-memory systems both encoders contend
  // for the same DRAM bandwidth, which usually erases the parallelism win.
  const embedHybridEnv = (process.env.SWEET_SEARCH_EMBED_HYBRID ?? '').trim().toLowerCase();
  const embedHybridEnabled = embedHybridEnv === '1' || embedHybridEnv === 'true' || embedHybridEnv === 'on';
  let useHybrid = false;
  if (embedHybridEnabled && !pool && isNativeInferenceAvailable() && isNativeEmbeddingModelLoaded()) {
    // Probe ORT availability: hybrid needs a usable CPU session too. A failed
    // probe is not an error, it just means hybrid stays off.
    try {
      const ortPipeline = await getLocalPipeline();
      if (ortPipeline?.session) {
        useHybrid = true;
        console.log('[Embedding] hybrid CPU+GPU dispatch enabled (smart routing)');
      }
    } catch {
      useHybrid = false;
    }
  }

  if (useHybrid) {
    // Bidirectional routing over the ascending batch list: the GPU worker
    // takes from the END (longest batches), the CPU worker from the BEGINNING
    // (shortest), and they meet in the middle. Each claim (bounds check +
    // index update) happens synchronously between awaits, so a batch is never
    // claimed twice.
    let front = 0;
    let back = batches.length - 1;
    const runGpu = async () => {
      while (back >= front) {
        const claimed = back--;
        await runBatch(batches[claimed], callLocalModelGpu);
      }
    };
    const runCpu = async () => {
      while (front <= back) {
        const claimed = front++;
        await runBatch(batches[claimed], callLocalModelCpu);
      }
    };
    await Promise.all([runGpu(), runCpu()]);
    return embeddings;
  }

  // ── 6c. Sequential path ──
  // Uses the pool if one exists (reached here only when the memory guard is
  // active), otherwise the in-process callLocalModel dispatcher.
  const infer = pool ? (t, o) => pool.embed(t, o) : callLocalModel;
  for (const batch of batches) {
    await runBatch(batch, infer);
  }

  return embeddings;
}
815
+
816
+ // =============================================================================
817
+ // QUERY PREFIX
818
+ // =============================================================================
819
+
820
/**
 * Prepends the local provider's query prefix to `text` unless it is already
 * there.
 *
 * The prefix is read from `EMBEDDING_PROVIDERS.local.queryPrefix`; a missing
 * provider entry or a falsy prefix leaves the text unchanged.
 *
 * @param {string} text - Query text.
 * @returns {string} The text, guaranteed to start with the prefix when one is
 *   configured.
 */
export function applyLocalQueryPrefix(text) {
  const queryPrefix = EMBEDDING_PROVIDERS.local?.queryPrefix || '';

  // No prefix configured: `text` is returned without being inspected.
  if (!queryPrefix) {
    return text;
  }

  return text.startsWith(queryPrefix) ? text : queryPrefix + text;
}
827
+
828
+ // =============================================================================
829
+ // LIFECYCLE
830
+ // =============================================================================
831
+
832
/**
 * Drops the cached local pipeline and releases its ORT session.
 *
 * Module state (`localPipeline`, `isLoadingLocal`, `loadPromise`) is cleared
 * synchronously BEFORE the asynchronous `session.release()` is awaited. That
 * ordering matters: while the release is in flight no other caller can be
 * handed the session that is being torn down, and a load started during that
 * window is not wiped out when the release finally settles.
 *
 * Releasing is best-effort: a failing `release()` is swallowed on purpose so
 * that unload always leaves the module in the "not loaded" state.
 *
 * Note: ORT has a known native memory leak in session.release()
 * (microsoft/onnxruntime#25325) — avoid frequent load/unload cycles.
 *
 * @returns {Promise<void>}
 */
export async function unloadLocalModel() {
  const session = localPipeline?.session;

  // Detach first so the module never advertises a session mid-release.
  localPipeline = null;
  isLoadingLocal = false;
  loadPromise = null;

  if (!session) return;

  try {
    await session.release();
  } catch {
    /* best-effort cleanup */
  }
}
842
+
843
/**
 * Reports whether a local pipeline is currently cached.
 *
 * @returns {boolean} `true` whenever the module-level `localPipeline` is
 *   anything other than `null`.
 */
export function isLocalModelLoaded() {
  const isUnloaded = localPipeline === null;
  return !isUnloaded;
}