@nataliapc/mcp-openmsx 1.2.9 → 1.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +41 -2
  2. package/bin/win-x64/mcp-openmsx-sspi-proxy.exe +0 -0
  3. package/dist/chunker.js +187 -0
  4. package/dist/embedder.js +250 -0
  5. package/dist/openmsx.js +113 -248
  6. package/dist/openmsx_windows.js +316 -0
  7. package/dist/server.js +6 -1
  8. package/dist/server_tools.js +6 -5
  9. package/dist/vectordb.js +94 -35
  10. package/package.json +16 -18
  11. package/resources/audio/chipsfmpacpr1_en.md +209 -0
  12. package/resources/audio/chipsfmpacpr2_en.md +170 -0
  13. package/resources/audio/toc.json +12 -0
  14. package/resources/book--msx-top-secret-3/MTS3-Appendix-English-Upd2.pdf +0 -0
  15. package/resources/book--msx-top-secret-3/MTS3-Complete-English.pdf +0 -0
  16. package/resources/book--msx2-technical-handbook/toc.json +1 -1
  17. package/resources/book--the-msx-red-book/Chapter1_Programmable_Peripheral_Interface.md +112 -0
  18. package/resources/book--the-msx-red-book/Chapter2_Video_Display_Processor.md +308 -0
  19. package/resources/book--the-msx-red-book/Chapter3_Programmable_Sound_Generator.md +168 -0
  20. package/resources/book--the-msx-red-book/Chapter4_ROM_BIOS.md +2528 -0
  21. package/resources/book--the-msx-red-book/Chapter5_ROM_BASIC_Interpreter.md +3975 -0
  22. package/resources/book--the-msx-red-book/Chapter6_Memory_Map.md +1963 -0
  23. package/resources/book--the-msx-red-book/Chapter7_Machine_Code_Programs.md +1238 -0
  24. package/resources/book--the-msx-red-book/Introduction.md +104 -0
  25. package/resources/book--the-msx-red-book/toc.json +38 -3
  26. package/resources/processors/toc.json +3 -3
  27. package/resources/processors/z80-undocumented.md +141 -0
  28. package/resources/sdcc/1_Introduction.md +199 -0
  29. package/resources/sdcc/2_Installing_SDCC.md +533 -0
  30. package/resources/sdcc/3_Using_SDCC.md +1758 -0
  31. package/resources/sdcc/4_Notes_on_supported_Processors.md +1638 -0
  32. package/resources/sdcc/5_Debugging.md +210 -0
  33. package/resources/sdcc/6_Tips_and_Support.md +258 -0
  34. package/resources/sdcc/7_SDCC_Technical_Data.md +489 -0
  35. package/resources/sdcc/8_Compiler_internals.md +477 -0
  36. package/resources/sdcc/toc.json +44 -2
  37. package/vector-db/msxdocs.lance/_indices/4d3bd360-e3c6-408d-b0ff-a4d6bd9580cb/metadata.lance +0 -0
  38. package/vector-db/msxdocs.lance/_indices/4d3bd360-e3c6-408d-b0ff-a4d6bd9580cb/part_0_docs.lance +0 -0
  39. package/vector-db/msxdocs.lance/_indices/4d3bd360-e3c6-408d-b0ff-a4d6bd9580cb/part_0_invert.lance +0 -0
  40. package/vector-db/msxdocs.lance/_indices/4d3bd360-e3c6-408d-b0ff-a4d6bd9580cb/part_0_tokens.lance +0 -0
  41. package/vector-db/msxdocs.lance/_transactions/0-6f47c9fc-3657-40f0-9dd4-c7226b2a4805.txn +0 -0
  42. package/vector-db/msxdocs.lance/_transactions/1-2bb7426e-a4b0-40ea-9a58-00c4985fc6a9.txn +0 -0
  43. package/vector-db/msxdocs.lance/_versions/18446744073709551613.manifest +0 -0
  44. package/vector-db/msxdocs.lance/_versions/18446744073709551614.manifest +0 -0
  45. package/vector-db/msxdocs.lance/_versions/latest_version_hint.json +1 -0
  46. package/vector-db/msxdocs.lance/data/110001110001011010001000876c134b8296fbc47762d1e1ab.lance +0 -0
  47. package/resources/book--the-msx-red-book/the_msx_red_book.md +0 -10349
  48. package/resources/processors/z80-undocumented.tex +0 -5617
  49. package/resources/sdcc/lyx2md.py +0 -745
  50. package/resources/sdcc/sdccman.lyx +0 -81574
  51. package/resources/sdcc/sdccman.md +0 -5557
  52. package/vector-db/index.json +0 -1
package/README.md CHANGED
@@ -46,7 +46,7 @@ This project creates a bridge between modern AI-assisted development (e.g. GitHu
46
46
  - **Video Control**: VDP register manipulation and screen capture.
47
47
  - **Memory Operations**: Read/write RAM, VRAM, and I/O port access.
48
48
  - **Automation**: Keyboard input simulation and savestate management.
49
- - **Vector DB Integration**: Query an embedded vector database with MSX resources for development support.
49
+ - **Hybrid Documentation Search**: Query an embedded local index of MSX resources combining semantic (multilingual embeddings) and keyword (BM25) search, runs fully offline.
50
50
  - **Hybrid Mode**: This MCP server supports hybrid access mode (_STDIO_ and _HTTP_ transports).
51
51
 
52
52
  ## Architecture
@@ -117,7 +117,7 @@ The MCP server translates high-level natural language commands from your Copilot
117
117
  - `msxdocs_resource_get`: Retrieve MCP resources for MCP clients that don't support MCP resources.
118
118
 
119
119
  ### Documentation Tools
120
- - `vector_db_query`: Query the Vector DB resources to obtain information about MSX systems, cartridges, and other development resources.
120
+ - `vector_db_query`: Hybrid search (semantic embeddings + BM25) over the local MSX documentation index, for information about MSX systems, cartridges, programming, and other development resources.
121
121
  - `msxdocs_resource_get`: Retrieve MCP resources for MCP clients that don't support MCP resources.
122
122
 
123
123
  ## Available MCP Resources
@@ -194,6 +194,9 @@ Steps to install the MCP server in VSCode:
194
194
  }
195
195
  ```
196
196
 
197
+ > [!NOTE]
198
+ > On Windows, you can change the `command` field to `npx.cmd` if you experience permission issues.
199
+
197
200
  > [!NOTE]
198
201
  > Environment variables are optional. Customize them as you need.
199
202
 
@@ -249,6 +252,42 @@ Edit it to include the following JSON entry:
249
252
  | `MCP_TRANSPORT` | Transport mode (`stdio` or `http`) | `stdio` | `http` |
250
253
  | `MCP_HTTP_PORT` | Port number for HTTP transport mode | `3000` | `8080` |
251
254
  | `MCP_ALLOWED_ORIGINS` | Comma-separated list of allowed origins for HTTP transport | Empty for all allowed | `http://localhost,http://mydomain.com` |
255
+ | `OPENMSX_WINDOWS_CONTROL` | **Windows only.** How the server talks to openMSX's control socket (see below) | `stdio-proxy` | `direct-sspi` |
256
+ | `OPENMSX_WINDOWS_PROXY_EXECUTABLE` | **Windows only.** Override path to the SSPI proxy helper (development) | Bundled `bin/win-x64/mcp-openmsx-sspi-proxy.exe` | `C:\path\to\mcp-openmsx-sspi-proxy.exe` |
257
+ | `OPENMSX_MODELS_CACHE` | Directory where the embedding model is cached (also honors `HF_HOME` / `TRANSFORMERS_CACHE`) | `~/.cache/mcp-openmsx` | `/opt/models` |
258
+ | `OPENMSX_EMBED_PROVIDER` | **Index generator only.** `cuda` uses the GPU (fp32 model) to regenerate the index, falling back to CPU if CUDA is unavailable. The MCP server itself always uses CPU/int8 and ignores this variable. | (generator: `cpu`) | `cuda` |
259
+
260
+ #### Documentation search model
261
+
262
+ The `vector_db_query` tool runs a local hybrid search (semantic embeddings + BM25). The embedding model
263
+ (`multilingual-e5-small`, ONNX quantized, ~118 MB, 512-token context, multilingual) is **downloaded once**
264
+ from the HuggingFace Hub on the first query and cached on disk (see `OPENMSX_MODELS_CACHE` above). After that
265
+ it runs fully offline. No API key is required. To pre-populate the cache for air-gapped environments, run one
266
+ query on a networked machine and copy the cache directory.
267
+
268
+ Regenerating the index (rare) embeds the whole corpus. On CPU this is slow; on an NVIDIA GPU set
269
+ `OPENMSX_EMBED_PROVIDER=cuda` for the generator to use it (requires CUDA 13 runtime libraries + cuDNN 9),
270
+ which is ~50× faster. The GPU path uses the larger fp32 model; the server keeps using the int8 model and
271
+ the two are interchangeable for search (same ranking). End users never download the fp32 model.
272
+
273
+ #### Windows control modes (`OPENMSX_WINDOWS_CONTROL`)
274
+
275
+ On Windows, openMSX is a GUI app whose TCP control socket requires SSPI (Negotiate/NTLM) authentication. The server supports several transports:
276
+
277
+ | Value | Description |
278
+ |-------|-------------|
279
+ | `stdio-proxy` | **Default.** Launches a self-contained .NET helper that performs SSPI and exposes a clean XML stdio channel — the most robust path. |
280
+ | `direct-sspi` | Authenticates from Node via the optional `node-expose-sspi` package. Fallback / debugging. |
281
+ | `socket` | Legacy alias of `direct-sspi`. |
282
+
283
+ Linux and macOS are unaffected (`openmsx -control stdio`).
284
+
285
+ The bundled proxy is built from `helpers/openmsx-sspi-proxy` and can be rebuilt reproducibly from Linux with Docker (no local .NET required):
286
+
287
+ ```bash
288
+ cd mcp-server
289
+ pnpm build:proxy:win-x64:docker # → bin/win-x64/mcp-openmsx-sspi-proxy.exe
290
+ ```
252
291
 
253
292
 
254
293
  ## Advanced Manual Usage
@@ -0,0 +1,187 @@
1
+ /**
2
+ * Local text chunkers (no external API).
3
+ *
4
+ * Two strategies:
5
+ * - `chunkText`: deterministic, markdown-aware, fixed-size with overlap.
6
+ * Used as a fallback and to hard-split oversized units.
7
+ * - `semanticChunk`: groups consecutive sentences by embedding similarity
8
+ * (cosine), so each chunk stays topically coherent, up to a size bound.
9
+ * Requires an embedding function (injected) — the model runs locally.
10
+ *
11
+ * Sizing targets the embedding model's context window. With multilingual-e5
12
+ * (max 512 tokens) we aim for ~1600 characters (~400 tokens), leaving room for
13
+ * the "passage: " prefix.
14
+ *
15
+ * @author Natalia Pujol Cremades (@nataliapc)
16
+ * @license GPL2
17
+ */
18
+ export const DEFAULT_MAX_CHARS = 1600;
19
+ export const DEFAULT_OVERLAP = 100;
20
/**
 * Hard-split a single oversized block into overlapping windows.
 *
 * Windows are cut at fixed offsets, each window is trimmed, and empty windows
 * are dropped. Cut points are nudged back by one code unit when they would
 * fall between the two halves of a UTF-16 surrogate pair, so no chunk ends or
 * starts with a lone surrogate.
 *
 * @param {string} s - Text to split.
 * @param {number} maxChars - Maximum window length in UTF-16 code units.
 * @param {number} overlap - Code units shared by consecutive windows.
 * @returns {string[]} The non-empty, trimmed windows in order.
 */
function splitLong(s, maxChars, overlap) {
    const out = [];
    // Always advance by at least one unit, even if overlap >= maxChars.
    const step = Math.max(1, maxChars - overlap);
    // True when index `i` sits between a high and a low surrogate.
    const splitsPair = (i) => {
        if (i <= 0 || i >= s.length) {
            return false;
        }
        const before = s.charCodeAt(i - 1);
        const after = s.charCodeAt(i);
        return before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    };
    let start = 0;
    while (start < s.length) {
        let end = Math.min(start + maxChars, s.length);
        // Back off only when the window keeps at least one unit.
        if (splitsPair(end) && end - 1 > start) {
            end -= 1;
        }
        const piece = s.slice(start, end).trim();
        if (piece) {
            out.push(piece);
        }
        if (end >= s.length) {
            break;
        }
        let next = start + step;
        // Start the next window on the pair's first half, but never stall.
        if (splitsPair(next) && next - 1 > start) {
            next -= 1;
        }
        start = next;
    }
    return out;
}
38
/**
 * Split `text` into overlapping fixed-size chunks along paragraph boundaries.
 *
 * The text is normalized (CRLF to LF, trimmed) and split on blank lines.
 * Consecutive blocks are packed into a buffer joined by a single newline until
 * adding the next block would exceed `maxChars`. The buffer is then emitted
 * and the next buffer starts with the last `overlap` characters of the
 * previous one. A single block longer than `maxChars` is hard-split with
 * `splitLong`.
 *
 * Note: a chunk that starts with an overlap tail can be up to
 * `overlap + 1` characters longer than `maxChars`.
 *
 * @param {string} text - Input text.
 * @param {{maxChars?: number, overlap?: number}} [opts] - Size options;
 *   defaults are `DEFAULT_MAX_CHARS` and `DEFAULT_OVERLAP`.
 * @returns {string[]} The chunks in document order; `[]` for empty or
 *   whitespace-only input.
 */
export function chunkText(text, opts = {}) {
    const maxChars = opts.maxChars ?? DEFAULT_MAX_CHARS;
    const overlap = opts.overlap ?? DEFAULT_OVERLAP;
    const clean = text.replace(/\r\n/g, '\n').trim();
    if (!clean) {
        return [];
    }
    if (clean.length <= maxChars) {
        return [clean];
    }
    const blocks = clean
        .split(/\n{2,}/)
        .map((b) => b.trim())
        .filter(Boolean);
    const chunks = [];
    let buf = '';
    const flush = () => {
        if (buf.trim()) {
            chunks.push(buf.trim());
        }
    };
    for (const block of blocks) {
        if (block.length > maxChars) {
            // Oversized block: emit what we have, then hard-split the block.
            flush();
            buf = '';
            chunks.push(...splitLong(block, maxChars, overlap));
            continue;
        }
        if (buf && buf.length + block.length + 1 > maxChars) {
            flush();
            // `slice(-0)` equals `slice(0)` (the whole string), so a zero or
            // negative overlap must yield an empty tail, not the full buffer.
            const tail = overlap > 0 ? buf.slice(-overlap) : '';
            buf = tail ? `${tail}\n${block}` : block;
        }
        else {
            buf = buf ? `${buf}\n${block}` : block;
        }
    }
    flush();
    return chunks;
}
82
+ export const SEMANTIC_DEFAULTS = {
83
+ maxChars: 1800,
84
+ minChars: 250,
85
+ similarityThreshold: 0.90,
86
+ };
87
/**
 * Break text into sentence-like units.
 *
 * A boundary is any whitespace run that follows one of `. ! ? : ;`, or any run
 * of line breaks. Units are trimmed and empty ones are discarded.
 *
 * @param {string} text - Input text (CRLF is normalized to LF first).
 * @returns {string[]} The non-empty units in order.
 */
export function splitSentences(text) {
    const normalized = text.replace(/\r\n/g, '\n');
    const sentences = [];
    for (const part of normalized.split(/(?<=[.!?:;])\s+|\n+/)) {
        const trimmed = part.trim();
        if (trimmed) {
            sentences.push(trimmed);
        }
    }
    return sentences;
}
95
/**
 * Dot product of two vectors, iterating over the length of `a`.
 * For equal-length, L2-normalized inputs this is the cosine similarity.
 *
 * @param {ArrayLike<number>} a - First vector (its length drives the loop).
 * @param {ArrayLike<number>} b - Second vector.
 * @returns {number} Sum of the element-wise products.
 */
function dot(a, b) {
    let total = 0;
    let idx = 0;
    while (idx < a.length) {
        total += a[idx] * b[idx];
        idx += 1;
    }
    return total;
}
103
/**
 * Group consecutive text units by embedding similarity into coherent chunks.
 *
 * Units are paragraphs (split on blank lines). A paragraph longer than
 * `maxChars` is split into sentences, and a sentence longer than `maxChars` is
 * hard-split without overlap. All units are embedded in one `embedFn` call. A
 * running, re-normalized centroid represents the current group. A unit starts
 * a new chunk when its similarity to the centroid is below the threshold or
 * when appending it would exceed `maxChars`. Finally, chunks shorter than
 * `minChars` are merged into the previous chunk when the result still fits.
 *
 * @param {string} text - Input text.
 * @param {(units: string[]) => Promise<ArrayLike<number>[]>} embedFn - Returns
 *   one vector per unit, in order. The centroid maths treats the dot product
 *   as cosine similarity, so vectors are expected to be L2-normalized.
 * @param {{maxChars?: number, minChars?: number, similarityThreshold?: number}} [opts]
 *   Overrides for `SEMANTIC_DEFAULTS`.
 * @returns {Promise<string[]>} Chunks in document order; `[]` for empty input.
 * @throws {Error} If `embedFn` does not return exactly one non-empty vector of
 *   consistent length per unit.
 */
export async function semanticChunk(text, embedFn, opts = {}) {
    const maxChars = opts.maxChars ?? SEMANTIC_DEFAULTS.maxChars;
    const minChars = opts.minChars ?? SEMANTIC_DEFAULTS.minChars;
    const threshold = opts.similarityThreshold ?? SEMANTIC_DEFAULTS.similarityThreshold;
    const clean = text.replace(/\r\n/g, '\n').trim();
    if (!clean) {
        return [];
    }
    // Units = paragraphs (split on blank lines). Oversized paragraphs are split
    // into sentences, and oversized sentences hard-split. Paragraph granularity
    // keeps the number of embeddings small while staying topically meaningful.
    const units = [];
    for (const para of clean.split(/\n{2,}/).map((p) => p.trim()).filter(Boolean)) {
        if (para.length <= maxChars) {
            units.push(para);
            continue;
        }
        for (const s of splitSentences(para)) {
            if (s.length > maxChars) {
                units.push(...splitLong(s, maxChars, 0));
            }
            else {
                units.push(s);
            }
        }
    }
    if (units.length === 0) {
        return [];
    }
    if (units.length === 1) {
        // Nothing to compare against: skip the embedding call entirely.
        return [units[0]];
    }
    const embeddings = await embedFn(units);
    // Fail loudly on a malformed embedFn result instead of crashing later on
    // `undefined` or silently producing NaN similarities.
    if (!embeddings || embeddings.length !== units.length) {
        throw new Error(`semanticChunk: embedFn returned ${embeddings?.length ?? 0} embeddings for ${units.length} units`);
    }
    const dim = embeddings[0]?.length;
    if (!dim || embeddings.some((e) => !e || e.length !== dim)) {
        throw new Error('semanticChunk: embedFn returned empty or inconsistently sized vectors');
    }
    const chunks = [];
    let groupText = units[0];
    let sum = embeddings[0].slice(); // running sum of member vectors
    let centroid = embeddings[0]; // normalized centroid
    const renormalize = (v) => {
        let n = 0;
        for (let i = 0; i < dim; i++) {
            n += v[i] * v[i];
        }
        // Guard against division by zero for an all-zero sum.
        n = Math.max(Math.sqrt(n), 1e-12);
        return v.map((x) => x / n);
    };
    for (let i = 1; i < units.length; i++) {
        const sim = dot(centroid, embeddings[i]);
        const wouldOverflow = groupText.length + 1 + units[i].length > maxChars;
        if (sim >= threshold && !wouldOverflow) {
            groupText += `\n${units[i]}`;
            for (let d = 0; d < dim; d++) {
                sum[d] += embeddings[i][d];
            }
            centroid = renormalize(sum);
        }
        else {
            chunks.push(groupText);
            groupText = units[i];
            sum = embeddings[i].slice();
            centroid = embeddings[i];
        }
    }
    chunks.push(groupText);
    // Merge tiny fragments into the previous chunk when the result still fits.
    const merged = [];
    for (const c of chunks) {
        const prev = merged[merged.length - 1];
        if (prev && c.length < minChars && prev.length + 1 + c.length <= maxChars) {
            merged[merged.length - 1] = `${prev}\n${c}`;
        }
        else {
            merged.push(c);
        }
    }
    return merged;
}
@@ -0,0 +1,250 @@
1
+ /**
2
+ * Local text embedding engine.
3
+ *
4
+ * Uses onnxruntime-node + @anush008/tokenizers (both prebuilt napi, no `sharp`)
5
+ * to run the multilingual model `multilingual-e5-small` fully offline.
6
+ *
7
+ * The ONNX weights + tokenizer are downloaded on first use from the
8
+ * HuggingFace Hub and cached on disk. The same module is the single source of
9
+ * truth for embeddings in both generation (vector-db) and query (server).
10
+ *
11
+ * Model notes (e5):
12
+ * - MEAN pooling over masked token embeddings + L2 normalization. Do NOT
13
+ * switch to CLS pooling — it would silently degrade retrieval quality.
14
+ * - e5 is trained with asymmetric prefixes: queries must be prefixed with
15
+ * "query: " and documents/passages with "passage: ". Use `embedQuery` for
16
+ * search input and `embedPassage` for indexed text. The prefix is only used
17
+ * to compute the vector; it is never stored.
18
+ * - max_seq_length = 512 (enough for ~400-token semantic chunks).
19
+ *
20
+ * @author Natalia Pujol Cremades (@nataliapc)
21
+ * @license GPL2
22
+ */
23
+ import * as ort from 'onnxruntime-node';
24
+ import { Tokenizer } from '@anush008/tokenizers';
25
+ import * as fs from 'fs';
26
+ import * as path from 'path';
27
+ import * as os from 'os';
28
+ const MODEL_REPO = 'Xenova/multilingual-e5-small';
29
+ // int8 ONNX is fastest on CPU; the GPU uses the fp32 ONNX (CUDA has no
30
+ // efficient kernels for dynamic-quantized ops, so the int8 model falls back to
31
+ // CPU even under the CUDA provider). fp32 (not fp16) is used on GPU so the
32
+ // output stays Float32Array — the fp16 model emits float16 output that the
33
+ // pooling code cannot read directly. The file is chosen per provider.
34
+ const ONNX_FILE_CPU = 'onnx/model_quantized.onnx';
35
+ const ONNX_FILE_GPU = 'onnx/model.onnx';
36
+ const TOKENIZER_FILE = 'tokenizer.json';
37
+ const HF_BASE = `https://huggingface.co/${MODEL_REPO}/resolve/main`;
38
+ /** Embedding dimensionality of the model. */
39
+ export const EMBEDDING_DIM = 384;
40
+ /** Model max sequence length (e5 max_seq_length). */
41
+ const MAX_LENGTH = 512;
42
+ let enginePromise = null;
43
+ // Server-safe default: the MCP server NEVER downloads or runs the large fp32
44
+ // model. Only an explicit setEmbedProvider('cuda') — used by the offline index
45
+ // generator — can opt into the GPU. The server never calls it, so it stays int8.
46
+ let requestedProvider = 'cpu';
47
/**
 * Choose the execution provider used for embeddings.
 *
 * The choice is only honoured while no engine initialization has been
 * started; once `enginePromise` is set this function throws. The module
 * default is 'cpu'.
 *
 * @param {string} provider - Requested provider (the engine checks for 'cuda').
 * @throws {Error} If an engine has already been requested.
 */
export function setEmbedProvider(provider) {
    const engineAlreadyRequested = Boolean(enginePromise);
    if (engineAlreadyRequested) {
        throw new Error('setEmbedProvider must be called before the first embedding');
    }
    requestedProvider = provider;
}
59
/**
 * Compute the directory where the model files are cached.
 *
 * The root is the first non-empty value among `OPENMSX_MODELS_CACHE`,
 * `HF_HOME` and `TRANSFORMERS_CACHE`, falling back to
 * `~/.cache/mcp-openmsx`. The result is `<root>/models/<repo>`, with the
 * first `/` of the repo id replaced by `__`.
 *
 * @returns {string} Cache directory path for the configured model repo.
 */
function getCacheDir() {
    const { OPENMSX_MODELS_CACHE, HF_HOME, TRANSFORMERS_CACHE } = process.env;
    const root = OPENMSX_MODELS_CACHE ||
        HF_HOME ||
        TRANSFORMERS_CACHE ||
        path.join(os.homedir(), '.cache', 'mcp-openmsx');
    const repoDirName = MODEL_REPO.replace('/', '__');
    return path.join(root, 'models', repoDirName);
}
67
/**
 * Download a single file from the HF Hub to `dest` if not already present.
 *
 * A file counts as present when it exists and is non-empty. Otherwise the
 * payload is fetched fully into memory, written to a uniquely named temporary
 * file next to `dest`, and renamed into place, so a crash mid-write cannot
 * leave a truncated file that later looks "present". The unique name keeps
 * concurrent downloaders from writing into the same temporary file.
 *
 * @param {string} remote - Path of the file relative to the repo root.
 * @param {string} dest - Absolute destination path on disk.
 * @returns {Promise<void>}
 * @throws {Error} On a non-OK HTTP response, a missing or empty body, or a
 *   filesystem failure. The temporary file is removed in that case.
 */
async function downloadFile(remote, dest) {
    if (fs.existsSync(dest) && fs.statSync(dest).size > 0) {
        return;
    }
    await fs.promises.mkdir(path.dirname(dest), { recursive: true });
    const url = `${HF_BASE}/${remote}`;
    const res = await fetch(url);
    if (!res.ok || !res.body) {
        throw new Error(`Failed to download model file ${url}: ${res.status} ${res.statusText}`);
    }
    const buffer = Buffer.from(await res.arrayBuffer());
    if (buffer.length === 0) {
        // An empty file would be re-downloaded on every start anyway; fail now
        // with a clear message instead of handing a 0-byte model to the loader.
        throw new Error(`Failed to download model file ${url}: empty response body`);
    }
    // Unique per process and call, so two downloaders never share a temp file.
    const tmp = `${dest}.${process.pid}.${Math.random().toString(36).slice(2)}.download`;
    try {
        await fs.promises.writeFile(tmp, buffer);
        await fs.promises.rename(tmp, dest);
    }
    catch (err) {
        // Best-effort cleanup; the original failure is what the caller needs.
        await fs.promises.rm(tmp, { force: true }).catch(() => { });
        throw err;
    }
}
85
/**
 * Make sure the given ONNX file and the tokenizer are in the cache directory,
 * downloading whichever is missing. Both downloads are started together.
 *
 * @param {string} onnxFile - Repo-relative path of the ONNX model file.
 * @returns {Promise<{onnxPath: string, tokenizerPath: string}>} Local paths.
 */
async function ensureFiles(onnxFile) {
    const cacheDir = getCacheDir();
    const localPaths = {
        onnxPath: path.join(cacheDir, onnxFile),
        tokenizerPath: path.join(cacheDir, TOKENIZER_FILE),
    };
    const wanted = [
        [onnxFile, localPaths.onnxPath],
        [TOKENIZER_FILE, localPaths.tokenizerPath],
    ];
    await Promise.all(wanted.map(([remote, dest]) => downloadFile(remote, dest)));
    return localPaths;
}
96
+ const baseSessionOptions = {
97
+ graphOptimizationLevel: 'all',
98
+ intraOpNumThreads: Math.max(1, os.cpus().length),
99
+ interOpNumThreads: 1,
100
+ executionMode: 'sequential',
101
+ };
102
/**
 * Check whether a CUDA-backed inference session can be created.
 *
 * A session is created from `probeOnnxPath` with the 'cuda' execution provider
 * and released right away if the session exposes `release`. Any failure along
 * the way is reported as "not available".
 *
 * @param {string} probeOnnxPath - Path to an ONNX model used only for probing.
 * @returns {Promise<boolean>} `true` only if the CUDA session loaded.
 */
async function cudaAvailable(probeOnnxPath) {
    let loaded = false;
    try {
        const cudaOptions = { ...baseSessionOptions, executionProviders: ['cuda'] };
        const session = await ort.InferenceSession.create(probeOnnxPath, cudaOptions);
        await session.release?.();
        loaded = true;
    }
    catch {
        loaded = false;
    }
    return loaded;
}
118
/**
 * Return the shared `{ session, tokenizer }` engine, creating it on first use.
 *
 * The int8 model and the tokenizer are always fetched first. The fp32 model is
 * downloaded and loaded only when 'cuda' was requested and the CUDA probe
 * succeeded; otherwise the int8 model is loaded with the base session options.
 * If initialization fails, the cached promise is cleared so that a later call
 * can retry.
 *
 * @returns {Promise<{session: object, tokenizer: object}>} The engine.
 */
function getEngine() {
    if (enginePromise) {
        return enginePromise;
    }
    const buildEngine = async () => {
        const { onnxPath: int8Path, tokenizerPath } = await ensureFiles(ONNX_FILE_CPU);
        const tokenizer = Tokenizer.fromFile(tokenizerPath);
        tokenizer.setTruncation(MAX_LENGTH);
        const wantsCuda = requestedProvider === 'cuda';
        if (wantsCuda && (await cudaAvailable(int8Path))) {
            // CUDA confirmed: only now download and load the fp32 model.
            const { onnxPath: fp32Path } = await ensureFiles(ONNX_FILE_GPU);
            const cudaSession = await ort.InferenceSession.create(fp32Path, {
                ...baseSessionOptions,
                executionProviders: ['cuda'],
            });
            process.stderr.write('[embedder] using CUDA execution provider (fp32)\n');
            return { session: cudaSession, tokenizer };
        }
        if (wantsCuda) {
            process.stderr.write('[embedder] CUDA requested but unavailable; using CPU (int8)\n');
        }
        const cpuSession = await ort.InferenceSession.create(int8Path, baseSessionOptions);
        return { session: cpuSession, tokenizer };
    };
    enginePromise = buildEngine().catch((err) => {
        // Forget the failed attempt so a transient error can be retried.
        enginePromise = null;
        throw err;
    });
    return enginePromise;
}
155
+ /**
156
+ * Default batch size for batched inference.
157
+ */
158
+ const BATCH_SIZE = 32;
159
+ // XLM-RoBERTa / e5 pad token id. Padded positions get attention_mask 0, so the
160
+ // exact id is irrelevant to the pooled result; it only fills the tensor.
161
+ const PAD_ID = 1n;
162
/**
 * Embed already-prefixed inputs in batches, one ONNX run per batch.
 *
 * Each batch is tokenized, padded to the longest sequence in the batch (capped
 * at `MAX_LENGTH`), and run through the session. For every input, the token
 * vectors whose attention mask is non-zero are averaged and the mean is
 * L2-normalized.
 *
 * @param {string[]} inputs - Texts to embed, prefix already applied.
 * @param {number} [batchSize=BATCH_SIZE] - Inputs per ONNX run.
 * @returns {Promise<number[][]>} One normalized vector per input, in order.
 */
async function embedRawBatch(inputs, batchSize = BATCH_SIZE) {
    if (inputs.length === 0) {
        return [];
    }
    const { session, tokenizer } = await getEngine();
    const needsTokenTypes = session.inputNames.includes('token_type_ids');
    const vectors = [];
    for (let offset = 0; offset < inputs.length; offset += batchSize) {
        const texts = inputs.slice(offset, offset + batchSize);
        const encoded = await Promise.all(texts.map((text) => tokenizer.encode(text)));
        const tokenIds = encoded.map((enc) => enc.getIds());
        const attention = encoded.map((enc) => enc.getAttentionMask());
        const rows = texts.length;
        const longest = Math.max(...tokenIds.map((ids) => ids.length));
        const cols = Math.min(MAX_LENGTH, longest);
        const idBuffer = new BigInt64Array(rows * cols);
        const maskBuffer = new BigInt64Array(rows * cols);
        tokenIds.forEach((ids, row) => {
            const rowMask = attention[row];
            const used = Math.min(ids.length, cols);
            const rowStart = row * cols;
            for (let col = 0; col < cols; col++) {
                // Real tokens are copied; the remainder of the row is padding.
                const isReal = col < used;
                idBuffer[rowStart + col] = isReal ? BigInt(ids[col]) : PAD_ID;
                maskBuffer[rowStart + col] = isReal ? BigInt(rowMask[col]) : 0n;
            }
        });
        const feeds = {
            input_ids: new ort.Tensor('int64', idBuffer, [rows, cols]),
            attention_mask: new ort.Tensor('int64', maskBuffer, [rows, cols]),
        };
        if (needsTokenTypes) {
            feeds.token_type_ids = new ort.Tensor('int64', new BigInt64Array(rows * cols), [rows, cols]);
        }
        const result = await session.run(feeds);
        const hidden = result['last_hidden_state'] ?? result[session.outputNames[0]];
        const values = hidden.data;
        const width = hidden.dims[hidden.dims.length - 1];
        for (let row = 0; row < rows; row++) {
            // Mean pooling over the positions the attention mask keeps.
            const mean = new Array(width).fill(0);
            let kept = 0;
            for (let tok = 0; tok < cols; tok++) {
                const flat = row * cols + tok;
                if (maskBuffer[flat] !== 0n) {
                    kept++;
                    const from = flat * width;
                    for (let d = 0; d < width; d++) {
                        mean[d] += values[from + d];
                    }
                }
            }
            const divisor = Math.max(kept, 1);
            let squared = 0;
            for (let d = 0; d < width; d++) {
                mean[d] /= divisor;
                squared += mean[d] * mean[d];
            }
            // L2 normalization, guarded against a zero-length vector.
            const length = Math.max(Math.sqrt(squared), 1e-12);
            for (let d = 0; d < width; d++) {
                mean[d] /= length;
            }
            vectors.push(mean);
        }
    }
    return vectors;
}
237
/**
 * Embed a search query; the text is prefixed with "query: " before embedding.
 *
 * @param {string} text - Query text.
 * @returns {Promise<number[]>} The embedding vector for the query.
 */
export async function embedQuery(text) {
    const prefixed = `query: ${text}`;
    const vectors = await embedRawBatch([prefixed]);
    return vectors[0];
}
241
/**
 * Embed a document/passage to be indexed; the text is prefixed with
 * "passage: " before embedding.
 *
 * @param {string} text - Passage text.
 * @returns {Promise<number[]>} The embedding vector for the passage.
 */
export async function embedPassage(text) {
    const prefixed = `passage: ${text}`;
    const vectors = await embedRawBatch([prefixed]);
    return vectors[0];
}
245
/**
 * Embed many passages at once; each text is prefixed with "passage: " and the
 * whole list is handed to the batched embedder.
 *
 * @param {string[]} texts - Passage texts.
 * @returns {Promise<number[][]>} One embedding vector per passage, in order.
 */
export function embedPassageBatch(texts) {
    const prefixed = texts.map((passage) => `passage: ${passage}`);
    return embedRawBatch(prefixed);
}
249
+ /** Default embedding = query side (kept for backward compatibility). */
250
+ export const embed = embedQuery;