npm - @nataliapc/mcp-openmsx - Versions diffs - 1.2.9 → 1.2.11 - Mend

@nataliapc/mcp-openmsx 1.2.9 → 1.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

package/README.md +41 -2
package/bin/win-x64/mcp-openmsx-sspi-proxy.exe +0 -0
package/dist/chunker.js +187 -0
package/dist/embedder.js +250 -0
package/dist/openmsx.js +113 -248
package/dist/openmsx_windows.js +316 -0
package/dist/server.js +6 -1
package/dist/server_tools.js +6 -5
package/dist/vectordb.js +94 -35
package/package.json +16 -18
package/resources/audio/chipsfmpacpr1_en.md +209 -0
package/resources/audio/chipsfmpacpr2_en.md +170 -0
package/resources/audio/toc.json +12 -0
package/resources/book--msx-top-secret-3/MTS3-Appendix-English-Upd2.pdf +0 -0
package/resources/book--msx-top-secret-3/MTS3-Complete-English.pdf +0 -0
package/resources/book--msx2-technical-handbook/toc.json +1 -1
package/resources/book--the-msx-red-book/Chapter1_Programmable_Peripheral_Interface.md +112 -0
package/resources/book--the-msx-red-book/Chapter2_Video_Display_Processor.md +308 -0
package/resources/book--the-msx-red-book/Chapter3_Programmable_Sound_Generator.md +168 -0
package/resources/book--the-msx-red-book/Chapter4_ROM_BIOS.md +2528 -0
package/resources/book--the-msx-red-book/Chapter5_ROM_BASIC_Interpreter.md +3975 -0
package/resources/book--the-msx-red-book/Chapter6_Memory_Map.md +1963 -0
package/resources/book--the-msx-red-book/Chapter7_Machine_Code_Programs.md +1238 -0
package/resources/book--the-msx-red-book/Introduction.md +104 -0
package/resources/book--the-msx-red-book/toc.json +38 -3
package/resources/processors/toc.json +3 -3
package/resources/processors/z80-undocumented.md +141 -0
package/resources/sdcc/1_Introduction.md +199 -0
package/resources/sdcc/2_Installing_SDCC.md +533 -0
package/resources/sdcc/3_Using_SDCC.md +1758 -0
package/resources/sdcc/4_Notes_on_supported_Processors.md +1638 -0
package/resources/sdcc/5_Debugging.md +210 -0
package/resources/sdcc/6_Tips_and_Support.md +258 -0
package/resources/sdcc/7_SDCC_Technical_Data.md +489 -0
package/resources/sdcc/8_Compiler_internals.md +477 -0
package/resources/sdcc/toc.json +44 -2
package/vector-db/msxdocs.lance/_indices/4d3bd360-e3c6-408d-b0ff-a4d6bd9580cb/metadata.lance +0 -0
package/vector-db/msxdocs.lance/_indices/4d3bd360-e3c6-408d-b0ff-a4d6bd9580cb/part_0_docs.lance +0 -0
package/vector-db/msxdocs.lance/_indices/4d3bd360-e3c6-408d-b0ff-a4d6bd9580cb/part_0_invert.lance +0 -0
package/vector-db/msxdocs.lance/_indices/4d3bd360-e3c6-408d-b0ff-a4d6bd9580cb/part_0_tokens.lance +0 -0
package/vector-db/msxdocs.lance/_transactions/0-6f47c9fc-3657-40f0-9dd4-c7226b2a4805.txn +0 -0
package/vector-db/msxdocs.lance/_transactions/1-2bb7426e-a4b0-40ea-9a58-00c4985fc6a9.txn +0 -0
package/vector-db/msxdocs.lance/_versions/18446744073709551613.manifest +0 -0
package/vector-db/msxdocs.lance/_versions/18446744073709551614.manifest +0 -0
package/vector-db/msxdocs.lance/_versions/latest_version_hint.json +1 -0
package/vector-db/msxdocs.lance/data/110001110001011010001000876c134b8296fbc47762d1e1ab.lance +0 -0
package/resources/book--the-msx-red-book/the_msx_red_book.md +0 -10349
package/resources/processors/z80-undocumented.tex +0 -5617
package/resources/sdcc/lyx2md.py +0 -745
package/resources/sdcc/sdccman.lyx +0 -81574
package/resources/sdcc/sdccman.md +0 -5557
package/vector-db/index.json +0 -1

package/README.md CHANGED Viewed

@@ -46,7 +46,7 @@ This project creates a bridge between modern AI-assisted development (e.g. GitHu
 - **Video Control**: VDP register manipulation and screen capture.
 - **Memory Operations**: Read/write RAM, VRAM, and I/O port access.
 - **Automation**: Keyboard input simulation and savestate management.
-- **Vector DB Integration**: Query an embedded vector database with MSX resources for development support.
+- **Hybrid Documentation Search**: Query an embedded local index of MSX resources that combines semantic (multilingual embeddings) and keyword (BM25) search and runs fully offline.
 - **Hybrid Mode**: This MCP server supports hybrid access mode (_STDIO_ and _HTTP_ transports).
 ## Architecture
@@ -117,7 +117,7 @@ The MCP server translates high-level natural language commands from your Copilot
 - `msxdocs_resource_get`: Retrieve MCP resources for MCP clients that don't support MCP resources.
 ### Documentation Tools
-- `vector_db_query`: Query the Vector DB resources to obtain information about MSX systems, cartridges, and other development resources.
+- `vector_db_query`: Hybrid search (semantic embeddings + BM25) over the local MSX documentation index, for information about MSX systems, cartridges, programming, and other development resources.
 - `msxdocs_resource_get`: Retrieve MCP resources for MCP clients that don't support MCP resources.
 ## Available MCP Resources
@@ -194,6 +194,9 @@ Steps to install the MCP server in VSCode:
 }
 ```
+> [!NOTE]
+> On Windows, you can change the `command` field to `npx.cmd` if you experience permission issues.
 > [!NOTE]
 > Environment variables are optional. Customize them as you need.
@@ -249,6 +252,42 @@ Edit it to include the following JSON entry:
 | `MCP_TRANSPORT` | Transport mode (`stdio` or `http`) | `stdio` | `http` |
 | `MCP_HTTP_PORT` | Port number for HTTP transport mode | `3000` | `8080` |
 | `MCP_ALLOWED_ORIGINS` | Comma-separated list of allowed origins for HTTP transport | Empty for all allowed | `http://localhost,http://mydomain.com` |
+| `OPENMSX_WINDOWS_CONTROL` | **Windows only.** How the server talks to openMSX's control socket (see below) | `stdio-proxy` | `direct-sspi` |
+| `OPENMSX_WINDOWS_PROXY_EXECUTABLE` | **Windows only.** Override path to the SSPI proxy helper (development) | Bundled `bin/win-x64/mcp-openmsx-sspi-proxy.exe` | `C:\path\to\mcp-openmsx-sspi-proxy.exe` |
+| `OPENMSX_MODELS_CACHE` | Directory where the embedding model is cached (also honors `HF_HOME` / `TRANSFORMERS_CACHE`) | `~/.cache/mcp-openmsx` | `/opt/models` |
+| `OPENMSX_EMBED_PROVIDER` | **Index generator only.** `cuda` uses the GPU (fp32 model) to regenerate the index, falling back to CPU if CUDA is unavailable. The MCP server itself always uses CPU/int8 and ignores this variable. | (generator: `cpu`) | `cuda` |
+#### Documentation search model
+The `vector_db_query` tool runs a local hybrid search (semantic embeddings + BM25). The embedding model
+(`multilingual-e5-small`, ONNX quantized, ~118 MB, 512-token context, multilingual) is **downloaded once**
+from the HuggingFace Hub on the first query and cached on disk (see `OPENMSX_MODELS_CACHE` above). After that
+it runs fully offline. No API key is required. To pre-populate the cache for air-gapped environments, run one
+query on a networked machine and copy the cache directory.
+Regenerating the index (rare) embeds the whole corpus. On CPU this is slow; on an NVIDIA GPU set
+`OPENMSX_EMBED_PROVIDER=cuda` for the generator to use it (requires CUDA 13 runtime libraries + cuDNN 9),
+which is ~50× faster. The GPU path uses the larger fp32 model; the server keeps using the int8 model and
+the two are interchangeable for search (same ranking). End users never download the fp32 model.
+#### Windows control modes (`OPENMSX_WINDOWS_CONTROL`)
+On Windows, openMSX is a GUI app whose TCP control socket requires SSPI (Negotiate/NTLM) authentication. The server supports several transports:
+| Value | Description |
+|-------|-------------|
+| `stdio-proxy` | **Default.** Launches a self-contained .NET helper that performs SSPI and exposes a clean XML stdio channel — the most robust path. |
+| `direct-sspi` | Authenticates from Node via the optional `node-expose-sspi` package. Fallback / debugging. |
+| `socket` | Legacy alias of `direct-sspi`. |
+Linux and macOS are unaffected (`openmsx -control stdio`).
+The bundled proxy is built from `helpers/openmsx-sspi-proxy` and can be rebuilt reproducibly from Linux with Docker (no local .NET required):
+```bash
+cd mcp-server
+pnpm build:proxy:win-x64:docker   # → bin/win-x64/mcp-openmsx-sspi-proxy.exe
+```
 ## Advanced Manual Usage

package/bin/win-x64/mcp-openmsx-sspi-proxy.exe ADDED Viewed

Binary file

package/dist/chunker.js ADDED Viewed

@@ -0,0 +1,187 @@
+/**
+ * Local text chunkers (no external API).
+ *
+ * Two strategies:
+ *  - `chunkText`: deterministic, markdown-aware, fixed-size with overlap.
+ *    Used as a fallback and to hard-split oversized units.
+ *  - `semanticChunk`: groups consecutive sentences by embedding similarity
+ *    (cosine), so each chunk stays topically coherent, up to a size bound.
+ *    Requires an embedding function (injected) — the model runs locally.
+ *
+ * Sizing targets the embedding model's context window. With multilingual-e5
+ * (max 512 tokens) we aim for ~1600 characters (~400 tokens), leaving room for
+ * the "passage: " prefix.
+ *
+ * @author Natalia Pujol Cremades (@nataliapc)
+ * @license GPL2
+ */
+export const DEFAULT_MAX_CHARS = 1600; // chunk-size limit (characters) used by chunkText when opts.maxChars is null/undefined
+export const DEFAULT_OVERLAP = 100; // overlap (characters) used by chunkText when opts.overlap is null/undefined
+/** Hard-split a single oversized block into overlapping windows. */
+function splitLong(s, maxChars, overlap) {
+    const out = [];
+    const step = Math.max(1, maxChars - overlap);
+    let start = 0;
+    while (start < s.length) {
+        const end = Math.min(start + maxChars, s.length);
+        const piece = s.slice(start, end).trim();
+        if (piece) {
+            out.push(piece);
+        }
+        if (end >= s.length) {
+            break;
+        }
+        start += step;
+    }
+    return out;
+}
+/**
+ * Split `text` into overlapping, markdown-aware fixed-size chunks.
+ * Returns [] for empty/whitespace input.
+ */
+export function chunkText(text, opts = {}) {
+    const maxChars = opts.maxChars ?? DEFAULT_MAX_CHARS;
+    const overlap = opts.overlap ?? DEFAULT_OVERLAP;
+    const clean = text.replace(/\r\n/g, '\n').trim();
+    if (!clean) {
+        return [];
+    }
+    if (clean.length <= maxChars) {
+        return [clean];
+    }
+    const blocks = clean
+        .split(/\n{2,}/)
+        .map((b) => b.trim())
+        .filter(Boolean);
+    const chunks = [];
+    let buf = '';
+    const flush = () => {
+        if (buf.trim()) {
+            chunks.push(buf.trim());
+        }
+    };
+    for (const block of blocks) {
+        if (block.length > maxChars) {
+            flush();
+            buf = '';
+            chunks.push(...splitLong(block, maxChars, overlap));
+            continue;
+        }
+        if (buf && buf.length + block.length + 1 > maxChars) {
+            flush();
+            const tail = buf.slice(-overlap);
+            buf = `${tail}\n${block}`;
+        }
+        else {
+            buf = buf ? `${buf}\n${block}` : block;
+        }
+    }
+    flush();
+    return chunks;
+}
+export const SEMANTIC_DEFAULTS = { // fallback options for semanticChunk when the matching opts field is null/undefined
+    maxChars: 1800, // a unit joins the current group only if the joined text stays within this many characters
+    minChars: 250, // chunks shorter than this are merged into the previous chunk when the result still fits maxChars
+    similarityThreshold: 0.90, // minimum dot product with the group centroid for a unit to join the group
+};
+/** Split text into sentence-ish units (sentence punctuation or line breaks). */
+export function splitSentences(text) {
+    return text
+        .replace(/\r\n/g, '\n')
+        .split(/(?<=[.!?:;])\s+|\n+/)
+        .map((s) => s.trim())
+        .filter(Boolean);
+}
+/** Dot product of two equal-length, L2-normalized vectors (= cosine). */
+function dot(a, b) {
+    let s = 0;
+    for (let i = 0; i < a.length; i++) {
+        s += a[i] * b[i];
+    }
+    return s;
+}
+/**
+ * Group consecutive sentences by embedding similarity into coherent chunks.
+ * All sentences are embedded in one batched call (`embedFn`); a running
+ * (re-normalized) centroid represents the current group. A sentence starts a
+ * new chunk when it is too dissimilar from the centroid or would overflow
+ * `maxChars`.
+ */
+export async function semanticChunk(text, embedFn, opts = {}) {
+    const maxChars = opts.maxChars ?? SEMANTIC_DEFAULTS.maxChars;
+    const minChars = opts.minChars ?? SEMANTIC_DEFAULTS.minChars;
+    const threshold = opts.similarityThreshold ?? SEMANTIC_DEFAULTS.similarityThreshold;
+    const clean = text.replace(/\r\n/g, '\n').trim();
+    if (!clean) {
+        return [];
+    }
+    // Units = paragraphs (split on blank lines). Oversized paragraphs are split
+    // into sentences, and oversized sentences hard-split. Paragraph granularity
+    // keeps the embedding count tractable on CPU while staying semantically
+    // meaningful (a paragraph is a natural topical unit).
+    const units = [];
+    for (const para of clean.split(/\n{2,}/).map((p) => p.trim()).filter(Boolean)) {
+        if (para.length <= maxChars) {
+            units.push(para);
+            continue;
+        }
+        for (const s of splitSentences(para)) {
+            if (s.length > maxChars) {
+                units.push(...splitLong(s, maxChars, 0));
+            }
+            else {
+                units.push(s);
+            }
+        }
+    }
+    if (units.length === 0) {
+        return [];
+    }
+    if (units.length === 1) {
+        return [units[0]];
+    }
+    const embeddings = await embedFn(units);
+    const dim = embeddings[0].length;
+    const chunks = [];
+    let groupText = units[0];
+    let sum = embeddings[0].slice(); // running sum of member vectors
+    let centroid = embeddings[0]; // normalized centroid
+    const renormalize = (v) => {
+        let n = 0;
+        for (let i = 0; i < dim; i++) {
+            n += v[i] * v[i];
+        }
+        n = Math.max(Math.sqrt(n), 1e-12);
+        return v.map((x) => x / n);
+    };
+    for (let i = 1; i < units.length; i++) {
+        const sim = dot(centroid, embeddings[i]);
+        const wouldOverflow = groupText.length + 1 + units[i].length > maxChars;
+        if (sim >= threshold && !wouldOverflow) {
+            groupText += `\n${units[i]}`;
+            for (let d = 0; d < dim; d++) {
+                sum[d] += embeddings[i][d];
+            }
+            centroid = renormalize(sum);
+        }
+        else {
+            chunks.push(groupText);
+            groupText = units[i];
+            sum = embeddings[i].slice();
+            centroid = embeddings[i];
+        }
+    }
+    chunks.push(groupText);
+    // Merge tiny trailing fragments into the previous chunk when they fit.
+    const merged = [];
+    for (const c of chunks) {
+        const prev = merged[merged.length - 1];
+        if (prev && c.length < minChars && prev.length + 1 + c.length <= maxChars) {
+            merged[merged.length - 1] = `${prev}\n${c}`;
+        }
+        else {
+            merged.push(c);
+        }
+    }
+    return merged;
+}

package/dist/embedder.js ADDED Viewed

@@ -0,0 +1,250 @@
+/**
+ * Local text embedding engine.
+ *
+ * Uses onnxruntime-node + @anush008/tokenizers (both prebuilt napi, no `sharp`)
+ * to run the multilingual model `multilingual-e5-small` fully offline.
+ *
+ * The ONNX weights + tokenizer are downloaded on first use from the
+ * HuggingFace Hub and cached on disk. The same module is the single source of
+ * truth for embeddings in both generation (vector-db) and query (server).
+ *
+ * Model notes (e5):
+ *  - MEAN pooling over masked token embeddings + L2 normalization. Do NOT
+ *    switch to CLS pooling — it would silently degrade retrieval quality.
+ *  - e5 is trained with asymmetric prefixes: queries must be prefixed with
+ *    "query: " and documents/passages with "passage: ". Use `embedQuery` for
+ *    search input and `embedPassage` for indexed text. The prefix is only used
+ *    to compute the vector; it is never stored.
+ *  - max_seq_length = 512 (enough for ~400-token semantic chunks).
+ *
+ * @author Natalia Pujol Cremades (@nataliapc)
+ * @license GPL2
+ */
+import * as ort from 'onnxruntime-node';
+import { Tokenizer } from '@anush008/tokenizers';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+const MODEL_REPO = 'Xenova/multilingual-e5-small'; // HuggingFace Hub repository id; also used to name the on-disk cache directory
+// int8 ONNX is fastest on CPU; the GPU uses the fp32 ONNX (CUDA has no
+// efficient kernels for dynamic-quantized ops, so the int8 model falls back to
+// CPU even under the CUDA provider). fp32 (not fp16) is used on GPU so the
+// output stays Float32Array — the fp16 model emits float16 output that the
+// pooling code cannot read directly. The file is chosen per provider.
+const ONNX_FILE_CPU = 'onnx/model_quantized.onnx'; // repo-relative path of the int8 model (always fetched first by getEngine)
+const ONNX_FILE_GPU = 'onnx/model.onnx'; // repo-relative path of the fp32 model (fetched only after the CUDA probe succeeds)
+const TOKENIZER_FILE = 'tokenizer.json'; // repo-relative path of the tokenizer definition
+const HF_BASE = `https://huggingface.co/${MODEL_REPO}/resolve/main`; // URL prefix that downloadFile joins with a repo-relative path
+/** Embedding dimensionality of the model. */
+export const EMBEDDING_DIM = 384;
+/** Model max sequence length (e5 max_seq_length). */
+const MAX_LENGTH = 512; // passed to tokenizer.setTruncation and used as the cap on the padded batch width
+let enginePromise = null; // memoized promise of { session, tokenizer }; null before first use and again after a failed initialization
+// Server-safe default: the MCP server NEVER downloads or runs the large fp32
+// model. Only an explicit setEmbedProvider('cuda') — used by the offline index
+// generator — can opt into the GPU. The server never calls it, so it stays int8.
+let requestedProvider = 'cpu'; // changed only through setEmbedProvider
+/**
+ * Select the embedding execution provider. Must be called before the first
+ * embedding. Only the index generator should request 'cuda'; the MCP server
+ * leaves the default ('cpu' / int8) so end users only ever download the 118 MB
+ * quantized model.
+ */
+export function setEmbedProvider(provider) {
+    if (enginePromise) {
+        throw new Error('setEmbedProvider must be called before the first embedding');
+    }
+    requestedProvider = provider;
+}
+/** Resolve the on-disk cache directory for the model files. */
+function getCacheDir() {
+    const base = process.env.OPENMSX_MODELS_CACHE ||
+        process.env.HF_HOME ||
+        process.env.TRANSFORMERS_CACHE ||
+        path.join(os.homedir(), '.cache', 'mcp-openmsx');
+    return path.join(base, 'models', MODEL_REPO.replace('/', '__'));
+}
+/** Download a single file from the HF Hub to dest if not already present. */
+async function downloadFile(remote, dest) {
+    if (fs.existsSync(dest) && fs.statSync(dest).size > 0) {
+        return;
+    }
+    await fs.promises.mkdir(path.dirname(dest), { recursive: true });
+    const url = `${HF_BASE}/${remote}`;
+    const res = await fetch(url);
+    if (!res.ok || !res.body) {
+        throw new Error(`Failed to download model file ${url}: ${res.status} ${res.statusText}`);
+    }
+    const buffer = Buffer.from(await res.arrayBuffer());
+    // Write atomically: tmp file + rename, so a crash mid-download cannot leave
+    // a truncated file that later looks "present".
+    const tmp = `${dest}.download`;
+    await fs.promises.writeFile(tmp, buffer);
+    await fs.promises.rename(tmp, dest);
+}
+/** Download a specific ONNX file + tokenizer if missing; returns the onnx path. */
+async function ensureFiles(onnxFile) {
+    const dir = getCacheDir();
+    const onnxPath = path.join(dir, onnxFile);
+    const tokenizerPath = path.join(dir, TOKENIZER_FILE);
+    await Promise.all([
+        downloadFile(onnxFile, onnxPath),
+        downloadFile(TOKENIZER_FILE, tokenizerPath),
+    ]);
+    return { onnxPath, tokenizerPath };
+}
+const baseSessionOptions = { // options passed to every ort.InferenceSession.create call in this module (spread and extended for CUDA sessions)
+    graphOptimizationLevel: 'all',
+    intraOpNumThreads: Math.max(1, os.cpus().length), // one thread per CPU reported by os.cpus(), never fewer than 1
+    interOpNumThreads: 1,
+    executionMode: 'sequential',
+};
+/** Probe whether the CUDA provider can actually be created, using the small
+ *  int8 model already on disk (avoids downloading the 470 MB fp32 model just to
+ *  find out CUDA is unavailable). Returns true only if a CUDA session loads. */
+async function cudaAvailable(probeOnnxPath) {
+    try {
+        const probe = await ort.InferenceSession.create(probeOnnxPath, {
+            ...baseSessionOptions,
+            executionProviders: ['cuda'],
+        });
+        await probe.release?.();
+        return true;
+    }
+    catch {
+        return false;
+    }
+}
+/**
+ * Lazily initialize the ONNX session + tokenizer (singleton).
+ *
+ * The int8 model is always fetched first: it is the server default, the
+ * fallback, and the cheap CUDA probe. The large fp32 model is downloaded ONLY
+ * when 'cuda' was explicitly requested AND CUDA is confirmed available — so the
+ * server (which never requests 'cuda') can never pull the fp32 model.
+ */
+function getEngine() {
+    if (!enginePromise) {
+        enginePromise = (async () => {
+            const { onnxPath: int8Path, tokenizerPath } = await ensureFiles(ONNX_FILE_CPU);
+            const tokenizer = Tokenizer.fromFile(tokenizerPath);
+            tokenizer.setTruncation(MAX_LENGTH);
+            if (requestedProvider === 'cuda') {
+                if (await cudaAvailable(int8Path)) {
+                    // CUDA confirmed → only now download + load the fp32 model.
+                    const { onnxPath: fp32Path } = await ensureFiles(ONNX_FILE_GPU);
+                    const session = await ort.InferenceSession.create(fp32Path, {
+                        ...baseSessionOptions,
+                        executionProviders: ['cuda'],
+                    });
+                    process.stderr.write('[embedder] using CUDA execution provider (fp32)\n');
+                    return { session, tokenizer };
+                }
+                process.stderr.write('[embedder] CUDA requested but unavailable; using CPU (int8)\n');
+            }
+            const session = await ort.InferenceSession.create(int8Path, baseSessionOptions);
+            return { session, tokenizer };
+        })().catch((err) => {
+            // Reset so a transient failure (e.g. network) can be retried.
+            enginePromise = null;
+            throw err;
+        });
+    }
+    return enginePromise;
+}
+/**
+ * Default batch size for batched inference.
+ * Used by `embedRawBatch` when the caller does not pass `batchSize`.
+ */
+const BATCH_SIZE = 32;
+// XLM-RoBERTa / e5 pad token id. Padded positions get attention_mask 0, so the
+// exact id is irrelevant to the pooled result; it only fills the tensor.
+const PAD_ID = 1n; // BigInt because it is stored in a BigInt64Array backing an 'int64' tensor
+/**
+ * Embed a list of already-prefixed inputs in batches (one ONNX run per batch,
+ * dynamic padding to the longest sequence in the batch). Returns one
+ * 384-dimension, L2-normalized vector per input (mean pooling over masked
+ * tokens). Batching is essential for throughput when embedding many sentences.
+ *
+ * @param {string[]} inputs - Texts that already carry their "query: " /
+ *   "passage: " prefix; an empty list returns `[]` without loading the engine.
+ * @param {number} [batchSize=BATCH_SIZE] - Number of inputs per ONNX run.
+ * @returns {Promise<number[][]>} One plain-array vector per input, in input
+ *   order. The vector width is taken from the last axis of the model output.
+ */
+async function embedRawBatch(inputs, batchSize = BATCH_SIZE) {
+    if (inputs.length === 0) {
+        return [];
+    }
+    const { session, tokenizer } = await getEngine();
+    const hasTokenTypes = session.inputNames.includes('token_type_ids'); // feed token_type_ids only if the loaded model declares that input
+    const results = [];
+    for (let start = 0; start < inputs.length; start += batchSize) {
+        const batch = inputs.slice(start, start + batchSize);
+        const encodings = await Promise.all(batch.map((t) => tokenizer.encode(t)));
+        const idsArr = encodings.map((e) => e.getIds());
+        const maskArr = encodings.map((e) => e.getAttentionMask());
+        const B = batch.length;
+        const maxLen = Math.min(MAX_LENGTH, Math.max(...idsArr.map((a) => a.length))); // pad to the longest sequence of this batch, capped at MAX_LENGTH
+        const flatIds = new BigInt64Array(B * maxLen);
+        const flatMask = new BigInt64Array(B * maxLen);
+        for (let r = 0; r < B; r++) {
+            const ids = idsArr[r];
+            const mask = maskArr[r];
+            const len = Math.min(ids.length, maxLen); // tokens beyond maxLen are not copied
+            const rowBase = r * maxLen; // offset of row r in the flat, row-major [B, maxLen] buffers
+            for (let c = 0; c < len; c++) {
+                flatIds[rowBase + c] = BigInt(ids[c]);
+                flatMask[rowBase + c] = BigInt(mask[c]);
+            }
+            for (let c = len; c < maxLen; c++) {
+                flatIds[rowBase + c] = PAD_ID;
+                flatMask[rowBase + c] = 0n;
+            }
+        }
+        const feeds = {
+            input_ids: new ort.Tensor('int64', flatIds, [B, maxLen]),
+            attention_mask: new ort.Tensor('int64', flatMask, [B, maxLen]),
+        };
+        if (hasTokenTypes) {
+            feeds.token_type_ids = new ort.Tensor('int64', new BigInt64Array(B * maxLen), [B, maxLen]); // all zeros: a new BigInt64Array is zero-filled
+        }
+        const output = await session.run(feeds);
+        const hidden = output['last_hidden_state'] ?? output[session.outputNames[0]]; // prefer the named output, else the first declared output
+        const data = hidden.data;
+        const dim = hidden.dims[hidden.dims.length - 1]; // size of the last axis = width of each token vector
+        for (let r = 0; r < B; r++) {
+            const pooled = new Array(dim).fill(0);
+            let count = 0;
+            for (let t = 0; t < maxLen; t++) {
+                if (flatMask[r * maxLen + t] === 0n) {
+                    continue;
+                }
+                count++;
+                const base = (r * maxLen + t) * dim; // NOTE(review): assumes the output is row-major [B, maxLen, dim] — confirm against the model's output shape
+                for (let d = 0; d < dim; d++) {
+                    pooled[d] += data[base + d];
+                }
+            }
+            const denom = Math.max(count, 1); // avoid dividing by zero when every position of the row is masked
+            let norm = 0;
+            for (let d = 0; d < dim; d++) {
+                pooled[d] /= denom;
+                norm += pooled[d] * pooled[d];
+            }
+            norm = Math.max(Math.sqrt(norm), 1e-12); // floor the norm so an all-zero vector does not yield NaN
+            for (let d = 0; d < dim; d++) {
+                pooled[d] /= norm;
+            }
+            results.push(pooled);
+        }
+    }
+    return results;
+}
+/** Embed a search query (e5 "query: " prefix). */
+export async function embedQuery(text) {
+    return (await embedRawBatch([`query: ${text}`]))[0];
+}
+/** Embed a document/passage to be indexed (e5 "passage: " prefix). */
+export async function embedPassage(text) {
+    return (await embedRawBatch([`passage: ${text}`]))[0];
+}
+/** Batch-embed passages (e5 "passage: " prefix). One ONNX run per batch. */
+export function embedPassageBatch(texts) {
+    return embedRawBatch(texts.map((t) => `passage: ${t}`));
+}
+/** Default embedding = query side (kept for backward compatibility). */
+export const embed = embedQuery; // alias: `embed` is the very same function as `embedQuery`