@nataliapc/mcp-openmsx 1.2.10 → 1.2.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/README.md +20 -2
  2. package/dist/chunker.js +187 -0
  3. package/dist/embedder.js +250 -0
  4. package/dist/server.js +6 -1
  5. package/dist/server_tools.js +6 -5
  6. package/dist/vectordb.js +94 -35
  7. package/package.json +4 -8
  8. package/resources/audio/chipsfmpacpr1_en.md +209 -0
  9. package/resources/audio/chipsfmpacpr2_en.md +170 -0
  10. package/resources/audio/toc.json +12 -0
  11. package/resources/book--msx-top-secret-3/MTS3-Appendix-English-Upd2.pdf +0 -0
  12. package/resources/book--msx-top-secret-3/MTS3-Complete-English.pdf +0 -0
  13. package/resources/book--msx-top-secret-3/mts3-appendix-english-upd2.md +25863 -0
  14. package/resources/book--msx-top-secret-3/mts3-complete-english.md +44895 -0
  15. package/resources/book--msx2-technical-handbook/toc.json +1 -1
  16. package/resources/book--the-msx-red-book/Chapter1_Programmable_Peripheral_Interface.md +112 -0
  17. package/resources/book--the-msx-red-book/Chapter2_Video_Display_Processor.md +308 -0
  18. package/resources/book--the-msx-red-book/Chapter3_Programmable_Sound_Generator.md +168 -0
  19. package/resources/book--the-msx-red-book/Chapter4_ROM_BIOS.md +2528 -0
  20. package/resources/book--the-msx-red-book/Chapter5_ROM_BASIC_Interpreter.md +3975 -0
  21. package/resources/book--the-msx-red-book/Chapter6_Memory_Map.md +1963 -0
  22. package/resources/book--the-msx-red-book/Chapter7_Machine_Code_Programs.md +1238 -0
  23. package/resources/book--the-msx-red-book/Introduction.md +104 -0
  24. package/resources/book--the-msx-red-book/toc.json +38 -3
  25. package/resources/processors/toc.json +3 -3
  26. package/resources/processors/z80-undocumented.md +141 -0
  27. package/resources/programming/asm_develop_a_program_in_cartridge_rom.md +1881 -0
  28. package/resources/programming/toc.json +6 -0
  29. package/resources/sdcc/1_Introduction.md +199 -0
  30. package/resources/sdcc/2_Installing_SDCC.md +533 -0
  31. package/resources/sdcc/3_Using_SDCC.md +1758 -0
  32. package/resources/sdcc/4_Notes_on_supported_Processors.md +1638 -0
  33. package/resources/sdcc/5_Debugging.md +210 -0
  34. package/resources/sdcc/6_Tips_and_Support.md +258 -0
  35. package/resources/sdcc/7_SDCC_Technical_Data.md +489 -0
  36. package/resources/sdcc/8_Compiler_internals.md +477 -0
  37. package/resources/sdcc/toc.json +44 -2
  38. package/resources/system/how_to_detect_ram.md +14 -0
  39. package/resources/system/mrc_wiki_megarom_mappers.md +533 -0
  40. package/resources/system/the_memory.md +118 -0
  41. package/resources/system/toc.json +18 -0
  42. package/vector-db/__manifest/_transactions/0-675ee228-bffb-4636-80e5-cdfde25cc4fe.txn +2 -0
  43. package/vector-db/__manifest/_versions/18446744073709551614.manifest +0 -0
  44. package/vector-db/__manifest/_versions/latest_version_hint.json +1 -0
  45. package/vector-db/msxdocs.lance/_indices/37194b01-2a25-40d1-ac38-7fbe254df5ea/metadata.lance +0 -0
  46. package/vector-db/msxdocs.lance/_indices/37194b01-2a25-40d1-ac38-7fbe254df5ea/part_2_docs.lance +0 -0
  47. package/vector-db/msxdocs.lance/_indices/37194b01-2a25-40d1-ac38-7fbe254df5ea/part_2_invert.lance +0 -0
  48. package/vector-db/msxdocs.lance/_indices/37194b01-2a25-40d1-ac38-7fbe254df5ea/part_2_tokens.lance +0 -0
  49. package/vector-db/msxdocs.lance/_transactions/0-dd155672-40e6-4c6a-942f-7fcbe8c3dbd0.txn +0 -0
  50. package/vector-db/msxdocs.lance/_transactions/1-e7230cbd-ce8e-465c-9b85-b91443862427.txn +0 -0
  51. package/vector-db/msxdocs.lance/_versions/18446744073709551613.manifest +0 -0
  52. package/vector-db/msxdocs.lance/_versions/18446744073709551614.manifest +0 -0
  53. package/vector-db/msxdocs.lance/_versions/latest_version_hint.json +1 -0
  54. package/vector-db/msxdocs.lance/data/000100110110001011110001fc578141d296825d0bea11c95d.lance +0 -0
  55. package/resources/book--the-msx-red-book/the_msx_red_book.md +0 -10349
  56. package/resources/processors/z80-undocumented.tex +0 -5617
  57. package/resources/sdcc/lyx2md.py +0 -745
  58. package/resources/sdcc/sdccman.lyx +0 -81574
  59. package/resources/sdcc/sdccman.md +0 -5557
  60. package/vector-db/index.json +0 -1
package/README.md CHANGED
@@ -46,7 +46,7 @@ This project creates a bridge between modern AI-assisted development (e.g. GitHu
46
46
  - **Video Control**: VDP register manipulation and screen capture.
47
47
  - **Memory Operations**: Read/write RAM, VRAM, and I/O port access.
48
48
  - **Automation**: Keyboard input simulation and savestate management.
49
- - **Vector DB Integration**: Query an embedded vector database with MSX resources for development support.
49
+ - **Hybrid Documentation Search**: Query an embedded local index of MSX resources that combines semantic (multilingual embeddings) and keyword (BM25) search; after a one-time model download it runs fully offline.
50
50
  - **Hybrid Mode**: This MCP server supports hybrid access mode (_STDIO_ and _HTTP_ transports).
51
51
 
52
52
  ## Architecture
@@ -117,7 +117,7 @@ The MCP server translates high-level natural language commands from your Copilot
117
117
  - `msxdocs_resource_get`: Retrieve MCP resources for MCP clients that don't support MCP resources.
118
118
 
119
119
  ### Documentation Tools
120
- - `vector_db_query`: Query the Vector DB resources to obtain information about MSX systems, cartridges, and other development resources.
120
+ - `vector_db_query`: Hybrid search (semantic embeddings + BM25) over the local MSX documentation index, for information about MSX systems, cartridges, programming, and other development resources.
121
121
  - `msxdocs_resource_get`: Retrieve MCP resources for MCP clients that don't support MCP resources.
122
122
 
123
123
  ## Available MCP Resources
@@ -194,6 +194,9 @@ Steps to install the MCP server in VSCode:
194
194
  }
195
195
  ```
196
196
 
197
+ > [!NOTE]
198
+ > On Windows, you can change the `command` field to `npx.cmd` if you experience permission issues.
199
+
197
200
  > [!NOTE]
198
201
  > Environment variables are optional. Customize them as you need.
199
202
 
@@ -251,6 +254,21 @@ Edit it to include the following JSON entry:
251
254
  | `MCP_ALLOWED_ORIGINS` | Comma-separated list of allowed origins for HTTP transport | Empty for all allowed | `http://localhost,http://mydomain.com` |
252
255
  | `OPENMSX_WINDOWS_CONTROL` | **Windows only.** How the server talks to openMSX's control socket (see below) | `stdio-proxy` | `direct-sspi` |
253
256
  | `OPENMSX_WINDOWS_PROXY_EXECUTABLE` | **Windows only.** Override path to the SSPI proxy helper (development) | Bundled `bin/win-x64/mcp-openmsx-sspi-proxy.exe` | `C:\path\to\mcp-openmsx-sspi-proxy.exe` |
257
+ | `OPENMSX_MODELS_CACHE` | Directory where the embedding model is cached (also honors `HF_HOME` / `TRANSFORMERS_CACHE`) | `~/.cache/mcp-openmsx` | `/opt/models` |
258
+ | `OPENMSX_EMBED_PROVIDER` | **Index generator only.** `cuda` uses the GPU (fp32 model) to regenerate the index, falling back to CPU if CUDA is unavailable. The MCP server itself always uses CPU/int8 and ignores this variable. | (generator: `cpu`) | `cuda` |
259
+
260
+ #### Documentation search model
261
+
262
+ The `vector_db_query` tool runs a local hybrid search (semantic embeddings + BM25). The embedding model
263
+ (`multilingual-e5-small`, ONNX quantized, ~118 MB, 512-token context, multilingual) is **downloaded once**
264
+ from the HuggingFace Hub on the first query and cached on disk (see `OPENMSX_MODELS_CACHE` above). After that
265
+ it runs fully offline. No API key is required. To pre-populate the cache for air-gapped environments, run one
266
+ query on a networked machine and copy the cache directory.
267
+
268
+ Regenerating the index (rare) embeds the whole corpus, which is slow on CPU. On an NVIDIA GPU, set
269
+ `OPENMSX_EMBED_PROVIDER=cuda` so the generator uses the GPU (requires CUDA 13 runtime libraries + cuDNN 9);
270
+ this is ~50× faster. The GPU path uses the larger fp32 model; the server keeps using the int8 model and
271
+ the two are interchangeable for search (same ranking). End users never download the fp32 model.
254
272
 
255
273
  #### Windows control modes (`OPENMSX_WINDOWS_CONTROL`)
256
274
 
@@ -0,0 +1,187 @@
1
+ /**
2
+ * Local text chunkers (no external API).
3
+ *
4
+ * Two strategies:
5
+ * - `chunkText`: deterministic, markdown-aware, fixed-size with overlap.
6
+ * Used as a fallback and to hard-split oversized units.
7
+ * - `semanticChunk`: groups consecutive sentences by embedding similarity
8
+ * (cosine), so each chunk stays topically coherent, up to a size bound.
9
+ * Requires an embedding function (injected) — the model runs locally.
10
+ *
11
+ * Sizing targets the embedding model's context window. With multilingual-e5
12
+ * (max 512 tokens) we aim for ~1600 characters (~400 tokens), leaving room for
13
+ * the "passage: " prefix.
14
+ *
15
+ * @author Natalia Pujol Cremades (@nataliapc)
16
+ * @license GPL2
17
+ */
18
+ export const DEFAULT_MAX_CHARS = 1600;
19
+ export const DEFAULT_OVERLAP = 100;
20
/**
 * Hard-split one oversized string into fixed-size windows.
 *
 * Each window spans at most `maxChars` characters and starts
 * `maxChars - overlap` characters after the previous one (clamped to at least
 * 1 so the scan always advances). Windows are trimmed and empty ones dropped.
 *
 * @param {string} s - Text to split.
 * @param {number} maxChars - Maximum window length, in characters.
 * @param {number} overlap - Characters shared by two consecutive windows.
 * @returns {string[]} Trimmed, non-empty windows in source order.
 */
function splitLong(s, maxChars, overlap) {
    const stride = Math.max(1, maxChars - overlap);
    const windows = [];
    for (let from = 0; from < s.length; from += stride) {
        const to = Math.min(from + maxChars, s.length);
        const text = s.slice(from, to).trim();
        if (text) {
            windows.push(text);
        }
        // This window already reached the end of the input: nothing left to cover.
        if (to >= s.length) {
            break;
        }
    }
    return windows;
}
38
/**
 * Split `text` into overlapping chunks of roughly `maxChars` characters.
 *
 * The text is cut into blocks at blank lines; consecutive blocks are packed
 * into a buffer (joined with a single newline) until the next block would push
 * it past `maxChars`. When a chunk is flushed, its last `overlap` characters
 * are carried over as the start of the next chunk. A block that is itself
 * longer than `maxChars` is hard-split with `splitLong`.
 *
 * Note: because the carried tail is prepended to a block that may already be
 * close to `maxChars`, a chunk can exceed `maxChars` by up to `overlap + 1`
 * characters.
 *
 * @param {string} text - Source text (CRLF line endings are normalized to LF).
 * @param {{maxChars?: number, overlap?: number}} [opts] - Size options;
 *   default to `DEFAULT_MAX_CHARS` and `DEFAULT_OVERLAP`.
 * @returns {string[]} Chunks in source order; `[]` for empty/whitespace input.
 */
export function chunkText(text, opts = {}) {
    const maxChars = opts.maxChars ?? DEFAULT_MAX_CHARS;
    const overlap = opts.overlap ?? DEFAULT_OVERLAP;
    const clean = text.replace(/\r\n/g, '\n').trim();
    if (!clean) {
        return [];
    }
    if (clean.length <= maxChars) {
        return [clean];
    }
    const blocks = clean
        .split(/\n{2,}/)
        .map((b) => b.trim())
        .filter(Boolean);
    const chunks = [];
    let buf = '';
    const flush = () => {
        if (buf.trim()) {
            chunks.push(buf.trim());
        }
    };
    for (const block of blocks) {
        if (block.length > maxChars) {
            // Oversized block: emit what we have, then window the block itself.
            flush();
            buf = '';
            chunks.push(...splitLong(block, maxChars, overlap));
            continue;
        }
        if (buf && buf.length + block.length + 1 > maxChars) {
            flush();
            // `buf.slice(-0)` is `buf.slice(0)`, i.e. the WHOLE buffer, so a
            // zero (or negative) overlap must not go through slice(): it would
            // re-emit every previous chunk inside the next one.
            const tail = overlap > 0 ? buf.slice(-overlap) : '';
            buf = tail ? `${tail}\n${block}` : block;
        }
        else {
            buf = buf ? `${buf}\n${block}` : block;
        }
    }
    flush();
    return chunks;
}
82
+ export const SEMANTIC_DEFAULTS = {
83
+ maxChars: 1800,
84
+ minChars: 250,
85
+ similarityThreshold: 0.90,
86
+ };
87
/**
 * Break text into sentence-like units.
 *
 * A boundary is any whitespace run that follows one of `. ! ? : ;`, or any run
 * of line breaks. CRLF is normalized to LF first; units are trimmed and empty
 * ones discarded.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Trimmed, non-empty units in source order.
 */
export function splitSentences(text) {
    const normalized = text.replace(/\r\n/g, '\n');
    const sentences = [];
    for (const piece of normalized.split(/(?<=[.!?:;])\s+|\n+/)) {
        const trimmed = piece.trim();
        if (trimmed) {
            sentences.push(trimmed);
        }
    }
    return sentences;
}
95
/**
 * Dot product of two vectors, iterating over the length of `a`.
 * For L2-normalized inputs of equal length this equals their cosine similarity.
 *
 * @param {ArrayLike<number>} a - First vector (its length drives the loop).
 * @param {ArrayLike<number>} b - Second vector.
 * @returns {number} Sum of the element-wise products.
 */
function dot(a, b) {
    let total = 0;
    let k = 0;
    while (k < a.length) {
        total += a[k] * b[k];
        k += 1;
    }
    return total;
}
103
/**
 * Build topically coherent chunks by grouping consecutive units whose
 * embeddings are similar.
 *
 * Units are paragraphs (blank-line separated). A paragraph longer than
 * `maxChars` is broken into sentences via `splitSentences`, and a sentence
 * still longer than `maxChars` is hard-split with `splitLong` (no overlap).
 * All units are embedded with a single `embedFn(units)` call. A running sum of
 * the group's vectors, re-normalized after every addition, serves as the group
 * centroid; a unit opens a new chunk when its dot product with the centroid is
 * below the threshold or when appending it would exceed `maxChars`. Finally,
 * chunks shorter than `minChars` are appended to the previous chunk if the
 * result still fits in `maxChars`.
 *
 * @param {string} text - Source text (CRLF is normalized to LF).
 * @param {(units: string[]) => any} embedFn - Called once with every unit;
 *   its awaited result is indexed as one vector per unit, in the same order.
 * @param {{maxChars?: number, minChars?: number, similarityThreshold?: number}} [opts]
 *   Options; missing values fall back to `SEMANTIC_DEFAULTS`.
 * @returns {Promise<string[]>} Chunks in source order; `[]` for empty input.
 */
export async function semanticChunk(text, embedFn, opts = {}) {
    const maxChars = opts.maxChars ?? SEMANTIC_DEFAULTS.maxChars;
    const minChars = opts.minChars ?? SEMANTIC_DEFAULTS.minChars;
    const threshold = opts.similarityThreshold ?? SEMANTIC_DEFAULTS.similarityThreshold;
    const normalized = text.replace(/\r\n/g, '\n').trim();
    if (!normalized) {
        return [];
    }

    // 1) Units: whole paragraphs when they fit, otherwise their sentences
    //    (hard-split when even a sentence is too long).
    const paragraphs = normalized
        .split(/\n{2,}/)
        .map((p) => p.trim())
        .filter(Boolean);
    const units = paragraphs.flatMap((para) => {
        if (para.length <= maxChars) {
            return [para];
        }
        return splitSentences(para).flatMap((sentence) => (
            sentence.length > maxChars ? splitLong(sentence, maxChars, 0) : [sentence]
        ));
    });
    // Zero or one unit: nothing to compare, so no embedding call is made.
    if (units.length < 2) {
        return units;
    }

    // 2) Greedy grouping of consecutive units around a running centroid.
    const vectors = await embedFn(units);
    const dim = vectors[0].length;
    const toUnitLength = (v) => {
        let squared = 0;
        for (let d = 0; d < dim; d++) {
            squared += v[d] * v[d];
        }
        // Floor the norm to avoid dividing by zero on an all-zero vector.
        const norm = Math.max(Math.sqrt(squared), 1e-12);
        return v.map((x) => x / norm);
    };
    const groups = [];
    let current = units[0];
    let runningSum = vectors[0].slice(); // copy: it is mutated in place below
    let centroid = vectors[0];
    for (let i = 1; i < units.length; i++) {
        const unit = units[i];
        const vector = vectors[i];
        const similarity = dot(centroid, vector);
        const tooLong = current.length + 1 + unit.length > maxChars;
        if (similarity >= threshold && !tooLong) {
            current += `\n${unit}`;
            for (let d = 0; d < dim; d++) {
                runningSum[d] += vector[d];
            }
            centroid = toUnitLength(runningSum);
            continue;
        }
        // Dissimilar or overflowing unit: close the group and start a new one.
        groups.push(current);
        current = unit;
        runningSum = vector.slice();
        centroid = vector;
    }
    groups.push(current);

    // 3) Fold tiny chunks into their predecessor when the result still fits.
    const merged = [];
    for (const chunk of groups) {
        const last = merged.length - 1;
        const prev = merged[last];
        const isTiny = chunk.length < minChars;
        if (prev && isTiny && prev.length + 1 + chunk.length <= maxChars) {
            merged[last] = `${prev}\n${chunk}`;
        }
        else {
            merged.push(chunk);
        }
    }
    return merged;
}
@@ -0,0 +1,250 @@
1
+ /**
2
+ * Local text embedding engine.
3
+ *
4
+ * Uses onnxruntime-node + @anush008/tokenizers (both prebuilt napi, no `sharp`)
5
+ * to run the multilingual model `multilingual-e5-small` fully offline.
6
+ *
7
+ * The ONNX weights + tokenizer are downloaded on first use from the
8
+ * HuggingFace Hub and cached on disk. The same module is the single source of
9
+ * truth for embeddings in both generation (vector-db) and query (server).
10
+ *
11
+ * Model notes (e5):
12
+ * - MEAN pooling over masked token embeddings + L2 normalization. Do NOT
13
+ * switch to CLS pooling — it would silently degrade retrieval quality.
14
+ * - e5 is trained with asymmetric prefixes: queries must be prefixed with
15
+ * "query: " and documents/passages with "passage: ". Use `embedQuery` for
16
+ * search input and `embedPassage` for indexed text. The prefix is only used
17
+ * to compute the vector; it is never stored.
18
+ * - max_seq_length = 512 (enough for ~400-token semantic chunks).
19
+ *
20
+ * @author Natalia Pujol Cremades (@nataliapc)
21
+ * @license GPL2
22
+ */
23
+ import * as ort from 'onnxruntime-node';
24
+ import { Tokenizer } from '@anush008/tokenizers';
25
+ import * as fs from 'fs';
26
+ import * as path from 'path';
27
+ import * as os from 'os';
28
+ const MODEL_REPO = 'Xenova/multilingual-e5-small';
29
+ // int8 ONNX is fastest on CPU; the GPU uses the fp32 ONNX (CUDA has no
30
+ // efficient kernels for dynamic-quantized ops, so the int8 model falls back to
31
+ // CPU even under the CUDA provider). fp32 (not fp16) is used on GPU so the
32
+ // output stays Float32Array — the fp16 model emits float16 output that the
33
+ // pooling code cannot read directly. The file is chosen per provider.
34
+ const ONNX_FILE_CPU = 'onnx/model_quantized.onnx';
35
+ const ONNX_FILE_GPU = 'onnx/model.onnx';
36
+ const TOKENIZER_FILE = 'tokenizer.json';
37
+ const HF_BASE = `https://huggingface.co/${MODEL_REPO}/resolve/main`;
38
+ /** Embedding dimensionality of the model. */
39
+ export const EMBEDDING_DIM = 384;
40
+ /** Model max sequence length (e5 max_seq_length). */
41
+ const MAX_LENGTH = 512;
42
+ let enginePromise = null;
43
+ // Server-safe default: the MCP server NEVER downloads or runs the large fp32
44
+ // model. Only an explicit setEmbedProvider('cuda') — used by the offline index
45
+ // generator — can opt into the GPU. The server never calls it, so it stays int8.
46
+ let requestedProvider = 'cpu';
47
/**
 * Choose the execution provider used when the embedding engine is created.
 *
 * Has to run before the first embedding: once the engine promise exists the
 * choice can no longer take effect, so the call is rejected. Only the index
 * generator is expected to ask for 'cuda'; the MCP server keeps the default
 * ('cpu' / int8) so end users only ever download the quantized model.
 *
 * @param {string} provider - Requested provider ('cpu' or 'cuda').
 * @throws {Error} If the engine has already started initializing.
 */
export function setEmbedProvider(provider) {
    const engineAlreadyRequested = Boolean(enginePromise);
    if (engineAlreadyRequested) {
        throw new Error('setEmbedProvider must be called before the first embedding');
    }
    requestedProvider = provider;
}
59
/**
 * Work out where the model files are cached on disk.
 *
 * The base directory is the first non-empty value among the environment
 * variables `OPENMSX_MODELS_CACHE`, `HF_HOME` and `TRANSFORMERS_CACHE`, falling
 * back to `~/.cache/mcp-openmsx`. The model folder is
 * `<base>/models/<MODEL_REPO with its first "/" replaced by "__">`.
 *
 * @returns {string} Cache directory for the configured model repository.
 */
function getCacheDir() {
    const { OPENMSX_MODELS_CACHE, HF_HOME, TRANSFORMERS_CACHE } = process.env;
    const override = [OPENMSX_MODELS_CACHE, HF_HOME, TRANSFORMERS_CACHE].find(Boolean);
    const base = override ?? path.join(os.homedir(), '.cache', 'mcp-openmsx');
    const repoFolder = MODEL_REPO.replace('/', '__');
    return path.join(base, 'models', repoFolder);
}
67
/**
 * Fetch one file of the model repository into `dest`, unless a non-empty file
 * is already there.
 *
 * The response is buffered in memory, written to `<dest>.download` and then
 * renamed onto `dest`, so an interrupted download cannot leave a truncated
 * file under the final name.
 *
 * @param {string} remote - Path of the file relative to `HF_BASE`.
 * @param {string} dest - Absolute destination path on disk.
 * @returns {Promise<void>}
 * @throws {Error} If the HTTP response is not OK or has no body.
 */
async function downloadFile(remote, dest) {
    const alreadyCached = fs.existsSync(dest) && fs.statSync(dest).size > 0;
    if (alreadyCached) {
        return;
    }
    await fs.promises.mkdir(path.dirname(dest), { recursive: true });
    const url = `${HF_BASE}/${remote}`;
    const response = await fetch(url);
    const usable = response.ok && response.body;
    if (!usable) {
        throw new Error(`Failed to download model file ${url}: ${response.status} ${response.statusText}`);
    }
    const bytes = Buffer.from(await response.arrayBuffer());
    // Stage under a temporary name; the rename publishes the complete file.
    const staging = `${dest}.download`;
    await fs.promises.writeFile(staging, bytes);
    await fs.promises.rename(staging, dest);
}
85
/**
 * Make sure the given ONNX file and the tokenizer are present in the cache,
 * downloading whichever is missing (both downloads run concurrently).
 *
 * @param {string} onnxFile - Repository-relative path of the ONNX model file.
 * @returns {Promise<{onnxPath: string, tokenizerPath: string}>} Local paths of
 *   the model and tokenizer files.
 */
async function ensureFiles(onnxFile) {
    const cacheDir = getCacheDir();
    const located = {
        onnxPath: path.join(cacheDir, onnxFile),
        tokenizerPath: path.join(cacheDir, TOKENIZER_FILE),
    };
    const pending = [
        downloadFile(onnxFile, located.onnxPath),
        downloadFile(TOKENIZER_FILE, located.tokenizerPath),
    ];
    await Promise.all(pending);
    return located;
}
96
+ const baseSessionOptions = {
97
+ graphOptimizationLevel: 'all',
98
+ intraOpNumThreads: Math.max(1, os.cpus().length),
99
+ interOpNumThreads: 1,
100
+ executionMode: 'sequential',
101
+ };
102
/**
 * Check whether a session can actually be created with the CUDA provider.
 *
 * The probe loads the model at `probeOnnxPath` (the caller passes the small
 * int8 model that is already on disk, so the large fp32 model is not
 * downloaded just to learn that CUDA is missing). Any failure while creating
 * or releasing the probe session is reported as "unavailable".
 *
 * @param {string} probeOnnxPath - Local path of the ONNX model used to probe.
 * @returns {Promise<boolean>} `true` only if a CUDA session loaded.
 */
async function cudaAvailable(probeOnnxPath) {
    let loaded = false;
    try {
        const cudaOptions = { ...baseSessionOptions, executionProviders: ['cuda'] };
        const probe = await ort.InferenceSession.create(probeOnnxPath, cudaOptions);
        await probe.release?.();
        loaded = true;
    }
    catch {
        loaded = false;
    }
    return loaded;
}
118
/**
 * Return the shared engine promise, creating the ONNX session and tokenizer
 * on first use.
 *
 * The int8 model is always fetched first: it is the default, the fallback and
 * the model used for the CUDA probe. The fp32 model is downloaded only when
 * 'cuda' was requested AND the probe succeeded. If initialization fails, the
 * cached promise is cleared so a later call can retry.
 *
 * @returns {Promise<{session: object, tokenizer: object}>} The engine.
 */
function getEngine() {
    if (enginePromise) {
        return enginePromise;
    }
    const initialize = async () => {
        const { onnxPath: int8Path, tokenizerPath } = await ensureFiles(ONNX_FILE_CPU);
        const tokenizer = Tokenizer.fromFile(tokenizerPath);
        tokenizer.setTruncation(MAX_LENGTH);
        const wantsCuda = requestedProvider === 'cuda';
        if (wantsCuda && (await cudaAvailable(int8Path))) {
            // CUDA is confirmed: only now fetch and load the fp32 model.
            const { onnxPath: fp32Path } = await ensureFiles(ONNX_FILE_GPU);
            const gpuSession = await ort.InferenceSession.create(fp32Path, {
                ...baseSessionOptions,
                executionProviders: ['cuda'],
            });
            process.stderr.write('[embedder] using CUDA execution provider (fp32)\n');
            return { session: gpuSession, tokenizer };
        }
        if (wantsCuda) {
            process.stderr.write('[embedder] CUDA requested but unavailable; using CPU (int8)\n');
        }
        const cpuSession = await ort.InferenceSession.create(int8Path, baseSessionOptions);
        return { session: cpuSession, tokenizer };
    };
    enginePromise = initialize().catch((err) => {
        // Forget the failed attempt so a transient error (e.g. network) can be retried.
        enginePromise = null;
        throw err;
    });
    return enginePromise;
}
155
+ /**
156
+ * Default batch size for batched inference.
157
+ */
158
+ const BATCH_SIZE = 32;
159
+ // XLM-RoBERTa / e5 pad token id. Padded positions get attention_mask 0, so the
160
+ // exact id is irrelevant to the pooled result; it only fills the tensor.
161
+ const PAD_ID = 1n;
162
/**
 * Embed already-prefixed strings, `batchSize` inputs per ONNX run.
 *
 * Each batch is tokenized, padded to the longest sequence in the batch (capped
 * at `MAX_LENGTH`) and run through the model once. For every input, the hidden
 * states of the positions whose attention mask is non-zero are averaged (mean
 * pooling) and the result is L2-normalized.
 *
 * @param {string[]} inputs - Texts to embed, prefix already applied.
 * @param {number} [batchSize=BATCH_SIZE] - Inputs per inference call.
 * @returns {Promise<number[][]>} One normalized vector per input, in order;
 *   `[]` when `inputs` is empty (the engine is not initialized in that case).
 */
async function embedRawBatch(inputs, batchSize = BATCH_SIZE) {
    if (inputs.length === 0) {
        return [];
    }
    const { session, tokenizer } = await getEngine();
    const needsTokenTypes = session.inputNames.includes('token_type_ids');
    const vectors = [];
    for (let offset = 0; offset < inputs.length; offset += batchSize) {
        const batch = inputs.slice(offset, offset + batchSize);
        const encodings = await Promise.all(batch.map((t) => tokenizer.encode(t)));
        const idRows = encodings.map((e) => e.getIds());
        const maskRows = encodings.map((e) => e.getAttentionMask());
        const rows = batch.length;
        const width = Math.min(MAX_LENGTH, Math.max(...idRows.map((a) => a.length)));

        // Row-major [rows, width] buffers. Ids start out as padding and the
        // mask starts out as zeros (typed arrays are zero-initialized), so
        // only the real tokens need to be written.
        const flatIds = new BigInt64Array(rows * width).fill(PAD_ID);
        const flatMask = new BigInt64Array(rows * width);
        for (let r = 0; r < rows; r++) {
            const ids = idRows[r];
            const mask = maskRows[r];
            const used = Math.min(ids.length, width);
            const rowStart = r * width;
            for (let c = 0; c < used; c++) {
                flatIds[rowStart + c] = BigInt(ids[c]);
                flatMask[rowStart + c] = BigInt(mask[c]);
            }
        }

        const shape = [rows, width];
        const feeds = {
            input_ids: new ort.Tensor('int64', flatIds, shape),
            attention_mask: new ort.Tensor('int64', flatMask, shape),
        };
        if (needsTokenTypes) {
            feeds.token_type_ids = new ort.Tensor('int64', new BigInt64Array(rows * width), shape);
        }
        const output = await session.run(feeds);
        const hidden = output['last_hidden_state'] ?? output[session.outputNames[0]];
        const data = hidden.data;
        const dim = hidden.dims[hidden.dims.length - 1];

        for (let r = 0; r < rows; r++) {
            // Sum the hidden states of the attended positions of this row.
            const pooled = new Array(dim).fill(0);
            let attended = 0;
            for (let t = 0; t < width; t++) {
                const position = r * width + t;
                if (flatMask[position] === 0n) {
                    continue;
                }
                attended++;
                const base = position * dim;
                for (let d = 0; d < dim; d++) {
                    pooled[d] += data[base + d];
                }
            }
            // Mean, then L2 normalization (both denominators are floored so an
            // empty or all-zero row cannot divide by zero).
            const denom = Math.max(attended, 1);
            let squared = 0;
            for (let d = 0; d < dim; d++) {
                pooled[d] /= denom;
                squared += pooled[d] * pooled[d];
            }
            const norm = Math.max(Math.sqrt(squared), 1e-12);
            for (let d = 0; d < dim; d++) {
                pooled[d] /= norm;
            }
            vectors.push(pooled);
        }
    }
    return vectors;
}
237
/**
 * Embed a search query. The text is sent to the model with the e5 "query: "
 * prefix.
 *
 * @param {string} text - Raw query text (without prefix).
 * @returns {Promise<number[]>} The query embedding.
 */
export async function embedQuery(text) {
    const vectors = await embedRawBatch([`query: ${text}`]);
    return vectors[0];
}
241
/**
 * Embed a document/passage that is going to be indexed. The text is sent to
 * the model with the e5 "passage: " prefix.
 *
 * @param {string} text - Raw passage text (without prefix).
 * @returns {Promise<number[]>} The passage embedding.
 */
export async function embedPassage(text) {
    const vectors = await embedRawBatch([`passage: ${text}`]);
    return vectors[0];
}
245
/**
 * Embed many passages at once. Every text gets the e5 "passage: " prefix and
 * the whole list is handed to `embedRawBatch`, which runs it in batches.
 *
 * @param {string[]} texts - Raw passage texts (without prefix).
 * @returns {Promise<number[][]>} One embedding per text, in input order.
 */
export function embedPassageBatch(texts) {
    const prefixed = texts.map((t) => `passage: ${t}`);
    return embedRawBatch(prefixed);
}
249
+ /** Default embedding = query side (kept for backward compatibility). */
250
+ export const embed = embedQuery;
package/dist/server.js CHANGED
@@ -29,7 +29,12 @@ const require = createRequire(import.meta.url);
29
29
  export const PACKAGE_VERSION = require('../package.json').version;
30
30
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
31
31
  const resourcesDir = path.join(__dirname, "../resources");
32
- const vectorDbDir = path.join(__dirname, "../vector-db");
32
+ // Index location. Defaults to the bundled `vector-db` next to the build, but can
33
+ // be overridden with OPENMSX_VECTORDB_DIR. LanceDB/lance-io (Rust object_store)
34
+ // cannot read a local index from a Windows *mapped network drive* (it drops the
35
+ // drive letter when converting the path to a file:// URL). When the project lives
36
+ // on a network share, point this at a copy of the index on a local disk.
37
+ const vectorDbDir = process.env.OPENMSX_VECTORDB_DIR?.trim() || path.join(__dirname, "../vector-db");
33
38
  export const emuDirectories = {
34
39
  OPENMSX_SHARE_DIR: '',
35
40
  OPENMSX_EXECUTABLE: detectOpenMSXExecutable(),
@@ -1677,10 +1677,11 @@ The parameter scrbasename is the name of the filename (without path) to save the
1677
1677
  "vector_db_query", {
1678
1678
  title: "Vector DB query from resources",
1679
1679
  // Description of the tool (what it does)
1680
- description: `Query the Vector DB resources to obtain information about MSX system, cartridges, programming, and other development resources.
1681
- The query is a string used to search within the Vector DB resources; it is case-insensitive and may contain spaces.
1682
- The response is the list of the top 10 result resources that match the query, including their score, title, and resource URI, and are sorted in descending order by proximity score to the query.
1683
- **Important Note**: The Vector DB resources are in english, japanese, or dutch.
1680
+ description: `Query the documentation index to obtain information about MSX system, cartridges, programming, and other development resources.
1681
+ Uses hybrid search: semantic (multilingual embeddings) combined with keyword (BM25) matching, fused with Reciprocal Rank Fusion. Good for both conceptual questions and exact terms (mnemonics, register names, BIOS calls).
1682
+ The query is a string; it is case-insensitive and may contain spaces.
1683
+ The response is the list of the top 10 matching resource chunks, including their relevance score, title, and resource URI, sorted in descending order by score.
1684
+ **Important Note**: The documentation resources are in english, japanese, or dutch (the embedding model is multilingual).
1684
1685
  `,
1685
1686
  // Schema for the tool (input validation)
1686
1687
  inputSchema: {
@@ -1691,7 +1692,7 @@ The response is the list of the top 10 result resources that match the query, in
1691
1692
  },
1692
1693
  outputSchema: {
1693
1694
  results: z.array(z.object({
1694
- score: z.string().describe("Proximity score of the result to the query, higher is better."),
1695
+ score: z.string().describe("Reciprocal Rank Fusion (RRF) relevance score combining semantic and keyword matches; higher is better. Only the relative ranking is meaningful, not the absolute value (typically ~0.01-0.03)."),
1695
1696
  title: z.string().describe("Title of the resource."),
1696
1697
  uri: z.string().describe("URI of the resource, which can be used to access the resource."),
1697
1698
  document: z.string().describe("Document chunk of the resource, retrieved from the Vector DB."),