@nataliapc/mcp-openmsx 1.2.10 → 1.2.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -2
- package/dist/chunker.js +187 -0
- package/dist/embedder.js +250 -0
- package/dist/server.js +6 -1
- package/dist/server_tools.js +6 -5
- package/dist/vectordb.js +94 -35
- package/package.json +4 -8
- package/resources/audio/chipsfmpacpr1_en.md +209 -0
- package/resources/audio/chipsfmpacpr2_en.md +170 -0
- package/resources/audio/toc.json +12 -0
- package/resources/book--msx-top-secret-3/MTS3-Appendix-English-Upd2.pdf +0 -0
- package/resources/book--msx-top-secret-3/MTS3-Complete-English.pdf +0 -0
- package/resources/book--msx-top-secret-3/mts3-appendix-english-upd2.md +25863 -0
- package/resources/book--msx-top-secret-3/mts3-complete-english.md +44895 -0
- package/resources/book--msx2-technical-handbook/toc.json +1 -1
- package/resources/book--the-msx-red-book/Chapter1_Programmable_Peripheral_Interface.md +112 -0
- package/resources/book--the-msx-red-book/Chapter2_Video_Display_Processor.md +308 -0
- package/resources/book--the-msx-red-book/Chapter3_Programmable_Sound_Generator.md +168 -0
- package/resources/book--the-msx-red-book/Chapter4_ROM_BIOS.md +2528 -0
- package/resources/book--the-msx-red-book/Chapter5_ROM_BASIC_Interpreter.md +3975 -0
- package/resources/book--the-msx-red-book/Chapter6_Memory_Map.md +1963 -0
- package/resources/book--the-msx-red-book/Chapter7_Machine_Code_Programs.md +1238 -0
- package/resources/book--the-msx-red-book/Introduction.md +104 -0
- package/resources/book--the-msx-red-book/toc.json +38 -3
- package/resources/processors/toc.json +3 -3
- package/resources/processors/z80-undocumented.md +141 -0
- package/resources/programming/asm_develop_a_program_in_cartridge_rom.md +1881 -0
- package/resources/programming/toc.json +6 -0
- package/resources/sdcc/1_Introduction.md +199 -0
- package/resources/sdcc/2_Installing_SDCC.md +533 -0
- package/resources/sdcc/3_Using_SDCC.md +1758 -0
- package/resources/sdcc/4_Notes_on_supported_Processors.md +1638 -0
- package/resources/sdcc/5_Debugging.md +210 -0
- package/resources/sdcc/6_Tips_and_Support.md +258 -0
- package/resources/sdcc/7_SDCC_Technical_Data.md +489 -0
- package/resources/sdcc/8_Compiler_internals.md +477 -0
- package/resources/sdcc/toc.json +44 -2
- package/resources/system/how_to_detect_ram.md +14 -0
- package/resources/system/mrc_wiki_megarom_mappers.md +533 -0
- package/resources/system/the_memory.md +118 -0
- package/resources/system/toc.json +18 -0
- package/vector-db/__manifest/_transactions/0-675ee228-bffb-4636-80e5-cdfde25cc4fe.txn +2 -0
- package/vector-db/__manifest/_versions/18446744073709551614.manifest +0 -0
- package/vector-db/__manifest/_versions/latest_version_hint.json +1 -0
- package/vector-db/msxdocs.lance/_indices/37194b01-2a25-40d1-ac38-7fbe254df5ea/metadata.lance +0 -0
- package/vector-db/msxdocs.lance/_indices/37194b01-2a25-40d1-ac38-7fbe254df5ea/part_2_docs.lance +0 -0
- package/vector-db/msxdocs.lance/_indices/37194b01-2a25-40d1-ac38-7fbe254df5ea/part_2_invert.lance +0 -0
- package/vector-db/msxdocs.lance/_indices/37194b01-2a25-40d1-ac38-7fbe254df5ea/part_2_tokens.lance +0 -0
- package/vector-db/msxdocs.lance/_transactions/0-dd155672-40e6-4c6a-942f-7fcbe8c3dbd0.txn +0 -0
- package/vector-db/msxdocs.lance/_transactions/1-e7230cbd-ce8e-465c-9b85-b91443862427.txn +0 -0
- package/vector-db/msxdocs.lance/_versions/18446744073709551613.manifest +0 -0
- package/vector-db/msxdocs.lance/_versions/18446744073709551614.manifest +0 -0
- package/vector-db/msxdocs.lance/_versions/latest_version_hint.json +1 -0
- package/vector-db/msxdocs.lance/data/000100110110001011110001fc578141d296825d0bea11c95d.lance +0 -0
- package/resources/book--the-msx-red-book/the_msx_red_book.md +0 -10349
- package/resources/processors/z80-undocumented.tex +0 -5617
- package/resources/sdcc/lyx2md.py +0 -745
- package/resources/sdcc/sdccman.lyx +0 -81574
- package/resources/sdcc/sdccman.md +0 -5557
- package/vector-db/index.json +0 -1
package/README.md
CHANGED
|
@@ -46,7 +46,7 @@ This project creates a bridge between modern AI-assisted development (e.g. GitHu
|
|
|
46
46
|
- **Video Control**: VDP register manipulation and screen capture.
|
|
47
47
|
- **Memory Operations**: Read/write RAM, VRAM, and I/O port access.
|
|
48
48
|
- **Automation**: Keyboard input simulation and savestate management.
|
|
49
|
-
- **
|
|
49
|
+
- **Hybrid Documentation Search**: Query an embedded local index of MSX resources that combines semantic (multilingual embeddings) and keyword (BM25) search and runs fully offline once the embedding model has been downloaded.
|
|
50
50
|
- **Hybrid Mode**: This MCP server supports hybrid access mode (_STDIO_ and _HTTP_ transports).
|
|
51
51
|
|
|
52
52
|
## Architecture
|
|
@@ -117,7 +117,7 @@ The MCP server translates high-level natural language commands from your Copilot
|
|
|
117
117
|
- `msxdocs_resource_get`: Retrieve MCP resources for MCP clients that don't support MCP resources.
|
|
118
118
|
|
|
119
119
|
### Documentation Tools
|
|
120
|
-
- `vector_db_query`:
|
|
120
|
+
- `vector_db_query`: Hybrid search (semantic embeddings + BM25) over the local MSX documentation index, for information about MSX systems, cartridges, programming, and other development resources.
|
|
121
121
|
- `msxdocs_resource_get`: Retrieve MCP resources for MCP clients that don't support MCP resources.
|
|
122
122
|
|
|
123
123
|
## Available MCP Resources
|
|
@@ -194,6 +194,9 @@ Steps to install the MCP server in VSCode:
|
|
|
194
194
|
}
|
|
195
195
|
```
|
|
196
196
|
|
|
197
|
+
> [!NOTE]
|
|
198
|
+
> On Windows, you can change the `command` field to `npx.cmd` if you experience permission issues.
|
|
199
|
+
|
|
197
200
|
> [!NOTE]
|
|
198
201
|
> Environment variables are optional. Customize them as you need.
|
|
199
202
|
|
|
@@ -251,6 +254,21 @@ Edit it to include the following JSON entry:
|
|
|
251
254
|
| `MCP_ALLOWED_ORIGINS` | Comma-separated list of allowed origins for HTTP transport | Empty for all allowed | `http://localhost,http://mydomain.com` |
|
|
252
255
|
| `OPENMSX_WINDOWS_CONTROL` | **Windows only.** How the server talks to openMSX's control socket (see below) | `stdio-proxy` | `direct-sspi` |
|
|
253
256
|
| `OPENMSX_WINDOWS_PROXY_EXECUTABLE` | **Windows only.** Override path to the SSPI proxy helper (development) | Bundled `bin/win-x64/mcp-openmsx-sspi-proxy.exe` | `C:\path\to\mcp-openmsx-sspi-proxy.exe` |
|
|
257
|
+
| `OPENMSX_MODELS_CACHE` | Directory where the embedding model is cached (also honors `HF_HOME` / `TRANSFORMERS_CACHE`) | `~/.cache/mcp-openmsx` | `/opt/models` |
|
|
258
|
+
| `OPENMSX_EMBED_PROVIDER` | **Index generator only.** `cuda` uses the GPU (fp32 model) to regenerate the index, falling back to CPU if CUDA is unavailable. The MCP server itself always uses CPU/int8 and ignores this variable. | (generator: `cpu`) | `cuda` |
|
|
259
|
+
|
|
260
|
+
#### Documentation search model
|
|
261
|
+
|
|
262
|
+
The `vector_db_query` tool runs a local hybrid search (semantic embeddings + BM25). The embedding model
|
|
263
|
+
(`multilingual-e5-small`, ONNX quantized, ~118 MB, 512-token context, multilingual) is **downloaded once**
|
|
264
|
+
from the HuggingFace Hub on the first query and cached on disk (see `OPENMSX_MODELS_CACHE` above). After that
|
|
265
|
+
it runs fully offline. No API key is required. To pre-populate the cache for air-gapped environments, run one
|
|
266
|
+
query on a networked machine and copy the cache directory.
|
|
267
|
+
|
|
268
|
+
Regenerating the index (rare) embeds the whole corpus. On CPU this is slow; on an NVIDIA GPU set
|
|
269
|
+
`OPENMSX_EMBED_PROVIDER=cuda` for the generator to use it (requires CUDA 13 runtime libraries + cuDNN 9),
|
|
270
|
+
which is ~50× faster. The GPU path uses the larger fp32 model; the server keeps using the int8 model and
|
|
271
|
+
the two are interchangeable for search (same ranking). End users never download the fp32 model.
|
|
254
272
|
|
|
255
273
|
#### Windows control modes (`OPENMSX_WINDOWS_CONTROL`)
|
|
256
274
|
|
package/dist/chunker.js
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Local text chunkers (no external API).
|
|
3
|
+
*
|
|
4
|
+
* Two strategies:
|
|
5
|
+
* - `chunkText`: deterministic, markdown-aware, fixed-size with overlap.
|
|
6
|
+
* Used as a fallback and to hard-split oversized units.
|
|
7
|
+
* - `semanticChunk`: groups consecutive sentences by embedding similarity
|
|
8
|
+
* (cosine), so each chunk stays topically coherent, up to a size bound.
|
|
9
|
+
* Requires an embedding function (injected) — the model runs locally.
|
|
10
|
+
*
|
|
11
|
+
* Sizing targets the embedding model's context window. With multilingual-e5
|
|
12
|
+
* (max 512 tokens) we aim for ~1600 characters (~400 tokens), leaving room for
|
|
13
|
+
* the "passage: " prefix.
|
|
14
|
+
*
|
|
15
|
+
* @author Natalia Pujol Cremades (@nataliapc)
|
|
16
|
+
* @license GPL2
|
|
17
|
+
*/
|
|
18
|
+
export const DEFAULT_MAX_CHARS = 1600;
|
|
19
|
+
export const DEFAULT_OVERLAP = 100;
|
|
20
|
+
/**
 * Hard-split a single oversized block into overlapping windows.
 *
 * Each window is at most `maxChars` characters long and starts
 * `maxChars - overlap` characters after the previous one. Windows are trimmed
 * and windows that are empty after trimming are dropped.
 *
 * @param {string} s - Text to split.
 * @param {number} maxChars - Maximum window length, in characters.
 * @param {number} overlap - Characters shared by consecutive windows. Clamped
 *   to `[0, maxChars - 1]`; a non-finite value is treated as 0.
 * @returns {string[]} Trimmed, non-empty windows in source order.
 */
function splitLong(s, maxChars, overlap) {
    const out = [];
    // A negative overlap would make the step larger than the window and
    // silently drop the characters between two windows; a non-finite overlap
    // would turn `start` into NaN and stop after the first window.
    const requested = Number.isFinite(overlap) ? overlap : 0;
    const safeOverlap = Math.min(Math.max(0, requested), Math.max(0, maxChars - 1));
    // Always advance by at least one character so the loop terminates.
    const step = Math.max(1, maxChars - safeOverlap);
    let start = 0;
    while (start < s.length) {
        const end = Math.min(start + maxChars, s.length);
        const piece = s.slice(start, end).trim();
        if (piece) {
            out.push(piece);
        }
        if (end >= s.length) {
            // The last window reached the end of the text; a further step
            // would only re-emit an overlapping suffix.
            break;
        }
        start += step;
    }
    return out;
}
|
|
38
|
+
/**
 * Split `text` into overlapping, markdown-aware fixed-size chunks.
 *
 * The text is split into blocks on blank lines. Consecutive blocks are joined
 * with a single newline until adding the next block would exceed `maxChars`;
 * the buffer is then emitted and the next chunk starts with the last `overlap`
 * characters of the emitted buffer followed by the new block. A block that is
 * longer than `maxChars` on its own is hard-split with `splitLong`.
 *
 * Note that a chunk that starts with an overlap tail can be up to
 * `maxChars + overlap + 1` characters long.
 *
 * @param {string} text - Input text (CRLF line endings are normalized to LF).
 * @param {{maxChars?: number, overlap?: number}} [opts] - Size options;
 *   default to `DEFAULT_MAX_CHARS` and `DEFAULT_OVERLAP`. A negative overlap
 *   is treated as 0.
 * @returns {string[]} Trimmed chunks in source order; `[]` for empty or
 *   whitespace-only input.
 */
export function chunkText(text, opts = {}) {
    const maxChars = opts.maxChars ?? DEFAULT_MAX_CHARS;
    const overlap = Math.max(0, opts.overlap ?? DEFAULT_OVERLAP);
    const clean = text.replace(/\r\n/g, '\n').trim();
    if (!clean) {
        return [];
    }
    if (clean.length <= maxChars) {
        return [clean];
    }
    const blocks = clean
        .split(/\n{2,}/)
        .map((b) => b.trim())
        .filter(Boolean);
    const chunks = [];
    let buf = '';
    const flush = () => {
        if (buf.trim()) {
            chunks.push(buf.trim());
        }
    };
    for (const block of blocks) {
        if (block.length > maxChars) {
            // Oversized block: emit what is pending, then hard-split the block.
            // No overlap tail is carried over into the following chunk.
            flush();
            buf = '';
            chunks.push(...splitLong(block, maxChars, overlap));
            continue;
        }
        if (buf && buf.length + block.length + 1 > maxChars) {
            flush();
            // `slice(-0)` is `slice(0)` and would copy the WHOLE buffer into
            // the next chunk, so a zero overlap must be handled explicitly.
            const tail = overlap > 0 ? buf.slice(-overlap) : '';
            buf = tail ? `${tail}\n${block}` : block;
        }
        else {
            buf = buf ? `${buf}\n${block}` : block;
        }
    }
    flush();
    return chunks;
}
|
|
82
|
+
export const SEMANTIC_DEFAULTS = {
|
|
83
|
+
maxChars: 1800,
|
|
84
|
+
minChars: 250,
|
|
85
|
+
similarityThreshold: 0.90,
|
|
86
|
+
};
|
|
87
|
+
/**
 * Break text into sentence-like units.
 *
 * A boundary is any whitespace run that follows one of `. ! ? : ;`, or any run
 * of line breaks. CRLF is normalized to LF first; every unit is trimmed and
 * empty units are discarded.
 *
 * @param {string} text - Input text.
 * @returns {string[]} Non-empty, trimmed units in source order.
 */
export function splitSentences(text) {
    const normalized = text.replace(/\r\n/g, '\n');
    const sentences = [];
    for (const fragment of normalized.split(/(?<=[.!?:;])\s+|\n+/)) {
        const trimmed = fragment.trim();
        if (trimmed) {
            sentences.push(trimmed);
        }
    }
    return sentences;
}
|
|
95
|
+
/**
 * Dot product of two vectors, walking the elements of `a` and reading the
 * element of `b` at the same index. For L2-normalized inputs of equal length
 * this is the cosine similarity.
 *
 * @param {ArrayLike<number>} a - First vector (drives the iteration).
 * @param {ArrayLike<number>} b - Second vector.
 * @returns {number} Sum of the pairwise products.
 */
function dot(a, b) {
    let total = 0;
    let index = 0;
    for (const value of a) {
        total += value * b[index];
        index += 1;
    }
    return total;
}
|
|
103
|
+
/**
 * Group consecutive text units by embedding similarity into coherent chunks.
 *
 * Units are paragraphs (split on blank lines). A paragraph longer than
 * `maxChars` is split into sentences with `splitSentences`, and a sentence that
 * is still too long is hard-split with `splitLong` (no overlap). All units are
 * embedded in a single `embedFn` call. A running, re-normalized centroid
 * represents the current group; a unit starts a new chunk when its dot product
 * with the centroid is below the threshold or when appending it would exceed
 * `maxChars`. Finally, chunks shorter than `minChars` are merged into the
 * preceding chunk when the result still fits in `maxChars`.
 *
 * @param {string} text - Input text (CRLF line endings are normalized to LF).
 * @param {(units: string[]) => Promise<ArrayLike<number>[]>} embedFn - Returns
 *   one vector per unit, in the same order. The similarity test is a plain dot
 *   product, so vectors are expected to be L2-normalized.
 * @param {{maxChars?: number, minChars?: number, similarityThreshold?: number}} [opts]
 *   Overrides for `SEMANTIC_DEFAULTS`.
 * @returns {Promise<string[]>} Chunks in source order; `[]` for empty input.
 * @throws {Error} If `embedFn` does not return exactly one vector per unit.
 */
export async function semanticChunk(text, embedFn, opts = {}) {
    const maxChars = opts.maxChars ?? SEMANTIC_DEFAULTS.maxChars;
    const minChars = opts.minChars ?? SEMANTIC_DEFAULTS.minChars;
    const threshold = opts.similarityThreshold ?? SEMANTIC_DEFAULTS.similarityThreshold;
    const clean = text.replace(/\r\n/g, '\n').trim();
    if (!clean) {
        return [];
    }
    // Units = paragraphs (split on blank lines). Oversized paragraphs are split
    // into sentences, and oversized sentences hard-split. Paragraph granularity
    // keeps the embedding count tractable on CPU while staying semantically
    // meaningful (a paragraph is a natural topical unit).
    const units = [];
    for (const para of clean.split(/\n{2,}/).map((p) => p.trim()).filter(Boolean)) {
        if (para.length <= maxChars) {
            units.push(para);
            continue;
        }
        for (const s of splitSentences(para)) {
            if (s.length > maxChars) {
                units.push(...splitLong(s, maxChars, 0));
            }
            else {
                units.push(s);
            }
        }
    }
    if (units.length === 0) {
        return [];
    }
    if (units.length === 1) {
        // Nothing to compare against: skip the embedding call entirely.
        return [units[0]];
    }
    const embeddings = await embedFn(units);
    // Fail with a clear message instead of an opaque TypeError further down
    // when the embedding function returns too few (or no) vectors.
    const received = embeddings?.length ?? 0;
    if (received !== units.length) {
        throw new Error(`semanticChunk: embedFn returned ${received} embedding(s) for ${units.length} unit(s)`);
    }
    const dim = embeddings[0].length;
    const chunks = [];
    let groupText = units[0];
    let sum = embeddings[0].slice(); // running sum of member vectors
    let centroid = embeddings[0]; // normalized centroid
    const renormalize = (v) => {
        let n = 0;
        for (let i = 0; i < dim; i++) {
            n += v[i] * v[i];
        }
        // Floor the norm so an all-zero sum cannot divide by zero.
        n = Math.max(Math.sqrt(n), 1e-12);
        return v.map((x) => x / n);
    };
    for (let i = 1; i < units.length; i++) {
        const sim = dot(centroid, embeddings[i]);
        const wouldOverflow = groupText.length + 1 + units[i].length > maxChars;
        if (sim >= threshold && !wouldOverflow) {
            groupText += `\n${units[i]}`;
            for (let d = 0; d < dim; d++) {
                sum[d] += embeddings[i][d];
            }
            centroid = renormalize(sum);
        }
        else {
            chunks.push(groupText);
            groupText = units[i];
            sum = embeddings[i].slice();
            centroid = embeddings[i];
        }
    }
    chunks.push(groupText);
    // Merge tiny fragments into the previous chunk when they fit.
    const merged = [];
    for (const c of chunks) {
        const prev = merged[merged.length - 1];
        if (prev && c.length < minChars && prev.length + 1 + c.length <= maxChars) {
            merged[merged.length - 1] = `${prev}\n${c}`;
        }
        else {
            merged.push(c);
        }
    }
    return merged;
}
|
package/dist/embedder.js
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Local text embedding engine.
|
|
3
|
+
*
|
|
4
|
+
* Uses onnxruntime-node + @anush008/tokenizers (both prebuilt napi, no `sharp`)
|
|
5
|
+
* to run the multilingual model `multilingual-e5-small` fully offline.
|
|
6
|
+
*
|
|
7
|
+
* The ONNX weights + tokenizer are downloaded on first use from the
|
|
8
|
+
* HuggingFace Hub and cached on disk. The same module is the single source of
|
|
9
|
+
* truth for embeddings in both generation (vector-db) and query (server).
|
|
10
|
+
*
|
|
11
|
+
* Model notes (e5):
|
|
12
|
+
* - MEAN pooling over masked token embeddings + L2 normalization. Do NOT
|
|
13
|
+
* switch to CLS pooling — it would silently degrade retrieval quality.
|
|
14
|
+
* - e5 is trained with asymmetric prefixes: queries must be prefixed with
|
|
15
|
+
* "query: " and documents/passages with "passage: ". Use `embedQuery` for
|
|
16
|
+
* search input and `embedPassage` for indexed text. The prefix is only used
|
|
17
|
+
* to compute the vector; it is never stored.
|
|
18
|
+
* - max_seq_length = 512 (enough for ~400-token semantic chunks).
|
|
19
|
+
*
|
|
20
|
+
* @author Natalia Pujol Cremades (@nataliapc)
|
|
21
|
+
* @license GPL2
|
|
22
|
+
*/
|
|
23
|
+
import * as ort from 'onnxruntime-node';
|
|
24
|
+
import { Tokenizer } from '@anush008/tokenizers';
|
|
25
|
+
import * as fs from 'fs';
|
|
26
|
+
import * as path from 'path';
|
|
27
|
+
import * as os from 'os';
|
|
28
|
+
const MODEL_REPO = 'Xenova/multilingual-e5-small';
|
|
29
|
+
// int8 ONNX is fastest on CPU; the GPU uses the fp32 ONNX (CUDA has no
|
|
30
|
+
// efficient kernels for dynamic-quantized ops, so the int8 model falls back to
|
|
31
|
+
// CPU even under the CUDA provider). fp32 (not fp16) is used on GPU so the
|
|
32
|
+
// output stays Float32Array — the fp16 model emits float16 output that the
|
|
33
|
+
// pooling code cannot read directly. The file is chosen per provider.
|
|
34
|
+
const ONNX_FILE_CPU = 'onnx/model_quantized.onnx';
|
|
35
|
+
const ONNX_FILE_GPU = 'onnx/model.onnx';
|
|
36
|
+
const TOKENIZER_FILE = 'tokenizer.json';
|
|
37
|
+
const HF_BASE = `https://huggingface.co/${MODEL_REPO}/resolve/main`;
|
|
38
|
+
/** Embedding dimensionality of the model. */
|
|
39
|
+
export const EMBEDDING_DIM = 384;
|
|
40
|
+
/** Model max sequence length (e5 max_seq_length). */
|
|
41
|
+
const MAX_LENGTH = 512;
|
|
42
|
+
let enginePromise = null;
|
|
43
|
+
// Server-safe default: the MCP server NEVER downloads or runs the large fp32
|
|
44
|
+
// model. Only an explicit setEmbedProvider('cuda') — used by the offline index
|
|
45
|
+
// generator — can opt into the GPU. The server never calls it, so it stays int8.
|
|
46
|
+
let requestedProvider = 'cpu';
|
|
47
|
+
/**
 * Choose the execution provider used for embeddings.
 *
 * The value is only recorded here; it is read when the engine is first
 * created, so this must run before the first embedding is computed. Only the
 * index generator should request 'cuda'; the MCP server leaves the default
 * ('cpu' / int8) so end users only ever download the quantized model.
 *
 * @param {string} provider - Requested provider, e.g. 'cpu' or 'cuda'.
 * @throws {Error} If the engine has already been (or is being) initialized.
 */
export function setEmbedProvider(provider) {
    if (!enginePromise) {
        requestedProvider = provider;
        return;
    }
    throw new Error('setEmbedProvider must be called before the first embedding');
}
|
|
59
|
+
/**
 * Resolve the on-disk cache directory for the model files.
 *
 * The base directory is the first non-empty value among the environment
 * variables `OPENMSX_MODELS_CACHE`, `HF_HOME` and `TRANSFORMERS_CACHE`, falling
 * back to `~/.cache/mcp-openmsx`. The model lives under
 * `<base>/models/<repo>` with the first `/` of the repo id replaced by `__`.
 *
 * @returns {string} Directory path for this model's files.
 */
function getCacheDir() {
    const { OPENMSX_MODELS_CACHE, HF_HOME, TRANSFORMERS_CACHE } = process.env;
    // The home-directory fallback is only computed when no override is set.
    const root = [OPENMSX_MODELS_CACHE, HF_HOME, TRANSFORMERS_CACHE].find(Boolean)
        ?? path.join(os.homedir(), '.cache', 'mcp-openmsx');
    const repoFolder = MODEL_REPO.replace('/', '__');
    return path.join(root, 'models', repoFolder);
}
|
|
67
|
+
/**
 * Download a single file from the HF Hub to `dest` if not already present.
 *
 * A file that exists with a non-zero size is treated as already downloaded.
 * Otherwise the parent directory is created, the file is fetched from
 * `${HF_BASE}/${remote}` fully into memory, written to a temporary file next to
 * `dest` and renamed into place.
 *
 * @param {string} remote - Path of the file relative to the repo root.
 * @param {string} dest - Absolute destination path on disk.
 * @returns {Promise<void>}
 * @throws {Error} On a non-OK HTTP response, a missing or empty body, or a
 *   filesystem error while writing.
 */
async function downloadFile(remote, dest) {
    if (fs.existsSync(dest) && fs.statSync(dest).size > 0) {
        return;
    }
    await fs.promises.mkdir(path.dirname(dest), { recursive: true });
    const url = `${HF_BASE}/${remote}`;
    const res = await fetch(url);
    if (!res.ok || !res.body) {
        throw new Error(`Failed to download model file ${url}: ${res.status} ${res.statusText}`);
    }
    const buffer = Buffer.from(await res.arrayBuffer());
    if (buffer.length === 0) {
        // An empty file would be re-downloaded next time anyway (size check
        // above), but loading it now would fail with a confusing model error.
        throw new Error(`Failed to download model file ${url}: empty response body`);
    }
    // Write atomically: tmp file + rename, so a crash mid-download cannot leave
    // a truncated file that later looks "present". The pid keeps two processes
    // that start at the same time from writing to the same temporary file.
    const tmp = `${dest}.${process.pid}.download`;
    try {
        await fs.promises.writeFile(tmp, buffer);
        await fs.promises.rename(tmp, dest);
    }
    catch (err) {
        // Best-effort cleanup so a failed write does not leave a stray file.
        await fs.promises.rm(tmp, { force: true }).catch(() => { });
        throw err;
    }
}
|
|
85
|
+
/**
 * Make sure the given ONNX file and the tokenizer are present in the cache
 * directory, downloading whichever is missing (both requests are started
 * together and awaited as a group).
 *
 * @param {string} onnxFile - Repo-relative path of the ONNX model file.
 * @returns {Promise<{onnxPath: string, tokenizerPath: string}>} Local paths of
 *   the two files.
 */
async function ensureFiles(onnxFile) {
    const cacheDir = getCacheDir();
    const wanted = [onnxFile, TOKENIZER_FILE].map((remote) => ({
        remote,
        local: path.join(cacheDir, remote),
    }));
    await Promise.all(wanted.map(({ remote, local }) => downloadFile(remote, local)));
    const [model, tokenizer] = wanted;
    return { onnxPath: model.local, tokenizerPath: tokenizer.local };
}
|
|
96
|
+
const baseSessionOptions = {
|
|
97
|
+
graphOptimizationLevel: 'all',
|
|
98
|
+
intraOpNumThreads: Math.max(1, os.cpus().length),
|
|
99
|
+
interOpNumThreads: 1,
|
|
100
|
+
executionMode: 'sequential',
|
|
101
|
+
};
|
|
102
|
+
/**
 * Probe whether the CUDA provider can actually be created, by opening a
 * session on the given (small) model with `executionProviders: ['cuda']`.
 * This avoids downloading the large fp32 model just to find out CUDA is
 * unavailable.
 *
 * Any failure is treated as "CUDA unavailable"; the reason is written to
 * stderr so the user can see why the GPU path was rejected.
 *
 * @param {string} probeOnnxPath - Path of an ONNX model already on disk.
 * @returns {Promise<boolean>} True only if a CUDA session loads (and, when the
 *   session exposes `release`, is released without error).
 */
async function cudaAvailable(probeOnnxPath) {
    try {
        const probe = await ort.InferenceSession.create(probeOnnxPath, {
            ...baseSessionOptions,
            executionProviders: ['cuda'],
        });
        await probe.release?.();
        return true;
    }
    catch (err) {
        // Still best-effort (never throws), but no longer silent.
        const reason = err instanceof Error ? err.message : String(err);
        process.stderr.write(`[embedder] CUDA probe failed: ${reason}\n`);
        return false;
    }
}
|
|
118
|
+
/**
 * Return the shared `{ session, tokenizer }` engine, creating it on first use.
 *
 * The int8 model and tokenizer are always fetched first. When 'cuda' was
 * requested and the CUDA probe succeeds, the fp32 model is fetched and loaded
 * with the CUDA provider; otherwise the int8 model is loaded with the default
 * session options. If initialization fails, the cached promise is cleared so a
 * later call can retry.
 *
 * @returns {Promise<{session: object, tokenizer: object}>} The engine promise
 *   (the same promise for every caller while it is pending or resolved).
 */
function getEngine() {
    if (enginePromise) {
        return enginePromise;
    }
    const load = async () => {
        const cpuFiles = await ensureFiles(ONNX_FILE_CPU);
        const tokenizer = Tokenizer.fromFile(cpuFiles.tokenizerPath);
        tokenizer.setTruncation(MAX_LENGTH);
        const wantsCuda = requestedProvider === 'cuda';
        if (wantsCuda && (await cudaAvailable(cpuFiles.onnxPath))) {
            // CUDA confirmed: only now download and load the fp32 model.
            const gpuFiles = await ensureFiles(ONNX_FILE_GPU);
            const gpuSession = await ort.InferenceSession.create(gpuFiles.onnxPath, {
                ...baseSessionOptions,
                executionProviders: ['cuda'],
            });
            process.stderr.write('[embedder] using CUDA execution provider (fp32)\n');
            return { session: gpuSession, tokenizer };
        }
        if (wantsCuda) {
            process.stderr.write('[embedder] CUDA requested but unavailable; using CPU (int8)\n');
        }
        const cpuSession = await ort.InferenceSession.create(cpuFiles.onnxPath, baseSessionOptions);
        return { session: cpuSession, tokenizer };
    };
    enginePromise = load().catch((err) => {
        // Forget the failed attempt so a transient error (e.g. network) can be retried.
        enginePromise = null;
        throw err;
    });
    return enginePromise;
}
|
|
155
|
+
/**
|
|
156
|
+
* Default batch size for batched inference.
|
|
157
|
+
*/
|
|
158
|
+
const BATCH_SIZE = 32;
|
|
159
|
+
// XLM-RoBERTa / e5 pad token id. Padded positions get attention_mask 0, so the
|
|
160
|
+
// exact id is irrelevant to the pooled result; it only fills the tensor.
|
|
161
|
+
const PAD_ID = 1n;
|
|
162
|
+
/**
 * Embed a list of already-prefixed inputs in batches (one ONNX run per batch,
 * dynamic padding to the longest sequence in the batch, capped at
 * `MAX_LENGTH`). Returns one L2-normalized vector per input, obtained by mean
 * pooling the hidden states of the positions whose attention mask is non-zero.
 *
 * @param {string[]} inputs - Texts to embed, prefix already applied.
 * @param {number} [batchSize=BATCH_SIZE] - Inputs per ONNX run. Values that are
 *   not finite or are below 1 fall back to `BATCH_SIZE`; fractions are floored.
 * @returns {Promise<number[][]>} One vector per input, in input order; `[]`
 *   for an empty input list (the engine is not initialized in that case).
 */
async function embedRawBatch(inputs, batchSize = BATCH_SIZE) {
    if (inputs.length === 0) {
        return [];
    }
    // A zero, negative or NaN batch size would keep `start` from advancing and
    // loop forever; sanitize it before batching.
    const step = Number.isFinite(batchSize) && batchSize >= 1 ? Math.floor(batchSize) : BATCH_SIZE;
    const { session, tokenizer } = await getEngine();
    // Only feed token_type_ids when the loaded model declares that input.
    const hasTokenTypes = session.inputNames.includes('token_type_ids');
    const results = [];
    for (let start = 0; start < inputs.length; start += step) {
        const batch = inputs.slice(start, start + step);
        const encodings = await Promise.all(batch.map((t) => tokenizer.encode(t)));
        const idsArr = encodings.map((e) => e.getIds());
        const maskArr = encodings.map((e) => e.getAttentionMask());
        const B = batch.length;
        const maxLen = Math.min(MAX_LENGTH, Math.max(...idsArr.map((a) => a.length)));
        // Row-major [B, maxLen] buffers; int64 tensors require BigInt64Array.
        const flatIds = new BigInt64Array(B * maxLen);
        const flatMask = new BigInt64Array(B * maxLen);
        for (let r = 0; r < B; r++) {
            const ids = idsArr[r];
            const mask = maskArr[r];
            const len = Math.min(ids.length, maxLen);
            const rowBase = r * maxLen;
            for (let c = 0; c < len; c++) {
                flatIds[rowBase + c] = BigInt(ids[c]);
                flatMask[rowBase + c] = BigInt(mask[c]);
            }
            // Pad the rest of the row; mask 0 excludes these positions from pooling.
            for (let c = len; c < maxLen; c++) {
                flatIds[rowBase + c] = PAD_ID;
                flatMask[rowBase + c] = 0n;
            }
        }
        const feeds = {
            input_ids: new ort.Tensor('int64', flatIds, [B, maxLen]),
            attention_mask: new ort.Tensor('int64', flatMask, [B, maxLen]),
        };
        if (hasTokenTypes) {
            // A fresh BigInt64Array is zero-filled, i.e. all token types are 0.
            feeds.token_type_ids = new ort.Tensor('int64', new BigInt64Array(B * maxLen), [B, maxLen]);
        }
        const output = await session.run(feeds);
        const hidden = output['last_hidden_state'] ?? output[session.outputNames[0]];
        const data = hidden.data;
        // The last dimension of the output tensor is the embedding width.
        const dim = hidden.dims[hidden.dims.length - 1];
        for (let r = 0; r < B; r++) {
            const pooled = new Array(dim).fill(0);
            let count = 0;
            for (let t = 0; t < maxLen; t++) {
                if (flatMask[r * maxLen + t] === 0n) {
                    continue;
                }
                count++;
                const base = (r * maxLen + t) * dim;
                for (let d = 0; d < dim; d++) {
                    pooled[d] += data[base + d];
                }
            }
            // Mean over unmasked tokens, then L2-normalize; both divisors are
            // floored so an empty or all-zero row cannot divide by zero.
            const denom = Math.max(count, 1);
            let norm = 0;
            for (let d = 0; d < dim; d++) {
                pooled[d] /= denom;
                norm += pooled[d] * pooled[d];
            }
            norm = Math.max(Math.sqrt(norm), 1e-12);
            for (let d = 0; d < dim; d++) {
                pooled[d] /= norm;
            }
            results.push(pooled);
        }
    }
    return results;
}
|
|
237
|
+
/**
 * Embed a search query. The text is prefixed with "query: " before embedding.
 *
 * @param {string} text - Raw query text.
 * @returns {Promise<number[]>} The query vector.
 */
export async function embedQuery(text) {
    const vectors = await embedRawBatch([`query: ${text}`]);
    return vectors[0];
}
|
|
241
|
+
/**
 * Embed a document/passage to be indexed. The text is prefixed with
 * "passage: " before embedding.
 *
 * @param {string} text - Raw passage text.
 * @returns {Promise<number[]>} The passage vector.
 */
export async function embedPassage(text) {
    const vectors = await embedRawBatch([`passage: ${text}`]);
    return vectors[0];
}
|
|
245
|
+
/**
 * Batch-embed passages. Every text is prefixed with "passage: " and the whole
 * list is handed to the batched embedder in one call.
 *
 * @param {string[]} texts - Raw passage texts.
 * @returns {Promise<number[][]>} One vector per text, in input order.
 */
export function embedPassageBatch(texts) {
    const prefixed = [];
    for (const passage of texts) {
        prefixed.push(`passage: ${passage}`);
    }
    return embedRawBatch(prefixed);
}
|
|
249
|
+
/** Default embedding = query side (kept for backward compatibility). */
|
|
250
|
+
export const embed = embedQuery;
|
package/dist/server.js
CHANGED
|
@@ -29,7 +29,12 @@ const require = createRequire(import.meta.url);
|
|
|
29
29
|
export const PACKAGE_VERSION = require('../package.json').version;
|
|
30
30
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
31
31
|
const resourcesDir = path.join(__dirname, "../resources");
|
|
32
|
-
|
|
32
|
+
// Index location. Defaults to the bundled `vector-db` next to the build, but can
|
|
33
|
+
// be overridden with OPENMSX_VECTORDB_DIR. LanceDB/lance-io (Rust object_store)
|
|
34
|
+
// cannot read a local index from a Windows *mapped network drive* (it drops the
|
|
35
|
+
// drive letter when converting the path to a file:// URL). When the project lives
|
|
36
|
+
// on a network share, point this at a copy of the index on a local disk.
|
|
37
|
+
const vectorDbDir = process.env.OPENMSX_VECTORDB_DIR?.trim() || path.join(__dirname, "../vector-db");
|
|
33
38
|
export const emuDirectories = {
|
|
34
39
|
OPENMSX_SHARE_DIR: '',
|
|
35
40
|
OPENMSX_EXECUTABLE: detectOpenMSXExecutable(),
|
package/dist/server_tools.js
CHANGED
|
@@ -1677,10 +1677,11 @@ The parameter scrbasename is the name of the filename (without path) to save the
|
|
|
1677
1677
|
"vector_db_query", {
|
|
1678
1678
|
title: "Vector DB query from resources",
|
|
1679
1679
|
// Description of the tool (what it does)
|
|
1680
|
-
description: `Query the
|
|
1681
|
-
|
|
1682
|
-
The
|
|
1683
|
-
|
|
1680
|
+
description: `Query the documentation index to obtain information about MSX system, cartridges, programming, and other development resources.
|
|
1681
|
+
Uses hybrid search: semantic (multilingual embeddings) combined with keyword (BM25) matching, fused with Reciprocal Rank Fusion. Good for both conceptual questions and exact terms (mnemonics, register names, BIOS calls).
|
|
1682
|
+
The query is a string; it is case-insensitive and may contain spaces.
|
|
1683
|
+
The response is the list of the top 10 matching resource chunks, including their relevance score, title, and resource URI, sorted in descending order by score.
|
|
1684
|
+
**Important Note**: The documentation resources are in english, japanese, or dutch (the embedding model is multilingual).
|
|
1684
1685
|
`,
|
|
1685
1686
|
// Schema for the tool (input validation)
|
|
1686
1687
|
inputSchema: {
|
|
@@ -1691,7 +1692,7 @@ The response is the list of the top 10 result resources that match the query, in
|
|
|
1691
1692
|
},
|
|
1692
1693
|
outputSchema: {
|
|
1693
1694
|
results: z.array(z.object({
|
|
1694
|
-
score: z.string().describe("
|
|
1695
|
+
score: z.string().describe("Reciprocal Rank Fusion (RRF) relevance score combining semantic and keyword matches; higher is better. Only the relative ranking is meaningful, not the absolute value (typically ~0.01-0.03)."),
|
|
1695
1696
|
title: z.string().describe("Title of the resource."),
|
|
1696
1697
|
uri: z.string().describe("URI of the resource, which can be used to access the resource."),
|
|
1697
1698
|
document: z.string().describe("Document chunk of the resource, retrieved from the Vector DB."),
|