@deepsweet/mdn 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +0 -2
  2. package/dist/index.js +34 -33
  3. package/package.json +6 -1
package/README.md CHANGED
@@ -55,7 +55,6 @@ The `stdio` server will spawn [llama.cpp](https://github.com/ggml-org/llama.cpp)
55
55
  | Env variable | Default value | Description |
56
56
  |----------------------------|-----------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------|
57
57
  | `MDN_DATASET_PATH` | HuggingFace cache | Custom dataset directory path |
58
- | `MDN_DATASET_LOCALE` | `en-us` | Dataset language, currently `en-us` only |
59
58
  | `MDN_MODEL_PATH` | HuggingFace cache | Custom model file path |
60
59
  | `MDN_MODEL_TTL` | `1800` | For how long llama.cpp with embedding model should be kept loaded in memory, in seconds; `0` to prevent unloading |
61
60
  | `MDN_QUERY_DESCRIPTION` | `Natural language query for hybrid vector and full-text search` | Custom search query description in case your LLM does a poor job asking the MCP tool |
@@ -64,7 +63,6 @@ The `stdio` server will spawn [llama.cpp](https://github.com/ggml-org/llama.cpp)
64
63
  ## To do
65
64
 
66
65
  - [ ] figure out a better query description so that LLM doesn't over-generate keywords
67
- - [ ] add more dataset [translations](https://github.com/mdn/translated-content/tree/main/files/)
68
66
  - [ ] automatically update and upload the dataset artifacts monthly with GitHub Actions
69
67
 
70
68
  ## License
package/dist/index.js CHANGED
@@ -1,10 +1,27 @@
1
1
  #!/usr/bin/env node
2
2
 
3
+ // src/huggingface.ts
4
+ import fs from "node:fs/promises";
5
+ import path from "path";
6
+ import {
7
+ getHFHubCachePath,
8
+ getRepoFolderName,
9
+ scanCachedRepo,
10
+ snapshotDownload
11
+ } from "@huggingface/hub";
12
+
13
+ // src/const.ts
14
+ var DATASET_REPO = "deepsweet/mdn";
15
+ var MODEL_REPO = "deepsweet/bge-m3-GGUF-Q4_K_M";
16
+ var MODEL_FILE = "bge-m3-GGUF-Q4_K_M.gguf";
17
+ var MODEL_MAX_TOKENS = 8192;
18
+ var TABLE_NAME = "mdn";
19
+ var TABLE_FILENAME = `${TABLE_NAME}.lance`;
20
+
3
21
  // src/env.ts
4
22
  import { z } from "zod";
5
23
  var env = z.object({
6
24
  MDN_DATASET_PATH: z.string().optional(),
7
- MDN_DATASET_LOCALE: z.enum(["en-us"]).default("en-us"),
8
25
  MDN_MODEL_PATH: z.string().optional(),
9
26
  MDN_MODEL_TTL: z.number().default(1800),
10
27
  MDN_QUERY_DESCRIPTION: z.string().default("Natural language query for hybrid vector and full-text search"),
@@ -12,19 +29,6 @@ var env = z.object({
12
29
  }).parse(process.env);
13
30
 
14
31
  // src/huggingface.ts
15
- import fs from "node:fs/promises";
16
- import path from "path";
17
- import { getHFHubCachePath, getRepoFolderName, scanCachedRepo, snapshotDownload } from "@huggingface/hub";
18
-
19
- // src/utils.ts
20
- var getTableName = (locale) => {
21
- return `mdn-${locale}`;
22
- };
23
-
24
- // src/huggingface.ts
25
- var DATASET_REPO = "deepsweet/mdn";
26
- var MODEL_REPO = "deepsweet/bge-m3-GGUF-Q4_K_M";
27
- var MODEL_FILE = "bge-m3-GGUF-Q4_K_M.gguf";
28
32
  var replaceSymlinksWithHardlinks = async (dir) => {
29
33
  const entries = await fs.readdir(dir, { withFileTypes: true });
30
34
  for (const entry of entries) {
@@ -57,18 +61,16 @@ var getLatestCachedRepoRevision = async (name, type) => {
57
61
  });
58
62
  return latestRevision.path;
59
63
  };
60
- var downloadDataset = async (locale) => {
61
- const tableName = getTableName(locale);
64
+ var downloadDataset = async () => {
62
65
  const dirPath = await snapshotDownload({
63
- repo: `datasets/${DATASET_REPO}`,
64
- path: `data/${tableName}.lance`
66
+ repo: `datasets/${DATASET_REPO}`
65
67
  });
66
68
  const dataPath = path.join(dirPath, "data");
67
69
  await replaceSymlinksWithHardlinks(dataPath);
68
70
  };
69
71
  var getDatasetPath = async () => {
70
- if (process.env.MDN_DATASET_PATH != null) {
71
- return process.env.MDN_DATASET_PATH;
72
+ if (env.MDN_DATASET_PATH != null) {
73
+ return env.MDN_DATASET_PATH;
72
74
  }
73
75
  const latestRevisionPath = await getLatestCachedRepoRevision(DATASET_REPO, "dataset");
74
76
  const datasetPath = path.join(latestRevisionPath, "data");
@@ -80,8 +82,8 @@ var downloadModel = async () => {
80
82
  });
81
83
  };
82
84
  var getModelPath = async () => {
83
- if (process.env.MDN_MODEL_PATH != null) {
84
- return process.env.MDN_MODEL_PATH;
85
+ if (env.MDN_MODEL_PATH != null) {
86
+ return env.MDN_MODEL_PATH;
85
87
  }
86
88
  const latestRevisionPath = await getLatestCachedRepoRevision(MODEL_REPO, "model");
87
89
  const modelPath = path.join(latestRevisionPath, MODEL_FILE);
@@ -95,15 +97,16 @@ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"
95
97
  import { z as z2 } from "zod";
96
98
 
97
99
  // src/llama.ts
100
+ import os from "node:os";
98
101
  import { getLlama, LlamaLogLevel } from "node-llama-cpp";
99
- var MAX_TOKENS = 8192;
100
102
  var getLlamaContext = async (modelPath) => {
103
+ const threads = Math.floor(os.availableParallelism() / 2);
101
104
  const llama = await getLlama({ logLevel: LlamaLogLevel.error });
102
105
  const model = await llama.loadModel({ modelPath });
103
106
  const context = await model.createEmbeddingContext({
104
- contextSize: MAX_TOKENS,
105
- batchSize: MAX_TOKENS,
106
- threads: 0
107
+ contextSize: MODEL_MAX_TOKENS,
108
+ batchSize: MODEL_MAX_TOKENS,
109
+ threads
107
110
  });
108
111
  context.onDispose.createOnceListener(() => {
109
112
  model.dispose().then(() => llama.dispose()).catch(console.error);
@@ -124,7 +127,7 @@ var vectorize = async (context, text) => {
124
127
  // src/query.ts
125
128
  var queryHybrid = async (llamaContext, table, reranker, text) => {
126
129
  const vector = await vectorize(llamaContext, text);
127
- const results = await table.query().nearestTo(vector).fullTextSearch(text).rerank(reranker).limit(env.MDN_SEARCH_RESULTS_LIMIT).toArray();
130
+ const results = await table.query().nearestTo(vector).column("vector").fullTextSearch(text, { columns: "text" }).rerank(reranker).limit(env.MDN_SEARCH_RESULTS_LIMIT).toArray();
128
131
  return results;
129
132
  };
130
133
  var createReranker = async () => {
@@ -134,7 +137,7 @@ var createReranker = async () => {
134
137
 
135
138
  // package.json
136
139
  var name = "@deepsweet/mdn";
137
- var version = "0.2.0";
140
+ var version = "0.3.1";
138
141
 
139
142
  // src/server.ts
140
143
  var startMcpServer = async () => {
@@ -142,8 +145,7 @@ var startMcpServer = async () => {
142
145
  const db = await lancedb2.connect(datasetPath);
143
146
  const reranker = await createReranker();
144
147
  const server = new McpServer({ name, version });
145
- const tableName = getTableName(env.MDN_DATASET_LOCALE);
146
- const table = await db.openTable(tableName);
148
+ const table = await db.openTable(TABLE_NAME);
147
149
  const modelPath = await getModelPath();
148
150
  const llamaTtl = env.MDN_MODEL_TTL * 1000;
149
151
  let llamaContext = null;
@@ -202,8 +204,7 @@ var startMcpServer = async () => {
202
204
  // src/index.ts
203
205
  switch (process.argv[2]) {
204
206
  case "download": {
205
- const locale = process.argv[3] ?? env.MDN_DATASET_LOCALE;
206
- await downloadDataset(locale);
207
+ await downloadDataset();
207
208
  await downloadModel();
208
209
  break;
209
210
  }
@@ -212,7 +213,7 @@ switch (process.argv[2]) {
212
213
  break;
213
214
  }
214
215
  default: {
215
- console.error('Unknown or missing command, use "download" or "server"');
216
+ console.error('Unknown command, use "download" or "server"');
216
217
  process.exit(1);
217
218
  }
218
219
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@deepsweet/mdn",
3
- "version": "0.2.0",
3
+ "version": "0.3.1",
4
4
  "publishConfig": {
5
5
  "access": "public"
6
6
  },
@@ -27,13 +27,18 @@
27
27
  "eslint": "^10.1.0",
28
28
  "gray-matter": "^4.0.3",
29
29
  "marked": "^17.0.5",
30
+ "p-all": "^5.0.1",
30
31
  "rimraf": "^6.1.3",
31
32
  "typescript": "^6.0.2"
32
33
  },
33
34
  "scripts": {
35
+ "download": "bun scripts/download.ts",
34
36
  "chunk": "bun scripts/chunk.ts",
35
37
  "ingest": "bun scripts/ingest.ts",
38
+ "update": "bun scripts/update.ts",
36
39
  "query": "bun scripts/query.ts",
40
+ "upload": "bun scripts/upload.ts",
41
+ "test": "bun scripts/test.ts",
37
42
  "check": "tsc --noEmit && eslint --cache scripts/ src/",
38
43
  "dist": "bun build --format esm --target node --packages external --outdir dist/ src/index.ts"
39
44
  },