@arabold/docs-mcp-server 1.4.5 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +137 -94
- package/dist/{chunk-BD7OFN4H.js → chunk-S7C2LRQA.js} +167 -73
- package/dist/chunk-S7C2LRQA.js.map +1 -0
- package/dist/cli.js +27 -5
- package/dist/cli.js.map +1 -1
- package/dist/server.js +6 -5
- package/dist/server.js.map +1 -1
- package/package.json +6 -2
- package/dist/chunk-BD7OFN4H.js.map +0 -1
package/README.md
CHANGED
|
@@ -9,7 +9,7 @@ A MCP server for fetching and searching 3rd party package documentation.
|
|
|
9
9
|
- 💾 **Efficient Storage:** Store data in SQLite, leveraging `sqlite-vec` for vector search and FTS5 for full-text search.
|
|
10
10
|
- 🔍 **Hybrid Search:** Combine vector and full-text search for relevant results across different library versions.
|
|
11
11
|
- ⚙️ **Job Management:** Handle scraping tasks asynchronously with a robust job queue and management tools (MCP & CLI).
|
|
12
|
-
- 🐳 **Easy Deployment:** Run the server easily using
|
|
12
|
+
- 🐳 **Easy Deployment:** Run the server easily using Docker or npx.
|
|
13
13
|
|
|
14
14
|
## Overview
|
|
15
15
|
|
|
@@ -26,104 +26,143 @@ The server exposes MCP tools for:
|
|
|
26
26
|
- Finding appropriate versions (`find_version`).
|
|
27
27
|
- Removing indexed documents (`remove_docs`).
|
|
28
28
|
|
|
29
|
-
##
|
|
29
|
+
## Configuration
|
|
30
30
|
|
|
31
|
-
|
|
31
|
+
The following environment variables are supported to configure the OpenAI API and embedding behavior:
|
|
32
32
|
|
|
33
|
-
|
|
33
|
+
- `OPENAI_API_KEY`: **Required.** Your OpenAI API key for generating embeddings.
|
|
34
|
+
- `OPENAI_ORG_ID`: **Optional.** Your OpenAI Organization ID (handled automatically by LangChain if set).
|
|
35
|
+
- `OPENAI_API_BASE`: **Optional.** Custom base URL for OpenAI API (e.g., for Azure OpenAI or compatible APIs).
|
|
36
|
+
- `DOCS_MCP_EMBEDDING_MODEL`: **Optional.** Embedding model name (defaults to "text-embedding-3-small"). Must produce vectors with ≤1536 dimensions. Smaller dimensions are automatically padded with zeros.
|
|
34
37
|
|
|
35
|
-
|
|
38
|
+
The database schema uses a fixed dimension of 1536 for embedding vectors. Models that produce larger vectors are not supported and will cause an error. Models with smaller vectors (e.g., older embedding models) are automatically padded with zeros to match the required dimension.
|
|
36
39
|
|
|
37
|
-
|
|
38
|
-
```bash
|
|
39
|
-
npm install -g @arabold/docs-mcp-server
|
|
40
|
-
```
|
|
41
|
-
2. **Run the Server:**
|
|
42
|
-
```bash
|
|
43
|
-
docs-server
|
|
44
|
-
```
|
|
45
|
-
_(Note: You'll need to manage environment variables like `OPENAI_API_KEY` yourself when running this way, e.g., by setting them in your shell profile or using a tool like `dotenv`.)_
|
|
46
|
-
3. **Run the CLI:**
|
|
47
|
-
```bash
|
|
48
|
-
docs-cli <command> [options]
|
|
49
|
-
```
|
|
50
|
-
(See "CLI Command Reference" below for available commands and options.)
|
|
40
|
+
These variables can be set regardless of how you run the server (Docker, npx, or from source).
|
|
51
41
|
|
|
52
|
-
|
|
42
|
+
## Running the MCP Server
|
|
53
43
|
|
|
54
|
-
|
|
44
|
+
There are two ways to run the docs-mcp-server:
|
|
55
45
|
|
|
56
|
-
|
|
46
|
+
### Option 1: Using Docker (Recommended)
|
|
57
47
|
|
|
58
|
-
|
|
59
|
-
2. **Run the Server (e.g., for MCP Integration):**
|
|
48
|
+
This is the recommended approach for most users. It's easy, straightforward, and doesn't require Node.js to be installed.
|
|
60
49
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
-e OPENAI_API_KEY="your-openai-api-key-here" \
|
|
64
|
-
-v docs-mcp-data:/data \
|
|
65
|
-
ghcr.io/arabold/docs-mcp-server:latest
|
|
66
|
-
```
|
|
50
|
+
1. **Ensure Docker is installed and running.**
|
|
51
|
+
2. **Configure your MCP settings:**
|
|
67
52
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
- `-e OPENAI_API_KEY="..."`: **Required.** Set your OpenAI API key.
|
|
71
|
-
- `-v docs-mcp-data:/data`: **Required for persistence.** Mounts a Docker named volume `docs-mcp-data` to the container's `/data` directory, where the database is stored. You can replace `docs-mcp-data` with a specific host path if preferred (e.g., `-v /path/on/host:/data`).
|
|
72
|
-
- `ghcr.io/arabold/docs-mcp-server:latest`: Specifies the public Docker image to use.
|
|
73
|
-
|
|
74
|
-
This is the recommended approach for integrating with tools like Claude Desktop or Cline.
|
|
75
|
-
|
|
76
|
-
**Claude/Cline Configuration Example:**
|
|
77
|
-
Add the following configuration block to your MCP settings file (adjust path as needed):
|
|
78
|
-
|
|
79
|
-
- Cline: `/Users/andrerabold/Library/Application Support/Code/User/globalStorage/saoudrizwan.claude-dev/settings/cline_mcp_settings.json`
|
|
80
|
-
- Claude Desktop (MacOS): `~/Library/Application Support/Claude/claude_desktop_config.json`
|
|
81
|
-
- Claude Desktop (Windows): `%APPDATA%/Claude/claude_desktop_config.json`
|
|
82
|
-
|
|
83
|
-
```json
|
|
84
|
-
{
|
|
85
|
-
"mcpServers": {
|
|
86
|
-
"docs-mcp-server": {
|
|
87
|
-
"command": "docker",
|
|
88
|
-
"args": [
|
|
89
|
-
"run",
|
|
90
|
-
"-i",
|
|
91
|
-
"--rm",
|
|
92
|
-
"-e",
|
|
93
|
-
"OPENAI_API_KEY",
|
|
94
|
-
"-v",
|
|
95
|
-
"docs-mcp-data:/data",
|
|
96
|
-
"ghcr.io/arabold/docs-mcp-server:latest"
|
|
97
|
-
],
|
|
98
|
-
"env": {
|
|
99
|
-
"OPENAI_API_KEY": "sk-proj-..." // Required: Replace with your key
|
|
100
|
-
},
|
|
101
|
-
"disabled": false,
|
|
102
|
-
"autoApprove": []
|
|
103
|
-
}
|
|
104
|
-
// ... other servers might be listed here
|
|
105
|
-
}
|
|
106
|
-
}
|
|
107
|
-
```
|
|
53
|
+
**Claude/Cline/Roo Configuration Example:**
|
|
54
|
+
Add the following configuration block to your MCP settings file (adjust path as needed):
|
|
108
55
|
|
|
109
|
-
|
|
56
|
+
```json
|
|
57
|
+
{
|
|
58
|
+
"mcpServers": {
|
|
59
|
+
"docs-mcp-server": {
|
|
60
|
+
"command": "docker",
|
|
61
|
+
"args": [
|
|
62
|
+
"run",
|
|
63
|
+
"-i",
|
|
64
|
+
"--rm",
|
|
65
|
+
"-e",
|
|
66
|
+
"OPENAI_API_KEY",
|
|
67
|
+
"-v",
|
|
68
|
+
"docs-mcp-data:/data",
|
|
69
|
+
"ghcr.io/arabold/docs-mcp-server:latest"
|
|
70
|
+
],
|
|
71
|
+
"env": {
|
|
72
|
+
"OPENAI_API_KEY": "sk-proj-..." // Required: Replace with your key
|
|
73
|
+
},
|
|
74
|
+
"disabled": false,
|
|
75
|
+
"autoApprove": []
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
```
|
|
110
80
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
81
|
+
Remember to replace `"sk-proj-..."` with your actual OpenAI API key and restart the application.
|
|
82
|
+
|
|
83
|
+
3. **That's it!** The server will now be available to your AI assistant.
|
|
84
|
+
|
|
85
|
+
**Docker Container Settings:**
|
|
86
|
+
|
|
87
|
+
- `-i`: Keep STDIN open, crucial for MCP communication over stdio.
|
|
88
|
+
- `--rm`: Automatically remove the container when it exits.
|
|
89
|
+
- `-e OPENAI_API_KEY`: **Required.** Set your OpenAI API key.
|
|
90
|
+
- `-v docs-mcp-data:/data`: **Required for persistence.** Mounts a Docker named volume `docs-mcp-data` to store the database. You can replace it with a specific host path if preferred (e.g., `-v /path/on/host:/data`).
|
|
91
|
+
|
|
92
|
+
Any of the configuration environment variables (see [Configuration](#configuration) above) can be passed to the container using the `-e` flag. For example:
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
docker run -i --rm \
|
|
96
|
+
-e OPENAI_API_KEY="your-key-here" \
|
|
97
|
+
-e DOCS_MCP_EMBEDDING_MODEL="text-embedding-ada-002" \
|
|
98
|
+
-e OPENAI_API_BASE="http://your-api-endpoint" \
|
|
99
|
+
-v docs-mcp-data:/data \
|
|
100
|
+
ghcr.io/arabold/docs-mcp-server:latest
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Option 2: Using npx
|
|
104
|
+
|
|
105
|
+
This approach is recommended when you need local file access (e.g., indexing documentation from your local file system). While this can also be achieved by mounting paths into a Docker container, using npx is simpler but requires a Node.js installation.
|
|
106
|
+
|
|
107
|
+
1. **Ensure Node.js is installed.**
|
|
108
|
+
2. **Configure your MCP settings:**
|
|
109
|
+
|
|
110
|
+
**Claude/Cline/Roo Configuration Example:**
|
|
111
|
+
Add the following configuration block to your MCP settings file:
|
|
112
|
+
|
|
113
|
+
```json
|
|
114
|
+
{
|
|
115
|
+
"mcpServers": {
|
|
116
|
+
"docs-mcp-server": {
|
|
117
|
+
"command": "npx",
|
|
118
|
+
"args": ["-y", "--package=@arabold/docs-mcp-server", "docs-server"],
|
|
119
|
+
"env": {
|
|
120
|
+
"OPENAI_API_KEY": "sk-proj-..." // Required: Replace with your key
|
|
121
|
+
},
|
|
122
|
+
"disabled": false,
|
|
123
|
+
"autoApprove": []
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
```
|
|
121
128
|
|
|
122
|
-
|
|
129
|
+
Remember to replace `"sk-proj-..."` with your actual OpenAI API key and restart the application.
|
|
123
130
|
|
|
124
|
-
|
|
131
|
+
3. **That's it!** The server will now be available to your AI assistant.
|
|
125
132
|
|
|
126
|
-
|
|
133
|
+
## Using the CLI
|
|
134
|
+
|
|
135
|
+
You can use the CLI to manage documentation directly, either via Docker or npx. **Important: Use the same method (Docker or npx) for both the server and CLI to ensure access to the same indexed documentation.**
|
|
136
|
+
|
|
137
|
+
### Using Docker CLI
|
|
138
|
+
|
|
139
|
+
If you're running the server with Docker, use Docker for the CLI as well:
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
docker run --rm \
|
|
143
|
+
-e OPENAI_API_KEY="your-openai-api-key-here" \
|
|
144
|
+
-v docs-mcp-data:/data \
|
|
145
|
+
ghcr.io/arabold/docs-mcp-server:latest \
|
|
146
|
+
docs-cli <command> [options]
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Make sure to use the same volume name (`docs-mcp-data` in this example) as you did for the server. Any of the configuration environment variables (see [Configuration](#configuration) above) can be passed using `-e` flags, just like with the server.
|
|
150
|
+
|
|
151
|
+
### Using npx CLI
|
|
152
|
+
|
|
153
|
+
If you're running the server with npx, use npx for the CLI as well:
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
npx -y --package=@arabold/docs-mcp-server docs-cli <command> [options]
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
The npx approach will use the default data directory on your system (typically in your home directory), ensuring consistency between server and CLI.
|
|
160
|
+
|
|
161
|
+
(See "CLI Command Reference" below for available commands and options.)
|
|
162
|
+
|
|
163
|
+
### CLI Command Reference
|
|
164
|
+
|
|
165
|
+
The `docs-cli` provides commands for managing the documentation index. Access it either via Docker (`docker run -v docs-mcp-data:/data ghcr.io/arabold/docs-mcp-server:latest docs-cli ...`) or `npx` (`npx -y --package=@arabold/docs-mcp-server docs-cli ...`).
|
|
127
166
|
|
|
128
167
|
**General Help:**
|
|
129
168
|
|
|
@@ -140,7 +179,7 @@ docs-cli scrape --help
|
|
|
140
179
|
docs-cli search --help
|
|
141
180
|
docs-cli find-version --help
|
|
142
181
|
docs-cli remove --help
|
|
143
|
-
docs-cli list
|
|
182
|
+
docs-cli list --help
|
|
144
183
|
```
|
|
145
184
|
|
|
146
185
|
### Scraping Documentation (`scrape`)
|
|
@@ -164,11 +203,8 @@ docs-cli scrape <library> <url> [options]
|
|
|
164
203
|
**Examples:**
|
|
165
204
|
|
|
166
205
|
```bash
|
|
167
|
-
# Scrape React 18.2.0 docs
|
|
206
|
+
# Scrape React 18.2.0 docs
|
|
168
207
|
docs-cli scrape react --version 18.2.0 https://react.dev/
|
|
169
|
-
|
|
170
|
-
# Scrape React docs without a specific version (using npx)
|
|
171
|
-
npx -y --package=@arabold/docs-mcp-server docs-cli scrape react https://react.dev/
|
|
172
208
|
```
|
|
173
209
|
|
|
174
210
|
### Searching Documentation (`search`)
|
|
@@ -194,9 +230,6 @@ docs-cli search <library> <query> [options]
|
|
|
194
230
|
```bash
|
|
195
231
|
# Search latest React docs for 'hooks'
|
|
196
232
|
docs-cli search react 'hooks'
|
|
197
|
-
|
|
198
|
-
# Search React 18.x docs for 'hooks' (using npx)
|
|
199
|
-
npx -y --package=@arabold/docs-mcp-server docs-cli search react --version 18.x 'hooks'
|
|
200
233
|
```
|
|
201
234
|
|
|
202
235
|
### Finding Available Versions (`find-version`)
|
|
@@ -218,12 +251,12 @@ docs-cli find-version <library> [options]
|
|
|
218
251
|
docs-cli find-version react
|
|
219
252
|
```
|
|
220
253
|
|
|
221
|
-
### Listing Libraries (`list
|
|
254
|
+
### Listing Libraries (`list`)
|
|
222
255
|
|
|
223
256
|
Lists all libraries currently indexed in the store.
|
|
224
257
|
|
|
225
258
|
```bash
|
|
226
|
-
docs-cli list
|
|
259
|
+
docs-cli list
|
|
227
260
|
```
|
|
228
261
|
|
|
229
262
|
### Removing Documentation (`remove`)
|
|
@@ -330,6 +363,16 @@ This method is useful for contributing to the project or running un-published ve
|
|
|
330
363
|
# Required: Your OpenAI API key for generating embeddings.
|
|
331
364
|
OPENAI_API_KEY=your-api-key-here
|
|
332
365
|
|
|
366
|
+
# Optional: Your OpenAI Organization ID (handled automatically by LangChain if set)
|
|
367
|
+
OPENAI_ORG_ID=
|
|
368
|
+
|
|
369
|
+
# Optional: Custom base URL for OpenAI API (e.g., for Azure OpenAI or compatible APIs)
|
|
370
|
+
OPENAI_API_BASE=
|
|
371
|
+
|
|
372
|
+
# Optional: Embedding model name (defaults to "text-embedding-3-small")
|
|
373
|
+
# Examples: text-embedding-3-small, text-embedding-ada-002 (models producing more than 1536 dimensions, such as text-embedding-3-large, are rejected)
|
|
374
|
+
DOCS_MCP_EMBEDDING_MODEL=
|
|
375
|
+
|
|
333
376
|
# Optional: Specify a custom directory to store the SQLite database file (documents.db).
|
|
334
377
|
# If set, this path takes precedence over the default locations.
|
|
335
378
|
# Default behavior (if unset):
|
|
@@ -210,8 +210,8 @@ function v4(options, buf, offset) {
|
|
|
210
210
|
}
|
|
211
211
|
var v4_default = v4;
|
|
212
212
|
|
|
213
|
-
// src/
|
|
214
|
-
import
|
|
213
|
+
// src/utils/url.ts
|
|
214
|
+
import psl from "psl";
|
|
215
215
|
|
|
216
216
|
// src/utils/errors.ts
|
|
217
217
|
var ScraperError = class extends Error {
|
|
@@ -231,8 +231,79 @@ var InvalidUrlError = class extends ScraperError {
|
|
|
231
231
|
super(`Invalid URL: ${url}`, false, cause);
|
|
232
232
|
}
|
|
233
233
|
};
|
|
234
|
+
/**
 * Error describing an HTTP redirect that was detected instead of followed.
 *
 * In this bundle it is thrown by `HttpFetcher.fetch` when redirects are
 * disabled and the response has a 3xx status with a `Location` header.
 */
var RedirectError = class extends ScraperError {
  /**
   * @param {string} originalUrl - URL that was requested.
   * @param {string} redirectUrl - Redirect target reported by the server.
   * @param {number} statusCode - HTTP status code of the redirect response.
   */
  constructor(originalUrl, redirectUrl, statusCode) {
    const message = `Redirect detected from ${originalUrl} to ${redirectUrl} (status: ${statusCode})`;
    // NOTE(review): the second ScraperError argument is passed as `false`, the
    // same way InvalidUrlError does; presumably a "retryable" flag — confirm
    // against the ScraperError constructor.
    super(message, false);
    // Keep the individual pieces so callers do not have to parse the message.
    this.originalUrl = originalUrl;
    this.redirectUrl = redirectUrl;
    this.statusCode = statusCode;
  }
};
|
|
245
|
+
|
|
246
|
+
// src/utils/url.ts
|
|
247
|
+
/** Default switches for normalizeUrl; callers may override any subset. */
var defaultNormalizerOptions = {
  ignoreCase: true,
  removeHash: true,
  removeTrailingSlash: true,
  removeQuery: false,
  removeIndex: true
};

/**
 * Produces a canonical string form of a URL so equivalent links compare equal.
 *
 * @param {string} url - Absolute URL to normalize.
 * @param {object} [options] - Partial overrides for defaultNormalizerOptions:
 *   `ignoreCase` lower-cases the whole result, `removeHash` drops the fragment,
 *   `removeTrailingSlash` strips trailing slashes from non-root paths,
 *   `removeQuery` drops the query string, `removeIndex` turns a trailing
 *   `/index.(html|htm|asp|php|jsp)` into `/`.
 * @returns {string} The normalized URL, or `url` unchanged when it cannot be
 *   parsed or has no hierarchical form (e.g. `mailto:`).
 */
function normalizeUrl(url, options = defaultNormalizerOptions) {
  try {
    const parsedUrl = new URL(url);
    const finalOptions = { ...defaultNormalizerOptions, ...options };

    // `origin` is the literal string "null" for schemes without a tuple origin.
    // Concatenating it with the path (as before) made `new URL(...)` throw, so
    // file: URLs were returned un-normalized. Rebuild the prefix for file: and
    // leave every other opaque-origin scheme untouched.
    let prefix;
    if (parsedUrl.origin !== "null") {
      prefix = parsedUrl.origin;
    } else if (parsedUrl.protocol === "file:") {
      prefix = `file://${parsedUrl.host}`;
    } else {
      return url;
    }

    // Work on a copy without query/fragment; the pathname setter keeps the
    // path encoded and never lets it become empty for these schemes.
    const normalized = new URL(parsedUrl.href);
    normalized.search = "";
    normalized.hash = "";

    if (finalOptions.removeIndex) {
      normalized.pathname = normalized.pathname.replace(
        /\/index\.(html|htm|asp|php|jsp)$/i,
        "/"
      );
    }
    // The root path "/" is kept as is; only longer paths lose trailing slashes.
    if (finalOptions.removeTrailingSlash && normalized.pathname.length > 1) {
      normalized.pathname = normalized.pathname.replace(/\/+$/, "");
    }

    const preservedSearch = finalOptions.removeQuery ? "" : parsedUrl.search;
    const preservedHash = finalOptions.removeHash ? "" : parsedUrl.hash;

    const result = `${prefix}${normalized.pathname}${preservedSearch}${preservedHash}`;
    return finalOptions.ignoreCase ? result.toLowerCase() : result;
  } catch {
    // Unparseable input is passed through untouched (deliberate best effort).
    return url;
  }
}
|
|
285
|
+
/**
 * Verifies that `url` can be parsed by the URL constructor.
 *
 * @param {string} url - Candidate URL string.
 * @returns {void}
 * @throws {InvalidUrlError} When parsing fails; the parser's error is passed
 *   along as the cause when it is an Error instance.
 */
function validateUrl(url) {
  try {
    // Constructed only for its validation side effect; the object is discarded.
    new URL(url);
  } catch (error) {
    const cause = error instanceof Error ? error : void 0;
    throw new InvalidUrlError(url, cause);
  }
}
|
|
292
|
+
/**
 * Tells whether two URLs point at exactly the same host name, ignoring case.
 *
 * @param {URL} urlA - First URL.
 * @param {URL} urlB - Second URL.
 * @returns {boolean} True when both `hostname` values match case-insensitively.
 */
function hasSameHostname(urlA, urlB) {
  const hostA = urlA.hostname.toLowerCase();
  const hostB = urlB.hostname.toLowerCase();
  return hostA === hostB;
}
|
|
295
|
+
/**
 * Tells whether two URLs share the same domain as resolved by `psl.get`
 * (so different subdomains of one domain are expected to match).
 *
 * @param {URL} urlA - First URL.
 * @param {URL} urlB - Second URL.
 * @returns {boolean} True when both hosts resolve to the same non-null domain.
 */
function hasSameDomain(urlA, urlB) {
  const domainA = psl.get(urlA.hostname.toLowerCase());
  const domainB = psl.get(urlB.hostname.toLowerCase());
  // A null lookup result must never count as a match, even against another null.
  return domainA !== null && domainA === domainB;
}
|
|
300
|
+
/**
 * Tells whether `targetUrl`'s path lies strictly below `baseUrl`'s path.
 * Only the `pathname` values are compared; hosts are not checked here.
 *
 * @param {URL} baseUrl - URL whose path is treated as the parent directory.
 * @param {URL} targetUrl - URL being tested.
 * @returns {boolean} True when the target path starts with the base path
 *   followed by "/". A target path identical to a base path that has no
 *   trailing slash is therefore not a subpath.
 */
function isSubpath(baseUrl, targetUrl) {
  const { pathname } = baseUrl;
  // Force a trailing slash so "/docs" does not match "/docs-old/...".
  const basePath = pathname.endsWith("/") ? pathname : `${pathname}/`;
  return targetUrl.pathname.startsWith(basePath);
}
|
|
234
304
|
|
|
235
305
|
// src/scraper/fetcher/HttpFetcher.ts
|
|
306
|
+
import axios from "axios";
|
|
236
307
|
var HttpFetcher = class {
|
|
237
308
|
MAX_RETRIES = 6;
|
|
238
309
|
BASE_DELAY = 1e3;
|
|
@@ -246,16 +317,20 @@ var HttpFetcher = class {
|
|
|
246
317
|
async fetch(source, options) {
|
|
247
318
|
const maxRetries = options?.maxRetries ?? this.MAX_RETRIES;
|
|
248
319
|
const baseDelay = options?.retryDelay ?? this.BASE_DELAY;
|
|
320
|
+
const followRedirects = options?.followRedirects ?? true;
|
|
249
321
|
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
250
322
|
try {
|
|
251
|
-
const
|
|
323
|
+
const config = {
|
|
252
324
|
responseType: "arraybuffer",
|
|
253
325
|
// For handling both text and binary
|
|
254
326
|
headers: options?.headers,
|
|
255
327
|
timeout: options?.timeout,
|
|
256
|
-
signal: options?.signal
|
|
328
|
+
signal: options?.signal,
|
|
257
329
|
// Pass signal to axios
|
|
258
|
-
|
|
330
|
+
// Axios follows redirects by default, we need to explicitly disable it if needed
|
|
331
|
+
maxRedirects: followRedirects ? 5 : 0
|
|
332
|
+
};
|
|
333
|
+
const response = await axios.get(source, config);
|
|
259
334
|
return {
|
|
260
335
|
content: response.data,
|
|
261
336
|
mimeType: response.headers["content-type"] || "application/octet-stream",
|
|
@@ -266,6 +341,12 @@ var HttpFetcher = class {
|
|
|
266
341
|
const axiosError = error;
|
|
267
342
|
const status = axiosError.response?.status;
|
|
268
343
|
const code = axiosError.code;
|
|
344
|
+
if (!followRedirects && status && status >= 300 && status < 400) {
|
|
345
|
+
const location = axiosError.response?.headers?.location;
|
|
346
|
+
if (location) {
|
|
347
|
+
throw new RedirectError(source, location, status);
|
|
348
|
+
}
|
|
349
|
+
}
|
|
269
350
|
if (attempt < maxRetries && (status === void 0 || status >= 500 && status < 600)) {
|
|
270
351
|
const delay = baseDelay * 2 ** attempt;
|
|
271
352
|
logger.warn(
|
|
@@ -355,53 +436,6 @@ var CancellationError = class extends PipelineError {
|
|
|
355
436
|
}
|
|
356
437
|
};
|
|
357
438
|
|
|
358
|
-
// src/utils/url.ts
|
|
359
|
-
var defaultNormalizerOptions = {
|
|
360
|
-
ignoreCase: true,
|
|
361
|
-
removeHash: true,
|
|
362
|
-
removeTrailingSlash: true,
|
|
363
|
-
removeQuery: false,
|
|
364
|
-
removeIndex: true
|
|
365
|
-
};
|
|
366
|
-
function normalizeUrl(url, options = defaultNormalizerOptions) {
|
|
367
|
-
try {
|
|
368
|
-
const parsedUrl = new URL(url);
|
|
369
|
-
const finalOptions = { ...defaultNormalizerOptions, ...options };
|
|
370
|
-
const normalized = new URL(parsedUrl.origin + parsedUrl.pathname);
|
|
371
|
-
if (finalOptions.removeIndex) {
|
|
372
|
-
normalized.pathname = normalized.pathname.replace(
|
|
373
|
-
/\/index\.(html|htm|asp|php|jsp)$/i,
|
|
374
|
-
"/"
|
|
375
|
-
);
|
|
376
|
-
}
|
|
377
|
-
if (finalOptions.removeTrailingSlash && normalized.pathname.length > 1) {
|
|
378
|
-
normalized.pathname = normalized.pathname.replace(/\/+$/, "");
|
|
379
|
-
}
|
|
380
|
-
const preservedHash = !finalOptions.removeHash ? parsedUrl.hash : "";
|
|
381
|
-
const preservedSearch = !finalOptions.removeQuery ? parsedUrl.search : "";
|
|
382
|
-
let result = normalized.origin + normalized.pathname;
|
|
383
|
-
if (preservedSearch) {
|
|
384
|
-
result += preservedSearch;
|
|
385
|
-
}
|
|
386
|
-
if (preservedHash) {
|
|
387
|
-
result += preservedHash;
|
|
388
|
-
}
|
|
389
|
-
if (finalOptions.ignoreCase) {
|
|
390
|
-
result = result.toLowerCase();
|
|
391
|
-
}
|
|
392
|
-
return result;
|
|
393
|
-
} catch {
|
|
394
|
-
return url;
|
|
395
|
-
}
|
|
396
|
-
}
|
|
397
|
-
function validateUrl(url) {
|
|
398
|
-
try {
|
|
399
|
-
new URL(url);
|
|
400
|
-
} catch (error) {
|
|
401
|
-
throw new InvalidUrlError(url, error instanceof Error ? error : void 0);
|
|
402
|
-
}
|
|
403
|
-
}
|
|
404
|
-
|
|
405
439
|
// src/scraper/processor/HtmlProcessor.ts
|
|
406
440
|
import createDOMPurify from "dompurify";
|
|
407
441
|
import { JSDOM } from "jsdom";
|
|
@@ -736,11 +770,18 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
|
|
|
736
770
|
return false;
|
|
737
771
|
}
|
|
738
772
|
}
|
|
739
|
-
|
|
773
|
+
/**
|
|
774
|
+
* Determines if a target URL should be followed based on the scope setting.
|
|
775
|
+
*/
|
|
776
|
+
isInScope(baseUrl, targetUrl, scope) {
|
|
740
777
|
try {
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
778
|
+
if (scope === "domain") {
|
|
779
|
+
return hasSameDomain(baseUrl, targetUrl);
|
|
780
|
+
}
|
|
781
|
+
if (scope === "hostname") {
|
|
782
|
+
return hasSameHostname(baseUrl, targetUrl);
|
|
783
|
+
}
|
|
784
|
+
return hasSameHostname(baseUrl, targetUrl) && isSubpath(baseUrl, targetUrl);
|
|
744
785
|
} catch {
|
|
745
786
|
return false;
|
|
746
787
|
}
|
|
@@ -748,17 +789,19 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
|
|
|
748
789
|
async processItem(item, options, _progressCallback, signal) {
|
|
749
790
|
const { url } = item;
|
|
750
791
|
try {
|
|
751
|
-
const
|
|
792
|
+
const fetchOptions = {
|
|
793
|
+
signal,
|
|
794
|
+
followRedirects: options.followRedirects
|
|
795
|
+
};
|
|
796
|
+
const rawContent = await this.httpFetcher.fetch(url, fetchOptions);
|
|
752
797
|
const processor = this.getProcessor(rawContent.mimeType);
|
|
753
798
|
const result = await processor.process(rawContent);
|
|
754
799
|
const baseUrl = new URL(options.url);
|
|
755
800
|
const links = result.links.filter((link) => {
|
|
756
801
|
try {
|
|
757
802
|
const targetUrl = new URL(link, baseUrl);
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
}
|
|
761
|
-
return (!options.subpagesOnly || this.isSubpage(baseUrl, targetUrl)) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
|
|
803
|
+
const scope = options.scope || "subpages";
|
|
804
|
+
return this.isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
|
|
762
805
|
} catch {
|
|
763
806
|
return false;
|
|
764
807
|
}
|
|
@@ -1460,7 +1503,8 @@ var ScrapeTool = class {
|
|
|
1460
1503
|
url,
|
|
1461
1504
|
library,
|
|
1462
1505
|
version: internalVersion,
|
|
1463
|
-
|
|
1506
|
+
scope: scraperOptions?.scope ?? "subpages",
|
|
1507
|
+
followRedirects: scraperOptions?.followRedirects ?? true,
|
|
1464
1508
|
maxPages: scraperOptions?.maxPages ?? 100,
|
|
1465
1509
|
maxDepth: scraperOptions?.maxDepth ?? 3,
|
|
1466
1510
|
// maxConcurrency is handled by the manager itself now
|
|
@@ -10746,6 +10790,16 @@ var StoreError = class extends Error {
|
|
|
10746
10790
|
}
|
|
10747
10791
|
}
|
|
10748
10792
|
};
|
|
10793
|
+
/**
 * Error raised when the configured embedding model produces vectors larger
 * than the database's fixed vector dimension.
 */
var DimensionError = class extends StoreError {
  /**
   * @param {string} modelName - Name of the embedding model in use.
   * @param {number} modelDimension - Vector length the model produced.
   * @param {number} dbDimension - Fixed vector length of the database.
   */
  constructor(modelName, modelDimension, dbDimension) {
    const message = `Model "${modelName}" produces ${modelDimension}-dimensional vectors, which exceeds the database's fixed dimension of ${dbDimension}. Please use a model with dimension \u2264 ${dbDimension}.`;
    super(message);
    // Exposed so callers can report or act on the mismatch programmatically.
    this.modelName = modelName;
    this.modelDimension = modelDimension;
    this.dbDimension = dbDimension;
  }
};
|
|
10749
10803
|
var ConnectionError = class extends StoreError {
|
|
10750
10804
|
};
|
|
10751
10805
|
|
|
@@ -10819,6 +10873,9 @@ function mapDbDocumentToDocument(doc) {
|
|
|
10819
10873
|
var DocumentStore = class {
|
|
10820
10874
|
db;
|
|
10821
10875
|
embeddings;
|
|
10876
|
+
dbDimension = 1536;
|
|
10877
|
+
// Fixed dimension from schema.ts
|
|
10878
|
+
modelDimension;
|
|
10822
10879
|
statements;
|
|
10823
10880
|
/**
|
|
10824
10881
|
* Calculates Reciprocal Rank Fusion score for a result
|
|
@@ -10927,14 +10984,46 @@ var DocumentStore = class {
|
|
|
10927
10984
|
this.statements = statements;
|
|
10928
10985
|
}
|
|
10929
10986
|
/**
|
|
10930
|
-
*
|
|
10987
|
+
* Pads a vector to the fixed database dimension by appending zeros.
|
|
10988
|
+
* Throws an error if the input vector is longer than the database dimension.
|
|
10989
|
+
*/
|
|
10990
|
+
padVector(vector) {
|
|
10991
|
+
if (vector.length > this.dbDimension) {
|
|
10992
|
+
throw new Error(
|
|
10993
|
+
`Vector dimension ${vector.length} exceeds database dimension ${this.dbDimension}`
|
|
10994
|
+
);
|
|
10995
|
+
}
|
|
10996
|
+
if (vector.length === this.dbDimension) {
|
|
10997
|
+
return vector;
|
|
10998
|
+
}
|
|
10999
|
+
return [...vector, ...new Array(this.dbDimension - vector.length).fill(0)];
|
|
11000
|
+
}
|
|
11001
|
+
/**
|
|
11002
|
+
* Initializes embeddings client using environment variables for configuration.
|
|
11003
|
+
*
|
|
11004
|
+
* Supports:
|
|
11005
|
+
* - OPENAI_API_KEY (handled automatically by LangChain)
|
|
11006
|
+
* - OPENAI_ORG_ID (handled automatically by LangChain)
|
|
11007
|
+
* - DOCS_MCP_EMBEDDING_MODEL (optional, defaults to "text-embedding-3-small")
|
|
11008
|
+
* - OPENAI_API_BASE (optional)
|
|
10931
11009
|
*/
|
|
10932
|
-
initializeEmbeddings() {
|
|
10933
|
-
|
|
10934
|
-
|
|
11010
|
+
async initializeEmbeddings() {
|
|
11011
|
+
const modelName = process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
|
|
11012
|
+
const baseURL = process.env.OPENAI_API_BASE;
|
|
11013
|
+
const config = {
|
|
10935
11014
|
stripNewLines: true,
|
|
10936
|
-
batchSize: 512
|
|
10937
|
-
|
|
11015
|
+
batchSize: 512,
|
|
11016
|
+
modelName
|
|
11017
|
+
};
|
|
11018
|
+
if (baseURL) {
|
|
11019
|
+
config.configuration = { baseURL };
|
|
11020
|
+
}
|
|
11021
|
+
this.embeddings = new OpenAIEmbeddings(config);
|
|
11022
|
+
const testVector = await this.embeddings.embedQuery("test");
|
|
11023
|
+
this.modelDimension = testVector.length;
|
|
11024
|
+
if (this.modelDimension > this.dbDimension) {
|
|
11025
|
+
throw new DimensionError(modelName, this.modelDimension, this.dbDimension);
|
|
11026
|
+
}
|
|
10938
11027
|
}
|
|
10939
11028
|
/**
|
|
10940
11029
|
* Escapes a query string for use with SQLite FTS5 MATCH operator.
|
|
@@ -10952,8 +11041,11 @@ var DocumentStore = class {
|
|
|
10952
11041
|
sqliteVec.load(this.db);
|
|
10953
11042
|
this.db.exec(createTablesSQL);
|
|
10954
11043
|
this.prepareStatements();
|
|
10955
|
-
this.initializeEmbeddings();
|
|
11044
|
+
await this.initializeEmbeddings();
|
|
10956
11045
|
} catch (error) {
|
|
11046
|
+
if (error instanceof StoreError) {
|
|
11047
|
+
throw error;
|
|
11048
|
+
}
|
|
10957
11049
|
throw new ConnectionError("Failed to initialize database connection", error);
|
|
10958
11050
|
}
|
|
10959
11051
|
}
|
|
@@ -11021,7 +11113,8 @@ var DocumentStore = class {
|
|
|
11021
11113
|
`;
|
|
11022
11114
|
return `${header}${doc.pageContent}`;
|
|
11023
11115
|
});
|
|
11024
|
-
const
|
|
11116
|
+
const rawEmbeddings = await this.embeddings.embedDocuments(texts);
|
|
11117
|
+
const paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
|
|
11025
11118
|
const transaction = this.db.transaction((docs) => {
|
|
11026
11119
|
for (let i = 0; i < docs.length; i++) {
|
|
11027
11120
|
const doc = docs[i];
|
|
@@ -11042,7 +11135,7 @@ var DocumentStore = class {
|
|
|
11042
11135
|
BigInt(rowId),
|
|
11043
11136
|
library.toLowerCase(),
|
|
11044
11137
|
version.toLowerCase(),
|
|
11045
|
-
JSON.stringify(
|
|
11138
|
+
JSON.stringify(paddedEmbeddings[i])
|
|
11046
11139
|
);
|
|
11047
11140
|
}
|
|
11048
11141
|
});
|
|
@@ -11088,7 +11181,8 @@ var DocumentStore = class {
|
|
|
11088
11181
|
*/
|
|
11089
11182
|
async findByContent(library, version, query, limit) {
|
|
11090
11183
|
try {
|
|
11091
|
-
const
|
|
11184
|
+
const rawEmbedding = await this.embeddings.embedQuery(query);
|
|
11185
|
+
const embedding = this.padVector(rawEmbedding);
|
|
11092
11186
|
const ftsQuery = this.escapeFtsQuery(query);
|
|
11093
11187
|
const stmt = this.db.prepare(`
|
|
11094
11188
|
WITH vec_scores AS (
|
|
@@ -11524,4 +11618,4 @@ export {
|
|
|
11524
11618
|
RemoveTool,
|
|
11525
11619
|
DocumentManagementService
|
|
11526
11620
|
};
|
|
11527
|
-
//# sourceMappingURL=chunk-
|
|
11621
|
+
//# sourceMappingURL=chunk-S7C2LRQA.js.map
|