@arabold/docs-mcp-server 1.21.1 → 1.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/index.js +3571 -849
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/DocumentManagementClient-CAFdDwTu.js +0 -57
- package/dist/DocumentManagementClient-CAFdDwTu.js.map +0 -1
- package/dist/DocumentManagementService-C1xAzouZ.js +0 -2026
- package/dist/DocumentManagementService-C1xAzouZ.js.map +0 -1
- package/dist/EmbeddingFactory-CElwVk3X.js +0 -176
- package/dist/EmbeddingFactory-CElwVk3X.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,5 +1,10 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import "dotenv/config";
|
|
3
|
+
import { BedrockEmbeddings } from "@langchain/aws";
|
|
4
|
+
import { GoogleGenerativeAIEmbeddings } from "@langchain/google-genai";
|
|
5
|
+
import { VertexAIEmbeddings } from "@langchain/google-vertexai";
|
|
6
|
+
import { AzureOpenAIEmbeddings, OpenAIEmbeddings } from "@langchain/openai";
|
|
7
|
+
import { Embeddings } from "@langchain/core/embeddings";
|
|
3
8
|
import { PostHog } from "posthog-node";
|
|
4
9
|
import crypto, { randomUUID } from "node:crypto";
|
|
5
10
|
import fs, { readFileSync, existsSync } from "node:fs";
|
|
@@ -40,14 +45,277 @@ import DOMPurify from "dompurify";
|
|
|
40
45
|
import { fileURLToPath, URL as URL$1 } from "node:url";
|
|
41
46
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
42
47
|
import { createTRPCProxyClient, httpBatchLink } from "@trpc/client";
|
|
43
|
-
import "fuse.js";
|
|
44
|
-
import "langchain/text_splitter";
|
|
45
|
-
import "better-sqlite3";
|
|
46
|
-
import "sqlite-vec";
|
|
48
|
+
import Fuse from "fuse.js";
|
|
49
|
+
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
|
|
50
|
+
import Database from "better-sqlite3";
|
|
51
|
+
import * as sqliteVec from "sqlite-vec";
|
|
47
52
|
import { execSync } from "node:child_process";
|
|
48
53
|
import { v4 } from "uuid";
|
|
49
54
|
import psl from "psl";
|
|
50
55
|
import { minimatch } from "minimatch";
|
|
56
|
+
// Target vector length handed to FixedDimensionEmbeddings (see the "gemini"
// provider in createEmbeddingModel), which truncates or zero-pads embeddings
// to exactly this many components.
const VECTOR_DIMENSION = 1536;
|
|
57
|
+
/**
 * Converts a raw database row into the document shape used by callers.
 *
 * @param {{id: *, content: *, metadata: string}} doc - Row whose `metadata`
 *   field holds a JSON-encoded string.
 * @returns {{id: *, pageContent: *, metadata: *}} Document with parsed metadata.
 * @throws {SyntaxError} If `doc.metadata` is not valid JSON.
 */
function mapDbDocumentToDocument(doc) {
  const { id, content, metadata } = doc;
  const parsedMetadata = JSON.parse(metadata);
  return { id, pageContent: content, metadata: parsedMetadata };
}
|
|
64
|
+
/**
 * Lifecycle states of an indexed library version.
 * Keys are the symbolic names; values are the strings used at runtime.
 * Declared with `var` to keep the hoisting behaviour of the original
 * enum-style declaration.
 */
var VersionStatus = {
  NOT_INDEXED: "not_indexed",
  QUEUED: "queued",
  RUNNING: "running",
  COMPLETED: "completed",
  FAILED: "failed",
  CANCELLED: "cancelled",
  UPDATING: "updating"
};
|
|
74
|
+
/**
 * Maps a missing version name to the empty string.
 *
 * @param {string|null|undefined} name - Version name as supplied by the caller.
 * @returns {string} `""` when `name` is null or undefined, otherwise `name` unchanged.
 */
function normalizeVersionName(name) {
  if (name === null || name === undefined) {
    return "";
  }
  return name;
}
|
|
77
|
+
/**
 * Counterpart of normalizeVersionName. The empty string stays the empty
 * string and every other value is returned as-is, so this is an identity
 * mapping.
 *
 * @param {string} name - Stored version name.
 * @returns {string} The same value that was passed in.
 */
function denormalizeVersionName(name) {
  return name;
}
|
|
80
|
+
/**
 * Returns a human-readable description for a version status value.
 *
 * @param {string} status - One of the VersionStatus string values.
 * @returns {string} The matching description, or "Unknown status" for any
 *   value that is not a known status.
 */
function getStatusDescription(status) {
  const descriptions = {
    not_indexed: "Version created but not yet indexed",
    queued: "Waiting in queue for indexing",
    running: "Currently being indexed",
    completed: "Successfully indexed",
    failed: "Indexing failed",
    cancelled: "Indexing was cancelled",
    updating: "Re-indexing in progress"
  };
  // Only own keys count: a bare `descriptions[status]` would also resolve
  // inherited members such as "constructor" or "toString" and return a
  // function instead of the fallback text.
  return Object.hasOwn(descriptions, status) ? descriptions[status] : "Unknown status";
}
|
|
113
|
+
/**
 * Tells whether a version status represents work that is pending or in
 * progress (queued, running or updating).
 *
 * @param {string} status - A VersionStatus string value.
 * @returns {boolean} `true` for "queued", "running" and "updating"; `false` otherwise.
 */
function isActiveStatus(status) {
  switch (status) {
    case "queued":
    case "running":
    case "updating":
      return true;
    default:
      return false;
  }
}
|
|
123
|
+
/**
 * Base error for store failures.
 *
 * When a truthy `cause` is given, its string form is appended to the message
 * and its stack trace replaces this error's own stack, so logs point at the
 * place where the underlying failure happened.
 */
class StoreError extends Error {
  /**
   * @param {string} message - Description of the failure.
   * @param {*} [cause] - Underlying error or value that triggered it.
   */
  constructor(message, cause) {
    const text = cause ? `${message} caused by ${cause}` : message;
    super(text);
    this.cause = cause;
    // Subclasses report their own class name without overriding anything.
    this.name = this.constructor.name;
    if (!cause) {
      return;
    }
    // Non-Error causes are wrapped so that a stack can still be borrowed.
    const stackSource = cause instanceof Error ? cause : new Error(String(cause));
    if (stackSource.stack) {
      this.stack = stackSource.stack;
    }
  }
}
|
|
134
|
+
/**
 * Raised when an embedding model yields vectors larger than the fixed
 * dimension the database supports.
 */
class DimensionError extends StoreError {
  /**
   * @param {string} modelName - Identifier of the offending model.
   * @param {number} modelDimension - Vector length the model produced.
   * @param {number} dbDimension - Fixed vector length of the database.
   */
  constructor(modelName, modelDimension, dbDimension) {
    const explanation = `Model "${modelName}" produces ${modelDimension}-dimensional vectors, which exceeds the database's fixed dimension of ${dbDimension}. Please use a model with dimension ≤ ${dbDimension}.`;
    super(explanation);
    Object.assign(this, { modelName, modelDimension, dbDimension });
  }
}
|
|
144
|
+
/**
 * StoreError subtype with no behaviour of its own; it exists so callers can
 * tell this failure category apart with `instanceof`.
 */
class ConnectionError extends StoreError {}
|
|
146
|
+
/**
 * Embeddings wrapper that forces every vector produced by the wrapped model
 * to a fixed length: shorter vectors are zero-padded, longer ones are either
 * truncated (when allowed) or rejected.
 */
class FixedDimensionEmbeddings extends Embeddings {
  provider;
  model;

  /**
   * @param {object} embeddings - Wrapped model exposing `embedQuery` and `embedDocuments`.
   * @param {number} targetDimension - Length every returned vector must have.
   * @param {string} providerAndModel - "provider:model" or just "model"
   *   (the provider then defaults to "openai").
   * @param {boolean} [allowTruncate=false] - Whether oversized vectors may be
   *   cut down to `targetDimension` instead of raising an error.
   */
  constructor(embeddings, targetDimension, providerAndModel, allowTruncate = false) {
    super({});
    this.embeddings = embeddings;
    this.targetDimension = targetDimension;
    this.allowTruncate = allowTruncate;
    // Only the first ":" separates provider from model. Model ids may contain
    // further colons, so the remainder is re-joined, matching how
    // createEmbeddingModel parses the same string.
    const [providerOrModel, ...modelNameParts] = providerAndModel.split(":");
    const modelName = modelNameParts.join(":");
    this.provider = modelName ? providerOrModel : "openai";
    this.model = modelName || providerOrModel;
  }

  /**
   * Normalize a vector to the target dimension by truncating (for MRL models) or padding.
   *
   * @param {number[]} vector - Vector returned by the wrapped model.
   * @returns {number[]} Vector of exactly `targetDimension` components.
   * @throws {DimensionError} If vector is too large and truncation is not allowed.
   */
  normalizeVector(vector) {
    const dimension = vector.length;
    if (dimension > this.targetDimension) {
      if (this.allowTruncate) {
        return vector.slice(0, this.targetDimension);
      }
      throw new DimensionError(
        `${this.provider}:${this.model}`,
        dimension,
        this.targetDimension
      );
    }
    if (dimension < this.targetDimension) {
      // Zero-pad up to the required length.
      return [...vector, ...new Array(this.targetDimension - dimension).fill(0)];
    }
    return vector;
  }

  /**
   * @param {string} text - Query text to embed.
   * @returns {Promise<number[]>} Normalized embedding of the query.
   */
  async embedQuery(text) {
    const vector = await this.embeddings.embedQuery(text);
    return this.normalizeVector(vector);
  }

  /**
   * @param {string[]} documents - Texts to embed.
   * @returns {Promise<number[][]>} One normalized embedding per input text.
   */
  async embedDocuments(documents) {
    const vectors = await this.embeddings.embedDocuments(documents);
    return vectors.map((vector) => this.normalizeVector(vector));
  }
}
|
|
188
|
+
/**
 * Raised when the provider prefix of an embedding model spec is not one of
 * the providers handled by createEmbeddingModel.
 */
class UnsupportedProviderError extends Error {
  /**
   * @param {string} provider - The unrecognised provider name.
   */
  constructor(provider) {
    const lines = [
      `❌ Unsupported embedding provider: ${provider}`,
      "Supported providers: openai, vertex, gemini, aws, microsoft",
      "See README.md for configuration options or run with --help for more details."
    ];
    super(lines.join("\n"));
    this.name = "UnsupportedProviderError";
  }
}
|
|
198
|
+
/**
 * Raised when the environment lacks the settings an embedding provider needs
 * (API keys, regions, deployment names, ...).
 */
class ModelConfigurationError extends Error {
  // Fixed error name, set as an own property on every instance.
  name = "ModelConfigurationError";

  /**
   * @param {string} message - Explanation of what is missing.
   */
  constructor(message) {
    super(message);
  }
}
|
|
204
|
+
/**
 * Builds an embedding model instance from a "provider:model" specification.
 *
 * A spec without a provider prefix (no ":") is treated as an OpenAI model
 * name. Only the first ":" separates provider from model; anything after it,
 * including further colons, belongs to the model id.
 *
 * Required environment variables per provider:
 * - openai: OPENAI_API_KEY (optional OPENAI_API_BASE overrides the base URL)
 * - vertex: GOOGLE_APPLICATION_CREDENTIALS
 * - gemini: GOOGLE_API_KEY
 * - aws: BEDROCK_AWS_REGION or AWS_REGION, plus either AWS_PROFILE or both
 *   AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY (AWS_SESSION_TOKEN optional)
 * - microsoft: AZURE_OPENAI_API_KEY, AZURE_OPENAI_API_INSTANCE_NAME,
 *   AZURE_OPENAI_API_DEPLOYMENT_NAME, AZURE_OPENAI_API_VERSION
 *
 * @param {string} providerAndModel - e.g. "openai:text-embedding-3-small",
 *   "aws:amazon.titan-embed-text-v1" or just "text-embedding-3-small".
 * @returns {object} A configured embeddings instance for the chosen provider.
 * @throws {ModelConfigurationError} If required environment variables are missing.
 * @throws {UnsupportedProviderError} If the provider is not recognised.
 */
function createEmbeddingModel(providerAndModel) {
  const [providerOrModel, ...modelNameParts] = providerAndModel.split(":");
  const modelName = modelNameParts.join(":");
  const provider = modelName ? providerOrModel : "openai";
  const model = modelName || providerOrModel;
  const baseConfig = { stripNewLines: true };
  switch (provider) {
    case "openai": {
      if (!process.env.OPENAI_API_KEY) {
        throw new ModelConfigurationError(
          "❌ Missing API key for embedding provider\n Please set OPENAI_API_KEY or configure an alternative embedding model.\n See README.md for configuration options or run with --help for more details."
        );
      }
      const config = {
        ...baseConfig,
        modelName: model,
        batchSize: 512
        // OpenAI supports large batches
      };
      const baseURL = process.env.OPENAI_API_BASE;
      if (baseURL) {
        config.configuration = { baseURL };
      }
      return new OpenAIEmbeddings(config);
    }
    case "vertex": {
      if (!process.env.GOOGLE_APPLICATION_CREDENTIALS) {
        throw new ModelConfigurationError(
          "❌ Missing credentials for Google Cloud Vertex AI\n Please set GOOGLE_APPLICATION_CREDENTIALS or configure an alternative embedding model.\n See README.md for configuration options or run with --help for more details."
        );
      }
      return new VertexAIEmbeddings({
        ...baseConfig,
        model
        // e.g., "text-embedding-004"
      });
    }
    case "gemini": {
      if (!process.env.GOOGLE_API_KEY) {
        throw new ModelConfigurationError(
          "❌ Missing API key for Google AI (Gemini)\n Please set GOOGLE_API_KEY or configure an alternative embedding model.\n See README.md for configuration options or run with --help for more details."
        );
      }
      const baseEmbeddings = new GoogleGenerativeAIEmbeddings({
        ...baseConfig,
        apiKey: process.env.GOOGLE_API_KEY,
        model
        // e.g., "gemini-embedding-exp-03-07"
      });
      // The wrapper is created with truncation enabled, so oversized vectors
      // are cut down to VECTOR_DIMENSION instead of being rejected.
      return new FixedDimensionEmbeddings(
        baseEmbeddings,
        VECTOR_DIMENSION,
        providerAndModel,
        true
      );
    }
    case "aws": {
      const region = process.env.BEDROCK_AWS_REGION || process.env.AWS_REGION;
      if (!region) {
        throw new ModelConfigurationError(
          "BEDROCK_AWS_REGION or AWS_REGION environment variable is required for AWS Bedrock"
        );
      }
      const accessKeyId = process.env.AWS_ACCESS_KEY_ID;
      const secretAccessKey = process.env.AWS_SECRET_ACCESS_KEY;
      // Static keys only count when BOTH halves are present; a lone key id or
      // secret is as unusable as none at all, so it must not pass validation.
      const hasStaticKeys = Boolean(accessKeyId && secretAccessKey);
      if (!process.env.AWS_PROFILE && !hasStaticKeys) {
        throw new ModelConfigurationError(
          "Either AWS_PROFILE or both AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables are required for AWS Bedrock"
        );
      }
      const credentials = hasStaticKeys ? {
        accessKeyId,
        secretAccessKey,
        sessionToken: process.env.AWS_SESSION_TOKEN
      } : void 0;
      return new BedrockEmbeddings({
        ...baseConfig,
        model,
        // e.g., "amazon.titan-embed-text-v1"
        region,
        ...credentials ? { credentials } : {}
      });
    }
    case "microsoft": {
      // Checked in this order so the first missing variable is the one reported.
      const requiredVariables = [
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_API_INSTANCE_NAME",
        "AZURE_OPENAI_API_DEPLOYMENT_NAME",
        "AZURE_OPENAI_API_VERSION"
      ];
      for (const variable of requiredVariables) {
        if (!process.env[variable]) {
          throw new ModelConfigurationError(
            `${variable} environment variable is required for Azure OpenAI`
          );
        }
      }
      return new AzureOpenAIEmbeddings({
        ...baseConfig,
        azureOpenAIApiKey: process.env.AZURE_OPENAI_API_KEY,
        azureOpenAIApiInstanceName: process.env.AZURE_OPENAI_API_INSTANCE_NAME,
        azureOpenAIApiDeploymentName: process.env.AZURE_OPENAI_API_DEPLOYMENT_NAME,
        azureOpenAIApiVersion: process.env.AZURE_OPENAI_API_VERSION,
        deploymentName: model
      });
    }
    default:
      throw new UnsupportedProviderError(provider);
  }
}
|
|
51
319
|
const LogLevel = {
|
|
52
320
|
ERROR: 0,
|
|
53
321
|
WARN: 1,
|
|
@@ -106,6 +374,41 @@ const logger = {
|
|
|
106
374
|
}
|
|
107
375
|
}
|
|
108
376
|
};
|
|
377
|
+
/**
 * Rewrites a camelCase identifier in snake_case: every ASCII capital letter
 * becomes an underscore followed by its lowercase form. All other characters
 * are copied through untouched.
 *
 * @param {string} str - Identifier to convert.
 * @returns {string} The snake_case form, e.g. "durationMs" -> "duration_ms".
 */
function camelToSnakeCase(str) {
  let snake = "";
  for (const char of str) {
    const isAsciiUpper = char >= "A" && char <= "Z";
    snake += isAsciiUpper ? `_${char.toLowerCase()}` : char;
  }
  return snake;
}
|
|
380
|
+
/**
 * Returns a copy of `obj` whose keys are converted to snake_case, descending
 * into nested objects and arrays. Date instances and primitive values are
 * passed through unchanged.
 *
 * @param {object} obj - Object with camelCase keys.
 * @returns {object} New object with snake_case keys at every nesting level.
 */
function convertPropertiesToSnakeCase(obj) {
  // Arrays are mapped element by element at every depth. Treating an array
  // like a plain object would turn it into an index-keyed object.
  const convertValue = (value) => {
    if (Array.isArray(value)) {
      return value.map(convertValue);
    }
    if (value && typeof value === "object" && !(value instanceof Date)) {
      return convertPropertiesToSnakeCase(value);
    }
    return value;
  };
  const result = {};
  for (const [key, value] of Object.entries(obj)) {
    result[camelToSnakeCase(key)] = convertValue(value);
  }
  return result;
}
|
|
396
|
+
/**
 * Copies `properties` and renames the fields PostHog treats specially:
 * `sessionId` -> `$session_id`, `startTime` -> `$start_timestamp` (as an ISO
 * string) and `appVersion` -> `$app_version`. A field is only renamed when
 * its value is truthy; otherwise it stays in place under its original key.
 *
 * @param {object} properties - Event properties; `startTime`, when truthy,
 *   must provide `toISOString()`.
 * @returns {object} New object; the input is not modified.
 */
function addPostHogStandardProperties(properties) {
  const renames = [
    ["sessionId", "$session_id", (value) => value],
    ["startTime", "$start_timestamp", (value) => value.toISOString()],
    ["appVersion", "$app_version", (value) => value]
  ];
  const result = { ...properties };
  for (const [sourceKey, postHogKey, convert] of renames) {
    const value = properties[sourceKey];
    if (!value) {
      continue;
    }
    result[postHogKey] = convert(value);
    delete result[sourceKey];
  }
  return result;
}
|
|
109
412
|
class PostHogClient {
|
|
110
413
|
client;
|
|
111
414
|
enabled;
|
|
@@ -156,10 +459,12 @@ class PostHogClient {
|
|
|
156
459
|
capture(distinctId, event, properties) {
|
|
157
460
|
if (!this.enabled || !this.client) return;
|
|
158
461
|
try {
|
|
462
|
+
const enhancedProperties = addPostHogStandardProperties(properties);
|
|
463
|
+
const snakeCaseProperties = convertPropertiesToSnakeCase(enhancedProperties);
|
|
159
464
|
this.client.capture({
|
|
160
465
|
distinctId,
|
|
161
466
|
event,
|
|
162
|
-
properties
|
|
467
|
+
properties: snakeCaseProperties
|
|
163
468
|
});
|
|
164
469
|
logger.debug(`PostHog event captured: ${event}`);
|
|
165
470
|
} catch (error) {
|
|
@@ -168,6 +473,26 @@ class PostHogClient {
|
|
|
168
473
|
);
|
|
169
474
|
}
|
|
170
475
|
}
|
|
476
|
+
/**
|
|
477
|
+
* Capture exception using PostHog's native error tracking
|
|
478
|
+
*/
|
|
479
|
+
/**
 * Reports an exception through PostHog's error tracking. Does nothing when
 * telemetry is disabled or no client exists. Failures while reporting are
 * logged at debug level and never propagate to the caller.
 *
 * @param {string} distinctId - Identifier the exception is attributed to.
 * @param {*} error - The exception to report.
 * @param {object} [properties] - Extra properties; standard fields are
 *   renamed for PostHog and all keys are converted to snake_case.
 */
captureException(distinctId, error, properties) {
  if (!this.enabled || !this.client) return;
  try {
    const enhancedProperties = addPostHogStandardProperties(properties || {});
    const snakeCaseProperties = convertPropertiesToSnakeCase(enhancedProperties);
    // posthog-node documents positional arguments for captureException:
    // (error, distinctId, additionalProperties). Passing one wrapper object
    // would make the wrapper itself the reported "error".
    // NOTE(review): assumes this.client is a posthog-node PostHog instance; confirm.
    this.client.captureException(error, distinctId, snakeCaseProperties);
    // Guarded so a null/undefined error cannot make the log line itself throw.
    const errorType = error?.constructor?.name ?? typeof error;
    logger.debug(`PostHog exception captured: ${errorType}`);
  } catch (captureError) {
    logger.debug(
      `PostHog captureException error: ${captureError instanceof Error ? captureError.message : "Unknown error"}`
    );
  }
}
|
|
171
496
|
/**
|
|
172
497
|
* Graceful shutdown with event flushing
|
|
173
498
|
*/
|
|
@@ -204,7 +529,7 @@ class SessionTracker {
|
|
|
204
529
|
endSession() {
|
|
205
530
|
if (!this.sessionContext) return null;
|
|
206
531
|
const duration = Date.now() - this.sessionContext.startTime.getTime();
|
|
207
|
-
const sessionInterface = this.sessionContext.
|
|
532
|
+
const sessionInterface = this.sessionContext.appInterface;
|
|
208
533
|
this.sessionContext = void 0;
|
|
209
534
|
return { duration, interface: sessionInterface };
|
|
210
535
|
}
|
|
@@ -214,6 +539,14 @@ class SessionTracker {
|
|
|
214
539
|
// Returns the stored session context as-is; it is undefined once
// endSession() has cleared it.
getSessionContext() {
  return this.sessionContext;
}
|
|
542
|
+
/**
|
|
543
|
+
* Update session context with additional fields
|
|
544
|
+
*/
|
|
545
|
+
updateSessionContext(updates) {
|
|
546
|
+
if (this.sessionContext) {
|
|
547
|
+
this.sessionContext = { ...this.sessionContext, ...updates };
|
|
548
|
+
}
|
|
549
|
+
}
|
|
217
550
|
/**
|
|
218
551
|
* Get enriched properties with session context
|
|
219
552
|
*/
|
|
@@ -294,8 +627,6 @@ var TelemetryEvent = /* @__PURE__ */ ((TelemetryEvent2) => {
|
|
|
294
627
|
TelemetryEvent2["PIPELINE_JOB_PROGRESS"] = "pipeline_job_progress";
|
|
295
628
|
TelemetryEvent2["PIPELINE_JOB_COMPLETED"] = "pipeline_job_completed";
|
|
296
629
|
TelemetryEvent2["DOCUMENT_PROCESSED"] = "document_processed";
|
|
297
|
-
TelemetryEvent2["DOCUMENT_PROCESSING_FAILED"] = "document_processing_failed";
|
|
298
|
-
TelemetryEvent2["ERROR_OCCURRED"] = "error_occurred";
|
|
299
630
|
return TelemetryEvent2;
|
|
300
631
|
})(TelemetryEvent || {});
|
|
301
632
|
class Analytics {
|
|
@@ -321,23 +652,34 @@ class Analytics {
|
|
|
321
652
|
if (!this.enabled) return;
|
|
322
653
|
this.sessionTracker.startSession(context);
|
|
323
654
|
this.track("session_started", {
|
|
324
|
-
interface: context.
|
|
325
|
-
version: context.
|
|
326
|
-
platform: context.
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
servicesCount: context.servicesEnabled.length
|
|
655
|
+
interface: context.appInterface,
|
|
656
|
+
version: context.appVersion,
|
|
657
|
+
platform: context.appPlatform,
|
|
658
|
+
authEnabled: context.appAuthEnabled,
|
|
659
|
+
readOnly: context.appReadOnly,
|
|
660
|
+
servicesCount: context.appServicesEnabled.length
|
|
331
661
|
});
|
|
332
662
|
}
|
|
333
663
|
/**
|
|
334
|
-
*
|
|
664
|
+
* Update session context with additional fields (e.g., embedding model info)
|
|
335
665
|
*/
|
|
666
|
+
updateSessionContext(updates) {
|
|
667
|
+
if (!this.enabled) return;
|
|
668
|
+
this.sessionTracker.updateSessionContext(updates);
|
|
669
|
+
}
|
|
336
670
|
track(event, properties = {}) {
|
|
337
671
|
if (!this.enabled) return;
|
|
338
672
|
const eventProperties = this.sessionTracker.getEnrichedProperties(properties);
|
|
339
673
|
this.postHogClient.capture(this.distinctId, event, eventProperties);
|
|
340
674
|
}
|
|
675
|
+
/**
|
|
676
|
+
* Capture exception using PostHog's native error tracking with session context
|
|
677
|
+
*/
|
|
678
|
+
/**
 * Reports an exception together with the current session context.
 * Does nothing while analytics is disabled.
 *
 * @param {*} error - The exception to report.
 * @param {object} [properties={}] - Extra properties describing the failure.
 */
captureException(error, properties = {}) {
  if (this.enabled) {
    const enriched = this.sessionTracker.getEnrichedProperties(properties);
    this.postHogClient.captureException(this.distinctId, error, enriched);
  }
}
|
|
341
683
|
/**
|
|
342
684
|
* Track session end with duration
|
|
343
685
|
*/
|
|
@@ -369,29 +711,38 @@ class Analytics {
|
|
|
369
711
|
// Delegates to the session tracker and returns whatever context it holds.
getSessionContext() {
  return this.sessionTracker.getSessionContext();
}
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
714
|
+
/**
|
|
715
|
+
* Track tool usage with error handling and automatic timing
|
|
716
|
+
*/
|
|
717
|
+
async trackTool(toolName, operation, getProperties) {
|
|
718
|
+
const startTime = Date.now();
|
|
719
|
+
try {
|
|
720
|
+
const result = await operation();
|
|
721
|
+
this.track("tool_used", {
|
|
722
|
+
tool: toolName,
|
|
723
|
+
success: true,
|
|
724
|
+
durationMs: Date.now() - startTime,
|
|
725
|
+
...getProperties ? getProperties(result) : {}
|
|
726
|
+
});
|
|
727
|
+
return result;
|
|
728
|
+
} catch (error) {
|
|
729
|
+
this.track("tool_used", {
|
|
730
|
+
tool: toolName,
|
|
731
|
+
success: false,
|
|
732
|
+
durationMs: Date.now() - startTime
|
|
733
|
+
});
|
|
734
|
+
if (error instanceof Error) {
|
|
735
|
+
this.captureException(error, {
|
|
736
|
+
tool: toolName,
|
|
737
|
+
context: "tool_execution",
|
|
738
|
+
durationMs: Date.now() - startTime
|
|
739
|
+
});
|
|
740
|
+
}
|
|
741
|
+
throw error;
|
|
742
|
+
}
|
|
393
743
|
}
|
|
394
744
|
}
|
|
745
|
+
// Module-level Analytics instance; the tool classes below report their usage
// through analytics.trackTool().
const analytics = new Analytics();
|
|
395
746
|
function extractHostname(url) {
|
|
396
747
|
try {
|
|
397
748
|
const parsed = new URL(url);
|
|
@@ -411,28 +762,7 @@ function extractProtocol(urlOrPath) {
|
|
|
411
762
|
return "unknown";
|
|
412
763
|
}
|
|
413
764
|
}
|
|
414
|
-
|
|
415
|
-
return {
|
|
416
|
-
length: query.length,
|
|
417
|
-
wordCount: query.trim().split(/\s+/).length,
|
|
418
|
-
hasCodeTerms: /\b(function|class|import|export|const|let|var|def|async|await)\b/i.test(query),
|
|
419
|
-
hasSpecialChars: /[^\w\s]/.test(query)
|
|
420
|
-
};
|
|
421
|
-
}
|
|
422
|
-
function sanitizeErrorMessage(message) {
|
|
423
|
-
return message.replace(/https?:\/\/[^\s]+/gi, "[url]").replace(/file:\/\/[^\s]+/gi, "[file-url]").replace(/\/[^\s]*\.[a-z]{2,4}/gi, "[path]").replace(/[A-Za-z]:\\[^\s]+/g, "[path]").replace(/Bearer\s+[^\s]+/gi, "Bearer [token]").replace(/api[_-]?key[=:]\s*[^\s]+/gi, "api_key=[redacted]").replace(/token[=:]\s*[^\s]+/gi, "token=[redacted]").substring(0, 200);
|
|
424
|
-
}
|
|
425
|
-
function sanitizeError(error) {
|
|
426
|
-
return {
|
|
427
|
-
type: error.constructor.name,
|
|
428
|
-
message: sanitizeErrorMessage(error.message),
|
|
429
|
-
hasStack: Boolean(error.stack)
|
|
430
|
-
};
|
|
431
|
-
}
|
|
432
|
-
function extractCliFlags(argv) {
|
|
433
|
-
return argv.filter((arg) => arg.startsWith("--") || arg.startsWith("-"));
|
|
434
|
-
}
|
|
435
|
-
const version = "1.21.0";
|
|
765
|
+
// Version string exposed through packageJson and returned by getPackageVersion().
// NOTE(review): the surrounding diff is for release 1.22.0 while this constant
// reads 1.21.1 — confirm the value is stamped at the intended point of the build.
const version = "1.21.1";
const packageJson = {
  version
};
|
|
@@ -440,34 +770,48 @@ function getPackageVersion() {
|
|
|
440
770
|
return packageJson.version;
|
|
441
771
|
}
|
|
442
772
|
/**
 * Builds the telemetry session context for a CLI invocation.
 *
 * @param {string} [command] - CLI command name; recorded as "unknown" when falsy.
 * @param {object} [options] - Optional `authEnabled`, `readOnly` and
 *   `embeddingContext`. Fields of `embeddingContext` are merged on top of the
 *   base session.
 * @returns {object} Session context with a fresh `sessionId` and `startTime`.
 */
function createCliSession(command, options) {
  const session = {
    sessionId: randomUUID(),
    appInterface: "cli",
    startTime: new Date(),
    appVersion: getPackageVersion(),
    appPlatform: process.platform,
    appNodeVersion: process.version,
    cliCommand: command || "unknown",
    appAuthEnabled: options?.authEnabled,
    appReadOnly: options?.readOnly,
    // CLI typically runs embedded worker
    appServicesEnabled: ["worker"]
  };
  const embeddingContext = options?.embeddingContext;
  return embeddingContext ? { ...session, ...embeddingContext } : session;
}
|
|
457
794
|
/**
 * Builds the telemetry session context for an MCP server session.
 *
 * @param {object} options - `protocol` (defaults to "stdio" when falsy),
 *   `transport`, `authEnabled` and `readOnly` (default `false`),
 *   `servicesEnabled` (defaults to `["mcp"]`) and an optional
 *   `embeddingContext` whose fields are merged on top of the base session.
 * @returns {object} Session context with a fresh `sessionId` and `startTime`.
 */
function createMcpSession(options) {
  const session = {
    sessionId: randomUUID(),
    appInterface: "mcp",
    startTime: new Date(),
    appVersion: getPackageVersion(),
    appPlatform: process.platform,
    appNodeVersion: process.version,
    mcpProtocol: options.protocol || "stdio",
    mcpTransport: options.transport,
    appAuthEnabled: options.authEnabled ?? false,
    appReadOnly: options.readOnly ?? false,
    appServicesEnabled: options.servicesEnabled ?? ["mcp"]
  };
  const { embeddingContext } = options;
  return embeddingContext ? { ...session, ...embeddingContext } : session;
}
|
|
472
816
|
function createTelemetryService() {
|
|
473
817
|
return {
|
|
@@ -930,42 +1274,55 @@ class CancelJobTool {
|
|
|
930
1274
|
* @returns A promise that resolves with the outcome message.
|
|
931
1275
|
*/
|
|
932
1276
|
async execute(input) {
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
1277
|
+
return analytics.trackTool(
|
|
1278
|
+
"cancel_job",
|
|
1279
|
+
async () => {
|
|
1280
|
+
try {
|
|
1281
|
+
const job = await this.pipeline.getJob(input.jobId);
|
|
1282
|
+
if (!job) {
|
|
1283
|
+
logger.warn(`❓ [CancelJobTool] Job not found: ${input.jobId}`);
|
|
1284
|
+
return {
|
|
1285
|
+
message: `Job with ID ${input.jobId} not found.`,
|
|
1286
|
+
success: false
|
|
1287
|
+
};
|
|
1288
|
+
}
|
|
1289
|
+
if (job.status === PipelineJobStatus.COMPLETED || // Use enum member
|
|
1290
|
+
job.status === PipelineJobStatus.FAILED || // Use enum member
|
|
1291
|
+
job.status === PipelineJobStatus.CANCELLED) {
|
|
1292
|
+
logger.debug(
|
|
1293
|
+
`Job ${input.jobId} is already in a final state: ${job.status}.`
|
|
1294
|
+
);
|
|
1295
|
+
return {
|
|
1296
|
+
message: `Job ${input.jobId} is already ${job.status}. No action taken.`,
|
|
1297
|
+
success: true
|
|
1298
|
+
// Considered success as no cancellation needed
|
|
1299
|
+
};
|
|
1300
|
+
}
|
|
1301
|
+
await this.pipeline.cancelJob(input.jobId);
|
|
1302
|
+
const updatedJob = await this.pipeline.getJob(input.jobId);
|
|
1303
|
+
const finalStatus = updatedJob?.status ?? "UNKNOWN (job disappeared?)";
|
|
1304
|
+
logger.debug(
|
|
1305
|
+
`Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}`
|
|
1306
|
+
);
|
|
1307
|
+
return {
|
|
1308
|
+
message: `Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}.`,
|
|
1309
|
+
success: true
|
|
1310
|
+
};
|
|
1311
|
+
} catch (error) {
|
|
1312
|
+
logger.error(`❌ Error cancelling job ${input.jobId}: ${error}`);
|
|
1313
|
+
return {
|
|
1314
|
+
message: `Failed to cancel job ${input.jobId}: ${error instanceof Error ? error.message : String(error)}`,
|
|
1315
|
+
success: false
|
|
1316
|
+
};
|
|
1317
|
+
}
|
|
1318
|
+
},
|
|
1319
|
+
(result) => {
|
|
946
1320
|
return {
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
// Considered success as no cancellation needed
|
|
1321
|
+
success: result.success
|
|
1322
|
+
// Note: success flag already indicates if cancellation was successful
|
|
950
1323
|
};
|
|
951
1324
|
}
|
|
952
|
-
|
|
953
|
-
const updatedJob = await this.pipeline.getJob(input.jobId);
|
|
954
|
-
const finalStatus = updatedJob?.status ?? "UNKNOWN (job disappeared?)";
|
|
955
|
-
logger.debug(
|
|
956
|
-
`Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}`
|
|
957
|
-
);
|
|
958
|
-
return {
|
|
959
|
-
message: `Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}.`,
|
|
960
|
-
success: true
|
|
961
|
-
};
|
|
962
|
-
} catch (error) {
|
|
963
|
-
logger.error(`❌ Error cancelling job ${input.jobId}: ${error}`);
|
|
964
|
-
return {
|
|
965
|
-
message: `Failed to cancel job ${input.jobId}: ${error instanceof Error ? error.message : String(error)}`,
|
|
966
|
-
success: false
|
|
967
|
-
};
|
|
968
|
-
}
|
|
1325
|
+
);
|
|
969
1326
|
}
|
|
970
1327
|
}
|
|
971
1328
|
class ClearCompletedJobsTool {
|
|
@@ -983,24 +1340,33 @@ class ClearCompletedJobsTool {
|
|
|
983
1340
|
* @returns A promise that resolves with the outcome of the clear operation.
|
|
984
1341
|
*/
|
|
985
1342
|
async execute(_input) {
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1343
|
+
return analytics.trackTool(
|
|
1344
|
+
"clear_completed_jobs",
|
|
1345
|
+
async () => {
|
|
1346
|
+
try {
|
|
1347
|
+
const clearedCount = await this.pipeline.clearCompletedJobs();
|
|
1348
|
+
const message = clearedCount > 0 ? `Successfully cleared ${clearedCount} completed job${clearedCount === 1 ? "" : "s"} from the queue.` : "No completed jobs to clear.";
|
|
1349
|
+
logger.debug(message);
|
|
1350
|
+
return {
|
|
1351
|
+
message,
|
|
1352
|
+
success: true,
|
|
1353
|
+
clearedCount
|
|
1354
|
+
};
|
|
1355
|
+
} catch (error) {
|
|
1356
|
+
const errorMessage = `Failed to clear completed jobs: ${error instanceof Error ? error.message : String(error)}`;
|
|
1357
|
+
logger.error(`❌ ${errorMessage}`);
|
|
1358
|
+
return {
|
|
1359
|
+
message: errorMessage,
|
|
1360
|
+
success: false,
|
|
1361
|
+
clearedCount: 0
|
|
1362
|
+
};
|
|
1363
|
+
}
|
|
1364
|
+
},
|
|
1365
|
+
(result) => ({
|
|
1366
|
+
success: result.success,
|
|
1367
|
+
clearedCount: result.clearedCount
|
|
1368
|
+
})
|
|
1369
|
+
);
|
|
1004
1370
|
}
|
|
1005
1371
|
}
|
|
1006
1372
|
class ToolError extends Error {
|
|
@@ -1938,91 +2304,106 @@ class FetchUrlTool {
|
|
|
1938
2304
|
* @throws {ToolError} If fetching or processing fails
|
|
1939
2305
|
*/
|
|
1940
2306
|
async execute(options) {
|
|
1941
|
-
|
|
1942
|
-
|
|
1943
|
-
|
|
1944
|
-
|
|
1945
|
-
|
|
1946
|
-
|
|
1947
|
-
|
|
1948
|
-
|
|
1949
|
-
|
|
1950
|
-
|
|
1951
|
-
const htmlPipeline = new HtmlPipeline();
|
|
1952
|
-
const markdownPipeline = new MarkdownPipeline();
|
|
1953
|
-
const pipelines = [htmlPipeline, markdownPipeline];
|
|
1954
|
-
try {
|
|
1955
|
-
logger.info(`📡 Fetching ${url}...`);
|
|
1956
|
-
const rawContent = await fetcher.fetch(url, {
|
|
1957
|
-
followRedirects: options.followRedirects ?? true,
|
|
1958
|
-
maxRetries: 3,
|
|
1959
|
-
headers
|
|
1960
|
-
// propagate custom headers
|
|
1961
|
-
});
|
|
1962
|
-
logger.info("🔄 Processing content...");
|
|
1963
|
-
let processed;
|
|
1964
|
-
for (const pipeline of pipelines) {
|
|
1965
|
-
if (pipeline.canProcess(rawContent)) {
|
|
1966
|
-
processed = await pipeline.process(
|
|
1967
|
-
rawContent,
|
|
1968
|
-
{
|
|
1969
|
-
url,
|
|
1970
|
-
library: "",
|
|
1971
|
-
version: "",
|
|
1972
|
-
maxDepth: 0,
|
|
1973
|
-
maxPages: 1,
|
|
1974
|
-
maxConcurrency: 1,
|
|
1975
|
-
scope: "subpages",
|
|
1976
|
-
followRedirects: options.followRedirects ?? true,
|
|
1977
|
-
excludeSelectors: void 0,
|
|
1978
|
-
ignoreErrors: false,
|
|
1979
|
-
scrapeMode,
|
|
1980
|
-
headers
|
|
1981
|
-
// propagate custom headers
|
|
1982
|
-
},
|
|
1983
|
-
fetcher
|
|
2307
|
+
return analytics.trackTool(
|
|
2308
|
+
"fetch_url",
|
|
2309
|
+
async () => {
|
|
2310
|
+
const { url, scrapeMode = ScrapeMode.Auto, headers } = options;
|
|
2311
|
+
const canFetchResults = this.fetchers.map((f) => f.canFetch(url));
|
|
2312
|
+
const fetcherIndex = canFetchResults.indexOf(true);
|
|
2313
|
+
if (fetcherIndex === -1) {
|
|
2314
|
+
throw new ToolError(
|
|
2315
|
+
`Invalid URL: ${url}. Must be an HTTP/HTTPS URL or a file:// URL.`,
|
|
2316
|
+
this.constructor.name
|
|
1984
2317
|
);
|
|
1985
|
-
break;
|
|
1986
2318
|
}
|
|
2319
|
+
const fetcher = this.fetchers[fetcherIndex];
|
|
2320
|
+
const htmlPipeline = new HtmlPipeline();
|
|
2321
|
+
const markdownPipeline = new MarkdownPipeline();
|
|
2322
|
+
const pipelines = [htmlPipeline, markdownPipeline];
|
|
2323
|
+
try {
|
|
2324
|
+
logger.info(`📡 Fetching ${url}...`);
|
|
2325
|
+
const rawContent = await fetcher.fetch(url, {
|
|
2326
|
+
followRedirects: options.followRedirects ?? true,
|
|
2327
|
+
maxRetries: 3,
|
|
2328
|
+
headers
|
|
2329
|
+
// propagate custom headers
|
|
2330
|
+
});
|
|
2331
|
+
logger.info("🔄 Processing content...");
|
|
2332
|
+
let processed;
|
|
2333
|
+
for (const pipeline of pipelines) {
|
|
2334
|
+
if (pipeline.canProcess(rawContent)) {
|
|
2335
|
+
processed = await pipeline.process(
|
|
2336
|
+
rawContent,
|
|
2337
|
+
{
|
|
2338
|
+
url,
|
|
2339
|
+
library: "",
|
|
2340
|
+
version: "",
|
|
2341
|
+
maxDepth: 0,
|
|
2342
|
+
maxPages: 1,
|
|
2343
|
+
maxConcurrency: 1,
|
|
2344
|
+
scope: "subpages",
|
|
2345
|
+
followRedirects: options.followRedirects ?? true,
|
|
2346
|
+
excludeSelectors: void 0,
|
|
2347
|
+
ignoreErrors: false,
|
|
2348
|
+
scrapeMode,
|
|
2349
|
+
headers
|
|
2350
|
+
// propagate custom headers
|
|
2351
|
+
},
|
|
2352
|
+
fetcher
|
|
2353
|
+
);
|
|
2354
|
+
break;
|
|
2355
|
+
}
|
|
2356
|
+
}
|
|
2357
|
+
if (!processed) {
|
|
2358
|
+
logger.warn(
|
|
2359
|
+
`⚠️ Unsupported content type "${rawContent.mimeType}" for ${url}. Returning raw content.`
|
|
2360
|
+
);
|
|
2361
|
+
const resolvedCharset = resolveCharset(
|
|
2362
|
+
rawContent.charset,
|
|
2363
|
+
rawContent.content,
|
|
2364
|
+
rawContent.mimeType
|
|
2365
|
+
);
|
|
2366
|
+
const contentString = convertToString(rawContent.content, resolvedCharset);
|
|
2367
|
+
return contentString;
|
|
2368
|
+
}
|
|
2369
|
+
for (const err of processed.errors) {
|
|
2370
|
+
logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
|
|
2371
|
+
}
|
|
2372
|
+
if (typeof processed.textContent !== "string" || !processed.textContent.trim()) {
|
|
2373
|
+
throw new ToolError(
|
|
2374
|
+
`Processing resulted in empty content for ${url}`,
|
|
2375
|
+
this.constructor.name
|
|
2376
|
+
);
|
|
2377
|
+
}
|
|
2378
|
+
logger.info(`✅ Successfully processed ${url}`);
|
|
2379
|
+
return processed.textContent;
|
|
2380
|
+
} catch (error) {
|
|
2381
|
+
if (error instanceof ScraperError || error instanceof ToolError) {
|
|
2382
|
+
throw new ToolError(
|
|
2383
|
+
`Failed to fetch or process URL: ${error.message}`,
|
|
2384
|
+
this.constructor.name
|
|
2385
|
+
);
|
|
2386
|
+
}
|
|
2387
|
+
throw new ToolError(
|
|
2388
|
+
`Failed to fetch or process URL: ${error instanceof Error ? error.message : String(error)}`,
|
|
2389
|
+
this.constructor.name
|
|
2390
|
+
);
|
|
2391
|
+
} finally {
|
|
2392
|
+
await htmlPipeline.close();
|
|
2393
|
+
await markdownPipeline.close();
|
|
2394
|
+
}
|
|
2395
|
+
},
|
|
2396
|
+
(result) => {
|
|
2397
|
+
const { url, scrapeMode, followRedirects, headers } = options;
|
|
2398
|
+
return {
|
|
2399
|
+
url,
|
|
2400
|
+
scrapeMode,
|
|
2401
|
+
followRedirects,
|
|
2402
|
+
contentLength: result.length,
|
|
2403
|
+
hasHeaders: !!headers
|
|
2404
|
+
};
|
|
1987
2405
|
}
|
|
1988
|
-
|
|
1989
|
-
logger.warn(
|
|
1990
|
-
`⚠️ Unsupported content type "${rawContent.mimeType}" for ${url}. Returning raw content.`
|
|
1991
|
-
);
|
|
1992
|
-
const resolvedCharset = resolveCharset(
|
|
1993
|
-
rawContent.charset,
|
|
1994
|
-
rawContent.content,
|
|
1995
|
-
rawContent.mimeType
|
|
1996
|
-
);
|
|
1997
|
-
const contentString = convertToString(rawContent.content, resolvedCharset);
|
|
1998
|
-
return contentString;
|
|
1999
|
-
}
|
|
2000
|
-
for (const err of processed.errors) {
|
|
2001
|
-
logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
|
|
2002
|
-
}
|
|
2003
|
-
if (typeof processed.textContent !== "string" || !processed.textContent.trim()) {
|
|
2004
|
-
throw new ToolError(
|
|
2005
|
-
`Processing resulted in empty content for ${url}`,
|
|
2006
|
-
this.constructor.name
|
|
2007
|
-
);
|
|
2008
|
-
}
|
|
2009
|
-
logger.info(`✅ Successfully processed ${url}`);
|
|
2010
|
-
return processed.textContent;
|
|
2011
|
-
} catch (error) {
|
|
2012
|
-
if (error instanceof ScraperError || error instanceof ToolError) {
|
|
2013
|
-
throw new ToolError(
|
|
2014
|
-
`Failed to fetch or process URL: ${error.message}`,
|
|
2015
|
-
this.constructor.name
|
|
2016
|
-
);
|
|
2017
|
-
}
|
|
2018
|
-
throw new ToolError(
|
|
2019
|
-
`Failed to fetch or process URL: ${error instanceof Error ? error.message : String(error)}`,
|
|
2020
|
-
this.constructor.name
|
|
2021
|
-
);
|
|
2022
|
-
} finally {
|
|
2023
|
-
await htmlPipeline.close();
|
|
2024
|
-
await markdownPipeline.close();
|
|
2025
|
-
}
|
|
2406
|
+
);
|
|
2026
2407
|
}
|
|
2027
2408
|
}
|
|
2028
2409
|
class FindVersionTool {
|
|
@@ -2035,35 +2416,50 @@ class FindVersionTool {
|
|
|
2035
2416
|
* @returns A descriptive string indicating the best match and unversioned status, or an error message.
|
|
2036
2417
|
*/
|
|
2037
2418
|
async execute(options) {
|
|
2038
|
-
|
|
2039
|
-
|
|
2040
|
-
|
|
2041
|
-
|
|
2042
|
-
library
|
|
2043
|
-
|
|
2044
|
-
|
|
2045
|
-
|
|
2046
|
-
|
|
2047
|
-
|
|
2048
|
-
|
|
2049
|
-
|
|
2419
|
+
return analytics.trackTool(
|
|
2420
|
+
"find_version",
|
|
2421
|
+
async () => {
|
|
2422
|
+
const { library, targetVersion } = options;
|
|
2423
|
+
const libraryAndVersion = `${library}${targetVersion ? `@${targetVersion}` : ""}`;
|
|
2424
|
+
try {
|
|
2425
|
+
const { bestMatch, hasUnversioned } = await this.docService.findBestVersion(
|
|
2426
|
+
library,
|
|
2427
|
+
targetVersion
|
|
2428
|
+
);
|
|
2429
|
+
let message = "";
|
|
2430
|
+
if (bestMatch) {
|
|
2431
|
+
message = `Best match: ${bestMatch}.`;
|
|
2432
|
+
if (hasUnversioned) {
|
|
2433
|
+
message += " Unversioned docs also available.";
|
|
2434
|
+
}
|
|
2435
|
+
} else if (hasUnversioned) {
|
|
2436
|
+
message = `No matching version found for ${libraryAndVersion}, but unversioned docs exist.`;
|
|
2437
|
+
} else {
|
|
2438
|
+
message = `No matching version or unversioned documents found for ${libraryAndVersion}.`;
|
|
2439
|
+
}
|
|
2440
|
+
return { message, bestMatch, hasUnversioned };
|
|
2441
|
+
} catch (error) {
|
|
2442
|
+
if (error instanceof VersionNotFoundError) {
|
|
2443
|
+
logger.info(`ℹ️ Version not found: ${error.message}`);
|
|
2444
|
+
const message = `No matching version or unversioned documents found for ${libraryAndVersion}. Available: ${error.availableVersions.length > 0 ? error.availableVersions.map((v) => v.version).join(", ") : "None"}.`;
|
|
2445
|
+
return { message, bestMatch: null, hasUnversioned: false };
|
|
2446
|
+
}
|
|
2447
|
+
logger.error(
|
|
2448
|
+
`❌ Error finding version for ${libraryAndVersion}: ${error instanceof Error ? error.message : error}`
|
|
2449
|
+
);
|
|
2450
|
+
throw error;
|
|
2050
2451
|
}
|
|
2051
|
-
}
|
|
2052
|
-
|
|
2053
|
-
|
|
2054
|
-
|
|
2055
|
-
|
|
2056
|
-
|
|
2057
|
-
|
|
2058
|
-
|
|
2059
|
-
|
|
2060
|
-
return `No matching version or unversioned documents found for ${libraryAndVersion}. Available: ${error.availableVersions.length > 0 ? error.availableVersions.map((v) => v.version).join(", ") : "None"}.`;
|
|
2452
|
+
},
|
|
2453
|
+
(result) => {
|
|
2454
|
+
const { library, targetVersion } = options;
|
|
2455
|
+
return {
|
|
2456
|
+
library,
|
|
2457
|
+
targetVersion,
|
|
2458
|
+
foundMatch: !!result.bestMatch,
|
|
2459
|
+
hasUnversioned: result.hasUnversioned
|
|
2460
|
+
};
|
|
2061
2461
|
}
|
|
2062
|
-
|
|
2063
|
-
`❌ Error finding version for ${libraryAndVersion}: ${error instanceof Error ? error.message : error}`
|
|
2064
|
-
);
|
|
2065
|
-
throw error;
|
|
2066
|
-
}
|
|
2462
|
+
).then((result) => result.message);
|
|
2067
2463
|
}
|
|
2068
2464
|
}
|
|
2069
2465
|
class GetJobInfoTool {
|
|
@@ -2081,29 +2477,41 @@ class GetJobInfoTool {
|
|
|
2081
2477
|
* @returns A promise that resolves with the simplified job info or null if not found.
|
|
2082
2478
|
*/
|
|
2083
2479
|
async execute(input) {
|
|
2084
|
-
|
|
2085
|
-
|
|
2086
|
-
|
|
2087
|
-
|
|
2088
|
-
|
|
2089
|
-
|
|
2090
|
-
|
|
2091
|
-
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2101
|
-
|
|
2102
|
-
|
|
2103
|
-
|
|
2104
|
-
|
|
2105
|
-
|
|
2106
|
-
|
|
2480
|
+
return analytics.trackTool(
|
|
2481
|
+
"get_job_info",
|
|
2482
|
+
async () => {
|
|
2483
|
+
const job = await this.pipeline.getJob(input.jobId);
|
|
2484
|
+
if (!job) {
|
|
2485
|
+
return { job: null };
|
|
2486
|
+
}
|
|
2487
|
+
const jobInfo = {
|
|
2488
|
+
id: job.id,
|
|
2489
|
+
library: job.library,
|
|
2490
|
+
version: job.version,
|
|
2491
|
+
status: job.status,
|
|
2492
|
+
dbStatus: job.versionStatus,
|
|
2493
|
+
createdAt: job.createdAt.toISOString(),
|
|
2494
|
+
startedAt: job.startedAt?.toISOString() ?? null,
|
|
2495
|
+
finishedAt: job.finishedAt?.toISOString() ?? null,
|
|
2496
|
+
error: job.error?.message ?? null,
|
|
2497
|
+
progress: job.progressMaxPages && job.progressMaxPages > 0 ? {
|
|
2498
|
+
pages: job.progressPages || 0,
|
|
2499
|
+
totalPages: job.progressMaxPages,
|
|
2500
|
+
totalDiscovered: job.progress?.totalDiscovered || job.progressMaxPages
|
|
2501
|
+
} : void 0,
|
|
2502
|
+
updatedAt: job.updatedAt?.toISOString(),
|
|
2503
|
+
errorMessage: job.errorMessage ?? void 0
|
|
2504
|
+
};
|
|
2505
|
+
return { job: jobInfo };
|
|
2506
|
+
},
|
|
2507
|
+
(result) => {
|
|
2508
|
+
return {
|
|
2509
|
+
found: result.job !== null,
|
|
2510
|
+
library: result.job?.library,
|
|
2511
|
+
version: result.job?.version
|
|
2512
|
+
};
|
|
2513
|
+
}
|
|
2514
|
+
);
|
|
2107
2515
|
}
|
|
2108
2516
|
}
|
|
2109
2517
|
class ListJobsTool {
|
|
@@ -2121,28 +2529,45 @@ class ListJobsTool {
|
|
|
2121
2529
|
* @returns A promise that resolves with the list of simplified job objects.
|
|
2122
2530
|
*/
|
|
2123
2531
|
async execute(input) {
|
|
2124
|
-
|
|
2125
|
-
|
|
2126
|
-
|
|
2127
|
-
|
|
2128
|
-
|
|
2129
|
-
|
|
2130
|
-
|
|
2131
|
-
|
|
2132
|
-
|
|
2133
|
-
|
|
2134
|
-
|
|
2135
|
-
|
|
2136
|
-
|
|
2137
|
-
|
|
2138
|
-
|
|
2139
|
-
|
|
2140
|
-
|
|
2141
|
-
|
|
2142
|
-
|
|
2143
|
-
|
|
2144
|
-
|
|
2145
|
-
|
|
2532
|
+
return analytics.trackTool(
|
|
2533
|
+
"list_jobs",
|
|
2534
|
+
async () => {
|
|
2535
|
+
const jobs = await this.pipeline.getJobs(input.status);
|
|
2536
|
+
const simplifiedJobs = jobs.map((job) => {
|
|
2537
|
+
return {
|
|
2538
|
+
id: job.id,
|
|
2539
|
+
library: job.library,
|
|
2540
|
+
version: job.version,
|
|
2541
|
+
status: job.status,
|
|
2542
|
+
dbStatus: job.versionStatus,
|
|
2543
|
+
createdAt: job.createdAt.toISOString(),
|
|
2544
|
+
startedAt: job.startedAt?.toISOString() ?? null,
|
|
2545
|
+
finishedAt: job.finishedAt?.toISOString() ?? null,
|
|
2546
|
+
error: job.error?.message ?? null,
|
|
2547
|
+
progress: job.progressMaxPages && job.progressMaxPages > 0 ? {
|
|
2548
|
+
pages: job.progressPages || 0,
|
|
2549
|
+
totalPages: job.progressMaxPages,
|
|
2550
|
+
totalDiscovered: job.progress?.totalDiscovered || job.progressMaxPages
|
|
2551
|
+
} : void 0,
|
|
2552
|
+
updatedAt: job.updatedAt?.toISOString(),
|
|
2553
|
+
errorMessage: job.errorMessage ?? void 0
|
|
2554
|
+
};
|
|
2555
|
+
});
|
|
2556
|
+
return { jobs: simplifiedJobs };
|
|
2557
|
+
},
|
|
2558
|
+
(result) => {
|
|
2559
|
+
return {
|
|
2560
|
+
jobCount: result.jobs.length,
|
|
2561
|
+
statusCounts: result.jobs.reduce(
|
|
2562
|
+
(acc, job) => {
|
|
2563
|
+
acc[job.status] = (acc[job.status] || 0) + 1;
|
|
2564
|
+
return acc;
|
|
2565
|
+
},
|
|
2566
|
+
{}
|
|
2567
|
+
)
|
|
2568
|
+
};
|
|
2569
|
+
}
|
|
2570
|
+
);
|
|
2146
2571
|
}
|
|
2147
2572
|
}
|
|
2148
2573
|
class ListLibrariesTool {
|
|
@@ -2151,20 +2576,32 @@ class ListLibrariesTool {
|
|
|
2151
2576
|
this.docService = docService;
|
|
2152
2577
|
}
|
|
2153
2578
|
async execute(_options) {
|
|
2154
|
-
|
|
2155
|
-
|
|
2156
|
-
|
|
2157
|
-
|
|
2158
|
-
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
|
|
2162
|
-
|
|
2163
|
-
|
|
2164
|
-
|
|
2165
|
-
|
|
2166
|
-
|
|
2167
|
-
|
|
2579
|
+
return analytics.trackTool(
|
|
2580
|
+
"list_libraries",
|
|
2581
|
+
async () => {
|
|
2582
|
+
const rawLibraries = await this.docService.listLibraries();
|
|
2583
|
+
const libraries = rawLibraries.map(({ library, versions }) => ({
|
|
2584
|
+
name: library,
|
|
2585
|
+
versions: versions.map((v) => ({
|
|
2586
|
+
version: v.ref.version,
|
|
2587
|
+
documentCount: v.counts.documents,
|
|
2588
|
+
uniqueUrlCount: v.counts.uniqueUrls,
|
|
2589
|
+
indexedAt: v.indexedAt,
|
|
2590
|
+
status: v.status,
|
|
2591
|
+
...v.progress ? { progress: v.progress } : void 0,
|
|
2592
|
+
sourceUrl: v.sourceUrl
|
|
2593
|
+
}))
|
|
2594
|
+
}));
|
|
2595
|
+
return { libraries };
|
|
2596
|
+
},
|
|
2597
|
+
(result) => ({
|
|
2598
|
+
libraryCount: result.libraries.length,
|
|
2599
|
+
totalVersions: result.libraries.reduce(
|
|
2600
|
+
(sum, lib) => sum + lib.versions.length,
|
|
2601
|
+
0
|
|
2602
|
+
)
|
|
2603
|
+
})
|
|
2604
|
+
);
|
|
2168
2605
|
}
|
|
2169
2606
|
}
|
|
2170
2607
|
class RemoveTool {
|
|
@@ -2178,29 +2615,42 @@ class RemoveTool {
|
|
|
2178
2615
|
* Removes all documents, the version record, and the library if no other versions exist.
|
|
2179
2616
|
*/
|
|
2180
2617
|
async execute(args) {
|
|
2181
|
-
|
|
2182
|
-
|
|
2183
|
-
|
|
2184
|
-
|
|
2185
|
-
|
|
2186
|
-
|
|
2187
|
-
|
|
2188
|
-
|
|
2189
|
-
|
|
2190
|
-
|
|
2191
|
-
|
|
2192
|
-
|
|
2193
|
-
|
|
2618
|
+
return analytics.trackTool(
|
|
2619
|
+
"remove_docs",
|
|
2620
|
+
async () => {
|
|
2621
|
+
const { library, version: version2 } = args;
|
|
2622
|
+
logger.info(`🗑️ Removing library: ${library}${version2 ? `@${version2}` : ""}`);
|
|
2623
|
+
try {
|
|
2624
|
+
const allJobs = await this.pipeline.getJobs();
|
|
2625
|
+
const jobs = allJobs.filter(
|
|
2626
|
+
(job) => job.library === library && job.version === (version2 ?? "") && (job.status === PipelineJobStatus.QUEUED || job.status === PipelineJobStatus.RUNNING)
|
|
2627
|
+
);
|
|
2628
|
+
for (const job of jobs) {
|
|
2629
|
+
logger.info(
|
|
2630
|
+
`🚫 Aborting job for ${library}@${version2 ?? ""} before deletion: ${job.id}`
|
|
2631
|
+
);
|
|
2632
|
+
await this.pipeline.cancelJob(job.id);
|
|
2633
|
+
await this.pipeline.waitForJobCompletion(job.id);
|
|
2634
|
+
}
|
|
2635
|
+
await this.documentManagementService.removeVersion(library, version2);
|
|
2636
|
+
const message = `Successfully removed ${library}${version2 ? `@${version2}` : ""}.`;
|
|
2637
|
+
logger.info(`✅ ${message}`);
|
|
2638
|
+
return { message };
|
|
2639
|
+
} catch (error) {
|
|
2640
|
+
const errorMessage = `Failed to remove ${library}${version2 ? `@${version2}` : ""}: ${error instanceof Error ? error.message : String(error)}`;
|
|
2641
|
+
logger.error(`❌ Error removing library: ${errorMessage}`);
|
|
2642
|
+
throw new ToolError(errorMessage, this.constructor.name);
|
|
2643
|
+
}
|
|
2644
|
+
},
|
|
2645
|
+
() => {
|
|
2646
|
+
const { library, version: version2 } = args;
|
|
2647
|
+
return {
|
|
2648
|
+
library,
|
|
2649
|
+
version: version2
|
|
2650
|
+
// Success is implicit since if this callback runs, no exception was thrown
|
|
2651
|
+
};
|
|
2194
2652
|
}
|
|
2195
|
-
|
|
2196
|
-
const message = `Successfully removed ${library}${version2 ? `@${version2}` : ""}.`;
|
|
2197
|
-
logger.info(`✅ ${message}`);
|
|
2198
|
-
return { message };
|
|
2199
|
-
} catch (error) {
|
|
2200
|
-
const errorMessage = `Failed to remove ${library}${version2 ? `@${version2}` : ""}: ${error instanceof Error ? error.message : String(error)}`;
|
|
2201
|
-
logger.error(`❌ Error removing library: ${errorMessage}`);
|
|
2202
|
-
throw new ToolError(errorMessage, this.constructor.name);
|
|
2203
|
-
}
|
|
2653
|
+
);
|
|
2204
2654
|
}
|
|
2205
2655
|
}
|
|
2206
2656
|
class ScrapeTool {
|
|
@@ -2216,66 +2666,80 @@ class ScrapeTool {
|
|
|
2216
2666
|
options: scraperOptions,
|
|
2217
2667
|
waitForCompletion = true
|
|
2218
2668
|
} = options;
|
|
2219
|
-
|
|
2220
|
-
|
|
2221
|
-
|
|
2222
|
-
|
|
2223
|
-
|
|
2224
|
-
|
|
2225
|
-
|
|
2226
|
-
internalVersion = validFullVersion;
|
|
2227
|
-
} else if (partialVersionRegex.test(version2)) {
|
|
2228
|
-
const coercedVersion = semver.coerce(version2);
|
|
2229
|
-
if (coercedVersion) {
|
|
2230
|
-
internalVersion = coercedVersion.version;
|
|
2669
|
+
return analytics.trackTool(
|
|
2670
|
+
"scrape_docs",
|
|
2671
|
+
async () => {
|
|
2672
|
+
let internalVersion;
|
|
2673
|
+
const partialVersionRegex = /^\d+(\.\d+)?$/;
|
|
2674
|
+
if (version2 === null || version2 === void 0) {
|
|
2675
|
+
internalVersion = "";
|
|
2231
2676
|
} else {
|
|
2232
|
-
|
|
2233
|
-
|
|
2234
|
-
|
|
2677
|
+
const validFullVersion = semver.valid(version2);
|
|
2678
|
+
if (validFullVersion) {
|
|
2679
|
+
internalVersion = validFullVersion;
|
|
2680
|
+
} else if (partialVersionRegex.test(version2)) {
|
|
2681
|
+
const coercedVersion = semver.coerce(version2);
|
|
2682
|
+
if (coercedVersion) {
|
|
2683
|
+
internalVersion = coercedVersion.version;
|
|
2684
|
+
} else {
|
|
2685
|
+
throw new Error(
|
|
2686
|
+
`Invalid version format for scraping: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`
|
|
2687
|
+
);
|
|
2688
|
+
}
|
|
2689
|
+
} else {
|
|
2690
|
+
throw new Error(
|
|
2691
|
+
`Invalid version format for scraping: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`
|
|
2692
|
+
);
|
|
2693
|
+
}
|
|
2235
2694
|
}
|
|
2236
|
-
|
|
2237
|
-
|
|
2238
|
-
|
|
2239
|
-
|
|
2240
|
-
|
|
2241
|
-
|
|
2242
|
-
|
|
2243
|
-
|
|
2244
|
-
|
|
2245
|
-
|
|
2246
|
-
|
|
2247
|
-
|
|
2248
|
-
|
|
2249
|
-
|
|
2250
|
-
|
|
2251
|
-
|
|
2252
|
-
|
|
2253
|
-
|
|
2254
|
-
|
|
2255
|
-
|
|
2256
|
-
|
|
2257
|
-
|
|
2258
|
-
|
|
2259
|
-
|
|
2260
|
-
|
|
2261
|
-
|
|
2262
|
-
|
|
2263
|
-
|
|
2264
|
-
|
|
2265
|
-
|
|
2266
|
-
|
|
2267
|
-
|
|
2268
|
-
|
|
2269
|
-
|
|
2270
|
-
|
|
2271
|
-
|
|
2272
|
-
};
|
|
2273
|
-
}
|
|
2274
|
-
|
|
2275
|
-
|
|
2276
|
-
|
|
2277
|
-
|
|
2278
|
-
|
|
2695
|
+
internalVersion = internalVersion.toLowerCase();
|
|
2696
|
+
const pipeline = this.pipeline;
|
|
2697
|
+
const enqueueVersion = internalVersion === "" ? null : internalVersion;
|
|
2698
|
+
const jobId = await pipeline.enqueueJob(library, enqueueVersion, {
|
|
2699
|
+
url,
|
|
2700
|
+
library,
|
|
2701
|
+
version: internalVersion,
|
|
2702
|
+
scope: scraperOptions?.scope ?? "subpages",
|
|
2703
|
+
followRedirects: scraperOptions?.followRedirects ?? true,
|
|
2704
|
+
maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES,
|
|
2705
|
+
maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH$1,
|
|
2706
|
+
maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
|
|
2707
|
+
ignoreErrors: scraperOptions?.ignoreErrors ?? true,
|
|
2708
|
+
scrapeMode: scraperOptions?.scrapeMode ?? ScrapeMode.Auto,
|
|
2709
|
+
// Pass scrapeMode enum
|
|
2710
|
+
includePatterns: scraperOptions?.includePatterns,
|
|
2711
|
+
excludePatterns: scraperOptions?.excludePatterns,
|
|
2712
|
+
headers: scraperOptions?.headers
|
|
2713
|
+
// <-- propagate headers
|
|
2714
|
+
});
|
|
2715
|
+
if (waitForCompletion) {
|
|
2716
|
+
try {
|
|
2717
|
+
await pipeline.waitForJobCompletion(jobId);
|
|
2718
|
+
const finalJob = await pipeline.getJob(jobId);
|
|
2719
|
+
const finalPagesScraped = finalJob?.progress?.pagesScraped ?? 0;
|
|
2720
|
+
logger.debug(
|
|
2721
|
+
`Job ${jobId} finished with status ${finalJob?.status}. Pages scraped: ${finalPagesScraped}`
|
|
2722
|
+
);
|
|
2723
|
+
return {
|
|
2724
|
+
pagesScraped: finalPagesScraped
|
|
2725
|
+
};
|
|
2726
|
+
} catch (error) {
|
|
2727
|
+
logger.error(`❌ Job ${jobId} failed or was cancelled: ${error}`);
|
|
2728
|
+
throw error;
|
|
2729
|
+
}
|
|
2730
|
+
}
|
|
2731
|
+
return { jobId };
|
|
2732
|
+
},
|
|
2733
|
+
(result) => ({
|
|
2734
|
+
library,
|
|
2735
|
+
version: version2,
|
|
2736
|
+
url,
|
|
2737
|
+
waitForCompletion,
|
|
2738
|
+
...scraperOptions,
|
|
2739
|
+
isBackgroundJob: "jobId" in result,
|
|
2740
|
+
pagesScraped: "pagesScraped" in result ? result.pagesScraped : void 0
|
|
2741
|
+
})
|
|
2742
|
+
);
|
|
2279
2743
|
}
|
|
2280
2744
|
}
|
|
2281
2745
|
class SearchTool {
|
|
@@ -2285,43 +2749,56 @@ class SearchTool {
|
|
|
2285
2749
|
}
|
|
2286
2750
|
async execute(options) {
|
|
2287
2751
|
const { library, version: version2, query, limit = 5, exactMatch = false } = options;
|
|
2288
|
-
|
|
2289
|
-
|
|
2290
|
-
|
|
2291
|
-
|
|
2292
|
-
|
|
2293
|
-
|
|
2294
|
-
|
|
2295
|
-
|
|
2296
|
-
|
|
2297
|
-
|
|
2298
|
-
|
|
2299
|
-
|
|
2300
|
-
|
|
2301
|
-
|
|
2302
|
-
|
|
2303
|
-
|
|
2304
|
-
|
|
2305
|
-
|
|
2306
|
-
|
|
2307
|
-
|
|
2308
|
-
|
|
2309
|
-
|
|
2310
|
-
|
|
2311
|
-
|
|
2752
|
+
return analytics.trackTool(
|
|
2753
|
+
"search_docs",
|
|
2754
|
+
async () => {
|
|
2755
|
+
if (exactMatch && (!version2 || version2 === "latest")) {
|
|
2756
|
+
await this.docService.validateLibraryExists(library);
|
|
2757
|
+
const allLibraries = await this.docService.listLibraries();
|
|
2758
|
+
const libraryInfo = allLibraries.find((lib) => lib.library === library);
|
|
2759
|
+
const detailedVersions = libraryInfo ? libraryInfo.versions.map((v) => ({
|
|
2760
|
+
version: v.ref.version,
|
|
2761
|
+
documentCount: v.counts.documents,
|
|
2762
|
+
uniqueUrlCount: v.counts.uniqueUrls,
|
|
2763
|
+
indexedAt: v.indexedAt
|
|
2764
|
+
})) : [];
|
|
2765
|
+
throw new VersionNotFoundError(library, version2 ?? "latest", detailedVersions);
|
|
2766
|
+
}
|
|
2767
|
+
const resolvedVersion = version2 || "latest";
|
|
2768
|
+
logger.info(
|
|
2769
|
+
`🔍 Searching ${library}@${resolvedVersion} for: ${query}${exactMatch ? " (exact match)" : ""}`
|
|
2770
|
+
);
|
|
2771
|
+
try {
|
|
2772
|
+
await this.docService.validateLibraryExists(library);
|
|
2773
|
+
let versionToSearch = resolvedVersion;
|
|
2774
|
+
if (!exactMatch) {
|
|
2775
|
+
const versionResult = await this.docService.findBestVersion(library, version2);
|
|
2776
|
+
versionToSearch = versionResult.bestMatch;
|
|
2777
|
+
}
|
|
2778
|
+
const results = await this.docService.searchStore(
|
|
2779
|
+
library,
|
|
2780
|
+
versionToSearch,
|
|
2781
|
+
query,
|
|
2782
|
+
limit
|
|
2783
|
+
);
|
|
2784
|
+
logger.info(`✅ Found ${results.length} matching results`);
|
|
2785
|
+
return { results };
|
|
2786
|
+
} catch (error) {
|
|
2787
|
+
logger.error(
|
|
2788
|
+
`❌ Search failed: ${error instanceof Error ? error.message : "Unknown error"}`
|
|
2789
|
+
);
|
|
2790
|
+
throw error;
|
|
2791
|
+
}
|
|
2792
|
+
},
|
|
2793
|
+
(result) => ({
|
|
2312
2794
|
library,
|
|
2313
|
-
|
|
2795
|
+
version: version2,
|
|
2314
2796
|
query,
|
|
2315
|
-
limit
|
|
2316
|
-
|
|
2317
|
-
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
logger.error(
|
|
2321
|
-
`❌ Search failed: ${error instanceof Error ? error.message : "Unknown error"}`
|
|
2322
|
-
);
|
|
2323
|
-
throw error;
|
|
2324
|
-
}
|
|
2797
|
+
limit,
|
|
2798
|
+
exactMatch,
|
|
2799
|
+
resultCount: result.results.length
|
|
2800
|
+
})
|
|
2801
|
+
);
|
|
2325
2802
|
}
|
|
2326
2803
|
}
|
|
2327
2804
|
function createResponse(text) {
|
|
@@ -3051,6 +3528,7 @@ async function registerMcpService(server, docService, pipeline, readOnly = false
|
|
|
3051
3528
|
authEnabled: !!authManager,
|
|
3052
3529
|
readOnly,
|
|
3053
3530
|
servicesEnabled: ["mcp"]
|
|
3531
|
+
// Embedding context will be resolved by the service that starts MCP
|
|
3054
3532
|
});
|
|
3055
3533
|
analytics.startSession(session);
|
|
3056
3534
|
}
|
|
@@ -3101,23 +3579,10 @@ async function registerMcpService(server, docService, pipeline, readOnly = false
|
|
|
3101
3579
|
const requestTransport = new StreamableHTTPServerTransport({
|
|
3102
3580
|
sessionIdGenerator: void 0
|
|
3103
3581
|
});
|
|
3104
|
-
if (analytics.isEnabled()) {
|
|
3105
|
-
const session = createMcpSession({
|
|
3106
|
-
protocol: "http",
|
|
3107
|
-
transport: "streamable",
|
|
3108
|
-
authEnabled: !!authManager,
|
|
3109
|
-
readOnly,
|
|
3110
|
-
servicesEnabled: ["mcp"]
|
|
3111
|
-
});
|
|
3112
|
-
analytics.startSession(session);
|
|
3113
|
-
}
|
|
3114
3582
|
reply.raw.on("close", () => {
|
|
3115
3583
|
logger.debug("Streamable HTTP request closed");
|
|
3116
3584
|
requestTransport.close();
|
|
3117
3585
|
requestServer.close();
|
|
3118
|
-
if (analytics.isEnabled()) {
|
|
3119
|
-
analytics.endSession();
|
|
3120
|
-
}
|
|
3121
3586
|
});
|
|
3122
3587
|
await requestServer.connect(requestTransport);
|
|
3123
3588
|
await requestTransport.handleRequest(request.raw, reply.raw, request.body);
|
|
@@ -3261,6 +3726,15 @@ function createDataRouter(trpc) {
|
|
|
3261
3726
|
return results;
|
|
3262
3727
|
}
|
|
3263
3728
|
),
|
|
3729
|
+
removeVersion: tt.procedure.input(z$1.object({ library: nonEmpty, version: optionalVersion })).mutation(
|
|
3730
|
+
async ({
|
|
3731
|
+
ctx,
|
|
3732
|
+
input
|
|
3733
|
+
}) => {
|
|
3734
|
+
await ctx.docService.removeVersion(input.library, input.version ?? null);
|
|
3735
|
+
return { ok: true };
|
|
3736
|
+
}
|
|
3737
|
+
),
|
|
3264
3738
|
removeAllDocuments: tt.procedure.input(z$1.object({ library: nonEmpty, version: optionalVersion })).mutation(
|
|
3265
3739
|
async ({
|
|
3266
3740
|
ctx,
|
|
@@ -3518,97 +3992,30 @@ function registerClearCompletedJobsRoute(server, clearCompletedJobsTool) {
|
|
|
3518
3992
|
}
|
|
3519
3993
|
});
|
|
3520
3994
|
}
|
|
3521
|
-
const
|
|
3522
|
-
|
|
3523
|
-
|
|
3524
|
-
|
|
3525
|
-
|
|
3526
|
-
|
|
3527
|
-
|
|
3528
|
-
|
|
3529
|
-
|
|
3530
|
-
|
|
3531
|
-
|
|
3532
|
-
|
|
3533
|
-
|
|
3534
|
-
|
|
3535
|
-
|
|
3536
|
-
|
|
3537
|
-
|
|
3538
|
-
}
|
|
3539
|
-
|
|
3540
|
-
|
|
3541
|
-
|
|
3542
|
-
|
|
3543
|
-
|
|
3544
|
-
}
|
|
3545
|
-
function getStatusDescription(status) {
|
|
3546
|
-
const descriptions = {
|
|
3547
|
-
[
|
|
3548
|
-
"not_indexed"
|
|
3549
|
-
/* NOT_INDEXED */
|
|
3550
|
-
]: "Version created but not yet indexed",
|
|
3551
|
-
[
|
|
3552
|
-
"queued"
|
|
3553
|
-
/* QUEUED */
|
|
3554
|
-
]: "Waiting in queue for indexing",
|
|
3555
|
-
[
|
|
3556
|
-
"running"
|
|
3557
|
-
/* RUNNING */
|
|
3558
|
-
]: "Currently being indexed",
|
|
3559
|
-
[
|
|
3560
|
-
"completed"
|
|
3561
|
-
/* COMPLETED */
|
|
3562
|
-
]: "Successfully indexed",
|
|
3563
|
-
[
|
|
3564
|
-
"failed"
|
|
3565
|
-
/* FAILED */
|
|
3566
|
-
]: "Indexing failed",
|
|
3567
|
-
[
|
|
3568
|
-
"cancelled"
|
|
3569
|
-
/* CANCELLED */
|
|
3570
|
-
]: "Indexing was cancelled",
|
|
3571
|
-
[
|
|
3572
|
-
"updating"
|
|
3573
|
-
/* UPDATING */
|
|
3574
|
-
]: "Re-indexing in progress"
|
|
3575
|
-
};
|
|
3576
|
-
return descriptions[status] || "Unknown status";
|
|
3577
|
-
}
|
|
3578
|
-
function isActiveStatus(status) {
|
|
3579
|
-
return [
|
|
3580
|
-
"queued",
|
|
3581
|
-
"running",
|
|
3582
|
-
"updating"
|
|
3583
|
-
/* UPDATING */
|
|
3584
|
-
].includes(
|
|
3585
|
-
status
|
|
3586
|
-
);
|
|
3587
|
-
}
|
|
3588
|
-
const VersionBadge = ({ version: version2 }) => {
|
|
3589
|
-
if (!version2) {
|
|
3590
|
-
return null;
|
|
3591
|
-
}
|
|
3592
|
-
return /* @__PURE__ */ jsx("span", { class: "bg-purple-100 text-purple-800 text-xs font-medium me-2 px-1.5 py-0.5 rounded dark:bg-purple-900 dark:text-purple-300", children: /* @__PURE__ */ jsx("span", { safe: true, children: version2 }) });
|
|
3593
|
-
};
|
|
3594
|
-
function getStatusClasses(status) {
|
|
3595
|
-
const baseClasses = "px-1.5 py-0.5 text-xs font-medium rounded";
|
|
3596
|
-
switch (status) {
|
|
3597
|
-
case VersionStatus.COMPLETED:
|
|
3598
|
-
return `${baseClasses} bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-300`;
|
|
3599
|
-
case VersionStatus.RUNNING:
|
|
3600
|
-
case VersionStatus.UPDATING:
|
|
3601
|
-
return `${baseClasses} bg-blue-100 text-blue-800 dark:bg-blue-900 dark:text-blue-300`;
|
|
3602
|
-
case VersionStatus.QUEUED:
|
|
3603
|
-
return `${baseClasses} bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-yellow-300`;
|
|
3604
|
-
case VersionStatus.FAILED:
|
|
3605
|
-
return `${baseClasses} bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-300`;
|
|
3606
|
-
case VersionStatus.CANCELLED:
|
|
3607
|
-
return `${baseClasses} bg-gray-100 text-gray-800 dark:bg-gray-900 dark:text-gray-300`;
|
|
3608
|
-
case VersionStatus.NOT_INDEXED:
|
|
3609
|
-
default:
|
|
3610
|
-
return `${baseClasses} bg-gray-100 text-gray-600 dark:bg-gray-800 dark:text-gray-400`;
|
|
3611
|
-
}
|
|
3995
|
+
const VersionBadge = ({ version: version2 }) => {
|
|
3996
|
+
if (!version2) {
|
|
3997
|
+
return null;
|
|
3998
|
+
}
|
|
3999
|
+
return /* @__PURE__ */ jsx("span", { class: "bg-purple-100 text-purple-800 text-xs font-medium me-2 px-1.5 py-0.5 rounded dark:bg-purple-900 dark:text-purple-300", children: /* @__PURE__ */ jsx("span", { safe: true, children: version2 }) });
|
|
4000
|
+
};
|
|
4001
|
+
function getStatusClasses(status) {
|
|
4002
|
+
const baseClasses = "px-1.5 py-0.5 text-xs font-medium rounded";
|
|
4003
|
+
switch (status) {
|
|
4004
|
+
case VersionStatus.COMPLETED:
|
|
4005
|
+
return `${baseClasses} bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-300`;
|
|
4006
|
+
case VersionStatus.RUNNING:
|
|
4007
|
+
case VersionStatus.UPDATING:
|
|
4008
|
+
return `${baseClasses} bg-blue-100 text-blue-800 dark:bg-blue-900 dark:text-blue-300`;
|
|
4009
|
+
case VersionStatus.QUEUED:
|
|
4010
|
+
return `${baseClasses} bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-yellow-300`;
|
|
4011
|
+
case VersionStatus.FAILED:
|
|
4012
|
+
return `${baseClasses} bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-300`;
|
|
4013
|
+
case VersionStatus.CANCELLED:
|
|
4014
|
+
return `${baseClasses} bg-gray-100 text-gray-800 dark:bg-gray-900 dark:text-gray-300`;
|
|
4015
|
+
case VersionStatus.NOT_INDEXED:
|
|
4016
|
+
default:
|
|
4017
|
+
return `${baseClasses} bg-gray-100 text-gray-600 dark:bg-gray-800 dark:text-gray-400`;
|
|
4018
|
+
}
|
|
3612
4019
|
}
|
|
3613
4020
|
const StatusBadge = ({ status, showDescription = true }) => /* @__PURE__ */ jsx("span", { class: getStatusClasses(status), children: showDescription ? getStatusDescription(status) : status });
|
|
3614
4021
|
const ProgressBar = ({ progress, showText = true }) => {
|
|
@@ -4940,16 +5347,12 @@ async function registerWorkerService(pipeline) {
|
|
|
4940
5347
|
logger.warn(
|
|
4941
5348
|
`⚠️ Job ${job.id} error ${document ? `on document ${document.metadata.url}` : ""}: ${error.message}`
|
|
4942
5349
|
);
|
|
4943
|
-
|
|
4944
|
-
analytics.track(TelemetryEvent.ERROR_OCCURRED, {
|
|
5350
|
+
analytics.captureException(error, {
|
|
4945
5351
|
jobId: job.id,
|
|
4946
5352
|
// Job IDs are already anonymous
|
|
4947
5353
|
library: job.library,
|
|
4948
|
-
errorType: errorInfo.type,
|
|
4949
|
-
errorMessage: errorInfo.message,
|
|
4950
5354
|
hasDocument: !!document,
|
|
4951
5355
|
stage: document ? "document_processing" : "job_setup",
|
|
4952
|
-
hasStack: errorInfo.hasStack,
|
|
4953
5356
|
pages_processed_before_error: job.progressPages || 0
|
|
4954
5357
|
});
|
|
4955
5358
|
}
|
|
@@ -5028,13 +5431,13 @@ class AppServer {
|
|
|
5028
5431
|
try {
|
|
5029
5432
|
telemetryService.startSession({
|
|
5030
5433
|
sessionId: crypto.randomUUID(),
|
|
5031
|
-
|
|
5434
|
+
appInterface: "web",
|
|
5032
5435
|
startTime: /* @__PURE__ */ new Date(),
|
|
5033
|
-
|
|
5034
|
-
|
|
5035
|
-
|
|
5036
|
-
|
|
5037
|
-
|
|
5436
|
+
appVersion: process.env.npm_package_version || "unknown",
|
|
5437
|
+
appPlatform: process.platform,
|
|
5438
|
+
appServicesEnabled: this.getActiveServicesList(),
|
|
5439
|
+
appAuthEnabled: Boolean(this.config.auth),
|
|
5440
|
+
appReadOnly: Boolean(this.config.readOnly)
|
|
5038
5441
|
});
|
|
5039
5442
|
} catch (error) {
|
|
5040
5443
|
logger.debug(`Failed to initialize telemetry: ${error}`);
|
|
@@ -5111,11 +5514,10 @@ class AppServer {
|
|
|
5111
5514
|
process.on("unhandledRejection", (reason) => {
|
|
5112
5515
|
logger.error(`Unhandled Promise Rejection: ${reason}`);
|
|
5113
5516
|
if (analytics.isEnabled()) {
|
|
5114
|
-
|
|
5115
|
-
|
|
5517
|
+
const error = reason instanceof Error ? reason : new Error(String(reason));
|
|
5518
|
+
analytics.captureException(error, {
|
|
5116
5519
|
error_category: "system",
|
|
5117
|
-
component:
|
|
5118
|
-
severity: "critical",
|
|
5520
|
+
component: AppServer.constructor.name,
|
|
5119
5521
|
context: "process_unhandled_rejection"
|
|
5120
5522
|
});
|
|
5121
5523
|
}
|
|
@@ -5125,11 +5527,9 @@ class AppServer {
|
|
|
5125
5527
|
process.on("uncaughtException", (error) => {
|
|
5126
5528
|
logger.error(`Uncaught Exception: ${error.message}`);
|
|
5127
5529
|
if (analytics.isEnabled()) {
|
|
5128
|
-
analytics.
|
|
5129
|
-
error_type: error.constructor.name,
|
|
5530
|
+
analytics.captureException(error, {
|
|
5130
5531
|
error_category: "system",
|
|
5131
|
-
component:
|
|
5132
|
-
severity: "critical",
|
|
5532
|
+
component: AppServer.constructor.name,
|
|
5133
5533
|
context: "process_uncaught_exception"
|
|
5134
5534
|
});
|
|
5135
5535
|
}
|
|
@@ -5138,11 +5538,9 @@ class AppServer {
|
|
|
5138
5538
|
if (typeof this.server.setErrorHandler === "function") {
|
|
5139
5539
|
this.server.setErrorHandler(async (error, request, reply) => {
|
|
5140
5540
|
if (analytics.isEnabled()) {
|
|
5141
|
-
analytics.
|
|
5142
|
-
error_type: error.constructor.name,
|
|
5541
|
+
analytics.captureException(error, {
|
|
5143
5542
|
error_category: "http",
|
|
5144
5543
|
component: "FastifyServer",
|
|
5145
|
-
severity: "high",
|
|
5146
5544
|
status_code: error.statusCode || 500,
|
|
5147
5545
|
method: request.method,
|
|
5148
5546
|
route: request.routeOptions?.url || request.url,
|
|
@@ -5274,195 +5672,2564 @@ class AppServer {
|
|
|
5274
5672
|
logger.debug("OAuth2 proxy endpoints registered");
|
|
5275
5673
|
}
|
|
5276
5674
|
/**
|
|
5277
|
-
* Log startup information showing which services are enabled.
|
|
5675
|
+
* Log startup information showing which services are enabled.
|
|
5676
|
+
*/
|
|
5677
|
+
logStartupInfo(address) {
|
|
5678
|
+
logger.info(`🚀 AppServer available at ${address}`);
|
|
5679
|
+
const enabledServices = [];
|
|
5680
|
+
if (this.config.enableWebInterface) {
|
|
5681
|
+
enabledServices.push(`Web interface: ${address}`);
|
|
5682
|
+
}
|
|
5683
|
+
if (this.config.enableMcpServer) {
|
|
5684
|
+
enabledServices.push(`MCP endpoints: ${address}/mcp, ${address}/sse`);
|
|
5685
|
+
}
|
|
5686
|
+
if (this.config.enableApiServer) {
|
|
5687
|
+
enabledServices.push(`API: ${address}/api`);
|
|
5688
|
+
}
|
|
5689
|
+
if (this.config.enableWorker) {
|
|
5690
|
+
enabledServices.push("Embedded worker: enabled");
|
|
5691
|
+
} else if (this.config.externalWorkerUrl) {
|
|
5692
|
+
enabledServices.push(`External worker: ${this.config.externalWorkerUrl}`);
|
|
5693
|
+
}
|
|
5694
|
+
for (const service of enabledServices) {
|
|
5695
|
+
logger.info(` • ${service}`);
|
|
5696
|
+
}
|
|
5697
|
+
}
|
|
5698
|
+
}
|
|
5699
|
+
/**
 * Builds an AppServer from the given collaborators, starts it and hands the
 * running instance back to the caller.
 *
 * @param docService - Passed through as the first AppServer constructor argument.
 * @param pipeline - Passed through as the second AppServer constructor argument.
 * @param config - Passed through as the third AppServer constructor argument.
 * @returns {Promise<AppServer>} The server, after `start()` has resolved.
 */
async function startAppServer(docService, pipeline, config) {
  const server = new AppServer(docService, pipeline, config);
  await server.start();
  return server;
}
|
|
5704
|
+
/**
 * Creates an MCP server instance and connects it to a stdio transport.
 *
 * The log level is lowered to ERROR before anything else happens.
 *
 * @param tools - Forwarded to `createMcpServerInstance`.
 * @param {boolean} [readOnly=false] - Forwarded to `createMcpServerInstance`.
 * @returns The connected MCP server instance.
 */
async function startStdioServer(tools, readOnly = false) {
  setLogLevel(LogLevel.ERROR);
  const mcpServer = createMcpServerInstance(tools, readOnly);
  const stdioTransport = new StdioServerTransport();
  await mcpServer.connect(stdioTransport);
  logger.info("🤖 MCP server listening on stdio");
  return mcpServer;
}
|
|
5712
|
+
/**
 * Thin tRPC-backed client exposing document management operations of a remote
 * server. Every method forwards its arguments to the tRPC procedure of the
 * matching name on the proxy client.
 */
class DocumentManagementClient {
  baseUrl;
  client;

  /**
   * @param {string} serverUrl - Server endpoint; a single trailing slash is removed.
   */
  constructor(serverUrl) {
    const normalizedUrl = serverUrl.replace(/\/$/, "");
    this.baseUrl = normalizedUrl;
    const link = httpBatchLink({ url: normalizedUrl });
    this.client = createTRPCProxyClient({ links: [link] });
    logger.debug(`DocumentManagementClient (tRPC) created for: ${normalizedUrl}`);
  }

  /** Issues a `ping` query; rejects if that query fails. */
  async initialize() {
    await this.client.ping.query();
  }

  /** Nothing to release on the client side. */
  async shutdown() {
  }

  /** @returns The result of the `listLibraries` query. */
  async listLibraries() {
    return this.client.listLibraries.query();
  }

  /** Resolves with no value once the `validateLibraryExists` call succeeds. */
  async validateLibraryExists(library) {
    const input = { library };
    await this.client.validateLibraryExists.mutate(input);
  }

  /** @returns The result of the `findBestVersion` query. */
  async findBestVersion(library, targetVersion) {
    const input = { library, targetVersion };
    return this.client.findBestVersion.query(input);
  }

  /** Runs the `search` query; a nullish version is sent as `null`. */
  async searchStore(library, version2, query, limit) {
    const input = { library, version: version2 ?? null, query, limit };
    return this.client.search.query(input);
  }

  /** Calls `removeVersion` with the version exactly as given. */
  async removeVersion(library, version2) {
    const input = { library, version: version2 };
    await this.client.removeVersion.mutate(input);
  }

  /** Calls `removeAllDocuments`; a nullish version is sent as `null`. */
  async removeAllDocuments(library, version2) {
    const input = { library, version: version2 ?? null };
    await this.client.removeAllDocuments.mutate(input);
  }

  /** @returns The result of the `getVersionsByStatus` query. */
  async getVersionsByStatus(statuses) {
    const input = { statuses };
    return this.client.getVersionsByStatus.query(input);
  }

  /** @returns The result of the `findVersionsBySourceUrl` query. */
  async findVersionsBySourceUrl(url) {
    const input = { url };
    return this.client.findVersionsBySourceUrl.query(input);
  }

  /** @returns The result of the `getScraperOptions` query. */
  async getScraperOptions(versionId) {
    const input = { versionId };
    return this.client.getScraperOptions.query(input);
  }

  /** Calls `updateVersionStatus`; resolves with no value. */
  async updateVersionStatus(versionId, status, errorMessage) {
    const input = { versionId, status, errorMessage };
    await this.client.updateVersionStatus.mutate(input);
  }

  /** Calls `updateVersionProgress`; resolves with no value. */
  async updateVersionProgress(versionId, pages, maxPages) {
    const input = { versionId, pages, maxPages };
    await this.client.updateVersionProgress.mutate(input);
  }

  /** Calls `storeScraperOptions`; resolves with no value. */
  async storeScraperOptions(versionId, options) {
    const input = { versionId, options };
    await this.client.storeScraperOptions.mutate(input);
  }
}
|
|
5766
|
+
/** Common base type for every error raised by the content splitters. */
class SplitterError extends Error {
}

/**
 * Raised when a piece of content cannot be reduced below the allowed chunk size.
 */
class MinimumChunkSizeError extends SplitterError {
  /**
   * @param {number} size - Smallest chunk size the content requires.
   * @param {number} maxSize - Largest chunk size that is permitted.
   */
  constructor(size, maxSize) {
    const message = `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`;
    super(message);
  }
}

/** Generic splitter failure that is not a size-limit problem. */
class ContentSplitterError extends SplitterError {
}
|
|
5777
|
+
/**
 * Wraps a base splitter and greedily glues its (typically small) chunks back
 * together so that the final chunks approach a preferred size.
 */
class GreedySplitter {
  baseSplitter;
  minChunkSize;
  preferredChunkSize;

  /**
   * @param baseSplitter - Splitter whose `splitText` produces the initial chunks.
   * @param {number} minChunkSize - Below this content length a chunk keeps
   *   absorbing followers even across H1/H2 boundaries.
   * @param {number} preferredChunkSize - Upper bound for the summed content
   *   length of two chunks that may be merged.
   */
  constructor(baseSplitter, minChunkSize, preferredChunkSize) {
    this.baseSplitter = baseSplitter;
    this.minChunkSize = minChunkSize;
    this.preferredChunkSize = preferredChunkSize;
  }

  /**
   * Splits with the base splitter, then walks the result once, merging each
   * chunk into the pending one unless that would exceed the preferred size or
   * the next chunk opens a major section while the pending chunk is already
   * large enough.
   *
   * @param {string} markdown
   * @returns {Promise<Array>} The concatenated chunks, in document order.
   */
  async splitText(markdown) {
    const pieces = await this.baseSplitter.splitText(markdown);
    const merged = [];
    let pending = null;
    for (const piece of pieces) {
      if (!pending) {
        pending = this.cloneChunk(piece);
        continue;
      }
      const mustFlush =
        this.wouldExceedMaxSize(pending, piece) ||
        (pending.content.length >= this.minChunkSize && this.startsNewMajorSection(piece));
      if (mustFlush) {
        merged.push(pending);
        pending = this.cloneChunk(piece);
        continue;
      }
      pending.content += `\n${piece.content}`;
      pending.section = this.mergeSectionInfo(pending, piece);
      pending.types = this.mergeTypes(pending.types, piece.types);
    }
    if (pending) {
      merged.push(pending);
    }
    return merged;
  }

  /**
   * Copies a chunk so that later in-place merging never touches the chunk
   * objects returned by the base splitter.
   */
  cloneChunk(chunk) {
    const { types, content, section } = chunk;
    return {
      types: [...types],
      content,
      section: {
        level: section.level,
        path: [...section.path]
      }
    };
  }

  /**
   * A chunk whose section level is 1 or 2 marks a major break in the document.
   */
  startsNewMajorSection(chunk) {
    const { level } = chunk.section;
    return level === 1 || level === 2;
  }

  /**
   * Tells whether the combined content length of both chunks is larger than
   * the preferred chunk size. A missing current chunk never exceeds it.
   */
  wouldExceedMaxSize(currentChunk, nextChunk) {
    if (!currentChunk) {
      return false;
    }
    const combinedLength = currentChunk.content.length + nextChunk.content.length;
    return combinedLength > this.preferredChunkSize;
  }

  /**
   * True when `parentPath` is a strict prefix of `childPath`.
   */
  isPathIncluded(parentPath, childPath) {
    const isShorter = parentPath.length < childPath.length;
    return isShorter && parentPath.every((segment, index) => segment === childPath[index]);
  }

  /**
   * Computes the section metadata of a merged chunk:
   * - identical level and path: the current section object is reused;
   * - one path is a strict prefix of the other: the longer (child) path wins;
   * - otherwise: the longest common prefix of both paths (possibly empty).
   * Except in the identical case, the level is the smaller of the two levels.
   */
  mergeSectionInfo(currentChunk, nextChunk) {
    const current = currentChunk.section;
    const next = nextChunk.section;
    const level = Math.min(current.level, next.level);
    const isSameSection =
      current.level === next.level &&
      current.path.length === next.path.length &&
      current.path.every((segment, index) => segment === next.path[index]);
    if (isSameSection) {
      return current;
    }
    if (this.isPathIncluded(current.path, next.path)) {
      return { path: next.path, level };
    }
    if (this.isPathIncluded(next.path, current.path)) {
      return { path: current.path, level };
    }
    return { path: this.findCommonPrefix(current.path, next.path), level };
  }

  /**
   * Union of both type lists, first occurrence order preserved.
   */
  mergeTypes(currentTypes, nextTypes) {
    const unique = /* @__PURE__ */ new Set(currentTypes);
    for (const type of nextTypes) {
      unique.add(type);
    }
    return [...unique];
  }

  /**
   * Longest run of leading segments that both paths share.
   */
  findCommonPrefix(path1, path2) {
    const limit = Math.min(path1.length, path2.length);
    const shared = [];
    let index = 0;
    while (index < limit && path1[index] === path2[index]) {
      shared.push(path1[index]);
      index += 1;
    }
    return shared;
  }
}
|
|
5912
|
+
/**
 * Removes all leading and trailing whitespace (spaces, tabs, carriage returns,
 * newlines and the other characters matched by `\s`) from a string.
 *
 * @param {string} str
 * @returns {string}
 */
const fullTrim = (str) => str.trim();
|
|
5915
|
+
/**
 * Splits a fenced markdown code block into several smaller fenced blocks,
 * breaking only at line boundaries and repeating the language tag on each one.
 */
class CodeContentSplitter {
  /**
   * @param {{ chunkSize: number }} options - `chunkSize` is the maximum length
   *   of one emitted chunk, fence lines included.
   */
  constructor(options) {
    this.options = options;
  }

  /**
   * @param {string} content - A code block, normally starting with an opening
   *   fence line and ending with a closing fence.
   * @returns {Promise<string[]>} Fenced chunks, each at most `chunkSize` long.
   * @throws {MinimumChunkSizeError} When a single wrapped line is already
   *   larger than `chunkSize`.
   */
  async split(content) {
    // The language is the first token of the info string. It may contain
    // non-word characters ("c++", "c#", "objective-c") and may be followed by
    // extra attributes; the whole opening line is removed in every case so it
    // can never end up nested inside the re-wrapped chunks.
    const openingFence = /^```([^\s`]*)[^\n]*\n/.exec(content);
    const language = openingFence?.[1] || undefined;
    const withoutOpening = openingFence ? content.slice(openingFence[0].length) : content;
    const strippedContent = withoutOpening.replace(/```\s*$/, "");
    const { chunkSize } = this.options;
    const lines = strippedContent.split("\n");
    const chunks = [];
    let currentChunkLines = [];
    for (const line of lines) {
      const singleLineSize = this.wrap(line, language).length;
      if (singleLineSize > chunkSize) {
        throw new MinimumChunkSizeError(singleLineSize, chunkSize);
      }
      currentChunkLines.push(line);
      const candidateSize = this.wrap(currentChunkLines.join("\n"), language).length;
      if (candidateSize > chunkSize && currentChunkLines.length > 1) {
        // The line just added overflows the chunk: emit everything before it
        // and start the next chunk with that line.
        const overflowLine = currentChunkLines.pop();
        chunks.push(this.wrap(currentChunkLines.join("\n"), language));
        currentChunkLines = [overflowLine];
      }
    }
    if (currentChunkLines.length > 0) {
      chunks.push(this.wrap(currentChunkLines.join("\n"), language));
    }
    return chunks;
  }

  /**
   * Surrounds code with fence lines; trailing newlines of the code are dropped.
   */
  wrap(content, language) {
    return `\`\`\`${language || ""}\n${content.replace(/\n+$/, "")}\n\`\`\``;
  }
}
|
|
5950
|
+
/**
 * Splits a markdown table into several smaller tables, each repeating the
 * header and separator rows.
 */
class TableContentSplitter {
  /**
   * @param {{ chunkSize: number }} options - `chunkSize` is the maximum length
   *   of one emitted table chunk, header rows included.
   */
  constructor(options) {
    this.options = options;
  }

  /**
   * Splits table content into chunks while preserving table structure.
   *
   * @param {string} content - Markdown table text.
   * @returns {Promise<string[]>} Table chunks; the input is returned unchanged
   *   as a single chunk when it cannot be parsed as a table.
   * @throws {MinimumChunkSizeError} When header plus a single row is already
   *   larger than `chunkSize`.
   */
  async split(content) {
    const parsedTable = this.parseTable(content);
    if (!parsedTable) {
      return [content];
    }
    const { headers, rows } = parsedTable;
    const { chunkSize } = this.options;
    const chunks = [];
    let currentRows = [];
    for (const row of rows) {
      const singleRowSize = this.wrap(row, headers).length;
      if (singleRowSize > chunkSize) {
        throw new MinimumChunkSizeError(singleRowSize, chunkSize);
      }
      const candidateSize = this.wrap([...currentRows, row].join("\n"), headers).length;
      if (candidateSize > chunkSize && currentRows.length > 0) {
        // Adding this row would overflow: close the chunk and start a new one.
        chunks.push(this.wrap(currentRows.join("\n"), headers));
        currentRows = [row];
      } else {
        currentRows.push(row);
      }
    }
    if (currentRows.length > 0) {
      chunks.push(this.wrap(currentRows.join("\n"), headers));
    }
    return chunks;
  }

  /**
   * Prepends a freshly built header row and separator row to the given rows.
   */
  wrap(content, headers) {
    const headerRow = `| ${headers.join(" | ")} |`;
    const separatorRow = `|${headers.map(() => "---").join("|")}|`;
    return [headerRow, separatorRow, content].join("\n");
  }

  /**
   * Parses table text into header cells, the separator line and the raw data
   * rows. Returns null when the text has fewer than three lines, the first
   * line has no pipe, or the second line is not a valid separator.
   */
  parseTable(content) {
    const lines = content.trim().split("\n");
    if (lines.length < 3) return null;
    const headers = this.parseRow(lines[0]);
    if (!headers) return null;
    const separator = lines[1];
    if (!this.isValidSeparator(separator)) return null;
    const rows = lines.slice(2).filter((row) => row.trim() !== "");
    return { headers, separator, rows };
  }

  /**
   * Parses a table row into trimmed cells.
   *
   * Only the optional outer pipes are discarded. Empty cells inside the row
   * are kept, so the number of parsed cells always matches the number of
   * columns; dropping them would make rebuilt headers narrower than the rows.
   *
   * @returns {string[] | null} The cells, or null when the row has no pipe.
   */
  parseRow(row) {
    if (!row.includes("|")) return null;
    const inner = row.trim().replace(/^\|/, "").replace(/\|$/, "");
    return inner.split("|").map((cell) => cell.trim());
  }

  /**
   * Validates the separator row of the table: it must contain a pipe and
   * consist only of whitespace, hyphens and pipes.
   */
  isValidSeparator(separator) {
    return separator.includes("|") && /^\|?[\s\-|]+\|?$/.test(separator);
  }
}
|
|
6013
|
+
/**
 * Splits plain text into chunks no longer than `options.chunkSize`, preferring
 * the coarsest boundary that works: paragraphs, then lines, then words.
 */
class TextContentSplitter {
  /**
   * @param {{ chunkSize: number }} options - `chunkSize` is the maximum chunk length.
   */
  constructor(options) {
    this.options = options;
  }

  /**
   * @param {string} content - Text to split; surrounding whitespace is ignored.
   * @returns {Promise<string[]>} The chunks.
   * @throws {MinimumChunkSizeError} When a single word is longer than `chunkSize`.
   */
  async split(content) {
    const text = fullTrim(content);
    if (text.length <= this.options.chunkSize) {
      return [text];
    }

    // A word that is too long on its own can never fit into any chunk.
    let longestWord = "";
    for (const word of text.split(/\s+/)) {
      if (word.length > longestWord.length) {
        longestWord = word;
      }
    }
    if (longestWord.length > this.options.chunkSize) {
      throw new MinimumChunkSizeError(longestWord.length, this.options.chunkSize);
    }

    const byParagraph = this.splitByParagraphs(text);
    if (this.areChunksValid(byParagraph)) {
      return byParagraph;
    }

    const byLine = this.splitByLines(text);
    if (this.areChunksValid(byLine)) {
      return this.mergeChunks(byLine, "\n");
    }

    const byWord = await this.splitByWords(text);
    return this.mergeChunks(byWord, " ");
  }

  /**
   * True when every chunk fits into `chunkSize`.
   */
  areChunksValid(chunks) {
    for (const chunk of chunks) {
      if (!(chunk.length <= this.options.chunkSize)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Cuts at blank lines, trims each piece and keeps only pieces longer than
   * two characters.
   */
  splitByParagraphs(text) {
    const kept = [];
    for (const rawParagraph of text.split(/\n\s*\n/)) {
      const paragraph = fullTrim(rawParagraph);
      if (paragraph.length > 2) {
        kept.push(paragraph);
      }
    }
    return kept;
  }

  /**
   * Cuts at newlines, trims each line and keeps only lines longer than one
   * character.
   */
  splitByLines(text) {
    const kept = [];
    for (const rawLine of text.split(/\n/)) {
      const line = fullTrim(rawLine);
      if (line.length > 1) {
        kept.push(line);
      }
    }
    return kept;
  }

  /**
   * Last resort: delegates to RecursiveCharacterTextSplitter with the
   * configured chunk size and no overlap.
   */
  async splitByWords(text) {
    const { chunkSize } = this.options;
    const wordSplitter = new RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap: 0 });
    return await wordSplitter.splitText(text);
  }

  /**
   * Joins neighbouring chunks with `separator` for as long as the joined text
   * still fits into `chunkSize`, to reduce fragmentation.
   */
  mergeChunks(chunks, separator) {
    const merged = [];
    let pending = null;
    for (const piece of chunks) {
      if (pending === null) {
        pending = piece;
        continue;
      }
      const joinedSize = this.getChunkSize(pending) + this.getChunkSize(piece) + separator.length;
      if (joinedSize <= this.options.chunkSize) {
        pending = `${pending}${separator}${piece}`;
      } else {
        merged.push(pending);
        pending = piece;
      }
    }
    if (pending) {
      merged.push(pending);
    }
    return merged;
  }

  /** Size of a chunk is its string length. */
  getChunkSize(chunk) {
    return chunk.length;
  }

  /** Text chunks need no wrapping; the content is returned as is. */
  wrap(content) {
    return content;
  }
}
|
|
6108
|
+
/**
 * Splits markdown along its semantic structure: the document is rendered to
 * HTML, walked element by element into heading / code / table / text sections,
 * and each section is then cut to size by a type-specific content splitter.
 */
class SemanticMarkdownSplitter {
  turndownService;
  textSplitter;
  codeSplitter;
  tableSplitter;

  /**
   * @param {number} preferredChunkSize - Chunk size handed to the text splitter.
   * @param {number} maxChunkSize - Chunk size handed to the code and table
   *   splitters and to the character-level fallback splitter.
   */
  constructor(preferredChunkSize, maxChunkSize) {
    this.preferredChunkSize = preferredChunkSize;
    this.maxChunkSize = maxChunkSize;
    this.turndownService = new TurndownService({
      headingStyle: "atx",
      hr: "---",
      bulletListMarker: "-",
      codeBlockStyle: "fenced",
      emDelimiter: "_",
      strongDelimiter: "**",
      linkStyle: "inlined"
    });
    // Custom rule: render <table> elements as pipe tables built from the text
    // content of their <th> and <td> cells.
    this.turndownService.addRule("table", {
      filter: ["table"],
      replacement: (_content, node) => {
        const table = node;
        const headers = Array.from(table.querySelectorAll("th")).map(
          (th) => th.textContent?.trim() || ""
        );
        const rows = Array.from(table.querySelectorAll("tr")).filter(
          (tr) => !tr.querySelector("th")
        );
        if (headers.length === 0 && rows.length === 0) return "";
        let markdown = "\n";
        if (headers.length > 0) {
          markdown += `| ${headers.join(" | ")} |\n`;
          markdown += `|${headers.map(() => "---").join("|")}|\n`;
        }
        for (const row of rows) {
          const cells = Array.from(row.querySelectorAll("td")).map(
            (td) => td.textContent?.trim() || ""
          );
          markdown += `| ${cells.join(" | ")} |\n`;
        }
        return markdown;
      }
    });
    this.textSplitter = new TextContentSplitter({
      chunkSize: this.preferredChunkSize
    });
    this.codeSplitter = new CodeContentSplitter({
      chunkSize: this.maxChunkSize
    });
    this.tableSplitter = new TableContentSplitter({
      chunkSize: this.maxChunkSize
    });
  }

  /**
   * Main entry point for splitting markdown content.
   *
   * @param {string} markdown
   * @returns {Promise<Array<{types: string[], content: string, section: {level: number, path: string[]}}>>}
   */
  async splitText(markdown) {
    const html = await this.markdownToHtml(markdown);
    const dom = await this.parseHtml(html);
    const sections = await this.splitIntoSections(dom);
    return this.splitSectionContent(sections);
  }

  /**
   * Step 1: Split document into sections based on H1-H6 headings,
   * as well as code blocks and tables.
   *
   * Only direct children of <body> are inspected. Every heading opens a new
   * section whose path is the chain of enclosing heading titles; any other
   * element becomes its own section that inherits level and path from the
   * section before it.
   *
   * @throws {Error} When the document has no <body> element.
   */
  async splitIntoSections(dom) {
    const body = dom.querySelector("body");
    if (!body) {
      throw new Error("Invalid HTML structure: no body element found");
    }
    let currentSection = this.createRootSection();
    const sections = [];
    // Stack of open heading sections; index 0 is always the root section.
    const stack = [currentSection];
    for (const element of Array.from(body.children)) {
      // Anchored on purpose: only real H1..H6 tags are headings, not any tag
      // whose name merely contains such a sequence.
      const headingMatch = /^H([1-6])$/.exec(element.tagName);
      if (headingMatch) {
        const level = Number.parseInt(headingMatch[1], 10);
        const title = fullTrim(element.textContent || "");
        // Close every open heading of the same or a deeper level.
        while (stack.length > 1 && stack[stack.length - 1].level >= level) {
          stack.pop();
        }
        const ancestorTitles = [];
        for (const ancestor of stack.slice(1)) {
          const lastTitle = ancestor.path[ancestor.path.length - 1];
          if (lastTitle) ancestorTitles.push(lastTitle);
        }
        currentSection = {
          level,
          path: [...ancestorTitles, title],
          content: [
            {
              type: "heading",
              text: `${"#".repeat(level)} ${title}`
            }
          ]
        };
        sections.push(currentSection);
        stack.push(currentSection);
      } else if (element.tagName === "PRE") {
        const code = element.querySelector("code");
        const language = code?.className.replace("language-", "") || "";
        const content = code?.textContent || element.textContent || "";
        const fence = "```";
        const markdown = `${fence}${language}\n${content}\n${fence}`;
        currentSection = this.createContentSection(currentSection, "code", markdown);
        sections.push(currentSection);
      } else if (element.tagName === "TABLE") {
        const markdown = fullTrim(this.turndownService.turndown(element.outerHTML));
        currentSection = this.createContentSection(currentSection, "table", markdown);
        sections.push(currentSection);
      } else {
        const markdown = fullTrim(this.turndownService.turndown(element.innerHTML));
        // Elements that render to nothing do not produce a section.
        if (markdown) {
          currentSection = this.createContentSection(currentSection, "text", markdown);
          sections.push(currentSection);
        }
      }
    }
    return sections;
  }

  /**
   * Step 2: Split section content into smaller chunks.
   *
   * When a type-specific splitter reports that the content cannot be made
   * small enough, a character-level splitter takes over. Any other failure is
   * rethrown as a ContentSplitterError carrying the original error as `cause`.
   */
  async splitSectionContent(sections) {
    const chunks = [];
    for (const section of sections) {
      for (const content of section.content) {
        let splitContent = [];
        try {
          switch (content.type) {
            case "heading":
            case "text": {
              splitContent = await this.textSplitter.split(content.text);
              break;
            }
            case "code": {
              splitContent = await this.codeSplitter.split(content.text);
              break;
            }
            case "table": {
              splitContent = await this.tableSplitter.split(content.text);
              break;
            }
          }
        } catch (err) {
          if (err instanceof MinimumChunkSizeError) {
            logger.warn(
              `⚠ Cannot split ${content.type} chunk normally, using RecursiveCharacterTextSplitter: ${err.message}`
            );
            splitContent = await this.splitOversizedContent(content.text);
          } else {
            const errMessage = err instanceof Error ? err.message : String(err);
            throw new ContentSplitterError(
              `Failed to split ${content.type} content: ${errMessage}`,
              { cause: err }
            );
          }
        }
        for (const text of splitContent) {
          chunks.push({
            types: [content.type],
            content: text,
            section: {
              level: section.level,
              path: section.path
            }
          });
        }
      }
    }
    return chunks;
  }

  /**
   * Fallback used when the regular splitters give up: cuts the text with a
   * RecursiveCharacterTextSplitter limited to `maxChunkSize`. If that splitter
   * yields nothing, the text truncated to `maxChunkSize` is the single chunk.
   */
  async splitOversizedContent(text) {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: this.maxChunkSize,
      chunkOverlap: Math.min(20, Math.floor(this.maxChunkSize * 0.1)),
      // Use more aggressive separators including empty string as last resort
      separators: [
        "\n\n",
        "\n",
        " ",
        "\t",
        ".",
        ",",
        ";",
        ":",
        "-",
        "(",
        ")",
        "[",
        "]",
        "{",
        "}",
        ""
      ]
    });
    const pieces = await splitter.splitText(text);
    return pieces.length === 0 ? [text.substring(0, this.maxChunkSize)] : pieces;
  }

  /**
   * Builds a single-item section that inherits level and path from `parent`.
   */
  createContentSection(parent, type, text) {
    return {
      level: parent.level,
      path: parent.path,
      content: [{ type, text }]
    };
  }

  /**
   * Helper to create the root section
   */
  createRootSection() {
    return {
      level: 0,
      path: [],
      content: []
    };
  }

  /**
   * Convert markdown to HTML using remark
   */
  async markdownToHtml(markdown) {
    const html = await unified().use(remarkParse).use(remarkGfm).use(remarkHtml).process(markdown);
    return `<!DOCTYPE html>
<html>
<body>
${String(html)}
</body>
</html>`;
  }

  /**
   * Parse HTML
   */
  async parseHtml(html) {
    const { window } = createJSDOM(html);
    return window.document;
  }
}
|
|
6372
|
+
// Default for the `childLimit` argument of
// DocumentRetrieverService.getRelatedChunkIds (passed to findChildChunks).
const CHILD_LIMIT = 5;
// Default for the `siblingLimit` argument of the same method (passed to both
// the preceding and the subsequent sibling lookups).
const SIBLING_LIMIT = 2;
|
|
6374
|
+
/**
 * Turns raw search hits into context-rich results.
 *
 * For every hit returned by the document store the service gathers the hit's
 * parent, neighbouring siblings and children, merges everything that shares a
 * URL, and produces one aggregated result per URL.
 */
class DocumentRetrieverService {
  documentStore;
  /**
   * @param documentStore Store used for the initial search and for all
   *   parent/sibling/child/by-id chunk lookups.
   */
  constructor(documentStore) {
    this.documentStore = documentStore;
  }
  /**
   * Collects all related chunk IDs for a given initial hit.
   *
   * The parent, preceding-sibling, child and subsequent-sibling lookups do not
   * depend on each other, so they are issued concurrently. IDs are still added
   * to the set in the fixed order hit, parent, preceding siblings, children,
   * subsequent siblings.
   *
   * @param library The library name.
   * @param version2 The (already normalized) library version.
   * @param doc The initial hit; `doc.id`, `doc.metadata.url` and `doc.metadata.score` are read.
   * @param siblingLimit Maximum number of siblings fetched in each direction.
   * @param childLimit Maximum number of child chunks fetched.
   * @returns An object with `url`, `hitId`, `relatedIds` (a Set) and `score`.
   */
  async getRelatedChunkIds(library, version2, doc, siblingLimit = SIBLING_LIMIT, childLimit = CHILD_LIMIT) {
    const id = doc.id;
    const { url, score } = doc.metadata;
    const [parent, precedingSiblings, childChunks, subsequentSiblings] = await Promise.all([
      this.documentStore.findParentChunk(library, version2, id),
      this.documentStore.findPrecedingSiblingChunks(library, version2, id, siblingLimit),
      this.documentStore.findChildChunks(library, version2, id, childLimit),
      this.documentStore.findSubsequentSiblingChunks(library, version2, id, siblingLimit)
    ]);
    const relatedIds = /* @__PURE__ */ new Set();
    relatedIds.add(id);
    if (parent) {
      relatedIds.add(parent.id);
    }
    for (const sib of precedingSiblings) {
      relatedIds.add(sib.id);
    }
    for (const child of childChunks) {
      relatedIds.add(child.id);
    }
    for (const sib of subsequentSiblings) {
      relatedIds.add(sib.id);
    }
    return { url, hitId: id, relatedIds, score };
  }
  /**
   * Groups related chunk info by URL, deduplicates IDs, and finds max score per URL.
   *
   * @param relatedInfos Objects as returned by `getRelatedChunkIds`.
   * @returns A Map from URL to `{ uniqueChunkIds: Set, maxScore }`, in order of
   *   first appearance of each URL.
   */
  groupAndPrepareFetch(relatedInfos) {
    const urlMap = /* @__PURE__ */ new Map();
    for (const info of relatedInfos) {
      let entry = urlMap.get(info.url);
      if (!entry) {
        entry = { uniqueChunkIds: /* @__PURE__ */ new Set(), maxScore: info.score };
        urlMap.set(info.url, entry);
      }
      for (const id of info.relatedIds) {
        entry.uniqueChunkIds.add(id);
      }
      if (info.score > entry.maxScore) {
        entry.maxScore = info.score;
      }
    }
    return urlMap;
  }
  /**
   * Finalizes the merged result for a URL group by fetching the chunks and
   * joining their content with blank lines.
   *
   * No sorting happens here: chunks are joined in whatever order
   * `documentStore.findChunksByIds` returns them.
   *
   * @param library The library name.
   * @param version2 The (already normalized) library version.
   * @param url The URL shared by all chunks in the group.
   * @param uniqueChunkIds Iterable of chunk IDs belonging to the group.
   * @param maxScore Highest score among the initial hits of the group.
   * @returns `{ url, content, score }` for the group.
   */
  async finalizeResult(library, version2, url, uniqueChunkIds, maxScore) {
    const ids = Array.from(uniqueChunkIds);
    const docs = await this.documentStore.findChunksByIds(library, version2, ids);
    const content = docs.map((d) => d.pageContent).join("\n\n");
    return {
      url,
      content,
      score: maxScore
    };
  }
  /**
   * Searches for documents and expands the context around the matches.
   *
   * @param library The library name.
   * @param version2 The library version; `null`/`undefined` is treated as the
   *   empty string, and the value is lower-cased before use.
   * @param query The search query.
   * @param limit Optional limit for the initial search results (defaults to 10).
   * @returns A promise for an array of `{ url, content, score }` objects, one
   *   per distinct URL, in order of first appearance among the initial hits.
   */
  async search(library, version2, query, limit) {
    const normalizedVersion = (version2 ?? "").toLowerCase();
    const initialResults = await this.documentStore.findByContent(
      library,
      normalizedVersion,
      query,
      limit ?? 10
    );
    const relatedInfos = await Promise.all(
      initialResults.map(
        (doc) => this.getRelatedChunkIds(library, normalizedVersion, doc)
      )
    );
    const urlMap = this.groupAndPrepareFetch(relatedInfos);
    // URL groups are independent; Promise.all keeps the Map's insertion order.
    return Promise.all(
      Array.from(
        urlMap,
        ([url, { uniqueChunkIds, maxScore }]) => this.finalizeResult(library, normalizedVersion, url, uniqueChunkIds, maxScore)
      )
    );
  }
}
|
|
6493
|
+
// Directory (<project root>/db/migrations) that applyMigrations scans for ".sql" migration files.
const MIGRATIONS_DIR = path.join(getProjectRoot(), "db", "migrations");
|
|
6494
|
+
// Name of the bookkeeping table that records, by filename, which migrations have been applied.
const MIGRATIONS_TABLE = "_schema_migrations";
|
|
6495
|
+
/**
 * Creates the migration bookkeeping table when it does not exist yet.
 *
 * Each row holds a migration `id` (primary key) and an `applied_at` timestamp
 * that defaults to the current time.
 *
 * @param db Database handle exposing `exec(sql)`.
 */
function ensureMigrationsTable(db) {
  const createTableSql = `
    CREATE TABLE IF NOT EXISTS ${MIGRATIONS_TABLE} (
      id TEXT PRIMARY KEY,
      applied_at DATETIME DEFAULT CURRENT_TIMESTAMP
    );
  `;
  db.exec(createTableSql);
}
|
|
6503
|
+
/**
 * Reads the IDs of every migration recorded in the bookkeeping table.
 *
 * @param db Database handle exposing `prepare(sql)`.
 * @returns A Set containing the `id` of each recorded migration.
 */
function getAppliedMigrations(db) {
  const appliedIds = new Set();
  for (const { id } of db.prepare(`SELECT id FROM ${MIGRATIONS_TABLE}`).all()) {
    appliedIds.add(id);
  }
  return appliedIds;
}
|
|
6508
|
+
/**
 * Applies every pending ".sql" migration from MIGRATIONS_DIR to the database.
 *
 * Steps, in order:
 *  1. Best-effort speed-oriented pragmas (journal and synchronous off, larger
 *     mmap/cache, in-memory temp store); a failure here is only logged.
 *  2. One immediate transaction that ensures the bookkeeping table exists,
 *     determines which migration files are not yet recorded, executes them in
 *     sorted filename order and records each one.
 *  3. VACUUM, but only when at least one migration ran; a VACUUM failure is
 *     only logged.
 *  4. Best-effort production pragmas (WAL, autocheckpoint, busy timeout,
 *     foreign keys, synchronous NORMAL); a failure here is only logged.
 *
 * When the transaction fails with SQLITE_BUSY it is retried up to
 * MIGRATION_MAX_RETRIES times, waiting MIGRATION_RETRY_DELAY_MS between tries.
 *
 * @param db Database handle exposing `pragma`, `exec`, `prepare` and `transaction`.
 * @throws StoreError when the migrations directory is missing, a migration
 *   fails, or the transaction cannot be completed.
 */
async function applyMigrations(db) {
  try {
    const migrationPragmas = [
      "journal_mode = OFF",
      "synchronous = OFF",
      "mmap_size = 268435456",
      "cache_size = -64000",
      "temp_store = MEMORY"
    ];
    for (const setting of migrationPragmas) {
      db.pragma(setting);
    }
    logger.debug("Applied performance optimizations for migration");
  } catch (_error) {
    logger.warn("⚠️ Could not apply all performance optimizations for migration");
  }

  const runPendingMigrations = db.transaction(() => {
    logger.debug("Checking database migrations...");
    ensureMigrationsTable(db);
    const alreadyApplied = getAppliedMigrations(db);
    if (!fs.existsSync(MIGRATIONS_DIR)) {
      throw new StoreError("Migrations directory not found");
    }
    const sqlFiles = fs.readdirSync(MIGRATIONS_DIR).filter((file) => file.endsWith(".sql"));
    sqlFiles.sort();
    const pending = sqlFiles.filter((name) => !alreadyApplied.has(name));
    if (pending.length > 0) {
      logger.info(`🔄 Applying ${pending.length} database migration(s)...`);
    }
    let done = 0;
    for (const filename of pending) {
      logger.debug(`Applying migration: ${filename}`);
      const sql = fs.readFileSync(path.join(MIGRATIONS_DIR, filename), "utf8");
      try {
        db.exec(sql);
        db.prepare(`INSERT INTO ${MIGRATIONS_TABLE} (id) VALUES (?)`).run(filename);
        logger.debug(`Applied migration: ${filename}`);
        done += 1;
      } catch (error) {
        logger.error(`❌ Failed to apply migration: ${filename} - ${error}`);
        throw new StoreError(`Migration failed: ${filename}`, error);
      }
    }
    if (done > 0) {
      logger.info(`✅ Successfully applied ${done} migration(s)`);
    } else {
      logger.debug("Database schema is up to date");
    }
    return done;
  });

  let attempts = 0;
  for (; ; ) {
    try {
      const migrated = runPendingMigrations.immediate();
      logger.debug("Database migrations completed successfully");
      if (migrated > 0) {
        try {
          logger.debug(
            `Running VACUUM after applying ${migrated} migration(s)...`
          );
          db.exec("VACUUM");
          logger.debug("Database vacuum completed successfully");
        } catch (error) {
          logger.warn(`⚠️ Could not vacuum database after migrations: ${error}`);
        }
      } else {
        logger.debug("Skipping VACUUM - no migrations were applied");
      }
      break;
    } catch (error) {
      const isBusy = error?.code === "SQLITE_BUSY";
      if (isBusy && attempts < MIGRATION_MAX_RETRIES) {
        attempts += 1;
        logger.warn(
          `⚠️ Migrations busy (SQLITE_BUSY), retrying attempt ${attempts}/${MIGRATION_MAX_RETRIES} in ${MIGRATION_RETRY_DELAY_MS}ms...`
        );
        await new Promise((resolve) => setTimeout(resolve, MIGRATION_RETRY_DELAY_MS));
        continue;
      }
      if (isBusy) {
        logger.error(
          `❌ Migrations still busy after ${MIGRATION_MAX_RETRIES} retries. Giving up: ${error}`
        );
      }
      // StoreErrors raised inside the transaction pass through untouched.
      throw error instanceof StoreError ? error : new StoreError("Failed during migration process", error);
    }
  }

  try {
    const productionPragmas = [
      "journal_mode = WAL",
      "wal_autocheckpoint = 1000",
      "busy_timeout = 30000",
      "foreign_keys = ON",
      "synchronous = NORMAL"
    ];
    for (const setting of productionPragmas) {
      db.pragma(setting);
    }
    logger.debug(
      "Applied production database configuration (WAL mode, autocheckpoint, foreign keys, busy timeout)"
    );
  } catch (_error) {
    logger.warn("⚠️ Could not apply all production database settings");
  }
}
|
|
6609
|
+
class EmbeddingConfig {
|
|
6610
|
+
static instance = null;
|
|
6611
|
+
/**
|
|
6612
|
+
* Get the singleton instance of EmbeddingConfig.
|
|
6613
|
+
* Creates the instance if it doesn't exist.
|
|
6614
|
+
*/
|
|
6615
|
+
static getInstance() {
|
|
6616
|
+
if (EmbeddingConfig.instance === null) {
|
|
6617
|
+
EmbeddingConfig.instance = new EmbeddingConfig();
|
|
6618
|
+
}
|
|
6619
|
+
return EmbeddingConfig.instance;
|
|
6620
|
+
}
|
|
6621
|
+
/**
|
|
6622
|
+
* Reset the singleton instance (useful for testing).
|
|
6623
|
+
*/
|
|
6624
|
+
static resetInstance() {
|
|
6625
|
+
EmbeddingConfig.instance = null;
|
|
6626
|
+
}
|
|
6627
|
+
/**
|
|
6628
|
+
* Known dimensions for common embedding models.
|
|
6629
|
+
* This avoids expensive API calls for dimension detection in telemetry.
|
|
6630
|
+
*
|
|
6631
|
+
* Note: The "openai" provider also supports OpenAI-compatible APIs like:
|
|
6632
|
+
* - Ollama (local models)
|
|
6633
|
+
* - LMStudio (local models)
|
|
6634
|
+
* - Any service implementing OpenAI's embedding API
|
|
6635
|
+
*/
|
|
6636
|
+
knownModelDimensions = {
|
|
6637
|
+
// OpenAI models (also works with Ollama, LMStudio, and other OpenAI-compatible APIs)
|
|
6638
|
+
"text-embedding-3-small": 1536,
|
|
6639
|
+
"text-embedding-3-large": 3072,
|
|
6640
|
+
"text-embedding-ada-002": 1536,
|
|
6641
|
+
// Google Vertex AI models
|
|
6642
|
+
"text-embedding-004": 768,
|
|
6643
|
+
"textembedding-gecko@003": 768,
|
|
6644
|
+
"textembedding-gecko@002": 768,
|
|
6645
|
+
"textembedding-gecko@001": 768,
|
|
6646
|
+
// Google Gemini models (with MRL support)
|
|
6647
|
+
"text-embedding-preview-0409": 768,
|
|
6648
|
+
"embedding-001": 768,
|
|
6649
|
+
// AWS Bedrock models
|
|
6650
|
+
// Amazon Titan models
|
|
6651
|
+
"amazon.titan-embed-text-v1": 1536,
|
|
6652
|
+
"amazon.titan-embed-text-v2:0": 1024,
|
|
6653
|
+
"amazon.titan-embed-image-v1": 1024,
|
|
6654
|
+
// Image embedding model
|
|
6655
|
+
// Cohere models
|
|
6656
|
+
"cohere.embed-english-v3": 1024,
|
|
6657
|
+
"cohere.embed-multilingual-v3": 1024,
|
|
6658
|
+
// SageMaker models (hosted on AWS SageMaker)
|
|
6659
|
+
"intfloat/multilingual-e5-large": 1024,
|
|
6660
|
+
// Additional AWS models that might be supported
|
|
6661
|
+
// Note: Some of these might be placeholders - verify dimensions before use
|
|
6662
|
+
// "amazon.nova-embed-multilingual-v1:0": 4096, // Commented out as noted in source
|
|
6663
|
+
// MTEB Leaderboard models (source: https://huggingface.co/spaces/mteb/leaderboard)
|
|
6664
|
+
// Top performing models from Massive Text Embedding Benchmark
|
|
6665
|
+
"sentence-transformers/all-MiniLM-L6-v2": 384,
|
|
6666
|
+
"gemini-embedding-001": 3072,
|
|
6667
|
+
"Qwen/Qwen3-Embedding-8B": 4096,
|
|
6668
|
+
"Qwen/Qwen3-Embedding-4B": 2560,
|
|
6669
|
+
"Qwen/Qwen3-Embedding-0.6B": 1024,
|
|
6670
|
+
"Linq-AI-Research/Linq-Embed-Mistral": 4096,
|
|
6671
|
+
"Alibaba-NLP/gte-Qwen2-7B-instruct": 3584,
|
|
6672
|
+
"intfloat/multilingual-e5-large-instruct": 1024,
|
|
6673
|
+
"Salesforce/SFR-Embedding-Mistral": 4096,
|
|
6674
|
+
"text-multilingual-embedding-002": 768,
|
|
6675
|
+
"GritLM/GritLM-7B": 4096,
|
|
6676
|
+
"GritLM/GritLM-8x7B": 4096,
|
|
6677
|
+
"intfloat/e5-mistral-7b-instruct": 4096,
|
|
6678
|
+
"Cohere/Cohere-embed-multilingual-v3.0": 1024,
|
|
6679
|
+
"Alibaba-NLP/gte-Qwen2-1.5B-instruct": 8960,
|
|
6680
|
+
"Lajavaness/bilingual-embedding-large": 1024,
|
|
6681
|
+
"Salesforce/SFR-Embedding-2_R": 4096,
|
|
6682
|
+
"NovaSearch/stella_en_1.5B_v5": 8960,
|
|
6683
|
+
"NovaSearch/jasper_en_vision_language_v1": 8960,
|
|
6684
|
+
"nvidia/NV-Embed-v2": 4096,
|
|
6685
|
+
"OrdalieTech/Solon-embeddings-large-0.1": 1024,
|
|
6686
|
+
"BAAI/bge-m3": 1024,
|
|
6687
|
+
"HIT-TMG/KaLM-embedding-multilingual-mini-v1": 896,
|
|
6688
|
+
"jinaai/jina-embeddings-v3": 1024,
|
|
6689
|
+
"Alibaba-NLP/gte-multilingual-base": 768,
|
|
6690
|
+
"Lajavaness/bilingual-embedding-base": 768,
|
|
6691
|
+
"HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1": 896,
|
|
6692
|
+
"nvidia/NV-Embed-v1": 4096,
|
|
6693
|
+
"Cohere/Cohere-embed-multilingual-light-v3.0": 384,
|
|
6694
|
+
"manu/bge-m3-custom-fr": 1024,
|
|
6695
|
+
"Lajavaness/bilingual-embedding-small": 384,
|
|
6696
|
+
"Snowflake/snowflake-arctic-embed-l-v2.0": 1024,
|
|
6697
|
+
"intfloat/multilingual-e5-base": 768,
|
|
6698
|
+
"voyage-3-lite": 512,
|
|
6699
|
+
"voyage-3": 1024,
|
|
6700
|
+
"intfloat/multilingual-e5-small": 384,
|
|
6701
|
+
"Alibaba-NLP/gte-Qwen1.5-7B-instruct": 4096,
|
|
6702
|
+
"Snowflake/snowflake-arctic-embed-m-v2.0": 768,
|
|
6703
|
+
"deepvk/USER-bge-m3": 1024,
|
|
6704
|
+
"Cohere/Cohere-embed-english-v3.0": 1024,
|
|
6705
|
+
"Omartificial-Intelligence-Space/Arabic-labse-Matryoshka": 768,
|
|
6706
|
+
"ibm-granite/granite-embedding-278m-multilingual": 768,
|
|
6707
|
+
"NovaSearch/stella_en_400M_v5": 4096,
|
|
6708
|
+
"omarelshehy/arabic-english-sts-matryoshka": 1024,
|
|
6709
|
+
"sentence-transformers/paraphrase-multilingual-mpnet-base-v2": 768,
|
|
6710
|
+
"Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka": 768,
|
|
6711
|
+
"Haon-Chen/speed-embedding-7b-instruct": 4096,
|
|
6712
|
+
"sentence-transformers/LaBSE": 768,
|
|
6713
|
+
"WhereIsAI/UAE-Large-V1": 1024,
|
|
6714
|
+
"ibm-granite/granite-embedding-107m-multilingual": 384,
|
|
6715
|
+
"mixedbread-ai/mxbai-embed-large-v1": 1024,
|
|
6716
|
+
"intfloat/e5-large-v2": 1024,
|
|
6717
|
+
"avsolatorio/GIST-large-Embedding-v0": 1024,
|
|
6718
|
+
"sdadas/mmlw-e5-large": 1024,
|
|
6719
|
+
"nomic-ai/nomic-embed-text-v1": 768,
|
|
6720
|
+
"nomic-ai/nomic-embed-text-v1-ablated": 768,
|
|
6721
|
+
"intfloat/e5-base-v2": 768,
|
|
6722
|
+
"BAAI/bge-large-en-v1.5": 1024,
|
|
6723
|
+
"intfloat/e5-large": 1024,
|
|
6724
|
+
"Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet": 384,
|
|
6725
|
+
"Cohere/Cohere-embed-english-light-v3.0": 384,
|
|
6726
|
+
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": 768,
|
|
6727
|
+
"Gameselo/STS-multilingual-mpnet-base-v2": 768,
|
|
6728
|
+
"thenlper/gte-large": 1024,
|
|
6729
|
+
"avsolatorio/GIST-Embedding-v0": 768,
|
|
6730
|
+
"nomic-ai/nomic-embed-text-v1-unsupervised": 768,
|
|
6731
|
+
"infgrad/stella-base-en-v2": 768,
|
|
6732
|
+
"avsolatorio/NoInstruct-small-Embedding-v0": 384,
|
|
6733
|
+
"dwzhu/e5-base-4k": 768,
|
|
6734
|
+
"sdadas/mmlw-e5-base": 768,
|
|
6735
|
+
"voyage-multilingual-2": 1024,
|
|
6736
|
+
"McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised": 4096,
|
|
6737
|
+
"BAAI/bge-base-en-v1.5": 768,
|
|
6738
|
+
"avsolatorio/GIST-small-Embedding-v0": 384,
|
|
6739
|
+
"sdadas/mmlw-roberta-large": 1024,
|
|
6740
|
+
"nomic-ai/nomic-embed-text-v1.5": 768,
|
|
6741
|
+
"minishlab/potion-multilingual-128M": 256,
|
|
6742
|
+
"shibing624/text2vec-base-multilingual": 384,
|
|
6743
|
+
"thenlper/gte-base": 768,
|
|
6744
|
+
"intfloat/e5-small-v2": 384,
|
|
6745
|
+
"intfloat/e5-base": 768,
|
|
6746
|
+
"sentence-transformers/static-similarity-mrl-multilingual-v1": 1024,
|
|
6747
|
+
"manu/sentence_croissant_alpha_v0.3": 2048,
|
|
6748
|
+
"BAAI/bge-small-en-v1.5": 512,
|
|
6749
|
+
"thenlper/gte-small": 384,
|
|
6750
|
+
"sdadas/mmlw-e5-small": 384,
|
|
6751
|
+
"manu/sentence_croissant_alpha_v0.4": 2048,
|
|
6752
|
+
"manu/sentence_croissant_alpha_v0.2": 2048,
|
|
6753
|
+
"abhinand/MedEmbed-small-v0.1": 384,
|
|
6754
|
+
"ibm-granite/granite-embedding-125m-english": 768,
|
|
6755
|
+
"intfloat/e5-small": 384,
|
|
6756
|
+
"voyage-large-2-instruct": 1024,
|
|
6757
|
+
"sdadas/mmlw-roberta-base": 768,
|
|
6758
|
+
"Snowflake/snowflake-arctic-embed-l": 1024,
|
|
6759
|
+
"Mihaiii/Ivysaur": 384,
|
|
6760
|
+
"Snowflake/snowflake-arctic-embed-m-long": 768,
|
|
6761
|
+
"bigscience/sgpt-bloom-7b1-msmarco": 4096,
|
|
6762
|
+
"avsolatorio/GIST-all-MiniLM-L6-v2": 384,
|
|
6763
|
+
"sergeyzh/LaBSE-ru-turbo": 768,
|
|
6764
|
+
"sentence-transformers/all-mpnet-base-v2": 768,
|
|
6765
|
+
"Snowflake/snowflake-arctic-embed-m": 768,
|
|
6766
|
+
"Snowflake/snowflake-arctic-embed-s": 384,
|
|
6767
|
+
"sentence-transformers/all-MiniLM-L12-v2": 384,
|
|
6768
|
+
"Mihaiii/gte-micro-v4": 384,
|
|
6769
|
+
"Snowflake/snowflake-arctic-embed-m-v1.5": 768,
|
|
6770
|
+
"cointegrated/LaBSE-en-ru": 768,
|
|
6771
|
+
"Mihaiii/Bulbasaur": 384,
|
|
6772
|
+
"ibm-granite/granite-embedding-30m-english": 384,
|
|
6773
|
+
"deepfile/embedder-100p": 768,
|
|
6774
|
+
"Jaume/gemma-2b-embeddings": 2048,
|
|
6775
|
+
"OrlikB/KartonBERT-USE-base-v1": 768,
|
|
6776
|
+
"izhx/udever-bloom-7b1": 4096,
|
|
6777
|
+
"izhx/udever-bloom-1b1": 1024,
|
|
6778
|
+
"brahmairesearch/slx-v0.1": 384,
|
|
6779
|
+
"Mihaiii/Wartortle": 384,
|
|
6780
|
+
"izhx/udever-bloom-3b": 2048,
|
|
6781
|
+
"deepvk/USER-base": 768,
|
|
6782
|
+
"ai-forever/ru-en-RoSBERTa": 1024,
|
|
6783
|
+
"McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-unsup-simcse": 4096,
|
|
6784
|
+
"Mihaiii/Venusaur": 384,
|
|
6785
|
+
"Snowflake/snowflake-arctic-embed-xs": 384,
|
|
6786
|
+
"jinaai/jina-embedding-b-en-v1": 768,
|
|
6787
|
+
"Mihaiii/gte-micro": 384,
|
|
6788
|
+
"aari1995/German_Semantic_STS_V2": 1024,
|
|
6789
|
+
"Mihaiii/Squirtle": 384,
|
|
6790
|
+
"OrlikB/st-polish-kartonberta-base-alpha-v1": 768,
|
|
6791
|
+
"sergeyzh/rubert-tiny-turbo": 312,
|
|
6792
|
+
"minishlab/potion-base-8M": 256,
|
|
6793
|
+
"minishlab/M2V_base_glove_subword": 256,
|
|
6794
|
+
"jinaai/jina-embedding-s-en-v1": 512,
|
|
6795
|
+
"minishlab/potion-base-4M": 128,
|
|
6796
|
+
"minishlab/M2V_base_output": 256,
|
|
6797
|
+
"DeepPavlov/rubert-base-cased-sentence": 768,
|
|
6798
|
+
"jinaai/jina-embeddings-v2-small-en": 512,
|
|
6799
|
+
"cointegrated/rubert-tiny2": 312,
|
|
6800
|
+
"minishlab/M2V_base_glove": 256,
|
|
6801
|
+
"cointegrated/rubert-tiny": 312,
|
|
6802
|
+
"silma-ai/silma-embeddding-matryoshka-v0.1": 768,
|
|
6803
|
+
"DeepPavlov/rubert-base-cased": 768,
|
|
6804
|
+
"Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet": 768,
|
|
6805
|
+
"izhx/udever-bloom-560m": 1024,
|
|
6806
|
+
"minishlab/potion-base-2M": 64,
|
|
6807
|
+
"DeepPavlov/distilrubert-small-cased-conversational": 768,
|
|
6808
|
+
"consciousAI/cai-lunaris-text-embeddings": 1024,
|
|
6809
|
+
"deepvk/deberta-v1-base": 768,
|
|
6810
|
+
"Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka": 768,
|
|
6811
|
+
"Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka": 768,
|
|
6812
|
+
"ai-forever/sbert_large_mt_nlu_ru": 1024,
|
|
6813
|
+
"ai-forever/sbert_large_nlu_ru": 1024,
|
|
6814
|
+
"malenia1/ternary-weight-embedding": 1024,
|
|
6815
|
+
"jinaai/jina-embeddings-v2-base-en": 768,
|
|
6816
|
+
"VPLabs/SearchMap_Preview": 4096,
|
|
6817
|
+
"Hum-Works/lodestone-base-4096-v1": 768,
|
|
6818
|
+
"jinaai/jina-embeddings-v4": 2048
|
|
6819
|
+
};
|
|
6820
|
+
/**
|
|
6821
|
+
* Lowercase lookup map for case-insensitive model dimension queries.
|
|
6822
|
+
* Built lazily from knownModelDimensions to ensure consistency.
|
|
6823
|
+
*/
|
|
6824
|
+
modelLookup;
|
|
6825
|
+
constructor() {
|
|
6826
|
+
this.modelLookup = /* @__PURE__ */ new Map();
|
|
6827
|
+
for (const [model, dimensions] of Object.entries(this.knownModelDimensions)) {
|
|
6828
|
+
this.modelLookup.set(model.toLowerCase(), dimensions);
|
|
6829
|
+
}
|
|
6830
|
+
}
|
|
6831
|
+
/**
|
|
6832
|
+
* Parse embedding model configuration from environment variables.
|
|
6833
|
+
* This is a synchronous operation that extracts provider, model, and known dimensions.
|
|
6834
|
+
*
|
|
6835
|
+
* Supports various providers:
|
|
6836
|
+
* - openai: OpenAI models and OpenAI-compatible APIs (Ollama, LMStudio, etc.)
|
|
6837
|
+
* - vertex: Google Cloud Vertex AI
|
|
6838
|
+
* - gemini: Google Generative AI
|
|
6839
|
+
* - aws: AWS Bedrock models
|
|
6840
|
+
* - microsoft: Azure OpenAI
|
|
6841
|
+
* - sagemaker: AWS SageMaker hosted models
|
|
6842
|
+
*
|
|
6843
|
+
* @param modelSpec Optional model specification, defaults to DOCS_MCP_EMBEDDING_MODEL env var
|
|
6844
|
+
* @returns Parsed embedding model configuration
|
|
6845
|
+
*/
|
|
6846
|
+
parse(modelSpec) {
|
|
6847
|
+
const spec = modelSpec || process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
|
|
6848
|
+
const colonIndex = spec.indexOf(":");
|
|
6849
|
+
let provider;
|
|
6850
|
+
let model;
|
|
6851
|
+
if (colonIndex === -1) {
|
|
6852
|
+
provider = "openai";
|
|
6853
|
+
model = spec;
|
|
6854
|
+
} else {
|
|
6855
|
+
provider = spec.substring(0, colonIndex);
|
|
6856
|
+
model = spec.substring(colonIndex + 1);
|
|
6857
|
+
}
|
|
6858
|
+
const dimensions = this.modelLookup?.get(model.toLowerCase()) || null;
|
|
6859
|
+
return {
|
|
6860
|
+
provider,
|
|
6861
|
+
model,
|
|
6862
|
+
dimensions,
|
|
6863
|
+
modelSpec: spec
|
|
6864
|
+
};
|
|
6865
|
+
}
|
|
6866
|
+
/**
|
|
6867
|
+
* Get the known dimensions for a specific model.
|
|
6868
|
+
* Returns null if the model dimensions are not known.
|
|
6869
|
+
* Uses case-insensitive lookup.
|
|
6870
|
+
*
|
|
6871
|
+
* @param model The model name (e.g., "text-embedding-3-small")
|
|
6872
|
+
* @returns Known dimensions or null
|
|
6873
|
+
*/
|
|
6874
|
+
getKnownDimensions(model) {
|
|
6875
|
+
return this.modelLookup?.get(model.toLowerCase()) || null;
|
|
6876
|
+
}
|
|
6877
|
+
/**
|
|
6878
|
+
* Add or update known dimensions for a model.
|
|
6879
|
+
* This can be used to cache discovered dimensions.
|
|
6880
|
+
* Stores both original case and lowercase for consistent lookup.
|
|
6881
|
+
*
|
|
6882
|
+
* @param model The model name
|
|
6883
|
+
* @param dimensions The dimensions to cache
|
|
6884
|
+
*/
|
|
6885
|
+
setKnownDimensions(model, dimensions) {
|
|
6886
|
+
this.knownModelDimensions[model] = dimensions;
|
|
6887
|
+
if (this.modelLookup) {
|
|
6888
|
+
this.modelLookup.set(model.toLowerCase(), dimensions);
|
|
6889
|
+
}
|
|
6890
|
+
}
|
|
6891
|
+
/**
|
|
6892
|
+
* Static method to parse embedding model configuration using the singleton instance.
|
|
6893
|
+
* This maintains backward compatibility while using the class-based approach.
|
|
6894
|
+
*/
|
|
6895
|
+
static parseEmbeddingConfig(modelSpec) {
|
|
6896
|
+
return EmbeddingConfig.getInstance().parse(modelSpec);
|
|
6897
|
+
}
|
|
6898
|
+
/**
|
|
6899
|
+
* Static method to get known model dimensions using the singleton instance.
|
|
6900
|
+
* This maintains backward compatibility while using the class-based approach.
|
|
6901
|
+
*/
|
|
6902
|
+
static getKnownModelDimensions(model) {
|
|
6903
|
+
return EmbeddingConfig.getInstance().getKnownDimensions(model);
|
|
6904
|
+
}
|
|
6905
|
+
/**
|
|
6906
|
+
* Static method to set known model dimensions using the singleton instance.
|
|
6907
|
+
* This maintains backward compatibility while using the class-based approach.
|
|
6908
|
+
*/
|
|
6909
|
+
static setKnownModelDimensions(model, dimensions) {
|
|
6910
|
+
EmbeddingConfig.getInstance().setKnownDimensions(model, dimensions);
|
|
6911
|
+
}
|
|
6912
|
+
}
|
|
6913
|
+
class DocumentStore {
|
|
6914
|
+
db;
|
|
6915
|
+
embeddings;
|
|
6916
|
+
dbDimension = VECTOR_DIMENSION;
|
|
6917
|
+
modelDimension;
|
|
6918
|
+
embeddingConfig;
|
|
6919
|
+
statements;
|
|
6920
|
+
/**
|
|
6921
|
+
* Calculates Reciprocal Rank Fusion score for a result
|
|
6922
|
+
*/
|
|
6923
|
+
calculateRRF(vecRank, ftsRank, k = 60) {
|
|
6924
|
+
let rrf = 0;
|
|
6925
|
+
if (vecRank !== void 0) {
|
|
6926
|
+
rrf += 1 / (k + vecRank);
|
|
6927
|
+
}
|
|
6928
|
+
if (ftsRank !== void 0) {
|
|
6929
|
+
rrf += 1 / (k + ftsRank);
|
|
6930
|
+
}
|
|
6931
|
+
return rrf;
|
|
6932
|
+
}
|
|
6933
|
+
/**
|
|
6934
|
+
* Assigns ranks to search results based on their scores
|
|
6935
|
+
*/
|
|
6936
|
+
assignRanks(results) {
|
|
6937
|
+
const vecRanks = /* @__PURE__ */ new Map();
|
|
6938
|
+
const ftsRanks = /* @__PURE__ */ new Map();
|
|
6939
|
+
results.filter((r) => r.vec_score !== void 0).sort((a, b) => (b.vec_score ?? 0) - (a.vec_score ?? 0)).forEach((result, index) => {
|
|
6940
|
+
vecRanks.set(Number(result.id), index + 1);
|
|
6941
|
+
});
|
|
6942
|
+
results.filter((r) => r.fts_score !== void 0).sort((a, b) => (b.fts_score ?? 0) - (a.fts_score ?? 0)).forEach((result, index) => {
|
|
6943
|
+
ftsRanks.set(Number(result.id), index + 1);
|
|
6944
|
+
});
|
|
6945
|
+
return results.map((result) => ({
|
|
6946
|
+
...result,
|
|
6947
|
+
vec_rank: vecRanks.get(Number(result.id)),
|
|
6948
|
+
fts_rank: ftsRanks.get(Number(result.id)),
|
|
6949
|
+
rrf_score: this.calculateRRF(
|
|
6950
|
+
vecRanks.get(Number(result.id)),
|
|
6951
|
+
ftsRanks.get(Number(result.id))
|
|
6952
|
+
)
|
|
6953
|
+
}));
|
|
6954
|
+
}
|
|
6955
|
+
constructor(dbPath, embeddingConfig) {
|
|
6956
|
+
if (!dbPath) {
|
|
6957
|
+
throw new StoreError("Missing required database path");
|
|
6958
|
+
}
|
|
6959
|
+
this.db = new Database(dbPath);
|
|
6960
|
+
this.embeddingConfig = embeddingConfig;
|
|
6961
|
+
}
|
|
6962
|
+
/**
|
|
6963
|
+
* Sets up prepared statements for database queries
|
|
6964
|
+
*/
|
|
6965
|
+
prepareStatements() {
|
|
6966
|
+
const statements = {
|
|
6967
|
+
getById: this.db.prepare("SELECT * FROM documents WHERE id = ?"),
|
|
6968
|
+
insertDocument: this.db.prepare(
|
|
6969
|
+
"INSERT INTO documents (library_id, version_id, url, content, metadata, sort_order, indexed_at) VALUES (?, ?, ?, ?, ?, ?, ?)"
|
|
6970
|
+
),
|
|
6971
|
+
insertEmbedding: this.db.prepare(
|
|
6972
|
+
"INSERT INTO documents_vec (rowid, library_id, version_id, embedding) VALUES (?, ?, ?, ?)"
|
|
6973
|
+
),
|
|
6974
|
+
insertLibrary: this.db.prepare(
|
|
6975
|
+
"INSERT INTO libraries (name) VALUES (?) ON CONFLICT(name) DO NOTHING"
|
|
6976
|
+
),
|
|
6977
|
+
getLibraryIdByName: this.db.prepare(
|
|
6978
|
+
"SELECT id FROM libraries WHERE name = ?"
|
|
6979
|
+
),
|
|
6980
|
+
// New version-related statements
|
|
6981
|
+
insertVersion: this.db.prepare(
|
|
6982
|
+
"INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING"
|
|
6983
|
+
),
|
|
6984
|
+
resolveVersionId: this.db.prepare(
|
|
6985
|
+
"SELECT id FROM versions WHERE library_id = ? AND name IS ?"
|
|
6986
|
+
),
|
|
6987
|
+
getVersionById: this.db.prepare("SELECT * FROM versions WHERE id = ?"),
|
|
6988
|
+
queryVersionsByLibraryId: this.db.prepare(
|
|
6989
|
+
"SELECT * FROM versions WHERE library_id = ? ORDER BY name"
|
|
6990
|
+
),
|
|
6991
|
+
deleteLibraryDocuments: this.db.prepare(
|
|
6992
|
+
`DELETE FROM documents
|
|
6993
|
+
WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
6994
|
+
AND version_id = (
|
|
6995
|
+
SELECT v.id FROM versions v
|
|
6996
|
+
WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
6997
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
6998
|
+
)`
|
|
6999
|
+
),
|
|
7000
|
+
deleteDocuments: this.db.prepare(
|
|
7001
|
+
`DELETE FROM documents
|
|
7002
|
+
WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
7003
|
+
AND version_id = (
|
|
7004
|
+
SELECT v.id FROM versions v
|
|
7005
|
+
WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
7006
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7007
|
+
)`
|
|
7008
|
+
),
|
|
7009
|
+
deleteDocumentsByUrl: this.db.prepare(
|
|
7010
|
+
`DELETE FROM documents
|
|
7011
|
+
WHERE url = ?
|
|
7012
|
+
AND library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
7013
|
+
AND version_id = (
|
|
7014
|
+
SELECT v.id FROM versions v
|
|
7015
|
+
WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
7016
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7017
|
+
)`
|
|
7018
|
+
),
|
|
7019
|
+
getDocumentBySort: this.db.prepare(
|
|
7020
|
+
`SELECT d.id
|
|
7021
|
+
FROM documents d
|
|
7022
|
+
JOIN versions v ON d.version_id = v.id
|
|
7023
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7024
|
+
WHERE l.name = ?
|
|
7025
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7026
|
+
LIMIT 1`
|
|
7027
|
+
),
|
|
7028
|
+
queryVersions: this.db.prepare(
|
|
7029
|
+
`SELECT DISTINCT v.name
|
|
7030
|
+
FROM versions v
|
|
7031
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7032
|
+
WHERE l.name = ?
|
|
7033
|
+
ORDER BY v.name`
|
|
7034
|
+
),
|
|
7035
|
+
checkExists: this.db.prepare(
|
|
7036
|
+
`SELECT d.id FROM documents d
|
|
7037
|
+
JOIN versions v ON d.version_id = v.id
|
|
7038
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7039
|
+
WHERE l.name = ?
|
|
7040
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7041
|
+
LIMIT 1`
|
|
7042
|
+
),
|
|
7043
|
+
// Library/version aggregation including versions without documents and status/progress fields
|
|
7044
|
+
queryLibraryVersions: this.db.prepare(
|
|
7045
|
+
`SELECT
|
|
7046
|
+
l.name as library,
|
|
7047
|
+
COALESCE(v.name, '') as version,
|
|
7048
|
+
v.id as versionId,
|
|
7049
|
+
v.status as status,
|
|
7050
|
+
v.progress_pages as progressPages,
|
|
7051
|
+
v.progress_max_pages as progressMaxPages,
|
|
7052
|
+
v.source_url as sourceUrl,
|
|
7053
|
+
MIN(d.indexed_at) as indexedAt,
|
|
7054
|
+
COUNT(d.id) as documentCount,
|
|
7055
|
+
COUNT(DISTINCT d.url) as uniqueUrlCount
|
|
7056
|
+
FROM versions v
|
|
7057
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7058
|
+
LEFT JOIN documents d ON d.version_id = v.id
|
|
7059
|
+
GROUP BY v.id
|
|
7060
|
+
ORDER BY l.name, version`
|
|
7061
|
+
),
|
|
7062
|
+
getChildChunks: this.db.prepare(`
|
|
7063
|
+
SELECT d.* FROM documents d
|
|
7064
|
+
JOIN versions v ON d.version_id = v.id
|
|
7065
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7066
|
+
WHERE l.name = ?
|
|
7067
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7068
|
+
AND d.url = ?
|
|
7069
|
+
AND json_array_length(json_extract(d.metadata, '$.path')) = ?
|
|
7070
|
+
AND json_extract(d.metadata, '$.path') LIKE ? || '%'
|
|
7071
|
+
AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
|
|
7072
|
+
ORDER BY d.sort_order
|
|
7073
|
+
LIMIT ?
|
|
7074
|
+
`),
|
|
7075
|
+
getPrecedingSiblings: this.db.prepare(`
|
|
7076
|
+
SELECT d.* FROM documents d
|
|
7077
|
+
JOIN versions v ON d.version_id = v.id
|
|
7078
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7079
|
+
WHERE l.name = ?
|
|
7080
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7081
|
+
AND d.url = ?
|
|
7082
|
+
AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
|
|
7083
|
+
AND json_extract(d.metadata, '$.path') = ?
|
|
7084
|
+
ORDER BY d.sort_order DESC
|
|
7085
|
+
LIMIT ?
|
|
7086
|
+
`),
|
|
7087
|
+
getSubsequentSiblings: this.db.prepare(`
|
|
7088
|
+
SELECT d.* FROM documents d
|
|
7089
|
+
JOIN versions v ON d.version_id = v.id
|
|
7090
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7091
|
+
WHERE l.name = ?
|
|
7092
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7093
|
+
AND d.url = ?
|
|
7094
|
+
AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
|
|
7095
|
+
AND json_extract(d.metadata, '$.path') = ?
|
|
7096
|
+
ORDER BY d.sort_order
|
|
7097
|
+
LIMIT ?
|
|
7098
|
+
`),
|
|
7099
|
+
getParentChunk: this.db.prepare(`
|
|
7100
|
+
SELECT d.* FROM documents d
|
|
7101
|
+
JOIN versions v ON d.version_id = v.id
|
|
7102
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7103
|
+
WHERE l.name = ?
|
|
7104
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7105
|
+
AND d.url = ?
|
|
7106
|
+
AND json_extract(d.metadata, '$.path') = ?
|
|
7107
|
+
AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
|
|
7108
|
+
ORDER BY d.sort_order DESC
|
|
7109
|
+
LIMIT 1
|
|
7110
|
+
`),
|
|
7111
|
+
// Status tracking statements
|
|
7112
|
+
updateVersionStatus: this.db.prepare(
|
|
7113
|
+
"UPDATE versions SET status = ?, error_message = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
|
|
7114
|
+
),
|
|
7115
|
+
updateVersionProgress: this.db.prepare(
|
|
7116
|
+
"UPDATE versions SET progress_pages = ?, progress_max_pages = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
|
|
7117
|
+
),
|
|
7118
|
+
getVersionsByStatus: this.db.prepare(
|
|
7119
|
+
"SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.status IN (SELECT value FROM json_each(?))"
|
|
7120
|
+
),
|
|
7121
|
+
// Scraper options statements
|
|
7122
|
+
updateVersionScraperOptions: this.db.prepare(
|
|
7123
|
+
"UPDATE versions SET source_url = ?, scraper_options = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
|
|
7124
|
+
),
|
|
7125
|
+
getVersionWithOptions: this.db.prepare(
|
|
7126
|
+
"SELECT * FROM versions WHERE id = ?"
|
|
7127
|
+
),
|
|
7128
|
+
getVersionsBySourceUrl: this.db.prepare(
|
|
7129
|
+
"SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.source_url = ? ORDER BY v.created_at DESC"
|
|
7130
|
+
),
|
|
7131
|
+
// Version and library deletion statements
|
|
7132
|
+
deleteVersionById: this.db.prepare("DELETE FROM versions WHERE id = ?"),
|
|
7133
|
+
deleteLibraryById: this.db.prepare("DELETE FROM libraries WHERE id = ?"),
|
|
7134
|
+
countVersionsByLibraryId: this.db.prepare(
|
|
7135
|
+
"SELECT COUNT(*) as count FROM versions WHERE library_id = ?"
|
|
7136
|
+
),
|
|
7137
|
+
getVersionId: this.db.prepare(
|
|
7138
|
+
`SELECT v.id, v.library_id FROM versions v
|
|
7139
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7140
|
+
WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')`
|
|
7141
|
+
)
|
|
7142
|
+
};
|
|
7143
|
+
this.statements = statements;
|
|
7144
|
+
}
|
|
7145
|
+
/**
|
|
7146
|
+
* Pads a vector to the fixed database dimension by appending zeros.
|
|
7147
|
+
* Throws an error if the input vector is longer than the database dimension.
|
|
7148
|
+
*/
|
|
7149
|
+
padVector(vector) {
|
|
7150
|
+
if (vector.length > this.dbDimension) {
|
|
7151
|
+
throw new Error(
|
|
7152
|
+
`Vector dimension ${vector.length} exceeds database dimension ${this.dbDimension}`
|
|
7153
|
+
);
|
|
7154
|
+
}
|
|
7155
|
+
if (vector.length === this.dbDimension) {
|
|
7156
|
+
return vector;
|
|
7157
|
+
}
|
|
7158
|
+
return [...vector, ...new Array(this.dbDimension - vector.length).fill(0)];
|
|
7159
|
+
}
|
|
7160
|
+
/**
|
|
7161
|
+
* Initialize the embeddings client using either provided config or environment variables.
|
|
7162
|
+
* If no embedding config is provided (null), embeddings will not be initialized.
|
|
7163
|
+
* This allows DocumentStore to be used without embeddings for operations that don't need them.
|
|
7164
|
+
*
|
|
7165
|
+
* Environment variables per provider:
|
|
7166
|
+
* - openai: OPENAI_API_KEY (and optionally OPENAI_API_BASE, OPENAI_ORG_ID)
|
|
7167
|
+
* - vertex: GOOGLE_APPLICATION_CREDENTIALS (path to service account JSON)
|
|
7168
|
+
* - gemini: GOOGLE_API_KEY
|
|
7169
|
+
* - aws: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION
|
|
7170
|
+
* - microsoft: Azure OpenAI credentials (AZURE_OPENAI_API_*)
|
|
7171
|
+
*/
|
|
7172
|
+
async initializeEmbeddings() {
|
|
7173
|
+
if (this.embeddingConfig === null) {
|
|
7174
|
+
logger.debug("Embedding initialization skipped (explicitly disabled)");
|
|
7175
|
+
return;
|
|
7176
|
+
}
|
|
7177
|
+
const config = this.embeddingConfig || EmbeddingConfig.parseEmbeddingConfig();
|
|
7178
|
+
try {
|
|
7179
|
+
this.embeddings = createEmbeddingModel(config.modelSpec);
|
|
7180
|
+
if (config.dimensions !== null) {
|
|
7181
|
+
this.modelDimension = config.dimensions;
|
|
7182
|
+
} else {
|
|
7183
|
+
const testVector = await this.embeddings.embedQuery("test");
|
|
7184
|
+
this.modelDimension = testVector.length;
|
|
7185
|
+
EmbeddingConfig.setKnownModelDimensions(config.model, this.modelDimension);
|
|
7186
|
+
}
|
|
7187
|
+
if (this.modelDimension > this.dbDimension) {
|
|
7188
|
+
throw new DimensionError(config.modelSpec, this.modelDimension, this.dbDimension);
|
|
7189
|
+
}
|
|
7190
|
+
logger.debug(
|
|
7191
|
+
`Embeddings initialized: ${config.provider}:${config.model} (${this.modelDimension}d)`
|
|
7192
|
+
);
|
|
7193
|
+
} catch (error) {
|
|
7194
|
+
if (error instanceof Error) {
|
|
7195
|
+
if (error.message.includes("does not exist") || error.message.includes("MODEL_NOT_FOUND")) {
|
|
7196
|
+
throw new ModelConfigurationError(
|
|
7197
|
+
`❌ Invalid embedding model: ${config.model}
|
|
7198
|
+
The model "${config.model}" is not available or you don't have access to it.
|
|
7199
|
+
See README.md for supported models or run with --help for more details.`
|
|
7200
|
+
);
|
|
7201
|
+
}
|
|
7202
|
+
if (error.message.includes("API key") || error.message.includes("401") || error.message.includes("authentication")) {
|
|
7203
|
+
throw new ModelConfigurationError(
|
|
7204
|
+
`❌ Authentication failed for ${config.provider} embedding provider
|
|
7205
|
+
Please check your API key configuration.
|
|
7206
|
+
See README.md for configuration options or run with --help for more details.`
|
|
7207
|
+
);
|
|
7208
|
+
}
|
|
7209
|
+
}
|
|
7210
|
+
throw error;
|
|
7211
|
+
}
|
|
7212
|
+
}
|
|
7213
|
+
/**
|
|
7214
|
+
* Escapes a query string for use with SQLite FTS5 MATCH operator.
|
|
7215
|
+
* Wraps the query in double quotes and escapes internal double quotes.
|
|
7216
|
+
*/
|
|
7217
|
+
escapeFtsQuery(query) {
|
|
7218
|
+
const escapedQuotes = query.replace(/"/g, '""');
|
|
7219
|
+
return `"${escapedQuotes}"`;
|
|
7220
|
+
}
|
|
7221
|
+
/**
|
|
7222
|
+
* Initializes database connection and ensures readiness
|
|
7223
|
+
*/
|
|
7224
|
+
async initialize() {
|
|
7225
|
+
try {
|
|
7226
|
+
sqliteVec.load(this.db);
|
|
7227
|
+
applyMigrations(this.db);
|
|
7228
|
+
this.prepareStatements();
|
|
7229
|
+
await this.initializeEmbeddings();
|
|
7230
|
+
} catch (error) {
|
|
7231
|
+
if (error instanceof StoreError || error instanceof ModelConfigurationError || error instanceof UnsupportedProviderError) {
|
|
7232
|
+
throw error;
|
|
7233
|
+
}
|
|
7234
|
+
throw new ConnectionError("Failed to initialize database connection", error);
|
|
7235
|
+
}
|
|
7236
|
+
}
|
|
7237
|
+
/**
|
|
7238
|
+
* Gracefully closes database connections
|
|
7239
|
+
*/
|
|
7240
|
+
async shutdown() {
|
|
7241
|
+
this.db.close();
|
|
7242
|
+
}
|
|
7243
|
+
/**
|
|
7244
|
+
* Resolves a library name and version string to library_id and version_id.
|
|
7245
|
+
* Creates library and version records if they don't exist.
|
|
7246
|
+
*/
|
|
7247
|
+
async resolveLibraryAndVersionIds(library, version2) {
|
|
7248
|
+
const normalizedLibrary = library.toLowerCase();
|
|
7249
|
+
const normalizedVersion = denormalizeVersionName(version2.toLowerCase());
|
|
7250
|
+
this.statements.insertLibrary.run(normalizedLibrary);
|
|
7251
|
+
const libraryIdRow = this.statements.getLibraryIdByName.get(normalizedLibrary);
|
|
7252
|
+
if (!libraryIdRow || typeof libraryIdRow.id !== "number") {
|
|
7253
|
+
throw new StoreError(`Failed to resolve library_id for library: ${library}`);
|
|
7254
|
+
}
|
|
7255
|
+
const libraryId = libraryIdRow.id;
|
|
7256
|
+
this.statements.insertVersion.run(libraryId, normalizedVersion);
|
|
7257
|
+
const versionIdRow = this.statements.resolveVersionId.get(
|
|
7258
|
+
libraryId,
|
|
7259
|
+
normalizedVersion === null ? "" : normalizedVersion
|
|
7260
|
+
);
|
|
7261
|
+
if (!versionIdRow || typeof versionIdRow.id !== "number") {
|
|
7262
|
+
throw new StoreError(
|
|
7263
|
+
`Failed to resolve version_id for library: ${library}, version: ${version2}`
|
|
7264
|
+
);
|
|
7265
|
+
}
|
|
7266
|
+
return { libraryId, versionId: versionIdRow.id };
|
|
7267
|
+
}
|
|
7268
|
+
/**
|
|
7269
|
+
* Retrieves all unique versions for a specific library
|
|
7270
|
+
*/
|
|
7271
|
+
async queryUniqueVersions(library) {
|
|
7272
|
+
try {
|
|
7273
|
+
const rows = this.statements.queryVersions.all(library.toLowerCase());
|
|
7274
|
+
return rows.map((row) => normalizeVersionName(row.name));
|
|
7275
|
+
} catch (error) {
|
|
7276
|
+
throw new ConnectionError("Failed to query versions", error);
|
|
7277
|
+
}
|
|
7278
|
+
}
|
|
7279
|
+
/**
|
|
7280
|
+
* Updates the status of a version record in the database.
|
|
7281
|
+
* @param versionId The version ID to update
|
|
7282
|
+
* @param status The new status to set
|
|
7283
|
+
* @param errorMessage Optional error message for failed statuses
|
|
7284
|
+
*/
|
|
7285
|
+
async updateVersionStatus(versionId, status, errorMessage) {
|
|
7286
|
+
try {
|
|
7287
|
+
this.statements.updateVersionStatus.run(status, errorMessage ?? null, versionId);
|
|
7288
|
+
} catch (error) {
|
|
7289
|
+
throw new StoreError(`Failed to update version status: ${error}`);
|
|
7290
|
+
}
|
|
7291
|
+
}
|
|
7292
|
+
/**
|
|
7293
|
+
* Updates the progress counters for a version being indexed.
|
|
7294
|
+
* @param versionId The version ID to update
|
|
7295
|
+
* @param pages Current number of pages processed
|
|
7296
|
+
* @param maxPages Total number of pages to process
|
|
7297
|
+
*/
|
|
7298
|
+
async updateVersionProgress(versionId, pages, maxPages) {
|
|
7299
|
+
try {
|
|
7300
|
+
this.statements.updateVersionProgress.run(pages, maxPages, versionId);
|
|
7301
|
+
} catch (error) {
|
|
7302
|
+
throw new StoreError(`Failed to update version progress: ${error}`);
|
|
7303
|
+
}
|
|
7304
|
+
}
|
|
7305
|
+
/**
|
|
7306
|
+
* Retrieves versions by their status.
|
|
7307
|
+
* @param statuses Array of statuses to filter by
|
|
7308
|
+
* @returns Array of version records matching the statuses
|
|
7309
|
+
*/
|
|
7310
|
+
async getVersionsByStatus(statuses) {
|
|
7311
|
+
try {
|
|
7312
|
+
const statusJson = JSON.stringify(statuses);
|
|
7313
|
+
const rows = this.statements.getVersionsByStatus.all(
|
|
7314
|
+
statusJson
|
|
7315
|
+
);
|
|
7316
|
+
return rows;
|
|
7317
|
+
} catch (error) {
|
|
7318
|
+
throw new StoreError(`Failed to get versions by status: ${error}`);
|
|
7319
|
+
}
|
|
7320
|
+
}
|
|
7321
|
+
/**
|
|
7322
|
+
* Stores scraper options for a version to enable reproducible indexing.
|
|
7323
|
+
* @param versionId The version ID to update
|
|
7324
|
+
* @param options Complete scraper options used for indexing
|
|
7325
|
+
*/
|
|
7326
|
+
async storeScraperOptions(versionId, options) {
|
|
7327
|
+
try {
|
|
7328
|
+
const { url: source_url, library, version: version2, signal, ...scraper_options } = options;
|
|
7329
|
+
const optionsJson = JSON.stringify(scraper_options);
|
|
7330
|
+
this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId);
|
|
7331
|
+
} catch (error) {
|
|
7332
|
+
throw new StoreError(`Failed to store scraper options: ${error}`);
|
|
7333
|
+
}
|
|
7334
|
+
}
|
|
7335
|
+
/**
|
|
7336
|
+
* Retrieves stored scraping configuration (source URL and options) for a version.
|
|
7337
|
+
* Returns null when no source URL is recorded (not re-indexable).
|
|
7338
|
+
*/
|
|
7339
|
+
async getScraperOptions(versionId) {
|
|
7340
|
+
try {
|
|
7341
|
+
const row = this.statements.getVersionWithOptions.get(versionId);
|
|
7342
|
+
if (!row?.source_url) {
|
|
7343
|
+
return null;
|
|
7344
|
+
}
|
|
7345
|
+
let parsed = {};
|
|
7346
|
+
if (row.scraper_options) {
|
|
7347
|
+
try {
|
|
7348
|
+
parsed = JSON.parse(row.scraper_options);
|
|
7349
|
+
} catch (e) {
|
|
7350
|
+
logger.warn(`⚠️ Invalid scraper_options JSON for version ${versionId}: ${e}`);
|
|
7351
|
+
parsed = {};
|
|
7352
|
+
}
|
|
7353
|
+
}
|
|
7354
|
+
return { sourceUrl: row.source_url, options: parsed };
|
|
7355
|
+
} catch (error) {
|
|
7356
|
+
throw new StoreError(`Failed to get scraper options: ${error}`);
|
|
7357
|
+
}
|
|
7358
|
+
}
|
|
7359
|
+
/**
|
|
7360
|
+
* Finds versions that were indexed from the same source URL.
|
|
7361
|
+
* Useful for finding similar configurations or detecting duplicates.
|
|
7362
|
+
* @param url Source URL to search for
|
|
7363
|
+
* @returns Array of versions with the same source URL
|
|
7364
|
+
*/
|
|
7365
|
+
async findVersionsBySourceUrl(url) {
|
|
7366
|
+
try {
|
|
7367
|
+
const rows = this.statements.getVersionsBySourceUrl.all(
|
|
7368
|
+
url
|
|
7369
|
+
);
|
|
7370
|
+
return rows;
|
|
7371
|
+
} catch (error) {
|
|
7372
|
+
throw new StoreError(`Failed to find versions by source URL: ${error}`);
|
|
7373
|
+
}
|
|
7374
|
+
}
|
|
7375
|
+
/**
|
|
7376
|
+
* Verifies existence of documents for a specific library version
|
|
7377
|
+
*/
|
|
7378
|
+
async checkDocumentExists(library, version2) {
|
|
7379
|
+
try {
|
|
7380
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7381
|
+
const result = this.statements.checkExists.get(
|
|
7382
|
+
library.toLowerCase(),
|
|
7383
|
+
normalizedVersion
|
|
7384
|
+
);
|
|
7385
|
+
return result !== void 0;
|
|
7386
|
+
} catch (error) {
|
|
7387
|
+
throw new ConnectionError("Failed to check document existence", error);
|
|
7388
|
+
}
|
|
7389
|
+
}
|
|
7390
|
+
/**
|
|
7391
|
+
* Retrieves a mapping of all libraries to their available versions with details.
|
|
7392
|
+
*/
|
|
7393
|
+
async queryLibraryVersions() {
|
|
7394
|
+
try {
|
|
7395
|
+
const rows = this.statements.queryLibraryVersions.all();
|
|
7396
|
+
const libraryMap = /* @__PURE__ */ new Map();
|
|
7397
|
+
for (const row of rows) {
|
|
7398
|
+
const library = row.library;
|
|
7399
|
+
if (!libraryMap.has(library)) {
|
|
7400
|
+
libraryMap.set(library, []);
|
|
7401
|
+
}
|
|
7402
|
+
const indexedAtISO = row.indexedAt ? new Date(row.indexedAt).toISOString() : null;
|
|
7403
|
+
libraryMap.get(library)?.push({
|
|
7404
|
+
version: row.version,
|
|
7405
|
+
versionId: row.versionId,
|
|
7406
|
+
// Preserve raw string status here; DocumentManagementService will cast to VersionStatus
|
|
7407
|
+
status: row.status,
|
|
7408
|
+
progressPages: row.progressPages,
|
|
7409
|
+
progressMaxPages: row.progressMaxPages,
|
|
7410
|
+
sourceUrl: row.sourceUrl,
|
|
7411
|
+
documentCount: row.documentCount,
|
|
7412
|
+
uniqueUrlCount: row.uniqueUrlCount,
|
|
7413
|
+
indexedAt: indexedAtISO
|
|
7414
|
+
});
|
|
7415
|
+
}
|
|
7416
|
+
for (const versions of libraryMap.values()) {
|
|
7417
|
+
versions.sort((a, b) => {
|
|
7418
|
+
if (a.version === "" && b.version !== "") {
|
|
7419
|
+
return -1;
|
|
7420
|
+
}
|
|
7421
|
+
if (a.version !== "" && b.version === "") {
|
|
7422
|
+
return 1;
|
|
7423
|
+
}
|
|
7424
|
+
if (a.version === "" && b.version === "") {
|
|
7425
|
+
return 0;
|
|
7426
|
+
}
|
|
7427
|
+
try {
|
|
7428
|
+
return semver__default.compare(a.version, b.version);
|
|
7429
|
+
} catch (_error) {
|
|
7430
|
+
return a.version.localeCompare(b.version);
|
|
7431
|
+
}
|
|
7432
|
+
});
|
|
7433
|
+
}
|
|
7434
|
+
return libraryMap;
|
|
7435
|
+
} catch (error) {
|
|
7436
|
+
throw new ConnectionError("Failed to query library versions", error);
|
|
7437
|
+
}
|
|
7438
|
+
}
|
|
7439
|
+
/**
|
|
7440
|
+
* Stores documents with library and version metadata, generating embeddings
|
|
7441
|
+
* for vector similarity search. Automatically removes any existing documents
|
|
7442
|
+
* for the same URLs before adding new ones to prevent UNIQUE constraint violations.
|
|
7443
|
+
*/
|
|
7444
|
+
async addDocuments(library, version2, documents) {
|
|
7445
|
+
try {
|
|
7446
|
+
if (documents.length === 0) {
|
|
7447
|
+
return;
|
|
7448
|
+
}
|
|
7449
|
+
const urls = /* @__PURE__ */ new Set();
|
|
7450
|
+
for (const doc of documents) {
|
|
7451
|
+
const url = doc.metadata.url;
|
|
7452
|
+
if (!url || typeof url !== "string" || !url.trim()) {
|
|
7453
|
+
throw new StoreError("Document metadata must include a valid URL");
|
|
7454
|
+
}
|
|
7455
|
+
urls.add(url);
|
|
7456
|
+
}
|
|
7457
|
+
const texts = documents.map((doc) => {
|
|
7458
|
+
const header = `<title>${doc.metadata.title}</title>
|
|
7459
|
+
<url>${doc.metadata.url}</url>
|
|
7460
|
+
<path>${doc.metadata.path.join(" / ")}</path>
|
|
7461
|
+
`;
|
|
7462
|
+
return `${header}${doc.pageContent}`;
|
|
7463
|
+
});
|
|
7464
|
+
const maxBatchChars = Number(process.env.DOCS_MCP_EMBEDDING_BATCH_CHARS) || EMBEDDING_BATCH_CHARS;
|
|
7465
|
+
const rawEmbeddings = [];
|
|
7466
|
+
let currentBatch = [];
|
|
7467
|
+
let currentBatchSize = 0;
|
|
7468
|
+
let batchCount = 0;
|
|
7469
|
+
for (const text of texts) {
|
|
7470
|
+
const textSize = text.length;
|
|
7471
|
+
if (currentBatchSize + textSize > maxBatchChars && currentBatch.length > 0) {
|
|
7472
|
+
batchCount++;
|
|
7473
|
+
logger.debug(
|
|
7474
|
+
`🔄 Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
|
|
7475
|
+
);
|
|
7476
|
+
const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
|
|
7477
|
+
rawEmbeddings.push(...batchEmbeddings);
|
|
7478
|
+
currentBatch = [];
|
|
7479
|
+
currentBatchSize = 0;
|
|
7480
|
+
}
|
|
7481
|
+
currentBatch.push(text);
|
|
7482
|
+
currentBatchSize += textSize;
|
|
7483
|
+
if (currentBatch.length >= EMBEDDING_BATCH_SIZE) {
|
|
7484
|
+
batchCount++;
|
|
7485
|
+
logger.debug(
|
|
7486
|
+
`🔄 Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
|
|
7487
|
+
);
|
|
7488
|
+
const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
|
|
7489
|
+
rawEmbeddings.push(...batchEmbeddings);
|
|
7490
|
+
currentBatch = [];
|
|
7491
|
+
currentBatchSize = 0;
|
|
7492
|
+
}
|
|
7493
|
+
}
|
|
7494
|
+
if (currentBatch.length > 0) {
|
|
7495
|
+
batchCount++;
|
|
7496
|
+
logger.debug(
|
|
7497
|
+
`🔄 Processing final embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
|
|
7498
|
+
);
|
|
7499
|
+
const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
|
|
7500
|
+
rawEmbeddings.push(...batchEmbeddings);
|
|
7501
|
+
}
|
|
7502
|
+
const paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
|
|
7503
|
+
const { libraryId, versionId } = await this.resolveLibraryAndVersionIds(
|
|
7504
|
+
library,
|
|
7505
|
+
version2
|
|
7506
|
+
);
|
|
7507
|
+
for (const url of urls) {
|
|
7508
|
+
const deletedCount = await this.deleteDocumentsByUrl(library, version2, url);
|
|
7509
|
+
if (deletedCount > 0) {
|
|
7510
|
+
logger.debug(`Deleted ${deletedCount} existing documents for URL: ${url}`);
|
|
7511
|
+
}
|
|
7512
|
+
}
|
|
7513
|
+
const transaction = this.db.transaction((docs) => {
|
|
7514
|
+
for (let i = 0; i < docs.length; i++) {
|
|
7515
|
+
const doc = docs[i];
|
|
7516
|
+
const url = doc.metadata.url;
|
|
7517
|
+
const result = this.statements.insertDocument.run(
|
|
7518
|
+
BigInt(libraryId),
|
|
7519
|
+
BigInt(versionId),
|
|
7520
|
+
url,
|
|
7521
|
+
doc.pageContent,
|
|
7522
|
+
JSON.stringify(doc.metadata),
|
|
7523
|
+
i,
|
|
7524
|
+
(/* @__PURE__ */ new Date()).toISOString()
|
|
7525
|
+
// Pass current timestamp for indexed_at
|
|
7526
|
+
);
|
|
7527
|
+
const rowId = result.lastInsertRowid;
|
|
7528
|
+
this.statements.insertEmbedding.run(
|
|
7529
|
+
BigInt(rowId),
|
|
7530
|
+
BigInt(libraryId),
|
|
7531
|
+
BigInt(versionId),
|
|
7532
|
+
JSON.stringify(paddedEmbeddings[i])
|
|
7533
|
+
);
|
|
7534
|
+
}
|
|
7535
|
+
});
|
|
7536
|
+
transaction(documents);
|
|
7537
|
+
} catch (error) {
|
|
7538
|
+
throw new ConnectionError("Failed to add documents to store", error);
|
|
7539
|
+
}
|
|
7540
|
+
}
|
|
7541
|
+
/**
|
|
7542
|
+
* Removes documents matching specified library and version
|
|
7543
|
+
* @returns Number of documents deleted
|
|
7544
|
+
*/
|
|
7545
|
+
async deleteDocuments(library, version2) {
|
|
7546
|
+
try {
|
|
7547
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7548
|
+
const result = this.statements.deleteDocuments.run(
|
|
7549
|
+
library.toLowerCase(),
|
|
7550
|
+
library.toLowerCase(),
|
|
7551
|
+
// library name appears twice in the query
|
|
7552
|
+
normalizedVersion
|
|
7553
|
+
);
|
|
7554
|
+
return result.changes;
|
|
7555
|
+
} catch (error) {
|
|
7556
|
+
throw new ConnectionError("Failed to delete documents", error);
|
|
7557
|
+
}
|
|
7558
|
+
}
|
|
7559
|
+
/**
|
|
7560
|
+
* Removes documents for a specific URL within a library and version
|
|
7561
|
+
* @returns Number of documents deleted
|
|
7562
|
+
*/
|
|
7563
|
+
async deleteDocumentsByUrl(library, version2, url) {
|
|
7564
|
+
try {
|
|
7565
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7566
|
+
const result = this.statements.deleteDocumentsByUrl.run(
|
|
7567
|
+
url,
|
|
7568
|
+
library.toLowerCase(),
|
|
7569
|
+
library.toLowerCase(),
|
|
7570
|
+
// library name appears twice in the query
|
|
7571
|
+
normalizedVersion
|
|
7572
|
+
);
|
|
7573
|
+
return result.changes;
|
|
7574
|
+
} catch (error) {
|
|
7575
|
+
throw new ConnectionError("Failed to delete documents by URL", error);
|
|
7576
|
+
}
|
|
7577
|
+
}
|
|
7578
|
+
/**
|
|
7579
|
+
* Completely removes a library version and all associated documents.
|
|
7580
|
+
* Optionally removes the library if no other versions remain.
|
|
7581
|
+
* @param library Library name
|
|
7582
|
+
* @param version Version string (empty string for unversioned)
|
|
7583
|
+
* @param removeLibraryIfEmpty Whether to remove the library if no versions remain
|
|
7584
|
+
* @returns Object with counts of deleted documents, version deletion status, and library deletion status
|
|
7585
|
+
*/
|
|
7586
|
+
async removeVersion(library, version2, removeLibraryIfEmpty = true) {
|
|
7587
|
+
try {
|
|
7588
|
+
const normalizedLibrary = library.toLowerCase();
|
|
7589
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7590
|
+
const versionResult = this.statements.getVersionId.get(
|
|
7591
|
+
normalizedLibrary,
|
|
7592
|
+
normalizedVersion
|
|
7593
|
+
);
|
|
7594
|
+
if (!versionResult) {
|
|
7595
|
+
return { documentsDeleted: 0, versionDeleted: false, libraryDeleted: false };
|
|
7596
|
+
}
|
|
7597
|
+
const { id: versionId, library_id: libraryId } = versionResult;
|
|
7598
|
+
const documentsDeleted = await this.deleteDocuments(library, version2);
|
|
7599
|
+
const versionDeleteResult = this.statements.deleteVersionById.run(versionId);
|
|
7600
|
+
const versionDeleted = versionDeleteResult.changes > 0;
|
|
7601
|
+
let libraryDeleted = false;
|
|
7602
|
+
if (removeLibraryIfEmpty && versionDeleted) {
|
|
7603
|
+
const countResult = this.statements.countVersionsByLibraryId.get(libraryId);
|
|
7604
|
+
const remainingVersions = countResult?.count ?? 0;
|
|
7605
|
+
if (remainingVersions === 0) {
|
|
7606
|
+
const libraryDeleteResult = this.statements.deleteLibraryById.run(libraryId);
|
|
7607
|
+
libraryDeleted = libraryDeleteResult.changes > 0;
|
|
7608
|
+
}
|
|
7609
|
+
}
|
|
7610
|
+
return { documentsDeleted, versionDeleted, libraryDeleted };
|
|
7611
|
+
} catch (error) {
|
|
7612
|
+
throw new ConnectionError("Failed to remove version", error);
|
|
7613
|
+
}
|
|
7614
|
+
}
|
|
7615
|
+
/**
|
|
7616
|
+
* Retrieves a document by its ID.
|
|
7617
|
+
* @param id The ID of the document.
|
|
7618
|
+
* @returns The document, or null if not found.
|
|
7619
|
+
*/
|
|
7620
|
+
async getById(id) {
|
|
7621
|
+
try {
|
|
7622
|
+
const row = this.statements.getById.get(BigInt(id));
|
|
7623
|
+
if (!row) {
|
|
7624
|
+
return null;
|
|
7625
|
+
}
|
|
7626
|
+
return mapDbDocumentToDocument(row);
|
|
7627
|
+
} catch (error) {
|
|
7628
|
+
throw new ConnectionError(`Failed to get document by ID ${id}`, error);
|
|
7629
|
+
}
|
|
7630
|
+
}
|
|
7631
|
+
/**
|
|
7632
|
+
* Finds documents matching a text query using hybrid search.
|
|
7633
|
+
* Combines vector similarity search with full-text search using Reciprocal Rank Fusion.
|
|
7634
|
+
*/
|
|
7635
|
+
async findByContent(library, version2, query, limit) {
|
|
7636
|
+
try {
|
|
7637
|
+
const rawEmbedding = await this.embeddings.embedQuery(query);
|
|
7638
|
+
const embedding = this.padVector(rawEmbedding);
|
|
7639
|
+
const ftsQuery = this.escapeFtsQuery(query);
|
|
7640
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7641
|
+
const stmt = this.db.prepare(`
|
|
7642
|
+
WITH vec_distances AS (
|
|
7643
|
+
SELECT
|
|
7644
|
+
dv.rowid as id,
|
|
7645
|
+
dv.distance as vec_distance
|
|
7646
|
+
FROM documents_vec dv
|
|
7647
|
+
JOIN versions v ON dv.version_id = v.id
|
|
7648
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7649
|
+
WHERE l.name = ?
|
|
7650
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7651
|
+
AND dv.embedding MATCH ?
|
|
7652
|
+
AND dv.k = ?
|
|
7653
|
+
ORDER BY dv.distance
|
|
7654
|
+
),
|
|
7655
|
+
fts_scores AS (
|
|
7656
|
+
SELECT
|
|
7657
|
+
f.rowid as id,
|
|
7658
|
+
bm25(documents_fts, 10.0, 1.0, 5.0, 1.0) as fts_score
|
|
7659
|
+
FROM documents_fts f
|
|
7660
|
+
JOIN documents d ON f.rowid = d.id
|
|
7661
|
+
JOIN versions v ON d.version_id = v.id
|
|
7662
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7663
|
+
WHERE l.name = ?
|
|
7664
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7665
|
+
AND documents_fts MATCH ?
|
|
7666
|
+
ORDER BY fts_score
|
|
7667
|
+
LIMIT ?
|
|
7668
|
+
)
|
|
7669
|
+
SELECT
|
|
7670
|
+
d.id,
|
|
7671
|
+
d.content,
|
|
7672
|
+
d.metadata,
|
|
7673
|
+
COALESCE(1 / (1 + v.vec_distance), 0) as vec_score,
|
|
7674
|
+
COALESCE(-MIN(f.fts_score, 0), 0) as fts_score
|
|
7675
|
+
FROM documents d
|
|
7676
|
+
LEFT JOIN vec_distances v ON d.id = v.id
|
|
7677
|
+
LEFT JOIN fts_scores f ON d.id = f.id
|
|
7678
|
+
WHERE v.id IS NOT NULL OR f.id IS NOT NULL
|
|
7679
|
+
`);
|
|
7680
|
+
const rawResults = stmt.all(
|
|
7681
|
+
library.toLowerCase(),
|
|
7682
|
+
normalizedVersion,
|
|
7683
|
+
JSON.stringify(embedding),
|
|
7684
|
+
limit,
|
|
7685
|
+
library.toLowerCase(),
|
|
7686
|
+
normalizedVersion,
|
|
7687
|
+
ftsQuery,
|
|
7688
|
+
// Use the escaped query
|
|
7689
|
+
limit
|
|
7690
|
+
);
|
|
7691
|
+
const rankedResults = this.assignRanks(rawResults);
|
|
7692
|
+
const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
|
|
7693
|
+
return topResults.map((row) => ({
|
|
7694
|
+
...mapDbDocumentToDocument(row),
|
|
7695
|
+
metadata: {
|
|
7696
|
+
...JSON.parse(row.metadata),
|
|
7697
|
+
id: row.id,
|
|
7698
|
+
score: row.rrf_score,
|
|
7699
|
+
vec_rank: row.vec_rank,
|
|
7700
|
+
fts_rank: row.fts_rank
|
|
7701
|
+
}
|
|
7702
|
+
}));
|
|
7703
|
+
} catch (error) {
|
|
7704
|
+
throw new ConnectionError(
|
|
7705
|
+
`Failed to find documents by content with query "${query}"`,
|
|
7706
|
+
error
|
|
7707
|
+
);
|
|
7708
|
+
}
|
|
7709
|
+
}
|
|
7710
|
+
/**
|
|
7711
|
+
* Finds child chunks of a given document based on path hierarchy.
|
|
7712
|
+
*/
|
|
7713
|
+
async findChildChunks(library, version2, id, limit) {
|
|
7714
|
+
try {
|
|
7715
|
+
const parent = await this.getById(id);
|
|
7716
|
+
if (!parent) {
|
|
7717
|
+
return [];
|
|
7718
|
+
}
|
|
7719
|
+
const parentPath = parent.metadata.path ?? [];
|
|
7720
|
+
const parentUrl = parent.metadata.url;
|
|
7721
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7722
|
+
const result = this.statements.getChildChunks.all(
|
|
7723
|
+
library.toLowerCase(),
|
|
7724
|
+
normalizedVersion,
|
|
7725
|
+
parentUrl,
|
|
7726
|
+
parentPath.length + 1,
|
|
7727
|
+
JSON.stringify(parentPath),
|
|
7728
|
+
BigInt(id),
|
|
7729
|
+
limit
|
|
7730
|
+
);
|
|
7731
|
+
return result.map((row) => mapDbDocumentToDocument(row));
|
|
7732
|
+
} catch (error) {
|
|
7733
|
+
throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
|
|
7734
|
+
}
|
|
7735
|
+
}
|
|
7736
|
+
/**
|
|
7737
|
+
* Finds preceding sibling chunks of a given document.
|
|
7738
|
+
*/
|
|
7739
|
+
async findPrecedingSiblingChunks(library, version2, id, limit) {
|
|
7740
|
+
try {
|
|
7741
|
+
const reference = await this.getById(id);
|
|
7742
|
+
if (!reference) {
|
|
7743
|
+
return [];
|
|
7744
|
+
}
|
|
7745
|
+
const refMetadata = reference.metadata;
|
|
7746
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7747
|
+
const result = this.statements.getPrecedingSiblings.all(
|
|
7748
|
+
library.toLowerCase(),
|
|
7749
|
+
normalizedVersion,
|
|
7750
|
+
refMetadata.url,
|
|
7751
|
+
BigInt(id),
|
|
7752
|
+
JSON.stringify(refMetadata.path),
|
|
7753
|
+
limit
|
|
7754
|
+
);
|
|
7755
|
+
return result.reverse().map((row) => mapDbDocumentToDocument(row));
|
|
7756
|
+
} catch (error) {
|
|
7757
|
+
throw new ConnectionError(
|
|
7758
|
+
`Failed to find preceding sibling chunks for ID ${id}`,
|
|
7759
|
+
error
|
|
7760
|
+
);
|
|
7761
|
+
}
|
|
7762
|
+
}
|
|
7763
|
+
/**
|
|
7764
|
+
* Finds subsequent sibling chunks of a given document.
|
|
7765
|
+
*/
|
|
7766
|
+
async findSubsequentSiblingChunks(library, version2, id, limit) {
|
|
7767
|
+
try {
|
|
7768
|
+
const reference = await this.getById(id);
|
|
7769
|
+
if (!reference) {
|
|
7770
|
+
return [];
|
|
7771
|
+
}
|
|
7772
|
+
const refMetadata = reference.metadata;
|
|
7773
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7774
|
+
const result = this.statements.getSubsequentSiblings.all(
|
|
7775
|
+
library.toLowerCase(),
|
|
7776
|
+
normalizedVersion,
|
|
7777
|
+
refMetadata.url,
|
|
7778
|
+
BigInt(id),
|
|
7779
|
+
JSON.stringify(refMetadata.path),
|
|
7780
|
+
limit
|
|
7781
|
+
);
|
|
7782
|
+
return result.map((row) => mapDbDocumentToDocument(row));
|
|
7783
|
+
} catch (error) {
|
|
7784
|
+
throw new ConnectionError(
|
|
7785
|
+
`Failed to find subsequent sibling chunks for ID ${id}`,
|
|
7786
|
+
error
|
|
7787
|
+
);
|
|
7788
|
+
}
|
|
7789
|
+
}
|
|
7790
|
+
/**
|
|
7791
|
+
* Finds the parent chunk of a given document.
|
|
7792
|
+
*/
|
|
7793
|
+
async findParentChunk(library, version2, id) {
|
|
7794
|
+
try {
|
|
7795
|
+
const child = await this.getById(id);
|
|
7796
|
+
if (!child) {
|
|
7797
|
+
return null;
|
|
7798
|
+
}
|
|
7799
|
+
const childMetadata = child.metadata;
|
|
7800
|
+
const path2 = childMetadata.path ?? [];
|
|
7801
|
+
const parentPath = path2.slice(0, -1);
|
|
7802
|
+
if (parentPath.length === 0) {
|
|
7803
|
+
return null;
|
|
7804
|
+
}
|
|
7805
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7806
|
+
const result = this.statements.getParentChunk.get(
|
|
7807
|
+
library.toLowerCase(),
|
|
7808
|
+
normalizedVersion,
|
|
7809
|
+
childMetadata.url,
|
|
7810
|
+
JSON.stringify(parentPath),
|
|
7811
|
+
BigInt(id)
|
|
7812
|
+
);
|
|
7813
|
+
if (!result) {
|
|
7814
|
+
return null;
|
|
7815
|
+
}
|
|
7816
|
+
return mapDbDocumentToDocument(result);
|
|
7817
|
+
} catch (error) {
|
|
7818
|
+
throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error);
|
|
7819
|
+
}
|
|
7820
|
+
}
|
|
7821
|
+
/**
|
|
7822
|
+
* Fetches multiple documents by their IDs in a single call.
|
|
7823
|
+
* Returns an array of Document objects, sorted by their sort_order.
|
|
7824
|
+
*/
|
|
7825
|
+
async findChunksByIds(library, version2, ids) {
|
|
7826
|
+
if (!ids.length) return [];
|
|
7827
|
+
try {
|
|
7828
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7829
|
+
const placeholders = ids.map(() => "?").join(",");
|
|
7830
|
+
const stmt = this.db.prepare(
|
|
7831
|
+
`SELECT d.* FROM documents d
|
|
7832
|
+
JOIN libraries l ON d.library_id = l.id
|
|
7833
|
+
JOIN versions v ON d.version_id = v.id
|
|
7834
|
+
WHERE l.name = ?
|
|
7835
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7836
|
+
AND d.id IN (${placeholders})
|
|
7837
|
+
ORDER BY d.sort_order`
|
|
7838
|
+
);
|
|
7839
|
+
const rows = stmt.all(
|
|
7840
|
+
library.toLowerCase(),
|
|
7841
|
+
normalizedVersion,
|
|
7842
|
+
...ids
|
|
7843
|
+
);
|
|
7844
|
+
return rows.map((row) => mapDbDocumentToDocument(row));
|
|
7845
|
+
} catch (error) {
|
|
7846
|
+
throw new ConnectionError("Failed to fetch documents by IDs", error);
|
|
7847
|
+
}
|
|
7848
|
+
}
|
|
7849
|
+
}
|
|
7850
|
+
/**
 * Facade over the document store: resolves where the database lives, wires up
 * the retriever and the text splitter, and exposes library/version/document
 * operations that mostly delegate to the underlying store.
 */
class DocumentManagementService {
  store;
  documentRetriever;
  splitter;

  /**
   * Normalizes a version string, converting null or undefined to an empty
   * string and converting to lowercase.
   */
  normalizeVersion(version2) {
    return (version2 ?? "").toLowerCase();
  }

  /**
   * Resolves the database location and builds the store, retriever and splitter.
   *
   * Location precedence: `DOCS_MCP_STORE_PATH`, then an existing legacy
   * `<project root>/.store/documents.db`, then the `envPaths` data directory.
   *
   * @param embeddingConfig Passed through unchanged to `DocumentStore`.
   */
  constructor(embeddingConfig) {
    let dbPath;
    let dbDir;
    const envStorePath = process.env.DOCS_MCP_STORE_PATH;
    if (envStorePath) {
      dbDir = envStorePath;
      dbPath = path.join(dbDir, "documents.db");
      logger.debug(`Using database directory from DOCS_MCP_STORE_PATH: ${dbDir}`);
    } else {
      const projectRoot2 = getProjectRoot();
      const oldDbDir = path.join(projectRoot2, ".store");
      const oldDbPath = path.join(oldDbDir, "documents.db");
      const oldDbExists = fs.existsSync(oldDbPath);
      if (oldDbExists) {
        dbPath = oldDbPath;
        dbDir = oldDbDir;
        logger.debug(`Using legacy database path: ${dbPath}`);
      } else {
        const standardPaths = envPaths("docs-mcp-server", { suffix: "" });
        dbDir = standardPaths.data;
        dbPath = path.join(dbDir, "documents.db");
        logger.debug(`Using standard database directory: ${dbDir}`);
      }
    }
    try {
      fs.mkdirSync(dbDir, { recursive: true });
    } catch (error) {
      // Best effort: the failure is logged and construction continues.
      logger.error(`⚠️ Failed to create database directory ${dbDir}: ${error}`);
    }
    this.store = new DocumentStore(dbPath, embeddingConfig);
    this.documentRetriever = new DocumentRetrieverService(this.store);
    const semanticSplitter = new SemanticMarkdownSplitter(
      SPLITTER_PREFERRED_CHUNK_SIZE,
      SPLITTER_MAX_CHUNK_SIZE
    );
    const greedySplitter = new GreedySplitter(
      semanticSplitter,
      SPLITTER_MIN_CHUNK_SIZE,
      SPLITTER_PREFERRED_CHUNK_SIZE
    );
    this.splitter = greedySplitter;
  }

  /**
   * Initializes the underlying document store.
   */
  async initialize() {
    await this.store.initialize();
  }

  /**
   * Shuts down the underlying document store.
   */
  async shutdown() {
    logger.debug("Shutting down store manager");
    await this.store.shutdown();
  }

  // Status tracking methods for pipeline integration

  /**
   * Gets versions by their current status.
   */
  async getVersionsByStatus(statuses) {
    return this.store.getVersionsByStatus(statuses);
  }

  /**
   * Updates the status of a version.
   */
  async updateVersionStatus(versionId, status, errorMessage) {
    return this.store.updateVersionStatus(versionId, status, errorMessage);
  }

  /**
   * Updates the progress of a version being indexed.
   */
  async updateVersionProgress(versionId, pages, maxPages) {
    return this.store.updateVersionProgress(versionId, pages, maxPages);
  }

  /**
   * Stores scraper options for a version to enable reproducible indexing.
   */
  async storeScraperOptions(versionId, options) {
    return this.store.storeScraperOptions(versionId, options);
  }

  /**
   * Retrieves the stored scraper options (scraping configuration) for a version.
   */
  async getScraperOptions(versionId) {
    return this.store.getScraperOptions(versionId);
  }

  /**
   * Ensures a library/version exists using a VersionRef and returns version ID.
   * Trims and lowercases both parts, then delegates to ensureLibraryAndVersion.
   */
  async ensureVersion(ref) {
    const normalized = {
      library: ref.library.trim().toLowerCase(),
      version: (ref.version ?? "").trim().toLowerCase()
    };
    return this.ensureLibraryAndVersion(normalized.library, normalized.version);
  }

  /**
   * Returns enriched library summaries including version status/progress and counts.
   * Uses existing store APIs; keeps DB details encapsulated.
   */
  async listLibraries() {
    const libMap = await this.store.queryLibraryVersions();
    const summaries = [];
    for (const [library, versions] of libMap) {
      const vs = versions.map((v) => ({
        id: v.versionId,
        ref: { library, version: v.version },
        status: v.status,
        // Progress is omitted (undefined) once a version is "completed".
        progress:
          v.status === "completed"
            ? void 0
            : { pages: v.progressPages, maxPages: v.progressMaxPages },
        counts: { documents: v.documentCount, uniqueUrls: v.uniqueUrlCount },
        indexedAt: v.indexedAt,
        sourceUrl: v.sourceUrl ?? void 0
      }));
      summaries.push({ library, versions: vs });
    }
    return summaries;
  }

  /**
   * Finds versions that were indexed from the same source URL.
   */
  async findVersionsBySourceUrl(url) {
    return this.store.findVersionsBySourceUrl(url);
  }

  /**
   * Validates if a library exists in the store (either versioned or unversioned).
   * Throws LibraryNotFoundError with up to three fuzzy-matched name
   * suggestions if the library is not found.
   * @param library The name of the library to validate.
   * @throws {LibraryNotFoundError} If the library does not exist.
   */
  async validateLibraryExists(library) {
    logger.info(`🔎 Validating existence of library: ${library}`);
    const normalizedLibrary = library.toLowerCase();
    const versions = await this.listVersions(normalizedLibrary);
    const hasUnversioned = await this.exists(normalizedLibrary, "");
    if (versions.length === 0 && !hasUnversioned) {
      logger.warn(`⚠️ Library '${library}' not found.`);
      const allLibraries = await this.listLibraries();
      const libraryNames = allLibraries.map((lib) => lib.library);
      let suggestions = [];
      if (libraryNames.length > 0) {
        const fuse = new Fuse(libraryNames, {
          // Fuzziness threshold (0 = exact, 1 = match anything).
          threshold: 0.4
        });
        const results = fuse.search(normalizedLibrary);
        suggestions = results.slice(0, 3).map((result) => result.item);
        logger.info(`🔍 Found suggestions: ${suggestions.join(", ")}`);
      }
      throw new LibraryNotFoundError(library, suggestions);
    }
    logger.info(`✅ Library '${library}' confirmed to exist.`);
  }

  /**
   * Returns a list of all available semantic versions for a library.
   * Version names that are not valid semver are filtered out.
   */
  async listVersions(library) {
    const versions = await this.store.queryUniqueVersions(library);
    return versions.filter((v) => semver__default.valid(v));
  }

  /**
   * Checks if documents exist for a given library and optional version.
   * If version is omitted, checks for documents without a specific version.
   */
  async exists(library, version2) {
    const normalizedVersion = this.normalizeVersion(version2);
    return this.store.checkDocumentExists(library, normalizedVersion);
  }

  /**
   * Finds the most appropriate version of documentation based on the requested version.
   * When no target version is specified, returns the latest version.
   *
   * Version matching behavior:
   * - Exact versions (e.g., "18.0.0"): Matches that version or any earlier version
   * - X-Range patterns (e.g., "5.x", "5.2.x"): Matches within the specified range
   * - "latest" or no version: Returns the latest available version
   *
   * For documentation, we prefer matching older versions over no match at all,
   * since older docs are often still relevant and useful.
   * Also checks if unversioned documents exist for the library.
   *
   * @returns `{ bestMatch, hasUnversioned }`; `bestMatch` is null when only
   *   unversioned documents satisfy the request.
   * @throws {VersionNotFoundError} If there is neither a matching version nor
   *   unversioned documents.
   */
  async findBestVersion(library, targetVersion) {
    const libraryAndVersion = `${library}${targetVersion ? `@${targetVersion}` : ""}`;
    logger.info(`🔍 Finding best version for ${libraryAndVersion}`);
    const hasUnversioned = await this.store.checkDocumentExists(library, "");
    const versionStrings = await this.listVersions(library);

    // Both failure paths report the same details; build the error in one place.
    const buildNotFoundError = async () => {
      const allLibraryDetails = await this.store.queryLibraryVersions();
      const libraryDetails = allLibraryDetails.get(library) ?? [];
      return new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
    };

    if (versionStrings.length === 0) {
      if (hasUnversioned) {
        logger.info(`ℹ️ Unversioned documents exist for ${library}`);
        return { bestMatch: null, hasUnversioned: true };
      }
      logger.warn(`⚠️ No valid versions found for ${library}`);
      throw await buildNotFoundError();
    }

    let bestMatch = null;
    if (!targetVersion || targetVersion === "latest") {
      bestMatch = semver__default.maxSatisfying(versionStrings, "*");
    } else {
      const versionRegex = /^(\d+)(?:\.(?:x(?:\.x)?|\d+(?:\.(?:x|\d+))?))?$|^$/;
      if (!versionRegex.test(targetVersion)) {
        logger.warn(`⚠️ Invalid target version format: ${targetVersion}`);
      } else {
        let range = targetVersion;
        if (!semver__default.validRange(targetVersion)) {
          range = `~${targetVersion}`;
        } else if (semver__default.valid(targetVersion)) {
          // A full exact version also accepts anything older.
          range = `${range} || <=${targetVersion}`;
        }
        bestMatch = semver__default.maxSatisfying(versionStrings, range);
      }
    }

    if (bestMatch) {
      logger.info(`✅ Found best match version ${bestMatch} for ${libraryAndVersion}`);
    } else {
      logger.warn(`⚠️ No matching semver version found for ${libraryAndVersion}`);
    }
    if (!bestMatch && !hasUnversioned) {
      throw await buildNotFoundError();
    }
    return { bestMatch, hasUnversioned };
  }

  /**
   * Removes all documents for a specific library and optional version.
   * If version is omitted, removes documents without a specific version.
   */
  async removeAllDocuments(library, version2) {
    const normalizedVersion = this.normalizeVersion(version2);
    logger.info(
      `🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
    );
    const count = await this.store.deleteDocuments(library, normalizedVersion);
    logger.info(`🗑️ Deleted ${count} documents`);
  }

  /**
   * Completely removes a library version and all associated documents.
   * Also removes the library if no other versions remain.
   * @param library Library name
   * @param version2 Version string (null/undefined for unversioned)
   */
  async removeVersion(library, version2) {
    const normalizedVersion = this.normalizeVersion(version2);
    logger.info(`🗑️ Removing version: ${library}@${normalizedVersion || "[no version]"}`);
    const result = await this.store.removeVersion(library, normalizedVersion, true);
    logger.info(
      `🗑️ Removed ${result.documentsDeleted} documents, version: ${result.versionDeleted}, library: ${result.libraryDeleted}`
    );
    if (result.versionDeleted && result.libraryDeleted) {
      logger.info(`✅ Completely removed library ${library} (was last version)`);
    } else if (result.versionDeleted) {
      logger.info(`✅ Removed version ${library}@${normalizedVersion || "[no version]"}`);
    } else {
      logger.warn(
        `⚠️ Version ${library}@${normalizedVersion || "[no version]"} not found`
      );
    }
  }

  /**
   * Adds a document to the store, splitting it into smaller chunks for better search results.
   * Each chunk keeps the document metadata plus the section `level` and `path`
   * reported by the splitter. If version is omitted, the document is added
   * without a specific version.
   *
   * Emits a DOCUMENT_PROCESSED telemetry event on success; on failure the
   * error is reported to analytics (when it is an Error) and rethrown.
   *
   * @throws {StoreError} If `document.metadata.url` is missing or blank.
   * @throws {Error} If the document content is empty, or whatever the
   *   splitter/store throws.
   */
  async addDocument(library, version2, document) {
    const processingStart = performance.now();
    const normalizedVersion = this.normalizeVersion(version2);
    const url = document.metadata.url;
    if (!url || typeof url !== "string" || !url.trim()) {
      throw new StoreError("Document metadata must include a valid URL");
    }
    logger.info(`📚 Adding document: ${document.metadata.title}`);
    if (!document.pageContent.trim()) {
      throw new Error("Document content cannot be empty");
    }
    try {
      const chunks = await this.splitter.splitText(document.pageContent);
      const splitDocs = chunks.map((chunk) => ({
        pageContent: chunk.content,
        metadata: {
          ...document.metadata,
          level: chunk.section.level,
          path: chunk.section.path
        }
      }));
      logger.info(`✂️ Split document into ${splitDocs.length} chunks`);
      await this.store.addDocuments(library, normalizedVersion, splitDocs);
      const processingTime = performance.now() - processingStart;
      const contentLength = document.pageContent.length;
      analytics.track(TelemetryEvent.DOCUMENT_PROCESSED, {
        // Content characteristics (privacy-safe)
        mimeType: document.metadata.mimeType,
        contentSizeBytes: contentLength,
        // Processing metrics
        processingTimeMs: Math.round(processingTime),
        chunksCreated: splitDocs.length,
        // Document characteristics
        hasTitle: !!document.metadata.title,
        hasDescription: !!document.metadata.description,
        urlDomain: extractHostname(url),
        depth: document.metadata.depth,
        // Library context
        library,
        libraryVersion: normalizedVersion || null,
        // Processing efficiency; guarded so a zero chunk count or a zero
        // elapsed time reports 0 instead of Infinity.
        avgChunkSizeBytes:
          splitDocs.length > 0 ? Math.round(contentLength / splitDocs.length) : 0,
        processingSpeedKbPerSec:
          processingTime > 0
            ? Math.round(contentLength / 1024 / (processingTime / 1e3))
            : 0
      });
    } catch (error) {
      const processingTime = performance.now() - processingStart;
      if (error instanceof Error) {
        analytics.captureException(error, {
          mimeType: document.metadata.mimeType,
          contentSizeBytes: document.pageContent.length,
          processingTimeMs: Math.round(processingTime),
          library,
          libraryVersion: normalizedVersion || null,
          context: "document_processing",
          // The class's own name; `Class.constructor.name` would be "Function".
          component: DocumentManagementService.name
        });
      }
      throw error;
    }
  }

  /**
   * Searches for documentation content via the document retriever.
   * If version is omitted, searches documents without a specific version.
   * @param limit Maximum number of results requested (default 5).
   */
  async searchStore(library, version2, query, limit = 5) {
    const normalizedVersion = this.normalizeVersion(version2);
    return this.documentRetriever.search(library, normalizedVersion, query, limit);
  }

  // Deprecated simple listing removed: enriched listLibraries() is canonical

  /**
   * Resolves the version ID for a library/version pair through the store's
   * resolveLibraryAndVersionIds, after lowercasing both names.
   */
  async ensureLibraryAndVersion(library, version2) {
    const normalizedLibrary = library.toLowerCase();
    const normalizedVersion = this.normalizeVersion(version2);
    const { versionId } = await this.store.resolveLibraryAndVersionIds(
      normalizedLibrary,
      normalizedVersion
    );
    return versionId;
  }
}
|
|
5453
8221
|
/**
 * Creates and initializes a document management backend.
 *
 * When `options.serverUrl` is truthy a `DocumentManagementClient` for that URL
 * is used; otherwise a local `DocumentManagementService` is built from
 * `options.embeddingConfig`.
 *
 * @param options Optional `{ serverUrl, embeddingConfig }`.
 * @returns The backend, after its `initialize()` has resolved.
 */
async function createDocumentManagement(options = {}) {
  const { serverUrl, embeddingConfig } = options;
  const backend = serverUrl
    ? new DocumentManagementClient(serverUrl)
    : new DocumentManagementService(embeddingConfig);
  await backend.initialize();
  return backend;
}
|
|
5464
|
-
/**
 * Creates a local `DocumentManagementService` and initializes it.
 *
 * @param embeddingConfig Passed unchanged to the service constructor.
 * @returns The service, after its `initialize()` has resolved.
 */
async function createLocalDocumentManagement(embeddingConfig) {
  const localService = new DocumentManagementService(embeddingConfig);
  await localService.initialize();
  return localService;
}
|
|
@@ -6992,6 +9759,17 @@ function warnHttpUsage(authConfig, port) {
|
|
|
6992
9759
|
);
|
|
6993
9760
|
}
|
|
6994
9761
|
}
|
|
9762
|
+
/**
 * Resolves the embedding configuration from CLI arguments or the environment.
 *
 * The model spec is `cliArgs.embeddingModel` when truthy, otherwise the
 * `DOCS_MCP_EMBEDDING_MODEL` environment variable, and is handed to
 * `EmbeddingConfig.parseEmbeddingConfig`.
 *
 * @param cliArgs Optional parsed CLI arguments.
 * @returns The parsed configuration, or `null` if resolving it threw (the
 *   failure is only logged at debug level).
 */
function resolveEmbeddingContext(cliArgs) {
  let resolved = null;
  try {
    const modelSpec = cliArgs?.embeddingModel || process.env.DOCS_MCP_EMBEDDING_MODEL;
    logger.debug("Resolving embedding configuration");
    resolved = EmbeddingConfig.parseEmbeddingConfig(modelSpec);
  } catch (error) {
    logger.debug(`Failed to resolve embedding configuration: ${error}`);
    resolved = null;
  }
  return resolved;
}
|
|
6995
9773
|
function createDefaultAction(program) {
|
|
6996
9774
|
return program.addOption(
|
|
6997
9775
|
new Option("--protocol <protocol>", "Protocol for MCP server").choices(["auto", "stdio", "http"]).default("auto")
|
|
@@ -7031,7 +9809,8 @@ function createDefaultAction(program) {
|
|
|
7031
9809
|
warnHttpUsage(authConfig, port);
|
|
7032
9810
|
}
|
|
7033
9811
|
ensurePlaywrightBrowsersInstalled();
|
|
7034
|
-
const
|
|
9812
|
+
const embeddingConfig = resolveEmbeddingContext();
|
|
9813
|
+
const docService = await createLocalDocumentManagement(embeddingConfig);
|
|
7035
9814
|
const pipelineOptions = {
|
|
7036
9815
|
recoverJobs: options.resume || false,
|
|
7037
9816
|
// Use --resume flag for job recovery
|
|
@@ -7072,24 +9851,12 @@ async function fetchUrlAction(url, options, command) {
|
|
|
7072
9851
|
setupLogging(globalOptions);
|
|
7073
9852
|
const headers = parseHeaders(options.header);
|
|
7074
9853
|
const fetchUrlTool = new FetchUrlTool(new HttpFetcher(), new FileFetcher());
|
|
7075
|
-
const content = await
|
|
7076
|
-
|
|
7077
|
-
|
|
7078
|
-
|
|
7079
|
-
|
|
7080
|
-
|
|
7081
|
-
headers: Object.keys(headers).length > 0 ? headers : void 0
|
|
7082
|
-
}),
|
|
7083
|
-
(content2) => ({
|
|
7084
|
-
url_protocol: extractProtocol(url),
|
|
7085
|
-
// Safe: only protocol, not full URL
|
|
7086
|
-
follow_redirects: options.followRedirects,
|
|
7087
|
-
scrape_mode: options.scrapeMode,
|
|
7088
|
-
has_custom_headers: Object.keys(headers).length > 0,
|
|
7089
|
-
content_length: content2.length,
|
|
7090
|
-
cli_flags: extractCliFlags(process.argv)
|
|
7091
|
-
})
|
|
7092
|
-
);
|
|
9854
|
+
const content = await fetchUrlTool.execute({
|
|
9855
|
+
url,
|
|
9856
|
+
followRedirects: options.followRedirects,
|
|
9857
|
+
scrapeMode: options.scrapeMode,
|
|
9858
|
+
headers: Object.keys(headers).length > 0 ? headers : void 0
|
|
9859
|
+
});
|
|
7093
9860
|
console.log(content);
|
|
7094
9861
|
}
|
|
7095
9862
|
function createFetchUrlCommand(program) {
|
|
@@ -7121,25 +9888,16 @@ async function findVersionAction(library, options, command) {
|
|
|
7121
9888
|
const globalOptions = command.parent?.opts() || {};
|
|
7122
9889
|
setupLogging(globalOptions);
|
|
7123
9890
|
const serverUrl = options.serverUrl;
|
|
7124
|
-
const docService = await createDocumentManagement({
|
|
9891
|
+
const docService = await createDocumentManagement({
|
|
9892
|
+
serverUrl,
|
|
9893
|
+
embeddingConfig: serverUrl ? void 0 : null
|
|
9894
|
+
});
|
|
7125
9895
|
try {
|
|
7126
9896
|
const findVersionTool = new FindVersionTool(docService);
|
|
7127
|
-
const versionInfo = await
|
|
7128
|
-
|
|
7129
|
-
|
|
7130
|
-
|
|
7131
|
-
targetVersion: options.version
|
|
7132
|
-
}),
|
|
7133
|
-
(versionInfo2) => ({
|
|
7134
|
-
library,
|
|
7135
|
-
// Safe: library names are public
|
|
7136
|
-
has_target_version: !!options.version,
|
|
7137
|
-
result_type: typeof versionInfo2,
|
|
7138
|
-
// 'string'
|
|
7139
|
-
using_remote_server: !!serverUrl,
|
|
7140
|
-
cli_flags: extractCliFlags(process.argv)
|
|
7141
|
-
})
|
|
7142
|
-
);
|
|
9897
|
+
const versionInfo = await findVersionTool.execute({
|
|
9898
|
+
library,
|
|
9899
|
+
targetVersion: options.version
|
|
9900
|
+
});
|
|
7143
9901
|
if (!versionInfo) throw new Error("Failed to get version information");
|
|
7144
9902
|
console.log(versionInfo);
|
|
7145
9903
|
} finally {
|
|
@@ -7156,18 +9914,13 @@ async function listAction(options, command) {
|
|
|
7156
9914
|
const globalOptions = command.parent?.opts() || {};
|
|
7157
9915
|
setupLogging(globalOptions);
|
|
7158
9916
|
const { serverUrl } = options;
|
|
7159
|
-
const docService = await createDocumentManagement({
|
|
9917
|
+
const docService = await createDocumentManagement({
|
|
9918
|
+
serverUrl,
|
|
9919
|
+
embeddingConfig: serverUrl ? void 0 : null
|
|
9920
|
+
});
|
|
7160
9921
|
try {
|
|
7161
9922
|
const listLibrariesTool = new ListLibrariesTool(docService);
|
|
7162
|
-
const result = await
|
|
7163
|
-
"list_libraries",
|
|
7164
|
-
() => listLibrariesTool.execute(),
|
|
7165
|
-
(result2) => ({
|
|
7166
|
-
library_count: result2.libraries.length,
|
|
7167
|
-
using_remote_server: !!serverUrl,
|
|
7168
|
-
cli_flags: extractCliFlags(process.argv)
|
|
7169
|
-
})
|
|
7170
|
-
);
|
|
9923
|
+
const result = await listLibrariesTool.execute();
|
|
7171
9924
|
console.log(formatOutput(result.libraries));
|
|
7172
9925
|
} finally {
|
|
7173
9926
|
await docService.shutdown();
|
|
@@ -7220,8 +9973,16 @@ function createMcpCommand(program) {
|
|
|
7220
9973
|
validateAuthConfig(authConfig);
|
|
7221
9974
|
}
|
|
7222
9975
|
try {
|
|
9976
|
+
const embeddingConfig = resolveEmbeddingContext();
|
|
9977
|
+
if (!serverUrl && !embeddingConfig) {
|
|
9978
|
+
logger.error(
|
|
9979
|
+
"❌ Embedding configuration is required for local mode. Configure an embedding provider with CLI options or environment variables."
|
|
9980
|
+
);
|
|
9981
|
+
process.exit(1);
|
|
9982
|
+
}
|
|
7223
9983
|
const docService = await createDocumentManagement({
|
|
7224
|
-
serverUrl
|
|
9984
|
+
serverUrl,
|
|
9985
|
+
embeddingConfig
|
|
7225
9986
|
});
|
|
7226
9987
|
const pipelineOptions = {
|
|
7227
9988
|
recoverJobs: false,
|
|
@@ -7271,20 +10032,13 @@ async function removeAction(library, options, command) {
|
|
|
7271
10032
|
const globalOptions = command.parent?.opts() || {};
|
|
7272
10033
|
setupLogging(globalOptions);
|
|
7273
10034
|
const serverUrl = options.serverUrl;
|
|
7274
|
-
const docService = await createDocumentManagement({
|
|
10035
|
+
const docService = await createDocumentManagement({
|
|
10036
|
+
serverUrl,
|
|
10037
|
+
embeddingConfig: serverUrl ? void 0 : null
|
|
10038
|
+
});
|
|
7275
10039
|
const { version: version2 } = options;
|
|
7276
10040
|
try {
|
|
7277
|
-
await
|
|
7278
|
-
"remove_documents",
|
|
7279
|
-
() => docService.removeAllDocuments(library, version2),
|
|
7280
|
-
() => ({
|
|
7281
|
-
library,
|
|
7282
|
-
// Safe: library names are public
|
|
7283
|
-
has_version: !!version2,
|
|
7284
|
-
using_remote_server: !!serverUrl,
|
|
7285
|
-
cli_flags: extractCliFlags(process.argv)
|
|
7286
|
-
})
|
|
7287
|
-
);
|
|
10041
|
+
await docService.removeAllDocuments(library, version2);
|
|
7288
10042
|
console.log(`✅ Successfully removed ${library}${version2 ? `@${version2}` : ""}.`);
|
|
7289
10043
|
} catch (error) {
|
|
7290
10044
|
console.error(
|
|
@@ -7309,7 +10063,16 @@ async function scrapeAction(library, url, options, command) {
|
|
|
7309
10063
|
const globalOptions = command.parent?.opts() || {};
|
|
7310
10064
|
setupLogging(globalOptions);
|
|
7311
10065
|
const serverUrl = options.serverUrl;
|
|
7312
|
-
const
|
|
10066
|
+
const embeddingConfig = resolveEmbeddingContext();
|
|
10067
|
+
if (!serverUrl && !embeddingConfig) {
|
|
10068
|
+
throw new Error(
|
|
10069
|
+
"Embedding configuration is required for local scraping. Please set DOCS_MCP_EMBEDDING_MODEL environment variable or use --server-url for remote execution."
|
|
10070
|
+
);
|
|
10071
|
+
}
|
|
10072
|
+
const docService = await createDocumentManagement({
|
|
10073
|
+
serverUrl,
|
|
10074
|
+
embeddingConfig
|
|
10075
|
+
});
|
|
7313
10076
|
let pipeline = null;
|
|
7314
10077
|
try {
|
|
7315
10078
|
const pipelineOptions = {
|
|
@@ -7324,48 +10087,23 @@ async function scrapeAction(library, url, options, command) {
|
|
|
7324
10087
|
await pipeline.start();
|
|
7325
10088
|
const scrapeTool = new ScrapeTool(pipeline);
|
|
7326
10089
|
const headers = parseHeaders(options.header);
|
|
7327
|
-
const result = await
|
|
7328
|
-
|
|
7329
|
-
|
|
7330
|
-
|
|
7331
|
-
|
|
7332
|
-
|
|
7333
|
-
|
|
7334
|
-
|
|
7335
|
-
|
|
7336
|
-
maxConcurrency: Number.parseInt(options.maxConcurrency, 10),
|
|
7337
|
-
ignoreErrors: options.ignoreErrors,
|
|
7338
|
-
scope: options.scope,
|
|
7339
|
-
followRedirects: options.followRedirects,
|
|
7340
|
-
scrapeMode: options.scrapeMode,
|
|
7341
|
-
includePatterns: Array.isArray(options.includePattern) && options.includePattern.length > 0 ? options.includePattern : void 0,
|
|
7342
|
-
excludePatterns: Array.isArray(options.excludePattern) && options.excludePattern.length > 0 ? options.excludePattern : void 0,
|
|
7343
|
-
headers: Object.keys(headers).length > 0 ? headers : void 0
|
|
7344
|
-
}
|
|
7345
|
-
}),
|
|
7346
|
-
(result2) => ({
|
|
7347
|
-
library,
|
|
7348
|
-
// Safe: library names are public
|
|
7349
|
-
url_protocol: extractProtocol(url),
|
|
7350
|
-
// Safe: only protocol, not full URL
|
|
7351
|
-
max_pages: Number.parseInt(options.maxPages, 10),
|
|
7352
|
-
max_depth: Number.parseInt(options.maxDepth, 10),
|
|
7353
|
-
max_concurrency: Number.parseInt(options.maxConcurrency, 10),
|
|
7354
|
-
has_version: !!options.version,
|
|
10090
|
+
const result = await scrapeTool.execute({
|
|
10091
|
+
url,
|
|
10092
|
+
library,
|
|
10093
|
+
version: options.version,
|
|
10094
|
+
options: {
|
|
10095
|
+
maxPages: Number.parseInt(options.maxPages, 10),
|
|
10096
|
+
maxDepth: Number.parseInt(options.maxDepth, 10),
|
|
10097
|
+
maxConcurrency: Number.parseInt(options.maxConcurrency, 10),
|
|
10098
|
+
ignoreErrors: options.ignoreErrors,
|
|
7355
10099
|
scope: options.scope,
|
|
7356
|
-
|
|
7357
|
-
|
|
7358
|
-
|
|
7359
|
-
|
|
7360
|
-
|
|
7361
|
-
|
|
7362
|
-
|
|
7363
|
-
cli_flags: extractCliFlags(process.argv),
|
|
7364
|
-
is_async_job: !("pagesScraped" in result2),
|
|
7365
|
-
// Pipeline mode vs direct mode
|
|
7366
|
-
pages_scraped: "pagesScraped" in result2 ? result2.pagesScraped : void 0
|
|
7367
|
-
})
|
|
7368
|
-
);
|
|
10100
|
+
followRedirects: options.followRedirects,
|
|
10101
|
+
scrapeMode: options.scrapeMode,
|
|
10102
|
+
includePatterns: Array.isArray(options.includePattern) && options.includePattern.length > 0 ? options.includePattern : void 0,
|
|
10103
|
+
excludePatterns: Array.isArray(options.excludePattern) && options.excludePattern.length > 0 ? options.excludePattern : void 0,
|
|
10104
|
+
headers: Object.keys(headers).length > 0 ? headers : void 0
|
|
10105
|
+
}
|
|
10106
|
+
});
|
|
7369
10107
|
if ("pagesScraped" in result) {
|
|
7370
10108
|
console.log(`✅ Successfully scraped ${result.pagesScraped} pages`);
|
|
7371
10109
|
} else {
|
|
@@ -7444,31 +10182,25 @@ async function searchAction(library, query, options, command) {
|
|
|
7444
10182
|
const globalOptions = command.parent?.opts() || {};
|
|
7445
10183
|
setupLogging(globalOptions);
|
|
7446
10184
|
const serverUrl = options.serverUrl;
|
|
7447
|
-
const
|
|
10185
|
+
const embeddingConfig = resolveEmbeddingContext();
|
|
10186
|
+
if (!serverUrl && !embeddingConfig) {
|
|
10187
|
+
throw new Error(
|
|
10188
|
+
"Embedding configuration is required for local search. Please set DOCS_MCP_EMBEDDING_MODEL environment variable or use --server-url for remote execution."
|
|
10189
|
+
);
|
|
10190
|
+
}
|
|
10191
|
+
const docService = await createDocumentManagement({
|
|
10192
|
+
serverUrl,
|
|
10193
|
+
embeddingConfig
|
|
10194
|
+
});
|
|
7448
10195
|
try {
|
|
7449
10196
|
const searchTool = new SearchTool(docService);
|
|
7450
|
-
const result = await
|
|
7451
|
-
|
|
7452
|
-
|
|
7453
|
-
|
|
7454
|
-
|
|
7455
|
-
|
|
7456
|
-
|
|
7457
|
-
exactMatch: options.exactMatch
|
|
7458
|
-
}),
|
|
7459
|
-
(result2) => ({
|
|
7460
|
-
library,
|
|
7461
|
-
// Safe: library names are public
|
|
7462
|
-
query_analysis: analyzeSearchQuery(query),
|
|
7463
|
-
// Analyzed, not raw query
|
|
7464
|
-
result_count: result2.results.length,
|
|
7465
|
-
limit_used: Number.parseInt(options.limit, 10),
|
|
7466
|
-
has_version_filter: !!options.version,
|
|
7467
|
-
exact_match: options.exactMatch,
|
|
7468
|
-
using_remote_server: !!serverUrl,
|
|
7469
|
-
cli_flags: extractCliFlags(process.argv)
|
|
7470
|
-
})
|
|
7471
|
-
);
|
|
10197
|
+
const result = await searchTool.execute({
|
|
10198
|
+
library,
|
|
10199
|
+
version: options.version,
|
|
10200
|
+
query,
|
|
10201
|
+
limit: Number.parseInt(options.limit, 10),
|
|
10202
|
+
exactMatch: options.exactMatch
|
|
10203
|
+
});
|
|
7472
10204
|
console.log(formatOutput(result.results));
|
|
7473
10205
|
} finally {
|
|
7474
10206
|
await docService.shutdown();
|
|
@@ -7504,8 +10236,16 @@ function createWebCommand(program) {
|
|
|
7504
10236
|
const serverUrl = cmdOptions.serverUrl;
|
|
7505
10237
|
setupLogging(globalOptions);
|
|
7506
10238
|
try {
|
|
10239
|
+
const embeddingConfig = resolveEmbeddingContext();
|
|
10240
|
+
if (!serverUrl && !embeddingConfig) {
|
|
10241
|
+
logger.error(
|
|
10242
|
+
"❌ Embedding configuration is required for local mode. Configure an embedding provider with CLI options or environment variables."
|
|
10243
|
+
);
|
|
10244
|
+
process.exit(1);
|
|
10245
|
+
}
|
|
7507
10246
|
const docService = await createDocumentManagement({
|
|
7508
|
-
serverUrl
|
|
10247
|
+
serverUrl,
|
|
10248
|
+
embeddingConfig
|
|
7509
10249
|
});
|
|
7510
10250
|
const pipelineOptions = {
|
|
7511
10251
|
recoverJobs: false,
|
|
@@ -7554,7 +10294,8 @@ function createWorkerCommand(program) {
|
|
|
7554
10294
|
try {
|
|
7555
10295
|
logger.info(`🚀 Starting external pipeline worker on port ${port}`);
|
|
7556
10296
|
ensurePlaywrightBrowsersInstalled();
|
|
7557
|
-
const
|
|
10297
|
+
const embeddingConfig = resolveEmbeddingContext();
|
|
10298
|
+
const docService = await createLocalDocumentManagement(embeddingConfig);
|
|
7558
10299
|
const pipelineOptions = {
|
|
7559
10300
|
recoverJobs: cmdOptions.resume,
|
|
7560
10301
|
// Use the resume option
|
|
@@ -7582,7 +10323,7 @@ function createCliProgram() {
|
|
|
7582
10323
|
program.name("docs-mcp-server").description("Unified CLI, MCP Server, and Web Interface for Docs MCP Server.").version(packageJson.version).addOption(
|
|
7583
10324
|
new Option("--verbose", "Enable verbose (debug) logging").conflicts("silent")
|
|
7584
10325
|
).addOption(new Option("--silent", "Disable all logging except errors")).addOption(new Option("--no-telemetry", "Disable telemetry collection")).enablePositionalOptions().allowExcessArguments(false).showHelpAfterError(true);
|
|
7585
|
-
program.hook("preAction", (thisCommand, actionCommand) => {
|
|
10326
|
+
program.hook("preAction", async (thisCommand, actionCommand) => {
|
|
7586
10327
|
const globalOptions = thisCommand.opts();
|
|
7587
10328
|
if (globalOptions.silent) setLogLevel(LogLevel.ERROR);
|
|
7588
10329
|
else if (globalOptions.verbose) setLogLevel(LogLevel.DEBUG);
|
|
@@ -7673,7 +10414,11 @@ async function runCli() {
|
|
|
7673
10414
|
});
|
|
7674
10415
|
await program.parseAsync(process.argv);
|
|
7675
10416
|
} catch (error) {
|
|
7676
|
-
|
|
10417
|
+
if (error instanceof ModelConfigurationError || error instanceof UnsupportedProviderError) {
|
|
10418
|
+
logger.error(error.message);
|
|
10419
|
+
} else {
|
|
10420
|
+
logger.error(`❌ Error in CLI: ${error}`);
|
|
10421
|
+
}
|
|
7677
10422
|
if (!isShuttingDown) {
|
|
7678
10423
|
isShuttingDown = true;
|
|
7679
10424
|
const shutdownPromises = [];
|
|
@@ -7723,27 +10468,4 @@ runCli().catch((error) => {
|
|
|
7723
10468
|
console.error(`🔥 Fatal error in main execution: ${error}`);
|
|
7724
10469
|
process.exit(1);
|
|
7725
10470
|
});
|
|
7726
|
-
export {
|
|
7727
|
-
ConnectionError as C,
|
|
7728
|
-
DimensionError as D,
|
|
7729
|
-
EMBEDDING_BATCH_CHARS as E,
|
|
7730
|
-
LibraryNotFoundError as L,
|
|
7731
|
-
StoreError as S,
|
|
7732
|
-
TelemetryEvent as T,
|
|
7733
|
-
VECTOR_DIMENSION as V,
|
|
7734
|
-
applyMigrations as a,
|
|
7735
|
-
EMBEDDING_BATCH_SIZE as b,
|
|
7736
|
-
createJSDOM as c,
|
|
7737
|
-
denormalizeVersionName as d,
|
|
7738
|
-
SPLITTER_PREFERRED_CHUNK_SIZE as e,
|
|
7739
|
-
SPLITTER_MAX_CHUNK_SIZE as f,
|
|
7740
|
-
getProjectRoot as g,
|
|
7741
|
-
VersionNotFoundError as h,
|
|
7742
|
-
analytics as i,
|
|
7743
|
-
extractHostname as j,
|
|
7744
|
-
SPLITTER_MIN_CHUNK_SIZE as k,
|
|
7745
|
-
logger as l,
|
|
7746
|
-
mapDbDocumentToDocument as m,
|
|
7747
|
-
normalizeVersionName as n
|
|
7748
|
-
};
|
|
7749
10471
|
//# sourceMappingURL=index.js.map
|