@arabold/docs-mcp-server 1.21.0 → 1.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/index.js +3572 -849
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/DocumentManagementClient-CAFdDwTu.js +0 -57
- package/dist/DocumentManagementClient-CAFdDwTu.js.map +0 -1
- package/dist/DocumentManagementService-C1xAzouZ.js +0 -2026
- package/dist/DocumentManagementService-C1xAzouZ.js.map +0 -1
- package/dist/EmbeddingFactory-CElwVk3X.js +0 -176
- package/dist/EmbeddingFactory-CElwVk3X.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,4 +1,10 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
1
2
|
import "dotenv/config";
|
|
3
|
+
import { BedrockEmbeddings } from "@langchain/aws";
|
|
4
|
+
import { GoogleGenerativeAIEmbeddings } from "@langchain/google-genai";
|
|
5
|
+
import { VertexAIEmbeddings } from "@langchain/google-vertexai";
|
|
6
|
+
import { AzureOpenAIEmbeddings, OpenAIEmbeddings } from "@langchain/openai";
|
|
7
|
+
import { Embeddings } from "@langchain/core/embeddings";
|
|
2
8
|
import { PostHog } from "posthog-node";
|
|
3
9
|
import crypto, { randomUUID } from "node:crypto";
|
|
4
10
|
import fs, { readFileSync, existsSync } from "node:fs";
|
|
@@ -39,14 +45,277 @@ import DOMPurify from "dompurify";
|
|
|
39
45
|
import { fileURLToPath, URL as URL$1 } from "node:url";
|
|
40
46
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
41
47
|
import { createTRPCProxyClient, httpBatchLink } from "@trpc/client";
|
|
42
|
-
import "fuse.js";
|
|
43
|
-
import "langchain/text_splitter";
|
|
44
|
-
import "better-sqlite3";
|
|
45
|
-
import "sqlite-vec";
|
|
48
|
+
import Fuse from "fuse.js";
|
|
49
|
+
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
|
|
50
|
+
import Database from "better-sqlite3";
|
|
51
|
+
import * as sqliteVec from "sqlite-vec";
|
|
46
52
|
import { execSync } from "node:child_process";
|
|
47
53
|
import { v4 } from "uuid";
|
|
48
54
|
import psl from "psl";
|
|
49
55
|
import { minimatch } from "minimatch";
|
|
56
|
+
// Vector length passed to FixedDimensionEmbeddings as its target dimension
// (see the "gemini" branch of createEmbeddingModel).
const VECTOR_DIMENSION = 1536;
|
|
57
|
+
/**
 * Converts a stored document row into the in-memory document shape.
 *
 * @param {{id: *, content: *, metadata: string}} doc - Row whose `metadata` field holds JSON text.
 * @returns {{id: *, pageContent: *, metadata: *}} The row's id, its content as `pageContent`,
 *   and the parsed metadata.
 * @throws {SyntaxError} If `doc.metadata` is not valid JSON.
 */
function mapDbDocumentToDocument(doc) {
  const { id, content, metadata: serializedMetadata } = doc;
  return {
    id,
    pageContent: content,
    metadata: JSON.parse(serializedMetadata)
  };
}
|
|
64
|
+
/**
 * Enum-like map of version indexing states: upper-case member name to the
 * lower-case string used as its value.
 *
 * Declared with `var` on purpose: the initializer reads `VersionStatus` itself
 * (`VersionStatus || {}`), which would throw for a `let`/`const` binding.
 */
var VersionStatus = /* @__PURE__ */ ((members) =>
  Object.assign(members, {
    NOT_INDEXED: "not_indexed",
    QUEUED: "queued",
    RUNNING: "running",
    COMPLETED: "completed",
    FAILED: "failed",
    CANCELLED: "cancelled",
    UPDATING: "updating"
  }))(VersionStatus || {});
|
|
74
|
+
/**
 * Replaces a missing version name with the empty string.
 *
 * @param {string|null|undefined} name - Version name, possibly absent.
 * @returns {string} `name` unchanged, or "" when it is null or undefined.
 */
function normalizeVersionName(name) {
  if (name === null || name === undefined) {
    return "";
  }
  return name;
}
|
|
77
|
+
/**
 * Returns the version name as given.
 *
 * The empty string maps to the empty string and every other value maps to
 * itself, so this is an identity function.
 *
 * @param {string} name - Version name.
 * @returns {string} The same value that was passed in.
 */
function denormalizeVersionName(name) {
  return name;
}
|
|
80
|
+
/**
 * Returns the human-readable description for a version status value.
 *
 * @param {string} status - One of the VersionStatus string values
 *   ("not_indexed", "queued", "running", "completed", "failed", "cancelled", "updating").
 * @returns {string} The matching description, or "Unknown status" for any other input.
 */
function getStatusDescription(status) {
  // The table has no prototype, so names inherited from Object.prototype
  // ("constructor", "toString", ...) are not mistaken for known statuses.
  const descriptions = Object.assign(Object.create(null), {
    not_indexed: "Version created but not yet indexed",
    queued: "Waiting in queue for indexing",
    running: "Currently being indexed",
    completed: "Successfully indexed",
    failed: "Indexing failed",
    cancelled: "Indexing was cancelled",
    updating: "Re-indexing in progress"
  });
  return descriptions[status] ?? "Unknown status";
}
|
|
113
|
+
/**
 * Tells whether a status value is one of the in-progress states.
 *
 * @param {string} status - Status value to test.
 * @returns {boolean} True for "queued", "running" or "updating"; false otherwise.
 */
function isActiveStatus(status) {
  switch (status) {
    case "queued":
    case "running":
    case "updating":
      return true;
    default:
      return false;
  }
}
|
|
123
|
+
/**
 * Base error for store failures, optionally wrapping an underlying cause.
 *
 * When a cause is given, its text is appended to the message and its stack
 * trace replaces this error's own stack.
 */
class StoreError extends Error {
  /**
   * @param {string} message - Description of the failure.
   * @param {*} [cause] - Underlying error or value. Falsy values are treated as "no cause".
   */
  constructor(message, cause) {
    const fullMessage = cause ? `${message} caused by ${cause}` : message;
    super(fullMessage);
    this.cause = cause;
    // Use the runtime class name so subclasses report their own name.
    this.name = this.constructor.name;

    // Non-Error causes are wrapped in a fresh Error purely to obtain a stack.
    let stackSource;
    if (cause instanceof Error) {
      stackSource = cause;
    } else if (cause) {
      stackSource = new Error(String(cause));
    }
    if (stackSource?.stack) {
      this.stack = stackSource.stack;
    }
  }
}
|
|
134
|
+
/**
 * StoreError describing an embedding model whose vectors are larger than the
 * dimension the database was created with.
 */
class DimensionError extends StoreError {
  /**
   * @param {string} modelName - Model identifier shown in the message.
   * @param {number} modelDimension - Vector length produced by the model.
   * @param {number} dbDimension - Vector length the database accepts.
   */
  constructor(modelName, modelDimension, dbDimension) {
    const explanation = `Model "${modelName}" produces ${modelDimension}-dimensional vectors, which exceeds the database's fixed dimension of ${dbDimension}. Please use a model with dimension ≤ ${dbDimension}.`;
    super(explanation);
    // Keep the raw values on the instance for programmatic inspection.
    Object.assign(this, { modelName, modelDimension, dbDimension });
  }
}
|
|
144
|
+
/**
 * StoreError subclass that adds no fields or behavior of its own.
 * NOTE(review): presumably signals connection failures — confirm at the throw sites.
 */
class ConnectionError extends StoreError {
}
|
|
146
|
+
/**
 * Embeddings wrapper that forces every vector returned by an inner embeddings
 * object to a fixed length: shorter vectors are zero-padded, longer vectors
 * are either truncated (when allowed) or rejected with a DimensionError.
 */
class FixedDimensionEmbeddings extends Embeddings {
  /**
   * @param {{embedQuery: Function, embedDocuments: Function}} embeddings - Inner embeddings
   *   object whose results are normalized.
   * @param {number} targetDimension - Length every returned vector must have.
   * @param {string} providerAndModel - "provider:model", or a bare model name, in which
   *   case the provider is taken to be "openai". Used for error reporting.
   * @param {boolean} [allowTruncate=false] - Whether oversized vectors may be cut down to
   *   `targetDimension` instead of raising an error.
   */
  constructor(embeddings, targetDimension, providerAndModel, allowTruncate = false) {
    super({});
    this.embeddings = embeddings;
    this.targetDimension = targetDimension;
    this.allowTruncate = allowTruncate;
    // Split on the first colon only: model ids may themselves contain colons
    // (e.g. "aws:amazon.titan-embed-text-v2:0"), matching createEmbeddingModel.
    const [providerOrModel, ...modelNameParts] = providerAndModel.split(":");
    const modelName = modelNameParts.join(":");
    this.provider = modelName ? providerOrModel : "openai";
    this.model = modelName || providerOrModel;
  }
  provider;
  model;
  /**
   * Normalize a vector to the target dimension by truncating (for MRL models) or padding.
   *
   * @param {number[]} vector - Vector produced by the inner embeddings object.
   * @returns {number[]} A vector of exactly `targetDimension` entries. The input array is
   *   returned as-is when it already has that length.
   * @throws {DimensionError} If the vector is too large and truncation is not allowed.
   */
  normalizeVector(vector) {
    const dimension = vector.length;
    if (dimension > this.targetDimension) {
      if (this.allowTruncate) {
        return vector.slice(0, this.targetDimension);
      }
      throw new DimensionError(
        `${this.provider}:${this.model}`,
        dimension,
        this.targetDimension
      );
    }
    if (dimension < this.targetDimension) {
      // Pad the tail with zeros up to the target length.
      return [...vector, ...new Array(this.targetDimension - dimension).fill(0)];
    }
    return vector;
  }
  /**
   * Embeds a single text and normalizes the resulting vector.
   *
   * @param {string} text - Text to embed.
   * @returns {Promise<number[]>} Vector of `targetDimension` entries.
   */
  async embedQuery(text) {
    const vector = await this.embeddings.embedQuery(text);
    return this.normalizeVector(vector);
  }
  /**
   * Embeds several texts and normalizes every resulting vector.
   *
   * @param {string[]} documents - Texts to embed.
   * @returns {Promise<number[][]>} One normalized vector per vector returned by the inner object.
   */
  async embedDocuments(documents) {
    const vectors = await this.embeddings.embedDocuments(documents);
    return vectors.map((vector) => this.normalizeVector(vector));
  }
}
|
|
188
|
+
/**
 * Error raised for an embedding provider name that is not recognized
 * (createEmbeddingModel throws it from its default branch).
 * The message names the rejected provider and lists the supported ones.
 */
class UnsupportedProviderError extends Error {
  /**
   * @param {string} provider - The provider name that was not recognized.
   */
  constructor(provider) {
    super(
      `❌ Unsupported embedding provider: ${provider}
Supported providers: openai, vertex, gemini, aws, microsoft
See README.md for configuration options or run with --help for more details.`
    );
    this.name = "UnsupportedProviderError";
  }
}
|
|
198
|
+
/**
 * Error raised when the configuration needed by an embedding provider
 * (for example a required environment variable) is missing.
 */
class ModelConfigurationError extends Error {
  name = "ModelConfigurationError";

  /**
   * @param {string} message - Explanation of what is missing or misconfigured.
   */
  constructor(message) {
    super(message);
  }
}
|
|
204
|
+
/**
 * Creates an embeddings client for a "provider:model" specification.
 *
 * The text before the first colon selects the provider; everything after it is
 * the model name (which may itself contain colons). A specification without a
 * colon is treated as an OpenAI model name.
 *
 * Required environment variables per provider:
 * - openai: OPENAI_API_KEY (OPENAI_API_BASE optionally overrides the base URL)
 * - vertex: GOOGLE_APPLICATION_CREDENTIALS
 * - gemini: GOOGLE_API_KEY
 * - aws: BEDROCK_AWS_REGION or AWS_REGION, plus either AWS_PROFILE or both
 *   AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY (AWS_SESSION_TOKEN is passed along if set)
 * - microsoft: AZURE_OPENAI_API_KEY, AZURE_OPENAI_API_INSTANCE_NAME,
 *   AZURE_OPENAI_API_DEPLOYMENT_NAME, AZURE_OPENAI_API_VERSION
 *
 * @param {string} providerAndModel - e.g. "openai:text-embedding-3-small" or "text-embedding-3-small".
 * @returns {object} The provider's embeddings instance. Gemini models are wrapped in
 *   FixedDimensionEmbeddings with truncation enabled.
 * @throws {ModelConfigurationError} If a required environment variable is missing.
 * @throws {UnsupportedProviderError} If the provider is not one of the five listed above.
 */
function createEmbeddingModel(providerAndModel) {
  const [providerOrModel, ...modelNameParts] = providerAndModel.split(":");
  const modelName = modelNameParts.join(":");
  const provider = modelName ? providerOrModel : "openai";
  const model = modelName || providerOrModel;
  const baseConfig = { stripNewLines: true };
  switch (provider) {
    case "openai": {
      if (!process.env.OPENAI_API_KEY) {
        throw new ModelConfigurationError(
          "❌ Missing API key for embedding provider\n Please set OPENAI_API_KEY or configure an alternative embedding model.\n See README.md for configuration options or run with --help for more details."
        );
      }
      const config = {
        ...baseConfig,
        modelName: model,
        batchSize: 512
        // OpenAI supports large batches
      };
      const baseURL = process.env.OPENAI_API_BASE;
      if (baseURL) {
        config.configuration = { baseURL };
      }
      return new OpenAIEmbeddings(config);
    }
    case "vertex": {
      if (!process.env.GOOGLE_APPLICATION_CREDENTIALS) {
        throw new ModelConfigurationError(
          "❌ Missing credentials for Google Cloud Vertex AI\n Please set GOOGLE_APPLICATION_CREDENTIALS or configure an alternative embedding model.\n See README.md for configuration options or run with --help for more details."
        );
      }
      return new VertexAIEmbeddings({
        ...baseConfig,
        model
        // e.g., "text-embedding-004"
      });
    }
    case "gemini": {
      if (!process.env.GOOGLE_API_KEY) {
        throw new ModelConfigurationError(
          "❌ Missing API key for Google AI (Gemini)\n Please set GOOGLE_API_KEY or configure an alternative embedding model.\n See README.md for configuration options or run with --help for more details."
        );
      }
      const baseEmbeddings = new GoogleGenerativeAIEmbeddings({
        ...baseConfig,
        apiKey: process.env.GOOGLE_API_KEY,
        model
        // e.g., "gemini-embedding-exp-03-07"
      });
      // Vectors are forced to VECTOR_DIMENSION entries; oversized ones are truncated.
      return new FixedDimensionEmbeddings(
        baseEmbeddings,
        VECTOR_DIMENSION,
        providerAndModel,
        true
      );
    }
    case "aws": {
      const region = process.env.BEDROCK_AWS_REGION || process.env.AWS_REGION;
      if (!region) {
        throw new ModelConfigurationError(
          "BEDROCK_AWS_REGION or AWS_REGION environment variable is required for AWS Bedrock"
        );
      }
      // A static key pair only counts when BOTH halves are present; a lone
      // access key id or secret is rejected up front, as the message states.
      const hasKeyPair = Boolean(
        process.env.AWS_ACCESS_KEY_ID && process.env.AWS_SECRET_ACCESS_KEY
      );
      if (!process.env.AWS_PROFILE && !hasKeyPair) {
        throw new ModelConfigurationError(
          "Either AWS_PROFILE or both AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables are required for AWS Bedrock"
        );
      }
      const credentials = hasKeyPair ? {
        accessKeyId: process.env.AWS_ACCESS_KEY_ID,
        secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY,
        sessionToken: process.env.AWS_SESSION_TOKEN
      } : void 0;
      return new BedrockEmbeddings({
        ...baseConfig,
        model,
        // e.g., "amazon.titan-embed-text-v1"
        region,
        // Without a key pair, no explicit credentials are passed (AWS_PROFILE case).
        ...credentials ? { credentials } : {}
      });
    }
    case "microsoft": {
      if (!process.env.AZURE_OPENAI_API_KEY) {
        throw new ModelConfigurationError(
          "AZURE_OPENAI_API_KEY environment variable is required for Azure OpenAI"
        );
      }
      if (!process.env.AZURE_OPENAI_API_INSTANCE_NAME) {
        throw new ModelConfigurationError(
          "AZURE_OPENAI_API_INSTANCE_NAME environment variable is required for Azure OpenAI"
        );
      }
      if (!process.env.AZURE_OPENAI_API_DEPLOYMENT_NAME) {
        throw new ModelConfigurationError(
          "AZURE_OPENAI_API_DEPLOYMENT_NAME environment variable is required for Azure OpenAI"
        );
      }
      if (!process.env.AZURE_OPENAI_API_VERSION) {
        throw new ModelConfigurationError(
          "AZURE_OPENAI_API_VERSION environment variable is required for Azure OpenAI"
        );
      }
      return new AzureOpenAIEmbeddings({
        ...baseConfig,
        azureOpenAIApiKey: process.env.AZURE_OPENAI_API_KEY,
        azureOpenAIApiInstanceName: process.env.AZURE_OPENAI_API_INSTANCE_NAME,
        azureOpenAIApiDeploymentName: process.env.AZURE_OPENAI_API_DEPLOYMENT_NAME,
        azureOpenAIApiVersion: process.env.AZURE_OPENAI_API_VERSION,
        deploymentName: model
      });
    }
    default:
      throw new UnsupportedProviderError(provider);
  }
}
|
|
50
319
|
const LogLevel = {
|
|
51
320
|
ERROR: 0,
|
|
52
321
|
WARN: 1,
|
|
@@ -105,6 +374,41 @@ const logger = {
|
|
|
105
374
|
}
|
|
106
375
|
}
|
|
107
376
|
};
|
|
377
|
+
/**
 * Rewrites every ASCII upper-case letter as an underscore followed by its
 * lower-case form, e.g. "sessionId" becomes "session_id".
 *
 * @param {string} str - Identifier to convert.
 * @returns {string} The converted identifier. A leading capital yields a leading underscore.
 */
function camelToSnakeCase(str) {
  const upperCaseLetter = /[A-Z]/g;
  const toUnderscored = (capital) => "_".concat(capital.toLowerCase());
  return str.replace(upperCaseLetter, toUnderscored);
}
|
|
380
|
+
/**
 * Returns a copy of `obj` whose keys are converted with camelToSnakeCase,
 * applied recursively to nested objects and to objects inside arrays.
 *
 * Arrays stay arrays at every nesting depth, Date instances are passed through
 * untouched, and the input object is not modified.
 *
 * @param {Object} obj - Object whose own enumerable string keys are converted.
 * @returns {Object} New object with converted keys.
 */
function convertPropertiesToSnakeCase(obj) {
  // Converts one value. Arrays are mapped element by element so that an array
  // nested inside another array is not turned into an index-keyed object.
  const convertValue = (value) => {
    if (Array.isArray(value)) {
      return value.map((item) => convertValue(item));
    }
    if (value && typeof value === "object" && !(value instanceof Date)) {
      return convertPropertiesToSnakeCase(value);
    }
    return value;
  };

  const result = {};
  for (const [key, value] of Object.entries(obj)) {
    result[camelToSnakeCase(key)] = convertValue(value);
  }
  return result;
}
|
|
396
|
+
/**
 * Returns a shallow copy of `properties` in which the session fields are
 * renamed to `$`-prefixed property names:
 * `sessionId` → `$session_id`, `startTime` → `$start_timestamp` (ISO string),
 * `appVersion` → `$app_version`.
 *
 * A field is renamed only when its value is truthy; otherwise it is left as is.
 * The input object is not modified.
 *
 * @param {Object} properties - Event properties; `startTime`, when present, must
 *   provide `toISOString()`.
 * @returns {Object} Copy of the properties with the renames applied.
 */
function addPostHogStandardProperties(properties) {
  const renames = [
    ["sessionId", "$session_id", (value) => value],
    ["startTime", "$start_timestamp", (value) => value.toISOString()],
    ["appVersion", "$app_version", (value) => value]
  ];
  const result = { ...properties };
  for (const [sourceKey, targetKey, convert] of renames) {
    if (!properties[sourceKey]) {
      continue;
    }
    result[targetKey] = convert(properties[sourceKey]);
    delete result[sourceKey];
  }
  return result;
}
|
|
108
412
|
class PostHogClient {
|
|
109
413
|
client;
|
|
110
414
|
enabled;
|
|
@@ -155,10 +459,12 @@ class PostHogClient {
|
|
|
155
459
|
capture(distinctId, event, properties) {
|
|
156
460
|
if (!this.enabled || !this.client) return;
|
|
157
461
|
try {
|
|
462
|
+
const enhancedProperties = addPostHogStandardProperties(properties);
|
|
463
|
+
const snakeCaseProperties = convertPropertiesToSnakeCase(enhancedProperties);
|
|
158
464
|
this.client.capture({
|
|
159
465
|
distinctId,
|
|
160
466
|
event,
|
|
161
|
-
properties
|
|
467
|
+
properties: snakeCaseProperties
|
|
162
468
|
});
|
|
163
469
|
logger.debug(`PostHog event captured: ${event}`);
|
|
164
470
|
} catch (error) {
|
|
@@ -167,6 +473,26 @@ class PostHogClient {
|
|
|
167
473
|
);
|
|
168
474
|
}
|
|
169
475
|
}
|
|
476
|
+
/**
|
|
477
|
+
* Capture exception using PostHog's native error tracking
|
|
478
|
+
*/
|
|
479
|
+
captureException(distinctId, error, properties) {
|
|
480
|
+
if (!this.enabled || !this.client) return;
|
|
481
|
+
try {
|
|
482
|
+
const enhancedProperties = addPostHogStandardProperties(properties || {});
|
|
483
|
+
const snakeCaseProperties = convertPropertiesToSnakeCase(enhancedProperties);
|
|
484
|
+
this.client.captureException({
|
|
485
|
+
error,
|
|
486
|
+
distinctId,
|
|
487
|
+
properties: snakeCaseProperties
|
|
488
|
+
});
|
|
489
|
+
logger.debug(`PostHog exception captured: ${error.constructor.name}`);
|
|
490
|
+
} catch (captureError) {
|
|
491
|
+
logger.debug(
|
|
492
|
+
`PostHog captureException error: ${captureError instanceof Error ? captureError.message : "Unknown error"}`
|
|
493
|
+
);
|
|
494
|
+
}
|
|
495
|
+
}
|
|
170
496
|
/**
|
|
171
497
|
* Graceful shutdown with event flushing
|
|
172
498
|
*/
|
|
@@ -203,7 +529,7 @@ class SessionTracker {
|
|
|
203
529
|
endSession() {
|
|
204
530
|
if (!this.sessionContext) return null;
|
|
205
531
|
const duration = Date.now() - this.sessionContext.startTime.getTime();
|
|
206
|
-
const sessionInterface = this.sessionContext.
|
|
532
|
+
const sessionInterface = this.sessionContext.appInterface;
|
|
207
533
|
this.sessionContext = void 0;
|
|
208
534
|
return { duration, interface: sessionInterface };
|
|
209
535
|
}
|
|
@@ -213,6 +539,14 @@ class SessionTracker {
|
|
|
213
539
|
getSessionContext() {
|
|
214
540
|
return this.sessionContext;
|
|
215
541
|
}
|
|
542
|
+
/**
|
|
543
|
+
* Update session context with additional fields
|
|
544
|
+
*/
|
|
545
|
+
updateSessionContext(updates) {
|
|
546
|
+
if (this.sessionContext) {
|
|
547
|
+
this.sessionContext = { ...this.sessionContext, ...updates };
|
|
548
|
+
}
|
|
549
|
+
}
|
|
216
550
|
/**
|
|
217
551
|
* Get enriched properties with session context
|
|
218
552
|
*/
|
|
@@ -293,8 +627,6 @@ var TelemetryEvent = /* @__PURE__ */ ((TelemetryEvent2) => {
|
|
|
293
627
|
TelemetryEvent2["PIPELINE_JOB_PROGRESS"] = "pipeline_job_progress";
|
|
294
628
|
TelemetryEvent2["PIPELINE_JOB_COMPLETED"] = "pipeline_job_completed";
|
|
295
629
|
TelemetryEvent2["DOCUMENT_PROCESSED"] = "document_processed";
|
|
296
|
-
TelemetryEvent2["DOCUMENT_PROCESSING_FAILED"] = "document_processing_failed";
|
|
297
|
-
TelemetryEvent2["ERROR_OCCURRED"] = "error_occurred";
|
|
298
630
|
return TelemetryEvent2;
|
|
299
631
|
})(TelemetryEvent || {});
|
|
300
632
|
class Analytics {
|
|
@@ -320,23 +652,34 @@ class Analytics {
|
|
|
320
652
|
if (!this.enabled) return;
|
|
321
653
|
this.sessionTracker.startSession(context);
|
|
322
654
|
this.track("session_started", {
|
|
323
|
-
interface: context.
|
|
324
|
-
version: context.
|
|
325
|
-
platform: context.
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
servicesCount: context.servicesEnabled.length
|
|
655
|
+
interface: context.appInterface,
|
|
656
|
+
version: context.appVersion,
|
|
657
|
+
platform: context.appPlatform,
|
|
658
|
+
authEnabled: context.appAuthEnabled,
|
|
659
|
+
readOnly: context.appReadOnly,
|
|
660
|
+
servicesCount: context.appServicesEnabled.length
|
|
330
661
|
});
|
|
331
662
|
}
|
|
332
663
|
/**
|
|
333
|
-
*
|
|
664
|
+
* Update session context with additional fields (e.g., embedding model info)
|
|
334
665
|
*/
|
|
666
|
+
updateSessionContext(updates) {
|
|
667
|
+
if (!this.enabled) return;
|
|
668
|
+
this.sessionTracker.updateSessionContext(updates);
|
|
669
|
+
}
|
|
335
670
|
track(event, properties = {}) {
|
|
336
671
|
if (!this.enabled) return;
|
|
337
672
|
const eventProperties = this.sessionTracker.getEnrichedProperties(properties);
|
|
338
673
|
this.postHogClient.capture(this.distinctId, event, eventProperties);
|
|
339
674
|
}
|
|
675
|
+
/**
|
|
676
|
+
* Capture exception using PostHog's native error tracking with session context
|
|
677
|
+
*/
|
|
678
|
+
captureException(error, properties = {}) {
|
|
679
|
+
if (!this.enabled) return;
|
|
680
|
+
const eventProperties = this.sessionTracker.getEnrichedProperties(properties);
|
|
681
|
+
this.postHogClient.captureException(this.distinctId, error, eventProperties);
|
|
682
|
+
}
|
|
340
683
|
/**
|
|
341
684
|
* Track session end with duration
|
|
342
685
|
*/
|
|
@@ -368,29 +711,38 @@ class Analytics {
|
|
|
368
711
|
getSessionContext() {
|
|
369
712
|
return this.sessionTracker.getSessionContext();
|
|
370
713
|
}
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
714
|
+
/**
|
|
715
|
+
* Track tool usage with error handling and automatic timing
|
|
716
|
+
*/
|
|
717
|
+
async trackTool(toolName, operation, getProperties) {
|
|
718
|
+
const startTime = Date.now();
|
|
719
|
+
try {
|
|
720
|
+
const result = await operation();
|
|
721
|
+
this.track("tool_used", {
|
|
722
|
+
tool: toolName,
|
|
723
|
+
success: true,
|
|
724
|
+
durationMs: Date.now() - startTime,
|
|
725
|
+
...getProperties ? getProperties(result) : {}
|
|
726
|
+
});
|
|
727
|
+
return result;
|
|
728
|
+
} catch (error) {
|
|
729
|
+
this.track("tool_used", {
|
|
730
|
+
tool: toolName,
|
|
731
|
+
success: false,
|
|
732
|
+
durationMs: Date.now() - startTime
|
|
733
|
+
});
|
|
734
|
+
if (error instanceof Error) {
|
|
735
|
+
this.captureException(error, {
|
|
736
|
+
tool: toolName,
|
|
737
|
+
context: "tool_execution",
|
|
738
|
+
durationMs: Date.now() - startTime
|
|
739
|
+
});
|
|
740
|
+
}
|
|
741
|
+
throw error;
|
|
742
|
+
}
|
|
392
743
|
}
|
|
393
744
|
}
|
|
745
|
+
const analytics = new Analytics();
|
|
394
746
|
function extractHostname(url) {
|
|
395
747
|
try {
|
|
396
748
|
const parsed = new URL(url);
|
|
@@ -410,28 +762,7 @@ function extractProtocol(urlOrPath) {
|
|
|
410
762
|
return "unknown";
|
|
411
763
|
}
|
|
412
764
|
}
|
|
413
|
-
|
|
414
|
-
return {
|
|
415
|
-
length: query.length,
|
|
416
|
-
wordCount: query.trim().split(/\s+/).length,
|
|
417
|
-
hasCodeTerms: /\b(function|class|import|export|const|let|var|def|async|await)\b/i.test(query),
|
|
418
|
-
hasSpecialChars: /[^\w\s]/.test(query)
|
|
419
|
-
};
|
|
420
|
-
}
|
|
421
|
-
function sanitizeErrorMessage(message) {
|
|
422
|
-
return message.replace(/https?:\/\/[^\s]+/gi, "[url]").replace(/file:\/\/[^\s]+/gi, "[file-url]").replace(/\/[^\s]*\.[a-z]{2,4}/gi, "[path]").replace(/[A-Za-z]:\\[^\s]+/g, "[path]").replace(/Bearer\s+[^\s]+/gi, "Bearer [token]").replace(/api[_-]?key[=:]\s*[^\s]+/gi, "api_key=[redacted]").replace(/token[=:]\s*[^\s]+/gi, "token=[redacted]").substring(0, 200);
|
|
423
|
-
}
|
|
424
|
-
function sanitizeError(error) {
|
|
425
|
-
return {
|
|
426
|
-
type: error.constructor.name,
|
|
427
|
-
message: sanitizeErrorMessage(error.message),
|
|
428
|
-
hasStack: Boolean(error.stack)
|
|
429
|
-
};
|
|
430
|
-
}
|
|
431
|
-
function extractCliFlags(argv) {
|
|
432
|
-
return argv.filter((arg) => arg.startsWith("--") || arg.startsWith("-"));
|
|
433
|
-
}
|
|
434
|
-
const version = "1.20.0";
|
|
765
|
+
const version = "1.21.1";
|
|
435
766
|
const packageJson = {
|
|
436
767
|
version
|
|
437
768
|
};
|
|
@@ -439,34 +770,48 @@ function getPackageVersion() {
|
|
|
439
770
|
return packageJson.version;
|
|
440
771
|
}
|
|
441
772
|
/**
 * Builds the telemetry session context for a CLI invocation.
 *
 * @param {string} [command] - CLI command name; recorded as "unknown" when falsy.
 * @param {{authEnabled?: boolean, readOnly?: boolean, embeddingContext?: Object}} [options]
 *   Optional flags. When `embeddingContext` is truthy its fields are merged over the
 *   base session.
 * @returns {Object} Session context with a fresh random `sessionId` and the current time
 *   as `startTime`.
 */
function createCliSession(command, options) {
  const baseSession = {
    sessionId: randomUUID(),
    appInterface: "cli",
    startTime: /* @__PURE__ */ new Date(),
    appVersion: getPackageVersion(),
    appPlatform: process.platform,
    appNodeVersion: process.version,
    cliCommand: command || "unknown",
    appAuthEnabled: options?.authEnabled,
    appReadOnly: options?.readOnly,
    // CLI typically runs embedded worker
    appServicesEnabled: ["worker"]
  };
  const embeddingContext = options?.embeddingContext;
  return embeddingContext ? { ...baseSession, ...embeddingContext } : baseSession;
}
|
|
456
794
|
/**
 * Builds the telemetry session context for an MCP server session.
 *
 * @param {{protocol?: string, transport?: string, authEnabled?: boolean, readOnly?: boolean,
 *   servicesEnabled?: string[], embeddingContext?: Object}} options - Session options.
 *   `protocol` falls back to "stdio" when falsy; `authEnabled` / `readOnly` fall back to
 *   false and `servicesEnabled` to ["mcp"] when null or undefined. When `embeddingContext`
 *   is truthy its fields are merged over the base session.
 * @returns {Object} Session context with a fresh random `sessionId` and the current time
 *   as `startTime`.
 */
function createMcpSession(options) {
  const baseSession = {
    sessionId: randomUUID(),
    appInterface: "mcp",
    startTime: /* @__PURE__ */ new Date(),
    appVersion: getPackageVersion(),
    appPlatform: process.platform,
    appNodeVersion: process.version,
    mcpProtocol: options.protocol || "stdio",
    mcpTransport: options.transport,
    appAuthEnabled: options.authEnabled ?? false,
    appReadOnly: options.readOnly ?? false,
    appServicesEnabled: options.servicesEnabled ?? ["mcp"]
  };
  const { embeddingContext } = options;
  return embeddingContext ? { ...baseSession, ...embeddingContext } : baseSession;
}
|
|
471
816
|
function createTelemetryService() {
|
|
472
817
|
return {
|
|
@@ -929,42 +1274,55 @@ class CancelJobTool {
|
|
|
929
1274
|
* @returns A promise that resolves with the outcome message.
|
|
930
1275
|
*/
|
|
931
1276
|
async execute(input) {
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
1277
|
+
return analytics.trackTool(
|
|
1278
|
+
"cancel_job",
|
|
1279
|
+
async () => {
|
|
1280
|
+
try {
|
|
1281
|
+
const job = await this.pipeline.getJob(input.jobId);
|
|
1282
|
+
if (!job) {
|
|
1283
|
+
logger.warn(`❓ [CancelJobTool] Job not found: ${input.jobId}`);
|
|
1284
|
+
return {
|
|
1285
|
+
message: `Job with ID ${input.jobId} not found.`,
|
|
1286
|
+
success: false
|
|
1287
|
+
};
|
|
1288
|
+
}
|
|
1289
|
+
if (job.status === PipelineJobStatus.COMPLETED || // Use enum member
|
|
1290
|
+
job.status === PipelineJobStatus.FAILED || // Use enum member
|
|
1291
|
+
job.status === PipelineJobStatus.CANCELLED) {
|
|
1292
|
+
logger.debug(
|
|
1293
|
+
`Job ${input.jobId} is already in a final state: ${job.status}.`
|
|
1294
|
+
);
|
|
1295
|
+
return {
|
|
1296
|
+
message: `Job ${input.jobId} is already ${job.status}. No action taken.`,
|
|
1297
|
+
success: true
|
|
1298
|
+
// Considered success as no cancellation needed
|
|
1299
|
+
};
|
|
1300
|
+
}
|
|
1301
|
+
await this.pipeline.cancelJob(input.jobId);
|
|
1302
|
+
const updatedJob = await this.pipeline.getJob(input.jobId);
|
|
1303
|
+
const finalStatus = updatedJob?.status ?? "UNKNOWN (job disappeared?)";
|
|
1304
|
+
logger.debug(
|
|
1305
|
+
`Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}`
|
|
1306
|
+
);
|
|
1307
|
+
return {
|
|
1308
|
+
message: `Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}.`,
|
|
1309
|
+
success: true
|
|
1310
|
+
};
|
|
1311
|
+
} catch (error) {
|
|
1312
|
+
logger.error(`❌ Error cancelling job ${input.jobId}: ${error}`);
|
|
1313
|
+
return {
|
|
1314
|
+
message: `Failed to cancel job ${input.jobId}: ${error instanceof Error ? error.message : String(error)}`,
|
|
1315
|
+
success: false
|
|
1316
|
+
};
|
|
1317
|
+
}
|
|
1318
|
+
},
|
|
1319
|
+
(result) => {
|
|
945
1320
|
return {
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
// Considered success as no cancellation needed
|
|
1321
|
+
success: result.success
|
|
1322
|
+
// Note: success flag already indicates if cancellation was successful
|
|
949
1323
|
};
|
|
950
1324
|
}
|
|
951
|
-
|
|
952
|
-
const updatedJob = await this.pipeline.getJob(input.jobId);
|
|
953
|
-
const finalStatus = updatedJob?.status ?? "UNKNOWN (job disappeared?)";
|
|
954
|
-
logger.debug(
|
|
955
|
-
`Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}`
|
|
956
|
-
);
|
|
957
|
-
return {
|
|
958
|
-
message: `Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}.`,
|
|
959
|
-
success: true
|
|
960
|
-
};
|
|
961
|
-
} catch (error) {
|
|
962
|
-
logger.error(`❌ Error cancelling job ${input.jobId}: ${error}`);
|
|
963
|
-
return {
|
|
964
|
-
message: `Failed to cancel job ${input.jobId}: ${error instanceof Error ? error.message : String(error)}`,
|
|
965
|
-
success: false
|
|
966
|
-
};
|
|
967
|
-
}
|
|
1325
|
+
);
|
|
968
1326
|
}
|
|
969
1327
|
}
|
|
970
1328
|
class ClearCompletedJobsTool {
|
|
@@ -982,24 +1340,33 @@ class ClearCompletedJobsTool {
|
|
|
982
1340
|
* @returns A promise that resolves with the outcome of the clear operation.
|
|
983
1341
|
*/
|
|
984
1342
|
async execute(_input) {
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1343
|
+
return analytics.trackTool(
|
|
1344
|
+
"clear_completed_jobs",
|
|
1345
|
+
async () => {
|
|
1346
|
+
try {
|
|
1347
|
+
const clearedCount = await this.pipeline.clearCompletedJobs();
|
|
1348
|
+
const message = clearedCount > 0 ? `Successfully cleared ${clearedCount} completed job${clearedCount === 1 ? "" : "s"} from the queue.` : "No completed jobs to clear.";
|
|
1349
|
+
logger.debug(message);
|
|
1350
|
+
return {
|
|
1351
|
+
message,
|
|
1352
|
+
success: true,
|
|
1353
|
+
clearedCount
|
|
1354
|
+
};
|
|
1355
|
+
} catch (error) {
|
|
1356
|
+
const errorMessage = `Failed to clear completed jobs: ${error instanceof Error ? error.message : String(error)}`;
|
|
1357
|
+
logger.error(`❌ ${errorMessage}`);
|
|
1358
|
+
return {
|
|
1359
|
+
message: errorMessage,
|
|
1360
|
+
success: false,
|
|
1361
|
+
clearedCount: 0
|
|
1362
|
+
};
|
|
1363
|
+
}
|
|
1364
|
+
},
|
|
1365
|
+
(result) => ({
|
|
1366
|
+
success: result.success,
|
|
1367
|
+
clearedCount: result.clearedCount
|
|
1368
|
+
})
|
|
1369
|
+
);
|
|
1003
1370
|
}
|
|
1004
1371
|
}
|
|
1005
1372
|
class ToolError extends Error {
|
|
@@ -1937,91 +2304,106 @@ class FetchUrlTool {
|
|
|
1937
2304
|
* @throws {ToolError} If fetching or processing fails
|
|
1938
2305
|
*/
|
|
1939
2306
|
async execute(options) {
|
|
1940
|
-
|
|
1941
|
-
|
|
1942
|
-
|
|
1943
|
-
|
|
1944
|
-
|
|
1945
|
-
|
|
1946
|
-
|
|
1947
|
-
|
|
1948
|
-
|
|
1949
|
-
|
|
1950
|
-
const htmlPipeline = new HtmlPipeline();
|
|
1951
|
-
const markdownPipeline = new MarkdownPipeline();
|
|
1952
|
-
const pipelines = [htmlPipeline, markdownPipeline];
|
|
1953
|
-
try {
|
|
1954
|
-
logger.info(`📡 Fetching ${url}...`);
|
|
1955
|
-
const rawContent = await fetcher.fetch(url, {
|
|
1956
|
-
followRedirects: options.followRedirects ?? true,
|
|
1957
|
-
maxRetries: 3,
|
|
1958
|
-
headers
|
|
1959
|
-
// propagate custom headers
|
|
1960
|
-
});
|
|
1961
|
-
logger.info("🔄 Processing content...");
|
|
1962
|
-
let processed;
|
|
1963
|
-
for (const pipeline of pipelines) {
|
|
1964
|
-
if (pipeline.canProcess(rawContent)) {
|
|
1965
|
-
processed = await pipeline.process(
|
|
1966
|
-
rawContent,
|
|
1967
|
-
{
|
|
1968
|
-
url,
|
|
1969
|
-
library: "",
|
|
1970
|
-
version: "",
|
|
1971
|
-
maxDepth: 0,
|
|
1972
|
-
maxPages: 1,
|
|
1973
|
-
maxConcurrency: 1,
|
|
1974
|
-
scope: "subpages",
|
|
1975
|
-
followRedirects: options.followRedirects ?? true,
|
|
1976
|
-
excludeSelectors: void 0,
|
|
1977
|
-
ignoreErrors: false,
|
|
1978
|
-
scrapeMode,
|
|
1979
|
-
headers
|
|
1980
|
-
// propagate custom headers
|
|
1981
|
-
},
|
|
1982
|
-
fetcher
|
|
2307
|
+
return analytics.trackTool(
|
|
2308
|
+
"fetch_url",
|
|
2309
|
+
async () => {
|
|
2310
|
+
const { url, scrapeMode = ScrapeMode.Auto, headers } = options;
|
|
2311
|
+
const canFetchResults = this.fetchers.map((f) => f.canFetch(url));
|
|
2312
|
+
const fetcherIndex = canFetchResults.indexOf(true);
|
|
2313
|
+
if (fetcherIndex === -1) {
|
|
2314
|
+
throw new ToolError(
|
|
2315
|
+
`Invalid URL: ${url}. Must be an HTTP/HTTPS URL or a file:// URL.`,
|
|
2316
|
+
this.constructor.name
|
|
1983
2317
|
);
|
|
1984
|
-
break;
|
|
1985
2318
|
}
|
|
2319
|
+
const fetcher = this.fetchers[fetcherIndex];
|
|
2320
|
+
const htmlPipeline = new HtmlPipeline();
|
|
2321
|
+
const markdownPipeline = new MarkdownPipeline();
|
|
2322
|
+
const pipelines = [htmlPipeline, markdownPipeline];
|
|
2323
|
+
try {
|
|
2324
|
+
logger.info(`📡 Fetching ${url}...`);
|
|
2325
|
+
const rawContent = await fetcher.fetch(url, {
|
|
2326
|
+
followRedirects: options.followRedirects ?? true,
|
|
2327
|
+
maxRetries: 3,
|
|
2328
|
+
headers
|
|
2329
|
+
// propagate custom headers
|
|
2330
|
+
});
|
|
2331
|
+
logger.info("🔄 Processing content...");
|
|
2332
|
+
let processed;
|
|
2333
|
+
for (const pipeline of pipelines) {
|
|
2334
|
+
if (pipeline.canProcess(rawContent)) {
|
|
2335
|
+
processed = await pipeline.process(
|
|
2336
|
+
rawContent,
|
|
2337
|
+
{
|
|
2338
|
+
url,
|
|
2339
|
+
library: "",
|
|
2340
|
+
version: "",
|
|
2341
|
+
maxDepth: 0,
|
|
2342
|
+
maxPages: 1,
|
|
2343
|
+
maxConcurrency: 1,
|
|
2344
|
+
scope: "subpages",
|
|
2345
|
+
followRedirects: options.followRedirects ?? true,
|
|
2346
|
+
excludeSelectors: void 0,
|
|
2347
|
+
ignoreErrors: false,
|
|
2348
|
+
scrapeMode,
|
|
2349
|
+
headers
|
|
2350
|
+
// propagate custom headers
|
|
2351
|
+
},
|
|
2352
|
+
fetcher
|
|
2353
|
+
);
|
|
2354
|
+
break;
|
|
2355
|
+
}
|
|
2356
|
+
}
|
|
2357
|
+
if (!processed) {
|
|
2358
|
+
logger.warn(
|
|
2359
|
+
`⚠️ Unsupported content type "${rawContent.mimeType}" for ${url}. Returning raw content.`
|
|
2360
|
+
);
|
|
2361
|
+
const resolvedCharset = resolveCharset(
|
|
2362
|
+
rawContent.charset,
|
|
2363
|
+
rawContent.content,
|
|
2364
|
+
rawContent.mimeType
|
|
2365
|
+
);
|
|
2366
|
+
const contentString = convertToString(rawContent.content, resolvedCharset);
|
|
2367
|
+
return contentString;
|
|
2368
|
+
}
|
|
2369
|
+
for (const err of processed.errors) {
|
|
2370
|
+
logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
|
|
2371
|
+
}
|
|
2372
|
+
if (typeof processed.textContent !== "string" || !processed.textContent.trim()) {
|
|
2373
|
+
throw new ToolError(
|
|
2374
|
+
`Processing resulted in empty content for ${url}`,
|
|
2375
|
+
this.constructor.name
|
|
2376
|
+
);
|
|
2377
|
+
}
|
|
2378
|
+
logger.info(`✅ Successfully processed ${url}`);
|
|
2379
|
+
return processed.textContent;
|
|
2380
|
+
} catch (error) {
|
|
2381
|
+
if (error instanceof ScraperError || error instanceof ToolError) {
|
|
2382
|
+
throw new ToolError(
|
|
2383
|
+
`Failed to fetch or process URL: ${error.message}`,
|
|
2384
|
+
this.constructor.name
|
|
2385
|
+
);
|
|
2386
|
+
}
|
|
2387
|
+
throw new ToolError(
|
|
2388
|
+
`Failed to fetch or process URL: ${error instanceof Error ? error.message : String(error)}`,
|
|
2389
|
+
this.constructor.name
|
|
2390
|
+
);
|
|
2391
|
+
} finally {
|
|
2392
|
+
await htmlPipeline.close();
|
|
2393
|
+
await markdownPipeline.close();
|
|
2394
|
+
}
|
|
2395
|
+
},
|
|
2396
|
+
(result) => {
|
|
2397
|
+
const { url, scrapeMode, followRedirects, headers } = options;
|
|
2398
|
+
return {
|
|
2399
|
+
url,
|
|
2400
|
+
scrapeMode,
|
|
2401
|
+
followRedirects,
|
|
2402
|
+
contentLength: result.length,
|
|
2403
|
+
hasHeaders: !!headers
|
|
2404
|
+
};
|
|
1986
2405
|
}
|
|
1987
|
-
|
|
1988
|
-
logger.warn(
|
|
1989
|
-
`⚠️ Unsupported content type "${rawContent.mimeType}" for ${url}. Returning raw content.`
|
|
1990
|
-
);
|
|
1991
|
-
const resolvedCharset = resolveCharset(
|
|
1992
|
-
rawContent.charset,
|
|
1993
|
-
rawContent.content,
|
|
1994
|
-
rawContent.mimeType
|
|
1995
|
-
);
|
|
1996
|
-
const contentString = convertToString(rawContent.content, resolvedCharset);
|
|
1997
|
-
return contentString;
|
|
1998
|
-
}
|
|
1999
|
-
for (const err of processed.errors) {
|
|
2000
|
-
logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
|
|
2001
|
-
}
|
|
2002
|
-
if (typeof processed.textContent !== "string" || !processed.textContent.trim()) {
|
|
2003
|
-
throw new ToolError(
|
|
2004
|
-
`Processing resulted in empty content for ${url}`,
|
|
2005
|
-
this.constructor.name
|
|
2006
|
-
);
|
|
2007
|
-
}
|
|
2008
|
-
logger.info(`✅ Successfully processed ${url}`);
|
|
2009
|
-
return processed.textContent;
|
|
2010
|
-
} catch (error) {
|
|
2011
|
-
if (error instanceof ScraperError || error instanceof ToolError) {
|
|
2012
|
-
throw new ToolError(
|
|
2013
|
-
`Failed to fetch or process URL: ${error.message}`,
|
|
2014
|
-
this.constructor.name
|
|
2015
|
-
);
|
|
2016
|
-
}
|
|
2017
|
-
throw new ToolError(
|
|
2018
|
-
`Failed to fetch or process URL: ${error instanceof Error ? error.message : String(error)}`,
|
|
2019
|
-
this.constructor.name
|
|
2020
|
-
);
|
|
2021
|
-
} finally {
|
|
2022
|
-
await htmlPipeline.close();
|
|
2023
|
-
await markdownPipeline.close();
|
|
2024
|
-
}
|
|
2406
|
+
);
|
|
2025
2407
|
}
|
|
2026
2408
|
}
|
|
2027
2409
|
class FindVersionTool {
|
|
@@ -2034,35 +2416,50 @@ class FindVersionTool {
|
|
|
2034
2416
|
* @returns A descriptive string indicating the best match and unversioned status, or an error message.
|
|
2035
2417
|
*/
|
|
2036
2418
|
async execute(options) {
|
|
2037
|
-
|
|
2038
|
-
|
|
2039
|
-
|
|
2040
|
-
|
|
2041
|
-
library
|
|
2042
|
-
|
|
2043
|
-
|
|
2044
|
-
|
|
2045
|
-
|
|
2046
|
-
|
|
2047
|
-
|
|
2048
|
-
|
|
2419
|
+
return analytics.trackTool(
|
|
2420
|
+
"find_version",
|
|
2421
|
+
async () => {
|
|
2422
|
+
const { library, targetVersion } = options;
|
|
2423
|
+
const libraryAndVersion = `${library}${targetVersion ? `@${targetVersion}` : ""}`;
|
|
2424
|
+
try {
|
|
2425
|
+
const { bestMatch, hasUnversioned } = await this.docService.findBestVersion(
|
|
2426
|
+
library,
|
|
2427
|
+
targetVersion
|
|
2428
|
+
);
|
|
2429
|
+
let message = "";
|
|
2430
|
+
if (bestMatch) {
|
|
2431
|
+
message = `Best match: ${bestMatch}.`;
|
|
2432
|
+
if (hasUnversioned) {
|
|
2433
|
+
message += " Unversioned docs also available.";
|
|
2434
|
+
}
|
|
2435
|
+
} else if (hasUnversioned) {
|
|
2436
|
+
message = `No matching version found for ${libraryAndVersion}, but unversioned docs exist.`;
|
|
2437
|
+
} else {
|
|
2438
|
+
message = `No matching version or unversioned documents found for ${libraryAndVersion}.`;
|
|
2439
|
+
}
|
|
2440
|
+
return { message, bestMatch, hasUnversioned };
|
|
2441
|
+
} catch (error) {
|
|
2442
|
+
if (error instanceof VersionNotFoundError) {
|
|
2443
|
+
logger.info(`ℹ️ Version not found: ${error.message}`);
|
|
2444
|
+
const message = `No matching version or unversioned documents found for ${libraryAndVersion}. Available: ${error.availableVersions.length > 0 ? error.availableVersions.map((v) => v.version).join(", ") : "None"}.`;
|
|
2445
|
+
return { message, bestMatch: null, hasUnversioned: false };
|
|
2446
|
+
}
|
|
2447
|
+
logger.error(
|
|
2448
|
+
`❌ Error finding version for ${libraryAndVersion}: ${error instanceof Error ? error.message : error}`
|
|
2449
|
+
);
|
|
2450
|
+
throw error;
|
|
2049
2451
|
}
|
|
2050
|
-
}
|
|
2051
|
-
|
|
2052
|
-
|
|
2053
|
-
|
|
2054
|
-
|
|
2055
|
-
|
|
2056
|
-
|
|
2057
|
-
|
|
2058
|
-
|
|
2059
|
-
return `No matching version or unversioned documents found for ${libraryAndVersion}. Available: ${error.availableVersions.length > 0 ? error.availableVersions.map((v) => v.version).join(", ") : "None"}.`;
|
|
2452
|
+
},
|
|
2453
|
+
(result) => {
|
|
2454
|
+
const { library, targetVersion } = options;
|
|
2455
|
+
return {
|
|
2456
|
+
library,
|
|
2457
|
+
targetVersion,
|
|
2458
|
+
foundMatch: !!result.bestMatch,
|
|
2459
|
+
hasUnversioned: result.hasUnversioned
|
|
2460
|
+
};
|
|
2060
2461
|
}
|
|
2061
|
-
|
|
2062
|
-
`❌ Error finding version for ${libraryAndVersion}: ${error instanceof Error ? error.message : error}`
|
|
2063
|
-
);
|
|
2064
|
-
throw error;
|
|
2065
|
-
}
|
|
2462
|
+
).then((result) => result.message);
|
|
2066
2463
|
}
|
|
2067
2464
|
}
|
|
2068
2465
|
class GetJobInfoTool {
|
|
@@ -2080,29 +2477,41 @@ class GetJobInfoTool {
|
|
|
2080
2477
|
* @returns A promise that resolves with the simplified job info or null if not found.
|
|
2081
2478
|
*/
|
|
2082
2479
|
async execute(input) {
|
|
2083
|
-
|
|
2084
|
-
|
|
2085
|
-
|
|
2086
|
-
|
|
2087
|
-
|
|
2088
|
-
|
|
2089
|
-
|
|
2090
|
-
|
|
2091
|
-
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2101
|
-
|
|
2102
|
-
|
|
2103
|
-
|
|
2104
|
-
|
|
2105
|
-
|
|
2480
|
+
return analytics.trackTool(
|
|
2481
|
+
"get_job_info",
|
|
2482
|
+
async () => {
|
|
2483
|
+
const job = await this.pipeline.getJob(input.jobId);
|
|
2484
|
+
if (!job) {
|
|
2485
|
+
return { job: null };
|
|
2486
|
+
}
|
|
2487
|
+
const jobInfo = {
|
|
2488
|
+
id: job.id,
|
|
2489
|
+
library: job.library,
|
|
2490
|
+
version: job.version,
|
|
2491
|
+
status: job.status,
|
|
2492
|
+
dbStatus: job.versionStatus,
|
|
2493
|
+
createdAt: job.createdAt.toISOString(),
|
|
2494
|
+
startedAt: job.startedAt?.toISOString() ?? null,
|
|
2495
|
+
finishedAt: job.finishedAt?.toISOString() ?? null,
|
|
2496
|
+
error: job.error?.message ?? null,
|
|
2497
|
+
progress: job.progressMaxPages && job.progressMaxPages > 0 ? {
|
|
2498
|
+
pages: job.progressPages || 0,
|
|
2499
|
+
totalPages: job.progressMaxPages,
|
|
2500
|
+
totalDiscovered: job.progress?.totalDiscovered || job.progressMaxPages
|
|
2501
|
+
} : void 0,
|
|
2502
|
+
updatedAt: job.updatedAt?.toISOString(),
|
|
2503
|
+
errorMessage: job.errorMessage ?? void 0
|
|
2504
|
+
};
|
|
2505
|
+
return { job: jobInfo };
|
|
2506
|
+
},
|
|
2507
|
+
(result) => {
|
|
2508
|
+
return {
|
|
2509
|
+
found: result.job !== null,
|
|
2510
|
+
library: result.job?.library,
|
|
2511
|
+
version: result.job?.version
|
|
2512
|
+
};
|
|
2513
|
+
}
|
|
2514
|
+
);
|
|
2106
2515
|
}
|
|
2107
2516
|
}
|
|
2108
2517
|
class ListJobsTool {
|
|
@@ -2120,28 +2529,45 @@ class ListJobsTool {
|
|
|
2120
2529
|
* @returns A promise that resolves with the list of simplified job objects.
|
|
2121
2530
|
*/
|
|
2122
2531
|
async execute(input) {
|
|
2123
|
-
|
|
2124
|
-
|
|
2125
|
-
|
|
2126
|
-
|
|
2127
|
-
|
|
2128
|
-
|
|
2129
|
-
|
|
2130
|
-
|
|
2131
|
-
|
|
2132
|
-
|
|
2133
|
-
|
|
2134
|
-
|
|
2135
|
-
|
|
2136
|
-
|
|
2137
|
-
|
|
2138
|
-
|
|
2139
|
-
|
|
2140
|
-
|
|
2141
|
-
|
|
2142
|
-
|
|
2143
|
-
|
|
2144
|
-
|
|
2532
|
+
return analytics.trackTool(
|
|
2533
|
+
"list_jobs",
|
|
2534
|
+
async () => {
|
|
2535
|
+
const jobs = await this.pipeline.getJobs(input.status);
|
|
2536
|
+
const simplifiedJobs = jobs.map((job) => {
|
|
2537
|
+
return {
|
|
2538
|
+
id: job.id,
|
|
2539
|
+
library: job.library,
|
|
2540
|
+
version: job.version,
|
|
2541
|
+
status: job.status,
|
|
2542
|
+
dbStatus: job.versionStatus,
|
|
2543
|
+
createdAt: job.createdAt.toISOString(),
|
|
2544
|
+
startedAt: job.startedAt?.toISOString() ?? null,
|
|
2545
|
+
finishedAt: job.finishedAt?.toISOString() ?? null,
|
|
2546
|
+
error: job.error?.message ?? null,
|
|
2547
|
+
progress: job.progressMaxPages && job.progressMaxPages > 0 ? {
|
|
2548
|
+
pages: job.progressPages || 0,
|
|
2549
|
+
totalPages: job.progressMaxPages,
|
|
2550
|
+
totalDiscovered: job.progress?.totalDiscovered || job.progressMaxPages
|
|
2551
|
+
} : void 0,
|
|
2552
|
+
updatedAt: job.updatedAt?.toISOString(),
|
|
2553
|
+
errorMessage: job.errorMessage ?? void 0
|
|
2554
|
+
};
|
|
2555
|
+
});
|
|
2556
|
+
return { jobs: simplifiedJobs };
|
|
2557
|
+
},
|
|
2558
|
+
(result) => {
|
|
2559
|
+
return {
|
|
2560
|
+
jobCount: result.jobs.length,
|
|
2561
|
+
statusCounts: result.jobs.reduce(
|
|
2562
|
+
(acc, job) => {
|
|
2563
|
+
acc[job.status] = (acc[job.status] || 0) + 1;
|
|
2564
|
+
return acc;
|
|
2565
|
+
},
|
|
2566
|
+
{}
|
|
2567
|
+
)
|
|
2568
|
+
};
|
|
2569
|
+
}
|
|
2570
|
+
);
|
|
2145
2571
|
}
|
|
2146
2572
|
}
|
|
2147
2573
|
class ListLibrariesTool {
|
|
@@ -2150,20 +2576,32 @@ class ListLibrariesTool {
|
|
|
2150
2576
|
this.docService = docService;
|
|
2151
2577
|
}
|
|
2152
2578
|
async execute(_options) {
|
|
2153
|
-
|
|
2154
|
-
|
|
2155
|
-
|
|
2156
|
-
|
|
2157
|
-
|
|
2158
|
-
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
|
|
2162
|
-
|
|
2163
|
-
|
|
2164
|
-
|
|
2165
|
-
|
|
2166
|
-
|
|
2579
|
+
return analytics.trackTool(
|
|
2580
|
+
"list_libraries",
|
|
2581
|
+
async () => {
|
|
2582
|
+
const rawLibraries = await this.docService.listLibraries();
|
|
2583
|
+
const libraries = rawLibraries.map(({ library, versions }) => ({
|
|
2584
|
+
name: library,
|
|
2585
|
+
versions: versions.map((v) => ({
|
|
2586
|
+
version: v.ref.version,
|
|
2587
|
+
documentCount: v.counts.documents,
|
|
2588
|
+
uniqueUrlCount: v.counts.uniqueUrls,
|
|
2589
|
+
indexedAt: v.indexedAt,
|
|
2590
|
+
status: v.status,
|
|
2591
|
+
...v.progress ? { progress: v.progress } : void 0,
|
|
2592
|
+
sourceUrl: v.sourceUrl
|
|
2593
|
+
}))
|
|
2594
|
+
}));
|
|
2595
|
+
return { libraries };
|
|
2596
|
+
},
|
|
2597
|
+
(result) => ({
|
|
2598
|
+
libraryCount: result.libraries.length,
|
|
2599
|
+
totalVersions: result.libraries.reduce(
|
|
2600
|
+
(sum, lib) => sum + lib.versions.length,
|
|
2601
|
+
0
|
|
2602
|
+
)
|
|
2603
|
+
})
|
|
2604
|
+
);
|
|
2167
2605
|
}
|
|
2168
2606
|
}
|
|
2169
2607
|
class RemoveTool {
|
|
@@ -2177,29 +2615,42 @@ class RemoveTool {
|
|
|
2177
2615
|
* Removes all documents, the version record, and the library if no other versions exist.
|
|
2178
2616
|
*/
|
|
2179
2617
|
async execute(args) {
|
|
2180
|
-
|
|
2181
|
-
|
|
2182
|
-
|
|
2183
|
-
|
|
2184
|
-
|
|
2185
|
-
|
|
2186
|
-
|
|
2187
|
-
|
|
2188
|
-
|
|
2189
|
-
|
|
2190
|
-
|
|
2191
|
-
|
|
2192
|
-
|
|
2618
|
+
return analytics.trackTool(
|
|
2619
|
+
"remove_docs",
|
|
2620
|
+
async () => {
|
|
2621
|
+
const { library, version: version2 } = args;
|
|
2622
|
+
logger.info(`🗑️ Removing library: ${library}${version2 ? `@${version2}` : ""}`);
|
|
2623
|
+
try {
|
|
2624
|
+
const allJobs = await this.pipeline.getJobs();
|
|
2625
|
+
const jobs = allJobs.filter(
|
|
2626
|
+
(job) => job.library === library && job.version === (version2 ?? "") && (job.status === PipelineJobStatus.QUEUED || job.status === PipelineJobStatus.RUNNING)
|
|
2627
|
+
);
|
|
2628
|
+
for (const job of jobs) {
|
|
2629
|
+
logger.info(
|
|
2630
|
+
`🚫 Aborting job for ${library}@${version2 ?? ""} before deletion: ${job.id}`
|
|
2631
|
+
);
|
|
2632
|
+
await this.pipeline.cancelJob(job.id);
|
|
2633
|
+
await this.pipeline.waitForJobCompletion(job.id);
|
|
2634
|
+
}
|
|
2635
|
+
await this.documentManagementService.removeVersion(library, version2);
|
|
2636
|
+
const message = `Successfully removed ${library}${version2 ? `@${version2}` : ""}.`;
|
|
2637
|
+
logger.info(`✅ ${message}`);
|
|
2638
|
+
return { message };
|
|
2639
|
+
} catch (error) {
|
|
2640
|
+
const errorMessage = `Failed to remove ${library}${version2 ? `@${version2}` : ""}: ${error instanceof Error ? error.message : String(error)}`;
|
|
2641
|
+
logger.error(`❌ Error removing library: ${errorMessage}`);
|
|
2642
|
+
throw new ToolError(errorMessage, this.constructor.name);
|
|
2643
|
+
}
|
|
2644
|
+
},
|
|
2645
|
+
() => {
|
|
2646
|
+
const { library, version: version2 } = args;
|
|
2647
|
+
return {
|
|
2648
|
+
library,
|
|
2649
|
+
version: version2
|
|
2650
|
+
// Success is implicit since if this callback runs, no exception was thrown
|
|
2651
|
+
};
|
|
2193
2652
|
}
|
|
2194
|
-
|
|
2195
|
-
const message = `Successfully removed ${library}${version2 ? `@${version2}` : ""}.`;
|
|
2196
|
-
logger.info(`✅ ${message}`);
|
|
2197
|
-
return { message };
|
|
2198
|
-
} catch (error) {
|
|
2199
|
-
const errorMessage = `Failed to remove ${library}${version2 ? `@${version2}` : ""}: ${error instanceof Error ? error.message : String(error)}`;
|
|
2200
|
-
logger.error(`❌ Error removing library: ${errorMessage}`);
|
|
2201
|
-
throw new ToolError(errorMessage, this.constructor.name);
|
|
2202
|
-
}
|
|
2653
|
+
);
|
|
2203
2654
|
}
|
|
2204
2655
|
}
|
|
2205
2656
|
class ScrapeTool {
|
|
@@ -2215,66 +2666,80 @@ class ScrapeTool {
|
|
|
2215
2666
|
options: scraperOptions,
|
|
2216
2667
|
waitForCompletion = true
|
|
2217
2668
|
} = options;
|
|
2218
|
-
|
|
2219
|
-
|
|
2220
|
-
|
|
2221
|
-
|
|
2222
|
-
|
|
2223
|
-
|
|
2224
|
-
|
|
2225
|
-
internalVersion = validFullVersion;
|
|
2226
|
-
} else if (partialVersionRegex.test(version2)) {
|
|
2227
|
-
const coercedVersion = semver.coerce(version2);
|
|
2228
|
-
if (coercedVersion) {
|
|
2229
|
-
internalVersion = coercedVersion.version;
|
|
2669
|
+
return analytics.trackTool(
|
|
2670
|
+
"scrape_docs",
|
|
2671
|
+
async () => {
|
|
2672
|
+
let internalVersion;
|
|
2673
|
+
const partialVersionRegex = /^\d+(\.\d+)?$/;
|
|
2674
|
+
if (version2 === null || version2 === void 0) {
|
|
2675
|
+
internalVersion = "";
|
|
2230
2676
|
} else {
|
|
2231
|
-
|
|
2232
|
-
|
|
2233
|
-
|
|
2677
|
+
const validFullVersion = semver.valid(version2);
|
|
2678
|
+
if (validFullVersion) {
|
|
2679
|
+
internalVersion = validFullVersion;
|
|
2680
|
+
} else if (partialVersionRegex.test(version2)) {
|
|
2681
|
+
const coercedVersion = semver.coerce(version2);
|
|
2682
|
+
if (coercedVersion) {
|
|
2683
|
+
internalVersion = coercedVersion.version;
|
|
2684
|
+
} else {
|
|
2685
|
+
throw new Error(
|
|
2686
|
+
`Invalid version format for scraping: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`
|
|
2687
|
+
);
|
|
2688
|
+
}
|
|
2689
|
+
} else {
|
|
2690
|
+
throw new Error(
|
|
2691
|
+
`Invalid version format for scraping: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`
|
|
2692
|
+
);
|
|
2693
|
+
}
|
|
2234
2694
|
}
|
|
2235
|
-
|
|
2236
|
-
|
|
2237
|
-
|
|
2238
|
-
|
|
2239
|
-
|
|
2240
|
-
|
|
2241
|
-
|
|
2242
|
-
|
|
2243
|
-
|
|
2244
|
-
|
|
2245
|
-
|
|
2246
|
-
|
|
2247
|
-
|
|
2248
|
-
|
|
2249
|
-
|
|
2250
|
-
|
|
2251
|
-
|
|
2252
|
-
|
|
2253
|
-
|
|
2254
|
-
|
|
2255
|
-
|
|
2256
|
-
|
|
2257
|
-
|
|
2258
|
-
|
|
2259
|
-
|
|
2260
|
-
|
|
2261
|
-
|
|
2262
|
-
|
|
2263
|
-
|
|
2264
|
-
|
|
2265
|
-
|
|
2266
|
-
|
|
2267
|
-
|
|
2268
|
-
|
|
2269
|
-
|
|
2270
|
-
|
|
2271
|
-
};
|
|
2272
|
-
}
|
|
2273
|
-
|
|
2274
|
-
|
|
2275
|
-
|
|
2276
|
-
|
|
2277
|
-
|
|
2695
|
+
internalVersion = internalVersion.toLowerCase();
|
|
2696
|
+
const pipeline = this.pipeline;
|
|
2697
|
+
const enqueueVersion = internalVersion === "" ? null : internalVersion;
|
|
2698
|
+
const jobId = await pipeline.enqueueJob(library, enqueueVersion, {
|
|
2699
|
+
url,
|
|
2700
|
+
library,
|
|
2701
|
+
version: internalVersion,
|
|
2702
|
+
scope: scraperOptions?.scope ?? "subpages",
|
|
2703
|
+
followRedirects: scraperOptions?.followRedirects ?? true,
|
|
2704
|
+
maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES,
|
|
2705
|
+
maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH$1,
|
|
2706
|
+
maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
|
|
2707
|
+
ignoreErrors: scraperOptions?.ignoreErrors ?? true,
|
|
2708
|
+
scrapeMode: scraperOptions?.scrapeMode ?? ScrapeMode.Auto,
|
|
2709
|
+
// Pass scrapeMode enum
|
|
2710
|
+
includePatterns: scraperOptions?.includePatterns,
|
|
2711
|
+
excludePatterns: scraperOptions?.excludePatterns,
|
|
2712
|
+
headers: scraperOptions?.headers
|
|
2713
|
+
// <-- propagate headers
|
|
2714
|
+
});
|
|
2715
|
+
if (waitForCompletion) {
|
|
2716
|
+
try {
|
|
2717
|
+
await pipeline.waitForJobCompletion(jobId);
|
|
2718
|
+
const finalJob = await pipeline.getJob(jobId);
|
|
2719
|
+
const finalPagesScraped = finalJob?.progress?.pagesScraped ?? 0;
|
|
2720
|
+
logger.debug(
|
|
2721
|
+
`Job ${jobId} finished with status ${finalJob?.status}. Pages scraped: ${finalPagesScraped}`
|
|
2722
|
+
);
|
|
2723
|
+
return {
|
|
2724
|
+
pagesScraped: finalPagesScraped
|
|
2725
|
+
};
|
|
2726
|
+
} catch (error) {
|
|
2727
|
+
logger.error(`❌ Job ${jobId} failed or was cancelled: ${error}`);
|
|
2728
|
+
throw error;
|
|
2729
|
+
}
|
|
2730
|
+
}
|
|
2731
|
+
return { jobId };
|
|
2732
|
+
},
|
|
2733
|
+
(result) => ({
|
|
2734
|
+
library,
|
|
2735
|
+
version: version2,
|
|
2736
|
+
url,
|
|
2737
|
+
waitForCompletion,
|
|
2738
|
+
...scraperOptions,
|
|
2739
|
+
isBackgroundJob: "jobId" in result,
|
|
2740
|
+
pagesScraped: "pagesScraped" in result ? result.pagesScraped : void 0
|
|
2741
|
+
})
|
|
2742
|
+
);
|
|
2278
2743
|
}
|
|
2279
2744
|
}
|
|
2280
2745
|
class SearchTool {
|
|
@@ -2284,43 +2749,56 @@ class SearchTool {
|
|
|
2284
2749
|
}
|
|
2285
2750
|
async execute(options) {
|
|
2286
2751
|
const { library, version: version2, query, limit = 5, exactMatch = false } = options;
|
|
2287
|
-
|
|
2288
|
-
|
|
2289
|
-
|
|
2290
|
-
|
|
2291
|
-
|
|
2292
|
-
|
|
2293
|
-
|
|
2294
|
-
|
|
2295
|
-
|
|
2296
|
-
|
|
2297
|
-
|
|
2298
|
-
|
|
2299
|
-
|
|
2300
|
-
|
|
2301
|
-
|
|
2302
|
-
|
|
2303
|
-
|
|
2304
|
-
|
|
2305
|
-
|
|
2306
|
-
|
|
2307
|
-
|
|
2308
|
-
|
|
2309
|
-
|
|
2310
|
-
|
|
2752
|
+
return analytics.trackTool(
|
|
2753
|
+
"search_docs",
|
|
2754
|
+
async () => {
|
|
2755
|
+
if (exactMatch && (!version2 || version2 === "latest")) {
|
|
2756
|
+
await this.docService.validateLibraryExists(library);
|
|
2757
|
+
const allLibraries = await this.docService.listLibraries();
|
|
2758
|
+
const libraryInfo = allLibraries.find((lib) => lib.library === library);
|
|
2759
|
+
const detailedVersions = libraryInfo ? libraryInfo.versions.map((v) => ({
|
|
2760
|
+
version: v.ref.version,
|
|
2761
|
+
documentCount: v.counts.documents,
|
|
2762
|
+
uniqueUrlCount: v.counts.uniqueUrls,
|
|
2763
|
+
indexedAt: v.indexedAt
|
|
2764
|
+
})) : [];
|
|
2765
|
+
throw new VersionNotFoundError(library, version2 ?? "latest", detailedVersions);
|
|
2766
|
+
}
|
|
2767
|
+
const resolvedVersion = version2 || "latest";
|
|
2768
|
+
logger.info(
|
|
2769
|
+
`🔍 Searching ${library}@${resolvedVersion} for: ${query}${exactMatch ? " (exact match)" : ""}`
|
|
2770
|
+
);
|
|
2771
|
+
try {
|
|
2772
|
+
await this.docService.validateLibraryExists(library);
|
|
2773
|
+
let versionToSearch = resolvedVersion;
|
|
2774
|
+
if (!exactMatch) {
|
|
2775
|
+
const versionResult = await this.docService.findBestVersion(library, version2);
|
|
2776
|
+
versionToSearch = versionResult.bestMatch;
|
|
2777
|
+
}
|
|
2778
|
+
const results = await this.docService.searchStore(
|
|
2779
|
+
library,
|
|
2780
|
+
versionToSearch,
|
|
2781
|
+
query,
|
|
2782
|
+
limit
|
|
2783
|
+
);
|
|
2784
|
+
logger.info(`✅ Found ${results.length} matching results`);
|
|
2785
|
+
return { results };
|
|
2786
|
+
} catch (error) {
|
|
2787
|
+
logger.error(
|
|
2788
|
+
`❌ Search failed: ${error instanceof Error ? error.message : "Unknown error"}`
|
|
2789
|
+
);
|
|
2790
|
+
throw error;
|
|
2791
|
+
}
|
|
2792
|
+
},
|
|
2793
|
+
(result) => ({
|
|
2311
2794
|
library,
|
|
2312
|
-
|
|
2795
|
+
version: version2,
|
|
2313
2796
|
query,
|
|
2314
|
-
limit
|
|
2315
|
-
|
|
2316
|
-
|
|
2317
|
-
|
|
2318
|
-
|
|
2319
|
-
logger.error(
|
|
2320
|
-
`❌ Search failed: ${error instanceof Error ? error.message : "Unknown error"}`
|
|
2321
|
-
);
|
|
2322
|
-
throw error;
|
|
2323
|
-
}
|
|
2797
|
+
limit,
|
|
2798
|
+
exactMatch,
|
|
2799
|
+
resultCount: result.results.length
|
|
2800
|
+
})
|
|
2801
|
+
);
|
|
2324
2802
|
}
|
|
2325
2803
|
}
|
|
2326
2804
|
function createResponse(text) {
|
|
@@ -3050,6 +3528,7 @@ async function registerMcpService(server, docService, pipeline, readOnly = false
|
|
|
3050
3528
|
authEnabled: !!authManager,
|
|
3051
3529
|
readOnly,
|
|
3052
3530
|
servicesEnabled: ["mcp"]
|
|
3531
|
+
// Embedding context will be resolved by the service that starts MCP
|
|
3053
3532
|
});
|
|
3054
3533
|
analytics.startSession(session);
|
|
3055
3534
|
}
|
|
@@ -3100,23 +3579,10 @@ async function registerMcpService(server, docService, pipeline, readOnly = false
|
|
|
3100
3579
|
const requestTransport = new StreamableHTTPServerTransport({
|
|
3101
3580
|
sessionIdGenerator: void 0
|
|
3102
3581
|
});
|
|
3103
|
-
if (analytics.isEnabled()) {
|
|
3104
|
-
const session = createMcpSession({
|
|
3105
|
-
protocol: "http",
|
|
3106
|
-
transport: "streamable",
|
|
3107
|
-
authEnabled: !!authManager,
|
|
3108
|
-
readOnly,
|
|
3109
|
-
servicesEnabled: ["mcp"]
|
|
3110
|
-
});
|
|
3111
|
-
analytics.startSession(session);
|
|
3112
|
-
}
|
|
3113
3582
|
reply.raw.on("close", () => {
|
|
3114
3583
|
logger.debug("Streamable HTTP request closed");
|
|
3115
3584
|
requestTransport.close();
|
|
3116
3585
|
requestServer.close();
|
|
3117
|
-
if (analytics.isEnabled()) {
|
|
3118
|
-
analytics.endSession();
|
|
3119
|
-
}
|
|
3120
3586
|
});
|
|
3121
3587
|
await requestServer.connect(requestTransport);
|
|
3122
3588
|
await requestTransport.handleRequest(request.raw, reply.raw, request.body);
|
|
@@ -3260,6 +3726,15 @@ function createDataRouter(trpc) {
|
|
|
3260
3726
|
return results;
|
|
3261
3727
|
}
|
|
3262
3728
|
),
|
|
3729
|
+
removeVersion: tt.procedure.input(z$1.object({ library: nonEmpty, version: optionalVersion })).mutation(
|
|
3730
|
+
async ({
|
|
3731
|
+
ctx,
|
|
3732
|
+
input
|
|
3733
|
+
}) => {
|
|
3734
|
+
await ctx.docService.removeVersion(input.library, input.version ?? null);
|
|
3735
|
+
return { ok: true };
|
|
3736
|
+
}
|
|
3737
|
+
),
|
|
3263
3738
|
removeAllDocuments: tt.procedure.input(z$1.object({ library: nonEmpty, version: optionalVersion })).mutation(
|
|
3264
3739
|
async ({
|
|
3265
3740
|
ctx,
|
|
@@ -3517,97 +3992,30 @@ function registerClearCompletedJobsRoute(server, clearCompletedJobsTool) {
|
|
|
3517
3992
|
}
|
|
3518
3993
|
});
|
|
3519
3994
|
}
|
|
3520
|
-
const
|
|
3521
|
-
|
|
3522
|
-
|
|
3523
|
-
|
|
3524
|
-
|
|
3525
|
-
|
|
3526
|
-
|
|
3527
|
-
|
|
3528
|
-
|
|
3529
|
-
|
|
3530
|
-
|
|
3531
|
-
|
|
3532
|
-
|
|
3533
|
-
|
|
3534
|
-
|
|
3535
|
-
|
|
3536
|
-
|
|
3537
|
-
}
|
|
3538
|
-
|
|
3539
|
-
|
|
3540
|
-
|
|
3541
|
-
|
|
3542
|
-
|
|
3543
|
-
}
|
|
3544
|
-
function getStatusDescription(status) {
|
|
3545
|
-
const descriptions = {
|
|
3546
|
-
[
|
|
3547
|
-
"not_indexed"
|
|
3548
|
-
/* NOT_INDEXED */
|
|
3549
|
-
]: "Version created but not yet indexed",
|
|
3550
|
-
[
|
|
3551
|
-
"queued"
|
|
3552
|
-
/* QUEUED */
|
|
3553
|
-
]: "Waiting in queue for indexing",
|
|
3554
|
-
[
|
|
3555
|
-
"running"
|
|
3556
|
-
/* RUNNING */
|
|
3557
|
-
]: "Currently being indexed",
|
|
3558
|
-
[
|
|
3559
|
-
"completed"
|
|
3560
|
-
/* COMPLETED */
|
|
3561
|
-
]: "Successfully indexed",
|
|
3562
|
-
[
|
|
3563
|
-
"failed"
|
|
3564
|
-
/* FAILED */
|
|
3565
|
-
]: "Indexing failed",
|
|
3566
|
-
[
|
|
3567
|
-
"cancelled"
|
|
3568
|
-
/* CANCELLED */
|
|
3569
|
-
]: "Indexing was cancelled",
|
|
3570
|
-
[
|
|
3571
|
-
"updating"
|
|
3572
|
-
/* UPDATING */
|
|
3573
|
-
]: "Re-indexing in progress"
|
|
3574
|
-
};
|
|
3575
|
-
return descriptions[status] || "Unknown status";
|
|
3576
|
-
}
|
|
3577
|
-
function isActiveStatus(status) {
|
|
3578
|
-
return [
|
|
3579
|
-
"queued",
|
|
3580
|
-
"running",
|
|
3581
|
-
"updating"
|
|
3582
|
-
/* UPDATING */
|
|
3583
|
-
].includes(
|
|
3584
|
-
status
|
|
3585
|
-
);
|
|
3586
|
-
}
|
|
3587
|
-
const VersionBadge = ({ version: version2 }) => {
|
|
3588
|
-
if (!version2) {
|
|
3589
|
-
return null;
|
|
3590
|
-
}
|
|
3591
|
-
return /* @__PURE__ */ jsx("span", { class: "bg-purple-100 text-purple-800 text-xs font-medium me-2 px-1.5 py-0.5 rounded dark:bg-purple-900 dark:text-purple-300", children: /* @__PURE__ */ jsx("span", { safe: true, children: version2 }) });
|
|
3592
|
-
};
|
|
3593
|
-
function getStatusClasses(status) {
|
|
3594
|
-
const baseClasses = "px-1.5 py-0.5 text-xs font-medium rounded";
|
|
3595
|
-
switch (status) {
|
|
3596
|
-
case VersionStatus.COMPLETED:
|
|
3597
|
-
return `${baseClasses} bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-300`;
|
|
3598
|
-
case VersionStatus.RUNNING:
|
|
3599
|
-
case VersionStatus.UPDATING:
|
|
3600
|
-
return `${baseClasses} bg-blue-100 text-blue-800 dark:bg-blue-900 dark:text-blue-300`;
|
|
3601
|
-
case VersionStatus.QUEUED:
|
|
3602
|
-
return `${baseClasses} bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-yellow-300`;
|
|
3603
|
-
case VersionStatus.FAILED:
|
|
3604
|
-
return `${baseClasses} bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-300`;
|
|
3605
|
-
case VersionStatus.CANCELLED:
|
|
3606
|
-
return `${baseClasses} bg-gray-100 text-gray-800 dark:bg-gray-900 dark:text-gray-300`;
|
|
3607
|
-
case VersionStatus.NOT_INDEXED:
|
|
3608
|
-
default:
|
|
3609
|
-
return `${baseClasses} bg-gray-100 text-gray-600 dark:bg-gray-800 dark:text-gray-400`;
|
|
3610
|
-
}
|
|
3995
|
+
const VersionBadge = ({ version: version2 }) => {
|
|
3996
|
+
if (!version2) {
|
|
3997
|
+
return null;
|
|
3998
|
+
}
|
|
3999
|
+
return /* @__PURE__ */ jsx("span", { class: "bg-purple-100 text-purple-800 text-xs font-medium me-2 px-1.5 py-0.5 rounded dark:bg-purple-900 dark:text-purple-300", children: /* @__PURE__ */ jsx("span", { safe: true, children: version2 }) });
|
|
4000
|
+
};
|
|
4001
|
+
function getStatusClasses(status) {
|
|
4002
|
+
const baseClasses = "px-1.5 py-0.5 text-xs font-medium rounded";
|
|
4003
|
+
switch (status) {
|
|
4004
|
+
case VersionStatus.COMPLETED:
|
|
4005
|
+
return `${baseClasses} bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-300`;
|
|
4006
|
+
case VersionStatus.RUNNING:
|
|
4007
|
+
case VersionStatus.UPDATING:
|
|
4008
|
+
return `${baseClasses} bg-blue-100 text-blue-800 dark:bg-blue-900 dark:text-blue-300`;
|
|
4009
|
+
case VersionStatus.QUEUED:
|
|
4010
|
+
return `${baseClasses} bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-yellow-300`;
|
|
4011
|
+
case VersionStatus.FAILED:
|
|
4012
|
+
return `${baseClasses} bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-300`;
|
|
4013
|
+
case VersionStatus.CANCELLED:
|
|
4014
|
+
return `${baseClasses} bg-gray-100 text-gray-800 dark:bg-gray-900 dark:text-gray-300`;
|
|
4015
|
+
case VersionStatus.NOT_INDEXED:
|
|
4016
|
+
default:
|
|
4017
|
+
return `${baseClasses} bg-gray-100 text-gray-600 dark:bg-gray-800 dark:text-gray-400`;
|
|
4018
|
+
}
|
|
3611
4019
|
}
|
|
3612
4020
|
const StatusBadge = ({ status, showDescription = true }) => /* @__PURE__ */ jsx("span", { class: getStatusClasses(status), children: showDescription ? getStatusDescription(status) : status });
|
|
3613
4021
|
const ProgressBar = ({ progress, showText = true }) => {
|
|
@@ -4939,16 +5347,12 @@ async function registerWorkerService(pipeline) {
|
|
|
4939
5347
|
logger.warn(
|
|
4940
5348
|
`⚠️ Job ${job.id} error ${document ? `on document ${document.metadata.url}` : ""}: ${error.message}`
|
|
4941
5349
|
);
|
|
4942
|
-
|
|
4943
|
-
analytics.track(TelemetryEvent.ERROR_OCCURRED, {
|
|
5350
|
+
analytics.captureException(error, {
|
|
4944
5351
|
jobId: job.id,
|
|
4945
5352
|
// Job IDs are already anonymous
|
|
4946
5353
|
library: job.library,
|
|
4947
|
-
errorType: errorInfo.type,
|
|
4948
|
-
errorMessage: errorInfo.message,
|
|
4949
5354
|
hasDocument: !!document,
|
|
4950
5355
|
stage: document ? "document_processing" : "job_setup",
|
|
4951
|
-
hasStack: errorInfo.hasStack,
|
|
4952
5356
|
pages_processed_before_error: job.progressPages || 0
|
|
4953
5357
|
});
|
|
4954
5358
|
}
|
|
@@ -5027,13 +5431,13 @@ class AppServer {
|
|
|
5027
5431
|
try {
|
|
5028
5432
|
telemetryService.startSession({
|
|
5029
5433
|
sessionId: crypto.randomUUID(),
|
|
5030
|
-
|
|
5434
|
+
appInterface: "web",
|
|
5031
5435
|
startTime: /* @__PURE__ */ new Date(),
|
|
5032
|
-
|
|
5033
|
-
|
|
5034
|
-
|
|
5035
|
-
|
|
5036
|
-
|
|
5436
|
+
appVersion: process.env.npm_package_version || "unknown",
|
|
5437
|
+
appPlatform: process.platform,
|
|
5438
|
+
appServicesEnabled: this.getActiveServicesList(),
|
|
5439
|
+
appAuthEnabled: Boolean(this.config.auth),
|
|
5440
|
+
appReadOnly: Boolean(this.config.readOnly)
|
|
5037
5441
|
});
|
|
5038
5442
|
} catch (error) {
|
|
5039
5443
|
logger.debug(`Failed to initialize telemetry: ${error}`);
|
|
@@ -5110,11 +5514,10 @@ class AppServer {
|
|
|
5110
5514
|
process.on("unhandledRejection", (reason) => {
|
|
5111
5515
|
logger.error(`Unhandled Promise Rejection: ${reason}`);
|
|
5112
5516
|
if (analytics.isEnabled()) {
|
|
5113
|
-
|
|
5114
|
-
|
|
5517
|
+
const error = reason instanceof Error ? reason : new Error(String(reason));
|
|
5518
|
+
analytics.captureException(error, {
|
|
5115
5519
|
error_category: "system",
|
|
5116
|
-
component:
|
|
5117
|
-
severity: "critical",
|
|
5520
|
+
component: AppServer.constructor.name,
|
|
5118
5521
|
context: "process_unhandled_rejection"
|
|
5119
5522
|
});
|
|
5120
5523
|
}
|
|
@@ -5124,11 +5527,9 @@ class AppServer {
|
|
|
5124
5527
|
process.on("uncaughtException", (error) => {
|
|
5125
5528
|
logger.error(`Uncaught Exception: ${error.message}`);
|
|
5126
5529
|
if (analytics.isEnabled()) {
|
|
5127
|
-
analytics.
|
|
5128
|
-
error_type: error.constructor.name,
|
|
5530
|
+
analytics.captureException(error, {
|
|
5129
5531
|
error_category: "system",
|
|
5130
|
-
component:
|
|
5131
|
-
severity: "critical",
|
|
5532
|
+
component: AppServer.constructor.name,
|
|
5132
5533
|
context: "process_uncaught_exception"
|
|
5133
5534
|
});
|
|
5134
5535
|
}
|
|
@@ -5137,11 +5538,9 @@ class AppServer {
|
|
|
5137
5538
|
if (typeof this.server.setErrorHandler === "function") {
|
|
5138
5539
|
this.server.setErrorHandler(async (error, request, reply) => {
|
|
5139
5540
|
if (analytics.isEnabled()) {
|
|
5140
|
-
analytics.
|
|
5141
|
-
error_type: error.constructor.name,
|
|
5541
|
+
analytics.captureException(error, {
|
|
5142
5542
|
error_category: "http",
|
|
5143
5543
|
component: "FastifyServer",
|
|
5144
|
-
severity: "high",
|
|
5145
5544
|
status_code: error.statusCode || 500,
|
|
5146
5545
|
method: request.method,
|
|
5147
5546
|
route: request.routeOptions?.url || request.url,
|
|
@@ -5273,195 +5672,2564 @@ class AppServer {
|
|
|
5273
5672
|
logger.debug("OAuth2 proxy endpoints registered");
|
|
5274
5673
|
}
|
|
5275
5674
|
/**
|
|
5276
|
-
* Log startup information showing which services are enabled.
|
|
5675
|
+
* Log startup information showing which services are enabled.
|
|
5676
|
+
*/
|
|
5677
|
+
logStartupInfo(address) {
|
|
5678
|
+
logger.info(`🚀 AppServer available at ${address}`);
|
|
5679
|
+
const enabledServices = [];
|
|
5680
|
+
if (this.config.enableWebInterface) {
|
|
5681
|
+
enabledServices.push(`Web interface: ${address}`);
|
|
5682
|
+
}
|
|
5683
|
+
if (this.config.enableMcpServer) {
|
|
5684
|
+
enabledServices.push(`MCP endpoints: ${address}/mcp, ${address}/sse`);
|
|
5685
|
+
}
|
|
5686
|
+
if (this.config.enableApiServer) {
|
|
5687
|
+
enabledServices.push(`API: ${address}/api`);
|
|
5688
|
+
}
|
|
5689
|
+
if (this.config.enableWorker) {
|
|
5690
|
+
enabledServices.push("Embedded worker: enabled");
|
|
5691
|
+
} else if (this.config.externalWorkerUrl) {
|
|
5692
|
+
enabledServices.push(`External worker: ${this.config.externalWorkerUrl}`);
|
|
5693
|
+
}
|
|
5694
|
+
for (const service of enabledServices) {
|
|
5695
|
+
logger.info(` • ${service}`);
|
|
5696
|
+
}
|
|
5697
|
+
}
|
|
5698
|
+
}
|
|
5699
|
+
/**
 * Creates an `AppServer` from the given collaborators and starts it.
 *
 * @param docService - Document service handed to the `AppServer` constructor.
 * @param pipeline - Pipeline handed to the `AppServer` constructor.
 * @param config - Server configuration handed to the `AppServer` constructor.
 * @returns The `AppServer` instance, once `start()` has resolved.
 */
async function startAppServer(docService, pipeline, config) {
  const server = new AppServer(docService, pipeline, config);
  await server.start();
  return server;
}
|
|
5704
|
+
/**
 * Creates an MCP server instance and connects it to a stdio transport.
 *
 * @param tools - Tool set passed to `createMcpServerInstance`.
 * @param {boolean} [readOnly=false] - Passed through to `createMcpServerInstance`.
 * @returns The connected MCP server instance.
 */
async function startStdioServer(tools, readOnly = false) {
  // The log level is lowered to ERROR before anything else runs.
  // NOTE(review): presumably this keeps log output from interfering with the
  // stdio protocol stream; confirm against the logger implementation.
  setLogLevel(LogLevel.ERROR);

  const mcpServer = createMcpServerInstance(tools, readOnly);
  await mcpServer.connect(new StdioServerTransport());

  logger.info("🤖 MCP server listening on stdio");
  return mcpServer;
}
|
|
5712
|
+
/**
 * Remote document-management client backed by a tRPC proxy.
 *
 * Every method forwards its arguments to the procedure of the matching name on
 * the tRPC client (`query` for reads, `mutate` for writes).
 */
class DocumentManagementClient {
  baseUrl;
  client;

  /**
   * @param {string} serverUrl - URL of the tRPC endpoint; a single trailing
   *   slash is removed.
   */
  constructor(serverUrl) {
    const normalizedUrl = serverUrl.replace(/\/$/, "");
    this.baseUrl = normalizedUrl;
    const batchLink = httpBatchLink({ url: this.baseUrl });
    this.client = createTRPCProxyClient({ links: [batchLink] });
    logger.debug(`DocumentManagementClient (tRPC) created for: ${this.baseUrl}`);
  }

  /** Checks connectivity by calling the `ping` query; rejects if it fails. */
  async initialize() {
    await this.client.ping.query();
  }

  /** No-op: this client performs no work on shutdown. */
  async shutdown() {
  }

  /** @returns The result of the `listLibraries` query. */
  async listLibraries() {
    const { listLibraries } = this.client;
    return listLibraries.query();
  }

  /** Resolves when the `validateLibraryExists` procedure succeeds for `library`. */
  async validateLibraryExists(library) {
    const input = { library };
    await this.client.validateLibraryExists.mutate(input);
  }

  /** @returns The result of the `findBestVersion` query for the library and target version. */
  async findBestVersion(library, targetVersion) {
    const input = { library, targetVersion };
    return this.client.findBestVersion.query(input);
  }

  /**
   * Runs the `search` query.
   * A nullish `version2` is sent as `null`.
   */
  async searchStore(library, version2, query, limit) {
    const input = { library, version: version2 ?? null, query, limit };
    return this.client.search.query(input);
  }

  /** Calls `removeVersion`; `version2` is forwarded exactly as given. */
  async removeVersion(library, version2) {
    const input = { library, version: version2 };
    await this.client.removeVersion.mutate(input);
  }

  /** Calls `removeAllDocuments`; a nullish `version2` is sent as `null`. */
  async removeAllDocuments(library, version2) {
    const input = { library, version: version2 ?? null };
    await this.client.removeAllDocuments.mutate(input);
  }

  /** @returns The result of the `getVersionsByStatus` query for `statuses`. */
  async getVersionsByStatus(statuses) {
    const input = { statuses };
    return this.client.getVersionsByStatus.query(input);
  }

  /** @returns The result of the `findVersionsBySourceUrl` query for `url`. */
  async findVersionsBySourceUrl(url) {
    const input = { url };
    return this.client.findVersionsBySourceUrl.query(input);
  }

  /** @returns The result of the `getScraperOptions` query for `versionId`. */
  async getScraperOptions(versionId) {
    const input = { versionId };
    return this.client.getScraperOptions.query(input);
  }

  /** Calls `updateVersionStatus` with the status and optional error message. */
  async updateVersionStatus(versionId, status, errorMessage) {
    const input = { versionId, status, errorMessage };
    await this.client.updateVersionStatus.mutate(input);
  }

  /** Calls `updateVersionProgress` with the page counters. */
  async updateVersionProgress(versionId, pages, maxPages) {
    const input = { versionId, pages, maxPages };
    await this.client.updateVersionProgress.mutate(input);
  }

  /** Calls `storeScraperOptions` for the given version. */
  async storeScraperOptions(versionId, options) {
    const input = { versionId, options };
    await this.client.storeScraperOptions.mutate(input);
  }
}
|
|
5766
|
+
/**
 * Base class for all errors raised by the content splitters.
 *
 * Sets `name` to the concrete class name so that subclasses are identifiable
 * in logs and in `String(err)` instead of all reporting as a generic "Error".
 */
class SplitterError extends Error {
  /**
   * @param {string} [message] - Human-readable description.
   * @param {ErrorOptions} [options] - Standard `Error` options (e.g. `cause`).
   */
  constructor(message, options) {
    super(message, options);
    // new.target is the class actually being constructed, so subclasses
    // inherit a correct name without overriding the constructor.
    this.name = new.target.name;
  }
}
|
|
5768
|
+
/**
 * Raised when a piece of content cannot be reduced to the allowed chunk size.
 */
class MinimumChunkSizeError extends SplitterError {
  /**
   * @param {number} size - Smallest chunk size the content would need.
   * @param {number} maxSize - Maximum chunk size that is allowed.
   */
  constructor(size, maxSize) {
    const requirement = `Content requires minimum chunk size of ${size} bytes`;
    const limit = `but maximum allowed is ${maxSize} bytes.`;
    super(`Cannot split content any further. ${requirement}, ${limit}`);
  }
}
|
|
5775
|
+
/**
 * Splitter error that adds no behavior of its own to `SplitterError`.
 */
class ContentSplitterError extends SplitterError {}
|
|
5777
|
+
/**
 * Wraps a base splitter and greedily concatenates its chunks.
 *
 * The base splitter produces the initial chunks; this class merges neighbours
 * until a size limit or a major section boundary is reached.
 */
class GreedySplitter {
  baseSplitter;
  minChunkSize;
  preferredChunkSize;

  /**
   * @param baseSplitter - Object exposing `splitText(markdown)` that resolves
   *   to chunks shaped `{ types, content, section: { level, path } }`.
   * @param {number} minChunkSize - A chunk at least this long is closed when
   *   the next chunk starts a major section.
   * @param {number} preferredChunkSize - Upper bound for the length of a
   *   merged chunk (including the joining newline).
   */
  constructor(baseSplitter, minChunkSize, preferredChunkSize) {
    this.baseSplitter = baseSplitter;
    this.minChunkSize = minChunkSize;
    this.preferredChunkSize = preferredChunkSize;
  }

  /**
   * Splits `markdown` with the base splitter and merges adjacent chunks.
   *
   * A chunk is closed when adding the next one would exceed
   * `preferredChunkSize`, or when it already reached `minChunkSize` and the
   * next chunk starts an H1/H2 section. Otherwise the next chunk is appended
   * with a newline and the section/type metadata is merged.
   *
   * @param {string} markdown
   * @returns {Promise<Array>} The merged chunks (copies; inputs are not mutated).
   */
  async splitText(markdown) {
    const initialChunks = await this.baseSplitter.splitText(markdown);
    const concatenatedChunks = [];
    let currentChunk = null;

    for (const nextChunk of initialChunks) {
      if (!currentChunk) {
        currentChunk = this.cloneChunk(nextChunk);
        continue;
      }

      const tooLarge = this.wouldExceedMaxSize(currentChunk, nextChunk);
      const atSectionBreak =
        !tooLarge &&
        currentChunk.content.length >= this.minChunkSize &&
        this.startsNewMajorSection(nextChunk);

      if (tooLarge || atSectionBreak) {
        concatenatedChunks.push(currentChunk);
        currentChunk = this.cloneChunk(nextChunk);
        continue;
      }

      currentChunk.content += `\n${nextChunk.content}`;
      currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
      currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
    }

    if (currentChunk) {
      concatenatedChunks.push(currentChunk);
    }
    return concatenatedChunks;
  }

  /** Returns a copy of `chunk` whose `types` and `section.path` arrays are new. */
  cloneChunk(chunk) {
    return {
      types: [...chunk.types],
      content: chunk.content,
      section: {
        level: chunk.section.level,
        path: [...chunk.section.path]
      }
    };
  }

  /** True when the chunk belongs to a level-1 or level-2 section. */
  startsNewMajorSection(chunk) {
    return chunk.section.level === 1 || chunk.section.level === 2;
  }

  /**
   * True when merging `nextChunk` into `currentChunk` would produce content
   * longer than `preferredChunkSize`. The newline inserted between the two
   * chunks is counted.
   */
  wouldExceedMaxSize(currentChunk, nextChunk) {
    if (!currentChunk) {
      return false;
    }
    const separatorLength = 1; // the "\n" added by splitText when merging
    const mergedLength =
      currentChunk.content.length + separatorLength + nextChunk.content.length;
    return mergedLength > this.preferredChunkSize;
  }

  /** True when `parentPath` is a strict prefix of `childPath`. */
  isPathIncluded(parentPath, childPath) {
    if (parentPath.length >= childPath.length) return false;
    return parentPath.every((part, i) => part === childPath[i]);
  }

  /**
   * Computes the section metadata of a merged chunk.
   *
   * - Level: the smaller of the two levels.
   * - Path: unchanged when both sections are identical; the deeper path when
   *   one path is a prefix of the other; otherwise the longest common prefix
   *   (possibly empty).
   *
   * The returned path is never an array owned by `nextChunk`.
   */
  mergeSectionInfo(currentChunk, nextChunk) {
    const current = currentChunk.section;
    const next = nextChunk.section;
    const level = Math.min(current.level, next.level);

    const samePath =
      current.path.length === next.path.length &&
      current.path.every((part, i) => part === next.path[i]);
    if (current.level === next.level && samePath) {
      return current;
    }

    if (this.isPathIncluded(current.path, next.path)) {
      // Copy so later edits to the merged chunk cannot leak into the input.
      return { path: [...next.path], level };
    }
    if (this.isPathIncluded(next.path, current.path)) {
      return { path: current.path, level };
    }
    return { path: this.findCommonPrefix(current.path, next.path), level };
  }

  /** Union of both type lists, preserving first-seen order. */
  mergeTypes(currentTypes, nextTypes) {
    return [.../* @__PURE__ */ new Set([...currentTypes, ...nextTypes])];
  }

  /** Returns the longest common prefix of two paths as a new array. */
  findCommonPrefix(path1, path2) {
    const common = [];
    const limit = Math.min(path1.length, path2.length);
    for (let i = 0; i < limit && path1[i] === path2[i]; i++) {
      common.push(path1[i]);
    }
    return common;
  }
}
|
|
5912
|
+
/**
 * Removes leading and trailing whitespace (including line terminators).
 *
 * Delegates to `String.prototype.trim()`, which strips the same character set
 * as the regex class `\s` and runs in linear time.
 *
 * @param {string} str
 * @returns {string}
 */
const fullTrim = (str) => {
  return str.trim();
};
|
|
5915
|
+
/**
 * Splits a fenced Markdown code block into several fenced blocks that each fit
 * within `options.chunkSize`, breaking only at line boundaries.
 */
class CodeContentSplitter {
  /**
   * @param {{ chunkSize: number }} options - Maximum length of one fenced chunk.
   */
  constructor(options) {
    this.options = options;
  }

  /**
   * @param {string} content - Code, optionally wrapped in a ``` fence with a
   *   language tag. Any tag without a backtick is accepted (`js`, `c++`, `c#`,
   *   `objective-c`, ...), not only word characters.
   * @returns {Promise<string[]>} Fenced chunks, each re-wrapped with the
   *   original language tag.
   * @throws {MinimumChunkSizeError} When a single line, once fenced, is longer
   *   than `chunkSize`.
   */
  async split(content) {
    const openingFence = /^```([^\n`]*)\n/;
    // An empty tag is normalised to undefined so wrap() emits a bare fence.
    const language = content.match(openingFence)?.[1].trim() || undefined;
    const strippedContent = content.replace(openingFence, "").replace(/```\s*$/, "");

    const lines = strippedContent.split("\n");
    const chunks = [];
    let currentChunkLines = [];

    for (const line of lines) {
      const singleLineSize = this.wrap(line, language).length;
      if (singleLineSize > this.options.chunkSize) {
        throw new MinimumChunkSizeError(singleLineSize, this.options.chunkSize);
      }

      currentChunkLines.push(line);
      const candidateSize = this.wrap(currentChunkLines.join("\n"), language).length;
      if (candidateSize > this.options.chunkSize && currentChunkLines.length > 1) {
        // The line just added overflowed: emit everything before it and
        // start the next chunk with that line.
        const overflowLine = currentChunkLines.pop();
        chunks.push(this.wrap(currentChunkLines.join("\n"), language));
        currentChunkLines = [overflowLine];
      }
    }

    if (currentChunkLines.length > 0) {
      chunks.push(this.wrap(currentChunkLines.join("\n"), language));
    }
    return chunks;
  }

  /**
   * Wraps `content` in a ``` fence, dropping trailing newlines from the content.
   *
   * @param {string} content
   * @param {string} [language] - Tag placed after the opening fence.
   * @returns {string}
   */
  wrap(content, language) {
    return `\`\`\`${language || ""}\n${content.replace(/\n+$/, "")}\n\`\`\``;
  }
}
|
|
5950
|
+
/**
 * Splits a Markdown table into several tables that each fit within
 * `options.chunkSize`, repeating the header in every chunk.
 */
class TableContentSplitter {
  /**
   * @param {{ chunkSize: number }} options - Maximum length of one table chunk.
   */
  constructor(options) {
    this.options = options;
  }

  /**
   * Splits table content into chunks while preserving table structure.
   *
   * @param {string} content - Markdown table (header, separator, data rows).
   * @returns {Promise<string[]>} Table chunks; `[content]` unchanged when the
   *   input cannot be parsed as a table.
   * @throws {MinimumChunkSizeError} When the header plus a single row is
   *   longer than `chunkSize`.
   */
  async split(content) {
    const parsedTable = this.parseTable(content);
    if (!parsedTable) {
      return [content];
    }

    const { headers, rows } = parsedTable;
    const chunks = [];
    let currentRows = [];

    for (const row of rows) {
      const singleRowSize = this.wrap(row, headers).length;
      if (singleRowSize > this.options.chunkSize) {
        throw new MinimumChunkSizeError(singleRowSize, this.options.chunkSize);
      }

      const candidateSize = this.wrap([...currentRows, row].join("\n"), headers).length;
      if (candidateSize > this.options.chunkSize && currentRows.length > 0) {
        chunks.push(this.wrap(currentRows.join("\n"), headers));
        currentRows = [row];
      } else {
        currentRows.push(row);
      }
    }

    if (currentRows.length > 0) {
      chunks.push(this.wrap(currentRows.join("\n"), headers));
    }
    return chunks;
  }

  /**
   * Prepends a header row and a `---` separator row to `content`.
   * Column alignment markers from the original separator are not reproduced.
   */
  wrap(content, headers) {
    const headerRow = `| ${headers.join(" | ")} |`;
    const separatorRow = `|${headers.map(() => "---").join("|")}|`;
    return [headerRow, separatorRow, content].join("\n");
  }

  /**
   * Parses a Markdown table.
   *
   * @returns {{ headers: string[], separator: string, rows: string[] } | null}
   *   `null` when there are fewer than three lines, the first line has no
   *   pipe, or the second line is not a separator row. Blank data rows are
   *   dropped; the remaining rows are kept as raw strings.
   */
  parseTable(content) {
    const lines = content.trim().split("\n");
    if (lines.length < 3) return null;

    const headers = this.parseRow(lines[0]);
    if (!headers) return null;

    const separator = lines[1];
    if (!this.isValidSeparator(separator)) return null;

    const rows = lines.slice(2).filter((row) => row.trim() !== "");
    return { headers, separator, rows };
  }

  /**
   * Parses a table row into trimmed cells.
   *
   * Escaped pipes (`\|`) stay inside their cell, and empty cells are kept so
   * the column count matches the data rows. Only the empty strings produced by
   * the optional leading/trailing border pipe are removed.
   *
   * @param {string} row
   * @returns {string[] | null} `null` when the row contains no pipe at all.
   */
  parseRow(row) {
    if (!row.includes("|")) return null;

    const cells = row
      .trim()
      .split(/(?<!\\)\|/)
      .map((cell) => cell.trim());

    if (cells.length > 1 && cells[0] === "") cells.shift();
    if (cells.length > 1 && cells[cells.length - 1] === "") cells.pop();
    return cells;
  }

  /**
   * Validates the separator row of the table. Accepts whitespace, dashes,
   * pipes and the `:` alignment markers allowed by GitHub Flavored Markdown.
   */
  isValidSeparator(separator) {
    return separator.includes("|") && /^\|?[\s\-:|]+\|?$/.test(separator);
  }
}
|
|
6013
|
+
/**
 * Splits plain text into pieces no longer than `options.chunkSize`.
 */
class TextContentSplitter {
  /**
   * @param {{ chunkSize: number }} options - Maximum length of one chunk.
   */
  constructor(options) {
    this.options = options;
  }

  /**
   * Splits trimmed text, trying progressively finer boundaries:
   * paragraphs first, then lines (re-merged with "\n"), and finally the
   * word-level splitter (re-merged with " ").
   *
   * @param {string} content
   * @returns {Promise<string[]>}
   * @throws {MinimumChunkSizeError} When a single whitespace-delimited word is
   *   longer than `chunkSize`.
   */
  async split(content) {
    const { chunkSize } = this.options;
    const text = fullTrim(content);
    if (text.length <= chunkSize) {
      return [text];
    }

    // A word longer than the limit can never be placed in a valid chunk.
    let longestWordLength = 0;
    for (const word of text.split(/\s+/)) {
      if (word.length > longestWordLength) {
        longestWordLength = word.length;
      }
    }
    if (longestWordLength > chunkSize) {
      throw new MinimumChunkSizeError(longestWordLength, chunkSize);
    }

    const byParagraph = this.splitByParagraphs(text);
    if (this.areChunksValid(byParagraph)) {
      return byParagraph;
    }

    const byLine = this.splitByLines(text);
    if (this.areChunksValid(byLine)) {
      return this.mergeChunks(byLine, "\n");
    }

    const byWord = await this.splitByWords(text);
    return this.mergeChunks(byWord, " ");
  }

  /** True when no chunk is longer than `chunkSize`. */
  areChunksValid(chunks) {
    const limit = this.options.chunkSize;
    return !chunks.some((chunk) => chunk.length > limit);
  }

  /**
   * Splits on blank lines, trims each paragraph, and keeps only paragraphs
   * longer than two characters.
   */
  splitByParagraphs(text) {
    const kept = [];
    for (const rawParagraph of text.split(/\n\s*\n/)) {
      const paragraph = fullTrim(rawParagraph);
      if (paragraph.length > 2) {
        kept.push(paragraph);
      }
    }
    return kept;
  }

  /**
   * Splits on newlines, trims each line, and keeps only lines longer than one
   * character.
   */
  splitByLines(text) {
    const kept = [];
    for (const rawLine of text.split(/\n/)) {
      const line = fullTrim(rawLine);
      if (line.length > 1) {
        kept.push(line);
      }
    }
    return kept;
  }

  /**
   * Last resort: delegates to `RecursiveCharacterTextSplitter` with no overlap.
   */
  async splitByWords(text) {
    const { chunkSize } = this.options;
    const recursiveSplitter = new RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap: 0 });
    return recursiveSplitter.splitText(text);
  }

  /**
   * Greedily joins consecutive chunks with `separator` for as long as the
   * joined result stays within `chunkSize`.
   *
   * @param {string[]} chunks
   * @param {string} separator
   * @returns {string[]}
   */
  mergeChunks(chunks, separator) {
    const merged = [];
    let pending = null;

    for (const piece of chunks) {
      if (pending === null) {
        pending = piece;
        continue;
      }
      const joinedSize = this.getChunkSize(pending) + this.getChunkSize(piece) + separator.length;
      if (joinedSize > this.options.chunkSize) {
        merged.push(pending);
        pending = piece;
      } else {
        pending = pending + separator + piece;
      }
    }

    // A trailing empty string is intentionally not emitted.
    if (pending) {
      merged.push(pending);
    }
    return merged;
  }

  /** Size of a chunk, measured as its string length. */
  getChunkSize(chunk) {
    return chunk.length;
  }

  /** Text needs no wrapper; returns `content` unchanged. */
  wrap(content) {
    return content;
  }
}
|
|
6108
|
+
/**
 * Splits Markdown into chunks that follow the document structure.
 *
 * The Markdown is rendered to HTML, the top-level elements of `<body>` are
 * grouped into sections (headings, code blocks, tables, other text), and each
 * section's content is split by a type-specific splitter.
 */
class SemanticMarkdownSplitter {
  /**
   * @param {number} preferredChunkSize - Chunk size for headings and text.
   * @param {number} maxChunkSize - Chunk size for code and tables, and for the
   *   character-level fallback.
   */
  constructor(preferredChunkSize, maxChunkSize) {
    this.preferredChunkSize = preferredChunkSize;
    this.maxChunkSize = maxChunkSize;
    this.turndownService = new TurndownService({
      headingStyle: "atx",
      hr: "---",
      bulletListMarker: "-",
      codeBlockStyle: "fenced",
      emDelimiter: "_",
      strongDelimiter: "**",
      linkStyle: "inlined"
    });
    this.turndownService.addRule("table", {
      filter: ["table"],
      replacement: (_content, node) => this.tableToMarkdown(node)
    });
    this.textSplitter = new TextContentSplitter({
      chunkSize: this.preferredChunkSize
    });
    this.codeSplitter = new CodeContentSplitter({
      chunkSize: this.maxChunkSize
    });
    this.tableSplitter = new TableContentSplitter({
      chunkSize: this.maxChunkSize
    });
  }
  turndownService;
  textSplitter;
  codeSplitter;
  tableSplitter;

  /**
   * Converts a `<table>` element to a Markdown table.
   *
   * Header cells come from every `<th>`; data rows are the `<tr>` elements
   * that contain no `<th>`. Cell text is collapsed to a single line and any
   * `|` is escaped so a cell can never break the row structure.
   *
   * @param table - Element exposing `querySelectorAll`.
   * @returns {string} Markdown starting with a newline and ending each row
   *   with a newline, or "" when the table has neither headers nor rows.
   */
  tableToMarkdown(table) {
    const cellText = (cell) =>
      (cell.textContent ?? "").replace(/\s+/g, " ").trim().replace(/\|/g, "\\|");

    const headers = Array.from(table.querySelectorAll("th")).map(cellText);
    const rows = Array.from(table.querySelectorAll("tr")).filter(
      (tr) => !tr.querySelector("th")
    );
    if (headers.length === 0 && rows.length === 0) return "";

    const lines = [];
    if (headers.length > 0) {
      lines.push(`| ${headers.join(" | ")} |`);
      lines.push(`|${headers.map(() => "---").join("|")}|`);
    }
    for (const row of rows) {
      const cells = Array.from(row.querySelectorAll("td")).map(cellText);
      lines.push(`| ${cells.join(" | ")} |`);
    }
    return `\n${lines.map((line) => `${line}\n`).join("")}`;
  }

  /**
   * Main entry point for splitting markdown content.
   *
   * @param {string} markdown
   * @returns {Promise<Array>} Chunks shaped
   *   `{ types: [type], content, section: { level, path } }`.
   */
  async splitText(markdown) {
    const html = await this.markdownToHtml(markdown);
    const dom = await this.parseHtml(html);
    const sections = await this.splitIntoSections(dom);
    return this.splitSectionContent(sections);
  }

  /**
   * Step 1: Split document into sections based on H1-H6 headings,
   * as well as code blocks and tables.
   *
   * Every top-level element of `<body>` yields at most one section. A heading
   * opens a new section whose path is the titles of its ancestor headings plus
   * its own title; every other element inherits level and path from the most
   * recent section.
   *
   * @throws {Error} When the document has no `<body>`.
   */
  async splitIntoSections(dom) {
    const body = dom.querySelector("body");
    if (!body) {
      throw new Error("Invalid HTML structure: no body element found");
    }

    let currentSection = this.createRootSection();
    const sections = [];
    // Open heading sections, outermost first; index 0 is the root.
    const stack = [currentSection];

    // Adds a non-heading section that inherits the current level and path.
    const pushLeaf = (type, text) => {
      currentSection = {
        level: currentSection.level,
        path: currentSection.path,
        content: [{ type, text }]
      };
      sections.push(currentSection);
    };

    for (const element of Array.from(body.children)) {
      // Anchored so only real heading tags (H1..H6) match.
      const headingMatch = element.tagName.match(/^H([1-6])$/);

      if (headingMatch) {
        const level = Number.parseInt(headingMatch[1], 10);
        const title = fullTrim(element.textContent || "");

        // Close every open section at the same or a deeper level.
        while (stack.length > 1 && stack[stack.length - 1].level >= level) {
          stack.pop();
        }

        const ancestorTitles = stack
          .slice(1)
          .map((s) => s.path[s.path.length - 1])
          .filter(Boolean);

        currentSection = {
          level,
          path: [...ancestorTitles, title],
          content: [{ type: "heading", text: `${"#".repeat(level)} ${title}` }]
        };
        sections.push(currentSection);
        stack.push(currentSection);
      } else if (element.tagName === "PRE") {
        const code = element.querySelector("code");
        const language = code?.className.replace("language-", "") || "";
        const content = code?.textContent || element.textContent || "";
        pushLeaf("code", `\`\`\`${language}\n${content}\n\`\`\``);
      } else if (element.tagName === "TABLE") {
        pushLeaf("table", fullTrim(this.turndownService.turndown(element.outerHTML)));
      } else {
        const markdown = fullTrim(this.turndownService.turndown(element.innerHTML));
        if (markdown) {
          pushLeaf("text", markdown);
        }
      }
    }
    return sections;
  }

  /**
   * Step 2: Split section content into smaller chunks.
   *
   * Headings and text use the text splitter, code the code splitter, tables
   * the table splitter. When a splitter reports `MinimumChunkSizeError`, the
   * content is split by `forceSplit` instead.
   *
   * @throws {ContentSplitterError} For any other splitter failure.
   */
  async splitSectionContent(sections) {
    const chunks = [];
    for (const section of sections) {
      for (const content of section.content) {
        let splitContent = [];
        try {
          switch (content.type) {
            case "heading":
            case "text": {
              splitContent = await this.textSplitter.split(content.text);
              break;
            }
            case "code": {
              splitContent = await this.codeSplitter.split(content.text);
              break;
            }
            case "table": {
              splitContent = await this.tableSplitter.split(content.text);
              break;
            }
          }
        } catch (err) {
          if (err instanceof MinimumChunkSizeError) {
            logger.warn(
              `⚠ Cannot split ${content.type} chunk normally, using RecursiveCharacterTextSplitter: ${err.message}`
            );
            splitContent = await this.forceSplit(content.text);
          } else {
            const errMessage = err instanceof Error ? err.message : String(err);
            throw new ContentSplitterError(
              `Failed to split ${content.type} content: ${errMessage}`
            );
          }
        }

        for (const text of splitContent) {
          chunks.push({
            types: [content.type],
            content: text,
            section: {
              level: section.level,
              path: section.path
            }
          });
        }
      }
    }
    return chunks;
  }

  /**
   * Fallback for content the structured splitters cannot fit: splits at
   * `maxChunkSize` with a small overlap, using progressively finer separators
   * down to individual characters. If the splitter returns nothing, the text
   * is truncated to `maxChunkSize`.
   *
   * @param {string} text
   * @returns {Promise<string[]>}
   */
  async forceSplit(text) {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: this.maxChunkSize,
      chunkOverlap: Math.min(20, Math.floor(this.maxChunkSize * 0.1)),
      // Use more aggressive separators including empty string as last resort
      separators: [
        "\n\n",
        "\n",
        " ",
        "\t",
        ".",
        ",",
        ";",
        ":",
        "-",
        "(",
        ")",
        "[",
        "]",
        "{",
        "}",
        ""
      ]
    });
    const pieces = await splitter.splitText(text);
    return pieces.length === 0 ? [text.substring(0, this.maxChunkSize)] : pieces;
  }

  /**
   * Helper to create the root section (level 0, empty path, no content).
   */
  createRootSection() {
    return {
      level: 0,
      path: [],
      content: []
    };
  }

  /**
   * Convert markdown to HTML using remark (with GFM) and wrap the result in a
   * minimal HTML document.
   */
  async markdownToHtml(markdown) {
    const html = await unified().use(remarkParse).use(remarkGfm).use(remarkHtml).process(markdown);
    return [
      "<!DOCTYPE html>",
      "<html>",
      "<body>",
      String(html),
      "</body>",
      "</html>"
    ].join("\n");
  }

  /**
   * Parse HTML and return the resulting document.
   */
  async parseHtml(html) {
    const { window } = createJSDOM(html);
    return window.document;
  }
}
|
|
6372
|
+
// Default number of child chunks requested per search hit.
const CHILD_LIMIT = 5;
// Default number of sibling chunks requested on each side of a search hit.
const SIBLING_LIMIT = 2;
|
|
6374
|
+
/**
 * Turns raw search hits into context-rich results.
 *
 * For every hit returned by the document store the service collects the
 * neighbouring chunks (parent, preceding siblings, children, subsequent
 * siblings), merges everything that belongs to the same URL and produces one
 * combined result per URL.
 */
class DocumentRetrieverService {
  documentStore;

  /**
   * @param documentStore Store used for the content search and all chunk lookups.
   */
  constructor(documentStore) {
    this.documentStore = documentStore;
  }

  /**
   * Collects the IDs of all chunks related to an initial hit.
   *
   * @param library The library name.
   * @param version2 The (already normalized) library version.
   * @param doc The hit; `doc.id`, `doc.metadata.url` and `doc.metadata.score` are read.
   * @param siblingLimit Maximum number of siblings requested in each direction.
   * @param childLimit Maximum number of child chunks requested.
   * @returns `{ url, hitId, relatedIds, score }` where `relatedIds` is a Set that
   *          contains the hit itself plus every related chunk ID.
   */
  async getRelatedChunkIds(library, version2, doc, siblingLimit = SIBLING_LIMIT, childLimit = CHILD_LIMIT) {
    const id = doc.id;
    const url = doc.metadata.url;
    const score = doc.metadata.score;

    // The four lookups do not depend on each other, so they are issued together
    // instead of being awaited one after another.
    const [parent, precedingSiblings, childChunks, subsequentSiblings] = await Promise.all([
      this.documentStore.findParentChunk(library, version2, id),
      this.documentStore.findPrecedingSiblingChunks(library, version2, id, siblingLimit),
      this.documentStore.findChildChunks(library, version2, id, childLimit),
      this.documentStore.findSubsequentSiblingChunks(library, version2, id, siblingLimit),
    ]);

    // Insertion order is kept stable: hit, parent, preceding siblings, children,
    // subsequent siblings. The Set later becomes the ID list handed to the store.
    const relatedIds = /* @__PURE__ */ new Set([id]);
    if (parent) {
      relatedIds.add(parent.id);
    }
    for (const chunk of [...precedingSiblings, ...childChunks, ...subsequentSiblings]) {
      relatedIds.add(chunk.id);
    }
    return { url, hitId: id, relatedIds, score };
  }

  /**
   * Groups related-chunk infos by URL, de-duplicates the chunk IDs and tracks
   * the highest score seen for each URL.
   *
   * @param relatedInfos Objects as returned by `getRelatedChunkIds`.
   * @returns Map from URL to `{ uniqueChunkIds: Set, maxScore }`, in first-seen URL order.
   */
  groupAndPrepareFetch(relatedInfos) {
    const urlMap = /* @__PURE__ */ new Map();
    for (const info of relatedInfos) {
      let entry = urlMap.get(info.url);
      if (!entry) {
        entry = { uniqueChunkIds: /* @__PURE__ */ new Set(), maxScore: info.score };
        urlMap.set(info.url, entry);
      }
      for (const id of info.relatedIds) {
        entry.uniqueChunkIds.add(id);
      }
      if (info.score > entry.maxScore) {
        entry.maxScore = info.score;
      }
    }
    return urlMap;
  }

  /**
   * Builds the merged result for one URL group: fetches the chunks by ID and
   * joins their `pageContent` with a blank line in between.
   *
   * No sorting happens here; chunks are joined in the order returned by
   * `documentStore.findChunksByIds`.
   *
   * @returns `{ url, content, score }` with `score` set to `maxScore`.
   */
  async finalizeResult(library, version2, url, uniqueChunkIds, maxScore) {
    const ids = Array.from(uniqueChunkIds);
    const docs = await this.documentStore.findChunksByIds(library, version2, ids);
    const content = docs.map((d) => d.pageContent).join("\n\n");
    return {
      url,
      content,
      score: maxScore
    };
  }

  /**
   * Searches for documents and expands the context around the matches.
   *
   * @param library The library name.
   * @param version2 The library version; `null`/`undefined` is treated as the
   *                 empty string, and the value is lower-cased before use.
   * @param query The search query.
   * @param limit Optional limit for the initial search results (defaults to 10).
   * @returns Array of `{ url, content, score }` objects, one per distinct URL, in
   *          the order the URLs first appear in the initial search results.
   */
  async search(library, version2, query, limit) {
    const normalizedVersion = (version2 ?? "").toLowerCase();
    const initialResults = await this.documentStore.findByContent(
      library,
      normalizedVersion,
      query,
      limit ?? 10
    );
    const relatedInfos = await Promise.all(
      initialResults.map(
        (doc) => this.getRelatedChunkIds(library, normalizedVersion, doc)
      )
    );
    const urlMap = this.groupAndPrepareFetch(relatedInfos);
    // Each URL group is independent; Promise.all keeps the Map's iteration order.
    return Promise.all(
      Array.from(
        urlMap,
        ([url, { uniqueChunkIds, maxScore }]) => this.finalizeResult(library, normalizedVersion, url, uniqueChunkIds, maxScore)
      )
    );
  }
}
|
|
6493
|
+
// Directory that holds the migration files: "db/migrations" under whatever getProjectRoot() returns.
const MIGRATIONS_DIR = path.join(getProjectRoot(), "db", "migrations");
|
|
6494
|
+
// Name of the bookkeeping table in which applied migration IDs are recorded.
const MIGRATIONS_TABLE = "_schema_migrations";
|
|
6495
|
+
/**
 * Creates the migration bookkeeping table if it is not there yet.
 *
 * The table has an `id` text primary key and an `applied_at` timestamp that
 * defaults to the current time.
 *
 * @param db Database handle exposing `exec(sql)`.
 */
function ensureMigrationsTable(db) {
  const createTableSql = `
    CREATE TABLE IF NOT EXISTS ${MIGRATIONS_TABLE} (
      id TEXT PRIMARY KEY,
      applied_at DATETIME DEFAULT CURRENT_TIMESTAMP
    );
  `;
  db.exec(createTableSql);
}
|
|
6503
|
+
/**
 * Reads the IDs of all migrations recorded in the bookkeeping table.
 *
 * @param db Database handle exposing `prepare(sql)`; the prepared statement must provide `all()`.
 * @returns {Set} Set containing the `id` of every recorded row.
 */
function getAppliedMigrations(db) {
  const appliedIds = new Set();
  for (const record of db.prepare(`SELECT id FROM ${MIGRATIONS_TABLE}`).all()) {
    appliedIds.add(record.id);
  }
  return appliedIds;
}
|
|
6508
|
+
/**
 * Applies every pending SQL migration from MIGRATIONS_DIR and then switches the
 * connection to its regular runtime configuration.
 *
 * Steps:
 * 1. Relax durability-related pragmas to speed up the migration (best effort).
 * 2. Inside a single IMMEDIATE transaction: make sure the bookkeeping table
 *    exists, determine which `.sql` files are not recorded in it yet, execute
 *    them in sorted filename order and record each one.
 * 3. Retry the whole transaction on SQLITE_BUSY, up to MIGRATION_MAX_RETRIES
 *    times, waiting MIGRATION_RETRY_DELAY_MS between attempts.
 * 4. Run VACUUM when at least one migration was applied (best effort).
 * 5. Apply the production pragmas (best effort).
 *
 * @param db Database connection exposing `pragma`, `exec`, `prepare` and `transaction`.
 * @returns {Promise<void>}
 * @throws {StoreError} When the migrations directory is missing, a migration
 *         fails, or the transaction cannot be completed (including SQLITE_BUSY
 *         after the retries are exhausted).
 */
async function applyMigrations(db) {
  try {
    // MEMORY rather than OFF: with the rollback journal disabled, SQLite's
    // ROLLBACK is undefined, yet the transaction below relies on being rolled
    // back when a migration throws. An in-memory journal still avoids journal
    // disk I/O while keeping rollback functional.
    db.pragma("journal_mode = MEMORY");
    db.pragma("synchronous = OFF");
    db.pragma("mmap_size = 268435456");
    db.pragma("cache_size = -64000");
    db.pragma("temp_store = MEMORY");
    logger.debug("Applied performance optimizations for migration");
  } catch (_error) {
    // Deliberately best effort: the migration still works without these settings.
    logger.warn("⚠️ Could not apply all performance optimizations for migration");
  }

  const overallTransaction = db.transaction(() => {
    logger.debug("Checking database migrations...");
    ensureMigrationsTable(db);
    const appliedMigrations = getAppliedMigrations(db);
    if (!fs.existsSync(MIGRATIONS_DIR)) {
      throw new StoreError("Migrations directory not found");
    }
    // Filenames double as migration IDs; sorting defines the execution order.
    const migrationFiles = fs.readdirSync(MIGRATIONS_DIR).filter((file) => file.endsWith(".sql")).sort();
    const pendingMigrations = migrationFiles.filter(
      (filename) => !appliedMigrations.has(filename)
    );
    if (pendingMigrations.length > 0) {
      logger.info(`🔄 Applying ${pendingMigrations.length} database migration(s)...`);
    }
    let appliedCount = 0;
    for (const filename of pendingMigrations) {
      logger.debug(`Applying migration: ${filename}`);
      const filePath = path.join(MIGRATIONS_DIR, filename);
      const sql = fs.readFileSync(filePath, "utf8");
      try {
        db.exec(sql);
        const insertStmt = db.prepare(`INSERT INTO ${MIGRATIONS_TABLE} (id) VALUES (?)`);
        insertStmt.run(filename);
        logger.debug(`Applied migration: ${filename}`);
        appliedCount++;
      } catch (error) {
        logger.error(`❌ Failed to apply migration: ${filename} - ${error}`);
        // Throwing aborts the surrounding transaction.
        throw new StoreError(`Migration failed: ${filename}`, error);
      }
    }
    if (appliedCount > 0) {
      logger.info(`✅ Successfully applied ${appliedCount} migration(s)`);
    } else {
      logger.debug("Database schema is up to date");
    }
    return appliedCount;
  });

  let retries = 0;
  let appliedMigrationsCount = 0;
  while (true) {
    try {
      appliedMigrationsCount = overallTransaction.immediate();
      logger.debug("Database migrations completed successfully");
      if (appliedMigrationsCount > 0) {
        try {
          logger.debug(
            `Running VACUUM after applying ${appliedMigrationsCount} migration(s)...`
          );
          db.exec("VACUUM");
          logger.debug("Database vacuum completed successfully");
        } catch (error) {
          // A failed VACUUM does not invalidate the applied migrations.
          logger.warn(`⚠️ Could not vacuum database after migrations: ${error}`);
        }
      } else {
        logger.debug("Skipping VACUUM - no migrations were applied");
      }
      break;
    } catch (error) {
      if (error?.code === "SQLITE_BUSY" && retries < MIGRATION_MAX_RETRIES) {
        retries++;
        logger.warn(
          `⚠️ Migrations busy (SQLITE_BUSY), retrying attempt ${retries}/${MIGRATION_MAX_RETRIES} in ${MIGRATION_RETRY_DELAY_MS}ms...`
        );
        await new Promise((resolve) => setTimeout(resolve, MIGRATION_RETRY_DELAY_MS));
      } else {
        if (error?.code === "SQLITE_BUSY") {
          logger.error(
            `❌ Migrations still busy after ${MIGRATION_MAX_RETRIES} retries. Giving up: ${error}`
          );
        }
        // StoreErrors raised inside the transaction are passed through unchanged.
        if (error instanceof StoreError) {
          throw error;
        }
        throw new StoreError("Failed during migration process", error);
      }
    }
  }

  try {
    db.pragma("journal_mode = WAL");
    db.pragma("wal_autocheckpoint = 1000");
    db.pragma("busy_timeout = 30000");
    db.pragma("foreign_keys = ON");
    db.pragma("synchronous = NORMAL");
    logger.debug(
      "Applied production database configuration (WAL mode, autocheckpoint, foreign keys, busy timeout)"
    );
  } catch (_error) {
    logger.warn("⚠️ Could not apply all production database settings");
  }
}
|
|
6609
|
+
/**
 * Parses embedding model specifications and keeps a table of known embedding
 * dimensions. A shared instance is available through `getInstance()`; the
 * static helpers at the bottom delegate to it.
 */
class EmbeddingConfig {
  static instance = null;

  /**
   * Returns the shared EmbeddingConfig, creating it on first use.
   */
  static getInstance() {
    if (EmbeddingConfig.instance !== null) {
      return EmbeddingConfig.instance;
    }
    const created = new EmbeddingConfig();
    EmbeddingConfig.instance = created;
    return created;
  }

  /**
   * Drops the shared instance so that the next `getInstance()` builds a fresh
   * one (useful for testing).
   */
  static resetInstance() {
    EmbeddingConfig.instance = null;
  }

  /**
   * Known dimensions for common embedding models, keyed by model name.
   * Intended to avoid expensive API calls for dimension detection in telemetry.
   *
   * Note: The "openai" provider also supports OpenAI-compatible APIs like:
   * - Ollama (local models)
   * - LMStudio (local models)
   * - Any service implementing OpenAI's embedding API
   */
  knownModelDimensions = {
    // OpenAI models (also works with Ollama, LMStudio, and other OpenAI-compatible APIs)
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
    "text-embedding-ada-002": 1536,
    // Google Vertex AI models
    "text-embedding-004": 768,
    "textembedding-gecko@003": 768,
    "textembedding-gecko@002": 768,
    "textembedding-gecko@001": 768,
    // Google Gemini models (with MRL support)
    "text-embedding-preview-0409": 768,
    "embedding-001": 768,
    // AWS Bedrock models
    // Amazon Titan models
    "amazon.titan-embed-text-v1": 1536,
    "amazon.titan-embed-text-v2:0": 1024,
    "amazon.titan-embed-image-v1": 1024, // Image embedding model
    // Cohere models
    "cohere.embed-english-v3": 1024,
    "cohere.embed-multilingual-v3": 1024,
    // SageMaker models (hosted on AWS SageMaker)
    "intfloat/multilingual-e5-large": 1024,
    // Additional AWS models that might be supported
    // Note: Some of these might be placeholders - verify dimensions before use
    // "amazon.nova-embed-multilingual-v1:0": 4096, // Commented out as noted in source
    // MTEB Leaderboard models (source: https://huggingface.co/spaces/mteb/leaderboard)
    // Top performing models from Massive Text Embedding Benchmark
    "sentence-transformers/all-MiniLM-L6-v2": 384,
    "gemini-embedding-001": 3072,
    "Qwen/Qwen3-Embedding-8B": 4096,
    "Qwen/Qwen3-Embedding-4B": 2560,
    "Qwen/Qwen3-Embedding-0.6B": 1024,
    "Linq-AI-Research/Linq-Embed-Mistral": 4096,
    "Alibaba-NLP/gte-Qwen2-7B-instruct": 3584,
    "intfloat/multilingual-e5-large-instruct": 1024,
    "Salesforce/SFR-Embedding-Mistral": 4096,
    "text-multilingual-embedding-002": 768,
    "GritLM/GritLM-7B": 4096,
    "GritLM/GritLM-8x7B": 4096,
    "intfloat/e5-mistral-7b-instruct": 4096,
    "Cohere/Cohere-embed-multilingual-v3.0": 1024,
    "Alibaba-NLP/gte-Qwen2-1.5B-instruct": 8960,
    "Lajavaness/bilingual-embedding-large": 1024,
    "Salesforce/SFR-Embedding-2_R": 4096,
    "NovaSearch/stella_en_1.5B_v5": 8960,
    "NovaSearch/jasper_en_vision_language_v1": 8960,
    "nvidia/NV-Embed-v2": 4096,
    "OrdalieTech/Solon-embeddings-large-0.1": 1024,
    "BAAI/bge-m3": 1024,
    "HIT-TMG/KaLM-embedding-multilingual-mini-v1": 896,
    "jinaai/jina-embeddings-v3": 1024,
    "Alibaba-NLP/gte-multilingual-base": 768,
    "Lajavaness/bilingual-embedding-base": 768,
    "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1": 896,
    "nvidia/NV-Embed-v1": 4096,
    "Cohere/Cohere-embed-multilingual-light-v3.0": 384,
    "manu/bge-m3-custom-fr": 1024,
    "Lajavaness/bilingual-embedding-small": 384,
    "Snowflake/snowflake-arctic-embed-l-v2.0": 1024,
    "intfloat/multilingual-e5-base": 768,
    "voyage-3-lite": 512,
    "voyage-3": 1024,
    "intfloat/multilingual-e5-small": 384,
    "Alibaba-NLP/gte-Qwen1.5-7B-instruct": 4096,
    "Snowflake/snowflake-arctic-embed-m-v2.0": 768,
    "deepvk/USER-bge-m3": 1024,
    "Cohere/Cohere-embed-english-v3.0": 1024,
    "Omartificial-Intelligence-Space/Arabic-labse-Matryoshka": 768,
    "ibm-granite/granite-embedding-278m-multilingual": 768,
    "NovaSearch/stella_en_400M_v5": 4096,
    "omarelshehy/arabic-english-sts-matryoshka": 1024,
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2": 768,
    "Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka": 768,
    "Haon-Chen/speed-embedding-7b-instruct": 4096,
    "sentence-transformers/LaBSE": 768,
    "WhereIsAI/UAE-Large-V1": 1024,
    "ibm-granite/granite-embedding-107m-multilingual": 384,
    "mixedbread-ai/mxbai-embed-large-v1": 1024,
    "intfloat/e5-large-v2": 1024,
    "avsolatorio/GIST-large-Embedding-v0": 1024,
    "sdadas/mmlw-e5-large": 1024,
    "nomic-ai/nomic-embed-text-v1": 768,
    "nomic-ai/nomic-embed-text-v1-ablated": 768,
    "intfloat/e5-base-v2": 768,
    "BAAI/bge-large-en-v1.5": 1024,
    "intfloat/e5-large": 1024,
    "Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet": 384,
    "Cohere/Cohere-embed-english-light-v3.0": 384,
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": 768,
    "Gameselo/STS-multilingual-mpnet-base-v2": 768,
    "thenlper/gte-large": 1024,
    "avsolatorio/GIST-Embedding-v0": 768,
    "nomic-ai/nomic-embed-text-v1-unsupervised": 768,
    "infgrad/stella-base-en-v2": 768,
    "avsolatorio/NoInstruct-small-Embedding-v0": 384,
    "dwzhu/e5-base-4k": 768,
    "sdadas/mmlw-e5-base": 768,
    "voyage-multilingual-2": 1024,
    "McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised": 4096,
    "BAAI/bge-base-en-v1.5": 768,
    "avsolatorio/GIST-small-Embedding-v0": 384,
    "sdadas/mmlw-roberta-large": 1024,
    "nomic-ai/nomic-embed-text-v1.5": 768,
    "minishlab/potion-multilingual-128M": 256,
    "shibing624/text2vec-base-multilingual": 384,
    "thenlper/gte-base": 768,
    "intfloat/e5-small-v2": 384,
    "intfloat/e5-base": 768,
    "sentence-transformers/static-similarity-mrl-multilingual-v1": 1024,
    "manu/sentence_croissant_alpha_v0.3": 2048,
    "BAAI/bge-small-en-v1.5": 512,
    "thenlper/gte-small": 384,
    "sdadas/mmlw-e5-small": 384,
    "manu/sentence_croissant_alpha_v0.4": 2048,
    "manu/sentence_croissant_alpha_v0.2": 2048,
    "abhinand/MedEmbed-small-v0.1": 384,
    "ibm-granite/granite-embedding-125m-english": 768,
    "intfloat/e5-small": 384,
    "voyage-large-2-instruct": 1024,
    "sdadas/mmlw-roberta-base": 768,
    "Snowflake/snowflake-arctic-embed-l": 1024,
    "Mihaiii/Ivysaur": 384,
    "Snowflake/snowflake-arctic-embed-m-long": 768,
    "bigscience/sgpt-bloom-7b1-msmarco": 4096,
    "avsolatorio/GIST-all-MiniLM-L6-v2": 384,
    "sergeyzh/LaBSE-ru-turbo": 768,
    "sentence-transformers/all-mpnet-base-v2": 768,
    "Snowflake/snowflake-arctic-embed-m": 768,
    "Snowflake/snowflake-arctic-embed-s": 384,
    "sentence-transformers/all-MiniLM-L12-v2": 384,
    "Mihaiii/gte-micro-v4": 384,
    "Snowflake/snowflake-arctic-embed-m-v1.5": 768,
    "cointegrated/LaBSE-en-ru": 768,
    "Mihaiii/Bulbasaur": 384,
    "ibm-granite/granite-embedding-30m-english": 384,
    "deepfile/embedder-100p": 768,
    "Jaume/gemma-2b-embeddings": 2048,
    "OrlikB/KartonBERT-USE-base-v1": 768,
    "izhx/udever-bloom-7b1": 4096,
    "izhx/udever-bloom-1b1": 1024,
    "brahmairesearch/slx-v0.1": 384,
    "Mihaiii/Wartortle": 384,
    "izhx/udever-bloom-3b": 2048,
    "deepvk/USER-base": 768,
    "ai-forever/ru-en-RoSBERTa": 1024,
    "McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-unsup-simcse": 4096,
    "Mihaiii/Venusaur": 384,
    "Snowflake/snowflake-arctic-embed-xs": 384,
    "jinaai/jina-embedding-b-en-v1": 768,
    "Mihaiii/gte-micro": 384,
    "aari1995/German_Semantic_STS_V2": 1024,
    "Mihaiii/Squirtle": 384,
    "OrlikB/st-polish-kartonberta-base-alpha-v1": 768,
    "sergeyzh/rubert-tiny-turbo": 312,
    "minishlab/potion-base-8M": 256,
    "minishlab/M2V_base_glove_subword": 256,
    "jinaai/jina-embedding-s-en-v1": 512,
    "minishlab/potion-base-4M": 128,
    "minishlab/M2V_base_output": 256,
    "DeepPavlov/rubert-base-cased-sentence": 768,
    "jinaai/jina-embeddings-v2-small-en": 512,
    "cointegrated/rubert-tiny2": 312,
    "minishlab/M2V_base_glove": 256,
    "cointegrated/rubert-tiny": 312,
    "silma-ai/silma-embeddding-matryoshka-v0.1": 768,
    "DeepPavlov/rubert-base-cased": 768,
    "Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet": 768,
    "izhx/udever-bloom-560m": 1024,
    "minishlab/potion-base-2M": 64,
    "DeepPavlov/distilrubert-small-cased-conversational": 768,
    "consciousAI/cai-lunaris-text-embeddings": 1024,
    "deepvk/deberta-v1-base": 768,
    "Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka": 768,
    "Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka": 768,
    "ai-forever/sbert_large_mt_nlu_ru": 1024,
    "ai-forever/sbert_large_nlu_ru": 1024,
    "malenia1/ternary-weight-embedding": 1024,
    "jinaai/jina-embeddings-v2-base-en": 768,
    "VPLabs/SearchMap_Preview": 4096,
    "Hum-Works/lodestone-base-4096-v1": 768,
    "jinaai/jina-embeddings-v4": 2048
  };

  /**
   * Lower-cased model name -> dimensions, used for case-insensitive lookups.
   * Populated in the constructor from `knownModelDimensions`.
   */
  modelLookup;

  constructor() {
    const lowerCasedEntries = Object.entries(this.knownModelDimensions).map(
      ([name, size]) => [name.toLowerCase(), size]
    );
    this.modelLookup = /* @__PURE__ */ new Map(lowerCasedEntries);
  }

  /**
   * Splits a model specification of the form `provider:model` into its parts
   * and attaches the known dimensions, if any. Synchronous.
   *
   * The text before the FIRST colon is the provider; everything after it is
   * the model (so model names may themselves contain colons). A spec without
   * a colon is treated as an "openai" model.
   *
   * Providers named in the original documentation:
   * - openai: OpenAI models and OpenAI-compatible APIs (Ollama, LMStudio, etc.)
   * - vertex: Google Cloud Vertex AI
   * - gemini: Google Generative AI
   * - aws: AWS Bedrock models
   * - microsoft: Azure OpenAI
   * - sagemaker: AWS SageMaker hosted models
   *
   * @param modelSpec Optional specification; when falsy, the
   *        DOCS_MCP_EMBEDDING_MODEL env var is used, then "text-embedding-3-small".
   * @returns `{ provider, model, dimensions, modelSpec }`; `dimensions` is null
   *          when the model is not in the lookup table.
   */
  parse(modelSpec) {
    const spec = modelSpec || process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
    const separatorAt = spec.indexOf(":");
    const hasProvider = separatorAt !== -1;
    const provider = hasProvider ? spec.substring(0, separatorAt) : "openai";
    const model = hasProvider ? spec.substring(separatorAt + 1) : spec;
    return {
      provider,
      model,
      dimensions: this.modelLookup?.get(model.toLowerCase()) || null,
      modelSpec: spec
    };
  }

  /**
   * Case-insensitive lookup of a model's known dimensions.
   *
   * @param model The model name (e.g., "text-embedding-3-small")
   * @returns The known dimensions, or null when there is no (truthy) entry.
   */
  getKnownDimensions(model) {
    const found = this.modelLookup?.get(model.toLowerCase());
    return found || null;
  }

  /**
   * Adds or overwrites the known dimensions of a model, e.g. to cache a
   * discovered value. The value is written to `knownModelDimensions` under the
   * name as given and to the lookup map under the lower-cased name.
   *
   * @param model The model name
   * @param dimensions The dimensions to cache
   */
  setKnownDimensions(model, dimensions) {
    this.knownModelDimensions[model] = dimensions;
    this.modelLookup?.set(model.toLowerCase(), dimensions);
  }

  /**
   * Static shortcut for `getInstance().parse(modelSpec)`.
   */
  static parseEmbeddingConfig(modelSpec) {
    const shared = EmbeddingConfig.getInstance();
    return shared.parse(modelSpec);
  }

  /**
   * Static shortcut for `getInstance().getKnownDimensions(model)`.
   */
  static getKnownModelDimensions(model) {
    const shared = EmbeddingConfig.getInstance();
    return shared.getKnownDimensions(model);
  }

  /**
   * Static shortcut for `getInstance().setKnownDimensions(model, dimensions)`.
   */
  static setKnownModelDimensions(model, dimensions) {
    const shared = EmbeddingConfig.getInstance();
    shared.setKnownDimensions(model, dimensions);
  }
}
|
|
6913
|
+
class DocumentStore {
|
|
6914
|
+
db;
|
|
6915
|
+
embeddings;
|
|
6916
|
+
dbDimension = VECTOR_DIMENSION;
|
|
6917
|
+
modelDimension;
|
|
6918
|
+
embeddingConfig;
|
|
6919
|
+
statements;
|
|
6920
|
+
/**
|
|
6921
|
+
* Calculates Reciprocal Rank Fusion score for a result
|
|
6922
|
+
*/
|
|
6923
|
+
calculateRRF(vecRank, ftsRank, k = 60) {
|
|
6924
|
+
let rrf = 0;
|
|
6925
|
+
if (vecRank !== void 0) {
|
|
6926
|
+
rrf += 1 / (k + vecRank);
|
|
6927
|
+
}
|
|
6928
|
+
if (ftsRank !== void 0) {
|
|
6929
|
+
rrf += 1 / (k + ftsRank);
|
|
6930
|
+
}
|
|
6931
|
+
return rrf;
|
|
6932
|
+
}
|
|
6933
|
+
/**
|
|
6934
|
+
* Assigns ranks to search results based on their scores
|
|
6935
|
+
*/
|
|
6936
|
+
assignRanks(results) {
|
|
6937
|
+
const vecRanks = /* @__PURE__ */ new Map();
|
|
6938
|
+
const ftsRanks = /* @__PURE__ */ new Map();
|
|
6939
|
+
results.filter((r) => r.vec_score !== void 0).sort((a, b) => (b.vec_score ?? 0) - (a.vec_score ?? 0)).forEach((result, index) => {
|
|
6940
|
+
vecRanks.set(Number(result.id), index + 1);
|
|
6941
|
+
});
|
|
6942
|
+
results.filter((r) => r.fts_score !== void 0).sort((a, b) => (b.fts_score ?? 0) - (a.fts_score ?? 0)).forEach((result, index) => {
|
|
6943
|
+
ftsRanks.set(Number(result.id), index + 1);
|
|
6944
|
+
});
|
|
6945
|
+
return results.map((result) => ({
|
|
6946
|
+
...result,
|
|
6947
|
+
vec_rank: vecRanks.get(Number(result.id)),
|
|
6948
|
+
fts_rank: ftsRanks.get(Number(result.id)),
|
|
6949
|
+
rrf_score: this.calculateRRF(
|
|
6950
|
+
vecRanks.get(Number(result.id)),
|
|
6951
|
+
ftsRanks.get(Number(result.id))
|
|
6952
|
+
)
|
|
6953
|
+
}));
|
|
6954
|
+
}
|
|
6955
|
+
constructor(dbPath, embeddingConfig) {
|
|
6956
|
+
if (!dbPath) {
|
|
6957
|
+
throw new StoreError("Missing required database path");
|
|
6958
|
+
}
|
|
6959
|
+
this.db = new Database(dbPath);
|
|
6960
|
+
this.embeddingConfig = embeddingConfig;
|
|
6961
|
+
}
|
|
6962
|
+
/**
|
|
6963
|
+
* Sets up prepared statements for database queries
|
|
6964
|
+
*/
|
|
6965
|
+
prepareStatements() {
|
|
6966
|
+
const statements = {
|
|
6967
|
+
getById: this.db.prepare("SELECT * FROM documents WHERE id = ?"),
|
|
6968
|
+
insertDocument: this.db.prepare(
|
|
6969
|
+
"INSERT INTO documents (library_id, version_id, url, content, metadata, sort_order, indexed_at) VALUES (?, ?, ?, ?, ?, ?, ?)"
|
|
6970
|
+
),
|
|
6971
|
+
insertEmbedding: this.db.prepare(
|
|
6972
|
+
"INSERT INTO documents_vec (rowid, library_id, version_id, embedding) VALUES (?, ?, ?, ?)"
|
|
6973
|
+
),
|
|
6974
|
+
insertLibrary: this.db.prepare(
|
|
6975
|
+
"INSERT INTO libraries (name) VALUES (?) ON CONFLICT(name) DO NOTHING"
|
|
6976
|
+
),
|
|
6977
|
+
getLibraryIdByName: this.db.prepare(
|
|
6978
|
+
"SELECT id FROM libraries WHERE name = ?"
|
|
6979
|
+
),
|
|
6980
|
+
// New version-related statements
|
|
6981
|
+
insertVersion: this.db.prepare(
|
|
6982
|
+
"INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING"
|
|
6983
|
+
),
|
|
6984
|
+
resolveVersionId: this.db.prepare(
|
|
6985
|
+
"SELECT id FROM versions WHERE library_id = ? AND name IS ?"
|
|
6986
|
+
),
|
|
6987
|
+
getVersionById: this.db.prepare("SELECT * FROM versions WHERE id = ?"),
|
|
6988
|
+
queryVersionsByLibraryId: this.db.prepare(
|
|
6989
|
+
"SELECT * FROM versions WHERE library_id = ? ORDER BY name"
|
|
6990
|
+
),
|
|
6991
|
+
deleteLibraryDocuments: this.db.prepare(
|
|
6992
|
+
`DELETE FROM documents
|
|
6993
|
+
WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
6994
|
+
AND version_id = (
|
|
6995
|
+
SELECT v.id FROM versions v
|
|
6996
|
+
WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
6997
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
6998
|
+
)`
|
|
6999
|
+
),
|
|
7000
|
+
deleteDocuments: this.db.prepare(
|
|
7001
|
+
`DELETE FROM documents
|
|
7002
|
+
WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
7003
|
+
AND version_id = (
|
|
7004
|
+
SELECT v.id FROM versions v
|
|
7005
|
+
WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
7006
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7007
|
+
)`
|
|
7008
|
+
),
|
|
7009
|
+
deleteDocumentsByUrl: this.db.prepare(
|
|
7010
|
+
`DELETE FROM documents
|
|
7011
|
+
WHERE url = ?
|
|
7012
|
+
AND library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
7013
|
+
AND version_id = (
|
|
7014
|
+
SELECT v.id FROM versions v
|
|
7015
|
+
WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
7016
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7017
|
+
)`
|
|
7018
|
+
),
|
|
7019
|
+
getDocumentBySort: this.db.prepare(
|
|
7020
|
+
`SELECT d.id
|
|
7021
|
+
FROM documents d
|
|
7022
|
+
JOIN versions v ON d.version_id = v.id
|
|
7023
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7024
|
+
WHERE l.name = ?
|
|
7025
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7026
|
+
LIMIT 1`
|
|
7027
|
+
),
|
|
7028
|
+
queryVersions: this.db.prepare(
|
|
7029
|
+
`SELECT DISTINCT v.name
|
|
7030
|
+
FROM versions v
|
|
7031
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7032
|
+
WHERE l.name = ?
|
|
7033
|
+
ORDER BY v.name`
|
|
7034
|
+
),
|
|
7035
|
+
checkExists: this.db.prepare(
|
|
7036
|
+
`SELECT d.id FROM documents d
|
|
7037
|
+
JOIN versions v ON d.version_id = v.id
|
|
7038
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7039
|
+
WHERE l.name = ?
|
|
7040
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7041
|
+
LIMIT 1`
|
|
7042
|
+
),
|
|
7043
|
+
// Library/version aggregation including versions without documents and status/progress fields
|
|
7044
|
+
queryLibraryVersions: this.db.prepare(
|
|
7045
|
+
`SELECT
|
|
7046
|
+
l.name as library,
|
|
7047
|
+
COALESCE(v.name, '') as version,
|
|
7048
|
+
v.id as versionId,
|
|
7049
|
+
v.status as status,
|
|
7050
|
+
v.progress_pages as progressPages,
|
|
7051
|
+
v.progress_max_pages as progressMaxPages,
|
|
7052
|
+
v.source_url as sourceUrl,
|
|
7053
|
+
MIN(d.indexed_at) as indexedAt,
|
|
7054
|
+
COUNT(d.id) as documentCount,
|
|
7055
|
+
COUNT(DISTINCT d.url) as uniqueUrlCount
|
|
7056
|
+
FROM versions v
|
|
7057
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7058
|
+
LEFT JOIN documents d ON d.version_id = v.id
|
|
7059
|
+
GROUP BY v.id
|
|
7060
|
+
ORDER BY l.name, version`
|
|
7061
|
+
),
|
|
7062
|
+
getChildChunks: this.db.prepare(`
|
|
7063
|
+
SELECT d.* FROM documents d
|
|
7064
|
+
JOIN versions v ON d.version_id = v.id
|
|
7065
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7066
|
+
WHERE l.name = ?
|
|
7067
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7068
|
+
AND d.url = ?
|
|
7069
|
+
AND json_array_length(json_extract(d.metadata, '$.path')) = ?
|
|
7070
|
+
AND json_extract(d.metadata, '$.path') LIKE ? || '%'
|
|
7071
|
+
AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
|
|
7072
|
+
ORDER BY d.sort_order
|
|
7073
|
+
LIMIT ?
|
|
7074
|
+
`),
|
|
7075
|
+
getPrecedingSiblings: this.db.prepare(`
|
|
7076
|
+
SELECT d.* FROM documents d
|
|
7077
|
+
JOIN versions v ON d.version_id = v.id
|
|
7078
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7079
|
+
WHERE l.name = ?
|
|
7080
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7081
|
+
AND d.url = ?
|
|
7082
|
+
AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
|
|
7083
|
+
AND json_extract(d.metadata, '$.path') = ?
|
|
7084
|
+
ORDER BY d.sort_order DESC
|
|
7085
|
+
LIMIT ?
|
|
7086
|
+
`),
|
|
7087
|
+
getSubsequentSiblings: this.db.prepare(`
|
|
7088
|
+
SELECT d.* FROM documents d
|
|
7089
|
+
JOIN versions v ON d.version_id = v.id
|
|
7090
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7091
|
+
WHERE l.name = ?
|
|
7092
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7093
|
+
AND d.url = ?
|
|
7094
|
+
AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
|
|
7095
|
+
AND json_extract(d.metadata, '$.path') = ?
|
|
7096
|
+
ORDER BY d.sort_order
|
|
7097
|
+
LIMIT ?
|
|
7098
|
+
`),
|
|
7099
|
+
getParentChunk: this.db.prepare(`
|
|
7100
|
+
SELECT d.* FROM documents d
|
|
7101
|
+
JOIN versions v ON d.version_id = v.id
|
|
7102
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7103
|
+
WHERE l.name = ?
|
|
7104
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7105
|
+
AND d.url = ?
|
|
7106
|
+
AND json_extract(d.metadata, '$.path') = ?
|
|
7107
|
+
AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
|
|
7108
|
+
ORDER BY d.sort_order DESC
|
|
7109
|
+
LIMIT 1
|
|
7110
|
+
`),
|
|
7111
|
+
// Status tracking statements
|
|
7112
|
+
updateVersionStatus: this.db.prepare(
|
|
7113
|
+
"UPDATE versions SET status = ?, error_message = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
|
|
7114
|
+
),
|
|
7115
|
+
updateVersionProgress: this.db.prepare(
|
|
7116
|
+
"UPDATE versions SET progress_pages = ?, progress_max_pages = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
|
|
7117
|
+
),
|
|
7118
|
+
getVersionsByStatus: this.db.prepare(
|
|
7119
|
+
"SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.status IN (SELECT value FROM json_each(?))"
|
|
7120
|
+
),
|
|
7121
|
+
// Scraper options statements
|
|
7122
|
+
updateVersionScraperOptions: this.db.prepare(
|
|
7123
|
+
"UPDATE versions SET source_url = ?, scraper_options = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
|
|
7124
|
+
),
|
|
7125
|
+
getVersionWithOptions: this.db.prepare(
|
|
7126
|
+
"SELECT * FROM versions WHERE id = ?"
|
|
7127
|
+
),
|
|
7128
|
+
getVersionsBySourceUrl: this.db.prepare(
|
|
7129
|
+
"SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.source_url = ? ORDER BY v.created_at DESC"
|
|
7130
|
+
),
|
|
7131
|
+
// Version and library deletion statements
|
|
7132
|
+
deleteVersionById: this.db.prepare("DELETE FROM versions WHERE id = ?"),
|
|
7133
|
+
deleteLibraryById: this.db.prepare("DELETE FROM libraries WHERE id = ?"),
|
|
7134
|
+
countVersionsByLibraryId: this.db.prepare(
|
|
7135
|
+
"SELECT COUNT(*) as count FROM versions WHERE library_id = ?"
|
|
7136
|
+
),
|
|
7137
|
+
getVersionId: this.db.prepare(
|
|
7138
|
+
`SELECT v.id, v.library_id FROM versions v
|
|
7139
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7140
|
+
WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')`
|
|
7141
|
+
)
|
|
7142
|
+
};
|
|
7143
|
+
this.statements = statements;
|
|
7144
|
+
}
|
|
7145
|
+
/**
|
|
7146
|
+
* Pads a vector to the fixed database dimension by appending zeros.
|
|
7147
|
+
* Throws an error if the input vector is longer than the database dimension.
|
|
7148
|
+
*/
|
|
7149
|
+
padVector(vector) {
|
|
7150
|
+
if (vector.length > this.dbDimension) {
|
|
7151
|
+
throw new Error(
|
|
7152
|
+
`Vector dimension ${vector.length} exceeds database dimension ${this.dbDimension}`
|
|
7153
|
+
);
|
|
7154
|
+
}
|
|
7155
|
+
if (vector.length === this.dbDimension) {
|
|
7156
|
+
return vector;
|
|
7157
|
+
}
|
|
7158
|
+
return [...vector, ...new Array(this.dbDimension - vector.length).fill(0)];
|
|
7159
|
+
}
|
|
7160
|
+
/**
|
|
7161
|
+
* Initialize the embeddings client using either provided config or environment variables.
|
|
7162
|
+
* If no embedding config is provided (null), embeddings will not be initialized.
|
|
7163
|
+
* This allows DocumentStore to be used without embeddings for operations that don't need them.
|
|
7164
|
+
*
|
|
7165
|
+
* Environment variables per provider:
|
|
7166
|
+
* - openai: OPENAI_API_KEY (and optionally OPENAI_API_BASE, OPENAI_ORG_ID)
|
|
7167
|
+
* - vertex: GOOGLE_APPLICATION_CREDENTIALS (path to service account JSON)
|
|
7168
|
+
* - gemini: GOOGLE_API_KEY
|
|
7169
|
+
* - aws: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION
|
|
7170
|
+
* - microsoft: Azure OpenAI credentials (AZURE_OPENAI_API_*)
|
|
7171
|
+
*/
|
|
7172
|
+
async initializeEmbeddings() {
|
|
7173
|
+
if (this.embeddingConfig === null) {
|
|
7174
|
+
logger.debug("Embedding initialization skipped (explicitly disabled)");
|
|
7175
|
+
return;
|
|
7176
|
+
}
|
|
7177
|
+
const config = this.embeddingConfig || EmbeddingConfig.parseEmbeddingConfig();
|
|
7178
|
+
try {
|
|
7179
|
+
this.embeddings = createEmbeddingModel(config.modelSpec);
|
|
7180
|
+
if (config.dimensions !== null) {
|
|
7181
|
+
this.modelDimension = config.dimensions;
|
|
7182
|
+
} else {
|
|
7183
|
+
const testVector = await this.embeddings.embedQuery("test");
|
|
7184
|
+
this.modelDimension = testVector.length;
|
|
7185
|
+
EmbeddingConfig.setKnownModelDimensions(config.model, this.modelDimension);
|
|
7186
|
+
}
|
|
7187
|
+
if (this.modelDimension > this.dbDimension) {
|
|
7188
|
+
throw new DimensionError(config.modelSpec, this.modelDimension, this.dbDimension);
|
|
7189
|
+
}
|
|
7190
|
+
logger.debug(
|
|
7191
|
+
`Embeddings initialized: ${config.provider}:${config.model} (${this.modelDimension}d)`
|
|
7192
|
+
);
|
|
7193
|
+
} catch (error) {
|
|
7194
|
+
if (error instanceof Error) {
|
|
7195
|
+
if (error.message.includes("does not exist") || error.message.includes("MODEL_NOT_FOUND")) {
|
|
7196
|
+
throw new ModelConfigurationError(
|
|
7197
|
+
`❌ Invalid embedding model: ${config.model}
|
|
7198
|
+
The model "${config.model}" is not available or you don't have access to it.
|
|
7199
|
+
See README.md for supported models or run with --help for more details.`
|
|
7200
|
+
);
|
|
7201
|
+
}
|
|
7202
|
+
if (error.message.includes("API key") || error.message.includes("401") || error.message.includes("authentication")) {
|
|
7203
|
+
throw new ModelConfigurationError(
|
|
7204
|
+
`❌ Authentication failed for ${config.provider} embedding provider
|
|
7205
|
+
Please check your API key configuration.
|
|
7206
|
+
See README.md for configuration options or run with --help for more details.`
|
|
7207
|
+
);
|
|
7208
|
+
}
|
|
7209
|
+
}
|
|
7210
|
+
throw error;
|
|
7211
|
+
}
|
|
7212
|
+
}
|
|
7213
|
+
/**
|
|
7214
|
+
* Escapes a query string for use with SQLite FTS5 MATCH operator.
|
|
7215
|
+
* Wraps the query in double quotes and escapes internal double quotes.
|
|
7216
|
+
*/
|
|
7217
|
+
escapeFtsQuery(query) {
|
|
7218
|
+
const escapedQuotes = query.replace(/"/g, '""');
|
|
7219
|
+
return `"${escapedQuotes}"`;
|
|
7220
|
+
}
|
|
7221
|
+
/**
|
|
7222
|
+
* Initializes database connection and ensures readiness
|
|
7223
|
+
*/
|
|
7224
|
+
async initialize() {
|
|
7225
|
+
try {
|
|
7226
|
+
sqliteVec.load(this.db);
|
|
7227
|
+
applyMigrations(this.db);
|
|
7228
|
+
this.prepareStatements();
|
|
7229
|
+
await this.initializeEmbeddings();
|
|
7230
|
+
} catch (error) {
|
|
7231
|
+
if (error instanceof StoreError || error instanceof ModelConfigurationError || error instanceof UnsupportedProviderError) {
|
|
7232
|
+
throw error;
|
|
7233
|
+
}
|
|
7234
|
+
throw new ConnectionError("Failed to initialize database connection", error);
|
|
7235
|
+
}
|
|
7236
|
+
}
|
|
7237
|
+
/**
|
|
7238
|
+
* Gracefully closes database connections
|
|
7239
|
+
*/
|
|
7240
|
+
async shutdown() {
|
|
7241
|
+
this.db.close();
|
|
7242
|
+
}
|
|
7243
|
+
/**
|
|
7244
|
+
* Resolves a library name and version string to library_id and version_id.
|
|
7245
|
+
* Creates library and version records if they don't exist.
|
|
7246
|
+
*/
|
|
7247
|
+
async resolveLibraryAndVersionIds(library, version2) {
|
|
7248
|
+
const normalizedLibrary = library.toLowerCase();
|
|
7249
|
+
const normalizedVersion = denormalizeVersionName(version2.toLowerCase());
|
|
7250
|
+
this.statements.insertLibrary.run(normalizedLibrary);
|
|
7251
|
+
const libraryIdRow = this.statements.getLibraryIdByName.get(normalizedLibrary);
|
|
7252
|
+
if (!libraryIdRow || typeof libraryIdRow.id !== "number") {
|
|
7253
|
+
throw new StoreError(`Failed to resolve library_id for library: ${library}`);
|
|
7254
|
+
}
|
|
7255
|
+
const libraryId = libraryIdRow.id;
|
|
7256
|
+
this.statements.insertVersion.run(libraryId, normalizedVersion);
|
|
7257
|
+
const versionIdRow = this.statements.resolveVersionId.get(
|
|
7258
|
+
libraryId,
|
|
7259
|
+
normalizedVersion === null ? "" : normalizedVersion
|
|
7260
|
+
);
|
|
7261
|
+
if (!versionIdRow || typeof versionIdRow.id !== "number") {
|
|
7262
|
+
throw new StoreError(
|
|
7263
|
+
`Failed to resolve version_id for library: ${library}, version: ${version2}`
|
|
7264
|
+
);
|
|
7265
|
+
}
|
|
7266
|
+
return { libraryId, versionId: versionIdRow.id };
|
|
7267
|
+
}
|
|
7268
|
+
/**
|
|
7269
|
+
* Retrieves all unique versions for a specific library
|
|
7270
|
+
*/
|
|
7271
|
+
async queryUniqueVersions(library) {
|
|
7272
|
+
try {
|
|
7273
|
+
const rows = this.statements.queryVersions.all(library.toLowerCase());
|
|
7274
|
+
return rows.map((row) => normalizeVersionName(row.name));
|
|
7275
|
+
} catch (error) {
|
|
7276
|
+
throw new ConnectionError("Failed to query versions", error);
|
|
7277
|
+
}
|
|
7278
|
+
}
|
|
7279
|
+
/**
|
|
7280
|
+
* Updates the status of a version record in the database.
|
|
7281
|
+
* @param versionId The version ID to update
|
|
7282
|
+
* @param status The new status to set
|
|
7283
|
+
* @param errorMessage Optional error message for failed statuses
|
|
7284
|
+
*/
|
|
7285
|
+
async updateVersionStatus(versionId, status, errorMessage) {
|
|
7286
|
+
try {
|
|
7287
|
+
this.statements.updateVersionStatus.run(status, errorMessage ?? null, versionId);
|
|
7288
|
+
} catch (error) {
|
|
7289
|
+
throw new StoreError(`Failed to update version status: ${error}`);
|
|
7290
|
+
}
|
|
7291
|
+
}
|
|
7292
|
+
/**
|
|
7293
|
+
* Updates the progress counters for a version being indexed.
|
|
7294
|
+
* @param versionId The version ID to update
|
|
7295
|
+
* @param pages Current number of pages processed
|
|
7296
|
+
* @param maxPages Total number of pages to process
|
|
7297
|
+
*/
|
|
7298
|
+
async updateVersionProgress(versionId, pages, maxPages) {
|
|
7299
|
+
try {
|
|
7300
|
+
this.statements.updateVersionProgress.run(pages, maxPages, versionId);
|
|
7301
|
+
} catch (error) {
|
|
7302
|
+
throw new StoreError(`Failed to update version progress: ${error}`);
|
|
7303
|
+
}
|
|
7304
|
+
}
|
|
7305
|
+
/**
|
|
7306
|
+
* Retrieves versions by their status.
|
|
7307
|
+
* @param statuses Array of statuses to filter by
|
|
7308
|
+
* @returns Array of version records matching the statuses
|
|
7309
|
+
*/
|
|
7310
|
+
async getVersionsByStatus(statuses) {
|
|
7311
|
+
try {
|
|
7312
|
+
const statusJson = JSON.stringify(statuses);
|
|
7313
|
+
const rows = this.statements.getVersionsByStatus.all(
|
|
7314
|
+
statusJson
|
|
7315
|
+
);
|
|
7316
|
+
return rows;
|
|
7317
|
+
} catch (error) {
|
|
7318
|
+
throw new StoreError(`Failed to get versions by status: ${error}`);
|
|
7319
|
+
}
|
|
7320
|
+
}
|
|
7321
|
+
/**
|
|
7322
|
+
* Stores scraper options for a version to enable reproducible indexing.
|
|
7323
|
+
* @param versionId The version ID to update
|
|
7324
|
+
* @param options Complete scraper options used for indexing
|
|
7325
|
+
*/
|
|
7326
|
+
async storeScraperOptions(versionId, options) {
|
|
7327
|
+
try {
|
|
7328
|
+
const { url: source_url, library, version: version2, signal, ...scraper_options } = options;
|
|
7329
|
+
const optionsJson = JSON.stringify(scraper_options);
|
|
7330
|
+
this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId);
|
|
7331
|
+
} catch (error) {
|
|
7332
|
+
throw new StoreError(`Failed to store scraper options: ${error}`);
|
|
7333
|
+
}
|
|
7334
|
+
}
|
|
7335
|
+
/**
|
|
7336
|
+
* Retrieves stored scraping configuration (source URL and options) for a version.
|
|
7337
|
+
* Returns null when no source URL is recorded (not re-indexable).
|
|
7338
|
+
*/
|
|
7339
|
+
async getScraperOptions(versionId) {
|
|
7340
|
+
try {
|
|
7341
|
+
const row = this.statements.getVersionWithOptions.get(versionId);
|
|
7342
|
+
if (!row?.source_url) {
|
|
7343
|
+
return null;
|
|
7344
|
+
}
|
|
7345
|
+
let parsed = {};
|
|
7346
|
+
if (row.scraper_options) {
|
|
7347
|
+
try {
|
|
7348
|
+
parsed = JSON.parse(row.scraper_options);
|
|
7349
|
+
} catch (e) {
|
|
7350
|
+
logger.warn(`⚠️ Invalid scraper_options JSON for version ${versionId}: ${e}`);
|
|
7351
|
+
parsed = {};
|
|
7352
|
+
}
|
|
7353
|
+
}
|
|
7354
|
+
return { sourceUrl: row.source_url, options: parsed };
|
|
7355
|
+
} catch (error) {
|
|
7356
|
+
throw new StoreError(`Failed to get scraper options: ${error}`);
|
|
7357
|
+
}
|
|
7358
|
+
}
|
|
7359
|
+
/**
|
|
7360
|
+
* Finds versions that were indexed from the same source URL.
|
|
7361
|
+
* Useful for finding similar configurations or detecting duplicates.
|
|
7362
|
+
* @param url Source URL to search for
|
|
7363
|
+
* @returns Array of versions with the same source URL
|
|
7364
|
+
*/
|
|
7365
|
+
async findVersionsBySourceUrl(url) {
|
|
7366
|
+
try {
|
|
7367
|
+
const rows = this.statements.getVersionsBySourceUrl.all(
|
|
7368
|
+
url
|
|
7369
|
+
);
|
|
7370
|
+
return rows;
|
|
7371
|
+
} catch (error) {
|
|
7372
|
+
throw new StoreError(`Failed to find versions by source URL: ${error}`);
|
|
7373
|
+
}
|
|
7374
|
+
}
|
|
7375
|
+
/**
|
|
7376
|
+
* Verifies existence of documents for a specific library version
|
|
7377
|
+
*/
|
|
7378
|
+
async checkDocumentExists(library, version2) {
|
|
7379
|
+
try {
|
|
7380
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7381
|
+
const result = this.statements.checkExists.get(
|
|
7382
|
+
library.toLowerCase(),
|
|
7383
|
+
normalizedVersion
|
|
7384
|
+
);
|
|
7385
|
+
return result !== void 0;
|
|
7386
|
+
} catch (error) {
|
|
7387
|
+
throw new ConnectionError("Failed to check document existence", error);
|
|
7388
|
+
}
|
|
7389
|
+
}
|
|
7390
|
+
/**
|
|
7391
|
+
* Retrieves a mapping of all libraries to their available versions with details.
|
|
7392
|
+
*/
|
|
7393
|
+
async queryLibraryVersions() {
|
|
7394
|
+
try {
|
|
7395
|
+
const rows = this.statements.queryLibraryVersions.all();
|
|
7396
|
+
const libraryMap = /* @__PURE__ */ new Map();
|
|
7397
|
+
for (const row of rows) {
|
|
7398
|
+
const library = row.library;
|
|
7399
|
+
if (!libraryMap.has(library)) {
|
|
7400
|
+
libraryMap.set(library, []);
|
|
7401
|
+
}
|
|
7402
|
+
const indexedAtISO = row.indexedAt ? new Date(row.indexedAt).toISOString() : null;
|
|
7403
|
+
libraryMap.get(library)?.push({
|
|
7404
|
+
version: row.version,
|
|
7405
|
+
versionId: row.versionId,
|
|
7406
|
+
// Preserve raw string status here; DocumentManagementService will cast to VersionStatus
|
|
7407
|
+
status: row.status,
|
|
7408
|
+
progressPages: row.progressPages,
|
|
7409
|
+
progressMaxPages: row.progressMaxPages,
|
|
7410
|
+
sourceUrl: row.sourceUrl,
|
|
7411
|
+
documentCount: row.documentCount,
|
|
7412
|
+
uniqueUrlCount: row.uniqueUrlCount,
|
|
7413
|
+
indexedAt: indexedAtISO
|
|
7414
|
+
});
|
|
7415
|
+
}
|
|
7416
|
+
for (const versions of libraryMap.values()) {
|
|
7417
|
+
versions.sort((a, b) => {
|
|
7418
|
+
if (a.version === "" && b.version !== "") {
|
|
7419
|
+
return -1;
|
|
7420
|
+
}
|
|
7421
|
+
if (a.version !== "" && b.version === "") {
|
|
7422
|
+
return 1;
|
|
7423
|
+
}
|
|
7424
|
+
if (a.version === "" && b.version === "") {
|
|
7425
|
+
return 0;
|
|
7426
|
+
}
|
|
7427
|
+
try {
|
|
7428
|
+
return semver__default.compare(a.version, b.version);
|
|
7429
|
+
} catch (_error) {
|
|
7430
|
+
return a.version.localeCompare(b.version);
|
|
7431
|
+
}
|
|
7432
|
+
});
|
|
7433
|
+
}
|
|
7434
|
+
return libraryMap;
|
|
7435
|
+
} catch (error) {
|
|
7436
|
+
throw new ConnectionError("Failed to query library versions", error);
|
|
7437
|
+
}
|
|
7438
|
+
}
|
|
7439
|
+
/**
|
|
7440
|
+
* Stores documents with library and version metadata, generating embeddings
|
|
7441
|
+
* for vector similarity search. Automatically removes any existing documents
|
|
7442
|
+
* for the same URLs before adding new ones to prevent UNIQUE constraint violations.
|
|
7443
|
+
*/
|
|
7444
|
+
async addDocuments(library, version2, documents) {
|
|
7445
|
+
try {
|
|
7446
|
+
if (documents.length === 0) {
|
|
7447
|
+
return;
|
|
7448
|
+
}
|
|
7449
|
+
const urls = /* @__PURE__ */ new Set();
|
|
7450
|
+
for (const doc of documents) {
|
|
7451
|
+
const url = doc.metadata.url;
|
|
7452
|
+
if (!url || typeof url !== "string" || !url.trim()) {
|
|
7453
|
+
throw new StoreError("Document metadata must include a valid URL");
|
|
7454
|
+
}
|
|
7455
|
+
urls.add(url);
|
|
7456
|
+
}
|
|
7457
|
+
const texts = documents.map((doc) => {
|
|
7458
|
+
const header = `<title>${doc.metadata.title}</title>
|
|
7459
|
+
<url>${doc.metadata.url}</url>
|
|
7460
|
+
<path>${doc.metadata.path.join(" / ")}</path>
|
|
7461
|
+
`;
|
|
7462
|
+
return `${header}${doc.pageContent}`;
|
|
7463
|
+
});
|
|
7464
|
+
const maxBatchChars = Number(process.env.DOCS_MCP_EMBEDDING_BATCH_CHARS) || EMBEDDING_BATCH_CHARS;
|
|
7465
|
+
const rawEmbeddings = [];
|
|
7466
|
+
let currentBatch = [];
|
|
7467
|
+
let currentBatchSize = 0;
|
|
7468
|
+
let batchCount = 0;
|
|
7469
|
+
for (const text of texts) {
|
|
7470
|
+
const textSize = text.length;
|
|
7471
|
+
if (currentBatchSize + textSize > maxBatchChars && currentBatch.length > 0) {
|
|
7472
|
+
batchCount++;
|
|
7473
|
+
logger.debug(
|
|
7474
|
+
`🔄 Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
|
|
7475
|
+
);
|
|
7476
|
+
const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
|
|
7477
|
+
rawEmbeddings.push(...batchEmbeddings);
|
|
7478
|
+
currentBatch = [];
|
|
7479
|
+
currentBatchSize = 0;
|
|
7480
|
+
}
|
|
7481
|
+
currentBatch.push(text);
|
|
7482
|
+
currentBatchSize += textSize;
|
|
7483
|
+
if (currentBatch.length >= EMBEDDING_BATCH_SIZE) {
|
|
7484
|
+
batchCount++;
|
|
7485
|
+
logger.debug(
|
|
7486
|
+
`🔄 Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
|
|
7487
|
+
);
|
|
7488
|
+
const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
|
|
7489
|
+
rawEmbeddings.push(...batchEmbeddings);
|
|
7490
|
+
currentBatch = [];
|
|
7491
|
+
currentBatchSize = 0;
|
|
7492
|
+
}
|
|
7493
|
+
}
|
|
7494
|
+
if (currentBatch.length > 0) {
|
|
7495
|
+
batchCount++;
|
|
7496
|
+
logger.debug(
|
|
7497
|
+
`🔄 Processing final embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
|
|
7498
|
+
);
|
|
7499
|
+
const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
|
|
7500
|
+
rawEmbeddings.push(...batchEmbeddings);
|
|
7501
|
+
}
|
|
7502
|
+
const paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
|
|
7503
|
+
const { libraryId, versionId } = await this.resolveLibraryAndVersionIds(
|
|
7504
|
+
library,
|
|
7505
|
+
version2
|
|
7506
|
+
);
|
|
7507
|
+
for (const url of urls) {
|
|
7508
|
+
const deletedCount = await this.deleteDocumentsByUrl(library, version2, url);
|
|
7509
|
+
if (deletedCount > 0) {
|
|
7510
|
+
logger.debug(`Deleted ${deletedCount} existing documents for URL: ${url}`);
|
|
7511
|
+
}
|
|
7512
|
+
}
|
|
7513
|
+
const transaction = this.db.transaction((docs) => {
|
|
7514
|
+
for (let i = 0; i < docs.length; i++) {
|
|
7515
|
+
const doc = docs[i];
|
|
7516
|
+
const url = doc.metadata.url;
|
|
7517
|
+
const result = this.statements.insertDocument.run(
|
|
7518
|
+
BigInt(libraryId),
|
|
7519
|
+
BigInt(versionId),
|
|
7520
|
+
url,
|
|
7521
|
+
doc.pageContent,
|
|
7522
|
+
JSON.stringify(doc.metadata),
|
|
7523
|
+
i,
|
|
7524
|
+
(/* @__PURE__ */ new Date()).toISOString()
|
|
7525
|
+
// Pass current timestamp for indexed_at
|
|
7526
|
+
);
|
|
7527
|
+
const rowId = result.lastInsertRowid;
|
|
7528
|
+
this.statements.insertEmbedding.run(
|
|
7529
|
+
BigInt(rowId),
|
|
7530
|
+
BigInt(libraryId),
|
|
7531
|
+
BigInt(versionId),
|
|
7532
|
+
JSON.stringify(paddedEmbeddings[i])
|
|
7533
|
+
);
|
|
7534
|
+
}
|
|
7535
|
+
});
|
|
7536
|
+
transaction(documents);
|
|
7537
|
+
} catch (error) {
|
|
7538
|
+
throw new ConnectionError("Failed to add documents to store", error);
|
|
7539
|
+
}
|
|
7540
|
+
}
|
|
7541
|
+
/**
|
|
7542
|
+
* Removes documents matching specified library and version
|
|
7543
|
+
* @returns Number of documents deleted
|
|
7544
|
+
*/
|
|
7545
|
+
async deleteDocuments(library, version2) {
|
|
7546
|
+
try {
|
|
7547
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7548
|
+
const result = this.statements.deleteDocuments.run(
|
|
7549
|
+
library.toLowerCase(),
|
|
7550
|
+
library.toLowerCase(),
|
|
7551
|
+
// library name appears twice in the query
|
|
7552
|
+
normalizedVersion
|
|
7553
|
+
);
|
|
7554
|
+
return result.changes;
|
|
7555
|
+
} catch (error) {
|
|
7556
|
+
throw new ConnectionError("Failed to delete documents", error);
|
|
7557
|
+
}
|
|
7558
|
+
}
|
|
7559
|
+
/**
|
|
7560
|
+
* Removes documents for a specific URL within a library and version
|
|
7561
|
+
* @returns Number of documents deleted
|
|
7562
|
+
*/
|
|
7563
|
+
async deleteDocumentsByUrl(library, version2, url) {
|
|
7564
|
+
try {
|
|
7565
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7566
|
+
const result = this.statements.deleteDocumentsByUrl.run(
|
|
7567
|
+
url,
|
|
7568
|
+
library.toLowerCase(),
|
|
7569
|
+
library.toLowerCase(),
|
|
7570
|
+
// library name appears twice in the query
|
|
7571
|
+
normalizedVersion
|
|
7572
|
+
);
|
|
7573
|
+
return result.changes;
|
|
7574
|
+
} catch (error) {
|
|
7575
|
+
throw new ConnectionError("Failed to delete documents by URL", error);
|
|
7576
|
+
}
|
|
7577
|
+
}
|
|
7578
|
+
/**
|
|
7579
|
+
* Completely removes a library version and all associated documents.
|
|
7580
|
+
* Optionally removes the library if no other versions remain.
|
|
7581
|
+
* @param library Library name
|
|
7582
|
+
* @param version Version string (empty string for unversioned)
|
|
7583
|
+
* @param removeLibraryIfEmpty Whether to remove the library if no versions remain
|
|
7584
|
+
* @returns Object with counts of deleted documents, version deletion status, and library deletion status
|
|
7585
|
+
*/
|
|
7586
|
+
async removeVersion(library, version2, removeLibraryIfEmpty = true) {
|
|
7587
|
+
try {
|
|
7588
|
+
const normalizedLibrary = library.toLowerCase();
|
|
7589
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7590
|
+
const versionResult = this.statements.getVersionId.get(
|
|
7591
|
+
normalizedLibrary,
|
|
7592
|
+
normalizedVersion
|
|
7593
|
+
);
|
|
7594
|
+
if (!versionResult) {
|
|
7595
|
+
return { documentsDeleted: 0, versionDeleted: false, libraryDeleted: false };
|
|
7596
|
+
}
|
|
7597
|
+
const { id: versionId, library_id: libraryId } = versionResult;
|
|
7598
|
+
const documentsDeleted = await this.deleteDocuments(library, version2);
|
|
7599
|
+
const versionDeleteResult = this.statements.deleteVersionById.run(versionId);
|
|
7600
|
+
const versionDeleted = versionDeleteResult.changes > 0;
|
|
7601
|
+
let libraryDeleted = false;
|
|
7602
|
+
if (removeLibraryIfEmpty && versionDeleted) {
|
|
7603
|
+
const countResult = this.statements.countVersionsByLibraryId.get(libraryId);
|
|
7604
|
+
const remainingVersions = countResult?.count ?? 0;
|
|
7605
|
+
if (remainingVersions === 0) {
|
|
7606
|
+
const libraryDeleteResult = this.statements.deleteLibraryById.run(libraryId);
|
|
7607
|
+
libraryDeleted = libraryDeleteResult.changes > 0;
|
|
7608
|
+
}
|
|
7609
|
+
}
|
|
7610
|
+
return { documentsDeleted, versionDeleted, libraryDeleted };
|
|
7611
|
+
} catch (error) {
|
|
7612
|
+
throw new ConnectionError("Failed to remove version", error);
|
|
7613
|
+
}
|
|
7614
|
+
}
|
|
7615
|
+
/**
|
|
7616
|
+
* Retrieves a document by its ID.
|
|
7617
|
+
* @param id The ID of the document.
|
|
7618
|
+
* @returns The document, or null if not found.
|
|
7619
|
+
*/
|
|
7620
|
+
async getById(id) {
|
|
7621
|
+
try {
|
|
7622
|
+
const row = this.statements.getById.get(BigInt(id));
|
|
7623
|
+
if (!row) {
|
|
7624
|
+
return null;
|
|
7625
|
+
}
|
|
7626
|
+
return mapDbDocumentToDocument(row);
|
|
7627
|
+
} catch (error) {
|
|
7628
|
+
throw new ConnectionError(`Failed to get document by ID ${id}`, error);
|
|
7629
|
+
}
|
|
7630
|
+
}
|
|
7631
|
+
/**
|
|
7632
|
+
* Finds documents matching a text query using hybrid search.
|
|
7633
|
+
* Combines vector similarity search with full-text search using Reciprocal Rank Fusion.
|
|
7634
|
+
*/
|
|
7635
|
+
async findByContent(library, version2, query, limit) {
|
|
7636
|
+
try {
|
|
7637
|
+
const rawEmbedding = await this.embeddings.embedQuery(query);
|
|
7638
|
+
const embedding = this.padVector(rawEmbedding);
|
|
7639
|
+
const ftsQuery = this.escapeFtsQuery(query);
|
|
7640
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7641
|
+
const stmt = this.db.prepare(`
|
|
7642
|
+
WITH vec_distances AS (
|
|
7643
|
+
SELECT
|
|
7644
|
+
dv.rowid as id,
|
|
7645
|
+
dv.distance as vec_distance
|
|
7646
|
+
FROM documents_vec dv
|
|
7647
|
+
JOIN versions v ON dv.version_id = v.id
|
|
7648
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7649
|
+
WHERE l.name = ?
|
|
7650
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7651
|
+
AND dv.embedding MATCH ?
|
|
7652
|
+
AND dv.k = ?
|
|
7653
|
+
ORDER BY dv.distance
|
|
7654
|
+
),
|
|
7655
|
+
fts_scores AS (
|
|
7656
|
+
SELECT
|
|
7657
|
+
f.rowid as id,
|
|
7658
|
+
bm25(documents_fts, 10.0, 1.0, 5.0, 1.0) as fts_score
|
|
7659
|
+
FROM documents_fts f
|
|
7660
|
+
JOIN documents d ON f.rowid = d.id
|
|
7661
|
+
JOIN versions v ON d.version_id = v.id
|
|
7662
|
+
JOIN libraries l ON v.library_id = l.id
|
|
7663
|
+
WHERE l.name = ?
|
|
7664
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7665
|
+
AND documents_fts MATCH ?
|
|
7666
|
+
ORDER BY fts_score
|
|
7667
|
+
LIMIT ?
|
|
7668
|
+
)
|
|
7669
|
+
SELECT
|
|
7670
|
+
d.id,
|
|
7671
|
+
d.content,
|
|
7672
|
+
d.metadata,
|
|
7673
|
+
COALESCE(1 / (1 + v.vec_distance), 0) as vec_score,
|
|
7674
|
+
COALESCE(-MIN(f.fts_score, 0), 0) as fts_score
|
|
7675
|
+
FROM documents d
|
|
7676
|
+
LEFT JOIN vec_distances v ON d.id = v.id
|
|
7677
|
+
LEFT JOIN fts_scores f ON d.id = f.id
|
|
7678
|
+
WHERE v.id IS NOT NULL OR f.id IS NOT NULL
|
|
7679
|
+
`);
|
|
7680
|
+
const rawResults = stmt.all(
|
|
7681
|
+
library.toLowerCase(),
|
|
7682
|
+
normalizedVersion,
|
|
7683
|
+
JSON.stringify(embedding),
|
|
7684
|
+
limit,
|
|
7685
|
+
library.toLowerCase(),
|
|
7686
|
+
normalizedVersion,
|
|
7687
|
+
ftsQuery,
|
|
7688
|
+
// Use the escaped query
|
|
7689
|
+
limit
|
|
7690
|
+
);
|
|
7691
|
+
const rankedResults = this.assignRanks(rawResults);
|
|
7692
|
+
const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
|
|
7693
|
+
return topResults.map((row) => ({
|
|
7694
|
+
...mapDbDocumentToDocument(row),
|
|
7695
|
+
metadata: {
|
|
7696
|
+
...JSON.parse(row.metadata),
|
|
7697
|
+
id: row.id,
|
|
7698
|
+
score: row.rrf_score,
|
|
7699
|
+
vec_rank: row.vec_rank,
|
|
7700
|
+
fts_rank: row.fts_rank
|
|
7701
|
+
}
|
|
7702
|
+
}));
|
|
7703
|
+
} catch (error) {
|
|
7704
|
+
throw new ConnectionError(
|
|
7705
|
+
`Failed to find documents by content with query "${query}"`,
|
|
7706
|
+
error
|
|
7707
|
+
);
|
|
7708
|
+
}
|
|
7709
|
+
}
|
|
7710
|
+
/**
|
|
7711
|
+
* Finds child chunks of a given document based on path hierarchy.
|
|
7712
|
+
*/
|
|
7713
|
+
async findChildChunks(library, version2, id, limit) {
|
|
7714
|
+
try {
|
|
7715
|
+
const parent = await this.getById(id);
|
|
7716
|
+
if (!parent) {
|
|
7717
|
+
return [];
|
|
7718
|
+
}
|
|
7719
|
+
const parentPath = parent.metadata.path ?? [];
|
|
7720
|
+
const parentUrl = parent.metadata.url;
|
|
7721
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7722
|
+
const result = this.statements.getChildChunks.all(
|
|
7723
|
+
library.toLowerCase(),
|
|
7724
|
+
normalizedVersion,
|
|
7725
|
+
parentUrl,
|
|
7726
|
+
parentPath.length + 1,
|
|
7727
|
+
JSON.stringify(parentPath),
|
|
7728
|
+
BigInt(id),
|
|
7729
|
+
limit
|
|
7730
|
+
);
|
|
7731
|
+
return result.map((row) => mapDbDocumentToDocument(row));
|
|
7732
|
+
} catch (error) {
|
|
7733
|
+
throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
|
|
7734
|
+
}
|
|
7735
|
+
}
|
|
7736
|
+
/**
|
|
7737
|
+
* Finds preceding sibling chunks of a given document.
|
|
7738
|
+
*/
|
|
7739
|
+
async findPrecedingSiblingChunks(library, version2, id, limit) {
|
|
7740
|
+
try {
|
|
7741
|
+
const reference = await this.getById(id);
|
|
7742
|
+
if (!reference) {
|
|
7743
|
+
return [];
|
|
7744
|
+
}
|
|
7745
|
+
const refMetadata = reference.metadata;
|
|
7746
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7747
|
+
const result = this.statements.getPrecedingSiblings.all(
|
|
7748
|
+
library.toLowerCase(),
|
|
7749
|
+
normalizedVersion,
|
|
7750
|
+
refMetadata.url,
|
|
7751
|
+
BigInt(id),
|
|
7752
|
+
JSON.stringify(refMetadata.path),
|
|
7753
|
+
limit
|
|
7754
|
+
);
|
|
7755
|
+
return result.reverse().map((row) => mapDbDocumentToDocument(row));
|
|
7756
|
+
} catch (error) {
|
|
7757
|
+
throw new ConnectionError(
|
|
7758
|
+
`Failed to find preceding sibling chunks for ID ${id}`,
|
|
7759
|
+
error
|
|
7760
|
+
);
|
|
7761
|
+
}
|
|
7762
|
+
}
|
|
7763
|
+
/**
|
|
7764
|
+
* Finds subsequent sibling chunks of a given document.
|
|
7765
|
+
*/
|
|
7766
|
+
async findSubsequentSiblingChunks(library, version2, id, limit) {
|
|
7767
|
+
try {
|
|
7768
|
+
const reference = await this.getById(id);
|
|
7769
|
+
if (!reference) {
|
|
7770
|
+
return [];
|
|
7771
|
+
}
|
|
7772
|
+
const refMetadata = reference.metadata;
|
|
7773
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7774
|
+
const result = this.statements.getSubsequentSiblings.all(
|
|
7775
|
+
library.toLowerCase(),
|
|
7776
|
+
normalizedVersion,
|
|
7777
|
+
refMetadata.url,
|
|
7778
|
+
BigInt(id),
|
|
7779
|
+
JSON.stringify(refMetadata.path),
|
|
7780
|
+
limit
|
|
7781
|
+
);
|
|
7782
|
+
return result.map((row) => mapDbDocumentToDocument(row));
|
|
7783
|
+
} catch (error) {
|
|
7784
|
+
throw new ConnectionError(
|
|
7785
|
+
`Failed to find subsequent sibling chunks for ID ${id}`,
|
|
7786
|
+
error
|
|
7787
|
+
);
|
|
7788
|
+
}
|
|
7789
|
+
}
|
|
7790
|
+
/**
|
|
7791
|
+
* Finds the parent chunk of a given document.
|
|
7792
|
+
*/
|
|
7793
|
+
async findParentChunk(library, version2, id) {
|
|
7794
|
+
try {
|
|
7795
|
+
const child = await this.getById(id);
|
|
7796
|
+
if (!child) {
|
|
7797
|
+
return null;
|
|
7798
|
+
}
|
|
7799
|
+
const childMetadata = child.metadata;
|
|
7800
|
+
const path2 = childMetadata.path ?? [];
|
|
7801
|
+
const parentPath = path2.slice(0, -1);
|
|
7802
|
+
if (parentPath.length === 0) {
|
|
7803
|
+
return null;
|
|
7804
|
+
}
|
|
7805
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7806
|
+
const result = this.statements.getParentChunk.get(
|
|
7807
|
+
library.toLowerCase(),
|
|
7808
|
+
normalizedVersion,
|
|
7809
|
+
childMetadata.url,
|
|
7810
|
+
JSON.stringify(parentPath),
|
|
7811
|
+
BigInt(id)
|
|
7812
|
+
);
|
|
7813
|
+
if (!result) {
|
|
7814
|
+
return null;
|
|
7815
|
+
}
|
|
7816
|
+
return mapDbDocumentToDocument(result);
|
|
7817
|
+
} catch (error) {
|
|
7818
|
+
throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error);
|
|
7819
|
+
}
|
|
7820
|
+
}
|
|
7821
|
+
/**
|
|
7822
|
+
* Fetches multiple documents by their IDs in a single call.
|
|
7823
|
+
* Returns an array of Document objects, sorted by their sort_order.
|
|
7824
|
+
*/
|
|
7825
|
+
async findChunksByIds(library, version2, ids) {
|
|
7826
|
+
if (!ids.length) return [];
|
|
7827
|
+
try {
|
|
7828
|
+
const normalizedVersion = version2.toLowerCase();
|
|
7829
|
+
const placeholders = ids.map(() => "?").join(",");
|
|
7830
|
+
const stmt = this.db.prepare(
|
|
7831
|
+
`SELECT d.* FROM documents d
|
|
7832
|
+
JOIN libraries l ON d.library_id = l.id
|
|
7833
|
+
JOIN versions v ON d.version_id = v.id
|
|
7834
|
+
WHERE l.name = ?
|
|
7835
|
+
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7836
|
+
AND d.id IN (${placeholders})
|
|
7837
|
+
ORDER BY d.sort_order`
|
|
7838
|
+
);
|
|
7839
|
+
const rows = stmt.all(
|
|
7840
|
+
library.toLowerCase(),
|
|
7841
|
+
normalizedVersion,
|
|
7842
|
+
...ids
|
|
7843
|
+
);
|
|
7844
|
+
return rows.map((row) => mapDbDocumentToDocument(row));
|
|
7845
|
+
} catch (error) {
|
|
7846
|
+
throw new ConnectionError("Failed to fetch documents by IDs", error);
|
|
7847
|
+
}
|
|
7848
|
+
}
|
|
7849
|
+
}
|
|
7850
|
+
class DocumentManagementService {
|
|
7851
|
+
store;
|
|
7852
|
+
documentRetriever;
|
|
7853
|
+
splitter;
|
|
7854
|
+
/**
|
|
7855
|
+
* Normalizes a version string, converting null or undefined to an empty string
|
|
7856
|
+
* and converting to lowercase.
|
|
7857
|
+
*/
|
|
7858
|
+
normalizeVersion(version2) {
|
|
7859
|
+
return (version2 ?? "").toLowerCase();
|
|
7860
|
+
}
|
|
7861
|
+
constructor(embeddingConfig) {
|
|
7862
|
+
let dbPath;
|
|
7863
|
+
let dbDir;
|
|
7864
|
+
const envStorePath = process.env.DOCS_MCP_STORE_PATH;
|
|
7865
|
+
if (envStorePath) {
|
|
7866
|
+
dbDir = envStorePath;
|
|
7867
|
+
dbPath = path.join(dbDir, "documents.db");
|
|
7868
|
+
logger.debug(`Using database directory from DOCS_MCP_STORE_PATH: ${dbDir}`);
|
|
7869
|
+
} else {
|
|
7870
|
+
const projectRoot2 = getProjectRoot();
|
|
7871
|
+
const oldDbDir = path.join(projectRoot2, ".store");
|
|
7872
|
+
const oldDbPath = path.join(oldDbDir, "documents.db");
|
|
7873
|
+
const oldDbExists = fs.existsSync(oldDbPath);
|
|
7874
|
+
if (oldDbExists) {
|
|
7875
|
+
dbPath = oldDbPath;
|
|
7876
|
+
dbDir = oldDbDir;
|
|
7877
|
+
logger.debug(`Using legacy database path: ${dbPath}`);
|
|
7878
|
+
} else {
|
|
7879
|
+
const standardPaths = envPaths("docs-mcp-server", { suffix: "" });
|
|
7880
|
+
dbDir = standardPaths.data;
|
|
7881
|
+
dbPath = path.join(dbDir, "documents.db");
|
|
7882
|
+
logger.debug(`Using standard database directory: ${dbDir}`);
|
|
7883
|
+
}
|
|
7884
|
+
}
|
|
7885
|
+
try {
|
|
7886
|
+
fs.mkdirSync(dbDir, { recursive: true });
|
|
7887
|
+
} catch (error) {
|
|
7888
|
+
logger.error(`⚠️ Failed to create database directory ${dbDir}: ${error}`);
|
|
7889
|
+
}
|
|
7890
|
+
this.store = new DocumentStore(dbPath, embeddingConfig);
|
|
7891
|
+
this.documentRetriever = new DocumentRetrieverService(this.store);
|
|
7892
|
+
const semanticSplitter = new SemanticMarkdownSplitter(
|
|
7893
|
+
SPLITTER_PREFERRED_CHUNK_SIZE,
|
|
7894
|
+
SPLITTER_MAX_CHUNK_SIZE
|
|
7895
|
+
);
|
|
7896
|
+
const greedySplitter = new GreedySplitter(
|
|
7897
|
+
semanticSplitter,
|
|
7898
|
+
SPLITTER_MIN_CHUNK_SIZE,
|
|
7899
|
+
SPLITTER_PREFERRED_CHUNK_SIZE
|
|
7900
|
+
);
|
|
7901
|
+
this.splitter = greedySplitter;
|
|
7902
|
+
}
|
|
7903
|
+
/**
|
|
7904
|
+
* Initializes the underlying document store.
|
|
7905
|
+
*/
|
|
7906
|
+
async initialize() {
|
|
7907
|
+
await this.store.initialize();
|
|
7908
|
+
}
|
|
7909
|
+
/**
|
|
7910
|
+
* Shuts down the underlying document store.
|
|
7911
|
+
*/
|
|
7912
|
+
async shutdown() {
|
|
7913
|
+
logger.debug("Shutting down store manager");
|
|
7914
|
+
await this.store.shutdown();
|
|
7915
|
+
}
|
|
7916
|
+
// Status tracking methods for pipeline integration
|
|
7917
|
+
/**
|
|
7918
|
+
* Gets versions by their current status.
|
|
7919
|
+
*/
|
|
7920
|
+
async getVersionsByStatus(statuses) {
|
|
7921
|
+
return this.store.getVersionsByStatus(statuses);
|
|
7922
|
+
}
|
|
7923
|
+
/**
|
|
7924
|
+
* Updates the status of a version.
|
|
7925
|
+
*/
|
|
7926
|
+
async updateVersionStatus(versionId, status, errorMessage) {
|
|
7927
|
+
return this.store.updateVersionStatus(versionId, status, errorMessage);
|
|
7928
|
+
}
|
|
7929
|
+
/**
|
|
7930
|
+
* Updates the progress of a version being indexed.
|
|
7931
|
+
*/
|
|
7932
|
+
async updateVersionProgress(versionId, pages, maxPages) {
|
|
7933
|
+
return this.store.updateVersionProgress(versionId, pages, maxPages);
|
|
7934
|
+
}
|
|
7935
|
+
/**
|
|
7936
|
+
* Stores scraper options for a version to enable reproducible indexing.
|
|
7937
|
+
*/
|
|
7938
|
+
async storeScraperOptions(versionId, options) {
|
|
7939
|
+
return this.store.storeScraperOptions(versionId, options);
|
|
7940
|
+
}
|
|
7941
|
+
/**
 * Retrieves the stored scraper options (scraping configuration) for a version.
 */
|
|
7947
|
+
async getScraperOptions(versionId) {
|
|
7948
|
+
return this.store.getScraperOptions(versionId);
|
|
7949
|
+
}
|
|
7950
|
+
/**
|
|
7951
|
+
* Ensures a library/version exists using a VersionRef and returns version ID.
|
|
7952
|
+
* Delegates to existing ensureLibraryAndVersion for storage.
|
|
7953
|
+
*/
|
|
7954
|
+
async ensureVersion(ref) {
|
|
7955
|
+
const normalized = {
|
|
7956
|
+
library: ref.library.trim().toLowerCase(),
|
|
7957
|
+
version: (ref.version ?? "").trim().toLowerCase()
|
|
7958
|
+
};
|
|
7959
|
+
return this.ensureLibraryAndVersion(normalized.library, normalized.version);
|
|
7960
|
+
}
|
|
7961
|
+
/**
|
|
7962
|
+
* Returns enriched library summaries including version status/progress and counts.
|
|
7963
|
+
* Uses existing store APIs; keeps DB details encapsulated.
|
|
7964
|
+
*/
|
|
7965
|
+
async listLibraries() {
|
|
7966
|
+
const libMap = await this.store.queryLibraryVersions();
|
|
7967
|
+
const summaries = [];
|
|
7968
|
+
for (const [library, versions] of libMap) {
|
|
7969
|
+
const vs = versions.map(
|
|
7970
|
+
(v) => ({
|
|
7971
|
+
id: v.versionId,
|
|
7972
|
+
ref: { library, version: v.version },
|
|
7973
|
+
status: v.status,
|
|
7974
|
+
// Include progress only while indexing is active; set undefined for COMPLETED
|
|
7975
|
+
progress: v.status === "completed" ? void 0 : { pages: v.progressPages, maxPages: v.progressMaxPages },
|
|
7976
|
+
counts: { documents: v.documentCount, uniqueUrls: v.uniqueUrlCount },
|
|
7977
|
+
indexedAt: v.indexedAt,
|
|
7978
|
+
sourceUrl: v.sourceUrl ?? void 0
|
|
7979
|
+
})
|
|
7980
|
+
);
|
|
7981
|
+
summaries.push({ library, versions: vs });
|
|
7982
|
+
}
|
|
7983
|
+
return summaries;
|
|
7984
|
+
}
|
|
7985
|
+
/**
|
|
7986
|
+
* Finds versions that were indexed from the same source URL.
|
|
7987
|
+
*/
|
|
7988
|
+
async findVersionsBySourceUrl(url) {
|
|
7989
|
+
return this.store.findVersionsBySourceUrl(url);
|
|
7990
|
+
}
|
|
7991
|
+
/**
|
|
7992
|
+
* Validates if a library exists in the store (either versioned or unversioned).
|
|
7993
|
+
* Throws LibraryNotFoundError with suggestions if the library is not found.
|
|
7994
|
+
* @param library The name of the library to validate.
|
|
7995
|
+
* @throws {LibraryNotFoundError} If the library does not exist.
|
|
7996
|
+
*/
|
|
7997
|
+
async validateLibraryExists(library) {
|
|
7998
|
+
logger.info(`🔎 Validating existence of library: ${library}`);
|
|
7999
|
+
const normalizedLibrary = library.toLowerCase();
|
|
8000
|
+
const versions = await this.listVersions(normalizedLibrary);
|
|
8001
|
+
const hasUnversioned = await this.exists(normalizedLibrary, "");
|
|
8002
|
+
if (versions.length === 0 && !hasUnversioned) {
|
|
8003
|
+
logger.warn(`⚠️ Library '${library}' not found.`);
|
|
8004
|
+
const allLibraries = await this.listLibraries();
|
|
8005
|
+
const libraryNames = allLibraries.map((lib) => lib.library);
|
|
8006
|
+
let suggestions = [];
|
|
8007
|
+
if (libraryNames.length > 0) {
|
|
8008
|
+
const fuse = new Fuse(libraryNames, {
|
|
8009
|
+
// Configure fuse.js options if needed (e.g., threshold)
|
|
8010
|
+
// isCaseSensitive: false, // Handled by normalizing library names
|
|
8011
|
+
// includeScore: true,
|
|
8012
|
+
threshold: 0.4
|
|
8013
|
+
// Adjust threshold for desired fuzziness (0=exact, 1=match anything)
|
|
8014
|
+
});
|
|
8015
|
+
const results = fuse.search(normalizedLibrary);
|
|
8016
|
+
suggestions = results.slice(0, 3).map((result) => result.item);
|
|
8017
|
+
logger.info(`🔍 Found suggestions: ${suggestions.join(", ")}`);
|
|
8018
|
+
}
|
|
8019
|
+
throw new LibraryNotFoundError(library, suggestions);
|
|
8020
|
+
}
|
|
8021
|
+
logger.info(`✅ Library '${library}' confirmed to exist.`);
|
|
8022
|
+
}
|
|
8023
|
+
/**
|
|
8024
|
+
* Returns a list of all available semantic versions for a library.
|
|
8025
|
+
*/
|
|
8026
|
+
async listVersions(library) {
|
|
8027
|
+
const versions = await this.store.queryUniqueVersions(library);
|
|
8028
|
+
return versions.filter((v) => semver__default.valid(v));
|
|
8029
|
+
}
|
|
8030
|
+
/**
|
|
8031
|
+
* Checks if documents exist for a given library and optional version.
|
|
8032
|
+
* If version is omitted, checks for documents without a specific version.
|
|
5277
8033
|
*/
|
|
5278
|
-
|
|
5279
|
-
|
|
5280
|
-
|
|
5281
|
-
|
|
5282
|
-
|
|
5283
|
-
|
|
5284
|
-
|
|
5285
|
-
|
|
5286
|
-
|
|
5287
|
-
|
|
5288
|
-
|
|
5289
|
-
|
|
5290
|
-
|
|
5291
|
-
|
|
5292
|
-
|
|
5293
|
-
|
|
8034
|
+
async exists(library, version2) {
|
|
8035
|
+
const normalizedVersion = this.normalizeVersion(version2);
|
|
8036
|
+
return this.store.checkDocumentExists(library, normalizedVersion);
|
|
8037
|
+
}
|
|
8038
|
+
/**
|
|
8039
|
+
* Finds the most appropriate version of documentation based on the requested version.
|
|
8040
|
+
* When no target version is specified, returns the latest version.
|
|
8041
|
+
*
|
|
8042
|
+
* Version matching behavior:
|
|
8043
|
+
* - Exact versions (e.g., "18.0.0"): Matches that version or any earlier version
|
|
8044
|
+
* - X-Range patterns (e.g., "5.x", "5.2.x"): Matches within the specified range
|
|
8045
|
+
* - "latest" or no version: Returns the latest available version
|
|
8046
|
+
*
|
|
8047
|
+
* For documentation, we prefer matching older versions over no match at all,
|
|
8048
|
+
* since older docs are often still relevant and useful.
|
|
8049
|
+
* Also checks if unversioned documents exist for the library.
|
|
8050
|
+
*/
|
|
8051
|
+
async findBestVersion(library, targetVersion) {
|
|
8052
|
+
const libraryAndVersion = `${library}${targetVersion ? `@${targetVersion}` : ""}`;
|
|
8053
|
+
logger.info(`🔍 Finding best version for ${libraryAndVersion}`);
|
|
8054
|
+
const hasUnversioned = await this.store.checkDocumentExists(library, "");
|
|
8055
|
+
const versionStrings = await this.listVersions(library);
|
|
8056
|
+
if (versionStrings.length === 0) {
|
|
8057
|
+
if (hasUnversioned) {
|
|
8058
|
+
logger.info(`ℹ️ Unversioned documents exist for ${library}`);
|
|
8059
|
+
return { bestMatch: null, hasUnversioned: true };
|
|
8060
|
+
}
|
|
8061
|
+
logger.warn(`⚠️ No valid versions found for ${library}`);
|
|
8062
|
+
const allLibraryDetails = await this.store.queryLibraryVersions();
|
|
8063
|
+
const libraryDetails = allLibraryDetails.get(library) ?? [];
|
|
8064
|
+
throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
|
|
8065
|
+
}
|
|
8066
|
+
let bestMatch = null;
|
|
8067
|
+
if (!targetVersion || targetVersion === "latest") {
|
|
8068
|
+
bestMatch = semver__default.maxSatisfying(versionStrings, "*");
|
|
8069
|
+
} else {
|
|
8070
|
+
const versionRegex = /^(\d+)(?:\.(?:x(?:\.x)?|\d+(?:\.(?:x|\d+))?))?$|^$/;
|
|
8071
|
+
if (!versionRegex.test(targetVersion)) {
|
|
8072
|
+
logger.warn(`⚠️ Invalid target version format: ${targetVersion}`);
|
|
8073
|
+
} else {
|
|
8074
|
+
let range = targetVersion;
|
|
8075
|
+
if (!semver__default.validRange(targetVersion)) {
|
|
8076
|
+
range = `~${targetVersion}`;
|
|
8077
|
+
} else if (semver__default.valid(targetVersion)) {
|
|
8078
|
+
range = `${range} || <=${targetVersion}`;
|
|
8079
|
+
}
|
|
8080
|
+
bestMatch = semver__default.maxSatisfying(versionStrings, range);
|
|
8081
|
+
}
|
|
5294
8082
|
}
|
|
5295
|
-
|
|
5296
|
-
logger.info(
|
|
8083
|
+
if (bestMatch) {
|
|
8084
|
+
logger.info(`✅ Found best match version ${bestMatch} for ${libraryAndVersion}`);
|
|
8085
|
+
} else {
|
|
8086
|
+
logger.warn(`⚠️ No matching semver version found for ${libraryAndVersion}`);
|
|
5297
8087
|
}
|
|
5298
|
-
|
|
5299
|
-
|
|
5300
|
-
|
|
5301
|
-
|
|
5302
|
-
await appServer.start();
|
|
5303
|
-
return appServer;
|
|
5304
|
-
}
|
|
5305
|
-
async function startStdioServer(tools, readOnly = false) {
|
|
5306
|
-
setLogLevel(LogLevel.ERROR);
|
|
5307
|
-
const server = createMcpServerInstance(tools, readOnly);
|
|
5308
|
-
const transport = new StdioServerTransport();
|
|
5309
|
-
await server.connect(transport);
|
|
5310
|
-
logger.info("🤖 MCP server listening on stdio");
|
|
5311
|
-
return server;
|
|
5312
|
-
}
|
|
5313
|
-
class StoreError extends Error {
|
|
5314
|
-
constructor(message, cause) {
|
|
5315
|
-
super(cause ? `${message} caused by ${cause}` : message);
|
|
5316
|
-
this.cause = cause;
|
|
5317
|
-
this.name = this.constructor.name;
|
|
5318
|
-
const causeError = cause instanceof Error ? cause : cause ? new Error(String(cause)) : void 0;
|
|
5319
|
-
if (causeError?.stack) {
|
|
5320
|
-
this.stack = causeError.stack;
|
|
8088
|
+
if (!bestMatch && !hasUnversioned) {
|
|
8089
|
+
const allLibraryDetails = await this.store.queryLibraryVersions();
|
|
8090
|
+
const libraryDetails = allLibraryDetails.get(library) ?? [];
|
|
8091
|
+
throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
|
|
5321
8092
|
}
|
|
8093
|
+
return { bestMatch, hasUnversioned };
|
|
5322
8094
|
}
|
|
5323
|
-
|
|
5324
|
-
|
|
5325
|
-
|
|
5326
|
-
|
|
5327
|
-
|
|
5328
|
-
);
|
|
5329
|
-
|
|
5330
|
-
|
|
5331
|
-
this.dbDimension = dbDimension;
|
|
5332
|
-
}
|
|
5333
|
-
}
|
|
5334
|
-
class ConnectionError extends StoreError {
|
|
5335
|
-
}
|
|
5336
|
-
const MIGRATIONS_DIR = path.join(getProjectRoot(), "db", "migrations");
|
|
5337
|
-
const MIGRATIONS_TABLE = "_schema_migrations";
|
|
5338
|
-
function ensureMigrationsTable(db) {
|
|
5339
|
-
db.exec(`
|
|
5340
|
-
CREATE TABLE IF NOT EXISTS ${MIGRATIONS_TABLE} (
|
|
5341
|
-
id TEXT PRIMARY KEY,
|
|
5342
|
-
applied_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
|
8095
|
+
/**
|
|
8096
|
+
* Removes all documents for a specific library and optional version.
|
|
8097
|
+
* If version is omitted, removes documents without a specific version.
|
|
8098
|
+
*/
|
|
8099
|
+
async removeAllDocuments(library, version2) {
|
|
8100
|
+
const normalizedVersion = this.normalizeVersion(version2);
|
|
8101
|
+
logger.info(
|
|
8102
|
+
`🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
|
|
5343
8103
|
);
|
|
5344
|
-
|
|
5345
|
-
}
|
|
5346
|
-
function getAppliedMigrations(db) {
|
|
5347
|
-
const stmt = db.prepare(`SELECT id FROM ${MIGRATIONS_TABLE}`);
|
|
5348
|
-
const rows = stmt.all();
|
|
5349
|
-
return new Set(rows.map((row) => row.id));
|
|
5350
|
-
}
|
|
5351
|
-
async function applyMigrations(db) {
|
|
5352
|
-
try {
|
|
5353
|
-
db.pragma("journal_mode = OFF");
|
|
5354
|
-
db.pragma("synchronous = OFF");
|
|
5355
|
-
db.pragma("mmap_size = 268435456");
|
|
5356
|
-
db.pragma("cache_size = -64000");
|
|
5357
|
-
db.pragma("temp_store = MEMORY");
|
|
5358
|
-
logger.debug("Applied performance optimizations for migration");
|
|
5359
|
-
} catch (_error) {
|
|
5360
|
-
logger.warn("⚠️ Could not apply all performance optimizations for migration");
|
|
8104
|
+
const count = await this.store.deleteDocuments(library, normalizedVersion);
|
|
8105
|
+
logger.info(`🗑️ Deleted ${count} documents`);
|
|
5361
8106
|
}
|
|
5362
|
-
|
|
5363
|
-
|
|
5364
|
-
|
|
5365
|
-
|
|
5366
|
-
|
|
5367
|
-
|
|
5368
|
-
|
|
5369
|
-
const
|
|
5370
|
-
|
|
5371
|
-
|
|
8107
|
+
/**
|
|
8108
|
+
* Completely removes a library version and all associated documents.
|
|
8109
|
+
* Also removes the library if no other versions remain.
|
|
8110
|
+
* @param library Library name
|
|
8111
|
+
* @param version Version string (null/undefined for unversioned)
|
|
8112
|
+
*/
|
|
8113
|
+
async removeVersion(library, version2) {
|
|
8114
|
+
const normalizedVersion = this.normalizeVersion(version2);
|
|
8115
|
+
logger.info(`🗑️ Removing version: ${library}@${normalizedVersion || "[no version]"}`);
|
|
8116
|
+
const result = await this.store.removeVersion(library, normalizedVersion, true);
|
|
8117
|
+
logger.info(
|
|
8118
|
+
`🗑️ Removed ${result.documentsDeleted} documents, version: ${result.versionDeleted}, library: ${result.libraryDeleted}`
|
|
5372
8119
|
);
|
|
5373
|
-
if (
|
|
5374
|
-
logger.info(
|
|
8120
|
+
if (result.versionDeleted && result.libraryDeleted) {
|
|
8121
|
+
logger.info(`✅ Completely removed library ${library} (was last version)`);
|
|
8122
|
+
} else if (result.versionDeleted) {
|
|
8123
|
+
logger.info(`✅ Removed version ${library}@${normalizedVersion || "[no version]"}`);
|
|
8124
|
+
} else {
|
|
8125
|
+
logger.warn(
|
|
8126
|
+
`⚠️ Version ${library}@${normalizedVersion || "[no version]"} not found`
|
|
8127
|
+
);
|
|
5375
8128
|
}
|
|
5376
|
-
|
|
5377
|
-
|
|
5378
|
-
|
|
5379
|
-
|
|
5380
|
-
|
|
5381
|
-
|
|
5382
|
-
|
|
5383
|
-
|
|
5384
|
-
|
|
5385
|
-
|
|
5386
|
-
|
|
5387
|
-
|
|
5388
|
-
|
|
5389
|
-
throw new StoreError(`Migration failed: ${filename}`, error);
|
|
5390
|
-
}
|
|
8129
|
+
}
|
|
8130
|
+
/**
|
|
8131
|
+
* Adds a document to the store, splitting it into smaller chunks for better search results.
|
|
8132
|
+
* Uses SemanticMarkdownSplitter to maintain markdown structure and content types during splitting.
|
|
8133
|
+
* Preserves hierarchical structure of documents and distinguishes between text and code segments.
|
|
8134
|
+
* If version is omitted, the document is added without a specific version.
|
|
8135
|
+
*/
|
|
8136
|
+
async addDocument(library, version2, document) {
|
|
8137
|
+
const processingStart = performance.now();
|
|
8138
|
+
const normalizedVersion = this.normalizeVersion(version2);
|
|
8139
|
+
const url = document.metadata.url;
|
|
8140
|
+
if (!url || typeof url !== "string" || !url.trim()) {
|
|
8141
|
+
throw new StoreError("Document metadata must include a valid URL");
|
|
5391
8142
|
}
|
|
5392
|
-
|
|
5393
|
-
|
|
5394
|
-
|
|
5395
|
-
logger.debug("Database schema is up to date");
|
|
8143
|
+
logger.info(`📚 Adding document: ${document.metadata.title}`);
|
|
8144
|
+
if (!document.pageContent.trim()) {
|
|
8145
|
+
throw new Error("Document content cannot be empty");
|
|
5396
8146
|
}
|
|
5397
|
-
return appliedCount;
|
|
5398
|
-
});
|
|
5399
|
-
let retries = 0;
|
|
5400
|
-
let appliedMigrationsCount = 0;
|
|
5401
|
-
while (true) {
|
|
5402
8147
|
try {
|
|
5403
|
-
|
|
5404
|
-
|
|
5405
|
-
|
|
5406
|
-
|
|
5407
|
-
|
|
5408
|
-
|
|
5409
|
-
|
|
5410
|
-
db.exec("VACUUM");
|
|
5411
|
-
logger.debug("Database vacuum completed successfully");
|
|
5412
|
-
} catch (error) {
|
|
5413
|
-
logger.warn(`⚠️ Could not vacuum database after migrations: ${error}`);
|
|
8148
|
+
const chunks = await this.splitter.splitText(document.pageContent);
|
|
8149
|
+
const splitDocs = chunks.map((chunk) => ({
|
|
8150
|
+
pageContent: chunk.content,
|
|
8151
|
+
metadata: {
|
|
8152
|
+
...document.metadata,
|
|
8153
|
+
level: chunk.section.level,
|
|
8154
|
+
path: chunk.section.path
|
|
5414
8155
|
}
|
|
5415
|
-
}
|
|
5416
|
-
|
|
5417
|
-
|
|
5418
|
-
|
|
8156
|
+
}));
|
|
8157
|
+
logger.info(`✂️ Split document into ${splitDocs.length} chunks`);
|
|
8158
|
+
await this.store.addDocuments(library, normalizedVersion, splitDocs);
|
|
8159
|
+
const processingTime = performance.now() - processingStart;
|
|
8160
|
+
analytics.track(TelemetryEvent.DOCUMENT_PROCESSED, {
|
|
8161
|
+
// Content characteristics (privacy-safe)
|
|
8162
|
+
mimeType: document.metadata.mimeType,
|
|
8163
|
+
contentSizeBytes: document.pageContent.length,
|
|
8164
|
+
// Processing metrics
|
|
8165
|
+
processingTimeMs: Math.round(processingTime),
|
|
8166
|
+
chunksCreated: splitDocs.length,
|
|
8167
|
+
// Document characteristics
|
|
8168
|
+
hasTitle: !!document.metadata.title,
|
|
8169
|
+
hasDescription: !!document.metadata.description,
|
|
8170
|
+
urlDomain: extractHostname(url),
|
|
8171
|
+
depth: document.metadata.depth,
|
|
8172
|
+
// Library context
|
|
8173
|
+
library,
|
|
8174
|
+
libraryVersion: normalizedVersion || null,
|
|
8175
|
+
// Processing efficiency
|
|
8176
|
+
avgChunkSizeBytes: Math.round(document.pageContent.length / splitDocs.length),
|
|
8177
|
+
processingSpeedKbPerSec: Math.round(
|
|
8178
|
+
document.pageContent.length / 1024 / (processingTime / 1e3)
|
|
8179
|
+
)
|
|
8180
|
+
});
|
|
5419
8181
|
} catch (error) {
|
|
5420
|
-
|
|
5421
|
-
|
|
5422
|
-
|
|
5423
|
-
|
|
5424
|
-
|
|
5425
|
-
|
|
5426
|
-
|
|
5427
|
-
|
|
5428
|
-
|
|
5429
|
-
|
|
5430
|
-
|
|
5431
|
-
}
|
|
5432
|
-
if (error instanceof StoreError) {
|
|
5433
|
-
throw error;
|
|
5434
|
-
}
|
|
5435
|
-
throw new StoreError("Failed during migration process", error);
|
|
8182
|
+
const processingTime = performance.now() - processingStart;
|
|
8183
|
+
if (error instanceof Error) {
|
|
8184
|
+
analytics.captureException(error, {
|
|
8185
|
+
mimeType: document.metadata.mimeType,
|
|
8186
|
+
contentSizeBytes: document.pageContent.length,
|
|
8187
|
+
processingTimeMs: Math.round(processingTime),
|
|
8188
|
+
library,
|
|
8189
|
+
libraryVersion: normalizedVersion || null,
|
|
8190
|
+
context: "document_processing",
|
|
8191
|
+
component: DocumentManagementService.constructor.name
|
|
8192
|
+
});
|
|
5436
8193
|
}
|
|
8194
|
+
throw error;
|
|
5437
8195
|
}
|
|
5438
8196
|
}
|
|
5439
|
-
|
|
5440
|
-
|
|
5441
|
-
|
|
5442
|
-
|
|
5443
|
-
|
|
5444
|
-
|
|
5445
|
-
|
|
5446
|
-
|
|
8197
|
+
/**
|
|
8198
|
+
* Searches for documentation content across versions.
|
|
8199
|
+
* Uses hybrid search (vector + FTS).
|
|
8200
|
+
* If version is omitted, searches documents without a specific version.
|
|
8201
|
+
*/
|
|
8202
|
+
async searchStore(library, version2, query, limit = 5) {
|
|
8203
|
+
const normalizedVersion = this.normalizeVersion(version2);
|
|
8204
|
+
return this.documentRetriever.search(library, normalizedVersion, query, limit);
|
|
8205
|
+
}
|
|
8206
|
+
// Deprecated simple listing removed: enriched listLibraries() is canonical
|
|
8207
|
+
/**
|
|
8208
|
+
* Ensures a library and version exist in the database and returns the version ID.
|
|
8209
|
+
* Creates the library and version records if they don't exist.
|
|
8210
|
+
*/
|
|
8211
|
+
async ensureLibraryAndVersion(library, version2) {
|
|
8212
|
+
const normalizedLibrary = library.toLowerCase();
|
|
8213
|
+
const normalizedVersion = this.normalizeVersion(version2);
|
|
8214
|
+
const { versionId } = await this.store.resolveLibraryAndVersionIds(
|
|
8215
|
+
normalizedLibrary,
|
|
8216
|
+
normalizedVersion
|
|
5447
8217
|
);
|
|
5448
|
-
|
|
5449
|
-
logger.warn("⚠️ Could not apply all production database settings");
|
|
8218
|
+
return versionId;
|
|
5450
8219
|
}
|
|
5451
8220
|
}
|
|
5452
8221
|
async function createDocumentManagement(options = {}) {
  const { serverUrl } = options;
  // Without a server URL the documents are managed by a local service.
  if (!serverUrl) {
    const localService = new DocumentManagementService(options.embeddingConfig);
    await localService.initialize();
    return localService;
  }
  // Otherwise a client for the given server URL is created and initialized.
  const remoteClient = new DocumentManagementClient(serverUrl);
  await remoteClient.initialize();
  return remoteClient;
}
|
|
5463
|
-
async function createLocalDocumentManagement() {
|
|
5464
|
-
const service = new
|
|
8231
|
+
async function createLocalDocumentManagement(embeddingConfig) {
  // Always builds the local service; the instance is returned only after initialize() resolves.
  const localService = new DocumentManagementService(embeddingConfig);
  await localService.initialize();
  return localService;
}
|
|
@@ -6991,6 +9759,17 @@ function warnHttpUsage(authConfig, port) {
|
|
|
6991
9759
|
);
|
|
6992
9760
|
}
|
|
6993
9761
|
}
|
|
9762
|
+
function resolveEmbeddingContext(cliArgs) {
  let resolved = null;
  try {
    // The CLI value wins; an absent or empty one falls back to the environment variable.
    const requestedModel = cliArgs?.embeddingModel || process.env.DOCS_MCP_EMBEDDING_MODEL;
    logger.debug("Resolving embedding configuration");
    resolved = EmbeddingConfig.parseEmbeddingConfig(requestedModel);
  } catch (error) {
    // Failures are logged at debug level only; the function then returns null.
    logger.debug(`Failed to resolve embedding configuration: ${error}`);
  }
  return resolved;
}
|
|
6994
9773
|
function createDefaultAction(program) {
|
|
6995
9774
|
return program.addOption(
|
|
6996
9775
|
new Option("--protocol <protocol>", "Protocol for MCP server").choices(["auto", "stdio", "http"]).default("auto")
|
|
@@ -7030,7 +9809,8 @@ function createDefaultAction(program) {
|
|
|
7030
9809
|
warnHttpUsage(authConfig, port);
|
|
7031
9810
|
}
|
|
7032
9811
|
ensurePlaywrightBrowsersInstalled();
|
|
7033
|
-
const
|
|
9812
|
+
const embeddingConfig = resolveEmbeddingContext();
|
|
9813
|
+
const docService = await createLocalDocumentManagement(embeddingConfig);
|
|
7034
9814
|
const pipelineOptions = {
|
|
7035
9815
|
recoverJobs: options.resume || false,
|
|
7036
9816
|
// Use --resume flag for job recovery
|
|
@@ -7071,24 +9851,12 @@ async function fetchUrlAction(url, options, command) {
|
|
|
7071
9851
|
setupLogging(globalOptions);
|
|
7072
9852
|
const headers = parseHeaders(options.header);
|
|
7073
9853
|
const fetchUrlTool = new FetchUrlTool(new HttpFetcher(), new FileFetcher());
|
|
7074
|
-
const content = await
|
|
7075
|
-
|
|
7076
|
-
|
|
7077
|
-
|
|
7078
|
-
|
|
7079
|
-
|
|
7080
|
-
headers: Object.keys(headers).length > 0 ? headers : void 0
|
|
7081
|
-
}),
|
|
7082
|
-
(content2) => ({
|
|
7083
|
-
url_protocol: extractProtocol(url),
|
|
7084
|
-
// Safe: only protocol, not full URL
|
|
7085
|
-
follow_redirects: options.followRedirects,
|
|
7086
|
-
scrape_mode: options.scrapeMode,
|
|
7087
|
-
has_custom_headers: Object.keys(headers).length > 0,
|
|
7088
|
-
content_length: content2.length,
|
|
7089
|
-
cli_flags: extractCliFlags(process.argv)
|
|
7090
|
-
})
|
|
7091
|
-
);
|
|
9854
|
+
const content = await fetchUrlTool.execute({
|
|
9855
|
+
url,
|
|
9856
|
+
followRedirects: options.followRedirects,
|
|
9857
|
+
scrapeMode: options.scrapeMode,
|
|
9858
|
+
headers: Object.keys(headers).length > 0 ? headers : void 0
|
|
9859
|
+
});
|
|
7092
9860
|
console.log(content);
|
|
7093
9861
|
}
|
|
7094
9862
|
function createFetchUrlCommand(program) {
|
|
@@ -7120,25 +9888,16 @@ async function findVersionAction(library, options, command) {
|
|
|
7120
9888
|
const globalOptions = command.parent?.opts() || {};
|
|
7121
9889
|
setupLogging(globalOptions);
|
|
7122
9890
|
const serverUrl = options.serverUrl;
|
|
7123
|
-
const docService = await createDocumentManagement({
|
|
9891
|
+
const docService = await createDocumentManagement({
|
|
9892
|
+
serverUrl,
|
|
9893
|
+
embeddingConfig: serverUrl ? void 0 : null
|
|
9894
|
+
});
|
|
7124
9895
|
try {
|
|
7125
9896
|
const findVersionTool = new FindVersionTool(docService);
|
|
7126
|
-
const versionInfo = await
|
|
7127
|
-
|
|
7128
|
-
|
|
7129
|
-
|
|
7130
|
-
targetVersion: options.version
|
|
7131
|
-
}),
|
|
7132
|
-
(versionInfo2) => ({
|
|
7133
|
-
library,
|
|
7134
|
-
// Safe: library names are public
|
|
7135
|
-
has_target_version: !!options.version,
|
|
7136
|
-
result_type: typeof versionInfo2,
|
|
7137
|
-
// 'string'
|
|
7138
|
-
using_remote_server: !!serverUrl,
|
|
7139
|
-
cli_flags: extractCliFlags(process.argv)
|
|
7140
|
-
})
|
|
7141
|
-
);
|
|
9897
|
+
const versionInfo = await findVersionTool.execute({
|
|
9898
|
+
library,
|
|
9899
|
+
targetVersion: options.version
|
|
9900
|
+
});
|
|
7142
9901
|
if (!versionInfo) throw new Error("Failed to get version information");
|
|
7143
9902
|
console.log(versionInfo);
|
|
7144
9903
|
} finally {
|
|
@@ -7155,18 +9914,13 @@ async function listAction(options, command) {
|
|
|
7155
9914
|
const globalOptions = command.parent?.opts() || {};
|
|
7156
9915
|
setupLogging(globalOptions);
|
|
7157
9916
|
const { serverUrl } = options;
|
|
7158
|
-
const docService = await createDocumentManagement({
|
|
9917
|
+
const docService = await createDocumentManagement({
|
|
9918
|
+
serverUrl,
|
|
9919
|
+
embeddingConfig: serverUrl ? void 0 : null
|
|
9920
|
+
});
|
|
7159
9921
|
try {
|
|
7160
9922
|
const listLibrariesTool = new ListLibrariesTool(docService);
|
|
7161
|
-
const result = await
|
|
7162
|
-
"list_libraries",
|
|
7163
|
-
() => listLibrariesTool.execute(),
|
|
7164
|
-
(result2) => ({
|
|
7165
|
-
library_count: result2.libraries.length,
|
|
7166
|
-
using_remote_server: !!serverUrl,
|
|
7167
|
-
cli_flags: extractCliFlags(process.argv)
|
|
7168
|
-
})
|
|
7169
|
-
);
|
|
9923
|
+
const result = await listLibrariesTool.execute();
|
|
7170
9924
|
console.log(formatOutput(result.libraries));
|
|
7171
9925
|
} finally {
|
|
7172
9926
|
await docService.shutdown();
|
|
@@ -7219,8 +9973,16 @@ function createMcpCommand(program) {
|
|
|
7219
9973
|
validateAuthConfig(authConfig);
|
|
7220
9974
|
}
|
|
7221
9975
|
try {
|
|
9976
|
+
const embeddingConfig = resolveEmbeddingContext();
|
|
9977
|
+
if (!serverUrl && !embeddingConfig) {
|
|
9978
|
+
logger.error(
|
|
9979
|
+
"❌ Embedding configuration is required for local mode. Configure an embedding provider with CLI options or environment variables."
|
|
9980
|
+
);
|
|
9981
|
+
process.exit(1);
|
|
9982
|
+
}
|
|
7222
9983
|
const docService = await createDocumentManagement({
|
|
7223
|
-
serverUrl
|
|
9984
|
+
serverUrl,
|
|
9985
|
+
embeddingConfig
|
|
7224
9986
|
});
|
|
7225
9987
|
const pipelineOptions = {
|
|
7226
9988
|
recoverJobs: false,
|
|
@@ -7270,20 +10032,13 @@ async function removeAction(library, options, command) {
|
|
|
7270
10032
|
const globalOptions = command.parent?.opts() || {};
|
|
7271
10033
|
setupLogging(globalOptions);
|
|
7272
10034
|
const serverUrl = options.serverUrl;
|
|
7273
|
-
const docService = await createDocumentManagement({
|
|
10035
|
+
const docService = await createDocumentManagement({
|
|
10036
|
+
serverUrl,
|
|
10037
|
+
embeddingConfig: serverUrl ? void 0 : null
|
|
10038
|
+
});
|
|
7274
10039
|
const { version: version2 } = options;
|
|
7275
10040
|
try {
|
|
7276
|
-
await
|
|
7277
|
-
"remove_documents",
|
|
7278
|
-
() => docService.removeAllDocuments(library, version2),
|
|
7279
|
-
() => ({
|
|
7280
|
-
library,
|
|
7281
|
-
// Safe: library names are public
|
|
7282
|
-
has_version: !!version2,
|
|
7283
|
-
using_remote_server: !!serverUrl,
|
|
7284
|
-
cli_flags: extractCliFlags(process.argv)
|
|
7285
|
-
})
|
|
7286
|
-
);
|
|
10041
|
+
await docService.removeAllDocuments(library, version2);
|
|
7287
10042
|
console.log(`✅ Successfully removed ${library}${version2 ? `@${version2}` : ""}.`);
|
|
7288
10043
|
} catch (error) {
|
|
7289
10044
|
console.error(
|
|
@@ -7308,7 +10063,16 @@ async function scrapeAction(library, url, options, command) {
|
|
|
7308
10063
|
const globalOptions = command.parent?.opts() || {};
|
|
7309
10064
|
setupLogging(globalOptions);
|
|
7310
10065
|
const serverUrl = options.serverUrl;
|
|
7311
|
-
const
|
|
10066
|
+
const embeddingConfig = resolveEmbeddingContext();
|
|
10067
|
+
if (!serverUrl && !embeddingConfig) {
|
|
10068
|
+
throw new Error(
|
|
10069
|
+
"Embedding configuration is required for local scraping. Please set DOCS_MCP_EMBEDDING_MODEL environment variable or use --server-url for remote execution."
|
|
10070
|
+
);
|
|
10071
|
+
}
|
|
10072
|
+
const docService = await createDocumentManagement({
|
|
10073
|
+
serverUrl,
|
|
10074
|
+
embeddingConfig
|
|
10075
|
+
});
|
|
7312
10076
|
let pipeline = null;
|
|
7313
10077
|
try {
|
|
7314
10078
|
const pipelineOptions = {
|
|
@@ -7323,48 +10087,23 @@ async function scrapeAction(library, url, options, command) {
|
|
|
7323
10087
|
await pipeline.start();
|
|
7324
10088
|
const scrapeTool = new ScrapeTool(pipeline);
|
|
7325
10089
|
const headers = parseHeaders(options.header);
|
|
7326
|
-
const result = await
|
|
7327
|
-
|
|
7328
|
-
|
|
7329
|
-
|
|
7330
|
-
|
|
7331
|
-
|
|
7332
|
-
|
|
7333
|
-
|
|
7334
|
-
|
|
7335
|
-
maxConcurrency: Number.parseInt(options.maxConcurrency, 10),
|
|
7336
|
-
ignoreErrors: options.ignoreErrors,
|
|
7337
|
-
scope: options.scope,
|
|
7338
|
-
followRedirects: options.followRedirects,
|
|
7339
|
-
scrapeMode: options.scrapeMode,
|
|
7340
|
-
includePatterns: Array.isArray(options.includePattern) && options.includePattern.length > 0 ? options.includePattern : void 0,
|
|
7341
|
-
excludePatterns: Array.isArray(options.excludePattern) && options.excludePattern.length > 0 ? options.excludePattern : void 0,
|
|
7342
|
-
headers: Object.keys(headers).length > 0 ? headers : void 0
|
|
7343
|
-
}
|
|
7344
|
-
}),
|
|
7345
|
-
(result2) => ({
|
|
7346
|
-
library,
|
|
7347
|
-
// Safe: library names are public
|
|
7348
|
-
url_protocol: extractProtocol(url),
|
|
7349
|
-
// Safe: only protocol, not full URL
|
|
7350
|
-
max_pages: Number.parseInt(options.maxPages, 10),
|
|
7351
|
-
max_depth: Number.parseInt(options.maxDepth, 10),
|
|
7352
|
-
max_concurrency: Number.parseInt(options.maxConcurrency, 10),
|
|
7353
|
-
has_version: !!options.version,
|
|
10090
|
+
const result = await scrapeTool.execute({
|
|
10091
|
+
url,
|
|
10092
|
+
library,
|
|
10093
|
+
version: options.version,
|
|
10094
|
+
options: {
|
|
10095
|
+
maxPages: Number.parseInt(options.maxPages, 10),
|
|
10096
|
+
maxDepth: Number.parseInt(options.maxDepth, 10),
|
|
10097
|
+
maxConcurrency: Number.parseInt(options.maxConcurrency, 10),
|
|
10098
|
+
ignoreErrors: options.ignoreErrors,
|
|
7354
10099
|
scope: options.scope,
|
|
7355
|
-
|
|
7356
|
-
|
|
7357
|
-
|
|
7358
|
-
|
|
7359
|
-
|
|
7360
|
-
|
|
7361
|
-
|
|
7362
|
-
cli_flags: extractCliFlags(process.argv),
|
|
7363
|
-
is_async_job: !("pagesScraped" in result2),
|
|
7364
|
-
// Pipeline mode vs direct mode
|
|
7365
|
-
pages_scraped: "pagesScraped" in result2 ? result2.pagesScraped : void 0
|
|
7366
|
-
})
|
|
7367
|
-
);
|
|
10100
|
+
followRedirects: options.followRedirects,
|
|
10101
|
+
scrapeMode: options.scrapeMode,
|
|
10102
|
+
includePatterns: Array.isArray(options.includePattern) && options.includePattern.length > 0 ? options.includePattern : void 0,
|
|
10103
|
+
excludePatterns: Array.isArray(options.excludePattern) && options.excludePattern.length > 0 ? options.excludePattern : void 0,
|
|
10104
|
+
headers: Object.keys(headers).length > 0 ? headers : void 0
|
|
10105
|
+
}
|
|
10106
|
+
});
|
|
7368
10107
|
if ("pagesScraped" in result) {
|
|
7369
10108
|
console.log(`✅ Successfully scraped ${result.pagesScraped} pages`);
|
|
7370
10109
|
} else {
|
|
@@ -7443,31 +10182,25 @@ async function searchAction(library, query, options, command) {
|
|
|
7443
10182
|
const globalOptions = command.parent?.opts() || {};
|
|
7444
10183
|
setupLogging(globalOptions);
|
|
7445
10184
|
const serverUrl = options.serverUrl;
|
|
7446
|
-
const
|
|
10185
|
+
const embeddingConfig = resolveEmbeddingContext();
|
|
10186
|
+
if (!serverUrl && !embeddingConfig) {
|
|
10187
|
+
throw new Error(
|
|
10188
|
+
"Embedding configuration is required for local search. Please set DOCS_MCP_EMBEDDING_MODEL environment variable or use --server-url for remote execution."
|
|
10189
|
+
);
|
|
10190
|
+
}
|
|
10191
|
+
const docService = await createDocumentManagement({
|
|
10192
|
+
serverUrl,
|
|
10193
|
+
embeddingConfig
|
|
10194
|
+
});
|
|
7447
10195
|
try {
|
|
7448
10196
|
const searchTool = new SearchTool(docService);
|
|
7449
|
-
const result = await
|
|
7450
|
-
|
|
7451
|
-
|
|
7452
|
-
|
|
7453
|
-
|
|
7454
|
-
|
|
7455
|
-
|
|
7456
|
-
exactMatch: options.exactMatch
|
|
7457
|
-
}),
|
|
7458
|
-
(result2) => ({
|
|
7459
|
-
library,
|
|
7460
|
-
// Safe: library names are public
|
|
7461
|
-
query_analysis: analyzeSearchQuery(query),
|
|
7462
|
-
// Analyzed, not raw query
|
|
7463
|
-
result_count: result2.results.length,
|
|
7464
|
-
limit_used: Number.parseInt(options.limit, 10),
|
|
7465
|
-
has_version_filter: !!options.version,
|
|
7466
|
-
exact_match: options.exactMatch,
|
|
7467
|
-
using_remote_server: !!serverUrl,
|
|
7468
|
-
cli_flags: extractCliFlags(process.argv)
|
|
7469
|
-
})
|
|
7470
|
-
);
|
|
10197
|
+
const result = await searchTool.execute({
|
|
10198
|
+
library,
|
|
10199
|
+
version: options.version,
|
|
10200
|
+
query,
|
|
10201
|
+
limit: Number.parseInt(options.limit, 10),
|
|
10202
|
+
exactMatch: options.exactMatch
|
|
10203
|
+
});
|
|
7471
10204
|
console.log(formatOutput(result.results));
|
|
7472
10205
|
} finally {
|
|
7473
10206
|
await docService.shutdown();
|
|
@@ -7503,8 +10236,16 @@ function createWebCommand(program) {
|
|
|
7503
10236
|
const serverUrl = cmdOptions.serverUrl;
|
|
7504
10237
|
setupLogging(globalOptions);
|
|
7505
10238
|
try {
|
|
10239
|
+
const embeddingConfig = resolveEmbeddingContext();
|
|
10240
|
+
if (!serverUrl && !embeddingConfig) {
|
|
10241
|
+
logger.error(
|
|
10242
|
+
"❌ Embedding configuration is required for local mode. Configure an embedding provider with CLI options or environment variables."
|
|
10243
|
+
);
|
|
10244
|
+
process.exit(1);
|
|
10245
|
+
}
|
|
7506
10246
|
const docService = await createDocumentManagement({
|
|
7507
|
-
serverUrl
|
|
10247
|
+
serverUrl,
|
|
10248
|
+
embeddingConfig
|
|
7508
10249
|
});
|
|
7509
10250
|
const pipelineOptions = {
|
|
7510
10251
|
recoverJobs: false,
|
|
@@ -7553,7 +10294,8 @@ function createWorkerCommand(program) {
|
|
|
7553
10294
|
try {
|
|
7554
10295
|
logger.info(`🚀 Starting external pipeline worker on port ${port}`);
|
|
7555
10296
|
ensurePlaywrightBrowsersInstalled();
|
|
7556
|
-
const
|
|
10297
|
+
const embeddingConfig = resolveEmbeddingContext();
|
|
10298
|
+
const docService = await createLocalDocumentManagement(embeddingConfig);
|
|
7557
10299
|
const pipelineOptions = {
|
|
7558
10300
|
recoverJobs: cmdOptions.resume,
|
|
7559
10301
|
// Use the resume option
|
|
@@ -7581,7 +10323,7 @@ function createCliProgram() {
|
|
|
7581
10323
|
program.name("docs-mcp-server").description("Unified CLI, MCP Server, and Web Interface for Docs MCP Server.").version(packageJson.version).addOption(
|
|
7582
10324
|
new Option("--verbose", "Enable verbose (debug) logging").conflicts("silent")
|
|
7583
10325
|
).addOption(new Option("--silent", "Disable all logging except errors")).addOption(new Option("--no-telemetry", "Disable telemetry collection")).enablePositionalOptions().allowExcessArguments(false).showHelpAfterError(true);
|
|
7584
|
-
program.hook("preAction", (thisCommand, actionCommand) => {
|
|
10326
|
+
program.hook("preAction", async (thisCommand, actionCommand) => {
|
|
7585
10327
|
const globalOptions = thisCommand.opts();
|
|
7586
10328
|
if (globalOptions.silent) setLogLevel(LogLevel.ERROR);
|
|
7587
10329
|
else if (globalOptions.verbose) setLogLevel(LogLevel.DEBUG);
|
|
@@ -7672,7 +10414,11 @@ async function runCli() {
|
|
|
7672
10414
|
});
|
|
7673
10415
|
await program.parseAsync(process.argv);
|
|
7674
10416
|
} catch (error) {
|
|
7675
|
-
|
|
10417
|
+
if (error instanceof ModelConfigurationError || error instanceof UnsupportedProviderError) {
|
|
10418
|
+
logger.error(error.message);
|
|
10419
|
+
} else {
|
|
10420
|
+
logger.error(`❌ Error in CLI: ${error}`);
|
|
10421
|
+
}
|
|
7676
10422
|
if (!isShuttingDown) {
|
|
7677
10423
|
isShuttingDown = true;
|
|
7678
10424
|
const shutdownPromises = [];
|
|
@@ -7722,27 +10468,4 @@ runCli().catch((error) => {
|
|
|
7722
10468
|
console.error(`🔥 Fatal error in main execution: ${error}`);
|
|
7723
10469
|
process.exit(1);
|
|
7724
10470
|
});
|
|
7725
|
-
export {
|
|
7726
|
-
ConnectionError as C,
|
|
7727
|
-
DimensionError as D,
|
|
7728
|
-
EMBEDDING_BATCH_CHARS as E,
|
|
7729
|
-
LibraryNotFoundError as L,
|
|
7730
|
-
StoreError as S,
|
|
7731
|
-
TelemetryEvent as T,
|
|
7732
|
-
VECTOR_DIMENSION as V,
|
|
7733
|
-
applyMigrations as a,
|
|
7734
|
-
EMBEDDING_BATCH_SIZE as b,
|
|
7735
|
-
createJSDOM as c,
|
|
7736
|
-
denormalizeVersionName as d,
|
|
7737
|
-
SPLITTER_PREFERRED_CHUNK_SIZE as e,
|
|
7738
|
-
SPLITTER_MAX_CHUNK_SIZE as f,
|
|
7739
|
-
getProjectRoot as g,
|
|
7740
|
-
VersionNotFoundError as h,
|
|
7741
|
-
analytics as i,
|
|
7742
|
-
extractHostname as j,
|
|
7743
|
-
SPLITTER_MIN_CHUNK_SIZE as k,
|
|
7744
|
-
logger as l,
|
|
7745
|
-
mapDbDocumentToDocument as m,
|
|
7746
|
-
normalizeVersionName as n
|
|
7747
|
-
};
|
|
7748
10471
|
//# sourceMappingURL=index.js.map
|