@arabold/docs-mcp-server 1.21.0 → 1.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,4 +1,10 @@
+ #!/usr/bin/env node
  import "dotenv/config";
+ import { BedrockEmbeddings } from "@langchain/aws";
+ import { GoogleGenerativeAIEmbeddings } from "@langchain/google-genai";
+ import { VertexAIEmbeddings } from "@langchain/google-vertexai";
+ import { AzureOpenAIEmbeddings, OpenAIEmbeddings } from "@langchain/openai";
+ import { Embeddings } from "@langchain/core/embeddings";
  import { PostHog } from "posthog-node";
  import crypto, { randomUUID } from "node:crypto";
  import fs, { readFileSync, existsSync } from "node:fs";
@@ -39,14 +45,277 @@ import DOMPurify from "dompurify";
  import { fileURLToPath, URL as URL$1 } from "node:url";
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
  import { createTRPCProxyClient, httpBatchLink } from "@trpc/client";
- import "fuse.js";
- import "langchain/text_splitter";
- import "better-sqlite3";
- import "sqlite-vec";
+ import Fuse from "fuse.js";
+ import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
+ import Database from "better-sqlite3";
+ import * as sqliteVec from "sqlite-vec";
  import { execSync } from "node:child_process";
  import { v4 } from "uuid";
  import psl from "psl";
  import { minimatch } from "minimatch";
+ const VECTOR_DIMENSION = 1536;
+ function mapDbDocumentToDocument(doc) {
+ return {
+ id: doc.id,
+ pageContent: doc.content,
+ metadata: JSON.parse(doc.metadata)
+ };
+ }
+ var VersionStatus = /* @__PURE__ */ ((VersionStatus2) => {
+ VersionStatus2["NOT_INDEXED"] = "not_indexed";
+ VersionStatus2["QUEUED"] = "queued";
+ VersionStatus2["RUNNING"] = "running";
+ VersionStatus2["COMPLETED"] = "completed";
+ VersionStatus2["FAILED"] = "failed";
+ VersionStatus2["CANCELLED"] = "cancelled";
+ VersionStatus2["UPDATING"] = "updating";
+ return VersionStatus2;
+ })(VersionStatus || {});
+ function normalizeVersionName(name) {
+ return name ?? "";
+ }
+ function denormalizeVersionName(name) {
+ return name === "" ? "" : name;
+ }
+ function getStatusDescription(status) {
+ const descriptions = {
+ [
+ "not_indexed"
+ /* NOT_INDEXED */
+ ]: "Version created but not yet indexed",
+ [
+ "queued"
+ /* QUEUED */
+ ]: "Waiting in queue for indexing",
+ [
+ "running"
+ /* RUNNING */
+ ]: "Currently being indexed",
+ [
+ "completed"
+ /* COMPLETED */
+ ]: "Successfully indexed",
+ [
+ "failed"
+ /* FAILED */
+ ]: "Indexing failed",
+ [
+ "cancelled"
+ /* CANCELLED */
+ ]: "Indexing was cancelled",
+ [
+ "updating"
+ /* UPDATING */
+ ]: "Re-indexing in progress"
+ };
+ return descriptions[status] || "Unknown status";
+ }
+ function isActiveStatus(status) {
+ return [
+ "queued",
+ "running",
+ "updating"
+ /* UPDATING */
+ ].includes(
+ status
+ );
+ }
+ class StoreError extends Error {
+ constructor(message, cause) {
+ super(cause ? `${message} caused by ${cause}` : message);
+ this.cause = cause;
+ this.name = this.constructor.name;
+ const causeError = cause instanceof Error ? cause : cause ? new Error(String(cause)) : void 0;
+ if (causeError?.stack) {
+ this.stack = causeError.stack;
+ }
+ }
+ }
+ class DimensionError extends StoreError {
+ constructor(modelName, modelDimension, dbDimension) {
+ super(
+ `Model "${modelName}" produces ${modelDimension}-dimensional vectors, which exceeds the database's fixed dimension of ${dbDimension}. Please use a model with dimension ≤ ${dbDimension}.`
+ );
+ this.modelName = modelName;
+ this.modelDimension = modelDimension;
+ this.dbDimension = dbDimension;
+ }
+ }
+ class ConnectionError extends StoreError {
+ }
+ class FixedDimensionEmbeddings extends Embeddings {
+ constructor(embeddings, targetDimension, providerAndModel, allowTruncate = false) {
+ super({});
+ this.embeddings = embeddings;
+ this.targetDimension = targetDimension;
+ this.allowTruncate = allowTruncate;
+ const [providerOrModel, modelName] = providerAndModel.split(":");
+ this.provider = modelName ? providerOrModel : "openai";
+ this.model = modelName || providerOrModel;
+ }
+ provider;
+ model;
+ /**
+ * Normalize a vector to the target dimension by truncating (for MRL models) or padding.
+ * @throws {DimensionError} If vector is too large and provider doesn't support MRL
+ */
+ normalizeVector(vector) {
+ const dimension = vector.length;
+ if (dimension > this.targetDimension) {
+ if (this.allowTruncate) {
+ return vector.slice(0, this.targetDimension);
+ }
+ throw new DimensionError(
+ `${this.provider}:${this.model}`,
+ dimension,
+ this.targetDimension
+ );
+ }
+ if (dimension < this.targetDimension) {
+ return [...vector, ...new Array(this.targetDimension - dimension).fill(0)];
+ }
+ return vector;
+ }
+ async embedQuery(text) {
+ const vector = await this.embeddings.embedQuery(text);
+ return this.normalizeVector(vector);
+ }
+ async embedDocuments(documents) {
+ const vectors = await this.embeddings.embedDocuments(documents);
+ return vectors.map((vector) => this.normalizeVector(vector));
+ }
+ }
+ class UnsupportedProviderError extends Error {
+ constructor(provider) {
+ super(
+ `❌ Unsupported embedding provider: ${provider}
+ Supported providers: openai, vertex, gemini, aws, microsoft
+ See README.md for configuration options or run with --help for more details.`
+ );
+ this.name = "UnsupportedProviderError";
+ }
+ }
+ class ModelConfigurationError extends Error {
+ constructor(message) {
+ super(message);
+ this.name = "ModelConfigurationError";
+ }
+ }
+ function createEmbeddingModel(providerAndModel) {
+ const [providerOrModel, ...modelNameParts] = providerAndModel.split(":");
+ const modelName = modelNameParts.join(":");
+ const provider = modelName ? providerOrModel : "openai";
+ const model = modelName || providerOrModel;
+ const baseConfig = { stripNewLines: true };
+ switch (provider) {
+ case "openai": {
+ if (!process.env.OPENAI_API_KEY) {
+ throw new ModelConfigurationError(
+ "❌ Missing API key for embedding provider\n Please set OPENAI_API_KEY or configure an alternative embedding model.\n See README.md for configuration options or run with --help for more details."
+ );
+ }
+ const config = {
+ ...baseConfig,
+ modelName: model,
+ batchSize: 512
+ // OpenAI supports large batches
+ };
+ const baseURL = process.env.OPENAI_API_BASE;
+ if (baseURL) {
+ config.configuration = { baseURL };
+ }
+ return new OpenAIEmbeddings(config);
+ }
+ case "vertex": {
+ if (!process.env.GOOGLE_APPLICATION_CREDENTIALS) {
+ throw new ModelConfigurationError(
+ "❌ Missing credentials for Google Cloud Vertex AI\n Please set GOOGLE_APPLICATION_CREDENTIALS or configure an alternative embedding model.\n See README.md for configuration options or run with --help for more details."
+ );
+ }
+ return new VertexAIEmbeddings({
+ ...baseConfig,
+ model
+ // e.g., "text-embedding-004"
+ });
+ }
+ case "gemini": {
+ if (!process.env.GOOGLE_API_KEY) {
+ throw new ModelConfigurationError(
+ "❌ Missing API key for Google AI (Gemini)\n Please set GOOGLE_API_KEY or configure an alternative embedding model.\n See README.md for configuration options or run with --help for more details."
+ );
+ }
+ const baseEmbeddings = new GoogleGenerativeAIEmbeddings({
+ ...baseConfig,
+ apiKey: process.env.GOOGLE_API_KEY,
+ model
+ // e.g., "gemini-embedding-exp-03-07"
+ });
+ return new FixedDimensionEmbeddings(
+ baseEmbeddings,
+ VECTOR_DIMENSION,
+ providerAndModel,
+ true
+ );
+ }
+ case "aws": {
+ const region = process.env.BEDROCK_AWS_REGION || process.env.AWS_REGION;
+ if (!region) {
+ throw new ModelConfigurationError(
+ "BEDROCK_AWS_REGION or AWS_REGION environment variable is required for AWS Bedrock"
+ );
+ }
+ if (!process.env.AWS_PROFILE && !process.env.AWS_ACCESS_KEY_ID && !process.env.AWS_SECRET_ACCESS_KEY) {
+ throw new ModelConfigurationError(
+ "Either AWS_PROFILE or both AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables are required for AWS Bedrock"
+ );
+ }
+ const credentials = process.env.AWS_ACCESS_KEY_ID && process.env.AWS_SECRET_ACCESS_KEY ? {
+ accessKeyId: process.env.AWS_ACCESS_KEY_ID,
+ secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY,
+ sessionToken: process.env.AWS_SESSION_TOKEN
+ } : void 0;
+ return new BedrockEmbeddings({
+ ...baseConfig,
+ model,
+ // e.g., "amazon.titan-embed-text-v1"
+ region,
+ ...credentials ? { credentials } : {}
+ });
+ }
+ case "microsoft": {
+ if (!process.env.AZURE_OPENAI_API_KEY) {
+ throw new ModelConfigurationError(
+ "AZURE_OPENAI_API_KEY environment variable is required for Azure OpenAI"
+ );
+ }
+ if (!process.env.AZURE_OPENAI_API_INSTANCE_NAME) {
+ throw new ModelConfigurationError(
+ "AZURE_OPENAI_API_INSTANCE_NAME environment variable is required for Azure OpenAI"
+ );
+ }
+ if (!process.env.AZURE_OPENAI_API_DEPLOYMENT_NAME) {
+ throw new ModelConfigurationError(
+ "AZURE_OPENAI_API_DEPLOYMENT_NAME environment variable is required for Azure OpenAI"
+ );
+ }
+ if (!process.env.AZURE_OPENAI_API_VERSION) {
+ throw new ModelConfigurationError(
+ "AZURE_OPENAI_API_VERSION environment variable is required for Azure OpenAI"
+ );
+ }
+ return new AzureOpenAIEmbeddings({
+ ...baseConfig,
+ azureOpenAIApiKey: process.env.AZURE_OPENAI_API_KEY,
+ azureOpenAIApiInstanceName: process.env.AZURE_OPENAI_API_INSTANCE_NAME,
+ azureOpenAIApiDeploymentName: process.env.AZURE_OPENAI_API_DEPLOYMENT_NAME,
+ azureOpenAIApiVersion: process.env.AZURE_OPENAI_API_VERSION,
+ deploymentName: model
+ });
+ }
+ default:
+ throw new UnsupportedProviderError(provider);
+ }
+ }
  const LogLevel = {
  ERROR: 0,
  WARN: 1,
@@ -105,6 +374,41 @@ const logger = {
  }
  }
  };
+ function camelToSnakeCase(str) {
+ return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
+ }
+ function convertPropertiesToSnakeCase(obj) {
+ const result = {};
+ for (const [key, value] of Object.entries(obj)) {
+ const snakeKey = camelToSnakeCase(key);
+ if (value && typeof value === "object" && !Array.isArray(value) && !(value instanceof Date)) {
+ result[snakeKey] = convertPropertiesToSnakeCase(value);
+ } else if (Array.isArray(value)) {
+ result[snakeKey] = value.map(
+ (item) => item && typeof item === "object" && !(item instanceof Date) ? convertPropertiesToSnakeCase(item) : item
+ );
+ } else {
+ result[snakeKey] = value;
+ }
+ }
+ return result;
+ }
+ function addPostHogStandardProperties(properties) {
+ const result = { ...properties };
+ if (properties.sessionId) {
+ result.$session_id = properties.sessionId;
+ delete result.sessionId;
+ }
+ if (properties.startTime) {
+ result.$start_timestamp = properties.startTime.toISOString();
+ delete result.startTime;
+ }
+ if (properties.appVersion) {
+ result.$app_version = properties.appVersion;
+ delete result.appVersion;
+ }
+ return result;
+ }
  class PostHogClient {
  client;
  enabled;
@@ -155,10 +459,12 @@ class PostHogClient {
  capture(distinctId, event, properties) {
  if (!this.enabled || !this.client) return;
  try {
+ const enhancedProperties = addPostHogStandardProperties(properties);
+ const snakeCaseProperties = convertPropertiesToSnakeCase(enhancedProperties);
  this.client.capture({
  distinctId,
  event,
- properties
+ properties: snakeCaseProperties
  });
  logger.debug(`PostHog event captured: ${event}`);
  } catch (error) {
@@ -167,6 +473,26 @@ class PostHogClient {
  );
  }
  }
+ /**
+ * Capture exception using PostHog's native error tracking
+ */
+ captureException(distinctId, error, properties) {
+ if (!this.enabled || !this.client) return;
+ try {
+ const enhancedProperties = addPostHogStandardProperties(properties || {});
+ const snakeCaseProperties = convertPropertiesToSnakeCase(enhancedProperties);
+ this.client.captureException({
+ error,
+ distinctId,
+ properties: snakeCaseProperties
+ });
+ logger.debug(`PostHog exception captured: ${error.constructor.name}`);
+ } catch (captureError) {
+ logger.debug(
+ `PostHog captureException error: ${captureError instanceof Error ? captureError.message : "Unknown error"}`
+ );
+ }
+ }
  /**
  * Graceful shutdown with event flushing
  */
@@ -203,7 +529,7 @@ class SessionTracker {
  endSession() {
  if (!this.sessionContext) return null;
  const duration = Date.now() - this.sessionContext.startTime.getTime();
- const sessionInterface = this.sessionContext.interface;
+ const sessionInterface = this.sessionContext.appInterface;
  this.sessionContext = void 0;
  return { duration, interface: sessionInterface };
  }
@@ -213,6 +539,14 @@ class SessionTracker {
  getSessionContext() {
  return this.sessionContext;
  }
+ /**
+ * Update session context with additional fields
+ */
+ updateSessionContext(updates) {
+ if (this.sessionContext) {
+ this.sessionContext = { ...this.sessionContext, ...updates };
+ }
+ }
  /**
  * Get enriched properties with session context
  */
@@ -293,8 +627,6 @@ var TelemetryEvent = /* @__PURE__ */ ((TelemetryEvent2) => {
  TelemetryEvent2["PIPELINE_JOB_PROGRESS"] = "pipeline_job_progress";
  TelemetryEvent2["PIPELINE_JOB_COMPLETED"] = "pipeline_job_completed";
  TelemetryEvent2["DOCUMENT_PROCESSED"] = "document_processed";
- TelemetryEvent2["DOCUMENT_PROCESSING_FAILED"] = "document_processing_failed";
- TelemetryEvent2["ERROR_OCCURRED"] = "error_occurred";
  return TelemetryEvent2;
  })(TelemetryEvent || {});
  class Analytics {
@@ -320,23 +652,34 @@ class Analytics {
  if (!this.enabled) return;
  this.sessionTracker.startSession(context);
  this.track("session_started", {
- interface: context.interface,
- version: context.version,
- platform: context.platform,
- sessionDurationTarget: context.interface === "cli" ? "short" : "long",
- authEnabled: context.authEnabled,
- readOnly: context.readOnly,
- servicesCount: context.servicesEnabled.length
+ interface: context.appInterface,
+ version: context.appVersion,
+ platform: context.appPlatform,
+ authEnabled: context.appAuthEnabled,
+ readOnly: context.appReadOnly,
+ servicesCount: context.appServicesEnabled.length
  });
  }
  /**
- * Track an event with automatic session context inclusion
+ * Update session context with additional fields (e.g., embedding model info)
  */
+ updateSessionContext(updates) {
+ if (!this.enabled) return;
+ this.sessionTracker.updateSessionContext(updates);
+ }
  track(event, properties = {}) {
  if (!this.enabled) return;
  const eventProperties = this.sessionTracker.getEnrichedProperties(properties);
  this.postHogClient.capture(this.distinctId, event, eventProperties);
  }
+ /**
+ * Capture exception using PostHog's native error tracking with session context
+ */
+ captureException(error, properties = {}) {
+ if (!this.enabled) return;
+ const eventProperties = this.sessionTracker.getEnrichedProperties(properties);
+ this.postHogClient.captureException(this.distinctId, error, eventProperties);
+ }
  /**
  * Track session end with duration
  */
@@ -368,29 +711,38 @@ class Analytics {
  getSessionContext() {
  return this.sessionTracker.getSessionContext();
  }
- }
- const analytics = new Analytics();
- async function trackTool(toolName, operation, getProperties) {
- const startTime = Date.now();
- try {
- const result = await operation();
- analytics.track("tool_used", {
- tool: toolName,
- success: true,
- durationMs: Date.now() - startTime,
- ...getProperties ? getProperties(result) : {}
- });
- return result;
- } catch (error) {
- analytics.track("tool_used", {
- tool: toolName,
- success: false,
- durationMs: Date.now() - startTime,
- errorType: error instanceof Error ? error.constructor.name : "UnknownError"
- });
- throw error;
+ /**
+ * Track tool usage with error handling and automatic timing
+ */
+ async trackTool(toolName, operation, getProperties) {
+ const startTime = Date.now();
+ try {
+ const result = await operation();
+ this.track("tool_used", {
+ tool: toolName,
+ success: true,
+ durationMs: Date.now() - startTime,
+ ...getProperties ? getProperties(result) : {}
+ });
+ return result;
+ } catch (error) {
+ this.track("tool_used", {
+ tool: toolName,
+ success: false,
+ durationMs: Date.now() - startTime
+ });
+ if (error instanceof Error) {
+ this.captureException(error, {
+ tool: toolName,
+ context: "tool_execution",
+ durationMs: Date.now() - startTime
+ });
+ }
+ throw error;
+ }
  }
  }
+ const analytics = new Analytics();
  function extractHostname(url) {
  try {
  const parsed = new URL(url);
@@ -410,28 +762,7 @@ function extractProtocol(urlOrPath) {
  return "unknown";
  }
  }
- function analyzeSearchQuery(query) {
- return {
- length: query.length,
- wordCount: query.trim().split(/\s+/).length,
- hasCodeTerms: /\b(function|class|import|export|const|let|var|def|async|await)\b/i.test(query),
- hasSpecialChars: /[^\w\s]/.test(query)
- };
- }
- function sanitizeErrorMessage(message) {
- return message.replace(/https?:\/\/[^\s]+/gi, "[url]").replace(/file:\/\/[^\s]+/gi, "[file-url]").replace(/\/[^\s]*\.[a-z]{2,4}/gi, "[path]").replace(/[A-Za-z]:\\[^\s]+/g, "[path]").replace(/Bearer\s+[^\s]+/gi, "Bearer [token]").replace(/api[_-]?key[=:]\s*[^\s]+/gi, "api_key=[redacted]").replace(/token[=:]\s*[^\s]+/gi, "token=[redacted]").substring(0, 200);
- }
- function sanitizeError(error) {
- return {
- type: error.constructor.name,
- message: sanitizeErrorMessage(error.message),
- hasStack: Boolean(error.stack)
- };
- }
- function extractCliFlags(argv) {
- return argv.filter((arg) => arg.startsWith("--") || arg.startsWith("-"));
- }
- const version = "1.20.0";
+ const version = "1.21.1";
  const packageJson = {
  version
  };
@@ -439,34 +770,48 @@ function getPackageVersion() {
  return packageJson.version;
  }
  function createCliSession(command, options) {
- return {
+ const baseSession = {
  sessionId: randomUUID(),
- interface: "cli",
+ appInterface: "cli",
  startTime: /* @__PURE__ */ new Date(),
- version: getPackageVersion(),
- platform: process.platform,
- nodeVersion: process.version,
- command: command || "unknown",
- authEnabled: options?.authEnabled,
- readOnly: options?.readOnly,
- servicesEnabled: ["worker"]
+ appVersion: getPackageVersion(),
+ appPlatform: process.platform,
+ appNodeVersion: process.version,
+ cliCommand: command || "unknown",
+ appAuthEnabled: options?.authEnabled,
+ appReadOnly: options?.readOnly,
+ appServicesEnabled: ["worker"]
  // CLI typically runs embedded worker
  };
+ if (options?.embeddingContext) {
+ return {
+ ...baseSession,
+ ...options.embeddingContext
+ };
+ }
+ return baseSession;
  }
  function createMcpSession(options) {
- return {
+ const baseSession = {
  sessionId: randomUUID(),
- interface: "mcp",
+ appInterface: "mcp",
  startTime: /* @__PURE__ */ new Date(),
- version: getPackageVersion(),
- platform: process.platform,
- nodeVersion: process.version,
- protocol: options.protocol || "stdio",
- transport: options.transport,
- authEnabled: options.authEnabled ?? false,
- readOnly: options.readOnly ?? false,
- servicesEnabled: options.servicesEnabled ?? ["mcp"]
+ appVersion: getPackageVersion(),
+ appPlatform: process.platform,
+ appNodeVersion: process.version,
+ mcpProtocol: options.protocol || "stdio",
+ mcpTransport: options.transport,
+ appAuthEnabled: options.authEnabled ?? false,
+ appReadOnly: options.readOnly ?? false,
+ appServicesEnabled: options.servicesEnabled ?? ["mcp"]
  };
+ if (options.embeddingContext) {
+ return {
+ ...baseSession,
+ ...options.embeddingContext
+ };
+ }
+ return baseSession;
  }
  function createTelemetryService() {
  return {
@@ -929,42 +1274,55 @@ class CancelJobTool {
  * @returns A promise that resolves with the outcome message.
  */
  async execute(input) {
- try {
- const job = await this.pipeline.getJob(input.jobId);
- if (!job) {
- logger.warn(`❓ [CancelJobTool] Job not found: ${input.jobId}`);
- return {
- message: `Job with ID ${input.jobId} not found.`,
- success: false
- };
- }
- if (job.status === PipelineJobStatus.COMPLETED || // Use enum member
- job.status === PipelineJobStatus.FAILED || // Use enum member
- job.status === PipelineJobStatus.CANCELLED) {
- logger.debug(`Job ${input.jobId} is already in a final state: ${job.status}.`);
+ return analytics.trackTool(
+ "cancel_job",
+ async () => {
+ try {
+ const job = await this.pipeline.getJob(input.jobId);
+ if (!job) {
+ logger.warn(`❓ [CancelJobTool] Job not found: ${input.jobId}`);
+ return {
+ message: `Job with ID ${input.jobId} not found.`,
+ success: false
+ };
+ }
+ if (job.status === PipelineJobStatus.COMPLETED || // Use enum member
+ job.status === PipelineJobStatus.FAILED || // Use enum member
+ job.status === PipelineJobStatus.CANCELLED) {
+ logger.debug(
+ `Job ${input.jobId} is already in a final state: ${job.status}.`
+ );
+ return {
+ message: `Job ${input.jobId} is already ${job.status}. No action taken.`,
+ success: true
+ // Considered success as no cancellation needed
+ };
+ }
+ await this.pipeline.cancelJob(input.jobId);
+ const updatedJob = await this.pipeline.getJob(input.jobId);
+ const finalStatus = updatedJob?.status ?? "UNKNOWN (job disappeared?)";
+ logger.debug(
+ `Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}`
+ );
+ return {
+ message: `Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}.`,
+ success: true
+ };
+ } catch (error) {
+ logger.error(`❌ Error cancelling job ${input.jobId}: ${error}`);
+ return {
+ message: `Failed to cancel job ${input.jobId}: ${error instanceof Error ? error.message : String(error)}`,
+ success: false
+ };
+ }
+ },
+ (result) => {
  return {
- message: `Job ${input.jobId} is already ${job.status}. No action taken.`,
- success: true
- // Considered success as no cancellation needed
+ success: result.success
+ // Note: success flag already indicates if cancellation was successful
  };
  }
- await this.pipeline.cancelJob(input.jobId);
- const updatedJob = await this.pipeline.getJob(input.jobId);
- const finalStatus = updatedJob?.status ?? "UNKNOWN (job disappeared?)";
- logger.debug(
- `Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}`
- );
- return {
- message: `Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}.`,
- success: true
- };
- } catch (error) {
- logger.error(`❌ Error cancelling job ${input.jobId}: ${error}`);
- return {
- message: `Failed to cancel job ${input.jobId}: ${error instanceof Error ? error.message : String(error)}`,
- success: false
- };
- }
+ );
  }
  }
  class ClearCompletedJobsTool {
@@ -982,24 +1340,33 @@ class ClearCompletedJobsTool {
  * @returns A promise that resolves with the outcome of the clear operation.
  */
  async execute(_input) {
- try {
- const clearedCount = await this.pipeline.clearCompletedJobs();
- const message = clearedCount > 0 ? `Successfully cleared ${clearedCount} completed job${clearedCount === 1 ? "" : "s"} from the queue.` : "No completed jobs to clear.";
- logger.debug(message);
- return {
- message,
- success: true,
- clearedCount
- };
- } catch (error) {
- const errorMessage = `Failed to clear completed jobs: ${error instanceof Error ? error.message : String(error)}`;
- logger.error(`❌ ${errorMessage}`);
- return {
- message: errorMessage,
- success: false,
- clearedCount: 0
- };
- }
+ return analytics.trackTool(
+ "clear_completed_jobs",
+ async () => {
+ try {
+ const clearedCount = await this.pipeline.clearCompletedJobs();
+ const message = clearedCount > 0 ? `Successfully cleared ${clearedCount} completed job${clearedCount === 1 ? "" : "s"} from the queue.` : "No completed jobs to clear.";
+ logger.debug(message);
+ return {
+ message,
+ success: true,
+ clearedCount
+ };
+ } catch (error) {
+ const errorMessage = `Failed to clear completed jobs: ${error instanceof Error ? error.message : String(error)}`;
+ logger.error(`❌ ${errorMessage}`);
+ return {
+ message: errorMessage,
+ success: false,
+ clearedCount: 0
+ };
+ }
+ },
+ (result) => ({
+ success: result.success,
+ clearedCount: result.clearedCount
+ })
+ );
  }
  }
  class ToolError extends Error {
@@ -1937,91 +2304,106 @@ class FetchUrlTool {
  * @throws {ToolError} If fetching or processing fails
  */
  async execute(options) {
- const { url, scrapeMode = ScrapeMode.Auto, headers } = options;
- const canFetchResults = this.fetchers.map((f) => f.canFetch(url));
- const fetcherIndex = canFetchResults.indexOf(true);
- if (fetcherIndex === -1) {
- throw new ToolError(
- `Invalid URL: ${url}. Must be an HTTP/HTTPS URL or a file:// URL.`,
- this.constructor.name
- );
- }
- const fetcher = this.fetchers[fetcherIndex];
- const htmlPipeline = new HtmlPipeline();
- const markdownPipeline = new MarkdownPipeline();
- const pipelines = [htmlPipeline, markdownPipeline];
- try {
- logger.info(`📡 Fetching ${url}...`);
- const rawContent = await fetcher.fetch(url, {
- followRedirects: options.followRedirects ?? true,
- maxRetries: 3,
- headers
- // propagate custom headers
- });
- logger.info("🔄 Processing content...");
- let processed;
- for (const pipeline of pipelines) {
- if (pipeline.canProcess(rawContent)) {
- processed = await pipeline.process(
- rawContent,
- {
- url,
- library: "",
- version: "",
- maxDepth: 0,
- maxPages: 1,
- maxConcurrency: 1,
- scope: "subpages",
- followRedirects: options.followRedirects ?? true,
- excludeSelectors: void 0,
- ignoreErrors: false,
- scrapeMode,
- headers
- // propagate custom headers
- },
- fetcher
+ return analytics.trackTool(
+ "fetch_url",
+ async () => {
+ const { url, scrapeMode = ScrapeMode.Auto, headers } = options;
+ const canFetchResults = this.fetchers.map((f) => f.canFetch(url));
+ const fetcherIndex = canFetchResults.indexOf(true);
+ if (fetcherIndex === -1) {
+ throw new ToolError(
+ `Invalid URL: ${url}. Must be an HTTP/HTTPS URL or a file:// URL.`,
+ this.constructor.name
  );
- break;
  }
+ const fetcher = this.fetchers[fetcherIndex];
+ const htmlPipeline = new HtmlPipeline();
+ const markdownPipeline = new MarkdownPipeline();
+ const pipelines = [htmlPipeline, markdownPipeline];
+ try {
+ logger.info(`📡 Fetching ${url}...`);
+ const rawContent = await fetcher.fetch(url, {
+ followRedirects: options.followRedirects ?? true,
+ maxRetries: 3,
+ headers
+ // propagate custom headers
+ });
+ logger.info("🔄 Processing content...");
+ let processed;
+ for (const pipeline of pipelines) {
+ if (pipeline.canProcess(rawContent)) {
+ processed = await pipeline.process(
+ rawContent,
+ {
+ url,
+ library: "",
+ version: "",
+ maxDepth: 0,
+ maxPages: 1,
+ maxConcurrency: 1,
+ scope: "subpages",
+ followRedirects: options.followRedirects ?? true,
+ excludeSelectors: void 0,
+ ignoreErrors: false,
+ scrapeMode,
+ headers
+ // propagate custom headers
+ },
+ fetcher
+ );
+ break;
+ }
+ }
+ if (!processed) {
+ logger.warn(
+ `⚠️ Unsupported content type "${rawContent.mimeType}" for ${url}. Returning raw content.`
+ );
+ const resolvedCharset = resolveCharset(
+ rawContent.charset,
+ rawContent.content,
+ rawContent.mimeType
+ );
+ const contentString = convertToString(rawContent.content, resolvedCharset);
+ return contentString;
+ }
+ for (const err of processed.errors) {
+ logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
+ }
+ if (typeof processed.textContent !== "string" || !processed.textContent.trim()) {
+ throw new ToolError(
+ `Processing resulted in empty content for ${url}`,
+ this.constructor.name
+ );
+ }
+ logger.info(`✅ Successfully processed ${url}`);
+ return processed.textContent;
+ } catch (error) {
+ if (error instanceof ScraperError || error instanceof ToolError) {
+ throw new ToolError(
+ `Failed to fetch or process URL: ${error.message}`,
+ this.constructor.name
+ );
+ }
+ throw new ToolError(
+ `Failed to fetch or process URL: ${error instanceof Error ? error.message : String(error)}`,
+ this.constructor.name
+ );
+ } finally {
+ await htmlPipeline.close();
+ await markdownPipeline.close();
+ }
+ },
+ (result) => {
+ const { url, scrapeMode, followRedirects, headers } = options;
+ return {
+ url,
+ scrapeMode,
+ followRedirects,
+ contentLength: result.length,
+ hasHeaders: !!headers
+ };
  }
- if (!processed) {
- logger.warn(
- `⚠️ Unsupported content type "${rawContent.mimeType}" for ${url}. Returning raw content.`
- );
- const resolvedCharset = resolveCharset(
- rawContent.charset,
- rawContent.content,
- rawContent.mimeType
- );
- const contentString = convertToString(rawContent.content, resolvedCharset);
- return contentString;
- }
- for (const err of processed.errors) {
- logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
- }
- if (typeof processed.textContent !== "string" || !processed.textContent.trim()) {
- throw new ToolError(
- `Processing resulted in empty content for ${url}`,
- this.constructor.name
- );
- }
- logger.info(`✅ Successfully processed ${url}`);
- return processed.textContent;
- } catch (error) {
- if (error instanceof ScraperError || error instanceof ToolError) {
- throw new ToolError(
- `Failed to fetch or process URL: ${error.message}`,
- this.constructor.name
- );
- }
- throw new ToolError(
- `Failed to fetch or process URL: ${error instanceof Error ? error.message : String(error)}`,
- this.constructor.name
- );
- } finally {
- await htmlPipeline.close();
- await markdownPipeline.close();
- }
+ );
  }
  }
  class FindVersionTool {
@@ -2034,35 +2416,50 @@ class FindVersionTool {
  * @returns A descriptive string indicating the best match and unversioned status, or an error message.
  */
  async execute(options) {
- const { library, targetVersion } = options;
- const libraryAndVersion = `${library}${targetVersion ? `@${targetVersion}` : ""}`;
- try {
- const { bestMatch, hasUnversioned } = await this.docService.findBestVersion(
- library,
- targetVersion
- );
- let message = "";
- if (bestMatch) {
- message = `Best match: ${bestMatch}.`;
- if (hasUnversioned) {
- message += " Unversioned docs also available.";
+ return analytics.trackTool(
+ "find_version",
+ async () => {
+ const { library, targetVersion } = options;
+ const libraryAndVersion = `${library}${targetVersion ? `@${targetVersion}` : ""}`;
+ try {
+ const { bestMatch, hasUnversioned } = await this.docService.findBestVersion(
+ library,
+ targetVersion
+ );
+ let message = "";
+ if (bestMatch) {
+ message = `Best match: ${bestMatch}.`;
+ if (hasUnversioned) {
+ message += " Unversioned docs also available.";
+ }
+ } else if (hasUnversioned) {
+ message = `No matching version found for ${libraryAndVersion}, but unversioned docs exist.`;
+ } else {
+ message = `No matching version or unversioned documents found for ${libraryAndVersion}.`;
+ }
+ return { message, bestMatch, hasUnversioned };
+ } catch (error) {
+ if (error instanceof VersionNotFoundError) {
+ logger.info(`ℹ️ Version not found: ${error.message}`);
+ const message = `No matching version or unversioned documents found for ${libraryAndVersion}. Available: ${error.availableVersions.length > 0 ? error.availableVersions.map((v) => v.version).join(", ") : "None"}.`;
+ return { message, bestMatch: null, hasUnversioned: false };
+ }
+ logger.error(
+ `❌ Error finding version for ${libraryAndVersion}: ${error instanceof Error ? error.message : error}`
+ );
+ throw error;
  }
- } else if (hasUnversioned) {
- message = `No matching version found for ${libraryAndVersion}, but unversioned docs exist.`;
- } else {
- message = `No matching version or unversioned documents found for ${libraryAndVersion}.`;
- }
- return message;
- } catch (error) {
- if (error instanceof VersionNotFoundError) {
- logger.info(`ℹ️ Version not found: ${error.message}`);
- return `No matching version or unversioned documents found for ${libraryAndVersion}. Available: ${error.availableVersions.length > 0 ? error.availableVersions.map((v) => v.version).join(", ") : "None"}.`;
+ },
+ (result) => {
+ const { library, targetVersion } = options;
+ return {
+ library,
+ targetVersion,
+ foundMatch: !!result.bestMatch,
+ hasUnversioned: result.hasUnversioned
+ };
  }
- logger.error(
- `❌ Error finding version for ${libraryAndVersion}: ${error instanceof Error ? error.message : error}`
- );
- throw error;
- }
+ ).then((result) => result.message);
  }
  }
  class GetJobInfoTool {
@@ -2080,29 +2477,41 @@ class GetJobInfoTool {
  * @returns A promise that resolves with the simplified job info or null if not found.
  */
  async execute(input) {
- const job = await this.pipeline.getJob(input.jobId);
- if (!job) {
- return { job: null };
- }
- const jobInfo = {
- id: job.id,
- library: job.library,
- version: job.version,
- status: job.status,
- dbStatus: job.versionStatus,
- createdAt: job.createdAt.toISOString(),
- startedAt: job.startedAt?.toISOString() ?? null,
- finishedAt: job.finishedAt?.toISOString() ?? null,
- error: job.error?.message ?? null,
- progress: job.progressMaxPages && job.progressMaxPages > 0 ? {
- pages: job.progressPages || 0,
- totalPages: job.progressMaxPages,
- totalDiscovered: job.progress?.totalDiscovered || job.progressMaxPages
- } : void 0,
- updatedAt: job.updatedAt?.toISOString(),
- errorMessage: job.errorMessage ?? void 0
- };
- return { job: jobInfo };
+ return analytics.trackTool(
+ "get_job_info",
+ async () => {
+ const job = await this.pipeline.getJob(input.jobId);
+ if (!job) {
+ return { job: null };
+ }
+ const jobInfo = {
+ id: job.id,
+ library: job.library,
+ version: job.version,
+ status: job.status,
+ dbStatus: job.versionStatus,
+ createdAt: job.createdAt.toISOString(),
+ startedAt: job.startedAt?.toISOString() ?? null,
+ finishedAt: job.finishedAt?.toISOString() ?? null,
+ error: job.error?.message ?? null,
+ progress: job.progressMaxPages && job.progressMaxPages > 0 ? {
+ pages: job.progressPages || 0,
+ totalPages: job.progressMaxPages,
+ totalDiscovered: job.progress?.totalDiscovered || job.progressMaxPages
+ } : void 0,
+ updatedAt: job.updatedAt?.toISOString(),
+ errorMessage: job.errorMessage ?? void 0
+ };
+ return { job: jobInfo };
+ },
+ (result) => {
+ return {
+ found: result.job !== null,
+ library: result.job?.library,
+ version: result.job?.version
+ };
+ }
+ );
  }
  }
  class ListJobsTool {
@@ -2120,28 +2529,45 @@ class ListJobsTool {
  * @returns A promise that resolves with the list of simplified job objects.
  */
  async execute(input) {
- const jobs = await this.pipeline.getJobs(input.status);
- const simplifiedJobs = jobs.map((job) => {
- return {
- id: job.id,
- library: job.library,
- version: job.version,
- status: job.status,
- dbStatus: job.versionStatus,
- createdAt: job.createdAt.toISOString(),
- startedAt: job.startedAt?.toISOString() ?? null,
- finishedAt: job.finishedAt?.toISOString() ?? null,
- error: job.error?.message ?? null,
- progress: job.progressMaxPages && job.progressMaxPages > 0 ? {
- pages: job.progressPages || 0,
- totalPages: job.progressMaxPages,
- totalDiscovered: job.progress?.totalDiscovered || job.progressMaxPages
- } : void 0,
- updatedAt: job.updatedAt?.toISOString(),
- errorMessage: job.errorMessage ?? void 0
- };
- });
- return { jobs: simplifiedJobs };
+ return analytics.trackTool(
+ "list_jobs",
+ async () => {
+ const jobs = await this.pipeline.getJobs(input.status);
+ const simplifiedJobs = jobs.map((job) => {
+ return {
+ id: job.id,
+ library: job.library,
+ version: job.version,
+ status: job.status,
+ dbStatus: job.versionStatus,
+ createdAt: job.createdAt.toISOString(),
+ startedAt: job.startedAt?.toISOString() ?? null,
+ finishedAt: job.finishedAt?.toISOString() ?? null,
+ error: job.error?.message ?? null,
+ progress: job.progressMaxPages && job.progressMaxPages > 0 ? {
+ pages: job.progressPages || 0,
+ totalPages: job.progressMaxPages,
+ totalDiscovered: job.progress?.totalDiscovered || job.progressMaxPages
+ } : void 0,
+ updatedAt: job.updatedAt?.toISOString(),
+ errorMessage: job.errorMessage ?? void 0
+ };
+ });
+ return { jobs: simplifiedJobs };
+ },
+ (result) => {
+ return {
+ jobCount: result.jobs.length,
+ statusCounts: result.jobs.reduce(
+ (acc, job) => {
+ acc[job.status] = (acc[job.status] || 0) + 1;
+ return acc;
+ },
+ {}
+ )
+ };
+ }
+ );
  }
  }
  class ListLibrariesTool {
@@ -2150,20 +2576,32 @@ class ListLibrariesTool {
  this.docService = docService;
  }
  async execute(_options) {
- const rawLibraries = await this.docService.listLibraries();
- const libraries = rawLibraries.map(({ library, versions }) => ({
- name: library,
- versions: versions.map((v) => ({
- version: v.ref.version,
- documentCount: v.counts.documents,
- uniqueUrlCount: v.counts.uniqueUrls,
- indexedAt: v.indexedAt,
- status: v.status,
- ...v.progress ? { progress: v.progress } : void 0,
- sourceUrl: v.sourceUrl
- }))
- }));
- return { libraries };
+ return analytics.trackTool(
+ "list_libraries",
+ async () => {
+ const rawLibraries = await this.docService.listLibraries();
+ const libraries = rawLibraries.map(({ library, versions }) => ({
+ name: library,
+ versions: versions.map((v) => ({
+ version: v.ref.version,
+ documentCount: v.counts.documents,
+ uniqueUrlCount: v.counts.uniqueUrls,
+ indexedAt: v.indexedAt,
+ status: v.status,
+ ...v.progress ? { progress: v.progress } : void 0,
+ sourceUrl: v.sourceUrl
+ }))
+ }));
+ return { libraries };
+ },
+ (result) => ({
+ libraryCount: result.libraries.length,
+ totalVersions: result.libraries.reduce(
+ (sum, lib) => sum + lib.versions.length,
+ 0
+ )
+ })
+ );
  }
  }
  class RemoveTool {
@@ -2177,29 +2615,42 @@ class RemoveTool {
  * Removes all documents, the version record, and the library if no other versions exist.
  */
  async execute(args) {
- const { library, version: version2 } = args;
- logger.info(`🗑️ Removing library: ${library}${version2 ? `@${version2}` : ""}`);
- try {
- const allJobs = await this.pipeline.getJobs();
- const jobs = allJobs.filter(
- (job) => job.library === library && job.version === (version2 ?? "") && (job.status === PipelineJobStatus.QUEUED || job.status === PipelineJobStatus.RUNNING)
- );
- for (const job of jobs) {
- logger.info(
- `🚫 Aborting job for ${library}@${version2 ?? ""} before deletion: ${job.id}`
- );
- await this.pipeline.cancelJob(job.id);
- await this.pipeline.waitForJobCompletion(job.id);
+ return analytics.trackTool(
+ "remove_docs",
+ async () => {
+ const { library, version: version2 } = args;
+ logger.info(`🗑️ Removing library: ${library}${version2 ? `@${version2}` : ""}`);
+ try {
+ const allJobs = await this.pipeline.getJobs();
+ const jobs = allJobs.filter(
+ (job) => job.library === library && job.version === (version2 ?? "") && (job.status === PipelineJobStatus.QUEUED || job.status === PipelineJobStatus.RUNNING)
+ );
+ for (const job of jobs) {
+ logger.info(
+ `🚫 Aborting job for ${library}@${version2 ?? ""} before deletion: ${job.id}`
+ );
+ await this.pipeline.cancelJob(job.id);
+ await this.pipeline.waitForJobCompletion(job.id);
+ }
+ await this.documentManagementService.removeVersion(library, version2);
+ const message = `Successfully removed ${library}${version2 ? `@${version2}` : ""}.`;
+ logger.info(`✅ ${message}`);
+ return { message };
+ } catch (error) {
+ const errorMessage = `Failed to remove ${library}${version2 ? `@${version2}` : ""}: ${error instanceof Error ? error.message : String(error)}`;
+ logger.error(`❌ Error removing library: ${errorMessage}`);
+ throw new ToolError(errorMessage, this.constructor.name);
+ }
+ },
+ () => {
+ const { library, version: version2 } = args;
+ return {
+ library,
+ version: version2
+ // Success is implicit since if this callback runs, no exception was thrown
+ };
  }
- await this.documentManagementService.removeVersion(library, version2);
- const message = `Successfully removed ${library}${version2 ? `@${version2}` : ""}.`;
- logger.info(`✅ ${message}`);
- return { message };
- } catch (error) {
- const errorMessage = `Failed to remove ${library}${version2 ? `@${version2}` : ""}: ${error instanceof Error ? error.message : String(error)}`;
- logger.error(`❌ Error removing library: ${errorMessage}`);
- throw new ToolError(errorMessage, this.constructor.name);
- }
+ );
  }
  }
  class ScrapeTool {
@@ -2215,66 +2666,80 @@ class ScrapeTool {
  options: scraperOptions,
  waitForCompletion = true
  } = options;
- let internalVersion;
- const partialVersionRegex = /^\d+(\.\d+)?$/;
- if (version2 === null || version2 === void 0) {
- internalVersion = "";
- } else {
- const validFullVersion = semver.valid(version2);
- if (validFullVersion) {
- internalVersion = validFullVersion;
- } else if (partialVersionRegex.test(version2)) {
- const coercedVersion = semver.coerce(version2);
- if (coercedVersion) {
- internalVersion = coercedVersion.version;
+ return analytics.trackTool(
+ "scrape_docs",
+ async () => {
+ let internalVersion;
+ const partialVersionRegex = /^\d+(\.\d+)?$/;
+ if (version2 === null || version2 === void 0) {
+ internalVersion = "";
  } else {
- throw new Error(
- `Invalid version format for scraping: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`
- );
+ const validFullVersion = semver.valid(version2);
+ if (validFullVersion) {
+ internalVersion = validFullVersion;
+ } else if (partialVersionRegex.test(version2)) {
+ const coercedVersion = semver.coerce(version2);
+ if (coercedVersion) {
+ internalVersion = coercedVersion.version;
+ } else {
+ throw new Error(
+ `Invalid version format for scraping: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`
+ );
+ }
+ } else {
+ throw new Error(
+ `Invalid version format for scraping: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`
+ );
+ }
  }
- } else {
- throw new Error(
- `Invalid version format for scraping: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`
- );
- }
- }
- internalVersion = internalVersion.toLowerCase();
- const pipeline = this.pipeline;
- const enqueueVersion = internalVersion === "" ? null : internalVersion;
- const jobId = await pipeline.enqueueJob(library, enqueueVersion, {
- url,
- library,
- version: internalVersion,
- scope: scraperOptions?.scope ?? "subpages",
- followRedirects: scraperOptions?.followRedirects ?? true,
- maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES,
- maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH$1,
- maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
- ignoreErrors: scraperOptions?.ignoreErrors ?? true,
- scrapeMode: scraperOptions?.scrapeMode ?? ScrapeMode.Auto,
- // Pass scrapeMode enum
- includePatterns: scraperOptions?.includePatterns,
- excludePatterns: scraperOptions?.excludePatterns,
- headers: scraperOptions?.headers
- // <-- propagate headers
- });
- if (waitForCompletion) {
- try {
- await pipeline.waitForJobCompletion(jobId);
- const finalJob = await pipeline.getJob(jobId);
- const finalPagesScraped = finalJob?.progress?.pagesScraped ?? 0;
- logger.debug(
- `Job ${jobId} finished with status ${finalJob?.status}. Pages scraped: ${finalPagesScraped}`
- );
- return {
- pagesScraped: finalPagesScraped
- };
- } catch (error) {
- logger.error(`❌ Job ${jobId} failed or was cancelled: ${error}`);
- throw error;
- }
- }
- return { jobId };
+ internalVersion = internalVersion.toLowerCase();
+ const pipeline = this.pipeline;
+ const enqueueVersion = internalVersion === "" ? null : internalVersion;
+ const jobId = await pipeline.enqueueJob(library, enqueueVersion, {
+ url,
+ library,
+ version: internalVersion,
+ scope: scraperOptions?.scope ?? "subpages",
+ followRedirects: scraperOptions?.followRedirects ?? true,
+ maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES,
+ maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH$1,
+ maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
+ ignoreErrors: scraperOptions?.ignoreErrors ?? true,
+ scrapeMode: scraperOptions?.scrapeMode ?? ScrapeMode.Auto,
+ // Pass scrapeMode enum
+ includePatterns: scraperOptions?.includePatterns,
+ excludePatterns: scraperOptions?.excludePatterns,
+ headers: scraperOptions?.headers
+ // <-- propagate headers
+ });
+ if (waitForCompletion) {
+ try {
+ await pipeline.waitForJobCompletion(jobId);
+ const finalJob = await pipeline.getJob(jobId);
+ const finalPagesScraped = finalJob?.progress?.pagesScraped ?? 0;
+ logger.debug(
+ `Job ${jobId} finished with status ${finalJob?.status}. Pages scraped: ${finalPagesScraped}`
+ );
+ return {
+ pagesScraped: finalPagesScraped
+ };
+ } catch (error) {
+ logger.error(`❌ Job ${jobId} failed or was cancelled: ${error}`);
+ throw error;
+ }
+ }
+ return { jobId };
+ },
+ (result) => ({
+ library,
+ version: version2,
+ url,
+ waitForCompletion,
+ ...scraperOptions,
+ isBackgroundJob: "jobId" in result,
+ pagesScraped: "pagesScraped" in result ? result.pagesScraped : void 0
+ })
+ );
  }
  }
  class SearchTool {
@@ -2284,43 +2749,56 @@ class SearchTool {
  }
  async execute(options) {
  const { library, version: version2, query, limit = 5, exactMatch = false } = options;
- if (exactMatch && (!version2 || version2 === "latest")) {
- await this.docService.validateLibraryExists(library);
- const allLibraries = await this.docService.listLibraries();
- const libraryInfo = allLibraries.find((lib) => lib.library === library);
- const detailedVersions = libraryInfo ? libraryInfo.versions.map((v) => ({
- version: v.ref.version,
- documentCount: v.counts.documents,
- uniqueUrlCount: v.counts.uniqueUrls,
- indexedAt: v.indexedAt
- })) : [];
- throw new VersionNotFoundError(library, version2 ?? "latest", detailedVersions);
- }
- const resolvedVersion = version2 || "latest";
- logger.info(
- `🔍 Searching ${library}@${resolvedVersion} for: ${query}${exactMatch ? " (exact match)" : ""}`
- );
- try {
- await this.docService.validateLibraryExists(library);
- let versionToSearch = resolvedVersion;
- if (!exactMatch) {
- const versionResult = await this.docService.findBestVersion(library, version2);
- versionToSearch = versionResult.bestMatch;
- }
- const results = await this.docService.searchStore(
+ return analytics.trackTool(
+ "search_docs",
+ async () => {
+ if (exactMatch && (!version2 || version2 === "latest")) {
+ await this.docService.validateLibraryExists(library);
+ const allLibraries = await this.docService.listLibraries();
+ const libraryInfo = allLibraries.find((lib) => lib.library === library);
+ const detailedVersions = libraryInfo ? libraryInfo.versions.map((v) => ({
+ version: v.ref.version,
+ documentCount: v.counts.documents,
+ uniqueUrlCount: v.counts.uniqueUrls,
+ indexedAt: v.indexedAt
+ })) : [];
+ throw new VersionNotFoundError(library, version2 ?? "latest", detailedVersions);
+ }
+ const resolvedVersion = version2 || "latest";
+ logger.info(
+ `🔍 Searching ${library}@${resolvedVersion} for: ${query}${exactMatch ? " (exact match)" : ""}`
+ );
+ try {
+ await this.docService.validateLibraryExists(library);
+ let versionToSearch = resolvedVersion;
+ if (!exactMatch) {
+ const versionResult = await this.docService.findBestVersion(library, version2);
+ versionToSearch = versionResult.bestMatch;
+ }
+ const results = await this.docService.searchStore(
+ library,
+ versionToSearch,
+ query,
+ limit
+ );
+ logger.info(`✅ Found ${results.length} matching results`);
+ return { results };
+ } catch (error) {
+ logger.error(
+ `❌ Search failed: ${error instanceof Error ? error.message : "Unknown error"}`
+ );
+ throw error;
+ }
+ },
+ (result) => ({
  library,
- versionToSearch,
+ version: version2,
  query,
- limit
- );
- logger.info(`✅ Found ${results.length} matching results`);
- return { results };
- } catch (error) {
- logger.error(
- `❌ Search failed: ${error instanceof Error ? error.message : "Unknown error"}`
- );
- throw error;
- }
+ limit,
+ exactMatch,
+ resultCount: result.results.length
+ })
+ );
  }
  }
  function createResponse(text) {
@@ -3050,6 +3528,7 @@ async function registerMcpService(server, docService, pipeline, readOnly = false
  authEnabled: !!authManager,
  readOnly,
  servicesEnabled: ["mcp"]
+ // Embedding context will be resolved by the service that starts MCP
  });
  analytics.startSession(session);
  }
@@ -3100,23 +3579,10 @@ async function registerMcpService(server, docService, pipeline, readOnly = false
  const requestTransport = new StreamableHTTPServerTransport({
  sessionIdGenerator: void 0
  });
- if (analytics.isEnabled()) {
- const session = createMcpSession({
- protocol: "http",
- transport: "streamable",
- authEnabled: !!authManager,
- readOnly,
- servicesEnabled: ["mcp"]
- });
- analytics.startSession(session);
- }
  reply.raw.on("close", () => {
  logger.debug("Streamable HTTP request closed");
  requestTransport.close();
  requestServer.close();
- if (analytics.isEnabled()) {
- analytics.endSession();
- }
  });
  await requestServer.connect(requestTransport);
  await requestTransport.handleRequest(request.raw, reply.raw, request.body);
@@ -3260,6 +3726,15 @@ function createDataRouter(trpc) {
  return results;
  }
  ),
+ removeVersion: tt.procedure.input(z$1.object({ library: nonEmpty, version: optionalVersion })).mutation(
+ async ({
+ ctx,
+ input
+ }) => {
+ await ctx.docService.removeVersion(input.library, input.version ?? null);
+ return { ok: true };
+ }
+ ),
  removeAllDocuments: tt.procedure.input(z$1.object({ library: nonEmpty, version: optionalVersion })).mutation(
  async ({
  ctx,
@@ -3517,97 +3992,30 @@ function registerClearCompletedJobsRoute(server, clearCompletedJobsTool) {
3517
3992
  }
3518
3993
  });
3519
3994
  }
3520
- const VECTOR_DIMENSION = 1536;
3521
- function mapDbDocumentToDocument(doc) {
3522
- return {
3523
- id: doc.id,
3524
- pageContent: doc.content,
3525
- metadata: JSON.parse(doc.metadata)
3526
- };
3527
- }
3528
- var VersionStatus = /* @__PURE__ */ ((VersionStatus2) => {
3529
- VersionStatus2["NOT_INDEXED"] = "not_indexed";
3530
- VersionStatus2["QUEUED"] = "queued";
3531
- VersionStatus2["RUNNING"] = "running";
3532
- VersionStatus2["COMPLETED"] = "completed";
3533
- VersionStatus2["FAILED"] = "failed";
3534
- VersionStatus2["CANCELLED"] = "cancelled";
3535
- VersionStatus2["UPDATING"] = "updating";
3536
- return VersionStatus2;
3537
- })(VersionStatus || {});
3538
- function normalizeVersionName(name) {
3539
- return name ?? "";
3540
- }
3541
- function denormalizeVersionName(name) {
3542
- return name === "" ? "" : name;
3543
- }
3544
- function getStatusDescription(status) {
3545
- const descriptions = {
3546
- [
3547
- "not_indexed"
3548
- /* NOT_INDEXED */
3549
- ]: "Version created but not yet indexed",
3550
- [
3551
- "queued"
3552
- /* QUEUED */
3553
- ]: "Waiting in queue for indexing",
3554
- [
3555
- "running"
3556
- /* RUNNING */
3557
- ]: "Currently being indexed",
3558
- [
3559
- "completed"
3560
- /* COMPLETED */
3561
- ]: "Successfully indexed",
3562
- [
3563
- "failed"
3564
- /* FAILED */
3565
- ]: "Indexing failed",
3566
- [
3567
- "cancelled"
3568
- /* CANCELLED */
3569
- ]: "Indexing was cancelled",
3570
- [
3571
- "updating"
3572
- /* UPDATING */
3573
- ]: "Re-indexing in progress"
3574
- };
3575
- return descriptions[status] || "Unknown status";
3576
- }
3577
- function isActiveStatus(status) {
3578
- return [
3579
- "queued",
3580
- "running",
3581
- "updating"
3582
- /* UPDATING */
3583
- ].includes(
3584
- status
3585
- );
3586
- }
3587
- const VersionBadge = ({ version: version2 }) => {
3588
- if (!version2) {
3589
- return null;
3590
- }
3591
- return /* @__PURE__ */ jsx("span", { class: "bg-purple-100 text-purple-800 text-xs font-medium me-2 px-1.5 py-0.5 rounded dark:bg-purple-900 dark:text-purple-300", children: /* @__PURE__ */ jsx("span", { safe: true, children: version2 }) });
3592
- };
3593
- function getStatusClasses(status) {
3594
- const baseClasses = "px-1.5 py-0.5 text-xs font-medium rounded";
3595
- switch (status) {
3596
- case VersionStatus.COMPLETED:
3597
- return `${baseClasses} bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-300`;
3598
- case VersionStatus.RUNNING:
3599
- case VersionStatus.UPDATING:
3600
- return `${baseClasses} bg-blue-100 text-blue-800 dark:bg-blue-900 dark:text-blue-300`;
3601
- case VersionStatus.QUEUED:
3602
- return `${baseClasses} bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-yellow-300`;
3603
- case VersionStatus.FAILED:
3604
- return `${baseClasses} bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-300`;
3605
- case VersionStatus.CANCELLED:
3606
- return `${baseClasses} bg-gray-100 text-gray-800 dark:bg-gray-900 dark:text-gray-300`;
3607
- case VersionStatus.NOT_INDEXED:
3608
- default:
3609
- return `${baseClasses} bg-gray-100 text-gray-600 dark:bg-gray-800 dark:text-gray-400`;
3610
- }
3995
+ const VersionBadge = ({ version: version2 }) => {
3996
+ if (!version2) {
3997
+ return null;
3998
+ }
3999
+ return /* @__PURE__ */ jsx("span", { class: "bg-purple-100 text-purple-800 text-xs font-medium me-2 px-1.5 py-0.5 rounded dark:bg-purple-900 dark:text-purple-300", children: /* @__PURE__ */ jsx("span", { safe: true, children: version2 }) });
4000
+ };
4001
+ function getStatusClasses(status) {
4002
+ const baseClasses = "px-1.5 py-0.5 text-xs font-medium rounded";
4003
+ switch (status) {
4004
+ case VersionStatus.COMPLETED:
4005
+ return `${baseClasses} bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-300`;
4006
+ case VersionStatus.RUNNING:
4007
+ case VersionStatus.UPDATING:
4008
+ return `${baseClasses} bg-blue-100 text-blue-800 dark:bg-blue-900 dark:text-blue-300`;
4009
+ case VersionStatus.QUEUED:
4010
+ return `${baseClasses} bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-yellow-300`;
4011
+ case VersionStatus.FAILED:
4012
+ return `${baseClasses} bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-300`;
4013
+ case VersionStatus.CANCELLED:
4014
+ return `${baseClasses} bg-gray-100 text-gray-800 dark:bg-gray-900 dark:text-gray-300`;
4015
+ case VersionStatus.NOT_INDEXED:
4016
+ default:
4017
+ return `${baseClasses} bg-gray-100 text-gray-600 dark:bg-gray-800 dark:text-gray-400`;
4018
+ }
3611
4019
  }
3612
4020
  const StatusBadge = ({ status, showDescription = true }) => /* @__PURE__ */ jsx("span", { class: getStatusClasses(status), children: showDescription ? getStatusDescription(status) : status });
3613
4021
  const ProgressBar = ({ progress, showText = true }) => {
@@ -4939,16 +5347,12 @@ async function registerWorkerService(pipeline) {
4939
5347
  logger.warn(
4940
5348
  `⚠️ Job ${job.id} error ${document ? `on document ${document.metadata.url}` : ""}: ${error.message}`
4941
5349
  );
4942
- const errorInfo = sanitizeError(error);
4943
- analytics.track(TelemetryEvent.ERROR_OCCURRED, {
5350
+ analytics.captureException(error, {
4944
5351
  jobId: job.id,
4945
5352
  // Job IDs are already anonymous
4946
5353
  library: job.library,
4947
- errorType: errorInfo.type,
4948
- errorMessage: errorInfo.message,
4949
5354
  hasDocument: !!document,
4950
5355
  stage: document ? "document_processing" : "job_setup",
4951
- hasStack: errorInfo.hasStack,
4952
5356
  pages_processed_before_error: job.progressPages || 0
4953
5357
  });
4954
5358
  }
@@ -5027,13 +5431,13 @@ class AppServer {
5027
5431
  try {
5028
5432
  telemetryService.startSession({
5029
5433
  sessionId: crypto.randomUUID(),
5030
- interface: "web",
5434
+ appInterface: "web",
5031
5435
  startTime: /* @__PURE__ */ new Date(),
5032
- version: process.env.npm_package_version || "unknown",
5033
- platform: process.platform,
5034
- servicesEnabled: this.getActiveServicesList(),
5035
- authEnabled: Boolean(this.config.auth),
5036
- readOnly: Boolean(this.config.readOnly)
5436
+ appVersion: process.env.npm_package_version || "unknown",
5437
+ appPlatform: process.platform,
5438
+ appServicesEnabled: this.getActiveServicesList(),
5439
+ appAuthEnabled: Boolean(this.config.auth),
5440
+ appReadOnly: Boolean(this.config.readOnly)
5037
5441
  });
5038
5442
  } catch (error) {
5039
5443
  logger.debug(`Failed to initialize telemetry: ${error}`);
@@ -5110,11 +5514,10 @@ class AppServer {
5110
5514
  process.on("unhandledRejection", (reason) => {
5111
5515
  logger.error(`Unhandled Promise Rejection: ${reason}`);
5112
5516
  if (analytics.isEnabled()) {
5113
- analytics.track(TelemetryEvent.ERROR_OCCURRED, {
5114
- error_type: "UnhandledPromiseRejection",
5517
+ const error = reason instanceof Error ? reason : new Error(String(reason));
5518
+ analytics.captureException(error, {
5115
5519
  error_category: "system",
5116
- component: "AppServer",
5117
- severity: "critical",
5520
+ component: AppServer.constructor.name,
5118
5521
  context: "process_unhandled_rejection"
5119
5522
  });
5120
5523
  }
@@ -5124,11 +5527,9 @@ class AppServer {
5124
5527
  process.on("uncaughtException", (error) => {
5125
5528
  logger.error(`Uncaught Exception: ${error.message}`);
5126
5529
  if (analytics.isEnabled()) {
5127
- analytics.track(TelemetryEvent.ERROR_OCCURRED, {
5128
- error_type: error.constructor.name,
5530
+ analytics.captureException(error, {
5129
5531
  error_category: "system",
5130
- component: "AppServer",
5131
- severity: "critical",
5532
+ component: AppServer.constructor.name,
5132
5533
  context: "process_uncaught_exception"
5133
5534
  });
5134
5535
  }
@@ -5137,11 +5538,9 @@ class AppServer {
5137
5538
  if (typeof this.server.setErrorHandler === "function") {
5138
5539
  this.server.setErrorHandler(async (error, request, reply) => {
5139
5540
  if (analytics.isEnabled()) {
5140
- analytics.track(TelemetryEvent.ERROR_OCCURRED, {
5141
- error_type: error.constructor.name,
5541
+ analytics.captureException(error, {
5142
5542
  error_category: "http",
5143
5543
  component: "FastifyServer",
5144
- severity: "high",
5145
5544
  status_code: error.statusCode || 500,
5146
5545
  method: request.method,
5147
5546
  route: request.routeOptions?.url || request.url,
@@ -5273,195 +5672,2564 @@ class AppServer {
5273
5672
  logger.debug("OAuth2 proxy endpoints registered");
5274
5673
  }
5275
5674
  /**
5276
- * Log startup information showing which services are enabled.
5675
+ * Log startup information showing which services are enabled.
5676
+ */
5677
+ logStartupInfo(address) {
5678
+ logger.info(`🚀 AppServer available at ${address}`);
5679
+ const enabledServices = [];
5680
+ if (this.config.enableWebInterface) {
5681
+ enabledServices.push(`Web interface: ${address}`);
5682
+ }
5683
+ if (this.config.enableMcpServer) {
5684
+ enabledServices.push(`MCP endpoints: ${address}/mcp, ${address}/sse`);
5685
+ }
5686
+ if (this.config.enableApiServer) {
5687
+ enabledServices.push(`API: ${address}/api`);
5688
+ }
5689
+ if (this.config.enableWorker) {
5690
+ enabledServices.push("Embedded worker: enabled");
5691
+ } else if (this.config.externalWorkerUrl) {
5692
+ enabledServices.push(`External worker: ${this.config.externalWorkerUrl}`);
5693
+ }
5694
+ for (const service of enabledServices) {
5695
+ logger.info(` • ${service}`);
5696
+ }
5697
+ }
5698
+ }
5699
+ async function startAppServer(docService, pipeline, config) {
5700
+ const appServer = new AppServer(docService, pipeline, config);
5701
+ await appServer.start();
5702
+ return appServer;
5703
+ }
5704
+ async function startStdioServer(tools, readOnly = false) {
5705
+ setLogLevel(LogLevel.ERROR);
5706
+ const server = createMcpServerInstance(tools, readOnly);
5707
+ const transport = new StdioServerTransport();
5708
+ await server.connect(transport);
5709
+ logger.info("🤖 MCP server listening on stdio");
5710
+ return server;
5711
+ }
5712
+ class DocumentManagementClient {
5713
+ baseUrl;
5714
+ client;
5715
+ constructor(serverUrl) {
5716
+ this.baseUrl = serverUrl.replace(/\/$/, "");
5717
+ this.client = createTRPCProxyClient({
5718
+ links: [httpBatchLink({ url: this.baseUrl })]
5719
+ });
5720
+ logger.debug(`DocumentManagementClient (tRPC) created for: ${this.baseUrl}`);
5721
+ }
5722
+ async initialize() {
5723
+ await this.client.ping.query();
5724
+ }
5725
+ async shutdown() {
5726
+ }
5727
+ async listLibraries() {
5728
+ return this.client.listLibraries.query();
5729
+ }
5730
+ async validateLibraryExists(library) {
5731
+ await this.client.validateLibraryExists.mutate({ library });
5732
+ }
5733
+ async findBestVersion(library, targetVersion) {
5734
+ return this.client.findBestVersion.query({ library, targetVersion });
5735
+ }
5736
+ async searchStore(library, version2, query, limit) {
5737
+ return this.client.search.query({ library, version: version2 ?? null, query, limit });
5738
+ }
5739
+ async removeVersion(library, version2) {
5740
+ await this.client.removeVersion.mutate({ library, version: version2 });
5741
+ }
5742
+ async removeAllDocuments(library, version2) {
5743
+ await this.client.removeAllDocuments.mutate({ library, version: version2 ?? null });
5744
+ }
5745
+ async getVersionsByStatus(statuses) {
5746
+ return this.client.getVersionsByStatus.query({
5747
+ statuses
5748
+ });
5749
+ }
5750
+ async findVersionsBySourceUrl(url) {
5751
+ return this.client.findVersionsBySourceUrl.query({ url });
5752
+ }
5753
+ async getScraperOptions(versionId) {
5754
+ return this.client.getScraperOptions.query({ versionId });
5755
+ }
5756
+ async updateVersionStatus(versionId, status, errorMessage) {
5757
+ await this.client.updateVersionStatus.mutate({ versionId, status, errorMessage });
5758
+ }
5759
+ async updateVersionProgress(versionId, pages, maxPages) {
5760
+ await this.client.updateVersionProgress.mutate({ versionId, pages, maxPages });
5761
+ }
5762
+ async storeScraperOptions(versionId, options) {
5763
+ await this.client.storeScraperOptions.mutate({ versionId, options });
5764
+ }
5765
+ }
5766
+ class SplitterError extends Error {
5767
+ }
5768
+ class MinimumChunkSizeError extends SplitterError {
5769
+ constructor(size, maxSize) {
5770
+ super(
5771
+ `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
5772
+ );
5773
+ }
5774
+ }
5775
+ class ContentSplitterError extends SplitterError {
5776
+ }
5777
+ class GreedySplitter {
5778
+ baseSplitter;
5779
+ minChunkSize;
5780
+ preferredChunkSize;
5781
+ /**
5782
+ * Combines a base document splitter with size constraints to produce optimally-sized chunks.
5783
+ * The base splitter handles the initial semantic splitting, while this class handles
5784
+ * the concatenation strategy.
5785
+ */
5786
+ constructor(baseSplitter, minChunkSize, preferredChunkSize) {
5787
+ this.baseSplitter = baseSplitter;
5788
+ this.minChunkSize = minChunkSize;
5789
+ this.preferredChunkSize = preferredChunkSize;
5790
+ }
5791
+ /**
5792
+ * Uses a greedy concatenation strategy to build optimally-sized chunks. Small chunks
5793
+ * are combined until they reach the minimum size, but splits are preserved at major
5794
+ * section boundaries to maintain document structure. This balances the need for
5795
+ * context with semantic coherence.
5796
+ */
5797
+ async splitText(markdown) {
5798
+ const initialChunks = await this.baseSplitter.splitText(markdown);
5799
+ const concatenatedChunks = [];
5800
+ let currentChunk = null;
5801
+ for (const nextChunk of initialChunks) {
5802
+ if (currentChunk) {
5803
+ if (this.wouldExceedMaxSize(currentChunk, nextChunk)) {
5804
+ concatenatedChunks.push(currentChunk);
5805
+ currentChunk = this.cloneChunk(nextChunk);
5806
+ continue;
5807
+ }
5808
+ if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk)) {
5809
+ concatenatedChunks.push(currentChunk);
5810
+ currentChunk = this.cloneChunk(nextChunk);
5811
+ continue;
5812
+ }
5813
+ currentChunk.content += `
5814
+ ${nextChunk.content}`;
5815
+ currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
5816
+ currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
5817
+ } else {
5818
+ currentChunk = this.cloneChunk(nextChunk);
5819
+ }
5820
+ }
5821
+ if (currentChunk) {
5822
+ concatenatedChunks.push(currentChunk);
5823
+ }
5824
+ return concatenatedChunks;
5825
+ }
5826
+ cloneChunk(chunk) {
5827
+ return {
5828
+ types: [...chunk.types],
5829
+ content: chunk.content,
5830
+ section: {
5831
+ level: chunk.section.level,
5832
+ path: [...chunk.section.path]
5833
+ }
5834
+ };
5835
+ }
5836
+ /**
5837
+ * H1 and H2 headings represent major conceptual breaks in the document.
5838
+ * Preserving these splits helps maintain the document's logical structure.
5839
+ */
5840
+ startsNewMajorSection(chunk) {
5841
+ return chunk.section.level === 1 || chunk.section.level === 2;
5842
+ }
5843
+ /**
5844
+ * Size check against the preferred chunk size so concatenated chunks stay
5845
+ * within embedding model constraints and avoid truncation.
5846
+ */
5847
+ wouldExceedMaxSize(currentChunk, nextChunk) {
5848
+ if (!currentChunk) {
5849
+ return false;
5850
+ }
5851
+ return currentChunk.content.length + nextChunk.content.length > this.preferredChunkSize;
5852
+ }
5853
+ /**
5854
+ * Checks if one path is a prefix of another path, indicating a parent-child relationship
5855
+ */
5856
+ isPathIncluded(parentPath, childPath) {
5857
+ if (parentPath.length >= childPath.length) return false;
5858
+ return parentPath.every((part, i) => part === childPath[i]);
5859
+ }
5860
+ /**
5861
+ * Merges section metadata when concatenating chunks, following these rules:
5862
+ * 1. Level: Always uses the lowest (most general) level between chunks
5863
+ * 2. Path selection:
5864
+ * - For parent-child relationships (one path includes the other), uses the child's path
5865
+ * - For siblings/unrelated sections, uses the common parent path
5866
+ * - If no common path exists, uses the root path ([])
5867
+ */
5868
+ mergeSectionInfo(currentChunk, nextChunk) {
5869
+ const level = Math.min(currentChunk.section.level, nextChunk.section.level);
5870
+ if (currentChunk.section.level === nextChunk.section.level && currentChunk.section.path.length === nextChunk.section.path.length && currentChunk.section.path.every((p, i) => p === nextChunk.section.path[i])) {
5871
+ return currentChunk.section;
5872
+ }
5873
+ if (this.isPathIncluded(currentChunk.section.path, nextChunk.section.path)) {
5874
+ return {
5875
+ path: nextChunk.section.path,
5876
+ level
5877
+ };
5878
+ }
5879
+ if (this.isPathIncluded(nextChunk.section.path, currentChunk.section.path)) {
5880
+ return {
5881
+ path: currentChunk.section.path,
5882
+ level
5883
+ };
5884
+ }
5885
+ const commonPath = this.findCommonPrefix(
5886
+ currentChunk.section.path,
5887
+ nextChunk.section.path
5888
+ );
5889
+ return {
5890
+ path: commonPath,
5891
+ level
5892
+ };
5893
+ }
5894
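// Editor's worked example of the merge rules above (paths are hypothetical):
//   ["API"] + ["API", "Auth"]          -> ["API", "Auth"]  (parent/child: child path wins)
//   ["API", "Auth"] + ["API", "Hooks"] -> ["API"]          (siblings: common parent)
//   ["API"] + ["CLI"]                  -> []               (no common path: root)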
+ mergeTypes(currentTypes, nextTypes) {
5895
+ return [.../* @__PURE__ */ new Set([...currentTypes, ...nextTypes])];
5896
+ }
5897
+ /**
5898
+ * Returns longest common prefix between two paths
5899
+ */
5900
+ findCommonPrefix(path1, path2) {
5901
+ const common = [];
5902
+ for (let i = 0; i < Math.min(path1.length, path2.length); i++) {
5903
+ if (path1[i] === path2[i]) {
5904
+ common.push(path1[i]);
5905
+ } else {
5906
+ break;
5907
+ }
5908
+ }
5909
+ return common;
5910
+ }
5911
+ }
5912
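// Editor's sketch (not part of the package): exercising GreedySplitter with a
// stub base splitter. Sizes and contents are illustrative; run inside an
// async context.
const stubSplitter = {
  async splitText() {
    const section = { level: 3, path: ["Guide", "Usage"] };
    return ["alpha", "beta", "gamma", "x".repeat(30)].map((content) => ({
      types: ["text"],
      content,
      section,
    }));
  },
};
const greedy = new GreedySplitter(stubSplitter, 10, 40);
const mergedChunks = await greedy.splitText("(markdown is ignored by the stub)");
// mergedChunks.length === 2: "alpha\nbeta\ngamma" (greedily concatenated), plus
// the 30-character chunk that would have exceeded the preferred size of 40.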
+ const fullTrim = (str) => {
5913
+ return str.replace(/^[\s\r\n\t]+|[\s\r\n\t]+$/g, "");
5914
+ };
5915
+ class CodeContentSplitter {
5916
+ constructor(options) {
5917
+ this.options = options;
5918
+ }
5919
+ async split(content) {
5920
+ const language = content.match(/^```(\w+)\n/)?.[1];
5921
+ const strippedContent = content.replace(/^```(\w*)\n/, "").replace(/```\s*$/, "");
5922
+ const lines = strippedContent.split("\n");
5923
+ const chunks = [];
5924
+ let currentChunkLines = [];
5925
+ for (const line of lines) {
5926
+ const singleLineSize = this.wrap(line, language).length;
5927
+ if (singleLineSize > this.options.chunkSize) {
5928
+ throw new MinimumChunkSizeError(singleLineSize, this.options.chunkSize);
5929
+ }
5930
+ currentChunkLines.push(line);
5931
+ const newChunkContent = this.wrap(currentChunkLines.join("\n"), language);
5932
+ const newChunkSize = newChunkContent.length;
5933
+ if (newChunkSize > this.options.chunkSize && currentChunkLines.length > 1) {
5934
+ const lastLine = currentChunkLines.pop();
5935
+ chunks.push(this.wrap(currentChunkLines.join("\n"), language));
5936
+ currentChunkLines = [lastLine];
5937
+ }
5938
+ }
5939
+ if (currentChunkLines.length > 0) {
5940
+ chunks.push(this.wrap(currentChunkLines.join("\n"), language));
5941
+ }
5942
+ return chunks;
5943
+ }
5944
+ wrap(content, language) {
5945
+ return `\`\`\`${language || ""}
5946
+ ${content.replace(/\n+$/, "")}
5947
+ \`\`\``;
5948
+ }
5949
+ }
5950
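// Editor's sketch: the code splitter works line-by-line and re-wraps every
// chunk as a fenced block tagged with the original language.
const codeSplitter = new CodeContentSplitter({ chunkSize: 80 });
const codeChunks = await codeSplitter.split("```js\nconst a = 1;\nconst b = 2;\n```");
// Both lines fit within one 80-character chunk, so codeChunks holds a single
// self-contained ```js fenced block.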
+ class TableContentSplitter {
5951
+ constructor(options) {
5952
+ this.options = options;
5953
+ }
5954
+ /**
5955
+ * Splits table content into chunks while preserving table structure
5956
+ */
5957
+ async split(content) {
5958
+ const parsedTable = this.parseTable(content);
5959
+ if (!parsedTable) {
5960
+ return [content];
5961
+ }
5962
+ const { headers, rows } = parsedTable;
5963
+ const chunks = [];
5964
+ let currentRows = [];
5965
+ for (const row of rows) {
5966
+ const singleRowSize = this.wrap(row, headers).length;
5967
+ if (singleRowSize > this.options.chunkSize) {
5968
+ throw new MinimumChunkSizeError(singleRowSize, this.options.chunkSize);
5969
+ }
5970
+ const newChunkContent = this.wrap([...currentRows, row].join("\n"), headers);
5971
+ const newChunkSize = newChunkContent.length;
5972
+ if (newChunkSize > this.options.chunkSize && currentRows.length > 0) {
5973
+ chunks.push(this.wrap(currentRows.join("\n"), headers));
5974
+ currentRows = [row];
5975
+ } else {
5976
+ currentRows.push(row);
5977
+ }
5978
+ }
5979
+ if (currentRows.length > 0) {
5980
+ chunks.push(this.wrap(currentRows.join("\n"), headers));
5981
+ }
5982
+ return chunks;
5983
+ }
5984
+ wrap(content, headers) {
5985
+ const headerRow = `| ${headers.join(" | ")} |`;
5986
+ const separatorRow = `|${headers.map(() => "---").join("|")}|`;
5987
+ return [headerRow, separatorRow, content].join("\n");
5988
+ }
5989
+ parseTable(content) {
5990
+ const lines = content.trim().split("\n");
5991
+ if (lines.length < 3) return null;
5992
+ const headers = this.parseRow(lines[0]);
5993
+ if (!headers) return null;
5994
+ const separator = lines[1];
5995
+ if (!this.isValidSeparator(separator)) return null;
5996
+ const rows = lines.slice(2).filter((row) => row.trim() !== "");
5997
+ return { headers, separator, rows };
5998
+ }
5999
+ /**
6000
+ * Parses a table row into cells
6001
+ */
6002
+ parseRow(row) {
6003
+ if (!row.includes("|")) return null;
6004
+ return row.split("|").map((cell) => cell.trim()).filter((cell) => cell !== "");
6005
+ }
6006
+ /**
6007
+ * Validates the separator row of the table
6008
+ */
6009
+ isValidSeparator(separator) {
6010
+ return separator.includes("|") && /^\|?[\s-|]+\|?$/.test(separator);
6011
+ }
6012
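// Editor's sketch (illustrative sizes): every emitted chunk repeats the header
// and separator rows, so a chunkSize of 45 turns this table into two
// self-contained fragments.
const tableSplitter = new TableContentSplitter({ chunkSize: 45 });
const tableChunks = await tableSplitter.split(
  ["| name | role |", "|---|---|", "| ada | admin |", "| bob | user |"].join("\n")
);
// tableChunks[0] === "| name | role |\n|---|---|\n| ada | admin |"
// tableChunks[1] === "| name | role |\n|---|---|\n| bob | user |"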
+ }
6013
+ class TextContentSplitter {
6014
+ constructor(options) {
6015
+ this.options = options;
6016
+ }
6017
+ /**
6018
+ * Splits text content into chunks while trying to preserve semantic boundaries.
6019
+ * Prefers paragraph breaks, then line breaks, finally falling back to word boundaries.
6020
+ */
6021
+ async split(content) {
6022
+ const trimmedContent = fullTrim(content);
6023
+ if (trimmedContent.length <= this.options.chunkSize) {
6024
+ return [trimmedContent];
6025
+ }
6026
+ const words = trimmedContent.split(/\s+/);
6027
+ const longestWord = words.reduce(
6028
+ (max, word) => word.length > max.length ? word : max
6029
+ );
6030
+ if (longestWord.length > this.options.chunkSize) {
6031
+ throw new MinimumChunkSizeError(longestWord.length, this.options.chunkSize);
6032
+ }
6033
+ const paragraphChunks = this.splitByParagraphs(trimmedContent);
6034
+ if (this.areChunksValid(paragraphChunks)) {
6035
+ return paragraphChunks;
6036
+ }
6037
+ const lineChunks = this.splitByLines(trimmedContent);
6038
+ if (this.areChunksValid(lineChunks)) {
6039
+ return this.mergeChunks(lineChunks, "\n");
6040
+ }
6041
+ const wordChunks = await this.splitByWords(trimmedContent);
6042
+ return this.mergeChunks(wordChunks, " ");
6043
+ }
6044
+ /**
6045
+ * Checks if all chunks are within the maximum size limit
6046
+ */
6047
+ areChunksValid(chunks) {
6048
+ return chunks.every((chunk) => chunk.length <= this.options.chunkSize);
6049
+ }
6050
+ /**
6051
+ * Splits text into chunks by paragraph boundaries (double newlines)
6052
+ */
6053
+ splitByParagraphs(text) {
6054
+ const paragraphs = text.split(/\n\s*\n/).map((p) => fullTrim(p)).filter(Boolean);
6055
+ return paragraphs.filter((chunk) => chunk.length > 2);
6056
+ }
6057
+ /**
6058
+ * Splits text into chunks by line boundaries
6059
+ */
6060
+ splitByLines(text) {
6061
+ const lines = text.split(/\n/).map((line) => fullTrim(line)).filter(Boolean);
6062
+ return lines.filter((chunk) => chunk.length > 1);
6063
+ }
6064
+ /**
6065
+ * Uses LangChain's recursive splitter for word-based splitting as a last resort
6066
+ */
6067
+ async splitByWords(text) {
6068
+ const splitter = new RecursiveCharacterTextSplitter({
6069
+ chunkSize: this.options.chunkSize,
6070
+ chunkOverlap: 0
6071
+ });
6072
+ const chunks = await splitter.splitText(text);
6073
+ return chunks;
6074
+ }
6075
+ /**
6076
+ * Attempts to merge small chunks with previous chunks to minimize fragmentation.
6077
+ * Only merges if combined size is within maxChunkSize.
6078
+ */
6079
+ mergeChunks(chunks, separator) {
6080
+ const mergedChunks = [];
6081
+ let currentChunk = null;
6082
+ for (const chunk of chunks) {
6083
+ if (currentChunk === null) {
6084
+ currentChunk = chunk;
6085
+ continue;
6086
+ }
6087
+ const currentChunkSize = this.getChunkSize(currentChunk);
6088
+ const nextChunkSize = this.getChunkSize(chunk);
6089
+ if (currentChunkSize + nextChunkSize + separator.length <= this.options.chunkSize) {
6090
+ currentChunk = `${currentChunk}${separator}${chunk}`;
6091
+ } else {
6092
+ mergedChunks.push(currentChunk);
6093
+ currentChunk = chunk;
6094
+ }
6095
+ }
6096
+ if (currentChunk) {
6097
+ mergedChunks.push(currentChunk);
6098
+ }
6099
+ return mergedChunks;
6100
+ }
6101
+ getChunkSize(chunk) {
6102
+ return chunk.length;
6103
+ }
6104
+ wrap(content) {
6105
+ return content;
6106
+ }
6107
+ }
6108
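// Editor's sketch: content longer than chunkSize falls back to paragraph
// boundaries first; both paragraphs here fit within 80 characters, so no
// line- or word-level splitting is needed.
const textSplitter = new TextContentSplitter({ chunkSize: 80 });
const para1 = "first paragraph ".repeat(3).trim();
const para2 = "second paragraph ".repeat(3).trim();
const textParts = await textSplitter.split(`${para1}\n\n${para2}`);
// textParts.length === 2, one entry per paragraph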
+ class SemanticMarkdownSplitter {
6109
+ constructor(preferredChunkSize, maxChunkSize) {
6110
+ this.preferredChunkSize = preferredChunkSize;
6111
+ this.maxChunkSize = maxChunkSize;
6112
+ this.turndownService = new TurndownService({
6113
+ headingStyle: "atx",
6114
+ hr: "---",
6115
+ bulletListMarker: "-",
6116
+ codeBlockStyle: "fenced",
6117
+ emDelimiter: "_",
6118
+ strongDelimiter: "**",
6119
+ linkStyle: "inlined"
6120
+ });
6121
+ this.turndownService.addRule("table", {
6122
+ filter: ["table"],
6123
+ replacement: (_content, node) => {
6124
+ const table = node;
6125
+ const headers = Array.from(table.querySelectorAll("th")).map(
6126
+ (th) => th.textContent?.trim() || ""
6127
+ );
6128
+ const rows = Array.from(table.querySelectorAll("tr")).filter(
6129
+ (tr) => !tr.querySelector("th")
6130
+ );
6131
+ if (headers.length === 0 && rows.length === 0) return "";
6132
+ let markdown = "\n";
6133
+ if (headers.length > 0) {
6134
+ markdown += `| ${headers.join(" | ")} |
6135
+ `;
6136
+ markdown += `|${headers.map(() => "---").join("|")}|
6137
+ `;
6138
+ }
6139
+ for (const row of rows) {
6140
+ const cells = Array.from(row.querySelectorAll("td")).map(
6141
+ (td) => td.textContent?.trim() || ""
6142
+ );
6143
+ markdown += `| ${cells.join(" | ")} |
6144
+ `;
6145
+ }
6146
+ return markdown;
6147
+ }
6148
+ });
6149
+ this.textSplitter = new TextContentSplitter({
6150
+ chunkSize: this.preferredChunkSize
6151
+ });
6152
+ this.codeSplitter = new CodeContentSplitter({
6153
+ chunkSize: this.maxChunkSize
6154
+ });
6155
+ this.tableSplitter = new TableContentSplitter({
6156
+ chunkSize: this.maxChunkSize
6157
+ });
6158
+ }
6159
+ turndownService;
6160
+ textSplitter;
6161
+ codeSplitter;
6162
+ tableSplitter;
6163
+ /**
6164
+ * Main entry point for splitting markdown content
6165
+ */
6166
+ async splitText(markdown) {
6167
+ const html = await this.markdownToHtml(markdown);
6168
+ const dom = await this.parseHtml(html);
6169
+ const sections = await this.splitIntoSections(dom);
6170
+ return this.splitSectionContent(sections);
6171
+ }
6172
+ /**
6173
+ * Step 1: Split document into sections based on H1-H6 headings,
6174
+ * as well as code blocks and tables.
6175
+ */
6176
+ async splitIntoSections(dom) {
6177
+ const body = dom.querySelector("body");
6178
+ if (!body) {
6179
+ throw new Error("Invalid HTML structure: no body element found");
6180
+ }
6181
+ let currentSection = this.createRootSection();
6182
+ const sections = [];
6183
+ const stack = [currentSection];
6184
+ for (const element of Array.from(body.children)) {
6185
+ const headingMatch = element.tagName.match(/H([1-6])/);
6186
+ if (headingMatch) {
6187
+ const level = Number.parseInt(headingMatch[1], 10);
6188
+ const title = fullTrim(element.textContent || "");
6189
+ while (stack.length > 1 && stack[stack.length - 1].level >= level) {
6190
+ stack.pop();
6191
+ }
6192
+ currentSection = {
6193
+ level,
6194
+ path: [
6195
+ ...stack.slice(1).reduce((acc, s) => {
6196
+ const lastPath = s.path[s.path.length - 1];
6197
+ if (lastPath) acc.push(lastPath);
6198
+ return acc;
6199
+ }, []),
6200
+ title
6201
+ ],
6202
+ content: [
6203
+ {
6204
+ type: "heading",
6205
+ text: `${"#".repeat(level)} ${title}`
6206
+ }
6207
+ ]
6208
+ };
6209
+ sections.push(currentSection);
6210
+ stack.push(currentSection);
6211
+ } else if (element.tagName === "PRE") {
6212
+ const code = element.querySelector("code");
6213
+ const language = code?.className.replace("language-", "") || "";
6214
+ const content = code?.textContent || element.textContent || "";
6215
+ const markdown = `${"```"}${language}
6216
+ ${content}
6217
+ ${"```"}`;
6218
+ currentSection = {
6219
+ level: currentSection.level,
6220
+ path: currentSection.path,
6221
+ content: [
6222
+ {
6223
+ type: "code",
6224
+ text: markdown
6225
+ }
6226
+ ]
6227
+ };
6228
+ sections.push(currentSection);
6229
+ } else if (element.tagName === "TABLE") {
6230
+ const markdown = fullTrim(this.turndownService.turndown(element.outerHTML));
6231
+ currentSection = {
6232
+ level: currentSection.level,
6233
+ path: currentSection.path,
6234
+ content: [
6235
+ {
6236
+ type: "table",
6237
+ text: markdown
6238
+ }
6239
+ ]
6240
+ };
6241
+ sections.push(currentSection);
6242
+ } else {
6243
+ const markdown = fullTrim(this.turndownService.turndown(element.innerHTML));
6244
+ if (markdown) {
6245
+ currentSection = {
6246
+ level: currentSection.level,
6247
+ path: currentSection.path,
6248
+ content: [
6249
+ {
6250
+ type: "text",
6251
+ text: markdown
6252
+ }
6253
+ ]
6254
+ };
6255
+ sections.push(currentSection);
6256
+ }
6257
+ }
6258
+ }
6259
+ return sections;
6260
+ }
6261
+ /**
6262
+ * Step 2: Split section content into smaller chunks
6263
+ */
6264
+ async splitSectionContent(sections) {
6265
+ const chunks = [];
6266
+ for (const section of sections) {
6267
+ for (const content of section.content) {
6268
+ let splitContent = [];
6269
+ try {
6270
+ switch (content.type) {
6271
+ case "heading":
6272
+ case "text": {
6273
+ splitContent = await this.textSplitter.split(content.text);
6274
+ break;
6275
+ }
6276
+ case "code": {
6277
+ splitContent = await this.codeSplitter.split(content.text);
6278
+ break;
6279
+ }
6280
+ case "table": {
6281
+ splitContent = await this.tableSplitter.split(content.text);
6282
+ break;
6283
+ }
6284
+ }
6285
+ } catch (err) {
6286
+ if (err instanceof MinimumChunkSizeError) {
6287
+ logger.warn(
6288
+ `⚠ Cannot split ${content.type} chunk normally, using RecursiveCharacterTextSplitter: ${err.message}`
6289
+ );
6290
+ const splitter = new RecursiveCharacterTextSplitter({
6291
+ chunkSize: this.maxChunkSize,
6292
+ chunkOverlap: Math.min(20, Math.floor(this.maxChunkSize * 0.1)),
6293
+ // Use more aggressive separators including empty string as last resort
6294
+ separators: [
6295
+ "\n\n",
6296
+ "\n",
6297
+ " ",
6298
+ " ",
6299
+ ".",
6300
+ ",",
6301
+ ";",
6302
+ ":",
6303
+ "-",
6304
+ "(",
6305
+ ")",
6306
+ "[",
6307
+ "]",
6308
+ "{",
6309
+ "}",
6310
+ ""
6311
+ ]
6312
+ });
6313
+ const chunks2 = await splitter.splitText(content.text);
6314
+ if (chunks2.length === 0) {
6315
+ splitContent = [content.text.substring(0, this.maxChunkSize)];
6316
+ } else {
6317
+ splitContent = chunks2;
6318
+ }
6319
+ } else {
6320
+ const errMessage = err instanceof Error ? err.message : String(err);
6321
+ throw new ContentSplitterError(
6322
+ `Failed to split ${content.type} content: ${errMessage}`
6323
+ );
6324
+ }
6325
+ }
6326
+ chunks.push(
6327
+ ...splitContent.map(
6328
+ (text) => ({
6329
+ types: [content.type],
6330
+ content: text,
6331
+ section: {
6332
+ level: section.level,
6333
+ path: section.path
6334
+ }
6335
+ })
6336
+ )
6337
+ );
6338
+ }
6339
+ }
6340
+ return chunks;
6341
+ }
6342
+ /**
6343
+ * Helper to create the root section
6344
+ */
6345
+ createRootSection() {
6346
+ return {
6347
+ level: 0,
6348
+ path: [],
6349
+ content: []
6350
+ };
6351
+ }
6352
+ /**
6353
+ * Convert markdown to HTML using remark
6354
+ */
6355
+ async markdownToHtml(markdown) {
6356
+ const html = await unified().use(remarkParse).use(remarkGfm).use(remarkHtml).process(markdown);
6357
+ return `<!DOCTYPE html>
6358
+ <html>
6359
+ <body>
6360
+ ${String(html)}
6361
+ </body>
6362
+ </html>`;
6363
+ }
6364
+ /**
6365
+ * Parse HTML
6366
+ */
6367
+ async parseHtml(html) {
6368
+ const { window } = createJSDOM(html);
6369
+ return window.document;
6370
+ }
6371
+ }
6372
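// Editor's sketch: end-to-end use of the splitter above; the chunk sizes
// (500 preferred / 1000 max) are illustrative.
const markdownSplitter = new SemanticMarkdownSplitter(500, 1000);
const docChunks = await markdownSplitter.splitText(
  "# Guide\n\nSome intro text.\n\n## Setup\n\n```bash\nnpm install\n```\n"
);
// Each chunk carries its heading path; the fenced code block, for example,
// yields { types: ["code"], section: { level: 2, path: ["Guide", "Setup"] } }.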
+ const CHILD_LIMIT = 5;
6373
+ const SIBLING_LIMIT = 2;
6374
+ class DocumentRetrieverService {
6375
+ documentStore;
6376
+ constructor(documentStore) {
6377
+ this.documentStore = documentStore;
6378
+ }
6379
+ /**
6380
+ * Collects all related chunk IDs for a given initial hit.
6381
+ * Returns an object with url, hitId, relatedIds (Set), and score.
6382
+ */
6383
+ async getRelatedChunkIds(library, version2, doc, siblingLimit = SIBLING_LIMIT, childLimit = CHILD_LIMIT) {
6384
+ const id = doc.id;
6385
+ const url = doc.metadata.url;
6386
+ const score = doc.metadata.score;
6387
+ const relatedIds = /* @__PURE__ */ new Set();
6388
+ relatedIds.add(id);
6389
+ const parent = await this.documentStore.findParentChunk(library, version2, id);
6390
+ if (parent) {
6391
+ relatedIds.add(parent.id);
6392
+ }
6393
+ const precedingSiblings = await this.documentStore.findPrecedingSiblingChunks(
6394
+ library,
6395
+ version2,
6396
+ id,
6397
+ siblingLimit
6398
+ );
6399
+ for (const sib of precedingSiblings) {
6400
+ relatedIds.add(sib.id);
6401
+ }
6402
+ const childChunks = await this.documentStore.findChildChunks(
6403
+ library,
6404
+ version2,
6405
+ id,
6406
+ childLimit
6407
+ );
6408
+ for (const child of childChunks) {
6409
+ relatedIds.add(child.id);
6410
+ }
6411
+ const subsequentSiblings = await this.documentStore.findSubsequentSiblingChunks(
6412
+ library,
6413
+ version2,
6414
+ id,
6415
+ siblingLimit
6416
+ );
6417
+ for (const sib of subsequentSiblings) {
6418
+ relatedIds.add(sib.id);
6419
+ }
6420
+ return { url, hitId: id, relatedIds, score };
6421
+ }
6422
+ /**
6423
+ * Groups related chunk info by URL, deduplicates IDs, and finds max score per URL.
6424
+ */
6425
+ groupAndPrepareFetch(relatedInfos) {
6426
+ const urlMap = /* @__PURE__ */ new Map();
6427
+ for (const info of relatedInfos) {
6428
+ let entry = urlMap.get(info.url);
6429
+ if (!entry) {
6430
+ entry = { uniqueChunkIds: /* @__PURE__ */ new Set(), maxScore: info.score };
6431
+ urlMap.set(info.url, entry);
6432
+ }
6433
+ for (const id of info.relatedIds) {
6434
+ entry.uniqueChunkIds.add(id);
6435
+ }
6436
+ if (info.score > entry.maxScore) {
6437
+ entry.maxScore = info.score;
6438
+ }
6439
+ }
6440
+ return urlMap;
6441
+ }
6442
+ /**
6443
+ * Finalizes the merged result for a URL group by fetching, sorting, and joining content.
6444
+ */
6445
+ async finalizeResult(library, version2, url, uniqueChunkIds, maxScore) {
6446
+ const ids = Array.from(uniqueChunkIds);
6447
+ const docs = await this.documentStore.findChunksByIds(library, version2, ids);
6448
+ const content = docs.map((d) => d.pageContent).join("\n\n");
6449
+ return {
6450
+ url,
6451
+ content,
6452
+ score: maxScore
6453
+ };
6454
+ }
6455
+ /**
6456
+ * Searches for documents and expands the context around the matches.
6457
+ * @param library The library name.
6458
+ * @param version The library version (optional; defaults to searching documents indexed without a version).
6459
+ * @param query The search query.
6462
+ * @param limit The optional limit for the initial search results.
6463
+ * @returns An array of per-URL results containing the aggregated content of the retrieved chunks and the best match score.
6464
+ */
6465
+ async search(library, version2, query, limit) {
6466
+ const normalizedVersion = (version2 ?? "").toLowerCase();
6467
+ const initialResults = await this.documentStore.findByContent(
6468
+ library,
6469
+ normalizedVersion,
6470
+ query,
6471
+ limit ?? 10
6472
+ );
6473
+ const relatedInfos = await Promise.all(
6474
+ initialResults.map(
6475
+ (doc) => this.getRelatedChunkIds(library, normalizedVersion, doc)
6476
+ )
6477
+ );
6478
+ const urlMap = this.groupAndPrepareFetch(relatedInfos);
6479
+ const results = [];
6480
+ for (const [url, { uniqueChunkIds, maxScore }] of urlMap.entries()) {
6481
+ const result = await this.finalizeResult(
6482
+ library,
6483
+ normalizedVersion,
6484
+ url,
6485
+ uniqueChunkIds,
6486
+ maxScore
6487
+ );
6488
+ results.push(result);
6489
+ }
6490
+ return results;
6491
+ }
6492
+ }
6493
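// Editor's sketch: `documentStore` stands for any constructed DocumentStore
// (hypothetical here). Each hit is expanded with its parent, up to two
// siblings on each side, and up to five child chunks, then merged per URL.
const retriever = new DocumentRetrieverService(documentStore);
const hits = await retriever.search("react", "18.2.0", "useEffect cleanup", 5);
// hits -> [{ url, content, score }, ...], one entry per source page, where
// score is the best match score among that page's hits.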
+ const MIGRATIONS_DIR = path.join(getProjectRoot(), "db", "migrations");
6494
+ const MIGRATIONS_TABLE = "_schema_migrations";
6495
+ function ensureMigrationsTable(db) {
6496
+ db.exec(`
6497
+ CREATE TABLE IF NOT EXISTS ${MIGRATIONS_TABLE} (
6498
+ id TEXT PRIMARY KEY,
6499
+ applied_at DATETIME DEFAULT CURRENT_TIMESTAMP
6500
+ );
6501
+ `);
6502
+ }
6503
+ function getAppliedMigrations(db) {
6504
+ const stmt = db.prepare(`SELECT id FROM ${MIGRATIONS_TABLE}`);
6505
+ const rows = stmt.all();
6506
+ return new Set(rows.map((row) => row.id));
6507
+ }
6508
+ async function applyMigrations(db) {
6509
+ try {
6510
+ db.pragma("journal_mode = OFF");
6511
+ db.pragma("synchronous = OFF");
6512
+ db.pragma("mmap_size = 268435456");
6513
+ db.pragma("cache_size = -64000");
6514
+ db.pragma("temp_store = MEMORY");
6515
+ logger.debug("Applied performance optimizations for migration");
6516
+ } catch (_error) {
6517
+ logger.warn("⚠️ Could not apply all performance optimizations for migration");
6518
+ }
6519
+ const overallTransaction = db.transaction(() => {
6520
+ logger.debug("Checking database migrations...");
6521
+ ensureMigrationsTable(db);
6522
+ const appliedMigrations = getAppliedMigrations(db);
6523
+ if (!fs.existsSync(MIGRATIONS_DIR)) {
6524
+ throw new StoreError("Migrations directory not found");
6525
+ }
6526
+ const migrationFiles = fs.readdirSync(MIGRATIONS_DIR).filter((file) => file.endsWith(".sql")).sort();
6527
+ const pendingMigrations = migrationFiles.filter(
6528
+ (filename) => !appliedMigrations.has(filename)
6529
+ );
6530
+ if (pendingMigrations.length > 0) {
6531
+ logger.info(`🔄 Applying ${pendingMigrations.length} database migration(s)...`);
6532
+ }
6533
+ let appliedCount = 0;
6534
+ for (const filename of pendingMigrations) {
6535
+ logger.debug(`Applying migration: ${filename}`);
6536
+ const filePath = path.join(MIGRATIONS_DIR, filename);
6537
+ const sql = fs.readFileSync(filePath, "utf8");
6538
+ try {
6539
+ db.exec(sql);
6540
+ const insertStmt = db.prepare(`INSERT INTO ${MIGRATIONS_TABLE} (id) VALUES (?)`);
6541
+ insertStmt.run(filename);
6542
+ logger.debug(`Applied migration: ${filename}`);
6543
+ appliedCount++;
6544
+ } catch (error) {
6545
+ logger.error(`❌ Failed to apply migration: ${filename} - ${error}`);
6546
+ throw new StoreError(`Migration failed: ${filename}`, error);
6547
+ }
6548
+ }
6549
+ if (appliedCount > 0) {
6550
+ logger.info(`✅ Successfully applied ${appliedCount} migration(s)`);
6551
+ } else {
6552
+ logger.debug("Database schema is up to date");
6553
+ }
6554
+ return appliedCount;
6555
+ });
6556
+ let retries = 0;
6557
+ let appliedMigrationsCount = 0;
6558
+ while (true) {
6559
+ try {
6560
+ appliedMigrationsCount = overallTransaction.immediate();
6561
+ logger.debug("Database migrations completed successfully");
6562
+ if (appliedMigrationsCount > 0) {
6563
+ try {
6564
+ logger.debug(
6565
+ `Running VACUUM after applying ${appliedMigrationsCount} migration(s)...`
6566
+ );
6567
+ db.exec("VACUUM");
6568
+ logger.debug("Database vacuum completed successfully");
6569
+ } catch (error) {
6570
+ logger.warn(`⚠️ Could not vacuum database after migrations: ${error}`);
6571
+ }
6572
+ } else {
6573
+ logger.debug("Skipping VACUUM - no migrations were applied");
6574
+ }
6575
+ break;
6576
+ } catch (error) {
6577
+ if (error?.code === "SQLITE_BUSY" && retries < MIGRATION_MAX_RETRIES) {
6578
+ retries++;
6579
+ logger.warn(
6580
+ `⚠️ Migrations busy (SQLITE_BUSY), retrying attempt ${retries}/${MIGRATION_MAX_RETRIES} in ${MIGRATION_RETRY_DELAY_MS}ms...`
6581
+ );
6582
+ await new Promise((resolve) => setTimeout(resolve, MIGRATION_RETRY_DELAY_MS));
6583
+ } else {
6584
+ if (error?.code === "SQLITE_BUSY") {
6585
+ logger.error(
6586
+ `❌ Migrations still busy after ${MIGRATION_MAX_RETRIES} retries. Giving up: ${error}`
6587
+ );
6588
+ }
6589
+ if (error instanceof StoreError) {
6590
+ throw error;
6591
+ }
6592
+ throw new StoreError("Failed during migration process", error);
6593
+ }
6594
+ }
6595
+ }
6596
+ try {
6597
+ db.pragma("journal_mode = WAL");
6598
+ db.pragma("wal_autocheckpoint = 1000");
6599
+ db.pragma("busy_timeout = 30000");
6600
+ db.pragma("foreign_keys = ON");
6601
+ db.pragma("synchronous = NORMAL");
6602
+ logger.debug(
6603
+ "Applied production database configuration (WAL mode, autocheckpoint, foreign keys, busy timeout)"
6604
+ );
6605
+ } catch (_error) {
6606
+ logger.warn("⚠️ Could not apply all production database settings");
6607
+ }
6608
+ }
6609
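// Editor's note: migrations are plain .sql files applied in lexicographic
// filename order and recorded by filename in _schema_migrations, e.g.
//   db/migrations/001-initial.sql
//   db/migrations/002-add-versions.sql   (names are illustrative)
// A repeated call to applyMigrations(db) finds no pending files and skips the
// post-migration VACUUM.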
+ class EmbeddingConfig {
6610
+ static instance = null;
6611
+ /**
6612
+ * Get the singleton instance of EmbeddingConfig.
6613
+ * Creates the instance if it doesn't exist.
6614
+ */
6615
+ static getInstance() {
6616
+ if (EmbeddingConfig.instance === null) {
6617
+ EmbeddingConfig.instance = new EmbeddingConfig();
6618
+ }
6619
+ return EmbeddingConfig.instance;
6620
+ }
6621
+ /**
6622
+ * Reset the singleton instance (useful for testing).
6623
+ */
6624
+ static resetInstance() {
6625
+ EmbeddingConfig.instance = null;
6626
+ }
6627
+ /**
6628
+ * Known dimensions for common embedding models.
6629
+ * This avoids expensive API calls for dimension detection in telemetry.
6630
+ *
6631
+ * Note: The "openai" provider also supports OpenAI-compatible APIs like:
6632
+ * - Ollama (local models)
6633
+ * - LMStudio (local models)
6634
+ * - Any service implementing OpenAI's embedding API
6635
+ */
6636
+ knownModelDimensions = {
6637
+ // OpenAI models (also works with Ollama, LMStudio, and other OpenAI-compatible APIs)
6638
+ "text-embedding-3-small": 1536,
6639
+ "text-embedding-3-large": 3072,
6640
+ "text-embedding-ada-002": 1536,
6641
+ // Google Vertex AI models
6642
+ "text-embedding-004": 768,
6643
+ "textembedding-gecko@003": 768,
6644
+ "textembedding-gecko@002": 768,
6645
+ "textembedding-gecko@001": 768,
6646
+ // Google Gemini models (with MRL support)
6647
+ "text-embedding-preview-0409": 768,
6648
+ "embedding-001": 768,
6649
+ // AWS Bedrock models
6650
+ // Amazon Titan models
6651
+ "amazon.titan-embed-text-v1": 1536,
6652
+ "amazon.titan-embed-text-v2:0": 1024,
6653
+ "amazon.titan-embed-image-v1": 1024,
6654
+ // Image embedding model
6655
+ // Cohere models
6656
+ "cohere.embed-english-v3": 1024,
6657
+ "cohere.embed-multilingual-v3": 1024,
6658
+ // SageMaker models (hosted on AWS SageMaker)
6659
+ "intfloat/multilingual-e5-large": 1024,
6660
+ // Additional AWS models that might be supported
6661
+ // Note: Some of these might be placeholders - verify dimensions before use
6662
+ // "amazon.nova-embed-multilingual-v1:0": 4096, // Commented out as noted in source
6663
+ // MTEB Leaderboard models (source: https://huggingface.co/spaces/mteb/leaderboard)
6664
+ // Top performing models from Massive Text Embedding Benchmark
6665
+ "sentence-transformers/all-MiniLM-L6-v2": 384,
6666
+ "gemini-embedding-001": 3072,
6667
+ "Qwen/Qwen3-Embedding-8B": 4096,
6668
+ "Qwen/Qwen3-Embedding-4B": 2560,
6669
+ "Qwen/Qwen3-Embedding-0.6B": 1024,
6670
+ "Linq-AI-Research/Linq-Embed-Mistral": 4096,
6671
+ "Alibaba-NLP/gte-Qwen2-7B-instruct": 3584,
6672
+ "intfloat/multilingual-e5-large-instruct": 1024,
6673
+ "Salesforce/SFR-Embedding-Mistral": 4096,
6674
+ "text-multilingual-embedding-002": 768,
6675
+ "GritLM/GritLM-7B": 4096,
6676
+ "GritLM/GritLM-8x7B": 4096,
6677
+ "intfloat/e5-mistral-7b-instruct": 4096,
6678
+ "Cohere/Cohere-embed-multilingual-v3.0": 1024,
6679
+ "Alibaba-NLP/gte-Qwen2-1.5B-instruct": 8960,
6680
+ "Lajavaness/bilingual-embedding-large": 1024,
6681
+ "Salesforce/SFR-Embedding-2_R": 4096,
6682
+ "NovaSearch/stella_en_1.5B_v5": 8960,
6683
+ "NovaSearch/jasper_en_vision_language_v1": 8960,
6684
+ "nvidia/NV-Embed-v2": 4096,
6685
+ "OrdalieTech/Solon-embeddings-large-0.1": 1024,
6686
+ "BAAI/bge-m3": 1024,
6687
+ "HIT-TMG/KaLM-embedding-multilingual-mini-v1": 896,
6688
+ "jinaai/jina-embeddings-v3": 1024,
6689
+ "Alibaba-NLP/gte-multilingual-base": 768,
6690
+ "Lajavaness/bilingual-embedding-base": 768,
6691
+ "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1": 896,
6692
+ "nvidia/NV-Embed-v1": 4096,
6693
+ "Cohere/Cohere-embed-multilingual-light-v3.0": 384,
6694
+ "manu/bge-m3-custom-fr": 1024,
6695
+ "Lajavaness/bilingual-embedding-small": 384,
6696
+ "Snowflake/snowflake-arctic-embed-l-v2.0": 1024,
6697
+ "intfloat/multilingual-e5-base": 768,
6698
+ "voyage-3-lite": 512,
6699
+ "voyage-3": 1024,
6700
+ "intfloat/multilingual-e5-small": 384,
6701
+ "Alibaba-NLP/gte-Qwen1.5-7B-instruct": 4096,
6702
+ "Snowflake/snowflake-arctic-embed-m-v2.0": 768,
6703
+ "deepvk/USER-bge-m3": 1024,
6704
+ "Cohere/Cohere-embed-english-v3.0": 1024,
6705
+ "Omartificial-Intelligence-Space/Arabic-labse-Matryoshka": 768,
6706
+ "ibm-granite/granite-embedding-278m-multilingual": 768,
6707
+ "NovaSearch/stella_en_400M_v5": 4096,
6708
+ "omarelshehy/arabic-english-sts-matryoshka": 1024,
6709
+ "sentence-transformers/paraphrase-multilingual-mpnet-base-v2": 768,
6710
+ "Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka": 768,
6711
+ "Haon-Chen/speed-embedding-7b-instruct": 4096,
6712
+ "sentence-transformers/LaBSE": 768,
6713
+ "WhereIsAI/UAE-Large-V1": 1024,
6714
+ "ibm-granite/granite-embedding-107m-multilingual": 384,
6715
+ "mixedbread-ai/mxbai-embed-large-v1": 1024,
6716
+ "intfloat/e5-large-v2": 1024,
6717
+ "avsolatorio/GIST-large-Embedding-v0": 1024,
6718
+ "sdadas/mmlw-e5-large": 1024,
6719
+ "nomic-ai/nomic-embed-text-v1": 768,
6720
+ "nomic-ai/nomic-embed-text-v1-ablated": 768,
6721
+ "intfloat/e5-base-v2": 768,
6722
+ "BAAI/bge-large-en-v1.5": 1024,
6723
+ "intfloat/e5-large": 1024,
6724
+ "Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet": 384,
6725
+ "Cohere/Cohere-embed-english-light-v3.0": 384,
6726
+ "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": 768,
6727
+ "Gameselo/STS-multilingual-mpnet-base-v2": 768,
6728
+ "thenlper/gte-large": 1024,
6729
+ "avsolatorio/GIST-Embedding-v0": 768,
6730
+ "nomic-ai/nomic-embed-text-v1-unsupervised": 768,
6731
+ "infgrad/stella-base-en-v2": 768,
6732
+ "avsolatorio/NoInstruct-small-Embedding-v0": 384,
6733
+ "dwzhu/e5-base-4k": 768,
6734
+ "sdadas/mmlw-e5-base": 768,
6735
+ "voyage-multilingual-2": 1024,
6736
+ "McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised": 4096,
6737
+ "BAAI/bge-base-en-v1.5": 768,
6738
+ "avsolatorio/GIST-small-Embedding-v0": 384,
6739
+ "sdadas/mmlw-roberta-large": 1024,
6740
+ "nomic-ai/nomic-embed-text-v1.5": 768,
6741
+ "minishlab/potion-multilingual-128M": 256,
6742
+ "shibing624/text2vec-base-multilingual": 384,
6743
+ "thenlper/gte-base": 768,
6744
+ "intfloat/e5-small-v2": 384,
6745
+ "intfloat/e5-base": 768,
6746
+ "sentence-transformers/static-similarity-mrl-multilingual-v1": 1024,
6747
+ "manu/sentence_croissant_alpha_v0.3": 2048,
6748
+ "BAAI/bge-small-en-v1.5": 512,
6749
+ "thenlper/gte-small": 384,
6750
+ "sdadas/mmlw-e5-small": 384,
6751
+ "manu/sentence_croissant_alpha_v0.4": 2048,
6752
+ "manu/sentence_croissant_alpha_v0.2": 2048,
6753
+ "abhinand/MedEmbed-small-v0.1": 384,
6754
+ "ibm-granite/granite-embedding-125m-english": 768,
6755
+ "intfloat/e5-small": 384,
6756
+ "voyage-large-2-instruct": 1024,
6757
+ "sdadas/mmlw-roberta-base": 768,
6758
+ "Snowflake/snowflake-arctic-embed-l": 1024,
6759
+ "Mihaiii/Ivysaur": 384,
6760
+ "Snowflake/snowflake-arctic-embed-m-long": 768,
6761
+ "bigscience/sgpt-bloom-7b1-msmarco": 4096,
6762
+ "avsolatorio/GIST-all-MiniLM-L6-v2": 384,
6763
+ "sergeyzh/LaBSE-ru-turbo": 768,
6764
+ "sentence-transformers/all-mpnet-base-v2": 768,
6765
+ "Snowflake/snowflake-arctic-embed-m": 768,
6766
+ "Snowflake/snowflake-arctic-embed-s": 384,
6767
+ "sentence-transformers/all-MiniLM-L12-v2": 384,
6768
+ "Mihaiii/gte-micro-v4": 384,
6769
+ "Snowflake/snowflake-arctic-embed-m-v1.5": 768,
6770
+ "cointegrated/LaBSE-en-ru": 768,
6771
+ "Mihaiii/Bulbasaur": 384,
6772
+ "ibm-granite/granite-embedding-30m-english": 384,
6773
+ "deepfile/embedder-100p": 768,
6774
+ "Jaume/gemma-2b-embeddings": 2048,
6775
+ "OrlikB/KartonBERT-USE-base-v1": 768,
6776
+ "izhx/udever-bloom-7b1": 4096,
6777
+ "izhx/udever-bloom-1b1": 1024,
6778
+ "brahmairesearch/slx-v0.1": 384,
6779
+ "Mihaiii/Wartortle": 384,
6780
+ "izhx/udever-bloom-3b": 2048,
6781
+ "deepvk/USER-base": 768,
6782
+ "ai-forever/ru-en-RoSBERTa": 1024,
6783
+ "McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-unsup-simcse": 4096,
6784
+ "Mihaiii/Venusaur": 384,
6785
+ "Snowflake/snowflake-arctic-embed-xs": 384,
6786
+ "jinaai/jina-embedding-b-en-v1": 768,
6787
+ "Mihaiii/gte-micro": 384,
6788
+ "aari1995/German_Semantic_STS_V2": 1024,
6789
+ "Mihaiii/Squirtle": 384,
6790
+ "OrlikB/st-polish-kartonberta-base-alpha-v1": 768,
6791
+ "sergeyzh/rubert-tiny-turbo": 312,
6792
+ "minishlab/potion-base-8M": 256,
6793
+ "minishlab/M2V_base_glove_subword": 256,
6794
+ "jinaai/jina-embedding-s-en-v1": 512,
6795
+ "minishlab/potion-base-4M": 128,
6796
+ "minishlab/M2V_base_output": 256,
6797
+ "DeepPavlov/rubert-base-cased-sentence": 768,
6798
+ "jinaai/jina-embeddings-v2-small-en": 512,
6799
+ "cointegrated/rubert-tiny2": 312,
6800
+ "minishlab/M2V_base_glove": 256,
6801
+ "cointegrated/rubert-tiny": 312,
6802
+ "silma-ai/silma-embeddding-matryoshka-v0.1": 768,
6803
+ "DeepPavlov/rubert-base-cased": 768,
6804
+ "Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet": 768,
6805
+ "izhx/udever-bloom-560m": 1024,
6806
+ "minishlab/potion-base-2M": 64,
6807
+ "DeepPavlov/distilrubert-small-cased-conversational": 768,
6808
+ "consciousAI/cai-lunaris-text-embeddings": 1024,
6809
+ "deepvk/deberta-v1-base": 768,
6810
+ "Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka": 768,
6811
+ "Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka": 768,
6812
+ "ai-forever/sbert_large_mt_nlu_ru": 1024,
6813
+ "ai-forever/sbert_large_nlu_ru": 1024,
6814
+ "malenia1/ternary-weight-embedding": 1024,
6815
+ "jinaai/jina-embeddings-v2-base-en": 768,
6816
+ "VPLabs/SearchMap_Preview": 4096,
6817
+ "Hum-Works/lodestone-base-4096-v1": 768,
6818
+ "jinaai/jina-embeddings-v4": 2048
6819
+ };
6820
+ /**
6821
+ * Lowercase lookup map for case-insensitive model dimension queries.
6822
+ * Built in the constructor from knownModelDimensions to ensure consistency.
6823
+ */
6824
+ modelLookup;
6825
+ constructor() {
6826
+ this.modelLookup = /* @__PURE__ */ new Map();
6827
+ for (const [model, dimensions] of Object.entries(this.knownModelDimensions)) {
6828
+ this.modelLookup.set(model.toLowerCase(), dimensions);
6829
+ }
6830
+ }
6831
+ /**
6832
+ * Parse embedding model configuration from environment variables.
6833
+ * This is a synchronous operation that extracts provider, model, and known dimensions.
6834
+ *
6835
+ * Supports various providers:
6836
+ * - openai: OpenAI models and OpenAI-compatible APIs (Ollama, LMStudio, etc.)
6837
+ * - vertex: Google Cloud Vertex AI
6838
+ * - gemini: Google Generative AI
6839
+ * - aws: AWS Bedrock models
6840
+ * - microsoft: Azure OpenAI
6841
+ * - sagemaker: AWS SageMaker hosted models
6842
+ *
6843
+ * @param modelSpec Optional model specification, defaults to DOCS_MCP_EMBEDDING_MODEL env var
6844
+ * @returns Parsed embedding model configuration
6845
+ */
6846
+ parse(modelSpec) {
6847
+ const spec = modelSpec || process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
6848
+ const colonIndex = spec.indexOf(":");
6849
+ let provider;
6850
+ let model;
6851
+ if (colonIndex === -1) {
6852
+ provider = "openai";
6853
+ model = spec;
6854
+ } else {
6855
+ provider = spec.substring(0, colonIndex);
6856
+ model = spec.substring(colonIndex + 1);
6857
+ }
6858
+ const dimensions = this.modelLookup?.get(model.toLowerCase()) || null;
6859
+ return {
6860
+ provider,
6861
+ model,
6862
+ dimensions,
6863
+ modelSpec: spec
6864
+ };
6865
+ }
6866
+ /**
6867
+ * Get the known dimensions for a specific model.
6868
+ * Returns null if the model dimensions are not known.
6869
+ * Uses case-insensitive lookup.
6870
+ *
6871
+ * @param model The model name (e.g., "text-embedding-3-small")
6872
+ * @returns Known dimensions or null
6873
+ */
6874
+ getKnownDimensions(model) {
6875
+ return this.modelLookup?.get(model.toLowerCase()) || null;
6876
+ }
6877
+ /**
6878
+ * Add or update known dimensions for a model.
6879
+ * This can be used to cache discovered dimensions.
6880
+ * Stores both original case and lowercase for consistent lookup.
6881
+ *
6882
+ * @param model The model name
6883
+ * @param dimensions The dimensions to cache
6884
+ */
6885
+ setKnownDimensions(model, dimensions) {
6886
+ this.knownModelDimensions[model] = dimensions;
6887
+ if (this.modelLookup) {
6888
+ this.modelLookup.set(model.toLowerCase(), dimensions);
6889
+ }
6890
+ }
6891
+ /**
6892
+ * Static method to parse embedding model configuration using the singleton instance.
6893
+ * This maintains backward compatibility while using the class-based approach.
6894
+ */
6895
+ static parseEmbeddingConfig(modelSpec) {
6896
+ return EmbeddingConfig.getInstance().parse(modelSpec);
6897
+ }
6898
+ /**
6899
+ * Static method to get known model dimensions using the singleton instance.
6900
+ * This maintains backward compatibility while using the class-based approach.
6901
+ */
6902
+ static getKnownModelDimensions(model) {
6903
+ return EmbeddingConfig.getInstance().getKnownDimensions(model);
6904
+ }
6905
+ /**
6906
+ * Static method to set known model dimensions using the singleton instance.
6907
+ * This maintains backward compatibility while using the class-based approach.
6908
+ */
6909
+ static setKnownModelDimensions(model, dimensions) {
6910
+ EmbeddingConfig.getInstance().setKnownDimensions(model, dimensions);
6911
+ }
6912
+ }
6913
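// Editor's sketch: parsing "provider:model" specs with the config above.
const vertexCfg = EmbeddingConfig.parseEmbeddingConfig("vertex:text-embedding-004");
// -> { provider: "vertex", model: "text-embedding-004", dimensions: 768,
//      modelSpec: "vertex:text-embedding-004" }
const bareCfg = EmbeddingConfig.parseEmbeddingConfig("text-embedding-3-large");
// A bare model name defaults to the "openai" provider; dimensions resolve to 3072.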
+ class DocumentStore {
6914
+ db;
6915
+ embeddings;
6916
+ dbDimension = VECTOR_DIMENSION;
6917
+ modelDimension;
6918
+ embeddingConfig;
6919
+ statements;
6920
+ /**
6921
+ * Calculates Reciprocal Rank Fusion score for a result
6922
+ */
6923
+ calculateRRF(vecRank, ftsRank, k = 60) {
6924
+ let rrf = 0;
6925
+ if (vecRank !== void 0) {
6926
+ rrf += 1 / (k + vecRank);
6927
+ }
6928
+ if (ftsRank !== void 0) {
6929
+ rrf += 1 / (k + ftsRank);
6930
+ }
6931
+ return rrf;
6932
+ }
6933
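// Editor's worked example for the fusion above (k = 60): a chunk ranked 1st by
// vector search and 3rd by full-text search scores
//   1/(60+1) + 1/(60+3) ≈ 0.0164 + 0.0159 ≈ 0.0323,
// while a chunk found only by FTS at rank 1 scores 1/61 ≈ 0.0164.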
+ /**
6934
+ * Assigns ranks to search results based on their scores
6935
+ */
6936
+ assignRanks(results) {
6937
+ const vecRanks = /* @__PURE__ */ new Map();
6938
+ const ftsRanks = /* @__PURE__ */ new Map();
6939
+ results.filter((r) => r.vec_score !== void 0).sort((a, b) => (b.vec_score ?? 0) - (a.vec_score ?? 0)).forEach((result, index) => {
6940
+ vecRanks.set(Number(result.id), index + 1);
6941
+ });
6942
+ results.filter((r) => r.fts_score !== void 0).sort((a, b) => (b.fts_score ?? 0) - (a.fts_score ?? 0)).forEach((result, index) => {
6943
+ ftsRanks.set(Number(result.id), index + 1);
6944
+ });
6945
+ return results.map((result) => ({
6946
+ ...result,
6947
+ vec_rank: vecRanks.get(Number(result.id)),
6948
+ fts_rank: ftsRanks.get(Number(result.id)),
6949
+ rrf_score: this.calculateRRF(
6950
+ vecRanks.get(Number(result.id)),
6951
+ ftsRanks.get(Number(result.id))
6952
+ )
6953
+ }));
6954
+ }
6955
+ constructor(dbPath, embeddingConfig) {
6956
+ if (!dbPath) {
6957
+ throw new StoreError("Missing required database path");
6958
+ }
6959
+ this.db = new Database(dbPath);
6960
+ this.embeddingConfig = embeddingConfig;
6961
+ }
6962
+ /**
6963
+ * Sets up prepared statements for database queries
6964
+ */
6965
+ prepareStatements() {
6966
+ const statements = {
6967
+ getById: this.db.prepare("SELECT * FROM documents WHERE id = ?"),
6968
+ insertDocument: this.db.prepare(
6969
+ "INSERT INTO documents (library_id, version_id, url, content, metadata, sort_order, indexed_at) VALUES (?, ?, ?, ?, ?, ?, ?)"
6970
+ ),
6971
+ insertEmbedding: this.db.prepare(
6972
+ "INSERT INTO documents_vec (rowid, library_id, version_id, embedding) VALUES (?, ?, ?, ?)"
6973
+ ),
6974
+ insertLibrary: this.db.prepare(
6975
+ "INSERT INTO libraries (name) VALUES (?) ON CONFLICT(name) DO NOTHING"
6976
+ ),
6977
+ getLibraryIdByName: this.db.prepare(
6978
+ "SELECT id FROM libraries WHERE name = ?"
6979
+ ),
6980
+ // New version-related statements
6981
+ insertVersion: this.db.prepare(
6982
+ "INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING"
6983
+ ),
6984
+ resolveVersionId: this.db.prepare(
6985
+ "SELECT id FROM versions WHERE library_id = ? AND name IS ?"
6986
+ ),
6987
+ getVersionById: this.db.prepare("SELECT * FROM versions WHERE id = ?"),
6988
+ queryVersionsByLibraryId: this.db.prepare(
6989
+ "SELECT * FROM versions WHERE library_id = ? ORDER BY name"
6990
+ ),
6991
+ deleteLibraryDocuments: this.db.prepare(
6992
+ `DELETE FROM documents
6993
+ WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
6994
+ AND version_id = (
6995
+ SELECT v.id FROM versions v
6996
+ WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
6997
+ AND COALESCE(v.name, '') = COALESCE(?, '')
6998
+ )`
6999
+ ),
7000
+ deleteDocuments: this.db.prepare(
7001
+ `DELETE FROM documents
7002
+ WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
7003
+ AND version_id = (
7004
+ SELECT v.id FROM versions v
7005
+ WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
7006
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7007
+ )`
7008
+ ),
7009
+ deleteDocumentsByUrl: this.db.prepare(
7010
+ `DELETE FROM documents
7011
+ WHERE url = ?
7012
+ AND library_id = (SELECT id FROM libraries WHERE name = ?)
7013
+ AND version_id = (
7014
+ SELECT v.id FROM versions v
7015
+ WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
7016
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7017
+ )`
7018
+ ),
7019
+ getDocumentBySort: this.db.prepare(
7020
+ `SELECT d.id
7021
+ FROM documents d
7022
+ JOIN versions v ON d.version_id = v.id
7023
+ JOIN libraries l ON v.library_id = l.id
7024
+ WHERE l.name = ?
7025
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7026
+ LIMIT 1`
7027
+ ),
7028
+ queryVersions: this.db.prepare(
7029
+ `SELECT DISTINCT v.name
7030
+ FROM versions v
7031
+ JOIN libraries l ON v.library_id = l.id
7032
+ WHERE l.name = ?
7033
+ ORDER BY v.name`
7034
+ ),
7035
+ checkExists: this.db.prepare(
7036
+ `SELECT d.id FROM documents d
7037
+ JOIN versions v ON d.version_id = v.id
7038
+ JOIN libraries l ON v.library_id = l.id
7039
+ WHERE l.name = ?
7040
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7041
+ LIMIT 1`
7042
+ ),
7043
+ // Library/version aggregation including versions without documents and status/progress fields
7044
+ queryLibraryVersions: this.db.prepare(
7045
+ `SELECT
7046
+ l.name as library,
7047
+ COALESCE(v.name, '') as version,
7048
+ v.id as versionId,
7049
+ v.status as status,
7050
+ v.progress_pages as progressPages,
7051
+ v.progress_max_pages as progressMaxPages,
7052
+ v.source_url as sourceUrl,
7053
+ MIN(d.indexed_at) as indexedAt,
7054
+ COUNT(d.id) as documentCount,
7055
+ COUNT(DISTINCT d.url) as uniqueUrlCount
7056
+ FROM versions v
7057
+ JOIN libraries l ON v.library_id = l.id
7058
+ LEFT JOIN documents d ON d.version_id = v.id
7059
+ GROUP BY v.id
7060
+ ORDER BY l.name, version`
7061
+ ),
7062
+ getChildChunks: this.db.prepare(`
7063
+ SELECT d.* FROM documents d
7064
+ JOIN versions v ON d.version_id = v.id
7065
+ JOIN libraries l ON v.library_id = l.id
7066
+ WHERE l.name = ?
7067
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7068
+ AND d.url = ?
7069
+ AND json_array_length(json_extract(d.metadata, '$.path')) = ?
7070
+ AND json_extract(d.metadata, '$.path') LIKE ? || '%'
7071
+ AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
7072
+ ORDER BY d.sort_order
7073
+ LIMIT ?
7074
+ `),
7075
+ getPrecedingSiblings: this.db.prepare(`
7076
+ SELECT d.* FROM documents d
7077
+ JOIN versions v ON d.version_id = v.id
7078
+ JOIN libraries l ON v.library_id = l.id
7079
+ WHERE l.name = ?
7080
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7081
+ AND d.url = ?
7082
+ AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
7083
+ AND json_extract(d.metadata, '$.path') = ?
7084
+ ORDER BY d.sort_order DESC
7085
+ LIMIT ?
7086
+ `),
7087
+ getSubsequentSiblings: this.db.prepare(`
7088
+ SELECT d.* FROM documents d
7089
+ JOIN versions v ON d.version_id = v.id
7090
+ JOIN libraries l ON v.library_id = l.id
7091
+ WHERE l.name = ?
7092
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7093
+ AND d.url = ?
7094
+ AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
7095
+ AND json_extract(d.metadata, '$.path') = ?
7096
+ ORDER BY d.sort_order
7097
+ LIMIT ?
7098
+ `),
7099
+ getParentChunk: this.db.prepare(`
7100
+ SELECT d.* FROM documents d
7101
+ JOIN versions v ON d.version_id = v.id
7102
+ JOIN libraries l ON v.library_id = l.id
7103
+ WHERE l.name = ?
7104
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7105
+ AND d.url = ?
7106
+ AND json_extract(d.metadata, '$.path') = ?
7107
+ AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
7108
+ ORDER BY d.sort_order DESC
7109
+ LIMIT 1
7110
+ `),
7111
+ // Status tracking statements
7112
+ updateVersionStatus: this.db.prepare(
7113
+ "UPDATE versions SET status = ?, error_message = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
7114
+ ),
7115
+ updateVersionProgress: this.db.prepare(
7116
+ "UPDATE versions SET progress_pages = ?, progress_max_pages = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
7117
+ ),
7118
+ getVersionsByStatus: this.db.prepare(
7119
+ "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.status IN (SELECT value FROM json_each(?))"
7120
+ ),
7121
+ // Scraper options statements
7122
+ updateVersionScraperOptions: this.db.prepare(
7123
+ "UPDATE versions SET source_url = ?, scraper_options = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
7124
+ ),
7125
+ getVersionWithOptions: this.db.prepare(
7126
+ "SELECT * FROM versions WHERE id = ?"
7127
+ ),
7128
+ getVersionsBySourceUrl: this.db.prepare(
7129
+ "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.source_url = ? ORDER BY v.created_at DESC"
7130
+ ),
7131
+ // Version and library deletion statements
7132
+ deleteVersionById: this.db.prepare("DELETE FROM versions WHERE id = ?"),
7133
+ deleteLibraryById: this.db.prepare("DELETE FROM libraries WHERE id = ?"),
7134
+ countVersionsByLibraryId: this.db.prepare(
7135
+ "SELECT COUNT(*) as count FROM versions WHERE library_id = ?"
7136
+ ),
7137
+ getVersionId: this.db.prepare(
7138
+ `SELECT v.id, v.library_id FROM versions v
7139
+ JOIN libraries l ON v.library_id = l.id
7140
+ WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')`
7141
+ )
7142
+ };
7143
+ this.statements = statements;
7144
+ }
7145
+ /**
7146
+ * Pads a vector to the fixed database dimension by appending zeros.
7147
+ * Throws an error if the input vector is longer than the database dimension.
7148
+ */
7149
+ padVector(vector) {
7150
+ if (vector.length > this.dbDimension) {
7151
+ throw new Error(
7152
+ `Vector dimension ${vector.length} exceeds database dimension ${this.dbDimension}`
7153
+ );
7154
+ }
7155
+ if (vector.length === this.dbDimension) {
7156
+ return vector;
7157
+ }
7158
+ return [...vector, ...new Array(this.dbDimension - vector.length).fill(0)];
7159
+ }
7160
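// Editor's note: a standalone sketch of the zero-padding above (1536 mirrors
// VECTOR_DIMENSION; the 8-dimension database used below is illustrative).
const pad = (vector, dbDimension) => {
  if (vector.length > dbDimension) throw new Error("vector exceeds database dimension");
  if (vector.length === dbDimension) return vector;
  return [...vector, ...new Array(dbDimension - vector.length).fill(0)];
};
console.log(pad([0.1, 0.2, 0.3], 8)); // [0.1, 0.2, 0.3, 0, 0, 0, 0, 0]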
+ /**
7161
+ * Initialize the embeddings client using either provided config or environment variables.
7162
+ * If no embedding config is provided (null), embeddings will not be initialized.
7163
+ * This allows DocumentStore to be used without embeddings for operations that don't need them.
7164
+ *
7165
+ * Environment variables per provider:
7166
+ * - openai: OPENAI_API_KEY (and optionally OPENAI_API_BASE, OPENAI_ORG_ID)
7167
+ * - vertex: GOOGLE_APPLICATION_CREDENTIALS (path to service account JSON)
7168
+ * - gemini: GOOGLE_API_KEY
7169
+ * - aws: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION
7170
+ * - microsoft: Azure OpenAI credentials (AZURE_OPENAI_API_*)
7171
+ */
7172
+ async initializeEmbeddings() {
7173
+ if (this.embeddingConfig === null) {
7174
+ logger.debug("Embedding initialization skipped (explicitly disabled)");
7175
+ return;
7176
+ }
7177
+ const config = this.embeddingConfig || EmbeddingConfig.parseEmbeddingConfig();
7178
+ try {
7179
+ this.embeddings = createEmbeddingModel(config.modelSpec);
7180
+ if (config.dimensions !== null) {
7181
+ this.modelDimension = config.dimensions;
7182
+ } else {
7183
+ const testVector = await this.embeddings.embedQuery("test");
7184
+ this.modelDimension = testVector.length;
7185
+ EmbeddingConfig.setKnownModelDimensions(config.model, this.modelDimension);
7186
+ }
7187
+ if (this.modelDimension > this.dbDimension) {
7188
+ throw new DimensionError(config.modelSpec, this.modelDimension, this.dbDimension);
7189
+ }
7190
+ logger.debug(
7191
+ `Embeddings initialized: ${config.provider}:${config.model} (${this.modelDimension}d)`
7192
+ );
7193
+ } catch (error) {
7194
+ if (error instanceof Error) {
7195
+ if (error.message.includes("does not exist") || error.message.includes("MODEL_NOT_FOUND")) {
7196
+ throw new ModelConfigurationError(
7197
+ `❌ Invalid embedding model: ${config.model}
7198
+ The model "${config.model}" is not available or you don't have access to it.
7199
+ See README.md for supported models or run with --help for more details.`
7200
+ );
7201
+ }
7202
+ if (error.message.includes("API key") || error.message.includes("401") || error.message.includes("authentication")) {
7203
+ throw new ModelConfigurationError(
7204
+ `❌ Authentication failed for ${config.provider} embedding provider
7205
+ Please check your API key configuration.
7206
+ See README.md for configuration options or run with --help for more details.`
7207
+ );
7208
+ }
7209
+ }
7210
+ throw error;
7211
+ }
7212
+ }
7213
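// Editor's note: the minimal environment for the default OpenAI path of
// initializeEmbeddings, per the provider list documented above (the key is a
// placeholder; the commented variables are optional overrides).
process.env.OPENAI_API_KEY = "sk-...";
// process.env.OPENAI_API_BASE = "https://my-proxy.example.com/v1";
// process.env.OPENAI_ORG_ID = "org-...";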
+ /**
7214
+ * Escapes a query string for use with SQLite FTS5 MATCH operator.
7215
+ * Wraps the query in double quotes and escapes internal double quotes.
7216
+ */
7217
+ escapeFtsQuery(query) {
7218
+ const escapedQuotes = query.replace(/"/g, '""');
7219
+ return `"${escapedQuotes}"`;
7220
+ }
7221
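// Editor's note: what the FTS5 escaping above produces (illustrative). Wrapping
// the query in double quotes makes FTS5 treat it as a phrase, so operator
// keywords and embedded quotes lose their special meaning.
const escapeFts = (query) => `"${query.replace(/"/g, '""')}"`;
console.log(escapeFts('vector "hybrid" search')); // "vector ""hybrid"" search"
console.log(escapeFts("NEAR OR NOT")); // "NEAR OR NOT" (no longer parsed as operators)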
+ /**
7222
+ * Initializes database connection and ensures readiness
7223
+ */
7224
+ async initialize() {
7225
+ try {
7226
+ sqliteVec.load(this.db);
7227
+ applyMigrations(this.db);
7228
+ this.prepareStatements();
7229
+ await this.initializeEmbeddings();
7230
+ } catch (error) {
7231
+ if (error instanceof StoreError || error instanceof ModelConfigurationError || error instanceof UnsupportedProviderError) {
7232
+ throw error;
7233
+ }
7234
+ throw new ConnectionError("Failed to initialize database connection", error);
7235
+ }
7236
+ }
7237
+ /**
7238
+ * Gracefully closes database connections
7239
+ */
7240
+ async shutdown() {
7241
+ this.db.close();
7242
+ }
7243
+ /**
7244
+ * Resolves a library name and version string to library_id and version_id.
7245
+ * Creates library and version records if they don't exist.
7246
+ */
7247
+ async resolveLibraryAndVersionIds(library, version2) {
7248
+ const normalizedLibrary = library.toLowerCase();
7249
+ const normalizedVersion = denormalizeVersionName(version2.toLowerCase());
7250
+ this.statements.insertLibrary.run(normalizedLibrary);
7251
+ const libraryIdRow = this.statements.getLibraryIdByName.get(normalizedLibrary);
7252
+ if (!libraryIdRow || typeof libraryIdRow.id !== "number") {
7253
+ throw new StoreError(`Failed to resolve library_id for library: ${library}`);
7254
+ }
7255
+ const libraryId = libraryIdRow.id;
7256
+ this.statements.insertVersion.run(libraryId, normalizedVersion);
7257
+ const versionIdRow = this.statements.resolveVersionId.get(
7258
+ libraryId,
7259
+ normalizedVersion === null ? "" : normalizedVersion
7260
+ );
7261
+ if (!versionIdRow || typeof versionIdRow.id !== "number") {
7262
+ throw new StoreError(
7263
+ `Failed to resolve version_id for library: ${library}, version: ${version2}`
7264
+ );
7265
+ }
7266
+ return { libraryId, versionId: versionIdRow.id };
7267
+ }
7268
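// Editor's note: the create-or-fetch pattern used above, isolated against an
// in-memory database (schema simplified; better-sqlite3 as imported by this bundle).
import Database from "better-sqlite3";
const db = new Database(":memory:");
db.exec("CREATE TABLE libraries (id INTEGER PRIMARY KEY, name TEXT UNIQUE)");
const ins = db.prepare("INSERT INTO libraries (name) VALUES (?) ON CONFLICT(name) DO NOTHING");
const sel = db.prepare("SELECT id FROM libraries WHERE name = ?");
ins.run("react");
ins.run("react"); // conflict: no-op, so the id below stays stable
console.log(sel.get("react").id); // 1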
+ /**
7269
+ * Retrieves all unique versions for a specific library
7270
+ */
7271
+ async queryUniqueVersions(library) {
7272
+ try {
7273
+ const rows = this.statements.queryVersions.all(library.toLowerCase());
7274
+ return rows.map((row) => normalizeVersionName(row.name));
7275
+ } catch (error) {
7276
+ throw new ConnectionError("Failed to query versions", error);
7277
+ }
7278
+ }
7279
+ /**
7280
+ * Updates the status of a version record in the database.
7281
+ * @param versionId The version ID to update
7282
+ * @param status The new status to set
7283
+ * @param errorMessage Optional error message for failed statuses
7284
+ */
7285
+ async updateVersionStatus(versionId, status, errorMessage) {
7286
+ try {
7287
+ this.statements.updateVersionStatus.run(status, errorMessage ?? null, versionId);
7288
+ } catch (error) {
7289
+ throw new StoreError(`Failed to update version status: ${error}`);
7290
+ }
7291
+ }
7292
+ /**
7293
+ * Updates the progress counters for a version being indexed.
7294
+ * @param versionId The version ID to update
7295
+ * @param pages Current number of pages processed
7296
+ * @param maxPages Total number of pages to process
7297
+ */
7298
+ async updateVersionProgress(versionId, pages, maxPages) {
7299
+ try {
7300
+ this.statements.updateVersionProgress.run(pages, maxPages, versionId);
7301
+ } catch (error) {
7302
+ throw new StoreError(`Failed to update version progress: ${error}`);
7303
+ }
7304
+ }
7305
+ /**
7306
+ * Retrieves versions by their status.
7307
+ * @param statuses Array of statuses to filter by
7308
+ * @returns Array of version records matching the statuses
7309
+ */
7310
+ async getVersionsByStatus(statuses) {
7311
+ try {
7312
+ const statusJson = JSON.stringify(statuses);
7313
+ const rows = this.statements.getVersionsByStatus.all(
7314
+ statusJson
7315
+ );
7316
+ return rows;
7317
+ } catch (error) {
7318
+ throw new StoreError(`Failed to get versions by status: ${error}`);
7319
+ }
7320
+ }
7321
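// Editor's note: a sketch of binding a JS array to a single SQLite parameter
// via json_each, as getVersionsByStatus does above (in-memory schema simplified).
import Database from "better-sqlite3";
const db = new Database(":memory:");
db.exec("CREATE TABLE versions (id INTEGER PRIMARY KEY, status TEXT)");
db.exec("INSERT INTO versions (status) VALUES ('queued'), ('completed'), ('running')");
const byStatus = db.prepare(
  "SELECT id, status FROM versions WHERE status IN (SELECT value FROM json_each(?))"
);
console.log(byStatus.all(JSON.stringify(["queued", "running"])));
// [ { id: 1, status: 'queued' }, { id: 3, status: 'running' } ]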
+ /**
7322
+ * Stores scraper options for a version to enable reproducible indexing.
7323
+ * @param versionId The version ID to update
7324
+ * @param options Complete scraper options used for indexing
7325
+ */
7326
+ async storeScraperOptions(versionId, options) {
7327
+ try {
7328
+ const { url: source_url, library, version: version2, signal, ...scraper_options } = options;
7329
+ const optionsJson = JSON.stringify(scraper_options);
7330
+ this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId);
7331
+ } catch (error) {
7332
+ throw new StoreError(`Failed to store scraper options: ${error}`);
7333
+ }
7334
+ }
7335
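// Editor's note: the rest-destructure above strips fields that must not be
// serialized before JSON.stringify; a sketch with hypothetical option values.
const opts = {
  url: "https://example.com/docs", library: "react", version: "18.2.0",
  signal: new AbortController().signal, maxPages: 100, maxDepth: 3,
};
const { url: source_url, library, version, signal, ...scraper_options } = opts;
console.log(source_url); // "https://example.com/docs"
console.log(JSON.stringify(scraper_options)); // {"maxPages":100,"maxDepth":3}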
+ /**
7336
+ * Retrieves stored scraping configuration (source URL and options) for a version.
7337
+ * Returns null when no source URL is recorded (not re-indexable).
7338
+ */
7339
+ async getScraperOptions(versionId) {
7340
+ try {
7341
+ const row = this.statements.getVersionWithOptions.get(versionId);
7342
+ if (!row?.source_url) {
7343
+ return null;
7344
+ }
7345
+ let parsed = {};
7346
+ if (row.scraper_options) {
7347
+ try {
7348
+ parsed = JSON.parse(row.scraper_options);
7349
+ } catch (e) {
7350
+ logger.warn(`⚠️ Invalid scraper_options JSON for version ${versionId}: ${e}`);
7351
+ parsed = {};
7352
+ }
7353
+ }
7354
+ return { sourceUrl: row.source_url, options: parsed };
7355
+ } catch (error) {
7356
+ throw new StoreError(`Failed to get scraper options: ${error}`);
7357
+ }
7358
+ }
7359
+ /**
7360
+ * Finds versions that were indexed from the same source URL.
7361
+ * Useful for finding similar configurations or detecting duplicates.
7362
+ * @param url Source URL to search for
7363
+ * @returns Array of versions with the same source URL
7364
+ */
7365
+ async findVersionsBySourceUrl(url) {
7366
+ try {
7367
+ const rows = this.statements.getVersionsBySourceUrl.all(
7368
+ url
7369
+ );
7370
+ return rows;
7371
+ } catch (error) {
7372
+ throw new StoreError(`Failed to find versions by source URL: ${error}`);
7373
+ }
7374
+ }
7375
+ /**
7376
+ * Verifies existence of documents for a specific library version
7377
+ */
7378
+ async checkDocumentExists(library, version2) {
7379
+ try {
7380
+ const normalizedVersion = version2.toLowerCase();
7381
+ const result = this.statements.checkExists.get(
7382
+ library.toLowerCase(),
7383
+ normalizedVersion
7384
+ );
7385
+ return result !== void 0;
7386
+ } catch (error) {
7387
+ throw new ConnectionError("Failed to check document existence", error);
7388
+ }
7389
+ }
7390
+ /**
7391
+ * Retrieves a mapping of all libraries to their available versions with details.
7392
+ */
7393
+ async queryLibraryVersions() {
7394
+ try {
7395
+ const rows = this.statements.queryLibraryVersions.all();
7396
+ const libraryMap = /* @__PURE__ */ new Map();
7397
+ for (const row of rows) {
7398
+ const library = row.library;
7399
+ if (!libraryMap.has(library)) {
7400
+ libraryMap.set(library, []);
7401
+ }
7402
+ const indexedAtISO = row.indexedAt ? new Date(row.indexedAt).toISOString() : null;
7403
+ libraryMap.get(library)?.push({
7404
+ version: row.version,
7405
+ versionId: row.versionId,
7406
+ // Preserve raw string status here; DocumentManagementService will cast to VersionStatus
7407
+ status: row.status,
7408
+ progressPages: row.progressPages,
7409
+ progressMaxPages: row.progressMaxPages,
7410
+ sourceUrl: row.sourceUrl,
7411
+ documentCount: row.documentCount,
7412
+ uniqueUrlCount: row.uniqueUrlCount,
7413
+ indexedAt: indexedAtISO
7414
+ });
7415
+ }
7416
+ for (const versions of libraryMap.values()) {
7417
+ versions.sort((a, b) => {
7418
+ if (a.version === "" && b.version !== "") {
7419
+ return -1;
7420
+ }
7421
+ if (a.version !== "" && b.version === "") {
7422
+ return 1;
7423
+ }
7424
+ if (a.version === "" && b.version === "") {
7425
+ return 0;
7426
+ }
7427
+ try {
7428
+ return semver__default.compare(a.version, b.version);
7429
+ } catch (_error) {
7430
+ return a.version.localeCompare(b.version);
7431
+ }
7432
+ });
7433
+ }
7434
+ return libraryMap;
7435
+ } catch (error) {
7436
+ throw new ConnectionError("Failed to query library versions", error);
7437
+ }
7438
+ }
7439
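// Editor's note: the version ordering above, isolated (semver as imported by
// this bundle; version list illustrative). Unversioned entries sort first, and
// semver ordering beats plain string comparison for multi-digit segments.
import semver from "semver";
const versions = ["2.0.0", "", "1.10.0", "1.9.0"];
versions.sort((a, b) => {
  if (a === "" && b !== "") return -1;
  if (a !== "" && b === "") return 1;
  if (a === "" && b === "") return 0;
  try { return semver.compare(a, b); } catch { return a.localeCompare(b); }
});
console.log(versions); // ["", "1.9.0", "1.10.0", "2.0.0"]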
+ /**
7440
+ * Stores documents with library and version metadata, generating embeddings
7441
+ * for vector similarity search. Automatically removes any existing documents
7442
+ * for the same URLs before adding new ones to prevent UNIQUE constraint violations.
7443
+ */
7444
+ async addDocuments(library, version2, documents) {
7445
+ try {
7446
+ if (documents.length === 0) {
7447
+ return;
7448
+ }
7449
+ const urls = /* @__PURE__ */ new Set();
7450
+ for (const doc of documents) {
7451
+ const url = doc.metadata.url;
7452
+ if (!url || typeof url !== "string" || !url.trim()) {
7453
+ throw new StoreError("Document metadata must include a valid URL");
7454
+ }
7455
+ urls.add(url);
7456
+ }
7457
+ const texts = documents.map((doc) => {
7458
+ const header = `<title>${doc.metadata.title}</title>
7459
+ <url>${doc.metadata.url}</url>
7460
+ <path>${doc.metadata.path.join(" / ")}</path>
7461
+ `;
7462
+ return `${header}${doc.pageContent}`;
7463
+ });
7464
+ const maxBatchChars = Number(process.env.DOCS_MCP_EMBEDDING_BATCH_CHARS) || EMBEDDING_BATCH_CHARS;
7465
+ const rawEmbeddings = [];
7466
+ let currentBatch = [];
7467
+ let currentBatchSize = 0;
7468
+ let batchCount = 0;
7469
+ for (const text of texts) {
7470
+ const textSize = text.length;
7471
+ if (currentBatchSize + textSize > maxBatchChars && currentBatch.length > 0) {
7472
+ batchCount++;
7473
+ logger.debug(
7474
+ `🔄 Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
7475
+ );
7476
+ const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
7477
+ rawEmbeddings.push(...batchEmbeddings);
7478
+ currentBatch = [];
7479
+ currentBatchSize = 0;
7480
+ }
7481
+ currentBatch.push(text);
7482
+ currentBatchSize += textSize;
7483
+ if (currentBatch.length >= EMBEDDING_BATCH_SIZE) {
7484
+ batchCount++;
7485
+ logger.debug(
7486
+ `🔄 Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
7487
+ );
7488
+ const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
7489
+ rawEmbeddings.push(...batchEmbeddings);
7490
+ currentBatch = [];
7491
+ currentBatchSize = 0;
7492
+ }
7493
+ }
7494
+ if (currentBatch.length > 0) {
7495
+ batchCount++;
7496
+ logger.debug(
7497
+ `🔄 Processing final embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
7498
+ );
7499
+ const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
7500
+ rawEmbeddings.push(...batchEmbeddings);
7501
+ }
7502
+ const paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
7503
+ const { libraryId, versionId } = await this.resolveLibraryAndVersionIds(
7504
+ library,
7505
+ version2
7506
+ );
7507
+ for (const url of urls) {
7508
+ const deletedCount = await this.deleteDocumentsByUrl(library, version2, url);
7509
+ if (deletedCount > 0) {
7510
+ logger.debug(`Deleted ${deletedCount} existing documents for URL: ${url}`);
7511
+ }
7512
+ }
7513
+ const transaction = this.db.transaction((docs) => {
7514
+ for (let i = 0; i < docs.length; i++) {
7515
+ const doc = docs[i];
7516
+ const url = doc.metadata.url;
7517
+ const result = this.statements.insertDocument.run(
7518
+ BigInt(libraryId),
7519
+ BigInt(versionId),
7520
+ url,
7521
+ doc.pageContent,
7522
+ JSON.stringify(doc.metadata),
7523
+ i,
7524
+ (/* @__PURE__ */ new Date()).toISOString()
7525
+ // Pass current timestamp for indexed_at
7526
+ );
7527
+ const rowId = result.lastInsertRowid;
7528
+ this.statements.insertEmbedding.run(
7529
+ BigInt(rowId),
7530
+ BigInt(libraryId),
7531
+ BigInt(versionId),
7532
+ JSON.stringify(paddedEmbeddings[i])
7533
+ );
7534
+ }
7535
+ });
7536
+ transaction(documents);
7537
+ } catch (error) {
7538
+ throw new ConnectionError("Failed to add documents to store", error);
7539
+ }
7540
+ }
7541
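// Editor's note: the embedding loop above condensed into a helper, to make the
// two flush triggers explicit (the constants and the embed callback are stand-ins).
const MAX_BATCH_CHARS = 50_000; // assumed character budget per request
const MAX_BATCH_SIZE = 100;     // assumed item cap per request
async function embedInBatches(texts, embed) {
  const out = [];
  let batch = [], chars = 0;
  const flush = async () => {
    if (batch.length) { out.push(...(await embed(batch))); batch = []; chars = 0; }
  };
  for (const text of texts) {
    if (chars + text.length > MAX_BATCH_CHARS && batch.length > 0) await flush();
    batch.push(text);
    chars += text.length;
    if (batch.length >= MAX_BATCH_SIZE) await flush();
  }
  await flush();
  return out;
}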
+ /**
7542
+ * Removes documents matching specified library and version
7543
+ * @returns Number of documents deleted
7544
+ */
7545
+ async deleteDocuments(library, version2) {
7546
+ try {
7547
+ const normalizedVersion = version2.toLowerCase();
7548
+ const result = this.statements.deleteDocuments.run(
7549
+ library.toLowerCase(),
7550
+ library.toLowerCase(),
7551
+ // library name appears twice in the query
7552
+ normalizedVersion
7553
+ );
7554
+ return result.changes;
7555
+ } catch (error) {
7556
+ throw new ConnectionError("Failed to delete documents", error);
7557
+ }
7558
+ }
7559
+ /**
7560
+ * Removes documents for a specific URL within a library and version
7561
+ * @returns Number of documents deleted
7562
+ */
7563
+ async deleteDocumentsByUrl(library, version2, url) {
7564
+ try {
7565
+ const normalizedVersion = version2.toLowerCase();
7566
+ const result = this.statements.deleteDocumentsByUrl.run(
7567
+ url,
7568
+ library.toLowerCase(),
7569
+ library.toLowerCase(),
7570
+ // library name appears twice in the query
7571
+ normalizedVersion
7572
+ );
7573
+ return result.changes;
7574
+ } catch (error) {
7575
+ throw new ConnectionError("Failed to delete documents by URL", error);
7576
+ }
7577
+ }
7578
+ /**
7579
+ * Completely removes a library version and all associated documents.
7580
+ * Optionally removes the library if no other versions remain.
7581
+ * @param library Library name
7582
+ * @param version Version string (empty string for unversioned)
7583
+ * @param removeLibraryIfEmpty Whether to remove the library if no versions remain
7584
+ * @returns Object with counts of deleted documents, version deletion status, and library deletion status
7585
+ */
7586
+ async removeVersion(library, version2, removeLibraryIfEmpty = true) {
7587
+ try {
7588
+ const normalizedLibrary = library.toLowerCase();
7589
+ const normalizedVersion = version2.toLowerCase();
7590
+ const versionResult = this.statements.getVersionId.get(
7591
+ normalizedLibrary,
7592
+ normalizedVersion
7593
+ );
7594
+ if (!versionResult) {
7595
+ return { documentsDeleted: 0, versionDeleted: false, libraryDeleted: false };
7596
+ }
7597
+ const { id: versionId, library_id: libraryId } = versionResult;
7598
+ const documentsDeleted = await this.deleteDocuments(library, version2);
7599
+ const versionDeleteResult = this.statements.deleteVersionById.run(versionId);
7600
+ const versionDeleted = versionDeleteResult.changes > 0;
7601
+ let libraryDeleted = false;
7602
+ if (removeLibraryIfEmpty && versionDeleted) {
7603
+ const countResult = this.statements.countVersionsByLibraryId.get(libraryId);
7604
+ const remainingVersions = countResult?.count ?? 0;
7605
+ if (remainingVersions === 0) {
7606
+ const libraryDeleteResult = this.statements.deleteLibraryById.run(libraryId);
7607
+ libraryDeleted = libraryDeleteResult.changes > 0;
7608
+ }
7609
+ }
7610
+ return { documentsDeleted, versionDeleted, libraryDeleted };
7611
+ } catch (error) {
7612
+ throw new ConnectionError("Failed to remove version", error);
7613
+ }
7614
+ }
7615
+ /**
7616
+ * Retrieves a document by its ID.
7617
+ * @param id The ID of the document.
7618
+ * @returns The document, or null if not found.
7619
+ */
7620
+ async getById(id) {
7621
+ try {
7622
+ const row = this.statements.getById.get(BigInt(id));
7623
+ if (!row) {
7624
+ return null;
7625
+ }
7626
+ return mapDbDocumentToDocument(row);
7627
+ } catch (error) {
7628
+ throw new ConnectionError(`Failed to get document by ID ${id}`, error);
7629
+ }
7630
+ }
7631
+ /**
7632
+ * Finds documents matching a text query using hybrid search.
7633
+ * Combines vector similarity search with full-text search using Reciprocal Rank Fusion.
7634
+ */
7635
+ async findByContent(library, version2, query, limit) {
7636
+ try {
7637
+ const rawEmbedding = await this.embeddings.embedQuery(query);
7638
+ const embedding = this.padVector(rawEmbedding);
7639
+ const ftsQuery = this.escapeFtsQuery(query);
7640
+ const normalizedVersion = version2.toLowerCase();
7641
+ const stmt = this.db.prepare(`
7642
+ WITH vec_distances AS (
7643
+ SELECT
7644
+ dv.rowid as id,
7645
+ dv.distance as vec_distance
7646
+ FROM documents_vec dv
7647
+ JOIN versions v ON dv.version_id = v.id
7648
+ JOIN libraries l ON v.library_id = l.id
7649
+ WHERE l.name = ?
7650
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7651
+ AND dv.embedding MATCH ?
7652
+ AND dv.k = ?
7653
+ ORDER BY dv.distance
7654
+ ),
7655
+ fts_scores AS (
7656
+ SELECT
7657
+ f.rowid as id,
7658
+ bm25(documents_fts, 10.0, 1.0, 5.0, 1.0) as fts_score
7659
+ FROM documents_fts f
7660
+ JOIN documents d ON f.rowid = d.id
7661
+ JOIN versions v ON d.version_id = v.id
7662
+ JOIN libraries l ON v.library_id = l.id
7663
+ WHERE l.name = ?
7664
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7665
+ AND documents_fts MATCH ?
7666
+ ORDER BY fts_score
7667
+ LIMIT ?
7668
+ )
7669
+ SELECT
7670
+ d.id,
7671
+ d.content,
7672
+ d.metadata,
7673
+ COALESCE(1 / (1 + v.vec_distance), 0) as vec_score,
7674
+ COALESCE(-MIN(f.fts_score, 0), 0) as fts_score
7675
+ FROM documents d
7676
+ LEFT JOIN vec_distances v ON d.id = v.id
7677
+ LEFT JOIN fts_scores f ON d.id = f.id
7678
+ WHERE v.id IS NOT NULL OR f.id IS NOT NULL
7679
+ `);
7680
+ const rawResults = stmt.all(
7681
+ library.toLowerCase(),
7682
+ normalizedVersion,
7683
+ JSON.stringify(embedding),
7684
+ limit,
7685
+ library.toLowerCase(),
7686
+ normalizedVersion,
7687
+ ftsQuery,
7688
+ // Use the escaped query
7689
+ limit
7690
+ );
7691
+ const rankedResults = this.assignRanks(rawResults);
7692
+ const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
7693
+ return topResults.map((row) => ({
7694
+ ...mapDbDocumentToDocument(row),
7695
+ metadata: {
7696
+ ...JSON.parse(row.metadata),
7697
+ id: row.id,
7698
+ score: row.rrf_score,
7699
+ vec_rank: row.vec_rank,
7700
+ fts_rank: row.fts_rank
7701
+ }
7702
+ }));
7703
+ } catch (error) {
7704
+ throw new ConnectionError(
7705
+ `Failed to find documents by content with query "${query}"`,
7706
+ error
7707
+ );
7708
+ }
7709
+ }
7710
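// Editor's note: the SELECT above maps both signals onto "higher is better"
// before rank fusion; traced with illustrative values.
const vecScore = (distance) => 1 / (1 + distance); // distance reported by sqlite-vec
const ftsScore = (bm25) => -Math.min(bm25, 0);     // bm25 is negative for matches
console.log(vecScore(0));    // 1 (identical vectors)
console.log(vecScore(0.5));  // 0.666... (farther vectors decay smoothly)
console.log(ftsScore(-3.2)); // 3.2 (stronger FTS match)
console.log(ftsScore(0));    // 0 (no FTS contribution)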
+ /**
7711
+ * Finds child chunks of a given document based on path hierarchy.
7712
+ */
7713
+ async findChildChunks(library, version2, id, limit) {
7714
+ try {
7715
+ const parent = await this.getById(id);
7716
+ if (!parent) {
7717
+ return [];
7718
+ }
7719
+ const parentPath = parent.metadata.path ?? [];
7720
+ const parentUrl = parent.metadata.url;
7721
+ const normalizedVersion = version2.toLowerCase();
7722
+ const result = this.statements.getChildChunks.all(
7723
+ library.toLowerCase(),
7724
+ normalizedVersion,
7725
+ parentUrl,
7726
+ parentPath.length + 1,
7727
+ JSON.stringify(parentPath),
7728
+ BigInt(id),
7729
+ limit
7730
+ );
7731
+ return result.map((row) => mapDbDocumentToDocument(row));
7732
+ } catch (error) {
7733
+ throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
7734
+ }
7735
+ }
7736
+ /**
7737
+ * Finds preceding sibling chunks of a given document.
7738
+ */
7739
+ async findPrecedingSiblingChunks(library, version2, id, limit) {
7740
+ try {
7741
+ const reference = await this.getById(id);
7742
+ if (!reference) {
7743
+ return [];
7744
+ }
7745
+ const refMetadata = reference.metadata;
7746
+ const normalizedVersion = version2.toLowerCase();
7747
+ const result = this.statements.getPrecedingSiblings.all(
7748
+ library.toLowerCase(),
7749
+ normalizedVersion,
7750
+ refMetadata.url,
7751
+ BigInt(id),
7752
+ JSON.stringify(refMetadata.path),
7753
+ limit
7754
+ );
7755
+ return result.reverse().map((row) => mapDbDocumentToDocument(row));
7756
+ } catch (error) {
7757
+ throw new ConnectionError(
7758
+ `Failed to find preceding sibling chunks for ID ${id}`,
7759
+ error
7760
+ );
7761
+ }
7762
+ }
7763
+ /**
7764
+ * Finds subsequent sibling chunks of a given document.
7765
+ */
7766
+ async findSubsequentSiblingChunks(library, version2, id, limit) {
7767
+ try {
7768
+ const reference = await this.getById(id);
7769
+ if (!reference) {
7770
+ return [];
7771
+ }
7772
+ const refMetadata = reference.metadata;
7773
+ const normalizedVersion = version2.toLowerCase();
7774
+ const result = this.statements.getSubsequentSiblings.all(
7775
+ library.toLowerCase(),
7776
+ normalizedVersion,
7777
+ refMetadata.url,
7778
+ BigInt(id),
7779
+ JSON.stringify(refMetadata.path),
7780
+ limit
7781
+ );
7782
+ return result.map((row) => mapDbDocumentToDocument(row));
7783
+ } catch (error) {
7784
+ throw new ConnectionError(
7785
+ `Failed to find subsequent sibling chunks for ID ${id}`,
7786
+ error
7787
+ );
7788
+ }
7789
+ }
7790
+ /**
7791
+ * Finds the parent chunk of a given document.
7792
+ */
7793
+ async findParentChunk(library, version2, id) {
7794
+ try {
7795
+ const child = await this.getById(id);
7796
+ if (!child) {
7797
+ return null;
7798
+ }
7799
+ const childMetadata = child.metadata;
7800
+ const path2 = childMetadata.path ?? [];
7801
+ const parentPath = path2.slice(0, -1);
7802
+ if (parentPath.length === 0) {
7803
+ return null;
7804
+ }
7805
+ const normalizedVersion = version2.toLowerCase();
7806
+ const result = this.statements.getParentChunk.get(
7807
+ library.toLowerCase(),
7808
+ normalizedVersion,
7809
+ childMetadata.url,
7810
+ JSON.stringify(parentPath),
7811
+ BigInt(id)
7812
+ );
7813
+ if (!result) {
7814
+ return null;
7815
+ }
7816
+ return mapDbDocumentToDocument(result);
7817
+ } catch (error) {
7818
+ throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error);
7819
+ }
7820
+ }
7821
+ /**
7822
+ * Fetches multiple documents by their IDs in a single call.
7823
+ * Returns an array of Document objects, sorted by their sort_order.
7824
+ */
7825
+ async findChunksByIds(library, version2, ids) {
7826
+ if (!ids.length) return [];
7827
+ try {
7828
+ const normalizedVersion = version2.toLowerCase();
7829
+ const placeholders = ids.map(() => "?").join(",");
7830
+ const stmt = this.db.prepare(
7831
+ `SELECT d.* FROM documents d
7832
+ JOIN libraries l ON d.library_id = l.id
7833
+ JOIN versions v ON d.version_id = v.id
7834
+ WHERE l.name = ?
7835
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7836
+ AND d.id IN (${placeholders})
7837
+ ORDER BY d.sort_order`
7838
+ );
7839
+ const rows = stmt.all(
7840
+ library.toLowerCase(),
7841
+ normalizedVersion,
7842
+ ...ids
7843
+ );
7844
+ return rows.map((row) => mapDbDocumentToDocument(row));
7845
+ } catch (error) {
7846
+ throw new ConnectionError("Failed to fetch documents by IDs", error);
7847
+ }
7848
+ }
7849
+ }
7850
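// Editor's note: the placeholder construction inside findChunksByIds, isolated
// (illustrative). One "?" per id, with the ids spread into the bind parameters.
const ids = [11, 42, 7];
const placeholders = ids.map(() => "?").join(",");
console.log(`... AND d.id IN (${placeholders})`); // ... AND d.id IN (?,?,?)
// then: stmt.all(libraryName, version, ...ids)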
+ class DocumentManagementService {
7851
+ store;
7852
+ documentRetriever;
7853
+ splitter;
7854
+ /**
7855
+ * Normalizes a version string, converting null or undefined to an empty string
7856
+ * and converting to lowercase.
7857
+ */
7858
+ normalizeVersion(version2) {
7859
+ return (version2 ?? "").toLowerCase();
7860
+ }
7861
+ constructor(embeddingConfig) {
7862
+ let dbPath;
7863
+ let dbDir;
7864
+ const envStorePath = process.env.DOCS_MCP_STORE_PATH;
7865
+ if (envStorePath) {
7866
+ dbDir = envStorePath;
7867
+ dbPath = path.join(dbDir, "documents.db");
7868
+ logger.debug(`Using database directory from DOCS_MCP_STORE_PATH: ${dbDir}`);
7869
+ } else {
7870
+ const projectRoot2 = getProjectRoot();
7871
+ const oldDbDir = path.join(projectRoot2, ".store");
7872
+ const oldDbPath = path.join(oldDbDir, "documents.db");
7873
+ const oldDbExists = fs.existsSync(oldDbPath);
7874
+ if (oldDbExists) {
7875
+ dbPath = oldDbPath;
7876
+ dbDir = oldDbDir;
7877
+ logger.debug(`Using legacy database path: ${dbPath}`);
7878
+ } else {
7879
+ const standardPaths = envPaths("docs-mcp-server", { suffix: "" });
7880
+ dbDir = standardPaths.data;
7881
+ dbPath = path.join(dbDir, "documents.db");
7882
+ logger.debug(`Using standard database directory: ${dbDir}`);
7883
+ }
7884
+ }
7885
+ try {
7886
+ fs.mkdirSync(dbDir, { recursive: true });
7887
+ } catch (error) {
7888
+ logger.error(`⚠️ Failed to create database directory ${dbDir}: ${error}`);
7889
+ }
7890
+ this.store = new DocumentStore(dbPath, embeddingConfig);
7891
+ this.documentRetriever = new DocumentRetrieverService(this.store);
7892
+ const semanticSplitter = new SemanticMarkdownSplitter(
7893
+ SPLITTER_PREFERRED_CHUNK_SIZE,
7894
+ SPLITTER_MAX_CHUNK_SIZE
7895
+ );
7896
+ const greedySplitter = new GreedySplitter(
7897
+ semanticSplitter,
7898
+ SPLITTER_MIN_CHUNK_SIZE,
7899
+ SPLITTER_PREFERRED_CHUNK_SIZE
7900
+ );
7901
+ this.splitter = greedySplitter;
7902
+ }
7903
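// Editor's note: the database path precedence in the constructor above, as a
// standalone sketch (directory names illustrative; env-paths as used by this bundle).
import path from "node:path";
import fs from "node:fs";
import envPaths from "env-paths";
function resolveDbPath(projectRoot) {
  if (process.env.DOCS_MCP_STORE_PATH)
    return path.join(process.env.DOCS_MCP_STORE_PATH, "documents.db"); // 1. explicit override
  const legacy = path.join(projectRoot, ".store", "documents.db");
  if (fs.existsSync(legacy)) return legacy;                            // 2. pre-existing legacy store
  return path.join(envPaths("docs-mcp-server", { suffix: "" }).data, "documents.db"); // 3. platform default
}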
+ /**
7904
+ * Initializes the underlying document store.
7905
+ */
7906
+ async initialize() {
7907
+ await this.store.initialize();
7908
+ }
7909
+ /**
7910
+ * Shuts down the underlying document store.
7911
+ */
7912
+ async shutdown() {
7913
+ logger.debug("Shutting down store manager");
7914
+ await this.store.shutdown();
7915
+ }
7916
+ // Status tracking methods for pipeline integration
7917
+ /**
7918
+ * Gets versions by their current status.
7919
+ */
7920
+ async getVersionsByStatus(statuses) {
7921
+ return this.store.getVersionsByStatus(statuses);
7922
+ }
7923
+ /**
7924
+ * Updates the status of a version.
7925
+ */
7926
+ async updateVersionStatus(versionId, status, errorMessage) {
7927
+ return this.store.updateVersionStatus(versionId, status, errorMessage);
7928
+ }
7929
+ /**
7930
+ * Updates the progress of a version being indexed.
7931
+ */
7932
+ async updateVersionProgress(versionId, pages, maxPages) {
7933
+ return this.store.updateVersionProgress(versionId, pages, maxPages);
7934
+ }
7935
+ /**
7936
+ * Stores scraper options for a version to enable reproducible indexing.
7937
+ */
7938
+ async storeScraperOptions(versionId, options) {
7939
+ return this.store.storeScraperOptions(versionId, options);
7940
+ }
7941
+ /**
7942
+ * Retrieves the stored scraping configuration (scraper options) for a version.
7943
+ */
7944
+ async getScraperOptions(versionId) {
7948
+ return this.store.getScraperOptions(versionId);
7949
+ }
7950
+ /**
7951
+ * Ensures a library/version exists using a VersionRef and returns version ID.
7952
+ * Delegates to existing ensureLibraryAndVersion for storage.
7953
+ */
7954
+ async ensureVersion(ref) {
7955
+ const normalized = {
7956
+ library: ref.library.trim().toLowerCase(),
7957
+ version: (ref.version ?? "").trim().toLowerCase()
7958
+ };
7959
+ return this.ensureLibraryAndVersion(normalized.library, normalized.version);
7960
+ }
7961
+ /**
7962
+ * Returns enriched library summaries including version status/progress and counts.
7963
+ * Uses existing store APIs; keeps DB details encapsulated.
7964
+ */
7965
+ async listLibraries() {
7966
+ const libMap = await this.store.queryLibraryVersions();
7967
+ const summaries = [];
7968
+ for (const [library, versions] of libMap) {
7969
+ const vs = versions.map(
7970
+ (v) => ({
7971
+ id: v.versionId,
7972
+ ref: { library, version: v.version },
7973
+ status: v.status,
7974
+ // Report progress unless indexing has COMPLETED; failed/cancelled keep their last counters
7975
+ progress: v.status === "completed" ? void 0 : { pages: v.progressPages, maxPages: v.progressMaxPages },
7976
+ counts: { documents: v.documentCount, uniqueUrls: v.uniqueUrlCount },
7977
+ indexedAt: v.indexedAt,
7978
+ sourceUrl: v.sourceUrl ?? void 0
7979
+ })
7980
+ );
7981
+ summaries.push({ library, versions: vs });
7982
+ }
7983
+ return summaries;
7984
+ }
7985
+ /**
7986
+ * Finds versions that were indexed from the same source URL.
7987
+ */
7988
+ async findVersionsBySourceUrl(url) {
7989
+ return this.store.findVersionsBySourceUrl(url);
7990
+ }
7991
+ /**
7992
+ * Validates if a library exists in the store (either versioned or unversioned).
7993
+ * Throws LibraryNotFoundError with suggestions if the library is not found.
7994
+ * @param library The name of the library to validate.
7995
+ * @throws {LibraryNotFoundError} If the library does not exist.
7996
+ */
7997
+ async validateLibraryExists(library) {
7998
+ logger.info(`🔎 Validating existence of library: ${library}`);
7999
+ const normalizedLibrary = library.toLowerCase();
8000
+ const versions = await this.listVersions(normalizedLibrary);
8001
+ const hasUnversioned = await this.exists(normalizedLibrary, "");
8002
+ if (versions.length === 0 && !hasUnversioned) {
8003
+ logger.warn(`⚠️ Library '${library}' not found.`);
8004
+ const allLibraries = await this.listLibraries();
8005
+ const libraryNames = allLibraries.map((lib) => lib.library);
8006
+ let suggestions = [];
8007
+ if (libraryNames.length > 0) {
8008
+ const fuse = new Fuse(libraryNames, {
8009
+ // Configure fuse.js options if needed (e.g., threshold)
8010
+ // isCaseSensitive: false, // Handled by normalizing library names
8011
+ // includeScore: true,
8012
+ threshold: 0.4
8013
+ // Adjust threshold for desired fuzziness (0=exact, 1=match anything)
8014
+ });
8015
+ const results = fuse.search(normalizedLibrary);
8016
+ suggestions = results.slice(0, 3).map((result) => result.item);
8017
+ logger.info(`🔍 Found suggestions: ${suggestions.join(", ")}`);
8018
+ }
8019
+ throw new LibraryNotFoundError(library, suggestions);
8020
+ }
8021
+ logger.info(`✅ Library '${library}' confirmed to exist.`);
8022
+ }
8023
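// Editor's note: the fuzzy-suggestion step above, standalone (threshold 0.4 as
// in the code; the library names are made up).
import Fuse from "fuse.js";
const names = ["react", "redux", "express", "fastify"];
const fuse = new Fuse(names, { threshold: 0.4 });
console.log(fuse.search("raect").slice(0, 3).map((r) => r.item)); // expected: [ 'react' ]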
+ /**
8024
+ * Returns a list of all available semantic versions for a library.
8025
+ */
8026
+ async listVersions(library) {
8027
+ const versions = await this.store.queryUniqueVersions(library);
8028
+ return versions.filter((v) => semver__default.valid(v));
8029
+ }
8030
+ /**
8031
+ * Checks if documents exist for a given library and optional version.
8032
+ * If version is omitted, checks for documents without a specific version.
5277
8033
  */
5278
- logStartupInfo(address) {
5279
- logger.info(`🚀 AppServer available at ${address}`);
5280
- const enabledServices = [];
5281
- if (this.config.enableWebInterface) {
5282
- enabledServices.push(`Web interface: ${address}`);
5283
- }
5284
- if (this.config.enableMcpServer) {
5285
- enabledServices.push(`MCP endpoints: ${address}/mcp, ${address}/sse`);
5286
- }
5287
- if (this.config.enableApiServer) {
5288
- enabledServices.push(`API: ${address}/api`);
5289
- }
5290
- if (this.config.enableWorker) {
5291
- enabledServices.push("Embedded worker: enabled");
5292
- } else if (this.config.externalWorkerUrl) {
5293
- enabledServices.push(`External worker: ${this.config.externalWorkerUrl}`);
8034
+ async exists(library, version2) {
8035
+ const normalizedVersion = this.normalizeVersion(version2);
8036
+ return this.store.checkDocumentExists(library, normalizedVersion);
8037
+ }
8038
+ /**
8039
+ * Finds the most appropriate version of documentation based on the requested version.
8040
+ * When no target version is specified, returns the latest version.
8041
+ *
8042
+ * Version matching behavior:
8043
+ * - Exact versions (e.g., "18.0.0"): Matches that version or any earlier version
8044
+ * - X-Range patterns (e.g., "5.x", "5.2.x"): Matches within the specified range
8045
+ * - "latest" or no version: Returns the latest available version
8046
+ *
8047
+ * For documentation, we prefer matching older versions over no match at all,
8048
+ * since older docs are often still relevant and useful.
8049
+ * Also checks if unversioned documents exist for the library.
8050
+ */
8051
+ async findBestVersion(library, targetVersion) {
8052
+ const libraryAndVersion = `${library}${targetVersion ? `@${targetVersion}` : ""}`;
8053
+ logger.info(`🔍 Finding best version for ${libraryAndVersion}`);
8054
+ const hasUnversioned = await this.store.checkDocumentExists(library, "");
8055
+ const versionStrings = await this.listVersions(library);
8056
+ if (versionStrings.length === 0) {
8057
+ if (hasUnversioned) {
8058
+ logger.info(`ℹ️ Unversioned documents exist for ${library}`);
8059
+ return { bestMatch: null, hasUnversioned: true };
8060
+ }
8061
+ logger.warn(`⚠️ No valid versions found for ${library}`);
8062
+ const allLibraryDetails = await this.store.queryLibraryVersions();
8063
+ const libraryDetails = allLibraryDetails.get(library) ?? [];
8064
+ throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
8065
+ }
8066
+ let bestMatch = null;
8067
+ if (!targetVersion || targetVersion === "latest") {
8068
+ bestMatch = semver__default.maxSatisfying(versionStrings, "*");
8069
+ } else {
8070
+ const versionRegex = /^(\d+)(?:\.(?:x(?:\.x)?|\d+(?:\.(?:x|\d+))?))?$|^$/;
8071
+ if (!versionRegex.test(targetVersion)) {
8072
+ logger.warn(`⚠️ Invalid target version format: ${targetVersion}`);
8073
+ } else {
8074
+ let range = targetVersion;
8075
+ if (!semver__default.validRange(targetVersion)) {
8076
+ range = `~${targetVersion}`;
8077
+ } else if (semver__default.valid(targetVersion)) {
8078
+ range = `${range} || <=${targetVersion}`;
8079
+ }
8080
+ bestMatch = semver__default.maxSatisfying(versionStrings, range);
8081
+ }
5294
8082
  }
5295
- for (const service of enabledServices) {
5296
- logger.info(` • ${service}`);
8083
+ if (bestMatch) {
8084
+ logger.info(`✅ Found best match version ${bestMatch} for ${libraryAndVersion}`);
8085
+ } else {
8086
+ logger.warn(`⚠️ No matching semver version found for ${libraryAndVersion}`);
5297
8087
  }
5298
- }
5299
- }
5300
- async function startAppServer(docService, pipeline, config) {
5301
- const appServer = new AppServer(docService, pipeline, config);
5302
- await appServer.start();
5303
- return appServer;
5304
- }
5305
- async function startStdioServer(tools, readOnly = false) {
5306
- setLogLevel(LogLevel.ERROR);
5307
- const server = createMcpServerInstance(tools, readOnly);
5308
- const transport = new StdioServerTransport();
5309
- await server.connect(transport);
5310
- logger.info("🤖 MCP server listening on stdio");
5311
- return server;
5312
- }
5313
- class StoreError extends Error {
5314
- constructor(message, cause) {
5315
- super(cause ? `${message} caused by ${cause}` : message);
5316
- this.cause = cause;
5317
- this.name = this.constructor.name;
5318
- const causeError = cause instanceof Error ? cause : cause ? new Error(String(cause)) : void 0;
5319
- if (causeError?.stack) {
5320
- this.stack = causeError.stack;
8088
+ if (!bestMatch && !hasUnversioned) {
8089
+ const allLibraryDetails = await this.store.queryLibraryVersions();
8090
+ const libraryDetails = allLibraryDetails.get(library) ?? [];
8091
+ throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
5321
8092
  }
8093
+ return { bestMatch, hasUnversioned };
5322
8094
  }
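// Editor's note: the range construction above traced for a few targets (semver
// as imported by this bundle; the available versions are illustrative).
import semver from "semver";
const available = ["17.0.2", "18.0.0", "18.2.0"];
const pick = (target) => {
  let range = target;
  if (!semver.validRange(target)) range = `~${target}`;
  else if (semver.valid(target)) range = `${range} || <=${target}`;
  return semver.maxSatisfying(available, range);
};
console.log(pick("18.x"));   // "18.2.0" (X-range stays inside major 18)
console.log(pick("18.1.0")); // "18.0.0" (exact target falls back to an earlier release)
console.log(semver.maxSatisfying(available, "*")); // "18.2.0" (the "latest" path)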
5323
- }
5324
- class DimensionError extends StoreError {
5325
- constructor(modelName, modelDimension, dbDimension) {
5326
- super(
5327
- `Model "${modelName}" produces ${modelDimension}-dimensional vectors, which exceeds the database's fixed dimension of ${dbDimension}. Please use a model with dimension ≤ ${dbDimension}.`
5328
- );
5329
- this.modelName = modelName;
5330
- this.modelDimension = modelDimension;
5331
- this.dbDimension = dbDimension;
5332
- }
5333
- }
5334
- class ConnectionError extends StoreError {
5335
- }
5336
- const MIGRATIONS_DIR = path.join(getProjectRoot(), "db", "migrations");
5337
- const MIGRATIONS_TABLE = "_schema_migrations";
5338
- function ensureMigrationsTable(db) {
5339
- db.exec(`
5340
- CREATE TABLE IF NOT EXISTS ${MIGRATIONS_TABLE} (
5341
- id TEXT PRIMARY KEY,
5342
- applied_at DATETIME DEFAULT CURRENT_TIMESTAMP
8095
+ /**
8096
+ * Removes all documents for a specific library and optional version.
8097
+ * If version is omitted, removes documents without a specific version.
8098
+ */
8099
+ async removeAllDocuments(library, version2) {
8100
+ const normalizedVersion = this.normalizeVersion(version2);
8101
+ logger.info(
8102
+ `🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
5343
8103
  );
5344
- `);
5345
- }
5346
- function getAppliedMigrations(db) {
5347
- const stmt = db.prepare(`SELECT id FROM ${MIGRATIONS_TABLE}`);
5348
- const rows = stmt.all();
5349
- return new Set(rows.map((row) => row.id));
5350
- }
5351
- async function applyMigrations(db) {
5352
- try {
5353
- db.pragma("journal_mode = OFF");
5354
- db.pragma("synchronous = OFF");
5355
- db.pragma("mmap_size = 268435456");
5356
- db.pragma("cache_size = -64000");
5357
- db.pragma("temp_store = MEMORY");
5358
- logger.debug("Applied performance optimizations for migration");
5359
- } catch (_error) {
5360
- logger.warn("⚠️ Could not apply all performance optimizations for migration");
8104
+ const count = await this.store.deleteDocuments(library, normalizedVersion);
8105
+ logger.info(`🗑️ Deleted ${count} documents`);
5361
8106
  }
5362
- const overallTransaction = db.transaction(() => {
5363
- logger.debug("Checking database migrations...");
5364
- ensureMigrationsTable(db);
5365
- const appliedMigrations = getAppliedMigrations(db);
5366
- if (!fs.existsSync(MIGRATIONS_DIR)) {
5367
- throw new StoreError("Migrations directory not found");
5368
- }
5369
- const migrationFiles = fs.readdirSync(MIGRATIONS_DIR).filter((file) => file.endsWith(".sql")).sort();
5370
- const pendingMigrations = migrationFiles.filter(
5371
- (filename) => !appliedMigrations.has(filename)
8107
+ /**
8108
+ * Completely removes a library version and all associated documents.
8109
+ * Also removes the library if no other versions remain.
8110
+ * @param library Library name
8111
+ * @param version Version string (null/undefined for unversioned)
8112
+ */
8113
+ async removeVersion(library, version2) {
8114
+ const normalizedVersion = this.normalizeVersion(version2);
8115
+ logger.info(`🗑️ Removing version: ${library}@${normalizedVersion || "[no version]"}`);
8116
+ const result = await this.store.removeVersion(library, normalizedVersion, true);
8117
+ logger.info(
8118
+ `🗑️ Removed ${result.documentsDeleted} documents, version: ${result.versionDeleted}, library: ${result.libraryDeleted}`
5372
8119
  );
5373
- if (pendingMigrations.length > 0) {
5374
- logger.info(`🔄 Applying ${pendingMigrations.length} database migration(s)...`);
8120
+ if (result.versionDeleted && result.libraryDeleted) {
8121
+ logger.info(`✅ Completely removed library ${library} (was last version)`);
8122
+ } else if (result.versionDeleted) {
8123
+ logger.info(`✅ Removed version ${library}@${normalizedVersion || "[no version]"}`);
8124
+ } else {
8125
+ logger.warn(
8126
+ `⚠️ Version ${library}@${normalizedVersion || "[no version]"} not found`
8127
+ );
5375
8128
  }
5376
- let appliedCount = 0;
5377
- for (const filename of pendingMigrations) {
5378
- logger.debug(`Applying migration: ${filename}`);
5379
- const filePath = path.join(MIGRATIONS_DIR, filename);
5380
- const sql = fs.readFileSync(filePath, "utf8");
5381
- try {
5382
- db.exec(sql);
5383
- const insertStmt = db.prepare(`INSERT INTO ${MIGRATIONS_TABLE} (id) VALUES (?)`);
5384
- insertStmt.run(filename);
5385
- logger.debug(`Applied migration: ${filename}`);
5386
- appliedCount++;
5387
- } catch (error) {
5388
- logger.error(`❌ Failed to apply migration: ${filename} - ${error}`);
5389
- throw new StoreError(`Migration failed: ${filename}`, error);
5390
- }
8129
+ }
8130
+ /**
8131
+ * Adds a document to the store, splitting it into smaller chunks for better search results.
8132
+ * Uses SemanticMarkdownSplitter to maintain markdown structure and content types during splitting.
8133
+ * Preserves hierarchical structure of documents and distinguishes between text and code segments.
8134
+ * If version is omitted, the document is added without a specific version.
8135
+ */
8136
+ async addDocument(library, version2, document) {
8137
+ const processingStart = performance.now();
8138
+ const normalizedVersion = this.normalizeVersion(version2);
8139
+ const url = document.metadata.url;
8140
+ if (!url || typeof url !== "string" || !url.trim()) {
8141
+ throw new StoreError("Document metadata must include a valid URL");
5391
8142
  }
5392
- if (appliedCount > 0) {
5393
- logger.info(`✅ Successfully applied ${appliedCount} migration(s)`);
5394
- } else {
5395
- logger.debug("Database schema is up to date");
8143
+ logger.info(`📚 Adding document: ${document.metadata.title}`);
8144
+ if (!document.pageContent.trim()) {
8145
+ throw new Error("Document content cannot be empty");
5396
8146
  }
5397
- return appliedCount;
5398
- });
5399
- let retries = 0;
5400
- let appliedMigrationsCount = 0;
5401
- while (true) {
5402
8147
  try {
5403
- appliedMigrationsCount = overallTransaction.immediate();
5404
- logger.debug("Database migrations completed successfully");
5405
- if (appliedMigrationsCount > 0) {
5406
- try {
5407
- logger.debug(
5408
- `Running VACUUM after applying ${appliedMigrationsCount} migration(s)...`
5409
- );
5410
- db.exec("VACUUM");
5411
- logger.debug("Database vacuum completed successfully");
5412
- } catch (error) {
5413
- logger.warn(`⚠️ Could not vacuum database after migrations: ${error}`);
8148
+ const chunks = await this.splitter.splitText(document.pageContent);
8149
+ const splitDocs = chunks.map((chunk) => ({
8150
+ pageContent: chunk.content,
8151
+ metadata: {
8152
+ ...document.metadata,
8153
+ level: chunk.section.level,
8154
+ path: chunk.section.path
5414
8155
  }
5415
- } else {
5416
- logger.debug("Skipping VACUUM - no migrations were applied");
5417
- }
5418
- break;
8156
+ }));
8157
+ logger.info(`✂️ Split document into ${splitDocs.length} chunks`);
8158
+ await this.store.addDocuments(library, normalizedVersion, splitDocs);
8159
+ const processingTime = performance.now() - processingStart;
8160
+ analytics.track(TelemetryEvent.DOCUMENT_PROCESSED, {
8161
+ // Content characteristics (privacy-safe)
8162
+ mimeType: document.metadata.mimeType,
8163
+ contentSizeBytes: document.pageContent.length,
8164
+ // Processing metrics
8165
+ processingTimeMs: Math.round(processingTime),
8166
+ chunksCreated: splitDocs.length,
8167
+ // Document characteristics
8168
+ hasTitle: !!document.metadata.title,
8169
+ hasDescription: !!document.metadata.description,
8170
+ urlDomain: extractHostname(url),
8171
+ depth: document.metadata.depth,
8172
+ // Library context
8173
+ library,
8174
+ libraryVersion: normalizedVersion || null,
8175
+ // Processing efficiency
8176
+ avgChunkSizeBytes: Math.round(document.pageContent.length / splitDocs.length),
8177
+ processingSpeedKbPerSec: Math.round(
8178
+ document.pageContent.length / 1024 / (processingTime / 1e3)
8179
+ )
8180
+ });
5419
8181
  } catch (error) {
5420
- if (error?.code === "SQLITE_BUSY" && retries < MIGRATION_MAX_RETRIES) {
5421
- retries++;
5422
- logger.warn(
5423
- `⚠️ Migrations busy (SQLITE_BUSY), retrying attempt ${retries}/${MIGRATION_MAX_RETRIES} in ${MIGRATION_RETRY_DELAY_MS}ms...`
5424
- );
5425
- await new Promise((resolve) => setTimeout(resolve, MIGRATION_RETRY_DELAY_MS));
5426
- } else {
5427
- if (error?.code === "SQLITE_BUSY") {
5428
- logger.error(
5429
- `❌ Migrations still busy after ${MIGRATION_MAX_RETRIES} retries. Giving up: ${error}`
5430
- );
5431
- }
5432
- if (error instanceof StoreError) {
5433
- throw error;
5434
- }
5435
- throw new StoreError("Failed during migration process", error);
8182
+ const processingTime = performance.now() - processingStart;
8183
+ if (error instanceof Error) {
8184
+ analytics.captureException(error, {
8185
+ mimeType: document.metadata.mimeType,
8186
+ contentSizeBytes: document.pageContent.length,
8187
+ processingTimeMs: Math.round(processingTime),
8188
+ library,
8189
+ libraryVersion: normalizedVersion || null,
8190
+ context: "document_processing",
8191
+ component: DocumentManagementService.name // class name; ".constructor.name" on the class itself would yield "Function"
8192
+ });
5436
8193
  }
8194
+ throw error;
5437
8195
  }
5438
8196
  }
5439
- try {
5440
- db.pragma("journal_mode = WAL");
5441
- db.pragma("wal_autocheckpoint = 1000");
5442
- db.pragma("busy_timeout = 30000");
5443
- db.pragma("foreign_keys = ON");
5444
- db.pragma("synchronous = NORMAL");
5445
- logger.debug(
5446
- "Applied production database configuration (WAL mode, autocheckpoint, foreign keys, busy timeout)"
8197
+ /**
8198
+ * Searches for documentation content across versions.
8199
+ * Uses hybrid search (vector + FTS).
8200
+ * If version is omitted, searches documents without a specific version.
8201
+ */
8202
+ async searchStore(library, version2, query, limit = 5) {
8203
+ const normalizedVersion = this.normalizeVersion(version2);
8204
+ return this.documentRetriever.search(library, normalizedVersion, query, limit);
8205
+ }
8206
+ // Deprecated simple listing removed: enriched listLibraries() is canonical
8207
+ /**
8208
+ * Ensures a library and version exist in the database and returns the version ID.
8209
+ * Creates the library and version records if they don't exist.
8210
+ */
8211
+ async ensureLibraryAndVersion(library, version2) {
8212
+ const normalizedLibrary = library.toLowerCase();
8213
+ const normalizedVersion = this.normalizeVersion(version2);
8214
+ const { versionId } = await this.store.resolveLibraryAndVersionIds(
8215
+ normalizedLibrary,
8216
+ normalizedVersion
5447
8217
  );
5448
- } catch (_error) {
5449
- logger.warn("⚠️ Could not apply all production database settings");
8218
+ return versionId;
5450
8219
  }
5451
8220
  }
5452
8221
  async function createDocumentManagement(options = {}) {
5453
8222
  if (options.serverUrl) {
5454
- const { DocumentManagementClient } = await import("./DocumentManagementClient-CAFdDwTu.js");
5455
8223
  const client = new DocumentManagementClient(options.serverUrl);
5456
8224
  await client.initialize();
5457
8225
  return client;
5458
8226
  }
5459
- const service = new (await import("./DocumentManagementService-C1xAzouZ.js")).DocumentManagementService();
8227
+ const service = new DocumentManagementService(options.embeddingConfig);
5460
8228
  await service.initialize();
5461
8229
  return service;
5462
8230
  }
5463
- async function createLocalDocumentManagement() {
5464
- const service = new (await import("./DocumentManagementService-C1xAzouZ.js")).DocumentManagementService();
8231
+ async function createLocalDocumentManagement(embeddingConfig) {
8232
+ const service = new DocumentManagementService(embeddingConfig);
5465
8233
  await service.initialize();
5466
8234
  return service;
5467
8235
  }
@@ -6991,6 +9759,17 @@ function warnHttpUsage(authConfig, port) {
6991
9759
  );
6992
9760
  }
6993
9761
  }
9762
+ function resolveEmbeddingContext(cliArgs) {
9763
+ try {
9764
+ const modelSpec = cliArgs?.embeddingModel || process.env.DOCS_MCP_EMBEDDING_MODEL;
9765
+ logger.debug("Resolving embedding configuration");
9766
+ const config = EmbeddingConfig.parseEmbeddingConfig(modelSpec);
9767
+ return config;
9768
+ } catch (error) {
9769
+ logger.debug(`Failed to resolve embedding configuration: ${error}`);
9770
+ return null;
9771
+ }
9772
+ }
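// Editor's note: how the resolver above is typically consumed (the
// "provider:model" spec format is an assumption based on this bundle's parsing;
// the model name is a placeholder).
process.env.DOCS_MCP_EMBEDDING_MODEL ??= "openai:text-embedding-3-small";
const embeddingContext = resolveEmbeddingContext();
// null means "could not resolve"; callers such as createLocalDocumentManagement
// below accept it and run without embeddings.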
6994
9773
  function createDefaultAction(program) {
6995
9774
  return program.addOption(
6996
9775
  new Option("--protocol <protocol>", "Protocol for MCP server").choices(["auto", "stdio", "http"]).default("auto")
@@ -7030,7 +9809,8 @@ function createDefaultAction(program) {
7030
9809
  warnHttpUsage(authConfig, port);
7031
9810
  }
7032
9811
  ensurePlaywrightBrowsersInstalled();
7033
- const docService = await createLocalDocumentManagement();
9812
+ const embeddingConfig = resolveEmbeddingContext();
9813
+ const docService = await createLocalDocumentManagement(embeddingConfig);
7034
9814
  const pipelineOptions = {
7035
9815
  recoverJobs: options.resume || false,
7036
9816
  // Use --resume flag for job recovery
@@ -7071,24 +9851,12 @@ async function fetchUrlAction(url, options, command) {
7071
9851
  setupLogging(globalOptions);
7072
9852
  const headers = parseHeaders(options.header);
7073
9853
  const fetchUrlTool = new FetchUrlTool(new HttpFetcher(), new FileFetcher());
7074
- const content = await trackTool(
7075
- "fetch_url",
7076
- () => fetchUrlTool.execute({
7077
- url,
7078
- followRedirects: options.followRedirects,
7079
- scrapeMode: options.scrapeMode,
7080
- headers: Object.keys(headers).length > 0 ? headers : void 0
7081
- }),
7082
- (content2) => ({
7083
- url_protocol: extractProtocol(url),
7084
- // Safe: only protocol, not full URL
7085
- follow_redirects: options.followRedirects,
7086
- scrape_mode: options.scrapeMode,
7087
- has_custom_headers: Object.keys(headers).length > 0,
7088
- content_length: content2.length,
7089
- cli_flags: extractCliFlags(process.argv)
7090
- })
7091
- );
9854
+ const content = await fetchUrlTool.execute({
9855
+ url,
9856
+ followRedirects: options.followRedirects,
9857
+ scrapeMode: options.scrapeMode,
9858
+ headers: Object.keys(headers).length > 0 ? headers : void 0
9859
+ });
7092
9860
  console.log(content);
7093
9861
  }
7094
9862
  function createFetchUrlCommand(program) {
@@ -7120,25 +9888,16 @@ async function findVersionAction(library, options, command) {
7120
9888
  const globalOptions = command.parent?.opts() || {};
7121
9889
  setupLogging(globalOptions);
7122
9890
  const serverUrl = options.serverUrl;
7123
- const docService = await createDocumentManagement({ serverUrl });
9891
+ const docService = await createDocumentManagement({
9892
+ serverUrl,
9893
+ embeddingConfig: serverUrl ? void 0 : null
9894
+ });
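Note the deliberate undefined/null split in this call: with a serverUrl the field is left undefined (the remote server resolves its own embedding config), while local execution passes an explicit null, presumably accepted because find-version, like list and remove below, never embeds new content. Condensed:

    // Hedged reading of the convention repeated across the read-only actions:
    //   remote (serverUrl set) -> embeddingConfig: undefined
    //   local, read-only       -> embeddingConfig: null (no embeddings required)
    const embeddingConfig = serverUrl ? undefined : null;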
7124
9895
  try {
7125
9896
  const findVersionTool = new FindVersionTool(docService);
7126
- const versionInfo = await trackTool(
7127
- "find_version",
7128
- () => findVersionTool.execute({
7129
- library,
7130
- targetVersion: options.version
7131
- }),
7132
- (versionInfo2) => ({
7133
- library,
7134
- // Safe: library names are public
7135
- has_target_version: !!options.version,
7136
- result_type: typeof versionInfo2,
7137
- // 'string'
7138
- using_remote_server: !!serverUrl,
7139
- cli_flags: extractCliFlags(process.argv)
7140
- })
7141
- );
9897
+ const versionInfo = await findVersionTool.execute({
9898
+ library,
9899
+ targetVersion: options.version
9900
+ });
7142
9901
  if (!versionInfo) throw new Error("Failed to get version information");
7143
9902
  console.log(versionInfo);
7144
9903
  } finally {
@@ -7155,18 +9914,13 @@ async function listAction(options, command) {
7155
9914
  const globalOptions = command.parent?.opts() || {};
7156
9915
  setupLogging(globalOptions);
7157
9916
  const { serverUrl } = options;
7158
- const docService = await createDocumentManagement({ serverUrl });
9917
+ const docService = await createDocumentManagement({
9918
+ serverUrl,
9919
+ embeddingConfig: serverUrl ? void 0 : null
9920
+ });
7159
9921
  try {
7160
9922
  const listLibrariesTool = new ListLibrariesTool(docService);
7161
- const result = await trackTool(
7162
- "list_libraries",
7163
- () => listLibrariesTool.execute(),
7164
- (result2) => ({
7165
- library_count: result2.libraries.length,
7166
- using_remote_server: !!serverUrl,
7167
- cli_flags: extractCliFlags(process.argv)
7168
- })
7169
- );
9923
+ const result = await listLibrariesTool.execute();
7170
9924
  console.log(formatOutput(result.libraries));
7171
9925
  } finally {
7172
9926
  await docService.shutdown();
@@ -7219,8 +9973,16 @@ function createMcpCommand(program) {
7219
9973
  validateAuthConfig(authConfig);
7220
9974
  }
7221
9975
  try {
9976
+ const embeddingConfig = resolveEmbeddingContext();
9977
+ if (!serverUrl && !embeddingConfig) {
9978
+ logger.error(
9979
+ "❌ Embedding configuration is required for local mode. Configure an embedding provider with CLI options or environment variables."
9980
+ );
9981
+ process.exit(1);
9982
+ }
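The long-running entry points (mcp here, web below) log and process.exit(1) on a missing local embedding config, whereas the one-shot scrape and search actions throw instead, letting the shared CLI catch block handle shutdown. The server-style guard, condensed:

    // Fail fast before wiring any services; one-shot commands throw instead.
    if (!serverUrl && !resolveEmbeddingContext()) {
      logger.error("❌ Embedding configuration is required for local mode. ...");
      process.exit(1);
    }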
7222
9983
  const docService = await createDocumentManagement({
7223
- serverUrl
9984
+ serverUrl,
9985
+ embeddingConfig
7224
9986
  });
7225
9987
  const pipelineOptions = {
7226
9988
  recoverJobs: false,
@@ -7270,20 +10032,13 @@ async function removeAction(library, options, command) {
7270
10032
  const globalOptions = command.parent?.opts() || {};
7271
10033
  setupLogging(globalOptions);
7272
10034
  const serverUrl = options.serverUrl;
7273
- const docService = await createDocumentManagement({ serverUrl });
10035
+ const docService = await createDocumentManagement({
10036
+ serverUrl,
10037
+ embeddingConfig: serverUrl ? void 0 : null
10038
+ });
7274
10039
  const { version: version2 } = options;
7275
10040
  try {
7276
- await trackTool(
7277
- "remove_documents",
7278
- () => docService.removeAllDocuments(library, version2),
7279
- () => ({
7280
- library,
7281
- // Safe: library names are public
7282
- has_version: !!version2,
7283
- using_remote_server: !!serverUrl,
7284
- cli_flags: extractCliFlags(process.argv)
7285
- })
7286
- );
10041
+ await docService.removeAllDocuments(library, version2);
7287
10042
  console.log(`✅ Successfully removed ${library}${version2 ? `@${version2}` : ""}.`);
7288
10043
  } catch (error) {
7289
10044
  console.error(
@@ -7308,7 +10063,16 @@ async function scrapeAction(library, url, options, command) {
7308
10063
  const globalOptions = command.parent?.opts() || {};
7309
10064
  setupLogging(globalOptions);
7310
10065
  const serverUrl = options.serverUrl;
7311
- const docService = await createDocumentManagement({ serverUrl });
10066
+ const embeddingConfig = resolveEmbeddingContext();
10067
+ if (!serverUrl && !embeddingConfig) {
10068
+ throw new Error(
10069
+ "Embedding configuration is required for local scraping. Please set DOCS_MCP_EMBEDDING_MODEL environment variable or use --server-url for remote execution."
10070
+ );
10071
+ }
10072
+ const docService = await createDocumentManagement({
10073
+ serverUrl,
10074
+ embeddingConfig
10075
+ });
7312
10076
  let pipeline = null;
7313
10077
  try {
7314
10078
  const pipelineOptions = {
@@ -7323,48 +10087,23 @@ async function scrapeAction(library, url, options, command) {
7323
10087
  await pipeline.start();
7324
10088
  const scrapeTool = new ScrapeTool(pipeline);
7325
10089
  const headers = parseHeaders(options.header);
7326
- const result = await trackTool(
7327
- "scrape_docs",
7328
- () => scrapeTool.execute({
7329
- url,
7330
- library,
7331
- version: options.version,
7332
- options: {
7333
- maxPages: Number.parseInt(options.maxPages, 10),
7334
- maxDepth: Number.parseInt(options.maxDepth, 10),
7335
- maxConcurrency: Number.parseInt(options.maxConcurrency, 10),
7336
- ignoreErrors: options.ignoreErrors,
7337
- scope: options.scope,
7338
- followRedirects: options.followRedirects,
7339
- scrapeMode: options.scrapeMode,
7340
- includePatterns: Array.isArray(options.includePattern) && options.includePattern.length > 0 ? options.includePattern : void 0,
7341
- excludePatterns: Array.isArray(options.excludePattern) && options.excludePattern.length > 0 ? options.excludePattern : void 0,
7342
- headers: Object.keys(headers).length > 0 ? headers : void 0
7343
- }
7344
- }),
7345
- (result2) => ({
7346
- library,
7347
- // Safe: library names are public
7348
- url_protocol: extractProtocol(url),
7349
- // Safe: only protocol, not full URL
7350
- max_pages: Number.parseInt(options.maxPages, 10),
7351
- max_depth: Number.parseInt(options.maxDepth, 10),
7352
- max_concurrency: Number.parseInt(options.maxConcurrency, 10),
7353
- has_version: !!options.version,
10090
+ const result = await scrapeTool.execute({
10091
+ url,
10092
+ library,
10093
+ version: options.version,
10094
+ options: {
10095
+ maxPages: Number.parseInt(options.maxPages, 10),
10096
+ maxDepth: Number.parseInt(options.maxDepth, 10),
10097
+ maxConcurrency: Number.parseInt(options.maxConcurrency, 10),
10098
+ ignoreErrors: options.ignoreErrors,
7354
10099
  scope: options.scope,
7355
- scrape_mode: options.scrapeMode,
7356
- ignore_errors: options.ignoreErrors,
7357
- follow_redirects: options.followRedirects,
7358
- has_include_patterns: Array.isArray(options.includePattern) && options.includePattern.length > 0,
7359
- has_exclude_patterns: Array.isArray(options.excludePattern) && options.excludePattern.length > 0,
7360
- has_custom_headers: Object.keys(headers).length > 0,
7361
- using_remote_server: !!serverUrl,
7362
- cli_flags: extractCliFlags(process.argv),
7363
- is_async_job: !("pagesScraped" in result2),
7364
- // Pipeline mode vs direct mode
7365
- pages_scraped: "pagesScraped" in result2 ? result2.pagesScraped : void 0
7366
- })
7367
- );
10100
+ followRedirects: options.followRedirects,
10101
+ scrapeMode: options.scrapeMode,
10102
+ includePatterns: Array.isArray(options.includePattern) && options.includePattern.length > 0 ? options.includePattern : void 0,
10103
+ excludePatterns: Array.isArray(options.excludePattern) && options.excludePattern.length > 0 ? options.excludePattern : void 0,
10104
+ headers: Object.keys(headers).length > 0 ? headers : void 0
10105
+ }
10106
+ });
7368
10107
  if ("pagesScraped" in result) {
7369
10108
  console.log(`✅ Successfully scraped ${result.pagesScraped} pages`);
7370
10109
  } else {
@@ -7443,31 +10182,25 @@ async function searchAction(library, query, options, command) {
7443
10182
  const globalOptions = command.parent?.opts() || {};
7444
10183
  setupLogging(globalOptions);
7445
10184
  const serverUrl = options.serverUrl;
7446
- const docService = await createDocumentManagement({ serverUrl });
10185
+ const embeddingConfig = resolveEmbeddingContext();
10186
+ if (!serverUrl && !embeddingConfig) {
10187
+ throw new Error(
10188
+ "Embedding configuration is required for local search. Please set DOCS_MCP_EMBEDDING_MODEL environment variable or use --server-url for remote execution."
10189
+ );
10190
+ }
10191
+ const docService = await createDocumentManagement({
10192
+ serverUrl,
10193
+ embeddingConfig
10194
+ });
7447
10195
  try {
7448
10196
  const searchTool = new SearchTool(docService);
7449
- const result = await trackTool(
7450
- "search_docs",
7451
- () => searchTool.execute({
7452
- library,
7453
- version: options.version,
7454
- query,
7455
- limit: Number.parseInt(options.limit, 10),
7456
- exactMatch: options.exactMatch
7457
- }),
7458
- (result2) => ({
7459
- library,
7460
- // Safe: library names are public
7461
- query_analysis: analyzeSearchQuery(query),
7462
- // Analyzed, not raw query
7463
- result_count: result2.results.length,
7464
- limit_used: Number.parseInt(options.limit, 10),
7465
- has_version_filter: !!options.version,
7466
- exact_match: options.exactMatch,
7467
- using_remote_server: !!serverUrl,
7468
- cli_flags: extractCliFlags(process.argv)
7469
- })
7470
- );
10197
+ const result = await searchTool.execute({
10198
+ library,
10199
+ version: options.version,
10200
+ query,
10201
+ limit: Number.parseInt(options.limit, 10),
10202
+ exactMatch: options.exactMatch
10203
+ });
7471
10204
  console.log(formatOutput(result.results));
7472
10205
  } finally {
7473
10206
  await docService.shutdown();
@@ -7503,8 +10236,16 @@ function createWebCommand(program) {
7503
10236
  const serverUrl = cmdOptions.serverUrl;
7504
10237
  setupLogging(globalOptions);
7505
10238
  try {
10239
+ const embeddingConfig = resolveEmbeddingContext();
10240
+ if (!serverUrl && !embeddingConfig) {
10241
+ logger.error(
10242
+ "❌ Embedding configuration is required for local mode. Configure an embedding provider with CLI options or environment variables."
10243
+ );
10244
+ process.exit(1);
10245
+ }
7506
10246
  const docService = await createDocumentManagement({
7507
- serverUrl
10247
+ serverUrl,
10248
+ embeddingConfig
7508
10249
  });
7509
10250
  const pipelineOptions = {
7510
10251
  recoverJobs: false,
@@ -7553,7 +10294,8 @@ function createWorkerCommand(program) {
7553
10294
  try {
7554
10295
  logger.info(`🚀 Starting external pipeline worker on port ${port}`);
7555
10296
  ensurePlaywrightBrowsersInstalled();
7556
- const docService = await createLocalDocumentManagement();
10297
+ const embeddingConfig = resolveEmbeddingContext();
10298
+ const docService = await createLocalDocumentManagement(embeddingConfig);
7557
10299
  const pipelineOptions = {
7558
10300
  recoverJobs: cmdOptions.resume,
7559
10301
  // Use the resume option
@@ -7581,7 +10323,7 @@ function createCliProgram() {
7581
10323
  program.name("docs-mcp-server").description("Unified CLI, MCP Server, and Web Interface for Docs MCP Server.").version(packageJson.version).addOption(
7582
10324
  new Option("--verbose", "Enable verbose (debug) logging").conflicts("silent")
7583
10325
  ).addOption(new Option("--silent", "Disable all logging except errors")).addOption(new Option("--no-telemetry", "Disable telemetry collection")).enablePositionalOptions().allowExcessArguments(false).showHelpAfterError(true);
7584
- program.hook("preAction", (thisCommand, actionCommand) => {
10326
+ program.hook("preAction", async (thisCommand, actionCommand) => {
7585
10327
  const globalOptions = thisCommand.opts();
7586
10328
  if (globalOptions.silent) setLogLevel(LogLevel.ERROR);
7587
10329
  else if (globalOptions.verbose) setLogLevel(LogLevel.DEBUG);
@@ -7672,7 +10414,11 @@ async function runCli() {
7672
10414
  });
7673
10415
  await program.parseAsync(process.argv);
7674
10416
  } catch (error) {
7675
- logger.error(`❌ Error in CLI: ${error}`);
10417
+ if (error instanceof ModelConfigurationError || error instanceof UnsupportedProviderError) {
10418
+ logger.error(error.message);
10419
+ } else {
10420
+ logger.error(`❌ Error in CLI: ${error}`);
10421
+ }
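The top-level catch now special-cases the two embedding configuration errors so their already user-facing messages print verbatim, while everything else keeps the generic "❌ Error in CLI:" wrapper. The same dispatch, condensed as a sketch:

    // Both classes are assumed to carry actionable, self-contained messages.
    const friendly =
      error instanceof ModelConfigurationError || error instanceof UnsupportedProviderError;
    logger.error(friendly ? error.message : `❌ Error in CLI: ${error}`);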
7676
10422
  if (!isShuttingDown) {
7677
10423
  isShuttingDown = true;
7678
10424
  const shutdownPromises = [];
@@ -7722,27 +10468,4 @@ runCli().catch((error) => {
7722
10468
  console.error(`🔥 Fatal error in main execution: ${error}`);
7723
10469
  process.exit(1);
7724
10470
  });
7725
- export {
7726
- ConnectionError as C,
7727
- DimensionError as D,
7728
- EMBEDDING_BATCH_CHARS as E,
7729
- LibraryNotFoundError as L,
7730
- StoreError as S,
7731
- TelemetryEvent as T,
7732
- VECTOR_DIMENSION as V,
7733
- applyMigrations as a,
7734
- EMBEDDING_BATCH_SIZE as b,
7735
- createJSDOM as c,
7736
- denormalizeVersionName as d,
7737
- SPLITTER_PREFERRED_CHUNK_SIZE as e,
7738
- SPLITTER_MAX_CHUNK_SIZE as f,
7739
- getProjectRoot as g,
7740
- VersionNotFoundError as h,
7741
- analytics as i,
7742
- extractHostname as j,
7743
- SPLITTER_MIN_CHUNK_SIZE as k,
7744
- logger as l,
7745
- mapDbDocumentToDocument as m,
7746
- normalizeVersionName as n
7747
- };
7748
10471
  //# sourceMappingURL=index.js.map