@arabold/docs-mcp-server 1.21.1 → 1.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,5 +1,10 @@
1
1
  #!/usr/bin/env node
2
2
  import "dotenv/config";
3
+ import { BedrockEmbeddings } from "@langchain/aws";
4
+ import { GoogleGenerativeAIEmbeddings } from "@langchain/google-genai";
5
+ import { VertexAIEmbeddings } from "@langchain/google-vertexai";
6
+ import { AzureOpenAIEmbeddings, OpenAIEmbeddings } from "@langchain/openai";
7
+ import { Embeddings } from "@langchain/core/embeddings";
3
8
  import { PostHog } from "posthog-node";
4
9
  import crypto, { randomUUID } from "node:crypto";
5
10
  import fs, { readFileSync, existsSync } from "node:fs";
@@ -40,14 +45,277 @@ import DOMPurify from "dompurify";
40
45
  import { fileURLToPath, URL as URL$1 } from "node:url";
41
46
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
42
47
  import { createTRPCProxyClient, httpBatchLink } from "@trpc/client";
43
- import "fuse.js";
44
- import "langchain/text_splitter";
45
- import "better-sqlite3";
46
- import "sqlite-vec";
48
+ import Fuse from "fuse.js";
49
+ import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
50
+ import Database from "better-sqlite3";
51
+ import * as sqliteVec from "sqlite-vec";
47
52
  import { execSync } from "node:child_process";
48
53
  import { v4 } from "uuid";
49
54
  import psl from "psl";
50
55
  import { minimatch } from "minimatch";
56
/**
 * Target embedding width, in vector components. createEmbeddingModel hands this
 * value to FixedDimensionEmbeddings as the dimension Gemini vectors are
 * truncated or padded to.
 */
const VECTOR_DIMENSION = 1536;
57
/**
 * Reshapes a stored document row into `{ id, pageContent, metadata }`.
 *
 * @param {{ id: *, content: *, metadata: string }} doc - Row whose `metadata`
 *   field holds JSON text.
 * @returns {{ id: *, pageContent: *, metadata: * }} The row's id, its content
 *   under `pageContent`, and the parsed metadata.
 * @throws {SyntaxError} If `doc.metadata` is not valid JSON.
 */
function mapDbDocumentToDocument(doc) {
  const { id, content, metadata } = doc;
  return { id, pageContent: content, metadata: JSON.parse(metadata) };
}
64
/**
 * String-valued enumeration of the indexing states a library version can be in.
 * Keys are the upper-case member names, values the lower-case wire strings.
 */
var VersionStatus = /* @__PURE__ */ ((members) => {
  const pairs = [
    ["NOT_INDEXED", "not_indexed"],
    ["QUEUED", "queued"],
    ["RUNNING", "running"],
    ["COMPLETED", "completed"],
    ["FAILED", "failed"],
    ["CANCELLED", "cancelled"],
    ["UPDATING", "updating"]
  ];
  for (const [memberName, memberValue] of pairs) {
    members[memberName] = memberValue;
  }
  return members;
})(VersionStatus || {});
74
/**
 * Maps a missing version name to the empty string.
 *
 * @param {string | null | undefined} name - Version name, possibly absent.
 * @returns {string} `""` when `name` is null or undefined, otherwise `name`.
 */
function normalizeVersionName(name) {
  if (name === null || name === undefined) {
    return "";
  }
  return name;
}
77
/**
 * Counterpart of normalizeVersionName. The empty string maps to the empty
 * string and every other value maps to itself, so the input is returned as is.
 *
 * @param {string} name - Normalized version name.
 * @returns {string} The same value that was passed in.
 */
function denormalizeVersionName(name) {
  return name;
}
80
/**
 * Returns the human-readable description for a version status string.
 *
 * @param {string} status - One of the VersionStatus values.
 * @returns {string} The matching description, or "Unknown status" for any
 *   value that is not a known status.
 */
function getStatusDescription(status) {
  const descriptions = {
    not_indexed: "Version created but not yet indexed",
    queued: "Waiting in queue for indexing",
    running: "Currently being indexed",
    completed: "Successfully indexed",
    failed: "Indexing failed",
    cancelled: "Indexing was cancelled",
    updating: "Re-indexing in progress"
  };
  // Own-key check: a bare `descriptions[status]` would also resolve inherited
  // members such as "constructor" or "toString" and return a function.
  return Object.hasOwn(descriptions, status) ? descriptions[status] : "Unknown status";
}
113
/**
 * Tells whether a version status denotes work that is pending or in progress.
 *
 * @param {string} status - A version status string.
 * @returns {boolean} True for "queued", "running" and "updating"; false for
 *   anything else.
 */
function isActiveStatus(status) {
  switch (status) {
    case "queued":
    case "running":
    case "updating":
      return true;
    default:
      return false;
  }
}
123
/**
 * Base error for store failures. Optionally wraps an underlying cause.
 *
 * The message becomes `"<message> caused by <cause>"` when a cause is given.
 * `cause` is kept on the instance and `name` is set to the concrete class
 * name so subclasses report themselves correctly.
 */
class StoreError extends Error {
  /**
   * @param {string} message - Description of the failure.
   * @param {*} [cause] - Underlying error or value that triggered it.
   */
  constructor(message, cause) {
    super(cause ? `${message} caused by ${cause}` : message);
    this.cause = cause;
    this.name = this.constructor.name;
    // Keep this error's own stack (where the store failure was raised) and
    // append the cause's trace instead of replacing one with the other.
    // Non-Error causes have no meaningful stack; their text is already part
    // of the message.
    if (cause instanceof Error && cause.stack) {
      this.stack = `${this.stack}\nCaused by: ${cause.stack}`;
    }
  }
}
134
/**
 * Raised when an embedding model produces vectors wider than the database
 * can store. The offending model name and both dimensions are exposed as
 * instance properties.
 */
class DimensionError extends StoreError {
  /**
   * @param {string} modelName - Identifier of the embedding model.
   * @param {number} modelDimension - Vector width the model produced.
   * @param {number} dbDimension - Fixed vector width of the database.
   */
  constructor(modelName, modelDimension, dbDimension) {
    const explanation = `Model "${modelName}" produces ${modelDimension}-dimensional vectors, which exceeds the database's fixed dimension of ${dbDimension}. Please use a model with dimension ≤ ${dbDimension}.`;
    super(explanation);
    Object.assign(this, { modelName, modelDimension, dbDimension });
  }
}
144
/**
 * StoreError subtype with no behavior of its own; it inherits the
 * (message, cause) constructor unchanged.
 */
class ConnectionError extends StoreError {}
146
/**
 * Embeddings wrapper that forces every vector produced by an inner embeddings
 * instance to a fixed length, by truncating or zero-padding.
 */
class FixedDimensionEmbeddings extends Embeddings {
  /**
   * @param {object} embeddings - Inner embeddings; must provide `embedQuery`
   *   and `embedDocuments`.
   * @param {number} targetDimension - Length every returned vector must have.
   * @param {string} providerAndModel - "provider:model" or a bare model name;
   *   a bare name is attributed to the "openai" provider.
   * @param {boolean} [allowTruncate=false] - Whether over-long vectors may be
   *   cut down to `targetDimension` instead of raising DimensionError.
   */
  constructor(embeddings, targetDimension, providerAndModel, allowTruncate = false) {
    super({});
    this.embeddings = embeddings;
    this.targetDimension = targetDimension;
    this.allowTruncate = allowTruncate;
    // Only the first ":" separates provider from model. Model ids may contain
    // further colons, so the remainder is re-joined, matching
    // createEmbeddingModel's parsing.
    const [providerOrModel, ...modelNameParts] = providerAndModel.split(":");
    const modelName = modelNameParts.join(":");
    this.provider = modelName ? providerOrModel : "openai";
    this.model = modelName || providerOrModel;
  }
  provider;
  model;
  /**
   * Normalize a vector to the target dimension by truncating (for MRL models) or padding.
   * @param {number[]} vector - Vector returned by the inner embeddings.
   * @returns {number[]} A vector of exactly `targetDimension` entries.
   * @throws {DimensionError} If vector is too large and truncation is not allowed
   */
  normalizeVector(vector) {
    const dimension = vector.length;
    if (dimension > this.targetDimension) {
      if (this.allowTruncate) {
        return vector.slice(0, this.targetDimension);
      }
      throw new DimensionError(
        `${this.provider}:${this.model}`,
        dimension,
        this.targetDimension
      );
    }
    if (dimension < this.targetDimension) {
      // Pad with zeros up to the target length.
      return [...vector, ...new Array(this.targetDimension - dimension).fill(0)];
    }
    return vector;
  }
  /**
   * Embeds a single query string and normalizes the resulting vector.
   * @param {string} text - Query text.
   * @returns {Promise<number[]>} Normalized vector.
   */
  async embedQuery(text) {
    const vector = await this.embeddings.embedQuery(text);
    return this.normalizeVector(vector);
  }
  /**
   * Embeds a batch of documents and normalizes each resulting vector.
   * @param {string[]} documents - Document texts.
   * @returns {Promise<number[][]>} One normalized vector per document.
   */
  async embedDocuments(documents) {
    const vectors = await this.embeddings.embedDocuments(documents);
    return vectors.map((vector) => this.normalizeVector(vector));
  }
}
188
/**
 * Thrown when an embedding provider prefix is not one of the supported ones.
 */
class UnsupportedProviderError extends Error {
  /**
   * @param {string} provider - The provider name that was not recognized.
   */
  constructor(provider) {
    // NOTE(review): the continuation lines of this template are reproduced
    // exactly as they appear in this view; confirm their leading indentation
    // against the upstream source.
    const message = `❌ Unsupported embedding provider: ${provider}
Supported providers: openai, vertex, gemini, aws, microsoft
See README.md for configuration options or run with --help for more details.`;
    super(message);
    this.name = "UnsupportedProviderError";
  }
}
198
/**
 * Thrown when the configuration needed to build an embedding model is
 * missing or incomplete (for example a required environment variable).
 */
class ModelConfigurationError extends Error {
  name = "ModelConfigurationError";

  /**
   * @param {string} message - Explanation of what is misconfigured.
   */
  constructor(message) {
    super(message);
  }
}
204
/**
 * Builds an embeddings client from a "provider:model" specification.
 *
 * Everything before the first ":" is the provider and everything after it is
 * the model id (which may itself contain colons). A specification without a
 * ":" is treated as a model of the "openai" provider. Each provider reads its
 * credentials from environment variables and fails fast when they are absent.
 *
 * @param {string} providerAndModel - e.g. "text-embedding-3-small",
 *   "gemini:gemini-embedding-exp-03-07" or "aws:amazon.titan-embed-text-v1".
 * @returns {object} An embeddings instance for the selected provider. Gemini
 *   models are wrapped in FixedDimensionEmbeddings with truncation enabled.
 * @throws {ModelConfigurationError} If required environment variables for the
 *   selected provider are missing.
 * @throws {UnsupportedProviderError} If the provider is not one of openai,
 *   vertex, gemini, aws, microsoft.
 */
function createEmbeddingModel(providerAndModel) {
  const [providerOrModel, ...modelNameParts] = providerAndModel.split(":");
  const modelName = modelNameParts.join(":");
  const provider = modelName ? providerOrModel : "openai";
  const model = modelName || providerOrModel;
  const baseConfig = { stripNewLines: true };
  switch (provider) {
    case "openai": {
      if (!process.env.OPENAI_API_KEY) {
        throw new ModelConfigurationError(
          "❌ Missing API key for embedding provider\n Please set OPENAI_API_KEY or configure an alternative embedding model.\n See README.md for configuration options or run with --help for more details."
        );
      }
      const config = {
        ...baseConfig,
        modelName: model,
        batchSize: 512
        // OpenAI supports large batches
      };
      // Optional override of the API endpoint.
      const baseURL = process.env.OPENAI_API_BASE;
      if (baseURL) {
        config.configuration = { baseURL };
      }
      return new OpenAIEmbeddings(config);
    }
    case "vertex": {
      if (!process.env.GOOGLE_APPLICATION_CREDENTIALS) {
        throw new ModelConfigurationError(
          "❌ Missing credentials for Google Cloud Vertex AI\n Please set GOOGLE_APPLICATION_CREDENTIALS or configure an alternative embedding model.\n See README.md for configuration options or run with --help for more details."
        );
      }
      return new VertexAIEmbeddings({
        ...baseConfig,
        model
        // e.g., "text-embedding-004"
      });
    }
    case "gemini": {
      if (!process.env.GOOGLE_API_KEY) {
        throw new ModelConfigurationError(
          "❌ Missing API key for Google AI (Gemini)\n Please set GOOGLE_API_KEY or configure an alternative embedding model.\n See README.md for configuration options or run with --help for more details."
        );
      }
      const baseEmbeddings = new GoogleGenerativeAIEmbeddings({
        ...baseConfig,
        apiKey: process.env.GOOGLE_API_KEY,
        model
        // e.g., "gemini-embedding-exp-03-07"
      });
      // Gemini vectors are normalized to VECTOR_DIMENSION; truncation is allowed.
      return new FixedDimensionEmbeddings(
        baseEmbeddings,
        VECTOR_DIMENSION,
        providerAndModel,
        true
      );
    }
    case "aws": {
      const region = process.env.BEDROCK_AWS_REGION || process.env.AWS_REGION;
      if (!region) {
        throw new ModelConfigurationError(
          "BEDROCK_AWS_REGION or AWS_REGION environment variable is required for AWS Bedrock"
        );
      }
      const accessKeyId = process.env.AWS_ACCESS_KEY_ID;
      const secretAccessKey = process.env.AWS_SECRET_ACCESS_KEY;
      // Static keys only count when BOTH halves are present. A lone access key
      // or lone secret must not satisfy the check, as the message states.
      const hasKeyPair = Boolean(accessKeyId && secretAccessKey);
      if (!process.env.AWS_PROFILE && !hasKeyPair) {
        throw new ModelConfigurationError(
          "Either AWS_PROFILE or both AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables are required for AWS Bedrock"
        );
      }
      // With a key pair, pass explicit credentials; otherwise pass none.
      const credentials = hasKeyPair ? {
        accessKeyId,
        secretAccessKey,
        sessionToken: process.env.AWS_SESSION_TOKEN
      } : void 0;
      return new BedrockEmbeddings({
        ...baseConfig,
        model,
        // e.g., "amazon.titan-embed-text-v1"
        region,
        ...credentials ? { credentials } : {}
      });
    }
    case "microsoft": {
      // Checked in this order so the first missing variable is the one reported.
      const requiredVariables = [
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_API_INSTANCE_NAME",
        "AZURE_OPENAI_API_DEPLOYMENT_NAME",
        "AZURE_OPENAI_API_VERSION"
      ];
      for (const variableName of requiredVariables) {
        if (!process.env[variableName]) {
          throw new ModelConfigurationError(
            `${variableName} environment variable is required for Azure OpenAI`
          );
        }
      }
      return new AzureOpenAIEmbeddings({
        ...baseConfig,
        azureOpenAIApiKey: process.env.AZURE_OPENAI_API_KEY,
        azureOpenAIApiInstanceName: process.env.AZURE_OPENAI_API_INSTANCE_NAME,
        azureOpenAIApiDeploymentName: process.env.AZURE_OPENAI_API_DEPLOYMENT_NAME,
        azureOpenAIApiVersion: process.env.AZURE_OPENAI_API_VERSION,
        deploymentName: model
      });
    }
    default:
      throw new UnsupportedProviderError(provider);
  }
}
51
319
  const LogLevel = {
52
320
  ERROR: 0,
53
321
  WARN: 1,
@@ -106,6 +374,41 @@ const logger = {
106
374
  }
107
375
  }
108
376
  };
377
/**
 * Rewrites a camelCase identifier as snake_case: every ASCII capital letter
 * is replaced by an underscore followed by its lower-case form. All other
 * characters are copied through untouched.
 *
 * @param {string} str - Identifier to convert.
 * @returns {string} The converted identifier.
 */
function camelToSnakeCase(str) {
  let converted = "";
  for (const ch of str) {
    const isAsciiCapital = ch >= "A" && ch <= "Z";
    converted += isAsciiCapital ? `_${ch.toLowerCase()}` : ch;
  }
  return converted;
}
380
/**
 * Returns a copy of `obj` in which every key, at every nesting level, has been
 * passed through camelToSnakeCase.
 *
 * Nested objects are converted recursively. Arrays stay arrays at any depth,
 * with their object elements converted. Dates, primitives, null and undefined
 * are kept as they are.
 *
 * @param {object} obj - Object whose keys should be converted.
 * @returns {object} A new object with snake_case keys.
 */
function convertPropertiesToSnakeCase(obj) {
  // Arrays are tested first so an array nested in another array is mapped
  // element by element, not turned into an index-keyed plain object.
  const convertValue = (value) => {
    if (Array.isArray(value)) {
      return value.map((item) => convertValue(item));
    }
    if (value && typeof value === "object" && !(value instanceof Date)) {
      return convertPropertiesToSnakeCase(value);
    }
    return value;
  };
  const result = {};
  for (const [key, value] of Object.entries(obj)) {
    result[camelToSnakeCase(key)] = convertValue(value);
  }
  return result;
}
396
/**
 * Returns a copy of `properties` in which three well-known fields are moved
 * to their `$`-prefixed names:
 *   sessionId  -> $session_id
 *   startTime  -> $start_timestamp (via toISOString())
 *   appVersion -> $app_version
 * A field is moved only when its value is truthy. The input is not modified.
 *
 * @param {object} properties - Event properties.
 * @returns {object} New object with the renamed fields.
 */
function addPostHogStandardProperties(properties) {
  const renames = [
    ["sessionId", "$session_id", (value) => value],
    ["startTime", "$start_timestamp", (value) => value.toISOString()],
    ["appVersion", "$app_version", (value) => value]
  ];
  const result = { ...properties };
  for (const [sourceKey, targetKey, transform] of renames) {
    const value = properties[sourceKey];
    if (value) {
      result[targetKey] = transform(value);
      delete result[sourceKey];
    }
  }
  return result;
}
109
412
  class PostHogClient {
110
413
  client;
111
414
  enabled;
@@ -156,10 +459,12 @@ class PostHogClient {
156
459
  capture(distinctId, event, properties) {
157
460
  if (!this.enabled || !this.client) return;
158
461
  try {
462
+ const enhancedProperties = addPostHogStandardProperties(properties);
463
+ const snakeCaseProperties = convertPropertiesToSnakeCase(enhancedProperties);
159
464
  this.client.capture({
160
465
  distinctId,
161
466
  event,
162
- properties
467
+ properties: snakeCaseProperties
163
468
  });
164
469
  logger.debug(`PostHog event captured: ${event}`);
165
470
  } catch (error) {
@@ -168,6 +473,26 @@ class PostHogClient {
168
473
  );
169
474
  }
170
475
  }
476
+ /**
477
+ * Capture exception using PostHog's native error tracking
478
+ */
479
+ captureException(distinctId, error, properties) {
480
+ if (!this.enabled || !this.client) return;
481
+ try {
482
+ const enhancedProperties = addPostHogStandardProperties(properties || {});
483
+ const snakeCaseProperties = convertPropertiesToSnakeCase(enhancedProperties);
484
+ this.client.captureException({
485
+ error,
486
+ distinctId,
487
+ properties: snakeCaseProperties
488
+ });
489
+ logger.debug(`PostHog exception captured: ${error.constructor.name}`);
490
+ } catch (captureError) {
491
+ logger.debug(
492
+ `PostHog captureException error: ${captureError instanceof Error ? captureError.message : "Unknown error"}`
493
+ );
494
+ }
495
+ }
171
496
  /**
172
497
  * Graceful shutdown with event flushing
173
498
  */
@@ -204,7 +529,7 @@ class SessionTracker {
204
529
  endSession() {
205
530
  if (!this.sessionContext) return null;
206
531
  const duration = Date.now() - this.sessionContext.startTime.getTime();
207
- const sessionInterface = this.sessionContext.interface;
532
+ const sessionInterface = this.sessionContext.appInterface;
208
533
  this.sessionContext = void 0;
209
534
  return { duration, interface: sessionInterface };
210
535
  }
@@ -214,6 +539,14 @@ class SessionTracker {
214
539
  getSessionContext() {
215
540
  return this.sessionContext;
216
541
  }
542
+ /**
543
+ * Update session context with additional fields
544
+ */
545
+ updateSessionContext(updates) {
546
+ if (this.sessionContext) {
547
+ this.sessionContext = { ...this.sessionContext, ...updates };
548
+ }
549
+ }
217
550
  /**
218
551
  * Get enriched properties with session context
219
552
  */
@@ -294,8 +627,6 @@ var TelemetryEvent = /* @__PURE__ */ ((TelemetryEvent2) => {
294
627
  TelemetryEvent2["PIPELINE_JOB_PROGRESS"] = "pipeline_job_progress";
295
628
  TelemetryEvent2["PIPELINE_JOB_COMPLETED"] = "pipeline_job_completed";
296
629
  TelemetryEvent2["DOCUMENT_PROCESSED"] = "document_processed";
297
- TelemetryEvent2["DOCUMENT_PROCESSING_FAILED"] = "document_processing_failed";
298
- TelemetryEvent2["ERROR_OCCURRED"] = "error_occurred";
299
630
  return TelemetryEvent2;
300
631
  })(TelemetryEvent || {});
301
632
  class Analytics {
@@ -321,23 +652,34 @@ class Analytics {
321
652
  if (!this.enabled) return;
322
653
  this.sessionTracker.startSession(context);
323
654
  this.track("session_started", {
324
- interface: context.interface,
325
- version: context.version,
326
- platform: context.platform,
327
- sessionDurationTarget: context.interface === "cli" ? "short" : "long",
328
- authEnabled: context.authEnabled,
329
- readOnly: context.readOnly,
330
- servicesCount: context.servicesEnabled.length
655
+ interface: context.appInterface,
656
+ version: context.appVersion,
657
+ platform: context.appPlatform,
658
+ authEnabled: context.appAuthEnabled,
659
+ readOnly: context.appReadOnly,
660
+ servicesCount: context.appServicesEnabled.length
331
661
  });
332
662
  }
333
663
  /**
334
- * Track an event with automatic session context inclusion
664
+ * Update session context with additional fields (e.g., embedding model info)
335
665
  */
666
+ updateSessionContext(updates) {
667
+ if (!this.enabled) return;
668
+ this.sessionTracker.updateSessionContext(updates);
669
+ }
336
670
  track(event, properties = {}) {
337
671
  if (!this.enabled) return;
338
672
  const eventProperties = this.sessionTracker.getEnrichedProperties(properties);
339
673
  this.postHogClient.capture(this.distinctId, event, eventProperties);
340
674
  }
675
+ /**
676
+ * Capture exception using PostHog's native error tracking with session context
677
+ */
678
+ captureException(error, properties = {}) {
679
+ if (!this.enabled) return;
680
+ const eventProperties = this.sessionTracker.getEnrichedProperties(properties);
681
+ this.postHogClient.captureException(this.distinctId, error, eventProperties);
682
+ }
341
683
  /**
342
684
  * Track session end with duration
343
685
  */
@@ -369,29 +711,38 @@ class Analytics {
369
711
  getSessionContext() {
370
712
  return this.sessionTracker.getSessionContext();
371
713
  }
372
- }
373
- const analytics = new Analytics();
374
- async function trackTool(toolName, operation, getProperties) {
375
- const startTime = Date.now();
376
- try {
377
- const result = await operation();
378
- analytics.track("tool_used", {
379
- tool: toolName,
380
- success: true,
381
- durationMs: Date.now() - startTime,
382
- ...getProperties ? getProperties(result) : {}
383
- });
384
- return result;
385
- } catch (error) {
386
- analytics.track("tool_used", {
387
- tool: toolName,
388
- success: false,
389
- durationMs: Date.now() - startTime,
390
- errorType: error instanceof Error ? error.constructor.name : "UnknownError"
391
- });
392
- throw error;
714
+ /**
715
+ * Track tool usage with error handling and automatic timing
716
+ */
717
+ async trackTool(toolName, operation, getProperties) {
718
+ const startTime = Date.now();
719
+ try {
720
+ const result = await operation();
721
+ this.track("tool_used", {
722
+ tool: toolName,
723
+ success: true,
724
+ durationMs: Date.now() - startTime,
725
+ ...getProperties ? getProperties(result) : {}
726
+ });
727
+ return result;
728
+ } catch (error) {
729
+ this.track("tool_used", {
730
+ tool: toolName,
731
+ success: false,
732
+ durationMs: Date.now() - startTime
733
+ });
734
+ if (error instanceof Error) {
735
+ this.captureException(error, {
736
+ tool: toolName,
737
+ context: "tool_execution",
738
+ durationMs: Date.now() - startTime
739
+ });
740
+ }
741
+ throw error;
742
+ }
393
743
  }
394
744
  }
745
+ const analytics = new Analytics();
395
746
  function extractHostname(url) {
396
747
  try {
397
748
  const parsed = new URL(url);
@@ -411,28 +762,7 @@ function extractProtocol(urlOrPath) {
411
762
  return "unknown";
412
763
  }
413
764
  }
414
- function analyzeSearchQuery(query) {
415
- return {
416
- length: query.length,
417
- wordCount: query.trim().split(/\s+/).length,
418
- hasCodeTerms: /\b(function|class|import|export|const|let|var|def|async|await)\b/i.test(query),
419
- hasSpecialChars: /[^\w\s]/.test(query)
420
- };
421
- }
422
- function sanitizeErrorMessage(message) {
423
- return message.replace(/https?:\/\/[^\s]+/gi, "[url]").replace(/file:\/\/[^\s]+/gi, "[file-url]").replace(/\/[^\s]*\.[a-z]{2,4}/gi, "[path]").replace(/[A-Za-z]:\\[^\s]+/g, "[path]").replace(/Bearer\s+[^\s]+/gi, "Bearer [token]").replace(/api[_-]?key[=:]\s*[^\s]+/gi, "api_key=[redacted]").replace(/token[=:]\s*[^\s]+/gi, "token=[redacted]").substring(0, 200);
424
- }
425
- function sanitizeError(error) {
426
- return {
427
- type: error.constructor.name,
428
- message: sanitizeErrorMessage(error.message),
429
- hasStack: Boolean(error.stack)
430
- };
431
- }
432
- function extractCliFlags(argv) {
433
- return argv.filter((arg) => arg.startsWith("--") || arg.startsWith("-"));
434
- }
435
- const version = "1.21.0";
765
+ const version = "1.21.1";
436
766
  const packageJson = {
437
767
  version
438
768
  };
@@ -440,34 +770,48 @@ function getPackageVersion() {
440
770
  return packageJson.version;
441
771
  }
442
772
  function createCliSession(command, options) {
443
- return {
773
+ const baseSession = {
444
774
  sessionId: randomUUID(),
445
- interface: "cli",
775
+ appInterface: "cli",
446
776
  startTime: /* @__PURE__ */ new Date(),
447
- version: getPackageVersion(),
448
- platform: process.platform,
449
- nodeVersion: process.version,
450
- command: command || "unknown",
451
- authEnabled: options?.authEnabled,
452
- readOnly: options?.readOnly,
453
- servicesEnabled: ["worker"]
777
+ appVersion: getPackageVersion(),
778
+ appPlatform: process.platform,
779
+ appNodeVersion: process.version,
780
+ cliCommand: command || "unknown",
781
+ appAuthEnabled: options?.authEnabled,
782
+ appReadOnly: options?.readOnly,
783
+ appServicesEnabled: ["worker"]
454
784
  // CLI typically runs embedded worker
455
785
  };
786
+ if (options?.embeddingContext) {
787
+ return {
788
+ ...baseSession,
789
+ ...options.embeddingContext
790
+ };
791
+ }
792
+ return baseSession;
456
793
  }
457
794
  function createMcpSession(options) {
458
- return {
795
+ const baseSession = {
459
796
  sessionId: randomUUID(),
460
- interface: "mcp",
797
+ appInterface: "mcp",
461
798
  startTime: /* @__PURE__ */ new Date(),
462
- version: getPackageVersion(),
463
- platform: process.platform,
464
- nodeVersion: process.version,
465
- protocol: options.protocol || "stdio",
466
- transport: options.transport,
467
- authEnabled: options.authEnabled ?? false,
468
- readOnly: options.readOnly ?? false,
469
- servicesEnabled: options.servicesEnabled ?? ["mcp"]
799
+ appVersion: getPackageVersion(),
800
+ appPlatform: process.platform,
801
+ appNodeVersion: process.version,
802
+ mcpProtocol: options.protocol || "stdio",
803
+ mcpTransport: options.transport,
804
+ appAuthEnabled: options.authEnabled ?? false,
805
+ appReadOnly: options.readOnly ?? false,
806
+ appServicesEnabled: options.servicesEnabled ?? ["mcp"]
470
807
  };
808
+ if (options.embeddingContext) {
809
+ return {
810
+ ...baseSession,
811
+ ...options.embeddingContext
812
+ };
813
+ }
814
+ return baseSession;
471
815
  }
472
816
  function createTelemetryService() {
473
817
  return {
@@ -930,42 +1274,55 @@ class CancelJobTool {
930
1274
  * @returns A promise that resolves with the outcome message.
931
1275
  */
932
1276
  async execute(input) {
933
- try {
934
- const job = await this.pipeline.getJob(input.jobId);
935
- if (!job) {
936
- logger.warn(`❓ [CancelJobTool] Job not found: ${input.jobId}`);
937
- return {
938
- message: `Job with ID ${input.jobId} not found.`,
939
- success: false
940
- };
941
- }
942
- if (job.status === PipelineJobStatus.COMPLETED || // Use enum member
943
- job.status === PipelineJobStatus.FAILED || // Use enum member
944
- job.status === PipelineJobStatus.CANCELLED) {
945
- logger.debug(`Job ${input.jobId} is already in a final state: ${job.status}.`);
1277
+ return analytics.trackTool(
1278
+ "cancel_job",
1279
+ async () => {
1280
+ try {
1281
+ const job = await this.pipeline.getJob(input.jobId);
1282
+ if (!job) {
1283
+ logger.warn(`❓ [CancelJobTool] Job not found: ${input.jobId}`);
1284
+ return {
1285
+ message: `Job with ID ${input.jobId} not found.`,
1286
+ success: false
1287
+ };
1288
+ }
1289
+ if (job.status === PipelineJobStatus.COMPLETED || // Use enum member
1290
+ job.status === PipelineJobStatus.FAILED || // Use enum member
1291
+ job.status === PipelineJobStatus.CANCELLED) {
1292
+ logger.debug(
1293
+ `Job ${input.jobId} is already in a final state: ${job.status}.`
1294
+ );
1295
+ return {
1296
+ message: `Job ${input.jobId} is already ${job.status}. No action taken.`,
1297
+ success: true
1298
+ // Considered success as no cancellation needed
1299
+ };
1300
+ }
1301
+ await this.pipeline.cancelJob(input.jobId);
1302
+ const updatedJob = await this.pipeline.getJob(input.jobId);
1303
+ const finalStatus = updatedJob?.status ?? "UNKNOWN (job disappeared?)";
1304
+ logger.debug(
1305
+ `Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}`
1306
+ );
1307
+ return {
1308
+ message: `Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}.`,
1309
+ success: true
1310
+ };
1311
+ } catch (error) {
1312
+ logger.error(`❌ Error cancelling job ${input.jobId}: ${error}`);
1313
+ return {
1314
+ message: `Failed to cancel job ${input.jobId}: ${error instanceof Error ? error.message : String(error)}`,
1315
+ success: false
1316
+ };
1317
+ }
1318
+ },
1319
+ (result) => {
946
1320
  return {
947
- message: `Job ${input.jobId} is already ${job.status}. No action taken.`,
948
- success: true
949
- // Considered success as no cancellation needed
1321
+ success: result.success
1322
+ // Note: success flag already indicates if cancellation was successful
950
1323
  };
951
1324
  }
952
- await this.pipeline.cancelJob(input.jobId);
953
- const updatedJob = await this.pipeline.getJob(input.jobId);
954
- const finalStatus = updatedJob?.status ?? "UNKNOWN (job disappeared?)";
955
- logger.debug(
956
- `Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}`
957
- );
958
- return {
959
- message: `Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}.`,
960
- success: true
961
- };
962
- } catch (error) {
963
- logger.error(`❌ Error cancelling job ${input.jobId}: ${error}`);
964
- return {
965
- message: `Failed to cancel job ${input.jobId}: ${error instanceof Error ? error.message : String(error)}`,
966
- success: false
967
- };
968
- }
1325
+ );
969
1326
  }
970
1327
  }
971
1328
  class ClearCompletedJobsTool {
@@ -983,24 +1340,33 @@ class ClearCompletedJobsTool {
983
1340
  * @returns A promise that resolves with the outcome of the clear operation.
984
1341
  */
985
1342
  async execute(_input) {
986
- try {
987
- const clearedCount = await this.pipeline.clearCompletedJobs();
988
- const message = clearedCount > 0 ? `Successfully cleared ${clearedCount} completed job${clearedCount === 1 ? "" : "s"} from the queue.` : "No completed jobs to clear.";
989
- logger.debug(message);
990
- return {
991
- message,
992
- success: true,
993
- clearedCount
994
- };
995
- } catch (error) {
996
- const errorMessage = `Failed to clear completed jobs: ${error instanceof Error ? error.message : String(error)}`;
997
- logger.error(`❌ ${errorMessage}`);
998
- return {
999
- message: errorMessage,
1000
- success: false,
1001
- clearedCount: 0
1002
- };
1003
- }
1343
+ return analytics.trackTool(
1344
+ "clear_completed_jobs",
1345
+ async () => {
1346
+ try {
1347
+ const clearedCount = await this.pipeline.clearCompletedJobs();
1348
+ const message = clearedCount > 0 ? `Successfully cleared ${clearedCount} completed job${clearedCount === 1 ? "" : "s"} from the queue.` : "No completed jobs to clear.";
1349
+ logger.debug(message);
1350
+ return {
1351
+ message,
1352
+ success: true,
1353
+ clearedCount
1354
+ };
1355
+ } catch (error) {
1356
+ const errorMessage = `Failed to clear completed jobs: ${error instanceof Error ? error.message : String(error)}`;
1357
+ logger.error(`❌ ${errorMessage}`);
1358
+ return {
1359
+ message: errorMessage,
1360
+ success: false,
1361
+ clearedCount: 0
1362
+ };
1363
+ }
1364
+ },
1365
+ (result) => ({
1366
+ success: result.success,
1367
+ clearedCount: result.clearedCount
1368
+ })
1369
+ );
1004
1370
  }
1005
1371
  }
1006
1372
  class ToolError extends Error {
@@ -1938,91 +2304,106 @@ class FetchUrlTool {
1938
2304
  * @throws {ToolError} If fetching or processing fails
1939
2305
  */
1940
2306
  async execute(options) {
1941
- const { url, scrapeMode = ScrapeMode.Auto, headers } = options;
1942
- const canFetchResults = this.fetchers.map((f) => f.canFetch(url));
1943
- const fetcherIndex = canFetchResults.indexOf(true);
1944
- if (fetcherIndex === -1) {
1945
- throw new ToolError(
1946
- `Invalid URL: ${url}. Must be an HTTP/HTTPS URL or a file:// URL.`,
1947
- this.constructor.name
1948
- );
1949
- }
1950
- const fetcher = this.fetchers[fetcherIndex];
1951
- const htmlPipeline = new HtmlPipeline();
1952
- const markdownPipeline = new MarkdownPipeline();
1953
- const pipelines = [htmlPipeline, markdownPipeline];
1954
- try {
1955
- logger.info(`📡 Fetching ${url}...`);
1956
- const rawContent = await fetcher.fetch(url, {
1957
- followRedirects: options.followRedirects ?? true,
1958
- maxRetries: 3,
1959
- headers
1960
- // propagate custom headers
1961
- });
1962
- logger.info("🔄 Processing content...");
1963
- let processed;
1964
- for (const pipeline of pipelines) {
1965
- if (pipeline.canProcess(rawContent)) {
1966
- processed = await pipeline.process(
1967
- rawContent,
1968
- {
1969
- url,
1970
- library: "",
1971
- version: "",
1972
- maxDepth: 0,
1973
- maxPages: 1,
1974
- maxConcurrency: 1,
1975
- scope: "subpages",
1976
- followRedirects: options.followRedirects ?? true,
1977
- excludeSelectors: void 0,
1978
- ignoreErrors: false,
1979
- scrapeMode,
1980
- headers
1981
- // propagate custom headers
1982
- },
1983
- fetcher
2307
+ return analytics.trackTool(
2308
+ "fetch_url",
2309
+ async () => {
2310
+ const { url, scrapeMode = ScrapeMode.Auto, headers } = options;
2311
+ const canFetchResults = this.fetchers.map((f) => f.canFetch(url));
2312
+ const fetcherIndex = canFetchResults.indexOf(true);
2313
+ if (fetcherIndex === -1) {
2314
+ throw new ToolError(
2315
+ `Invalid URL: ${url}. Must be an HTTP/HTTPS URL or a file:// URL.`,
2316
+ this.constructor.name
1984
2317
  );
1985
- break;
1986
2318
  }
2319
+ const fetcher = this.fetchers[fetcherIndex];
2320
+ const htmlPipeline = new HtmlPipeline();
2321
+ const markdownPipeline = new MarkdownPipeline();
2322
+ const pipelines = [htmlPipeline, markdownPipeline];
2323
+ try {
2324
+ logger.info(`📡 Fetching ${url}...`);
2325
+ const rawContent = await fetcher.fetch(url, {
2326
+ followRedirects: options.followRedirects ?? true,
2327
+ maxRetries: 3,
2328
+ headers
2329
+ // propagate custom headers
2330
+ });
2331
+ logger.info("🔄 Processing content...");
2332
+ let processed;
2333
+ for (const pipeline of pipelines) {
2334
+ if (pipeline.canProcess(rawContent)) {
2335
+ processed = await pipeline.process(
2336
+ rawContent,
2337
+ {
2338
+ url,
2339
+ library: "",
2340
+ version: "",
2341
+ maxDepth: 0,
2342
+ maxPages: 1,
2343
+ maxConcurrency: 1,
2344
+ scope: "subpages",
2345
+ followRedirects: options.followRedirects ?? true,
2346
+ excludeSelectors: void 0,
2347
+ ignoreErrors: false,
2348
+ scrapeMode,
2349
+ headers
2350
+ // propagate custom headers
2351
+ },
2352
+ fetcher
2353
+ );
2354
+ break;
2355
+ }
2356
+ }
2357
+ if (!processed) {
2358
+ logger.warn(
2359
+ `⚠️ Unsupported content type "${rawContent.mimeType}" for ${url}. Returning raw content.`
2360
+ );
2361
+ const resolvedCharset = resolveCharset(
2362
+ rawContent.charset,
2363
+ rawContent.content,
2364
+ rawContent.mimeType
2365
+ );
2366
+ const contentString = convertToString(rawContent.content, resolvedCharset);
2367
+ return contentString;
2368
+ }
2369
+ for (const err of processed.errors) {
2370
+ logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
2371
+ }
2372
+ if (typeof processed.textContent !== "string" || !processed.textContent.trim()) {
2373
+ throw new ToolError(
2374
+ `Processing resulted in empty content for ${url}`,
2375
+ this.constructor.name
2376
+ );
2377
+ }
2378
+ logger.info(`✅ Successfully processed ${url}`);
2379
+ return processed.textContent;
2380
+ } catch (error) {
2381
+ if (error instanceof ScraperError || error instanceof ToolError) {
2382
+ throw new ToolError(
2383
+ `Failed to fetch or process URL: ${error.message}`,
2384
+ this.constructor.name
2385
+ );
2386
+ }
2387
+ throw new ToolError(
2388
+ `Failed to fetch or process URL: ${error instanceof Error ? error.message : String(error)}`,
2389
+ this.constructor.name
2390
+ );
2391
+ } finally {
2392
+ await htmlPipeline.close();
2393
+ await markdownPipeline.close();
2394
+ }
2395
+ },
2396
+ (result) => {
2397
+ const { url, scrapeMode, followRedirects, headers } = options;
2398
+ return {
2399
+ url,
2400
+ scrapeMode,
2401
+ followRedirects,
2402
+ contentLength: result.length,
2403
+ hasHeaders: !!headers
2404
+ };
1987
2405
  }
1988
- if (!processed) {
1989
- logger.warn(
1990
- `⚠️ Unsupported content type "${rawContent.mimeType}" for ${url}. Returning raw content.`
1991
- );
1992
- const resolvedCharset = resolveCharset(
1993
- rawContent.charset,
1994
- rawContent.content,
1995
- rawContent.mimeType
1996
- );
1997
- const contentString = convertToString(rawContent.content, resolvedCharset);
1998
- return contentString;
1999
- }
2000
- for (const err of processed.errors) {
2001
- logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
2002
- }
2003
- if (typeof processed.textContent !== "string" || !processed.textContent.trim()) {
2004
- throw new ToolError(
2005
- `Processing resulted in empty content for ${url}`,
2006
- this.constructor.name
2007
- );
2008
- }
2009
- logger.info(`✅ Successfully processed ${url}`);
2010
- return processed.textContent;
2011
- } catch (error) {
2012
- if (error instanceof ScraperError || error instanceof ToolError) {
2013
- throw new ToolError(
2014
- `Failed to fetch or process URL: ${error.message}`,
2015
- this.constructor.name
2016
- );
2017
- }
2018
- throw new ToolError(
2019
- `Failed to fetch or process URL: ${error instanceof Error ? error.message : String(error)}`,
2020
- this.constructor.name
2021
- );
2022
- } finally {
2023
- await htmlPipeline.close();
2024
- await markdownPipeline.close();
2025
- }
2406
+ );
2026
2407
  }
2027
2408
  }
2028
2409
  class FindVersionTool {
@@ -2035,35 +2416,50 @@ class FindVersionTool {
2035
2416
  * @returns A descriptive string indicating the best match and unversioned status, or an error message.
2036
2417
  */
2037
2418
  async execute(options) {
2038
- const { library, targetVersion } = options;
2039
- const libraryAndVersion = `${library}${targetVersion ? `@${targetVersion}` : ""}`;
2040
- try {
2041
- const { bestMatch, hasUnversioned } = await this.docService.findBestVersion(
2042
- library,
2043
- targetVersion
2044
- );
2045
- let message = "";
2046
- if (bestMatch) {
2047
- message = `Best match: ${bestMatch}.`;
2048
- if (hasUnversioned) {
2049
- message += " Unversioned docs also available.";
2419
+ return analytics.trackTool(
2420
+ "find_version",
2421
+ async () => {
2422
+ const { library, targetVersion } = options;
2423
+ const libraryAndVersion = `${library}${targetVersion ? `@${targetVersion}` : ""}`;
2424
+ try {
2425
+ const { bestMatch, hasUnversioned } = await this.docService.findBestVersion(
2426
+ library,
2427
+ targetVersion
2428
+ );
2429
+ let message = "";
2430
+ if (bestMatch) {
2431
+ message = `Best match: ${bestMatch}.`;
2432
+ if (hasUnversioned) {
2433
+ message += " Unversioned docs also available.";
2434
+ }
2435
+ } else if (hasUnversioned) {
2436
+ message = `No matching version found for ${libraryAndVersion}, but unversioned docs exist.`;
2437
+ } else {
2438
+ message = `No matching version or unversioned documents found for ${libraryAndVersion}.`;
2439
+ }
2440
+ return { message, bestMatch, hasUnversioned };
2441
+ } catch (error) {
2442
+ if (error instanceof VersionNotFoundError) {
2443
+ logger.info(`ℹ️ Version not found: ${error.message}`);
2444
+ const message = `No matching version or unversioned documents found for ${libraryAndVersion}. Available: ${error.availableVersions.length > 0 ? error.availableVersions.map((v) => v.version).join(", ") : "None"}.`;
2445
+ return { message, bestMatch: null, hasUnversioned: false };
2446
+ }
2447
+ logger.error(
2448
+ `❌ Error finding version for ${libraryAndVersion}: ${error instanceof Error ? error.message : error}`
2449
+ );
2450
+ throw error;
2050
2451
  }
2051
- } else if (hasUnversioned) {
2052
- message = `No matching version found for ${libraryAndVersion}, but unversioned docs exist.`;
2053
- } else {
2054
- message = `No matching version or unversioned documents found for ${libraryAndVersion}.`;
2055
- }
2056
- return message;
2057
- } catch (error) {
2058
- if (error instanceof VersionNotFoundError) {
2059
- logger.info(`ℹ️ Version not found: ${error.message}`);
2060
- return `No matching version or unversioned documents found for ${libraryAndVersion}. Available: ${error.availableVersions.length > 0 ? error.availableVersions.map((v) => v.version).join(", ") : "None"}.`;
2452
+ },
2453
+ (result) => {
2454
+ const { library, targetVersion } = options;
2455
+ return {
2456
+ library,
2457
+ targetVersion,
2458
+ foundMatch: !!result.bestMatch,
2459
+ hasUnversioned: result.hasUnversioned
2460
+ };
2061
2461
  }
2062
- logger.error(
2063
- `❌ Error finding version for ${libraryAndVersion}: ${error instanceof Error ? error.message : error}`
2064
- );
2065
- throw error;
2066
- }
2462
+ ).then((result) => result.message);
2067
2463
  }
2068
2464
  }
2069
2465
  class GetJobInfoTool {
@@ -2081,29 +2477,41 @@ class GetJobInfoTool {
2081
2477
  * @returns A promise that resolves with the simplified job info or null if not found.
2082
2478
  */
2083
2479
  async execute(input) {
2084
- const job = await this.pipeline.getJob(input.jobId);
2085
- if (!job) {
2086
- return { job: null };
2087
- }
2088
- const jobInfo = {
2089
- id: job.id,
2090
- library: job.library,
2091
- version: job.version,
2092
- status: job.status,
2093
- dbStatus: job.versionStatus,
2094
- createdAt: job.createdAt.toISOString(),
2095
- startedAt: job.startedAt?.toISOString() ?? null,
2096
- finishedAt: job.finishedAt?.toISOString() ?? null,
2097
- error: job.error?.message ?? null,
2098
- progress: job.progressMaxPages && job.progressMaxPages > 0 ? {
2099
- pages: job.progressPages || 0,
2100
- totalPages: job.progressMaxPages,
2101
- totalDiscovered: job.progress?.totalDiscovered || job.progressMaxPages
2102
- } : void 0,
2103
- updatedAt: job.updatedAt?.toISOString(),
2104
- errorMessage: job.errorMessage ?? void 0
2105
- };
2106
- return { job: jobInfo };
2480
+ return analytics.trackTool(
2481
+ "get_job_info",
2482
+ async () => {
2483
+ const job = await this.pipeline.getJob(input.jobId);
2484
+ if (!job) {
2485
+ return { job: null };
2486
+ }
2487
+ const jobInfo = {
2488
+ id: job.id,
2489
+ library: job.library,
2490
+ version: job.version,
2491
+ status: job.status,
2492
+ dbStatus: job.versionStatus,
2493
+ createdAt: job.createdAt.toISOString(),
2494
+ startedAt: job.startedAt?.toISOString() ?? null,
2495
+ finishedAt: job.finishedAt?.toISOString() ?? null,
2496
+ error: job.error?.message ?? null,
2497
+ progress: job.progressMaxPages && job.progressMaxPages > 0 ? {
2498
+ pages: job.progressPages || 0,
2499
+ totalPages: job.progressMaxPages,
2500
+ totalDiscovered: job.progress?.totalDiscovered || job.progressMaxPages
2501
+ } : void 0,
2502
+ updatedAt: job.updatedAt?.toISOString(),
2503
+ errorMessage: job.errorMessage ?? void 0
2504
+ };
2505
+ return { job: jobInfo };
2506
+ },
2507
+ (result) => {
2508
+ return {
2509
+ found: result.job !== null,
2510
+ library: result.job?.library,
2511
+ version: result.job?.version
2512
+ };
2513
+ }
2514
+ );
2107
2515
  }
2108
2516
  }
2109
2517
  class ListJobsTool {
@@ -2121,28 +2529,45 @@ class ListJobsTool {
2121
2529
  * @returns A promise that resolves with the list of simplified job objects.
2122
2530
  */
2123
2531
  async execute(input) {
2124
- const jobs = await this.pipeline.getJobs(input.status);
2125
- const simplifiedJobs = jobs.map((job) => {
2126
- return {
2127
- id: job.id,
2128
- library: job.library,
2129
- version: job.version,
2130
- status: job.status,
2131
- dbStatus: job.versionStatus,
2132
- createdAt: job.createdAt.toISOString(),
2133
- startedAt: job.startedAt?.toISOString() ?? null,
2134
- finishedAt: job.finishedAt?.toISOString() ?? null,
2135
- error: job.error?.message ?? null,
2136
- progress: job.progressMaxPages && job.progressMaxPages > 0 ? {
2137
- pages: job.progressPages || 0,
2138
- totalPages: job.progressMaxPages,
2139
- totalDiscovered: job.progress?.totalDiscovered || job.progressMaxPages
2140
- } : void 0,
2141
- updatedAt: job.updatedAt?.toISOString(),
2142
- errorMessage: job.errorMessage ?? void 0
2143
- };
2144
- });
2145
- return { jobs: simplifiedJobs };
2532
+ return analytics.trackTool(
2533
+ "list_jobs",
2534
+ async () => {
2535
+ const jobs = await this.pipeline.getJobs(input.status);
2536
+ const simplifiedJobs = jobs.map((job) => {
2537
+ return {
2538
+ id: job.id,
2539
+ library: job.library,
2540
+ version: job.version,
2541
+ status: job.status,
2542
+ dbStatus: job.versionStatus,
2543
+ createdAt: job.createdAt.toISOString(),
2544
+ startedAt: job.startedAt?.toISOString() ?? null,
2545
+ finishedAt: job.finishedAt?.toISOString() ?? null,
2546
+ error: job.error?.message ?? null,
2547
+ progress: job.progressMaxPages && job.progressMaxPages > 0 ? {
2548
+ pages: job.progressPages || 0,
2549
+ totalPages: job.progressMaxPages,
2550
+ totalDiscovered: job.progress?.totalDiscovered || job.progressMaxPages
2551
+ } : void 0,
2552
+ updatedAt: job.updatedAt?.toISOString(),
2553
+ errorMessage: job.errorMessage ?? void 0
2554
+ };
2555
+ });
2556
+ return { jobs: simplifiedJobs };
2557
+ },
2558
+ (result) => {
2559
+ return {
2560
+ jobCount: result.jobs.length,
2561
+ statusCounts: result.jobs.reduce(
2562
+ (acc, job) => {
2563
+ acc[job.status] = (acc[job.status] || 0) + 1;
2564
+ return acc;
2565
+ },
2566
+ {}
2567
+ )
2568
+ };
2569
+ }
2570
+ );
2146
2571
  }
2147
2572
  }
2148
2573
  class ListLibrariesTool {
@@ -2151,20 +2576,32 @@ class ListLibrariesTool {
2151
2576
  this.docService = docService;
2152
2577
  }
2153
2578
  async execute(_options) {
2154
- const rawLibraries = await this.docService.listLibraries();
2155
- const libraries = rawLibraries.map(({ library, versions }) => ({
2156
- name: library,
2157
- versions: versions.map((v) => ({
2158
- version: v.ref.version,
2159
- documentCount: v.counts.documents,
2160
- uniqueUrlCount: v.counts.uniqueUrls,
2161
- indexedAt: v.indexedAt,
2162
- status: v.status,
2163
- ...v.progress ? { progress: v.progress } : void 0,
2164
- sourceUrl: v.sourceUrl
2165
- }))
2166
- }));
2167
- return { libraries };
2579
+ return analytics.trackTool(
2580
+ "list_libraries",
2581
+ async () => {
2582
+ const rawLibraries = await this.docService.listLibraries();
2583
+ const libraries = rawLibraries.map(({ library, versions }) => ({
2584
+ name: library,
2585
+ versions: versions.map((v) => ({
2586
+ version: v.ref.version,
2587
+ documentCount: v.counts.documents,
2588
+ uniqueUrlCount: v.counts.uniqueUrls,
2589
+ indexedAt: v.indexedAt,
2590
+ status: v.status,
2591
+ ...v.progress ? { progress: v.progress } : void 0,
2592
+ sourceUrl: v.sourceUrl
2593
+ }))
2594
+ }));
2595
+ return { libraries };
2596
+ },
2597
+ (result) => ({
2598
+ libraryCount: result.libraries.length,
2599
+ totalVersions: result.libraries.reduce(
2600
+ (sum, lib) => sum + lib.versions.length,
2601
+ 0
2602
+ )
2603
+ })
2604
+ );
2168
2605
  }
2169
2606
  }
2170
2607
  class RemoveTool {
@@ -2178,29 +2615,42 @@ class RemoveTool {
2178
2615
  * Removes all documents, the version record, and the library if no other versions exist.
2179
2616
  */
2180
2617
  async execute(args) {
2181
- const { library, version: version2 } = args;
2182
- logger.info(`🗑️ Removing library: ${library}${version2 ? `@${version2}` : ""}`);
2183
- try {
2184
- const allJobs = await this.pipeline.getJobs();
2185
- const jobs = allJobs.filter(
2186
- (job) => job.library === library && job.version === (version2 ?? "") && (job.status === PipelineJobStatus.QUEUED || job.status === PipelineJobStatus.RUNNING)
2187
- );
2188
- for (const job of jobs) {
2189
- logger.info(
2190
- `🚫 Aborting job for ${library}@${version2 ?? ""} before deletion: ${job.id}`
2191
- );
2192
- await this.pipeline.cancelJob(job.id);
2193
- await this.pipeline.waitForJobCompletion(job.id);
2618
+ return analytics.trackTool(
2619
+ "remove_docs",
2620
+ async () => {
2621
+ const { library, version: version2 } = args;
2622
+ logger.info(`🗑️ Removing library: ${library}${version2 ? `@${version2}` : ""}`);
2623
+ try {
2624
+ const allJobs = await this.pipeline.getJobs();
2625
+ const jobs = allJobs.filter(
2626
+ (job) => job.library === library && job.version === (version2 ?? "") && (job.status === PipelineJobStatus.QUEUED || job.status === PipelineJobStatus.RUNNING)
2627
+ );
2628
+ for (const job of jobs) {
2629
+ logger.info(
2630
+ `🚫 Aborting job for ${library}@${version2 ?? ""} before deletion: ${job.id}`
2631
+ );
2632
+ await this.pipeline.cancelJob(job.id);
2633
+ await this.pipeline.waitForJobCompletion(job.id);
2634
+ }
2635
+ await this.documentManagementService.removeVersion(library, version2);
2636
+ const message = `Successfully removed ${library}${version2 ? `@${version2}` : ""}.`;
2637
+ logger.info(`✅ ${message}`);
2638
+ return { message };
2639
+ } catch (error) {
2640
+ const errorMessage = `Failed to remove ${library}${version2 ? `@${version2}` : ""}: ${error instanceof Error ? error.message : String(error)}`;
2641
+ logger.error(`❌ Error removing library: ${errorMessage}`);
2642
+ throw new ToolError(errorMessage, this.constructor.name);
2643
+ }
2644
+ },
2645
+ () => {
2646
+ const { library, version: version2 } = args;
2647
+ return {
2648
+ library,
2649
+ version: version2
2650
+ // Success is implicit since if this callback runs, no exception was thrown
2651
+ };
2194
2652
  }
2195
- await this.documentManagementService.removeVersion(library, version2);
2196
- const message = `Successfully removed ${library}${version2 ? `@${version2}` : ""}.`;
2197
- logger.info(`✅ ${message}`);
2198
- return { message };
2199
- } catch (error) {
2200
- const errorMessage = `Failed to remove ${library}${version2 ? `@${version2}` : ""}: ${error instanceof Error ? error.message : String(error)}`;
2201
- logger.error(`❌ Error removing library: ${errorMessage}`);
2202
- throw new ToolError(errorMessage, this.constructor.name);
2203
- }
2653
+ );
2204
2654
  }
2205
2655
  }
2206
2656
  class ScrapeTool {
@@ -2216,66 +2666,80 @@ class ScrapeTool {
2216
2666
  options: scraperOptions,
2217
2667
  waitForCompletion = true
2218
2668
  } = options;
2219
- let internalVersion;
2220
- const partialVersionRegex = /^\d+(\.\d+)?$/;
2221
- if (version2 === null || version2 === void 0) {
2222
- internalVersion = "";
2223
- } else {
2224
- const validFullVersion = semver.valid(version2);
2225
- if (validFullVersion) {
2226
- internalVersion = validFullVersion;
2227
- } else if (partialVersionRegex.test(version2)) {
2228
- const coercedVersion = semver.coerce(version2);
2229
- if (coercedVersion) {
2230
- internalVersion = coercedVersion.version;
2669
+ return analytics.trackTool(
2670
+ "scrape_docs",
2671
+ async () => {
2672
+ let internalVersion;
2673
+ const partialVersionRegex = /^\d+(\.\d+)?$/;
2674
+ if (version2 === null || version2 === void 0) {
2675
+ internalVersion = "";
2231
2676
  } else {
2232
- throw new Error(
2233
- `Invalid version format for scraping: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`
2234
- );
2677
+ const validFullVersion = semver.valid(version2);
2678
+ if (validFullVersion) {
2679
+ internalVersion = validFullVersion;
2680
+ } else if (partialVersionRegex.test(version2)) {
2681
+ const coercedVersion = semver.coerce(version2);
2682
+ if (coercedVersion) {
2683
+ internalVersion = coercedVersion.version;
2684
+ } else {
2685
+ throw new Error(
2686
+ `Invalid version format for scraping: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`
2687
+ );
2688
+ }
2689
+ } else {
2690
+ throw new Error(
2691
+ `Invalid version format for scraping: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`
2692
+ );
2693
+ }
2235
2694
  }
2236
- } else {
2237
- throw new Error(
2238
- `Invalid version format for scraping: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`
2239
- );
2240
- }
2241
- }
2242
- internalVersion = internalVersion.toLowerCase();
2243
- const pipeline = this.pipeline;
2244
- const enqueueVersion = internalVersion === "" ? null : internalVersion;
2245
- const jobId = await pipeline.enqueueJob(library, enqueueVersion, {
2246
- url,
2247
- library,
2248
- version: internalVersion,
2249
- scope: scraperOptions?.scope ?? "subpages",
2250
- followRedirects: scraperOptions?.followRedirects ?? true,
2251
- maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES,
2252
- maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH$1,
2253
- maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
2254
- ignoreErrors: scraperOptions?.ignoreErrors ?? true,
2255
- scrapeMode: scraperOptions?.scrapeMode ?? ScrapeMode.Auto,
2256
- // Pass scrapeMode enum
2257
- includePatterns: scraperOptions?.includePatterns,
2258
- excludePatterns: scraperOptions?.excludePatterns,
2259
- headers: scraperOptions?.headers
2260
- // <-- propagate headers
2261
- });
2262
- if (waitForCompletion) {
2263
- try {
2264
- await pipeline.waitForJobCompletion(jobId);
2265
- const finalJob = await pipeline.getJob(jobId);
2266
- const finalPagesScraped = finalJob?.progress?.pagesScraped ?? 0;
2267
- logger.debug(
2268
- `Job ${jobId} finished with status ${finalJob?.status}. Pages scraped: ${finalPagesScraped}`
2269
- );
2270
- return {
2271
- pagesScraped: finalPagesScraped
2272
- };
2273
- } catch (error) {
2274
- logger.error(`❌ Job ${jobId} failed or was cancelled: ${error}`);
2275
- throw error;
2276
- }
2277
- }
2278
- return { jobId };
2695
+ internalVersion = internalVersion.toLowerCase();
2696
+ const pipeline = this.pipeline;
2697
+ const enqueueVersion = internalVersion === "" ? null : internalVersion;
2698
+ const jobId = await pipeline.enqueueJob(library, enqueueVersion, {
2699
+ url,
2700
+ library,
2701
+ version: internalVersion,
2702
+ scope: scraperOptions?.scope ?? "subpages",
2703
+ followRedirects: scraperOptions?.followRedirects ?? true,
2704
+ maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES,
2705
+ maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH$1,
2706
+ maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
2707
+ ignoreErrors: scraperOptions?.ignoreErrors ?? true,
2708
+ scrapeMode: scraperOptions?.scrapeMode ?? ScrapeMode.Auto,
2709
+ // Pass scrapeMode enum
2710
+ includePatterns: scraperOptions?.includePatterns,
2711
+ excludePatterns: scraperOptions?.excludePatterns,
2712
+ headers: scraperOptions?.headers
2713
+ // <-- propagate headers
2714
+ });
2715
+ if (waitForCompletion) {
2716
+ try {
2717
+ await pipeline.waitForJobCompletion(jobId);
2718
+ const finalJob = await pipeline.getJob(jobId);
2719
+ const finalPagesScraped = finalJob?.progress?.pagesScraped ?? 0;
2720
+ logger.debug(
2721
+ `Job ${jobId} finished with status ${finalJob?.status}. Pages scraped: ${finalPagesScraped}`
2722
+ );
2723
+ return {
2724
+ pagesScraped: finalPagesScraped
2725
+ };
2726
+ } catch (error) {
2727
+ logger.error(`❌ Job ${jobId} failed or was cancelled: ${error}`);
2728
+ throw error;
2729
+ }
2730
+ }
2731
+ return { jobId };
2732
+ },
2733
+ (result) => ({
2734
+ library,
2735
+ version: version2,
2736
+ url,
2737
+ waitForCompletion,
2738
+ ...scraperOptions,
2739
+ isBackgroundJob: "jobId" in result,
2740
+ pagesScraped: "pagesScraped" in result ? result.pagesScraped : void 0
2741
+ })
2742
+ );
2279
2743
  }
2280
2744
  }
2281
2745
  class SearchTool {
@@ -2285,43 +2749,56 @@ class SearchTool {
2285
2749
  }
2286
2750
  async execute(options) {
2287
2751
  const { library, version: version2, query, limit = 5, exactMatch = false } = options;
2288
- if (exactMatch && (!version2 || version2 === "latest")) {
2289
- await this.docService.validateLibraryExists(library);
2290
- const allLibraries = await this.docService.listLibraries();
2291
- const libraryInfo = allLibraries.find((lib) => lib.library === library);
2292
- const detailedVersions = libraryInfo ? libraryInfo.versions.map((v) => ({
2293
- version: v.ref.version,
2294
- documentCount: v.counts.documents,
2295
- uniqueUrlCount: v.counts.uniqueUrls,
2296
- indexedAt: v.indexedAt
2297
- })) : [];
2298
- throw new VersionNotFoundError(library, version2 ?? "latest", detailedVersions);
2299
- }
2300
- const resolvedVersion = version2 || "latest";
2301
- logger.info(
2302
- `🔍 Searching ${library}@${resolvedVersion} for: ${query}${exactMatch ? " (exact match)" : ""}`
2303
- );
2304
- try {
2305
- await this.docService.validateLibraryExists(library);
2306
- let versionToSearch = resolvedVersion;
2307
- if (!exactMatch) {
2308
- const versionResult = await this.docService.findBestVersion(library, version2);
2309
- versionToSearch = versionResult.bestMatch;
2310
- }
2311
- const results = await this.docService.searchStore(
2752
+ return analytics.trackTool(
2753
+ "search_docs",
2754
+ async () => {
2755
+ if (exactMatch && (!version2 || version2 === "latest")) {
2756
+ await this.docService.validateLibraryExists(library);
2757
+ const allLibraries = await this.docService.listLibraries();
2758
+ const libraryInfo = allLibraries.find((lib) => lib.library === library);
2759
+ const detailedVersions = libraryInfo ? libraryInfo.versions.map((v) => ({
2760
+ version: v.ref.version,
2761
+ documentCount: v.counts.documents,
2762
+ uniqueUrlCount: v.counts.uniqueUrls,
2763
+ indexedAt: v.indexedAt
2764
+ })) : [];
2765
+ throw new VersionNotFoundError(library, version2 ?? "latest", detailedVersions);
2766
+ }
2767
+ const resolvedVersion = version2 || "latest";
2768
+ logger.info(
2769
+ `🔍 Searching ${library}@${resolvedVersion} for: ${query}${exactMatch ? " (exact match)" : ""}`
2770
+ );
2771
+ try {
2772
+ await this.docService.validateLibraryExists(library);
2773
+ let versionToSearch = resolvedVersion;
2774
+ if (!exactMatch) {
2775
+ const versionResult = await this.docService.findBestVersion(library, version2);
2776
+ versionToSearch = versionResult.bestMatch;
2777
+ }
2778
+ const results = await this.docService.searchStore(
2779
+ library,
2780
+ versionToSearch,
2781
+ query,
2782
+ limit
2783
+ );
2784
+ logger.info(`✅ Found ${results.length} matching results`);
2785
+ return { results };
2786
+ } catch (error) {
2787
+ logger.error(
2788
+ `❌ Search failed: ${error instanceof Error ? error.message : "Unknown error"}`
2789
+ );
2790
+ throw error;
2791
+ }
2792
+ },
2793
+ (result) => ({
2312
2794
  library,
2313
- versionToSearch,
2795
+ version: version2,
2314
2796
  query,
2315
- limit
2316
- );
2317
- logger.info(`✅ Found ${results.length} matching results`);
2318
- return { results };
2319
- } catch (error) {
2320
- logger.error(
2321
- `❌ Search failed: ${error instanceof Error ? error.message : "Unknown error"}`
2322
- );
2323
- throw error;
2324
- }
2797
+ limit,
2798
+ exactMatch,
2799
+ resultCount: result.results.length
2800
+ })
2801
+ );
2325
2802
  }
2326
2803
  }
2327
2804
  function createResponse(text) {
@@ -3051,6 +3528,7 @@ async function registerMcpService(server, docService, pipeline, readOnly = false
3051
3528
  authEnabled: !!authManager,
3052
3529
  readOnly,
3053
3530
  servicesEnabled: ["mcp"]
3531
+ // Embedding context will be resolved by the service that starts MCP
3054
3532
  });
3055
3533
  analytics.startSession(session);
3056
3534
  }
@@ -3101,23 +3579,10 @@ async function registerMcpService(server, docService, pipeline, readOnly = false
3101
3579
  const requestTransport = new StreamableHTTPServerTransport({
3102
3580
  sessionIdGenerator: void 0
3103
3581
  });
3104
- if (analytics.isEnabled()) {
3105
- const session = createMcpSession({
3106
- protocol: "http",
3107
- transport: "streamable",
3108
- authEnabled: !!authManager,
3109
- readOnly,
3110
- servicesEnabled: ["mcp"]
3111
- });
3112
- analytics.startSession(session);
3113
- }
3114
3582
  reply.raw.on("close", () => {
3115
3583
  logger.debug("Streamable HTTP request closed");
3116
3584
  requestTransport.close();
3117
3585
  requestServer.close();
3118
- if (analytics.isEnabled()) {
3119
- analytics.endSession();
3120
- }
3121
3586
  });
3122
3587
  await requestServer.connect(requestTransport);
3123
3588
  await requestTransport.handleRequest(request.raw, reply.raw, request.body);
@@ -3261,6 +3726,15 @@ function createDataRouter(trpc) {
3261
3726
  return results;
3262
3727
  }
3263
3728
  ),
3729
+ removeVersion: tt.procedure.input(z$1.object({ library: nonEmpty, version: optionalVersion })).mutation(
3730
+ async ({
3731
+ ctx,
3732
+ input
3733
+ }) => {
3734
+ await ctx.docService.removeVersion(input.library, input.version ?? null);
3735
+ return { ok: true };
3736
+ }
3737
+ ),
3264
3738
  removeAllDocuments: tt.procedure.input(z$1.object({ library: nonEmpty, version: optionalVersion })).mutation(
3265
3739
  async ({
3266
3740
  ctx,
@@ -3518,97 +3992,30 @@ function registerClearCompletedJobsRoute(server, clearCompletedJobsTool) {
3518
3992
  }
3519
3993
  });
3520
3994
  }
3521
- const VECTOR_DIMENSION = 1536;
3522
- function mapDbDocumentToDocument(doc) {
3523
- return {
3524
- id: doc.id,
3525
- pageContent: doc.content,
3526
- metadata: JSON.parse(doc.metadata)
3527
- };
3528
- }
3529
- var VersionStatus = /* @__PURE__ */ ((VersionStatus2) => {
3530
- VersionStatus2["NOT_INDEXED"] = "not_indexed";
3531
- VersionStatus2["QUEUED"] = "queued";
3532
- VersionStatus2["RUNNING"] = "running";
3533
- VersionStatus2["COMPLETED"] = "completed";
3534
- VersionStatus2["FAILED"] = "failed";
3535
- VersionStatus2["CANCELLED"] = "cancelled";
3536
- VersionStatus2["UPDATING"] = "updating";
3537
- return VersionStatus2;
3538
- })(VersionStatus || {});
3539
- function normalizeVersionName(name) {
3540
- return name ?? "";
3541
- }
3542
- function denormalizeVersionName(name) {
3543
- return name === "" ? "" : name;
3544
- }
3545
- function getStatusDescription(status) {
3546
- const descriptions = {
3547
- [
3548
- "not_indexed"
3549
- /* NOT_INDEXED */
3550
- ]: "Version created but not yet indexed",
3551
- [
3552
- "queued"
3553
- /* QUEUED */
3554
- ]: "Waiting in queue for indexing",
3555
- [
3556
- "running"
3557
- /* RUNNING */
3558
- ]: "Currently being indexed",
3559
- [
3560
- "completed"
3561
- /* COMPLETED */
3562
- ]: "Successfully indexed",
3563
- [
3564
- "failed"
3565
- /* FAILED */
3566
- ]: "Indexing failed",
3567
- [
3568
- "cancelled"
3569
- /* CANCELLED */
3570
- ]: "Indexing was cancelled",
3571
- [
3572
- "updating"
3573
- /* UPDATING */
3574
- ]: "Re-indexing in progress"
3575
- };
3576
- return descriptions[status] || "Unknown status";
3577
- }
3578
- function isActiveStatus(status) {
3579
- return [
3580
- "queued",
3581
- "running",
3582
- "updating"
3583
- /* UPDATING */
3584
- ].includes(
3585
- status
3586
- );
3587
- }
3588
- const VersionBadge = ({ version: version2 }) => {
3589
- if (!version2) {
3590
- return null;
3591
- }
3592
- return /* @__PURE__ */ jsx("span", { class: "bg-purple-100 text-purple-800 text-xs font-medium me-2 px-1.5 py-0.5 rounded dark:bg-purple-900 dark:text-purple-300", children: /* @__PURE__ */ jsx("span", { safe: true, children: version2 }) });
3593
- };
3594
- function getStatusClasses(status) {
3595
- const baseClasses = "px-1.5 py-0.5 text-xs font-medium rounded";
3596
- switch (status) {
3597
- case VersionStatus.COMPLETED:
3598
- return `${baseClasses} bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-300`;
3599
- case VersionStatus.RUNNING:
3600
- case VersionStatus.UPDATING:
3601
- return `${baseClasses} bg-blue-100 text-blue-800 dark:bg-blue-900 dark:text-blue-300`;
3602
- case VersionStatus.QUEUED:
3603
- return `${baseClasses} bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-yellow-300`;
3604
- case VersionStatus.FAILED:
3605
- return `${baseClasses} bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-300`;
3606
- case VersionStatus.CANCELLED:
3607
- return `${baseClasses} bg-gray-100 text-gray-800 dark:bg-gray-900 dark:text-gray-300`;
3608
- case VersionStatus.NOT_INDEXED:
3609
- default:
3610
- return `${baseClasses} bg-gray-100 text-gray-600 dark:bg-gray-800 dark:text-gray-400`;
3611
- }
3995
+ const VersionBadge = ({ version: version2 }) => {
3996
+ if (!version2) {
3997
+ return null;
3998
+ }
3999
+ return /* @__PURE__ */ jsx("span", { class: "bg-purple-100 text-purple-800 text-xs font-medium me-2 px-1.5 py-0.5 rounded dark:bg-purple-900 dark:text-purple-300", children: /* @__PURE__ */ jsx("span", { safe: true, children: version2 }) });
4000
+ };
4001
+ function getStatusClasses(status) {
4002
+ const baseClasses = "px-1.5 py-0.5 text-xs font-medium rounded";
4003
+ switch (status) {
4004
+ case VersionStatus.COMPLETED:
4005
+ return `${baseClasses} bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-300`;
4006
+ case VersionStatus.RUNNING:
4007
+ case VersionStatus.UPDATING:
4008
+ return `${baseClasses} bg-blue-100 text-blue-800 dark:bg-blue-900 dark:text-blue-300`;
4009
+ case VersionStatus.QUEUED:
4010
+ return `${baseClasses} bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-yellow-300`;
4011
+ case VersionStatus.FAILED:
4012
+ return `${baseClasses} bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-300`;
4013
+ case VersionStatus.CANCELLED:
4014
+ return `${baseClasses} bg-gray-100 text-gray-800 dark:bg-gray-900 dark:text-gray-300`;
4015
+ case VersionStatus.NOT_INDEXED:
4016
+ default:
4017
+ return `${baseClasses} bg-gray-100 text-gray-600 dark:bg-gray-800 dark:text-gray-400`;
4018
+ }
3612
4019
  }
3613
4020
  const StatusBadge = ({ status, showDescription = true }) => /* @__PURE__ */ jsx("span", { class: getStatusClasses(status), children: showDescription ? getStatusDescription(status) : status });
3614
4021
  const ProgressBar = ({ progress, showText = true }) => {
@@ -4940,16 +5347,12 @@ async function registerWorkerService(pipeline) {
4940
5347
  logger.warn(
4941
5348
  `⚠️ Job ${job.id} error ${document ? `on document ${document.metadata.url}` : ""}: ${error.message}`
4942
5349
  );
4943
- const errorInfo = sanitizeError(error);
4944
- analytics.track(TelemetryEvent.ERROR_OCCURRED, {
5350
+ analytics.captureException(error, {
4945
5351
  jobId: job.id,
4946
5352
  // Job IDs are already anonymous
4947
5353
  library: job.library,
4948
- errorType: errorInfo.type,
4949
- errorMessage: errorInfo.message,
4950
5354
  hasDocument: !!document,
4951
5355
  stage: document ? "document_processing" : "job_setup",
4952
- hasStack: errorInfo.hasStack,
4953
5356
  pages_processed_before_error: job.progressPages || 0
4954
5357
  });
4955
5358
  }
@@ -5028,13 +5431,13 @@ class AppServer {
5028
5431
  try {
5029
5432
  telemetryService.startSession({
5030
5433
  sessionId: crypto.randomUUID(),
5031
- interface: "web",
5434
+ appInterface: "web",
5032
5435
  startTime: /* @__PURE__ */ new Date(),
5033
- version: process.env.npm_package_version || "unknown",
5034
- platform: process.platform,
5035
- servicesEnabled: this.getActiveServicesList(),
5036
- authEnabled: Boolean(this.config.auth),
5037
- readOnly: Boolean(this.config.readOnly)
5436
+ appVersion: process.env.npm_package_version || "unknown",
5437
+ appPlatform: process.platform,
5438
+ appServicesEnabled: this.getActiveServicesList(),
5439
+ appAuthEnabled: Boolean(this.config.auth),
5440
+ appReadOnly: Boolean(this.config.readOnly)
5038
5441
  });
5039
5442
  } catch (error) {
5040
5443
  logger.debug(`Failed to initialize telemetry: ${error}`);
@@ -5111,11 +5514,10 @@ class AppServer {
5111
5514
  process.on("unhandledRejection", (reason) => {
5112
5515
  logger.error(`Unhandled Promise Rejection: ${reason}`);
5113
5516
  if (analytics.isEnabled()) {
5114
- analytics.track(TelemetryEvent.ERROR_OCCURRED, {
5115
- error_type: "UnhandledPromiseRejection",
5517
+ const error = reason instanceof Error ? reason : new Error(String(reason));
5518
+ analytics.captureException(error, {
5116
5519
  error_category: "system",
5117
- component: "AppServer",
5118
- severity: "critical",
5520
+ component: AppServer.constructor.name,
5119
5521
  context: "process_unhandled_rejection"
5120
5522
  });
5121
5523
  }
@@ -5125,11 +5527,9 @@ class AppServer {
5125
5527
  process.on("uncaughtException", (error) => {
5126
5528
  logger.error(`Uncaught Exception: ${error.message}`);
5127
5529
  if (analytics.isEnabled()) {
5128
- analytics.track(TelemetryEvent.ERROR_OCCURRED, {
5129
- error_type: error.constructor.name,
5530
+ analytics.captureException(error, {
5130
5531
  error_category: "system",
5131
- component: "AppServer",
5132
- severity: "critical",
5532
+ component: AppServer.constructor.name,
5133
5533
  context: "process_uncaught_exception"
5134
5534
  });
5135
5535
  }
@@ -5138,11 +5538,9 @@ class AppServer {
5138
5538
  if (typeof this.server.setErrorHandler === "function") {
5139
5539
  this.server.setErrorHandler(async (error, request, reply) => {
5140
5540
  if (analytics.isEnabled()) {
5141
- analytics.track(TelemetryEvent.ERROR_OCCURRED, {
5142
- error_type: error.constructor.name,
5541
+ analytics.captureException(error, {
5143
5542
  error_category: "http",
5144
5543
  component: "FastifyServer",
5145
- severity: "high",
5146
5544
  status_code: error.statusCode || 500,
5147
5545
  method: request.method,
5148
5546
  route: request.routeOptions?.url || request.url,
@@ -5274,195 +5672,2564 @@ class AppServer {
5274
5672
  logger.debug("OAuth2 proxy endpoints registered");
5275
5673
  }
5276
5674
  /**
5277
- * Log startup information showing which services are enabled.
5675
+ * Log startup information showing which services are enabled.
5676
+ */
5677
+ logStartupInfo(address) {
5678
+ logger.info(`🚀 AppServer available at ${address}`);
5679
+ const enabledServices = [];
5680
+ if (this.config.enableWebInterface) {
5681
+ enabledServices.push(`Web interface: ${address}`);
5682
+ }
5683
+ if (this.config.enableMcpServer) {
5684
+ enabledServices.push(`MCP endpoints: ${address}/mcp, ${address}/sse`);
5685
+ }
5686
+ if (this.config.enableApiServer) {
5687
+ enabledServices.push(`API: ${address}/api`);
5688
+ }
5689
+ if (this.config.enableWorker) {
5690
+ enabledServices.push("Embedded worker: enabled");
5691
+ } else if (this.config.externalWorkerUrl) {
5692
+ enabledServices.push(`External worker: ${this.config.externalWorkerUrl}`);
5693
+ }
5694
+ for (const service of enabledServices) {
5695
+ logger.info(` • ${service}`);
5696
+ }
5697
+ }
5698
+ }
5699
/**
 * Builds an AppServer from the given document service, pipeline and
 * configuration, starts it, and resolves with the started instance.
 */
async function startAppServer(docService, pipeline, config) {
  const server = new AppServer(docService, pipeline, config);
  await server.start();
  return server;
}
5704
/**
 * Creates an MCP server instance for the given tools and connects it to a
 * stdio transport.
 *
 * The log level is lowered to ERROR before anything else happens
 * (presumably so log output does not interfere with the stdio channel — confirm).
 *
 * @param tools Tools handed to the MCP server instance.
 * @param readOnly Forwarded to the server factory; defaults to false.
 * @returns The connected MCP server instance.
 */
async function startStdioServer(tools, readOnly = false) {
  setLogLevel(LogLevel.ERROR);
  const mcpServer = createMcpServerInstance(tools, readOnly);
  await mcpServer.connect(new StdioServerTransport());
  logger.info("🤖 MCP server listening on stdio");
  return mcpServer;
}
5712
/**
 * Remote document-management client: every method forwards to the matching
 * tRPC procedure on the server identified by `serverUrl`.
 */
class DocumentManagementClient {
  baseUrl;
  client;

  /**
   * @param serverUrl Base URL of the remote server; a single trailing slash is removed.
   */
  constructor(serverUrl) {
    const normalizedUrl = serverUrl.replace(/\/$/, "");
    this.baseUrl = normalizedUrl;
    const batchLink = httpBatchLink({ url: normalizedUrl });
    this.client = createTRPCProxyClient({ links: [batchLink] });
    logger.debug(`DocumentManagementClient (tRPC) created for: ${this.baseUrl}`);
  }

  /** Verifies the connection by issuing a `ping` query. */
  async initialize() {
    await this.client.ping.query();
  }

  /** Nothing to release on the client side. */
  async shutdown() {
  }

  /** Returns the result of the remote `listLibraries` query. */
  async listLibraries() {
    const procedure = this.client.listLibraries;
    return procedure.query();
  }

  /** Invokes the remote `validateLibraryExists` procedure for `library`. */
  async validateLibraryExists(library) {
    const input = { library };
    await this.client.validateLibraryExists.mutate(input);
  }

  /** Returns the result of the remote `findBestVersion` query. */
  async findBestVersion(library, targetVersion) {
    const input = { library, targetVersion };
    return this.client.findBestVersion.query(input);
  }

  /** Runs the remote `search` query; a missing version is sent as `null`. */
  async searchStore(library, version2, query, limit) {
    const input = { library, version: version2 ?? null, query, limit };
    return this.client.search.query(input);
  }

  /** Invokes the remote `removeVersion` mutation. */
  async removeVersion(library, version2) {
    const input = { library, version: version2 };
    await this.client.removeVersion.mutate(input);
  }

  /** Invokes the remote `removeAllDocuments` mutation; a missing version is sent as `null`. */
  async removeAllDocuments(library, version2) {
    const input = { library, version: version2 ?? null };
    await this.client.removeAllDocuments.mutate(input);
  }

  /** Returns the result of the remote `getVersionsByStatus` query. */
  async getVersionsByStatus(statuses) {
    const input = { statuses };
    return this.client.getVersionsByStatus.query(input);
  }

  /** Returns the result of the remote `findVersionsBySourceUrl` query. */
  async findVersionsBySourceUrl(url) {
    const input = { url };
    return this.client.findVersionsBySourceUrl.query(input);
  }

  /** Returns the result of the remote `getScraperOptions` query. */
  async getScraperOptions(versionId) {
    const input = { versionId };
    return this.client.getScraperOptions.query(input);
  }

  /** Invokes the remote `updateVersionStatus` mutation. */
  async updateVersionStatus(versionId, status, errorMessage) {
    const input = { versionId, status, errorMessage };
    await this.client.updateVersionStatus.mutate(input);
  }

  /** Invokes the remote `updateVersionProgress` mutation. */
  async updateVersionProgress(versionId, pages, maxPages) {
    const input = { versionId, pages, maxPages };
    await this.client.updateVersionProgress.mutate(input);
  }

  /** Invokes the remote `storeScraperOptions` mutation. */
  async storeScraperOptions(versionId, options) {
    const input = { versionId, options };
    await this.client.storeScraperOptions.mutate(input);
  }
}
5766
/**
 * Base class for all errors raised by the content splitters.
 *
 * Sets `name` to the concrete class name so that logs and stack traces show
 * e.g. "MinimumChunkSizeError: ..." instead of a generic "Error: ...".
 */
class SplitterError extends Error {
  /**
   * @param message Human-readable description of the failure.
   * @param options Optional standard Error options (e.g. `{ cause }`).
   */
  constructor(message, options) {
    super(message, options);
    // new.target is the class actually being constructed, so subclasses
    // automatically report their own name.
    this.name = new.target.name;
  }
}

/**
 * Thrown when a piece of content cannot be split below the maximum chunk size.
 */
class MinimumChunkSizeError extends SplitterError {
  /**
   * @param size Smallest chunk size the content would require.
   * @param maxSize Maximum chunk size that is allowed.
   */
  constructor(size, maxSize) {
    super(
      `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
    );
  }
}

/**
 * Generic failure while splitting content.
 */
class ContentSplitterError extends SplitterError {
}
5777
/**
 * Wraps a base splitter and greedily concatenates its chunks so that the
 * emitted chunks are larger, while staying within a preferred size and
 * respecting major section boundaries.
 */
class GreedySplitter {
  baseSplitter;
  minChunkSize;
  preferredChunkSize;

  /**
   * @param baseSplitter Splitter whose `splitText` produces the initial chunks.
   * @param minChunkSize Size a chunk must reach before an H1/H2 boundary may end it.
   * @param preferredChunkSize Upper bound for the combined content length of merged chunks.
   */
  constructor(baseSplitter, minChunkSize, preferredChunkSize) {
    this.baseSplitter = baseSplitter;
    this.minChunkSize = minChunkSize;
    this.preferredChunkSize = preferredChunkSize;
  }

  /**
   * Splits `markdown` with the base splitter and merges neighbouring chunks.
   * A pending chunk is emitted when adding the next piece would exceed the
   * preferred size, or when it already reached the minimum size and the next
   * piece opens a major (H1/H2) section. Otherwise the piece is appended on a
   * new line and section/type metadata are merged.
   */
  async splitText(markdown) {
    const pieces = await this.baseSplitter.splitText(markdown);
    const merged = [];
    let pending = null;

    for (const piece of pieces) {
      const mustFlush =
        Boolean(pending) &&
        (this.wouldExceedMaxSize(pending, piece) ||
          (pending.content.length >= this.minChunkSize &&
            this.startsNewMajorSection(piece)));

      if (!pending || mustFlush) {
        if (pending) {
          merged.push(pending);
        }
        pending = this.cloneChunk(piece);
        continue;
      }

      pending.content = `${pending.content}\n${piece.content}`;
      pending.section = this.mergeSectionInfo(pending, piece);
      pending.types = this.mergeTypes(pending.types, piece.types);
    }

    if (pending) {
      merged.push(pending);
    }
    return merged;
  }

  /** Copies a chunk so later in-place merging never touches the base splitter's objects. */
  cloneChunk(chunk) {
    const { types, content, section } = chunk;
    return {
      types: Array.from(types),
      content,
      section: {
        level: section.level,
        path: Array.from(section.path)
      }
    };
  }

  /** True when the chunk belongs to an H1 or H2 section. */
  startsNewMajorSection(chunk) {
    const { level } = chunk.section;
    return level === 1 || level === 2;
  }

  /** True when the combined content length of both chunks exceeds the preferred size. */
  wouldExceedMaxSize(currentChunk, nextChunk) {
    if (currentChunk) {
      const combinedLength = currentChunk.content.length + nextChunk.content.length;
      return combinedLength > this.preferredChunkSize;
    }
    return false;
  }

  /** True when `parentPath` is a strict prefix of `childPath`. */
  isPathIncluded(parentPath, childPath) {
    return (
      parentPath.length < childPath.length &&
      parentPath.every((segment, index) => segment === childPath[index])
    );
  }

  /**
   * Computes the section metadata of a merged chunk:
   * - identical level and path: the current section object is reused;
   * - one path is a strict prefix of the other: the longer (child) path wins;
   * - otherwise: the longest common prefix (possibly empty) is used.
   * The level is always the smaller of the two levels.
   */
  mergeSectionInfo(currentChunk, nextChunk) {
    const current = currentChunk.section;
    const next = nextChunk.section;
    const level = Math.min(current.level, next.level);

    const samePath =
      current.path.length === next.path.length &&
      current.path.every((segment, index) => segment === next.path[index]);
    if (current.level === next.level && samePath) {
      return current;
    }

    let path;
    if (this.isPathIncluded(current.path, next.path)) {
      path = next.path;
    } else if (this.isPathIncluded(next.path, current.path)) {
      path = current.path;
    } else {
      path = this.findCommonPrefix(current.path, next.path);
    }
    return { path, level };
  }

  /** Union of both type lists, first occurrence order preserved. */
  mergeTypes(currentTypes, nextTypes) {
    const unique = /* @__PURE__ */ new Set(currentTypes);
    for (const type of nextTypes) {
      unique.add(type);
    }
    return Array.from(unique);
  }

  /** Longest common leading run of two paths, as a new array. */
  findCommonPrefix(path1, path2) {
    const limit = Math.min(path1.length, path2.length);
    let shared = 0;
    while (shared < limit && path1[shared] === path2[shared]) {
      shared++;
    }
    return path1.slice(0, shared);
  }
}
5912
/**
 * Removes all leading and trailing whitespace (spaces, tabs, carriage
 * returns, newlines and other Unicode whitespace) from `str`.
 */
const fullTrim = (str) => str.trim();
5915
/**
 * Splits a fenced markdown code block into several fenced code blocks, each
 * no larger than `options.chunkSize` characters (fences included).
 */
class CodeContentSplitter {
  /**
   * @param options Object with a numeric `chunkSize` (maximum size of one wrapped chunk).
   */
  constructor(options) {
    this.options = options;
  }

  /**
   * Splits `content` line by line, re-wrapping every chunk in its own fence
   * with the original language tag.
   *
   * The opening fence is recognised for any language tag (e.g. `c++`,
   * `objective-c`) and may carry additional info-string text after the tag;
   * only the first word is kept as the language.
   *
   * @param content Fenced code block ("```lang\n...\n```").
   * @returns Array of fenced code blocks.
   * @throws MinimumChunkSizeError when a single wrapped line exceeds the chunk size.
   */
  async split(content) {
    // First word after the backticks is the language; the rest of the fence
    // line (attributes, a stray "\r") is discarded together with the fence.
    const openingFence = /^```([^\s`]*)[^\n]*\n/;
    const language = content.match(openingFence)?.[1] || void 0;
    const strippedContent = content.replace(openingFence, "").replace(/```\s*$/, "");
    const lines = strippedContent.split("\n");
    const chunks = [];
    let currentChunkLines = [];
    for (const line of lines) {
      const singleLineSize = this.wrap(line, language).length;
      if (singleLineSize > this.options.chunkSize) {
        throw new MinimumChunkSizeError(singleLineSize, this.options.chunkSize);
      }
      currentChunkLines.push(line);
      const newChunkSize = this.wrap(currentChunkLines.join("\n"), language).length;
      if (newChunkSize > this.options.chunkSize && currentChunkLines.length > 1) {
        // The line just added overflowed the chunk: emit everything before
        // it and start the next chunk with that line.
        const lastLine = currentChunkLines.pop();
        chunks.push(this.wrap(currentChunkLines.join("\n"), language));
        currentChunkLines = [lastLine];
      }
    }
    if (currentChunkLines.length > 0) {
      chunks.push(this.wrap(currentChunkLines.join("\n"), language));
    }
    return chunks;
  }

  /**
   * Wraps `content` in a fenced code block, dropping trailing newlines.
   */
  wrap(content, language) {
    return `\`\`\`${language || ""}\n${content.replace(/\n+$/, "")}\n\`\`\``;
  }
}
5950
/**
 * Splits a markdown table into several smaller tables, repeating the header
 * and separator rows in every chunk.
 */
class TableContentSplitter {
  /**
   * @param options Object with a numeric `chunkSize` (maximum size of one emitted table).
   */
  constructor(options) {
    this.options = options;
  }

  /**
   * Splits table content into chunks while preserving table structure.
   * Content that cannot be parsed as a table is returned unchanged as a
   * single chunk.
   *
   * @param content Markdown table text.
   * @returns Array of markdown tables (or `[content]` when not a table).
   * @throws MinimumChunkSizeError when a single row plus header exceeds the chunk size.
   */
  async split(content) {
    const parsedTable = this.parseTable(content);
    if (!parsedTable) {
      return [content];
    }
    const { headers, rows } = parsedTable;
    const chunks = [];
    let currentRows = [];
    for (const row of rows) {
      const singleRowSize = this.wrap(row, headers).length;
      if (singleRowSize > this.options.chunkSize) {
        throw new MinimumChunkSizeError(singleRowSize, this.options.chunkSize);
      }
      const newChunkSize = this.wrap([...currentRows, row].join("\n"), headers).length;
      if (newChunkSize > this.options.chunkSize && currentRows.length > 0) {
        // Adding this row would overflow: emit the rows collected so far and
        // start a new chunk with the current row.
        chunks.push(this.wrap(currentRows.join("\n"), headers));
        currentRows = [row];
      } else {
        currentRows.push(row);
      }
    }
    if (currentRows.length > 0) {
      chunks.push(this.wrap(currentRows.join("\n"), headers));
    }
    return chunks;
  }

  /**
   * Prepends a header row and a `---` separator row to the given row text.
   */
  wrap(content, headers) {
    const headerRow = `| ${headers.join(" | ")} |`;
    const separatorRow = `|${headers.map(() => "---").join("|")}|`;
    return [headerRow, separatorRow, content].join("\n");
  }

  /**
   * Parses table text into header cells, the raw separator line and the raw
   * non-empty data rows. Returns null when the text has fewer than three
   * lines, the first line has no pipe, or the second line is not a separator.
   */
  parseTable(content) {
    const lines = content.trim().split("\n");
    if (lines.length < 3) return null;
    const headers = this.parseRow(lines[0]);
    if (!headers) return null;
    const separator = lines[1];
    if (!this.isValidSeparator(separator)) return null;
    const rows = lines.slice(2).filter((row) => row.trim() !== "");
    return { headers, separator, rows };
  }

  /**
   * Parses a table row into trimmed cells. One leading and one trailing
   * pipe are treated as row delimiters; empty cells between pipes are kept
   * so the column count stays intact (e.g. tables whose first header cell is
   * blank).
   */
  parseRow(row) {
    if (!row.includes("|")) return null;
    const inner = row.trim().replace(/^\|/, "").replace(/\|$/, "");
    return inner.split("|").map((cell) => cell.trim());
  }

  /**
   * Validates the separator row of the table. Besides dashes, pipes and
   * whitespace, colons are accepted so GFM alignment markers such as
   * `|:---|:---:|---:|` are recognised.
   */
  isValidSeparator(separator) {
    return separator.includes("|") && /^\|?[\s:|-]+\|?$/.test(separator);
  }
}
6013
/**
 * Splits plain text into chunks of at most `options.chunkSize` characters,
 * trying progressively finer boundaries: paragraphs, then lines, then words.
 */
class TextContentSplitter {
  /**
   * @param options Object with a numeric `chunkSize`.
   */
  constructor(options) {
    this.options = options;
  }

  /**
   * Splits `content` (after trimming) into chunks.
   * - Text that already fits is returned as a single chunk.
   * - Paragraph chunks are returned as-is when they all fit.
   * - Otherwise line chunks are used (merged with "\n") when they all fit.
   * - Otherwise the text is split by words and merged with " ".
   *
   * @throws MinimumChunkSizeError when a single word is longer than the chunk size.
   */
  async split(content) {
    const text = fullTrim(content);
    if (text.length <= this.options.chunkSize) {
      return [text];
    }

    // A word that cannot fit into any chunk makes splitting impossible.
    let longestWordLength = 0;
    for (const word of text.split(/\s+/)) {
      if (word.length > longestWordLength) {
        longestWordLength = word.length;
      }
    }
    if (longestWordLength > this.options.chunkSize) {
      throw new MinimumChunkSizeError(longestWordLength, this.options.chunkSize);
    }

    const byParagraph = this.splitByParagraphs(text);
    if (this.areChunksValid(byParagraph)) {
      return byParagraph;
    }

    const byLine = this.splitByLines(text);
    if (this.areChunksValid(byLine)) {
      return this.mergeChunks(byLine, "\n");
    }

    const byWord = await this.splitByWords(text);
    return this.mergeChunks(byWord, " ");
  }

  /** True when no chunk is longer than the configured chunk size. */
  areChunksValid(chunks) {
    const limit = this.options.chunkSize;
    return chunks.every((candidate) => candidate.length <= limit);
  }

  /**
   * Splits on blank lines; trimmed paragraphs of two characters or fewer
   * are discarded.
   */
  splitByParagraphs(text) {
    return text
      .split(/\n\s*\n/)
      .map((paragraph) => fullTrim(paragraph))
      .filter((paragraph) => paragraph.length > 2);
  }

  /**
   * Splits on newlines; trimmed lines of one character or fewer are discarded.
   */
  splitByLines(text) {
    return text
      .split(/\n/)
      .map((line) => fullTrim(line))
      .filter((line) => line.length > 1);
  }

  /**
   * Last resort: delegates to RecursiveCharacterTextSplitter with the
   * configured chunk size and no overlap.
   */
  async splitByWords(text) {
    const { chunkSize } = this.options;
    const recursiveSplitter = new RecursiveCharacterTextSplitter({
      chunkSize,
      chunkOverlap: 0
    });
    const pieces = await recursiveSplitter.splitText(text);
    return pieces;
  }

  /**
   * Greedily joins consecutive chunks with `separator` as long as the joined
   * result stays within the chunk size.
   */
  mergeChunks(chunks, separator) {
    const merged = [];
    let buffer = null;
    for (const piece of chunks) {
      if (buffer === null) {
        buffer = piece;
        continue;
      }
      const joinedSize =
        this.getChunkSize(buffer) + this.getChunkSize(piece) + separator.length;
      const fits = joinedSize <= this.options.chunkSize;
      if (!fits) {
        merged.push(buffer);
        buffer = piece;
      } else {
        buffer = [buffer, piece].join(separator);
      }
    }
    if (buffer) {
      merged.push(buffer);
    }
    return merged;
  }

  /** Size of a chunk, measured in characters. */
  getChunkSize(chunk) {
    return chunk.length;
  }

  /** Text needs no wrapping; returns the content unchanged. */
  wrap(content) {
    return content;
  }
}
6108
/**
 * Splits markdown into chunks that carry section metadata (heading level and
 * heading path). The markdown is converted to HTML, parsed into a DOM, walked
 * element by element to build sections, and each section's content is then
 * split by a type-specific splitter (text, code or table).
 */
class SemanticMarkdownSplitter {
  /**
   * @param preferredChunkSize Chunk size handed to the text splitter.
   * @param maxChunkSize Chunk size handed to the code and table splitters and
   *   used by the fallback splitter in `splitSectionContent`.
   */
  constructor(preferredChunkSize, maxChunkSize) {
    this.preferredChunkSize = preferredChunkSize;
    this.maxChunkSize = maxChunkSize;
    this.turndownService = new TurndownService({
      headingStyle: "atx",
      hr: "---",
      bulletListMarker: "-",
      codeBlockStyle: "fenced",
      emDelimiter: "_",
      strongDelimiter: "**",
      linkStyle: "inlined"
    });
    // Custom rule: render <table> elements as pipe-delimited markdown tables.
    this.turndownService.addRule("table", {
      filter: ["table"],
      replacement: (_content, node) => {
        const table = node;
        const headers = Array.from(table.querySelectorAll("th")).map(
          (th) => th.textContent?.trim() || ""
        );
        // Data rows are all <tr> elements that contain no <th>.
        const rows = Array.from(table.querySelectorAll("tr")).filter(
          (tr) => !tr.querySelector("th")
        );
        if (headers.length === 0 && rows.length === 0) return "";
        let markdown = "\n";
        if (headers.length > 0) {
          markdown += `| ${headers.join(" | ")} |
`;
          markdown += `|${headers.map(() => "---").join("|")}|
`;
        }
        for (const row of rows) {
          const cells = Array.from(row.querySelectorAll("td")).map(
            (td) => td.textContent?.trim() || ""
          );
          markdown += `| ${cells.join(" | ")} |
`;
        }
        return markdown;
      }
    });
    // Text uses the preferred size; code and tables may grow up to the max size.
    this.textSplitter = new TextContentSplitter({
      chunkSize: this.preferredChunkSize
    });
    this.codeSplitter = new CodeContentSplitter({
      chunkSize: this.maxChunkSize
    });
    this.tableSplitter = new TableContentSplitter({
      chunkSize: this.maxChunkSize
    });
  }
  turndownService;
  textSplitter;
  codeSplitter;
  tableSplitter;
  /**
   * Main entry point for splitting markdown content.
   * Pipeline: markdown -> HTML -> DOM -> sections -> chunks.
   *
   * @param markdown Markdown source text.
   * @returns Array of chunks `{ types, content, section: { level, path } }`.
   */
  async splitText(markdown) {
    const html = await this.markdownToHtml(markdown);
    const dom = await this.parseHtml(html);
    const sections = await this.splitIntoSections(dom);
    return this.splitSectionContent(sections);
  }
  /**
   * Step 1: Split document into sections based on H1-H6 headings,
   * as well as code blocks and tables.
   *
   * Only direct children of <body> are visited. Every heading opens a new
   * section whose path is built from the headings currently on the stack.
   * PRE, TABLE and other non-empty elements each produce their own section
   * that inherits level and path from the current section; these are not
   * pushed onto the heading stack.
   *
   * @throws Error when the document has no <body> element.
   */
  async splitIntoSections(dom) {
    const body = dom.querySelector("body");
    if (!body) {
      throw new Error("Invalid HTML structure: no body element found");
    }
    // The root section only seeds the stack; it is never added to `sections`.
    let currentSection = this.createRootSection();
    const sections = [];
    const stack = [currentSection];
    for (const element of Array.from(body.children)) {
      const headingMatch = element.tagName.match(/H([1-6])/);
      if (headingMatch) {
        const level = Number.parseInt(headingMatch[1], 10);
        const title = fullTrim(element.textContent || "");
        // Drop headings of the same or deeper level so the stack holds only
        // the ancestors of the new heading (the root at index 0 always stays).
        while (stack.length > 1 && stack[stack.length - 1].level >= level) {
          stack.pop();
        }
        currentSection = {
          level,
          path: [
            // Ancestor titles: the last path entry of each stacked heading.
            ...stack.slice(1).reduce((acc, s) => {
              const lastPath = s.path[s.path.length - 1];
              if (lastPath) acc.push(lastPath);
              return acc;
            }, []),
            title
          ],
          content: [
            {
              type: "heading",
              text: `${"#".repeat(level)} ${title}`
            }
          ]
        };
        sections.push(currentSection);
        stack.push(currentSection);
      } else if (element.tagName === "PRE") {
        const code = element.querySelector("code");
        // The language is taken from the <code> class name with the
        // "language-" prefix removed.
        const language = code?.className.replace("language-", "") || "";
        const content = code?.textContent || element.textContent || "";
        const markdown = `${"```"}${language}
${content}
${"```"}`;
        currentSection = {
          level: currentSection.level,
          path: currentSection.path,
          content: [
            {
              type: "code",
              text: markdown
            }
          ]
        };
        sections.push(currentSection);
      } else if (element.tagName === "TABLE") {
        const markdown = fullTrim(this.turndownService.turndown(element.outerHTML));
        currentSection = {
          level: currentSection.level,
          path: currentSection.path,
          content: [
            {
              type: "table",
              text: markdown
            }
          ]
        };
        sections.push(currentSection);
      } else {
        // Any other element: convert its inner HTML back to markdown and keep
        // it only when something is left after trimming.
        const markdown = fullTrim(this.turndownService.turndown(element.innerHTML));
        if (markdown) {
          currentSection = {
            level: currentSection.level,
            path: currentSection.path,
            content: [
              {
                type: "text",
                text: markdown
              }
            ]
          };
          sections.push(currentSection);
        }
      }
    }
    return sections;
  }
  /**
   * Step 2: Split section content into smaller chunks.
   *
   * Each content item is dispatched to the splitter for its type. When that
   * splitter throws MinimumChunkSizeError, the item is re-split with a
   * RecursiveCharacterTextSplitter limited to `maxChunkSize`; any other
   * error is rethrown as ContentSplitterError.
   *
   * @returns Flat array of chunks, each carrying the level and path of its section.
   */
  async splitSectionContent(sections) {
    const chunks = [];
    for (const section of sections) {
      for (const content of section.content) {
        let splitContent = [];
        try {
          switch (content.type) {
            case "heading":
            case "text": {
              splitContent = await this.textSplitter.split(content.text);
              break;
            }
            case "code": {
              splitContent = await this.codeSplitter.split(content.text);
              break;
            }
            case "table": {
              splitContent = await this.tableSplitter.split(content.text);
              break;
            }
          }
        } catch (err) {
          if (err instanceof MinimumChunkSizeError) {
            logger.warn(
              `⚠ Cannot split ${content.type} chunk normally, using RecursiveCharacterTextSplitter: ${err.message}`
            );
            const splitter = new RecursiveCharacterTextSplitter({
              chunkSize: this.maxChunkSize,
              // Overlap is 10% of the max chunk size, capped at 20.
              chunkOverlap: Math.min(20, Math.floor(this.maxChunkSize * 0.1)),
              // Use more aggressive separators including empty string as last resort
              // NOTE(review): the third and fourth entries appear identical in
              // this view; one of them may originally be a tab — confirm.
              separators: [
                "\n\n",
                "\n",
                " ",
                " ",
                ".",
                ",",
                ";",
                ":",
                "-",
                "(",
                ")",
                "[",
                "]",
                "{",
                "}",
                ""
              ]
            });
            const chunks2 = await splitter.splitText(content.text);
            if (chunks2.length === 0) {
              // Fallback produced nothing: keep a truncated copy of the text.
              splitContent = [content.text.substring(0, this.maxChunkSize)];
            } else {
              splitContent = chunks2;
            }
          } else {
            const errMessage = err instanceof Error ? err.message : String(err);
            throw new ContentSplitterError(
              `Failed to split ${content.type} content: ${errMessage}`
            );
          }
        }
        chunks.push(
          ...splitContent.map(
            (text) => ({
              types: [content.type],
              content: text,
              section: {
                level: section.level,
                path: section.path
              }
            })
          )
        );
      }
    }
    return chunks;
  }
  /**
   * Helper to create the root section (level 0, empty path, no content).
   */
  createRootSection() {
    return {
      level: 0,
      path: [],
      content: []
    };
  }
  /**
   * Convert markdown to HTML using remark (with GFM support) and wrap the
   * result in a minimal HTML document so that a <body> element exists.
   */
  async markdownToHtml(markdown) {
    const html = await unified().use(remarkParse).use(remarkGfm).use(remarkHtml).process(markdown);
    return `<!DOCTYPE html>
<html>
<body>
${String(html)}
</body>
</html>`;
  }
  /**
   * Parse HTML into a DOM document via createJSDOM.
   */
  async parseHtml(html) {
    const { window } = createJSDOM(html);
    return window.document;
  }
}
6372
// Default number of child chunks fetched around a search hit.
const CHILD_LIMIT = 5;
// Default number of sibling chunks fetched on each side of a search hit.
const SIBLING_LIMIT = 2;

/**
 * Runs content searches against a document store and expands every hit with
 * its surrounding chunks (parent, siblings, children), merging the results
 * per URL.
 */
class DocumentRetrieverService {
  documentStore;

  /**
   * @param documentStore Store providing `findByContent`, `findParentChunk`,
   *   `findPrecedingSiblingChunks`, `findChildChunks`,
   *   `findSubsequentSiblingChunks` and `findChunksByIds`.
   */
  constructor(documentStore) {
    this.documentStore = documentStore;
  }

  /**
   * Collects all related chunk IDs for a given initial hit.
   *
   * The four lookups are independent of each other, so they are issued
   * together; the IDs are still added in a fixed order (hit, parent,
   * preceding siblings, children, subsequent siblings).
   *
   * @param library The library name.
   * @param version2 The library version.
   * @param doc The search hit (`id` plus `metadata.url` and `metadata.score`).
   * @param siblingLimit Maximum siblings fetched on each side.
   * @param childLimit Maximum children fetched.
   * @returns `{ url, hitId, relatedIds, score }` where `relatedIds` is a Set.
   */
  async getRelatedChunkIds(library, version2, doc, siblingLimit = SIBLING_LIMIT, childLimit = CHILD_LIMIT) {
    const { id } = doc;
    const { url, score } = doc.metadata;
    const store = this.documentStore;

    const [parent, precedingSiblings, childChunks, subsequentSiblings] = await Promise.all([
      store.findParentChunk(library, version2, id),
      store.findPrecedingSiblingChunks(library, version2, id, siblingLimit),
      store.findChildChunks(library, version2, id, childLimit),
      store.findSubsequentSiblingChunks(library, version2, id, siblingLimit)
    ]);

    const relatedIds = /* @__PURE__ */ new Set([id]);
    if (parent) {
      relatedIds.add(parent.id);
    }
    for (const chunk of [...precedingSiblings, ...childChunks, ...subsequentSiblings]) {
      relatedIds.add(chunk.id);
    }
    return { url, hitId: id, relatedIds, score };
  }

  /**
   * Groups related chunk info by URL, deduplicates IDs, and finds max score per URL.
   *
   * @returns Map from URL to `{ uniqueChunkIds: Set, maxScore }`, in order of
   *   first appearance of each URL.
   */
  groupAndPrepareFetch(relatedInfos) {
    const urlMap = /* @__PURE__ */ new Map();
    for (const info of relatedInfos) {
      let entry = urlMap.get(info.url);
      if (!entry) {
        entry = { uniqueChunkIds: /* @__PURE__ */ new Set(), maxScore: info.score };
        urlMap.set(info.url, entry);
      }
      for (const id of info.relatedIds) {
        entry.uniqueChunkIds.add(id);
      }
      if (info.score > entry.maxScore) {
        entry.maxScore = info.score;
      }
    }
    return urlMap;
  }

  /**
   * Finalizes the merged result for a URL group: fetches the chunks by ID and
   * joins their page content with blank lines, in the order the store
   * returns them.
   *
   * @returns `{ url, content, score }` with `score` set to `maxScore`.
   */
  async finalizeResult(library, version2, url, uniqueChunkIds, maxScore) {
    const ids = Array.from(uniqueChunkIds);
    const docs = await this.documentStore.findChunksByIds(library, version2, ids);
    const content = docs.map((d) => d.pageContent).join("\n\n");
    return {
      url,
      content,
      score: maxScore
    };
  }

  /**
   * Searches for documents and expands the context around the matches.
   *
   * @param library The library name.
   * @param version2 The library version (optional; a missing version is
   *   searched as the empty string, and the value is lower-cased).
   * @param query The search query.
   * @param limit The optional limit for the initial search results (defaults to 10).
   * @returns An array of `{ url, content, score }` objects, one per URL, in
   *   order of first appearance among the initial hits.
   */
  async search(library, version2, query, limit) {
    const normalizedVersion = (version2 ?? "").toLowerCase();
    const initialResults = await this.documentStore.findByContent(
      library,
      normalizedVersion,
      query,
      limit ?? 10
    );
    const relatedInfos = await Promise.all(
      initialResults.map(
        (doc) => this.getRelatedChunkIds(library, normalizedVersion, doc)
      )
    );
    const urlMap = this.groupAndPrepareFetch(relatedInfos);
    // Each URL group is independent; Promise.all keeps the map's order.
    return Promise.all(
      Array.from(
        urlMap,
        ([url, { uniqueChunkIds, maxScore }]) => this.finalizeResult(library, normalizedVersion, url, uniqueChunkIds, maxScore)
      )
    );
  }
}
6493
// Directory that holds the SQL migration files: <project root>/db/migrations.
const MIGRATIONS_DIR = path.join(getProjectRoot(), "db", "migrations");
// Name of the bookkeeping table that records which migrations were applied.
const MIGRATIONS_TABLE = "_schema_migrations";
6495
/**
 * Creates the migrations bookkeeping table if it does not exist yet.
 * Each row stores a migration id and the time it was applied.
 */
function ensureMigrationsTable(db) {
  const createTableSql = `
CREATE TABLE IF NOT EXISTS ${MIGRATIONS_TABLE} (
id TEXT PRIMARY KEY,
applied_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
`;
  db.exec(createTableSql);
}
6503
/**
 * Reads the ids of all migrations recorded in the bookkeeping table.
 *
 * @returns Set of applied migration ids.
 */
function getAppliedMigrations(db) {
  const appliedIds = /* @__PURE__ */ new Set();
  const recorded = db.prepare(`SELECT id FROM ${MIGRATIONS_TABLE}`).all();
  for (const { id } of recorded) {
    appliedIds.add(id);
  }
  return appliedIds;
}
6508
/**
 * Applies all pending SQL migrations from MIGRATIONS_DIR to `db`.
 *
 * Steps:
 * 1. Switch the connection to speed-oriented pragmas (best effort).
 * 2. Inside one immediate transaction: ensure the bookkeeping table exists,
 *    run every `.sql` file not yet recorded (in sorted filename order) and
 *    record it.
 * 3. Retry the whole transaction on SQLITE_BUSY, up to MIGRATION_MAX_RETRIES
 *    times with MIGRATION_RETRY_DELAY_MS between attempts.
 * 4. VACUUM when at least one migration was applied (best effort).
 * 5. Switch the connection to the production pragmas (best effort).
 *
 * @param db Database handle exposing `pragma`, `exec`, `prepare` and `transaction`.
 * @throws StoreError when the migrations directory is missing, a migration
 *   fails, or the transaction fails for any reason other than a retryable
 *   SQLITE_BUSY.
 */
async function applyMigrations(db) {
  try {
    // NOTE(review): per the SQLite documentation, journal_mode = OFF disables
    // rollback; confirm that a failed migration inside the transaction below
    // still leaves the database in the intended state.
    db.pragma("journal_mode = OFF");
    db.pragma("synchronous = OFF");
    // 268435456 bytes = 256 MiB.
    db.pragma("mmap_size = 268435456");
    db.pragma("cache_size = -64000");
    db.pragma("temp_store = MEMORY");
    logger.debug("Applied performance optimizations for migration");
  } catch (_error) {
    // Pragmas are optional tuning; failure only produces a warning.
    logger.warn("⚠️ Could not apply all performance optimizations for migration");
  }
  // All pending migrations run in a single transaction; returns how many were applied.
  const overallTransaction = db.transaction(() => {
    logger.debug("Checking database migrations...");
    ensureMigrationsTable(db);
    const appliedMigrations = getAppliedMigrations(db);
    if (!fs.existsSync(MIGRATIONS_DIR)) {
      throw new StoreError("Migrations directory not found");
    }
    // Sorted filename order defines the order in which migrations are applied.
    const migrationFiles = fs.readdirSync(MIGRATIONS_DIR).filter((file) => file.endsWith(".sql")).sort();
    const pendingMigrations = migrationFiles.filter(
      (filename) => !appliedMigrations.has(filename)
    );
    if (pendingMigrations.length > 0) {
      logger.info(`🔄 Applying ${pendingMigrations.length} database migration(s)...`);
    }
    let appliedCount = 0;
    for (const filename of pendingMigrations) {
      logger.debug(`Applying migration: ${filename}`);
      const filePath = path.join(MIGRATIONS_DIR, filename);
      const sql = fs.readFileSync(filePath, "utf8");
      try {
        db.exec(sql);
        // Record the migration under its filename so it is skipped next time.
        const insertStmt = db.prepare(`INSERT INTO ${MIGRATIONS_TABLE} (id) VALUES (?)`);
        insertStmt.run(filename);
        logger.debug(`Applied migration: ${filename}`);
        appliedCount++;
      } catch (error) {
        logger.error(`❌ Failed to apply migration: ${filename} - ${error}`);
        throw new StoreError(`Migration failed: ${filename}`, error);
      }
    }
    if (appliedCount > 0) {
      logger.info(`✅ Successfully applied ${appliedCount} migration(s)`);
    } else {
      logger.debug("Database schema is up to date");
    }
    return appliedCount;
  });
  let retries = 0;
  let appliedMigrationsCount = 0;
  // Retry loop: exits via `break` on success or by throwing on failure.
  while (true) {
    try {
      appliedMigrationsCount = overallTransaction.immediate();
      logger.debug("Database migrations completed successfully");
      if (appliedMigrationsCount > 0) {
        try {
          logger.debug(
            `Running VACUUM after applying ${appliedMigrationsCount} migration(s)...`
          );
          db.exec("VACUUM");
          logger.debug("Database vacuum completed successfully");
        } catch (error) {
          // A failed VACUUM does not fail the migration run.
          logger.warn(`⚠️ Could not vacuum database after migrations: ${error}`);
        }
      } else {
        logger.debug("Skipping VACUUM - no migrations were applied");
      }
      break;
    } catch (error) {
      if (error?.code === "SQLITE_BUSY" && retries < MIGRATION_MAX_RETRIES) {
        retries++;
        logger.warn(
          `⚠️ Migrations busy (SQLITE_BUSY), retrying attempt ${retries}/${MIGRATION_MAX_RETRIES} in ${MIGRATION_RETRY_DELAY_MS}ms...`
        );
        await new Promise((resolve) => setTimeout(resolve, MIGRATION_RETRY_DELAY_MS));
      } else {
        if (error?.code === "SQLITE_BUSY") {
          logger.error(
            `❌ Migrations still busy after ${MIGRATION_MAX_RETRIES} retries. Giving up: ${error}`
          );
        }
        // StoreErrors raised inside the transaction are passed through
        // unchanged; everything else is wrapped.
        if (error instanceof StoreError) {
          throw error;
        }
        throw new StoreError("Failed during migration process", error);
      }
    }
  }
  try {
    // Switch from the migration-time settings to the production configuration.
    db.pragma("journal_mode = WAL");
    db.pragma("wal_autocheckpoint = 1000");
    db.pragma("busy_timeout = 30000");
    db.pragma("foreign_keys = ON");
    db.pragma("synchronous = NORMAL");
    logger.debug(
      "Applied production database configuration (WAL mode, autocheckpoint, foreign keys, busy timeout)"
    );
  } catch (_error) {
    logger.warn("⚠️ Could not apply all production database settings");
  }
}
6609
/**
 * Central registry for embedding model configuration.
 *
 * Parses "provider:model" specifications and keeps a table of known vector
 * dimensions so callers can avoid a probing API call just to learn a model's
 * output size. Intended to be used through the shared singleton
 * (`getInstance()`) or the static convenience wrappers.
 */
class EmbeddingConfig {
  static instance = null;
  /**
   * Get the singleton instance of EmbeddingConfig.
   * Creates the instance on first use.
   */
  static getInstance() {
    if (EmbeddingConfig.instance === null) {
      EmbeddingConfig.instance = new EmbeddingConfig();
    }
    return EmbeddingConfig.instance;
  }
  /**
   * Reset the singleton instance (useful for testing).
   * Dimensions cached via setKnownDimensions() are discarded with it.
   */
  static resetInstance() {
    EmbeddingConfig.instance = null;
  }
  /**
   * Known dimensions for common embedding models.
   * This avoids expensive API calls for dimension detection in telemetry.
   *
   * Note: The "openai" provider also supports OpenAI-compatible APIs like:
   * - Ollama (local models)
   * - LMStudio (local models)
   * - Any service implementing OpenAI's embedding API
   */
  knownModelDimensions = {
    // OpenAI models (also works with Ollama, LMStudio, and other OpenAI-compatible APIs)
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
    "text-embedding-ada-002": 1536,
    // Google Vertex AI models
    "text-embedding-004": 768,
    "textembedding-gecko@003": 768,
    "textembedding-gecko@002": 768,
    "textembedding-gecko@001": 768,
    // Google Gemini models (with MRL support)
    "text-embedding-preview-0409": 768,
    "embedding-001": 768,
    // AWS Bedrock models
    // Amazon Titan models
    "amazon.titan-embed-text-v1": 1536,
    "amazon.titan-embed-text-v2:0": 1024,
    // Image embedding model
    "amazon.titan-embed-image-v1": 1024,
    // Cohere models
    "cohere.embed-english-v3": 1024,
    "cohere.embed-multilingual-v3": 1024,
    // SageMaker models (hosted on AWS SageMaker)
    "intfloat/multilingual-e5-large": 1024,
    // Additional AWS models that might be supported
    // Note: Some of these might be placeholders - verify dimensions before use
    // "amazon.nova-embed-multilingual-v1:0": 4096, // Commented out as noted in source
    // MTEB Leaderboard models (source: https://huggingface.co/spaces/mteb/leaderboard)
    // Top performing models from Massive Text Embedding Benchmark
    "sentence-transformers/all-MiniLM-L6-v2": 384,
    "gemini-embedding-001": 3072,
    "Qwen/Qwen3-Embedding-8B": 4096,
    "Qwen/Qwen3-Embedding-4B": 2560,
    "Qwen/Qwen3-Embedding-0.6B": 1024,
    "Linq-AI-Research/Linq-Embed-Mistral": 4096,
    "Alibaba-NLP/gte-Qwen2-7B-instruct": 3584,
    "intfloat/multilingual-e5-large-instruct": 1024,
    "Salesforce/SFR-Embedding-Mistral": 4096,
    "text-multilingual-embedding-002": 768,
    "GritLM/GritLM-7B": 4096,
    "GritLM/GritLM-8x7B": 4096,
    "intfloat/e5-mistral-7b-instruct": 4096,
    "Cohere/Cohere-embed-multilingual-v3.0": 1024,
    // Model card states a 1536-dimensional embedding (8960 is not the output size).
    "Alibaba-NLP/gte-Qwen2-1.5B-instruct": 1536,
    "Lajavaness/bilingual-embedding-large": 1024,
    "Salesforce/SFR-Embedding-2_R": 4096,
    "NovaSearch/stella_en_1.5B_v5": 8960,
    "NovaSearch/jasper_en_vision_language_v1": 8960,
    "nvidia/NV-Embed-v2": 4096,
    "OrdalieTech/Solon-embeddings-large-0.1": 1024,
    "BAAI/bge-m3": 1024,
    "HIT-TMG/KaLM-embedding-multilingual-mini-v1": 896,
    "jinaai/jina-embeddings-v3": 1024,
    "Alibaba-NLP/gte-multilingual-base": 768,
    "Lajavaness/bilingual-embedding-base": 768,
    "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1": 896,
    "nvidia/NV-Embed-v1": 4096,
    "Cohere/Cohere-embed-multilingual-light-v3.0": 384,
    "manu/bge-m3-custom-fr": 1024,
    "Lajavaness/bilingual-embedding-small": 384,
    "Snowflake/snowflake-arctic-embed-l-v2.0": 1024,
    "intfloat/multilingual-e5-base": 768,
    "voyage-3-lite": 512,
    "voyage-3": 1024,
    "intfloat/multilingual-e5-small": 384,
    "Alibaba-NLP/gte-Qwen1.5-7B-instruct": 4096,
    "Snowflake/snowflake-arctic-embed-m-v2.0": 768,
    "deepvk/USER-bge-m3": 1024,
    "Cohere/Cohere-embed-english-v3.0": 1024,
    "Omartificial-Intelligence-Space/Arabic-labse-Matryoshka": 768,
    "ibm-granite/granite-embedding-278m-multilingual": 768,
    "NovaSearch/stella_en_400M_v5": 4096,
    "omarelshehy/arabic-english-sts-matryoshka": 1024,
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2": 768,
    "Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka": 768,
    "Haon-Chen/speed-embedding-7b-instruct": 4096,
    "sentence-transformers/LaBSE": 768,
    "WhereIsAI/UAE-Large-V1": 1024,
    "ibm-granite/granite-embedding-107m-multilingual": 384,
    "mixedbread-ai/mxbai-embed-large-v1": 1024,
    "intfloat/e5-large-v2": 1024,
    "avsolatorio/GIST-large-Embedding-v0": 1024,
    "sdadas/mmlw-e5-large": 1024,
    "nomic-ai/nomic-embed-text-v1": 768,
    "nomic-ai/nomic-embed-text-v1-ablated": 768,
    "intfloat/e5-base-v2": 768,
    "BAAI/bge-large-en-v1.5": 1024,
    "intfloat/e5-large": 1024,
    "Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet": 384,
    "Cohere/Cohere-embed-english-light-v3.0": 384,
    // MiniLM-L12 produces 384-dimensional vectors (768 belongs to the mpnet variant).
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": 384,
    "Gameselo/STS-multilingual-mpnet-base-v2": 768,
    "thenlper/gte-large": 1024,
    "avsolatorio/GIST-Embedding-v0": 768,
    "nomic-ai/nomic-embed-text-v1-unsupervised": 768,
    "infgrad/stella-base-en-v2": 768,
    "avsolatorio/NoInstruct-small-Embedding-v0": 384,
    "dwzhu/e5-base-4k": 768,
    "sdadas/mmlw-e5-base": 768,
    "voyage-multilingual-2": 1024,
    "McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised": 4096,
    "BAAI/bge-base-en-v1.5": 768,
    "avsolatorio/GIST-small-Embedding-v0": 384,
    "sdadas/mmlw-roberta-large": 1024,
    "nomic-ai/nomic-embed-text-v1.5": 768,
    "minishlab/potion-multilingual-128M": 256,
    "shibing624/text2vec-base-multilingual": 384,
    "thenlper/gte-base": 768,
    "intfloat/e5-small-v2": 384,
    "intfloat/e5-base": 768,
    "sentence-transformers/static-similarity-mrl-multilingual-v1": 1024,
    "manu/sentence_croissant_alpha_v0.3": 2048,
    // bge-small emits 384-dimensional vectors (512 is its maximum sequence length).
    "BAAI/bge-small-en-v1.5": 384,
    "thenlper/gte-small": 384,
    "sdadas/mmlw-e5-small": 384,
    "manu/sentence_croissant_alpha_v0.4": 2048,
    "manu/sentence_croissant_alpha_v0.2": 2048,
    "abhinand/MedEmbed-small-v0.1": 384,
    "ibm-granite/granite-embedding-125m-english": 768,
    "intfloat/e5-small": 384,
    "voyage-large-2-instruct": 1024,
    "sdadas/mmlw-roberta-base": 768,
    "Snowflake/snowflake-arctic-embed-l": 1024,
    "Mihaiii/Ivysaur": 384,
    "Snowflake/snowflake-arctic-embed-m-long": 768,
    "bigscience/sgpt-bloom-7b1-msmarco": 4096,
    "avsolatorio/GIST-all-MiniLM-L6-v2": 384,
    "sergeyzh/LaBSE-ru-turbo": 768,
    "sentence-transformers/all-mpnet-base-v2": 768,
    "Snowflake/snowflake-arctic-embed-m": 768,
    "Snowflake/snowflake-arctic-embed-s": 384,
    "sentence-transformers/all-MiniLM-L12-v2": 384,
    "Mihaiii/gte-micro-v4": 384,
    "Snowflake/snowflake-arctic-embed-m-v1.5": 768,
    "cointegrated/LaBSE-en-ru": 768,
    "Mihaiii/Bulbasaur": 384,
    "ibm-granite/granite-embedding-30m-english": 384,
    "deepfile/embedder-100p": 768,
    "Jaume/gemma-2b-embeddings": 2048,
    "OrlikB/KartonBERT-USE-base-v1": 768,
    "izhx/udever-bloom-7b1": 4096,
    "izhx/udever-bloom-1b1": 1024,
    "brahmairesearch/slx-v0.1": 384,
    "Mihaiii/Wartortle": 384,
    "izhx/udever-bloom-3b": 2048,
    "deepvk/USER-base": 768,
    "ai-forever/ru-en-RoSBERTa": 1024,
    "McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-unsup-simcse": 4096,
    "Mihaiii/Venusaur": 384,
    "Snowflake/snowflake-arctic-embed-xs": 384,
    "jinaai/jina-embedding-b-en-v1": 768,
    "Mihaiii/gte-micro": 384,
    "aari1995/German_Semantic_STS_V2": 1024,
    "Mihaiii/Squirtle": 384,
    "OrlikB/st-polish-kartonberta-base-alpha-v1": 768,
    "sergeyzh/rubert-tiny-turbo": 312,
    "minishlab/potion-base-8M": 256,
    "minishlab/M2V_base_glove_subword": 256,
    "jinaai/jina-embedding-s-en-v1": 512,
    "minishlab/potion-base-4M": 128,
    "minishlab/M2V_base_output": 256,
    "DeepPavlov/rubert-base-cased-sentence": 768,
    "jinaai/jina-embeddings-v2-small-en": 512,
    "cointegrated/rubert-tiny2": 312,
    "minishlab/M2V_base_glove": 256,
    "cointegrated/rubert-tiny": 312,
    "silma-ai/silma-embeddding-matryoshka-v0.1": 768,
    "DeepPavlov/rubert-base-cased": 768,
    "Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet": 768,
    "izhx/udever-bloom-560m": 1024,
    "minishlab/potion-base-2M": 64,
    "DeepPavlov/distilrubert-small-cased-conversational": 768,
    "consciousAI/cai-lunaris-text-embeddings": 1024,
    "deepvk/deberta-v1-base": 768,
    "Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka": 768,
    "Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka": 768,
    "ai-forever/sbert_large_mt_nlu_ru": 1024,
    "ai-forever/sbert_large_nlu_ru": 1024,
    "malenia1/ternary-weight-embedding": 1024,
    "jinaai/jina-embeddings-v2-base-en": 768,
    "VPLabs/SearchMap_Preview": 4096,
    "Hum-Works/lodestone-base-4096-v1": 768,
    "jinaai/jina-embeddings-v4": 2048
  };
  /**
   * Lowercase lookup map for case-insensitive model dimension queries.
   * Built once in the constructor from knownModelDimensions and kept in sync
   * by setKnownDimensions().
   */
  modelLookup;
  constructor() {
    this.modelLookup = /* @__PURE__ */ new Map();
    for (const [model, dimensions] of Object.entries(this.knownModelDimensions)) {
      this.modelLookup.set(model.toLowerCase(), dimensions);
    }
  }
  /**
   * Parse embedding model configuration from a specification string.
   * This is a synchronous operation that extracts provider, model, and known dimensions.
   *
   * The text before the FIRST colon is the provider and everything after it is
   * the model, so model names that themselves contain colons (for example
   * "aws:amazon.titan-embed-text-v2:0") are preserved. A spec without any colon
   * is treated as a model of the "openai" provider.
   *
   * Supports various providers:
   * - openai: OpenAI models and OpenAI-compatible APIs (Ollama, LMStudio, etc.)
   * - vertex: Google Cloud Vertex AI
   * - gemini: Google Generative AI
   * - aws: AWS Bedrock models
   * - microsoft: Azure OpenAI
   * - sagemaker: AWS SageMaker hosted models
   *
   * @param modelSpec Optional model specification; falls back to the
   *   DOCS_MCP_EMBEDDING_MODEL env var, then to "text-embedding-3-small"
   * @returns `{ provider, model, dimensions, modelSpec }` where `dimensions`
   *   is null when the model is not in the known-dimension table
   */
  parse(modelSpec) {
    const spec = modelSpec || process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
    const colonIndex = spec.indexOf(":");
    const hasProvider = colonIndex !== -1;
    const provider = hasProvider ? spec.substring(0, colonIndex) : "openai";
    const model = hasProvider ? spec.substring(colonIndex + 1) : spec;
    return {
      provider,
      model,
      dimensions: this.getKnownDimensions(model),
      modelSpec: spec
    };
  }
  /**
   * Get the known dimensions for a specific model.
   * Returns null if the model dimensions are not known.
   * Uses case-insensitive lookup.
   *
   * @param model The model name (e.g., "text-embedding-3-small")
   * @returns Known dimensions or null
   */
  getKnownDimensions(model) {
    return this.modelLookup.get(model.toLowerCase()) || null;
  }
  /**
   * Add or update known dimensions for a model.
   * This can be used to cache discovered dimensions.
   * Records the value under the original name in knownModelDimensions and
   * under the lowercased name in the lookup map.
   *
   * @param model The model name
   * @param dimensions The dimensions to cache
   */
  setKnownDimensions(model, dimensions) {
    this.knownModelDimensions[model] = dimensions;
    this.modelLookup.set(model.toLowerCase(), dimensions);
  }
  /**
   * Static method to parse embedding model configuration using the singleton instance.
   * This maintains backward compatibility while using the class-based approach.
   */
  static parseEmbeddingConfig(modelSpec) {
    return EmbeddingConfig.getInstance().parse(modelSpec);
  }
  /**
   * Static method to get known model dimensions using the singleton instance.
   * This maintains backward compatibility while using the class-based approach.
   */
  static getKnownModelDimensions(model) {
    return EmbeddingConfig.getInstance().getKnownDimensions(model);
  }
  /**
   * Static method to set known model dimensions using the singleton instance.
   * This maintains backward compatibility while using the class-based approach.
   */
  static setKnownModelDimensions(model, dimensions) {
    EmbeddingConfig.getInstance().setKnownDimensions(model, dimensions);
  }
}
6913
+ class DocumentStore {
6914
+ db;
6915
+ embeddings;
6916
+ dbDimension = VECTOR_DIMENSION;
6917
+ modelDimension;
6918
+ embeddingConfig;
6919
+ statements;
6920
+ /**
6921
+ * Calculates Reciprocal Rank Fusion score for a result
6922
+ */
6923
+ calculateRRF(vecRank, ftsRank, k = 60) {
6924
+ let rrf = 0;
6925
+ if (vecRank !== void 0) {
6926
+ rrf += 1 / (k + vecRank);
6927
+ }
6928
+ if (ftsRank !== void 0) {
6929
+ rrf += 1 / (k + ftsRank);
6930
+ }
6931
+ return rrf;
6932
+ }
6933
+ /**
6934
+ * Assigns ranks to search results based on their scores
6935
+ */
6936
+ assignRanks(results) {
6937
+ const vecRanks = /* @__PURE__ */ new Map();
6938
+ const ftsRanks = /* @__PURE__ */ new Map();
6939
+ results.filter((r) => r.vec_score !== void 0).sort((a, b) => (b.vec_score ?? 0) - (a.vec_score ?? 0)).forEach((result, index) => {
6940
+ vecRanks.set(Number(result.id), index + 1);
6941
+ });
6942
+ results.filter((r) => r.fts_score !== void 0).sort((a, b) => (b.fts_score ?? 0) - (a.fts_score ?? 0)).forEach((result, index) => {
6943
+ ftsRanks.set(Number(result.id), index + 1);
6944
+ });
6945
+ return results.map((result) => ({
6946
+ ...result,
6947
+ vec_rank: vecRanks.get(Number(result.id)),
6948
+ fts_rank: ftsRanks.get(Number(result.id)),
6949
+ rrf_score: this.calculateRRF(
6950
+ vecRanks.get(Number(result.id)),
6951
+ ftsRanks.get(Number(result.id))
6952
+ )
6953
+ }));
6954
+ }
6955
+ constructor(dbPath, embeddingConfig) {
6956
+ if (!dbPath) {
6957
+ throw new StoreError("Missing required database path");
6958
+ }
6959
+ this.db = new Database(dbPath);
6960
+ this.embeddingConfig = embeddingConfig;
6961
+ }
6962
+ /**
6963
+ * Sets up prepared statements for database queries
6964
+ */
6965
+ prepareStatements() {
6966
+ const statements = {
6967
+ getById: this.db.prepare("SELECT * FROM documents WHERE id = ?"),
6968
+ insertDocument: this.db.prepare(
6969
+ "INSERT INTO documents (library_id, version_id, url, content, metadata, sort_order, indexed_at) VALUES (?, ?, ?, ?, ?, ?, ?)"
6970
+ ),
6971
+ insertEmbedding: this.db.prepare(
6972
+ "INSERT INTO documents_vec (rowid, library_id, version_id, embedding) VALUES (?, ?, ?, ?)"
6973
+ ),
6974
+ insertLibrary: this.db.prepare(
6975
+ "INSERT INTO libraries (name) VALUES (?) ON CONFLICT(name) DO NOTHING"
6976
+ ),
6977
+ getLibraryIdByName: this.db.prepare(
6978
+ "SELECT id FROM libraries WHERE name = ?"
6979
+ ),
6980
+ // New version-related statements
6981
+ insertVersion: this.db.prepare(
6982
+ "INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING"
6983
+ ),
6984
+ resolveVersionId: this.db.prepare(
6985
+ "SELECT id FROM versions WHERE library_id = ? AND name IS ?"
6986
+ ),
6987
+ getVersionById: this.db.prepare("SELECT * FROM versions WHERE id = ?"),
6988
+ queryVersionsByLibraryId: this.db.prepare(
6989
+ "SELECT * FROM versions WHERE library_id = ? ORDER BY name"
6990
+ ),
6991
+ deleteLibraryDocuments: this.db.prepare(
6992
+ `DELETE FROM documents
6993
+ WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
6994
+ AND version_id = (
6995
+ SELECT v.id FROM versions v
6996
+ WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
6997
+ AND COALESCE(v.name, '') = COALESCE(?, '')
6998
+ )`
6999
+ ),
7000
+ deleteDocuments: this.db.prepare(
7001
+ `DELETE FROM documents
7002
+ WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
7003
+ AND version_id = (
7004
+ SELECT v.id FROM versions v
7005
+ WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
7006
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7007
+ )`
7008
+ ),
7009
+ deleteDocumentsByUrl: this.db.prepare(
7010
+ `DELETE FROM documents
7011
+ WHERE url = ?
7012
+ AND library_id = (SELECT id FROM libraries WHERE name = ?)
7013
+ AND version_id = (
7014
+ SELECT v.id FROM versions v
7015
+ WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
7016
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7017
+ )`
7018
+ ),
7019
+ getDocumentBySort: this.db.prepare(
7020
+ `SELECT d.id
7021
+ FROM documents d
7022
+ JOIN versions v ON d.version_id = v.id
7023
+ JOIN libraries l ON v.library_id = l.id
7024
+ WHERE l.name = ?
7025
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7026
+ LIMIT 1`
7027
+ ),
7028
+ queryVersions: this.db.prepare(
7029
+ `SELECT DISTINCT v.name
7030
+ FROM versions v
7031
+ JOIN libraries l ON v.library_id = l.id
7032
+ WHERE l.name = ?
7033
+ ORDER BY v.name`
7034
+ ),
7035
+ checkExists: this.db.prepare(
7036
+ `SELECT d.id FROM documents d
7037
+ JOIN versions v ON d.version_id = v.id
7038
+ JOIN libraries l ON v.library_id = l.id
7039
+ WHERE l.name = ?
7040
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7041
+ LIMIT 1`
7042
+ ),
7043
+ // Library/version aggregation including versions without documents and status/progress fields
7044
+ queryLibraryVersions: this.db.prepare(
7045
+ `SELECT
7046
+ l.name as library,
7047
+ COALESCE(v.name, '') as version,
7048
+ v.id as versionId,
7049
+ v.status as status,
7050
+ v.progress_pages as progressPages,
7051
+ v.progress_max_pages as progressMaxPages,
7052
+ v.source_url as sourceUrl,
7053
+ MIN(d.indexed_at) as indexedAt,
7054
+ COUNT(d.id) as documentCount,
7055
+ COUNT(DISTINCT d.url) as uniqueUrlCount
7056
+ FROM versions v
7057
+ JOIN libraries l ON v.library_id = l.id
7058
+ LEFT JOIN documents d ON d.version_id = v.id
7059
+ GROUP BY v.id
7060
+ ORDER BY l.name, version`
7061
+ ),
7062
+ getChildChunks: this.db.prepare(`
7063
+ SELECT d.* FROM documents d
7064
+ JOIN versions v ON d.version_id = v.id
7065
+ JOIN libraries l ON v.library_id = l.id
7066
+ WHERE l.name = ?
7067
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7068
+ AND d.url = ?
7069
+ AND json_array_length(json_extract(d.metadata, '$.path')) = ?
7070
+ AND json_extract(d.metadata, '$.path') LIKE ? || '%'
7071
+ AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
7072
+ ORDER BY d.sort_order
7073
+ LIMIT ?
7074
+ `),
7075
+ getPrecedingSiblings: this.db.prepare(`
7076
+ SELECT d.* FROM documents d
7077
+ JOIN versions v ON d.version_id = v.id
7078
+ JOIN libraries l ON v.library_id = l.id
7079
+ WHERE l.name = ?
7080
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7081
+ AND d.url = ?
7082
+ AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
7083
+ AND json_extract(d.metadata, '$.path') = ?
7084
+ ORDER BY d.sort_order DESC
7085
+ LIMIT ?
7086
+ `),
7087
+ getSubsequentSiblings: this.db.prepare(`
7088
+ SELECT d.* FROM documents d
7089
+ JOIN versions v ON d.version_id = v.id
7090
+ JOIN libraries l ON v.library_id = l.id
7091
+ WHERE l.name = ?
7092
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7093
+ AND d.url = ?
7094
+ AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
7095
+ AND json_extract(d.metadata, '$.path') = ?
7096
+ ORDER BY d.sort_order
7097
+ LIMIT ?
7098
+ `),
7099
+ getParentChunk: this.db.prepare(`
7100
+ SELECT d.* FROM documents d
7101
+ JOIN versions v ON d.version_id = v.id
7102
+ JOIN libraries l ON v.library_id = l.id
7103
+ WHERE l.name = ?
7104
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7105
+ AND d.url = ?
7106
+ AND json_extract(d.metadata, '$.path') = ?
7107
+ AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
7108
+ ORDER BY d.sort_order DESC
7109
+ LIMIT 1
7110
+ `),
7111
+ // Status tracking statements
7112
+ updateVersionStatus: this.db.prepare(
7113
+ "UPDATE versions SET status = ?, error_message = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
7114
+ ),
7115
+ updateVersionProgress: this.db.prepare(
7116
+ "UPDATE versions SET progress_pages = ?, progress_max_pages = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
7117
+ ),
7118
+ getVersionsByStatus: this.db.prepare(
7119
+ "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.status IN (SELECT value FROM json_each(?))"
7120
+ ),
7121
+ // Scraper options statements
7122
+ updateVersionScraperOptions: this.db.prepare(
7123
+ "UPDATE versions SET source_url = ?, scraper_options = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
7124
+ ),
7125
+ getVersionWithOptions: this.db.prepare(
7126
+ "SELECT * FROM versions WHERE id = ?"
7127
+ ),
7128
+ getVersionsBySourceUrl: this.db.prepare(
7129
+ "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.source_url = ? ORDER BY v.created_at DESC"
7130
+ ),
7131
+ // Version and library deletion statements
7132
+ deleteVersionById: this.db.prepare("DELETE FROM versions WHERE id = ?"),
7133
+ deleteLibraryById: this.db.prepare("DELETE FROM libraries WHERE id = ?"),
7134
+ countVersionsByLibraryId: this.db.prepare(
7135
+ "SELECT COUNT(*) as count FROM versions WHERE library_id = ?"
7136
+ ),
7137
+ getVersionId: this.db.prepare(
7138
+ `SELECT v.id, v.library_id FROM versions v
7139
+ JOIN libraries l ON v.library_id = l.id
7140
+ WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')`
7141
+ )
7142
+ };
7143
+ this.statements = statements;
7144
+ }
7145
+ /**
7146
+ * Pads a vector to the fixed database dimension by appending zeros.
7147
+ * Throws an error if the input vector is longer than the database dimension.
7148
+ */
7149
+ padVector(vector) {
7150
+ if (vector.length > this.dbDimension) {
7151
+ throw new Error(
7152
+ `Vector dimension ${vector.length} exceeds database dimension ${this.dbDimension}`
7153
+ );
7154
+ }
7155
+ if (vector.length === this.dbDimension) {
7156
+ return vector;
7157
+ }
7158
+ return [...vector, ...new Array(this.dbDimension - vector.length).fill(0)];
7159
+ }
7160
+ /**
7161
+ * Initialize the embeddings client using either provided config or environment variables.
7162
+ * If no embedding config is provided (null), embeddings will not be initialized.
7163
+ * This allows DocumentStore to be used without embeddings for operations that don't need them.
7164
+ *
7165
+ * Environment variables per provider:
7166
+ * - openai: OPENAI_API_KEY (and optionally OPENAI_API_BASE, OPENAI_ORG_ID)
7167
+ * - vertex: GOOGLE_APPLICATION_CREDENTIALS (path to service account JSON)
7168
+ * - gemini: GOOGLE_API_KEY
7169
+ * - aws: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION
7170
+ * - microsoft: Azure OpenAI credentials (AZURE_OPENAI_API_*)
7171
+ */
7172
+ async initializeEmbeddings() {
7173
+ if (this.embeddingConfig === null) {
7174
+ logger.debug("Embedding initialization skipped (explicitly disabled)");
7175
+ return;
7176
+ }
7177
+ const config = this.embeddingConfig || EmbeddingConfig.parseEmbeddingConfig();
7178
+ try {
7179
+ this.embeddings = createEmbeddingModel(config.modelSpec);
7180
+ if (config.dimensions !== null) {
7181
+ this.modelDimension = config.dimensions;
7182
+ } else {
7183
+ const testVector = await this.embeddings.embedQuery("test");
7184
+ this.modelDimension = testVector.length;
7185
+ EmbeddingConfig.setKnownModelDimensions(config.model, this.modelDimension);
7186
+ }
7187
+ if (this.modelDimension > this.dbDimension) {
7188
+ throw new DimensionError(config.modelSpec, this.modelDimension, this.dbDimension);
7189
+ }
7190
+ logger.debug(
7191
+ `Embeddings initialized: ${config.provider}:${config.model} (${this.modelDimension}d)`
7192
+ );
7193
+ } catch (error) {
7194
+ if (error instanceof Error) {
7195
+ if (error.message.includes("does not exist") || error.message.includes("MODEL_NOT_FOUND")) {
7196
+ throw new ModelConfigurationError(
7197
+ `❌ Invalid embedding model: ${config.model}
7198
+ The model "${config.model}" is not available or you don't have access to it.
7199
+ See README.md for supported models or run with --help for more details.`
7200
+ );
7201
+ }
7202
+ if (error.message.includes("API key") || error.message.includes("401") || error.message.includes("authentication")) {
7203
+ throw new ModelConfigurationError(
7204
+ `❌ Authentication failed for ${config.provider} embedding provider
7205
+ Please check your API key configuration.
7206
+ See README.md for configuration options or run with --help for more details.`
7207
+ );
7208
+ }
7209
+ }
7210
+ throw error;
7211
+ }
7212
+ }
7213
+ /**
7214
+ * Escapes a query string for use with SQLite FTS5 MATCH operator.
7215
+ * Wraps the query in double quotes and escapes internal double quotes.
7216
+ */
7217
+ escapeFtsQuery(query) {
7218
+ const escapedQuotes = query.replace(/"/g, '""');
7219
+ return `"${escapedQuotes}"`;
7220
+ }
7221
+ /**
7222
+ * Initializes database connection and ensures readiness
7223
+ */
7224
+ async initialize() {
7225
+ try {
7226
+ sqliteVec.load(this.db);
7227
+ applyMigrations(this.db);
7228
+ this.prepareStatements();
7229
+ await this.initializeEmbeddings();
7230
+ } catch (error) {
7231
+ if (error instanceof StoreError || error instanceof ModelConfigurationError || error instanceof UnsupportedProviderError) {
7232
+ throw error;
7233
+ }
7234
+ throw new ConnectionError("Failed to initialize database connection", error);
7235
+ }
7236
+ }
7237
+ /**
7238
+ * Gracefully closes database connections
7239
+ */
7240
+ async shutdown() {
7241
+ this.db.close();
7242
+ }
7243
+ /**
7244
+ * Resolves a library name and version string to library_id and version_id.
7245
+ * Creates library and version records if they don't exist.
7246
+ */
7247
+ async resolveLibraryAndVersionIds(library, version2) {
7248
+ const normalizedLibrary = library.toLowerCase();
7249
+ const normalizedVersion = denormalizeVersionName(version2.toLowerCase());
7250
+ this.statements.insertLibrary.run(normalizedLibrary);
7251
+ const libraryIdRow = this.statements.getLibraryIdByName.get(normalizedLibrary);
7252
+ if (!libraryIdRow || typeof libraryIdRow.id !== "number") {
7253
+ throw new StoreError(`Failed to resolve library_id for library: ${library}`);
7254
+ }
7255
+ const libraryId = libraryIdRow.id;
7256
+ this.statements.insertVersion.run(libraryId, normalizedVersion);
7257
+ const versionIdRow = this.statements.resolveVersionId.get(
7258
+ libraryId,
7259
+ normalizedVersion === null ? "" : normalizedVersion
7260
+ );
7261
+ if (!versionIdRow || typeof versionIdRow.id !== "number") {
7262
+ throw new StoreError(
7263
+ `Failed to resolve version_id for library: ${library}, version: ${version2}`
7264
+ );
7265
+ }
7266
+ return { libraryId, versionId: versionIdRow.id };
7267
+ }
7268
+ /**
7269
+ * Retrieves all unique versions for a specific library
7270
+ */
7271
+ async queryUniqueVersions(library) {
7272
+ try {
7273
+ const rows = this.statements.queryVersions.all(library.toLowerCase());
7274
+ return rows.map((row) => normalizeVersionName(row.name));
7275
+ } catch (error) {
7276
+ throw new ConnectionError("Failed to query versions", error);
7277
+ }
7278
+ }
7279
+ /**
7280
+ * Updates the status of a version record in the database.
7281
+ * @param versionId The version ID to update
7282
+ * @param status The new status to set
7283
+ * @param errorMessage Optional error message for failed statuses
7284
+ */
7285
+ async updateVersionStatus(versionId, status, errorMessage) {
7286
+ try {
7287
+ this.statements.updateVersionStatus.run(status, errorMessage ?? null, versionId);
7288
+ } catch (error) {
7289
+ throw new StoreError(`Failed to update version status: ${error}`);
7290
+ }
7291
+ }
7292
+ /**
7293
+ * Updates the progress counters for a version being indexed.
7294
+ * @param versionId The version ID to update
7295
+ * @param pages Current number of pages processed
7296
+ * @param maxPages Total number of pages to process
7297
+ */
7298
+ async updateVersionProgress(versionId, pages, maxPages) {
7299
+ try {
7300
+ this.statements.updateVersionProgress.run(pages, maxPages, versionId);
7301
+ } catch (error) {
7302
+ throw new StoreError(`Failed to update version progress: ${error}`);
7303
+ }
7304
+ }
7305
+ /**
7306
+ * Retrieves versions by their status.
7307
+ * @param statuses Array of statuses to filter by
7308
+ * @returns Array of version records matching the statuses
7309
+ */
7310
+ async getVersionsByStatus(statuses) {
7311
+ try {
7312
+ const statusJson = JSON.stringify(statuses);
7313
+ const rows = this.statements.getVersionsByStatus.all(
7314
+ statusJson
7315
+ );
7316
+ return rows;
7317
+ } catch (error) {
7318
+ throw new StoreError(`Failed to get versions by status: ${error}`);
7319
+ }
7320
+ }
7321
+ /**
7322
+ * Stores scraper options for a version to enable reproducible indexing.
7323
+ * @param versionId The version ID to update
7324
+ * @param options Complete scraper options used for indexing
7325
+ */
7326
+ async storeScraperOptions(versionId, options) {
7327
+ try {
7328
+ const { url: source_url, library, version: version2, signal, ...scraper_options } = options;
7329
+ const optionsJson = JSON.stringify(scraper_options);
7330
+ this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId);
7331
+ } catch (error) {
7332
+ throw new StoreError(`Failed to store scraper options: ${error}`);
7333
+ }
7334
+ }
7335
+ /**
7336
+ * Retrieves stored scraping configuration (source URL and options) for a version.
7337
+ * Returns null when no source URL is recorded (not re-indexable).
7338
+ */
7339
+ async getScraperOptions(versionId) {
7340
+ try {
7341
+ const row = this.statements.getVersionWithOptions.get(versionId);
7342
+ if (!row?.source_url) {
7343
+ return null;
7344
+ }
7345
+ let parsed = {};
7346
+ if (row.scraper_options) {
7347
+ try {
7348
+ parsed = JSON.parse(row.scraper_options);
7349
+ } catch (e) {
7350
+ logger.warn(`⚠️ Invalid scraper_options JSON for version ${versionId}: ${e}`);
7351
+ parsed = {};
7352
+ }
7353
+ }
7354
+ return { sourceUrl: row.source_url, options: parsed };
7355
+ } catch (error) {
7356
+ throw new StoreError(`Failed to get scraper options: ${error}`);
7357
+ }
7358
+ }
7359
+ /**
7360
+ * Finds versions that were indexed from the same source URL.
7361
+ * Useful for finding similar configurations or detecting duplicates.
7362
+ * @param url Source URL to search for
7363
+ * @returns Array of versions with the same source URL
7364
+ */
7365
+ async findVersionsBySourceUrl(url) {
7366
+ try {
7367
+ const rows = this.statements.getVersionsBySourceUrl.all(
7368
+ url
7369
+ );
7370
+ return rows;
7371
+ } catch (error) {
7372
+ throw new StoreError(`Failed to find versions by source URL: ${error}`);
7373
+ }
7374
+ }
7375
+ /**
7376
+ * Verifies existence of documents for a specific library version
7377
+ */
7378
+ async checkDocumentExists(library, version2) {
7379
+ try {
7380
+ const normalizedVersion = version2.toLowerCase();
7381
+ const result = this.statements.checkExists.get(
7382
+ library.toLowerCase(),
7383
+ normalizedVersion
7384
+ );
7385
+ return result !== void 0;
7386
+ } catch (error) {
7387
+ throw new ConnectionError("Failed to check document existence", error);
7388
+ }
7389
+ }
7390
+ /**
7391
+ * Retrieves a mapping of all libraries to their available versions with details.
7392
+ */
7393
+ async queryLibraryVersions() {
7394
+ try {
7395
+ const rows = this.statements.queryLibraryVersions.all();
7396
+ const libraryMap = /* @__PURE__ */ new Map();
7397
+ for (const row of rows) {
7398
+ const library = row.library;
7399
+ if (!libraryMap.has(library)) {
7400
+ libraryMap.set(library, []);
7401
+ }
7402
+ const indexedAtISO = row.indexedAt ? new Date(row.indexedAt).toISOString() : null;
7403
+ libraryMap.get(library)?.push({
7404
+ version: row.version,
7405
+ versionId: row.versionId,
7406
+ // Preserve raw string status here; DocumentManagementService will cast to VersionStatus
7407
+ status: row.status,
7408
+ progressPages: row.progressPages,
7409
+ progressMaxPages: row.progressMaxPages,
7410
+ sourceUrl: row.sourceUrl,
7411
+ documentCount: row.documentCount,
7412
+ uniqueUrlCount: row.uniqueUrlCount,
7413
+ indexedAt: indexedAtISO
7414
+ });
7415
+ }
7416
+ for (const versions of libraryMap.values()) {
7417
+ versions.sort((a, b) => {
7418
+ if (a.version === "" && b.version !== "") {
7419
+ return -1;
7420
+ }
7421
+ if (a.version !== "" && b.version === "") {
7422
+ return 1;
7423
+ }
7424
+ if (a.version === "" && b.version === "") {
7425
+ return 0;
7426
+ }
7427
+ try {
7428
+ return semver__default.compare(a.version, b.version);
7429
+ } catch (_error) {
7430
+ return a.version.localeCompare(b.version);
7431
+ }
7432
+ });
7433
+ }
7434
+ return libraryMap;
7435
+ } catch (error) {
7436
+ throw new ConnectionError("Failed to query library versions", error);
7437
+ }
7438
+ }
7439
+ /**
7440
+ * Stores documents with library and version metadata, generating embeddings
7441
+ * for vector similarity search. Automatically removes any existing documents
7442
+ * for the same URLs before adding new ones to prevent UNIQUE constraint violations.
7443
+ */
7444
+ async addDocuments(library, version2, documents) {
7445
+ try {
7446
+ if (documents.length === 0) {
7447
+ return;
7448
+ }
7449
+ const urls = /* @__PURE__ */ new Set();
7450
+ for (const doc of documents) {
7451
+ const url = doc.metadata.url;
7452
+ if (!url || typeof url !== "string" || !url.trim()) {
7453
+ throw new StoreError("Document metadata must include a valid URL");
7454
+ }
7455
+ urls.add(url);
7456
+ }
7457
+ const texts = documents.map((doc) => {
7458
+ const header = `<title>${doc.metadata.title}</title>
7459
+ <url>${doc.metadata.url}</url>
7460
+ <path>${doc.metadata.path.join(" / ")}</path>
7461
+ `;
7462
+ return `${header}${doc.pageContent}`;
7463
+ });
7464
+ const maxBatchChars = Number(process.env.DOCS_MCP_EMBEDDING_BATCH_CHARS) || EMBEDDING_BATCH_CHARS;
7465
+ const rawEmbeddings = [];
7466
+ let currentBatch = [];
7467
+ let currentBatchSize = 0;
7468
+ let batchCount = 0;
7469
+ for (const text of texts) {
7470
+ const textSize = text.length;
7471
+ if (currentBatchSize + textSize > maxBatchChars && currentBatch.length > 0) {
7472
+ batchCount++;
7473
+ logger.debug(
7474
+ `🔄 Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
7475
+ );
7476
+ const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
7477
+ rawEmbeddings.push(...batchEmbeddings);
7478
+ currentBatch = [];
7479
+ currentBatchSize = 0;
7480
+ }
7481
+ currentBatch.push(text);
7482
+ currentBatchSize += textSize;
7483
+ if (currentBatch.length >= EMBEDDING_BATCH_SIZE) {
7484
+ batchCount++;
7485
+ logger.debug(
7486
+ `🔄 Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
7487
+ );
7488
+ const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
7489
+ rawEmbeddings.push(...batchEmbeddings);
7490
+ currentBatch = [];
7491
+ currentBatchSize = 0;
7492
+ }
7493
+ }
7494
+ if (currentBatch.length > 0) {
7495
+ batchCount++;
7496
+ logger.debug(
7497
+ `🔄 Processing final embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
7498
+ );
7499
+ const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
7500
+ rawEmbeddings.push(...batchEmbeddings);
7501
+ }
7502
+ const paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
7503
+ const { libraryId, versionId } = await this.resolveLibraryAndVersionIds(
7504
+ library,
7505
+ version2
7506
+ );
7507
+ for (const url of urls) {
7508
+ const deletedCount = await this.deleteDocumentsByUrl(library, version2, url);
7509
+ if (deletedCount > 0) {
7510
+ logger.debug(`Deleted ${deletedCount} existing documents for URL: ${url}`);
7511
+ }
7512
+ }
7513
+ const transaction = this.db.transaction((docs) => {
7514
+ for (let i = 0; i < docs.length; i++) {
7515
+ const doc = docs[i];
7516
+ const url = doc.metadata.url;
7517
+ const result = this.statements.insertDocument.run(
7518
+ BigInt(libraryId),
7519
+ BigInt(versionId),
7520
+ url,
7521
+ doc.pageContent,
7522
+ JSON.stringify(doc.metadata),
7523
+ i,
7524
+ (/* @__PURE__ */ new Date()).toISOString()
7525
+ // Pass current timestamp for indexed_at
7526
+ );
7527
+ const rowId = result.lastInsertRowid;
7528
+ this.statements.insertEmbedding.run(
7529
+ BigInt(rowId),
7530
+ BigInt(libraryId),
7531
+ BigInt(versionId),
7532
+ JSON.stringify(paddedEmbeddings[i])
7533
+ );
7534
+ }
7535
+ });
7536
+ transaction(documents);
7537
+ } catch (error) {
7538
+ throw new ConnectionError("Failed to add documents to store", error);
7539
+ }
7540
+ }
7541
+ /**
7542
+ * Removes documents matching specified library and version
7543
+ * @returns Number of documents deleted
7544
+ */
7545
+ async deleteDocuments(library, version2) {
7546
+ try {
7547
+ const normalizedVersion = version2.toLowerCase();
7548
+ const result = this.statements.deleteDocuments.run(
7549
+ library.toLowerCase(),
7550
+ library.toLowerCase(),
7551
+ // library name appears twice in the query
7552
+ normalizedVersion
7553
+ );
7554
+ return result.changes;
7555
+ } catch (error) {
7556
+ throw new ConnectionError("Failed to delete documents", error);
7557
+ }
7558
+ }
7559
+ /**
7560
+ * Removes documents for a specific URL within a library and version
7561
+ * @returns Number of documents deleted
7562
+ */
7563
+ async deleteDocumentsByUrl(library, version2, url) {
7564
+ try {
7565
+ const normalizedVersion = version2.toLowerCase();
7566
+ const result = this.statements.deleteDocumentsByUrl.run(
7567
+ url,
7568
+ library.toLowerCase(),
7569
+ library.toLowerCase(),
7570
+ // library name appears twice in the query
7571
+ normalizedVersion
7572
+ );
7573
+ return result.changes;
7574
+ } catch (error) {
7575
+ throw new ConnectionError("Failed to delete documents by URL", error);
7576
+ }
7577
+ }
7578
+ /**
7579
+ * Completely removes a library version and all associated documents.
7580
+ * Optionally removes the library if no other versions remain.
7581
+ * @param library Library name
7582
+ * @param version Version string (empty string for unversioned)
7583
+ * @param removeLibraryIfEmpty Whether to remove the library if no versions remain
7584
+ * @returns Object with counts of deleted documents, version deletion status, and library deletion status
7585
+ */
7586
+ async removeVersion(library, version2, removeLibraryIfEmpty = true) {
7587
+ try {
7588
+ const normalizedLibrary = library.toLowerCase();
7589
+ const normalizedVersion = version2.toLowerCase();
7590
+ const versionResult = this.statements.getVersionId.get(
7591
+ normalizedLibrary,
7592
+ normalizedVersion
7593
+ );
7594
+ if (!versionResult) {
7595
+ return { documentsDeleted: 0, versionDeleted: false, libraryDeleted: false };
7596
+ }
7597
+ const { id: versionId, library_id: libraryId } = versionResult;
7598
+ const documentsDeleted = await this.deleteDocuments(library, version2);
7599
+ const versionDeleteResult = this.statements.deleteVersionById.run(versionId);
7600
+ const versionDeleted = versionDeleteResult.changes > 0;
7601
+ let libraryDeleted = false;
7602
+ if (removeLibraryIfEmpty && versionDeleted) {
7603
+ const countResult = this.statements.countVersionsByLibraryId.get(libraryId);
7604
+ const remainingVersions = countResult?.count ?? 0;
7605
+ if (remainingVersions === 0) {
7606
+ const libraryDeleteResult = this.statements.deleteLibraryById.run(libraryId);
7607
+ libraryDeleted = libraryDeleteResult.changes > 0;
7608
+ }
7609
+ }
7610
+ return { documentsDeleted, versionDeleted, libraryDeleted };
7611
+ } catch (error) {
7612
+ throw new ConnectionError("Failed to remove version", error);
7613
+ }
7614
+ }
7615
+ /**
7616
+ * Retrieves a document by its ID.
7617
+ * @param id The ID of the document.
7618
+ * @returns The document, or null if not found.
7619
+ */
7620
+ async getById(id) {
7621
+ try {
7622
+ const row = this.statements.getById.get(BigInt(id));
7623
+ if (!row) {
7624
+ return null;
7625
+ }
7626
+ return mapDbDocumentToDocument(row);
7627
+ } catch (error) {
7628
+ throw new ConnectionError(`Failed to get document by ID ${id}`, error);
7629
+ }
7630
+ }
7631
+ /**
7632
+ * Finds documents matching a text query using hybrid search.
7633
+ * Combines vector similarity search with full-text search using Reciprocal Rank Fusion.
7634
+ */
7635
+ async findByContent(library, version2, query, limit) {
7636
+ try {
7637
+ const rawEmbedding = await this.embeddings.embedQuery(query);
7638
+ const embedding = this.padVector(rawEmbedding);
7639
+ const ftsQuery = this.escapeFtsQuery(query);
7640
+ const normalizedVersion = version2.toLowerCase();
7641
+ const stmt = this.db.prepare(`
7642
+ WITH vec_distances AS (
7643
+ SELECT
7644
+ dv.rowid as id,
7645
+ dv.distance as vec_distance
7646
+ FROM documents_vec dv
7647
+ JOIN versions v ON dv.version_id = v.id
7648
+ JOIN libraries l ON v.library_id = l.id
7649
+ WHERE l.name = ?
7650
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7651
+ AND dv.embedding MATCH ?
7652
+ AND dv.k = ?
7653
+ ORDER BY dv.distance
7654
+ ),
7655
+ fts_scores AS (
7656
+ SELECT
7657
+ f.rowid as id,
7658
+ bm25(documents_fts, 10.0, 1.0, 5.0, 1.0) as fts_score
7659
+ FROM documents_fts f
7660
+ JOIN documents d ON f.rowid = d.id
7661
+ JOIN versions v ON d.version_id = v.id
7662
+ JOIN libraries l ON v.library_id = l.id
7663
+ WHERE l.name = ?
7664
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7665
+ AND documents_fts MATCH ?
7666
+ ORDER BY fts_score
7667
+ LIMIT ?
7668
+ )
7669
+ SELECT
7670
+ d.id,
7671
+ d.content,
7672
+ d.metadata,
7673
+ COALESCE(1 / (1 + v.vec_distance), 0) as vec_score,
7674
+ COALESCE(-MIN(f.fts_score, 0), 0) as fts_score
7675
+ FROM documents d
7676
+ LEFT JOIN vec_distances v ON d.id = v.id
7677
+ LEFT JOIN fts_scores f ON d.id = f.id
7678
+ WHERE v.id IS NOT NULL OR f.id IS NOT NULL
7679
+ `);
7680
+ const rawResults = stmt.all(
7681
+ library.toLowerCase(),
7682
+ normalizedVersion,
7683
+ JSON.stringify(embedding),
7684
+ limit,
7685
+ library.toLowerCase(),
7686
+ normalizedVersion,
7687
+ ftsQuery,
7688
+ // Use the escaped query
7689
+ limit
7690
+ );
7691
+ const rankedResults = this.assignRanks(rawResults);
7692
+ const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
7693
+ return topResults.map((row) => ({
7694
+ ...mapDbDocumentToDocument(row),
7695
+ metadata: {
7696
+ ...JSON.parse(row.metadata),
7697
+ id: row.id,
7698
+ score: row.rrf_score,
7699
+ vec_rank: row.vec_rank,
7700
+ fts_rank: row.fts_rank
7701
+ }
7702
+ }));
7703
+ } catch (error) {
7704
+ throw new ConnectionError(
7705
+ `Failed to find documents by content with query "${query}"`,
7706
+ error
7707
+ );
7708
+ }
7709
+ }
7710
+ /**
7711
+ * Finds child chunks of a given document based on path hierarchy.
7712
+ */
7713
+ async findChildChunks(library, version2, id, limit) {
7714
+ try {
7715
+ const parent = await this.getById(id);
7716
+ if (!parent) {
7717
+ return [];
7718
+ }
7719
+ const parentPath = parent.metadata.path ?? [];
7720
+ const parentUrl = parent.metadata.url;
7721
+ const normalizedVersion = version2.toLowerCase();
7722
+ const result = this.statements.getChildChunks.all(
7723
+ library.toLowerCase(),
7724
+ normalizedVersion,
7725
+ parentUrl,
7726
+ parentPath.length + 1,
7727
+ JSON.stringify(parentPath),
7728
+ BigInt(id),
7729
+ limit
7730
+ );
7731
+ return result.map((row) => mapDbDocumentToDocument(row));
7732
+ } catch (error) {
7733
+ throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
7734
+ }
7735
+ }
7736
+ /**
7737
+ * Finds preceding sibling chunks of a given document.
7738
+ */
7739
+ async findPrecedingSiblingChunks(library, version2, id, limit) {
7740
+ try {
7741
+ const reference = await this.getById(id);
7742
+ if (!reference) {
7743
+ return [];
7744
+ }
7745
+ const refMetadata = reference.metadata;
7746
+ const normalizedVersion = version2.toLowerCase();
7747
+ const result = this.statements.getPrecedingSiblings.all(
7748
+ library.toLowerCase(),
7749
+ normalizedVersion,
7750
+ refMetadata.url,
7751
+ BigInt(id),
7752
+ JSON.stringify(refMetadata.path),
7753
+ limit
7754
+ );
7755
+ return result.reverse().map((row) => mapDbDocumentToDocument(row));
7756
+ } catch (error) {
7757
+ throw new ConnectionError(
7758
+ `Failed to find preceding sibling chunks for ID ${id}`,
7759
+ error
7760
+ );
7761
+ }
7762
+ }
7763
+ /**
7764
+ * Finds subsequent sibling chunks of a given document.
7765
+ */
7766
+ async findSubsequentSiblingChunks(library, version2, id, limit) {
7767
+ try {
7768
+ const reference = await this.getById(id);
7769
+ if (!reference) {
7770
+ return [];
7771
+ }
7772
+ const refMetadata = reference.metadata;
7773
+ const normalizedVersion = version2.toLowerCase();
7774
+ const result = this.statements.getSubsequentSiblings.all(
7775
+ library.toLowerCase(),
7776
+ normalizedVersion,
7777
+ refMetadata.url,
7778
+ BigInt(id),
7779
+ JSON.stringify(refMetadata.path),
7780
+ limit
7781
+ );
7782
+ return result.map((row) => mapDbDocumentToDocument(row));
7783
+ } catch (error) {
7784
+ throw new ConnectionError(
7785
+ `Failed to find subsequent sibling chunks for ID ${id}`,
7786
+ error
7787
+ );
7788
+ }
7789
+ }
7790
+ /**
7791
+ * Finds the parent chunk of a given document.
7792
+ */
7793
+ async findParentChunk(library, version2, id) {
7794
+ try {
7795
+ const child = await this.getById(id);
7796
+ if (!child) {
7797
+ return null;
7798
+ }
7799
+ const childMetadata = child.metadata;
7800
+ const path2 = childMetadata.path ?? [];
7801
+ const parentPath = path2.slice(0, -1);
7802
+ if (parentPath.length === 0) {
7803
+ return null;
7804
+ }
7805
+ const normalizedVersion = version2.toLowerCase();
7806
+ const result = this.statements.getParentChunk.get(
7807
+ library.toLowerCase(),
7808
+ normalizedVersion,
7809
+ childMetadata.url,
7810
+ JSON.stringify(parentPath),
7811
+ BigInt(id)
7812
+ );
7813
+ if (!result) {
7814
+ return null;
7815
+ }
7816
+ return mapDbDocumentToDocument(result);
7817
+ } catch (error) {
7818
+ throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error);
7819
+ }
7820
+ }
7821
+ /**
7822
+ * Fetches multiple documents by their IDs in a single call.
7823
+ * Returns an array of Document objects, sorted by their sort_order.
7824
+ */
7825
+ async findChunksByIds(library, version2, ids) {
7826
+ if (!ids.length) return [];
7827
+ try {
7828
+ const normalizedVersion = version2.toLowerCase();
7829
+ const placeholders = ids.map(() => "?").join(",");
7830
+ const stmt = this.db.prepare(
7831
+ `SELECT d.* FROM documents d
7832
+ JOIN libraries l ON d.library_id = l.id
7833
+ JOIN versions v ON d.version_id = v.id
7834
+ WHERE l.name = ?
7835
+ AND COALESCE(v.name, '') = COALESCE(?, '')
7836
+ AND d.id IN (${placeholders})
7837
+ ORDER BY d.sort_order`
7838
+ );
7839
+ const rows = stmt.all(
7840
+ library.toLowerCase(),
7841
+ normalizedVersion,
7842
+ ...ids
7843
+ );
7844
+ return rows.map((row) => mapDbDocumentToDocument(row));
7845
+ } catch (error) {
7846
+ throw new ConnectionError("Failed to fetch documents by IDs", error);
7847
+ }
7848
+ }
7849
+ }
7850
+ class DocumentManagementService {
7851
+ store;
7852
+ documentRetriever;
7853
+ splitter;
7854
+ /**
7855
+ * Normalizes a version string, converting null or undefined to an empty string
7856
+ * and converting to lowercase.
7857
+ */
7858
+ normalizeVersion(version2) {
7859
+ return (version2 ?? "").toLowerCase();
7860
+ }
7861
+ constructor(embeddingConfig) {
7862
+ let dbPath;
7863
+ let dbDir;
7864
+ const envStorePath = process.env.DOCS_MCP_STORE_PATH;
7865
+ if (envStorePath) {
7866
+ dbDir = envStorePath;
7867
+ dbPath = path.join(dbDir, "documents.db");
7868
+ logger.debug(`Using database directory from DOCS_MCP_STORE_PATH: ${dbDir}`);
7869
+ } else {
7870
+ const projectRoot2 = getProjectRoot();
7871
+ const oldDbDir = path.join(projectRoot2, ".store");
7872
+ const oldDbPath = path.join(oldDbDir, "documents.db");
7873
+ const oldDbExists = fs.existsSync(oldDbPath);
7874
+ if (oldDbExists) {
7875
+ dbPath = oldDbPath;
7876
+ dbDir = oldDbDir;
7877
+ logger.debug(`Using legacy database path: ${dbPath}`);
7878
+ } else {
7879
+ const standardPaths = envPaths("docs-mcp-server", { suffix: "" });
7880
+ dbDir = standardPaths.data;
7881
+ dbPath = path.join(dbDir, "documents.db");
7882
+ logger.debug(`Using standard database directory: ${dbDir}`);
7883
+ }
7884
+ }
7885
+ try {
7886
+ fs.mkdirSync(dbDir, { recursive: true });
7887
+ } catch (error) {
7888
+ logger.error(`⚠️ Failed to create database directory ${dbDir}: ${error}`);
7889
+ }
7890
+ this.store = new DocumentStore(dbPath, embeddingConfig);
7891
+ this.documentRetriever = new DocumentRetrieverService(this.store);
7892
+ const semanticSplitter = new SemanticMarkdownSplitter(
7893
+ SPLITTER_PREFERRED_CHUNK_SIZE,
7894
+ SPLITTER_MAX_CHUNK_SIZE
7895
+ );
7896
+ const greedySplitter = new GreedySplitter(
7897
+ semanticSplitter,
7898
+ SPLITTER_MIN_CHUNK_SIZE,
7899
+ SPLITTER_PREFERRED_CHUNK_SIZE
7900
+ );
7901
+ this.splitter = greedySplitter;
7902
+ }
7903
+ /**
7904
+ * Initializes the underlying document store.
7905
+ */
7906
+ async initialize() {
7907
+ await this.store.initialize();
7908
+ }
7909
+ /**
7910
+ * Shuts down the underlying document store.
7911
+ */
7912
+ async shutdown() {
7913
+ logger.debug("Shutting down store manager");
7914
+ await this.store.shutdown();
7915
+ }
7916
+ // Status tracking methods for pipeline integration
7917
+ /**
7918
+ * Gets versions by their current status.
7919
+ */
7920
+ async getVersionsByStatus(statuses) {
7921
+ return this.store.getVersionsByStatus(statuses);
7922
+ }
7923
+ /**
7924
+ * Updates the status of a version.
7925
+ */
7926
+ async updateVersionStatus(versionId, status, errorMessage) {
7927
+ return this.store.updateVersionStatus(versionId, status, errorMessage);
7928
+ }
7929
+ /**
7930
+ * Updates the progress of a version being indexed.
7931
+ */
7932
+ async updateVersionProgress(versionId, pages, maxPages) {
7933
+ return this.store.updateVersionProgress(versionId, pages, maxPages);
7934
+ }
7935
+ /**
7936
+ * Stores scraper options for a version to enable reproducible indexing.
7937
+ */
7938
+ async storeScraperOptions(versionId, options) {
7939
+ return this.store.storeScraperOptions(versionId, options);
7940
+ }
7941
+ /**
7942
+ * Retrieves stored scraper options for a version.
7943
+ */
7944
+ /**
7945
+ * Retrieves stored scraping configuration for a version.
7946
+ */
7947
+ async getScraperOptions(versionId) {
7948
+ return this.store.getScraperOptions(versionId);
7949
+ }
7950
+ /**
7951
+ * Ensures a library/version exists using a VersionRef and returns version ID.
7952
+ * Delegates to existing ensureLibraryAndVersion for storage.
7953
+ */
7954
+ async ensureVersion(ref) {
7955
+ const normalized = {
7956
+ library: ref.library.trim().toLowerCase(),
7957
+ version: (ref.version ?? "").trim().toLowerCase()
7958
+ };
7959
+ return this.ensureLibraryAndVersion(normalized.library, normalized.version);
7960
+ }
7961
+ /**
7962
+ * Returns enriched library summaries including version status/progress and counts.
7963
+ * Uses existing store APIs; keeps DB details encapsulated.
7964
+ */
7965
+ async listLibraries() {
7966
+ const libMap = await this.store.queryLibraryVersions();
7967
+ const summaries = [];
7968
+ for (const [library, versions] of libMap) {
7969
+ const vs = versions.map(
7970
+ (v) => ({
7971
+ id: v.versionId,
7972
+ ref: { library, version: v.version },
7973
+ status: v.status,
7974
+ // Include progress only while indexing is active; set undefined for COMPLETED
7975
+ progress: v.status === "completed" ? void 0 : { pages: v.progressPages, maxPages: v.progressMaxPages },
7976
+ counts: { documents: v.documentCount, uniqueUrls: v.uniqueUrlCount },
7977
+ indexedAt: v.indexedAt,
7978
+ sourceUrl: v.sourceUrl ?? void 0
7979
+ })
7980
+ );
7981
+ summaries.push({ library, versions: vs });
7982
+ }
7983
+ return summaries;
7984
+ }
7985
+ /**
7986
+ * Finds versions that were indexed from the same source URL.
7987
+ */
7988
+ async findVersionsBySourceUrl(url) {
7989
+ return this.store.findVersionsBySourceUrl(url);
7990
+ }
7991
+ /**
7992
+ * Validates if a library exists in the store (either versioned or unversioned).
7993
+ * Throws LibraryNotFoundError with suggestions if the library is not found.
7994
+ * @param library The name of the library to validate.
7995
+ * @throws {LibraryNotFoundError} If the library does not exist.
7996
+ */
7997
+ async validateLibraryExists(library) {
7998
+ logger.info(`🔎 Validating existence of library: ${library}`);
7999
+ const normalizedLibrary = library.toLowerCase();
8000
+ const versions = await this.listVersions(normalizedLibrary);
8001
+ const hasUnversioned = await this.exists(normalizedLibrary, "");
8002
+ if (versions.length === 0 && !hasUnversioned) {
8003
+ logger.warn(`⚠️ Library '${library}' not found.`);
8004
+ const allLibraries = await this.listLibraries();
8005
+ const libraryNames = allLibraries.map((lib) => lib.library);
8006
+ let suggestions = [];
8007
+ if (libraryNames.length > 0) {
8008
+ const fuse = new Fuse(libraryNames, {
8009
+ // Configure fuse.js options if needed (e.g., threshold)
8010
+ // isCaseSensitive: false, // Handled by normalizing library names
8011
+ // includeScore: true,
8012
+ threshold: 0.4
8013
+ // Adjust threshold for desired fuzziness (0=exact, 1=match anything)
8014
+ });
8015
+ const results = fuse.search(normalizedLibrary);
8016
+ suggestions = results.slice(0, 3).map((result) => result.item);
8017
+ logger.info(`🔍 Found suggestions: ${suggestions.join(", ")}`);
8018
+ }
8019
+ throw new LibraryNotFoundError(library, suggestions);
8020
+ }
8021
+ logger.info(`✅ Library '${library}' confirmed to exist.`);
8022
+ }
8023
+ /**
8024
+ * Returns a list of all available semantic versions for a library.
8025
+ */
8026
+ async listVersions(library) {
8027
+ const versions = await this.store.queryUniqueVersions(library);
8028
+ return versions.filter((v) => semver__default.valid(v));
8029
+ }
8030
+ /**
8031
+ * Checks if documents exist for a given library and optional version.
8032
+ * If version is omitted, checks for documents without a specific version.
5278
8033
  */
5279
- logStartupInfo(address) {
5280
- logger.info(`🚀 AppServer available at ${address}`);
5281
- const enabledServices = [];
5282
- if (this.config.enableWebInterface) {
5283
- enabledServices.push(`Web interface: ${address}`);
5284
- }
5285
- if (this.config.enableMcpServer) {
5286
- enabledServices.push(`MCP endpoints: ${address}/mcp, ${address}/sse`);
5287
- }
5288
- if (this.config.enableApiServer) {
5289
- enabledServices.push(`API: ${address}/api`);
5290
- }
5291
- if (this.config.enableWorker) {
5292
- enabledServices.push("Embedded worker: enabled");
5293
- } else if (this.config.externalWorkerUrl) {
5294
- enabledServices.push(`External worker: ${this.config.externalWorkerUrl}`);
8034
+ async exists(library, version2) {
8035
+ const normalizedVersion = this.normalizeVersion(version2);
8036
+ return this.store.checkDocumentExists(library, normalizedVersion);
8037
+ }
8038
+ /**
8039
+ * Finds the most appropriate version of documentation based on the requested version.
8040
+ * When no target version is specified, returns the latest version.
8041
+ *
8042
+ * Version matching behavior:
8043
+ * - Exact versions (e.g., "18.0.0"): Matches that version or any earlier version
8044
+ * - X-Range patterns (e.g., "5.x", "5.2.x"): Matches within the specified range
8045
+ * - "latest" or no version: Returns the latest available version
8046
+ *
8047
+ * For documentation, we prefer matching older versions over no match at all,
8048
+ * since older docs are often still relevant and useful.
8049
+ * Also checks if unversioned documents exist for the library.
8050
+ */
8051
+ async findBestVersion(library, targetVersion) {
8052
+ const libraryAndVersion = `${library}${targetVersion ? `@${targetVersion}` : ""}`;
8053
+ logger.info(`🔍 Finding best version for ${libraryAndVersion}`);
8054
+ const hasUnversioned = await this.store.checkDocumentExists(library, "");
8055
+ const versionStrings = await this.listVersions(library);
8056
+ if (versionStrings.length === 0) {
8057
+ if (hasUnversioned) {
8058
+ logger.info(`ℹ️ Unversioned documents exist for ${library}`);
8059
+ return { bestMatch: null, hasUnversioned: true };
8060
+ }
8061
+ logger.warn(`⚠️ No valid versions found for ${library}`);
8062
+ const allLibraryDetails = await this.store.queryLibraryVersions();
8063
+ const libraryDetails = allLibraryDetails.get(library) ?? [];
8064
+ throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
8065
+ }
8066
+ let bestMatch = null;
8067
+ if (!targetVersion || targetVersion === "latest") {
8068
+ bestMatch = semver__default.maxSatisfying(versionStrings, "*");
8069
+ } else {
8070
+ const versionRegex = /^(\d+)(?:\.(?:x(?:\.x)?|\d+(?:\.(?:x|\d+))?))?$|^$/;
8071
+ if (!versionRegex.test(targetVersion)) {
8072
+ logger.warn(`⚠️ Invalid target version format: ${targetVersion}`);
8073
+ } else {
8074
+ let range = targetVersion;
8075
+ if (!semver__default.validRange(targetVersion)) {
8076
+ range = `~${targetVersion}`;
8077
+ } else if (semver__default.valid(targetVersion)) {
8078
+ range = `${range} || <=${targetVersion}`;
8079
+ }
8080
+ bestMatch = semver__default.maxSatisfying(versionStrings, range);
8081
+ }
5295
8082
  }
5296
- for (const service of enabledServices) {
5297
- logger.info(` • ${service}`);
8083
+ if (bestMatch) {
8084
+ logger.info(`✅ Found best match version ${bestMatch} for ${libraryAndVersion}`);
8085
+ } else {
8086
+ logger.warn(`⚠️ No matching semver version found for ${libraryAndVersion}`);
5298
8087
  }
5299
- }
5300
- }
5301
- async function startAppServer(docService, pipeline, config) {
5302
- const appServer = new AppServer(docService, pipeline, config);
5303
- await appServer.start();
5304
- return appServer;
5305
- }
5306
- async function startStdioServer(tools, readOnly = false) {
5307
- setLogLevel(LogLevel.ERROR);
5308
- const server = createMcpServerInstance(tools, readOnly);
5309
- const transport = new StdioServerTransport();
5310
- await server.connect(transport);
5311
- logger.info("🤖 MCP server listening on stdio");
5312
- return server;
5313
- }
5314
- class StoreError extends Error {
5315
- constructor(message, cause) {
5316
- super(cause ? `${message} caused by ${cause}` : message);
5317
- this.cause = cause;
5318
- this.name = this.constructor.name;
5319
- const causeError = cause instanceof Error ? cause : cause ? new Error(String(cause)) : void 0;
5320
- if (causeError?.stack) {
5321
- this.stack = causeError.stack;
8088
+ if (!bestMatch && !hasUnversioned) {
8089
+ const allLibraryDetails = await this.store.queryLibraryVersions();
8090
+ const libraryDetails = allLibraryDetails.get(library) ?? [];
8091
+ throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
5322
8092
  }
8093
+ return { bestMatch, hasUnversioned };
5323
8094
  }
5324
- }
5325
- class DimensionError extends StoreError {
5326
- constructor(modelName, modelDimension, dbDimension) {
5327
- super(
5328
- `Model "${modelName}" produces ${modelDimension}-dimensional vectors, which exceeds the database's fixed dimension of ${dbDimension}. Please use a model with dimension ≤ ${dbDimension}.`
5329
- );
5330
- this.modelName = modelName;
5331
- this.modelDimension = modelDimension;
5332
- this.dbDimension = dbDimension;
5333
- }
5334
- }
5335
- class ConnectionError extends StoreError {
5336
- }
5337
- const MIGRATIONS_DIR = path.join(getProjectRoot(), "db", "migrations");
5338
- const MIGRATIONS_TABLE = "_schema_migrations";
5339
- function ensureMigrationsTable(db) {
5340
- db.exec(`
5341
- CREATE TABLE IF NOT EXISTS ${MIGRATIONS_TABLE} (
5342
- id TEXT PRIMARY KEY,
5343
- applied_at DATETIME DEFAULT CURRENT_TIMESTAMP
8095
+ /**
8096
+ * Removes all documents for a specific library and optional version.
8097
+ * If version is omitted, removes documents without a specific version.
8098
+ */
8099
+ async removeAllDocuments(library, version2) {
8100
+ const normalizedVersion = this.normalizeVersion(version2);
8101
+ logger.info(
8102
+ `🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
5344
8103
  );
5345
- `);
5346
- }
5347
- function getAppliedMigrations(db) {
5348
- const stmt = db.prepare(`SELECT id FROM ${MIGRATIONS_TABLE}`);
5349
- const rows = stmt.all();
5350
- return new Set(rows.map((row) => row.id));
5351
- }
5352
- async function applyMigrations(db) {
5353
- try {
5354
- db.pragma("journal_mode = OFF");
5355
- db.pragma("synchronous = OFF");
5356
- db.pragma("mmap_size = 268435456");
5357
- db.pragma("cache_size = -64000");
5358
- db.pragma("temp_store = MEMORY");
5359
- logger.debug("Applied performance optimizations for migration");
5360
- } catch (_error) {
5361
- logger.warn("⚠️ Could not apply all performance optimizations for migration");
8104
+ const count = await this.store.deleteDocuments(library, normalizedVersion);
8105
+ logger.info(`🗑️ Deleted ${count} documents`);
5362
8106
  }
5363
- const overallTransaction = db.transaction(() => {
5364
- logger.debug("Checking database migrations...");
5365
- ensureMigrationsTable(db);
5366
- const appliedMigrations = getAppliedMigrations(db);
5367
- if (!fs.existsSync(MIGRATIONS_DIR)) {
5368
- throw new StoreError("Migrations directory not found");
5369
- }
5370
- const migrationFiles = fs.readdirSync(MIGRATIONS_DIR).filter((file) => file.endsWith(".sql")).sort();
5371
- const pendingMigrations = migrationFiles.filter(
5372
- (filename) => !appliedMigrations.has(filename)
8107
+ /**
8108
+ * Completely removes a library version and all associated documents.
8109
+ * Also removes the library if no other versions remain.
8110
+ * @param library Library name
8111
+ * @param version Version string (null/undefined for unversioned)
8112
+ */
8113
+ async removeVersion(library, version2) {
8114
+ const normalizedVersion = this.normalizeVersion(version2);
8115
+ logger.info(`🗑️ Removing version: ${library}@${normalizedVersion || "[no version]"}`);
8116
+ const result = await this.store.removeVersion(library, normalizedVersion, true);
8117
+ logger.info(
8118
+ `🗑️ Removed ${result.documentsDeleted} documents, version: ${result.versionDeleted}, library: ${result.libraryDeleted}`
5373
8119
  );
5374
- if (pendingMigrations.length > 0) {
5375
- logger.info(`🔄 Applying ${pendingMigrations.length} database migration(s)...`);
8120
+ if (result.versionDeleted && result.libraryDeleted) {
8121
+ logger.info(`✅ Completely removed library ${library} (was last version)`);
8122
+ } else if (result.versionDeleted) {
8123
+ logger.info(`✅ Removed version ${library}@${normalizedVersion || "[no version]"}`);
8124
+ } else {
8125
+ logger.warn(
8126
+ `⚠️ Version ${library}@${normalizedVersion || "[no version]"} not found`
8127
+ );
5376
8128
  }
5377
- let appliedCount = 0;
5378
- for (const filename of pendingMigrations) {
5379
- logger.debug(`Applying migration: ${filename}`);
5380
- const filePath = path.join(MIGRATIONS_DIR, filename);
5381
- const sql = fs.readFileSync(filePath, "utf8");
5382
- try {
5383
- db.exec(sql);
5384
- const insertStmt = db.prepare(`INSERT INTO ${MIGRATIONS_TABLE} (id) VALUES (?)`);
5385
- insertStmt.run(filename);
5386
- logger.debug(`Applied migration: ${filename}`);
5387
- appliedCount++;
5388
- } catch (error) {
5389
- logger.error(`❌ Failed to apply migration: ${filename} - ${error}`);
5390
- throw new StoreError(`Migration failed: ${filename}`, error);
5391
- }
8129
+ }
8130
+ /**
8131
+ * Adds a document to the store, splitting it into smaller chunks for better search results.
8132
+ * Uses SemanticMarkdownSplitter to maintain markdown structure and content types during splitting.
8133
+ * Preserves hierarchical structure of documents and distinguishes between text and code segments.
8134
+ * If version is omitted, the document is added without a specific version.
8135
+ */
8136
+ async addDocument(library, version2, document) {
8137
+ const processingStart = performance.now();
8138
+ const normalizedVersion = this.normalizeVersion(version2);
8139
+ const url = document.metadata.url;
8140
+ if (!url || typeof url !== "string" || !url.trim()) {
8141
+ throw new StoreError("Document metadata must include a valid URL");
5392
8142
  }
5393
- if (appliedCount > 0) {
5394
- logger.info(`✅ Successfully applied ${appliedCount} migration(s)`);
5395
- } else {
5396
- logger.debug("Database schema is up to date");
8143
+ logger.info(`📚 Adding document: ${document.metadata.title}`);
8144
+ if (!document.pageContent.trim()) {
8145
+ throw new Error("Document content cannot be empty");
5397
8146
  }
5398
- return appliedCount;
5399
- });
5400
- let retries = 0;
5401
- let appliedMigrationsCount = 0;
5402
- while (true) {
5403
8147
  try {
5404
- appliedMigrationsCount = overallTransaction.immediate();
5405
- logger.debug("Database migrations completed successfully");
5406
- if (appliedMigrationsCount > 0) {
5407
- try {
5408
- logger.debug(
5409
- `Running VACUUM after applying ${appliedMigrationsCount} migration(s)...`
5410
- );
5411
- db.exec("VACUUM");
5412
- logger.debug("Database vacuum completed successfully");
5413
- } catch (error) {
5414
- logger.warn(`⚠️ Could not vacuum database after migrations: ${error}`);
8148
+ const chunks = await this.splitter.splitText(document.pageContent);
8149
+ const splitDocs = chunks.map((chunk) => ({
8150
+ pageContent: chunk.content,
8151
+ metadata: {
8152
+ ...document.metadata,
8153
+ level: chunk.section.level,
8154
+ path: chunk.section.path
5415
8155
  }
5416
- } else {
5417
- logger.debug("Skipping VACUUM - no migrations were applied");
5418
- }
5419
- break;
8156
+ }));
8157
+ logger.info(`✂️ Split document into ${splitDocs.length} chunks`);
8158
+ await this.store.addDocuments(library, normalizedVersion, splitDocs);
8159
+ const processingTime = performance.now() - processingStart;
8160
+ analytics.track(TelemetryEvent.DOCUMENT_PROCESSED, {
8161
+ // Content characteristics (privacy-safe)
8162
+ mimeType: document.metadata.mimeType,
8163
+ contentSizeBytes: document.pageContent.length,
8164
+ // Processing metrics
8165
+ processingTimeMs: Math.round(processingTime),
8166
+ chunksCreated: splitDocs.length,
8167
+ // Document characteristics
8168
+ hasTitle: !!document.metadata.title,
8169
+ hasDescription: !!document.metadata.description,
8170
+ urlDomain: extractHostname(url),
8171
+ depth: document.metadata.depth,
8172
+ // Library context
8173
+ library,
8174
+ libraryVersion: normalizedVersion || null,
8175
+ // Processing efficiency
8176
+ avgChunkSizeBytes: Math.round(document.pageContent.length / splitDocs.length),
8177
+ processingSpeedKbPerSec: Math.round(
8178
+ document.pageContent.length / 1024 / (processingTime / 1e3)
8179
+ )
8180
+ });
5420
8181
  } catch (error) {
5421
- if (error?.code === "SQLITE_BUSY" && retries < MIGRATION_MAX_RETRIES) {
5422
- retries++;
5423
- logger.warn(
5424
- `⚠️ Migrations busy (SQLITE_BUSY), retrying attempt ${retries}/${MIGRATION_MAX_RETRIES} in ${MIGRATION_RETRY_DELAY_MS}ms...`
5425
- );
5426
- await new Promise((resolve) => setTimeout(resolve, MIGRATION_RETRY_DELAY_MS));
5427
- } else {
5428
- if (error?.code === "SQLITE_BUSY") {
5429
- logger.error(
5430
- `❌ Migrations still busy after ${MIGRATION_MAX_RETRIES} retries. Giving up: ${error}`
5431
- );
5432
- }
5433
- if (error instanceof StoreError) {
5434
- throw error;
5435
- }
5436
- throw new StoreError("Failed during migration process", error);
8182
+ const processingTime = performance.now() - processingStart;
8183
+ if (error instanceof Error) {
8184
+ analytics.captureException(error, {
8185
+ mimeType: document.metadata.mimeType,
8186
+ contentSizeBytes: document.pageContent.length,
8187
+ processingTimeMs: Math.round(processingTime),
8188
+ library,
8189
+ libraryVersion: normalizedVersion || null,
8190
+ context: "document_processing",
8191
+ component: DocumentManagementService.constructor.name
8192
+ });
5437
8193
  }
8194
+ throw error;
5438
8195
  }
5439
8196
  }
5440
- try {
5441
- db.pragma("journal_mode = WAL");
5442
- db.pragma("wal_autocheckpoint = 1000");
5443
- db.pragma("busy_timeout = 30000");
5444
- db.pragma("foreign_keys = ON");
5445
- db.pragma("synchronous = NORMAL");
5446
- logger.debug(
5447
- "Applied production database configuration (WAL mode, autocheckpoint, foreign keys, busy timeout)"
8197
+ /**
8198
+ * Searches for documentation content across versions.
8199
+ * Uses hybrid search (vector + FTS).
8200
+ * If version is omitted, searches documents without a specific version.
8201
+ */
8202
+ async searchStore(library, version2, query, limit = 5) {
8203
+ const normalizedVersion = this.normalizeVersion(version2);
8204
+ return this.documentRetriever.search(library, normalizedVersion, query, limit);
8205
+ }
8206
+ // Deprecated simple listing removed: enriched listLibraries() is canonical
8207
+ /**
8208
+ * Ensures a library and version exist in the database and returns the version ID.
8209
+ * Creates the library and version records if they don't exist.
8210
+ */
8211
+ async ensureLibraryAndVersion(library, version2) {
8212
+ const normalizedLibrary = library.toLowerCase();
8213
+ const normalizedVersion = this.normalizeVersion(version2);
8214
+ const { versionId } = await this.store.resolveLibraryAndVersionIds(
8215
+ normalizedLibrary,
8216
+ normalizedVersion
5448
8217
  );
5449
- } catch (_error) {
5450
- logger.warn("⚠️ Could not apply all production database settings");
8218
+ return versionId;
5451
8219
  }
5452
8220
  }
5453
8221
  async function createDocumentManagement(options = {}) {
5454
8222
  if (options.serverUrl) {
5455
- const { DocumentManagementClient } = await import("./DocumentManagementClient-CAFdDwTu.js");
5456
8223
  const client = new DocumentManagementClient(options.serverUrl);
5457
8224
  await client.initialize();
5458
8225
  return client;
5459
8226
  }
5460
- const service = new (await import("./DocumentManagementService-C1xAzouZ.js")).DocumentManagementService();
8227
+ const service = new DocumentManagementService(options.embeddingConfig);
5461
8228
  await service.initialize();
5462
8229
  return service;
5463
8230
  }
5464
- async function createLocalDocumentManagement() {
5465
- const service = new (await import("./DocumentManagementService-C1xAzouZ.js")).DocumentManagementService();
8231
+ async function createLocalDocumentManagement(embeddingConfig) {
8232
+ const service = new DocumentManagementService(embeddingConfig);
5466
8233
  await service.initialize();
5467
8234
  return service;
5468
8235
  }
@@ -6992,6 +9759,17 @@ function warnHttpUsage(authConfig, port) {
6992
9759
  );
6993
9760
  }
6994
9761
  }
9762
+ function resolveEmbeddingContext(cliArgs) {
9763
+ try {
9764
+ const modelSpec = cliArgs?.embeddingModel || process.env.DOCS_MCP_EMBEDDING_MODEL;
9765
+ logger.debug("Resolving embedding configuration");
9766
+ const config = EmbeddingConfig.parseEmbeddingConfig(modelSpec);
9767
+ return config;
9768
+ } catch (error) {
9769
+ logger.debug(`Failed to resolve embedding configuration: ${error}`);
9770
+ return null;
9771
+ }
9772
+ }
6995
9773
  function createDefaultAction(program) {
6996
9774
  return program.addOption(
6997
9775
  new Option("--protocol <protocol>", "Protocol for MCP server").choices(["auto", "stdio", "http"]).default("auto")
@@ -7031,7 +9809,8 @@ function createDefaultAction(program) {
7031
9809
  warnHttpUsage(authConfig, port);
7032
9810
  }
7033
9811
  ensurePlaywrightBrowsersInstalled();
7034
- const docService = await createLocalDocumentManagement();
9812
+ const embeddingConfig = resolveEmbeddingContext();
9813
+ const docService = await createLocalDocumentManagement(embeddingConfig);
7035
9814
  const pipelineOptions = {
7036
9815
  recoverJobs: options.resume || false,
7037
9816
  // Use --resume flag for job recovery
@@ -7072,24 +9851,12 @@ async function fetchUrlAction(url, options, command) {
7072
9851
  setupLogging(globalOptions);
7073
9852
  const headers = parseHeaders(options.header);
7074
9853
  const fetchUrlTool = new FetchUrlTool(new HttpFetcher(), new FileFetcher());
7075
- const content = await trackTool(
7076
- "fetch_url",
7077
- () => fetchUrlTool.execute({
7078
- url,
7079
- followRedirects: options.followRedirects,
7080
- scrapeMode: options.scrapeMode,
7081
- headers: Object.keys(headers).length > 0 ? headers : void 0
7082
- }),
7083
- (content2) => ({
7084
- url_protocol: extractProtocol(url),
7085
- // Safe: only protocol, not full URL
7086
- follow_redirects: options.followRedirects,
7087
- scrape_mode: options.scrapeMode,
7088
- has_custom_headers: Object.keys(headers).length > 0,
7089
- content_length: content2.length,
7090
- cli_flags: extractCliFlags(process.argv)
7091
- })
7092
- );
9854
+ const content = await fetchUrlTool.execute({
9855
+ url,
9856
+ followRedirects: options.followRedirects,
9857
+ scrapeMode: options.scrapeMode,
9858
+ headers: Object.keys(headers).length > 0 ? headers : void 0
9859
+ });
7093
9860
  console.log(content);
7094
9861
  }
7095
9862
  function createFetchUrlCommand(program) {
@@ -7121,25 +9888,16 @@ async function findVersionAction(library, options, command) {
7121
9888
  const globalOptions = command.parent?.opts() || {};
7122
9889
  setupLogging(globalOptions);
7123
9890
  const serverUrl = options.serverUrl;
7124
- const docService = await createDocumentManagement({ serverUrl });
9891
+ const docService = await createDocumentManagement({
9892
+ serverUrl,
9893
+ embeddingConfig: serverUrl ? void 0 : null
9894
+ });
7125
9895
  try {
7126
9896
  const findVersionTool = new FindVersionTool(docService);
7127
- const versionInfo = await trackTool(
7128
- "find_version",
7129
- () => findVersionTool.execute({
7130
- library,
7131
- targetVersion: options.version
7132
- }),
7133
- (versionInfo2) => ({
7134
- library,
7135
- // Safe: library names are public
7136
- has_target_version: !!options.version,
7137
- result_type: typeof versionInfo2,
7138
- // 'string'
7139
- using_remote_server: !!serverUrl,
7140
- cli_flags: extractCliFlags(process.argv)
7141
- })
7142
- );
9897
+ const versionInfo = await findVersionTool.execute({
9898
+ library,
9899
+ targetVersion: options.version
9900
+ });
7143
9901
  if (!versionInfo) throw new Error("Failed to get version information");
7144
9902
  console.log(versionInfo);
7145
9903
  } finally {
@@ -7156,18 +9914,13 @@ async function listAction(options, command) {
7156
9914
  const globalOptions = command.parent?.opts() || {};
7157
9915
  setupLogging(globalOptions);
7158
9916
  const { serverUrl } = options;
7159
- const docService = await createDocumentManagement({ serverUrl });
9917
+ const docService = await createDocumentManagement({
9918
+ serverUrl,
9919
+ embeddingConfig: serverUrl ? void 0 : null
9920
+ });
7160
9921
  try {
7161
9922
  const listLibrariesTool = new ListLibrariesTool(docService);
7162
- const result = await trackTool(
7163
- "list_libraries",
7164
- () => listLibrariesTool.execute(),
7165
- (result2) => ({
7166
- library_count: result2.libraries.length,
7167
- using_remote_server: !!serverUrl,
7168
- cli_flags: extractCliFlags(process.argv)
7169
- })
7170
- );
9923
+ const result = await listLibrariesTool.execute();
7171
9924
  console.log(formatOutput(result.libraries));
7172
9925
  } finally {
7173
9926
  await docService.shutdown();
@@ -7220,8 +9973,16 @@ function createMcpCommand(program) {
7220
9973
  validateAuthConfig(authConfig);
7221
9974
  }
7222
9975
  try {
9976
+ const embeddingConfig = resolveEmbeddingContext();
9977
+ if (!serverUrl && !embeddingConfig) {
9978
+ logger.error(
9979
+ "❌ Embedding configuration is required for local mode. Configure an embedding provider with CLI options or environment variables."
9980
+ );
9981
+ process.exit(1);
9982
+ }
7223
9983
  const docService = await createDocumentManagement({
7224
- serverUrl
9984
+ serverUrl,
9985
+ embeddingConfig
7225
9986
  });
7226
9987
  const pipelineOptions = {
7227
9988
  recoverJobs: false,
@@ -7271,20 +10032,13 @@ async function removeAction(library, options, command) {
7271
10032
  const globalOptions = command.parent?.opts() || {};
7272
10033
  setupLogging(globalOptions);
7273
10034
  const serverUrl = options.serverUrl;
7274
- const docService = await createDocumentManagement({ serverUrl });
10035
+ const docService = await createDocumentManagement({
10036
+ serverUrl,
10037
+ embeddingConfig: serverUrl ? void 0 : null
10038
+ });
7275
10039
  const { version: version2 } = options;
7276
10040
  try {
7277
- await trackTool(
7278
- "remove_documents",
7279
- () => docService.removeAllDocuments(library, version2),
7280
- () => ({
7281
- library,
7282
- // Safe: library names are public
7283
- has_version: !!version2,
7284
- using_remote_server: !!serverUrl,
7285
- cli_flags: extractCliFlags(process.argv)
7286
- })
7287
- );
10041
+ await docService.removeAllDocuments(library, version2);
7288
10042
  console.log(`✅ Successfully removed ${library}${version2 ? `@${version2}` : ""}.`);
7289
10043
  } catch (error) {
7290
10044
  console.error(
@@ -7309,7 +10063,16 @@ async function scrapeAction(library, url, options, command) {
7309
10063
  const globalOptions = command.parent?.opts() || {};
7310
10064
  setupLogging(globalOptions);
7311
10065
  const serverUrl = options.serverUrl;
7312
- const docService = await createDocumentManagement({ serverUrl });
10066
+ const embeddingConfig = resolveEmbeddingContext();
10067
+ if (!serverUrl && !embeddingConfig) {
10068
+ throw new Error(
10069
+ "Embedding configuration is required for local scraping. Please set DOCS_MCP_EMBEDDING_MODEL environment variable or use --server-url for remote execution."
10070
+ );
10071
+ }
10072
+ const docService = await createDocumentManagement({
10073
+ serverUrl,
10074
+ embeddingConfig
10075
+ });
7313
10076
  let pipeline = null;
7314
10077
  try {
7315
10078
  const pipelineOptions = {
@@ -7324,48 +10087,23 @@ async function scrapeAction(library, url, options, command) {
7324
10087
  await pipeline.start();
7325
10088
  const scrapeTool = new ScrapeTool(pipeline);
7326
10089
  const headers = parseHeaders(options.header);
7327
- const result = await trackTool(
7328
- "scrape_docs",
7329
- () => scrapeTool.execute({
7330
- url,
7331
- library,
7332
- version: options.version,
7333
- options: {
7334
- maxPages: Number.parseInt(options.maxPages, 10),
7335
- maxDepth: Number.parseInt(options.maxDepth, 10),
7336
- maxConcurrency: Number.parseInt(options.maxConcurrency, 10),
7337
- ignoreErrors: options.ignoreErrors,
7338
- scope: options.scope,
7339
- followRedirects: options.followRedirects,
7340
- scrapeMode: options.scrapeMode,
7341
- includePatterns: Array.isArray(options.includePattern) && options.includePattern.length > 0 ? options.includePattern : void 0,
7342
- excludePatterns: Array.isArray(options.excludePattern) && options.excludePattern.length > 0 ? options.excludePattern : void 0,
7343
- headers: Object.keys(headers).length > 0 ? headers : void 0
7344
- }
7345
- }),
7346
- (result2) => ({
7347
- library,
7348
- // Safe: library names are public
7349
- url_protocol: extractProtocol(url),
7350
- // Safe: only protocol, not full URL
7351
- max_pages: Number.parseInt(options.maxPages, 10),
7352
- max_depth: Number.parseInt(options.maxDepth, 10),
7353
- max_concurrency: Number.parseInt(options.maxConcurrency, 10),
7354
- has_version: !!options.version,
10090
+ const result = await scrapeTool.execute({
10091
+ url,
10092
+ library,
10093
+ version: options.version,
10094
+ options: {
10095
+ maxPages: Number.parseInt(options.maxPages, 10),
10096
+ maxDepth: Number.parseInt(options.maxDepth, 10),
10097
+ maxConcurrency: Number.parseInt(options.maxConcurrency, 10),
10098
+ ignoreErrors: options.ignoreErrors,
7355
10099
  scope: options.scope,
7356
- scrape_mode: options.scrapeMode,
7357
- ignore_errors: options.ignoreErrors,
7358
- follow_redirects: options.followRedirects,
7359
- has_include_patterns: Array.isArray(options.includePattern) && options.includePattern.length > 0,
7360
- has_exclude_patterns: Array.isArray(options.excludePattern) && options.excludePattern.length > 0,
7361
- has_custom_headers: Object.keys(headers).length > 0,
7362
- using_remote_server: !!serverUrl,
7363
- cli_flags: extractCliFlags(process.argv),
7364
- is_async_job: !("pagesScraped" in result2),
7365
- // Pipeline mode vs direct mode
7366
- pages_scraped: "pagesScraped" in result2 ? result2.pagesScraped : void 0
7367
- })
7368
- );
10100
+ followRedirects: options.followRedirects,
10101
+ scrapeMode: options.scrapeMode,
10102
+ includePatterns: Array.isArray(options.includePattern) && options.includePattern.length > 0 ? options.includePattern : void 0,
10103
+ excludePatterns: Array.isArray(options.excludePattern) && options.excludePattern.length > 0 ? options.excludePattern : void 0,
10104
+ headers: Object.keys(headers).length > 0 ? headers : void 0
10105
+ }
10106
+ });
7369
10107
  if ("pagesScraped" in result) {
7370
10108
  console.log(`✅ Successfully scraped ${result.pagesScraped} pages`);
7371
10109
  } else {
@@ -7444,31 +10182,25 @@ async function searchAction(library, query, options, command) {
7444
10182
  const globalOptions = command.parent?.opts() || {};
7445
10183
  setupLogging(globalOptions);
7446
10184
  const serverUrl = options.serverUrl;
7447
- const docService = await createDocumentManagement({ serverUrl });
10185
+ const embeddingConfig = resolveEmbeddingContext();
10186
+ if (!serverUrl && !embeddingConfig) {
10187
+ throw new Error(
10188
+ "Embedding configuration is required for local search. Please set DOCS_MCP_EMBEDDING_MODEL environment variable or use --server-url for remote execution."
10189
+ );
10190
+ }
10191
+ const docService = await createDocumentManagement({
10192
+ serverUrl,
10193
+ embeddingConfig
10194
+ });
7448
10195
  try {
7449
10196
  const searchTool = new SearchTool(docService);
7450
- const result = await trackTool(
7451
- "search_docs",
7452
- () => searchTool.execute({
7453
- library,
7454
- version: options.version,
7455
- query,
7456
- limit: Number.parseInt(options.limit, 10),
7457
- exactMatch: options.exactMatch
7458
- }),
7459
- (result2) => ({
7460
- library,
7461
- // Safe: library names are public
7462
- query_analysis: analyzeSearchQuery(query),
7463
- // Analyzed, not raw query
7464
- result_count: result2.results.length,
7465
- limit_used: Number.parseInt(options.limit, 10),
7466
- has_version_filter: !!options.version,
7467
- exact_match: options.exactMatch,
7468
- using_remote_server: !!serverUrl,
7469
- cli_flags: extractCliFlags(process.argv)
7470
- })
7471
- );
10197
+ const result = await searchTool.execute({
10198
+ library,
10199
+ version: options.version,
10200
+ query,
10201
+ limit: Number.parseInt(options.limit, 10),
10202
+ exactMatch: options.exactMatch
10203
+ });
7472
10204
  console.log(formatOutput(result.results));
7473
10205
  } finally {
7474
10206
  await docService.shutdown();
@@ -7504,8 +10236,16 @@ function createWebCommand(program) {
7504
10236
  const serverUrl = cmdOptions.serverUrl;
7505
10237
  setupLogging(globalOptions);
7506
10238
  try {
10239
+ const embeddingConfig = resolveEmbeddingContext();
10240
+ if (!serverUrl && !embeddingConfig) {
10241
+ logger.error(
10242
+ "❌ Embedding configuration is required for local mode. Configure an embedding provider with CLI options or environment variables."
10243
+ );
10244
+ process.exit(1);
10245
+ }
7507
10246
  const docService = await createDocumentManagement({
7508
- serverUrl
10247
+ serverUrl,
10248
+ embeddingConfig
7509
10249
  });
7510
10250
  const pipelineOptions = {
7511
10251
  recoverJobs: false,
@@ -7554,7 +10294,8 @@ function createWorkerCommand(program) {
7554
10294
  try {
7555
10295
  logger.info(`🚀 Starting external pipeline worker on port ${port}`);
7556
10296
  ensurePlaywrightBrowsersInstalled();
7557
- const docService = await createLocalDocumentManagement();
10297
+ const embeddingConfig = resolveEmbeddingContext();
10298
+ const docService = await createLocalDocumentManagement(embeddingConfig);
7558
10299
  const pipelineOptions = {
7559
10300
  recoverJobs: cmdOptions.resume,
7560
10301
  // Use the resume option
@@ -7582,7 +10323,7 @@ function createCliProgram() {
7582
10323
  program.name("docs-mcp-server").description("Unified CLI, MCP Server, and Web Interface for Docs MCP Server.").version(packageJson.version).addOption(
7583
10324
  new Option("--verbose", "Enable verbose (debug) logging").conflicts("silent")
7584
10325
  ).addOption(new Option("--silent", "Disable all logging except errors")).addOption(new Option("--no-telemetry", "Disable telemetry collection")).enablePositionalOptions().allowExcessArguments(false).showHelpAfterError(true);
7585
- program.hook("preAction", (thisCommand, actionCommand) => {
10326
+ program.hook("preAction", async (thisCommand, actionCommand) => {
7586
10327
  const globalOptions = thisCommand.opts();
7587
10328
  if (globalOptions.silent) setLogLevel(LogLevel.ERROR);
7588
10329
  else if (globalOptions.verbose) setLogLevel(LogLevel.DEBUG);
@@ -7673,7 +10414,11 @@ async function runCli() {
7673
10414
  });
7674
10415
  await program.parseAsync(process.argv);
7675
10416
  } catch (error) {
7676
- logger.error(`❌ Error in CLI: ${error}`);
10417
+ if (error instanceof ModelConfigurationError || error instanceof UnsupportedProviderError) {
10418
+ logger.error(error.message);
10419
+ } else {
10420
+ logger.error(`❌ Error in CLI: ${error}`);
10421
+ }
7677
10422
  if (!isShuttingDown) {
7678
10423
  isShuttingDown = true;
7679
10424
  const shutdownPromises = [];
@@ -7723,27 +10468,4 @@ runCli().catch((error) => {
7723
10468
  console.error(`🔥 Fatal error in main execution: ${error}`);
7724
10469
  process.exit(1);
7725
10470
  });
7726
- export {
7727
- ConnectionError as C,
7728
- DimensionError as D,
7729
- EMBEDDING_BATCH_CHARS as E,
7730
- LibraryNotFoundError as L,
7731
- StoreError as S,
7732
- TelemetryEvent as T,
7733
- VECTOR_DIMENSION as V,
7734
- applyMigrations as a,
7735
- EMBEDDING_BATCH_SIZE as b,
7736
- createJSDOM as c,
7737
- denormalizeVersionName as d,
7738
- SPLITTER_PREFERRED_CHUNK_SIZE as e,
7739
- SPLITTER_MAX_CHUNK_SIZE as f,
7740
- getProjectRoot as g,
7741
- VersionNotFoundError as h,
7742
- analytics as i,
7743
- extractHostname as j,
7744
- SPLITTER_MIN_CHUNK_SIZE as k,
7745
- logger as l,
7746
- mapDbDocumentToDocument as m,
7747
- normalizeVersionName as n
7748
- };
7749
10471
  //# sourceMappingURL=index.js.map