@arabold/docs-mcp-server 1.18.0 → 1.20.0

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
package/dist/index.js CHANGED
@@ -1,5 +1,5 @@
  import "dotenv/config";
- import { Command } from "commander";
+ import { Option, Command } from "commander";
  import path from "node:path";
  import formBody from "@fastify/formbody";
  import fastifyStatic from "@fastify/static";
@@ -21,6 +21,9 @@ import fs from "node:fs/promises";
  import * as mime from "mime-types";
  import axios from "axios";
  import { HeaderGenerator } from "header-generator";
+ import { initTRPC } from "@trpc/server";
+ import { fastifyTRPCPlugin } from "@trpc/server/adapters/fastify";
+ import { z as z$1 } from "zod";
  import { jsxs, jsx, Fragment } from "@kitajs/html/jsx-runtime";
  import fs$1, { readFileSync, existsSync } from "node:fs";
  import { unified } from "unified";
@@ -30,15 +33,16 @@ import remarkHtml from "remark-html";
  import DOMPurify from "dompurify";
  import { fileURLToPath, URL as URL$1 } from "node:url";
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
+ import { createTRPCProxyClient, httpBatchLink } from "@trpc/client";
+ import "env-paths";
+ import "fuse.js";
+ import "langchain/text_splitter";
+ import "better-sqlite3";
+ import "sqlite-vec";
  import { execSync } from "node:child_process";
  import { v4 } from "uuid";
- import psl from "psl";
+ import "psl";
  import { minimatch } from "minimatch";
- import envPaths from "env-paths";
- import Fuse from "fuse.js";
- import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
- import Database from "better-sqlite3";
- import * as sqliteVec from "sqlite-vec";
  const LogLevel = {
  ERROR: 0,
  WARN: 1,
@@ -97,7 +101,7 @@ const logger = {
  }
  }
  };
- const version = "1.17.0";
+ const version = "1.19.0";
  const packageJson = {
  version
  };
@@ -324,14 +328,43 @@ class HtmlLinkExtractorMiddleware {
  return;
  }
  try {
+ let docBase = context.source;
+ try {
+ const baseEl = $("base[href]").first();
+ const rawBase = baseEl.attr("href");
+ if (rawBase && rawBase.trim() !== "") {
+ try {
+ const trimmed = rawBase.trim();
+ const candidate = new URL(trimmed, context.source);
+ const hasScheme = /^[a-zA-Z][a-zA-Z0-9+.-]*:/.test(trimmed);
+ const protocolRelative = trimmed.startsWith("//");
+ const firstSlash = trimmed.indexOf("/");
+ const firstColon = trimmed.indexOf(":");
+ const colonBeforeSlash = firstColon !== -1 && (firstSlash === -1 || firstColon < firstSlash);
+ const suspiciousColon = colonBeforeSlash && !hasScheme && !protocolRelative;
+ if (suspiciousColon || trimmed.startsWith(":")) {
+ logger.debug(
+ `Ignoring suspicious <base href> value (colon misuse): ${rawBase}`
+ );
+ } else {
+ docBase = candidate.href;
+ }
+ } catch {
+ logger.debug(`Ignoring invalid <base href> value: ${rawBase}`);
+ }
+ }
+ } catch {
+ }
  const linkElements = $("a[href]");
- logger.debug(`Found ${linkElements.length} potential links in ${context.source}`);
+ logger.debug(
+ `Found ${linkElements.length} potential links in ${context.source} (base=${docBase})`
+ );
  const extractedLinks = [];
  linkElements.each((_index, element) => {
  const href = $(element).attr("href");
  if (href && href.trim() !== "") {
  try {
- const urlObj = new URL(href, context.source);
+ const urlObj = new URL(href, docBase);
  if (!["http:", "https:", "file:"].includes(urlObj.protocol)) {
  logger.debug(`Ignoring link with invalid protocol: ${href}`);
  return;
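
The `docBase` logic above mirrors browser behavior for `<base href>`: once a document declares a base, relative links resolve against it rather than against the page's own URL. A minimal sketch of the resolution rule using the standard WHATWG `URL` API (sample URLs are illustrative):

```ts
// A page under /docs/guide/ that declares <base href="/docs/v2/">:
const pageUrl = "https://example.com/docs/guide/intro.html";
const docBase = new URL("/docs/v2/", pageUrl).href; // "https://example.com/docs/v2/"

// Without the base, a relative link resolves against the page URL...
new URL("setup.html", pageUrl).href; // "https://example.com/docs/guide/setup.html"
// ...with it, against the declared base, which is what the extractor now does.
new URL("setup.html", docBase).href; // "https://example.com/docs/v2/setup.html"
```

The colon heuristics reject suspicious values such as `8080:/docs` or a leading `:path` rather than letting them silently rebase every link on the page.
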
@@ -405,6 +438,7 @@ const SPLITTER_MIN_CHUNK_SIZE = 500;
  const SPLITTER_PREFERRED_CHUNK_SIZE = 1500;
  const SPLITTER_MAX_CHUNK_SIZE = 5e3;
  const EMBEDDING_BATCH_SIZE = 100;
+ const EMBEDDING_BATCH_CHARS = 5e4;
  const MIGRATION_MAX_RETRIES = 5;
  const MIGRATION_RETRY_DELAY_MS = 300;
  var ScrapeMode = /* @__PURE__ */ ((ScrapeMode2) => {
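
`EMBEDDING_BATCH_CHARS` lands next to the existing `EMBEDDING_BATCH_SIZE`, which suggests embedding batches are now capped by total character volume as well as item count, keeping request payloads bounded when individual chunks are long. A hypothetical helper illustrating such a dual cap (the `makeBatches` name and shape are mine, not the package's):

```ts
// Split texts into batches capped at both an item count and a character budget.
function makeBatches(texts: string[], maxItems = 100, maxChars = 50_000): string[][] {
  const batches: string[][] = [];
  let batch: string[] = [];
  let chars = 0;
  for (const text of texts) {
    // Start a new batch when adding this text would breach either cap.
    if (batch.length > 0 && (batch.length >= maxItems || chars + text.length > maxChars)) {
      batches.push(batch);
      batch = [];
      chars = 0;
    }
    batch.push(text);
    chars += text.length;
  }
  if (batch.length > 0) batches.push(batch);
  return batches;
}
```
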
@@ -1319,8 +1353,15 @@ class ListLibrariesTool {
  const rawLibraries = await this.docService.listLibraries();
  const libraries = rawLibraries.map(({ library, versions }) => ({
  name: library,
- versions
- // Directly assign the detailed versions array
+ versions: versions.map((v) => ({
+ version: v.ref.version,
+ documentCount: v.counts.documents,
+ uniqueUrlCount: v.counts.uniqueUrls,
+ indexedAt: v.indexedAt,
+ status: v.status,
+ ...v.progress ? { progress: v.progress } : void 0,
+ sourceUrl: v.sourceUrl
+ }))
  }));
  return { libraries };
  }
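
Instead of forwarding internal version records verbatim, the tool now flattens them into a stable summary. Read off the mapping above, each entry has roughly this shape (field types inferred from usage, not taken from the package's type declarations):

```ts
interface LibraryVersionSummary {
  version: string;          // v.ref.version; empty string for unversioned docs
  documentCount: number;    // v.counts.documents
  uniqueUrlCount: number;   // v.counts.uniqueUrls
  indexedAt: string | null; // timestamp of the last successful index
  status: string;           // pipeline status of this version
  progress?: unknown;       // only present while a job is running
  sourceUrl?: string;       // where the documentation was scraped from
}
```
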
@@ -1400,7 +1441,8 @@ class ScrapeTool {
  }
  internalVersion = internalVersion.toLowerCase();
  const pipeline = this.pipeline;
- const jobId = await pipeline.enqueueJob(library, internalVersion, {
+ const enqueueVersion = internalVersion === "" ? null : internalVersion;
+ const jobId = await pipeline.enqueueJob(library, enqueueVersion, {
  url,
  library,
  version: internalVersion,
@@ -1447,13 +1489,13 @@ class SearchTool {
  await this.docService.validateLibraryExists(library);
  const allLibraries = await this.docService.listLibraries();
  const libraryInfo = allLibraries.find((lib) => lib.library === library);
- const detailedVersions = libraryInfo ? libraryInfo.versions : [];
- throw new VersionNotFoundError(
- library,
- "latest",
- // Or perhaps the original 'version' if it wasn't 'latest'? Check logic.
- detailedVersions
- );
+ const detailedVersions = libraryInfo ? libraryInfo.versions.map((v) => ({
+ version: v.ref.version,
+ documentCount: v.counts.documents,
+ uniqueUrlCount: v.counts.uniqueUrls,
+ indexedAt: v.indexedAt
+ })) : [];
+ throw new VersionNotFoundError(library, version2 ?? "latest", detailedVersions);
  }
  const resolvedVersion = version2 || "latest";
  logger.info(
@@ -2081,12 +2123,18 @@ class HttpFetcher {
  } else {
  content = Buffer.from(response.data);
  }
+ const finalUrl = (
+ // Node follow-redirects style
+ response.request?.res?.responseUrl || // Some adapters may expose directly
+ response.request?.responseUrl || // Fallback to axios recorded config URL
+ response.config?.url || source
+ );
  return {
  content,
  mimeType,
  charset,
  encoding: contentEncoding,
- source
+ source: finalUrl
  };
  } catch (error) {
  const axiosError = error;
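
Capturing `finalUrl` matters because `response.config.url` is the URL that was requested, not the one the fetch landed on. On Node, axios delegates redirects to the `follow-redirects` package, which records the landing URL on the underlying response, and that is what the first branch reads. A minimal sketch of the same lookup chain (assuming the Node adapter):

```ts
import axios from "axios";

const response = await axios.get("https://example.com/docs"); // may 301 to /docs/
// follow-redirects exposes the post-redirect URL; fall back to the requested one.
const finalUrl: string =
  (response.request as any)?.res?.responseUrl ?? response.config.url ?? "";
```

The scraper uses this to rebase its crawl scope after redirects (see the `canonicalBaseUrl` changes further down).
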
@@ -2224,134 +2272,229 @@ async function cleanupMcpService(mcpServer) {
  throw error;
  }
  }
- class PipelineApiService {
- pipeline;
- constructor(pipeline) {
- this.pipeline = pipeline;
- }
- /**
- * Registers all pipeline API routes with the given Fastify instance.
- */
- async registerRoutes(server) {
- server.get("/api/health", async (_request, reply) => {
- return reply.send({ status: "ok", timestamp: (/* @__PURE__ */ new Date()).toISOString() });
- });
- server.get(
- "/api/health/detailed",
- async (_request, reply) => {
- try {
- const jobs = await this.pipeline.getJobs();
- return reply.send({
- status: "ok",
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
- jobCounts: {
- total: jobs.length,
- queued: jobs.filter((j) => j.status === PipelineJobStatus.QUEUED).length,
- running: jobs.filter((j) => j.status === PipelineJobStatus.RUNNING).length,
- completed: jobs.filter((j) => j.status === PipelineJobStatus.COMPLETED).length,
- failed: jobs.filter((j) => j.status === PipelineJobStatus.FAILED).length,
- cancelled: jobs.filter((j) => j.status === PipelineJobStatus.CANCELLED).length
- }
- });
- } catch (error) {
- return reply.status(500).send({
- status: "error",
- error: error instanceof Error ? error.message : String(error)
- });
- }
- }
- );
- server.post(
- "/api/jobs",
- async (request, reply) => {
- try {
- const { library, version: version2, options } = request.body;
- if (!library || !options) {
- return reply.status(400).send({ error: "Missing required fields: library, options" });
- }
- const jobId = await this.pipeline.enqueueJob(library, version2, options);
- logger.debug(
- `API: Enqueued job ${jobId} for ${library}@${version2 || "unversioned"}`
- );
- return reply.send({ jobId });
- } catch (error) {
- logger.error(`API: Failed to enqueue job: ${error}`);
- return reply.status(500).send({
- error: error instanceof Error ? error.message : String(error)
- });
- }
- }
- );
- server.get(
- "/api/jobs",
- async (request, reply) => {
- try {
- const { status } = request.query;
- const jobs = await this.pipeline.getJobs(status);
- return reply.send({ jobs });
- } catch (error) {
- logger.error(`API: Failed to get jobs: ${error}`);
- return reply.status(500).send({
- error: error instanceof Error ? error.message : String(error)
- });
- }
+ const t$1 = initTRPC.context().create();
+ const nonEmptyTrimmed = z$1.string().transform((s) => s.trim()).refine((s) => s.length > 0, "must not be empty");
+ const optionalTrimmed = z$1.preprocess(
+ (v) => typeof v === "string" ? v.trim() : v,
+ z$1.string().min(1).optional().nullable()
+ );
+ const enqueueInput = z$1.object({
+ library: nonEmptyTrimmed,
+ version: optionalTrimmed,
+ options: z$1.custom()
+ });
+ const jobIdInput = z$1.object({ id: z$1.string().min(1) });
+ const getJobsInput = z$1.object({
+ status: z$1.nativeEnum(PipelineJobStatus).optional()
+ });
+ function createPipelineRouter(trpc) {
+ const tt = trpc;
+ return tt.router({
+ enqueueJob: tt.procedure.input(enqueueInput).mutation(
+ async ({
+ ctx,
+ input
+ }) => {
+ const jobId = await ctx.pipeline.enqueueJob(
+ input.library,
+ input.version ?? null,
+ input.options
+ );
+ return { jobId };
+ }
+ ),
+ getJob: tt.procedure.input(jobIdInput).query(
+ async ({
+ ctx,
+ input
+ }) => {
+ return ctx.pipeline.getJob(input.id);
+ }
+ ),
+ getJobs: tt.procedure.input(getJobsInput.optional()).query(
+ async ({
+ ctx,
+ input
+ }) => {
+ const jobs = await ctx.pipeline.getJobs(input?.status);
+ return { jobs };
+ }
+ ),
+ cancelJob: tt.procedure.input(jobIdInput).mutation(
+ async ({
+ ctx,
+ input
+ }) => {
+ await ctx.pipeline.cancelJob(input.id);
+ return { success: true };
+ }
+ ),
+ clearCompletedJobs: tt.procedure.mutation(
+ async ({ ctx }) => {
+ const count = await ctx.pipeline.clearCompletedJobs();
+ return { count };
  }
- );
- server.get(
- "/api/jobs/:id",
- async (request, reply) => {
- try {
- const { id } = request.params;
- const job = await this.pipeline.getJob(id);
- if (!job) {
- return reply.status(404).send({ error: "Job not found" });
- }
- return reply.send(job);
- } catch (error) {
- logger.error(`API: Failed to get job ${request.params.id}: ${error}`);
- return reply.status(500).send({
- error: error instanceof Error ? error.message : String(error)
- });
- }
+ )
+ });
+ }
+ createPipelineRouter(t$1);
+ const t = initTRPC.context().create();
+ const nonEmpty = z$1.string().min(1).transform((s) => s.trim());
+ const optionalVersion = z$1.string().optional().nullable().transform((v) => typeof v === "string" ? v.trim() : v);
+ function createDataRouter(trpc) {
+ const tt = trpc;
+ return tt.router({
+ listLibraries: tt.procedure.query(async ({ ctx }) => {
+ return await ctx.docService.listLibraries();
+ }),
+ findBestVersion: tt.procedure.input(z$1.object({ library: nonEmpty, targetVersion: z$1.string().optional() })).query(
+ async ({
+ ctx,
+ input
+ }) => {
+ const result = await ctx.docService.findBestVersion(
+ input.library,
+ input.targetVersion
+ );
+ return result;
+ }
+ ),
+ validateLibraryExists: tt.procedure.input(z$1.object({ library: nonEmpty })).mutation(
+ async ({ ctx, input }) => {
+ await ctx.docService.validateLibraryExists(input.library);
+ return { ok: true };
+ }
+ ),
+ search: tt.procedure.input(
+ z$1.object({
+ library: nonEmpty,
+ version: optionalVersion,
+ query: nonEmpty,
+ limit: z$1.number().int().positive().max(50).optional()
+ })
+ ).query(
+ async ({
+ ctx,
+ input
+ }) => {
+ const results = await ctx.docService.searchStore(
+ input.library,
+ input.version ?? null,
+ input.query,
+ input.limit ?? 5
+ );
+ return results;
+ }
+ ),
+ removeAllDocuments: tt.procedure.input(z$1.object({ library: nonEmpty, version: optionalVersion })).mutation(
+ async ({
+ ctx,
+ input
+ }) => {
+ await ctx.docService.removeAllDocuments(input.library, input.version ?? null);
+ return { ok: true };
+ }
+ ),
+ // Status and version helpers
+ getVersionsByStatus: tt.procedure.input(z$1.object({ statuses: z$1.array(z$1.string()) })).query(
+ async ({
+ ctx,
+ input
+ }) => {
+ const statuses = input.statuses;
+ return await ctx.docService.getVersionsByStatus(
+ statuses
+ );
  }
- );
- server.delete(
- "/api/jobs/:id",
- async (request, reply) => {
- try {
- const { id } = request.params;
- await this.pipeline.cancelJob(id);
- logger.debug(`API: Cancelled job ${id}`);
- return reply.send({ success: true });
- } catch (error) {
- logger.error(`API: Failed to cancel job ${request.params.id}: ${error}`);
- return reply.status(500).send({
- error: error instanceof Error ? error.message : String(error)
- });
- }
+ ),
+ findVersionsBySourceUrl: tt.procedure.input(z$1.object({ url: nonEmpty })).query(async ({ ctx, input }) => {
+ return await ctx.docService.findVersionsBySourceUrl(
+ input.url
+ );
+ }),
+ getScraperOptions: tt.procedure.input(z$1.object({ versionId: z$1.number().int().positive() })).query(
+ async ({
+ ctx,
+ input
+ }) => {
+ return await ctx.docService.getScraperOptions(input.versionId);
+ }
+ ),
+ updateVersionStatus: tt.procedure.input(
+ z$1.object({
+ versionId: z$1.number().int().positive(),
+ status: z$1.string(),
+ errorMessage: z$1.string().optional().nullable()
+ })
+ ).mutation(
+ async ({
+ ctx,
+ input
+ }) => {
+ await ctx.docService.updateVersionStatus(
+ input.versionId,
+ input.status,
+ input.errorMessage ?? void 0
+ );
+ return { ok: true };
+ }
+ ),
+ updateVersionProgress: tt.procedure.input(
+ z$1.object({
+ versionId: z$1.number().int().positive(),
+ pages: z$1.number().int().nonnegative(),
+ maxPages: z$1.number().int().positive()
+ })
+ ).mutation(
+ async ({
+ ctx,
+ input
+ }) => {
+ await ctx.docService.updateVersionProgress(
+ input.versionId,
+ input.pages,
+ input.maxPages
+ );
+ return { ok: true };
  }
- );
- server.delete(
- "/api/jobs",
- async (_request, reply) => {
- try {
- const count = await this.pipeline.clearCompletedJobs();
- logger.debug(`API: Cleared ${count} completed jobs`);
- return reply.send({ count });
- } catch (error) {
- logger.error(`API: Failed to clear completed jobs: ${error}`);
- return reply.status(500).send({
- error: error instanceof Error ? error.message : String(error)
- });
- }
+ ),
+ storeScraperOptions: tt.procedure.input(
+ z$1.object({
+ versionId: z$1.number().int().positive(),
+ options: z$1.unknown()
+ })
+ ).mutation(
+ async ({
+ ctx,
+ input
+ }) => {
+ await ctx.docService.storeScraperOptions(
+ input.versionId,
+ input.options
+ );
+ return { ok: true };
  }
- );
- logger.debug("Pipeline API routes registered");
- }
+ )
+ });
  }
- async function registerPipelineApiService(server, pipeline) {
- const pipelineApiService = new PipelineApiService(pipeline);
- await pipelineApiService.registerRoutes(server);
+ createDataRouter(t);
+ async function registerTrpcService(server, pipeline, docService) {
+ const t2 = initTRPC.context().create();
+ const healthRouter = t2.router({
+ ping: t2.procedure.query(async () => ({ status: "ok", ts: Date.now() }))
+ });
+ const router = t2.mergeRouters(
+ healthRouter,
+ createPipelineRouter(t2),
+ createDataRouter(t2)
+ );
+ await server.register(fastifyTRPCPlugin, {
+ prefix: "/api",
+ trpcOptions: {
+ router,
+ createContext: async () => ({ pipeline, docService })
+ }
+ });
  }
  const Layout = ({ title, version: version2, children }) => {
  let versionString = version2;
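
The hand-rolled REST routes are gone; in their place a merged tRPC router (health, pipeline, and data procedures) is mounted at `/api` via the official Fastify adapter. A sketch of what a call against this router could look like from the outside (the router type, host, and port are assumptions; the package builds this client itself in `PipelineClient` below):

```ts
import { createTRPCProxyClient, httpBatchLink } from "@trpc/client";
// import type { AppRouter } from "..."; // hypothetical: the merged router's type

const client = createTRPCProxyClient<any /* AppRouter */>({
  links: [httpBatchLink({ url: "http://localhost:6280/api" })], // port is an assumption
});

await client.ping.query(); // health check, replaces GET /api/health
const { jobId } = await client.enqueueJob.mutate({
  library: "react",
  version: null, // null = unversioned
  options: { url: "https://react.dev/reference" },
});
const { jobs } = await client.getJobs.query(); // replaces GET /api/jobs
```

One behavioral difference worth noting: procedure inputs are now validated with zod (trimming, non-empty checks, a bounded `limit`), where the old Fastify handlers only checked for missing fields.
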
@@ -2522,7 +2665,7 @@ function normalizeVersionName(name) {
  return name ?? "";
  }
  function denormalizeVersionName(name) {
- return name === "" ? null : name;
+ return name === "" ? "" : name;
  }
  function getStatusDescription(status) {
  const descriptions = {
@@ -3426,8 +3569,8 @@ const VersionDetailsRow = ({
  // Default to true
  }) => {
  const indexedDate = version2.indexedAt ? new Date(version2.indexedAt).toLocaleDateString() : "N/A";
- const versionLabel = version2.version || "Unversioned";
- const versionParam = version2.version || "";
+ const versionLabel = version2.ref.version || "Unversioned";
+ const versionParam = version2.ref.version || "";
  const sanitizedLibraryName = libraryName.replace(/[^a-zA-Z0-9-_]/g, "-");
  const sanitizedVersionParam = versionParam.replace(/[^a-zA-Z0-9-_]/g, "-");
  const rowId = `row-${sanitizedLibraryName}-${sanitizedVersionParam}`;
@@ -3446,19 +3589,19 @@ const VersionDetailsRow = ({
  {
  class: "text-sm text-gray-900 dark:text-white w-1/4 truncate",
  title: versionLabel,
- children: version2.version ? /* @__PURE__ */ jsx(VersionBadge, { version: version2.version }) : /* @__PURE__ */ jsx("span", { children: "Unversioned" })
+ children: version2.ref.version ? /* @__PURE__ */ jsx(VersionBadge, { version: version2.ref.version }) : /* @__PURE__ */ jsx("span", { children: "Unversioned" })
  }
  ),
  /* @__PURE__ */ jsxs("div", { class: "flex space-x-2 text-sm text-gray-600 dark:text-gray-400 w-3/4 justify-end items-center", children: [
  /* @__PURE__ */ jsxs("span", { title: "Number of unique pages indexed", children: [
  "Pages:",
  " ",
- /* @__PURE__ */ jsx("span", { class: "font-semibold", safe: true, children: version2.uniqueUrlCount.toLocaleString() })
+ /* @__PURE__ */ jsx("span", { class: "font-semibold", safe: true, children: version2.counts.uniqueUrls.toLocaleString() })
  ] }),
  /* @__PURE__ */ jsxs("span", { title: "Number of indexed snippets", children: [
  "Snippets:",
  " ",
- /* @__PURE__ */ jsx("span", { class: "font-semibold", safe: true, children: version2.documentCount.toLocaleString() })
+ /* @__PURE__ */ jsx("span", { class: "font-semibold", safe: true, children: version2.counts.documents.toLocaleString() })
  ] }),
  /* @__PURE__ */ jsxs("span", { title: "Date last indexed", children: [
  "Last Update:",
@@ -3558,17 +3701,28 @@ const LibraryDetailCard = ({ library }) => (
  // Use Flowbite Card structure with updated padding and border, and white background
  /* @__PURE__ */ jsxs("div", { class: "block p-4 bg-white dark:bg-gray-800 rounded-lg shadow-sm border border-gray-300 dark:border-gray-600 mb-4", children: [
  /* @__PURE__ */ jsx("h3", { class: "text-lg font-medium text-gray-900 dark:text-white mb-1", children: /* @__PURE__ */ jsx("span", { safe: true, children: library.name }) }),
- /* @__PURE__ */ jsx("div", { class: "mt-1", children: library.versions.length > 0 ? library.versions.map((version2) => /* @__PURE__ */ jsx(
- VersionDetailsRow,
- {
- libraryName: library.name,
- version: version2,
- showDelete: false
- }
- )) : (
- // Display message if no versions are indexed
- /* @__PURE__ */ jsx("p", { class: "text-sm text-gray-500 dark:text-gray-400 italic", children: "No versions indexed." })
- ) })
+ /* @__PURE__ */ jsx("div", { class: "mt-1", children: library.versions.length > 0 ? library.versions.map((v) => {
+ const adapted = {
+ id: -1,
+ ref: { library: library.name, version: v.version },
+ status: v.status,
+ progress: v.progress,
+ counts: {
+ documents: v.documentCount,
+ uniqueUrls: v.uniqueUrlCount
+ },
+ indexedAt: v.indexedAt,
+ sourceUrl: v.sourceUrl ?? void 0
+ };
+ return /* @__PURE__ */ jsx(
+ VersionDetailsRow,
+ {
+ libraryName: library.name,
+ version: adapted,
+ showDelete: false
+ }
+ );
+ }) : /* @__PURE__ */ jsx("p", { class: "text-sm text-gray-500 dark:text-gray-400 italic", children: "No versions indexed." }) })
  ] })
  );
  const LibrarySearchCard = ({ library }) => {
@@ -3733,7 +3887,21 @@ const LibraryItem = ({ library }) => (
  children: /* @__PURE__ */ jsx("span", { safe: true, children: library.name })
  }
  ) }),
- /* @__PURE__ */ jsx("div", { class: "mt-1", children: library.versions.length > 0 ? library.versions.map((version2) => /* @__PURE__ */ jsx(VersionDetailsRow, { libraryName: library.name, version: version2 })) : (
+ /* @__PURE__ */ jsx("div", { class: "mt-1", children: library.versions.length > 0 ? library.versions.map((v) => {
+ const adapted = {
+ id: -1,
+ ref: { library: library.name, version: v.version },
+ status: v.status,
+ progress: v.progress,
+ counts: {
+ documents: v.documentCount,
+ uniqueUrls: v.uniqueUrlCount
+ },
+ indexedAt: v.indexedAt,
+ sourceUrl: v.sourceUrl ?? void 0
+ };
+ return /* @__PURE__ */ jsx(VersionDetailsRow, { libraryName: library.name, version: adapted });
+ }) : (
  // Display message if no versions are indexed
  /* @__PURE__ */ jsx("p", { class: "text-sm text-gray-500 dark:text-gray-400 italic", children: "No versions indexed." })
  ) })
@@ -3861,9 +4029,9 @@ class AppServer {
  );
  }
  }
- if (this.config.enableWorker && !this.config.enablePipelineApi) {
+ if (this.config.enableWorker && !this.config.enableApiServer) {
  logger.warn(
- "Warning: Worker is enabled but Pipeline API is disabled. Consider enabling Pipeline API for better observability."
+ "Warning: Worker is enabled but API server is disabled. Consider enabling the API for better observability."
  );
  }
  }
@@ -3915,8 +4083,8 @@ class AppServer {
  if (this.config.enableMcpServer) {
  await this.enableMcpServer();
  }
- if (this.config.enablePipelineApi) {
- await this.enablePipelineApi();
+ if (this.config.enableApiServer) {
+ await this.enableTrpcApi();
  }
  if (this.config.enableWorker) {
  await this.enableWorker();
@@ -3944,11 +4112,11 @@ class AppServer {
  logger.debug("MCP server service enabled");
  }
  /**
- * Enable Pipeline API service.
+ * Enable Pipeline RPC (tRPC) service.
  */
- async enablePipelineApi() {
- await registerPipelineApiService(this.server, this.pipeline);
- logger.debug("Pipeline API service enabled");
+ async enableTrpcApi() {
+ await registerTrpcService(this.server, this.pipeline, this.docService);
+ logger.debug("API server (tRPC) enabled");
  }
  /**
  * Enable worker service.
@@ -3977,10 +4145,10 @@ class AppServer {
  enabledServices.push(`Web interface: ${address}`);
  }
  if (this.config.enableMcpServer) {
- enabledServices.push(`MCP endpoint: ${address}/mcp, ${address}/sse`);
+ enabledServices.push(`MCP endpoints: ${address}/mcp, ${address}/sse`);
  }
- if (this.config.enablePipelineApi) {
- enabledServices.push(`Pipeline API: ${address}/api`);
+ if (this.config.enableApiServer) {
+ enabledServices.push(`API: ${address}/api`);
  }
  if (this.config.enableWorker) {
  enabledServices.push("Embedded worker: enabled");
@@ -4005,6 +4173,161 @@ async function startStdioServer(tools) {
  logger.info("🤖 MCP server listening on stdio");
  return server;
  }
+ class StoreError extends Error {
+ constructor(message, cause) {
+ super(cause ? `${message} caused by ${cause}` : message);
+ this.cause = cause;
+ this.name = this.constructor.name;
+ const causeError = cause instanceof Error ? cause : cause ? new Error(String(cause)) : void 0;
+ if (causeError?.stack) {
+ this.stack = causeError.stack;
+ }
+ }
+ }
+ class DimensionError extends StoreError {
+ constructor(modelName, modelDimension, dbDimension) {
+ super(
+ `Model "${modelName}" produces ${modelDimension}-dimensional vectors, which exceeds the database's fixed dimension of ${dbDimension}. Please use a model with dimension ≤ ${dbDimension}.`
+ );
+ this.modelName = modelName;
+ this.modelDimension = modelDimension;
+ this.dbDimension = dbDimension;
+ }
+ }
+ class ConnectionError extends StoreError {
+ }
+ const MIGRATIONS_DIR = path.join(getProjectRoot(), "db", "migrations");
+ const MIGRATIONS_TABLE = "_schema_migrations";
+ function ensureMigrationsTable(db) {
+ db.exec(`
+ CREATE TABLE IF NOT EXISTS ${MIGRATIONS_TABLE} (
+ id TEXT PRIMARY KEY,
+ applied_at DATETIME DEFAULT CURRENT_TIMESTAMP
+ );
+ `);
+ }
+ function getAppliedMigrations(db) {
+ const stmt = db.prepare(`SELECT id FROM ${MIGRATIONS_TABLE}`);
+ const rows = stmt.all();
+ return new Set(rows.map((row) => row.id));
+ }
+ async function applyMigrations(db) {
+ try {
+ db.pragma("journal_mode = OFF");
+ db.pragma("synchronous = OFF");
+ db.pragma("mmap_size = 268435456");
+ db.pragma("cache_size = -64000");
+ db.pragma("temp_store = MEMORY");
+ logger.debug("Applied performance optimizations for migration");
+ } catch (_error) {
+ logger.warn("⚠️ Could not apply all performance optimizations for migration");
+ }
+ const overallTransaction = db.transaction(() => {
+ logger.debug("Checking database migrations...");
+ ensureMigrationsTable(db);
+ const appliedMigrations = getAppliedMigrations(db);
+ if (!fs$1.existsSync(MIGRATIONS_DIR)) {
+ throw new StoreError("Migrations directory not found");
+ }
+ const migrationFiles = fs$1.readdirSync(MIGRATIONS_DIR).filter((file) => file.endsWith(".sql")).sort();
+ const pendingMigrations = migrationFiles.filter(
+ (filename) => !appliedMigrations.has(filename)
+ );
+ if (pendingMigrations.length > 0) {
+ logger.info(`🔄 Applying ${pendingMigrations.length} database migration(s)...`);
+ }
+ let appliedCount = 0;
+ for (const filename of pendingMigrations) {
+ logger.debug(`Applying migration: ${filename}`);
+ const filePath = path.join(MIGRATIONS_DIR, filename);
+ const sql = fs$1.readFileSync(filePath, "utf8");
+ try {
+ db.exec(sql);
+ const insertStmt = db.prepare(`INSERT INTO ${MIGRATIONS_TABLE} (id) VALUES (?)`);
+ insertStmt.run(filename);
+ logger.debug(`✅ Applied migration: ${filename}`);
+ appliedCount++;
+ } catch (error) {
+ logger.error(`❌ Failed to apply migration: ${filename} - ${error}`);
+ throw new StoreError(`Migration failed: ${filename}`, error);
+ }
+ }
+ if (appliedCount > 0) {
+ logger.info(`✅ Successfully applied ${appliedCount} migration(s)`);
+ } else {
+ logger.debug("Database schema is up to date");
+ }
+ return appliedCount;
+ });
+ let retries = 0;
+ let appliedMigrationsCount = 0;
+ while (true) {
+ try {
+ appliedMigrationsCount = overallTransaction.immediate();
+ logger.debug("Database migrations completed successfully");
+ if (appliedMigrationsCount > 0) {
+ try {
+ logger.debug(
+ `Running VACUUM after applying ${appliedMigrationsCount} migration(s)...`
+ );
+ db.exec("VACUUM");
+ logger.debug("Database vacuum completed successfully");
+ } catch (error) {
+ logger.warn(`⚠️ Could not vacuum database after migrations: ${error}`);
+ }
+ } else {
+ logger.debug("Skipping VACUUM - no migrations were applied");
+ }
+ break;
+ } catch (error) {
+ if (error?.code === "SQLITE_BUSY" && retries < MIGRATION_MAX_RETRIES) {
+ retries++;
+ logger.warn(
+ `⚠️ Migrations busy (SQLITE_BUSY), retrying attempt ${retries}/${MIGRATION_MAX_RETRIES} in ${MIGRATION_RETRY_DELAY_MS}ms...`
+ );
+ await new Promise((resolve) => setTimeout(resolve, MIGRATION_RETRY_DELAY_MS));
+ } else {
+ if (error?.code === "SQLITE_BUSY") {
+ logger.error(
+ `❌ Migrations still busy after ${MIGRATION_MAX_RETRIES} retries. Giving up: ${error}`
+ );
+ }
+ if (error instanceof StoreError) {
+ throw error;
+ }
+ throw new StoreError("Failed during migration process", error);
+ }
+ }
+ }
+ try {
+ db.pragma("journal_mode = WAL");
+ db.pragma("wal_autocheckpoint = 1000");
+ db.pragma("busy_timeout = 30000");
+ db.pragma("foreign_keys = ON");
+ db.pragma("synchronous = NORMAL");
+ logger.debug(
+ "Applied production database configuration (WAL mode, autocheckpoint, foreign keys, busy timeout)"
+ );
+ } catch (_error) {
+ logger.warn("⚠️ Could not apply all production database settings");
+ }
+ }
+ async function createDocumentManagement(options = {}) {
+ if (options.serverUrl) {
+ const { DocumentManagementClient } = await import("./DocumentManagementClient-CAFdDwTu.js");
+ const client = new DocumentManagementClient(options.serverUrl);
+ await client.initialize();
+ return client;
+ }
+ const service = new (await import("./DocumentManagementService-BH02TJEe.js")).DocumentManagementService();
+ await service.initialize();
+ return service;
+ }
+ async function createLocalDocumentManagement() {
+ const service = new (await import("./DocumentManagementService-BH02TJEe.js")).DocumentManagementService();
+ await service.initialize();
+ return service;
+ }
  function deserializeJob(serializedJob) {
  return {
  ...serializedJob,
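
The new store layer is bundled in directly (note the side-effect imports of `better-sqlite3` and `sqlite-vec` at the top). Its migration runner is the classic bookkeeping-table pattern: lexically sorted `.sql` files, each applied at most once inside a single transaction and then recorded in `_schema_migrations`. Compressed to its essentials with `better-sqlite3` (paths illustrative; the retry loop, pragma tuning, and VACUUM are omitted):

```ts
import Database from "better-sqlite3";
import fs from "node:fs";
import path from "node:path";

const db = new Database("docs.db"); // illustrative database path
db.exec(`CREATE TABLE IF NOT EXISTS _schema_migrations (
  id TEXT PRIMARY KEY,
  applied_at DATETIME DEFAULT CURRENT_TIMESTAMP
);`);

const applied = new Set(
  db.prepare("SELECT id FROM _schema_migrations").all().map((r: any) => r.id),
);
const dir = "db/migrations"; // illustrative migrations directory
const pending = fs.readdirSync(dir)
  .filter((f) => f.endsWith(".sql")).sort()
  .filter((f) => !applied.has(f));

// One BEGIN IMMEDIATE transaction for all pending files, as in applyMigrations above.
db.transaction(() => {
  for (const file of pending) {
    db.exec(fs.readFileSync(path.join(dir, file), "utf8"));
    db.prepare("INSERT INTO _schema_migrations (id) VALUES (?)").run(file);
  }
}).immediate();
```
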
@@ -4016,21 +4339,22 @@ function deserializeJob(serializedJob) {
  }
  class PipelineClient {
  baseUrl;
+ client;
  pollingInterval = 1e3;
  // 1 second
  activePolling = /* @__PURE__ */ new Set();
  // Track jobs being polled for completion
  constructor(serverUrl) {
  this.baseUrl = serverUrl.replace(/\/$/, "");
- logger.debug(`PipelineClient created for: ${this.baseUrl}`);
+ this.client = createTRPCProxyClient({
+ links: [httpBatchLink({ url: this.baseUrl })]
+ });
+ logger.debug(`PipelineClient (tRPC) created for: ${this.baseUrl}`);
  }
  async start() {
  try {
- const response = await fetch(`${this.baseUrl}/health`);
- if (!response.ok) {
- throw new Error(`External worker health check failed: ${response.status}`);
- }
- logger.debug("PipelineClient connected to external worker");
+ await this.client.ping.query();
+ logger.debug("PipelineClient connected to external worker via tRPC");
  } catch (error) {
  throw new Error(
  `Failed to connect to external worker at ${this.baseUrl}: ${error instanceof Error ? error.message : String(error)}`
@@ -4043,25 +4367,14 @@ class PipelineClient {
  }
  async enqueueJob(library, version2, options) {
  try {
- const response = await fetch(`${this.baseUrl}/jobs`, {
- method: "POST",
- headers: {
- "Content-Type": "application/json"
- },
- body: JSON.stringify({
- library,
- version: version2,
- options
- })
+ const normalizedVersion = typeof version2 === "string" && version2.trim().length === 0 ? null : version2 ?? null;
+ const result = await this.client.enqueueJob.mutate({
+ library,
+ version: normalizedVersion,
+ options
  });
- if (!response.ok) {
- const errorText = await response.text();
- throw new Error(`Failed to enqueue job: ${response.status} ${errorText}`);
- }
- const result = await response.json();
- const jobId = result.jobId;
- logger.debug(`Job ${jobId} enqueued successfully`);
- return jobId;
+ logger.debug(`Job ${result.jobId} enqueued successfully`);
+ return result.jobId;
  } catch (error) {
  throw new Error(
  `Failed to enqueue job: ${error instanceof Error ? error.message : String(error)}`
@@ -4070,15 +4383,8 @@ class PipelineClient {
  }
  async getJob(jobId) {
  try {
- const response = await fetch(`${this.baseUrl}/jobs/${jobId}`);
- if (response.status === 404) {
- return void 0;
- }
- if (!response.ok) {
- throw new Error(`Failed to get job: ${response.status} ${response.statusText}`);
- }
- const serializedJob = await response.json();
- return deserializeJob(serializedJob);
+ const serializedJob = await this.client.getJob.query({ id: jobId });
+ return serializedJob ? deserializeJob(serializedJob) : void 0;
  } catch (error) {
  throw new Error(
  `Failed to get job ${jobId}: ${error instanceof Error ? error.message : String(error)}`
@@ -4087,18 +4393,11 @@ class PipelineClient {
  }
  async getJobs(status) {
  try {
- const url = new URL(`${this.baseUrl}/jobs`);
- if (status) {
- url.searchParams.set("status", status);
- }
- const response = await fetch(url.toString());
- if (!response.ok) {
- const errorText = await response.text();
- throw new Error(`Failed to get jobs: ${response.status} ${errorText}`);
- }
- const result = await response.json();
+ const result = await this.client.getJobs.query({ status });
  const serializedJobs = result.jobs || [];
- return serializedJobs.map(deserializeJob);
+ return serializedJobs.map(
+ (j) => deserializeJob(j)
+ );
  } catch (error) {
  logger.error(`Failed to get jobs from external worker: ${error}`);
  throw error;
@@ -4106,13 +4405,7 @@ class PipelineClient {
  }
  async cancelJob(jobId) {
  try {
- const response = await fetch(`${this.baseUrl}/jobs/${jobId}`, {
- method: "DELETE"
- });
- if (!response.ok) {
- const errorText = await response.text();
- throw new Error(`Failed to cancel job: ${response.status} ${errorText}`);
- }
+ await this.client.cancelJob.mutate({ id: jobId });
  logger.debug(`Job cancelled via external worker: ${jobId}`);
  } catch (error) {
  logger.error(`Failed to cancel job ${jobId} via external worker: ${error}`);
@@ -4121,16 +4414,7 @@ class PipelineClient {
  }
  async clearCompletedJobs() {
  try {
- const response = await fetch(`${this.baseUrl}/jobs`, {
- method: "DELETE"
- });
- if (!response.ok) {
- const errorText = await response.text();
- throw new Error(
- `Failed to clear completed jobs: ${response.status} ${errorText}`
- );
- }
- const result = await response.json();
+ const result = await this.client.clearCompletedJobs.mutate();
  logger.debug(`Cleared ${result.count} completed jobs via external worker`);
  return result.count || 0;
  } catch (error) {
@@ -4210,17 +4494,33 @@ function validateUrl(url) {
  throw new InvalidUrlError(url, error instanceof Error ? error : void 0);
  }
  }
- function hasSameHostname(urlA, urlB) {
- return urlA.hostname.toLowerCase() === urlB.hostname.toLowerCase();
- }
- function hasSameDomain(urlA, urlB) {
- const domainA = psl.get(urlA.hostname.toLowerCase());
- const domainB = psl.get(urlB.hostname.toLowerCase());
- return domainA !== null && domainA === domainB;
+ function computeBaseDirectory(pathname) {
+ if (pathname === "") return "/";
+ if (pathname.endsWith("/")) return pathname;
+ const lastSegment = pathname.split("/").at(-1) || "";
+ const looksLikeFile = lastSegment.includes(".");
+ if (looksLikeFile) {
+ return pathname.replace(/\/[^/]*$/, "/");
+ }
+ return `${pathname}/`;
  }
- function isSubpath(baseUrl, targetUrl) {
- const basePath = baseUrl.pathname.endsWith("/") ? baseUrl.pathname : `${baseUrl.pathname}/`;
- return targetUrl.pathname.startsWith(basePath);
+ function isInScope(baseUrl, targetUrl, scope) {
+ if (baseUrl.protocol !== targetUrl.protocol) return false;
+ switch (scope) {
+ case "subpages": {
+ if (baseUrl.hostname !== targetUrl.hostname) return false;
+ const baseDir = computeBaseDirectory(baseUrl.pathname);
+ return targetUrl.pathname.startsWith(baseDir);
+ }
+ case "hostname":
+ return baseUrl.hostname === targetUrl.hostname;
+ case "domain": {
+ const getDomain = (host) => host.split(".").slice(-2).join(".");
+ return getDomain(baseUrl.hostname) === getDomain(targetUrl.hostname);
+ }
+ default:
+ return false;
+ }
  }
  function isRegexPattern(pattern) {
  return pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/");
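
The consolidated `isInScope` drops the `psl` public-suffix lookup in favor of comparing the last two host labels, and `computeBaseDirectory` fixes the `subpages` base for extension-less paths: `/docs/index.html` scopes to `/docs/`, while `/docs/guide` now scopes to `/docs/guide/` instead of `/docs/`. Concretely (results follow directly from the code above):

```ts
const base = new URL("https://example.com/docs/index.html");

isInScope(base, new URL("https://example.com/docs/api/fetch.html"), "subpages"); // true
isInScope(base, new URL("https://example.com/blog/post"), "subpages");           // false
isInScope(base, new URL("https://example.com/blog/post"), "hostname");           // true
isInScope(base, new URL("https://docs.example.com/"), "domain");                 // true
isInScope(base, new URL("http://example.com/docs/"), "subpages");                // false, protocol differs
```

The two-label heuristic is cruder than `psl` and will treat `a.co.uk` and `b.co.uk` as the same domain; presumably an accepted trade-off for dropping the dependency.
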
@@ -4268,24 +4568,6 @@ function shouldIncludeUrl(url, includePatterns, excludePatterns) {
  if (!includePatterns || includePatterns.length === 0) return true;
  return matchesAnyPattern(normalizedPath, includePatterns) || (basename ? matchesAnyPattern(basename, stripSlash(includePatterns)) : false);
  }
- function isInScope(baseUrl, targetUrl, scope) {
- if (baseUrl.protocol !== targetUrl.protocol) return false;
- switch (scope) {
- case "subpages": {
- if (baseUrl.hostname !== targetUrl.hostname) return false;
- const baseDir = baseUrl.pathname.endsWith("/") ? baseUrl.pathname : baseUrl.pathname.replace(/\/[^/]*$/, "/");
- return targetUrl.pathname.startsWith(baseDir);
- }
- case "hostname":
- return baseUrl.hostname === targetUrl.hostname;
- case "domain": {
- const getDomain = (host) => host.split(".").slice(-2).join(".");
- return getDomain(baseUrl.hostname) === getDomain(targetUrl.hostname);
- }
- default:
- return false;
- }
- }
  const DEFAULT_MAX_DEPTH = 3;
  const DEFAULT_CONCURRENCY = 3;
  class BaseScraperStrategy {
@@ -4294,6 +4576,8 @@ class BaseScraperStrategy {
  totalDiscovered = 0;
  // Track total URLs discovered (unlimited)
  effectiveTotal = 0;
+ // Track effective total (limited by maxPages)
+ canonicalBaseUrl;
  options;
  constructor(options = {}) {
  this.options = options;
@@ -4305,7 +4589,7 @@ class BaseScraperStrategy {
  shouldProcessUrl(url, options) {
  if (options.scope) {
  try {
- const base = new URL$1(options.url);
+ const base = this.canonicalBaseUrl ?? new URL$1(options.url);
  const target = new URL$1(url);
  if (!isInScope(base, target, options.scope)) return false;
  } catch {
@@ -4328,6 +4612,23 @@ class BaseScraperStrategy {
  }
  try {
  const result = await this.processItem(item, options, void 0, signal);
+ if (item.depth === 0 && !this.canonicalBaseUrl && result?.finalUrl) {
+ try {
+ const finalUrlStr = result.finalUrl;
+ const original = new URL$1(options.url);
+ const finalUrlObj = new URL$1(finalUrlStr);
+ if (finalUrlObj.href !== original.href && (finalUrlObj.protocol === "http:" || finalUrlObj.protocol === "https:")) {
+ this.canonicalBaseUrl = finalUrlObj;
+ logger.debug(
+ `Updated scope base after redirect: ${original.href} -> ${finalUrlObj.href}`
+ );
+ } else {
+ this.canonicalBaseUrl = original;
+ }
+ } catch {
+ this.canonicalBaseUrl = new URL$1(options.url);
+ }
+ }
  if (result.document) {
  this.pageCount++;
  logger.info(
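
This block rebases the crawl scope exactly once, after the first (depth-0) fetch, using the `finalUrl` that `HttpFetcher` now reports. Without it, a seed URL that redirects would leave every discovered link out of scope. Roughly, in terms of the shared `isInScope`:

```ts
// Seed https://example.com/docs redirects (301) to https://docs.example.com/
const seed = new URL("https://example.com/docs");
const landed = new URL("https://docs.example.com/");

// Before: scope was checked against the pre-redirect seed.
isInScope(seed, new URL("https://docs.example.com/guide"), "subpages");   // false
// After: canonicalBaseUrl becomes the landing URL, so links stay in scope.
isInScope(landed, new URL("https://docs.example.com/guide"), "subpages"); // true
```
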
@@ -4388,7 +4689,8 @@ class BaseScraperStrategy {
  this.pageCount = 0;
  this.totalDiscovered = 1;
  this.effectiveTotal = 1;
- const baseUrl = new URL$1(options.url);
+ this.canonicalBaseUrl = new URL$1(options.url);
+ let baseUrl = this.canonicalBaseUrl;
  const queue = [{ url: options.url, depth: 0 }];
  this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
  const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
@@ -4409,6 +4711,7 @@ class BaseScraperStrategy {
  queue.length
  );
  const batch = queue.splice(0, batchSize);
+ baseUrl = this.canonicalBaseUrl ?? baseUrl;
  const newUrls = await this.processBatch(
  batch,
  baseUrl,
@@ -4441,22 +4744,7 @@ class WebScraperStrategy extends BaseScraperStrategy {
  return false;
  }
  }
- /**
- * Determines if a target URL should be followed based on the scope setting.
- */
- isInScope(baseUrl, targetUrl, scope) {
- try {
- if (scope === "domain") {
- return hasSameDomain(baseUrl, targetUrl);
- }
- if (scope === "hostname") {
- return hasSameHostname(baseUrl, targetUrl);
- }
- return hasSameHostname(baseUrl, targetUrl) && isSubpath(baseUrl, targetUrl);
- } catch {
- return false;
- }
- }
+ // Removed custom isInScope logic; using shared scope utility for consistent behavior
  /**
  * Processes a single queue item by fetching its content and processing it through pipelines.
  * @param item - The queue item to process.
@@ -4497,12 +4785,12 @@ class WebScraperStrategy extends BaseScraperStrategy {
  );
  return { document: void 0, links: processed.links };
  }
- const baseUrl = new URL(options.url);
+ const baseUrl = item.depth === 0 ? new URL(rawContent.source) : this.canonicalBaseUrl ?? new URL(options.url);
  const filteredLinks = processed.links.filter((link) => {
  try {
  const targetUrl = new URL(link);
  const scope = options.scope || "subpages";
- return this.isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
+ return isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
  } catch {
  return false;
  }
@@ -4518,7 +4806,8 @@ class WebScraperStrategy extends BaseScraperStrategy {
  ...processed.metadata
  }
  },
- links: filteredLinks
+ links: filteredLinks,
+ finalUrl: rawContent.source
  };
  } catch (error) {
  logger.error(`❌ Failed processing page ${url}: ${error}`);
@@ -4901,7 +5190,9 @@ class PipelineManager {
  */
  async recoverPendingJobs() {
  try {
- const runningVersions = await this.store.getRunningVersions();
+ const runningVersions = await this.store.getVersionsByStatus([
+ VersionStatus.RUNNING
+ ]);
  for (const version2 of runningVersions) {
  await this.store.updateVersionStatus(version2.id, VersionStatus.QUEUED);
  logger.info(
@@ -5056,25 +5347,25 @@ class PipelineManager {
  async enqueueJobWithStoredOptions(library, version2) {
  const normalizedVersion = version2 ?? "";
  try {
- const versionId = await this.store.ensureLibraryAndVersion(
+ const versionId = await this.store.ensureVersion({
  library,
- normalizedVersion
- );
- const versionRecord = await this.store.getVersionWithStoredOptions(versionId);
- if (!versionRecord?.scraper_options || !versionRecord.source_url) {
+ version: normalizedVersion
+ });
+ const stored = await this.store.getScraperOptions(versionId);
+ if (!stored) {
  throw new Error(
  `No stored scraper options found for ${library}@${normalizedVersion || "unversioned"}`
  );
  }
- const storedOptions = JSON.parse(versionRecord.scraper_options);
+ const storedOptions = stored.options;
  const completeOptions = {
- url: versionRecord.source_url,
+ url: stored.sourceUrl,
  library,
  version: normalizedVersion,
  ...storedOptions
  };
  logger.info(
- `🔄 Re-indexing ${library}@${normalizedVersion || "unversioned"} with stored options from ${versionRecord.source_url}`
+ `🔄 Re-indexing ${library}@${normalizedVersion || "unversioned"} with stored options from ${stored.sourceUrl}`
  );
  return this.enqueueJob(library, normalizedVersion, completeOptions);
  } catch (error) {
@@ -5351,2130 +5642,123 @@ var PipelineFactory;
5351
5642
  logger.debug(`Creating PipelineClient for external worker at: ${serverUrl}`);
5352
5643
  return new PipelineClient(serverUrl);
5353
5644
  }
5354
- return new PipelineManager(docService, concurrency, { recoverJobs });
5645
+ return new PipelineManager(docService, concurrency, {
5646
+ recoverJobs
5647
+ });
5355
5648
  }
5356
5649
  PipelineFactory2.createPipeline = createPipeline;
5357
5650
  })(PipelineFactory || (PipelineFactory = {}));
5358
- class SplitterError extends Error {
5359
- }
5360
- class MinimumChunkSizeError extends SplitterError {
5361
- constructor(size, maxSize) {
5362
- super(
5363
- `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
5651
+ function ensurePlaywrightBrowsersInstalled() {
5652
+ const chromiumEnvPath = process.env.PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH;
5653
+ if (chromiumEnvPath && existsSync(chromiumEnvPath)) {
5654
+ logger.debug(
5655
+ `PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH is set to '${chromiumEnvPath}', skipping Playwright browser install.`
5364
5656
  );
5657
+ return;
5365
5658
  }
5366
- }
5367
- class ContentSplitterError extends SplitterError {
5368
- }
5369
- class GreedySplitter {
5370
- baseSplitter;
5371
- minChunkSize;
5372
- preferredChunkSize;
5373
- /**
5374
- * Combines a base document splitter with size constraints to produce optimally-sized chunks.
5375
- * The base splitter handles the initial semantic splitting, while this class handles
5376
- * the concatenation strategy.
5377
- */
5378
- constructor(baseSplitter, minChunkSize, preferredChunkSize) {
5379
- this.baseSplitter = baseSplitter;
5380
- this.minChunkSize = minChunkSize;
5381
- this.preferredChunkSize = preferredChunkSize;
5382
- }
5383
- /**
5384
- * Uses a greedy concatenation strategy to build optimally-sized chunks. Small chunks
5385
- * are combined until they reach the minimum size, but splits are preserved at major
5386
- * section boundaries to maintain document structure. This balances the need for
5387
- * context with semantic coherence.
5388
- */
5389
- async splitText(markdown) {
5390
- const initialChunks = await this.baseSplitter.splitText(markdown);
5391
- const concatenatedChunks = [];
5392
- let currentChunk = null;
5393
- for (const nextChunk of initialChunks) {
5394
- if (currentChunk) {
5395
- if (this.wouldExceedMaxSize(currentChunk, nextChunk)) {
5396
- concatenatedChunks.push(currentChunk);
5397
- currentChunk = this.cloneChunk(nextChunk);
5398
- continue;
5399
- }
5400
- if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk)) {
5401
- concatenatedChunks.push(currentChunk);
5402
- currentChunk = this.cloneChunk(nextChunk);
5403
- continue;
5404
- }
5405
- currentChunk.content += `
5406
- ${nextChunk.content}`;
5407
- currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
5408
- currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
5409
- } else {
5410
- currentChunk = this.cloneChunk(nextChunk);
5411
- }
5412
- }
5413
- if (currentChunk) {
5414
- concatenatedChunks.push(currentChunk);
5415
- }
5416
- return concatenatedChunks;
5417
- }
5418
- cloneChunk(chunk) {
5419
- return {
5420
- types: [...chunk.types],
5421
- content: chunk.content,
5422
- section: {
5423
- level: chunk.section.level,
5424
- path: [...chunk.section.path]
5425
- }
5426
- };
5427
- }
5428
- /**
5429
- * H1 and H2 headings represent major conceptual breaks in the document.
5430
- * Preserving these splits helps maintain the document's logical structure.
5431
- */
5432
- startsNewMajorSection(chunk) {
5433
- return chunk.section.level === 1 || chunk.section.level === 2;
5434
- }
5435
- /**
5436
- * Size limit check to ensure chunks remain within embedding model constraints.
5437
- * Essential for maintaining consistent embedding quality and avoiding truncation.
5438
- */
5439
- wouldExceedMaxSize(currentChunk, nextChunk) {
5440
- if (!currentChunk) {
5441
- return false;
5442
- }
5443
- return currentChunk.content.length + nextChunk.content.length > this.preferredChunkSize;
5444
- }
5445
- /**
5446
- * Checks if one path is a prefix of another path, indicating a parent-child relationship
5447
- */
5448
- isPathIncluded(parentPath, childPath) {
5449
- if (parentPath.length >= childPath.length) return false;
5450
- return parentPath.every((part, i) => part === childPath[i]);
5451
- }
5452
- /**
5453
- * Merges section metadata when concatenating chunks, following these rules:
5454
- * 1. Level: Always uses the lowest (most general) level between chunks
5455
- * 2. Path selection:
5456
- * - For parent-child relationships (one path includes the other), uses the child's path
5457
- * - For siblings/unrelated sections, uses the common parent path
5458
- * - If no common path exists, uses the root path ([])
5459
- */
5460
- mergeSectionInfo(currentChunk, nextChunk) {
5461
- const level = Math.min(currentChunk.section.level, nextChunk.section.level);
5462
- if (currentChunk.section.level === nextChunk.section.level && currentChunk.section.path.length === nextChunk.section.path.length && currentChunk.section.path.every((p, i) => p === nextChunk.section.path[i])) {
5463
- return currentChunk.section;
5464
- }
5465
- if (this.isPathIncluded(currentChunk.section.path, nextChunk.section.path)) {
5466
- return {
5467
- path: nextChunk.section.path,
5468
- level
5469
- };
5470
- }
5471
- if (this.isPathIncluded(nextChunk.section.path, currentChunk.section.path)) {
5472
- return {
5473
- path: currentChunk.section.path,
5474
- level
5475
- };
5659
+ try {
5660
+ const chromiumPath = chromium.executablePath();
5661
+ if (!chromiumPath || !existsSync(chromiumPath)) {
5662
+ throw new Error("Playwright Chromium browser not found");
5476
5663
  }
5477
- const commonPath = this.findCommonPrefix(
5478
- currentChunk.section.path,
5479
- nextChunk.section.path
5664
+ } catch (_err) {
5665
+ logger.debug(
5666
+ "Playwright browsers not found. Installing Chromium browser for dynamic scraping (this may take a minute)..."
5480
5667
  );
5481
- return {
5482
- path: commonPath,
5483
- level
5484
- };
5485
- }
5486
- mergeTypes(currentTypes, nextTypes) {
5487
- return [.../* @__PURE__ */ new Set([...currentTypes, ...nextTypes])];
5488
- }
5489
- /**
5490
- * Returns longest common prefix between two paths
5491
- */
5492
- findCommonPrefix(path1, path2) {
5493
- const common = [];
5494
- for (let i = 0; i < Math.min(path1.length, path2.length); i++) {
5495
- if (path1[i] === path2[i]) {
5496
- common.push(path1[i]);
5497
- } else {
5498
- break;
5499
- }
5668
+ try {
5669
+ logger.debug("Installing Playwright Chromium browser...");
5670
+ execSync("npm exec -y playwright install --no-shell --with-deps chromium", {
5671
+ stdio: "ignore",
5672
+ // Suppress output
5673
+ cwd: getProjectRoot()
5674
+ });
5675
+ } catch (_installErr) {
5676
+ console.error(
5677
+ "❌ Failed to install Playwright browsers automatically. Please run:\n npx playwright install --no-shell --with-deps chromium\nand try again."
5678
+ );
5679
+ process.exit(1);
5500
5680
  }
5501
- return common;
5502
5681
  }
5503
5682
  }
5504
- const fullTrim = (str) => {
5505
- return str.replace(/^[\s\r\n\t]+|[\s\r\n\t]+$/g, "");
5506
- };
5507
- class CodeContentSplitter {
5508
- constructor(options) {
5509
- this.options = options;
5510
- }
5511
- async split(content) {
5512
- const language = content.match(/^```(\w+)\n/)?.[1];
5513
- const strippedContent = content.replace(/^```(\w*)\n/, "").replace(/```\s*$/, "");
5514
- const lines = strippedContent.split("\n");
5515
- const chunks = [];
5516
- let currentChunkLines = [];
5517
- for (const line of lines) {
5518
- const singleLineSize = this.wrap(line, language).length;
5519
- if (singleLineSize > this.options.chunkSize) {
5520
- throw new MinimumChunkSizeError(singleLineSize, this.options.chunkSize);
5521
- }
5522
- currentChunkLines.push(line);
5523
- const newChunkContent = this.wrap(currentChunkLines.join("\n"), language);
5524
- const newChunkSize = newChunkContent.length;
5525
- if (newChunkSize > this.options.chunkSize && currentChunkLines.length > 1) {
5526
- const lastLine = currentChunkLines.pop();
5527
- chunks.push(this.wrap(currentChunkLines.join("\n"), language));
5528
- currentChunkLines = [lastLine];
5529
- }
5530
- }
5531
- if (currentChunkLines.length > 0) {
5532
- chunks.push(this.wrap(currentChunkLines.join("\n"), language));
5683
+ function resolveProtocol(protocol) {
5684
+ if (protocol === "auto") {
5685
+ if (!process.stdin.isTTY && !process.stdout.isTTY) {
5686
+ return "stdio";
5533
5687
  }
5534
- return chunks;
5688
+ return "http";
5535
5689
  }
5536
- wrap(content, language) {
5537
- return `\`\`\`${language || ""}
5538
- ${content.replace(/\n+$/, "")}
5539
- \`\`\``;
5690
+ if (protocol === "stdio" || protocol === "http") {
5691
+ return protocol;
5540
5692
  }
5693
+ throw new Error(`Invalid protocol: ${protocol}. Must be 'auto', 'stdio', or 'http'`);
5541
5694
  }
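`resolveProtocol("auto")` picks stdio only when neither stdin nor stdout is a TTY, which is the shape an MCP client gives a spawned server process; anything interactive resolves to HTTP. A usage sketch:

```js
// Run from a terminal: both streams are TTYs, so "auto" resolves to "http".
// Spawned by an MCP client over pipes: neither is a TTY, so it resolves to "stdio".
const protocol = resolveProtocol("auto");
console.log(protocol);
```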
5542
- class TableContentSplitter {
5543
- constructor(options) {
5544
- this.options = options;
5545
- }
5546
- /**
5547
- * Splits table content into chunks while preserving table structure
5548
- */
5549
- async split(content) {
5550
- const parsedTable = this.parseTable(content);
5551
- if (!parsedTable) {
5552
- return [content];
5553
- }
5554
- const { headers, rows } = parsedTable;
5555
- const chunks = [];
5556
- let currentRows = [];
5557
- for (const row of rows) {
5558
- const singleRowSize = this.wrap(row, headers).length;
5559
- if (singleRowSize > this.options.chunkSize) {
5560
- throw new MinimumChunkSizeError(singleRowSize, this.options.chunkSize);
5561
- }
5562
- const newChunkContent = this.wrap([...currentRows, row].join("\n"), headers);
5563
- const newChunkSize = newChunkContent.length;
5564
- if (newChunkSize > this.options.chunkSize && currentRows.length > 0) {
5565
- chunks.push(this.wrap(currentRows.join("\n"), headers));
5566
- currentRows = [row];
5567
- } else {
5568
- currentRows.push(row);
5569
- }
5570
- }
5571
- if (currentRows.length > 0) {
5572
- chunks.push(this.wrap(currentRows.join("\n"), headers));
5573
- }
5574
- return chunks;
5575
- }
5576
- wrap(content, headers) {
5577
- const headerRow = `| ${headers.join(" | ")} |`;
5578
- const separatorRow = `|${headers.map(() => "---").join("|")}|`;
5579
- return [headerRow, separatorRow, content].join("\n");
5580
- }
5581
- parseTable(content) {
5582
- const lines = content.trim().split("\n");
5583
- if (lines.length < 3) return null;
5584
- const headers = this.parseRow(lines[0]);
5585
- if (!headers) return null;
5586
- const separator = lines[1];
5587
- if (!this.isValidSeparator(separator)) return null;
5588
- const rows = lines.slice(2).filter((row) => row.trim() !== "");
5589
- return { headers, separator, rows };
5590
- }
5591
- /**
5592
- * Parses a table row into cells
5593
- */
5594
- parseRow(row) {
5595
- if (!row.includes("|")) return null;
5596
- return row.split("|").map((cell) => cell.trim()).filter((cell) => cell !== "");
5695
+ const formatOutput = (data) => JSON.stringify(data, null, 2);
5696
+ function setupLogging(options, protocol) {
5697
+ if (options.silent) {
5698
+ setLogLevel(LogLevel.ERROR);
5699
+ } else if (options.verbose) {
5700
+ setLogLevel(LogLevel.DEBUG);
5597
5701
  }
5598
- /**
5599
- * Validates the separator row of the table
5600
- */
5601
- isValidSeparator(separator) {
5602
- return separator.includes("|") && /^\|?[\s-|]+\|?$/.test(separator);
5702
+ if (protocol === "stdio") {
5703
+ setLogLevel(LogLevel.ERROR);
5603
5704
  }
5604
5705
  }
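The removed `TableContentSplitter` keeps every chunk independently renderable by re-emitting the header and separator rows via `wrap`. A sketch mirroring that behavior (sample data ours):

```js
// Every chunk carries its own header and separator, so each renders as a complete table.
const wrap = (rows, headers) =>
  [`| ${headers.join(" | ")} |`, `|${headers.map(() => "---").join("|")}|`, rows].join("\n");

console.log(wrap("| Ada | 36 |\n| Bob | 41 |", ["Name", "Age"]));
// | Name | Age |
// |---|---|
// | Ada | 36 |
// | Bob | 41 |
```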
5605
- class TextContentSplitter {
5606
- constructor(options) {
5607
- this.options = options;
5706
+ function validatePort(portString) {
5707
+ const port = Number.parseInt(portString, 10);
5708
+ if (Number.isNaN(port) || port < 1 || port > 65535) {
5709
+ throw new Error("❌ Invalid port number");
5608
5710
  }
5609
- /**
5610
- * Splits text content into chunks while trying to preserve semantic boundaries.
5611
- * Prefers paragraph breaks, then line breaks, finally falling back to word boundaries.
5612
- */
5613
- async split(content) {
5614
- const trimmedContent = fullTrim(content);
5615
- if (trimmedContent.length <= this.options.chunkSize) {
5616
- return [trimmedContent];
5617
- }
5618
- const words = trimmedContent.split(/\s+/);
5619
- const longestWord = words.reduce(
5620
- (max, word) => word.length > max.length ? word : max
5621
- );
5622
- if (longestWord.length > this.options.chunkSize) {
5623
- throw new MinimumChunkSizeError(longestWord.length, this.options.chunkSize);
5624
- }
5625
- const paragraphChunks = this.splitByParagraphs(trimmedContent);
5626
- if (this.areChunksValid(paragraphChunks)) {
5627
- return paragraphChunks;
5711
+ return port;
5712
+ }
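`validatePort` accepts only integers in the range 1 to 65535 and throws before the server attempts to bind. Note that `Number.parseInt` ignores trailing non-digits, so a value like `"80x"` still passes:

```js
validatePort("6280"); // → 6280
validatePort("0");    // throws "❌ Invalid port number"
validatePort("80x");  // → 80 (Number.parseInt stops at the first non-digit)
```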
5713
+ async function createPipelineWithCallbacks(docService, options = {}) {
5714
+ logger.debug(`Initializing pipeline with options: ${JSON.stringify(options)}`);
5715
+ const { serverUrl, ...rest } = options;
5716
+ const pipeline = serverUrl ? await PipelineFactory.createPipeline(void 0, { serverUrl, ...rest }) : await (async () => {
5717
+ if (!docService) {
5718
+ throw new Error("Local pipeline requires a DocumentManagementService instance");
5628
5719
  }
5629
- const lineChunks = this.splitByLines(trimmedContent);
5630
- if (this.areChunksValid(lineChunks)) {
5631
- return this.mergeChunks(lineChunks, "\n");
5720
+ return PipelineFactory.createPipeline(docService, rest);
5721
+ })();
5722
+ pipeline.setCallbacks({
5723
+ onJobProgress: async (job, progress) => {
5724
+ logger.debug(
5725
+ `📊 Job ${job.id} progress: ${progress.pagesScraped}/${progress.totalPages} pages`
5726
+ );
5727
+ },
5728
+ onJobStatusChange: async (job) => {
5729
+ logger.debug(`🔄 Job ${job.id} status changed to: ${job.status}`);
5730
+ },
5731
+ onJobError: async (job, error, document) => {
5732
+ logger.warn(
5733
+ `⚠️ Job ${job.id} error ${document ? `on document ${document.metadata.url}` : ""}: ${error.message}`
5734
+ );
5632
5735
  }
5633
- const wordChunks = await this.splitByWords(trimmedContent);
5634
- return this.mergeChunks(wordChunks, " ");
5635
- }
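The removed `split` degrades gracefully, trying progressively finer boundaries and re-merging small fragments at each stage:

```js
// Fallback cascade of the removed TextContentSplitter.split():
// 1. content already fits        → [content]
// 2. splitByParagraphs(content)  → return if every chunk fits
// 3. splitByLines(content)       → mergeChunks(lines, "\n") if every chunk fits
// 4. splitByWords(content)       → mergeChunks(words, " ") as the last resort
```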
5636
- /**
5637
- * Checks if all chunks are within the maximum size limit
5638
- */
5639
- areChunksValid(chunks) {
5640
- return chunks.every((chunk) => chunk.length <= this.options.chunkSize);
5641
- }
5642
- /**
5643
- * Splits text into chunks by paragraph boundaries (double newlines)
5644
- */
5645
- splitByParagraphs(text) {
5646
- const paragraphs = text.split(/\n\s*\n/).map((p) => fullTrim(p)).filter(Boolean);
5647
- return paragraphs.filter((chunk) => chunk.length > 2);
5648
- }
5649
- /**
5650
- * Splits text into chunks by line boundaries
5651
- */
5652
- splitByLines(text) {
5653
- const lines = text.split(/\n/).map((line) => fullTrim(line)).filter(Boolean);
5654
- return lines.filter((chunk) => chunk.length > 1);
5655
- }
5656
- /**
5657
- * Uses LangChain's recursive splitter for word-based splitting as a last resort
5658
- */
5659
- async splitByWords(text) {
5660
- const splitter = new RecursiveCharacterTextSplitter({
5661
- chunkSize: this.options.chunkSize,
5662
- chunkOverlap: 0
5663
- });
5664
- const chunks = await splitter.splitText(text);
5665
- return chunks;
5666
- }
5667
- /**
5668
- * Attempts to merge small chunks with previous chunks to minimize fragmentation.
5669
- * Only merges if combined size is within maxChunkSize.
5670
- */
5671
- mergeChunks(chunks, separator) {
5672
- const mergedChunks = [];
5673
- let currentChunk = null;
5674
- for (const chunk of chunks) {
5675
- if (currentChunk === null) {
5676
- currentChunk = chunk;
5677
- continue;
5678
- }
5679
- const currentChunkSize = this.getChunkSize(currentChunk);
5680
- const nextChunkSize = this.getChunkSize(chunk);
5681
- if (currentChunkSize + nextChunkSize + separator.length <= this.options.chunkSize) {
5682
- currentChunk = `${currentChunk}${separator}${chunk}`;
5683
- } else {
5684
- mergedChunks.push(currentChunk);
5685
- currentChunk = chunk;
5736
+ });
5737
+ return pipeline;
5738
+ }
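`createPipelineWithCallbacks` branches on `serverUrl`: with it, work is delegated to an external worker and no local store is needed; without it, a local `DocumentManagementService` instance is mandatory. A hedged usage sketch (the URL is illustrative):

```js
// Remote pipeline: jobs are proxied to an external worker over HTTP.
const remote = await createPipelineWithCallbacks(undefined, {
  serverUrl: "http://localhost:8080/api",
});

// Local pipeline: requires an initialized DocumentManagementService.
const local = await createPipelineWithCallbacks(docService, {});
```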
5739
+ function createAppServerConfig(options) {
5740
+ return {
5741
+ enableWebInterface: options.enableWebInterface ?? false,
5742
+ enableMcpServer: options.enableMcpServer ?? true,
5743
+ enableApiServer: options.enableApiServer ?? false,
5744
+ enableWorker: options.enableWorker ?? true,
5745
+ port: options.port,
5746
+ externalWorkerUrl: options.externalWorkerUrl
5747
+ };
5748
+ }
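The defaults in `createAppServerConfig` bias toward an MCP-only deployment: the MCP endpoint and embedded worker are on, while the web interface and REST API stay off unless explicitly enabled:

```js
createAppServerConfig({ port: 8080 }); // port value illustrative
// → { enableWebInterface: false, enableMcpServer: true, enableApiServer: false,
//     enableWorker: true, port: 8080, externalWorkerUrl: undefined }
```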
5749
+ function parseHeaders(headerOptions) {
5750
+ const headers = {};
5751
+ if (Array.isArray(headerOptions)) {
5752
+ for (const entry of headerOptions) {
5753
+ const idx = entry.indexOf(":");
5754
+ if (idx > 0) {
5755
+ const name = entry.slice(0, idx).trim();
5756
+ const value = entry.slice(idx + 1).trim();
5757
+ if (name) headers[name] = value;
5686
5758
  }
5687
5759
  }
5688
- if (currentChunk) {
5689
- mergedChunks.push(currentChunk);
5690
- }
5691
- return mergedChunks;
5692
- }
5693
- getChunkSize(chunk) {
5694
- return chunk.length;
5695
- }
5696
- wrap(content) {
5697
- return content;
5698
5760
  }
5699
- }
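The added `parseHeaders` splits each entry on the first colon only, so header values that themselves contain colons (URLs, credential blobs) survive intact; entries with no colon, or with a colon in position zero, are silently dropped:

```js
parseHeaders(["Authorization: Bearer abc:def", "X-Empty:", "garbage", ":nameless"]);
// → { Authorization: "Bearer abc:def", "X-Empty": "" }
```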
5700
- class SemanticMarkdownSplitter {
5701
- constructor(preferredChunkSize, maxChunkSize) {
5702
- this.preferredChunkSize = preferredChunkSize;
5703
- this.maxChunkSize = maxChunkSize;
5704
- this.turndownService = new TurndownService({
5705
- headingStyle: "atx",
5706
- hr: "---",
5707
- bulletListMarker: "-",
5708
- codeBlockStyle: "fenced",
5709
- emDelimiter: "_",
5710
- strongDelimiter: "**",
5711
- linkStyle: "inlined"
5712
- });
5713
- this.turndownService.addRule("table", {
5714
- filter: ["table"],
5715
- replacement: (_content, node) => {
5716
- const table = node;
5717
- const headers = Array.from(table.querySelectorAll("th")).map(
5718
- (th) => th.textContent?.trim() || ""
5719
- );
5720
- const rows = Array.from(table.querySelectorAll("tr")).filter(
5721
- (tr) => !tr.querySelector("th")
5722
- );
5723
- if (headers.length === 0 && rows.length === 0) return "";
5724
- let markdown = "\n";
5725
- if (headers.length > 0) {
5726
- markdown += `| ${headers.join(" | ")} |
5727
- `;
5728
- markdown += `|${headers.map(() => "---").join("|")}|
5729
- `;
5730
- }
5731
- for (const row of rows) {
5732
- const cells = Array.from(row.querySelectorAll("td")).map(
5733
- (td) => td.textContent?.trim() || ""
5734
- );
5735
- markdown += `| ${cells.join(" | ")} |
5736
- `;
5737
- }
5738
- return markdown;
5739
- }
5740
- });
5741
- this.textSplitter = new TextContentSplitter({
5742
- chunkSize: this.preferredChunkSize
5743
- });
5744
- this.codeSplitter = new CodeContentSplitter({
5745
- chunkSize: this.maxChunkSize
5746
- });
5747
- this.tableSplitter = new TableContentSplitter({
5748
- chunkSize: this.maxChunkSize
5749
- });
5750
- }
5751
- turndownService;
5752
- textSplitter;
5753
- codeSplitter;
5754
- tableSplitter;
5755
- /**
5756
- * Main entry point for splitting markdown content
5757
- */
5758
- async splitText(markdown) {
5759
- const html = await this.markdownToHtml(markdown);
5760
- const dom = await this.parseHtml(html);
5761
- const sections = await this.splitIntoSections(dom);
5762
- return this.splitSectionContent(sections);
5763
- }
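The removed `splitText` is a four-stage pipeline: markdown to HTML via remark, HTML to a DOM via jsdom, DOM to heading-scoped sections, and sections to size-bounded chunks, with code blocks and tables routed to their dedicated splitters. A usage sketch (chunk sizes illustrative):

```js
const splitter = new SemanticMarkdownSplitter(1500, 4000); // preferred, max chunk size
const chunks = await splitter.splitText("# Title\n\nSome prose.");
// → [ { types: ["heading"], content: "# Title",     section: { level: 1, path: ["Title"] } },
//     { types: ["text"],    content: "Some prose.", section: { level: 1, path: ["Title"] } } ]
```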
5764
- /**
5765
- * Step 1: Split document into sections based on H1-H6 headings,
5766
- * as well as code blocks and tables.
5767
- */
5768
- async splitIntoSections(dom) {
5769
- const body = dom.querySelector("body");
5770
- if (!body) {
5771
- throw new Error("Invalid HTML structure: no body element found");
5772
- }
5773
- let currentSection = this.createRootSection();
5774
- const sections = [];
5775
- const stack = [currentSection];
5776
- for (const element of Array.from(body.children)) {
5777
- const headingMatch = element.tagName.match(/H([1-6])/);
5778
- if (headingMatch) {
5779
- const level = Number.parseInt(headingMatch[1], 10);
5780
- const title = fullTrim(element.textContent || "");
5781
- while (stack.length > 1 && stack[stack.length - 1].level >= level) {
5782
- stack.pop();
5783
- }
5784
- currentSection = {
5785
- level,
5786
- path: [
5787
- ...stack.slice(1).reduce((acc, s) => {
5788
- const lastPath = s.path[s.path.length - 1];
5789
- if (lastPath) acc.push(lastPath);
5790
- return acc;
5791
- }, []),
5792
- title
5793
- ],
5794
- content: [
5795
- {
5796
- type: "heading",
5797
- text: `${"#".repeat(level)} ${title}`
5798
- }
5799
- ]
5800
- };
5801
- sections.push(currentSection);
5802
- stack.push(currentSection);
5803
- } else if (element.tagName === "PRE") {
5804
- const code = element.querySelector("code");
5805
- const language = code?.className.replace("language-", "") || "";
5806
- const content = code?.textContent || element.textContent || "";
5807
- const markdown = `${"```"}${language}
5808
- ${content}
5809
- ${"```"}`;
5810
- currentSection = {
5811
- level: currentSection.level,
5812
- path: currentSection.path,
5813
- content: [
5814
- {
5815
- type: "code",
5816
- text: markdown
5817
- }
5818
- ]
5819
- };
5820
- sections.push(currentSection);
5821
- } else if (element.tagName === "TABLE") {
5822
- const markdown = fullTrim(this.turndownService.turndown(element.outerHTML));
5823
- currentSection = {
5824
- level: currentSection.level,
5825
- path: currentSection.path,
5826
- content: [
5827
- {
5828
- type: "table",
5829
- text: markdown
5830
- }
5831
- ]
5832
- };
5833
- sections.push(currentSection);
5834
- } else {
5835
- const markdown = fullTrim(this.turndownService.turndown(element.innerHTML));
5836
- if (markdown) {
5837
- currentSection = {
5838
- level: currentSection.level,
5839
- path: currentSection.path,
5840
- content: [
5841
- {
5842
- type: "text",
5843
- text: markdown
5844
- }
5845
- ]
5846
- };
5847
- sections.push(currentSection);
5848
- }
5849
- }
5850
- }
5851
- return sections;
5852
- }
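The heading stack above gives every section a breadcrumb-style `path`, popping back up whenever a heading of equal or shallower depth appears:

```js
// Input headings:   Resulting section paths:
// # Guide           ["Guide"]
// ## Install        ["Guide", "Install"]
// ### Linux         ["Guide", "Install", "Linux"]
// ## Usage          ["Guide", "Usage"]   (stack popped back past "Install" and "Linux")
```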
5853
- /**
5854
- * Step 2: Split section content into smaller chunks
5855
- */
5856
- async splitSectionContent(sections) {
5857
- const chunks = [];
5858
- for (const section of sections) {
5859
- for (const content of section.content) {
5860
- let splitContent = [];
5861
- try {
5862
- switch (content.type) {
5863
- case "heading":
5864
- case "text": {
5865
- splitContent = await this.textSplitter.split(content.text);
5866
- break;
5867
- }
5868
- case "code": {
5869
- splitContent = await this.codeSplitter.split(content.text);
5870
- break;
5871
- }
5872
- case "table": {
5873
- splitContent = await this.tableSplitter.split(content.text);
5874
- break;
5875
- }
5876
- }
5877
- } catch (err) {
5878
- if (err instanceof MinimumChunkSizeError) {
5879
- logger.warn(
5880
- `⚠ Cannot split ${content.type} chunk normally, using RecursiveCharacterTextSplitter: ${err.message}`
5881
- );
5882
- const splitter = new RecursiveCharacterTextSplitter({
5883
- chunkSize: this.maxChunkSize,
5884
- chunkOverlap: Math.min(20, Math.floor(this.maxChunkSize * 0.1)),
5885
- // Use more aggressive separators including empty string as last resort
5886
- separators: [
5887
- "\n\n",
5888
- "\n",
5889
- " ",
5890
- " ",
5891
- ".",
5892
- ",",
5893
- ";",
5894
- ":",
5895
- "-",
5896
- "(",
5897
- ")",
5898
- "[",
5899
- "]",
5900
- "{",
5901
- "}",
5902
- ""
5903
- ]
5904
- });
5905
- const chunks2 = await splitter.splitText(content.text);
5906
- if (chunks2.length === 0) {
5907
- splitContent = [content.text.substring(0, this.maxChunkSize)];
5908
- } else {
5909
- splitContent = chunks2;
5910
- }
5911
- } else {
5912
- const errMessage = err instanceof Error ? err.message : String(err);
5913
- throw new ContentSplitterError(
5914
- `Failed to split ${content.type} content: ${errMessage}`
5915
- );
5916
- }
5917
- }
5918
- chunks.push(
5919
- ...splitContent.map(
5920
- (text) => ({
5921
- types: [content.type],
5922
- content: text,
5923
- section: {
5924
- level: section.level,
5925
- path: section.path
5926
- }
5927
- })
5928
- )
5929
- );
5930
- }
5931
- }
5932
- return chunks;
5933
- }
5934
- /**
5935
- * Helper to create the root section
5936
- */
5937
- createRootSection() {
5938
- return {
5939
- level: 0,
5940
- path: [],
5941
- content: []
5942
- };
5943
- }
5944
- /**
5945
- * Convert markdown to HTML using remark
5946
- */
5947
- async markdownToHtml(markdown) {
5948
- const html = await unified().use(remarkParse).use(remarkGfm).use(remarkHtml).process(markdown);
5949
- return `<!DOCTYPE html>
5950
- <html>
5951
- <body>
5952
- ${String(html)}
5953
- </body>
5954
- </html>`;
5955
- }
5956
- /**
5957
- * Parse HTML
5958
- */
5959
- async parseHtml(html) {
5960
- const { window } = createJSDOM(html);
5961
- return window.document;
5962
- }
5963
- }
5964
- const CHILD_LIMIT = 5;
5965
- const SIBLING_LIMIT = 2;
5966
- class DocumentRetrieverService {
5967
- documentStore;
5968
- constructor(documentStore) {
5969
- this.documentStore = documentStore;
5970
- }
5971
- /**
5972
- * Collects all related chunk IDs for a given initial hit.
5973
- * Returns an object with url, hitId, relatedIds (Set), and score.
5974
- */
5975
- async getRelatedChunkIds(library, version2, doc, siblingLimit = SIBLING_LIMIT, childLimit = CHILD_LIMIT) {
5976
- const id = doc.id;
5977
- const url = doc.metadata.url;
5978
- const score = doc.metadata.score;
5979
- const relatedIds = /* @__PURE__ */ new Set();
5980
- relatedIds.add(id);
5981
- const parent = await this.documentStore.findParentChunk(library, version2, id);
5982
- if (parent) {
5983
- relatedIds.add(parent.id);
5984
- }
5985
- const precedingSiblings = await this.documentStore.findPrecedingSiblingChunks(
5986
- library,
5987
- version2,
5988
- id,
5989
- siblingLimit
5990
- );
5991
- for (const sib of precedingSiblings) {
5992
- relatedIds.add(sib.id);
5993
- }
5994
- const childChunks = await this.documentStore.findChildChunks(
5995
- library,
5996
- version2,
5997
- id,
5998
- childLimit
5999
- );
6000
- for (const child of childChunks) {
6001
- relatedIds.add(child.id);
6002
- }
6003
- const subsequentSiblings = await this.documentStore.findSubsequentSiblingChunks(
6004
- library,
6005
- version2,
6006
- id,
6007
- siblingLimit
6008
- );
6009
- for (const sib of subsequentSiblings) {
6010
- relatedIds.add(sib.id);
6011
- }
6012
- return { url, hitId: id, relatedIds, score };
6013
- }
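For each search hit, `getRelatedChunkIds` widens the context to the hit itself, its parent, up to `SIBLING_LIMIT` (2) siblings on each side, and up to `CHILD_LIMIT` (5) children, all deduplicated into one ID set:

```js
// Worst case per hit: 1 (hit) + 1 (parent) + 2 (preceding) + 2 (subsequent) + 5 (children)
// = 11 chunk IDs, later grouped by URL and fetched in a single query.
const { relatedIds } = await retriever.getRelatedChunkIds("react", "18.2.0", hit);
console.log(relatedIds.size); // ≤ 11  (retriever and hit are illustrative names)
```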
6014
- /**
6015
- * Groups related chunk info by URL, deduplicates IDs, and finds max score per URL.
6016
- */
6017
- groupAndPrepareFetch(relatedInfos) {
6018
- const urlMap = /* @__PURE__ */ new Map();
6019
- for (const info of relatedInfos) {
6020
- let entry = urlMap.get(info.url);
6021
- if (!entry) {
6022
- entry = { uniqueChunkIds: /* @__PURE__ */ new Set(), maxScore: info.score };
6023
- urlMap.set(info.url, entry);
6024
- }
6025
- for (const id of info.relatedIds) {
6026
- entry.uniqueChunkIds.add(id);
6027
- }
6028
- if (info.score > entry.maxScore) {
6029
- entry.maxScore = info.score;
6030
- }
6031
- }
6032
- return urlMap;
6033
- }
6034
- /**
6035
- * Finalizes the merged result for a URL group by fetching, sorting, and joining content.
6036
- */
6037
- async finalizeResult(library, version2, url, uniqueChunkIds, maxScore) {
6038
- const ids = Array.from(uniqueChunkIds);
6039
- const docs = await this.documentStore.findChunksByIds(library, version2, ids);
6040
- const content = docs.map((d) => d.pageContent).join("\n\n");
6041
- return {
6042
- url,
6043
- content,
6044
- score: maxScore
6045
- };
6046
- }
6047
- /**
6048
- * Searches for documents and expands the context around the matches.
6049
- * @param library The library name.
6052
- * @param version The library version (optional, defaults to searching documents without a version).
6053
- * @param query The search query.
6054
- * @param limit The optional limit for the initial search results.
6055
- * @returns An array of strings representing the aggregated content of the retrieved chunks.
6056
- */
6057
- async search(library, version2, query, limit) {
6058
- const normalizedVersion = (version2 ?? "").toLowerCase();
6059
- const initialResults = await this.documentStore.findByContent(
6060
- library,
6061
- normalizedVersion,
6062
- query,
6063
- limit ?? 10
6064
- );
6065
- const relatedInfos = await Promise.all(
6066
- initialResults.map(
6067
- (doc) => this.getRelatedChunkIds(library, normalizedVersion, doc)
6068
- )
6069
- );
6070
- const urlMap = this.groupAndPrepareFetch(relatedInfos);
6071
- const results = [];
6072
- for (const [url, { uniqueChunkIds, maxScore }] of urlMap.entries()) {
6073
- const result = await this.finalizeResult(
6074
- library,
6075
- normalizedVersion,
6076
- url,
6077
- uniqueChunkIds,
6078
- maxScore
6079
- );
6080
- results.push(result);
6081
- }
6082
- return results;
6083
- }
6084
- }
6085
- class StoreError extends Error {
6086
- constructor(message, cause) {
6087
- super(cause ? `${message} caused by ${cause}` : message);
6088
- this.cause = cause;
6089
- this.name = this.constructor.name;
6090
- const causeError = cause instanceof Error ? cause : cause ? new Error(String(cause)) : void 0;
6091
- if (causeError?.stack) {
6092
- this.stack = causeError.stack;
6093
- }
6094
- }
6095
- }
6096
- class DimensionError extends StoreError {
6097
- constructor(modelName, modelDimension, dbDimension) {
6098
- super(
6099
- `Model "${modelName}" produces ${modelDimension}-dimensional vectors, which exceeds the database's fixed dimension of ${dbDimension}. Please use a model with dimension ≤ ${dbDimension}.`
6100
- );
6101
- this.modelName = modelName;
6102
- this.modelDimension = modelDimension;
6103
- this.dbDimension = dbDimension;
6104
- }
6105
- }
6106
- class ConnectionError extends StoreError {
6107
- }
6108
- const MIGRATIONS_DIR = path.join(getProjectRoot(), "db", "migrations");
6109
- const MIGRATIONS_TABLE = "_schema_migrations";
6110
- function ensureMigrationsTable(db) {
6111
- db.exec(`
6112
- CREATE TABLE IF NOT EXISTS ${MIGRATIONS_TABLE} (
6113
- id TEXT PRIMARY KEY,
6114
- applied_at DATETIME DEFAULT CURRENT_TIMESTAMP
6115
- );
6116
- `);
6117
- }
6118
- function getAppliedMigrations(db) {
6119
- const stmt = db.prepare(`SELECT id FROM ${MIGRATIONS_TABLE}`);
6120
- const rows = stmt.all();
6121
- return new Set(rows.map((row) => row.id));
6122
- }
6123
- async function applyMigrations(db) {
6124
- try {
6125
- db.pragma("journal_mode = OFF");
6126
- db.pragma("synchronous = OFF");
6127
- db.pragma("mmap_size = 268435456");
6128
- db.pragma("cache_size = -64000");
6129
- db.pragma("temp_store = MEMORY");
6130
- logger.debug("Applied performance optimizations for migration");
6131
- } catch (_error) {
6132
- logger.warn("⚠️ Could not apply all performance optimizations for migration");
6133
- }
6134
- const overallTransaction = db.transaction(() => {
6135
- logger.debug("Checking database migrations...");
6136
- ensureMigrationsTable(db);
6137
- const appliedMigrations = getAppliedMigrations(db);
6138
- if (!fs$1.existsSync(MIGRATIONS_DIR)) {
6139
- throw new StoreError("Migrations directory not found");
6140
- }
6141
- const migrationFiles = fs$1.readdirSync(MIGRATIONS_DIR).filter((file) => file.endsWith(".sql")).sort();
6142
- const pendingMigrations = migrationFiles.filter(
6143
- (filename) => !appliedMigrations.has(filename)
6144
- );
6145
- if (pendingMigrations.length > 0) {
6146
- logger.info(`🔄 Applying ${pendingMigrations.length} database migration(s)...`);
6147
- }
6148
- let appliedCount = 0;
6149
- for (const filename of pendingMigrations) {
6150
- logger.debug(`Applying migration: ${filename}`);
6151
- const filePath = path.join(MIGRATIONS_DIR, filename);
6152
- const sql = fs$1.readFileSync(filePath, "utf8");
6153
- try {
6154
- db.exec(sql);
6155
- const insertStmt = db.prepare(`INSERT INTO ${MIGRATIONS_TABLE} (id) VALUES (?)`);
6156
- insertStmt.run(filename);
6157
- logger.debug(`✅ Applied migration: ${filename}`);
6158
- appliedCount++;
6159
- } catch (error) {
6160
- logger.error(`❌ Failed to apply migration: ${filename} - ${error}`);
6161
- throw new StoreError(`Migration failed: ${filename}`, error);
6162
- }
6163
- }
6164
- if (appliedCount > 0) {
6165
- logger.info(`✅ Successfully applied ${appliedCount} migration(s)`);
6166
- } else {
6167
- logger.debug("Database schema is up to date");
6168
- }
6169
- return appliedCount;
6170
- });
6171
- let retries = 0;
6172
- let appliedMigrationsCount = 0;
6173
- while (true) {
6174
- try {
6175
- appliedMigrationsCount = overallTransaction.immediate();
6176
- logger.debug("Database migrations completed successfully");
6177
- if (appliedMigrationsCount > 0) {
6178
- try {
6179
- logger.debug(
6180
- `Running VACUUM after applying ${appliedMigrationsCount} migration(s)...`
6181
- );
6182
- db.exec("VACUUM");
6183
- logger.debug("Database vacuum completed successfully");
6184
- } catch (error) {
6185
- logger.warn(`⚠️ Could not vacuum database after migrations: ${error}`);
6186
- }
6187
- } else {
6188
- logger.debug("Skipping VACUUM - no migrations were applied");
6189
- }
6190
- break;
6191
- } catch (error) {
6192
- if (error?.code === "SQLITE_BUSY" && retries < MIGRATION_MAX_RETRIES) {
6193
- retries++;
6194
- logger.warn(
6195
- `⚠️ Migrations busy (SQLITE_BUSY), retrying attempt ${retries}/${MIGRATION_MAX_RETRIES} in ${MIGRATION_RETRY_DELAY_MS}ms...`
6196
- );
6197
- await new Promise((resolve) => setTimeout(resolve, MIGRATION_RETRY_DELAY_MS));
6198
- } else {
6199
- if (error?.code === "SQLITE_BUSY") {
6200
- logger.error(
6201
- `❌ Migrations still busy after ${MIGRATION_MAX_RETRIES} retries. Giving up: ${error}`
6202
- );
6203
- }
6204
- if (error instanceof StoreError) {
6205
- throw error;
6206
- }
6207
- throw new StoreError("Failed during migration process", error);
6208
- }
6209
- }
6210
- }
6211
- try {
6212
- db.pragma("journal_mode = WAL");
6213
- db.pragma("wal_autocheckpoint = 1000");
6214
- db.pragma("busy_timeout = 30000");
6215
- db.pragma("foreign_keys = ON");
6216
- db.pragma("synchronous = NORMAL");
6217
- logger.debug(
6218
- "Applied production database configuration (WAL mode, autocheckpoint, foreign keys, busy timeout)"
6219
- );
6220
- } catch (_error) {
6221
- logger.warn("⚠️ Could not apply all production database settings");
6222
- }
6223
- }
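The removed migration runner follows the usual embedded-SQLite pattern: record applied files in `_schema_migrations`, apply the remaining `.sql` files in sorted order inside a single immediate transaction, retry on `SQLITE_BUSY`, and only afterwards restore the WAL-mode production pragmas. Its core loop, reduced to control flow:

```js
// Skeleton of the removed applyMigrations() body (helpers as defined above):
ensureMigrationsTable(db);                // CREATE TABLE IF NOT EXISTS _schema_migrations
const applied = getAppliedMigrations(db); // ids of already-applied files
const pending = fs.readdirSync(MIGRATIONS_DIR)
  .filter((f) => f.endsWith(".sql"))
  .sort()
  .filter((f) => !applied.has(f));
for (const file of pending) {
  db.exec(fs.readFileSync(path.join(MIGRATIONS_DIR, file), "utf8"));
  db.prepare("INSERT INTO _schema_migrations (id) VALUES (?)").run(file);
}
```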
6224
- class DocumentStore {
6225
- db;
6226
- embeddings;
6227
- dbDimension = VECTOR_DIMENSION;
6228
- modelDimension;
6229
- statements;
6230
- /**
6231
- * Calculates Reciprocal Rank Fusion score for a result
6232
- */
6233
- calculateRRF(vecRank, ftsRank, k = 60) {
6234
- let rrf = 0;
6235
- if (vecRank !== void 0) {
6236
- rrf += 1 / (k + vecRank);
6237
- }
6238
- if (ftsRank !== void 0) {
6239
- rrf += 1 / (k + ftsRank);
6240
- }
6241
- return rrf;
6242
- }
6243
- /**
6244
- * Assigns ranks to search results based on their scores
6245
- */
6246
- assignRanks(results) {
6247
- const vecRanks = /* @__PURE__ */ new Map();
6248
- const ftsRanks = /* @__PURE__ */ new Map();
6249
- results.filter((r) => r.vec_score !== void 0).sort((a, b) => (b.vec_score ?? 0) - (a.vec_score ?? 0)).forEach((result, index) => {
6250
- vecRanks.set(Number(result.id), index + 1);
6251
- });
6252
- results.filter((r) => r.fts_score !== void 0).sort((a, b) => (b.fts_score ?? 0) - (a.fts_score ?? 0)).forEach((result, index) => {
6253
- ftsRanks.set(Number(result.id), index + 1);
6254
- });
6255
- return results.map((result) => ({
6256
- ...result,
6257
- vec_rank: vecRanks.get(Number(result.id)),
6258
- fts_rank: ftsRanks.get(Number(result.id)),
6259
- rrf_score: this.calculateRRF(
6260
- vecRanks.get(Number(result.id)),
6261
- ftsRanks.get(Number(result.id))
6262
- )
6263
- }));
6264
- }
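`calculateRRF` is standard Reciprocal Rank Fusion with k = 60: a document scores the sum of 1/(k + rank) over whichever of the two rankings include it, so appearing in both lists beats a top rank in either one alone:

```js
// Ranked 1st by vector search and 3rd by full-text search:
//   1 / (60 + 1) + 1 / (60 + 3) ≈ 0.01639 + 0.01587 = 0.03226
// Ranked 1st by vector search only:
//   1 / (60 + 1) ≈ 0.01639
```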
6265
- constructor(dbPath) {
6266
- if (!dbPath) {
6267
- throw new StoreError("Missing required database path");
6268
- }
6269
- this.db = new Database(dbPath);
6270
- }
6271
- /**
6272
- * Sets up prepared statements for database queries
6273
- */
6274
- prepareStatements() {
6275
- const statements = {
6276
- getById: this.db.prepare("SELECT * FROM documents WHERE id = ?"),
6277
- insertDocument: this.db.prepare(
6278
- "INSERT INTO documents (library_id, version_id, url, content, metadata, sort_order, indexed_at) VALUES (?, ?, ?, ?, ?, ?, ?)"
6279
- ),
6280
- insertEmbedding: this.db.prepare(
6281
- "INSERT INTO documents_vec (rowid, library_id, version_id, embedding) VALUES (?, ?, ?, ?)"
6282
- ),
6283
- insertLibrary: this.db.prepare(
6284
- "INSERT INTO libraries (name) VALUES (?) ON CONFLICT(name) DO NOTHING"
6285
- ),
6286
- getLibraryIdByName: this.db.prepare(
6287
- "SELECT id FROM libraries WHERE name = ?"
6288
- ),
6289
- // New version-related statements
6290
- insertVersion: this.db.prepare(
6291
- "INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING"
6292
- ),
6293
- resolveVersionId: this.db.prepare(
6294
- "SELECT id FROM versions WHERE library_id = ? AND name IS ?"
6295
- ),
6296
- getVersionById: this.db.prepare("SELECT * FROM versions WHERE id = ?"),
6297
- queryVersionsByLibraryId: this.db.prepare(
6298
- "SELECT * FROM versions WHERE library_id = ? ORDER BY name"
6299
- ),
6300
- deleteLibraryDocuments: this.db.prepare(
6301
- `DELETE FROM documents
6302
- WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
6303
- AND version_id = (
6304
- SELECT v.id FROM versions v
6305
- WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
6306
- AND COALESCE(v.name, '') = COALESCE(?, '')
6307
- )`
6308
- ),
6309
- deleteDocuments: this.db.prepare(
6310
- `DELETE FROM documents
6311
- WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
6312
- AND version_id = (
6313
- SELECT v.id FROM versions v
6314
- WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
6315
- AND COALESCE(v.name, '') = COALESCE(?, '')
6316
- )`
6317
- ),
6318
- deleteDocumentsByUrl: this.db.prepare(
6319
- `DELETE FROM documents
6320
- WHERE url = ?
6321
- AND library_id = (SELECT id FROM libraries WHERE name = ?)
6322
- AND version_id = (
6323
- SELECT v.id FROM versions v
6324
- WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
6325
- AND COALESCE(v.name, '') = COALESCE(?, '')
6326
- )`
6327
- ),
6328
- getDocumentBySort: this.db.prepare(
6329
- `SELECT d.id
6330
- FROM documents d
6331
- JOIN versions v ON d.version_id = v.id
6332
- JOIN libraries l ON v.library_id = l.id
6333
- WHERE l.name = ?
6334
- AND COALESCE(v.name, '') = COALESCE(?, '')
6335
- LIMIT 1`
6336
- ),
6337
- queryVersions: this.db.prepare(
6338
- `SELECT DISTINCT v.name
6339
- FROM versions v
6340
- JOIN libraries l ON v.library_id = l.id
6341
- WHERE l.name = ?
6342
- ORDER BY v.name`
6343
- ),
6344
- checkExists: this.db.prepare(
6345
- `SELECT d.id FROM documents d
6346
- JOIN versions v ON d.version_id = v.id
6347
- JOIN libraries l ON v.library_id = l.id
6348
- WHERE l.name = ?
6349
- AND COALESCE(v.name, '') = COALESCE(?, '')
6350
- LIMIT 1`
6351
- ),
6352
- queryLibraryVersions: this.db.prepare(
6353
- `SELECT
6354
- l.name as library,
6355
- v.name as version,
6356
- COUNT(*) as documentCount,
6357
- COUNT(DISTINCT d.url) as uniqueUrlCount,
6358
- MIN(d.indexed_at) as indexedAt
6359
- FROM documents d
6360
- JOIN versions v ON d.version_id = v.id
6361
- JOIN libraries l ON v.library_id = l.id
6362
- GROUP BY l.name, v.name
6363
- ORDER BY l.name, v.name`
6364
- ),
6365
- getChildChunks: this.db.prepare(`
6366
- SELECT d.* FROM documents d
6367
- JOIN versions v ON d.version_id = v.id
6368
- JOIN libraries l ON v.library_id = l.id
6369
- WHERE l.name = ?
6370
- AND COALESCE(v.name, '') = COALESCE(?, '')
6371
- AND d.url = ?
6372
- AND json_array_length(json_extract(d.metadata, '$.path')) = ?
6373
- AND json_extract(d.metadata, '$.path') LIKE ? || '%'
6374
- AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
6375
- ORDER BY d.sort_order
6376
- LIMIT ?
6377
- `),
6378
- getPrecedingSiblings: this.db.prepare(`
6379
- SELECT d.* FROM documents d
6380
- JOIN versions v ON d.version_id = v.id
6381
- JOIN libraries l ON v.library_id = l.id
6382
- WHERE l.name = ?
6383
- AND COALESCE(v.name, '') = COALESCE(?, '')
6384
- AND d.url = ?
6385
- AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
6386
- AND json_extract(d.metadata, '$.path') = ?
6387
- ORDER BY d.sort_order DESC
6388
- LIMIT ?
6389
- `),
6390
- getSubsequentSiblings: this.db.prepare(`
6391
- SELECT d.* FROM documents d
6392
- JOIN versions v ON d.version_id = v.id
6393
- JOIN libraries l ON v.library_id = l.id
6394
- WHERE l.name = ?
6395
- AND COALESCE(v.name, '') = COALESCE(?, '')
6396
- AND d.url = ?
6397
- AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
6398
- AND json_extract(d.metadata, '$.path') = ?
6399
- ORDER BY d.sort_order
6400
- LIMIT ?
6401
- `),
6402
- getParentChunk: this.db.prepare(`
6403
- SELECT d.* FROM documents d
6404
- JOIN versions v ON d.version_id = v.id
6405
- JOIN libraries l ON v.library_id = l.id
6406
- WHERE l.name = ?
6407
- AND COALESCE(v.name, '') = COALESCE(?, '')
6408
- AND d.url = ?
6409
- AND json_extract(d.metadata, '$.path') = ?
6410
- AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
6411
- ORDER BY d.sort_order DESC
6412
- LIMIT 1
6413
- `),
6414
- // Status tracking statements
6415
- updateVersionStatus: this.db.prepare(
6416
- "UPDATE versions SET status = ?, error_message = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
6417
- ),
6418
- updateVersionProgress: this.db.prepare(
6419
- "UPDATE versions SET progress_pages = ?, progress_max_pages = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
6420
- ),
6421
- getVersionsByStatus: this.db.prepare(
6422
- "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.status IN (SELECT value FROM json_each(?))"
6423
- ),
6424
- getRunningVersions: this.db.prepare(
6425
- "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.status = 'running' ORDER BY v.started_at"
6426
- ),
6427
- getActiveVersions: this.db.prepare(
6428
- "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.status IN ('queued', 'running', 'updating') ORDER BY v.created_at"
6429
- ),
6430
- // Scraper options statements
6431
- updateVersionScraperOptions: this.db.prepare(
6432
- "UPDATE versions SET source_url = ?, scraper_options = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
6433
- ),
6434
- getVersionWithOptions: this.db.prepare(
6435
- "SELECT * FROM versions WHERE id = ?"
6436
- ),
6437
- getVersionsBySourceUrl: this.db.prepare(
6438
- "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.source_url = ? ORDER BY v.created_at DESC"
6439
- )
6440
- };
6441
- this.statements = statements;
6442
- }
6443
- /**
6444
- * Pads a vector to the fixed database dimension by appending zeros.
6445
- * Throws an error if the input vector is longer than the database dimension.
6446
- */
6447
- padVector(vector) {
6448
- if (vector.length > this.dbDimension) {
6449
- throw new Error(
6450
- `Vector dimension ${vector.length} exceeds database dimension ${this.dbDimension}`
6451
- );
6452
- }
6453
- if (vector.length === this.dbDimension) {
6454
- return vector;
6455
- }
6456
- return [...vector, ...new Array(this.dbDimension - vector.length).fill(0)];
6457
- }
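`padVector` zero-pads shorter embeddings to the fixed `VECTOR_DIMENSION` column width so models with different output sizes can share one vector table; anything longer is rejected:

```js
// Assuming a database dimension of 1536 (the actual constant lives elsewhere in the bundle):
padVector([0.1, 0.2]);      // → [0.1, 0.2, 0, 0, ...] with length 1536
padVector(new Array(2048)); // throws: exceeds the database dimension
```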
6458
- /**
6459
- * Initializes embeddings client using environment variables for configuration.
6460
- *
6461
- * The embedding model is configured using DOCS_MCP_EMBEDDING_MODEL environment variable.
6462
- * Format: "provider:model_name" (e.g., "google:text-embedding-004") or just "model_name"
6463
- * for OpenAI (default).
6464
- *
6465
- * Supported providers and their required environment variables:
6466
- * - openai: OPENAI_API_KEY (and optionally OPENAI_API_BASE, OPENAI_ORG_ID)
6467
- * - google: GOOGLE_APPLICATION_CREDENTIALS (path to service account JSON)
6468
- * - aws: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION (or BEDROCK_AWS_REGION)
6469
- * - microsoft: Azure OpenAI credentials (AZURE_OPENAI_API_*)
6470
- */
6471
- async initializeEmbeddings() {
6472
- const modelSpec = process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
6473
- const { createEmbeddingModel } = await import("./EmbeddingFactory-CElwVk3X.js");
6474
- this.embeddings = createEmbeddingModel(modelSpec);
6475
- const testVector = await this.embeddings.embedQuery("test");
6476
- this.modelDimension = testVector.length;
6477
- if (this.modelDimension > this.dbDimension) {
6478
- throw new DimensionError(modelSpec, this.modelDimension, this.dbDimension);
6479
- }
6480
- }
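Per the doc comment above, `DOCS_MCP_EMBEDDING_MODEL` accepts a bare OpenAI model name or a `provider:model` pair; the throwaway `embedQuery("test")` call doubles as a dimension probe against the fixed vector column:

```js
process.env.DOCS_MCP_EMBEDDING_MODEL = "text-embedding-3-small";    // OpenAI, the default provider
process.env.DOCS_MCP_EMBEDDING_MODEL = "google:text-embedding-004"; // provider-qualified form
// initializeEmbeddings() embeds "test" once, records the vector length, and
// throws DimensionError if it exceeds the database's fixed dimension.
```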
6481
- /**
6482
- * Escapes a query string for use with SQLite FTS5 MATCH operator.
6483
- * Wraps the query in double quotes and escapes internal double quotes.
6484
- */
6485
- escapeFtsQuery(query) {
6486
- const escapedQuotes = query.replace(/"/g, '""');
6487
- return `"${escapedQuotes}"`;
6488
- }
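`escapeFtsQuery` neutralizes FTS5 operator syntax by treating the entire query as a single quoted string, doubling any embedded quotes:

```js
escapeFtsQuery('say "hi" NOT this');
// → '"say ""hi"" NOT this"' (NOT is now a literal token, not an FTS5 operator)
```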
6489
- /**
6490
- * Initializes database connection and ensures readiness
6491
- */
6492
- async initialize() {
6493
- try {
6494
- sqliteVec.load(this.db);
6495
- applyMigrations(this.db);
6496
- this.prepareStatements();
6497
- await this.initializeEmbeddings();
6498
- } catch (error) {
6499
- if (error instanceof StoreError) {
6500
- throw error;
6501
- }
6502
- throw new ConnectionError("Failed to initialize database connection", error);
6503
- }
6504
- }
6505
- /**
6506
- * Gracefully closes database connections
6507
- */
6508
- async shutdown() {
6509
- this.db.close();
6510
- }
6511
- /**
6512
- * Resolves a library name and version string to library_id and version_id.
6513
- * Creates library and version records if they don't exist.
6514
- */
6515
- async resolveLibraryAndVersionIds(library, version2) {
6516
- const normalizedLibrary = library.toLowerCase();
6517
- const normalizedVersion = denormalizeVersionName(version2.toLowerCase());
6518
- this.statements.insertLibrary.run(normalizedLibrary);
6519
- const libraryIdRow = this.statements.getLibraryIdByName.get(normalizedLibrary);
6520
- if (!libraryIdRow || typeof libraryIdRow.id !== "number") {
6521
- throw new StoreError(`Failed to resolve library_id for library: ${library}`);
6522
- }
6523
- const libraryId = libraryIdRow.id;
6524
- this.statements.insertVersion.run(libraryId, normalizedVersion);
6525
- const versionIdRow = this.statements.resolveVersionId.get(
6526
- libraryId,
6527
- normalizedVersion
6528
- );
6529
- if (!versionIdRow || typeof versionIdRow.id !== "number") {
6530
- throw new StoreError(
6531
- `Failed to resolve version_id for library: ${library}, version: ${version2}`
6532
- );
6533
- }
6534
- return { libraryId, versionId: versionIdRow.id };
6535
- }
6536
- /**
6537
- * Retrieves all unique versions for a specific library
6538
- */
6539
- async queryUniqueVersions(library) {
6540
- try {
6541
- const rows = this.statements.queryVersions.all(library.toLowerCase());
6542
- return rows.map((row) => normalizeVersionName(row.name));
6543
- } catch (error) {
6544
- throw new ConnectionError("Failed to query versions", error);
6545
- }
6546
- }
6547
- /**
6548
- * Updates the status of a version record in the database.
6549
- * @param versionId The version ID to update
6550
- * @param status The new status to set
6551
- * @param errorMessage Optional error message for failed statuses
6552
- */
6553
- async updateVersionStatus(versionId, status, errorMessage) {
6554
- try {
6555
- this.statements.updateVersionStatus.run(status, errorMessage ?? null, versionId);
6556
- } catch (error) {
6557
- throw new StoreError(`Failed to update version status: ${error}`);
6558
- }
6559
- }
6560
- /**
6561
- * Updates the progress counters for a version being indexed.
6562
- * @param versionId The version ID to update
6563
- * @param pages Current number of pages processed
6564
- * @param maxPages Total number of pages to process
6565
- */
6566
- async updateVersionProgress(versionId, pages, maxPages) {
6567
- try {
6568
- this.statements.updateVersionProgress.run(pages, maxPages, versionId);
6569
- } catch (error) {
6570
- throw new StoreError(`Failed to update version progress: ${error}`);
6571
- }
6572
- }
6573
- /**
6574
- * Retrieves versions by their status.
6575
- * @param statuses Array of statuses to filter by
6576
- * @returns Array of version records matching the statuses
6577
- */
6578
- async getVersionsByStatus(statuses) {
6579
- try {
6580
- const statusJson = JSON.stringify(statuses);
6581
- const rows = this.statements.getVersionsByStatus.all(
6582
- statusJson
6583
- );
6584
- return rows;
6585
- } catch (error) {
6586
- throw new StoreError(`Failed to get versions by status: ${error}`);
6587
- }
6588
- }
6589
- /**
6590
- * Retrieves all versions currently in RUNNING status.
6591
- * @returns Array of running version records with library names
6592
- */
6593
- async getRunningVersions() {
6594
- try {
6595
- const rows = this.statements.getRunningVersions.all();
6596
- return rows;
6597
- } catch (error) {
6598
- throw new StoreError(`Failed to get running versions: ${error}`);
6599
- }
6600
- }
6601
- /**
6602
- * Retrieves all versions in active states (queued, running, updating).
6603
- * @returns Array of active version records with library names
6604
- */
6605
- async getActiveVersions() {
6606
- try {
6607
- const rows = this.statements.getActiveVersions.all();
6608
- return rows;
6609
- } catch (error) {
6610
- throw new StoreError(`Failed to get active versions: ${error}`);
6611
- }
6612
- }
6613
- /**
6614
- * Stores scraper options for a version to enable reproducible indexing.
6615
- * @param versionId The version ID to update
6616
- * @param options Complete scraper options used for indexing
6617
- */
6618
- async storeScraperOptions(versionId, options) {
6619
- try {
6620
- const { url: source_url, library, version: version2, signal, ...scraper_options } = options;
6621
- const optionsJson = JSON.stringify(scraper_options);
6622
- this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId);
6623
- } catch (error) {
6624
- throw new StoreError(`Failed to store scraper options: ${error}`);
6625
- }
6626
- }
6627
- /**
6628
- * Retrieves stored scraper options for a version.
6629
- * @param versionId The version ID to query
6630
- * @returns Stored scraper options or null if none stored
6631
- */
6632
- async getVersionScraperOptions(versionId) {
6633
- try {
6634
- const row = this.statements.getVersionWithOptions.get(versionId);
6635
- if (!row?.scraper_options) {
6636
- return null;
6637
- }
6638
- return JSON.parse(row.scraper_options);
6639
- } catch (error) {
6640
- throw new StoreError(`Failed to get version scraper options: ${error}`);
6641
- }
6642
- }
6643
- /**
6644
- * Retrieves a version record with all stored options.
6645
- * @param versionId The version ID to query
6646
- * @returns Complete version record or null if not found
6647
- */
6648
- async getVersionWithStoredOptions(versionId) {
6649
- try {
6650
- const row = this.statements.getVersionWithOptions.get(versionId);
6651
- return row || null;
6652
- } catch (error) {
6653
- throw new StoreError(`Failed to get version with stored options: ${error}`);
6654
- }
6655
- }
6656
- /**
6657
- * Finds versions that were indexed from the same source URL.
6658
- * Useful for finding similar configurations or detecting duplicates.
6659
- * @param url Source URL to search for
6660
- * @returns Array of versions with the same source URL
6661
- */
6662
- async findVersionsBySourceUrl(url) {
6663
- try {
6664
- const rows = this.statements.getVersionsBySourceUrl.all(
6665
- url
6666
- );
6667
- return rows;
6668
- } catch (error) {
6669
- throw new StoreError(`Failed to find versions by source URL: ${error}`);
6670
- }
6671
- }
6672
- /**
6673
- * Verifies existence of documents for a specific library version
6674
- */
6675
- async checkDocumentExists(library, version2) {
6676
- try {
6677
- const normalizedVersion = version2.toLowerCase();
6678
- const result = this.statements.checkExists.get(
6679
- library.toLowerCase(),
6680
- normalizedVersion
6681
- );
6682
- return result !== void 0;
6683
- } catch (error) {
6684
- throw new ConnectionError("Failed to check document existence", error);
6685
- }
6686
- }
6687
- /**
6688
- * Retrieves a mapping of all libraries to their available versions with details.
6689
- */
6690
- async queryLibraryVersions() {
6691
- try {
6692
- const rows = this.statements.queryLibraryVersions.all();
6693
- const libraryMap = /* @__PURE__ */ new Map();
6694
- for (const row of rows) {
6695
- const library = row.library;
6696
- if (!libraryMap.has(library)) {
6697
- libraryMap.set(library, []);
6698
- }
6699
- const indexedAtISO = row.indexedAt ? new Date(row.indexedAt).toISOString() : null;
6700
- libraryMap.get(library)?.push({
6701
- version: row.version,
6702
- documentCount: row.documentCount,
6703
- uniqueUrlCount: row.uniqueUrlCount,
6704
- indexedAt: indexedAtISO
6705
- });
6706
- }
6707
- for (const versions of libraryMap.values()) {
6708
- versions.sort((a, b) => {
6709
- if (a.version === "" && b.version !== "") {
6710
- return -1;
6711
- }
6712
- if (a.version !== "" && b.version === "") {
6713
- return 1;
6714
- }
6715
- if (a.version === "" && b.version === "") {
6716
- return 0;
6717
- }
6718
- try {
6719
- return semver__default.compare(a.version, b.version);
6720
- } catch (_error) {
6721
- return a.version.localeCompare(b.version);
6722
- }
6723
- });
6724
- }
6725
- return libraryMap;
6726
- } catch (error) {
6727
- throw new ConnectionError("Failed to query library versions", error);
6728
- }
6729
- }
6730
- /**
6731
- * Stores documents with library and version metadata, generating embeddings
6732
- * for vector similarity search. Automatically removes any existing documents
6733
- * for the same URLs before adding new ones to prevent UNIQUE constraint violations.
6734
- */
6735
- async addDocuments(library, version2, documents) {
6736
- try {
6737
- if (documents.length === 0) {
6738
- return;
6739
- }
6740
- const urls = /* @__PURE__ */ new Set();
6741
- for (const doc of documents) {
6742
- const url = doc.metadata.url;
6743
- if (!url || typeof url !== "string" || !url.trim()) {
6744
- throw new StoreError("Document metadata must include a valid URL");
6745
- }
6746
- urls.add(url);
6747
- }
6748
- const texts = documents.map((doc) => {
6749
- const header = `<title>${doc.metadata.title}</title>
6750
- <url>${doc.metadata.url}</url>
6751
- <path>${doc.metadata.path.join(" / ")}</path>
6752
- `;
6753
- return `${header}${doc.pageContent}`;
6754
- });
6755
- const rawEmbeddings = [];
6756
- for (let i = 0; i < texts.length; i += EMBEDDING_BATCH_SIZE) {
6757
- const batchTexts = texts.slice(i, i + EMBEDDING_BATCH_SIZE);
6758
- const batchEmbeddings = await this.embeddings.embedDocuments(batchTexts);
6759
- rawEmbeddings.push(...batchEmbeddings);
6760
- }
6761
- const paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
6762
- const { libraryId, versionId } = await this.resolveLibraryAndVersionIds(
6763
- library,
6764
- version2
6765
- );
6766
- for (const url of urls) {
6767
- const deletedCount = await this.deleteDocumentsByUrl(library, version2, url);
6768
- if (deletedCount > 0) {
6769
- logger.debug(`🗑️ Deleted ${deletedCount} existing documents for URL: ${url}`);
6770
- }
6771
- }
6772
- const transaction = this.db.transaction((docs) => {
6773
- for (let i = 0; i < docs.length; i++) {
6774
- const doc = docs[i];
6775
- const url = doc.metadata.url;
6776
- const result = this.statements.insertDocument.run(
6777
- BigInt(libraryId),
6778
- BigInt(versionId),
6779
- url,
6780
- doc.pageContent,
6781
- JSON.stringify(doc.metadata),
6782
- i,
6783
- (/* @__PURE__ */ new Date()).toISOString()
6784
- // Pass current timestamp for indexed_at
6785
- );
6786
- const rowId = result.lastInsertRowid;
6787
- this.statements.insertEmbedding.run(
6788
- BigInt(rowId),
6789
- BigInt(libraryId),
6790
- BigInt(versionId),
6791
- JSON.stringify(paddedEmbeddings[i])
6792
- );
6793
- }
6794
- });
6795
- transaction(documents);
6796
- } catch (error) {
6797
- throw new ConnectionError("Failed to add documents to store", error);
6798
- }
6799
- }
6800
- /**
6801
- * Removes documents matching specified library and version
6802
- * @returns Number of documents deleted
6803
- */
6804
- async deleteDocuments(library, version2) {
6805
- try {
6806
- const normalizedVersion = version2.toLowerCase();
6807
- const result = this.statements.deleteDocuments.run(
6808
- library.toLowerCase(),
6809
- library.toLowerCase(),
6810
- // library name appears twice in the query
6811
- normalizedVersion
6812
- );
6813
- return result.changes;
6814
- } catch (error) {
6815
- throw new ConnectionError("Failed to delete documents", error);
6816
- }
6817
- }
6818
- /**
6819
- * Removes documents for a specific URL within a library and version
6820
- * @returns Number of documents deleted
6821
- */
6822
- async deleteDocumentsByUrl(library, version2, url) {
6823
- try {
6824
- const normalizedVersion = version2.toLowerCase();
6825
- const result = this.statements.deleteDocumentsByUrl.run(
6826
- url,
6827
- library.toLowerCase(),
6828
- library.toLowerCase(),
6829
- // library name appears twice in the query
6830
- normalizedVersion
6831
- );
6832
- return result.changes;
6833
- } catch (error) {
6834
- throw new ConnectionError("Failed to delete documents by URL", error);
6835
- }
6836
- }
6837
- /**
6838
- * Retrieves a document by its ID.
6839
- * @param id The ID of the document.
6840
- * @returns The document, or null if not found.
6841
- */
6842
- async getById(id) {
6843
- try {
6844
- const row = this.statements.getById.get(BigInt(id));
6845
- if (!row) {
6846
- return null;
6847
- }
6848
- return mapDbDocumentToDocument(row);
6849
- } catch (error) {
6850
- throw new ConnectionError(`Failed to get document by ID ${id}`, error);
6851
- }
6852
- }
6853
- /**
6854
- * Finds documents matching a text query using hybrid search.
6855
- * Combines vector similarity search with full-text search using Reciprocal Rank Fusion.
6856
- */
6857
- async findByContent(library, version2, query, limit) {
6858
- try {
6859
- const rawEmbedding = await this.embeddings.embedQuery(query);
6860
- const embedding = this.padVector(rawEmbedding);
6861
- const ftsQuery = this.escapeFtsQuery(query);
6862
- const normalizedVersion = version2.toLowerCase();
6863
- const stmt = this.db.prepare(`
6864
- WITH vec_distances AS (
6865
- SELECT
6866
- dv.rowid as id,
6867
- dv.distance as vec_distance
6868
- FROM documents_vec dv
6869
- JOIN versions v ON dv.version_id = v.id
6870
- JOIN libraries l ON v.library_id = l.id
6871
- WHERE l.name = ?
6872
- AND COALESCE(v.name, '') = COALESCE(?, '')
6873
- AND dv.embedding MATCH ?
6874
- AND dv.k = ?
6875
- ORDER BY dv.distance
6876
- ),
6877
- fts_scores AS (
6878
- SELECT
6879
- f.rowid as id,
6880
- bm25(documents_fts, 10.0, 1.0, 5.0, 1.0) as fts_score
6881
- FROM documents_fts f
6882
- JOIN documents d ON f.rowid = d.id
6883
- JOIN versions v ON d.version_id = v.id
6884
- JOIN libraries l ON v.library_id = l.id
6885
- WHERE l.name = ?
6886
- AND COALESCE(v.name, '') = COALESCE(?, '')
6887
- AND documents_fts MATCH ?
6888
- ORDER BY fts_score
6889
- LIMIT ?
6890
- )
6891
- SELECT
6892
- d.id,
6893
- d.content,
6894
- d.metadata,
6895
- COALESCE(1 / (1 + v.vec_distance), 0) as vec_score,
6896
- COALESCE(-MIN(f.fts_score, 0), 0) as fts_score
6897
- FROM documents d
6898
- LEFT JOIN vec_distances v ON d.id = v.id
6899
- LEFT JOIN fts_scores f ON d.id = f.id
6900
- WHERE v.id IS NOT NULL OR f.id IS NOT NULL
6901
- `);
6902
- const rawResults = stmt.all(
6903
- library.toLowerCase(),
6904
- normalizedVersion,
6905
- JSON.stringify(embedding),
6906
- limit,
6907
- library.toLowerCase(),
6908
- normalizedVersion,
6909
- ftsQuery,
6910
- // Use the escaped query
6911
- limit
6912
- );
6913
- const rankedResults = this.assignRanks(rawResults);
6914
- const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
6915
- return topResults.map((row) => ({
6916
- ...mapDbDocumentToDocument(row),
6917
- metadata: {
6918
- ...JSON.parse(row.metadata),
6919
- id: row.id,
6920
- score: row.rrf_score,
6921
- vec_rank: row.vec_rank,
6922
- fts_rank: row.fts_rank
6923
- }
6924
- }));
6925
- } catch (error) {
6926
- throw new ConnectionError(
6927
- `Failed to find documents by content with query "${query}"`,
6928
- error
6929
- );
6930
- }
6931
- }
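The removed `findByContent` runs both retrievals in one statement: a KNN scan over `documents_vec` (sqlite-vec `MATCH` with a `k` bound) and a BM25 scan over `documents_fts`, outer-joined so a row survives if either ranking found it. Vector distances become similarities via 1/(1 + d), BM25's negative-is-better scores are flipped positive, and the RRF pass above orders the merged set. Condensed:

```js
// Conceptual flow of the removed findByContent():
const embedding = padVector(await embeddings.embedQuery(query)); // query → padded vector
// SQL: vec_distances CTE → top-k rows by vector distance
//      fts_scores CTE    → top rows by bm25(documents_fts, 10.0, 1.0, 5.0, 1.0)
//      LEFT JOINs        → keep rows present in either ranking
const ranked = assignRanks(rawResults); // per-list ranks, then Reciprocal Rank Fusion
return ranked.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
```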
6932
- /**
6933
- * Finds child chunks of a given document based on path hierarchy.
6934
- */
6935
- async findChildChunks(library, version2, id, limit) {
6936
- try {
6937
- const parent = await this.getById(id);
6938
- if (!parent) {
6939
- return [];
6940
- }
6941
- const parentPath = parent.metadata.path ?? [];
6942
- const parentUrl = parent.metadata.url;
6943
- const normalizedVersion = version2.toLowerCase();
6944
- const result = this.statements.getChildChunks.all(
6945
- library.toLowerCase(),
6946
- normalizedVersion,
6947
- parentUrl,
6948
- parentPath.length + 1,
6949
- JSON.stringify(parentPath),
6950
- BigInt(id),
6951
- limit
6952
- );
6953
- return result.map((row) => mapDbDocumentToDocument(row));
6954
- } catch (error) {
6955
- throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
6956
- }
6957
- }
6958
- /**
6959
- * Finds preceding sibling chunks of a given document.
6960
- */
6961
- async findPrecedingSiblingChunks(library, version2, id, limit) {
6962
- try {
6963
- const reference = await this.getById(id);
6964
- if (!reference) {
6965
- return [];
6966
- }
6967
- const refMetadata = reference.metadata;
6968
- const normalizedVersion = version2.toLowerCase();
6969
- const result = this.statements.getPrecedingSiblings.all(
6970
- library.toLowerCase(),
6971
- normalizedVersion,
6972
- refMetadata.url,
6973
- BigInt(id),
6974
- JSON.stringify(refMetadata.path),
6975
- limit
6976
- );
6977
- return result.reverse().map((row) => mapDbDocumentToDocument(row));
6978
- } catch (error) {
6979
- throw new ConnectionError(
6980
- `Failed to find preceding sibling chunks for ID ${id}`,
6981
- error
6982
- );
6983
- }
6984
- }
6985
- /**
6986
- * Finds subsequent sibling chunks of a given document.
- */
- async findSubsequentSiblingChunks(library, version2, id, limit) {
- try {
- const reference = await this.getById(id);
- if (!reference) {
- return [];
- }
- const refMetadata = reference.metadata;
- const normalizedVersion = version2.toLowerCase();
- const result = this.statements.getSubsequentSiblings.all(
- library.toLowerCase(),
- normalizedVersion,
- refMetadata.url,
- BigInt(id),
- JSON.stringify(refMetadata.path),
- limit
- );
- return result.map((row) => mapDbDocumentToDocument(row));
- } catch (error) {
- throw new ConnectionError(
- `Failed to find subsequent sibling chunks for ID ${id}`,
- error
- );
- }
- }
- /**
- * Finds the parent chunk of a given document.
- */
- async findParentChunk(library, version2, id) {
- try {
- const child = await this.getById(id);
- if (!child) {
- return null;
- }
- const childMetadata = child.metadata;
- const path2 = childMetadata.path ?? [];
- const parentPath = path2.slice(0, -1);
- if (parentPath.length === 0) {
- return null;
- }
- const normalizedVersion = version2.toLowerCase();
- const result = this.statements.getParentChunk.get(
- library.toLowerCase(),
- normalizedVersion,
- childMetadata.url,
- JSON.stringify(parentPath),
- BigInt(id)
- );
- if (!result) {
- return null;
- }
- return mapDbDocumentToDocument(result);
- } catch (error) {
- throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error);
- }
- }
- /**
- * Fetches multiple documents by their IDs in a single call.
- * Returns an array of Document objects, sorted by their sort_order.
- */
- async findChunksByIds(library, version2, ids) {
- if (!ids.length) return [];
- try {
- const normalizedVersion = version2.toLowerCase();
- const placeholders = ids.map(() => "?").join(",");
- const stmt = this.db.prepare(
- `SELECT d.* FROM documents d
- JOIN libraries l ON d.library_id = l.id
- JOIN versions v ON d.version_id = v.id
- WHERE l.name = ?
- AND COALESCE(v.name, '') = COALESCE(?, '')
- AND d.id IN (${placeholders})
- ORDER BY d.sort_order`
- );
- const rows = stmt.all(
- library.toLowerCase(),
- normalizedVersion,
- ...ids
- );
- return rows.map((row) => mapDbDocumentToDocument(row));
- } catch (error) {
- throw new ConnectionError("Failed to fetch documents by IDs", error);
- }
- }
- }
- class DocumentManagementService {
- store;
- documentRetriever;
- splitter;
- /**
- * Normalizes a version string, converting null or undefined to an empty string
- * and converting to lowercase.
- */
- normalizeVersion(version2) {
- return (version2 ?? "").toLowerCase();
- }
- constructor() {
- let dbPath;
- let dbDir;
- const envStorePath = process.env.DOCS_MCP_STORE_PATH;
- if (envStorePath) {
- dbDir = envStorePath;
- dbPath = path.join(dbDir, "documents.db");
- logger.debug(`💾 Using database directory from DOCS_MCP_STORE_PATH: ${dbDir}`);
- } else {
- const projectRoot2 = getProjectRoot();
- const oldDbDir = path.join(projectRoot2, ".store");
- const oldDbPath = path.join(oldDbDir, "documents.db");
- const oldDbExists = fs$1.existsSync(oldDbPath);
- if (oldDbExists) {
- dbPath = oldDbPath;
- dbDir = oldDbDir;
- logger.debug(`💾 Using legacy database path: ${dbPath}`);
- } else {
- const standardPaths = envPaths("docs-mcp-server", { suffix: "" });
- dbDir = standardPaths.data;
- dbPath = path.join(dbDir, "documents.db");
- logger.debug(`💾 Using standard database directory: ${dbDir}`);
- }
- }
- try {
- fs$1.mkdirSync(dbDir, { recursive: true });
- } catch (error) {
- logger.error(`⚠️ Failed to create database directory ${dbDir}: ${error}`);
- }
- this.store = new DocumentStore(dbPath);
- this.documentRetriever = new DocumentRetrieverService(this.store);
- const semanticSplitter = new SemanticMarkdownSplitter(
- SPLITTER_PREFERRED_CHUNK_SIZE,
- SPLITTER_MAX_CHUNK_SIZE
- );
- const greedySplitter = new GreedySplitter(
- semanticSplitter,
- SPLITTER_MIN_CHUNK_SIZE,
- SPLITTER_PREFERRED_CHUNK_SIZE
- );
- this.splitter = greedySplitter;
- }
- /**
- * Initializes the underlying document store.
- */
- async initialize() {
- await this.store.initialize();
- }
- /**
- * Shuts down the underlying document store.
- */
- async shutdown() {
- logger.debug("Shutting down store manager");
- await this.store.shutdown();
- }
- // Status tracking methods for pipeline integration
- /**
- * Gets versions by their current status.
- */
- async getVersionsByStatus(statuses) {
- return this.store.getVersionsByStatus(statuses);
- }
- /**
- * Gets all versions currently in RUNNING status.
- */
- async getRunningVersions() {
- return this.store.getRunningVersions();
- }
- /**
- * Updates the status of a version.
- */
- async updateVersionStatus(versionId, status, errorMessage) {
- return this.store.updateVersionStatus(versionId, status, errorMessage);
- }
- /**
- * Updates the progress of a version being indexed.
- */
- async updateVersionProgress(versionId, pages, maxPages) {
- return this.store.updateVersionProgress(versionId, pages, maxPages);
- }
- /**
- * Stores scraper options for a version to enable reproducible indexing.
- */
- async storeScraperOptions(versionId, options) {
- return this.store.storeScraperOptions(versionId, options);
- }
- /**
- * Retrieves stored scraper options for a version.
- */
- async getVersionScraperOptions(versionId) {
- return this.store.getVersionScraperOptions(versionId);
- }
- /**
- * Retrieves a version record with all stored options.
- */
- async getVersionWithStoredOptions(versionId) {
- return this.store.getVersionWithStoredOptions(versionId);
- }
- /**
- * Finds versions that were indexed from the same source URL.
- */
- async findVersionsBySourceUrl(url) {
- return this.store.findVersionsBySourceUrl(url);
- }
- /**
- * Validates if a library exists in the store (either versioned or unversioned).
- * Throws LibraryNotFoundError with suggestions if the library is not found.
- * @param library The name of the library to validate.
- * @throws {LibraryNotFoundError} If the library does not exist.
- */
- async validateLibraryExists(library) {
- logger.info(`🔎 Validating existence of library: ${library}`);
- const normalizedLibrary = library.toLowerCase();
- const versions = await this.listVersions(normalizedLibrary);
- const hasUnversioned = await this.exists(normalizedLibrary, "");
- if (versions.length === 0 && !hasUnversioned) {
- logger.warn(`⚠️ Library '${library}' not found.`);
- const allLibraries = await this.listLibraries();
- const libraryNames = allLibraries.map((lib) => lib.library);
- let suggestions = [];
- if (libraryNames.length > 0) {
- const fuse = new Fuse(libraryNames, {
- // Configure fuse.js options if needed (e.g., threshold)
- // isCaseSensitive: false, // Handled by normalizing library names
- // includeScore: true,
- threshold: 0.4
- // Adjust threshold for desired fuzziness (0=exact, 1=match anything)
- });
- const results = fuse.search(normalizedLibrary);
- suggestions = results.slice(0, 3).map((result) => result.item);
- logger.info(`🔍 Found suggestions: ${suggestions.join(", ")}`);
- }
- throw new LibraryNotFoundError(library, suggestions);
- }
- logger.info(`✅ Library '${library}' confirmed to exist.`);
- }
- /**
- * Returns a list of all available semantic versions for a library.
- */
- async listVersions(library) {
- const versions = await this.store.queryUniqueVersions(library);
- return versions.filter((v) => semver__default.valid(v));
- }
- /**
- * Checks if documents exist for a given library and optional version.
- * If version is omitted, checks for documents without a specific version.
- */
- async exists(library, version2) {
- const normalizedVersion = this.normalizeVersion(version2);
- return this.store.checkDocumentExists(library, normalizedVersion);
- }
- /**
- * Finds the most appropriate version of documentation based on the requested version.
- * When no target version is specified, returns the latest version.
- *
- * Version matching behavior:
- * - Exact versions (e.g., "18.0.0"): Matches that version or any earlier version
- * - X-Range patterns (e.g., "5.x", "5.2.x"): Matches within the specified range
- * - "latest" or no version: Returns the latest available version
- *
- * For documentation, we prefer matching older versions over no match at all,
- * since older docs are often still relevant and useful.
- * Also checks if unversioned documents exist for the library.
- */
- async findBestVersion(library, targetVersion) {
- const libraryAndVersion = `${library}${targetVersion ? `@${targetVersion}` : ""}`;
- logger.info(`🔍 Finding best version for ${libraryAndVersion}`);
- const hasUnversioned = await this.store.checkDocumentExists(library, "");
- const versionStrings = await this.listVersions(library);
- if (versionStrings.length === 0) {
- if (hasUnversioned) {
- logger.info(`ℹ️ Unversioned documents exist for ${library}`);
- return { bestMatch: null, hasUnversioned: true };
- }
- logger.warn(`⚠️ No valid versions found for ${library}`);
- const allLibraryDetails = await this.store.queryLibraryVersions();
- const libraryDetails = allLibraryDetails.get(library) ?? [];
- throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
- }
- let bestMatch = null;
- if (!targetVersion || targetVersion === "latest") {
- bestMatch = semver__default.maxSatisfying(versionStrings, "*");
- } else {
- const versionRegex = /^(\d+)(?:\.(?:x(?:\.x)?|\d+(?:\.(?:x|\d+))?))?$|^$/;
- if (!versionRegex.test(targetVersion)) {
- logger.warn(`⚠️ Invalid target version format: ${targetVersion}`);
- } else {
- let range = targetVersion;
- if (!semver__default.validRange(targetVersion)) {
- range = `~${targetVersion}`;
- } else if (semver__default.valid(targetVersion)) {
- range = `${range} || <=${targetVersion}`;
- }
- bestMatch = semver__default.maxSatisfying(versionStrings, range);
- }
- }
- if (bestMatch) {
- logger.info(`✅ Found best match version ${bestMatch} for ${libraryAndVersion}`);
- } else {
- logger.warn(`⚠️ No matching semver version found for ${libraryAndVersion}`);
- }
- if (!bestMatch && !hasUnversioned) {
- const allLibraryDetails = await this.store.queryLibraryVersions();
- const libraryDetails = allLibraryDetails.get(library) ?? [];
- throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
- }
- return { bestMatch, hasUnversioned };
- }
- /**
- * Removes all documents for a specific library and optional version.
- * If version is omitted, removes documents without a specific version.
- */
- async removeAllDocuments(library, version2) {
- const normalizedVersion = this.normalizeVersion(version2);
- logger.info(
- `🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
- );
- const count = await this.store.deleteDocuments(library, normalizedVersion);
- logger.info(`📊 Deleted ${count} documents`);
- }
- /**
- * Adds a document to the store, splitting it into smaller chunks for better search results.
- * Uses SemanticMarkdownSplitter to maintain markdown structure and content types during splitting.
- * Preserves hierarchical structure of documents and distinguishes between text and code segments.
- * If version is omitted, the document is added without a specific version.
- */
- async addDocument(library, version2, document) {
- const normalizedVersion = this.normalizeVersion(version2);
- const url = document.metadata.url;
- if (!url || typeof url !== "string" || !url.trim()) {
- throw new StoreError("Document metadata must include a valid URL");
- }
- logger.info(`📚 Adding document: ${document.metadata.title}`);
- if (!document.pageContent.trim()) {
- throw new Error("Document content cannot be empty");
- }
- const chunks = await this.splitter.splitText(document.pageContent);
- const splitDocs = chunks.map((chunk) => ({
- pageContent: chunk.content,
- metadata: {
- ...document.metadata,
- level: chunk.section.level,
- path: chunk.section.path
- }
- }));
- logger.info(`✂️ Split document into ${splitDocs.length} chunks`);
- await this.store.addDocuments(library, normalizedVersion, splitDocs);
- }
- /**
- * Searches for documentation content across versions.
- * Uses hybrid search (vector + FTS).
- * If version is omitted, searches documents without a specific version.
- */
- async searchStore(library, version2, query, limit = 5) {
- const normalizedVersion = this.normalizeVersion(version2);
- return this.documentRetriever.search(library, normalizedVersion, query, limit);
- }
- async listLibraries() {
- const libraryMap = await this.store.queryLibraryVersions();
- return Array.from(libraryMap.entries()).map(([library, versions]) => ({
- library,
- versions
- // The versions array already contains LibraryVersionDetails
- }));
- }
- /**
- * Gets all versions in active states (queued, running, updating).
- */
- async getActiveVersions() {
- return this.store.getActiveVersions();
- }
- /**
- * Ensures a library and version exist in the database and returns the version ID.
- * Creates the library and version records if they don't exist.
- */
- async ensureLibraryAndVersion(library, version2) {
- const normalizedLibrary = library.toLowerCase();
- const normalizedVersion = this.normalizeVersion(version2);
- const { versionId } = await this.store.resolveLibraryAndVersionIds(
- normalizedLibrary,
- normalizedVersion
- );
- return versionId;
- }
- }
- function ensurePlaywrightBrowsersInstalled() {
- const chromiumEnvPath = process.env.PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH;
- if (chromiumEnvPath && existsSync(chromiumEnvPath)) {
- logger.debug(
- `PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH is set to '${chromiumEnvPath}', skipping Playwright browser install.`
- );
- return;
- }
- try {
- const chromiumPath = chromium.executablePath();
- if (!chromiumPath || !existsSync(chromiumPath)) {
- throw new Error("Playwright Chromium browser not found");
- }
- } catch (_err) {
- logger.debug(
- "Playwright browsers not found. Installing Chromium browser for dynamic scraping (this may take a minute)..."
- );
- try {
- logger.debug("Installing Playwright Chromium browser...");
- execSync("npm exec -y playwright install --no-shell --with-deps chromium", {
- stdio: "ignore",
- // Suppress output
- cwd: getProjectRoot()
- });
- } catch (_installErr) {
- console.error(
- "❌ Failed to install Playwright browsers automatically. Please run:\n npx playwright install --no-shell --with-deps chromium\nand try again."
- );
- process.exit(1);
- }
- }
- }
- function resolveProtocol(protocol) {
- if (protocol === "auto") {
- if (!process.stdin.isTTY && !process.stdout.isTTY) {
- return "stdio";
- }
- return "http";
- }
- if (protocol === "stdio" || protocol === "http") {
- return protocol;
- }
- throw new Error(`Invalid protocol: ${protocol}. Must be 'auto', 'stdio', or 'http'`);
- }
- const formatOutput = (data) => JSON.stringify(data, null, 2);
- function setupLogging(options, protocol) {
- if (options.silent) {
- setLogLevel(LogLevel.ERROR);
- } else if (options.verbose) {
- setLogLevel(LogLevel.DEBUG);
- }
- if (protocol === "stdio") {
- setLogLevel(LogLevel.ERROR);
- }
- }
- function validatePort(portString) {
- const port = Number.parseInt(portString, 10);
- if (Number.isNaN(port) || port < 1 || port > 65535) {
- throw new Error("❌ Invalid port number");
- }
- return port;
- }
- async function initializeDocumentService() {
- const docService = new DocumentManagementService();
- await docService.initialize();
- return docService;
- }
- async function initializePipeline(docService, options = {}) {
- logger.debug(`Initializing PipelineManager with options: ${JSON.stringify(options)}`);
- const manager = await PipelineFactory.createPipeline(docService, options);
- manager.setCallbacks({
- onJobProgress: async (job, progress) => {
- logger.debug(
- `📊 Job ${job.id} progress: ${progress.pagesScraped}/${progress.totalPages} pages`
- );
- },
- onJobStatusChange: async (job) => {
- logger.debug(`🔄 Job ${job.id} status changed to: ${job.status}`);
- },
- onJobError: async (job, error, document) => {
- logger.warn(
- `⚠️ Job ${job.id} error ${document ? `on document ${document.metadata.url}` : ""}: ${error.message}`
- );
- }
- });
- return manager;
- }
- function createAppServerConfig(options) {
- return {
- enableWebInterface: options.enableWebInterface ?? false,
- enableMcpServer: options.enableMcpServer ?? true,
- enablePipelineApi: options.enablePipelineApi ?? false,
- enableWorker: options.enableWorker ?? true,
- port: options.port,
- externalWorkerUrl: options.externalWorkerUrl
- };
- }
- function parseHeaders(headerOptions) {
- const headers = {};
- if (Array.isArray(headerOptions)) {
- for (const entry of headerOptions) {
- const idx = entry.indexOf(":");
- if (idx > 0) {
- const name = entry.slice(0, idx).trim();
- const value = entry.slice(idx + 1).trim();
- if (name) headers[name] = value;
- }
- }
- }
- return headers;
+ return headers;
  }
  const CLI_DEFAULTS = {
  PROTOCOL: DEFAULT_PROTOCOL,
@@ -7483,11 +5767,17 @@ const CLI_DEFAULTS = {
  MAX_CONCURRENCY: DEFAULT_MAX_CONCURRENCY
  };
  function createDefaultAction(program) {
- return program.option(
- "--protocol <type>",
- "Protocol for MCP server: 'auto' (default), 'stdio', or 'http'",
- "auto"
- ).option("--port <number>", "Port for the server", CLI_DEFAULTS.HTTP_PORT.toString()).option("--resume", "Resume interrupted jobs on startup", false).action(
+ return program.addOption(
+ new Option("--protocol <protocol>", "Protocol for MCP server").choices(["auto", "stdio", "http"]).default("auto")
+ ).addOption(
+ new Option("--port <number>", "Port for the server").argParser((v) => {
+ const n = Number(v);
+ if (!Number.isInteger(n) || n < 1 || n > 65535) {
+ throw new Error("Port must be an integer between 1 and 65535");
+ }
+ return String(n);
+ }).default(CLI_DEFAULTS.HTTP_PORT.toString())
+ ).option("--resume", "Resume interrupted jobs on startup", false).option("--no-resume", "Do not resume jobs on startup").action(
  async (options, command) => {
  const globalOptions = command.opts();
  const resolvedProtocol = resolveProtocol(options.protocol);
@@ -7495,13 +5785,13 @@ function createDefaultAction(program) {
  logger.debug("No subcommand specified, starting unified server by default...");
  const port = validatePort(options.port);
  ensurePlaywrightBrowsersInstalled();
- const docService = await initializeDocumentService();
+ const docService = await createLocalDocumentManagement();
  const pipelineOptions = {
  recoverJobs: options.resume || false,
  // Use --resume flag for job recovery
  concurrency: 3
  };
- const pipeline = await initializePipeline(docService, pipelineOptions);
+ const pipeline = await createPipelineWithCallbacks(docService, pipelineOptions);
  if (resolvedProtocol === "stdio") {
  logger.debug(`🔍 Auto-detected stdio protocol (no TTY)`);
  await pipeline.start();
@@ -7516,8 +5806,8 @@ function createDefaultAction(program) {
  // Enable web interface in http mode
  enableMcpServer: true,
  // Always enable MCP server
- enablePipelineApi: true,
- // Enable pipeline API in http mode
+ enableApiServer: true,
+ // Enable API (tRPC) in http mode
  enableWorker: true,
  // Always enable in-process worker for unified server
  port
@@ -7529,6 +5819,19 @@ function createDefaultAction(program) {
  }
  );
  }
+ async function fetchUrlAction(url, options, command) {
+ const globalOptions = command.parent?.opts() || {};
+ setupLogging(globalOptions);
+ const headers = parseHeaders(options.header);
+ const fetchUrlTool = new FetchUrlTool(new HttpFetcher(), new FileFetcher());
+ const content = await fetchUrlTool.execute({
+ url,
+ followRedirects: options.followRedirects,
+ scrapeMode: options.scrapeMode,
+ headers: Object.keys(headers).length > 0 ? headers : void 0
+ });
+ console.log(content);
+ }
  function createFetchUrlCommand(program) {
  return program.command("fetch-url <url>").description("Fetch a URL and convert its content to Markdown").option(
  "--no-follow-redirects",
@@ -7552,66 +5855,64 @@ function createFetchUrlCommand(program) {
  "Custom HTTP header to send with the request (can be specified multiple times)",
  (val, prev = []) => prev.concat([val]),
  []
- ).action(
- async (url, options, command) => {
- const globalOptions = command.parent?.opts() || {};
- setupLogging(globalOptions);
- const headers = parseHeaders(options.header);
- const fetchUrlTool = new FetchUrlTool(new HttpFetcher(), new FileFetcher());
- const content = await fetchUrlTool.execute({
- url,
- followRedirects: options.followRedirects,
- scrapeMode: options.scrapeMode,
- headers: Object.keys(headers).length > 0 ? headers : void 0
- });
- console.log(content);
- }
- );
+ ).action(fetchUrlAction);
+ }
+ async function findVersionAction(library, options, command) {
+ const globalOptions = command.parent?.opts() || {};
+ setupLogging(globalOptions);
+ const serverUrl = options.serverUrl;
+ const docService = await createDocumentManagement({ serverUrl });
+ try {
+ const findVersionTool = new FindVersionTool(docService);
+ const versionInfo = await findVersionTool.execute({
+ library,
+ targetVersion: options.version
+ });
+ if (!versionInfo) throw new Error("Failed to get version information");
+ console.log(versionInfo);
+ } finally {
+ await docService.shutdown();
+ }
  }
  function createFindVersionCommand(program) {
- return program.command("find-version <library>").description("Find the best matching version for a library").option("-v, --version <string>", "Pattern to match (optional, supports ranges)").action(async (library, options, command) => {
- const globalOptions = command.parent?.opts() || {};
- setupLogging(globalOptions);
- const docService = await initializeDocumentService();
- try {
- const findVersionTool = new FindVersionTool(docService);
- const versionInfo = await findVersionTool.execute({
- library,
- targetVersion: options.version
- });
- if (!versionInfo) throw new Error("Failed to get version information");
- console.log(versionInfo);
- } finally {
- await docService.shutdown();
- }
- });
+ return program.command("find-version <library>").description("Find the best matching version for a library").option("-v, --version <string>", "Pattern to match (optional, supports ranges)").option(
+ "--server-url <url>",
+ "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
+ ).action(findVersionAction);
+ }
+ async function listAction(options, command) {
+ const globalOptions = command.parent?.opts() || {};
+ setupLogging(globalOptions);
+ const { serverUrl } = options;
+ const docService = await createDocumentManagement({ serverUrl });
+ try {
+ const listLibrariesTool = new ListLibrariesTool(docService);
+ const result = await listLibrariesTool.execute();
+ console.log(formatOutput(result.libraries));
+ } finally {
+ await docService.shutdown();
+ }
  }
  function createListCommand(program) {
- return program.command("list").description("List all available libraries and their versions").action(async (command) => {
- const globalOptions = command.opts() || {};
- setupLogging(globalOptions);
- const docService = await initializeDocumentService();
- try {
- const listLibrariesTool = new ListLibrariesTool(docService);
- const result = await listLibrariesTool.execute();
- console.log(formatOutput(result.libraries));
- } finally {
- await docService.shutdown();
- }
- });
+ return program.command("list").description("List all available libraries and their versions").option(
+ "--server-url <url>",
+ "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
+ ).action(listAction);
  }
  function createMcpCommand(program) {
- return program.command("mcp").description("Start MCP server only").option(
- "--protocol <type>",
- "Protocol for MCP server: 'auto' (default), 'stdio', or 'http'",
- CLI_DEFAULTS.PROTOCOL
- ).option(
- "--port <number>",
- "Port for the MCP server",
- CLI_DEFAULTS.HTTP_PORT.toString()
+ return program.command("mcp").description("Start MCP server only").addOption(
+ new Option("--protocol <protocol>", "Protocol for MCP server").choices(["auto", "stdio", "http"]).default(CLI_DEFAULTS.PROTOCOL)
+ ).addOption(
+ new Option("--port <number>", "Port for the MCP server").argParser((v) => {
+ const n = Number(v);
+ if (!Number.isInteger(n) || n < 1 || n > 65535) {
+ throw new Error("Port must be an integer between 1 and 65535");
+ }
+ return String(n);
+ }).default(CLI_DEFAULTS.HTTP_PORT.toString())
  ).option(
  "--server-url <url>",
- "URL of external pipeline worker API (e.g., http://localhost:6280/api)"
+ "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
  ).action(
  async (cmdOptions, command) => {
  const globalOptions = command.parent?.opts() || {};
@@ -7620,14 +5921,19 @@ function createMcpCommand(program) {
  const resolvedProtocol = resolveProtocol(cmdOptions.protocol);
  setupLogging(globalOptions, resolvedProtocol);
  try {
- const docService = await initializeDocumentService();
+ const docService = await createDocumentManagement({
+ serverUrl
+ });
  const pipelineOptions = {
  recoverJobs: false,
  // MCP command doesn't support job recovery
  serverUrl,
  concurrency: 3
  };
- const pipeline = await initializePipeline(docService, pipelineOptions);
+ const pipeline = await createPipelineWithCallbacks(
+ serverUrl ? void 0 : docService,
+ pipelineOptions
+ );
  if (resolvedProtocol === "stdio") {
  logger.debug(`🔍 Auto-detected stdio protocol (no TTY)`);
  logger.info("🚀 Starting MCP server (stdio mode)");
@@ -7643,8 +5949,8 @@ function createMcpCommand(program) {
  enableWebInterface: false,
  // Never enable web interface in mcp command
  enableMcpServer: true,
- enablePipelineApi: false,
- // Never enable pipeline API in mcp command
+ enableApiServer: false,
+ // Never enable API in mcp command
  enableWorker: !serverUrl,
  port,
  externalWorkerUrl: serverUrl
@@ -7660,30 +5966,81 @@ function createMcpCommand(program) {
  }
  );
  }
+ async function removeAction(library, options, command) {
+ const globalOptions = command.parent?.opts() || {};
+ setupLogging(globalOptions);
+ const serverUrl = options.serverUrl;
+ const docService = await createDocumentManagement({ serverUrl });
+ const { version: version2 } = options;
+ try {
+ await docService.removeAllDocuments(library, version2);
+ console.log(
+ `✅ Successfully removed documents for ${library}${version2 ? `@${version2}` : " (unversioned)"}.`
+ );
+ } catch (error) {
+ console.error(
+ `❌ Failed to remove documents for ${library}${version2 ? `@${version2}` : " (unversioned)"}:`,
+ error instanceof Error ? error.message : String(error)
+ );
+ throw error;
+ } finally {
+ await docService.shutdown();
+ }
+ }
  function createRemoveCommand(program) {
  return program.command("remove <library>").description("Remove documents for a specific library and version").option(
  "-v, --version <string>",
  "Version to remove (optional, removes unversioned if omitted)"
- ).action(async (library, options, command) => {
- const globalOptions = command.parent?.opts() || {};
- setupLogging(globalOptions);
- const docService = await initializeDocumentService();
- const { version: version2 } = options;
- try {
- await docService.removeAllDocuments(library, version2);
- console.log(
- `✅ Successfully removed documents for ${library}${version2 ? `@${version2}` : " (unversioned)"}.`
- );
- } catch (error) {
- console.error(
- `❌ Failed to remove documents for ${library}${version2 ? `@${version2}` : " (unversioned)"}:`,
- error instanceof Error ? error.message : String(error)
- );
- throw error;
- } finally {
- await docService.shutdown();
+ ).option(
+ "--server-url <url>",
+ "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
+ ).action(removeAction);
+ }
+ async function scrapeAction(library, url, options, command) {
+ const globalOptions = command.parent?.opts() || {};
+ setupLogging(globalOptions);
+ const serverUrl = options.serverUrl;
+ const docService = await createDocumentManagement({ serverUrl });
+ let pipeline = null;
+ try {
+ const pipelineOptions = {
+ recoverJobs: false,
+ concurrency: 1,
+ serverUrl
+ };
+ pipeline = await createPipelineWithCallbacks(
+ serverUrl ? void 0 : docService,
+ pipelineOptions
+ );
+ await pipeline.start();
+ const scrapeTool = new ScrapeTool(pipeline);
+ const headers = parseHeaders(options.header);
+ const result = await scrapeTool.execute({
+ url,
+ library,
+ version: options.version,
+ options: {
+ maxPages: Number.parseInt(options.maxPages),
+ maxDepth: Number.parseInt(options.maxDepth),
+ maxConcurrency: Number.parseInt(options.maxConcurrency),
+ ignoreErrors: options.ignoreErrors,
+ scope: options.scope,
+ followRedirects: options.followRedirects,
+ scrapeMode: options.scrapeMode,
+ includePatterns: Array.isArray(options.includePattern) && options.includePattern.length > 0 ? options.includePattern : void 0,
+ excludePatterns: Array.isArray(options.excludePattern) && options.excludePattern.length > 0 ? options.excludePattern : void 0,
+ headers: Object.keys(headers).length > 0 ? headers : void 0
+ }
+ });
+ if ("pagesScraped" in result) {
+ console.log(`✅ Successfully scraped ${result.pagesScraped} pages`);
+ } else {
+ console.log(`🚀 Scraping job started with ID: ${result.jobId}`);
  }
- });
+ } finally {
+ if (pipeline) await pipeline.stop();
+ await docService.shutdown();
+ }
  }
  function createScrapeCommand(program) {
  return program.command("scrape <library> <url>").description(
@@ -7746,55 +6103,27 @@ function createScrapeCommand(program) {
  []
  ).option(
  "--server-url <url>",
- "URL of external pipeline worker API (e.g., http://localhost:6280/api)"
- ).action(
- async (library, url, options, command) => {
- const globalOptions = command.parent?.opts() || {};
- setupLogging(globalOptions);
- const docService = new DocumentManagementService();
- let pipeline = null;
- try {
- await docService.initialize();
- const pipelineOptions = {
- recoverJobs: false,
- // CLI: no job recovery (immediate execution)
- concurrency: 1,
- // CLI: single job at a time
- serverUrl: options.serverUrl
- // Use external worker if specified
- };
- pipeline = await PipelineFactory.createPipeline(docService, pipelineOptions);
- await pipeline.start();
- const scrapeTool = new ScrapeTool(pipeline);
- const headers = parseHeaders(options.header);
- const result = await scrapeTool.execute({
- url,
- library,
- version: options.version,
- options: {
- maxPages: Number.parseInt(options.maxPages),
- maxDepth: Number.parseInt(options.maxDepth),
- maxConcurrency: Number.parseInt(options.maxConcurrency),
- ignoreErrors: options.ignoreErrors,
- scope: options.scope,
- followRedirects: options.followRedirects,
- scrapeMode: options.scrapeMode,
- includePatterns: Array.isArray(options.includePattern) && options.includePattern.length > 0 ? options.includePattern : void 0,
- excludePatterns: Array.isArray(options.excludePattern) && options.excludePattern.length > 0 ? options.excludePattern : void 0,
- headers: Object.keys(headers).length > 0 ? headers : void 0
- }
- });
- if ("pagesScraped" in result) {
- console.log(`✅ Successfully scraped ${result.pagesScraped} pages`);
- } else {
- console.log(`🚀 Scraping job started with ID: ${result.jobId}`);
- }
- } finally {
- if (pipeline) await pipeline.stop();
- await docService.shutdown();
- }
- }
- );
+ "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
+ ).action(scrapeAction);
+ }
+ async function searchAction(library, query, options, command) {
+ const globalOptions = command.parent?.opts() || {};
+ setupLogging(globalOptions);
+ const serverUrl = options.serverUrl;
+ const docService = await createDocumentManagement({ serverUrl });
+ try {
+ const searchTool = new SearchTool(docService);
+ const result = await searchTool.execute({
+ library,
+ version: options.version,
+ query,
+ limit: Number.parseInt(options.limit),
+ exactMatch: options.exactMatch
+ });
+ console.log(formatOutput(result.results));
+ } finally {
+ await docService.shutdown();
+ }
  }
  function createSearchCommand(program) {
  return program.command("search <library> <query>").description(
@@ -7802,35 +6131,23 @@ function createSearchCommand(program) {
  ).option(
  "-v, --version <string>",
  "Version of the library (optional, supports ranges)"
- ).option("-l, --limit <number>", "Maximum number of results", "5").option("-e, --exact-match", "Only use exact version match (default: false)", false).action(
- async (library, query, options, command) => {
- const globalOptions = command.parent?.opts() || {};
- setupLogging(globalOptions);
- const docService = await initializeDocumentService();
- try {
- const searchTool = new SearchTool(docService);
- const result = await searchTool.execute({
- library,
- version: options.version,
- query,
- limit: Number.parseInt(options.limit),
- exactMatch: options.exactMatch
- });
- console.log(formatOutput(result.results));
- } finally {
- await docService.shutdown();
- }
- }
- );
+ ).option("-l, --limit <number>", "Maximum number of results", "5").option("-e, --exact-match", "Only use exact version match (default: false)", false).option(
+ "--server-url <url>",
+ "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
+ ).action(searchAction);
  }
  function createWebCommand(program) {
- return program.command("web").description("Start web interface only").option(
- "--port <number>",
- "Port for the web interface",
- CLI_DEFAULTS.WEB_PORT.toString()
+ return program.command("web").description("Start web interface only").addOption(
+ new Option("--port <number>", "Port for the web interface").argParser((v) => {
+ const n = Number(v);
+ if (!Number.isInteger(n) || n < 1 || n > 65535) {
+ throw new Error("Port must be an integer between 1 and 65535");
+ }
+ return String(n);
+ }).default(CLI_DEFAULTS.WEB_PORT.toString())
  ).option(
  "--server-url <url>",
- "URL of external pipeline worker API (e.g., http://localhost:6280/api)"
+ "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
  ).action(
  async (cmdOptions, command) => {
  const globalOptions = command.parent?.opts() || {};
@@ -7838,18 +6155,23 @@ function createWebCommand(program) {
  const serverUrl = cmdOptions.serverUrl;
  setupLogging(globalOptions);
  try {
- const docService = await initializeDocumentService();
+ const docService = await createDocumentManagement({
+ serverUrl
+ });
  const pipelineOptions = {
  recoverJobs: false,
  // Web command doesn't support job recovery
  serverUrl,
  concurrency: 3
  };
- const pipeline = await initializePipeline(docService, pipelineOptions);
+ const pipeline = await createPipelineWithCallbacks(
+ serverUrl ? void 0 : docService,
+ pipelineOptions
+ );
  const config = createAppServerConfig({
  enableWebInterface: true,
  enableMcpServer: false,
- enablePipelineApi: false,
+ enableApiServer: false,
  enableWorker: !serverUrl,
  port,
  externalWorkerUrl: serverUrl
@@ -7868,28 +6190,35 @@ function createWebCommand(program) {
  );
  }
  function createWorkerCommand(program) {
- return program.command("worker").description("Start external pipeline worker (HTTP API)").option("--port <number>", "Port for worker API", "8080").option("--resume", "Resume interrupted jobs on startup", true).action(async (cmdOptions, command) => {
+ return program.command("worker").description("Start external pipeline worker (HTTP API)").addOption(
+ new Option("--port <number>", "Port for worker API").argParser((v) => {
+ const n = Number(v);
+ if (!Number.isInteger(n) || n < 1 || n > 65535) {
+ throw new Error("Port must be an integer between 1 and 65535");
+ }
+ return String(n);
+ }).default("8080")
+ ).option("--resume", "Resume interrupted jobs on startup", true).option("--no-resume", "Do not resume jobs on startup").action(async (cmdOptions, command) => {
  const globalOptions = command.parent?.opts() || {};
  const port = validatePort(cmdOptions.port);
  setupLogging(globalOptions);
  try {
  logger.info(`🚀 Starting external pipeline worker on port ${port}`);
  ensurePlaywrightBrowsersInstalled();
- const docService = await initializeDocumentService();
+ const docService = await createLocalDocumentManagement();
  const pipelineOptions = {
  recoverJobs: cmdOptions.resume,
  // Use the resume option
  concurrency: CLI_DEFAULTS.MAX_CONCURRENCY
  };
- const pipeline = await initializePipeline(docService, pipelineOptions);
+ const pipeline = await createPipelineWithCallbacks(docService, pipelineOptions);
  const config = createAppServerConfig({
  enableWebInterface: false,
  enableMcpServer: false,
- enablePipelineApi: true,
+ enableApiServer: true,
  enableWorker: true,
  port
  });
- logger.info(`🚀 Starting external pipeline worker with HTTP API`);
  await startAppServer(docService, pipeline, config);
  await new Promise(() => {
  });
@@ -7901,7 +6230,9 @@ function createWorkerCommand(program) {
  }
  function createCliProgram() {
  const program = new Command();
- program.name("docs-mcp-server").description("Unified CLI, MCP Server, and Web Interface for Docs MCP Server.").version(packageJson.version).option("--verbose", "Enable verbose (debug) logging", false).option("--silent", "Disable all logging except errors", false).enablePositionalOptions().showHelpAfterError(true);
+ program.name("docs-mcp-server").description("Unified CLI, MCP Server, and Web Interface for Docs MCP Server.").version(packageJson.version).addOption(
+ new Option("--verbose", "Enable verbose (debug) logging").conflicts("silent")
+ ).addOption(new Option("--silent", "Disable all logging except errors")).enablePositionalOptions().allowExcessArguments(false).showHelpAfterError(true);
  program.hook("preAction", (thisCommand, _actionCommand) => {
  const globalOptions = thisCommand.opts();
  if (globalOptions.silent) setLogLevel(LogLevel.ERROR);
@@ -8023,7 +6354,23 @@ runCli().catch((error) => {
  process.exit(1);
  });
  export {
+ ConnectionError as C,
  DimensionError as D,
- VECTOR_DIMENSION as V
+ EMBEDDING_BATCH_CHARS as E,
+ LibraryNotFoundError as L,
+ StoreError as S,
+ VECTOR_DIMENSION as V,
+ applyMigrations as a,
+ EMBEDDING_BATCH_SIZE as b,
+ createJSDOM as c,
+ denormalizeVersionName as d,
+ SPLITTER_PREFERRED_CHUNK_SIZE as e,
+ SPLITTER_MAX_CHUNK_SIZE as f,
+ getProjectRoot as g,
+ VersionNotFoundError as h,
+ SPLITTER_MIN_CHUNK_SIZE as i,
+ logger as l,
+ mapDbDocumentToDocument as m,
+ normalizeVersionName as n
  };
  //# sourceMappingURL=index.js.map