@arabold/docs-mcp-server 1.18.0 → 1.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -6
- package/db/migrations/007-dedupe-unversioned-versions.sql +62 -0
- package/db/migrations/008-case-insensitive-names.sql +10 -0
- package/dist/DocumentManagementClient-CAFdDwTu.js +57 -0
- package/dist/DocumentManagementClient-CAFdDwTu.js.map +1 -0
- package/dist/DocumentManagementService-BH02TJEe.js +1917 -0
- package/dist/DocumentManagementService-BH02TJEe.js.map +1 -0
- package/dist/index.js +736 -2429
- package/dist/index.js.map +1 -1
- package/package.json +3 -1
package/dist/index.js
CHANGED
@@ -1,5 +1,5 @@
 import "dotenv/config";
-import { Command } from "commander";
+import { Option, Command } from "commander";
 import path from "node:path";
 import formBody from "@fastify/formbody";
 import fastifyStatic from "@fastify/static";
@@ -21,6 +21,9 @@ import fs from "node:fs/promises";
 import * as mime from "mime-types";
 import axios from "axios";
 import { HeaderGenerator } from "header-generator";
+import { initTRPC } from "@trpc/server";
+import { fastifyTRPCPlugin } from "@trpc/server/adapters/fastify";
+import { z as z$1 } from "zod";
 import { jsxs, jsx, Fragment } from "@kitajs/html/jsx-runtime";
 import fs$1, { readFileSync, existsSync } from "node:fs";
 import { unified } from "unified";
@@ -30,15 +33,16 @@ import remarkHtml from "remark-html";
 import DOMPurify from "dompurify";
 import { fileURLToPath, URL as URL$1 } from "node:url";
 import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
+import { createTRPCProxyClient, httpBatchLink } from "@trpc/client";
+import "env-paths";
+import "fuse.js";
+import "langchain/text_splitter";
+import "better-sqlite3";
+import "sqlite-vec";
 import { execSync } from "node:child_process";
 import { v4 } from "uuid";
 import psl from "psl";
 import { minimatch } from "minimatch";
-import envPaths from "env-paths";
-import Fuse from "fuse.js";
-import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
-import Database from "better-sqlite3";
-import * as sqliteVec from "sqlite-vec";
 const LogLevel = {
 ERROR: 0,
 WARN: 1,
@@ -97,7 +101,7 @@ const logger = {
 }
 }
 };
-const version = "1.
+const version = "1.18.0";
 const packageJson = {
 version
 };
@@ -405,6 +409,7 @@ const SPLITTER_MIN_CHUNK_SIZE = 500;
 const SPLITTER_PREFERRED_CHUNK_SIZE = 1500;
 const SPLITTER_MAX_CHUNK_SIZE = 5e3;
 const EMBEDDING_BATCH_SIZE = 100;
+const EMBEDDING_BATCH_CHARS = 5e4;
 const MIGRATION_MAX_RETRIES = 5;
 const MIGRATION_RETRY_DELAY_MS = 300;
 var ScrapeMode = /* @__PURE__ */ ((ScrapeMode2) => {
@@ -1319,8 +1324,15 @@ class ListLibrariesTool {
 const rawLibraries = await this.docService.listLibraries();
 const libraries = rawLibraries.map(({ library, versions }) => ({
 name: library,
-versions
-
+versions: versions.map((v) => ({
+version: v.ref.version,
+documentCount: v.counts.documents,
+uniqueUrlCount: v.counts.uniqueUrls,
+indexedAt: v.indexedAt,
+status: v.status,
+...v.progress ? { progress: v.progress } : void 0,
+sourceUrl: v.sourceUrl
+}))
 }));
 return { libraries };
 }
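The `ListLibrariesTool` hunk above flattens the richer internal version record (`v.ref`, `v.counts`) into a per-version summary. A minimal TypeScript sketch of the resulting shape; the interface names are illustrative (the bundle ships untyped JavaScript), and the `indexedAt`/`progress` types are assumptions:

```ts
// Illustrative names only; these interfaces are not exported by the package.
interface LibraryVersionSummary {
  version: string;          // from v.ref.version; "" denotes the unversioned entry
  documentCount: number;    // v.counts.documents (indexed snippets)
  uniqueUrlCount: number;   // v.counts.uniqueUrls (indexed pages)
  indexedAt: string | null; // assumed timestamp type
  status: string;           // version status (e.g. queued/running)
  progress?: unknown;       // only spread in while a job reports progress
  sourceUrl?: string;       // URL the docs were scraped from
}

interface LibrarySummary {
  name: string;
  versions: LibraryVersionSummary[];
}
```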
@@ -1400,7 +1412,8 @@ class ScrapeTool {
 }
 internalVersion = internalVersion.toLowerCase();
 const pipeline = this.pipeline;
-const
+const enqueueVersion = internalVersion === "" ? null : internalVersion;
+const jobId = await pipeline.enqueueJob(library, enqueueVersion, {
 url,
 library,
 version: internalVersion,
@@ -1447,13 +1460,13 @@ class SearchTool {
 await this.docService.validateLibraryExists(library);
 const allLibraries = await this.docService.listLibraries();
 const libraryInfo = allLibraries.find((lib) => lib.library === library);
-const detailedVersions = libraryInfo ? libraryInfo.versions
-
-
-
-
-
-);
+const detailedVersions = libraryInfo ? libraryInfo.versions.map((v) => ({
+version: v.ref.version,
+documentCount: v.counts.documents,
+uniqueUrlCount: v.counts.uniqueUrls,
+indexedAt: v.indexedAt
+})) : [];
+throw new VersionNotFoundError(library, version2 ?? "latest", detailedVersions);
 }
 const resolvedVersion = version2 || "latest";
 logger.info(
@@ -2224,134 +2237,229 @@ async function cleanupMcpService(mcpServer) {
 throw error;
 }
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-try {
-const { status } = request.query;
-const jobs = await this.pipeline.getJobs(status);
-return reply.send({ jobs });
-} catch (error) {
-logger.error(`API: Failed to get jobs: ${error}`);
-return reply.status(500).send({
-error: error instanceof Error ? error.message : String(error)
-});
-}
+const t$1 = initTRPC.context().create();
+const nonEmptyTrimmed = z$1.string().transform((s) => s.trim()).refine((s) => s.length > 0, "must not be empty");
+const optionalTrimmed = z$1.preprocess(
+(v) => typeof v === "string" ? v.trim() : v,
+z$1.string().min(1).optional().nullable()
+);
+const enqueueInput = z$1.object({
+library: nonEmptyTrimmed,
+version: optionalTrimmed,
+options: z$1.custom()
+});
+const jobIdInput = z$1.object({ id: z$1.string().min(1) });
+const getJobsInput = z$1.object({
+status: z$1.nativeEnum(PipelineJobStatus).optional()
+});
+function createPipelineRouter(trpc) {
+const tt = trpc;
+return tt.router({
+enqueueJob: tt.procedure.input(enqueueInput).mutation(
+async ({
+ctx,
+input
+}) => {
+const jobId = await ctx.pipeline.enqueueJob(
+input.library,
+input.version ?? null,
+input.options
+);
+return { jobId };
+}
+),
+getJob: tt.procedure.input(jobIdInput).query(
+async ({
+ctx,
+input
+}) => {
+return ctx.pipeline.getJob(input.id);
+}
+),
+getJobs: tt.procedure.input(getJobsInput.optional()).query(
+async ({
+ctx,
+input
+}) => {
+const jobs = await ctx.pipeline.getJobs(input?.status);
+return { jobs };
+}
+),
+cancelJob: tt.procedure.input(jobIdInput).mutation(
+async ({
+ctx,
+input
+}) => {
+await ctx.pipeline.cancelJob(input.id);
+return { success: true };
+}
+),
+clearCompletedJobs: tt.procedure.mutation(
+async ({ ctx }) => {
+const count = await ctx.pipeline.clearCompletedJobs();
+return { count };
 }
-)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+)
+});
+}
+createPipelineRouter(t$1);
+const t = initTRPC.context().create();
+const nonEmpty = z$1.string().min(1).transform((s) => s.trim());
+const optionalVersion = z$1.string().optional().nullable().transform((v) => typeof v === "string" ? v.trim() : v);
+function createDataRouter(trpc) {
+const tt = trpc;
+return tt.router({
+listLibraries: tt.procedure.query(async ({ ctx }) => {
+return await ctx.docService.listLibraries();
+}),
+findBestVersion: tt.procedure.input(z$1.object({ library: nonEmpty, targetVersion: z$1.string().optional() })).query(
+async ({
+ctx,
+input
+}) => {
+const result = await ctx.docService.findBestVersion(
+input.library,
+input.targetVersion
+);
+return result;
+}
+),
+validateLibraryExists: tt.procedure.input(z$1.object({ library: nonEmpty })).mutation(
+async ({ ctx, input }) => {
+await ctx.docService.validateLibraryExists(input.library);
+return { ok: true };
+}
+),
+search: tt.procedure.input(
+z$1.object({
+library: nonEmpty,
+version: optionalVersion,
+query: nonEmpty,
+limit: z$1.number().int().positive().max(50).optional()
+})
+).query(
+async ({
+ctx,
+input
+}) => {
+const results = await ctx.docService.searchStore(
+input.library,
+input.version ?? null,
+input.query,
+input.limit ?? 5
+);
+return results;
+}
+),
+removeAllDocuments: tt.procedure.input(z$1.object({ library: nonEmpty, version: optionalVersion })).mutation(
+async ({
+ctx,
+input
+}) => {
+await ctx.docService.removeAllDocuments(input.library, input.version ?? null);
+return { ok: true };
+}
+),
+// Status and version helpers
+getVersionsByStatus: tt.procedure.input(z$1.object({ statuses: z$1.array(z$1.string()) })).query(
+async ({
+ctx,
+input
+}) => {
+const statuses = input.statuses;
+return await ctx.docService.getVersionsByStatus(
+statuses
+);
 }
-)
-
-
-
-
-
-
-
-
-
-
-
-
-
+),
+findVersionsBySourceUrl: tt.procedure.input(z$1.object({ url: nonEmpty })).query(async ({ ctx, input }) => {
+return await ctx.docService.findVersionsBySourceUrl(
+input.url
+);
+}),
+getScraperOptions: tt.procedure.input(z$1.object({ versionId: z$1.number().int().positive() })).query(
+async ({
+ctx,
+input
+}) => {
+return await ctx.docService.getScraperOptions(input.versionId);
+}
+),
+updateVersionStatus: tt.procedure.input(
+z$1.object({
+versionId: z$1.number().int().positive(),
+status: z$1.string(),
+errorMessage: z$1.string().optional().nullable()
+})
+).mutation(
+async ({
+ctx,
+input
+}) => {
+await ctx.docService.updateVersionStatus(
+input.versionId,
+input.status,
+input.errorMessage ?? void 0
+);
+return { ok: true };
+}
+),
+updateVersionProgress: tt.procedure.input(
+z$1.object({
+versionId: z$1.number().int().positive(),
+pages: z$1.number().int().nonnegative(),
+maxPages: z$1.number().int().positive()
+})
+).mutation(
+async ({
+ctx,
+input
+}) => {
+await ctx.docService.updateVersionProgress(
+input.versionId,
+input.pages,
+input.maxPages
+);
+return { ok: true };
 }
-)
-
-
-
-
-
-
-
-
-
-
-
-
+),
+storeScraperOptions: tt.procedure.input(
+z$1.object({
+versionId: z$1.number().int().positive(),
+options: z$1.unknown()
+})
+).mutation(
+async ({
+ctx,
+input
+}) => {
+await ctx.docService.storeScraperOptions(
+input.versionId,
+input.options
+);
+return { ok: true };
 }
-)
-
-}
+)
+});
 }
-
-
-
+createDataRouter(t);
+async function registerTrpcService(server, pipeline, docService) {
+const t2 = initTRPC.context().create();
+const healthRouter = t2.router({
+ping: t2.procedure.query(async () => ({ status: "ok", ts: Date.now() }))
+});
+const router = t2.mergeRouters(
+healthRouter,
+createPipelineRouter(t2),
+createDataRouter(t2)
+);
+await server.register(fastifyTRPCPlugin, {
+prefix: "/api",
+trpcOptions: {
+router,
+createContext: async () => ({ pipeline, docService })
+}
+});
 }
 const Layout = ({ title, version: version2, children }) => {
 let versionString = version2;
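The hunk above replaces the old REST-style Fastify routes with tRPC routers merged and mounted at `/api`. A minimal consumer sketch under stated assumptions: the merged router's type is internal to the bundle, so the proxy is untyped, and the host/port in the URL is hypothetical.

```ts
import { createTRPCProxyClient, httpBatchLink } from "@trpc/client";

// Untyped proxy (mirroring what PipelineClient does later in this diff).
// The URL is hypothetical; substitute wherever the worker serves its /api prefix.
const api = createTRPCProxyClient<any>({
  links: [httpBatchLink({ url: "http://localhost:6280/api" })],
});

async function demo() {
  await api.ping.query(); // health check from the healthRouter
  const { jobId } = await api.enqueueJob.mutate({
    library: "example-lib",
    version: null, // optionalTrimmed: null/omitted means "unversioned"
    options: {},   // opaque to the schema (z$1.custom())
  });
  const job = await api.getJob.query({ id: jobId });
  const { jobs } = await api.getJobs.query(); // input is optional; or pass { status: ... }
  return { job, jobs };
}
```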
@@ -2522,7 +2630,7 @@ function normalizeVersionName(name) {
 return name ?? "";
 }
 function denormalizeVersionName(name) {
-return name === "" ?
+return name === "" ? "" : name;
 }
 function getStatusDescription(status) {
 const descriptions = {
@@ -3426,8 +3534,8 @@ const VersionDetailsRow = ({
 // Default to true
 }) => {
 const indexedDate = version2.indexedAt ? new Date(version2.indexedAt).toLocaleDateString() : "N/A";
-const versionLabel = version2.version || "Unversioned";
-const versionParam = version2.version || "";
+const versionLabel = version2.ref.version || "Unversioned";
+const versionParam = version2.ref.version || "";
 const sanitizedLibraryName = libraryName.replace(/[^a-zA-Z0-9-_]/g, "-");
 const sanitizedVersionParam = versionParam.replace(/[^a-zA-Z0-9-_]/g, "-");
 const rowId = `row-${sanitizedLibraryName}-${sanitizedVersionParam}`;
@@ -3446,19 +3554,19 @@ const VersionDetailsRow = ({
 {
 class: "text-sm text-gray-900 dark:text-white w-1/4 truncate",
 title: versionLabel,
-children: version2.version ? /* @__PURE__ */ jsx(VersionBadge, { version: version2.version }) : /* @__PURE__ */ jsx("span", { children: "Unversioned" })
+children: version2.ref.version ? /* @__PURE__ */ jsx(VersionBadge, { version: version2.ref.version }) : /* @__PURE__ */ jsx("span", { children: "Unversioned" })
 }
 ),
 /* @__PURE__ */ jsxs("div", { class: "flex space-x-2 text-sm text-gray-600 dark:text-gray-400 w-3/4 justify-end items-center", children: [
 /* @__PURE__ */ jsxs("span", { title: "Number of unique pages indexed", children: [
 "Pages:",
 " ",
-/* @__PURE__ */ jsx("span", { class: "font-semibold", safe: true, children: version2.
+/* @__PURE__ */ jsx("span", { class: "font-semibold", safe: true, children: version2.counts.uniqueUrls.toLocaleString() })
 ] }),
 /* @__PURE__ */ jsxs("span", { title: "Number of indexed snippets", children: [
 "Snippets:",
 " ",
-/* @__PURE__ */ jsx("span", { class: "font-semibold", safe: true, children: version2.
+/* @__PURE__ */ jsx("span", { class: "font-semibold", safe: true, children: version2.counts.documents.toLocaleString() })
 ] }),
 /* @__PURE__ */ jsxs("span", { title: "Date last indexed", children: [
 "Last Update:",
@@ -3558,17 +3666,28 @@ const LibraryDetailCard = ({ library }) => (
 // Use Flowbite Card structure with updated padding and border, and white background
 /* @__PURE__ */ jsxs("div", { class: "block p-4 bg-white dark:bg-gray-800 rounded-lg shadow-sm border border-gray-300 dark:border-gray-600 mb-4", children: [
 /* @__PURE__ */ jsx("h3", { class: "text-lg font-medium text-gray-900 dark:text-white mb-1", children: /* @__PURE__ */ jsx("span", { safe: true, children: library.name }) }),
-/* @__PURE__ */ jsx("div", { class: "mt-1", children: library.versions.length > 0 ? library.versions.map((
-
-
-
-
-
-
-
-
-
+/* @__PURE__ */ jsx("div", { class: "mt-1", children: library.versions.length > 0 ? library.versions.map((v) => {
+const adapted = {
+id: -1,
+ref: { library: library.name, version: v.version },
+status: v.status,
+progress: v.progress,
+counts: {
+documents: v.documentCount,
+uniqueUrls: v.uniqueUrlCount
+},
+indexedAt: v.indexedAt,
+sourceUrl: v.sourceUrl ?? void 0
+};
+return /* @__PURE__ */ jsx(
+VersionDetailsRow,
+{
+libraryName: library.name,
+version: adapted,
+showDelete: false
+}
+);
+}) : /* @__PURE__ */ jsx("p", { class: "text-sm text-gray-500 dark:text-gray-400 italic", children: "No versions indexed." }) })
 ] })
 );
 const LibrarySearchCard = ({ library }) => {
@@ -3733,7 +3852,21 @@ const LibraryItem = ({ library }) => (
 children: /* @__PURE__ */ jsx("span", { safe: true, children: library.name })
 }
 ) }),
-/* @__PURE__ */ jsx("div", { class: "mt-1", children: library.versions.length > 0 ? library.versions.map((
+/* @__PURE__ */ jsx("div", { class: "mt-1", children: library.versions.length > 0 ? library.versions.map((v) => {
+const adapted = {
+id: -1,
+ref: { library: library.name, version: v.version },
+status: v.status,
+progress: v.progress,
+counts: {
+documents: v.documentCount,
+uniqueUrls: v.uniqueUrlCount
+},
+indexedAt: v.indexedAt,
+sourceUrl: v.sourceUrl ?? void 0
+};
+return /* @__PURE__ */ jsx(VersionDetailsRow, { libraryName: library.name, version: adapted });
+}) : (
 // Display message if no versions are indexed
 /* @__PURE__ */ jsx("p", { class: "text-sm text-gray-500 dark:text-gray-400 italic", children: "No versions indexed." })
 ) })
@@ -3861,9 +3994,9 @@ class AppServer {
 );
 }
 }
-if (this.config.enableWorker && !this.config.
+if (this.config.enableWorker && !this.config.enableApiServer) {
 logger.warn(
-"Warning: Worker is enabled but
+"Warning: Worker is enabled but API server is disabled. Consider enabling the API for better observability."
 );
 }
 }
@@ -3915,8 +4048,8 @@ class AppServer {
 if (this.config.enableMcpServer) {
 await this.enableMcpServer();
 }
-if (this.config.
-await this.
+if (this.config.enableApiServer) {
+await this.enableTrpcApi();
 }
 if (this.config.enableWorker) {
 await this.enableWorker();
@@ -3944,11 +4077,11 @@ class AppServer {
 logger.debug("MCP server service enabled");
 }
 /**
-* Enable Pipeline
+* Enable Pipeline RPC (tRPC) service.
 */
-async
-await
-logger.debug("
+async enableTrpcApi() {
+await registerTrpcService(this.server, this.pipeline, this.docService);
+logger.debug("API server (tRPC) enabled");
 }
 /**
 * Enable worker service.
@@ -3977,10 +4110,10 @@ class AppServer {
 enabledServices.push(`Web interface: ${address}`);
 }
 if (this.config.enableMcpServer) {
-enabledServices.push(`MCP
+enabledServices.push(`MCP endpoints: ${address}/mcp, ${address}/sse`);
 }
-if (this.config.
-enabledServices.push(`
+if (this.config.enableApiServer) {
+enabledServices.push(`API: ${address}/api`);
 }
 if (this.config.enableWorker) {
 enabledServices.push("Embedded worker: enabled");
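For orientation, the flags touched in these `AppServer` hunks map to services as follows. This is an inferred sketch only: the config type is not visible in the bundle, and the flag that `enableApiServer` replaces is truncated in this diff.

```ts
// Inferred sketch; field names are taken verbatim from the hunks above.
interface AppServerConfigSketch {
  enableMcpServer: boolean; // MCP endpoints at `${address}/mcp` and `${address}/sse`
  enableApiServer: boolean; // registers the tRPC router at `${address}/api` via enableTrpcApi()
  enableWorker: boolean;    // embedded worker; warns if enabled while the API server is disabled
}
```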
@@ -4005,6 +4138,161 @@ async function startStdioServer(tools) {
 logger.info("🤖 MCP server listening on stdio");
 return server;
 }
+class StoreError extends Error {
+constructor(message, cause) {
+super(cause ? `${message} caused by ${cause}` : message);
+this.cause = cause;
+this.name = this.constructor.name;
+const causeError = cause instanceof Error ? cause : cause ? new Error(String(cause)) : void 0;
+if (causeError?.stack) {
+this.stack = causeError.stack;
+}
+}
+}
+class DimensionError extends StoreError {
+constructor(modelName, modelDimension, dbDimension) {
+super(
+`Model "${modelName}" produces ${modelDimension}-dimensional vectors, which exceeds the database's fixed dimension of ${dbDimension}. Please use a model with dimension ≤ ${dbDimension}.`
+);
+this.modelName = modelName;
+this.modelDimension = modelDimension;
+this.dbDimension = dbDimension;
+}
+}
+class ConnectionError extends StoreError {
+}
+const MIGRATIONS_DIR = path.join(getProjectRoot(), "db", "migrations");
+const MIGRATIONS_TABLE = "_schema_migrations";
+function ensureMigrationsTable(db) {
+db.exec(`
+CREATE TABLE IF NOT EXISTS ${MIGRATIONS_TABLE} (
+id TEXT PRIMARY KEY,
+applied_at DATETIME DEFAULT CURRENT_TIMESTAMP
+);
+`);
+}
+function getAppliedMigrations(db) {
+const stmt = db.prepare(`SELECT id FROM ${MIGRATIONS_TABLE}`);
+const rows = stmt.all();
+return new Set(rows.map((row) => row.id));
+}
+async function applyMigrations(db) {
+try {
+db.pragma("journal_mode = OFF");
+db.pragma("synchronous = OFF");
+db.pragma("mmap_size = 268435456");
+db.pragma("cache_size = -64000");
+db.pragma("temp_store = MEMORY");
+logger.debug("Applied performance optimizations for migration");
+} catch (_error) {
+logger.warn("⚠️ Could not apply all performance optimizations for migration");
+}
+const overallTransaction = db.transaction(() => {
+logger.debug("Checking database migrations...");
+ensureMigrationsTable(db);
+const appliedMigrations = getAppliedMigrations(db);
+if (!fs$1.existsSync(MIGRATIONS_DIR)) {
+throw new StoreError("Migrations directory not found");
+}
+const migrationFiles = fs$1.readdirSync(MIGRATIONS_DIR).filter((file) => file.endsWith(".sql")).sort();
+const pendingMigrations = migrationFiles.filter(
+(filename) => !appliedMigrations.has(filename)
+);
+if (pendingMigrations.length > 0) {
+logger.info(`🔄 Applying ${pendingMigrations.length} database migration(s)...`);
+}
+let appliedCount = 0;
+for (const filename of pendingMigrations) {
+logger.debug(`Applying migration: ${filename}`);
+const filePath = path.join(MIGRATIONS_DIR, filename);
+const sql = fs$1.readFileSync(filePath, "utf8");
+try {
+db.exec(sql);
+const insertStmt = db.prepare(`INSERT INTO ${MIGRATIONS_TABLE} (id) VALUES (?)`);
+insertStmt.run(filename);
+logger.debug(`✅ Applied migration: ${filename}`);
+appliedCount++;
+} catch (error) {
+logger.error(`❌ Failed to apply migration: ${filename} - ${error}`);
+throw new StoreError(`Migration failed: ${filename}`, error);
+}
+}
+if (appliedCount > 0) {
+logger.info(`✅ Successfully applied ${appliedCount} migration(s)`);
+} else {
+logger.debug("Database schema is up to date");
+}
+return appliedCount;
+});
+let retries = 0;
+let appliedMigrationsCount = 0;
+while (true) {
+try {
+appliedMigrationsCount = overallTransaction.immediate();
+logger.debug("Database migrations completed successfully");
+if (appliedMigrationsCount > 0) {
+try {
+logger.debug(
+`Running VACUUM after applying ${appliedMigrationsCount} migration(s)...`
+);
+db.exec("VACUUM");
+logger.debug("Database vacuum completed successfully");
+} catch (error) {
+logger.warn(`⚠️ Could not vacuum database after migrations: ${error}`);
+}
+} else {
+logger.debug("Skipping VACUUM - no migrations were applied");
+}
+break;
+} catch (error) {
+if (error?.code === "SQLITE_BUSY" && retries < MIGRATION_MAX_RETRIES) {
+retries++;
+logger.warn(
+`⚠️ Migrations busy (SQLITE_BUSY), retrying attempt ${retries}/${MIGRATION_MAX_RETRIES} in ${MIGRATION_RETRY_DELAY_MS}ms...`
+);
+await new Promise((resolve) => setTimeout(resolve, MIGRATION_RETRY_DELAY_MS));
+} else {
+if (error?.code === "SQLITE_BUSY") {
+logger.error(
+`❌ Migrations still busy after ${MIGRATION_MAX_RETRIES} retries. Giving up: ${error}`
+);
+}
+if (error instanceof StoreError) {
+throw error;
+}
+throw new StoreError("Failed during migration process", error);
+}
+}
+}
+try {
+db.pragma("journal_mode = WAL");
+db.pragma("wal_autocheckpoint = 1000");
+db.pragma("busy_timeout = 30000");
+db.pragma("foreign_keys = ON");
+db.pragma("synchronous = NORMAL");
+logger.debug(
+"Applied production database configuration (WAL mode, autocheckpoint, foreign keys, busy timeout)"
+);
+} catch (_error) {
+logger.warn("⚠️ Could not apply all production database settings");
+}
+}
+async function createDocumentManagement(options = {}) {
+if (options.serverUrl) {
+const { DocumentManagementClient } = await import("./DocumentManagementClient-CAFdDwTu.js");
+const client = new DocumentManagementClient(options.serverUrl);
+await client.initialize();
+return client;
+}
+const service = new (await import("./DocumentManagementService-BH02TJEe.js")).DocumentManagementService();
+await service.initialize();
+return service;
+}
+async function createLocalDocumentManagement() {
+const service = new (await import("./DocumentManagementService-BH02TJEe.js")).DocumentManagementService();
+await service.initialize();
+return service;
+}
 function deserializeJob(serializedJob) {
 return {
 ...serializedJob,
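The migration runner added above tracks applied files by name in a `_schema_migrations` table and applies pending `db/migrations/*.sql` files (such as the two new migrations in this release) inside a single immediate transaction. A minimal usage sketch, assuming a better-sqlite3 handle; `applyMigrations` is internal to the bundle, not a public export, and the database path is hypothetical:

```ts
import Database from "better-sqlite3";

const db = new Database("documents.db"); // hypothetical path

// Applies any db/migrations/*.sql not yet recorded in _schema_migrations,
// retrying the whole transaction on SQLITE_BUSY up to MIGRATION_MAX_RETRIES (5)
// with MIGRATION_RETRY_DELAY_MS (300 ms) between attempts, runs VACUUM if
// anything was applied, then restores WAL-mode production pragmas.
await applyMigrations(db);
```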
@@ -4016,21 +4304,22 @@ function deserializeJob(serializedJob) {
 }
 class PipelineClient {
 baseUrl;
+client;
 pollingInterval = 1e3;
 // 1 second
 activePolling = /* @__PURE__ */ new Set();
 // Track jobs being polled for completion
 constructor(serverUrl) {
 this.baseUrl = serverUrl.replace(/\/$/, "");
-
+this.client = createTRPCProxyClient({
+links: [httpBatchLink({ url: this.baseUrl })]
+});
+logger.debug(`PipelineClient (tRPC) created for: ${this.baseUrl}`);
 }
 async start() {
 try {
-
-
-throw new Error(`External worker health check failed: ${response.status}`);
-}
-logger.debug("PipelineClient connected to external worker");
+await this.client.ping.query();
+logger.debug("PipelineClient connected to external worker via tRPC");
 } catch (error) {
 throw new Error(
 `Failed to connect to external worker at ${this.baseUrl}: ${error instanceof Error ? error.message : String(error)}`
@@ -4043,25 +4332,14 @@ class PipelineClient {
 }
 async enqueueJob(library, version2, options) {
 try {
-const
-
-
-
-
-body: JSON.stringify({
-library,
-version: version2,
-options
-})
+const normalizedVersion = typeof version2 === "string" && version2.trim().length === 0 ? null : version2 ?? null;
+const result = await this.client.enqueueJob.mutate({
+library,
+version: normalizedVersion,
+options
 });
-
-
-throw new Error(`Failed to enqueue job: ${response.status} ${errorText}`);
-}
-const result = await response.json();
-const jobId = result.jobId;
-logger.debug(`Job ${jobId} enqueued successfully`);
-return jobId;
+logger.debug(`Job ${result.jobId} enqueued successfully`);
+return result.jobId;
 } catch (error) {
 throw new Error(
 `Failed to enqueue job: ${error instanceof Error ? error.message : String(error)}`
@@ -4070,15 +4348,8 @@ class PipelineClient {
 }
 async getJob(jobId) {
 try {
-const
-
-return void 0;
-}
-if (!response.ok) {
-throw new Error(`Failed to get job: ${response.status} ${response.statusText}`);
-}
-const serializedJob = await response.json();
-return deserializeJob(serializedJob);
+const serializedJob = await this.client.getJob.query({ id: jobId });
+return serializedJob ? deserializeJob(serializedJob) : void 0;
 } catch (error) {
 throw new Error(
 `Failed to get job ${jobId}: ${error instanceof Error ? error.message : String(error)}`
@@ -4087,18 +4358,11 @@ class PipelineClient {
 }
 async getJobs(status) {
 try {
-const
-if (status) {
-url.searchParams.set("status", status);
-}
-const response = await fetch(url.toString());
-if (!response.ok) {
-const errorText = await response.text();
-throw new Error(`Failed to get jobs: ${response.status} ${errorText}`);
-}
-const result = await response.json();
+const result = await this.client.getJobs.query({ status });
 const serializedJobs = result.jobs || [];
-return serializedJobs.map(
+return serializedJobs.map(
+(j) => deserializeJob(j)
+);
 } catch (error) {
 logger.error(`Failed to get jobs from external worker: ${error}`);
 throw error;
@@ -4106,13 +4370,7 @@ class PipelineClient {
 }
 async cancelJob(jobId) {
 try {
-
-method: "DELETE"
-});
-if (!response.ok) {
-const errorText = await response.text();
-throw new Error(`Failed to cancel job: ${response.status} ${errorText}`);
-}
+await this.client.cancelJob.mutate({ id: jobId });
 logger.debug(`Job cancelled via external worker: ${jobId}`);
 } catch (error) {
 logger.error(`Failed to cancel job ${jobId} via external worker: ${error}`);
@@ -4121,16 +4379,7 @@ class PipelineClient {
 }
 async clearCompletedJobs() {
 try {
-const
-method: "DELETE"
-});
-if (!response.ok) {
-const errorText = await response.text();
-throw new Error(
-`Failed to clear completed jobs: ${response.status} ${errorText}`
-);
-}
-const result = await response.json();
+const result = await this.client.clearCompletedJobs.mutate();
 logger.debug(`Cleared ${result.count} completed jobs via external worker`);
 return result.count || 0;
 } catch (error) {
@@ -4901,7 +5150,9 @@ class PipelineManager {
 */
 async recoverPendingJobs() {
 try {
-const runningVersions = await this.store.
+const runningVersions = await this.store.getVersionsByStatus([
+VersionStatus.RUNNING
+]);
 for (const version2 of runningVersions) {
 await this.store.updateVersionStatus(version2.id, VersionStatus.QUEUED);
 logger.info(
@@ -5056,25 +5307,25 @@ class PipelineManager {
 async enqueueJobWithStoredOptions(library, version2) {
 const normalizedVersion = version2 ?? "";
 try {
-const versionId = await this.store.
+const versionId = await this.store.ensureVersion({
 library,
-normalizedVersion
-);
-const
-if (!
+version: normalizedVersion
+});
+const stored = await this.store.getScraperOptions(versionId);
+if (!stored) {
 throw new Error(
 `No stored scraper options found for ${library}@${normalizedVersion || "unversioned"}`
 );
 }
-const storedOptions =
+const storedOptions = stored.options;
 const completeOptions = {
-url:
+url: stored.sourceUrl,
 library,
 version: normalizedVersion,
 ...storedOptions
 };
 logger.info(
-`🔄 Re-indexing ${library}@${normalizedVersion || "unversioned"} with stored options from ${
+`🔄 Re-indexing ${library}@${normalizedVersion || "unversioned"} with stored options from ${stored.sourceUrl}`
 );
 return this.enqueueJob(library, normalizedVersion, completeOptions);
 } catch (error) {
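Taken together, the `PipelineManager` hunks above wire up re-indexing from persisted scraper options: `ensureVersion` resolves (or creates) the version row, `getScraperOptions` returns the stored `options` plus the original `sourceUrl`, and the job is re-enqueued with those merged. A small sketch of that call path, assuming an already-constructed manager instance:

```ts
// Sketch only; `manager` is an existing PipelineManager from this bundle.
async function reindexFromStoredOptions(
  manager: PipelineManager,
  library: string,
  version?: string,
) {
  // Throws if no scraper options were ever stored for this library/version;
  // otherwise returns the new job id from enqueueJob.
  return manager.enqueueJobWithStoredOptions(library, version ?? "");
}
```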
@@ -5351,2020 +5602,12 @@ var PipelineFactory;
|
|
|
5351
5602
|
logger.debug(`Creating PipelineClient for external worker at: ${serverUrl}`);
|
|
5352
5603
|
return new PipelineClient(serverUrl);
|
|
5353
5604
|
}
|
|
5354
|
-
return new PipelineManager(docService, concurrency, {
|
|
5605
|
+
return new PipelineManager(docService, concurrency, {
|
|
5606
|
+
recoverJobs
|
|
5607
|
+
});
|
|
5355
5608
|
}
|
|
5356
5609
|
PipelineFactory2.createPipeline = createPipeline;
|
|
5357
5610
|
})(PipelineFactory || (PipelineFactory = {}));
|
|
5358
|
-
class SplitterError extends Error {
|
|
5359
|
-
}
|
|
5360
|
-
class MinimumChunkSizeError extends SplitterError {
|
|
5361
|
-
constructor(size, maxSize) {
|
|
5362
|
-
super(
|
|
5363
|
-
`Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
|
|
5364
|
-
);
|
|
5365
|
-
}
|
|
5366
|
-
}
|
|
5367
|
-
class ContentSplitterError extends SplitterError {
|
|
5368
|
-
}
|
|
5369
|
-
class GreedySplitter {
|
|
5370
|
-
baseSplitter;
|
|
5371
|
-
minChunkSize;
|
|
5372
|
-
preferredChunkSize;
|
|
5373
|
-
/**
|
|
5374
|
-
* Combines a base document splitter with size constraints to produce optimally-sized chunks.
|
|
5375
|
-
* The base splitter handles the initial semantic splitting, while this class handles
|
|
5376
|
-
* the concatenation strategy.
|
|
5377
|
-
*/
|
|
5378
|
-
constructor(baseSplitter, minChunkSize, preferredChunkSize) {
|
|
5379
|
-
this.baseSplitter = baseSplitter;
|
|
5380
|
-
this.minChunkSize = minChunkSize;
|
|
5381
|
-
this.preferredChunkSize = preferredChunkSize;
|
|
5382
|
-
}
|
|
5383
|
-
/**
|
|
5384
|
-
* Uses a greedy concatenation strategy to build optimally-sized chunks. Small chunks
|
|
5385
|
-
* are combined until they reach the minimum size, but splits are preserved at major
|
|
5386
|
-
* section boundaries to maintain document structure. This balances the need for
|
|
5387
|
-
* context with semantic coherence.
|
|
5388
|
-
*/
|
|
5389
|
-
async splitText(markdown) {
|
|
5390
|
-
const initialChunks = await this.baseSplitter.splitText(markdown);
|
|
5391
|
-
const concatenatedChunks = [];
|
|
5392
|
-
let currentChunk = null;
|
|
5393
|
-
for (const nextChunk of initialChunks) {
|
|
5394
|
-
if (currentChunk) {
|
|
5395
|
-
if (this.wouldExceedMaxSize(currentChunk, nextChunk)) {
|
|
5396
|
-
concatenatedChunks.push(currentChunk);
|
|
5397
|
-
currentChunk = this.cloneChunk(nextChunk);
|
|
5398
|
-
continue;
|
|
5399
|
-
}
|
|
5400
|
-
if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk)) {
|
|
5401
|
-
concatenatedChunks.push(currentChunk);
|
|
5402
|
-
currentChunk = this.cloneChunk(nextChunk);
|
|
5403
|
-
continue;
|
|
5404
|
-
}
|
|
5405
|
-
currentChunk.content += `
|
|
5406
|
-
${nextChunk.content}`;
|
|
5407
|
-
currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
|
|
5408
|
-
currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
|
|
5409
|
-
} else {
|
|
5410
|
-
currentChunk = this.cloneChunk(nextChunk);
|
|
5411
|
-
}
|
|
5412
|
-
}
|
|
5413
|
-
if (currentChunk) {
|
|
5414
|
-
concatenatedChunks.push(currentChunk);
|
|
5415
|
-
}
|
|
5416
|
-
return concatenatedChunks;
|
|
5417
|
-
}
|
|
5418
|
-
cloneChunk(chunk) {
|
|
5419
|
-
return {
|
|
5420
|
-
types: [...chunk.types],
|
|
5421
|
-
content: chunk.content,
|
|
5422
|
-
section: {
|
|
5423
|
-
level: chunk.section.level,
|
|
5424
|
-
path: [...chunk.section.path]
|
|
5425
|
-
}
|
|
5426
|
-
};
|
|
5427
|
-
}
|
|
5428
|
-
/**
|
|
5429
|
-
* H1 and H2 headings represent major conceptual breaks in the document.
|
|
5430
|
-
* Preserving these splits helps maintain the document's logical structure.
|
|
5431
|
-
*/
|
|
5432
|
-
startsNewMajorSection(chunk) {
|
|
5433
|
-
return chunk.section.level === 1 || chunk.section.level === 2;
|
|
5434
|
-
}
|
|
5435
|
-
/**
|
|
5436
|
-
* Size limit check to ensure chunks remain within embedding model constraints.
|
|
5437
|
-
* Essential for maintaining consistent embedding quality and avoiding truncation.
|
|
5438
|
-
*/
|
|
5439
|
-
wouldExceedMaxSize(currentChunk, nextChunk) {
|
|
5440
|
-
if (!currentChunk) {
|
|
5441
|
-
return false;
|
|
5442
|
-
}
|
|
5443
|
-
return currentChunk.content.length + nextChunk.content.length > this.preferredChunkSize;
|
|
5444
|
-
}
|
|
5445
|
-
/**
|
|
5446
|
-
* Checks if one path is a prefix of another path, indicating a parent-child relationship
|
|
5447
|
-
*/
|
|
5448
|
-
isPathIncluded(parentPath, childPath) {
|
|
5449
|
-
if (parentPath.length >= childPath.length) return false;
|
|
5450
|
-
return parentPath.every((part, i) => part === childPath[i]);
|
|
5451
|
-
}
|
|
5452
|
-
/**
|
|
5453
|
-
* Merges section metadata when concatenating chunks, following these rules:
|
|
5454
|
-
* 1. Level: Always uses the lowest (most general) level between chunks
|
|
5455
|
-
* 2. Path selection:
|
|
5456
|
-
* - For parent-child relationships (one path includes the other), uses the child's path
|
|
5457
|
-
* - For siblings/unrelated sections, uses the common parent path
|
|
5458
|
-
* - If no common path exists, uses the root path ([])
|
|
5459
|
-
*/
|
|
5460
|
-
mergeSectionInfo(currentChunk, nextChunk) {
|
|
5461
|
-
const level = Math.min(currentChunk.section.level, nextChunk.section.level);
|
|
5462
|
-
if (currentChunk.section.level === nextChunk.section.level && currentChunk.section.path.length === nextChunk.section.path.length && currentChunk.section.path.every((p, i) => p === nextChunk.section.path[i])) {
|
|
5463
|
-
return currentChunk.section;
|
|
5464
|
-
}
|
|
5465
|
-
if (this.isPathIncluded(currentChunk.section.path, nextChunk.section.path)) {
|
|
5466
|
-
return {
|
|
5467
|
-
path: nextChunk.section.path,
|
|
5468
|
-
level
|
|
5469
|
-
};
|
|
5470
|
-
}
|
|
5471
|
-
if (this.isPathIncluded(nextChunk.section.path, currentChunk.section.path)) {
|
|
5472
|
-
return {
|
|
5473
|
-
path: currentChunk.section.path,
|
|
5474
|
-
level
|
|
5475
|
-
};
|
|
5476
|
-
}
|
|
5477
|
-
const commonPath = this.findCommonPrefix(
|
|
5478
|
-
currentChunk.section.path,
|
|
5479
|
-
nextChunk.section.path
|
|
5480
|
-
);
|
|
5481
|
-
return {
|
|
5482
|
-
path: commonPath,
|
|
5483
|
-
level
|
|
5484
|
-
};
|
|
5485
|
-
}
|
|
5486
|
-
mergeTypes(currentTypes, nextTypes) {
|
|
5487
|
-
return [.../* @__PURE__ */ new Set([...currentTypes, ...nextTypes])];
|
|
5488
|
-
}
|
|
5489
|
-
/**
|
|
5490
|
-
* Returns longest common prefix between two paths
|
|
5491
|
-
*/
|
|
5492
|
-
findCommonPrefix(path1, path2) {
|
|
5493
|
-
const common = [];
|
|
5494
|
-
for (let i = 0; i < Math.min(path1.length, path2.length); i++) {
|
|
5495
|
-
if (path1[i] === path2[i]) {
|
|
5496
|
-
common.push(path1[i]);
|
|
5497
|
-
} else {
|
|
5498
|
-
break;
|
|
5499
|
-
}
|
|
5500
|
-
}
|
|
5501
|
-
return common;
|
|
5502
|
-
}
|
|
5503
|
-
}
|
|
5504
|
-
const fullTrim = (str) => {
|
|
5505
|
-
return str.replace(/^[\s\r\n\t]+|[\s\r\n\t]+$/g, "");
|
|
5506
|
-
};
|
|
5507
|
-
class CodeContentSplitter {
|
|
5508
|
-
constructor(options) {
|
|
5509
|
-
this.options = options;
|
|
5510
|
-
}
|
|
5511
|
-
async split(content) {
|
|
5512
|
-
const language = content.match(/^```(\w+)\n/)?.[1];
|
|
5513
|
-
const strippedContent = content.replace(/^```(\w*)\n/, "").replace(/```\s*$/, "");
|
|
5514
|
-
const lines = strippedContent.split("\n");
|
|
5515
|
-
const chunks = [];
|
|
5516
|
-
let currentChunkLines = [];
|
|
5517
|
-
for (const line of lines) {
|
|
5518
|
-
const singleLineSize = this.wrap(line, language).length;
|
|
5519
|
-
if (singleLineSize > this.options.chunkSize) {
|
|
5520
|
-
throw new MinimumChunkSizeError(singleLineSize, this.options.chunkSize);
|
|
5521
|
-
}
|
|
5522
|
-
currentChunkLines.push(line);
|
|
5523
|
-
const newChunkContent = this.wrap(currentChunkLines.join("\n"), language);
|
|
5524
|
-
const newChunkSize = newChunkContent.length;
|
|
5525
|
-
if (newChunkSize > this.options.chunkSize && currentChunkLines.length > 1) {
|
|
5526
|
-
const lastLine = currentChunkLines.pop();
|
|
5527
|
-
chunks.push(this.wrap(currentChunkLines.join("\n"), language));
|
|
5528
|
-
currentChunkLines = [lastLine];
|
|
5529
|
-
}
|
|
5530
|
-
}
|
|
5531
|
-
if (currentChunkLines.length > 0) {
|
|
5532
|
-
chunks.push(this.wrap(currentChunkLines.join("\n"), language));
|
|
5533
|
-
}
|
|
5534
|
-
return chunks;
|
|
5535
|
-
}
|
|
5536
|
-
wrap(content, language) {
|
|
5537
|
-
return `\`\`\`${language || ""}
|
|
5538
|
-
${content.replace(/\n+$/, "")}
|
|
5539
|
-
\`\`\``;
|
|
5540
|
-
}
|
|
5541
|
-
}
|
|
5542
|
-
class TableContentSplitter {
|
|
5543
|
-
constructor(options) {
|
|
5544
|
-
this.options = options;
|
|
5545
|
-
}
|
|
5546
|
-
/**
|
|
5547
|
-
* Splits table content into chunks while preserving table structure
|
|
5548
|
-
*/
|
|
5549
|
-
async split(content) {
|
|
5550
|
-
const parsedTable = this.parseTable(content);
|
|
5551
|
-
if (!parsedTable) {
|
|
5552
|
-
return [content];
|
|
5553
|
-
}
|
|
5554
|
-
const { headers, rows } = parsedTable;
|
|
5555
|
-
const chunks = [];
|
|
5556
|
-
let currentRows = [];
|
|
5557
|
-
for (const row of rows) {
|
|
5558
|
-
const singleRowSize = this.wrap(row, headers).length;
|
|
5559
|
-
if (singleRowSize > this.options.chunkSize) {
|
|
5560
|
-
throw new MinimumChunkSizeError(singleRowSize, this.options.chunkSize);
|
|
5561
|
-
}
|
|
5562
|
-
const newChunkContent = this.wrap([...currentRows, row].join("\n"), headers);
|
|
5563
|
-
const newChunkSize = newChunkContent.length;
|
|
5564
|
-
if (newChunkSize > this.options.chunkSize && currentRows.length > 0) {
|
|
5565
|
-
chunks.push(this.wrap(currentRows.join("\n"), headers));
|
|
5566
|
-
currentRows = [row];
|
|
5567
|
-
} else {
|
|
5568
|
-
currentRows.push(row);
|
|
5569
|
-
}
|
|
5570
|
-
}
|
|
5571
|
-
if (currentRows.length > 0) {
|
|
5572
|
-
chunks.push(this.wrap(currentRows.join("\n"), headers));
|
|
5573
|
-
}
|
|
5574
|
-
return chunks;
|
|
5575
|
-
}
|
|
5576
|
-
wrap(content, headers) {
|
|
5577
|
-
const headerRow = `| ${headers.join(" | ")} |`;
|
|
5578
|
-
const separatorRow = `|${headers.map(() => "---").join("|")}|`;
|
|
5579
|
-
return [headerRow, separatorRow, content].join("\n");
|
|
5580
|
-
}
|
|
5581
|
-
parseTable(content) {
|
|
5582
|
-
const lines = content.trim().split("\n");
|
|
5583
|
-
if (lines.length < 3) return null;
|
|
5584
|
-
const headers = this.parseRow(lines[0]);
|
|
5585
|
-
if (!headers) return null;
|
|
5586
|
-
const separator = lines[1];
|
|
5587
|
-
if (!this.isValidSeparator(separator)) return null;
|
|
5588
|
-
const rows = lines.slice(2).filter((row) => row.trim() !== "");
|
|
5589
|
-
return { headers, separator, rows };
|
|
5590
|
-
}
|
|
5591
|
-
/**
|
|
5592
|
-
* Parses a table row into cells
|
|
5593
|
-
*/
|
|
5594
|
-
parseRow(row) {
|
|
5595
|
-
if (!row.includes("|")) return null;
|
|
5596
|
-
return row.split("|").map((cell) => cell.trim()).filter((cell) => cell !== "");
|
|
5597
|
-
}
|
|
5598
|
-
/**
|
|
5599
|
-
* Validates the separator row of the table
|
|
5600
|
-
*/
|
|
5601
|
-
isValidSeparator(separator) {
|
|
5602
|
-
return separator.includes("|") && /^\|?[\s-|]+\|?$/.test(separator);
|
|
5603
|
-
}
|
|
5604
|
-
}
|
|
5605
|
-
class TextContentSplitter {
|
|
5606
|
-
constructor(options) {
|
|
5607
|
-
this.options = options;
|
|
5608
|
-
}
|
|
5609
|
-
/**
|
|
5610
|
-
* Splits text content into chunks while trying to preserve semantic boundaries.
|
|
5611
|
-
* Prefers paragraph breaks, then line breaks, finally falling back to word boundaries.
|
|
5612
|
-
*/
|
|
5613
|
-
async split(content) {
|
|
5614
|
-
const trimmedContent = fullTrim(content);
|
|
5615
|
-
if (trimmedContent.length <= this.options.chunkSize) {
|
|
5616
|
-
return [trimmedContent];
|
|
5617
|
-
}
|
|
5618
|
-
const words = trimmedContent.split(/\s+/);
|
|
5619
|
-
const longestWord = words.reduce(
|
|
5620
|
-
(max, word) => word.length > max.length ? word : max
|
|
5621
|
-
);
|
|
5622
|
-
if (longestWord.length > this.options.chunkSize) {
|
|
5623
|
-
throw new MinimumChunkSizeError(longestWord.length, this.options.chunkSize);
|
|
5624
|
-
}
|
|
5625
|
-
const paragraphChunks = this.splitByParagraphs(trimmedContent);
|
|
5626
|
-
if (this.areChunksValid(paragraphChunks)) {
|
|
5627
|
-
return paragraphChunks;
|
|
5628
|
-
}
|
|
5629
|
-
const lineChunks = this.splitByLines(trimmedContent);
|
|
5630
|
-
if (this.areChunksValid(lineChunks)) {
|
|
5631
|
-
return this.mergeChunks(lineChunks, "\n");
|
|
5632
|
-
}
|
|
5633
|
-
const wordChunks = await this.splitByWords(trimmedContent);
|
|
5634
|
-
return this.mergeChunks(wordChunks, " ");
|
|
5635
|
-
}
|
|
5636
|
-
/**
|
|
5637
|
-
* Checks if all chunks are within the maximum size limit
|
|
5638
|
-
*/
|
|
5639
|
-
areChunksValid(chunks) {
|
|
5640
|
-
return chunks.every((chunk) => chunk.length <= this.options.chunkSize);
|
|
5641
|
-
}
|
|
5642
|
-
/**
|
|
5643
|
-
* Splits text into chunks by paragraph boundaries (double newlines)
|
|
5644
|
-
*/
|
|
5645
|
-
splitByParagraphs(text) {
|
|
5646
|
-
const paragraphs = text.split(/\n\s*\n/).map((p) => fullTrim(p)).filter(Boolean);
|
|
5647
|
-
return paragraphs.filter((chunk) => chunk.length > 2);
|
|
5648
|
-
}
|
|
5649
|
-
/**
|
|
5650
|
-
* Splits text into chunks by line boundaries
|
|
5651
|
-
*/
|
|
5652
|
-
splitByLines(text) {
|
|
5653
|
-
const lines = text.split(/\n/).map((line) => fullTrim(line)).filter(Boolean);
|
|
5654
|
-
return lines.filter((chunk) => chunk.length > 1);
|
|
5655
|
-
}
|
|
5656
|
-
/**
|
|
5657
|
-
* Uses LangChain's recursive splitter for word-based splitting as a last resort
|
|
5658
|
-
*/
|
|
5659
|
-
async splitByWords(text) {
|
|
5660
|
-
const splitter = new RecursiveCharacterTextSplitter({
|
|
5661
|
-
chunkSize: this.options.chunkSize,
|
|
5662
|
-
chunkOverlap: 0
|
|
5663
|
-
});
|
|
5664
|
-
const chunks = await splitter.splitText(text);
|
|
5665
|
-
return chunks;
|
|
5666
|
-
}
|
|
5667
|
-
/**
|
|
5668
|
-
* Attempts to merge small chunks with previous chunks to minimize fragmentation.
|
|
5669
|
-
* Only merges if combined size is within maxChunkSize.
|
|
5670
|
-
*/
|
|
5671
|
-
mergeChunks(chunks, separator) {
|
|
5672
|
-
const mergedChunks = [];
|
|
5673
|
-
let currentChunk = null;
|
|
5674
|
-
for (const chunk of chunks) {
|
|
5675
|
-
if (currentChunk === null) {
|
|
5676
|
-
currentChunk = chunk;
|
|
5677
|
-
continue;
|
|
5678
|
-
}
|
|
5679
|
-
const currentChunkSize = this.getChunkSize(currentChunk);
|
|
5680
|
-
const nextChunkSize = this.getChunkSize(chunk);
|
|
5681
|
-
if (currentChunkSize + nextChunkSize + separator.length <= this.options.chunkSize) {
|
|
5682
|
-
currentChunk = `${currentChunk}${separator}${chunk}`;
|
|
5683
|
-
} else {
|
|
5684
|
-
mergedChunks.push(currentChunk);
|
|
5685
|
-
currentChunk = chunk;
|
|
5686
|
-
}
|
|
5687
|
-
}
|
|
5688
|
-
if (currentChunk) {
|
|
5689
|
-
mergedChunks.push(currentChunk);
|
|
5690
|
-
}
|
|
5691
|
-
return mergedChunks;
|
|
5692
|
-
}
|
|
5693
|
-
getChunkSize(chunk) {
|
|
5694
|
-
return chunk.length;
|
|
5695
|
-
}
|
|
5696
|
-
wrap(content) {
|
|
5697
|
-
return content;
|
|
5698
|
-
}
|
|
5699
|
-
}
|
|
5700
|
-
class SemanticMarkdownSplitter {
  constructor(preferredChunkSize, maxChunkSize) {
    this.preferredChunkSize = preferredChunkSize;
    this.maxChunkSize = maxChunkSize;
    this.turndownService = new TurndownService({
      headingStyle: "atx",
      hr: "---",
      bulletListMarker: "-",
      codeBlockStyle: "fenced",
      emDelimiter: "_",
      strongDelimiter: "**",
      linkStyle: "inlined"
    });
    this.turndownService.addRule("table", {
      filter: ["table"],
      replacement: (_content, node) => {
        const table = node;
        const headers = Array.from(table.querySelectorAll("th")).map(
          (th) => th.textContent?.trim() || ""
        );
        const rows = Array.from(table.querySelectorAll("tr")).filter(
          (tr) => !tr.querySelector("th")
        );
        if (headers.length === 0 && rows.length === 0) return "";
        let markdown = "\n";
        if (headers.length > 0) {
          markdown += `| ${headers.join(" | ")} |
`;
          markdown += `|${headers.map(() => "---").join("|")}|
`;
        }
        for (const row of rows) {
          const cells = Array.from(row.querySelectorAll("td")).map(
            (td) => td.textContent?.trim() || ""
          );
          markdown += `| ${cells.join(" | ")} |
`;
        }
        return markdown;
      }
    });
    this.textSplitter = new TextContentSplitter({
      chunkSize: this.preferredChunkSize
    });
    this.codeSplitter = new CodeContentSplitter({
      chunkSize: this.maxChunkSize
    });
    this.tableSplitter = new TableContentSplitter({
      chunkSize: this.maxChunkSize
    });
  }
  turndownService;
  textSplitter;
  codeSplitter;
  tableSplitter;
  /**
   * Main entry point for splitting markdown content
   */
  async splitText(markdown) {
    const html = await this.markdownToHtml(markdown);
    const dom = await this.parseHtml(html);
    const sections = await this.splitIntoSections(dom);
    return this.splitSectionContent(sections);
  }
  /**
   * Step 1: Split document into sections based on H1-H6 headings,
   * as well as code blocks and tables.
   */
  async splitIntoSections(dom) {
    const body = dom.querySelector("body");
    if (!body) {
      throw new Error("Invalid HTML structure: no body element found");
    }
    let currentSection = this.createRootSection();
    const sections = [];
    const stack = [currentSection];
    for (const element of Array.from(body.children)) {
      const headingMatch = element.tagName.match(/H([1-6])/);
      if (headingMatch) {
        const level = Number.parseInt(headingMatch[1], 10);
        const title = fullTrim(element.textContent || "");
        while (stack.length > 1 && stack[stack.length - 1].level >= level) {
          stack.pop();
        }
        currentSection = {
          level,
          path: [
            ...stack.slice(1).reduce((acc, s) => {
              const lastPath = s.path[s.path.length - 1];
              if (lastPath) acc.push(lastPath);
              return acc;
            }, []),
            title
          ],
          content: [
            {
              type: "heading",
              text: `${"#".repeat(level)} ${title}`
            }
          ]
        };
        sections.push(currentSection);
        stack.push(currentSection);
      } else if (element.tagName === "PRE") {
        const code = element.querySelector("code");
        const language = code?.className.replace("language-", "") || "";
        const content = code?.textContent || element.textContent || "";
        const markdown = `${"```"}${language}
${content}
${"```"}`;
        currentSection = {
          level: currentSection.level,
          path: currentSection.path,
          content: [
            {
              type: "code",
              text: markdown
            }
          ]
        };
        sections.push(currentSection);
      } else if (element.tagName === "TABLE") {
        const markdown = fullTrim(this.turndownService.turndown(element.outerHTML));
        currentSection = {
          level: currentSection.level,
          path: currentSection.path,
          content: [
            {
              type: "table",
              text: markdown
            }
          ]
        };
        sections.push(currentSection);
      } else {
        const markdown = fullTrim(this.turndownService.turndown(element.innerHTML));
        if (markdown) {
          currentSection = {
            level: currentSection.level,
            path: currentSection.path,
            content: [
              {
                type: "text",
                text: markdown
              }
            ]
          };
          sections.push(currentSection);
        }
      }
    }
    return sections;
  }
  /**
   * Step 2: Split section content into smaller chunks
   */
  async splitSectionContent(sections) {
    const chunks = [];
    for (const section of sections) {
      for (const content of section.content) {
        let splitContent = [];
        try {
          switch (content.type) {
            case "heading":
            case "text": {
              splitContent = await this.textSplitter.split(content.text);
              break;
            }
            case "code": {
              splitContent = await this.codeSplitter.split(content.text);
              break;
            }
            case "table": {
              splitContent = await this.tableSplitter.split(content.text);
              break;
            }
          }
        } catch (err) {
          if (err instanceof MinimumChunkSizeError) {
            logger.warn(
              `⚠ Cannot split ${content.type} chunk normally, using RecursiveCharacterTextSplitter: ${err.message}`
            );
            const splitter = new RecursiveCharacterTextSplitter({
              chunkSize: this.maxChunkSize,
              chunkOverlap: Math.min(20, Math.floor(this.maxChunkSize * 0.1)),
              // Use more aggressive separators including empty string as last resort
              separators: [
                "\n\n",
                "\n",
                " ",
                " ",
                ".",
                ",",
                ";",
                ":",
                "-",
                "(",
                ")",
                "[",
                "]",
                "{",
                "}",
                ""
              ]
            });
            const chunks2 = await splitter.splitText(content.text);
            if (chunks2.length === 0) {
              splitContent = [content.text.substring(0, this.maxChunkSize)];
            } else {
              splitContent = chunks2;
            }
          } else {
            const errMessage = err instanceof Error ? err.message : String(err);
            throw new ContentSplitterError(
              `Failed to split ${content.type} content: ${errMessage}`
            );
          }
        }
        chunks.push(
          ...splitContent.map(
            (text) => ({
              types: [content.type],
              content: text,
              section: {
                level: section.level,
                path: section.path
              }
            })
          )
        );
      }
    }
    return chunks;
  }
  /**
   * Helper to create the root section
   */
  createRootSection() {
    return {
      level: 0,
      path: [],
      content: []
    };
  }
  /**
   * Convert markdown to HTML using remark
   */
  async markdownToHtml(markdown) {
    const html = await unified().use(remarkParse).use(remarkGfm).use(remarkHtml).process(markdown);
    return `<!DOCTYPE html>
<html>
<body>
${String(html)}
</body>
</html>`;
  }
  /**
   * Parse HTML
   */
  async parseHtml(html) {
    const { window } = createJSDOM(html);
    return window.document;
  }
}
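// A usage sketch for SemanticMarkdownSplitter, assuming the character-based
// chunk sizes defined elsewhere in this bundle (illustrative only):
//
//   const splitter = new SemanticMarkdownSplitter(1500, 5000);
//   const chunks = await splitter.splitText("# Intro\n\nSome text.");
//   // => roughly [{ types: ["heading"], content: "# Intro", section: {...} },
//   //             { types: ["text"], content: "Some text.", section: {...} }]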
const CHILD_LIMIT = 5;
const SIBLING_LIMIT = 2;
class DocumentRetrieverService {
  documentStore;
  constructor(documentStore) {
    this.documentStore = documentStore;
  }
  /**
   * Collects all related chunk IDs for a given initial hit.
   * Returns an object with url, hitId, relatedIds (Set), and score.
   */
  async getRelatedChunkIds(library, version2, doc, siblingLimit = SIBLING_LIMIT, childLimit = CHILD_LIMIT) {
    const id = doc.id;
    const url = doc.metadata.url;
    const score = doc.metadata.score;
    const relatedIds = /* @__PURE__ */ new Set();
    relatedIds.add(id);
    const parent = await this.documentStore.findParentChunk(library, version2, id);
    if (parent) {
      relatedIds.add(parent.id);
    }
    const precedingSiblings = await this.documentStore.findPrecedingSiblingChunks(
      library,
      version2,
      id,
      siblingLimit
    );
    for (const sib of precedingSiblings) {
      relatedIds.add(sib.id);
    }
    const childChunks = await this.documentStore.findChildChunks(
      library,
      version2,
      id,
      childLimit
    );
    for (const child of childChunks) {
      relatedIds.add(child.id);
    }
    const subsequentSiblings = await this.documentStore.findSubsequentSiblingChunks(
      library,
      version2,
      id,
      siblingLimit
    );
    for (const sib of subsequentSiblings) {
      relatedIds.add(sib.id);
    }
    return { url, hitId: id, relatedIds, score };
  }
  /**
   * Groups related chunk info by URL, deduplicates IDs, and finds max score per URL.
   */
  groupAndPrepareFetch(relatedInfos) {
    const urlMap = /* @__PURE__ */ new Map();
    for (const info of relatedInfos) {
      let entry = urlMap.get(info.url);
      if (!entry) {
        entry = { uniqueChunkIds: /* @__PURE__ */ new Set(), maxScore: info.score };
        urlMap.set(info.url, entry);
      }
      for (const id of info.relatedIds) {
        entry.uniqueChunkIds.add(id);
      }
      if (info.score > entry.maxScore) {
        entry.maxScore = info.score;
      }
    }
    return urlMap;
  }
  /**
   * Finalizes the merged result for a URL group by fetching, sorting, and joining content.
   */
  async finalizeResult(library, version2, url, uniqueChunkIds, maxScore) {
    const ids = Array.from(uniqueChunkIds);
    const docs = await this.documentStore.findChunksByIds(library, version2, ids);
    const content = docs.map((d) => d.pageContent).join("\n\n");
    return {
      url,
      content,
      score: maxScore
    };
  }
  /**
   * Searches for documents and expands the context around the matches.
   * @param library The library name.
   * @param version The library version (optional, defaults to searching documents without a version).
   * @param query The search query.
   * @param limit The optional limit for the initial search results.
   * @returns An array of result objects, each carrying the aggregated content, URL, and score of the retrieved chunks.
   */
  async search(library, version2, query, limit) {
    const normalizedVersion = (version2 ?? "").toLowerCase();
    const initialResults = await this.documentStore.findByContent(
      library,
      normalizedVersion,
      query,
      limit ?? 10
    );
    const relatedInfos = await Promise.all(
      initialResults.map(
        (doc) => this.getRelatedChunkIds(library, normalizedVersion, doc)
      )
    );
    const urlMap = this.groupAndPrepareFetch(relatedInfos);
    const results = [];
    for (const [url, { uniqueChunkIds, maxScore }] of urlMap.entries()) {
      const result = await this.finalizeResult(
        library,
        normalizedVersion,
        url,
        uniqueChunkIds,
        maxScore
      );
      results.push(result);
    }
    return results;
  }
}
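// A usage sketch for the retriever above, assuming an initialized DocumentStore
// instance named `store` (illustrative only):
//
//   const retriever = new DocumentRetrieverService(store);
//   const results = await retriever.search("react", "18.2.0", "useEffect cleanup", 5);
//   // => [{ url, content, score }, ...], one entry per URL, with parent,
//   //    sibling, and child chunks merged around each hit.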
class StoreError extends Error {
  constructor(message, cause) {
    super(cause ? `${message} caused by ${cause}` : message);
    this.cause = cause;
    this.name = this.constructor.name;
    const causeError = cause instanceof Error ? cause : cause ? new Error(String(cause)) : void 0;
    if (causeError?.stack) {
      this.stack = causeError.stack;
    }
  }
}
class DimensionError extends StoreError {
  constructor(modelName, modelDimension, dbDimension) {
    super(
      `Model "${modelName}" produces ${modelDimension}-dimensional vectors, which exceeds the database's fixed dimension of ${dbDimension}. Please use a model with dimension ≤ ${dbDimension}.`
    );
    this.modelName = modelName;
    this.modelDimension = modelDimension;
    this.dbDimension = dbDimension;
  }
}
class ConnectionError extends StoreError {
}
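// Cause-chaining sketch (illustrative; `openStore` is hypothetical): the wrapper
// keeps the original error's stack while extending the message.
//
//   try {
//     openStore();
//   } catch (err) {
//     throw new StoreError("Failed to open store", err);
//     // message: "Failed to open store caused by ...", stack copied from err
//   }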
const MIGRATIONS_DIR = path.join(getProjectRoot(), "db", "migrations");
const MIGRATIONS_TABLE = "_schema_migrations";
function ensureMigrationsTable(db) {
  db.exec(`
    CREATE TABLE IF NOT EXISTS ${MIGRATIONS_TABLE} (
      id TEXT PRIMARY KEY,
      applied_at DATETIME DEFAULT CURRENT_TIMESTAMP
    );
  `);
}
function getAppliedMigrations(db) {
  const stmt = db.prepare(`SELECT id FROM ${MIGRATIONS_TABLE}`);
  const rows = stmt.all();
  return new Set(rows.map((row) => row.id));
}
async function applyMigrations(db) {
  try {
    db.pragma("journal_mode = OFF");
    db.pragma("synchronous = OFF");
    db.pragma("mmap_size = 268435456");
    db.pragma("cache_size = -64000");
    db.pragma("temp_store = MEMORY");
    logger.debug("Applied performance optimizations for migration");
  } catch (_error) {
    logger.warn("⚠️ Could not apply all performance optimizations for migration");
  }
  const overallTransaction = db.transaction(() => {
    logger.debug("Checking database migrations...");
    ensureMigrationsTable(db);
    const appliedMigrations = getAppliedMigrations(db);
    if (!fs$1.existsSync(MIGRATIONS_DIR)) {
      throw new StoreError("Migrations directory not found");
    }
    const migrationFiles = fs$1.readdirSync(MIGRATIONS_DIR).filter((file) => file.endsWith(".sql")).sort();
    const pendingMigrations = migrationFiles.filter(
      (filename) => !appliedMigrations.has(filename)
    );
    if (pendingMigrations.length > 0) {
      logger.info(`🔄 Applying ${pendingMigrations.length} database migration(s)...`);
    }
    let appliedCount = 0;
    for (const filename of pendingMigrations) {
      logger.debug(`Applying migration: ${filename}`);
      const filePath = path.join(MIGRATIONS_DIR, filename);
      const sql = fs$1.readFileSync(filePath, "utf8");
      try {
        db.exec(sql);
        const insertStmt = db.prepare(`INSERT INTO ${MIGRATIONS_TABLE} (id) VALUES (?)`);
        insertStmt.run(filename);
        logger.debug(`✅ Applied migration: ${filename}`);
        appliedCount++;
      } catch (error) {
        logger.error(`❌ Failed to apply migration: ${filename} - ${error}`);
        throw new StoreError(`Migration failed: ${filename}`, error);
      }
    }
    if (appliedCount > 0) {
      logger.info(`✅ Successfully applied ${appliedCount} migration(s)`);
    } else {
      logger.debug("Database schema is up to date");
    }
    return appliedCount;
  });
  let retries = 0;
  let appliedMigrationsCount = 0;
  while (true) {
    try {
      appliedMigrationsCount = overallTransaction.immediate();
      logger.debug("Database migrations completed successfully");
      if (appliedMigrationsCount > 0) {
        try {
          logger.debug(
            `Running VACUUM after applying ${appliedMigrationsCount} migration(s)...`
          );
          db.exec("VACUUM");
          logger.debug("Database vacuum completed successfully");
        } catch (error) {
          logger.warn(`⚠️ Could not vacuum database after migrations: ${error}`);
        }
      } else {
        logger.debug("Skipping VACUUM - no migrations were applied");
      }
      break;
    } catch (error) {
      if (error?.code === "SQLITE_BUSY" && retries < MIGRATION_MAX_RETRIES) {
        retries++;
        logger.warn(
          `⚠️ Migrations busy (SQLITE_BUSY), retrying attempt ${retries}/${MIGRATION_MAX_RETRIES} in ${MIGRATION_RETRY_DELAY_MS}ms...`
        );
        await new Promise((resolve) => setTimeout(resolve, MIGRATION_RETRY_DELAY_MS));
      } else {
        if (error?.code === "SQLITE_BUSY") {
          logger.error(
            `❌ Migrations still busy after ${MIGRATION_MAX_RETRIES} retries. Giving up: ${error}`
          );
        }
        if (error instanceof StoreError) {
          throw error;
        }
        throw new StoreError("Failed during migration process", error);
      }
    }
  }
  try {
    db.pragma("journal_mode = WAL");
    db.pragma("wal_autocheckpoint = 1000");
    db.pragma("busy_timeout = 30000");
    db.pragma("foreign_keys = ON");
    db.pragma("synchronous = NORMAL");
    logger.debug(
      "Applied production database configuration (WAL mode, autocheckpoint, foreign keys, busy timeout)"
    );
  } catch (_error) {
    logger.warn("⚠️ Could not apply all production database settings");
  }
}
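// A sketch of driving the migration runner above directly, assuming the
// better-sqlite3 Database and sqlite-vec imports at the top of this bundle
// (illustrative only):
//
//   const db = new Database("/tmp/documents.db");
//   sqliteVec.load(db);
//   await applyMigrations(db); // retries on SQLITE_BUSY, then re-enables WAL mode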
class DocumentStore {
  db;
  embeddings;
  dbDimension = VECTOR_DIMENSION;
  modelDimension;
  statements;
  /**
   * Calculates Reciprocal Rank Fusion score for a result
   */
  calculateRRF(vecRank, ftsRank, k = 60) {
    let rrf = 0;
    if (vecRank !== void 0) {
      rrf += 1 / (k + vecRank);
    }
    if (ftsRank !== void 0) {
      rrf += 1 / (k + ftsRank);
    }
    return rrf;
  }
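  // Worked example (illustrative): with the default k = 60, a result ranked 1st
  // by vector search and 3rd by full-text search scores
  // 1/(60+1) + 1/(60+3) ≈ 0.0164 + 0.0159 ≈ 0.0323; a result found by only one
  // ranker keeps just its single reciprocal term.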
  /**
   * Assigns ranks to search results based on their scores
   */
  assignRanks(results) {
    const vecRanks = /* @__PURE__ */ new Map();
    const ftsRanks = /* @__PURE__ */ new Map();
    results.filter((r) => r.vec_score !== void 0)
      .sort((a, b) => (b.vec_score ?? 0) - (a.vec_score ?? 0))
      .forEach((result, index) => {
        vecRanks.set(Number(result.id), index + 1);
      });
    results.filter((r) => r.fts_score !== void 0)
      .sort((a, b) => (b.fts_score ?? 0) - (a.fts_score ?? 0))
      .forEach((result, index) => {
        ftsRanks.set(Number(result.id), index + 1);
      });
    return results.map((result) => ({
      ...result,
      vec_rank: vecRanks.get(Number(result.id)),
      fts_rank: ftsRanks.get(Number(result.id)),
      rrf_score: this.calculateRRF(
        vecRanks.get(Number(result.id)),
        ftsRanks.get(Number(result.id))
      )
    }));
  }
  constructor(dbPath) {
    if (!dbPath) {
      throw new StoreError("Missing required database path");
    }
    this.db = new Database(dbPath);
  }
  /**
   * Sets up prepared statements for database queries
   */
  prepareStatements() {
    const statements = {
      getById: this.db.prepare("SELECT * FROM documents WHERE id = ?"),
      insertDocument: this.db.prepare(
        "INSERT INTO documents (library_id, version_id, url, content, metadata, sort_order, indexed_at) VALUES (?, ?, ?, ?, ?, ?, ?)"
      ),
      insertEmbedding: this.db.prepare(
        "INSERT INTO documents_vec (rowid, library_id, version_id, embedding) VALUES (?, ?, ?, ?)"
      ),
      insertLibrary: this.db.prepare(
        "INSERT INTO libraries (name) VALUES (?) ON CONFLICT(name) DO NOTHING"
      ),
      getLibraryIdByName: this.db.prepare(
        "SELECT id FROM libraries WHERE name = ?"
      ),
      // New version-related statements
      insertVersion: this.db.prepare(
        "INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING"
      ),
      resolveVersionId: this.db.prepare(
        "SELECT id FROM versions WHERE library_id = ? AND name IS ?"
      ),
      getVersionById: this.db.prepare("SELECT * FROM versions WHERE id = ?"),
      queryVersionsByLibraryId: this.db.prepare(
        "SELECT * FROM versions WHERE library_id = ? ORDER BY name"
      ),
      deleteLibraryDocuments: this.db.prepare(
        `DELETE FROM documents
         WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
           AND version_id = (
             SELECT v.id FROM versions v
             WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
               AND COALESCE(v.name, '') = COALESCE(?, '')
           )`
      ),
      deleteDocuments: this.db.prepare(
        `DELETE FROM documents
         WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
           AND version_id = (
             SELECT v.id FROM versions v
             WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
               AND COALESCE(v.name, '') = COALESCE(?, '')
           )`
      ),
      deleteDocumentsByUrl: this.db.prepare(
        `DELETE FROM documents
         WHERE url = ?
           AND library_id = (SELECT id FROM libraries WHERE name = ?)
           AND version_id = (
             SELECT v.id FROM versions v
             WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
               AND COALESCE(v.name, '') = COALESCE(?, '')
           )`
      ),
      getDocumentBySort: this.db.prepare(
        `SELECT d.id
         FROM documents d
         JOIN versions v ON d.version_id = v.id
         JOIN libraries l ON v.library_id = l.id
         WHERE l.name = ?
           AND COALESCE(v.name, '') = COALESCE(?, '')
         LIMIT 1`
      ),
      queryVersions: this.db.prepare(
        `SELECT DISTINCT v.name
         FROM versions v
         JOIN libraries l ON v.library_id = l.id
         WHERE l.name = ?
         ORDER BY v.name`
      ),
      checkExists: this.db.prepare(
        `SELECT d.id FROM documents d
         JOIN versions v ON d.version_id = v.id
         JOIN libraries l ON v.library_id = l.id
         WHERE l.name = ?
           AND COALESCE(v.name, '') = COALESCE(?, '')
         LIMIT 1`
      ),
      queryLibraryVersions: this.db.prepare(
        `SELECT
           l.name as library,
           v.name as version,
           COUNT(*) as documentCount,
           COUNT(DISTINCT d.url) as uniqueUrlCount,
           MIN(d.indexed_at) as indexedAt
         FROM documents d
         JOIN versions v ON d.version_id = v.id
         JOIN libraries l ON v.library_id = l.id
         GROUP BY l.name, v.name
         ORDER BY l.name, v.name`
      ),
      getChildChunks: this.db.prepare(`
        SELECT d.* FROM documents d
        JOIN versions v ON d.version_id = v.id
        JOIN libraries l ON v.library_id = l.id
        WHERE l.name = ?
          AND COALESCE(v.name, '') = COALESCE(?, '')
          AND d.url = ?
          AND json_array_length(json_extract(d.metadata, '$.path')) = ?
          AND json_extract(d.metadata, '$.path') LIKE ? || '%'
          AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
        ORDER BY d.sort_order
        LIMIT ?
      `),
      getPrecedingSiblings: this.db.prepare(`
        SELECT d.* FROM documents d
        JOIN versions v ON d.version_id = v.id
        JOIN libraries l ON v.library_id = l.id
        WHERE l.name = ?
          AND COALESCE(v.name, '') = COALESCE(?, '')
          AND d.url = ?
          AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
          AND json_extract(d.metadata, '$.path') = ?
        ORDER BY d.sort_order DESC
        LIMIT ?
      `),
      getSubsequentSiblings: this.db.prepare(`
        SELECT d.* FROM documents d
        JOIN versions v ON d.version_id = v.id
        JOIN libraries l ON v.library_id = l.id
        WHERE l.name = ?
          AND COALESCE(v.name, '') = COALESCE(?, '')
          AND d.url = ?
          AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
          AND json_extract(d.metadata, '$.path') = ?
        ORDER BY d.sort_order
        LIMIT ?
      `),
      getParentChunk: this.db.prepare(`
        SELECT d.* FROM documents d
        JOIN versions v ON d.version_id = v.id
        JOIN libraries l ON v.library_id = l.id
        WHERE l.name = ?
          AND COALESCE(v.name, '') = COALESCE(?, '')
          AND d.url = ?
          AND json_extract(d.metadata, '$.path') = ?
          AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
        ORDER BY d.sort_order DESC
        LIMIT 1
      `),
      // Status tracking statements
      updateVersionStatus: this.db.prepare(
        "UPDATE versions SET status = ?, error_message = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
      ),
      updateVersionProgress: this.db.prepare(
        "UPDATE versions SET progress_pages = ?, progress_max_pages = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
      ),
      getVersionsByStatus: this.db.prepare(
        "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.status IN (SELECT value FROM json_each(?))"
      ),
      getRunningVersions: this.db.prepare(
        "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.status = 'running' ORDER BY v.started_at"
      ),
      getActiveVersions: this.db.prepare(
        "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.status IN ('queued', 'running', 'updating') ORDER BY v.created_at"
      ),
      // Scraper options statements
      updateVersionScraperOptions: this.db.prepare(
        "UPDATE versions SET source_url = ?, scraper_options = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
      ),
      getVersionWithOptions: this.db.prepare(
        "SELECT * FROM versions WHERE id = ?"
      ),
      getVersionsBySourceUrl: this.db.prepare(
        "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.source_url = ? ORDER BY v.created_at DESC"
      )
    };
    this.statements = statements;
  }
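  // Note on the recurring COALESCE(v.name, '') = COALESCE(?, '') predicate:
  // it maps both NULL and '' to '', so an empty-string parameter matches
  // unversioned rows however the version name happens to be stored.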
  /**
   * Pads a vector to the fixed database dimension by appending zeros.
   * Throws an error if the input vector is longer than the database dimension.
   */
  padVector(vector) {
    if (vector.length > this.dbDimension) {
      throw new Error(
        `Vector dimension ${vector.length} exceeds database dimension ${this.dbDimension}`
      );
    }
    if (vector.length === this.dbDimension) {
      return vector;
    }
    return [...vector, ...new Array(this.dbDimension - vector.length).fill(0)];
  }
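  // e.g. with dbDimension = 4: padVector([0.1, 0.2]) => [0.1, 0.2, 0, 0],
  // while padVector([1, 2, 3, 4, 5]) throws because 5 > 4.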
  /**
   * Initializes embeddings client using environment variables for configuration.
   *
   * The embedding model is configured using DOCS_MCP_EMBEDDING_MODEL environment variable.
   * Format: "provider:model_name" (e.g., "google:text-embedding-004") or just "model_name"
   * for OpenAI (default).
   *
   * Supported providers and their required environment variables:
   * - openai: OPENAI_API_KEY (and optionally OPENAI_API_BASE, OPENAI_ORG_ID)
   * - google: GOOGLE_APPLICATION_CREDENTIALS (path to service account JSON)
   * - aws: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION (or BEDROCK_AWS_REGION)
   * - microsoft: Azure OpenAI credentials (AZURE_OPENAI_API_*)
   */
  async initializeEmbeddings() {
    const modelSpec = process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
    const { createEmbeddingModel } = await import("./EmbeddingFactory-CElwVk3X.js");
    this.embeddings = createEmbeddingModel(modelSpec);
    const testVector = await this.embeddings.embedQuery("test");
    this.modelDimension = testVector.length;
    if (this.modelDimension > this.dbDimension) {
      throw new DimensionError(modelSpec, this.modelDimension, this.dbDimension);
    }
  }
  /**
   * Escapes a query string for use with SQLite FTS5 MATCH operator.
   * Wraps the query in double quotes and escapes internal double quotes.
   */
  escapeFtsQuery(query) {
    const escapedQuotes = query.replace(/"/g, '""');
    return `"${escapedQuotes}"`;
  }
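  // e.g. escapeFtsQuery('say "hi"') => '"say ""hi"""', so FTS5 sees one quoted
  // phrase instead of interpreting the input as match syntax.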
  /**
   * Initializes database connection and ensures readiness
   */
  async initialize() {
    try {
      sqliteVec.load(this.db);
      applyMigrations(this.db);
      this.prepareStatements();
      await this.initializeEmbeddings();
    } catch (error) {
      if (error instanceof StoreError) {
        throw error;
      }
      throw new ConnectionError("Failed to initialize database connection", error);
    }
  }
  /**
   * Gracefully closes database connections
   */
  async shutdown() {
    this.db.close();
  }
  /**
   * Resolves a library name and version string to library_id and version_id.
   * Creates library and version records if they don't exist.
   */
  async resolveLibraryAndVersionIds(library, version2) {
    const normalizedLibrary = library.toLowerCase();
    const normalizedVersion = denormalizeVersionName(version2.toLowerCase());
    this.statements.insertLibrary.run(normalizedLibrary);
    const libraryIdRow = this.statements.getLibraryIdByName.get(normalizedLibrary);
    if (!libraryIdRow || typeof libraryIdRow.id !== "number") {
      throw new StoreError(`Failed to resolve library_id for library: ${library}`);
    }
    const libraryId = libraryIdRow.id;
    this.statements.insertVersion.run(libraryId, normalizedVersion);
    const versionIdRow = this.statements.resolveVersionId.get(
      libraryId,
      normalizedVersion
    );
    if (!versionIdRow || typeof versionIdRow.id !== "number") {
      throw new StoreError(
        `Failed to resolve version_id for library: ${library}, version: ${version2}`
      );
    }
    return { libraryId, versionId: versionIdRow.id };
  }
  /**
   * Retrieves all unique versions for a specific library
   */
  async queryUniqueVersions(library) {
    try {
      const rows = this.statements.queryVersions.all(library.toLowerCase());
      return rows.map((row) => normalizeVersionName(row.name));
    } catch (error) {
      throw new ConnectionError("Failed to query versions", error);
    }
  }
  /**
   * Updates the status of a version record in the database.
   * @param versionId The version ID to update
   * @param status The new status to set
   * @param errorMessage Optional error message for failed statuses
   */
  async updateVersionStatus(versionId, status, errorMessage) {
    try {
      this.statements.updateVersionStatus.run(status, errorMessage ?? null, versionId);
    } catch (error) {
      throw new StoreError(`Failed to update version status: ${error}`);
    }
  }
  /**
   * Updates the progress counters for a version being indexed.
   * @param versionId The version ID to update
   * @param pages Current number of pages processed
   * @param maxPages Total number of pages to process
   */
  async updateVersionProgress(versionId, pages, maxPages) {
    try {
      this.statements.updateVersionProgress.run(pages, maxPages, versionId);
    } catch (error) {
      throw new StoreError(`Failed to update version progress: ${error}`);
    }
  }
  /**
   * Retrieves versions by their status.
   * @param statuses Array of statuses to filter by
   * @returns Array of version records matching the statuses
   */
  async getVersionsByStatus(statuses) {
    try {
      const statusJson = JSON.stringify(statuses);
      const rows = this.statements.getVersionsByStatus.all(statusJson);
      return rows;
    } catch (error) {
      throw new StoreError(`Failed to get versions by status: ${error}`);
    }
  }
  /**
   * Retrieves all versions currently in RUNNING status.
   * @returns Array of running version records with library names
   */
  async getRunningVersions() {
    try {
      const rows = this.statements.getRunningVersions.all();
      return rows;
    } catch (error) {
      throw new StoreError(`Failed to get running versions: ${error}`);
    }
  }
  /**
   * Retrieves all versions in active states (queued, running, updating).
   * @returns Array of active version records with library names
   */
  async getActiveVersions() {
    try {
      const rows = this.statements.getActiveVersions.all();
      return rows;
    } catch (error) {
      throw new StoreError(`Failed to get active versions: ${error}`);
    }
  }
  /**
   * Stores scraper options for a version to enable reproducible indexing.
   * @param versionId The version ID to update
   * @param options Complete scraper options used for indexing
   */
  async storeScraperOptions(versionId, options) {
    try {
      const { url: source_url, library, version: version2, signal, ...scraper_options } = options;
      const optionsJson = JSON.stringify(scraper_options);
      this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId);
    } catch (error) {
      throw new StoreError(`Failed to store scraper options: ${error}`);
    }
  }
  /**
   * Retrieves stored scraper options for a version.
   * @param versionId The version ID to query
   * @returns Stored scraper options or null if none stored
   */
  async getVersionScraperOptions(versionId) {
    try {
      const row = this.statements.getVersionWithOptions.get(versionId);
      if (!row?.scraper_options) {
        return null;
      }
      return JSON.parse(row.scraper_options);
    } catch (error) {
      throw new StoreError(`Failed to get version scraper options: ${error}`);
    }
  }
  /**
   * Retrieves a version record with all stored options.
   * @param versionId The version ID to query
   * @returns Complete version record or null if not found
   */
  async getVersionWithStoredOptions(versionId) {
    try {
      const row = this.statements.getVersionWithOptions.get(versionId);
      return row || null;
    } catch (error) {
      throw new StoreError(`Failed to get version with stored options: ${error}`);
    }
  }
  /**
   * Finds versions that were indexed from the same source URL.
   * Useful for finding similar configurations or detecting duplicates.
   * @param url Source URL to search for
   * @returns Array of versions with the same source URL
   */
  async findVersionsBySourceUrl(url) {
    try {
      const rows = this.statements.getVersionsBySourceUrl.all(url);
      return rows;
    } catch (error) {
      throw new StoreError(`Failed to find versions by source URL: ${error}`);
    }
  }
  /**
   * Verifies existence of documents for a specific library version
   */
  async checkDocumentExists(library, version2) {
    try {
      const normalizedVersion = version2.toLowerCase();
      const result = this.statements.checkExists.get(
        library.toLowerCase(),
        normalizedVersion
      );
      return result !== void 0;
    } catch (error) {
      throw new ConnectionError("Failed to check document existence", error);
    }
  }
  /**
   * Retrieves a mapping of all libraries to their available versions with details.
   */
  async queryLibraryVersions() {
    try {
      const rows = this.statements.queryLibraryVersions.all();
      const libraryMap = /* @__PURE__ */ new Map();
      for (const row of rows) {
        const library = row.library;
        if (!libraryMap.has(library)) {
          libraryMap.set(library, []);
        }
        const indexedAtISO = row.indexedAt ? new Date(row.indexedAt).toISOString() : null;
        libraryMap.get(library)?.push({
          version: row.version,
          documentCount: row.documentCount,
          uniqueUrlCount: row.uniqueUrlCount,
          indexedAt: indexedAtISO
        });
      }
      for (const versions of libraryMap.values()) {
        versions.sort((a, b) => {
          if (a.version === "" && b.version !== "") {
            return -1;
          }
          if (a.version !== "" && b.version === "") {
            return 1;
          }
          if (a.version === "" && b.version === "") {
            return 0;
          }
          try {
            return semver__default.compare(a.version, b.version);
          } catch (_error) {
            return a.version.localeCompare(b.version);
          }
        });
      }
      return libraryMap;
    } catch (error) {
      throw new ConnectionError("Failed to query library versions", error);
    }
  }
  /**
   * Stores documents with library and version metadata, generating embeddings
   * for vector similarity search. Automatically removes any existing documents
   * for the same URLs before adding new ones to prevent UNIQUE constraint violations.
   */
  async addDocuments(library, version2, documents) {
    try {
      if (documents.length === 0) {
        return;
      }
      const urls = /* @__PURE__ */ new Set();
      for (const doc of documents) {
        const url = doc.metadata.url;
        if (!url || typeof url !== "string" || !url.trim()) {
          throw new StoreError("Document metadata must include a valid URL");
        }
        urls.add(url);
      }
      const texts = documents.map((doc) => {
        const header = `<title>${doc.metadata.title}</title>
<url>${doc.metadata.url}</url>
<path>${doc.metadata.path.join(" / ")}</path>
`;
        return `${header}${doc.pageContent}`;
      });
      const rawEmbeddings = [];
      for (let i = 0; i < texts.length; i += EMBEDDING_BATCH_SIZE) {
        const batchTexts = texts.slice(i, i + EMBEDDING_BATCH_SIZE);
        const batchEmbeddings = await this.embeddings.embedDocuments(batchTexts);
        rawEmbeddings.push(...batchEmbeddings);
      }
      const paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
      const { libraryId, versionId } = await this.resolveLibraryAndVersionIds(
        library,
        version2
      );
      for (const url of urls) {
        const deletedCount = await this.deleteDocumentsByUrl(library, version2, url);
        if (deletedCount > 0) {
          logger.debug(`🗑️ Deleted ${deletedCount} existing documents for URL: ${url}`);
        }
      }
      const transaction = this.db.transaction((docs) => {
        for (let i = 0; i < docs.length; i++) {
          const doc = docs[i];
          const url = doc.metadata.url;
          const result = this.statements.insertDocument.run(
            BigInt(libraryId),
            BigInt(versionId),
            url,
            doc.pageContent,
            JSON.stringify(doc.metadata),
            i,
            (/* @__PURE__ */ new Date()).toISOString()
            // Pass current timestamp for indexed_at
          );
          const rowId = result.lastInsertRowid;
          this.statements.insertEmbedding.run(
            BigInt(rowId),
            BigInt(libraryId),
            BigInt(versionId),
            JSON.stringify(paddedEmbeddings[i])
          );
        }
      });
      transaction(documents);
    } catch (error) {
      throw new ConnectionError("Failed to add documents to store", error);
    }
  }
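  // Batching note: with EMBEDDING_BATCH_SIZE = 100, 250 chunks are embedded in
  // three provider calls of 100, 100, and 50 texts before padding and insertion.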
  /**
   * Removes documents matching specified library and version
   * @returns Number of documents deleted
   */
  async deleteDocuments(library, version2) {
    try {
      const normalizedVersion = version2.toLowerCase();
      const result = this.statements.deleteDocuments.run(
        library.toLowerCase(),
        library.toLowerCase(),
        // library name appears twice in the query
        normalizedVersion
      );
      return result.changes;
    } catch (error) {
      throw new ConnectionError("Failed to delete documents", error);
    }
  }
  /**
   * Removes documents for a specific URL within a library and version
   * @returns Number of documents deleted
   */
  async deleteDocumentsByUrl(library, version2, url) {
    try {
      const normalizedVersion = version2.toLowerCase();
      const result = this.statements.deleteDocumentsByUrl.run(
        url,
        library.toLowerCase(),
        library.toLowerCase(),
        // library name appears twice in the query
        normalizedVersion
      );
      return result.changes;
    } catch (error) {
      throw new ConnectionError("Failed to delete documents by URL", error);
    }
  }
  /**
   * Retrieves a document by its ID.
   * @param id The ID of the document.
   * @returns The document, or null if not found.
   */
  async getById(id) {
    try {
      const row = this.statements.getById.get(BigInt(id));
      if (!row) {
        return null;
      }
      return mapDbDocumentToDocument(row);
    } catch (error) {
      throw new ConnectionError(`Failed to get document by ID ${id}`, error);
    }
  }
  /**
   * Finds documents matching a text query using hybrid search.
   * Combines vector similarity search with full-text search using Reciprocal Rank Fusion.
   */
  async findByContent(library, version2, query, limit) {
    try {
      const rawEmbedding = await this.embeddings.embedQuery(query);
      const embedding = this.padVector(rawEmbedding);
      const ftsQuery = this.escapeFtsQuery(query);
      const normalizedVersion = version2.toLowerCase();
      const stmt = this.db.prepare(`
        WITH vec_distances AS (
          SELECT
            dv.rowid as id,
            dv.distance as vec_distance
          FROM documents_vec dv
          JOIN versions v ON dv.version_id = v.id
          JOIN libraries l ON v.library_id = l.id
          WHERE l.name = ?
            AND COALESCE(v.name, '') = COALESCE(?, '')
            AND dv.embedding MATCH ?
            AND dv.k = ?
          ORDER BY dv.distance
        ),
        fts_scores AS (
          SELECT
            f.rowid as id,
            bm25(documents_fts, 10.0, 1.0, 5.0, 1.0) as fts_score
          FROM documents_fts f
          JOIN documents d ON f.rowid = d.id
          JOIN versions v ON d.version_id = v.id
          JOIN libraries l ON v.library_id = l.id
          WHERE l.name = ?
            AND COALESCE(v.name, '') = COALESCE(?, '')
            AND documents_fts MATCH ?
          ORDER BY fts_score
          LIMIT ?
        )
        SELECT
          d.id,
          d.content,
          d.metadata,
          COALESCE(1 / (1 + v.vec_distance), 0) as vec_score,
          COALESCE(-MIN(f.fts_score, 0), 0) as fts_score
        FROM documents d
        LEFT JOIN vec_distances v ON d.id = v.id
        LEFT JOIN fts_scores f ON d.id = f.id
        WHERE v.id IS NOT NULL OR f.id IS NOT NULL
      `);
      const rawResults = stmt.all(
        library.toLowerCase(),
        normalizedVersion,
        JSON.stringify(embedding),
        limit,
        library.toLowerCase(),
        normalizedVersion,
        ftsQuery,
        // Use the escaped query
        limit
      );
      const rankedResults = this.assignRanks(rawResults);
      const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
      return topResults.map((row) => ({
        ...mapDbDocumentToDocument(row),
        metadata: {
          ...JSON.parse(row.metadata),
          id: row.id,
          score: row.rrf_score,
          vec_rank: row.vec_rank,
          fts_rank: row.fts_rank
        }
      }));
    } catch (error) {
      throw new ConnectionError(
        `Failed to find documents by content with query "${query}"`,
        error
      );
    }
  }
  /**
   * Finds child chunks of a given document based on path hierarchy.
   */
  async findChildChunks(library, version2, id, limit) {
    try {
      const parent = await this.getById(id);
      if (!parent) {
        return [];
      }
      const parentPath = parent.metadata.path ?? [];
      const parentUrl = parent.metadata.url;
      const normalizedVersion = version2.toLowerCase();
      const result = this.statements.getChildChunks.all(
        library.toLowerCase(),
        normalizedVersion,
        parentUrl,
        parentPath.length + 1,
        JSON.stringify(parentPath),
        BigInt(id),
        limit
      );
      return result.map((row) => mapDbDocumentToDocument(row));
    } catch (error) {
      throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
    }
  }
  /**
   * Finds preceding sibling chunks of a given document.
   */
  async findPrecedingSiblingChunks(library, version2, id, limit) {
    try {
      const reference = await this.getById(id);
      if (!reference) {
        return [];
      }
      const refMetadata = reference.metadata;
      const normalizedVersion = version2.toLowerCase();
      const result = this.statements.getPrecedingSiblings.all(
        library.toLowerCase(),
        normalizedVersion,
        refMetadata.url,
        BigInt(id),
        JSON.stringify(refMetadata.path),
        limit
      );
      return result.reverse().map((row) => mapDbDocumentToDocument(row));
    } catch (error) {
      throw new ConnectionError(
        `Failed to find preceding sibling chunks for ID ${id}`,
        error
      );
    }
  }
  /**
   * Finds subsequent sibling chunks of a given document.
   */
  async findSubsequentSiblingChunks(library, version2, id, limit) {
    try {
      const reference = await this.getById(id);
      if (!reference) {
        return [];
      }
      const refMetadata = reference.metadata;
      const normalizedVersion = version2.toLowerCase();
      const result = this.statements.getSubsequentSiblings.all(
        library.toLowerCase(),
        normalizedVersion,
        refMetadata.url,
        BigInt(id),
        JSON.stringify(refMetadata.path),
        limit
      );
      return result.map((row) => mapDbDocumentToDocument(row));
    } catch (error) {
      throw new ConnectionError(
        `Failed to find subsequent sibling chunks for ID ${id}`,
        error
      );
    }
  }
  /**
   * Finds the parent chunk of a given document.
   */
  async findParentChunk(library, version2, id) {
    try {
      const child = await this.getById(id);
      if (!child) {
        return null;
      }
      const childMetadata = child.metadata;
      const path2 = childMetadata.path ?? [];
      const parentPath = path2.slice(0, -1);
      if (parentPath.length === 0) {
        return null;
      }
      const normalizedVersion = version2.toLowerCase();
      const result = this.statements.getParentChunk.get(
        library.toLowerCase(),
        normalizedVersion,
        childMetadata.url,
        JSON.stringify(parentPath),
        BigInt(id)
      );
      if (!result) {
        return null;
      }
      return mapDbDocumentToDocument(result);
    } catch (error) {
      throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error);
    }
  }
  /**
   * Fetches multiple documents by their IDs in a single call.
   * Returns an array of Document objects, sorted by their sort_order.
   */
  async findChunksByIds(library, version2, ids) {
    if (!ids.length) return [];
    try {
      const normalizedVersion = version2.toLowerCase();
      const placeholders = ids.map(() => "?").join(",");
      const stmt = this.db.prepare(
        `SELECT d.* FROM documents d
         JOIN libraries l ON d.library_id = l.id
         JOIN versions v ON d.version_id = v.id
         WHERE l.name = ?
           AND COALESCE(v.name, '') = COALESCE(?, '')
           AND d.id IN (${placeholders})
         ORDER BY d.sort_order`
      );
      const rows = stmt.all(
        library.toLowerCase(),
        normalizedVersion,
        ...ids
      );
      return rows.map((row) => mapDbDocumentToDocument(row));
    } catch (error) {
      throw new ConnectionError("Failed to fetch documents by IDs", error);
    }
  }
}
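// A usage sketch tying the store together, assuming embedding credentials
// (e.g. OPENAI_API_KEY) are configured (illustrative only):
//
//   const store = new DocumentStore("/tmp/documents.db");
//   await store.initialize(); // loads sqlite-vec, runs migrations, prepares statements
//   await store.addDocuments("react", "18.2.0", docs);
//   const hits = await store.findByContent("react", "18.2.0", "hooks", 10);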
class DocumentManagementService {
|
|
7073
|
-
store;
|
|
7074
|
-
documentRetriever;
|
|
7075
|
-
splitter;
|
|
7076
|
-
/**
|
|
7077
|
-
* Normalizes a version string, converting null or undefined to an empty string
|
|
7078
|
-
* and converting to lowercase.
|
|
7079
|
-
*/
|
|
7080
|
-
normalizeVersion(version2) {
|
|
7081
|
-
return (version2 ?? "").toLowerCase();
|
|
7082
|
-
}
|
|
7083
|
-
constructor() {
|
|
7084
|
-
let dbPath;
|
|
7085
|
-
let dbDir;
|
|
7086
|
-
const envStorePath = process.env.DOCS_MCP_STORE_PATH;
|
|
7087
|
-
if (envStorePath) {
|
|
7088
|
-
dbDir = envStorePath;
|
|
7089
|
-
dbPath = path.join(dbDir, "documents.db");
|
|
7090
|
-
logger.debug(`💾 Using database directory from DOCS_MCP_STORE_PATH: ${dbDir}`);
|
|
7091
|
-
} else {
|
|
7092
|
-
const projectRoot2 = getProjectRoot();
|
|
7093
|
-
const oldDbDir = path.join(projectRoot2, ".store");
|
|
7094
|
-
const oldDbPath = path.join(oldDbDir, "documents.db");
|
|
7095
|
-
const oldDbExists = fs$1.existsSync(oldDbPath);
|
|
7096
|
-
if (oldDbExists) {
|
|
7097
|
-
dbPath = oldDbPath;
|
|
7098
|
-
dbDir = oldDbDir;
|
|
7099
|
-
logger.debug(`💾 Using legacy database path: ${dbPath}`);
|
|
7100
|
-
} else {
|
|
7101
|
-
const standardPaths = envPaths("docs-mcp-server", { suffix: "" });
|
|
7102
|
-
dbDir = standardPaths.data;
|
|
7103
|
-
dbPath = path.join(dbDir, "documents.db");
|
|
7104
|
-
logger.debug(`💾 Using standard database directory: ${dbDir}`);
|
|
7105
|
-
}
|
|
7106
|
-
}
|
|
7107
|
-
try {
|
|
7108
|
-
fs$1.mkdirSync(dbDir, { recursive: true });
|
|
7109
|
-
} catch (error) {
|
|
7110
|
-
logger.error(`⚠️ Failed to create database directory ${dbDir}: ${error}`);
|
|
7111
|
-
}
|
|
7112
|
-
this.store = new DocumentStore(dbPath);
|
|
7113
|
-
this.documentRetriever = new DocumentRetrieverService(this.store);
|
|
7114
|
-
const semanticSplitter = new SemanticMarkdownSplitter(
|
|
7115
|
-
SPLITTER_PREFERRED_CHUNK_SIZE,
|
|
7116
|
-
SPLITTER_MAX_CHUNK_SIZE
|
|
7117
|
-
);
|
|
7118
|
-
const greedySplitter = new GreedySplitter(
|
|
7119
|
-
semanticSplitter,
|
|
7120
|
-
SPLITTER_MIN_CHUNK_SIZE,
|
|
7121
|
-
SPLITTER_PREFERRED_CHUNK_SIZE
|
|
7122
|
-
);
|
|
7123
|
-
this.splitter = greedySplitter;
|
|
7124
|
-
}
-/**
-* Initializes the underlying document store.
-*/
-async initialize() {
-await this.store.initialize();
-}
-/**
-* Shuts down the underlying document store.
-*/
-async shutdown() {
-logger.debug("Shutting down store manager");
-await this.store.shutdown();
-}
-// Status tracking methods for pipeline integration
-/**
-* Gets versions by their current status.
-*/
-async getVersionsByStatus(statuses) {
-return this.store.getVersionsByStatus(statuses);
-}
-/**
-* Gets all versions currently in RUNNING status.
-*/
-async getRunningVersions() {
-return this.store.getRunningVersions();
-}
-/**
-* Updates the status of a version.
-*/
-async updateVersionStatus(versionId, status, errorMessage) {
-return this.store.updateVersionStatus(versionId, status, errorMessage);
-}
-/**
-* Updates the progress of a version being indexed.
-*/
-async updateVersionProgress(versionId, pages, maxPages) {
-return this.store.updateVersionProgress(versionId, pages, maxPages);
-}
-/**
-* Stores scraper options for a version to enable reproducible indexing.
-*/
-async storeScraperOptions(versionId, options) {
-return this.store.storeScraperOptions(versionId, options);
-}
-/**
-* Retrieves stored scraper options for a version.
-*/
-async getVersionScraperOptions(versionId) {
-return this.store.getVersionScraperOptions(versionId);
-}
-/**
-* Retrieves a version record with all stored options.
-*/
-async getVersionWithStoredOptions(versionId) {
-return this.store.getVersionWithStoredOptions(versionId);
-}
-/**
-* Finds versions that were indexed from the same source URL.
-*/
-async findVersionsBySourceUrl(url) {
-return this.store.findVersionsBySourceUrl(url);
-}
-/**
-* Validates if a library exists in the store (either versioned or unversioned).
-* Throws LibraryNotFoundError with suggestions if the library is not found.
-* @param library The name of the library to validate.
-* @throws {LibraryNotFoundError} If the library does not exist.
-*/
-async validateLibraryExists(library) {
-logger.info(`🔎 Validating existence of library: ${library}`);
-const normalizedLibrary = library.toLowerCase();
-const versions = await this.listVersions(normalizedLibrary);
-const hasUnversioned = await this.exists(normalizedLibrary, "");
-if (versions.length === 0 && !hasUnversioned) {
-logger.warn(`⚠️ Library '${library}' not found.`);
-const allLibraries = await this.listLibraries();
-const libraryNames = allLibraries.map((lib) => lib.library);
-let suggestions = [];
-if (libraryNames.length > 0) {
-const fuse = new Fuse(libraryNames, {
-// Configure fuse.js options if needed (e.g., threshold)
-// isCaseSensitive: false, // Handled by normalizing library names
-// includeScore: true,
-threshold: 0.4
-// Adjust threshold for desired fuzziness (0=exact, 1=match anything)
-});
-const results = fuse.search(normalizedLibrary);
-suggestions = results.slice(0, 3).map((result) => result.item);
-logger.info(`🔍 Found suggestions: ${suggestions.join(", ")}`);
-}
-throw new LibraryNotFoundError(library, suggestions);
-}
-logger.info(`✅ Library '${library}' confirmed to exist.`);
-}
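
// [Editor's note] The suggestion step above is a plain fuse.js fuzzy search over the
// known library names. Condensed sketch of just that step (threshold 0.4 as in the
// removed code; 0 = exact match, 1 = match anything):
//
//   import Fuse from "fuse.js";
//
//   function suggestLibraries(libraryNames, query) {
//     const fuse = new Fuse(libraryNames, { threshold: 0.4 });
//     return fuse.search(query.toLowerCase()).slice(0, 3).map((result) => result.item);
//   }
//   // e.g. a typo like "raect" would typically surface "react" among the suggestions
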
-/**
-* Returns a list of all available semantic versions for a library.
-*/
-async listVersions(library) {
-const versions = await this.store.queryUniqueVersions(library);
-return versions.filter((v) => semver__default.valid(v));
-}
-/**
-* Checks if documents exist for a given library and optional version.
-* If version is omitted, checks for documents without a specific version.
-*/
-async exists(library, version2) {
-const normalizedVersion = this.normalizeVersion(version2);
-return this.store.checkDocumentExists(library, normalizedVersion);
-}
-/**
-* Finds the most appropriate version of documentation based on the requested version.
-* When no target version is specified, returns the latest version.
-*
-* Version matching behavior:
-* - Exact versions (e.g., "18.0.0"): Matches that version or any earlier version
-* - X-Range patterns (e.g., "5.x", "5.2.x"): Matches within the specified range
-* - "latest" or no version: Returns the latest available version
-*
-* For documentation, we prefer matching older versions over no match at all,
-* since older docs are often still relevant and useful.
-* Also checks if unversioned documents exist for the library.
-*/
-async findBestVersion(library, targetVersion) {
-const libraryAndVersion = `${library}${targetVersion ? `@${targetVersion}` : ""}`;
-logger.info(`🔍 Finding best version for ${libraryAndVersion}`);
-const hasUnversioned = await this.store.checkDocumentExists(library, "");
-const versionStrings = await this.listVersions(library);
-if (versionStrings.length === 0) {
-if (hasUnversioned) {
-logger.info(`ℹ️ Unversioned documents exist for ${library}`);
-return { bestMatch: null, hasUnversioned: true };
-}
-logger.warn(`⚠️ No valid versions found for ${library}`);
-const allLibraryDetails = await this.store.queryLibraryVersions();
-const libraryDetails = allLibraryDetails.get(library) ?? [];
-throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
-}
-let bestMatch = null;
-if (!targetVersion || targetVersion === "latest") {
-bestMatch = semver__default.maxSatisfying(versionStrings, "*");
-} else {
-const versionRegex = /^(\d+)(?:\.(?:x(?:\.x)?|\d+(?:\.(?:x|\d+))?))?$|^$/;
-if (!versionRegex.test(targetVersion)) {
-logger.warn(`⚠️ Invalid target version format: ${targetVersion}`);
-} else {
-let range = targetVersion;
-if (!semver__default.validRange(targetVersion)) {
-range = `~${targetVersion}`;
-} else if (semver__default.valid(targetVersion)) {
-range = `${range} || <=${targetVersion}`;
-}
-bestMatch = semver__default.maxSatisfying(versionStrings, range);
-}
-}
-if (bestMatch) {
-logger.info(`✅ Found best match version ${bestMatch} for ${libraryAndVersion}`);
-} else {
-logger.warn(`⚠️ No matching semver version found for ${libraryAndVersion}`);
-}
-if (!bestMatch && !hasUnversioned) {
-const allLibraryDetails = await this.store.queryLibraryVersions();
-const libraryDetails = allLibraryDetails.get(library) ?? [];
-throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
-}
-return { bestMatch, hasUnversioned };
-}
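
// [Editor's note] Worked example of the range construction above, assuming the store
// holds versions ["17.0.2", "18.0.0", "18.2.0"]:
//
//   - targetVersion "18.x"   → valid range, used as-is             → bestMatch "18.2.0"
//   - targetVersion "18.0.0" → full version, range becomes
//                              "18.0.0 || <=18.0.0"                → bestMatch "18.0.0"
//   - targetVersion "19.0.0" → "19.0.0 || <=19.0.0"                → bestMatch "18.2.0"
//     (falling back to an older version rather than reporting no match)
//   - no targetVersion / "latest" → maxSatisfying(versions, "*")   → "18.2.0"
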
-/**
-* Removes all documents for a specific library and optional version.
-* If version is omitted, removes documents without a specific version.
-*/
-async removeAllDocuments(library, version2) {
-const normalizedVersion = this.normalizeVersion(version2);
-logger.info(
-`🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
-);
-const count = await this.store.deleteDocuments(library, normalizedVersion);
-logger.info(`📊 Deleted ${count} documents`);
-}
-/**
-* Adds a document to the store, splitting it into smaller chunks for better search results.
-* Uses SemanticMarkdownSplitter to maintain markdown structure and content types during splitting.
-* Preserves hierarchical structure of documents and distinguishes between text and code segments.
-* If version is omitted, the document is added without a specific version.
-*/
-async addDocument(library, version2, document) {
-const normalizedVersion = this.normalizeVersion(version2);
-const url = document.metadata.url;
-if (!url || typeof url !== "string" || !url.trim()) {
-throw new StoreError("Document metadata must include a valid URL");
-}
-logger.info(`📚 Adding document: ${document.metadata.title}`);
-if (!document.pageContent.trim()) {
-throw new Error("Document content cannot be empty");
-}
-const chunks = await this.splitter.splitText(document.pageContent);
-const splitDocs = chunks.map((chunk) => ({
-pageContent: chunk.content,
-metadata: {
-...document.metadata,
-level: chunk.section.level,
-path: chunk.section.path
-}
-}));
-logger.info(`✂️ Split document into ${splitDocs.length} chunks`);
-await this.store.addDocuments(library, normalizedVersion, splitDocs);
-}
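
// [Editor's note] Chunking here is a two-stage pipeline: SemanticMarkdownSplitter
// splits along markdown structure (targeting SPLITTER_PREFERRED_CHUNK_SIZE = 1500
// chars, hard cap SPLITTER_MAX_CHUNK_SIZE = 5000), and GreedySplitter then merges
// undersized neighbours back up to at least SPLITTER_MIN_CHUNK_SIZE = 500. A usage
// sketch mirroring the constructor wiring removed above:
//
//   const splitter = new GreedySplitter(
//     new SemanticMarkdownSplitter(SPLITTER_PREFERRED_CHUNK_SIZE, SPLITTER_MAX_CHUNK_SIZE),
//     SPLITTER_MIN_CHUNK_SIZE,
//     SPLITTER_PREFERRED_CHUNK_SIZE
//   );
//   const chunks = await splitter.splitText(document.pageContent);
//   // each chunk carries { content, section: { level, path } }, which addDocument
//   // flattens into per-chunk metadata before store.addDocuments(...)
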
-/**
-* Searches for documentation content across versions.
-* Uses hybrid search (vector + FTS).
-* If version is omitted, searches documents without a specific version.
-*/
-async searchStore(library, version2, query, limit = 5) {
-const normalizedVersion = this.normalizeVersion(version2);
-return this.documentRetriever.search(library, normalizedVersion, query, limit);
-}
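
// [Editor's note] Retrieval combines vector similarity with SQLite full-text search
// (sqlite-vec plus FTS, per the imports at the top of this bundle); the merge itself
// lives in DocumentRetrieverService and is not shown in this diff. Call shape only:
//
//   const results = await docService.searchStore("react", "18.x", "useEffect cleanup", 5);
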
-async listLibraries() {
-const libraryMap = await this.store.queryLibraryVersions();
-return Array.from(libraryMap.entries()).map(([library, versions]) => ({
-library,
-versions
-// The versions array already contains LibraryVersionDetails
-}));
-}
-/**
-* Gets all versions in active states (queued, running, updating).
-*/
-async getActiveVersions() {
-return this.store.getActiveVersions();
-}
-/**
-* Ensures a library and version exist in the database and returns the version ID.
-* Creates the library and version records if they don't exist.
-*/
-async ensureLibraryAndVersion(library, version2) {
-const normalizedLibrary = library.toLowerCase();
-const normalizedVersion = this.normalizeVersion(version2);
-const { versionId } = await this.store.resolveLibraryAndVersionIds(
-normalizedLibrary,
-normalizedVersion
-);
-return versionId;
-}
-}
function ensurePlaywrightBrowsersInstalled() {
const chromiumEnvPath = process.env.PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH;
if (chromiumEnvPath && existsSync(chromiumEnvPath)) {
@@ -7427,15 +5670,16 @@ function validatePort(portString) {
}
return port;
}
-async function initializePipeline(docService, options = {}) {
-
-
-
-
-
-
-
-
+async function createPipelineWithCallbacks(docService, options = {}) {
+logger.debug(`Initializing pipeline with options: ${JSON.stringify(options)}`);
+const { serverUrl, ...rest } = options;
+const pipeline = serverUrl ? await PipelineFactory.createPipeline(void 0, { serverUrl, ...rest }) : await (async () => {
+if (!docService) {
+throw new Error("Local pipeline requires a DocumentManagementService instance");
+}
+return PipelineFactory.createPipeline(docService, rest);
+})();
+pipeline.setCallbacks({
onJobProgress: async (job, progress) => {
logger.debug(
`📊 Job ${job.id} progress: ${progress.pagesScraped}/${progress.totalPages} pages`
@@ -7450,13 +5694,13 @@ async function initializePipeline(docService, options = {}) {
);
}
});
-return
+return pipeline;
}
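
// [Editor's note] 1.19.0 funnels every CLI entry point through this helper: given a
// serverUrl, PipelineFactory builds a client for a remote worker and docService may
// be omitted; otherwise a local pipeline wraps the local document service. The two
// call shapes used later in this file:
//
//   // in-process worker (default and `worker` commands)
//   const local = await createPipelineWithCallbacks(docService, { recoverJobs: true, concurrency: 3 });
//
//   // remote worker (`mcp`, `web`, `scrape` with --server-url)
//   const remote = await createPipelineWithCallbacks(void 0, {
//     serverUrl: "http://localhost:6280/api", // example URL from the option help text
//     recoverJobs: false,
//     concurrency: 3
//   });
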
function createAppServerConfig(options) {
return {
enableWebInterface: options.enableWebInterface ?? false,
enableMcpServer: options.enableMcpServer ?? true,
-
+enableApiServer: options.enableApiServer ?? false,
enableWorker: options.enableWorker ?? true,
port: options.port,
externalWorkerUrl: options.externalWorkerUrl
@@ -7483,11 +5727,17 @@ const CLI_DEFAULTS = {
MAX_CONCURRENCY: DEFAULT_MAX_CONCURRENCY
};
function createDefaultAction(program) {
-return program.
-"--protocol <
-
-"
-
+return program.addOption(
+new Option("--protocol <protocol>", "Protocol for MCP server").choices(["auto", "stdio", "http"]).default("auto")
+).addOption(
+new Option("--port <number>", "Port for the server").argParser((v) => {
+const n = Number(v);
+if (!Number.isInteger(n) || n < 1 || n > 65535) {
+throw new Error("Port must be an integer between 1 and 65535");
+}
+return String(n);
+}).default(CLI_DEFAULTS.HTTP_PORT.toString())
+).option("--resume", "Resume interrupted jobs on startup", false).option("--no-resume", "Do not resume jobs on startup").action(
async (options, command) => {
const globalOptions = command.opts();
const resolvedProtocol = resolveProtocol(options.protocol);
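
// [Editor's note] The rewritten option declarations use Commander's Option class
// rather than bare .option() strings, which adds .choices() validation and a custom
// argParser. Minimal sketch of the pattern used for --protocol and --port above:
//
//   import { Option, Command } from "commander";
//
//   const program = new Command();
//   program.addOption(
//     new Option("--protocol <protocol>", "Protocol for MCP server")
//       .choices(["auto", "stdio", "http"])
//       .default("auto")
//   );
//   // repeatable flags keep the (value, previous) reducer form seen elsewhere here:
//   program.option("--header <header>", "repeatable", (val, prev = []) => prev.concat([val]), []);
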
@@ -7495,13 +5745,13 @@ function createDefaultAction(program) {
logger.debug("No subcommand specified, starting unified server by default...");
const port = validatePort(options.port);
ensurePlaywrightBrowsersInstalled();
-const docService = await
+const docService = await createLocalDocumentManagement();
const pipelineOptions = {
recoverJobs: options.resume || false,
// Use --resume flag for job recovery
concurrency: 3
};
-const pipeline = await
+const pipeline = await createPipelineWithCallbacks(docService, pipelineOptions);
if (resolvedProtocol === "stdio") {
logger.debug(`🔍 Auto-detected stdio protocol (no TTY)`);
await pipeline.start();
@@ -7516,8 +5766,8 @@ function createDefaultAction(program) {
// Enable web interface in http mode
enableMcpServer: true,
// Always enable MCP server
-
-// Enable
+enableApiServer: true,
+// Enable API (tRPC) in http mode
enableWorker: true,
// Always enable in-process worker for unified server
port
@@ -7529,6 +5779,19 @@ function createDefaultAction(program) {
}
);
}
+async function fetchUrlAction(url, options, command) {
+const globalOptions = command.parent?.opts() || {};
+setupLogging(globalOptions);
+const headers = parseHeaders(options.header);
+const fetchUrlTool = new FetchUrlTool(new HttpFetcher(), new FileFetcher());
+const content = await fetchUrlTool.execute({
+url,
+followRedirects: options.followRedirects,
+scrapeMode: options.scrapeMode,
+headers: Object.keys(headers).length > 0 ? headers : void 0
+});
+console.log(content);
+}
function createFetchUrlCommand(program) {
return program.command("fetch-url <url>").description("Fetch a URL and convert its content to Markdown").option(
"--no-follow-redirects",
@@ -7552,66 +5815,64 @@ function createFetchUrlCommand(program) {
"Custom HTTP header to send with the request (can be specified multiple times)",
(val, prev = []) => prev.concat([val]),
[]
-).action(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+).action(fetchUrlAction);
+}
+async function findVersionAction(library, options, command) {
+const globalOptions = command.parent?.opts() || {};
+setupLogging(globalOptions);
+const serverUrl = options.serverUrl;
+const docService = await createDocumentManagement({ serverUrl });
+try {
+const findVersionTool = new FindVersionTool(docService);
+const versionInfo = await findVersionTool.execute({
+library,
+targetVersion: options.version
+});
+if (!versionInfo) throw new Error("Failed to get version information");
+console.log(versionInfo);
+} finally {
+await docService.shutdown();
+}
}
function createFindVersionCommand(program) {
-return program.command("find-version <library>").description("Find the best matching version for a library").option("-v, --version <string>", "Pattern to match (optional, supports ranges)").
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+return program.command("find-version <library>").description("Find the best matching version for a library").option("-v, --version <string>", "Pattern to match (optional, supports ranges)").option(
+"--server-url <url>",
+"URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
+).action(findVersionAction);
+}
+async function listAction(options, command) {
+const globalOptions = command.parent?.opts() || {};
+setupLogging(globalOptions);
+const { serverUrl } = options;
+const docService = await createDocumentManagement({ serverUrl });
+try {
+const listLibrariesTool = new ListLibrariesTool(docService);
+const result = await listLibrariesTool.execute();
+console.log(formatOutput(result.libraries));
+} finally {
+await docService.shutdown();
+}
}
function createListCommand(program) {
-return program.command("list").description("List all available libraries and their versions").
-
-
-
-try {
-const listLibrariesTool = new ListLibrariesTool(docService);
-const result = await listLibrariesTool.execute();
-console.log(formatOutput(result.libraries));
-} finally {
-await docService.shutdown();
-}
-});
+return program.command("list").description("List all available libraries and their versions").option(
+"--server-url <url>",
+"URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
+).action(listAction);
}
function createMcpCommand(program) {
-return program.command("mcp").description("Start MCP server only").
-"--protocol <
-
-
-
-
-
-
+return program.command("mcp").description("Start MCP server only").addOption(
+new Option("--protocol <protocol>", "Protocol for MCP server").choices(["auto", "stdio", "http"]).default(CLI_DEFAULTS.PROTOCOL)
+).addOption(
+new Option("--port <number>", "Port for the MCP server").argParser((v) => {
+const n = Number(v);
+if (!Number.isInteger(n) || n < 1 || n > 65535) {
+throw new Error("Port must be an integer between 1 and 65535");
+}
+return String(n);
+}).default(CLI_DEFAULTS.HTTP_PORT.toString())
).option(
"--server-url <url>",
-"URL of external pipeline worker
+"URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
).action(
async (cmdOptions, command) => {
const globalOptions = command.parent?.opts() || {};
@@ -7620,14 +5881,19 @@ function createMcpCommand(program) {
const resolvedProtocol = resolveProtocol(cmdOptions.protocol);
setupLogging(globalOptions, resolvedProtocol);
try {
-const docService = await
+const docService = await createDocumentManagement({
+serverUrl
+});
const pipelineOptions = {
recoverJobs: false,
// MCP command doesn't support job recovery
serverUrl,
concurrency: 3
};
-const pipeline = await
+const pipeline = await createPipelineWithCallbacks(
+serverUrl ? void 0 : docService,
+pipelineOptions
+);
if (resolvedProtocol === "stdio") {
logger.debug(`🔍 Auto-detected stdio protocol (no TTY)`);
logger.info("🚀 Starting MCP server (stdio mode)");
@@ -7643,8 +5909,8 @@ function createMcpCommand(program) {
enableWebInterface: false,
// Never enable web interface in mcp command
enableMcpServer: true,
-
-// Never enable
+enableApiServer: false,
+// Never enable API in mcp command
enableWorker: !serverUrl,
port,
externalWorkerUrl: serverUrl
@@ -7660,30 +5926,81 @@ function createMcpCommand(program) {
}
);
}
+async function removeAction(library, options, command) {
+const globalOptions = command.parent?.opts() || {};
+setupLogging(globalOptions);
+const serverUrl = options.serverUrl;
+const docService = await createDocumentManagement({ serverUrl });
+const { version: version2 } = options;
+try {
+await docService.removeAllDocuments(library, version2);
+console.log(
+`✅ Successfully removed documents for ${library}${version2 ? `@${version2}` : " (unversioned)"}.`
+);
+} catch (error) {
+console.error(
+`❌ Failed to remove documents for ${library}${version2 ? `@${version2}` : " (unversioned)"}:`,
+error instanceof Error ? error.message : String(error)
+);
+throw error;
+} finally {
+await docService.shutdown();
+}
+}
function createRemoveCommand(program) {
return program.command("remove <library>").description("Remove documents for a specific library and version").option(
"-v, --version <string>",
"Version to remove (optional, removes unversioned if omitted)"
-).
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-}
-
+).option(
+"--server-url <url>",
+"URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
+).action(removeAction);
+}
+async function scrapeAction(library, url, options, command) {
+const globalOptions = command.parent?.opts() || {};
+setupLogging(globalOptions);
+const serverUrl = options.serverUrl;
+const docService = await createDocumentManagement({ serverUrl });
+let pipeline = null;
+try {
+const pipelineOptions = {
+recoverJobs: false,
+concurrency: 1,
+serverUrl
+};
+pipeline = await createPipelineWithCallbacks(
+serverUrl ? void 0 : docService,
+pipelineOptions
+);
+await pipeline.start();
+const scrapeTool = new ScrapeTool(pipeline);
+const headers = parseHeaders(options.header);
+const result = await scrapeTool.execute({
+url,
+library,
+version: options.version,
+options: {
+maxPages: Number.parseInt(options.maxPages),
+maxDepth: Number.parseInt(options.maxDepth),
+maxConcurrency: Number.parseInt(options.maxConcurrency),
+ignoreErrors: options.ignoreErrors,
+scope: options.scope,
+followRedirects: options.followRedirects,
+scrapeMode: options.scrapeMode,
+includePatterns: Array.isArray(options.includePattern) && options.includePattern.length > 0 ? options.includePattern : void 0,
+excludePatterns: Array.isArray(options.excludePattern) && options.excludePattern.length > 0 ? options.excludePattern : void 0,
+headers: Object.keys(headers).length > 0 ? headers : void 0
+}
+});
+if ("pagesScraped" in result) {
+console.log(`✅ Successfully scraped ${result.pagesScraped} pages`);
+} else {
+console.log(`🚀 Scraping job started with ID: ${result.jobId}`);
}
-}
+} finally {
+if (pipeline) await pipeline.stop();
+await docService.shutdown();
+}
}
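
// [Editor's note] ScrapeTool.execute() resolves to one of two shapes, discriminated
// above with an `in` check: a synchronous completion ({ pagesScraped }) or a queued
// background job ({ jobId }). A narrowing sketch of that contract:
//
//   /** @param {{ pagesScraped: number } | { jobId: string }} result */
//   function reportScrapeResult(result) {
//     if ("pagesScraped" in result) {
//       console.log(`scraped ${result.pagesScraped} pages`); // waited for completion
//     } else {
//       console.log(`queued as job ${result.jobId}`); // poll or track the job instead
//     }
//   }
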
function createScrapeCommand(program) {
return program.command("scrape <library> <url>").description(
@@ -7746,55 +6063,27 @@ function createScrapeCommand(program) {
[]
).option(
"--server-url <url>",
-"URL of external pipeline worker
-).action(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-const result = await scrapeTool.execute({
-url,
-library,
-version: options.version,
-options: {
-maxPages: Number.parseInt(options.maxPages),
-maxDepth: Number.parseInt(options.maxDepth),
-maxConcurrency: Number.parseInt(options.maxConcurrency),
-ignoreErrors: options.ignoreErrors,
-scope: options.scope,
-followRedirects: options.followRedirects,
-scrapeMode: options.scrapeMode,
-includePatterns: Array.isArray(options.includePattern) && options.includePattern.length > 0 ? options.includePattern : void 0,
-excludePatterns: Array.isArray(options.excludePattern) && options.excludePattern.length > 0 ? options.excludePattern : void 0,
-headers: Object.keys(headers).length > 0 ? headers : void 0
-}
-});
-if ("pagesScraped" in result) {
-console.log(`✅ Successfully scraped ${result.pagesScraped} pages`);
-} else {
-console.log(`🚀 Scraping job started with ID: ${result.jobId}`);
-}
-} finally {
-if (pipeline) await pipeline.stop();
-await docService.shutdown();
-}
-}
-);
+"URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
+).action(scrapeAction);
+}
+async function searchAction(library, query, options, command) {
+const globalOptions = command.parent?.opts() || {};
+setupLogging(globalOptions);
+const serverUrl = options.serverUrl;
+const docService = await createDocumentManagement({ serverUrl });
+try {
+const searchTool = new SearchTool(docService);
+const result = await searchTool.execute({
+library,
+version: options.version,
+query,
+limit: Number.parseInt(options.limit),
+exactMatch: options.exactMatch
+});
+console.log(formatOutput(result.results));
+} finally {
+await docService.shutdown();
+}
}
function createSearchCommand(program) {
return program.command("search <library> <query>").description(
@@ -7802,35 +6091,23 @@ function createSearchCommand(program) {
).option(
"-v, --version <string>",
"Version of the library (optional, supports ranges)"
-).option("-l, --limit <number>", "Maximum number of results", "5").option("-e, --exact-match", "Only use exact version match (default: false)", false).
-
-
-
-const docService = await initializeDocumentService();
-try {
-const searchTool = new SearchTool(docService);
-const result = await searchTool.execute({
-library,
-version: options.version,
-query,
-limit: Number.parseInt(options.limit),
-exactMatch: options.exactMatch
-});
-console.log(formatOutput(result.results));
-} finally {
-await docService.shutdown();
-}
-}
-);
+).option("-l, --limit <number>", "Maximum number of results", "5").option("-e, --exact-match", "Only use exact version match (default: false)", false).option(
+"--server-url <url>",
+"URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
+).action(searchAction);
}
function createWebCommand(program) {
-return program.command("web").description("Start web interface only").
-"--port <number>",
-
-
+return program.command("web").description("Start web interface only").addOption(
+new Option("--port <number>", "Port for the web interface").argParser((v) => {
+const n = Number(v);
+if (!Number.isInteger(n) || n < 1 || n > 65535) {
+throw new Error("Port must be an integer between 1 and 65535");
+}
+return String(n);
+}).default(CLI_DEFAULTS.WEB_PORT.toString())
).option(
"--server-url <url>",
-"URL of external pipeline worker
+"URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
).action(
async (cmdOptions, command) => {
const globalOptions = command.parent?.opts() || {};
@@ -7838,18 +6115,23 @@ function createWebCommand(program) {
const serverUrl = cmdOptions.serverUrl;
setupLogging(globalOptions);
try {
-const docService = await
+const docService = await createDocumentManagement({
+serverUrl
+});
const pipelineOptions = {
recoverJobs: false,
// Web command doesn't support job recovery
serverUrl,
concurrency: 3
};
-const pipeline = await
+const pipeline = await createPipelineWithCallbacks(
+serverUrl ? void 0 : docService,
+pipelineOptions
+);
const config = createAppServerConfig({
enableWebInterface: true,
enableMcpServer: false,
-
+enableApiServer: false,
enableWorker: !serverUrl,
port,
externalWorkerUrl: serverUrl
@@ -7868,28 +6150,35 @@ function createWebCommand(program) {
);
}
function createWorkerCommand(program) {
-return program.command("worker").description("Start external pipeline worker (HTTP API)").
+return program.command("worker").description("Start external pipeline worker (HTTP API)").addOption(
+new Option("--port <number>", "Port for worker API").argParser((v) => {
+const n = Number(v);
+if (!Number.isInteger(n) || n < 1 || n > 65535) {
+throw new Error("Port must be an integer between 1 and 65535");
+}
+return String(n);
+}).default("8080")
+).option("--resume", "Resume interrupted jobs on startup", true).option("--no-resume", "Do not resume jobs on startup").action(async (cmdOptions, command) => {
const globalOptions = command.parent?.opts() || {};
const port = validatePort(cmdOptions.port);
setupLogging(globalOptions);
try {
logger.info(`🚀 Starting external pipeline worker on port ${port}`);
ensurePlaywrightBrowsersInstalled();
-const docService = await
+const docService = await createLocalDocumentManagement();
const pipelineOptions = {
recoverJobs: cmdOptions.resume,
// Use the resume option
concurrency: CLI_DEFAULTS.MAX_CONCURRENCY
};
-const pipeline = await
+const pipeline = await createPipelineWithCallbacks(docService, pipelineOptions);
const config = createAppServerConfig({
enableWebInterface: false,
enableMcpServer: false,
-
+enableApiServer: true,
enableWorker: true,
port
});
-logger.info(`🚀 Starting external pipeline worker with HTTP API`);
await startAppServer(docService, pipeline, config);
await new Promise(() => {
});
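
// [Editor's note] The worker command pairs with --server-url on the other commands:
// one process runs the pipeline and exposes the tRPC API, the others attach to it
// with their in-process worker disabled. Illustrative split deployment (the worker
// port defaults to 8080; the /api path follows the help-text example URLs):
//
//   docs-mcp-server worker --port 8080
//   docs-mcp-server mcp --server-url http://localhost:8080/api
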
@@ -7901,7 +6190,9 @@ function createWorkerCommand(program) {
}
function createCliProgram() {
const program = new Command();
-program.name("docs-mcp-server").description("Unified CLI, MCP Server, and Web Interface for Docs MCP Server.").version(packageJson.version).
+program.name("docs-mcp-server").description("Unified CLI, MCP Server, and Web Interface for Docs MCP Server.").version(packageJson.version).addOption(
+new Option("--verbose", "Enable verbose (debug) logging").conflicts("silent")
+).addOption(new Option("--silent", "Disable all logging except errors")).enablePositionalOptions().allowExcessArguments(false).showHelpAfterError(true);
program.hook("preAction", (thisCommand, _actionCommand) => {
const globalOptions = thisCommand.opts();
if (globalOptions.silent) setLogLevel(LogLevel.ERROR);
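
// [Editor's note] --verbose and --silent are now declared via Option.conflicts(), so
// Commander rejects contradictory flags before the preAction logging hook runs.
// Sketch of the pattern:
//
//   program
//     .addOption(new Option("--verbose", "Enable verbose (debug) logging").conflicts("silent"))
//     .addOption(new Option("--silent", "Disable all logging except errors"));
//   // `docs-mcp-server --verbose --silent` exits with a conflicting-option error
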
@@ -8023,7 +6314,23 @@ runCli().catch((error) => {
process.exit(1);
});
export {
+ConnectionError as C,
DimensionError as D,
-
+EMBEDDING_BATCH_CHARS as E,
+LibraryNotFoundError as L,
+StoreError as S,
+VECTOR_DIMENSION as V,
+applyMigrations as a,
+EMBEDDING_BATCH_SIZE as b,
+createJSDOM as c,
+denormalizeVersionName as d,
+SPLITTER_PREFERRED_CHUNK_SIZE as e,
+SPLITTER_MAX_CHUNK_SIZE as f,
+getProjectRoot as g,
+VersionNotFoundError as h,
+SPLITTER_MIN_CHUNK_SIZE as i,
+logger as l,
+mapDbDocumentToDocument as m,
+normalizeVersionName as n
};
//# sourceMappingURL=index.js.map