@arabold/docs-mcp-server 1.18.0 → 1.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -6
- package/db/migrations/007-dedupe-unversioned-versions.sql +62 -0
- package/db/migrations/008-case-insensitive-names.sql +10 -0
- package/dist/DocumentManagementClient-CAFdDwTu.js +57 -0
- package/dist/DocumentManagementClient-CAFdDwTu.js.map +1 -0
- package/dist/DocumentManagementService-BH02TJEe.js +1917 -0
- package/dist/DocumentManagementService-BH02TJEe.js.map +1 -0
- package/dist/index.js +908 -2561
- package/dist/index.js.map +1 -1
- package/package.json +3 -1
package/dist/index.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import "dotenv/config";
|
|
2
|
-
import { Command } from "commander";
|
|
2
|
+
import { Option, Command } from "commander";
|
|
3
3
|
import path from "node:path";
|
|
4
4
|
import formBody from "@fastify/formbody";
|
|
5
5
|
import fastifyStatic from "@fastify/static";
|
|
@@ -21,6 +21,9 @@ import fs from "node:fs/promises";
|
|
|
21
21
|
import * as mime from "mime-types";
|
|
22
22
|
import axios from "axios";
|
|
23
23
|
import { HeaderGenerator } from "header-generator";
|
|
24
|
+
import { initTRPC } from "@trpc/server";
|
|
25
|
+
import { fastifyTRPCPlugin } from "@trpc/server/adapters/fastify";
|
|
26
|
+
import { z as z$1 } from "zod";
|
|
24
27
|
import { jsxs, jsx, Fragment } from "@kitajs/html/jsx-runtime";
|
|
25
28
|
import fs$1, { readFileSync, existsSync } from "node:fs";
|
|
26
29
|
import { unified } from "unified";
|
|
@@ -30,15 +33,16 @@ import remarkHtml from "remark-html";
|
|
|
30
33
|
import DOMPurify from "dompurify";
|
|
31
34
|
import { fileURLToPath, URL as URL$1 } from "node:url";
|
|
32
35
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
36
|
+
import { createTRPCProxyClient, httpBatchLink } from "@trpc/client";
|
|
37
|
+
import "env-paths";
|
|
38
|
+
import "fuse.js";
|
|
39
|
+
import "langchain/text_splitter";
|
|
40
|
+
import "better-sqlite3";
|
|
41
|
+
import "sqlite-vec";
|
|
33
42
|
import { execSync } from "node:child_process";
|
|
34
43
|
import { v4 } from "uuid";
|
|
35
|
-
import
|
|
44
|
+
import "psl";
|
|
36
45
|
import { minimatch } from "minimatch";
|
|
37
|
-
import envPaths from "env-paths";
|
|
38
|
-
import Fuse from "fuse.js";
|
|
39
|
-
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
|
|
40
|
-
import Database from "better-sqlite3";
|
|
41
|
-
import * as sqliteVec from "sqlite-vec";
|
|
42
46
|
const LogLevel = {
|
|
43
47
|
ERROR: 0,
|
|
44
48
|
WARN: 1,
|
|
@@ -97,7 +101,7 @@ const logger = {
|
|
|
97
101
|
}
|
|
98
102
|
}
|
|
99
103
|
};
|
|
100
|
-
const version = "1.
|
|
104
|
+
const version = "1.19.0";
|
|
101
105
|
const packageJson = {
|
|
102
106
|
version
|
|
103
107
|
};
|
|
@@ -324,14 +328,43 @@ class HtmlLinkExtractorMiddleware {
|
|
|
324
328
|
return;
|
|
325
329
|
}
|
|
326
330
|
try {
|
|
331
|
+
let docBase = context.source;
|
|
332
|
+
try {
|
|
333
|
+
const baseEl = $("base[href]").first();
|
|
334
|
+
const rawBase = baseEl.attr("href");
|
|
335
|
+
if (rawBase && rawBase.trim() !== "") {
|
|
336
|
+
try {
|
|
337
|
+
const trimmed = rawBase.trim();
|
|
338
|
+
const candidate = new URL(trimmed, context.source);
|
|
339
|
+
const hasScheme = /^[a-zA-Z][a-zA-Z0-9+.-]*:/.test(trimmed);
|
|
340
|
+
const protocolRelative = trimmed.startsWith("//");
|
|
341
|
+
const firstSlash = trimmed.indexOf("/");
|
|
342
|
+
const firstColon = trimmed.indexOf(":");
|
|
343
|
+
const colonBeforeSlash = firstColon !== -1 && (firstSlash === -1 || firstColon < firstSlash);
|
|
344
|
+
const suspiciousColon = colonBeforeSlash && !hasScheme && !protocolRelative;
|
|
345
|
+
if (suspiciousColon || trimmed.startsWith(":")) {
|
|
346
|
+
logger.debug(
|
|
347
|
+
`Ignoring suspicious <base href> value (colon misuse): ${rawBase}`
|
|
348
|
+
);
|
|
349
|
+
} else {
|
|
350
|
+
docBase = candidate.href;
|
|
351
|
+
}
|
|
352
|
+
} catch {
|
|
353
|
+
logger.debug(`Ignoring invalid <base href> value: ${rawBase}`);
|
|
354
|
+
}
|
|
355
|
+
}
|
|
356
|
+
} catch {
|
|
357
|
+
}
|
|
327
358
|
const linkElements = $("a[href]");
|
|
328
|
-
logger.debug(
|
|
359
|
+
logger.debug(
|
|
360
|
+
`Found ${linkElements.length} potential links in ${context.source} (base=${docBase})`
|
|
361
|
+
);
|
|
329
362
|
const extractedLinks = [];
|
|
330
363
|
linkElements.each((_index, element) => {
|
|
331
364
|
const href = $(element).attr("href");
|
|
332
365
|
if (href && href.trim() !== "") {
|
|
333
366
|
try {
|
|
334
|
-
const urlObj = new URL(href,
|
|
367
|
+
const urlObj = new URL(href, docBase);
|
|
335
368
|
if (!["http:", "https:", "file:"].includes(urlObj.protocol)) {
|
|
336
369
|
logger.debug(`Ignoring link with invalid protocol: ${href}`);
|
|
337
370
|
return;
|
|
@@ -405,6 +438,7 @@ const SPLITTER_MIN_CHUNK_SIZE = 500;
|
|
|
405
438
|
const SPLITTER_PREFERRED_CHUNK_SIZE = 1500;
|
|
406
439
|
const SPLITTER_MAX_CHUNK_SIZE = 5e3;
|
|
407
440
|
const EMBEDDING_BATCH_SIZE = 100;
|
|
441
|
+
const EMBEDDING_BATCH_CHARS = 5e4;
|
|
408
442
|
const MIGRATION_MAX_RETRIES = 5;
|
|
409
443
|
const MIGRATION_RETRY_DELAY_MS = 300;
|
|
410
444
|
var ScrapeMode = /* @__PURE__ */ ((ScrapeMode2) => {
|
|
@@ -1319,8 +1353,15 @@ class ListLibrariesTool {
|
|
|
1319
1353
|
const rawLibraries = await this.docService.listLibraries();
|
|
1320
1354
|
const libraries = rawLibraries.map(({ library, versions }) => ({
|
|
1321
1355
|
name: library,
|
|
1322
|
-
versions
|
|
1323
|
-
|
|
1356
|
+
versions: versions.map((v) => ({
|
|
1357
|
+
version: v.ref.version,
|
|
1358
|
+
documentCount: v.counts.documents,
|
|
1359
|
+
uniqueUrlCount: v.counts.uniqueUrls,
|
|
1360
|
+
indexedAt: v.indexedAt,
|
|
1361
|
+
status: v.status,
|
|
1362
|
+
...v.progress ? { progress: v.progress } : void 0,
|
|
1363
|
+
sourceUrl: v.sourceUrl
|
|
1364
|
+
}))
|
|
1324
1365
|
}));
|
|
1325
1366
|
return { libraries };
|
|
1326
1367
|
}
|
|
@@ -1400,7 +1441,8 @@ class ScrapeTool {
|
|
|
1400
1441
|
}
|
|
1401
1442
|
internalVersion = internalVersion.toLowerCase();
|
|
1402
1443
|
const pipeline = this.pipeline;
|
|
1403
|
-
const
|
|
1444
|
+
const enqueueVersion = internalVersion === "" ? null : internalVersion;
|
|
1445
|
+
const jobId = await pipeline.enqueueJob(library, enqueueVersion, {
|
|
1404
1446
|
url,
|
|
1405
1447
|
library,
|
|
1406
1448
|
version: internalVersion,
|
|
@@ -1447,13 +1489,13 @@ class SearchTool {
|
|
|
1447
1489
|
await this.docService.validateLibraryExists(library);
|
|
1448
1490
|
const allLibraries = await this.docService.listLibraries();
|
|
1449
1491
|
const libraryInfo = allLibraries.find((lib) => lib.library === library);
|
|
1450
|
-
const detailedVersions = libraryInfo ? libraryInfo.versions
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
);
|
|
1492
|
+
const detailedVersions = libraryInfo ? libraryInfo.versions.map((v) => ({
|
|
1493
|
+
version: v.ref.version,
|
|
1494
|
+
documentCount: v.counts.documents,
|
|
1495
|
+
uniqueUrlCount: v.counts.uniqueUrls,
|
|
1496
|
+
indexedAt: v.indexedAt
|
|
1497
|
+
})) : [];
|
|
1498
|
+
throw new VersionNotFoundError(library, version2 ?? "latest", detailedVersions);
|
|
1457
1499
|
}
|
|
1458
1500
|
const resolvedVersion = version2 || "latest";
|
|
1459
1501
|
logger.info(
|
|
@@ -2081,12 +2123,18 @@ class HttpFetcher {
|
|
|
2081
2123
|
} else {
|
|
2082
2124
|
content = Buffer.from(response.data);
|
|
2083
2125
|
}
|
|
2126
|
+
const finalUrl = (
|
|
2127
|
+
// Node follow-redirects style
|
|
2128
|
+
response.request?.res?.responseUrl || // Some adapters may expose directly
|
|
2129
|
+
response.request?.responseUrl || // Fallback to axios recorded config URL
|
|
2130
|
+
response.config?.url || source
|
|
2131
|
+
);
|
|
2084
2132
|
return {
|
|
2085
2133
|
content,
|
|
2086
2134
|
mimeType,
|
|
2087
2135
|
charset,
|
|
2088
2136
|
encoding: contentEncoding,
|
|
2089
|
-
source
|
|
2137
|
+
source: finalUrl
|
|
2090
2138
|
};
|
|
2091
2139
|
} catch (error) {
|
|
2092
2140
|
const axiosError = error;
|
|
@@ -2224,134 +2272,229 @@ async function cleanupMcpService(mcpServer) {
|
|
|
2224
2272
|
throw error;
|
|
2225
2273
|
}
|
|
2226
2274
|
}
|
|
2227
|
-
|
|
2228
|
-
|
|
2229
|
-
|
|
2230
|
-
|
|
2231
|
-
|
|
2232
|
-
|
|
2233
|
-
|
|
2234
|
-
|
|
2235
|
-
|
|
2236
|
-
|
|
2237
|
-
|
|
2238
|
-
|
|
2239
|
-
|
|
2240
|
-
|
|
2241
|
-
|
|
2242
|
-
|
|
2243
|
-
|
|
2244
|
-
|
|
2245
|
-
|
|
2246
|
-
|
|
2247
|
-
|
|
2248
|
-
|
|
2249
|
-
|
|
2250
|
-
|
|
2251
|
-
|
|
2252
|
-
|
|
2253
|
-
|
|
2254
|
-
|
|
2255
|
-
|
|
2256
|
-
|
|
2257
|
-
|
|
2258
|
-
|
|
2259
|
-
|
|
2260
|
-
|
|
2261
|
-
|
|
2262
|
-
}
|
|
2263
|
-
|
|
2264
|
-
|
|
2265
|
-
|
|
2266
|
-
|
|
2267
|
-
|
|
2268
|
-
|
|
2269
|
-
|
|
2270
|
-
|
|
2271
|
-
|
|
2272
|
-
|
|
2273
|
-
|
|
2274
|
-
|
|
2275
|
-
|
|
2276
|
-
|
|
2277
|
-
|
|
2278
|
-
|
|
2279
|
-
|
|
2280
|
-
|
|
2281
|
-
|
|
2282
|
-
|
|
2283
|
-
|
|
2284
|
-
|
|
2285
|
-
|
|
2286
|
-
|
|
2287
|
-
|
|
2288
|
-
try {
|
|
2289
|
-
const { status } = request.query;
|
|
2290
|
-
const jobs = await this.pipeline.getJobs(status);
|
|
2291
|
-
return reply.send({ jobs });
|
|
2292
|
-
} catch (error) {
|
|
2293
|
-
logger.error(`API: Failed to get jobs: ${error}`);
|
|
2294
|
-
return reply.status(500).send({
|
|
2295
|
-
error: error instanceof Error ? error.message : String(error)
|
|
2296
|
-
});
|
|
2297
|
-
}
|
|
2275
|
+
const t$1 = initTRPC.context().create();
|
|
2276
|
+
const nonEmptyTrimmed = z$1.string().transform((s) => s.trim()).refine((s) => s.length > 0, "must not be empty");
|
|
2277
|
+
const optionalTrimmed = z$1.preprocess(
|
|
2278
|
+
(v) => typeof v === "string" ? v.trim() : v,
|
|
2279
|
+
z$1.string().min(1).optional().nullable()
|
|
2280
|
+
);
|
|
2281
|
+
const enqueueInput = z$1.object({
|
|
2282
|
+
library: nonEmptyTrimmed,
|
|
2283
|
+
version: optionalTrimmed,
|
|
2284
|
+
options: z$1.custom()
|
|
2285
|
+
});
|
|
2286
|
+
const jobIdInput = z$1.object({ id: z$1.string().min(1) });
|
|
2287
|
+
const getJobsInput = z$1.object({
|
|
2288
|
+
status: z$1.nativeEnum(PipelineJobStatus).optional()
|
|
2289
|
+
});
|
|
2290
|
+
function createPipelineRouter(trpc) {
|
|
2291
|
+
const tt = trpc;
|
|
2292
|
+
return tt.router({
|
|
2293
|
+
enqueueJob: tt.procedure.input(enqueueInput).mutation(
|
|
2294
|
+
async ({
|
|
2295
|
+
ctx,
|
|
2296
|
+
input
|
|
2297
|
+
}) => {
|
|
2298
|
+
const jobId = await ctx.pipeline.enqueueJob(
|
|
2299
|
+
input.library,
|
|
2300
|
+
input.version ?? null,
|
|
2301
|
+
input.options
|
|
2302
|
+
);
|
|
2303
|
+
return { jobId };
|
|
2304
|
+
}
|
|
2305
|
+
),
|
|
2306
|
+
getJob: tt.procedure.input(jobIdInput).query(
|
|
2307
|
+
async ({
|
|
2308
|
+
ctx,
|
|
2309
|
+
input
|
|
2310
|
+
}) => {
|
|
2311
|
+
return ctx.pipeline.getJob(input.id);
|
|
2312
|
+
}
|
|
2313
|
+
),
|
|
2314
|
+
getJobs: tt.procedure.input(getJobsInput.optional()).query(
|
|
2315
|
+
async ({
|
|
2316
|
+
ctx,
|
|
2317
|
+
input
|
|
2318
|
+
}) => {
|
|
2319
|
+
const jobs = await ctx.pipeline.getJobs(input?.status);
|
|
2320
|
+
return { jobs };
|
|
2321
|
+
}
|
|
2322
|
+
),
|
|
2323
|
+
cancelJob: tt.procedure.input(jobIdInput).mutation(
|
|
2324
|
+
async ({
|
|
2325
|
+
ctx,
|
|
2326
|
+
input
|
|
2327
|
+
}) => {
|
|
2328
|
+
await ctx.pipeline.cancelJob(input.id);
|
|
2329
|
+
return { success: true };
|
|
2330
|
+
}
|
|
2331
|
+
),
|
|
2332
|
+
clearCompletedJobs: tt.procedure.mutation(
|
|
2333
|
+
async ({ ctx }) => {
|
|
2334
|
+
const count = await ctx.pipeline.clearCompletedJobs();
|
|
2335
|
+
return { count };
|
|
2298
2336
|
}
|
|
2299
|
-
)
|
|
2300
|
-
|
|
2301
|
-
|
|
2302
|
-
|
|
2303
|
-
|
|
2304
|
-
|
|
2305
|
-
|
|
2306
|
-
|
|
2307
|
-
|
|
2308
|
-
|
|
2309
|
-
|
|
2310
|
-
|
|
2311
|
-
|
|
2312
|
-
|
|
2313
|
-
|
|
2314
|
-
|
|
2315
|
-
|
|
2337
|
+
)
|
|
2338
|
+
});
|
|
2339
|
+
}
|
|
2340
|
+
createPipelineRouter(t$1);
|
|
2341
|
+
const t = initTRPC.context().create();
|
|
2342
|
+
const nonEmpty = z$1.string().min(1).transform((s) => s.trim());
|
|
2343
|
+
const optionalVersion = z$1.string().optional().nullable().transform((v) => typeof v === "string" ? v.trim() : v);
|
|
2344
|
+
function createDataRouter(trpc) {
|
|
2345
|
+
const tt = trpc;
|
|
2346
|
+
return tt.router({
|
|
2347
|
+
listLibraries: tt.procedure.query(async ({ ctx }) => {
|
|
2348
|
+
return await ctx.docService.listLibraries();
|
|
2349
|
+
}),
|
|
2350
|
+
findBestVersion: tt.procedure.input(z$1.object({ library: nonEmpty, targetVersion: z$1.string().optional() })).query(
|
|
2351
|
+
async ({
|
|
2352
|
+
ctx,
|
|
2353
|
+
input
|
|
2354
|
+
}) => {
|
|
2355
|
+
const result = await ctx.docService.findBestVersion(
|
|
2356
|
+
input.library,
|
|
2357
|
+
input.targetVersion
|
|
2358
|
+
);
|
|
2359
|
+
return result;
|
|
2360
|
+
}
|
|
2361
|
+
),
|
|
2362
|
+
validateLibraryExists: tt.procedure.input(z$1.object({ library: nonEmpty })).mutation(
|
|
2363
|
+
async ({ ctx, input }) => {
|
|
2364
|
+
await ctx.docService.validateLibraryExists(input.library);
|
|
2365
|
+
return { ok: true };
|
|
2366
|
+
}
|
|
2367
|
+
),
|
|
2368
|
+
search: tt.procedure.input(
|
|
2369
|
+
z$1.object({
|
|
2370
|
+
library: nonEmpty,
|
|
2371
|
+
version: optionalVersion,
|
|
2372
|
+
query: nonEmpty,
|
|
2373
|
+
limit: z$1.number().int().positive().max(50).optional()
|
|
2374
|
+
})
|
|
2375
|
+
).query(
|
|
2376
|
+
async ({
|
|
2377
|
+
ctx,
|
|
2378
|
+
input
|
|
2379
|
+
}) => {
|
|
2380
|
+
const results = await ctx.docService.searchStore(
|
|
2381
|
+
input.library,
|
|
2382
|
+
input.version ?? null,
|
|
2383
|
+
input.query,
|
|
2384
|
+
input.limit ?? 5
|
|
2385
|
+
);
|
|
2386
|
+
return results;
|
|
2387
|
+
}
|
|
2388
|
+
),
|
|
2389
|
+
removeAllDocuments: tt.procedure.input(z$1.object({ library: nonEmpty, version: optionalVersion })).mutation(
|
|
2390
|
+
async ({
|
|
2391
|
+
ctx,
|
|
2392
|
+
input
|
|
2393
|
+
}) => {
|
|
2394
|
+
await ctx.docService.removeAllDocuments(input.library, input.version ?? null);
|
|
2395
|
+
return { ok: true };
|
|
2396
|
+
}
|
|
2397
|
+
),
|
|
2398
|
+
// Status and version helpers
|
|
2399
|
+
getVersionsByStatus: tt.procedure.input(z$1.object({ statuses: z$1.array(z$1.string()) })).query(
|
|
2400
|
+
async ({
|
|
2401
|
+
ctx,
|
|
2402
|
+
input
|
|
2403
|
+
}) => {
|
|
2404
|
+
const statuses = input.statuses;
|
|
2405
|
+
return await ctx.docService.getVersionsByStatus(
|
|
2406
|
+
statuses
|
|
2407
|
+
);
|
|
2316
2408
|
}
|
|
2317
|
-
)
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
|
|
2321
|
-
|
|
2322
|
-
|
|
2323
|
-
|
|
2324
|
-
|
|
2325
|
-
|
|
2326
|
-
|
|
2327
|
-
|
|
2328
|
-
|
|
2329
|
-
|
|
2330
|
-
|
|
2331
|
-
|
|
2409
|
+
),
|
|
2410
|
+
findVersionsBySourceUrl: tt.procedure.input(z$1.object({ url: nonEmpty })).query(async ({ ctx, input }) => {
|
|
2411
|
+
return await ctx.docService.findVersionsBySourceUrl(
|
|
2412
|
+
input.url
|
|
2413
|
+
);
|
|
2414
|
+
}),
|
|
2415
|
+
getScraperOptions: tt.procedure.input(z$1.object({ versionId: z$1.number().int().positive() })).query(
|
|
2416
|
+
async ({
|
|
2417
|
+
ctx,
|
|
2418
|
+
input
|
|
2419
|
+
}) => {
|
|
2420
|
+
return await ctx.docService.getScraperOptions(input.versionId);
|
|
2421
|
+
}
|
|
2422
|
+
),
|
|
2423
|
+
updateVersionStatus: tt.procedure.input(
|
|
2424
|
+
z$1.object({
|
|
2425
|
+
versionId: z$1.number().int().positive(),
|
|
2426
|
+
status: z$1.string(),
|
|
2427
|
+
errorMessage: z$1.string().optional().nullable()
|
|
2428
|
+
})
|
|
2429
|
+
).mutation(
|
|
2430
|
+
async ({
|
|
2431
|
+
ctx,
|
|
2432
|
+
input
|
|
2433
|
+
}) => {
|
|
2434
|
+
await ctx.docService.updateVersionStatus(
|
|
2435
|
+
input.versionId,
|
|
2436
|
+
input.status,
|
|
2437
|
+
input.errorMessage ?? void 0
|
|
2438
|
+
);
|
|
2439
|
+
return { ok: true };
|
|
2440
|
+
}
|
|
2441
|
+
),
|
|
2442
|
+
updateVersionProgress: tt.procedure.input(
|
|
2443
|
+
z$1.object({
|
|
2444
|
+
versionId: z$1.number().int().positive(),
|
|
2445
|
+
pages: z$1.number().int().nonnegative(),
|
|
2446
|
+
maxPages: z$1.number().int().positive()
|
|
2447
|
+
})
|
|
2448
|
+
).mutation(
|
|
2449
|
+
async ({
|
|
2450
|
+
ctx,
|
|
2451
|
+
input
|
|
2452
|
+
}) => {
|
|
2453
|
+
await ctx.docService.updateVersionProgress(
|
|
2454
|
+
input.versionId,
|
|
2455
|
+
input.pages,
|
|
2456
|
+
input.maxPages
|
|
2457
|
+
);
|
|
2458
|
+
return { ok: true };
|
|
2332
2459
|
}
|
|
2333
|
-
)
|
|
2334
|
-
|
|
2335
|
-
|
|
2336
|
-
|
|
2337
|
-
|
|
2338
|
-
|
|
2339
|
-
|
|
2340
|
-
|
|
2341
|
-
|
|
2342
|
-
|
|
2343
|
-
|
|
2344
|
-
|
|
2345
|
-
|
|
2346
|
-
|
|
2460
|
+
),
|
|
2461
|
+
storeScraperOptions: tt.procedure.input(
|
|
2462
|
+
z$1.object({
|
|
2463
|
+
versionId: z$1.number().int().positive(),
|
|
2464
|
+
options: z$1.unknown()
|
|
2465
|
+
})
|
|
2466
|
+
).mutation(
|
|
2467
|
+
async ({
|
|
2468
|
+
ctx,
|
|
2469
|
+
input
|
|
2470
|
+
}) => {
|
|
2471
|
+
await ctx.docService.storeScraperOptions(
|
|
2472
|
+
input.versionId,
|
|
2473
|
+
input.options
|
|
2474
|
+
);
|
|
2475
|
+
return { ok: true };
|
|
2347
2476
|
}
|
|
2348
|
-
)
|
|
2349
|
-
|
|
2350
|
-
}
|
|
2477
|
+
)
|
|
2478
|
+
});
|
|
2351
2479
|
}
|
|
2352
|
-
|
|
2353
|
-
|
|
2354
|
-
|
|
2480
|
+
createDataRouter(t);
|
|
2481
|
+
async function registerTrpcService(server, pipeline, docService) {
|
|
2482
|
+
const t2 = initTRPC.context().create();
|
|
2483
|
+
const healthRouter = t2.router({
|
|
2484
|
+
ping: t2.procedure.query(async () => ({ status: "ok", ts: Date.now() }))
|
|
2485
|
+
});
|
|
2486
|
+
const router = t2.mergeRouters(
|
|
2487
|
+
healthRouter,
|
|
2488
|
+
createPipelineRouter(t2),
|
|
2489
|
+
createDataRouter(t2)
|
|
2490
|
+
);
|
|
2491
|
+
await server.register(fastifyTRPCPlugin, {
|
|
2492
|
+
prefix: "/api",
|
|
2493
|
+
trpcOptions: {
|
|
2494
|
+
router,
|
|
2495
|
+
createContext: async () => ({ pipeline, docService })
|
|
2496
|
+
}
|
|
2497
|
+
});
|
|
2355
2498
|
}
|
|
2356
2499
|
const Layout = ({ title, version: version2, children }) => {
|
|
2357
2500
|
let versionString = version2;
|
|
@@ -2522,7 +2665,7 @@ function normalizeVersionName(name) {
|
|
|
2522
2665
|
return name ?? "";
|
|
2523
2666
|
}
|
|
2524
2667
|
function denormalizeVersionName(name) {
|
|
2525
|
-
return name === "" ?
|
|
2668
|
+
return name === "" ? "" : name;
|
|
2526
2669
|
}
|
|
2527
2670
|
function getStatusDescription(status) {
|
|
2528
2671
|
const descriptions = {
|
|
@@ -3426,8 +3569,8 @@ const VersionDetailsRow = ({
|
|
|
3426
3569
|
// Default to true
|
|
3427
3570
|
}) => {
|
|
3428
3571
|
const indexedDate = version2.indexedAt ? new Date(version2.indexedAt).toLocaleDateString() : "N/A";
|
|
3429
|
-
const versionLabel = version2.version || "Unversioned";
|
|
3430
|
-
const versionParam = version2.version || "";
|
|
3572
|
+
const versionLabel = version2.ref.version || "Unversioned";
|
|
3573
|
+
const versionParam = version2.ref.version || "";
|
|
3431
3574
|
const sanitizedLibraryName = libraryName.replace(/[^a-zA-Z0-9-_]/g, "-");
|
|
3432
3575
|
const sanitizedVersionParam = versionParam.replace(/[^a-zA-Z0-9-_]/g, "-");
|
|
3433
3576
|
const rowId = `row-${sanitizedLibraryName}-${sanitizedVersionParam}`;
|
|
@@ -3446,19 +3589,19 @@ const VersionDetailsRow = ({
|
|
|
3446
3589
|
{
|
|
3447
3590
|
class: "text-sm text-gray-900 dark:text-white w-1/4 truncate",
|
|
3448
3591
|
title: versionLabel,
|
|
3449
|
-
children: version2.version ? /* @__PURE__ */ jsx(VersionBadge, { version: version2.version }) : /* @__PURE__ */ jsx("span", { children: "Unversioned" })
|
|
3592
|
+
children: version2.ref.version ? /* @__PURE__ */ jsx(VersionBadge, { version: version2.ref.version }) : /* @__PURE__ */ jsx("span", { children: "Unversioned" })
|
|
3450
3593
|
}
|
|
3451
3594
|
),
|
|
3452
3595
|
/* @__PURE__ */ jsxs("div", { class: "flex space-x-2 text-sm text-gray-600 dark:text-gray-400 w-3/4 justify-end items-center", children: [
|
|
3453
3596
|
/* @__PURE__ */ jsxs("span", { title: "Number of unique pages indexed", children: [
|
|
3454
3597
|
"Pages:",
|
|
3455
3598
|
" ",
|
|
3456
|
-
/* @__PURE__ */ jsx("span", { class: "font-semibold", safe: true, children: version2.
|
|
3599
|
+
/* @__PURE__ */ jsx("span", { class: "font-semibold", safe: true, children: version2.counts.uniqueUrls.toLocaleString() })
|
|
3457
3600
|
] }),
|
|
3458
3601
|
/* @__PURE__ */ jsxs("span", { title: "Number of indexed snippets", children: [
|
|
3459
3602
|
"Snippets:",
|
|
3460
3603
|
" ",
|
|
3461
|
-
/* @__PURE__ */ jsx("span", { class: "font-semibold", safe: true, children: version2.
|
|
3604
|
+
/* @__PURE__ */ jsx("span", { class: "font-semibold", safe: true, children: version2.counts.documents.toLocaleString() })
|
|
3462
3605
|
] }),
|
|
3463
3606
|
/* @__PURE__ */ jsxs("span", { title: "Date last indexed", children: [
|
|
3464
3607
|
"Last Update:",
|
|
@@ -3558,17 +3701,28 @@ const LibraryDetailCard = ({ library }) => (
|
|
|
3558
3701
|
// Use Flowbite Card structure with updated padding and border, and white background
|
|
3559
3702
|
/* @__PURE__ */ jsxs("div", { class: "block p-4 bg-white dark:bg-gray-800 rounded-lg shadow-sm border border-gray-300 dark:border-gray-600 mb-4", children: [
|
|
3560
3703
|
/* @__PURE__ */ jsx("h3", { class: "text-lg font-medium text-gray-900 dark:text-white mb-1", children: /* @__PURE__ */ jsx("span", { safe: true, children: library.name }) }),
|
|
3561
|
-
/* @__PURE__ */ jsx("div", { class: "mt-1", children: library.versions.length > 0 ? library.versions.map((
|
|
3562
|
-
|
|
3563
|
-
|
|
3564
|
-
|
|
3565
|
-
|
|
3566
|
-
|
|
3567
|
-
|
|
3568
|
-
|
|
3569
|
-
|
|
3570
|
-
|
|
3571
|
-
|
|
3704
|
+
/* @__PURE__ */ jsx("div", { class: "mt-1", children: library.versions.length > 0 ? library.versions.map((v) => {
|
|
3705
|
+
const adapted = {
|
|
3706
|
+
id: -1,
|
|
3707
|
+
ref: { library: library.name, version: v.version },
|
|
3708
|
+
status: v.status,
|
|
3709
|
+
progress: v.progress,
|
|
3710
|
+
counts: {
|
|
3711
|
+
documents: v.documentCount,
|
|
3712
|
+
uniqueUrls: v.uniqueUrlCount
|
|
3713
|
+
},
|
|
3714
|
+
indexedAt: v.indexedAt,
|
|
3715
|
+
sourceUrl: v.sourceUrl ?? void 0
|
|
3716
|
+
};
|
|
3717
|
+
return /* @__PURE__ */ jsx(
|
|
3718
|
+
VersionDetailsRow,
|
|
3719
|
+
{
|
|
3720
|
+
libraryName: library.name,
|
|
3721
|
+
version: adapted,
|
|
3722
|
+
showDelete: false
|
|
3723
|
+
}
|
|
3724
|
+
);
|
|
3725
|
+
}) : /* @__PURE__ */ jsx("p", { class: "text-sm text-gray-500 dark:text-gray-400 italic", children: "No versions indexed." }) })
|
|
3572
3726
|
] })
|
|
3573
3727
|
);
|
|
3574
3728
|
const LibrarySearchCard = ({ library }) => {
|
|
@@ -3733,7 +3887,21 @@ const LibraryItem = ({ library }) => (
|
|
|
3733
3887
|
children: /* @__PURE__ */ jsx("span", { safe: true, children: library.name })
|
|
3734
3888
|
}
|
|
3735
3889
|
) }),
|
|
3736
|
-
/* @__PURE__ */ jsx("div", { class: "mt-1", children: library.versions.length > 0 ? library.versions.map((
|
|
3890
|
+
/* @__PURE__ */ jsx("div", { class: "mt-1", children: library.versions.length > 0 ? library.versions.map((v) => {
|
|
3891
|
+
const adapted = {
|
|
3892
|
+
id: -1,
|
|
3893
|
+
ref: { library: library.name, version: v.version },
|
|
3894
|
+
status: v.status,
|
|
3895
|
+
progress: v.progress,
|
|
3896
|
+
counts: {
|
|
3897
|
+
documents: v.documentCount,
|
|
3898
|
+
uniqueUrls: v.uniqueUrlCount
|
|
3899
|
+
},
|
|
3900
|
+
indexedAt: v.indexedAt,
|
|
3901
|
+
sourceUrl: v.sourceUrl ?? void 0
|
|
3902
|
+
};
|
|
3903
|
+
return /* @__PURE__ */ jsx(VersionDetailsRow, { libraryName: library.name, version: adapted });
|
|
3904
|
+
}) : (
|
|
3737
3905
|
// Display message if no versions are indexed
|
|
3738
3906
|
/* @__PURE__ */ jsx("p", { class: "text-sm text-gray-500 dark:text-gray-400 italic", children: "No versions indexed." })
|
|
3739
3907
|
) })
|
|
@@ -3861,9 +4029,9 @@ class AppServer {
|
|
|
3861
4029
|
);
|
|
3862
4030
|
}
|
|
3863
4031
|
}
|
|
3864
|
-
if (this.config.enableWorker && !this.config.
|
|
4032
|
+
if (this.config.enableWorker && !this.config.enableApiServer) {
|
|
3865
4033
|
logger.warn(
|
|
3866
|
-
"Warning: Worker is enabled but
|
|
4034
|
+
"Warning: Worker is enabled but API server is disabled. Consider enabling the API for better observability."
|
|
3867
4035
|
);
|
|
3868
4036
|
}
|
|
3869
4037
|
}
|
|
@@ -3915,8 +4083,8 @@ class AppServer {
|
|
|
3915
4083
|
if (this.config.enableMcpServer) {
|
|
3916
4084
|
await this.enableMcpServer();
|
|
3917
4085
|
}
|
|
3918
|
-
if (this.config.
|
|
3919
|
-
await this.
|
|
4086
|
+
if (this.config.enableApiServer) {
|
|
4087
|
+
await this.enableTrpcApi();
|
|
3920
4088
|
}
|
|
3921
4089
|
if (this.config.enableWorker) {
|
|
3922
4090
|
await this.enableWorker();
|
|
@@ -3944,11 +4112,11 @@ class AppServer {
|
|
|
3944
4112
|
logger.debug("MCP server service enabled");
|
|
3945
4113
|
}
|
|
3946
4114
|
/**
|
|
3947
|
-
* Enable Pipeline
|
|
4115
|
+
* Enable Pipeline RPC (tRPC) service.
|
|
3948
4116
|
*/
|
|
3949
|
-
async
|
|
3950
|
-
await
|
|
3951
|
-
logger.debug("
|
|
4117
|
+
async enableTrpcApi() {
|
|
4118
|
+
await registerTrpcService(this.server, this.pipeline, this.docService);
|
|
4119
|
+
logger.debug("API server (tRPC) enabled");
|
|
3952
4120
|
}
|
|
3953
4121
|
/**
|
|
3954
4122
|
* Enable worker service.
|
|
@@ -3977,10 +4145,10 @@ class AppServer {
|
|
|
3977
4145
|
enabledServices.push(`Web interface: ${address}`);
|
|
3978
4146
|
}
|
|
3979
4147
|
if (this.config.enableMcpServer) {
|
|
3980
|
-
enabledServices.push(`MCP
|
|
4148
|
+
enabledServices.push(`MCP endpoints: ${address}/mcp, ${address}/sse`);
|
|
3981
4149
|
}
|
|
3982
|
-
if (this.config.
|
|
3983
|
-
enabledServices.push(`
|
|
4150
|
+
if (this.config.enableApiServer) {
|
|
4151
|
+
enabledServices.push(`API: ${address}/api`);
|
|
3984
4152
|
}
|
|
3985
4153
|
if (this.config.enableWorker) {
|
|
3986
4154
|
enabledServices.push("Embedded worker: enabled");
|
|
@@ -4005,6 +4173,161 @@ async function startStdioServer(tools) {
|
|
|
4005
4173
|
logger.info("🤖 MCP server listening on stdio");
|
|
4006
4174
|
return server;
|
|
4007
4175
|
}
|
|
4176
|
+
class StoreError extends Error {
|
|
4177
|
+
constructor(message, cause) {
|
|
4178
|
+
super(cause ? `${message} caused by ${cause}` : message);
|
|
4179
|
+
this.cause = cause;
|
|
4180
|
+
this.name = this.constructor.name;
|
|
4181
|
+
const causeError = cause instanceof Error ? cause : cause ? new Error(String(cause)) : void 0;
|
|
4182
|
+
if (causeError?.stack) {
|
|
4183
|
+
this.stack = causeError.stack;
|
|
4184
|
+
}
|
|
4185
|
+
}
|
|
4186
|
+
}
|
|
4187
|
+
class DimensionError extends StoreError {
|
|
4188
|
+
constructor(modelName, modelDimension, dbDimension) {
|
|
4189
|
+
super(
|
|
4190
|
+
`Model "${modelName}" produces ${modelDimension}-dimensional vectors, which exceeds the database's fixed dimension of ${dbDimension}. Please use a model with dimension ≤ ${dbDimension}.`
|
|
4191
|
+
);
|
|
4192
|
+
this.modelName = modelName;
|
|
4193
|
+
this.modelDimension = modelDimension;
|
|
4194
|
+
this.dbDimension = dbDimension;
|
|
4195
|
+
}
|
|
4196
|
+
}
|
|
4197
|
+
class ConnectionError extends StoreError {
|
|
4198
|
+
}
|
|
4199
|
+
const MIGRATIONS_DIR = path.join(getProjectRoot(), "db", "migrations");
|
|
4200
|
+
const MIGRATIONS_TABLE = "_schema_migrations";
|
|
4201
|
+
function ensureMigrationsTable(db) {
|
|
4202
|
+
db.exec(`
|
|
4203
|
+
CREATE TABLE IF NOT EXISTS ${MIGRATIONS_TABLE} (
|
|
4204
|
+
id TEXT PRIMARY KEY,
|
|
4205
|
+
applied_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
|
4206
|
+
);
|
|
4207
|
+
`);
|
|
4208
|
+
}
|
|
4209
|
+
function getAppliedMigrations(db) {
|
|
4210
|
+
const stmt = db.prepare(`SELECT id FROM ${MIGRATIONS_TABLE}`);
|
|
4211
|
+
const rows = stmt.all();
|
|
4212
|
+
return new Set(rows.map((row) => row.id));
|
|
4213
|
+
}
|
|
4214
|
+
async function applyMigrations(db) {
|
|
4215
|
+
try {
|
|
4216
|
+
db.pragma("journal_mode = OFF");
|
|
4217
|
+
db.pragma("synchronous = OFF");
|
|
4218
|
+
db.pragma("mmap_size = 268435456");
|
|
4219
|
+
db.pragma("cache_size = -64000");
|
|
4220
|
+
db.pragma("temp_store = MEMORY");
|
|
4221
|
+
logger.debug("Applied performance optimizations for migration");
|
|
4222
|
+
} catch (_error) {
|
|
4223
|
+
logger.warn("⚠️ Could not apply all performance optimizations for migration");
|
|
4224
|
+
}
|
|
4225
|
+
const overallTransaction = db.transaction(() => {
|
|
4226
|
+
logger.debug("Checking database migrations...");
|
|
4227
|
+
ensureMigrationsTable(db);
|
|
4228
|
+
const appliedMigrations = getAppliedMigrations(db);
|
|
4229
|
+
if (!fs$1.existsSync(MIGRATIONS_DIR)) {
|
|
4230
|
+
throw new StoreError("Migrations directory not found");
|
|
4231
|
+
}
|
|
4232
|
+
const migrationFiles = fs$1.readdirSync(MIGRATIONS_DIR).filter((file) => file.endsWith(".sql")).sort();
|
|
4233
|
+
const pendingMigrations = migrationFiles.filter(
|
|
4234
|
+
(filename) => !appliedMigrations.has(filename)
|
|
4235
|
+
);
|
|
4236
|
+
if (pendingMigrations.length > 0) {
|
|
4237
|
+
logger.info(`🔄 Applying ${pendingMigrations.length} database migration(s)...`);
|
|
4238
|
+
}
|
|
4239
|
+
let appliedCount = 0;
|
|
4240
|
+
for (const filename of pendingMigrations) {
|
|
4241
|
+
logger.debug(`Applying migration: ${filename}`);
|
|
4242
|
+
const filePath = path.join(MIGRATIONS_DIR, filename);
|
|
4243
|
+
const sql = fs$1.readFileSync(filePath, "utf8");
|
|
4244
|
+
try {
|
|
4245
|
+
db.exec(sql);
|
|
4246
|
+
const insertStmt = db.prepare(`INSERT INTO ${MIGRATIONS_TABLE} (id) VALUES (?)`);
|
|
4247
|
+
insertStmt.run(filename);
|
|
4248
|
+
logger.debug(`✅ Applied migration: ${filename}`);
|
|
4249
|
+
appliedCount++;
|
|
4250
|
+
} catch (error) {
|
|
4251
|
+
logger.error(`❌ Failed to apply migration: ${filename} - ${error}`);
|
|
4252
|
+
throw new StoreError(`Migration failed: ${filename}`, error);
|
|
4253
|
+
}
|
|
4254
|
+
}
|
|
4255
|
+
if (appliedCount > 0) {
|
|
4256
|
+
logger.info(`✅ Successfully applied ${appliedCount} migration(s)`);
|
|
4257
|
+
} else {
|
|
4258
|
+
logger.debug("Database schema is up to date");
|
|
4259
|
+
}
|
|
4260
|
+
return appliedCount;
|
|
4261
|
+
});
|
|
4262
|
+
let retries = 0;
|
|
4263
|
+
let appliedMigrationsCount = 0;
|
|
4264
|
+
while (true) {
|
|
4265
|
+
try {
|
|
4266
|
+
appliedMigrationsCount = overallTransaction.immediate();
|
|
4267
|
+
logger.debug("Database migrations completed successfully");
|
|
4268
|
+
if (appliedMigrationsCount > 0) {
|
|
4269
|
+
try {
|
|
4270
|
+
logger.debug(
|
|
4271
|
+
`Running VACUUM after applying ${appliedMigrationsCount} migration(s)...`
|
|
4272
|
+
);
|
|
4273
|
+
db.exec("VACUUM");
|
|
4274
|
+
logger.debug("Database vacuum completed successfully");
|
|
4275
|
+
} catch (error) {
|
|
4276
|
+
logger.warn(`⚠️ Could not vacuum database after migrations: ${error}`);
|
|
4277
|
+
}
|
|
4278
|
+
} else {
|
|
4279
|
+
logger.debug("Skipping VACUUM - no migrations were applied");
|
|
4280
|
+
}
|
|
4281
|
+
break;
|
|
4282
|
+
} catch (error) {
|
|
4283
|
+
if (error?.code === "SQLITE_BUSY" && retries < MIGRATION_MAX_RETRIES) {
|
|
4284
|
+
retries++;
|
|
4285
|
+
logger.warn(
|
|
4286
|
+
`⚠️ Migrations busy (SQLITE_BUSY), retrying attempt ${retries}/${MIGRATION_MAX_RETRIES} in ${MIGRATION_RETRY_DELAY_MS}ms...`
|
|
4287
|
+
);
|
|
4288
|
+
await new Promise((resolve) => setTimeout(resolve, MIGRATION_RETRY_DELAY_MS));
|
|
4289
|
+
} else {
|
|
4290
|
+
if (error?.code === "SQLITE_BUSY") {
|
|
4291
|
+
logger.error(
|
|
4292
|
+
`❌ Migrations still busy after ${MIGRATION_MAX_RETRIES} retries. Giving up: ${error}`
|
|
4293
|
+
);
|
|
4294
|
+
}
|
|
4295
|
+
if (error instanceof StoreError) {
|
|
4296
|
+
throw error;
|
|
4297
|
+
}
|
|
4298
|
+
throw new StoreError("Failed during migration process", error);
|
|
4299
|
+
}
|
|
4300
|
+
}
|
|
4301
|
+
}
|
|
4302
|
+
try {
|
|
4303
|
+
db.pragma("journal_mode = WAL");
|
|
4304
|
+
db.pragma("wal_autocheckpoint = 1000");
|
|
4305
|
+
db.pragma("busy_timeout = 30000");
|
|
4306
|
+
db.pragma("foreign_keys = ON");
|
|
4307
|
+
db.pragma("synchronous = NORMAL");
|
|
4308
|
+
logger.debug(
|
|
4309
|
+
"Applied production database configuration (WAL mode, autocheckpoint, foreign keys, busy timeout)"
|
|
4310
|
+
);
|
|
4311
|
+
} catch (_error) {
|
|
4312
|
+
logger.warn("⚠️ Could not apply all production database settings");
|
|
4313
|
+
}
|
|
4314
|
+
}
|
|
4315
|
+
/**
 * Builds and initializes a document-management backend.
 *
 * With `options.serverUrl` set, a `DocumentManagementClient` is constructed
 * for that URL; otherwise a `DocumentManagementService` is constructed with no
 * arguments. In both cases `initialize()` is awaited before the instance is
 * returned. The implementation module is loaded lazily via dynamic `import()`,
 * so only the variant actually needed is loaded.
 *
 * @param {{ serverUrl?: string }} [options] - Optional settings.
 * @returns {Promise<object>} The initialized client or service instance.
 */
async function createDocumentManagement(options = {}) {
  const remoteUrl = options.serverUrl;
  if (!remoteUrl) {
    // No server URL: fall back to the in-process service.
    const serviceModule = await import("./DocumentManagementService-BH02TJEe.js");
    const localService = new serviceModule.DocumentManagementService();
    await localService.initialize();
    return localService;
  }
  const clientModule = await import("./DocumentManagementClient-CAFdDwTu.js");
  const remoteClient = new clientModule.DocumentManagementClient(remoteUrl);
  await remoteClient.initialize();
  return remoteClient;
}
|
|
4326
|
+
/**
 * Builds and initializes an in-process `DocumentManagementService`.
 *
 * The service module is loaded lazily via dynamic `import()`, the service is
 * constructed with no arguments, and `initialize()` is awaited before the
 * instance is returned.
 *
 * @returns {Promise<object>} The initialized service instance.
 */
async function createLocalDocumentManagement() {
  const serviceModule = await import("./DocumentManagementService-BH02TJEe.js");
  const localService = new serviceModule.DocumentManagementService();
  await localService.initialize();
  return localService;
}
|
|
4008
4331
|
function deserializeJob(serializedJob) {
|
|
4009
4332
|
return {
|
|
4010
4333
|
...serializedJob,
|
|
@@ -4016,21 +4339,22 @@ function deserializeJob(serializedJob) {
|
|
|
4016
4339
|
}
|
|
4017
4340
|
class PipelineClient {
|
|
4018
4341
|
baseUrl;
|
|
4342
|
+
client;
|
|
4019
4343
|
pollingInterval = 1e3;
|
|
4020
4344
|
// 1 second
|
|
4021
4345
|
activePolling = /* @__PURE__ */ new Set();
|
|
4022
4346
|
// Track jobs being polled for completion
|
|
4023
4347
|
constructor(serverUrl) {
|
|
4024
4348
|
this.baseUrl = serverUrl.replace(/\/$/, "");
|
|
4025
|
-
|
|
4349
|
+
this.client = createTRPCProxyClient({
|
|
4350
|
+
links: [httpBatchLink({ url: this.baseUrl })]
|
|
4351
|
+
});
|
|
4352
|
+
logger.debug(`PipelineClient (tRPC) created for: ${this.baseUrl}`);
|
|
4026
4353
|
}
|
|
4027
4354
|
async start() {
|
|
4028
4355
|
try {
|
|
4029
|
-
|
|
4030
|
-
|
|
4031
|
-
throw new Error(`External worker health check failed: ${response.status}`);
|
|
4032
|
-
}
|
|
4033
|
-
logger.debug("PipelineClient connected to external worker");
|
|
4356
|
+
await this.client.ping.query();
|
|
4357
|
+
logger.debug("PipelineClient connected to external worker via tRPC");
|
|
4034
4358
|
} catch (error) {
|
|
4035
4359
|
throw new Error(
|
|
4036
4360
|
`Failed to connect to external worker at ${this.baseUrl}: ${error instanceof Error ? error.message : String(error)}`
|
|
@@ -4043,25 +4367,14 @@ class PipelineClient {
|
|
|
4043
4367
|
}
|
|
4044
4368
|
async enqueueJob(library, version2, options) {
|
|
4045
4369
|
try {
|
|
4046
|
-
const
|
|
4047
|
-
|
|
4048
|
-
|
|
4049
|
-
|
|
4050
|
-
|
|
4051
|
-
body: JSON.stringify({
|
|
4052
|
-
library,
|
|
4053
|
-
version: version2,
|
|
4054
|
-
options
|
|
4055
|
-
})
|
|
4370
|
+
const normalizedVersion = typeof version2 === "string" && version2.trim().length === 0 ? null : version2 ?? null;
|
|
4371
|
+
const result = await this.client.enqueueJob.mutate({
|
|
4372
|
+
library,
|
|
4373
|
+
version: normalizedVersion,
|
|
4374
|
+
options
|
|
4056
4375
|
});
|
|
4057
|
-
|
|
4058
|
-
|
|
4059
|
-
throw new Error(`Failed to enqueue job: ${response.status} ${errorText}`);
|
|
4060
|
-
}
|
|
4061
|
-
const result = await response.json();
|
|
4062
|
-
const jobId = result.jobId;
|
|
4063
|
-
logger.debug(`Job ${jobId} enqueued successfully`);
|
|
4064
|
-
return jobId;
|
|
4376
|
+
logger.debug(`Job ${result.jobId} enqueued successfully`);
|
|
4377
|
+
return result.jobId;
|
|
4065
4378
|
} catch (error) {
|
|
4066
4379
|
throw new Error(
|
|
4067
4380
|
`Failed to enqueue job: ${error instanceof Error ? error.message : String(error)}`
|
|
@@ -4070,15 +4383,8 @@ class PipelineClient {
|
|
|
4070
4383
|
}
|
|
4071
4384
|
async getJob(jobId) {
|
|
4072
4385
|
try {
|
|
4073
|
-
const
|
|
4074
|
-
|
|
4075
|
-
return void 0;
|
|
4076
|
-
}
|
|
4077
|
-
if (!response.ok) {
|
|
4078
|
-
throw new Error(`Failed to get job: ${response.status} ${response.statusText}`);
|
|
4079
|
-
}
|
|
4080
|
-
const serializedJob = await response.json();
|
|
4081
|
-
return deserializeJob(serializedJob);
|
|
4386
|
+
const serializedJob = await this.client.getJob.query({ id: jobId });
|
|
4387
|
+
return serializedJob ? deserializeJob(serializedJob) : void 0;
|
|
4082
4388
|
} catch (error) {
|
|
4083
4389
|
throw new Error(
|
|
4084
4390
|
`Failed to get job ${jobId}: ${error instanceof Error ? error.message : String(error)}`
|
|
@@ -4087,18 +4393,11 @@ class PipelineClient {
|
|
|
4087
4393
|
}
|
|
4088
4394
|
async getJobs(status) {
|
|
4089
4395
|
try {
|
|
4090
|
-
const
|
|
4091
|
-
if (status) {
|
|
4092
|
-
url.searchParams.set("status", status);
|
|
4093
|
-
}
|
|
4094
|
-
const response = await fetch(url.toString());
|
|
4095
|
-
if (!response.ok) {
|
|
4096
|
-
const errorText = await response.text();
|
|
4097
|
-
throw new Error(`Failed to get jobs: ${response.status} ${errorText}`);
|
|
4098
|
-
}
|
|
4099
|
-
const result = await response.json();
|
|
4396
|
+
const result = await this.client.getJobs.query({ status });
|
|
4100
4397
|
const serializedJobs = result.jobs || [];
|
|
4101
|
-
return serializedJobs.map(
|
|
4398
|
+
return serializedJobs.map(
|
|
4399
|
+
(j) => deserializeJob(j)
|
|
4400
|
+
);
|
|
4102
4401
|
} catch (error) {
|
|
4103
4402
|
logger.error(`Failed to get jobs from external worker: ${error}`);
|
|
4104
4403
|
throw error;
|
|
@@ -4106,13 +4405,7 @@ class PipelineClient {
|
|
|
4106
4405
|
}
|
|
4107
4406
|
async cancelJob(jobId) {
|
|
4108
4407
|
try {
|
|
4109
|
-
|
|
4110
|
-
method: "DELETE"
|
|
4111
|
-
});
|
|
4112
|
-
if (!response.ok) {
|
|
4113
|
-
const errorText = await response.text();
|
|
4114
|
-
throw new Error(`Failed to cancel job: ${response.status} ${errorText}`);
|
|
4115
|
-
}
|
|
4408
|
+
await this.client.cancelJob.mutate({ id: jobId });
|
|
4116
4409
|
logger.debug(`Job cancelled via external worker: ${jobId}`);
|
|
4117
4410
|
} catch (error) {
|
|
4118
4411
|
logger.error(`Failed to cancel job ${jobId} via external worker: ${error}`);
|
|
@@ -4121,16 +4414,7 @@ class PipelineClient {
|
|
|
4121
4414
|
}
|
|
4122
4415
|
async clearCompletedJobs() {
|
|
4123
4416
|
try {
|
|
4124
|
-
const
|
|
4125
|
-
method: "DELETE"
|
|
4126
|
-
});
|
|
4127
|
-
if (!response.ok) {
|
|
4128
|
-
const errorText = await response.text();
|
|
4129
|
-
throw new Error(
|
|
4130
|
-
`Failed to clear completed jobs: ${response.status} ${errorText}`
|
|
4131
|
-
);
|
|
4132
|
-
}
|
|
4133
|
-
const result = await response.json();
|
|
4417
|
+
const result = await this.client.clearCompletedJobs.mutate();
|
|
4134
4418
|
logger.debug(`Cleared ${result.count} completed jobs via external worker`);
|
|
4135
4419
|
return result.count || 0;
|
|
4136
4420
|
} catch (error) {
|
|
@@ -4210,17 +4494,33 @@ function validateUrl(url) {
|
|
|
4210
4494
|
throw new InvalidUrlError(url, error instanceof Error ? error : void 0);
|
|
4211
4495
|
}
|
|
4212
4496
|
}
|
|
4213
|
-
function
|
|
4214
|
-
|
|
4215
|
-
|
|
4216
|
-
|
|
4217
|
-
const
|
|
4218
|
-
|
|
4219
|
-
|
|
4497
|
+
/**
 * Derives the directory-style prefix (always ending in "/" when one can be
 * formed) for a URL pathname.
 *
 * - An empty pathname maps to "/".
 * - A pathname already ending in "/" is returned unchanged.
 * - If the final segment contains a ".", it is treated as a file name and
 *   stripped, leaving its parent directory.
 * - Otherwise the pathname is treated as a directory and "/" is appended.
 *
 * @param {string} pathname - Path component of a URL.
 * @returns {string} The base directory path.
 */
function computeBaseDirectory(pathname) {
  if (pathname === "") {
    return "/";
  }
  if (pathname.endsWith("/")) {
    return pathname;
  }
  const segments = pathname.split("/");
  const tail = segments[segments.length - 1] || "";
  // A dot in the last segment is the heuristic for "this is a file".
  return tail.includes(".") ? pathname.replace(/\/[^/]*$/, "/") : `${pathname}/`;
}
|
|
4221
|
-
function
|
|
4222
|
-
|
|
4223
|
-
|
|
4507
|
+
/**
 * Decides whether `targetUrl` falls inside the crawl scope anchored at
 * `baseUrl`.
 *
 * URLs with different protocols are never in scope. Beyond that:
 * - "subpages": same hostname, and the target path starts with the base
 *   directory computed by `computeBaseDirectory(baseUrl.pathname)`.
 * - "hostname": hostnames must be identical.
 * - "domain": the last two dot-separated labels of both hostnames must match.
 *   IPv4 literals are compared exactly instead, because their last two octets
 *   are not a registrable domain (otherwise 10.0.0.1 and 192.168.0.1 would
 *   both reduce to "0.1" and be treated as the same domain).
 * - any other scope value: not in scope.
 *
 * NOTE(review): the two-label heuristic still treats hosts under multi-part
 * public suffixes (e.g. "a.co.uk" and "b.co.uk") as one domain; fixing that
 * needs a public-suffix list.
 *
 * @param {URL} baseUrl - Scope anchor.
 * @param {URL} targetUrl - Candidate URL.
 * @param {string} scope - One of "subpages", "hostname", "domain".
 * @returns {boolean} True when the target is in scope.
 */
function isInScope(baseUrl, targetUrl, scope) {
  if (baseUrl.protocol !== targetUrl.protocol) return false;
  switch (scope) {
    case "subpages": {
      if (baseUrl.hostname !== targetUrl.hostname) return false;
      const baseDir = computeBaseDirectory(baseUrl.pathname);
      return targetUrl.pathname.startsWith(baseDir);
    }
    case "hostname":
      return baseUrl.hostname === targetUrl.hostname;
    case "domain": {
      // URL parsing normalizes IPv4 hosts to dotted-decimal form.
      const isIpv4 = (host) => /^\d{1,3}(?:\.\d{1,3}){3}$/.test(host);
      if (isIpv4(baseUrl.hostname) || isIpv4(targetUrl.hostname)) {
        return baseUrl.hostname === targetUrl.hostname;
      }
      const getDomain = (host) => host.split(".").slice(-2).join(".");
      return getDomain(baseUrl.hostname) === getDomain(targetUrl.hostname);
    }
    default:
      return false;
  }
}
|
|
4225
4525
|
function isRegexPattern(pattern) {
|
|
4226
4526
|
return pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/");
|
|
@@ -4268,24 +4568,6 @@ function shouldIncludeUrl(url, includePatterns, excludePatterns) {
|
|
|
4268
4568
|
if (!includePatterns || includePatterns.length === 0) return true;
|
|
4269
4569
|
return matchesAnyPattern(normalizedPath, includePatterns) || (basename ? matchesAnyPattern(basename, stripSlash(includePatterns)) : false);
|
|
4270
4570
|
}
|
|
4271
|
-
function isInScope(baseUrl, targetUrl, scope) {
|
|
4272
|
-
if (baseUrl.protocol !== targetUrl.protocol) return false;
|
|
4273
|
-
switch (scope) {
|
|
4274
|
-
case "subpages": {
|
|
4275
|
-
if (baseUrl.hostname !== targetUrl.hostname) return false;
|
|
4276
|
-
const baseDir = baseUrl.pathname.endsWith("/") ? baseUrl.pathname : baseUrl.pathname.replace(/\/[^/]*$/, "/");
|
|
4277
|
-
return targetUrl.pathname.startsWith(baseDir);
|
|
4278
|
-
}
|
|
4279
|
-
case "hostname":
|
|
4280
|
-
return baseUrl.hostname === targetUrl.hostname;
|
|
4281
|
-
case "domain": {
|
|
4282
|
-
const getDomain = (host) => host.split(".").slice(-2).join(".");
|
|
4283
|
-
return getDomain(baseUrl.hostname) === getDomain(targetUrl.hostname);
|
|
4284
|
-
}
|
|
4285
|
-
default:
|
|
4286
|
-
return false;
|
|
4287
|
-
}
|
|
4288
|
-
}
|
|
4289
4571
|
const DEFAULT_MAX_DEPTH = 3;
|
|
4290
4572
|
const DEFAULT_CONCURRENCY = 3;
|
|
4291
4573
|
class BaseScraperStrategy {
|
|
@@ -4294,6 +4576,8 @@ class BaseScraperStrategy {
|
|
|
4294
4576
|
totalDiscovered = 0;
|
|
4295
4577
|
// Track total URLs discovered (unlimited)
|
|
4296
4578
|
effectiveTotal = 0;
|
|
4579
|
+
// Track effective total (limited by maxPages)
|
|
4580
|
+
canonicalBaseUrl;
|
|
4297
4581
|
options;
|
|
4298
4582
|
constructor(options = {}) {
|
|
4299
4583
|
this.options = options;
|
|
@@ -4305,7 +4589,7 @@ class BaseScraperStrategy {
|
|
|
4305
4589
|
shouldProcessUrl(url, options) {
|
|
4306
4590
|
if (options.scope) {
|
|
4307
4591
|
try {
|
|
4308
|
-
const base = new URL$1(options.url);
|
|
4592
|
+
const base = this.canonicalBaseUrl ?? new URL$1(options.url);
|
|
4309
4593
|
const target = new URL$1(url);
|
|
4310
4594
|
if (!isInScope(base, target, options.scope)) return false;
|
|
4311
4595
|
} catch {
|
|
@@ -4328,6 +4612,23 @@ class BaseScraperStrategy {
|
|
|
4328
4612
|
}
|
|
4329
4613
|
try {
|
|
4330
4614
|
const result = await this.processItem(item, options, void 0, signal);
|
|
4615
|
+
if (item.depth === 0 && !this.canonicalBaseUrl && result?.finalUrl) {
|
|
4616
|
+
try {
|
|
4617
|
+
const finalUrlStr = result.finalUrl;
|
|
4618
|
+
const original = new URL$1(options.url);
|
|
4619
|
+
const finalUrlObj = new URL$1(finalUrlStr);
|
|
4620
|
+
if (finalUrlObj.href !== original.href && (finalUrlObj.protocol === "http:" || finalUrlObj.protocol === "https:")) {
|
|
4621
|
+
this.canonicalBaseUrl = finalUrlObj;
|
|
4622
|
+
logger.debug(
|
|
4623
|
+
`Updated scope base after redirect: ${original.href} -> ${finalUrlObj.href}`
|
|
4624
|
+
);
|
|
4625
|
+
} else {
|
|
4626
|
+
this.canonicalBaseUrl = original;
|
|
4627
|
+
}
|
|
4628
|
+
} catch {
|
|
4629
|
+
this.canonicalBaseUrl = new URL$1(options.url);
|
|
4630
|
+
}
|
|
4631
|
+
}
|
|
4331
4632
|
if (result.document) {
|
|
4332
4633
|
this.pageCount++;
|
|
4333
4634
|
logger.info(
|
|
@@ -4388,7 +4689,8 @@ class BaseScraperStrategy {
|
|
|
4388
4689
|
this.pageCount = 0;
|
|
4389
4690
|
this.totalDiscovered = 1;
|
|
4390
4691
|
this.effectiveTotal = 1;
|
|
4391
|
-
|
|
4692
|
+
this.canonicalBaseUrl = new URL$1(options.url);
|
|
4693
|
+
let baseUrl = this.canonicalBaseUrl;
|
|
4392
4694
|
const queue = [{ url: options.url, depth: 0 }];
|
|
4393
4695
|
this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
|
|
4394
4696
|
const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
|
|
@@ -4409,6 +4711,7 @@ class BaseScraperStrategy {
|
|
|
4409
4711
|
queue.length
|
|
4410
4712
|
);
|
|
4411
4713
|
const batch = queue.splice(0, batchSize);
|
|
4714
|
+
baseUrl = this.canonicalBaseUrl ?? baseUrl;
|
|
4412
4715
|
const newUrls = await this.processBatch(
|
|
4413
4716
|
batch,
|
|
4414
4717
|
baseUrl,
|
|
@@ -4441,22 +4744,7 @@ class WebScraperStrategy extends BaseScraperStrategy {
|
|
|
4441
4744
|
return false;
|
|
4442
4745
|
}
|
|
4443
4746
|
}
|
|
4444
|
-
|
|
4445
|
-
* Determines if a target URL should be followed based on the scope setting.
|
|
4446
|
-
*/
|
|
4447
|
-
isInScope(baseUrl, targetUrl, scope) {
|
|
4448
|
-
try {
|
|
4449
|
-
if (scope === "domain") {
|
|
4450
|
-
return hasSameDomain(baseUrl, targetUrl);
|
|
4451
|
-
}
|
|
4452
|
-
if (scope === "hostname") {
|
|
4453
|
-
return hasSameHostname(baseUrl, targetUrl);
|
|
4454
|
-
}
|
|
4455
|
-
return hasSameHostname(baseUrl, targetUrl) && isSubpath(baseUrl, targetUrl);
|
|
4456
|
-
} catch {
|
|
4457
|
-
return false;
|
|
4458
|
-
}
|
|
4459
|
-
}
|
|
4747
|
+
// Removed custom isInScope logic; using shared scope utility for consistent behavior
|
|
4460
4748
|
/**
|
|
4461
4749
|
* Processes a single queue item by fetching its content and processing it through pipelines.
|
|
4462
4750
|
* @param item - The queue item to process.
|
|
@@ -4497,12 +4785,12 @@ class WebScraperStrategy extends BaseScraperStrategy {
|
|
|
4497
4785
|
);
|
|
4498
4786
|
return { document: void 0, links: processed.links };
|
|
4499
4787
|
}
|
|
4500
|
-
const baseUrl = new URL(options.url);
|
|
4788
|
+
const baseUrl = item.depth === 0 ? new URL(rawContent.source) : this.canonicalBaseUrl ?? new URL(options.url);
|
|
4501
4789
|
const filteredLinks = processed.links.filter((link) => {
|
|
4502
4790
|
try {
|
|
4503
4791
|
const targetUrl = new URL(link);
|
|
4504
4792
|
const scope = options.scope || "subpages";
|
|
4505
|
-
return
|
|
4793
|
+
return isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
|
|
4506
4794
|
} catch {
|
|
4507
4795
|
return false;
|
|
4508
4796
|
}
|
|
@@ -4518,7 +4806,8 @@ class WebScraperStrategy extends BaseScraperStrategy {
|
|
|
4518
4806
|
...processed.metadata
|
|
4519
4807
|
}
|
|
4520
4808
|
},
|
|
4521
|
-
links: filteredLinks
|
|
4809
|
+
links: filteredLinks,
|
|
4810
|
+
finalUrl: rawContent.source
|
|
4522
4811
|
};
|
|
4523
4812
|
} catch (error) {
|
|
4524
4813
|
logger.error(`❌ Failed processing page ${url}: ${error}`);
|
|
@@ -4901,7 +5190,9 @@ class PipelineManager {
|
|
|
4901
5190
|
*/
|
|
4902
5191
|
async recoverPendingJobs() {
|
|
4903
5192
|
try {
|
|
4904
|
-
const runningVersions = await this.store.
|
|
5193
|
+
const runningVersions = await this.store.getVersionsByStatus([
|
|
5194
|
+
VersionStatus.RUNNING
|
|
5195
|
+
]);
|
|
4905
5196
|
for (const version2 of runningVersions) {
|
|
4906
5197
|
await this.store.updateVersionStatus(version2.id, VersionStatus.QUEUED);
|
|
4907
5198
|
logger.info(
|
|
@@ -5056,25 +5347,25 @@ class PipelineManager {
|
|
|
5056
5347
|
async enqueueJobWithStoredOptions(library, version2) {
|
|
5057
5348
|
const normalizedVersion = version2 ?? "";
|
|
5058
5349
|
try {
|
|
5059
|
-
const versionId = await this.store.
|
|
5350
|
+
const versionId = await this.store.ensureVersion({
|
|
5060
5351
|
library,
|
|
5061
|
-
normalizedVersion
|
|
5062
|
-
);
|
|
5063
|
-
const
|
|
5064
|
-
if (!
|
|
5352
|
+
version: normalizedVersion
|
|
5353
|
+
});
|
|
5354
|
+
const stored = await this.store.getScraperOptions(versionId);
|
|
5355
|
+
if (!stored) {
|
|
5065
5356
|
throw new Error(
|
|
5066
5357
|
`No stored scraper options found for ${library}@${normalizedVersion || "unversioned"}`
|
|
5067
5358
|
);
|
|
5068
5359
|
}
|
|
5069
|
-
const storedOptions =
|
|
5360
|
+
const storedOptions = stored.options;
|
|
5070
5361
|
const completeOptions = {
|
|
5071
|
-
url:
|
|
5362
|
+
url: stored.sourceUrl,
|
|
5072
5363
|
library,
|
|
5073
5364
|
version: normalizedVersion,
|
|
5074
5365
|
...storedOptions
|
|
5075
5366
|
};
|
|
5076
5367
|
logger.info(
|
|
5077
|
-
`🔄 Re-indexing ${library}@${normalizedVersion || "unversioned"} with stored options from ${
|
|
5368
|
+
`🔄 Re-indexing ${library}@${normalizedVersion || "unversioned"} with stored options from ${stored.sourceUrl}`
|
|
5078
5369
|
);
|
|
5079
5370
|
return this.enqueueJob(library, normalizedVersion, completeOptions);
|
|
5080
5371
|
} catch (error) {
|
|
@@ -5351,2130 +5642,123 @@ var PipelineFactory;
|
|
|
5351
5642
|
logger.debug(`Creating PipelineClient for external worker at: ${serverUrl}`);
|
|
5352
5643
|
return new PipelineClient(serverUrl);
|
|
5353
5644
|
}
|
|
5354
|
-
return new PipelineManager(docService, concurrency, {
|
|
5645
|
+
return new PipelineManager(docService, concurrency, {
|
|
5646
|
+
recoverJobs
|
|
5647
|
+
});
|
|
5355
5648
|
}
|
|
5356
5649
|
PipelineFactory2.createPipeline = createPipeline;
|
|
5357
5650
|
})(PipelineFactory || (PipelineFactory = {}));
|
|
5358
|
-
|
|
5359
|
-
|
|
5360
|
-
|
|
5361
|
-
|
|
5362
|
-
|
|
5363
|
-
`Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
|
|
5651
|
+
/**
 * Makes sure a Chromium executable is available for Playwright.
 *
 * Order of checks:
 * 1. If PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH names an existing file, nothing
 *    is installed.
 * 2. Otherwise, if `chromium.executablePath()` yields an existing file,
 *    nothing is installed. Any error raised while probing counts as
 *    "not found".
 * 3. Otherwise Chromium is installed synchronously via `npm exec` with output
 *    suppressed, run from the project root. If that command fails, a hint is
 *    printed to stderr and the process exits with code 1.
 */
function ensurePlaywrightBrowsersInstalled() {
  const envExecutable = process.env.PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH;
  if (envExecutable && existsSync(envExecutable)) {
    logger.debug(
      `PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH is set to '${envExecutable}', skipping Playwright browser install.`
    );
    return;
  }

  let browserAvailable = false;
  try {
    const bundledExecutable = chromium.executablePath();
    browserAvailable = Boolean(bundledExecutable) && existsSync(bundledExecutable);
  } catch {
    // Probing failed; treat the browser as missing and install it below.
    browserAvailable = false;
  }
  if (browserAvailable) {
    return;
  }

  logger.debug(
    "Playwright browsers not found. Installing Chromium browser for dynamic scraping (this may take a minute)..."
  );
  try {
    logger.debug("Installing Playwright Chromium browser...");
    const installOptions = {
      // Suppress output
      stdio: "ignore",
      cwd: getProjectRoot()
    };
    execSync("npm exec -y playwright install --no-shell --with-deps chromium", installOptions);
  } catch {
    console.error(
      "❌ Failed to install Playwright browsers automatically. Please run:\n npx playwright install --no-shell --with-deps chromium\nand try again."
    );
    process.exit(1);
  }
}
|
|
5504
|
-
|
|
5505
|
-
|
|
5506
|
-
|
|
5507
|
-
|
|
5508
|
-
constructor(options) {
|
|
5509
|
-
this.options = options;
|
|
5510
|
-
}
|
|
5511
|
-
async split(content) {
|
|
5512
|
-
const language = content.match(/^```(\w+)\n/)?.[1];
|
|
5513
|
-
const strippedContent = content.replace(/^```(\w*)\n/, "").replace(/```\s*$/, "");
|
|
5514
|
-
const lines = strippedContent.split("\n");
|
|
5515
|
-
const chunks = [];
|
|
5516
|
-
let currentChunkLines = [];
|
|
5517
|
-
for (const line of lines) {
|
|
5518
|
-
const singleLineSize = this.wrap(line, language).length;
|
|
5519
|
-
if (singleLineSize > this.options.chunkSize) {
|
|
5520
|
-
throw new MinimumChunkSizeError(singleLineSize, this.options.chunkSize);
|
|
5521
|
-
}
|
|
5522
|
-
currentChunkLines.push(line);
|
|
5523
|
-
const newChunkContent = this.wrap(currentChunkLines.join("\n"), language);
|
|
5524
|
-
const newChunkSize = newChunkContent.length;
|
|
5525
|
-
if (newChunkSize > this.options.chunkSize && currentChunkLines.length > 1) {
|
|
5526
|
-
const lastLine = currentChunkLines.pop();
|
|
5527
|
-
chunks.push(this.wrap(currentChunkLines.join("\n"), language));
|
|
5528
|
-
currentChunkLines = [lastLine];
|
|
5529
|
-
}
|
|
5530
|
-
}
|
|
5531
|
-
if (currentChunkLines.length > 0) {
|
|
5532
|
-
chunks.push(this.wrap(currentChunkLines.join("\n"), language));
|
|
5683
|
+
/**
 * Resolves the transport protocol to use.
 *
 * "auto" picks "stdio" only when neither stdin nor stdout is a TTY, and
 * "http" otherwise. "stdio" and "http" are returned as given.
 *
 * @param {string} protocol - "auto", "stdio", or "http".
 * @returns {"stdio" | "http"} The concrete protocol.
 * @throws {Error} For any other value.
 */
function resolveProtocol(protocol) {
  switch (protocol) {
    case "auto": {
      const attachedToTerminal = process.stdin.isTTY || process.stdout.isTTY;
      return attachedToTerminal ? "http" : "stdio";
    }
    case "stdio":
    case "http":
      return protocol;
    default:
      throw new Error(`Invalid protocol: ${protocol}. Must be 'auto', 'stdio', or 'http'`);
  }
}
|
|
5542
|
-
|
|
5543
|
-
|
|
5544
|
-
|
|
5545
|
-
|
|
5546
|
-
|
|
5547
|
-
|
|
5548
|
-
*/
|
|
5549
|
-
async split(content) {
|
|
5550
|
-
const parsedTable = this.parseTable(content);
|
|
5551
|
-
if (!parsedTable) {
|
|
5552
|
-
return [content];
|
|
5553
|
-
}
|
|
5554
|
-
const { headers, rows } = parsedTable;
|
|
5555
|
-
const chunks = [];
|
|
5556
|
-
let currentRows = [];
|
|
5557
|
-
for (const row of rows) {
|
|
5558
|
-
const singleRowSize = this.wrap(row, headers).length;
|
|
5559
|
-
if (singleRowSize > this.options.chunkSize) {
|
|
5560
|
-
throw new MinimumChunkSizeError(singleRowSize, this.options.chunkSize);
|
|
5561
|
-
}
|
|
5562
|
-
const newChunkContent = this.wrap([...currentRows, row].join("\n"), headers);
|
|
5563
|
-
const newChunkSize = newChunkContent.length;
|
|
5564
|
-
if (newChunkSize > this.options.chunkSize && currentRows.length > 0) {
|
|
5565
|
-
chunks.push(this.wrap(currentRows.join("\n"), headers));
|
|
5566
|
-
currentRows = [row];
|
|
5567
|
-
} else {
|
|
5568
|
-
currentRows.push(row);
|
|
5569
|
-
}
|
|
5570
|
-
}
|
|
5571
|
-
if (currentRows.length > 0) {
|
|
5572
|
-
chunks.push(this.wrap(currentRows.join("\n"), headers));
|
|
5573
|
-
}
|
|
5574
|
-
return chunks;
|
|
5575
|
-
}
|
|
5576
|
-
wrap(content, headers) {
|
|
5577
|
-
const headerRow = `| ${headers.join(" | ")} |`;
|
|
5578
|
-
const separatorRow = `|${headers.map(() => "---").join("|")}|`;
|
|
5579
|
-
return [headerRow, separatorRow, content].join("\n");
|
|
5580
|
-
}
|
|
5581
|
-
parseTable(content) {
|
|
5582
|
-
const lines = content.trim().split("\n");
|
|
5583
|
-
if (lines.length < 3) return null;
|
|
5584
|
-
const headers = this.parseRow(lines[0]);
|
|
5585
|
-
if (!headers) return null;
|
|
5586
|
-
const separator = lines[1];
|
|
5587
|
-
if (!this.isValidSeparator(separator)) return null;
|
|
5588
|
-
const rows = lines.slice(2).filter((row) => row.trim() !== "");
|
|
5589
|
-
return { headers, separator, rows };
|
|
5590
|
-
}
|
|
5591
|
-
/**
|
|
5592
|
-
* Parses a table row into cells
|
|
5593
|
-
*/
|
|
5594
|
-
parseRow(row) {
|
|
5595
|
-
if (!row.includes("|")) return null;
|
|
5596
|
-
return row.split("|").map((cell) => cell.trim()).filter((cell) => cell !== "");
|
|
5695
|
+
/** Serializes `data` as pretty-printed JSON with a two-space indent. */
const formatOutput = (data) => {
  const indentWidth = 2;
  return JSON.stringify(data, null, indentWidth);
};
|
|
5696
|
+
/**
 * Applies the log level implied by CLI flags and the chosen protocol.
 *
 * `options.silent` selects ERROR and takes precedence over `options.verbose`,
 * which selects DEBUG. With neither flag the level is left untouched at this
 * step. Independently of the flags, the "stdio" protocol then forces ERROR.
 *
 * @param {{ silent?: boolean, verbose?: boolean }} options - Parsed CLI flags.
 * @param {string} protocol - Resolved protocol name.
 */
function setupLogging(options, protocol) {
  const silent = Boolean(options.silent);
  if (silent || options.verbose) {
    setLogLevel(silent ? LogLevel.ERROR : LogLevel.DEBUG);
  }
  const usesStdio = protocol === "stdio";
  if (usesStdio) {
    setLogLevel(LogLevel.ERROR);
  }
}
|
|
5605
|
-
|
|
5606
|
-
|
|
5607
|
-
|
|
5706
|
+
/**
 * Parses and validates a TCP port number.
 *
 * The input must consist solely of decimal digits (surrounding whitespace is
 * ignored) and denote a value in 1..65535. `Number.parseInt` alone would
 * accept any numeric prefix, so inputs such as "8080abc" or "80.5" are
 * rejected explicitly rather than being truncated to a valid-looking port.
 *
 * @param {string | number} portString - Port as text (a number is also accepted).
 * @returns {number} The port as an integer.
 * @throws {Error} When the input is not a whole number in the valid range.
 */
function validatePort(portString) {
  const text = String(portString).trim();
  const port = /^\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
  if (Number.isNaN(port) || port < 1 || port > 65535) {
    throw new Error("❌ Invalid port number");
  }
  return port;
}
|
|
5713
|
+
/**
 * Creates a pipeline through `PipelineFactory.createPipeline` and attaches
 * logging callbacks to it.
 *
 * When `options.serverUrl` is set, the factory is called without a document
 * service and with the full options (including `serverUrl`). Otherwise a
 * `docService` is required and the factory receives the options without
 * `serverUrl`.
 *
 * @param {object | undefined} docService - Document service for the local case.
 * @param {object} [options] - Pipeline options; may include `serverUrl`.
 * @returns {Promise<object>} The pipeline with callbacks installed.
 * @throws {Error} When no `serverUrl` is given and `docService` is missing.
 */
async function createPipelineWithCallbacks(docService, options = {}) {
  logger.debug(`Initializing pipeline with options: ${JSON.stringify(options)}`);
  const { serverUrl, ...localOptions } = options;

  let pipeline;
  if (serverUrl) {
    pipeline = await PipelineFactory.createPipeline(void 0, { serverUrl, ...localOptions });
  } else {
    if (!docService) {
      throw new Error("Local pipeline requires a DocumentManagementService instance");
    }
    pipeline = await PipelineFactory.createPipeline(docService, localOptions);
  }

  // The callbacks only log; they do not alter job state.
  const loggingCallbacks = {
    onJobProgress: async (job, progress) => {
      logger.debug(
        `📊 Job ${job.id} progress: ${progress.pagesScraped}/${progress.totalPages} pages`
      );
    },
    onJobStatusChange: async (job) => {
      logger.debug(`🔄 Job ${job.id} status changed to: ${job.status}`);
    },
    onJobError: async (job, error, document) => {
      logger.warn(
        `⚠️ Job ${job.id} error ${document ? `on document ${document.metadata.url}` : ""}: ${error.message}`
      );
    }
  };
  pipeline.setCallbacks(loggingCallbacks);
  return pipeline;
}
|
|
5739
|
+
/**
 * Builds the app-server configuration object from caller-supplied options.
 *
 * The four feature flags fall back to a default only when the option is
 * `null` or `undefined`; an explicit `false` is kept. `port` and
 * `externalWorkerUrl` are copied through unchanged.
 *
 * @param {object} options - Source options (must not be null/undefined).
 * @returns {object} Config with `enableWebInterface`, `enableMcpServer`,
 *   `enableApiServer`, `enableWorker`, `port` and `externalWorkerUrl`.
 */
function createAppServerConfig(options) {
  const {
    enableWebInterface: webInterface,
    enableMcpServer: mcpServer,
    enableApiServer: apiServer,
    enableWorker: worker,
    port,
    externalWorkerUrl
  } = options;

  return {
    // Off unless requested.
    enableWebInterface: webInterface ?? false,
    // On unless explicitly disabled.
    enableMcpServer: mcpServer ?? true,
    // Off unless requested.
    enableApiServer: apiServer ?? false,
    // On unless explicitly disabled.
    enableWorker: worker ?? true,
    port,
    externalWorkerUrl
  };
}
|
|
5749
|
+
function parseHeaders(headerOptions) {
|
|
5750
|
+
const headers = {};
|
|
5751
|
+
if (Array.isArray(headerOptions)) {
|
|
5752
|
+
for (const entry of headerOptions) {
|
|
5753
|
+
const idx = entry.indexOf(":");
|
|
5754
|
+
if (idx > 0) {
|
|
5755
|
+
const name = entry.slice(0, idx).trim();
|
|
5756
|
+
const value = entry.slice(idx + 1).trim();
|
|
5757
|
+
if (name) headers[name] = value;
|
|
5686
5758
|
}
|
|
5687
5759
|
}
|
|
5688
|
-
if (currentChunk) {
|
|
5689
|
-
mergedChunks.push(currentChunk);
|
|
5690
|
-
}
|
|
5691
|
-
return mergedChunks;
|
|
5692
|
-
}
|
|
5693
|
-
getChunkSize(chunk) {
|
|
5694
|
-
return chunk.length;
|
|
5695
|
-
}
|
|
5696
|
-
/**
 * Returns the content unchanged; this splitter adds no wrapper around a chunk.
 */
wrap(content) {
  return content;
}
|
|
5699
|
-
|
|
5700
|
-
/**
 * Splits markdown into chunks that follow the document's own structure.
 *
 * The markdown is rendered to HTML, the direct children of <body> are turned
 * into sections (one per heading, code block, table, or non-empty text
 * element), and each section's content is handed to the splitter matching its
 * type. Text and headings use `preferredChunkSize`; code and tables use
 * `maxChunkSize`.
 */
class SemanticMarkdownSplitter {
  // HTML -> markdown converter (with a custom table rule).
  turndownService;
  // Splitter for heading and text content.
  textSplitter;
  // Splitter for fenced code blocks.
  codeSplitter;
  // Splitter for markdown tables.
  tableSplitter;

  constructor(preferredChunkSize, maxChunkSize) {
    this.preferredChunkSize = preferredChunkSize;
    this.maxChunkSize = maxChunkSize;

    const turndown = new TurndownService({
      headingStyle: "atx",
      hr: "---",
      bulletListMarker: "-",
      codeBlockStyle: "fenced",
      emDelimiter: "_",
      strongDelimiter: "**",
      linkStyle: "inlined"
    });
    const cellText = (cell) => cell.textContent?.trim() || "";
    // Render <table> elements as pipe tables built from their th/td text.
    turndown.addRule("table", {
      filter: ["table"],
      replacement: (_content, node) => {
        const headerCells = Array.from(node.querySelectorAll("th"), (th) => cellText(th));
        // Rows that contain a <th> are header rows and are not repeated as body rows.
        const bodyRows = Array.from(node.querySelectorAll("tr")).filter(
          (tr) => !tr.querySelector("th")
        );
        if (headerCells.length === 0 && bodyRows.length === 0) {
          return "";
        }
        const pieces = ["\n"];
        if (headerCells.length > 0) {
          pieces.push(`| ${headerCells.join(" | ")} |\n`);
          pieces.push(`|${headerCells.map(() => "---").join("|")}|\n`);
        }
        for (const row of bodyRows) {
          const cells = Array.from(row.querySelectorAll("td"), (td) => cellText(td));
          pieces.push(`| ${cells.join(" | ")} |\n`);
        }
        return pieces.join("");
      }
    });
    this.turndownService = turndown;

    this.textSplitter = new TextContentSplitter({ chunkSize: this.preferredChunkSize });
    this.codeSplitter = new CodeContentSplitter({ chunkSize: this.maxChunkSize });
    this.tableSplitter = new TableContentSplitter({ chunkSize: this.maxChunkSize });
  }

  /**
   * Main entry point: markdown -> HTML -> DOM -> sections -> chunks.
   */
  async splitText(markdown) {
    const dom = await this.parseHtml(await this.markdownToHtml(markdown));
    const sections = await this.splitIntoSections(dom);
    return this.splitSectionContent(sections);
  }

  /**
   * Step 1: walk the direct children of <body> and emit one section per
   * heading (H1-H6), <pre> block, <table>, or other element with non-empty
   * markdown. Non-heading sections inherit level and path from the section
   * that precedes them.
   */
  async splitIntoSections(dom) {
    const body = dom.querySelector("body");
    if (!body) {
      throw new Error("Invalid HTML structure: no body element found");
    }

    const sections = [];
    let currentSection = this.createRootSection();
    // Open headings, root first; used to build each heading's path.
    const stack = [currentSection];

    // Adds a non-heading section that shares the current level and path.
    const appendLeaf = (type, text) => {
      currentSection = {
        level: currentSection.level,
        path: currentSection.path,
        content: [{ type, text }]
      };
      sections.push(currentSection);
    };

    for (const element of Array.from(body.children)) {
      const headingMatch = element.tagName.match(/H([1-6])/);

      if (headingMatch) {
        const level = Number.parseInt(headingMatch[1], 10);
        const title = fullTrim(element.textContent || "");
        // Close every open heading at the same or a deeper level.
        while (stack.length > 1 && stack[stack.length - 1].level >= level) {
          stack.pop();
        }
        const ancestors = [];
        for (const open of stack.slice(1)) {
          const lastTitle = open.path[open.path.length - 1];
          if (lastTitle) {
            ancestors.push(lastTitle);
          }
        }
        currentSection = {
          level,
          path: [...ancestors, title],
          content: [{ type: "heading", text: `${"#".repeat(level)} ${title}` }]
        };
        sections.push(currentSection);
        stack.push(currentSection);
        continue;
      }

      if (element.tagName === "PRE") {
        const code = element.querySelector("code");
        const language = code?.className.replace("language-", "") || "";
        const source = code?.textContent || element.textContent || "";
        const fence = "```";
        appendLeaf("code", `${fence}${language}\n${source}\n${fence}`);
        continue;
      }

      if (element.tagName === "TABLE") {
        appendLeaf("table", fullTrim(this.turndownService.turndown(element.outerHTML)));
        continue;
      }

      const markdown = fullTrim(this.turndownService.turndown(element.innerHTML));
      if (markdown) {
        appendLeaf("text", markdown);
      }
    }

    return sections;
  }

  /**
   * Step 2: split every content item with the splitter for its type and tag
   * each resulting chunk with the section's level and path.
   *
   * A MinimumChunkSizeError from a splitter triggers a fallback to
   * RecursiveCharacterTextSplitter; any other error is rethrown as a
   * ContentSplitterError.
   */
  async splitSectionContent(sections) {
    const splitterByType = new Map([
      ["heading", this.textSplitter],
      ["text", this.textSplitter],
      ["code", this.codeSplitter],
      ["table", this.tableSplitter]
    ]);
    const chunks = [];

    for (const section of sections) {
      for (const content of section.content) {
        let pieces = [];
        try {
          if (splitterByType.has(content.type)) {
            pieces = await splitterByType.get(content.type).split(content.text);
          }
        } catch (err) {
          if (!(err instanceof MinimumChunkSizeError)) {
            const reason = err instanceof Error ? err.message : String(err);
            throw new ContentSplitterError(
              `Failed to split ${content.type} content: ${reason}`
            );
          }
          logger.warn(
            `⚠ Cannot split ${content.type} chunk normally, using RecursiveCharacterTextSplitter: ${err.message}`
          );
          const fallback = new RecursiveCharacterTextSplitter({
            chunkSize: this.maxChunkSize,
            chunkOverlap: Math.min(20, Math.floor(this.maxChunkSize * 0.1)),
            // Use more aggressive separators including empty string as last resort
            separators: [
              "\n\n",
              "\n",
              " ",
              // NOTE(review): this entry renders as whitespace identical to the
              // previous one in the captured text; written here as a tab on the
              // assumption that it was one upstream - confirm.
              "\t",
              ".",
              ",",
              ";",
              ":",
              "-",
              "(",
              ")",
              "[",
              "]",
              "{",
              "}",
              ""
            ]
          });
          const fallbackPieces = await fallback.splitText(content.text);
          // If even the fallback yields nothing, keep the leading maxChunkSize characters.
          pieces =
            fallbackPieces.length === 0
              ? [content.text.substring(0, this.maxChunkSize)]
              : fallbackPieces;
        }

        for (const text of pieces) {
          chunks.push({
            types: [content.type],
            content: text,
            section: {
              level: section.level,
              path: section.path
            }
          });
        }
      }
    }

    return chunks;
  }

  /**
   * Helper to create the root section (level 0, empty path, no content).
   */
  createRootSection() {
    return { level: 0, path: [], content: [] };
  }

  /**
   * Convert markdown to HTML using remark and wrap it in a minimal document.
   */
  async markdownToHtml(markdown) {
    const rendered = await unified()
      .use(remarkParse)
      .use(remarkGfm)
      .use(remarkHtml)
      .process(markdown);
    return ["<!DOCTYPE html>", "<html>", "<body>", String(rendered), "</body>", "</html>"].join(
      "\n"
    );
  }

  /**
   * Parse HTML into a DOM document.
   */
  async parseHtml(html) {
    return createJSDOM(html).window.document;
  }
}
|
|
5964
|
-
/** Default maximum number of child chunks collected around a search hit. */
const CHILD_LIMIT = 5;
/** Default maximum number of sibling chunks collected on each side of a search hit. */
const SIBLING_LIMIT = 2;

/**
 * Runs a content search against a document store and expands every hit with
 * its surrounding chunks (parent, preceding/subsequent siblings, children),
 * merging the result into one entry per URL.
 */
class DocumentRetrieverService {
  documentStore;

  /**
   * @param documentStore - Store providing `findByContent`, `findParentChunk`,
   *   `findPrecedingSiblingChunks`, `findChildChunks`,
   *   `findSubsequentSiblingChunks` and `findChunksByIds`.
   */
  constructor(documentStore) {
    this.documentStore = documentStore;
  }

  /**
   * Collects the IDs of all chunks related to one initial hit.
   *
   * The four lookups do not depend on each other, so they are issued together
   * instead of one after another (`search()` already runs this method
   * concurrently for every hit). IDs are still added in the order: hit,
   * parent, preceding siblings, children, subsequent siblings.
   *
   * @param library - Library name.
   * @param version2 - Library version.
   * @param doc - Search hit; `id`, `metadata.url` and `metadata.score` are read.
   * @param siblingLimit - Maximum siblings fetched on each side.
   * @param childLimit - Maximum children fetched.
   * @returns `{ url, hitId, relatedIds, score }` where `relatedIds` is a Set.
   */
  async getRelatedChunkIds(library, version2, doc, siblingLimit = SIBLING_LIMIT, childLimit = CHILD_LIMIT) {
    const { id } = doc;
    const { url, score } = doc.metadata;
    const store = this.documentStore;

    const [parent, precedingSiblings, childChunks, subsequentSiblings] = await Promise.all([
      store.findParentChunk(library, version2, id),
      store.findPrecedingSiblingChunks(library, version2, id, siblingLimit),
      store.findChildChunks(library, version2, id, childLimit),
      store.findSubsequentSiblingChunks(library, version2, id, siblingLimit)
    ]);

    const relatedIds = /* @__PURE__ */ new Set([id]);
    if (parent) {
      relatedIds.add(parent.id);
    }
    for (const chunk of [...precedingSiblings, ...childChunks, ...subsequentSiblings]) {
      relatedIds.add(chunk.id);
    }
    return { url, hitId: id, relatedIds, score };
  }

  /**
   * Groups related-chunk info by URL, de-duplicating chunk IDs and keeping the
   * highest score seen for each URL.
   *
   * @param relatedInfos - Results of `getRelatedChunkIds`.
   * @returns Map of url -> `{ uniqueChunkIds: Set, maxScore }`, in first-seen URL order.
   */
  groupAndPrepareFetch(relatedInfos) {
    const urlMap = /* @__PURE__ */ new Map();
    for (const info of relatedInfos) {
      let entry = urlMap.get(info.url);
      if (!entry) {
        entry = { uniqueChunkIds: /* @__PURE__ */ new Set(), maxScore: info.score };
        urlMap.set(info.url, entry);
      }
      for (const id of info.relatedIds) {
        entry.uniqueChunkIds.add(id);
      }
      if (info.score > entry.maxScore) {
        entry.maxScore = info.score;
      }
    }
    return urlMap;
  }

  /**
   * Builds the merged result for one URL: fetches the chunks by ID and joins
   * their `pageContent` with blank lines. No sorting is done here; the chunks
   * appear in whatever order `findChunksByIds` returns them.
   *
   * @returns `{ url, content, score }` with `score` set to `maxScore`.
   */
  async finalizeResult(library, version2, url, uniqueChunkIds, maxScore) {
    const ids = Array.from(uniqueChunkIds);
    const docs = await this.documentStore.findChunksByIds(library, version2, ids);
    const content = docs.map((d) => d.pageContent).join("\n\n");
    return {
      url,
      content,
      score: maxScore
    };
  }

  /**
   * Searches for documents and expands the context around the matches.
   *
   * @param library - The library name.
   * @param version2 - The library version; null/undefined is treated as "",
   *   and the value is lower-cased before it is passed to the store.
   * @param query - The search query.
   * @param limit - Optional limit for the initial search results (default 10).
   * @returns One `{ url, content, score }` object per distinct URL, in the
   *   order the URLs first appear among the hits.
   */
  async search(library, version2, query, limit) {
    const normalizedVersion = (version2 ?? "").toLowerCase();
    const initialResults = await this.documentStore.findByContent(
      library,
      normalizedVersion,
      query,
      limit ?? 10
    );
    const relatedInfos = await Promise.all(
      initialResults.map(
        (doc) => this.getRelatedChunkIds(library, normalizedVersion, doc)
      )
    );
    const urlMap = this.groupAndPrepareFetch(relatedInfos);
    // The per-URL fetches are independent; Promise.all keeps the Map's order.
    return Promise.all(
      Array.from(
        urlMap,
        ([url, { uniqueChunkIds, maxScore }]) => this.finalizeResult(library, normalizedVersion, url, uniqueChunkIds, maxScore)
      )
    );
  }
}
|
|
6085
|
-
/**
 * Base error for store failures.
 *
 * When a truthy `cause` is given, the message becomes
 * "<message> caused by <cause>" and the error's stack is replaced by the
 * cause's stack (a non-Error cause is wrapped in a new Error first).
 * `name` is set to the name of the constructing class, so subclasses report
 * their own name.
 */
class StoreError extends Error {
  /**
   * @param message - Description of the failure.
   * @param cause - Optional underlying error or value; stored on `this.cause`.
   */
  constructor(message, cause) {
    const text = cause ? `${message} caused by ${cause}` : message;
    super(text);
    this.cause = cause;
    this.name = this.constructor.name;

    if (!cause) {
      return;
    }
    const origin = cause instanceof Error ? cause : new Error(String(cause));
    if (origin.stack) {
      // Report where the underlying problem happened rather than where it was wrapped.
      this.stack = origin.stack;
    }
  }
}
|
|
6096
|
-
/**
 * Raised when an embedding model's vector dimension is larger than the fixed
 * dimension of the database. The three constructor arguments are kept on the
 * instance under the same names.
 */
class DimensionError extends StoreError {
  /**
   * @param modelName - Name of the embedding model.
   * @param modelDimension - Dimension of the vectors the model produces.
   * @param dbDimension - Fixed vector dimension of the database.
   */
  constructor(modelName, modelDimension, dbDimension) {
    const problem = `Model "${modelName}" produces ${modelDimension}-dimensional vectors, which exceeds the database's fixed dimension of ${dbDimension}.`;
    const advice = `Please use a model with dimension ≤ ${dbDimension}.`;
    super(`${problem} ${advice}`);
    Object.assign(this, { modelName, modelDimension, dbDimension });
  }
}
|
|
6106
|
-
/**
 * StoreError subtype with no members of its own; it exists so callers can
 * tell it apart by type/name.
 * NOTE(review): presumably thrown for database connection failures - the
 * throw sites are not visible here, confirm.
 */
class ConnectionError extends StoreError {
}
|
|
6108
|
-
// Directory scanned for *.sql migration files: <project root>/db/migrations.
const MIGRATIONS_DIR = path.join(getProjectRoot(), "db", "migrations");
// Bookkeeping table that stores the file name of every applied migration.
const MIGRATIONS_TABLE = "_schema_migrations";
|
|
6110
|
-
/**
 * Creates the migrations bookkeeping table if it does not exist yet.
 * The table has an `id` text primary key and an `applied_at` timestamp that
 * defaults to the current time.
 *
 * @param db - Database handle exposing `exec(sql)`.
 */
function ensureMigrationsTable(db) {
  const createTableSql = [
    "",
    `CREATE TABLE IF NOT EXISTS ${MIGRATIONS_TABLE} (`,
    "id TEXT PRIMARY KEY,",
    "applied_at DATETIME DEFAULT CURRENT_TIMESTAMP",
    ");",
    ""
  ].join("\n");
  db.exec(createTableSql);
}
|
|
6118
|
-
/**
 * Reads the IDs of all migrations recorded in the bookkeeping table.
 *
 * @param db - Database handle exposing `prepare(sql)`, whose statement has `all()`.
 * @returns {Set} The `id` value of every row in the table.
 */
function getAppliedMigrations(db) {
  const recordedRows = db.prepare(`SELECT id FROM ${MIGRATIONS_TABLE}`).all();
  const appliedIds = new Set();
  for (const row of recordedRows) {
    appliedIds.add(row.id);
  }
  return appliedIds;
}
|
|
6123
|
-
/**
 * Applies all pending *.sql migrations from MIGRATIONS_DIR inside a single
 * immediate transaction, then switches the connection to its production
 * settings (WAL, foreign keys, busy timeout, ...).
 *
 * Flow:
 *  1. Best-effort speed pragmas for the duration of the migration.
 *  2. One transaction that creates the bookkeeping table, determines the
 *     pending files (sorted by name), executes each one and records it.
 *  3. Retry of the whole transaction on SQLITE_BUSY, up to
 *     MIGRATION_MAX_RETRIES times with MIGRATION_RETRY_DELAY_MS between tries.
 *  4. VACUUM if at least one migration was applied (failure is only logged).
 *  5. Production pragmas - applied whether or not the migration succeeded.
 *
 * @param db - Database handle exposing `pragma`, `exec`, `prepare` and
 *   `transaction` (whose result has `.immediate()`).
 * @returns {Promise<void>}
 * @throws {StoreError} If the migrations directory is missing, a migration
 *   fails, or the transaction cannot be completed.
 */
async function applyMigrations(db) {
  try {
    // Keep the rollback journal in memory instead of turning it off:
    // the migrations below run inside a transaction that is rolled back when
    // one of them throws, and SQLite leaves ROLLBACK undefined under
    // journal_mode = OFF. MEMORY still avoids journal disk I/O.
    db.pragma("journal_mode = MEMORY");
    db.pragma("synchronous = OFF");
    db.pragma("mmap_size = 268435456");
    db.pragma("cache_size = -64000");
    db.pragma("temp_store = MEMORY");
    logger.debug("Applied performance optimizations for migration");
  } catch (_error) {
    logger.warn("⚠️ Could not apply all performance optimizations for migration");
  }

  const overallTransaction = db.transaction(() => {
    logger.debug("Checking database migrations...");
    ensureMigrationsTable(db);
    const appliedMigrations = getAppliedMigrations(db);
    if (!fs$1.existsSync(MIGRATIONS_DIR)) {
      throw new StoreError("Migrations directory not found");
    }
    // File names double as migration IDs; lexical order is the apply order.
    const migrationFiles = fs$1.readdirSync(MIGRATIONS_DIR).filter((file) => file.endsWith(".sql")).sort();
    const pendingMigrations = migrationFiles.filter(
      (filename) => !appliedMigrations.has(filename)
    );
    if (pendingMigrations.length > 0) {
      logger.info(`🔄 Applying ${pendingMigrations.length} database migration(s)...`);
    }
    let appliedCount = 0;
    for (const filename of pendingMigrations) {
      logger.debug(`Applying migration: ${filename}`);
      const filePath = path.join(MIGRATIONS_DIR, filename);
      const sql = fs$1.readFileSync(filePath, "utf8");
      try {
        db.exec(sql);
        const insertStmt = db.prepare(`INSERT INTO ${MIGRATIONS_TABLE} (id) VALUES (?)`);
        insertStmt.run(filename);
        logger.debug(`✅ Applied migration: ${filename}`);
        appliedCount++;
      } catch (error) {
        logger.error(`❌ Failed to apply migration: ${filename} - ${error}`);
        // Throwing aborts the surrounding transaction.
        throw new StoreError(`Migration failed: ${filename}`, error);
      }
    }
    if (appliedCount > 0) {
      logger.info(`✅ Successfully applied ${appliedCount} migration(s)`);
    } else {
      logger.debug("Database schema is up to date");
    }
    return appliedCount;
  });

  try {
    let retries = 0;
    while (true) {
      try {
        const appliedMigrationsCount = overallTransaction.immediate();
        logger.debug("Database migrations completed successfully");
        if (appliedMigrationsCount > 0) {
          try {
            logger.debug(
              `Running VACUUM after applying ${appliedMigrationsCount} migration(s)...`
            );
            db.exec("VACUUM");
            logger.debug("Database vacuum completed successfully");
          } catch (error) {
            logger.warn(`⚠️ Could not vacuum database after migrations: ${error}`);
          }
        } else {
          logger.debug("Skipping VACUUM - no migrations were applied");
        }
        break;
      } catch (error) {
        const isBusy = error?.code === "SQLITE_BUSY";
        if (isBusy && retries < MIGRATION_MAX_RETRIES) {
          retries++;
          logger.warn(
            `⚠️ Migrations busy (SQLITE_BUSY), retrying attempt ${retries}/${MIGRATION_MAX_RETRIES} in ${MIGRATION_RETRY_DELAY_MS}ms...`
          );
          await new Promise((resolve) => setTimeout(resolve, MIGRATION_RETRY_DELAY_MS));
          continue;
        }
        if (isBusy) {
          logger.error(
            `❌ Migrations still busy after ${MIGRATION_MAX_RETRIES} retries. Giving up: ${error}`
          );
        }
        if (error instanceof StoreError) {
          throw error;
        }
        throw new StoreError("Failed during migration process", error);
      }
    }
  } finally {
    // Runs on failure too, so the connection never stays on the relaxed
    // migration-time settings (synchronous = OFF, in-memory journal).
    try {
      db.pragma("journal_mode = WAL");
      db.pragma("wal_autocheckpoint = 1000");
      db.pragma("busy_timeout = 30000");
      db.pragma("foreign_keys = ON");
      db.pragma("synchronous = NORMAL");
      logger.debug(
        "Applied production database configuration (WAL mode, autocheckpoint, foreign keys, busy timeout)"
      );
    } catch (_error) {
      logger.warn("⚠️ Could not apply all production database settings");
    }
  }
}
|
|
6224
|
-
class DocumentStore {
|
|
6225
|
-
db;
|
|
6226
|
-
embeddings;
|
|
6227
|
-
dbDimension = VECTOR_DIMENSION;
|
|
6228
|
-
modelDimension;
|
|
6229
|
-
statements;
|
|
6230
|
-
/**
|
|
6231
|
-
* Calculates Reciprocal Rank Fusion score for a result
|
|
6232
|
-
*/
|
|
6233
|
-
calculateRRF(vecRank, ftsRank, k = 60) {
|
|
6234
|
-
let rrf = 0;
|
|
6235
|
-
if (vecRank !== void 0) {
|
|
6236
|
-
rrf += 1 / (k + vecRank);
|
|
6237
|
-
}
|
|
6238
|
-
if (ftsRank !== void 0) {
|
|
6239
|
-
rrf += 1 / (k + ftsRank);
|
|
6240
|
-
}
|
|
6241
|
-
return rrf;
|
|
6242
|
-
}
|
|
6243
|
-
/**
|
|
6244
|
-
* Assigns ranks to search results based on their scores
|
|
6245
|
-
*/
|
|
6246
|
-
assignRanks(results) {
|
|
6247
|
-
const vecRanks = /* @__PURE__ */ new Map();
|
|
6248
|
-
const ftsRanks = /* @__PURE__ */ new Map();
|
|
6249
|
-
results.filter((r) => r.vec_score !== void 0).sort((a, b) => (b.vec_score ?? 0) - (a.vec_score ?? 0)).forEach((result, index) => {
|
|
6250
|
-
vecRanks.set(Number(result.id), index + 1);
|
|
6251
|
-
});
|
|
6252
|
-
results.filter((r) => r.fts_score !== void 0).sort((a, b) => (b.fts_score ?? 0) - (a.fts_score ?? 0)).forEach((result, index) => {
|
|
6253
|
-
ftsRanks.set(Number(result.id), index + 1);
|
|
6254
|
-
});
|
|
6255
|
-
return results.map((result) => ({
|
|
6256
|
-
...result,
|
|
6257
|
-
vec_rank: vecRanks.get(Number(result.id)),
|
|
6258
|
-
fts_rank: ftsRanks.get(Number(result.id)),
|
|
6259
|
-
rrf_score: this.calculateRRF(
|
|
6260
|
-
vecRanks.get(Number(result.id)),
|
|
6261
|
-
ftsRanks.get(Number(result.id))
|
|
6262
|
-
)
|
|
6263
|
-
}));
|
|
6264
|
-
}
|
|
6265
|
-
constructor(dbPath) {
|
|
6266
|
-
if (!dbPath) {
|
|
6267
|
-
throw new StoreError("Missing required database path");
|
|
6268
|
-
}
|
|
6269
|
-
this.db = new Database(dbPath);
|
|
6270
|
-
}
|
|
6271
|
-
/**
|
|
6272
|
-
* Sets up prepared statements for database queries
|
|
6273
|
-
*/
|
|
6274
|
-
prepareStatements() {
|
|
6275
|
-
const statements = {
|
|
6276
|
-
getById: this.db.prepare("SELECT * FROM documents WHERE id = ?"),
|
|
6277
|
-
insertDocument: this.db.prepare(
|
|
6278
|
-
"INSERT INTO documents (library_id, version_id, url, content, metadata, sort_order, indexed_at) VALUES (?, ?, ?, ?, ?, ?, ?)"
|
|
6279
|
-
),
|
|
6280
|
-
insertEmbedding: this.db.prepare(
|
|
6281
|
-
"INSERT INTO documents_vec (rowid, library_id, version_id, embedding) VALUES (?, ?, ?, ?)"
|
|
6282
|
-
),
|
|
6283
|
-
insertLibrary: this.db.prepare(
|
|
6284
|
-
"INSERT INTO libraries (name) VALUES (?) ON CONFLICT(name) DO NOTHING"
|
|
6285
|
-
),
|
|
6286
|
-
getLibraryIdByName: this.db.prepare(
|
|
6287
|
-
"SELECT id FROM libraries WHERE name = ?"
|
|
6288
|
-
),
|
|
6289
|
-
// New version-related statements
|
|
6290
|
-
insertVersion: this.db.prepare(
|
|
6291
|
-
"INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING"
|
|
6292
|
-
),
|
|
6293
|
-
resolveVersionId: this.db.prepare(
|
|
6294
|
-
"SELECT id FROM versions WHERE library_id = ? AND name IS ?"
|
|
6295
|
-
),
|
|
6296
|
-
getVersionById: this.db.prepare("SELECT * FROM versions WHERE id = ?"),
|
|
6297
|
-
queryVersionsByLibraryId: this.db.prepare(
|
|
6298
|
-
"SELECT * FROM versions WHERE library_id = ? ORDER BY name"
|
|
6299
|
-
),
|
|
6300
|
-
deleteLibraryDocuments: this.db.prepare(
|
|
6301
|
-
`DELETE FROM documents
|
|
6302
|
-
WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
6303
|
-
AND version_id = (
|
|
6304
|
-
SELECT v.id FROM versions v
|
|
6305
|
-
WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
6306
|
-
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
6307
|
-
)`
|
|
6308
|
-
),
|
|
6309
|
-
deleteDocuments: this.db.prepare(
|
|
6310
|
-
`DELETE FROM documents
|
|
6311
|
-
WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
6312
|
-
AND version_id = (
|
|
6313
|
-
SELECT v.id FROM versions v
|
|
6314
|
-
WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
6315
|
-
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
6316
|
-
)`
|
|
6317
|
-
),
|
|
6318
|
-
deleteDocumentsByUrl: this.db.prepare(
|
|
6319
|
-
`DELETE FROM documents
|
|
6320
|
-
WHERE url = ?
|
|
6321
|
-
AND library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
6322
|
-
AND version_id = (
|
|
6323
|
-
SELECT v.id FROM versions v
|
|
6324
|
-
WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
|
|
6325
|
-
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
6326
|
-
)`
|
|
6327
|
-
),
|
|
6328
|
-
getDocumentBySort: this.db.prepare(
|
|
6329
|
-
`SELECT d.id
|
|
6330
|
-
FROM documents d
|
|
6331
|
-
JOIN versions v ON d.version_id = v.id
|
|
6332
|
-
JOIN libraries l ON v.library_id = l.id
|
|
6333
|
-
WHERE l.name = ?
|
|
6334
|
-
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
6335
|
-
LIMIT 1`
|
|
6336
|
-
),
|
|
6337
|
-
queryVersions: this.db.prepare(
|
|
6338
|
-
`SELECT DISTINCT v.name
|
|
6339
|
-
FROM versions v
|
|
6340
|
-
JOIN libraries l ON v.library_id = l.id
|
|
6341
|
-
WHERE l.name = ?
|
|
6342
|
-
ORDER BY v.name`
|
|
6343
|
-
),
|
|
6344
|
-
checkExists: this.db.prepare(
|
|
6345
|
-
`SELECT d.id FROM documents d
|
|
6346
|
-
JOIN versions v ON d.version_id = v.id
|
|
6347
|
-
JOIN libraries l ON v.library_id = l.id
|
|
6348
|
-
WHERE l.name = ?
|
|
6349
|
-
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
6350
|
-
LIMIT 1`
|
|
6351
|
-
),
|
|
6352
|
-
queryLibraryVersions: this.db.prepare(
|
|
6353
|
-
`SELECT
|
|
6354
|
-
l.name as library,
|
|
6355
|
-
v.name as version,
|
|
6356
|
-
COUNT(*) as documentCount,
|
|
6357
|
-
COUNT(DISTINCT d.url) as uniqueUrlCount,
|
|
6358
|
-
MIN(d.indexed_at) as indexedAt
|
|
6359
|
-
FROM documents d
|
|
6360
|
-
JOIN versions v ON d.version_id = v.id
|
|
6361
|
-
JOIN libraries l ON v.library_id = l.id
|
|
6362
|
-
GROUP BY l.name, v.name
|
|
6363
|
-
ORDER BY l.name, v.name`
|
|
6364
|
-
),
|
|
6365
|
-
getChildChunks: this.db.prepare(`
|
|
6366
|
-
SELECT d.* FROM documents d
|
|
6367
|
-
JOIN versions v ON d.version_id = v.id
|
|
6368
|
-
JOIN libraries l ON v.library_id = l.id
|
|
6369
|
-
WHERE l.name = ?
|
|
6370
|
-
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
6371
|
-
AND d.url = ?
|
|
6372
|
-
AND json_array_length(json_extract(d.metadata, '$.path')) = ?
|
|
6373
|
-
AND json_extract(d.metadata, '$.path') LIKE ? || '%'
|
|
6374
|
-
AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
|
|
6375
|
-
ORDER BY d.sort_order
|
|
6376
|
-
LIMIT ?
|
|
6377
|
-
`),
|
|
6378
|
-
getPrecedingSiblings: this.db.prepare(`
|
|
6379
|
-
SELECT d.* FROM documents d
|
|
6380
|
-
JOIN versions v ON d.version_id = v.id
|
|
6381
|
-
JOIN libraries l ON v.library_id = l.id
|
|
6382
|
-
WHERE l.name = ?
|
|
6383
|
-
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
6384
|
-
AND d.url = ?
|
|
6385
|
-
AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
|
|
6386
|
-
AND json_extract(d.metadata, '$.path') = ?
|
|
6387
|
-
ORDER BY d.sort_order DESC
|
|
6388
|
-
LIMIT ?
|
|
6389
|
-
`),
|
|
6390
|
-
getSubsequentSiblings: this.db.prepare(`
|
|
6391
|
-
SELECT d.* FROM documents d
|
|
6392
|
-
JOIN versions v ON d.version_id = v.id
|
|
6393
|
-
JOIN libraries l ON v.library_id = l.id
|
|
6394
|
-
WHERE l.name = ?
|
|
6395
|
-
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
6396
|
-
AND d.url = ?
|
|
6397
|
-
AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
|
|
6398
|
-
AND json_extract(d.metadata, '$.path') = ?
|
|
6399
|
-
ORDER BY d.sort_order
|
|
6400
|
-
LIMIT ?
|
|
6401
|
-
`),
|
|
6402
|
-
getParentChunk: this.db.prepare(`
|
|
6403
|
-
SELECT d.* FROM documents d
|
|
6404
|
-
JOIN versions v ON d.version_id = v.id
|
|
6405
|
-
JOIN libraries l ON v.library_id = l.id
|
|
6406
|
-
WHERE l.name = ?
|
|
6407
|
-
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
6408
|
-
AND d.url = ?
|
|
6409
|
-
AND json_extract(d.metadata, '$.path') = ?
|
|
6410
|
-
AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
|
|
6411
|
-
ORDER BY d.sort_order DESC
|
|
6412
|
-
LIMIT 1
|
|
6413
|
-
`),
|
|
6414
|
-
// Status tracking statements
|
|
6415
|
-
updateVersionStatus: this.db.prepare(
|
|
6416
|
-
"UPDATE versions SET status = ?, error_message = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
|
|
6417
|
-
),
|
|
6418
|
-
updateVersionProgress: this.db.prepare(
|
|
6419
|
-
"UPDATE versions SET progress_pages = ?, progress_max_pages = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
|
|
6420
|
-
),
|
|
6421
|
-
getVersionsByStatus: this.db.prepare(
|
|
6422
|
-
"SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.status IN (SELECT value FROM json_each(?))"
|
|
6423
|
-
),
|
|
6424
|
-
getRunningVersions: this.db.prepare(
|
|
6425
|
-
"SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.status = 'running' ORDER BY v.started_at"
|
|
6426
|
-
),
|
|
6427
|
-
getActiveVersions: this.db.prepare(
|
|
6428
|
-
"SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.status IN ('queued', 'running', 'updating') ORDER BY v.created_at"
|
|
6429
|
-
),
|
|
6430
|
-
// Scraper options statements
|
|
6431
|
-
updateVersionScraperOptions: this.db.prepare(
|
|
6432
|
-
"UPDATE versions SET source_url = ?, scraper_options = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
|
|
6433
|
-
),
|
|
6434
|
-
getVersionWithOptions: this.db.prepare(
|
|
6435
|
-
"SELECT * FROM versions WHERE id = ?"
|
|
6436
|
-
),
|
|
6437
|
-
getVersionsBySourceUrl: this.db.prepare(
|
|
6438
|
-
"SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.source_url = ? ORDER BY v.created_at DESC"
|
|
6439
|
-
)
|
|
6440
|
-
};
|
|
6441
|
-
this.statements = statements;
|
|
6442
|
-
}
|
|
6443
|
-
/**
|
|
6444
|
-
* Pads a vector to the fixed database dimension by appending zeros.
|
|
6445
|
-
* Throws an error if the input vector is longer than the database dimension.
|
|
6446
|
-
*/
|
|
6447
|
-
padVector(vector) {
|
|
6448
|
-
if (vector.length > this.dbDimension) {
|
|
6449
|
-
throw new Error(
|
|
6450
|
-
`Vector dimension ${vector.length} exceeds database dimension ${this.dbDimension}`
|
|
6451
|
-
);
|
|
6452
|
-
}
|
|
6453
|
-
if (vector.length === this.dbDimension) {
|
|
6454
|
-
return vector;
|
|
6455
|
-
}
|
|
6456
|
-
return [...vector, ...new Array(this.dbDimension - vector.length).fill(0)];
|
|
6457
|
-
}
|
|
6458
|
-
/**
|
|
6459
|
-
* Initializes embeddings client using environment variables for configuration.
|
|
6460
|
-
*
|
|
6461
|
-
* The embedding model is configured using DOCS_MCP_EMBEDDING_MODEL environment variable.
|
|
6462
|
-
* Format: "provider:model_name" (e.g., "google:text-embedding-004") or just "model_name"
|
|
6463
|
-
* for OpenAI (default).
|
|
6464
|
-
*
|
|
6465
|
-
* Supported providers and their required environment variables:
|
|
6466
|
-
* - openai: OPENAI_API_KEY (and optionally OPENAI_API_BASE, OPENAI_ORG_ID)
|
|
6467
|
-
* - google: GOOGLE_APPLICATION_CREDENTIALS (path to service account JSON)
|
|
6468
|
-
* - aws: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION (or BEDROCK_AWS_REGION)
|
|
6469
|
-
* - microsoft: Azure OpenAI credentials (AZURE_OPENAI_API_*)
|
|
6470
|
-
*/
|
|
6471
|
-
async initializeEmbeddings() {
|
|
6472
|
-
const modelSpec = process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
|
|
6473
|
-
const { createEmbeddingModel } = await import("./EmbeddingFactory-CElwVk3X.js");
|
|
6474
|
-
this.embeddings = createEmbeddingModel(modelSpec);
|
|
6475
|
-
const testVector = await this.embeddings.embedQuery("test");
|
|
6476
|
-
this.modelDimension = testVector.length;
|
|
6477
|
-
if (this.modelDimension > this.dbDimension) {
|
|
6478
|
-
throw new DimensionError(modelSpec, this.modelDimension, this.dbDimension);
|
|
6479
|
-
}
|
|
6480
|
-
}
|
|
6481
|
-
/**
|
|
6482
|
-
* Escapes a query string for use with SQLite FTS5 MATCH operator.
|
|
6483
|
-
* Wraps the query in double quotes and escapes internal double quotes.
|
|
6484
|
-
*/
|
|
6485
|
-
escapeFtsQuery(query) {
|
|
6486
|
-
const escapedQuotes = query.replace(/"/g, '""');
|
|
6487
|
-
return `"${escapedQuotes}"`;
|
|
6488
|
-
}
|
|
6489
|
-
/**
|
|
6490
|
-
* Initializes database connection and ensures readiness
|
|
6491
|
-
*/
|
|
6492
|
-
async initialize() {
|
|
6493
|
-
try {
|
|
6494
|
-
sqliteVec.load(this.db);
|
|
6495
|
-
applyMigrations(this.db);
|
|
6496
|
-
this.prepareStatements();
|
|
6497
|
-
await this.initializeEmbeddings();
|
|
6498
|
-
} catch (error) {
|
|
6499
|
-
if (error instanceof StoreError) {
|
|
6500
|
-
throw error;
|
|
6501
|
-
}
|
|
6502
|
-
throw new ConnectionError("Failed to initialize database connection", error);
|
|
6503
|
-
}
|
|
6504
|
-
}
|
|
6505
|
-
/**
|
|
6506
|
-
* Gracefully closes database connections
|
|
6507
|
-
*/
|
|
6508
|
-
async shutdown() {
|
|
6509
|
-
this.db.close();
|
|
6510
|
-
}
|
|
6511
|
-
/**
|
|
6512
|
-
* Resolves a library name and version string to library_id and version_id.
|
|
6513
|
-
* Creates library and version records if they don't exist.
|
|
6514
|
-
*/
|
|
6515
|
-
async resolveLibraryAndVersionIds(library, version2) {
|
|
6516
|
-
const normalizedLibrary = library.toLowerCase();
|
|
6517
|
-
const normalizedVersion = denormalizeVersionName(version2.toLowerCase());
|
|
6518
|
-
this.statements.insertLibrary.run(normalizedLibrary);
|
|
6519
|
-
const libraryIdRow = this.statements.getLibraryIdByName.get(normalizedLibrary);
|
|
6520
|
-
if (!libraryIdRow || typeof libraryIdRow.id !== "number") {
|
|
6521
|
-
throw new StoreError(`Failed to resolve library_id for library: ${library}`);
|
|
6522
|
-
}
|
|
6523
|
-
const libraryId = libraryIdRow.id;
|
|
6524
|
-
this.statements.insertVersion.run(libraryId, normalizedVersion);
|
|
6525
|
-
const versionIdRow = this.statements.resolveVersionId.get(
|
|
6526
|
-
libraryId,
|
|
6527
|
-
normalizedVersion
|
|
6528
|
-
);
|
|
6529
|
-
if (!versionIdRow || typeof versionIdRow.id !== "number") {
|
|
6530
|
-
throw new StoreError(
|
|
6531
|
-
`Failed to resolve version_id for library: ${library}, version: ${version2}`
|
|
6532
|
-
);
|
|
6533
|
-
}
|
|
6534
|
-
return { libraryId, versionId: versionIdRow.id };
|
|
6535
|
-
}
|
|
6536
|
-
/**
|
|
6537
|
-
* Retrieves all unique versions for a specific library
|
|
6538
|
-
*/
|
|
6539
|
-
async queryUniqueVersions(library) {
|
|
6540
|
-
try {
|
|
6541
|
-
const rows = this.statements.queryVersions.all(library.toLowerCase());
|
|
6542
|
-
return rows.map((row) => normalizeVersionName(row.name));
|
|
6543
|
-
} catch (error) {
|
|
6544
|
-
throw new ConnectionError("Failed to query versions", error);
|
|
6545
|
-
}
|
|
6546
|
-
}
|
|
6547
|
-
/**
|
|
6548
|
-
* Updates the status of a version record in the database.
|
|
6549
|
-
* @param versionId The version ID to update
|
|
6550
|
-
* @param status The new status to set
|
|
6551
|
-
* @param errorMessage Optional error message for failed statuses
|
|
6552
|
-
*/
|
|
6553
|
-
async updateVersionStatus(versionId, status, errorMessage) {
|
|
6554
|
-
try {
|
|
6555
|
-
this.statements.updateVersionStatus.run(status, errorMessage ?? null, versionId);
|
|
6556
|
-
} catch (error) {
|
|
6557
|
-
throw new StoreError(`Failed to update version status: ${error}`);
|
|
6558
|
-
}
|
|
6559
|
-
}
|
|
6560
|
-
/**
|
|
6561
|
-
* Updates the progress counters for a version being indexed.
|
|
6562
|
-
* @param versionId The version ID to update
|
|
6563
|
-
* @param pages Current number of pages processed
|
|
6564
|
-
* @param maxPages Total number of pages to process
|
|
6565
|
-
*/
|
|
6566
|
-
async updateVersionProgress(versionId, pages, maxPages) {
|
|
6567
|
-
try {
|
|
6568
|
-
this.statements.updateVersionProgress.run(pages, maxPages, versionId);
|
|
6569
|
-
} catch (error) {
|
|
6570
|
-
throw new StoreError(`Failed to update version progress: ${error}`);
|
|
6571
|
-
}
|
|
6572
|
-
}
|
|
6573
|
-
/**
|
|
6574
|
-
* Retrieves versions by their status.
|
|
6575
|
-
* @param statuses Array of statuses to filter by
|
|
6576
|
-
* @returns Array of version records matching the statuses
|
|
6577
|
-
*/
|
|
6578
|
-
async getVersionsByStatus(statuses) {
|
|
6579
|
-
try {
|
|
6580
|
-
const statusJson = JSON.stringify(statuses);
|
|
6581
|
-
const rows = this.statements.getVersionsByStatus.all(
|
|
6582
|
-
statusJson
|
|
6583
|
-
);
|
|
6584
|
-
return rows;
|
|
6585
|
-
} catch (error) {
|
|
6586
|
-
throw new StoreError(`Failed to get versions by status: ${error}`);
|
|
6587
|
-
}
|
|
6588
|
-
}
|
|
6589
|
-
/**
|
|
6590
|
-
* Retrieves all versions currently in RUNNING status.
|
|
6591
|
-
* @returns Array of running version records with library names
|
|
6592
|
-
*/
|
|
6593
|
-
async getRunningVersions() {
|
|
6594
|
-
try {
|
|
6595
|
-
const rows = this.statements.getRunningVersions.all();
|
|
6596
|
-
return rows;
|
|
6597
|
-
} catch (error) {
|
|
6598
|
-
throw new StoreError(`Failed to get running versions: ${error}`);
|
|
6599
|
-
}
|
|
6600
|
-
}
|
|
6601
|
-
/**
|
|
6602
|
-
* Retrieves all versions in active states (queued, running, updating).
|
|
6603
|
-
* @returns Array of active version records with library names
|
|
6604
|
-
*/
|
|
6605
|
-
async getActiveVersions() {
|
|
6606
|
-
try {
|
|
6607
|
-
const rows = this.statements.getActiveVersions.all();
|
|
6608
|
-
return rows;
|
|
6609
|
-
} catch (error) {
|
|
6610
|
-
throw new StoreError(`Failed to get active versions: ${error}`);
|
|
6611
|
-
}
|
|
6612
|
-
}
|
|
6613
|
-
/**
|
|
6614
|
-
* Stores scraper options for a version to enable reproducible indexing.
|
|
6615
|
-
* @param versionId The version ID to update
|
|
6616
|
-
* @param options Complete scraper options used for indexing
|
|
6617
|
-
*/
|
|
6618
|
-
async storeScraperOptions(versionId, options) {
|
|
6619
|
-
try {
|
|
6620
|
-
const { url: source_url, library, version: version2, signal, ...scraper_options } = options;
|
|
6621
|
-
const optionsJson = JSON.stringify(scraper_options);
|
|
6622
|
-
this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId);
|
|
6623
|
-
} catch (error) {
|
|
6624
|
-
throw new StoreError(`Failed to store scraper options: ${error}`);
|
|
6625
|
-
}
|
|
6626
|
-
}
|
|
6627
|
-
/**
|
|
6628
|
-
* Retrieves stored scraper options for a version.
|
|
6629
|
-
* @param versionId The version ID to query
|
|
6630
|
-
* @returns Stored scraper options or null if none stored
|
|
6631
|
-
*/
|
|
6632
|
-
async getVersionScraperOptions(versionId) {
|
|
6633
|
-
try {
|
|
6634
|
-
const row = this.statements.getVersionWithOptions.get(versionId);
|
|
6635
|
-
if (!row?.scraper_options) {
|
|
6636
|
-
return null;
|
|
6637
|
-
}
|
|
6638
|
-
return JSON.parse(row.scraper_options);
|
|
6639
|
-
} catch (error) {
|
|
6640
|
-
throw new StoreError(`Failed to get version scraper options: ${error}`);
|
|
6641
|
-
}
|
|
6642
|
-
}
|
|
6643
|
-
/**
|
|
6644
|
-
* Retrieves a version record with all stored options.
|
|
6645
|
-
* @param versionId The version ID to query
|
|
6646
|
-
* @returns Complete version record or null if not found
|
|
6647
|
-
*/
|
|
6648
|
-
async getVersionWithStoredOptions(versionId) {
|
|
6649
|
-
try {
|
|
6650
|
-
const row = this.statements.getVersionWithOptions.get(versionId);
|
|
6651
|
-
return row || null;
|
|
6652
|
-
} catch (error) {
|
|
6653
|
-
throw new StoreError(`Failed to get version with stored options: ${error}`);
|
|
6654
|
-
}
|
|
6655
|
-
}
|
|
6656
|
-
/**
|
|
6657
|
-
* Finds versions that were indexed from the same source URL.
|
|
6658
|
-
* Useful for finding similar configurations or detecting duplicates.
|
|
6659
|
-
* @param url Source URL to search for
|
|
6660
|
-
* @returns Array of versions with the same source URL
|
|
6661
|
-
*/
|
|
6662
|
-
async findVersionsBySourceUrl(url) {
|
|
6663
|
-
try {
|
|
6664
|
-
const rows = this.statements.getVersionsBySourceUrl.all(
|
|
6665
|
-
url
|
|
6666
|
-
);
|
|
6667
|
-
return rows;
|
|
6668
|
-
} catch (error) {
|
|
6669
|
-
throw new StoreError(`Failed to find versions by source URL: ${error}`);
|
|
6670
|
-
}
|
|
6671
|
-
}
|
|
6672
|
-
/**
|
|
6673
|
-
* Verifies existence of documents for a specific library version
|
|
6674
|
-
*/
|
|
6675
|
-
async checkDocumentExists(library, version2) {
|
|
6676
|
-
try {
|
|
6677
|
-
const normalizedVersion = version2.toLowerCase();
|
|
6678
|
-
const result = this.statements.checkExists.get(
|
|
6679
|
-
library.toLowerCase(),
|
|
6680
|
-
normalizedVersion
|
|
6681
|
-
);
|
|
6682
|
-
return result !== void 0;
|
|
6683
|
-
} catch (error) {
|
|
6684
|
-
throw new ConnectionError("Failed to check document existence", error);
|
|
6685
|
-
}
|
|
6686
|
-
}
|
|
6687
|
-
/**
|
|
6688
|
-
* Retrieves a mapping of all libraries to their available versions with details.
|
|
6689
|
-
*/
|
|
6690
|
-
async queryLibraryVersions() {
|
|
6691
|
-
try {
|
|
6692
|
-
const rows = this.statements.queryLibraryVersions.all();
|
|
6693
|
-
const libraryMap = /* @__PURE__ */ new Map();
|
|
6694
|
-
for (const row of rows) {
|
|
6695
|
-
const library = row.library;
|
|
6696
|
-
if (!libraryMap.has(library)) {
|
|
6697
|
-
libraryMap.set(library, []);
|
|
6698
|
-
}
|
|
6699
|
-
const indexedAtISO = row.indexedAt ? new Date(row.indexedAt).toISOString() : null;
|
|
6700
|
-
libraryMap.get(library)?.push({
|
|
6701
|
-
version: row.version,
|
|
6702
|
-
documentCount: row.documentCount,
|
|
6703
|
-
uniqueUrlCount: row.uniqueUrlCount,
|
|
6704
|
-
indexedAt: indexedAtISO
|
|
6705
|
-
});
|
|
6706
|
-
}
|
|
6707
|
-
for (const versions of libraryMap.values()) {
|
|
6708
|
-
versions.sort((a, b) => {
|
|
6709
|
-
if (a.version === "" && b.version !== "") {
|
|
6710
|
-
return -1;
|
|
6711
|
-
}
|
|
6712
|
-
if (a.version !== "" && b.version === "") {
|
|
6713
|
-
return 1;
|
|
6714
|
-
}
|
|
6715
|
-
if (a.version === "" && b.version === "") {
|
|
6716
|
-
return 0;
|
|
6717
|
-
}
|
|
6718
|
-
try {
|
|
6719
|
-
return semver__default.compare(a.version, b.version);
|
|
6720
|
-
} catch (_error) {
|
|
6721
|
-
return a.version.localeCompare(b.version);
|
|
6722
|
-
}
|
|
6723
|
-
});
|
|
6724
|
-
}
|
|
6725
|
-
return libraryMap;
|
|
6726
|
-
} catch (error) {
|
|
6727
|
-
throw new ConnectionError("Failed to query library versions", error);
|
|
6728
|
-
}
|
|
6729
|
-
}
|
|
6730
|
-
/**
|
|
6731
|
-
* Stores documents with library and version metadata, generating embeddings
|
|
6732
|
-
* for vector similarity search. Automatically removes any existing documents
|
|
6733
|
-
* for the same URLs before adding new ones to prevent UNIQUE constraint violations.
|
|
6734
|
-
*/
|
|
6735
|
-
async addDocuments(library, version2, documents) {
|
|
6736
|
-
try {
|
|
6737
|
-
if (documents.length === 0) {
|
|
6738
|
-
return;
|
|
6739
|
-
}
|
|
6740
|
-
const urls = /* @__PURE__ */ new Set();
|
|
6741
|
-
for (const doc of documents) {
|
|
6742
|
-
const url = doc.metadata.url;
|
|
6743
|
-
if (!url || typeof url !== "string" || !url.trim()) {
|
|
6744
|
-
throw new StoreError("Document metadata must include a valid URL");
|
|
6745
|
-
}
|
|
6746
|
-
urls.add(url);
|
|
6747
|
-
}
|
|
6748
|
-
const texts = documents.map((doc) => {
|
|
6749
|
-
const header = `<title>${doc.metadata.title}</title>
|
|
6750
|
-
<url>${doc.metadata.url}</url>
|
|
6751
|
-
<path>${doc.metadata.path.join(" / ")}</path>
|
|
6752
|
-
`;
|
|
6753
|
-
return `${header}${doc.pageContent}`;
|
|
6754
|
-
});
|
|
6755
|
-
const rawEmbeddings = [];
|
|
6756
|
-
for (let i = 0; i < texts.length; i += EMBEDDING_BATCH_SIZE) {
|
|
6757
|
-
const batchTexts = texts.slice(i, i + EMBEDDING_BATCH_SIZE);
|
|
6758
|
-
const batchEmbeddings = await this.embeddings.embedDocuments(batchTexts);
|
|
6759
|
-
rawEmbeddings.push(...batchEmbeddings);
|
|
6760
|
-
}
|
|
6761
|
-
const paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
|
|
6762
|
-
const { libraryId, versionId } = await this.resolveLibraryAndVersionIds(
|
|
6763
|
-
library,
|
|
6764
|
-
version2
|
|
6765
|
-
);
|
|
6766
|
-
for (const url of urls) {
|
|
6767
|
-
const deletedCount = await this.deleteDocumentsByUrl(library, version2, url);
|
|
6768
|
-
if (deletedCount > 0) {
|
|
6769
|
-
logger.debug(`🗑️ Deleted ${deletedCount} existing documents for URL: ${url}`);
|
|
6770
|
-
}
|
|
6771
|
-
}
|
|
6772
|
-
const transaction = this.db.transaction((docs) => {
|
|
6773
|
-
for (let i = 0; i < docs.length; i++) {
|
|
6774
|
-
const doc = docs[i];
|
|
6775
|
-
const url = doc.metadata.url;
|
|
6776
|
-
const result = this.statements.insertDocument.run(
|
|
6777
|
-
BigInt(libraryId),
|
|
6778
|
-
BigInt(versionId),
|
|
6779
|
-
url,
|
|
6780
|
-
doc.pageContent,
|
|
6781
|
-
JSON.stringify(doc.metadata),
|
|
6782
|
-
i,
|
|
6783
|
-
(/* @__PURE__ */ new Date()).toISOString()
|
|
6784
|
-
// Pass current timestamp for indexed_at
|
|
6785
|
-
);
|
|
6786
|
-
const rowId = result.lastInsertRowid;
|
|
6787
|
-
this.statements.insertEmbedding.run(
|
|
6788
|
-
BigInt(rowId),
|
|
6789
|
-
BigInt(libraryId),
|
|
6790
|
-
BigInt(versionId),
|
|
6791
|
-
JSON.stringify(paddedEmbeddings[i])
|
|
6792
|
-
);
|
|
6793
|
-
}
|
|
6794
|
-
});
|
|
6795
|
-
transaction(documents);
|
|
6796
|
-
} catch (error) {
|
|
6797
|
-
throw new ConnectionError("Failed to add documents to store", error);
|
|
6798
|
-
}
|
|
6799
|
-
}
|
|
6800
|
-
/**
|
|
6801
|
-
* Removes documents matching specified library and version
|
|
6802
|
-
* @returns Number of documents deleted
|
|
6803
|
-
*/
|
|
6804
|
-
async deleteDocuments(library, version2) {
|
|
6805
|
-
try {
|
|
6806
|
-
const normalizedVersion = version2.toLowerCase();
|
|
6807
|
-
const result = this.statements.deleteDocuments.run(
|
|
6808
|
-
library.toLowerCase(),
|
|
6809
|
-
library.toLowerCase(),
|
|
6810
|
-
// library name appears twice in the query
|
|
6811
|
-
normalizedVersion
|
|
6812
|
-
);
|
|
6813
|
-
return result.changes;
|
|
6814
|
-
} catch (error) {
|
|
6815
|
-
throw new ConnectionError("Failed to delete documents", error);
|
|
6816
|
-
}
|
|
6817
|
-
}
|
|
6818
|
-
/**
|
|
6819
|
-
* Removes documents for a specific URL within a library and version
|
|
6820
|
-
* @returns Number of documents deleted
|
|
6821
|
-
*/
|
|
6822
|
-
async deleteDocumentsByUrl(library, version2, url) {
|
|
6823
|
-
try {
|
|
6824
|
-
const normalizedVersion = version2.toLowerCase();
|
|
6825
|
-
const result = this.statements.deleteDocumentsByUrl.run(
|
|
6826
|
-
url,
|
|
6827
|
-
library.toLowerCase(),
|
|
6828
|
-
library.toLowerCase(),
|
|
6829
|
-
// library name appears twice in the query
|
|
6830
|
-
normalizedVersion
|
|
6831
|
-
);
|
|
6832
|
-
return result.changes;
|
|
6833
|
-
} catch (error) {
|
|
6834
|
-
throw new ConnectionError("Failed to delete documents by URL", error);
|
|
6835
|
-
}
|
|
6836
|
-
}
|
|
6837
|
-
/**
|
|
6838
|
-
* Retrieves a document by its ID.
|
|
6839
|
-
* @param id The ID of the document.
|
|
6840
|
-
* @returns The document, or null if not found.
|
|
6841
|
-
*/
|
|
6842
|
-
async getById(id) {
|
|
6843
|
-
try {
|
|
6844
|
-
const row = this.statements.getById.get(BigInt(id));
|
|
6845
|
-
if (!row) {
|
|
6846
|
-
return null;
|
|
6847
|
-
}
|
|
6848
|
-
return mapDbDocumentToDocument(row);
|
|
6849
|
-
} catch (error) {
|
|
6850
|
-
throw new ConnectionError(`Failed to get document by ID ${id}`, error);
|
|
6851
|
-
}
|
|
6852
|
-
}
|
|
6853
|
-
/**
|
|
6854
|
-
* Finds documents matching a text query using hybrid search.
|
|
6855
|
-
* Combines vector similarity search with full-text search using Reciprocal Rank Fusion.
|
|
6856
|
-
*/
|
|
6857
|
-
async findByContent(library, version2, query, limit) {
|
|
6858
|
-
try {
|
|
6859
|
-
const rawEmbedding = await this.embeddings.embedQuery(query);
|
|
6860
|
-
const embedding = this.padVector(rawEmbedding);
|
|
6861
|
-
const ftsQuery = this.escapeFtsQuery(query);
|
|
6862
|
-
const normalizedVersion = version2.toLowerCase();
|
|
6863
|
-
const stmt = this.db.prepare(`
|
|
6864
|
-
WITH vec_distances AS (
|
|
6865
|
-
SELECT
|
|
6866
|
-
dv.rowid as id,
|
|
6867
|
-
dv.distance as vec_distance
|
|
6868
|
-
FROM documents_vec dv
|
|
6869
|
-
JOIN versions v ON dv.version_id = v.id
|
|
6870
|
-
JOIN libraries l ON v.library_id = l.id
|
|
6871
|
-
WHERE l.name = ?
|
|
6872
|
-
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
6873
|
-
AND dv.embedding MATCH ?
|
|
6874
|
-
AND dv.k = ?
|
|
6875
|
-
ORDER BY dv.distance
|
|
6876
|
-
),
|
|
6877
|
-
fts_scores AS (
|
|
6878
|
-
SELECT
|
|
6879
|
-
f.rowid as id,
|
|
6880
|
-
bm25(documents_fts, 10.0, 1.0, 5.0, 1.0) as fts_score
|
|
6881
|
-
FROM documents_fts f
|
|
6882
|
-
JOIN documents d ON f.rowid = d.id
|
|
6883
|
-
JOIN versions v ON d.version_id = v.id
|
|
6884
|
-
JOIN libraries l ON v.library_id = l.id
|
|
6885
|
-
WHERE l.name = ?
|
|
6886
|
-
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
6887
|
-
AND documents_fts MATCH ?
|
|
6888
|
-
ORDER BY fts_score
|
|
6889
|
-
LIMIT ?
|
|
6890
|
-
)
|
|
6891
|
-
SELECT
|
|
6892
|
-
d.id,
|
|
6893
|
-
d.content,
|
|
6894
|
-
d.metadata,
|
|
6895
|
-
COALESCE(1 / (1 + v.vec_distance), 0) as vec_score,
|
|
6896
|
-
COALESCE(-MIN(f.fts_score, 0), 0) as fts_score
|
|
6897
|
-
FROM documents d
|
|
6898
|
-
LEFT JOIN vec_distances v ON d.id = v.id
|
|
6899
|
-
LEFT JOIN fts_scores f ON d.id = f.id
|
|
6900
|
-
WHERE v.id IS NOT NULL OR f.id IS NOT NULL
|
|
6901
|
-
`);
|
|
6902
|
-
const rawResults = stmt.all(
|
|
6903
|
-
library.toLowerCase(),
|
|
6904
|
-
normalizedVersion,
|
|
6905
|
-
JSON.stringify(embedding),
|
|
6906
|
-
limit,
|
|
6907
|
-
library.toLowerCase(),
|
|
6908
|
-
normalizedVersion,
|
|
6909
|
-
ftsQuery,
|
|
6910
|
-
// Use the escaped query
|
|
6911
|
-
limit
|
|
6912
|
-
);
|
|
6913
|
-
const rankedResults = this.assignRanks(rawResults);
|
|
6914
|
-
const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
|
|
6915
|
-
return topResults.map((row) => ({
|
|
6916
|
-
...mapDbDocumentToDocument(row),
|
|
6917
|
-
metadata: {
|
|
6918
|
-
...JSON.parse(row.metadata),
|
|
6919
|
-
id: row.id,
|
|
6920
|
-
score: row.rrf_score,
|
|
6921
|
-
vec_rank: row.vec_rank,
|
|
6922
|
-
fts_rank: row.fts_rank
|
|
6923
|
-
}
|
|
6924
|
-
}));
|
|
6925
|
-
} catch (error) {
|
|
6926
|
-
throw new ConnectionError(
|
|
6927
|
-
`Failed to find documents by content with query "${query}"`,
|
|
6928
|
-
error
|
|
6929
|
-
);
|
|
6930
|
-
}
|
|
6931
|
-
}
|
|
6932
|
-
/**
|
|
6933
|
-
* Finds child chunks of a given document based on path hierarchy.
|
|
6934
|
-
*/
|
|
6935
|
-
async findChildChunks(library, version2, id, limit) {
|
|
6936
|
-
try {
|
|
6937
|
-
const parent = await this.getById(id);
|
|
6938
|
-
if (!parent) {
|
|
6939
|
-
return [];
|
|
6940
|
-
}
|
|
6941
|
-
const parentPath = parent.metadata.path ?? [];
|
|
6942
|
-
const parentUrl = parent.metadata.url;
|
|
6943
|
-
const normalizedVersion = version2.toLowerCase();
|
|
6944
|
-
const result = this.statements.getChildChunks.all(
|
|
6945
|
-
library.toLowerCase(),
|
|
6946
|
-
normalizedVersion,
|
|
6947
|
-
parentUrl,
|
|
6948
|
-
parentPath.length + 1,
|
|
6949
|
-
JSON.stringify(parentPath),
|
|
6950
|
-
BigInt(id),
|
|
6951
|
-
limit
|
|
6952
|
-
);
|
|
6953
|
-
return result.map((row) => mapDbDocumentToDocument(row));
|
|
6954
|
-
} catch (error) {
|
|
6955
|
-
throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
|
|
6956
|
-
}
|
|
6957
|
-
}
|
|
6958
|
-
/**
|
|
6959
|
-
* Finds preceding sibling chunks of a given document.
|
|
6960
|
-
*/
|
|
6961
|
-
async findPrecedingSiblingChunks(library, version2, id, limit) {
|
|
6962
|
-
try {
|
|
6963
|
-
const reference = await this.getById(id);
|
|
6964
|
-
if (!reference) {
|
|
6965
|
-
return [];
|
|
6966
|
-
}
|
|
6967
|
-
const refMetadata = reference.metadata;
|
|
6968
|
-
const normalizedVersion = version2.toLowerCase();
|
|
6969
|
-
const result = this.statements.getPrecedingSiblings.all(
|
|
6970
|
-
library.toLowerCase(),
|
|
6971
|
-
normalizedVersion,
|
|
6972
|
-
refMetadata.url,
|
|
6973
|
-
BigInt(id),
|
|
6974
|
-
JSON.stringify(refMetadata.path),
|
|
6975
|
-
limit
|
|
6976
|
-
);
|
|
6977
|
-
return result.reverse().map((row) => mapDbDocumentToDocument(row));
|
|
6978
|
-
} catch (error) {
|
|
6979
|
-
throw new ConnectionError(
|
|
6980
|
-
`Failed to find preceding sibling chunks for ID ${id}`,
|
|
6981
|
-
error
|
|
6982
|
-
);
|
|
6983
|
-
}
|
|
6984
|
-
}
|
|
6985
|
-
/**
|
|
6986
|
-
* Finds subsequent sibling chunks of a given document.
|
|
6987
|
-
*/
|
|
6988
|
-
async findSubsequentSiblingChunks(library, version2, id, limit) {
|
|
6989
|
-
try {
|
|
6990
|
-
const reference = await this.getById(id);
|
|
6991
|
-
if (!reference) {
|
|
6992
|
-
return [];
|
|
6993
|
-
}
|
|
6994
|
-
const refMetadata = reference.metadata;
|
|
6995
|
-
const normalizedVersion = version2.toLowerCase();
|
|
6996
|
-
const result = this.statements.getSubsequentSiblings.all(
|
|
6997
|
-
library.toLowerCase(),
|
|
6998
|
-
normalizedVersion,
|
|
6999
|
-
refMetadata.url,
|
|
7000
|
-
BigInt(id),
|
|
7001
|
-
JSON.stringify(refMetadata.path),
|
|
7002
|
-
limit
|
|
7003
|
-
);
|
|
7004
|
-
return result.map((row) => mapDbDocumentToDocument(row));
|
|
7005
|
-
} catch (error) {
|
|
7006
|
-
throw new ConnectionError(
|
|
7007
|
-
`Failed to find subsequent sibling chunks for ID ${id}`,
|
|
7008
|
-
error
|
|
7009
|
-
);
|
|
7010
|
-
}
|
|
7011
|
-
}
|
|
7012
|
-
/**
|
|
7013
|
-
* Finds the parent chunk of a given document.
|
|
7014
|
-
*/
|
|
7015
|
-
async findParentChunk(library, version2, id) {
|
|
7016
|
-
try {
|
|
7017
|
-
const child = await this.getById(id);
|
|
7018
|
-
if (!child) {
|
|
7019
|
-
return null;
|
|
7020
|
-
}
|
|
7021
|
-
const childMetadata = child.metadata;
|
|
7022
|
-
const path2 = childMetadata.path ?? [];
|
|
7023
|
-
const parentPath = path2.slice(0, -1);
|
|
7024
|
-
if (parentPath.length === 0) {
|
|
7025
|
-
return null;
|
|
7026
|
-
}
|
|
7027
|
-
const normalizedVersion = version2.toLowerCase();
|
|
7028
|
-
const result = this.statements.getParentChunk.get(
|
|
7029
|
-
library.toLowerCase(),
|
|
7030
|
-
normalizedVersion,
|
|
7031
|
-
childMetadata.url,
|
|
7032
|
-
JSON.stringify(parentPath),
|
|
7033
|
-
BigInt(id)
|
|
7034
|
-
);
|
|
7035
|
-
if (!result) {
|
|
7036
|
-
return null;
|
|
7037
|
-
}
|
|
7038
|
-
return mapDbDocumentToDocument(result);
|
|
7039
|
-
} catch (error) {
|
|
7040
|
-
throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error);
|
|
7041
|
-
}
|
|
7042
|
-
}
|
|
7043
|
-
/**
|
|
7044
|
-
* Fetches multiple documents by their IDs in a single call.
|
|
7045
|
-
* Returns an array of Document objects, sorted by their sort_order.
|
|
7046
|
-
*/
|
|
7047
|
-
async findChunksByIds(library, version2, ids) {
|
|
7048
|
-
if (!ids.length) return [];
|
|
7049
|
-
try {
|
|
7050
|
-
const normalizedVersion = version2.toLowerCase();
|
|
7051
|
-
const placeholders = ids.map(() => "?").join(",");
|
|
7052
|
-
const stmt = this.db.prepare(
|
|
7053
|
-
`SELECT d.* FROM documents d
|
|
7054
|
-
JOIN libraries l ON d.library_id = l.id
|
|
7055
|
-
JOIN versions v ON d.version_id = v.id
|
|
7056
|
-
WHERE l.name = ?
|
|
7057
|
-
AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
7058
|
-
AND d.id IN (${placeholders})
|
|
7059
|
-
ORDER BY d.sort_order`
|
|
7060
|
-
);
|
|
7061
|
-
const rows = stmt.all(
|
|
7062
|
-
library.toLowerCase(),
|
|
7063
|
-
normalizedVersion,
|
|
7064
|
-
...ids
|
|
7065
|
-
);
|
|
7066
|
-
return rows.map((row) => mapDbDocumentToDocument(row));
|
|
7067
|
-
} catch (error) {
|
|
7068
|
-
throw new ConnectionError("Failed to fetch documents by IDs", error);
|
|
7069
|
-
}
|
|
7070
|
-
}
|
|
7071
|
-
}
|
|
7072
|
-
class DocumentManagementService {
|
|
7073
|
-
store;
|
|
7074
|
-
documentRetriever;
|
|
7075
|
-
splitter;
|
|
7076
|
-
/**
|
|
7077
|
-
* Normalizes a version string, converting null or undefined to an empty string
|
|
7078
|
-
* and converting to lowercase.
|
|
7079
|
-
*/
|
|
7080
|
-
normalizeVersion(version2) {
|
|
7081
|
-
return (version2 ?? "").toLowerCase();
|
|
7082
|
-
}
|
|
7083
|
-
constructor() {
|
|
7084
|
-
let dbPath;
|
|
7085
|
-
let dbDir;
|
|
7086
|
-
const envStorePath = process.env.DOCS_MCP_STORE_PATH;
|
|
7087
|
-
if (envStorePath) {
|
|
7088
|
-
dbDir = envStorePath;
|
|
7089
|
-
dbPath = path.join(dbDir, "documents.db");
|
|
7090
|
-
logger.debug(`💾 Using database directory from DOCS_MCP_STORE_PATH: ${dbDir}`);
|
|
7091
|
-
} else {
|
|
7092
|
-
const projectRoot2 = getProjectRoot();
|
|
7093
|
-
const oldDbDir = path.join(projectRoot2, ".store");
|
|
7094
|
-
const oldDbPath = path.join(oldDbDir, "documents.db");
|
|
7095
|
-
const oldDbExists = fs$1.existsSync(oldDbPath);
|
|
7096
|
-
if (oldDbExists) {
|
|
7097
|
-
dbPath = oldDbPath;
|
|
7098
|
-
dbDir = oldDbDir;
|
|
7099
|
-
logger.debug(`💾 Using legacy database path: ${dbPath}`);
|
|
7100
|
-
} else {
|
|
7101
|
-
const standardPaths = envPaths("docs-mcp-server", { suffix: "" });
|
|
7102
|
-
dbDir = standardPaths.data;
|
|
7103
|
-
dbPath = path.join(dbDir, "documents.db");
|
|
7104
|
-
logger.debug(`💾 Using standard database directory: ${dbDir}`);
|
|
7105
|
-
}
|
|
7106
|
-
}
|
|
7107
|
-
try {
|
|
7108
|
-
fs$1.mkdirSync(dbDir, { recursive: true });
|
|
7109
|
-
} catch (error) {
|
|
7110
|
-
logger.error(`⚠️ Failed to create database directory ${dbDir}: ${error}`);
|
|
7111
|
-
}
|
|
7112
|
-
this.store = new DocumentStore(dbPath);
|
|
7113
|
-
this.documentRetriever = new DocumentRetrieverService(this.store);
|
|
7114
|
-
const semanticSplitter = new SemanticMarkdownSplitter(
|
|
7115
|
-
SPLITTER_PREFERRED_CHUNK_SIZE,
|
|
7116
|
-
SPLITTER_MAX_CHUNK_SIZE
|
|
7117
|
-
);
|
|
7118
|
-
const greedySplitter = new GreedySplitter(
|
|
7119
|
-
semanticSplitter,
|
|
7120
|
-
SPLITTER_MIN_CHUNK_SIZE,
|
|
7121
|
-
SPLITTER_PREFERRED_CHUNK_SIZE
|
|
7122
|
-
);
|
|
7123
|
-
this.splitter = greedySplitter;
|
|
7124
|
-
}
|
|
7125
|
-
/**
|
|
7126
|
-
* Initializes the underlying document store.
|
|
7127
|
-
*/
|
|
7128
|
-
async initialize() {
|
|
7129
|
-
await this.store.initialize();
|
|
7130
|
-
}
|
|
7131
|
-
/**
|
|
7132
|
-
* Shuts down the underlying document store.
|
|
7133
|
-
*/
|
|
7134
|
-
async shutdown() {
|
|
7135
|
-
logger.debug("Shutting down store manager");
|
|
7136
|
-
await this.store.shutdown();
|
|
7137
|
-
}
|
|
7138
|
-
// Status tracking methods for pipeline integration
|
|
7139
|
-
/**
|
|
7140
|
-
* Gets versions by their current status.
|
|
7141
|
-
*/
|
|
7142
|
-
async getVersionsByStatus(statuses) {
|
|
7143
|
-
return this.store.getVersionsByStatus(statuses);
|
|
7144
|
-
}
|
|
7145
|
-
/**
|
|
7146
|
-
* Gets all versions currently in RUNNING status.
|
|
7147
|
-
*/
|
|
7148
|
-
async getRunningVersions() {
|
|
7149
|
-
return this.store.getRunningVersions();
|
|
7150
|
-
}
|
|
7151
|
-
/**
|
|
7152
|
-
* Updates the status of a version.
|
|
7153
|
-
*/
|
|
7154
|
-
async updateVersionStatus(versionId, status, errorMessage) {
|
|
7155
|
-
return this.store.updateVersionStatus(versionId, status, errorMessage);
|
|
7156
|
-
}
|
|
7157
|
-
/**
|
|
7158
|
-
* Updates the progress of a version being indexed.
|
|
7159
|
-
*/
|
|
7160
|
-
async updateVersionProgress(versionId, pages, maxPages) {
|
|
7161
|
-
return this.store.updateVersionProgress(versionId, pages, maxPages);
|
|
7162
|
-
}
|
|
7163
|
-
/**
|
|
7164
|
-
* Stores scraper options for a version to enable reproducible indexing.
|
|
7165
|
-
*/
|
|
7166
|
-
async storeScraperOptions(versionId, options) {
|
|
7167
|
-
return this.store.storeScraperOptions(versionId, options);
|
|
7168
|
-
}
|
|
7169
|
-
/**
|
|
7170
|
-
* Retrieves stored scraper options for a version.
|
|
7171
|
-
*/
|
|
7172
|
-
async getVersionScraperOptions(versionId) {
|
|
7173
|
-
return this.store.getVersionScraperOptions(versionId);
|
|
7174
|
-
}
|
|
7175
|
-
/**
|
|
7176
|
-
* Retrieves a version record with all stored options.
|
|
7177
|
-
*/
|
|
7178
|
-
async getVersionWithStoredOptions(versionId) {
|
|
7179
|
-
return this.store.getVersionWithStoredOptions(versionId);
|
|
7180
|
-
}
|
|
7181
|
-
/**
|
|
7182
|
-
* Finds versions that were indexed from the same source URL.
|
|
7183
|
-
*/
|
|
7184
|
-
async findVersionsBySourceUrl(url) {
|
|
7185
|
-
return this.store.findVersionsBySourceUrl(url);
|
|
7186
|
-
}
|
|
7187
|
-
/**
|
|
7188
|
-
* Validates if a library exists in the store (either versioned or unversioned).
|
|
7189
|
-
* Throws LibraryNotFoundError with suggestions if the library is not found.
|
|
7190
|
-
* @param library The name of the library to validate.
|
|
7191
|
-
* @throws {LibraryNotFoundError} If the library does not exist.
|
|
7192
|
-
*/
|
|
7193
|
-
async validateLibraryExists(library) {
|
|
7194
|
-
logger.info(`🔎 Validating existence of library: ${library}`);
|
|
7195
|
-
const normalizedLibrary = library.toLowerCase();
|
|
7196
|
-
const versions = await this.listVersions(normalizedLibrary);
|
|
7197
|
-
const hasUnversioned = await this.exists(normalizedLibrary, "");
|
|
7198
|
-
if (versions.length === 0 && !hasUnversioned) {
|
|
7199
|
-
logger.warn(`⚠️ Library '${library}' not found.`);
|
|
7200
|
-
const allLibraries = await this.listLibraries();
|
|
7201
|
-
const libraryNames = allLibraries.map((lib) => lib.library);
|
|
7202
|
-
let suggestions = [];
|
|
7203
|
-
if (libraryNames.length > 0) {
|
|
7204
|
-
const fuse = new Fuse(libraryNames, {
|
|
7205
|
-
// Configure fuse.js options if needed (e.g., threshold)
|
|
7206
|
-
// isCaseSensitive: false, // Handled by normalizing library names
|
|
7207
|
-
// includeScore: true,
|
|
7208
|
-
threshold: 0.4
|
|
7209
|
-
// Adjust threshold for desired fuzziness (0=exact, 1=match anything)
|
|
7210
|
-
});
|
|
7211
|
-
const results = fuse.search(normalizedLibrary);
|
|
7212
|
-
suggestions = results.slice(0, 3).map((result) => result.item);
|
|
7213
|
-
logger.info(`🔍 Found suggestions: ${suggestions.join(", ")}`);
|
|
7214
|
-
}
|
|
7215
|
-
throw new LibraryNotFoundError(library, suggestions);
|
|
7216
|
-
}
|
|
7217
|
-
logger.info(`✅ Library '${library}' confirmed to exist.`);
|
|
7218
|
-
}
|
|
7219
|
-
/**
|
|
7220
|
-
* Returns a list of all available semantic versions for a library.
|
|
7221
|
-
*/
|
|
7222
|
-
async listVersions(library) {
|
|
7223
|
-
const versions = await this.store.queryUniqueVersions(library);
|
|
7224
|
-
return versions.filter((v) => semver__default.valid(v));
|
|
7225
|
-
}
|
|
7226
|
-
/**
|
|
7227
|
-
* Checks if documents exist for a given library and optional version.
|
|
7228
|
-
* If version is omitted, checks for documents without a specific version.
|
|
7229
|
-
*/
|
|
7230
|
-
async exists(library, version2) {
|
|
7231
|
-
const normalizedVersion = this.normalizeVersion(version2);
|
|
7232
|
-
return this.store.checkDocumentExists(library, normalizedVersion);
|
|
7233
|
-
}
|
|
7234
|
-
/**
|
|
7235
|
-
* Finds the most appropriate version of documentation based on the requested version.
|
|
7236
|
-
* When no target version is specified, returns the latest version.
|
|
7237
|
-
*
|
|
7238
|
-
* Version matching behavior:
|
|
7239
|
-
* - Exact versions (e.g., "18.0.0"): Matches that version or any earlier version
|
|
7240
|
-
* - X-Range patterns (e.g., "5.x", "5.2.x"): Matches within the specified range
|
|
7241
|
-
* - "latest" or no version: Returns the latest available version
|
|
7242
|
-
*
|
|
7243
|
-
* For documentation, we prefer matching older versions over no match at all,
|
|
7244
|
-
* since older docs are often still relevant and useful.
|
|
7245
|
-
* Also checks if unversioned documents exist for the library.
|
|
7246
|
-
*/
|
|
7247
|
-
async findBestVersion(library, targetVersion) {
|
|
7248
|
-
const libraryAndVersion = `${library}${targetVersion ? `@${targetVersion}` : ""}`;
|
|
7249
|
-
logger.info(`🔍 Finding best version for ${libraryAndVersion}`);
|
|
7250
|
-
const hasUnversioned = await this.store.checkDocumentExists(library, "");
|
|
7251
|
-
const versionStrings = await this.listVersions(library);
|
|
7252
|
-
if (versionStrings.length === 0) {
|
|
7253
|
-
if (hasUnversioned) {
|
|
7254
|
-
logger.info(`ℹ️ Unversioned documents exist for ${library}`);
|
|
7255
|
-
return { bestMatch: null, hasUnversioned: true };
|
|
7256
|
-
}
|
|
7257
|
-
logger.warn(`⚠️ No valid versions found for ${library}`);
|
|
7258
|
-
const allLibraryDetails = await this.store.queryLibraryVersions();
|
|
7259
|
-
const libraryDetails = allLibraryDetails.get(library) ?? [];
|
|
7260
|
-
throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
|
|
7261
|
-
}
|
|
7262
|
-
let bestMatch = null;
|
|
7263
|
-
if (!targetVersion || targetVersion === "latest") {
|
|
7264
|
-
bestMatch = semver__default.maxSatisfying(versionStrings, "*");
|
|
7265
|
-
} else {
|
|
7266
|
-
const versionRegex = /^(\d+)(?:\.(?:x(?:\.x)?|\d+(?:\.(?:x|\d+))?))?$|^$/;
|
|
7267
|
-
if (!versionRegex.test(targetVersion)) {
|
|
7268
|
-
logger.warn(`⚠️ Invalid target version format: ${targetVersion}`);
|
|
7269
|
-
} else {
|
|
7270
|
-
let range = targetVersion;
|
|
7271
|
-
if (!semver__default.validRange(targetVersion)) {
|
|
7272
|
-
range = `~${targetVersion}`;
|
|
7273
|
-
} else if (semver__default.valid(targetVersion)) {
|
|
7274
|
-
range = `${range} || <=${targetVersion}`;
|
|
7275
|
-
}
|
|
7276
|
-
bestMatch = semver__default.maxSatisfying(versionStrings, range);
|
|
7277
|
-
}
|
|
7278
|
-
}
|
|
7279
|
-
if (bestMatch) {
|
|
7280
|
-
logger.info(`✅ Found best match version ${bestMatch} for ${libraryAndVersion}`);
|
|
7281
|
-
} else {
|
|
7282
|
-
logger.warn(`⚠️ No matching semver version found for ${libraryAndVersion}`);
|
|
7283
|
-
}
|
|
7284
|
-
if (!bestMatch && !hasUnversioned) {
|
|
7285
|
-
const allLibraryDetails = await this.store.queryLibraryVersions();
|
|
7286
|
-
const libraryDetails = allLibraryDetails.get(library) ?? [];
|
|
7287
|
-
throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
|
|
7288
|
-
}
|
|
7289
|
-
return { bestMatch, hasUnversioned };
|
|
7290
|
-
}
|
|
7291
|
-
/**
|
|
7292
|
-
* Removes all documents for a specific library and optional version.
|
|
7293
|
-
* If version is omitted, removes documents without a specific version.
|
|
7294
|
-
*/
|
|
7295
|
-
async removeAllDocuments(library, version2) {
|
|
7296
|
-
const normalizedVersion = this.normalizeVersion(version2);
|
|
7297
|
-
logger.info(
|
|
7298
|
-
`🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
|
|
7299
|
-
);
|
|
7300
|
-
const count = await this.store.deleteDocuments(library, normalizedVersion);
|
|
7301
|
-
logger.info(`📊 Deleted ${count} documents`);
|
|
7302
|
-
}
|
|
7303
|
-
/**
|
|
7304
|
-
* Adds a document to the store, splitting it into smaller chunks for better search results.
|
|
7305
|
-
* Uses SemanticMarkdownSplitter to maintain markdown structure and content types during splitting.
|
|
7306
|
-
* Preserves hierarchical structure of documents and distinguishes between text and code segments.
|
|
7307
|
-
* If version is omitted, the document is added without a specific version.
|
|
7308
|
-
*/
|
|
7309
|
-
async addDocument(library, version2, document) {
|
|
7310
|
-
const normalizedVersion = this.normalizeVersion(version2);
|
|
7311
|
-
const url = document.metadata.url;
|
|
7312
|
-
if (!url || typeof url !== "string" || !url.trim()) {
|
|
7313
|
-
throw new StoreError("Document metadata must include a valid URL");
|
|
7314
|
-
}
|
|
7315
|
-
logger.info(`📚 Adding document: ${document.metadata.title}`);
|
|
7316
|
-
if (!document.pageContent.trim()) {
|
|
7317
|
-
throw new Error("Document content cannot be empty");
|
|
7318
|
-
}
|
|
7319
|
-
const chunks = await this.splitter.splitText(document.pageContent);
|
|
7320
|
-
const splitDocs = chunks.map((chunk) => ({
|
|
7321
|
-
pageContent: chunk.content,
|
|
7322
|
-
metadata: {
|
|
7323
|
-
...document.metadata,
|
|
7324
|
-
level: chunk.section.level,
|
|
7325
|
-
path: chunk.section.path
|
|
7326
|
-
}
|
|
7327
|
-
}));
|
|
7328
|
-
logger.info(`✂️ Split document into ${splitDocs.length} chunks`);
|
|
7329
|
-
await this.store.addDocuments(library, normalizedVersion, splitDocs);
|
|
7330
|
-
}
|
|
7331
|
-
/**
|
|
7332
|
-
* Searches for documentation content across versions.
|
|
7333
|
-
* Uses hybrid search (vector + FTS).
|
|
7334
|
-
* If version is omitted, searches documents without a specific version.
|
|
7335
|
-
*/
|
|
7336
|
-
async searchStore(library, version2, query, limit = 5) {
|
|
7337
|
-
const normalizedVersion = this.normalizeVersion(version2);
|
|
7338
|
-
return this.documentRetriever.search(library, normalizedVersion, query, limit);
|
|
7339
|
-
}
|
|
7340
|
-
async listLibraries() {
|
|
7341
|
-
const libraryMap = await this.store.queryLibraryVersions();
|
|
7342
|
-
return Array.from(libraryMap.entries()).map(([library, versions]) => ({
|
|
7343
|
-
library,
|
|
7344
|
-
versions
|
|
7345
|
-
// The versions array already contains LibraryVersionDetails
|
|
7346
|
-
}));
|
|
7347
|
-
}
|
|
7348
|
-
/**
|
|
7349
|
-
* Gets all versions in active states (queued, running, updating).
|
|
7350
|
-
*/
|
|
7351
|
-
async getActiveVersions() {
|
|
7352
|
-
return this.store.getActiveVersions();
|
|
7353
|
-
}
|
|
7354
|
-
/**
|
|
7355
|
-
* Ensures a library and version exist in the database and returns the version ID.
|
|
7356
|
-
* Creates the library and version records if they don't exist.
|
|
7357
|
-
*/
|
|
7358
|
-
async ensureLibraryAndVersion(library, version2) {
|
|
7359
|
-
const normalizedLibrary = library.toLowerCase();
|
|
7360
|
-
const normalizedVersion = this.normalizeVersion(version2);
|
|
7361
|
-
const { versionId } = await this.store.resolveLibraryAndVersionIds(
|
|
7362
|
-
normalizedLibrary,
|
|
7363
|
-
normalizedVersion
|
|
7364
|
-
);
|
|
7365
|
-
return versionId;
|
|
7366
|
-
}
|
|
7367
|
-
}
|
|
7368
|
-
function ensurePlaywrightBrowsersInstalled() {
|
|
7369
|
-
const chromiumEnvPath = process.env.PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH;
|
|
7370
|
-
if (chromiumEnvPath && existsSync(chromiumEnvPath)) {
|
|
7371
|
-
logger.debug(
|
|
7372
|
-
`PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH is set to '${chromiumEnvPath}', skipping Playwright browser install.`
|
|
7373
|
-
);
|
|
7374
|
-
return;
|
|
7375
|
-
}
|
|
7376
|
-
try {
|
|
7377
|
-
const chromiumPath = chromium.executablePath();
|
|
7378
|
-
if (!chromiumPath || !existsSync(chromiumPath)) {
|
|
7379
|
-
throw new Error("Playwright Chromium browser not found");
|
|
7380
|
-
}
|
|
7381
|
-
} catch (_err) {
|
|
7382
|
-
logger.debug(
|
|
7383
|
-
"Playwright browsers not found. Installing Chromium browser for dynamic scraping (this may take a minute)..."
|
|
7384
|
-
);
|
|
7385
|
-
try {
|
|
7386
|
-
logger.debug("Installing Playwright Chromium browser...");
|
|
7387
|
-
execSync("npm exec -y playwright install --no-shell --with-deps chromium", {
|
|
7388
|
-
stdio: "ignore",
|
|
7389
|
-
// Suppress output
|
|
7390
|
-
cwd: getProjectRoot()
|
|
7391
|
-
});
|
|
7392
|
-
} catch (_installErr) {
|
|
7393
|
-
console.error(
|
|
7394
|
-
"❌ Failed to install Playwright browsers automatically. Please run:\n npx playwright install --no-shell --with-deps chromium\nand try again."
|
|
7395
|
-
);
|
|
7396
|
-
process.exit(1);
|
|
7397
|
-
}
|
|
7398
|
-
}
|
|
7399
|
-
}
|
|
7400
|
-
function resolveProtocol(protocol) {
|
|
7401
|
-
if (protocol === "auto") {
|
|
7402
|
-
if (!process.stdin.isTTY && !process.stdout.isTTY) {
|
|
7403
|
-
return "stdio";
|
|
7404
|
-
}
|
|
7405
|
-
return "http";
|
|
7406
|
-
}
|
|
7407
|
-
if (protocol === "stdio" || protocol === "http") {
|
|
7408
|
-
return protocol;
|
|
7409
|
-
}
|
|
7410
|
-
throw new Error(`Invalid protocol: ${protocol}. Must be 'auto', 'stdio', or 'http'`);
|
|
7411
|
-
}
|
|
7412
|
-
const formatOutput = (data) => JSON.stringify(data, null, 2);
|
|
7413
|
-
function setupLogging(options, protocol) {
|
|
7414
|
-
if (options.silent) {
|
|
7415
|
-
setLogLevel(LogLevel.ERROR);
|
|
7416
|
-
} else if (options.verbose) {
|
|
7417
|
-
setLogLevel(LogLevel.DEBUG);
|
|
7418
|
-
}
|
|
7419
|
-
if (protocol === "stdio") {
|
|
7420
|
-
setLogLevel(LogLevel.ERROR);
|
|
7421
|
-
}
|
|
7422
|
-
}
|
|
7423
|
-
function validatePort(portString) {
|
|
7424
|
-
const port = Number.parseInt(portString, 10);
|
|
7425
|
-
if (Number.isNaN(port) || port < 1 || port > 65535) {
|
|
7426
|
-
throw new Error("❌ Invalid port number");
|
|
7427
|
-
}
|
|
7428
|
-
return port;
|
|
7429
|
-
}
|
|
7430
|
-
async function initializeDocumentService() {
|
|
7431
|
-
const docService = new DocumentManagementService();
|
|
7432
|
-
await docService.initialize();
|
|
7433
|
-
return docService;
|
|
7434
|
-
}
|
|
7435
|
-
async function initializePipeline(docService, options = {}) {
|
|
7436
|
-
logger.debug(`Initializing PipelineManager with options: ${JSON.stringify(options)}`);
|
|
7437
|
-
const manager = await PipelineFactory.createPipeline(docService, options);
|
|
7438
|
-
manager.setCallbacks({
|
|
7439
|
-
onJobProgress: async (job, progress) => {
|
|
7440
|
-
logger.debug(
|
|
7441
|
-
`📊 Job ${job.id} progress: ${progress.pagesScraped}/${progress.totalPages} pages`
|
|
7442
|
-
);
|
|
7443
|
-
},
|
|
7444
|
-
onJobStatusChange: async (job) => {
|
|
7445
|
-
logger.debug(`🔄 Job ${job.id} status changed to: ${job.status}`);
|
|
7446
|
-
},
|
|
7447
|
-
onJobError: async (job, error, document) => {
|
|
7448
|
-
logger.warn(
|
|
7449
|
-
`⚠️ Job ${job.id} error ${document ? `on document ${document.metadata.url}` : ""}: ${error.message}`
|
|
7450
|
-
);
|
|
7451
|
-
}
|
|
7452
|
-
});
|
|
7453
|
-
return manager;
|
|
7454
|
-
}
|
|
7455
|
-
function createAppServerConfig(options) {
|
|
7456
|
-
return {
|
|
7457
|
-
enableWebInterface: options.enableWebInterface ?? false,
|
|
7458
|
-
enableMcpServer: options.enableMcpServer ?? true,
|
|
7459
|
-
enablePipelineApi: options.enablePipelineApi ?? false,
|
|
7460
|
-
enableWorker: options.enableWorker ?? true,
|
|
7461
|
-
port: options.port,
|
|
7462
|
-
externalWorkerUrl: options.externalWorkerUrl
|
|
7463
|
-
};
|
|
7464
|
-
}
|
|
7465
|
-
function parseHeaders(headerOptions) {
|
|
7466
|
-
const headers = {};
|
|
7467
|
-
if (Array.isArray(headerOptions)) {
|
|
7468
|
-
for (const entry of headerOptions) {
|
|
7469
|
-
const idx = entry.indexOf(":");
|
|
7470
|
-
if (idx > 0) {
|
|
7471
|
-
const name = entry.slice(0, idx).trim();
|
|
7472
|
-
const value = entry.slice(idx + 1).trim();
|
|
7473
|
-
if (name) headers[name] = value;
|
|
7474
|
-
}
|
|
7475
|
-
}
|
|
7476
|
-
}
|
|
7477
|
-
return headers;
|
|
5761
|
+
return headers;
|
|
7478
5762
|
}
|
|
7479
5763
|
const CLI_DEFAULTS = {
|
|
7480
5764
|
PROTOCOL: DEFAULT_PROTOCOL,
|
|
@@ -7483,11 +5767,17 @@ const CLI_DEFAULTS = {
|
|
|
7483
5767
|
MAX_CONCURRENCY: DEFAULT_MAX_CONCURRENCY
|
|
7484
5768
|
};
|
|
7485
5769
|
function createDefaultAction(program) {
|
|
7486
|
-
return program.
|
|
7487
|
-
"--protocol <
|
|
7488
|
-
|
|
7489
|
-
"
|
|
7490
|
-
|
|
5770
|
+
return program.addOption(
|
|
5771
|
+
new Option("--protocol <protocol>", "Protocol for MCP server").choices(["auto", "stdio", "http"]).default("auto")
|
|
5772
|
+
).addOption(
|
|
5773
|
+
new Option("--port <number>", "Port for the server").argParser((v) => {
|
|
5774
|
+
const n = Number(v);
|
|
5775
|
+
if (!Number.isInteger(n) || n < 1 || n > 65535) {
|
|
5776
|
+
throw new Error("Port must be an integer between 1 and 65535");
|
|
5777
|
+
}
|
|
5778
|
+
return String(n);
|
|
5779
|
+
}).default(CLI_DEFAULTS.HTTP_PORT.toString())
|
|
5780
|
+
).option("--resume", "Resume interrupted jobs on startup", false).option("--no-resume", "Do not resume jobs on startup").action(
|
|
7491
5781
|
async (options, command) => {
|
|
7492
5782
|
const globalOptions = command.opts();
|
|
7493
5783
|
const resolvedProtocol = resolveProtocol(options.protocol);
|
|
@@ -7495,13 +5785,13 @@ function createDefaultAction(program) {
|
|
|
7495
5785
|
logger.debug("No subcommand specified, starting unified server by default...");
|
|
7496
5786
|
const port = validatePort(options.port);
|
|
7497
5787
|
ensurePlaywrightBrowsersInstalled();
|
|
7498
|
-
const docService = await
|
|
5788
|
+
const docService = await createLocalDocumentManagement();
|
|
7499
5789
|
const pipelineOptions = {
|
|
7500
5790
|
recoverJobs: options.resume || false,
|
|
7501
5791
|
// Use --resume flag for job recovery
|
|
7502
5792
|
concurrency: 3
|
|
7503
5793
|
};
|
|
7504
|
-
const pipeline = await
|
|
5794
|
+
const pipeline = await createPipelineWithCallbacks(docService, pipelineOptions);
|
|
7505
5795
|
if (resolvedProtocol === "stdio") {
|
|
7506
5796
|
logger.debug(`🔍 Auto-detected stdio protocol (no TTY)`);
|
|
7507
5797
|
await pipeline.start();
|
|
@@ -7516,8 +5806,8 @@ function createDefaultAction(program) {
|
|
|
7516
5806
|
// Enable web interface in http mode
|
|
7517
5807
|
enableMcpServer: true,
|
|
7518
5808
|
// Always enable MCP server
|
|
7519
|
-
|
|
7520
|
-
// Enable
|
|
5809
|
+
enableApiServer: true,
|
|
5810
|
+
// Enable API (tRPC) in http mode
|
|
7521
5811
|
enableWorker: true,
|
|
7522
5812
|
// Always enable in-process worker for unified server
|
|
7523
5813
|
port
|
|
@@ -7529,6 +5819,19 @@ function createDefaultAction(program) {
|
|
|
7529
5819
|
}
|
|
7530
5820
|
);
|
|
7531
5821
|
}
|
|
5822
|
+
async function fetchUrlAction(url, options, command) {
|
|
5823
|
+
const globalOptions = command.parent?.opts() || {};
|
|
5824
|
+
setupLogging(globalOptions);
|
|
5825
|
+
const headers = parseHeaders(options.header);
|
|
5826
|
+
const fetchUrlTool = new FetchUrlTool(new HttpFetcher(), new FileFetcher());
|
|
5827
|
+
const content = await fetchUrlTool.execute({
|
|
5828
|
+
url,
|
|
5829
|
+
followRedirects: options.followRedirects,
|
|
5830
|
+
scrapeMode: options.scrapeMode,
|
|
5831
|
+
headers: Object.keys(headers).length > 0 ? headers : void 0
|
|
5832
|
+
});
|
|
5833
|
+
console.log(content);
|
|
5834
|
+
}
|
|
7532
5835
|
function createFetchUrlCommand(program) {
|
|
7533
5836
|
return program.command("fetch-url <url>").description("Fetch a URL and convert its content to Markdown").option(
|
|
7534
5837
|
"--no-follow-redirects",
|
|
@@ -7552,66 +5855,64 @@ function createFetchUrlCommand(program) {
|
|
|
7552
5855
|
"Custom HTTP header to send with the request (can be specified multiple times)",
|
|
7553
5856
|
(val, prev = []) => prev.concat([val]),
|
|
7554
5857
|
[]
|
|
7555
|
-
).action(
|
|
7556
|
-
|
|
7557
|
-
|
|
7558
|
-
|
|
7559
|
-
|
|
7560
|
-
|
|
7561
|
-
|
|
7562
|
-
|
|
7563
|
-
|
|
7564
|
-
|
|
7565
|
-
|
|
7566
|
-
|
|
7567
|
-
|
|
7568
|
-
|
|
7569
|
-
|
|
5858
|
+
).action(fetchUrlAction);
|
|
5859
|
+
}
|
|
5860
|
+
async function findVersionAction(library, options, command) {
|
|
5861
|
+
const globalOptions = command.parent?.opts() || {};
|
|
5862
|
+
setupLogging(globalOptions);
|
|
5863
|
+
const serverUrl = options.serverUrl;
|
|
5864
|
+
const docService = await createDocumentManagement({ serverUrl });
|
|
5865
|
+
try {
|
|
5866
|
+
const findVersionTool = new FindVersionTool(docService);
|
|
5867
|
+
const versionInfo = await findVersionTool.execute({
|
|
5868
|
+
library,
|
|
5869
|
+
targetVersion: options.version
|
|
5870
|
+
});
|
|
5871
|
+
if (!versionInfo) throw new Error("Failed to get version information");
|
|
5872
|
+
console.log(versionInfo);
|
|
5873
|
+
} finally {
|
|
5874
|
+
await docService.shutdown();
|
|
5875
|
+
}
|
|
7570
5876
|
}
|
|
7571
5877
|
function createFindVersionCommand(program) {
|
|
7572
|
-
return program.command("find-version <library>").description("Find the best matching version for a library").option("-v, --version <string>", "Pattern to match (optional, supports ranges)").
|
|
7573
|
-
|
|
7574
|
-
|
|
7575
|
-
|
|
7576
|
-
|
|
7577
|
-
|
|
7578
|
-
|
|
7579
|
-
|
|
7580
|
-
|
|
7581
|
-
|
|
7582
|
-
|
|
7583
|
-
|
|
7584
|
-
|
|
7585
|
-
|
|
7586
|
-
|
|
7587
|
-
|
|
5878
|
+
return program.command("find-version <library>").description("Find the best matching version for a library").option("-v, --version <string>", "Pattern to match (optional, supports ranges)").option(
|
|
5879
|
+
"--server-url <url>",
|
|
5880
|
+
"URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
|
|
5881
|
+
).action(findVersionAction);
|
|
5882
|
+
}
|
|
5883
|
+
async function listAction(options, command) {
|
|
5884
|
+
const globalOptions = command.parent?.opts() || {};
|
|
5885
|
+
setupLogging(globalOptions);
|
|
5886
|
+
const { serverUrl } = options;
|
|
5887
|
+
const docService = await createDocumentManagement({ serverUrl });
|
|
5888
|
+
try {
|
|
5889
|
+
const listLibrariesTool = new ListLibrariesTool(docService);
|
|
5890
|
+
const result = await listLibrariesTool.execute();
|
|
5891
|
+
console.log(formatOutput(result.libraries));
|
|
5892
|
+
} finally {
|
|
5893
|
+
await docService.shutdown();
|
|
5894
|
+
}
|
|
7588
5895
|
}
|
|
7589
5896
|
function createListCommand(program) {
|
|
7590
|
-
return program.command("list").description("List all available libraries and their versions").
|
|
7591
|
-
|
|
7592
|
-
|
|
7593
|
-
|
|
7594
|
-
try {
|
|
7595
|
-
const listLibrariesTool = new ListLibrariesTool(docService);
|
|
7596
|
-
const result = await listLibrariesTool.execute();
|
|
7597
|
-
console.log(formatOutput(result.libraries));
|
|
7598
|
-
} finally {
|
|
7599
|
-
await docService.shutdown();
|
|
7600
|
-
}
|
|
7601
|
-
});
|
|
5897
|
+
return program.command("list").description("List all available libraries and their versions").option(
|
|
5898
|
+
"--server-url <url>",
|
|
5899
|
+
"URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
|
|
5900
|
+
).action(listAction);
|
|
7602
5901
|
}
|
|
7603
5902
|
function createMcpCommand(program) {
|
|
7604
|
-
return program.command("mcp").description("Start MCP server only").
|
|
7605
|
-
"--protocol <
|
|
7606
|
-
|
|
7607
|
-
|
|
7608
|
-
|
|
7609
|
-
|
|
7610
|
-
|
|
7611
|
-
|
|
5903
|
+
return program.command("mcp").description("Start MCP server only").addOption(
|
|
5904
|
+
new Option("--protocol <protocol>", "Protocol for MCP server").choices(["auto", "stdio", "http"]).default(CLI_DEFAULTS.PROTOCOL)
|
|
5905
|
+
).addOption(
|
|
5906
|
+
new Option("--port <number>", "Port for the MCP server").argParser((v) => {
|
|
5907
|
+
const n = Number(v);
|
|
5908
|
+
if (!Number.isInteger(n) || n < 1 || n > 65535) {
|
|
5909
|
+
throw new Error("Port must be an integer between 1 and 65535");
|
|
5910
|
+
}
|
|
5911
|
+
return String(n);
|
|
5912
|
+
}).default(CLI_DEFAULTS.HTTP_PORT.toString())
|
|
7612
5913
|
).option(
|
|
7613
5914
|
"--server-url <url>",
|
|
7614
|
-
"URL of external pipeline worker
|
|
5915
|
+
"URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
|
|
7615
5916
|
).action(
|
|
7616
5917
|
async (cmdOptions, command) => {
|
|
7617
5918
|
const globalOptions = command.parent?.opts() || {};
|
|
@@ -7620,14 +5921,19 @@ function createMcpCommand(program) {
|
|
|
7620
5921
|
const resolvedProtocol = resolveProtocol(cmdOptions.protocol);
|
|
7621
5922
|
setupLogging(globalOptions, resolvedProtocol);
|
|
7622
5923
|
try {
|
|
7623
|
-
const docService = await
|
|
5924
|
+
const docService = await createDocumentManagement({
|
|
5925
|
+
serverUrl
|
|
5926
|
+
});
|
|
7624
5927
|
const pipelineOptions = {
|
|
7625
5928
|
recoverJobs: false,
|
|
7626
5929
|
// MCP command doesn't support job recovery
|
|
7627
5930
|
serverUrl,
|
|
7628
5931
|
concurrency: 3
|
|
7629
5932
|
};
|
|
7630
|
-
const pipeline = await
|
|
5933
|
+
const pipeline = await createPipelineWithCallbacks(
|
|
5934
|
+
serverUrl ? void 0 : docService,
|
|
5935
|
+
pipelineOptions
|
|
5936
|
+
);
|
|
7631
5937
|
if (resolvedProtocol === "stdio") {
|
|
7632
5938
|
logger.debug(`🔍 Auto-detected stdio protocol (no TTY)`);
|
|
7633
5939
|
logger.info("🚀 Starting MCP server (stdio mode)");
|
|
@@ -7643,8 +5949,8 @@ function createMcpCommand(program) {
|
|
|
7643
5949
|
enableWebInterface: false,
|
|
7644
5950
|
// Never enable web interface in mcp command
|
|
7645
5951
|
enableMcpServer: true,
|
|
7646
|
-
|
|
7647
|
-
// Never enable
|
|
5952
|
+
enableApiServer: false,
|
|
5953
|
+
// Never enable API in mcp command
|
|
7648
5954
|
enableWorker: !serverUrl,
|
|
7649
5955
|
port,
|
|
7650
5956
|
externalWorkerUrl: serverUrl
|
|
@@ -7660,30 +5966,81 @@ function createMcpCommand(program) {
|
|
|
7660
5966
|
}
|
|
7661
5967
|
);
|
|
7662
5968
|
}
|
|
5969
|
+
async function removeAction(library, options, command) {
|
|
5970
|
+
const globalOptions = command.parent?.opts() || {};
|
|
5971
|
+
setupLogging(globalOptions);
|
|
5972
|
+
const serverUrl = options.serverUrl;
|
|
5973
|
+
const docService = await createDocumentManagement({ serverUrl });
|
|
5974
|
+
const { version: version2 } = options;
|
|
5975
|
+
try {
|
|
5976
|
+
await docService.removeAllDocuments(library, version2);
|
|
5977
|
+
console.log(
|
|
5978
|
+
`✅ Successfully removed documents for ${library}${version2 ? `@${version2}` : " (unversioned)"}.`
|
|
5979
|
+
);
|
|
5980
|
+
} catch (error) {
|
|
5981
|
+
console.error(
|
|
5982
|
+
`❌ Failed to remove documents for ${library}${version2 ? `@${version2}` : " (unversioned)"}:`,
|
|
5983
|
+
error instanceof Error ? error.message : String(error)
|
|
5984
|
+
);
|
|
5985
|
+
throw error;
|
|
5986
|
+
} finally {
|
|
5987
|
+
await docService.shutdown();
|
|
5988
|
+
}
|
|
5989
|
+
}
|
|
7663
5990
|
function createRemoveCommand(program) {
|
|
7664
5991
|
return program.command("remove <library>").description("Remove documents for a specific library and version").option(
|
|
7665
5992
|
"-v, --version <string>",
|
|
7666
5993
|
"Version to remove (optional, removes unversioned if omitted)"
|
|
7667
|
-
).
|
|
7668
|
-
|
|
7669
|
-
|
|
7670
|
-
|
|
7671
|
-
|
|
7672
|
-
|
|
7673
|
-
|
|
7674
|
-
|
|
7675
|
-
|
|
7676
|
-
|
|
7677
|
-
|
|
7678
|
-
|
|
7679
|
-
|
|
7680
|
-
|
|
7681
|
-
|
|
7682
|
-
|
|
7683
|
-
}
|
|
7684
|
-
|
|
5994
|
+
).option(
|
|
5995
|
+
"--server-url <url>",
|
|
5996
|
+
"URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
|
|
5997
|
+
).action(removeAction);
|
|
5998
|
+
}
|
|
5999
|
+
async function scrapeAction(library, url, options, command) {
|
|
6000
|
+
const globalOptions = command.parent?.opts() || {};
|
|
6001
|
+
setupLogging(globalOptions);
|
|
6002
|
+
const serverUrl = options.serverUrl;
|
|
6003
|
+
const docService = await createDocumentManagement({ serverUrl });
|
|
6004
|
+
let pipeline = null;
|
|
6005
|
+
try {
|
|
6006
|
+
const pipelineOptions = {
|
|
6007
|
+
recoverJobs: false,
|
|
6008
|
+
concurrency: 1,
|
|
6009
|
+
serverUrl
|
|
6010
|
+
};
|
|
6011
|
+
pipeline = await createPipelineWithCallbacks(
|
|
6012
|
+
serverUrl ? void 0 : docService,
|
|
6013
|
+
pipelineOptions
|
|
6014
|
+
);
|
|
6015
|
+
await pipeline.start();
|
|
6016
|
+
const scrapeTool = new ScrapeTool(pipeline);
|
|
6017
|
+
const headers = parseHeaders(options.header);
|
|
6018
|
+
const result = await scrapeTool.execute({
|
|
6019
|
+
url,
|
|
6020
|
+
library,
|
|
6021
|
+
version: options.version,
|
|
6022
|
+
options: {
|
|
6023
|
+
maxPages: Number.parseInt(options.maxPages),
|
|
6024
|
+
maxDepth: Number.parseInt(options.maxDepth),
|
|
6025
|
+
maxConcurrency: Number.parseInt(options.maxConcurrency),
|
|
6026
|
+
ignoreErrors: options.ignoreErrors,
|
|
6027
|
+
scope: options.scope,
|
|
6028
|
+
followRedirects: options.followRedirects,
|
|
6029
|
+
scrapeMode: options.scrapeMode,
|
|
6030
|
+
includePatterns: Array.isArray(options.includePattern) && options.includePattern.length > 0 ? options.includePattern : void 0,
|
|
6031
|
+
excludePatterns: Array.isArray(options.excludePattern) && options.excludePattern.length > 0 ? options.excludePattern : void 0,
|
|
6032
|
+
headers: Object.keys(headers).length > 0 ? headers : void 0
|
|
6033
|
+
}
|
|
6034
|
+
});
|
|
6035
|
+
if ("pagesScraped" in result) {
|
|
6036
|
+
console.log(`✅ Successfully scraped ${result.pagesScraped} pages`);
|
|
6037
|
+
} else {
|
|
6038
|
+
console.log(`🚀 Scraping job started with ID: ${result.jobId}`);
|
|
7685
6039
|
}
|
|
7686
|
-
}
|
|
6040
|
+
} finally {
|
|
6041
|
+
if (pipeline) await pipeline.stop();
|
|
6042
|
+
await docService.shutdown();
|
|
6043
|
+
}
|
|
7687
6044
|
}
|
|
7688
6045
|
function createScrapeCommand(program) {
|
|
7689
6046
|
return program.command("scrape <library> <url>").description(
|
|
@@ -7746,55 +6103,27 @@ function createScrapeCommand(program) {
|
|
|
7746
6103
|
[]
|
|
7747
6104
|
).option(
|
|
7748
6105
|
"--server-url <url>",
|
|
7749
|
-
"URL of external pipeline worker
|
|
7750
|
-
).action(
|
|
7751
|
-
|
|
7752
|
-
|
|
7753
|
-
|
|
7754
|
-
|
|
7755
|
-
|
|
7756
|
-
|
|
7757
|
-
|
|
7758
|
-
|
|
7759
|
-
|
|
7760
|
-
|
|
7761
|
-
|
|
7762
|
-
|
|
7763
|
-
|
|
7764
|
-
|
|
7765
|
-
|
|
7766
|
-
|
|
7767
|
-
|
|
7768
|
-
|
|
7769
|
-
|
|
7770
|
-
const result = await scrapeTool.execute({
|
|
7771
|
-
url,
|
|
7772
|
-
library,
|
|
7773
|
-
version: options.version,
|
|
7774
|
-
options: {
|
|
7775
|
-
maxPages: Number.parseInt(options.maxPages),
|
|
7776
|
-
maxDepth: Number.parseInt(options.maxDepth),
|
|
7777
|
-
maxConcurrency: Number.parseInt(options.maxConcurrency),
|
|
7778
|
-
ignoreErrors: options.ignoreErrors,
|
|
7779
|
-
scope: options.scope,
|
|
7780
|
-
followRedirects: options.followRedirects,
|
|
7781
|
-
scrapeMode: options.scrapeMode,
|
|
7782
|
-
includePatterns: Array.isArray(options.includePattern) && options.includePattern.length > 0 ? options.includePattern : void 0,
|
|
7783
|
-
excludePatterns: Array.isArray(options.excludePattern) && options.excludePattern.length > 0 ? options.excludePattern : void 0,
|
|
7784
|
-
headers: Object.keys(headers).length > 0 ? headers : void 0
|
|
7785
|
-
}
|
|
7786
|
-
});
|
|
7787
|
-
if ("pagesScraped" in result) {
|
|
7788
|
-
console.log(`✅ Successfully scraped ${result.pagesScraped} pages`);
|
|
7789
|
-
} else {
|
|
7790
|
-
console.log(`🚀 Scraping job started with ID: ${result.jobId}`);
|
|
7791
|
-
}
|
|
7792
|
-
} finally {
|
|
7793
|
-
if (pipeline) await pipeline.stop();
|
|
7794
|
-
await docService.shutdown();
|
|
7795
|
-
}
|
|
7796
|
-
}
|
|
7797
|
-
);
|
|
6106
|
+
"URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
|
|
6107
|
+
).action(scrapeAction);
|
|
6108
|
+
}
|
|
6109
|
+
async function searchAction(library, query, options, command) {
|
|
6110
|
+
const globalOptions = command.parent?.opts() || {};
|
|
6111
|
+
setupLogging(globalOptions);
|
|
6112
|
+
const serverUrl = options.serverUrl;
|
|
6113
|
+
const docService = await createDocumentManagement({ serverUrl });
|
|
6114
|
+
try {
|
|
6115
|
+
const searchTool = new SearchTool(docService);
|
|
6116
|
+
const result = await searchTool.execute({
|
|
6117
|
+
library,
|
|
6118
|
+
version: options.version,
|
|
6119
|
+
query,
|
|
6120
|
+
limit: Number.parseInt(options.limit),
|
|
6121
|
+
exactMatch: options.exactMatch
|
|
6122
|
+
});
|
|
6123
|
+
console.log(formatOutput(result.results));
|
|
6124
|
+
} finally {
|
|
6125
|
+
await docService.shutdown();
|
|
6126
|
+
}
|
|
7798
6127
|
}
|
|
7799
6128
|
function createSearchCommand(program) {
|
|
7800
6129
|
return program.command("search <library> <query>").description(
|
|
@@ -7802,35 +6131,23 @@ function createSearchCommand(program) {
|
|
|
7802
6131
|
).option(
|
|
7803
6132
|
"-v, --version <string>",
|
|
7804
6133
|
"Version of the library (optional, supports ranges)"
|
|
7805
|
-
).option("-l, --limit <number>", "Maximum number of results", "5").option("-e, --exact-match", "Only use exact version match (default: false)", false).
|
|
7806
|
-
|
|
7807
|
-
|
|
7808
|
-
|
|
7809
|
-
const docService = await initializeDocumentService();
|
|
7810
|
-
try {
|
|
7811
|
-
const searchTool = new SearchTool(docService);
|
|
7812
|
-
const result = await searchTool.execute({
|
|
7813
|
-
library,
|
|
7814
|
-
version: options.version,
|
|
7815
|
-
query,
|
|
7816
|
-
limit: Number.parseInt(options.limit),
|
|
7817
|
-
exactMatch: options.exactMatch
|
|
7818
|
-
});
|
|
7819
|
-
console.log(formatOutput(result.results));
|
|
7820
|
-
} finally {
|
|
7821
|
-
await docService.shutdown();
|
|
7822
|
-
}
|
|
7823
|
-
}
|
|
7824
|
-
);
|
|
6134
|
+
).option("-l, --limit <number>", "Maximum number of results", "5").option("-e, --exact-match", "Only use exact version match (default: false)", false).option(
|
|
6135
|
+
"--server-url <url>",
|
|
6136
|
+
"URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
|
|
6137
|
+
).action(searchAction);
|
|
7825
6138
|
}
|
|
7826
6139
|
function createWebCommand(program) {
|
|
7827
|
-
return program.command("web").description("Start web interface only").
|
|
7828
|
-
"--port <number>",
|
|
7829
|
-
|
|
7830
|
-
|
|
6140
|
+
return program.command("web").description("Start web interface only").addOption(
|
|
6141
|
+
new Option("--port <number>", "Port for the web interface").argParser((v) => {
|
|
6142
|
+
const n = Number(v);
|
|
6143
|
+
if (!Number.isInteger(n) || n < 1 || n > 65535) {
|
|
6144
|
+
throw new Error("Port must be an integer between 1 and 65535");
|
|
6145
|
+
}
|
|
6146
|
+
return String(n);
|
|
6147
|
+
}).default(CLI_DEFAULTS.WEB_PORT.toString())
|
|
7831
6148
|
).option(
|
|
7832
6149
|
"--server-url <url>",
|
|
7833
|
-
"URL of external pipeline worker
|
|
6150
|
+
"URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
|
|
7834
6151
|
).action(
|
|
7835
6152
|
async (cmdOptions, command) => {
|
|
7836
6153
|
const globalOptions = command.parent?.opts() || {};
|
|
@@ -7838,18 +6155,23 @@ function createWebCommand(program) {
|
|
|
7838
6155
|
const serverUrl = cmdOptions.serverUrl;
|
|
7839
6156
|
setupLogging(globalOptions);
|
|
7840
6157
|
try {
|
|
7841
|
-
const docService = await
|
|
6158
|
+
const docService = await createDocumentManagement({
|
|
6159
|
+
serverUrl
|
|
6160
|
+
});
|
|
7842
6161
|
const pipelineOptions = {
|
|
7843
6162
|
recoverJobs: false,
|
|
7844
6163
|
// Web command doesn't support job recovery
|
|
7845
6164
|
serverUrl,
|
|
7846
6165
|
concurrency: 3
|
|
7847
6166
|
};
|
|
7848
|
-
const pipeline = await
|
|
6167
|
+
const pipeline = await createPipelineWithCallbacks(
|
|
6168
|
+
serverUrl ? void 0 : docService,
|
|
6169
|
+
pipelineOptions
|
|
6170
|
+
);
|
|
7849
6171
|
const config = createAppServerConfig({
|
|
7850
6172
|
enableWebInterface: true,
|
|
7851
6173
|
enableMcpServer: false,
|
|
7852
|
-
|
|
6174
|
+
enableApiServer: false,
|
|
7853
6175
|
enableWorker: !serverUrl,
|
|
7854
6176
|
port,
|
|
7855
6177
|
externalWorkerUrl: serverUrl
|
|
@@ -7868,28 +6190,35 @@ function createWebCommand(program) {
|
|
|
7868
6190
|
);
|
|
7869
6191
|
}
|
|
7870
6192
|
function createWorkerCommand(program) {
|
|
7871
|
-
return program.command("worker").description("Start external pipeline worker (HTTP API)").
|
|
6193
|
+
return program.command("worker").description("Start external pipeline worker (HTTP API)").addOption(
|
|
6194
|
+
new Option("--port <number>", "Port for worker API").argParser((v) => {
|
|
6195
|
+
const n = Number(v);
|
|
6196
|
+
if (!Number.isInteger(n) || n < 1 || n > 65535) {
|
|
6197
|
+
throw new Error("Port must be an integer between 1 and 65535");
|
|
6198
|
+
}
|
|
6199
|
+
return String(n);
|
|
6200
|
+
}).default("8080")
|
|
6201
|
+
).option("--resume", "Resume interrupted jobs on startup", true).option("--no-resume", "Do not resume jobs on startup").action(async (cmdOptions, command) => {
|
|
7872
6202
|
const globalOptions = command.parent?.opts() || {};
|
|
7873
6203
|
const port = validatePort(cmdOptions.port);
|
|
7874
6204
|
setupLogging(globalOptions);
|
|
7875
6205
|
try {
|
|
7876
6206
|
logger.info(`🚀 Starting external pipeline worker on port ${port}`);
|
|
7877
6207
|
ensurePlaywrightBrowsersInstalled();
|
|
7878
|
-
const docService = await
|
|
6208
|
+
const docService = await createLocalDocumentManagement();
|
|
7879
6209
|
const pipelineOptions = {
|
|
7880
6210
|
recoverJobs: cmdOptions.resume,
|
|
7881
6211
|
// Use the resume option
|
|
7882
6212
|
concurrency: CLI_DEFAULTS.MAX_CONCURRENCY
|
|
7883
6213
|
};
|
|
7884
|
-
const pipeline = await
|
|
6214
|
+
const pipeline = await createPipelineWithCallbacks(docService, pipelineOptions);
|
|
7885
6215
|
const config = createAppServerConfig({
|
|
7886
6216
|
enableWebInterface: false,
|
|
7887
6217
|
enableMcpServer: false,
|
|
7888
|
-
|
|
6218
|
+
enableApiServer: true,
|
|
7889
6219
|
enableWorker: true,
|
|
7890
6220
|
port
|
|
7891
6221
|
});
|
|
7892
|
-
logger.info(`🚀 Starting external pipeline worker with HTTP API`);
|
|
7893
6222
|
await startAppServer(docService, pipeline, config);
|
|
7894
6223
|
await new Promise(() => {
|
|
7895
6224
|
});
|
|
@@ -7901,7 +6230,9 @@ function createWorkerCommand(program) {
|
|
|
7901
6230
|
}
|
|
7902
6231
|
function createCliProgram() {
|
|
7903
6232
|
const program = new Command();
|
|
7904
|
-
program.name("docs-mcp-server").description("Unified CLI, MCP Server, and Web Interface for Docs MCP Server.").version(packageJson.version).
|
|
6233
|
+
program.name("docs-mcp-server").description("Unified CLI, MCP Server, and Web Interface for Docs MCP Server.").version(packageJson.version).addOption(
|
|
6234
|
+
new Option("--verbose", "Enable verbose (debug) logging").conflicts("silent")
|
|
6235
|
+
).addOption(new Option("--silent", "Disable all logging except errors")).enablePositionalOptions().allowExcessArguments(false).showHelpAfterError(true);
|
|
7905
6236
|
program.hook("preAction", (thisCommand, _actionCommand) => {
|
|
7906
6237
|
const globalOptions = thisCommand.opts();
|
|
7907
6238
|
if (globalOptions.silent) setLogLevel(LogLevel.ERROR);
|
|
@@ -8023,7 +6354,23 @@ runCli().catch((error) => {
|
|
|
8023
6354
|
process.exit(1);
|
|
8024
6355
|
});
|
|
8025
6356
|
export {
|
|
6357
|
+
ConnectionError as C,
|
|
8026
6358
|
DimensionError as D,
|
|
8027
|
-
|
|
6359
|
+
EMBEDDING_BATCH_CHARS as E,
|
|
6360
|
+
LibraryNotFoundError as L,
|
|
6361
|
+
StoreError as S,
|
|
6362
|
+
VECTOR_DIMENSION as V,
|
|
6363
|
+
applyMigrations as a,
|
|
6364
|
+
EMBEDDING_BATCH_SIZE as b,
|
|
6365
|
+
createJSDOM as c,
|
|
6366
|
+
denormalizeVersionName as d,
|
|
6367
|
+
SPLITTER_PREFERRED_CHUNK_SIZE as e,
|
|
6368
|
+
SPLITTER_MAX_CHUNK_SIZE as f,
|
|
6369
|
+
getProjectRoot as g,
|
|
6370
|
+
VersionNotFoundError as h,
|
|
6371
|
+
SPLITTER_MIN_CHUNK_SIZE as i,
|
|
6372
|
+
logger as l,
|
|
6373
|
+
mapDbDocumentToDocument as m,
|
|
6374
|
+
normalizeVersionName as n
|
|
8028
6375
|
};
|
|
8029
6376
|
//# sourceMappingURL=index.js.map
|