@arabold/docs-mcp-server 1.36.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +48 -509
- package/dist/index.js +797 -166
- package/dist/index.js.map +1 -1
- package/package.json +3 -3
package/dist/index.js
CHANGED
|
@@ -64,11 +64,12 @@ import { escapeHtml } from "@kitajs/html";
|
|
|
64
64
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
65
65
|
import { v4 } from "uuid";
|
|
66
66
|
import { minimatch } from "minimatch";
|
|
67
|
+
import { exec, execSync } from "node:child_process";
|
|
68
|
+
import { promisify } from "node:util";
|
|
67
69
|
import { Readable } from "node:stream";
|
|
68
70
|
import * as tar from "tar";
|
|
69
71
|
import yauzl from "yauzl";
|
|
70
72
|
import os from "node:os";
|
|
71
|
-
import { execSync } from "node:child_process";
|
|
72
73
|
class StoreError extends Error {
|
|
73
74
|
constructor(message, cause) {
|
|
74
75
|
super(cause ? `${message} caused by ${cause}` : message);
|
|
@@ -919,6 +920,10 @@ const DEFAULT_CONFIG = {
|
|
|
919
920
|
baseDelayMs: 1e3,
|
|
920
921
|
maxCacheItems: 200,
|
|
921
922
|
maxCacheItemSizeBytes: 500 * 1024
|
|
923
|
+
},
|
|
924
|
+
document: {
|
|
925
|
+
maxSize: 10 * 1024 * 1024
|
|
926
|
+
// 10MB max size for PDF/Office documents
|
|
922
927
|
}
|
|
923
928
|
},
|
|
924
929
|
splitter: {
|
|
@@ -957,10 +962,6 @@ const DEFAULT_CONFIG = {
|
|
|
957
962
|
precedingSiblingsLimit: 1,
|
|
958
963
|
subsequentSiblingsLimit: 2,
|
|
959
964
|
maxChunkDistance: 3
|
|
960
|
-
},
|
|
961
|
-
document: {
|
|
962
|
-
maxSize: 10 * 1024 * 1024
|
|
963
|
-
// 10MB max size for PDF/Office documents
|
|
964
965
|
}
|
|
965
966
|
};
|
|
966
967
|
const AppConfigSchema = z.object({
|
|
@@ -997,7 +998,10 @@ const AppConfigSchema = z.object({
|
|
|
997
998
|
baseDelayMs: z.coerce.number().int().default(DEFAULT_CONFIG.scraper.fetcher.baseDelayMs),
|
|
998
999
|
maxCacheItems: z.coerce.number().int().default(DEFAULT_CONFIG.scraper.fetcher.maxCacheItems),
|
|
999
1000
|
maxCacheItemSizeBytes: z.coerce.number().int().default(DEFAULT_CONFIG.scraper.fetcher.maxCacheItemSizeBytes)
|
|
1000
|
-
}).default(DEFAULT_CONFIG.scraper.fetcher)
|
|
1001
|
+
}).default(DEFAULT_CONFIG.scraper.fetcher),
|
|
1002
|
+
document: z.object({
|
|
1003
|
+
maxSize: z.coerce.number().int().default(DEFAULT_CONFIG.scraper.document.maxSize)
|
|
1004
|
+
}).default(DEFAULT_CONFIG.scraper.document)
|
|
1001
1005
|
}).default(DEFAULT_CONFIG.scraper),
|
|
1002
1006
|
splitter: z.object({
|
|
1003
1007
|
minChunkSize: z.coerce.number().int().default(DEFAULT_CONFIG.splitter.minChunkSize),
|
|
@@ -1035,10 +1039,7 @@ const AppConfigSchema = z.object({
|
|
|
1035
1039
|
precedingSiblingsLimit: z.coerce.number().int().default(DEFAULT_CONFIG.assembly.precedingSiblingsLimit),
|
|
1036
1040
|
subsequentSiblingsLimit: z.coerce.number().int().default(DEFAULT_CONFIG.assembly.subsequentSiblingsLimit),
|
|
1037
1041
|
maxChunkDistance: z.coerce.number().int().min(0).default(DEFAULT_CONFIG.assembly.maxChunkDistance)
|
|
1038
|
-
}).default(DEFAULT_CONFIG.assembly)
|
|
1039
|
-
document: z.object({
|
|
1040
|
-
maxSize: z.coerce.number().int().default(DEFAULT_CONFIG.document.maxSize)
|
|
1041
|
-
}).default(DEFAULT_CONFIG.document)
|
|
1042
|
+
}).default(DEFAULT_CONFIG.assembly)
|
|
1042
1043
|
});
|
|
1043
1044
|
const defaults = AppConfigSchema.parse({});
|
|
1044
1045
|
const configMappings = [
|
|
@@ -1159,6 +1160,12 @@ function mapEnvToConfig() {
|
|
|
1159
1160
|
}
|
|
1160
1161
|
}
|
|
1161
1162
|
}
|
|
1163
|
+
for (const pathArr of ALL_CONFIG_LEAF_PATHS) {
|
|
1164
|
+
const envVar = pathToEnvVar(pathArr);
|
|
1165
|
+
if (process.env[envVar] !== void 0) {
|
|
1166
|
+
setAtPath(config, pathArr, process.env[envVar]);
|
|
1167
|
+
}
|
|
1168
|
+
}
|
|
1162
1169
|
return config;
|
|
1163
1170
|
}
|
|
1164
1171
|
function mapCliToConfig(args) {
|
|
@@ -1170,6 +1177,25 @@ function mapCliToConfig(args) {
|
|
|
1170
1177
|
}
|
|
1171
1178
|
return config;
|
|
1172
1179
|
}
|
|
1180
|
+
/**
 * Converts a camelCase identifier to UPPER_SNAKE_CASE.
 *
 * An underscore is inserted only between a lowercase ASCII letter that is
 * immediately followed by an uppercase ASCII letter; the result is then
 * upper-cased as a whole.
 *
 * @param {string} str - Identifier to convert.
 * @returns {string} The upper-snake-case form of `str`.
 */
function camelToUpperSnake(str) {
  const separated = str.replace(
    /([a-z])([A-Z])/g,
    (_match, lower, upper) => `${lower}_${upper}`
  );
  return separated.toUpperCase();
}
|
|
1183
|
+
/**
 * Builds the environment-variable name for a config path.
 *
 * Each path segment is converted with `camelToUpperSnake`, the segments are
 * joined with underscores, and the result is prefixed with `DOCS_MCP_`.
 *
 * @param {string[]} pathArr - Config path segments.
 * @returns {string} The corresponding environment-variable name.
 */
function pathToEnvVar(pathArr) {
  const upperSegments = pathArr.map(camelToUpperSnake);
  const suffix = upperSegments.join("_");
  return `DOCS_MCP_${suffix}`;
}
|
|
1186
|
+
/**
 * Recursively lists the key paths of every leaf in a nested object.
 *
 * A value is descended into only when it is a non-null, non-array object;
 * anything else (primitives, `null`, arrays) is treated as a leaf. An empty
 * nested object therefore contributes no paths.
 *
 * @param {object} obj - Object to walk.
 * @param {string[]} [prefix=[]] - Path segments leading to `obj`.
 * @returns {string[][]} One key-path array per leaf, in enumeration order.
 */
function collectLeafPaths(obj, prefix = []) {
  return Object.entries(obj).flatMap(([key, value]) => {
    const pathHere = [...prefix, key];
    const isBranch = typeof value === "object" && value !== null && !Array.isArray(value);
    // Wrap a leaf path in an array so flatMap's one-level flattening keeps it intact.
    return isBranch ? collectLeafPaths(value, pathHere) : [pathHere];
  });
}
|
|
1198
|
+
// Key path of every leaf in DEFAULT_CONFIG, computed once when the module loads.
// mapEnvToConfig iterates these paths to look up a matching environment variable for each one.
const ALL_CONFIG_LEAF_PATHS = collectLeafPaths(DEFAULT_CONFIG);
|
|
1173
1199
|
function setAtPath(obj, pathArr, value) {
|
|
1174
1200
|
let current = obj;
|
|
1175
1201
|
for (let i = 0; i < pathArr.length - 1; i++) {
|
|
@@ -1206,17 +1232,168 @@ function deepMerge(target, source) {
|
|
|
1206
1232
|
}
|
|
1207
1233
|
return output;
|
|
1208
1234
|
}
|
|
1235
|
+
/**
 * Reports whether a dot-separated path resolves to a defined value in
 * DEFAULT_CONFIG.
 *
 * @param {string} path2 - Dot-separated config path (e.g. "scraper.maxPages").
 * @returns {boolean} `true` unless the lookup yields `undefined`.
 */
function isValidConfigPath(path2) {
  const segments = path2.split(".");
  const defaultValue = getAtPath(DEFAULT_CONFIG, segments);
  return defaultValue !== undefined;
}
|
|
1239
|
+
/**
 * Reads the value stored at a dot-separated path of a config object.
 *
 * @param {object} config - Config object to read from.
 * @param {string} path2 - Dot-separated config path (e.g. "scraper.maxPages").
 * @returns {*} Whatever `getAtPath` returns for the split path.
 */
function getConfigValue(config, path2) {
  return getAtPath(config, path2.split("."));
}
|
|
1243
|
+
/**
 * Converts a raw command-line string into the most specific scalar it represents.
 *
 * - A non-blank string that `Number()` parses to a finite number becomes that number.
 * - "true" / "false" (case-insensitive, no surrounding whitespace) become booleans.
 * - Everything else is returned unchanged.
 *
 * @param {string} value - Raw value as typed by the user.
 * @returns {number|boolean|string} The parsed value. A non-string input is
 *   returned as-is instead of throwing.
 */
function parseConfigValue(value) {
  // Already-typed input has nothing to parse; without this guard `.trim()` would throw.
  if (typeof value !== "string") {
    return value;
  }
  // Number("") and Number("   ") are 0, so blank strings must be excluded explicitly.
  if (value.trim() !== "") {
    const num = Number(value);
    // "Infinity" / "-Infinity" parse to non-finite numbers, which JSON serializes as
    // null; keep such input as a string so it is stored and echoed faithfully.
    if (Number.isFinite(num)) {
      return num;
    }
  }
  const lower = value.toLowerCase();
  if (lower === "true") return true;
  if (lower === "false") return false;
  return value;
}
|
|
1253
|
+
/**
 * Writes one value into the default config file.
 *
 * The current file content (or an empty object when `loadConfigFile` returns
 * a falsy value) is copied via a JSON round-trip, the parsed value is set at
 * the given path, and the result is checked with `AppConfigSchema.parse`
 * before anything is saved.
 *
 * @param {string} path2 - Dot-separated config path (e.g. "scraper.maxPages").
 * @param {string} value - Raw value; converted with `parseConfigValue`.
 * @returns {string} Path of the config file that was written.
 * @throws {Error} When the updated config fails schema validation; nothing is
 *   saved in that case.
 */
function setConfigValue(path2, value) {
  const targetFile = getDefaultConfigPath();
  const existing = loadConfigFile(targetFile) || {};
  const segments = path2.split(".");
  const typedValue = parseConfigValue(value);

  // Work on a JSON-cloned copy so the loaded object is never mutated.
  const nextConfig = JSON.parse(JSON.stringify(existing));
  setAtPath(nextConfig, segments, typedValue);

  try {
    AppConfigSchema.parse(nextConfig);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Invalid config value for "${path2}": ${reason}`);
  }

  saveConfigFile(targetFile, nextConfig);
  return targetFile;
}
|
|
1269
|
+
/**
 * Returns the location of the default config file: `config.yaml` inside the
 * directory given by `systemPaths.config`.
 *
 * @returns {string} Joined file path.
 */
function getDefaultConfigPath() {
  const { config: configDir } = systemPaths;
  return path.join(configDir, "config.yaml");
}
|
|
1272
|
+
/**
 * Renders a config value as text for the CLI.
 *
 * @param {*} value - Value to print.
 * @param {string} format - "yaml" for trimmed YAML; "auto" for pretty JSON when
 *   `value` is a non-null object and `String(value)` otherwise; any other
 *   format yields pretty JSON with two-space indentation.
 * @returns {string} The formatted text.
 */
function formatOutput$1(value, format) {
  switch (format) {
    case "auto": {
      const isObjectLike = value !== null && typeof value === "object";
      return isObjectLike ? JSON.stringify(value, null, 2) : String(value);
    }
    case "yaml":
      return yaml.stringify(value).trim();
    default:
      return JSON.stringify(value, null, 2);
  }
}
|
|
1209
1284
|
/**
 * Registers the `config` command on the CLI.
 *
 * - `config` prints the whole resolved configuration (JSON unless `--yaml`).
 * - `config get <path>` prints a single value.
 * - `config set <path> <value>` validates and stores a single scalar value in
 *   the default config file; it refuses to run with an explicit `--config`.
 *
 * Failures are reported on stderr and signalled through `process.exitCode = 1`.
 *
 * @param {object} cli - yargs-style instance exposing `command(...)`.
 */
function createConfigCommand(cli) {
  // Adds the mutually exclusive --json / --yaml flags; fresh option objects on every call.
  const addFormatFlags = (y) => y.option("json", {
    type: "boolean",
    description: "Output in JSON format",
    conflicts: "yaml"
  }).option("yaml", {
    type: "boolean",
    description: "Output in YAML format",
    conflicts: "json"
  });

  // Declares the <path> positional shared by `get` and `set`.
  const addPathPositional = (y) => y.positional("path", {
    type: "string",
    description: "Dot-separated config path (e.g., scraper.maxPages)",
    demandOption: true
  });

  // Resolves the effective configuration for the current invocation.
  const resolveConfig = (argv) => loadConfig(argv, {
    configPath: argv.config,
    searchDir: argv.storePath
  });

  // Reports an unknown path and flags failure; returns true when the path was rejected.
  const rejectUnknownPath = (path2) => {
    if (isValidConfigPath(path2)) {
      return false;
    }
    console.error(`Error: Invalid config path '${path2}'`);
    console.error("Use 'docs-mcp-server config' to see all available paths.");
    process.exitCode = 1;
    return true;
  };

  const handleGet = (argv) => {
    const path2 = argv.path;
    if (rejectUnknownPath(path2)) {
      return;
    }
    const value = getConfigValue(resolveConfig(argv), path2);
    let format = "auto";
    if (argv.json) {
      format = "json";
    } else if (argv.yaml) {
      format = "yaml";
    }
    console.log(formatOutput$1(value, format));
  };

  const handleSet = (argv) => {
    const configPath = argv.config;
    const path2 = argv.path;
    const value = argv.value;
    if (configPath) {
      console.error(
        "Error: Cannot modify configuration when using explicit --config file."
      );
      console.error(
        "Remove the --config flag to modify the default configuration."
      );
      process.exitCode = 1;
      return;
    }
    if (rejectUnknownPath(path2)) {
      return;
    }
    const currentValue = getConfigValue(resolveConfig(argv), path2);
    // Only scalar leaves may be set; a path resolving to a plain object is refused.
    const pointsAtObject = typeof currentValue === "object" && currentValue !== null && !Array.isArray(currentValue);
    if (pointsAtObject) {
      console.error(
        `Error: Config path '${path2}' refers to an object. Use a more specific leaf path to set a scalar value.`
      );
      console.error(
        "Hint: Run 'docs-mcp-server config' to inspect the current structure."
      );
      process.exitCode = 1;
      return;
    }
    try {
      const savedPath = setConfigValue(path2, value);
      const parsedValue = parseConfigValue(value);
      console.log(`Updated ${path2} = ${JSON.stringify(parsedValue)}`);
      console.log(`Saved to: ${savedPath}`);
    } catch (error) {
      console.error(
        `Error: Failed to save configuration: ${error instanceof Error ? error.message : String(error)}`
      );
      process.exitCode = 1;
    }
  };

  const handleShowAll = (argv) => {
    const config = resolveConfig(argv);
    // JSON is the default for the full dump; YAML only when requested without --json.
    const format = !argv.json && argv.yaml ? "yaml" : "json";
    console.log(formatOutput$1(config, format));
  };

  cli.command(
    "config",
    "View or modify configuration",
    (yargs2) => {
      return addFormatFlags(yargs2).command(
        "get <path>",
        "Get a configuration value",
        (y) => addFormatFlags(addPathPositional(y)),
        handleGet
      ).command(
        "set <path> <value>",
        "Set a configuration value",
        (y) => addPathPositional(y).positional("value", {
          type: "string",
          description: "Value to set",
          demandOption: true
        }),
        handleSet
      );
    },
    handleShowAll
  );
}
|
|
@@ -2443,42 +2620,135 @@ class MimeTypeUtils {
|
|
|
2443
2620
|
static detectMimeTypeFromPath(filePath) {
|
|
2444
2621
|
const extension = filePath.toLowerCase().split(".").pop();
|
|
2445
2622
|
const customMimeTypes = {
|
|
2623
|
+
// JavaScript/TypeScript family
|
|
2446
2624
|
ts: "text/x-typescript",
|
|
2447
2625
|
tsx: "text/x-tsx",
|
|
2626
|
+
mts: "text/x-typescript",
|
|
2627
|
+
// TypeScript ES modules
|
|
2628
|
+
cts: "text/x-typescript",
|
|
2629
|
+
// TypeScript CommonJS modules
|
|
2448
2630
|
js: "text/javascript",
|
|
2449
2631
|
jsx: "text/x-jsx",
|
|
2450
2632
|
cjs: "text/javascript",
|
|
2451
2633
|
// CommonJS modules
|
|
2452
2634
|
mjs: "text/javascript",
|
|
2453
2635
|
// ES modules
|
|
2636
|
+
// Python family
|
|
2454
2637
|
py: "text/x-python",
|
|
2455
2638
|
pyw: "text/x-python",
|
|
2456
2639
|
pyi: "text/x-python",
|
|
2640
|
+
pyx: "text/x-cython",
|
|
2641
|
+
// Cython
|
|
2642
|
+
pxd: "text/x-cython",
|
|
2643
|
+
// Cython
|
|
2644
|
+
// Systems languages
|
|
2457
2645
|
go: "text/x-go",
|
|
2458
2646
|
rs: "text/x-rust",
|
|
2647
|
+
c: "text/x-csrc",
|
|
2648
|
+
h: "text/x-chdr",
|
|
2649
|
+
cpp: "text/x-c++src",
|
|
2650
|
+
cxx: "text/x-c++src",
|
|
2651
|
+
cc: "text/x-c++src",
|
|
2652
|
+
hpp: "text/x-c++hdr",
|
|
2653
|
+
hxx: "text/x-c++hdr",
|
|
2654
|
+
zig: "text/x-zig",
|
|
2655
|
+
nim: "text/x-nim",
|
|
2656
|
+
v: "text/x-v",
|
|
2657
|
+
cr: "text/x-crystal",
|
|
2658
|
+
// JVM languages
|
|
2459
2659
|
kt: "text/x-kotlin",
|
|
2660
|
+
kts: "text/x-kotlin",
|
|
2661
|
+
// Kotlin script
|
|
2460
2662
|
scala: "text/x-scala",
|
|
2663
|
+
groovy: "text/x-groovy",
|
|
2664
|
+
gradle: "text/x-gradle",
|
|
2665
|
+
// Apple/Mobile
|
|
2461
2666
|
swift: "text/x-swift",
|
|
2667
|
+
dart: "text/x-dart",
|
|
2668
|
+
// Scripting languages
|
|
2462
2669
|
rb: "text/x-ruby",
|
|
2670
|
+
rake: "text/x-ruby",
|
|
2671
|
+
// Rakefile
|
|
2463
2672
|
php: "text/x-php",
|
|
2673
|
+
lua: "text/x-lua",
|
|
2674
|
+
pl: "text/x-perl",
|
|
2675
|
+
pm: "text/x-perl",
|
|
2676
|
+
r: "text/x-r",
|
|
2677
|
+
// Also handles .R since extension is lowercased
|
|
2678
|
+
// Functional languages
|
|
2679
|
+
hs: "text/x-haskell",
|
|
2680
|
+
lhs: "text/x-haskell",
|
|
2681
|
+
// Literate Haskell
|
|
2682
|
+
elm: "text/x-elm",
|
|
2683
|
+
erl: "text/x-erlang",
|
|
2684
|
+
ex: "text/x-elixir",
|
|
2685
|
+
exs: "text/x-elixir",
|
|
2686
|
+
clj: "text/x-clojure",
|
|
2687
|
+
cljs: "text/x-clojure",
|
|
2688
|
+
cljc: "text/x-clojure",
|
|
2689
|
+
jl: "text/x-julia",
|
|
2690
|
+
// .NET
|
|
2464
2691
|
cs: "text/x-csharp",
|
|
2465
|
-
|
|
2466
|
-
|
|
2467
|
-
|
|
2468
|
-
|
|
2469
|
-
|
|
2470
|
-
|
|
2471
|
-
|
|
2692
|
+
// Web3/Smart contracts
|
|
2693
|
+
sol: "text/x-solidity",
|
|
2694
|
+
move: "text/x-move",
|
|
2695
|
+
cairo: "text/x-cairo",
|
|
2696
|
+
// Modern web frameworks
|
|
2697
|
+
vue: "text/x-vue",
|
|
2698
|
+
svelte: "text/x-svelte",
|
|
2699
|
+
astro: "text/x-astro",
|
|
2700
|
+
// Shell scripting
|
|
2472
2701
|
sh: "text/x-shellscript",
|
|
2473
2702
|
bash: "text/x-shellscript",
|
|
2474
2703
|
zsh: "text/x-shellscript",
|
|
2475
2704
|
fish: "text/x-shellscript",
|
|
2476
2705
|
ps1: "text/x-powershell",
|
|
2706
|
+
// Documentation formats
|
|
2707
|
+
rst: "text/x-rst",
|
|
2708
|
+
// reStructuredText
|
|
2709
|
+
adoc: "text/x-asciidoc",
|
|
2710
|
+
asciidoc: "text/x-asciidoc",
|
|
2711
|
+
textile: "text/x-textile",
|
|
2712
|
+
org: "text/x-org",
|
|
2713
|
+
// Org-mode
|
|
2714
|
+
pod: "text/x-pod",
|
|
2715
|
+
// Perl documentation
|
|
2716
|
+
rdoc: "text/x-rdoc",
|
|
2717
|
+
// Ruby documentation
|
|
2718
|
+
wiki: "text/x-wiki",
|
|
2719
|
+
rmd: "text/x-rmarkdown",
|
|
2720
|
+
// R Markdown
|
|
2721
|
+
// Configuration files
|
|
2722
|
+
toml: "text/x-toml",
|
|
2723
|
+
ini: "text/x-ini",
|
|
2724
|
+
cfg: "text/x-ini",
|
|
2725
|
+
conf: "text/x-conf",
|
|
2726
|
+
properties: "text/x-properties",
|
|
2727
|
+
env: "text/x-dotenv",
|
|
2728
|
+
// Build systems
|
|
2729
|
+
dockerfile: "text/x-dockerfile",
|
|
2730
|
+
containerfile: "text/x-dockerfile",
|
|
2731
|
+
makefile: "text/x-makefile",
|
|
2732
|
+
cmake: "text/x-cmake",
|
|
2733
|
+
bazel: "text/x-bazel",
|
|
2734
|
+
bzl: "text/x-bazel",
|
|
2735
|
+
buck: "text/x-buck",
|
|
2736
|
+
// Infrastructure as Code
|
|
2737
|
+
tf: "text/x-terraform",
|
|
2738
|
+
tfvars: "text/x-terraform",
|
|
2739
|
+
hcl: "text/x-hcl",
|
|
2740
|
+
// Data/Query languages
|
|
2477
2741
|
sql: "text/x-sql",
|
|
2478
2742
|
graphql: "text/x-graphql",
|
|
2479
2743
|
gql: "text/x-graphql",
|
|
2744
|
+
// Schema/API definitions
|
|
2480
2745
|
proto: "text/x-proto",
|
|
2481
|
-
|
|
2746
|
+
prisma: "text/x-prisma",
|
|
2747
|
+
thrift: "text/x-thrift",
|
|
2748
|
+
avro: "text/x-avro",
|
|
2749
|
+
// TeX/LaTeX
|
|
2750
|
+
tex: "text/x-tex",
|
|
2751
|
+
latex: "text/x-latex"
|
|
2482
2752
|
};
|
|
2483
2753
|
if (extension && customMimeTypes[extension]) {
|
|
2484
2754
|
return customMimeTypes[extension];
|
|
@@ -2498,8 +2768,24 @@ class MimeTypeUtils {
|
|
|
2498
2768
|
return null;
|
|
2499
2769
|
}
|
|
2500
2770
|
const mimeTypeNormalization = {
|
|
2501
|
-
"application/node": "text/javascript"
|
|
2502
|
-
// .cjs files
|
|
2771
|
+
"application/node": "text/javascript",
|
|
2772
|
+
// .cjs files
|
|
2773
|
+
"video/mp2t": "text/x-typescript",
|
|
2774
|
+
// .ts/.mts files (MPEG-2 transport stream conflict)
|
|
2775
|
+
"application/rls-services+xml": "text/x-rust",
|
|
2776
|
+
// .rs files
|
|
2777
|
+
"application/vnd.lotus-organizer": "text/x-org",
|
|
2778
|
+
// .org files (Lotus Organizer conflict)
|
|
2779
|
+
"application/vnd.dart": "text/x-dart",
|
|
2780
|
+
// .dart files
|
|
2781
|
+
"application/x-perl": "text/x-perl",
|
|
2782
|
+
// .pl/.pm files
|
|
2783
|
+
"application/x-tex": "text/x-tex",
|
|
2784
|
+
// .tex files
|
|
2785
|
+
"application/x-latex": "text/x-latex",
|
|
2786
|
+
// .latex files
|
|
2787
|
+
"application/toml": "text/x-toml"
|
|
2788
|
+
// .toml files
|
|
2503
2789
|
};
|
|
2504
2790
|
return mimeTypeNormalization[mimeType] || mimeType;
|
|
2505
2791
|
}
|
|
@@ -2511,6 +2797,7 @@ class MimeTypeUtils {
|
|
|
2511
2797
|
*/
|
|
2512
2798
|
static extractLanguageFromMimeType(mimeType) {
|
|
2513
2799
|
const mimeToLanguage = {
|
|
2800
|
+
// JavaScript/TypeScript
|
|
2514
2801
|
"text/x-typescript": "typescript",
|
|
2515
2802
|
"text/typescript": "typescript",
|
|
2516
2803
|
"application/typescript": "typescript",
|
|
@@ -2519,22 +2806,84 @@ class MimeTypeUtils {
|
|
|
2519
2806
|
"application/javascript": "javascript",
|
|
2520
2807
|
"application/x-javascript": "javascript",
|
|
2521
2808
|
"text/x-jsx": "jsx",
|
|
2809
|
+
// Python
|
|
2522
2810
|
"text/x-python": "python",
|
|
2523
|
-
"text/x-
|
|
2811
|
+
"text/x-cython": "cython",
|
|
2812
|
+
// Systems languages
|
|
2524
2813
|
"text/x-c": "c",
|
|
2525
2814
|
"text/x-csrc": "c",
|
|
2526
2815
|
"text/x-chdr": "c",
|
|
2527
2816
|
"text/x-c++": "cpp",
|
|
2528
2817
|
"text/x-c++src": "cpp",
|
|
2529
2818
|
"text/x-c++hdr": "cpp",
|
|
2530
|
-
"text/x-csharp": "csharp",
|
|
2531
2819
|
"text/x-go": "go",
|
|
2532
2820
|
"text/x-rust": "rust",
|
|
2533
|
-
"text/x-
|
|
2534
|
-
"text/x-
|
|
2535
|
-
"text/x-
|
|
2821
|
+
"text/x-zig": "zig",
|
|
2822
|
+
"text/x-nim": "nim",
|
|
2823
|
+
"text/x-v": "v",
|
|
2824
|
+
"text/x-crystal": "crystal",
|
|
2825
|
+
// JVM languages
|
|
2826
|
+
"text/x-java": "java",
|
|
2536
2827
|
"text/x-kotlin": "kotlin",
|
|
2537
2828
|
"text/x-scala": "scala",
|
|
2829
|
+
"text/x-groovy": "groovy",
|
|
2830
|
+
"text/x-gradle": "groovy",
|
|
2831
|
+
// Apple/Mobile
|
|
2832
|
+
"text/x-swift": "swift",
|
|
2833
|
+
"text/x-dart": "dart",
|
|
2834
|
+
// .NET
|
|
2835
|
+
"text/x-csharp": "csharp",
|
|
2836
|
+
// Scripting languages
|
|
2837
|
+
"text/x-ruby": "ruby",
|
|
2838
|
+
"text/x-php": "php",
|
|
2839
|
+
"text/x-lua": "lua",
|
|
2840
|
+
"text/x-perl": "perl",
|
|
2841
|
+
"text/x-r": "r",
|
|
2842
|
+
// Functional languages
|
|
2843
|
+
"text/x-haskell": "haskell",
|
|
2844
|
+
"text/x-elm": "elm",
|
|
2845
|
+
"text/x-erlang": "erlang",
|
|
2846
|
+
"text/x-elixir": "elixir",
|
|
2847
|
+
"text/x-clojure": "clojure",
|
|
2848
|
+
"text/x-julia": "julia",
|
|
2849
|
+
// Web3/Smart contracts
|
|
2850
|
+
"text/x-solidity": "solidity",
|
|
2851
|
+
"text/x-move": "move",
|
|
2852
|
+
"text/x-cairo": "cairo",
|
|
2853
|
+
// Modern web frameworks
|
|
2854
|
+
"text/x-vue": "vue",
|
|
2855
|
+
"text/x-svelte": "svelte",
|
|
2856
|
+
"text/x-astro": "astro",
|
|
2857
|
+
// Shell
|
|
2858
|
+
"text/x-sh": "bash",
|
|
2859
|
+
"text/x-shellscript": "bash",
|
|
2860
|
+
"application/x-sh": "bash",
|
|
2861
|
+
"text/x-powershell": "powershell",
|
|
2862
|
+
// Documentation formats
|
|
2863
|
+
"text/x-rst": "rst",
|
|
2864
|
+
"text/x-asciidoc": "asciidoc",
|
|
2865
|
+
"text/x-textile": "textile",
|
|
2866
|
+
"text/x-org": "org",
|
|
2867
|
+
"text/x-pod": "pod",
|
|
2868
|
+
"text/x-rdoc": "rdoc",
|
|
2869
|
+
"text/x-wiki": "wiki",
|
|
2870
|
+
"text/x-rmarkdown": "rmarkdown",
|
|
2871
|
+
// Configuration files
|
|
2872
|
+
"text/x-toml": "toml",
|
|
2873
|
+
"text/x-ini": "ini",
|
|
2874
|
+
"text/x-conf": "conf",
|
|
2875
|
+
"text/x-properties": "properties",
|
|
2876
|
+
"text/x-dotenv": "dotenv",
|
|
2877
|
+
// Build systems
|
|
2878
|
+
"text/x-dockerfile": "dockerfile",
|
|
2879
|
+
"text/x-makefile": "makefile",
|
|
2880
|
+
"text/x-cmake": "cmake",
|
|
2881
|
+
"text/x-bazel": "bazel",
|
|
2882
|
+
"text/x-buck": "buck",
|
|
2883
|
+
// Infrastructure as Code
|
|
2884
|
+
"text/x-terraform": "hcl",
|
|
2885
|
+
"text/x-hcl": "hcl",
|
|
2886
|
+
// Data formats
|
|
2538
2887
|
"text/x-yaml": "yaml",
|
|
2539
2888
|
"application/x-yaml": "yaml",
|
|
2540
2889
|
"application/yaml": "yaml",
|
|
@@ -2544,13 +2893,15 @@ class MimeTypeUtils {
|
|
|
2544
2893
|
"text/xml": "xml",
|
|
2545
2894
|
"application/xml": "xml",
|
|
2546
2895
|
"text/x-sql": "sql",
|
|
2547
|
-
"text/x-sh": "bash",
|
|
2548
|
-
"text/x-shellscript": "bash",
|
|
2549
|
-
"application/x-sh": "bash",
|
|
2550
|
-
"text/x-powershell": "powershell",
|
|
2551
2896
|
"text/x-graphql": "graphql",
|
|
2897
|
+
// Schema/API definitions
|
|
2552
2898
|
"text/x-proto": "protobuf",
|
|
2553
|
-
"text/x-
|
|
2899
|
+
"text/x-prisma": "prisma",
|
|
2900
|
+
"text/x-thrift": "thrift",
|
|
2901
|
+
"text/x-avro": "avro",
|
|
2902
|
+
// TeX/LaTeX
|
|
2903
|
+
"text/x-tex": "tex",
|
|
2904
|
+
"text/x-latex": "latex"
|
|
2554
2905
|
};
|
|
2555
2906
|
return mimeToLanguage[mimeType] || "";
|
|
2556
2907
|
}
|
|
@@ -2672,22 +3023,29 @@ class BrowserFetcher {
|
|
|
2672
3023
|
}
|
|
2673
3024
|
}
|
|
2674
3025
|
/**
|
|
2675
|
-
* Close the browser and clean up resources
|
|
3026
|
+
* Close the browser and clean up resources.
|
|
3027
|
+
* Always attempts cleanup even if browser is disconnected to reap zombie processes.
|
|
2676
3028
|
*/
|
|
2677
3029
|
async close() {
|
|
2678
|
-
|
|
2679
|
-
|
|
3030
|
+
if (this.page) {
|
|
3031
|
+
try {
|
|
2680
3032
|
await this.page.close();
|
|
3033
|
+
} catch (error) {
|
|
3034
|
+
logger.warn(`⚠️ Error closing browser page: ${error}`);
|
|
3035
|
+
} finally {
|
|
2681
3036
|
this.page = null;
|
|
2682
3037
|
}
|
|
2683
|
-
|
|
3038
|
+
}
|
|
3039
|
+
if (this.browser) {
|
|
3040
|
+
try {
|
|
2684
3041
|
await this.browser.close();
|
|
3042
|
+
} catch (error) {
|
|
3043
|
+
logger.warn(`⚠️ Error closing browser: ${error}`);
|
|
3044
|
+
} finally {
|
|
2685
3045
|
this.browser = null;
|
|
2686
3046
|
}
|
|
2687
|
-
logger.debug("Browser closed successfully");
|
|
2688
|
-
} catch (error) {
|
|
2689
|
-
logger.warn(`⚠️ Error closing browser: ${error}`);
|
|
2690
3047
|
}
|
|
3048
|
+
logger.debug("Browser closed successfully");
|
|
2691
3049
|
}
|
|
2692
3050
|
}
|
|
2693
3051
|
class FileFetcher {
|
|
@@ -4019,7 +4377,7 @@ class DocumentPipeline extends BasePipeline {
|
|
|
4019
4377
|
constructor(config) {
|
|
4020
4378
|
super();
|
|
4021
4379
|
this.markitdown = new MarkItDown();
|
|
4022
|
-
this.maxSize = config.document.maxSize;
|
|
4380
|
+
this.maxSize = config.scraper.document.maxSize;
|
|
4023
4381
|
const semanticSplitter = new SemanticMarkdownSplitter(
|
|
4024
4382
|
config.splitter.preferredChunkSize,
|
|
4025
4383
|
config.splitter.maxChunkSize
|
|
@@ -4505,12 +4863,18 @@ class HtmlPlaywrightMiddleware {
|
|
|
4505
4863
|
/**
|
|
4506
4864
|
* Closes the Playwright browser instance if it exists.
|
|
4507
4865
|
* Should be called during application shutdown.
|
|
4866
|
+
* Attempts to close even if the browser is disconnected to ensure proper cleanup of zombie processes.
|
|
4508
4867
|
*/
|
|
4509
4868
|
async closeBrowser() {
|
|
4510
|
-
if (this.browser
|
|
4511
|
-
|
|
4512
|
-
|
|
4513
|
-
|
|
4869
|
+
if (this.browser) {
|
|
4870
|
+
try {
|
|
4871
|
+
logger.debug("Closing Playwright browser instance...");
|
|
4872
|
+
await this.browser.close();
|
|
4873
|
+
} catch (error) {
|
|
4874
|
+
logger.warn(`⚠️ Error closing Playwright browser: ${error}`);
|
|
4875
|
+
} finally {
|
|
4876
|
+
this.browser = null;
|
|
4877
|
+
}
|
|
4514
4878
|
}
|
|
4515
4879
|
}
|
|
4516
4880
|
/**
|
|
@@ -5615,10 +5979,15 @@ class HtmlPipeline extends BasePipeline {
|
|
|
5615
5979
|
}
|
|
5616
5980
|
/**
|
|
5617
5981
|
* Cleanup resources used by this pipeline, specifically the Playwright browser instance.
|
|
5982
|
+
* Errors during cleanup are logged but not propagated to ensure graceful shutdown.
|
|
5618
5983
|
*/
|
|
5619
5984
|
async close() {
|
|
5620
5985
|
await super.close();
|
|
5621
|
-
|
|
5986
|
+
try {
|
|
5987
|
+
await this.playwrightMiddleware.closeBrowser();
|
|
5988
|
+
} catch (error) {
|
|
5989
|
+
logger.warn(`⚠️ Error during browser cleanup: ${error}`);
|
|
5990
|
+
}
|
|
5622
5991
|
}
|
|
5623
5992
|
}
|
|
5624
5993
|
class TextDocumentSplitter {
|
|
@@ -6643,6 +7012,11 @@ class TypeScriptParser {
|
|
|
6643
7012
|
".cjs"
|
|
6644
7013
|
];
|
|
6645
7014
|
mimeTypes = [
|
|
7015
|
+
// text/x-* variants (output by MimeTypeUtils.detectMimeTypeFromPath)
|
|
7016
|
+
"text/x-typescript",
|
|
7017
|
+
"text/x-tsx",
|
|
7018
|
+
"text/x-jsx",
|
|
7019
|
+
// Standard variants
|
|
6646
7020
|
"text/typescript",
|
|
6647
7021
|
"application/typescript",
|
|
6648
7022
|
"text/tsx",
|
|
@@ -6984,6 +7358,8 @@ class LanguageParserRegistry {
|
|
|
6984
7358
|
// Narrow advertised extensions/mime types for the alias (informational only).
|
|
6985
7359
|
fileExtensions: [".js", ".jsx", ".mjs", ".cjs"],
|
|
6986
7360
|
mimeTypes: [
|
|
7361
|
+
"text/x-jsx",
|
|
7362
|
+
// Output by MimeTypeUtils.detectMimeTypeFromPath
|
|
6987
7363
|
"text/javascript",
|
|
6988
7364
|
"application/javascript",
|
|
6989
7365
|
"text/jsx",
|
|
@@ -6996,6 +7372,8 @@ class LanguageParserRegistry {
|
|
|
6996
7372
|
this.extensionMap.set(ext.toLowerCase(), "javascript");
|
|
6997
7373
|
}
|
|
6998
7374
|
const jsMimes = [
|
|
7375
|
+
"text/x-jsx",
|
|
7376
|
+
// Output by MimeTypeUtils.detectMimeTypeFromPath
|
|
6999
7377
|
"text/javascript",
|
|
7000
7378
|
"application/javascript",
|
|
7001
7379
|
"text/jsx",
|
|
@@ -10897,6 +11275,7 @@ async function registerMcpService(server, docService, pipeline, config, authMana
|
|
|
10897
11275
|
const mcpServer = createMcpServerInstance(mcpTools, config);
|
|
10898
11276
|
const authMiddleware = authManager ? createAuthMiddleware(authManager) : null;
|
|
10899
11277
|
const sseTransports = {};
|
|
11278
|
+
const sseServers = {};
|
|
10900
11279
|
const heartbeatIntervals = {};
|
|
10901
11280
|
server.route({
|
|
10902
11281
|
method: "GET",
|
|
@@ -10906,6 +11285,8 @@ async function registerMcpService(server, docService, pipeline, config, authMana
|
|
|
10906
11285
|
try {
|
|
10907
11286
|
const transport = new SSEServerTransport("/messages", reply.raw);
|
|
10908
11287
|
sseTransports[transport.sessionId] = transport;
|
|
11288
|
+
const sessionServer = createMcpServerInstance(mcpTools, config);
|
|
11289
|
+
sseServers[transport.sessionId] = sessionServer;
|
|
10909
11290
|
if (telemetry.isEnabled()) {
|
|
10910
11291
|
logger.info(`🔗 MCP client connected: ${transport.sessionId}`);
|
|
10911
11292
|
}
|
|
@@ -10924,6 +11305,13 @@ async function registerMcpService(server, docService, pipeline, config, authMana
|
|
|
10924
11305
|
clearInterval(interval);
|
|
10925
11306
|
delete heartbeatIntervals[transport.sessionId];
|
|
10926
11307
|
}
|
|
11308
|
+
const serverToClose = sseServers[transport.sessionId];
|
|
11309
|
+
if (serverToClose) {
|
|
11310
|
+
delete sseServers[transport.sessionId];
|
|
11311
|
+
void serverToClose.close().catch((error) => {
|
|
11312
|
+
logger.error(`❌ Failed to close SSE server instance: ${error}`);
|
|
11313
|
+
});
|
|
11314
|
+
}
|
|
10927
11315
|
delete sseTransports[transport.sessionId];
|
|
10928
11316
|
transport.close();
|
|
10929
11317
|
if (telemetry.isEnabled()) {
|
|
@@ -10935,7 +11323,7 @@ async function registerMcpService(server, docService, pipeline, config, authMana
|
|
|
10935
11323
|
logger.debug(`SSE connection error: ${error}`);
|
|
10936
11324
|
cleanupConnection();
|
|
10937
11325
|
});
|
|
10938
|
-
await
|
|
11326
|
+
await sessionServer.connect(transport);
|
|
10939
11327
|
} catch (error) {
|
|
10940
11328
|
logger.error(`❌ Error in SSE endpoint: ${error}`);
|
|
10941
11329
|
reply.code(500).send({
|
|
@@ -10995,7 +11383,16 @@ async function registerMcpService(server, docService, pipeline, config, authMana
|
|
|
10995
11383
|
}
|
|
10996
11384
|
}
|
|
10997
11385
|
});
|
|
11386
|
+
server.route({
|
|
11387
|
+
method: "GET",
|
|
11388
|
+
url: "/mcp",
|
|
11389
|
+
preHandler: authMiddleware ? [authMiddleware] : void 0,
|
|
11390
|
+
handler: async (_request, reply) => {
|
|
11391
|
+
reply.code(405).header("Allow", "POST").send();
|
|
11392
|
+
}
|
|
11393
|
+
});
|
|
10998
11394
|
mcpServer._sseTransports = sseTransports;
|
|
11395
|
+
mcpServer._sseServers = sseServers;
|
|
10999
11396
|
mcpServer._heartbeatIntervals = heartbeatIntervals;
|
|
11000
11397
|
return mcpServer;
|
|
11001
11398
|
}
|
|
@@ -11013,6 +11410,12 @@ async function cleanupMcpService(mcpServer) {
|
|
|
11013
11410
|
await transport.close();
|
|
11014
11411
|
}
|
|
11015
11412
|
}
|
|
11413
|
+
const sseServers = mcpServer._sseServers;
|
|
11414
|
+
if (sseServers) {
|
|
11415
|
+
for (const server of Object.values(sseServers)) {
|
|
11416
|
+
await server.close();
|
|
11417
|
+
}
|
|
11418
|
+
}
|
|
11016
11419
|
await mcpServer.close();
|
|
11017
11420
|
logger.debug("MCP service cleaned up");
|
|
11018
11421
|
} catch (error) {
|
|
@@ -11610,7 +12013,7 @@ const Layout = ({
|
|
|
11610
12013
|
children,
|
|
11611
12014
|
eventClientConfig
|
|
11612
12015
|
}) => {
|
|
11613
|
-
const versionString = version || "
|
|
12016
|
+
const versionString = version || "2.0.0";
|
|
11614
12017
|
const versionInitializer = `versionUpdate({ currentVersion: ${`'${versionString}'`} })`;
|
|
11615
12018
|
return /* @__PURE__ */ jsxs("html", { lang: "en", children: [
|
|
11616
12019
|
/* @__PURE__ */ jsxs("head", { children: [
|
|
@@ -13903,6 +14306,15 @@ async function stopWorkerService(pipeline) {
|
|
|
13903
14306
|
await pipeline.stop();
|
|
13904
14307
|
logger.debug("Worker service stopped");
|
|
13905
14308
|
}
|
|
14309
|
+
// Two-line startup banner: the first row is wrapped in ANSI code 90, the
// second in ANSI code 97, and each row ends with the reset code 0.
const BANNER = "\x1B[90m █▀▀ █▀█ █▀█ █ █ █▄ █ █▀▄ █▀▀ █▀▄ █▀▄ █▀█ █▀▀ █▀▀\x1B[0m" + "\n" + "\x1B[97m █▄█ █▀▄ █▄█ █▄█ █ ▀█ █▄▀ ██▄ █▄▀ █▄▀ █▄█ █▄▄ ▄▄█\x1B[0m";
|
|
14313
|
+
/**
 * Prints BANNER to stdout with one empty line before and one after it.
 */
function printBanner() {
  // Each entry is the argument list of one console.log call; the empty lists
  // produce the blank lines around the banner.
  const logCalls = [[], [BANNER], []];
  for (const args of logCalls) {
    console.log(...args);
  }
}
|
|
13906
14318
|
class AppServer {
|
|
13907
14319
|
constructor(docService, pipeline, eventBus, serverConfig, appConfig) {
|
|
13908
14320
|
this.docService = docService;
|
|
@@ -13951,7 +14363,7 @@ class AppServer {
|
|
|
13951
14363
|
try {
|
|
13952
14364
|
if (telemetry.isEnabled()) {
|
|
13953
14365
|
telemetry.setGlobalContext({
|
|
13954
|
-
appVersion: "
|
|
14366
|
+
appVersion: "2.0.0",
|
|
13955
14367
|
appPlatform: process.platform,
|
|
13956
14368
|
appNodeVersion: process.version,
|
|
13957
14369
|
appServicesEnabled: this.getActiveServicesList(),
|
|
@@ -14280,6 +14692,9 @@ class AppServer {
|
|
|
14280
14692
|
* Log startup information showing which services are enabled.
|
|
14281
14693
|
*/
|
|
14282
14694
|
logStartupInfo(address) {
|
|
14695
|
+
if (this.serverConfig.showLogo !== false) {
|
|
14696
|
+
printBanner();
|
|
14697
|
+
}
|
|
14283
14698
|
const isWorkerOnly = this.serverConfig.enableWorker && !this.serverConfig.enableWebInterface && !this.serverConfig.enableMcpServer;
|
|
14284
14699
|
const isWebOnly = this.serverConfig.enableWebInterface && !this.serverConfig.enableWorker && !this.serverConfig.enableMcpServer;
|
|
14285
14700
|
const isMcpOnly = this.serverConfig.enableMcpServer && !this.serverConfig.enableWebInterface && !this.serverConfig.enableWorker;
|
|
@@ -14691,6 +15106,9 @@ class BaseScraperStrategy {
|
|
|
14691
15106
|
return null;
|
|
14692
15107
|
}).filter((item2) => item2 !== null);
|
|
14693
15108
|
} catch (error) {
|
|
15109
|
+
if (item.depth === 0) {
|
|
15110
|
+
throw error;
|
|
15111
|
+
}
|
|
14694
15112
|
if (options.ignoreErrors) {
|
|
14695
15113
|
logger.error(`❌ Failed to process ${item.url}: ${error}`);
|
|
14696
15114
|
return [];
|
|
@@ -14805,10 +15223,10 @@ class GitHubRepoProcessor {
|
|
|
14805
15223
|
/**
|
|
14806
15224
|
* Fetches the raw content of a file from GitHub.
|
|
14807
15225
|
*/
|
|
14808
|
-
async fetchFileContent(repoInfo, filePath, etag, signal) {
|
|
15226
|
+
async fetchFileContent(repoInfo, filePath, etag, headers, signal) {
|
|
14809
15227
|
const { owner, repo, branch } = repoInfo;
|
|
14810
15228
|
const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;
|
|
14811
|
-
const rawContent = await this.httpFetcher.fetch(rawUrl, { signal, etag });
|
|
15229
|
+
const rawContent = await this.httpFetcher.fetch(rawUrl, { signal, etag, headers });
|
|
14812
15230
|
const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
|
|
14813
15231
|
if (detectedMimeType && (rawContent.mimeType === "text/plain" || rawContent.mimeType === "application/octet-stream")) {
|
|
14814
15232
|
return {
|
|
@@ -14821,13 +15239,14 @@ class GitHubRepoProcessor {
|
|
|
14821
15239
|
/**
|
|
14822
15240
|
* Processes a single GitHub repository file from an HTTPS blob URL.
|
|
14823
15241
|
*/
|
|
14824
|
-
async process(item, options, signal) {
|
|
15242
|
+
async process(item, options, headers, signal) {
|
|
14825
15243
|
const repoInfo = this.parseHttpsBlobUrl(item.url);
|
|
14826
15244
|
const { owner, repo, branch, filePath } = repoInfo;
|
|
14827
15245
|
const rawContent = await this.fetchFileContent(
|
|
14828
15246
|
{ owner, repo, branch },
|
|
14829
15247
|
filePath,
|
|
14830
15248
|
item.etag,
|
|
15249
|
+
headers,
|
|
14831
15250
|
signal
|
|
14832
15251
|
);
|
|
14833
15252
|
if (rawContent.status !== FetchStatus.SUCCESS) {
|
|
@@ -14918,12 +15337,13 @@ class GitHubWikiProcessor {
|
|
|
14918
15337
|
/**
|
|
14919
15338
|
* Processes a single GitHub wiki page.
|
|
14920
15339
|
*/
|
|
14921
|
-
async process(item, options, signal) {
|
|
15340
|
+
async process(item, options, headers, signal) {
|
|
14922
15341
|
const currentUrl = item.url;
|
|
14923
15342
|
try {
|
|
14924
15343
|
const rawContent = await this.httpFetcher.fetch(currentUrl, {
|
|
14925
15344
|
signal,
|
|
14926
|
-
etag: item.etag
|
|
15345
|
+
etag: item.etag,
|
|
15346
|
+
headers
|
|
14927
15347
|
});
|
|
14928
15348
|
if (rawContent.status !== FetchStatus.SUCCESS) {
|
|
14929
15349
|
return { url: currentUrl, links: [], status: rawContent.status };
|
|
@@ -14994,10 +15414,52 @@ class GitHubWikiProcessor {
|
|
|
14994
15414
|
await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
|
|
14995
15415
|
}
|
|
14996
15416
|
}
|
|
15417
|
+
const execAsync = promisify(exec);

/**
 * Resolves the HTTP headers to use for GitHub requests.
 *
 * Resolution order (first match wins):
 *   1. An `Authorization` header already present in `explicitHeaders`
 *      (header name compared case-insensitively) - returned untouched.
 *   2. The `GITHUB_TOKEN` environment variable.
 *   3. The `GH_TOKEN` environment variable.
 *   4. The token printed by the local GitHub CLI (`gh auth token`).
 *   5. No authentication - `explicitHeaders` as given, or `{}`.
 *
 * Tokens are trimmed before use, so a blank or newline-padded environment
 * variable is treated as "not set" / cleaned up instead of producing a
 * malformed `Authorization: Bearer` header.
 *
 * @param {Record<string, string>} [explicitHeaders] Caller-supplied headers.
 * @returns {Promise<Record<string, string>>} Headers including an
 *   `Authorization` entry when a token could be found. The input object is
 *   never mutated; it is returned as-is only when nothing had to be added.
 */
async function resolveGitHubAuth(explicitHeaders) {
  if (explicitHeaders) {
    const hasAuthHeader = Object.keys(explicitHeaders).some(
      (key) => key.toLowerCase() === "authorization"
    );
    if (hasAuthHeader) {
      return explicitHeaders;
    }
  }

  const withBearer = (token) => ({
    ...explicitHeaders,
    Authorization: `Bearer ${token}`
  });

  // Environment variables, in priority order.
  for (const envName of ["GITHUB_TOKEN", "GH_TOKEN"]) {
    const envToken = process.env[envName]?.trim();
    if (envToken) {
      logger.debug(`Using GitHub token from ${envName} environment variable`);
      return withBearer(envToken);
    }
  }

  // Best effort: the gh CLI may be missing or logged out. That is not an
  // error for the caller, but it is logged so the fallback can be diagnosed.
  try {
    const { stdout } = await execAsync("gh auth token", { timeout: 5e3 });
    const cliToken = stdout.trim();
    if (cliToken) {
      logger.debug("Using GitHub token from local gh CLI");
      return withBearer(cliToken);
    }
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    logger.debug(`GitHub CLI token lookup failed, continuing unauthenticated: ${reason}`);
  }

  return explicitHeaders ?? {};
}
|
|
14997
15457
|
class GitHubScraperStrategy extends BaseScraperStrategy {
|
|
14998
15458
|
httpFetcher;
|
|
14999
15459
|
wikiProcessor;
|
|
15000
15460
|
repoProcessor;
|
|
15461
|
+
resolvedAuthHeaders;
|
|
15462
|
+
resolvedAuthKey;
|
|
15001
15463
|
constructor(config) {
|
|
15002
15464
|
super(config);
|
|
15003
15465
|
this.httpFetcher = new HttpFetcher(config.scraper);
|
|
@@ -15054,31 +15516,117 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
|
|
|
15054
15516
|
}
|
|
15055
15517
|
return { owner, repo };
|
|
15056
15518
|
}
|
|
15519
|
+
buildAuthCacheKey(explicitHeaders) {
|
|
15520
|
+
const normalizedHeaders = explicitHeaders ? Object.keys(explicitHeaders).sort().map((key) => [key, explicitHeaders[key]]) : [];
|
|
15521
|
+
const envKey = `${process.env.GITHUB_TOKEN ?? ""}|${process.env.GH_TOKEN ?? ""}`;
|
|
15522
|
+
return JSON.stringify({ headers: normalizedHeaders, env: envKey });
|
|
15523
|
+
}
|
|
15524
|
+
async getResolvedAuthHeaders(explicitHeaders) {
|
|
15525
|
+
const cacheKey = this.buildAuthCacheKey(explicitHeaders);
|
|
15526
|
+
if (this.resolvedAuthHeaders && this.resolvedAuthKey === cacheKey) {
|
|
15527
|
+
return this.resolvedAuthHeaders;
|
|
15528
|
+
}
|
|
15529
|
+
const resolved = await resolveGitHubAuth(explicitHeaders);
|
|
15530
|
+
this.resolvedAuthHeaders = resolved;
|
|
15531
|
+
this.resolvedAuthKey = cacheKey;
|
|
15532
|
+
return resolved;
|
|
15533
|
+
}
|
|
15057
15534
|
/**
|
|
15058
15535
|
* Fetches the repository tree structure from GitHub API.
|
|
15059
15536
|
*/
|
|
15060
|
-
async fetchRepositoryTree(repoInfo, signal) {
|
|
15537
|
+
async fetchRepositoryTree(repoInfo, headers, signal) {
|
|
15061
15538
|
const { owner, repo, branch } = repoInfo;
|
|
15062
15539
|
let targetBranch = branch;
|
|
15063
15540
|
if (!targetBranch) {
|
|
15541
|
+
const repoUrl = `https://api.github.com/repos/${owner}/${repo}`;
|
|
15542
|
+
logger.debug(`Fetching repository info: ${repoUrl}`);
|
|
15543
|
+
let repoContent;
|
|
15544
|
+
try {
|
|
15545
|
+
repoContent = await this.httpFetcher.fetch(repoUrl, { signal, headers });
|
|
15546
|
+
} catch (error) {
|
|
15547
|
+
if (error instanceof ScraperError) {
|
|
15548
|
+
if (error.message.includes("401")) {
|
|
15549
|
+
throw new ScraperError(
|
|
15550
|
+
`GitHub authentication failed for "${owner}/${repo}". Your token is invalid or expired. Please check your GITHUB_TOKEN or GH_TOKEN environment variable.`,
|
|
15551
|
+
false,
|
|
15552
|
+
error
|
|
15553
|
+
);
|
|
15554
|
+
}
|
|
15555
|
+
if (error.message.includes("403")) {
|
|
15556
|
+
throw new ScraperError(
|
|
15557
|
+
`GitHub access denied for "${owner}/${repo}". Your token may lack the required permissions, or you may be rate-limited. Please check your GITHUB_TOKEN or GH_TOKEN.`,
|
|
15558
|
+
false,
|
|
15559
|
+
error
|
|
15560
|
+
);
|
|
15561
|
+
}
|
|
15562
|
+
}
|
|
15563
|
+
throw error;
|
|
15564
|
+
}
|
|
15565
|
+
if (repoContent.status === FetchStatus.NOT_FOUND) {
|
|
15566
|
+
throw new ScraperError(
|
|
15567
|
+
`Repository "${owner}/${repo}" not found or not accessible. For private repositories, set the GITHUB_TOKEN environment variable.`,
|
|
15568
|
+
false
|
|
15569
|
+
);
|
|
15570
|
+
}
|
|
15064
15571
|
try {
|
|
15065
|
-
const repoUrl = `https://api.github.com/repos/${owner}/${repo}`;
|
|
15066
|
-
logger.debug(`Fetching repository info: ${repoUrl}`);
|
|
15067
|
-
const repoContent = await this.httpFetcher.fetch(repoUrl, { signal });
|
|
15068
15572
|
const content2 = typeof repoContent.content === "string" ? repoContent.content : repoContent.content.toString("utf-8");
|
|
15069
15573
|
const repoData = JSON.parse(content2);
|
|
15070
|
-
|
|
15071
|
-
|
|
15072
|
-
|
|
15073
|
-
|
|
15574
|
+
const defaultBranch = typeof repoData.default_branch === "string" ? repoData.default_branch.trim() : "";
|
|
15575
|
+
if (!defaultBranch) {
|
|
15576
|
+
logger.warn(
|
|
15577
|
+
`⚠️ Repository info missing default_branch for ${owner}/${repo}, using 'main'`
|
|
15578
|
+
);
|
|
15579
|
+
targetBranch = "main";
|
|
15580
|
+
} else {
|
|
15581
|
+
targetBranch = defaultBranch;
|
|
15582
|
+
logger.debug(`Using default branch: ${targetBranch}`);
|
|
15583
|
+
}
|
|
15584
|
+
} catch (parseError) {
|
|
15585
|
+
logger.warn(`⚠️ Could not parse repository info, using 'main': ${parseError}`);
|
|
15074
15586
|
targetBranch = "main";
|
|
15075
15587
|
}
|
|
15076
15588
|
}
|
|
15077
15589
|
const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`;
|
|
15078
15590
|
logger.debug(`Fetching repository tree: ${treeUrl}`);
|
|
15079
|
-
|
|
15591
|
+
let rawContent;
|
|
15592
|
+
try {
|
|
15593
|
+
rawContent = await this.httpFetcher.fetch(treeUrl, { signal, headers });
|
|
15594
|
+
} catch (error) {
|
|
15595
|
+
if (error instanceof ScraperError) {
|
|
15596
|
+
if (error.message.includes("401")) {
|
|
15597
|
+
throw new ScraperError(
|
|
15598
|
+
`GitHub authentication failed for "${owner}/${repo}". Your token is invalid or expired. Please check your GITHUB_TOKEN or GH_TOKEN environment variable.`,
|
|
15599
|
+
false,
|
|
15600
|
+
error
|
|
15601
|
+
);
|
|
15602
|
+
}
|
|
15603
|
+
if (error.message.includes("403")) {
|
|
15604
|
+
throw new ScraperError(
|
|
15605
|
+
`GitHub access denied for "${owner}/${repo}". Your token may lack the required permissions, or you may be rate-limited. Please check your GITHUB_TOKEN or GH_TOKEN.`,
|
|
15606
|
+
false,
|
|
15607
|
+
error
|
|
15608
|
+
);
|
|
15609
|
+
}
|
|
15610
|
+
}
|
|
15611
|
+
throw error;
|
|
15612
|
+
}
|
|
15613
|
+
if (rawContent.status === FetchStatus.NOT_FOUND) {
|
|
15614
|
+
throw new ScraperError(
|
|
15615
|
+
`Repository "${owner}/${repo}" not found or not accessible. For private repositories, set the GITHUB_TOKEN environment variable.`,
|
|
15616
|
+
false
|
|
15617
|
+
);
|
|
15618
|
+
}
|
|
15080
15619
|
const content = typeof rawContent.content === "string" ? rawContent.content : rawContent.content.toString("utf-8");
|
|
15081
|
-
|
|
15620
|
+
let treeData;
|
|
15621
|
+
try {
|
|
15622
|
+
treeData = JSON.parse(content);
|
|
15623
|
+
} catch (parseError) {
|
|
15624
|
+
throw new ScraperError(
|
|
15625
|
+
`Failed to parse GitHub API response for "${owner}/${repo}". The repository may be inaccessible or the API returned an unexpected response.`,
|
|
15626
|
+
false,
|
|
15627
|
+
parseError instanceof Error ? parseError : void 0
|
|
15628
|
+
);
|
|
15629
|
+
}
|
|
15082
15630
|
if (treeData.truncated) {
|
|
15083
15631
|
logger.warn(
|
|
15084
15632
|
`⚠️ Repository tree was truncated for ${owner}/${repo}. Some files may be missing.`
|
|
@@ -15217,7 +15765,7 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
|
|
|
15217
15765
|
if (hasTextExtension || hasDocumentExtension || hasCompoundExtension || isCommonTextFile) {
|
|
15218
15766
|
return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
|
|
15219
15767
|
}
|
|
15220
|
-
const mimeType =
|
|
15768
|
+
const mimeType = MimeTypeUtils.detectMimeTypeFromPath(path2);
|
|
15221
15769
|
if (mimeType?.startsWith("text/")) {
|
|
15222
15770
|
logger.debug(`Including file with text MIME type: ${path2} (${mimeType})`);
|
|
15223
15771
|
return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
|
|
@@ -15252,10 +15800,11 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
|
|
|
15252
15800
|
status: FetchStatus.NOT_FOUND
|
|
15253
15801
|
};
|
|
15254
15802
|
}
|
|
15803
|
+
const headers = await this.getResolvedAuthHeaders(options.headers);
|
|
15255
15804
|
try {
|
|
15256
15805
|
const parsedUrl = new URL(item.url);
|
|
15257
15806
|
if (/^\/[^/]+\/[^/]+\/wiki($|\/)/.test(parsedUrl.pathname)) {
|
|
15258
|
-
return await this.wikiProcessor.process(item, options, signal);
|
|
15807
|
+
return await this.wikiProcessor.process(item, options, headers, signal);
|
|
15259
15808
|
}
|
|
15260
15809
|
} catch {
|
|
15261
15810
|
}
|
|
@@ -15281,7 +15830,11 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
|
|
|
15281
15830
|
const wikiUrl = `${options.url.replace(/\/$/, "")}/wiki`;
|
|
15282
15831
|
discoveredLinks.push(wikiUrl);
|
|
15283
15832
|
logger.debug(`Discovered wiki URL: ${wikiUrl}`);
|
|
15284
|
-
const { tree, resolvedBranch } = await this.fetchRepositoryTree(
|
|
15833
|
+
const { tree, resolvedBranch } = await this.fetchRepositoryTree(
|
|
15834
|
+
repoInfo,
|
|
15835
|
+
headers,
|
|
15836
|
+
signal
|
|
15837
|
+
);
|
|
15285
15838
|
const fileItems = tree.tree.filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)).filter((treeItem) => this.shouldProcessFile(treeItem, options));
|
|
15286
15839
|
logger.debug(
|
|
15287
15840
|
`Discovered ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`
|
|
@@ -15299,7 +15852,7 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
|
|
|
15299
15852
|
const parsedUrl = new URL(item.url);
|
|
15300
15853
|
if (/^\/[^/]+\/[^/]+\/blob\//.test(parsedUrl.pathname)) {
|
|
15301
15854
|
logger.debug(`Processing HTTPS blob URL at depth ${item.depth}: ${item.url}`);
|
|
15302
|
-
return await this.repoProcessor.process(item, options, signal);
|
|
15855
|
+
return await this.repoProcessor.process(item, options, headers, signal);
|
|
15303
15856
|
}
|
|
15304
15857
|
} catch (error) {
|
|
15305
15858
|
logger.warn(`⚠️ Failed to parse blob URL ${item.url}: ${error}`);
|
|
@@ -15313,7 +15866,13 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
|
|
|
15313
15866
|
if (!url.hostname.includes("github.com")) {
|
|
15314
15867
|
throw new Error("URL must be a GitHub URL");
|
|
15315
15868
|
}
|
|
15316
|
-
await
|
|
15869
|
+
await this.getResolvedAuthHeaders(options.headers);
|
|
15870
|
+
try {
|
|
15871
|
+
await super.scrape(options, progressCallback, signal);
|
|
15872
|
+
} finally {
|
|
15873
|
+
this.resolvedAuthHeaders = void 0;
|
|
15874
|
+
this.resolvedAuthKey = void 0;
|
|
15875
|
+
}
|
|
15317
15876
|
}
|
|
15318
15877
|
async cleanup() {
|
|
15319
15878
|
await Promise.all([this.wikiProcessor.cleanup(), this.repoProcessor.cleanup()]);
|
|
@@ -15689,7 +16248,7 @@ class LocalFileStrategy extends BaseScraperStrategy {
|
|
|
15689
16248
|
logger.debug(`Reading archive entry: ${innerPath} inside ${archivePath}`);
|
|
15690
16249
|
try {
|
|
15691
16250
|
const contentBuffer = await adapter.getContent(innerPath);
|
|
15692
|
-
const mimeType =
|
|
16251
|
+
const mimeType = MimeTypeUtils.detectMimeTypeFromPath(innerPath) || "application/octet-stream";
|
|
15693
16252
|
const rawContent = {
|
|
15694
16253
|
source: item.url,
|
|
15695
16254
|
content: contentBuffer,
|
|
@@ -15976,31 +16535,90 @@ class PyPiScraperStrategy {
|
|
|
15976
16535
|
}
|
|
15977
16536
|
}
|
|
15978
16537
|
class ScraperRegistry {
|
|
15979
|
-
|
|
16538
|
+
config;
|
|
15980
16539
|
constructor(config) {
|
|
15981
|
-
this.
|
|
15982
|
-
new NpmScraperStrategy(config),
|
|
15983
|
-
new PyPiScraperStrategy(config),
|
|
15984
|
-
new GitHubScraperStrategy(config),
|
|
15985
|
-
new WebScraperStrategy(config, {}),
|
|
15986
|
-
new LocalFileStrategy(config)
|
|
15987
|
-
];
|
|
16540
|
+
this.config = config;
|
|
15988
16541
|
}
|
|
16542
|
+
/**
|
|
16543
|
+
* Creates and returns a fresh strategy instance for the given URL.
|
|
16544
|
+
* Each call returns a new instance to ensure state isolation between parallel scrapes.
|
|
16545
|
+
*/
|
|
15989
16546
|
getStrategy(url) {
|
|
15990
|
-
|
|
15991
|
-
|
|
15992
|
-
if (!strategy) {
|
|
15993
|
-
throw new ScraperError(`No strategy found for URL: ${url}`);
|
|
16547
|
+
if (!url.startsWith("github-file://")) {
|
|
16548
|
+
validateUrl(url);
|
|
15994
16549
|
}
|
|
15995
|
-
|
|
15996
|
-
|
|
16550
|
+
if (isLocalFileUrl(url)) {
|
|
16551
|
+
logger.debug(`Using strategy "LocalFileStrategy" for URL: ${url}`);
|
|
16552
|
+
return new LocalFileStrategy(this.config);
|
|
16553
|
+
}
|
|
16554
|
+
if (isNpmUrl(url)) {
|
|
16555
|
+
logger.debug(`Using strategy "NpmScraperStrategy" for URL: ${url}`);
|
|
16556
|
+
return new NpmScraperStrategy(this.config);
|
|
16557
|
+
}
|
|
16558
|
+
if (isPyPiUrl(url)) {
|
|
16559
|
+
logger.debug(`Using strategy "PyPiScraperStrategy" for URL: ${url}`);
|
|
16560
|
+
return new PyPiScraperStrategy(this.config);
|
|
16561
|
+
}
|
|
16562
|
+
if (isGitHubUrl(url)) {
|
|
16563
|
+
logger.debug(`Using strategy "GitHubScraperStrategy" for URL: ${url}`);
|
|
16564
|
+
return new GitHubScraperStrategy(this.config);
|
|
16565
|
+
}
|
|
16566
|
+
if (isWebUrl(url)) {
|
|
16567
|
+
logger.debug(`Using strategy "WebScraperStrategy" for URL: ${url}`);
|
|
16568
|
+
return new WebScraperStrategy(this.config, {});
|
|
16569
|
+
}
|
|
16570
|
+
throw new ScraperError(`No strategy found for URL: ${url}`);
|
|
15997
16571
|
}
|
|
15998
|
-
|
|
15999
|
-
|
|
16000
|
-
|
|
16001
|
-
|
|
16002
|
-
|
|
16003
|
-
|
|
16572
|
+
}
|
|
16573
|
+
/**
 * Tells whether the URL string uses the local `file://` scheme prefix.
 *
 * @param {string} url URL to classify.
 * @returns {boolean} True when `url` begins with "file://".
 */
function isLocalFileUrl(url) {
  const fileSchemePrefix = "file://";
  return url.startsWith(fileSchemePrefix);
}
|
|
16576
|
+
/**
 * Tells whether the URL points at the npm website
 * (hosts: npmjs.org, npmjs.com, www.npmjs.com).
 *
 * @param {string} url URL to classify.
 * @returns {boolean} True for a recognised npm host; false otherwise,
 *   including when `url` cannot be parsed.
 */
function isNpmUrl(url) {
  let host;
  try {
    host = new URL(url).hostname;
  } catch {
    return false;
  }
  return host === "npmjs.org" || host === "npmjs.com" || host === "www.npmjs.com";
}
|
|
16584
|
+
/**
 * Tells whether the URL points at the PyPI website
 * (hosts: pypi.org, www.pypi.org).
 *
 * @param {string} url URL to classify.
 * @returns {boolean} True for a recognised PyPI host; false otherwise,
 *   including when `url` cannot be parsed.
 */
function isPyPiUrl(url) {
  let host;
  try {
    host = new URL(url).hostname;
  } catch {
    return false;
  }
  return host === "pypi.org" || host === "www.pypi.org";
}
|
|
16592
|
+
/**
 * Tells whether the URL is one of the supported GitHub forms:
 *   - the internal `github-file://` scheme,
 *   - a repository root   (`/owner/repo`, optional trailing slash),
 *   - a tree URL          (`/owner/repo/tree/...`),
 *   - a blob URL          (`/owner/repo/blob/...`),
 * the last three only on github.com / www.github.com.
 *
 * @param {string} url URL to classify.
 * @returns {boolean} True for a supported GitHub URL; false otherwise,
 *   including when `url` cannot be parsed.
 */
function isGitHubUrl(url) {
  if (url.startsWith("github-file://")) {
    return true;
  }

  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return false;
  }

  const onGitHubHost = parsed.hostname === "github.com" || parsed.hostname === "www.github.com";
  if (!onGitHubHost) {
    return false;
  }

  const supportedPathShapes = [
    /^\/[^/]+\/[^/]+\/?$/,
    /^\/[^/]+\/[^/]+\/tree\//,
    /^\/[^/]+\/[^/]+\/blob\//
  ];
  return supportedPathShapes.some((shape) => shape.test(parsed.pathname));
}
|
|
16616
|
+
/**
 * Tells whether the URL is a plain web URL, i.e. parses successfully and
 * uses the http or https protocol.
 *
 * @param {string} url URL to classify.
 * @returns {boolean} True for http/https URLs; false otherwise, including
 *   when `url` cannot be parsed.
 */
function isWebUrl(url) {
  let scheme;
  try {
    scheme = new URL(url).protocol;
  } catch {
    return false;
  }
  return ["http:", "https:"].includes(scheme);
}
|
|
16006
16624
|
class ScraperService {
|
|
@@ -16011,20 +16629,35 @@ class ScraperService {
|
|
|
16011
16629
|
/**
|
|
16012
16630
|
* Scrapes content from the provided URL using the appropriate strategy.
|
|
16013
16631
|
* Reports progress via callback and handles errors.
|
|
16632
|
+
* Cleans up strategy resources after scrape completes (success or failure).
|
|
16014
16633
|
*/
|
|
16015
16634
|
async scrape(options, progressCallback, signal) {
|
|
16016
16635
|
const strategy = this.registry.getStrategy(options.url);
|
|
16017
|
-
|
|
16018
|
-
|
|
16636
|
+
let scrapeError = null;
|
|
16637
|
+
let cleanupErrorToThrow = null;
|
|
16638
|
+
try {
|
|
16639
|
+
await strategy.scrape(options, progressCallback, signal);
|
|
16640
|
+
} catch (error) {
|
|
16641
|
+
scrapeError = error instanceof Error ? error : new ScraperError(`Scrape failed for URL: ${options.url}`, false);
|
|
16642
|
+
} finally {
|
|
16643
|
+
try {
|
|
16644
|
+
await strategy.cleanup?.();
|
|
16645
|
+
} catch (cleanupError) {
|
|
16646
|
+
logger.error(`❌ Strategy cleanup failed for ${options.url}: ${cleanupError}`);
|
|
16647
|
+
if (!scrapeError) {
|
|
16648
|
+
cleanupErrorToThrow = cleanupError instanceof Error ? cleanupError : new ScraperError(
|
|
16649
|
+
`Strategy cleanup failed for URL: ${options.url}`,
|
|
16650
|
+
false
|
|
16651
|
+
);
|
|
16652
|
+
}
|
|
16653
|
+
}
|
|
16654
|
+
}
|
|
16655
|
+
if (scrapeError) {
|
|
16656
|
+
throw scrapeError;
|
|
16657
|
+
}
|
|
16658
|
+
if (cleanupErrorToThrow) {
|
|
16659
|
+
throw cleanupErrorToThrow;
|
|
16019
16660
|
}
|
|
16020
|
-
await strategy.scrape(options, progressCallback, signal);
|
|
16021
|
-
}
|
|
16022
|
-
/**
|
|
16023
|
-
* Cleanup the scraper registry and all its strategies.
|
|
16024
|
-
* Should be called when the service is no longer needed.
|
|
16025
|
-
*/
|
|
16026
|
-
async cleanup() {
|
|
16027
|
-
await this.registry.cleanup();
|
|
16028
16661
|
}
|
|
16029
16662
|
}
|
|
16030
16663
|
class PipelineWorker {
|
|
@@ -16187,7 +16820,7 @@ class PipelineManager {
|
|
|
16187
16820
|
if (this.shouldRecoverJobs) {
|
|
16188
16821
|
await this.recoverPendingJobs();
|
|
16189
16822
|
} else {
|
|
16190
|
-
|
|
16823
|
+
await this.markInterruptedJobsAsFailed();
|
|
16191
16824
|
}
|
|
16192
16825
|
this._processQueue().catch((error) => {
|
|
16193
16826
|
logger.error(`❌ Error in processQueue during start: ${error}`);
|
|
@@ -16195,79 +16828,69 @@ class PipelineManager {
|
|
|
16195
16828
|
}
|
|
16196
16829
|
/**
|
|
16197
16830
|
* Recovers pending jobs from the database after server restart.
|
|
16198
|
-
*
|
|
16199
|
-
*
|
|
16831
|
+
* Uses enqueueRefreshJob() to properly continue interrupted jobs,
|
|
16832
|
+
* leveraging existing pages and ETags when available.
|
|
16200
16833
|
*/
|
|
16201
16834
|
async recoverPendingJobs() {
|
|
16202
16835
|
try {
|
|
16203
|
-
const
|
|
16204
|
-
VersionStatus.RUNNING
|
|
16836
|
+
const interruptedVersions = await this.store.getVersionsByStatus([
|
|
16837
|
+
VersionStatus.RUNNING,
|
|
16838
|
+
VersionStatus.QUEUED
|
|
16205
16839
|
]);
|
|
16206
|
-
|
|
16207
|
-
|
|
16208
|
-
|
|
16209
|
-
`🔄 Reset interrupted job to QUEUED: ${version.library_name}@${version.name || "latest"}`
|
|
16210
|
-
);
|
|
16840
|
+
if (interruptedVersions.length === 0) {
|
|
16841
|
+
logger.debug("No pending jobs to recover from database");
|
|
16842
|
+
return;
|
|
16211
16843
|
}
|
|
16212
|
-
|
|
16213
|
-
|
|
16214
|
-
|
|
16215
|
-
|
|
16216
|
-
|
|
16217
|
-
|
|
16218
|
-
|
|
16219
|
-
|
|
16220
|
-
|
|
16221
|
-
|
|
16222
|
-
|
|
16223
|
-
|
|
16224
|
-
|
|
16225
|
-
|
|
16226
|
-
|
|
16227
|
-
|
|
16228
|
-
} catch (error) {
|
|
16229
|
-
logger.warn(
|
|
16230
|
-
`⚠️ Failed to parse scraper options for ${version.library_name}@${version.name || "latest"}: ${error}`
|
|
16231
|
-
);
|
|
16232
|
-
}
|
|
16844
|
+
logger.info(
|
|
16845
|
+
`📥 Recovering ${interruptedVersions.length} pending job(s) from database`
|
|
16846
|
+
);
|
|
16847
|
+
for (const version of interruptedVersions) {
|
|
16848
|
+
const versionLabel = `${version.library_name}@${version.name || "latest"}`;
|
|
16849
|
+
try {
|
|
16850
|
+
await this.enqueueRefreshJob(version.library_name, version.name);
|
|
16851
|
+
logger.info(`🔄 Recovering job: ${versionLabel}`);
|
|
16852
|
+
} catch (error) {
|
|
16853
|
+
const errorMessage = `Recovery failed: ${error instanceof Error ? error.message : String(error)}`;
|
|
16854
|
+
await this.store.updateVersionStatus(
|
|
16855
|
+
version.id,
|
|
16856
|
+
VersionStatus.FAILED,
|
|
16857
|
+
errorMessage
|
|
16858
|
+
);
|
|
16859
|
+
logger.warn(`⚠️ Failed to recover job ${versionLabel}: ${error}`);
|
|
16233
16860
|
}
|
|
16234
|
-
const job = {
|
|
16235
|
-
id: jobId,
|
|
16236
|
-
library: version.library_name,
|
|
16237
|
-
version: version.name || "",
|
|
16238
|
-
status: PipelineJobStatus.QUEUED,
|
|
16239
|
-
progress: null,
|
|
16240
|
-
error: null,
|
|
16241
|
-
createdAt: new Date(version.created_at),
|
|
16242
|
-
// For recovered QUEUED jobs, startedAt must be null to reflect queued state.
|
|
16243
|
-
startedAt: null,
|
|
16244
|
-
finishedAt: null,
|
|
16245
|
-
abortController,
|
|
16246
|
-
completionPromise,
|
|
16247
|
-
resolveCompletion,
|
|
16248
|
-
rejectCompletion,
|
|
16249
|
-
// Database fields (single source of truth)
|
|
16250
|
-
versionId: version.id,
|
|
16251
|
-
versionStatus: version.status,
|
|
16252
|
-
progressPages: version.progress_pages,
|
|
16253
|
-
progressMaxPages: version.progress_max_pages,
|
|
16254
|
-
errorMessage: version.error_message,
|
|
16255
|
-
updatedAt: new Date(version.updated_at),
|
|
16256
|
-
sourceUrl: version.source_url,
|
|
16257
|
-
scraperOptions: parsedScraperOptions
|
|
16258
|
-
};
|
|
16259
|
-
this.jobMap.set(jobId, job);
|
|
16260
|
-
this.jobQueue.push(jobId);
|
|
16261
|
-
}
|
|
16262
|
-
if (queuedVersions.length > 0) {
|
|
16263
|
-
logger.info(`📥 Recovered ${queuedVersions.length} pending job(s) from database`);
|
|
16264
|
-
} else {
|
|
16265
|
-
logger.debug("No pending jobs to recover from database");
|
|
16266
16861
|
}
|
|
16267
16862
|
} catch (error) {
|
|
16268
16863
|
logger.error(`❌ Failed to recover pending jobs: ${error}`);
|
|
16269
16864
|
}
|
|
16270
16865
|
}
|
|
16866
|
+
/**
|
|
16867
|
+
* Marks all interrupted jobs (RUNNING/QUEUED) as FAILED.
|
|
16868
|
+
* Called when recoverJobs is false to allow users to manually retry via UI.
|
|
16869
|
+
*/
|
|
16870
|
+
async markInterruptedJobsAsFailed() {
|
|
16871
|
+
try {
|
|
16872
|
+
const interruptedVersions = await this.store.getVersionsByStatus([
|
|
16873
|
+
VersionStatus.RUNNING,
|
|
16874
|
+
VersionStatus.QUEUED
|
|
16875
|
+
]);
|
|
16876
|
+
if (interruptedVersions.length === 0) {
|
|
16877
|
+
logger.debug("No interrupted jobs to mark as failed");
|
|
16878
|
+
return;
|
|
16879
|
+
}
|
|
16880
|
+
for (const version of interruptedVersions) {
|
|
16881
|
+
await this.store.updateVersionStatus(
|
|
16882
|
+
version.id,
|
|
16883
|
+
VersionStatus.FAILED,
|
|
16884
|
+
"Job interrupted"
|
|
16885
|
+
);
|
|
16886
|
+
logger.info(
|
|
16887
|
+
`❌ Marked interrupted job as failed: ${version.library_name}@${version.name || "latest"}`
|
|
16888
|
+
);
|
|
16889
|
+
}
|
|
16890
|
+
} catch (error) {
|
|
16891
|
+
logger.error(`❌ Failed to mark interrupted jobs as failed: ${error}`);
|
|
16892
|
+
}
|
|
16893
|
+
}
|
|
16271
16894
|
/**
|
|
16272
16895
|
* Stops the pipeline manager and attempts to gracefully shut down workers.
|
|
16273
16896
|
* Currently, it just stops processing new jobs. Cancellation of active jobs
|
|
@@ -16280,7 +16903,6 @@ class PipelineManager {
|
|
|
16280
16903
|
}
|
|
16281
16904
|
this.isRunning = false;
|
|
16282
16905
|
logger.debug("PipelineManager stopping. No new jobs will be started.");
|
|
16283
|
-
await this.scraperService.cleanup();
|
|
16284
16906
|
}
|
|
16285
16907
|
/**
|
|
16286
16908
|
* Enqueues a new document processing job, aborting any existing QUEUED/RUNNING job for the same library+version (including unversioned).
|
|
@@ -16870,6 +17492,7 @@ function createAppServerConfig(options) {
|
|
|
16870
17492
|
enableWorker: options.enableWorker ?? true,
|
|
16871
17493
|
port: options.port,
|
|
16872
17494
|
externalWorkerUrl: options.externalWorkerUrl,
|
|
17495
|
+
showLogo: options.showLogo ?? true,
|
|
16873
17496
|
startupContext: options.startupContext
|
|
16874
17497
|
};
|
|
16875
17498
|
}
|
|
@@ -17064,6 +17687,7 @@ function createDefaultAction(cli) {
|
|
|
17064
17687
|
enableApiServer: true,
|
|
17065
17688
|
enableWorker: true,
|
|
17066
17689
|
port: appConfig.server.ports.default,
|
|
17690
|
+
showLogo: argv.logo,
|
|
17067
17691
|
startupContext: {
|
|
17068
17692
|
cliCommand: "default",
|
|
17069
17693
|
mcpProtocol: "http"
|
|
@@ -17351,6 +17975,7 @@ function createMcpCommand(cli) {
|
|
|
17351
17975
|
enableWorker: !serverUrl,
|
|
17352
17976
|
port: appConfig.server.ports.mcp,
|
|
17353
17977
|
externalWorkerUrl: serverUrl,
|
|
17978
|
+
showLogo: argv.logo,
|
|
17354
17979
|
startupContext: {
|
|
17355
17980
|
cliCommand: "mcp",
|
|
17356
17981
|
mcpProtocol: "http"
|
|
@@ -17877,6 +18502,7 @@ function createWebCommand(cli) {
|
|
|
17877
18502
|
enableWorker: !serverUrl,
|
|
17878
18503
|
port: appConfig.server.ports.web,
|
|
17879
18504
|
externalWorkerUrl: serverUrl,
|
|
18505
|
+
showLogo: argv.logo,
|
|
17880
18506
|
startupContext: {
|
|
17881
18507
|
cliCommand: "web"
|
|
17882
18508
|
}
|
|
@@ -17961,6 +18587,7 @@ function createWorkerCommand(cli) {
|
|
|
17961
18587
|
enableApiServer: true,
|
|
17962
18588
|
enableWorker: true,
|
|
17963
18589
|
port: appConfig.server.ports.worker,
|
|
18590
|
+
showLogo: argv.logo,
|
|
17964
18591
|
startupContext: {
|
|
17965
18592
|
cliCommand: "worker"
|
|
17966
18593
|
}
|
|
@@ -17989,7 +18616,7 @@ function createCli(argv) {
|
|
|
17989
18616
|
let globalEventBus = null;
|
|
17990
18617
|
let globalTelemetryService = null;
|
|
17991
18618
|
const commandStartTimes = /* @__PURE__ */ new Map();
|
|
17992
|
-
const cli = yargs(hideBin(argv)).scriptName("docs-mcp-server").strict().usage("Usage: $0 <command> [options]").version("
|
|
18619
|
+
const cli = yargs(hideBin(argv)).scriptName("docs-mcp-server").strict().usage("Usage: $0 <command> [options]").version("2.0.0").option("verbose", {
|
|
17993
18620
|
type: "boolean",
|
|
17994
18621
|
description: "Enable verbose (debug) logging",
|
|
17995
18622
|
default: false
|
|
@@ -18013,6 +18640,10 @@ function createCli(argv) {
|
|
|
18013
18640
|
}).option("config", {
|
|
18014
18641
|
type: "string",
|
|
18015
18642
|
description: "Path to configuration file"
|
|
18643
|
+
}).option("logo", {
|
|
18644
|
+
type: "boolean",
|
|
18645
|
+
description: "Show ASCII art logo on startup",
|
|
18646
|
+
default: true
|
|
18016
18647
|
}).middleware(async (argv2) => {
|
|
18017
18648
|
if (argv2.verbose && argv2.silent) {
|
|
18018
18649
|
throw new Error("Arguments verbose and silent are mutually exclusive");
|
|
@@ -18045,7 +18676,7 @@ function createCli(argv) {
|
|
|
18045
18676
|
if (shouldEnableTelemetry() && telemetry.isEnabled()) {
|
|
18046
18677
|
const commandName = argv2._[0]?.toString() || "default";
|
|
18047
18678
|
telemetry.setGlobalContext({
|
|
18048
|
-
appVersion: "
|
|
18679
|
+
appVersion: "2.0.0",
|
|
18049
18680
|
appPlatform: process.platform,
|
|
18050
18681
|
appNodeVersion: process.version,
|
|
18051
18682
|
appInterface: "cli",
|