@arabold/docs-mcp-server 1.37.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/index.js +750 -165
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -64,11 +64,12 @@ import { escapeHtml } from "@kitajs/html";
|
|
|
64
64
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
65
65
|
import { v4 } from "uuid";
|
|
66
66
|
import { minimatch } from "minimatch";
|
|
67
|
+
import { exec, execSync } from "node:child_process";
|
|
68
|
+
import { promisify } from "node:util";
|
|
67
69
|
import { Readable } from "node:stream";
|
|
68
70
|
import * as tar from "tar";
|
|
69
71
|
import yauzl from "yauzl";
|
|
70
72
|
import os from "node:os";
|
|
71
|
-
import { execSync } from "node:child_process";
|
|
72
73
|
class StoreError extends Error {
|
|
73
74
|
constructor(message, cause) {
|
|
74
75
|
super(cause ? `${message} caused by ${cause}` : message);
|
|
@@ -919,6 +920,10 @@ const DEFAULT_CONFIG = {
|
|
|
919
920
|
baseDelayMs: 1e3,
|
|
920
921
|
maxCacheItems: 200,
|
|
921
922
|
maxCacheItemSizeBytes: 500 * 1024
|
|
923
|
+
},
|
|
924
|
+
document: {
|
|
925
|
+
maxSize: 10 * 1024 * 1024
|
|
926
|
+
// 10MB max size for PDF/Office documents
|
|
922
927
|
}
|
|
923
928
|
},
|
|
924
929
|
splitter: {
|
|
@@ -957,10 +962,6 @@ const DEFAULT_CONFIG = {
|
|
|
957
962
|
precedingSiblingsLimit: 1,
|
|
958
963
|
subsequentSiblingsLimit: 2,
|
|
959
964
|
maxChunkDistance: 3
|
|
960
|
-
},
|
|
961
|
-
document: {
|
|
962
|
-
maxSize: 10 * 1024 * 1024
|
|
963
|
-
// 10MB max size for PDF/Office documents
|
|
964
965
|
}
|
|
965
966
|
};
|
|
966
967
|
const AppConfigSchema = z.object({
|
|
@@ -997,7 +998,10 @@ const AppConfigSchema = z.object({
|
|
|
997
998
|
baseDelayMs: z.coerce.number().int().default(DEFAULT_CONFIG.scraper.fetcher.baseDelayMs),
|
|
998
999
|
maxCacheItems: z.coerce.number().int().default(DEFAULT_CONFIG.scraper.fetcher.maxCacheItems),
|
|
999
1000
|
maxCacheItemSizeBytes: z.coerce.number().int().default(DEFAULT_CONFIG.scraper.fetcher.maxCacheItemSizeBytes)
|
|
1000
|
-
}).default(DEFAULT_CONFIG.scraper.fetcher)
|
|
1001
|
+
}).default(DEFAULT_CONFIG.scraper.fetcher),
|
|
1002
|
+
document: z.object({
|
|
1003
|
+
maxSize: z.coerce.number().int().default(DEFAULT_CONFIG.scraper.document.maxSize)
|
|
1004
|
+
}).default(DEFAULT_CONFIG.scraper.document)
|
|
1001
1005
|
}).default(DEFAULT_CONFIG.scraper),
|
|
1002
1006
|
splitter: z.object({
|
|
1003
1007
|
minChunkSize: z.coerce.number().int().default(DEFAULT_CONFIG.splitter.minChunkSize),
|
|
@@ -1035,10 +1039,7 @@ const AppConfigSchema = z.object({
|
|
|
1035
1039
|
precedingSiblingsLimit: z.coerce.number().int().default(DEFAULT_CONFIG.assembly.precedingSiblingsLimit),
|
|
1036
1040
|
subsequentSiblingsLimit: z.coerce.number().int().default(DEFAULT_CONFIG.assembly.subsequentSiblingsLimit),
|
|
1037
1041
|
maxChunkDistance: z.coerce.number().int().min(0).default(DEFAULT_CONFIG.assembly.maxChunkDistance)
|
|
1038
|
-
}).default(DEFAULT_CONFIG.assembly)
|
|
1039
|
-
document: z.object({
|
|
1040
|
-
maxSize: z.coerce.number().int().default(DEFAULT_CONFIG.document.maxSize)
|
|
1041
|
-
}).default(DEFAULT_CONFIG.document)
|
|
1042
|
+
}).default(DEFAULT_CONFIG.assembly)
|
|
1042
1043
|
});
|
|
1043
1044
|
const defaults = AppConfigSchema.parse({});
|
|
1044
1045
|
const configMappings = [
|
|
@@ -1159,6 +1160,12 @@ function mapEnvToConfig() {
|
|
|
1159
1160
|
}
|
|
1160
1161
|
}
|
|
1161
1162
|
}
|
|
1163
|
+
for (const pathArr of ALL_CONFIG_LEAF_PATHS) {
|
|
1164
|
+
const envVar = pathToEnvVar(pathArr);
|
|
1165
|
+
if (process.env[envVar] !== void 0) {
|
|
1166
|
+
setAtPath(config, pathArr, process.env[envVar]);
|
|
1167
|
+
}
|
|
1168
|
+
}
|
|
1162
1169
|
return config;
|
|
1163
1170
|
}
|
|
1164
1171
|
function mapCliToConfig(args) {
|
|
@@ -1170,6 +1177,25 @@ function mapCliToConfig(args) {
|
|
|
1170
1177
|
}
|
|
1171
1178
|
return config;
|
|
1172
1179
|
}
|
|
1180
|
+
function camelToUpperSnake(str) {
|
|
1181
|
+
return str.replace(/([a-z])([A-Z])/g, "$1_$2").toUpperCase();
|
|
1182
|
+
}
|
|
1183
|
+
function pathToEnvVar(pathArr) {
|
|
1184
|
+
return `DOCS_MCP_${pathArr.map(camelToUpperSnake).join("_")}`;
|
|
1185
|
+
}
|
|
1186
|
+
function collectLeafPaths(obj, prefix = []) {
|
|
1187
|
+
const paths = [];
|
|
1188
|
+
for (const [key, value] of Object.entries(obj)) {
|
|
1189
|
+
const currentPath = [...prefix, key];
|
|
1190
|
+
if (value !== null && typeof value === "object" && !Array.isArray(value)) {
|
|
1191
|
+
paths.push(...collectLeafPaths(value, currentPath));
|
|
1192
|
+
} else {
|
|
1193
|
+
paths.push(currentPath);
|
|
1194
|
+
}
|
|
1195
|
+
}
|
|
1196
|
+
return paths;
|
|
1197
|
+
}
|
|
1198
|
+
const ALL_CONFIG_LEAF_PATHS = collectLeafPaths(DEFAULT_CONFIG);
|
|
1173
1199
|
function setAtPath(obj, pathArr, value) {
|
|
1174
1200
|
let current = obj;
|
|
1175
1201
|
for (let i = 0; i < pathArr.length - 1; i++) {
|
|
@@ -1206,17 +1232,168 @@ function deepMerge(target, source) {
|
|
|
1206
1232
|
}
|
|
1207
1233
|
return output;
|
|
1208
1234
|
}
|
|
1235
|
+
function isValidConfigPath(path2) {
|
|
1236
|
+
const pathArr = path2.split(".");
|
|
1237
|
+
return getAtPath(DEFAULT_CONFIG, pathArr) !== void 0;
|
|
1238
|
+
}
|
|
1239
|
+
function getConfigValue(config, path2) {
|
|
1240
|
+
const pathArr = path2.split(".");
|
|
1241
|
+
return getAtPath(config, pathArr);
|
|
1242
|
+
}
|
|
1243
|
+
function parseConfigValue(value) {
|
|
1244
|
+
const num = Number(value);
|
|
1245
|
+
if (!Number.isNaN(num) && value.trim() !== "") {
|
|
1246
|
+
return num;
|
|
1247
|
+
}
|
|
1248
|
+
const lower = value.toLowerCase();
|
|
1249
|
+
if (lower === "true") return true;
|
|
1250
|
+
if (lower === "false") return false;
|
|
1251
|
+
return value;
|
|
1252
|
+
}
|
|
1253
|
+
function setConfigValue(path2, value) {
|
|
1254
|
+
const configPath = getDefaultConfigPath();
|
|
1255
|
+
const fileConfig = loadConfigFile(configPath) || {};
|
|
1256
|
+
const pathArr = path2.split(".");
|
|
1257
|
+
const parsedValue = parseConfigValue(value);
|
|
1258
|
+
const updatedConfig = JSON.parse(JSON.stringify(fileConfig));
|
|
1259
|
+
setAtPath(updatedConfig, pathArr, parsedValue);
|
|
1260
|
+
try {
|
|
1261
|
+
AppConfigSchema.parse(updatedConfig);
|
|
1262
|
+
} catch (err) {
|
|
1263
|
+
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
1264
|
+
throw new Error(`Invalid config value for "${path2}": ${errorMsg}`);
|
|
1265
|
+
}
|
|
1266
|
+
saveConfigFile(configPath, updatedConfig);
|
|
1267
|
+
return configPath;
|
|
1268
|
+
}
|
|
1269
|
+
function getDefaultConfigPath() {
|
|
1270
|
+
return path.join(systemPaths.config, "config.yaml");
|
|
1271
|
+
}
|
|
1272
|
+
function formatOutput$1(value, format) {
|
|
1273
|
+
if (format === "auto") {
|
|
1274
|
+
if (typeof value === "object" && value !== null) {
|
|
1275
|
+
return JSON.stringify(value, null, 2);
|
|
1276
|
+
}
|
|
1277
|
+
return String(value);
|
|
1278
|
+
}
|
|
1279
|
+
if (format === "yaml") {
|
|
1280
|
+
return yaml.stringify(value).trim();
|
|
1281
|
+
}
|
|
1282
|
+
return JSON.stringify(value, null, 2);
|
|
1283
|
+
}
|
|
1209
1284
|
function createConfigCommand(cli) {
|
|
1210
1285
|
cli.command(
|
|
1211
1286
|
"config",
|
|
1212
|
-
"
|
|
1213
|
-
(yargs2) =>
|
|
1287
|
+
"View or modify configuration",
|
|
1288
|
+
(yargs2) => {
|
|
1289
|
+
return yargs2.option("json", {
|
|
1290
|
+
type: "boolean",
|
|
1291
|
+
description: "Output in JSON format",
|
|
1292
|
+
conflicts: "yaml"
|
|
1293
|
+
}).option("yaml", {
|
|
1294
|
+
type: "boolean",
|
|
1295
|
+
description: "Output in YAML format",
|
|
1296
|
+
conflicts: "json"
|
|
1297
|
+
}).command(
|
|
1298
|
+
"get <path>",
|
|
1299
|
+
"Get a configuration value",
|
|
1300
|
+
(y) => y.positional("path", {
|
|
1301
|
+
type: "string",
|
|
1302
|
+
description: "Dot-separated config path (e.g., scraper.maxPages)",
|
|
1303
|
+
demandOption: true
|
|
1304
|
+
}).option("json", {
|
|
1305
|
+
type: "boolean",
|
|
1306
|
+
description: "Output in JSON format",
|
|
1307
|
+
conflicts: "yaml"
|
|
1308
|
+
}).option("yaml", {
|
|
1309
|
+
type: "boolean",
|
|
1310
|
+
description: "Output in YAML format",
|
|
1311
|
+
conflicts: "json"
|
|
1312
|
+
}),
|
|
1313
|
+
(argv) => {
|
|
1314
|
+
const path2 = argv.path;
|
|
1315
|
+
if (!isValidConfigPath(path2)) {
|
|
1316
|
+
console.error(`Error: Invalid config path '${path2}'`);
|
|
1317
|
+
console.error("Use 'docs-mcp-server config' to see all available paths.");
|
|
1318
|
+
process.exitCode = 1;
|
|
1319
|
+
return;
|
|
1320
|
+
}
|
|
1321
|
+
const config = loadConfig(argv, {
|
|
1322
|
+
configPath: argv.config,
|
|
1323
|
+
searchDir: argv.storePath
|
|
1324
|
+
});
|
|
1325
|
+
const value = getConfigValue(config, path2);
|
|
1326
|
+
const format = argv.json ? "json" : argv.yaml ? "yaml" : "auto";
|
|
1327
|
+
console.log(formatOutput$1(value, format));
|
|
1328
|
+
}
|
|
1329
|
+
).command(
|
|
1330
|
+
"set <path> <value>",
|
|
1331
|
+
"Set a configuration value",
|
|
1332
|
+
(y) => y.positional("path", {
|
|
1333
|
+
type: "string",
|
|
1334
|
+
description: "Dot-separated config path (e.g., scraper.maxPages)",
|
|
1335
|
+
demandOption: true
|
|
1336
|
+
}).positional("value", {
|
|
1337
|
+
type: "string",
|
|
1338
|
+
description: "Value to set",
|
|
1339
|
+
demandOption: true
|
|
1340
|
+
}),
|
|
1341
|
+
(argv) => {
|
|
1342
|
+
const configPath = argv.config;
|
|
1343
|
+
const path2 = argv.path;
|
|
1344
|
+
const value = argv.value;
|
|
1345
|
+
if (configPath) {
|
|
1346
|
+
console.error(
|
|
1347
|
+
"Error: Cannot modify configuration when using explicit --config file."
|
|
1348
|
+
);
|
|
1349
|
+
console.error(
|
|
1350
|
+
"Remove the --config flag to modify the default configuration."
|
|
1351
|
+
);
|
|
1352
|
+
process.exitCode = 1;
|
|
1353
|
+
return;
|
|
1354
|
+
}
|
|
1355
|
+
if (!isValidConfigPath(path2)) {
|
|
1356
|
+
console.error(`Error: Invalid config path '${path2}'`);
|
|
1357
|
+
console.error("Use 'docs-mcp-server config' to see all available paths.");
|
|
1358
|
+
process.exitCode = 1;
|
|
1359
|
+
return;
|
|
1360
|
+
}
|
|
1361
|
+
const config = loadConfig(argv, {
|
|
1362
|
+
configPath: argv.config,
|
|
1363
|
+
searchDir: argv.storePath
|
|
1364
|
+
});
|
|
1365
|
+
const currentValue = getConfigValue(config, path2);
|
|
1366
|
+
if (currentValue !== void 0 && currentValue !== null && typeof currentValue === "object" && !Array.isArray(currentValue)) {
|
|
1367
|
+
console.error(
|
|
1368
|
+
`Error: Config path '${path2}' refers to an object. Use a more specific leaf path to set a scalar value.`
|
|
1369
|
+
);
|
|
1370
|
+
console.error(
|
|
1371
|
+
"Hint: Run 'docs-mcp-server config' to inspect the current structure."
|
|
1372
|
+
);
|
|
1373
|
+
process.exitCode = 1;
|
|
1374
|
+
return;
|
|
1375
|
+
}
|
|
1376
|
+
try {
|
|
1377
|
+
const savedPath = setConfigValue(path2, value);
|
|
1378
|
+
const parsedValue = parseConfigValue(value);
|
|
1379
|
+
console.log(`Updated ${path2} = ${JSON.stringify(parsedValue)}`);
|
|
1380
|
+
console.log(`Saved to: ${savedPath}`);
|
|
1381
|
+
} catch (error) {
|
|
1382
|
+
console.error(
|
|
1383
|
+
`Error: Failed to save configuration: ${error instanceof Error ? error.message : String(error)}`
|
|
1384
|
+
);
|
|
1385
|
+
process.exitCode = 1;
|
|
1386
|
+
}
|
|
1387
|
+
}
|
|
1388
|
+
);
|
|
1389
|
+
},
|
|
1214
1390
|
(argv) => {
|
|
1215
1391
|
const config = loadConfig(argv, {
|
|
1216
1392
|
configPath: argv.config,
|
|
1217
1393
|
searchDir: argv.storePath
|
|
1218
1394
|
});
|
|
1219
|
-
|
|
1395
|
+
const format = argv.json ? "json" : argv.yaml ? "yaml" : "json";
|
|
1396
|
+
console.log(formatOutput$1(config, format));
|
|
1220
1397
|
}
|
|
1221
1398
|
);
|
|
1222
1399
|
}
|
|
@@ -2443,42 +2620,135 @@ class MimeTypeUtils {
|
|
|
2443
2620
|
static detectMimeTypeFromPath(filePath) {
|
|
2444
2621
|
const extension = filePath.toLowerCase().split(".").pop();
|
|
2445
2622
|
const customMimeTypes = {
|
|
2623
|
+
// JavaScript/TypeScript family
|
|
2446
2624
|
ts: "text/x-typescript",
|
|
2447
2625
|
tsx: "text/x-tsx",
|
|
2626
|
+
mts: "text/x-typescript",
|
|
2627
|
+
// TypeScript ES modules
|
|
2628
|
+
cts: "text/x-typescript",
|
|
2629
|
+
// TypeScript CommonJS modules
|
|
2448
2630
|
js: "text/javascript",
|
|
2449
2631
|
jsx: "text/x-jsx",
|
|
2450
2632
|
cjs: "text/javascript",
|
|
2451
2633
|
// CommonJS modules
|
|
2452
2634
|
mjs: "text/javascript",
|
|
2453
2635
|
// ES modules
|
|
2636
|
+
// Python family
|
|
2454
2637
|
py: "text/x-python",
|
|
2455
2638
|
pyw: "text/x-python",
|
|
2456
2639
|
pyi: "text/x-python",
|
|
2640
|
+
pyx: "text/x-cython",
|
|
2641
|
+
// Cython
|
|
2642
|
+
pxd: "text/x-cython",
|
|
2643
|
+
// Cython
|
|
2644
|
+
// Systems languages
|
|
2457
2645
|
go: "text/x-go",
|
|
2458
2646
|
rs: "text/x-rust",
|
|
2647
|
+
c: "text/x-csrc",
|
|
2648
|
+
h: "text/x-chdr",
|
|
2649
|
+
cpp: "text/x-c++src",
|
|
2650
|
+
cxx: "text/x-c++src",
|
|
2651
|
+
cc: "text/x-c++src",
|
|
2652
|
+
hpp: "text/x-c++hdr",
|
|
2653
|
+
hxx: "text/x-c++hdr",
|
|
2654
|
+
zig: "text/x-zig",
|
|
2655
|
+
nim: "text/x-nim",
|
|
2656
|
+
v: "text/x-v",
|
|
2657
|
+
cr: "text/x-crystal",
|
|
2658
|
+
// JVM languages
|
|
2459
2659
|
kt: "text/x-kotlin",
|
|
2660
|
+
kts: "text/x-kotlin",
|
|
2661
|
+
// Kotlin script
|
|
2460
2662
|
scala: "text/x-scala",
|
|
2663
|
+
groovy: "text/x-groovy",
|
|
2664
|
+
gradle: "text/x-gradle",
|
|
2665
|
+
// Apple/Mobile
|
|
2461
2666
|
swift: "text/x-swift",
|
|
2667
|
+
dart: "text/x-dart",
|
|
2668
|
+
// Scripting languages
|
|
2462
2669
|
rb: "text/x-ruby",
|
|
2670
|
+
rake: "text/x-ruby",
|
|
2671
|
+
// Rakefile
|
|
2463
2672
|
php: "text/x-php",
|
|
2673
|
+
lua: "text/x-lua",
|
|
2674
|
+
pl: "text/x-perl",
|
|
2675
|
+
pm: "text/x-perl",
|
|
2676
|
+
r: "text/x-r",
|
|
2677
|
+
// Also handles .R since extension is lowercased
|
|
2678
|
+
// Functional languages
|
|
2679
|
+
hs: "text/x-haskell",
|
|
2680
|
+
lhs: "text/x-haskell",
|
|
2681
|
+
// Literate Haskell
|
|
2682
|
+
elm: "text/x-elm",
|
|
2683
|
+
erl: "text/x-erlang",
|
|
2684
|
+
ex: "text/x-elixir",
|
|
2685
|
+
exs: "text/x-elixir",
|
|
2686
|
+
clj: "text/x-clojure",
|
|
2687
|
+
cljs: "text/x-clojure",
|
|
2688
|
+
cljc: "text/x-clojure",
|
|
2689
|
+
jl: "text/x-julia",
|
|
2690
|
+
// .NET
|
|
2464
2691
|
cs: "text/x-csharp",
|
|
2465
|
-
|
|
2466
|
-
|
|
2467
|
-
|
|
2468
|
-
|
|
2469
|
-
|
|
2470
|
-
|
|
2471
|
-
|
|
2692
|
+
// Web3/Smart contracts
|
|
2693
|
+
sol: "text/x-solidity",
|
|
2694
|
+
move: "text/x-move",
|
|
2695
|
+
cairo: "text/x-cairo",
|
|
2696
|
+
// Modern web frameworks
|
|
2697
|
+
vue: "text/x-vue",
|
|
2698
|
+
svelte: "text/x-svelte",
|
|
2699
|
+
astro: "text/x-astro",
|
|
2700
|
+
// Shell scripting
|
|
2472
2701
|
sh: "text/x-shellscript",
|
|
2473
2702
|
bash: "text/x-shellscript",
|
|
2474
2703
|
zsh: "text/x-shellscript",
|
|
2475
2704
|
fish: "text/x-shellscript",
|
|
2476
2705
|
ps1: "text/x-powershell",
|
|
2706
|
+
// Documentation formats
|
|
2707
|
+
rst: "text/x-rst",
|
|
2708
|
+
// reStructuredText
|
|
2709
|
+
adoc: "text/x-asciidoc",
|
|
2710
|
+
asciidoc: "text/x-asciidoc",
|
|
2711
|
+
textile: "text/x-textile",
|
|
2712
|
+
org: "text/x-org",
|
|
2713
|
+
// Org-mode
|
|
2714
|
+
pod: "text/x-pod",
|
|
2715
|
+
// Perl documentation
|
|
2716
|
+
rdoc: "text/x-rdoc",
|
|
2717
|
+
// Ruby documentation
|
|
2718
|
+
wiki: "text/x-wiki",
|
|
2719
|
+
rmd: "text/x-rmarkdown",
|
|
2720
|
+
// R Markdown
|
|
2721
|
+
// Configuration files
|
|
2722
|
+
toml: "text/x-toml",
|
|
2723
|
+
ini: "text/x-ini",
|
|
2724
|
+
cfg: "text/x-ini",
|
|
2725
|
+
conf: "text/x-conf",
|
|
2726
|
+
properties: "text/x-properties",
|
|
2727
|
+
env: "text/x-dotenv",
|
|
2728
|
+
// Build systems
|
|
2729
|
+
dockerfile: "text/x-dockerfile",
|
|
2730
|
+
containerfile: "text/x-dockerfile",
|
|
2731
|
+
makefile: "text/x-makefile",
|
|
2732
|
+
cmake: "text/x-cmake",
|
|
2733
|
+
bazel: "text/x-bazel",
|
|
2734
|
+
bzl: "text/x-bazel",
|
|
2735
|
+
buck: "text/x-buck",
|
|
2736
|
+
// Infrastructure as Code
|
|
2737
|
+
tf: "text/x-terraform",
|
|
2738
|
+
tfvars: "text/x-terraform",
|
|
2739
|
+
hcl: "text/x-hcl",
|
|
2740
|
+
// Data/Query languages
|
|
2477
2741
|
sql: "text/x-sql",
|
|
2478
2742
|
graphql: "text/x-graphql",
|
|
2479
2743
|
gql: "text/x-graphql",
|
|
2744
|
+
// Schema/API definitions
|
|
2480
2745
|
proto: "text/x-proto",
|
|
2481
|
-
|
|
2746
|
+
prisma: "text/x-prisma",
|
|
2747
|
+
thrift: "text/x-thrift",
|
|
2748
|
+
avro: "text/x-avro",
|
|
2749
|
+
// TeX/LaTeX
|
|
2750
|
+
tex: "text/x-tex",
|
|
2751
|
+
latex: "text/x-latex"
|
|
2482
2752
|
};
|
|
2483
2753
|
if (extension && customMimeTypes[extension]) {
|
|
2484
2754
|
return customMimeTypes[extension];
|
|
@@ -2498,8 +2768,24 @@ class MimeTypeUtils {
|
|
|
2498
2768
|
return null;
|
|
2499
2769
|
}
|
|
2500
2770
|
const mimeTypeNormalization = {
|
|
2501
|
-
"application/node": "text/javascript"
|
|
2502
|
-
// .cjs files
|
|
2771
|
+
"application/node": "text/javascript",
|
|
2772
|
+
// .cjs files
|
|
2773
|
+
"video/mp2t": "text/x-typescript",
|
|
2774
|
+
// .ts/.mts files (MPEG-2 transport stream conflict)
|
|
2775
|
+
"application/rls-services+xml": "text/x-rust",
|
|
2776
|
+
// .rs files
|
|
2777
|
+
"application/vnd.lotus-organizer": "text/x-org",
|
|
2778
|
+
// .org files (Lotus Organizer conflict)
|
|
2779
|
+
"application/vnd.dart": "text/x-dart",
|
|
2780
|
+
// .dart files
|
|
2781
|
+
"application/x-perl": "text/x-perl",
|
|
2782
|
+
// .pl/.pm files
|
|
2783
|
+
"application/x-tex": "text/x-tex",
|
|
2784
|
+
// .tex files
|
|
2785
|
+
"application/x-latex": "text/x-latex",
|
|
2786
|
+
// .latex files
|
|
2787
|
+
"application/toml": "text/x-toml"
|
|
2788
|
+
// .toml files
|
|
2503
2789
|
};
|
|
2504
2790
|
return mimeTypeNormalization[mimeType] || mimeType;
|
|
2505
2791
|
}
|
|
@@ -2511,6 +2797,7 @@ class MimeTypeUtils {
|
|
|
2511
2797
|
*/
|
|
2512
2798
|
static extractLanguageFromMimeType(mimeType) {
|
|
2513
2799
|
const mimeToLanguage = {
|
|
2800
|
+
// JavaScript/TypeScript
|
|
2514
2801
|
"text/x-typescript": "typescript",
|
|
2515
2802
|
"text/typescript": "typescript",
|
|
2516
2803
|
"application/typescript": "typescript",
|
|
@@ -2519,22 +2806,84 @@ class MimeTypeUtils {
|
|
|
2519
2806
|
"application/javascript": "javascript",
|
|
2520
2807
|
"application/x-javascript": "javascript",
|
|
2521
2808
|
"text/x-jsx": "jsx",
|
|
2809
|
+
// Python
|
|
2522
2810
|
"text/x-python": "python",
|
|
2523
|
-
"text/x-
|
|
2811
|
+
"text/x-cython": "cython",
|
|
2812
|
+
// Systems languages
|
|
2524
2813
|
"text/x-c": "c",
|
|
2525
2814
|
"text/x-csrc": "c",
|
|
2526
2815
|
"text/x-chdr": "c",
|
|
2527
2816
|
"text/x-c++": "cpp",
|
|
2528
2817
|
"text/x-c++src": "cpp",
|
|
2529
2818
|
"text/x-c++hdr": "cpp",
|
|
2530
|
-
"text/x-csharp": "csharp",
|
|
2531
2819
|
"text/x-go": "go",
|
|
2532
2820
|
"text/x-rust": "rust",
|
|
2533
|
-
"text/x-
|
|
2534
|
-
"text/x-
|
|
2535
|
-
"text/x-
|
|
2821
|
+
"text/x-zig": "zig",
|
|
2822
|
+
"text/x-nim": "nim",
|
|
2823
|
+
"text/x-v": "v",
|
|
2824
|
+
"text/x-crystal": "crystal",
|
|
2825
|
+
// JVM languages
|
|
2826
|
+
"text/x-java": "java",
|
|
2536
2827
|
"text/x-kotlin": "kotlin",
|
|
2537
2828
|
"text/x-scala": "scala",
|
|
2829
|
+
"text/x-groovy": "groovy",
|
|
2830
|
+
"text/x-gradle": "groovy",
|
|
2831
|
+
// Apple/Mobile
|
|
2832
|
+
"text/x-swift": "swift",
|
|
2833
|
+
"text/x-dart": "dart",
|
|
2834
|
+
// .NET
|
|
2835
|
+
"text/x-csharp": "csharp",
|
|
2836
|
+
// Scripting languages
|
|
2837
|
+
"text/x-ruby": "ruby",
|
|
2838
|
+
"text/x-php": "php",
|
|
2839
|
+
"text/x-lua": "lua",
|
|
2840
|
+
"text/x-perl": "perl",
|
|
2841
|
+
"text/x-r": "r",
|
|
2842
|
+
// Functional languages
|
|
2843
|
+
"text/x-haskell": "haskell",
|
|
2844
|
+
"text/x-elm": "elm",
|
|
2845
|
+
"text/x-erlang": "erlang",
|
|
2846
|
+
"text/x-elixir": "elixir",
|
|
2847
|
+
"text/x-clojure": "clojure",
|
|
2848
|
+
"text/x-julia": "julia",
|
|
2849
|
+
// Web3/Smart contracts
|
|
2850
|
+
"text/x-solidity": "solidity",
|
|
2851
|
+
"text/x-move": "move",
|
|
2852
|
+
"text/x-cairo": "cairo",
|
|
2853
|
+
// Modern web frameworks
|
|
2854
|
+
"text/x-vue": "vue",
|
|
2855
|
+
"text/x-svelte": "svelte",
|
|
2856
|
+
"text/x-astro": "astro",
|
|
2857
|
+
// Shell
|
|
2858
|
+
"text/x-sh": "bash",
|
|
2859
|
+
"text/x-shellscript": "bash",
|
|
2860
|
+
"application/x-sh": "bash",
|
|
2861
|
+
"text/x-powershell": "powershell",
|
|
2862
|
+
// Documentation formats
|
|
2863
|
+
"text/x-rst": "rst",
|
|
2864
|
+
"text/x-asciidoc": "asciidoc",
|
|
2865
|
+
"text/x-textile": "textile",
|
|
2866
|
+
"text/x-org": "org",
|
|
2867
|
+
"text/x-pod": "pod",
|
|
2868
|
+
"text/x-rdoc": "rdoc",
|
|
2869
|
+
"text/x-wiki": "wiki",
|
|
2870
|
+
"text/x-rmarkdown": "rmarkdown",
|
|
2871
|
+
// Configuration files
|
|
2872
|
+
"text/x-toml": "toml",
|
|
2873
|
+
"text/x-ini": "ini",
|
|
2874
|
+
"text/x-conf": "conf",
|
|
2875
|
+
"text/x-properties": "properties",
|
|
2876
|
+
"text/x-dotenv": "dotenv",
|
|
2877
|
+
// Build systems
|
|
2878
|
+
"text/x-dockerfile": "dockerfile",
|
|
2879
|
+
"text/x-makefile": "makefile",
|
|
2880
|
+
"text/x-cmake": "cmake",
|
|
2881
|
+
"text/x-bazel": "bazel",
|
|
2882
|
+
"text/x-buck": "buck",
|
|
2883
|
+
// Infrastructure as Code
|
|
2884
|
+
"text/x-terraform": "hcl",
|
|
2885
|
+
"text/x-hcl": "hcl",
|
|
2886
|
+
// Data formats
|
|
2538
2887
|
"text/x-yaml": "yaml",
|
|
2539
2888
|
"application/x-yaml": "yaml",
|
|
2540
2889
|
"application/yaml": "yaml",
|
|
@@ -2544,13 +2893,15 @@ class MimeTypeUtils {
|
|
|
2544
2893
|
"text/xml": "xml",
|
|
2545
2894
|
"application/xml": "xml",
|
|
2546
2895
|
"text/x-sql": "sql",
|
|
2547
|
-
"text/x-sh": "bash",
|
|
2548
|
-
"text/x-shellscript": "bash",
|
|
2549
|
-
"application/x-sh": "bash",
|
|
2550
|
-
"text/x-powershell": "powershell",
|
|
2551
2896
|
"text/x-graphql": "graphql",
|
|
2897
|
+
// Schema/API definitions
|
|
2552
2898
|
"text/x-proto": "protobuf",
|
|
2553
|
-
"text/x-
|
|
2899
|
+
"text/x-prisma": "prisma",
|
|
2900
|
+
"text/x-thrift": "thrift",
|
|
2901
|
+
"text/x-avro": "avro",
|
|
2902
|
+
// TeX/LaTeX
|
|
2903
|
+
"text/x-tex": "tex",
|
|
2904
|
+
"text/x-latex": "latex"
|
|
2554
2905
|
};
|
|
2555
2906
|
return mimeToLanguage[mimeType] || "";
|
|
2556
2907
|
}
|
|
@@ -2672,22 +3023,29 @@ class BrowserFetcher {
|
|
|
2672
3023
|
}
|
|
2673
3024
|
}
|
|
2674
3025
|
/**
|
|
2675
|
-
* Close the browser and clean up resources
|
|
3026
|
+
* Close the browser and clean up resources.
|
|
3027
|
+
* Always attempts cleanup even if browser is disconnected to reap zombie processes.
|
|
2676
3028
|
*/
|
|
2677
3029
|
async close() {
|
|
2678
|
-
|
|
2679
|
-
|
|
3030
|
+
if (this.page) {
|
|
3031
|
+
try {
|
|
2680
3032
|
await this.page.close();
|
|
3033
|
+
} catch (error) {
|
|
3034
|
+
logger.warn(`⚠️ Error closing browser page: ${error}`);
|
|
3035
|
+
} finally {
|
|
2681
3036
|
this.page = null;
|
|
2682
3037
|
}
|
|
2683
|
-
|
|
3038
|
+
}
|
|
3039
|
+
if (this.browser) {
|
|
3040
|
+
try {
|
|
2684
3041
|
await this.browser.close();
|
|
3042
|
+
} catch (error) {
|
|
3043
|
+
logger.warn(`⚠️ Error closing browser: ${error}`);
|
|
3044
|
+
} finally {
|
|
2685
3045
|
this.browser = null;
|
|
2686
3046
|
}
|
|
2687
|
-
logger.debug("Browser closed successfully");
|
|
2688
|
-
} catch (error) {
|
|
2689
|
-
logger.warn(`⚠️ Error closing browser: ${error}`);
|
|
2690
3047
|
}
|
|
3048
|
+
logger.debug("Browser closed successfully");
|
|
2691
3049
|
}
|
|
2692
3050
|
}
|
|
2693
3051
|
class FileFetcher {
|
|
@@ -4019,7 +4377,7 @@ class DocumentPipeline extends BasePipeline {
|
|
|
4019
4377
|
constructor(config) {
|
|
4020
4378
|
super();
|
|
4021
4379
|
this.markitdown = new MarkItDown();
|
|
4022
|
-
this.maxSize = config.document.maxSize;
|
|
4380
|
+
this.maxSize = config.scraper.document.maxSize;
|
|
4023
4381
|
const semanticSplitter = new SemanticMarkdownSplitter(
|
|
4024
4382
|
config.splitter.preferredChunkSize,
|
|
4025
4383
|
config.splitter.maxChunkSize
|
|
@@ -4505,12 +4863,18 @@ class HtmlPlaywrightMiddleware {
|
|
|
4505
4863
|
/**
|
|
4506
4864
|
* Closes the Playwright browser instance if it exists.
|
|
4507
4865
|
* Should be called during application shutdown.
|
|
4866
|
+
* Attempts to close even if the browser is disconnected to ensure proper cleanup of zombie processes.
|
|
4508
4867
|
*/
|
|
4509
4868
|
async closeBrowser() {
|
|
4510
|
-
if (this.browser
|
|
4511
|
-
|
|
4512
|
-
|
|
4513
|
-
|
|
4869
|
+
if (this.browser) {
|
|
4870
|
+
try {
|
|
4871
|
+
logger.debug("Closing Playwright browser instance...");
|
|
4872
|
+
await this.browser.close();
|
|
4873
|
+
} catch (error) {
|
|
4874
|
+
logger.warn(`⚠️ Error closing Playwright browser: ${error}`);
|
|
4875
|
+
} finally {
|
|
4876
|
+
this.browser = null;
|
|
4877
|
+
}
|
|
4514
4878
|
}
|
|
4515
4879
|
}
|
|
4516
4880
|
/**
|
|
@@ -5615,10 +5979,15 @@ class HtmlPipeline extends BasePipeline {
|
|
|
5615
5979
|
}
|
|
5616
5980
|
/**
|
|
5617
5981
|
* Cleanup resources used by this pipeline, specifically the Playwright browser instance.
|
|
5982
|
+
* Errors during cleanup are logged but not propagated to ensure graceful shutdown.
|
|
5618
5983
|
*/
|
|
5619
5984
|
async close() {
|
|
5620
5985
|
await super.close();
|
|
5621
|
-
|
|
5986
|
+
try {
|
|
5987
|
+
await this.playwrightMiddleware.closeBrowser();
|
|
5988
|
+
} catch (error) {
|
|
5989
|
+
logger.warn(`⚠️ Error during browser cleanup: ${error}`);
|
|
5990
|
+
}
|
|
5622
5991
|
}
|
|
5623
5992
|
}
|
|
5624
5993
|
class TextDocumentSplitter {
|
|
@@ -6643,6 +7012,11 @@ class TypeScriptParser {
|
|
|
6643
7012
|
".cjs"
|
|
6644
7013
|
];
|
|
6645
7014
|
mimeTypes = [
|
|
7015
|
+
// text/x-* variants (output by MimeTypeUtils.detectMimeTypeFromPath)
|
|
7016
|
+
"text/x-typescript",
|
|
7017
|
+
"text/x-tsx",
|
|
7018
|
+
"text/x-jsx",
|
|
7019
|
+
// Standard variants
|
|
6646
7020
|
"text/typescript",
|
|
6647
7021
|
"application/typescript",
|
|
6648
7022
|
"text/tsx",
|
|
@@ -6984,6 +7358,8 @@ class LanguageParserRegistry {
|
|
|
6984
7358
|
// Narrow advertised extensions/mime types for the alias (informational only).
|
|
6985
7359
|
fileExtensions: [".js", ".jsx", ".mjs", ".cjs"],
|
|
6986
7360
|
mimeTypes: [
|
|
7361
|
+
"text/x-jsx",
|
|
7362
|
+
// Output by MimeTypeUtils.detectMimeTypeFromPath
|
|
6987
7363
|
"text/javascript",
|
|
6988
7364
|
"application/javascript",
|
|
6989
7365
|
"text/jsx",
|
|
@@ -6996,6 +7372,8 @@ class LanguageParserRegistry {
|
|
|
6996
7372
|
this.extensionMap.set(ext.toLowerCase(), "javascript");
|
|
6997
7373
|
}
|
|
6998
7374
|
const jsMimes = [
|
|
7375
|
+
"text/x-jsx",
|
|
7376
|
+
// Output by MimeTypeUtils.detectMimeTypeFromPath
|
|
6999
7377
|
"text/javascript",
|
|
7000
7378
|
"application/javascript",
|
|
7001
7379
|
"text/jsx",
|
|
@@ -11635,7 +12013,7 @@ const Layout = ({
|
|
|
11635
12013
|
children,
|
|
11636
12014
|
eventClientConfig
|
|
11637
12015
|
}) => {
|
|
11638
|
-
const versionString = version || "1.37.0";
|
|
12016
|
+
const versionString = version || "2.0.0";
|
|
11639
12017
|
const versionInitializer = `versionUpdate({ currentVersion: ${`'${versionString}'`} })`;
|
|
11640
12018
|
return /* @__PURE__ */ jsxs("html", { lang: "en", children: [
|
|
11641
12019
|
/* @__PURE__ */ jsxs("head", { children: [
|
|
@@ -13985,7 +14363,7 @@ class AppServer {
|
|
|
13985
14363
|
try {
|
|
13986
14364
|
if (telemetry.isEnabled()) {
|
|
13987
14365
|
telemetry.setGlobalContext({
|
|
13988
|
-
appVersion: "1.37.0",
|
|
14366
|
+
appVersion: "2.0.0",
|
|
13989
14367
|
appPlatform: process.platform,
|
|
13990
14368
|
appNodeVersion: process.version,
|
|
13991
14369
|
appServicesEnabled: this.getActiveServicesList(),
|
|
@@ -14728,6 +15106,9 @@ class BaseScraperStrategy {
|
|
|
14728
15106
|
return null;
|
|
14729
15107
|
}).filter((item2) => item2 !== null);
|
|
14730
15108
|
} catch (error) {
|
|
15109
|
+
if (item.depth === 0) {
|
|
15110
|
+
throw error;
|
|
15111
|
+
}
|
|
14731
15112
|
if (options.ignoreErrors) {
|
|
14732
15113
|
logger.error(`❌ Failed to process ${item.url}: ${error}`);
|
|
14733
15114
|
return [];
|
|
@@ -14842,10 +15223,10 @@ class GitHubRepoProcessor {
|
|
|
14842
15223
|
/**
|
|
14843
15224
|
* Fetches the raw content of a file from GitHub.
|
|
14844
15225
|
*/
|
|
14845
|
-
async fetchFileContent(repoInfo, filePath, etag, signal) {
|
|
15226
|
+
async fetchFileContent(repoInfo, filePath, etag, headers, signal) {
|
|
14846
15227
|
const { owner, repo, branch } = repoInfo;
|
|
14847
15228
|
const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;
|
|
14848
|
-
const rawContent = await this.httpFetcher.fetch(rawUrl, { signal, etag });
|
|
15229
|
+
const rawContent = await this.httpFetcher.fetch(rawUrl, { signal, etag, headers });
|
|
14849
15230
|
const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
|
|
14850
15231
|
if (detectedMimeType && (rawContent.mimeType === "text/plain" || rawContent.mimeType === "application/octet-stream")) {
|
|
14851
15232
|
return {
|
|
@@ -14858,13 +15239,14 @@ class GitHubRepoProcessor {
|
|
|
14858
15239
|
/**
|
|
14859
15240
|
* Processes a single GitHub repository file from an HTTPS blob URL.
|
|
14860
15241
|
*/
|
|
14861
|
-
async process(item, options, signal) {
|
|
15242
|
+
async process(item, options, headers, signal) {
|
|
14862
15243
|
const repoInfo = this.parseHttpsBlobUrl(item.url);
|
|
14863
15244
|
const { owner, repo, branch, filePath } = repoInfo;
|
|
14864
15245
|
const rawContent = await this.fetchFileContent(
|
|
14865
15246
|
{ owner, repo, branch },
|
|
14866
15247
|
filePath,
|
|
14867
15248
|
item.etag,
|
|
15249
|
+
headers,
|
|
14868
15250
|
signal
|
|
14869
15251
|
);
|
|
14870
15252
|
if (rawContent.status !== FetchStatus.SUCCESS) {
|
|
@@ -14955,12 +15337,13 @@ class GitHubWikiProcessor {
|
|
|
14955
15337
|
/**
|
|
14956
15338
|
* Processes a single GitHub wiki page.
|
|
14957
15339
|
*/
|
|
14958
|
-
async process(item, options, signal) {
|
|
15340
|
+
async process(item, options, headers, signal) {
|
|
14959
15341
|
const currentUrl = item.url;
|
|
14960
15342
|
try {
|
|
14961
15343
|
const rawContent = await this.httpFetcher.fetch(currentUrl, {
|
|
14962
15344
|
signal,
|
|
14963
|
-
etag: item.etag
|
|
15345
|
+
etag: item.etag,
|
|
15346
|
+
headers
|
|
14964
15347
|
});
|
|
14965
15348
|
if (rawContent.status !== FetchStatus.SUCCESS) {
|
|
14966
15349
|
return { url: currentUrl, links: [], status: rawContent.status };
|
|
@@ -15031,10 +15414,52 @@ class GitHubWikiProcessor {
|
|
|
15031
15414
|
await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
|
|
15032
15415
|
}
|
|
15033
15416
|
}
|
|
15417
|
+
const execAsync = promisify(exec);
|
|
15418
|
+
async function resolveGitHubAuth(explicitHeaders) {
|
|
15419
|
+
if (explicitHeaders) {
|
|
15420
|
+
const hasAuthHeader = Object.keys(explicitHeaders).some(
|
|
15421
|
+
(key) => key.toLowerCase() === "authorization"
|
|
15422
|
+
);
|
|
15423
|
+
if (hasAuthHeader) {
|
|
15424
|
+
return explicitHeaders;
|
|
15425
|
+
}
|
|
15426
|
+
}
|
|
15427
|
+
const githubToken = process.env.GITHUB_TOKEN;
|
|
15428
|
+
if (githubToken) {
|
|
15429
|
+
logger.debug("Using GitHub token from GITHUB_TOKEN environment variable");
|
|
15430
|
+
return {
|
|
15431
|
+
...explicitHeaders,
|
|
15432
|
+
Authorization: `Bearer ${githubToken}`
|
|
15433
|
+
};
|
|
15434
|
+
}
|
|
15435
|
+
const ghToken = process.env.GH_TOKEN;
|
|
15436
|
+
if (ghToken) {
|
|
15437
|
+
logger.debug("Using GitHub token from GH_TOKEN environment variable");
|
|
15438
|
+
return {
|
|
15439
|
+
...explicitHeaders,
|
|
15440
|
+
Authorization: `Bearer ${ghToken}`
|
|
15441
|
+
};
|
|
15442
|
+
}
|
|
15443
|
+
try {
|
|
15444
|
+
const { stdout } = await execAsync("gh auth token", { timeout: 5e3 });
|
|
15445
|
+
const cliToken = stdout.trim();
|
|
15446
|
+
if (cliToken) {
|
|
15447
|
+
logger.debug("Using GitHub token from local gh CLI");
|
|
15448
|
+
return {
|
|
15449
|
+
...explicitHeaders,
|
|
15450
|
+
Authorization: `Bearer ${cliToken}`
|
|
15451
|
+
};
|
|
15452
|
+
}
|
|
15453
|
+
} catch {
|
|
15454
|
+
}
|
|
15455
|
+
return explicitHeaders ?? {};
|
|
15456
|
+
}
|
|
15034
15457
|
class GitHubScraperStrategy extends BaseScraperStrategy {
|
|
15035
15458
|
httpFetcher;
|
|
15036
15459
|
wikiProcessor;
|
|
15037
15460
|
repoProcessor;
|
|
15461
|
+
resolvedAuthHeaders;
|
|
15462
|
+
resolvedAuthKey;
|
|
15038
15463
|
constructor(config) {
|
|
15039
15464
|
super(config);
|
|
15040
15465
|
this.httpFetcher = new HttpFetcher(config.scraper);
|
|
@@ -15091,31 +15516,117 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
|
|
|
15091
15516
|
}
|
|
15092
15517
|
return { owner, repo };
|
|
15093
15518
|
}
|
|
15519
|
+
buildAuthCacheKey(explicitHeaders) {
|
|
15520
|
+
const normalizedHeaders = explicitHeaders ? Object.keys(explicitHeaders).sort().map((key) => [key, explicitHeaders[key]]) : [];
|
|
15521
|
+
const envKey = `${process.env.GITHUB_TOKEN ?? ""}|${process.env.GH_TOKEN ?? ""}`;
|
|
15522
|
+
return JSON.stringify({ headers: normalizedHeaders, env: envKey });
|
|
15523
|
+
}
|
|
15524
|
+
async getResolvedAuthHeaders(explicitHeaders) {
|
|
15525
|
+
const cacheKey = this.buildAuthCacheKey(explicitHeaders);
|
|
15526
|
+
if (this.resolvedAuthHeaders && this.resolvedAuthKey === cacheKey) {
|
|
15527
|
+
return this.resolvedAuthHeaders;
|
|
15528
|
+
}
|
|
15529
|
+
const resolved = await resolveGitHubAuth(explicitHeaders);
|
|
15530
|
+
this.resolvedAuthHeaders = resolved;
|
|
15531
|
+
this.resolvedAuthKey = cacheKey;
|
|
15532
|
+
return resolved;
|
|
15533
|
+
}
|
|
15094
15534
|
/**
|
|
15095
15535
|
* Fetches the repository tree structure from GitHub API.
|
|
15096
15536
|
*/
|
|
15097
|
-
async fetchRepositoryTree(repoInfo, signal) {
|
|
15537
|
+
async fetchRepositoryTree(repoInfo, headers, signal) {
|
|
15098
15538
|
const { owner, repo, branch } = repoInfo;
|
|
15099
15539
|
let targetBranch = branch;
|
|
15100
15540
|
if (!targetBranch) {
|
|
15541
|
+
const repoUrl = `https://api.github.com/repos/${owner}/${repo}`;
|
|
15542
|
+
logger.debug(`Fetching repository info: ${repoUrl}`);
|
|
15543
|
+
let repoContent;
|
|
15544
|
+
try {
|
|
15545
|
+
repoContent = await this.httpFetcher.fetch(repoUrl, { signal, headers });
|
|
15546
|
+
} catch (error) {
|
|
15547
|
+
if (error instanceof ScraperError) {
|
|
15548
|
+
if (error.message.includes("401")) {
|
|
15549
|
+
throw new ScraperError(
|
|
15550
|
+
`GitHub authentication failed for "${owner}/${repo}". Your token is invalid or expired. Please check your GITHUB_TOKEN or GH_TOKEN environment variable.`,
|
|
15551
|
+
false,
|
|
15552
|
+
error
|
|
15553
|
+
);
|
|
15554
|
+
}
|
|
15555
|
+
if (error.message.includes("403")) {
|
|
15556
|
+
throw new ScraperError(
|
|
15557
|
+
`GitHub access denied for "${owner}/${repo}". Your token may lack the required permissions, or you may be rate-limited. Please check your GITHUB_TOKEN or GH_TOKEN.`,
|
|
15558
|
+
false,
|
|
15559
|
+
error
|
|
15560
|
+
);
|
|
15561
|
+
}
|
|
15562
|
+
}
|
|
15563
|
+
throw error;
|
|
15564
|
+
}
|
|
15565
|
+
if (repoContent.status === FetchStatus.NOT_FOUND) {
|
|
15566
|
+
throw new ScraperError(
|
|
15567
|
+
`Repository "${owner}/${repo}" not found or not accessible. For private repositories, set the GITHUB_TOKEN environment variable.`,
|
|
15568
|
+
false
|
|
15569
|
+
);
|
|
15570
|
+
}
|
|
15101
15571
|
try {
|
|
15102
|
-
const repoUrl = `https://api.github.com/repos/${owner}/${repo}`;
|
|
15103
|
-
logger.debug(`Fetching repository info: ${repoUrl}`);
|
|
15104
|
-
const repoContent = await this.httpFetcher.fetch(repoUrl, { signal });
|
|
15105
15572
|
const content2 = typeof repoContent.content === "string" ? repoContent.content : repoContent.content.toString("utf-8");
|
|
15106
15573
|
const repoData = JSON.parse(content2);
|
|
15107
|
-
|
|
15108
|
-
|
|
15109
|
-
|
|
15110
|
-
|
|
15574
|
+
const defaultBranch = typeof repoData.default_branch === "string" ? repoData.default_branch.trim() : "";
|
|
15575
|
+
if (!defaultBranch) {
|
|
15576
|
+
logger.warn(
|
|
15577
|
+
`⚠️ Repository info missing default_branch for ${owner}/${repo}, using 'main'`
|
|
15578
|
+
);
|
|
15579
|
+
targetBranch = "main";
|
|
15580
|
+
} else {
|
|
15581
|
+
targetBranch = defaultBranch;
|
|
15582
|
+
logger.debug(`Using default branch: ${targetBranch}`);
|
|
15583
|
+
}
|
|
15584
|
+
} catch (parseError) {
|
|
15585
|
+
logger.warn(`⚠️ Could not parse repository info, using 'main': ${parseError}`);
|
|
15111
15586
|
targetBranch = "main";
|
|
15112
15587
|
}
|
|
15113
15588
|
}
|
|
15114
15589
|
const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`;
|
|
15115
15590
|
logger.debug(`Fetching repository tree: ${treeUrl}`);
|
|
15116
|
-
|
|
15591
|
+
let rawContent;
|
|
15592
|
+
try {
|
|
15593
|
+
rawContent = await this.httpFetcher.fetch(treeUrl, { signal, headers });
|
|
15594
|
+
} catch (error) {
|
|
15595
|
+
if (error instanceof ScraperError) {
|
|
15596
|
+
if (error.message.includes("401")) {
|
|
15597
|
+
throw new ScraperError(
|
|
15598
|
+
`GitHub authentication failed for "${owner}/${repo}". Your token is invalid or expired. Please check your GITHUB_TOKEN or GH_TOKEN environment variable.`,
|
|
15599
|
+
false,
|
|
15600
|
+
error
|
|
15601
|
+
);
|
|
15602
|
+
}
|
|
15603
|
+
if (error.message.includes("403")) {
|
|
15604
|
+
throw new ScraperError(
|
|
15605
|
+
`GitHub access denied for "${owner}/${repo}". Your token may lack the required permissions, or you may be rate-limited. Please check your GITHUB_TOKEN or GH_TOKEN.`,
|
|
15606
|
+
false,
|
|
15607
|
+
error
|
|
15608
|
+
);
|
|
15609
|
+
}
|
|
15610
|
+
}
|
|
15611
|
+
throw error;
|
|
15612
|
+
}
|
|
15613
|
+
if (rawContent.status === FetchStatus.NOT_FOUND) {
|
|
15614
|
+
throw new ScraperError(
|
|
15615
|
+
`Repository "${owner}/${repo}" not found or not accessible. For private repositories, set the GITHUB_TOKEN environment variable.`,
|
|
15616
|
+
false
|
|
15617
|
+
);
|
|
15618
|
+
}
|
|
15117
15619
|
const content = typeof rawContent.content === "string" ? rawContent.content : rawContent.content.toString("utf-8");
|
|
15118
|
-
|
|
15620
|
+
let treeData;
|
|
15621
|
+
try {
|
|
15622
|
+
treeData = JSON.parse(content);
|
|
15623
|
+
} catch (parseError) {
|
|
15624
|
+
throw new ScraperError(
|
|
15625
|
+
`Failed to parse GitHub API response for "${owner}/${repo}". The repository may be inaccessible or the API returned an unexpected response.`,
|
|
15626
|
+
false,
|
|
15627
|
+
parseError instanceof Error ? parseError : void 0
|
|
15628
|
+
);
|
|
15629
|
+
}
|
|
15119
15630
|
if (treeData.truncated) {
|
|
15120
15631
|
logger.warn(
|
|
15121
15632
|
`⚠️ Repository tree was truncated for ${owner}/${repo}. Some files may be missing.`
|
|
@@ -15254,7 +15765,7 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
|
|
|
15254
15765
|
if (hasTextExtension || hasDocumentExtension || hasCompoundExtension || isCommonTextFile) {
|
|
15255
15766
|
return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
|
|
15256
15767
|
}
|
|
15257
|
-
const mimeType =
|
|
15768
|
+
const mimeType = MimeTypeUtils.detectMimeTypeFromPath(path2);
|
|
15258
15769
|
if (mimeType?.startsWith("text/")) {
|
|
15259
15770
|
logger.debug(`Including file with text MIME type: ${path2} (${mimeType})`);
|
|
15260
15771
|
return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
|
|
@@ -15289,10 +15800,11 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
|
|
|
15289
15800
|
status: FetchStatus.NOT_FOUND
|
|
15290
15801
|
};
|
|
15291
15802
|
}
|
|
15803
|
+
const headers = await this.getResolvedAuthHeaders(options.headers);
|
|
15292
15804
|
try {
|
|
15293
15805
|
const parsedUrl = new URL(item.url);
|
|
15294
15806
|
if (/^\/[^/]+\/[^/]+\/wiki($|\/)/.test(parsedUrl.pathname)) {
|
|
15295
|
-
return await this.wikiProcessor.process(item, options, signal);
|
|
15807
|
+
return await this.wikiProcessor.process(item, options, headers, signal);
|
|
15296
15808
|
}
|
|
15297
15809
|
} catch {
|
|
15298
15810
|
}
|
|
@@ -15318,7 +15830,11 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
|
|
|
15318
15830
|
const wikiUrl = `${options.url.replace(/\/$/, "")}/wiki`;
|
|
15319
15831
|
discoveredLinks.push(wikiUrl);
|
|
15320
15832
|
logger.debug(`Discovered wiki URL: ${wikiUrl}`);
|
|
15321
|
-
const { tree, resolvedBranch } = await this.fetchRepositoryTree(
|
|
15833
|
+
const { tree, resolvedBranch } = await this.fetchRepositoryTree(
|
|
15834
|
+
repoInfo,
|
|
15835
|
+
headers,
|
|
15836
|
+
signal
|
|
15837
|
+
);
|
|
15322
15838
|
const fileItems = tree.tree.filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)).filter((treeItem) => this.shouldProcessFile(treeItem, options));
|
|
15323
15839
|
logger.debug(
|
|
15324
15840
|
`Discovered ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`
|
|
@@ -15336,7 +15852,7 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
|
|
|
15336
15852
|
const parsedUrl = new URL(item.url);
|
|
15337
15853
|
if (/^\/[^/]+\/[^/]+\/blob\//.test(parsedUrl.pathname)) {
|
|
15338
15854
|
logger.debug(`Processing HTTPS blob URL at depth ${item.depth}: ${item.url}`);
|
|
15339
|
-
return await this.repoProcessor.process(item, options, signal);
|
|
15855
|
+
return await this.repoProcessor.process(item, options, headers, signal);
|
|
15340
15856
|
}
|
|
15341
15857
|
} catch (error) {
|
|
15342
15858
|
logger.warn(`⚠️ Failed to parse blob URL ${item.url}: ${error}`);
|
|
@@ -15350,7 +15866,13 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
|
|
|
15350
15866
|
if (!url.hostname.includes("github.com")) {
|
|
15351
15867
|
throw new Error("URL must be a GitHub URL");
|
|
15352
15868
|
}
|
|
15353
|
-
await
|
|
15869
|
+
await this.getResolvedAuthHeaders(options.headers);
|
|
15870
|
+
try {
|
|
15871
|
+
await super.scrape(options, progressCallback, signal);
|
|
15872
|
+
} finally {
|
|
15873
|
+
this.resolvedAuthHeaders = void 0;
|
|
15874
|
+
this.resolvedAuthKey = void 0;
|
|
15875
|
+
}
|
|
15354
15876
|
}
|
|
15355
15877
|
async cleanup() {
|
|
15356
15878
|
await Promise.all([this.wikiProcessor.cleanup(), this.repoProcessor.cleanup()]);
|
|
@@ -15726,7 +16248,7 @@ class LocalFileStrategy extends BaseScraperStrategy {
|
|
|
15726
16248
|
logger.debug(`Reading archive entry: ${innerPath} inside ${archivePath}`);
|
|
15727
16249
|
try {
|
|
15728
16250
|
const contentBuffer = await adapter.getContent(innerPath);
|
|
15729
|
-
const mimeType =
|
|
16251
|
+
const mimeType = MimeTypeUtils.detectMimeTypeFromPath(innerPath) || "application/octet-stream";
|
|
15730
16252
|
const rawContent = {
|
|
15731
16253
|
source: item.url,
|
|
15732
16254
|
content: contentBuffer,
|
|
@@ -16013,31 +16535,90 @@ class PyPiScraperStrategy {
|
|
|
16013
16535
|
}
|
|
16014
16536
|
}
|
|
16015
16537
|
class ScraperRegistry {
|
|
16016
|
-
|
|
16538
|
+
config;
|
|
16017
16539
|
constructor(config) {
|
|
16018
|
-
this.
|
|
16019
|
-
new NpmScraperStrategy(config),
|
|
16020
|
-
new PyPiScraperStrategy(config),
|
|
16021
|
-
new GitHubScraperStrategy(config),
|
|
16022
|
-
new WebScraperStrategy(config, {}),
|
|
16023
|
-
new LocalFileStrategy(config)
|
|
16024
|
-
];
|
|
16540
|
+
this.config = config;
|
|
16025
16541
|
}
|
|
16542
|
+
/**
|
|
16543
|
+
* Creates and returns a fresh strategy instance for the given URL.
|
|
16544
|
+
* Each call returns a new instance to ensure state isolation between parallel scrapes.
|
|
16545
|
+
*/
|
|
16026
16546
|
getStrategy(url) {
|
|
16027
|
-
|
|
16028
|
-
|
|
16029
|
-
|
|
16030
|
-
|
|
16547
|
+
if (!url.startsWith("github-file://")) {
|
|
16548
|
+
validateUrl(url);
|
|
16549
|
+
}
|
|
16550
|
+
if (isLocalFileUrl(url)) {
|
|
16551
|
+
logger.debug(`Using strategy "LocalFileStrategy" for URL: ${url}`);
|
|
16552
|
+
return new LocalFileStrategy(this.config);
|
|
16553
|
+
}
|
|
16554
|
+
if (isNpmUrl(url)) {
|
|
16555
|
+
logger.debug(`Using strategy "NpmScraperStrategy" for URL: ${url}`);
|
|
16556
|
+
return new NpmScraperStrategy(this.config);
|
|
16557
|
+
}
|
|
16558
|
+
if (isPyPiUrl(url)) {
|
|
16559
|
+
logger.debug(`Using strategy "PyPiScraperStrategy" for URL: ${url}`);
|
|
16560
|
+
return new PyPiScraperStrategy(this.config);
|
|
16031
16561
|
}
|
|
16032
|
-
|
|
16033
|
-
|
|
16562
|
+
if (isGitHubUrl(url)) {
|
|
16563
|
+
logger.debug(`Using strategy "GitHubScraperStrategy" for URL: ${url}`);
|
|
16564
|
+
return new GitHubScraperStrategy(this.config);
|
|
16565
|
+
}
|
|
16566
|
+
if (isWebUrl(url)) {
|
|
16567
|
+
logger.debug(`Using strategy "WebScraperStrategy" for URL: ${url}`);
|
|
16568
|
+
return new WebScraperStrategy(this.config, {});
|
|
16569
|
+
}
|
|
16570
|
+
throw new ScraperError(`No strategy found for URL: ${url}`);
|
|
16034
16571
|
}
|
|
16035
|
-
|
|
16036
|
-
|
|
16037
|
-
|
|
16038
|
-
|
|
16039
|
-
|
|
16040
|
-
|
|
16572
|
+
}
|
|
16573
|
+
function isLocalFileUrl(url) {
|
|
16574
|
+
return url.startsWith("file://");
|
|
16575
|
+
}
|
|
16576
|
+
function isNpmUrl(url) {
|
|
16577
|
+
try {
|
|
16578
|
+
const { hostname } = new URL(url);
|
|
16579
|
+
return ["npmjs.org", "npmjs.com", "www.npmjs.com"].includes(hostname);
|
|
16580
|
+
} catch {
|
|
16581
|
+
return false;
|
|
16582
|
+
}
|
|
16583
|
+
}
|
|
16584
|
+
function isPyPiUrl(url) {
|
|
16585
|
+
try {
|
|
16586
|
+
const { hostname } = new URL(url);
|
|
16587
|
+
return ["pypi.org", "www.pypi.org"].includes(hostname);
|
|
16588
|
+
} catch {
|
|
16589
|
+
return false;
|
|
16590
|
+
}
|
|
16591
|
+
}
|
|
16592
|
+
function isGitHubUrl(url) {
|
|
16593
|
+
if (url.startsWith("github-file://")) {
|
|
16594
|
+
return true;
|
|
16595
|
+
}
|
|
16596
|
+
try {
|
|
16597
|
+
const parsedUrl = new URL(url);
|
|
16598
|
+
const { hostname, pathname } = parsedUrl;
|
|
16599
|
+
if (!["github.com", "www.github.com"].includes(hostname)) {
|
|
16600
|
+
return false;
|
|
16601
|
+
}
|
|
16602
|
+
if (pathname.match(/^\/[^/]+\/[^/]+\/?$/)) {
|
|
16603
|
+
return true;
|
|
16604
|
+
}
|
|
16605
|
+
if (pathname.match(/^\/[^/]+\/[^/]+\/tree\//)) {
|
|
16606
|
+
return true;
|
|
16607
|
+
}
|
|
16608
|
+
if (pathname.match(/^\/[^/]+\/[^/]+\/blob\//)) {
|
|
16609
|
+
return true;
|
|
16610
|
+
}
|
|
16611
|
+
return false;
|
|
16612
|
+
} catch {
|
|
16613
|
+
return false;
|
|
16614
|
+
}
|
|
16615
|
+
}
|
|
16616
|
+
function isWebUrl(url) {
|
|
16617
|
+
try {
|
|
16618
|
+
const parsedUrl = new URL(url);
|
|
16619
|
+
return parsedUrl.protocol === "http:" || parsedUrl.protocol === "https:";
|
|
16620
|
+
} catch {
|
|
16621
|
+
return false;
|
|
16041
16622
|
}
|
|
16042
16623
|
}
|
|
16043
16624
|
class ScraperService {
|
|
@@ -16048,20 +16629,35 @@ class ScraperService {
|
|
|
16048
16629
|
/**
|
|
16049
16630
|
* Scrapes content from the provided URL using the appropriate strategy.
|
|
16050
16631
|
* Reports progress via callback and handles errors.
|
|
16632
|
+
* Cleans up strategy resources after scrape completes (success or failure).
|
|
16051
16633
|
*/
|
|
16052
16634
|
async scrape(options, progressCallback, signal) {
|
|
16053
16635
|
const strategy = this.registry.getStrategy(options.url);
|
|
16054
|
-
|
|
16055
|
-
|
|
16636
|
+
let scrapeError = null;
|
|
16637
|
+
let cleanupErrorToThrow = null;
|
|
16638
|
+
try {
|
|
16639
|
+
await strategy.scrape(options, progressCallback, signal);
|
|
16640
|
+
} catch (error) {
|
|
16641
|
+
scrapeError = error instanceof Error ? error : new ScraperError(`Scrape failed for URL: ${options.url}`, false);
|
|
16642
|
+
} finally {
|
|
16643
|
+
try {
|
|
16644
|
+
await strategy.cleanup?.();
|
|
16645
|
+
} catch (cleanupError) {
|
|
16646
|
+
logger.error(`❌ Strategy cleanup failed for ${options.url}: ${cleanupError}`);
|
|
16647
|
+
if (!scrapeError) {
|
|
16648
|
+
cleanupErrorToThrow = cleanupError instanceof Error ? cleanupError : new ScraperError(
|
|
16649
|
+
`Strategy cleanup failed for URL: ${options.url}`,
|
|
16650
|
+
false
|
|
16651
|
+
);
|
|
16652
|
+
}
|
|
16653
|
+
}
|
|
16654
|
+
}
|
|
16655
|
+
if (scrapeError) {
|
|
16656
|
+
throw scrapeError;
|
|
16657
|
+
}
|
|
16658
|
+
if (cleanupErrorToThrow) {
|
|
16659
|
+
throw cleanupErrorToThrow;
|
|
16056
16660
|
}
|
|
16057
|
-
await strategy.scrape(options, progressCallback, signal);
|
|
16058
|
-
}
|
|
16059
|
-
/**
|
|
16060
|
-
* Cleanup the scraper registry and all its strategies.
|
|
16061
|
-
* Should be called when the service is no longer needed.
|
|
16062
|
-
*/
|
|
16063
|
-
async cleanup() {
|
|
16064
|
-
await this.registry.cleanup();
|
|
16065
16661
|
}
|
|
16066
16662
|
}
|
|
16067
16663
|
class PipelineWorker {
|
|
@@ -16224,7 +16820,7 @@ class PipelineManager {
|
|
|
16224
16820
|
if (this.shouldRecoverJobs) {
|
|
16225
16821
|
await this.recoverPendingJobs();
|
|
16226
16822
|
} else {
|
|
16227
|
-
|
|
16823
|
+
await this.markInterruptedJobsAsFailed();
|
|
16228
16824
|
}
|
|
16229
16825
|
this._processQueue().catch((error) => {
|
|
16230
16826
|
logger.error(`❌ Error in processQueue during start: ${error}`);
|
|
@@ -16232,79 +16828,69 @@ class PipelineManager {
|
|
|
16232
16828
|
}
|
|
16233
16829
|
/**
|
|
16234
16830
|
* Recovers pending jobs from the database after server restart.
|
|
16235
|
-
*
|
|
16236
|
-
*
|
|
16831
|
+
* Uses enqueueRefreshJob() to properly continue interrupted jobs,
|
|
16832
|
+
* leveraging existing pages and ETags when available.
|
|
16237
16833
|
*/
|
|
16238
16834
|
async recoverPendingJobs() {
|
|
16239
16835
|
try {
|
|
16240
|
-
const
|
|
16241
|
-
VersionStatus.RUNNING
|
|
16836
|
+
const interruptedVersions = await this.store.getVersionsByStatus([
|
|
16837
|
+
VersionStatus.RUNNING,
|
|
16838
|
+
VersionStatus.QUEUED
|
|
16242
16839
|
]);
|
|
16243
|
-
|
|
16244
|
-
|
|
16245
|
-
|
|
16246
|
-
`🔄 Reset interrupted job to QUEUED: ${version.library_name}@${version.name || "latest"}`
|
|
16247
|
-
);
|
|
16840
|
+
if (interruptedVersions.length === 0) {
|
|
16841
|
+
logger.debug("No pending jobs to recover from database");
|
|
16842
|
+
return;
|
|
16248
16843
|
}
|
|
16249
|
-
|
|
16250
|
-
|
|
16251
|
-
|
|
16252
|
-
|
|
16253
|
-
|
|
16254
|
-
|
|
16255
|
-
|
|
16256
|
-
|
|
16257
|
-
|
|
16258
|
-
|
|
16259
|
-
|
|
16260
|
-
|
|
16261
|
-
|
|
16262
|
-
|
|
16263
|
-
|
|
16264
|
-
|
|
16265
|
-
} catch (error) {
|
|
16266
|
-
logger.warn(
|
|
16267
|
-
`⚠️ Failed to parse scraper options for ${version.library_name}@${version.name || "latest"}: ${error}`
|
|
16268
|
-
);
|
|
16269
|
-
}
|
|
16844
|
+
logger.info(
|
|
16845
|
+
`📥 Recovering ${interruptedVersions.length} pending job(s) from database`
|
|
16846
|
+
);
|
|
16847
|
+
for (const version of interruptedVersions) {
|
|
16848
|
+
const versionLabel = `${version.library_name}@${version.name || "latest"}`;
|
|
16849
|
+
try {
|
|
16850
|
+
await this.enqueueRefreshJob(version.library_name, version.name);
|
|
16851
|
+
logger.info(`🔄 Recovering job: ${versionLabel}`);
|
|
16852
|
+
} catch (error) {
|
|
16853
|
+
const errorMessage = `Recovery failed: ${error instanceof Error ? error.message : String(error)}`;
|
|
16854
|
+
await this.store.updateVersionStatus(
|
|
16855
|
+
version.id,
|
|
16856
|
+
VersionStatus.FAILED,
|
|
16857
|
+
errorMessage
|
|
16858
|
+
);
|
|
16859
|
+
logger.warn(`⚠️ Failed to recover job ${versionLabel}: ${error}`);
|
|
16270
16860
|
}
|
|
16271
|
-
const job = {
|
|
16272
|
-
id: jobId,
|
|
16273
|
-
library: version.library_name,
|
|
16274
|
-
version: version.name || "",
|
|
16275
|
-
status: PipelineJobStatus.QUEUED,
|
|
16276
|
-
progress: null,
|
|
16277
|
-
error: null,
|
|
16278
|
-
createdAt: new Date(version.created_at),
|
|
16279
|
-
// For recovered QUEUED jobs, startedAt must be null to reflect queued state.
|
|
16280
|
-
startedAt: null,
|
|
16281
|
-
finishedAt: null,
|
|
16282
|
-
abortController,
|
|
16283
|
-
completionPromise,
|
|
16284
|
-
resolveCompletion,
|
|
16285
|
-
rejectCompletion,
|
|
16286
|
-
// Database fields (single source of truth)
|
|
16287
|
-
versionId: version.id,
|
|
16288
|
-
versionStatus: version.status,
|
|
16289
|
-
progressPages: version.progress_pages,
|
|
16290
|
-
progressMaxPages: version.progress_max_pages,
|
|
16291
|
-
errorMessage: version.error_message,
|
|
16292
|
-
updatedAt: new Date(version.updated_at),
|
|
16293
|
-
sourceUrl: version.source_url,
|
|
16294
|
-
scraperOptions: parsedScraperOptions
|
|
16295
|
-
};
|
|
16296
|
-
this.jobMap.set(jobId, job);
|
|
16297
|
-
this.jobQueue.push(jobId);
|
|
16298
|
-
}
|
|
16299
|
-
if (queuedVersions.length > 0) {
|
|
16300
|
-
logger.info(`📥 Recovered ${queuedVersions.length} pending job(s) from database`);
|
|
16301
|
-
} else {
|
|
16302
|
-
logger.debug("No pending jobs to recover from database");
|
|
16303
16861
|
}
|
|
16304
16862
|
} catch (error) {
|
|
16305
16863
|
logger.error(`❌ Failed to recover pending jobs: ${error}`);
|
|
16306
16864
|
}
|
|
16307
16865
|
}
|
|
16866
|
+
/**
|
|
16867
|
+
* Marks all interrupted jobs (RUNNING/QUEUED) as FAILED.
|
|
16868
|
+
* Called when recoverJobs is false to allow users to manually retry via UI.
|
|
16869
|
+
*/
|
|
16870
|
+
async markInterruptedJobsAsFailed() {
|
|
16871
|
+
try {
|
|
16872
|
+
const interruptedVersions = await this.store.getVersionsByStatus([
|
|
16873
|
+
VersionStatus.RUNNING,
|
|
16874
|
+
VersionStatus.QUEUED
|
|
16875
|
+
]);
|
|
16876
|
+
if (interruptedVersions.length === 0) {
|
|
16877
|
+
logger.debug("No interrupted jobs to mark as failed");
|
|
16878
|
+
return;
|
|
16879
|
+
}
|
|
16880
|
+
for (const version of interruptedVersions) {
|
|
16881
|
+
await this.store.updateVersionStatus(
|
|
16882
|
+
version.id,
|
|
16883
|
+
VersionStatus.FAILED,
|
|
16884
|
+
"Job interrupted"
|
|
16885
|
+
);
|
|
16886
|
+
logger.info(
|
|
16887
|
+
`❌ Marked interrupted job as failed: ${version.library_name}@${version.name || "latest"}`
|
|
16888
|
+
);
|
|
16889
|
+
}
|
|
16890
|
+
} catch (error) {
|
|
16891
|
+
logger.error(`❌ Failed to mark interrupted jobs as failed: ${error}`);
|
|
16892
|
+
}
|
|
16893
|
+
}
|
|
16308
16894
|
/**
|
|
16309
16895
|
* Stops the pipeline manager and attempts to gracefully shut down workers.
|
|
16310
16896
|
* Currently, it just stops processing new jobs. Cancellation of active jobs
|
|
@@ -16317,7 +16903,6 @@ class PipelineManager {
|
|
|
16317
16903
|
}
|
|
16318
16904
|
this.isRunning = false;
|
|
16319
16905
|
logger.debug("PipelineManager stopping. No new jobs will be started.");
|
|
16320
|
-
await this.scraperService.cleanup();
|
|
16321
16906
|
}
|
|
16322
16907
|
/**
|
|
16323
16908
|
* Enqueues a new document processing job, aborting any existing QUEUED/RUNNING job for the same library+version (including unversioned).
|
|
@@ -18031,7 +18616,7 @@ function createCli(argv) {
|
|
|
18031
18616
|
let globalEventBus = null;
|
|
18032
18617
|
let globalTelemetryService = null;
|
|
18033
18618
|
const commandStartTimes = /* @__PURE__ */ new Map();
|
|
18034
|
-
const cli = yargs(hideBin(argv)).scriptName("docs-mcp-server").strict().usage("Usage: $0 <command> [options]").version("
|
|
18619
|
+
const cli = yargs(hideBin(argv)).scriptName("docs-mcp-server").strict().usage("Usage: $0 <command> [options]").version("2.0.0").option("verbose", {
|
|
18035
18620
|
type: "boolean",
|
|
18036
18621
|
description: "Enable verbose (debug) logging",
|
|
18037
18622
|
default: false
|
|
@@ -18091,7 +18676,7 @@ function createCli(argv) {
|
|
|
18091
18676
|
if (shouldEnableTelemetry() && telemetry.isEnabled()) {
|
|
18092
18677
|
const commandName = argv2._[0]?.toString() || "default";
|
|
18093
18678
|
telemetry.setGlobalContext({
|
|
18094
|
-
appVersion: "
|
|
18679
|
+
appVersion: "2.0.0",
|
|
18095
18680
|
appPlatform: process.platform,
|
|
18096
18681
|
appNodeVersion: process.version,
|
|
18097
18682
|
appInterface: "cli",
|