@arabold/docs-mcp-server 1.37.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -64,11 +64,12 @@ import { escapeHtml } from "@kitajs/html";
64
64
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
65
65
  import { v4 } from "uuid";
66
66
  import { minimatch } from "minimatch";
67
+ import { exec, execSync } from "node:child_process";
68
+ import { promisify } from "node:util";
67
69
  import { Readable } from "node:stream";
68
70
  import * as tar from "tar";
69
71
  import yauzl from "yauzl";
70
72
  import os from "node:os";
71
- import { execSync } from "node:child_process";
72
73
  class StoreError extends Error {
73
74
  constructor(message, cause) {
74
75
  super(cause ? `${message} caused by ${cause}` : message);
@@ -919,6 +920,10 @@ const DEFAULT_CONFIG = {
919
920
  baseDelayMs: 1e3,
920
921
  maxCacheItems: 200,
921
922
  maxCacheItemSizeBytes: 500 * 1024
923
+ },
924
+ document: {
925
+ maxSize: 10 * 1024 * 1024
926
+ // 10MB max size for PDF/Office documents
922
927
  }
923
928
  },
924
929
  splitter: {
@@ -957,10 +962,6 @@ const DEFAULT_CONFIG = {
957
962
  precedingSiblingsLimit: 1,
958
963
  subsequentSiblingsLimit: 2,
959
964
  maxChunkDistance: 3
960
- },
961
- document: {
962
- maxSize: 10 * 1024 * 1024
963
- // 10MB max size for PDF/Office documents
964
965
  }
965
966
  };
966
967
  const AppConfigSchema = z.object({
@@ -997,7 +998,10 @@ const AppConfigSchema = z.object({
997
998
  baseDelayMs: z.coerce.number().int().default(DEFAULT_CONFIG.scraper.fetcher.baseDelayMs),
998
999
  maxCacheItems: z.coerce.number().int().default(DEFAULT_CONFIG.scraper.fetcher.maxCacheItems),
999
1000
  maxCacheItemSizeBytes: z.coerce.number().int().default(DEFAULT_CONFIG.scraper.fetcher.maxCacheItemSizeBytes)
1000
- }).default(DEFAULT_CONFIG.scraper.fetcher)
1001
+ }).default(DEFAULT_CONFIG.scraper.fetcher),
1002
+ document: z.object({
1003
+ maxSize: z.coerce.number().int().default(DEFAULT_CONFIG.scraper.document.maxSize)
1004
+ }).default(DEFAULT_CONFIG.scraper.document)
1001
1005
  }).default(DEFAULT_CONFIG.scraper),
1002
1006
  splitter: z.object({
1003
1007
  minChunkSize: z.coerce.number().int().default(DEFAULT_CONFIG.splitter.minChunkSize),
@@ -1035,10 +1039,7 @@ const AppConfigSchema = z.object({
1035
1039
  precedingSiblingsLimit: z.coerce.number().int().default(DEFAULT_CONFIG.assembly.precedingSiblingsLimit),
1036
1040
  subsequentSiblingsLimit: z.coerce.number().int().default(DEFAULT_CONFIG.assembly.subsequentSiblingsLimit),
1037
1041
  maxChunkDistance: z.coerce.number().int().min(0).default(DEFAULT_CONFIG.assembly.maxChunkDistance)
1038
- }).default(DEFAULT_CONFIG.assembly),
1039
- document: z.object({
1040
- maxSize: z.coerce.number().int().default(DEFAULT_CONFIG.document.maxSize)
1041
- }).default(DEFAULT_CONFIG.document)
1042
+ }).default(DEFAULT_CONFIG.assembly)
1042
1043
  });
1043
1044
  const defaults = AppConfigSchema.parse({});
1044
1045
  const configMappings = [
@@ -1159,6 +1160,12 @@ function mapEnvToConfig() {
1159
1160
  }
1160
1161
  }
1161
1162
  }
1163
+ for (const pathArr of ALL_CONFIG_LEAF_PATHS) {
1164
+ const envVar = pathToEnvVar(pathArr);
1165
+ if (process.env[envVar] !== void 0) {
1166
+ setAtPath(config, pathArr, process.env[envVar]);
1167
+ }
1168
+ }
1162
1169
  return config;
1163
1170
  }
1164
1171
  function mapCliToConfig(args) {
@@ -1170,6 +1177,25 @@ function mapCliToConfig(args) {
1170
1177
  }
1171
1178
  return config;
1172
1179
  }
1180
+ function camelToUpperSnake(str) {
1181
+ return str.replace(/([a-z])([A-Z])/g, "$1_$2").toUpperCase();
1182
+ }
1183
+ function pathToEnvVar(pathArr) {
1184
+ return `DOCS_MCP_${pathArr.map(camelToUpperSnake).join("_")}`;
1185
+ }
1186
+ function collectLeafPaths(obj, prefix = []) {
1187
+ const paths = [];
1188
+ for (const [key, value] of Object.entries(obj)) {
1189
+ const currentPath = [...prefix, key];
1190
+ if (value !== null && typeof value === "object" && !Array.isArray(value)) {
1191
+ paths.push(...collectLeafPaths(value, currentPath));
1192
+ } else {
1193
+ paths.push(currentPath);
1194
+ }
1195
+ }
1196
+ return paths;
1197
+ }
1198
+ const ALL_CONFIG_LEAF_PATHS = collectLeafPaths(DEFAULT_CONFIG);
1173
1199
  function setAtPath(obj, pathArr, value) {
1174
1200
  let current = obj;
1175
1201
  for (let i = 0; i < pathArr.length - 1; i++) {
@@ -1206,17 +1232,168 @@ function deepMerge(target, source) {
1206
1232
  }
1207
1233
  return output;
1208
1234
  }
1235
+ function isValidConfigPath(path2) {
1236
+ const pathArr = path2.split(".");
1237
+ return getAtPath(DEFAULT_CONFIG, pathArr) !== void 0;
1238
+ }
1239
+ function getConfigValue(config, path2) {
1240
+ const pathArr = path2.split(".");
1241
+ return getAtPath(config, pathArr);
1242
+ }
1243
+ function parseConfigValue(value) {
1244
+ const num = Number(value);
1245
+ if (!Number.isNaN(num) && value.trim() !== "") {
1246
+ return num;
1247
+ }
1248
+ const lower = value.toLowerCase();
1249
+ if (lower === "true") return true;
1250
+ if (lower === "false") return false;
1251
+ return value;
1252
+ }
1253
+ function setConfigValue(path2, value) {
1254
+ const configPath = getDefaultConfigPath();
1255
+ const fileConfig = loadConfigFile(configPath) || {};
1256
+ const pathArr = path2.split(".");
1257
+ const parsedValue = parseConfigValue(value);
1258
+ const updatedConfig = JSON.parse(JSON.stringify(fileConfig));
1259
+ setAtPath(updatedConfig, pathArr, parsedValue);
1260
+ try {
1261
+ AppConfigSchema.parse(updatedConfig);
1262
+ } catch (err) {
1263
+ const errorMsg = err instanceof Error ? err.message : String(err);
1264
+ throw new Error(`Invalid config value for "${path2}": ${errorMsg}`);
1265
+ }
1266
+ saveConfigFile(configPath, updatedConfig);
1267
+ return configPath;
1268
+ }
1269
+ function getDefaultConfigPath() {
1270
+ return path.join(systemPaths.config, "config.yaml");
1271
+ }
1272
+ function formatOutput$1(value, format) {
1273
+ if (format === "auto") {
1274
+ if (typeof value === "object" && value !== null) {
1275
+ return JSON.stringify(value, null, 2);
1276
+ }
1277
+ return String(value);
1278
+ }
1279
+ if (format === "yaml") {
1280
+ return yaml.stringify(value).trim();
1281
+ }
1282
+ return JSON.stringify(value, null, 2);
1283
+ }
1209
1284
  function createConfigCommand(cli) {
1210
1285
  cli.command(
1211
1286
  "config",
1212
- "Fetch a URL and transform it into Markdown format",
1213
- (yargs2) => yargs2,
1287
+ "View or modify configuration",
1288
+ (yargs2) => {
1289
+ return yargs2.option("json", {
1290
+ type: "boolean",
1291
+ description: "Output in JSON format",
1292
+ conflicts: "yaml"
1293
+ }).option("yaml", {
1294
+ type: "boolean",
1295
+ description: "Output in YAML format",
1296
+ conflicts: "json"
1297
+ }).command(
1298
+ "get <path>",
1299
+ "Get a configuration value",
1300
+ (y) => y.positional("path", {
1301
+ type: "string",
1302
+ description: "Dot-separated config path (e.g., scraper.maxPages)",
1303
+ demandOption: true
1304
+ }).option("json", {
1305
+ type: "boolean",
1306
+ description: "Output in JSON format",
1307
+ conflicts: "yaml"
1308
+ }).option("yaml", {
1309
+ type: "boolean",
1310
+ description: "Output in YAML format",
1311
+ conflicts: "json"
1312
+ }),
1313
+ (argv) => {
1314
+ const path2 = argv.path;
1315
+ if (!isValidConfigPath(path2)) {
1316
+ console.error(`Error: Invalid config path '${path2}'`);
1317
+ console.error("Use 'docs-mcp-server config' to see all available paths.");
1318
+ process.exitCode = 1;
1319
+ return;
1320
+ }
1321
+ const config = loadConfig(argv, {
1322
+ configPath: argv.config,
1323
+ searchDir: argv.storePath
1324
+ });
1325
+ const value = getConfigValue(config, path2);
1326
+ const format = argv.json ? "json" : argv.yaml ? "yaml" : "auto";
1327
+ console.log(formatOutput$1(value, format));
1328
+ }
1329
+ ).command(
1330
+ "set <path> <value>",
1331
+ "Set a configuration value",
1332
+ (y) => y.positional("path", {
1333
+ type: "string",
1334
+ description: "Dot-separated config path (e.g., scraper.maxPages)",
1335
+ demandOption: true
1336
+ }).positional("value", {
1337
+ type: "string",
1338
+ description: "Value to set",
1339
+ demandOption: true
1340
+ }),
1341
+ (argv) => {
1342
+ const configPath = argv.config;
1343
+ const path2 = argv.path;
1344
+ const value = argv.value;
1345
+ if (configPath) {
1346
+ console.error(
1347
+ "Error: Cannot modify configuration when using explicit --config file."
1348
+ );
1349
+ console.error(
1350
+ "Remove the --config flag to modify the default configuration."
1351
+ );
1352
+ process.exitCode = 1;
1353
+ return;
1354
+ }
1355
+ if (!isValidConfigPath(path2)) {
1356
+ console.error(`Error: Invalid config path '${path2}'`);
1357
+ console.error("Use 'docs-mcp-server config' to see all available paths.");
1358
+ process.exitCode = 1;
1359
+ return;
1360
+ }
1361
+ const config = loadConfig(argv, {
1362
+ configPath: argv.config,
1363
+ searchDir: argv.storePath
1364
+ });
1365
+ const currentValue = getConfigValue(config, path2);
1366
+ if (currentValue !== void 0 && currentValue !== null && typeof currentValue === "object" && !Array.isArray(currentValue)) {
1367
+ console.error(
1368
+ `Error: Config path '${path2}' refers to an object. Use a more specific leaf path to set a scalar value.`
1369
+ );
1370
+ console.error(
1371
+ "Hint: Run 'docs-mcp-server config' to inspect the current structure."
1372
+ );
1373
+ process.exitCode = 1;
1374
+ return;
1375
+ }
1376
+ try {
1377
+ const savedPath = setConfigValue(path2, value);
1378
+ const parsedValue = parseConfigValue(value);
1379
+ console.log(`Updated ${path2} = ${JSON.stringify(parsedValue)}`);
1380
+ console.log(`Saved to: ${savedPath}`);
1381
+ } catch (error) {
1382
+ console.error(
1383
+ `Error: Failed to save configuration: ${error instanceof Error ? error.message : String(error)}`
1384
+ );
1385
+ process.exitCode = 1;
1386
+ }
1387
+ }
1388
+ );
1389
+ },
1214
1390
  (argv) => {
1215
1391
  const config = loadConfig(argv, {
1216
1392
  configPath: argv.config,
1217
1393
  searchDir: argv.storePath
1218
1394
  });
1219
- console.log(JSON.stringify(config, null, 2));
1395
+ const format = argv.json ? "json" : argv.yaml ? "yaml" : "json";
1396
+ console.log(formatOutput$1(config, format));
1220
1397
  }
1221
1398
  );
1222
1399
  }
@@ -2443,42 +2620,135 @@ class MimeTypeUtils {
2443
2620
  static detectMimeTypeFromPath(filePath) {
2444
2621
  const extension = filePath.toLowerCase().split(".").pop();
2445
2622
  const customMimeTypes = {
2623
+ // JavaScript/TypeScript family
2446
2624
  ts: "text/x-typescript",
2447
2625
  tsx: "text/x-tsx",
2626
+ mts: "text/x-typescript",
2627
+ // TypeScript ES modules
2628
+ cts: "text/x-typescript",
2629
+ // TypeScript CommonJS modules
2448
2630
  js: "text/javascript",
2449
2631
  jsx: "text/x-jsx",
2450
2632
  cjs: "text/javascript",
2451
2633
  // CommonJS modules
2452
2634
  mjs: "text/javascript",
2453
2635
  // ES modules
2636
+ // Python family
2454
2637
  py: "text/x-python",
2455
2638
  pyw: "text/x-python",
2456
2639
  pyi: "text/x-python",
2640
+ pyx: "text/x-cython",
2641
+ // Cython
2642
+ pxd: "text/x-cython",
2643
+ // Cython
2644
+ // Systems languages
2457
2645
  go: "text/x-go",
2458
2646
  rs: "text/x-rust",
2647
+ c: "text/x-csrc",
2648
+ h: "text/x-chdr",
2649
+ cpp: "text/x-c++src",
2650
+ cxx: "text/x-c++src",
2651
+ cc: "text/x-c++src",
2652
+ hpp: "text/x-c++hdr",
2653
+ hxx: "text/x-c++hdr",
2654
+ zig: "text/x-zig",
2655
+ nim: "text/x-nim",
2656
+ v: "text/x-v",
2657
+ cr: "text/x-crystal",
2658
+ // JVM languages
2459
2659
  kt: "text/x-kotlin",
2660
+ kts: "text/x-kotlin",
2661
+ // Kotlin script
2460
2662
  scala: "text/x-scala",
2663
+ groovy: "text/x-groovy",
2664
+ gradle: "text/x-gradle",
2665
+ // Apple/Mobile
2461
2666
  swift: "text/x-swift",
2667
+ dart: "text/x-dart",
2668
+ // Scripting languages
2462
2669
  rb: "text/x-ruby",
2670
+ rake: "text/x-ruby",
2671
+ // Rakefile
2463
2672
  php: "text/x-php",
2673
+ lua: "text/x-lua",
2674
+ pl: "text/x-perl",
2675
+ pm: "text/x-perl",
2676
+ r: "text/x-r",
2677
+ // Also handles .R since extension is lowercased
2678
+ // Functional languages
2679
+ hs: "text/x-haskell",
2680
+ lhs: "text/x-haskell",
2681
+ // Literate Haskell
2682
+ elm: "text/x-elm",
2683
+ erl: "text/x-erlang",
2684
+ ex: "text/x-elixir",
2685
+ exs: "text/x-elixir",
2686
+ clj: "text/x-clojure",
2687
+ cljs: "text/x-clojure",
2688
+ cljc: "text/x-clojure",
2689
+ jl: "text/x-julia",
2690
+ // .NET
2464
2691
  cs: "text/x-csharp",
2465
- cpp: "text/x-c++src",
2466
- cxx: "text/x-c++src",
2467
- cc: "text/x-c++src",
2468
- hpp: "text/x-c++hdr",
2469
- hxx: "text/x-c++hdr",
2470
- h: "text/x-chdr",
2471
- c: "text/x-csrc",
2692
+ // Web3/Smart contracts
2693
+ sol: "text/x-solidity",
2694
+ move: "text/x-move",
2695
+ cairo: "text/x-cairo",
2696
+ // Modern web frameworks
2697
+ vue: "text/x-vue",
2698
+ svelte: "text/x-svelte",
2699
+ astro: "text/x-astro",
2700
+ // Shell scripting
2472
2701
  sh: "text/x-shellscript",
2473
2702
  bash: "text/x-shellscript",
2474
2703
  zsh: "text/x-shellscript",
2475
2704
  fish: "text/x-shellscript",
2476
2705
  ps1: "text/x-powershell",
2706
+ // Documentation formats
2707
+ rst: "text/x-rst",
2708
+ // reStructuredText
2709
+ adoc: "text/x-asciidoc",
2710
+ asciidoc: "text/x-asciidoc",
2711
+ textile: "text/x-textile",
2712
+ org: "text/x-org",
2713
+ // Org-mode
2714
+ pod: "text/x-pod",
2715
+ // Perl documentation
2716
+ rdoc: "text/x-rdoc",
2717
+ // Ruby documentation
2718
+ wiki: "text/x-wiki",
2719
+ rmd: "text/x-rmarkdown",
2720
+ // R Markdown
2721
+ // Configuration files
2722
+ toml: "text/x-toml",
2723
+ ini: "text/x-ini",
2724
+ cfg: "text/x-ini",
2725
+ conf: "text/x-conf",
2726
+ properties: "text/x-properties",
2727
+ env: "text/x-dotenv",
2728
+ // Build systems
2729
+ dockerfile: "text/x-dockerfile",
2730
+ containerfile: "text/x-dockerfile",
2731
+ makefile: "text/x-makefile",
2732
+ cmake: "text/x-cmake",
2733
+ bazel: "text/x-bazel",
2734
+ bzl: "text/x-bazel",
2735
+ buck: "text/x-buck",
2736
+ // Infrastructure as Code
2737
+ tf: "text/x-terraform",
2738
+ tfvars: "text/x-terraform",
2739
+ hcl: "text/x-hcl",
2740
+ // Data/Query languages
2477
2741
  sql: "text/x-sql",
2478
2742
  graphql: "text/x-graphql",
2479
2743
  gql: "text/x-graphql",
2744
+ // Schema/API definitions
2480
2745
  proto: "text/x-proto",
2481
- dockerfile: "text/x-dockerfile"
2746
+ prisma: "text/x-prisma",
2747
+ thrift: "text/x-thrift",
2748
+ avro: "text/x-avro",
2749
+ // TeX/LaTeX
2750
+ tex: "text/x-tex",
2751
+ latex: "text/x-latex"
2482
2752
  };
2483
2753
  if (extension && customMimeTypes[extension]) {
2484
2754
  return customMimeTypes[extension];
@@ -2498,8 +2768,24 @@ class MimeTypeUtils {
2498
2768
  return null;
2499
2769
  }
2500
2770
  const mimeTypeNormalization = {
2501
- "application/node": "text/javascript"
2502
- // .cjs files are detected as this
2771
+ "application/node": "text/javascript",
2772
+ // .cjs files
2773
+ "video/mp2t": "text/x-typescript",
2774
+ // .ts/.mts files (MPEG-2 transport stream conflict)
2775
+ "application/rls-services+xml": "text/x-rust",
2776
+ // .rs files
2777
+ "application/vnd.lotus-organizer": "text/x-org",
2778
+ // .org files (Lotus Organizer conflict)
2779
+ "application/vnd.dart": "text/x-dart",
2780
+ // .dart files
2781
+ "application/x-perl": "text/x-perl",
2782
+ // .pl/.pm files
2783
+ "application/x-tex": "text/x-tex",
2784
+ // .tex files
2785
+ "application/x-latex": "text/x-latex",
2786
+ // .latex files
2787
+ "application/toml": "text/x-toml"
2788
+ // .toml files
2503
2789
  };
2504
2790
  return mimeTypeNormalization[mimeType] || mimeType;
2505
2791
  }
@@ -2511,6 +2797,7 @@ class MimeTypeUtils {
2511
2797
  */
2512
2798
  static extractLanguageFromMimeType(mimeType) {
2513
2799
  const mimeToLanguage = {
2800
+ // JavaScript/TypeScript
2514
2801
  "text/x-typescript": "typescript",
2515
2802
  "text/typescript": "typescript",
2516
2803
  "application/typescript": "typescript",
@@ -2519,22 +2806,84 @@ class MimeTypeUtils {
2519
2806
  "application/javascript": "javascript",
2520
2807
  "application/x-javascript": "javascript",
2521
2808
  "text/x-jsx": "jsx",
2809
+ // Python
2522
2810
  "text/x-python": "python",
2523
- "text/x-java": "java",
2811
+ "text/x-cython": "cython",
2812
+ // Systems languages
2524
2813
  "text/x-c": "c",
2525
2814
  "text/x-csrc": "c",
2526
2815
  "text/x-chdr": "c",
2527
2816
  "text/x-c++": "cpp",
2528
2817
  "text/x-c++src": "cpp",
2529
2818
  "text/x-c++hdr": "cpp",
2530
- "text/x-csharp": "csharp",
2531
2819
  "text/x-go": "go",
2532
2820
  "text/x-rust": "rust",
2533
- "text/x-php": "php",
2534
- "text/x-ruby": "ruby",
2535
- "text/x-swift": "swift",
2821
+ "text/x-zig": "zig",
2822
+ "text/x-nim": "nim",
2823
+ "text/x-v": "v",
2824
+ "text/x-crystal": "crystal",
2825
+ // JVM languages
2826
+ "text/x-java": "java",
2536
2827
  "text/x-kotlin": "kotlin",
2537
2828
  "text/x-scala": "scala",
2829
+ "text/x-groovy": "groovy",
2830
+ "text/x-gradle": "groovy",
2831
+ // Apple/Mobile
2832
+ "text/x-swift": "swift",
2833
+ "text/x-dart": "dart",
2834
+ // .NET
2835
+ "text/x-csharp": "csharp",
2836
+ // Scripting languages
2837
+ "text/x-ruby": "ruby",
2838
+ "text/x-php": "php",
2839
+ "text/x-lua": "lua",
2840
+ "text/x-perl": "perl",
2841
+ "text/x-r": "r",
2842
+ // Functional languages
2843
+ "text/x-haskell": "haskell",
2844
+ "text/x-elm": "elm",
2845
+ "text/x-erlang": "erlang",
2846
+ "text/x-elixir": "elixir",
2847
+ "text/x-clojure": "clojure",
2848
+ "text/x-julia": "julia",
2849
+ // Web3/Smart contracts
2850
+ "text/x-solidity": "solidity",
2851
+ "text/x-move": "move",
2852
+ "text/x-cairo": "cairo",
2853
+ // Modern web frameworks
2854
+ "text/x-vue": "vue",
2855
+ "text/x-svelte": "svelte",
2856
+ "text/x-astro": "astro",
2857
+ // Shell
2858
+ "text/x-sh": "bash",
2859
+ "text/x-shellscript": "bash",
2860
+ "application/x-sh": "bash",
2861
+ "text/x-powershell": "powershell",
2862
+ // Documentation formats
2863
+ "text/x-rst": "rst",
2864
+ "text/x-asciidoc": "asciidoc",
2865
+ "text/x-textile": "textile",
2866
+ "text/x-org": "org",
2867
+ "text/x-pod": "pod",
2868
+ "text/x-rdoc": "rdoc",
2869
+ "text/x-wiki": "wiki",
2870
+ "text/x-rmarkdown": "rmarkdown",
2871
+ // Configuration files
2872
+ "text/x-toml": "toml",
2873
+ "text/x-ini": "ini",
2874
+ "text/x-conf": "conf",
2875
+ "text/x-properties": "properties",
2876
+ "text/x-dotenv": "dotenv",
2877
+ // Build systems
2878
+ "text/x-dockerfile": "dockerfile",
2879
+ "text/x-makefile": "makefile",
2880
+ "text/x-cmake": "cmake",
2881
+ "text/x-bazel": "bazel",
2882
+ "text/x-buck": "buck",
2883
+ // Infrastructure as Code
2884
+ "text/x-terraform": "hcl",
2885
+ "text/x-hcl": "hcl",
2886
+ // Data formats
2538
2887
  "text/x-yaml": "yaml",
2539
2888
  "application/x-yaml": "yaml",
2540
2889
  "application/yaml": "yaml",
@@ -2544,13 +2893,15 @@ class MimeTypeUtils {
2544
2893
  "text/xml": "xml",
2545
2894
  "application/xml": "xml",
2546
2895
  "text/x-sql": "sql",
2547
- "text/x-sh": "bash",
2548
- "text/x-shellscript": "bash",
2549
- "application/x-sh": "bash",
2550
- "text/x-powershell": "powershell",
2551
2896
  "text/x-graphql": "graphql",
2897
+ // Schema/API definitions
2552
2898
  "text/x-proto": "protobuf",
2553
- "text/x-dockerfile": "dockerfile"
2899
+ "text/x-prisma": "prisma",
2900
+ "text/x-thrift": "thrift",
2901
+ "text/x-avro": "avro",
2902
+ // TeX/LaTeX
2903
+ "text/x-tex": "tex",
2904
+ "text/x-latex": "latex"
2554
2905
  };
2555
2906
  return mimeToLanguage[mimeType] || "";
2556
2907
  }
@@ -2672,22 +3023,29 @@ class BrowserFetcher {
2672
3023
  }
2673
3024
  }
2674
3025
  /**
2675
- * Close the browser and clean up resources
3026
+ * Close the browser and clean up resources.
3027
+ * Always attempts cleanup even if browser is disconnected to reap zombie processes.
2676
3028
  */
2677
3029
  async close() {
2678
- try {
2679
- if (this.page) {
3030
+ if (this.page) {
3031
+ try {
2680
3032
  await this.page.close();
3033
+ } catch (error) {
3034
+ logger.warn(`⚠️ Error closing browser page: ${error}`);
3035
+ } finally {
2681
3036
  this.page = null;
2682
3037
  }
2683
- if (this.browser) {
3038
+ }
3039
+ if (this.browser) {
3040
+ try {
2684
3041
  await this.browser.close();
3042
+ } catch (error) {
3043
+ logger.warn(`⚠️ Error closing browser: ${error}`);
3044
+ } finally {
2685
3045
  this.browser = null;
2686
3046
  }
2687
- logger.debug("Browser closed successfully");
2688
- } catch (error) {
2689
- logger.warn(`⚠️ Error closing browser: ${error}`);
2690
3047
  }
3048
+ logger.debug("Browser closed successfully");
2691
3049
  }
2692
3050
  }
2693
3051
  class FileFetcher {
@@ -4019,7 +4377,7 @@ class DocumentPipeline extends BasePipeline {
4019
4377
  constructor(config) {
4020
4378
  super();
4021
4379
  this.markitdown = new MarkItDown();
4022
- this.maxSize = config.document.maxSize;
4380
+ this.maxSize = config.scraper.document.maxSize;
4023
4381
  const semanticSplitter = new SemanticMarkdownSplitter(
4024
4382
  config.splitter.preferredChunkSize,
4025
4383
  config.splitter.maxChunkSize
@@ -4505,12 +4863,18 @@ class HtmlPlaywrightMiddleware {
4505
4863
  /**
4506
4864
  * Closes the Playwright browser instance if it exists.
4507
4865
  * Should be called during application shutdown.
4866
+ * Attempts to close even if the browser is disconnected to ensure proper cleanup of zombie processes.
4508
4867
  */
4509
4868
  async closeBrowser() {
4510
- if (this.browser?.isConnected()) {
4511
- logger.debug("Closing Playwright browser instance...");
4512
- await this.browser.close();
4513
- this.browser = null;
4869
+ if (this.browser) {
4870
+ try {
4871
+ logger.debug("Closing Playwright browser instance...");
4872
+ await this.browser.close();
4873
+ } catch (error) {
4874
+ logger.warn(`⚠️ Error closing Playwright browser: ${error}`);
4875
+ } finally {
4876
+ this.browser = null;
4877
+ }
4514
4878
  }
4515
4879
  }
4516
4880
  /**
@@ -5615,10 +5979,15 @@ class HtmlPipeline extends BasePipeline {
5615
5979
  }
5616
5980
  /**
5617
5981
  * Cleanup resources used by this pipeline, specifically the Playwright browser instance.
5982
+ * Errors during cleanup are logged but not propagated to ensure graceful shutdown.
5618
5983
  */
5619
5984
  async close() {
5620
5985
  await super.close();
5621
- await this.playwrightMiddleware.closeBrowser();
5986
+ try {
5987
+ await this.playwrightMiddleware.closeBrowser();
5988
+ } catch (error) {
5989
+ logger.warn(`⚠️ Error during browser cleanup: ${error}`);
5990
+ }
5622
5991
  }
5623
5992
  }
5624
5993
  class TextDocumentSplitter {
@@ -6643,6 +7012,11 @@ class TypeScriptParser {
6643
7012
  ".cjs"
6644
7013
  ];
6645
7014
  mimeTypes = [
7015
+ // text/x-* variants (output by MimeTypeUtils.detectMimeTypeFromPath)
7016
+ "text/x-typescript",
7017
+ "text/x-tsx",
7018
+ "text/x-jsx",
7019
+ // Standard variants
6646
7020
  "text/typescript",
6647
7021
  "application/typescript",
6648
7022
  "text/tsx",
@@ -6984,6 +7358,8 @@ class LanguageParserRegistry {
6984
7358
  // Narrow advertised extensions/mime types for the alias (informational only).
6985
7359
  fileExtensions: [".js", ".jsx", ".mjs", ".cjs"],
6986
7360
  mimeTypes: [
7361
+ "text/x-jsx",
7362
+ // Output by MimeTypeUtils.detectMimeTypeFromPath
6987
7363
  "text/javascript",
6988
7364
  "application/javascript",
6989
7365
  "text/jsx",
@@ -6996,6 +7372,8 @@ class LanguageParserRegistry {
6996
7372
  this.extensionMap.set(ext.toLowerCase(), "javascript");
6997
7373
  }
6998
7374
  const jsMimes = [
7375
+ "text/x-jsx",
7376
+ // Output by MimeTypeUtils.detectMimeTypeFromPath
6999
7377
  "text/javascript",
7000
7378
  "application/javascript",
7001
7379
  "text/jsx",
@@ -11635,7 +12013,7 @@ const Layout = ({
11635
12013
  children,
11636
12014
  eventClientConfig
11637
12015
  }) => {
11638
- const versionString = version || "1.37.0";
12016
+ const versionString = version || "2.0.0";
11639
12017
  const versionInitializer = `versionUpdate({ currentVersion: ${`'${versionString}'`} })`;
11640
12018
  return /* @__PURE__ */ jsxs("html", { lang: "en", children: [
11641
12019
  /* @__PURE__ */ jsxs("head", { children: [
@@ -13985,7 +14363,7 @@ class AppServer {
13985
14363
  try {
13986
14364
  if (telemetry.isEnabled()) {
13987
14365
  telemetry.setGlobalContext({
13988
- appVersion: "1.37.0",
14366
+ appVersion: "2.0.0",
13989
14367
  appPlatform: process.platform,
13990
14368
  appNodeVersion: process.version,
13991
14369
  appServicesEnabled: this.getActiveServicesList(),
@@ -14728,6 +15106,9 @@ class BaseScraperStrategy {
14728
15106
  return null;
14729
15107
  }).filter((item2) => item2 !== null);
14730
15108
  } catch (error) {
15109
+ if (item.depth === 0) {
15110
+ throw error;
15111
+ }
14731
15112
  if (options.ignoreErrors) {
14732
15113
  logger.error(`❌ Failed to process ${item.url}: ${error}`);
14733
15114
  return [];
@@ -14842,10 +15223,10 @@ class GitHubRepoProcessor {
14842
15223
  /**
14843
15224
  * Fetches the raw content of a file from GitHub.
14844
15225
  */
14845
- async fetchFileContent(repoInfo, filePath, etag, signal) {
15226
+ async fetchFileContent(repoInfo, filePath, etag, headers, signal) {
14846
15227
  const { owner, repo, branch } = repoInfo;
14847
15228
  const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;
14848
- const rawContent = await this.httpFetcher.fetch(rawUrl, { signal, etag });
15229
+ const rawContent = await this.httpFetcher.fetch(rawUrl, { signal, etag, headers });
14849
15230
  const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
14850
15231
  if (detectedMimeType && (rawContent.mimeType === "text/plain" || rawContent.mimeType === "application/octet-stream")) {
14851
15232
  return {
@@ -14858,13 +15239,14 @@ class GitHubRepoProcessor {
14858
15239
  /**
14859
15240
  * Processes a single GitHub repository file from an HTTPS blob URL.
14860
15241
  */
14861
- async process(item, options, signal) {
15242
+ async process(item, options, headers, signal) {
14862
15243
  const repoInfo = this.parseHttpsBlobUrl(item.url);
14863
15244
  const { owner, repo, branch, filePath } = repoInfo;
14864
15245
  const rawContent = await this.fetchFileContent(
14865
15246
  { owner, repo, branch },
14866
15247
  filePath,
14867
15248
  item.etag,
15249
+ headers,
14868
15250
  signal
14869
15251
  );
14870
15252
  if (rawContent.status !== FetchStatus.SUCCESS) {
@@ -14955,12 +15337,13 @@ class GitHubWikiProcessor {
14955
15337
  /**
14956
15338
  * Processes a single GitHub wiki page.
14957
15339
  */
14958
- async process(item, options, signal) {
15340
+ async process(item, options, headers, signal) {
14959
15341
  const currentUrl = item.url;
14960
15342
  try {
14961
15343
  const rawContent = await this.httpFetcher.fetch(currentUrl, {
14962
15344
  signal,
14963
- etag: item.etag
15345
+ etag: item.etag,
15346
+ headers
14964
15347
  });
14965
15348
  if (rawContent.status !== FetchStatus.SUCCESS) {
14966
15349
  return { url: currentUrl, links: [], status: rawContent.status };
@@ -15031,10 +15414,52 @@ class GitHubWikiProcessor {
15031
15414
  await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
15032
15415
  }
15033
15416
  }
15417
+ const execAsync = promisify(exec);
15418
+ async function resolveGitHubAuth(explicitHeaders) {
15419
+ if (explicitHeaders) {
15420
+ const hasAuthHeader = Object.keys(explicitHeaders).some(
15421
+ (key) => key.toLowerCase() === "authorization"
15422
+ );
15423
+ if (hasAuthHeader) {
15424
+ return explicitHeaders;
15425
+ }
15426
+ }
15427
+ const githubToken = process.env.GITHUB_TOKEN;
15428
+ if (githubToken) {
15429
+ logger.debug("Using GitHub token from GITHUB_TOKEN environment variable");
15430
+ return {
15431
+ ...explicitHeaders,
15432
+ Authorization: `Bearer ${githubToken}`
15433
+ };
15434
+ }
15435
+ const ghToken = process.env.GH_TOKEN;
15436
+ if (ghToken) {
15437
+ logger.debug("Using GitHub token from GH_TOKEN environment variable");
15438
+ return {
15439
+ ...explicitHeaders,
15440
+ Authorization: `Bearer ${ghToken}`
15441
+ };
15442
+ }
15443
+ try {
15444
+ const { stdout } = await execAsync("gh auth token", { timeout: 5e3 });
15445
+ const cliToken = stdout.trim();
15446
+ if (cliToken) {
15447
+ logger.debug("Using GitHub token from local gh CLI");
15448
+ return {
15449
+ ...explicitHeaders,
15450
+ Authorization: `Bearer ${cliToken}`
15451
+ };
15452
+ }
15453
+ } catch {
15454
+ }
15455
+ return explicitHeaders ?? {};
15456
+ }
15034
15457
  class GitHubScraperStrategy extends BaseScraperStrategy {
15035
15458
  httpFetcher;
15036
15459
  wikiProcessor;
15037
15460
  repoProcessor;
15461
+ resolvedAuthHeaders;
15462
+ resolvedAuthKey;
15038
15463
  constructor(config) {
15039
15464
  super(config);
15040
15465
  this.httpFetcher = new HttpFetcher(config.scraper);
@@ -15091,31 +15516,117 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
15091
15516
  }
15092
15517
  return { owner, repo };
15093
15518
  }
15519
+ buildAuthCacheKey(explicitHeaders) {
15520
+ const normalizedHeaders = explicitHeaders ? Object.keys(explicitHeaders).sort().map((key) => [key, explicitHeaders[key]]) : [];
15521
+ const envKey = `${process.env.GITHUB_TOKEN ?? ""}|${process.env.GH_TOKEN ?? ""}`;
15522
+ return JSON.stringify({ headers: normalizedHeaders, env: envKey });
15523
+ }
15524
+ async getResolvedAuthHeaders(explicitHeaders) {
15525
+ const cacheKey = this.buildAuthCacheKey(explicitHeaders);
15526
+ if (this.resolvedAuthHeaders && this.resolvedAuthKey === cacheKey) {
15527
+ return this.resolvedAuthHeaders;
15528
+ }
15529
+ const resolved = await resolveGitHubAuth(explicitHeaders);
15530
+ this.resolvedAuthHeaders = resolved;
15531
+ this.resolvedAuthKey = cacheKey;
15532
+ return resolved;
15533
+ }
15094
15534
  /**
15095
15535
  * Fetches the repository tree structure from GitHub API.
15096
15536
  */
15097
- async fetchRepositoryTree(repoInfo, signal) {
15537
+ async fetchRepositoryTree(repoInfo, headers, signal) {
15098
15538
  const { owner, repo, branch } = repoInfo;
15099
15539
  let targetBranch = branch;
15100
15540
  if (!targetBranch) {
15541
+ const repoUrl = `https://api.github.com/repos/${owner}/${repo}`;
15542
+ logger.debug(`Fetching repository info: ${repoUrl}`);
15543
+ let repoContent;
15544
+ try {
15545
+ repoContent = await this.httpFetcher.fetch(repoUrl, { signal, headers });
15546
+ } catch (error) {
15547
+ if (error instanceof ScraperError) {
15548
+ if (error.message.includes("401")) {
15549
+ throw new ScraperError(
15550
+ `GitHub authentication failed for "${owner}/${repo}". Your token is invalid or expired. Please check your GITHUB_TOKEN or GH_TOKEN environment variable.`,
15551
+ false,
15552
+ error
15553
+ );
15554
+ }
15555
+ if (error.message.includes("403")) {
15556
+ throw new ScraperError(
15557
+ `GitHub access denied for "${owner}/${repo}". Your token may lack the required permissions, or you may be rate-limited. Please check your GITHUB_TOKEN or GH_TOKEN.`,
15558
+ false,
15559
+ error
15560
+ );
15561
+ }
15562
+ }
15563
+ throw error;
15564
+ }
15565
+ if (repoContent.status === FetchStatus.NOT_FOUND) {
15566
+ throw new ScraperError(
15567
+ `Repository "${owner}/${repo}" not found or not accessible. For private repositories, set the GITHUB_TOKEN environment variable.`,
15568
+ false
15569
+ );
15570
+ }
15101
15571
  try {
15102
- const repoUrl = `https://api.github.com/repos/${owner}/${repo}`;
15103
- logger.debug(`Fetching repository info: ${repoUrl}`);
15104
- const repoContent = await this.httpFetcher.fetch(repoUrl, { signal });
15105
15572
  const content2 = typeof repoContent.content === "string" ? repoContent.content : repoContent.content.toString("utf-8");
15106
15573
  const repoData = JSON.parse(content2);
15107
- targetBranch = repoData.default_branch;
15108
- logger.debug(`Using default branch: ${targetBranch}`);
15109
- } catch (error) {
15110
- logger.warn(`⚠️ Could not fetch default branch, using 'main': ${error}`);
15574
+ const defaultBranch = typeof repoData.default_branch === "string" ? repoData.default_branch.trim() : "";
15575
+ if (!defaultBranch) {
15576
+ logger.warn(
15577
+ `⚠️ Repository info missing default_branch for ${owner}/${repo}, using 'main'`
15578
+ );
15579
+ targetBranch = "main";
15580
+ } else {
15581
+ targetBranch = defaultBranch;
15582
+ logger.debug(`Using default branch: ${targetBranch}`);
15583
+ }
15584
+ } catch (parseError) {
15585
+ logger.warn(`⚠️ Could not parse repository info, using 'main': ${parseError}`);
15111
15586
  targetBranch = "main";
15112
15587
  }
15113
15588
  }
15114
15589
  const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`;
15115
15590
  logger.debug(`Fetching repository tree: ${treeUrl}`);
15116
- const rawContent = await this.httpFetcher.fetch(treeUrl, { signal });
15591
+ let rawContent;
15592
+ try {
15593
+ rawContent = await this.httpFetcher.fetch(treeUrl, { signal, headers });
15594
+ } catch (error) {
15595
+ if (error instanceof ScraperError) {
15596
+ if (error.message.includes("401")) {
15597
+ throw new ScraperError(
15598
+ `GitHub authentication failed for "${owner}/${repo}". Your token is invalid or expired. Please check your GITHUB_TOKEN or GH_TOKEN environment variable.`,
15599
+ false,
15600
+ error
15601
+ );
15602
+ }
15603
+ if (error.message.includes("403")) {
15604
+ throw new ScraperError(
15605
+ `GitHub access denied for "${owner}/${repo}". Your token may lack the required permissions, or you may be rate-limited. Please check your GITHUB_TOKEN or GH_TOKEN.`,
15606
+ false,
15607
+ error
15608
+ );
15609
+ }
15610
+ }
15611
+ throw error;
15612
+ }
15613
+ if (rawContent.status === FetchStatus.NOT_FOUND) {
15614
+ throw new ScraperError(
15615
+ `Repository "${owner}/${repo}" not found or not accessible. For private repositories, set the GITHUB_TOKEN environment variable.`,
15616
+ false
15617
+ );
15618
+ }
15117
15619
  const content = typeof rawContent.content === "string" ? rawContent.content : rawContent.content.toString("utf-8");
15118
- const treeData = JSON.parse(content);
15620
+ let treeData;
15621
+ try {
15622
+ treeData = JSON.parse(content);
15623
+ } catch (parseError) {
15624
+ throw new ScraperError(
15625
+ `Failed to parse GitHub API response for "${owner}/${repo}". The repository may be inaccessible or the API returned an unexpected response.`,
15626
+ false,
15627
+ parseError instanceof Error ? parseError : void 0
15628
+ );
15629
+ }
15119
15630
  if (treeData.truncated) {
15120
15631
  logger.warn(
15121
15632
  `⚠️ Repository tree was truncated for ${owner}/${repo}. Some files may be missing.`
@@ -15254,7 +15765,7 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
15254
15765
  if (hasTextExtension || hasDocumentExtension || hasCompoundExtension || isCommonTextFile) {
15255
15766
  return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
15256
15767
  }
15257
- const mimeType = mime.getType(path2);
15768
+ const mimeType = MimeTypeUtils.detectMimeTypeFromPath(path2);
15258
15769
  if (mimeType?.startsWith("text/")) {
15259
15770
  logger.debug(`Including file with text MIME type: ${path2} (${mimeType})`);
15260
15771
  return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
@@ -15289,10 +15800,11 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
15289
15800
  status: FetchStatus.NOT_FOUND
15290
15801
  };
15291
15802
  }
15803
+ const headers = await this.getResolvedAuthHeaders(options.headers);
15292
15804
  try {
15293
15805
  const parsedUrl = new URL(item.url);
15294
15806
  if (/^\/[^/]+\/[^/]+\/wiki($|\/)/.test(parsedUrl.pathname)) {
15295
- return await this.wikiProcessor.process(item, options, signal);
15807
+ return await this.wikiProcessor.process(item, options, headers, signal);
15296
15808
  }
15297
15809
  } catch {
15298
15810
  }
@@ -15318,7 +15830,11 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
15318
15830
  const wikiUrl = `${options.url.replace(/\/$/, "")}/wiki`;
15319
15831
  discoveredLinks.push(wikiUrl);
15320
15832
  logger.debug(`Discovered wiki URL: ${wikiUrl}`);
15321
- const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal);
15833
+ const { tree, resolvedBranch } = await this.fetchRepositoryTree(
15834
+ repoInfo,
15835
+ headers,
15836
+ signal
15837
+ );
15322
15838
  const fileItems = tree.tree.filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)).filter((treeItem) => this.shouldProcessFile(treeItem, options));
15323
15839
  logger.debug(
15324
15840
  `Discovered ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`
@@ -15336,7 +15852,7 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
15336
15852
  const parsedUrl = new URL(item.url);
15337
15853
  if (/^\/[^/]+\/[^/]+\/blob\//.test(parsedUrl.pathname)) {
15338
15854
  logger.debug(`Processing HTTPS blob URL at depth ${item.depth}: ${item.url}`);
15339
- return await this.repoProcessor.process(item, options, signal);
15855
+ return await this.repoProcessor.process(item, options, headers, signal);
15340
15856
  }
15341
15857
  } catch (error) {
15342
15858
  logger.warn(`⚠️ Failed to parse blob URL ${item.url}: ${error}`);
@@ -15350,7 +15866,13 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
15350
15866
  if (!url.hostname.includes("github.com")) {
15351
15867
  throw new Error("URL must be a GitHub URL");
15352
15868
  }
15353
- await super.scrape(options, progressCallback, signal);
15869
+ await this.getResolvedAuthHeaders(options.headers);
15870
+ try {
15871
+ await super.scrape(options, progressCallback, signal);
15872
+ } finally {
15873
+ this.resolvedAuthHeaders = void 0;
15874
+ this.resolvedAuthKey = void 0;
15875
+ }
15354
15876
  }
15355
15877
  async cleanup() {
15356
15878
  await Promise.all([this.wikiProcessor.cleanup(), this.repoProcessor.cleanup()]);
@@ -15726,7 +16248,7 @@ class LocalFileStrategy extends BaseScraperStrategy {
15726
16248
  logger.debug(`Reading archive entry: ${innerPath} inside ${archivePath}`);
15727
16249
  try {
15728
16250
  const contentBuffer = await adapter.getContent(innerPath);
15729
- const mimeType = mime.getType(innerPath) || "application/octet-stream";
16251
+ const mimeType = MimeTypeUtils.detectMimeTypeFromPath(innerPath) || "application/octet-stream";
15730
16252
  const rawContent = {
15731
16253
  source: item.url,
15732
16254
  content: contentBuffer,
@@ -16013,31 +16535,90 @@ class PyPiScraperStrategy {
16013
16535
  }
16014
16536
  }
16015
16537
  class ScraperRegistry {
16016
- strategies;
16538
+ config;
16017
16539
  constructor(config) {
16018
- this.strategies = [
16019
- new NpmScraperStrategy(config),
16020
- new PyPiScraperStrategy(config),
16021
- new GitHubScraperStrategy(config),
16022
- new WebScraperStrategy(config, {}),
16023
- new LocalFileStrategy(config)
16024
- ];
16540
+ this.config = config;
16025
16541
  }
16542
+ /**
16543
+ * Creates and returns a fresh strategy instance for the given URL.
16544
+ * Each call returns a new instance to ensure state isolation between parallel scrapes.
16545
+ */
16026
16546
  getStrategy(url) {
16027
- validateUrl(url);
16028
- const strategy = this.strategies.find((s) => s.canHandle(url));
16029
- if (!strategy) {
16030
- throw new ScraperError(`No strategy found for URL: ${url}`);
16547
+ if (!url.startsWith("github-file://")) {
16548
+ validateUrl(url);
16549
+ }
16550
+ if (isLocalFileUrl(url)) {
16551
+ logger.debug(`Using strategy "LocalFileStrategy" for URL: ${url}`);
16552
+ return new LocalFileStrategy(this.config);
16553
+ }
16554
+ if (isNpmUrl(url)) {
16555
+ logger.debug(`Using strategy "NpmScraperStrategy" for URL: ${url}`);
16556
+ return new NpmScraperStrategy(this.config);
16557
+ }
16558
+ if (isPyPiUrl(url)) {
16559
+ logger.debug(`Using strategy "PyPiScraperStrategy" for URL: ${url}`);
16560
+ return new PyPiScraperStrategy(this.config);
16031
16561
  }
16032
- logger.debug(`Using strategy "${strategy.constructor.name}" for URL: ${url}`);
16033
- return strategy;
16562
+ if (isGitHubUrl(url)) {
16563
+ logger.debug(`Using strategy "GitHubScraperStrategy" for URL: ${url}`);
16564
+ return new GitHubScraperStrategy(this.config);
16565
+ }
16566
+ if (isWebUrl(url)) {
16567
+ logger.debug(`Using strategy "WebScraperStrategy" for URL: ${url}`);
16568
+ return new WebScraperStrategy(this.config, {});
16569
+ }
16570
+ throw new ScraperError(`No strategy found for URL: ${url}`);
16034
16571
  }
16035
- /**
16036
- * Cleanup all registered strategies to prevent resource leaks.
16037
- * Should be called when the registry is no longer needed.
16038
- */
16039
- async cleanup() {
16040
- await Promise.allSettled(this.strategies.map((strategy) => strategy.cleanup?.()));
16572
+ }
16573
+ function isLocalFileUrl(url) {
16574
+ return url.startsWith("file://");
16575
+ }
16576
+ function isNpmUrl(url) {
16577
+ try {
16578
+ const { hostname } = new URL(url);
16579
+ return ["npmjs.org", "npmjs.com", "www.npmjs.com"].includes(hostname);
16580
+ } catch {
16581
+ return false;
16582
+ }
16583
+ }
16584
+ function isPyPiUrl(url) {
16585
+ try {
16586
+ const { hostname } = new URL(url);
16587
+ return ["pypi.org", "www.pypi.org"].includes(hostname);
16588
+ } catch {
16589
+ return false;
16590
+ }
16591
+ }
16592
+ function isGitHubUrl(url) {
16593
+ if (url.startsWith("github-file://")) {
16594
+ return true;
16595
+ }
16596
+ try {
16597
+ const parsedUrl = new URL(url);
16598
+ const { hostname, pathname } = parsedUrl;
16599
+ if (!["github.com", "www.github.com"].includes(hostname)) {
16600
+ return false;
16601
+ }
16602
+ if (pathname.match(/^\/[^/]+\/[^/]+\/?$/)) {
16603
+ return true;
16604
+ }
16605
+ if (pathname.match(/^\/[^/]+\/[^/]+\/tree\//)) {
16606
+ return true;
16607
+ }
16608
+ if (pathname.match(/^\/[^/]+\/[^/]+\/blob\//)) {
16609
+ return true;
16610
+ }
16611
+ return false;
16612
+ } catch {
16613
+ return false;
16614
+ }
16615
+ }
16616
+ function isWebUrl(url) {
16617
+ try {
16618
+ const parsedUrl = new URL(url);
16619
+ return parsedUrl.protocol === "http:" || parsedUrl.protocol === "https:";
16620
+ } catch {
16621
+ return false;
16041
16622
  }
16042
16623
  }
16043
16624
  class ScraperService {
@@ -16048,20 +16629,35 @@ class ScraperService {
16048
16629
  /**
16049
16630
  * Scrapes content from the provided URL using the appropriate strategy.
16050
16631
  * Reports progress via callback and handles errors.
16632
+ * Cleans up strategy resources after scrape completes (success or failure).
16051
16633
  */
16052
16634
  async scrape(options, progressCallback, signal) {
16053
16635
  const strategy = this.registry.getStrategy(options.url);
16054
- if (!strategy) {
16055
- throw new ScraperError(`No scraper strategy found for URL: ${options.url}`, false);
16636
+ let scrapeError = null;
16637
+ let cleanupErrorToThrow = null;
16638
+ try {
16639
+ await strategy.scrape(options, progressCallback, signal);
16640
+ } catch (error) {
16641
+ scrapeError = error instanceof Error ? error : new ScraperError(`Scrape failed for URL: ${options.url}`, false);
16642
+ } finally {
16643
+ try {
16644
+ await strategy.cleanup?.();
16645
+ } catch (cleanupError) {
16646
+ logger.error(`❌ Strategy cleanup failed for ${options.url}: ${cleanupError}`);
16647
+ if (!scrapeError) {
16648
+ cleanupErrorToThrow = cleanupError instanceof Error ? cleanupError : new ScraperError(
16649
+ `Strategy cleanup failed for URL: ${options.url}`,
16650
+ false
16651
+ );
16652
+ }
16653
+ }
16654
+ }
16655
+ if (scrapeError) {
16656
+ throw scrapeError;
16657
+ }
16658
+ if (cleanupErrorToThrow) {
16659
+ throw cleanupErrorToThrow;
16056
16660
  }
16057
- await strategy.scrape(options, progressCallback, signal);
16058
- }
16059
- /**
16060
- * Cleanup the scraper registry and all its strategies.
16061
- * Should be called when the service is no longer needed.
16062
- */
16063
- async cleanup() {
16064
- await this.registry.cleanup();
16065
16661
  }
16066
16662
  }
16067
16663
  class PipelineWorker {
@@ -16224,7 +16820,7 @@ class PipelineManager {
16224
16820
  if (this.shouldRecoverJobs) {
16225
16821
  await this.recoverPendingJobs();
16226
16822
  } else {
16227
- logger.debug("Job recovery disabled for this PipelineManager instance");
16823
+ await this.markInterruptedJobsAsFailed();
16228
16824
  }
16229
16825
  this._processQueue().catch((error) => {
16230
16826
  logger.error(`❌ Error in processQueue during start: ${error}`);
@@ -16232,79 +16828,69 @@ class PipelineManager {
16232
16828
  }
16233
16829
  /**
16234
16830
  * Recovers pending jobs from the database after server restart.
16235
- * Finds versions with RUNNING status and resets them to QUEUED for re-processing.
16236
- * Also loads all QUEUED versions back into the pipeline queue.
16831
+ * Uses enqueueRefreshJob() to properly continue interrupted jobs,
16832
+ * leveraging existing pages and ETags when available.
16237
16833
  */
16238
16834
  async recoverPendingJobs() {
16239
16835
  try {
16240
- const runningVersions = await this.store.getVersionsByStatus([
16241
- VersionStatus.RUNNING
16836
+ const interruptedVersions = await this.store.getVersionsByStatus([
16837
+ VersionStatus.RUNNING,
16838
+ VersionStatus.QUEUED
16242
16839
  ]);
16243
- for (const version of runningVersions) {
16244
- await this.store.updateVersionStatus(version.id, VersionStatus.QUEUED);
16245
- logger.info(
16246
- `🔄 Reset interrupted job to QUEUED: ${version.library_name}@${version.name || "latest"}`
16247
- );
16840
+ if (interruptedVersions.length === 0) {
16841
+ logger.debug("No pending jobs to recover from database");
16842
+ return;
16248
16843
  }
16249
- const queuedVersions = await this.store.getVersionsByStatus([VersionStatus.QUEUED]);
16250
- for (const version of queuedVersions) {
16251
- const jobId = v4();
16252
- const abortController = new AbortController();
16253
- let resolveCompletion;
16254
- let rejectCompletion;
16255
- const completionPromise = new Promise((resolve, reject) => {
16256
- resolveCompletion = resolve;
16257
- rejectCompletion = reject;
16258
- });
16259
- completionPromise.catch(() => {
16260
- });
16261
- let parsedScraperOptions = null;
16262
- if (version.scraper_options) {
16263
- try {
16264
- parsedScraperOptions = JSON.parse(version.scraper_options);
16265
- } catch (error) {
16266
- logger.warn(
16267
- `⚠️ Failed to parse scraper options for ${version.library_name}@${version.name || "latest"}: ${error}`
16268
- );
16269
- }
16844
+ logger.info(
16845
+ `📥 Recovering ${interruptedVersions.length} pending job(s) from database`
16846
+ );
16847
+ for (const version of interruptedVersions) {
16848
+ const versionLabel = `${version.library_name}@${version.name || "latest"}`;
16849
+ try {
16850
+ await this.enqueueRefreshJob(version.library_name, version.name);
16851
+ logger.info(`🔄 Recovering job: ${versionLabel}`);
16852
+ } catch (error) {
16853
+ const errorMessage = `Recovery failed: ${error instanceof Error ? error.message : String(error)}`;
16854
+ await this.store.updateVersionStatus(
16855
+ version.id,
16856
+ VersionStatus.FAILED,
16857
+ errorMessage
16858
+ );
16859
+ logger.warn(`⚠️ Failed to recover job ${versionLabel}: ${error}`);
16270
16860
  }
16271
- const job = {
16272
- id: jobId,
16273
- library: version.library_name,
16274
- version: version.name || "",
16275
- status: PipelineJobStatus.QUEUED,
16276
- progress: null,
16277
- error: null,
16278
- createdAt: new Date(version.created_at),
16279
- // For recovered QUEUED jobs, startedAt must be null to reflect queued state.
16280
- startedAt: null,
16281
- finishedAt: null,
16282
- abortController,
16283
- completionPromise,
16284
- resolveCompletion,
16285
- rejectCompletion,
16286
- // Database fields (single source of truth)
16287
- versionId: version.id,
16288
- versionStatus: version.status,
16289
- progressPages: version.progress_pages,
16290
- progressMaxPages: version.progress_max_pages,
16291
- errorMessage: version.error_message,
16292
- updatedAt: new Date(version.updated_at),
16293
- sourceUrl: version.source_url,
16294
- scraperOptions: parsedScraperOptions
16295
- };
16296
- this.jobMap.set(jobId, job);
16297
- this.jobQueue.push(jobId);
16298
- }
16299
- if (queuedVersions.length > 0) {
16300
- logger.info(`📥 Recovered ${queuedVersions.length} pending job(s) from database`);
16301
- } else {
16302
- logger.debug("No pending jobs to recover from database");
16303
16861
  }
16304
16862
  } catch (error) {
16305
16863
  logger.error(`❌ Failed to recover pending jobs: ${error}`);
16306
16864
  }
16307
16865
  }
16866
+ /**
16867
+ * Marks all interrupted jobs (RUNNING/QUEUED) as FAILED.
16868
+ * Called when recoverJobs is false to allow users to manually retry via UI.
16869
+ */
16870
+ async markInterruptedJobsAsFailed() {
16871
+ try {
16872
+ const interruptedVersions = await this.store.getVersionsByStatus([
16873
+ VersionStatus.RUNNING,
16874
+ VersionStatus.QUEUED
16875
+ ]);
16876
+ if (interruptedVersions.length === 0) {
16877
+ logger.debug("No interrupted jobs to mark as failed");
16878
+ return;
16879
+ }
16880
+ for (const version of interruptedVersions) {
16881
+ await this.store.updateVersionStatus(
16882
+ version.id,
16883
+ VersionStatus.FAILED,
16884
+ "Job interrupted"
16885
+ );
16886
+ logger.info(
16887
+ `❌ Marked interrupted job as failed: ${version.library_name}@${version.name || "latest"}`
16888
+ );
16889
+ }
16890
+ } catch (error) {
16891
+ logger.error(`❌ Failed to mark interrupted jobs as failed: ${error}`);
16892
+ }
16893
+ }
16308
16894
  /**
16309
16895
  * Stops the pipeline manager and attempts to gracefully shut down workers.
16310
16896
  * Currently, it just stops processing new jobs. Cancellation of active jobs
@@ -16317,7 +16903,6 @@ class PipelineManager {
16317
16903
  }
16318
16904
  this.isRunning = false;
16319
16905
  logger.debug("PipelineManager stopping. No new jobs will be started.");
16320
- await this.scraperService.cleanup();
16321
16906
  }
16322
16907
  /**
16323
16908
  * Enqueues a new document processing job, aborting any existing QUEUED/RUNNING job for the same library+version (including unversioned).
@@ -18031,7 +18616,7 @@ function createCli(argv) {
18031
18616
  let globalEventBus = null;
18032
18617
  let globalTelemetryService = null;
18033
18618
  const commandStartTimes = /* @__PURE__ */ new Map();
18034
- const cli = yargs(hideBin(argv)).scriptName("docs-mcp-server").strict().usage("Usage: $0 <command> [options]").version("1.37.0").option("verbose", {
18619
+ const cli = yargs(hideBin(argv)).scriptName("docs-mcp-server").strict().usage("Usage: $0 <command> [options]").version("2.0.0").option("verbose", {
18035
18620
  type: "boolean",
18036
18621
  description: "Enable verbose (debug) logging",
18037
18622
  default: false
@@ -18091,7 +18676,7 @@ function createCli(argv) {
18091
18676
  if (shouldEnableTelemetry() && telemetry.isEnabled()) {
18092
18677
  const commandName = argv2._[0]?.toString() || "default";
18093
18678
  telemetry.setGlobalContext({
18094
- appVersion: "1.37.0",
18679
+ appVersion: "2.0.0",
18095
18680
  appPlatform: process.platform,
18096
18681
  appNodeVersion: process.version,
18097
18682
  appInterface: "cli",