@arabold/docs-mcp-server 1.36.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -64,11 +64,12 @@ import { escapeHtml } from "@kitajs/html";
64
64
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
65
65
  import { v4 } from "uuid";
66
66
  import { minimatch } from "minimatch";
67
+ import { exec, execSync } from "node:child_process";
68
+ import { promisify } from "node:util";
67
69
  import { Readable } from "node:stream";
68
70
  import * as tar from "tar";
69
71
  import yauzl from "yauzl";
70
72
  import os from "node:os";
71
- import { execSync } from "node:child_process";
72
73
  class StoreError extends Error {
73
74
  constructor(message, cause) {
74
75
  super(cause ? `${message} caused by ${cause}` : message);
@@ -919,6 +920,10 @@ const DEFAULT_CONFIG = {
919
920
  baseDelayMs: 1e3,
920
921
  maxCacheItems: 200,
921
922
  maxCacheItemSizeBytes: 500 * 1024
923
+ },
924
+ document: {
925
+ maxSize: 10 * 1024 * 1024
926
+ // 10MB max size for PDF/Office documents
922
927
  }
923
928
  },
924
929
  splitter: {
@@ -957,10 +962,6 @@ const DEFAULT_CONFIG = {
957
962
  precedingSiblingsLimit: 1,
958
963
  subsequentSiblingsLimit: 2,
959
964
  maxChunkDistance: 3
960
- },
961
- document: {
962
- maxSize: 10 * 1024 * 1024
963
- // 10MB max size for PDF/Office documents
964
965
  }
965
966
  };
966
967
  const AppConfigSchema = z.object({
@@ -997,7 +998,10 @@ const AppConfigSchema = z.object({
997
998
  baseDelayMs: z.coerce.number().int().default(DEFAULT_CONFIG.scraper.fetcher.baseDelayMs),
998
999
  maxCacheItems: z.coerce.number().int().default(DEFAULT_CONFIG.scraper.fetcher.maxCacheItems),
999
1000
  maxCacheItemSizeBytes: z.coerce.number().int().default(DEFAULT_CONFIG.scraper.fetcher.maxCacheItemSizeBytes)
1000
- }).default(DEFAULT_CONFIG.scraper.fetcher)
1001
+ }).default(DEFAULT_CONFIG.scraper.fetcher),
1002
+ document: z.object({
1003
+ maxSize: z.coerce.number().int().default(DEFAULT_CONFIG.scraper.document.maxSize)
1004
+ }).default(DEFAULT_CONFIG.scraper.document)
1001
1005
  }).default(DEFAULT_CONFIG.scraper),
1002
1006
  splitter: z.object({
1003
1007
  minChunkSize: z.coerce.number().int().default(DEFAULT_CONFIG.splitter.minChunkSize),
@@ -1035,10 +1039,7 @@ const AppConfigSchema = z.object({
1035
1039
  precedingSiblingsLimit: z.coerce.number().int().default(DEFAULT_CONFIG.assembly.precedingSiblingsLimit),
1036
1040
  subsequentSiblingsLimit: z.coerce.number().int().default(DEFAULT_CONFIG.assembly.subsequentSiblingsLimit),
1037
1041
  maxChunkDistance: z.coerce.number().int().min(0).default(DEFAULT_CONFIG.assembly.maxChunkDistance)
1038
- }).default(DEFAULT_CONFIG.assembly),
1039
- document: z.object({
1040
- maxSize: z.coerce.number().int().default(DEFAULT_CONFIG.document.maxSize)
1041
- }).default(DEFAULT_CONFIG.document)
1042
+ }).default(DEFAULT_CONFIG.assembly)
1042
1043
  });
1043
1044
  const defaults = AppConfigSchema.parse({});
1044
1045
  const configMappings = [
@@ -1159,6 +1160,12 @@ function mapEnvToConfig() {
1159
1160
  }
1160
1161
  }
1161
1162
  }
1163
+ for (const pathArr of ALL_CONFIG_LEAF_PATHS) {
1164
+ const envVar = pathToEnvVar(pathArr);
1165
+ if (process.env[envVar] !== void 0) {
1166
+ setAtPath(config, pathArr, process.env[envVar]);
1167
+ }
1168
+ }
1162
1169
  return config;
1163
1170
  }
1164
1171
  function mapCliToConfig(args) {
@@ -1170,6 +1177,25 @@ function mapCliToConfig(args) {
1170
1177
  }
1171
1178
  return config;
1172
1179
  }
1180
+ function camelToUpperSnake(str) {
1181
+ return str.replace(/([a-z])([A-Z])/g, "$1_$2").toUpperCase();
1182
+ }
1183
+ function pathToEnvVar(pathArr) {
1184
+ return `DOCS_MCP_${pathArr.map(camelToUpperSnake).join("_")}`;
1185
+ }
1186
+ function collectLeafPaths(obj, prefix = []) {
1187
+ const paths = [];
1188
+ for (const [key, value] of Object.entries(obj)) {
1189
+ const currentPath = [...prefix, key];
1190
+ if (value !== null && typeof value === "object" && !Array.isArray(value)) {
1191
+ paths.push(...collectLeafPaths(value, currentPath));
1192
+ } else {
1193
+ paths.push(currentPath);
1194
+ }
1195
+ }
1196
+ return paths;
1197
+ }
1198
+ const ALL_CONFIG_LEAF_PATHS = collectLeafPaths(DEFAULT_CONFIG);
1173
1199
  function setAtPath(obj, pathArr, value) {
1174
1200
  let current = obj;
1175
1201
  for (let i = 0; i < pathArr.length - 1; i++) {
@@ -1206,17 +1232,168 @@ function deepMerge(target, source) {
1206
1232
  }
1207
1233
  return output;
1208
1234
  }
1235
+ function isValidConfigPath(path2) {
1236
+ const pathArr = path2.split(".");
1237
+ return getAtPath(DEFAULT_CONFIG, pathArr) !== void 0;
1238
+ }
1239
+ function getConfigValue(config, path2) {
1240
+ const pathArr = path2.split(".");
1241
+ return getAtPath(config, pathArr);
1242
+ }
1243
+ function parseConfigValue(value) {
1244
+ const num = Number(value);
1245
+ if (!Number.isNaN(num) && value.trim() !== "") {
1246
+ return num;
1247
+ }
1248
+ const lower = value.toLowerCase();
1249
+ if (lower === "true") return true;
1250
+ if (lower === "false") return false;
1251
+ return value;
1252
+ }
1253
+ function setConfigValue(path2, value) {
1254
+ const configPath = getDefaultConfigPath();
1255
+ const fileConfig = loadConfigFile(configPath) || {};
1256
+ const pathArr = path2.split(".");
1257
+ const parsedValue = parseConfigValue(value);
1258
+ const updatedConfig = JSON.parse(JSON.stringify(fileConfig));
1259
+ setAtPath(updatedConfig, pathArr, parsedValue);
1260
+ try {
1261
+ AppConfigSchema.parse(updatedConfig);
1262
+ } catch (err) {
1263
+ const errorMsg = err instanceof Error ? err.message : String(err);
1264
+ throw new Error(`Invalid config value for "${path2}": ${errorMsg}`);
1265
+ }
1266
+ saveConfigFile(configPath, updatedConfig);
1267
+ return configPath;
1268
+ }
1269
+ function getDefaultConfigPath() {
1270
+ return path.join(systemPaths.config, "config.yaml");
1271
+ }
1272
+ function formatOutput$1(value, format) {
1273
+ if (format === "auto") {
1274
+ if (typeof value === "object" && value !== null) {
1275
+ return JSON.stringify(value, null, 2);
1276
+ }
1277
+ return String(value);
1278
+ }
1279
+ if (format === "yaml") {
1280
+ return yaml.stringify(value).trim();
1281
+ }
1282
+ return JSON.stringify(value, null, 2);
1283
+ }
1209
1284
  function createConfigCommand(cli) {
1210
1285
  cli.command(
1211
1286
  "config",
1212
- "Fetch a URL and transform it into Markdown format",
1213
- (yargs2) => yargs2,
1287
+ "View or modify configuration",
1288
+ (yargs2) => {
1289
+ return yargs2.option("json", {
1290
+ type: "boolean",
1291
+ description: "Output in JSON format",
1292
+ conflicts: "yaml"
1293
+ }).option("yaml", {
1294
+ type: "boolean",
1295
+ description: "Output in YAML format",
1296
+ conflicts: "json"
1297
+ }).command(
1298
+ "get <path>",
1299
+ "Get a configuration value",
1300
+ (y) => y.positional("path", {
1301
+ type: "string",
1302
+ description: "Dot-separated config path (e.g., scraper.maxPages)",
1303
+ demandOption: true
1304
+ }).option("json", {
1305
+ type: "boolean",
1306
+ description: "Output in JSON format",
1307
+ conflicts: "yaml"
1308
+ }).option("yaml", {
1309
+ type: "boolean",
1310
+ description: "Output in YAML format",
1311
+ conflicts: "json"
1312
+ }),
1313
+ (argv) => {
1314
+ const path2 = argv.path;
1315
+ if (!isValidConfigPath(path2)) {
1316
+ console.error(`Error: Invalid config path '${path2}'`);
1317
+ console.error("Use 'docs-mcp-server config' to see all available paths.");
1318
+ process.exitCode = 1;
1319
+ return;
1320
+ }
1321
+ const config = loadConfig(argv, {
1322
+ configPath: argv.config,
1323
+ searchDir: argv.storePath
1324
+ });
1325
+ const value = getConfigValue(config, path2);
1326
+ const format = argv.json ? "json" : argv.yaml ? "yaml" : "auto";
1327
+ console.log(formatOutput$1(value, format));
1328
+ }
1329
+ ).command(
1330
+ "set <path> <value>",
1331
+ "Set a configuration value",
1332
+ (y) => y.positional("path", {
1333
+ type: "string",
1334
+ description: "Dot-separated config path (e.g., scraper.maxPages)",
1335
+ demandOption: true
1336
+ }).positional("value", {
1337
+ type: "string",
1338
+ description: "Value to set",
1339
+ demandOption: true
1340
+ }),
1341
+ (argv) => {
1342
+ const configPath = argv.config;
1343
+ const path2 = argv.path;
1344
+ const value = argv.value;
1345
+ if (configPath) {
1346
+ console.error(
1347
+ "Error: Cannot modify configuration when using explicit --config file."
1348
+ );
1349
+ console.error(
1350
+ "Remove the --config flag to modify the default configuration."
1351
+ );
1352
+ process.exitCode = 1;
1353
+ return;
1354
+ }
1355
+ if (!isValidConfigPath(path2)) {
1356
+ console.error(`Error: Invalid config path '${path2}'`);
1357
+ console.error("Use 'docs-mcp-server config' to see all available paths.");
1358
+ process.exitCode = 1;
1359
+ return;
1360
+ }
1361
+ const config = loadConfig(argv, {
1362
+ configPath: argv.config,
1363
+ searchDir: argv.storePath
1364
+ });
1365
+ const currentValue = getConfigValue(config, path2);
1366
+ if (currentValue !== void 0 && currentValue !== null && typeof currentValue === "object" && !Array.isArray(currentValue)) {
1367
+ console.error(
1368
+ `Error: Config path '${path2}' refers to an object. Use a more specific leaf path to set a scalar value.`
1369
+ );
1370
+ console.error(
1371
+ "Hint: Run 'docs-mcp-server config' to inspect the current structure."
1372
+ );
1373
+ process.exitCode = 1;
1374
+ return;
1375
+ }
1376
+ try {
1377
+ const savedPath = setConfigValue(path2, value);
1378
+ const parsedValue = parseConfigValue(value);
1379
+ console.log(`Updated ${path2} = ${JSON.stringify(parsedValue)}`);
1380
+ console.log(`Saved to: ${savedPath}`);
1381
+ } catch (error) {
1382
+ console.error(
1383
+ `Error: Failed to save configuration: ${error instanceof Error ? error.message : String(error)}`
1384
+ );
1385
+ process.exitCode = 1;
1386
+ }
1387
+ }
1388
+ );
1389
+ },
1214
1390
  (argv) => {
1215
1391
  const config = loadConfig(argv, {
1216
1392
  configPath: argv.config,
1217
1393
  searchDir: argv.storePath
1218
1394
  });
1219
- console.log(JSON.stringify(config, null, 2));
1395
+ const format = argv.json ? "json" : argv.yaml ? "yaml" : "json";
1396
+ console.log(formatOutput$1(config, format));
1220
1397
  }
1221
1398
  );
1222
1399
  }
@@ -2443,42 +2620,135 @@ class MimeTypeUtils {
2443
2620
  static detectMimeTypeFromPath(filePath) {
2444
2621
  const extension = filePath.toLowerCase().split(".").pop();
2445
2622
  const customMimeTypes = {
2623
+ // JavaScript/TypeScript family
2446
2624
  ts: "text/x-typescript",
2447
2625
  tsx: "text/x-tsx",
2626
+ mts: "text/x-typescript",
2627
+ // TypeScript ES modules
2628
+ cts: "text/x-typescript",
2629
+ // TypeScript CommonJS modules
2448
2630
  js: "text/javascript",
2449
2631
  jsx: "text/x-jsx",
2450
2632
  cjs: "text/javascript",
2451
2633
  // CommonJS modules
2452
2634
  mjs: "text/javascript",
2453
2635
  // ES modules
2636
+ // Python family
2454
2637
  py: "text/x-python",
2455
2638
  pyw: "text/x-python",
2456
2639
  pyi: "text/x-python",
2640
+ pyx: "text/x-cython",
2641
+ // Cython
2642
+ pxd: "text/x-cython",
2643
+ // Cython
2644
+ // Systems languages
2457
2645
  go: "text/x-go",
2458
2646
  rs: "text/x-rust",
2647
+ c: "text/x-csrc",
2648
+ h: "text/x-chdr",
2649
+ cpp: "text/x-c++src",
2650
+ cxx: "text/x-c++src",
2651
+ cc: "text/x-c++src",
2652
+ hpp: "text/x-c++hdr",
2653
+ hxx: "text/x-c++hdr",
2654
+ zig: "text/x-zig",
2655
+ nim: "text/x-nim",
2656
+ v: "text/x-v",
2657
+ cr: "text/x-crystal",
2658
+ // JVM languages
2459
2659
  kt: "text/x-kotlin",
2660
+ kts: "text/x-kotlin",
2661
+ // Kotlin script
2460
2662
  scala: "text/x-scala",
2663
+ groovy: "text/x-groovy",
2664
+ gradle: "text/x-gradle",
2665
+ // Apple/Mobile
2461
2666
  swift: "text/x-swift",
2667
+ dart: "text/x-dart",
2668
+ // Scripting languages
2462
2669
  rb: "text/x-ruby",
2670
+ rake: "text/x-ruby",
2671
+ // Rakefile
2463
2672
  php: "text/x-php",
2673
+ lua: "text/x-lua",
2674
+ pl: "text/x-perl",
2675
+ pm: "text/x-perl",
2676
+ r: "text/x-r",
2677
+ // Also handles .R since extension is lowercased
2678
+ // Functional languages
2679
+ hs: "text/x-haskell",
2680
+ lhs: "text/x-haskell",
2681
+ // Literate Haskell
2682
+ elm: "text/x-elm",
2683
+ erl: "text/x-erlang",
2684
+ ex: "text/x-elixir",
2685
+ exs: "text/x-elixir",
2686
+ clj: "text/x-clojure",
2687
+ cljs: "text/x-clojure",
2688
+ cljc: "text/x-clojure",
2689
+ jl: "text/x-julia",
2690
+ // .NET
2464
2691
  cs: "text/x-csharp",
2465
- cpp: "text/x-c++src",
2466
- cxx: "text/x-c++src",
2467
- cc: "text/x-c++src",
2468
- hpp: "text/x-c++hdr",
2469
- hxx: "text/x-c++hdr",
2470
- h: "text/x-chdr",
2471
- c: "text/x-csrc",
2692
+ // Web3/Smart contracts
2693
+ sol: "text/x-solidity",
2694
+ move: "text/x-move",
2695
+ cairo: "text/x-cairo",
2696
+ // Modern web frameworks
2697
+ vue: "text/x-vue",
2698
+ svelte: "text/x-svelte",
2699
+ astro: "text/x-astro",
2700
+ // Shell scripting
2472
2701
  sh: "text/x-shellscript",
2473
2702
  bash: "text/x-shellscript",
2474
2703
  zsh: "text/x-shellscript",
2475
2704
  fish: "text/x-shellscript",
2476
2705
  ps1: "text/x-powershell",
2706
+ // Documentation formats
2707
+ rst: "text/x-rst",
2708
+ // reStructuredText
2709
+ adoc: "text/x-asciidoc",
2710
+ asciidoc: "text/x-asciidoc",
2711
+ textile: "text/x-textile",
2712
+ org: "text/x-org",
2713
+ // Org-mode
2714
+ pod: "text/x-pod",
2715
+ // Perl documentation
2716
+ rdoc: "text/x-rdoc",
2717
+ // Ruby documentation
2718
+ wiki: "text/x-wiki",
2719
+ rmd: "text/x-rmarkdown",
2720
+ // R Markdown
2721
+ // Configuration files
2722
+ toml: "text/x-toml",
2723
+ ini: "text/x-ini",
2724
+ cfg: "text/x-ini",
2725
+ conf: "text/x-conf",
2726
+ properties: "text/x-properties",
2727
+ env: "text/x-dotenv",
2728
+ // Build systems
2729
+ dockerfile: "text/x-dockerfile",
2730
+ containerfile: "text/x-dockerfile",
2731
+ makefile: "text/x-makefile",
2732
+ cmake: "text/x-cmake",
2733
+ bazel: "text/x-bazel",
2734
+ bzl: "text/x-bazel",
2735
+ buck: "text/x-buck",
2736
+ // Infrastructure as Code
2737
+ tf: "text/x-terraform",
2738
+ tfvars: "text/x-terraform",
2739
+ hcl: "text/x-hcl",
2740
+ // Data/Query languages
2477
2741
  sql: "text/x-sql",
2478
2742
  graphql: "text/x-graphql",
2479
2743
  gql: "text/x-graphql",
2744
+ // Schema/API definitions
2480
2745
  proto: "text/x-proto",
2481
- dockerfile: "text/x-dockerfile"
2746
+ prisma: "text/x-prisma",
2747
+ thrift: "text/x-thrift",
2748
+ avro: "text/x-avro",
2749
+ // TeX/LaTeX
2750
+ tex: "text/x-tex",
2751
+ latex: "text/x-latex"
2482
2752
  };
2483
2753
  if (extension && customMimeTypes[extension]) {
2484
2754
  return customMimeTypes[extension];
@@ -2498,8 +2768,24 @@ class MimeTypeUtils {
2498
2768
  return null;
2499
2769
  }
2500
2770
  const mimeTypeNormalization = {
2501
- "application/node": "text/javascript"
2502
- // .cjs files are detected as this
2771
+ "application/node": "text/javascript",
2772
+ // .cjs files
2773
+ "video/mp2t": "text/x-typescript",
2774
+ // .ts/.mts files (MPEG-2 transport stream conflict)
2775
+ "application/rls-services+xml": "text/x-rust",
2776
+ // .rs files
2777
+ "application/vnd.lotus-organizer": "text/x-org",
2778
+ // .org files (Lotus Organizer conflict)
2779
+ "application/vnd.dart": "text/x-dart",
2780
+ // .dart files
2781
+ "application/x-perl": "text/x-perl",
2782
+ // .pl/.pm files
2783
+ "application/x-tex": "text/x-tex",
2784
+ // .tex files
2785
+ "application/x-latex": "text/x-latex",
2786
+ // .latex files
2787
+ "application/toml": "text/x-toml"
2788
+ // .toml files
2503
2789
  };
2504
2790
  return mimeTypeNormalization[mimeType] || mimeType;
2505
2791
  }
@@ -2511,6 +2797,7 @@ class MimeTypeUtils {
2511
2797
  */
2512
2798
  static extractLanguageFromMimeType(mimeType) {
2513
2799
  const mimeToLanguage = {
2800
+ // JavaScript/TypeScript
2514
2801
  "text/x-typescript": "typescript",
2515
2802
  "text/typescript": "typescript",
2516
2803
  "application/typescript": "typescript",
@@ -2519,22 +2806,84 @@ class MimeTypeUtils {
2519
2806
  "application/javascript": "javascript",
2520
2807
  "application/x-javascript": "javascript",
2521
2808
  "text/x-jsx": "jsx",
2809
+ // Python
2522
2810
  "text/x-python": "python",
2523
- "text/x-java": "java",
2811
+ "text/x-cython": "cython",
2812
+ // Systems languages
2524
2813
  "text/x-c": "c",
2525
2814
  "text/x-csrc": "c",
2526
2815
  "text/x-chdr": "c",
2527
2816
  "text/x-c++": "cpp",
2528
2817
  "text/x-c++src": "cpp",
2529
2818
  "text/x-c++hdr": "cpp",
2530
- "text/x-csharp": "csharp",
2531
2819
  "text/x-go": "go",
2532
2820
  "text/x-rust": "rust",
2533
- "text/x-php": "php",
2534
- "text/x-ruby": "ruby",
2535
- "text/x-swift": "swift",
2821
+ "text/x-zig": "zig",
2822
+ "text/x-nim": "nim",
2823
+ "text/x-v": "v",
2824
+ "text/x-crystal": "crystal",
2825
+ // JVM languages
2826
+ "text/x-java": "java",
2536
2827
  "text/x-kotlin": "kotlin",
2537
2828
  "text/x-scala": "scala",
2829
+ "text/x-groovy": "groovy",
2830
+ "text/x-gradle": "groovy",
2831
+ // Apple/Mobile
2832
+ "text/x-swift": "swift",
2833
+ "text/x-dart": "dart",
2834
+ // .NET
2835
+ "text/x-csharp": "csharp",
2836
+ // Scripting languages
2837
+ "text/x-ruby": "ruby",
2838
+ "text/x-php": "php",
2839
+ "text/x-lua": "lua",
2840
+ "text/x-perl": "perl",
2841
+ "text/x-r": "r",
2842
+ // Functional languages
2843
+ "text/x-haskell": "haskell",
2844
+ "text/x-elm": "elm",
2845
+ "text/x-erlang": "erlang",
2846
+ "text/x-elixir": "elixir",
2847
+ "text/x-clojure": "clojure",
2848
+ "text/x-julia": "julia",
2849
+ // Web3/Smart contracts
2850
+ "text/x-solidity": "solidity",
2851
+ "text/x-move": "move",
2852
+ "text/x-cairo": "cairo",
2853
+ // Modern web frameworks
2854
+ "text/x-vue": "vue",
2855
+ "text/x-svelte": "svelte",
2856
+ "text/x-astro": "astro",
2857
+ // Shell
2858
+ "text/x-sh": "bash",
2859
+ "text/x-shellscript": "bash",
2860
+ "application/x-sh": "bash",
2861
+ "text/x-powershell": "powershell",
2862
+ // Documentation formats
2863
+ "text/x-rst": "rst",
2864
+ "text/x-asciidoc": "asciidoc",
2865
+ "text/x-textile": "textile",
2866
+ "text/x-org": "org",
2867
+ "text/x-pod": "pod",
2868
+ "text/x-rdoc": "rdoc",
2869
+ "text/x-wiki": "wiki",
2870
+ "text/x-rmarkdown": "rmarkdown",
2871
+ // Configuration files
2872
+ "text/x-toml": "toml",
2873
+ "text/x-ini": "ini",
2874
+ "text/x-conf": "conf",
2875
+ "text/x-properties": "properties",
2876
+ "text/x-dotenv": "dotenv",
2877
+ // Build systems
2878
+ "text/x-dockerfile": "dockerfile",
2879
+ "text/x-makefile": "makefile",
2880
+ "text/x-cmake": "cmake",
2881
+ "text/x-bazel": "bazel",
2882
+ "text/x-buck": "buck",
2883
+ // Infrastructure as Code
2884
+ "text/x-terraform": "hcl",
2885
+ "text/x-hcl": "hcl",
2886
+ // Data formats
2538
2887
  "text/x-yaml": "yaml",
2539
2888
  "application/x-yaml": "yaml",
2540
2889
  "application/yaml": "yaml",
@@ -2544,13 +2893,15 @@ class MimeTypeUtils {
2544
2893
  "text/xml": "xml",
2545
2894
  "application/xml": "xml",
2546
2895
  "text/x-sql": "sql",
2547
- "text/x-sh": "bash",
2548
- "text/x-shellscript": "bash",
2549
- "application/x-sh": "bash",
2550
- "text/x-powershell": "powershell",
2551
2896
  "text/x-graphql": "graphql",
2897
+ // Schema/API definitions
2552
2898
  "text/x-proto": "protobuf",
2553
- "text/x-dockerfile": "dockerfile"
2899
+ "text/x-prisma": "prisma",
2900
+ "text/x-thrift": "thrift",
2901
+ "text/x-avro": "avro",
2902
+ // TeX/LaTeX
2903
+ "text/x-tex": "tex",
2904
+ "text/x-latex": "latex"
2554
2905
  };
2555
2906
  return mimeToLanguage[mimeType] || "";
2556
2907
  }
@@ -2672,22 +3023,29 @@ class BrowserFetcher {
2672
3023
  }
2673
3024
  }
2674
3025
  /**
2675
- * Close the browser and clean up resources
3026
+ * Close the browser and clean up resources.
3027
+ * Always attempts cleanup even if browser is disconnected to reap zombie processes.
2676
3028
  */
2677
3029
  async close() {
2678
- try {
2679
- if (this.page) {
3030
+ if (this.page) {
3031
+ try {
2680
3032
  await this.page.close();
3033
+ } catch (error) {
3034
+ logger.warn(`⚠️ Error closing browser page: ${error}`);
3035
+ } finally {
2681
3036
  this.page = null;
2682
3037
  }
2683
- if (this.browser) {
3038
+ }
3039
+ if (this.browser) {
3040
+ try {
2684
3041
  await this.browser.close();
3042
+ } catch (error) {
3043
+ logger.warn(`⚠️ Error closing browser: ${error}`);
3044
+ } finally {
2685
3045
  this.browser = null;
2686
3046
  }
2687
- logger.debug("Browser closed successfully");
2688
- } catch (error) {
2689
- logger.warn(`⚠️ Error closing browser: ${error}`);
2690
3047
  }
3048
+ logger.debug("Browser closed successfully");
2691
3049
  }
2692
3050
  }
2693
3051
  class FileFetcher {
@@ -4019,7 +4377,7 @@ class DocumentPipeline extends BasePipeline {
4019
4377
  constructor(config) {
4020
4378
  super();
4021
4379
  this.markitdown = new MarkItDown();
4022
- this.maxSize = config.document.maxSize;
4380
+ this.maxSize = config.scraper.document.maxSize;
4023
4381
  const semanticSplitter = new SemanticMarkdownSplitter(
4024
4382
  config.splitter.preferredChunkSize,
4025
4383
  config.splitter.maxChunkSize
@@ -4505,12 +4863,18 @@ class HtmlPlaywrightMiddleware {
4505
4863
  /**
4506
4864
  * Closes the Playwright browser instance if it exists.
4507
4865
  * Should be called during application shutdown.
4866
+ * Attempts to close even if the browser is disconnected to ensure proper cleanup of zombie processes.
4508
4867
  */
4509
4868
  async closeBrowser() {
4510
- if (this.browser?.isConnected()) {
4511
- logger.debug("Closing Playwright browser instance...");
4512
- await this.browser.close();
4513
- this.browser = null;
4869
+ if (this.browser) {
4870
+ try {
4871
+ logger.debug("Closing Playwright browser instance...");
4872
+ await this.browser.close();
4873
+ } catch (error) {
4874
+ logger.warn(`⚠️ Error closing Playwright browser: ${error}`);
4875
+ } finally {
4876
+ this.browser = null;
4877
+ }
4514
4878
  }
4515
4879
  }
4516
4880
  /**
@@ -5615,10 +5979,15 @@ class HtmlPipeline extends BasePipeline {
5615
5979
  }
5616
5980
  /**
5617
5981
  * Cleanup resources used by this pipeline, specifically the Playwright browser instance.
5982
+ * Errors during cleanup are logged but not propagated to ensure graceful shutdown.
5618
5983
  */
5619
5984
  async close() {
5620
5985
  await super.close();
5621
- await this.playwrightMiddleware.closeBrowser();
5986
+ try {
5987
+ await this.playwrightMiddleware.closeBrowser();
5988
+ } catch (error) {
5989
+ logger.warn(`⚠️ Error during browser cleanup: ${error}`);
5990
+ }
5622
5991
  }
5623
5992
  }
5624
5993
  class TextDocumentSplitter {
@@ -6643,6 +7012,11 @@ class TypeScriptParser {
6643
7012
  ".cjs"
6644
7013
  ];
6645
7014
  mimeTypes = [
7015
+ // text/x-* variants (output by MimeTypeUtils.detectMimeTypeFromPath)
7016
+ "text/x-typescript",
7017
+ "text/x-tsx",
7018
+ "text/x-jsx",
7019
+ // Standard variants
6646
7020
  "text/typescript",
6647
7021
  "application/typescript",
6648
7022
  "text/tsx",
@@ -6984,6 +7358,8 @@ class LanguageParserRegistry {
6984
7358
  // Narrow advertised extensions/mime types for the alias (informational only).
6985
7359
  fileExtensions: [".js", ".jsx", ".mjs", ".cjs"],
6986
7360
  mimeTypes: [
7361
+ "text/x-jsx",
7362
+ // Output by MimeTypeUtils.detectMimeTypeFromPath
6987
7363
  "text/javascript",
6988
7364
  "application/javascript",
6989
7365
  "text/jsx",
@@ -6996,6 +7372,8 @@ class LanguageParserRegistry {
6996
7372
  this.extensionMap.set(ext.toLowerCase(), "javascript");
6997
7373
  }
6998
7374
  const jsMimes = [
7375
+ "text/x-jsx",
7376
+ // Output by MimeTypeUtils.detectMimeTypeFromPath
6999
7377
  "text/javascript",
7000
7378
  "application/javascript",
7001
7379
  "text/jsx",
@@ -10897,6 +11275,7 @@ async function registerMcpService(server, docService, pipeline, config, authMana
10897
11275
  const mcpServer = createMcpServerInstance(mcpTools, config);
10898
11276
  const authMiddleware = authManager ? createAuthMiddleware(authManager) : null;
10899
11277
  const sseTransports = {};
11278
+ const sseServers = {};
10900
11279
  const heartbeatIntervals = {};
10901
11280
  server.route({
10902
11281
  method: "GET",
@@ -10906,6 +11285,8 @@ async function registerMcpService(server, docService, pipeline, config, authMana
10906
11285
  try {
10907
11286
  const transport = new SSEServerTransport("/messages", reply.raw);
10908
11287
  sseTransports[transport.sessionId] = transport;
11288
+ const sessionServer = createMcpServerInstance(mcpTools, config);
11289
+ sseServers[transport.sessionId] = sessionServer;
10909
11290
  if (telemetry.isEnabled()) {
10910
11291
  logger.info(`🔗 MCP client connected: ${transport.sessionId}`);
10911
11292
  }
@@ -10924,6 +11305,13 @@ async function registerMcpService(server, docService, pipeline, config, authMana
10924
11305
  clearInterval(interval);
10925
11306
  delete heartbeatIntervals[transport.sessionId];
10926
11307
  }
11308
+ const serverToClose = sseServers[transport.sessionId];
11309
+ if (serverToClose) {
11310
+ delete sseServers[transport.sessionId];
11311
+ void serverToClose.close().catch((error) => {
11312
+ logger.error(`❌ Failed to close SSE server instance: ${error}`);
11313
+ });
11314
+ }
10927
11315
  delete sseTransports[transport.sessionId];
10928
11316
  transport.close();
10929
11317
  if (telemetry.isEnabled()) {
@@ -10935,7 +11323,7 @@ async function registerMcpService(server, docService, pipeline, config, authMana
10935
11323
  logger.debug(`SSE connection error: ${error}`);
10936
11324
  cleanupConnection();
10937
11325
  });
10938
- await mcpServer.connect(transport);
11326
+ await sessionServer.connect(transport);
10939
11327
  } catch (error) {
10940
11328
  logger.error(`❌ Error in SSE endpoint: ${error}`);
10941
11329
  reply.code(500).send({
@@ -10995,7 +11383,16 @@ async function registerMcpService(server, docService, pipeline, config, authMana
10995
11383
  }
10996
11384
  }
10997
11385
  });
11386
+ server.route({
11387
+ method: "GET",
11388
+ url: "/mcp",
11389
+ preHandler: authMiddleware ? [authMiddleware] : void 0,
11390
+ handler: async (_request, reply) => {
11391
+ reply.code(405).header("Allow", "POST").send();
11392
+ }
11393
+ });
10998
11394
  mcpServer._sseTransports = sseTransports;
11395
+ mcpServer._sseServers = sseServers;
10999
11396
  mcpServer._heartbeatIntervals = heartbeatIntervals;
11000
11397
  return mcpServer;
11001
11398
  }
@@ -11013,6 +11410,12 @@ async function cleanupMcpService(mcpServer) {
11013
11410
  await transport.close();
11014
11411
  }
11015
11412
  }
11413
+ const sseServers = mcpServer._sseServers;
11414
+ if (sseServers) {
11415
+ for (const server of Object.values(sseServers)) {
11416
+ await server.close();
11417
+ }
11418
+ }
11016
11419
  await mcpServer.close();
11017
11420
  logger.debug("MCP service cleaned up");
11018
11421
  } catch (error) {
@@ -11610,7 +12013,7 @@ const Layout = ({
11610
12013
  children,
11611
12014
  eventClientConfig
11612
12015
  }) => {
11613
- const versionString = version || "1.36.0";
12016
+ const versionString = version || "2.0.0";
11614
12017
  const versionInitializer = `versionUpdate({ currentVersion: ${`'${versionString}'`} })`;
11615
12018
  return /* @__PURE__ */ jsxs("html", { lang: "en", children: [
11616
12019
  /* @__PURE__ */ jsxs("head", { children: [
@@ -13903,6 +14306,15 @@ async function stopWorkerService(pipeline) {
13903
14306
  await pipeline.stop();
13904
14307
  logger.debug("Worker service stopped");
13905
14308
  }
14309
+ const BANNER = [
14310
+ "\x1B[90m █▀▀ █▀█ █▀█ █ █ █▄ █ █▀▄ █▀▀ █▀▄ █▀▄ █▀█ █▀▀ █▀▀\x1B[0m",
14311
+ "\x1B[97m █▄█ █▀▄ █▄█ █▄█ █ ▀█ █▄▀ ██▄ █▄▀ █▄▀ █▄█ █▄▄ ▄▄█\x1B[0m"
14312
+ ].join("\n");
14313
+ function printBanner() {
14314
+ console.log();
14315
+ console.log(BANNER);
14316
+ console.log();
14317
+ }
13906
14318
  class AppServer {
13907
14319
  constructor(docService, pipeline, eventBus, serverConfig, appConfig) {
13908
14320
  this.docService = docService;
@@ -13951,7 +14363,7 @@ class AppServer {
13951
14363
  try {
13952
14364
  if (telemetry.isEnabled()) {
13953
14365
  telemetry.setGlobalContext({
13954
- appVersion: "1.36.0",
14366
+ appVersion: "2.0.0",
13955
14367
  appPlatform: process.platform,
13956
14368
  appNodeVersion: process.version,
13957
14369
  appServicesEnabled: this.getActiveServicesList(),
@@ -14280,6 +14692,9 @@ class AppServer {
14280
14692
  * Log startup information showing which services are enabled.
14281
14693
  */
14282
14694
  logStartupInfo(address) {
14695
+ if (this.serverConfig.showLogo !== false) {
14696
+ printBanner();
14697
+ }
14283
14698
  const isWorkerOnly = this.serverConfig.enableWorker && !this.serverConfig.enableWebInterface && !this.serverConfig.enableMcpServer;
14284
14699
  const isWebOnly = this.serverConfig.enableWebInterface && !this.serverConfig.enableWorker && !this.serverConfig.enableMcpServer;
14285
14700
  const isMcpOnly = this.serverConfig.enableMcpServer && !this.serverConfig.enableWebInterface && !this.serverConfig.enableWorker;
@@ -14691,6 +15106,9 @@ class BaseScraperStrategy {
14691
15106
  return null;
14692
15107
  }).filter((item2) => item2 !== null);
14693
15108
  } catch (error) {
15109
+ if (item.depth === 0) {
15110
+ throw error;
15111
+ }
14694
15112
  if (options.ignoreErrors) {
14695
15113
  logger.error(`❌ Failed to process ${item.url}: ${error}`);
14696
15114
  return [];
@@ -14805,10 +15223,10 @@ class GitHubRepoProcessor {
14805
15223
  /**
14806
15224
  * Fetches the raw content of a file from GitHub.
14807
15225
  */
14808
- async fetchFileContent(repoInfo, filePath, etag, signal) {
15226
+ async fetchFileContent(repoInfo, filePath, etag, headers, signal) {
14809
15227
  const { owner, repo, branch } = repoInfo;
14810
15228
  const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;
14811
- const rawContent = await this.httpFetcher.fetch(rawUrl, { signal, etag });
15229
+ const rawContent = await this.httpFetcher.fetch(rawUrl, { signal, etag, headers });
14812
15230
  const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
14813
15231
  if (detectedMimeType && (rawContent.mimeType === "text/plain" || rawContent.mimeType === "application/octet-stream")) {
14814
15232
  return {
@@ -14821,13 +15239,14 @@ class GitHubRepoProcessor {
14821
15239
  /**
14822
15240
  * Processes a single GitHub repository file from an HTTPS blob URL.
14823
15241
  */
14824
- async process(item, options, signal) {
15242
+ async process(item, options, headers, signal) {
14825
15243
  const repoInfo = this.parseHttpsBlobUrl(item.url);
14826
15244
  const { owner, repo, branch, filePath } = repoInfo;
14827
15245
  const rawContent = await this.fetchFileContent(
14828
15246
  { owner, repo, branch },
14829
15247
  filePath,
14830
15248
  item.etag,
15249
+ headers,
14831
15250
  signal
14832
15251
  );
14833
15252
  if (rawContent.status !== FetchStatus.SUCCESS) {
@@ -14918,12 +15337,13 @@ class GitHubWikiProcessor {
14918
15337
  /**
14919
15338
  * Processes a single GitHub wiki page.
14920
15339
  */
14921
- async process(item, options, signal) {
15340
+ async process(item, options, headers, signal) {
14922
15341
  const currentUrl = item.url;
14923
15342
  try {
14924
15343
  const rawContent = await this.httpFetcher.fetch(currentUrl, {
14925
15344
  signal,
14926
- etag: item.etag
15345
+ etag: item.etag,
15346
+ headers
14927
15347
  });
14928
15348
  if (rawContent.status !== FetchStatus.SUCCESS) {
14929
15349
  return { url: currentUrl, links: [], status: rawContent.status };
@@ -14994,10 +15414,52 @@ class GitHubWikiProcessor {
14994
15414
  await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
14995
15415
  }
14996
15416
  }
15417
+ const execAsync = promisify(exec);
15418
+ async function resolveGitHubAuth(explicitHeaders) {
15419
+ if (explicitHeaders) {
15420
+ const hasAuthHeader = Object.keys(explicitHeaders).some(
15421
+ (key) => key.toLowerCase() === "authorization"
15422
+ );
15423
+ if (hasAuthHeader) {
15424
+ return explicitHeaders;
15425
+ }
15426
+ }
15427
+ const githubToken = process.env.GITHUB_TOKEN;
15428
+ if (githubToken) {
15429
+ logger.debug("Using GitHub token from GITHUB_TOKEN environment variable");
15430
+ return {
15431
+ ...explicitHeaders,
15432
+ Authorization: `Bearer ${githubToken}`
15433
+ };
15434
+ }
15435
+ const ghToken = process.env.GH_TOKEN;
15436
+ if (ghToken) {
15437
+ logger.debug("Using GitHub token from GH_TOKEN environment variable");
15438
+ return {
15439
+ ...explicitHeaders,
15440
+ Authorization: `Bearer ${ghToken}`
15441
+ };
15442
+ }
15443
+ try {
15444
+ const { stdout } = await execAsync("gh auth token", { timeout: 5e3 });
15445
+ const cliToken = stdout.trim();
15446
+ if (cliToken) {
15447
+ logger.debug("Using GitHub token from local gh CLI");
15448
+ return {
15449
+ ...explicitHeaders,
15450
+ Authorization: `Bearer ${cliToken}`
15451
+ };
15452
+ }
15453
+ } catch {
15454
+ }
15455
+ return explicitHeaders ?? {};
15456
+ }
14997
15457
  class GitHubScraperStrategy extends BaseScraperStrategy {
14998
15458
  httpFetcher;
14999
15459
  wikiProcessor;
15000
15460
  repoProcessor;
15461
+ resolvedAuthHeaders;
15462
+ resolvedAuthKey;
15001
15463
  constructor(config) {
15002
15464
  super(config);
15003
15465
  this.httpFetcher = new HttpFetcher(config.scraper);
@@ -15054,31 +15516,117 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
15054
15516
  }
15055
15517
  return { owner, repo };
15056
15518
  }
15519
+ buildAuthCacheKey(explicitHeaders) {
15520
+ const normalizedHeaders = explicitHeaders ? Object.keys(explicitHeaders).sort().map((key) => [key, explicitHeaders[key]]) : [];
15521
+ const envKey = `${process.env.GITHUB_TOKEN ?? ""}|${process.env.GH_TOKEN ?? ""}`;
15522
+ return JSON.stringify({ headers: normalizedHeaders, env: envKey });
15523
+ }
15524
+ async getResolvedAuthHeaders(explicitHeaders) {
15525
+ const cacheKey = this.buildAuthCacheKey(explicitHeaders);
15526
+ if (this.resolvedAuthHeaders && this.resolvedAuthKey === cacheKey) {
15527
+ return this.resolvedAuthHeaders;
15528
+ }
15529
+ const resolved = await resolveGitHubAuth(explicitHeaders);
15530
+ this.resolvedAuthHeaders = resolved;
15531
+ this.resolvedAuthKey = cacheKey;
15532
+ return resolved;
15533
+ }
15057
15534
  /**
15058
15535
  * Fetches the repository tree structure from GitHub API.
15059
15536
  */
15060
- async fetchRepositoryTree(repoInfo, signal) {
15537
+ async fetchRepositoryTree(repoInfo, headers, signal) {
15061
15538
  const { owner, repo, branch } = repoInfo;
15062
15539
  let targetBranch = branch;
15063
15540
  if (!targetBranch) {
15541
+ const repoUrl = `https://api.github.com/repos/${owner}/${repo}`;
15542
+ logger.debug(`Fetching repository info: ${repoUrl}`);
15543
+ let repoContent;
15544
+ try {
15545
+ repoContent = await this.httpFetcher.fetch(repoUrl, { signal, headers });
15546
+ } catch (error) {
15547
+ if (error instanceof ScraperError) {
15548
+ if (error.message.includes("401")) {
15549
+ throw new ScraperError(
15550
+ `GitHub authentication failed for "${owner}/${repo}". Your token is invalid or expired. Please check your GITHUB_TOKEN or GH_TOKEN environment variable.`,
15551
+ false,
15552
+ error
15553
+ );
15554
+ }
15555
+ if (error.message.includes("403")) {
15556
+ throw new ScraperError(
15557
+ `GitHub access denied for "${owner}/${repo}". Your token may lack the required permissions, or you may be rate-limited. Please check your GITHUB_TOKEN or GH_TOKEN.`,
15558
+ false,
15559
+ error
15560
+ );
15561
+ }
15562
+ }
15563
+ throw error;
15564
+ }
15565
+ if (repoContent.status === FetchStatus.NOT_FOUND) {
15566
+ throw new ScraperError(
15567
+ `Repository "${owner}/${repo}" not found or not accessible. For private repositories, set the GITHUB_TOKEN environment variable.`,
15568
+ false
15569
+ );
15570
+ }
15064
15571
  try {
15065
- const repoUrl = `https://api.github.com/repos/${owner}/${repo}`;
15066
- logger.debug(`Fetching repository info: ${repoUrl}`);
15067
- const repoContent = await this.httpFetcher.fetch(repoUrl, { signal });
15068
15572
  const content2 = typeof repoContent.content === "string" ? repoContent.content : repoContent.content.toString("utf-8");
15069
15573
  const repoData = JSON.parse(content2);
15070
- targetBranch = repoData.default_branch;
15071
- logger.debug(`Using default branch: ${targetBranch}`);
15072
- } catch (error) {
15073
- logger.warn(`⚠️ Could not fetch default branch, using 'main': ${error}`);
15574
+ const defaultBranch = typeof repoData.default_branch === "string" ? repoData.default_branch.trim() : "";
15575
+ if (!defaultBranch) {
15576
+ logger.warn(
15577
+ `⚠️ Repository info missing default_branch for ${owner}/${repo}, using 'main'`
15578
+ );
15579
+ targetBranch = "main";
15580
+ } else {
15581
+ targetBranch = defaultBranch;
15582
+ logger.debug(`Using default branch: ${targetBranch}`);
15583
+ }
15584
+ } catch (parseError) {
15585
+ logger.warn(`⚠️ Could not parse repository info, using 'main': ${parseError}`);
15074
15586
  targetBranch = "main";
15075
15587
  }
15076
15588
  }
15077
15589
  const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`;
15078
15590
  logger.debug(`Fetching repository tree: ${treeUrl}`);
15079
- const rawContent = await this.httpFetcher.fetch(treeUrl, { signal });
15591
+ let rawContent;
15592
+ try {
15593
+ rawContent = await this.httpFetcher.fetch(treeUrl, { signal, headers });
15594
+ } catch (error) {
15595
+ if (error instanceof ScraperError) {
15596
+ if (error.message.includes("401")) {
15597
+ throw new ScraperError(
15598
+ `GitHub authentication failed for "${owner}/${repo}". Your token is invalid or expired. Please check your GITHUB_TOKEN or GH_TOKEN environment variable.`,
15599
+ false,
15600
+ error
15601
+ );
15602
+ }
15603
+ if (error.message.includes("403")) {
15604
+ throw new ScraperError(
15605
+ `GitHub access denied for "${owner}/${repo}". Your token may lack the required permissions, or you may be rate-limited. Please check your GITHUB_TOKEN or GH_TOKEN.`,
15606
+ false,
15607
+ error
15608
+ );
15609
+ }
15610
+ }
15611
+ throw error;
15612
+ }
15613
+ if (rawContent.status === FetchStatus.NOT_FOUND) {
15614
+ throw new ScraperError(
15615
+ `Repository "${owner}/${repo}" not found or not accessible. For private repositories, set the GITHUB_TOKEN environment variable.`,
15616
+ false
15617
+ );
15618
+ }
15080
15619
  const content = typeof rawContent.content === "string" ? rawContent.content : rawContent.content.toString("utf-8");
15081
- const treeData = JSON.parse(content);
15620
+ let treeData;
15621
+ try {
15622
+ treeData = JSON.parse(content);
15623
+ } catch (parseError) {
15624
+ throw new ScraperError(
15625
+ `Failed to parse GitHub API response for "${owner}/${repo}". The repository may be inaccessible or the API returned an unexpected response.`,
15626
+ false,
15627
+ parseError instanceof Error ? parseError : void 0
15628
+ );
15629
+ }
15082
15630
  if (treeData.truncated) {
15083
15631
  logger.warn(
15084
15632
  `⚠️ Repository tree was truncated for ${owner}/${repo}. Some files may be missing.`
@@ -15217,7 +15765,7 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
15217
15765
  if (hasTextExtension || hasDocumentExtension || hasCompoundExtension || isCommonTextFile) {
15218
15766
  return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
15219
15767
  }
15220
- const mimeType = mime.getType(path2);
15768
+ const mimeType = MimeTypeUtils.detectMimeTypeFromPath(path2);
15221
15769
  if (mimeType?.startsWith("text/")) {
15222
15770
  logger.debug(`Including file with text MIME type: ${path2} (${mimeType})`);
15223
15771
  return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
@@ -15252,10 +15800,11 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
15252
15800
  status: FetchStatus.NOT_FOUND
15253
15801
  };
15254
15802
  }
15803
+ const headers = await this.getResolvedAuthHeaders(options.headers);
15255
15804
  try {
15256
15805
  const parsedUrl = new URL(item.url);
15257
15806
  if (/^\/[^/]+\/[^/]+\/wiki($|\/)/.test(parsedUrl.pathname)) {
15258
- return await this.wikiProcessor.process(item, options, signal);
15807
+ return await this.wikiProcessor.process(item, options, headers, signal);
15259
15808
  }
15260
15809
  } catch {
15261
15810
  }
@@ -15281,7 +15830,11 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
15281
15830
  const wikiUrl = `${options.url.replace(/\/$/, "")}/wiki`;
15282
15831
  discoveredLinks.push(wikiUrl);
15283
15832
  logger.debug(`Discovered wiki URL: ${wikiUrl}`);
15284
- const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal);
15833
+ const { tree, resolvedBranch } = await this.fetchRepositoryTree(
15834
+ repoInfo,
15835
+ headers,
15836
+ signal
15837
+ );
15285
15838
  const fileItems = tree.tree.filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)).filter((treeItem) => this.shouldProcessFile(treeItem, options));
15286
15839
  logger.debug(
15287
15840
  `Discovered ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`
@@ -15299,7 +15852,7 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
15299
15852
  const parsedUrl = new URL(item.url);
15300
15853
  if (/^\/[^/]+\/[^/]+\/blob\//.test(parsedUrl.pathname)) {
15301
15854
  logger.debug(`Processing HTTPS blob URL at depth ${item.depth}: ${item.url}`);
15302
- return await this.repoProcessor.process(item, options, signal);
15855
+ return await this.repoProcessor.process(item, options, headers, signal);
15303
15856
  }
15304
15857
  } catch (error) {
15305
15858
  logger.warn(`⚠️ Failed to parse blob URL ${item.url}: ${error}`);
@@ -15313,7 +15866,13 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
15313
15866
  if (!url.hostname.includes("github.com")) {
15314
15867
  throw new Error("URL must be a GitHub URL");
15315
15868
  }
15316
- await super.scrape(options, progressCallback, signal);
15869
+ await this.getResolvedAuthHeaders(options.headers);
15870
+ try {
15871
+ await super.scrape(options, progressCallback, signal);
15872
+ } finally {
15873
+ this.resolvedAuthHeaders = void 0;
15874
+ this.resolvedAuthKey = void 0;
15875
+ }
15317
15876
  }
15318
15877
  async cleanup() {
15319
15878
  await Promise.all([this.wikiProcessor.cleanup(), this.repoProcessor.cleanup()]);
@@ -15689,7 +16248,7 @@ class LocalFileStrategy extends BaseScraperStrategy {
15689
16248
  logger.debug(`Reading archive entry: ${innerPath} inside ${archivePath}`);
15690
16249
  try {
15691
16250
  const contentBuffer = await adapter.getContent(innerPath);
15692
- const mimeType = mime.getType(innerPath) || "application/octet-stream";
16251
+ const mimeType = MimeTypeUtils.detectMimeTypeFromPath(innerPath) || "application/octet-stream";
15693
16252
  const rawContent = {
15694
16253
  source: item.url,
15695
16254
  content: contentBuffer,
@@ -15976,31 +16535,90 @@ class PyPiScraperStrategy {
15976
16535
  }
15977
16536
  }
15978
16537
  class ScraperRegistry {
15979
- strategies;
16538
+ config;
15980
16539
  constructor(config) {
15981
- this.strategies = [
15982
- new NpmScraperStrategy(config),
15983
- new PyPiScraperStrategy(config),
15984
- new GitHubScraperStrategy(config),
15985
- new WebScraperStrategy(config, {}),
15986
- new LocalFileStrategy(config)
15987
- ];
16540
+ this.config = config;
15988
16541
  }
16542
+ /**
16543
+ * Creates and returns a fresh strategy instance for the given URL.
16544
+ * Each call returns a new instance to ensure state isolation between parallel scrapes.
16545
+ */
15989
16546
  getStrategy(url) {
15990
- validateUrl(url);
15991
- const strategy = this.strategies.find((s) => s.canHandle(url));
15992
- if (!strategy) {
15993
- throw new ScraperError(`No strategy found for URL: ${url}`);
16547
+ if (!url.startsWith("github-file://")) {
16548
+ validateUrl(url);
15994
16549
  }
15995
- logger.debug(`Using strategy "${strategy.constructor.name}" for URL: ${url}`);
15996
- return strategy;
16550
+ if (isLocalFileUrl(url)) {
16551
+ logger.debug(`Using strategy "LocalFileStrategy" for URL: ${url}`);
16552
+ return new LocalFileStrategy(this.config);
16553
+ }
16554
+ if (isNpmUrl(url)) {
16555
+ logger.debug(`Using strategy "NpmScraperStrategy" for URL: ${url}`);
16556
+ return new NpmScraperStrategy(this.config);
16557
+ }
16558
+ if (isPyPiUrl(url)) {
16559
+ logger.debug(`Using strategy "PyPiScraperStrategy" for URL: ${url}`);
16560
+ return new PyPiScraperStrategy(this.config);
16561
+ }
16562
+ if (isGitHubUrl(url)) {
16563
+ logger.debug(`Using strategy "GitHubScraperStrategy" for URL: ${url}`);
16564
+ return new GitHubScraperStrategy(this.config);
16565
+ }
16566
+ if (isWebUrl(url)) {
16567
+ logger.debug(`Using strategy "WebScraperStrategy" for URL: ${url}`);
16568
+ return new WebScraperStrategy(this.config, {});
16569
+ }
16570
+ throw new ScraperError(`No strategy found for URL: ${url}`);
15997
16571
  }
15998
- /**
15999
- * Cleanup all registered strategies to prevent resource leaks.
16000
- * Should be called when the registry is no longer needed.
16001
- */
16002
- async cleanup() {
16003
- await Promise.allSettled(this.strategies.map((strategy) => strategy.cleanup?.()));
16572
+ }
16573
+ function isLocalFileUrl(url) {
16574
+ return url.startsWith("file://");
16575
+ }
16576
+ function isNpmUrl(url) {
16577
+ try {
16578
+ const { hostname } = new URL(url);
16579
+ return ["npmjs.org", "npmjs.com", "www.npmjs.com"].includes(hostname);
16580
+ } catch {
16581
+ return false;
16582
+ }
16583
+ }
16584
+ function isPyPiUrl(url) {
16585
+ try {
16586
+ const { hostname } = new URL(url);
16587
+ return ["pypi.org", "www.pypi.org"].includes(hostname);
16588
+ } catch {
16589
+ return false;
16590
+ }
16591
+ }
16592
+ function isGitHubUrl(url) {
16593
+ if (url.startsWith("github-file://")) {
16594
+ return true;
16595
+ }
16596
+ try {
16597
+ const parsedUrl = new URL(url);
16598
+ const { hostname, pathname } = parsedUrl;
16599
+ if (!["github.com", "www.github.com"].includes(hostname)) {
16600
+ return false;
16601
+ }
16602
+ if (pathname.match(/^\/[^/]+\/[^/]+\/?$/)) {
16603
+ return true;
16604
+ }
16605
+ if (pathname.match(/^\/[^/]+\/[^/]+\/tree\//)) {
16606
+ return true;
16607
+ }
16608
+ if (pathname.match(/^\/[^/]+\/[^/]+\/blob\//)) {
16609
+ return true;
16610
+ }
16611
+ return false;
16612
+ } catch {
16613
+ return false;
16614
+ }
16615
+ }
16616
+ function isWebUrl(url) {
16617
+ try {
16618
+ const parsedUrl = new URL(url);
16619
+ return parsedUrl.protocol === "http:" || parsedUrl.protocol === "https:";
16620
+ } catch {
16621
+ return false;
16004
16622
  }
16005
16623
  }
16006
16624
  class ScraperService {
@@ -16011,20 +16629,35 @@ class ScraperService {
16011
16629
  /**
16012
16630
  * Scrapes content from the provided URL using the appropriate strategy.
16013
16631
  * Reports progress via callback and handles errors.
16632
+ * Cleans up strategy resources after scrape completes (success or failure).
16014
16633
  */
16015
16634
  async scrape(options, progressCallback, signal) {
16016
16635
  const strategy = this.registry.getStrategy(options.url);
16017
- if (!strategy) {
16018
- throw new ScraperError(`No scraper strategy found for URL: ${options.url}`, false);
16636
+ let scrapeError = null;
16637
+ let cleanupErrorToThrow = null;
16638
+ try {
16639
+ await strategy.scrape(options, progressCallback, signal);
16640
+ } catch (error) {
16641
+ scrapeError = error instanceof Error ? error : new ScraperError(`Scrape failed for URL: ${options.url}`, false);
16642
+ } finally {
16643
+ try {
16644
+ await strategy.cleanup?.();
16645
+ } catch (cleanupError) {
16646
+ logger.error(`❌ Strategy cleanup failed for ${options.url}: ${cleanupError}`);
16647
+ if (!scrapeError) {
16648
+ cleanupErrorToThrow = cleanupError instanceof Error ? cleanupError : new ScraperError(
16649
+ `Strategy cleanup failed for URL: ${options.url}`,
16650
+ false
16651
+ );
16652
+ }
16653
+ }
16654
+ }
16655
+ if (scrapeError) {
16656
+ throw scrapeError;
16657
+ }
16658
+ if (cleanupErrorToThrow) {
16659
+ throw cleanupErrorToThrow;
16019
16660
  }
16020
- await strategy.scrape(options, progressCallback, signal);
16021
- }
16022
- /**
16023
- * Cleanup the scraper registry and all its strategies.
16024
- * Should be called when the service is no longer needed.
16025
- */
16026
- async cleanup() {
16027
- await this.registry.cleanup();
16028
16661
  }
16029
16662
  }
16030
16663
  class PipelineWorker {
@@ -16187,7 +16820,7 @@ class PipelineManager {
16187
16820
  if (this.shouldRecoverJobs) {
16188
16821
  await this.recoverPendingJobs();
16189
16822
  } else {
16190
- logger.debug("Job recovery disabled for this PipelineManager instance");
16823
+ await this.markInterruptedJobsAsFailed();
16191
16824
  }
16192
16825
  this._processQueue().catch((error) => {
16193
16826
  logger.error(`❌ Error in processQueue during start: ${error}`);
@@ -16195,79 +16828,69 @@ class PipelineManager {
16195
16828
  }
16196
16829
  /**
16197
16830
  * Recovers pending jobs from the database after server restart.
16198
- * Finds versions with RUNNING status and resets them to QUEUED for re-processing.
16199
- * Also loads all QUEUED versions back into the pipeline queue.
16831
+ * Uses enqueueRefreshJob() to properly continue interrupted jobs,
16832
+ * leveraging existing pages and ETags when available.
16200
16833
  */
16201
16834
  async recoverPendingJobs() {
16202
16835
  try {
16203
- const runningVersions = await this.store.getVersionsByStatus([
16204
- VersionStatus.RUNNING
16836
+ const interruptedVersions = await this.store.getVersionsByStatus([
16837
+ VersionStatus.RUNNING,
16838
+ VersionStatus.QUEUED
16205
16839
  ]);
16206
- for (const version of runningVersions) {
16207
- await this.store.updateVersionStatus(version.id, VersionStatus.QUEUED);
16208
- logger.info(
16209
- `🔄 Reset interrupted job to QUEUED: ${version.library_name}@${version.name || "latest"}`
16210
- );
16840
+ if (interruptedVersions.length === 0) {
16841
+ logger.debug("No pending jobs to recover from database");
16842
+ return;
16211
16843
  }
16212
- const queuedVersions = await this.store.getVersionsByStatus([VersionStatus.QUEUED]);
16213
- for (const version of queuedVersions) {
16214
- const jobId = v4();
16215
- const abortController = new AbortController();
16216
- let resolveCompletion;
16217
- let rejectCompletion;
16218
- const completionPromise = new Promise((resolve, reject) => {
16219
- resolveCompletion = resolve;
16220
- rejectCompletion = reject;
16221
- });
16222
- completionPromise.catch(() => {
16223
- });
16224
- let parsedScraperOptions = null;
16225
- if (version.scraper_options) {
16226
- try {
16227
- parsedScraperOptions = JSON.parse(version.scraper_options);
16228
- } catch (error) {
16229
- logger.warn(
16230
- `⚠️ Failed to parse scraper options for ${version.library_name}@${version.name || "latest"}: ${error}`
16231
- );
16232
- }
16844
+ logger.info(
16845
+ `📥 Recovering ${interruptedVersions.length} pending job(s) from database`
16846
+ );
16847
+ for (const version of interruptedVersions) {
16848
+ const versionLabel = `${version.library_name}@${version.name || "latest"}`;
16849
+ try {
16850
+ await this.enqueueRefreshJob(version.library_name, version.name);
16851
+ logger.info(`🔄 Recovering job: ${versionLabel}`);
16852
+ } catch (error) {
16853
+ const errorMessage = `Recovery failed: ${error instanceof Error ? error.message : String(error)}`;
16854
+ await this.store.updateVersionStatus(
16855
+ version.id,
16856
+ VersionStatus.FAILED,
16857
+ errorMessage
16858
+ );
16859
+ logger.warn(`⚠️ Failed to recover job ${versionLabel}: ${error}`);
16233
16860
  }
16234
- const job = {
16235
- id: jobId,
16236
- library: version.library_name,
16237
- version: version.name || "",
16238
- status: PipelineJobStatus.QUEUED,
16239
- progress: null,
16240
- error: null,
16241
- createdAt: new Date(version.created_at),
16242
- // For recovered QUEUED jobs, startedAt must be null to reflect queued state.
16243
- startedAt: null,
16244
- finishedAt: null,
16245
- abortController,
16246
- completionPromise,
16247
- resolveCompletion,
16248
- rejectCompletion,
16249
- // Database fields (single source of truth)
16250
- versionId: version.id,
16251
- versionStatus: version.status,
16252
- progressPages: version.progress_pages,
16253
- progressMaxPages: version.progress_max_pages,
16254
- errorMessage: version.error_message,
16255
- updatedAt: new Date(version.updated_at),
16256
- sourceUrl: version.source_url,
16257
- scraperOptions: parsedScraperOptions
16258
- };
16259
- this.jobMap.set(jobId, job);
16260
- this.jobQueue.push(jobId);
16261
- }
16262
- if (queuedVersions.length > 0) {
16263
- logger.info(`📥 Recovered ${queuedVersions.length} pending job(s) from database`);
16264
- } else {
16265
- logger.debug("No pending jobs to recover from database");
16266
16861
  }
16267
16862
  } catch (error) {
16268
16863
  logger.error(`❌ Failed to recover pending jobs: ${error}`);
16269
16864
  }
16270
16865
  }
16866
+ /**
16867
+ * Marks all interrupted jobs (RUNNING/QUEUED) as FAILED.
16868
+ * Called when recoverJobs is false to allow users to manually retry via UI.
16869
+ */
16870
+ async markInterruptedJobsAsFailed() {
16871
+ try {
16872
+ const interruptedVersions = await this.store.getVersionsByStatus([
16873
+ VersionStatus.RUNNING,
16874
+ VersionStatus.QUEUED
16875
+ ]);
16876
+ if (interruptedVersions.length === 0) {
16877
+ logger.debug("No interrupted jobs to mark as failed");
16878
+ return;
16879
+ }
16880
+ for (const version of interruptedVersions) {
16881
+ await this.store.updateVersionStatus(
16882
+ version.id,
16883
+ VersionStatus.FAILED,
16884
+ "Job interrupted"
16885
+ );
16886
+ logger.info(
16887
+ `❌ Marked interrupted job as failed: ${version.library_name}@${version.name || "latest"}`
16888
+ );
16889
+ }
16890
+ } catch (error) {
16891
+ logger.error(`❌ Failed to mark interrupted jobs as failed: ${error}`);
16892
+ }
16893
+ }
16271
16894
  /**
16272
16895
  * Stops the pipeline manager and attempts to gracefully shut down workers.
16273
16896
  * Currently, it just stops processing new jobs. Cancellation of active jobs
@@ -16280,7 +16903,6 @@ class PipelineManager {
16280
16903
  }
16281
16904
  this.isRunning = false;
16282
16905
  logger.debug("PipelineManager stopping. No new jobs will be started.");
16283
- await this.scraperService.cleanup();
16284
16906
  }
16285
16907
  /**
16286
16908
  * Enqueues a new document processing job, aborting any existing QUEUED/RUNNING job for the same library+version (including unversioned).
@@ -16870,6 +17492,7 @@ function createAppServerConfig(options) {
16870
17492
  enableWorker: options.enableWorker ?? true,
16871
17493
  port: options.port,
16872
17494
  externalWorkerUrl: options.externalWorkerUrl,
17495
+ showLogo: options.showLogo ?? true,
16873
17496
  startupContext: options.startupContext
16874
17497
  };
16875
17498
  }
@@ -17064,6 +17687,7 @@ function createDefaultAction(cli) {
17064
17687
  enableApiServer: true,
17065
17688
  enableWorker: true,
17066
17689
  port: appConfig.server.ports.default,
17690
+ showLogo: argv.logo,
17067
17691
  startupContext: {
17068
17692
  cliCommand: "default",
17069
17693
  mcpProtocol: "http"
@@ -17351,6 +17975,7 @@ function createMcpCommand(cli) {
17351
17975
  enableWorker: !serverUrl,
17352
17976
  port: appConfig.server.ports.mcp,
17353
17977
  externalWorkerUrl: serverUrl,
17978
+ showLogo: argv.logo,
17354
17979
  startupContext: {
17355
17980
  cliCommand: "mcp",
17356
17981
  mcpProtocol: "http"
@@ -17877,6 +18502,7 @@ function createWebCommand(cli) {
17877
18502
  enableWorker: !serverUrl,
17878
18503
  port: appConfig.server.ports.web,
17879
18504
  externalWorkerUrl: serverUrl,
18505
+ showLogo: argv.logo,
17880
18506
  startupContext: {
17881
18507
  cliCommand: "web"
17882
18508
  }
@@ -17961,6 +18587,7 @@ function createWorkerCommand(cli) {
17961
18587
  enableApiServer: true,
17962
18588
  enableWorker: true,
17963
18589
  port: appConfig.server.ports.worker,
18590
+ showLogo: argv.logo,
17964
18591
  startupContext: {
17965
18592
  cliCommand: "worker"
17966
18593
  }
@@ -17989,7 +18616,7 @@ function createCli(argv) {
17989
18616
  let globalEventBus = null;
17990
18617
  let globalTelemetryService = null;
17991
18618
  const commandStartTimes = /* @__PURE__ */ new Map();
17992
- const cli = yargs(hideBin(argv)).scriptName("docs-mcp-server").strict().usage("Usage: $0 <command> [options]").version("1.36.0").option("verbose", {
18619
+ const cli = yargs(hideBin(argv)).scriptName("docs-mcp-server").strict().usage("Usage: $0 <command> [options]").version("2.0.0").option("verbose", {
17993
18620
  type: "boolean",
17994
18621
  description: "Enable verbose (debug) logging",
17995
18622
  default: false
@@ -18013,6 +18640,10 @@ function createCli(argv) {
18013
18640
  }).option("config", {
18014
18641
  type: "string",
18015
18642
  description: "Path to configuration file"
18643
+ }).option("logo", {
18644
+ type: "boolean",
18645
+ description: "Show ASCII art logo on startup",
18646
+ default: true
18016
18647
  }).middleware(async (argv2) => {
18017
18648
  if (argv2.verbose && argv2.silent) {
18018
18649
  throw new Error("Arguments verbose and silent are mutually exclusive");
@@ -18045,7 +18676,7 @@ function createCli(argv) {
18045
18676
  if (shouldEnableTelemetry() && telemetry.isEnabled()) {
18046
18677
  const commandName = argv2._[0]?.toString() || "default";
18047
18678
  telemetry.setGlobalContext({
18048
- appVersion: "1.36.0",
18679
+ appVersion: "2.0.0",
18049
18680
  appPlatform: process.platform,
18050
18681
  appNodeVersion: process.version,
18051
18682
  appInterface: "cli",