web-to-markdown-crawler 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +10 -6
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -68592,16 +68592,17 @@ async function processPage(item, config, visited, graph, meta, newItems, state)
68592
68592
  const links = extractLinks(html3, effectiveUrl);
68593
68593
  for (const rawLink of links) {
68594
68594
  const link = normalizeUrl2(rawLink);
68595
- const linkHostname = new URL(link).hostname;
68596
- const isInternal = state.internalHostnames.has(linkHostname);
68595
+ const parsedLink = new URL(link);
68596
+ const isInternal = state.internalHostnames.has(parsedLink.hostname);
68597
+ const isExcluded = config.exclude?.some((pattern) => parsedLink.pathname.startsWith(pattern)) ?? false;
68597
68598
  graph.dir(url, link);
68598
68599
  if (!meta.has(link)) {
68599
68600
  meta.set(link, {
68600
68601
  depth: depth + 1,
68601
- status: isInternal ? "pending" : "skipped"
68602
+ status: isInternal && !isExcluded ? "pending" : "skipped"
68602
68603
  });
68603
68604
  }
68604
- if (isInternal && !visited.has(link) && (config.maxDepth === undefined || depth + 1 <= config.maxDepth)) {
68605
+ if (isInternal && !isExcluded && !visited.has(link) && (config.maxDepth === undefined || depth + 1 <= config.maxDepth)) {
68605
68606
  visited.add(link);
68606
68607
  newItems.push({ url: link, depth: depth + 1 });
68607
68608
  }
@@ -68669,7 +68670,7 @@ function parseStartUrl(raw) {
68669
68670
  }
68670
68671
  }
68671
68672
  var program2 = new Command;
68672
- program2.name("crawl").description("Crawl a website and convert pages to markdown").argument("<url>", "Starting URL to crawl").option("-o, --output <dir>", "Output directory", "./output").option("-c, --concurrency <n>", "Number of parallel fetches", "5").option("--max-depth <n>", "Maximum crawl depth (0 = start page only)").option("--max-pages <n>", "Maximum number of pages to crawl").option("--delay <ms>", "Delay in milliseconds between requests").action(async (url, opts) => {
68673
+ program2.name("crawl").description("Crawl a website and convert pages to markdown").argument("<url>", "Starting URL to crawl").option("-o, --output <dir>", "Output directory", "./output").option("-c, --concurrency <n>", "Number of parallel fetches", "5").option("--max-depth <n>", "Maximum crawl depth (0 = start page only)").option("--max-pages <n>", "Maximum number of pages to crawl").option("--delay <ms>", "Delay in milliseconds between requests").option("--exclude <path>", "Exclude a URL path prefix from crawling — can be repeated (e.g. --exclude /admin --exclude /login)", (val2, prev2) => [...prev2, val2], []).action(async (url, opts) => {
68673
68674
  const startUrl = parseStartUrl(url);
68674
68675
  const concurrency = parseInt(opts["concurrency"] ?? "5", 10);
68675
68676
  if (isNaN(concurrency) || concurrency < 1) {
@@ -68679,6 +68680,8 @@ program2.name("crawl").description("Crawl a website and convert pages to markdow
68679
68680
  const maxDepth = opts["maxDepth"] != null ? parseInt(opts["maxDepth"], 10) : undefined;
68680
68681
  const maxPages = opts["maxPages"] != null ? parseInt(opts["maxPages"], 10) : undefined;
68681
68682
  const delayMs = opts["delay"] != null ? parseInt(opts["delay"], 10) : undefined;
68683
+ const excludeRaw = opts["exclude"];
68684
+ const exclude = excludeRaw.length ? excludeRaw : undefined;
68682
68685
  if (maxDepth !== undefined && isNaN(maxDepth)) {
68683
68686
  console.error("Error: --max-depth must be an integer");
68684
68687
  process.exit(1);
@@ -68697,7 +68700,8 @@ program2.name("crawl").description("Crawl a website and convert pages to markdow
68697
68700
  concurrency,
68698
68701
  maxDepth,
68699
68702
  maxPages,
68700
- delayMs
68703
+ delayMs,
68704
+ exclude
68701
68705
  };
68702
68706
  console.log(`Starting crawl of ${startUrl}`);
68703
68707
  console.log(`Output: ${config.outputDir} | Concurrency: ${config.concurrency}${maxDepth !== undefined ? ` | Max depth: ${maxDepth}` : ""}${maxPages !== undefined ? ` | Max pages: ${maxPages}` : ""}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "web-to-markdown-crawler",
3
- "version": "1.0.2",
3
+ "version": "1.0.3",
4
4
  "description": "Web crawler that converts site pages to markdown, mirroring the URL structure locally",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",