web-to-markdown-crawler 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +3 -0
  2. package/dist/index.js +24 -11
  3. package/package.json +1 -1
package/README.md CHANGED
@@ -1,5 +1,8 @@
1
1
  # web-to-markdown-crawler
2
2
 
3
+ [![npm version](https://img.shields.io/npm/v/web-to-markdown-crawler)](https://www.npmjs.com/package/web-to-markdown-crawler)
4
+ [![CI](https://github.com/leochilds/web-to-markdown-crawler/actions/workflows/ci.yml/badge.svg)](https://github.com/leochilds/web-to-markdown-crawler/actions/workflows/ci.yml)
5
+
3
6
  A CLI tool that crawls a website and converts every page to Markdown, mirroring the site's URL structure as a local directory tree. Internal links are rewritten to relative `.md` paths so the output works as a self-contained document collection.
4
7
 
5
8
  ## Features
package/dist/index.js CHANGED
@@ -68565,7 +68565,7 @@ function serializeGraph(graph, meta, startUrl, totalPages, crawledAt) {
68565
68565
  }
68566
68566
  return { startUrl, crawledAt, totalPages, nodes };
68567
68567
  }
68568
- async function processPage(item, config, startHostname, visited, graph, meta, newItems, state) {
68568
+ async function processPage(item, config, visited, graph, meta, newItems, state) {
68569
68569
  const { url, depth } = item;
68570
68570
  if (config.maxPages !== undefined && state.totalPages >= config.maxPages) {
68571
68571
  const node2 = meta.get(url);
@@ -68584,19 +68584,25 @@ async function processPage(item, config, startHostname, visited, graph, meta, ne
68584
68584
  meta.set(effectiveUrl, { depth, status: "pending" });
68585
68585
  }
68586
68586
  }
68587
+ if (depth === 0) {
68588
+ const effectiveHostname = new URL(effectiveUrl).hostname;
68589
+ state.startHostname = effectiveHostname;
68590
+ state.internalHostnames.add(effectiveHostname);
68591
+ }
68587
68592
  const links = extractLinks(html3, effectiveUrl);
68588
68593
  for (const rawLink of links) {
68589
68594
  const link = normalizeUrl2(rawLink);
68590
- const linkHostname = new URL(link).hostname;
68591
- const isInternal = linkHostname === startHostname;
68595
+ const parsedLink = new URL(link);
68596
+ const isInternal = state.internalHostnames.has(parsedLink.hostname);
68597
+ const isExcluded = config.exclude?.some((pattern) => parsedLink.pathname.startsWith(pattern)) ?? false;
68592
68598
  graph.dir(url, link);
68593
68599
  if (!meta.has(link)) {
68594
68600
  meta.set(link, {
68595
68601
  depth: depth + 1,
68596
- status: isInternal ? "pending" : "skipped"
68602
+ status: isInternal && !isExcluded ? "pending" : "skipped"
68597
68603
  });
68598
68604
  }
68599
- if (isInternal && !visited.has(link) && (config.maxDepth === undefined || depth + 1 <= config.maxDepth)) {
68605
+ if (isInternal && !isExcluded && !visited.has(link) && (config.maxDepth === undefined || depth + 1 <= config.maxDepth)) {
68600
68606
  visited.add(link);
68601
68607
  newItems.push({ url: link, depth: depth + 1 });
68602
68608
  }
@@ -68604,7 +68610,7 @@ async function processPage(item, config, startHostname, visited, graph, meta, ne
68604
68610
  const outputPath = urlToOutputPath(effectiveUrl, config.outputDir);
68605
68611
  const relativeOutputPath = path3.relative(path3.resolve(config.outputDir), outputPath);
68606
68612
  const markdown = convertToMarkdown(html3, effectiveUrl);
68607
- const rewritten = rewriteInternalLinks(markdown, startHostname, relativeOutputPath, effectiveUrl);
68613
+ const rewritten = rewriteInternalLinks(markdown, state.startHostname, relativeOutputPath, effectiveUrl);
68608
68614
  await writePage(outputPath, rewritten);
68609
68615
  const node2 = meta.get(url);
68610
68616
  node2.status = "success";
@@ -68620,12 +68626,16 @@ async function processPage(item, config, startHostname, visited, graph, meta, ne
68620
68626
  }
68621
68627
  async function run(config) {
68622
68628
  const startUrl = normalizeUrl2(config.startUrl);
68623
- const startHostname = new URL(startUrl).hostname;
68624
68629
  const crawledAt = new Date().toISOString();
68625
68630
  const visited = new Set([startUrl]);
68626
68631
  const graph = createGraph();
68627
68632
  const meta = new Map;
68628
- const state = { totalPages: 0 };
68633
+ const initialHostname = new URL(startUrl).hostname;
68634
+ const state = {
68635
+ totalPages: 0,
68636
+ startHostname: initialHostname,
68637
+ internalHostnames: new Set([initialHostname])
68638
+ };
68629
68639
  meta.set(startUrl, { depth: 0, status: "pending" });
68630
68640
  const limit = pLimit(config.concurrency);
68631
68641
  let queue = [{ url: startUrl, depth: 0 }];
@@ -68634,7 +68644,7 @@ async function run(config) {
68634
68644
  break;
68635
68645
  const batch = queue.splice(0);
68636
68646
  const newItems = [];
68637
- const tasks = batch.map((item) => limit(() => processPage(item, config, startHostname, visited, graph, meta, newItems, state)));
68647
+ const tasks = batch.map((item) => limit(() => processPage(item, config, visited, graph, meta, newItems, state)));
68638
68648
  await Promise.allSettled(tasks);
68639
68649
  queue = newItems;
68640
68650
  }
@@ -68660,7 +68670,7 @@ function parseStartUrl(raw) {
68660
68670
  }
68661
68671
  }
68662
68672
  var program2 = new Command;
68663
- program2.name("crawl").description("Crawl a website and convert pages to markdown").argument("<url>", "Starting URL to crawl").option("-o, --output <dir>", "Output directory", "./output").option("-c, --concurrency <n>", "Number of parallel fetches", "5").option("--max-depth <n>", "Maximum crawl depth (0 = start page only)").option("--max-pages <n>", "Maximum number of pages to crawl").option("--delay <ms>", "Delay in milliseconds between requests").action(async (url, opts) => {
68673
+ program2.name("crawl").description("Crawl a website and convert pages to markdown").argument("<url>", "Starting URL to crawl").option("-o, --output <dir>", "Output directory", "./output").option("-c, --concurrency <n>", "Number of parallel fetches", "5").option("--max-depth <n>", "Maximum crawl depth (0 = start page only)").option("--max-pages <n>", "Maximum number of pages to crawl").option("--delay <ms>", "Delay in milliseconds between requests").option("--exclude <path>", "Exclude a URL path prefix from crawling — can be repeated (e.g. --exclude /admin --exclude /login)", (val2, prev2) => [...prev2, val2], []).action(async (url, opts) => {
68664
68674
  const startUrl = parseStartUrl(url);
68665
68675
  const concurrency = parseInt(opts["concurrency"] ?? "5", 10);
68666
68676
  if (isNaN(concurrency) || concurrency < 1) {
@@ -68670,6 +68680,8 @@ program2.name("crawl").description("Crawl a website and convert pages to markdow
68670
68680
  const maxDepth = opts["maxDepth"] != null ? parseInt(opts["maxDepth"], 10) : undefined;
68671
68681
  const maxPages = opts["maxPages"] != null ? parseInt(opts["maxPages"], 10) : undefined;
68672
68682
  const delayMs = opts["delay"] != null ? parseInt(opts["delay"], 10) : undefined;
68683
+ const excludeRaw = opts["exclude"];
68684
+ const exclude = excludeRaw.length ? excludeRaw : undefined;
68673
68685
  if (maxDepth !== undefined && isNaN(maxDepth)) {
68674
68686
  console.error("Error: --max-depth must be an integer");
68675
68687
  process.exit(1);
@@ -68688,7 +68700,8 @@ program2.name("crawl").description("Crawl a website and convert pages to markdow
68688
68700
  concurrency,
68689
68701
  maxDepth,
68690
68702
  maxPages,
68691
- delayMs
68703
+ delayMs,
68704
+ exclude
68692
68705
  };
68693
68706
  console.log(`Starting crawl of ${startUrl}`);
68694
68707
  console.log(`Output: ${config.outputDir} | Concurrency: ${config.concurrency}${maxDepth !== undefined ? ` | Max depth: ${maxDepth}` : ""}${maxPages !== undefined ? ` | Max pages: ${maxPages}` : ""}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "web-to-markdown-crawler",
3
- "version": "1.0.1",
3
+ "version": "1.0.3",
4
4
  "description": "Web crawler that converts site pages to markdown, mirroring the URL structure locally",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",