npm - web-to-markdown-crawler - Versions diffs - 1.0.1 → 1.0.3 - Mend

web-to-markdown-crawler 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/README.md CHANGED

@@ -1,5 +1,8 @@
 # web-to-markdown-crawler
+[![npm version](https://img.shields.io/npm/v/web-to-markdown-crawler)](https://www.npmjs.com/package/web-to-markdown-crawler)
+[![CI](https://github.com/leochilds/web-to-markdown-crawler/actions/workflows/ci.yml/badge.svg)](https://github.com/leochilds/web-to-markdown-crawler/actions/workflows/ci.yml)
 A CLI tool that crawls a website and converts every page to Markdown, mirroring the site's URL structure as a local directory tree. Internal links are rewritten to relative `.md` paths so the output works as a self-contained document collection.
 ## Features

package/dist/index.js CHANGED

@@ -68565,7 +68565,7 @@ function serializeGraph(graph, meta, startUrl, totalPages, crawledAt) {
   }
   return { startUrl, crawledAt, totalPages, nodes };
 }
-async function processPage(item, config, startHostname, visited, graph, meta, newItems, state) {
+async function processPage(item, config, visited, graph, meta, newItems, state) {
   const { url, depth } = item;
   if (config.maxPages !== undefined && state.totalPages >= config.maxPages) {
     const node2 = meta.get(url);
@@ -68584,19 +68584,25 @@ async function processPage(item, config, startHostname, visited, graph, meta, ne
         meta.set(effectiveUrl, { depth, status: "pending" });
       }
     }
+    if (depth === 0) {
+      const effectiveHostname = new URL(effectiveUrl).hostname;
+      state.startHostname = effectiveHostname;
+      state.internalHostnames.add(effectiveHostname);
+    }
     const links = extractLinks(html3, effectiveUrl);
     for (const rawLink of links) {
       const link = normalizeUrl2(rawLink);
-      const linkHostname = new URL(link).hostname;
-      const isInternal = linkHostname === startHostname;
+      const parsedLink = new URL(link);
+      const isInternal = state.internalHostnames.has(parsedLink.hostname);
+      const isExcluded = config.exclude?.some((pattern) => parsedLink.pathname.startsWith(pattern)) ?? false;
       graph.dir(url, link);
       if (!meta.has(link)) {
         meta.set(link, {
           depth: depth + 1,
-          status: isInternal ? "pending" : "skipped"
+          status: isInternal && !isExcluded ? "pending" : "skipped"
         });
       }
-      if (isInternal && !visited.has(link) && (config.maxDepth === undefined || depth + 1 <= config.maxDepth)) {
+      if (isInternal && !isExcluded && !visited.has(link) && (config.maxDepth === undefined || depth + 1 <= config.maxDepth)) {
         visited.add(link);
         newItems.push({ url: link, depth: depth + 1 });
       }
@@ -68604,7 +68610,7 @@ async function processPage(item, config, startHostname, visited, graph, meta, ne
     const outputPath = urlToOutputPath(effectiveUrl, config.outputDir);
     const relativeOutputPath = path3.relative(path3.resolve(config.outputDir), outputPath);
     const markdown = convertToMarkdown(html3, effectiveUrl);
-    const rewritten = rewriteInternalLinks(markdown, startHostname, relativeOutputPath, effectiveUrl);
+    const rewritten = rewriteInternalLinks(markdown, state.startHostname, relativeOutputPath, effectiveUrl);
     await writePage(outputPath, rewritten);
     const node2 = meta.get(url);
     node2.status = "success";
@@ -68620,12 +68626,16 @@ async function processPage(item, config, startHostname, visited, graph, meta, ne
 }
 async function run(config) {
   const startUrl = normalizeUrl2(config.startUrl);
-  const startHostname = new URL(startUrl).hostname;
   const crawledAt = new Date().toISOString();
   const visited = new Set([startUrl]);
   const graph = createGraph();
   const meta = new Map;
-  const state = { totalPages: 0 };
+  const initialHostname = new URL(startUrl).hostname;
+  const state = {
+    totalPages: 0,
+    startHostname: initialHostname,
+    internalHostnames: new Set([initialHostname])
+  };
   meta.set(startUrl, { depth: 0, status: "pending" });
   const limit = pLimit(config.concurrency);
   let queue = [{ url: startUrl, depth: 0 }];
@@ -68634,7 +68644,7 @@ async function run(config) {
       break;
     const batch = queue.splice(0);
     const newItems = [];
-    const tasks = batch.map((item) => limit(() => processPage(item, config, startHostname, visited, graph, meta, newItems, state)));
+    const tasks = batch.map((item) => limit(() => processPage(item, config, visited, graph, meta, newItems, state)));
     await Promise.allSettled(tasks);
     queue = newItems;
   }
@@ -68660,7 +68670,7 @@ function parseStartUrl(raw) {
   }
 }
 var program2 = new Command;
-program2.name("crawl").description("Crawl a website and convert pages to markdown").argument("<url>", "Starting URL to crawl").option("-o, --output <dir>", "Output directory", "./output").option("-c, --concurrency <n>", "Number of parallel fetches", "5").option("--max-depth <n>", "Maximum crawl depth (0 = start page only)").option("--max-pages <n>", "Maximum number of pages to crawl").option("--delay <ms>", "Delay in milliseconds between requests").action(async (url, opts) => {
+program2.name("crawl").description("Crawl a website and convert pages to markdown").argument("<url>", "Starting URL to crawl").option("-o, --output <dir>", "Output directory", "./output").option("-c, --concurrency <n>", "Number of parallel fetches", "5").option("--max-depth <n>", "Maximum crawl depth (0 = start page only)").option("--max-pages <n>", "Maximum number of pages to crawl").option("--delay <ms>", "Delay in milliseconds between requests").option("--exclude <path>", "Exclude a URL path prefix from crawling — can be repeated (e.g. --exclude /admin --exclude /login)", (val2, prev2) => [...prev2, val2], []).action(async (url, opts) => {
   const startUrl = parseStartUrl(url);
   const concurrency = parseInt(opts["concurrency"] ?? "5", 10);
   if (isNaN(concurrency) || concurrency < 1) {
@@ -68670,6 +68680,8 @@ program2.name("crawl").description("Crawl a website and convert pages to markdow
   const maxDepth = opts["maxDepth"] != null ? parseInt(opts["maxDepth"], 10) : undefined;
   const maxPages = opts["maxPages"] != null ? parseInt(opts["maxPages"], 10) : undefined;
   const delayMs = opts["delay"] != null ? parseInt(opts["delay"], 10) : undefined;
+  const excludeRaw = opts["exclude"];
+  const exclude = excludeRaw.length ? excludeRaw : undefined;
   if (maxDepth !== undefined && isNaN(maxDepth)) {
     console.error("Error: --max-depth must be an integer");
     process.exit(1);
@@ -68688,7 +68700,8 @@ program2.name("crawl").description("Crawl a website and convert pages to markdow
     concurrency,
     maxDepth,
     maxPages,
-    delayMs
+    delayMs,
+    exclude
   };
   console.log(`Starting crawl of ${startUrl}`);
   console.log(`Output: ${config.outputDir} | Concurrency: ${config.concurrency}${maxDepth !== undefined ? ` | Max depth: ${maxDepth}` : ""}${maxPages !== undefined ? ` | Max pages: ${maxPages}` : ""}

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "web-to-markdown-crawler",
-  "version": "1.0.1",
+  "version": "1.0.3",
   "description": "Web crawler that converts site pages to markdown, mirroring the URL structure locally",
   "type": "module",
   "main": "dist/index.js",