web-to-markdown-crawler 1.0.0 → 1.0.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,5 +1,8 @@
1
1
  # web-to-markdown-crawler
2
2
 
3
+ [![npm version](https://img.shields.io/npm/v/web-to-markdown-crawler)](https://www.npmjs.com/package/web-to-markdown-crawler)
4
+ [![CI](https://github.com/leochilds/web-to-markdown-crawler/actions/workflows/ci.yml/badge.svg)](https://github.com/leochilds/web-to-markdown-crawler/actions/workflows/ci.yml)
5
+
3
6
  A CLI tool that crawls a website and converts every page to Markdown, mirroring the site's URL structure as a local directory tree. Internal links are rewritten to relative `.md` paths so the output works as a self-contained document collection.
4
7
 
5
8
  ## Features
package/dist/index.js CHANGED
@@ -68287,7 +68287,7 @@ var STRINGS = {
68287
68287
  COMMENT_END: toUint8Array("-->")
68288
68288
  };
68289
68289
  // node_modules/undici/index.js
68290
- var __filename = "/home/leo/code/web-to-markdown-crawler/node_modules/undici/index.js";
68290
+ var __filename = "/home/runner/work/web-to-markdown-crawler/web-to-markdown-crawler/node_modules/undici/index.js";
68291
68291
  var Client = require_client();
68292
68292
  var Dispatcher = require_dispatcher();
68293
68293
  var Pool = require_pool();
@@ -68565,7 +68565,7 @@ function serializeGraph(graph, meta, startUrl, totalPages, crawledAt) {
68565
68565
  }
68566
68566
  return { startUrl, crawledAt, totalPages, nodes };
68567
68567
  }
68568
- async function processPage(item, config, startHostname, visited, graph, meta, newItems, state) {
68568
+ async function processPage(item, config, visited, graph, meta, newItems, state) {
68569
68569
  const { url, depth } = item;
68570
68570
  if (config.maxPages !== undefined && state.totalPages >= config.maxPages) {
68571
68571
  const node2 = meta.get(url);
@@ -68584,11 +68584,16 @@ async function processPage(item, config, startHostname, visited, graph, meta, ne
68584
68584
  meta.set(effectiveUrl, { depth, status: "pending" });
68585
68585
  }
68586
68586
  }
68587
+ if (depth === 0) {
68588
+ const effectiveHostname = new URL(effectiveUrl).hostname;
68589
+ state.startHostname = effectiveHostname;
68590
+ state.internalHostnames.add(effectiveHostname);
68591
+ }
68587
68592
  const links = extractLinks(html3, effectiveUrl);
68588
68593
  for (const rawLink of links) {
68589
68594
  const link = normalizeUrl2(rawLink);
68590
68595
  const linkHostname = new URL(link).hostname;
68591
- const isInternal = linkHostname === startHostname;
68596
+ const isInternal = state.internalHostnames.has(linkHostname);
68592
68597
  graph.dir(url, link);
68593
68598
  if (!meta.has(link)) {
68594
68599
  meta.set(link, {
@@ -68604,7 +68609,7 @@ async function processPage(item, config, startHostname, visited, graph, meta, ne
68604
68609
  const outputPath = urlToOutputPath(effectiveUrl, config.outputDir);
68605
68610
  const relativeOutputPath = path3.relative(path3.resolve(config.outputDir), outputPath);
68606
68611
  const markdown = convertToMarkdown(html3, effectiveUrl);
68607
- const rewritten = rewriteInternalLinks(markdown, startHostname, relativeOutputPath, effectiveUrl);
68612
+ const rewritten = rewriteInternalLinks(markdown, state.startHostname, relativeOutputPath, effectiveUrl);
68608
68613
  await writePage(outputPath, rewritten);
68609
68614
  const node2 = meta.get(url);
68610
68615
  node2.status = "success";
@@ -68620,12 +68625,16 @@ async function processPage(item, config, startHostname, visited, graph, meta, ne
68620
68625
  }
68621
68626
  async function run(config) {
68622
68627
  const startUrl = normalizeUrl2(config.startUrl);
68623
- const startHostname = new URL(startUrl).hostname;
68624
68628
  const crawledAt = new Date().toISOString();
68625
68629
  const visited = new Set([startUrl]);
68626
68630
  const graph = createGraph();
68627
68631
  const meta = new Map;
68628
- const state = { totalPages: 0 };
68632
+ const initialHostname = new URL(startUrl).hostname;
68633
+ const state = {
68634
+ totalPages: 0,
68635
+ startHostname: initialHostname,
68636
+ internalHostnames: new Set([initialHostname])
68637
+ };
68629
68638
  meta.set(startUrl, { depth: 0, status: "pending" });
68630
68639
  const limit = pLimit(config.concurrency);
68631
68640
  let queue = [{ url: startUrl, depth: 0 }];
@@ -68634,7 +68643,7 @@ async function run(config) {
68634
68643
  break;
68635
68644
  const batch = queue.splice(0);
68636
68645
  const newItems = [];
68637
- const tasks = batch.map((item) => limit(() => processPage(item, config, startHostname, visited, graph, meta, newItems, state)));
68646
+ const tasks = batch.map((item) => limit(() => processPage(item, config, visited, graph, meta, newItems, state)));
68638
68647
  await Promise.allSettled(tasks);
68639
68648
  queue = newItems;
68640
68649
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "web-to-markdown-crawler",
3
- "version": "1.0.0",
3
+ "version": "1.0.2",
4
4
  "description": "Web crawler that converts site pages to markdown, mirroring the URL structure locally",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -10,6 +10,10 @@
10
10
  "files": [
11
11
  "dist"
12
12
  ],
13
+ "repository": {
14
+ "type": "git",
15
+ "url": "https://github.com/leochilds/web-to-markdown-crawler.git"
16
+ },
13
17
  "engines": {
14
18
  "node": ">=18"
15
19
  },