web-to-markdown-crawler 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/dist/index.js +24 -11
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
# web-to-markdown-crawler
|
|
2
2
|
|
|
3
|
+
[![npm version](https://img.shields.io/npm/v/web-to-markdown-crawler)](https://www.npmjs.com/package/web-to-markdown-crawler)
|
|
4
|
+
[![CI](https://github.com/leochilds/web-to-markdown-crawler/actions/workflows/ci.yml/badge.svg)](https://github.com/leochilds/web-to-markdown-crawler/actions/workflows/ci.yml)
|
|
5
|
+
|
|
3
6
|
A CLI tool that crawls a website and converts every page to Markdown, mirroring the site's URL structure as a local directory tree. Internal links are rewritten to relative `.md` paths so the output works as a self-contained document collection.
|
|
4
7
|
|
|
5
8
|
## Features
|
package/dist/index.js
CHANGED
|
@@ -68565,7 +68565,7 @@ function serializeGraph(graph, meta, startUrl, totalPages, crawledAt) {
|
|
|
68565
68565
|
}
|
|
68566
68566
|
return { startUrl, crawledAt, totalPages, nodes };
|
|
68567
68567
|
}
|
|
68568
|
-
async function processPage(item, config,
|
|
68568
|
+
async function processPage(item, config, visited, graph, meta, newItems, state) {
|
|
68569
68569
|
const { url, depth } = item;
|
|
68570
68570
|
if (config.maxPages !== undefined && state.totalPages >= config.maxPages) {
|
|
68571
68571
|
const node2 = meta.get(url);
|
|
@@ -68584,19 +68584,25 @@ async function processPage(item, config, startHostname, visited, graph, meta, ne
|
|
|
68584
68584
|
meta.set(effectiveUrl, { depth, status: "pending" });
|
|
68585
68585
|
}
|
|
68586
68586
|
}
|
|
68587
|
+
if (depth === 0) {
|
|
68588
|
+
const effectiveHostname = new URL(effectiveUrl).hostname;
|
|
68589
|
+
state.startHostname = effectiveHostname;
|
|
68590
|
+
state.internalHostnames.add(effectiveHostname);
|
|
68591
|
+
}
|
|
68587
68592
|
const links = extractLinks(html3, effectiveUrl);
|
|
68588
68593
|
for (const rawLink of links) {
|
|
68589
68594
|
const link = normalizeUrl2(rawLink);
|
|
68590
|
-
const
|
|
68591
|
-
const isInternal =
|
|
68595
|
+
const parsedLink = new URL(link);
|
|
68596
|
+
const isInternal = state.internalHostnames.has(parsedLink.hostname);
|
|
68597
|
+
const isExcluded = config.exclude?.some((pattern) => parsedLink.pathname.startsWith(pattern)) ?? false;
|
|
68592
68598
|
graph.dir(url, link);
|
|
68593
68599
|
if (!meta.has(link)) {
|
|
68594
68600
|
meta.set(link, {
|
|
68595
68601
|
depth: depth + 1,
|
|
68596
|
-
status: isInternal ? "pending" : "skipped"
|
|
68602
|
+
status: isInternal && !isExcluded ? "pending" : "skipped"
|
|
68597
68603
|
});
|
|
68598
68604
|
}
|
|
68599
|
-
if (isInternal && !visited.has(link) && (config.maxDepth === undefined || depth + 1 <= config.maxDepth)) {
|
|
68605
|
+
if (isInternal && !isExcluded && !visited.has(link) && (config.maxDepth === undefined || depth + 1 <= config.maxDepth)) {
|
|
68600
68606
|
visited.add(link);
|
|
68601
68607
|
newItems.push({ url: link, depth: depth + 1 });
|
|
68602
68608
|
}
|
|
@@ -68604,7 +68610,7 @@ async function processPage(item, config, startHostname, visited, graph, meta, ne
|
|
|
68604
68610
|
const outputPath = urlToOutputPath(effectiveUrl, config.outputDir);
|
|
68605
68611
|
const relativeOutputPath = path3.relative(path3.resolve(config.outputDir), outputPath);
|
|
68606
68612
|
const markdown = convertToMarkdown(html3, effectiveUrl);
|
|
68607
|
-
const rewritten = rewriteInternalLinks(markdown, startHostname, relativeOutputPath, effectiveUrl);
|
|
68613
|
+
const rewritten = rewriteInternalLinks(markdown, state.startHostname, relativeOutputPath, effectiveUrl);
|
|
68608
68614
|
await writePage(outputPath, rewritten);
|
|
68609
68615
|
const node2 = meta.get(url);
|
|
68610
68616
|
node2.status = "success";
|
|
@@ -68620,12 +68626,16 @@ async function processPage(item, config, startHostname, visited, graph, meta, ne
|
|
|
68620
68626
|
}
|
|
68621
68627
|
async function run(config) {
|
|
68622
68628
|
const startUrl = normalizeUrl2(config.startUrl);
|
|
68623
|
-
const startHostname = new URL(startUrl).hostname;
|
|
68624
68629
|
const crawledAt = new Date().toISOString();
|
|
68625
68630
|
const visited = new Set([startUrl]);
|
|
68626
68631
|
const graph = createGraph();
|
|
68627
68632
|
const meta = new Map;
|
|
68628
|
-
const
|
|
68633
|
+
const initialHostname = new URL(startUrl).hostname;
|
|
68634
|
+
const state = {
|
|
68635
|
+
totalPages: 0,
|
|
68636
|
+
startHostname: initialHostname,
|
|
68637
|
+
internalHostnames: new Set([initialHostname])
|
|
68638
|
+
};
|
|
68629
68639
|
meta.set(startUrl, { depth: 0, status: "pending" });
|
|
68630
68640
|
const limit = pLimit(config.concurrency);
|
|
68631
68641
|
let queue = [{ url: startUrl, depth: 0 }];
|
|
@@ -68634,7 +68644,7 @@ async function run(config) {
|
|
|
68634
68644
|
break;
|
|
68635
68645
|
const batch = queue.splice(0);
|
|
68636
68646
|
const newItems = [];
|
|
68637
|
-
const tasks = batch.map((item) => limit(() => processPage(item, config,
|
|
68647
|
+
const tasks = batch.map((item) => limit(() => processPage(item, config, visited, graph, meta, newItems, state)));
|
|
68638
68648
|
await Promise.allSettled(tasks);
|
|
68639
68649
|
queue = newItems;
|
|
68640
68650
|
}
|
|
@@ -68660,7 +68670,7 @@ function parseStartUrl(raw) {
|
|
|
68660
68670
|
}
|
|
68661
68671
|
}
|
|
68662
68672
|
var program2 = new Command;
|
|
68663
|
-
program2.name("crawl").description("Crawl a website and convert pages to markdown").argument("<url>", "Starting URL to crawl").option("-o, --output <dir>", "Output directory", "./output").option("-c, --concurrency <n>", "Number of parallel fetches", "5").option("--max-depth <n>", "Maximum crawl depth (0 = start page only)").option("--max-pages <n>", "Maximum number of pages to crawl").option("--delay <ms>", "Delay in milliseconds between requests").action(async (url, opts) => {
|
|
68673
|
+
program2.name("crawl").description("Crawl a website and convert pages to markdown").argument("<url>", "Starting URL to crawl").option("-o, --output <dir>", "Output directory", "./output").option("-c, --concurrency <n>", "Number of parallel fetches", "5").option("--max-depth <n>", "Maximum crawl depth (0 = start page only)").option("--max-pages <n>", "Maximum number of pages to crawl").option("--delay <ms>", "Delay in milliseconds between requests").option("--exclude <path>", "Exclude a URL path prefix from crawling — can be repeated (e.g. --exclude /admin --exclude /login)", (val2, prev2) => [...prev2, val2], []).action(async (url, opts) => {
|
|
68664
68674
|
const startUrl = parseStartUrl(url);
|
|
68665
68675
|
const concurrency = parseInt(opts["concurrency"] ?? "5", 10);
|
|
68666
68676
|
if (isNaN(concurrency) || concurrency < 1) {
|
|
@@ -68670,6 +68680,8 @@ program2.name("crawl").description("Crawl a website and convert pages to markdow
|
|
|
68670
68680
|
const maxDepth = opts["maxDepth"] != null ? parseInt(opts["maxDepth"], 10) : undefined;
|
|
68671
68681
|
const maxPages = opts["maxPages"] != null ? parseInt(opts["maxPages"], 10) : undefined;
|
|
68672
68682
|
const delayMs = opts["delay"] != null ? parseInt(opts["delay"], 10) : undefined;
|
|
68683
|
+
const excludeRaw = opts["exclude"];
|
|
68684
|
+
const exclude = excludeRaw.length ? excludeRaw : undefined;
|
|
68673
68685
|
if (maxDepth !== undefined && isNaN(maxDepth)) {
|
|
68674
68686
|
console.error("Error: --max-depth must be an integer");
|
|
68675
68687
|
process.exit(1);
|
|
@@ -68688,7 +68700,8 @@ program2.name("crawl").description("Crawl a website and convert pages to markdow
|
|
|
68688
68700
|
concurrency,
|
|
68689
68701
|
maxDepth,
|
|
68690
68702
|
maxPages,
|
|
68691
|
-
delayMs
|
|
68703
|
+
delayMs,
|
|
68704
|
+
exclude
|
|
68692
68705
|
};
|
|
68693
68706
|
console.log(`Starting crawl of ${startUrl}`);
|
|
68694
68707
|
console.log(`Output: ${config.outputDir} | Concurrency: ${config.concurrency}${maxDepth !== undefined ? ` | Max depth: ${maxDepth}` : ""}${maxPages !== undefined ? ` | Max pages: ${maxPages}` : ""}
|