web-to-markdown-crawler 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/dist/index.js +15 -6
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
# web-to-markdown-crawler
|
|
2
2
|
|
|
3
|
+
[![npm version](https://img.shields.io/npm/v/web-to-markdown-crawler)](https://www.npmjs.com/package/web-to-markdown-crawler)
|
|
4
|
+
[![CI](https://github.com/leochilds/web-to-markdown-crawler/actions/workflows/ci.yml/badge.svg)](https://github.com/leochilds/web-to-markdown-crawler/actions/workflows/ci.yml)
|
|
5
|
+
|
|
3
6
|
A CLI tool that crawls a website and converts every page to Markdown, mirroring the site's URL structure as a local directory tree. Internal links are rewritten to relative `.md` paths so the output works as a self-contained document collection.
|
|
4
7
|
|
|
5
8
|
## Features
|
package/dist/index.js
CHANGED
|
@@ -68565,7 +68565,7 @@ function serializeGraph(graph, meta, startUrl, totalPages, crawledAt) {
|
|
|
68565
68565
|
}
|
|
68566
68566
|
return { startUrl, crawledAt, totalPages, nodes };
|
|
68567
68567
|
}
|
|
68568
|
-
async function processPage(item, config, startHostname, visited, graph, meta, newItems, state) {
|
|
68568
|
+
async function processPage(item, config, visited, graph, meta, newItems, state) {
|
|
68569
68569
|
const { url, depth } = item;
|
|
68570
68570
|
if (config.maxPages !== undefined && state.totalPages >= config.maxPages) {
|
|
68571
68571
|
const node2 = meta.get(url);
|
|
@@ -68584,11 +68584,16 @@ async function processPage(item, config, startHostname, visited, graph, meta, ne
|
|
|
68584
68584
|
meta.set(effectiveUrl, { depth, status: "pending" });
|
|
68585
68585
|
}
|
|
68586
68586
|
}
|
|
68587
|
+
if (depth === 0) {
|
|
68588
|
+
const effectiveHostname = new URL(effectiveUrl).hostname;
|
|
68589
|
+
state.startHostname = effectiveHostname;
|
|
68590
|
+
state.internalHostnames.add(effectiveHostname);
|
|
68591
|
+
}
|
|
68587
68592
|
const links = extractLinks(html3, effectiveUrl);
|
|
68588
68593
|
for (const rawLink of links) {
|
|
68589
68594
|
const link = normalizeUrl2(rawLink);
|
|
68590
68595
|
const linkHostname = new URL(link).hostname;
|
|
68591
|
-
const isInternal = linkHostname === startHostname;
|
|
68596
|
+
const isInternal = state.internalHostnames.has(linkHostname);
|
|
68592
68597
|
graph.dir(url, link);
|
|
68593
68598
|
if (!meta.has(link)) {
|
|
68594
68599
|
meta.set(link, {
|
|
@@ -68604,7 +68609,7 @@ async function processPage(item, config, startHostname, visited, graph, meta, ne
|
|
|
68604
68609
|
const outputPath = urlToOutputPath(effectiveUrl, config.outputDir);
|
|
68605
68610
|
const relativeOutputPath = path3.relative(path3.resolve(config.outputDir), outputPath);
|
|
68606
68611
|
const markdown = convertToMarkdown(html3, effectiveUrl);
|
|
68607
|
-
const rewritten = rewriteInternalLinks(markdown, startHostname, relativeOutputPath, effectiveUrl);
|
|
68612
|
+
const rewritten = rewriteInternalLinks(markdown, state.startHostname, relativeOutputPath, effectiveUrl);
|
|
68608
68613
|
await writePage(outputPath, rewritten);
|
|
68609
68614
|
const node2 = meta.get(url);
|
|
68610
68615
|
node2.status = "success";
|
|
@@ -68620,12 +68625,16 @@ async function processPage(item, config, startHostname, visited, graph, meta, ne
|
|
|
68620
68625
|
}
|
|
68621
68626
|
async function run(config) {
|
|
68622
68627
|
const startUrl = normalizeUrl2(config.startUrl);
|
|
68623
|
-
const startHostname = new URL(startUrl).hostname;
|
|
68624
68628
|
const crawledAt = new Date().toISOString();
|
|
68625
68629
|
const visited = new Set([startUrl]);
|
|
68626
68630
|
const graph = createGraph();
|
|
68627
68631
|
const meta = new Map;
|
|
68628
|
-
const state = { totalPages: 0 };
|
|
68632
|
+
const initialHostname = new URL(startUrl).hostname;
|
|
68633
|
+
const state = {
|
|
68634
|
+
totalPages: 0,
|
|
68635
|
+
startHostname: initialHostname,
|
|
68636
|
+
internalHostnames: new Set([initialHostname])
|
|
68637
|
+
};
|
|
68629
68638
|
meta.set(startUrl, { depth: 0, status: "pending" });
|
|
68630
68639
|
const limit = pLimit(config.concurrency);
|
|
68631
68640
|
let queue = [{ url: startUrl, depth: 0 }];
|
|
@@ -68634,7 +68643,7 @@ async function run(config) {
|
|
|
68634
68643
|
break;
|
|
68635
68644
|
const batch = queue.splice(0);
|
|
68636
68645
|
const newItems = [];
|
|
68637
|
-
const tasks = batch.map((item) => limit(() => processPage(item, config, startHostname, visited, graph, meta, newItems, state)));
|
|
68646
|
+
const tasks = batch.map((item) => limit(() => processPage(item, config, visited, graph, meta, newItems, state)));
|
|
68638
68647
|
await Promise.allSettled(tasks);
|
|
68639
68648
|
queue = newItems;
|
|
68640
68649
|
}
|