web-to-markdown-crawler 1.0.2 → 1.0.3
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
- package/dist/index.js +10 -6
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -68592,16 +68592,17 @@ async function processPage(item, config, visited, graph, meta, newItems, state)
|
|
|
68592
68592
|
const links = extractLinks(html3, effectiveUrl);
|
|
68593
68593
|
for (const rawLink of links) {
|
|
68594
68594
|
const link = normalizeUrl2(rawLink);
|
|
68595
|
-
const
|
|
68596
|
-
const isInternal = state.internalHostnames.has(
|
|
68595
|
+
const parsedLink = new URL(link);
|
|
68596
|
+
const isInternal = state.internalHostnames.has(parsedLink.hostname);
|
|
68597
|
+
const isExcluded = config.exclude?.some((pattern) => parsedLink.pathname.startsWith(pattern)) ?? false;
|
|
68597
68598
|
graph.dir(url, link);
|
|
68598
68599
|
if (!meta.has(link)) {
|
|
68599
68600
|
meta.set(link, {
|
|
68600
68601
|
depth: depth + 1,
|
|
68601
|
-
status: isInternal ? "pending" : "skipped"
|
|
68602
|
+
status: isInternal && !isExcluded ? "pending" : "skipped"
|
|
68602
68603
|
});
|
|
68603
68604
|
}
|
|
68604
|
-
if (isInternal && !visited.has(link) && (config.maxDepth === undefined || depth + 1 <= config.maxDepth)) {
|
|
68605
|
+
if (isInternal && !isExcluded && !visited.has(link) && (config.maxDepth === undefined || depth + 1 <= config.maxDepth)) {
|
|
68605
68606
|
visited.add(link);
|
|
68606
68607
|
newItems.push({ url: link, depth: depth + 1 });
|
|
68607
68608
|
}
|
|
@@ -68669,7 +68670,7 @@ function parseStartUrl(raw) {
|
|
|
68669
68670
|
}
|
|
68670
68671
|
}
|
|
68671
68672
|
var program2 = new Command;
|
|
68672
|
-
program2.name("crawl").description("Crawl a website and convert pages to markdown").argument("<url>", "Starting URL to crawl").option("-o, --output <dir>", "Output directory", "./output").option("-c, --concurrency <n>", "Number of parallel fetches", "5").option("--max-depth <n>", "Maximum crawl depth (0 = start page only)").option("--max-pages <n>", "Maximum number of pages to crawl").option("--delay <ms>", "Delay in milliseconds between requests").action(async (url, opts) => {
|
|
68673
|
+
program2.name("crawl").description("Crawl a website and convert pages to markdown").argument("<url>", "Starting URL to crawl").option("-o, --output <dir>", "Output directory", "./output").option("-c, --concurrency <n>", "Number of parallel fetches", "5").option("--max-depth <n>", "Maximum crawl depth (0 = start page only)").option("--max-pages <n>", "Maximum number of pages to crawl").option("--delay <ms>", "Delay in milliseconds between requests").option("--exclude <path>", "Exclude a URL path prefix from crawling — can be repeated (e.g. --exclude /admin --exclude /login)", (val2, prev2) => [...prev2, val2], []).action(async (url, opts) => {
|
|
68673
68674
|
const startUrl = parseStartUrl(url);
|
|
68674
68675
|
const concurrency = parseInt(opts["concurrency"] ?? "5", 10);
|
|
68675
68676
|
if (isNaN(concurrency) || concurrency < 1) {
|
|
@@ -68679,6 +68680,8 @@ program2.name("crawl").description("Crawl a website and convert pages to markdow
|
|
|
68679
68680
|
const maxDepth = opts["maxDepth"] != null ? parseInt(opts["maxDepth"], 10) : undefined;
|
|
68680
68681
|
const maxPages = opts["maxPages"] != null ? parseInt(opts["maxPages"], 10) : undefined;
|
|
68681
68682
|
const delayMs = opts["delay"] != null ? parseInt(opts["delay"], 10) : undefined;
|
|
68683
|
+
const excludeRaw = opts["exclude"];
|
|
68684
|
+
const exclude = excludeRaw.length ? excludeRaw : undefined;
|
|
68682
68685
|
if (maxDepth !== undefined && isNaN(maxDepth)) {
|
|
68683
68686
|
console.error("Error: --max-depth must be an integer");
|
|
68684
68687
|
process.exit(1);
|
|
@@ -68697,7 +68700,8 @@ program2.name("crawl").description("Crawl a website and convert pages to markdow
|
|
|
68697
68700
|
concurrency,
|
|
68698
68701
|
maxDepth,
|
|
68699
68702
|
maxPages,
|
|
68700
|
-
delayMs
|
|
68703
|
+
delayMs,
|
|
68704
|
+
exclude
|
|
68701
68705
|
};
|
|
68702
68706
|
console.log(`Starting crawl of ${startUrl}`);
|
|
68703
68707
|
console.log(`Output: ${config.outputDir} | Concurrency: ${config.concurrency}${maxDepth !== undefined ? ` | Max depth: ${maxDepth}` : ""}${maxPages !== undefined ? ` | Max pages: ${maxPages}` : ""}
|