@mdream/crawl 1.0.0-beta.9 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +416 -56
- package/dist/_chunks/crawl.mjs +366 -277
- package/dist/_chunks/playwright-utils.mjs +59 -0
- package/dist/cli.mjs +79 -89
- package/dist/index.d.mts +40 -2
- package/dist/index.mjs +6 -1
- package/package.json +11 -4
package/dist/_chunks/crawl.mjs
CHANGED
|
@@ -1,12 +1,14 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { writeFile } from "node:fs/promises";
|
|
1
|
+
import { mkdirSync } from "node:fs";
|
|
2
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
3
3
|
import * as p from "@clack/prompts";
|
|
4
4
|
import { generateLlmsTxtArtifacts } from "@mdream/js/llms-txt";
|
|
5
|
-
import {
|
|
5
|
+
import { createHooks } from "hookable";
|
|
6
6
|
import { htmlToMarkdown } from "mdream";
|
|
7
|
+
import { ofetch } from "ofetch";
|
|
7
8
|
import { dirname, join, normalize, resolve } from "pathe";
|
|
8
9
|
import { withHttps } from "ufo";
|
|
9
10
|
import picomatch from "picomatch";
|
|
11
|
+
import { getDomain } from "tldts";
|
|
10
12
|
//#region src/glob-utils.ts
|
|
11
13
|
function stripGlobTail(s) {
|
|
12
14
|
const idx = s.indexOf("*");
|
|
@@ -14,6 +16,14 @@ function stripGlobTail(s) {
|
|
|
14
16
|
}
|
|
15
17
|
const GLOB_CHAR_RE = /[*?[]/;
|
|
16
18
|
/**
 * Resolve a hostname to its registrable domain via the public suffix list.
 *
 * Private suffixes are honoured (`allowPrivateDomains: true`), so multi-part
 * suffixes such as `.co.uk` or `.github.io` are treated as a single suffix.
 *
 * @param {string} hostname - Hostname to reduce (for example `docs.example.co.uk`).
 * @returns {string} The registrable domain, or `hostname` itself when the
 *   lookup yields nothing (IP addresses, unparseable input).
 */
function getRegistrableDomain(hostname) {
	const registrable = getDomain(hostname, { allowPrivateDomains: true });
	// A falsy lookup result means "no registrable domain"; fall back to the raw hostname.
	return registrable || hostname;
}
|
|
26
|
+
/**
|
|
17
27
|
* Parse a URL that may contain glob patterns
|
|
18
28
|
* Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
|
|
19
29
|
*/
|
|
@@ -40,12 +50,15 @@ function parseUrlPattern(input) {
|
|
|
40
50
|
/**
|
|
41
51
|
* Check if a URL matches a glob pattern
|
|
42
52
|
*/
|
|
43
|
-
function matchesGlobPattern(url, parsedPattern) {
|
|
53
|
+
function matchesGlobPattern(url, parsedPattern, allowSubdomains = false) {
|
|
44
54
|
if (!parsedPattern.isGlob) return true;
|
|
45
55
|
try {
|
|
46
56
|
const urlObj = new URL(url);
|
|
47
57
|
const urlPath = urlObj.pathname + urlObj.search + urlObj.hash;
|
|
48
|
-
if (
|
|
58
|
+
if (allowSubdomains) {
|
|
59
|
+
const patternUrl = new URL(parsedPattern.baseUrl);
|
|
60
|
+
if (getRegistrableDomain(urlObj.hostname) !== getRegistrableDomain(patternUrl.hostname)) return false;
|
|
61
|
+
} else if (`${urlObj.protocol}//${urlObj.host}` !== parsedPattern.baseUrl) return false;
|
|
49
62
|
let pattern = parsedPattern.pattern;
|
|
50
63
|
if (pattern.endsWith("*") && !pattern.endsWith("**") && !pattern.endsWith("/*")) {
|
|
51
64
|
const base = pattern.slice(0, -1);
|
|
@@ -73,7 +86,7 @@ function getStartingUrl(parsedPattern) {
|
|
|
73
86
|
/**
|
|
74
87
|
* Check if a URL should be excluded based on exclude patterns
|
|
75
88
|
*/
|
|
76
|
-
function isUrlExcluded(url, excludePatterns) {
|
|
89
|
+
function isUrlExcluded(url, excludePatterns, allowSubdomains = false) {
|
|
77
90
|
if (!excludePatterns || excludePatterns.length === 0) return false;
|
|
78
91
|
try {
|
|
79
92
|
const urlObj = new URL(url);
|
|
@@ -81,7 +94,7 @@ function isUrlExcluded(url, excludePatterns) {
|
|
|
81
94
|
return excludePatterns.some((pattern) => {
|
|
82
95
|
if (pattern.includes("://")) {
|
|
83
96
|
const parsedPattern = parseUrlPattern(pattern);
|
|
84
|
-
if (parsedPattern.isGlob) return matchesGlobPattern(url, parsedPattern);
|
|
97
|
+
if (parsedPattern.isGlob) return matchesGlobPattern(url, parsedPattern, allowSubdomains);
|
|
85
98
|
return url === pattern;
|
|
86
99
|
}
|
|
87
100
|
if (pattern.startsWith("/")) return picomatch(pattern.endsWith("/*") ? pattern.replace("/*", "/**") : pattern)(urlPath);
|
|
@@ -109,21 +122,75 @@ function validateGlobPattern(pattern) {
|
|
|
109
122
|
}
|
|
110
123
|
}
|
|
111
124
|
//#endregion
|
|
112
|
-
//#region src/
|
|
113
|
-
|
|
114
|
-
|
|
125
|
+
//#region src/crawl.ts
|
|
126
|
+
const SITEMAP_INDEX_LOC_RE = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
|
|
127
|
+
const SITEMAP_URL_LOC_RE = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
|
|
128
|
+
const ROBOTS_SITEMAP_RE = /Sitemap:\s*(.*)/gi;
|
|
129
|
+
const ROBOTS_SITEMAP_PREFIX_RE = /Sitemap:\s*/i;
|
|
130
|
+
const ROBOTS_CRAWL_DELAY_RE = /Crawl-delay:\s*(\d+(?:\.\d+)?)/i;
|
|
131
|
+
const URL_TRAILING_SLASH_RE = /\/$/;
|
|
132
|
+
const URL_PATH_UNSAFE_CHARS_RE = /[^\w\-]/g;
|
|
133
|
+
const FRONTMATTER_BLOCK_RE = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
|
|
134
|
+
const FETCH_HEADERS = {
|
|
135
|
+
"User-Agent": "mdream-crawler/1.0",
|
|
136
|
+
"Accept": "text/html,application/xhtml+xml,text/markdown"
|
|
137
|
+
};
|
|
138
|
+
const DEFAULT_CONCURRENCY = 20;
|
|
139
|
+
/** The five entities predefined by XML, keyed by entity name. */
const XML_PREDEFINED_ENTITIES = {
	amp: "&",
	lt: "<",
	gt: ">",
	quot: "\"",
	apos: "'"
};
const XML_PREDEFINED_ENTITY_RE = /&(amp|lt|gt|quot|apos);/g;
/**
 * Normalise the raw text captured from a sitemap `<loc>` element into a URL.
 *
 * - Surrounding whitespace is removed (pretty-printed sitemaps put the URL on
 *   its own indented line).
 * - A `<![CDATA[ ... ]]>` wrapper is stripped; its content is literal and is
 *   therefore NOT entity-decoded.
 * - Outside CDATA, the predefined XML entities (`&amp;`, `&lt;`, `&gt;`,
 *   `&quot;`, `&apos;`) are decoded, since sitemap values are entity-escaped.
 *
 * @param {string} url - Raw `<loc>` text content.
 * @returns {string} The cleaned URL string.
 */
function extractCdataUrl(url) {
	const trimmed = url.trim();
	if (trimmed.startsWith("<![CDATA[") && trimmed.endsWith("]]>")) return trimmed.slice(9, -3).trim();
	// Single pass so that "&amp;lt;" decodes to "&lt;" rather than being decoded twice.
	return trimmed.replace(XML_PREDEFINED_ENTITY_RE, (_, name) => XML_PREDEFINED_ENTITIES[name]);
}
|
|
143
|
+
/**
 * Fetch a sitemap and return the `<loc>` URLs it lists.
 *
 * When the document is a sitemap index (`<sitemapindex`), every child sitemap
 * is loaded concurrently and the results are concatenated; child sitemaps that
 * fail to load are skipped (best effort).
 *
 * @param {string} sitemapUrl - Absolute URL of the sitemap to fetch.
 * @param {Set<string>} [visited] - Sitemap URLs already requested in this
 *   traversal. Internal; callers normally omit it.
 * @returns {Promise<string[]>} URLs found in the sitemap (and its children).
 * @throws {Error} If the request fails or the response is not sitemap XML.
 */
async function loadSitemap(sitemapUrl, visited = /* @__PURE__ */ new Set()) {
	// Guard against indexes that reference themselves, directly or transitively,
	// which would otherwise recurse and fetch forever.
	if (visited.has(sitemapUrl)) return [];
	visited.add(sitemapUrl);
	const xmlContent = await ofetch(sitemapUrl, {
		headers: FETCH_HEADERS,
		timeout: 1e4,
		responseType: "text",
		retry: 0
	});
	if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
	if (xmlContent.includes("<sitemapindex")) {
		// matchAll works on a copy of the regex but starts at its lastIndex, so keep the reset.
		SITEMAP_INDEX_LOC_RE.lastIndex = 0;
		const childSitemaps = Array.from(xmlContent.matchAll(SITEMAP_INDEX_LOC_RE), (match) => extractCdataUrl(match[1]));
		const childResults = await Promise.allSettled(childSitemaps.map((url) => loadSitemap(url, visited)));
		// flatMap instead of push(...values): large sitemaps must not be spread as call arguments.
		return childResults.flatMap((result) => result.status === "fulfilled" ? result.value : []);
	}
	SITEMAP_URL_LOC_RE.lastIndex = 0;
	return Array.from(xmlContent.matchAll(SITEMAP_URL_LOC_RE), (match) => extractCdataUrl(match[1]));
}
|
|
175
|
+
function extractMetadataInline(parsedUrl, allowedDomains) {
|
|
176
|
+
const links = /* @__PURE__ */ new Set();
|
|
115
177
|
let title = "";
|
|
116
178
|
let description = "";
|
|
117
179
|
let keywords = "";
|
|
118
180
|
let author = "";
|
|
119
|
-
|
|
120
|
-
|
|
181
|
+
const url = parsedUrl.href;
|
|
182
|
+
const originPrefix = `${parsedUrl.origin}/`;
|
|
183
|
+
return {
|
|
121
184
|
extraction: {
|
|
122
185
|
"a[href]": (el) => {
|
|
123
186
|
const href = el.attributes.href;
|
|
124
187
|
if (href) try {
|
|
125
|
-
const
|
|
126
|
-
|
|
188
|
+
const resolved = new URL(href, url);
|
|
189
|
+
const absoluteUrl = resolved.href;
|
|
190
|
+
if (allowedDomains) {
|
|
191
|
+
const domain = getRegistrableDomain(resolved.hostname);
|
|
192
|
+
if (domain && allowedDomains.has(domain)) links.add(absoluteUrl);
|
|
193
|
+
} else if (absoluteUrl.startsWith(originPrefix) || absoluteUrl === parsedUrl.origin) links.add(absoluteUrl);
|
|
127
194
|
} catch {}
|
|
128
195
|
},
|
|
129
196
|
"title": (el) => {
|
|
@@ -144,88 +211,35 @@ function extractMetadata(html, url) {
|
|
|
144
211
|
"meta[property=\"og:title\"]": (el) => {
|
|
145
212
|
if (!title) title = el.attributes.content || "";
|
|
146
213
|
}
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
links: links.filter((link) => {
|
|
155
|
-
try {
|
|
156
|
-
const linkUrl = new URL(link);
|
|
157
|
-
const baseUrl = new URL(url);
|
|
158
|
-
return linkUrl.hostname === baseUrl.hostname;
|
|
159
|
-
} catch {
|
|
160
|
-
return false;
|
|
161
|
-
}
|
|
214
|
+
},
|
|
215
|
+
getMetadata: () => ({
|
|
216
|
+
title: title.trim() || parsedUrl.pathname,
|
|
217
|
+
description: description.trim() || void 0,
|
|
218
|
+
keywords: keywords.trim() || void 0,
|
|
219
|
+
author: author.trim() || void 0,
|
|
220
|
+
links: [...links]
|
|
162
221
|
})
|
|
163
222
|
};
|
|
164
223
|
}
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
const
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
const controller = new AbortController();
|
|
176
|
-
const timeoutId = setTimeout(() => controller.abort(), 1e4);
|
|
177
|
-
try {
|
|
178
|
-
const response = await fetch(sitemapUrl, {
|
|
179
|
-
signal: controller.signal,
|
|
180
|
-
headers: { "User-Agent": "mdream-crawler/1.0" }
|
|
181
|
-
});
|
|
182
|
-
clearTimeout(timeoutId);
|
|
183
|
-
if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
|
|
184
|
-
const xmlContent = await response.text();
|
|
185
|
-
if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
|
|
186
|
-
if (xmlContent.includes("<sitemapindex")) {
|
|
187
|
-
SITEMAP_INDEX_LOC_RE.lastIndex = 0;
|
|
188
|
-
const childSitemaps = [];
|
|
189
|
-
let match;
|
|
190
|
-
while (true) {
|
|
191
|
-
match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
|
|
192
|
-
if (match === null) break;
|
|
193
|
-
let url = match[1];
|
|
194
|
-
if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
|
|
195
|
-
childSitemaps.push(url);
|
|
196
|
-
}
|
|
197
|
-
const allUrls = [];
|
|
198
|
-
for (const childSitemapUrl of childSitemaps) try {
|
|
199
|
-
const childUrls = await loadSitemapWithoutRetries(childSitemapUrl);
|
|
200
|
-
allUrls.push(...childUrls);
|
|
201
|
-
} catch (error) {
|
|
202
|
-
console.warn(`Failed to load child sitemap ${childSitemapUrl}:`, error instanceof Error ? error.message : "Unknown error");
|
|
203
|
-
}
|
|
204
|
-
return allUrls;
|
|
205
|
-
} else {
|
|
206
|
-
const urls = [];
|
|
207
|
-
SITEMAP_URL_LOC_RE.lastIndex = 0;
|
|
208
|
-
let match;
|
|
209
|
-
while (true) {
|
|
210
|
-
match = SITEMAP_URL_LOC_RE.exec(xmlContent);
|
|
211
|
-
if (match === null) break;
|
|
212
|
-
let url = match[1];
|
|
213
|
-
if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
|
|
214
|
-
urls.push(url);
|
|
215
|
-
}
|
|
216
|
-
return urls;
|
|
217
|
-
}
|
|
218
|
-
} catch (error) {
|
|
219
|
-
clearTimeout(timeoutId);
|
|
220
|
-
if (error instanceof Error && error.name === "AbortError") throw new Error("Sitemap request timed out after 10 seconds");
|
|
221
|
-
throw error;
|
|
222
|
-
}
|
|
224
|
+
/**
 * Narrow a list of sitemap URLs down to the ones that should be crawled.
 *
 * A URL is kept when it is not excluded and, if any glob patterns are in
 * play, it matches at least one of them.
 *
 * @param {string[]} sitemapUrls - URLs read from a sitemap.
 * @param {boolean} hasGlobPatterns - Whether glob matching should be applied.
 * @param {string[]} exclude - Exclude patterns passed to `isUrlExcluded`.
 * @param {object[]} allPatterns - Parsed URL patterns passed to `matchesGlobPattern`.
 * @param {boolean} [allowSubdomains=false] - Forwarded to both matchers.
 * @returns {string[]} The URLs that passed the filter, in their original order.
 */
function filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, allPatterns, allowSubdomains = false) {
	return sitemapUrls.filter((candidate) => {
		if (isUrlExcluded(candidate, exclude, allowSubdomains)) return false;
		if (!hasGlobPatterns) return true;
		return allPatterns.some((pattern) => matchesGlobPattern(candidate, pattern, allowSubdomains));
	});
}
|
|
228
|
+
/**
 * Run `fn` over every item with at most `concurrency` calls in flight.
 *
 * Workers pull from a shared cursor, so items are started in order but may
 * finish out of order. The returned promise rejects as soon as any call to
 * `fn` rejects.
 *
 * @param {Array} items - Items to process.
 * @param {number} concurrency - Maximum number of simultaneous `fn` calls.
 *   Values that are not a positive number (0, negative, NaN) fall back to 1
 *   instead of silently processing nothing.
 * @param {(item: any) => Promise<void>} fn - Async handler invoked once per item.
 * @returns {Promise<void>} Resolves when every item has been handled.
 */
async function runConcurrent(items, concurrency, fn) {
	// Without this clamp, a limit of 0/negative/NaN would spawn zero workers.
	const limit = Math.max(1, Math.floor(Number(concurrency)) || 1);
	let cursor = 0;
	const workers = Array.from({ length: Math.min(limit, items.length) }, async () => {
		// Index read and increment happen synchronously, so workers never share an item.
		while (cursor < items.length) await fn(items[cursor++]);
	});
	await Promise.all(workers);
}
|
|
224
235
|
async function crawlAndGenerate(options, onProgress) {
|
|
225
|
-
const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, onPage } = options;
|
|
236
|
+
const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay: userCrawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, allowSubdomains = false, hooks: hooksConfig, onPage } = options;
|
|
237
|
+
const hooks = createHooks();
|
|
238
|
+
if (hooksConfig) hooks.addHooks(hooksConfig);
|
|
239
|
+
if (onPage) hooks.hook("crawl:page", onPage);
|
|
240
|
+
const singlePageMode = maxDepth === 0;
|
|
226
241
|
const outputDir = resolve(normalize(rawOutputDir));
|
|
227
|
-
|
|
228
|
-
else log.setLevel(log.LEVELS.OFF);
|
|
242
|
+
let crawlDelay = userCrawlDelay;
|
|
229
243
|
let patterns;
|
|
230
244
|
try {
|
|
231
245
|
patterns = globPatterns.length > 0 ? globPatterns : urls.map(parseUrlPattern);
|
|
@@ -233,6 +247,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
233
247
|
throw new Error(`Invalid URL pattern: ${error instanceof Error ? error.message : "Unknown error"}`);
|
|
234
248
|
}
|
|
235
249
|
let startingUrls = patterns.map(getStartingUrl);
|
|
250
|
+
const hasGlobPatterns = patterns.some((p) => p.isGlob);
|
|
236
251
|
const progress = {
|
|
237
252
|
sitemap: {
|
|
238
253
|
status: "discovering",
|
|
@@ -242,60 +257,62 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
242
257
|
crawling: {
|
|
243
258
|
status: "starting",
|
|
244
259
|
total: 0,
|
|
245
|
-
processed: 0
|
|
260
|
+
processed: 0,
|
|
261
|
+
failed: 0,
|
|
262
|
+
latency: {
|
|
263
|
+
total: 0,
|
|
264
|
+
min: Infinity,
|
|
265
|
+
max: 0,
|
|
266
|
+
count: 0
|
|
267
|
+
}
|
|
246
268
|
},
|
|
247
269
|
generation: { status: "idle" }
|
|
248
270
|
};
|
|
249
271
|
const sitemapAttempts = [];
|
|
250
|
-
if (startingUrls.length > 0 && !skipSitemap) {
|
|
272
|
+
if (startingUrls.length > 0 && !skipSitemap && !singlePageMode) {
|
|
251
273
|
const baseUrl = new URL(startingUrls[0]).origin;
|
|
252
274
|
const homePageUrl = baseUrl;
|
|
253
275
|
onProgress?.(progress);
|
|
254
|
-
|
|
255
|
-
const robotsController = new AbortController();
|
|
256
|
-
const robotsTimeoutId = setTimeout(() => robotsController.abort(), 1e4);
|
|
257
|
-
let robotsResponse;
|
|
276
|
+
let robotsContent = null;
|
|
258
277
|
try {
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
278
|
+
robotsContent = await ofetch(`${baseUrl}/robots.txt`, {
|
|
279
|
+
headers: FETCH_HEADERS,
|
|
280
|
+
timeout: 1e4,
|
|
281
|
+
responseType: "text",
|
|
282
|
+
retry: 0
|
|
262
283
|
});
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
284
|
+
} catch {}
|
|
285
|
+
if (robotsContent && !crawlDelay) {
|
|
286
|
+
const crawlDelayMatch = robotsContent.match(ROBOTS_CRAWL_DELAY_RE);
|
|
287
|
+
if (crawlDelayMatch) {
|
|
288
|
+
crawlDelay = Number.parseFloat(crawlDelayMatch[1]);
|
|
289
|
+
p.log.info(`[ROBOTS] Crawl-delay: ${crawlDelay}s`);
|
|
290
|
+
}
|
|
267
291
|
}
|
|
268
|
-
if (
|
|
269
|
-
const sitemapMatches =
|
|
292
|
+
if (robotsContent) {
|
|
293
|
+
const sitemapMatches = robotsContent.match(ROBOTS_SITEMAP_RE);
|
|
270
294
|
if (sitemapMatches && sitemapMatches.length > 0) {
|
|
271
295
|
progress.sitemap.found = sitemapMatches.length;
|
|
272
296
|
progress.sitemap.status = "processing";
|
|
273
297
|
onProgress?.(progress);
|
|
274
298
|
const robotsSitemaps = sitemapMatches.map((match) => match.replace(ROBOTS_SITEMAP_PREFIX_RE, "").trim());
|
|
275
299
|
for (const sitemapUrl of robotsSitemaps) try {
|
|
276
|
-
const robotsUrls = await
|
|
300
|
+
const robotsUrls = await loadSitemap(sitemapUrl);
|
|
277
301
|
sitemapAttempts.push({
|
|
278
302
|
url: sitemapUrl,
|
|
279
303
|
success: true
|
|
280
304
|
});
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
305
|
+
const filteredUrls = filterSitemapUrls(robotsUrls, hasGlobPatterns, exclude, patterns, allowSubdomains);
|
|
306
|
+
if (hasGlobPatterns) {
|
|
307
|
+
startingUrls = filteredUrls;
|
|
308
|
+
progress.sitemap.processed = filteredUrls.length;
|
|
309
|
+
onProgress?.(progress);
|
|
310
|
+
break;
|
|
311
|
+
} else if (filteredUrls.length > 0) {
|
|
285
312
|
startingUrls = filteredUrls;
|
|
286
313
|
progress.sitemap.processed = filteredUrls.length;
|
|
287
314
|
onProgress?.(progress);
|
|
288
315
|
break;
|
|
289
|
-
} else {
|
|
290
|
-
const filteredUrls = robotsUrls.filter((url) => {
|
|
291
|
-
return !isUrlExcluded(url, exclude);
|
|
292
|
-
});
|
|
293
|
-
if (filteredUrls.length > 0) {
|
|
294
|
-
startingUrls = filteredUrls;
|
|
295
|
-
progress.sitemap.processed = filteredUrls.length;
|
|
296
|
-
onProgress?.(progress);
|
|
297
|
-
break;
|
|
298
|
-
}
|
|
299
316
|
}
|
|
300
317
|
} catch (error) {
|
|
301
318
|
sitemapAttempts.push({
|
|
@@ -309,31 +326,24 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
309
326
|
let mainSitemapProcessed = false;
|
|
310
327
|
const mainSitemapUrl = `${baseUrl}/sitemap.xml`;
|
|
311
328
|
try {
|
|
312
|
-
const sitemapUrls = await
|
|
329
|
+
const sitemapUrls = await loadSitemap(mainSitemapUrl);
|
|
313
330
|
sitemapAttempts.push({
|
|
314
331
|
url: mainSitemapUrl,
|
|
315
332
|
success: true
|
|
316
333
|
});
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
334
|
+
const filteredUrls = filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, patterns, allowSubdomains);
|
|
335
|
+
if (hasGlobPatterns) {
|
|
336
|
+
startingUrls = filteredUrls;
|
|
337
|
+
progress.sitemap.found = sitemapUrls.length;
|
|
338
|
+
progress.sitemap.processed = filteredUrls.length;
|
|
339
|
+
onProgress?.(progress);
|
|
340
|
+
mainSitemapProcessed = true;
|
|
341
|
+
} else if (filteredUrls.length > 0) {
|
|
321
342
|
startingUrls = filteredUrls;
|
|
322
343
|
progress.sitemap.found = sitemapUrls.length;
|
|
323
344
|
progress.sitemap.processed = filteredUrls.length;
|
|
324
345
|
onProgress?.(progress);
|
|
325
346
|
mainSitemapProcessed = true;
|
|
326
|
-
} else {
|
|
327
|
-
const filteredUrls = sitemapUrls.filter((url) => {
|
|
328
|
-
return !isUrlExcluded(url, exclude);
|
|
329
|
-
});
|
|
330
|
-
if (filteredUrls.length > 0) {
|
|
331
|
-
startingUrls = filteredUrls;
|
|
332
|
-
progress.sitemap.found = sitemapUrls.length;
|
|
333
|
-
progress.sitemap.processed = filteredUrls.length;
|
|
334
|
-
onProgress?.(progress);
|
|
335
|
-
mainSitemapProcessed = true;
|
|
336
|
-
}
|
|
337
347
|
}
|
|
338
348
|
} catch (error) {
|
|
339
349
|
sitemapAttempts.push({
|
|
@@ -348,31 +358,24 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
348
358
|
`${baseUrl}/sitemap-index.xml`
|
|
349
359
|
];
|
|
350
360
|
for (const sitemapUrl of commonSitemaps) try {
|
|
351
|
-
const altUrls = await
|
|
361
|
+
const altUrls = await loadSitemap(sitemapUrl);
|
|
352
362
|
sitemapAttempts.push({
|
|
353
363
|
url: sitemapUrl,
|
|
354
364
|
success: true
|
|
355
365
|
});
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
366
|
+
const filteredUrls = filterSitemapUrls(altUrls, hasGlobPatterns, exclude, patterns, allowSubdomains);
|
|
367
|
+
if (hasGlobPatterns) {
|
|
368
|
+
startingUrls = filteredUrls;
|
|
369
|
+
progress.sitemap.found = altUrls.length;
|
|
370
|
+
progress.sitemap.processed = filteredUrls.length;
|
|
371
|
+
onProgress?.(progress);
|
|
372
|
+
break;
|
|
373
|
+
} else if (filteredUrls.length > 0) {
|
|
360
374
|
startingUrls = filteredUrls;
|
|
361
375
|
progress.sitemap.found = altUrls.length;
|
|
362
376
|
progress.sitemap.processed = filteredUrls.length;
|
|
363
377
|
onProgress?.(progress);
|
|
364
378
|
break;
|
|
365
|
-
} else {
|
|
366
|
-
const filteredUrls = altUrls.filter((url) => {
|
|
367
|
-
return !isUrlExcluded(url, exclude);
|
|
368
|
-
});
|
|
369
|
-
if (filteredUrls.length > 0) {
|
|
370
|
-
startingUrls = filteredUrls;
|
|
371
|
-
progress.sitemap.found = altUrls.length;
|
|
372
|
-
progress.sitemap.processed = filteredUrls.length;
|
|
373
|
-
onProgress?.(progress);
|
|
374
|
-
break;
|
|
375
|
-
}
|
|
376
379
|
}
|
|
377
380
|
} catch (error) {
|
|
378
381
|
sitemapAttempts.push({
|
|
@@ -398,169 +401,256 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
398
401
|
progress.sitemap.status = "completed";
|
|
399
402
|
progress.crawling.total = startingUrls.length;
|
|
400
403
|
onProgress?.(progress);
|
|
401
|
-
} else if (skipSitemap && startingUrls.length > 0) {
|
|
404
|
+
} else if ((skipSitemap || singlePageMode) && startingUrls.length > 0) {
|
|
402
405
|
progress.sitemap.status = "completed";
|
|
403
406
|
progress.sitemap.found = 0;
|
|
404
407
|
progress.sitemap.processed = 0;
|
|
405
408
|
progress.crawling.total = startingUrls.length;
|
|
406
409
|
onProgress?.(progress);
|
|
407
410
|
}
|
|
408
|
-
|
|
411
|
+
mkdirSync(outputDir, { recursive: true });
|
|
409
412
|
const results = [];
|
|
410
413
|
const processedUrls = /* @__PURE__ */ new Set();
|
|
414
|
+
const allowedRegistrableDomains = allowSubdomains ? new Set(startingUrls.map((u) => {
|
|
415
|
+
try {
|
|
416
|
+
return getRegistrableDomain(new URL(u).hostname);
|
|
417
|
+
} catch {
|
|
418
|
+
return "";
|
|
419
|
+
}
|
|
420
|
+
}).filter(Boolean)) : void 0;
|
|
411
421
|
const shouldCrawlUrl = (url) => {
|
|
412
|
-
if (isUrlExcluded(url, exclude)) return false;
|
|
413
|
-
if (!
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
const startTime = Date.now();
|
|
419
|
-
progress.crawling.currentUrl = request.loadedUrl;
|
|
420
|
-
onProgress?.(progress);
|
|
421
|
-
if (response?.statusCode && (response.statusCode < 200 || response.statusCode >= 300)) return;
|
|
422
|
-
const homePageUrl = new URL(startingUrls[0]).origin;
|
|
423
|
-
let html;
|
|
424
|
-
let title;
|
|
425
|
-
if (crawlerType === "playwright") {
|
|
426
|
-
await page.waitForLoadState("networkidle");
|
|
427
|
-
title = await page.title();
|
|
428
|
-
html = await page.innerHTML("html");
|
|
429
|
-
} else {
|
|
430
|
-
html = typeof body === "string" ? body : body.toString();
|
|
431
|
-
title = "";
|
|
422
|
+
if (isUrlExcluded(url, exclude, allowSubdomains)) return false;
|
|
423
|
+
if (!hasGlobPatterns) {
|
|
424
|
+
if (allowedRegistrableDomains) try {
|
|
425
|
+
return allowedRegistrableDomains.has(getRegistrableDomain(new URL(url).hostname));
|
|
426
|
+
} catch {
|
|
427
|
+
return false;
|
|
432
428
|
}
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
429
|
+
return true;
|
|
430
|
+
}
|
|
431
|
+
return patterns.some((pattern) => matchesGlobPattern(url, pattern, allowSubdomains));
|
|
432
|
+
};
|
|
433
|
+
const recordLatency = (ms) => {
|
|
434
|
+
const lat = progress.crawling.latency;
|
|
435
|
+
lat.total += ms;
|
|
436
|
+
lat.count++;
|
|
437
|
+
if (ms < lat.min) lat.min = ms;
|
|
438
|
+
if (ms > lat.max) lat.max = ms;
|
|
439
|
+
};
|
|
440
|
+
const normalizedHomePageUrl = (startingUrls.length > 0 ? new URL(startingUrls[0]).origin : "").replace(URL_TRAILING_SLASH_RE, "");
|
|
441
|
+
const createdDirs = /* @__PURE__ */ new Set();
|
|
442
|
+
const sharedOrigin = origin || "";
|
|
443
|
+
const processPage = async (url, content, initialTitle, depth, isMarkdown = false) => {
|
|
444
|
+
const parsedUrl = new URL(url);
|
|
445
|
+
const shouldProcessMarkdown = shouldCrawlUrl(url);
|
|
446
|
+
const pageOrigin = sharedOrigin || parsedUrl.origin;
|
|
447
|
+
let md;
|
|
448
|
+
let metadata;
|
|
449
|
+
if (isMarkdown) {
|
|
450
|
+
md = content;
|
|
451
|
+
metadata = {
|
|
452
|
+
title: initialTitle || parsedUrl.pathname,
|
|
453
|
+
links: []
|
|
454
|
+
};
|
|
455
|
+
} else {
|
|
456
|
+
const { extraction, getMetadata } = extractMetadataInline(parsedUrl, allowedRegistrableDomains);
|
|
457
|
+
md = htmlToMarkdown(content, {
|
|
458
|
+
origin: pageOrigin,
|
|
459
|
+
extraction
|
|
460
|
+
});
|
|
461
|
+
metadata = getMetadata();
|
|
462
|
+
}
|
|
463
|
+
let title = initialTitle || metadata.title;
|
|
464
|
+
if (shouldProcessMarkdown) {
|
|
465
|
+
const pageData = {
|
|
466
|
+
url,
|
|
467
|
+
html: isMarkdown ? "" : content,
|
|
440
468
|
title,
|
|
441
469
|
metadata,
|
|
442
470
|
origin: pageOrigin
|
|
443
|
-
}
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
results.push(result);
|
|
468
|
-
progress.crawling.processed = results.length;
|
|
469
|
-
onProgress?.(progress);
|
|
470
|
-
}
|
|
471
|
-
if (followLinks && (request.userData?.depth || 0) < maxDepth) {
|
|
472
|
-
const currentDepth = (request.userData?.depth || 0) + 1;
|
|
473
|
-
const filteredLinks = metadata.links.filter((link) => {
|
|
474
|
-
return shouldCrawlUrl(link);
|
|
475
|
-
});
|
|
476
|
-
if (enqueueLinks) await enqueueLinks({
|
|
477
|
-
urls: filteredLinks,
|
|
478
|
-
userData: { depth: currentDepth }
|
|
479
|
-
});
|
|
480
|
-
else for (const link of filteredLinks) if (!processedUrls.has(link)) processedUrls.add(link);
|
|
471
|
+
};
|
|
472
|
+
await hooks.callHook("crawl:page", pageData);
|
|
473
|
+
title = pageData.title;
|
|
474
|
+
}
|
|
475
|
+
let filePath;
|
|
476
|
+
if (shouldProcessMarkdown && generateIndividualMd) {
|
|
477
|
+
const urlPath = parsedUrl.pathname === "/" ? "/index" : parsedUrl.pathname;
|
|
478
|
+
const hostPrefix = allowSubdomains ? [parsedUrl.hostname.replace(URL_PATH_UNSAFE_CHARS_RE, "-")] : [];
|
|
479
|
+
const pathSegments = urlPath.replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0);
|
|
480
|
+
const safeSegments = [...hostPrefix, ...pathSegments.map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"))];
|
|
481
|
+
filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
|
|
482
|
+
const contentCtx = {
|
|
483
|
+
url,
|
|
484
|
+
title,
|
|
485
|
+
content: md,
|
|
486
|
+
filePath
|
|
487
|
+
};
|
|
488
|
+
await hooks.callHook("crawl:content", contentCtx);
|
|
489
|
+
md = contentCtx.content;
|
|
490
|
+
filePath = contentCtx.filePath;
|
|
491
|
+
const fileDir = dirname(filePath);
|
|
492
|
+
if (fileDir && !createdDirs.has(fileDir)) {
|
|
493
|
+
await mkdir(fileDir, { recursive: true });
|
|
494
|
+
createdDirs.add(fileDir);
|
|
481
495
|
}
|
|
482
|
-
|
|
496
|
+
await writeFile(filePath, md, "utf-8");
|
|
497
|
+
}
|
|
498
|
+
const isHomePage = parsedUrl.pathname === "/" && parsedUrl.origin === normalizedHomePageUrl;
|
|
499
|
+
if (shouldProcessMarkdown || isHomePage) {
|
|
500
|
+
const result = {
|
|
501
|
+
url,
|
|
502
|
+
title,
|
|
503
|
+
content: md,
|
|
504
|
+
filePath: shouldProcessMarkdown ? filePath : void 0,
|
|
505
|
+
timestamp: Date.now(),
|
|
506
|
+
success: true,
|
|
507
|
+
metadata,
|
|
508
|
+
depth
|
|
509
|
+
};
|
|
510
|
+
results.push(result);
|
|
511
|
+
progress.crawling.processed = results.length;
|
|
512
|
+
onProgress?.(progress);
|
|
513
|
+
}
|
|
514
|
+
if (followLinks && !singlePageMode && depth < maxDepth) {
|
|
515
|
+
const filteredLinks = metadata.links.filter((link) => shouldCrawlUrl(link));
|
|
516
|
+
for (const link of filteredLinks) processedUrls.add(link);
|
|
517
|
+
}
|
|
483
518
|
};
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
title: "",
|
|
500
|
-
description: "",
|
|
501
|
-
links: []
|
|
502
|
-
},
|
|
503
|
-
depth: request.userData?.depth || 0
|
|
519
|
+
const urlsToProcess = startingUrls.slice(0, maxRequestsPerCrawl);
|
|
520
|
+
progress.crawling.status = "processing";
|
|
521
|
+
progress.crawling.total = urlsToProcess.length;
|
|
522
|
+
onProgress?.(progress);
|
|
523
|
+
if (driver === "playwright") {
|
|
524
|
+
const { log, PlaywrightCrawler, purgeDefaultStorages } = await import("crawlee");
|
|
525
|
+
if (verbose) log.setLevel(log.LEVELS.INFO);
|
|
526
|
+
else log.setLevel(log.LEVELS.OFF);
|
|
527
|
+
const crawlerOptions = {
|
|
528
|
+
requestHandler: async ({ request, page }) => {
|
|
529
|
+
progress.crawling.currentUrl = request.loadedUrl;
|
|
530
|
+
onProgress?.(progress);
|
|
531
|
+
const urlCtx = {
|
|
532
|
+
url: request.loadedUrl,
|
|
533
|
+
skip: false
|
|
504
534
|
};
|
|
505
|
-
|
|
506
|
-
|
|
535
|
+
await hooks.callHook("crawl:url", urlCtx);
|
|
536
|
+
if (urlCtx.skip) return;
|
|
537
|
+
const fetchStart = Date.now();
|
|
538
|
+
await page.waitForLoadState("networkidle");
|
|
539
|
+
const title = await page.title();
|
|
540
|
+
const html = await page.innerHTML("html");
|
|
541
|
+
recordLatency(Date.now() - fetchStart);
|
|
542
|
+
await processPage(request.loadedUrl, html, title, request.userData?.depth || 0);
|
|
543
|
+
},
|
|
544
|
+
errorHandler: async ({ request, response, error }) => {
|
|
545
|
+
if (verbose) console.error(`[ERROR] URL: ${request.url}, Status: ${response?.statusCode || "N/A"}, Error: ${error?.message || "Unknown"}`);
|
|
507
546
|
request.noRetry = true;
|
|
508
|
-
|
|
547
|
+
progress.crawling.failed++;
|
|
548
|
+
results.push({
|
|
509
549
|
url: request.url,
|
|
510
550
|
title: "",
|
|
511
551
|
content: "",
|
|
512
552
|
timestamp: Date.now(),
|
|
513
553
|
success: false,
|
|
514
|
-
error: error
|
|
554
|
+
error: response?.statusCode ? `HTTP ${response.statusCode}` : error?.message || "Unknown error",
|
|
515
555
|
metadata: {
|
|
516
556
|
title: "",
|
|
517
557
|
description: "",
|
|
518
558
|
links: []
|
|
519
559
|
},
|
|
520
560
|
depth: request.userData?.depth || 0
|
|
521
|
-
};
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
if (driver === "playwright") {
|
|
530
|
-
const playwrightOptions = crawlerOptions;
|
|
531
|
-
if (useChrome) playwrightOptions.launchContext = {
|
|
532
|
-
...playwrightOptions.launchContext,
|
|
561
|
+
});
|
|
562
|
+
},
|
|
563
|
+
maxRequestsPerCrawl,
|
|
564
|
+
respectRobotsTxtFile: false
|
|
565
|
+
};
|
|
566
|
+
if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
|
|
567
|
+
if (useChrome) crawlerOptions.launchContext = {
|
|
568
|
+
...crawlerOptions.launchContext,
|
|
533
569
|
useChrome
|
|
534
570
|
};
|
|
535
|
-
crawler = new PlaywrightCrawler(
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
571
|
+
const crawler = new PlaywrightCrawler(crawlerOptions);
|
|
572
|
+
const initialRequests = urlsToProcess.map((url) => ({
|
|
573
|
+
url,
|
|
574
|
+
userData: { depth: 0 }
|
|
575
|
+
}));
|
|
576
|
+
try {
|
|
577
|
+
await crawler.run(initialRequests);
|
|
578
|
+
} catch (error) {
|
|
579
|
+
const msg = error instanceof Error ? error.message : "";
|
|
580
|
+
if (msg.includes("wmic") || msg.includes("ENOENT")) throw new Error(`Crawlee failed to spawn a system process (${msg}). On Windows 11+, wmic.exe is no longer available. Upgrade crawlee to >=3.16.0 or use the HTTP driver instead (--driver http).`);
|
|
581
|
+
if (verbose) {
|
|
582
|
+
console.error(`[CRAWLER ERROR] ${msg || "Unknown error"}`);
|
|
583
|
+
console.error(`[CRAWLER ERROR] Stack trace:`, error instanceof Error ? error.stack : "No stack trace");
|
|
584
|
+
}
|
|
585
|
+
throw error;
|
|
550
586
|
}
|
|
551
|
-
|
|
552
|
-
}
|
|
587
|
+
await purgeDefaultStorages();
|
|
588
|
+
} else await runConcurrent(urlsToProcess, DEFAULT_CONCURRENCY, async (url) => {
|
|
589
|
+
progress.crawling.currentUrl = url;
|
|
590
|
+
onProgress?.(progress);
|
|
591
|
+
if (crawlDelay) {
|
|
592
|
+
const delay = crawlDelay;
|
|
593
|
+
await new Promise((resolve) => setTimeout(resolve, delay * 1e3));
|
|
594
|
+
}
|
|
595
|
+
const urlCtx = {
|
|
596
|
+
url,
|
|
597
|
+
skip: false
|
|
598
|
+
};
|
|
599
|
+
await hooks.callHook("crawl:url", urlCtx);
|
|
600
|
+
if (urlCtx.skip) return;
|
|
601
|
+
try {
|
|
602
|
+
const fetchStart = Date.now();
|
|
603
|
+
const response = await ofetch.raw(url, {
|
|
604
|
+
headers: FETCH_HEADERS,
|
|
605
|
+
responseType: "text",
|
|
606
|
+
retry: 2,
|
|
607
|
+
retryDelay: 500,
|
|
608
|
+
timeout: 1e4,
|
|
609
|
+
onResponseError({ response }) {
|
|
610
|
+
if (response.status === 429) {
|
|
611
|
+
const retryAfter = response.headers.get("retry-after");
|
|
612
|
+
const delaySec = retryAfter ? Number.parseInt(retryAfter) || 1 : 2;
|
|
613
|
+
if (!crawlDelay || delaySec > crawlDelay) crawlDelay = delaySec;
|
|
614
|
+
}
|
|
615
|
+
}
|
|
616
|
+
});
|
|
617
|
+
recordLatency(Date.now() - fetchStart);
|
|
618
|
+
const body = response._data ?? "";
|
|
619
|
+
const contentType = response.headers.get("content-type") || "";
|
|
620
|
+
await processPage(url, body, "", 0, contentType.includes("text/markdown") || contentType.includes("text/x-markdown"));
|
|
621
|
+
} catch (error) {
|
|
622
|
+
if (verbose) console.error(`[ERROR] URL: ${url}, Error: ${error instanceof Error ? error.message : "Unknown"}`);
|
|
623
|
+
progress.crawling.failed++;
|
|
624
|
+
results.push({
|
|
625
|
+
url,
|
|
626
|
+
title: "",
|
|
627
|
+
content: "",
|
|
628
|
+
timestamp: Date.now(),
|
|
629
|
+
success: false,
|
|
630
|
+
error: error instanceof Error ? error.message : "Unknown error",
|
|
631
|
+
metadata: {
|
|
632
|
+
title: "",
|
|
633
|
+
description: "",
|
|
634
|
+
links: []
|
|
635
|
+
},
|
|
636
|
+
depth: 0
|
|
637
|
+
});
|
|
638
|
+
progress.crawling.processed = results.length;
|
|
639
|
+
onProgress?.(progress);
|
|
640
|
+
}
|
|
641
|
+
});
|
|
553
642
|
progress.crawling.status = "completed";
|
|
554
643
|
onProgress?.(progress);
|
|
644
|
+
await hooks.callHook("crawl:done", { results });
|
|
555
645
|
if (results.some((r) => r.success)) {
|
|
556
646
|
progress.generation.status = "generating";
|
|
557
647
|
onProgress?.(progress);
|
|
558
648
|
const successfulResults = results.filter((r) => r.success);
|
|
559
649
|
const firstUrl = new URL(withHttps(urls[0]));
|
|
560
|
-
const
|
|
650
|
+
const originUrl = firstUrl.origin;
|
|
561
651
|
const homePageResult = successfulResults.find((r) => {
|
|
562
652
|
const resultUrl = new URL(withHttps(r.url));
|
|
563
|
-
return resultUrl.href ===
|
|
653
|
+
return resultUrl.href === originUrl || resultUrl.href === `${originUrl}/`;
|
|
564
654
|
});
|
|
565
655
|
const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
|
|
566
656
|
const description = descriptionOverride || homePageResult?.metadata?.description || successfulResults[0]?.metadata?.description;
|
|
@@ -586,7 +676,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
586
676
|
})),
|
|
587
677
|
siteName,
|
|
588
678
|
description,
|
|
589
|
-
origin:
|
|
679
|
+
origin: originUrl || firstUrl.origin,
|
|
590
680
|
generateFull: generateLlmsFullTxt,
|
|
591
681
|
outputDir
|
|
592
682
|
});
|
|
@@ -604,7 +694,6 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
604
694
|
progress.generation.status = "completed";
|
|
605
695
|
onProgress?.(progress);
|
|
606
696
|
}
|
|
607
|
-
await purgeDefaultStorages();
|
|
608
697
|
return results;
|
|
609
698
|
}
|
|
610
699
|
//#endregion
|