@mdream/crawl 1.0.0-beta.9 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,14 @@
1
- import { existsSync, mkdirSync } from "node:fs";
2
- import { writeFile } from "node:fs/promises";
1
+ import { mkdirSync } from "node:fs";
2
+ import { mkdir, writeFile } from "node:fs/promises";
3
3
  import * as p from "@clack/prompts";
4
4
  import { generateLlmsTxtArtifacts } from "@mdream/js/llms-txt";
5
- import { HttpCrawler, PlaywrightCrawler, log, purgeDefaultStorages } from "crawlee";
5
+ import { createHooks } from "hookable";
6
6
  import { htmlToMarkdown } from "mdream";
7
+ import { ofetch } from "ofetch";
7
8
  import { dirname, join, normalize, resolve } from "pathe";
8
9
  import { withHttps } from "ufo";
9
10
  import picomatch from "picomatch";
11
+ import { getDomain } from "tldts";
10
12
  //#region src/glob-utils.ts
11
13
  function stripGlobTail(s) {
12
14
  const idx = s.indexOf("*");
@@ -14,6 +16,14 @@ function stripGlobTail(s) {
14
16
  }
15
17
  const GLOB_CHAR_RE = /[*?[]/;
16
18
/**
 * Resolve a hostname to its registrable domain via the public suffix list.
 * Multi-part suffixes such as .co.uk or .github.io are taken into account
 * (private suffixes are enabled). When no domain can be derived, e.g. for
 * an IP address, the hostname is handed back untouched.
 */
function getRegistrableDomain(hostname) {
	const registrable = getDomain(hostname, { allowPrivateDomains: true });
	if (registrable) return registrable;
	return hostname;
}
26
+ /**
17
27
  * Parse a URL that may contain glob patterns
18
28
  * Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
19
29
  */
@@ -40,12 +50,15 @@ function parseUrlPattern(input) {
40
50
  /**
41
51
  * Check if a URL matches a glob pattern
42
52
  */
43
- function matchesGlobPattern(url, parsedPattern) {
53
+ function matchesGlobPattern(url, parsedPattern, allowSubdomains = false) {
44
54
  if (!parsedPattern.isGlob) return true;
45
55
  try {
46
56
  const urlObj = new URL(url);
47
57
  const urlPath = urlObj.pathname + urlObj.search + urlObj.hash;
48
- if (`${urlObj.protocol}//${urlObj.host}` !== parsedPattern.baseUrl) return false;
58
+ if (allowSubdomains) {
59
+ const patternUrl = new URL(parsedPattern.baseUrl);
60
+ if (getRegistrableDomain(urlObj.hostname) !== getRegistrableDomain(patternUrl.hostname)) return false;
61
+ } else if (`${urlObj.protocol}//${urlObj.host}` !== parsedPattern.baseUrl) return false;
49
62
  let pattern = parsedPattern.pattern;
50
63
  if (pattern.endsWith("*") && !pattern.endsWith("**") && !pattern.endsWith("/*")) {
51
64
  const base = pattern.slice(0, -1);
@@ -73,7 +86,7 @@ function getStartingUrl(parsedPattern) {
73
86
  /**
74
87
  * Check if a URL should be excluded based on exclude patterns
75
88
  */
76
- function isUrlExcluded(url, excludePatterns) {
89
+ function isUrlExcluded(url, excludePatterns, allowSubdomains = false) {
77
90
  if (!excludePatterns || excludePatterns.length === 0) return false;
78
91
  try {
79
92
  const urlObj = new URL(url);
@@ -81,7 +94,7 @@ function isUrlExcluded(url, excludePatterns) {
81
94
  return excludePatterns.some((pattern) => {
82
95
  if (pattern.includes("://")) {
83
96
  const parsedPattern = parseUrlPattern(pattern);
84
- if (parsedPattern.isGlob) return matchesGlobPattern(url, parsedPattern);
97
+ if (parsedPattern.isGlob) return matchesGlobPattern(url, parsedPattern, allowSubdomains);
85
98
  return url === pattern;
86
99
  }
87
100
  if (pattern.startsWith("/")) return picomatch(pattern.endsWith("/*") ? pattern.replace("/*", "/**") : pattern)(urlPath);
@@ -109,21 +122,75 @@ function validateGlobPattern(pattern) {
109
122
  }
110
123
  }
111
124
  //#endregion
112
- //#region src/metadata-extractor.ts
113
- function extractMetadata(html, url) {
114
- const links = [];
125
+ //#region src/crawl.ts
126
/** Captures the first <loc> value found between a "<sitemap…>" opening tag and the next "</sitemap>". */
const SITEMAP_INDEX_LOC_RE = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
/** Captures the first <loc> value found between a "<url…>" opening tag and the next "</url>". */
const SITEMAP_URL_LOC_RE = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
/**
 * Matches "Sitemap:" directives in a robots.txt body.
 * Anchored to the start of a line (multiline flag) so that commented-out
 * directives such as "# Sitemap: …" are ignored, and restricted to spaces/tabs
 * after the colon so an empty directive cannot swallow the following line.
 * The whole match may carry leading indentation; callers strip the prefix
 * with ROBOTS_SITEMAP_PREFIX_RE and trim the remainder.
 */
const ROBOTS_SITEMAP_RE = /^[ \t]*Sitemap:[ \t]*(.*)/gim;
/** Strips the "Sitemap:" label (and following whitespace) from a matched directive. */
const ROBOTS_SITEMAP_PREFIX_RE = /Sitemap:\s*/i;
/**
 * Captures the numeric value (integer or decimal) of the first "Crawl-delay:"
 * directive. Line-anchored for the same reason as ROBOTS_SITEMAP_RE: a
 * "# Crawl-delay: 30" comment must not be treated as a live directive.
 */
const ROBOTS_CRAWL_DELAY_RE = /^[ \t]*Crawl-delay:[ \t]*(\d+(?:\.\d+)?)/im;
/** Matches a single trailing slash. */
const URL_TRAILING_SLASH_RE = /\/$/;
/** Matches every character that is not a word character or a hyphen. */
const URL_PATH_UNSAFE_CHARS_RE = /[^\w\-]/g;
/** Matches a leading block delimited by "---" lines (front matter). */
const FRONTMATTER_BLOCK_RE = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
/** Request headers sent with every fetch issued by the crawler. */
const FETCH_HEADERS = {
	"User-Agent": "mdream-crawler/1.0",
	"Accept": "text/html,application/xhtml+xml,text/markdown"
};
/** Number of pages fetched in parallel by the HTTP driver. */
const DEFAULT_CONCURRENCY = 20;
139
/** The five entities predefined by XML, keyed by entity name. */
const XML_PREDEFINED_ENTITIES = {
	amp: "&",
	lt: "<",
	gt: ">",
	quot: "\"",
	apos: "'"
};
const XML_PREDEFINED_ENTITY_RE = /&(amp|lt|gt|quot|apos);/g;
/**
 * Normalise the raw text captured from a sitemap <loc> element into a URL.
 *
 * - Surrounding whitespace/newlines (common in pretty-printed sitemaps) are
 *   removed; previously they were kept and also defeated the CDATA check.
 * - A `<![CDATA[ … ]]>` wrapper is unwrapped and its content returned verbatim
 *   (CDATA content is literal, so no entity decoding is applied).
 * - Otherwise the XML predefined entities (&amp; &lt; &gt; &quot; &apos;) are
 *   decoded in a single pass, so "&amp;lt;" becomes "&lt;" and is not decoded twice.
 *
 * @param {string} url Raw text between <loc> and </loc>.
 * @returns {string} The cleaned URL string.
 */
function extractCdataUrl(url) {
	const trimmed = url.trim();
	// 9 === "<![CDATA[".length, 3 === "]]>".length
	if (trimmed.startsWith("<![CDATA[") && trimmed.endsWith("]]>")) return trimmed.slice(9, -3).trim();
	return trimmed.replace(XML_PREDEFINED_ENTITY_RE, (_, name) => XML_PREDEFINED_ENTITIES[name]);
}
143
/**
 * Fetch a sitemap and return every page URL it lists.
 *
 * A regular sitemap yields the <loc> of each <url> entry. A sitemap index
 * (detected by the presence of "<sitemapindex") is expanded recursively:
 * all child sitemaps are loaded in parallel and their URLs concatenated;
 * children that fail to load are skipped rather than failing the whole index.
 *
 * The request uses a 10 second timeout and no retries.
 *
 * @param {string} sitemapUrl Absolute URL of the sitemap to load.
 * @param {Set<string>} [visited] Sitemap URLs already requested during this
 *   traversal. Callers normally omit it; it is threaded through the recursion
 *   so that an index referencing itself (or an ancestor) terminates instead of
 *   recursing and re-fetching forever.
 * @returns {Promise<string[]>} Page URLs found (empty for an already-visited sitemap).
 * @throws {Error} When the response is rejected by isValidSitemapXml; errors
 *   raised by ofetch for the top-level request also propagate.
 */
async function loadSitemap(sitemapUrl, visited = /* @__PURE__ */ new Set()) {
	// Checked and recorded synchronously, before the first await, so sibling
	// children started in the same tick cannot both fetch the same sitemap.
	if (visited.has(sitemapUrl)) return [];
	visited.add(sitemapUrl);
	const xmlContent = await ofetch(sitemapUrl, {
		headers: FETCH_HEADERS,
		timeout: 1e4,
		responseType: "text",
		retry: 0
	});
	if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
	if (xmlContent.includes("<sitemapindex")) {
		// matchAll starts from the regex's current lastIndex, so reset the shared /g regex first.
		SITEMAP_INDEX_LOC_RE.lastIndex = 0;
		const childSitemaps = Array.from(xmlContent.matchAll(SITEMAP_INDEX_LOC_RE), (match) => extractCdataUrl(match[1]));
		const childResults = await Promise.allSettled(childSitemaps.map((url) => loadSitemap(url, visited)));
		// flatMap instead of push(...value): spreading a very large child result
		// into push() can exceed the engine's argument limit.
		return childResults.flatMap((result) => result.status === "fulfilled" ? result.value : []);
	}
	SITEMAP_URL_LOC_RE.lastIndex = 0;
	return Array.from(xmlContent.matchAll(SITEMAP_URL_LOC_RE), (match) => extractCdataUrl(match[1]));
}
175
+ function extractMetadataInline(parsedUrl, allowedDomains) {
176
+ const links = /* @__PURE__ */ new Set();
115
177
  let title = "";
116
178
  let description = "";
117
179
  let keywords = "";
118
180
  let author = "";
119
- htmlToMarkdown(html, {
120
- origin: new URL(url).origin,
181
+ const url = parsedUrl.href;
182
+ const originPrefix = `${parsedUrl.origin}/`;
183
+ return {
121
184
  extraction: {
122
185
  "a[href]": (el) => {
123
186
  const href = el.attributes.href;
124
187
  if (href) try {
125
- const absoluteUrl = new URL(href, url).href;
126
- if (!links.includes(absoluteUrl)) links.push(absoluteUrl);
188
+ const resolved = new URL(href, url);
189
+ const absoluteUrl = resolved.href;
190
+ if (allowedDomains) {
191
+ const domain = getRegistrableDomain(resolved.hostname);
192
+ if (domain && allowedDomains.has(domain)) links.add(absoluteUrl);
193
+ } else if (absoluteUrl.startsWith(originPrefix) || absoluteUrl === parsedUrl.origin) links.add(absoluteUrl);
127
194
  } catch {}
128
195
  },
129
196
  "title": (el) => {
@@ -144,88 +211,35 @@ function extractMetadata(html, url) {
144
211
  "meta[property=\"og:title\"]": (el) => {
145
212
  if (!title) title = el.attributes.content || "";
146
213
  }
147
- }
148
- });
149
- return {
150
- title: title.trim() || new URL(url).pathname,
151
- description: description.trim() || void 0,
152
- keywords: keywords.trim() || void 0,
153
- author: author.trim() || void 0,
154
- links: links.filter((link) => {
155
- try {
156
- const linkUrl = new URL(link);
157
- const baseUrl = new URL(url);
158
- return linkUrl.hostname === baseUrl.hostname;
159
- } catch {
160
- return false;
161
- }
214
+ },
215
+ getMetadata: () => ({
216
+ title: title.trim() || parsedUrl.pathname,
217
+ description: description.trim() || void 0,
218
+ keywords: keywords.trim() || void 0,
219
+ author: author.trim() || void 0,
220
+ links: [...links]
162
221
  })
163
222
  };
164
223
  }
165
- //#endregion
166
- //#region src/crawl.ts
167
- const SITEMAP_INDEX_LOC_RE = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
168
- const SITEMAP_URL_LOC_RE = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
169
- const ROBOTS_SITEMAP_RE = /Sitemap:\s*(.*)/gi;
170
- const ROBOTS_SITEMAP_PREFIX_RE = /Sitemap:\s*/i;
171
- const URL_TRAILING_SLASH_RE = /\/$/;
172
- const URL_PATH_UNSAFE_CHARS_RE = /[^\w\-]/g;
173
- const FRONTMATTER_BLOCK_RE = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
174
- async function loadSitemapWithoutRetries(sitemapUrl) {
175
- const controller = new AbortController();
176
- const timeoutId = setTimeout(() => controller.abort(), 1e4);
177
- try {
178
- const response = await fetch(sitemapUrl, {
179
- signal: controller.signal,
180
- headers: { "User-Agent": "mdream-crawler/1.0" }
181
- });
182
- clearTimeout(timeoutId);
183
- if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
184
- const xmlContent = await response.text();
185
- if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
186
- if (xmlContent.includes("<sitemapindex")) {
187
- SITEMAP_INDEX_LOC_RE.lastIndex = 0;
188
- const childSitemaps = [];
189
- let match;
190
- while (true) {
191
- match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
192
- if (match === null) break;
193
- let url = match[1];
194
- if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
195
- childSitemaps.push(url);
196
- }
197
- const allUrls = [];
198
- for (const childSitemapUrl of childSitemaps) try {
199
- const childUrls = await loadSitemapWithoutRetries(childSitemapUrl);
200
- allUrls.push(...childUrls);
201
- } catch (error) {
202
- console.warn(`Failed to load child sitemap ${childSitemapUrl}:`, error instanceof Error ? error.message : "Unknown error");
203
- }
204
- return allUrls;
205
- } else {
206
- const urls = [];
207
- SITEMAP_URL_LOC_RE.lastIndex = 0;
208
- let match;
209
- while (true) {
210
- match = SITEMAP_URL_LOC_RE.exec(xmlContent);
211
- if (match === null) break;
212
- let url = match[1];
213
- if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
214
- urls.push(url);
215
- }
216
- return urls;
217
- }
218
- } catch (error) {
219
- clearTimeout(timeoutId);
220
- if (error instanceof Error && error.name === "AbortError") throw new Error("Sitemap request timed out after 10 seconds");
221
- throw error;
222
- }
224
/**
 * Narrow a list of sitemap URLs down to the ones the crawl should start from.
 *
 * A URL is dropped when it matches an exclude pattern. When glob patterns are
 * in use, a surviving URL must additionally match at least one of them.
 *
 * @param {string[]} sitemapUrls URLs discovered in a sitemap.
 * @param {boolean} hasGlobPatterns Whether any crawl pattern is a glob.
 * @param {string[]} exclude Exclude patterns forwarded to isUrlExcluded.
 * @param {object[]} allPatterns Parsed crawl patterns forwarded to matchesGlobPattern.
 * @param {boolean} [allowSubdomains=false] Forwarded to both matchers.
 * @returns {string[]} The URLs that passed every check, in their original order.
 */
function filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, allPatterns, allowSubdomains = false) {
	const isWanted = (candidate) => {
		if (isUrlExcluded(candidate, exclude, allowSubdomains)) return false;
		if (!hasGlobPatterns) return true;
		return allPatterns.some((pattern) => matchesGlobPattern(candidate, pattern, allowSubdomains));
	};
	return sitemapUrls.filter((candidate) => isWanted(candidate));
}
228
/**
 * Run `fn` over every item with at most `concurrency` calls in flight.
 *
 * Workers share a single cursor: each one repeatedly claims the next unclaimed
 * item and awaits `fn` on it, so items are started in order and no item is
 * handed to two workers.
 *
 * @param {Array} items Items to process.
 * @param {number} concurrency Maximum number of simultaneous `fn` calls.
 *   Values below 1 (or non-numeric values) are treated as 1; previously they
 *   produced zero workers and every item was silently skipped.
 * @param {(item: any) => Promise<void>} fn Async callback invoked once per item.
 * @returns {Promise<void>} Resolves when all workers finish; rejects as soon as
 *   any `fn` call rejects.
 */
async function runConcurrent(items, concurrency, fn) {
	const requested = Math.floor(Number(concurrency));
	const limit = Number.isNaN(requested) ? 1 : Math.max(1, requested);
	let idx = 0;
	const workers = Array.from({ length: Math.min(limit, items.length) }, async () => {
		// idx++ happens synchronously before the await, so each index is claimed exactly once.
		while (idx < items.length) await fn(items[idx++]);
	});
	await Promise.all(workers);
}
224
235
  async function crawlAndGenerate(options, onProgress) {
225
- const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, onPage } = options;
236
+ const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay: userCrawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, allowSubdomains = false, hooks: hooksConfig, onPage } = options;
237
+ const hooks = createHooks();
238
+ if (hooksConfig) hooks.addHooks(hooksConfig);
239
+ if (onPage) hooks.hook("crawl:page", onPage);
240
+ const singlePageMode = maxDepth === 0;
226
241
  const outputDir = resolve(normalize(rawOutputDir));
227
- if (verbose) log.setLevel(log.LEVELS.INFO);
228
- else log.setLevel(log.LEVELS.OFF);
242
+ let crawlDelay = userCrawlDelay;
229
243
  let patterns;
230
244
  try {
231
245
  patterns = globPatterns.length > 0 ? globPatterns : urls.map(parseUrlPattern);
@@ -233,6 +247,7 @@ async function crawlAndGenerate(options, onProgress) {
233
247
  throw new Error(`Invalid URL pattern: ${error instanceof Error ? error.message : "Unknown error"}`);
234
248
  }
235
249
  let startingUrls = patterns.map(getStartingUrl);
250
+ const hasGlobPatterns = patterns.some((p) => p.isGlob);
236
251
  const progress = {
237
252
  sitemap: {
238
253
  status: "discovering",
@@ -242,60 +257,62 @@ async function crawlAndGenerate(options, onProgress) {
242
257
  crawling: {
243
258
  status: "starting",
244
259
  total: 0,
245
- processed: 0
260
+ processed: 0,
261
+ failed: 0,
262
+ latency: {
263
+ total: 0,
264
+ min: Infinity,
265
+ max: 0,
266
+ count: 0
267
+ }
246
268
  },
247
269
  generation: { status: "idle" }
248
270
  };
249
271
  const sitemapAttempts = [];
250
- if (startingUrls.length > 0 && !skipSitemap) {
272
+ if (startingUrls.length > 0 && !skipSitemap && !singlePageMode) {
251
273
  const baseUrl = new URL(startingUrls[0]).origin;
252
274
  const homePageUrl = baseUrl;
253
275
  onProgress?.(progress);
254
- const robotsUrl = new URL("/robots.txt", baseUrl).toString();
255
- const robotsController = new AbortController();
256
- const robotsTimeoutId = setTimeout(() => robotsController.abort(), 1e4);
257
- let robotsResponse;
276
+ let robotsContent = null;
258
277
  try {
259
- robotsResponse = await fetch(robotsUrl, {
260
- signal: robotsController.signal,
261
- headers: { "User-Agent": "mdream-crawler/1.0" }
278
+ robotsContent = await ofetch(`${baseUrl}/robots.txt`, {
279
+ headers: FETCH_HEADERS,
280
+ timeout: 1e4,
281
+ responseType: "text",
282
+ retry: 0
262
283
  });
263
- clearTimeout(robotsTimeoutId);
264
- } catch {
265
- clearTimeout(robotsTimeoutId);
266
- robotsResponse = null;
284
+ } catch {}
285
+ if (robotsContent && !crawlDelay) {
286
+ const crawlDelayMatch = robotsContent.match(ROBOTS_CRAWL_DELAY_RE);
287
+ if (crawlDelayMatch) {
288
+ crawlDelay = Number.parseFloat(crawlDelayMatch[1]);
289
+ p.log.info(`[ROBOTS] Crawl-delay: ${crawlDelay}s`);
290
+ }
267
291
  }
268
- if (robotsResponse?.ok) {
269
- const sitemapMatches = (await robotsResponse.text()).match(ROBOTS_SITEMAP_RE);
292
+ if (robotsContent) {
293
+ const sitemapMatches = robotsContent.match(ROBOTS_SITEMAP_RE);
270
294
  if (sitemapMatches && sitemapMatches.length > 0) {
271
295
  progress.sitemap.found = sitemapMatches.length;
272
296
  progress.sitemap.status = "processing";
273
297
  onProgress?.(progress);
274
298
  const robotsSitemaps = sitemapMatches.map((match) => match.replace(ROBOTS_SITEMAP_PREFIX_RE, "").trim());
275
299
  for (const sitemapUrl of robotsSitemaps) try {
276
- const robotsUrls = await loadSitemapWithoutRetries(sitemapUrl);
300
+ const robotsUrls = await loadSitemap(sitemapUrl);
277
301
  sitemapAttempts.push({
278
302
  url: sitemapUrl,
279
303
  success: true
280
304
  });
281
- if (patterns.some((p) => p.isGlob)) {
282
- const filteredUrls = robotsUrls.filter((url) => {
283
- return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
284
- });
305
+ const filteredUrls = filterSitemapUrls(robotsUrls, hasGlobPatterns, exclude, patterns, allowSubdomains);
306
+ if (hasGlobPatterns) {
307
+ startingUrls = filteredUrls;
308
+ progress.sitemap.processed = filteredUrls.length;
309
+ onProgress?.(progress);
310
+ break;
311
+ } else if (filteredUrls.length > 0) {
285
312
  startingUrls = filteredUrls;
286
313
  progress.sitemap.processed = filteredUrls.length;
287
314
  onProgress?.(progress);
288
315
  break;
289
- } else {
290
- const filteredUrls = robotsUrls.filter((url) => {
291
- return !isUrlExcluded(url, exclude);
292
- });
293
- if (filteredUrls.length > 0) {
294
- startingUrls = filteredUrls;
295
- progress.sitemap.processed = filteredUrls.length;
296
- onProgress?.(progress);
297
- break;
298
- }
299
316
  }
300
317
  } catch (error) {
301
318
  sitemapAttempts.push({
@@ -309,31 +326,24 @@ async function crawlAndGenerate(options, onProgress) {
309
326
  let mainSitemapProcessed = false;
310
327
  const mainSitemapUrl = `${baseUrl}/sitemap.xml`;
311
328
  try {
312
- const sitemapUrls = await loadSitemapWithoutRetries(mainSitemapUrl);
329
+ const sitemapUrls = await loadSitemap(mainSitemapUrl);
313
330
  sitemapAttempts.push({
314
331
  url: mainSitemapUrl,
315
332
  success: true
316
333
  });
317
- if (patterns.some((p) => p.isGlob)) {
318
- const filteredUrls = sitemapUrls.filter((url) => {
319
- return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
320
- });
334
+ const filteredUrls = filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, patterns, allowSubdomains);
335
+ if (hasGlobPatterns) {
336
+ startingUrls = filteredUrls;
337
+ progress.sitemap.found = sitemapUrls.length;
338
+ progress.sitemap.processed = filteredUrls.length;
339
+ onProgress?.(progress);
340
+ mainSitemapProcessed = true;
341
+ } else if (filteredUrls.length > 0) {
321
342
  startingUrls = filteredUrls;
322
343
  progress.sitemap.found = sitemapUrls.length;
323
344
  progress.sitemap.processed = filteredUrls.length;
324
345
  onProgress?.(progress);
325
346
  mainSitemapProcessed = true;
326
- } else {
327
- const filteredUrls = sitemapUrls.filter((url) => {
328
- return !isUrlExcluded(url, exclude);
329
- });
330
- if (filteredUrls.length > 0) {
331
- startingUrls = filteredUrls;
332
- progress.sitemap.found = sitemapUrls.length;
333
- progress.sitemap.processed = filteredUrls.length;
334
- onProgress?.(progress);
335
- mainSitemapProcessed = true;
336
- }
337
347
  }
338
348
  } catch (error) {
339
349
  sitemapAttempts.push({
@@ -348,31 +358,24 @@ async function crawlAndGenerate(options, onProgress) {
348
358
  `${baseUrl}/sitemap-index.xml`
349
359
  ];
350
360
  for (const sitemapUrl of commonSitemaps) try {
351
- const altUrls = await loadSitemapWithoutRetries(sitemapUrl);
361
+ const altUrls = await loadSitemap(sitemapUrl);
352
362
  sitemapAttempts.push({
353
363
  url: sitemapUrl,
354
364
  success: true
355
365
  });
356
- if (patterns.some((p) => p.isGlob)) {
357
- const filteredUrls = altUrls.filter((url) => {
358
- return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
359
- });
366
+ const filteredUrls = filterSitemapUrls(altUrls, hasGlobPatterns, exclude, patterns, allowSubdomains);
367
+ if (hasGlobPatterns) {
368
+ startingUrls = filteredUrls;
369
+ progress.sitemap.found = altUrls.length;
370
+ progress.sitemap.processed = filteredUrls.length;
371
+ onProgress?.(progress);
372
+ break;
373
+ } else if (filteredUrls.length > 0) {
360
374
  startingUrls = filteredUrls;
361
375
  progress.sitemap.found = altUrls.length;
362
376
  progress.sitemap.processed = filteredUrls.length;
363
377
  onProgress?.(progress);
364
378
  break;
365
- } else {
366
- const filteredUrls = altUrls.filter((url) => {
367
- return !isUrlExcluded(url, exclude);
368
- });
369
- if (filteredUrls.length > 0) {
370
- startingUrls = filteredUrls;
371
- progress.sitemap.found = altUrls.length;
372
- progress.sitemap.processed = filteredUrls.length;
373
- onProgress?.(progress);
374
- break;
375
- }
376
379
  }
377
380
  } catch (error) {
378
381
  sitemapAttempts.push({
@@ -398,169 +401,256 @@ async function crawlAndGenerate(options, onProgress) {
398
401
  progress.sitemap.status = "completed";
399
402
  progress.crawling.total = startingUrls.length;
400
403
  onProgress?.(progress);
401
- } else if (skipSitemap && startingUrls.length > 0) {
404
+ } else if ((skipSitemap || singlePageMode) && startingUrls.length > 0) {
402
405
  progress.sitemap.status = "completed";
403
406
  progress.sitemap.found = 0;
404
407
  progress.sitemap.processed = 0;
405
408
  progress.crawling.total = startingUrls.length;
406
409
  onProgress?.(progress);
407
410
  }
408
- if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });
411
+ mkdirSync(outputDir, { recursive: true });
409
412
  const results = [];
410
413
  const processedUrls = /* @__PURE__ */ new Set();
414
+ const allowedRegistrableDomains = allowSubdomains ? new Set(startingUrls.map((u) => {
415
+ try {
416
+ return getRegistrableDomain(new URL(u).hostname);
417
+ } catch {
418
+ return "";
419
+ }
420
+ }).filter(Boolean)) : void 0;
411
421
  const shouldCrawlUrl = (url) => {
412
- if (isUrlExcluded(url, exclude)) return false;
413
- if (!patterns.some((p) => p.isGlob)) return true;
414
- return patterns.some((pattern) => matchesGlobPattern(url, pattern));
415
- };
416
- const createRequestHandler = (crawlerType) => {
417
- return async ({ request, body, page, enqueueLinks, response }) => {
418
- const startTime = Date.now();
419
- progress.crawling.currentUrl = request.loadedUrl;
420
- onProgress?.(progress);
421
- if (response?.statusCode && (response.statusCode < 200 || response.statusCode >= 300)) return;
422
- const homePageUrl = new URL(startingUrls[0]).origin;
423
- let html;
424
- let title;
425
- if (crawlerType === "playwright") {
426
- await page.waitForLoadState("networkidle");
427
- title = await page.title();
428
- html = await page.innerHTML("html");
429
- } else {
430
- html = typeof body === "string" ? body : body.toString();
431
- title = "";
422
+ if (isUrlExcluded(url, exclude, allowSubdomains)) return false;
423
+ if (!hasGlobPatterns) {
424
+ if (allowedRegistrableDomains) try {
425
+ return allowedRegistrableDomains.has(getRegistrableDomain(new URL(url).hostname));
426
+ } catch {
427
+ return false;
432
428
  }
433
- const metadata = extractMetadata(html, request.loadedUrl);
434
- if (!title) title = metadata.title;
435
- const shouldProcessMarkdown = shouldCrawlUrl(request.loadedUrl);
436
- const pageOrigin = origin || new URL(request.loadedUrl).origin;
437
- if (onPage && shouldProcessMarkdown) await onPage({
438
- url: request.loadedUrl,
439
- html,
429
+ return true;
430
+ }
431
+ return patterns.some((pattern) => matchesGlobPattern(url, pattern, allowSubdomains));
432
+ };
433
+ const recordLatency = (ms) => {
434
+ const lat = progress.crawling.latency;
435
+ lat.total += ms;
436
+ lat.count++;
437
+ if (ms < lat.min) lat.min = ms;
438
+ if (ms > lat.max) lat.max = ms;
439
+ };
440
+ const normalizedHomePageUrl = (startingUrls.length > 0 ? new URL(startingUrls[0]).origin : "").replace(URL_TRAILING_SLASH_RE, "");
441
+ const createdDirs = /* @__PURE__ */ new Set();
442
+ const sharedOrigin = origin || "";
443
+ const processPage = async (url, content, initialTitle, depth, isMarkdown = false) => {
444
+ const parsedUrl = new URL(url);
445
+ const shouldProcessMarkdown = shouldCrawlUrl(url);
446
+ const pageOrigin = sharedOrigin || parsedUrl.origin;
447
+ let md;
448
+ let metadata;
449
+ if (isMarkdown) {
450
+ md = content;
451
+ metadata = {
452
+ title: initialTitle || parsedUrl.pathname,
453
+ links: []
454
+ };
455
+ } else {
456
+ const { extraction, getMetadata } = extractMetadataInline(parsedUrl, allowedRegistrableDomains);
457
+ md = htmlToMarkdown(content, {
458
+ origin: pageOrigin,
459
+ extraction
460
+ });
461
+ metadata = getMetadata();
462
+ }
463
+ let title = initialTitle || metadata.title;
464
+ if (shouldProcessMarkdown) {
465
+ const pageData = {
466
+ url,
467
+ html: isMarkdown ? "" : content,
440
468
  title,
441
469
  metadata,
442
470
  origin: pageOrigin
443
- });
444
- let md = "";
445
- if (shouldProcessMarkdown) md = htmlToMarkdown(html, { origin: pageOrigin });
446
- let filePath;
447
- if (shouldProcessMarkdown && generateIndividualMd) {
448
- const urlObj = new URL(request.loadedUrl);
449
- const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"));
450
- filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
451
- const fileDir = dirname(filePath);
452
- if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
453
- await writeFile(filePath, md, "utf-8");
454
- }
455
- const isHomePage = request.loadedUrl.replace(URL_TRAILING_SLASH_RE, "") === homePageUrl.replace(URL_TRAILING_SLASH_RE, "");
456
- if (shouldProcessMarkdown || isHomePage) {
457
- const result = {
458
- url: request.loadedUrl,
459
- title,
460
- content: md,
461
- filePath: shouldProcessMarkdown ? filePath : void 0,
462
- timestamp: startTime,
463
- success: true,
464
- metadata,
465
- depth: request.userData?.depth || 0
466
- };
467
- results.push(result);
468
- progress.crawling.processed = results.length;
469
- onProgress?.(progress);
470
- }
471
- if (followLinks && (request.userData?.depth || 0) < maxDepth) {
472
- const currentDepth = (request.userData?.depth || 0) + 1;
473
- const filteredLinks = metadata.links.filter((link) => {
474
- return shouldCrawlUrl(link);
475
- });
476
- if (enqueueLinks) await enqueueLinks({
477
- urls: filteredLinks,
478
- userData: { depth: currentDepth }
479
- });
480
- else for (const link of filteredLinks) if (!processedUrls.has(link)) processedUrls.add(link);
471
+ };
472
+ await hooks.callHook("crawl:page", pageData);
473
+ title = pageData.title;
474
+ }
475
+ let filePath;
476
+ if (shouldProcessMarkdown && generateIndividualMd) {
477
+ const urlPath = parsedUrl.pathname === "/" ? "/index" : parsedUrl.pathname;
478
+ const hostPrefix = allowSubdomains ? [parsedUrl.hostname.replace(URL_PATH_UNSAFE_CHARS_RE, "-")] : [];
479
+ const pathSegments = urlPath.replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0);
480
+ const safeSegments = [...hostPrefix, ...pathSegments.map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"))];
481
+ filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
482
+ const contentCtx = {
483
+ url,
484
+ title,
485
+ content: md,
486
+ filePath
487
+ };
488
+ await hooks.callHook("crawl:content", contentCtx);
489
+ md = contentCtx.content;
490
+ filePath = contentCtx.filePath;
491
+ const fileDir = dirname(filePath);
492
+ if (fileDir && !createdDirs.has(fileDir)) {
493
+ await mkdir(fileDir, { recursive: true });
494
+ createdDirs.add(fileDir);
481
495
  }
482
- };
496
+ await writeFile(filePath, md, "utf-8");
497
+ }
498
+ const isHomePage = parsedUrl.pathname === "/" && parsedUrl.origin === normalizedHomePageUrl;
499
+ if (shouldProcessMarkdown || isHomePage) {
500
+ const result = {
501
+ url,
502
+ title,
503
+ content: md,
504
+ filePath: shouldProcessMarkdown ? filePath : void 0,
505
+ timestamp: Date.now(),
506
+ success: true,
507
+ metadata,
508
+ depth
509
+ };
510
+ results.push(result);
511
+ progress.crawling.processed = results.length;
512
+ onProgress?.(progress);
513
+ }
514
+ if (followLinks && !singlePageMode && depth < maxDepth) {
515
+ const filteredLinks = metadata.links.filter((link) => shouldCrawlUrl(link));
516
+ for (const link of filteredLinks) processedUrls.add(link);
517
+ }
483
518
  };
484
- let crawler;
485
- const crawlerOptions = {
486
- requestHandler: createRequestHandler(driver),
487
- errorHandler: async ({ request, response, error }) => {
488
- if (verbose) console.error(`[ERROR] URL: ${request.url}, Status: ${response?.statusCode || "N/A"}, Error: ${error?.message || "Unknown"}`);
489
- if (response?.statusCode && response?.statusCode >= 400) {
490
- request.noRetry = true;
491
- const result = {
492
- url: request.url,
493
- title: "",
494
- content: "",
495
- timestamp: Date.now(),
496
- success: false,
497
- error: `HTTP ${response.statusCode}`,
498
- metadata: {
499
- title: "",
500
- description: "",
501
- links: []
502
- },
503
- depth: request.userData?.depth || 0
519
+ const urlsToProcess = startingUrls.slice(0, maxRequestsPerCrawl);
520
+ progress.crawling.status = "processing";
521
+ progress.crawling.total = urlsToProcess.length;
522
+ onProgress?.(progress);
523
+ if (driver === "playwright") {
524
+ const { log, PlaywrightCrawler, purgeDefaultStorages } = await import("crawlee");
525
+ if (verbose) log.setLevel(log.LEVELS.INFO);
526
+ else log.setLevel(log.LEVELS.OFF);
527
+ const crawlerOptions = {
528
+ requestHandler: async ({ request, page }) => {
529
+ progress.crawling.currentUrl = request.loadedUrl;
530
+ onProgress?.(progress);
531
+ const urlCtx = {
532
+ url: request.loadedUrl,
533
+ skip: false
504
534
  };
505
- results.push(result);
506
- } else if (error) {
535
+ await hooks.callHook("crawl:url", urlCtx);
536
+ if (urlCtx.skip) return;
537
+ const fetchStart = Date.now();
538
+ await page.waitForLoadState("networkidle");
539
+ const title = await page.title();
540
+ const html = await page.innerHTML("html");
541
+ recordLatency(Date.now() - fetchStart);
542
+ await processPage(request.loadedUrl, html, title, request.userData?.depth || 0);
543
+ },
544
+ errorHandler: async ({ request, response, error }) => {
545
+ if (verbose) console.error(`[ERROR] URL: ${request.url}, Status: ${response?.statusCode || "N/A"}, Error: ${error?.message || "Unknown"}`);
507
546
  request.noRetry = true;
508
- const result = {
547
+ progress.crawling.failed++;
548
+ results.push({
509
549
  url: request.url,
510
550
  title: "",
511
551
  content: "",
512
552
  timestamp: Date.now(),
513
553
  success: false,
514
- error: error.message || "Unknown error",
554
+ error: response?.statusCode ? `HTTP ${response.statusCode}` : error?.message || "Unknown error",
515
555
  metadata: {
516
556
  title: "",
517
557
  description: "",
518
558
  links: []
519
559
  },
520
560
  depth: request.userData?.depth || 0
521
- };
522
- results.push(result);
523
- }
524
- },
525
- maxRequestsPerCrawl,
526
- respectRobotsTxtFile: false
527
- };
528
- if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
529
- if (driver === "playwright") {
530
- const playwrightOptions = crawlerOptions;
531
- if (useChrome) playwrightOptions.launchContext = {
532
- ...playwrightOptions.launchContext,
561
+ });
562
+ },
563
+ maxRequestsPerCrawl,
564
+ respectRobotsTxtFile: false
565
+ };
566
+ if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
567
+ if (useChrome) crawlerOptions.launchContext = {
568
+ ...crawlerOptions.launchContext,
533
569
  useChrome
534
570
  };
535
- crawler = new PlaywrightCrawler(playwrightOptions);
536
- } else crawler = new HttpCrawler(crawlerOptions);
537
- const initialRequests = startingUrls.map((url) => ({
538
- url,
539
- userData: { depth: 0 }
540
- }));
541
- progress.crawling.status = "processing";
542
- progress.crawling.total = startingUrls.length;
543
- onProgress?.(progress);
544
- try {
545
- await crawler.run(initialRequests);
546
- } catch (error) {
547
- if (verbose) {
548
- console.error(`[CRAWLER ERROR] ${error instanceof Error ? error.message : "Unknown error"}`);
549
- console.error(`[CRAWLER ERROR] Stack trace:`, error instanceof Error ? error.stack : "No stack trace");
571
+ const crawler = new PlaywrightCrawler(crawlerOptions);
572
+ const initialRequests = urlsToProcess.map((url) => ({
573
+ url,
574
+ userData: { depth: 0 }
575
+ }));
576
+ try {
577
+ await crawler.run(initialRequests);
578
+ } catch (error) {
579
+ const msg = error instanceof Error ? error.message : "";
580
+ if (msg.includes("wmic") || msg.includes("ENOENT")) throw new Error(`Crawlee failed to spawn a system process (${msg}). On Windows 11+, wmic.exe is no longer available. Upgrade crawlee to >=3.16.0 or use the HTTP driver instead (--driver http).`);
581
+ if (verbose) {
582
+ console.error(`[CRAWLER ERROR] ${msg || "Unknown error"}`);
583
+ console.error(`[CRAWLER ERROR] Stack trace:`, error instanceof Error ? error.stack : "No stack trace");
584
+ }
585
+ throw error;
550
586
  }
551
- throw error;
552
- }
587
+ await purgeDefaultStorages();
588
+ } else await runConcurrent(urlsToProcess, DEFAULT_CONCURRENCY, async (url) => {
589
+ progress.crawling.currentUrl = url;
590
+ onProgress?.(progress);
591
+ if (crawlDelay) {
592
+ const delay = crawlDelay;
593
+ await new Promise((resolve) => setTimeout(resolve, delay * 1e3));
594
+ }
595
+ const urlCtx = {
596
+ url,
597
+ skip: false
598
+ };
599
+ await hooks.callHook("crawl:url", urlCtx);
600
+ if (urlCtx.skip) return;
601
+ try {
602
+ const fetchStart = Date.now();
603
+ const response = await ofetch.raw(url, {
604
+ headers: FETCH_HEADERS,
605
+ responseType: "text",
606
+ retry: 2,
607
+ retryDelay: 500,
608
+ timeout: 1e4,
609
+ onResponseError({ response }) {
610
+ if (response.status === 429) {
611
+ const retryAfter = response.headers.get("retry-after");
612
+ const delaySec = retryAfter ? Number.parseInt(retryAfter) || 1 : 2;
613
+ if (!crawlDelay || delaySec > crawlDelay) crawlDelay = delaySec;
614
+ }
615
+ }
616
+ });
617
+ recordLatency(Date.now() - fetchStart);
618
+ const body = response._data ?? "";
619
+ const contentType = response.headers.get("content-type") || "";
620
+ await processPage(url, body, "", 0, contentType.includes("text/markdown") || contentType.includes("text/x-markdown"));
621
+ } catch (error) {
622
+ if (verbose) console.error(`[ERROR] URL: ${url}, Error: ${error instanceof Error ? error.message : "Unknown"}`);
623
+ progress.crawling.failed++;
624
+ results.push({
625
+ url,
626
+ title: "",
627
+ content: "",
628
+ timestamp: Date.now(),
629
+ success: false,
630
+ error: error instanceof Error ? error.message : "Unknown error",
631
+ metadata: {
632
+ title: "",
633
+ description: "",
634
+ links: []
635
+ },
636
+ depth: 0
637
+ });
638
+ progress.crawling.processed = results.length;
639
+ onProgress?.(progress);
640
+ }
641
+ });
553
642
  progress.crawling.status = "completed";
554
643
  onProgress?.(progress);
644
+ await hooks.callHook("crawl:done", { results });
555
645
  if (results.some((r) => r.success)) {
556
646
  progress.generation.status = "generating";
557
647
  onProgress?.(progress);
558
648
  const successfulResults = results.filter((r) => r.success);
559
649
  const firstUrl = new URL(withHttps(urls[0]));
560
- const origin = firstUrl.origin;
650
+ const originUrl = firstUrl.origin;
561
651
  const homePageResult = successfulResults.find((r) => {
562
652
  const resultUrl = new URL(withHttps(r.url));
563
- return resultUrl.href === origin || resultUrl.href === `${origin}/`;
653
+ return resultUrl.href === originUrl || resultUrl.href === `${originUrl}/`;
564
654
  });
565
655
  const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
566
656
  const description = descriptionOverride || homePageResult?.metadata?.description || successfulResults[0]?.metadata?.description;
@@ -586,7 +676,7 @@ async function crawlAndGenerate(options, onProgress) {
586
676
  })),
587
677
  siteName,
588
678
  description,
589
- origin: origin || firstUrl.origin,
679
+ origin: originUrl || firstUrl.origin,
590
680
  generateFull: generateLlmsFullTxt,
591
681
  outputDir
592
682
  });
@@ -604,7 +694,6 @@ async function crawlAndGenerate(options, onProgress) {
604
694
  progress.generation.status = "completed";
605
695
  onProgress?.(progress);
606
696
  }
607
- await purgeDefaultStorages();
608
697
  return results;
609
698
  }
610
699
  //#endregion