@mdream/crawl 1.0.0-beta.10 → 1.0.0-beta.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -3
- package/dist/_chunks/crawl.mjs +293 -270
- package/dist/_chunks/playwright-utils.mjs +59 -0
- package/dist/cli.mjs +42 -83
- package/dist/index.d.mts +8 -1
- package/package.json +8 -4
package/README.md
CHANGED
|
@@ -7,7 +7,7 @@ Multi-page website crawler that generates comprehensive llms.txt files by follow
|
|
|
7
7
|
## Installation
|
|
8
8
|
|
|
9
9
|
```bash
|
|
10
|
-
npm install @mdream/crawl
|
|
10
|
+
npm install @mdream/crawl@beta
|
|
11
11
|
```
|
|
12
12
|
|
|
13
13
|
## Usage
|
|
@@ -15,7 +15,7 @@ npm install @mdream/crawl
|
|
|
15
15
|
Simply run the command to start the interactive multi-page website crawler:
|
|
16
16
|
|
|
17
17
|
```bash
|
|
18
|
-
npx @mdream/crawl
|
|
18
|
+
npx @mdream/crawl@beta
|
|
19
19
|
```
|
|
20
20
|
|
|
21
21
|
The crawler will automatically discover and follow internal links to crawl entire websites. The interactive interface provides:
|
|
@@ -46,6 +46,16 @@ const results = await crawlAndGenerate({
|
|
|
46
46
|
})
|
|
47
47
|
```
|
|
48
48
|
|
|
49
|
+
### Playwright Driver
|
|
50
|
+
|
|
51
|
+
The default HTTP driver works for most sites. For JavaScript-heavy sites that require a browser, install the optional dependencies:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
npm install crawlee playwright
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Then use `--driver playwright` or `driver: 'playwright'` in the API.
|
|
58
|
+
|
|
49
59
|
> **Note**: llms.txt artifact generation is handled by [`@mdream/js/llms-txt`](../js). The crawl package uses it internally when `generateLlmsTxt: true`.
|
|
50
60
|
|
|
51
61
|
## Output
|
|
@@ -70,7 +80,7 @@ The crawler generates comprehensive output from entire websites:
|
|
|
70
80
|
|
|
71
81
|
- ✅ **Multi-Page Website Crawling**: Designed specifically for crawling entire websites by following internal links
|
|
72
82
|
- ✅ **Purely Interactive**: No complex command-line options to remember
|
|
73
|
-
- ✅ **Dual Crawler Support**: Fast HTTP crawler (default) + Playwright for JavaScript-heavy sites
|
|
83
|
+
- ✅ **Dual Crawler Support**: Fast HTTP crawler (default) + Playwright for JavaScript-heavy sites (requires `crawlee` and `playwright`)
|
|
74
84
|
- ✅ **Smart Link Discovery**: Uses mdream's extraction plugin to find and follow internal links
|
|
75
85
|
- ✅ **Rich Metadata Extraction**: Extracts titles, descriptions, keywords, and author info from all pages
|
|
76
86
|
- ✅ **Comprehensive llms.txt Generation**: Creates complete site documentation files
|
package/dist/_chunks/crawl.mjs
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { writeFile } from "node:fs/promises";
|
|
1
|
+
import { mkdirSync } from "node:fs";
|
|
2
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
3
3
|
import * as p from "@clack/prompts";
|
|
4
4
|
import { generateLlmsTxtArtifacts } from "@mdream/js/llms-txt";
|
|
5
|
-
import { HttpCrawler, PlaywrightCrawler, log, purgeDefaultStorages } from "crawlee";
|
|
6
5
|
import { htmlToMarkdown } from "mdream";
|
|
6
|
+
import { ofetch } from "ofetch";
|
|
7
7
|
import { dirname, join, normalize, resolve } from "pathe";
|
|
8
8
|
import { withHttps } from "ufo";
|
|
9
9
|
import picomatch from "picomatch";
|
|
@@ -109,21 +109,71 @@ function validateGlobPattern(pattern) {
|
|
|
109
109
|
}
|
|
110
110
|
}
|
|
111
111
|
//#endregion
|
|
112
|
-
//#region src/
|
|
113
|
-
|
|
114
|
-
|
|
112
|
+
//#region src/crawl.ts
|
|
113
|
+
const SITEMAP_INDEX_LOC_RE = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
|
|
114
|
+
const SITEMAP_URL_LOC_RE = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
|
|
115
|
+
const ROBOTS_SITEMAP_RE = /Sitemap:\s*(.*)/gi;
|
|
116
|
+
const ROBOTS_SITEMAP_PREFIX_RE = /Sitemap:\s*/i;
|
|
117
|
+
const ROBOTS_CRAWL_DELAY_RE = /Crawl-delay:\s*(\d+(?:\.\d+)?)/i;
|
|
118
|
+
const URL_TRAILING_SLASH_RE = /\/$/;
|
|
119
|
+
const URL_PATH_UNSAFE_CHARS_RE = /[^\w\-]/g;
|
|
120
|
+
const FRONTMATTER_BLOCK_RE = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
|
|
121
|
+
const FETCH_HEADERS = {
|
|
122
|
+
"User-Agent": "mdream-crawler/1.0",
|
|
123
|
+
"Accept": "text/html,application/xhtml+xml,text/markdown"
|
|
124
|
+
};
|
|
125
|
+
const DEFAULT_CONCURRENCY = 20;
|
|
126
|
+
function extractCdataUrl(url) {
|
|
127
|
+
if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) return url.slice(9, -3);
|
|
128
|
+
return url;
|
|
129
|
+
}
|
|
130
|
+
async function loadSitemap(sitemapUrl) {
|
|
131
|
+
const xmlContent = await ofetch(sitemapUrl, {
|
|
132
|
+
headers: FETCH_HEADERS,
|
|
133
|
+
timeout: 1e4,
|
|
134
|
+
responseType: "text",
|
|
135
|
+
retry: 0
|
|
136
|
+
});
|
|
137
|
+
if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
|
|
138
|
+
if (xmlContent.includes("<sitemapindex")) {
|
|
139
|
+
SITEMAP_INDEX_LOC_RE.lastIndex = 0;
|
|
140
|
+
const childSitemaps = [];
|
|
141
|
+
let match;
|
|
142
|
+
while (true) {
|
|
143
|
+
match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
|
|
144
|
+
if (match === null) break;
|
|
145
|
+
childSitemaps.push(extractCdataUrl(match[1]));
|
|
146
|
+
}
|
|
147
|
+
const childResults = await Promise.allSettled(childSitemaps.map((url) => loadSitemap(url)));
|
|
148
|
+
const allUrls = [];
|
|
149
|
+
for (const result of childResults) if (result.status === "fulfilled") allUrls.push(...result.value);
|
|
150
|
+
return allUrls;
|
|
151
|
+
}
|
|
152
|
+
const urls = [];
|
|
153
|
+
SITEMAP_URL_LOC_RE.lastIndex = 0;
|
|
154
|
+
let match;
|
|
155
|
+
while (true) {
|
|
156
|
+
match = SITEMAP_URL_LOC_RE.exec(xmlContent);
|
|
157
|
+
if (match === null) break;
|
|
158
|
+
urls.push(extractCdataUrl(match[1]));
|
|
159
|
+
}
|
|
160
|
+
return urls;
|
|
161
|
+
}
|
|
162
|
+
function extractMetadataInline(parsedUrl) {
|
|
163
|
+
const links = /* @__PURE__ */ new Set();
|
|
115
164
|
let title = "";
|
|
116
165
|
let description = "";
|
|
117
166
|
let keywords = "";
|
|
118
167
|
let author = "";
|
|
119
|
-
|
|
120
|
-
|
|
168
|
+
const url = parsedUrl.href;
|
|
169
|
+
const originPrefix = `${parsedUrl.origin}/`;
|
|
170
|
+
return {
|
|
121
171
|
extraction: {
|
|
122
172
|
"a[href]": (el) => {
|
|
123
173
|
const href = el.attributes.href;
|
|
124
174
|
if (href) try {
|
|
125
175
|
const absoluteUrl = new URL(href, url).href;
|
|
126
|
-
if (
|
|
176
|
+
if (absoluteUrl.startsWith(originPrefix) || absoluteUrl === parsedUrl.origin) links.add(absoluteUrl);
|
|
127
177
|
} catch {}
|
|
128
178
|
},
|
|
129
179
|
"title": (el) => {
|
|
@@ -144,88 +194,31 @@ function extractMetadata(html, url) {
|
|
|
144
194
|
"meta[property=\"og:title\"]": (el) => {
|
|
145
195
|
if (!title) title = el.attributes.content || "";
|
|
146
196
|
}
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
links: links.filter((link) => {
|
|
155
|
-
try {
|
|
156
|
-
const linkUrl = new URL(link);
|
|
157
|
-
const baseUrl = new URL(url);
|
|
158
|
-
return linkUrl.hostname === baseUrl.hostname;
|
|
159
|
-
} catch {
|
|
160
|
-
return false;
|
|
161
|
-
}
|
|
197
|
+
},
|
|
198
|
+
getMetadata: () => ({
|
|
199
|
+
title: title.trim() || parsedUrl.pathname,
|
|
200
|
+
description: description.trim() || void 0,
|
|
201
|
+
keywords: keywords.trim() || void 0,
|
|
202
|
+
author: author.trim() || void 0,
|
|
203
|
+
links: [...links]
|
|
162
204
|
})
|
|
163
205
|
};
|
|
164
206
|
}
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
const
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
const controller = new AbortController();
|
|
176
|
-
const timeoutId = setTimeout(() => controller.abort(), 1e4);
|
|
177
|
-
try {
|
|
178
|
-
const response = await fetch(sitemapUrl, {
|
|
179
|
-
signal: controller.signal,
|
|
180
|
-
headers: { "User-Agent": "mdream-crawler/1.0" }
|
|
181
|
-
});
|
|
182
|
-
clearTimeout(timeoutId);
|
|
183
|
-
if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
|
|
184
|
-
const xmlContent = await response.text();
|
|
185
|
-
if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
|
|
186
|
-
if (xmlContent.includes("<sitemapindex")) {
|
|
187
|
-
SITEMAP_INDEX_LOC_RE.lastIndex = 0;
|
|
188
|
-
const childSitemaps = [];
|
|
189
|
-
let match;
|
|
190
|
-
while (true) {
|
|
191
|
-
match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
|
|
192
|
-
if (match === null) break;
|
|
193
|
-
let url = match[1];
|
|
194
|
-
if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
|
|
195
|
-
childSitemaps.push(url);
|
|
196
|
-
}
|
|
197
|
-
const allUrls = [];
|
|
198
|
-
for (const childSitemapUrl of childSitemaps) try {
|
|
199
|
-
const childUrls = await loadSitemapWithoutRetries(childSitemapUrl);
|
|
200
|
-
allUrls.push(...childUrls);
|
|
201
|
-
} catch (error) {
|
|
202
|
-
console.warn(`Failed to load child sitemap ${childSitemapUrl}:`, error instanceof Error ? error.message : "Unknown error");
|
|
203
|
-
}
|
|
204
|
-
return allUrls;
|
|
205
|
-
} else {
|
|
206
|
-
const urls = [];
|
|
207
|
-
SITEMAP_URL_LOC_RE.lastIndex = 0;
|
|
208
|
-
let match;
|
|
209
|
-
while (true) {
|
|
210
|
-
match = SITEMAP_URL_LOC_RE.exec(xmlContent);
|
|
211
|
-
if (match === null) break;
|
|
212
|
-
let url = match[1];
|
|
213
|
-
if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
|
|
214
|
-
urls.push(url);
|
|
215
|
-
}
|
|
216
|
-
return urls;
|
|
217
|
-
}
|
|
218
|
-
} catch (error) {
|
|
219
|
-
clearTimeout(timeoutId);
|
|
220
|
-
if (error instanceof Error && error.name === "AbortError") throw new Error("Sitemap request timed out after 10 seconds");
|
|
221
|
-
throw error;
|
|
222
|
-
}
|
|
207
|
+
function filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, allPatterns) {
|
|
208
|
+
if (hasGlobPatterns) return sitemapUrls.filter((url) => !isUrlExcluded(url, exclude) && allPatterns.some((pattern) => matchesGlobPattern(url, pattern)));
|
|
209
|
+
return sitemapUrls.filter((url) => !isUrlExcluded(url, exclude));
|
|
210
|
+
}
|
|
211
|
+
async function runConcurrent(items, concurrency, fn) {
|
|
212
|
+
let idx = 0;
|
|
213
|
+
const workers = Array.from({ length: Math.min(concurrency, items.length) }, async () => {
|
|
214
|
+
while (idx < items.length) await fn(items[idx++]);
|
|
215
|
+
});
|
|
216
|
+
await Promise.all(workers);
|
|
223
217
|
}
|
|
224
218
|
async function crawlAndGenerate(options, onProgress) {
|
|
225
|
-
const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, onPage } = options;
|
|
219
|
+
const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay: userCrawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, onPage } = options;
|
|
226
220
|
const outputDir = resolve(normalize(rawOutputDir));
|
|
227
|
-
|
|
228
|
-
else log.setLevel(log.LEVELS.OFF);
|
|
221
|
+
let crawlDelay = userCrawlDelay;
|
|
229
222
|
let patterns;
|
|
230
223
|
try {
|
|
231
224
|
patterns = globPatterns.length > 0 ? globPatterns : urls.map(parseUrlPattern);
|
|
@@ -233,6 +226,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
233
226
|
throw new Error(`Invalid URL pattern: ${error instanceof Error ? error.message : "Unknown error"}`);
|
|
234
227
|
}
|
|
235
228
|
let startingUrls = patterns.map(getStartingUrl);
|
|
229
|
+
const hasGlobPatterns = patterns.some((p) => p.isGlob);
|
|
236
230
|
const progress = {
|
|
237
231
|
sitemap: {
|
|
238
232
|
status: "discovering",
|
|
@@ -242,7 +236,14 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
242
236
|
crawling: {
|
|
243
237
|
status: "starting",
|
|
244
238
|
total: 0,
|
|
245
|
-
processed: 0
|
|
239
|
+
processed: 0,
|
|
240
|
+
failed: 0,
|
|
241
|
+
latency: {
|
|
242
|
+
total: 0,
|
|
243
|
+
min: Infinity,
|
|
244
|
+
max: 0,
|
|
245
|
+
count: 0
|
|
246
|
+
}
|
|
246
247
|
},
|
|
247
248
|
generation: { status: "idle" }
|
|
248
249
|
};
|
|
@@ -251,51 +252,46 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
251
252
|
const baseUrl = new URL(startingUrls[0]).origin;
|
|
252
253
|
const homePageUrl = baseUrl;
|
|
253
254
|
onProgress?.(progress);
|
|
254
|
-
|
|
255
|
-
const robotsController = new AbortController();
|
|
256
|
-
const robotsTimeoutId = setTimeout(() => robotsController.abort(), 1e4);
|
|
257
|
-
let robotsResponse;
|
|
255
|
+
let robotsContent = null;
|
|
258
256
|
try {
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
257
|
+
robotsContent = await ofetch(`${baseUrl}/robots.txt`, {
|
|
258
|
+
headers: FETCH_HEADERS,
|
|
259
|
+
timeout: 1e4,
|
|
260
|
+
responseType: "text",
|
|
261
|
+
retry: 0
|
|
262
262
|
});
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
263
|
+
} catch {}
|
|
264
|
+
if (robotsContent && !crawlDelay) {
|
|
265
|
+
const crawlDelayMatch = robotsContent.match(ROBOTS_CRAWL_DELAY_RE);
|
|
266
|
+
if (crawlDelayMatch) {
|
|
267
|
+
crawlDelay = Number.parseFloat(crawlDelayMatch[1]);
|
|
268
|
+
p.log(`[ROBOTS] Crawl-delay: ${crawlDelay}s`);
|
|
269
|
+
}
|
|
267
270
|
}
|
|
268
|
-
if (
|
|
269
|
-
const sitemapMatches =
|
|
271
|
+
if (robotsContent) {
|
|
272
|
+
const sitemapMatches = robotsContent.match(ROBOTS_SITEMAP_RE);
|
|
270
273
|
if (sitemapMatches && sitemapMatches.length > 0) {
|
|
271
274
|
progress.sitemap.found = sitemapMatches.length;
|
|
272
275
|
progress.sitemap.status = "processing";
|
|
273
276
|
onProgress?.(progress);
|
|
274
277
|
const robotsSitemaps = sitemapMatches.map((match) => match.replace(ROBOTS_SITEMAP_PREFIX_RE, "").trim());
|
|
275
278
|
for (const sitemapUrl of robotsSitemaps) try {
|
|
276
|
-
const robotsUrls = await
|
|
279
|
+
const robotsUrls = await loadSitemap(sitemapUrl);
|
|
277
280
|
sitemapAttempts.push({
|
|
278
281
|
url: sitemapUrl,
|
|
279
282
|
success: true
|
|
280
283
|
});
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
284
|
+
const filteredUrls = filterSitemapUrls(robotsUrls, hasGlobPatterns, exclude, patterns);
|
|
285
|
+
if (hasGlobPatterns) {
|
|
286
|
+
startingUrls = filteredUrls;
|
|
287
|
+
progress.sitemap.processed = filteredUrls.length;
|
|
288
|
+
onProgress?.(progress);
|
|
289
|
+
break;
|
|
290
|
+
} else if (filteredUrls.length > 0) {
|
|
285
291
|
startingUrls = filteredUrls;
|
|
286
292
|
progress.sitemap.processed = filteredUrls.length;
|
|
287
293
|
onProgress?.(progress);
|
|
288
294
|
break;
|
|
289
|
-
} else {
|
|
290
|
-
const filteredUrls = robotsUrls.filter((url) => {
|
|
291
|
-
return !isUrlExcluded(url, exclude);
|
|
292
|
-
});
|
|
293
|
-
if (filteredUrls.length > 0) {
|
|
294
|
-
startingUrls = filteredUrls;
|
|
295
|
-
progress.sitemap.processed = filteredUrls.length;
|
|
296
|
-
onProgress?.(progress);
|
|
297
|
-
break;
|
|
298
|
-
}
|
|
299
295
|
}
|
|
300
296
|
} catch (error) {
|
|
301
297
|
sitemapAttempts.push({
|
|
@@ -309,31 +305,24 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
309
305
|
let mainSitemapProcessed = false;
|
|
310
306
|
const mainSitemapUrl = `${baseUrl}/sitemap.xml`;
|
|
311
307
|
try {
|
|
312
|
-
const sitemapUrls = await
|
|
308
|
+
const sitemapUrls = await loadSitemap(mainSitemapUrl);
|
|
313
309
|
sitemapAttempts.push({
|
|
314
310
|
url: mainSitemapUrl,
|
|
315
311
|
success: true
|
|
316
312
|
});
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
313
|
+
const filteredUrls = filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, patterns);
|
|
314
|
+
if (hasGlobPatterns) {
|
|
315
|
+
startingUrls = filteredUrls;
|
|
316
|
+
progress.sitemap.found = sitemapUrls.length;
|
|
317
|
+
progress.sitemap.processed = filteredUrls.length;
|
|
318
|
+
onProgress?.(progress);
|
|
319
|
+
mainSitemapProcessed = true;
|
|
320
|
+
} else if (filteredUrls.length > 0) {
|
|
321
321
|
startingUrls = filteredUrls;
|
|
322
322
|
progress.sitemap.found = sitemapUrls.length;
|
|
323
323
|
progress.sitemap.processed = filteredUrls.length;
|
|
324
324
|
onProgress?.(progress);
|
|
325
325
|
mainSitemapProcessed = true;
|
|
326
|
-
} else {
|
|
327
|
-
const filteredUrls = sitemapUrls.filter((url) => {
|
|
328
|
-
return !isUrlExcluded(url, exclude);
|
|
329
|
-
});
|
|
330
|
-
if (filteredUrls.length > 0) {
|
|
331
|
-
startingUrls = filteredUrls;
|
|
332
|
-
progress.sitemap.found = sitemapUrls.length;
|
|
333
|
-
progress.sitemap.processed = filteredUrls.length;
|
|
334
|
-
onProgress?.(progress);
|
|
335
|
-
mainSitemapProcessed = true;
|
|
336
|
-
}
|
|
337
326
|
}
|
|
338
327
|
} catch (error) {
|
|
339
328
|
sitemapAttempts.push({
|
|
@@ -348,31 +337,24 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
348
337
|
`${baseUrl}/sitemap-index.xml`
|
|
349
338
|
];
|
|
350
339
|
for (const sitemapUrl of commonSitemaps) try {
|
|
351
|
-
const altUrls = await
|
|
340
|
+
const altUrls = await loadSitemap(sitemapUrl);
|
|
352
341
|
sitemapAttempts.push({
|
|
353
342
|
url: sitemapUrl,
|
|
354
343
|
success: true
|
|
355
344
|
});
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
345
|
+
const filteredUrls = filterSitemapUrls(altUrls, hasGlobPatterns, exclude, patterns);
|
|
346
|
+
if (hasGlobPatterns) {
|
|
347
|
+
startingUrls = filteredUrls;
|
|
348
|
+
progress.sitemap.found = altUrls.length;
|
|
349
|
+
progress.sitemap.processed = filteredUrls.length;
|
|
350
|
+
onProgress?.(progress);
|
|
351
|
+
break;
|
|
352
|
+
} else if (filteredUrls.length > 0) {
|
|
360
353
|
startingUrls = filteredUrls;
|
|
361
354
|
progress.sitemap.found = altUrls.length;
|
|
362
355
|
progress.sitemap.processed = filteredUrls.length;
|
|
363
356
|
onProgress?.(progress);
|
|
364
357
|
break;
|
|
365
|
-
} else {
|
|
366
|
-
const filteredUrls = altUrls.filter((url) => {
|
|
367
|
-
return !isUrlExcluded(url, exclude);
|
|
368
|
-
});
|
|
369
|
-
if (filteredUrls.length > 0) {
|
|
370
|
-
startingUrls = filteredUrls;
|
|
371
|
-
progress.sitemap.found = altUrls.length;
|
|
372
|
-
progress.sitemap.processed = filteredUrls.length;
|
|
373
|
-
onProgress?.(progress);
|
|
374
|
-
break;
|
|
375
|
-
}
|
|
376
358
|
}
|
|
377
359
|
} catch (error) {
|
|
378
360
|
sitemapAttempts.push({
|
|
@@ -405,151 +387,193 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
405
387
|
progress.crawling.total = startingUrls.length;
|
|
406
388
|
onProgress?.(progress);
|
|
407
389
|
}
|
|
408
|
-
|
|
390
|
+
mkdirSync(outputDir, { recursive: true });
|
|
409
391
|
const results = [];
|
|
410
392
|
const processedUrls = /* @__PURE__ */ new Set();
|
|
411
393
|
const shouldCrawlUrl = (url) => {
|
|
412
394
|
if (isUrlExcluded(url, exclude)) return false;
|
|
413
|
-
if (!
|
|
395
|
+
if (!hasGlobPatterns) return true;
|
|
414
396
|
return patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
415
397
|
};
|
|
416
|
-
const
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
398
|
+
const recordLatency = (ms) => {
|
|
399
|
+
const lat = progress.crawling.latency;
|
|
400
|
+
lat.total += ms;
|
|
401
|
+
lat.count++;
|
|
402
|
+
if (ms < lat.min) lat.min = ms;
|
|
403
|
+
if (ms > lat.max) lat.max = ms;
|
|
404
|
+
};
|
|
405
|
+
const normalizedHomePageUrl = (startingUrls.length > 0 ? new URL(startingUrls[0]).origin : "").replace(URL_TRAILING_SLASH_RE, "");
|
|
406
|
+
const createdDirs = /* @__PURE__ */ new Set();
|
|
407
|
+
const sharedOrigin = origin || "";
|
|
408
|
+
const processPage = async (url, content, initialTitle, depth, isMarkdown = false) => {
|
|
409
|
+
const parsedUrl = new URL(url);
|
|
410
|
+
const shouldProcessMarkdown = shouldCrawlUrl(url);
|
|
411
|
+
const pageOrigin = sharedOrigin || parsedUrl.origin;
|
|
412
|
+
let md;
|
|
413
|
+
let metadata;
|
|
414
|
+
if (isMarkdown) {
|
|
415
|
+
md = content;
|
|
416
|
+
metadata = {
|
|
417
|
+
title: initialTitle || parsedUrl.pathname,
|
|
418
|
+
links: []
|
|
419
|
+
};
|
|
420
|
+
} else {
|
|
421
|
+
const { extraction, getMetadata } = extractMetadataInline(parsedUrl);
|
|
422
|
+
md = htmlToMarkdown(content, {
|
|
423
|
+
origin: pageOrigin,
|
|
424
|
+
extraction
|
|
425
|
+
});
|
|
426
|
+
metadata = getMetadata();
|
|
427
|
+
}
|
|
428
|
+
const title = initialTitle || metadata.title;
|
|
429
|
+
if (onPage && shouldProcessMarkdown) await onPage({
|
|
430
|
+
url,
|
|
431
|
+
html: isMarkdown ? "" : content,
|
|
432
|
+
title,
|
|
433
|
+
metadata,
|
|
434
|
+
origin: pageOrigin
|
|
435
|
+
});
|
|
436
|
+
let filePath;
|
|
437
|
+
if (shouldProcessMarkdown && generateIndividualMd) {
|
|
438
|
+
const safeSegments = (parsedUrl.pathname === "/" ? "/index" : parsedUrl.pathname).replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"));
|
|
439
|
+
filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
|
|
440
|
+
const fileDir = dirname(filePath);
|
|
441
|
+
if (fileDir && !createdDirs.has(fileDir)) {
|
|
442
|
+
await mkdir(fileDir, { recursive: true });
|
|
443
|
+
createdDirs.add(fileDir);
|
|
432
444
|
}
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
url
|
|
439
|
-
html,
|
|
445
|
+
await writeFile(filePath, md, "utf-8");
|
|
446
|
+
}
|
|
447
|
+
const isHomePage = parsedUrl.pathname === "/" && parsedUrl.origin === normalizedHomePageUrl;
|
|
448
|
+
if (shouldProcessMarkdown || isHomePage) {
|
|
449
|
+
const result = {
|
|
450
|
+
url,
|
|
440
451
|
title,
|
|
452
|
+
content: md,
|
|
453
|
+
filePath: shouldProcessMarkdown ? filePath : void 0,
|
|
454
|
+
timestamp: Date.now(),
|
|
455
|
+
success: true,
|
|
441
456
|
metadata,
|
|
442
|
-
|
|
443
|
-
}
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
|
|
453
|
-
await writeFile(filePath, md, "utf-8");
|
|
454
|
-
}
|
|
455
|
-
const isHomePage = request.loadedUrl.replace(URL_TRAILING_SLASH_RE, "") === homePageUrl.replace(URL_TRAILING_SLASH_RE, "");
|
|
456
|
-
if (shouldProcessMarkdown || isHomePage) {
|
|
457
|
-
const result = {
|
|
458
|
-
url: request.loadedUrl,
|
|
459
|
-
title,
|
|
460
|
-
content: md,
|
|
461
|
-
filePath: shouldProcessMarkdown ? filePath : void 0,
|
|
462
|
-
timestamp: startTime,
|
|
463
|
-
success: true,
|
|
464
|
-
metadata,
|
|
465
|
-
depth: request.userData?.depth || 0
|
|
466
|
-
};
|
|
467
|
-
results.push(result);
|
|
468
|
-
progress.crawling.processed = results.length;
|
|
469
|
-
onProgress?.(progress);
|
|
470
|
-
}
|
|
471
|
-
if (followLinks && (request.userData?.depth || 0) < maxDepth) {
|
|
472
|
-
const currentDepth = (request.userData?.depth || 0) + 1;
|
|
473
|
-
const filteredLinks = metadata.links.filter((link) => {
|
|
474
|
-
return shouldCrawlUrl(link);
|
|
475
|
-
});
|
|
476
|
-
if (enqueueLinks) await enqueueLinks({
|
|
477
|
-
urls: filteredLinks,
|
|
478
|
-
userData: { depth: currentDepth }
|
|
479
|
-
});
|
|
480
|
-
else for (const link of filteredLinks) if (!processedUrls.has(link)) processedUrls.add(link);
|
|
481
|
-
}
|
|
482
|
-
};
|
|
457
|
+
depth
|
|
458
|
+
};
|
|
459
|
+
results.push(result);
|
|
460
|
+
progress.crawling.processed = results.length;
|
|
461
|
+
onProgress?.(progress);
|
|
462
|
+
}
|
|
463
|
+
if (followLinks && depth < maxDepth) {
|
|
464
|
+
const filteredLinks = metadata.links.filter((link) => shouldCrawlUrl(link));
|
|
465
|
+
for (const link of filteredLinks) processedUrls.add(link);
|
|
466
|
+
}
|
|
483
467
|
};
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
};
|
|
505
|
-
results.push(result);
|
|
506
|
-
} else if (error) {
|
|
468
|
+
const urlsToProcess = startingUrls.slice(0, maxRequestsPerCrawl);
|
|
469
|
+
progress.crawling.status = "processing";
|
|
470
|
+
progress.crawling.total = urlsToProcess.length;
|
|
471
|
+
onProgress?.(progress);
|
|
472
|
+
if (driver === "playwright") {
|
|
473
|
+
const { log, PlaywrightCrawler, purgeDefaultStorages } = await import("crawlee");
|
|
474
|
+
if (verbose) log.setLevel(log.LEVELS.INFO);
|
|
475
|
+
else log.setLevel(log.LEVELS.OFF);
|
|
476
|
+
const crawlerOptions = {
|
|
477
|
+
requestHandler: async ({ request, page }) => {
|
|
478
|
+
progress.crawling.currentUrl = request.loadedUrl;
|
|
479
|
+
onProgress?.(progress);
|
|
480
|
+
const fetchStart = Date.now();
|
|
481
|
+
await page.waitForLoadState("networkidle");
|
|
482
|
+
const title = await page.title();
|
|
483
|
+
const html = await page.innerHTML("html");
|
|
484
|
+
recordLatency(Date.now() - fetchStart);
|
|
485
|
+
await processPage(request.loadedUrl, html, title, request.userData?.depth || 0);
|
|
486
|
+
},
|
|
487
|
+
errorHandler: async ({ request, response, error }) => {
|
|
488
|
+
if (verbose) console.error(`[ERROR] URL: ${request.url}, Status: ${response?.statusCode || "N/A"}, Error: ${error?.message || "Unknown"}`);
|
|
507
489
|
request.noRetry = true;
|
|
508
|
-
|
|
490
|
+
progress.crawling.failed++;
|
|
491
|
+
results.push({
|
|
509
492
|
url: request.url,
|
|
510
493
|
title: "",
|
|
511
494
|
content: "",
|
|
512
495
|
timestamp: Date.now(),
|
|
513
496
|
success: false,
|
|
514
|
-
error: error
|
|
497
|
+
error: response?.statusCode ? `HTTP ${response.statusCode}` : error?.message || "Unknown error",
|
|
515
498
|
metadata: {
|
|
516
499
|
title: "",
|
|
517
500
|
description: "",
|
|
518
501
|
links: []
|
|
519
502
|
},
|
|
520
503
|
depth: request.userData?.depth || 0
|
|
521
|
-
};
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
if (driver === "playwright") {
|
|
530
|
-
const playwrightOptions = crawlerOptions;
|
|
531
|
-
if (useChrome) playwrightOptions.launchContext = {
|
|
532
|
-
...playwrightOptions.launchContext,
|
|
504
|
+
});
|
|
505
|
+
},
|
|
506
|
+
maxRequestsPerCrawl,
|
|
507
|
+
respectRobotsTxtFile: false
|
|
508
|
+
};
|
|
509
|
+
if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
|
|
510
|
+
if (useChrome) crawlerOptions.launchContext = {
|
|
511
|
+
...crawlerOptions.launchContext,
|
|
533
512
|
useChrome
|
|
534
513
|
};
|
|
535
|
-
crawler = new PlaywrightCrawler(
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
console.error(`[CRAWLER ERROR] ${error instanceof Error ? error.message : "Unknown error"}`);
|
|
549
|
-
console.error(`[CRAWLER ERROR] Stack trace:`, error instanceof Error ? error.stack : "No stack trace");
|
|
514
|
+
const crawler = new PlaywrightCrawler(crawlerOptions);
|
|
515
|
+
const initialRequests = urlsToProcess.map((url) => ({
|
|
516
|
+
url,
|
|
517
|
+
userData: { depth: 0 }
|
|
518
|
+
}));
|
|
519
|
+
try {
|
|
520
|
+
await crawler.run(initialRequests);
|
|
521
|
+
} catch (error) {
|
|
522
|
+
if (verbose) {
|
|
523
|
+
console.error(`[CRAWLER ERROR] ${error instanceof Error ? error.message : "Unknown error"}`);
|
|
524
|
+
console.error(`[CRAWLER ERROR] Stack trace:`, error instanceof Error ? error.stack : "No stack trace");
|
|
525
|
+
}
|
|
526
|
+
throw error;
|
|
550
527
|
}
|
|
551
|
-
|
|
552
|
-
}
|
|
528
|
+
await purgeDefaultStorages();
|
|
529
|
+
} else await runConcurrent(urlsToProcess, DEFAULT_CONCURRENCY, async (url) => {
|
|
530
|
+
progress.crawling.currentUrl = url;
|
|
531
|
+
onProgress?.(progress);
|
|
532
|
+
if (crawlDelay) {
|
|
533
|
+
const delay = crawlDelay;
|
|
534
|
+
await new Promise((resolve) => setTimeout(resolve, delay * 1e3));
|
|
535
|
+
}
|
|
536
|
+
try {
|
|
537
|
+
const fetchStart = Date.now();
|
|
538
|
+
const response = await ofetch.raw(url, {
|
|
539
|
+
headers: FETCH_HEADERS,
|
|
540
|
+
responseType: "text",
|
|
541
|
+
retry: 2,
|
|
542
|
+
retryDelay: 500,
|
|
543
|
+
timeout: 1e4,
|
|
544
|
+
onResponseError({ response }) {
|
|
545
|
+
if (response.status === 429) {
|
|
546
|
+
const retryAfter = response.headers.get("retry-after");
|
|
547
|
+
const delaySec = retryAfter ? Number.parseInt(retryAfter) || 1 : 2;
|
|
548
|
+
if (!crawlDelay || delaySec > crawlDelay) crawlDelay = delaySec;
|
|
549
|
+
}
|
|
550
|
+
}
|
|
551
|
+
});
|
|
552
|
+
recordLatency(Date.now() - fetchStart);
|
|
553
|
+
const body = response._data ?? "";
|
|
554
|
+
const contentType = response.headers.get("content-type") || "";
|
|
555
|
+
await processPage(url, body, "", 0, contentType.includes("text/markdown") || contentType.includes("text/x-markdown"));
|
|
556
|
+
} catch (error) {
|
|
557
|
+
if (verbose) console.error(`[ERROR] URL: ${url}, Error: ${error instanceof Error ? error.message : "Unknown"}`);
|
|
558
|
+
progress.crawling.failed++;
|
|
559
|
+
results.push({
|
|
560
|
+
url,
|
|
561
|
+
title: "",
|
|
562
|
+
content: "",
|
|
563
|
+
timestamp: Date.now(),
|
|
564
|
+
success: false,
|
|
565
|
+
error: error instanceof Error ? error.message : "Unknown error",
|
|
566
|
+
metadata: {
|
|
567
|
+
title: "",
|
|
568
|
+
description: "",
|
|
569
|
+
links: []
|
|
570
|
+
},
|
|
571
|
+
depth: 0
|
|
572
|
+
});
|
|
573
|
+
progress.crawling.processed = results.length;
|
|
574
|
+
onProgress?.(progress);
|
|
575
|
+
}
|
|
576
|
+
});
|
|
553
577
|
progress.crawling.status = "completed";
|
|
554
578
|
onProgress?.(progress);
|
|
555
579
|
if (results.some((r) => r.success)) {
|
|
@@ -557,10 +581,10 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
557
581
|
onProgress?.(progress);
|
|
558
582
|
const successfulResults = results.filter((r) => r.success);
|
|
559
583
|
const firstUrl = new URL(withHttps(urls[0]));
|
|
560
|
-
const
|
|
584
|
+
const originUrl = firstUrl.origin;
|
|
561
585
|
const homePageResult = successfulResults.find((r) => {
|
|
562
586
|
const resultUrl = new URL(withHttps(r.url));
|
|
563
|
-
return resultUrl.href ===
|
|
587
|
+
return resultUrl.href === originUrl || resultUrl.href === `${originUrl}/`;
|
|
564
588
|
});
|
|
565
589
|
const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
|
|
566
590
|
const description = descriptionOverride || homePageResult?.metadata?.description || successfulResults[0]?.metadata?.description;
|
|
@@ -586,7 +610,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
586
610
|
})),
|
|
587
611
|
siteName,
|
|
588
612
|
description,
|
|
589
|
-
origin:
|
|
613
|
+
origin: originUrl || firstUrl.origin,
|
|
590
614
|
generateFull: generateLlmsFullTxt,
|
|
591
615
|
outputDir
|
|
592
616
|
});
|
|
@@ -604,7 +628,6 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
604
628
|
progress.generation.status = "completed";
|
|
605
629
|
onProgress?.(progress);
|
|
606
630
|
}
|
|
607
|
-
await purgeDefaultStorages();
|
|
608
631
|
return results;
|
|
609
632
|
}
|
|
610
633
|
//#endregion
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import * as p from "@clack/prompts";
|
|
2
|
+
import { addDependency } from "nypm";
|
|
3
|
+
//#region src/playwright-utils.ts
|
|
4
|
+
async function checkPlaywrightInstallation() {
|
|
5
|
+
try {
|
|
6
|
+
await import("playwright");
|
|
7
|
+
return true;
|
|
8
|
+
} catch {
|
|
9
|
+
return false;
|
|
10
|
+
}
|
|
11
|
+
}
|
|
12
|
+
async function promptPlaywrightInstall() {
|
|
13
|
+
const shouldInstall = await p.confirm({
|
|
14
|
+
message: "Playwright is required for the Playwright driver. Install it now?",
|
|
15
|
+
initialValue: true
|
|
16
|
+
});
|
|
17
|
+
if (p.isCancel(shouldInstall) || !shouldInstall) return false;
|
|
18
|
+
const s = p.spinner();
|
|
19
|
+
s.start("Installing Playwright globally...");
|
|
20
|
+
try {
|
|
21
|
+
await addDependency("playwright", { global: true });
|
|
22
|
+
s.stop("Playwright installed successfully!");
|
|
23
|
+
return true;
|
|
24
|
+
} catch (fallbackError) {
|
|
25
|
+
s.stop("Failed to install Playwright");
|
|
26
|
+
p.log.error(`Installation failed: ${fallbackError}`);
|
|
27
|
+
return false;
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
async function ensurePlaywrightInstalled() {
|
|
31
|
+
if (await checkPlaywrightInstallation()) return true;
|
|
32
|
+
p.log.warn("Playwright driver selected but Playwright is not installed.");
|
|
33
|
+
if (!await promptPlaywrightInstall()) {
|
|
34
|
+
p.log.error("Cannot proceed with Playwright driver without Playwright installed.");
|
|
35
|
+
return false;
|
|
36
|
+
}
|
|
37
|
+
return true;
|
|
38
|
+
}
|
|
39
|
+
async function isUseChromeSupported() {
|
|
40
|
+
try {
|
|
41
|
+
const { PlaywrightCrawler } = await import("crawlee");
|
|
42
|
+
const crawler = new PlaywrightCrawler({
|
|
43
|
+
launchContext: { useChrome: true },
|
|
44
|
+
requestHandler: async () => {},
|
|
45
|
+
maxRequestsPerCrawl: 1
|
|
46
|
+
});
|
|
47
|
+
const page = await crawler.browserPool.newPage();
|
|
48
|
+
await page.evaluate(() => {
|
|
49
|
+
return window.navigator.userAgent;
|
|
50
|
+
});
|
|
51
|
+
await page.close();
|
|
52
|
+
await crawler.browserPool.closeAllBrowsers();
|
|
53
|
+
crawler.stop();
|
|
54
|
+
return true;
|
|
55
|
+
} catch {}
|
|
56
|
+
return false;
|
|
57
|
+
}
|
|
58
|
+
//#endregion
|
|
59
|
+
export { ensurePlaywrightInstalled, isUseChromeSupported };
|
package/dist/cli.mjs
CHANGED
|
@@ -1,66 +1,9 @@
|
|
|
1
1
|
import { n as parseUrlPattern, r as validateGlobPattern, t as crawlAndGenerate } from "./_chunks/crawl.mjs";
|
|
2
2
|
import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
|
|
3
3
|
import * as p from "@clack/prompts";
|
|
4
|
-
import { PlaywrightCrawler } from "crawlee";
|
|
5
4
|
import { dirname, join, resolve } from "pathe";
|
|
6
5
|
import { withHttps } from "ufo";
|
|
7
6
|
import { fileURLToPath } from "node:url";
|
|
8
|
-
import { addDependency } from "nypm";
|
|
9
|
-
//#region src/playwright-utils.ts
|
|
10
|
-
async function checkPlaywrightInstallation() {
|
|
11
|
-
try {
|
|
12
|
-
await import("playwright");
|
|
13
|
-
return true;
|
|
14
|
-
} catch {
|
|
15
|
-
return false;
|
|
16
|
-
}
|
|
17
|
-
}
|
|
18
|
-
async function promptPlaywrightInstall() {
|
|
19
|
-
const shouldInstall = await p.confirm({
|
|
20
|
-
message: "Playwright is required for the Playwright driver. Install it now?",
|
|
21
|
-
initialValue: true
|
|
22
|
-
});
|
|
23
|
-
if (p.isCancel(shouldInstall) || !shouldInstall) return false;
|
|
24
|
-
const s = p.spinner();
|
|
25
|
-
s.start("Installing Playwright globally...");
|
|
26
|
-
try {
|
|
27
|
-
await addDependency("playwright", { global: true });
|
|
28
|
-
s.stop("Playwright installed successfully!");
|
|
29
|
-
return true;
|
|
30
|
-
} catch (fallbackError) {
|
|
31
|
-
s.stop("Failed to install Playwright");
|
|
32
|
-
p.log.error(`Installation failed: ${fallbackError}`);
|
|
33
|
-
return false;
|
|
34
|
-
}
|
|
35
|
-
}
|
|
36
|
-
async function ensurePlaywrightInstalled() {
|
|
37
|
-
if (await checkPlaywrightInstallation()) return true;
|
|
38
|
-
p.log.warn("Playwright driver selected but Playwright is not installed.");
|
|
39
|
-
if (!await promptPlaywrightInstall()) {
|
|
40
|
-
p.log.error("Cannot proceed with Playwright driver without Playwright installed.");
|
|
41
|
-
return false;
|
|
42
|
-
}
|
|
43
|
-
return true;
|
|
44
|
-
}
|
|
45
|
-
async function isUseChromeSupported() {
|
|
46
|
-
try {
|
|
47
|
-
const crawler = new PlaywrightCrawler({
|
|
48
|
-
launchContext: { useChrome: true },
|
|
49
|
-
requestHandler: async () => {},
|
|
50
|
-
maxRequestsPerCrawl: 1
|
|
51
|
-
});
|
|
52
|
-
const page = await crawler.browserPool.newPage();
|
|
53
|
-
await page.evaluate(() => {
|
|
54
|
-
return window.navigator.userAgent;
|
|
55
|
-
});
|
|
56
|
-
await page.close();
|
|
57
|
-
await crawler.browserPool.closeAllBrowsers();
|
|
58
|
-
crawler.stop();
|
|
59
|
-
return true;
|
|
60
|
-
} catch {}
|
|
61
|
-
return false;
|
|
62
|
-
}
|
|
63
|
-
//#endregion
|
|
64
7
|
//#region src/cli.ts
|
|
65
8
|
const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
|
|
66
9
|
const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
|
|
@@ -225,11 +168,17 @@ async function interactiveCrawl() {
|
|
|
225
168
|
skipSitemap: advancedOptions.skipSitemap
|
|
226
169
|
};
|
|
227
170
|
}
|
|
228
|
-
async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds) {
|
|
171
|
+
async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds, latency) {
|
|
229
172
|
const messages = [];
|
|
230
173
|
const durationStr = `${durationSeconds.toFixed(1)}s`;
|
|
231
|
-
|
|
232
|
-
messages.push(
|
|
174
|
+
messages.push(`📄 ${successful} pages \u00B7 ⏱️ ${durationStr}`);
|
|
175
|
+
if (failed > 0) messages.push(`⚠️ ${failed} failed`);
|
|
176
|
+
if (latency && latency.count > 0) {
|
|
177
|
+
const avg = Math.round(latency.total / latency.count);
|
|
178
|
+
const min = latency.min === Infinity ? 0 : Math.round(latency.min);
|
|
179
|
+
const max = Math.round(latency.max);
|
|
180
|
+
messages.push(`🏓 avg ${avg}ms \u00B7 min ${min}ms \u00B7 max ${max}ms`);
|
|
181
|
+
}
|
|
233
182
|
messages.push(`📦 ${generatedFiles.join(", ")}`);
|
|
234
183
|
messages.push(`📁 ${outputDir}`);
|
|
235
184
|
p.note(messages.join("\n"), "✅ Complete");
|
|
@@ -431,36 +380,46 @@ async function main() {
|
|
|
431
380
|
if (permCheck.error?.includes("Permission denied")) p.log.info("Tip: Try running with elevated privileges (e.g., sudo) or change the output directory permissions.");
|
|
432
381
|
process.exit(1);
|
|
433
382
|
}
|
|
434
|
-
if (options.driver === "playwright")
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
|
|
383
|
+
if (options.driver === "playwright") {
|
|
384
|
+
try {
|
|
385
|
+
await import("crawlee");
|
|
386
|
+
} catch {
|
|
387
|
+
p.log.error("The Playwright driver requires crawlee. Install it with: npm install crawlee");
|
|
440
388
|
process.exit(1);
|
|
441
389
|
}
|
|
442
|
-
|
|
390
|
+
const { ensurePlaywrightInstalled, isUseChromeSupported } = await import("./_chunks/playwright-utils.mjs");
|
|
391
|
+
if (await isUseChromeSupported()) {
|
|
392
|
+
options.useChrome = true;
|
|
393
|
+
p.log.info("System Chrome detected and enabled.");
|
|
394
|
+
} else {
|
|
395
|
+
if (!await ensurePlaywrightInstalled()) {
|
|
396
|
+
p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
|
|
397
|
+
process.exit(1);
|
|
398
|
+
}
|
|
399
|
+
p.log.info("Using global playwright instance.");
|
|
400
|
+
}
|
|
443
401
|
}
|
|
444
402
|
const s = p.spinner();
|
|
445
|
-
s.start("
|
|
403
|
+
s.start("Discovering sitemaps");
|
|
446
404
|
const startTime = Date.now();
|
|
405
|
+
let crawlStartTime = 0;
|
|
406
|
+
let lastProgress;
|
|
447
407
|
const results = await crawlAndGenerate(options, (progress) => {
|
|
448
|
-
|
|
408
|
+
lastProgress = progress;
|
|
409
|
+
if (progress.sitemap.status === "discovering") s.message("Discovering sitemaps");
|
|
449
410
|
else if (progress.sitemap.status === "processing") s.message(`Processing sitemap... Found ${progress.sitemap.found} URLs`);
|
|
450
411
|
else if (progress.crawling.status === "processing") {
|
|
451
|
-
|
|
452
|
-
const
|
|
453
|
-
const
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
s.message(current);
|
|
463
|
-
}
|
|
412
|
+
if (!crawlStartTime) crawlStartTime = Date.now();
|
|
413
|
+
const processed = progress.crawling.processed;
|
|
414
|
+
const total = progress.crawling.total;
|
|
415
|
+
const failed = progress.crawling.failed;
|
|
416
|
+
const elapsed = (Date.now() - crawlStartTime) / 1e3;
|
|
417
|
+
const rate = elapsed > .1 ? Math.round(processed / elapsed) : 0;
|
|
418
|
+
let msg = processed > total ? `Crawling ${processed} pages` : `Crawling ${processed}/${total}`;
|
|
419
|
+
if (rate > 0) msg += ` \u00B7 ${rate}/s`;
|
|
420
|
+
if (failed > 0) msg += ` \u00B7 ${failed} failed`;
|
|
421
|
+
s.message(msg);
|
|
422
|
+
} else if (progress.generation.status === "generating") s.message(progress.generation.current || "Generating files");
|
|
464
423
|
});
|
|
465
424
|
s.stop();
|
|
466
425
|
const durationSeconds = (Date.now() - startTime) / 1e3;
|
|
@@ -484,7 +443,7 @@ async function main() {
|
|
|
484
443
|
if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
|
|
485
444
|
if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
|
|
486
445
|
}
|
|
487
|
-
await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
|
|
446
|
+
await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds, lastProgress?.crawling.latency);
|
|
488
447
|
process.exit(0);
|
|
489
448
|
}
|
|
490
449
|
main().catch((error) => {
|
package/dist/index.d.mts
CHANGED
|
@@ -63,7 +63,14 @@ interface CrawlProgress {
|
|
|
63
63
|
status: 'starting' | 'processing' | 'completed';
|
|
64
64
|
total: number;
|
|
65
65
|
processed: number;
|
|
66
|
-
|
|
66
|
+
failed: number;
|
|
67
|
+
currentUrl?: string; /** Page fetch latency stats in ms */
|
|
68
|
+
latency: {
|
|
69
|
+
total: number;
|
|
70
|
+
min: number;
|
|
71
|
+
max: number;
|
|
72
|
+
count: number;
|
|
73
|
+
};
|
|
67
74
|
};
|
|
68
75
|
generation: {
|
|
69
76
|
status: 'idle' | 'generating' | 'completed';
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "1.0.0-beta.
|
|
4
|
+
"version": "1.0.0-beta.11",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -42,22 +42,26 @@
|
|
|
42
42
|
"dist"
|
|
43
43
|
],
|
|
44
44
|
"peerDependencies": {
|
|
45
|
+
"crawlee": "^3.16.0",
|
|
45
46
|
"playwright": "^1.53.2"
|
|
46
47
|
},
|
|
47
48
|
"peerDependenciesMeta": {
|
|
49
|
+
"crawlee": {
|
|
50
|
+
"optional": true
|
|
51
|
+
},
|
|
48
52
|
"playwright": {
|
|
49
53
|
"optional": true
|
|
50
54
|
}
|
|
51
55
|
},
|
|
52
56
|
"dependencies": {
|
|
53
57
|
"@clack/prompts": "^1.1.0",
|
|
54
|
-
"crawlee": "^3.16.0",
|
|
55
58
|
"nypm": "^0.6.5",
|
|
59
|
+
"ofetch": "^1.5.1",
|
|
56
60
|
"pathe": "^2.0.3",
|
|
57
61
|
"picomatch": "^4.0.3",
|
|
58
62
|
"ufo": "^1.6.3",
|
|
59
|
-
"mdream": "1.0.0-beta.
|
|
60
|
-
"
|
|
63
|
+
"@mdream/js": "1.0.0-beta.11",
|
|
64
|
+
"mdream": "1.0.0-beta.11"
|
|
61
65
|
},
|
|
62
66
|
"devDependencies": {
|
|
63
67
|
"@types/picomatch": "^4.0.2"
|