@mdream/crawl 0.17.1 → 1.0.0-beta.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -11
- package/dist/_chunks/crawl.mjs +314 -290
- package/dist/_chunks/playwright-utils.mjs +59 -0
- package/dist/cli.mjs +42 -83
- package/dist/index.d.mts +9 -12
- package/dist/index.mjs +1 -63
- package/package.json +8 -3
package/README.md
CHANGED
|
@@ -7,7 +7,7 @@ Multi-page website crawler that generates comprehensive llms.txt files by follow
|
|
|
7
7
|
## Installation
|
|
8
8
|
|
|
9
9
|
```bash
|
|
10
|
-
npm install @mdream/crawl
|
|
10
|
+
npm install @mdream/crawl@beta
|
|
11
11
|
```
|
|
12
12
|
|
|
13
13
|
## Usage
|
|
@@ -15,7 +15,7 @@ npm install @mdream/crawl
|
|
|
15
15
|
Simply run the command to start the interactive multi-page website crawler:
|
|
16
16
|
|
|
17
17
|
```bash
|
|
18
|
-
npx @mdream/crawl
|
|
18
|
+
npx @mdream/crawl@beta
|
|
19
19
|
```
|
|
20
20
|
|
|
21
21
|
The crawler will automatically discover and follow internal links to crawl entire websites. The interactive interface provides:
|
|
@@ -31,7 +31,7 @@ The crawler will automatically discover and follow internal links to crawl entir
|
|
|
31
31
|
You can also use @mdream/crawl programmatically in your Node.js applications:
|
|
32
32
|
|
|
33
33
|
```typescript
|
|
34
|
-
import { crawlAndGenerate
|
|
34
|
+
import { crawlAndGenerate } from '@mdream/crawl'
|
|
35
35
|
|
|
36
36
|
// Crawl entire websites programmatically
|
|
37
37
|
const results = await crawlAndGenerate({
|
|
@@ -44,16 +44,20 @@ const results = await crawlAndGenerate({
|
|
|
44
44
|
driver: 'http', // or 'playwright' for JS-heavy sites
|
|
45
45
|
verbose: true
|
|
46
46
|
})
|
|
47
|
+
```
|
|
47
48
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
})
|
|
49
|
+
### Playwright Driver
|
|
50
|
+
|
|
51
|
+
The default HTTP driver works for most sites. For JavaScript-heavy sites that require a browser, install the optional dependencies:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
npm install crawlee playwright
|
|
55
55
|
```
|
|
56
56
|
|
|
57
|
+
Then use `--driver playwright` or `driver: 'playwright'` in the API.
|
|
58
|
+
|
|
59
|
+
> **Note**: llms.txt artifact generation is handled by [`@mdream/js/llms-txt`](../js). The crawl package uses it internally when `generateLlmsTxt: true`.
|
|
60
|
+
|
|
57
61
|
## Output
|
|
58
62
|
|
|
59
63
|
The crawler generates comprehensive output from entire websites:
|
|
@@ -76,7 +80,7 @@ The crawler generates comprehensive output from entire websites:
|
|
|
76
80
|
|
|
77
81
|
- ✅ **Multi-Page Website Crawling**: Designed specifically for crawling entire websites by following internal links
|
|
78
82
|
- ✅ **Purely Interactive**: No complex command-line options to remember
|
|
79
|
-
- ✅ **Dual Crawler Support**: Fast HTTP crawler (default) + Playwright for JavaScript-heavy sites
|
|
83
|
+
- ✅ **Dual Crawler Support**: Fast HTTP crawler (default) + Playwright for JavaScript-heavy sites (requires `crawlee` and `playwright`)
|
|
80
84
|
- ✅ **Smart Link Discovery**: Uses mdream's extraction plugin to find and follow internal links
|
|
81
85
|
- ✅ **Rich Metadata Extraction**: Extracts titles, descriptions, keywords, and author info from all pages
|
|
82
86
|
- ✅ **Comprehensive llms.txt Generation**: Creates complete site documentation files
|
package/dist/_chunks/crawl.mjs
CHANGED
|
@@ -1,16 +1,17 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { writeFile } from "node:fs/promises";
|
|
1
|
+
import { mkdirSync } from "node:fs";
|
|
2
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
3
3
|
import * as p from "@clack/prompts";
|
|
4
|
-
import {
|
|
4
|
+
import { generateLlmsTxtArtifacts } from "@mdream/js/llms-txt";
|
|
5
5
|
import { htmlToMarkdown } from "mdream";
|
|
6
|
-
import {
|
|
7
|
-
import { withMinimalPreset } from "mdream/preset/minimal";
|
|
6
|
+
import { ofetch } from "ofetch";
|
|
8
7
|
import { dirname, join, normalize, resolve } from "pathe";
|
|
9
8
|
import { withHttps } from "ufo";
|
|
10
9
|
import picomatch from "picomatch";
|
|
11
|
-
import { extractionPlugin } from "mdream/plugins";
|
|
12
10
|
//#region src/glob-utils.ts
|
|
13
|
-
|
|
11
|
+
function stripGlobTail(s) {
|
|
12
|
+
const idx = s.indexOf("*");
|
|
13
|
+
return idx === -1 ? s : s.slice(0, idx);
|
|
14
|
+
}
|
|
14
15
|
const GLOB_CHAR_RE = /[*?[]/;
|
|
15
16
|
/**
|
|
16
17
|
* Parse a URL that may contain glob patterns
|
|
@@ -23,7 +24,7 @@ function parseUrlPattern(input) {
|
|
|
23
24
|
isGlob: false
|
|
24
25
|
};
|
|
25
26
|
try {
|
|
26
|
-
const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`)
|
|
27
|
+
const urlWithoutGlob = stripGlobTail(input.startsWith("http") ? input : `https://${input}`);
|
|
27
28
|
const url = new URL(urlWithoutGlob);
|
|
28
29
|
const baseUrl = `${url.protocol}//${url.host}`;
|
|
29
30
|
const patternStart = input.indexOf(url.host) + url.host.length;
|
|
@@ -108,123 +109,116 @@ function validateGlobPattern(pattern) {
|
|
|
108
109
|
}
|
|
109
110
|
}
|
|
110
111
|
//#endregion
|
|
111
|
-
//#region src/
|
|
112
|
-
|
|
113
|
-
|
|
112
|
+
//#region src/crawl.ts
|
|
113
|
+
const SITEMAP_INDEX_LOC_RE = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
|
|
114
|
+
const SITEMAP_URL_LOC_RE = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
|
|
115
|
+
const ROBOTS_SITEMAP_RE = /Sitemap:\s*(.*)/gi;
|
|
116
|
+
const ROBOTS_SITEMAP_PREFIX_RE = /Sitemap:\s*/i;
|
|
117
|
+
const ROBOTS_CRAWL_DELAY_RE = /Crawl-delay:\s*(\d+(?:\.\d+)?)/i;
|
|
118
|
+
const URL_TRAILING_SLASH_RE = /\/$/;
|
|
119
|
+
const URL_PATH_UNSAFE_CHARS_RE = /[^\w\-]/g;
|
|
120
|
+
const FRONTMATTER_BLOCK_RE = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
|
|
121
|
+
const FETCH_HEADERS = {
|
|
122
|
+
"User-Agent": "mdream-crawler/1.0",
|
|
123
|
+
"Accept": "text/html,application/xhtml+xml,text/markdown"
|
|
124
|
+
};
|
|
125
|
+
const DEFAULT_CONCURRENCY = 20;
|
|
126
|
+
function extractCdataUrl(url) {
|
|
127
|
+
if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) return url.slice(9, -3);
|
|
128
|
+
return url;
|
|
129
|
+
}
|
|
130
|
+
async function loadSitemap(sitemapUrl) {
|
|
131
|
+
const xmlContent = await ofetch(sitemapUrl, {
|
|
132
|
+
headers: FETCH_HEADERS,
|
|
133
|
+
timeout: 1e4,
|
|
134
|
+
responseType: "text",
|
|
135
|
+
retry: 0
|
|
136
|
+
});
|
|
137
|
+
if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
|
|
138
|
+
if (xmlContent.includes("<sitemapindex")) {
|
|
139
|
+
SITEMAP_INDEX_LOC_RE.lastIndex = 0;
|
|
140
|
+
const childSitemaps = [];
|
|
141
|
+
let match;
|
|
142
|
+
while (true) {
|
|
143
|
+
match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
|
|
144
|
+
if (match === null) break;
|
|
145
|
+
childSitemaps.push(extractCdataUrl(match[1]));
|
|
146
|
+
}
|
|
147
|
+
const childResults = await Promise.allSettled(childSitemaps.map((url) => loadSitemap(url)));
|
|
148
|
+
const allUrls = [];
|
|
149
|
+
for (const result of childResults) if (result.status === "fulfilled") allUrls.push(...result.value);
|
|
150
|
+
return allUrls;
|
|
151
|
+
}
|
|
152
|
+
const urls = [];
|
|
153
|
+
SITEMAP_URL_LOC_RE.lastIndex = 0;
|
|
154
|
+
let match;
|
|
155
|
+
while (true) {
|
|
156
|
+
match = SITEMAP_URL_LOC_RE.exec(xmlContent);
|
|
157
|
+
if (match === null) break;
|
|
158
|
+
urls.push(extractCdataUrl(match[1]));
|
|
159
|
+
}
|
|
160
|
+
return urls;
|
|
161
|
+
}
|
|
162
|
+
function extractMetadataInline(parsedUrl) {
|
|
163
|
+
const links = /* @__PURE__ */ new Set();
|
|
114
164
|
let title = "";
|
|
115
165
|
let description = "";
|
|
116
166
|
let keywords = "";
|
|
117
167
|
let author = "";
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
168
|
+
const url = parsedUrl.href;
|
|
169
|
+
const originPrefix = `${parsedUrl.origin}/`;
|
|
170
|
+
return {
|
|
171
|
+
extraction: {
|
|
172
|
+
"a[href]": (el) => {
|
|
173
|
+
const href = el.attributes.href;
|
|
122
174
|
if (href) try {
|
|
123
175
|
const absoluteUrl = new URL(href, url).href;
|
|
124
|
-
if (
|
|
176
|
+
if (absoluteUrl.startsWith(originPrefix) || absoluteUrl === parsedUrl.origin) links.add(absoluteUrl);
|
|
125
177
|
} catch {}
|
|
126
178
|
},
|
|
127
|
-
"title": (
|
|
128
|
-
if (!title
|
|
179
|
+
"title": (el) => {
|
|
180
|
+
if (!title) title = el.textContent;
|
|
129
181
|
},
|
|
130
|
-
"meta[name=\"description\"]": (
|
|
131
|
-
if (!description
|
|
182
|
+
"meta[name=\"description\"]": (el) => {
|
|
183
|
+
if (!description) description = el.attributes.content || "";
|
|
132
184
|
},
|
|
133
|
-
"meta[property=\"og:description\"]": (
|
|
134
|
-
if (!description
|
|
185
|
+
"meta[property=\"og:description\"]": (el) => {
|
|
186
|
+
if (!description) description = el.attributes.content || "";
|
|
135
187
|
},
|
|
136
|
-
"meta[name=\"keywords\"]": (
|
|
137
|
-
if (!keywords
|
|
188
|
+
"meta[name=\"keywords\"]": (el) => {
|
|
189
|
+
if (!keywords) keywords = el.attributes.content || "";
|
|
138
190
|
},
|
|
139
|
-
"meta[name=\"author\"]": (
|
|
140
|
-
if (!author
|
|
191
|
+
"meta[name=\"author\"]": (el) => {
|
|
192
|
+
if (!author) author = el.attributes.content || "";
|
|
141
193
|
},
|
|
142
|
-
"meta[property=\"og:title\"]": (
|
|
143
|
-
if (!title
|
|
144
|
-
}
|
|
145
|
-
})],
|
|
146
|
-
origin: new URL(url).origin
|
|
147
|
-
});
|
|
148
|
-
return {
|
|
149
|
-
title: title || new URL(url).pathname,
|
|
150
|
-
description: description || void 0,
|
|
151
|
-
keywords: keywords || void 0,
|
|
152
|
-
author: author || void 0,
|
|
153
|
-
links: links.filter((link) => {
|
|
154
|
-
try {
|
|
155
|
-
const linkUrl = new URL(link);
|
|
156
|
-
const baseUrl = new URL(url);
|
|
157
|
-
return linkUrl.hostname === baseUrl.hostname;
|
|
158
|
-
} catch {
|
|
159
|
-
return false;
|
|
194
|
+
"meta[property=\"og:title\"]": (el) => {
|
|
195
|
+
if (!title) title = el.attributes.content || "";
|
|
160
196
|
}
|
|
197
|
+
},
|
|
198
|
+
getMetadata: () => ({
|
|
199
|
+
title: title.trim() || parsedUrl.pathname,
|
|
200
|
+
description: description.trim() || void 0,
|
|
201
|
+
keywords: keywords.trim() || void 0,
|
|
202
|
+
author: author.trim() || void 0,
|
|
203
|
+
links: [...links]
|
|
161
204
|
})
|
|
162
205
|
};
|
|
163
206
|
}
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
const
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
const controller = new AbortController();
|
|
175
|
-
const timeoutId = setTimeout(() => controller.abort(), 1e4);
|
|
176
|
-
try {
|
|
177
|
-
const response = await fetch(sitemapUrl, {
|
|
178
|
-
signal: controller.signal,
|
|
179
|
-
headers: { "User-Agent": "mdream-crawler/1.0" }
|
|
180
|
-
});
|
|
181
|
-
clearTimeout(timeoutId);
|
|
182
|
-
if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
|
|
183
|
-
const xmlContent = await response.text();
|
|
184
|
-
if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
|
|
185
|
-
if (xmlContent.includes("<sitemapindex")) {
|
|
186
|
-
SITEMAP_INDEX_LOC_RE.lastIndex = 0;
|
|
187
|
-
const childSitemaps = [];
|
|
188
|
-
let match;
|
|
189
|
-
while (true) {
|
|
190
|
-
match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
|
|
191
|
-
if (match === null) break;
|
|
192
|
-
let url = match[1];
|
|
193
|
-
if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
|
|
194
|
-
childSitemaps.push(url);
|
|
195
|
-
}
|
|
196
|
-
const allUrls = [];
|
|
197
|
-
for (const childSitemapUrl of childSitemaps) try {
|
|
198
|
-
const childUrls = await loadSitemapWithoutRetries(childSitemapUrl);
|
|
199
|
-
allUrls.push(...childUrls);
|
|
200
|
-
} catch (error) {
|
|
201
|
-
console.warn(`Failed to load child sitemap ${childSitemapUrl}:`, error instanceof Error ? error.message : "Unknown error");
|
|
202
|
-
}
|
|
203
|
-
return allUrls;
|
|
204
|
-
} else {
|
|
205
|
-
const urls = [];
|
|
206
|
-
SITEMAP_URL_LOC_RE.lastIndex = 0;
|
|
207
|
-
let match;
|
|
208
|
-
while (true) {
|
|
209
|
-
match = SITEMAP_URL_LOC_RE.exec(xmlContent);
|
|
210
|
-
if (match === null) break;
|
|
211
|
-
let url = match[1];
|
|
212
|
-
if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
|
|
213
|
-
urls.push(url);
|
|
214
|
-
}
|
|
215
|
-
return urls;
|
|
216
|
-
}
|
|
217
|
-
} catch (error) {
|
|
218
|
-
clearTimeout(timeoutId);
|
|
219
|
-
if (error instanceof Error && error.name === "AbortError") throw new Error("Sitemap request timed out after 10 seconds");
|
|
220
|
-
throw error;
|
|
221
|
-
}
|
|
207
|
+
function filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, allPatterns) {
|
|
208
|
+
if (hasGlobPatterns) return sitemapUrls.filter((url) => !isUrlExcluded(url, exclude) && allPatterns.some((pattern) => matchesGlobPattern(url, pattern)));
|
|
209
|
+
return sitemapUrls.filter((url) => !isUrlExcluded(url, exclude));
|
|
210
|
+
}
|
|
211
|
+
async function runConcurrent(items, concurrency, fn) {
|
|
212
|
+
let idx = 0;
|
|
213
|
+
const workers = Array.from({ length: Math.min(concurrency, items.length) }, async () => {
|
|
214
|
+
while (idx < items.length) await fn(items[idx++]);
|
|
215
|
+
});
|
|
216
|
+
await Promise.all(workers);
|
|
222
217
|
}
|
|
223
218
|
async function crawlAndGenerate(options, onProgress) {
|
|
224
|
-
const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, onPage } = options;
|
|
219
|
+
const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay: userCrawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, onPage } = options;
|
|
225
220
|
const outputDir = resolve(normalize(rawOutputDir));
|
|
226
|
-
|
|
227
|
-
else log.setLevel(log.LEVELS.OFF);
|
|
221
|
+
let crawlDelay = userCrawlDelay;
|
|
228
222
|
let patterns;
|
|
229
223
|
try {
|
|
230
224
|
patterns = globPatterns.length > 0 ? globPatterns : urls.map(parseUrlPattern);
|
|
@@ -232,6 +226,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
232
226
|
throw new Error(`Invalid URL pattern: ${error instanceof Error ? error.message : "Unknown error"}`);
|
|
233
227
|
}
|
|
234
228
|
let startingUrls = patterns.map(getStartingUrl);
|
|
229
|
+
const hasGlobPatterns = patterns.some((p) => p.isGlob);
|
|
235
230
|
const progress = {
|
|
236
231
|
sitemap: {
|
|
237
232
|
status: "discovering",
|
|
@@ -241,7 +236,14 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
241
236
|
crawling: {
|
|
242
237
|
status: "starting",
|
|
243
238
|
total: 0,
|
|
244
|
-
processed: 0
|
|
239
|
+
processed: 0,
|
|
240
|
+
failed: 0,
|
|
241
|
+
latency: {
|
|
242
|
+
total: 0,
|
|
243
|
+
min: Infinity,
|
|
244
|
+
max: 0,
|
|
245
|
+
count: 0
|
|
246
|
+
}
|
|
245
247
|
},
|
|
246
248
|
generation: { status: "idle" }
|
|
247
249
|
};
|
|
@@ -250,51 +252,46 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
250
252
|
const baseUrl = new URL(startingUrls[0]).origin;
|
|
251
253
|
const homePageUrl = baseUrl;
|
|
252
254
|
onProgress?.(progress);
|
|
253
|
-
|
|
254
|
-
const robotsController = new AbortController();
|
|
255
|
-
const robotsTimeoutId = setTimeout(() => robotsController.abort(), 1e4);
|
|
256
|
-
let robotsResponse;
|
|
255
|
+
let robotsContent = null;
|
|
257
256
|
try {
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
257
|
+
robotsContent = await ofetch(`${baseUrl}/robots.txt`, {
|
|
258
|
+
headers: FETCH_HEADERS,
|
|
259
|
+
timeout: 1e4,
|
|
260
|
+
responseType: "text",
|
|
261
|
+
retry: 0
|
|
261
262
|
});
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
263
|
+
} catch {}
|
|
264
|
+
if (robotsContent && !crawlDelay) {
|
|
265
|
+
const crawlDelayMatch = robotsContent.match(ROBOTS_CRAWL_DELAY_RE);
|
|
266
|
+
if (crawlDelayMatch) {
|
|
267
|
+
crawlDelay = Number.parseFloat(crawlDelayMatch[1]);
|
|
268
|
+
p.log(`[ROBOTS] Crawl-delay: ${crawlDelay}s`);
|
|
269
|
+
}
|
|
266
270
|
}
|
|
267
|
-
if (
|
|
268
|
-
const sitemapMatches =
|
|
271
|
+
if (robotsContent) {
|
|
272
|
+
const sitemapMatches = robotsContent.match(ROBOTS_SITEMAP_RE);
|
|
269
273
|
if (sitemapMatches && sitemapMatches.length > 0) {
|
|
270
274
|
progress.sitemap.found = sitemapMatches.length;
|
|
271
275
|
progress.sitemap.status = "processing";
|
|
272
276
|
onProgress?.(progress);
|
|
273
277
|
const robotsSitemaps = sitemapMatches.map((match) => match.replace(ROBOTS_SITEMAP_PREFIX_RE, "").trim());
|
|
274
278
|
for (const sitemapUrl of robotsSitemaps) try {
|
|
275
|
-
const robotsUrls = await
|
|
279
|
+
const robotsUrls = await loadSitemap(sitemapUrl);
|
|
276
280
|
sitemapAttempts.push({
|
|
277
281
|
url: sitemapUrl,
|
|
278
282
|
success: true
|
|
279
283
|
});
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
+
const filteredUrls = filterSitemapUrls(robotsUrls, hasGlobPatterns, exclude, patterns);
|
|
285
|
+
if (hasGlobPatterns) {
|
|
286
|
+
startingUrls = filteredUrls;
|
|
287
|
+
progress.sitemap.processed = filteredUrls.length;
|
|
288
|
+
onProgress?.(progress);
|
|
289
|
+
break;
|
|
290
|
+
} else if (filteredUrls.length > 0) {
|
|
284
291
|
startingUrls = filteredUrls;
|
|
285
292
|
progress.sitemap.processed = filteredUrls.length;
|
|
286
293
|
onProgress?.(progress);
|
|
287
294
|
break;
|
|
288
|
-
} else {
|
|
289
|
-
const filteredUrls = robotsUrls.filter((url) => {
|
|
290
|
-
return !isUrlExcluded(url, exclude);
|
|
291
|
-
});
|
|
292
|
-
if (filteredUrls.length > 0) {
|
|
293
|
-
startingUrls = filteredUrls;
|
|
294
|
-
progress.sitemap.processed = filteredUrls.length;
|
|
295
|
-
onProgress?.(progress);
|
|
296
|
-
break;
|
|
297
|
-
}
|
|
298
295
|
}
|
|
299
296
|
} catch (error) {
|
|
300
297
|
sitemapAttempts.push({
|
|
@@ -308,31 +305,24 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
308
305
|
let mainSitemapProcessed = false;
|
|
309
306
|
const mainSitemapUrl = `${baseUrl}/sitemap.xml`;
|
|
310
307
|
try {
|
|
311
|
-
const sitemapUrls = await
|
|
308
|
+
const sitemapUrls = await loadSitemap(mainSitemapUrl);
|
|
312
309
|
sitemapAttempts.push({
|
|
313
310
|
url: mainSitemapUrl,
|
|
314
311
|
success: true
|
|
315
312
|
});
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
313
|
+
const filteredUrls = filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, patterns);
|
|
314
|
+
if (hasGlobPatterns) {
|
|
315
|
+
startingUrls = filteredUrls;
|
|
316
|
+
progress.sitemap.found = sitemapUrls.length;
|
|
317
|
+
progress.sitemap.processed = filteredUrls.length;
|
|
318
|
+
onProgress?.(progress);
|
|
319
|
+
mainSitemapProcessed = true;
|
|
320
|
+
} else if (filteredUrls.length > 0) {
|
|
320
321
|
startingUrls = filteredUrls;
|
|
321
322
|
progress.sitemap.found = sitemapUrls.length;
|
|
322
323
|
progress.sitemap.processed = filteredUrls.length;
|
|
323
324
|
onProgress?.(progress);
|
|
324
325
|
mainSitemapProcessed = true;
|
|
325
|
-
} else {
|
|
326
|
-
const filteredUrls = sitemapUrls.filter((url) => {
|
|
327
|
-
return !isUrlExcluded(url, exclude);
|
|
328
|
-
});
|
|
329
|
-
if (filteredUrls.length > 0) {
|
|
330
|
-
startingUrls = filteredUrls;
|
|
331
|
-
progress.sitemap.found = sitemapUrls.length;
|
|
332
|
-
progress.sitemap.processed = filteredUrls.length;
|
|
333
|
-
onProgress?.(progress);
|
|
334
|
-
mainSitemapProcessed = true;
|
|
335
|
-
}
|
|
336
326
|
}
|
|
337
327
|
} catch (error) {
|
|
338
328
|
sitemapAttempts.push({
|
|
@@ -347,31 +337,24 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
347
337
|
`${baseUrl}/sitemap-index.xml`
|
|
348
338
|
];
|
|
349
339
|
for (const sitemapUrl of commonSitemaps) try {
|
|
350
|
-
const altUrls = await
|
|
340
|
+
const altUrls = await loadSitemap(sitemapUrl);
|
|
351
341
|
sitemapAttempts.push({
|
|
352
342
|
url: sitemapUrl,
|
|
353
343
|
success: true
|
|
354
344
|
});
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
345
|
+
const filteredUrls = filterSitemapUrls(altUrls, hasGlobPatterns, exclude, patterns);
|
|
346
|
+
if (hasGlobPatterns) {
|
|
347
|
+
startingUrls = filteredUrls;
|
|
348
|
+
progress.sitemap.found = altUrls.length;
|
|
349
|
+
progress.sitemap.processed = filteredUrls.length;
|
|
350
|
+
onProgress?.(progress);
|
|
351
|
+
break;
|
|
352
|
+
} else if (filteredUrls.length > 0) {
|
|
359
353
|
startingUrls = filteredUrls;
|
|
360
354
|
progress.sitemap.found = altUrls.length;
|
|
361
355
|
progress.sitemap.processed = filteredUrls.length;
|
|
362
356
|
onProgress?.(progress);
|
|
363
357
|
break;
|
|
364
|
-
} else {
|
|
365
|
-
const filteredUrls = altUrls.filter((url) => {
|
|
366
|
-
return !isUrlExcluded(url, exclude);
|
|
367
|
-
});
|
|
368
|
-
if (filteredUrls.length > 0) {
|
|
369
|
-
startingUrls = filteredUrls;
|
|
370
|
-
progress.sitemap.found = altUrls.length;
|
|
371
|
-
progress.sitemap.processed = filteredUrls.length;
|
|
372
|
-
onProgress?.(progress);
|
|
373
|
-
break;
|
|
374
|
-
}
|
|
375
358
|
}
|
|
376
359
|
} catch (error) {
|
|
377
360
|
sitemapAttempts.push({
|
|
@@ -404,151 +387,193 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
404
387
|
progress.crawling.total = startingUrls.length;
|
|
405
388
|
onProgress?.(progress);
|
|
406
389
|
}
|
|
407
|
-
|
|
390
|
+
mkdirSync(outputDir, { recursive: true });
|
|
408
391
|
const results = [];
|
|
409
392
|
const processedUrls = /* @__PURE__ */ new Set();
|
|
410
393
|
const shouldCrawlUrl = (url) => {
|
|
411
394
|
if (isUrlExcluded(url, exclude)) return false;
|
|
412
|
-
if (!
|
|
395
|
+
if (!hasGlobPatterns) return true;
|
|
413
396
|
return patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
414
397
|
};
|
|
415
|
-
const
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
398
|
+
const recordLatency = (ms) => {
|
|
399
|
+
const lat = progress.crawling.latency;
|
|
400
|
+
lat.total += ms;
|
|
401
|
+
lat.count++;
|
|
402
|
+
if (ms < lat.min) lat.min = ms;
|
|
403
|
+
if (ms > lat.max) lat.max = ms;
|
|
404
|
+
};
|
|
405
|
+
const normalizedHomePageUrl = (startingUrls.length > 0 ? new URL(startingUrls[0]).origin : "").replace(URL_TRAILING_SLASH_RE, "");
|
|
406
|
+
const createdDirs = /* @__PURE__ */ new Set();
|
|
407
|
+
const sharedOrigin = origin || "";
|
|
408
|
+
const processPage = async (url, content, initialTitle, depth, isMarkdown = false) => {
|
|
409
|
+
const parsedUrl = new URL(url);
|
|
410
|
+
const shouldProcessMarkdown = shouldCrawlUrl(url);
|
|
411
|
+
const pageOrigin = sharedOrigin || parsedUrl.origin;
|
|
412
|
+
let md;
|
|
413
|
+
let metadata;
|
|
414
|
+
if (isMarkdown) {
|
|
415
|
+
md = content;
|
|
416
|
+
metadata = {
|
|
417
|
+
title: initialTitle || parsedUrl.pathname,
|
|
418
|
+
links: []
|
|
419
|
+
};
|
|
420
|
+
} else {
|
|
421
|
+
const { extraction, getMetadata } = extractMetadataInline(parsedUrl);
|
|
422
|
+
md = htmlToMarkdown(content, {
|
|
423
|
+
origin: pageOrigin,
|
|
424
|
+
extraction
|
|
425
|
+
});
|
|
426
|
+
metadata = getMetadata();
|
|
427
|
+
}
|
|
428
|
+
const title = initialTitle || metadata.title;
|
|
429
|
+
if (onPage && shouldProcessMarkdown) await onPage({
|
|
430
|
+
url,
|
|
431
|
+
html: isMarkdown ? "" : content,
|
|
432
|
+
title,
|
|
433
|
+
metadata,
|
|
434
|
+
origin: pageOrigin
|
|
435
|
+
});
|
|
436
|
+
let filePath;
|
|
437
|
+
if (shouldProcessMarkdown && generateIndividualMd) {
|
|
438
|
+
const safeSegments = (parsedUrl.pathname === "/" ? "/index" : parsedUrl.pathname).replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"));
|
|
439
|
+
filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
|
|
440
|
+
const fileDir = dirname(filePath);
|
|
441
|
+
if (fileDir && !createdDirs.has(fileDir)) {
|
|
442
|
+
await mkdir(fileDir, { recursive: true });
|
|
443
|
+
createdDirs.add(fileDir);
|
|
431
444
|
}
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
url
|
|
438
|
-
html,
|
|
445
|
+
await writeFile(filePath, md, "utf-8");
|
|
446
|
+
}
|
|
447
|
+
const isHomePage = parsedUrl.pathname === "/" && parsedUrl.origin === normalizedHomePageUrl;
|
|
448
|
+
if (shouldProcessMarkdown || isHomePage) {
|
|
449
|
+
const result = {
|
|
450
|
+
url,
|
|
439
451
|
title,
|
|
452
|
+
content: md,
|
|
453
|
+
filePath: shouldProcessMarkdown ? filePath : void 0,
|
|
454
|
+
timestamp: Date.now(),
|
|
455
|
+
success: true,
|
|
440
456
|
metadata,
|
|
441
|
-
|
|
442
|
-
}
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
|
|
452
|
-
await writeFile(filePath, md, "utf-8");
|
|
453
|
-
}
|
|
454
|
-
const isHomePage = request.loadedUrl.replace(URL_TRAILING_SLASH_RE, "") === homePageUrl.replace(URL_TRAILING_SLASH_RE, "");
|
|
455
|
-
if (shouldProcessMarkdown || isHomePage) {
|
|
456
|
-
const result = {
|
|
457
|
-
url: request.loadedUrl,
|
|
458
|
-
title,
|
|
459
|
-
content: md,
|
|
460
|
-
filePath: shouldProcessMarkdown ? filePath : void 0,
|
|
461
|
-
timestamp: startTime,
|
|
462
|
-
success: true,
|
|
463
|
-
metadata,
|
|
464
|
-
depth: request.userData?.depth || 0
|
|
465
|
-
};
|
|
466
|
-
results.push(result);
|
|
467
|
-
progress.crawling.processed = results.length;
|
|
468
|
-
onProgress?.(progress);
|
|
469
|
-
}
|
|
470
|
-
if (followLinks && (request.userData?.depth || 0) < maxDepth) {
|
|
471
|
-
const currentDepth = (request.userData?.depth || 0) + 1;
|
|
472
|
-
const filteredLinks = metadata.links.filter((link) => {
|
|
473
|
-
return shouldCrawlUrl(link);
|
|
474
|
-
});
|
|
475
|
-
if (enqueueLinks) await enqueueLinks({
|
|
476
|
-
urls: filteredLinks,
|
|
477
|
-
userData: { depth: currentDepth }
|
|
478
|
-
});
|
|
479
|
-
else for (const link of filteredLinks) if (!processedUrls.has(link)) processedUrls.add(link);
|
|
480
|
-
}
|
|
481
|
-
};
|
|
457
|
+
depth
|
|
458
|
+
};
|
|
459
|
+
results.push(result);
|
|
460
|
+
progress.crawling.processed = results.length;
|
|
461
|
+
onProgress?.(progress);
|
|
462
|
+
}
|
|
463
|
+
if (followLinks && depth < maxDepth) {
|
|
464
|
+
const filteredLinks = metadata.links.filter((link) => shouldCrawlUrl(link));
|
|
465
|
+
for (const link of filteredLinks) processedUrls.add(link);
|
|
466
|
+
}
|
|
482
467
|
};
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
};
|
|
504
|
-
results.push(result);
|
|
505
|
-
} else if (error) {
|
|
468
|
+
const urlsToProcess = startingUrls.slice(0, maxRequestsPerCrawl);
|
|
469
|
+
progress.crawling.status = "processing";
|
|
470
|
+
progress.crawling.total = urlsToProcess.length;
|
|
471
|
+
onProgress?.(progress);
|
|
472
|
+
if (driver === "playwright") {
|
|
473
|
+
const { log, PlaywrightCrawler, purgeDefaultStorages } = await import("crawlee");
|
|
474
|
+
if (verbose) log.setLevel(log.LEVELS.INFO);
|
|
475
|
+
else log.setLevel(log.LEVELS.OFF);
|
|
476
|
+
const crawlerOptions = {
|
|
477
|
+
requestHandler: async ({ request, page }) => {
|
|
478
|
+
progress.crawling.currentUrl = request.loadedUrl;
|
|
479
|
+
onProgress?.(progress);
|
|
480
|
+
const fetchStart = Date.now();
|
|
481
|
+
await page.waitForLoadState("networkidle");
|
|
482
|
+
const title = await page.title();
|
|
483
|
+
const html = await page.innerHTML("html");
|
|
484
|
+
recordLatency(Date.now() - fetchStart);
|
|
485
|
+
await processPage(request.loadedUrl, html, title, request.userData?.depth || 0);
|
|
486
|
+
},
|
|
487
|
+
errorHandler: async ({ request, response, error }) => {
|
|
488
|
+
if (verbose) console.error(`[ERROR] URL: ${request.url}, Status: ${response?.statusCode || "N/A"}, Error: ${error?.message || "Unknown"}`);
|
|
506
489
|
request.noRetry = true;
|
|
507
|
-
|
|
490
|
+
progress.crawling.failed++;
|
|
491
|
+
results.push({
|
|
508
492
|
url: request.url,
|
|
509
493
|
title: "",
|
|
510
494
|
content: "",
|
|
511
495
|
timestamp: Date.now(),
|
|
512
496
|
success: false,
|
|
513
|
-
error: error
|
|
497
|
+
error: response?.statusCode ? `HTTP ${response.statusCode}` : error?.message || "Unknown error",
|
|
514
498
|
metadata: {
|
|
515
499
|
title: "",
|
|
516
500
|
description: "",
|
|
517
501
|
links: []
|
|
518
502
|
},
|
|
519
503
|
depth: request.userData?.depth || 0
|
|
520
|
-
};
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
if (driver === "playwright") {
|
|
529
|
-
const playwrightOptions = crawlerOptions;
|
|
530
|
-
if (useChrome) playwrightOptions.launchContext = {
|
|
531
|
-
...playwrightOptions.launchContext,
|
|
504
|
+
});
|
|
505
|
+
},
|
|
506
|
+
maxRequestsPerCrawl,
|
|
507
|
+
respectRobotsTxtFile: false
|
|
508
|
+
};
|
|
509
|
+
if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
|
|
510
|
+
if (useChrome) crawlerOptions.launchContext = {
|
|
511
|
+
...crawlerOptions.launchContext,
|
|
532
512
|
useChrome
|
|
533
513
|
};
|
|
534
|
-
crawler = new PlaywrightCrawler(
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
console.error(`[CRAWLER ERROR] ${error instanceof Error ? error.message : "Unknown error"}`);
|
|
548
|
-
console.error(`[CRAWLER ERROR] Stack trace:`, error instanceof Error ? error.stack : "No stack trace");
|
|
514
|
+
const crawler = new PlaywrightCrawler(crawlerOptions);
|
|
515
|
+
const initialRequests = urlsToProcess.map((url) => ({
|
|
516
|
+
url,
|
|
517
|
+
userData: { depth: 0 }
|
|
518
|
+
}));
|
|
519
|
+
try {
|
|
520
|
+
await crawler.run(initialRequests);
|
|
521
|
+
} catch (error) {
|
|
522
|
+
if (verbose) {
|
|
523
|
+
console.error(`[CRAWLER ERROR] ${error instanceof Error ? error.message : "Unknown error"}`);
|
|
524
|
+
console.error(`[CRAWLER ERROR] Stack trace:`, error instanceof Error ? error.stack : "No stack trace");
|
|
525
|
+
}
|
|
526
|
+
throw error;
|
|
549
527
|
}
|
|
550
|
-
|
|
551
|
-
}
|
|
528
|
+
await purgeDefaultStorages();
|
|
529
|
+
} else await runConcurrent(urlsToProcess, DEFAULT_CONCURRENCY, async (url) => {
|
|
530
|
+
progress.crawling.currentUrl = url;
|
|
531
|
+
onProgress?.(progress);
|
|
532
|
+
if (crawlDelay) {
|
|
533
|
+
const delay = crawlDelay;
|
|
534
|
+
await new Promise((resolve) => setTimeout(resolve, delay * 1e3));
|
|
535
|
+
}
|
|
536
|
+
try {
|
|
537
|
+
const fetchStart = Date.now();
|
|
538
|
+
const response = await ofetch.raw(url, {
|
|
539
|
+
headers: FETCH_HEADERS,
|
|
540
|
+
responseType: "text",
|
|
541
|
+
retry: 2,
|
|
542
|
+
retryDelay: 500,
|
|
543
|
+
timeout: 1e4,
|
|
544
|
+
onResponseError({ response }) {
|
|
545
|
+
if (response.status === 429) {
|
|
546
|
+
const retryAfter = response.headers.get("retry-after");
|
|
547
|
+
const delaySec = retryAfter ? Number.parseInt(retryAfter) || 1 : 2;
|
|
548
|
+
if (!crawlDelay || delaySec > crawlDelay) crawlDelay = delaySec;
|
|
549
|
+
}
|
|
550
|
+
}
|
|
551
|
+
});
|
|
552
|
+
recordLatency(Date.now() - fetchStart);
|
|
553
|
+
const body = response._data ?? "";
|
|
554
|
+
const contentType = response.headers.get("content-type") || "";
|
|
555
|
+
await processPage(url, body, "", 0, contentType.includes("text/markdown") || contentType.includes("text/x-markdown"));
|
|
556
|
+
} catch (error) {
|
|
557
|
+
if (verbose) console.error(`[ERROR] URL: ${url}, Error: ${error instanceof Error ? error.message : "Unknown"}`);
|
|
558
|
+
progress.crawling.failed++;
|
|
559
|
+
results.push({
|
|
560
|
+
url,
|
|
561
|
+
title: "",
|
|
562
|
+
content: "",
|
|
563
|
+
timestamp: Date.now(),
|
|
564
|
+
success: false,
|
|
565
|
+
error: error instanceof Error ? error.message : "Unknown error",
|
|
566
|
+
metadata: {
|
|
567
|
+
title: "",
|
|
568
|
+
description: "",
|
|
569
|
+
links: []
|
|
570
|
+
},
|
|
571
|
+
depth: 0
|
|
572
|
+
});
|
|
573
|
+
progress.crawling.processed = results.length;
|
|
574
|
+
onProgress?.(progress);
|
|
575
|
+
}
|
|
576
|
+
});
|
|
552
577
|
progress.crawling.status = "completed";
|
|
553
578
|
onProgress?.(progress);
|
|
554
579
|
if (results.some((r) => r.success)) {
|
|
@@ -556,10 +581,10 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
556
581
|
onProgress?.(progress);
|
|
557
582
|
const successfulResults = results.filter((r) => r.success);
|
|
558
583
|
const firstUrl = new URL(withHttps(urls[0]));
|
|
559
|
-
const
|
|
584
|
+
const originUrl = firstUrl.origin;
|
|
560
585
|
const homePageResult = successfulResults.find((r) => {
|
|
561
586
|
const resultUrl = new URL(withHttps(r.url));
|
|
562
|
-
return resultUrl.href ===
|
|
587
|
+
return resultUrl.href === originUrl || resultUrl.href === `${originUrl}/`;
|
|
563
588
|
});
|
|
564
589
|
const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
|
|
565
590
|
const description = descriptionOverride || homePageResult?.metadata?.description || successfulResults[0]?.metadata?.description;
|
|
@@ -585,7 +610,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
585
610
|
})),
|
|
586
611
|
siteName,
|
|
587
612
|
description,
|
|
588
|
-
origin:
|
|
613
|
+
origin: originUrl || firstUrl.origin,
|
|
589
614
|
generateFull: generateLlmsFullTxt,
|
|
590
615
|
outputDir
|
|
591
616
|
});
|
|
@@ -603,7 +628,6 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
603
628
|
progress.generation.status = "completed";
|
|
604
629
|
onProgress?.(progress);
|
|
605
630
|
}
|
|
606
|
-
await purgeDefaultStorages();
|
|
607
631
|
return results;
|
|
608
632
|
}
|
|
609
633
|
//#endregion
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import * as p from "@clack/prompts";
|
|
2
|
+
import { addDependency } from "nypm";
|
|
3
|
+
//#region src/playwright-utils.ts
|
|
4
|
+
async function checkPlaywrightInstallation() {
|
|
5
|
+
try {
|
|
6
|
+
await import("playwright");
|
|
7
|
+
return true;
|
|
8
|
+
} catch {
|
|
9
|
+
return false;
|
|
10
|
+
}
|
|
11
|
+
}
|
|
12
|
+
async function promptPlaywrightInstall() {
|
|
13
|
+
const shouldInstall = await p.confirm({
|
|
14
|
+
message: "Playwright is required for the Playwright driver. Install it now?",
|
|
15
|
+
initialValue: true
|
|
16
|
+
});
|
|
17
|
+
if (p.isCancel(shouldInstall) || !shouldInstall) return false;
|
|
18
|
+
const s = p.spinner();
|
|
19
|
+
s.start("Installing Playwright globally...");
|
|
20
|
+
try {
|
|
21
|
+
await addDependency("playwright", { global: true });
|
|
22
|
+
s.stop("Playwright installed successfully!");
|
|
23
|
+
return true;
|
|
24
|
+
} catch (fallbackError) {
|
|
25
|
+
s.stop("Failed to install Playwright");
|
|
26
|
+
p.log.error(`Installation failed: ${fallbackError}`);
|
|
27
|
+
return false;
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
async function ensurePlaywrightInstalled() {
|
|
31
|
+
if (await checkPlaywrightInstallation()) return true;
|
|
32
|
+
p.log.warn("Playwright driver selected but Playwright is not installed.");
|
|
33
|
+
if (!await promptPlaywrightInstall()) {
|
|
34
|
+
p.log.error("Cannot proceed with Playwright driver without Playwright installed.");
|
|
35
|
+
return false;
|
|
36
|
+
}
|
|
37
|
+
return true;
|
|
38
|
+
}
|
|
39
|
+
async function isUseChromeSupported() {
|
|
40
|
+
try {
|
|
41
|
+
const { PlaywrightCrawler } = await import("crawlee");
|
|
42
|
+
const crawler = new PlaywrightCrawler({
|
|
43
|
+
launchContext: { useChrome: true },
|
|
44
|
+
requestHandler: async () => {},
|
|
45
|
+
maxRequestsPerCrawl: 1
|
|
46
|
+
});
|
|
47
|
+
const page = await crawler.browserPool.newPage();
|
|
48
|
+
await page.evaluate(() => {
|
|
49
|
+
return window.navigator.userAgent;
|
|
50
|
+
});
|
|
51
|
+
await page.close();
|
|
52
|
+
await crawler.browserPool.closeAllBrowsers();
|
|
53
|
+
crawler.stop();
|
|
54
|
+
return true;
|
|
55
|
+
} catch {}
|
|
56
|
+
return false;
|
|
57
|
+
}
|
|
58
|
+
//#endregion
|
|
59
|
+
export { ensurePlaywrightInstalled, isUseChromeSupported };
|
package/dist/cli.mjs
CHANGED
|
@@ -1,66 +1,9 @@
|
|
|
1
1
|
import { n as parseUrlPattern, r as validateGlobPattern, t as crawlAndGenerate } from "./_chunks/crawl.mjs";
|
|
2
2
|
import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
|
|
3
3
|
import * as p from "@clack/prompts";
|
|
4
|
-
import { PlaywrightCrawler } from "crawlee";
|
|
5
4
|
import { dirname, join, resolve } from "pathe";
|
|
6
5
|
import { withHttps } from "ufo";
|
|
7
6
|
import { fileURLToPath } from "node:url";
|
|
8
|
-
import { addDependency } from "nypm";
|
|
9
|
-
//#region src/playwright-utils.ts
|
|
10
|
-
async function checkPlaywrightInstallation() {
|
|
11
|
-
try {
|
|
12
|
-
await import("playwright");
|
|
13
|
-
return true;
|
|
14
|
-
} catch {
|
|
15
|
-
return false;
|
|
16
|
-
}
|
|
17
|
-
}
|
|
18
|
-
async function promptPlaywrightInstall() {
|
|
19
|
-
const shouldInstall = await p.confirm({
|
|
20
|
-
message: "Playwright is required for the Playwright driver. Install it now?",
|
|
21
|
-
initialValue: true
|
|
22
|
-
});
|
|
23
|
-
if (p.isCancel(shouldInstall) || !shouldInstall) return false;
|
|
24
|
-
const s = p.spinner();
|
|
25
|
-
s.start("Installing Playwright globally...");
|
|
26
|
-
try {
|
|
27
|
-
await addDependency("playwright", { global: true });
|
|
28
|
-
s.stop("Playwright installed successfully!");
|
|
29
|
-
return true;
|
|
30
|
-
} catch (fallbackError) {
|
|
31
|
-
s.stop("Failed to install Playwright");
|
|
32
|
-
p.log.error(`Installation failed: ${fallbackError}`);
|
|
33
|
-
return false;
|
|
34
|
-
}
|
|
35
|
-
}
|
|
36
|
-
async function ensurePlaywrightInstalled() {
|
|
37
|
-
if (await checkPlaywrightInstallation()) return true;
|
|
38
|
-
p.log.warn("Playwright driver selected but Playwright is not installed.");
|
|
39
|
-
if (!await promptPlaywrightInstall()) {
|
|
40
|
-
p.log.error("Cannot proceed with Playwright driver without Playwright installed.");
|
|
41
|
-
return false;
|
|
42
|
-
}
|
|
43
|
-
return true;
|
|
44
|
-
}
|
|
45
|
-
async function isUseChromeSupported() {
|
|
46
|
-
try {
|
|
47
|
-
const crawler = new PlaywrightCrawler({
|
|
48
|
-
launchContext: { useChrome: true },
|
|
49
|
-
requestHandler: async () => {},
|
|
50
|
-
maxRequestsPerCrawl: 1
|
|
51
|
-
});
|
|
52
|
-
const page = await crawler.browserPool.newPage();
|
|
53
|
-
await page.evaluate(() => {
|
|
54
|
-
return window.navigator.userAgent;
|
|
55
|
-
});
|
|
56
|
-
await page.close();
|
|
57
|
-
await crawler.browserPool.closeAllBrowsers();
|
|
58
|
-
crawler.stop();
|
|
59
|
-
return true;
|
|
60
|
-
} catch {}
|
|
61
|
-
return false;
|
|
62
|
-
}
|
|
63
|
-
//#endregion
|
|
64
7
|
//#region src/cli.ts
|
|
65
8
|
const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
|
|
66
9
|
const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
|
|
@@ -225,11 +168,17 @@ async function interactiveCrawl() {
|
|
|
225
168
|
skipSitemap: advancedOptions.skipSitemap
|
|
226
169
|
};
|
|
227
170
|
}
|
|
228
|
-
async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds) {
|
|
171
|
+
async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds, latency) {
|
|
229
172
|
const messages = [];
|
|
230
173
|
const durationStr = `${durationSeconds.toFixed(1)}s`;
|
|
231
|
-
|
|
232
|
-
messages.push(
|
|
174
|
+
messages.push(`📄 ${successful} pages \u00B7 ⏱️ ${durationStr}`);
|
|
175
|
+
if (failed > 0) messages.push(`⚠️ ${failed} failed`);
|
|
176
|
+
if (latency && latency.count > 0) {
|
|
177
|
+
const avg = Math.round(latency.total / latency.count);
|
|
178
|
+
const min = latency.min === Infinity ? 0 : Math.round(latency.min);
|
|
179
|
+
const max = Math.round(latency.max);
|
|
180
|
+
messages.push(`🏓 avg ${avg}ms \u00B7 min ${min}ms \u00B7 max ${max}ms`);
|
|
181
|
+
}
|
|
233
182
|
messages.push(`📦 ${generatedFiles.join(", ")}`);
|
|
234
183
|
messages.push(`📁 ${outputDir}`);
|
|
235
184
|
p.note(messages.join("\n"), "✅ Complete");
|
|
@@ -431,36 +380,46 @@ async function main() {
|
|
|
431
380
|
if (permCheck.error?.includes("Permission denied")) p.log.info("Tip: Try running with elevated privileges (e.g., sudo) or change the output directory permissions.");
|
|
432
381
|
process.exit(1);
|
|
433
382
|
}
|
|
434
|
-
if (options.driver === "playwright")
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
|
|
383
|
+
if (options.driver === "playwright") {
|
|
384
|
+
try {
|
|
385
|
+
await import("crawlee");
|
|
386
|
+
} catch {
|
|
387
|
+
p.log.error("The Playwright driver requires crawlee. Install it with: npm install crawlee");
|
|
440
388
|
process.exit(1);
|
|
441
389
|
}
|
|
442
|
-
|
|
390
|
+
const { ensurePlaywrightInstalled, isUseChromeSupported } = await import("./_chunks/playwright-utils.mjs");
|
|
391
|
+
if (await isUseChromeSupported()) {
|
|
392
|
+
options.useChrome = true;
|
|
393
|
+
p.log.info("System Chrome detected and enabled.");
|
|
394
|
+
} else {
|
|
395
|
+
if (!await ensurePlaywrightInstalled()) {
|
|
396
|
+
p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
|
|
397
|
+
process.exit(1);
|
|
398
|
+
}
|
|
399
|
+
p.log.info("Using global playwright instance.");
|
|
400
|
+
}
|
|
443
401
|
}
|
|
444
402
|
const s = p.spinner();
|
|
445
|
-
s.start("
|
|
403
|
+
s.start("Discovering sitemaps");
|
|
446
404
|
const startTime = Date.now();
|
|
405
|
+
let crawlStartTime = 0;
|
|
406
|
+
let lastProgress;
|
|
447
407
|
const results = await crawlAndGenerate(options, (progress) => {
|
|
448
|
-
|
|
408
|
+
lastProgress = progress;
|
|
409
|
+
if (progress.sitemap.status === "discovering") s.message("Discovering sitemaps");
|
|
449
410
|
else if (progress.sitemap.status === "processing") s.message(`Processing sitemap... Found ${progress.sitemap.found} URLs`);
|
|
450
411
|
else if (progress.crawling.status === "processing") {
|
|
451
|
-
|
|
452
|
-
const
|
|
453
|
-
const
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
s.message(current);
|
|
463
|
-
}
|
|
412
|
+
if (!crawlStartTime) crawlStartTime = Date.now();
|
|
413
|
+
const processed = progress.crawling.processed;
|
|
414
|
+
const total = progress.crawling.total;
|
|
415
|
+
const failed = progress.crawling.failed;
|
|
416
|
+
const elapsed = (Date.now() - crawlStartTime) / 1e3;
|
|
417
|
+
const rate = elapsed > .1 ? Math.round(processed / elapsed) : 0;
|
|
418
|
+
let msg = processed > total ? `Crawling ${processed} pages` : `Crawling ${processed}/${total}`;
|
|
419
|
+
if (rate > 0) msg += ` \u00B7 ${rate}/s`;
|
|
420
|
+
if (failed > 0) msg += ` \u00B7 ${failed} failed`;
|
|
421
|
+
s.message(msg);
|
|
422
|
+
} else if (progress.generation.status === "generating") s.message(progress.generation.current || "Generating files");
|
|
464
423
|
});
|
|
465
424
|
s.stop();
|
|
466
425
|
const durationSeconds = (Date.now() - startTime) / 1e3;
|
|
@@ -484,7 +443,7 @@ async function main() {
|
|
|
484
443
|
if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
|
|
485
444
|
if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
|
|
486
445
|
}
|
|
487
|
-
await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
|
|
446
|
+
await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds, lastProgress?.crawling.latency);
|
|
488
447
|
process.exit(0);
|
|
489
448
|
}
|
|
490
449
|
main().catch((error) => {
|
package/dist/index.d.mts
CHANGED
|
@@ -51,12 +51,6 @@ interface CrawlResult {
|
|
|
51
51
|
metadata?: PageMetadata;
|
|
52
52
|
depth?: number;
|
|
53
53
|
}
|
|
54
|
-
interface LlmsTxtOptions {
|
|
55
|
-
siteName: string;
|
|
56
|
-
description?: string;
|
|
57
|
-
results: CrawlResult[];
|
|
58
|
-
outputPath: string;
|
|
59
|
-
}
|
|
60
54
|
//#endregion
|
|
61
55
|
//#region src/crawl.d.ts
|
|
62
56
|
interface CrawlProgress {
|
|
@@ -69,7 +63,14 @@ interface CrawlProgress {
|
|
|
69
63
|
status: 'starting' | 'processing' | 'completed';
|
|
70
64
|
total: number;
|
|
71
65
|
processed: number;
|
|
72
|
-
|
|
66
|
+
failed: number;
|
|
67
|
+
currentUrl?: string; /** Page fetch latency stats in ms */
|
|
68
|
+
latency: {
|
|
69
|
+
total: number;
|
|
70
|
+
min: number;
|
|
71
|
+
max: number;
|
|
72
|
+
count: number;
|
|
73
|
+
};
|
|
73
74
|
};
|
|
74
75
|
generation: {
|
|
75
76
|
status: 'idle' | 'generating' | 'completed';
|
|
@@ -78,8 +79,4 @@ interface CrawlProgress {
|
|
|
78
79
|
}
|
|
79
80
|
declare function crawlAndGenerate(options: CrawlOptions, onProgress?: (progress: CrawlProgress) => void): Promise<CrawlResult[]>;
|
|
80
81
|
//#endregion
|
|
81
|
-
|
|
82
|
-
declare function generateLlmsTxt(options: LlmsTxtOptions): Promise<void>;
|
|
83
|
-
declare function generateLlmsFullTxt(options: LlmsTxtOptions): Promise<void>;
|
|
84
|
-
//#endregion
|
|
85
|
-
export { type CrawlOptions, type CrawlResult, type LlmsTxtOptions, type PageData, crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
|
|
82
|
+
export { type CrawlOptions, type CrawlResult, type PageData, crawlAndGenerate };
|
package/dist/index.mjs
CHANGED
|
@@ -1,64 +1,2 @@
|
|
|
1
1
|
import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
|
|
2
|
-
|
|
3
|
-
import { basename, sep } from "pathe";
|
|
4
|
-
//#region src/llms-txt.ts
|
|
5
|
-
const ANCHOR_UNSAFE_CHARS_RE = /[^a-z0-9]/g;
|
|
6
|
-
async function generateLlmsTxt(options) {
|
|
7
|
-
const { siteName, description, results, outputPath } = options;
|
|
8
|
-
let content = `# ${siteName}\n\n`;
|
|
9
|
-
if (description) content += `> ${description}\n\n`;
|
|
10
|
-
if (results.length > 0) {
|
|
11
|
-
content += `## Pages\n\n`;
|
|
12
|
-
for (const result of results) {
|
|
13
|
-
let title;
|
|
14
|
-
try {
|
|
15
|
-
title = result.title || new URL(result.url).pathname;
|
|
16
|
-
} catch {
|
|
17
|
-
title = result.title || result.url;
|
|
18
|
-
}
|
|
19
|
-
if (result.filePath) {
|
|
20
|
-
const mdSeparator = `${sep}md${sep}`;
|
|
21
|
-
const mdIndex = result.filePath.indexOf(mdSeparator);
|
|
22
|
-
const linkPath = (mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath)).split(sep).join("/");
|
|
23
|
-
content += `- [${title}](md/${linkPath}): ${result.url}\n`;
|
|
24
|
-
} else {
|
|
25
|
-
const description = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
|
|
26
|
-
content += `- [${title}](${result.url})${description ? `: ${description}` : ""}\n`;
|
|
27
|
-
}
|
|
28
|
-
}
|
|
29
|
-
}
|
|
30
|
-
await writeFile(outputPath, content, "utf-8");
|
|
31
|
-
}
|
|
32
|
-
async function generateLlmsFullTxt(options) {
|
|
33
|
-
const { siteName, description, results, outputPath } = options;
|
|
34
|
-
let content = `# ${siteName}\n\n`;
|
|
35
|
-
if (description) content += `> ${description}\n\n`;
|
|
36
|
-
if (results.length > 0) {
|
|
37
|
-
content += `## Table of Contents\n\n`;
|
|
38
|
-
for (const result of results) {
|
|
39
|
-
let title;
|
|
40
|
-
try {
|
|
41
|
-
title = result.title || new URL(result.url).pathname;
|
|
42
|
-
} catch {
|
|
43
|
-
title = result.title || result.url;
|
|
44
|
-
}
|
|
45
|
-
const anchor = title.toLowerCase().replace(ANCHOR_UNSAFE_CHARS_RE, "-");
|
|
46
|
-
content += `- [${title}](#${anchor})\n`;
|
|
47
|
-
}
|
|
48
|
-
content += `\n---\n\n`;
|
|
49
|
-
for (const result of results) {
|
|
50
|
-
let title;
|
|
51
|
-
try {
|
|
52
|
-
title = result.title || new URL(result.url).pathname;
|
|
53
|
-
} catch {
|
|
54
|
-
title = result.title || result.url;
|
|
55
|
-
}
|
|
56
|
-
content += `## ${title}\n\n`;
|
|
57
|
-
content += `**URL:** ${result.url}\n\n`;
|
|
58
|
-
content += `${result.content}\n\n---\n\n`;
|
|
59
|
-
}
|
|
60
|
-
}
|
|
61
|
-
await writeFile(outputPath, content, "utf-8");
|
|
62
|
-
}
|
|
63
|
-
//#endregion
|
|
64
|
-
export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
|
|
2
|
+
export { crawlAndGenerate };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.
|
|
4
|
+
"version": "1.0.0-beta.11",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -42,21 +42,26 @@
|
|
|
42
42
|
"dist"
|
|
43
43
|
],
|
|
44
44
|
"peerDependencies": {
|
|
45
|
+
"crawlee": "^3.16.0",
|
|
45
46
|
"playwright": "^1.53.2"
|
|
46
47
|
},
|
|
47
48
|
"peerDependenciesMeta": {
|
|
49
|
+
"crawlee": {
|
|
50
|
+
"optional": true
|
|
51
|
+
},
|
|
48
52
|
"playwright": {
|
|
49
53
|
"optional": true
|
|
50
54
|
}
|
|
51
55
|
},
|
|
52
56
|
"dependencies": {
|
|
53
57
|
"@clack/prompts": "^1.1.0",
|
|
54
|
-
"crawlee": "^3.16.0",
|
|
55
58
|
"nypm": "^0.6.5",
|
|
59
|
+
"ofetch": "^1.5.1",
|
|
56
60
|
"pathe": "^2.0.3",
|
|
57
61
|
"picomatch": "^4.0.3",
|
|
58
62
|
"ufo": "^1.6.3",
|
|
59
|
-
"mdream": "0.
|
|
63
|
+
"@mdream/js": "1.0.0-beta.11",
|
|
64
|
+
"mdream": "1.0.0-beta.11"
|
|
60
65
|
},
|
|
61
66
|
"devDependencies": {
|
|
62
67
|
"@types/picomatch": "^4.0.2"
|