@mdream/crawl 0.15.3 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/crawl.mjs +32 -27
- package/dist/cli.mjs +4 -7
- package/dist/index.mjs +5 -6
- package/package.json +6 -6
package/dist/_chunks/crawl.mjs
CHANGED
|
@@ -9,8 +9,9 @@ import { dirname, join, normalize, resolve } from "pathe";
|
|
|
9
9
|
import { withHttps } from "ufo";
|
|
10
10
|
import picomatch from "picomatch";
|
|
11
11
|
import { extractionPlugin } from "mdream/plugins";
|
|
12
|
-
|
|
13
12
|
//#region src/glob-utils.ts
|
|
13
|
+
const GLOB_STRIP_TAIL_RE = /\*.*$/;
|
|
14
|
+
const GLOB_CHAR_RE = /[*?[]/;
|
|
14
15
|
/**
|
|
15
16
|
* Parse a URL that may contain glob patterns
|
|
16
17
|
* Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
|
|
@@ -22,7 +23,7 @@ function parseUrlPattern(input) {
|
|
|
22
23
|
isGlob: false
|
|
23
24
|
};
|
|
24
25
|
try {
|
|
25
|
-
const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(
|
|
26
|
+
const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(GLOB_STRIP_TAIL_RE, "");
|
|
26
27
|
const url = new URL(urlWithoutGlob);
|
|
27
28
|
const baseUrl = `${url.protocol}//${url.host}`;
|
|
28
29
|
const patternStart = input.indexOf(url.host) + url.host.length;
|
|
@@ -61,7 +62,7 @@ function matchesGlobPattern(url, parsedPattern) {
|
|
|
61
62
|
function getStartingUrl(parsedPattern) {
|
|
62
63
|
if (!parsedPattern.isGlob) return withHttps(parsedPattern.baseUrl);
|
|
63
64
|
const pattern = parsedPattern.pattern;
|
|
64
|
-
const firstGlobIndex = pattern.search(
|
|
65
|
+
const firstGlobIndex = pattern.search(GLOB_CHAR_RE);
|
|
65
66
|
if (firstGlobIndex === -1) return withHttps(parsedPattern.baseUrl + pattern);
|
|
66
67
|
const beforeGlob = pattern.substring(0, firstGlobIndex);
|
|
67
68
|
const lastSlash = beforeGlob.lastIndexOf("/");
|
|
@@ -100,7 +101,6 @@ function validateGlobPattern(pattern) {
|
|
|
100
101
|
return `Invalid glob pattern: ${error instanceof Error ? error.message : error}`;
|
|
101
102
|
}
|
|
102
103
|
}
|
|
103
|
-
|
|
104
104
|
//#endregion
|
|
105
105
|
//#region src/metadata-extractor.ts
|
|
106
106
|
function extractMetadata(html, url) {
|
|
@@ -155,9 +155,15 @@ function extractMetadata(html, url) {
|
|
|
155
155
|
})
|
|
156
156
|
};
|
|
157
157
|
}
|
|
158
|
-
|
|
159
158
|
//#endregion
|
|
160
159
|
//#region src/crawl.ts
|
|
160
|
+
const SITEMAP_INDEX_LOC_RE = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
|
|
161
|
+
const SITEMAP_URL_LOC_RE = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
|
|
162
|
+
const ROBOTS_SITEMAP_RE = /Sitemap:\s*(.*)/gi;
|
|
163
|
+
const ROBOTS_SITEMAP_PREFIX_RE = /Sitemap:\s*/i;
|
|
164
|
+
const URL_TRAILING_SLASH_RE = /\/$/;
|
|
165
|
+
const URL_PATH_UNSAFE_CHARS_RE = /[^\w\-]/g;
|
|
166
|
+
const FRONTMATTER_BLOCK_RE = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
|
|
161
167
|
async function loadSitemapWithoutRetries(sitemapUrl) {
|
|
162
168
|
const controller = new AbortController();
|
|
163
169
|
const timeoutId = setTimeout(() => controller.abort(), 1e4);
|
|
@@ -170,11 +176,11 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
|
|
|
170
176
|
if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
|
|
171
177
|
const xmlContent = await response.text();
|
|
172
178
|
if (xmlContent.includes("<sitemapindex")) {
|
|
173
|
-
|
|
179
|
+
SITEMAP_INDEX_LOC_RE.lastIndex = 0;
|
|
174
180
|
const childSitemaps = [];
|
|
175
181
|
let match;
|
|
176
182
|
while (true) {
|
|
177
|
-
match =
|
|
183
|
+
match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
|
|
178
184
|
if (match === null) break;
|
|
179
185
|
let url = match[1];
|
|
180
186
|
if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
|
|
@@ -190,10 +196,10 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
|
|
|
190
196
|
return allUrls;
|
|
191
197
|
} else {
|
|
192
198
|
const urls = [];
|
|
193
|
-
|
|
199
|
+
SITEMAP_URL_LOC_RE.lastIndex = 0;
|
|
194
200
|
let match;
|
|
195
201
|
while (true) {
|
|
196
|
-
match =
|
|
202
|
+
match = SITEMAP_URL_LOC_RE.exec(xmlContent);
|
|
197
203
|
if (match === null) break;
|
|
198
204
|
let url = match[1];
|
|
199
205
|
if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
|
|
@@ -247,24 +253,24 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
247
253
|
headers: { "User-Agent": "mdream-crawler/1.0" }
|
|
248
254
|
});
|
|
249
255
|
clearTimeout(robotsTimeoutId);
|
|
250
|
-
} catch
|
|
256
|
+
} catch {
|
|
251
257
|
clearTimeout(robotsTimeoutId);
|
|
252
258
|
robotsResponse = null;
|
|
253
259
|
}
|
|
254
260
|
if (robotsResponse?.ok) {
|
|
255
|
-
const sitemapMatches = (await robotsResponse.text()).match(
|
|
261
|
+
const sitemapMatches = (await robotsResponse.text()).match(ROBOTS_SITEMAP_RE);
|
|
256
262
|
if (sitemapMatches && sitemapMatches.length > 0) {
|
|
257
263
|
progress.sitemap.found = sitemapMatches.length;
|
|
258
264
|
progress.sitemap.status = "processing";
|
|
259
265
|
onProgress?.(progress);
|
|
260
|
-
const robotsSitemaps = sitemapMatches.map((match) => match.replace(
|
|
266
|
+
const robotsSitemaps = sitemapMatches.map((match) => match.replace(ROBOTS_SITEMAP_PREFIX_RE, "").trim());
|
|
261
267
|
for (const sitemapUrl of robotsSitemaps) try {
|
|
262
268
|
const robotsUrls = await loadSitemapWithoutRetries(sitemapUrl);
|
|
263
269
|
sitemapAttempts.push({
|
|
264
270
|
url: sitemapUrl,
|
|
265
271
|
success: true
|
|
266
272
|
});
|
|
267
|
-
if (patterns.some((p
|
|
273
|
+
if (patterns.some((p) => p.isGlob)) {
|
|
268
274
|
const filteredUrls = robotsUrls.filter((url) => {
|
|
269
275
|
return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
270
276
|
});
|
|
@@ -300,7 +306,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
300
306
|
url: mainSitemapUrl,
|
|
301
307
|
success: true
|
|
302
308
|
});
|
|
303
|
-
if (patterns.some((p
|
|
309
|
+
if (patterns.some((p) => p.isGlob)) {
|
|
304
310
|
const filteredUrls = sitemapUrls.filter((url) => {
|
|
305
311
|
return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
306
312
|
});
|
|
@@ -339,7 +345,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
339
345
|
url: sitemapUrl,
|
|
340
346
|
success: true
|
|
341
347
|
});
|
|
342
|
-
if (patterns.some((p
|
|
348
|
+
if (patterns.some((p) => p.isGlob)) {
|
|
343
349
|
const filteredUrls = altUrls.filter((url) => {
|
|
344
350
|
return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
345
351
|
});
|
|
@@ -360,11 +366,11 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
360
366
|
break;
|
|
361
367
|
}
|
|
362
368
|
}
|
|
363
|
-
} catch (error
|
|
369
|
+
} catch (error) {
|
|
364
370
|
sitemapAttempts.push({
|
|
365
371
|
url: sitemapUrl,
|
|
366
372
|
success: false,
|
|
367
|
-
error: error
|
|
373
|
+
error: error instanceof Error ? error.message : "Unknown error"
|
|
368
374
|
});
|
|
369
375
|
}
|
|
370
376
|
}
|
|
@@ -396,7 +402,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
396
402
|
const processedUrls = /* @__PURE__ */ new Set();
|
|
397
403
|
const shouldCrawlUrl = (url) => {
|
|
398
404
|
if (isUrlExcluded(url, exclude)) return false;
|
|
399
|
-
if (!patterns.some((p
|
|
405
|
+
if (!patterns.some((p) => p.isGlob)) return true;
|
|
400
406
|
return patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
401
407
|
};
|
|
402
408
|
const createRequestHandler = (crawlerType) => {
|
|
@@ -428,17 +434,17 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
428
434
|
origin: pageOrigin
|
|
429
435
|
});
|
|
430
436
|
let md = "";
|
|
431
|
-
if (shouldProcessMarkdown
|
|
437
|
+
if (shouldProcessMarkdown) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
|
|
432
438
|
let filePath;
|
|
433
439
|
if (shouldProcessMarkdown && generateIndividualMd) {
|
|
434
440
|
const urlObj = new URL(request.loadedUrl);
|
|
435
|
-
const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(
|
|
441
|
+
const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"));
|
|
436
442
|
filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
|
|
437
443
|
const fileDir = dirname(filePath);
|
|
438
444
|
if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
|
|
439
445
|
await writeFile(filePath, md, "utf-8");
|
|
440
446
|
}
|
|
441
|
-
const isHomePage = request.loadedUrl.replace(
|
|
447
|
+
const isHomePage = request.loadedUrl.replace(URL_TRAILING_SLASH_RE, "") === homePageUrl.replace(URL_TRAILING_SLASH_RE, "");
|
|
442
448
|
if (shouldProcessMarkdown || isHomePage) {
|
|
443
449
|
const result = {
|
|
444
450
|
url: request.loadedUrl,
|
|
@@ -543,10 +549,10 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
543
549
|
onProgress?.(progress);
|
|
544
550
|
const successfulResults = results.filter((r) => r.success);
|
|
545
551
|
const firstUrl = new URL(withHttps(urls[0]));
|
|
546
|
-
const origin
|
|
552
|
+
const origin = firstUrl.origin;
|
|
547
553
|
const homePageResult = successfulResults.find((r) => {
|
|
548
554
|
const resultUrl = new URL(withHttps(r.url));
|
|
549
|
-
return resultUrl.href === origin
|
|
555
|
+
return resultUrl.href === origin || resultUrl.href === `${origin}/`;
|
|
550
556
|
});
|
|
551
557
|
const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
|
|
552
558
|
const description = descriptionOverride || homePageResult?.metadata?.description || successfulResults[0]?.metadata?.description;
|
|
@@ -555,7 +561,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
555
561
|
onProgress?.(progress);
|
|
556
562
|
const contentResults = successfulResults.filter((result) => {
|
|
557
563
|
if (!result.content) return false;
|
|
558
|
-
return result.content.trim().replace(
|
|
564
|
+
return result.content.trim().replace(FRONTMATTER_BLOCK_RE, "").trim().length > 10;
|
|
559
565
|
});
|
|
560
566
|
const seenUrls = /* @__PURE__ */ new Set();
|
|
561
567
|
const llmsResult = await generateLlmsTxtArtifacts({
|
|
@@ -572,7 +578,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
572
578
|
})),
|
|
573
579
|
siteName,
|
|
574
580
|
description,
|
|
575
|
-
origin: origin
|
|
581
|
+
origin: origin || firstUrl.origin,
|
|
576
582
|
generateFull: generateLlmsFullTxt,
|
|
577
583
|
outputDir
|
|
578
584
|
});
|
|
@@ -593,6 +599,5 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
593
599
|
await purgeDefaultStorages();
|
|
594
600
|
return results;
|
|
595
601
|
}
|
|
596
|
-
|
|
597
602
|
//#endregion
|
|
598
|
-
export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };
|
|
603
|
+
export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };
|
package/dist/cli.mjs
CHANGED
|
@@ -6,7 +6,6 @@ import { dirname, join, resolve } from "pathe";
|
|
|
6
6
|
import { withHttps } from "ufo";
|
|
7
7
|
import { fileURLToPath } from "node:url";
|
|
8
8
|
import { addDependency } from "nypm";
|
|
9
|
-
|
|
10
9
|
//#region src/playwright-utils.ts
|
|
11
10
|
async function checkPlaywrightInstallation() {
|
|
12
11
|
try {
|
|
@@ -61,7 +60,6 @@ async function isUseChromeSupported() {
|
|
|
61
60
|
} catch {}
|
|
62
61
|
return false;
|
|
63
62
|
}
|
|
64
|
-
|
|
65
63
|
//#endregion
|
|
66
64
|
//#region src/cli.ts
|
|
67
65
|
const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
|
|
@@ -106,8 +104,8 @@ async function interactiveCrawl() {
|
|
|
106
104
|
placeholder: "e.g. docs.example.com, site.com/docs/**",
|
|
107
105
|
validate: (value) => {
|
|
108
106
|
if (!value) return "Please enter at least one URL";
|
|
109
|
-
const urls
|
|
110
|
-
for (const url of urls
|
|
107
|
+
const urls = value.split(",").map((url) => url.trim());
|
|
108
|
+
for (const url of urls) {
|
|
111
109
|
const globError = validateGlobPattern(url);
|
|
112
110
|
if (globError) return globError;
|
|
113
111
|
try {
|
|
@@ -210,7 +208,7 @@ async function interactiveCrawl() {
|
|
|
210
208
|
inferredOrigin && `Origin: ${inferredOrigin}`
|
|
211
209
|
].filter(Boolean);
|
|
212
210
|
p.note(summary.join("\n"), "Crawl Configuration");
|
|
213
|
-
if (advancedOptions.skipSitemap && globPatterns.some((p
|
|
211
|
+
if (advancedOptions.skipSitemap && globPatterns.some((p) => p.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
|
|
214
212
|
return {
|
|
215
213
|
urls,
|
|
216
214
|
outputDir: resolve(outputDir),
|
|
@@ -493,6 +491,5 @@ main().catch((error) => {
|
|
|
493
491
|
p.log.error(`Unexpected error: ${error}`);
|
|
494
492
|
process.exit(1);
|
|
495
493
|
});
|
|
496
|
-
|
|
497
494
|
//#endregion
|
|
498
|
-
export {
|
|
495
|
+
export {};
|
package/dist/index.mjs
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
|
|
2
2
|
import { writeFile } from "node:fs/promises";
|
|
3
3
|
import { basename, sep } from "pathe";
|
|
4
|
-
|
|
5
4
|
//#region src/llms-txt.ts
|
|
5
|
+
const ANCHOR_UNSAFE_CHARS_RE = /[^a-z0-9]/g;
|
|
6
6
|
async function generateLlmsTxt(options) {
|
|
7
7
|
const { siteName, description, results, outputPath } = options;
|
|
8
8
|
let content = `# ${siteName}\n\n`;
|
|
@@ -22,8 +22,8 @@ async function generateLlmsTxt(options) {
|
|
|
22
22
|
const linkPath = (mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath)).split(sep).join("/");
|
|
23
23
|
content += `- [${title}](md/${linkPath}): ${result.url}\n`;
|
|
24
24
|
} else {
|
|
25
|
-
const description
|
|
26
|
-
content += `- [${title}](${result.url})${description
|
|
25
|
+
const description = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
|
|
26
|
+
content += `- [${title}](${result.url})${description ? `: ${description}` : ""}\n`;
|
|
27
27
|
}
|
|
28
28
|
}
|
|
29
29
|
}
|
|
@@ -42,7 +42,7 @@ async function generateLlmsFullTxt(options) {
|
|
|
42
42
|
} catch {
|
|
43
43
|
title = result.title || result.url;
|
|
44
44
|
}
|
|
45
|
-
const anchor = title.toLowerCase().replace(
|
|
45
|
+
const anchor = title.toLowerCase().replace(ANCHOR_UNSAFE_CHARS_RE, "-");
|
|
46
46
|
content += `- [${title}](#${anchor})\n`;
|
|
47
47
|
}
|
|
48
48
|
content += `\n---\n\n`;
|
|
@@ -60,6 +60,5 @@ async function generateLlmsFullTxt(options) {
|
|
|
60
60
|
}
|
|
61
61
|
await writeFile(outputPath, content, "utf-8");
|
|
62
62
|
}
|
|
63
|
-
|
|
64
63
|
//#endregion
|
|
65
|
-
export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
|
|
64
|
+
export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.15.3",
|
|
4
|
+
"version": "0.17.0",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -50,13 +50,13 @@
|
|
|
50
50
|
}
|
|
51
51
|
},
|
|
52
52
|
"dependencies": {
|
|
53
|
-
"@clack/prompts": "^
|
|
54
|
-
"crawlee": "^3.
|
|
55
|
-
"nypm": "^0.6.
|
|
53
|
+
"@clack/prompts": "^1.1.0",
|
|
54
|
+
"crawlee": "^3.16.0",
|
|
55
|
+
"nypm": "^0.6.5",
|
|
56
56
|
"pathe": "^2.0.3",
|
|
57
57
|
"picomatch": "^4.0.3",
|
|
58
|
-
"ufo": "^1.6.
|
|
59
|
-
"mdream": "0.
|
|
58
|
+
"ufo": "^1.6.3",
|
|
59
|
+
"mdream": "0.17.0"
|
|
60
60
|
},
|
|
61
61
|
"devDependencies": {
|
|
62
62
|
"@types/picomatch": "^4.0.2"
|