@mdream/crawl 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/crawl.mjs +44 -12
- package/dist/cli.mjs +4 -0
- package/dist/index.mjs +4 -1
- package/package.json +3 -3
package/dist/_chunks/crawl.mjs
CHANGED
|
@@ -9,6 +9,13 @@ import { dirname, join, normalize, resolve } from "pathe";
|
|
|
9
9
|
import { withHttps } from "ufo";
|
|
10
10
|
import picomatch from "picomatch";
|
|
11
11
|
import { extractionPlugin } from "mdream/plugins";
|
|
12
|
+
//#region src/glob-utils.ts
|
|
13
|
+
const GLOB_STRIP_TAIL_RE = /\*.*$/;
|
|
14
|
+
const GLOB_CHAR_RE = /[*?[]/;
|
|
15
|
+
/**
|
|
16
|
+
* Parse a URL that may contain glob patterns
|
|
17
|
+
* Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
|
|
18
|
+
*/
|
|
12
19
|
function parseUrlPattern(input) {
|
|
13
20
|
if (!(input.includes("*") || input.includes("?") || input.includes("["))) return {
|
|
14
21
|
baseUrl: input,
|
|
@@ -16,7 +23,7 @@ function parseUrlPattern(input) {
|
|
|
16
23
|
isGlob: false
|
|
17
24
|
};
|
|
18
25
|
try {
|
|
19
|
-
const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(/\*.*$/, "");
|
|
26
|
+
const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(GLOB_STRIP_TAIL_RE, "");
|
|
20
27
|
const url = new URL(urlWithoutGlob);
|
|
21
28
|
const baseUrl = `${url.protocol}//${url.host}`;
|
|
22
29
|
const patternStart = input.indexOf(url.host) + url.host.length;
|
|
@@ -29,6 +36,9 @@ function parseUrlPattern(input) {
|
|
|
29
36
|
throw new Error(`Invalid URL pattern: "${input}". Please provide a valid URL with glob patterns (e.g., "example.com/docs/*" or "https://example.com/api/**").`);
|
|
30
37
|
}
|
|
31
38
|
}
|
|
39
|
+
/**
|
|
40
|
+
* Check if a URL matches a glob pattern
|
|
41
|
+
*/
|
|
32
42
|
function matchesGlobPattern(url, parsedPattern) {
|
|
33
43
|
if (!parsedPattern.isGlob) return true;
|
|
34
44
|
try {
|
|
@@ -45,16 +55,23 @@ function matchesGlobPattern(url, parsedPattern) {
|
|
|
45
55
|
return false;
|
|
46
56
|
}
|
|
47
57
|
}
|
|
58
|
+
/**
|
|
59
|
+
* Get the starting URL for crawling from a glob pattern
|
|
60
|
+
* For https://nuxtseo.com/docs/**, we want to start at https://nuxtseo.com
|
|
61
|
+
*/
|
|
48
62
|
function getStartingUrl(parsedPattern) {
|
|
49
63
|
if (!parsedPattern.isGlob) return withHttps(parsedPattern.baseUrl);
|
|
50
64
|
const pattern = parsedPattern.pattern;
|
|
51
|
-
const firstGlobIndex = pattern.search(/[*?[]/);
|
|
65
|
+
const firstGlobIndex = pattern.search(GLOB_CHAR_RE);
|
|
52
66
|
if (firstGlobIndex === -1) return withHttps(parsedPattern.baseUrl + pattern);
|
|
53
67
|
const beforeGlob = pattern.substring(0, firstGlobIndex);
|
|
54
68
|
const lastSlash = beforeGlob.lastIndexOf("/");
|
|
55
69
|
const pathBeforeGlob = lastSlash >= 0 ? beforeGlob.substring(0, lastSlash + 1) : "/";
|
|
56
70
|
return withHttps(parsedPattern.baseUrl + pathBeforeGlob);
|
|
57
71
|
}
|
|
72
|
+
/**
|
|
73
|
+
* Check if a URL should be excluded based on exclude patterns
|
|
74
|
+
*/
|
|
58
75
|
function isUrlExcluded(url, excludePatterns) {
|
|
59
76
|
if (!excludePatterns || excludePatterns.length === 0) return false;
|
|
60
77
|
try {
|
|
@@ -73,6 +90,9 @@ function isUrlExcluded(url, excludePatterns) {
|
|
|
73
90
|
return false;
|
|
74
91
|
}
|
|
75
92
|
}
|
|
93
|
+
/**
|
|
94
|
+
* Validate glob pattern syntax
|
|
95
|
+
*/
|
|
76
96
|
function validateGlobPattern(pattern) {
|
|
77
97
|
try {
|
|
78
98
|
parseUrlPattern(pattern);
|
|
@@ -81,6 +101,8 @@ function validateGlobPattern(pattern) {
|
|
|
81
101
|
return `Invalid glob pattern: ${error instanceof Error ? error.message : error}`;
|
|
82
102
|
}
|
|
83
103
|
}
|
|
104
|
+
//#endregion
|
|
105
|
+
//#region src/metadata-extractor.ts
|
|
84
106
|
function extractMetadata(html, url) {
|
|
85
107
|
const links = [];
|
|
86
108
|
let title = "";
|
|
@@ -133,6 +155,15 @@ function extractMetadata(html, url) {
|
|
|
133
155
|
})
|
|
134
156
|
};
|
|
135
157
|
}
|
|
158
|
+
//#endregion
|
|
159
|
+
//#region src/crawl.ts
|
|
160
|
+
const SITEMAP_INDEX_LOC_RE = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
|
|
161
|
+
const SITEMAP_URL_LOC_RE = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
|
|
162
|
+
const ROBOTS_SITEMAP_RE = /Sitemap:\s*(.*)/gi;
|
|
163
|
+
const ROBOTS_SITEMAP_PREFIX_RE = /Sitemap:\s*/i;
|
|
164
|
+
const URL_TRAILING_SLASH_RE = /\/$/;
|
|
165
|
+
const URL_PATH_UNSAFE_CHARS_RE = /[^\w\-]/g;
|
|
166
|
+
const FRONTMATTER_BLOCK_RE = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
|
|
136
167
|
async function loadSitemapWithoutRetries(sitemapUrl) {
|
|
137
168
|
const controller = new AbortController();
|
|
138
169
|
const timeoutId = setTimeout(() => controller.abort(), 1e4);
|
|
@@ -145,11 +176,11 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
|
|
|
145
176
|
if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
|
|
146
177
|
const xmlContent = await response.text();
|
|
147
178
|
if (xmlContent.includes("<sitemapindex")) {
|
|
148
|
-
|
|
179
|
+
SITEMAP_INDEX_LOC_RE.lastIndex = 0;
|
|
149
180
|
const childSitemaps = [];
|
|
150
181
|
let match;
|
|
151
182
|
while (true) {
|
|
152
|
-
match =
|
|
183
|
+
match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
|
|
153
184
|
if (match === null) break;
|
|
154
185
|
let url = match[1];
|
|
155
186
|
if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
|
|
@@ -165,10 +196,10 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
|
|
|
165
196
|
return allUrls;
|
|
166
197
|
} else {
|
|
167
198
|
const urls = [];
|
|
168
|
-
|
|
199
|
+
SITEMAP_URL_LOC_RE.lastIndex = 0;
|
|
169
200
|
let match;
|
|
170
201
|
while (true) {
|
|
171
|
-
match =
|
|
202
|
+
match = SITEMAP_URL_LOC_RE.exec(xmlContent);
|
|
172
203
|
if (match === null) break;
|
|
173
204
|
let url = match[1];
|
|
174
205
|
if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
|
|
@@ -227,12 +258,12 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
227
258
|
robotsResponse = null;
|
|
228
259
|
}
|
|
229
260
|
if (robotsResponse?.ok) {
|
|
230
|
-
const sitemapMatches = (await robotsResponse.text()).match(/Sitemap:\s*(.*)/gi);
|
|
261
|
+
const sitemapMatches = (await robotsResponse.text()).match(ROBOTS_SITEMAP_RE);
|
|
231
262
|
if (sitemapMatches && sitemapMatches.length > 0) {
|
|
232
263
|
progress.sitemap.found = sitemapMatches.length;
|
|
233
264
|
progress.sitemap.status = "processing";
|
|
234
265
|
onProgress?.(progress);
|
|
235
|
-
const robotsSitemaps = sitemapMatches.map((match) => match.replace(/Sitemap:\s*/i, "").trim());
|
|
266
|
+
const robotsSitemaps = sitemapMatches.map((match) => match.replace(ROBOTS_SITEMAP_PREFIX_RE, "").trim());
|
|
236
267
|
for (const sitemapUrl of robotsSitemaps) try {
|
|
237
268
|
const robotsUrls = await loadSitemapWithoutRetries(sitemapUrl);
|
|
238
269
|
sitemapAttempts.push({
|
|
@@ -403,17 +434,17 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
403
434
|
origin: pageOrigin
|
|
404
435
|
});
|
|
405
436
|
let md = "";
|
|
406
|
-
if (shouldProcessMarkdown
|
|
437
|
+
if (shouldProcessMarkdown) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
|
|
407
438
|
let filePath;
|
|
408
439
|
if (shouldProcessMarkdown && generateIndividualMd) {
|
|
409
440
|
const urlObj = new URL(request.loadedUrl);
|
|
410
|
-
const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(/\/$/, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(/[^\w\-]/g, "-"));
|
|
441
|
+
const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"));
|
|
411
442
|
filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
|
|
412
443
|
const fileDir = dirname(filePath);
|
|
413
444
|
if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
|
|
414
445
|
await writeFile(filePath, md, "utf-8");
|
|
415
446
|
}
|
|
416
|
-
const isHomePage = request.loadedUrl.replace(/\/$/, "") === homePageUrl.replace(/\/$/, "");
|
|
447
|
+
const isHomePage = request.loadedUrl.replace(URL_TRAILING_SLASH_RE, "") === homePageUrl.replace(URL_TRAILING_SLASH_RE, "");
|
|
417
448
|
if (shouldProcessMarkdown || isHomePage) {
|
|
418
449
|
const result = {
|
|
419
450
|
url: request.loadedUrl,
|
|
@@ -530,7 +561,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
530
561
|
onProgress?.(progress);
|
|
531
562
|
const contentResults = successfulResults.filter((result) => {
|
|
532
563
|
if (!result.content) return false;
|
|
533
|
-
return result.content.trim().replace(/^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/, "").trim().length > 10;
|
|
564
|
+
return result.content.trim().replace(FRONTMATTER_BLOCK_RE, "").trim().length > 10;
|
|
534
565
|
});
|
|
535
566
|
const seenUrls = /* @__PURE__ */ new Set();
|
|
536
567
|
const llmsResult = await generateLlmsTxtArtifacts({
|
|
@@ -568,4 +599,5 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
568
599
|
await purgeDefaultStorages();
|
|
569
600
|
return results;
|
|
570
601
|
}
|
|
602
|
+
//#endregion
|
|
571
603
|
export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };
|
package/dist/cli.mjs
CHANGED
|
@@ -6,6 +6,7 @@ import { dirname, join, resolve } from "pathe";
|
|
|
6
6
|
import { withHttps } from "ufo";
|
|
7
7
|
import { fileURLToPath } from "node:url";
|
|
8
8
|
import { addDependency } from "nypm";
|
|
9
|
+
//#region src/playwright-utils.ts
|
|
9
10
|
async function checkPlaywrightInstallation() {
|
|
10
11
|
try {
|
|
11
12
|
await import("playwright");
|
|
@@ -59,6 +60,8 @@ async function isUseChromeSupported() {
|
|
|
59
60
|
} catch {}
|
|
60
61
|
return false;
|
|
61
62
|
}
|
|
63
|
+
//#endregion
|
|
64
|
+
//#region src/cli.ts
|
|
62
65
|
const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
|
|
63
66
|
const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
|
|
64
67
|
function checkOutputDirectoryPermissions(outputDir) {
|
|
@@ -488,4 +491,5 @@ main().catch((error) => {
|
|
|
488
491
|
p.log.error(`Unexpected error: ${error}`);
|
|
489
492
|
process.exit(1);
|
|
490
493
|
});
|
|
494
|
+
//#endregion
|
|
491
495
|
export {};
|
package/dist/index.mjs
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
|
|
2
2
|
import { writeFile } from "node:fs/promises";
|
|
3
3
|
import { basename, sep } from "pathe";
|
|
4
|
+
//#region src/llms-txt.ts
|
|
5
|
+
const ANCHOR_UNSAFE_CHARS_RE = /[^a-z0-9]/g;
|
|
4
6
|
async function generateLlmsTxt(options) {
|
|
5
7
|
const { siteName, description, results, outputPath } = options;
|
|
6
8
|
let content = `# ${siteName}\n\n`;
|
|
@@ -40,7 +42,7 @@ async function generateLlmsFullTxt(options) {
|
|
|
40
42
|
} catch {
|
|
41
43
|
title = result.title || result.url;
|
|
42
44
|
}
|
|
43
|
-
const anchor = title.toLowerCase().replace(/[^a-z0-9]/g, "-");
|
|
45
|
+
const anchor = title.toLowerCase().replace(ANCHOR_UNSAFE_CHARS_RE, "-");
|
|
44
46
|
content += `- [${title}](#${anchor})\n`;
|
|
45
47
|
}
|
|
46
48
|
content += `\n---\n\n`;
|
|
@@ -58,4 +60,5 @@ async function generateLlmsFullTxt(options) {
|
|
|
58
60
|
}
|
|
59
61
|
await writeFile(outputPath, content, "utf-8");
|
|
60
62
|
}
|
|
63
|
+
//#endregion
|
|
61
64
|
export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.16.0",
|
|
4
|
+
"version": "0.17.0",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -50,13 +50,13 @@
|
|
|
50
50
|
}
|
|
51
51
|
},
|
|
52
52
|
"dependencies": {
|
|
53
|
-
"@clack/prompts": "^1.0
|
|
53
|
+
"@clack/prompts": "^1.1.0",
|
|
54
54
|
"crawlee": "^3.16.0",
|
|
55
55
|
"nypm": "^0.6.5",
|
|
56
56
|
"pathe": "^2.0.3",
|
|
57
57
|
"picomatch": "^4.0.3",
|
|
58
58
|
"ufo": "^1.6.3",
|
|
59
|
-
"mdream": "0.16.0"
|
|
59
|
+
"mdream": "0.17.0"
|
|
60
60
|
},
|
|
61
61
|
"devDependencies": {
|
|
62
62
|
"@types/picomatch": "^4.0.2"
|