@mdream/crawl 0.16.0 → 0.17.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/crawl.mjs +51 -12
- package/dist/cli.mjs +4 -0
- package/dist/index.mjs +4 -1
- package/package.json +3 -3
package/dist/_chunks/crawl.mjs
CHANGED
|
@@ -9,6 +9,13 @@ import { dirname, join, normalize, resolve } from "pathe";
|
|
|
9
9
|
import { withHttps } from "ufo";
|
|
10
10
|
import picomatch from "picomatch";
|
|
11
11
|
import { extractionPlugin } from "mdream/plugins";
|
|
12
|
+
//#region src/glob-utils.ts
|
|
13
|
+
const GLOB_STRIP_TAIL_RE = /\*.*$/;
|
|
14
|
+
const GLOB_CHAR_RE = /[*?[]/;
|
|
15
|
+
/**
|
|
16
|
+
* Parse a URL that may contain glob patterns
|
|
17
|
+
* Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
|
|
18
|
+
*/
|
|
12
19
|
function parseUrlPattern(input) {
|
|
13
20
|
if (!(input.includes("*") || input.includes("?") || input.includes("["))) return {
|
|
14
21
|
baseUrl: input,
|
|
@@ -16,7 +23,7 @@ function parseUrlPattern(input) {
|
|
|
16
23
|
isGlob: false
|
|
17
24
|
};
|
|
18
25
|
try {
|
|
19
|
-
const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(/\*.*$/, "");
|
|
26
|
+
const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(GLOB_STRIP_TAIL_RE, "");
|
|
20
27
|
const url = new URL(urlWithoutGlob);
|
|
21
28
|
const baseUrl = `${url.protocol}//${url.host}`;
|
|
22
29
|
const patternStart = input.indexOf(url.host) + url.host.length;
|
|
@@ -29,6 +36,9 @@ function parseUrlPattern(input) {
|
|
|
29
36
|
throw new Error(`Invalid URL pattern: "${input}". Please provide a valid URL with glob patterns (e.g., "example.com/docs/*" or "https://example.com/api/**").`);
|
|
30
37
|
}
|
|
31
38
|
}
|
|
39
|
+
/**
|
|
40
|
+
* Check if a URL matches a glob pattern
|
|
41
|
+
*/
|
|
32
42
|
function matchesGlobPattern(url, parsedPattern) {
|
|
33
43
|
if (!parsedPattern.isGlob) return true;
|
|
34
44
|
try {
|
|
@@ -45,16 +55,23 @@ function matchesGlobPattern(url, parsedPattern) {
|
|
|
45
55
|
return false;
|
|
46
56
|
}
|
|
47
57
|
}
|
|
58
|
+
/**
|
|
59
|
+
* Get the starting URL for crawling from a glob pattern
|
|
60
|
+
* For https://nuxtseo.com/docs/**, we want to start at https://nuxtseo.com
|
|
61
|
+
*/
|
|
48
62
|
function getStartingUrl(parsedPattern) {
|
|
49
63
|
if (!parsedPattern.isGlob) return withHttps(parsedPattern.baseUrl);
|
|
50
64
|
const pattern = parsedPattern.pattern;
|
|
51
|
-
const firstGlobIndex = pattern.search(/[*?[]/);
|
|
65
|
+
const firstGlobIndex = pattern.search(GLOB_CHAR_RE);
|
|
52
66
|
if (firstGlobIndex === -1) return withHttps(parsedPattern.baseUrl + pattern);
|
|
53
67
|
const beforeGlob = pattern.substring(0, firstGlobIndex);
|
|
54
68
|
const lastSlash = beforeGlob.lastIndexOf("/");
|
|
55
69
|
const pathBeforeGlob = lastSlash >= 0 ? beforeGlob.substring(0, lastSlash + 1) : "/";
|
|
56
70
|
return withHttps(parsedPattern.baseUrl + pathBeforeGlob);
|
|
57
71
|
}
|
|
72
|
+
/**
|
|
73
|
+
* Check if a URL should be excluded based on exclude patterns
|
|
74
|
+
*/
|
|
58
75
|
function isUrlExcluded(url, excludePatterns) {
|
|
59
76
|
if (!excludePatterns || excludePatterns.length === 0) return false;
|
|
60
77
|
try {
|
|
@@ -73,6 +90,15 @@ function isUrlExcluded(url, excludePatterns) {
|
|
|
73
90
|
return false;
|
|
74
91
|
}
|
|
75
92
|
}
|
|
93
|
+
/**
|
|
94
|
+
* Check if a string is valid sitemap XML content (not an HTML page or other non-sitemap response)
|
|
95
|
+
*/
|
|
96
|
+
function isValidSitemapXml(content) {
|
|
97
|
+
return content.includes("<urlset") || content.includes("<sitemapindex");
|
|
98
|
+
}
|
|
99
|
+
/**
|
|
100
|
+
* Validate glob pattern syntax
|
|
101
|
+
*/
|
|
76
102
|
function validateGlobPattern(pattern) {
|
|
77
103
|
try {
|
|
78
104
|
parseUrlPattern(pattern);
|
|
@@ -81,6 +107,8 @@ function validateGlobPattern(pattern) {
|
|
|
81
107
|
return `Invalid glob pattern: ${error instanceof Error ? error.message : error}`;
|
|
82
108
|
}
|
|
83
109
|
}
|
|
110
|
+
//#endregion
|
|
111
|
+
//#region src/metadata-extractor.ts
|
|
84
112
|
function extractMetadata(html, url) {
|
|
85
113
|
const links = [];
|
|
86
114
|
let title = "";
|
|
@@ -133,6 +161,15 @@ function extractMetadata(html, url) {
|
|
|
133
161
|
})
|
|
134
162
|
};
|
|
135
163
|
}
|
|
164
|
+
//#endregion
|
|
165
|
+
//#region src/crawl.ts
|
|
166
|
+
const SITEMAP_INDEX_LOC_RE = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
|
|
167
|
+
const SITEMAP_URL_LOC_RE = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
|
|
168
|
+
const ROBOTS_SITEMAP_RE = /Sitemap:\s*(.*)/gi;
|
|
169
|
+
const ROBOTS_SITEMAP_PREFIX_RE = /Sitemap:\s*/i;
|
|
170
|
+
const URL_TRAILING_SLASH_RE = /\/$/;
|
|
171
|
+
const URL_PATH_UNSAFE_CHARS_RE = /[^\w\-]/g;
|
|
172
|
+
const FRONTMATTER_BLOCK_RE = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
|
|
136
173
|
async function loadSitemapWithoutRetries(sitemapUrl) {
|
|
137
174
|
const controller = new AbortController();
|
|
138
175
|
const timeoutId = setTimeout(() => controller.abort(), 1e4);
|
|
@@ -144,12 +181,13 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
|
|
|
144
181
|
clearTimeout(timeoutId);
|
|
145
182
|
if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
|
|
146
183
|
const xmlContent = await response.text();
|
|
184
|
+
if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
|
|
147
185
|
if (xmlContent.includes("<sitemapindex")) {
|
|
148
|
-
|
|
186
|
+
SITEMAP_INDEX_LOC_RE.lastIndex = 0;
|
|
149
187
|
const childSitemaps = [];
|
|
150
188
|
let match;
|
|
151
189
|
while (true) {
|
|
152
|
-
match =
|
|
190
|
+
match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
|
|
153
191
|
if (match === null) break;
|
|
154
192
|
let url = match[1];
|
|
155
193
|
if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
|
|
@@ -165,10 +203,10 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
|
|
|
165
203
|
return allUrls;
|
|
166
204
|
} else {
|
|
167
205
|
const urls = [];
|
|
168
|
-
|
|
206
|
+
SITEMAP_URL_LOC_RE.lastIndex = 0;
|
|
169
207
|
let match;
|
|
170
208
|
while (true) {
|
|
171
|
-
match =
|
|
209
|
+
match = SITEMAP_URL_LOC_RE.exec(xmlContent);
|
|
172
210
|
if (match === null) break;
|
|
173
211
|
let url = match[1];
|
|
174
212
|
if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
|
|
@@ -227,12 +265,12 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
227
265
|
robotsResponse = null;
|
|
228
266
|
}
|
|
229
267
|
if (robotsResponse?.ok) {
|
|
230
|
-
const sitemapMatches = (await robotsResponse.text()).match(
|
|
268
|
+
const sitemapMatches = (await robotsResponse.text()).match(ROBOTS_SITEMAP_RE);
|
|
231
269
|
if (sitemapMatches && sitemapMatches.length > 0) {
|
|
232
270
|
progress.sitemap.found = sitemapMatches.length;
|
|
233
271
|
progress.sitemap.status = "processing";
|
|
234
272
|
onProgress?.(progress);
|
|
235
|
-
const robotsSitemaps = sitemapMatches.map((match) => match.replace(
|
|
273
|
+
const robotsSitemaps = sitemapMatches.map((match) => match.replace(ROBOTS_SITEMAP_PREFIX_RE, "").trim());
|
|
236
274
|
for (const sitemapUrl of robotsSitemaps) try {
|
|
237
275
|
const robotsUrls = await loadSitemapWithoutRetries(sitemapUrl);
|
|
238
276
|
sitemapAttempts.push({
|
|
@@ -403,17 +441,17 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
403
441
|
origin: pageOrigin
|
|
404
442
|
});
|
|
405
443
|
let md = "";
|
|
406
|
-
if (shouldProcessMarkdown
|
|
444
|
+
if (shouldProcessMarkdown) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
|
|
407
445
|
let filePath;
|
|
408
446
|
if (shouldProcessMarkdown && generateIndividualMd) {
|
|
409
447
|
const urlObj = new URL(request.loadedUrl);
|
|
410
|
-
const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(
|
|
448
|
+
const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"));
|
|
411
449
|
filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
|
|
412
450
|
const fileDir = dirname(filePath);
|
|
413
451
|
if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
|
|
414
452
|
await writeFile(filePath, md, "utf-8");
|
|
415
453
|
}
|
|
416
|
-
const isHomePage = request.loadedUrl.replace(
|
|
454
|
+
const isHomePage = request.loadedUrl.replace(URL_TRAILING_SLASH_RE, "") === homePageUrl.replace(URL_TRAILING_SLASH_RE, "");
|
|
417
455
|
if (shouldProcessMarkdown || isHomePage) {
|
|
418
456
|
const result = {
|
|
419
457
|
url: request.loadedUrl,
|
|
@@ -530,7 +568,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
530
568
|
onProgress?.(progress);
|
|
531
569
|
const contentResults = successfulResults.filter((result) => {
|
|
532
570
|
if (!result.content) return false;
|
|
533
|
-
return result.content.trim().replace(
|
|
571
|
+
return result.content.trim().replace(FRONTMATTER_BLOCK_RE, "").trim().length > 10;
|
|
534
572
|
});
|
|
535
573
|
const seenUrls = /* @__PURE__ */ new Set();
|
|
536
574
|
const llmsResult = await generateLlmsTxtArtifacts({
|
|
@@ -568,4 +606,5 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
568
606
|
await purgeDefaultStorages();
|
|
569
607
|
return results;
|
|
570
608
|
}
|
|
609
|
+
//#endregion
|
|
571
610
|
export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };
|
package/dist/cli.mjs
CHANGED
|
@@ -6,6 +6,7 @@ import { dirname, join, resolve } from "pathe";
|
|
|
6
6
|
import { withHttps } from "ufo";
|
|
7
7
|
import { fileURLToPath } from "node:url";
|
|
8
8
|
import { addDependency } from "nypm";
|
|
9
|
+
//#region src/playwright-utils.ts
|
|
9
10
|
async function checkPlaywrightInstallation() {
|
|
10
11
|
try {
|
|
11
12
|
await import("playwright");
|
|
@@ -59,6 +60,8 @@ async function isUseChromeSupported() {
|
|
|
59
60
|
} catch {}
|
|
60
61
|
return false;
|
|
61
62
|
}
|
|
63
|
+
//#endregion
|
|
64
|
+
//#region src/cli.ts
|
|
62
65
|
const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
|
|
63
66
|
const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
|
|
64
67
|
function checkOutputDirectoryPermissions(outputDir) {
|
|
@@ -488,4 +491,5 @@ main().catch((error) => {
|
|
|
488
491
|
p.log.error(`Unexpected error: ${error}`);
|
|
489
492
|
process.exit(1);
|
|
490
493
|
});
|
|
494
|
+
//#endregion
|
|
491
495
|
export {};
|
package/dist/index.mjs
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
|
|
2
2
|
import { writeFile } from "node:fs/promises";
|
|
3
3
|
import { basename, sep } from "pathe";
|
|
4
|
+
//#region src/llms-txt.ts
|
|
5
|
+
const ANCHOR_UNSAFE_CHARS_RE = /[^a-z0-9]/g;
|
|
4
6
|
async function generateLlmsTxt(options) {
|
|
5
7
|
const { siteName, description, results, outputPath } = options;
|
|
6
8
|
let content = `# ${siteName}\n\n`;
|
|
@@ -40,7 +42,7 @@ async function generateLlmsFullTxt(options) {
|
|
|
40
42
|
} catch {
|
|
41
43
|
title = result.title || result.url;
|
|
42
44
|
}
|
|
43
|
-
const anchor = title.toLowerCase().replace(/[^a-z0-9]/g, "-");
|
|
45
|
+
const anchor = title.toLowerCase().replace(ANCHOR_UNSAFE_CHARS_RE, "-");
|
|
44
46
|
content += `- [${title}](#${anchor})\n`;
|
|
45
47
|
}
|
|
46
48
|
content += `\n---\n\n`;
|
|
@@ -58,4 +60,5 @@ async function generateLlmsFullTxt(options) {
|
|
|
58
60
|
}
|
|
59
61
|
await writeFile(outputPath, content, "utf-8");
|
|
60
62
|
}
|
|
63
|
+
//#endregion
|
|
61
64
|
export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.16.0",
|
|
4
|
+
"version": "0.17.1",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -50,13 +50,13 @@
|
|
|
50
50
|
}
|
|
51
51
|
},
|
|
52
52
|
"dependencies": {
|
|
53
|
-
"@clack/prompts": "^1.0
|
|
53
|
+
"@clack/prompts": "^1.1.0",
|
|
54
54
|
"crawlee": "^3.16.0",
|
|
55
55
|
"nypm": "^0.6.5",
|
|
56
56
|
"pathe": "^2.0.3",
|
|
57
57
|
"picomatch": "^4.0.3",
|
|
58
58
|
"ufo": "^1.6.3",
|
|
59
|
-
"mdream": "0.16.0"
|
|
59
|
+
"mdream": "0.17.1"
|
|
60
60
|
},
|
|
61
61
|
"devDependencies": {
|
|
62
62
|
"@types/picomatch": "^4.0.2"
|