@mdream/crawl 0.15.2 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/{crawl-BInMcRnS.mjs → crawl.mjs} +12 -39
- package/dist/cli.mjs +5 -11
- package/dist/index.mjs +4 -8
- package/package.json +6 -6
|
@@ -9,12 +9,6 @@ import { dirname, join, normalize, resolve } from "pathe";
|
|
|
9
9
|
import { withHttps } from "ufo";
|
|
10
10
|
import picomatch from "picomatch";
|
|
11
11
|
import { extractionPlugin } from "mdream/plugins";
|
|
12
|
-
|
|
13
|
-
//#region src/glob-utils.ts
|
|
14
|
-
/**
|
|
15
|
-
* Parse a URL that may contain glob patterns
|
|
16
|
-
* Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
|
|
17
|
-
*/
|
|
18
12
|
function parseUrlPattern(input) {
|
|
19
13
|
if (!(input.includes("*") || input.includes("?") || input.includes("["))) return {
|
|
20
14
|
baseUrl: input,
|
|
@@ -35,9 +29,6 @@ function parseUrlPattern(input) {
|
|
|
35
29
|
throw new Error(`Invalid URL pattern: "${input}". Please provide a valid URL with glob patterns (e.g., "example.com/docs/*" or "https://example.com/api/**").`);
|
|
36
30
|
}
|
|
37
31
|
}
|
|
38
|
-
/**
|
|
39
|
-
* Check if a URL matches a glob pattern
|
|
40
|
-
*/
|
|
41
32
|
function matchesGlobPattern(url, parsedPattern) {
|
|
42
33
|
if (!parsedPattern.isGlob) return true;
|
|
43
34
|
try {
|
|
@@ -54,10 +45,6 @@ function matchesGlobPattern(url, parsedPattern) {
|
|
|
54
45
|
return false;
|
|
55
46
|
}
|
|
56
47
|
}
|
|
57
|
-
/**
|
|
58
|
-
* Get the starting URL for crawling from a glob pattern
|
|
59
|
-
* For https://nuxtseo.com/docs/**, we want to start at https://nuxtseo.com
|
|
60
|
-
*/
|
|
61
48
|
function getStartingUrl(parsedPattern) {
|
|
62
49
|
if (!parsedPattern.isGlob) return withHttps(parsedPattern.baseUrl);
|
|
63
50
|
const pattern = parsedPattern.pattern;
|
|
@@ -68,9 +55,6 @@ function getStartingUrl(parsedPattern) {
|
|
|
68
55
|
const pathBeforeGlob = lastSlash >= 0 ? beforeGlob.substring(0, lastSlash + 1) : "/";
|
|
69
56
|
return withHttps(parsedPattern.baseUrl + pathBeforeGlob);
|
|
70
57
|
}
|
|
71
|
-
/**
|
|
72
|
-
* Check if a URL should be excluded based on exclude patterns
|
|
73
|
-
*/
|
|
74
58
|
function isUrlExcluded(url, excludePatterns) {
|
|
75
59
|
if (!excludePatterns || excludePatterns.length === 0) return false;
|
|
76
60
|
try {
|
|
@@ -89,9 +73,6 @@ function isUrlExcluded(url, excludePatterns) {
|
|
|
89
73
|
return false;
|
|
90
74
|
}
|
|
91
75
|
}
|
|
92
|
-
/**
|
|
93
|
-
* Validate glob pattern syntax
|
|
94
|
-
*/
|
|
95
76
|
function validateGlobPattern(pattern) {
|
|
96
77
|
try {
|
|
97
78
|
parseUrlPattern(pattern);
|
|
@@ -100,9 +81,6 @@ function validateGlobPattern(pattern) {
|
|
|
100
81
|
return `Invalid glob pattern: ${error instanceof Error ? error.message : error}`;
|
|
101
82
|
}
|
|
102
83
|
}
|
|
103
|
-
|
|
104
|
-
//#endregion
|
|
105
|
-
//#region src/metadata-extractor.ts
|
|
106
84
|
function extractMetadata(html, url) {
|
|
107
85
|
const links = [];
|
|
108
86
|
let title = "";
|
|
@@ -155,9 +133,6 @@ function extractMetadata(html, url) {
|
|
|
155
133
|
})
|
|
156
134
|
};
|
|
157
135
|
}
|
|
158
|
-
|
|
159
|
-
//#endregion
|
|
160
|
-
//#region src/crawl.ts
|
|
161
136
|
async function loadSitemapWithoutRetries(sitemapUrl) {
|
|
162
137
|
const controller = new AbortController();
|
|
163
138
|
const timeoutId = setTimeout(() => controller.abort(), 1e4);
|
|
@@ -247,7 +222,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
247
222
|
headers: { "User-Agent": "mdream-crawler/1.0" }
|
|
248
223
|
});
|
|
249
224
|
clearTimeout(robotsTimeoutId);
|
|
250
|
-
} catch
|
|
225
|
+
} catch {
|
|
251
226
|
clearTimeout(robotsTimeoutId);
|
|
252
227
|
robotsResponse = null;
|
|
253
228
|
}
|
|
@@ -264,7 +239,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
264
239
|
url: sitemapUrl,
|
|
265
240
|
success: true
|
|
266
241
|
});
|
|
267
|
-
if (patterns.some((p
|
|
242
|
+
if (patterns.some((p) => p.isGlob)) {
|
|
268
243
|
const filteredUrls = robotsUrls.filter((url) => {
|
|
269
244
|
return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
270
245
|
});
|
|
@@ -300,7 +275,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
300
275
|
url: mainSitemapUrl,
|
|
301
276
|
success: true
|
|
302
277
|
});
|
|
303
|
-
if (patterns.some((p
|
|
278
|
+
if (patterns.some((p) => p.isGlob)) {
|
|
304
279
|
const filteredUrls = sitemapUrls.filter((url) => {
|
|
305
280
|
return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
306
281
|
});
|
|
@@ -339,7 +314,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
339
314
|
url: sitemapUrl,
|
|
340
315
|
success: true
|
|
341
316
|
});
|
|
342
|
-
if (patterns.some((p
|
|
317
|
+
if (patterns.some((p) => p.isGlob)) {
|
|
343
318
|
const filteredUrls = altUrls.filter((url) => {
|
|
344
319
|
return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
345
320
|
});
|
|
@@ -360,11 +335,11 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
360
335
|
break;
|
|
361
336
|
}
|
|
362
337
|
}
|
|
363
|
-
} catch (error
|
|
338
|
+
} catch (error) {
|
|
364
339
|
sitemapAttempts.push({
|
|
365
340
|
url: sitemapUrl,
|
|
366
341
|
success: false,
|
|
367
|
-
error: error
|
|
342
|
+
error: error instanceof Error ? error.message : "Unknown error"
|
|
368
343
|
});
|
|
369
344
|
}
|
|
370
345
|
}
|
|
@@ -396,7 +371,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
396
371
|
const processedUrls = /* @__PURE__ */ new Set();
|
|
397
372
|
const shouldCrawlUrl = (url) => {
|
|
398
373
|
if (isUrlExcluded(url, exclude)) return false;
|
|
399
|
-
if (!patterns.some((p
|
|
374
|
+
if (!patterns.some((p) => p.isGlob)) return true;
|
|
400
375
|
return patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
401
376
|
};
|
|
402
377
|
const createRequestHandler = (crawlerType) => {
|
|
@@ -543,10 +518,10 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
543
518
|
onProgress?.(progress);
|
|
544
519
|
const successfulResults = results.filter((r) => r.success);
|
|
545
520
|
const firstUrl = new URL(withHttps(urls[0]));
|
|
546
|
-
const origin
|
|
521
|
+
const origin = firstUrl.origin;
|
|
547
522
|
const homePageResult = successfulResults.find((r) => {
|
|
548
523
|
const resultUrl = new URL(withHttps(r.url));
|
|
549
|
-
return resultUrl.href === origin
|
|
524
|
+
return resultUrl.href === origin || resultUrl.href === `${origin}/`;
|
|
550
525
|
});
|
|
551
526
|
const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
|
|
552
527
|
const description = descriptionOverride || homePageResult?.metadata?.description || successfulResults[0]?.metadata?.description;
|
|
@@ -555,7 +530,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
555
530
|
onProgress?.(progress);
|
|
556
531
|
const contentResults = successfulResults.filter((result) => {
|
|
557
532
|
if (!result.content) return false;
|
|
558
|
-
return result.content.trim().replace(
|
|
533
|
+
return result.content.trim().replace(/^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/, "").trim().length > 10;
|
|
559
534
|
});
|
|
560
535
|
const seenUrls = /* @__PURE__ */ new Set();
|
|
561
536
|
const llmsResult = await generateLlmsTxtArtifacts({
|
|
@@ -572,7 +547,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
572
547
|
})),
|
|
573
548
|
siteName,
|
|
574
549
|
description,
|
|
575
|
-
origin: origin
|
|
550
|
+
origin: origin || firstUrl.origin,
|
|
576
551
|
generateFull: generateLlmsFullTxt,
|
|
577
552
|
outputDir
|
|
578
553
|
});
|
|
@@ -593,6 +568,4 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
593
568
|
await purgeDefaultStorages();
|
|
594
569
|
return results;
|
|
595
570
|
}
|
|
596
|
-
|
|
597
|
-
//#endregion
|
|
598
|
-
export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };
|
|
571
|
+
export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };
|
package/dist/cli.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { n as parseUrlPattern, r as validateGlobPattern, t as crawlAndGenerate } from "./_chunks/crawl-BInMcRnS.mjs";
|
|
1
|
+
import { n as parseUrlPattern, r as validateGlobPattern, t as crawlAndGenerate } from "./_chunks/crawl.mjs";
|
|
2
2
|
import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
|
|
3
3
|
import * as p from "@clack/prompts";
|
|
4
4
|
import { PlaywrightCrawler } from "crawlee";
|
|
@@ -6,8 +6,6 @@ import { dirname, join, resolve } from "pathe";
|
|
|
6
6
|
import { withHttps } from "ufo";
|
|
7
7
|
import { fileURLToPath } from "node:url";
|
|
8
8
|
import { addDependency } from "nypm";
|
|
9
|
-
|
|
10
|
-
//#region src/playwright-utils.ts
|
|
11
9
|
async function checkPlaywrightInstallation() {
|
|
12
10
|
try {
|
|
13
11
|
await import("playwright");
|
|
@@ -61,9 +59,6 @@ async function isUseChromeSupported() {
|
|
|
61
59
|
} catch {}
|
|
62
60
|
return false;
|
|
63
61
|
}
|
|
64
|
-
|
|
65
|
-
//#endregion
|
|
66
|
-
//#region src/cli.ts
|
|
67
62
|
const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
|
|
68
63
|
const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
|
|
69
64
|
function checkOutputDirectoryPermissions(outputDir) {
|
|
@@ -106,8 +101,8 @@ async function interactiveCrawl() {
|
|
|
106
101
|
placeholder: "e.g. docs.example.com, site.com/docs/**",
|
|
107
102
|
validate: (value) => {
|
|
108
103
|
if (!value) return "Please enter at least one URL";
|
|
109
|
-
const urls
|
|
110
|
-
for (const url of urls
|
|
104
|
+
const urls = value.split(",").map((url) => url.trim());
|
|
105
|
+
for (const url of urls) {
|
|
111
106
|
const globError = validateGlobPattern(url);
|
|
112
107
|
if (globError) return globError;
|
|
113
108
|
try {
|
|
@@ -210,7 +205,7 @@ async function interactiveCrawl() {
|
|
|
210
205
|
inferredOrigin && `Origin: ${inferredOrigin}`
|
|
211
206
|
].filter(Boolean);
|
|
212
207
|
p.note(summary.join("\n"), "Crawl Configuration");
|
|
213
|
-
if (advancedOptions.skipSitemap && globPatterns.some((p
|
|
208
|
+
if (advancedOptions.skipSitemap && globPatterns.some((p) => p.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
|
|
214
209
|
return {
|
|
215
210
|
urls,
|
|
216
211
|
outputDir: resolve(outputDir),
|
|
@@ -493,5 +488,4 @@ main().catch((error) => {
|
|
|
493
488
|
p.log.error(`Unexpected error: ${error}`);
|
|
494
489
|
process.exit(1);
|
|
495
490
|
});
|
|
496
|
-
|
|
497
|
-
//#endregion
|
|
491
|
+
export {};
|
package/dist/index.mjs
CHANGED
|
@@ -1,8 +1,6 @@
|
|
|
1
|
-
import { t as crawlAndGenerate } from "./_chunks/crawl-BInMcRnS.mjs";
|
|
1
|
+
import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
|
|
2
2
|
import { writeFile } from "node:fs/promises";
|
|
3
3
|
import { basename, sep } from "pathe";
|
|
4
|
-
|
|
5
|
-
//#region src/llms-txt.ts
|
|
6
4
|
async function generateLlmsTxt(options) {
|
|
7
5
|
const { siteName, description, results, outputPath } = options;
|
|
8
6
|
let content = `# ${siteName}\n\n`;
|
|
@@ -22,8 +20,8 @@ async function generateLlmsTxt(options) {
|
|
|
22
20
|
const linkPath = (mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath)).split(sep).join("/");
|
|
23
21
|
content += `- [${title}](md/${linkPath}): ${result.url}\n`;
|
|
24
22
|
} else {
|
|
25
|
-
const description
|
|
26
|
-
content += `- [${title}](${result.url})${description
|
|
23
|
+
const description = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
|
|
24
|
+
content += `- [${title}](${result.url})${description ? `: ${description}` : ""}\n`;
|
|
27
25
|
}
|
|
28
26
|
}
|
|
29
27
|
}
|
|
@@ -60,6 +58,4 @@ async function generateLlmsFullTxt(options) {
|
|
|
60
58
|
}
|
|
61
59
|
await writeFile(outputPath, content, "utf-8");
|
|
62
60
|
}
|
|
63
|
-
|
|
64
|
-
//#endregion
|
|
65
|
-
export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
|
|
61
|
+
export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.15.2",
|
|
4
|
+
"version": "0.16.0",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -50,13 +50,13 @@
|
|
|
50
50
|
}
|
|
51
51
|
},
|
|
52
52
|
"dependencies": {
|
|
53
|
-
"@clack/prompts": "^0.
|
|
54
|
-
"crawlee": "^3.
|
|
55
|
-
"nypm": "^0.6.
|
|
53
|
+
"@clack/prompts": "^1.0.1",
|
|
54
|
+
"crawlee": "^3.16.0",
|
|
55
|
+
"nypm": "^0.6.5",
|
|
56
56
|
"pathe": "^2.0.3",
|
|
57
57
|
"picomatch": "^4.0.3",
|
|
58
|
-
"ufo": "^1.6.
|
|
59
|
-
"mdream": "0.
|
|
58
|
+
"ufo": "^1.6.3",
|
|
59
|
+
"mdream": "0.16.0"
|
|
60
60
|
},
|
|
61
61
|
"devDependencies": {
|
|
62
62
|
"@types/picomatch": "^4.0.2"
|