@mdream/crawl 0.8.4 → 0.9.0
This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
@@ -5,24 +5,10 @@ import { HttpCrawler, PlaywrightCrawler, log, purgeDefaultStorages } from "crawlee";
 import { generateLlmsTxtArtifacts, htmlToMarkdown } from "mdream";
 import { withMinimalPreset } from "mdream/preset/minimal";
 import { dirname, join, normalize, resolve } from "pathe";
+import { withHttps } from "ufo";
 import picomatch from "picomatch";
 import { extractionPlugin } from "mdream/plugins";

-//#region ../../node_modules/.pnpm/ufo@1.6.1/node_modules/ufo/dist/index.mjs
-const r = String.fromCharCode;
-const PROTOCOL_REGEX = /^[\s\w\0+.-]{2,}:([/\\]{2})?/;
-function withHttps(input) {
-	return withProtocol(input, "https://");
-}
-function withProtocol(input, protocol) {
-	let match = input.match(PROTOCOL_REGEX);
-	if (!match) match = input.match(/^\/{2,}/);
-	if (!match) return protocol + input;
-	return protocol + input.slice(match[0].length);
-}
-const protocolRelative = Symbol.for("ufo:protocolRelative");
-
-//#endregion
 //#region src/glob-utils.ts
 /**
 * Parse a URL that may contain glob patterns
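
In 0.9.0 the withHttps helper is imported from ufo (now a declared dependency) instead of being inlined by the bundler. Its behavior is visible in the removed code above: prepend "https://", or swap it in for an existing protocol. A minimal sketch of what callers can expect, with return values inferred from that removed implementation:

	import { withHttps } from "ufo";

	withHttps("example.com");        // "https://example.com"
	withHttps("http://example.com"); // "https://example.com" (protocol replaced)
	withHttps("//example.com");      // "https://example.com" (protocol-relative input)
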
@@ -467,14 +453,14 @@ async function crawlAndGenerate(options, onProgress) {
 	await crawler.run(initialRequests);
 	progress.crawling.status = "completed";
 	onProgress?.(progress);
-	if (results.some((r
+	if (results.some((r) => r.success)) {
 		progress.generation.status = "generating";
 		onProgress?.(progress);
-		const successfulResults = results.filter((r
+		const successfulResults = results.filter((r) => r.success);
 		const firstUrl = new URL(withHttps(urls[0]));
 		const origin$1 = firstUrl.origin;
-		const homePageResult = successfulResults.find((r
-		const resultUrl = new URL(withHttps(r
+		const homePageResult = successfulResults.find((r) => {
+			const resultUrl = new URL(withHttps(r.url));
 			return resultUrl.href === origin$1 || resultUrl.href === `${origin$1}/`;
 		});
 		const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
@@ -528,4 +514,4 @@ async function crawlAndGenerate(options, onProgress) {
 }

 //#endregion
-export { crawlAndGenerate, parseUrlPattern, validateGlobPattern
+export { crawlAndGenerate, parseUrlPattern, validateGlobPattern };
package/dist/cli.mjs
CHANGED
@@ -1,9 +1,10 @@
-import { crawlAndGenerate, parseUrlPattern, validateGlobPattern
-import { readFileSync } from "node:fs";
+import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-BtuYX2_u.mjs";
+import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
 import * as p$1 from "@clack/prompts";
 import * as p from "@clack/prompts";
 import { PlaywrightCrawler } from "crawlee";
 import { dirname, join, resolve } from "pathe";
+import { withHttps } from "ufo";
 import { fileURLToPath } from "node:url";
 import { addDependency } from "nypm";

@@ -70,6 +71,38 @@ const __dirname = dirname(fileURLToPath(import.meta.url));
 const packageJsonPath = join(__dirname, "..", "package.json");
 const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
 const version = packageJson.version;
+function checkOutputDirectoryPermissions(outputDir) {
+	try {
+		mkdirSync(outputDir, { recursive: true });
+		accessSync(outputDir, constants.W_OK);
+		const testFile = join(outputDir, ".mdream-test");
+		try {
+			writeFileSync(testFile, "test");
+			unlinkSync(testFile);
+		} catch (err) {
+			return {
+				success: false,
+				error: `Cannot write to output directory: ${err instanceof Error ? err.message : "Unknown error"}`
+			};
+		}
+		return { success: true };
+	} catch (err) {
+		if (err instanceof Error) {
+			if (err.message.includes("EACCES")) return {
+				success: false,
+				error: `Permission denied: Cannot write to output directory '${outputDir}'. Please check permissions or run with appropriate privileges.`
+			};
+			return {
+				success: false,
+				error: `Failed to access output directory: ${err.message}`
+			};
+		}
+		return {
+			success: false,
+			error: "Failed to access output directory"
+		};
+	}
+}
 async function interactiveCrawl() {
 	console.clear();
 	p.intro(`☁️ @mdream/crawl v${version}`);
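
The new checkOutputDirectoryPermissions helper probes writability up front: it creates the directory, checks W_OK, and round-trips a throwaway .mdream-test file, returning a result object instead of throwing. A minimal usage sketch against the shape above (the "output" path is illustrative; main() below calls it with the resolved output directory):

	const permCheck = checkOutputDirectoryPermissions("output");
	if (!permCheck.success) {
		// error carries a human-readable message, e.g. the EACCES hint above
		console.error(permCheck.error);
		process.exit(1);
	}
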
@@ -108,61 +141,44 @@ async function interactiveCrawl() {
 	return null;
 }
 const outputDir = "output";
-const crawlerOptions = await p.group({
-
-
-
-
-
-
-
-
-
-
-
-	}),
-	maxDepth: () => p.text({
-		message: "Clicks to page (crawl depth):",
-		placeholder: "3",
-		defaultValue: "3",
-		validate: (value) => {
-			const num = Number.parseInt(value);
-			if (Number.isNaN(num) || num < 1 || num > 10) return "Depth must be between 1 and 10";
-		}
-	})
-}, { onCancel: () => {
+const crawlerOptions = await p.group({ driver: () => p.select({
+	message: "Select crawler driver:",
+	options: [{
+		value: "http",
+		label: "HTTP Crawler (Fast, for static content)",
+		hint: "Recommended"
+	}, {
+		value: "playwright",
+		label: "Playwright (Slower, supports JavaScript)"
+	}],
+	initialValue: "http"
+}) }, { onCancel: () => {
 	p.cancel("Operation cancelled.");
 	process.exit(0);
 } });
-const advancedOptions = await p.group({
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-	}),
-	verbose: () => p.confirm({
-		message: "Enable verbose logging?",
-		initialValue: false
-	})
-}, { onCancel: () => {
+const advancedOptions = await p.group({ outputFormats: () => p.multiselect({
+	message: "Select output formats:",
+	options: [
+		{
+			value: "llms.txt",
+			label: "llms.txt (basic format)",
+			hint: "Recommended"
+		},
+		{
+			value: "llms-full.txt",
+			label: "llms-full.txt (extended format)"
+		},
+		{
+			value: "markdown",
+			label: "Individual Markdown files"
+		}
+	],
+	initialValues: [
+		"llms.txt",
+		"llms-full.txt",
+		"markdown"
+	]
+}) }, { onCancel: () => {
 	p.cancel("Operation cancelled.");
 	process.exit(0);
 } });
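
Note that the interactive flow now prompts only for the crawler driver and the output formats; crawl depth and verbose logging are no longer asked for (they are fixed in the next hunk). Since each p.group call has a single field, the resolved values are one-key objects; a sketch assuming the defaults shown above are accepted:

	const crawlerOptions = { driver: "http" };
	const advancedOptions = { outputFormats: ["llms.txt", "llms-full.txt", "markdown"] };
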
@@ -188,34 +204,25 @@ async function interactiveCrawl() {
 		`Output: ${outputDir}`,
 		`Driver: ${crawlerOptions.driver}`,
 		`Max pages: Unlimited`,
-		`Follow links: Yes (depth
+		`Follow links: Yes (depth 3)`,
 		`Output formats: ${outputFormats.join(", ")}`,
 		`Sitemap discovery: Automatic`,
-		inferredOrigin && `Origin: ${inferredOrigin}
-		advancedOptions.verbose && `Verbose logging: Enabled`
+		inferredOrigin && `Origin: ${inferredOrigin}`
 	].filter(Boolean);
 	p.note(summary.join("\n"), "Crawl Configuration");
-	const shouldProceed = await p.confirm({
-		message: "Start crawling?",
-		initialValue: true
-	});
-	if (p.isCancel(shouldProceed) || !shouldProceed) {
-		p.cancel("Crawl cancelled.");
-		return null;
-	}
 	return {
 		urls,
 		outputDir: resolve(outputDir),
 		driver: crawlerOptions.driver,
 		maxRequestsPerCrawl: Number.MAX_SAFE_INTEGER,
 		followLinks: true,
-		maxDepth: Number.parseInt(crawlerOptions.maxDepth),
 		generateLlmsTxt: advancedOptions.outputFormats.includes("llms.txt"),
 		generateLlmsFullTxt: advancedOptions.outputFormats.includes("llms-full.txt"),
 		generateIndividualMd: advancedOptions.outputFormats.includes("markdown"),
 		origin: inferredOrigin,
 		globPatterns,
-		verbose:
+		verbose: false,
+		maxDepth: 3
 	};
 }
 async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds) {
@@ -412,6 +419,12 @@ async function main() {
 	p.note(summary.join("\n"), "Configuration");
 	} else options = await interactiveCrawl();
 	if (!options) process.exit(0);
+	const permCheck = checkOutputDirectoryPermissions(options.outputDir);
+	if (!permCheck.success) {
+		p.log.error(permCheck.error);
+		if (permCheck.error?.includes("Permission denied")) p.log.info("Tip: Try running with elevated privileges (e.g., sudo) or change the output directory permissions.");
+		process.exit(1);
+	}
 	if (options.driver === "playwright") {
 		const chromeSupported = await isUseChromeSupported();
 		if (chromeSupported) {
package/dist/index.mjs
CHANGED
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
 	"name": "@mdream/crawl",
 	"type": "module",
-	"version": "0.8.4",
+	"version": "0.9.0",
 	"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
 	"author": {
 		"name": "Harlan Wilton",
@@ -46,11 +46,12 @@
 	},
 	"dependencies": {
 		"@clack/prompts": "^0.11.0",
-		"crawlee": "^3.
+		"crawlee": "^3.14.0",
 		"nypm": "^0.6.0",
 		"pathe": "^2.0.3",
 		"picomatch": "^4.0.3",
-		"
+		"ufo": "^1.6.1",
+		"mdream": "0.9.0"
 	},
 	"devDependencies": {
 		"@types/picomatch": "^4.0.1"