@mdream/crawl 0.8.5 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.mjs +78 -66
- package/package.json +3 -3
package/dist/cli.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-BtuYX2_u.mjs";
|
|
2
|
-
import { readFileSync } from "node:fs";
|
|
2
|
+
import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
|
|
3
3
|
import * as p$1 from "@clack/prompts";
|
|
4
4
|
import * as p from "@clack/prompts";
|
|
5
5
|
import { PlaywrightCrawler } from "crawlee";
|
|
@@ -71,6 +71,38 @@ const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
|
71
71
|
const packageJsonPath = join(__dirname, "..", "package.json");
|
|
72
72
|
const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
|
|
73
73
|
const version = packageJson.version;
|
|
74
|
+
function checkOutputDirectoryPermissions(outputDir) {
|
|
75
|
+
try {
|
|
76
|
+
mkdirSync(outputDir, { recursive: true });
|
|
77
|
+
accessSync(outputDir, constants.W_OK);
|
|
78
|
+
const testFile = join(outputDir, ".mdream-test");
|
|
79
|
+
try {
|
|
80
|
+
writeFileSync(testFile, "test");
|
|
81
|
+
unlinkSync(testFile);
|
|
82
|
+
} catch (err) {
|
|
83
|
+
return {
|
|
84
|
+
success: false,
|
|
85
|
+
error: `Cannot write to output directory: ${err instanceof Error ? err.message : "Unknown error"}`
|
|
86
|
+
};
|
|
87
|
+
}
|
|
88
|
+
return { success: true };
|
|
89
|
+
} catch (err) {
|
|
90
|
+
if (err instanceof Error) {
|
|
91
|
+
if (err.message.includes("EACCES")) return {
|
|
92
|
+
success: false,
|
|
93
|
+
error: `Permission denied: Cannot write to output directory '${outputDir}'. Please check permissions or run with appropriate privileges.`
|
|
94
|
+
};
|
|
95
|
+
return {
|
|
96
|
+
success: false,
|
|
97
|
+
error: `Failed to access output directory: ${err.message}`
|
|
98
|
+
};
|
|
99
|
+
}
|
|
100
|
+
return {
|
|
101
|
+
success: false,
|
|
102
|
+
error: "Failed to access output directory"
|
|
103
|
+
};
|
|
104
|
+
}
|
|
105
|
+
}
|
|
74
106
|
async function interactiveCrawl() {
|
|
75
107
|
console.clear();
|
|
76
108
|
p.intro(`☁️ @mdream/crawl v${version}`);
|
|
@@ -109,61 +141,44 @@ async function interactiveCrawl() {
|
|
|
109
141
|
return null;
|
|
110
142
|
}
|
|
111
143
|
const outputDir = "output";
|
|
112
|
-
const crawlerOptions = await p.group({
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
}),
|
|
125
|
-
maxDepth: () => p.text({
|
|
126
|
-
message: "Clicks to page (crawl depth):",
|
|
127
|
-
placeholder: "3",
|
|
128
|
-
defaultValue: "3",
|
|
129
|
-
validate: (value) => {
|
|
130
|
-
const num = Number.parseInt(value);
|
|
131
|
-
if (Number.isNaN(num) || num < 1 || num > 10) return "Depth must be between 1 and 10";
|
|
132
|
-
}
|
|
133
|
-
})
|
|
134
|
-
}, { onCancel: () => {
|
|
144
|
+
const crawlerOptions = await p.group({ driver: () => p.select({
|
|
145
|
+
message: "Select crawler driver:",
|
|
146
|
+
options: [{
|
|
147
|
+
value: "http",
|
|
148
|
+
label: "HTTP Crawler (Fast, for static content)",
|
|
149
|
+
hint: "Recommended"
|
|
150
|
+
}, {
|
|
151
|
+
value: "playwright",
|
|
152
|
+
label: "Playwright (Slower, supports JavaScript)"
|
|
153
|
+
}],
|
|
154
|
+
initialValue: "http"
|
|
155
|
+
}) }, { onCancel: () => {
|
|
135
156
|
p.cancel("Operation cancelled.");
|
|
136
157
|
process.exit(0);
|
|
137
158
|
} });
|
|
138
|
-
const advancedOptions = await p.group({
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
}),
|
|
162
|
-
verbose: () => p.confirm({
|
|
163
|
-
message: "Enable verbose logging?",
|
|
164
|
-
initialValue: false
|
|
165
|
-
})
|
|
166
|
-
}, { onCancel: () => {
|
|
159
|
+
const advancedOptions = await p.group({ outputFormats: () => p.multiselect({
|
|
160
|
+
message: "Select output formats:",
|
|
161
|
+
options: [
|
|
162
|
+
{
|
|
163
|
+
value: "llms.txt",
|
|
164
|
+
label: "llms.txt (basic format)",
|
|
165
|
+
hint: "Recommended"
|
|
166
|
+
},
|
|
167
|
+
{
|
|
168
|
+
value: "llms-full.txt",
|
|
169
|
+
label: "llms-full.txt (extended format)"
|
|
170
|
+
},
|
|
171
|
+
{
|
|
172
|
+
value: "markdown",
|
|
173
|
+
label: "Individual Markdown files"
|
|
174
|
+
}
|
|
175
|
+
],
|
|
176
|
+
initialValues: [
|
|
177
|
+
"llms.txt",
|
|
178
|
+
"llms-full.txt",
|
|
179
|
+
"markdown"
|
|
180
|
+
]
|
|
181
|
+
}) }, { onCancel: () => {
|
|
167
182
|
p.cancel("Operation cancelled.");
|
|
168
183
|
process.exit(0);
|
|
169
184
|
} });
|
|
@@ -189,34 +204,25 @@ async function interactiveCrawl() {
|
|
|
189
204
|
`Output: ${outputDir}`,
|
|
190
205
|
`Driver: ${crawlerOptions.driver}`,
|
|
191
206
|
`Max pages: Unlimited`,
|
|
192
|
-
`Follow links: Yes (depth ${crawlerOptions.maxDepth})`,
|
|
207
|
+
`Follow links: Yes (depth 3)`,
|
|
193
208
|
`Output formats: ${outputFormats.join(", ")}`,
|
|
194
209
|
`Sitemap discovery: Automatic`,
|
|
195
|
-
inferredOrigin && `Origin: ${inferredOrigin}`,
|
|
196
|
-
advancedOptions.verbose && `Verbose logging: Enabled`
|
|
210
|
+
inferredOrigin && `Origin: ${inferredOrigin}`
|
|
197
211
|
].filter(Boolean);
|
|
198
212
|
p.note(summary.join("\n"), "Crawl Configuration");
|
|
199
|
-
const shouldProceed = await p.confirm({
|
|
200
|
-
message: "Start crawling?",
|
|
201
|
-
initialValue: true
|
|
202
|
-
});
|
|
203
|
-
if (p.isCancel(shouldProceed) || !shouldProceed) {
|
|
204
|
-
p.cancel("Crawl cancelled.");
|
|
205
|
-
return null;
|
|
206
|
-
}
|
|
207
213
|
return {
|
|
208
214
|
urls,
|
|
209
215
|
outputDir: resolve(outputDir),
|
|
210
216
|
driver: crawlerOptions.driver,
|
|
211
217
|
maxRequestsPerCrawl: Number.MAX_SAFE_INTEGER,
|
|
212
218
|
followLinks: true,
|
|
213
|
-
maxDepth: Number.parseInt(crawlerOptions.maxDepth),
|
|
214
219
|
generateLlmsTxt: advancedOptions.outputFormats.includes("llms.txt"),
|
|
215
220
|
generateLlmsFullTxt: advancedOptions.outputFormats.includes("llms-full.txt"),
|
|
216
221
|
generateIndividualMd: advancedOptions.outputFormats.includes("markdown"),
|
|
217
222
|
origin: inferredOrigin,
|
|
218
223
|
globPatterns,
|
|
219
|
-
verbose: advancedOptions.verbose
|
|
224
|
+
verbose: false,
|
|
225
|
+
maxDepth: 3
|
|
220
226
|
};
|
|
221
227
|
}
|
|
222
228
|
async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds) {
|
|
@@ -413,6 +419,12 @@ async function main() {
|
|
|
413
419
|
p.note(summary.join("\n"), "Configuration");
|
|
414
420
|
} else options = await interactiveCrawl();
|
|
415
421
|
if (!options) process.exit(0);
|
|
422
|
+
const permCheck = checkOutputDirectoryPermissions(options.outputDir);
|
|
423
|
+
if (!permCheck.success) {
|
|
424
|
+
p.log.error(permCheck.error);
|
|
425
|
+
if (permCheck.error?.includes("Permission denied")) p.log.info("Tip: Try running with elevated privileges (e.g., sudo) or change the output directory permissions.");
|
|
426
|
+
process.exit(1);
|
|
427
|
+
}
|
|
416
428
|
if (options.driver === "playwright") {
|
|
417
429
|
const chromeSupported = await isUseChromeSupported();
|
|
418
430
|
if (chromeSupported) {
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.8.5",
|
|
4
|
+
"version": "0.9.0",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -46,12 +46,12 @@
|
|
|
46
46
|
},
|
|
47
47
|
"dependencies": {
|
|
48
48
|
"@clack/prompts": "^0.11.0",
|
|
49
|
-
"crawlee": "^3.
|
|
49
|
+
"crawlee": "^3.14.0",
|
|
50
50
|
"nypm": "^0.6.0",
|
|
51
51
|
"pathe": "^2.0.3",
|
|
52
52
|
"picomatch": "^4.0.3",
|
|
53
53
|
"ufo": "^1.6.1",
|
|
54
|
-
"mdream": "0.
|
|
54
|
+
"mdream": "0.9.0"
|
|
55
55
|
},
|
|
56
56
|
"devDependencies": {
|
|
57
57
|
"@types/picomatch": "^4.0.1"
|