@mdream/crawl 0.8.5 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/cli.mjs +78 -66
  2. package/package.json +3 -3
package/dist/cli.mjs CHANGED
@@ -1,5 +1,5 @@
  import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-BtuYX2_u.mjs";
- import { readFileSync } from "node:fs";
+ import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
  import * as p$1 from "@clack/prompts";
  import * as p from "@clack/prompts";
  import { PlaywrightCrawler } from "crawlee";
@@ -71,6 +71,38 @@ const __dirname = dirname(fileURLToPath(import.meta.url));
  const packageJsonPath = join(__dirname, "..", "package.json");
  const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
  const version = packageJson.version;
+ function checkOutputDirectoryPermissions(outputDir) {
+ try {
+ mkdirSync(outputDir, { recursive: true });
+ accessSync(outputDir, constants.W_OK);
+ const testFile = join(outputDir, ".mdream-test");
+ try {
+ writeFileSync(testFile, "test");
+ unlinkSync(testFile);
+ } catch (err) {
+ return {
+ success: false,
+ error: `Cannot write to output directory: ${err instanceof Error ? err.message : "Unknown error"}`
+ };
+ }
+ return { success: true };
+ } catch (err) {
+ if (err instanceof Error) {
+ if (err.message.includes("EACCES")) return {
+ success: false,
+ error: `Permission denied: Cannot write to output directory '${outputDir}'. Please check permissions or run with appropriate privileges.`
+ };
+ return {
+ success: false,
+ error: `Failed to access output directory: ${err.message}`
+ };
+ }
+ return {
+ success: false,
+ error: "Failed to access output directory"
+ };
+ }
+ }
  async function interactiveCrawl() {
  console.clear();
  p.intro(`☁️ @mdream/crawl v${version}`);
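The hunk above adds a write-permission preflight: checkOutputDirectoryPermissions creates the output directory recursively, checks it with accessSync(outputDir, constants.W_OK), then round-trips a throwaway ".mdream-test" file, returning { success: true } or { success: false, error }. Below is a minimal standalone sketch of the same probe pattern, not the package's code: the canWriteTo name and the node:path imports are illustrative (the CLI uses "pathe"), and the "output" path is simply the interactive flow's default directory. The real call site appears in the main() hunk further down.

import { accessSync, constants, mkdirSync, unlinkSync, writeFileSync } from "node:fs";
import { join, resolve } from "node:path";

// Sketch only: mirrors the diff's probe pattern (mkdir -p, W_OK check, write/delete round trip).
function canWriteTo(outputDir) {
  try {
    mkdirSync(outputDir, { recursive: true });
    accessSync(outputDir, constants.W_OK);
    const probe = join(outputDir, ".mdream-test");
    writeFileSync(probe, "test");
    unlinkSync(probe);
    return true;
  } catch {
    return false;
  }
}

// Illustrative usage: fail fast before doing any work.
if (!canWriteTo(resolve("output"))) {
  console.error("Cannot write to output directory");
  process.exit(1);
}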
@@ -109,61 +141,44 @@ async function interactiveCrawl() {
  return null;
  }
  const outputDir = "output";
- const crawlerOptions = await p.group({
- driver: () => p.select({
- message: "Select crawler driver:",
- options: [{
- value: "http",
- label: "HTTP Crawler (Fast, for static content)",
- hint: "Recommended"
- }, {
- value: "playwright",
- label: "Playwright (Slower, supports JavaScript)"
- }],
- initialValue: "http"
- }),
- maxDepth: () => p.text({
- message: "Clicks to page (crawl depth):",
- placeholder: "3",
- defaultValue: "3",
- validate: (value) => {
- const num = Number.parseInt(value);
- if (Number.isNaN(num) || num < 1 || num > 10) return "Depth must be between 1 and 10";
- }
- })
- }, { onCancel: () => {
+ const crawlerOptions = await p.group({ driver: () => p.select({
+ message: "Select crawler driver:",
+ options: [{
+ value: "http",
+ label: "HTTP Crawler (Fast, for static content)",
+ hint: "Recommended"
+ }, {
+ value: "playwright",
+ label: "Playwright (Slower, supports JavaScript)"
+ }],
+ initialValue: "http"
+ }) }, { onCancel: () => {
  p.cancel("Operation cancelled.");
  process.exit(0);
  } });
- const advancedOptions = await p.group({
- outputFormats: () => p.multiselect({
- message: "Select output formats:",
- options: [
- {
- value: "llms.txt",
- label: "llms.txt (basic format)",
- hint: "Recommended"
- },
- {
- value: "llms-full.txt",
- label: "llms-full.txt (extended format)"
- },
- {
- value: "markdown",
- label: "Individual Markdown files"
- }
- ],
- initialValues: [
- "llms.txt",
- "llms-full.txt",
- "markdown"
- ]
- }),
- verbose: () => p.confirm({
- message: "Enable verbose logging?",
- initialValue: false
- })
- }, { onCancel: () => {
+ const advancedOptions = await p.group({ outputFormats: () => p.multiselect({
+ message: "Select output formats:",
+ options: [
+ {
+ value: "llms.txt",
+ label: "llms.txt (basic format)",
+ hint: "Recommended"
+ },
+ {
+ value: "llms-full.txt",
+ label: "llms-full.txt (extended format)"
+ },
+ {
+ value: "markdown",
+ label: "Individual Markdown files"
+ }
+ ],
+ initialValues: [
+ "llms.txt",
+ "llms-full.txt",
+ "markdown"
+ ]
+ }) }, { onCancel: () => {
  p.cancel("Operation cancelled.");
  process.exit(0);
  } });
@@ -189,34 +204,25 @@ async function interactiveCrawl() {
  `Output: ${outputDir}`,
  `Driver: ${crawlerOptions.driver}`,
  `Max pages: Unlimited`,
- `Follow links: Yes (depth ${crawlerOptions.maxDepth})`,
+ `Follow links: Yes (depth 3)`,
  `Output formats: ${outputFormats.join(", ")}`,
  `Sitemap discovery: Automatic`,
- inferredOrigin && `Origin: ${inferredOrigin}`,
- advancedOptions.verbose && `Verbose logging: Enabled`
+ inferredOrigin && `Origin: ${inferredOrigin}`
  ].filter(Boolean);
  p.note(summary.join("\n"), "Crawl Configuration");
- const shouldProceed = await p.confirm({
- message: "Start crawling?",
- initialValue: true
- });
- if (p.isCancel(shouldProceed) || !shouldProceed) {
- p.cancel("Crawl cancelled.");
- return null;
- }
  return {
  urls,
  outputDir: resolve(outputDir),
  driver: crawlerOptions.driver,
  maxRequestsPerCrawl: Number.MAX_SAFE_INTEGER,
  followLinks: true,
- maxDepth: Number.parseInt(crawlerOptions.maxDepth),
  generateLlmsTxt: advancedOptions.outputFormats.includes("llms.txt"),
  generateLlmsFullTxt: advancedOptions.outputFormats.includes("llms-full.txt"),
  generateIndividualMd: advancedOptions.outputFormats.includes("markdown"),
  origin: inferredOrigin,
  globPatterns,
- verbose: advancedOptions.verbose
+ verbose: false,
+ maxDepth: 3
  };
  }
  async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds) {
@@ -413,6 +419,12 @@ async function main() {
  p.note(summary.join("\n"), "Configuration");
  } else options = await interactiveCrawl();
  if (!options) process.exit(0);
+ const permCheck = checkOutputDirectoryPermissions(options.outputDir);
+ if (!permCheck.success) {
+ p.log.error(permCheck.error);
+ if (permCheck.error?.includes("Permission denied")) p.log.info("Tip: Try running with elevated privileges (e.g., sudo) or change the output directory permissions.");
+ process.exit(1);
+ }
  if (options.driver === "playwright") {
  const chromeSupported = await isUseChromeSupported();
  if (chromeSupported) {
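Net effect of the cli.mjs changes: the interactive flow drops the crawl-depth prompt, the verbose prompt, and the final "Start crawling?" confirmation (depth is now fixed at 3 and verbose at false), while main() gains a fail-fast permission check that logs the error, prints a sudo/permissions tip when the failure looks like EACCES, and exits with code 1. As a reading aid only, here is a sketch of the options object interactiveCrawl() now resolves to; the URL, path, and format flags are illustrative placeholders, the shape follows the return statement in the diff.

// Sketch of the interactive-mode options (illustrative values marked below).
const options = {
  urls: ["https://example.com"],                // illustrative
  outputDir: "/path/to/output",                 // resolve("output") at runtime; illustrative here
  driver: "http",                               // or "playwright", per the prompt
  maxRequestsPerCrawl: Number.MAX_SAFE_INTEGER,
  followLinks: true,
  generateLlmsTxt: true,                        // depends on the multiselect answer
  generateLlmsFullTxt: true,
  generateIndividualMd: true,
  origin: "https://example.com",                // illustrative
  globPatterns: [],                             // illustrative
  verbose: false,                               // verbose prompt removed; always false interactively
  maxDepth: 3                                   // depth prompt removed; fixed at 3
};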
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@mdream/crawl",
  "type": "module",
- "version": "0.8.5",
+ "version": "0.9.0",
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
  "author": {
  "name": "Harlan Wilton",
@@ -46,12 +46,12 @@
  },
  "dependencies": {
  "@clack/prompts": "^0.11.0",
- "crawlee": "^3.13.10",
+ "crawlee": "^3.14.0",
  "nypm": "^0.6.0",
  "pathe": "^2.0.3",
  "picomatch": "^4.0.3",
  "ufo": "^1.6.1",
- "mdream": "0.8.5"
+ "mdream": "0.9.0"
  },
  "devDependencies": {
  "@types/picomatch": "^4.0.1"