@mdream/crawl 0.8.4 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,24 +5,10 @@ import { HttpCrawler, PlaywrightCrawler, log, purgeDefaultStorages } from "crawl
 import { generateLlmsTxtArtifacts, htmlToMarkdown } from "mdream";
 import { withMinimalPreset } from "mdream/preset/minimal";
 import { dirname, join, normalize, resolve } from "pathe";
+import { withHttps } from "ufo";
 import picomatch from "picomatch";
 import { extractionPlugin } from "mdream/plugins";
 
-//#region ../../node_modules/.pnpm/ufo@1.6.1/node_modules/ufo/dist/index.mjs
-const r = String.fromCharCode;
-const PROTOCOL_REGEX = /^[\s\w\0+.-]{2,}:([/\\]{2})?/;
-function withHttps(input) {
-	return withProtocol(input, "https://");
-}
-function withProtocol(input, protocol) {
-	let match = input.match(PROTOCOL_REGEX);
-	if (!match) match = input.match(/^\/{2,}/);
-	if (!match) return protocol + input;
-	return protocol + input.slice(match[0].length);
-}
-const protocolRelative = Symbol.for("ufo:protocolRelative");
-
-//#endregion
 //#region src/glob-utils.ts
 /**
 * Parse a URL that may contain glob patterns
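The hunk above removes an inlined copy of ufo's URL helpers in favor of a direct import. For reference, withHttps forces an https:// protocol onto a URL-ish string; the expected behavior below is illustrative, inferred from the inlined ufo@1.6.1 code removed above:

    import { withHttps } from "ufo";

    withHttps("example.com");        // "https://example.com"
    withHttps("http://example.com"); // "https://example.com" (existing protocol replaced)
    withHttps("//cdn.example.com");  // "https://cdn.example.com" (protocol-relative input)
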
@@ -467,14 +453,14 @@ async function crawlAndGenerate(options, onProgress) {
 	await crawler.run(initialRequests);
 	progress.crawling.status = "completed";
 	onProgress?.(progress);
-	if (results.some((r$1) => r$1.success)) {
+	if (results.some((r) => r.success)) {
 		progress.generation.status = "generating";
 		onProgress?.(progress);
-		const successfulResults = results.filter((r$1) => r$1.success);
+		const successfulResults = results.filter((r) => r.success);
 		const firstUrl = new URL(withHttps(urls[0]));
 		const origin$1 = firstUrl.origin;
-		const homePageResult = successfulResults.find((r$1) => {
-			const resultUrl = new URL(withHttps(r$1.url));
+		const homePageResult = successfulResults.find((r) => {
+			const resultUrl = new URL(withHttps(r.url));
 			return resultUrl.href === origin$1 || resultUrl.href === `${origin$1}/`;
 		});
 		const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
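The home-page lookup in this hunk leans on WHATWG URL normalization: the origin of a URL carries no trailing slash, while the href of a bare origin gains one, which is why both forms are compared. An illustrative sketch:

    const origin = new URL(withHttps("example.com")).origin; // "https://example.com"
    const href = new URL(withHttps("example.com")).href;     // "https://example.com/"
    // a crawled result counts as the home page when its normalized
    // href equals either origin or `${origin}/`
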
@@ -528,4 +514,4 @@ async function crawlAndGenerate(options, onProgress) {
 }
 
 //#endregion
-export { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps };
+export { crawlAndGenerate, parseUrlPattern, validateGlobPattern };
package/dist/cli.mjs CHANGED
@@ -1,9 +1,10 @@
-import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-DYXGzu7W.mjs";
-import { readFileSync } from "node:fs";
+import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-BtuYX2_u.mjs";
+import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
 import * as p$1 from "@clack/prompts";
 import * as p from "@clack/prompts";
 import { PlaywrightCrawler } from "crawlee";
 import { dirname, join, resolve } from "pathe";
+import { withHttps } from "ufo";
 import { fileURLToPath } from "node:url";
 import { addDependency } from "nypm";
 
@@ -70,6 +71,38 @@ const __dirname = dirname(fileURLToPath(import.meta.url));
 const packageJsonPath = join(__dirname, "..", "package.json");
 const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
 const version = packageJson.version;
+function checkOutputDirectoryPermissions(outputDir) {
+	try {
+		mkdirSync(outputDir, { recursive: true });
+		accessSync(outputDir, constants.W_OK);
+		const testFile = join(outputDir, ".mdream-test");
+		try {
+			writeFileSync(testFile, "test");
+			unlinkSync(testFile);
+		} catch (err) {
+			return {
+				success: false,
+				error: `Cannot write to output directory: ${err instanceof Error ? err.message : "Unknown error"}`
+			};
+		}
+		return { success: true };
+	} catch (err) {
+		if (err instanceof Error) {
+			if (err.message.includes("EACCES")) return {
+				success: false,
+				error: `Permission denied: Cannot write to output directory '${outputDir}'. Please check permissions or run with appropriate privileges.`
+			};
+			return {
+				success: false,
+				error: `Failed to access output directory: ${err.message}`
+			};
+		}
+		return {
+			success: false,
+			error: "Failed to access output directory"
+		};
+	}
+}
 async function interactiveCrawl() {
 	console.clear();
 	p.intro(`☁️ @mdream/crawl v${version}`);
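The new checkOutputDirectoryPermissions helper does not trust accessSync alone: it also performs a real write-and-delete probe with a temporary .mdream-test file, and reports failures through a { success, error } result instead of throwing. A hypothetical standalone use of that contract (the CLI's actual call site appears in a later hunk in main()):

    // Illustrative only; mirrors how main() consumes the result further below
    const check = checkOutputDirectoryPermissions(resolve("output"));
    if (!check.success) {
      console.error(check.error);
      process.exit(1);
    }
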
@@ -108,61 +141,44 @@ async function interactiveCrawl() {
 		return null;
 	}
 	const outputDir = "output";
-	const crawlerOptions = await p.group({
-		driver: () => p.select({
-			message: "Select crawler driver:",
-			options: [{
-				value: "http",
-				label: "HTTP Crawler (Fast, for static content)",
-				hint: "Recommended"
-			}, {
-				value: "playwright",
-				label: "Playwright (Slower, supports JavaScript)"
-			}],
-			initialValue: "http"
-		}),
-		maxDepth: () => p.text({
-			message: "Clicks to page (crawl depth):",
-			placeholder: "3",
-			defaultValue: "3",
-			validate: (value) => {
-				const num = Number.parseInt(value);
-				if (Number.isNaN(num) || num < 1 || num > 10) return "Depth must be between 1 and 10";
-			}
-		})
-	}, { onCancel: () => {
+	const crawlerOptions = await p.group({ driver: () => p.select({
+		message: "Select crawler driver:",
+		options: [{
+			value: "http",
+			label: "HTTP Crawler (Fast, for static content)",
+			hint: "Recommended"
+		}, {
+			value: "playwright",
+			label: "Playwright (Slower, supports JavaScript)"
+		}],
+		initialValue: "http"
+	}) }, { onCancel: () => {
 		p.cancel("Operation cancelled.");
 		process.exit(0);
 	} });
-	const advancedOptions = await p.group({
-		outputFormats: () => p.multiselect({
-			message: "Select output formats:",
-			options: [
-				{
-					value: "llms.txt",
-					label: "llms.txt (basic format)",
-					hint: "Recommended"
-				},
-				{
-					value: "llms-full.txt",
-					label: "llms-full.txt (extended format)"
-				},
-				{
-					value: "markdown",
-					label: "Individual Markdown files"
-				}
-			],
-			initialValues: [
-				"llms.txt",
-				"llms-full.txt",
-				"markdown"
-			]
-		}),
-		verbose: () => p.confirm({
-			message: "Enable verbose logging?",
-			initialValue: false
-		})
-	}, { onCancel: () => {
+	const advancedOptions = await p.group({ outputFormats: () => p.multiselect({
+		message: "Select output formats:",
+		options: [
+			{
+				value: "llms.txt",
+				label: "llms.txt (basic format)",
+				hint: "Recommended"
+			},
+			{
+				value: "llms-full.txt",
+				label: "llms-full.txt (extended format)"
+			},
+			{
+				value: "markdown",
+				label: "Individual Markdown files"
+			}
+		],
+		initialValues: [
+			"llms.txt",
+			"llms-full.txt",
+			"markdown"
+		]
+	}) }, { onCancel: () => {
 		p.cancel("Operation cancelled.");
 		process.exit(0);
 	} });
@@ -188,34 +204,25 @@ async function interactiveCrawl() {
 		`Output: ${outputDir}`,
 		`Driver: ${crawlerOptions.driver}`,
 		`Max pages: Unlimited`,
-		`Follow links: Yes (depth ${crawlerOptions.maxDepth})`,
+		`Follow links: Yes (depth 3)`,
 		`Output formats: ${outputFormats.join(", ")}`,
 		`Sitemap discovery: Automatic`,
-		inferredOrigin && `Origin: ${inferredOrigin}`,
-		advancedOptions.verbose && `Verbose logging: Enabled`
+		inferredOrigin && `Origin: ${inferredOrigin}`
 	].filter(Boolean);
 	p.note(summary.join("\n"), "Crawl Configuration");
-	const shouldProceed = await p.confirm({
-		message: "Start crawling?",
-		initialValue: true
-	});
-	if (p.isCancel(shouldProceed) || !shouldProceed) {
-		p.cancel("Crawl cancelled.");
-		return null;
-	}
 	return {
 		urls,
 		outputDir: resolve(outputDir),
 		driver: crawlerOptions.driver,
 		maxRequestsPerCrawl: Number.MAX_SAFE_INTEGER,
 		followLinks: true,
-		maxDepth: Number.parseInt(crawlerOptions.maxDepth),
 		generateLlmsTxt: advancedOptions.outputFormats.includes("llms.txt"),
 		generateLlmsFullTxt: advancedOptions.outputFormats.includes("llms-full.txt"),
 		generateIndividualMd: advancedOptions.outputFormats.includes("markdown"),
 		origin: inferredOrigin,
 		globPatterns,
-		verbose: advancedOptions.verbose
+		verbose: false,
+		maxDepth: 3
 	};
 }
 async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds) {
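With the maxDepth and verbose prompts gone, the interactive flow now pins verbose: false and maxDepth: 3. Callers who need different values can still pass them programmatically; the sketch below uses only the option names visible in the returned object above, and assumes the package entry point re-exports crawlAndGenerate (dist/index.mjs imports it from the shared chunk):

    // Hedged sketch: option names taken from interactiveCrawl's return value
    import { crawlAndGenerate } from "@mdream/crawl";

    await crawlAndGenerate({
      urls: ["https://example.com"],
      outputDir: "output",
      driver: "http",
      followLinks: true,
      maxDepth: 5, // not limited to the interactive default of 3
      generateLlmsTxt: true,
      generateLlmsFullTxt: true,
      generateIndividualMd: false,
      verbose: true
    });
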
@@ -412,6 +419,12 @@ async function main() {
 		p.note(summary.join("\n"), "Configuration");
 	} else options = await interactiveCrawl();
 	if (!options) process.exit(0);
+	const permCheck = checkOutputDirectoryPermissions(options.outputDir);
+	if (!permCheck.success) {
+		p.log.error(permCheck.error);
+		if (permCheck.error?.includes("Permission denied")) p.log.info("Tip: Try running with elevated privileges (e.g., sudo) or change the output directory permissions.");
+		process.exit(1);
+	}
 	if (options.driver === "playwright") {
 		const chromeSupported = await isUseChromeSupported();
 		if (chromeSupported) {
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
-import { crawlAndGenerate } from "./_chunks/crawl-DYXGzu7W.mjs";
+import { crawlAndGenerate } from "./_chunks/crawl-BtuYX2_u.mjs";
 import { writeFile } from "node:fs/promises";
 import { basename, sep } from "pathe";
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@mdream/crawl",
   "type": "module",
-  "version": "0.8.4",
+  "version": "0.9.0",
   "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
   "author": {
     "name": "Harlan Wilton",
@@ -46,11 +46,12 @@
   },
   "dependencies": {
     "@clack/prompts": "^0.11.0",
-    "crawlee": "^3.13.10",
+    "crawlee": "^3.14.0",
     "nypm": "^0.6.0",
     "pathe": "^2.0.3",
     "picomatch": "^4.0.3",
-    "mdream": "0.8.4"
+    "ufo": "^1.6.1",
+    "mdream": "0.9.0"
   },
   "devDependencies": {
     "@types/picomatch": "^4.0.1"