@mdream/crawl 0.9.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -166,21 +166,56 @@ function extractMetadata(html, url) {
  //#endregion
  //#region src/crawl.ts
  async function loadSitemapWithoutRetries(sitemapUrl) {
- const response = await fetch(sitemapUrl);
- if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
- const xmlContent = await response.text();
- const urls = [];
- const urlRegex = /<loc>(.*?)<\/loc>/g;
- let match;
- while (true) {
- match = urlRegex.exec(xmlContent);
- if (match === null) break;
- urls.push(match[1]);
+ const controller = new AbortController();
+ const timeoutId = setTimeout(() => controller.abort(), 1e4);
+ try {
+ const response = await fetch(sitemapUrl, {
+ signal: controller.signal,
+ headers: { "User-Agent": "mdream-crawler/1.0" }
+ });
+ clearTimeout(timeoutId);
+ if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
+ const xmlContent = await response.text();
+ if (xmlContent.includes("<sitemapindex")) {
+ const sitemapIndexRegex = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
+ const childSitemaps = [];
+ let match;
+ while (true) {
+ match = sitemapIndexRegex.exec(xmlContent);
+ if (match === null) break;
+ let url = match[1];
+ if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
+ childSitemaps.push(url);
+ }
+ const allUrls = [];
+ for (const childSitemapUrl of childSitemaps) try {
+ const childUrls = await loadSitemapWithoutRetries(childSitemapUrl);
+ allUrls.push(...childUrls);
+ } catch (error) {
+ console.warn(`Failed to load child sitemap ${childSitemapUrl}:`, error instanceof Error ? error.message : "Unknown error");
+ }
+ return allUrls;
+ } else {
+ const urls = [];
+ const urlRegex = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
+ let match;
+ while (true) {
+ match = urlRegex.exec(xmlContent);
+ if (match === null) break;
+ let url = match[1];
+ if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
+ urls.push(url);
+ }
+ return urls;
+ }
+ } catch (error) {
+ clearTimeout(timeoutId);
+ if (error instanceof Error && error.name === "AbortError") throw new Error("Sitemap request timed out after 10 seconds");
+ throw error;
  }
- return urls;
  }
  async function crawlAndGenerate(options, onProgress) {
- const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false } = options;
+ const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false } = options;
  const outputDir = resolve(normalize(rawOutputDir));
  if (verbose) log.setLevel(log.LEVELS.INFO);
  else log.setLevel(log.LEVELS.OFF);
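
The rewritten loader above adds a 10-second abort timeout and a `mdream-crawler/1.0` User-Agent, recursively expands `<sitemapindex>` documents into their child sitemaps, and strips `<![CDATA[...]]>` wrappers from `<loc>` values. A minimal standalone sketch of just the parsing step, using the same regexes as the bundled code; the sample XML and helper name are illustrative, not part of the package API:

```ts
// Sketch of the <loc> extraction and CDATA stripping seen in the diff above.
function extractLocs(xml: string, entryRegex: RegExp): string[] {
  const locs: string[] = []
  let match: RegExpExecArray | null
  while ((match = entryRegex.exec(xml)) !== null) {
    let url = match[1]
    // Drop an optional CDATA wrapper, keeping only the URL.
    if (url.startsWith('<![CDATA[') && url.endsWith(']]>'))
      url = url.slice(9, -3)
    locs.push(url)
  }
  return locs
}

// Hypothetical sitemap index: its <loc> entries point at child sitemaps,
// each of which the real loader would fetch and parse recursively.
const sampleIndex = '<sitemapindex><sitemap><loc><![CDATA[https://example.com/sitemap-posts.xml]]></loc></sitemap></sitemapindex>'
const childSitemaps = sampleIndex.includes('<sitemapindex')
  ? extractLocs(sampleIndex, /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs)
  : extractLocs(sampleIndex, /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs)
// childSitemaps === ['https://example.com/sitemap-posts.xml']
```

For a plain `<urlset>` document the second regex is used instead, and the extracted `<loc>` values are returned directly as page URLs.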
@@ -205,13 +240,25 @@ async function crawlAndGenerate(options, onProgress) {
  generation: { status: "idle" }
  };
  const sitemapAttempts = [];
- if (startingUrls.length > 0) {
+ if (startingUrls.length > 0 && !skipSitemap) {
  const baseUrl = new URL(startingUrls[0]).origin;
  const homePageUrl = baseUrl;
  onProgress?.(progress);
  const robotsUrl = new URL("/robots.txt", baseUrl).toString();
- const robotsResponse = await fetch(robotsUrl);
- if (robotsResponse.ok) {
+ const robotsController = new AbortController();
+ const robotsTimeoutId = setTimeout(() => robotsController.abort(), 1e4);
+ let robotsResponse;
+ try {
+ robotsResponse = await fetch(robotsUrl, {
+ signal: robotsController.signal,
+ headers: { "User-Agent": "mdream-crawler/1.0" }
+ });
+ clearTimeout(robotsTimeoutId);
+ } catch (error) {
+ clearTimeout(robotsTimeoutId);
+ robotsResponse = null;
+ }
+ if (robotsResponse?.ok) {
  const robotsContent = await robotsResponse.text();
  const sitemapMatches = robotsContent.match(/Sitemap:\s*(.*)/gi);
  if (sitemapMatches && sitemapMatches.length > 0) {
@@ -348,6 +395,12 @@ async function crawlAndGenerate(options, onProgress) {
  progress.sitemap.status = "completed";
  progress.crawling.total = startingUrls.length;
  onProgress?.(progress);
+ } else if (skipSitemap && startingUrls.length > 0) {
+ progress.sitemap.status = "completed";
+ progress.sitemap.found = 0;
+ progress.sitemap.processed = 0;
+ progress.crawling.total = startingUrls.length;
+ onProgress?.(progress);
  }
  if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });
  const results = [];
@@ -428,11 +481,46 @@ async function crawlAndGenerate(options, onProgress) {
  let crawler;
  const crawlerOptions = {
  requestHandler: createRequestHandler(driver),
- errorHandler: async ({ request, response }) => {
- if (response?.statusCode && response?.statusCode >= 400) request.noRetry = true;
+ errorHandler: async ({ request, response, error }) => {
+ if (verbose) console.error(`[ERROR] URL: ${request.url}, Status: ${response?.statusCode || "N/A"}, Error: ${error?.message || "Unknown"}`);
+ if (response?.statusCode && response?.statusCode >= 400) {
+ request.noRetry = true;
+ const result = {
+ url: request.url,
+ title: "",
+ content: "",
+ timestamp: Date.now(),
+ success: false,
+ error: `HTTP ${response.statusCode}`,
+ metadata: {
+ title: "",
+ description: "",
+ links: []
+ },
+ depth: request.userData?.depth || 0
+ };
+ results.push(result);
+ } else if (error) {
+ request.noRetry = true;
+ const result = {
+ url: request.url,
+ title: "",
+ content: "",
+ timestamp: Date.now(),
+ success: false,
+ error: error.message || "Unknown error",
+ metadata: {
+ title: "",
+ description: "",
+ links: []
+ },
+ depth: request.userData?.depth || 0
+ };
+ results.push(result);
+ }
  },
  maxRequestsPerCrawl,
- respectRobotsTxtFile: true
+ respectRobotsTxtFile: false
  };
  if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
  if (driver === "playwright") {
@@ -450,7 +538,15 @@ async function crawlAndGenerate(options, onProgress) {
  progress.crawling.status = "processing";
  progress.crawling.total = startingUrls.length;
  onProgress?.(progress);
- await crawler.run(initialRequests);
+ try {
+ await crawler.run(initialRequests);
+ } catch (error) {
+ if (verbose) {
+ console.error(`[CRAWLER ERROR] ${error instanceof Error ? error.message : "Unknown error"}`);
+ console.error(`[CRAWLER ERROR] Stack trace:`, error instanceof Error ? error.stack : "No stack trace");
+ }
+ throw error;
+ }
  progress.crawling.status = "completed";
  onProgress?.(progress);
  if (results.some((r) => r.success)) {
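
Unlike 0.9.1, where the error handler only disabled retries, failed requests are now pushed onto `results` so they show up in the final report instead of disappearing. The record shape below is inferred from the fields assigned in the handler above; the interface name and exact types are illustrative and not exported by `@mdream/crawl`:

```ts
// Inferred from the errorHandler in the diff; not a published type.
interface CrawledPageResult {
  url: string
  title: string
  content: string
  timestamp: number
  success: boolean
  error?: string // e.g. "HTTP 404" or a network error message
  metadata: { title: string, description: string, links: string[] }
  depth: number
}

// What the handler records for a request that returned a 404.
const failed: CrawledPageResult = {
  url: 'https://example.com/missing',
  title: '',
  content: '',
  timestamp: Date.now(),
  success: false,
  error: 'HTTP 404',
  metadata: { title: '', description: '', links: [] },
  depth: 0,
}
```

Note also that `respectRobotsTxtFile` is now `false`: robots.txt is still fetched during the package's own sitemap discovery step, but crawlee itself no longer enforces it when processing requests.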
package/dist/cli.mjs CHANGED
@@ -1,4 +1,4 @@
- import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-BtuYX2_u.mjs";
+ import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-D8WIR9L5.mjs";
  import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
  import * as p$1 from "@clack/prompts";
  import * as p from "@clack/prompts";
@@ -156,29 +156,35 @@ async function interactiveCrawl() {
  p.cancel("Operation cancelled.");
  process.exit(0);
  } });
- const advancedOptions = await p.group({ outputFormats: () => p.multiselect({
- message: "Select output formats:",
- options: [
- {
- value: "llms.txt",
- label: "llms.txt (basic format)",
- hint: "Recommended"
- },
- {
- value: "llms-full.txt",
- label: "llms-full.txt (extended format)"
- },
- {
- value: "markdown",
- label: "Individual Markdown files"
- }
- ],
- initialValues: [
- "llms.txt",
- "llms-full.txt",
- "markdown"
- ]
- }) }, { onCancel: () => {
+ const advancedOptions = await p.group({
+ outputFormats: () => p.multiselect({
+ message: "Select output formats:",
+ options: [
+ {
+ value: "llms.txt",
+ label: "llms.txt (basic format)",
+ hint: "Recommended"
+ },
+ {
+ value: "llms-full.txt",
+ label: "llms-full.txt (extended format)"
+ },
+ {
+ value: "markdown",
+ label: "Individual Markdown files"
+ }
+ ],
+ initialValues: [
+ "llms.txt",
+ "llms-full.txt",
+ "markdown"
+ ]
+ }),
+ skipSitemap: () => p.confirm({
+ message: "Skip sitemap.xml and robots.txt discovery?",
+ initialValue: false
+ })
+ }, { onCancel: () => {
  p.cancel("Operation cancelled.");
  process.exit(0);
  } });
@@ -206,10 +212,11 @@ async function interactiveCrawl() {
  `Max pages: Unlimited`,
  `Follow links: Yes (depth 3)`,
  `Output formats: ${outputFormats.join(", ")}`,
- `Sitemap discovery: Automatic`,
+ `Sitemap discovery: ${advancedOptions.skipSitemap ? "Skipped" : "Automatic"}`,
  inferredOrigin && `Origin: ${inferredOrigin}`
  ].filter(Boolean);
  p.note(summary.join("\n"), "Crawl Configuration");
+ if (advancedOptions.skipSitemap && globPatterns.some((p$2) => p$2.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
  return {
  urls,
  outputDir: resolve(outputDir),
@@ -222,7 +229,8 @@ async function interactiveCrawl() {
  origin: inferredOrigin,
  globPatterns,
  verbose: false,
- maxDepth: 3
+ maxDepth: 3,
+ skipSitemap: advancedOptions.skipSitemap
  };
  }
  async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds) {
@@ -258,17 +266,19 @@ Options:
  --max-pages <number> Maximum pages to crawl (default: unlimited)
  --crawl-delay <seconds> Crawl delay in seconds
  --exclude <pattern> Exclude URLs matching glob patterns (can be used multiple times)
+ --skip-sitemap Skip sitemap.xml and robots.txt discovery
  -v, --verbose Enable verbose logging
  -h, --help Show this help message
  --version Show version number

- Note: Sitemap discovery and robots.txt checking are automatic
+ Note: Sitemap discovery and robots.txt checking are automatic unless --skip-sitemap is used.

  Examples:
  @mdream/crawl -u harlanzw.com --artifacts "llms.txt,markdown"
  @mdream/crawl --url https://docs.example.com --depth 2 --artifacts "llms-full.txt"
  @mdream/crawl -u example.com --exclude "*/admin/*" --exclude "*/api/*"
  @mdream/crawl -u example.com --verbose
+ @mdream/crawl -u example.com --skip-sitemap
  `);
  process.exit(0);
  }
@@ -378,6 +388,8 @@ Examples:
  const descriptionOverride = getArgValue("--description");
  const patterns = [parsed];
  const verbose = args.includes("--verbose") || args.includes("-v");
+ const skipSitemap = args.includes("--skip-sitemap");
+ if (skipSitemap && parsed.isGlob) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
  return {
  urls: [url],
  outputDir: resolve(getArgValue("--output") || getArgValue("-o") || "output"),
@@ -394,7 +406,8 @@ Examples:
  globPatterns: patterns,
  crawlDelay: crawlDelayStr ? Number.parseInt(crawlDelayStr) : void 0,
  exclude: excludePatterns.length > 0 ? excludePatterns : void 0,
- verbose
+ verbose,
+ skipSitemap
  };
  }
  async function main() {
@@ -414,6 +427,7 @@ async function main() {
  `Depth: ${options.maxDepth}`,
  `Formats: ${formats.join(", ")}`,
  options.exclude && options.exclude.length > 0 && `Exclude: ${options.exclude.join(", ")}`,
+ options.skipSitemap && `Skip sitemap: Yes`,
  options.verbose && `Verbose: Enabled`
  ].filter(Boolean);
  p.note(summary.join("\n"), "Configuration");
package/dist/index.d.mts CHANGED
@@ -18,6 +18,7 @@ interface CrawlOptions {
  siteNameOverride?: string;
  descriptionOverride?: string;
  verbose?: boolean;
+ skipSitemap?: boolean;
  }
  interface ParsedUrlPattern {
  baseUrl: string;
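
The new `skipSitemap` flag is also exposed on `CrawlOptions` for programmatic use. A hedged usage sketch, assuming `crawlAndGenerate` is re-exported from the package entry (as `dist/index.mjs` below suggests) and using only option and progress fields that appear in this diff:

```ts
import { crawlAndGenerate } from '@mdream/crawl'

await crawlAndGenerate({
  urls: ['https://example.com'],
  outputDir: './output',
  generateLlmsTxt: true,
  maxDepth: 2,
  // New in 0.10.0: bypass sitemap.xml and robots.txt discovery entirely.
  skipSitemap: true,
}, (progress) => {
  // progress.sitemap and progress.crawling status fields are set in the diff above.
  console.log(progress.crawling.status, progress.sitemap.status)
})
```

With `skipSitemap: true`, the sitemap phase is immediately marked completed with zero URLs found and the crawl proceeds from the starting URLs alone, which is why the CLI warns that glob URL patterns may not discover all matching pages.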
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
- import { crawlAndGenerate } from "./_chunks/crawl-BtuYX2_u.mjs";
+ import { crawlAndGenerate } from "./_chunks/crawl-D8WIR9L5.mjs";
  import { writeFile } from "node:fs/promises";
  import { basename, sep } from "pathe";
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@mdream/crawl",
  "type": "module",
- "version": "0.9.1",
+ "version": "0.10.0",
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
  "author": {
  "name": "Harlan Wilton",
@@ -46,15 +46,15 @@
  },
  "dependencies": {
  "@clack/prompts": "^0.11.0",
- "crawlee": "^3.14.0",
- "nypm": "^0.6.0",
+ "crawlee": "^3.14.1",
+ "nypm": "^0.6.1",
  "pathe": "^2.0.3",
  "picomatch": "^4.0.3",
  "ufo": "^1.6.1",
- "mdream": "0.9.1"
+ "mdream": "0.10.0"
  },
  "devDependencies": {
- "@types/picomatch": "^4.0.1"
+ "@types/picomatch": "^4.0.2"
  },
  "scripts": {
  "build": "obuild",