@mdream/crawl 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,10 @@
  import { existsSync, mkdirSync } from "node:fs";
  import { writeFile } from "node:fs/promises";
- import { HttpCrawler, Sitemap, purgeDefaultStorages } from "crawlee";
+ import { HttpCrawler, PlaywrightCrawler, Sitemap, purgeDefaultStorages } from "crawlee";
  import { generateLlmsTxtArtifacts, htmlToMarkdown } from "mdream";
  import { withMinimalPreset } from "mdream/preset/minimal";
  import { dirname, join, normalize, resolve } from "pathe";
- import { minimatch } from "minimatch";
+ import picomatch from "picomatch";
  import { extractionPlugin } from "mdream/plugins";

  //#region ../../node_modules/.pnpm/ufo@1.6.1/node_modules/ufo/dist/index.mjs
@@ -60,7 +60,12 @@ function matchesGlobPattern(url, parsedPattern) {
  const urlPath = urlObj.pathname + urlObj.search + urlObj.hash;
  const urlBase = `${urlObj.protocol}//${urlObj.host}`;
  if (urlBase !== parsedPattern.baseUrl) return false;
- return minimatch(urlPath, parsedPattern.pattern);
+ let pattern = parsedPattern.pattern;
+ if (pattern.endsWith("*") && !pattern.endsWith("**") && !pattern.endsWith("/*")) {
+ const base = pattern.slice(0, -1);
+ pattern = `{${base},${base}/**}`;
+ }
+ return picomatch(pattern)(urlPath);
  } catch {
  return false;
  }
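The new matcher compiles each pattern with picomatch and widens a bare trailing `*` (e.g. `/docs*`) into a brace group so it also covers nested paths. A minimal standalone sketch of that behavior, assuming only the picomatch dependency; the matchPath helper name is illustrative and not part of the package:

import picomatch from "picomatch";

// Widen "/docs*" to "{/docs,/docs/**}" so the pattern matches the bare path
// and everything under it; a single "*" in picomatch does not cross "/".
function matchPath(pattern, urlPath) {
  if (pattern.endsWith("*") && !pattern.endsWith("**") && !pattern.endsWith("/*")) {
    const base = pattern.slice(0, -1);
    pattern = `{${base},${base}/**}`;
  }
  return picomatch(pattern)(urlPath);
}

matchPath("/docs*", "/docs");                 // true
matchPath("/docs*", "/docs/getting-started"); // true
matchPath("/docs/*", "/docs/a/b");            // false: "/*" stays a single-segment wildcard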
@@ -95,9 +100,9 @@ function isUrlExcluded(url, excludePatterns) {
  }
  if (pattern.startsWith("/")) {
  const adjustedPattern = pattern.endsWith("/*") ? pattern.replace("/*", "/**") : pattern;
- return minimatch(urlPath, adjustedPattern);
+ return picomatch(adjustedPattern)(urlPath);
  }
- return minimatch(urlPath, pattern) || minimatch(urlPath.substring(1), pattern);
+ return picomatch(pattern)(urlPath) || picomatch(pattern)(urlPath.substring(1));
  });
  } catch {
  return false;
@@ -237,6 +242,7 @@ async function crawlAndGenerate(options, onProgress) {
  }
  }
  }
+ let mainSitemapProcessed = false;
  try {
  const { urls: sitemapUrls } = await Sitemap.load(`${baseUrl}/sitemap.xml`);
  const hasGlobPatterns = patterns.some((p) => p.isGlob);
@@ -248,6 +254,7 @@ async function crawlAndGenerate(options, onProgress) {
  progress.sitemap.found = sitemapUrls.length;
  progress.sitemap.processed = filteredUrls.length;
  onProgress?.(progress);
+ mainSitemapProcessed = true;
  } else {
  const filteredUrls = sitemapUrls.filter((url) => {
  return !isUrlExcluded(url, exclude);
@@ -257,40 +264,43 @@ async function crawlAndGenerate(options, onProgress) {
  progress.sitemap.found = sitemapUrls.length;
  progress.sitemap.processed = filteredUrls.length;
  onProgress?.(progress);
+ mainSitemapProcessed = true;
  }
  }
  } catch {
- const commonSitemaps = [
- `${baseUrl}/sitemap_index.xml`,
- `${baseUrl}/sitemaps.xml`,
- `${baseUrl}/sitemap-index.xml`
- ];
- for (const sitemapUrl of commonSitemaps) try {
- const { urls: altUrls } = await Sitemap.load(sitemapUrl);
- const hasGlobPatterns = patterns.some((p) => p.isGlob);
- if (hasGlobPatterns) {
- const filteredUrls = altUrls.filter((url) => {
- return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
- });
- startingUrls = filteredUrls;
- progress.sitemap.found = altUrls.length;
- progress.sitemap.processed = filteredUrls.length;
- onProgress?.(progress);
- break;
- } else {
- const filteredUrls = altUrls.filter((url) => {
- return !isUrlExcluded(url, exclude);
- });
- if (filteredUrls.length > 0) {
+ if (!mainSitemapProcessed) {
+ const commonSitemaps = [
+ `${baseUrl}/sitemap_index.xml`,
+ `${baseUrl}/sitemaps.xml`,
+ `${baseUrl}/sitemap-index.xml`
+ ];
+ for (const sitemapUrl of commonSitemaps) try {
+ const { urls: altUrls } = await Sitemap.load(sitemapUrl);
+ const hasGlobPatterns = patterns.some((p) => p.isGlob);
+ if (hasGlobPatterns) {
+ const filteredUrls = altUrls.filter((url) => {
+ return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
+ });
  startingUrls = filteredUrls;
  progress.sitemap.found = altUrls.length;
  progress.sitemap.processed = filteredUrls.length;
  onProgress?.(progress);
  break;
+ } else {
+ const filteredUrls = altUrls.filter((url) => {
+ return !isUrlExcluded(url, exclude);
+ });
+ if (filteredUrls.length > 0) {
+ startingUrls = filteredUrls;
+ progress.sitemap.found = altUrls.length;
+ progress.sitemap.processed = filteredUrls.length;
+ onProgress?.(progress);
+ break;
+ }
  }
+ } catch {
+ continue;
  }
- } catch {
- continue;
  }
  }
  if (!startingUrls.includes(homePageUrl)) startingUrls.unshift(homePageUrl);
@@ -311,8 +321,7 @@ async function crawlAndGenerate(options, onProgress) {
  const startTime = Date.now();
  progress.crawling.currentUrl = request.loadedUrl;
  onProgress?.(progress);
- const baseUrl = new URL(startingUrls[0]).origin;
- const homePageUrl = baseUrl;
+ const homePageUrl = new URL(startingUrls[0]).origin;
  let html;
  let title;
  if (crawlerType === "playwright") {
@@ -375,14 +384,15 @@ async function crawlAndGenerate(options, onProgress) {
  let crawler;
  const crawlerOptions = {
  requestHandler: createRequestHandler(driver),
+ errorHandler: async ({ request, response }, error) => {
+ if (response?.statusCode && response?.statusCode >= 400) request.noRetry = true;
+ },
  maxRequestsPerCrawl,
  respectRobotsTxtFile: true
  };
- if (crawlDelay) crawlerOptions.requestHandlerTimeoutMillis = crawlDelay * 1e3;
- if (driver === "playwright") {
- const { PlaywrightCrawler: PlaywrightCrawlerClass } = await import("crawlee");
- crawler = new PlaywrightCrawlerClass(crawlerOptions);
- } else crawler = new HttpCrawler(crawlerOptions);
+ if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
+ if (driver === "playwright") crawler = new PlaywrightCrawler(crawlerOptions);
+ else crawler = new HttpCrawler(crawlerOptions);
  const initialRequests = startingUrls.map((url) => ({
  url,
  userData: { depth: 0 }
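A standalone sketch of how these crawler options fit together, using only pieces shown in the hunk above; the buildCrawler wrapper is illustrative and not part of the package:

import { HttpCrawler, PlaywrightCrawler } from "crawlee";

// Requests that fail with a 4xx/5xx response are marked noRetry so crawlee does
// not re-queue them; crawlDelay is applied as the request-handler timeout in
// seconds via requestHandlerTimeoutSecs.
function buildCrawler({ driver, crawlDelay, maxRequestsPerCrawl, requestHandler }) {
  const crawlerOptions = {
    requestHandler,
    errorHandler: async ({ request, response }) => {
      if (response?.statusCode && response.statusCode >= 400) request.noRetry = true;
    },
    maxRequestsPerCrawl,
    respectRobotsTxtFile: true
  };
  if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
  return driver === "playwright"
    ? new PlaywrightCrawler(crawlerOptions)
    : new HttpCrawler(crawlerOptions);
}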
package/dist/cli.mjs CHANGED
@@ -1,4 +1,4 @@
- import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-NJU1Dyc-.mjs";
+ import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-DEysrw0h.mjs";
  import { readFileSync } from "node:fs";
  import { dirname, join, resolve } from "pathe";
  import { fileURLToPath } from "node:url";
@@ -197,15 +197,16 @@ async function interactiveCrawl() {
  globPatterns
  };
  }
- async function showCrawlResults(successful, failed, outputDir, generatedFiles) {
+ async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationMinutes, remainingSeconds) {
  const messages = [];
- if (successful > 0) messages.push(`✅ ${successful} pages processed successfully`);
+ if (successful > 0) {
+ const durationStr = durationMinutes > 0 ? `${durationMinutes}m ${remainingSeconds}s` : `${remainingSeconds}s`;
+ messages.push(`✅ ${successful} pages processed in ${durationStr}`);
+ }
  if (failed > 0) messages.push(`❌ ${failed} pages failed`);
  if (generatedFiles.length > 0) messages.push(`📄 Generated: ${generatedFiles.join(", ")}`);
  messages.push(`📁 Output: ${outputDir}`);
- p.note(messages.join("\n"), "Crawl Results");
- if (successful > 0) p.outro("🎉 Crawling completed successfully!");
- else p.outro("❌ Crawling failed - no pages processed");
+ p.note(messages.join("\n"), "Completed: Results");
  }
  function parseCliArgs() {
  const args = process.argv.slice(2);
@@ -396,6 +397,7 @@ async function main() {
  }
  const s = p.spinner();
  s.start("Starting crawl...");
+ const startTime = Date.now();
  const results = await crawlAndGenerate(options, (progress) => {
  if (progress.sitemap.status === "discovering") s.message("Discovering sitemaps...");
  else if (progress.sitemap.status === "processing") s.message(`Processing sitemap... Found ${progress.sitemap.found} URLs`);
@@ -414,7 +416,12 @@ async function main() {
  s.message(current);
  }
  });
- s.stop("Crawl completed!");
+ s.stop();
+ const endTime = Date.now();
+ const durationMs = endTime - startTime;
+ const durationSeconds = Math.floor(durationMs / 1e3);
+ const durationMinutes = Math.floor(durationSeconds / 60);
+ const remainingSeconds = durationSeconds % 60;
  const successful = results.filter((r) => r.success).length;
  const failed = results.filter((r) => !r.success).length;
  const failedResults = results.filter((r) => !r.success);
@@ -435,19 +442,18 @@ async function main() {
  if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
  if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
  }
- if (!cliOptions) await showCrawlResults(successful, failed, options.outputDir, generatedFiles);
+ if (!cliOptions) await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationMinutes, remainingSeconds);
  else {
  const messages = [];
- if (successful > 0) messages.push(`✅ ${successful} pages processed`);
+ if (successful > 0) {
+ const durationStr = durationMinutes > 0 ? `${durationMinutes}m ${remainingSeconds}s` : `${remainingSeconds}s`;
+ messages.push(`✅ ${successful} pages processed in ${durationStr}`);
+ }
  if (failed > 0) messages.push(`❌ ${failed} pages failed`);
  if (generatedFiles.length > 0) messages.push(`📄 Generated: ${generatedFiles.join(", ")}`);
  messages.push(`📁 Output: ${options.outputDir}`);
- p.note(messages.join("\n"), "Results");
- if (successful > 0) p.outro("🎉 Crawling completed!");
- else {
- p.outro("❌ Crawling failed - no pages processed");
- process.exit(1);
- }
+ p.note(messages.join("\n"), "Completed: Results");
+ if (successful === 0) process.exit(1);
  }
  }
  main().catch((error) => {
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
- import { crawlAndGenerate } from "./_chunks/crawl-NJU1Dyc-.mjs";
+ import { crawlAndGenerate } from "./_chunks/crawl-DEysrw0h.mjs";
  import { writeFile } from "node:fs/promises";
  import { basename, sep } from "pathe";

package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@mdream/crawl",
  "type": "module",
- "version": "0.7.0",
+ "version": "0.7.2",
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
  "author": {
  "name": "Harlan Wilton",
@@ -46,11 +46,14 @@
  },
  "dependencies": {
  "@clack/prompts": "^0.11.0",
- "crawlee": "^3.13.9",
- "minimatch": "^10.0.3",
+ "crawlee": "^3.13.10",
  "nypm": "^0.6.0",
  "pathe": "^2.0.3",
- "mdream": "0.7.0"
+ "picomatch": "^4.0.2",
+ "mdream": "0.7.2"
+ },
+ "devDependencies": {
+ "@types/picomatch": "^4.0.0"
  },
  "scripts": {
  "build": "obuild",