@mdream/crawl 0.7.1 → 0.7.2

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  import { existsSync, mkdirSync } from "node:fs";
2
2
  import { writeFile } from "node:fs/promises";
3
- import { HttpCrawler, Sitemap, purgeDefaultStorages } from "crawlee";
3
+ import { HttpCrawler, PlaywrightCrawler, Sitemap, purgeDefaultStorages } from "crawlee";
4
4
  import { generateLlmsTxtArtifacts, htmlToMarkdown } from "mdream";
5
5
  import { withMinimalPreset } from "mdream/preset/minimal";
6
6
  import { dirname, join, normalize, resolve } from "pathe";
@@ -321,8 +321,7 @@ async function crawlAndGenerate(options, onProgress) {
321
321
  const startTime = Date.now();
322
322
  progress.crawling.currentUrl = request.loadedUrl;
323
323
  onProgress?.(progress);
324
- const baseUrl = new URL(startingUrls[0]).origin;
325
- const homePageUrl = baseUrl;
324
+ const homePageUrl = new URL(startingUrls[0]).origin;
326
325
  let html;
327
326
  let title;
328
327
  if (crawlerType === "playwright") {
@@ -385,14 +384,15 @@ async function crawlAndGenerate(options, onProgress) {
385
384
  let crawler;
386
385
  const crawlerOptions = {
387
386
  requestHandler: createRequestHandler(driver),
387
+ errorHandler: async ({ request, response }, error) => {
388
+ if (response?.statusCode && response?.statusCode >= 400) request.noRetry = true;
389
+ },
388
390
  maxRequestsPerCrawl,
389
391
  respectRobotsTxtFile: true
390
392
  };
391
- if (crawlDelay) crawlerOptions.requestHandlerTimeoutMillis = crawlDelay * 1e3;
392
- if (driver === "playwright") {
393
- const { PlaywrightCrawler: PlaywrightCrawlerClass } = await import("crawlee");
394
- crawler = new PlaywrightCrawlerClass(crawlerOptions);
395
- } else crawler = new HttpCrawler(crawlerOptions);
393
+ if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
394
+ if (driver === "playwright") crawler = new PlaywrightCrawler(crawlerOptions);
395
+ else crawler = new HttpCrawler(crawlerOptions);
396
396
  const initialRequests = startingUrls.map((url) => ({
397
397
  url,
398
398
  userData: { depth: 0 }
package/dist/cli.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-B5MaCj6O.mjs";
1
+ import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-DEysrw0h.mjs";
2
2
  import { readFileSync } from "node:fs";
3
3
  import { dirname, join, resolve } from "pathe";
4
4
  import { fileURLToPath } from "node:url";
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { crawlAndGenerate } from "./_chunks/crawl-B5MaCj6O.mjs";
1
+ import { crawlAndGenerate } from "./_chunks/crawl-DEysrw0h.mjs";
2
2
  import { writeFile } from "node:fs/promises";
3
3
  import { basename, sep } from "pathe";
4
4
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@mdream/crawl",
3
3
  "type": "module",
4
- "version": "0.7.1",
4
+ "version": "0.7.2",
5
5
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -46,11 +46,11 @@
46
46
  },
47
47
  "dependencies": {
48
48
  "@clack/prompts": "^0.11.0",
49
- "crawlee": "^3.13.9",
49
+ "crawlee": "^3.13.10",
50
50
  "nypm": "^0.6.0",
51
51
  "pathe": "^2.0.3",
52
52
  "picomatch": "^4.0.2",
53
- "mdream": "0.7.1"
53
+ "mdream": "0.7.2"
54
54
  },
55
55
  "devDependencies": {
56
56
  "@types/picomatch": "^4.0.0"