@mdream/crawl 0.7.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { existsSync, mkdirSync } from "node:fs";
|
|
2
2
|
import { writeFile } from "node:fs/promises";
|
|
3
|
-
import { HttpCrawler, Sitemap, purgeDefaultStorages } from "crawlee";
|
|
3
|
+
import { HttpCrawler, PlaywrightCrawler, Sitemap, purgeDefaultStorages } from "crawlee";
|
|
4
4
|
import { generateLlmsTxtArtifacts, htmlToMarkdown } from "mdream";
|
|
5
5
|
import { withMinimalPreset } from "mdream/preset/minimal";
|
|
6
6
|
import { dirname, join, normalize, resolve } from "pathe";
|
|
@@ -321,8 +321,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
321
321
|
const startTime = Date.now();
|
|
322
322
|
progress.crawling.currentUrl = request.loadedUrl;
|
|
323
323
|
onProgress?.(progress);
|
|
324
|
-
const
|
|
325
|
-
const homePageUrl = baseUrl;
|
|
324
|
+
const homePageUrl = new URL(startingUrls[0]).origin;
|
|
326
325
|
let html;
|
|
327
326
|
let title;
|
|
328
327
|
if (crawlerType === "playwright") {
|
|
@@ -385,14 +384,15 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
385
384
|
let crawler;
|
|
386
385
|
const crawlerOptions = {
|
|
387
386
|
requestHandler: createRequestHandler(driver),
|
|
387
|
+
errorHandler: async ({ request, response }, error) => {
|
|
388
|
+
if (response?.statusCode && response?.statusCode >= 400) request.noRetry = true;
|
|
389
|
+
},
|
|
388
390
|
maxRequestsPerCrawl,
|
|
389
391
|
respectRobotsTxtFile: true
|
|
390
392
|
};
|
|
391
|
-
if (crawlDelay) crawlerOptions.
|
|
392
|
-
if (driver === "playwright")
|
|
393
|
-
|
|
394
|
-
crawler = new PlaywrightCrawlerClass(crawlerOptions);
|
|
395
|
-
} else crawler = new HttpCrawler(crawlerOptions);
|
|
393
|
+
if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
|
|
394
|
+
if (driver === "playwright") crawler = new PlaywrightCrawler(crawlerOptions);
|
|
395
|
+
else crawler = new HttpCrawler(crawlerOptions);
|
|
396
396
|
const initialRequests = startingUrls.map((url) => ({
|
|
397
397
|
url,
|
|
398
398
|
userData: { depth: 0 }
|
package/dist/cli.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-
|
|
1
|
+
import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-DEysrw0h.mjs";
|
|
2
2
|
import { readFileSync } from "node:fs";
|
|
3
3
|
import { dirname, join, resolve } from "pathe";
|
|
4
4
|
import { fileURLToPath } from "node:url";
|
package/dist/index.mjs
CHANGED
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.7.1",
|
|
4
|
+
"version": "0.7.2",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -46,11 +46,11 @@
|
|
|
46
46
|
},
|
|
47
47
|
"dependencies": {
|
|
48
48
|
"@clack/prompts": "^0.11.0",
|
|
49
|
-
"crawlee": "^3.13.
|
|
49
|
+
"crawlee": "^3.13.10",
|
|
50
50
|
"nypm": "^0.6.0",
|
|
51
51
|
"pathe": "^2.0.3",
|
|
52
52
|
"picomatch": "^4.0.2",
|
|
53
|
-
"mdream": "0.7.1"
|
|
53
|
+
"mdream": "0.7.2"
|
|
54
54
|
},
|
|
55
55
|
"devDependencies": {
|
|
56
56
|
"@types/picomatch": "^4.0.0"
|