@mdream/crawl 0.8.4 → 0.8.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -5,24 +5,10 @@ import { HttpCrawler, PlaywrightCrawler, log, purgeDefaultStorages } from "crawl
|
|
|
5
5
|
import { generateLlmsTxtArtifacts, htmlToMarkdown } from "mdream";
|
|
6
6
|
import { withMinimalPreset } from "mdream/preset/minimal";
|
|
7
7
|
import { dirname, join, normalize, resolve } from "pathe";
|
|
8
|
+
import { withHttps } from "ufo";
|
|
8
9
|
import picomatch from "picomatch";
|
|
9
10
|
import { extractionPlugin } from "mdream/plugins";
|
|
10
11
|
|
|
11
|
-
//#region ../../node_modules/.pnpm/ufo@1.6.1/node_modules/ufo/dist/index.mjs
|
|
12
|
-
const r = String.fromCharCode;
|
|
13
|
-
const PROTOCOL_REGEX = /^[\s\w\0+.-]{2,}:([/\\]{2})?/;
|
|
14
|
-
function withHttps(input) {
|
|
15
|
-
return withProtocol(input, "https://");
|
|
16
|
-
}
|
|
17
|
-
function withProtocol(input, protocol) {
|
|
18
|
-
let match = input.match(PROTOCOL_REGEX);
|
|
19
|
-
if (!match) match = input.match(/^\/{2,}/);
|
|
20
|
-
if (!match) return protocol + input;
|
|
21
|
-
return protocol + input.slice(match[0].length);
|
|
22
|
-
}
|
|
23
|
-
const protocolRelative = Symbol.for("ufo:protocolRelative");
|
|
24
|
-
|
|
25
|
-
//#endregion
|
|
26
12
|
//#region src/glob-utils.ts
|
|
27
13
|
/**
|
|
28
14
|
* Parse a URL that may contain glob patterns
|
|
@@ -467,14 +453,14 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
467
453
|
await crawler.run(initialRequests);
|
|
468
454
|
progress.crawling.status = "completed";
|
|
469
455
|
onProgress?.(progress);
|
|
470
|
-
if (results.some((r$1) => r$1.success)) {
|
|
456
|
+
if (results.some((r) => r.success)) {
|
|
471
457
|
progress.generation.status = "generating";
|
|
472
458
|
onProgress?.(progress);
|
|
473
|
-
const successfulResults = results.filter((r$1) => r$1.success);
|
|
459
|
+
const successfulResults = results.filter((r) => r.success);
|
|
474
460
|
const firstUrl = new URL(withHttps(urls[0]));
|
|
475
461
|
const origin$1 = firstUrl.origin;
|
|
476
|
-
const homePageResult = successfulResults.find((r$1) => {
|
|
477
|
-
const resultUrl = new URL(withHttps(r$1.url));
|
|
462
|
+
const homePageResult = successfulResults.find((r) => {
|
|
463
|
+
const resultUrl = new URL(withHttps(r.url));
|
|
478
464
|
return resultUrl.href === origin$1 || resultUrl.href === `${origin$1}/`;
|
|
479
465
|
});
|
|
480
466
|
const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
|
|
@@ -528,4 +514,4 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
528
514
|
}
|
|
529
515
|
|
|
530
516
|
//#endregion
|
|
531
|
-
export { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps };
|
|
517
|
+
export { crawlAndGenerate, parseUrlPattern, validateGlobPattern };
|
package/dist/cli.mjs
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
import { crawlAndGenerate, parseUrlPattern, validateGlobPattern
|
|
1
|
+
import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-BtuYX2_u.mjs";
|
|
2
2
|
import { readFileSync } from "node:fs";
|
|
3
3
|
import * as p$1 from "@clack/prompts";
|
|
4
4
|
import * as p from "@clack/prompts";
|
|
5
5
|
import { PlaywrightCrawler } from "crawlee";
|
|
6
6
|
import { dirname, join, resolve } from "pathe";
|
|
7
|
+
import { withHttps } from "ufo";
|
|
7
8
|
import { fileURLToPath } from "node:url";
|
|
8
9
|
import { addDependency } from "nypm";
|
|
9
10
|
|
package/dist/index.mjs
CHANGED
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.8.4",
|
|
4
|
+
"version": "0.8.5",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -50,7 +50,8 @@
|
|
|
50
50
|
"nypm": "^0.6.0",
|
|
51
51
|
"pathe": "^2.0.3",
|
|
52
52
|
"picomatch": "^4.0.3",
|
|
53
|
-
"mdream": "0.8.4"
|
|
53
|
+
"ufo": "^1.6.1",
|
|
54
|
+
"mdream": "0.8.5"
|
|
54
55
|
},
|
|
55
56
|
"devDependencies": {
|
|
56
57
|
"@types/picomatch": "^4.0.1"
|