@mdream/crawl 0.17.1 → 1.0.0-beta.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -9
- package/dist/_chunks/crawl.mjs +28 -27
- package/dist/index.d.mts +1 -11
- package/dist/index.mjs +1 -63
- package/package.json +3 -2
package/README.md
CHANGED
|
@@ -31,7 +31,7 @@ The crawler will automatically discover and follow internal links to crawl entir
|
|
|
31
31
|
You can also use @mdream/crawl programmatically in your Node.js applications:
|
|
32
32
|
|
|
33
33
|
```typescript
|
|
34
|
-
import { crawlAndGenerate, generateLlmsTxt } from '@mdream/crawl'
|
|
34
|
+
import { crawlAndGenerate } from '@mdream/crawl'
|
|
35
35
|
|
|
36
36
|
// Crawl entire websites programmatically
|
|
37
37
|
const results = await crawlAndGenerate({
|
|
@@ -44,16 +44,10 @@ const results = await crawlAndGenerate({
|
|
|
44
44
|
driver: 'http', // or 'playwright' for JS-heavy sites
|
|
45
45
|
verbose: true
|
|
46
46
|
})
|
|
47
|
-
|
|
48
|
-
// Generate llms.txt manually from existing results
|
|
49
|
-
await generateLlmsTxt({
|
|
50
|
-
siteName: 'Example Site',
|
|
51
|
-
description: 'Documentation for Example Site',
|
|
52
|
-
results: crawlResults,
|
|
53
|
-
outputPath: './output/llms.txt'
|
|
54
|
-
})
|
|
55
47
|
```
|
|
56
48
|
|
|
49
|
+
> **Note**: llms.txt artifact generation is handled by [`@mdream/js/llms-txt`](../js). The crawl package uses it internally when `generateLlmsTxt: true`.
|
|
50
|
+
|
|
57
51
|
## Output
|
|
58
52
|
|
|
59
53
|
The crawler generates comprehensive output from entire websites:
|
package/dist/_chunks/crawl.mjs
CHANGED
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
import { existsSync, mkdirSync } from "node:fs";
|
|
2
2
|
import { writeFile } from "node:fs/promises";
|
|
3
3
|
import * as p from "@clack/prompts";
|
|
4
|
+
import { generateLlmsTxtArtifacts } from "@mdream/js/llms-txt";
|
|
4
5
|
import { HttpCrawler, PlaywrightCrawler, log, purgeDefaultStorages } from "crawlee";
|
|
5
6
|
import { htmlToMarkdown } from "mdream";
|
|
6
|
-
import { generateLlmsTxtArtifacts } from "mdream/llms-txt";
|
|
7
|
-
import { withMinimalPreset } from "mdream/preset/minimal";
|
|
8
7
|
import { dirname, join, normalize, resolve } from "pathe";
|
|
9
8
|
import { withHttps } from "ufo";
|
|
10
9
|
import picomatch from "picomatch";
|
|
11
|
-
import { extractionPlugin } from "mdream/plugins";
|
|
12
10
|
//#region src/glob-utils.ts
|
|
13
|
-
|
|
11
|
+
function stripGlobTail(s) {
|
|
12
|
+
const idx = s.indexOf("*");
|
|
13
|
+
return idx === -1 ? s : s.slice(0, idx);
|
|
14
|
+
}
|
|
14
15
|
const GLOB_CHAR_RE = /[*?[]/;
|
|
15
16
|
/**
|
|
16
17
|
* Parse a URL that may contain glob patterns
|
|
@@ -23,7 +24,7 @@ function parseUrlPattern(input) {
|
|
|
23
24
|
isGlob: false
|
|
24
25
|
};
|
|
25
26
|
try {
|
|
26
|
-
const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`)
|
|
27
|
+
const urlWithoutGlob = stripGlobTail(input.startsWith("http") ? input : `https://${input}`);
|
|
27
28
|
const url = new URL(urlWithoutGlob);
|
|
28
29
|
const baseUrl = `${url.protocol}//${url.host}`;
|
|
29
30
|
const patternStart = input.indexOf(url.host) + url.host.length;
|
|
@@ -116,40 +117,40 @@ function extractMetadata(html, url) {
|
|
|
116
117
|
let keywords = "";
|
|
117
118
|
let author = "";
|
|
118
119
|
htmlToMarkdown(html, {
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
120
|
+
origin: new URL(url).origin,
|
|
121
|
+
extraction: {
|
|
122
|
+
"a[href]": (el) => {
|
|
123
|
+
const href = el.attributes.href;
|
|
122
124
|
if (href) try {
|
|
123
125
|
const absoluteUrl = new URL(href, url).href;
|
|
124
126
|
if (!links.includes(absoluteUrl)) links.push(absoluteUrl);
|
|
125
127
|
} catch {}
|
|
126
128
|
},
|
|
127
|
-
"title": (
|
|
128
|
-
if (!title
|
|
129
|
+
"title": (el) => {
|
|
130
|
+
if (!title) title = el.textContent;
|
|
129
131
|
},
|
|
130
|
-
"meta[name=\"description\"]": (
|
|
131
|
-
if (!description
|
|
132
|
+
"meta[name=\"description\"]": (el) => {
|
|
133
|
+
if (!description) description = el.attributes.content || "";
|
|
132
134
|
},
|
|
133
|
-
"meta[property=\"og:description\"]": (
|
|
134
|
-
if (!description
|
|
135
|
+
"meta[property=\"og:description\"]": (el) => {
|
|
136
|
+
if (!description) description = el.attributes.content || "";
|
|
135
137
|
},
|
|
136
|
-
"meta[name=\"keywords\"]": (
|
|
137
|
-
if (!keywords
|
|
138
|
+
"meta[name=\"keywords\"]": (el) => {
|
|
139
|
+
if (!keywords) keywords = el.attributes.content || "";
|
|
138
140
|
},
|
|
139
|
-
"meta[name=\"author\"]": (
|
|
140
|
-
if (!author
|
|
141
|
+
"meta[name=\"author\"]": (el) => {
|
|
142
|
+
if (!author) author = el.attributes.content || "";
|
|
141
143
|
},
|
|
142
|
-
"meta[property=\"og:title\"]": (
|
|
143
|
-
if (!title
|
|
144
|
+
"meta[property=\"og:title\"]": (el) => {
|
|
145
|
+
if (!title) title = el.attributes.content || "";
|
|
144
146
|
}
|
|
145
|
-
}
|
|
146
|
-
origin: new URL(url).origin
|
|
147
|
+
}
|
|
147
148
|
});
|
|
148
149
|
return {
|
|
149
|
-
title: title || new URL(url).pathname,
|
|
150
|
-
description: description || void 0,
|
|
151
|
-
keywords: keywords || void 0,
|
|
152
|
-
author: author || void 0,
|
|
150
|
+
title: title.trim() || new URL(url).pathname,
|
|
151
|
+
description: description.trim() || void 0,
|
|
152
|
+
keywords: keywords.trim() || void 0,
|
|
153
|
+
author: author.trim() || void 0,
|
|
153
154
|
links: links.filter((link) => {
|
|
154
155
|
try {
|
|
155
156
|
const linkUrl = new URL(link);
|
|
@@ -441,7 +442,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
441
442
|
origin: pageOrigin
|
|
442
443
|
});
|
|
443
444
|
let md = "";
|
|
444
|
-
if (shouldProcessMarkdown) md = htmlToMarkdown(html,
|
|
445
|
+
if (shouldProcessMarkdown) md = htmlToMarkdown(html, { origin: pageOrigin });
|
|
445
446
|
let filePath;
|
|
446
447
|
if (shouldProcessMarkdown && generateIndividualMd) {
|
|
447
448
|
const urlObj = new URL(request.loadedUrl);
|
package/dist/index.d.mts
CHANGED
|
@@ -51,12 +51,6 @@ interface CrawlResult {
|
|
|
51
51
|
metadata?: PageMetadata;
|
|
52
52
|
depth?: number;
|
|
53
53
|
}
|
|
54
|
-
interface LlmsTxtOptions {
|
|
55
|
-
siteName: string;
|
|
56
|
-
description?: string;
|
|
57
|
-
results: CrawlResult[];
|
|
58
|
-
outputPath: string;
|
|
59
|
-
}
|
|
60
54
|
//#endregion
|
|
61
55
|
//#region src/crawl.d.ts
|
|
62
56
|
interface CrawlProgress {
|
|
@@ -78,8 +72,4 @@ interface CrawlProgress {
|
|
|
78
72
|
}
|
|
79
73
|
declare function crawlAndGenerate(options: CrawlOptions, onProgress?: (progress: CrawlProgress) => void): Promise<CrawlResult[]>;
|
|
80
74
|
//#endregion
|
|
81
|
-
|
|
82
|
-
declare function generateLlmsTxt(options: LlmsTxtOptions): Promise<void>;
|
|
83
|
-
declare function generateLlmsFullTxt(options: LlmsTxtOptions): Promise<void>;
|
|
84
|
-
//#endregion
|
|
85
|
-
export { type CrawlOptions, type CrawlResult, type LlmsTxtOptions, type PageData, crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
|
|
75
|
+
export { type CrawlOptions, type CrawlResult, type PageData, crawlAndGenerate };
|
package/dist/index.mjs
CHANGED
|
@@ -1,64 +1,2 @@
|
|
|
1
1
|
import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
|
|
2
|
-
|
|
3
|
-
import { basename, sep } from "pathe";
|
|
4
|
-
//#region src/llms-txt.ts
|
|
5
|
-
const ANCHOR_UNSAFE_CHARS_RE = /[^a-z0-9]/g;
|
|
6
|
-
async function generateLlmsTxt(options) {
|
|
7
|
-
const { siteName, description, results, outputPath } = options;
|
|
8
|
-
let content = `# ${siteName}\n\n`;
|
|
9
|
-
if (description) content += `> ${description}\n\n`;
|
|
10
|
-
if (results.length > 0) {
|
|
11
|
-
content += `## Pages\n\n`;
|
|
12
|
-
for (const result of results) {
|
|
13
|
-
let title;
|
|
14
|
-
try {
|
|
15
|
-
title = result.title || new URL(result.url).pathname;
|
|
16
|
-
} catch {
|
|
17
|
-
title = result.title || result.url;
|
|
18
|
-
}
|
|
19
|
-
if (result.filePath) {
|
|
20
|
-
const mdSeparator = `${sep}md${sep}`;
|
|
21
|
-
const mdIndex = result.filePath.indexOf(mdSeparator);
|
|
22
|
-
const linkPath = (mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath)).split(sep).join("/");
|
|
23
|
-
content += `- [${title}](md/${linkPath}): ${result.url}\n`;
|
|
24
|
-
} else {
|
|
25
|
-
const description = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
|
|
26
|
-
content += `- [${title}](${result.url})${description ? `: ${description}` : ""}\n`;
|
|
27
|
-
}
|
|
28
|
-
}
|
|
29
|
-
}
|
|
30
|
-
await writeFile(outputPath, content, "utf-8");
|
|
31
|
-
}
|
|
32
|
-
async function generateLlmsFullTxt(options) {
|
|
33
|
-
const { siteName, description, results, outputPath } = options;
|
|
34
|
-
let content = `# ${siteName}\n\n`;
|
|
35
|
-
if (description) content += `> ${description}\n\n`;
|
|
36
|
-
if (results.length > 0) {
|
|
37
|
-
content += `## Table of Contents\n\n`;
|
|
38
|
-
for (const result of results) {
|
|
39
|
-
let title;
|
|
40
|
-
try {
|
|
41
|
-
title = result.title || new URL(result.url).pathname;
|
|
42
|
-
} catch {
|
|
43
|
-
title = result.title || result.url;
|
|
44
|
-
}
|
|
45
|
-
const anchor = title.toLowerCase().replace(ANCHOR_UNSAFE_CHARS_RE, "-");
|
|
46
|
-
content += `- [${title}](#${anchor})\n`;
|
|
47
|
-
}
|
|
48
|
-
content += `\n---\n\n`;
|
|
49
|
-
for (const result of results) {
|
|
50
|
-
let title;
|
|
51
|
-
try {
|
|
52
|
-
title = result.title || new URL(result.url).pathname;
|
|
53
|
-
} catch {
|
|
54
|
-
title = result.title || result.url;
|
|
55
|
-
}
|
|
56
|
-
content += `## ${title}\n\n`;
|
|
57
|
-
content += `**URL:** ${result.url}\n\n`;
|
|
58
|
-
content += `${result.content}\n\n---\n\n`;
|
|
59
|
-
}
|
|
60
|
-
}
|
|
61
|
-
await writeFile(outputPath, content, "utf-8");
|
|
62
|
-
}
|
|
63
|
-
//#endregion
|
|
64
|
-
export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
|
|
2
|
+
export { crawlAndGenerate };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.17.1",
|
|
4
|
+
"version": "1.0.0-beta.9",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -56,7 +56,8 @@
|
|
|
56
56
|
"pathe": "^2.0.3",
|
|
57
57
|
"picomatch": "^4.0.3",
|
|
58
58
|
"ufo": "^1.6.3",
|
|
59
|
-
"mdream": "0.17.1"
|
|
59
|
+
"@mdream/js": "1.0.0-beta.9",
|
|
60
|
+
"mdream": "1.0.0-beta.9"
|
|
60
61
|
},
|
|
61
62
|
"devDependencies": {
|
|
62
63
|
"@types/picomatch": "^4.0.2"
|