@mdream/crawl 0.17.0 → 1.0.0-beta.10

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/README.md CHANGED
@@ -31,7 +31,7 @@ The crawler will automatically discover and follow internal links to crawl entir
31
31
  You can also use @mdream/crawl programmatically in your Node.js applications:
32
32
 
33
33
  ```typescript
34
- import { crawlAndGenerate, generateLlmsTxt } from '@mdream/crawl'
34
+ import { crawlAndGenerate } from '@mdream/crawl'
35
35
 
36
36
  // Crawl entire websites programmatically
37
37
  const results = await crawlAndGenerate({
@@ -44,16 +44,10 @@ const results = await crawlAndGenerate({
44
44
  driver: 'http', // or 'playwright' for JS-heavy sites
45
45
  verbose: true
46
46
  })
47
-
48
- // Generate llms.txt manually from existing results
49
- await generateLlmsTxt({
50
- siteName: 'Example Site',
51
- description: 'Documentation for Example Site',
52
- results: crawlResults,
53
- outputPath: './output/llms.txt'
54
- })
55
47
  ```
56
48
 
49
+ > **Note**: llms.txt artifact generation is handled by [`@mdream/js/llms-txt`](../js). The crawl package uses it internally when `generateLlmsTxt: true`.
50
+
57
51
  ## Output
58
52
 
59
53
  The crawler generates comprehensive output from entire websites:
package/dist/_chunks/crawl.mjs CHANGED
@@ -1,16 +1,17 @@
1
1
  import { existsSync, mkdirSync } from "node:fs";
2
2
  import { writeFile } from "node:fs/promises";
3
3
  import * as p from "@clack/prompts";
4
+ import { generateLlmsTxtArtifacts } from "@mdream/js/llms-txt";
4
5
  import { HttpCrawler, PlaywrightCrawler, log, purgeDefaultStorages } from "crawlee";
5
6
  import { htmlToMarkdown } from "mdream";
6
- import { generateLlmsTxtArtifacts } from "mdream/llms-txt";
7
- import { withMinimalPreset } from "mdream/preset/minimal";
8
7
  import { dirname, join, normalize, resolve } from "pathe";
9
8
  import { withHttps } from "ufo";
10
9
  import picomatch from "picomatch";
11
- import { extractionPlugin } from "mdream/plugins";
12
10
  //#region src/glob-utils.ts
13
- const GLOB_STRIP_TAIL_RE = /\*.*$/;
11
+ function stripGlobTail(s) {
12
+ const idx = s.indexOf("*");
13
+ return idx === -1 ? s : s.slice(0, idx);
14
+ }
14
15
  const GLOB_CHAR_RE = /[*?[]/;
15
16
  /**
16
17
  * Parse a URL that may contain glob patterns
@@ -23,7 +24,7 @@ function parseUrlPattern(input) {
23
24
  isGlob: false
24
25
  };
25
26
  try {
26
- const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(GLOB_STRIP_TAIL_RE, "");
27
+ const urlWithoutGlob = stripGlobTail(input.startsWith("http") ? input : `https://${input}`);
27
28
  const url = new URL(urlWithoutGlob);
28
29
  const baseUrl = `${url.protocol}//${url.host}`;
29
30
  const patternStart = input.indexOf(url.host) + url.host.length;
@@ -91,6 +92,12 @@ function isUrlExcluded(url, excludePatterns) {
91
92
  }
92
93
  }
93
94
  /**
95
+ * Check if a string is valid sitemap XML content (not an HTML page or other non-sitemap response)
96
+ */
97
+ function isValidSitemapXml(content) {
98
+ return content.includes("<urlset") || content.includes("<sitemapindex");
99
+ }
100
+ /**
94
101
  * Validate glob pattern syntax
95
102
  */
96
103
  function validateGlobPattern(pattern) {
@@ -110,40 +117,40 @@ function extractMetadata(html, url) {
110
117
  let keywords = "";
111
118
  let author = "";
112
119
  htmlToMarkdown(html, {
113
- plugins: [extractionPlugin({
114
- "a[href]": (element) => {
115
- const href = element.attributes?.href;
120
+ origin: new URL(url).origin,
121
+ extraction: {
122
+ "a[href]": (el) => {
123
+ const href = el.attributes.href;
116
124
  if (href) try {
117
125
  const absoluteUrl = new URL(href, url).href;
118
126
  if (!links.includes(absoluteUrl)) links.push(absoluteUrl);
119
127
  } catch {}
120
128
  },
121
- "title": (element) => {
122
- if (!title && element.textContent) title = element.textContent.trim();
129
+ "title": (el) => {
130
+ if (!title) title = el.textContent;
123
131
  },
124
- "meta[name=\"description\"]": (element) => {
125
- if (!description && element.attributes?.content) description = element.attributes.content.trim();
132
+ "meta[name=\"description\"]": (el) => {
133
+ if (!description) description = el.attributes.content || "";
126
134
  },
127
- "meta[property=\"og:description\"]": (element) => {
128
- if (!description && element.attributes?.content) description = element.attributes.content.trim();
135
+ "meta[property=\"og:description\"]": (el) => {
136
+ if (!description) description = el.attributes.content || "";
129
137
  },
130
- "meta[name=\"keywords\"]": (element) => {
131
- if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
138
+ "meta[name=\"keywords\"]": (el) => {
139
+ if (!keywords) keywords = el.attributes.content || "";
132
140
  },
133
- "meta[name=\"author\"]": (element) => {
134
- if (!author && element.attributes?.content) author = element.attributes.content.trim();
141
+ "meta[name=\"author\"]": (el) => {
142
+ if (!author) author = el.attributes.content || "";
135
143
  },
136
- "meta[property=\"og:title\"]": (element) => {
137
- if (!title && element.attributes?.content) title = element.attributes.content.trim();
144
+ "meta[property=\"og:title\"]": (el) => {
145
+ if (!title) title = el.attributes.content || "";
138
146
  }
139
- })],
140
- origin: new URL(url).origin
147
+ }
141
148
  });
142
149
  return {
143
- title: title || new URL(url).pathname,
144
- description: description || void 0,
145
- keywords: keywords || void 0,
146
- author: author || void 0,
150
+ title: title.trim() || new URL(url).pathname,
151
+ description: description.trim() || void 0,
152
+ keywords: keywords.trim() || void 0,
153
+ author: author.trim() || void 0,
147
154
  links: links.filter((link) => {
148
155
  try {
149
156
  const linkUrl = new URL(link);
@@ -175,6 +182,7 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
175
182
  clearTimeout(timeoutId);
176
183
  if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
177
184
  const xmlContent = await response.text();
185
+ if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
178
186
  if (xmlContent.includes("<sitemapindex")) {
179
187
  SITEMAP_INDEX_LOC_RE.lastIndex = 0;
180
188
  const childSitemaps = [];
@@ -434,7 +442,7 @@ async function crawlAndGenerate(options, onProgress) {
434
442
  origin: pageOrigin
435
443
  });
436
444
  let md = "";
437
- if (shouldProcessMarkdown) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
445
+ if (shouldProcessMarkdown) md = htmlToMarkdown(html, { origin: pageOrigin });
438
446
  let filePath;
439
447
  if (shouldProcessMarkdown && generateIndividualMd) {
440
448
  const urlObj = new URL(request.loadedUrl);
package/dist/index.d.mts CHANGED
@@ -51,12 +51,6 @@ interface CrawlResult {
51
51
  metadata?: PageMetadata;
52
52
  depth?: number;
53
53
  }
54
- interface LlmsTxtOptions {
55
- siteName: string;
56
- description?: string;
57
- results: CrawlResult[];
58
- outputPath: string;
59
- }
60
54
  //#endregion
61
55
  //#region src/crawl.d.ts
62
56
  interface CrawlProgress {
@@ -78,8 +72,4 @@ interface CrawlProgress {
78
72
  }
79
73
  declare function crawlAndGenerate(options: CrawlOptions, onProgress?: (progress: CrawlProgress) => void): Promise<CrawlResult[]>;
80
74
  //#endregion
81
- //#region src/llms-txt.d.ts
82
- declare function generateLlmsTxt(options: LlmsTxtOptions): Promise<void>;
83
- declare function generateLlmsFullTxt(options: LlmsTxtOptions): Promise<void>;
84
- //#endregion
85
- export { type CrawlOptions, type CrawlResult, type LlmsTxtOptions, type PageData, crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
75
+ export { type CrawlOptions, type CrawlResult, type PageData, crawlAndGenerate };
package/dist/index.mjs CHANGED
@@ -1,64 +1,2 @@
1
1
  import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
2
- import { writeFile } from "node:fs/promises";
3
- import { basename, sep } from "pathe";
4
- //#region src/llms-txt.ts
5
- const ANCHOR_UNSAFE_CHARS_RE = /[^a-z0-9]/g;
6
- async function generateLlmsTxt(options) {
7
- const { siteName, description, results, outputPath } = options;
8
- let content = `# ${siteName}\n\n`;
9
- if (description) content += `> ${description}\n\n`;
10
- if (results.length > 0) {
11
- content += `## Pages\n\n`;
12
- for (const result of results) {
13
- let title;
14
- try {
15
- title = result.title || new URL(result.url).pathname;
16
- } catch {
17
- title = result.title || result.url;
18
- }
19
- if (result.filePath) {
20
- const mdSeparator = `${sep}md${sep}`;
21
- const mdIndex = result.filePath.indexOf(mdSeparator);
22
- const linkPath = (mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath)).split(sep).join("/");
23
- content += `- [${title}](md/${linkPath}): ${result.url}\n`;
24
- } else {
25
- const description = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
26
- content += `- [${title}](${result.url})${description ? `: ${description}` : ""}\n`;
27
- }
28
- }
29
- }
30
- await writeFile(outputPath, content, "utf-8");
31
- }
32
- async function generateLlmsFullTxt(options) {
33
- const { siteName, description, results, outputPath } = options;
34
- let content = `# ${siteName}\n\n`;
35
- if (description) content += `> ${description}\n\n`;
36
- if (results.length > 0) {
37
- content += `## Table of Contents\n\n`;
38
- for (const result of results) {
39
- let title;
40
- try {
41
- title = result.title || new URL(result.url).pathname;
42
- } catch {
43
- title = result.title || result.url;
44
- }
45
- const anchor = title.toLowerCase().replace(ANCHOR_UNSAFE_CHARS_RE, "-");
46
- content += `- [${title}](#${anchor})\n`;
47
- }
48
- content += `\n---\n\n`;
49
- for (const result of results) {
50
- let title;
51
- try {
52
- title = result.title || new URL(result.url).pathname;
53
- } catch {
54
- title = result.title || result.url;
55
- }
56
- content += `## ${title}\n\n`;
57
- content += `**URL:** ${result.url}\n\n`;
58
- content += `${result.content}\n\n---\n\n`;
59
- }
60
- }
61
- await writeFile(outputPath, content, "utf-8");
62
- }
63
- //#endregion
64
- export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
2
+ export { crawlAndGenerate };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@mdream/crawl",
3
3
  "type": "module",
4
- "version": "0.17.0",
4
+ "version": "1.0.0-beta.10",
5
5
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -56,7 +56,8 @@
56
56
  "pathe": "^2.0.3",
57
57
  "picomatch": "^4.0.3",
58
58
  "ufo": "^1.6.3",
59
- "mdream": "0.17.0"
59
+ "mdream": "1.0.0-beta.10",
60
+ "@mdream/js": "1.0.0-beta.10"
60
61
  },
61
62
  "devDependencies": {
62
63
  "@types/picomatch": "^4.0.2"