@mdream/crawl 0.16.0 → 0.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,13 @@ import { dirname, join, normalize, resolve } from "pathe";
9
9
  import { withHttps } from "ufo";
10
10
  import picomatch from "picomatch";
11
11
  import { extractionPlugin } from "mdream/plugins";
12
+ //#region src/glob-utils.ts
13
+ const GLOB_STRIP_TAIL_RE = /\*.*$/;
14
+ const GLOB_CHAR_RE = /[*?[]/;
15
+ /**
16
+ * Parse a URL that may contain glob patterns
17
+ * Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
18
+ */
12
19
  function parseUrlPattern(input) {
13
20
  if (!(input.includes("*") || input.includes("?") || input.includes("["))) return {
14
21
  baseUrl: input,
@@ -16,7 +23,7 @@ function parseUrlPattern(input) {
16
23
  isGlob: false
17
24
  };
18
25
  try {
19
- const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(/\*.*$/, "");
26
+ const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(GLOB_STRIP_TAIL_RE, "");
20
27
  const url = new URL(urlWithoutGlob);
21
28
  const baseUrl = `${url.protocol}//${url.host}`;
22
29
  const patternStart = input.indexOf(url.host) + url.host.length;
@@ -29,6 +36,9 @@ function parseUrlPattern(input) {
29
36
  throw new Error(`Invalid URL pattern: "${input}". Please provide a valid URL with glob patterns (e.g., "example.com/docs/*" or "https://example.com/api/**").`);
30
37
  }
31
38
  }
39
+ /**
40
+ * Check if a URL matches a glob pattern
41
+ */
32
42
  function matchesGlobPattern(url, parsedPattern) {
33
43
  if (!parsedPattern.isGlob) return true;
34
44
  try {
@@ -45,16 +55,23 @@ function matchesGlobPattern(url, parsedPattern) {
45
55
  return false;
46
56
  }
47
57
  }
58
+ /**
59
+ * Get the starting URL for crawling from a glob pattern
60
+ * For https://nuxtseo.com/docs/**, we want to start at https://nuxtseo.com/docs/ (the path up to the last "/" before the first glob character)
61
+ */
48
62
  function getStartingUrl(parsedPattern) {
49
63
  if (!parsedPattern.isGlob) return withHttps(parsedPattern.baseUrl);
50
64
  const pattern = parsedPattern.pattern;
51
- const firstGlobIndex = pattern.search(/[*?[]/);
65
+ const firstGlobIndex = pattern.search(GLOB_CHAR_RE);
52
66
  if (firstGlobIndex === -1) return withHttps(parsedPattern.baseUrl + pattern);
53
67
  const beforeGlob = pattern.substring(0, firstGlobIndex);
54
68
  const lastSlash = beforeGlob.lastIndexOf("/");
55
69
  const pathBeforeGlob = lastSlash >= 0 ? beforeGlob.substring(0, lastSlash + 1) : "/";
56
70
  return withHttps(parsedPattern.baseUrl + pathBeforeGlob);
57
71
  }
72
+ /**
73
+ * Check if a URL should be excluded based on exclude patterns
74
+ */
58
75
  function isUrlExcluded(url, excludePatterns) {
59
76
  if (!excludePatterns || excludePatterns.length === 0) return false;
60
77
  try {
@@ -73,6 +90,15 @@ function isUrlExcluded(url, excludePatterns) {
73
90
  return false;
74
91
  }
75
92
  }
93
+ /**
94
+ * Check if a string looks like sitemap XML, i.e. contains a "<urlset" or "<sitemapindex" tag (as opposed to an HTML page or other non-sitemap response)
95
+ */
96
+ function isValidSitemapXml(content) {
97
+ return content.includes("<urlset") || content.includes("<sitemapindex");
98
+ }
99
+ /**
100
+ * Validate glob pattern syntax
101
+ */
76
102
  function validateGlobPattern(pattern) {
77
103
  try {
78
104
  parseUrlPattern(pattern);
@@ -81,6 +107,8 @@ function validateGlobPattern(pattern) {
81
107
  return `Invalid glob pattern: ${error instanceof Error ? error.message : error}`;
82
108
  }
83
109
  }
110
+ //#endregion
111
+ //#region src/metadata-extractor.ts
84
112
  function extractMetadata(html, url) {
85
113
  const links = [];
86
114
  let title = "";
@@ -133,6 +161,15 @@ function extractMetadata(html, url) {
133
161
  })
134
162
  };
135
163
  }
164
+ //#endregion
165
+ //#region src/crawl.ts
166
+ const SITEMAP_INDEX_LOC_RE = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
167
+ const SITEMAP_URL_LOC_RE = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
168
+ const ROBOTS_SITEMAP_RE = /Sitemap:\s*(.*)/gi;
169
+ const ROBOTS_SITEMAP_PREFIX_RE = /Sitemap:\s*/i;
170
+ const URL_TRAILING_SLASH_RE = /\/$/;
171
+ const URL_PATH_UNSAFE_CHARS_RE = /[^\w\-]/g;
172
+ const FRONTMATTER_BLOCK_RE = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
136
173
  async function loadSitemapWithoutRetries(sitemapUrl) {
137
174
  const controller = new AbortController();
138
175
  const timeoutId = setTimeout(() => controller.abort(), 1e4);
@@ -144,12 +181,13 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
144
181
  clearTimeout(timeoutId);
145
182
  if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
146
183
  const xmlContent = await response.text();
184
+ if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
147
185
  if (xmlContent.includes("<sitemapindex")) {
148
- const sitemapIndexRegex = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
186
+ SITEMAP_INDEX_LOC_RE.lastIndex = 0;
149
187
  const childSitemaps = [];
150
188
  let match;
151
189
  while (true) {
152
- match = sitemapIndexRegex.exec(xmlContent);
190
+ match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
153
191
  if (match === null) break;
154
192
  let url = match[1];
155
193
  if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
@@ -165,10 +203,10 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
165
203
  return allUrls;
166
204
  } else {
167
205
  const urls = [];
168
- const urlRegex = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
206
+ SITEMAP_URL_LOC_RE.lastIndex = 0;
169
207
  let match;
170
208
  while (true) {
171
- match = urlRegex.exec(xmlContent);
209
+ match = SITEMAP_URL_LOC_RE.exec(xmlContent);
172
210
  if (match === null) break;
173
211
  let url = match[1];
174
212
  if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
@@ -227,12 +265,12 @@ async function crawlAndGenerate(options, onProgress) {
227
265
  robotsResponse = null;
228
266
  }
229
267
  if (robotsResponse?.ok) {
230
- const sitemapMatches = (await robotsResponse.text()).match(/Sitemap:\s*(.*)/gi);
268
+ const sitemapMatches = (await robotsResponse.text()).match(ROBOTS_SITEMAP_RE);
231
269
  if (sitemapMatches && sitemapMatches.length > 0) {
232
270
  progress.sitemap.found = sitemapMatches.length;
233
271
  progress.sitemap.status = "processing";
234
272
  onProgress?.(progress);
235
- const robotsSitemaps = sitemapMatches.map((match) => match.replace(/Sitemap:\s*/i, "").trim());
273
+ const robotsSitemaps = sitemapMatches.map((match) => match.replace(ROBOTS_SITEMAP_PREFIX_RE, "").trim());
236
274
  for (const sitemapUrl of robotsSitemaps) try {
237
275
  const robotsUrls = await loadSitemapWithoutRetries(sitemapUrl);
238
276
  sitemapAttempts.push({
@@ -403,17 +441,17 @@ async function crawlAndGenerate(options, onProgress) {
403
441
  origin: pageOrigin
404
442
  });
405
443
  let md = "";
406
- if (shouldProcessMarkdown && (!onPage || generateIndividualMd)) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
444
+ if (shouldProcessMarkdown) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
407
445
  let filePath;
408
446
  if (shouldProcessMarkdown && generateIndividualMd) {
409
447
  const urlObj = new URL(request.loadedUrl);
410
- const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(/\/$/, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(/[^\w\-]/g, "-"));
448
+ const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"));
411
449
  filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
412
450
  const fileDir = dirname(filePath);
413
451
  if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
414
452
  await writeFile(filePath, md, "utf-8");
415
453
  }
416
- const isHomePage = request.loadedUrl.replace(/\/$/, "") === homePageUrl.replace(/\/$/, "");
454
+ const isHomePage = request.loadedUrl.replace(URL_TRAILING_SLASH_RE, "") === homePageUrl.replace(URL_TRAILING_SLASH_RE, "");
417
455
  if (shouldProcessMarkdown || isHomePage) {
418
456
  const result = {
419
457
  url: request.loadedUrl,
@@ -530,7 +568,7 @@ async function crawlAndGenerate(options, onProgress) {
530
568
  onProgress?.(progress);
531
569
  const contentResults = successfulResults.filter((result) => {
532
570
  if (!result.content) return false;
533
- return result.content.trim().replace(/^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/, "").trim().length > 10;
571
+ return result.content.trim().replace(FRONTMATTER_BLOCK_RE, "").trim().length > 10;
534
572
  });
535
573
  const seenUrls = /* @__PURE__ */ new Set();
536
574
  const llmsResult = await generateLlmsTxtArtifacts({
@@ -568,4 +606,5 @@ async function crawlAndGenerate(options, onProgress) {
568
606
  await purgeDefaultStorages();
569
607
  return results;
570
608
  }
609
+ //#endregion
571
610
  export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };
package/dist/cli.mjs CHANGED
@@ -6,6 +6,7 @@ import { dirname, join, resolve } from "pathe";
6
6
  import { withHttps } from "ufo";
7
7
  import { fileURLToPath } from "node:url";
8
8
  import { addDependency } from "nypm";
9
+ //#region src/playwright-utils.ts
9
10
  async function checkPlaywrightInstallation() {
10
11
  try {
11
12
  await import("playwright");
@@ -59,6 +60,8 @@ async function isUseChromeSupported() {
59
60
  } catch {}
60
61
  return false;
61
62
  }
63
+ //#endregion
64
+ //#region src/cli.ts
62
65
  const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
63
66
  const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
64
67
  function checkOutputDirectoryPermissions(outputDir) {
@@ -488,4 +491,5 @@ main().catch((error) => {
488
491
  p.log.error(`Unexpected error: ${error}`);
489
492
  process.exit(1);
490
493
  });
494
+ //#endregion
491
495
  export {};
package/dist/index.mjs CHANGED
@@ -1,6 +1,8 @@
1
1
  import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
2
2
  import { writeFile } from "node:fs/promises";
3
3
  import { basename, sep } from "pathe";
4
+ //#region src/llms-txt.ts
5
+ const ANCHOR_UNSAFE_CHARS_RE = /[^a-z0-9]/g;
4
6
  async function generateLlmsTxt(options) {
5
7
  const { siteName, description, results, outputPath } = options;
6
8
  let content = `# ${siteName}\n\n`;
@@ -40,7 +42,7 @@ async function generateLlmsFullTxt(options) {
40
42
  } catch {
41
43
  title = result.title || result.url;
42
44
  }
43
- const anchor = title.toLowerCase().replace(/[^a-z0-9]/g, "-");
45
+ const anchor = title.toLowerCase().replace(ANCHOR_UNSAFE_CHARS_RE, "-");
44
46
  content += `- [${title}](#${anchor})\n`;
45
47
  }
46
48
  content += `\n---\n\n`;
@@ -58,4 +60,5 @@ async function generateLlmsFullTxt(options) {
58
60
  }
59
61
  await writeFile(outputPath, content, "utf-8");
60
62
  }
63
+ //#endregion
61
64
  export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@mdream/crawl",
3
3
  "type": "module",
4
- "version": "0.16.0",
4
+ "version": "0.17.1",
5
5
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -50,13 +50,13 @@
50
50
  }
51
51
  },
52
52
  "dependencies": {
53
- "@clack/prompts": "^1.0.1",
53
+ "@clack/prompts": "^1.1.0",
54
54
  "crawlee": "^3.16.0",
55
55
  "nypm": "^0.6.5",
56
56
  "pathe": "^2.0.3",
57
57
  "picomatch": "^4.0.3",
58
58
  "ufo": "^1.6.3",
59
- "mdream": "0.16.0"
59
+ "mdream": "0.17.1"
60
60
  },
61
61
  "devDependencies": {
62
62
  "@types/picomatch": "^4.0.2"