@mdream/crawl 0.16.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,13 @@ import { dirname, join, normalize, resolve } from "pathe";
9
9
  import { withHttps } from "ufo";
10
10
  import picomatch from "picomatch";
11
11
  import { extractionPlugin } from "mdream/plugins";
12
+ //#region src/glob-utils.ts
13
+ const GLOB_STRIP_TAIL_RE = /\*.*$/;
14
+ const GLOB_CHAR_RE = /[*?[]/;
15
+ /**
16
+ * Parse a URL that may contain glob patterns
17
+ * Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
18
+ */
12
19
  function parseUrlPattern(input) {
13
20
  if (!(input.includes("*") || input.includes("?") || input.includes("["))) return {
14
21
  baseUrl: input,
@@ -16,7 +23,7 @@ function parseUrlPattern(input) {
16
23
  isGlob: false
17
24
  };
18
25
  try {
19
- const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(/\*.*$/, "");
26
+ const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(GLOB_STRIP_TAIL_RE, "");
20
27
  const url = new URL(urlWithoutGlob);
21
28
  const baseUrl = `${url.protocol}//${url.host}`;
22
29
  const patternStart = input.indexOf(url.host) + url.host.length;
@@ -29,6 +36,9 @@ function parseUrlPattern(input) {
29
36
  throw new Error(`Invalid URL pattern: "${input}". Please provide a valid URL with glob patterns (e.g., "example.com/docs/*" or "https://example.com/api/**").`);
30
37
  }
31
38
  }
39
+ /**
40
+ * Check if a URL matches a glob pattern
41
+ */
32
42
  function matchesGlobPattern(url, parsedPattern) {
33
43
  if (!parsedPattern.isGlob) return true;
34
44
  try {
@@ -45,16 +55,23 @@ function matchesGlobPattern(url, parsedPattern) {
45
55
  return false;
46
56
  }
47
57
  }
58
+ /**
59
+ * Get the starting URL for crawling from a glob pattern
60
+ * For https://nuxtseo.com/docs/**, we want to start at https://nuxtseo.com/docs/ (the path up to the last "/" before the first glob character)
61
+ */
48
62
  function getStartingUrl(parsedPattern) {
49
63
  if (!parsedPattern.isGlob) return withHttps(parsedPattern.baseUrl);
50
64
  const pattern = parsedPattern.pattern;
51
- const firstGlobIndex = pattern.search(/[*?[]/);
65
+ const firstGlobIndex = pattern.search(GLOB_CHAR_RE);
52
66
  if (firstGlobIndex === -1) return withHttps(parsedPattern.baseUrl + pattern);
53
67
  const beforeGlob = pattern.substring(0, firstGlobIndex);
54
68
  const lastSlash = beforeGlob.lastIndexOf("/");
55
69
  const pathBeforeGlob = lastSlash >= 0 ? beforeGlob.substring(0, lastSlash + 1) : "/";
56
70
  return withHttps(parsedPattern.baseUrl + pathBeforeGlob);
57
71
  }
72
+ /**
73
+ * Check if a URL should be excluded based on exclude patterns
74
+ */
58
75
  function isUrlExcluded(url, excludePatterns) {
59
76
  if (!excludePatterns || excludePatterns.length === 0) return false;
60
77
  try {
@@ -73,6 +90,9 @@ function isUrlExcluded(url, excludePatterns) {
73
90
  return false;
74
91
  }
75
92
  }
93
+ /**
94
+ * Validate glob pattern syntax
95
+ */
76
96
  function validateGlobPattern(pattern) {
77
97
  try {
78
98
  parseUrlPattern(pattern);
@@ -81,6 +101,8 @@ function validateGlobPattern(pattern) {
81
101
  return `Invalid glob pattern: ${error instanceof Error ? error.message : error}`;
82
102
  }
83
103
  }
104
+ //#endregion
105
+ //#region src/metadata-extractor.ts
84
106
  function extractMetadata(html, url) {
85
107
  const links = [];
86
108
  let title = "";
@@ -133,6 +155,15 @@ function extractMetadata(html, url) {
133
155
  })
134
156
  };
135
157
  }
158
+ //#endregion
159
+ //#region src/crawl.ts
160
+ const SITEMAP_INDEX_LOC_RE = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
161
+ const SITEMAP_URL_LOC_RE = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
162
+ const ROBOTS_SITEMAP_RE = /Sitemap:\s*(.*)/gi;
163
+ const ROBOTS_SITEMAP_PREFIX_RE = /Sitemap:\s*/i;
164
+ const URL_TRAILING_SLASH_RE = /\/$/;
165
+ const URL_PATH_UNSAFE_CHARS_RE = /[^\w\-]/g;
166
+ const FRONTMATTER_BLOCK_RE = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
136
167
  async function loadSitemapWithoutRetries(sitemapUrl) {
137
168
  const controller = new AbortController();
138
169
  const timeoutId = setTimeout(() => controller.abort(), 1e4);
@@ -145,11 +176,11 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
145
176
  if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
146
177
  const xmlContent = await response.text();
147
178
  if (xmlContent.includes("<sitemapindex")) {
148
- const sitemapIndexRegex = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
179
+ SITEMAP_INDEX_LOC_RE.lastIndex = 0;
149
180
  const childSitemaps = [];
150
181
  let match;
151
182
  while (true) {
152
- match = sitemapIndexRegex.exec(xmlContent);
183
+ match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
153
184
  if (match === null) break;
154
185
  let url = match[1];
155
186
  if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
@@ -165,10 +196,10 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
165
196
  return allUrls;
166
197
  } else {
167
198
  const urls = [];
168
- const urlRegex = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
199
+ SITEMAP_URL_LOC_RE.lastIndex = 0;
169
200
  let match;
170
201
  while (true) {
171
- match = urlRegex.exec(xmlContent);
202
+ match = SITEMAP_URL_LOC_RE.exec(xmlContent);
172
203
  if (match === null) break;
173
204
  let url = match[1];
174
205
  if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
@@ -227,12 +258,12 @@ async function crawlAndGenerate(options, onProgress) {
227
258
  robotsResponse = null;
228
259
  }
229
260
  if (robotsResponse?.ok) {
230
- const sitemapMatches = (await robotsResponse.text()).match(/Sitemap:\s*(.*)/gi);
261
+ const sitemapMatches = (await robotsResponse.text()).match(ROBOTS_SITEMAP_RE);
231
262
  if (sitemapMatches && sitemapMatches.length > 0) {
232
263
  progress.sitemap.found = sitemapMatches.length;
233
264
  progress.sitemap.status = "processing";
234
265
  onProgress?.(progress);
235
- const robotsSitemaps = sitemapMatches.map((match) => match.replace(/Sitemap:\s*/i, "").trim());
266
+ const robotsSitemaps = sitemapMatches.map((match) => match.replace(ROBOTS_SITEMAP_PREFIX_RE, "").trim());
236
267
  for (const sitemapUrl of robotsSitemaps) try {
237
268
  const robotsUrls = await loadSitemapWithoutRetries(sitemapUrl);
238
269
  sitemapAttempts.push({
@@ -403,17 +434,17 @@ async function crawlAndGenerate(options, onProgress) {
403
434
  origin: pageOrigin
404
435
  });
405
436
  let md = "";
406
- if (shouldProcessMarkdown && (!onPage || generateIndividualMd)) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
437
+ if (shouldProcessMarkdown) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
407
438
  let filePath;
408
439
  if (shouldProcessMarkdown && generateIndividualMd) {
409
440
  const urlObj = new URL(request.loadedUrl);
410
- const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(/\/$/, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(/[^\w\-]/g, "-"));
441
+ const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"));
411
442
  filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
412
443
  const fileDir = dirname(filePath);
413
444
  if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
414
445
  await writeFile(filePath, md, "utf-8");
415
446
  }
416
- const isHomePage = request.loadedUrl.replace(/\/$/, "") === homePageUrl.replace(/\/$/, "");
447
+ const isHomePage = request.loadedUrl.replace(URL_TRAILING_SLASH_RE, "") === homePageUrl.replace(URL_TRAILING_SLASH_RE, "");
417
448
  if (shouldProcessMarkdown || isHomePage) {
418
449
  const result = {
419
450
  url: request.loadedUrl,
@@ -530,7 +561,7 @@ async function crawlAndGenerate(options, onProgress) {
530
561
  onProgress?.(progress);
531
562
  const contentResults = successfulResults.filter((result) => {
532
563
  if (!result.content) return false;
533
- return result.content.trim().replace(/^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/, "").trim().length > 10;
564
+ return result.content.trim().replace(FRONTMATTER_BLOCK_RE, "").trim().length > 10;
534
565
  });
535
566
  const seenUrls = /* @__PURE__ */ new Set();
536
567
  const llmsResult = await generateLlmsTxtArtifacts({
@@ -568,4 +599,5 @@ async function crawlAndGenerate(options, onProgress) {
568
599
  await purgeDefaultStorages();
569
600
  return results;
570
601
  }
602
+ //#endregion
571
603
  export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };
package/dist/cli.mjs CHANGED
@@ -6,6 +6,7 @@ import { dirname, join, resolve } from "pathe";
6
6
  import { withHttps } from "ufo";
7
7
  import { fileURLToPath } from "node:url";
8
8
  import { addDependency } from "nypm";
9
+ //#region src/playwright-utils.ts
9
10
  async function checkPlaywrightInstallation() {
10
11
  try {
11
12
  await import("playwright");
@@ -59,6 +60,8 @@ async function isUseChromeSupported() {
59
60
  } catch {}
60
61
  return false;
61
62
  }
63
+ //#endregion
64
+ //#region src/cli.ts
62
65
  const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
63
66
  const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
64
67
  function checkOutputDirectoryPermissions(outputDir) {
@@ -488,4 +491,5 @@ main().catch((error) => {
488
491
  p.log.error(`Unexpected error: ${error}`);
489
492
  process.exit(1);
490
493
  });
494
+ //#endregion
491
495
  export {};
package/dist/index.mjs CHANGED
@@ -1,6 +1,8 @@
1
1
  import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
2
2
  import { writeFile } from "node:fs/promises";
3
3
  import { basename, sep } from "pathe";
4
+ //#region src/llms-txt.ts
5
+ const ANCHOR_UNSAFE_CHARS_RE = /[^a-z0-9]/g;
4
6
  async function generateLlmsTxt(options) {
5
7
  const { siteName, description, results, outputPath } = options;
6
8
  let content = `# ${siteName}\n\n`;
@@ -40,7 +42,7 @@ async function generateLlmsFullTxt(options) {
40
42
  } catch {
41
43
  title = result.title || result.url;
42
44
  }
43
- const anchor = title.toLowerCase().replace(/[^a-z0-9]/g, "-");
45
+ const anchor = title.toLowerCase().replace(ANCHOR_UNSAFE_CHARS_RE, "-");
44
46
  content += `- [${title}](#${anchor})\n`;
45
47
  }
46
48
  content += `\n---\n\n`;
@@ -58,4 +60,5 @@ async function generateLlmsFullTxt(options) {
58
60
  }
59
61
  await writeFile(outputPath, content, "utf-8");
60
62
  }
63
+ //#endregion
61
64
  export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@mdream/crawl",
3
3
  "type": "module",
4
- "version": "0.16.0",
4
+ "version": "0.17.0",
5
5
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -50,13 +50,13 @@
50
50
  }
51
51
  },
52
52
  "dependencies": {
53
- "@clack/prompts": "^1.0.1",
53
+ "@clack/prompts": "^1.1.0",
54
54
  "crawlee": "^3.16.0",
55
55
  "nypm": "^0.6.5",
56
56
  "pathe": "^2.0.3",
57
57
  "picomatch": "^4.0.3",
58
58
  "ufo": "^1.6.3",
59
- "mdream": "0.16.0"
59
+ "mdream": "0.17.0"
60
60
  },
61
61
  "devDependencies": {
62
62
  "@types/picomatch": "^4.0.2"