@mdream/crawl 0.15.3 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,8 +9,9 @@ import { dirname, join, normalize, resolve } from "pathe";
9
9
  import { withHttps } from "ufo";
10
10
  import picomatch from "picomatch";
11
11
  import { extractionPlugin } from "mdream/plugins";
12
-
13
12
  //#region src/glob-utils.ts
13
+ const GLOB_STRIP_TAIL_RE = /\*.*$/;
14
+ const GLOB_CHAR_RE = /[*?[]/;
14
15
  /**
15
16
  * Parse a URL that may contain glob patterns
16
17
  * Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
@@ -22,7 +23,7 @@ function parseUrlPattern(input) {
22
23
  isGlob: false
23
24
  };
24
25
  try {
25
- const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(/\*.*$/, "");
26
+ const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(GLOB_STRIP_TAIL_RE, "");
26
27
  const url = new URL(urlWithoutGlob);
27
28
  const baseUrl = `${url.protocol}//${url.host}`;
28
29
  const patternStart = input.indexOf(url.host) + url.host.length;
@@ -61,7 +62,7 @@ function matchesGlobPattern(url, parsedPattern) {
61
62
  function getStartingUrl(parsedPattern) {
62
63
  if (!parsedPattern.isGlob) return withHttps(parsedPattern.baseUrl);
63
64
  const pattern = parsedPattern.pattern;
64
- const firstGlobIndex = pattern.search(/[*?[]/);
65
+ const firstGlobIndex = pattern.search(GLOB_CHAR_RE);
65
66
  if (firstGlobIndex === -1) return withHttps(parsedPattern.baseUrl + pattern);
66
67
  const beforeGlob = pattern.substring(0, firstGlobIndex);
67
68
  const lastSlash = beforeGlob.lastIndexOf("/");
@@ -100,7 +101,6 @@ function validateGlobPattern(pattern) {
100
101
  return `Invalid glob pattern: ${error instanceof Error ? error.message : error}`;
101
102
  }
102
103
  }
103
-
104
104
  //#endregion
105
105
  //#region src/metadata-extractor.ts
106
106
  function extractMetadata(html, url) {
@@ -155,9 +155,15 @@ function extractMetadata(html, url) {
155
155
  })
156
156
  };
157
157
  }
158
-
159
158
  //#endregion
160
159
  //#region src/crawl.ts
160
+ const SITEMAP_INDEX_LOC_RE = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
161
+ const SITEMAP_URL_LOC_RE = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
162
+ const ROBOTS_SITEMAP_RE = /Sitemap:\s*(.*)/gi;
163
+ const ROBOTS_SITEMAP_PREFIX_RE = /Sitemap:\s*/i;
164
+ const URL_TRAILING_SLASH_RE = /\/$/;
165
+ const URL_PATH_UNSAFE_CHARS_RE = /[^\w\-]/g;
166
+ const FRONTMATTER_BLOCK_RE = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
161
167
  async function loadSitemapWithoutRetries(sitemapUrl) {
162
168
  const controller = new AbortController();
163
169
  const timeoutId = setTimeout(() => controller.abort(), 1e4);
@@ -170,11 +176,11 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
170
176
  if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
171
177
  const xmlContent = await response.text();
172
178
  if (xmlContent.includes("<sitemapindex")) {
173
- const sitemapIndexRegex = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
179
+ SITEMAP_INDEX_LOC_RE.lastIndex = 0;
174
180
  const childSitemaps = [];
175
181
  let match;
176
182
  while (true) {
177
- match = sitemapIndexRegex.exec(xmlContent);
183
+ match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
178
184
  if (match === null) break;
179
185
  let url = match[1];
180
186
  if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
@@ -190,10 +196,10 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
190
196
  return allUrls;
191
197
  } else {
192
198
  const urls = [];
193
- const urlRegex = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
199
+ SITEMAP_URL_LOC_RE.lastIndex = 0;
194
200
  let match;
195
201
  while (true) {
196
- match = urlRegex.exec(xmlContent);
202
+ match = SITEMAP_URL_LOC_RE.exec(xmlContent);
197
203
  if (match === null) break;
198
204
  let url = match[1];
199
205
  if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
@@ -247,24 +253,24 @@ async function crawlAndGenerate(options, onProgress) {
247
253
  headers: { "User-Agent": "mdream-crawler/1.0" }
248
254
  });
249
255
  clearTimeout(robotsTimeoutId);
250
- } catch (error) {
256
+ } catch {
251
257
  clearTimeout(robotsTimeoutId);
252
258
  robotsResponse = null;
253
259
  }
254
260
  if (robotsResponse?.ok) {
255
- const sitemapMatches = (await robotsResponse.text()).match(/Sitemap:\s*(.*)/gi);
261
+ const sitemapMatches = (await robotsResponse.text()).match(ROBOTS_SITEMAP_RE);
256
262
  if (sitemapMatches && sitemapMatches.length > 0) {
257
263
  progress.sitemap.found = sitemapMatches.length;
258
264
  progress.sitemap.status = "processing";
259
265
  onProgress?.(progress);
260
- const robotsSitemaps = sitemapMatches.map((match) => match.replace(/Sitemap:\s*/i, "").trim());
266
+ const robotsSitemaps = sitemapMatches.map((match) => match.replace(ROBOTS_SITEMAP_PREFIX_RE, "").trim());
261
267
  for (const sitemapUrl of robotsSitemaps) try {
262
268
  const robotsUrls = await loadSitemapWithoutRetries(sitemapUrl);
263
269
  sitemapAttempts.push({
264
270
  url: sitemapUrl,
265
271
  success: true
266
272
  });
267
- if (patterns.some((p$1) => p$1.isGlob)) {
273
+ if (patterns.some((p) => p.isGlob)) {
268
274
  const filteredUrls = robotsUrls.filter((url) => {
269
275
  return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
270
276
  });
@@ -300,7 +306,7 @@ async function crawlAndGenerate(options, onProgress) {
300
306
  url: mainSitemapUrl,
301
307
  success: true
302
308
  });
303
- if (patterns.some((p$1) => p$1.isGlob)) {
309
+ if (patterns.some((p) => p.isGlob)) {
304
310
  const filteredUrls = sitemapUrls.filter((url) => {
305
311
  return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
306
312
  });
@@ -339,7 +345,7 @@ async function crawlAndGenerate(options, onProgress) {
339
345
  url: sitemapUrl,
340
346
  success: true
341
347
  });
342
- if (patterns.some((p$1) => p$1.isGlob)) {
348
+ if (patterns.some((p) => p.isGlob)) {
343
349
  const filteredUrls = altUrls.filter((url) => {
344
350
  return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
345
351
  });
@@ -360,11 +366,11 @@ async function crawlAndGenerate(options, onProgress) {
360
366
  break;
361
367
  }
362
368
  }
363
- } catch (error$1) {
369
+ } catch (error) {
364
370
  sitemapAttempts.push({
365
371
  url: sitemapUrl,
366
372
  success: false,
367
- error: error$1 instanceof Error ? error$1.message : "Unknown error"
373
+ error: error instanceof Error ? error.message : "Unknown error"
368
374
  });
369
375
  }
370
376
  }
@@ -396,7 +402,7 @@ async function crawlAndGenerate(options, onProgress) {
396
402
  const processedUrls = /* @__PURE__ */ new Set();
397
403
  const shouldCrawlUrl = (url) => {
398
404
  if (isUrlExcluded(url, exclude)) return false;
399
- if (!patterns.some((p$1) => p$1.isGlob)) return true;
405
+ if (!patterns.some((p) => p.isGlob)) return true;
400
406
  return patterns.some((pattern) => matchesGlobPattern(url, pattern));
401
407
  };
402
408
  const createRequestHandler = (crawlerType) => {
@@ -428,17 +434,17 @@ async function crawlAndGenerate(options, onProgress) {
428
434
  origin: pageOrigin
429
435
  });
430
436
  let md = "";
431
- if (shouldProcessMarkdown && (!onPage || generateIndividualMd)) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
437
+ if (shouldProcessMarkdown) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
432
438
  let filePath;
433
439
  if (shouldProcessMarkdown && generateIndividualMd) {
434
440
  const urlObj = new URL(request.loadedUrl);
435
- const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(/\/$/, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(/[^\w\-]/g, "-"));
441
+ const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"));
436
442
  filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
437
443
  const fileDir = dirname(filePath);
438
444
  if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
439
445
  await writeFile(filePath, md, "utf-8");
440
446
  }
441
- const isHomePage = request.loadedUrl.replace(/\/$/, "") === homePageUrl.replace(/\/$/, "");
447
+ const isHomePage = request.loadedUrl.replace(URL_TRAILING_SLASH_RE, "") === homePageUrl.replace(URL_TRAILING_SLASH_RE, "");
442
448
  if (shouldProcessMarkdown || isHomePage) {
443
449
  const result = {
444
450
  url: request.loadedUrl,
@@ -543,10 +549,10 @@ async function crawlAndGenerate(options, onProgress) {
543
549
  onProgress?.(progress);
544
550
  const successfulResults = results.filter((r) => r.success);
545
551
  const firstUrl = new URL(withHttps(urls[0]));
546
- const origin$1 = firstUrl.origin;
552
+ const origin = firstUrl.origin;
547
553
  const homePageResult = successfulResults.find((r) => {
548
554
  const resultUrl = new URL(withHttps(r.url));
549
- return resultUrl.href === origin$1 || resultUrl.href === `${origin$1}/`;
555
+ return resultUrl.href === origin || resultUrl.href === `${origin}/`;
550
556
  });
551
557
  const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
552
558
  const description = descriptionOverride || homePageResult?.metadata?.description || successfulResults[0]?.metadata?.description;
@@ -555,7 +561,7 @@ async function crawlAndGenerate(options, onProgress) {
555
561
  onProgress?.(progress);
556
562
  const contentResults = successfulResults.filter((result) => {
557
563
  if (!result.content) return false;
558
- return result.content.trim().replace(/^---\s*\n(?:.*\n)*?---\s*/, "").trim().length > 10;
564
+ return result.content.trim().replace(FRONTMATTER_BLOCK_RE, "").trim().length > 10;
559
565
  });
560
566
  const seenUrls = /* @__PURE__ */ new Set();
561
567
  const llmsResult = await generateLlmsTxtArtifacts({
@@ -572,7 +578,7 @@ async function crawlAndGenerate(options, onProgress) {
572
578
  })),
573
579
  siteName,
574
580
  description,
575
- origin: origin$1 || firstUrl.origin,
581
+ origin: origin || firstUrl.origin,
576
582
  generateFull: generateLlmsFullTxt,
577
583
  outputDir
578
584
  });
@@ -593,6 +599,5 @@ async function crawlAndGenerate(options, onProgress) {
593
599
  await purgeDefaultStorages();
594
600
  return results;
595
601
  }
596
-
597
602
  //#endregion
598
- export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };
603
+ export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };
package/dist/cli.mjs CHANGED
@@ -6,7 +6,6 @@ import { dirname, join, resolve } from "pathe";
6
6
  import { withHttps } from "ufo";
7
7
  import { fileURLToPath } from "node:url";
8
8
  import { addDependency } from "nypm";
9
-
10
9
  //#region src/playwright-utils.ts
11
10
  async function checkPlaywrightInstallation() {
12
11
  try {
@@ -61,7 +60,6 @@ async function isUseChromeSupported() {
61
60
  } catch {}
62
61
  return false;
63
62
  }
64
-
65
63
  //#endregion
66
64
  //#region src/cli.ts
67
65
  const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
@@ -106,8 +104,8 @@ async function interactiveCrawl() {
106
104
  placeholder: "e.g. docs.example.com, site.com/docs/**",
107
105
  validate: (value) => {
108
106
  if (!value) return "Please enter at least one URL";
109
- const urls$1 = value.split(",").map((url) => url.trim());
110
- for (const url of urls$1) {
107
+ const urls = value.split(",").map((url) => url.trim());
108
+ for (const url of urls) {
111
109
  const globError = validateGlobPattern(url);
112
110
  if (globError) return globError;
113
111
  try {
@@ -210,7 +208,7 @@ async function interactiveCrawl() {
210
208
  inferredOrigin && `Origin: ${inferredOrigin}`
211
209
  ].filter(Boolean);
212
210
  p.note(summary.join("\n"), "Crawl Configuration");
213
- if (advancedOptions.skipSitemap && globPatterns.some((p$1) => p$1.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
211
+ if (advancedOptions.skipSitemap && globPatterns.some((p) => p.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
214
212
  return {
215
213
  urls,
216
214
  outputDir: resolve(outputDir),
@@ -493,6 +491,5 @@ main().catch((error) => {
493
491
  p.log.error(`Unexpected error: ${error}`);
494
492
  process.exit(1);
495
493
  });
496
-
497
494
  //#endregion
498
- export { };
495
+ export {};
package/dist/index.mjs CHANGED
@@ -1,8 +1,8 @@
1
1
  import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
2
2
  import { writeFile } from "node:fs/promises";
3
3
  import { basename, sep } from "pathe";
4
-
5
4
  //#region src/llms-txt.ts
5
+ const ANCHOR_UNSAFE_CHARS_RE = /[^a-z0-9]/g;
6
6
  async function generateLlmsTxt(options) {
7
7
  const { siteName, description, results, outputPath } = options;
8
8
  let content = `# ${siteName}\n\n`;
@@ -22,8 +22,8 @@ async function generateLlmsTxt(options) {
22
22
  const linkPath = (mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath)).split(sep).join("/");
23
23
  content += `- [${title}](md/${linkPath}): ${result.url}\n`;
24
24
  } else {
25
- const description$1 = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
26
- content += `- [${title}](${result.url})${description$1 ? `: ${description$1}` : ""}\n`;
25
+ const description = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
26
+ content += `- [${title}](${result.url})${description ? `: ${description}` : ""}\n`;
27
27
  }
28
28
  }
29
29
  }
@@ -42,7 +42,7 @@ async function generateLlmsFullTxt(options) {
42
42
  } catch {
43
43
  title = result.title || result.url;
44
44
  }
45
- const anchor = title.toLowerCase().replace(/[^a-z0-9]/g, "-");
45
+ const anchor = title.toLowerCase().replace(ANCHOR_UNSAFE_CHARS_RE, "-");
46
46
  content += `- [${title}](#${anchor})\n`;
47
47
  }
48
48
  content += `\n---\n\n`;
@@ -60,6 +60,5 @@ async function generateLlmsFullTxt(options) {
60
60
  }
61
61
  await writeFile(outputPath, content, "utf-8");
62
62
  }
63
-
64
63
  //#endregion
65
- export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
64
+ export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@mdream/crawl",
3
3
  "type": "module",
4
- "version": "0.15.3",
4
+ "version": "0.17.0",
5
5
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -50,13 +50,13 @@
50
50
  }
51
51
  },
52
52
  "dependencies": {
53
- "@clack/prompts": "^0.11.0",
54
- "crawlee": "^3.15.3",
55
- "nypm": "^0.6.2",
53
+ "@clack/prompts": "^1.1.0",
54
+ "crawlee": "^3.16.0",
55
+ "nypm": "^0.6.5",
56
56
  "pathe": "^2.0.3",
57
57
  "picomatch": "^4.0.3",
58
- "ufo": "^1.6.1",
59
- "mdream": "0.15.3"
58
+ "ufo": "^1.6.3",
59
+ "mdream": "0.17.0"
60
60
  },
61
61
  "devDependencies": {
62
62
  "@types/picomatch": "^4.0.2"