@mdream/crawl 0.15.2 → 0.16.0

@@ -9,12 +9,6 @@ import { dirname, join, normalize, resolve } from "pathe";
  import { withHttps } from "ufo";
  import picomatch from "picomatch";
  import { extractionPlugin } from "mdream/plugins";
-
- //#region src/glob-utils.ts
- /**
- * Parse a URL that may contain glob patterns
- * Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
- */
  function parseUrlPattern(input) {
    if (!(input.includes("*") || input.includes("?") || input.includes("["))) return {
      baseUrl: input,
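Note: the 0.16.0 build output drops the `//#region` banners and JSDoc blocks that shipped in 0.15.2, so the comment deleted above was the only in-source statement of parseUrlPattern's contract. A minimal sketch of that documented behavior, reconstructed from the deleted JSDoc example and the visible glob-character check (illustrative only, not the package's implementation; the non-glob return shape is partly assumed):

// Sketch: split a glob URL into base URL + path pattern, per the deleted JSDoc.
function parseUrlPatternSketch(input) {
  // same glob-character test as the bundled code above
  if (!(input.includes("*") || input.includes("?") || input.includes("["))) {
    return { baseUrl: input, pattern: "", isGlob: false }; // assumed non-glob shape
  }
  const m = input.match(/^(https?:\/\/)?([^/]+)(\/.*)?$/);
  if (!m) throw new Error(`Invalid URL pattern: "${input}"`);
  const [, protocol = "https://", host, path = "/"] = m;
  return { baseUrl: protocol + host, pattern: path, isGlob: true };
}

// parseUrlPatternSketch("https://nuxtseo.com/docs/**")
// -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }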
@@ -35,9 +29,6 @@ function parseUrlPattern(input) {
      throw new Error(`Invalid URL pattern: "${input}". Please provide a valid URL with glob patterns (e.g., "example.com/docs/*" or "https://example.com/api/**").`);
    }
  }
- /**
- * Check if a URL matches a glob pattern
- */
  function matchesGlobPattern(url, parsedPattern) {
    if (!parsedPattern.isGlob) return true;
    try {
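Most of matchesGlobPattern's body falls outside this hunk; only the non-glob early return and the opening try are visible. A hedged sketch of the kind of check it performs with the picomatch import above (assumptions: it matches against the URL's pathname, and malformed URLs fall through to the `return false` shown in the next hunk's catch):

import picomatch from "picomatch";

function matchesGlobSketch(url, parsedPattern) {
  if (!parsedPattern.isGlob) return true; // mirrors the visible early return
  try {
    // picomatch(pattern) compiles the glob once and returns a matcher function
    return picomatch(parsedPattern.pattern)(new URL(url).pathname);
  } catch {
    return false;
  }
}

// matchesGlobSketch("https://nuxtseo.com/docs/guide", { pattern: "/docs/**", isGlob: true }) // true
// matchesGlobSketch("https://nuxtseo.com/blog/post", { pattern: "/docs/**", isGlob: true })  // false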
@@ -54,10 +45,6 @@ function matchesGlobPattern(url, parsedPattern) {
      return false;
    }
  }
- /**
- * Get the starting URL for crawling from a glob pattern
- * For https://nuxtseo.com/docs/**, we want to start at https://nuxtseo.com
- */
  function getStartingUrl(parsedPattern) {
    if (!parsedPattern.isGlob) return withHttps(parsedPattern.baseUrl);
    const pattern = parsedPattern.pattern;
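Worth noting: the deleted JSDoc said crawling should start at the site root (`https://nuxtseo.com` for `/docs/**`), but the code visible in the next hunk keeps the path up to the last slash before the first glob character, so `/docs/**` actually starts at `/docs/`. A worked trace of those lines (the derivation of beforeGlob sits outside the hunk and is assumed here):

const pattern = "/docs/**";
const beforeGlob = pattern.slice(0, pattern.search(/[*?[]/)); // assumed: text before the first glob char -> "/docs/"
const lastSlash = beforeGlob.lastIndexOf("/");                // 5
const pathBeforeGlob = lastSlash >= 0 ? beforeGlob.substring(0, lastSlash + 1) : "/";
console.log(pathBeforeGlob);                                  // "/docs/"
// -> withHttps("nuxtseo.com" + "/docs/") gives "https://nuxtseo.com/docs/"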
@@ -68,9 +55,6 @@ function getStartingUrl(parsedPattern) {
    const pathBeforeGlob = lastSlash >= 0 ? beforeGlob.substring(0, lastSlash + 1) : "/";
    return withHttps(parsedPattern.baseUrl + pathBeforeGlob);
  }
- /**
- * Check if a URL should be excluded based on exclude patterns
- */
  function isUrlExcluded(url, excludePatterns) {
    if (!excludePatterns || excludePatterns.length === 0) return false;
    try {
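isUrlExcluded's matching logic is likewise outside the hunk; only the empty-list early return is visible. A sketch of a plausible picomatch-based exclude check in the same style (the actual pattern semantics are not confirmed by this diff):

import picomatch from "picomatch";

function isExcludedSketch(url, excludePatterns) {
  if (!excludePatterns || excludePatterns.length === 0) return false; // visible early return
  try {
    const pathname = new URL(url).pathname;
    return excludePatterns.some((glob) => picomatch(glob)(pathname));
  } catch {
    return false; // matches the `return false` visible in the next hunk's catch
  }
}

// isExcludedSketch("https://example.com/docs/private/a", ["/docs/private/**"]) // true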
@@ -89,9 +73,6 @@ function isUrlExcluded(url, excludePatterns) {
      return false;
    }
  }
- /**
- * Validate glob pattern syntax
- */
  function validateGlobPattern(pattern) {
    try {
      parseUrlPattern(pattern);
@@ -100,9 +81,6 @@ function validateGlobPattern(pattern) {
      return `Invalid glob pattern: ${error instanceof Error ? error.message : error}`;
    }
  }
-
- //#endregion
- //#region src/metadata-extractor.ts
  function extractMetadata(html, url) {
    const links = [];
    let title = "";
@@ -155,9 +133,6 @@ function extractMetadata(html, url) {
      })
    };
  }
-
- //#endregion
- //#region src/crawl.ts
  async function loadSitemapWithoutRetries(sitemapUrl) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), 1e4);
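The visible lines of loadSitemapWithoutRetries use the standard AbortController timeout idiom: schedule controller.abort() after 10 s (1e4 ms) and pass the signal to the request. A self-contained version of that pattern (a sketch; the real function's fetch call and options sit outside this hunk; fetch is global in Node 18+):

async function fetchWithTimeout(url, ms = 10_000) {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), ms);
  try {
    // fetch rejects with an AbortError once the controller fires
    return await fetch(url, { signal: controller.signal });
  } finally {
    clearTimeout(timeoutId); // always clear, as the diff's clearTimeout calls do
  }
}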
@@ -247,7 +222,7 @@ async function crawlAndGenerate(options, onProgress) {
      headers: { "User-Agent": "mdream-crawler/1.0" }
    });
    clearTimeout(robotsTimeoutId);
- } catch (error) {
+ } catch {
    clearTimeout(robotsTimeoutId);
    robotsResponse = null;
  }
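The only change in this hunk is dropping an unused catch binding. `catch { ... }` is the ES2019 optional catch binding; behavior is identical, it just stops binding an error the block never reads (robots.txt failures are swallowed and robotsResponse is nulled either way). In isolation (risky and cleanup are hypothetical stand-ins):

function risky() { throw new Error("boom"); } // hypothetical stand-in
function cleanup() {}                         // e.g. clearTimeout(...), as in the diff

try { risky(); } catch (error) { cleanup(); } // 0.15.2 style: binds an unused error
try { risky(); } catch { cleanup(); }         // 0.16.0 style: optional catch binding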
@@ -264,7 +239,7 @@ async function crawlAndGenerate(options, onProgress) {
      url: sitemapUrl,
      success: true
    });
-   if (patterns.some((p$1) => p$1.isGlob)) {
+   if (patterns.some((p) => p.isGlob)) {
      const filteredUrls = robotsUrls.filter((url) => {
        return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
      });
@@ -300,7 +275,7 @@ async function crawlAndGenerate(options, onProgress) {
      url: mainSitemapUrl,
      success: true
    });
-   if (patterns.some((p$1) => p$1.isGlob)) {
+   if (patterns.some((p) => p.isGlob)) {
      const filteredUrls = sitemapUrls.filter((url) => {
        return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
      });
@@ -339,7 +314,7 @@ async function crawlAndGenerate(options, onProgress) {
      url: sitemapUrl,
      success: true
    });
-   if (patterns.some((p$1) => p$1.isGlob)) {
+   if (patterns.some((p) => p.isGlob)) {
      const filteredUrls = altUrls.filter((url) => {
        return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
      });
@@ -360,11 +335,11 @@ async function crawlAndGenerate(options, onProgress) {
        break;
      }
    }
- } catch (error$1) {
+ } catch (error) {
    sitemapAttempts.push({
      url: sitemapUrl,
      success: false,
-     error: error$1 instanceof Error ? error$1.message : "Unknown error"
+     error: error instanceof Error ? error.message : "Unknown error"
    });
  }
  }
@@ -396,7 +371,7 @@ async function crawlAndGenerate(options, onProgress) {
  const processedUrls = /* @__PURE__ */ new Set();
  const shouldCrawlUrl = (url) => {
    if (isUrlExcluded(url, exclude)) return false;
-   if (!patterns.some((p$1) => p$1.isGlob)) return true;
+   if (!patterns.some((p) => p.isGlob)) return true;
    return patterns.some((pattern) => matchesGlobPattern(url, pattern));
  };
  const createRequestHandler = (crawlerType) => {
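Beyond the rename, this hunk shows the full shouldCrawlUrl predicate, which orders its rules as: exclusions always win, then everything passes if no pattern is a glob, otherwise at least one glob must match. A self-contained, runnable restatement of that ordering (illustrative values; the matcher is a simplified stand-in for the chunk's internal helpers):

import picomatch from "picomatch";

const patterns = [{ pattern: "/docs/**", isGlob: true }];
const exclude = ["/docs/private/**"];
const matches = (url, glob) => picomatch(glob)(new URL(url).pathname);

// Same rule order as shouldCrawlUrl above: exclusion first, then glob gating.
const shouldCrawl = (url) =>
  !exclude.some((glob) => matches(url, glob))
  && (!patterns.some((p) => p.isGlob) || patterns.some((p) => matches(url, p.pattern)));

console.log(shouldCrawl("https://nuxtseo.com/docs/guide"));     // true
console.log(shouldCrawl("https://nuxtseo.com/docs/private/x")); // false (exclusion wins first)
console.log(shouldCrawl("https://nuxtseo.com/blog/post"));      // false (no glob matches)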
@@ -543,10 +518,10 @@ async function crawlAndGenerate(options, onProgress) {
  onProgress?.(progress);
  const successfulResults = results.filter((r) => r.success);
  const firstUrl = new URL(withHttps(urls[0]));
- const origin$1 = firstUrl.origin;
+ const origin = firstUrl.origin;
  const homePageResult = successfulResults.find((r) => {
    const resultUrl = new URL(withHttps(r.url));
-   return resultUrl.href === origin$1 || resultUrl.href === `${origin$1}/`;
+   return resultUrl.href === origin || resultUrl.href === `${origin}/`;
  });
  const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
  const description = descriptionOverride || homePageResult?.metadata?.description || successfulResults[0]?.metadata?.description;
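Context for the homepage check: WHATWG URL serializes a root URL with a trailing slash, so href never equals origin exactly, and comparing against both origin and `${origin}/` keeps the check robust. The origin$1 to origin rename itself is cosmetic bundler output. Demonstration of the URL normalization:

const u = new URL("https://nuxtseo.com");
console.log(u.origin);                  // "https://nuxtseo.com"
console.log(u.href);                    // "https://nuxtseo.com/" (note the trailing slash)
console.log(u.href === `${u.origin}/`); // true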
@@ -555,7 +530,7 @@ async function crawlAndGenerate(options, onProgress) {
  onProgress?.(progress);
  const contentResults = successfulResults.filter((result) => {
    if (!result.content) return false;
-   return result.content.trim().replace(/^---\s*\n(?:.*\n)*?---\s*/, "").trim().length > 10;
+   return result.content.trim().replace(/^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/, "").trim().length > 10;
  });
  const seenUrls = /* @__PURE__ */ new Set();
  const llmsResult = await generateLlmsTxtArtifacts({
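This is the one behavioral fix in the chunk: the filter keeps a page only if more than 10 characters remain after stripping leading YAML frontmatter, and the stripping regex was rewritten to anchor both `---` fences to whole lines. Among the observable differences: the old pattern's trailing `\s*` also swallowed blank lines and leading indentation after the closing fence, and its opening fence rejected any non-whitespace after `---`. A minimal repro of the first difference (which exact case motivated the change isn't stated in the diff):

const md = "---\ntitle: Guide\n---\n\n    indented first line\nBody.";
const oldRe = /^---\s*\n(?:.*\n)*?---\s*/;
const newRe = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
console.log(JSON.stringify(md.replace(oldRe, ""))); // "indented first line\nBody." (indentation eaten)
console.log(JSON.stringify(md.replace(newRe, ""))); // "\n    indented first line\nBody." (preserved)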
@@ -572,7 +547,7 @@ async function crawlAndGenerate(options, onProgress) {
    })),
    siteName,
    description,
-   origin: origin$1 || firstUrl.origin,
+   origin: origin || firstUrl.origin,
    generateFull: generateLlmsFullTxt,
    outputDir
  });
@@ -593,6 +568,4 @@ async function crawlAndGenerate(options, onProgress) {
    await purgeDefaultStorages();
    return results;
  }
-
- //#endregion
- export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };
+ export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };
package/dist/cli.mjs CHANGED
@@ -1,4 +1,4 @@
- import { n as parseUrlPattern, r as validateGlobPattern, t as crawlAndGenerate } from "./_chunks/crawl-BInMcRnS.mjs";
+ import { n as parseUrlPattern, r as validateGlobPattern, t as crawlAndGenerate } from "./_chunks/crawl.mjs";
  import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
  import * as p from "@clack/prompts";
  import { PlaywrightCrawler } from "crawlee";
@@ -6,8 +6,6 @@ import { dirname, join, resolve } from "pathe";
  import { withHttps } from "ufo";
  import { fileURLToPath } from "node:url";
  import { addDependency } from "nypm";
-
- //#region src/playwright-utils.ts
  async function checkPlaywrightInstallation() {
    try {
      await import("playwright");
@@ -61,9 +59,6 @@ async function isUseChromeSupported() {
    } catch {}
    return false;
  }
-
- //#endregion
- //#region src/cli.ts
  const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
  const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
  function checkOutputDirectoryPermissions(outputDir) {
@@ -106,8 +101,8 @@ async function interactiveCrawl() {
    placeholder: "e.g. docs.example.com, site.com/docs/**",
    validate: (value) => {
      if (!value) return "Please enter at least one URL";
-     const urls$1 = value.split(",").map((url) => url.trim());
-     for (const url of urls$1) {
+     const urls = value.split(",").map((url) => url.trim());
+     for (const url of urls) {
        const globError = validateGlobPattern(url);
        if (globError) return globError;
        try {
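Besides the urls$1 rename, this hunk shows the prompt's validation flow: split the comma-separated input, trim each entry, and run it through validateGlobPattern before any URL parsing. Condensed as a sketch (in @clack/prompts, a validate callback signals success by returning nothing):

function validateUrlInput(value) {
  if (!value) return "Please enter at least one URL";
  const urls = value.split(",").map((url) => url.trim());
  for (const url of urls) {
    const globError = validateGlobPattern(url); // from the crawl chunk
    if (globError) return globError;            // a string here marks the input invalid
  }
  // returning undefined marks the input valid
}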
@@ -210,7 +205,7 @@ async function interactiveCrawl() {
    inferredOrigin && `Origin: ${inferredOrigin}`
  ].filter(Boolean);
  p.note(summary.join("\n"), "Crawl Configuration");
- if (advancedOptions.skipSitemap && globPatterns.some((p$1) => p$1.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
+ if (advancedOptions.skipSitemap && globPatterns.some((p) => p.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
  return {
    urls,
    outputDir: resolve(outputDir),
@@ -493,5 +488,4 @@ main().catch((error) => {
    p.log.error(`Unexpected error: ${error}`);
    process.exit(1);
  });
-
- //#endregion
+ export {};
package/dist/index.mjs CHANGED
@@ -1,8 +1,6 @@
- import { t as crawlAndGenerate } from "./_chunks/crawl-BInMcRnS.mjs";
+ import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
  import { writeFile } from "node:fs/promises";
  import { basename, sep } from "pathe";
-
- //#region src/llms-txt.ts
  async function generateLlmsTxt(options) {
    const { siteName, description, results, outputPath } = options;
    let content = `# ${siteName}\n\n`;
@@ -22,8 +20,8 @@ async function generateLlmsTxt(options) {
      const linkPath = (mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath)).split(sep).join("/");
      content += `- [${title}](md/${linkPath}): ${result.url}\n`;
    } else {
-     const description$1 = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
-     content += `- [${title}](${result.url})${description$1 ? `: ${description$1}` : ""}\n`;
+     const description = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
+     content += `- [${title}](${result.url})${description ? `: ${description}` : ""}\n`;
    }
  }
  }
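The renamed description$1 local implements the llms.txt link blurbs: first line only, capped at 100 characters, with an ellipsis when truncated. Isolated for readability (same logic as the diff; note the faithful quirk that the ellipsis condition tests the full description's length, not the first line's):

function truncateDescription(description) {
  if (!description) return "";
  const firstLine = description.split("\n")[0];
  // quirk kept from the diff: "..." is appended based on the whole string's length
  return firstLine.substring(0, 100) + (description.length > 100 ? "..." : "");
}

console.log(truncateDescription("Short blurb."));                  // "Short blurb."
console.log(truncateDescription("x".repeat(120)).endsWith("...")); // true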
@@ -60,6 +58,4 @@ async function generateLlmsFullTxt(options) {
    }
    await writeFile(outputPath, content, "utf-8");
  }
-
- //#endregion
- export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
+ export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
    "name": "@mdream/crawl",
    "type": "module",
-   "version": "0.15.2",
+   "version": "0.16.0",
    "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
    "author": {
      "name": "Harlan Wilton",
@@ -50,13 +50,13 @@
    }
  },
  "dependencies": {
-   "@clack/prompts": "^0.11.0",
-   "crawlee": "^3.15.3",
-   "nypm": "^0.6.2",
+   "@clack/prompts": "^1.0.1",
+   "crawlee": "^3.16.0",
+   "nypm": "^0.6.5",
    "pathe": "^2.0.3",
    "picomatch": "^4.0.3",
-   "ufo": "^1.6.1",
-   "mdream": "0.15.2"
+   "ufo": "^1.6.3",
+   "mdream": "0.16.0"
  },
  "devDependencies": {
    "@types/picomatch": "^4.0.2"