@mdream/crawl 0.13.2 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,22 +16,19 @@ import { extractionPlugin } from "mdream/plugins";
16
16
  * Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
17
17
  */
18
18
  function parseUrlPattern(input) {
19
- const hasGlob = input.includes("*") || input.includes("?") || input.includes("[");
20
- if (!hasGlob) return {
19
+ if (!(input.includes("*") || input.includes("?") || input.includes("["))) return {
21
20
  baseUrl: input,
22
21
  pattern: "",
23
22
  isGlob: false
24
23
  };
25
24
  try {
26
- const urlWithProtocol = input.startsWith("http") ? input : `https://${input}`;
27
- const urlWithoutGlob = urlWithProtocol.replace(/\*.*$/, "");
25
+ const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(/\*.*$/, "");
28
26
  const url = new URL(urlWithoutGlob);
29
27
  const baseUrl = `${url.protocol}//${url.host}`;
30
28
  const patternStart = input.indexOf(url.host) + url.host.length;
31
- const pattern = input.substring(patternStart);
32
29
  return {
33
30
  baseUrl,
34
- pattern,
31
+ pattern: input.substring(patternStart),
35
32
  isGlob: true
36
33
  };
37
34
  } catch {
@@ -46,8 +43,7 @@ function matchesGlobPattern(url, parsedPattern) {
46
43
  try {
47
44
  const urlObj = new URL(url);
48
45
  const urlPath = urlObj.pathname + urlObj.search + urlObj.hash;
49
- const urlBase = `${urlObj.protocol}//${urlObj.host}`;
50
- if (urlBase !== parsedPattern.baseUrl) return false;
46
+ if (`${urlObj.protocol}//${urlObj.host}` !== parsedPattern.baseUrl) return false;
51
47
  let pattern = parsedPattern.pattern;
52
48
  if (pattern.endsWith("*") && !pattern.endsWith("**") && !pattern.endsWith("/*")) {
53
49
  const base = pattern.slice(0, -1);
@@ -86,10 +82,7 @@ function isUrlExcluded(url, excludePatterns) {
86
82
  if (parsedPattern.isGlob) return matchesGlobPattern(url, parsedPattern);
87
83
  return url === pattern;
88
84
  }
89
- if (pattern.startsWith("/")) {
90
- const adjustedPattern = pattern.endsWith("/*") ? pattern.replace("/*", "/**") : pattern;
91
- return picomatch(adjustedPattern)(urlPath);
92
- }
85
+ if (pattern.startsWith("/")) return picomatch(pattern.endsWith("/*") ? pattern.replace("/*", "/**") : pattern)(urlPath);
93
86
  return picomatch(pattern)(urlPath) || picomatch(pattern)(urlPath.substring(1));
94
87
  });
95
88
  } catch {
@@ -102,7 +95,7 @@ function isUrlExcluded(url, excludePatterns) {
102
95
  function validateGlobPattern(pattern) {
103
96
  try {
104
97
  parseUrlPattern(pattern);
105
- return void 0;
98
+ return;
106
99
  } catch (error) {
107
100
  return `Invalid glob pattern: ${error instanceof Error ? error.message : error}`;
108
101
  }
@@ -116,35 +109,34 @@ function extractMetadata(html, url) {
116
109
  let description = "";
117
110
  let keywords = "";
118
111
  let author = "";
119
- const extractionPluginInstance = extractionPlugin({
120
- "a[href]": (element) => {
121
- const href = element.attributes?.href;
122
- if (href) try {
123
- const absoluteUrl = new URL(href, url).href;
124
- if (!links.includes(absoluteUrl)) links.push(absoluteUrl);
125
- } catch {}
126
- },
127
- "title": (element) => {
128
- if (!title && element.textContent) title = element.textContent.trim();
129
- },
130
- "meta[name=\"description\"]": (element) => {
131
- if (!description && element.attributes?.content) description = element.attributes.content.trim();
132
- },
133
- "meta[property=\"og:description\"]": (element) => {
134
- if (!description && element.attributes?.content) description = element.attributes.content.trim();
135
- },
136
- "meta[name=\"keywords\"]": (element) => {
137
- if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
138
- },
139
- "meta[name=\"author\"]": (element) => {
140
- if (!author && element.attributes?.content) author = element.attributes.content.trim();
141
- },
142
- "meta[property=\"og:title\"]": (element) => {
143
- if (!title && element.attributes?.content) title = element.attributes.content.trim();
144
- }
145
- });
146
112
  htmlToMarkdown(html, {
147
- plugins: [extractionPluginInstance],
113
+ plugins: [extractionPlugin({
114
+ "a[href]": (element) => {
115
+ const href = element.attributes?.href;
116
+ if (href) try {
117
+ const absoluteUrl = new URL(href, url).href;
118
+ if (!links.includes(absoluteUrl)) links.push(absoluteUrl);
119
+ } catch {}
120
+ },
121
+ "title": (element) => {
122
+ if (!title && element.textContent) title = element.textContent.trim();
123
+ },
124
+ "meta[name=\"description\"]": (element) => {
125
+ if (!description && element.attributes?.content) description = element.attributes.content.trim();
126
+ },
127
+ "meta[property=\"og:description\"]": (element) => {
128
+ if (!description && element.attributes?.content) description = element.attributes.content.trim();
129
+ },
130
+ "meta[name=\"keywords\"]": (element) => {
131
+ if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
132
+ },
133
+ "meta[name=\"author\"]": (element) => {
134
+ if (!author && element.attributes?.content) author = element.attributes.content.trim();
135
+ },
136
+ "meta[property=\"og:title\"]": (element) => {
137
+ if (!title && element.attributes?.content) title = element.attributes.content.trim();
138
+ }
139
+ })],
148
140
  origin: new URL(url).origin
149
141
  });
150
142
  return {
@@ -260,8 +252,7 @@ async function crawlAndGenerate(options, onProgress) {
260
252
  robotsResponse = null;
261
253
  }
262
254
  if (robotsResponse?.ok) {
263
- const robotsContent = await robotsResponse.text();
264
- const sitemapMatches = robotsContent.match(/Sitemap:\s*(.*)/gi);
255
+ const sitemapMatches = (await robotsResponse.text()).match(/Sitemap:\s*(.*)/gi);
265
256
  if (sitemapMatches && sitemapMatches.length > 0) {
266
257
  progress.sitemap.found = sitemapMatches.length;
267
258
  progress.sitemap.status = "processing";
@@ -273,8 +264,7 @@ async function crawlAndGenerate(options, onProgress) {
273
264
  url: sitemapUrl,
274
265
  success: true
275
266
  });
276
- const hasGlobPatterns = patterns.some((p$1) => p$1.isGlob);
277
- if (hasGlobPatterns) {
267
+ if (patterns.some((p$1) => p$1.isGlob)) {
278
268
  const filteredUrls = robotsUrls.filter((url) => {
279
269
  return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
280
270
  });
@@ -310,8 +300,7 @@ async function crawlAndGenerate(options, onProgress) {
310
300
  url: mainSitemapUrl,
311
301
  success: true
312
302
  });
313
- const hasGlobPatterns = patterns.some((p$1) => p$1.isGlob);
314
- if (hasGlobPatterns) {
303
+ if (patterns.some((p$1) => p$1.isGlob)) {
315
304
  const filteredUrls = sitemapUrls.filter((url) => {
316
305
  return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
317
306
  });
@@ -350,8 +339,7 @@ async function crawlAndGenerate(options, onProgress) {
350
339
  url: sitemapUrl,
351
340
  success: true
352
341
  });
353
- const hasGlobPatterns = patterns.some((p$1) => p$1.isGlob);
354
- if (hasGlobPatterns) {
342
+ if (patterns.some((p$1) => p$1.isGlob)) {
355
343
  const filteredUrls = altUrls.filter((url) => {
356
344
  return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
357
345
  });
@@ -405,7 +393,7 @@ async function crawlAndGenerate(options, onProgress) {
405
393
  }
406
394
  if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });
407
395
  const results = [];
408
- const processedUrls = new Set();
396
+ const processedUrls = /* @__PURE__ */ new Set();
409
397
  const shouldCrawlUrl = (url) => {
410
398
  if (isUrlExcluded(url, exclude)) return false;
411
399
  if (!patterns.some((p$1) => p$1.isGlob)) return true;
@@ -432,36 +420,25 @@ async function crawlAndGenerate(options, onProgress) {
432
420
  if (!title) title = metadata.title;
433
421
  const shouldProcessMarkdown = shouldCrawlUrl(request.loadedUrl);
434
422
  const pageOrigin = origin || new URL(request.loadedUrl).origin;
435
- if (onPage && shouldProcessMarkdown) {
436
- const pageData = {
437
- url: request.loadedUrl,
438
- html,
439
- title,
440
- metadata,
441
- origin: pageOrigin
442
- };
443
- await onPage(pageData);
444
- }
423
+ if (onPage && shouldProcessMarkdown) await onPage({
424
+ url: request.loadedUrl,
425
+ html,
426
+ title,
427
+ metadata,
428
+ origin: pageOrigin
429
+ });
445
430
  let md = "";
446
431
  if (shouldProcessMarkdown && (!onPage || generateIndividualMd)) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
447
432
  let filePath;
448
- if (shouldProcessMarkdown) {
433
+ if (shouldProcessMarkdown && generateIndividualMd) {
449
434
  const urlObj = new URL(request.loadedUrl);
450
- const urlPath = urlObj.pathname === "/" ? "/index" : urlObj.pathname;
451
- const pathSegments = urlPath.replace(/\/$/, "").split("/").filter((seg) => seg.length > 0);
452
- const safeSegments = pathSegments.map((seg) => seg.replace(/[^\w\-]/g, "-"));
453
- const filename = safeSegments.length > 0 ? safeSegments.join("/") : "index";
454
- const safeFilename = normalize(`${filename}.md`);
455
- filePath = join(outputDir, safeFilename);
456
- if (generateIndividualMd) {
457
- const fileDir = dirname(filePath);
458
- if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
459
- await writeFile(filePath, md, "utf-8");
460
- }
435
+ const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(/\/$/, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(/[^\w\-]/g, "-"));
436
+ filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
437
+ const fileDir = dirname(filePath);
438
+ if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
439
+ await writeFile(filePath, md, "utf-8");
461
440
  }
462
- const normalizedUrl = request.loadedUrl.replace(/\/$/, "");
463
- const normalizedHomePageUrl = homePageUrl.replace(/\/$/, "");
464
- const isHomePage = normalizedUrl === normalizedHomePageUrl;
441
+ const isHomePage = request.loadedUrl.replace(/\/$/, "") === homePageUrl.replace(/\/$/, "");
465
442
  if (shouldProcessMarkdown || isHomePage) {
466
443
  const result = {
467
444
  url: request.loadedUrl,
@@ -578,25 +555,21 @@ async function crawlAndGenerate(options, onProgress) {
578
555
  onProgress?.(progress);
579
556
  const contentResults = successfulResults.filter((result) => {
580
557
  if (!result.content) return false;
581
- const trimmedContent = result.content.trim();
582
- const contentWithoutFrontmatter = trimmedContent.replace(/^---\s*\n(?:.*\n)*?---\s*/, "").trim();
583
- return contentWithoutFrontmatter.length > 10;
584
- });
585
- const seenUrls = new Set();
586
- const deduplicatedResults = contentResults.filter((result) => {
587
- if (seenUrls.has(result.url)) return false;
588
- seenUrls.add(result.url);
589
- return true;
558
+ return result.content.trim().replace(/^---\s*\n(?:.*\n)*?---\s*/, "").trim().length > 10;
590
559
  });
591
- const processedFiles = deduplicatedResults.map((result) => ({
592
- filePath: result.filePath,
593
- title: result.title,
594
- content: result.content,
595
- url: result.url,
596
- metadata: result.metadata
597
- }));
560
+ const seenUrls = /* @__PURE__ */ new Set();
598
561
  const llmsResult = await generateLlmsTxtArtifacts({
599
- files: processedFiles,
562
+ files: contentResults.filter((result) => {
563
+ if (seenUrls.has(result.url)) return false;
564
+ seenUrls.add(result.url);
565
+ return true;
566
+ }).map((result) => ({
567
+ filePath: result.filePath,
568
+ title: result.title,
569
+ content: result.content,
570
+ url: result.url,
571
+ metadata: result.metadata
572
+ })),
600
573
  siteName,
601
574
  description,
602
575
  origin: origin$1 || firstUrl.origin,
@@ -622,4 +595,4 @@ async function crawlAndGenerate(options, onProgress) {
622
595
  }
623
596
 
624
597
  //#endregion
625
- export { crawlAndGenerate, parseUrlPattern, validateGlobPattern };
598
+ export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };
package/dist/cli.mjs CHANGED
@@ -1,6 +1,5 @@
1
- import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-DEZX9kH_.mjs";
1
+ import { n as parseUrlPattern, r as validateGlobPattern, t as crawlAndGenerate } from "./_chunks/crawl-BInMcRnS.mjs";
2
2
  import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
3
- import * as p$1 from "@clack/prompts";
4
3
  import * as p from "@clack/prompts";
5
4
  import { PlaywrightCrawler } from "crawlee";
6
5
  import { dirname, join, resolve } from "pathe";
@@ -18,12 +17,12 @@ async function checkPlaywrightInstallation() {
18
17
  }
19
18
  }
20
19
  async function promptPlaywrightInstall() {
21
- const shouldInstall = await p$1.confirm({
20
+ const shouldInstall = await p.confirm({
22
21
  message: "Playwright is required for the Playwright driver. Install it now?",
23
22
  initialValue: true
24
23
  });
25
- if (p$1.isCancel(shouldInstall) || !shouldInstall) return false;
26
- const s = p$1.spinner();
24
+ if (p.isCancel(shouldInstall) || !shouldInstall) return false;
25
+ const s = p.spinner();
27
26
  s.start("Installing Playwright globally...");
28
27
  try {
29
28
  await addDependency("playwright", { global: true });
@@ -31,17 +30,15 @@ async function promptPlaywrightInstall() {
31
30
  return true;
32
31
  } catch (fallbackError) {
33
32
  s.stop("Failed to install Playwright");
34
- p$1.log.error(`Installation failed: ${fallbackError}`);
33
+ p.log.error(`Installation failed: ${fallbackError}`);
35
34
  return false;
36
35
  }
37
36
  }
38
37
  async function ensurePlaywrightInstalled() {
39
- const isInstalled = await checkPlaywrightInstallation();
40
- if (isInstalled) return true;
41
- p$1.log.warn("Playwright driver selected but Playwright is not installed.");
42
- const installed = await promptPlaywrightInstall();
43
- if (!installed) {
44
- p$1.log.error("Cannot proceed with Playwright driver without Playwright installed.");
38
+ if (await checkPlaywrightInstallation()) return true;
39
+ p.log.warn("Playwright driver selected but Playwright is not installed.");
40
+ if (!await promptPlaywrightInstall()) {
41
+ p.log.error("Cannot proceed with Playwright driver without Playwright installed.");
45
42
  return false;
46
43
  }
47
44
  return true;
@@ -67,10 +64,8 @@ async function isUseChromeSupported() {
67
64
 
68
65
  //#endregion
69
66
  //#region src/cli.ts
70
- const __dirname = dirname(fileURLToPath(import.meta.url));
71
- const packageJsonPath = join(__dirname, "..", "package.json");
72
- const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
73
- const version = packageJson.version;
67
+ const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
68
+ const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
74
69
  function checkOutputDirectoryPermissions(outputDir) {
75
70
  try {
76
71
  mkdirSync(outputDir, { recursive: true });
@@ -116,8 +111,7 @@ async function interactiveCrawl() {
116
111
  const globError = validateGlobPattern(url);
117
112
  if (globError) return globError;
118
113
  try {
119
- const parsed = parseUrlPattern(url);
120
- if (!parsed.isGlob) try {
114
+ if (!parseUrlPattern(url).isGlob) try {
121
115
  new URL(withHttps(url));
122
116
  } catch {
123
117
  return `Invalid URL: ${withHttps(url)}`;
@@ -194,7 +188,7 @@ async function interactiveCrawl() {
194
188
  const url = new URL(withHttps(firstUrl));
195
189
  return `${url.protocol}//${url.host}`;
196
190
  } catch {
197
- return void 0;
191
+ return;
198
192
  }
199
193
  })();
200
194
  const outputFormats = advancedOptions.outputFormats.map((f) => {
@@ -216,7 +210,7 @@ async function interactiveCrawl() {
216
210
  inferredOrigin && `Origin: ${inferredOrigin}`
217
211
  ].filter(Boolean);
218
212
  p.note(summary.join("\n"), "Crawl Configuration");
219
- if (advancedOptions.skipSitemap && globPatterns.some((p$2) => p$2.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
213
+ if (advancedOptions.skipSitemap && globPatterns.some((p$1) => p$1.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
220
214
  return {
221
215
  urls,
222
216
  outputDir: resolve(outputDir),
@@ -381,7 +375,7 @@ Examples:
381
375
  const urlObj = new URL(withHttps(url));
382
376
  return `${urlObj.protocol}//${urlObj.host}`;
383
377
  } catch {
384
- return void 0;
378
+ return;
385
379
  }
386
380
  })();
387
381
  const siteNameOverride = getArgValue("--site-name");
@@ -439,19 +433,15 @@ async function main() {
439
433
  if (permCheck.error?.includes("Permission denied")) p.log.info("Tip: Try running with elevated privileges (e.g., sudo) or change the output directory permissions.");
440
434
  process.exit(1);
441
435
  }
442
- if (options.driver === "playwright") {
443
- const chromeSupported = await isUseChromeSupported();
444
- if (chromeSupported) {
445
- options.useChrome = true;
446
- p.log.info("System Chrome detected and enabled.");
447
- } else {
448
- const playwrightInstalled = await ensurePlaywrightInstalled();
449
- if (!playwrightInstalled) {
450
- p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
451
- process.exit(1);
452
- }
453
- p.log.info("Using global playwright instance.");
436
+ if (options.driver === "playwright") if (await isUseChromeSupported()) {
437
+ options.useChrome = true;
438
+ p.log.info("System Chrome detected and enabled.");
439
+ } else {
440
+ if (!await ensurePlaywrightInstalled()) {
441
+ p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
442
+ process.exit(1);
454
443
  }
444
+ p.log.info("Using global playwright instance.");
455
445
  }
456
446
  const s = p.spinner();
457
447
  s.start("Starting crawl...");
@@ -475,9 +465,7 @@ async function main() {
475
465
  }
476
466
  });
477
467
  s.stop();
478
- const endTime = Date.now();
479
- const durationMs = endTime - startTime;
480
- const durationSeconds = durationMs / 1e3;
468
+ const durationSeconds = (Date.now() - startTime) / 1e3;
481
469
  const successful = results.filter((r) => r.success).length;
482
470
  const failed = results.filter((r) => !r.success).length;
483
471
  const failedResults = results.filter((r) => !r.success);
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { crawlAndGenerate } from "./_chunks/crawl-DEZX9kH_.mjs";
1
+ import { t as crawlAndGenerate } from "./_chunks/crawl-BInMcRnS.mjs";
2
2
  import { writeFile } from "node:fs/promises";
3
3
  import { basename, sep } from "pathe";
4
4
 
@@ -19,8 +19,7 @@ async function generateLlmsTxt(options) {
19
19
  if (result.filePath) {
20
20
  const mdSeparator = `${sep}md${sep}`;
21
21
  const mdIndex = result.filePath.indexOf(mdSeparator);
22
- const relativePath = mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath);
23
- const linkPath = relativePath.split(sep).join("/");
22
+ const linkPath = (mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath)).split(sep).join("/");
24
23
  content += `- [${title}](md/${linkPath}): ${result.url}\n`;
25
24
  } else {
26
25
  const description$1 = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@mdream/crawl",
3
3
  "type": "module",
4
- "version": "0.13.2",
4
+ "version": "0.14.0",
5
5
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -51,12 +51,12 @@
51
51
  },
52
52
  "dependencies": {
53
53
  "@clack/prompts": "^0.11.0",
54
- "crawlee": "^3.15.1",
54
+ "crawlee": "^3.15.3",
55
55
  "nypm": "^0.6.2",
56
56
  "pathe": "^2.0.3",
57
57
  "picomatch": "^4.0.3",
58
58
  "ufo": "^1.6.1",
59
- "mdream": "0.13.2"
59
+ "mdream": "0.14.0"
60
60
  },
61
61
  "devDependencies": {
62
62
  "@types/picomatch": "^4.0.2"