@mdream/crawl 0.12.2 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -216,7 +216,7 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
216
216
  }
217
217
  }
218
218
  async function crawlAndGenerate(options, onProgress) {
219
- const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false } = options;
219
+ const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, onPage } = options;
220
220
  const outputDir = resolve(normalize(rawOutputDir));
221
221
  if (verbose) log.setLevel(log.LEVELS.INFO);
222
222
  else log.setLevel(log.LEVELS.OFF);
@@ -431,8 +431,19 @@ async function crawlAndGenerate(options, onProgress) {
431
431
  const metadata = extractMetadata(html, request.loadedUrl);
432
432
  if (!title) title = metadata.title;
433
433
  const shouldProcessMarkdown = shouldCrawlUrl(request.loadedUrl);
434
+ const pageOrigin = origin || new URL(request.loadedUrl).origin;
435
+ if (onPage && shouldProcessMarkdown) {
436
+ const pageData = {
437
+ url: request.loadedUrl,
438
+ html,
439
+ title,
440
+ metadata,
441
+ origin: pageOrigin
442
+ };
443
+ await onPage(pageData);
444
+ }
434
445
  let md = "";
435
- if (shouldProcessMarkdown) md = htmlToMarkdown(html, withMinimalPreset({ origin: origin || new URL(request.loadedUrl).origin }));
446
+ if (shouldProcessMarkdown && (!onPage || generateIndividualMd)) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
436
447
  let filePath;
437
448
  if (shouldProcessMarkdown) {
438
449
  const urlObj = new URL(request.loadedUrl);
package/dist/cli.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-bReaLQf6.mjs";
1
+ import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-DEZX9kH_.mjs";
2
2
  import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
3
3
  import * as p$1 from "@clack/prompts";
4
4
  import * as p from "@clack/prompts";
package/dist/index.d.mts CHANGED
@@ -1,4 +1,11 @@
1
1
  //#region src/types.d.ts
2
+ interface PageData {
3
+ url: string;
4
+ html: string;
5
+ title: string;
6
+ metadata: PageMetadata;
7
+ origin: string;
8
+ }
2
9
  interface CrawlOptions {
3
10
  urls: string[];
4
11
  outputDir: string;
@@ -19,6 +26,7 @@ interface CrawlOptions {
19
26
  descriptionOverride?: string;
20
27
  verbose?: boolean;
21
28
  skipSitemap?: boolean;
29
+ onPage?: (page: PageData) => Promise<void> | void;
22
30
  }
23
31
  interface ParsedUrlPattern {
24
32
  baseUrl: string;
@@ -74,4 +82,4 @@ declare function crawlAndGenerate(options: CrawlOptions, onProgress?: (progress:
74
82
  declare function generateLlmsTxt(options: LlmsTxtOptions): Promise<void>;
75
83
  declare function generateLlmsFullTxt(options: LlmsTxtOptions): Promise<void>;
76
84
  //#endregion
77
- export { type CrawlOptions, type CrawlResult, type LlmsTxtOptions, crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
85
+ export { type CrawlOptions, type CrawlResult, type LlmsTxtOptions, type PageData, crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { crawlAndGenerate } from "./_chunks/crawl-bReaLQf6.mjs";
1
+ import { crawlAndGenerate } from "./_chunks/crawl-DEZX9kH_.mjs";
2
2
  import { writeFile } from "node:fs/promises";
3
3
  import { basename, sep } from "pathe";
4
4
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@mdream/crawl",
3
3
  "type": "module",
4
- "version": "0.12.2",
4
+ "version": "0.13.0",
5
5
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -56,7 +56,7 @@
56
56
  "pathe": "^2.0.3",
57
57
  "picomatch": "^4.0.3",
58
58
  "ufo": "^1.6.1",
59
- "mdream": "0.12.2"
59
+ "mdream": "0.13.0"
60
60
  },
61
61
  "devDependencies": {
62
62
  "@types/picomatch": "^4.0.2"