@mdream/crawl 0.12.2 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -216,7 +216,7 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
|
|
|
216
216
|
}
|
|
217
217
|
}
|
|
218
218
|
async function crawlAndGenerate(options, onProgress) {
|
|
219
|
-
const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false } = options;
|
|
219
|
+
const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, onPage } = options;
|
|
220
220
|
const outputDir = resolve(normalize(rawOutputDir));
|
|
221
221
|
if (verbose) log.setLevel(log.LEVELS.INFO);
|
|
222
222
|
else log.setLevel(log.LEVELS.OFF);
|
|
@@ -431,8 +431,19 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
431
431
|
const metadata = extractMetadata(html, request.loadedUrl);
|
|
432
432
|
if (!title) title = metadata.title;
|
|
433
433
|
const shouldProcessMarkdown = shouldCrawlUrl(request.loadedUrl);
|
|
434
|
+
const pageOrigin = origin || new URL(request.loadedUrl).origin;
|
|
435
|
+
if (onPage && shouldProcessMarkdown) {
|
|
436
|
+
const pageData = {
|
|
437
|
+
url: request.loadedUrl,
|
|
438
|
+
html,
|
|
439
|
+
title,
|
|
440
|
+
metadata,
|
|
441
|
+
origin: pageOrigin
|
|
442
|
+
};
|
|
443
|
+
await onPage(pageData);
|
|
444
|
+
}
|
|
434
445
|
let md = "";
|
|
435
|
-
if (shouldProcessMarkdown) md = htmlToMarkdown(html, withMinimalPreset({ origin: origin || new URL(request.loadedUrl).origin }));
|
|
446
|
+
if (shouldProcessMarkdown && (!onPage || generateIndividualMd)) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
|
|
436
447
|
let filePath;
|
|
437
448
|
if (shouldProcessMarkdown) {
|
|
438
449
|
const urlObj = new URL(request.loadedUrl);
|
package/dist/cli.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-
|
|
1
|
+
import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-DEZX9kH_.mjs";
|
|
2
2
|
import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
|
|
3
3
|
import * as p$1 from "@clack/prompts";
|
|
4
4
|
import * as p from "@clack/prompts";
|
package/dist/index.d.mts
CHANGED
|
@@ -1,4 +1,11 @@
|
|
|
1
1
|
//#region src/types.d.ts
|
|
2
|
+
interface PageData {
|
|
3
|
+
url: string;
|
|
4
|
+
html: string;
|
|
5
|
+
title: string;
|
|
6
|
+
metadata: PageMetadata;
|
|
7
|
+
origin: string;
|
|
8
|
+
}
|
|
2
9
|
interface CrawlOptions {
|
|
3
10
|
urls: string[];
|
|
4
11
|
outputDir: string;
|
|
@@ -19,6 +26,7 @@ interface CrawlOptions {
|
|
|
19
26
|
descriptionOverride?: string;
|
|
20
27
|
verbose?: boolean;
|
|
21
28
|
skipSitemap?: boolean;
|
|
29
|
+
onPage?: (page: PageData) => Promise<void> | void;
|
|
22
30
|
}
|
|
23
31
|
interface ParsedUrlPattern {
|
|
24
32
|
baseUrl: string;
|
|
@@ -74,4 +82,4 @@ declare function crawlAndGenerate(options: CrawlOptions, onProgress?: (progress:
|
|
|
74
82
|
declare function generateLlmsTxt(options: LlmsTxtOptions): Promise<void>;
|
|
75
83
|
declare function generateLlmsFullTxt(options: LlmsTxtOptions): Promise<void>;
|
|
76
84
|
//#endregion
|
|
77
|
-
export { type CrawlOptions, type CrawlResult, type LlmsTxtOptions, crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
|
|
85
|
+
export { type CrawlOptions, type CrawlResult, type LlmsTxtOptions, type PageData, crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
|
package/dist/index.mjs
CHANGED
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.12.2",
|
|
4
|
+
"version": "0.13.0",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -56,7 +56,7 @@
|
|
|
56
56
|
"pathe": "^2.0.3",
|
|
57
57
|
"picomatch": "^4.0.3",
|
|
58
58
|
"ufo": "^1.6.1",
|
|
59
|
-
"mdream": "0.12.2"
|
|
59
|
+
"mdream": "0.13.0"
|
|
60
60
|
},
|
|
61
61
|
"devDependencies": {
|
|
62
62
|
"@types/picomatch": "^4.0.2"
|