@mdream/crawl 0.13.2 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/{crawl-DEZX9kH_.mjs → crawl-BInMcRnS.mjs} +66 -93
- package/dist/cli.mjs +24 -36
- package/dist/index.mjs +2 -3
- package/package.json +3 -3
|
@@ -16,22 +16,19 @@ import { extractionPlugin } from "mdream/plugins";
|
|
|
16
16
|
* Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
|
|
17
17
|
*/
|
|
18
18
|
function parseUrlPattern(input) {
|
|
19
|
-
const hasGlob = input.includes("*") || input.includes("?") || input.includes("[");
|
|
20
|
-
if (!hasGlob) return {
|
|
19
|
+
if (!(input.includes("*") || input.includes("?") || input.includes("["))) return {
|
|
21
20
|
baseUrl: input,
|
|
22
21
|
pattern: "",
|
|
23
22
|
isGlob: false
|
|
24
23
|
};
|
|
25
24
|
try {
|
|
26
|
-
const urlWithProtocol = input.startsWith("http") ? input : `https://${input}`;
|
|
27
|
-
const urlWithoutGlob = urlWithProtocol.replace(/\*.*$/, "");
|
|
25
|
+
const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(/\*.*$/, "");
|
|
28
26
|
const url = new URL(urlWithoutGlob);
|
|
29
27
|
const baseUrl = `${url.protocol}//${url.host}`;
|
|
30
28
|
const patternStart = input.indexOf(url.host) + url.host.length;
|
|
31
|
-
const pattern = input.substring(patternStart);
|
|
32
29
|
return {
|
|
33
30
|
baseUrl,
|
|
34
|
-
pattern,
|
|
31
|
+
pattern: input.substring(patternStart),
|
|
35
32
|
isGlob: true
|
|
36
33
|
};
|
|
37
34
|
} catch {
|
|
@@ -46,8 +43,7 @@ function matchesGlobPattern(url, parsedPattern) {
|
|
|
46
43
|
try {
|
|
47
44
|
const urlObj = new URL(url);
|
|
48
45
|
const urlPath = urlObj.pathname + urlObj.search + urlObj.hash;
|
|
49
|
-
const urlBase = `${urlObj.protocol}//${urlObj.host}`;
|
|
50
|
-
if (urlBase !== parsedPattern.baseUrl) return false;
|
|
46
|
+
if (`${urlObj.protocol}//${urlObj.host}` !== parsedPattern.baseUrl) return false;
|
|
51
47
|
let pattern = parsedPattern.pattern;
|
|
52
48
|
if (pattern.endsWith("*") && !pattern.endsWith("**") && !pattern.endsWith("/*")) {
|
|
53
49
|
const base = pattern.slice(0, -1);
|
|
@@ -86,10 +82,7 @@ function isUrlExcluded(url, excludePatterns) {
|
|
|
86
82
|
if (parsedPattern.isGlob) return matchesGlobPattern(url, parsedPattern);
|
|
87
83
|
return url === pattern;
|
|
88
84
|
}
|
|
89
|
-
if (pattern.startsWith("/")) {
|
|
90
|
-
const adjustedPattern = pattern.endsWith("/*") ? pattern.replace("/*", "/**") : pattern;
|
|
91
|
-
return picomatch(adjustedPattern)(urlPath);
|
|
92
|
-
}
|
|
85
|
+
if (pattern.startsWith("/")) return picomatch(pattern.endsWith("/*") ? pattern.replace("/*", "/**") : pattern)(urlPath);
|
|
93
86
|
return picomatch(pattern)(urlPath) || picomatch(pattern)(urlPath.substring(1));
|
|
94
87
|
});
|
|
95
88
|
} catch {
|
|
@@ -102,7 +95,7 @@ function isUrlExcluded(url, excludePatterns) {
|
|
|
102
95
|
function validateGlobPattern(pattern) {
|
|
103
96
|
try {
|
|
104
97
|
parseUrlPattern(pattern);
|
|
105
|
-
return
|
|
98
|
+
return;
|
|
106
99
|
} catch (error) {
|
|
107
100
|
return `Invalid glob pattern: ${error instanceof Error ? error.message : error}`;
|
|
108
101
|
}
|
|
@@ -116,35 +109,34 @@ function extractMetadata(html, url) {
|
|
|
116
109
|
let description = "";
|
|
117
110
|
let keywords = "";
|
|
118
111
|
let author = "";
|
|
119
|
-
const extractionPluginInstance = extractionPlugin({
|
|
120
|
-
"a[href]": (element) => {
|
|
121
|
-
const href = element.attributes?.href;
|
|
122
|
-
if (href) try {
|
|
123
|
-
const absoluteUrl = new URL(href, url).href;
|
|
124
|
-
if (!links.includes(absoluteUrl)) links.push(absoluteUrl);
|
|
125
|
-
} catch {}
|
|
126
|
-
},
|
|
127
|
-
"title": (element) => {
|
|
128
|
-
if (!title && element.textContent) title = element.textContent.trim();
|
|
129
|
-
},
|
|
130
|
-
"meta[name=\"description\"]": (element) => {
|
|
131
|
-
if (!description && element.attributes?.content) description = element.attributes.content.trim();
|
|
132
|
-
},
|
|
133
|
-
"meta[property=\"og:description\"]": (element) => {
|
|
134
|
-
if (!description && element.attributes?.content) description = element.attributes.content.trim();
|
|
135
|
-
},
|
|
136
|
-
"meta[name=\"keywords\"]": (element) => {
|
|
137
|
-
if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
|
|
138
|
-
},
|
|
139
|
-
"meta[name=\"author\"]": (element) => {
|
|
140
|
-
if (!author && element.attributes?.content) author = element.attributes.content.trim();
|
|
141
|
-
},
|
|
142
|
-
"meta[property=\"og:title\"]": (element) => {
|
|
143
|
-
if (!title && element.attributes?.content) title = element.attributes.content.trim();
|
|
144
|
-
}
|
|
145
|
-
});
|
|
146
112
|
htmlToMarkdown(html, {
|
|
147
|
-
plugins: [extractionPluginInstance],
|
|
113
|
+
plugins: [extractionPlugin({
|
|
114
|
+
"a[href]": (element) => {
|
|
115
|
+
const href = element.attributes?.href;
|
|
116
|
+
if (href) try {
|
|
117
|
+
const absoluteUrl = new URL(href, url).href;
|
|
118
|
+
if (!links.includes(absoluteUrl)) links.push(absoluteUrl);
|
|
119
|
+
} catch {}
|
|
120
|
+
},
|
|
121
|
+
"title": (element) => {
|
|
122
|
+
if (!title && element.textContent) title = element.textContent.trim();
|
|
123
|
+
},
|
|
124
|
+
"meta[name=\"description\"]": (element) => {
|
|
125
|
+
if (!description && element.attributes?.content) description = element.attributes.content.trim();
|
|
126
|
+
},
|
|
127
|
+
"meta[property=\"og:description\"]": (element) => {
|
|
128
|
+
if (!description && element.attributes?.content) description = element.attributes.content.trim();
|
|
129
|
+
},
|
|
130
|
+
"meta[name=\"keywords\"]": (element) => {
|
|
131
|
+
if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
|
|
132
|
+
},
|
|
133
|
+
"meta[name=\"author\"]": (element) => {
|
|
134
|
+
if (!author && element.attributes?.content) author = element.attributes.content.trim();
|
|
135
|
+
},
|
|
136
|
+
"meta[property=\"og:title\"]": (element) => {
|
|
137
|
+
if (!title && element.attributes?.content) title = element.attributes.content.trim();
|
|
138
|
+
}
|
|
139
|
+
})],
|
|
148
140
|
origin: new URL(url).origin
|
|
149
141
|
});
|
|
150
142
|
return {
|
|
@@ -260,8 +252,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
260
252
|
robotsResponse = null;
|
|
261
253
|
}
|
|
262
254
|
if (robotsResponse?.ok) {
|
|
263
|
-
const robotsContent = await robotsResponse.text();
|
|
264
|
-
const sitemapMatches = robotsContent.match(/Sitemap:\s*(.*)/gi);
|
|
255
|
+
const sitemapMatches = (await robotsResponse.text()).match(/Sitemap:\s*(.*)/gi);
|
|
265
256
|
if (sitemapMatches && sitemapMatches.length > 0) {
|
|
266
257
|
progress.sitemap.found = sitemapMatches.length;
|
|
267
258
|
progress.sitemap.status = "processing";
|
|
@@ -273,8 +264,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
273
264
|
url: sitemapUrl,
|
|
274
265
|
success: true
|
|
275
266
|
});
|
|
276
|
-
const hasGlobPatterns = patterns.some((p$1) => p$1.isGlob);
|
|
277
|
-
if (hasGlobPatterns) {
|
|
267
|
+
if (patterns.some((p$1) => p$1.isGlob)) {
|
|
278
268
|
const filteredUrls = robotsUrls.filter((url) => {
|
|
279
269
|
return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
280
270
|
});
|
|
@@ -310,8 +300,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
310
300
|
url: mainSitemapUrl,
|
|
311
301
|
success: true
|
|
312
302
|
});
|
|
313
|
-
const hasGlobPatterns = patterns.some((p$1) => p$1.isGlob);
|
|
314
|
-
if (hasGlobPatterns) {
|
|
303
|
+
if (patterns.some((p$1) => p$1.isGlob)) {
|
|
315
304
|
const filteredUrls = sitemapUrls.filter((url) => {
|
|
316
305
|
return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
317
306
|
});
|
|
@@ -350,8 +339,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
350
339
|
url: sitemapUrl,
|
|
351
340
|
success: true
|
|
352
341
|
});
|
|
353
|
-
const hasGlobPatterns = patterns.some((p$1) => p$1.isGlob);
|
|
354
|
-
if (hasGlobPatterns) {
|
|
342
|
+
if (patterns.some((p$1) => p$1.isGlob)) {
|
|
355
343
|
const filteredUrls = altUrls.filter((url) => {
|
|
356
344
|
return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
357
345
|
});
|
|
@@ -405,7 +393,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
405
393
|
}
|
|
406
394
|
if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });
|
|
407
395
|
const results = [];
|
|
408
|
-
const processedUrls = new Set();
|
|
396
|
+
const processedUrls = /* @__PURE__ */ new Set();
|
|
409
397
|
const shouldCrawlUrl = (url) => {
|
|
410
398
|
if (isUrlExcluded(url, exclude)) return false;
|
|
411
399
|
if (!patterns.some((p$1) => p$1.isGlob)) return true;
|
|
@@ -432,36 +420,25 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
432
420
|
if (!title) title = metadata.title;
|
|
433
421
|
const shouldProcessMarkdown = shouldCrawlUrl(request.loadedUrl);
|
|
434
422
|
const pageOrigin = origin || new URL(request.loadedUrl).origin;
|
|
435
|
-
if (onPage && shouldProcessMarkdown) {
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
};
|
|
443
|
-
await onPage(pageData);
|
|
444
|
-
}
|
|
423
|
+
if (onPage && shouldProcessMarkdown) await onPage({
|
|
424
|
+
url: request.loadedUrl,
|
|
425
|
+
html,
|
|
426
|
+
title,
|
|
427
|
+
metadata,
|
|
428
|
+
origin: pageOrigin
|
|
429
|
+
});
|
|
445
430
|
let md = "";
|
|
446
431
|
if (shouldProcessMarkdown && (!onPage || generateIndividualMd)) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
|
|
447
432
|
let filePath;
|
|
448
|
-
if (shouldProcessMarkdown) {
|
|
433
|
+
if (shouldProcessMarkdown && generateIndividualMd) {
|
|
449
434
|
const urlObj = new URL(request.loadedUrl);
|
|
450
|
-
const
|
|
451
|
-
|
|
452
|
-
const
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
filePath = join(outputDir, safeFilename);
|
|
456
|
-
if (generateIndividualMd) {
|
|
457
|
-
const fileDir = dirname(filePath);
|
|
458
|
-
if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
|
|
459
|
-
await writeFile(filePath, md, "utf-8");
|
|
460
|
-
}
|
|
435
|
+
const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(/\/$/, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(/[^\w\-]/g, "-"));
|
|
436
|
+
filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
|
|
437
|
+
const fileDir = dirname(filePath);
|
|
438
|
+
if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
|
|
439
|
+
await writeFile(filePath, md, "utf-8");
|
|
461
440
|
}
|
|
462
|
-
const normalizedUrl = request.loadedUrl.replace(/\/$/, "");
|
|
463
|
-
const normalizedHomePageUrl = homePageUrl.replace(/\/$/, "");
|
|
464
|
-
const isHomePage = normalizedUrl === normalizedHomePageUrl;
|
|
441
|
+
const isHomePage = request.loadedUrl.replace(/\/$/, "") === homePageUrl.replace(/\/$/, "");
|
|
465
442
|
if (shouldProcessMarkdown || isHomePage) {
|
|
466
443
|
const result = {
|
|
467
444
|
url: request.loadedUrl,
|
|
@@ -578,25 +555,21 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
578
555
|
onProgress?.(progress);
|
|
579
556
|
const contentResults = successfulResults.filter((result) => {
|
|
580
557
|
if (!result.content) return false;
|
|
581
|
-
const trimmedContent = result.content.trim();
|
|
582
|
-
const contentWithoutFrontmatter = trimmedContent.replace(/^---\s*\n(?:.*\n)*?---\s*/, "").trim();
|
|
583
|
-
return contentWithoutFrontmatter.length > 10;
|
|
584
|
-
});
|
|
585
|
-
const seenUrls = new Set();
|
|
586
|
-
const deduplicatedResults = contentResults.filter((result) => {
|
|
587
|
-
if (seenUrls.has(result.url)) return false;
|
|
588
|
-
seenUrls.add(result.url);
|
|
589
|
-
return true;
|
|
558
|
+
return result.content.trim().replace(/^---\s*\n(?:.*\n)*?---\s*/, "").trim().length > 10;
|
|
590
559
|
});
|
|
591
|
-
const
|
|
592
|
-
filePath: result.filePath,
|
|
593
|
-
title: result.title,
|
|
594
|
-
content: result.content,
|
|
595
|
-
url: result.url,
|
|
596
|
-
metadata: result.metadata
|
|
597
|
-
}));
|
|
560
|
+
const seenUrls = /* @__PURE__ */ new Set();
|
|
598
561
|
const llmsResult = await generateLlmsTxtArtifacts({
|
|
599
|
-
files:
|
|
562
|
+
files: contentResults.filter((result) => {
|
|
563
|
+
if (seenUrls.has(result.url)) return false;
|
|
564
|
+
seenUrls.add(result.url);
|
|
565
|
+
return true;
|
|
566
|
+
}).map((result) => ({
|
|
567
|
+
filePath: result.filePath,
|
|
568
|
+
title: result.title,
|
|
569
|
+
content: result.content,
|
|
570
|
+
url: result.url,
|
|
571
|
+
metadata: result.metadata
|
|
572
|
+
})),
|
|
600
573
|
siteName,
|
|
601
574
|
description,
|
|
602
575
|
origin: origin$1 || firstUrl.origin,
|
|
@@ -622,4 +595,4 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
622
595
|
}
|
|
623
596
|
|
|
624
597
|
//#endregion
|
|
625
|
-
export {
|
|
598
|
+
export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };
|
package/dist/cli.mjs
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { n as parseUrlPattern, r as validateGlobPattern, t as crawlAndGenerate } from "./_chunks/crawl-BInMcRnS.mjs";
|
|
2
2
|
import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
|
|
3
|
-
import * as p$1 from "@clack/prompts";
|
|
4
3
|
import * as p from "@clack/prompts";
|
|
5
4
|
import { PlaywrightCrawler } from "crawlee";
|
|
6
5
|
import { dirname, join, resolve } from "pathe";
|
|
@@ -18,12 +17,12 @@ async function checkPlaywrightInstallation() {
|
|
|
18
17
|
}
|
|
19
18
|
}
|
|
20
19
|
async function promptPlaywrightInstall() {
|
|
21
|
-
const shouldInstall = await p$1.confirm({
|
|
20
|
+
const shouldInstall = await p.confirm({
|
|
22
21
|
message: "Playwright is required for the Playwright driver. Install it now?",
|
|
23
22
|
initialValue: true
|
|
24
23
|
});
|
|
25
|
-
if (p$1.isCancel(shouldInstall) || !shouldInstall) return false;
|
|
26
|
-
const s = p$1.spinner();
|
|
24
|
+
if (p.isCancel(shouldInstall) || !shouldInstall) return false;
|
|
25
|
+
const s = p.spinner();
|
|
27
26
|
s.start("Installing Playwright globally...");
|
|
28
27
|
try {
|
|
29
28
|
await addDependency("playwright", { global: true });
|
|
@@ -31,17 +30,15 @@ async function promptPlaywrightInstall() {
|
|
|
31
30
|
return true;
|
|
32
31
|
} catch (fallbackError) {
|
|
33
32
|
s.stop("Failed to install Playwright");
|
|
34
|
-
p$1.log.error(`Installation failed: ${fallbackError}`);
|
|
33
|
+
p.log.error(`Installation failed: ${fallbackError}`);
|
|
35
34
|
return false;
|
|
36
35
|
}
|
|
37
36
|
}
|
|
38
37
|
async function ensurePlaywrightInstalled() {
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
if (!installed) {
|
|
44
|
-
p$1.log.error("Cannot proceed with Playwright driver without Playwright installed.");
|
|
38
|
+
if (await checkPlaywrightInstallation()) return true;
|
|
39
|
+
p.log.warn("Playwright driver selected but Playwright is not installed.");
|
|
40
|
+
if (!await promptPlaywrightInstall()) {
|
|
41
|
+
p.log.error("Cannot proceed with Playwright driver without Playwright installed.");
|
|
45
42
|
return false;
|
|
46
43
|
}
|
|
47
44
|
return true;
|
|
@@ -67,10 +64,8 @@ async function isUseChromeSupported() {
|
|
|
67
64
|
|
|
68
65
|
//#endregion
|
|
69
66
|
//#region src/cli.ts
|
|
70
|
-
const
|
|
71
|
-
const
|
|
72
|
-
const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
|
|
73
|
-
const version = packageJson.version;
|
|
67
|
+
const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
|
|
68
|
+
const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
|
|
74
69
|
function checkOutputDirectoryPermissions(outputDir) {
|
|
75
70
|
try {
|
|
76
71
|
mkdirSync(outputDir, { recursive: true });
|
|
@@ -116,8 +111,7 @@ async function interactiveCrawl() {
|
|
|
116
111
|
const globError = validateGlobPattern(url);
|
|
117
112
|
if (globError) return globError;
|
|
118
113
|
try {
|
|
119
|
-
|
|
120
|
-
if (!parsed.isGlob) try {
|
|
114
|
+
if (!parseUrlPattern(url).isGlob) try {
|
|
121
115
|
new URL(withHttps(url));
|
|
122
116
|
} catch {
|
|
123
117
|
return `Invalid URL: ${withHttps(url)}`;
|
|
@@ -194,7 +188,7 @@ async function interactiveCrawl() {
|
|
|
194
188
|
const url = new URL(withHttps(firstUrl));
|
|
195
189
|
return `${url.protocol}//${url.host}`;
|
|
196
190
|
} catch {
|
|
197
|
-
return
|
|
191
|
+
return;
|
|
198
192
|
}
|
|
199
193
|
})();
|
|
200
194
|
const outputFormats = advancedOptions.outputFormats.map((f) => {
|
|
@@ -216,7 +210,7 @@ async function interactiveCrawl() {
|
|
|
216
210
|
inferredOrigin && `Origin: ${inferredOrigin}`
|
|
217
211
|
].filter(Boolean);
|
|
218
212
|
p.note(summary.join("\n"), "Crawl Configuration");
|
|
219
|
-
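if (advancedOptions.skipSitemap && globPatterns.some((p$2) => p$2.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");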
|
|
213
|
+
if (advancedOptions.skipSitemap && globPatterns.some((p$1) => p$1.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
|
|
220
214
|
return {
|
|
221
215
|
urls,
|
|
222
216
|
outputDir: resolve(outputDir),
|
|
@@ -381,7 +375,7 @@ Examples:
|
|
|
381
375
|
const urlObj = new URL(withHttps(url));
|
|
382
376
|
return `${urlObj.protocol}//${urlObj.host}`;
|
|
383
377
|
} catch {
|
|
384
|
-
return
|
|
378
|
+
return;
|
|
385
379
|
}
|
|
386
380
|
})();
|
|
387
381
|
const siteNameOverride = getArgValue("--site-name");
|
|
@@ -439,19 +433,15 @@ async function main() {
|
|
|
439
433
|
if (permCheck.error?.includes("Permission denied")) p.log.info("Tip: Try running with elevated privileges (e.g., sudo) or change the output directory permissions.");
|
|
440
434
|
process.exit(1);
|
|
441
435
|
}
|
|
442
|
-
if (options.driver === "playwright") {
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
if (!playwrightInstalled) {
|
|
450
|
-
p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
|
|
451
|
-
process.exit(1);
|
|
452
|
-
}
|
|
453
|
-
p.log.info("Using global playwright instance.");
|
|
436
|
+
if (options.driver === "playwright") if (await isUseChromeSupported()) {
|
|
437
|
+
options.useChrome = true;
|
|
438
|
+
p.log.info("System Chrome detected and enabled.");
|
|
439
|
+
} else {
|
|
440
|
+
if (!await ensurePlaywrightInstalled()) {
|
|
441
|
+
p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
|
|
442
|
+
process.exit(1);
|
|
454
443
|
}
|
|
444
|
+
p.log.info("Using global playwright instance.");
|
|
455
445
|
}
|
|
456
446
|
const s = p.spinner();
|
|
457
447
|
s.start("Starting crawl...");
|
|
@@ -475,9 +465,7 @@ async function main() {
|
|
|
475
465
|
}
|
|
476
466
|
});
|
|
477
467
|
s.stop();
|
|
478
|
-
const endTime = Date.now();
|
|
479
|
-
const durationMs = endTime - startTime;
|
|
480
|
-
const durationSeconds = durationMs / 1e3;
|
|
468
|
+
const durationSeconds = (Date.now() - startTime) / 1e3;
|
|
481
469
|
const successful = results.filter((r) => r.success).length;
|
|
482
470
|
const failed = results.filter((r) => !r.success).length;
|
|
483
471
|
const failedResults = results.filter((r) => !r.success);
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { crawlAndGenerate } from "./_chunks/crawl-DEZX9kH_.mjs";
|
|
1
|
+
import { t as crawlAndGenerate } from "./_chunks/crawl-BInMcRnS.mjs";
|
|
2
2
|
import { writeFile } from "node:fs/promises";
|
|
3
3
|
import { basename, sep } from "pathe";
|
|
4
4
|
|
|
@@ -19,8 +19,7 @@ async function generateLlmsTxt(options) {
|
|
|
19
19
|
if (result.filePath) {
|
|
20
20
|
const mdSeparator = `${sep}md${sep}`;
|
|
21
21
|
const mdIndex = result.filePath.indexOf(mdSeparator);
|
|
22
|
-
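const relativePath = mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath);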
|
|
23
|
-
const linkPath = relativePath.split(sep).join("/");
|
|
22
|
+
const linkPath = (mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath)).split(sep).join("/");
|
|
24
23
|
content += `- [${title}](md/${linkPath}): ${result.url}\n`;
|
|
25
24
|
} else {
|
|
26
25
|
const description$1 = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.13.2",
|
|
4
|
+
"version": "0.14.0",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -51,12 +51,12 @@
|
|
|
51
51
|
},
|
|
52
52
|
"dependencies": {
|
|
53
53
|
"@clack/prompts": "^0.11.0",
|
|
54
|
-
"crawlee": "^3.15.
|
|
54
|
+
"crawlee": "^3.15.3",
|
|
55
55
|
"nypm": "^0.6.2",
|
|
56
56
|
"pathe": "^2.0.3",
|
|
57
57
|
"picomatch": "^4.0.3",
|
|
58
58
|
"ufo": "^1.6.1",
|
|
59
|
-
"mdream": "0.13.2"
|
|
59
|
+
"mdream": "0.14.0"
|
|
60
60
|
},
|
|
61
61
|
"devDependencies": {
|
|
62
62
|
"@types/picomatch": "^4.0.2"
|