@lenne.tech/cli 1.11.1 → 1.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/commands/claude/shortcuts.js +5 -0
- package/build/commands/config/validate.js +36 -2
- package/build/commands/fullstack/init.js +21 -0
- package/build/commands/fullstack/update.js +1 -4
- package/build/commands/git/reset.js +2 -2
- package/build/commands/git/update.js +3 -3
- package/build/commands/server/add-property.js +1 -3
- package/build/commands/server/module.js +1 -1
- package/build/commands/tools/crawl.js +307 -0
- package/build/config/vendor-runtime-deps.json +1 -5
- package/build/extensions/api-mode.js +123 -5
- package/build/extensions/frontend-helper.js +59 -32
- package/build/extensions/git.js +4 -5
- package/build/extensions/server.js +80 -42
- package/build/lib/browser-fetcher.js +139 -0
- package/build/lib/crawler.js +661 -0
- package/build/lib/frontend-framework-detection.js +1 -2
- package/build/lib/hoist-workspace-pnpm-config.js +97 -0
- package/build/lib/markdown-table.js +33 -0
- package/docs/VENDOR-MODE-WORKFLOW.md +73 -0
- package/docs/commands.md +53 -1
- package/docs/lt.config.md +37 -0
- package/package.json +24 -9
|
@@ -0,0 +1,661 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// TypeScript async/await down-level helper: drives a generator, resuming it
// with each awaited value and settling the returned promise with the
// generator's return value or the first uncaught error.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    if (!P) {
        P = Promise;
    }
    // Wrap plain values so everything yielded can be chained with `.then`.
    const adopt = (value) => (value instanceof P ? value : new P((resolve) => { resolve(value); }));
    return new P((resolve, reject) => {
        const iterator = generator.apply(thisArg, _arguments || []);
        // Resume the generator via `next` or `throw`; anything thrown while
        // stepping rejects the outer promise.
        const advance = (method, input) => {
            try {
                const result = iterator[method](input);
                if (result.done) {
                    resolve(result.value);
                }
                else {
                    adopt(result.value).then((value) => advance('next', value), (reason) => advance('throw', reason));
                }
            }
            catch (e) {
                reject(e);
            }
        };
        advance('next');
    });
};
|
|
11
|
+
// TypeScript interop helper: ES-module namespaces pass through untouched,
// anything else is exposed as the `default` export of a wrapper object.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
|
|
14
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
+
exports.crawlSite = crawlSite;
|
|
16
|
+
/**
|
|
17
|
+
* Website crawler utilities.
|
|
18
|
+
*
|
|
19
|
+
* Fetches web pages (optionally guided by sitemap.xml), extracts the
|
|
20
|
+
* main content using the same defuddle + Turndown pipeline as the
|
|
21
|
+
* chrome-md browser extension (see ../../../chrome-md/content/content.js),
|
|
22
|
+
* converts it to Markdown, and writes one .md file per page plus an
|
|
23
|
+
* overview README when multiple pages are discovered. Designed for
|
|
24
|
+
* building Claude Code knowledge bases.
|
|
25
|
+
*/
|
|
26
|
+
const axios_1 = __importDefault(require("axios"));
|
|
27
|
+
const crypto_1 = require("crypto");
|
|
28
|
+
const defuddle_1 = __importDefault(require("defuddle"));
|
|
29
|
+
const fs_1 = require("fs");
|
|
30
|
+
const jsdom_1 = require("jsdom");
|
|
31
|
+
const path_1 = require("path");
|
|
32
|
+
const turndown_1 = __importDefault(require("turndown"));
|
|
33
|
+
const turndown_plugin_gfm_1 = require("turndown-plugin-gfm");
|
|
34
|
+
const browser_fetcher_1 = require("./browser-fetcher");
|
|
35
|
+
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; lenneTech-CLI-Crawler/1.0; +https://lenne.tech)';
|
|
36
|
+
/**
 * Crawl a website starting at `options.url` and write the collected
 * pages as Markdown files beneath `options.outDir`.
 *
 * Flow: seed the queue with the start URL (plus same-origin sitemap
 * entries when `includeSitemap` is on), fetch and convert pages with up
 * to `concurrency` requests in flight, buffer the results, then write
 * the pages, the deduplicated images and - for multi-page crawls - a
 * README overview.
 *
 * @param {object} options
 * @param {string} options.url Start URL; only URLs on its origin are crawled.
 * @param {string} options.outDir Output directory (created when missing).
 * @param {number|string} [options.depth] How many link levels to follow
 *   below the queued URLs. `'all'` or a negative value follows every
 *   same-origin link (bounded by `maxPages`); a missing or non-numeric
 *   value follows none.
 * @param {number} [options.concurrency=4] Maximum pages fetched in parallel.
 * @param {number} [options.maxPages=200] Upper bound on pages that are
 *   rendered or fail; queued URLs beyond it are reported in `skipped`.
 * @param {boolean} [options.includeSitemap=true] Seed the queue from `<origin>/sitemap.xml`.
 * @param {boolean} [options.includeImages=true] Download images referenced by the content.
 * @param {boolean} [options.renderJs=false] Fetch pages through the headless browser fetcher.
 * @param {boolean} [options.autoInstallBrowser=false] Passed to the browser fetcher as `autoInstall`.
 * @param {boolean} [options.prune=false] Delete files under `pages/` and `images/`
 *   that this run did not produce (multi-page crawls only).
 * @param {string} [options.selector] Content selector forwarded to the extractor.
 * @param {number} [options.timeout=20000] HTTP timeout / browser wait budget in ms.
 * @param {string} [options.userAgent] User-Agent header for every request.
 * @param {(message: string) => void} [options.onLog] Progress callback.
 * @returns {Promise<object>} `{ errors, indexFile, outDir, pages, pruned, skipped }`
 *   (`indexFile` is only set for multi-page crawls).
 */
async function crawlSite(options) {
    const { autoInstallBrowser = false, concurrency = 4, depth: rawDepth, includeImages = true, includeSitemap = true, maxPages = 200, onLog = () => undefined, outDir, prune = false, renderJs = false, selector, timeout = 20000, url: seedUrl, userAgent = DEFAULT_USER_AGENT, } = options;
    // Normalize depth. `'all'` and negative values (numeric or numeric
    // strings) mean "follow every same-origin link we find" - bounded by
    // `maxPages`. A missing or non-numeric depth follows no links.
    const requestedDepth = rawDepth === 'all' ? Number.POSITIVE_INFINITY : Number(rawDepth);
    let depth = requestedDepth;
    if (Number.isNaN(requestedDepth)) {
        depth = 0;
    }
    else if (requestedDepth < 0) {
        depth = Number.POSITIVE_INFINITY;
    }
    // Always run at least one fetch slot, even for NaN / fractional input.
    const requestedWorkers = Number(concurrency);
    const poolSize = Number.isFinite(requestedWorkers) ? Math.max(1, Math.floor(requestedWorkers)) : 1;
    const http = axios_1.default.create({
        headers: { 'User-Agent': userAgent },
        maxRedirects: 5,
        responseType: 'text',
        timeout,
        validateStatus: (status) => status >= 200 && status < 400,
    });
    // Headless browser only spun up when needed (SPA-mode).
    let browserFetcher = null;
    if (renderJs) {
        browserFetcher = await (0, browser_fetcher_1.createBrowserFetcher)({
            autoInstall: autoInstallBrowser,
            extraWaitMs: 500,
            maxWaitMs: timeout,
            onLog,
            userAgent,
        });
    }
    try {
        const seed = new URL(seedUrl);
        const origin = seed.origin;
        if (!(0, fs_1.existsSync)(outDir)) {
            (0, fs_1.mkdirSync)(outDir, { recursive: true });
        }
        // Queue preserves the depth at which a URL was discovered so children
        // are only followed when `discovered.depth < options.depth`.
        const queue = [{ depth: 0, url: normalizeUrl(seedUrl) }];
        const seen = new Set([normalizeUrl(seedUrl)]);
        if (includeSitemap) {
            onLog(`Checking sitemap at ${origin}/sitemap.xml`);
            const sitemapUrls = await fetchSitemapUrls(http, origin, onLog);
            for (const sitemapUrl of sitemapUrls) {
                const normalized = normalizeUrl(sitemapUrl);
                if (!seen.has(normalized) && sameOrigin(normalized, origin)) {
                    seen.add(normalized);
                    queue.push({ depth: 0, url: normalized });
                }
            }
            if (sitemapUrls.length > 0) {
                onLog(`Sitemap discovered ${sitemapUrls.length} URLs`);
            }
        }
        const pages = [];
        const errors = [];
        const skipped = [];
        // Shared deduplicated image map (content hash -> relative path under outDir).
        const imageHashToPath = new Map();
        // We can't know upfront whether the crawl is single- or multi-page,
        // so we render pages into a buffer first and only materialize files
        // once the queue drains.
        const rendered = [];
        // Fetch, extract and buffer one page. Never rejects. Resolves to
        // `true` when the URL used up page budget (rendered or failed) and
        // `false` when it was skipped (cross-origin redirect, non-HTML).
        const processPage = async (item) => {
            try {
                onLog(`Fetching (depth ${item.depth}): ${item.url}`);
                let html;
                let finalUrl = normalizeUrl(item.url);
                if (browserFetcher) {
                    // In render mode we trust the URL we navigated to. We can't
                    // cheaply detect redirects here, so assume same origin (the
                    // crawler already filtered non-HTML URLs out of the queue).
                    html = await browserFetcher.fetch(item.url);
                }
                else {
                    const response = await http.get(item.url);
                    const redirectedTo = response.request && response.request.res && response.request.res.responseUrl;
                    finalUrl = normalizeUrl(redirectedTo || item.url);
                    if (!sameOrigin(finalUrl, origin)) {
                        skipped.push(item.url);
                        return false;
                    }
                    const contentType = String(response.headers['content-type'] || '');
                    if (!contentType.includes('text/html') && !contentType.includes('application/xhtml')) {
                        skipped.push(item.url);
                        return false;
                    }
                    html = String(response.data || '');
                }
                const extracted = await extractContent(html, finalUrl, { selector });
                // Follow links when depth budget is left.
                if (item.depth < depth) {
                    for (const link of extracted.links) {
                        if (!sameOrigin(link, origin))
                            continue;
                        const normalized = normalizeUrl(link);
                        if (seen.has(normalized))
                            continue;
                        seen.add(normalized);
                        queue.push({ depth: item.depth + 1, url: normalized });
                    }
                }
                // Download images and build a URL -> local path map for Turndown.
                const imageEntries = [];
                if (includeImages && extracted.images.length > 0) {
                    for (const imgUrl of extracted.images) {
                        try {
                            const absolute = new URL(imgUrl, finalUrl).href;
                            const result = await fetchImage(http, absolute);
                            if (!result)
                                continue;
                            const hash = (0, crypto_1.createHash)('sha1').update(result.buffer).digest('hex');
                            let relativeImagePath = imageHashToPath.get(hash);
                            if (!relativeImagePath) {
                                // Filename uses a content-hash suffix so re-runs with
                                // identical bytes overwrite the same file instead of
                                // leaving orphans with rotating counter suffixes.
                                const filename = buildImageFilename(absolute, hash, result.contentType);
                                relativeImagePath = `images/${filename}`;
                                imageHashToPath.set(hash, relativeImagePath);
                                imageEntries.push({ data: result.buffer, filename });
                            }
                            extracted.imageMap.set(imgUrl, relativeImagePath);
                            extracted.imageMap.set(absolute, relativeImagePath);
                        }
                        catch (_imageError) {
                            // Skip image on error; continue with others.
                        }
                    }
                }
                const markdown = convertToMarkdown(extracted.contentHtml, finalUrl, extracted.imageMap);
                const filename = buildPageFilename(finalUrl, rendered.length === 0);
                rendered.push({
                    filename,
                    images: imageEntries,
                    info: {
                        author: extracted.meta.author,
                        depth: item.depth,
                        description: extracted.meta.description,
                        downloadDate: new Date().toISOString(),
                        firstDownloaded: new Date().toISOString(),
                        imageCount: imageEntries.length,
                        language: extracted.meta.language,
                        ogImage: extracted.meta.ogImage,
                        title: extracted.meta.title,
                        url: finalUrl,
                        wordCount: extracted.meta.wordCount || countWords(extracted.contentText),
                    },
                    markdown,
                });
            }
            catch (error) {
                errors.push({
                    reason: error instanceof Error ? error.message : String(error),
                    url: item.url,
                });
            }
            return true;
        };
        // Bounded scheduler. Keeps up to `poolSize` pages in flight and
        // refills a slot as soon as one finishes, so links discovered by the
        // first page are fetched in parallel. A budget slot is reserved
        // BEFORE the fetch starts and handed back when the URL turns out to
        // be skipped; that keeps rendered + failed + in-flight <= maxPages.
        let cursor = 0;
        let budgetUsed = 0;
        const inFlight = new Set();
        const hasWork = () => cursor < queue.length && budgetUsed < maxPages;
        while (hasWork() || inFlight.size > 0) {
            while (inFlight.size < poolSize && hasWork()) {
                const item = queue[cursor++];
                budgetUsed++;
                const task = processPage(item).then((consumed) => {
                    if (!consumed) {
                        budgetUsed--;
                    }
                    inFlight.delete(task);
                });
                inFlight.add(task);
            }
            // `inFlight` is never empty here: either work was just scheduled
            // or earlier tasks are still running.
            await Promise.race(inFlight);
        }
        // Everything still queued was cut off by the page limit; report it
        // instead of looping on it.
        if (cursor < queue.length) {
            const leftover = queue.length - cursor;
            for (let i = cursor; i < queue.length; i++) {
                skipped.push(queue[i].url);
            }
            onLog(`Page limit of ${maxPages} reached; skipped ${leftover} queued URL(s)`);
        }
        const multiPage = rendered.length > 1;
        const pagesDir = multiPage ? (0, path_1.join)(outDir, 'pages') : outDir;
        const imagesDir = (0, path_1.join)(outDir, 'images');
        if (rendered.length === 0) {
            onLog('No pages rendered');
            return { errors, outDir, pages, pruned: [], skipped };
        }
        (0, fs_1.mkdirSync)(pagesDir, { recursive: true });
        if (includeImages && imageHashToPath.size > 0) {
            (0, fs_1.mkdirSync)(imagesDir, { recursive: true });
        }
        // Write deduplicated images.
        const writtenImageFilenames = new Set();
        for (const entry of rendered.flatMap((r) => r.images)) {
            if (writtenImageFilenames.has(entry.filename))
                continue;
            writtenImageFilenames.add(entry.filename);
            (0, fs_1.writeFileSync)((0, path_1.join)(imagesDir, entry.filename), entry.data);
        }
        // Persist pages. When updating, preserve the original
        // `first_downloaded` timestamp so history stays intact.
        for (const entry of rendered) {
            const outputPath = (0, path_1.join)(pagesDir, entry.filename);
            const relativePath = (0, path_1.relative)(outDir, outputPath);
            // Images live under `<outDir>/images/`. Each page rewrites the
            // Turndown-emitted `images/<file>` placeholder to the correct
            // relative path so nested URL slugs (`pages/ueber-uns/...`, or a
            // single-page crawl that lands in `<outDir>/ueber-uns/...`) still
            // render in Markdown previews.
            const imagePrefix = `${(0, path_1.relative)((0, path_1.dirname)(outputPath), imagesDir).split(/[\\/]/).join('/')}/`;
            const fixedMarkdown = entry.markdown.replace(/\]\(images\//g, `](${imagePrefix}`);
            if ((0, fs_1.existsSync)(outputPath)) {
                const existing = (0, fs_1.readFileSync)(outputPath, 'utf8');
                const existingMeta = parseFrontmatter(existing);
                if (existingMeta && existingMeta.first_downloaded) {
                    entry.info.firstDownloaded = String(existingMeta.first_downloaded);
                }
            }
            const frontmatter = renderFrontmatter(entry.info);
            (0, fs_1.mkdirSync)((0, path_1.dirname)(outputPath), { recursive: true });
            (0, fs_1.writeFileSync)(outputPath, `${frontmatter}\n${fixedMarkdown.trim()}\n`);
            pages.push(Object.assign({}, entry.info, { outputPath, relativePath }));
        }
        let indexFile;
        if (multiPage) {
            indexFile = (0, path_1.join)(outDir, 'README.md');
            (0, fs_1.writeFileSync)(indexFile, renderOverview(seed.href, pages));
        }
        // Prune orphans (files left over from previous crawls). Scoped to
        // `pages/` and `images/` so stray user files in outDir root never
        // get touched. Only active in multi-page mode - a single-page
        // crawl writes into `outDir` itself and has no page subfolder to
        // sweep.
        const pruned = [];
        if (prune && multiPage) {
            const keep = new Set(pages.map((p) => p.outputPath));
            for (const entry of rendered.flatMap((r) => r.images)) {
                keep.add((0, path_1.join)(imagesDir, entry.filename));
            }
            pruned.push(...pruneOrphans(pagesDir, keep));
            if ((0, fs_1.existsSync)(imagesDir)) {
                pruned.push(...pruneOrphans(imagesDir, keep));
            }
            if (pruned.length > 0) {
                onLog(`Pruned ${pruned.length} orphaned file(s)`);
            }
        }
        return { errors, indexFile, outDir, pages, pruned, skipped };
    }
    finally {
        // Guarantee the headless browser is shut down on every exit path,
        // including thrown errors, so no orphan chromium processes linger.
        if (browserFetcher) {
            await browserFetcher.close().catch(() => undefined);
        }
    }
}
|
|
289
|
+
/**
 * Build the on-disk name for a downloaded image:
 * `<sanitized-basename>-<first 8 hash chars>.<extension>`.
 *
 * The extension comes from the URL path when it is a known image
 * extension, otherwise from the Content-Type subtype, otherwise `png`.
 *
 * @param {string} url Absolute image URL.
 * @param {string} contentHash Hex digest of the image bytes.
 * @param {string} contentType Content-Type header value (may be empty).
 * @returns {string} File name without any directory part.
 */
function buildImageFilename(url, contentHash, contentType) {
    const urlExtensions = /^(jpg|jpeg|png|gif|webp|svg|avif)$/;
    const mimeSubtypes = /^(jpeg|jpg|png|gif|webp|svg\+xml|avif)$/;
    let stem = 'image';
    let ext = '';
    try {
        const pathSegments = new URL(url).pathname.split('/').filter(Boolean);
        const lastSegment = pathSegments.length > 0 ? pathSegments[pathSegments.length - 1] : '';
        const dottedExt = path_1.extname(lastSegment);
        const candidate = dottedExt.replace('.', '').toLowerCase();
        if (candidate && urlExtensions.test(candidate)) {
            ext = candidate;
        }
        // Strip the extension, neutralize unsafe characters, cap the length.
        const cleaned = lastSegment
            .replace(dottedExt, '')
            .replace(/[^a-zA-Z0-9-_]/g, '_')
            .substring(0, 40);
        stem = cleaned || 'image';
    }
    catch (_unparsable) {
        // Unparsable URL: keep the generic stem and derive the extension below.
    }
    if (!ext) {
        const subtype = contentType.split(';')[0].split('/')[1];
        if (subtype && mimeSubtypes.test(subtype)) {
            ext = subtype === 'svg+xml' ? 'svg' : subtype;
        }
        else {
            ext = 'png';
        }
    }
    return `${stem}-${contentHash.slice(0, 8)}.${ext}`;
}
|
|
319
|
+
/**
 * Derive the relative Markdown file name for a page URL.
 *
 * The site root becomes `index.md` (or `home.md` when it is not the first
 * rendered page). Every other URL maps each path segment to a lowercase
 * slug and joins the slugs with `/`, e.g. `/Docs/Intro.html` ->
 * `docs/intro.md`.
 *
 * @param {string} url Absolute page URL.
 * @param {boolean} isFirst Whether this is the first page being rendered.
 * @returns {string} Relative path ending in `.md`.
 */
function buildPageFilename(url, isFirst) {
    const parts = new URL(url).pathname.split('/').filter((part) => part.length > 0);
    if (parts.length === 0) {
        return isFirst ? 'index.md' : 'home.md';
    }
    const toSlug = (part) => {
        const withoutPageExt = part.toLowerCase().replace(/\.(html?|php|aspx?)$/, '');
        const dashed = withoutPageExt.replace(/[^a-z0-9-_]/g, '-').replace(/-+/g, '-');
        const trimmed = dashed.replace(/^-|-$/g, '');
        // A segment made only of unsafe characters still needs a name.
        return trimmed || 'page';
    };
    const slugs = [];
    for (const part of parts) {
        slugs.push(toSlug(part));
    }
    return `${slugs.join('/')}.md`;
}
|
|
335
|
+
/**
 * Convert cleaned content HTML to Markdown with Turndown.
 *
 * Links are rewritten to absolute URLs. Images are emitted as
 * `![alt](target)`, where `target` is the local path from `imageMap`
 * when the image was downloaded and the absolute URL otherwise.
 *
 * @param {string} html Content HTML to convert.
 * @param {string} baseUrl Page URL used to resolve relative references.
 * @param {Map<string, string>} imageMap Image URL (as written or absolute) -> local relative path.
 * @returns {string} Trimmed Markdown with runs of blank lines collapsed.
 */
function convertToMarkdown(html, baseUrl, imageMap) {
    const turndown = new turndown_1.default({
        bulletListMarker: '-',
        codeBlockStyle: 'fenced',
        emDelimiter: '*',
        headingStyle: 'atx',
        linkStyle: 'inlined',
        strongDelimiter: '**',
    });
    // Enable GFM so tables, strikethrough and task lists convert cleanly.
    if (turndown_plugin_gfm_1.gfm) {
        turndown.use(turndown_plugin_gfm_1.gfm);
    }
    // Nodes handed to a rule may lack `getAttribute`; read attributes defensively.
    const readAttribute = (node, name) => (node.getAttribute == null ? undefined : node.getAttribute(name));
    // Resolve against the page URL; keep the original reference when it cannot be parsed.
    const toAbsolute = (reference) => {
        try {
            return new URL(reference, baseUrl).href;
        }
        catch (_unparsable) {
            return reference;
        }
    };
    turndown.addRule('absoluteLinks', {
        filter: 'a',
        replacement: (content, node) => {
            const href = readAttribute(node, 'href') || '';
            if (!href || href === '#' || href.startsWith('javascript:')) {
                return content;
            }
            const absolute = toAbsolute(href);
            const title = readAttribute(node, 'title');
            return title ? `[${content}](${absolute} "${title}")` : `[${content}](${absolute})`;
        },
    });
    turndown.addRule('localImages', {
        filter: 'img',
        replacement: (_content, node) => {
            const src = readAttribute(node, 'src') || '';
            if (!src)
                return '';
            const absolute = toAbsolute(src);
            const local = imageMap.get(src) || imageMap.get(absolute);
            const alt = readAttribute(node, 'alt') || '';
            const target = local || absolute;
            // Emit a real Markdown image. Returning an empty string here
            // drops every image and leaves nothing for the caller's
            // `](images/` path rewrite to match.
            return `![${alt}](${target})`;
        },
    });
    turndown.remove(['script', 'style', 'noscript', 'iframe']);
    const markdown = turndown.turndown(html);
    return markdown.replace(/\n{3,}/g, '\n\n').trim();
}
|
|
391
|
+
/**
 * Count whitespace-separated words in `text`.
 *
 * @param {string} text Plain text.
 * @returns {number} Number of non-whitespace runs (0 for empty or blank text).
 */
function countWords(text) {
    const words = text.match(/\S+/g);
    return words ? words.length : 0;
}
|
|
394
|
+
/**
 * Escape a value for use inside a double-quoted, single-line YAML scalar.
 *
 * Backslashes and double quotes are backslash-escaped. Every line break
 * (`\r\n`, `\r` or `\n`) becomes one space so the value cannot spill onto
 * a second front matter line.
 *
 * @param {string} value Raw text.
 * @returns {string} Text safe to place between double quotes on one line.
 */
function escapeYaml(value) {
    return value
        .replace(/\\/g, '\\\\')
        .replace(/"/g, '\\"')
        .replace(/\r\n?|\n/g, ' ');
}
|
|
397
|
+
/**
 * Extract the main content and metadata of a page with defuddle (the same
 * engine as chrome-md). When defuddle throws, the raw `<body>` markup is
 * used instead.
 *
 * @param {string} html Full page HTML.
 * @param {string} pageUrl URL the page was loaded from; base for relative references.
 * @param {{selector?: string}} options `selector` is forwarded to defuddle as `contentSelector`.
 * @returns {Promise<object>} `{ contentHtml, contentText, imageMap, images, links, meta }`;
 *   `images` and `links` are absolute URLs, `imageMap` starts empty.
 */
async function extractContent(html, pageUrl, options) {
    const sourceDoc = new jsdom_1.JSDOM(html, { url: pageUrl }).window.document;
    const sourceBodyHtml = () => (sourceDoc.body ? sourceDoc.body.innerHTML : undefined);
    const defuddleOptions = {
        markdown: false,
        removeHiddenElements: true,
        removeLowScoring: true,
        removeSmallImages: false,
    };
    if (options.selector) {
        defuddleOptions.contentSelector = options.selector;
    }
    let parsed;
    try {
        // Same class-based API as chrome-md's content script.
        parsed = new defuddle_1.default(sourceDoc, defuddleOptions).parse();
    }
    catch (_defuddleFailure) {
        parsed = {
            content: sourceBodyHtml() || html,
            title: sourceDoc.title,
        };
    }
    const contentHtml = parsed.content || sourceBodyHtml() || '';
    // Re-parse the cleaned content so links and images are collected from it.
    const contentDoc = new jsdom_1.JSDOM(`<!DOCTYPE html><html><body>${contentHtml}</body></html>`, {
        url: pageUrl,
    }).window.document;
    // Resolve `reference` against the page URL and record it; malformed URLs are ignored.
    const addResolved = (bucket, reference) => {
        try {
            bucket.add(new URL(reference, pageUrl).href);
        }
        catch (_malformed) {
            // ignore malformed URLs
        }
    };
    const ignoredLinkPrefixes = ['#', 'mailto:', 'javascript:'];
    const links = new Set();
    for (const anchor of contentDoc.querySelectorAll('a[href]')) {
        const href = (anchor.getAttribute('href') || '').trim();
        if (href && !ignoredLinkPrefixes.some((prefix) => href.startsWith(prefix))) {
            addResolved(links, href);
        }
    }
    const images = new Set();
    const addImage = (reference) => {
        if (reference && !reference.startsWith('data:')) {
            addResolved(images, reference);
        }
    };
    for (const img of contentDoc.querySelectorAll('img')) {
        addImage((img.getAttribute('src') || img.getAttribute('data-src') || '').trim());
    }
    // Some lazy-loading frameworks keep the real URL only in the source
    // document (stripped out by defuddle), so also consult the original DOM.
    for (const img of sourceDoc.querySelectorAll('img[data-src], img[data-lazy-src]')) {
        addImage((img.getAttribute('data-src') || img.getAttribute('data-lazy-src') || '').trim());
    }
    // `content` attribute of the first matching <meta>, or undefined when absent.
    const metaContent = (cssSelector) => {
        const element = sourceDoc.querySelector(cssSelector);
        return element === null || element === undefined ? undefined : element.getAttribute('content');
    };
    const meta = {
        author: parsed.author || metaContent('meta[name="author"]') || undefined,
        description: parsed.description ||
            metaContent('meta[name="description"]') ||
            metaContent('meta[property="og:description"]') ||
            '',
        language: parsed.language || sourceDoc.documentElement.getAttribute('lang') || undefined,
        ogImage: parsed.image || metaContent('meta[property="og:image"]') || undefined,
        title: parsed.title || sourceDoc.title || pageUrl,
        wordCount: parsed.wordCount,
    };
    return {
        contentHtml,
        contentText: (contentDoc.body ? contentDoc.body.textContent : undefined) || '',
        imageMap: new Map(),
        images: [...images],
        links: [...links],
        meta,
    };
}
|
|
492
|
+
/**
 * Download one image. Best effort: any failure yields `null` so a broken
 * image never aborts the page.
 *
 * HTML responses are rejected. Error and login pages are often served
 * with a success status and would otherwise be stored as image files.
 *
 * @param {{get: Function}} http HTTP client with an axios-style `get(url, config)`.
 * @param {string} url Absolute image URL.
 * @returns {Promise<{buffer: Buffer, contentType: string} | null>} Image bytes and
 *   Content-Type, or `null` when the download failed, was empty or returned HTML.
 */
async function fetchImage(http, url) {
    try {
        const response = await http.get(url, { responseType: 'arraybuffer' });
        const contentType = String(response.headers['content-type'] || '');
        if (/text\/html|application\/xhtml/i.test(contentType)) {
            return null;
        }
        const buffer = Buffer.from(response.data);
        if (buffer.byteLength === 0) {
            return null;
        }
        return { buffer, contentType };
    }
    catch (_downloadError) {
        // Deliberately swallowed: a missing image is not a page error.
        return null;
    }
}
|
|
506
|
+
/**
 * Collect page URLs from `<origin>/sitemap.xml`, following nested sitemap
 * indexes. Each sitemap is fetched at most once. A sitemap that cannot
 * be fetched is logged and skipped.
 *
 * `<loc>` values are XML-unescaped (`&amp;` -> `&`, numeric references)
 * and CDATA wrappers are removed, so URLs with query strings come out as
 * the real URL.
 *
 * @param {{get: Function}} http HTTP client with an axios-style `get(url)`.
 * @param {string} origin Site origin, e.g. `https://example.com`.
 * @param {(message: string) => void} onLog Progress callback.
 * @returns {Promise<string[]>} Page URLs in discovery order (may contain duplicates).
 */
async function fetchSitemapUrls(http, origin, onLog) {
    const urls = [];
    const visited = new Set();
    const namedEntities = { amp: '&', apos: "'", gt: '>', lt: '<', quot: '"' };
    // Decode the predefined XML entities and numeric character references.
    const decodeXmlText = (text) => text.replace(/&(#x[0-9a-f]+|#\d+|amp|apos|gt|lt|quot);/gi, (whole, body) => {
        const key = body.toLowerCase();
        if (key[0] !== '#') {
            return namedEntities[key];
        }
        const codePoint = key[1] === 'x' ? Number.parseInt(key.slice(2), 16) : Number.parseInt(key.slice(1), 10);
        // Out-of-range references are left as written.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    });
    // Return the cleaned <loc> value of every <tag>...</tag> entry in `xml`.
    const readLocs = (xml, tag) => {
        const pattern = new RegExp(`<${tag}>[\\s\\S]*?<loc>([\\s\\S]*?)<\\/loc>[\\s\\S]*?<\\/${tag}>`, 'gi');
        const found = [];
        for (const match of xml.matchAll(pattern)) {
            const cdata = match[1].match(/^\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*$/);
            // CDATA content is literal text; only plain content carries entities.
            const value = cdata ? cdata[1].trim() : decodeXmlText(match[1].trim());
            if (value && !/[<\s]/.test(value)) {
                found.push(value);
            }
        }
        return found;
    };
    async function walk(sitemapUrl) {
        if (visited.has(sitemapUrl))
            return;
        visited.add(sitemapUrl);
        try {
            const response = await http.get(sitemapUrl);
            const xml = String(response.data || '');
            // Nested sitemap index: follow each <sitemap><loc>...</loc></sitemap>.
            for (const child of readLocs(xml, 'sitemap')) {
                await walk(child);
            }
            urls.push(...readLocs(xml, 'url'));
        }
        catch (error) {
            onLog(`Sitemap fetch failed for ${sitemapUrl}: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    await walk(`${origin}/sitemap.xml`);
    return urls;
}
|
|
535
|
+
/**
 * Canonicalize a URL for de-duplication: the fragment is dropped, a
 * trailing `/index.html` or `/index.htm` collapses to its directory, and
 * trailing slashes are removed everywhere except on the root path.
 * Input that is not a valid absolute URL is returned unchanged.
 *
 * @param {string} raw URL to normalize.
 * @returns {string} Normalized URL, or `raw` when it cannot be parsed.
 */
function normalizeUrl(raw) {
    let parsed;
    try {
        parsed = new URL(raw);
    }
    catch (_invalid) {
        return raw;
    }
    parsed.hash = '';
    parsed.pathname = parsed.pathname.replace(/\/index\.html?$/i, '/');
    const isRoot = parsed.pathname.length <= 1;
    if (!isRoot && parsed.pathname.endsWith('/')) {
        parsed.pathname = parsed.pathname.replace(/\/+$/, '');
    }
    return parsed.href;
}
|
|
553
|
+
/**
 * Parse the flat `key: value` front matter block at the top of a
 * previously written page.
 *
 * Only single-line entries are understood. Surrounding double quotes are
 * removed and the `\"` / `\\` escapes used when the file was written are
 * reversed. LF and CRLF line endings are both accepted, so a file that
 * passed through a CRLF checkout still yields its keys.
 *
 * @param {string} markdown File content.
 * @returns {Object<string, string> | null} Parsed entries, or `null` when the
 *   content does not start with a terminated `---` block.
 */
function parseFrontmatter(markdown) {
    if (!markdown.startsWith('---'))
        return null;
    const end = markdown.indexOf('\n---', 3);
    if (end === -1)
        return null;
    const result = {};
    for (const rawLine of markdown.slice(3, end).split('\n')) {
        // `.` never matches `\r`, so strip it or CRLF lines would not parse.
        const line = rawLine.endsWith('\r') ? rawLine.slice(0, -1) : rawLine;
        const match = line.match(/^([a-zA-Z0-9_]+):\s*(.*)$/);
        if (!match)
            continue;
        const quoted = match[2].match(/^"(.*)"$/);
        result[match[1]] = quoted ? quoted[1].replace(/\\(["\\])/g, '$1') : match[2];
    }
    return result;
}
|
|
569
|
+
/**
 * Recursively delete every file below `rootDir` whose path is not in
 * `keepPaths`, then remove any sub-directory the sweep left empty.
 * Files that cannot be deleted are skipped silently.
 *
 * @param {string} rootDir Directory to sweep; a missing directory is a no-op.
 * @param {Set<string>} keepPaths Paths (built by joining onto `rootDir`) to keep.
 * @returns {string[]} Paths of the files that were actually deleted.
 */
function pruneOrphans(rootDir, keepPaths) {
    if (!fs_1.existsSync(rootDir)) {
        return [];
    }
    const deleted = [];
    for (const dirent of fs_1.readdirSync(rootDir, { withFileTypes: true })) {
        const target = path_1.join(rootDir, dirent.name);
        if (dirent.isDirectory()) {
            for (const nested of pruneOrphans(target, keepPaths)) {
                deleted.push(nested);
            }
            // Remove directory if now empty.
            try {
                const remaining = fs_1.readdirSync(target);
                if (remaining.length === 0) {
                    fs_1.rmdirSync(target);
                }
            }
            catch (_dirError) {
                // Directory not empty or already gone - ignore.
            }
            continue;
        }
        if (!dirent.isFile() || keepPaths.has(target)) {
            continue;
        }
        try {
            fs_1.unlinkSync(target);
            deleted.push(target);
        }
        catch (_fileError) {
            // File already removed or permission denied - skip.
        }
    }
    return deleted;
}
|
|
605
|
+
/**
 * Render the YAML front matter block for a page.
 *
 * Always written: title, source URL and domain, crawl depth, both
 * download timestamps, word count and content type. Description (capped
 * at 500 characters), author, language, og:image and image count are
 * written only when they have a truthy value.
 *
 * @param {object} info Page metadata as collected during the crawl.
 * @returns {string} Front matter including both `---` fences, without a trailing newline.
 */
function renderFrontmatter(info) {
    const out = ['---'];
    out.push(`title: "${escapeYaml(info.title)}"`);
    out.push(`source_url: "${info.url}"`);
    out.push(`source_domain: "${new URL(info.url).hostname}"`);
    out.push(`crawl_depth: ${info.depth}`);
    out.push(`download_date: "${info.downloadDate}"`);
    out.push(`first_downloaded: "${info.firstDownloaded}"`);
    if (info.description) {
        out.push(`description: "${escapeYaml(truncate(info.description, 500))}"`);
    }
    if (info.author) {
        out.push(`author: "${escapeYaml(info.author)}"`);
    }
    if (info.language) {
        out.push(`language: "${escapeYaml(info.language)}"`);
    }
    if (info.ogImage) {
        out.push(`og_image: "${escapeYaml(info.ogImage)}"`);
    }
    if (info.imageCount) {
        out.push(`image_count: ${info.imageCount}`);
    }
    out.push(`word_count: ${info.wordCount}`);
    out.push('content_type: "webpage"');
    out.push('---');
    return out.join('\n');
}
|
|
625
|
+
/**
 * Render the README overview for a multi-page crawl: a header with the
 * source URL, generation time and page count, followed by one section
 * per page, sorted by URL.
 *
 * @param {string} startUrl URL the crawl started from.
 * @param {Array<object>} pages Written pages (`title`, `url`, `relativePath`,
 *   `downloadDate` and optional `description` are read).
 * @returns {string} Markdown document.
 */
function renderOverview(startUrl, pages) {
    const sorted = Array.from(pages).sort((left, right) => left.url.localeCompare(right.url));
    const host = new URL(startUrl).host;
    const header = [
        `# ${host} — Knowledge Base`,
        '',
        `Source: ${startUrl}`,
        '',
        `Generated: ${new Date().toISOString()}`,
        '',
        `Pages: ${sorted.length}`,
        '',
        '## Pages',
        '',
    ];
    const sections = sorted.map((page) => {
        // Links always use forward slashes, whatever the platform separator is.
        const href = page.relativePath.split(/[\\/]/).join('/');
        const section = [`### [${page.title}](${href})`, '', `- URL: ${page.url}`];
        if (page.description) {
            section.push(`- ${truncate(page.description, 240)}`);
        }
        section.push(`- Updated: ${page.downloadDate}`, '');
        return section;
    });
    return header.concat(...sections).join('\n');
}
|
|
651
|
+
/**
 * Tell whether `url` belongs to `origin` (scheme, host and port all match).
 *
 * @param {string} url Absolute URL to test.
 * @param {string} origin Origin string such as `https://example.com`.
 * @returns {boolean} `false` for a different origin or an unparsable URL.
 */
function sameOrigin(url, origin) {
    let candidate;
    try {
        candidate = new URL(url);
    }
    catch (_invalid) {
        return false;
    }
    return candidate.origin === origin;
}
|
|
659
|
+
/**
 * Shorten `value` to at most `max` UTF-16 units, marking the cut with an
 * ellipsis. The cut never lands between the two halves of a surrogate
 * pair, so no lone surrogate reaches the written file.
 *
 * @param {string} value Text to shorten.
 * @param {number} max Maximum length including the ellipsis.
 * @returns {string} `value` unchanged when it fits, otherwise a prefix plus `…`.
 */
function truncate(value, max) {
    if (value.length <= max) {
        return value;
    }
    // Reserve one unit for the ellipsis; never slice from the end for max <= 0.
    let cut = Math.max(0, max - 1);
    if (cut > 0) {
        const before = value.charCodeAt(cut - 1);
        const after = value.charCodeAt(cut);
        const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
        if (splitsPair) {
            cut -= 1;
        }
    }
    return `${value.slice(0, cut)}…`;
}
|
|
@@ -81,8 +81,7 @@ function findAppDir(startDir) {
|
|
|
81
81
|
let current = path.resolve(startDir);
|
|
82
82
|
const root = path.parse(current).root;
|
|
83
83
|
while (current !== root) {
|
|
84
|
-
if ((0, node_fs_1.existsSync)(path.join(current, 'nuxt.config.ts')) ||
|
|
85
|
-
(0, node_fs_1.existsSync)(path.join(current, 'nuxt.config.js'))) {
|
|
84
|
+
if ((0, node_fs_1.existsSync)(path.join(current, 'nuxt.config.ts')) || (0, node_fs_1.existsSync)(path.join(current, 'nuxt.config.js'))) {
|
|
86
85
|
return current;
|
|
87
86
|
}
|
|
88
87
|
current = path.dirname(current);
|