@lenne.tech/cli 1.12.0 → 1.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,661 @@
1
+ "use strict";
2
// TypeScript-emitted helper that runs a generator function as an asynchronous
// function. It reuses an `__awaiter` already present on `this`, otherwise it
// defines one.
//
// thisArg / _arguments: receiver and arguments applied to `generator`.
// P: promise constructor to use; falls back to the global `Promise`.
// generator: generator function whose yielded values are awaited.
// Returns a promise that resolves with the generator's return value and
// rejects with any error thrown while stepping it.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap a yielded value in a `P` promise unless it already is one.
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        // Resume the generator with the settled value of the awaited promise.
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        // Throw the rejection reason into the generator so its try/catch can see it.
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        // Finish when the generator is done, otherwise wait for the yielded value.
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        // Instantiate the generator and run it up to its first yield.
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
11
// Interop helper for `require`d modules: modules flagged with `__esModule`
// are returned untouched, anything else is wrapped as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
14
+ Object.defineProperty(exports, "__esModule", { value: true });
15
+ exports.crawlSite = crawlSite;
16
+ /**
17
+ * Website crawler utilities.
18
+ *
19
+ * Fetches web pages (optionally guided by sitemap.xml), extracts the
20
+ * main content using the same defuddle + Turndown pipeline as the
21
+ * chrome-md browser extension (see ../../../chrome-md/content/content.js),
22
+ * converts it to Markdown, and writes one .md file per page plus an
23
+ * overview README when multiple pages are discovered. Designed for
24
+ * building Claude Code knowledge bases.
25
+ */
26
+ const axios_1 = __importDefault(require("axios"));
27
+ const crypto_1 = require("crypto");
28
+ const defuddle_1 = __importDefault(require("defuddle"));
29
+ const fs_1 = require("fs");
30
+ const jsdom_1 = require("jsdom");
31
+ const path_1 = require("path");
32
+ const turndown_1 = __importDefault(require("turndown"));
33
+ const turndown_plugin_gfm_1 = require("turndown-plugin-gfm");
34
+ const browser_fetcher_1 = require("./browser-fetcher");
35
+ const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; lenneTech-CLI-Crawler/1.0; +https://lenne.tech)';
36
/**
 * Crawl a website starting at `options.url` and write the collected
 * pages as Markdown files beneath `options.outDir`.
 *
 * Pages are fetched by a small worker pool and rendered into memory first;
 * files are only written once crawling has finished. A single rendered page
 * is written directly into `outDir`; several pages go to `outDir/pages/`
 * together with an overview `outDir/README.md`. Images are stored once per
 * content hash under `outDir/images/`.
 *
 * @param options Crawl settings. Defaults: `concurrency` 4, `maxPages` 200,
 *   `timeout` 20000 ms, `includeImages` / `includeSitemap` true, and
 *   `prune` / `renderJs` / `autoInstallBrowser` false. `depth` is a number
 *   or `'all'`; `'all'` and negative numbers remove the depth limit.
 *   `maxPages` bounds the number of rendered pages plus failed fetches.
 * @returns `{ errors, indexFile, outDir, pages, pruned, skipped }`.
 *   `indexFile` is only set for multi-page crawls; when nothing could be
 *   rendered the result has no `indexFile` and an empty `pruned` list.
 */
async function crawlSite(options) {
    const { autoInstallBrowser = false, concurrency = 4, depth: rawDepth, includeImages = true, includeSitemap = true, maxPages = 200, onLog = () => undefined, outDir, prune = false, renderJs = false, selector, timeout = 20000, url: seedUrl, userAgent = DEFAULT_USER_AGENT, } = options;
    // Normalize depth. `'all'` and negative numbers mean "follow every
    // same-origin link we find" — bounded by `maxPages`.
    const depth = rawDepth === 'all' || (typeof rawDepth === 'number' && rawDepth < 0) ? Number.POSITIVE_INFINITY : Number(rawDepth);
    const workerCount = Math.max(1, concurrency);
    const http = axios_1.default.create({
        headers: { 'User-Agent': userAgent },
        maxRedirects: 5,
        responseType: 'text',
        timeout,
        validateStatus: (status) => status >= 200 && status < 400,
    });
    // Headless browser only spun up when needed (SPA-mode).
    let browserFetcher = null;
    if (renderJs) {
        browserFetcher = await (0, browser_fetcher_1.createBrowserFetcher)({
            autoInstall: autoInstallBrowser,
            extraWaitMs: 500,
            maxWaitMs: timeout,
            onLog,
            userAgent,
        });
    }
    try {
        const seed = new URL(seedUrl);
        const origin = seed.origin;
        if (!(0, fs_1.existsSync)(outDir)) {
            (0, fs_1.mkdirSync)(outDir, { recursive: true });
        }
        // Queue preserves the depth at which a URL was discovered so children
        // are only followed when `discovered.depth < options.depth`.
        const normalizedSeed = normalizeUrl(seedUrl);
        const queue = [{ depth: 0, url: normalizedSeed }];
        const seen = new Set([normalizedSeed]);
        if (includeSitemap) {
            onLog(`Checking sitemap at ${origin}/sitemap.xml`);
            const sitemapUrls = await fetchSitemapUrls(http, origin, onLog);
            for (const sitemapUrl of sitemapUrls) {
                const normalized = normalizeUrl(sitemapUrl);
                if (!seen.has(normalized) && sameOrigin(normalized, origin)) {
                    seen.add(normalized);
                    queue.push({ depth: 0, url: normalized });
                }
            }
            if (sitemapUrls.length > 0) {
                onLog(`Sitemap discovered ${sitemapUrls.length} URLs`);
            }
        }
        const pages = [];
        const errors = [];
        const skipped = [];
        // Shared deduplicated image map (content hash -> relative path under outDir).
        const imageHashToPath = new Map();
        // We can't know upfront whether the crawl is single- or multi-page,
        // so we render pages into a buffer first and only materialize files
        // once the queue drains.
        const rendered = [];
        // `pages` stays empty until files are written, so the page budget is
        // tracked through `rendered` plus the fetches currently in flight.
        let inFlight = 0;
        const budgetExhausted = () => rendered.length + errors.length + inFlight >= maxPages;
        const processPage = async (item) => {
            if (budgetExhausted()) {
                skipped.push(item.url);
                return;
            }
            inFlight += 1;
            onLog(`Fetching (depth ${item.depth}): ${item.url}`);
            try {
                let html;
                const requestedUrl = normalizeUrl(item.url);
                let finalUrl = requestedUrl;
                if (browserFetcher) {
                    // In render mode we trust the URL we navigated to. We can't
                    // cheaply detect redirects here, so assume same origin (the
                    // crawler already filtered non-HTML URLs out of the queue).
                    html = await browserFetcher.fetch(item.url);
                }
                else {
                    const response = await http.get(item.url);
                    const request = response.request;
                    const rawResponse = request == null ? undefined : request.res;
                    const responseUrl = rawResponse == null ? undefined : rawResponse.responseUrl;
                    finalUrl = normalizeUrl(responseUrl || item.url);
                    if (!sameOrigin(finalUrl, origin)) {
                        skipped.push(item.url);
                        return;
                    }
                    const contentType = String(response.headers['content-type'] || '');
                    if (!contentType.includes('text/html') && !contentType.includes('application/xhtml')) {
                        skipped.push(item.url);
                        return;
                    }
                    if (finalUrl !== requestedUrl) {
                        // Redirected: the target is either already queued/processed
                        // under its own URL, or we claim it now so a second redirect
                        // to the same page does not render it twice.
                        if (seen.has(finalUrl)) {
                            skipped.push(item.url);
                            return;
                        }
                        seen.add(finalUrl);
                    }
                    html = String(response.data || '');
                }
                const extracted = await extractContent(html, finalUrl, { selector });
                // Follow links when depth budget is left.
                if (item.depth < depth) {
                    for (const link of extracted.links) {
                        if (!sameOrigin(link, origin))
                            continue;
                        const normalized = normalizeUrl(link);
                        if (seen.has(normalized))
                            continue;
                        seen.add(normalized);
                        queue.push({ depth: item.depth + 1, url: normalized });
                    }
                }
                // Download images and build a URL -> local path map for Turndown.
                const imageEntries = [];
                if (includeImages && extracted.images.length > 0) {
                    for (const imgUrl of extracted.images) {
                        try {
                            const absolute = new URL(imgUrl, finalUrl).href;
                            const result = await fetchImage(http, absolute);
                            if (!result)
                                continue;
                            const hash = (0, crypto_1.createHash)('sha1').update(result.buffer).digest('hex');
                            let relativeImagePath = imageHashToPath.get(hash);
                            if (!relativeImagePath) {
                                // Filename uses a content-hash suffix so re-runs with
                                // identical bytes overwrite the same file instead of
                                // leaving orphans with rotating counter suffixes.
                                const filename = buildImageFilename(absolute, hash, result.contentType);
                                relativeImagePath = `images/${filename}`;
                                imageHashToPath.set(hash, relativeImagePath);
                                imageEntries.push({ data: result.buffer, filename });
                            }
                            extracted.imageMap.set(imgUrl, relativeImagePath);
                            extracted.imageMap.set(absolute, relativeImagePath);
                        }
                        catch (_imageError) {
                            // Skip image on error; continue with others.
                        }
                    }
                }
                const markdown = convertToMarkdown(extracted.contentHtml, finalUrl, extracted.imageMap);
                const filename = buildPageFilename(finalUrl, rendered.length === 0);
                const now = new Date().toISOString();
                rendered.push({
                    filename,
                    images: imageEntries,
                    info: {
                        author: extracted.meta.author,
                        depth: item.depth,
                        description: extracted.meta.description,
                        downloadDate: now,
                        firstDownloaded: now,
                        imageCount: imageEntries.length,
                        language: extracted.meta.language,
                        ogImage: extracted.meta.ogImage,
                        title: extracted.meta.title,
                        url: finalUrl,
                        wordCount: extracted.meta.wordCount || countWords(extracted.contentText),
                    },
                    markdown,
                });
            }
            catch (error) {
                errors.push({
                    reason: error instanceof Error ? error.message : String(error),
                    url: item.url,
                });
            }
            finally {
                inFlight -= 1;
            }
        };
        // Simple parallel worker pool. `queue` grows as pages are discovered,
        // so workers pick new items until nothing is left.
        let cursor = 0;
        const worker = async () => {
            while (cursor < queue.length && !budgetExhausted()) {
                const item = queue[cursor++];
                await processPage(item);
            }
        };
        const runWorkers = () => Promise.all(Array.from({ length: workerCount }, () => worker()));
        await runWorkers();
        // Drain any late discoveries added after all initial workers exited.
        // The budget check keeps this loop from spinning once `maxPages` is
        // reached while URLs are still queued.
        while (cursor < queue.length && !budgetExhausted()) {
            await runWorkers();
        }
        if (cursor < queue.length) {
            const leftover = queue.length - cursor;
            for (; cursor < queue.length; cursor++) {
                skipped.push(queue[cursor].url);
            }
            onLog(`Reached maxPages limit (${maxPages}); skipped ${leftover} queued URL(s)`);
        }
        const multiPage = rendered.length > 1;
        const pagesDir = multiPage ? (0, path_1.join)(outDir, 'pages') : outDir;
        const imagesDir = (0, path_1.join)(outDir, 'images');
        if (rendered.length === 0) {
            onLog('No pages rendered');
            return { errors, outDir, pages, pruned: [], skipped };
        }
        (0, fs_1.mkdirSync)(pagesDir, { recursive: true });
        if (includeImages && imageHashToPath.size > 0) {
            (0, fs_1.mkdirSync)(imagesDir, { recursive: true });
        }
        // Write deduplicated images.
        const writtenImageFilenames = new Set();
        for (const entry of rendered.flatMap((r) => r.images)) {
            if (writtenImageFilenames.has(entry.filename))
                continue;
            writtenImageFilenames.add(entry.filename);
            (0, fs_1.writeFileSync)((0, path_1.join)(imagesDir, entry.filename), entry.data);
        }
        // Persist pages. When updating, preserve the original
        // `first_downloaded` timestamp so history stays intact.
        for (const entry of rendered) {
            const outputPath = (0, path_1.join)(pagesDir, entry.filename);
            const relativePath = (0, path_1.relative)(outDir, outputPath);
            // Images live under `<outDir>/images/`. Each page rewrites the
            // Turndown-emitted `images/<file>` placeholder to the correct
            // relative path so nested URL slugs (`pages/ueber-uns/…`, or a
            // single-page crawl that lands in `<outDir>/ueber-uns/…`) still
            // render in Markdown previews.
            const imagePrefix = `${(0, path_1.relative)((0, path_1.dirname)(outputPath), imagesDir).split(/[\\/]/).join('/')}/`;
            const fixedMarkdown = entry.markdown.replace(/\]\(images\//g, `](${imagePrefix}`);
            if ((0, fs_1.existsSync)(outputPath)) {
                const existing = (0, fs_1.readFileSync)(outputPath, 'utf8');
                const existingMeta = parseFrontmatter(existing);
                if (existingMeta && existingMeta.first_downloaded) {
                    entry.info.firstDownloaded = String(existingMeta.first_downloaded);
                }
            }
            const frontmatter = renderFrontmatter(entry.info);
            (0, fs_1.mkdirSync)((0, path_1.dirname)(outputPath), { recursive: true });
            (0, fs_1.writeFileSync)(outputPath, `${frontmatter}\n${fixedMarkdown.trim()}\n`);
            pages.push(Object.assign({}, entry.info, { outputPath, relativePath }));
        }
        let indexFile;
        if (multiPage) {
            indexFile = (0, path_1.join)(outDir, 'README.md');
            (0, fs_1.writeFileSync)(indexFile, renderOverview(seed.href, pages));
        }
        // Prune orphans (files left over from previous crawls). Scoped to
        // `pages/` and `images/` so stray user files in outDir root never
        // get touched. Only active in multi-page mode — a single-page
        // crawl writes into `outDir` itself and has no page subfolder to
        // sweep.
        const pruned = [];
        if (prune && multiPage) {
            const keep = new Set(pages.map((p) => p.outputPath));
            for (const entry of rendered.flatMap((r) => r.images)) {
                keep.add((0, path_1.join)(imagesDir, entry.filename));
            }
            pruned.push(...pruneOrphans(pagesDir, keep));
            if ((0, fs_1.existsSync)(imagesDir)) {
                pruned.push(...pruneOrphans(imagesDir, keep));
            }
            if (pruned.length > 0) {
                onLog(`Pruned ${pruned.length} orphaned file(s)`);
            }
        }
        return { errors, indexFile, outDir, pages, pruned, skipped };
    }
    finally {
        // Guarantee the headless browser is shut down on every exit path,
        // including thrown errors, so no orphan chromium processes linger.
        if (browserFetcher) {
            await browserFetcher.close().catch(() => undefined);
        }
    }
}
289
/**
 * Build the on-disk filename for a downloaded image:
 * `<sanitized-basename>-<first 8 hash chars>.<extension>`.
 *
 * The extension comes from the URL path when it is a known image extension,
 * otherwise from the `Content-Type` header, otherwise it defaults to `png`.
 *
 * @param url Absolute image URL; an unparsable URL yields the basename `image`.
 * @param contentHash Hex digest of the image bytes.
 * @param contentType Raw `Content-Type` header value; may be empty or missing.
 * @returns Filename without any directory part.
 */
function buildImageFilename(url, contentHash, contentType) {
    let basename = 'image';
    let extension = '';
    try {
        const last = new URL(url).pathname.split('/').filter(Boolean).pop() || '';
        const rawExt = (0, path_1.extname)(last);
        const parsedExt = rawExt.replace('.', '').toLowerCase();
        if (parsedExt && /^(jpg|jpeg|png|gif|webp|svg|avif)$/.test(parsedExt)) {
            extension = parsedExt;
        }
        // Strip only the trailing extension; a string replace would remove the
        // first occurrence and mangle names such as `x.jpg.thumb.jpg`.
        const stem = rawExt ? last.slice(0, last.length - rawExt.length) : last;
        basename = stem.replace(/[^a-zA-Z0-9-_]/g, '_').substring(0, 40) || 'image';
    }
    catch (_err) {
        // fall through with the defaults
    }
    if (!extension) {
        // Media types are case-insensitive and may carry parameters or padding.
        const mediaType = String(contentType || '').split(';')[0].trim().toLowerCase();
        const fromType = mediaType.split('/')[1];
        if (fromType && /^(jpeg|jpg|png|gif|webp|svg\+xml|avif)$/.test(fromType)) {
            extension = fromType === 'svg+xml' ? 'svg' : fromType;
        }
        else {
            extension = 'png';
        }
    }
    return `${basename}-${contentHash.slice(0, 8)}.${extension}`;
}
319
/**
 * Derive the relative Markdown filename for a page URL.
 *
 * The root path maps to `index.md` for the first rendered page and to
 * `home.md` otherwise. Every other path segment is lower-cased, loses a
 * trailing `.html`/`.htm`/`.php`/`.asp`/`.aspx`, and has every character
 * outside `a-z0-9-_` turned into a dash; segments that end up empty
 * become `page`. The query string is ignored.
 */
function buildPageFilename(url, isFirst) {
    const pathParts = new URL(url).pathname.split('/').filter(Boolean);
    if (pathParts.length === 0) {
        if (isFirst) {
            return 'index.md';
        }
        return 'home.md';
    }
    const slugs = [];
    for (const part of pathParts) {
        let slug = part.toLowerCase();
        slug = slug.replace(/\.(html?|php|aspx?)$/, '');
        slug = slug.replace(/[^a-z0-9-_]/g, '-');
        slug = slug.replace(/-+/g, '-');
        slug = slug.replace(/^-|-$/g, '');
        slugs.push(slug || 'page');
    }
    return `${slugs.join('/')}.md`;
}
335
/**
 * Convert extracted content HTML to Markdown with Turndown (plus the GFM
 * plugin when it is available).
 *
 * Links are rewritten to absolute URLs resolved against `baseUrl`. Images
 * point at the local path found in `imageMap` (looked up by the raw `src`
 * and by its absolute form), falling back to the absolute remote URL.
 * `script`, `style`, `noscript` and `iframe` elements are dropped, and runs
 * of three or more newlines are collapsed to a single blank line.
 *
 * @param html Content HTML to convert.
 * @param baseUrl URL used to resolve relative `href`/`src` values.
 * @param imageMap Map from image URL to the local relative image path.
 * @returns Trimmed Markdown.
 */
function convertToMarkdown(html, baseUrl, imageMap) {
    // Nodes handed to a rule are expected to be elements; fall back to ''
    // when `getAttribute` is missing or the attribute is not set.
    const readAttribute = (node, name) => (typeof node.getAttribute === 'function' ? node.getAttribute(name) : null) || '';
    // A line break inside `[text](url "title")` or `![alt](src)` splits the
    // inline construct, so fold multi-line attribute values onto one line.
    const singleLine = (text) => text.replace(/\s*[\r\n]+\s*/g, ' ');
    const toAbsolute = (reference) => {
        try {
            return new URL(reference, baseUrl).href;
        }
        catch (_err) {
            // keep original
            return reference;
        }
    };
    const turndown = new turndown_1.default({
        bulletListMarker: '-',
        codeBlockStyle: 'fenced',
        emDelimiter: '*',
        headingStyle: 'atx',
        linkStyle: 'inlined',
        strongDelimiter: '**',
    });
    // Enable GFM so tables, strikethrough and task lists convert cleanly.
    if (turndown_plugin_gfm_1.gfm) {
        turndown.use(turndown_plugin_gfm_1.gfm);
    }
    turndown.addRule('absoluteLinks', {
        filter: 'a',
        replacement: (content, node) => {
            const href = readAttribute(node, 'href');
            if (!href || href === '#' || href.startsWith('javascript:')) {
                return content;
            }
            const absolute = toAbsolute(href);
            // The title is emitted inside double quotes, so embedded quotes
            // must be escaped or they terminate the title early.
            const title = singleLine(readAttribute(node, 'title')).replace(/"/g, '\\"');
            return title ? `[${content}](${absolute} "${title}")` : `[${content}](${absolute})`;
        },
    });
    turndown.addRule('localImages', {
        filter: 'img',
        replacement: (_content, node) => {
            const src = readAttribute(node, 'src');
            if (!src)
                return '';
            const absolute = toAbsolute(src);
            const local = imageMap.get(src) || imageMap.get(absolute);
            const alt = singleLine(readAttribute(node, 'alt'));
            return `![${alt}](${local || absolute})`;
        },
    });
    turndown.remove(['script', 'style', 'noscript', 'iframe']);
    const markdown = turndown.turndown(html);
    return markdown.replace(/\n{3,}/g, '\n\n').trim();
}
391
/**
 * Count whitespace-separated words in `text`; blank input counts as zero.
 */
function countWords(text) {
    const collapsed = text.replace(/\s+/g, ' ').trim();
    let total = 0;
    for (const token of collapsed.split(' ')) {
        if (token) {
            total += 1;
        }
    }
    return total;
}
394
/**
 * Escape a value for use inside a YAML double-quoted scalar.
 *
 * Backslashes and double quotes are backslash-escaped. Every line break
 * (`\r\n`, `\r` or `\n`) becomes a single space so the scalar stays on one
 * line. `null`/`undefined` yield an empty string; other values are
 * stringified first.
 */
function escapeYaml(value) {
    const text = value == null ? '' : String(value);
    return text
        .replace(/\\/g, '\\\\')
        .replace(/"/g, '\\"')
        .replace(/\r\n|\r|\n/g, ' ');
}
397
/**
 * Extract main content + metadata using defuddle (the same engine as
 * chrome-md). Falls back to a raw body dump if defuddle fails.
 *
 * Resolves to `{ contentHtml, contentText, imageMap, images, links, meta }`:
 * `links` and `images` are de-duplicated absolute URLs resolved against
 * `pageUrl`, and `imageMap` starts out empty for the caller to fill.
 */
async function extractContent(html, pageUrl, options) {
    const sourceDoc = new jsdom_1.JSDOM(html, { url: pageUrl }).window.document;
    const defuddleOptions = {
        markdown: false,
        removeHiddenElements: true,
        removeLowScoring: true,
        removeSmallImages: false,
    };
    if (options.selector) {
        defuddleOptions.contentSelector = options.selector;
    }
    const bodyHtml = () => (sourceDoc.body == null ? undefined : sourceDoc.body.innerHTML);
    let parsed;
    try {
        // Same class-based API as chrome-md's content script.
        parsed = new defuddle_1.default(sourceDoc, defuddleOptions).parse();
    }
    catch (_err) {
        parsed = {
            content: bodyHtml() || html,
            title: sourceDoc.title,
        };
    }
    const contentHtml = parsed.content || bodyHtml() || '';
    // Collect images and links from the cleaned content.
    const contentDoc = new jsdom_1.JSDOM(`<!DOCTYPE html><html><body>${contentHtml}</body></html>`, {
        url: pageUrl,
    }).window.document;
    // Resolve `reference` against the page URL and record it; malformed URLs
    // are ignored.
    const addResolved = (bucket, reference) => {
        try {
            bucket.add(new URL(reference, pageUrl).href);
        }
        catch (_err) {
            // ignore malformed URLs
        }
    };
    const links = new Set();
    for (const anchor of contentDoc.querySelectorAll('a[href]')) {
        const href = (anchor.getAttribute('href') || '').trim();
        const unusable = !href || href.startsWith('#') || href.startsWith('mailto:') || href.startsWith('javascript:');
        if (!unusable) {
            addResolved(links, href);
        }
    }
    const images = new Set();
    for (const img of contentDoc.querySelectorAll('img')) {
        const src = (img.getAttribute('src') || img.getAttribute('data-src') || '').trim();
        if (src && !src.startsWith('data:')) {
            addResolved(images, src);
        }
    }
    // Some lazy-loading frameworks keep the real URL only in the source
    // document (stripped out by defuddle), so also consult the original DOM.
    for (const img of sourceDoc.querySelectorAll('img[data-src], img[data-lazy-src]')) {
        const src = (img.getAttribute('data-src') || img.getAttribute('data-lazy-src') || '').trim();
        if (src && !src.startsWith('data:')) {
            addResolved(images, src);
        }
    }
    // `content` attribute of the first element matching `selector`, if any.
    const metaContent = (selector) => {
        const element = sourceDoc.querySelector(selector);
        return element == null ? undefined : element.getAttribute('content');
    };
    const meta = {
        author: parsed.author || metaContent('meta[name="author"]') || undefined,
        description: parsed.description ||
            metaContent('meta[name="description"]') ||
            metaContent('meta[property="og:description"]') ||
            '',
        language: parsed.language || sourceDoc.documentElement.getAttribute('lang') || undefined,
        ogImage: parsed.image || metaContent('meta[property="og:image"]') || undefined,
        title: parsed.title || sourceDoc.title || pageUrl,
        wordCount: parsed.wordCount,
    };
    const contentBody = contentDoc.body;
    return {
        contentHtml,
        contentText: (contentBody == null ? undefined : contentBody.textContent) || '',
        imageMap: new Map(),
        images: [...images],
        links: [...links],
        meta,
    };
}
492
/**
 * Download one image through the shared HTTP client.
 *
 * Resolves to `{ buffer, contentType }`, or to `null` when the request
 * fails or the response body is empty. Never rejects.
 */
async function fetchImage(http, url) {
    try {
        const { data, headers } = await http.get(url, { responseType: 'arraybuffer' });
        const buffer = Buffer.from(data);
        return buffer.byteLength > 0
            ? { buffer, contentType: String(headers['content-type'] || '') }
            : null;
    }
    catch (_err) {
        return null;
    }
}
506
/**
 * Collect page URLs from `<origin>/sitemap.xml`, following nested sitemap
 * indexes. Each sitemap URL is fetched at most once.
 *
 * `<loc>` values are XML text, so the predefined entities and numeric
 * character references are decoded (`&amp;` -> `&`) and CDATA-wrapped
 * locations are accepted.
 *
 * @param http Client exposing `get(url)` that resolves to `{ data }`.
 * @param origin Site origin without a trailing slash.
 * @param onLog Receives one message per sitemap that could not be fetched.
 * @returns Page URLs in discovery order (nested sitemaps first); fetch
 *   failures are logged and otherwise ignored.
 */
async function fetchSitemapUrls(http, origin, onLog) {
    const urls = [];
    const visited = new Set();
    const namedEntities = { amp: '&', apos: "'", gt: '>', lt: '<', quot: '"' };
    // Single-pass decode so `&amp;lt;` becomes `&lt;`, not `<`.
    const decodeXml = (text) => text.replace(/&(#x[0-9a-f]+|#\d+|amp|apos|gt|lt|quot);/gi, (whole, name) => {
        const key = name.toLowerCase();
        if (key[0] !== '#') {
            return namedEntities[key];
        }
        const code = key[1] === 'x' ? Number.parseInt(key.slice(2), 16) : Number.parseInt(key.slice(1), 10);
        return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : whole;
    });
    // Return the <loc> value of every <tag>…</tag> element in `xml`.
    const extractLocs = (xml, tag) => {
        const pattern = new RegExp(`<${tag}>[\\s\\S]*?<loc>\\s*(?:<!\\[CDATA\\[\\s*([^\\s]+?)\\s*\\]\\]>|([^<\\s]+))\\s*<\\/loc>[\\s\\S]*?<\\/${tag}>`, 'gi');
        // Group 1 is CDATA content (taken verbatim), group 2 is escaped text.
        return [...xml.matchAll(pattern)].map((m) => (m[1] !== undefined ? m[1] : decodeXml(m[2])));
    };
    async function walk(sitemapUrl) {
        if (visited.has(sitemapUrl))
            return;
        visited.add(sitemapUrl);
        try {
            const response = await http.get(sitemapUrl);
            const xml = String(response.data || '');
            // Nested sitemap index: follow each <sitemap><loc>...</loc></sitemap>.
            for (const child of extractLocs(xml, 'sitemap')) {
                await walk(child);
            }
            urls.push(...extractLocs(xml, 'url'));
        }
        catch (error) {
            onLog(`Sitemap fetch failed for ${sitemapUrl}: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    await walk(`${origin}/sitemap.xml`);
    return urls;
}
535
/**
 * Canonicalize a URL for de-duplication: the fragment is dropped, a
 * trailing `/index.html` or `/index.htm` collapses to `/`, and trailing
 * slashes are removed everywhere except on the root path. Input that does
 * not parse as an absolute URL is returned unchanged.
 */
function normalizeUrl(raw) {
    try {
        const parsed = new URL(raw);
        parsed.hash = '';
        parsed.pathname = parsed.pathname.replace(/\/index\.html?$/i, '/');
        const currentPath = parsed.pathname;
        const hasTrailingSlash = currentPath.endsWith('/') && currentPath.length > 1;
        if (hasTrailingSlash) {
            parsed.pathname = currentPath.replace(/\/+$/, '');
        }
        return parsed.href;
    }
    catch (_err) {
        return raw;
    }
}
553
/**
 * Parse the leading `---` frontmatter block of a Markdown file into a flat
 * key/value object.
 *
 * Only simple `key: value` lines are recognized; one pair of surrounding
 * double quotes is removed and every value stays a string. Both LF and CRLF
 * line endings are accepted, so files rewritten with Windows line endings
 * keep their metadata.
 *
 * @param markdown Full file content.
 * @returns The parsed pairs, or `null` when there is no frontmatter block
 *   (missing opening or closing `---`).
 */
function parseFrontmatter(markdown) {
    if (typeof markdown !== 'string' || !markdown.startsWith('---'))
        return null;
    const end = markdown.indexOf('\n---', 3);
    if (end === -1)
        return null;
    const result = {};
    for (const rawLine of markdown.slice(3, end).split('\n')) {
        // `.` never matches `\r`, so a CRLF line would fail the pattern below.
        const line = rawLine.replace(/\r$/, '');
        const match = line.match(/^([a-zA-Z0-9_]+):\s*(.*)$/);
        if (!match)
            continue;
        result[match[1]] = match[2].replace(/^"(.*)"$/, '$1');
    }
    return result;
}
569
/**
 * Recursively delete every regular file below `rootDir` whose joined path
 * is not contained in `keepPaths`, then remove sub-directories that ended
 * up empty. File-system errors are ignored. A missing `rootDir` yields an
 * empty result.
 *
 * @returns The paths of the files that were actually deleted.
 */
function pruneOrphans(rootDir, keepPaths) {
    if (!(0, fs_1.existsSync)(rootDir)) {
        return [];
    }
    const deleted = [];
    for (const dirent of (0, fs_1.readdirSync)(rootDir, { withFileTypes: true })) {
        const target = (0, path_1.join)(rootDir, dirent.name);
        if (dirent.isDirectory()) {
            for (const removedPath of pruneOrphans(target, keepPaths)) {
                deleted.push(removedPath);
            }
            // Remove directory if now empty.
            try {
                const remaining = (0, fs_1.readdirSync)(target);
                if (remaining.length === 0) {
                    (0, fs_1.rmdirSync)(target);
                }
            }
            catch (_dirError) {
                // Directory not empty or already gone — ignore.
            }
            continue;
        }
        if (!dirent.isFile() || keepPaths.has(target)) {
            continue;
        }
        try {
            (0, fs_1.unlinkSync)(target);
            deleted.push(target);
        }
        catch (_fileError) {
            // File already removed or permission denied — skip.
        }
    }
    return deleted;
}
605
/**
 * Render the YAML frontmatter block (including both `---` fences) for a
 * crawled page.
 *
 * `description`, `author`, `language`, `og_image` and `image_count` are only
 * emitted when the corresponding value is truthy, and the description is
 * capped at 500 characters. Free-text values and the source URL are escaped
 * for double-quoted YAML scalars; a URL may legally contain a backslash in
 * its query string, which would otherwise start a YAML escape sequence.
 *
 * @param info Page metadata; `info.url` must be an absolute URL.
 * @returns Frontmatter text without a trailing newline.
 */
function renderFrontmatter(info) {
    const quoted = (value) => `"${escapeYaml(value)}"`;
    const lines = [
        '---',
        `title: ${quoted(info.title)}`,
        `source_url: ${quoted(info.url)}`,
        `source_domain: "${new URL(info.url).hostname}"`,
        `crawl_depth: ${info.depth}`,
        `download_date: "${info.downloadDate}"`,
        `first_downloaded: "${info.firstDownloaded}"`,
        info.description ? `description: ${quoted(truncate(info.description, 500))}` : null,
        info.author ? `author: ${quoted(info.author)}` : null,
        info.language ? `language: ${quoted(info.language)}` : null,
        info.ogImage ? `og_image: ${quoted(info.ogImage)}` : null,
        info.imageCount ? `image_count: ${info.imageCount}` : null,
        `word_count: ${info.wordCount}`,
        'content_type: "webpage"',
        '---',
    ].filter((line) => line !== null);
    return lines.join('\n');
}
625
/**
 * Render the overview `README.md` for a multi-page crawl: a header with
 * source, generation time and page count, followed by one section per page
 * (sorted by URL) linking to its Markdown file.
 *
 * Titles are used as Markdown link text, so square brackets and backslashes
 * are escaped and line breaks are folded; descriptions are folded onto a
 * single line (capped at 240 characters) so they remain one list item.
 *
 * @param startUrl Absolute seed URL of the crawl.
 * @param pages Page records with `title`, `url`, `relativePath`,
 *   `description` and `downloadDate`.
 * @returns Markdown text ending with a newline.
 */
function renderOverview(startUrl, pages) {
    const oneLine = (text) => String(text).replace(/\s+/g, ' ').trim();
    const linkText = (text) => oneLine(text).replace(/[\[\]\\]/g, '\\$&');
    const ordered = [...pages].sort((a, b) => a.url.localeCompare(b.url));
    const host = new URL(startUrl).host;
    const lines = [
        `# ${host} — Knowledge Base`,
        '',
        `Source: ${startUrl}`,
        '',
        `Generated: ${new Date().toISOString()}`,
        '',
        `Pages: ${ordered.length}`,
        '',
        '## Pages',
        '',
    ];
    for (const page of ordered) {
        // Always link with forward slashes, also for paths built on Windows.
        const target = page.relativePath.split(/[\\/]/).join('/');
        lines.push(`### [${linkText(page.title)}](${target})`);
        lines.push('');
        lines.push(`- URL: ${page.url}`);
        if (page.description) {
            lines.push(`- ${truncate(oneLine(page.description), 240)}`);
        }
        lines.push(`- Updated: ${page.downloadDate}`);
        lines.push('');
    }
    return lines.join('\n');
}
651
/**
 * Report whether `url` parses as an absolute URL whose origin equals
 * `origin`. Unparsable input is treated as a different origin.
 */
function sameOrigin(url, origin) {
    let candidate;
    try {
        candidate = new URL(url);
    }
    catch (_err) {
        return false;
    }
    return candidate.origin === origin;
}
659
/**
 * Shorten `value` to at most `max` UTF-16 code units, ending with `…` when
 * text was cut.
 *
 * A non-positive `max` yields an empty string; a negative slice end would
 * otherwise keep almost the whole input. The cut never leaves half of a
 * surrogate pair in front of the ellipsis.
 */
function truncate(value, max) {
    if (value.length <= max) {
        return value;
    }
    if (max <= 0) {
        return '';
    }
    let head = value.slice(0, max - 1);
    const lastUnit = head.charCodeAt(head.length - 1);
    // A trailing high surrogate means the cut landed inside an astral character.
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        head = head.slice(0, -1);
    }
    return `${head}…`;
}
@@ -117,6 +117,7 @@ Flags:
117
117
  - `--frontend-framework-mode npm|vendor` — Frontend mode
118
118
  - `--framework-upstream-branch <tag>` — Specific nest-server version for vendor
119
119
  - `--dry-run` — Show plan without making changes
120
+ - `--next` — **Experimental:** clone [`nest-base`](https://github.com/lenneTech/nest-base) (Bun + Prisma 7 + Postgres + Better-Auth) for the API instead of `nest-server-starter`. Forces `--api-mode Rest`, `--framework-mode npm`, and skips workspace install (run `pnpm install` for app and `bun install` for api manually). Downstream `lt server module/object/addProp/test/permissions` are NOT compatible with the resulting layout.
120
121
 
121
122
  ---
122
123