@lenne.tech/cli 1.12.0 → 1.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/commands/config/validate.js +36 -2
- package/build/commands/tools/crawl.js +307 -0
- package/build/lib/browser-fetcher.js +139 -0
- package/build/lib/crawler.js +661 -0
- package/docs/commands.md +52 -0
- package/docs/lt.config.md +37 -0
- package/package.json +8 -1
|
@@ -114,6 +114,22 @@ const KNOWN_KEYS = {
|
|
|
114
114
|
path: 'string',
|
|
115
115
|
},
|
|
116
116
|
},
|
|
117
|
+
tools: {
|
|
118
|
+
crawl: {
|
|
119
|
+
concurrency: 'number',
|
|
120
|
+
depth: 'number|all',
|
|
121
|
+
includeImages: 'boolean',
|
|
122
|
+
includeSitemap: 'boolean',
|
|
123
|
+
maxPages: 'number',
|
|
124
|
+
noConfirm: 'boolean',
|
|
125
|
+
out: 'string',
|
|
126
|
+
prune: 'boolean',
|
|
127
|
+
renderJs: 'boolean',
|
|
128
|
+
selector: 'string',
|
|
129
|
+
timeout: 'number',
|
|
130
|
+
},
|
|
131
|
+
noConfirm: 'boolean',
|
|
132
|
+
},
|
|
117
133
|
typescript: {
|
|
118
134
|
create: { author: 'string', noConfirm: 'boolean', updatePackages: 'boolean' },
|
|
119
135
|
},
|
|
@@ -165,8 +181,26 @@ function validateConfig(config, knownKeys, path = '') {
|
|
|
165
181
|
}
|
|
166
182
|
// Validate type
|
|
167
183
|
if (typeof expectedType === 'string') {
|
|
168
|
-
// Simple type check
|
|
169
|
-
if (expectedType
|
|
184
|
+
// Simple type check. `'a|b'` means union (e.g. "number|all").
|
|
185
|
+
if (expectedType.includes('|')) {
|
|
186
|
+
const tokens = expectedType.split('|').map((t) => t.trim());
|
|
187
|
+
const ok = tokens.some((token) => {
|
|
188
|
+
if (token === 'string')
|
|
189
|
+
return typeof value === 'string';
|
|
190
|
+
if (token === 'number')
|
|
191
|
+
return typeof value === 'number';
|
|
192
|
+
if (token === 'boolean')
|
|
193
|
+
return typeof value === 'boolean';
|
|
194
|
+
if (token === 'array')
|
|
195
|
+
return Array.isArray(value);
|
|
196
|
+
// Everything else is treated as a string literal enum member.
|
|
197
|
+
return value === token;
|
|
198
|
+
});
|
|
199
|
+
if (!ok) {
|
|
200
|
+
result.errors.push(`${currentPath}: expected ${tokens.join(' | ')}, got ${typeof value}`);
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
else if (expectedType === 'string' && typeof value !== 'string') {
|
|
170
204
|
result.errors.push(`${currentPath}: expected string, got ${typeof value}`);
|
|
171
205
|
}
|
|
172
206
|
else if (expectedType === 'boolean' && typeof value !== 'boolean') {
|
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Async-function driver. Reuses an `__awaiter` already present on `this`,
// otherwise defines one. It runs `generator` (a generator function) to
// completion and returns a promise for the generator's return value.
//   thisArg    - `this` value the generator function is applied with
//   _arguments - arguments the generator function is applied with (defaults to [])
//   P          - promise constructor to use; falls back to the global Promise
//   generator  - generator function whose `yield`s act as await points
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap a yielded value in a `P` promise unless it already is one.
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        // Resume the generator with a fulfilled value; anything it throws rejects the outer promise.
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        // Throw a rejection into the generator so its own try/catch can handle it.
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        // Either finish with the generator's return value or wait on the next yielded value.
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        // `generator` is reassigned from the generator function to the iterator it produces.
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
const path_1 = require("path");
|
|
13
|
+
const crawler_1 = require("../../lib/crawler");
|
|
14
|
+
/**
|
|
15
|
+
* Crawl a website (optionally following same-origin links up to a
|
|
16
|
+
* configurable depth) and store the content as Markdown files for use
|
|
17
|
+
* as a Claude Code knowledge base. Inspired by ../../../../chrome-md:
|
|
18
|
+
* shares the defuddle + Turndown extraction pipeline but runs headless
|
|
19
|
+
* from Node and follows links / sitemaps automatically.
|
|
20
|
+
*/
|
|
21
|
+
const NewCommand = {
    alias: ['cr'],
    description: 'Crawl site to Markdown',
    hidden: false,
    name: 'crawl',
    /**
     * Resolve the crawl settings (CLI flag > config file > default), ask for
     * confirmation unless `noConfirm` is active, run `crawlSite` and print a
     * summary of written pages, pruned files, skipped URLs and errors.
     *
     * @param toolbox Toolbox providing `config`, `filesystem`, `helper`,
     *   `parameters`, `print`, `prompt` and `tools`.
     * @returns `'crawl'` when only the JSON help was requested,
     *   `'crawl cancelled'` when the confirmation was declined, `undefined`
     *   when no usable URL was given or the crawl failed, otherwise
     *   `crawled N pages`. Unless `fromGluegunMenu` is set the process exits
     *   after a successful crawl, before the summary string is returned.
     */
    run: (toolbox) => __awaiter(void 0, void 0, void 0, function* () {
        var _a, _b, _c, _d, _e;
        const { config, filesystem, helper, parameters, print: { error, info, spin, success, warning }, prompt: { confirm }, tools, } = toolbox;
        if (tools.helpJson({
            aliases: ['cr'],
            description: 'Crawl a website into Markdown files (for Claude Code knowledge bases)',
            name: 'crawl',
            options: [
                {
                    description: 'Start URL (absolute http/https URL)',
                    flag: '--url',
                    required: true,
                    type: 'string',
                },
                {
                    default: '.',
                    description: 'Output directory (created if missing)',
                    flag: '--out',
                    type: 'string',
                },
                {
                    default: 0,
                    description: 'Link depth. 0 = only start page; 1 = + direct links; N = up to N hops; "all" (or -1) = follow every same-origin link until --max-pages is reached',
                    flag: '--depth',
                    type: 'number|all',
                },
                {
                    default: true,
                    description: 'Download images and inline them with local paths',
                    flag: '--images',
                    type: 'boolean',
                },
                {
                    default: true,
                    description: 'Also seed queue from <origin>/sitemap.xml',
                    flag: '--sitemap',
                    type: 'boolean',
                },
                {
                    default: 4,
                    description: 'Parallel HTTP requests',
                    flag: '--concurrency',
                    type: 'number',
                },
                {
                    default: 200,
                    description: 'Maximum number of pages to crawl (safety cap)',
                    flag: '--max-pages',
                    type: 'number',
                },
                {
                    description: 'CSS selector for the main content container',
                    flag: '--selector',
                    type: 'string',
                },
                {
                    default: 20000,
                    description: 'HTTP request timeout in ms',
                    flag: '--timeout',
                    type: 'number',
                },
                {
                    default: false,
                    description: 'Shortcut for --depth all (follows every same-origin link until --max-pages)',
                    flag: '--all',
                    type: 'boolean',
                },
                {
                    default: true,
                    description: "Render pages through a headless browser before extracting (for SPAs like Vue/Nuxt/React/Angular). Uses playwright-core with system Chrome / Edge, falling back to Playwright's bundled chromium. Disable with --no-render for plain HTTP fetches.",
                    flag: '--render',
                    type: 'boolean',
                },
                {
                    default: false,
                    description: 'If --render cannot find any browser, auto-install Playwright chromium (one-time ~170 MB download).',
                    flag: '--install-browser',
                    type: 'boolean',
                },
                {
                    default: true,
                    description: 'After a multi-page crawl, remove any .md or image files inside <outDir>/pages and <outDir>/images that were not written by this run. Disable with --no-prune to preserve old files.',
                    flag: '--prune',
                    type: 'boolean',
                },
                {
                    default: false,
                    description: 'Skip confirmation prompts',
                    flag: '--noConfirm',
                    type: 'boolean',
                },
            ],
        })) {
            return 'crawl';
        }
        tools.nonInteractiveHint('lt tools crawl <url> --out <dir> --depth 1 --noConfirm');
        const ltConfig = config.loadConfig();
        const commandConfig = (_b = (_a = ltConfig === null || ltConfig === void 0 ? void 0 : ltConfig.commands) === null || _a === void 0 ? void 0 : _a.tools) === null || _b === void 0 ? void 0 : _b.crawl;
        // Numeric options arrive as arbitrary CLI / config values. A NaN makes
        // every later numeric comparison false, and a page cap below 1 would
        // never let a page be processed, so non-finite values fall back to the
        // default and everything else is floored and clamped to `min`.
        const toBoundedInt = (value, fallback, min) => {
            const parsed = Number(value);
            if (!Number.isFinite(parsed)) {
                return fallback;
            }
            return Math.max(min, Math.floor(parsed));
        };
        // URL: positional argument > --url > interactive prompt.
        const urlInput = parameters.first ||
            parameters.options.url ||
            (yield helper.getInput(undefined, { name: 'Website URL', showError: false }));
        // A bare `--url` flag without a value is parsed as boolean `true`;
        // treat that like a missing URL instead of crashing on `.trim()`.
        if (!urlInput || urlInput === true) {
            error('No URL provided');
            return;
        }
        // String() guards against a numeric positional argument.
        const url = normalizeSeedUrl(String(urlInput));
        try {
            new URL(url);
        }
        catch (_f) {
            error(`Invalid URL: ${urlInput}`);
            return;
        }
        const depthRaw = config.getValue({
            // `--all` is a convenience shortcut for `--depth all`. It wins
            // over a numeric `--depth` so users can combine both.
            cliValue: parameters.options.all === true ? 'all' : parameters.options.depth,
            configValue: commandConfig === null || commandConfig === void 0 ? void 0 : commandConfig.depth,
            defaultValue: 0,
        });
        const depth = parseDepth(depthRaw);
        // For the default-on booleans only an explicit `false` counts as a CLI
        // override; otherwise the config file / default decides.
        const includeImages = config.getValue({
            cliValue: parameters.options.images === false ? false : undefined,
            configValue: commandConfig === null || commandConfig === void 0 ? void 0 : commandConfig.includeImages,
            defaultValue: true,
        });
        const includeSitemap = config.getValue({
            cliValue: parameters.options.sitemap === false ? false : undefined,
            configValue: commandConfig === null || commandConfig === void 0 ? void 0 : commandConfig.includeSitemap,
            defaultValue: true,
        });
        const concurrency = toBoundedInt(config.getValue({
            cliValue: parameters.options.concurrency,
            configValue: commandConfig === null || commandConfig === void 0 ? void 0 : commandConfig.concurrency,
            defaultValue: 4,
        }), 4, 1);
        const maxPages = toBoundedInt(config.getValue({
            cliValue: (_c = parameters.options.maxPages) !== null && _c !== void 0 ? _c : parameters.options['max-pages'],
            configValue: commandConfig === null || commandConfig === void 0 ? void 0 : commandConfig.maxPages,
            defaultValue: 200,
        }), 200, 1);
        const timeout = toBoundedInt(config.getValue({
            cliValue: parameters.options.timeout,
            configValue: commandConfig === null || commandConfig === void 0 ? void 0 : commandConfig.timeout,
            defaultValue: 20000,
        }), 20000, 0);
        const selector = config.getValue({
            cliValue: parameters.options.selector,
            configValue: commandConfig === null || commandConfig === void 0 ? void 0 : commandConfig.selector,
        });
        // `--render` and `--prune` default ON — the common case is a
        // full SPA-aware knowledge-base crawl that stays in sync on
        // updates. `--no-render` / `--no-prune` opt out explicitly.
        const renderJs = config.getValue({
            cliValue: parameters.options.render === false ? false : undefined,
            configValue: commandConfig === null || commandConfig === void 0 ? void 0 : commandConfig.renderJs,
            defaultValue: true,
        });
        const installBrowser = parameters.options['install-browser'] === true || parameters.options.installBrowser === true;
        const pruneOrphans = config.getValue({
            cliValue: parameters.options.prune === false ? false : undefined,
            configValue: commandConfig === null || commandConfig === void 0 ? void 0 : commandConfig.prune,
            defaultValue: true,
        });
        const outDir = (0, path_1.resolve)(config.getValue({
            cliValue: (_d = parameters.options.out) !== null && _d !== void 0 ? _d : parameters.options.output,
            configValue: commandConfig === null || commandConfig === void 0 ? void 0 : commandConfig.out,
            defaultValue: filesystem.cwd(),
        }) || filesystem.cwd());
        const noConfirm = config.getNoConfirm({
            cliValue: parameters.options.noConfirm,
            commandConfig,
            config: ltConfig,
            parentConfig: (_e = ltConfig === null || ltConfig === void 0 ? void 0 : ltConfig.commands) === null || _e === void 0 ? void 0 : _e.tools,
        });
        // Echo the effective settings before asking for confirmation.
        info('');
        info(`Crawling: ${url}`);
        info(`Output: ${outDir}`);
        info(`Depth: ${depth === 'all' ? 'all (bounded by --max-pages)' : depth}`);
        info(`Sitemap: ${includeSitemap ? 'yes' : 'no'}`);
        info(`Images: ${includeImages ? 'yes' : 'no'}`);
        info(`Parallel: ${concurrency}`);
        info(`Max: ${maxPages} pages`);
        info(`Render: ${renderJs ? 'yes (headless browser)' : 'no (raw HTTP)'}`);
        info(`Prune: ${pruneOrphans ? 'yes (remove orphaned pages/images)' : 'no'}`);
        if (selector)
            info(`Selector: ${selector}`);
        info('');
        if (!noConfirm && !(yield confirm('Start crawl?'))) {
            return 'crawl cancelled';
        }
        const spinner = spin('Crawling...');
        // A failed crawl is reported through the spinner / error printer and
        // mapped to `null` so the summary below is skipped.
        const result = yield (0, crawler_1.crawlSite)({
            autoInstallBrowser: installBrowser,
            concurrency,
            depth,
            includeImages,
            includeSitemap,
            maxPages,
            onLog: (msg) => {
                spinner.text = msg;
            },
            outDir,
            prune: pruneOrphans,
            renderJs,
            selector,
            timeout,
            url,
        }).catch((err) => {
            spinner.fail('Crawl failed');
            error(err.message);
            return null;
        });
        if (!result) {
            return;
        }
        spinner.succeed(`Crawl complete: ${result.pages.length} page(s)`);
        info('');
        if (result.indexFile) {
            success(`Overview: ${result.indexFile}`);
        }
        for (const page of result.pages.slice(0, 10)) {
            info(` - ${page.relativePath} (${page.url})`);
        }
        if (result.pages.length > 10) {
            info(` ... and ${result.pages.length - 10} more`);
        }
        if (result.pruned.length > 0) {
            info(`Pruned ${result.pruned.length} orphaned file(s)`);
            for (const path of result.pruned.slice(0, 5)) {
                info(` - ${path}`);
            }
            if (result.pruned.length > 5)
                info(` ... and ${result.pruned.length - 5} more`);
        }
        if (result.skipped.length > 0) {
            warning(`Skipped ${result.skipped.length} URL(s) (non-HTML or foreign origin)`);
        }
        if (result.errors.length > 0) {
            warning(`${result.errors.length} error(s):`);
            for (const err of result.errors.slice(0, 5)) {
                warning(` - ${err.url}: ${err.reason}`);
            }
            // Same truncation hint as the page and pruned lists above.
            if (result.errors.length > 5) {
                warning(` ... and ${result.errors.length - 5} more`);
            }
        }
        if (!toolbox.parameters.options.fromGluegunMenu) {
            process.exit();
        }
        return `crawled ${result.pages.length} pages`;
    }),
};
|
|
277
|
+
/**
 * Turn user input into a crawlable seed URL: surrounding whitespace is
 * removed and `https://` is prepended unless the input already starts
 * with an `http://` or `https://` scheme (matched case-insensitively).
 */
function normalizeSeedUrl(raw) {
    const candidate = raw.trim();
    const hasHttpScheme = /^https?:\/\//i.test(candidate);
    return hasHttpScheme ? candidate : `https://${candidate}`;
}
|
|
283
|
+
/**
 * Parse the --depth parameter.
 *
 * - `undefined` / `null` -> `0`
 * - the string "all" (any case, surrounding whitespace ignored) -> `'all'`
 * - negative numbers or numeric strings -> `'all'`
 * - numeric `Infinity` / `-Infinity` -> `'all'` (unbounded)
 * - other finite numbers or numeric strings -> floored integer
 * - anything else (`NaN`, non-numeric strings, booleans, objects) -> `0`,
 *   so the crawl still runs against the seed URL only.
 *
 * @param raw Value from the CLI flag or the config file.
 * @returns A non-negative integer depth or the string `'all'`.
 */
function parseDepth(raw) {
    if (raw === undefined || raw === null) {
        return 0;
    }
    let value;
    if (typeof raw === 'string') {
        const normalized = raw.trim().toLowerCase();
        if (normalized === 'all') {
            return 'all';
        }
        value = Number(normalized);
        if (!Number.isFinite(value)) {
            return 0;
        }
    }
    else if (typeof raw === 'number') {
        // NaN is an invalid value, not a request for an unbounded crawl.
        if (Number.isNaN(raw)) {
            return 0;
        }
        if (!Number.isFinite(raw)) {
            return 'all';
        }
        value = raw;
    }
    else {
        return 0;
    }
    return value < 0 ? 'all' : Math.floor(value);
}
|
|
307
|
+
exports.default = NewCommand;
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Async-function driver. Reuses an `__awaiter` already present on `this`,
// otherwise defines one. It runs `generator` (a generator function) to
// completion and returns a promise for the generator's return value.
//   thisArg    - `this` value the generator function is applied with
//   _arguments - arguments the generator function is applied with (defaults to [])
//   P          - promise constructor to use; falls back to the global Promise
//   generator  - generator function whose `yield`s act as await points
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap a yielded value in a `P` promise unless it already is one.
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        // Resume the generator with a fulfilled value; anything it throws rejects the outer promise.
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        // Throw a rejection into the generator so its own try/catch can handle it.
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        // Either finish with the generator's return value or wait on the next yielded value.
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        // `generator` is reassigned from the generator function to the iterator it produces.
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.createBrowserFetcher = createBrowserFetcher;
|
|
13
|
+
/**
|
|
14
|
+
* Headless-browser HTML fetcher for single-page applications.
|
|
15
|
+
*
|
|
16
|
+
* Mirrors the chrome-md content script's PageReadyDetector:
|
|
17
|
+
* waits for the network to settle, then returns the fully hydrated
|
|
18
|
+
* HTML so Defuddle can extract the real content instead of the
|
|
19
|
+
* pre-render shell.
|
|
20
|
+
*
|
|
21
|
+
* Uses `playwright-core` with a three-tier strategy:
|
|
22
|
+
* 1. System Chrome / Edge via `channel: 'chrome' | 'msedge'`.
|
|
23
|
+
* 2. Playwright's own bundled Chromium (if already installed).
|
|
24
|
+
* 3. Auto-install Playwright's Chromium (`npx playwright install
|
|
25
|
+
* chromium`) and retry — opt-in via `autoInstall`.
|
|
26
|
+
*/
|
|
27
|
+
const child_process_1 = require("child_process");
|
|
28
|
+
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; lenneTech-CLI-Crawler/1.0; +https://lenne.tech)';
|
|
29
|
+
/**
 * Build a browser fetcher by walking an ordered list of launch tiers:
 * system Chrome, system Edge, then Playwright's bundled Chromium. The
 * first tier that launches wins and is announced through `options.onLog`.
 * When every tier fails and `options.autoInstall` is set, Chromium is
 * installed through `npx playwright install chromium` and the bundled
 * tier is tried once more. If nothing works, the thrown error lists the
 * failure reason of every attempt plus remediation hints.
 */
function createBrowserFetcher() {
    return __awaiter(this, arguments, void 0, function* (options = {}) {
        const log = options.onLog || (() => undefined);
        const reasons = [];
        const { chromium } = require('playwright-core');
        // Ordered launch attempts; `tag` prefixes the failure reason of a tier.
        const tiers = [
            { engine: 'system-chrome', launchOptions: { channel: 'chrome' }, tag: 'channel:chrome' },
            { engine: 'system-edge', launchOptions: { channel: 'msedge' }, tag: 'channel:msedge' },
            { engine: 'playwright-chromium', launchOptions: {}, tag: 'playwright-chromium' },
        ];
        for (const tier of tiers) {
            let fetcher = null;
            try {
                fetcher = yield launch(chromium, tier.launchOptions, options, tier.engine);
            }
            catch (launchError) {
                reasons.push(`${tier.tag}: ${launchError.message}`);
            }
            if (fetcher) {
                log(`Browser engine: ${fetcher.engine}`);
                return fetcher;
            }
        }
        // Optional last resort: install Playwright's chromium, then retry it.
        if (options.autoInstall) {
            log('No browser available — installing Playwright chromium (one-time download, ~170 MB)…');
            try {
                yield runNpx(['playwright', 'install', 'chromium']);
                const installed = yield launch(chromium, {}, options, 'playwright-chromium');
                if (installed) {
                    log(`Browser engine: ${installed.engine}`);
                    return installed;
                }
            }
            catch (installError) {
                reasons.push(`auto-install: ${installError instanceof Error ? installError.message : String(installError)}`);
            }
        }
        const report = [
            'Could not start a headless browser for SPA rendering.',
            ...reasons.map((r) => ` - ${r}`),
            '',
            'Fix one of these:',
            ' 1. Install Google Chrome or Microsoft Edge (Playwright picks them up automatically).',
            ' 2. Install Playwright browsers manually: `npx playwright install chromium`.',
            ' 3. Re-run with --install-browser to let the CLI install them.',
        ];
        throw new Error(report.join('\n'));
    });
}
|
|
92
|
+
/**
 * Launch a headless browser through `chromium.launch` and wrap it in a
 * fetcher object.
 *
 * @param chromium Object exposing `launch(options)`; in this file it is
 *   the `chromium` export of `playwright-core`.
 * @param launchOptions Extra launch options (e.g. `{ channel: 'chrome' }`);
 *   `headless: true` is always forced on top of them.
 * @param options Fetcher options. Reads `userAgent` (falls back to
 *   `DEFAULT_USER_AGENT`), `maxWaitMs` (navigation timeout, default 20000)
 *   and `extraWaitMs` (optional extra wait after navigation).
 * @param engineLabel Label exposed as `engine` on the returned fetcher.
 * @returns Promise of `{ close, engine, fetch }`. `fetch(url)` resolves
 *   with the page HTML after navigation; `close()` closes the context and
 *   the browser.
 * @throws Whatever `chromium.launch` or `browser.newContext` rejects with.
 */
function launch(chromium, launchOptions, options, engineLabel) {
    return __awaiter(this, void 0, void 0, function* () {
        const browser = yield chromium.launch(Object.assign(Object.assign({}, launchOptions), { headless: true }));
        let context;
        try {
            context = yield browser.newContext({
                userAgent: options.userAgent || DEFAULT_USER_AGENT,
            });
        }
        catch (contextError) {
            // Do not leave the freshly launched browser running when the
            // context cannot be created; the original failure is what the
            // caller needs to see, so a close error is ignored here.
            try {
                yield browser.close();
            }
            catch (_closeError) {
                // Best effort only.
            }
            throw contextError;
        }
        return {
            close: () => __awaiter(this, void 0, void 0, function* () {
                // Always close the browser, even when closing the context fails.
                try {
                    yield context.close();
                }
                finally {
                    yield browser.close();
                }
            }),
            engine: engineLabel,
            fetch: (url) => __awaiter(this, void 0, void 0, function* () {
                var _a;
                const page = yield context.newPage();
                try {
                    yield page.goto(url, {
                        timeout: (_a = options.maxWaitMs) !== null && _a !== void 0 ? _a : 20000,
                        waitUntil: 'networkidle',
                    });
                    if (options.extraWaitMs) {
                        yield page.waitForTimeout(options.extraWaitMs);
                    }
                    return yield page.content();
                }
                finally {
                    // One page per fetch; release it whether or not navigation worked.
                    yield page.close();
                }
            }),
        };
    });
}
|
|
124
|
+
/**
 * Run an `npx` command, streaming its output to the current stdio.
 *
 * @param args Arguments passed to `npx`. They must be trusted constants:
 *   on Windows the command runs through a shell, which does not escape
 *   them.
 * @returns Promise that resolves on exit code 0 and rejects when the
 *   process cannot be spawned, exits with another code, or is terminated
 *   by a signal.
 */
function runNpx(args) {
    return new Promise((resolve, reject) => {
        // On Windows `npx` is a `.cmd` shim, which child_process.spawn can
        // only start through a shell; elsewhere the binary runs directly.
        const useShell = process.platform === 'win32';
        const child = (0, child_process_1.spawn)('npx', args, { shell: useShell, stdio: 'inherit' });
        child.on('error', reject);
        child.on('exit', (code, signal) => {
            if (code === 0) {
                resolve();
                return;
            }
            // `code` is null when the child was killed by a signal.
            const outcome = code === null ? `was terminated by signal ${signal}` : `exited with code ${code}`;
            reject(new Error(`npx ${args.join(' ')} ${outcome}`));
        });
    });
}
|
|
@@ -0,0 +1,661 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Async-function driver. Reuses an `__awaiter` already present on `this`,
// otherwise defines one. It runs `generator` (a generator function) to
// completion and returns a promise for the generator's return value.
//   thisArg    - `this` value the generator function is applied with
//   _arguments - arguments the generator function is applied with (defaults to [])
//   P          - promise constructor to use; falls back to the global Promise
//   generator  - generator function whose `yield`s act as await points
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap a yielded value in a `P` promise unless it already is one.
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        // Resume the generator with a fulfilled value; anything it throws rejects the outer promise.
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        // Throw a rejection into the generator so its own try/catch can handle it.
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        // Either finish with the generator's return value or wait on the next yielded value.
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        // `generator` is reassigned from the generator function to the iterator it produces.
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
|
|
11
|
+
// Default-import interop helper. Reuses an `__importDefault` already present
// on `this`, otherwise defines one. A module object flagged with
// `__esModule` is returned unchanged; any other value is wrapped as
// `{ default: mod }` so callers can always read `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
|
|
14
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
+
exports.crawlSite = crawlSite;
|
|
16
|
+
/**
|
|
17
|
+
* Website crawler utilities.
|
|
18
|
+
*
|
|
19
|
+
* Fetches web pages (optionally guided by sitemap.xml), extracts the
|
|
20
|
+
* main content using the same defuddle + Turndown pipeline as the
|
|
21
|
+
* chrome-md browser extension (see ../../../chrome-md/content/content.js),
|
|
22
|
+
* converts it to Markdown, and writes one .md file per page plus an
|
|
23
|
+
* overview README when multiple pages are discovered. Designed for
|
|
24
|
+
* building Claude Code knowledge bases.
|
|
25
|
+
*/
|
|
26
|
+
const axios_1 = __importDefault(require("axios"));
|
|
27
|
+
const crypto_1 = require("crypto");
|
|
28
|
+
const defuddle_1 = __importDefault(require("defuddle"));
|
|
29
|
+
const fs_1 = require("fs");
|
|
30
|
+
const jsdom_1 = require("jsdom");
|
|
31
|
+
const path_1 = require("path");
|
|
32
|
+
const turndown_1 = __importDefault(require("turndown"));
|
|
33
|
+
const turndown_plugin_gfm_1 = require("turndown-plugin-gfm");
|
|
34
|
+
const browser_fetcher_1 = require("./browser-fetcher");
|
|
35
|
+
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; lenneTech-CLI-Crawler/1.0; +https://lenne.tech)';
|
|
36
|
+
/**
|
|
37
|
+
* Crawl a website starting at `options.url` and write the collected
|
|
38
|
+
* pages as Markdown files beneath `options.outDir`.
|
|
39
|
+
*/
|
|
40
|
+
function crawlSite(options) {
|
|
41
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
42
|
+
const { autoInstallBrowser = false, concurrency = 4, depth: rawDepth, includeImages = true, includeSitemap = true, maxPages = 200, onLog = () => undefined, outDir, prune = false, renderJs = false, selector, timeout = 20000, url: seedUrl, userAgent = DEFAULT_USER_AGENT, } = options;
|
|
43
|
+
// Normalize depth. `'all'` and negative numbers mean "follow every
|
|
44
|
+
// same-origin link we find" — bounded by `maxPages`.
|
|
45
|
+
const depth = rawDepth === 'all' || (typeof rawDepth === 'number' && rawDepth < 0) ? Number.POSITIVE_INFINITY : Number(rawDepth);
|
|
46
|
+
const http = axios_1.default.create({
|
|
47
|
+
headers: { 'User-Agent': userAgent },
|
|
48
|
+
maxRedirects: 5,
|
|
49
|
+
responseType: 'text',
|
|
50
|
+
timeout,
|
|
51
|
+
validateStatus: (status) => status >= 200 && status < 400,
|
|
52
|
+
});
|
|
53
|
+
// Headless browser only spun up when needed (SPA-mode).
|
|
54
|
+
let browserFetcher = null;
|
|
55
|
+
if (renderJs) {
|
|
56
|
+
browserFetcher = yield (0, browser_fetcher_1.createBrowserFetcher)({
|
|
57
|
+
autoInstall: autoInstallBrowser,
|
|
58
|
+
extraWaitMs: 500,
|
|
59
|
+
maxWaitMs: timeout,
|
|
60
|
+
onLog,
|
|
61
|
+
userAgent,
|
|
62
|
+
});
|
|
63
|
+
}
|
|
64
|
+
try {
|
|
65
|
+
const seed = new URL(seedUrl);
|
|
66
|
+
const origin = seed.origin;
|
|
67
|
+
if (!(0, fs_1.existsSync)(outDir)) {
|
|
68
|
+
(0, fs_1.mkdirSync)(outDir, { recursive: true });
|
|
69
|
+
}
|
|
70
|
+
// Queue preserves the depth at which a URL was discovered so children
|
|
71
|
+
// are only followed when `discovered.depth < options.depth`.
|
|
72
|
+
const queue = [{ depth: 0, url: normalizeUrl(seedUrl) }];
|
|
73
|
+
const seen = new Set([normalizeUrl(seedUrl)]);
|
|
74
|
+
if (includeSitemap) {
|
|
75
|
+
onLog(`Checking sitemap at ${origin}/sitemap.xml`);
|
|
76
|
+
const sitemapUrls = yield fetchSitemapUrls(http, origin, onLog);
|
|
77
|
+
for (const sitemapUrl of sitemapUrls) {
|
|
78
|
+
const normalized = normalizeUrl(sitemapUrl);
|
|
79
|
+
if (!seen.has(normalized) && sameOrigin(normalized, origin)) {
|
|
80
|
+
seen.add(normalized);
|
|
81
|
+
queue.push({ depth: 0, url: normalized });
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
if (sitemapUrls.length > 0) {
|
|
85
|
+
onLog(`Sitemap discovered ${sitemapUrls.length} URLs`);
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
const pages = [];
|
|
89
|
+
const errors = [];
|
|
90
|
+
const skipped = [];
|
|
91
|
+
// Shared deduplicated image map (content hash -> relative path under outDir).
|
|
92
|
+
const imageHashToPath = new Map();
|
|
93
|
+
// We can't know upfront whether the crawl is single- or multi-page,
|
|
94
|
+
// so we render pages into a buffer first and only materialize files
|
|
95
|
+
// once the queue drains.
|
|
96
|
+
const rendered = [];
|
|
97
|
+
const processPage = (item) => __awaiter(this, void 0, void 0, function* () {
|
|
98
|
+
var _a, _b;
|
|
99
|
+
if (pages.length + errors.length >= maxPages) {
|
|
100
|
+
skipped.push(item.url);
|
|
101
|
+
return;
|
|
102
|
+
}
|
|
103
|
+
onLog(`Fetching (depth ${item.depth}): ${item.url}`);
|
|
104
|
+
try {
|
|
105
|
+
let html;
|
|
106
|
+
let finalUrl = normalizeUrl(item.url);
|
|
107
|
+
if (browserFetcher) {
|
|
108
|
+
// In render mode we trust the URL we navigated to. We can't
|
|
109
|
+
// cheaply detect redirects here, so assume same origin (the
|
|
110
|
+
// crawler already filtered non-HTML URLs out of the queue).
|
|
111
|
+
html = yield browserFetcher.fetch(item.url);
|
|
112
|
+
}
|
|
113
|
+
else {
|
|
114
|
+
const response = yield http.get(item.url);
|
|
115
|
+
finalUrl = normalizeUrl(((_b = (_a = response.request) === null || _a === void 0 ? void 0 : _a.res) === null || _b === void 0 ? void 0 : _b.responseUrl) || item.url);
|
|
116
|
+
if (!sameOrigin(finalUrl, origin)) {
|
|
117
|
+
skipped.push(item.url);
|
|
118
|
+
return;
|
|
119
|
+
}
|
|
120
|
+
const contentType = String(response.headers['content-type'] || '');
|
|
121
|
+
if (!contentType.includes('text/html') && !contentType.includes('application/xhtml')) {
|
|
122
|
+
skipped.push(item.url);
|
|
123
|
+
return;
|
|
124
|
+
}
|
|
125
|
+
html = String(response.data || '');
|
|
126
|
+
}
|
|
127
|
+
const extracted = yield extractContent(html, finalUrl, { selector });
|
|
128
|
+
// Follow links when depth budget is left.
|
|
129
|
+
if (item.depth < depth) {
|
|
130
|
+
for (const link of extracted.links) {
|
|
131
|
+
if (!sameOrigin(link, origin))
|
|
132
|
+
continue;
|
|
133
|
+
const normalized = normalizeUrl(link);
|
|
134
|
+
if (seen.has(normalized))
|
|
135
|
+
continue;
|
|
136
|
+
seen.add(normalized);
|
|
137
|
+
queue.push({ depth: item.depth + 1, url: normalized });
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
// Download images and build a URL -> local path map for Turndown.
|
|
141
|
+
const imageEntries = [];
|
|
142
|
+
if (includeImages && extracted.images.length > 0) {
|
|
143
|
+
for (const imgUrl of extracted.images) {
|
|
144
|
+
try {
|
|
145
|
+
const absolute = new URL(imgUrl, finalUrl).href;
|
|
146
|
+
const result = yield fetchImage(http, absolute);
|
|
147
|
+
if (!result)
|
|
148
|
+
continue;
|
|
149
|
+
const hash = (0, crypto_1.createHash)('sha1').update(result.buffer).digest('hex');
|
|
150
|
+
let relativeImagePath = imageHashToPath.get(hash);
|
|
151
|
+
if (!relativeImagePath) {
|
|
152
|
+
// Filename uses a content-hash suffix so re-runs with
|
|
153
|
+
// identical bytes overwrite the same file instead of
|
|
154
|
+
// leaving orphans with rotating counter suffixes.
|
|
155
|
+
const filename = buildImageFilename(absolute, hash, result.contentType);
|
|
156
|
+
relativeImagePath = `images/${filename}`;
|
|
157
|
+
imageHashToPath.set(hash, relativeImagePath);
|
|
158
|
+
imageEntries.push({ data: result.buffer, filename });
|
|
159
|
+
}
|
|
160
|
+
extracted.imageMap.set(imgUrl, relativeImagePath);
|
|
161
|
+
extracted.imageMap.set(absolute, relativeImagePath);
|
|
162
|
+
}
|
|
163
|
+
catch (_c) {
|
|
164
|
+
// Skip image on error; continue with others.
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
const markdown = convertToMarkdown(extracted.contentHtml, finalUrl, extracted.imageMap);
|
|
169
|
+
const filename = buildPageFilename(finalUrl, rendered.length === 0);
|
|
170
|
+
rendered.push({
|
|
171
|
+
filename,
|
|
172
|
+
images: imageEntries,
|
|
173
|
+
info: {
|
|
174
|
+
author: extracted.meta.author,
|
|
175
|
+
depth: item.depth,
|
|
176
|
+
description: extracted.meta.description,
|
|
177
|
+
downloadDate: new Date().toISOString(),
|
|
178
|
+
firstDownloaded: new Date().toISOString(),
|
|
179
|
+
imageCount: imageEntries.length,
|
|
180
|
+
language: extracted.meta.language,
|
|
181
|
+
ogImage: extracted.meta.ogImage,
|
|
182
|
+
title: extracted.meta.title,
|
|
183
|
+
url: finalUrl,
|
|
184
|
+
wordCount: extracted.meta.wordCount || countWords(extracted.contentText),
|
|
185
|
+
},
|
|
186
|
+
markdown,
|
|
187
|
+
});
|
|
188
|
+
}
|
|
189
|
+
catch (error) {
|
|
190
|
+
errors.push({
|
|
191
|
+
reason: error instanceof Error ? error.message : String(error),
|
|
192
|
+
url: item.url,
|
|
193
|
+
});
|
|
194
|
+
}
|
|
195
|
+
});
|
|
196
|
+
// Simple parallel worker pool. `queue` grows as pages are discovered,
|
|
197
|
+
// so workers pick new items until nothing is left.
|
|
198
|
+
let cursor = 0;
|
|
199
|
+
const worker = () => __awaiter(this, void 0, void 0, function* () {
|
|
200
|
+
while (cursor < queue.length && pages.length + errors.length < maxPages) {
|
|
201
|
+
const item = queue[cursor++];
|
|
202
|
+
yield processPage(item);
|
|
203
|
+
}
|
|
204
|
+
});
|
|
205
|
+
const workers = Array.from({ length: Math.max(1, concurrency) }, () => worker());
|
|
206
|
+
yield Promise.all(workers);
|
|
207
|
+
// Drain any late discoveries added after all initial workers exited.
|
|
208
|
+
while (cursor < queue.length) {
|
|
209
|
+
yield Promise.all(Array.from({ length: Math.max(1, concurrency) }, () => worker()));
|
|
210
|
+
}
|
|
211
|
+
const multiPage = rendered.length > 1;
|
|
212
|
+
const pagesDir = multiPage ? (0, path_1.join)(outDir, 'pages') : outDir;
|
|
213
|
+
const imagesDir = (0, path_1.join)(outDir, 'images');
|
|
214
|
+
if (rendered.length === 0) {
|
|
215
|
+
onLog('No pages rendered');
|
|
216
|
+
return { errors, outDir, pages, pruned: [], skipped };
|
|
217
|
+
}
|
|
218
|
+
(0, fs_1.mkdirSync)(pagesDir, { recursive: true });
|
|
219
|
+
if (includeImages && imageHashToPath.size > 0) {
|
|
220
|
+
(0, fs_1.mkdirSync)(imagesDir, { recursive: true });
|
|
221
|
+
}
|
|
222
|
+
// Write deduplicated images.
|
|
223
|
+
const writtenImageFilenames = new Set();
|
|
224
|
+
for (const entry of rendered.flatMap((r) => r.images)) {
|
|
225
|
+
if (writtenImageFilenames.has(entry.filename))
|
|
226
|
+
continue;
|
|
227
|
+
writtenImageFilenames.add(entry.filename);
|
|
228
|
+
(0, fs_1.writeFileSync)((0, path_1.join)(imagesDir, entry.filename), entry.data);
|
|
229
|
+
}
|
|
230
|
+
// Persist pages. When updating, preserve the original
|
|
231
|
+
// `first_downloaded` timestamp so history stays intact.
|
|
232
|
+
for (const entry of rendered) {
|
|
233
|
+
const outputPath = (0, path_1.join)(pagesDir, entry.filename);
|
|
234
|
+
const relativePath = (0, path_1.relative)(outDir, outputPath);
|
|
235
|
+
// Images live under `<outDir>/images/`. Each page rewrites the
|
|
236
|
+
// Turndown-emitted `images/<file>` placeholder to the correct
|
|
237
|
+
// relative path so nested URL slugs (`pages/ueber-uns/…`, or a
|
|
238
|
+
// single-page crawl that lands in `<outDir>/ueber-uns/…`) still
|
|
239
|
+
// render in Markdown previews.
|
|
240
|
+
const imagePrefix = `${(0, path_1.relative)((0, path_1.dirname)(outputPath), imagesDir).split(/[\\/]/).join('/')}/`;
|
|
241
|
+
const fixedMarkdown = entry.markdown.replace(/\]\(images\//g, `](${imagePrefix}`);
|
|
242
|
+
if ((0, fs_1.existsSync)(outputPath)) {
|
|
243
|
+
const existing = (0, fs_1.readFileSync)(outputPath, 'utf8');
|
|
244
|
+
const existingMeta = parseFrontmatter(existing);
|
|
245
|
+
if (existingMeta === null || existingMeta === void 0 ? void 0 : existingMeta.first_downloaded) {
|
|
246
|
+
entry.info.firstDownloaded = String(existingMeta.first_downloaded);
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
const frontmatter = renderFrontmatter(entry.info);
|
|
250
|
+
(0, fs_1.mkdirSync)((0, path_1.dirname)(outputPath), { recursive: true });
|
|
251
|
+
(0, fs_1.writeFileSync)(outputPath, `${frontmatter}\n${fixedMarkdown.trim()}\n`);
|
|
252
|
+
pages.push(Object.assign(Object.assign({}, entry.info), { outputPath, relativePath }));
|
|
253
|
+
}
|
|
254
|
+
let indexFile;
|
|
255
|
+
if (multiPage) {
|
|
256
|
+
indexFile = (0, path_1.join)(outDir, 'README.md');
|
|
257
|
+
(0, fs_1.writeFileSync)(indexFile, renderOverview(seed.href, pages));
|
|
258
|
+
}
|
|
259
|
+
// Prune orphans (files left over from previous crawls). Scoped to
|
|
260
|
+
// `pages/` and `images/` so stray user files in outDir root never
|
|
261
|
+
// get touched. Only active in multi-page mode — a single-page
|
|
262
|
+
// crawl writes into `outDir` itself and has no page subfolder to
|
|
263
|
+
// sweep.
|
|
264
|
+
const pruned = [];
|
|
265
|
+
if (prune && multiPage) {
|
|
266
|
+
const keep = new Set(pages.map((p) => p.outputPath));
|
|
267
|
+
for (const entry of rendered.flatMap((r) => r.images)) {
|
|
268
|
+
keep.add((0, path_1.join)(imagesDir, entry.filename));
|
|
269
|
+
}
|
|
270
|
+
pruned.push(...pruneOrphans(pagesDir, keep));
|
|
271
|
+
if ((0, fs_1.existsSync)(imagesDir)) {
|
|
272
|
+
pruned.push(...pruneOrphans(imagesDir, keep));
|
|
273
|
+
}
|
|
274
|
+
if (pruned.length > 0) {
|
|
275
|
+
onLog(`Pruned ${pruned.length} orphaned file(s)`);
|
|
276
|
+
}
|
|
277
|
+
}
|
|
278
|
+
return { errors, indexFile, outDir, pages, pruned, skipped };
|
|
279
|
+
}
|
|
280
|
+
finally {
|
|
281
|
+
// Guarantee the headless browser is shut down on every exit path,
|
|
282
|
+
// including thrown errors, so no orphan chromium processes linger.
|
|
283
|
+
if (browserFetcher) {
|
|
284
|
+
yield browserFetcher.close().catch(() => undefined);
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
});
|
|
288
|
+
}
|
|
289
|
+
/**
 * Build a filesystem-safe filename for a downloaded image.
 *
 * The result has the shape `<basename>-<first 8 chars of contentHash>.<extension>`.
 * The basename comes from the last path segment of `url` (sanitised, at most
 * 40 characters, `image` when nothing usable is left). The extension comes
 * from the URL when it is a recognised image extension, otherwise from the
 * `Content-Type` subtype, otherwise it defaults to `png`.
 *
 * @param {string} url - Image URL; only its path is inspected.
 * @param {string} contentHash - Hex digest of the image bytes.
 * @param {string} contentType - `Content-Type` header value (may be empty).
 * @returns {string} The generated filename (no directory part).
 */
function buildImageFilename(url, contentHash, contentType) {
    let basename = 'image';
    let extension = '';
    try {
        const last = new URL(url).pathname.split('/').filter(Boolean).pop() || '';
        const rawExt = (0, path_1.extname)(last);
        const parsedExt = rawExt.replace('.', '').toLowerCase();
        if (/^(jpg|jpeg|png|gif|webp|svg|avif)$/.test(parsedExt)) {
            extension = parsedExt;
        }
        // `extname` always returns a suffix of its input, so cut it off the END.
        // A first-occurrence `replace` would remove the wrong part of names such
        // as `x.png-thumb.png`.
        const stem = rawExt ? last.slice(0, -rawExt.length) : last;
        basename = stem.replace(/[^a-zA-Z0-9-_]/g, '_').substring(0, 40) || 'image';
    }
    catch (_a) {
        // Unparseable URL: keep the generic basename and derive the extension
        // from the content type below.
    }
    if (!extension) {
        // Media types are case-insensitive and may carry parameters or stray
        // whitespace (`image/JPEG ; q=1`), so normalise before matching.
        const subtype = (String(contentType || '').split(';')[0].split('/')[1] || '').trim().toLowerCase();
        if (/^(jpeg|jpg|png|gif|webp|svg\+xml|avif)$/.test(subtype)) {
            extension = subtype === 'svg+xml' ? 'svg' : subtype;
        }
        else {
            extension = 'png';
        }
    }
    return `${basename}-${contentHash.slice(0, 8)}.${extension}`;
}
|
|
319
|
+
/**
 * Derive the relative Markdown filename for a crawled page from its URL path.
 *
 * A URL without path segments maps to `index.md` for the first rendered page
 * and to `home.md` otherwise. Every other URL keeps its path hierarchy: each
 * segment is lower-cased, loses a trailing `.html`/`.htm`/`.php`/`.asp`/`.aspx`
 * extension, and is reduced to `[a-z0-9-_]` with dashes collapsed and trimmed
 * (`page` when nothing remains).
 *
 * @param {string} url - Absolute page URL.
 * @param {boolean} isFirst - Whether this is the first page being rendered.
 * @returns {string} Slash-separated relative path ending in `.md`.
 */
function buildPageFilename(url, isFirst) {
    const pathParts = new URL(url).pathname.split('/').filter((part) => part.length > 0);
    if (pathParts.length === 0) {
        if (isFirst) {
            return 'index.md';
        }
        return 'home.md';
    }
    const slugs = [];
    for (const part of pathParts) {
        let slug = part.toLowerCase();
        slug = slug.replace(/\.(html?|php|aspx?)$/, '');
        slug = slug.replace(/[^a-z0-9-_]/g, '-');
        slug = slug.replace(/-+/g, '-');
        slug = slug.replace(/^-|-$/g, '');
        slugs.push(slug === '' ? 'page' : slug);
    }
    return `${slugs.join('/')}.md`;
}
|
|
335
|
+
/**
 * Convert extracted content HTML to Markdown using Turndown.
 *
 * Two custom rules are installed on top of the (optional) GFM plugin:
 * - `absoluteLinks` resolves every link against `baseUrl`; empty, `#` and
 *   `javascript:` links are reduced to their text content.
 * - `localImages` points each image at its entry in `imageMap` when one
 *   exists (looked up by the raw attribute value and by the resolved URL) and
 *   at the absolute remote URL otherwise.
 *
 * `script`, `style`, `noscript` and `iframe` elements are removed, and runs of
 * three or more newlines are collapsed to a single blank line.
 *
 * @param {string} html - Content HTML to convert.
 * @param {string} baseUrl - URL used to resolve relative links and images.
 * @param {Map<string, string>} imageMap - Image URL -> local relative path.
 * @returns {string} Trimmed Markdown.
 */
function convertToMarkdown(html, baseUrl, imageMap) {
    const turndown = new turndown_1.default({
        bulletListMarker: '-',
        codeBlockStyle: 'fenced',
        emDelimiter: '*',
        headingStyle: 'atx',
        linkStyle: 'inlined',
        strongDelimiter: '**',
    });
    // Enable GFM so tables, strikethrough and task lists convert cleanly.
    if (turndown_plugin_gfm_1.gfm) {
        turndown.use(turndown_plugin_gfm_1.gfm);
    }
    // Defensive attribute access: only call `getAttribute` when the node has it.
    const readAttr = (node, name) => (typeof node.getAttribute === 'function' ? node.getAttribute(name) : undefined);
    // Resolve against the page URL; keep the raw value when it cannot be parsed.
    const toAbsolute = (value) => {
        try {
            return new URL(value, baseUrl).href;
        }
        catch (_err) {
            return value;
        }
    };
    turndown.addRule('absoluteLinks', {
        filter: 'a',
        replacement: (content, node) => {
            const href = readAttr(node, 'href') || '';
            if (!href || href === '#' || href.startsWith('javascript:')) {
                return content;
            }
            const absolute = toAbsolute(href);
            const title = readAttr(node, 'title');
            // A raw `"` inside the title would terminate the Markdown title early.
            return title ? `[${content}](${absolute} "${title.replace(/"/g, '\\"')}")` : `[${content}](${absolute})`;
        },
    });
    turndown.addRule('localImages', {
        filter: 'img',
        replacement: (_content, node) => {
            const declared = readAttr(node, 'src') || '';
            // Lazy-loaded images keep the real URL in `data-src` / `data-lazy-src`
            // (the attributes the extractor collects for download). Fall back to
            // them when `src` is missing or only an inline `data:` placeholder, so
            // those images are not dropped from the Markdown.
            const lazy = readAttr(node, 'data-src') || readAttr(node, 'data-lazy-src') || '';
            const src = (!declared || declared.startsWith('data:')) && lazy ? lazy : declared;
            if (!src) {
                return '';
            }
            const absolute = toAbsolute(src);
            const local = imageMap.get(src) || imageMap.get(absolute);
            const alt = readAttr(node, 'alt') || '';
            const target = local || absolute;
            return ``;
        },
    });
    turndown.remove(['script', 'style', 'noscript', 'iframe']);
    const markdown = turndown.turndown(html);
    return markdown.replace(/\n{3,}/g, '\n\n').trim();
}
|
|
391
|
+
/**
 * Count whitespace-separated words in a piece of text.
 *
 * @param {string} text - Plain text.
 * @returns {number} Number of non-whitespace runs; 0 for empty or blank text.
 */
function countWords(text) {
    const tokens = text.match(/\S+/g);
    return tokens === null ? 0 : tokens.length;
}
|
|
394
|
+
/**
 * Escape a string for use inside a double-quoted, single-line YAML scalar.
 *
 * Backslashes and double quotes are backslash-escaped. Every line break
 * (`\r\n`, `\n`, `\r`, U+2028, U+2029) is replaced by one space so the value
 * stays on a single frontmatter line; a Windows `\r\n` pair counts as one
 * break.
 *
 * @param {string} value - Raw text.
 * @returns {string} Text safe to place between double quotes.
 */
function escapeYaml(value) {
    return value
        .replace(/\\/g, '\\\\')
        .replace(/"/g, '\\"')
        .replace(/\r\n|[\n\r\u2028\u2029]/g, ' ');
}
|
|
397
|
+
/**
 * Extract the main content and metadata of a page.
 *
 * The HTML is parsed with jsdom and handed to defuddle. If defuddle throws,
 * the raw body HTML (or the input HTML) and the document title are used
 * instead. Links and images are then collected from the cleaned content,
 * resolved against `pageUrl`, and de-duplicated. Lazy-loaded images
 * (`data-src` / `data-lazy-src`) are additionally collected from the original
 * document.
 *
 * @param {string} html - Page HTML.
 * @param {string} pageUrl - URL of the page; base for resolving relative URLs.
 * @param {{selector?: string}} options - Optional content selector passed to defuddle.
 * @returns {Promise<object>} `contentHtml`, `contentText`, an empty `imageMap`,
 *   `images`, `links` and `meta`.
 */
function extractContent(html, pageUrl, options) {
    return __awaiter(this, void 0, void 0, function* () {
        const doc = new jsdom_1.JSDOM(html, { url: pageUrl }).window.document;
        // Evaluated lazily: the body markup is only needed as a fallback.
        const bodyHtml = () => (doc.body === null || doc.body === undefined ? undefined : doc.body.innerHTML);
        // `content` attribute of the first element matching `selector`, if any.
        const metaContent = (selector) => {
            const el = doc.querySelector(selector);
            return el === null || el === undefined ? undefined : el.getAttribute('content');
        };
        const defuddleOptions = {
            markdown: false,
            removeHiddenElements: true,
            removeLowScoring: true,
            removeSmallImages: false,
        };
        if (options.selector) {
            defuddleOptions.contentSelector = options.selector;
        }
        let parsed;
        try {
            // Same class-based API as chrome-md's content script.
            parsed = new defuddle_1.default(doc, defuddleOptions).parse();
        }
        catch (_err) {
            parsed = { content: bodyHtml() || html, title: doc.title };
        }
        const contentHtml = parsed.content || bodyHtml() || '';
        // Re-parse the cleaned content so links and images can be queried from it.
        const contentDoc = new jsdom_1.JSDOM(`<!DOCTYPE html><html><body>${contentHtml}</body></html>`, {
            url: pageUrl,
        }).window.document;
        // Resolve `candidate` against the page URL; malformed URLs are ignored.
        const addResolved = (bucket, candidate) => {
            try {
                bucket.add(new URL(candidate, pageUrl).href);
            }
            catch (_err) {
                // ignore malformed URLs
            }
        };
        const links = new Set();
        const skippedPrefixes = ['#', 'mailto:', 'javascript:'];
        for (const anchor of Array.from(contentDoc.querySelectorAll('a[href]'))) {
            const href = (anchor.getAttribute('href') || '').trim();
            if (href && !skippedPrefixes.some((prefix) => href.startsWith(prefix))) {
                addResolved(links, href);
            }
        }
        const images = new Set();
        // Take the first non-empty attribute from `attributeNames` on every
        // match; skip empty values and inline `data:` URIs.
        const collectImages = (root, selector, attributeNames) => {
            for (const img of Array.from(root.querySelectorAll(selector))) {
                let raw = '';
                for (const attributeName of attributeNames) {
                    raw = img.getAttribute(attributeName) || '';
                    if (raw) {
                        break;
                    }
                }
                const src = raw.trim();
                if (src && !src.startsWith('data:')) {
                    addResolved(images, src);
                }
            }
        };
        collectImages(contentDoc, 'img', ['src', 'data-src']);
        // Some lazy-loading frameworks keep the real URL only in the source
        // document (stripped out by defuddle), so also consult the original DOM.
        collectImages(doc, 'img[data-src], img[data-lazy-src]', ['data-src', 'data-lazy-src']);
        const meta = {
            author: parsed.author || metaContent('meta[name="author"]') || undefined,
            description: parsed.description ||
                metaContent('meta[name="description"]') ||
                metaContent('meta[property="og:description"]') ||
                '',
            language: parsed.language || doc.documentElement.getAttribute('lang') || undefined,
            ogImage: parsed.image || metaContent('meta[property="og:image"]') || undefined,
            title: parsed.title || doc.title || pageUrl,
            wordCount: parsed.wordCount,
        };
        const contentBody = contentDoc.body;
        return {
            contentHtml,
            contentText: (contentBody === null || contentBody === undefined ? undefined : contentBody.textContent) || '',
            imageMap: new Map(),
            images: [...images],
            links: [...links],
            meta,
        };
    });
}
|
|
492
|
+
/**
 * Download an image as raw bytes (best effort).
 *
 * @param {object} http - Client exposing `get(url, config)` that resolves to
 *   an object with `data` and `headers`.
 * @param {string} url - Image URL.
 * @returns {Promise<{buffer: Buffer, contentType: string} | null>} The bytes
 *   and the `content-type` header (empty string when absent), or `null` when
 *   the request fails or the body is empty.
 */
async function fetchImage(http, url) {
    try {
        const response = await http.get(url, { responseType: 'arraybuffer' });
        const buffer = Buffer.from(response.data);
        if (buffer.byteLength > 0) {
            const contentType = String(response.headers['content-type'] || '');
            return { buffer, contentType };
        }
        return null;
    }
    catch (_err) {
        // Any failure simply means "no image" to the caller.
        return null;
    }
}
|
|
506
|
+
/**
 * Collect page URLs from `<origin>/sitemap.xml`, following nested sitemap
 * indexes.
 *
 * Each sitemap is fetched at most once. Sitemaps that fail to load are
 * reported through `onLog` and skipped. `<loc>` values are XML-unescaped
 * before use, because the sitemap protocol requires them to be
 * entity-escaped (`&` is written as `&amp;`).
 *
 * @param {object} http - Client exposing `get(url)` that resolves to an object with `data`.
 * @param {string} origin - Site origin, without trailing slash.
 * @param {(message: string) => void} onLog - Receives fetch-failure messages.
 * @returns {Promise<string[]>} Page URLs in discovery order (not de-duplicated).
 */
async function fetchSitemapUrls(http, origin, onLog) {
    const urls = [];
    const visited = new Set();
    // Decode the five predefined XML entities; `&amp;` goes last so that
    // `&amp;lt;` yields the literal text `&lt;`.
    const decodeXml = (value) => value
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        .replace(/&quot;/g, '"')
        .replace(/&apos;/g, "'")
        .replace(/&amp;/g, '&');
    const walk = async (sitemapUrl) => {
        if (visited.has(sitemapUrl)) {
            return;
        }
        visited.add(sitemapUrl);
        try {
            const response = await http.get(sitemapUrl);
            const xml = String(response.data || '');
            // Nested sitemap index: follow each <sitemap><loc>...</loc></sitemap>.
            const nested = [...xml.matchAll(/<sitemap>[\s\S]*?<loc>\s*([^<\s]+)\s*<\/loc>[\s\S]*?<\/sitemap>/gi)].map((m) => decodeXml(m[1]));
            for (const child of nested) {
                await walk(child);
            }
            const pageMatches = [...xml.matchAll(/<url>[\s\S]*?<loc>\s*([^<\s]+)\s*<\/loc>[\s\S]*?<\/url>/gi)].map((m) => decodeXml(m[1]));
            urls.push(...pageMatches);
        }
        catch (error) {
            onLog(`Sitemap fetch failed for ${sitemapUrl}: ${error instanceof Error ? error.message : String(error)}`);
        }
    };
    await walk(`${origin}/sitemap.xml`);
    return urls;
}
|
|
535
|
+
/**
 * Canonicalise a URL so equivalent pages de-duplicate to the same string.
 *
 * The fragment is dropped, a trailing `/index.html` or `/index.htm`
 * (case-insensitive) becomes `/`, and trailing slashes are removed from every
 * path except the root. Input that does not parse as a URL is returned
 * untouched.
 *
 * @param {string} raw - Absolute URL.
 * @returns {string} Normalised URL, or `raw` when it cannot be parsed.
 */
function normalizeUrl(raw) {
    let parsed;
    try {
        parsed = new URL(raw);
    }
    catch (_err) {
        return raw;
    }
    parsed.hash = '';
    let path = parsed.pathname.replace(/\/index\.html?$/i, '/');
    const isRoot = path.length <= 1;
    if (!isRoot && path.endsWith('/')) {
        path = path.replace(/\/+$/, '');
    }
    parsed.pathname = path;
    return parsed.href;
}
|
|
553
|
+
/**
 * Parse the simple `key: value` frontmatter block at the top of a Markdown file.
 *
 * Only flat, single-line entries are understood. A value wrapped in double
 * quotes is returned without the surrounding quotes (escape sequences inside
 * are left as written). Both LF and CRLF line endings are accepted, so a file
 * rewritten with Windows line endings still yields its keys.
 *
 * @param {string} markdown - Full file content.
 * @returns {Object<string, string> | null} Parsed entries, or `null` when the
 *   text does not start with `---` or has no closing `---` line.
 */
function parseFrontmatter(markdown) {
    if (!markdown.startsWith('---'))
        return null;
    const end = markdown.indexOf('\n---', 3);
    if (end === -1)
        return null;
    const result = {};
    for (const rawLine of markdown.slice(3, end).split('\n')) {
        // With CRLF endings each line keeps a trailing `\r`; `.` does not match
        // it, so without this strip no entry would ever be recognised.
        const line = rawLine.replace(/\r$/, '');
        const match = line.match(/^([a-zA-Z0-9_]+):\s*(.*)$/);
        if (!match)
            continue;
        result[match[1]] = match[2].replace(/^"(.*)"$/, '$1');
    }
    return result;
}
|
|
569
|
+
/**
 * Recursively delete every regular file below `rootDir` whose joined path is
 * not contained in `keepPaths`, then remove sub-directories that ended up
 * empty. Deletion failures are ignored (best effort).
 *
 * @param {string} rootDir - Directory to sweep; a missing directory is a no-op.
 * @param {Set<string>} keepPaths - Paths (as produced by `path.join`) to keep.
 * @returns {string[]} Paths of the files that were actually deleted.
 */
function pruneOrphans(rootDir, keepPaths) {
    if (!(0, fs_1.existsSync)(rootDir)) {
        return [];
    }
    const deleted = [];
    for (const dirent of (0, fs_1.readdirSync)(rootDir, { withFileTypes: true })) {
        const target = (0, path_1.join)(rootDir, dirent.name);
        if (dirent.isDirectory()) {
            for (const path of pruneOrphans(target, keepPaths)) {
                deleted.push(path);
            }
            // Remove directory if now empty.
            try {
                const leftovers = (0, fs_1.readdirSync)(target);
                if (leftovers.length === 0) {
                    (0, fs_1.rmdirSync)(target);
                }
            }
            catch (_err) {
                // Directory not empty or already gone — ignore.
            }
            continue;
        }
        if (!dirent.isFile() || keepPaths.has(target)) {
            continue;
        }
        try {
            (0, fs_1.unlinkSync)(target);
            deleted.push(target);
        }
        catch (_err) {
            // File already removed or permission denied — skip.
        }
    }
    return deleted;
}
|
|
605
|
+
/**
 * Render the YAML frontmatter block written at the top of a page's Markdown
 * file.
 *
 * `description`, `author`, `language`, `og_image` and `image_count` are only
 * emitted when set (a zero image count is omitted); the description is cut
 * to 500 characters. Free-text fields and the source URL go through
 * `escapeYaml`, because they are placed inside double-quoted scalars where a
 * raw backslash or quote would change or break the value.
 *
 * @param {object} info - Page info (`title`, `url`, `depth`, `downloadDate`,
 *   `firstDownloaded`, `wordCount` and the optional fields above).
 * @returns {string} Frontmatter including both `---` delimiter lines, without
 *   a trailing newline.
 */
function renderFrontmatter(info) {
    const lines = [
        '---',
        `title: "${escapeYaml(info.title)}"`,
        // Escaped like the other string fields; a no-op for URLs without
        // backslashes or quotes.
        `source_url: "${escapeYaml(info.url)}"`,
        `source_domain: "${new URL(info.url).hostname}"`,
        `crawl_depth: ${info.depth}`,
        `download_date: "${info.downloadDate}"`,
        `first_downloaded: "${info.firstDownloaded}"`,
        info.description ? `description: "${escapeYaml(truncate(info.description, 500))}"` : null,
        info.author ? `author: "${escapeYaml(info.author)}"` : null,
        info.language ? `language: "${escapeYaml(info.language)}"` : null,
        info.ogImage ? `og_image: "${escapeYaml(info.ogImage)}"` : null,
        info.imageCount ? `image_count: ${info.imageCount}` : null,
        `word_count: ${info.wordCount}`,
        'content_type: "webpage"',
        '---',
    ].filter((l) => l !== null);
    return lines.join('\n');
}
|
|
625
|
+
/**
 * Render the `README.md` index for a multi-page crawl.
 *
 * The output has a header (host, source URL, generation timestamp, page
 * count) followed by one section per page, ordered by URL. Each section links
 * to the page's relative path, with backslashes normalised to forward
 * slashes.
 *
 * @param {string} startUrl - URL the crawl started from.
 * @param {object[]} pages - Page records (`url`, `title`, `relativePath`,
 *   `downloadDate`, optional `description`).
 * @returns {string} Markdown document.
 */
function renderOverview(startUrl, pages) {
    const byUrl = pages.slice().sort((left, right) => left.url.localeCompare(right.url));
    const header = [
        `# ${new URL(startUrl).host} — Knowledge Base`,
        '',
        `Source: ${startUrl}`,
        '',
        `Generated: ${new Date().toISOString()}`,
        '',
        `Pages: ${byUrl.length}`,
        '',
        '## Pages',
        '',
    ];
    const sections = byUrl.flatMap((page) => {
        const link = page.relativePath.split(/[\\/]/).join('/');
        const rows = [`### [${page.title}](${link})`, '', `- URL: ${page.url}`];
        if (page.description) {
            rows.push(`- ${truncate(page.description, 240)}`);
        }
        rows.push(`- Updated: ${page.downloadDate}`, '');
        return rows;
    });
    return header.concat(sections).join('\n');
}
|
|
651
|
+
/**
 * Check whether `url` belongs to the given origin.
 *
 * @param {string} url - Absolute URL to test.
 * @param {string} origin - Expected origin, as reported by `URL#origin`.
 * @returns {boolean} `true` on an exact origin match; `false` otherwise,
 *   including when `url` cannot be parsed.
 */
function sameOrigin(url, origin) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch (_err) {
        return false;
    }
    return parsed.origin === origin;
}
|
|
659
|
+
/**
 * Shorten `value` to at most `max` UTF-16 code units, ending with `…` when
 * something was cut off.
 *
 * The cut never lands between the two halves of a surrogate pair, so the
 * result contains no lone surrogate. For `max` below 1 only the ellipsis is
 * returned.
 *
 * @param {string} value - Text to shorten.
 * @param {number} max - Maximum length including the ellipsis.
 * @returns {string} `value` unchanged when it already fits, else the shortened text.
 */
function truncate(value, max) {
    if (value.length <= max) {
        return value;
    }
    // Clamp so a non-positive `max` cannot turn into a negative slice index
    // (which would count from the end of the string).
    let end = Math.max(0, max - 1);
    if (end > 0) {
        const lastUnit = value.charCodeAt(end - 1);
        // Last kept unit is a high surrogate: drop it, its partner is cut off.
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
            end -= 1;
        }
    }
    return `${value.slice(0, end)}…`;
}
|
package/docs/commands.md
CHANGED
|
@@ -1409,6 +1409,58 @@ lt tools regex [pattern] [text]
|
|
|
1409
1409
|
|
|
1410
1410
|
---
|
|
1411
1411
|
|
|
1412
|
+
### `lt tools crawl`
|
|
1413
|
+
|
|
1414
|
+
Crawls a website and stores each page as a Markdown file (with YAML frontmatter containing `source_url`, `download_date`, `first_downloaded`, `description`, language, word count, etc.) so it can be consumed as a Claude Code knowledge base. Optionally follows same-origin links up to a configurable depth, seeds the queue from `<origin>/sitemap.xml`, and downloads referenced images into a shared `images/` folder (deduplicated by content hash). Re-running the command against the same output directory updates existing pages while preserving their original `first_downloaded` timestamp.
|
|
1415
|
+
|
|
1416
|
+
**Alias:** `cr`
|
|
1417
|
+
|
|
1418
|
+
**Usage:**
|
|
1419
|
+
```bash
|
|
1420
|
+
lt tools crawl <url> [options]
|
|
1421
|
+
```
|
|
1422
|
+
|
|
1423
|
+
**Options:**
|
|
1424
|
+
- `--out <dir>` — Output directory (default: current directory). Single-page crawls write the `.md` directly here; multi-page crawls generate `<out>/README.md` plus `<out>/pages/` and `<out>/images/`.
|
|
1425
|
+
- `--depth <n|all>` — Link depth (default `0`). `0` = only the start page, `1` = start page + direct same-origin links, `2` = and their links, ... Use `--depth all` (or `--depth -1`, or the shortcut flag `--all`) to follow every same-origin link transitively; the crawl then stops when `--max-pages` is reached.
|
|
1426
|
+
- `--all` — Shortcut for `--depth all`.
|
|
1427
|
+
- `--render` / `--no-render` — Render each page through a headless browser before extraction (default **on**). Required for SPAs (Vue/Nuxt/React/Angular) whose content is client-rendered. Uses `playwright-core` with system Chrome / Edge first, then Playwright's bundled Chromium. Use `--no-render` for a plain HTTP fetch when you know the site is static (faster, no browser needed).
|
|
1428
|
+
- `--install-browser` — If `--render` finds no browser, auto-install Playwright's Chromium (one-time ~170 MB download).
|
|
1429
|
+
- `--prune` / `--no-prune` — After a multi-page crawl, remove any `.md` or image files inside `<out>/pages` and `<out>/images` that were not written by the current run (default **on**). Keeps the knowledge base aligned with the live site on update runs. Empty subdirectories are cleaned up too. Ignored in single-page mode. Use `--no-prune` to preserve old files.
|
|
1430
|
+
- `--no-images` — Disable image downloads.
|
|
1431
|
+
- `--no-sitemap` — Skip discovery via `<origin>/sitemap.xml`.
|
|
1432
|
+
- `--concurrency <n>` — Parallel HTTP requests (default `4`).
|
|
1433
|
+
- `--max-pages <n>` — Safety cap on total pages (default `200`).
|
|
1434
|
+
- `--selector <css>` — CSS selector scoping the main content (e.g. `article`, `main`).
|
|
1435
|
+
- `--timeout <ms>` — HTTP request timeout in ms (default `20000`).
|
|
1436
|
+
- `--noConfirm` — Skip confirmation prompts.
|
|
1437
|
+
|
|
1438
|
+
**Examples:**
|
|
1439
|
+
```bash
|
|
1440
|
+
# Single page into the current directory
|
|
1441
|
+
lt tools crawl https://example.com/article --noConfirm
|
|
1442
|
+
|
|
1443
|
+
# Crawl start page + direct links into ./knowledge
|
|
1444
|
+
lt tools crawl https://example.com --out ./knowledge --depth 1 --noConfirm
|
|
1445
|
+
|
|
1446
|
+
# Full mini-site with sitemap seeding and images
|
|
1447
|
+
lt tools crawl https://example.com --out ./kb --depth 2 --max-pages 100 --noConfirm
|
|
1448
|
+
|
|
1449
|
+
# Crawl every reachable same-origin page (safety cap via --max-pages)
|
|
1450
|
+
lt tools crawl https://example.com --out ./kb --depth all --max-pages 500 --noConfirm
|
|
1451
|
+
|
|
1452
|
+
# Same, using the --all shortcut
|
|
1453
|
+
lt tools crawl https://example.com --out ./kb --all --max-pages 500 --noConfirm
|
|
1454
|
+
|
|
1455
|
+
# Full SPA-aware crawl (render + prune are on by default)
|
|
1456
|
+
lt tools crawl https://lenne.tech --all --noConfirm
|
|
1457
|
+
|
|
1458
|
+
# Opt-out: plain HTTP fetch for a known-static site, keep orphans
|
|
1459
|
+
lt tools crawl https://example.com --all --no-render --no-prune --noConfirm
|
|
1460
|
+
```
|
|
1461
|
+
|
|
1462
|
+
---
|
|
1463
|
+
|
|
1412
1464
|
## Configuration Priority
|
|
1413
1465
|
|
|
1414
1466
|
All configurable commands follow this priority order (highest to lowest):
|
package/docs/lt.config.md
CHANGED
|
@@ -923,6 +923,43 @@ Reinitializes npm packages.
|
|
|
923
923
|
}
|
|
924
924
|
```
|
|
925
925
|
|
|
926
|
+
#### `lt tools crawl`
|
|
927
|
+
|
|
928
|
+
Crawls a website into Markdown files (knowledge base builder).
|
|
929
|
+
|
|
930
|
+
| Field | Type | Default | Description |
|
|
931
|
+
|-------|------|---------|-------------|
|
|
932
|
+
| `commands.tools.crawl.out` | `string` | `.` | Output directory |
|
|
933
|
+
| `commands.tools.crawl.depth` | `number \| "all"` | `0` | Link depth (0 = only start page, 1 = direct links, ..., `"all"` or `-1` = follow every same-origin link, bounded by `maxPages`) |
|
|
934
|
+
| `commands.tools.crawl.includeImages` | `boolean` | `true` | Download images and inline with local paths |
|
|
935
|
+
| `commands.tools.crawl.includeSitemap` | `boolean` | `true` | Also seed queue from `<origin>/sitemap.xml` |
|
|
936
|
+
| `commands.tools.crawl.concurrency` | `number` | `4` | Parallel HTTP requests |
|
|
937
|
+
| `commands.tools.crawl.maxPages` | `number` | `200` | Safety cap on total pages |
|
|
938
|
+
| `commands.tools.crawl.selector` | `string` | – | CSS selector for main content |
|
|
939
|
+
| `commands.tools.crawl.timeout` | `number` | `20000` | HTTP request timeout in ms |
|
|
940
|
+
| `commands.tools.crawl.renderJs` | `boolean` | `true` | Render pages through a headless browser (for SPAs). Uses playwright-core. Set to `false` for plain HTTP. |
|
|
941
|
+
| `commands.tools.crawl.prune` | `boolean` | `true` | Remove orphaned `.md` / image files after a multi-page crawl (update-in-place). Set to `false` to preserve old files. |
|
|
942
|
+
| `commands.tools.crawl.noConfirm` | `boolean` | `false` | Skip confirmation prompts |
|
|
943
|
+
|
|
944
|
+
**Example:**
|
|
945
|
+
```json
|
|
946
|
+
{
|
|
947
|
+
"commands": {
|
|
948
|
+
"tools": {
|
|
949
|
+
"crawl": {
|
|
950
|
+
"out": "./knowledge",
|
|
951
|
+
"depth": 2,
|
|
952
|
+
"includeImages": true,
|
|
953
|
+
"includeSitemap": true,
|
|
954
|
+
"concurrency": 4,
|
|
955
|
+
"maxPages": 200,
|
|
956
|
+
"noConfirm": true
|
|
957
|
+
}
|
|
958
|
+
}
|
|
959
|
+
}
|
|
960
|
+
}
|
|
961
|
+
```
|
|
962
|
+
|
|
926
963
|
---
|
|
927
964
|
|
|
928
965
|
### Metadata
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lenne.tech/cli",
|
|
3
|
-
"version": "1.12.0",
|
|
3
|
+
"version": "1.13.0",
|
|
4
4
|
"description": "lenne.Tech CLI: lt",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"lenne.Tech",
|
|
@@ -61,14 +61,19 @@
|
|
|
61
61
|
"@lenne.tech/cli-plugin-helper": "0.0.14",
|
|
62
62
|
"axios": "1.15.0",
|
|
63
63
|
"bcrypt": "6.0.0",
|
|
64
|
+
"defuddle": "0.17.0",
|
|
64
65
|
"glob": "13.0.6",
|
|
65
66
|
"gluegun": "5.2.2",
|
|
66
67
|
"js-sha256": "0.11.1",
|
|
67
68
|
"js-yaml": "4.1.1",
|
|
69
|
+
"jsdom": "29.0.2",
|
|
68
70
|
"lodash": "4.18.1",
|
|
69
71
|
"open": "11.0.0",
|
|
72
|
+
"playwright-core": "1.59.1",
|
|
70
73
|
"ts-morph": "28.0.0",
|
|
71
74
|
"ts-node": "10.9.2",
|
|
75
|
+
"turndown": "7.2.4",
|
|
76
|
+
"turndown-plugin-gfm": "1.0.2",
|
|
72
77
|
"typescript": "6.0.3"
|
|
73
78
|
},
|
|
74
79
|
"devDependencies": {
|
|
@@ -77,8 +82,10 @@
|
|
|
77
82
|
"@types/ejs": "3.1.5",
|
|
78
83
|
"@types/jest": "30.0.0",
|
|
79
84
|
"@types/js-yaml": "4.0.9",
|
|
85
|
+
"@types/jsdom": "28.0.1",
|
|
80
86
|
"@types/lodash": "4.17.24",
|
|
81
87
|
"@types/node": "25.6.0",
|
|
88
|
+
"@types/turndown": "5.0.6",
|
|
82
89
|
"@typescript-eslint/eslint-plugin": "8.58.2",
|
|
83
90
|
"@typescript-eslint/parser": "8.58.2",
|
|
84
91
|
"ejs": "5.0.2",
|