@happyalienai/vite-plugin-llm-spider 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -4
- package/dist/index.cjs +260 -166
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +7 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.js +260 -166
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,10 +1,20 @@
|
|
|
1
1
|
# vite-plugin-llm-spider
|
|
2
2
|
|
|
3
|
-
>
|
|
3
|
+
> Make your Vite SPA discoverable by AI search engines like ChatGPT, Perplexity, and Google AI Overviews
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
**Single Page Apps are invisible to AI.** While tools like ChatGPT, Claude, and Perplexity reshape how people find information, SPAs remain hidden behind JavaScript walls. This plugin bridges that gap by generating clean, LLM-friendly markdown snapshots and a standardized index.
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
- 🤖 **Zero-click optimization** — Get cited in AI answers without complex SSR
|
|
8
|
+
- 📈 **LLM SEO / GEO ready** — Clean markdown format that LLMs prefer
|
|
9
|
+
- 🎯 **[llms.txt standard](https://llmstxt.org/)** — Machine-readable index for AI agents
|
|
10
|
+
|
|
11
|
+
> **Built by [Happy Alien AI](https://happyalien.ai)** — We take the busy work out of training development.
|
|
12
|
+
|
|
13
|
+
## Why This Matters
|
|
14
|
+
|
|
15
|
+
Traditional SEO optimizes for Google's crawler. **Generative Engine Optimization (GEO)** optimizes for AI systems that synthesize answers from your content. When someone asks ChatGPT or Perplexity a question your site answers, you want to be cited.
|
|
16
|
+
|
|
17
|
+
SPAs render content via JavaScript — invisible to most AI crawlers. This plugin runs Puppeteer at build time to capture your rendered pages as clean markdown, plus generates an `llms.txt` index that tells AI agents exactly where to look.
|
|
8
18
|
|
|
9
19
|
## Features
|
|
10
20
|
|
|
@@ -72,6 +82,29 @@ The generated `llms.txt` follows the [llmstxt.org](https://llmstxt.org/) spec:
|
|
|
72
82
|
|
|
73
83
|
## Configuration
|
|
74
84
|
|
|
85
|
+
### Static Mode (No Browser Required)
|
|
86
|
+
|
|
87
|
+
By default, when crawl is disabled, the plugin reads HTML files directly from your `dist/` folder — **no Puppeteer or browser needed**. This works great for:
|
|
88
|
+
|
|
89
|
+
- Pre-rendered/SSG sites
|
|
90
|
+
- CI environments without Chrome (WSL, Docker, etc.)
|
|
91
|
+
- Simple static sites
|
|
92
|
+
|
|
93
|
+
```js
|
|
94
|
+
llmSpider({
|
|
95
|
+
static: true, // Force static mode (default: "auto")
|
|
96
|
+
routes: [
|
|
97
|
+
{ path: "/", title: "Home" },
|
|
98
|
+
{ path: "/about", title: "About" },
|
|
99
|
+
],
|
|
100
|
+
})
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Options:
|
|
104
|
+
- `true` — Always read HTML files directly (no browser)
|
|
105
|
+
- `false` — Always use Puppeteer (required for SPAs that need JS rendering)
|
|
106
|
+
- `"auto"` (default) — Use static when crawl is disabled, browser when crawl is enabled
|
|
107
|
+
|
|
75
108
|
### Route Definitions
|
|
76
109
|
|
|
77
110
|
```js
|
|
@@ -207,6 +240,7 @@ render: {
|
|
|
207
240
|
- Add more `removeSelectors`
|
|
208
241
|
|
|
209
242
|
### CI fails to launch browser
|
|
243
|
+
- **Easiest fix:** Use `static: true` if you don't need JS rendering
|
|
210
244
|
- Add `--no-sandbox` to launch args
|
|
211
245
|
- Ensure Puppeteer dependencies are installed
|
|
212
246
|
|
|
@@ -221,5 +255,5 @@ MIT
|
|
|
221
255
|
<strong>Happy Alien AI</strong>
|
|
222
256
|
</a>
|
|
223
257
|
<br>
|
|
224
|
-
|
|
258
|
+
We take the busy work out of training development.
|
|
225
259
|
</p>
|
package/dist/index.cjs
CHANGED
|
@@ -39,7 +39,13 @@ var import_node_path = __toESM(require("path"), 1);
|
|
|
39
39
|
var cheerio = __toESM(require("cheerio"), 1);
|
|
40
40
|
var import_turndown = __toESM(require("turndown"), 1);
|
|
41
41
|
var import_turndown_plugin_gfm = require("turndown-plugin-gfm");
|
|
42
|
-
var import_puppeteer = __toESM(require("puppeteer"), 1);
|
|
42
|
+
var puppeteer = null;
|
|
43
|
+
async function loadPuppeteer() {
|
|
44
|
+
if (!puppeteer) {
|
|
45
|
+
puppeteer = await import("puppeteer");
|
|
46
|
+
}
|
|
47
|
+
return puppeteer.default || puppeteer;
|
|
48
|
+
}
|
|
43
49
|
function llmSpiderPlugin(userOptions = {}) {
|
|
44
50
|
let resolvedConfig;
|
|
45
51
|
function deepMerge(target, source) {
|
|
@@ -55,6 +61,11 @@ function llmSpiderPlugin(userOptions = {}) {
|
|
|
55
61
|
}
|
|
56
62
|
const defaults = {
|
|
57
63
|
enabled: true,
|
|
64
|
+
// Static mode: read HTML files directly from dist/ without browser
|
|
65
|
+
// - true: always use static mode (no Puppeteer)
|
|
66
|
+
// - false: always use browser rendering
|
|
67
|
+
// - "auto" (default): use static when crawl is disabled, browser when crawl is enabled
|
|
68
|
+
static: "auto",
|
|
58
69
|
// Recommended: explicit list
|
|
59
70
|
routes: (
|
|
60
71
|
/** @type {RouteDef[] | undefined} */
|
|
@@ -185,6 +196,11 @@ function llmSpiderPlugin(userOptions = {}) {
|
|
|
185
196
|
}
|
|
186
197
|
return import_node_path.default.join(distDir, rel);
|
|
187
198
|
}
|
|
199
|
+
function routeToHtmlFsPath(distDir, route) {
|
|
200
|
+
if (route === "/") return import_node_path.default.join(distDir, "index.html");
|
|
201
|
+
if (route.endsWith("/")) return import_node_path.default.join(distDir, route.slice(1), "index.html");
|
|
202
|
+
return import_node_path.default.join(distDir, route.slice(1) + ".html");
|
|
203
|
+
}
|
|
188
204
|
function makeLlmsLink(relMdPath) {
|
|
189
205
|
return relMdPath.replace(/\\/g, "/");
|
|
190
206
|
}
|
|
@@ -193,6 +209,12 @@ function llmSpiderPlugin(userOptions = {}) {
|
|
|
193
209
|
server.close((err) => err ? reject(err) : resolve());
|
|
194
210
|
});
|
|
195
211
|
}
|
|
212
|
+
function shouldUseStaticMode() {
|
|
213
|
+
var _a;
|
|
214
|
+
if (options.static === true) return true;
|
|
215
|
+
if (options.static === false) return false;
|
|
216
|
+
return !((_a = options.crawl) == null ? void 0 : _a.enabled);
|
|
217
|
+
}
|
|
196
218
|
return {
|
|
197
219
|
name: "vite-plugin-llm-spider",
|
|
198
220
|
apply: "build",
|
|
@@ -206,6 +228,7 @@ function llmSpiderPlugin(userOptions = {}) {
|
|
|
206
228
|
throw new Error("LLM Spider: missing resolved Vite config");
|
|
207
229
|
const distDir = resolvedConfig.build.outDir || "dist";
|
|
208
230
|
const basePath = (resolvedConfig.base || "/").replace(/\\/g, "/");
|
|
231
|
+
const useStaticMode = shouldUseStaticMode();
|
|
209
232
|
let routeDefs = [];
|
|
210
233
|
if (Array.isArray(options.routes) && options.routes.length) {
|
|
211
234
|
routeDefs = options.routes.map((r) => ({
|
|
@@ -220,98 +243,41 @@ function llmSpiderPlugin(userOptions = {}) {
|
|
|
220
243
|
} else {
|
|
221
244
|
routeDefs = [{ path: "/", section: "Pages" }];
|
|
222
245
|
}
|
|
223
|
-
log.info(
|
|
246
|
+
log.info(`
|
|
247
|
+
LLM Spider: generating markdown + llms.txt (${useStaticMode ? "static" : "browser"} mode)`);
|
|
224
248
|
log.debug("distDir:", distDir, "base:", basePath);
|
|
225
|
-
const previewServer = await (0, import_vite.preview)({
|
|
226
|
-
root: resolvedConfig.root,
|
|
227
|
-
base: resolvedConfig.base,
|
|
228
|
-
build: { outDir: distDir },
|
|
229
|
-
preview: { port: 0, open: false, host: "127.0.0.1" },
|
|
230
|
-
configFile: false,
|
|
231
|
-
plugins: [],
|
|
232
|
-
// avoid loading user plugins again
|
|
233
|
-
logLevel: "silent"
|
|
234
|
-
});
|
|
235
|
-
await new Promise((resolve, reject) => {
|
|
236
|
-
const server = previewServer.httpServer;
|
|
237
|
-
if (server.listening) {
|
|
238
|
-
resolve();
|
|
239
|
-
} else {
|
|
240
|
-
server.once("listening", resolve);
|
|
241
|
-
server.once("error", reject);
|
|
242
|
-
setTimeout(() => reject(new Error("Preview server failed to start")), 5e3);
|
|
243
|
-
}
|
|
244
|
-
});
|
|
245
|
-
const addr = previewServer.httpServer.address();
|
|
246
|
-
if (!addr || typeof addr === "string") {
|
|
247
|
-
await safeCloseHttpServer(previewServer.httpServer);
|
|
248
|
-
throw new Error("LLM Spider: could not determine preview server port");
|
|
249
|
-
}
|
|
250
|
-
const normalizedBase = basePath.endsWith("/") ? basePath : basePath + "/";
|
|
251
|
-
const baseUrl = `http://127.0.0.1:${addr.port}${normalizedBase}`;
|
|
252
|
-
log.debug("Preview server at:", baseUrl);
|
|
253
|
-
const browser = await import_puppeteer.default.launch(options.render.launchOptions);
|
|
254
249
|
const turndown = new import_turndown.default(options.markdown.turndown);
|
|
255
250
|
turndown.use(import_turndown_plugin_gfm.gfm);
|
|
256
|
-
const visited = /* @__PURE__ */ new Set();
|
|
257
251
|
const captured = [];
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
for (const seed of options.crawl.seeds || ["/"]) {
|
|
261
|
-
const nr = normalizeRoute(seed, {
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
(p) => p instanceof RegExp ? p.test(url) : url.includes(p)
|
|
285
|
-
);
|
|
286
|
-
if (blocked) req.abort();
|
|
287
|
-
else req.continue();
|
|
288
|
-
});
|
|
289
|
-
}
|
|
290
|
-
try {
|
|
291
|
-
const pageUrl = route === "/" ? baseUrl : baseUrl + route.replace(/^\//, "");
|
|
292
|
-
await options.render.beforeGoto(page, { route });
|
|
293
|
-
await page.goto(pageUrl, {
|
|
294
|
-
waitUntil: options.render.waitUntil,
|
|
295
|
-
timeout: options.render.timeoutMs
|
|
296
|
-
});
|
|
297
|
-
if (options.render.waitForSelector) {
|
|
298
|
-
await page.waitForSelector(options.render.waitForSelector, {
|
|
299
|
-
timeout: options.render.timeoutMs
|
|
300
|
-
});
|
|
301
|
-
}
|
|
302
|
-
if (options.render.postLoadDelayMs > 0) {
|
|
303
|
-
await new Promise(
|
|
304
|
-
(r) => setTimeout(r, options.render.postLoadDelayMs)
|
|
305
|
-
);
|
|
306
|
-
}
|
|
307
|
-
await options.render.beforeExtract(page, { route });
|
|
308
|
-
const html = await page.content();
|
|
309
|
-
const $ = cheerio.load(html);
|
|
310
|
-
let harvestedHrefs = [];
|
|
311
|
-
if ((_b2 = options.crawl) == null ? void 0 : _b2.enabled) {
|
|
312
|
-
harvestedHrefs = $("a[href]").map((_, a) => $(a).attr("href")).get();
|
|
313
|
-
log.debug(` Found ${harvestedHrefs.length} links on ${route}:`, harvestedHrefs.slice(0, 15));
|
|
252
|
+
if (useStaticMode) {
|
|
253
|
+
log.debug("Using static mode - reading HTML files directly from dist/");
|
|
254
|
+
for (const rd of routeDefs) {
|
|
255
|
+
const route = rd.path;
|
|
256
|
+
if (isExcluded(route)) continue;
|
|
257
|
+
let htmlPath = routeToHtmlFsPath(distDir, route);
|
|
258
|
+
let htmlContent = null;
|
|
259
|
+
try {
|
|
260
|
+
htmlContent = await import_promises.default.readFile(htmlPath, "utf8");
|
|
261
|
+
} catch {
|
|
262
|
+
if (!route.endsWith("/") && route !== "/") {
|
|
263
|
+
const altPath = import_node_path.default.join(distDir, route.slice(1), "index.html");
|
|
264
|
+
try {
|
|
265
|
+
htmlContent = await import_promises.default.readFile(altPath, "utf8");
|
|
266
|
+
htmlPath = altPath;
|
|
267
|
+
} catch {
|
|
268
|
+
try {
|
|
269
|
+
htmlContent = await import_promises.default.readFile(import_node_path.default.join(distDir, "index.html"), "utf8");
|
|
270
|
+
htmlPath = import_node_path.default.join(distDir, "index.html");
|
|
271
|
+
log.debug(` Using SPA fallback index.html for ${route}`);
|
|
272
|
+
} catch {
|
|
273
|
+
log.warn(` \u26A0\uFE0F No HTML found for ${route}`);
|
|
274
|
+
continue;
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
}
|
|
314
278
|
}
|
|
279
|
+
if (!htmlContent) continue;
|
|
280
|
+
const $ = cheerio.load(htmlContent);
|
|
315
281
|
for (const sel of options.extract.removeSelectors || [])
|
|
316
282
|
$(sel).remove();
|
|
317
283
|
const mainSelectors = Array.isArray(options.extract.mainSelector) ? options.extract.mainSelector : [options.extract.mainSelector];
|
|
@@ -341,112 +307,240 @@ generated_at: ${(/* @__PURE__ */ new Date()).toISOString()}
|
|
|
341
307
|
|
|
342
308
|
` : "";
|
|
343
309
|
await import_promises.default.writeFile(fsPath, frontmatter + markdownBody, "utf8");
|
|
344
|
-
const meta = routeDefs.find((r) => r.path === route);
|
|
345
310
|
captured.push({
|
|
346
311
|
route,
|
|
347
|
-
title: (meta == null ? void 0 : meta.title) || title,
|
|
348
|
-
section: (meta == null ? void 0 : meta.section) || "Pages",
|
|
349
|
-
optional: !!(meta == null ? void 0 : meta.optional),
|
|
350
|
-
notes: meta == null ? void 0 : meta.notes,
|
|
312
|
+
title: rd.title || title,
|
|
313
|
+
section: rd.section || "Pages",
|
|
314
|
+
optional: !!rd.optional,
|
|
315
|
+
notes: rd.notes,
|
|
351
316
|
mdRelPath
|
|
352
317
|
});
|
|
353
318
|
log.info(` \u2705 ${route} -> ${mdRelPath}`);
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
319
|
+
}
|
|
320
|
+
} else {
|
|
321
|
+
const previewServer = await (0, import_vite.preview)({
|
|
322
|
+
root: resolvedConfig.root,
|
|
323
|
+
base: resolvedConfig.base,
|
|
324
|
+
build: { outDir: distDir },
|
|
325
|
+
preview: { port: 0, open: false, host: "127.0.0.1" },
|
|
326
|
+
configFile: false,
|
|
327
|
+
plugins: [],
|
|
328
|
+
logLevel: "silent"
|
|
329
|
+
});
|
|
330
|
+
await new Promise((resolve, reject) => {
|
|
331
|
+
const server = previewServer.httpServer;
|
|
332
|
+
if (server.listening) {
|
|
333
|
+
resolve();
|
|
334
|
+
} else {
|
|
335
|
+
server.once("listening", resolve);
|
|
336
|
+
server.once("error", reject);
|
|
337
|
+
setTimeout(() => reject(new Error("Preview server failed to start")), 5e3);
|
|
338
|
+
}
|
|
339
|
+
});
|
|
340
|
+
const addr = previewServer.httpServer.address();
|
|
341
|
+
if (!addr || typeof addr === "string") {
|
|
342
|
+
await safeCloseHttpServer(previewServer.httpServer);
|
|
343
|
+
throw new Error("LLM Spider: could not determine preview server port");
|
|
344
|
+
}
|
|
345
|
+
const normalizedBase = basePath.endsWith("/") ? basePath : basePath + "/";
|
|
346
|
+
const baseUrl = `http://127.0.0.1:${addr.port}${normalizedBase}`;
|
|
347
|
+
log.debug("Preview server at:", baseUrl);
|
|
348
|
+
const pup = await loadPuppeteer();
|
|
349
|
+
const browser = await pup.launch(options.render.launchOptions);
|
|
350
|
+
const visited = /* @__PURE__ */ new Set();
|
|
351
|
+
const queue = [];
|
|
352
|
+
if ((_b = options.crawl) == null ? void 0 : _b.enabled) {
|
|
353
|
+
for (const seed of options.crawl.seeds || ["/"]) {
|
|
354
|
+
const nr = normalizeRoute(seed, {
|
|
355
|
+
stripQuery: options.crawl.stripQuery
|
|
356
|
+
});
|
|
357
|
+
if (nr) queue.push({ route: nr, depth: 0 });
|
|
358
|
+
}
|
|
359
|
+
} else {
|
|
360
|
+
for (const rd of routeDefs) queue.push({ route: rd.path, depth: 0 });
|
|
361
|
+
}
|
|
362
|
+
const maxDepth = ((_c = options.crawl) == null ? void 0 : _c.enabled) ? options.crawl.maxDepth : 0;
|
|
363
|
+
const maxPages = ((_d = options.crawl) == null ? void 0 : _d.enabled) ? options.crawl.maxPages : queue.length;
|
|
364
|
+
const concurrency = ((_e = options.crawl) == null ? void 0 : _e.enabled) ? options.crawl.concurrency : 3;
|
|
365
|
+
async function captureOne(route) {
|
|
366
|
+
var _a2, _b2, _c2;
|
|
367
|
+
if (visited.has(route)) return;
|
|
368
|
+
if (isExcluded(route)) return;
|
|
369
|
+
if (captured.length >= maxPages) return;
|
|
370
|
+
visited.add(route);
|
|
371
|
+
const page = await browser.newPage();
|
|
372
|
+
if ((_a2 = options.render.blockRequests) == null ? void 0 : _a2.length) {
|
|
373
|
+
await page.setRequestInterception(true);
|
|
374
|
+
page.on("request", (req) => {
|
|
375
|
+
const url = req.url();
|
|
376
|
+
const blocked = options.render.blockRequests.some(
|
|
377
|
+
(p) => p instanceof RegExp ? p.test(url) : url.includes(p)
|
|
378
|
+
);
|
|
379
|
+
if (blocked) req.abort();
|
|
380
|
+
else req.continue();
|
|
381
|
+
});
|
|
382
|
+
}
|
|
383
|
+
try {
|
|
384
|
+
const pageUrl = route === "/" ? baseUrl : baseUrl + route.replace(/^\//, "");
|
|
385
|
+
await options.render.beforeGoto(page, { route });
|
|
386
|
+
await page.goto(pageUrl, {
|
|
387
|
+
waitUntil: options.render.waitUntil,
|
|
388
|
+
timeout: options.render.timeoutMs
|
|
389
|
+
});
|
|
390
|
+
if (options.render.waitForSelector) {
|
|
391
|
+
await page.waitForSelector(options.render.waitForSelector, {
|
|
392
|
+
timeout: options.render.timeoutMs
|
|
358
393
|
});
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
394
|
+
}
|
|
395
|
+
if (options.render.postLoadDelayMs > 0) {
|
|
396
|
+
await new Promise(
|
|
397
|
+
(r) => setTimeout(r, options.render.postLoadDelayMs)
|
|
398
|
+
);
|
|
399
|
+
}
|
|
400
|
+
await options.render.beforeExtract(page, { route });
|
|
401
|
+
const html = await page.content();
|
|
402
|
+
const $ = cheerio.load(html);
|
|
403
|
+
let harvestedHrefs = [];
|
|
404
|
+
if ((_b2 = options.crawl) == null ? void 0 : _b2.enabled) {
|
|
405
|
+
harvestedHrefs = $("a[href]").map((_, a) => $(a).attr("href")).get();
|
|
406
|
+
log.debug(` Found ${harvestedHrefs.length} links on ${route}:`, harvestedHrefs.slice(0, 15));
|
|
407
|
+
}
|
|
408
|
+
for (const sel of options.extract.removeSelectors || [])
|
|
409
|
+
$(sel).remove();
|
|
410
|
+
const mainSelectors = Array.isArray(options.extract.mainSelector) ? options.extract.mainSelector : [options.extract.mainSelector];
|
|
411
|
+
let mainHtml = null;
|
|
412
|
+
for (const sel of mainSelectors) {
|
|
413
|
+
if (!sel) continue;
|
|
414
|
+
const node = $(sel).first();
|
|
415
|
+
if (node && node.length) {
|
|
416
|
+
mainHtml = node.html();
|
|
417
|
+
break;
|
|
364
418
|
}
|
|
365
|
-
|
|
366
|
-
|
|
419
|
+
}
|
|
420
|
+
if (!mainHtml) {
|
|
421
|
+
const main = $("main").first();
|
|
422
|
+
mainHtml = main.length ? main.html() : $("body").html();
|
|
423
|
+
}
|
|
424
|
+
const title = ($("title").text() || "").trim() || route;
|
|
425
|
+
const markdownBody = turndown.turndown(mainHtml || "");
|
|
426
|
+
const mdRelPath = options.output.mode === "subdir" ? import_node_path.default.posix.join(options.output.subdir, routeToMdWebPath(route)) : routeToMdWebPath(route);
|
|
427
|
+
const fsPath = routeToMdFsPath(distDir, route);
|
|
428
|
+
await import_promises.default.mkdir(import_node_path.default.dirname(fsPath), { recursive: true });
|
|
429
|
+
const frontmatter = options.markdown.addFrontmatter ? `---
|
|
430
|
+
source: ${route}
|
|
431
|
+
title: ${title}
|
|
432
|
+
generated_at: ${(/* @__PURE__ */ new Date()).toISOString()}
|
|
433
|
+
---
|
|
434
|
+
|
|
435
|
+
` : "";
|
|
436
|
+
await import_promises.default.writeFile(fsPath, frontmatter + markdownBody, "utf8");
|
|
437
|
+
const meta = routeDefs.find((r) => r.path === route);
|
|
438
|
+
captured.push({
|
|
439
|
+
route,
|
|
440
|
+
title: (meta == null ? void 0 : meta.title) || title,
|
|
441
|
+
section: (meta == null ? void 0 : meta.section) || "Pages",
|
|
442
|
+
optional: !!(meta == null ? void 0 : meta.optional),
|
|
443
|
+
notes: meta == null ? void 0 : meta.notes,
|
|
444
|
+
mdRelPath
|
|
445
|
+
});
|
|
446
|
+
log.info(` \u2705 ${route} -> ${mdRelPath}`);
|
|
447
|
+
if ((_c2 = options.crawl) == null ? void 0 : _c2.enabled) {
|
|
448
|
+
for (const href of harvestedHrefs) {
|
|
449
|
+
const n = normalizeRoute(href, {
|
|
450
|
+
stripQuery: options.crawl.stripQuery
|
|
451
|
+
});
|
|
452
|
+
if (!n) continue;
|
|
453
|
+
let baseRelative = n;
|
|
454
|
+
if (normalizedBase !== "/" && baseRelative.startsWith(normalizedBase)) {
|
|
455
|
+
baseRelative = "/" + baseRelative.slice(normalizedBase.length);
|
|
456
|
+
baseRelative = baseRelative === "//" ? "/" : baseRelative.replace(/\/{2,}/g, "/");
|
|
457
|
+
}
|
|
458
|
+
if (!visited.has(baseRelative) && !isExcluded(baseRelative)) {
|
|
459
|
+
queue.push({ route: baseRelative, depth: -1 });
|
|
460
|
+
}
|
|
367
461
|
}
|
|
368
462
|
}
|
|
463
|
+
} catch (err) {
|
|
464
|
+
log.warn(` \u26A0\uFE0F failed ${route}: ${(err == null ? void 0 : err.message) || err}`);
|
|
465
|
+
} finally {
|
|
466
|
+
await page.close();
|
|
369
467
|
}
|
|
370
|
-
} catch (err) {
|
|
371
|
-
log.warn(` \u26A0\uFE0F failed ${route}: ${(err == null ? void 0 : err.message) || err}`);
|
|
372
|
-
} finally {
|
|
373
|
-
await page.close();
|
|
374
468
|
}
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
469
|
+
try {
|
|
470
|
+
while (queue.length && captured.length < maxPages) {
|
|
471
|
+
const batch = queue.splice(0, concurrency).map((item) => {
|
|
472
|
+
const depth = item.depth >= 0 ? item.depth : 1;
|
|
473
|
+
return { route: item.route, depth };
|
|
474
|
+
});
|
|
475
|
+
await Promise.all(
|
|
476
|
+
batch.map(async ({ route, depth }) => {
|
|
477
|
+
var _a2, _b2;
|
|
478
|
+
if (((_a2 = options.crawl) == null ? void 0 : _a2.enabled) && depth > maxDepth) return;
|
|
479
|
+
await captureOne(route);
|
|
480
|
+
if ((_b2 = options.crawl) == null ? void 0 : _b2.enabled) {
|
|
481
|
+
for (let i = 0; i < queue.length; i++) {
|
|
482
|
+
if (queue[i].depth === -1) queue[i].depth = depth + 1;
|
|
483
|
+
}
|
|
390
484
|
}
|
|
391
|
-
}
|
|
392
|
-
|
|
393
|
-
);
|
|
394
|
-
}
|
|
395
|
-
const llmsTitle = options.output.llmsTitle || ((_f = resolvedConfig == null ? void 0 : resolvedConfig.env) == null ? void 0 : _f.mode) || "Site";
|
|
396
|
-
const items = options.output.sort ? [...captured].sort((a, b) => a.route.localeCompare(b.route)) : captured;
|
|
397
|
-
const bySection = /* @__PURE__ */ new Map();
|
|
398
|
-
const optionalItems = [];
|
|
399
|
-
for (const item of items) {
|
|
400
|
-
if (item.optional) optionalItems.push(item);
|
|
401
|
-
else {
|
|
402
|
-
const s = item.section || "Pages";
|
|
403
|
-
bySection.set(s, [...bySection.get(s) || [], item]);
|
|
485
|
+
})
|
|
486
|
+
);
|
|
404
487
|
}
|
|
488
|
+
} finally {
|
|
489
|
+
await browser.close();
|
|
490
|
+
await safeCloseHttpServer(previewServer.httpServer);
|
|
405
491
|
}
|
|
406
|
-
|
|
492
|
+
}
|
|
493
|
+
const llmsTitle = options.output.llmsTitle || ((_f = resolvedConfig == null ? void 0 : resolvedConfig.env) == null ? void 0 : _f.mode) || "Site";
|
|
494
|
+
const items = options.output.sort ? [...captured].sort((a, b) => a.route.localeCompare(b.route)) : captured;
|
|
495
|
+
const bySection = /* @__PURE__ */ new Map();
|
|
496
|
+
const optionalItems = [];
|
|
497
|
+
for (const item of items) {
|
|
498
|
+
if (item.optional) optionalItems.push(item);
|
|
499
|
+
else {
|
|
500
|
+
const s = item.section || "Pages";
|
|
501
|
+
bySection.set(s, [...bySection.get(s) || [], item]);
|
|
502
|
+
}
|
|
503
|
+
}
|
|
504
|
+
let llms = `# ${llmsTitle}
|
|
407
505
|
|
|
408
506
|
> ${options.output.llmsSummary}
|
|
409
507
|
|
|
410
508
|
`;
|
|
411
|
-
|
|
412
|
-
|
|
509
|
+
for (const [section, sectionItems] of bySection.entries()) {
|
|
510
|
+
llms += `## ${section}
|
|
413
511
|
|
|
414
512
|
`;
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
`;
|
|
421
|
-
}
|
|
422
|
-
llms += `
|
|
513
|
+
for (const it of sectionItems) {
|
|
514
|
+
const link = makeLlmsLink(it.mdRelPath);
|
|
515
|
+
const label = it.title || it.route;
|
|
516
|
+
const notes = it.notes ? `: ${it.notes}` : "";
|
|
517
|
+
llms += `- [${label}](${link})${notes}
|
|
423
518
|
`;
|
|
424
519
|
}
|
|
425
|
-
|
|
426
|
-
llms += `## Optional
|
|
427
|
-
|
|
520
|
+
llms += `
|
|
428
521
|
`;
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
llms += `- [${label}](${link})${notes}
|
|
522
|
+
}
|
|
523
|
+
if (optionalItems.length) {
|
|
524
|
+
llms += `## Optional
|
|
525
|
+
|
|
434
526
|
`;
|
|
435
|
-
|
|
436
|
-
|
|
527
|
+
for (const it of optionalItems) {
|
|
528
|
+
const link = makeLlmsLink(it.mdRelPath);
|
|
529
|
+
const label = it.title || it.route;
|
|
530
|
+
const notes = it.notes ? `: ${it.notes}` : "";
|
|
531
|
+
llms += `- [${label}](${link})${notes}
|
|
437
532
|
`;
|
|
438
533
|
}
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
534
|
+
llms += `
|
|
535
|
+
`;
|
|
536
|
+
}
|
|
537
|
+
const llmsPath = import_node_path.default.join(distDir, options.output.llmsTxtFileName);
|
|
538
|
+
await import_promises.default.writeFile(llmsPath, llms, "utf8");
|
|
539
|
+
log.info(
|
|
540
|
+
`
|
|
443
541
|
LLM Spider: wrote ${captured.length} markdown pages + ${options.output.llmsTxtFileName}
|
|
444
542
|
`
|
|
445
|
-
|
|
446
|
-
} finally {
|
|
447
|
-
await browser.close();
|
|
448
|
-
await safeCloseHttpServer(previewServer.httpServer);
|
|
449
|
-
}
|
|
543
|
+
);
|
|
450
544
|
}
|
|
451
545
|
};
|
|
452
546
|
}
|