@happyalienai/vite-plugin-llm-spider 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5,7 +5,13 @@ import path from "path";
5
5
  import * as cheerio from "cheerio";
6
6
  import TurndownService from "turndown";
7
7
  import { gfm } from "turndown-plugin-gfm";
8
- import puppeteer from "puppeteer";
8
// Cached module namespace for "puppeteer"; stays null until the first
// successful loadPuppeteer() call.
let puppeteer = null;

/**
 * Load Puppeteer on first use instead of at module import time.
 *
 * The module namespace is cached after the first successful import, so later
 * calls do not import again. A failed import is not cached, which lets a
 * later call retry.
 *
 * @returns {Promise<object>} The module's default export when it has one,
 *   otherwise the module namespace itself.
 * @throws {Error} If the "puppeteer" package cannot be resolved; the original
 *   resolution error is attached as `cause`. Any other import failure is
 *   rethrown unchanged.
 */
async function loadPuppeteer() {
  if (!puppeteer) {
    try {
      puppeteer = await import("puppeteer");
    } catch (err) {
      const code = err == null ? void 0 : err.code;
      // Only translate "module not found"; anything else (for example an
      // exception thrown while puppeteer itself initialises) must surface as-is.
      if (code === "ERR_MODULE_NOT_FOUND" || code === "MODULE_NOT_FOUND") {
        throw new Error(
          'LLM Spider: browser rendering requires the "puppeteer" package, which could not be loaded. Install it (e.g. `npm install -D puppeteer`) or set `static: true` to read HTML files without a browser.',
          { cause: err }
        );
      }
      throw err;
    }
  }
  return puppeteer.default || puppeteer;
}
9
15
  function llmSpiderPlugin(userOptions = {}) {
10
16
  let resolvedConfig;
11
17
  function deepMerge(target, source) {
@@ -21,6 +27,11 @@ function llmSpiderPlugin(userOptions = {}) {
21
27
  }
22
28
  const defaults = {
23
29
  enabled: true,
30
+ // Static mode: read HTML files directly from dist/ without browser
31
+ // - true: always use static mode (no Puppeteer)
32
+ // - false: always use browser rendering
33
+ // - "auto" (default): use static when crawl is disabled, browser when crawl is enabled
34
+ static: "auto",
24
35
  // Recommended: explicit list
25
36
  routes: (
26
37
  /** @type {RouteDef[] | undefined} */
@@ -151,6 +162,11 @@ function llmSpiderPlugin(userOptions = {}) {
151
162
  }
152
163
  return path.join(distDir, rel);
153
164
  }
165
/**
 * Map a route to the HTML file path it would have inside the build output.
 *
 *   "/"          -> <distDir>/index.html
 *   "/docs/"     -> <distDir>/docs/index.html
 *   "/about"     -> <distDir>/about.html
 *   "/page.html" -> <distDir>/page.html   (extension is not doubled)
 *
 * @param {string} distDir - Build output directory.
 * @param {string} route - Route path. A missing leading slash is tolerated and
 *   an empty route is treated like "/".
 * @returns {string} Candidate file path; this function does not check that
 *   the file exists.
 */
function routeToHtmlFsPath(distDir, route) {
  // Strip the leading slash only when present, so "about" and "/about"
  // resolve to the same file instead of losing the first character.
  const rel = route.startsWith("/") ? route.slice(1) : route;
  if (rel === "") return path.join(distDir, "index.html");
  if (rel.endsWith("/")) return path.join(distDir, rel, "index.html");
  // A route that already names an .html file maps to that file directly.
  if (rel.endsWith(".html")) return path.join(distDir, rel);
  return path.join(distDir, rel + ".html");
}
154
170
  function makeLlmsLink(relMdPath) {
155
171
  return relMdPath.replace(/\\/g, "/");
156
172
  }
@@ -159,6 +175,12 @@ function llmSpiderPlugin(userOptions = {}) {
159
175
  server.close((err) => err ? reject(err) : resolve());
160
176
  });
161
177
  }
178
/**
 * Decide between static mode and browser mode from the plugin options.
 *
 * A boolean `options.static` is returned as-is. Any other value falls back
 * to the crawl setting: static mode is chosen unless `options.crawl.enabled`
 * is truthy.
 *
 * @returns {boolean} true for static mode, false for browser mode.
 */
function shouldUseStaticMode() {
  const mode = options.static;
  if (typeof mode === "boolean") return mode;
  const crawl = options.crawl;
  // A missing crawl config counts as "crawl disabled".
  return !(crawl != null && crawl.enabled);
}
162
184
  return {
163
185
  name: "vite-plugin-llm-spider",
164
186
  apply: "build",
@@ -172,6 +194,7 @@ function llmSpiderPlugin(userOptions = {}) {
172
194
  throw new Error("LLM Spider: missing resolved Vite config");
173
195
  const distDir = resolvedConfig.build.outDir || "dist";
174
196
  const basePath = (resolvedConfig.base || "/").replace(/\\/g, "/");
197
+ const useStaticMode = shouldUseStaticMode();
175
198
  let routeDefs = [];
176
199
  if (Array.isArray(options.routes) && options.routes.length) {
177
200
  routeDefs = options.routes.map((r) => ({
@@ -186,98 +209,41 @@ function llmSpiderPlugin(userOptions = {}) {
186
209
  } else {
187
210
  routeDefs = [{ path: "/", section: "Pages" }];
188
211
  }
189
- log.info("\nLLM Spider: generating markdown + llms.txt");
212
+ log.info(`
213
+ LLM Spider: generating markdown + llms.txt (${useStaticMode ? "static" : "browser"} mode)`);
190
214
  log.debug("distDir:", distDir, "base:", basePath);
191
- const previewServer = await preview({
192
- root: resolvedConfig.root,
193
- base: resolvedConfig.base,
194
- build: { outDir: distDir },
195
- preview: { port: 0, open: false, host: "127.0.0.1" },
196
- configFile: false,
197
- plugins: [],
198
- // avoid loading user plugins again
199
- logLevel: "silent"
200
- });
201
- await new Promise((resolve, reject) => {
202
- const server = previewServer.httpServer;
203
- if (server.listening) {
204
- resolve();
205
- } else {
206
- server.once("listening", resolve);
207
- server.once("error", reject);
208
- setTimeout(() => reject(new Error("Preview server failed to start")), 5e3);
209
- }
210
- });
211
- const addr = previewServer.httpServer.address();
212
- if (!addr || typeof addr === "string") {
213
- await safeCloseHttpServer(previewServer.httpServer);
214
- throw new Error("LLM Spider: could not determine preview server port");
215
- }
216
- const normalizedBase = basePath.endsWith("/") ? basePath : basePath + "/";
217
- const baseUrl = `http://127.0.0.1:${addr.port}${normalizedBase}`;
218
- log.debug("Preview server at:", baseUrl);
219
- const browser = await puppeteer.launch(options.render.launchOptions);
220
215
  const turndown = new TurndownService(options.markdown.turndown);
221
216
  turndown.use(gfm);
222
- const visited = /* @__PURE__ */ new Set();
223
217
  const captured = [];
224
- const queue = [];
225
- if ((_b = options.crawl) == null ? void 0 : _b.enabled) {
226
- for (const seed of options.crawl.seeds || ["/"]) {
227
- const nr = normalizeRoute(seed, {
228
- stripQuery: options.crawl.stripQuery
229
- });
230
- if (nr) queue.push({ route: nr, depth: 0 });
231
- }
232
- } else {
233
- for (const rd of routeDefs) queue.push({ route: rd.path, depth: 0 });
234
- }
235
- const maxDepth = ((_c = options.crawl) == null ? void 0 : _c.enabled) ? options.crawl.maxDepth : 0;
236
- const maxPages = ((_d = options.crawl) == null ? void 0 : _d.enabled) ? options.crawl.maxPages : queue.length;
237
- const concurrency = ((_e = options.crawl) == null ? void 0 : _e.enabled) ? options.crawl.concurrency : 3;
238
- async function captureOne(route) {
239
- var _a2, _b2, _c2;
240
- if (visited.has(route)) return;
241
- if (isExcluded(route)) return;
242
- if (captured.length >= maxPages) return;
243
- visited.add(route);
244
- const page = await browser.newPage();
245
- if ((_a2 = options.render.blockRequests) == null ? void 0 : _a2.length) {
246
- await page.setRequestInterception(true);
247
- page.on("request", (req) => {
248
- const url = req.url();
249
- const blocked = options.render.blockRequests.some(
250
- (p) => p instanceof RegExp ? p.test(url) : url.includes(p)
251
- );
252
- if (blocked) req.abort();
253
- else req.continue();
254
- });
255
- }
256
- try {
257
- const pageUrl = route === "/" ? baseUrl : baseUrl + route.replace(/^\//, "");
258
- await options.render.beforeGoto(page, { route });
259
- await page.goto(pageUrl, {
260
- waitUntil: options.render.waitUntil,
261
- timeout: options.render.timeoutMs
262
- });
263
- if (options.render.waitForSelector) {
264
- await page.waitForSelector(options.render.waitForSelector, {
265
- timeout: options.render.timeoutMs
266
- });
267
- }
268
- if (options.render.postLoadDelayMs > 0) {
269
- await new Promise(
270
- (r) => setTimeout(r, options.render.postLoadDelayMs)
271
- );
272
- }
273
- await options.render.beforeExtract(page, { route });
274
- const html = await page.content();
275
- const $ = cheerio.load(html);
276
- let harvestedHrefs = [];
277
- if ((_b2 = options.crawl) == null ? void 0 : _b2.enabled) {
278
- harvestedHrefs = $("a[href]").map((_, a) => $(a).attr("href")).get();
279
- log.debug(` Found ${harvestedHrefs.length} links on ${route}:`, harvestedHrefs.slice(0, 15));
218
+ if (useStaticMode) {
219
+ log.debug("Using static mode - reading HTML files directly from dist/");
220
+ for (const rd of routeDefs) {
221
+ const route = rd.path;
222
+ if (isExcluded(route)) continue;
223
+ let htmlPath = routeToHtmlFsPath(distDir, route);
224
+ let htmlContent = null;
225
+ try {
226
+ htmlContent = await fs.readFile(htmlPath, "utf8");
227
+ } catch {
228
+ if (!route.endsWith("/") && route !== "/") {
229
+ const altPath = path.join(distDir, route.slice(1), "index.html");
230
+ try {
231
+ htmlContent = await fs.readFile(altPath, "utf8");
232
+ htmlPath = altPath;
233
+ } catch {
234
+ try {
235
+ htmlContent = await fs.readFile(path.join(distDir, "index.html"), "utf8");
236
+ htmlPath = path.join(distDir, "index.html");
237
+ log.debug(` Using SPA fallback index.html for ${route}`);
238
+ } catch {
239
+ log.warn(` \u26A0\uFE0F No HTML found for ${route}`);
240
+ continue;
241
+ }
242
+ }
243
+ }
280
244
  }
245
+ if (!htmlContent) continue;
246
+ const $ = cheerio.load(htmlContent);
281
247
  for (const sel of options.extract.removeSelectors || [])
282
248
  $(sel).remove();
283
249
  const mainSelectors = Array.isArray(options.extract.mainSelector) ? options.extract.mainSelector : [options.extract.mainSelector];
@@ -307,112 +273,240 @@ generated_at: ${(/* @__PURE__ */ new Date()).toISOString()}
307
273
 
308
274
  ` : "";
309
275
  await fs.writeFile(fsPath, frontmatter + markdownBody, "utf8");
310
- const meta = routeDefs.find((r) => r.path === route);
311
276
  captured.push({
312
277
  route,
313
- title: (meta == null ? void 0 : meta.title) || title,
314
- section: (meta == null ? void 0 : meta.section) || "Pages",
315
- optional: !!(meta == null ? void 0 : meta.optional),
316
- notes: meta == null ? void 0 : meta.notes,
278
+ title: rd.title || title,
279
+ section: rd.section || "Pages",
280
+ optional: !!rd.optional,
281
+ notes: rd.notes,
317
282
  mdRelPath
318
283
  });
319
284
  log.info(` \u2705 ${route} -> ${mdRelPath}`);
320
- if ((_c2 = options.crawl) == null ? void 0 : _c2.enabled) {
321
- for (const href of harvestedHrefs) {
322
- const n = normalizeRoute(href, {
323
- stripQuery: options.crawl.stripQuery
285
+ }
286
+ } else {
287
+ const previewServer = await preview({
288
+ root: resolvedConfig.root,
289
+ base: resolvedConfig.base,
290
+ build: { outDir: distDir },
291
+ preview: { port: 0, open: false, host: "127.0.0.1" },
292
+ configFile: false,
293
+ plugins: [],
294
+ logLevel: "silent"
295
+ });
296
+ await new Promise((resolve, reject) => {
297
+ const server = previewServer.httpServer;
298
+ if (server.listening) {
299
+ resolve();
300
+ } else {
301
+ server.once("listening", resolve);
302
+ server.once("error", reject);
303
+ setTimeout(() => reject(new Error("Preview server failed to start")), 5e3);
304
+ }
305
+ });
306
+ const addr = previewServer.httpServer.address();
307
+ if (!addr || typeof addr === "string") {
308
+ await safeCloseHttpServer(previewServer.httpServer);
309
+ throw new Error("LLM Spider: could not determine preview server port");
310
+ }
311
+ const normalizedBase = basePath.endsWith("/") ? basePath : basePath + "/";
312
+ const baseUrl = `http://127.0.0.1:${addr.port}${normalizedBase}`;
313
+ log.debug("Preview server at:", baseUrl);
314
+ const pup = await loadPuppeteer();
315
+ const browser = await pup.launch(options.render.launchOptions);
316
+ const visited = /* @__PURE__ */ new Set();
317
+ const queue = [];
318
+ if ((_b = options.crawl) == null ? void 0 : _b.enabled) {
319
+ for (const seed of options.crawl.seeds || ["/"]) {
320
+ const nr = normalizeRoute(seed, {
321
+ stripQuery: options.crawl.stripQuery
322
+ });
323
+ if (nr) queue.push({ route: nr, depth: 0 });
324
+ }
325
+ } else {
326
+ for (const rd of routeDefs) queue.push({ route: rd.path, depth: 0 });
327
+ }
328
+ const maxDepth = ((_c = options.crawl) == null ? void 0 : _c.enabled) ? options.crawl.maxDepth : 0;
329
+ const maxPages = ((_d = options.crawl) == null ? void 0 : _d.enabled) ? options.crawl.maxPages : queue.length;
330
+ const concurrency = ((_e = options.crawl) == null ? void 0 : _e.enabled) ? options.crawl.concurrency : 3;
331
+ async function captureOne(route) {
332
+ var _a2, _b2, _c2;
333
+ if (visited.has(route)) return;
334
+ if (isExcluded(route)) return;
335
+ if (captured.length >= maxPages) return;
336
+ visited.add(route);
337
+ const page = await browser.newPage();
338
+ if ((_a2 = options.render.blockRequests) == null ? void 0 : _a2.length) {
339
+ await page.setRequestInterception(true);
340
+ page.on("request", (req) => {
341
+ const url = req.url();
342
+ const blocked = options.render.blockRequests.some(
343
+ (p) => p instanceof RegExp ? p.test(url) : url.includes(p)
344
+ );
345
+ if (blocked) req.abort();
346
+ else req.continue();
347
+ });
348
+ }
349
+ try {
350
+ const pageUrl = route === "/" ? baseUrl : baseUrl + route.replace(/^\//, "");
351
+ await options.render.beforeGoto(page, { route });
352
+ await page.goto(pageUrl, {
353
+ waitUntil: options.render.waitUntil,
354
+ timeout: options.render.timeoutMs
355
+ });
356
+ if (options.render.waitForSelector) {
357
+ await page.waitForSelector(options.render.waitForSelector, {
358
+ timeout: options.render.timeoutMs
324
359
  });
325
- if (!n) continue;
326
- let baseRelative = n;
327
- if (normalizedBase !== "/" && baseRelative.startsWith(normalizedBase)) {
328
- baseRelative = "/" + baseRelative.slice(normalizedBase.length);
329
- baseRelative = baseRelative === "//" ? "/" : baseRelative.replace(/\/{2,}/g, "/");
360
+ }
361
+ if (options.render.postLoadDelayMs > 0) {
362
+ await new Promise(
363
+ (r) => setTimeout(r, options.render.postLoadDelayMs)
364
+ );
365
+ }
366
+ await options.render.beforeExtract(page, { route });
367
+ const html = await page.content();
368
+ const $ = cheerio.load(html);
369
+ let harvestedHrefs = [];
370
+ if ((_b2 = options.crawl) == null ? void 0 : _b2.enabled) {
371
+ harvestedHrefs = $("a[href]").map((_, a) => $(a).attr("href")).get();
372
+ log.debug(` Found ${harvestedHrefs.length} links on ${route}:`, harvestedHrefs.slice(0, 15));
373
+ }
374
+ for (const sel of options.extract.removeSelectors || [])
375
+ $(sel).remove();
376
+ const mainSelectors = Array.isArray(options.extract.mainSelector) ? options.extract.mainSelector : [options.extract.mainSelector];
377
+ let mainHtml = null;
378
+ for (const sel of mainSelectors) {
379
+ if (!sel) continue;
380
+ const node = $(sel).first();
381
+ if (node && node.length) {
382
+ mainHtml = node.html();
383
+ break;
330
384
  }
331
- if (!visited.has(baseRelative) && !isExcluded(baseRelative)) {
332
- queue.push({ route: baseRelative, depth: -1 });
385
+ }
386
+ if (!mainHtml) {
387
+ const main = $("main").first();
388
+ mainHtml = main.length ? main.html() : $("body").html();
389
+ }
390
+ const title = ($("title").text() || "").trim() || route;
391
+ const markdownBody = turndown.turndown(mainHtml || "");
392
+ const mdRelPath = options.output.mode === "subdir" ? path.posix.join(options.output.subdir, routeToMdWebPath(route)) : routeToMdWebPath(route);
393
+ const fsPath = routeToMdFsPath(distDir, route);
394
+ await fs.mkdir(path.dirname(fsPath), { recursive: true });
395
+ const frontmatter = options.markdown.addFrontmatter ? `---
396
+ source: ${route}
397
+ title: ${title}
398
+ generated_at: ${(/* @__PURE__ */ new Date()).toISOString()}
399
+ ---
400
+
401
+ ` : "";
402
+ await fs.writeFile(fsPath, frontmatter + markdownBody, "utf8");
403
+ const meta = routeDefs.find((r) => r.path === route);
404
+ captured.push({
405
+ route,
406
+ title: (meta == null ? void 0 : meta.title) || title,
407
+ section: (meta == null ? void 0 : meta.section) || "Pages",
408
+ optional: !!(meta == null ? void 0 : meta.optional),
409
+ notes: meta == null ? void 0 : meta.notes,
410
+ mdRelPath
411
+ });
412
+ log.info(` \u2705 ${route} -> ${mdRelPath}`);
413
+ if ((_c2 = options.crawl) == null ? void 0 : _c2.enabled) {
414
+ for (const href of harvestedHrefs) {
415
+ const n = normalizeRoute(href, {
416
+ stripQuery: options.crawl.stripQuery
417
+ });
418
+ if (!n) continue;
419
+ let baseRelative = n;
420
+ if (normalizedBase !== "/" && baseRelative.startsWith(normalizedBase)) {
421
+ baseRelative = "/" + baseRelative.slice(normalizedBase.length);
422
+ baseRelative = baseRelative === "//" ? "/" : baseRelative.replace(/\/{2,}/g, "/");
423
+ }
424
+ if (!visited.has(baseRelative) && !isExcluded(baseRelative)) {
425
+ queue.push({ route: baseRelative, depth: -1 });
426
+ }
333
427
  }
334
428
  }
429
+ } catch (err) {
430
+ log.warn(` \u26A0\uFE0F failed ${route}: ${(err == null ? void 0 : err.message) || err}`);
431
+ } finally {
432
+ await page.close();
335
433
  }
336
- } catch (err) {
337
- log.warn(` \u26A0\uFE0F failed ${route}: ${(err == null ? void 0 : err.message) || err}`);
338
- } finally {
339
- await page.close();
340
434
  }
341
- }
342
- try {
343
- while (queue.length && captured.length < maxPages) {
344
- const batch = queue.splice(0, concurrency).map((item) => {
345
- const depth = item.depth >= 0 ? item.depth : 1;
346
- return { route: item.route, depth };
347
- });
348
- await Promise.all(
349
- batch.map(async ({ route, depth }) => {
350
- var _a2, _b2;
351
- if (((_a2 = options.crawl) == null ? void 0 : _a2.enabled) && depth > maxDepth) return;
352
- await captureOne(route);
353
- if ((_b2 = options.crawl) == null ? void 0 : _b2.enabled) {
354
- for (let i = 0; i < queue.length; i++) {
355
- if (queue[i].depth === -1) queue[i].depth = depth + 1;
435
+ try {
436
+ while (queue.length && captured.length < maxPages) {
437
+ const batch = queue.splice(0, concurrency).map((item) => {
438
+ const depth = item.depth >= 0 ? item.depth : 1;
439
+ return { route: item.route, depth };
440
+ });
441
+ await Promise.all(
442
+ batch.map(async ({ route, depth }) => {
443
+ var _a2, _b2;
444
+ if (((_a2 = options.crawl) == null ? void 0 : _a2.enabled) && depth > maxDepth) return;
445
+ await captureOne(route);
446
+ if ((_b2 = options.crawl) == null ? void 0 : _b2.enabled) {
447
+ for (let i = 0; i < queue.length; i++) {
448
+ if (queue[i].depth === -1) queue[i].depth = depth + 1;
449
+ }
356
450
  }
357
- }
358
- })
359
- );
360
- }
361
- const llmsTitle = options.output.llmsTitle || ((_f = resolvedConfig == null ? void 0 : resolvedConfig.env) == null ? void 0 : _f.mode) || "Site";
362
- const items = options.output.sort ? [...captured].sort((a, b) => a.route.localeCompare(b.route)) : captured;
363
- const bySection = /* @__PURE__ */ new Map();
364
- const optionalItems = [];
365
- for (const item of items) {
366
- if (item.optional) optionalItems.push(item);
367
- else {
368
- const s = item.section || "Pages";
369
- bySection.set(s, [...bySection.get(s) || [], item]);
451
+ })
452
+ );
370
453
  }
454
+ } finally {
455
+ await browser.close();
456
+ await safeCloseHttpServer(previewServer.httpServer);
371
457
  }
372
- let llms = `# ${llmsTitle}
458
+ }
459
+ const llmsTitle = options.output.llmsTitle || ((_f = resolvedConfig == null ? void 0 : resolvedConfig.env) == null ? void 0 : _f.mode) || "Site";
460
+ const items = options.output.sort ? [...captured].sort((a, b) => a.route.localeCompare(b.route)) : captured;
461
+ const bySection = /* @__PURE__ */ new Map();
462
+ const optionalItems = [];
463
+ for (const item of items) {
464
+ if (item.optional) optionalItems.push(item);
465
+ else {
466
+ const s = item.section || "Pages";
467
+ bySection.set(s, [...bySection.get(s) || [], item]);
468
+ }
469
+ }
470
+ let llms = `# ${llmsTitle}
373
471
 
374
472
  > ${options.output.llmsSummary}
375
473
 
376
474
  `;
377
- for (const [section, sectionItems] of bySection.entries()) {
378
- llms += `## ${section}
475
+ for (const [section, sectionItems] of bySection.entries()) {
476
+ llms += `## ${section}
379
477
 
380
478
  `;
381
- for (const it of sectionItems) {
382
- const link = makeLlmsLink(it.mdRelPath);
383
- const label = it.title || it.route;
384
- const notes = it.notes ? `: ${it.notes}` : "";
385
- llms += `- [${label}](${link})${notes}
386
- `;
387
- }
388
- llms += `
479
+ for (const it of sectionItems) {
480
+ const link = makeLlmsLink(it.mdRelPath);
481
+ const label = it.title || it.route;
482
+ const notes = it.notes ? `: ${it.notes}` : "";
483
+ llms += `- [${label}](${link})${notes}
389
484
  `;
390
485
  }
391
- if (optionalItems.length) {
392
- llms += `## Optional
393
-
486
+ llms += `
394
487
  `;
395
- for (const it of optionalItems) {
396
- const link = makeLlmsLink(it.mdRelPath);
397
- const label = it.title || it.route;
398
- const notes = it.notes ? `: ${it.notes}` : "";
399
- llms += `- [${label}](${link})${notes}
488
+ }
489
+ if (optionalItems.length) {
490
+ llms += `## Optional
491
+
400
492
  `;
401
- }
402
- llms += `
493
+ for (const it of optionalItems) {
494
+ const link = makeLlmsLink(it.mdRelPath);
495
+ const label = it.title || it.route;
496
+ const notes = it.notes ? `: ${it.notes}` : "";
497
+ llms += `- [${label}](${link})${notes}
403
498
  `;
404
499
  }
405
- const llmsPath = path.join(distDir, options.output.llmsTxtFileName);
406
- await fs.writeFile(llmsPath, llms, "utf8");
407
- log.info(
408
- `
500
+ llms += `
501
+ `;
502
+ }
503
+ const llmsPath = path.join(distDir, options.output.llmsTxtFileName);
504
+ await fs.writeFile(llmsPath, llms, "utf8");
505
+ log.info(
506
+ `
409
507
  LLM Spider: wrote ${captured.length} markdown pages + ${options.output.llmsTxtFileName}
410
508
  `
411
- );
412
- } finally {
413
- await browser.close();
414
- await safeCloseHttpServer(previewServer.httpServer);
415
- }
509
+ );
416
510
  }
417
511
  };
418
512
  }