@happyalienai/vite-plugin-llm-spider 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +225 -0
- package/dist/index.cjs +457 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +111 -0
- package/dist/index.d.ts +111 -0
- package/dist/index.js +423 -0
- package/dist/index.js.map +1 -0
- package/package.json +68 -0
package/README.md
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# vite-plugin-llm-spider
|
|
2
|
+
|
|
3
|
+
> **Built by [Happy Alien AI](https://happyalien.ai)** — AI-powered tools for eLearning creators.
|
|
4
|
+
|
|
5
|
+
A Vite build plugin that generates **LLM-friendly Markdown snapshots** of selected public routes and publishes a curated index at **`/llms.txt`**.
|
|
6
|
+
|
|
7
|
+
Makes SPAs and content-heavy Vite apps easier for AI agents/tools to understand by providing clean, low-noise text renditions plus a deterministic index.
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- 🕷️ **Two discovery modes:** explicit route list (recommended) or controlled BFS crawl
|
|
12
|
+
- 📝 **Markdown output:** clean, readable `.md` files following the [llms.txt spec](https://llmstxt.org/)
|
|
13
|
+
- 🧹 **Noise removal:** strips nav, footer, modals, cookie banners, etc.
|
|
14
|
+
- 🔒 **Safe by default:** explicit excludes, no auth pages by accident
|
|
15
|
+
- ⚡ **Works with any Vite framework:** Vue, React, Svelte, Solid, etc.
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
npm i -D @happyalienai/vite-plugin-llm-spider
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
```js
|
|
26
|
+
// vite.config.js
|
|
27
|
+
import { defineConfig } from "vite";
|
|
28
|
+
import llmSpider from "@happyalienai/vite-plugin-llm-spider";
|
|
29
|
+
|
|
30
|
+
export default defineConfig({
|
|
31
|
+
plugins: [
|
|
32
|
+
llmSpider({
|
|
33
|
+
routes: [
|
|
34
|
+
{ path: "/", title: "Home", section: "Product" },
|
|
35
|
+
{ path: "/pricing", title: "Pricing", section: "Product" },
|
|
36
|
+
{ path: "/docs/", title: "Docs", section: "Docs", optional: true },
|
|
37
|
+
],
|
|
38
|
+
exclude: ["/login", "/admin", "/account"],
|
|
39
|
+
render: {
|
|
40
|
+
waitForSelector: "main",
|
|
41
|
+
},
|
|
42
|
+
}),
|
|
43
|
+
],
|
|
44
|
+
});
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
After `npm run build`, you'll get:
|
|
48
|
+
|
|
49
|
+
- `dist/llms.txt` — curated index
|
|
50
|
+
- `dist/index.html.md` — home page
|
|
51
|
+
- `dist/pricing.md` — pricing page
|
|
52
|
+
- `dist/docs/index.html.md` — docs page
|
|
53
|
+
|
|
54
|
+
## Output Format
|
|
55
|
+
|
|
56
|
+
The generated `llms.txt` follows the [llmstxt.org](https://llmstxt.org/) spec:
|
|
57
|
+
|
|
58
|
+
```markdown
|
|
59
|
+
# My Site
|
|
60
|
+
|
|
61
|
+
> LLM-friendly index of important pages and their Markdown equivalents.
|
|
62
|
+
|
|
63
|
+
## Product
|
|
64
|
+
|
|
65
|
+
- [Home](index.html.md)
|
|
66
|
+
- [Pricing](pricing.md)
|
|
67
|
+
|
|
68
|
+
## Optional
|
|
69
|
+
|
|
70
|
+
- [Docs](docs/index.html.md)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Configuration
|
|
74
|
+
|
|
75
|
+
### Route Definitions
|
|
76
|
+
|
|
77
|
+
```js
|
|
78
|
+
routes: [
|
|
79
|
+
{
|
|
80
|
+
path: "/pricing", // URL path (required)
|
|
81
|
+
title: "Pricing", // Display title in llms.txt
|
|
82
|
+
section: "Product", // H2 section grouping
|
|
83
|
+
optional: false, // If true, goes under "## Optional"
|
|
84
|
+
notes: "Updated weekly" // Appended to link in llms.txt
|
|
85
|
+
}
|
|
86
|
+
]
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Crawl Mode (opt-in)
|
|
90
|
+
|
|
91
|
+
```js
|
|
92
|
+
llmSpider({
|
|
93
|
+
crawl: {
|
|
94
|
+
enabled: true,
|
|
95
|
+
seeds: ["/"],
|
|
96
|
+
maxDepth: 2,
|
|
97
|
+
maxPages: 50,
|
|
98
|
+
concurrency: 3,
|
|
99
|
+
},
|
|
100
|
+
exclude: ["/login", "/admin"],
|
|
101
|
+
})
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Rendering Options
|
|
105
|
+
|
|
106
|
+
```js
|
|
107
|
+
render: {
|
|
108
|
+
waitUntil: "networkidle2", // Puppeteer wait strategy
|
|
109
|
+
timeoutMs: 30_000, // Page load timeout
|
|
110
|
+
waitForSelector: "main", // Wait for element before extracting
|
|
111
|
+
postLoadDelayMs: 200, // Extra delay after load
|
|
112
|
+
blockRequests: [ // Block analytics/trackers
|
|
113
|
+
/google-analytics\.com/i,
|
|
114
|
+
/hotjar\.com/i,
|
|
115
|
+
],
|
|
116
|
+
launchOptions: { // Puppeteer launch options
|
|
117
|
+
headless: "new",
|
|
118
|
+
args: ["--no-sandbox"], // For CI/Docker
|
|
119
|
+
},
|
|
120
|
+
}
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Extraction Options
|
|
124
|
+
|
|
125
|
+
```js
|
|
126
|
+
extract: {
|
|
127
|
+
mainSelector: ["main", "#content", "[data-main]"], // Content selectors (first match wins)
|
|
128
|
+
removeSelectors: [ // Elements to strip
|
|
129
|
+
"nav", "header", "footer", "svg", ".modal", ".cookie-banner"
|
|
130
|
+
],
|
|
131
|
+
}
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Output Options
|
|
135
|
+
|
|
136
|
+
```js
|
|
137
|
+
output: {
|
|
138
|
+
mode: "sibling", // "sibling" (default) or "subdir"
|
|
139
|
+
subdir: "ai", // Subdir name when mode="subdir"
|
|
140
|
+
llmsTxtFileName: "llms.txt", // Index filename
|
|
141
|
+
llmsTitle: "My App", // H1 title
|
|
142
|
+
llmsSummary: "AI-friendly pages", // Summary blockquote
|
|
143
|
+
sort: true, // Alphabetical ordering
|
|
144
|
+
}
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### Markdown Options
|
|
148
|
+
|
|
149
|
+
```js
|
|
150
|
+
markdown: {
|
|
151
|
+
addFrontmatter: true, // Add YAML frontmatter with source/title/date
|
|
152
|
+
turndown: { // Turndown options
|
|
153
|
+
headingStyle: "atx",
|
|
154
|
+
codeBlockStyle: "fenced",
|
|
155
|
+
},
|
|
156
|
+
}
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## URL Mapping
|
|
160
|
+
|
|
161
|
+
Following the llms.txt spec:
|
|
162
|
+
|
|
163
|
+
| Route | Output File |
|
|
164
|
+
|-------|-------------|
|
|
165
|
+
| `/` | `index.html.md` |
|
|
166
|
+
| `/pricing` | `pricing.md` |
|
|
167
|
+
| `/docs/` | `docs/index.html.md` |
|
|
168
|
+
| `/docs/api` | `docs/api.md` |
|
|
169
|
+
|
|
170
|
+
## Hooks
|
|
171
|
+
|
|
172
|
+
```js
|
|
173
|
+
render: {
|
|
174
|
+
async beforeGoto(page, { route }) {
|
|
175
|
+
// Inject auth token for protected pages (use carefully!)
|
|
176
|
+
await page.evaluateOnNewDocument(() => {
|
|
177
|
+
localStorage.setItem("token", "dev-token");
|
|
178
|
+
});
|
|
179
|
+
},
|
|
180
|
+
async beforeExtract(page, { route }) {
|
|
181
|
+
// Custom cleanup before extraction
|
|
182
|
+
},
|
|
183
|
+
}
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## CI/Docker
|
|
187
|
+
|
|
188
|
+
For headless environments:
|
|
189
|
+
|
|
190
|
+
```js
|
|
191
|
+
render: {
|
|
192
|
+
launchOptions: {
|
|
193
|
+
headless: "new",
|
|
194
|
+
args: ["--no-sandbox", "--disable-setuid-sandbox"],
|
|
195
|
+
},
|
|
196
|
+
}
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## Troubleshooting
|
|
200
|
+
|
|
201
|
+
### Timeouts
|
|
202
|
+
- Use `waitForSelector: "main"` instead of relying on `networkidle`
|
|
203
|
+
- Increase `timeoutMs` or add `postLoadDelayMs`
|
|
204
|
+
|
|
205
|
+
### Output is mostly nav/footer
|
|
206
|
+
- Tighten `mainSelector` to your content wrapper
|
|
207
|
+
- Add more `removeSelectors`
|
|
208
|
+
|
|
209
|
+
### CI fails to launch browser
|
|
210
|
+
- Add `--no-sandbox` to launch args
|
|
211
|
+
- Ensure Puppeteer dependencies are installed
|
|
212
|
+
|
|
213
|
+
## License
|
|
214
|
+
|
|
215
|
+
MIT
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
<p align="center">
|
|
220
|
+
<a href="https://happyalien.ai">
|
|
221
|
+
<strong>Happy Alien AI</strong>
|
|
222
|
+
</a>
|
|
223
|
+
<br>
|
|
224
|
+
AI-powered tools for instructional designers and eLearning teams
|
|
225
|
+
</p>
|
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
// Short aliases for the Object built-ins that the interop helpers below rely on.
var __create = Object.create;
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
6
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
7
|
+
// __export: defines every key of `all` on `target` as an enumerable getter,
// so the exported value is looked up when the property is read.
var __export = (target, all) => {
|
|
8
|
+
for (var name in all)
|
|
9
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
10
|
+
};
|
|
11
|
+
// __copyProps: re-exposes the own properties of `from` on `to` as getters.
// Keys that `to` already owns, and the key named by `except`, are skipped.
// `desc` is only a scratch variable holding the source property descriptor.
var __copyProps = (to, from, except, desc) => {
|
|
12
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
13
|
+
for (let key of __getOwnPropNames(from))
|
|
14
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
15
|
+
// Enumerable unless the source descriptor exists and says otherwise.
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
16
|
+
}
|
|
17
|
+
return to;
|
|
18
|
+
};
|
|
19
|
+
// __toESM: wraps a require()d module. The wrapper inherits from the module's
// prototype (or is a plain object when `mod` is nullish), may receive a
// `default` property pointing at `mod`, and then gets `mod`'s own properties
// copied onto it via __copyProps.
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
20
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
21
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
22
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
23
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
24
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
25
|
+
mod
|
|
26
|
+
));
|
|
27
|
+
// __toCommonJS: builds the object assigned to module.exports. It starts from
// a fresh object flagged with a non-enumerable `__esModule: true` and copies
// the export getters of `mod` onto it.
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
28
|
+
|
|
29
|
+
// src/index.js
|
|
30
|
+
// Public exports: the plugin factory is exposed both as `default` and as the
// named export `llmSpiderPlugin`. The getters reference the hoisted function
// declaration, so registering them before its definition is fine.
var index_exports = {};
|
|
31
|
+
__export(index_exports, {
|
|
32
|
+
default: () => llmSpiderPlugin,
|
|
33
|
+
llmSpiderPlugin: () => llmSpiderPlugin
|
|
34
|
+
});
|
|
35
|
+
module.exports = __toCommonJS(index_exports);
|
|
36
|
+
var import_vite = require("vite");
|
|
37
|
+
var import_promises = __toESM(require("fs/promises"), 1);
|
|
38
|
+
var import_node_path = __toESM(require("path"), 1);
|
|
39
|
+
var cheerio = __toESM(require("cheerio"), 1);
|
|
40
|
+
var import_turndown = __toESM(require("turndown"), 1);
|
|
41
|
+
var import_turndown_plugin_gfm = require("turndown-plugin-gfm");
|
|
42
|
+
var import_puppeteer = __toESM(require("puppeteer"), 1);
|
|
43
|
+
function llmSpiderPlugin(userOptions = {}) {
|
|
44
|
+
let resolvedConfig;
|
|
45
|
+
/**
 * Recursively layers `source` over `target` and returns a new object.
 * The top-level arguments are never mutated.
 *
 * Only plain objects (object literals or null-prototype objects) are merged
 * key by key. Everything else found in `source` -- arrays, RegExps, functions,
 * class instances such as Date, primitives, null -- replaces the target value
 * wholesale and is kept by reference.
 *
 * @param {object | null | undefined} target - Base values (typically the defaults).
 * @param {object | null | undefined} source - Overrides; nullish means "no overrides".
 * @returns {object} A new merged object.
 */
function deepMerge(target, source) {
  const isPlainObject = (value) => {
    if (value === null || typeof value !== "object") return false;
    const proto = Object.getPrototypeOf(value);
    return proto === Object.prototype || proto === null;
  };

  const base = target == null ? {} : target;
  const result = { ...base };
  if (source == null) return result;

  for (const key of Object.keys(source)) {
    // Assigning to "__proto__" would swap the prototype of the result instead
    // of creating a property, so such keys are never copied.
    if (key === "__proto__") continue;

    const value = source[key];
    if (isPlainObject(value)) {
      // Merge into the existing branch only when it is a plain object too;
      // an array, string or null on the target side is replaced, not spread.
      const branch = isPlainObject(base[key]) ? base[key] : {};
      result[key] = deepMerge(branch, value);
    } else {
      result[key] = value;
    }
  }
  return result;
}
|
|
56
|
+
// Crawl mode settings; crawling stays off unless the user opts in.
const crawlDefaults = {
  enabled: false,
  seeds: ["/"],
  maxDepth: 2,
  maxPages: 50,
  concurrency: 3,
  stripQuery: true
};

// Page rendering settings handed to the headless browser.
const renderDefaults = {
  // "networkidle2" tolerates a couple of lingering requests, which suits SPAs
  // better than "networkidle0".
  waitUntil: "networkidle2",
  timeoutMs: 30000,
  // Optional CSS selector to wait for, e.g. "main" or "#app main".
  waitForSelector: null,
  postLoadDelayMs: 0,
  // Requests whose URL matches one of these patterns are aborted.
  blockRequests: [
    /google-analytics\.com/i,
    /googletagmanager\.com/i,
    /segment\.com/i,
    /hotjar\.com/i
  ],
  launchOptions: {
    headless: "new"
    // CI containers may additionally need:
    // args: ["--no-sandbox", "--disable-setuid-sandbox"],
  },
  /**
   * Hook awaited before navigating to a route. No-op by default.
   * @param {import('puppeteer').Page} _page
   * @param {{ route: string }} _ctx
   */
  beforeGoto: async (_page, _ctx) => {
  },
  /**
   * Hook awaited right before the page HTML is read. No-op by default.
   * @param {import('puppeteer').Page} _page
   * @param {{ route: string }} _ctx
   */
  beforeExtract: async (_page, _ctx) => {
  }
};

// Which part of the rendered DOM becomes Markdown, and what is stripped first.
const extractDefaults = {
  mainSelector: ["main", "#main-content", "[data-main]"],
  removeSelectors: [
    "script",
    "style",
    "noscript",
    "nav",
    "header",
    "footer",
    "svg",
    "iframe",
    "[role='alert']",
    ".cookie",
    ".cookie-banner",
    ".modal"
  ]
};

// Markdown conversion settings; `turndown` is passed to the Turndown constructor.
const markdownDefaults = {
  addFrontmatter: true,
  turndown: {
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    emDelimiter: "_"
  }
};

// Output layout and the header of the generated index file.
const outputDefaults = {
  // "sibling": /pricing -> pricing.md ; /docs/ -> docs/index.html.md ; / -> index.html.md
  mode: "sibling",
  // Only used when mode === "subdir".
  subdir: "ai",
  llmsTxtFileName: "llms.txt",
  // null means "derive a title at build time".
  llmsTitle: null,
  llmsSummary: "LLM-friendly index of important pages and their Markdown equivalents.",
  sort: true
};

// Built-in option values; user options are merged over these.
const defaults = {
  enabled: true,
  // Recommended discovery mode: an explicit list of routes.
  routes: (
    /** @type {RouteDef[] | undefined} */
    undefined
  ),
  crawl: crawlDefaults,
  exclude: ["/login", "/admin", "/account"],
  render: renderDefaults,
  extract: extractDefaults,
  markdown: markdownDefaults,
  output: outputDefaults,
  // "silent" | "info" | "debug"
  logLevel: "info"
};
|
|
143
|
+
// Effective configuration: the user's options layered over the defaults.
const options = deepMerge(defaults, userOptions);

// Console logger gated by options.logLevel, which is read on every call:
//   info  -> printed at "info" and "debug"
//   debug -> printed at "debug" only
//   warn  -> printed unless the level is "silent"
const log = {
  info(...args) {
    const level = options.logLevel;
    if (level === "info" || level === "debug") return console.log(...args);
    return undefined;
  },
  debug(...args) {
    if (options.logLevel === "debug") return console.log(...args);
    return undefined;
  },
  warn(...args) {
    if (options.logLevel !== "silent") return console.warn(...args);
    return undefined;
  }
};
|
|
149
|
+
/**
 * Tells whether `route` matches any pattern in `options.exclude`.
 *
 * RegExp patterns are tested against the route; every other pattern is
 * treated as a substring. Nullish and empty-string patterns are ignored,
 * because an empty string is a substring of every route and would exclude
 * the whole site.
 *
 * @param {string} route - Route to check.
 * @returns {boolean} True when the route must be skipped.
 */
function isExcluded(route) {
  const patterns = options.exclude || [];
  return patterns.some((pattern) => {
    if (pattern instanceof RegExp) {
      // Regexes with the g or y flag remember `lastIndex` between test()
      // calls; reset it so every route is matched from the start.
      pattern.lastIndex = 0;
      return pattern.test(route);
    }
    if (pattern == null || pattern === "") return false;
    return route.includes(pattern);
  });
}
|
|
155
|
+
/**
 * Turns a route or an href found on a page into a site-local path, or returns
 * null when the input does not point at a local page.
 *
 * Rejected (null): non-strings, blank input, anything with a URL scheme
 * (http:, https:, mailto:, tel:, javascript:, data:, ...) in any letter case,
 * protocol-relative URLs ("//host/path"), and inputs that are empty once the
 * fragment (and, when stripped, the query) is removed.
 *
 * Otherwise the result always starts with "/", has its fragment removed,
 * duplicate slashes collapsed, and "." / ".." segments resolved without ever
 * climbing above the root, so a link such as "../../x" cannot produce a path
 * outside the site. A trailing slash is preserved. Relative inputs are
 * resolved against the site root because the current page is not known here.
 *
 * @param {string} input - Route or raw href.
 * @param {{ stripQuery?: boolean }} [opts] - stripQuery (default true) drops the query string.
 * @returns {string | null} Normalized path, or null when the input is not a local route.
 */
function normalizeRoute(input, { stripQuery = true } = {}) {
  if (typeof input !== "string") return null;
  let s = input.trim();
  if (!s) return null;

  // Any scheme, or a protocol-relative URL, points away from the local site.
  if (/^[a-z][a-z0-9+.-]*:/i.test(s) || s.startsWith("//")) return null;

  const hashIdx = s.indexOf("#");
  if (hashIdx >= 0) s = s.slice(0, hashIdx);

  // Split the query off before touching the path so that slashes or dots
  // inside the query are never rewritten.
  let query = "";
  const qIdx = s.indexOf("?");
  if (qIdx >= 0) {
    if (!stripQuery) query = s.slice(qIdx);
    s = s.slice(0, qIdx);
  }
  if (!s && !query) return null;

  if (!s.startsWith("/")) s = "/" + s;

  const rawSegments = s.split("/");
  const lastRaw = rawSegments[rawSegments.length - 1];
  const segments = [];
  for (const seg of rawSegments) {
    if (seg === "" || seg === ".") continue;
    if (seg === "..") {
      // pop() on an empty list is a no-op, which clamps the path at the root.
      segments.pop();
      continue;
    }
    segments.push(seg);
  }

  // "", "." and ".." as the final segment all denote a directory.
  const isDirectory = lastRaw === "" || lastRaw === "." || lastRaw === "..";
  let pathname = "/" + segments.join("/");
  if (isDirectory && segments.length) pathname += "/";
  return pathname + query;
}
|
|
176
|
+
/**
 * Maps a normalized route to the relative path of its Markdown file.
 * Routes ending in "/" (the root included) become ".../index.html.md";
 * every other route gets ".md" appended.
 *
 * @param {string} route - Route starting with "/".
 * @returns {string} Path without a leading slash, e.g. "docs/api.md".
 */
function routeToMdWebPath(route) {
  // Dropping the leading slash turns "/" into "", so the root route needs no special case.
  const withoutLeadingSlash = route.slice(1);
  const suffix = route.endsWith("/") ? "index.html.md" : ".md";
  return withoutLeadingSlash + suffix;
}
|
|
181
|
+
/**
 * Builds the filesystem path of the Markdown file for `route`.
 * In "subdir" output mode the file is placed under `options.output.subdir`
 * inside `distDir`; in any other mode it sits directly under `distDir`.
 *
 * @param {string} distDir - Build output directory.
 * @param {string} route - Normalized route starting with "/".
 * @returns {string} Path joined with the platform separator.
 */
function routeToMdFsPath(distDir, route) {
  const parts = [distDir];
  if (options.output.mode === "subdir") {
    parts.push(options.output.subdir);
  }
  parts.push(routeToMdWebPath(route));
  return import_node_path.default.join(...parts);
}
|
|
188
|
+
/**
 * Converts a relative Markdown path into a link target for the index file by
 * replacing every backslash with a forward slash.
 *
 * @param {string} relMdPath - Relative path, possibly with Windows separators.
 * @returns {string} The same path using "/" only.
 */
function makeLlmsLink(relMdPath) {
  const pieces = relMdPath.split("\\");
  return pieces.join("/");
}
|
|
191
|
+
/**
 * Promise wrapper around `server.close(callback)`.
 *
 * @param {{ close(cb: (err?: Error) => void): unknown }} server - Server to close.
 * @returns {Promise<void>} Resolves when the callback reports success and
 *   rejects with the error the callback receives otherwise.
 */
async function safeCloseHttpServer(server) {
  const closed = new Promise((resolve, reject) => {
    const onClosed = (err) => {
      if (err) {
        reject(err);
        return;
      }
      resolve();
    };
    server.close(onClosed);
  });
  await closed;
}
|
|
196
|
+
return {
|
|
197
|
+
name: "vite-plugin-llm-spider",
|
|
198
|
+
apply: "build",
|
|
199
|
+
configResolved(rc) {
|
|
200
|
+
resolvedConfig = rc;
|
|
201
|
+
},
|
|
202
|
+
async closeBundle() {
|
|
203
|
+
var _a, _b, _c, _d, _e, _f;
|
|
204
|
+
if (!options.enabled) return;
|
|
205
|
+
if (!resolvedConfig)
|
|
206
|
+
throw new Error("LLM Spider: missing resolved Vite config");
|
|
207
|
+
const distDir = resolvedConfig.build.outDir || "dist";
|
|
208
|
+
const basePath = (resolvedConfig.base || "/").replace(/\\/g, "/");
|
|
209
|
+
let routeDefs = [];
|
|
210
|
+
if (Array.isArray(options.routes) && options.routes.length) {
|
|
211
|
+
routeDefs = options.routes.map((r) => ({
|
|
212
|
+
path: normalizeRoute(r.path, { stripQuery: true }) || "/",
|
|
213
|
+
title: r.title,
|
|
214
|
+
section: r.section || "Pages",
|
|
215
|
+
optional: !!r.optional,
|
|
216
|
+
notes: r.notes
|
|
217
|
+
}));
|
|
218
|
+
} else if ((_a = options.crawl) == null ? void 0 : _a.enabled) {
|
|
219
|
+
routeDefs = [];
|
|
220
|
+
} else {
|
|
221
|
+
routeDefs = [{ path: "/", section: "Pages" }];
|
|
222
|
+
}
|
|
223
|
+
log.info("\nLLM Spider: generating markdown + llms.txt");
|
|
224
|
+
log.debug("distDir:", distDir, "base:", basePath);
|
|
225
|
+
const previewServer = await (0, import_vite.preview)({
|
|
226
|
+
root: resolvedConfig.root,
|
|
227
|
+
base: resolvedConfig.base,
|
|
228
|
+
build: { outDir: distDir },
|
|
229
|
+
preview: { port: 0, open: false, host: "127.0.0.1" },
|
|
230
|
+
configFile: false,
|
|
231
|
+
plugins: [],
|
|
232
|
+
// avoid loading user plugins again
|
|
233
|
+
logLevel: "silent"
|
|
234
|
+
});
|
|
235
|
+
await new Promise((resolve, reject) => {
|
|
236
|
+
const server = previewServer.httpServer;
|
|
237
|
+
if (server.listening) {
|
|
238
|
+
resolve();
|
|
239
|
+
} else {
|
|
240
|
+
server.once("listening", resolve);
|
|
241
|
+
server.once("error", reject);
|
|
242
|
+
setTimeout(() => reject(new Error("Preview server failed to start")), 5e3);
|
|
243
|
+
}
|
|
244
|
+
});
|
|
245
|
+
const addr = previewServer.httpServer.address();
|
|
246
|
+
if (!addr || typeof addr === "string") {
|
|
247
|
+
await safeCloseHttpServer(previewServer.httpServer);
|
|
248
|
+
throw new Error("LLM Spider: could not determine preview server port");
|
|
249
|
+
}
|
|
250
|
+
const normalizedBase = basePath.endsWith("/") ? basePath : basePath + "/";
|
|
251
|
+
const baseUrl = `http://127.0.0.1:${addr.port}${normalizedBase}`;
|
|
252
|
+
log.debug("Preview server at:", baseUrl);
|
|
253
|
+
const browser = await import_puppeteer.default.launch(options.render.launchOptions);
|
|
254
|
+
const turndown = new import_turndown.default(options.markdown.turndown);
|
|
255
|
+
turndown.use(import_turndown_plugin_gfm.gfm);
|
|
256
|
+
const visited = /* @__PURE__ */ new Set();
|
|
257
|
+
const captured = [];
|
|
258
|
+
const queue = [];
|
|
259
|
+
if ((_b = options.crawl) == null ? void 0 : _b.enabled) {
|
|
260
|
+
for (const seed of options.crawl.seeds || ["/"]) {
|
|
261
|
+
const nr = normalizeRoute(seed, {
|
|
262
|
+
stripQuery: options.crawl.stripQuery
|
|
263
|
+
});
|
|
264
|
+
if (nr) queue.push({ route: nr, depth: 0 });
|
|
265
|
+
}
|
|
266
|
+
} else {
|
|
267
|
+
for (const rd of routeDefs) queue.push({ route: rd.path, depth: 0 });
|
|
268
|
+
}
|
|
269
|
+
const maxDepth = ((_c = options.crawl) == null ? void 0 : _c.enabled) ? options.crawl.maxDepth : 0;
|
|
270
|
+
const maxPages = ((_d = options.crawl) == null ? void 0 : _d.enabled) ? options.crawl.maxPages : queue.length;
|
|
271
|
+
const concurrency = ((_e = options.crawl) == null ? void 0 : _e.enabled) ? options.crawl.concurrency : 3;
|
|
272
|
+
async function captureOne(route) {
|
|
273
|
+
var _a2, _b2, _c2;
|
|
274
|
+
if (visited.has(route)) return;
|
|
275
|
+
if (isExcluded(route)) return;
|
|
276
|
+
if (captured.length >= maxPages) return;
|
|
277
|
+
visited.add(route);
|
|
278
|
+
const page = await browser.newPage();
|
|
279
|
+
if ((_a2 = options.render.blockRequests) == null ? void 0 : _a2.length) {
|
|
280
|
+
await page.setRequestInterception(true);
|
|
281
|
+
page.on("request", (req) => {
|
|
282
|
+
const url = req.url();
|
|
283
|
+
const blocked = options.render.blockRequests.some(
|
|
284
|
+
(p) => p instanceof RegExp ? p.test(url) : url.includes(p)
|
|
285
|
+
);
|
|
286
|
+
if (blocked) req.abort();
|
|
287
|
+
else req.continue();
|
|
288
|
+
});
|
|
289
|
+
}
|
|
290
|
+
try {
|
|
291
|
+
const pageUrl = route === "/" ? baseUrl : baseUrl + route.replace(/^\//, "");
|
|
292
|
+
await options.render.beforeGoto(page, { route });
|
|
293
|
+
await page.goto(pageUrl, {
|
|
294
|
+
waitUntil: options.render.waitUntil,
|
|
295
|
+
timeout: options.render.timeoutMs
|
|
296
|
+
});
|
|
297
|
+
if (options.render.waitForSelector) {
|
|
298
|
+
await page.waitForSelector(options.render.waitForSelector, {
|
|
299
|
+
timeout: options.render.timeoutMs
|
|
300
|
+
});
|
|
301
|
+
}
|
|
302
|
+
if (options.render.postLoadDelayMs > 0) {
|
|
303
|
+
await new Promise(
|
|
304
|
+
(r) => setTimeout(r, options.render.postLoadDelayMs)
|
|
305
|
+
);
|
|
306
|
+
}
|
|
307
|
+
await options.render.beforeExtract(page, { route });
|
|
308
|
+
const html = await page.content();
|
|
309
|
+
const $ = cheerio.load(html);
|
|
310
|
+
let harvestedHrefs = [];
|
|
311
|
+
if ((_b2 = options.crawl) == null ? void 0 : _b2.enabled) {
|
|
312
|
+
harvestedHrefs = $("a[href]").map((_, a) => $(a).attr("href")).get();
|
|
313
|
+
log.debug(` Found ${harvestedHrefs.length} links on ${route}:`, harvestedHrefs.slice(0, 15));
|
|
314
|
+
}
|
|
315
|
+
for (const sel of options.extract.removeSelectors || [])
|
|
316
|
+
$(sel).remove();
|
|
317
|
+
const mainSelectors = Array.isArray(options.extract.mainSelector) ? options.extract.mainSelector : [options.extract.mainSelector];
|
|
318
|
+
let mainHtml = null;
|
|
319
|
+
for (const sel of mainSelectors) {
|
|
320
|
+
if (!sel) continue;
|
|
321
|
+
const node = $(sel).first();
|
|
322
|
+
if (node && node.length) {
|
|
323
|
+
mainHtml = node.html();
|
|
324
|
+
break;
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
if (!mainHtml) {
|
|
328
|
+
const main = $("main").first();
|
|
329
|
+
mainHtml = main.length ? main.html() : $("body").html();
|
|
330
|
+
}
|
|
331
|
+
const title = ($("title").text() || "").trim() || route;
|
|
332
|
+
const markdownBody = turndown.turndown(mainHtml || "");
|
|
333
|
+
const mdRelPath = options.output.mode === "subdir" ? import_node_path.default.posix.join(options.output.subdir, routeToMdWebPath(route)) : routeToMdWebPath(route);
|
|
334
|
+
const fsPath = routeToMdFsPath(distDir, route);
|
|
335
|
+
await import_promises.default.mkdir(import_node_path.default.dirname(fsPath), { recursive: true });
|
|
336
|
+
const frontmatter = options.markdown.addFrontmatter ? `---
|
|
337
|
+
source: ${route}
|
|
338
|
+
title: ${title}
|
|
339
|
+
generated_at: ${(/* @__PURE__ */ new Date()).toISOString()}
|
|
340
|
+
---
|
|
341
|
+
|
|
342
|
+
` : "";
|
|
343
|
+
await import_promises.default.writeFile(fsPath, frontmatter + markdownBody, "utf8");
|
|
344
|
+
const meta = routeDefs.find((r) => r.path === route);
|
|
345
|
+
captured.push({
|
|
346
|
+
route,
|
|
347
|
+
title: (meta == null ? void 0 : meta.title) || title,
|
|
348
|
+
section: (meta == null ? void 0 : meta.section) || "Pages",
|
|
349
|
+
optional: !!(meta == null ? void 0 : meta.optional),
|
|
350
|
+
notes: meta == null ? void 0 : meta.notes,
|
|
351
|
+
mdRelPath
|
|
352
|
+
});
|
|
353
|
+
log.info(` \u2705 ${route} -> ${mdRelPath}`);
|
|
354
|
+
if ((_c2 = options.crawl) == null ? void 0 : _c2.enabled) {
|
|
355
|
+
for (const href of harvestedHrefs) {
|
|
356
|
+
const n = normalizeRoute(href, {
|
|
357
|
+
stripQuery: options.crawl.stripQuery
|
|
358
|
+
});
|
|
359
|
+
if (!n) continue;
|
|
360
|
+
let baseRelative = n;
|
|
361
|
+
if (normalizedBase !== "/" && baseRelative.startsWith(normalizedBase)) {
|
|
362
|
+
baseRelative = "/" + baseRelative.slice(normalizedBase.length);
|
|
363
|
+
baseRelative = baseRelative === "//" ? "/" : baseRelative.replace(/\/{2,}/g, "/");
|
|
364
|
+
}
|
|
365
|
+
if (!visited.has(baseRelative) && !isExcluded(baseRelative)) {
|
|
366
|
+
queue.push({ route: baseRelative, depth: -1 });
|
|
367
|
+
}
|
|
368
|
+
}
|
|
369
|
+
}
|
|
370
|
+
} catch (err) {
|
|
371
|
+
log.warn(` \u26A0\uFE0F failed ${route}: ${(err == null ? void 0 : err.message) || err}`);
|
|
372
|
+
} finally {
|
|
373
|
+
await page.close();
|
|
374
|
+
}
|
|
375
|
+
}
|
|
376
|
+
try {
|
|
377
|
+
while (queue.length && captured.length < maxPages) {
|
|
378
|
+
const batch = queue.splice(0, concurrency).map((item) => {
|
|
379
|
+
const depth = item.depth >= 0 ? item.depth : 1;
|
|
380
|
+
return { route: item.route, depth };
|
|
381
|
+
});
|
|
382
|
+
await Promise.all(
|
|
383
|
+
batch.map(async ({ route, depth }) => {
|
|
384
|
+
var _a2, _b2;
|
|
385
|
+
if (((_a2 = options.crawl) == null ? void 0 : _a2.enabled) && depth > maxDepth) return;
|
|
386
|
+
await captureOne(route);
|
|
387
|
+
if ((_b2 = options.crawl) == null ? void 0 : _b2.enabled) {
|
|
388
|
+
for (let i = 0; i < queue.length; i++) {
|
|
389
|
+
if (queue[i].depth === -1) queue[i].depth = depth + 1;
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
})
|
|
393
|
+
);
|
|
394
|
+
}
|
|
395
|
+
const llmsTitle = options.output.llmsTitle || ((_f = resolvedConfig == null ? void 0 : resolvedConfig.env) == null ? void 0 : _f.mode) || "Site";
|
|
396
|
+
const items = options.output.sort ? [...captured].sort((a, b) => a.route.localeCompare(b.route)) : captured;
|
|
397
|
+
const bySection = /* @__PURE__ */ new Map();
|
|
398
|
+
const optionalItems = [];
|
|
399
|
+
for (const item of items) {
|
|
400
|
+
if (item.optional) optionalItems.push(item);
|
|
401
|
+
else {
|
|
402
|
+
const s = item.section || "Pages";
|
|
403
|
+
bySection.set(s, [...bySection.get(s) || [], item]);
|
|
404
|
+
}
|
|
405
|
+
}
|
|
406
|
+
let llms = `# ${llmsTitle}
|
|
407
|
+
|
|
408
|
+
> ${options.output.llmsSummary}
|
|
409
|
+
|
|
410
|
+
`;
|
|
411
|
+
for (const [section, sectionItems] of bySection.entries()) {
|
|
412
|
+
llms += `## ${section}
|
|
413
|
+
|
|
414
|
+
`;
|
|
415
|
+
for (const it of sectionItems) {
|
|
416
|
+
const link = makeLlmsLink(it.mdRelPath);
|
|
417
|
+
const label = it.title || it.route;
|
|
418
|
+
const notes = it.notes ? `: ${it.notes}` : "";
|
|
419
|
+
llms += `- [${label}](${link})${notes}
|
|
420
|
+
`;
|
|
421
|
+
}
|
|
422
|
+
llms += `
|
|
423
|
+
`;
|
|
424
|
+
}
|
|
425
|
+
if (optionalItems.length) {
|
|
426
|
+
llms += `## Optional
|
|
427
|
+
|
|
428
|
+
`;
|
|
429
|
+
for (const it of optionalItems) {
|
|
430
|
+
const link = makeLlmsLink(it.mdRelPath);
|
|
431
|
+
const label = it.title || it.route;
|
|
432
|
+
const notes = it.notes ? `: ${it.notes}` : "";
|
|
433
|
+
llms += `- [${label}](${link})${notes}
|
|
434
|
+
`;
|
|
435
|
+
}
|
|
436
|
+
llms += `
|
|
437
|
+
`;
|
|
438
|
+
}
|
|
439
|
+
const llmsPath = import_node_path.default.join(distDir, options.output.llmsTxtFileName);
|
|
440
|
+
await import_promises.default.writeFile(llmsPath, llms, "utf8");
|
|
441
|
+
log.info(
|
|
442
|
+
`
|
|
443
|
+
LLM Spider: wrote ${captured.length} markdown pages + ${options.output.llmsTxtFileName}
|
|
444
|
+
`
|
|
445
|
+
);
|
|
446
|
+
} finally {
|
|
447
|
+
await browser.close();
|
|
448
|
+
await safeCloseHttpServer(previewServer.httpServer);
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
};
|
|
452
|
+
}
|
|
453
|
+
// Annotate the CommonJS export names for ESM import in node:
// The `0 &&` guard short-circuits, so this assignment is never evaluated at
// runtime; presumably it exists only so that tooling reading the file
// statically can see the export name -- TODO confirm against the bundler docs.
|
|
454
|
+
0 && (module.exports = {
|
|
455
|
+
llmSpiderPlugin
|
|
456
|
+
});
|
|
457
|
+
//# sourceMappingURL=index.cjs.map
|