agentsite-kit 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +226 -0
- package/README.zh-CN.md +226 -0
- package/bin/agentsite.js +2 -0
- package/dist/chunk-AYLZUDQP.js +281 -0
- package/dist/chunk-AYLZUDQP.js.map +1 -0
- package/dist/chunk-YWR5EH3F.js +339 -0
- package/dist/chunk-YWR5EH3F.js.map +1 -0
- package/dist/chunk-YWUDTSOR.js +360 -0
- package/dist/chunk-YWUDTSOR.js.map +1 -0
- package/dist/generate-V5JMMT4J.js +10 -0
- package/dist/generate-V5JMMT4J.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +1238 -0
- package/dist/index.js.map +1 -0
- package/dist/scan-QWNB65DB.js +10 -0
- package/dist/scan-QWNB65DB.js.map +1 -0
- package/package.json +66 -0
- package/templates/agentsite.config.yaml +32 -0
- package/templates/presets/api-docs.yaml +29 -0
- package/templates/presets/blog.yaml +26 -0
- package/templates/presets/community.yaml +31 -0
- package/templates/presets/docs-site.yaml +26 -0
- package/templates/presets/ecommerce.yaml +32 -0
- package/templates/presets/knowledge-base.yaml +28 -0
- package/templates/presets/portfolio.yaml +27 -0
- package/templates/presets/saas.yaml +30 -0
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
import {
|
|
2
|
+
extractContent,
|
|
3
|
+
getLlmConfig,
|
|
4
|
+
llmExtractTags,
|
|
5
|
+
loadConfig,
|
|
6
|
+
loadPlugins,
|
|
7
|
+
log,
|
|
8
|
+
runHook,
|
|
9
|
+
sha256,
|
|
10
|
+
spinner
|
|
11
|
+
} from "./chunk-YWUDTSOR.js";
|
|
12
|
+
|
|
13
|
+
// src/commands/generate.ts
|
|
14
|
+
import { readFileSync as readFileSync2, writeFileSync, existsSync as existsSync2, mkdirSync } from "fs";
|
|
15
|
+
|
|
16
|
+
// src/generator/llms-txt.ts
|
|
17
|
+
/**
 * Render the llms.txt overview of a scanned site as Markdown text.
 *
 * Output layout: `# <siteName>`, `> <siteDescription>`, a blank line, then one
 * `## <label>` section per page type. Each page is listed as
 * `- [title or url](url)` followed by `: summary` when a summary exists, and
 * every section ends with a blank line.
 *
 * Known page types come first in a fixed order. Any other type present in the
 * scan result is appended after them (first-seen order) under its raw type
 * name, and the "Other" section for `unknown` pages is always last. Pages
 * without a type are listed under "Other". No scanned page is dropped.
 *
 * @param {{ pages: Array<object> }} scanResult - Scan result; each page is read for `type`, `title`, `url`, `summary`.
 * @param {string} siteName - Used for the top-level heading.
 * @param {string} siteDescription - Used for the blockquote line.
 * @returns {string} The llms.txt content.
 */
function generateLlmsTxt(scanResult, siteName, siteDescription) {
  const FALLBACK_TYPE = "unknown";
  const knownOrder = ["docs", "faq", "product", "blog", "pricing", "changelog", "about", "contact", "homepage"];
  const sectionLabels = {
    docs: "Docs",
    faq: "FAQ",
    product: "Products",
    blog: "Articles",
    pricing: "Pricing",
    changelog: "Changelog",
    about: "About",
    contact: "Contact",
    homepage: "Home",
    unknown: "Other"
  };

  // Bucket pages by type, preserving scan order inside each bucket.
  const grouped = new Map();
  for (const page of scanResult.pages) {
    const type = page.type ?? FALLBACK_TYPE;
    const bucket = grouped.get(type) ?? [];
    bucket.push(page);
    grouped.set(type, bucket);
  }

  // Types that have no fixed slot are still emitted, between the known
  // sections and the trailing "Other" section.
  const extraTypes = [...grouped.keys()].filter(
    (type) => type !== FALLBACK_TYPE && !knownOrder.includes(type)
  );
  const emitOrder = [...knownOrder, ...extraTypes, FALLBACK_TYPE];

  const lines = [`# ${siteName}`, `> ${siteDescription}`, ""];
  for (const type of emitOrder) {
    const pages = grouped.get(type);
    if (!pages?.length) continue;
    lines.push(`## ${sectionLabels[type] ?? type}`);
    for (const page of pages) {
      const summary = page.summary ? `: ${page.summary}` : "";
      lines.push(`- [${page.title || page.url}](${page.url})${summary}`);
    }
    lines.push("");
  }
  return lines.join("\n");
}
|
|
52
|
+
|
|
53
|
+
// src/generator/agent-sitemap.ts
|
|
54
|
+
/**
 * Build the agent-sitemap payload: a trimmed-down entry per scanned page plus
 * a small header describing the site and when the payload was generated.
 *
 * @param {{ siteUrl: string, pages: Array<object> }} scanResult - Scan result to summarise.
 * @returns {object} `{ version, generatedAt, siteUrl, totalPages, pages }`.
 */
function generateAgentSitemap(scanResult) {
  // Copy only the fields exposed in the sitemap; anything else on the page is left out.
  const toEntry = ({ url, title, type, summary, lastModified, wordCount }) => ({
    url,
    title,
    type,
    summary,
    lastModified,
    wordCount
  });
  const pages = scanResult.pages.map(toEntry);
  const generatedAt = new Date().toISOString();
  return {
    version: "1.0",
    generatedAt,
    siteUrl: scanResult.siteUrl,
    totalPages: pages.length,
    pages
  };
}
|
|
71
|
+
|
|
72
|
+
// src/generator/agent-index.ts
|
|
73
|
+
/**
 * Build the agent-index payload: site identity, scan statistics (including a
 * per-type page count), the list of API endpoint templates and a description
 * of each generated file.
 *
 * @param {{ siteUrl: string, totalPages: number, scannedAt: string, pages: Array<object> }} scanResult
 * @param {string} siteName
 * @param {string} siteDescription
 * @returns {object} `{ version, site, stats, endpoints, files }`.
 */
function generateAgentIndex(scanResult, siteName, siteDescription) {
  // Tally how many pages were classified under each type.
  const pageTypes = scanResult.pages.reduce((counts, { type }) => {
    counts[type] = (counts[type] ?? 0) + 1;
    return counts;
  }, {});

  const site = {
    name: siteName,
    description: siteDescription,
    url: scanResult.siteUrl
  };
  const stats = {
    totalPages: scanResult.totalPages,
    scannedAt: scanResult.scannedAt,
    pageTypes
  };

  return {
    version: "1.0",
    site,
    stats,
    endpoints: {
      search: "/api/search?q={query}",
      pages: "/api/pages/{id}",
      faq: "/api/faq",
      products: "/api/products",
      docs: "/api/docs",
      articles: "/api/articles",
      pricing: "/api/pricing",
      changelog: "/api/changelog",
      stats: "/api/stats",
      config: "/api/config"
    },
    files: {
      "llms.txt": "LLM-friendly site overview",
      "agent-sitemap.json": "Machine-readable sitemap with metadata",
      "data/docs.json": "Documentation entries",
      "data/faq.json": "FAQ entries",
      "data/articles.json": "Blog/article entries",
      "data/products.json": "Product entries",
      "data/pricing.json": "Pricing/plan entries",
      "data/changelog.json": "Changelog/release entries"
    }
  };
}
|
|
115
|
+
|
|
116
|
+
// src/generator/structured-export.ts
|
|
117
|
+
import { readFileSync, existsSync } from "fs";
|
|
118
|
+
/**
 * Read the cached HTML for a page from `<outDir>/cache/pages/<sha256(url)>.html`.
 *
 * @param {string} outDir - Output directory that contains the page cache.
 * @param {string} url - Page URL; its hash is the cache file name.
 * @returns {string | null} The cached HTML, or `null` when no cache file exists.
 */
function loadCachedHtml(outDir, url) {
  const cacheFile = `${outDir}/cache/pages/${sha256(url)}.html`;
  return existsSync(cacheFile) ? readFileSync(cacheFile, "utf-8") : null;
}
|
|
123
|
+
/**
 * Best-effort tag extraction: asks the LLM for tags and falls back to an
 * empty list when no LLM is configured or the extraction call fails.
 *
 * @param {object | null} llm - LLM configuration; a falsy value disables extraction.
 * @param {string} title - Page title passed to the extractor.
 * @param {string} bodyText - Page text passed to the extractor.
 * @returns {Promise<string[]>} Extracted tags, or `[]`.
 */
async function tryExtractTags(llm, title, bodyText) {
  let tags = [];
  if (llm) {
    try {
      tags = await llmExtractTags(llm, title, bodyText);
    } catch {
      // Tagging is optional; a failed call must not abort the export.
      tags = [];
    }
  }
  return tags;
}
|
|
131
|
+
/**
 * Convert scanned pages into typed export records (docs, FAQ, products,
 * articles, pricing, changelog).
 *
 * For every page the cached HTML is loaded and run through `extractContent`;
 * pages without cached HTML are skipped. Pages whose type is not one of the
 * six handled below produce no record. Docs and blog pages additionally get
 * tags via `tryExtractTags`, awaited one page at a time.
 *
 * @param {{ pages: Array<object> }} scanResult - Scan result to export.
 * @param {string} outDir - Output directory holding the page cache.
 * @param {object} [config] - Site configuration, passed to `getLlmConfig`.
 * @returns {Promise<{ docs: object[], faq: object[], products: object[], articles: object[], pricing: object[], changelog: object[] }>}
 */
async function generateStructuredExports(scanResult, outDir, config) {
  // First dollar amount in the text. Thousands separators are part of the
  // match so "$1,299.00" is not cut down to "$1".
  const PRICE_PATTERN = /\$\d+(?:,\d{3})*(?:\.\d{2})?/;
  const MAX_FEATURES = 10;

  const docs = [];
  const faq = [];
  const products = [];
  const articles = [];
  const pricing = [];
  const changelog = [];
  // Shared fallback timestamp for pages that carry no lastModified value.
  const now = new Date().toISOString();
  const llm = getLlmConfig(config);

  for (const page of scanResult.pages) {
    const html = loadCachedHtml(outDir, page.url);
    if (!html) continue;
    const content = extractContent(html, page.url);
    const updatedAt = page.lastModified ?? now;
    // Tolerate scan entries that were stored without a headings array.
    const firstHeading = page.headings?.[0];

    switch (page.type) {
      case "docs": {
        const tags = await tryExtractTags(llm, page.title, content.bodyText);
        docs.push({
          title: page.title,
          url: page.url,
          section: firstHeading ?? "",
          summary: page.summary,
          tags,
          updated_at: updatedAt
        });
        break;
      }
      case "faq":
        if (content.faqItems.length > 0) {
          for (const item of content.faqItems) {
            faq.push({
              question: item.question,
              answer: item.answer,
              category: firstHeading ?? "General",
              url: page.url,
              updated_at: updatedAt
            });
          }
        } else {
          // No question/answer pairs were extracted: fall back to a single
          // entry built from the page title and summary.
          faq.push({
            question: page.title,
            answer: page.summary,
            category: "General",
            url: page.url,
            updated_at: updatedAt
          });
        }
        break;
      case "product":
        products.push({
          product_name: page.title,
          description: page.summary,
          features: content.features.slice(0, MAX_FEATURES),
          pricing: "",
          url: page.url,
          updated_at: updatedAt
        });
        break;
      case "blog": {
        const tags = await tryExtractTags(llm, page.title, content.bodyText);
        articles.push({
          title: page.title,
          summary: page.summary,
          published_at: updatedAt,
          updated_at: updatedAt,
          tags,
          url: page.url
        });
        break;
      }
      case "pricing":
        pricing.push({
          plan_name: page.title,
          price: content.bodyText.match(PRICE_PATTERN)?.[0] ?? "",
          features: content.features.slice(0, MAX_FEATURES),
          url: page.url,
          updated_at: updatedAt
        });
        break;
      case "changelog":
        changelog.push({
          version: page.version ?? page.title,
          date: page.publishedAt ?? updatedAt,
          // Every extracted heading except the first becomes a change line.
          changes: content.headings.slice(1),
          url: page.url
        });
        break;
    }
  }
  return { docs, faq, products, articles, pricing, changelog };
}
|
|
221
|
+
|
|
222
|
+
// src/commands/generate.ts
|
|
223
|
+
/**
 * Run the `generate` step: read `<outDir>/scan-result.json`, write llms.txt,
 * agent-sitemap.json, agent-index.json and the six `data/*.json` exports, then
 * log a summary of what was written.
 *
 * The `beforeGenerate` plugin hook receives the scan result before any file is
 * produced; the `afterGenerate` hook receives the output directory once all
 * files are written.
 *
 * @param {object} [configOverride] - Configuration to use instead of `loadConfig()`.
 * @returns {Promise<void>}
 * @throws {Error} "No scan result found" when scan-result.json does not exist.
 */
async function runGenerate(configOverride) {
  const config = configOverride ?? loadConfig();
  const outDir = config.output.dir;
  const scanPath = `${outDir}/scan-result.json`;
  if (!existsSync2(scanPath)) {
    log.error("No scan result found. Run `agentsite scan` first.");
    throw new Error("No scan result found");
  }
  const scanResult = JSON.parse(readFileSync2(scanPath, "utf-8"));
  mkdirSync(`${outDir}/data`, { recursive: true });

  const plugins = await loadPlugins(config.plugins ?? []);
  await runHook(plugins, "beforeGenerate", scanResult);

  const sp = spinner("Generating files...");
  let count = 0;
  // Write one pretty-printed JSON file under outDir and count it.
  const writeJson = (relativePath, value) => {
    writeFileSync(`${outDir}/${relativePath}`, JSON.stringify(value, null, 2), "utf-8");
    count++;
  };

  const llmsTxt = generateLlmsTxt(scanResult, config.site.name, config.site.description);
  writeFileSync(`${outDir}/llms.txt`, llmsTxt, "utf-8");
  count++;

  const sitemap = generateAgentSitemap(scanResult);
  writeJson("agent-sitemap.json", sitemap);

  const index = generateAgentIndex(scanResult, config.site.name, config.site.description);
  writeJson("agent-index.json", index);

  const structured = await generateStructuredExports(scanResult, outDir, config);
  // File base name -> entries, in the order the files are written and reported.
  const dataFiles = [
    ["docs", structured.docs],
    ["faq", structured.faq],
    ["products", structured.products],
    ["articles", structured.articles],
    ["pricing", structured.pricing],
    ["changelog", structured.changelog]
  ];
  for (const [name, entries] of dataFiles) {
    writeJson(`data/${name}.json`, entries);
  }

  await runHook(plugins, "afterGenerate", outDir);
  sp.succeed(`Generated ${count} files`);

  log.info(`Output directory: ${outDir}/`);
  // Report the encoded size: string length counts UTF-16 code units, which
  // differs from the byte size on disk for non-ASCII content.
  log.info(` llms.txt (${Buffer.byteLength(llmsTxt, "utf-8")} bytes)`);
  // Report the number of pages actually written to the sitemap.
  log.info(` agent-sitemap.json (${sitemap.totalPages} pages)`);
  log.info(` agent-index.json`);
  for (const [name, entries] of dataFiles) {
    log.info(` data/${name}.json (${entries.length} entries)`);
  }
}
|
|
267
|
+
/**
 * Register the `generate` sub-command on the given CLI program. The command's
 * action runs `runGenerate()` with the default configuration.
 *
 * @param {object} program - CLI program exposing `command()`, whose result is chained with `description()` and `action()`.
 * @returns {void}
 */
function registerGenerateCommand(program) {
  const generateCommand = program.command("generate");
  const describedCommand = generateCommand.description("Generate Agent-friendly files from scan results");
  describedCommand.action(async () => {
    await runGenerate();
  });
}
|
|
272
|
+
|
|
273
|
+
export {
|
|
274
|
+
generateLlmsTxt,
|
|
275
|
+
generateAgentSitemap,
|
|
276
|
+
generateAgentIndex,
|
|
277
|
+
generateStructuredExports,
|
|
278
|
+
runGenerate,
|
|
279
|
+
registerGenerateCommand
|
|
280
|
+
};
|
|
281
|
+
//# sourceMappingURL=chunk-AYLZUDQP.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/commands/generate.ts","../src/generator/llms-txt.ts","../src/generator/agent-sitemap.ts","../src/generator/agent-index.ts","../src/generator/structured-export.ts"],"sourcesContent":["import { Command } from 'commander';\nimport { readFileSync, writeFileSync, existsSync, mkdirSync } from 'node:fs';\nimport { loadConfig } from '../config/loader.js';\nimport { generateLlmsTxt } from '../generator/llms-txt.js';\nimport { generateAgentSitemap } from '../generator/agent-sitemap.js';\nimport { generateAgentIndex } from '../generator/agent-index.js';\nimport { generateStructuredExports } from '../generator/structured-export.js';\nimport { loadPlugins, runHook } from '../plugins/loader.js';\nimport { log, spinner } from '../utils/logger.js';\nimport type { ScanResult } from '../types/page.js';\n\nexport async function runGenerate(configOverride?: ReturnType<typeof loadConfig>): Promise<void> {\n const config = configOverride ?? loadConfig();\n const outDir = config.output.dir;\n const scanPath = `${outDir}/scan-result.json`;\n\n if (!existsSync(scanPath)) {\n log.error('No scan result found. Run `agentsite scan` first.');\n throw new Error('No scan result found');\n }\n\n const scanResult: ScanResult = JSON.parse(readFileSync(scanPath, 'utf-8'));\n mkdirSync(`${outDir}/data`, { recursive: true });\n\n // Load plugins\n const plugins = await loadPlugins(config.plugins ?? 
[]);\n await runHook(plugins, 'beforeGenerate', scanResult);\n\n const sp = spinner('Generating files...');\n let count = 0;\n\n // llms.txt\n const llmsTxt = generateLlmsTxt(scanResult, config.site.name, config.site.description);\n writeFileSync(`${outDir}/llms.txt`, llmsTxt, 'utf-8');\n count++;\n\n // agent-sitemap.json\n const sitemap = generateAgentSitemap(scanResult);\n writeFileSync(`${outDir}/agent-sitemap.json`, JSON.stringify(sitemap, null, 2), 'utf-8');\n count++;\n\n // agent-index.json\n const index = generateAgentIndex(scanResult, config.site.name, config.site.description);\n writeFileSync(`${outDir}/agent-index.json`, JSON.stringify(index, null, 2), 'utf-8');\n count++;\n\n // Structured exports\n const { docs, faq, products, articles, pricing, changelog } = await generateStructuredExports(scanResult, outDir, config);\n writeFileSync(`${outDir}/data/docs.json`, JSON.stringify(docs, null, 2), 'utf-8');\n writeFileSync(`${outDir}/data/faq.json`, JSON.stringify(faq, null, 2), 'utf-8');\n writeFileSync(`${outDir}/data/products.json`, JSON.stringify(products, null, 2), 'utf-8');\n writeFileSync(`${outDir}/data/articles.json`, JSON.stringify(articles, null, 2), 'utf-8');\n writeFileSync(`${outDir}/data/pricing.json`, JSON.stringify(pricing, null, 2), 'utf-8');\n writeFileSync(`${outDir}/data/changelog.json`, JSON.stringify(changelog, null, 2), 'utf-8');\n count += 6;\n\n // Run afterGenerate plugin hooks\n await runHook(plugins, 'afterGenerate', outDir);\n\n sp.succeed(`Generated ${count} files`);\n\n log.info(`Output directory: ${outDir}/`);\n log.info(` llms.txt (${llmsTxt.length} bytes)`);\n log.info(` agent-sitemap.json (${scanResult.totalPages} pages)`);\n log.info(` agent-index.json`);\n log.info(` data/docs.json (${docs.length} entries)`);\n log.info(` data/faq.json (${faq.length} entries)`);\n log.info(` data/products.json (${products.length} entries)`);\n log.info(` data/articles.json (${articles.length} entries)`);\n log.info(` data/pricing.json 
(${pricing.length} entries)`);\n log.info(` data/changelog.json (${changelog.length} entries)`);\n}\n\nexport function registerGenerateCommand(program: Command) {\n program\n .command('generate')\n .description('Generate Agent-friendly files from scan results')\n .action(async () => {\n await runGenerate();\n });\n}\n","import type { ScanResult } from '../types/page.js';\n\nexport function generateLlmsTxt(scanResult: ScanResult, siteName: string, siteDescription: string): string {\n const lines: string[] = [];\n lines.push(`# ${siteName}`);\n lines.push(`> ${siteDescription}`);\n lines.push('');\n\n const grouped = new Map<string, typeof scanResult.pages>();\n for (const page of scanResult.pages) {\n const group = grouped.get(page.type) ?? [];\n group.push(page);\n grouped.set(page.type, group);\n }\n\n const sectionOrder = ['docs', 'faq', 'product', 'blog', 'pricing', 'about', 'contact', 'homepage', 'unknown'];\n const sectionLabels: Record<string, string> = {\n docs: 'Docs',\n faq: 'FAQ',\n product: 'Products',\n blog: 'Articles',\n pricing: 'Pricing',\n about: 'About',\n contact: 'Contact',\n homepage: 'Home',\n unknown: 'Other',\n };\n\n for (const type of sectionOrder) {\n const pages = grouped.get(type);\n if (!pages?.length) continue;\n\n lines.push(`## ${sectionLabels[type] ?? type}`);\n for (const page of pages) {\n const summary = page.summary ? 
`: ${page.summary}` : '';\n lines.push(`- [${page.title || page.url}](${page.url})${summary}`);\n }\n lines.push('');\n }\n\n return lines.join('\\n');\n}\n","import type { ScanResult } from '../types/page.js';\n\ninterface AgentSitemapEntry {\n url: string;\n title: string;\n type: string;\n summary: string;\n lastModified?: string;\n wordCount: number;\n}\n\nexport function generateAgentSitemap(scanResult: ScanResult): object {\n const entries: AgentSitemapEntry[] = scanResult.pages.map((p) => ({\n url: p.url,\n title: p.title,\n type: p.type,\n summary: p.summary,\n lastModified: p.lastModified,\n wordCount: p.wordCount,\n }));\n\n return {\n version: '1.0',\n generatedAt: new Date().toISOString(),\n siteUrl: scanResult.siteUrl,\n totalPages: entries.length,\n pages: entries,\n };\n}\n","import type { ScanResult } from '../types/page.js';\n\nexport function generateAgentIndex(scanResult: ScanResult, siteName: string, siteDescription: string): object {\n const typeGroups: Record<string, number> = {};\n for (const p of scanResult.pages) {\n typeGroups[p.type] = (typeGroups[p.type] ?? 
0) + 1;\n }\n\n const endpoints: Record<string, string> = {\n search: '/api/search?q={query}',\n pages: '/api/pages/{id}',\n faq: '/api/faq',\n products: '/api/products',\n docs: '/api/docs',\n articles: '/api/articles',\n pricing: '/api/pricing',\n changelog: '/api/changelog',\n stats: '/api/stats',\n config: '/api/config',\n };\n\n return {\n version: '1.0',\n site: {\n name: siteName,\n description: siteDescription,\n url: scanResult.siteUrl,\n },\n stats: {\n totalPages: scanResult.totalPages,\n scannedAt: scanResult.scannedAt,\n pageTypes: typeGroups,\n },\n endpoints,\n files: {\n 'llms.txt': 'LLM-friendly site overview',\n 'agent-sitemap.json': 'Machine-readable sitemap with metadata',\n 'data/docs.json': 'Documentation entries',\n 'data/faq.json': 'FAQ entries',\n 'data/articles.json': 'Blog/article entries',\n 'data/products.json': 'Product entries',\n 'data/pricing.json': 'Pricing/plan entries',\n 'data/changelog.json': 'Changelog/release entries',\n },\n };\n}\n","import { readFileSync, existsSync } from 'node:fs';\nimport { sha256 } from '../utils/hash.js';\nimport { extractContent } from '../scanner/content-extractor.js';\nimport { getLlmConfig, type LlmConfig } from '../llm/client.js';\nimport { llmExtractTags } from '../llm/summarizer.js';\nimport type { ScanResult } from '../types/page.js';\nimport type { DocEntry, FaqEntry, ProductEntry, ArticleEntry, PricingEntry, ChangelogEntry } from '../types/content.js';\nimport type { AgentSiteConfig } from '../types/config.js';\n\nfunction loadCachedHtml(outDir: string, url: string): string | null {\n const path = `${outDir}/cache/pages/${sha256(url)}.html`;\n if (!existsSync(path)) return null;\n return readFileSync(path, 'utf-8');\n}\n\nasync function tryExtractTags(llm: LlmConfig | null, title: string, bodyText: string): Promise<string[]> {\n if (!llm) return [];\n try {\n return await llmExtractTags(llm, title, bodyText);\n } catch {\n return [];\n }\n}\n\nexport async function 
generateStructuredExports(scanResult: ScanResult, outDir: string, config?: AgentSiteConfig) {\n const docs: DocEntry[] = [];\n const faq: FaqEntry[] = [];\n const products: ProductEntry[] = [];\n const articles: ArticleEntry[] = [];\n const pricing: PricingEntry[] = [];\n const changelog: ChangelogEntry[] = [];\n const now = new Date().toISOString();\n const llm = getLlmConfig(config);\n\n for (const page of scanResult.pages) {\n const html = loadCachedHtml(outDir, page.url);\n if (!html) continue;\n\n const content = extractContent(html, page.url);\n\n switch (page.type) {\n case 'docs': {\n const tags = await tryExtractTags(llm, page.title, content.bodyText);\n docs.push({\n title: page.title,\n url: page.url,\n section: page.headings[0] ?? '',\n summary: page.summary,\n tags,\n updated_at: page.lastModified ?? now,\n });\n break;\n }\n\n case 'faq':\n if (content.faqItems.length > 0) {\n for (const item of content.faqItems) {\n faq.push({\n question: item.question,\n answer: item.answer,\n category: page.headings[0] ?? 'General',\n url: page.url,\n updated_at: page.lastModified ?? now,\n });\n }\n } else {\n faq.push({\n question: page.title,\n answer: page.summary,\n category: 'General',\n url: page.url,\n updated_at: page.lastModified ?? now,\n });\n }\n break;\n\n case 'product':\n products.push({\n product_name: page.title,\n description: page.summary,\n features: content.features.slice(0, 10),\n pricing: '',\n url: page.url,\n updated_at: page.lastModified ?? now,\n });\n break;\n\n case 'blog': {\n const tags = await tryExtractTags(llm, page.title, content.bodyText);\n articles.push({\n title: page.title,\n summary: page.summary,\n published_at: page.lastModified ?? now,\n updated_at: page.lastModified ?? now,\n tags,\n url: page.url,\n });\n break;\n }\n\n case 'pricing':\n pricing.push({\n plan_name: page.title,\n price: content.bodyText.match(/\\$\\d+(?:\\.\\d{2})?/)?.[0] ?? 
'',\n features: content.features.slice(0, 10),\n url: page.url,\n updated_at: page.lastModified ?? now,\n });\n break;\n\n case 'changelog':\n changelog.push({\n version: page.version ?? page.title,\n date: page.publishedAt ?? page.lastModified ?? now,\n changes: content.headings.slice(1),\n url: page.url,\n });\n break;\n }\n }\n\n return { docs, faq, products, articles, pricing, changelog };\n}\n"],"mappings":";;;;;;;;;;;;;AACA,SAAS,gBAAAA,eAAc,eAAe,cAAAC,aAAY,iBAAiB;;;ACC5D,SAAS,gBAAgB,YAAwB,UAAkB,iBAAiC;AACzG,QAAM,QAAkB,CAAC;AACzB,QAAM,KAAK,KAAK,QAAQ,EAAE;AAC1B,QAAM,KAAK,KAAK,eAAe,EAAE;AACjC,QAAM,KAAK,EAAE;AAEb,QAAM,UAAU,oBAAI,IAAqC;AACzD,aAAW,QAAQ,WAAW,OAAO;AACnC,UAAM,QAAQ,QAAQ,IAAI,KAAK,IAAI,KAAK,CAAC;AACzC,UAAM,KAAK,IAAI;AACf,YAAQ,IAAI,KAAK,MAAM,KAAK;AAAA,EAC9B;AAEA,QAAM,eAAe,CAAC,QAAQ,OAAO,WAAW,QAAQ,WAAW,SAAS,WAAW,YAAY,SAAS;AAC5G,QAAM,gBAAwC;AAAA,IAC5C,MAAM;AAAA,IACN,KAAK;AAAA,IACL,SAAS;AAAA,IACT,MAAM;AAAA,IACN,SAAS;AAAA,IACT,OAAO;AAAA,IACP,SAAS;AAAA,IACT,UAAU;AAAA,IACV,SAAS;AAAA,EACX;AAEA,aAAW,QAAQ,cAAc;AAC/B,UAAM,QAAQ,QAAQ,IAAI,IAAI;AAC9B,QAAI,CAAC,OAAO,OAAQ;AAEpB,UAAM,KAAK,MAAM,cAAc,IAAI,KAAK,IAAI,EAAE;AAC9C,eAAW,QAAQ,OAAO;AACxB,YAAM,UAAU,KAAK,UAAU,KAAK,KAAK,OAAO,KAAK;AACrD,YAAM,KAAK,MAAM,KAAK,SAAS,KAAK,GAAG,KAAK,KAAK,GAAG,IAAI,OAAO,EAAE;AAAA,IACnE;AACA,UAAM,KAAK,EAAE;AAAA,EACf;AAEA,SAAO,MAAM,KAAK,IAAI;AACxB;;;AC9BO,SAAS,qBAAqB,YAAgC;AACnE,QAAM,UAA+B,WAAW,MAAM,IAAI,CAAC,OAAO;AAAA,IAChE,KAAK,EAAE;AAAA,IACP,OAAO,EAAE;AAAA,IACT,MAAM,EAAE;AAAA,IACR,SAAS,EAAE;AAAA,IACX,cAAc,EAAE;AAAA,IAChB,WAAW,EAAE;AAAA,EACf,EAAE;AAEF,SAAO;AAAA,IACL,SAAS;AAAA,IACT,cAAa,oBAAI,KAAK,GAAE,YAAY;AAAA,IACpC,SAAS,WAAW;AAAA,IACpB,YAAY,QAAQ;AAAA,IACpB,OAAO;AAAA,EACT;AACF;;;AC1BO,SAAS,mBAAmB,YAAwB,UAAkB,iBAAiC;AAC5G,QAAM,aAAqC,CAAC;AAC5C,aAAW,KAAK,WAAW,OAAO;AAChC,eAAW,EAAE,IAAI,KAAK,WAAW,EAAE,IAAI,KAAK,KAAK;AAAA,EACnD;AAEA,QAAM,YAAoC;AAAA,IACxC,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,KAAK;AAAA,IACL,UAAU;AAAA,IACV,MAAM;AAAA,IACN,UAAU;AAAA,IACV,SAAS;AAAA,IACT,WAAW;AAAA,IACX,OAAO;AAAA,IACP,QAAQ;AAAA
,EACV;AAEA,SAAO;AAAA,IACL,SAAS;AAAA,IACT,MAAM;AAAA,MACJ,MAAM;AAAA,MACN,aAAa;AAAA,MACb,KAAK,WAAW;AAAA,IAClB;AAAA,IACA,OAAO;AAAA,MACL,YAAY,WAAW;AAAA,MACvB,WAAW,WAAW;AAAA,MACtB,WAAW;AAAA,IACb;AAAA,IACA;AAAA,IACA,OAAO;AAAA,MACL,YAAY;AAAA,MACZ,sBAAsB;AAAA,MACtB,kBAAkB;AAAA,MAClB,iBAAiB;AAAA,MACjB,sBAAsB;AAAA,MACtB,sBAAsB;AAAA,MACtB,qBAAqB;AAAA,MACrB,uBAAuB;AAAA,IACzB;AAAA,EACF;AACF;;;AC7CA,SAAS,cAAc,kBAAkB;AASzC,SAAS,eAAe,QAAgB,KAA4B;AAClE,QAAM,OAAO,GAAG,MAAM,gBAAgB,OAAO,GAAG,CAAC;AACjD,MAAI,CAAC,WAAW,IAAI,EAAG,QAAO;AAC9B,SAAO,aAAa,MAAM,OAAO;AACnC;AAEA,eAAe,eAAe,KAAuB,OAAe,UAAqC;AACvG,MAAI,CAAC,IAAK,QAAO,CAAC;AAClB,MAAI;AACF,WAAO,MAAM,eAAe,KAAK,OAAO,QAAQ;AAAA,EAClD,QAAQ;AACN,WAAO,CAAC;AAAA,EACV;AACF;AAEA,eAAsB,0BAA0B,YAAwB,QAAgB,QAA0B;AAChH,QAAM,OAAmB,CAAC;AAC1B,QAAM,MAAkB,CAAC;AACzB,QAAM,WAA2B,CAAC;AAClC,QAAM,WAA2B,CAAC;AAClC,QAAM,UAA0B,CAAC;AACjC,QAAM,YAA8B,CAAC;AACrC,QAAM,OAAM,oBAAI,KAAK,GAAE,YAAY;AACnC,QAAM,MAAM,aAAa,MAAM;AAE/B,aAAW,QAAQ,WAAW,OAAO;AACnC,UAAM,OAAO,eAAe,QAAQ,KAAK,GAAG;AAC5C,QAAI,CAAC,KAAM;AAEX,UAAM,UAAU,eAAe,MAAM,KAAK,GAAG;AAE7C,YAAQ,KAAK,MAAM;AAAA,MACjB,KAAK,QAAQ;AACX,cAAM,OAAO,MAAM,eAAe,KAAK,KAAK,OAAO,QAAQ,QAAQ;AACnE,aAAK,KAAK;AAAA,UACR,OAAO,KAAK;AAAA,UACZ,KAAK,KAAK;AAAA,UACV,SAAS,KAAK,SAAS,CAAC,KAAK;AAAA,UAC7B,SAAS,KAAK;AAAA,UACd;AAAA,UACA,YAAY,KAAK,gBAAgB;AAAA,QACnC,CAAC;AACD;AAAA,MACF;AAAA,MAEA,KAAK;AACH,YAAI,QAAQ,SAAS,SAAS,GAAG;AAC/B,qBAAW,QAAQ,QAAQ,UAAU;AACnC,gBAAI,KAAK;AAAA,cACP,UAAU,KAAK;AAAA,cACf,QAAQ,KAAK;AAAA,cACb,UAAU,KAAK,SAAS,CAAC,KAAK;AAAA,cAC9B,KAAK,KAAK;AAAA,cACV,YAAY,KAAK,gBAAgB;AAAA,YACnC,CAAC;AAAA,UACH;AAAA,QACF,OAAO;AACL,cAAI,KAAK;AAAA,YACP,UAAU,KAAK;AAAA,YACf,QAAQ,KAAK;AAAA,YACb,UAAU;AAAA,YACV,KAAK,KAAK;AAAA,YACV,YAAY,KAAK,gBAAgB;AAAA,UACnC,CAAC;AAAA,QACH;AACA;AAAA,MAEF,KAAK;AACH,iBAAS,KAAK;AAAA,UACZ,cAAc,KAAK;AAAA,UACnB,aAAa,KAAK;AAAA,UAClB,UAAU,QAAQ,SAAS,MAAM,GAAG,EAAE;AAAA,UACtC,SAAS;AAAA,UACT,KAAK,KAAK;AAAA,UACV,YAAY,KAAK,gBAAgB;AAAA,QACnC,CAAC;AACD;AAAA,MAEF,KAAK,QAAQ;AACX,cAAM,OAAO,MAAM,eAAe,KAAK,KAAK,OAAO,QAAQ,QAAQ;AACnE,i
BAAS,KAAK;AAAA,UACZ,OAAO,KAAK;AAAA,UACZ,SAAS,KAAK;AAAA,UACd,cAAc,KAAK,gBAAgB;AAAA,UACnC,YAAY,KAAK,gBAAgB;AAAA,UACjC;AAAA,UACA,KAAK,KAAK;AAAA,QACZ,CAAC;AACD;AAAA,MACF;AAAA,MAEA,KAAK;AACH,gBAAQ,KAAK;AAAA,UACX,WAAW,KAAK;AAAA,UAChB,OAAO,QAAQ,SAAS,MAAM,mBAAmB,IAAI,CAAC,KAAK;AAAA,UAC3D,UAAU,QAAQ,SAAS,MAAM,GAAG,EAAE;AAAA,UACtC,KAAK,KAAK;AAAA,UACV,YAAY,KAAK,gBAAgB;AAAA,QACnC,CAAC;AACD;AAAA,MAEF,KAAK;AACH,kBAAU,KAAK;AAAA,UACb,SAAS,KAAK,WAAW,KAAK;AAAA,UAC9B,MAAM,KAAK,eAAe,KAAK,gBAAgB;AAAA,UAC/C,SAAS,QAAQ,SAAS,MAAM,CAAC;AAAA,UACjC,KAAK,KAAK;AAAA,QACZ,CAAC;AACD;AAAA,IACJ;AAAA,EACF;AAEA,SAAO,EAAE,MAAM,KAAK,UAAU,UAAU,SAAS,UAAU;AAC7D;;;AJ/GA,eAAsB,YAAY,gBAA+D;AAC/F,QAAM,SAAS,kBAAkB,WAAW;AAC5C,QAAM,SAAS,OAAO,OAAO;AAC7B,QAAM,WAAW,GAAG,MAAM;AAE1B,MAAI,CAACC,YAAW,QAAQ,GAAG;AACzB,QAAI,MAAM,mDAAmD;AAC7D,UAAM,IAAI,MAAM,sBAAsB;AAAA,EACxC;AAEA,QAAM,aAAyB,KAAK,MAAMC,cAAa,UAAU,OAAO,CAAC;AACzE,YAAU,GAAG,MAAM,SAAS,EAAE,WAAW,KAAK,CAAC;AAG/C,QAAM,UAAU,MAAM,YAAY,OAAO,WAAW,CAAC,CAAC;AACtD,QAAM,QAAQ,SAAS,kBAAkB,UAAU;AAEnD,QAAM,KAAK,QAAQ,qBAAqB;AACxC,MAAI,QAAQ;AAGZ,QAAM,UAAU,gBAAgB,YAAY,OAAO,KAAK,MAAM,OAAO,KAAK,WAAW;AACrF,gBAAc,GAAG,MAAM,aAAa,SAAS,OAAO;AACpD;AAGA,QAAM,UAAU,qBAAqB,UAAU;AAC/C,gBAAc,GAAG,MAAM,uBAAuB,KAAK,UAAU,SAAS,MAAM,CAAC,GAAG,OAAO;AACvF;AAGA,QAAM,QAAQ,mBAAmB,YAAY,OAAO,KAAK,MAAM,OAAO,KAAK,WAAW;AACtF,gBAAc,GAAG,MAAM,qBAAqB,KAAK,UAAU,OAAO,MAAM,CAAC,GAAG,OAAO;AACnF;AAGA,QAAM,EAAE,MAAM,KAAK,UAAU,UAAU,SAAS,UAAU,IAAI,MAAM,0BAA0B,YAAY,QAAQ,MAAM;AACxH,gBAAc,GAAG,MAAM,mBAAmB,KAAK,UAAU,MAAM,MAAM,CAAC,GAAG,OAAO;AAChF,gBAAc,GAAG,MAAM,kBAAkB,KAAK,UAAU,KAAK,MAAM,CAAC,GAAG,OAAO;AAC9E,gBAAc,GAAG,MAAM,uBAAuB,KAAK,UAAU,UAAU,MAAM,CAAC,GAAG,OAAO;AACxF,gBAAc,GAAG,MAAM,uBAAuB,KAAK,UAAU,UAAU,MAAM,CAAC,GAAG,OAAO;AACxF,gBAAc,GAAG,MAAM,sBAAsB,KAAK,UAAU,SAAS,MAAM,CAAC,GAAG,OAAO;AACtF,gBAAc,GAAG,MAAM,wBAAwB,KAAK,UAAU,WAAW,MAAM,CAAC,GAAG,OAAO;AAC1F,WAAS;AAGT,QAAM,QAAQ,SAAS,iBAAiB,MAAM;AAE9C,KAAG,QAAQ,aAAa,KAAK,QAAQ;AAErC,MAAI,KAAK,qBAAqB,MAAM,GAAG;AACvC,MAAI,KAAK,eAAe,QAAQ,MAAM,SAAS;AAC/C,MAAI,KAAK,yBAAyB,WAAW,U
AAU,SAAS;AAChE,MAAI,KAAK,oBAAoB;AAC7B,MAAI,KAAK,qBAAqB,KAAK,MAAM,WAAW;AACpD,MAAI,KAAK,oBAAoB,IAAI,MAAM,WAAW;AAClD,MAAI,KAAK,yBAAyB,SAAS,MAAM,WAAW;AAC5D,MAAI,KAAK,yBAAyB,SAAS,MAAM,WAAW;AAC5D,MAAI,KAAK,wBAAwB,QAAQ,MAAM,WAAW;AAC1D,MAAI,KAAK,0BAA0B,UAAU,MAAM,WAAW;AAChE;AAEO,SAAS,wBAAwB,SAAkB;AACxD,UACG,QAAQ,UAAU,EAClB,YAAY,iDAAiD,EAC7D,OAAO,YAAY;AAClB,UAAM,YAAY;AAAA,EACpB,CAAC;AACL;","names":["readFileSync","existsSync","existsSync","readFileSync"]}
|
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
import {
|
|
2
|
+
chatCompletion,
|
|
3
|
+
extractContent,
|
|
4
|
+
getLlmConfig,
|
|
5
|
+
llmSummarize,
|
|
6
|
+
loadConfig,
|
|
7
|
+
loadPlugins,
|
|
8
|
+
log,
|
|
9
|
+
runAfterScan,
|
|
10
|
+
runHook,
|
|
11
|
+
sha256,
|
|
12
|
+
spinner
|
|
13
|
+
} from "./chunk-YWUDTSOR.js";
|
|
14
|
+
|
|
15
|
+
// src/commands/scan.ts
|
|
16
|
+
import { writeFileSync as writeFileSync2, mkdirSync as mkdirSync2 } from "fs";
|
|
17
|
+
|
|
18
|
+
// src/scanner/crawler.ts
|
|
19
|
+
import axios from "axios";
|
|
20
|
+
import * as cheerio from "cheerio";
|
|
21
|
+
import { writeFileSync, mkdirSync } from "fs";
|
|
22
|
+
import pLimit from "p-limit";
|
|
23
|
+
|
|
24
|
+
// src/utils/url.ts
|
|
25
|
+
/**
 * Normalise a user-supplied site URL: trim surrounding whitespace, prepend
 * `https://` when the value does not already start with `http://` or
 * `https://` (case-insensitive), and drop all trailing slashes.
 *
 * @param {string} raw - URL as typed by the user.
 * @returns {string} The normalised URL.
 */
function normalizeUrl(raw) {
  const trimmed = raw.trim();
  const hasHttpScheme = /^https?:\/\//i.test(trimmed);
  const withScheme = hasHttpScheme ? trimmed : "https://" + trimmed;

  // Walk back over the run of trailing slashes and cut it off.
  let end = withScheme.length;
  while (end > 0 && withScheme[end - 1] === "/") {
    end -= 1;
  }
  return withScheme.slice(0, end);
}
|
|
30
|
+
/**
 * Tell whether two absolute URLs share the same origin (scheme, host, port).
 *
 * URLs with an opaque origin (for example `mailto:`, `data:` or `file:` URLs)
 * all serialize their origin as the string "null". Comparing those strings
 * would report unrelated URLs as same-origin, so an opaque origin never
 * matches anything.
 *
 * @param {string} base - Reference URL.
 * @param {string} target - URL to compare against the reference.
 * @returns {boolean} `true` only when both parse and have the same non-opaque origin.
 */
function isSameOrigin(base, target) {
  try {
    const baseOrigin = new URL(base).origin;
    if (baseOrigin === "null") return false;
    return baseOrigin === new URL(target).origin;
  } catch {
    // Either value is not a parseable absolute URL.
    return false;
  }
}
|
|
37
|
+
/**
 * Derive an identifier from a URL by base64url-encoding it. The encoding is
 * reversible, so the original URL can be recovered from the id.
 *
 * @param {string} url - URL to encode.
 * @returns {string} base64url form of the URL.
 */
function urlToId(url) {
  const encoded = Buffer.from(url);
  return encoded.toString("base64url");
}
|
|
40
|
+
|
|
41
|
+
// src/scanner/crawler.ts
|
|
42
|
+
/**
 * Crawl a site breadth-first, starting from the seed URLs and the site's base
 * URL, caching every fetched page under `<output.dir>/cache/pages/`.
 *
 * URLs are fetched in batches of `concurrency`. Links found on a page are
 * resolved against that page, stripped of fragment and query string, and
 * followed only when they share the base URL's origin. The crawl stops when
 * the frontier is empty or `maxPages` URLs have been attempted; failed
 * attempts count towards that limit.
 *
 * Each URL enters the frontier at most once, so pages that link to each other
 * repeatedly do not inflate the queue or produce empty batches.
 *
 * @param {object} config - Reads `scan.maxPages`, `scan.concurrency`, `scan.delayMs`, `site.url`, `output.dir`.
 * @param {string[]} seedUrls - URLs to visit before the base URL.
 * @param {(url: string, visitedCount: number) => void} [onPage] - Called after each successful fetch.
 * @returns {Promise<Array<{ url: string, html: string, status: number }>>} Successfully fetched pages.
 */
async function crawlSite(config, seedUrls, onPage) {
  const { maxPages, concurrency, delayMs } = config.scan;
  const baseUrl = config.site.url;
  const visited = new Set();
  // Every URL that has ever been queued; a superset of `visited`.
  const enqueued = new Set();
  const queue = [];
  const results = [];
  const limit = pLimit(concurrency);
  const cacheDir = `${config.output.dir}/cache/pages`;
  mkdirSync(cacheDir, { recursive: true });
  const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const enqueue = (candidate) => {
    if (enqueued.has(candidate)) return;
    enqueued.add(candidate);
    queue.push(candidate);
  };
  for (const seed of [...seedUrls, baseUrl]) {
    enqueue(seed);
  }

  while (queue.length > 0 && visited.size < maxPages) {
    const batch = queue.splice(0, concurrency);
    const tasks = batch.map(
      (url) => limit(async () => {
        // Re-check inside the task: earlier tasks of this batch may already
        // have used up the page budget.
        if (visited.has(url) || visited.size >= maxPages) return null;
        visited.add(url);
        try {
          const res = await axios.get(url, {
            timeout: 15e3,
            headers: { "User-Agent": "AgentSite-Kit/0.1" },
            maxRedirects: 3
          });
          const html = res.data;
          onPage?.(url, visited.size);
          const filename = sha256(url);
          writeFileSync(`${cacheDir}/${filename}.html`, html, "utf-8");

          const $ = cheerio.load(html);
          $("a[href]").each((_, el) => {
            try {
              const href = $(el).attr("href");
              if (!href) return;
              const resolved = new URL(href, url).href.split("#")[0].split("?")[0];
              if (isSameOrigin(baseUrl, resolved)) {
                enqueue(resolved);
              }
            } catch {
              // Unparseable href: ignore this link and keep scanning the page.
            }
          });

          if (delayMs > 0) await delay(delayMs);
          return { url, html, status: res.status };
        } catch {
          // Best-effort crawl: a failed request, cache write or parse drops
          // this page without aborting the rest of the crawl.
          return null;
        }
      })
    );
    const settled = await Promise.all(tasks);
    for (const page of settled) {
      if (page) results.push(page);
    }
  }
  return results;
}
|
|
95
|
+
|
|
96
|
+
// src/scanner/sitemap-parser.ts
|
|
97
|
+
import axios2 from "axios";
|
|
98
|
+
import { XMLParser } from "fast-xml-parser";
|
|
99
|
+
/**
 * Fetch `<siteUrl>/sitemap.xml` and collect the page URLs it lists.
 *
 * Handles both a plain `<urlset>` and a `<sitemapindex>`; for an index, each
 * child sitemap is fetched through `parseSitemapFromUrl`. Any network or
 * parsing failure is swallowed and whatever was collected so far is returned,
 * so callers can treat an empty array as "no usable sitemap".
 *
 * @param {string} siteUrl - Site base URL, with or without a trailing slash.
 * @returns {Promise<string[]>} The `loc` values found (possibly empty).
 */
async function parseSitemap(siteUrl) {
  const urls = [];
  // Strip trailing slashes so "https://host/" does not become
  // "https://host//sitemap.xml", which many servers answer with a 404.
  const sitemapUrl = `${String(siteUrl).replace(/\/+$/, "")}/sitemap.xml`;
  try {
    const res = await axios2.get(sitemapUrl, {
      timeout: 1e4,
      headers: { "User-Agent": "AgentSite-Kit/0.1" }
    });
    const parser = new XMLParser();
    const parsed = parser.parse(res.data);
    if (parsed.sitemapindex?.sitemap) {
      // The parser yields a bare object for a single child and an array for several.
      const sitemaps = Array.isArray(parsed.sitemapindex.sitemap) ? parsed.sitemapindex.sitemap : [parsed.sitemapindex.sitemap];
      for (const sm of sitemaps) {
        if (sm.loc) {
          const childUrls = await parseSitemapFromUrl(sm.loc);
          urls.push(...childUrls);
        }
      }
    }
    if (parsed.urlset?.url) {
      const entries = Array.isArray(parsed.urlset.url) ? parsed.urlset.url : [parsed.urlset.url];
      for (const entry of entries) {
        if (entry.loc) urls.push(entry.loc);
      }
    }
  } catch {
    // Best-effort: a missing or malformed sitemap is not an error for the caller.
  }
  return urls;
}
|
|
128
|
+
/**
 * Download one sitemap document and return the `loc` of every `<url>` entry.
 *
 * Only a `<urlset>` root is understood here. Any failure (network, parsing,
 * unexpected shape) results in an empty array rather than an exception.
 *
 * @param {string} url - Absolute URL of the sitemap document.
 * @returns {Promise<string[]>} Truthy `loc` values in document order.
 */
async function parseSitemapFromUrl(url) {
  const locations = [];
  try {
    const requestOptions = {
      timeout: 1e4,
      headers: { "User-Agent": "AgentSite-Kit/0.1" }
    };
    const response = await axios2.get(url, requestOptions);
    const document = new XMLParser().parse(response.data);
    const listed = document.urlset?.url;
    if (!listed) return [];
    // A single <url> is parsed as a bare object, several as an array.
    const entries = Array.isArray(listed) ? listed : [listed];
    for (const entry of entries) {
      const loc = entry.loc;
      if (loc) locations.push(loc);
    }
  } catch {
    // Treat any failure as "this sitemap contributes nothing".
    return [];
  }
  return locations;
}
|
|
144
|
+
|
|
145
|
+
// src/scanner/page-classifier.ts
|
|
146
|
+
// URL-based classification rules, stored as [pattern, pageType, weight] tuples.
// classifyPage tests each pattern against the page URL and adds `weight` to
// the score of `pageType` on a match.
var URL_PATTERNS = [
  { pattern: /\/(docs?|documentation|guide|manual|reference|api-docs)(\/|$)/i, type: "docs", weight: 3 },
  { pattern: /\/(faq|frequently-asked|help\/faq)(\/|$)/i, type: "faq", weight: 3 },
  { pattern: /\/(blog|posts?|articles?|news)(\/|$)/i, type: "blog", weight: 3 },
  { pattern: /\/(products?|features?|solutions?)(\/|$)/i, type: "product", weight: 2 },
  { pattern: /\/(pricing|plans?|packages?)(\/|$)/i, type: "pricing", weight: 3 },
  { pattern: /\/(about|about-us|team|company)(\/|$)/i, type: "about", weight: 3 },
  { pattern: /\/(contact|contact-us|support)(\/|$)/i, type: "contact", weight: 3 },
  { pattern: /\/(changelog|changes|release-notes|releases|updates|whats-new)(\/|$)/i, type: "changelog", weight: 3 }
].map((rule) => [rule.pattern, rule.type, rule.weight]);
|
|
156
|
+
// Title-based classification rules, stored as [pattern, pageType, weight]
// tuples. classifyPage tests each pattern against the page title and adds
// `weight` to the score of `pageType` on a match.
var TITLE_KEYWORDS = [
  { pattern: /\b(documentation|docs|guide|reference|api)\b/i, type: "docs", weight: 2 },
  { pattern: /\b(faq|frequently asked|common questions)\b/i, type: "faq", weight: 2 },
  { pattern: /\b(blog|article|post)\b/i, type: "blog", weight: 2 },
  { pattern: /\b(pricing|plans|packages)\b/i, type: "pricing", weight: 2 },
  { pattern: /\b(product|feature|solution)\b/i, type: "product", weight: 1 },
  { pattern: /\b(about|team|company|who we are)\b/i, type: "about", weight: 2 },
  { pattern: /\b(contact|get in touch|support)\b/i, type: "contact", weight: 2 },
  { pattern: /\b(changelog|release notes|what's new|updates|releases)\b/i, type: "changelog", weight: 2 }
].map((rule) => [rule.pattern, rule.type, rule.weight]);
|
|
166
|
+
/**
 * Heuristically classify a page from its URL, title and body text.
 *
 * A URL whose path is the site root is always "homepage". Otherwise every
 * matching rule in URL_PATTERNS (against the URL) and TITLE_KEYWORDS (against
 * the title) adds its weight to that rule's page type, and two body-text
 * signals add 2 points each for "faq" and "pricing". The type with the
 * highest score wins, but only if that score is strictly greater than 1;
 * ties go to the type that was scored first.
 *
 * @param {{url: string, title: string, bodyText: string}} input - Other
 *   properties on the object are ignored by this function.
 * @returns {string} The winning page type, "homepage", or "unknown".
 */
function classifyPage(input) {
  const { url, title, bodyText } = input;

  let isSiteRoot = false;
  try {
    const { pathname } = new URL(url);
    isSiteRoot = pathname === "/" || pathname === "";
  } catch {
    // Not an absolute URL: fall through to the pattern-based scoring.
  }
  if (isSiteRoot) return "homepage";

  const tally = /* @__PURE__ */ new Map();
  const bump = (pageType, points) => {
    const current = tally.get(pageType) ?? 0;
    tally.set(pageType, current + points);
  };
  const applyRules = (rules, text) => {
    for (const rule of rules) {
      if (rule[0].test(text)) bump(rule[1], rule[2]);
    }
  };
  applyRules(URL_PATTERNS, url);
  applyRules(TITLE_KEYWORDS, title);

  // Several question marks followed by whitespace suggest a Q&A layout.
  const questionMarks = bodyText.match(/\?[\s\n]/g);
  if (questionMarks !== null && questionMarks.length >= 3) bump("faq", 2);
  if (/\$\d+|\d+\/mo|per month|free tier/i.test(bodyText)) bump("pricing", 2);

  // A score of exactly 1 is not enough evidence, hence the starting bar of 1.
  let winner = "unknown";
  let topScore = 1;
  tally.forEach((score, pageType) => {
    if (score > topScore) {
      winner = pageType;
      topScore = score;
    }
  });
  return winner;
}
|
|
198
|
+
|
|
199
|
+
// src/llm/classifier.ts
|
|
200
|
+
// Page types the LLM classifier may return. "changelog" is included so the
// LLM path can produce every type the heuristic classifier (classifyPage) can;
// "unknown" stays last as the catch-all.
var VALID_TYPES = ["homepage", "docs", "faq", "blog", "product", "pricing", "about", "contact", "changelog", "unknown"];
|
|
201
|
+
/**
 * Ask the LLM to classify a page into one of VALID_TYPES.
 *
 * Only the first 2000 characters of the body are sent. The reply is accepted
 * when, lower-cased and trimmed, it is exactly a valid type. It is also
 * accepted when it is decorated ("Docs.", "`blog`", "Type: pricing") but
 * mentions exactly one valid type. Anything else yields "unknown".
 *
 * @param {object} config - LLM configuration, passed through to chatCompletion.
 * @param {string} url - Page URL (included in the prompt).
 * @param {string} title - Page title (included in the prompt).
 * @param {string} bodyText - Page text; truncated before sending.
 * @returns {Promise<string>} One of VALID_TYPES.
 * @throws Whatever chatCompletion rejects with, or a TypeError if the reply
 *   is not a string. runScan catches these and falls back to classifyPage.
 */
async function llmClassifyPage(config, url, title, bodyText) {
  const truncated = bodyText.slice(0, 2e3);
  const response = await chatCompletion(config, [
    {
      role: "system",
      content: `You are a web page classifier. Classify the page into exactly one of these types: ${VALID_TYPES.join(", ")}. Reply with only the type name, nothing else.`
    },
    {
      role: "user",
      content: `URL: ${url}
Title: ${title}

Content:
${truncated}`
    }
  ], { temperature: 0.1, maxTokens: 32 });
  // Deliberately unguarded: a non-string reply must throw so the caller can
  // fall back to the heuristic classifier instead of recording "unknown".
  const cleaned = response.toLowerCase().trim();
  if (VALID_TYPES.includes(cleaned)) return cleaned;
  // Models often add quotes, punctuation or a label despite the instruction.
  // Accept such a reply only when it names a single valid type.
  const words = cleaned.match(/[a-z]+/g) ?? [];
  const mentioned = new Set(words.filter((word) => VALID_TYPES.includes(word)));
  if (mentioned.size === 1) return [...mentioned][0];
  return "unknown";
}
|
|
221
|
+
|
|
222
|
+
// src/commands/scan.ts
|
|
223
|
+
/**
 * Run a full site scan: read the sitemap, crawl the site, extract and
 * classify every crawled page, then write `<output.dir>/scan-result.json`.
 *
 * When an LLM configuration is available, classification and summarization
 * are attempted through the LLM first. A failed LLM classification falls back
 * to classifyPage, and a failed LLM summary keeps the extracted summary.
 * Plugins get a "beforeScan" hook and may rewrite the page list via
 * runAfterScan.
 *
 * @param {object} [configOverride] - Configuration to use instead of loadConfig().
 * @returns {Promise<{siteUrl: string, scannedAt: string, totalPages: number, pages: object[]}>}
 */
async function runScan(configOverride) {
  const config = configOverride ?? loadConfig();
  const outDir = config.output.dir;
  mkdirSync2(`${outDir}/cache/pages`, { recursive: true });
  mkdirSync2(`${outDir}/data`, { recursive: true });

  const plugins = await loadPlugins(config.plugins ?? []);
  await runHook(plugins, "beforeScan", config);

  const llmConfig = getLlmConfig(config);
  if (llmConfig) {
    log.info("LLM enabled \u2014 using AI-assisted classification & summarization");
  }

  // Step 1: seed URLs from the sitemap, if there is one.
  const sitemapSpinner = spinner("Fetching sitemap...");
  const sitemapUrls = await parseSitemap(config.site.url);
  if (sitemapUrls.length > 0) {
    sitemapSpinner.succeed(`Found ${sitemapUrls.length} URLs in sitemap`);
  } else {
    sitemapSpinner.info("No sitemap found, will discover pages by crawling");
  }

  // Step 2: crawl.
  const crawlSpinner = spinner("Crawling site...");
  const reportCrawlProgress = (url, i) => {
    crawlSpinner.text = `Crawling (${i}/${config.scan.maxPages})... ${url}`;
  };
  const crawled = await crawlSite(config, sitemapUrls, reportCrawlProgress);
  crawlSpinner.succeed(`Crawled ${crawled.length} pages`);

  // Step 3: extract, classify and summarize, one page at a time.
  const extractSpinner = spinner("Extracting content...");
  let pages = [];
  for (const [idx, { url, html }] of crawled.entries()) {
    const content = extractContent(html, url);
    extractSpinner.text = `Processing (${idx + 1}/${crawled.length})... ${url}`;

    const classifyHeuristically = () => classifyPage({
      url,
      title: content.title,
      metaOgType: content.metaOgType,
      headings: content.headings,
      bodyText: content.bodyText
    });

    let pageType;
    if (!llmConfig) {
      pageType = classifyHeuristically();
    } else {
      try {
        pageType = await llmClassifyPage(llmConfig, url, content.title, content.bodyText);
      } catch {
        // LLM call failed: use the rule-based classifier instead.
        pageType = classifyHeuristically();
      }
    }

    let summary = content.summary;
    if (llmConfig) {
      try {
        summary = await llmSummarize(llmConfig, content.title, content.bodyText);
      } catch {
        // Keep the extracted summary when the LLM summary fails.
      }
    }

    const page = {
      url,
      title: content.title,
      type: pageType,
      contentHash: sha256(content.bodyText),
      summary,
      headings: content.headings,
      lastModified: content.lastModified,
      scannedAt: (/* @__PURE__ */ new Date()).toISOString(),
      wordCount: content.wordCount,
      tags: content.tags,
      version: content.version,
      author: content.author,
      publishedAt: content.publishedAt,
      updatedAt: content.lastModified
    };
    pages.push(page);
  }
  extractSpinner.succeed(`Extracted content from ${pages.length} pages`);

  pages = await runAfterScan(plugins, pages);

  // Step 4: persist the result and print a per-type breakdown.
  const result = {
    siteUrl: config.site.url,
    scannedAt: (/* @__PURE__ */ new Date()).toISOString(),
    totalPages: pages.length,
    pages
  };
  const resultPath = `${outDir}/scan-result.json`;
  writeFileSync2(resultPath, JSON.stringify(result, null, 2), "utf-8");
  log.success(`Scan result saved to ${resultPath}`);

  const typeCounts = /* @__PURE__ */ new Map();
  pages.forEach((page) => {
    const seen = typeCounts.get(page.type) ?? 0;
    typeCounts.set(page.type, seen + 1);
  });
  log.info("Page types:");
  typeCounts.forEach((count, type) => {
    console.log(` ${type}: ${count}`);
  });
  return result;
}
|
|
318
|
+
/**
 * Register the `scan` sub-command on the given CLI program.
 *
 * With the default options the scan runs with the configuration runScan loads
 * itself. With `--no-llm` the configuration is loaded here, its `llm` entry
 * is cleared, and the scan runs with that modified configuration.
 *
 * @param {object} program - CLI program object exposing a chainable
 *   `command().description().option().action()` API.
 */
function registerScanCommand(program) {
  const handleScan = async (opts) => {
    if (opts.llm) {
      await runScan();
      return;
    }
    // --no-llm: drop the LLM section before scanning.
    const config = loadConfig();
    config.llm = void 0;
    await runScan(config);
  };

  const scanCommand = program.command("scan");
  scanCommand
    .description("Scan your website and classify pages")
    .option("--no-llm", "Disable LLM-assisted classification and summarization")
    .action(handleScan);
}
|
|
329
|
+
|
|
330
|
+
export {
|
|
331
|
+
normalizeUrl,
|
|
332
|
+
urlToId,
|
|
333
|
+
crawlSite,
|
|
334
|
+
parseSitemap,
|
|
335
|
+
classifyPage,
|
|
336
|
+
runScan,
|
|
337
|
+
registerScanCommand
|
|
338
|
+
};
|
|
339
|
+
//# sourceMappingURL=chunk-YWR5EH3F.js.map
|