@voicenter-team/nuxt-llms-generator 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunks/llms-files-generator.mjs +228 -208
- package/dist/module.d.mts +28 -34
- package/dist/module.d.ts +28 -34
- package/dist/module.json +1 -1
- package/dist/module.mjs +2 -1
- package/dist/shared/{nuxt-llms-generator.ab079b9f.mjs → nuxt-llms-generator.11eb2a36.mjs} +69 -15
- package/package.json +1 -7
|
@@ -4,10 +4,10 @@ import Mustache from 'mustache';
|
|
|
4
4
|
import Anthropic from '@anthropic-ai/sdk';
|
|
5
5
|
import { createHash } from 'crypto';
|
|
6
6
|
import { JSONPath } from 'jsonpath-plus';
|
|
7
|
-
import { T as TemplateError, E as ErrorCode, w as withErrorHandling } from '../shared/nuxt-llms-generator.ab079b9f.mjs';
|
|
8
|
-
import { NodeHtmlMarkdown } from 'node-html-markdown';
|
|
7
|
+
import { T as TemplateError, E as ErrorCode, w as withErrorHandling } from '../shared/nuxt-llms-generator.11eb2a36.mjs';
|
|
9
8
|
import '@nuxt/kit';
|
|
10
9
|
import 'zod';
|
|
10
|
+
import 'node-html-markdown';
|
|
11
11
|
|
|
12
12
|
class AnthropicClient {
|
|
13
13
|
client;
|
|
@@ -49,121 +49,133 @@ class AnthropicClient {
|
|
|
49
49
|
throw new Error("Failed to generate template");
|
|
50
50
|
}
|
|
51
51
|
buildPrompt(request) {
|
|
52
|
-
return
|
|
52
|
+
return `# llms\u2011aware Mustache Template Generator Prompt (Key\u2011Agnostic)
|
|
53
53
|
|
|
54
|
-
|
|
54
|
+
You are an expert in crafting **Mustache.js templates** whose output is **clean Markdown pages** (with \`.md\` extension) intended for **LLM-friendly ingestion** following the [\`llms.txt\`](https://llmstxt.org/) standard.
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## \u{1F3AF} Purpose & Goal
|
|
59
|
+
|
|
60
|
+
The \`.md\` files generated from these templates represent the **structured, machine-readable versions of website pages**.
|
|
61
|
+
They are designed to be indexed and understood by **Large Language Models** through the \`llms.txt\` manifest.
|
|
62
|
+
Each \`.md\` file provides semantic, human-readable, and hierarchically organized content for LLMs to learn from.
|
|
63
|
+
|
|
64
|
+
Your task: Given an arbitrary JSON page structure (potentially deeply nested, dynamic, and unpredictable), produce a **generic Mustache template** that renders this JSON into an \`.md\` file conforming to \`llms.txt\` principles \u2014 emphasizing clarity, structure, and interpretability.
|
|
65
|
+
|
|
66
|
+
These templates must be domain-agnostic: usable for **B2B SaaS**, **marketplaces**, **landing pages**, **blogs**, **docs**, or **personal websites**.
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## \u{1F4E5} Context Supplied to You
|
|
55
71
|
|
|
56
|
-
**CURRENT PAGE ANALYSIS:**
|
|
57
72
|
- URL: ${request.url}
|
|
58
|
-
- Template: ${request.templateAlias}
|
|
59
|
-
-
|
|
73
|
+
- Template Alias: ${request.templateAlias}
|
|
74
|
+
- JSON Path: ${request.jpath}
|
|
60
75
|
|
|
61
|
-
|
|
76
|
+
### Available Data
|
|
62
77
|
\`\`\`json
|
|
63
78
|
${JSON.stringify(request.pageContent, null, 2)}
|
|
64
79
|
\`\`\`
|
|
65
80
|
|
|
66
|
-
|
|
81
|
+
---
|
|
67
82
|
|
|
68
|
-
|
|
69
|
-
\u274C Wrong: {{pageTitle}}
|
|
70
|
-
\u2705 Correct: {{pageTittle}} or {{pageDescription}} (match actual JSON keys)
|
|
83
|
+
## \u{1F9E9} Core Principles (Key\u2011Agnostic)
|
|
71
84
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
- Blockquote: Value proposition
|
|
75
|
-
- H2: Key capabilities/benefits
|
|
76
|
-
- H3: Technical details/specs
|
|
77
|
-
- Lists: Features, integrations, use cases
|
|
85
|
+
1. **Do not assume fixed property names.**
|
|
86
|
+
Infer content type and importance dynamically from value structure, length, and position.
|
|
78
87
|
|
|
79
|
-
|
|
80
|
-
-
|
|
81
|
-
-
|
|
82
|
-
-
|
|
83
|
-
- API documentation \u2192 Implementation clarity
|
|
84
|
-
- Contact info \u2192 Business value context
|
|
88
|
+
2. **Purpose\u2011Driven Hierarchy (LLMS.txt\u2011Friendly):**
|
|
89
|
+
- Start with the main concept (\`#\` heading)
|
|
90
|
+
- Follow with a one-paragraph value statement or summary (\`>\` blockquote)
|
|
91
|
+
- Then expand into structured content with \`##\`, \`###\`, and lists
|
|
85
92
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
- Map complex nested arrays to structured lists
|
|
90
|
-
- Extract key differentiators and benefits
|
|
93
|
+
3. **Semantic Markdown Only**
|
|
94
|
+
Use only Markdown syntax (\`#\`, \`>\`, \`-\`, \`[link](url)\`, \`![alt](url)\`).
|
|
95
|
+
**No HTML**, **no entities**, **no inline styles**, **no attributes**.
|
|
91
96
|
|
|
92
|
-
**
|
|
97
|
+
4. **Dynamic Interpretation**
|
|
98
|
+
- Short textual fields near the root likely represent titles.
|
|
99
|
+
- Long text blocks indicate overviews, descriptions, or stories.
|
|
100
|
+
- Arrays of primitives \u2192 bullet lists.
|
|
101
|
+
- Arrays of objects \u2192 repeated sub-sections or tables.
|
|
102
|
+
- Objects \u2192 nested sections with humanized headings.
|
|
93
103
|
|
|
94
|
-
**
|
|
95
|
-
|
|
96
|
-
|
|
104
|
+
5. **Exact Property Bindings**
|
|
105
|
+
When referencing values, always use the **exact property name** from JSON (\`{{keyName}}\`).
|
|
106
|
+
Do not rename or modify binding identifiers.
|
|
97
107
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
108
|
+
6. **Humanized Section Titles**
|
|
109
|
+
Convert keys into readable Markdown headings:
|
|
110
|
+
e.g., \`productDetails\` \u2192 \u201CProduct Details\u201D, \`seo_meta\` \u2192 \u201CSEO Meta\u201D.
|
|
111
|
+
These headings are for readability; the Mustache bindings remain exact.
|
|
101
112
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
{{serviceDescription}}
|
|
105
|
-
{{/serviceDescription}}
|
|
113
|
+
7. **Omit Noise**
|
|
114
|
+
Exclude non-content fields like IDs, timestamps, internal flags, etc.
|
|
106
115
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
{{
|
|
110
|
-
- {{textItem}}
|
|
111
|
-
{{/serviceTools}}
|
|
112
|
-
{{/serviceTools.0}}
|
|
116
|
+
8. **URL Handling**
|
|
117
|
+
- If a value looks like a URL, render \`[Label]({{key}})\`.
|
|
118
|
+
- If image URL, render \`![Label]({{key}})\`.
|
|
113
119
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
{{/serviceLink}}
|
|
118
|
-
\`\`\`
|
|
120
|
+
9. **Recursion & Nesting**
|
|
121
|
+
Apply the same logic recursively to all nested objects and arrays.
|
|
122
|
+
Heading depth corresponds to the nesting level, but avoid exceeding four \`#\` levels.
|
|
119
123
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## \u{1F9E0} LLMS.txt Relevance
|
|
127
|
+
|
|
128
|
+
The resulting \`.md\` output should be suitable for inclusion in or referencing from \`llms.txt\`.
|
|
129
|
+
Each file acts as an **LLM-ingestible mirror** of the website content \u2014 optimized for understanding by AI systems.
|
|
130
|
+
Focus on **semantic clarity**, **hierarchical consistency**, and **contextual richness**.
|
|
131
|
+
This ensures that LLMs reading the \`.md\` files will correctly infer what each page is about.
|
|
123
132
|
|
|
124
|
-
|
|
125
|
-
> {{cardText}}
|
|
126
|
-
{{/cardText}}
|
|
133
|
+
---
|
|
127
134
|
|
|
128
|
-
{
|
|
129
|
-
## How It Works
|
|
130
|
-
{{featureDescription}}
|
|
131
|
-
{{/featureDescription}}
|
|
135
|
+
## \u{1F9F1} Example Structural Patterns
|
|
132
136
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
{{
|
|
136
|
-
|
|
137
|
-
{{
|
|
137
|
+
### Generic Page Template
|
|
138
|
+
\`\`\`mustache
|
|
139
|
+
# {{mainTitle}}
|
|
140
|
+
|
|
141
|
+
{{#summary}}
|
|
142
|
+
> {{summary}}
|
|
143
|
+
{{/summary}}
|
|
138
144
|
|
|
139
|
-
{{
|
|
140
|
-
{{
|
|
145
|
+
{{#sections.0}}
|
|
146
|
+
## {{sectionTitle}}
|
|
147
|
+
{{#sections}}
|
|
148
|
+
### {{itemTitle}}
|
|
149
|
+
{{itemDescription}}
|
|
150
|
+
{{/sections}}
|
|
151
|
+
{{/sections.0}}
|
|
141
152
|
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
-
|
|
153
|
+
{{#links.0}}
|
|
154
|
+
## Links
|
|
155
|
+
{{#links}}
|
|
156
|
+
- [{{label}}]({{url}})
|
|
157
|
+
{{/links}}
|
|
158
|
+
{{/links.0}}
|
|
146
159
|
\`\`\`
|
|
147
160
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
5. **Maintain SEO-friendly** heading structure
|
|
161
|
+
> Example only \u2014 the actual template must reflect the JSON shape dynamically.
|
|
162
|
+
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
## \u2705 Output Requirements
|
|
154
166
|
|
|
155
|
-
**
|
|
156
|
-
-
|
|
157
|
-
-
|
|
158
|
-
-
|
|
159
|
-
-
|
|
160
|
-
- Follow LLMS.txt hierarchical structure
|
|
161
|
-
- **CLEAN MARKDOWN ONLY**: No HTML tags, entities, or attributes
|
|
162
|
-
- **NO HTML**: Use pure Markdown syntax (##, **, -, etc.)
|
|
163
|
-
- **NO ENTITIES**: Use actual characters, not & or /
|
|
164
|
-
- **NO ATTRIBUTES**: No dir="RTL", style="", class="" etc.
|
|
167
|
+
- Output **only** the Mustache template (no extra text, no code fences).
|
|
168
|
+
- Use **exact JSON property names** in bindings.
|
|
169
|
+
- Render **clean, human-readable Markdown** suitable for \`llms.txt\`.
|
|
170
|
+
- Maintain logical hierarchy derived from JSON structure.
|
|
171
|
+
- Avoid domain-specific or brand-specific logic.
|
|
165
172
|
|
|
166
|
-
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## \u{1F680} Task
|
|
176
|
+
|
|
177
|
+
Analyze the provided JSON and **generate the Mustache template** that will produce the \`.md\` page following these rules.
|
|
178
|
+
`;
|
|
167
179
|
}
|
|
168
180
|
parseResponse(responseText) {
|
|
169
181
|
const codeBlockRegex = /```(?:mustache)?\n?([\s\S]*?)```/;
|
|
@@ -1128,13 +1140,10 @@ class TemplateGenerator {
|
|
|
1128
1140
|
promptAnalyzer;
|
|
1129
1141
|
cache;
|
|
1130
1142
|
config;
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
void 0,
|
|
1134
|
-
void 0
|
|
1135
|
-
);
|
|
1136
|
-
constructor(config) {
|
|
1143
|
+
umbracoData;
|
|
1144
|
+
constructor(config, umbracoData) {
|
|
1137
1145
|
this.config = config;
|
|
1146
|
+
this.umbracoData = umbracoData;
|
|
1138
1147
|
this.anthropicClient = new AnthropicClient(config);
|
|
1139
1148
|
this.promptAnalyzer = new PromptAnalyzer();
|
|
1140
1149
|
this.cache = new LLMSCache(config.templatesDir || "./.llms-templates");
|
|
@@ -1153,22 +1162,26 @@ class TemplateGenerator {
|
|
|
1153
1162
|
}
|
|
1154
1163
|
return await this.generateTemplateWithAI(pageContent, urlItem);
|
|
1155
1164
|
}
|
|
1156
|
-
async generateAllTemplates(umbracoData) {
|
|
1165
|
+
async generateAllTemplates() {
|
|
1157
1166
|
const templates = [];
|
|
1158
1167
|
const maxConcurrent = this.config.maxConcurrent || 5;
|
|
1159
|
-
await performAutomaticCleanup(
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1168
|
+
await performAutomaticCleanup(
|
|
1169
|
+
this.umbracoData,
|
|
1170
|
+
this.config.templatesDir || "./.llms-templates",
|
|
1171
|
+
{
|
|
1172
|
+
enableAutoCleanup: this.config.enableAutoCleanup ?? true,
|
|
1173
|
+
cleanupOrphaned: this.config.cleanupOrphaned ?? true,
|
|
1174
|
+
cleanupHidden: this.config.cleanupHidden ?? true,
|
|
1175
|
+
dryRun: false
|
|
1176
|
+
}
|
|
1177
|
+
);
|
|
1178
|
+
const visibilityStats = getPageVisibilityStats(this.umbracoData);
|
|
1166
1179
|
console.log("\u{1F4CA} Page visibility stats:", visibilityStats);
|
|
1167
|
-
const visiblePages = umbracoData.urlList.filter(
|
|
1168
|
-
(urlItem) => shouldGenerateTemplate(umbracoData, urlItem)
|
|
1180
|
+
const visiblePages = this.umbracoData.urlList.filter(
|
|
1181
|
+
(urlItem) => shouldGenerateTemplate(this.umbracoData, urlItem)
|
|
1169
1182
|
);
|
|
1170
|
-
console.log(`Checking ${visiblePages.length}/${umbracoData.urlList.length} visible pages for cache status...`);
|
|
1171
|
-
const { cached, needGeneration } = this.identifyTemplatesNeeded(
|
|
1183
|
+
console.log(`Checking ${visiblePages.length}/${this.umbracoData.urlList.length} visible pages for cache status...`);
|
|
1184
|
+
const { cached, needGeneration } = this.identifyTemplatesNeeded(visiblePages);
|
|
1172
1185
|
console.log(`\u{1F4C8} Template status: ${cached.length} cached, ${needGeneration.length} need generation`);
|
|
1173
1186
|
templates.push(...cached);
|
|
1174
1187
|
if (needGeneration.length === 0) {
|
|
@@ -1197,11 +1210,11 @@ class TemplateGenerator {
|
|
|
1197
1210
|
console.log(`Generated ${templates.length} total templates (${cached.length} from cache, ${templates.length - cached.length} newly generated)`);
|
|
1198
1211
|
return templates;
|
|
1199
1212
|
}
|
|
1200
|
-
identifyTemplatesNeeded(
|
|
1213
|
+
identifyTemplatesNeeded(visiblePages) {
|
|
1201
1214
|
const cached = [];
|
|
1202
1215
|
const needGeneration = [];
|
|
1203
1216
|
for (const urlItem of visiblePages) {
|
|
1204
|
-
const pageContent = extractPageContent(umbracoData, urlItem.Jpath);
|
|
1217
|
+
const pageContent = extractPageContent(this.umbracoData, urlItem.Jpath);
|
|
1205
1218
|
if (!pageContent) {
|
|
1206
1219
|
console.warn(`No content found for ${urlItem.url} (${urlItem.Jpath})`);
|
|
1207
1220
|
continue;
|
|
@@ -1267,11 +1280,7 @@ class TemplateGenerator {
|
|
|
1267
1280
|
async renderTemplate(template, data) {
|
|
1268
1281
|
return withErrorHandling(async () => {
|
|
1269
1282
|
const validatedTemplate = await templateValidationPipeline.validateAndFix(template);
|
|
1270
|
-
|
|
1271
|
-
if (this.config.enableHtmlToMarkdown) {
|
|
1272
|
-
return this.nhm.translate(renderedContent);
|
|
1273
|
-
}
|
|
1274
|
-
return renderedContent;
|
|
1283
|
+
return Mustache.render(validatedTemplate, data);
|
|
1275
1284
|
}, {
|
|
1276
1285
|
template: template.substring(0, 200) + "...",
|
|
1277
1286
|
dataKeys: Object.keys(data)
|
|
@@ -1325,21 +1334,22 @@ class TemplateGenerator {
|
|
|
1325
1334
|
class LLMSFilesGenerator {
|
|
1326
1335
|
config;
|
|
1327
1336
|
templateGenerator;
|
|
1328
|
-
|
|
1329
|
-
constructor(config) {
|
|
1337
|
+
umbracoData;
|
|
1338
|
+
constructor(config, umbracoData) {
|
|
1330
1339
|
this.config = config;
|
|
1331
|
-
this.
|
|
1340
|
+
this.umbracoData = umbracoData;
|
|
1341
|
+
this.templateGenerator = new TemplateGenerator(config, umbracoData);
|
|
1332
1342
|
}
|
|
1333
|
-
async generateAllFiles(
|
|
1343
|
+
async generateAllFiles() {
|
|
1334
1344
|
const startTime = Date.now();
|
|
1335
1345
|
console.log("\u{1F680} Starting LLMS files generation...");
|
|
1336
|
-
const templates = await this.templateGenerator.generateAllTemplates(
|
|
1346
|
+
const templates = await this.templateGenerator.generateAllTemplates();
|
|
1337
1347
|
console.log("\u{1F4C4} Generating individual markdown files...");
|
|
1338
|
-
const individualMdFiles = this.config.enableIndividualMd ? await this.generateIndividualMarkdownFiles(
|
|
1348
|
+
const individualMdFiles = this.config.enableIndividualMd ? await this.generateIndividualMarkdownFiles(templates) : void 0;
|
|
1339
1349
|
console.log("\u{1F4DD} Generating llms.txt navigation file...");
|
|
1340
|
-
const llmsTxt = this.generateLLMSTxt(
|
|
1350
|
+
const llmsTxt = this.generateLLMSTxt(individualMdFiles || []);
|
|
1341
1351
|
console.log("\u{1F4DA} Generating llms-full.txt...");
|
|
1342
|
-
const llmsFullTxt = this.config.enableLLMSFullTxt ? this.generateLLMSFullTxt(
|
|
1352
|
+
const llmsFullTxt = this.config.enableLLMSFullTxt ? this.generateLLMSFullTxt(individualMdFiles || []) : void 0;
|
|
1343
1353
|
const files = {
|
|
1344
1354
|
llmsTxt,
|
|
1345
1355
|
llmsFullTxt,
|
|
@@ -1350,18 +1360,18 @@ class LLMSFilesGenerator {
|
|
|
1350
1360
|
console.log(`\u2705 LLMS files generation completed in ${duration}ms`);
|
|
1351
1361
|
return files;
|
|
1352
1362
|
}
|
|
1353
|
-
async generateIndividualMarkdownFiles(
|
|
1363
|
+
async generateIndividualMarkdownFiles(templates) {
|
|
1354
1364
|
const mdFiles = [];
|
|
1355
1365
|
for (const template of templates) {
|
|
1356
1366
|
try {
|
|
1357
|
-
const urlItem = umbracoData.urlList.find(
|
|
1367
|
+
const urlItem = this.umbracoData.urlList.find(
|
|
1358
1368
|
(item) => generatePageId(item) === template.pageId
|
|
1359
1369
|
);
|
|
1360
1370
|
if (!urlItem) {
|
|
1361
1371
|
console.warn(`URL item not found for template ${template.pageId}`);
|
|
1362
1372
|
continue;
|
|
1363
1373
|
}
|
|
1364
|
-
const pageContent = extractPageContent(umbracoData, urlItem.Jpath);
|
|
1374
|
+
const pageContent = extractPageContent(this.umbracoData, urlItem.Jpath);
|
|
1365
1375
|
if (!pageContent) {
|
|
1366
1376
|
console.warn(`Page content not found for ${urlItem.url}`);
|
|
1367
1377
|
continue;
|
|
@@ -1385,9 +1395,9 @@ class LLMSFilesGenerator {
|
|
|
1385
1395
|
}
|
|
1386
1396
|
return mdFiles;
|
|
1387
1397
|
}
|
|
1388
|
-
generateLLMSTxt(
|
|
1389
|
-
const siteTitle = this.extractSiteTitle(
|
|
1390
|
-
const siteDescription = this.extractSiteDescription(
|
|
1398
|
+
generateLLMSTxt(mdFiles) {
|
|
1399
|
+
const siteTitle = this.extractSiteTitle();
|
|
1400
|
+
const siteDescription = this.extractSiteDescription();
|
|
1391
1401
|
let content = `# ${siteTitle}
|
|
1392
1402
|
|
|
1393
1403
|
`;
|
|
@@ -1399,7 +1409,7 @@ class LLMSFilesGenerator {
|
|
|
1399
1409
|
content += `This website contains comprehensive information about ${siteTitle.toLowerCase()}. The content is organized into the following sections:
|
|
1400
1410
|
|
|
1401
1411
|
`;
|
|
1402
|
-
const pagesByCategory = this.groupPagesByCategory(
|
|
1412
|
+
const pagesByCategory = this.groupPagesByCategory(mdFiles);
|
|
1403
1413
|
for (const [category, pages] of Object.entries(pagesByCategory)) {
|
|
1404
1414
|
if (pages.length === 0)
|
|
1405
1415
|
continue;
|
|
@@ -1407,33 +1417,32 @@ class LLMSFilesGenerator {
|
|
|
1407
1417
|
|
|
1408
1418
|
`;
|
|
1409
1419
|
for (const page of pages) {
|
|
1410
|
-
const urlItem = umbracoData.urlList.find((item) => item.url === page.url);
|
|
1411
|
-
const pageTitle = this.extractPageTitle(
|
|
1420
|
+
const urlItem = this.umbracoData.urlList.find((item) => item.url === page.url);
|
|
1421
|
+
const pageTitle = this.extractPageTitle(urlItem);
|
|
1412
1422
|
const relativeFilePath = this.getLLMSFilePath(page.path);
|
|
1413
|
-
content += `- [${pageTitle}](${relativeFilePath}): ${this.generatePageDescription(
|
|
1423
|
+
content += `- [${pageTitle}](${relativeFilePath}): ${this.generatePageDescription(urlItem)}
|
|
1414
1424
|
`;
|
|
1415
1425
|
}
|
|
1416
1426
|
content += "\n";
|
|
1417
1427
|
}
|
|
1418
|
-
const visiblePages = getVisiblePages(umbracoData);
|
|
1419
|
-
const hiddenCount = umbracoData.urlList.length - visiblePages.length;
|
|
1428
|
+
const visiblePages = getVisiblePages(this.umbracoData);
|
|
1429
|
+
const hiddenCount = this.umbracoData.urlList.length - visiblePages.length;
|
|
1420
1430
|
if (hiddenCount > 0) {
|
|
1421
1431
|
content += `*Note: ${hiddenCount} pages are excluded from this documentation as they are marked as hidden.*
|
|
1422
1432
|
|
|
1423
1433
|
`;
|
|
1424
1434
|
}
|
|
1425
1435
|
content += "## Optional\n\n";
|
|
1426
|
-
content +=
|
|
1427
|
-
content += "- [Site Map](sitemap.xml): XML sitemap of all pages\n";
|
|
1436
|
+
content += `- [Complete Documentation](${this.makeUrl("/llms-full.txt")}): All content combined in a single file`;
|
|
1428
1437
|
const outputPath = join(this.getOutputDir(), "llms.txt");
|
|
1429
1438
|
return {
|
|
1430
1439
|
path: outputPath,
|
|
1431
1440
|
content: content.trim()
|
|
1432
1441
|
};
|
|
1433
1442
|
}
|
|
1434
|
-
generateLLMSFullTxt(
|
|
1435
|
-
const siteTitle = this.extractSiteTitle(
|
|
1436
|
-
const siteDescription = this.extractSiteDescription(
|
|
1443
|
+
generateLLMSFullTxt(mdFiles) {
|
|
1444
|
+
const siteTitle = this.extractSiteTitle();
|
|
1445
|
+
const siteDescription = this.extractSiteDescription();
|
|
1437
1446
|
let content = `# ${siteTitle} - Complete Documentation
|
|
1438
1447
|
|
|
1439
1448
|
`;
|
|
@@ -1444,15 +1453,10 @@ class LLMSFilesGenerator {
|
|
|
1444
1453
|
}
|
|
1445
1454
|
content += "---\n\n";
|
|
1446
1455
|
for (const mdFile of mdFiles) {
|
|
1447
|
-
const urlItem = umbracoData.urlList.find((item) => item.url === mdFile.url);
|
|
1456
|
+
const urlItem = this.umbracoData.urlList.find((item) => item.url === mdFile.url);
|
|
1448
1457
|
if (!urlItem)
|
|
1449
1458
|
continue;
|
|
1450
|
-
content += `## Page: ${mdFile.url}
|
|
1451
|
-
|
|
1452
|
-
`;
|
|
1453
|
-
content += `**Template**: ${urlItem.TemplateAlias}
|
|
1454
|
-
`;
|
|
1455
|
-
content += `**Node ID**: ${urlItem.nodeID}
|
|
1459
|
+
content += `## Page: ${this.makeUrl(mdFile.url)}
|
|
1456
1460
|
|
|
1457
1461
|
`;
|
|
1458
1462
|
content += mdFile.content;
|
|
@@ -1481,111 +1485,127 @@ class LLMSFilesGenerator {
|
|
|
1481
1485
|
console.log(`\u{1F4DD} Saved: ${files.individualMdFiles.length} markdown files to llms/ subdirectory`);
|
|
1482
1486
|
}
|
|
1483
1487
|
}
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1491
|
-
|
|
1492
|
-
|
|
1488
|
+
/**
|
|
1489
|
+
* Groups pages by their first-level URL segment.
|
|
1490
|
+
* Example:
|
|
1491
|
+
* /ai-marketplace -> category "ai-marketplace"
|
|
1492
|
+
* /ai-marketplace/asda -> category "ai-marketplace"
|
|
1493
|
+
* /marketplace -> category "marketplace"
|
|
1494
|
+
* / -> category "main"
|
|
1495
|
+
*/
|
|
1496
|
+
groupPagesByCategory(mdFiles) {
|
|
1497
|
+
const categories = {};
|
|
1493
1498
|
for (const mdFile of mdFiles) {
|
|
1494
|
-
const urlItem = umbracoData.urlList.find((item) => item.url === mdFile.url);
|
|
1499
|
+
const urlItem = this.umbracoData.urlList.find((item) => item.url === mdFile.url);
|
|
1495
1500
|
if (!urlItem)
|
|
1496
1501
|
continue;
|
|
1497
1502
|
const category = this.categorizeUrlItem(urlItem);
|
|
1498
|
-
if (!categories[category]) {
|
|
1503
|
+
if (!categories[category])
|
|
1499
1504
|
categories[category] = [];
|
|
1500
|
-
}
|
|
1501
1505
|
categories[category].push(mdFile);
|
|
1502
1506
|
}
|
|
1503
1507
|
return categories;
|
|
1504
1508
|
}
|
|
1509
|
+
/**
|
|
1510
|
+
* Determines a logical category name based on the URL structure.
|
|
1511
|
+
* Uses the first path segment as the category.
|
|
1512
|
+
*/
|
|
1505
1513
|
categorizeUrlItem(urlItem) {
|
|
1506
|
-
const
|
|
1507
|
-
|
|
1508
|
-
|
|
1509
|
-
|
|
1514
|
+
const url = urlItem.url.toLowerCase().trim();
|
|
1515
|
+
if (url === "/" || url === "")
|
|
1516
|
+
return "main";
|
|
1517
|
+
const cleaned = url.replace(/^https?:\/\/[^/]+/, "").replace(/\/+$/, "");
|
|
1518
|
+
const segments = cleaned.split("/").filter(Boolean);
|
|
1519
|
+
if (segments.length === 0)
|
|
1510
1520
|
return "main";
|
|
1511
|
-
|
|
1512
|
-
|
|
1513
|
-
if (
|
|
1514
|
-
return "
|
|
1515
|
-
if (
|
|
1516
|
-
return "
|
|
1517
|
-
return
|
|
1518
|
-
}
|
|
1519
|
-
|
|
1520
|
-
|
|
1521
|
+
const firstSegment = segments[0];
|
|
1522
|
+
const ignored = ["media", "assets", "static", "files", "uploads"];
|
|
1523
|
+
if (ignored.includes(firstSegment))
|
|
1524
|
+
return "other";
|
|
1525
|
+
if (firstSegment.length < 2 || /^\d+$/.test(firstSegment))
|
|
1526
|
+
return "other";
|
|
1527
|
+
return firstSegment;
|
|
1528
|
+
}
|
|
1529
|
+
/**
|
|
1530
|
+
* Returns a formatted, human-readable category name for llms.txt output.
|
|
1531
|
+
*/
|
|
1532
|
+
formatCategoryName(category) {
|
|
1533
|
+
if (category === "main")
|
|
1534
|
+
return "Main Pages";
|
|
1535
|
+
if (category === "other")
|
|
1536
|
+
return "Other Pages";
|
|
1537
|
+
return category.split("-").map((word) => word.length <= 3 ? word.toUpperCase() : word.charAt(0).toUpperCase() + word.slice(1)).join(" ");
|
|
1538
|
+
}
|
|
1539
|
+
extractSiteTitle() {
|
|
1540
|
+
const siteData = this.umbracoData.SiteData;
|
|
1521
1541
|
const rawTitle = siteData?.pageTitle || siteData?.mainHeaderBlockTitle || "Website Documentation";
|
|
1522
|
-
return
|
|
1542
|
+
return rawTitle;
|
|
1523
1543
|
}
|
|
1524
|
-
extractSiteDescription(umbracoData) {
|
|
1525
|
-
const siteData = umbracoData.SiteData;
|
|
1544
|
+
extractSiteDescription() {
|
|
1545
|
+
const siteData = this.umbracoData.SiteData;
|
|
1526
1546
|
const rawDescription = siteData?.pageDescription || siteData?.ogDescription || null;
|
|
1527
|
-
return rawDescription ?
|
|
1528
|
-
}
|
|
1529
|
-
cleanHtmlContent(content) {
|
|
1530
|
-
if (!this.config.enableHtmlToMarkdown) {
|
|
1531
|
-
return content;
|
|
1532
|
-
}
|
|
1533
|
-
if (/<[^>]+>/.test(content)) {
|
|
1534
|
-
return this.nhm.translate(content).trim();
|
|
1535
|
-
}
|
|
1536
|
-
return content;
|
|
1547
|
+
return rawDescription ? rawDescription : null;
|
|
1537
1548
|
}
|
|
1538
|
-
extractPageTitle(
|
|
1549
|
+
extractPageTitle(urlItem) {
|
|
1539
1550
|
if (!urlItem)
|
|
1540
1551
|
return "Untitled Page";
|
|
1541
|
-
const pageContent = extractPageContent(umbracoData, urlItem.Jpath);
|
|
1552
|
+
const pageContent = extractPageContent(this.umbracoData, urlItem.Jpath);
|
|
1542
1553
|
if (!pageContent)
|
|
1543
1554
|
return urlItem.TemplateAlias;
|
|
1544
|
-
return
|
|
1555
|
+
return String(
|
|
1556
|
+
pageContent.pageTitle || pageContent.title || pageContent.pageTittle || urlItem.TemplateAlias
|
|
1557
|
+
);
|
|
1545
1558
|
}
|
|
1546
|
-
generatePageDescription(
|
|
1559
|
+
generatePageDescription(urlItem) {
|
|
1547
1560
|
if (!urlItem)
|
|
1548
1561
|
return "Page information";
|
|
1549
|
-
const pageContent = extractPageContent(umbracoData, urlItem.Jpath);
|
|
1562
|
+
const pageContent = extractPageContent(this.umbracoData, urlItem.Jpath);
|
|
1550
1563
|
if (!pageContent)
|
|
1551
1564
|
return `${urlItem.TemplateAlias} page`;
|
|
1552
1565
|
const desc = pageContent.pageDescription || pageContent.description || pageContent.headerBlockSubtitle;
|
|
1553
1566
|
if (desc && typeof desc === "string") {
|
|
1554
|
-
return desc
|
|
1567
|
+
return desc;
|
|
1555
1568
|
}
|
|
1556
1569
|
return `Information about ${urlItem.url}`;
|
|
1557
1570
|
}
|
|
1558
|
-
formatCategoryName(category) {
|
|
1559
|
-
const names = {
|
|
1560
|
-
main: "Main Pages",
|
|
1561
|
-
blog: "Blog & Articles",
|
|
1562
|
-
services: "Services & Products",
|
|
1563
|
-
info: "Information Pages",
|
|
1564
|
-
other: "Other Pages"
|
|
1565
|
-
};
|
|
1566
|
-
return names[category] || category.charAt(0).toUpperCase() + category.slice(1);
|
|
1567
|
-
}
|
|
1568
1571
|
sanitizeUrlForFilename(url) {
|
|
1569
|
-
|
|
1572
|
+
if (url === "/") {
|
|
1573
|
+
return "index";
|
|
1574
|
+
}
|
|
1575
|
+
let filename = url.replace(/^\//, "").replace(/\/$/, "").replace(/\//g, "-").replace(/--+/g, "-").replace(/^-+|-+$/g, "");
|
|
1570
1576
|
if (!filename || filename === "") {
|
|
1571
|
-
filename =
|
|
1577
|
+
filename = `index_${url.length}_${Date.now()}`;
|
|
1572
1578
|
}
|
|
1573
1579
|
if (filename.startsWith("-") || filename.startsWith(".")) {
|
|
1574
1580
|
filename = "page-" + filename.replace(/^[-.]/, "");
|
|
1575
1581
|
}
|
|
1576
1582
|
return filename;
|
|
1577
1583
|
}
|
|
1578
|
-
getRelativeFilePath(fullPath) {
|
|
1579
|
-
const filename = fullPath.split("/").pop() || "";
|
|
1580
|
-
return filename;
|
|
1581
|
-
}
|
|
1582
1584
|
getLLMSFilePath(fullPath) {
|
|
1583
1585
|
const filename = basename(fullPath);
|
|
1584
|
-
return `/llms/${filename}`;
|
|
1586
|
+
return this.makeUrl(`/llms/${filename}`);
|
|
1585
1587
|
}
|
|
1586
1588
|
getOutputDir() {
|
|
1587
1589
|
return this.config.finalOutputDir || "dist";
|
|
1588
1590
|
}
|
|
1591
|
+
getBaseSiteUrl() {
|
|
1592
|
+
if (this.config.baseSiteUrl) {
|
|
1593
|
+
return this.config.baseSiteUrl;
|
|
1594
|
+
} else if (this.config.baseSiteUrlUmbracoDataKey) {
|
|
1595
|
+
return this.config.baseSiteUrlUmbracoDataKey in this.umbracoData.SiteData ? String(this.umbracoData.SiteData[this.config.baseSiteUrlUmbracoDataKey]) : "";
|
|
1596
|
+
}
|
|
1597
|
+
return "";
|
|
1598
|
+
}
|
|
1599
|
+
makeUrl(path) {
|
|
1600
|
+
const base = this.getBaseSiteUrl();
|
|
1601
|
+
try {
|
|
1602
|
+
return new URL(path, base).toString();
|
|
1603
|
+
} catch {
|
|
1604
|
+
const baseClean = base?.replace(/\/+$/, "") || "";
|
|
1605
|
+
const pathClean = path?.replace(/^\/+/, "") || "";
|
|
1606
|
+
return baseClean && pathClean ? `${baseClean}/${pathClean}` : baseClean || pathClean;
|
|
1607
|
+
}
|
|
1608
|
+
}
|
|
1589
1609
|
}
|
|
1590
1610
|
|
|
1591
1611
|
export { LLMSFilesGenerator };
|
package/dist/module.d.mts
CHANGED
|
@@ -1,19 +1,33 @@
|
|
|
1
1
|
import * as _nuxt_schema from '@nuxt/schema';
|
|
2
|
+
import { z } from 'zod';
|
|
2
3
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
4
|
+
/**
|
|
5
|
+
* Zod validation schemas for LLMS generator
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
declare const LLMSConfigSchema: z.ZodObject<{
|
|
9
|
+
anthropicApiKey: z.ZodString;
|
|
10
|
+
umbracoDataPath: z.ZodString;
|
|
11
|
+
templatesDir: z.ZodString;
|
|
12
|
+
finalOutputDir: z.ZodDefault<z.ZodOptional<z.ZodString>>;
|
|
13
|
+
anthropicModel: z.ZodOptional<z.ZodString>;
|
|
14
|
+
baseSiteUrl: z.ZodOptional<z.ZodString>;
|
|
15
|
+
baseSiteUrlUmbracoDataKey: z.ZodOptional<z.ZodString>;
|
|
16
|
+
maxConcurrent: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
|
|
17
|
+
enableLLMSFullTxt: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
|
18
|
+
enableIndividualMd: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
|
19
|
+
enableAutoCleanup: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
|
20
|
+
cleanupOrphaned: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
|
21
|
+
cleanupHidden: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
|
22
|
+
enableHtmlToMarkdown: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
|
23
|
+
}, z.core.$strict>;
|
|
24
|
+
type ValidatedLLMSConfig = z.infer<typeof LLMSConfigSchema>;
|
|
25
|
+
|
|
26
|
+
interface LLMSModuleOptions extends Partial<ValidatedLLMSConfig> {
|
|
27
|
+
enabled?: boolean;
|
|
16
28
|
}
|
|
29
|
+
declare const _default: _nuxt_schema.NuxtModule<LLMSModuleOptions, LLMSModuleOptions, false>;
|
|
30
|
+
|
|
17
31
|
interface UmbracoUrlItem {
|
|
18
32
|
nodeID: number;
|
|
19
33
|
url: string;
|
|
@@ -98,11 +112,6 @@ interface AnthropicGenerationResponse {
|
|
|
98
112
|
tags?: string[];
|
|
99
113
|
};
|
|
100
114
|
}
|
|
101
|
-
interface LLMSGeneratorOptions {
|
|
102
|
-
config: LLMSConfig;
|
|
103
|
-
umbracoData: UmbracoData;
|
|
104
|
-
templateCache?: TemplateCache;
|
|
105
|
-
}
|
|
106
115
|
interface PageStructureInfo {
|
|
107
116
|
keys: string[];
|
|
108
117
|
excludedKeys: string[];
|
|
@@ -114,21 +123,6 @@ interface HashGenerationOptions {
|
|
|
114
123
|
excludeKeys?: string[];
|
|
115
124
|
includeOnlyKeys?: string[];
|
|
116
125
|
}
|
|
117
|
-
interface GenerationStats {
|
|
118
|
-
totalPages: number;
|
|
119
|
-
templatesGenerated: number;
|
|
120
|
-
templatesFromCache: number;
|
|
121
|
-
mdFilesGenerated: number;
|
|
122
|
-
llmsTxtGenerated: true;
|
|
123
|
-
llmsFullTxtGenerated: boolean;
|
|
124
|
-
duration: number;
|
|
125
|
-
apiCallsUsed: number;
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
interface LLMSModuleOptions extends Partial<LLMSConfig> {
|
|
129
|
-
enabled?: boolean;
|
|
130
|
-
}
|
|
131
|
-
declare const _default: _nuxt_schema.NuxtModule<LLMSModuleOptions, LLMSModuleOptions, false>;
|
|
132
126
|
|
|
133
127
|
export { _default as default };
|
|
134
|
-
export type { AnthropicGenerationRequest, AnthropicGenerationResponse, GeneratedTemplate,
|
|
128
|
+
export type { AnthropicGenerationRequest, AnthropicGenerationResponse, GeneratedTemplate, HashGenerationOptions, LLMSFiles, PageContentHash, PageStructureInfo, TemplateCache, UmbracoData, UmbracoPageContent, UmbracoSiteData, UmbracoUrlItem };
|
package/dist/module.d.ts
CHANGED
|
@@ -1,19 +1,33 @@
|
|
|
1
1
|
import * as _nuxt_schema from '@nuxt/schema';
|
|
2
|
+
import { z } from 'zod';
|
|
2
3
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
4
|
+
/**
|
|
5
|
+
* Zod validation schemas for LLMS generator
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
declare const LLMSConfigSchema: z.ZodObject<{
|
|
9
|
+
anthropicApiKey: z.ZodString;
|
|
10
|
+
umbracoDataPath: z.ZodString;
|
|
11
|
+
templatesDir: z.ZodString;
|
|
12
|
+
finalOutputDir: z.ZodDefault<z.ZodOptional<z.ZodString>>;
|
|
13
|
+
anthropicModel: z.ZodOptional<z.ZodString>;
|
|
14
|
+
baseSiteUrl: z.ZodOptional<z.ZodString>;
|
|
15
|
+
baseSiteUrlUmbracoDataKey: z.ZodOptional<z.ZodString>;
|
|
16
|
+
maxConcurrent: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
|
|
17
|
+
enableLLMSFullTxt: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
|
18
|
+
enableIndividualMd: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
|
19
|
+
enableAutoCleanup: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
|
20
|
+
cleanupOrphaned: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
|
21
|
+
cleanupHidden: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
|
22
|
+
enableHtmlToMarkdown: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
|
23
|
+
}, z.core.$strict>;
|
|
24
|
+
type ValidatedLLMSConfig = z.infer<typeof LLMSConfigSchema>;
|
|
25
|
+
|
|
26
|
+
interface LLMSModuleOptions extends Partial<ValidatedLLMSConfig> {
|
|
27
|
+
enabled?: boolean;
|
|
16
28
|
}
|
|
29
|
+
declare const _default: _nuxt_schema.NuxtModule<LLMSModuleOptions, LLMSModuleOptions, false>;
|
|
30
|
+
|
|
17
31
|
interface UmbracoUrlItem {
|
|
18
32
|
nodeID: number;
|
|
19
33
|
url: string;
|
|
@@ -98,11 +112,6 @@ interface AnthropicGenerationResponse {
|
|
|
98
112
|
tags?: string[];
|
|
99
113
|
};
|
|
100
114
|
}
|
|
101
|
-
interface LLMSGeneratorOptions {
|
|
102
|
-
config: LLMSConfig;
|
|
103
|
-
umbracoData: UmbracoData;
|
|
104
|
-
templateCache?: TemplateCache;
|
|
105
|
-
}
|
|
106
115
|
interface PageStructureInfo {
|
|
107
116
|
keys: string[];
|
|
108
117
|
excludedKeys: string[];
|
|
@@ -114,21 +123,6 @@ interface HashGenerationOptions {
|
|
|
114
123
|
excludeKeys?: string[];
|
|
115
124
|
includeOnlyKeys?: string[];
|
|
116
125
|
}
|
|
117
|
-
interface GenerationStats {
|
|
118
|
-
totalPages: number;
|
|
119
|
-
templatesGenerated: number;
|
|
120
|
-
templatesFromCache: number;
|
|
121
|
-
mdFilesGenerated: number;
|
|
122
|
-
llmsTxtGenerated: true;
|
|
123
|
-
llmsFullTxtGenerated: boolean;
|
|
124
|
-
duration: number;
|
|
125
|
-
apiCallsUsed: number;
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
interface LLMSModuleOptions extends Partial<LLMSConfig> {
|
|
129
|
-
enabled?: boolean;
|
|
130
|
-
}
|
|
131
|
-
declare const _default: _nuxt_schema.NuxtModule<LLMSModuleOptions, LLMSModuleOptions, false>;
|
|
132
126
|
|
|
133
127
|
export { _default as default };
|
|
134
|
-
export type { AnthropicGenerationRequest, AnthropicGenerationResponse, GeneratedTemplate,
|
|
128
|
+
export type { AnthropicGenerationRequest, AnthropicGenerationResponse, GeneratedTemplate, HashGenerationOptions, LLMSFiles, PageContentHash, PageStructureInfo, TemplateCache, UmbracoData, UmbracoPageContent, UmbracoSiteData, UmbracoUrlItem };
|
package/dist/module.json
CHANGED
package/dist/module.mjs
CHANGED
|
@@ -2,6 +2,7 @@ import { defineNuxtModule, useLogger, addTemplate } from '@nuxt/kit';
|
|
|
2
2
|
import { existsSync, readFileSync } from 'fs';
|
|
3
3
|
import { resolve } from 'path';
|
|
4
4
|
import { z } from 'zod';
|
|
5
|
+
import { NodeHtmlMarkdown } from 'node-html-markdown';
|
|
5
6
|
|
|
6
7
|
const existingPath = z.string().refine(
|
|
7
8
|
(path) => existsSync(path)
|
|
@@ -13,6 +14,29 @@ const LLMSConfigSchema = z.object({
|
|
|
13
14
|
templatesDir: z.string().min(1, "Templates directory is required"),
|
|
14
15
|
finalOutputDir: z.string().optional().default("public"),
|
|
15
16
|
anthropicModel: z.string().optional(),
|
|
17
|
+
baseSiteUrl: z.string().optional().refine((url) => {
|
|
18
|
+
try {
|
|
19
|
+
if (!url) {
|
|
20
|
+
return true;
|
|
21
|
+
}
|
|
22
|
+
const parsed = new URL(url);
|
|
23
|
+
if (!["http:", "https:"].includes(parsed.protocol))
|
|
24
|
+
return false;
|
|
25
|
+
if (parsed.pathname !== "" && parsed.pathname !== "/")
|
|
26
|
+
return false;
|
|
27
|
+
if (parsed.pathname === "/") {
|
|
28
|
+
if (url.endsWith("/"))
|
|
29
|
+
return false;
|
|
30
|
+
}
|
|
31
|
+
return !(parsed.search || parsed.hash);
|
|
32
|
+
} catch {
|
|
33
|
+
return false;
|
|
34
|
+
}
|
|
35
|
+
}, "Must be a base domain URL like 'https://example.com' (no path, no trailing slash)").refine(
|
|
36
|
+
(url) => !url || !url.endsWith("/"),
|
|
37
|
+
"Must not end with a trailing slash"
|
|
38
|
+
).describe("The base URL of the website to append to links in generated llms files"),
|
|
39
|
+
baseSiteUrlUmbracoDataKey: z.string().optional().describe("If the SiteData of UmbracoData has the key with the base URL you can pass here the key to auto extract the base url"),
|
|
16
40
|
maxConcurrent: z.number().int().min(1, "maxConcurrent must be at least 1").max(10, "maxConcurrent should not exceed 10 to avoid rate limits").optional().default(3),
|
|
17
41
|
enableLLMSFullTxt: z.boolean().optional().default(true),
|
|
18
42
|
enableIndividualMd: z.boolean().optional().default(true),
|
|
@@ -20,19 +44,17 @@ const LLMSConfigSchema = z.object({
|
|
|
20
44
|
cleanupOrphaned: z.boolean().optional().default(true),
|
|
21
45
|
cleanupHidden: z.boolean().optional().default(true),
|
|
22
46
|
enableHtmlToMarkdown: z.boolean().optional().default(true)
|
|
23
|
-
}).strict();
|
|
47
|
+
}).refine(
|
|
48
|
+
(data) => data.baseSiteUrl || data.baseSiteUrlUmbracoDataKey,
|
|
49
|
+
{
|
|
50
|
+
message: 'At least one of "baseSiteUrl" or "baseSiteUrlUmbracoDataKey" must be provided.',
|
|
51
|
+
path: ["baseSiteUrl"]
|
|
52
|
+
// or omit 'path' to make it a general error
|
|
53
|
+
}
|
|
54
|
+
).strict();
|
|
24
55
|
class SchemaValidator {
|
|
25
56
|
static validateConfig(config) {
|
|
26
|
-
try {
|
27
|
-
return LLMSConfigSchema.parse(config);
|
|
28
|
-
} catch (error) {
|
|
29
|
-
if (error instanceof z.ZodError) {
|
|
30
|
-
const { errors } = z.treeifyError(error);
|
|
31
|
-
const message = ["Configuration validation failed:", ...errors].join("\n");
|
|
32
|
-
throw new Error(message);
|
|
33
|
-
}
|
|
34
|
-
throw error;
|
|
35
|
-
}
|
|
57
|
+
return LLMSConfigSchema.parse(config);
|
|
36
58
|
}
|
|
37
59
|
}
|
|
38
60
|
|
|
@@ -201,6 +223,33 @@ async function withErrorHandling(operation, context) {
|
|
|
201
223
|
}
|
|
202
224
|
}
|
|
203
225
|
|
|
226
|
+
const nhm = new NodeHtmlMarkdown();
|
|
227
|
+
/**
 * Recursively walk a JSON-like value and convert every string that looks
 * like HTML into Markdown using the module-level `nhm` converter.
 *
 * - `null` / `undefined` and non-string primitives are returned untouched.
 * - Strings are converted only when they contain something tag-like;
 *   the converted text is trimmed.
 * - Arrays and objects are rebuilt (the input is never mutated) with each
 *   element / property value processed recursively.
 *
 * @param {*} input - Value to process (typically the result of JSON.parse).
 * @returns {*} A new value of the same shape with HTML strings converted.
 */
function convertHtmlToMarkdownDeep(input) {
  if (input === null || input === undefined) {
    return input;
  }
  if (typeof input === "string") {
    // Cheap heuristic: only strings containing a tag-like "<x...>" sequence
    // are handed to the converter.
    if (!/<[a-z][\s\S]*>/i.test(input)) {
      return input;
    }
    try {
      return nhm.translate(input).trim();
    } catch {
      // Best effort: a string the converter cannot handle is kept verbatim.
      return input;
    }
  }
  if (Array.isArray(input)) {
    return input.map((item) => convertHtmlToMarkdownDeep(item));
  }
  if (typeof input === "object") {
    // Object.fromEntries defines own properties, so a key literally named
    // "__proto__" (which JSON.parse can produce) is preserved as data
    // instead of replacing the prototype of the result object.
    return Object.fromEntries(
      Object.entries(input).map(([key, value]) => [key, convertHtmlToMarkdownDeep(value)])
    );
  }
  return input;
}
|
|
252
|
+
|
|
204
253
|
const DEFAULT_OPTIONS = {
|
|
205
254
|
anthropicModel: "claude-3-7-sonnet-latest",
|
|
206
255
|
maxConcurrent: 5,
|
|
@@ -255,7 +304,9 @@ const llmsModule = defineNuxtModule({
|
|
|
255
304
|
enableAutoCleanup: options.enableAutoCleanup ?? DEFAULT_OPTIONS.enableAutoCleanup,
|
|
256
305
|
cleanupOrphaned: options.cleanupOrphaned ?? DEFAULT_OPTIONS.cleanupOrphaned,
|
|
257
306
|
cleanupHidden: options.cleanupHidden ?? DEFAULT_OPTIONS.cleanupHidden,
|
|
258
|
-
enableHtmlToMarkdown: options.enableHtmlToMarkdown ?? DEFAULT_OPTIONS.enableHtmlToMarkdown
|
|
307
|
+
enableHtmlToMarkdown: options.enableHtmlToMarkdown ?? DEFAULT_OPTIONS.enableHtmlToMarkdown,
|
|
308
|
+
baseSiteUrl: options.baseSiteUrl,
|
|
309
|
+
baseSiteUrlUmbracoDataKey: options.baseSiteUrlUmbracoDataKey
|
|
259
310
|
};
|
|
260
311
|
let moduleOptions;
|
|
261
312
|
try {
|
|
@@ -273,7 +324,10 @@ const llmsModule = defineNuxtModule({
|
|
|
273
324
|
}
|
|
274
325
|
try {
|
|
275
326
|
const umbracoDataContent = readFileSync(moduleOptions.umbracoDataPath, "utf-8");
|
|
276
|
-
const umbracoData = JSON.parse(umbracoDataContent);
|
327
|
+
let umbracoData = JSON.parse(umbracoDataContent);
|
|
328
|
+
if (moduleOptions.enableHtmlToMarkdown) {
|
|
329
|
+
umbracoData = convertHtmlToMarkdownDeep(umbracoData);
|
|
330
|
+
}
|
|
277
331
|
logger.info(`Loaded Umbraco data with ${umbracoData.urlList.length} pages`);
|
|
278
332
|
nuxt.options.runtimeConfig.llmsGenerator = {
|
|
279
333
|
enabled: true,
|
|
@@ -302,7 +356,7 @@ const llmsModule = defineNuxtModule({
|
|
|
302
356
|
async function generateLLMSFiles(config, umbracoData, logger) {
|
|
303
357
|
try {
|
|
304
358
|
const { LLMSFilesGenerator } = await import('../chunks/llms-files-generator.mjs');
|
|
305
|
-
const generator = new LLMSFilesGenerator(config);
|
|
359
|
+
const generator = new LLMSFilesGenerator(config, umbracoData);
|
|
306
360
|
logger.info("Testing Anthropic API connection...");
|
|
307
361
|
const connectionOk = await generator["templateGenerator"].testConnection();
|
|
308
362
|
if (!connectionOk) {
|
|
@@ -310,7 +364,7 @@ async function generateLLMSFiles(config, umbracoData, logger) {
|
|
|
310
364
|
return;
|
|
311
365
|
}
|
|
312
366
|
logger.success("Anthropic API connection successful");
|
|
313
|
-
const files = await generator.generateAllFiles(umbracoData);
|
|
367
|
+
const files = await generator.generateAllFiles();
|
|
314
368
|
logger.success("Generated LLMS files:");
|
|
315
369
|
logger.info(`- llms.txt: ${files.llmsTxt.path}`);
|
|
316
370
|
if (files.llmsFullTxt) {
|
package/package.json
CHANGED
|
@@ -1,16 +1,10 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@voicenter-team/nuxt-llms-generator",
|
|
3
|
-
"version": "0.1.5",
|
|
3
|
+
"version": "0.1.7",
|
|
4
4
|
"description": "Nuxt 3 module for automatically generating AI-optimized documentation files (llms.txt, llms-full.txt, and individual .md files) from Umbraco CMS data using Anthropic's Claude API.",
|
|
5
5
|
"repository": "https://github.com/VoicenterTeam/nuxt-llms-generator",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"type": "module",
|
|
8
|
-
"jiti": {
|
|
9
|
-
"alias": {
|
|
10
|
-
"@": "./src",
|
|
11
|
-
"@/*": "./src/*"
|
|
12
|
-
}
|
|
13
|
-
},
|
|
14
8
|
"exports": {
|
|
15
9
|
".": {
|
|
16
10
|
"types": "./dist/types.d.ts",
|