@voicenter-team/nuxt-llms-generator 0.1.11 → 0.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -0
- package/dist/chunks/llms-files-generator.mjs +399 -293
- package/dist/module.d.mts +1 -0
- package/dist/module.d.ts +1 -0
- package/dist/module.json +1 -1
- package/dist/module.mjs +1 -1
- package/dist/shared/{nuxt-llms-generator.bc139143.mjs → nuxt-llms-generator.db76a78e.mjs} +3 -0
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -242,6 +242,7 @@ npm run build
|
|
|
242
242
|
| `enableLLMSFullTxt` | `boolean` | `true` | Generate combined llms-full.txt file |
|
|
243
243
|
| `enableHtmlToMarkdown` | `boolean` | `true` | Convert HTML content to markdown using [node-html-markdown](https://www.npmjs.com/package/node-html-markdown) |
|
|
244
244
|
| `maxConcurrent` | `number` | `5` | Maximum concurrent API requests |
|
|
245
|
+
| `maxTokens` | `number` | `65000` | Maximum tokens for page content before truncation (Claude context limit protection) |
|
|
245
246
|
| `anthropicModel` | `string` | `claude-3-5-sonnet-20241022` | Claude model to use |
|
|
246
247
|
|
|
247
248
|
### Cleanup Options
|
|
@@ -500,6 +501,7 @@ ls -la public/UmbracoData.json
|
|
|
500
501
|
```typescript
|
|
501
502
|
{
|
|
502
503
|
maxConcurrent: 8, // Higher concurrency
|
|
504
|
+
maxTokens: 80000, // More content per page (if using larger models)
|
|
503
505
|
enableAutoCleanup: true, // Keep cache clean
|
|
504
506
|
}
|
|
505
507
|
```
|
|
@@ -509,6 +511,7 @@ ls -la public/UmbracoData.json
|
|
|
509
511
|
{
|
|
510
512
|
enableIndividualMd: false, // Skip individual files
|
|
511
513
|
maxConcurrent: 2, // Lower API usage
|
|
514
|
+
maxTokens: 50000, // Smaller context for faster processing
|
|
512
515
|
}
|
|
513
516
|
```
|
|
514
517
|
|
|
@@ -518,6 +521,7 @@ ls -la public/UmbracoData.json
|
|
|
518
521
|
enableAutoCleanup: true,
|
|
519
522
|
cleanupOrphaned: true,
|
|
520
523
|
cleanupHidden: true,
|
|
524
|
+
maxTokens: 65000, // Balance between detail and API limits
|
|
521
525
|
enableHtmlToMarkdown: true // Clean HTML from CMS content
|
|
522
526
|
}
|
|
523
527
|
```
|
|
@@ -570,6 +574,7 @@ interface LLMSConfig {
|
|
|
570
574
|
templatesDir?: string; // './.llms-templates'
|
|
571
575
|
finalOutputDir?: string; // './.output/llms'
|
|
572
576
|
anthropicModel?: string; // 'claude-3-5-sonnet-20241022'
|
|
577
|
+
maxTokens?: number; // 65000
|
|
573
578
|
maxConcurrent?: number; // 5
|
|
574
579
|
enableLLMSFullTxt?: boolean; // true
|
|
575
580
|
enableIndividualMd?: boolean; // true
|
|
@@ -3,14 +3,221 @@ import { join, dirname, basename } from 'path';
|
|
|
3
3
|
import { slugify } from 'transliteration';
|
|
4
4
|
import Mustache from 'mustache';
|
|
5
5
|
import Anthropic from '@anthropic-ai/sdk';
|
|
6
|
-
import {
|
|
6
|
+
import { encode } from '@toon-format/toon';
|
|
7
7
|
import { JSONPath } from 'jsonpath-plus';
|
|
8
|
-
import {
|
|
8
|
+
import { createHash } from 'crypto';
|
|
9
|
+
import { w as withErrorHandling } from '../shared/nuxt-llms-generator.db76a78e.mjs';
|
|
9
10
|
import '@nuxt/kit';
|
|
10
11
|
import 'zod';
|
|
11
12
|
import 'node-html-markdown';
|
|
12
13
|
|
|
14
|
+
function extractPageContent(umbracoData, jpath) {
|
|
15
|
+
try {
|
|
16
|
+
const result = JSONPath({
|
|
17
|
+
path: jpath,
|
|
18
|
+
json: umbracoData.SiteData,
|
|
19
|
+
wrap: false
|
|
20
|
+
});
|
|
21
|
+
if (!result || Array.isArray(result) && result.length === 0) {
|
|
22
|
+
return null;
|
|
23
|
+
}
|
|
24
|
+
const pageContent = Array.isArray(result) ? result[0] : result;
|
|
25
|
+
return excludeChildrenFromContent(pageContent);
|
|
26
|
+
} catch (error) {
|
|
27
|
+
console.error(`Failed to extract content for path ${jpath}:`, error);
|
|
28
|
+
return null;
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
function excludeChildrenFromContent(content) {
|
|
32
|
+
if (!content || typeof content !== "object") {
|
|
33
|
+
return content;
|
|
34
|
+
}
|
|
35
|
+
const cleanContent = { ...content };
|
|
36
|
+
if ("children" in cleanContent) {
|
|
37
|
+
delete cleanContent.children;
|
|
38
|
+
}
|
|
39
|
+
return cleanContent;
|
|
40
|
+
}
|
|
41
|
+
function generatePageId(urlItem) {
|
|
42
|
+
const templateAlias = urlItem.TemplateAlias || "UnknownTemplate";
|
|
43
|
+
const nodeID = urlItem.nodeID || "UnknownNode";
|
|
44
|
+
return `${templateAlias}_${nodeID}`;
|
|
45
|
+
}
|
|
46
|
+
function isImportantKey(key) {
|
|
47
|
+
const importantPatterns = [
|
|
48
|
+
"title",
|
|
49
|
+
"name",
|
|
50
|
+
"heading",
|
|
51
|
+
"description",
|
|
52
|
+
"summary",
|
|
53
|
+
"content",
|
|
54
|
+
"text",
|
|
55
|
+
"body",
|
|
56
|
+
"value",
|
|
57
|
+
"label",
|
|
58
|
+
"caption",
|
|
59
|
+
"alt",
|
|
60
|
+
"message",
|
|
61
|
+
"url",
|
|
62
|
+
"link",
|
|
63
|
+
"href"
|
|
64
|
+
];
|
|
65
|
+
const lowerKey = key.toLowerCase();
|
|
66
|
+
return importantPatterns.some((pattern) => lowerKey.includes(pattern));
|
|
67
|
+
}
|
|
68
|
+
function isMetadataKey(key) {
|
|
69
|
+
const metadataPatterns = [
|
|
70
|
+
"id",
|
|
71
|
+
"guid",
|
|
72
|
+
"key",
|
|
73
|
+
"_id",
|
|
74
|
+
"nodeid",
|
|
75
|
+
"created",
|
|
76
|
+
"updated",
|
|
77
|
+
"modified",
|
|
78
|
+
"timestamp",
|
|
79
|
+
"date",
|
|
80
|
+
"sort",
|
|
81
|
+
"order",
|
|
82
|
+
"index",
|
|
83
|
+
"position",
|
|
84
|
+
"published",
|
|
85
|
+
"hidden",
|
|
86
|
+
"visible",
|
|
87
|
+
"enabled",
|
|
88
|
+
"status",
|
|
89
|
+
"type",
|
|
90
|
+
"contenttype",
|
|
91
|
+
"template",
|
|
92
|
+
"alias",
|
|
93
|
+
"path",
|
|
94
|
+
"meta",
|
|
95
|
+
"metadata",
|
|
96
|
+
"seo",
|
|
97
|
+
"schema",
|
|
98
|
+
"properties"
|
|
99
|
+
];
|
|
100
|
+
const lowerKey = key.toLowerCase();
|
|
101
|
+
return metadataPatterns.some((pattern) => lowerKey.includes(pattern));
|
|
102
|
+
}
|
|
103
|
+
function recursiveTruncate(content, maxTokens, currentDepth = 0) {
|
|
104
|
+
if (currentDepth > 10) {
|
|
105
|
+
return { _truncated: "Max depth reached" };
|
|
106
|
+
}
|
|
107
|
+
if (maxTokens < 10) {
|
|
108
|
+
return void 0;
|
|
109
|
+
}
|
|
110
|
+
if (content === null || content === void 0) {
|
|
111
|
+
return content;
|
|
112
|
+
}
|
|
113
|
+
if (typeof content !== "object") {
|
|
114
|
+
if (typeof content === "string" && content.length > 2e3) {
|
|
115
|
+
return content.substring(0, 2e3) + "...";
|
|
116
|
+
}
|
|
117
|
+
return content;
|
|
118
|
+
}
|
|
119
|
+
if (Array.isArray(content)) {
|
|
120
|
+
if (content.length === 0)
|
|
121
|
+
return content;
|
|
122
|
+
const itemLimit = Math.max(3, Math.floor(15 / (currentDepth + 1)));
|
|
123
|
+
const tokensPerItem = Math.floor(maxTokens / Math.min(content.length, itemLimit));
|
|
124
|
+
const truncatedArray = content.slice(0, itemLimit).map((item) => recursiveTruncate(item, tokensPerItem, currentDepth + 1)).filter((item) => item !== void 0);
|
|
125
|
+
if (content.length > truncatedArray.length) {
|
|
126
|
+
truncatedArray.push({
|
|
127
|
+
_note: `... and ${content.length - truncatedArray.length} more items`
|
|
128
|
+
});
|
|
129
|
+
}
|
|
130
|
+
return truncatedArray;
|
|
131
|
+
}
|
|
132
|
+
const truncatedObj = {};
|
|
133
|
+
const entries = Object.entries(content);
|
|
134
|
+
const withoutMetadata = entries.filter(([key]) => !isMetadataKey(key));
|
|
135
|
+
if (withoutMetadata.length === 0) {
|
|
136
|
+
return { _note: "Only metadata, removed" };
|
|
137
|
+
}
|
|
138
|
+
const importantEntries = withoutMetadata.filter(([key]) => isImportantKey(key));
|
|
139
|
+
const normalEntries = withoutMetadata.filter(([key]) => !isImportantKey(key));
|
|
140
|
+
const importantBudget = Math.floor(maxTokens * 0.4);
|
|
141
|
+
const tokensPerImportant = importantEntries.length > 0 ? Math.floor(importantBudget / importantEntries.length) : 0;
|
|
142
|
+
for (const [key, value] of importantEntries) {
|
|
143
|
+
const processedValue = recursiveTruncate(value, tokensPerImportant, currentDepth + 1);
|
|
144
|
+
if (processedValue !== void 0) {
|
|
145
|
+
truncatedObj[key] = processedValue;
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
const usedTokens = estimateContentTokens(truncatedObj);
|
|
149
|
+
const remainingBudget = maxTokens - usedTokens;
|
|
150
|
+
if (remainingBudget > 100 && normalEntries.length > 0) {
|
|
151
|
+
const sortedNormal = normalEntries.sort(([_a, valueA], [_b, valueB]) => {
|
|
152
|
+
const sizeA = JSON.stringify(valueA).length;
|
|
153
|
+
const sizeB = JSON.stringify(valueB).length;
|
|
154
|
+
return sizeA - sizeB;
|
|
155
|
+
});
|
|
156
|
+
const tokensPerNormal = Math.floor(remainingBudget / sortedNormal.length);
|
|
157
|
+
for (const [key, value] of sortedNormal) {
|
|
158
|
+
const processedValue = recursiveTruncate(value, tokensPerNormal, currentDepth + 1);
|
|
159
|
+
if (processedValue !== void 0) {
|
|
160
|
+
truncatedObj[key] = processedValue;
|
|
161
|
+
const newSize = estimateContentTokens(truncatedObj);
|
|
162
|
+
if (newSize > maxTokens) {
|
|
163
|
+
delete truncatedObj[key];
|
|
164
|
+
break;
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
return Object.keys(truncatedObj).length > 0 ? truncatedObj : void 0;
|
|
170
|
+
}
|
|
171
|
+
function emergencyTruncate(content, maxTokens) {
|
|
172
|
+
const result = { ...content };
|
|
173
|
+
const keys = Object.keys(result).sort((a, b) => {
|
|
174
|
+
const aImportant = isImportantKey(a) ? 1 : 0;
|
|
175
|
+
const bImportant = isImportantKey(b) ? 1 : 0;
|
|
176
|
+
return aImportant - bImportant;
|
|
177
|
+
});
|
|
178
|
+
for (const key of keys) {
|
|
179
|
+
if (estimateContentTokens(result) <= maxTokens)
|
|
180
|
+
break;
|
|
181
|
+
delete result[key];
|
|
182
|
+
console.warn(` Emergency: removed "${key}"`);
|
|
183
|
+
}
|
|
184
|
+
return result;
|
|
185
|
+
}
|
|
186
|
+
function estimateContentTokens(content) {
|
|
187
|
+
try {
|
|
188
|
+
const jsonString = JSON.stringify(content);
|
|
189
|
+
return Math.ceil(jsonString.length / 3);
|
|
190
|
+
} catch {
|
|
191
|
+
return 0;
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
function truncateContentIfNeeded(content, maxTokens = 1e5) {
|
|
195
|
+
const estimatedTokens = estimateContentTokens(content);
|
|
196
|
+
if (estimatedTokens <= maxTokens) {
|
|
197
|
+
return content;
|
|
198
|
+
}
|
|
199
|
+
console.warn(`\u26A0\uFE0F Content too large (${estimatedTokens} tokens > ${maxTokens} limit), truncating recursively...`);
|
|
200
|
+
const truncatedContent = recursiveTruncate(content, maxTokens, 0);
|
|
201
|
+
const result = truncatedContent && typeof truncatedContent === "object" && !Array.isArray(truncatedContent) ? truncatedContent : {
|
|
202
|
+
_error: "Content truncation failed",
|
|
203
|
+
original: content
|
|
204
|
+
};
|
|
205
|
+
const finalTokens = estimateContentTokens(result);
|
|
206
|
+
const preservedKeys = Object.keys(result).length;
|
|
207
|
+
const originalKeys = Object.keys(content).length;
|
|
208
|
+
console.log(`\u2705 Content truncated: ${estimatedTokens} \u2192 ${finalTokens} tokens (preserved ${preservedKeys}/${originalKeys} root keys)`);
|
|
209
|
+
if (finalTokens > maxTokens) {
|
|
210
|
+
console.error(`\u274C Recursive truncation insufficient (${finalTokens} > ${maxTokens}), performing emergency truncation...`);
|
|
211
|
+
return emergencyTruncate(result, maxTokens);
|
|
212
|
+
}
|
|
213
|
+
return result;
|
|
214
|
+
}
|
|
215
|
+
|
|
13
216
|
function buildLLMSTemplatePrompt(request) {
|
|
217
|
+
const jsonTokens = estimateContentTokens(request.pageContent);
|
|
218
|
+
const toonData = encode(request.pageContent, { delimiter: " " });
|
|
219
|
+
const toonTokens = estimateContentTokens(toonData);
|
|
220
|
+
console.log(`\u{1F4CA} ${request.url}: JSON ${jsonTokens} \u2192 TOON ${toonTokens} (${((1 - toonTokens / jsonTokens) * 100).toFixed(0)}% saved)`);
|
|
14
221
|
return `# LLMS.txt-Optimized Mustache Template Generator
|
|
15
222
|
|
|
16
223
|
You are an expert at creating **Mustache.js templates** that generate **LLM knowledge base entries** following the [\`llms.txt\` standard](https://llmstxt.org/).
|
|
@@ -21,9 +228,9 @@ You are an expert at creating **Mustache.js templates** that generate **LLM know
|
|
|
21
228
|
|
|
22
229
|
### 1. DATA-DRIVEN CONTENT ONLY
|
|
23
230
|
- **EVERY piece of content** must come from a Mustache binding: \`{{propertyName}}\`
|
|
24
|
-
- **NEVER invent, assume, or add content** that doesn't exist in the provided
|
|
231
|
+
- **NEVER invent, assume, or add content** that doesn't exist in the provided data
|
|
25
232
|
- **NO hardcoded descriptions, lists, or facts**
|
|
26
|
-
- If a property doesn't exist in
|
|
233
|
+
- If a property doesn't exist in data, don't create a section for it
|
|
27
234
|
|
|
28
235
|
### 2. ALLOWED CONTEXTUAL ADDITIONS
|
|
29
236
|
You MAY add:
|
|
@@ -32,51 +239,47 @@ You MAY add:
|
|
|
32
239
|
- **Structural markers** for clarity (e.g., "Navigation:", "Metadata:")
|
|
33
240
|
|
|
34
241
|
You MAY NOT add:
|
|
35
|
-
- Descriptions of features/benefits not in
|
|
242
|
+
- Descriptions of features/benefits not in data
|
|
36
243
|
- Explanatory text about what something does
|
|
37
244
|
- Lists of items not present in data
|
|
38
245
|
- Assumptions about the page purpose
|
|
39
246
|
|
|
40
|
-
### 3.
|
|
247
|
+
### 3. UNDERSTANDING TOON FORMAT
|
|
41
248
|
|
|
42
|
-
|
|
43
|
-
\`\`\`mustache
|
|
44
|
-
## Key Benefits
|
|
45
|
-
- Real-time monitoring
|
|
46
|
-
- Detailed analytics
|
|
47
|
-
- Easy to use
|
|
48
|
-
\`\`\`
|
|
49
|
-
*Problem: These benefits are invented, not from JSON*
|
|
249
|
+
The data below is in **TOON format** (Token-Oriented Object Notation) for efficiency.
|
|
50
250
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
*Problem: Description is made up*
|
|
251
|
+
**How to read TOON:**
|
|
252
|
+
- \`propertyName: value\` \u2192 Single property
|
|
253
|
+
- \`array[3]{prop1,prop2}\` \u2192 Array of 3 objects with properties prop1, prop2
|
|
254
|
+
- Properties in \`{braces}\` are the **exact field names** to use in Mustache bindings
|
|
56
255
|
|
|
57
|
-
|
|
58
|
-
\`\`\`
|
|
59
|
-
{
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
- **{{name}}**: {{description}}
|
|
63
|
-
{{/features}}
|
|
64
|
-
{{/features.0}}
|
|
256
|
+
**Example:**
|
|
257
|
+
\`\`\`toon
|
|
258
|
+
users[2]{id,name,role}:
|
|
259
|
+
1 Alice admin
|
|
260
|
+
2 Bob user
|
|
65
261
|
\`\`\`
|
|
66
|
-
*Good: Content comes from JSON, heading provides context*
|
|
67
262
|
|
|
68
|
-
|
|
263
|
+
**Your Mustache template:**
|
|
69
264
|
\`\`\`mustache
|
|
70
|
-
{{#
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
{{#items}}
|
|
74
|
-
- {{title}}
|
|
75
|
-
{{/items}}
|
|
76
|
-
{{/items.0}}
|
|
265
|
+
{{#users}}
|
|
266
|
+
- {{id}}: {{name}} ({{role}})
|
|
267
|
+
{{/users}}
|
|
77
268
|
\`\`\`
|
|
78
|
-
*Good: Brief intro, but content is from JSON*
|
|
79
269
|
|
|
270
|
+
**CRITICAL:** Use the EXACT property names shown in TOON \`{braces}\` for your Mustache bindings.
|
|
271
|
+
|
|
272
|
+
---
|
|
273
|
+
## \u{1F6AB} Content Exclusion Rules
|
|
274
|
+
When generating the template, **DO NOT** create sections for these types of data even if they appear in TOON:
|
|
275
|
+
- **Image/media properties**: URLs to images, avatars, thumbnails, icons, or any media files
|
|
276
|
+
- **UI-only labels**: Search placeholders, filter menu text, "Show More"/"Show Less", pagination labels
|
|
277
|
+
- **SEO/meta fields**: ogTitle, ogDescription, ogImage, canonical URLs, changefreq, priority, sitemap fields
|
|
278
|
+
- **Legal boilerplate**: Copyright text, "All rights reserved"
|
|
279
|
+
- **Navigation chrome**: Breadcrumbs, menu items, footer links \u2014 unless they ARE the primary page content
|
|
280
|
+
- **System identifiers**: Internal IDs, GUIDs, sort orders, node paths, template aliases
|
|
281
|
+
- **Empty/null values**: Skip any property that holds no meaningful value
|
|
282
|
+
**Focus ONLY on**: titles, descriptions, features, pricing, reviews, specifications, contact info, and other business-relevant content.
|
|
80
283
|
---
|
|
81
284
|
|
|
82
285
|
## \u{1F3AF} TRUE PURPOSE: Help LLMs Answer Questions Efficiently
|
|
@@ -100,9 +303,10 @@ These \`.md\` files are **LLM knowledge base entries** designed for **inference*
|
|
|
100
303
|
- **Template Alias:** ${request.templateAlias}
|
|
101
304
|
- **JSON Path:** ${request.jpath}
|
|
102
305
|
|
|
103
|
-
### Available Data
|
|
104
|
-
|
|
105
|
-
|
|
306
|
+
### Available Data (TOON Format)
|
|
307
|
+
|
|
308
|
+
\`\`\`toon
|
|
309
|
+
${toonData}
|
|
106
310
|
\`\`\`
|
|
107
311
|
|
|
108
312
|
---
|
|
@@ -117,13 +321,13 @@ ${JSON.stringify(request.pageContent, null, 2)}
|
|
|
117
321
|
### 2. Structure for Question-Answering
|
|
118
322
|
Anticipate questions an LLM might need to answer:
|
|
119
323
|
- "What is this?" \u2192 Main heading + description properties
|
|
120
|
-
- "What does it offer?" \u2192 Lists of items/features from
|
|
324
|
+
- "What does it offer?" \u2192 Lists of items/features from data
|
|
121
325
|
- "Who is it for?" \u2192 Target audience properties (if they exist)
|
|
122
326
|
- "What are the details?" \u2192 Technical/metadata properties
|
|
123
327
|
|
|
124
|
-
### 3. Prioritize by
|
|
328
|
+
### 3. Prioritize by Data Importance
|
|
125
329
|
**Essential First:**
|
|
126
|
-
-
|
|
330
|
+
- Title/name/heading properties
|
|
127
331
|
- Description/summary properties
|
|
128
332
|
- Main content arrays
|
|
129
333
|
|
|
@@ -146,17 +350,15 @@ Anticipate questions an LLM might need to answer:
|
|
|
146
350
|
|
|
147
351
|
## \u{1F527} Technical Principles (Key-Agnostic Design)
|
|
148
352
|
|
|
149
|
-
### 1.
|
|
150
|
-
|
|
151
|
-
-
|
|
152
|
-
-
|
|
153
|
-
- **Position in JSON:** Root-level = high importance
|
|
154
|
-
- **Semantic patterns:** URLs, images, dates
|
|
353
|
+
### 1. Extract Property Names from TOON
|
|
354
|
+
Look at TOON headers to identify properties:
|
|
355
|
+
- \`{id,name,role}\` \u2192 Use \`{{id}}\`, \`{{name}}\`, \`{{role}}\`
|
|
356
|
+
- \`breadcrumbsLinks[5]{title,link}\` \u2192 Use \`{{#breadcrumbsLinks}}{{title}} {{link}}{{/breadcrumbsLinks}}\`
|
|
155
357
|
|
|
156
358
|
### 2. Exact Property Bindings
|
|
157
|
-
- Always use **exact property name** from
|
|
359
|
+
- Always use **exact property name** from TOON: \`{{actualKeyName}}\`
|
|
158
360
|
- Do NOT rename or modify binding identifiers
|
|
159
|
-
- Mustache bindings must match
|
|
361
|
+
- Mustache bindings must match TOON property names precisely
|
|
160
362
|
|
|
161
363
|
### 3. Humanized Section Headings
|
|
162
364
|
While bindings stay exact, convert keys to readable headings:
|
|
@@ -164,20 +366,33 @@ While bindings stay exact, convert keys to readable headings:
|
|
|
164
366
|
- \`supportPageItems\` \u2192 "Available Support Topics"
|
|
165
367
|
- \`breadcrumbsLinks\` \u2192 "Navigation Path"
|
|
166
368
|
|
|
167
|
-
### 4.
|
|
168
|
-
|
|
169
|
-
-
|
|
170
|
-
-
|
|
171
|
-
-
|
|
172
|
-
|
|
173
|
-
|
|
369
|
+
### 4. Working with Arrays
|
|
370
|
+
When you see \`arrayName[N]{prop1,prop2}\`:
|
|
371
|
+
- Use \`{{#arrayName.0}}\` to check if array exists
|
|
372
|
+
- Iterate with \`{{#arrayName}}\`
|
|
373
|
+
- Access properties with \`{{prop1}}\`, \`{{prop2}}\`
|
|
374
|
+
|
|
375
|
+
**Example:**
|
|
376
|
+
\`\`\`toon
|
|
377
|
+
items[3]{title,description}:
|
|
378
|
+
...
|
|
379
|
+
\`\`\`
|
|
380
|
+
\u2192
|
|
381
|
+
\`\`\`mustache
|
|
382
|
+
{{#items.0}}
|
|
383
|
+
## Items
|
|
384
|
+
{{#items}}
|
|
385
|
+
- {{title}}: {{description}}
|
|
386
|
+
{{/items}}
|
|
387
|
+
{{/items.0}}
|
|
388
|
+
\`\`\`
|
|
174
389
|
|
|
175
390
|
### 5. Noise Filtering
|
|
176
|
-
**Exclude technical metadata
|
|
391
|
+
**Exclude technical metadata** (if present in TOON):
|
|
177
392
|
- IDs: \`id\`, \`nodeId\`, \`_id\`, \`guid\`
|
|
178
393
|
- Timestamps: \`createdAt\`, \`updatedAt\`
|
|
179
394
|
- Flags: \`isPublished\`, \`sortOrder\`, \`hidden\`
|
|
180
|
-
- System: \`_type\`, \`contentType
|
|
395
|
+
- System: \`_type\`, \`contentType\`
|
|
181
396
|
|
|
182
397
|
### 6. Hierarchy & Nesting
|
|
183
398
|
- **Root level** \u2192 \`#\` (H1) \u2014 one per document
|
|
@@ -198,7 +413,7 @@ While bindings stay exact, convert keys to readable headings:
|
|
|
198
413
|
{{/summaryProperty}}
|
|
199
414
|
\`\`\`
|
|
200
415
|
|
|
201
|
-
###
|
|
416
|
+
### Example Sections (adapt to actual TOON data)
|
|
202
417
|
\`\`\`mustache
|
|
203
418
|
{{#mainDescription}}
|
|
204
419
|
## Overview
|
|
@@ -219,45 +434,94 @@ While bindings stay exact, convert keys to readable headings:
|
|
|
219
434
|
- [{{title}}]({{link}})
|
|
220
435
|
{{/navigationLinks}}
|
|
221
436
|
{{/navigationLinks.0}}
|
|
222
|
-
|
|
223
|
-
{{#technicalData}}
|
|
224
|
-
## Technical Information
|
|
225
|
-
- **URL**: {{url}}
|
|
226
|
-
- **Type**: {{type}}
|
|
227
|
-
{{/technicalData}}
|
|
228
437
|
\`\`\`
|
|
229
438
|
|
|
230
|
-
**Important:** These are examples. Your template must match the ACTUAL
|
|
439
|
+
**Important:** These are examples. Your template must match the ACTUAL TOON structure provided.
|
|
231
440
|
|
|
232
441
|
---
|
|
233
442
|
|
|
234
443
|
## \u2705 Output Requirements
|
|
235
444
|
|
|
236
445
|
1. **Output ONLY the Mustache template** \u2014 no explanations, no markdown code fences, no preamble
|
|
237
|
-
2. **Use exact
|
|
446
|
+
2. **Use exact property names from TOON \`{braces}\`** in all bindings
|
|
238
447
|
3. **Generate clean Markdown** \u2014 no HTML, entities, or attributes
|
|
239
448
|
4. **Data-driven content** \u2014 no invented facts or descriptions
|
|
240
|
-
5. **Contextual headings allowed** \u2014 but content must be from
|
|
449
|
+
5. **Contextual headings allowed** \u2014 but content must be from data
|
|
241
450
|
6. **Be concise** \u2014 optimize for limited context windows
|
|
242
451
|
7. **Structure for questions** \u2014 LLMs should easily extract facts
|
|
243
452
|
|
|
244
453
|
---
|
|
245
454
|
|
|
455
|
+
## \u26A0\uFE0F CRITICAL: Mustache Syntax Validation
|
|
456
|
+
|
|
457
|
+
**Every \`{{#tag}}\` MUST have matching \`{{/tag}}\`**
|
|
458
|
+
|
|
459
|
+
### Common Errors (from real failures):
|
|
460
|
+
|
|
461
|
+
\u274C **Missing closing tag:**
|
|
462
|
+
\`\`\`mustache
|
|
463
|
+
{{#pageDescription}}
|
|
464
|
+
content
|
|
465
|
+
// \u274C Missing {{/pageDescription}}
|
|
466
|
+
\`\`\`
|
|
467
|
+
|
|
468
|
+
\u274C **Nested check without outer closing:**
|
|
469
|
+
\`\`\`mustache
|
|
470
|
+
{{#items.0}}
|
|
471
|
+
{{#items}}...{{/items}}
|
|
472
|
+
// \u274C Missing {{/items.0}}
|
|
473
|
+
\`\`\`
|
|
474
|
+
|
|
475
|
+
\u274C **Capitalization mismatch:**
|
|
476
|
+
\`\`\`mustache
|
|
477
|
+
{{#aIFeaturesCTATitle}}
|
|
478
|
+
...
|
|
479
|
+
{{/aiFeaturesCTATitle}} \u274C Different capitalization!
|
|
480
|
+
\`\`\`
|
|
481
|
+
|
|
482
|
+
### Validation Checklist:
|
|
483
|
+
|
|
484
|
+
**Before output:**
|
|
485
|
+
1. Count \`{{#\` tags = ___
|
|
486
|
+
2. Count \`{{/\` tags = ___
|
|
487
|
+
3. Numbers match? If NO \u2192 Find and add missing closing tags
|
|
488
|
+
4. Tag names exact match (including dots, numbers, capitalization)?
|
|
489
|
+
|
|
490
|
+
\u2705 **Valid example:**
|
|
491
|
+
\`\`\`mustache
|
|
492
|
+
{{#section}} \u2190 1 open
|
|
493
|
+
{{#nested.0}} \u2190 2 open
|
|
494
|
+
content
|
|
495
|
+
{{/nested.0}} \u2190 2 close
|
|
496
|
+
{{/section}} \u2190 1 close
|
|
497
|
+
\`\`\`
|
|
498
|
+
Count: 2 = 2 \u2713
|
|
499
|
+
|
|
500
|
+
---
|
|
501
|
+
|
|
246
502
|
## \u{1F680} Your Task
|
|
247
503
|
|
|
248
|
-
Analyze the provided
|
|
504
|
+
Analyze the provided TOON data structure and **generate a Mustache template** that:
|
|
249
505
|
|
|
250
|
-
1. **Uses ONLY data from
|
|
251
|
-
2. **
|
|
252
|
-
3. **
|
|
253
|
-
4. **
|
|
254
|
-
5. **
|
|
506
|
+
1. **Uses ONLY data from TOON** (no invented content)
|
|
507
|
+
2. **Extracts exact property names from \`{braces}\`**
|
|
508
|
+
3. **Adds logical section headings** for context
|
|
509
|
+
4. **Structures data for question-answering**
|
|
510
|
+
5. **Prioritizes most important properties first**
|
|
511
|
+
6. **Remains universal** (works for any data shape)
|
|
512
|
+
7. **\u2705 ALL Mustache tags properly closed**
|
|
255
513
|
|
|
256
514
|
**Remember:**
|
|
257
|
-
-
|
|
258
|
-
-
|
|
259
|
-
-
|
|
260
|
-
-
|
|
515
|
+
- Parse TOON structure naturally \u2705
|
|
516
|
+
- Use exact property names from \`{braces}\` \u2705\u2705\u2705
|
|
517
|
+
- Headings can be contextual \u2705
|
|
518
|
+
- Content must be from data \u2705\u2705\u2705
|
|
519
|
+
- No made-up descriptions \u274C
|
|
520
|
+
- No assumed features \u274C
|
|
521
|
+
- **Every {{#tag}} has {{/tag}}** \u2705\u2705\u2705
|
|
522
|
+
|
|
523
|
+
**Final Step Before Output:**
|
|
524
|
+
Count your \`{{#\` and \`{{/\` tags. If numbers don't match, find and add missing closing tags.
|
|
261
525
|
|
|
262
526
|
Generate the template now.
|
|
263
527
|
`;
|
|
@@ -727,208 +991,11 @@ function getValueType(value) {
|
|
|
727
991
|
return typeof value;
|
|
728
992
|
}
|
|
729
993
|
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
wrap: false
|
|
736
|
-
});
|
|
737
|
-
if (!result || Array.isArray(result) && result.length === 0) {
|
|
738
|
-
return null;
|
|
739
|
-
}
|
|
740
|
-
const pageContent = Array.isArray(result) ? result[0] : result;
|
|
741
|
-
return excludeChildrenFromContent(pageContent);
|
|
742
|
-
} catch (error) {
|
|
743
|
-
console.error(`Failed to extract content for path ${jpath}:`, error);
|
|
744
|
-
return null;
|
|
745
|
-
}
|
|
746
|
-
}
|
|
747
|
-
function excludeChildrenFromContent(content) {
|
|
748
|
-
if (!content || typeof content !== "object") {
|
|
749
|
-
return content;
|
|
750
|
-
}
|
|
751
|
-
const cleanContent = { ...content };
|
|
752
|
-
if ("children" in cleanContent) {
|
|
753
|
-
delete cleanContent.children;
|
|
754
|
-
}
|
|
755
|
-
return cleanContent;
|
|
756
|
-
}
|
|
757
|
-
function generatePageId(urlItem) {
|
|
758
|
-
const templateAlias = urlItem.TemplateAlias || "UnknownTemplate";
|
|
759
|
-
const nodeID = urlItem.nodeID || "UnknownNode";
|
|
760
|
-
return `${templateAlias}_${nodeID}`;
|
|
761
|
-
}
|
|
762
|
-
function isImportantKey(key) {
|
|
763
|
-
const importantPatterns = [
|
|
764
|
-
"title",
|
|
765
|
-
"name",
|
|
766
|
-
"heading",
|
|
767
|
-
"description",
|
|
768
|
-
"summary",
|
|
769
|
-
"content",
|
|
770
|
-
"text",
|
|
771
|
-
"body",
|
|
772
|
-
"value",
|
|
773
|
-
"label",
|
|
774
|
-
"caption",
|
|
775
|
-
"alt",
|
|
776
|
-
"message",
|
|
777
|
-
"url",
|
|
778
|
-
"link",
|
|
779
|
-
"href"
|
|
780
|
-
];
|
|
781
|
-
const lowerKey = key.toLowerCase();
|
|
782
|
-
return importantPatterns.some((pattern) => lowerKey.includes(pattern));
|
|
783
|
-
}
|
|
784
|
-
function isMetadataKey(key) {
|
|
785
|
-
const metadataPatterns = [
|
|
786
|
-
"id",
|
|
787
|
-
"guid",
|
|
788
|
-
"key",
|
|
789
|
-
"_id",
|
|
790
|
-
"nodeid",
|
|
791
|
-
"created",
|
|
792
|
-
"updated",
|
|
793
|
-
"modified",
|
|
794
|
-
"timestamp",
|
|
795
|
-
"date",
|
|
796
|
-
"sort",
|
|
797
|
-
"order",
|
|
798
|
-
"index",
|
|
799
|
-
"position",
|
|
800
|
-
"published",
|
|
801
|
-
"hidden",
|
|
802
|
-
"visible",
|
|
803
|
-
"enabled",
|
|
804
|
-
"status",
|
|
805
|
-
"type",
|
|
806
|
-
"contenttype",
|
|
807
|
-
"template",
|
|
808
|
-
"alias",
|
|
809
|
-
"path",
|
|
810
|
-
"meta",
|
|
811
|
-
"metadata",
|
|
812
|
-
"seo",
|
|
813
|
-
"schema",
|
|
814
|
-
"properties"
|
|
815
|
-
];
|
|
816
|
-
const lowerKey = key.toLowerCase();
|
|
817
|
-
return metadataPatterns.some((pattern) => lowerKey.includes(pattern));
|
|
818
|
-
}
|
|
819
|
-
function recursiveTruncate(content, maxTokens, currentDepth = 0) {
|
|
820
|
-
if (currentDepth > 10) {
|
|
821
|
-
return { _truncated: "Max depth reached" };
|
|
822
|
-
}
|
|
823
|
-
if (maxTokens < 10) {
|
|
824
|
-
return void 0;
|
|
825
|
-
}
|
|
826
|
-
if (content === null || content === void 0) {
|
|
827
|
-
return content;
|
|
828
|
-
}
|
|
829
|
-
if (typeof content !== "object") {
|
|
830
|
-
if (typeof content === "string" && content.length > 2e3) {
|
|
831
|
-
return content.substring(0, 2e3) + "...";
|
|
832
|
-
}
|
|
833
|
-
return content;
|
|
834
|
-
}
|
|
835
|
-
if (Array.isArray(content)) {
|
|
836
|
-
if (content.length === 0)
|
|
837
|
-
return content;
|
|
838
|
-
const itemLimit = Math.max(3, Math.floor(15 / (currentDepth + 1)));
|
|
839
|
-
const tokensPerItem = Math.floor(maxTokens / Math.min(content.length, itemLimit));
|
|
840
|
-
const truncatedArray = content.slice(0, itemLimit).map((item) => recursiveTruncate(item, tokensPerItem, currentDepth + 1)).filter((item) => item !== void 0);
|
|
841
|
-
if (content.length > truncatedArray.length) {
|
|
842
|
-
truncatedArray.push({
|
|
843
|
-
_note: `... and ${content.length - truncatedArray.length} more items`
|
|
844
|
-
});
|
|
845
|
-
}
|
|
846
|
-
return truncatedArray;
|
|
847
|
-
}
|
|
848
|
-
const truncatedObj = {};
|
|
849
|
-
const entries = Object.entries(content);
|
|
850
|
-
const withoutMetadata = entries.filter(([key]) => !isMetadataKey(key));
|
|
851
|
-
if (withoutMetadata.length === 0) {
|
|
852
|
-
return { _note: "Only metadata, removed" };
|
|
853
|
-
}
|
|
854
|
-
const importantEntries = withoutMetadata.filter(([key]) => isImportantKey(key));
|
|
855
|
-
const normalEntries = withoutMetadata.filter(([key]) => !isImportantKey(key));
|
|
856
|
-
const importantBudget = Math.floor(maxTokens * 0.4);
|
|
857
|
-
const tokensPerImportant = importantEntries.length > 0 ? Math.floor(importantBudget / importantEntries.length) : 0;
|
|
858
|
-
for (const [key, value] of importantEntries) {
|
|
859
|
-
const processedValue = recursiveTruncate(value, tokensPerImportant, currentDepth + 1);
|
|
860
|
-
if (processedValue !== void 0) {
|
|
861
|
-
truncatedObj[key] = processedValue;
|
|
862
|
-
}
|
|
863
|
-
}
|
|
864
|
-
const usedTokens = estimateContentTokens(truncatedObj);
|
|
865
|
-
const remainingBudget = maxTokens - usedTokens;
|
|
866
|
-
if (remainingBudget > 100 && normalEntries.length > 0) {
|
|
867
|
-
const sortedNormal = normalEntries.sort(([_a, valueA], [_b, valueB]) => {
|
|
868
|
-
const sizeA = JSON.stringify(valueA).length;
|
|
869
|
-
const sizeB = JSON.stringify(valueB).length;
|
|
870
|
-
return sizeA - sizeB;
|
|
871
|
-
});
|
|
872
|
-
const tokensPerNormal = Math.floor(remainingBudget / sortedNormal.length);
|
|
873
|
-
for (const [key, value] of sortedNormal) {
|
|
874
|
-
const processedValue = recursiveTruncate(value, tokensPerNormal, currentDepth + 1);
|
|
875
|
-
if (processedValue !== void 0) {
|
|
876
|
-
truncatedObj[key] = processedValue;
|
|
877
|
-
const newSize = estimateContentTokens(truncatedObj);
|
|
878
|
-
if (newSize > maxTokens) {
|
|
879
|
-
delete truncatedObj[key];
|
|
880
|
-
break;
|
|
881
|
-
}
|
|
882
|
-
}
|
|
883
|
-
}
|
|
884
|
-
}
|
|
885
|
-
return Object.keys(truncatedObj).length > 0 ? truncatedObj : void 0;
|
|
886
|
-
}
|
|
887
|
-
function emergencyTruncate(content, maxTokens) {
|
|
888
|
-
const result = { ...content };
|
|
889
|
-
const keys = Object.keys(result).sort((a, b) => {
|
|
890
|
-
const aImportant = isImportantKey(a) ? 1 : 0;
|
|
891
|
-
const bImportant = isImportantKey(b) ? 1 : 0;
|
|
892
|
-
return aImportant - bImportant;
|
|
893
|
-
});
|
|
894
|
-
for (const key of keys) {
|
|
895
|
-
if (estimateContentTokens(result) <= maxTokens)
|
|
896
|
-
break;
|
|
897
|
-
delete result[key];
|
|
898
|
-
console.warn(` Emergency: removed "${key}"`);
|
|
899
|
-
}
|
|
900
|
-
return result;
|
|
901
|
-
}
|
|
902
|
-
/**
 * Roughly estimates how many tokens `content` occupies once serialized.
 *
 * The estimate is the length of the JSON serialization divided by 3,
 * rounded up.
 *
 * @param {*} content - Any value; it is serialized with `JSON.stringify`.
 * @returns {number} Estimated token count, or 0 when the value cannot be
 *   serialized (cycles, BigInt) or serializes to nothing (`undefined`,
 *   functions, symbols).
 */
function estimateContentTokens(content) {
  let serialized;
  try {
    serialized = JSON.stringify(content);
  } catch {
    // Best-effort estimate: unserializable input (cycle, BigInt) counts as 0.
    return 0;
  }
  // JSON.stringify returns undefined for undefined/functions/symbols; handle
  // that explicitly instead of relying on a TypeError from `.length`.
  if (typeof serialized !== "string") {
    return 0;
  }
  return Math.ceil(serialized.length / 3);
}
|
|
910
|
-
/**
 * Returns `content` unchanged when its estimated token count is within
 * `maxTokens`; otherwise shrinks it with `recursiveTruncate` and, if that is
 * still too large, with `emergencyTruncate`.
 *
 * @param {object} content - Page content to fit into the budget.
 * @param {number} [maxTokens=1e5] - Token budget.
 * @returns {object} The original object (when it already fits) or a reduced
 *   object.
 */
function truncateContentIfNeeded(content, maxTokens = 1e5) {
  const tokensBefore = estimateContentTokens(content);
  if (tokensBefore <= maxTokens) {
    return content;
  }
  console.warn(`\u26A0\uFE0F Content too large (${tokensBefore} tokens > ${maxTokens} limit), truncating recursively...`);
  const candidate = recursiveTruncate(content, maxTokens, 0);
  let result;
  if (candidate && typeof candidate === "object" && !Array.isArray(candidate)) {
    result = candidate;
  } else {
    // NOTE(review): this fallback embeds the full oversized `content`, so it
    // will itself exceed the budget and be reduced by the emergency path
    // below — confirm whether keeping `original` here is intended.
    result = {
      _error: "Content truncation failed",
      original: content
    };
  }
  const tokensAfter = estimateContentTokens(result);
  const keptKeyCount = Object.keys(result).length;
  const originalKeyCount = Object.keys(content).length;
  console.log(`\u2705 Content truncated: ${tokensBefore} \u2192 ${tokensAfter} tokens (preserved ${keptKeyCount}/${originalKeyCount} root keys)`);
  if (tokensAfter > maxTokens) {
    console.error(`\u274C Recursive truncation insufficient (${tokensAfter} > ${maxTokens}), performing emergency truncation...`);
    return emergencyTruncate(result, maxTokens);
  }
  return result;
}
|
|
931
|
-
|
|
994
|
+
// Lower-case substrings that mark filler ("lorem ipsum") content. They are
// compared against lower-cased serialized page content, so entries must stay
// lower-case.
const PLACEHOLDER_PATTERNS = ["lorem ipsum", "dolor sit amet", "consectetuer adipi"];
|
|
932
999
|
function shouldGenerateTemplate(umbracoData, urlItem) {
|
|
933
1000
|
try {
|
|
934
1001
|
const pageContent = extractPageContent(umbracoData, urlItem.Jpath);
|
|
@@ -941,6 +1008,16 @@ function shouldGenerateTemplate(umbracoData, urlItem) {
|
|
|
941
1008
|
console.log(`Page ${urlItem.url} is hidden (hidePage: ${hidePage}), skipping template generation`);
|
|
942
1009
|
return false;
|
|
943
1010
|
}
|
|
1011
|
+
const title = pageContent.pageTitle ?? pageContent.pageTittle ?? pageContent.ogTitle ?? pageContent.headerBlockTitle;
|
|
1012
|
+
if (!title || title === "undefined" || title === "null") {
|
|
1013
|
+
console.log(`Page ${urlItem.url} has no valid title, skipping template generation`);
|
|
1014
|
+
return false;
|
|
1015
|
+
}
|
|
1016
|
+
const bodyText = JSON.stringify(pageContent).toLowerCase();
|
|
1017
|
+
if (PLACEHOLDER_PATTERNS.some((p) => bodyText.includes(p))) {
|
|
1018
|
+
console.log(`Page ${urlItem.url} contains placeholder text, skipping template generation`);
|
|
1019
|
+
return false;
|
|
1020
|
+
}
|
|
944
1021
|
return true;
|
|
945
1022
|
} catch (error) {
|
|
946
1023
|
console.error(`Error checking visibility for ${urlItem.url}:`, error);
|
|
@@ -1156,6 +1233,19 @@ async function performAutomaticCleanup(umbracoData, cacheDir, options = {}) {
|
|
|
1156
1233
|
return stats;
|
|
1157
1234
|
}
|
|
1158
1235
|
|
|
1236
|
+
/**
 * Cleans up rendered markdown text.
 *
 * Steps, in order:
 *  1. remove inline images (`![alt](src)`);
 *  2. strip a leading "<number>, " from headings (`## 3, Title` -> `## Title`);
 *  3. decode the HTML entities listed below, each exactly once;
 *  4. remove list items that consist only of a label ending in ":";
 *  5. remove `[הרחבה](...)` links;
 *  6. collapse runs of "/" that do not directly follow ":" (keeps `http://`);
 *  7. remove level 2-6 headings that are followed only by blank lines and
 *     then another heading or the end of the document;
 *  8. collapse 3+ consecutive newlines to 2 and trim the result.
 *
 * @param {string} markdown - Rendered markdown.
 * @returns {string} Sanitized markdown.
 */
function sanitizeRenderedMarkdown(markdown) {
  // Entity -> character table. Decoding happens in ONE pass so that an
  // already-escaped entity such as "&amp;lt;" becomes "&lt;" rather than
  // being decoded a second time into "<".
  const entityToChar = {
    "&#x2F;": "/",
    "&#39;": "'",
    "&#x27;": "'",
    "&quot;": '"',
    "&amp;": "&",
    "&#x3D;": "=",
    "&#x60;": "`",
    "&lt;": "<",
    "&gt;": ">"
  };
  let output = markdown;
  output = output.replace(/!\[.*?]\(.*?\)/g, "");
  output = output.replace(/^(#{1,6})\s+\d+,\s*/gm, "$1 ");
  output = output.replace(/&(?:#x2F|#39|#x27|quot|amp|#x3D|#x60|lt|gt);/g, (entity) => entityToChar[entity]);
  output = output.replace(/^- .+?:\s*$/gm, "");
  output = output.replace(/\[הרחבה]\([^)]*\)/g, "");
  output = output.replace(/(?<!:)\/{2,}/g, "/");
  // `(?![\s\S])` asserts the real end of input. A plain `$` under the `m`
  // flag also matches before any newline, which deleted headings that were
  // separated from their body text by two or more blank lines.
  output = output.replace(/^(#{2,6})\s+.+\n(\s*\n)+(?=#{1,6}\s|(?![\s\S]))/gm, "");
  output = output.replace(/\n{3,}/g, "\n\n");
  return output.trim();
}
|
|
1248
|
+
|
|
1159
1249
|
class TemplateGenerator {
|
|
1160
1250
|
anthropicClient;
|
|
1161
1251
|
promptAnalyzer;
|
|
@@ -1264,12 +1354,11 @@ class TemplateGenerator {
|
|
|
1264
1354
|
const pageId = generatePageId(urlItem);
|
|
1265
1355
|
console.log(`Generating new template for ${pageId} (${urlItem.url})`);
|
|
1266
1356
|
const tokensBeforeTruncation = estimateContentTokens(pageContent);
|
|
1267
|
-
const truncatedContent = truncateContentIfNeeded(pageContent,
|
|
1357
|
+
const truncatedContent = truncateContentIfNeeded(pageContent, this.config.maxTokens);
|
|
1268
1358
|
const tokensAfterTruncation = estimateContentTokens(truncatedContent);
|
|
1269
1359
|
if (tokensBeforeTruncation > tokensAfterTruncation) {
|
|
1270
1360
|
console.warn(`Page ${pageId} content truncated: ${tokensBeforeTruncation} -> ${tokensAfterTruncation} tokens`);
|
|
1271
1361
|
}
|
|
1272
|
-
this.promptAnalyzer.analyzeContent(truncatedContent, urlItem);
|
|
1273
1362
|
const request = {
|
|
1274
1363
|
pageContent: truncatedContent,
|
|
1275
1364
|
templateAlias: urlItem.TemplateAlias,
|
|
@@ -1300,7 +1389,14 @@ class TemplateGenerator {
|
|
|
1300
1389
|
}
|
|
1301
1390
|
async renderTemplate(template, data) {
|
|
1302
1391
|
return withErrorHandling(async () => {
|
|
1303
|
-
|
|
1392
|
+
const originalEscape = Mustache.escape;
|
|
1393
|
+
Mustache.escape = (text) => text;
|
|
1394
|
+
try {
|
|
1395
|
+
const rendered = Mustache.render(template, data);
|
|
1396
|
+
return sanitizeRenderedMarkdown(rendered);
|
|
1397
|
+
} finally {
|
|
1398
|
+
Mustache.escape = originalEscape;
|
|
1399
|
+
}
|
|
1304
1400
|
}, {
|
|
1305
1401
|
template: template.substring(0, 200) + "...",
|
|
1306
1402
|
dataKeys: Object.keys(data)
|
|
@@ -1429,7 +1525,8 @@ class LLMSFilesGenerator {
|
|
|
1429
1525
|
content += `This website contains comprehensive information about ${siteTitle.toLowerCase()}. The content is organized into the following sections:
|
|
1430
1526
|
|
|
1431
1527
|
`;
|
|
1432
|
-
const
|
|
1528
|
+
const deduplicatedFiles = this.deduplicateByUrl(mdFiles);
|
|
1529
|
+
const pagesByCategory = this.groupPagesByCategory(deduplicatedFiles);
|
|
1433
1530
|
for (const [category, pages] of Object.entries(pagesByCategory)) {
|
|
1434
1531
|
if (pages.length === 0)
|
|
1435
1532
|
continue;
|
|
@@ -1472,7 +1569,8 @@ class LLMSFilesGenerator {
|
|
|
1472
1569
|
`;
|
|
1473
1570
|
}
|
|
1474
1571
|
content += "---\n\n";
|
|
1475
|
-
|
|
1572
|
+
const deduplicatedFiles = this.deduplicateByUrl(mdFiles);
|
|
1573
|
+
for (const mdFile of deduplicatedFiles) {
|
|
1476
1574
|
const urlItem = this.umbracoData.urlList.find((item) => item.url === mdFile.url);
|
|
1477
1575
|
if (!urlItem)
|
|
1478
1576
|
continue;
|
|
@@ -1513,6 +1611,15 @@ class LLMSFilesGenerator {
|
|
|
1513
1611
|
* /marketplace -> category "marketplace"
|
|
1514
1612
|
* / -> category "main"
|
|
1515
1613
|
*/
|
|
1614
|
+
deduplicateByUrl(mdFiles) {
|
|
1615
|
+
const seen = /* @__PURE__ */ new Set();
|
|
1616
|
+
return mdFiles.filter((file) => {
|
|
1617
|
+
if (seen.has(file.url))
|
|
1618
|
+
return false;
|
|
1619
|
+
seen.add(file.url);
|
|
1620
|
+
return true;
|
|
1621
|
+
});
|
|
1622
|
+
}
|
|
1516
1623
|
groupPagesByCategory(mdFiles) {
|
|
1517
1624
|
const categories = {};
|
|
1518
1625
|
for (const mdFile of mdFiles) {
|
|
@@ -1558,8 +1665,7 @@ class LLMSFilesGenerator {
|
|
|
1558
1665
|
}
|
|
1559
1666
|
extractSiteTitle() {
|
|
1560
1667
|
const siteData = this.umbracoData.SiteData;
|
|
1561
|
-
|
|
1562
|
-
return rawTitle;
|
|
1668
|
+
return siteData?.pageTitle || siteData?.mainHeaderBlockTitle || "Website Documentation";
|
|
1563
1669
|
}
|
|
1564
1670
|
extractSiteDescription() {
|
|
1565
1671
|
const siteData = this.umbracoData.SiteData;
|
|
@@ -1582,11 +1688,11 @@ class LLMSFilesGenerator {
|
|
|
1582
1688
|
const pageContent = extractPageContent(this.umbracoData, urlItem.Jpath);
|
|
1583
1689
|
if (!pageContent)
|
|
1584
1690
|
return `${urlItem.TemplateAlias} page`;
|
|
1585
|
-
const desc = pageContent.pageDescription || pageContent.description || pageContent.headerBlockSubtitle;
|
|
1586
|
-
if (desc && typeof desc === "string") {
|
|
1587
|
-
return desc;
|
|
1691
|
+
const desc = pageContent.pageDescription || pageContent.description || pageContent.headerBlockSubtitle || pageContent.ogDescription;
|
|
1692
|
+
if (desc && typeof desc === "string" && desc.trim().length > 0) {
|
|
1693
|
+
return desc.trim();
|
|
1588
1694
|
}
|
|
1589
|
-
return
|
|
1695
|
+
return `${urlItem.TemplateAlias} page`;
|
|
1590
1696
|
}
|
|
1591
1697
|
sanitizeUrlForFilename(url) {
|
|
1592
1698
|
if (!url || url === "/")
|
package/dist/module.d.mts
CHANGED
|
@@ -14,6 +14,7 @@ declare const LLMSConfigSchema: z.ZodObject<{
|
|
|
14
14
|
baseSiteUrl: z.ZodOptional<z.ZodString>;
|
|
15
15
|
baseSiteUrlUmbracoDataKey: z.ZodOptional<z.ZodString>;
|
|
16
16
|
maxConcurrent: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
|
|
17
|
+
maxTokens: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
|
|
17
18
|
enableLLMSFullTxt: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
|
18
19
|
enableIndividualMd: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
|
19
20
|
enableAutoCleanup: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
package/dist/module.d.ts
CHANGED
|
@@ -14,6 +14,7 @@ declare const LLMSConfigSchema: z.ZodObject<{
|
|
|
14
14
|
baseSiteUrl: z.ZodOptional<z.ZodString>;
|
|
15
15
|
baseSiteUrlUmbracoDataKey: z.ZodOptional<z.ZodString>;
|
|
16
16
|
maxConcurrent: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
|
|
17
|
+
maxTokens: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
|
|
17
18
|
enableLLMSFullTxt: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
|
18
19
|
enableIndividualMd: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
|
19
20
|
enableAutoCleanup: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
|
package/dist/module.json
CHANGED
package/dist/module.mjs
CHANGED
|
@@ -38,6 +38,7 @@ const LLMSConfigSchema = z.object({
|
|
|
38
38
|
).describe("The base URL of the website to append to links in generated llms files"),
|
|
39
39
|
baseSiteUrlUmbracoDataKey: z.string().optional().describe("If the SiteData of UmbracoData has the key with the base URL you can pass here the key to auto extract the base url"),
|
|
40
40
|
maxConcurrent: z.number().int().min(1, "maxConcurrent must be at least 1").max(10, "maxConcurrent should not exceed 10 to avoid rate limits").optional().default(3),
|
|
41
|
+
maxTokens: z.number().int().min(1e3, "maxTokens must be at least 1000").max(2e5, "maxTokens should not exceed 200000").optional().default(65e3).describe("Maximum tokens for page content before truncation"),
|
|
41
42
|
enableLLMSFullTxt: z.boolean().optional().default(true),
|
|
42
43
|
enableIndividualMd: z.boolean().optional().default(true),
|
|
43
44
|
enableAutoCleanup: z.boolean().optional().default(true),
|
|
@@ -225,6 +226,7 @@ function convertHtmlToMarkdownDeep(input) {
|
|
|
225
226
|
const DEFAULT_OPTIONS = {
|
|
226
227
|
anthropicModel: "claude-3-7-sonnet-latest",
|
|
227
228
|
maxConcurrent: 5,
|
|
229
|
+
maxTokens: 65e3,
|
|
228
230
|
enableLLMSFullTxt: true,
|
|
229
231
|
enableIndividualMd: true,
|
|
230
232
|
templatesDir: ".llms-templates",
|
|
@@ -271,6 +273,7 @@ const llmsModule = defineNuxtModule({
|
|
|
271
273
|
finalOutputDir: resolve(nuxt.options.rootDir, options.finalOutputDir ?? "public"),
|
|
272
274
|
anthropicModel: options.anthropicModel || DEFAULT_OPTIONS.anthropicModel,
|
|
273
275
|
maxConcurrent: options.maxConcurrent || DEFAULT_OPTIONS.maxConcurrent,
|
|
276
|
+
maxTokens: options.maxTokens ?? DEFAULT_OPTIONS.maxTokens,
|
|
274
277
|
enableLLMSFullTxt: options.enableLLMSFullTxt ?? DEFAULT_OPTIONS.enableLLMSFullTxt,
|
|
275
278
|
enableIndividualMd: options.enableIndividualMd ?? DEFAULT_OPTIONS.enableIndividualMd,
|
|
276
279
|
enableAutoCleanup: options.enableAutoCleanup ?? DEFAULT_OPTIONS.enableAutoCleanup,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@voicenter-team/nuxt-llms-generator",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.13",
|
|
4
4
|
"description": "Nuxt 3 module for automatically generating AI-optimized documentation files (llms.txt, llms-full.txt, and individual .md files) from Umbraco CMS data using Anthropic's Claude API.",
|
|
5
5
|
"repository": "https://github.com/VoicenterTeam/nuxt-llms-generator",
|
|
6
6
|
"license": "MIT",
|
|
@@ -35,6 +35,7 @@
|
|
|
35
35
|
"dependencies": {
|
|
36
36
|
"@anthropic-ai/sdk": "^0.30.0",
|
|
37
37
|
"@nuxt/kit": "^3.11.2",
|
|
38
|
+
"@toon-format/toon": "^2.1.0",
|
|
38
39
|
"@voicenter-team/eslint-config-ts": "^1.0.22",
|
|
39
40
|
"i": "^0.3.7",
|
|
40
41
|
"jsonpath-plus": "^8.0.0",
|