@voicenter-team/nuxt-llms-generator 0.1.11 → 0.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -242,6 +242,7 @@ npm run build
242
242
  | `enableLLMSFullTxt` | `boolean` | `true` | Generate combined llms-full.txt file |
243
243
  | `enableHtmlToMarkdown` | `boolean` | `true` | Convert HTML content to markdown using [node-html-markdown](https://www.npmjs.com/package/node-html-markdown) |
244
244
  | `maxConcurrent` | `number` | `5` | Maximum concurrent API requests |
245
+ | `maxTokens` | `number` | `65000` | Maximum tokens for page content before truncation (Claude context limit protection) |
245
246
  | `anthropicModel` | `string` | `claude-3-5-sonnet-20241022` | Claude model to use |
246
247
 
247
248
  ### Cleanup Options
@@ -500,6 +501,7 @@ ls -la public/UmbracoData.json
500
501
  ```typescript
501
502
  {
502
503
  maxConcurrent: 8, // Higher concurrency
504
+ maxTokens: 80000, // More content per page (if using larger models)
503
505
  enableAutoCleanup: true, // Keep cache clean
504
506
  }
505
507
  ```
@@ -509,6 +511,7 @@ ls -la public/UmbracoData.json
509
511
  {
510
512
  enableIndividualMd: false, // Skip individual files
511
513
  maxConcurrent: 2, // Lower API usage
514
+ maxTokens: 50000, // Smaller context for faster processing
512
515
  }
513
516
  ```
514
517
 
@@ -518,6 +521,7 @@ ls -la public/UmbracoData.json
518
521
  enableAutoCleanup: true,
519
522
  cleanupOrphaned: true,
520
523
  cleanupHidden: true,
524
+ maxTokens: 65000, // Balance between detail and API limits
521
525
  enableHtmlToMarkdown: true // Clean HTML from CMS content
522
526
  }
523
527
  ```
@@ -570,6 +574,7 @@ interface LLMSConfig {
570
574
  templatesDir?: string; // './.llms-templates'
571
575
  finalOutputDir?: string; // './.output/llms'
572
576
  anthropicModel?: string; // 'claude-3-5-sonnet-20241022'
577
+ maxTokens?: number; // 65000
573
578
  maxConcurrent?: number; // 5
574
579
  enableLLMSFullTxt?: boolean; // true
575
580
  enableIndividualMd?: boolean; // true
@@ -3,14 +3,221 @@ import { join, dirname, basename } from 'path';
3
3
  import { slugify } from 'transliteration';
4
4
  import Mustache from 'mustache';
5
5
  import Anthropic from '@anthropic-ai/sdk';
6
- import { createHash } from 'crypto';
6
+ import { encode } from '@toon-format/toon';
7
7
  import { JSONPath } from 'jsonpath-plus';
8
- import { w as withErrorHandling } from '../shared/nuxt-llms-generator.bc139143.mjs';
8
+ import { createHash } from 'crypto';
9
+ import { w as withErrorHandling } from '../shared/nuxt-llms-generator.db76a78e.mjs';
9
10
  import '@nuxt/kit';
10
11
  import 'zod';
11
12
  import 'node-html-markdown';
12
13
 
14
+ function extractPageContent(umbracoData, jpath) {
15
+ try {
16
+ const result = JSONPath({
17
+ path: jpath,
18
+ json: umbracoData.SiteData,
19
+ wrap: false
20
+ });
21
+ if (!result || Array.isArray(result) && result.length === 0) {
22
+ return null;
23
+ }
24
+ const pageContent = Array.isArray(result) ? result[0] : result;
25
+ return excludeChildrenFromContent(pageContent);
26
+ } catch (error) {
27
+ console.error(`Failed to extract content for path ${jpath}:`, error);
28
+ return null;
29
+ }
30
+ }
31
+ function excludeChildrenFromContent(content) {
32
+ if (!content || typeof content !== "object") {
33
+ return content;
34
+ }
35
+ const cleanContent = { ...content };
36
+ if ("children" in cleanContent) {
37
+ delete cleanContent.children;
38
+ }
39
+ return cleanContent;
40
+ }
41
+ function generatePageId(urlItem) {
42
+ const templateAlias = urlItem.TemplateAlias || "UnknownTemplate";
43
+ const nodeID = urlItem.nodeID || "UnknownNode";
44
+ return `${templateAlias}_${nodeID}`;
45
+ }
46
+ function isImportantKey(key) {
47
+ const importantPatterns = [
48
+ "title",
49
+ "name",
50
+ "heading",
51
+ "description",
52
+ "summary",
53
+ "content",
54
+ "text",
55
+ "body",
56
+ "value",
57
+ "label",
58
+ "caption",
59
+ "alt",
60
+ "message",
61
+ "url",
62
+ "link",
63
+ "href"
64
+ ];
65
+ const lowerKey = key.toLowerCase();
66
+ return importantPatterns.some((pattern) => lowerKey.includes(pattern));
67
+ }
68
+ function isMetadataKey(key) {
69
+ const metadataPatterns = [
70
+ "id",
71
+ "guid",
72
+ "key",
73
+ "_id",
74
+ "nodeid",
75
+ "created",
76
+ "updated",
77
+ "modified",
78
+ "timestamp",
79
+ "date",
80
+ "sort",
81
+ "order",
82
+ "index",
83
+ "position",
84
+ "published",
85
+ "hidden",
86
+ "visible",
87
+ "enabled",
88
+ "status",
89
+ "type",
90
+ "contenttype",
91
+ "template",
92
+ "alias",
93
+ "path",
94
+ "meta",
95
+ "metadata",
96
+ "seo",
97
+ "schema",
98
+ "properties"
99
+ ];
100
+ const lowerKey = key.toLowerCase();
101
+ return metadataPatterns.some((pattern) => lowerKey.includes(pattern));
102
+ }
103
+ function recursiveTruncate(content, maxTokens, currentDepth = 0) {
104
+ if (currentDepth > 10) {
105
+ return { _truncated: "Max depth reached" };
106
+ }
107
+ if (maxTokens < 10) {
108
+ return void 0;
109
+ }
110
+ if (content === null || content === void 0) {
111
+ return content;
112
+ }
113
+ if (typeof content !== "object") {
114
+ if (typeof content === "string" && content.length > 2e3) {
115
+ return content.substring(0, 2e3) + "...";
116
+ }
117
+ return content;
118
+ }
119
+ if (Array.isArray(content)) {
120
+ if (content.length === 0)
121
+ return content;
122
+ const itemLimit = Math.max(3, Math.floor(15 / (currentDepth + 1)));
123
+ const tokensPerItem = Math.floor(maxTokens / Math.min(content.length, itemLimit));
124
+ const truncatedArray = content.slice(0, itemLimit).map((item) => recursiveTruncate(item, tokensPerItem, currentDepth + 1)).filter((item) => item !== void 0);
125
+ if (content.length > truncatedArray.length) {
126
+ truncatedArray.push({
127
+ _note: `... and ${content.length - truncatedArray.length} more items`
128
+ });
129
+ }
130
+ return truncatedArray;
131
+ }
132
+ const truncatedObj = {};
133
+ const entries = Object.entries(content);
134
+ const withoutMetadata = entries.filter(([key]) => !isMetadataKey(key));
135
+ if (withoutMetadata.length === 0) {
136
+ return { _note: "Only metadata, removed" };
137
+ }
138
+ const importantEntries = withoutMetadata.filter(([key]) => isImportantKey(key));
139
+ const normalEntries = withoutMetadata.filter(([key]) => !isImportantKey(key));
140
+ const importantBudget = Math.floor(maxTokens * 0.4);
141
+ const tokensPerImportant = importantEntries.length > 0 ? Math.floor(importantBudget / importantEntries.length) : 0;
142
+ for (const [key, value] of importantEntries) {
143
+ const processedValue = recursiveTruncate(value, tokensPerImportant, currentDepth + 1);
144
+ if (processedValue !== void 0) {
145
+ truncatedObj[key] = processedValue;
146
+ }
147
+ }
148
+ const usedTokens = estimateContentTokens(truncatedObj);
149
+ const remainingBudget = maxTokens - usedTokens;
150
+ if (remainingBudget > 100 && normalEntries.length > 0) {
151
+ const sortedNormal = normalEntries.sort(([_a, valueA], [_b, valueB]) => {
152
+ const sizeA = JSON.stringify(valueA).length;
153
+ const sizeB = JSON.stringify(valueB).length;
154
+ return sizeA - sizeB;
155
+ });
156
+ const tokensPerNormal = Math.floor(remainingBudget / sortedNormal.length);
157
+ for (const [key, value] of sortedNormal) {
158
+ const processedValue = recursiveTruncate(value, tokensPerNormal, currentDepth + 1);
159
+ if (processedValue !== void 0) {
160
+ truncatedObj[key] = processedValue;
161
+ const newSize = estimateContentTokens(truncatedObj);
162
+ if (newSize > maxTokens) {
163
+ delete truncatedObj[key];
164
+ break;
165
+ }
166
+ }
167
+ }
168
+ }
169
+ return Object.keys(truncatedObj).length > 0 ? truncatedObj : void 0;
170
+ }
171
+ function emergencyTruncate(content, maxTokens) {
172
+ const result = { ...content };
173
+ const keys = Object.keys(result).sort((a, b) => {
174
+ const aImportant = isImportantKey(a) ? 1 : 0;
175
+ const bImportant = isImportantKey(b) ? 1 : 0;
176
+ return aImportant - bImportant;
177
+ });
178
+ for (const key of keys) {
179
+ if (estimateContentTokens(result) <= maxTokens)
180
+ break;
181
+ delete result[key];
182
+ console.warn(` Emergency: removed "${key}"`);
183
+ }
184
+ return result;
185
+ }
186
+ function estimateContentTokens(content) {
187
+ try {
188
+ const jsonString = JSON.stringify(content);
189
+ return Math.ceil(jsonString.length / 3);
190
+ } catch {
191
+ return 0;
192
+ }
193
+ }
194
+ function truncateContentIfNeeded(content, maxTokens = 1e5) {
195
+ const estimatedTokens = estimateContentTokens(content);
196
+ if (estimatedTokens <= maxTokens) {
197
+ return content;
198
+ }
199
+ console.warn(`\u26A0\uFE0F Content too large (${estimatedTokens} tokens > ${maxTokens} limit), truncating recursively...`);
200
+ const truncatedContent = recursiveTruncate(content, maxTokens, 0);
201
+ const result = truncatedContent && typeof truncatedContent === "object" && !Array.isArray(truncatedContent) ? truncatedContent : {
202
+ _error: "Content truncation failed",
203
+ original: content
204
+ };
205
+ const finalTokens = estimateContentTokens(result);
206
+ const preservedKeys = Object.keys(result).length;
207
+ const originalKeys = Object.keys(content).length;
208
+ console.log(`\u2705 Content truncated: ${estimatedTokens} \u2192 ${finalTokens} tokens (preserved ${preservedKeys}/${originalKeys} root keys)`);
209
+ if (finalTokens > maxTokens) {
210
+ console.error(`\u274C Recursive truncation insufficient (${finalTokens} > ${maxTokens}), performing emergency truncation...`);
211
+ return emergencyTruncate(result, maxTokens);
212
+ }
213
+ return result;
214
+ }
215
+
13
216
  function buildLLMSTemplatePrompt(request) {
217
+ const jsonTokens = estimateContentTokens(request.pageContent);
218
+ const toonData = encode(request.pageContent, { delimiter: " " });
219
+ const toonTokens = estimateContentTokens(toonData);
220
+ console.log(`\u{1F4CA} ${request.url}: JSON ${jsonTokens} \u2192 TOON ${toonTokens} (${((1 - toonTokens / jsonTokens) * 100).toFixed(0)}% saved)`);
14
221
  return `# LLMS.txt-Optimized Mustache Template Generator
15
222
 
16
223
  You are an expert at creating **Mustache.js templates** that generate **LLM knowledge base entries** following the [\`llms.txt\` standard](https://llmstxt.org/).
@@ -21,9 +228,9 @@ You are an expert at creating **Mustache.js templates** that generate **LLM know
21
228
 
22
229
  ### 1. DATA-DRIVEN CONTENT ONLY
23
230
  - **EVERY piece of content** must come from a Mustache binding: \`{{propertyName}}\`
24
- - **NEVER invent, assume, or add content** that doesn't exist in the provided JSON
231
+ - **NEVER invent, assume, or add content** that doesn't exist in the provided data
25
232
  - **NO hardcoded descriptions, lists, or facts**
26
- - If a property doesn't exist in JSON, don't create a section for it
233
+ - If a property doesn't exist in data, don't create a section for it
27
234
 
28
235
  ### 2. ALLOWED CONTEXTUAL ADDITIONS
29
236
  You MAY add:
@@ -32,51 +239,47 @@ You MAY add:
32
239
  - **Structural markers** for clarity (e.g., "Navigation:", "Metadata:")
33
240
 
34
241
  You MAY NOT add:
35
- - Descriptions of features/benefits not in JSON
242
+ - Descriptions of features/benefits not in data
36
243
  - Explanatory text about what something does
37
244
  - Lists of items not present in data
38
245
  - Assumptions about the page purpose
39
246
 
40
- ### 3. EXAMPLES OF VIOLATIONS
247
+ ### 3. UNDERSTANDING TOON FORMAT
41
248
 
42
- \u274C **BAD - Hardcoded content:**
43
- \`\`\`mustache
44
- ## Key Benefits
45
- - Real-time monitoring
46
- - Detailed analytics
47
- - Easy to use
48
- \`\`\`
49
- *Problem: These benefits are invented, not from JSON*
249
+ The data below is in **TOON format** (Token-Oriented Object Notation) for efficiency.
50
250
 
51
- \u274C **BAD - Invented descriptions:**
52
- \`\`\`mustache
53
- This dashboard provides comprehensive monitoring capabilities for call centers...
54
- \`\`\`
55
- *Problem: Description is made up*
251
+ **How to read TOON:**
252
+ - \`propertyName: value\` \u2192 Single property
253
+ - \`array[3]{prop1,prop2}\` \u2192 Array of 3 objects with properties prop1, prop2
254
+ - Properties in \`{braces}\` are the **exact field names** to use in Mustache bindings
56
255
 
57
- \u2705 **GOOD - Data-driven with context:**
58
- \`\`\`mustache
59
- {{#features.0}}
60
- ## Available Features
61
- {{#features}}
62
- - **{{name}}**: {{description}}
63
- {{/features}}
64
- {{/features.0}}
256
+ **Example:**
257
+ \`\`\`toon
258
+ users[2]{id,name,role}:
259
+ 1 Alice admin
260
+ 2 Bob user
65
261
  \`\`\`
66
- *Good: Content comes from JSON, heading provides context*
67
262
 
68
- \u2705 **GOOD - Minimal introduction:**
263
+ **Your Mustache template:**
69
264
  \`\`\`mustache
70
- {{#items.0}}
71
- ## Items Overview
72
- The following items are available:
73
- {{#items}}
74
- - {{title}}
75
- {{/items}}
76
- {{/items.0}}
265
+ {{#users}}
266
+ - {{id}}: {{name}} ({{role}})
267
+ {{/users}}
77
268
  \`\`\`
78
- *Good: Brief intro, but content is from JSON*
79
269
 
270
+ **CRITICAL:** Use the EXACT property names shown in TOON \`{braces}\` for your Mustache bindings.
271
+
272
+ ---
273
+ ## \u{1F6AB} Content Exclusion Rules
274
+ When generating the template, **DO NOT** create sections for these types of data even if they appear in TOON:
275
+ - **Image/media properties**: URLs to images, avatars, thumbnails, icons, or any media files
276
+ - **UI-only labels**: Search placeholders, filter menu text, "Show More"/"Show Less", pagination labels
277
+ - **SEO/meta fields**: ogTitle, ogDescription, ogImage, canonical URLs, changefreq, priority, sitemap fields
278
+ - **Legal boilerplate**: Copyright text, "All rights reserved"
279
+ - **Navigation chrome**: Breadcrumbs, menu items, footer links \u2014 unless they ARE the primary page content
280
+ - **System identifiers**: Internal IDs, GUIDs, sort orders, node paths, template aliases
281
+ - **Empty/null values**: Skip any property that holds no meaningful value
282
+ **Focus ONLY on**: titles, descriptions, features, pricing, reviews, specifications, contact info, and other business-relevant content.
80
283
  ---
81
284
 
82
285
  ## \u{1F3AF} TRUE PURPOSE: Help LLMs Answer Questions Efficiently
@@ -100,9 +303,10 @@ These \`.md\` files are **LLM knowledge base entries** designed for **inference*
100
303
  - **Template Alias:** ${request.templateAlias}
101
304
  - **JSON Path:** ${request.jpath}
102
305
 
103
- ### Available Data
104
- \`\`\`json
105
- ${JSON.stringify(request.pageContent, null, 2)}
306
+ ### Available Data (TOON Format)
307
+
308
+ \`\`\`toon
309
+ ${toonData}
106
310
  \`\`\`
107
311
 
108
312
  ---
@@ -117,13 +321,13 @@ ${JSON.stringify(request.pageContent, null, 2)}
117
321
  ### 2. Structure for Question-Answering
118
322
  Anticipate questions an LLM might need to answer:
119
323
  - "What is this?" \u2192 Main heading + description properties
120
- - "What does it offer?" \u2192 Lists of items/features from JSON
324
+ - "What does it offer?" \u2192 Lists of items/features from data
121
325
  - "Who is it for?" \u2192 Target audience properties (if they exist)
122
326
  - "What are the details?" \u2192 Technical/metadata properties
123
327
 
124
- ### 3. Prioritize by JSON Structure
328
+ ### 3. Prioritize by Data Importance
125
329
  **Essential First:**
126
- - Root-level title/name/heading properties
330
+ - Title/name/heading properties
127
331
  - Description/summary properties
128
332
  - Main content arrays
129
333
 
@@ -146,17 +350,15 @@ Anticipate questions an LLM might need to answer:
146
350
 
147
351
  ## \u{1F527} Technical Principles (Key-Agnostic Design)
148
352
 
149
- ### 1. Dynamic Property Inference
150
- **Do not assume fixed property names.** Infer content type from:
151
- - **Value structure:** Object, array, string, number
152
- - **Value length:** Short strings = titles; long text = descriptions
153
- - **Position in JSON:** Root-level = high importance
154
- - **Semantic patterns:** URLs, images, dates
353
+ ### 1. Extract Property Names from TOON
354
+ Look at TOON headers to identify properties:
355
+ - \`{id,name,role}\` \u2192 Use \`{{id}}\`, \`{{name}}\`, \`{{role}}\`
356
+ - \`breadcrumbsLinks[5]{title,link}\` \u2192 Use \`{{#breadcrumbsLinks}}{{title}} {{link}}{{/breadcrumbsLinks}}\`
155
357
 
156
358
  ### 2. Exact Property Bindings
157
- - Always use **exact property name** from JSON: \`{{actualKeyName}}\`
359
+ - Always use **exact property name** from TOON: \`{{actualKeyName}}\`
158
360
  - Do NOT rename or modify binding identifiers
159
- - Mustache bindings must match JSON precisely
361
+ - Mustache bindings must match TOON property names precisely
160
362
 
161
363
  ### 3. Humanized Section Headings
162
364
  While bindings stay exact, convert keys to readable headings:
@@ -164,20 +366,33 @@ While bindings stay exact, convert keys to readable headings:
164
366
  - \`supportPageItems\` \u2192 "Available Support Topics"
165
367
  - \`breadcrumbsLinks\` \u2192 "Navigation Path"
166
368
 
167
- ### 4. Semantic Interpretation Guide
168
- - **Short root strings (5-50 chars)** \u2192 Likely page title
169
- - **Medium text (50-300 chars)** \u2192 Likely summary/tagline
170
- - **Long text (300+ chars)** \u2192 Likely detailed description
171
- - **Arrays of objects** \u2192 Repeated sections with structure
172
- - **Arrays of primitives** \u2192 Bullet lists
173
- - **URL-like strings** \u2192 Render as \`[Label]({{url}})\`
369
+ ### 4. Working with Arrays
370
+ When you see \`arrayName[N]{prop1,prop2}\`:
371
+ - Use \`{{#arrayName.0}}\` to check if array exists
372
+ - Iterate with \`{{#arrayName}}\`
373
+ - Access properties with \`{{prop1}}\`, \`{{prop2}}\`
374
+
375
+ **Example:**
376
+ \`\`\`toon
377
+ items[3]{title,description}:
378
+ ...
379
+ \`\`\`
380
+ \u2192
381
+ \`\`\`mustache
382
+ {{#items.0}}
383
+ ## Items
384
+ {{#items}}
385
+ - {{title}}: {{description}}
386
+ {{/items}}
387
+ {{/items.0}}
388
+ \`\`\`
174
389
 
175
390
  ### 5. Noise Filtering
176
- **Exclude technical metadata:**
391
+ **Exclude technical metadata** (if present in TOON):
177
392
  - IDs: \`id\`, \`nodeId\`, \`_id\`, \`guid\`
178
393
  - Timestamps: \`createdAt\`, \`updatedAt\`
179
394
  - Flags: \`isPublished\`, \`sortOrder\`, \`hidden\`
180
- - System: \`_type\`, \`contentType\`, \`template\`
395
+ - System: \`_type\`, \`contentType\`
181
396
 
182
397
  ### 6. Hierarchy & Nesting
183
398
  - **Root level** \u2192 \`#\` (H1) \u2014 one per document
@@ -198,7 +413,7 @@ While bindings stay exact, convert keys to readable headings:
198
413
  {{/summaryProperty}}
199
414
  \`\`\`
200
415
 
201
- ### Recommended Sections (adapt to actual JSON)
416
+ ### Example Sections (adapt to actual TOON data)
202
417
  \`\`\`mustache
203
418
  {{#mainDescription}}
204
419
  ## Overview
@@ -219,45 +434,94 @@ While bindings stay exact, convert keys to readable headings:
219
434
  - [{{title}}]({{link}})
220
435
  {{/navigationLinks}}
221
436
  {{/navigationLinks.0}}
222
-
223
- {{#technicalData}}
224
- ## Technical Information
225
- - **URL**: {{url}}
226
- - **Type**: {{type}}
227
- {{/technicalData}}
228
437
  \`\`\`
229
438
 
230
- **Important:** These are examples. Your template must match the ACTUAL JSON structure provided.
439
+ **Important:** These are examples. Your template must match the ACTUAL TOON structure provided.
231
440
 
232
441
  ---
233
442
 
234
443
  ## \u2705 Output Requirements
235
444
 
236
445
  1. **Output ONLY the Mustache template** \u2014 no explanations, no markdown code fences, no preamble
237
- 2. **Use exact JSON property names** in all bindings
446
+ 2. **Use exact property names from TOON \`{braces}\`** in all bindings
238
447
  3. **Generate clean Markdown** \u2014 no HTML, entities, or attributes
239
448
  4. **Data-driven content** \u2014 no invented facts or descriptions
240
- 5. **Contextual headings allowed** \u2014 but content must be from JSON
449
+ 5. **Contextual headings allowed** \u2014 but content must be from data
241
450
  6. **Be concise** \u2014 optimize for limited context windows
242
451
  7. **Structure for questions** \u2014 LLMs should easily extract facts
243
452
 
244
453
  ---
245
454
 
455
+ ## \u26A0\uFE0F CRITICAL: Mustache Syntax Validation
456
+
457
+ **Every \`{{#tag}}\` MUST have matching \`{{/tag}}\`**
458
+
459
+ ### Common Errors (from real failures):
460
+
461
+ \u274C **Missing closing tag:**
462
+ \`\`\`mustache
463
+ {{#pageDescription}}
464
+ content
465
+ // \u274C Missing {{/pageDescription}}
466
+ \`\`\`
467
+
468
+ \u274C **Nested check without outer closing:**
469
+ \`\`\`mustache
470
+ {{#items.0}}
471
+ {{#items}}...{{/items}}
472
+ // \u274C Missing {{/items.0}}
473
+ \`\`\`
474
+
475
+ \u274C **Capitalization mismatch:**
476
+ \`\`\`mustache
477
+ {{#aIFeaturesCTATitle}}
478
+ ...
479
+ {{/aiFeaturesCTATitle}} \u274C Different capitalization!
480
+ \`\`\`
481
+
482
+ ### Validation Checklist:
483
+
484
+ **Before output:**
485
+ 1. Count \`{{#\` tags = ___
486
+ 2. Count \`{{/\` tags = ___
487
+ 3. Numbers match? If NO \u2192 Find and add missing closing tags
488
+ 4. Tag names exact match (including dots, numbers, capitalization)?
489
+
490
+ \u2705 **Valid example:**
491
+ \`\`\`mustache
492
+ {{#section}} \u2190 1 open
493
+ {{#nested.0}} \u2190 2 open
494
+ content
495
+ {{/nested.0}} \u2190 2 close
496
+ {{/section}} \u2190 1 close
497
+ \`\`\`
498
+ Count: 2 = 2 \u2713
499
+
500
+ ---
501
+
246
502
  ## \u{1F680} Your Task
247
503
 
248
- Analyze the provided JSON structure and **generate a Mustache template** that:
504
+ Analyze the provided TOON data structure and **generate a Mustache template** that:
249
505
 
250
- 1. **Uses ONLY data from JSON** (no invented content)
251
- 2. **Adds logical section headings** for context
252
- 3. **Structures data for question-answering**
253
- 4. **Prioritizes most important properties first**
254
- 5. **Remains universal** (works for any JSON shape)
506
+ 1. **Uses ONLY data from TOON** (no invented content)
507
+ 2. **Extracts exact property names from \`{braces}\`**
508
+ 3. **Adds logical section headings** for context
509
+ 4. **Structures data for question-answering**
510
+ 5. **Prioritizes most important properties first**
511
+ 6. **Remains universal** (works for any data shape)
512
+ 7. **\u2705 ALL Mustache tags properly closed**
255
513
 
256
514
  **Remember:**
257
- - Headings can be contextual: \u2705
258
- - Content must be from JSON: \u2705\u2705\u2705
259
- - No made-up descriptions: \u274C
260
- - No assumed features: \u274C
515
+ - Parse TOON structure naturally \u2705
516
+ - Use exact property names from \`{braces}\` \u2705\u2705\u2705
517
+ - Headings can be contextual \u2705
518
+ - Content must be from data \u2705\u2705\u2705
519
+ - No made-up descriptions \u274C
520
+ - No assumed features \u274C
521
+ - **Every {{#tag}} has {{/tag}}** \u2705\u2705\u2705
522
+
523
+ **Final Step Before Output:**
524
+ Count your \`{{#\` and \`{{/\` tags. If numbers don't match, find and add missing closing tags.
261
525
 
262
526
  Generate the template now.
263
527
  `;
@@ -727,208 +991,11 @@ function getValueType(value) {
727
991
  return typeof value;
728
992
  }
729
993
 
730
- function extractPageContent(umbracoData, jpath) {
731
- try {
732
- const result = JSONPath({
733
- path: jpath,
734
- json: umbracoData.SiteData,
735
- wrap: false
736
- });
737
- if (!result || Array.isArray(result) && result.length === 0) {
738
- return null;
739
- }
740
- const pageContent = Array.isArray(result) ? result[0] : result;
741
- return excludeChildrenFromContent(pageContent);
742
- } catch (error) {
743
- console.error(`Failed to extract content for path ${jpath}:`, error);
744
- return null;
745
- }
746
- }
747
- function excludeChildrenFromContent(content) {
748
- if (!content || typeof content !== "object") {
749
- return content;
750
- }
751
- const cleanContent = { ...content };
752
- if ("children" in cleanContent) {
753
- delete cleanContent.children;
754
- }
755
- return cleanContent;
756
- }
757
- function generatePageId(urlItem) {
758
- const templateAlias = urlItem.TemplateAlias || "UnknownTemplate";
759
- const nodeID = urlItem.nodeID || "UnknownNode";
760
- return `${templateAlias}_${nodeID}`;
761
- }
762
- function isImportantKey(key) {
763
- const importantPatterns = [
764
- "title",
765
- "name",
766
- "heading",
767
- "description",
768
- "summary",
769
- "content",
770
- "text",
771
- "body",
772
- "value",
773
- "label",
774
- "caption",
775
- "alt",
776
- "message",
777
- "url",
778
- "link",
779
- "href"
780
- ];
781
- const lowerKey = key.toLowerCase();
782
- return importantPatterns.some((pattern) => lowerKey.includes(pattern));
783
- }
784
- function isMetadataKey(key) {
785
- const metadataPatterns = [
786
- "id",
787
- "guid",
788
- "key",
789
- "_id",
790
- "nodeid",
791
- "created",
792
- "updated",
793
- "modified",
794
- "timestamp",
795
- "date",
796
- "sort",
797
- "order",
798
- "index",
799
- "position",
800
- "published",
801
- "hidden",
802
- "visible",
803
- "enabled",
804
- "status",
805
- "type",
806
- "contenttype",
807
- "template",
808
- "alias",
809
- "path",
810
- "meta",
811
- "metadata",
812
- "seo",
813
- "schema",
814
- "properties"
815
- ];
816
- const lowerKey = key.toLowerCase();
817
- return metadataPatterns.some((pattern) => lowerKey.includes(pattern));
818
- }
819
- function recursiveTruncate(content, maxTokens, currentDepth = 0) {
820
- if (currentDepth > 10) {
821
- return { _truncated: "Max depth reached" };
822
- }
823
- if (maxTokens < 10) {
824
- return void 0;
825
- }
826
- if (content === null || content === void 0) {
827
- return content;
828
- }
829
- if (typeof content !== "object") {
830
- if (typeof content === "string" && content.length > 2e3) {
831
- return content.substring(0, 2e3) + "...";
832
- }
833
- return content;
834
- }
835
- if (Array.isArray(content)) {
836
- if (content.length === 0)
837
- return content;
838
- const itemLimit = Math.max(3, Math.floor(15 / (currentDepth + 1)));
839
- const tokensPerItem = Math.floor(maxTokens / Math.min(content.length, itemLimit));
840
- const truncatedArray = content.slice(0, itemLimit).map((item) => recursiveTruncate(item, tokensPerItem, currentDepth + 1)).filter((item) => item !== void 0);
841
- if (content.length > truncatedArray.length) {
842
- truncatedArray.push({
843
- _note: `... and ${content.length - truncatedArray.length} more items`
844
- });
845
- }
846
- return truncatedArray;
847
- }
848
- const truncatedObj = {};
849
- const entries = Object.entries(content);
850
- const withoutMetadata = entries.filter(([key]) => !isMetadataKey(key));
851
- if (withoutMetadata.length === 0) {
852
- return { _note: "Only metadata, removed" };
853
- }
854
- const importantEntries = withoutMetadata.filter(([key]) => isImportantKey(key));
855
- const normalEntries = withoutMetadata.filter(([key]) => !isImportantKey(key));
856
- const importantBudget = Math.floor(maxTokens * 0.4);
857
- const tokensPerImportant = importantEntries.length > 0 ? Math.floor(importantBudget / importantEntries.length) : 0;
858
- for (const [key, value] of importantEntries) {
859
- const processedValue = recursiveTruncate(value, tokensPerImportant, currentDepth + 1);
860
- if (processedValue !== void 0) {
861
- truncatedObj[key] = processedValue;
862
- }
863
- }
864
- const usedTokens = estimateContentTokens(truncatedObj);
865
- const remainingBudget = maxTokens - usedTokens;
866
- if (remainingBudget > 100 && normalEntries.length > 0) {
867
- const sortedNormal = normalEntries.sort(([_a, valueA], [_b, valueB]) => {
868
- const sizeA = JSON.stringify(valueA).length;
869
- const sizeB = JSON.stringify(valueB).length;
870
- return sizeA - sizeB;
871
- });
872
- const tokensPerNormal = Math.floor(remainingBudget / sortedNormal.length);
873
- for (const [key, value] of sortedNormal) {
874
- const processedValue = recursiveTruncate(value, tokensPerNormal, currentDepth + 1);
875
- if (processedValue !== void 0) {
876
- truncatedObj[key] = processedValue;
877
- const newSize = estimateContentTokens(truncatedObj);
878
- if (newSize > maxTokens) {
879
- delete truncatedObj[key];
880
- break;
881
- }
882
- }
883
- }
884
- }
885
- return Object.keys(truncatedObj).length > 0 ? truncatedObj : void 0;
886
- }
887
- function emergencyTruncate(content, maxTokens) {
888
- const result = { ...content };
889
- const keys = Object.keys(result).sort((a, b) => {
890
- const aImportant = isImportantKey(a) ? 1 : 0;
891
- const bImportant = isImportantKey(b) ? 1 : 0;
892
- return aImportant - bImportant;
893
- });
894
- for (const key of keys) {
895
- if (estimateContentTokens(result) <= maxTokens)
896
- break;
897
- delete result[key];
898
- console.warn(` Emergency: removed "${key}"`);
899
- }
900
- return result;
901
- }
902
- function estimateContentTokens(content) {
903
- try {
904
- const jsonString = JSON.stringify(content);
905
- return Math.ceil(jsonString.length / 3);
906
- } catch {
907
- return 0;
908
- }
909
- }
910
- function truncateContentIfNeeded(content, maxTokens = 1e5) {
911
- const estimatedTokens = estimateContentTokens(content);
912
- if (estimatedTokens <= maxTokens) {
913
- return content;
914
- }
915
- console.warn(`\u26A0\uFE0F Content too large (${estimatedTokens} tokens > ${maxTokens} limit), truncating recursively...`);
916
- const truncatedContent = recursiveTruncate(content, maxTokens, 0);
917
- const result = truncatedContent && typeof truncatedContent === "object" && !Array.isArray(truncatedContent) ? truncatedContent : {
918
- _error: "Content truncation failed",
919
- original: content
920
- };
921
- const finalTokens = estimateContentTokens(result);
922
- const preservedKeys = Object.keys(result).length;
923
- const originalKeys = Object.keys(content).length;
924
- console.log(`\u2705 Content truncated: ${estimatedTokens} \u2192 ${finalTokens} tokens (preserved ${preservedKeys}/${originalKeys} root keys)`);
925
- if (finalTokens > maxTokens) {
926
- console.error(`\u274C Recursive truncation insufficient (${finalTokens} > ${maxTokens}), performing emergency truncation...`);
927
- return emergencyTruncate(result, maxTokens);
928
- }
929
- return result;
930
- }
931
-
994
+ const PLACEHOLDER_PATTERNS = [
995
+ "lorem ipsum",
996
+ "dolor sit amet",
997
+ "consectetuer adipi"
998
+ ];
932
999
  function shouldGenerateTemplate(umbracoData, urlItem) {
933
1000
  try {
934
1001
  const pageContent = extractPageContent(umbracoData, urlItem.Jpath);
@@ -941,6 +1008,16 @@ function shouldGenerateTemplate(umbracoData, urlItem) {
941
1008
  console.log(`Page ${urlItem.url} is hidden (hidePage: ${hidePage}), skipping template generation`);
942
1009
  return false;
943
1010
  }
1011
+ const title = pageContent.pageTitle ?? pageContent.pageTittle ?? pageContent.ogTitle ?? pageContent.headerBlockTitle;
1012
+ if (!title || title === "undefined" || title === "null") {
1013
+ console.log(`Page ${urlItem.url} has no valid title, skipping template generation`);
1014
+ return false;
1015
+ }
1016
+ const bodyText = JSON.stringify(pageContent).toLowerCase();
1017
+ if (PLACEHOLDER_PATTERNS.some((p) => bodyText.includes(p))) {
1018
+ console.log(`Page ${urlItem.url} contains placeholder text, skipping template generation`);
1019
+ return false;
1020
+ }
944
1021
  return true;
945
1022
  } catch (error) {
946
1023
  console.error(`Error checking visibility for ${urlItem.url}:`, error);
@@ -1156,6 +1233,19 @@ async function performAutomaticCleanup(umbracoData, cacheDir, options = {}) {
1156
1233
  return stats;
1157
1234
  }
1158
1235
 
1236
/**
 * HTML entities decoded by sanitizeRenderedMarkdown, mapped to their literal
 * characters.
 */
const RENDERED_MARKDOWN_ENTITIES = {
  "&#x2F;": "/",
  "&#39;": "'",
  "&#x27;": "'",
  "&quot;": '"',
  "&amp;": "&",
  "&#x3D;": "=",
  "&#x60;": "`",
  "&lt;": "<",
  "&gt;": ">"
};

/**
 * Cleans up Markdown produced by rendering a template.
 *
 * Steps, in order:
 *  1. strip inline images (`![alt](src)`);
 *  2. drop a leading "<number>, " prefix from headings;
 *  3. decode a fixed set of HTML entities (exactly one level);
 *  4. remove list items that are only a label ending in ":";
 *  5. remove links whose anchor text is the Hebrew word "הרחבה";
 *  6. collapse runs of "/" that do not directly follow ":" (keeps "://");
 *  7. remove level 2-6 headings that have no content before the next
 *     heading or the end of the document;
 *  8. collapse three or more consecutive newlines into one blank line.
 *
 * @param {string} markdown - Rendered Markdown text.
 * @returns {string} The cleaned, trimmed Markdown.
 */
function sanitizeRenderedMarkdown(markdown) {
  let output = markdown;
  output = output.replace(/!\[.*?]\(.*?\)/g, "");
  output = output.replace(/^(#{1,6})\s+\d+,\s*/gm, "$1 ");
  // Decode in a single pass so already-decoded text is never re-scanned:
  // "&amp;lt;" must become "&lt;", not "<" (chained replaces double-decode).
  output = output.replace(
    /&(?:#x2F|#39|#x27|quot|amp|#x3D|#x60|lt|gt);/g,
    (entity) => RENDERED_MARKDOWN_ENTITIES[entity]
  );
  output = output.replace(/^- .+?:\s*$/gm, "");
  output = output.replace(/\[הרחבה]\([^)]*\)/g, "");
  output = output.replace(/(?<!:)\/{2,}/g, "/");
  // A heading is "empty" only when nothing but blank lines separates it from
  // the next heading or the true end of input. `(?![\s\S])` is used instead
  // of `$` because, under the `m` flag, `$` also matches before every
  // newline, which made a heading followed by 2+ blank lines and real content
  // look empty. `[^\S\n]` (whitespace except newline) keeps each repetition
  // on a single line, so the pattern cannot backtrack across blank lines.
  output = output.replace(
    /^#{2,6}[^\S\n]+.+\n(?:[^\S\n]*\n)+(?=#{1,6}\s|(?![\s\S]))/gm,
    ""
  );
  output = output.replace(/\n{3,}/g, "\n\n");
  return output.trim();
}
1248
+
1159
1249
  class TemplateGenerator {
1160
1250
  anthropicClient;
1161
1251
  promptAnalyzer;
@@ -1264,12 +1354,11 @@ class TemplateGenerator {
1264
1354
  const pageId = generatePageId(urlItem);
1265
1355
  console.log(`Generating new template for ${pageId} (${urlItem.url})`);
1266
1356
  const tokensBeforeTruncation = estimateContentTokens(pageContent);
1267
- const truncatedContent = truncateContentIfNeeded(pageContent, 65e3);
1357
+ const truncatedContent = truncateContentIfNeeded(pageContent, this.config.maxTokens);
1268
1358
  const tokensAfterTruncation = estimateContentTokens(truncatedContent);
1269
1359
  if (tokensBeforeTruncation > tokensAfterTruncation) {
1270
1360
  console.warn(`Page ${pageId} content truncated: ${tokensBeforeTruncation} -> ${tokensAfterTruncation} tokens`);
1271
1361
  }
1272
- this.promptAnalyzer.analyzeContent(truncatedContent, urlItem);
1273
1362
  const request = {
1274
1363
  pageContent: truncatedContent,
1275
1364
  templateAlias: urlItem.TemplateAlias,
@@ -1300,7 +1389,14 @@ class TemplateGenerator {
1300
1389
  }
1301
1390
  async renderTemplate(template, data) {
1302
1391
  return withErrorHandling(async () => {
1303
- return Mustache.render(template, data);
1392
+ const originalEscape = Mustache.escape;
1393
+ Mustache.escape = (text) => text;
1394
+ try {
1395
+ const rendered = Mustache.render(template, data);
1396
+ return sanitizeRenderedMarkdown(rendered);
1397
+ } finally {
1398
+ Mustache.escape = originalEscape;
1399
+ }
1304
1400
  }, {
1305
1401
  template: template.substring(0, 200) + "...",
1306
1402
  dataKeys: Object.keys(data)
@@ -1429,7 +1525,8 @@ class LLMSFilesGenerator {
1429
1525
  content += `This website contains comprehensive information about ${siteTitle.toLowerCase()}. The content is organized into the following sections:
1430
1526
 
1431
1527
  `;
1432
- const pagesByCategory = this.groupPagesByCategory(mdFiles);
1528
+ const deduplicatedFiles = this.deduplicateByUrl(mdFiles);
1529
+ const pagesByCategory = this.groupPagesByCategory(deduplicatedFiles);
1433
1530
  for (const [category, pages] of Object.entries(pagesByCategory)) {
1434
1531
  if (pages.length === 0)
1435
1532
  continue;
@@ -1472,7 +1569,8 @@ class LLMSFilesGenerator {
1472
1569
  `;
1473
1570
  }
1474
1571
  content += "---\n\n";
1475
- for (const mdFile of mdFiles) {
1572
+ const deduplicatedFiles = this.deduplicateByUrl(mdFiles);
1573
+ for (const mdFile of deduplicatedFiles) {
1476
1574
  const urlItem = this.umbracoData.urlList.find((item) => item.url === mdFile.url);
1477
1575
  if (!urlItem)
1478
1576
  continue;
@@ -1513,6 +1611,15 @@ class LLMSFilesGenerator {
1513
1611
  * /marketplace -> category "marketplace"
1514
1612
  * / -> category "main"
1515
1613
  */
1614
+ deduplicateByUrl(mdFiles) {
1615
+ const seen = /* @__PURE__ */ new Set();
1616
+ return mdFiles.filter((file) => {
1617
+ if (seen.has(file.url))
1618
+ return false;
1619
+ seen.add(file.url);
1620
+ return true;
1621
+ });
1622
+ }
1516
1623
  groupPagesByCategory(mdFiles) {
1517
1624
  const categories = {};
1518
1625
  for (const mdFile of mdFiles) {
@@ -1558,8 +1665,7 @@ class LLMSFilesGenerator {
1558
1665
  }
1559
1666
  extractSiteTitle() {
1560
1667
  const siteData = this.umbracoData.SiteData;
1561
- const rawTitle = siteData?.pageTitle || siteData?.mainHeaderBlockTitle || "Website Documentation";
1562
- return rawTitle;
1668
+ return siteData?.pageTitle || siteData?.mainHeaderBlockTitle || "Website Documentation";
1563
1669
  }
1564
1670
  extractSiteDescription() {
1565
1671
  const siteData = this.umbracoData.SiteData;
@@ -1582,11 +1688,11 @@ class LLMSFilesGenerator {
1582
1688
  const pageContent = extractPageContent(this.umbracoData, urlItem.Jpath);
1583
1689
  if (!pageContent)
1584
1690
  return `${urlItem.TemplateAlias} page`;
1585
- const desc = pageContent.pageDescription || pageContent.description || pageContent.headerBlockSubtitle;
1586
- if (desc && typeof desc === "string") {
1587
- return desc;
1691
+ const desc = pageContent.pageDescription || pageContent.description || pageContent.headerBlockSubtitle || pageContent.ogDescription;
1692
+ if (desc && typeof desc === "string" && desc.trim().length > 0) {
1693
+ return desc.trim();
1588
1694
  }
1589
- return `Information about ${urlItem.url}`;
1695
+ return `${urlItem.TemplateAlias} page`;
1590
1696
  }
1591
1697
  sanitizeUrlForFilename(url) {
1592
1698
  if (!url || url === "/")
package/dist/module.d.mts CHANGED
@@ -14,6 +14,7 @@ declare const LLMSConfigSchema: z.ZodObject<{
14
14
  baseSiteUrl: z.ZodOptional<z.ZodString>;
15
15
  baseSiteUrlUmbracoDataKey: z.ZodOptional<z.ZodString>;
16
16
  maxConcurrent: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
17
+ maxTokens: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
17
18
  enableLLMSFullTxt: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
18
19
  enableIndividualMd: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
19
20
  enableAutoCleanup: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
package/dist/module.d.ts CHANGED
@@ -14,6 +14,7 @@ declare const LLMSConfigSchema: z.ZodObject<{
14
14
  baseSiteUrl: z.ZodOptional<z.ZodString>;
15
15
  baseSiteUrlUmbracoDataKey: z.ZodOptional<z.ZodString>;
16
16
  maxConcurrent: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
17
+ maxTokens: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
17
18
  enableLLMSFullTxt: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
18
19
  enableIndividualMd: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
19
20
  enableAutoCleanup: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
package/dist/module.json CHANGED
@@ -4,5 +4,5 @@
4
4
  "compatibility": {
5
5
  "nuxt": "^3.0.0"
6
6
  },
7
- "version": "0.1.11"
7
+ "version": "0.1.13"
8
8
  }
package/dist/module.mjs CHANGED
@@ -1,4 +1,4 @@
1
- export { l as default } from './shared/nuxt-llms-generator.bc139143.mjs';
1
+ export { l as default } from './shared/nuxt-llms-generator.db76a78e.mjs';
2
2
  import '@nuxt/kit';
3
3
  import 'fs';
4
4
  import 'path';
@@ -38,6 +38,7 @@ const LLMSConfigSchema = z.object({
38
38
  ).describe("The base URL of the website to append to links in generated llms files"),
39
39
  baseSiteUrlUmbracoDataKey: z.string().optional().describe("If the SiteData of UmbracoData has the key with the base URL you can pass here the key to auto extract the base url"),
40
40
  maxConcurrent: z.number().int().min(1, "maxConcurrent must be at least 1").max(10, "maxConcurrent should not exceed 10 to avoid rate limits").optional().default(3),
41
+ maxTokens: z.number().int().min(1e3, "maxTokens must be at least 1000").max(2e5, "maxTokens should not exceed 200000").optional().default(65e3).describe("Maximum tokens for page content before truncation"),
41
42
  enableLLMSFullTxt: z.boolean().optional().default(true),
42
43
  enableIndividualMd: z.boolean().optional().default(true),
43
44
  enableAutoCleanup: z.boolean().optional().default(true),
@@ -225,6 +226,7 @@ function convertHtmlToMarkdownDeep(input) {
225
226
  const DEFAULT_OPTIONS = {
226
227
  anthropicModel: "claude-3-7-sonnet-latest",
227
228
  maxConcurrent: 5,
229
+ maxTokens: 65e3,
228
230
  enableLLMSFullTxt: true,
229
231
  enableIndividualMd: true,
230
232
  templatesDir: ".llms-templates",
@@ -271,6 +273,7 @@ const llmsModule = defineNuxtModule({
271
273
  finalOutputDir: resolve(nuxt.options.rootDir, options.finalOutputDir ?? "public"),
272
274
  anthropicModel: options.anthropicModel || DEFAULT_OPTIONS.anthropicModel,
273
275
  maxConcurrent: options.maxConcurrent || DEFAULT_OPTIONS.maxConcurrent,
276
+ maxTokens: options.maxTokens ?? DEFAULT_OPTIONS.maxTokens,
274
277
  enableLLMSFullTxt: options.enableLLMSFullTxt ?? DEFAULT_OPTIONS.enableLLMSFullTxt,
275
278
  enableIndividualMd: options.enableIndividualMd ?? DEFAULT_OPTIONS.enableIndividualMd,
276
279
  enableAutoCleanup: options.enableAutoCleanup ?? DEFAULT_OPTIONS.enableAutoCleanup,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@voicenter-team/nuxt-llms-generator",
3
- "version": "0.1.11",
3
+ "version": "0.1.13",
4
4
  "description": "Nuxt 3 module for automatically generating AI-optimized documentation files (llms.txt, llms-full.txt, and individual .md files) from Umbraco CMS data using Anthropic's Claude API.",
5
5
  "repository": "https://github.com/VoicenterTeam/nuxt-llms-generator",
6
6
  "license": "MIT",
@@ -35,6 +35,7 @@
35
35
  "dependencies": {
36
36
  "@anthropic-ai/sdk": "^0.30.0",
37
37
  "@nuxt/kit": "^3.11.2",
38
+ "@toon-format/toon": "^2.1.0",
38
39
  "@voicenter-team/eslint-config-ts": "^1.0.22",
39
40
  "i": "^0.3.7",
40
41
  "jsonpath-plus": "^8.0.0",