@voicenter-team/nuxt-llms-generator 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -242,6 +242,7 @@ npm run build
242
242
  | `enableLLMSFullTxt` | `boolean` | `true` | Generate combined llms-full.txt file |
243
243
  | `enableHtmlToMarkdown` | `boolean` | `true` | Convert HTML content to markdown using [node-html-markdown](https://www.npmjs.com/package/node-html-markdown) |
244
244
  | `maxConcurrent` | `number` | `5` | Maximum concurrent API requests |
245
+ | `maxTokens` | `number` | `65000` | Maximum tokens for page content before truncation (Claude context limit protection) |
245
246
  | `anthropicModel` | `string` | `claude-3-5-sonnet-20241022` | Claude model to use |
246
247
 
247
248
  ### Cleanup Options
@@ -500,6 +501,7 @@ ls -la public/UmbracoData.json
500
501
  ```typescript
501
502
  {
502
503
  maxConcurrent: 8, // Higher concurrency
504
+ maxTokens: 80000, // More content per page (if using larger models)
503
505
  enableAutoCleanup: true, // Keep cache clean
504
506
  }
505
507
  ```
@@ -509,6 +511,7 @@ ls -la public/UmbracoData.json
509
511
  {
510
512
  enableIndividualMd: false, // Skip individual files
511
513
  maxConcurrent: 2, // Lower API usage
514
+ maxTokens: 50000, // Smaller context for faster processing
512
515
  }
513
516
  ```
514
517
 
@@ -518,6 +521,7 @@ ls -la public/UmbracoData.json
518
521
  enableAutoCleanup: true,
519
522
  cleanupOrphaned: true,
520
523
  cleanupHidden: true,
524
+ maxTokens: 65000, // Balance between detail and API limits
521
525
  enableHtmlToMarkdown: true // Clean HTML from CMS content
522
526
  }
523
527
  ```
@@ -570,6 +574,7 @@ interface LLMSConfig {
570
574
  templatesDir?: string; // './.llms-templates'
571
575
  finalOutputDir?: string; // './.output/llms'
572
576
  anthropicModel?: string; // 'claude-3-5-sonnet-20241022'
577
+ maxTokens?: number; // 65000
573
578
  maxConcurrent?: number; // 5
574
579
  enableLLMSFullTxt?: boolean; // true
575
580
  enableIndividualMd?: boolean; // true
@@ -3,14 +3,221 @@ import { join, dirname, basename } from 'path';
3
3
  import { slugify } from 'transliteration';
4
4
  import Mustache from 'mustache';
5
5
  import Anthropic from '@anthropic-ai/sdk';
6
- import { createHash } from 'crypto';
6
+ import { encode } from '@toon-format/toon';
7
7
  import { JSONPath } from 'jsonpath-plus';
8
- import { w as withErrorHandling } from '../shared/nuxt-llms-generator.bc139143.mjs';
8
+ import { createHash } from 'crypto';
9
+ import { w as withErrorHandling } from '../shared/nuxt-llms-generator.db76a78e.mjs';
9
10
  import '@nuxt/kit';
10
11
  import 'zod';
11
12
  import 'node-html-markdown';
12
13
 
14
/**
 * Looks up a page node inside `umbracoData.SiteData` with a JSONPath query and
 * returns it without its `children` property.
 *
 * @param {object} umbracoData - Object whose `SiteData` property is queried.
 * @param {string} jpath - JSONPath expression handed to `JSONPath` as `path`.
 * @returns {*} The match passed through `excludeChildrenFromContent`, or
 *   `null` when the query yields nothing or any step throws.
 */
function extractPageContent(umbracoData, jpath) {
  try {
    const matched = JSONPath({
      path: jpath,
      json: umbracoData.SiteData,
      wrap: false
    });
    if (Array.isArray(matched)) {
      // An array result is reduced to its first element; an empty one means "no match".
      return matched.length === 0 ? null : excludeChildrenFromContent(matched[0]);
    }
    return matched ? excludeChildrenFromContent(matched) : null;
  } catch (error) {
    // Best effort: a failed lookup is logged and reported as "no content".
    console.error(`Failed to extract content for path ${jpath}:`, error);
    return null;
  }
}
31
/**
 * Returns a shallow copy of `content` with its own `children` property left out.
 *
 * Falsy values and non-objects are handed back untouched; the input object is
 * never mutated.
 *
 * @param {*} content - Candidate page node.
 * @returns {*} Shallow copy without `children`, or `content` itself when it is
 *   not an object.
 */
function excludeChildrenFromContent(content) {
  const isObject = Boolean(content) && typeof content === "object";
  if (!isObject) {
    return content;
  }
  // Rest destructuring copies every own enumerable property except `children`.
  const { children: omittedChildren, ...withoutChildren } = content;
  return withoutChildren;
}
41
/**
 * Builds the page identifier `<TemplateAlias>_<nodeID>` for a URL list item.
 *
 * Falsy parts (missing, empty string, 0) are replaced by the placeholders
 * "UnknownTemplate" and "UnknownNode" respectively.
 *
 * @param {object} urlItem - Item providing `TemplateAlias` and `nodeID`.
 * @returns {string} The combined identifier.
 */
function generatePageId(urlItem) {
  const { TemplateAlias, nodeID } = urlItem;
  return `${TemplateAlias || "UnknownTemplate"}_${nodeID || "UnknownNode"}`;
}
46
/**
 * Tells whether a property name looks like it carries human-facing content.
 *
 * The check is a case-insensitive substring match, so `pageTitle` and
 * `ctaLinkUrl` both qualify.
 *
 * @param {string} key - Property name to classify.
 * @returns {boolean} `true` when the lower-cased key contains any fragment.
 */
function isImportantKey(key) {
  const fragments = [
    "title",
    "name",
    "heading",
    "description",
    "summary",
    "content",
    "text",
    "body",
    "value",
    "label",
    "caption",
    "alt",
    "message",
    "url",
    "link",
    "href"
  ];
  const normalizedKey = key.toLowerCase();
  for (const fragment of fragments) {
    if (normalizedKey.includes(fragment)) {
      return true;
    }
  }
  return false;
}
68
/**
 * Tells whether a property name looks like technical metadata.
 *
 * The check is a case-insensitive substring match, which makes it deliberately
 * broad: short fragments such as "id" also match names like `video` or
 * `width`.
 *
 * @param {string} key - Property name to classify.
 * @returns {boolean} `true` when the lower-cased key contains any fragment.
 */
function isMetadataKey(key) {
  const fragments = [
    "id",
    "guid",
    "key",
    "_id",
    "nodeid",
    "created",
    "updated",
    "modified",
    "timestamp",
    "date",
    "sort",
    "order",
    "index",
    "position",
    "published",
    "hidden",
    "visible",
    "enabled",
    "status",
    "type",
    "contenttype",
    "template",
    "alias",
    "path",
    "meta",
    "metadata",
    "seo",
    "schema",
    "properties"
  ];
  const normalizedKey = key.toLowerCase();
  const firstHit = fragments.findIndex((fragment) => normalizedKey.includes(fragment));
  return firstHit !== -1;
}
103
/**
 * Recursively shrinks a JSON-like value towards a token budget.
 *
 * Rules, applied in this order:
 * - Deeper than 10 levels: the value is replaced by a `{ _truncated }` marker.
 * - Budget below 10 tokens: the value is dropped (`undefined` is returned).
 * - `null` / `undefined` pass through unchanged.
 * - Strings longer than 2000 characters are cut and suffixed with "...";
 *   other primitives pass through unchanged (the budget is not consulted).
 * - Arrays keep at most `max(3, floor(15 / (depth + 1)))` leading items, split
 *   the budget evenly between them and get a trailing `{ _note }` entry
 *   counting the items that are not present in the output.
 * - Objects lose every key matched by `isMetadataKey`. Keys matched by
 *   `isImportantKey` are processed first and share 40% of the budget; the
 *   remaining keys are added smallest-first until adding one would push the
 *   object's estimated size over `maxTokens`.
 *
 * @param {*} content - Value to shrink.
 * @param {number} maxTokens - Token budget for this value.
 * @param {number} [currentDepth=0] - Recursion depth of `content`.
 * @returns {*} The shrunken value, or `undefined` when nothing fits.
 */
function recursiveTruncate(content, maxTokens, currentDepth = 0) {
  if (currentDepth > 10) {
    return { _truncated: "Max depth reached" };
  }
  if (maxTokens < 10) {
    return undefined;
  }
  if (content === null || content === undefined) {
    return content;
  }
  if (typeof content !== "object") {
    if (typeof content === "string" && content.length > 2e3) {
      return content.substring(0, 2e3) + "...";
    }
    return content;
  }
  if (Array.isArray(content)) {
    if (content.length === 0) {
      return content;
    }
    const itemLimit = Math.max(3, Math.floor(15 / (currentDepth + 1)));
    const tokensPerItem = Math.floor(maxTokens / Math.min(content.length, itemLimit));
    const truncatedArray = content
      .slice(0, itemLimit)
      .map((item) => recursiveTruncate(item, tokensPerItem, currentDepth + 1))
      .filter((item) => item !== undefined);
    if (content.length > truncatedArray.length) {
      truncatedArray.push({
        _note: `... and ${content.length - truncatedArray.length} more items`
      });
    }
    return truncatedArray;
  }
  const truncatedObj = {};
  const withoutMetadata = Object.entries(content).filter(([key]) => !isMetadataKey(key));
  if (withoutMetadata.length === 0) {
    return { _note: "Only metadata, removed" };
  }
  const importantEntries = withoutMetadata.filter(([key]) => isImportantKey(key));
  const normalEntries = withoutMetadata.filter(([key]) => !isImportantKey(key));
  const importantBudget = Math.floor(maxTokens * 0.4);
  const tokensPerImportant = importantEntries.length > 0 ? Math.floor(importantBudget / importantEntries.length) : 0;
  for (const [key, value] of importantEntries) {
    const processedValue = recursiveTruncate(value, tokensPerImportant, currentDepth + 1);
    if (processedValue !== undefined) {
      truncatedObj[key] = processedValue;
    }
  }
  const usedTokens = estimateContentTokens(truncatedObj);
  const remainingBudget = maxTokens - usedTokens;
  if (remainingBudget > 100 && normalEntries.length > 0) {
    // JSON.stringify returns undefined (not a string) for undefined, functions
    // and symbols, and throws on circular structures or BigInt. Measuring
    // through this helper keeps the sort from crashing on such values.
    const serializedSize = (value) => {
      try {
        const json = JSON.stringify(value);
        return typeof json === "string" ? json.length : 0;
      } catch {
        // Unserializable values are treated as the largest so they are tried last.
        return Number.MAX_SAFE_INTEGER;
      }
    };
    // Sizes are computed once per entry instead of on every comparison.
    const sortedNormal = normalEntries
      .map(([key, value]) => ({ key, value, size: serializedSize(value) }))
      .sort((a, b) => a.size - b.size);
    const tokensPerNormal = Math.floor(remainingBudget / sortedNormal.length);
    for (const { key, value } of sortedNormal) {
      const processedValue = recursiveTruncate(value, tokensPerNormal, currentDepth + 1);
      if (processedValue !== undefined) {
        truncatedObj[key] = processedValue;
        const newSize = estimateContentTokens(truncatedObj);
        if (newSize > maxTokens) {
          // This entry overshoots the budget: undo it and stop adding more.
          delete truncatedObj[key];
          break;
        }
      }
    }
  }
  return Object.keys(truncatedObj).length > 0 ? truncatedObj : undefined;
}
171
/**
 * Last-resort shrinking: removes root keys from a shallow copy of `content`
 * until its estimated size fits into `maxTokens`.
 *
 * Keys that `isImportantKey` rejects are removed first (in their original
 * order), followed by the important ones. Every removal is logged.
 *
 * @param {object} content - Object to shrink; it is not mutated.
 * @param {number} maxTokens - Token budget the result should fit into.
 * @returns {object} The reduced copy (possibly empty).
 */
function emergencyTruncate(content, maxTokens) {
  const trimmed = { ...content };
  const rootKeys = Object.keys(trimmed);
  const removalOrder = [
    ...rootKeys.filter((key) => !isImportantKey(key)),
    ...rootKeys.filter((key) => isImportantKey(key))
  ];
  for (const key of removalOrder) {
    if (estimateContentTokens(trimmed) <= maxTokens) {
      break;
    }
    delete trimmed[key];
    console.warn(` Emergency: removed "${key}"`);
  }
  return trimmed;
}
186
/**
 * Roughly estimates the token count of a value as one token per three
 * characters of its JSON serialization, rounded up.
 *
 * @param {*} content - Value to measure.
 * @returns {number} The estimate, or 0 when the value has no JSON string form
 *   (e.g. `undefined`) or serialization throws (e.g. circular references).
 */
function estimateContentTokens(content) {
  let serialized;
  try {
    serialized = JSON.stringify(content);
  } catch {
    return 0;
  }
  if (typeof serialized !== "string") {
    return 0;
  }
  return Math.ceil(serialized.length / 3);
}
194
/**
 * Returns `content` unchanged when its estimated size fits into `maxTokens`;
 * otherwise shrinks it with `recursiveTruncate` and, if that is still too
 * large, with `emergencyTruncate`.
 *
 * NOTE(review): when `recursiveTruncate` does not produce a plain object, the
 * fallback wraps the full original under `original`, so the fallback is itself
 * over budget and is handed to `emergencyTruncate` — confirm this is intended.
 *
 * @param {*} content - Page content to check.
 * @param {number} [maxTokens=1e5] - Token budget.
 * @returns {*} The original content or a reduced object.
 */
function truncateContentIfNeeded(content, maxTokens = 1e5) {
  const tokensBefore = estimateContentTokens(content);
  if (tokensBefore <= maxTokens) {
    return content;
  }
  console.warn(`\u26A0\uFE0F Content too large (${tokensBefore} tokens > ${maxTokens} limit), truncating recursively...`);
  const candidate = recursiveTruncate(content, maxTokens, 0);
  let result;
  if (candidate && typeof candidate === "object" && !Array.isArray(candidate)) {
    result = candidate;
  } else {
    result = {
      _error: "Content truncation failed",
      original: content
    };
  }
  const tokensAfter = estimateContentTokens(result);
  const keptRootKeys = Object.keys(result).length;
  const totalRootKeys = Object.keys(content).length;
  console.log(`\u2705 Content truncated: ${tokensBefore} \u2192 ${tokensAfter} tokens (preserved ${keptRootKeys}/${totalRootKeys} root keys)`);
  if (tokensAfter > maxTokens) {
    console.error(`\u274C Recursive truncation insufficient (${tokensAfter} > ${maxTokens}), performing emergency truncation...`);
    return emergencyTruncate(result, maxTokens);
  }
  return result;
}
215
+
13
216
  function buildLLMSTemplatePrompt(request) {
217
+ const jsonTokens = estimateContentTokens(request.pageContent);
218
+ const toonData = encode(request.pageContent, { delimiter: " " });
219
+ const toonTokens = estimateContentTokens(toonData);
220
+ console.log(`\u{1F4CA} ${request.url}: JSON ${jsonTokens} \u2192 TOON ${toonTokens} (${((1 - toonTokens / jsonTokens) * 100).toFixed(0)}% saved)`);
14
221
  return `# LLMS.txt-Optimized Mustache Template Generator
15
222
 
16
223
  You are an expert at creating **Mustache.js templates** that generate **LLM knowledge base entries** following the [\`llms.txt\` standard](https://llmstxt.org/).
@@ -21,9 +228,9 @@ You are an expert at creating **Mustache.js templates** that generate **LLM know
21
228
 
22
229
  ### 1. DATA-DRIVEN CONTENT ONLY
23
230
  - **EVERY piece of content** must come from a Mustache binding: \`{{propertyName}}\`
24
- - **NEVER invent, assume, or add content** that doesn't exist in the provided JSON
231
+ - **NEVER invent, assume, or add content** that doesn't exist in the provided data
25
232
  - **NO hardcoded descriptions, lists, or facts**
26
- - If a property doesn't exist in JSON, don't create a section for it
233
+ - If a property doesn't exist in data, don't create a section for it
27
234
 
28
235
  ### 2. ALLOWED CONTEXTUAL ADDITIONS
29
236
  You MAY add:
@@ -32,50 +239,35 @@ You MAY add:
32
239
  - **Structural markers** for clarity (e.g., "Navigation:", "Metadata:")
33
240
 
34
241
  You MAY NOT add:
35
- - Descriptions of features/benefits not in JSON
242
+ - Descriptions of features/benefits not in data
36
243
  - Explanatory text about what something does
37
244
  - Lists of items not present in data
38
245
  - Assumptions about the page purpose
39
246
 
40
- ### 3. EXAMPLES OF VIOLATIONS
247
+ ### 3. UNDERSTANDING TOON FORMAT
41
248
 
42
- \u274C **BAD - Hardcoded content:**
43
- \`\`\`mustache
44
- ## Key Benefits
45
- - Real-time monitoring
46
- - Detailed analytics
47
- - Easy to use
48
- \`\`\`
49
- *Problem: These benefits are invented, not from JSON*
249
+ The data below is in **TOON format** (Token-Oriented Object Notation) for efficiency.
50
250
 
51
- \u274C **BAD - Invented descriptions:**
52
- \`\`\`mustache
53
- This dashboard provides comprehensive monitoring capabilities for call centers...
54
- \`\`\`
55
- *Problem: Description is made up*
251
+ **How to read TOON:**
252
+ - \`propertyName: value\` \u2192 Single property
253
+ - \`array[3]{prop1,prop2}\` \u2192 Array of 3 objects with properties prop1, prop2
254
+ - Properties in \`{braces}\` are the **exact field names** to use in Mustache bindings
56
255
 
57
- \u2705 **GOOD - Data-driven with context:**
58
- \`\`\`mustache
59
- {{#features.0}}
60
- ## Available Features
61
- {{#features}}
62
- - **{{name}}**: {{description}}
63
- {{/features}}
64
- {{/features.0}}
256
+ **Example:**
257
+ \`\`\`toon
258
+ users[2]{id,name,role}:
259
+ 1 Alice admin
260
+ 2 Bob user
65
261
  \`\`\`
66
- *Good: Content comes from JSON, heading provides context*
67
262
 
68
- \u2705 **GOOD - Minimal introduction:**
263
+ **Your Mustache template:**
69
264
  \`\`\`mustache
70
- {{#items.0}}
71
- ## Items Overview
72
- The following items are available:
73
- {{#items}}
74
- - {{title}}
75
- {{/items}}
76
- {{/items.0}}
265
+ {{#users}}
266
+ - {{id}}: {{name}} ({{role}})
267
+ {{/users}}
77
268
  \`\`\`
78
- *Good: Brief intro, but content is from JSON*
269
+
270
+ **CRITICAL:** Use the EXACT property names shown in TOON \`{braces}\` for your Mustache bindings.
79
271
 
80
272
  ---
81
273
 
@@ -100,9 +292,10 @@ These \`.md\` files are **LLM knowledge base entries** designed for **inference*
100
292
  - **Template Alias:** ${request.templateAlias}
101
293
  - **JSON Path:** ${request.jpath}
102
294
 
103
- ### Available Data
104
- \`\`\`json
105
- ${JSON.stringify(request.pageContent, null, 2)}
295
+ ### Available Data (TOON Format)
296
+
297
+ \`\`\`toon
298
+ ${toonData}
106
299
  \`\`\`
107
300
 
108
301
  ---
@@ -117,13 +310,13 @@ ${JSON.stringify(request.pageContent, null, 2)}
117
310
  ### 2. Structure for Question-Answering
118
311
  Anticipate questions an LLM might need to answer:
119
312
  - "What is this?" \u2192 Main heading + description properties
120
- - "What does it offer?" \u2192 Lists of items/features from JSON
313
+ - "What does it offer?" \u2192 Lists of items/features from data
121
314
  - "Who is it for?" \u2192 Target audience properties (if they exist)
122
315
  - "What are the details?" \u2192 Technical/metadata properties
123
316
 
124
- ### 3. Prioritize by JSON Structure
317
+ ### 3. Prioritize by Data Importance
125
318
  **Essential First:**
126
- - Root-level title/name/heading properties
319
+ - Title/name/heading properties
127
320
  - Description/summary properties
128
321
  - Main content arrays
129
322
 
@@ -146,17 +339,15 @@ Anticipate questions an LLM might need to answer:
146
339
 
147
340
  ## \u{1F527} Technical Principles (Key-Agnostic Design)
148
341
 
149
- ### 1. Dynamic Property Inference
150
- **Do not assume fixed property names.** Infer content type from:
151
- - **Value structure:** Object, array, string, number
152
- - **Value length:** Short strings = titles; long text = descriptions
153
- - **Position in JSON:** Root-level = high importance
154
- - **Semantic patterns:** URLs, images, dates
342
+ ### 1. Extract Property Names from TOON
343
+ Look at TOON headers to identify properties:
344
+ - \`{id,name,role}\` \u2192 Use \`{{id}}\`, \`{{name}}\`, \`{{role}}\`
345
+ - \`breadcrumbsLinks[5]{title,link}\` \u2192 Use \`{{#breadcrumbsLinks}}{{title}} {{link}}{{/breadcrumbsLinks}}\`
155
346
 
156
347
  ### 2. Exact Property Bindings
157
- - Always use **exact property name** from JSON: \`{{actualKeyName}}\`
348
+ - Always use **exact property name** from TOON: \`{{actualKeyName}}\`
158
349
  - Do NOT rename or modify binding identifiers
159
- - Mustache bindings must match JSON precisely
350
+ - Mustache bindings must match TOON property names precisely
160
351
 
161
352
  ### 3. Humanized Section Headings
162
353
  While bindings stay exact, convert keys to readable headings:
@@ -164,20 +355,33 @@ While bindings stay exact, convert keys to readable headings:
164
355
  - \`supportPageItems\` \u2192 "Available Support Topics"
165
356
  - \`breadcrumbsLinks\` \u2192 "Navigation Path"
166
357
 
167
- ### 4. Semantic Interpretation Guide
168
- - **Short root strings (5-50 chars)** \u2192 Likely page title
169
- - **Medium text (50-300 chars)** \u2192 Likely summary/tagline
170
- - **Long text (300+ chars)** \u2192 Likely detailed description
171
- - **Arrays of objects** \u2192 Repeated sections with structure
172
- - **Arrays of primitives** \u2192 Bullet lists
173
- - **URL-like strings** \u2192 Render as \`[Label]({{url}})\`
358
+ ### 4. Working with Arrays
359
+ When you see \`arrayName[N]{prop1,prop2}\`:
360
+ - Use \`{{#arrayName.0}}\` to check if array exists
361
+ - Iterate with \`{{#arrayName}}\`
362
+ - Access properties with \`{{prop1}}\`, \`{{prop2}}\`
363
+
364
+ **Example:**
365
+ \`\`\`toon
366
+ items[3]{title,description}:
367
+ ...
368
+ \`\`\`
369
+ \u2192
370
+ \`\`\`mustache
371
+ {{#items.0}}
372
+ ## Items
373
+ {{#items}}
374
+ - {{title}}: {{description}}
375
+ {{/items}}
376
+ {{/items.0}}
377
+ \`\`\`
174
378
 
175
379
  ### 5. Noise Filtering
176
- **Exclude technical metadata:**
380
+ **Exclude technical metadata** (if present in TOON):
177
381
  - IDs: \`id\`, \`nodeId\`, \`_id\`, \`guid\`
178
382
  - Timestamps: \`createdAt\`, \`updatedAt\`
179
383
  - Flags: \`isPublished\`, \`sortOrder\`, \`hidden\`
180
- - System: \`_type\`, \`contentType\`, \`template\`
384
+ - System: \`_type\`, \`contentType\`
181
385
 
182
386
  ### 6. Hierarchy & Nesting
183
387
  - **Root level** \u2192 \`#\` (H1) \u2014 one per document
@@ -198,7 +402,7 @@ While bindings stay exact, convert keys to readable headings:
198
402
  {{/summaryProperty}}
199
403
  \`\`\`
200
404
 
201
- ### Recommended Sections (adapt to actual JSON)
405
+ ### Example Sections (adapt to actual TOON data)
202
406
  \`\`\`mustache
203
407
  {{#mainDescription}}
204
408
  ## Overview
@@ -219,45 +423,94 @@ While bindings stay exact, convert keys to readable headings:
219
423
  - [{{title}}]({{link}})
220
424
  {{/navigationLinks}}
221
425
  {{/navigationLinks.0}}
222
-
223
- {{#technicalData}}
224
- ## Technical Information
225
- - **URL**: {{url}}
226
- - **Type**: {{type}}
227
- {{/technicalData}}
228
426
  \`\`\`
229
427
 
230
- **Important:** These are examples. Your template must match the ACTUAL JSON structure provided.
428
+ **Important:** These are examples. Your template must match the ACTUAL TOON structure provided.
231
429
 
232
430
  ---
233
431
 
234
432
  ## \u2705 Output Requirements
235
433
 
236
434
  1. **Output ONLY the Mustache template** \u2014 no explanations, no markdown code fences, no preamble
237
- 2. **Use exact JSON property names** in all bindings
435
+ 2. **Use exact property names from TOON \`{braces}\`** in all bindings
238
436
  3. **Generate clean Markdown** \u2014 no HTML, entities, or attributes
239
437
  4. **Data-driven content** \u2014 no invented facts or descriptions
240
- 5. **Contextual headings allowed** \u2014 but content must be from JSON
438
+ 5. **Contextual headings allowed** \u2014 but content must be from data
241
439
  6. **Be concise** \u2014 optimize for limited context windows
242
440
  7. **Structure for questions** \u2014 LLMs should easily extract facts
243
441
 
244
442
  ---
245
443
 
444
+ ## \u26A0\uFE0F CRITICAL: Mustache Syntax Validation
445
+
446
+ **Every \`{{#tag}}\` MUST have matching \`{{/tag}}\`**
447
+
448
+ ### Common Errors (from real failures):
449
+
450
+ \u274C **Missing closing tag:**
451
+ \`\`\`mustache
452
+ {{#pageDescription}}
453
+ content
454
+ // \u274C Missing {{/pageDescription}}
455
+ \`\`\`
456
+
457
+ \u274C **Nested check without outer closing:**
458
+ \`\`\`mustache
459
+ {{#items.0}}
460
+ {{#items}}...{{/items}}
461
+ // \u274C Missing {{/items.0}}
462
+ \`\`\`
463
+
464
+ \u274C **Capitalization mismatch:**
465
+ \`\`\`mustache
466
+ {{#aIFeaturesCTATitle}}
467
+ ...
468
+ {{/aiFeaturesCTATitle}} \u274C Different capitalization!
469
+ \`\`\`
470
+
471
+ ### Validation Checklist:
472
+
473
+ **Before output:**
474
+ 1. Count \`{{#\` tags = ___
475
+ 2. Count \`{{/\` tags = ___
476
+ 3. Numbers match? If NO \u2192 Find and add missing closing tags
477
+ 4. Tag names exact match (including dots, numbers, capitalization)?
478
+
479
+ \u2705 **Valid example:**
480
+ \`\`\`mustache
481
+ {{#section}} \u2190 1 open
482
+ {{#nested.0}} \u2190 2 open
483
+ content
484
+ {{/nested.0}} \u2190 2 close
485
+ {{/section}} \u2190 1 close
486
+ \`\`\`
487
+ Count: 2 = 2 \u2713
488
+
489
+ ---
490
+
246
491
  ## \u{1F680} Your Task
247
492
 
248
- Analyze the provided JSON structure and **generate a Mustache template** that:
493
+ Analyze the provided TOON data structure and **generate a Mustache template** that:
249
494
 
250
- 1. **Uses ONLY data from JSON** (no invented content)
251
- 2. **Adds logical section headings** for context
252
- 3. **Structures data for question-answering**
253
- 4. **Prioritizes most important properties first**
254
- 5. **Remains universal** (works for any JSON shape)
495
+ 1. **Uses ONLY data from TOON** (no invented content)
496
+ 2. **Extracts exact property names from \`{braces}\`**
497
+ 3. **Adds logical section headings** for context
498
+ 4. **Structures data for question-answering**
499
+ 5. **Prioritizes most important properties first**
500
+ 6. **Remains universal** (works for any data shape)
501
+ 7. **\u2705 ALL Mustache tags properly closed**
255
502
 
256
503
  **Remember:**
257
- - Headings can be contextual: \u2705
258
- - Content must be from JSON: \u2705\u2705\u2705
259
- - No made-up descriptions: \u274C
260
- - No assumed features: \u274C
504
+ - Parse TOON structure naturally \u2705
505
+ - Use exact property names from \`{braces}\` \u2705\u2705\u2705
506
+ - Headings can be contextual \u2705
507
+ - Content must be from data \u2705\u2705\u2705
508
+ - No made-up descriptions \u274C
509
+ - No assumed features \u274C
510
+ - **Every {{#tag}} has {{/tag}}** \u2705\u2705\u2705
511
+
512
+ **Final Step Before Output:**
513
+ Count your \`{{#\` and \`{{/\` tags. If numbers don't match, find and add missing closing tags.
261
514
 
262
515
  Generate the template now.
263
516
  `;
@@ -727,208 +980,6 @@ function getValueType(value) {
727
980
  return typeof value;
728
981
  }
729
982
 
730
- function extractPageContent(umbracoData, jpath) {
731
- try {
732
- const result = JSONPath({
733
- path: jpath,
734
- json: umbracoData.SiteData,
735
- wrap: false
736
- });
737
- if (!result || Array.isArray(result) && result.length === 0) {
738
- return null;
739
- }
740
- const pageContent = Array.isArray(result) ? result[0] : result;
741
- return excludeChildrenFromContent(pageContent);
742
- } catch (error) {
743
- console.error(`Failed to extract content for path ${jpath}:`, error);
744
- return null;
745
- }
746
- }
747
- function excludeChildrenFromContent(content) {
748
- if (!content || typeof content !== "object") {
749
- return content;
750
- }
751
- const cleanContent = { ...content };
752
- if ("children" in cleanContent) {
753
- delete cleanContent.children;
754
- }
755
- return cleanContent;
756
- }
757
- function generatePageId(urlItem) {
758
- const templateAlias = urlItem.TemplateAlias || "UnknownTemplate";
759
- const nodeID = urlItem.nodeID || "UnknownNode";
760
- return `${templateAlias}_${nodeID}`;
761
- }
762
- function isImportantKey(key) {
763
- const importantPatterns = [
764
- "title",
765
- "name",
766
- "heading",
767
- "description",
768
- "summary",
769
- "content",
770
- "text",
771
- "body",
772
- "value",
773
- "label",
774
- "caption",
775
- "alt",
776
- "message",
777
- "url",
778
- "link",
779
- "href"
780
- ];
781
- const lowerKey = key.toLowerCase();
782
- return importantPatterns.some((pattern) => lowerKey.includes(pattern));
783
- }
784
- function isMetadataKey(key) {
785
- const metadataPatterns = [
786
- "id",
787
- "guid",
788
- "key",
789
- "_id",
790
- "nodeid",
791
- "created",
792
- "updated",
793
- "modified",
794
- "timestamp",
795
- "date",
796
- "sort",
797
- "order",
798
- "index",
799
- "position",
800
- "published",
801
- "hidden",
802
- "visible",
803
- "enabled",
804
- "status",
805
- "type",
806
- "contenttype",
807
- "template",
808
- "alias",
809
- "path",
810
- "meta",
811
- "metadata",
812
- "seo",
813
- "schema",
814
- "properties"
815
- ];
816
- const lowerKey = key.toLowerCase();
817
- return metadataPatterns.some((pattern) => lowerKey.includes(pattern));
818
- }
819
- function recursiveTruncate(content, maxTokens, currentDepth = 0) {
820
- if (currentDepth > 10) {
821
- return { _truncated: "Max depth reached" };
822
- }
823
- if (maxTokens < 10) {
824
- return void 0;
825
- }
826
- if (content === null || content === void 0) {
827
- return content;
828
- }
829
- if (typeof content !== "object") {
830
- if (typeof content === "string" && content.length > 2e3) {
831
- return content.substring(0, 2e3) + "...";
832
- }
833
- return content;
834
- }
835
- if (Array.isArray(content)) {
836
- if (content.length === 0)
837
- return content;
838
- const itemLimit = Math.max(3, Math.floor(15 / (currentDepth + 1)));
839
- const tokensPerItem = Math.floor(maxTokens / Math.min(content.length, itemLimit));
840
- const truncatedArray = content.slice(0, itemLimit).map((item) => recursiveTruncate(item, tokensPerItem, currentDepth + 1)).filter((item) => item !== void 0);
841
- if (content.length > truncatedArray.length) {
842
- truncatedArray.push({
843
- _note: `... and ${content.length - truncatedArray.length} more items`
844
- });
845
- }
846
- return truncatedArray;
847
- }
848
- const truncatedObj = {};
849
- const entries = Object.entries(content);
850
- const withoutMetadata = entries.filter(([key]) => !isMetadataKey(key));
851
- if (withoutMetadata.length === 0) {
852
- return { _note: "Only metadata, removed" };
853
- }
854
- const importantEntries = withoutMetadata.filter(([key]) => isImportantKey(key));
855
- const normalEntries = withoutMetadata.filter(([key]) => !isImportantKey(key));
856
- const importantBudget = Math.floor(maxTokens * 0.4);
857
- const tokensPerImportant = importantEntries.length > 0 ? Math.floor(importantBudget / importantEntries.length) : 0;
858
- for (const [key, value] of importantEntries) {
859
- const processedValue = recursiveTruncate(value, tokensPerImportant, currentDepth + 1);
860
- if (processedValue !== void 0) {
861
- truncatedObj[key] = processedValue;
862
- }
863
- }
864
- const usedTokens = estimateContentTokens(truncatedObj);
865
- const remainingBudget = maxTokens - usedTokens;
866
- if (remainingBudget > 100 && normalEntries.length > 0) {
867
- const sortedNormal = normalEntries.sort(([_a, valueA], [_b, valueB]) => {
868
- const sizeA = JSON.stringify(valueA).length;
869
- const sizeB = JSON.stringify(valueB).length;
870
- return sizeA - sizeB;
871
- });
872
- const tokensPerNormal = Math.floor(remainingBudget / sortedNormal.length);
873
- for (const [key, value] of sortedNormal) {
874
- const processedValue = recursiveTruncate(value, tokensPerNormal, currentDepth + 1);
875
- if (processedValue !== void 0) {
876
- truncatedObj[key] = processedValue;
877
- const newSize = estimateContentTokens(truncatedObj);
878
- if (newSize > maxTokens) {
879
- delete truncatedObj[key];
880
- break;
881
- }
882
- }
883
- }
884
- }
885
- return Object.keys(truncatedObj).length > 0 ? truncatedObj : void 0;
886
- }
887
- function emergencyTruncate(content, maxTokens) {
888
- const result = { ...content };
889
- const keys = Object.keys(result).sort((a, b) => {
890
- const aImportant = isImportantKey(a) ? 1 : 0;
891
- const bImportant = isImportantKey(b) ? 1 : 0;
892
- return aImportant - bImportant;
893
- });
894
- for (const key of keys) {
895
- if (estimateContentTokens(result) <= maxTokens)
896
- break;
897
- delete result[key];
898
- console.warn(` Emergency: removed "${key}"`);
899
- }
900
- return result;
901
- }
902
- function estimateContentTokens(content) {
903
- try {
904
- const jsonString = JSON.stringify(content);
905
- return Math.ceil(jsonString.length / 3);
906
- } catch {
907
- return 0;
908
- }
909
- }
910
- function truncateContentIfNeeded(content, maxTokens = 1e5) {
911
- const estimatedTokens = estimateContentTokens(content);
912
- if (estimatedTokens <= maxTokens) {
913
- return content;
914
- }
915
- console.warn(`\u26A0\uFE0F Content too large (${estimatedTokens} tokens > ${maxTokens} limit), truncating recursively...`);
916
- const truncatedContent = recursiveTruncate(content, maxTokens, 0);
917
- const result = truncatedContent && typeof truncatedContent === "object" && !Array.isArray(truncatedContent) ? truncatedContent : {
918
- _error: "Content truncation failed",
919
- original: content
920
- };
921
- const finalTokens = estimateContentTokens(result);
922
- const preservedKeys = Object.keys(result).length;
923
- const originalKeys = Object.keys(content).length;
924
- console.log(`\u2705 Content truncated: ${estimatedTokens} \u2192 ${finalTokens} tokens (preserved ${preservedKeys}/${originalKeys} root keys)`);
925
- if (finalTokens > maxTokens) {
926
- console.error(`\u274C Recursive truncation insufficient (${finalTokens} > ${maxTokens}), performing emergency truncation...`);
927
- return emergencyTruncate(result, maxTokens);
928
- }
929
- return result;
930
- }
931
-
932
983
  function shouldGenerateTemplate(umbracoData, urlItem) {
933
984
  try {
934
985
  const pageContent = extractPageContent(umbracoData, urlItem.Jpath);
@@ -1264,7 +1315,7 @@ class TemplateGenerator {
1264
1315
  const pageId = generatePageId(urlItem);
1265
1316
  console.log(`Generating new template for ${pageId} (${urlItem.url})`);
1266
1317
  const tokensBeforeTruncation = estimateContentTokens(pageContent);
1267
- const truncatedContent = truncateContentIfNeeded(pageContent, 65e3);
1318
+ const truncatedContent = truncateContentIfNeeded(pageContent, this.config.maxTokens);
1268
1319
  const tokensAfterTruncation = estimateContentTokens(truncatedContent);
1269
1320
  if (tokensBeforeTruncation > tokensAfterTruncation) {
1270
1321
  console.warn(`Page ${pageId} content truncated: ${tokensBeforeTruncation} -> ${tokensAfterTruncation} tokens`);
package/dist/module.d.mts CHANGED
@@ -14,6 +14,7 @@ declare const LLMSConfigSchema: z.ZodObject<{
14
14
  baseSiteUrl: z.ZodOptional<z.ZodString>;
15
15
  baseSiteUrlUmbracoDataKey: z.ZodOptional<z.ZodString>;
16
16
  maxConcurrent: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
17
+ maxTokens: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
17
18
  enableLLMSFullTxt: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
18
19
  enableIndividualMd: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
19
20
  enableAutoCleanup: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
package/dist/module.d.ts CHANGED
@@ -14,6 +14,7 @@ declare const LLMSConfigSchema: z.ZodObject<{
14
14
  baseSiteUrl: z.ZodOptional<z.ZodString>;
15
15
  baseSiteUrlUmbracoDataKey: z.ZodOptional<z.ZodString>;
16
16
  maxConcurrent: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
17
+ maxTokens: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
17
18
  enableLLMSFullTxt: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
18
19
  enableIndividualMd: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
19
20
  enableAutoCleanup: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
package/dist/module.json CHANGED
@@ -4,5 +4,5 @@
4
4
  "compatibility": {
5
5
  "nuxt": "^3.0.0"
6
6
  },
7
- "version": "0.1.11"
7
+ "version": "0.1.12"
8
8
  }
package/dist/module.mjs CHANGED
@@ -1,4 +1,4 @@
1
- export { l as default } from './shared/nuxt-llms-generator.bc139143.mjs';
1
+ export { l as default } from './shared/nuxt-llms-generator.db76a78e.mjs';
2
2
  import '@nuxt/kit';
3
3
  import 'fs';
4
4
  import 'path';
@@ -38,6 +38,7 @@ const LLMSConfigSchema = z.object({
38
38
  ).describe("The base URL of the website to append to links in generated llms files"),
39
39
  baseSiteUrlUmbracoDataKey: z.string().optional().describe("If the SiteData of UmbracoData has the key with the base URL you can pass here the key to auto extract the base url"),
40
40
  maxConcurrent: z.number().int().min(1, "maxConcurrent must be at least 1").max(10, "maxConcurrent should not exceed 10 to avoid rate limits").optional().default(3),
41
+ maxTokens: z.number().int().min(1e3, "maxTokens must be at least 1000").max(2e5, "maxTokens should not exceed 200000").optional().default(65e3).describe("Maximum tokens for page content before truncation"),
41
42
  enableLLMSFullTxt: z.boolean().optional().default(true),
42
43
  enableIndividualMd: z.boolean().optional().default(true),
43
44
  enableAutoCleanup: z.boolean().optional().default(true),
@@ -225,6 +226,7 @@ function convertHtmlToMarkdownDeep(input) {
225
226
  const DEFAULT_OPTIONS = {
226
227
  anthropicModel: "claude-3-7-sonnet-latest",
227
228
  maxConcurrent: 5,
229
+ maxTokens: 65e3,
228
230
  enableLLMSFullTxt: true,
229
231
  enableIndividualMd: true,
230
232
  templatesDir: ".llms-templates",
@@ -271,6 +273,7 @@ const llmsModule = defineNuxtModule({
271
273
  finalOutputDir: resolve(nuxt.options.rootDir, options.finalOutputDir ?? "public"),
272
274
  anthropicModel: options.anthropicModel || DEFAULT_OPTIONS.anthropicModel,
273
275
  maxConcurrent: options.maxConcurrent || DEFAULT_OPTIONS.maxConcurrent,
276
+ maxTokens: options.maxTokens ?? DEFAULT_OPTIONS.maxTokens,
274
277
  enableLLMSFullTxt: options.enableLLMSFullTxt ?? DEFAULT_OPTIONS.enableLLMSFullTxt,
275
278
  enableIndividualMd: options.enableIndividualMd ?? DEFAULT_OPTIONS.enableIndividualMd,
276
279
  enableAutoCleanup: options.enableAutoCleanup ?? DEFAULT_OPTIONS.enableAutoCleanup,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@voicenter-team/nuxt-llms-generator",
3
- "version": "0.1.11",
3
+ "version": "0.1.12",
4
4
  "description": "Nuxt 3 module for automatically generating AI-optimized documentation files (llms.txt, llms-full.txt, and individual .md files) from Umbraco CMS data using Anthropic's Claude API.",
5
5
  "repository": "https://github.com/VoicenterTeam/nuxt-llms-generator",
6
6
  "license": "MIT",
@@ -35,6 +35,7 @@
35
35
  "dependencies": {
36
36
  "@anthropic-ai/sdk": "^0.30.0",
37
37
  "@nuxt/kit": "^3.11.2",
38
+ "@toon-format/toon": "^2.1.0",
38
39
  "@voicenter-team/eslint-config-ts": "^1.0.22",
39
40
  "i": "^0.3.7",
40
41
  "jsonpath-plus": "^8.0.0",