npm - @voicenter-team/nuxt-llms-generator - Versions diffs - 0.1.11 → 0.1.13 - Mend

@voicenter-team/nuxt-llms-generator 0.1.11 → 0.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/README.md +5 -0
package/dist/chunks/llms-files-generator.mjs +399 -293
package/dist/module.d.mts +1 -0
package/dist/module.d.ts +1 -0
package/dist/module.json +1 -1
package/dist/module.mjs +1 -1
package/dist/shared/{nuxt-llms-generator.bc139143.mjs → nuxt-llms-generator.db76a78e.mjs} +3 -0
package/package.json +2 -1

package/README.md CHANGED Viewed

@@ -242,6 +242,7 @@ npm run build
 | `enableLLMSFullTxt`    | `boolean` | `true`                       | Generate combined llms-full.txt file    |
 | `enableHtmlToMarkdown` | `boolean` | `true`                       | Convert HTML content to markdown using [node-html-markdown](https://www.npmjs.com/package/node-html-markdown) |
 | `maxConcurrent`        | `number`  | `5`                          | Maximum concurrent API requests          |
+| `maxTokens`            | `number`  | `65000`                      | Maximum tokens for page content before truncation (Claude context limit protection) |
 | `anthropicModel`       | `string`  | `claude-3-5-sonnet-20241022` | Claude model to use                      |
 ### Cleanup Options
@@ -500,6 +501,7 @@ ls -la public/UmbracoData.json
    ```typescript
    {
      maxConcurrent: 8, // Higher concurrency
+     maxTokens: 80000, // More content per page (if using larger models)
      enableAutoCleanup: true, // Keep cache clean
    }
    ```
@@ -509,6 +511,7 @@ ls -la public/UmbracoData.json
    {
      enableIndividualMd: false, // Skip individual files
      maxConcurrent: 2, // Lower API usage
+     maxTokens: 50000, // Smaller context for faster processing
    }
    ```
@@ -518,6 +521,7 @@ ls -la public/UmbracoData.json
      enableAutoCleanup: true,
      cleanupOrphaned: true,
      cleanupHidden: true,
+     maxTokens: 65000, // Balance between detail and API limits
      enableHtmlToMarkdown: true  // Clean HTML from CMS content
    }
    ```
@@ -570,6 +574,7 @@ interface LLMSConfig {
   templatesDir?: string;              // './.llms-templates'
   finalOutputDir?: string;            // './.output/llms'
   anthropicModel?: string;            // 'claude-3-5-sonnet-20241022'
+  maxTokens?: number;                 // 65000
   maxConcurrent?: number;             // 5
   enableLLMSFullTxt?: boolean;        // true
   enableIndividualMd?: boolean;       // true

package/dist/chunks/llms-files-generator.mjs CHANGED Viewed

@@ -3,14 +3,221 @@ import { join, dirname, basename } from 'path';
 import { slugify } from 'transliteration';
 import Mustache from 'mustache';
 import Anthropic from '@anthropic-ai/sdk';
-import { createHash } from 'crypto';
+import { encode } from '@toon-format/toon';
 import { JSONPath } from 'jsonpath-plus';
-import { w as withErrorHandling } from '../shared/nuxt-llms-generator.bc139143.mjs';
+import { createHash } from 'crypto';
+import { w as withErrorHandling } from '../shared/nuxt-llms-generator.db76a78e.mjs';
 import '@nuxt/kit';
 import 'zod';
 import 'node-html-markdown';
+function extractPageContent(umbracoData, jpath) {
+  try {
+    const result = JSONPath({
+      path: jpath,
+      json: umbracoData.SiteData,
+      wrap: false
+    });
+    if (!result || Array.isArray(result) && result.length === 0) {
+      return null;
+    }
+    const pageContent = Array.isArray(result) ? result[0] : result;
+    return excludeChildrenFromContent(pageContent);
+  } catch (error) {
+    console.error(`Failed to extract content for path ${jpath}:`, error);
+    return null;
+  }
+}
+function excludeChildrenFromContent(content) {
+  if (!content || typeof content !== "object") {
+    return content;
+  }
+  const cleanContent = { ...content };
+  if ("children" in cleanContent) {
+    delete cleanContent.children;
+  }
+  return cleanContent;
+}
+function generatePageId(urlItem) {
+  const templateAlias = urlItem.TemplateAlias || "UnknownTemplate";
+  const nodeID = urlItem.nodeID || "UnknownNode";
+  return `${templateAlias}_${nodeID}`;
+}
+function isImportantKey(key) {
+  const importantPatterns = [
+    "title",
+    "name",
+    "heading",
+    "description",
+    "summary",
+    "content",
+    "text",
+    "body",
+    "value",
+    "label",
+    "caption",
+    "alt",
+    "message",
+    "url",
+    "link",
+    "href"
+  ];
+  const lowerKey = key.toLowerCase();
+  return importantPatterns.some((pattern) => lowerKey.includes(pattern));
+}
+function isMetadataKey(key) {
+  const metadataPatterns = [
+    "id",
+    "guid",
+    "key",
+    "_id",
+    "nodeid",
+    "created",
+    "updated",
+    "modified",
+    "timestamp",
+    "date",
+    "sort",
+    "order",
+    "index",
+    "position",
+    "published",
+    "hidden",
+    "visible",
+    "enabled",
+    "status",
+    "type",
+    "contenttype",
+    "template",
+    "alias",
+    "path",
+    "meta",
+    "metadata",
+    "seo",
+    "schema",
+    "properties"
+  ];
+  const lowerKey = key.toLowerCase();
+  return metadataPatterns.some((pattern) => lowerKey.includes(pattern));
+}
+function recursiveTruncate(content, maxTokens, currentDepth = 0) {
+  if (currentDepth > 10) {
+    return { _truncated: "Max depth reached" };
+  }
+  if (maxTokens < 10) {
+    return void 0;
+  }
+  if (content === null || content === void 0) {
+    return content;
+  }
+  if (typeof content !== "object") {
+    if (typeof content === "string" && content.length > 2e3) {
+      return content.substring(0, 2e3) + "...";
+    }
+    return content;
+  }
+  if (Array.isArray(content)) {
+    if (content.length === 0)
+      return content;
+    const itemLimit = Math.max(3, Math.floor(15 / (currentDepth + 1)));
+    const tokensPerItem = Math.floor(maxTokens / Math.min(content.length, itemLimit));
+    const truncatedArray = content.slice(0, itemLimit).map((item) => recursiveTruncate(item, tokensPerItem, currentDepth + 1)).filter((item) => item !== void 0);
+    if (content.length > truncatedArray.length) {
+      truncatedArray.push({
+        _note: `... and ${content.length - truncatedArray.length} more items`
+      });
+    }
+    return truncatedArray;
+  }
+  const truncatedObj = {};
+  const entries = Object.entries(content);
+  const withoutMetadata = entries.filter(([key]) => !isMetadataKey(key));
+  if (withoutMetadata.length === 0) {
+    return { _note: "Only metadata, removed" };
+  }
+  const importantEntries = withoutMetadata.filter(([key]) => isImportantKey(key));
+  const normalEntries = withoutMetadata.filter(([key]) => !isImportantKey(key));
+  const importantBudget = Math.floor(maxTokens * 0.4);
+  const tokensPerImportant = importantEntries.length > 0 ? Math.floor(importantBudget / importantEntries.length) : 0;
+  for (const [key, value] of importantEntries) {
+    const processedValue = recursiveTruncate(value, tokensPerImportant, currentDepth + 1);
+    if (processedValue !== void 0) {
+      truncatedObj[key] = processedValue;
+    }
+  }
+  const usedTokens = estimateContentTokens(truncatedObj);
+  const remainingBudget = maxTokens - usedTokens;
+  if (remainingBudget > 100 && normalEntries.length > 0) {
+    const sortedNormal = normalEntries.sort(([_a, valueA], [_b, valueB]) => {
+      const sizeA = JSON.stringify(valueA).length;
+      const sizeB = JSON.stringify(valueB).length;
+      return sizeA - sizeB;
+    });
+    const tokensPerNormal = Math.floor(remainingBudget / sortedNormal.length);
+    for (const [key, value] of sortedNormal) {
+      const processedValue = recursiveTruncate(value, tokensPerNormal, currentDepth + 1);
+      if (processedValue !== void 0) {
+        truncatedObj[key] = processedValue;
+        const newSize = estimateContentTokens(truncatedObj);
+        if (newSize > maxTokens) {
+          delete truncatedObj[key];
+          break;
+        }
+      }
+    }
+  }
+  return Object.keys(truncatedObj).length > 0 ? truncatedObj : void 0;
+}
+function emergencyTruncate(content, maxTokens) {
+  const result = { ...content };
+  const keys = Object.keys(result).sort((a, b) => {
+    const aImportant = isImportantKey(a) ? 1 : 0;
+    const bImportant = isImportantKey(b) ? 1 : 0;
+    return aImportant - bImportant;
+  });
+  for (const key of keys) {
+    if (estimateContentTokens(result) <= maxTokens)
+      break;
+    delete result[key];
+    console.warn(`    Emergency: removed "${key}"`);
+  }
+  return result;
+}
+function estimateContentTokens(content) {
+  try {
+    const jsonString = JSON.stringify(content);
+    return Math.ceil(jsonString.length / 3);
+  } catch {
+    return 0;
+  }
+}
+function truncateContentIfNeeded(content, maxTokens = 1e5) {
+  const estimatedTokens = estimateContentTokens(content);
+  if (estimatedTokens <= maxTokens) {
+    return content;
+  }
+  console.warn(`\u26A0\uFE0F  Content too large (${estimatedTokens} tokens > ${maxTokens} limit), truncating recursively...`);
+  const truncatedContent = recursiveTruncate(content, maxTokens, 0);
+  const result = truncatedContent && typeof truncatedContent === "object" && !Array.isArray(truncatedContent) ? truncatedContent : {
+    _error: "Content truncation failed",
+    original: content
+  };
+  const finalTokens = estimateContentTokens(result);
+  const preservedKeys = Object.keys(result).length;
+  const originalKeys = Object.keys(content).length;
+  console.log(`\u2705 Content truncated: ${estimatedTokens} \u2192 ${finalTokens} tokens (preserved ${preservedKeys}/${originalKeys} root keys)`);
+  if (finalTokens > maxTokens) {
+    console.error(`\u274C Recursive truncation insufficient (${finalTokens} > ${maxTokens}), performing emergency truncation...`);
+    return emergencyTruncate(result, maxTokens);
+  }
+  return result;
+}
 function buildLLMSTemplatePrompt(request) {
+  const jsonTokens = estimateContentTokens(request.pageContent);
+  const toonData = encode(request.pageContent, { delimiter: "	" });
+  const toonTokens = estimateContentTokens(toonData);
+  console.log(`\u{1F4CA} ${request.url}: JSON ${jsonTokens} \u2192 TOON ${toonTokens} (${((1 - toonTokens / jsonTokens) * 100).toFixed(0)}% saved)`);
   return `# LLMS.txt-Optimized Mustache Template Generator
 You are an expert at creating **Mustache.js templates** that generate **LLM knowledge base entries** following the [\`llms.txt\` standard](https://llmstxt.org/).
@@ -21,9 +228,9 @@ You are an expert at creating **Mustache.js templates** that generate **LLM know
 ### 1. DATA-DRIVEN CONTENT ONLY
 - **EVERY piece of content** must come from a Mustache binding: \`{{propertyName}}\`
-- **NEVER invent, assume, or add content** that doesn't exist in the provided JSON
+- **NEVER invent, assume, or add content** that doesn't exist in the provided data
 - **NO hardcoded descriptions, lists, or facts**
-- If a property doesn't exist in JSON, don't create a section for it
+- If a property doesn't exist in data, don't create a section for it
 ### 2. ALLOWED CONTEXTUAL ADDITIONS
 You MAY add:
@@ -32,51 +239,47 @@ You MAY add:
 - **Structural markers** for clarity (e.g., "Navigation:", "Metadata:")
 You MAY NOT add:
-- Descriptions of features/benefits not in JSON
+- Descriptions of features/benefits not in data
 - Explanatory text about what something does
 - Lists of items not present in data
 - Assumptions about the page purpose
-### 3. EXAMPLES OF VIOLATIONS
+### 3. UNDERSTANDING TOON FORMAT
-\u274C **BAD - Hardcoded content:**
-\`\`\`mustache
-## Key Benefits
-- Real-time monitoring
-- Detailed analytics
-- Easy to use
-\`\`\`
-*Problem: These benefits are invented, not from JSON*
+The data below is in **TOON format** (Token-Oriented Object Notation) for efficiency.
-\u274C **BAD - Invented descriptions:**
-\`\`\`mustache
-This dashboard provides comprehensive monitoring capabilities for call centers...
-\`\`\`
-*Problem: Description is made up*
+**How to read TOON:**
+- \`propertyName: value\` \u2192 Single property
+- \`array[3]{prop1,prop2}\` \u2192 Array of 3 objects with properties prop1, prop2
+- Properties in \`{braces}\` are the **exact field names** to use in Mustache bindings
-\u2705 **GOOD - Data-driven with context:**
-\`\`\`mustache
-{{#features.0}}
-## Available Features
-{{#features}}
-- **{{name}}**: {{description}}
-{{/features}}
-{{/features.0}}
+**Example:**
+\`\`\`toon
+users[2]{id,name,role}:
+  1	Alice	admin
+  2	Bob	user
 \`\`\`
-*Good: Content comes from JSON, heading provides context*
-\u2705 **GOOD - Minimal introduction:**
+**Your Mustache template:**
 \`\`\`mustache
-{{#items.0}}
-## Items Overview
-The following items are available:
-{{#items}}
-- {{title}}
-{{/items}}
-{{/items.0}}
+{{#users}}
+- {{id}}: {{name}} ({{role}})
+{{/users}}
 \`\`\`
-*Good: Brief intro, but content is from JSON*
+**CRITICAL:** Use the EXACT property names shown in TOON \`{braces}\` for your Mustache bindings.
+---
+## \u{1F6AB} Content Exclusion Rules
+When generating the template, **DO NOT** create sections for these types of data even if they appear in TOON:
+- **Image/media properties**: URLs to images, avatars, thumbnails, icons, or any media files
+- **UI-only labels**: Search placeholders, filter menu text, "Show More"/"Show Less", pagination labels
+- **SEO/meta fields**: ogTitle, ogDescription, ogImage, canonical URLs, changefreq, priority, sitemap fields
+- **Legal boilerplate**: Copyright text, "All rights reserved"
+- **Navigation chrome**: Breadcrumbs, menu items, footer links \u2014 unless they ARE the primary page content
+- **System identifiers**: Internal IDs, GUIDs, sort orders, node paths, template aliases
+- **Empty/null values**: Skip any property that holds no meaningful value
+**Focus ONLY on**: titles, descriptions, features, pricing, reviews, specifications, contact info, and other business-relevant content.
 ---
 ## \u{1F3AF} TRUE PURPOSE: Help LLMs Answer Questions Efficiently
@@ -100,9 +303,10 @@ These \`.md\` files are **LLM knowledge base entries** designed for **inference*
 - **Template Alias:** ${request.templateAlias}
 - **JSON Path:** ${request.jpath}
-### Available Data
-\`\`\`json
-${JSON.stringify(request.pageContent, null, 2)}
+### Available Data (TOON Format)
+\`\`\`toon
+${toonData}
 \`\`\`
 ---
@@ -117,13 +321,13 @@ ${JSON.stringify(request.pageContent, null, 2)}
 ### 2. Structure for Question-Answering
 Anticipate questions an LLM might need to answer:
 - "What is this?" \u2192 Main heading + description properties
-- "What does it offer?" \u2192 Lists of items/features from JSON
+- "What does it offer?" \u2192 Lists of items/features from data
 - "Who is it for?" \u2192 Target audience properties (if they exist)
 - "What are the details?" \u2192 Technical/metadata properties
-### 3. Prioritize by JSON Structure
+### 3. Prioritize by Data Importance
 **Essential First:**
-- Root-level title/name/heading properties
+- Title/name/heading properties
 - Description/summary properties
 - Main content arrays
@@ -146,17 +350,15 @@ Anticipate questions an LLM might need to answer:
 ## \u{1F527} Technical Principles (Key-Agnostic Design)
-### 1. Dynamic Property Inference
-**Do not assume fixed property names.** Infer content type from:
-- **Value structure:** Object, array, string, number
-- **Value length:** Short strings = titles; long text = descriptions
-- **Position in JSON:** Root-level = high importance
-- **Semantic patterns:** URLs, images, dates
+### 1. Extract Property Names from TOON
+Look at TOON headers to identify properties:
+- \`{id,name,role}\` \u2192 Use \`{{id}}\`, \`{{name}}\`, \`{{role}}\`
+- \`breadcrumbsLinks[5]{title,link}\` \u2192 Use \`{{#breadcrumbsLinks}}{{title}} {{link}}{{/breadcrumbsLinks}}\`
 ### 2. Exact Property Bindings
-- Always use **exact property name** from JSON: \`{{actualKeyName}}\`
+- Always use **exact property name** from TOON: \`{{actualKeyName}}\`
 - Do NOT rename or modify binding identifiers
-- Mustache bindings must match JSON precisely
+- Mustache bindings must match TOON property names precisely
 ### 3. Humanized Section Headings
 While bindings stay exact, convert keys to readable headings:
@@ -164,20 +366,33 @@ While bindings stay exact, convert keys to readable headings:
 - \`supportPageItems\` \u2192 "Available Support Topics"
 - \`breadcrumbsLinks\` \u2192 "Navigation Path"
-### 4. Semantic Interpretation Guide
-- **Short root strings (5-50 chars)** \u2192 Likely page title
-- **Medium text (50-300 chars)** \u2192 Likely summary/tagline
-- **Long text (300+ chars)** \u2192 Likely detailed description
-- **Arrays of objects** \u2192 Repeated sections with structure
-- **Arrays of primitives** \u2192 Bullet lists
-- **URL-like strings** \u2192 Render as \`[Label]({{url}})\`
+### 4. Working with Arrays
+When you see \`arrayName[N]{prop1,prop2}\`:
+- Use \`{{#arrayName.0}}\` to check if array exists
+- Iterate with \`{{#arrayName}}\`
+- Access properties with \`{{prop1}}\`, \`{{prop2}}\`
+**Example:**
+\`\`\`toon
+items[3]{title,description}:
+  ...
+\`\`\`
+\u2192
+\`\`\`mustache
+{{#items.0}}
+## Items
+{{#items}}
+- {{title}}: {{description}}
+{{/items}}
+{{/items.0}}
+\`\`\`
 ### 5. Noise Filtering
-**Exclude technical metadata:**
+**Exclude technical metadata** (if present in TOON):
 - IDs: \`id\`, \`nodeId\`, \`_id\`, \`guid\`
 - Timestamps: \`createdAt\`, \`updatedAt\`
 - Flags: \`isPublished\`, \`sortOrder\`, \`hidden\`
-- System: \`_type\`, \`contentType\`, \`template\`
+- System: \`_type\`, \`contentType\`
 ### 6. Hierarchy & Nesting
 - **Root level** \u2192 \`#\` (H1) \u2014 one per document
@@ -198,7 +413,7 @@ While bindings stay exact, convert keys to readable headings:
 {{/summaryProperty}}
 \`\`\`
-### Recommended Sections (adapt to actual JSON)
+### Example Sections (adapt to actual TOON data)
 \`\`\`mustache
 {{#mainDescription}}
 ## Overview
@@ -219,45 +434,94 @@ While bindings stay exact, convert keys to readable headings:
 - [{{title}}]({{link}})
 {{/navigationLinks}}
 {{/navigationLinks.0}}
-{{#technicalData}}
-## Technical Information
-- **URL**: {{url}}
-- **Type**: {{type}}
-{{/technicalData}}
 \`\`\`
-**Important:** These are examples. Your template must match the ACTUAL JSON structure provided.
+**Important:** These are examples. Your template must match the ACTUAL TOON structure provided.
 ---
 ## \u2705 Output Requirements
 1. **Output ONLY the Mustache template** \u2014 no explanations, no markdown code fences, no preamble
-2. **Use exact JSON property names** in all bindings
+2. **Use exact property names from TOON \`{braces}\`** in all bindings
 3. **Generate clean Markdown** \u2014 no HTML, entities, or attributes
 4. **Data-driven content** \u2014 no invented facts or descriptions
-5. **Contextual headings allowed** \u2014 but content must be from JSON
+5. **Contextual headings allowed** \u2014 but content must be from data
 6. **Be concise** \u2014 optimize for limited context windows
 7. **Structure for questions** \u2014 LLMs should easily extract facts
 ---
+## \u26A0\uFE0F CRITICAL: Mustache Syntax Validation
+**Every \`{{#tag}}\` MUST have matching \`{{/tag}}\`**
+### Common Errors (from real failures):
+\u274C **Missing closing tag:**
+\`\`\`mustache
+{{#pageDescription}}
+  content
+// \u274C Missing {{/pageDescription}}
+\`\`\`
+\u274C **Nested check without outer closing:**
+\`\`\`mustache
+{{#items.0}}
+  {{#items}}...{{/items}}
+// \u274C Missing {{/items.0}}
+\`\`\`
+\u274C **Capitalization mismatch:**
+\`\`\`mustache
+{{#aIFeaturesCTATitle}}
+  ...
+{{/aiFeaturesCTATitle}}  \u274C Different capitalization!
+\`\`\`
+### Validation Checklist:
+**Before output:**
+1. Count \`{{#\` tags = ___
+2. Count \`{{/\` tags = ___
+3. Numbers match? If NO \u2192 Find and add missing closing tags
+4. Tag names exact match (including dots, numbers, capitalization)?
+\u2705 **Valid example:**
+\`\`\`mustache
+{{#section}}         \u2190 1 open
+  {{#nested.0}}      \u2190 2 open
+    content
+  {{/nested.0}}      \u2190 2 close
+{{/section}}         \u2190 1 close
+\`\`\`
+Count: 2 = 2 \u2713
+---
 ## \u{1F680} Your Task
-Analyze the provided JSON structure and **generate a Mustache template** that:
+Analyze the provided TOON data structure and **generate a Mustache template** that:
-1. **Uses ONLY data from JSON** (no invented content)
-2. **Adds logical section headings** for context
-3. **Structures data for question-answering**
-4. **Prioritizes most important properties first**
-5. **Remains universal** (works for any JSON shape)
+1. **Uses ONLY data from TOON** (no invented content)
+2. **Extracts exact property names from \`{braces}\`**
+3. **Adds logical section headings** for context
+4. **Structures data for question-answering**
+5. **Prioritizes most important properties first**
+6. **Remains universal** (works for any data shape)
+7. **\u2705 ALL Mustache tags properly closed**
 **Remember:**
-- Headings can be contextual: \u2705
-- Content must be from JSON: \u2705\u2705\u2705
-- No made-up descriptions: \u274C
-- No assumed features: \u274C
+- Parse TOON structure naturally \u2705
+- Use exact property names from \`{braces}\` \u2705\u2705\u2705
+- Headings can be contextual \u2705
+- Content must be from data \u2705\u2705\u2705
+- No made-up descriptions \u274C
+- No assumed features \u274C
+- **Every {{#tag}} has {{/tag}}** \u2705\u2705\u2705
+**Final Step Before Output:**
+Count your \`{{#\` and \`{{/\` tags. If numbers don't match, find and add missing closing tags.
 Generate the template now.
 `;
@@ -727,208 +991,11 @@ function getValueType(value) {
   return typeof value;
 }
-function extractPageContent(umbracoData, jpath) {
-  try {
-    const result = JSONPath({
-      path: jpath,
-      json: umbracoData.SiteData,
-      wrap: false
-    });
-    if (!result || Array.isArray(result) && result.length === 0) {
-      return null;
-    }
-    const pageContent = Array.isArray(result) ? result[0] : result;
-    return excludeChildrenFromContent(pageContent);
-  } catch (error) {
-    console.error(`Failed to extract content for path ${jpath}:`, error);
-    return null;
-  }
-}
-function excludeChildrenFromContent(content) {
-  if (!content || typeof content !== "object") {
-    return content;
-  }
-  const cleanContent = { ...content };
-  if ("children" in cleanContent) {
-    delete cleanContent.children;
-  }
-  return cleanContent;
-}
-function generatePageId(urlItem) {
-  const templateAlias = urlItem.TemplateAlias || "UnknownTemplate";
-  const nodeID = urlItem.nodeID || "UnknownNode";
-  return `${templateAlias}_${nodeID}`;
-}
-function isImportantKey(key) {
-  const importantPatterns = [
-    "title",
-    "name",
-    "heading",
-    "description",
-    "summary",
-    "content",
-    "text",
-    "body",
-    "value",
-    "label",
-    "caption",
-    "alt",
-    "message",
-    "url",
-    "link",
-    "href"
-  ];
-  const lowerKey = key.toLowerCase();
-  return importantPatterns.some((pattern) => lowerKey.includes(pattern));
-}
-function isMetadataKey(key) {
-  const metadataPatterns = [
-    "id",
-    "guid",
-    "key",
-    "_id",
-    "nodeid",
-    "created",
-    "updated",
-    "modified",
-    "timestamp",
-    "date",
-    "sort",
-    "order",
-    "index",
-    "position",
-    "published",
-    "hidden",
-    "visible",
-    "enabled",
-    "status",
-    "type",
-    "contenttype",
-    "template",
-    "alias",
-    "path",
-    "meta",
-    "metadata",
-    "seo",
-    "schema",
-    "properties"
-  ];
-  const lowerKey = key.toLowerCase();
-  return metadataPatterns.some((pattern) => lowerKey.includes(pattern));
-}
-function recursiveTruncate(content, maxTokens, currentDepth = 0) {
-  if (currentDepth > 10) {
-    return { _truncated: "Max depth reached" };
-  }
-  if (maxTokens < 10) {
-    return void 0;
-  }
-  if (content === null || content === void 0) {
-    return content;
-  }
-  if (typeof content !== "object") {
-    if (typeof content === "string" && content.length > 2e3) {
-      return content.substring(0, 2e3) + "...";
-    }
-    return content;
-  }
-  if (Array.isArray(content)) {
-    if (content.length === 0)
-      return content;
-    const itemLimit = Math.max(3, Math.floor(15 / (currentDepth + 1)));
-    const tokensPerItem = Math.floor(maxTokens / Math.min(content.length, itemLimit));
-    const truncatedArray = content.slice(0, itemLimit).map((item) => recursiveTruncate(item, tokensPerItem, currentDepth + 1)).filter((item) => item !== void 0);
-    if (content.length > truncatedArray.length) {
-      truncatedArray.push({
-        _note: `... and ${content.length - truncatedArray.length} more items`
-      });
-    }
-    return truncatedArray;
-  }
-  const truncatedObj = {};
-  const entries = Object.entries(content);
-  const withoutMetadata = entries.filter(([key]) => !isMetadataKey(key));
-  if (withoutMetadata.length === 0) {
-    return { _note: "Only metadata, removed" };
-  }
-  const importantEntries = withoutMetadata.filter(([key]) => isImportantKey(key));
-  const normalEntries = withoutMetadata.filter(([key]) => !isImportantKey(key));
-  const importantBudget = Math.floor(maxTokens * 0.4);
-  const tokensPerImportant = importantEntries.length > 0 ? Math.floor(importantBudget / importantEntries.length) : 0;
-  for (const [key, value] of importantEntries) {
-    const processedValue = recursiveTruncate(value, tokensPerImportant, currentDepth + 1);
-    if (processedValue !== void 0) {
-      truncatedObj[key] = processedValue;
-    }
-  }
-  const usedTokens = estimateContentTokens(truncatedObj);
-  const remainingBudget = maxTokens - usedTokens;
-  if (remainingBudget > 100 && normalEntries.length > 0) {
-    const sortedNormal = normalEntries.sort(([_a, valueA], [_b, valueB]) => {
-      const sizeA = JSON.stringify(valueA).length;
-      const sizeB = JSON.stringify(valueB).length;
-      return sizeA - sizeB;
-    });
-    const tokensPerNormal = Math.floor(remainingBudget / sortedNormal.length);
-    for (const [key, value] of sortedNormal) {
-      const processedValue = recursiveTruncate(value, tokensPerNormal, currentDepth + 1);
-      if (processedValue !== void 0) {
-        truncatedObj[key] = processedValue;
-        const newSize = estimateContentTokens(truncatedObj);
-        if (newSize > maxTokens) {
-          delete truncatedObj[key];
-          break;
-        }
-      }
-    }
-  }
-  return Object.keys(truncatedObj).length > 0 ? truncatedObj : void 0;
-}
-function emergencyTruncate(content, maxTokens) {
-  const result = { ...content };
-  const keys = Object.keys(result).sort((a, b) => {
-    const aImportant = isImportantKey(a) ? 1 : 0;
-    const bImportant = isImportantKey(b) ? 1 : 0;
-    return aImportant - bImportant;
-  });
-  for (const key of keys) {
-    if (estimateContentTokens(result) <= maxTokens)
-      break;
-    delete result[key];
-    console.warn(`    Emergency: removed "${key}"`);
-  }
-  return result;
-}
-function estimateContentTokens(content) {
-  try {
-    const jsonString = JSON.stringify(content);
-    return Math.ceil(jsonString.length / 3);
-  } catch {
-    return 0;
-  }
-}
-function truncateContentIfNeeded(content, maxTokens = 1e5) {
-  const estimatedTokens = estimateContentTokens(content);
-  if (estimatedTokens <= maxTokens) {
-    return content;
-  }
-  console.warn(`\u26A0\uFE0F  Content too large (${estimatedTokens} tokens > ${maxTokens} limit), truncating recursively...`);
-  const truncatedContent = recursiveTruncate(content, maxTokens, 0);
-  const result = truncatedContent && typeof truncatedContent === "object" && !Array.isArray(truncatedContent) ? truncatedContent : {
-    _error: "Content truncation failed",
-    original: content
-  };
-  const finalTokens = estimateContentTokens(result);
-  const preservedKeys = Object.keys(result).length;
-  const originalKeys = Object.keys(content).length;
-  console.log(`\u2705 Content truncated: ${estimatedTokens} \u2192 ${finalTokens} tokens (preserved ${preservedKeys}/${originalKeys} root keys)`);
-  if (finalTokens > maxTokens) {
-    console.error(`\u274C Recursive truncation insufficient (${finalTokens} > ${maxTokens}), performing emergency truncation...`);
-    return emergencyTruncate(result, maxTokens);
-  }
-  return result;
-}
+const PLACEHOLDER_PATTERNS = [
+  "lorem ipsum",
+  "dolor sit amet",
+  "consectetuer adipi"
+];
 function shouldGenerateTemplate(umbracoData, urlItem) {
   try {
     const pageContent = extractPageContent(umbracoData, urlItem.Jpath);
@@ -941,6 +1008,16 @@ function shouldGenerateTemplate(umbracoData, urlItem) {
       console.log(`Page ${urlItem.url} is hidden (hidePage: ${hidePage}), skipping template generation`);
       return false;
     }
+    const title = pageContent.pageTitle ?? pageContent.pageTittle ?? pageContent.ogTitle ?? pageContent.headerBlockTitle;
+    if (!title || title === "undefined" || title === "null") {
+      console.log(`Page ${urlItem.url} has no valid title, skipping template generation`);
+      return false;
+    }
+    const bodyText = JSON.stringify(pageContent).toLowerCase();
+    if (PLACEHOLDER_PATTERNS.some((p) => bodyText.includes(p))) {
+      console.log(`Page ${urlItem.url} contains placeholder text, skipping template generation`);
+      return false;
+    }
     return true;
   } catch (error) {
     console.error(`Error checking visibility for ${urlItem.url}:`, error);
@@ -1156,6 +1233,19 @@ async function performAutomaticCleanup(umbracoData, cacheDir, options = {}) {
   return stats;
 }
+function sanitizeRenderedMarkdown(markdown) {
+  let output = markdown;
+  output = output.replace(/!\[.*?]\(.*?\)/g, "");
+  output = output.replace(/^(#{1,6})\s+\d+,\s*/gm, "$1 ");
+  output = output.replace(/&#x2F;/g, "/").replace(/&#39;/g, "'").replace(/&#x27;/g, "'").replace(/&quot;/g, '"').replace(/&amp;/g, "&").replace(/&#x3D;/g, "=").replace(/&#x60;/g, "`").replace(/&lt;/g, "<").replace(/&gt;/g, ">");
+  output = output.replace(/^- .+?:\s*$/gm, "");
+  output = output.replace(/\[הרחבה]\([^)]*\)/g, "");
+  output = output.replace(/(?<!:)\/{2,}/g, "/");
+  output = output.replace(/^(#{2,6})\s+.+\n(\s*\n)+(?=#{1,6}\s|$)/gm, "");
+  output = output.replace(/\n{3,}/g, "\n\n");
+  return output.trim();
+}
 class TemplateGenerator {
   anthropicClient;
   promptAnalyzer;
@@ -1264,12 +1354,11 @@ class TemplateGenerator {
     const pageId = generatePageId(urlItem);
     console.log(`Generating new template for ${pageId} (${urlItem.url})`);
     const tokensBeforeTruncation = estimateContentTokens(pageContent);
-    const truncatedContent = truncateContentIfNeeded(pageContent, 65e3);
+    const truncatedContent = truncateContentIfNeeded(pageContent, this.config.maxTokens);
     const tokensAfterTruncation = estimateContentTokens(truncatedContent);
     if (tokensBeforeTruncation > tokensAfterTruncation) {
       console.warn(`Page ${pageId} content truncated: ${tokensBeforeTruncation} -> ${tokensAfterTruncation} tokens`);
     }
-    this.promptAnalyzer.analyzeContent(truncatedContent, urlItem);
     const request = {
       pageContent: truncatedContent,
       templateAlias: urlItem.TemplateAlias,
@@ -1300,7 +1389,14 @@ class TemplateGenerator {
   }
   async renderTemplate(template, data) {
     return withErrorHandling(async () => {
-      return Mustache.render(template, data);
+      const originalEscape = Mustache.escape;
+      Mustache.escape = (text) => text;
+      try {
+        const rendered = Mustache.render(template, data);
+        return sanitizeRenderedMarkdown(rendered);
+      } finally {
+        Mustache.escape = originalEscape;
+      }
     }, {
       template: template.substring(0, 200) + "...",
       dataKeys: Object.keys(data)
@@ -1429,7 +1525,8 @@ class LLMSFilesGenerator {
     content += `This website contains comprehensive information about ${siteTitle.toLowerCase()}. The content is organized into the following sections:
 `;
-    const pagesByCategory = this.groupPagesByCategory(mdFiles);
+    const deduplicatedFiles = this.deduplicateByUrl(mdFiles);
+    const pagesByCategory = this.groupPagesByCategory(deduplicatedFiles);
     for (const [category, pages] of Object.entries(pagesByCategory)) {
       if (pages.length === 0)
         continue;
@@ -1472,7 +1569,8 @@ class LLMSFilesGenerator {
 `;
     }
     content += "---\n\n";
-    for (const mdFile of mdFiles) {
+    const deduplicatedFiles = this.deduplicateByUrl(mdFiles);
+    for (const mdFile of deduplicatedFiles) {
       const urlItem = this.umbracoData.urlList.find((item) => item.url === mdFile.url);
       if (!urlItem)
         continue;
@@ -1513,6 +1611,15 @@ class LLMSFilesGenerator {
   *   /marketplace           -> category "marketplace"
   *   /                      -> category "main"
   */
+  deduplicateByUrl(mdFiles) {
+    const seen = /* @__PURE__ */ new Set();
+    return mdFiles.filter((file) => {
+      if (seen.has(file.url))
+        return false;
+      seen.add(file.url);
+      return true;
+    });
+  }
   groupPagesByCategory(mdFiles) {
     const categories = {};
     for (const mdFile of mdFiles) {
@@ -1558,8 +1665,7 @@ class LLMSFilesGenerator {
   }
   extractSiteTitle() {
     const siteData = this.umbracoData.SiteData;
-    const rawTitle = siteData?.pageTitle || siteData?.mainHeaderBlockTitle || "Website Documentation";
-    return rawTitle;
+    return siteData?.pageTitle || siteData?.mainHeaderBlockTitle || "Website Documentation";
   }
   extractSiteDescription() {
     const siteData = this.umbracoData.SiteData;
@@ -1582,11 +1688,11 @@ class LLMSFilesGenerator {
     const pageContent = extractPageContent(this.umbracoData, urlItem.Jpath);
     if (!pageContent)
       return `${urlItem.TemplateAlias} page`;
-    const desc = pageContent.pageDescription || pageContent.description || pageContent.headerBlockSubtitle;
-    if (desc && typeof desc === "string") {
-      return desc;
+    const desc = pageContent.pageDescription || pageContent.description || pageContent.headerBlockSubtitle || pageContent.ogDescription;
+    if (desc && typeof desc === "string" && desc.trim().length > 0) {
+      return desc.trim();
     }
-    return `Information about ${urlItem.url}`;
+    return `${urlItem.TemplateAlias} page`;
   }
   sanitizeUrlForFilename(url) {
     if (!url || url === "/")

package/dist/module.d.mts CHANGED Viewed

@@ -14,6 +14,7 @@ declare const LLMSConfigSchema: z.ZodObject<{
     baseSiteUrl: z.ZodOptional<z.ZodString>;
     baseSiteUrlUmbracoDataKey: z.ZodOptional<z.ZodString>;
     maxConcurrent: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
+    maxTokens: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
     enableLLMSFullTxt: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
     enableIndividualMd: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
     enableAutoCleanup: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;

package/dist/module.d.ts CHANGED Viewed

@@ -14,6 +14,7 @@ declare const LLMSConfigSchema: z.ZodObject<{
     baseSiteUrl: z.ZodOptional<z.ZodString>;
     baseSiteUrlUmbracoDataKey: z.ZodOptional<z.ZodString>;
     maxConcurrent: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
+    maxTokens: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
     enableLLMSFullTxt: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
     enableIndividualMd: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
     enableAutoCleanup: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;

package/dist/module.json CHANGED Viewed

@@ -4,5 +4,5 @@
   "compatibility": {
     "nuxt": "^3.0.0"
   },
-  "version": "0.1.11"
+  "version": "0.1.13"
 }

package/dist/module.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-export { l as default } from './shared/nuxt-llms-generator.bc139143.mjs';
+export { l as default } from './shared/nuxt-llms-generator.db76a78e.mjs';
 import '@nuxt/kit';
 import 'fs';
 import 'path';

package/dist/shared/{nuxt-llms-generator.bc139143.mjs → nuxt-llms-generator.db76a78e.mjs} RENAMED Viewed

@@ -38,6 +38,7 @@ const LLMSConfigSchema = z.object({
   ).describe("The base URL of the website to append to links in generated llms files"),
   baseSiteUrlUmbracoDataKey: z.string().optional().describe("If the SiteData of UmbracoData has the key with the base URL you can pass here the key to auto extract the base url"),
   maxConcurrent: z.number().int().min(1, "maxConcurrent must be at least 1").max(10, "maxConcurrent should not exceed 10 to avoid rate limits").optional().default(3),
+  maxTokens: z.number().int().min(1e3, "maxTokens must be at least 1000").max(2e5, "maxTokens should not exceed 200000").optional().default(65e3).describe("Maximum tokens for page content before truncation"),
   enableLLMSFullTxt: z.boolean().optional().default(true),
   enableIndividualMd: z.boolean().optional().default(true),
   enableAutoCleanup: z.boolean().optional().default(true),
@@ -225,6 +226,7 @@ function convertHtmlToMarkdownDeep(input) {
 const DEFAULT_OPTIONS = {
   anthropicModel: "claude-3-7-sonnet-latest",
   maxConcurrent: 5,
+  maxTokens: 65e3,
   enableLLMSFullTxt: true,
   enableIndividualMd: true,
   templatesDir: ".llms-templates",
@@ -271,6 +273,7 @@ const llmsModule = defineNuxtModule({
       finalOutputDir: resolve(nuxt.options.rootDir, options.finalOutputDir ?? "public"),
       anthropicModel: options.anthropicModel || DEFAULT_OPTIONS.anthropicModel,
       maxConcurrent: options.maxConcurrent || DEFAULT_OPTIONS.maxConcurrent,
+      maxTokens: options.maxTokens ?? DEFAULT_OPTIONS.maxTokens,
       enableLLMSFullTxt: options.enableLLMSFullTxt ?? DEFAULT_OPTIONS.enableLLMSFullTxt,
       enableIndividualMd: options.enableIndividualMd ?? DEFAULT_OPTIONS.enableIndividualMd,
       enableAutoCleanup: options.enableAutoCleanup ?? DEFAULT_OPTIONS.enableAutoCleanup,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@voicenter-team/nuxt-llms-generator",
-  "version": "0.1.11",
+  "version": "0.1.13",
   "description": "Nuxt 3 module for automatically generating AI-optimized documentation files (llms.txt, llms-full.txt, and individual .md files) from Umbraco CMS data using Anthropic's Claude API.",
   "repository": "https://github.com/VoicenterTeam/nuxt-llms-generator",
   "license": "MIT",
@@ -35,6 +35,7 @@
   "dependencies": {
     "@anthropic-ai/sdk": "^0.30.0",
     "@nuxt/kit": "^3.11.2",
+    "@toon-format/toon": "^2.1.0",
     "@voicenter-team/eslint-config-ts": "^1.0.22",
     "i": "^0.3.7",
     "jsonpath-plus": "^8.0.0",