@voicenter-team/nuxt-llms-generator 0.1.12 → 0.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -269,6 +269,17 @@ users[2]{id,name,role}:
269
269
 
270
270
  **CRITICAL:** Use the EXACT property names shown in TOON \`{braces}\` for your Mustache bindings.
271
271
 
272
+ ---
273
+ ## \u{1F6AB} Content Exclusion Rules
274
+ When generating the template, **DO NOT** create sections for these types of data even if they appear in TOON:
275
+ - **Image/media properties**: URLs to images, avatars, thumbnails, icons, or any media files
276
+ - **UI-only labels**: Search placeholders, filter menu text, "Show More"/"Show Less", pagination labels
277
+ - **SEO/meta fields**: ogTitle, ogDescription, ogImage, canonical URLs, changefreq, priority, sitemap fields
278
+ - **Legal boilerplate**: Copyright text, "All rights reserved"
279
+ - **Navigation chrome**: Breadcrumbs, menu items, footer links \u2014 unless they ARE the primary page content
280
+ - **System identifiers**: Internal IDs, GUIDs, sort orders, node paths, template aliases
281
+ - **Empty/null values**: Skip any property that holds no meaningful value
282
+ **Focus ONLY on**: titles, descriptions, features, pricing, reviews, specifications, contact info, and other business-relevant content.
272
283
  ---
273
284
 
274
285
  ## \u{1F3AF} TRUE PURPOSE: Help LLMs Answer Questions Efficiently
@@ -980,6 +991,11 @@ function getValueType(value) {
980
991
  return typeof value;
981
992
  }
982
993
 
994
+ const PLACEHOLDER_PATTERNS = [
995
+ "lorem ipsum",
996
+ "dolor sit amet",
997
+ "consectetuer adipi"
998
+ ];
983
999
  function shouldGenerateTemplate(umbracoData, urlItem) {
984
1000
  try {
985
1001
  const pageContent = extractPageContent(umbracoData, urlItem.Jpath);
@@ -992,6 +1008,16 @@ function shouldGenerateTemplate(umbracoData, urlItem) {
992
1008
  console.log(`Page ${urlItem.url} is hidden (hidePage: ${hidePage}), skipping template generation`);
993
1009
  return false;
994
1010
  }
1011
+ const title = pageContent.pageTitle ?? pageContent.pageTittle ?? pageContent.ogTitle ?? pageContent.headerBlockTitle;
1012
+ if (!title || title === "undefined" || title === "null") {
1013
+ console.log(`Page ${urlItem.url} has no valid title, skipping template generation`);
1014
+ return false;
1015
+ }
1016
+ const bodyText = JSON.stringify(pageContent).toLowerCase();
1017
+ if (PLACEHOLDER_PATTERNS.some((p) => bodyText.includes(p))) {
1018
+ console.log(`Page ${urlItem.url} contains placeholder text, skipping template generation`);
1019
+ return false;
1020
+ }
995
1021
  return true;
996
1022
  } catch (error) {
997
1023
  console.error(`Error checking visibility for ${urlItem.url}:`, error);
@@ -1207,6 +1233,19 @@ async function performAutomaticCleanup(umbracoData, cacheDir, options = {}) {
1207
1233
  return stats;
1208
1234
  }
1209
1235
 
1236
+ function sanitizeRenderedMarkdown(markdown) {
1237
+ let output = markdown;
1238
+ output = output.replace(/!\[.*?]\(.*?\)/g, "");
1239
+ output = output.replace(/^(#{1,6})\s+\d+,\s*/gm, "$1 ");
1240
+ output = output.replace(/&#x2F;/g, "/").replace(/&#39;/g, "'").replace(/&#x27;/g, "'").replace(/&quot;/g, '"').replace(/&amp;/g, "&").replace(/&#x3D;/g, "=").replace(/&#x60;/g, "`").replace(/&lt;/g, "<").replace(/&gt;/g, ">");
1241
+ output = output.replace(/^- .+?:\s*$/gm, "");
1242
+ output = output.replace(/\[הרחבה]\([^)]*\)/g, "");
1243
+ output = output.replace(/(?<!:)\/{2,}/g, "/");
1244
+ output = output.replace(/^(#{2,6})\s+.+\n(\s*\n)+(?=#{1,6}\s|$)/gm, "");
1245
+ output = output.replace(/\n{3,}/g, "\n\n");
1246
+ return output.trim();
1247
+ }
1248
+
1210
1249
  class TemplateGenerator {
1211
1250
  anthropicClient;
1212
1251
  promptAnalyzer;
@@ -1320,7 +1359,6 @@ class TemplateGenerator {
1320
1359
  if (tokensBeforeTruncation > tokensAfterTruncation) {
1321
1360
  console.warn(`Page ${pageId} content truncated: ${tokensBeforeTruncation} -> ${tokensAfterTruncation} tokens`);
1322
1361
  }
1323
- this.promptAnalyzer.analyzeContent(truncatedContent, urlItem);
1324
1362
  const request = {
1325
1363
  pageContent: truncatedContent,
1326
1364
  templateAlias: urlItem.TemplateAlias,
@@ -1351,7 +1389,14 @@ class TemplateGenerator {
1351
1389
  }
1352
1390
  async renderTemplate(template, data) {
1353
1391
  return withErrorHandling(async () => {
1354
- return Mustache.render(template, data);
1392
+ const originalEscape = Mustache.escape;
1393
+ Mustache.escape = (text) => text;
1394
+ try {
1395
+ const rendered = Mustache.render(template, data);
1396
+ return sanitizeRenderedMarkdown(rendered);
1397
+ } finally {
1398
+ Mustache.escape = originalEscape;
1399
+ }
1355
1400
  }, {
1356
1401
  template: template.substring(0, 200) + "...",
1357
1402
  dataKeys: Object.keys(data)
@@ -1480,7 +1525,8 @@ class LLMSFilesGenerator {
1480
1525
  content += `This website contains comprehensive information about ${siteTitle.toLowerCase()}. The content is organized into the following sections:
1481
1526
 
1482
1527
  `;
1483
- const pagesByCategory = this.groupPagesByCategory(mdFiles);
1528
+ const deduplicatedFiles = this.deduplicateByUrl(mdFiles);
1529
+ const pagesByCategory = this.groupPagesByCategory(deduplicatedFiles);
1484
1530
  for (const [category, pages] of Object.entries(pagesByCategory)) {
1485
1531
  if (pages.length === 0)
1486
1532
  continue;
@@ -1523,7 +1569,8 @@ class LLMSFilesGenerator {
1523
1569
  `;
1524
1570
  }
1525
1571
  content += "---\n\n";
1526
- for (const mdFile of mdFiles) {
1572
+ const deduplicatedFiles = this.deduplicateByUrl(mdFiles);
1573
+ for (const mdFile of deduplicatedFiles) {
1527
1574
  const urlItem = this.umbracoData.urlList.find((item) => item.url === mdFile.url);
1528
1575
  if (!urlItem)
1529
1576
  continue;
@@ -1564,6 +1611,15 @@ class LLMSFilesGenerator {
1564
1611
  * /marketplace -> category "marketplace"
1565
1612
  * / -> category "main"
1566
1613
  */
1614
+ deduplicateByUrl(mdFiles) {
1615
+ const seen = /* @__PURE__ */ new Set();
1616
+ return mdFiles.filter((file) => {
1617
+ if (seen.has(file.url))
1618
+ return false;
1619
+ seen.add(file.url);
1620
+ return true;
1621
+ });
1622
+ }
1567
1623
  groupPagesByCategory(mdFiles) {
1568
1624
  const categories = {};
1569
1625
  for (const mdFile of mdFiles) {
@@ -1609,8 +1665,7 @@ class LLMSFilesGenerator {
1609
1665
  }
1610
1666
  extractSiteTitle() {
1611
1667
  const siteData = this.umbracoData.SiteData;
1612
- const rawTitle = siteData?.pageTitle || siteData?.mainHeaderBlockTitle || "Website Documentation";
1613
- return rawTitle;
1668
+ return siteData?.pageTitle || siteData?.mainHeaderBlockTitle || "Website Documentation";
1614
1669
  }
1615
1670
  extractSiteDescription() {
1616
1671
  const siteData = this.umbracoData.SiteData;
@@ -1633,11 +1688,11 @@ class LLMSFilesGenerator {
1633
1688
  const pageContent = extractPageContent(this.umbracoData, urlItem.Jpath);
1634
1689
  if (!pageContent)
1635
1690
  return `${urlItem.TemplateAlias} page`;
1636
- const desc = pageContent.pageDescription || pageContent.description || pageContent.headerBlockSubtitle;
1637
- if (desc && typeof desc === "string") {
1638
- return desc;
1691
+ const desc = pageContent.pageDescription || pageContent.description || pageContent.headerBlockSubtitle || pageContent.ogDescription;
1692
+ if (desc && typeof desc === "string" && desc.trim().length > 0) {
1693
+ return desc.trim();
1639
1694
  }
1640
- return `Information about ${urlItem.url}`;
1695
+ return `${urlItem.TemplateAlias} page`;
1641
1696
  }
1642
1697
  sanitizeUrlForFilename(url) {
1643
1698
  if (!url || url === "/")
package/dist/module.json CHANGED
@@ -4,5 +4,5 @@
4
4
  "compatibility": {
5
5
  "nuxt": "^3.0.0"
6
6
  },
7
- "version": "0.1.12"
7
+ "version": "0.1.13"
8
8
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@voicenter-team/nuxt-llms-generator",
3
- "version": "0.1.12",
3
+ "version": "0.1.13",
4
4
  "description": "Nuxt 3 module for automatically generating AI-optimized documentation files (llms.txt, llms-full.txt, and individual .md files) from Umbraco CMS data using Anthropic's Claude API.",
5
5
  "repository": "https://github.com/VoicenterTeam/nuxt-llms-generator",
6
6
  "license": "MIT",