npm - @adobe/spacecat-shared-html-analyzer - Versions diffs - 1.2.1 → 1.2.3 - Mend

@adobe/spacecat-shared-html-analyzer 1.2.1 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,17 @@
+# [@adobe/spacecat-shared-html-analyzer-v1.2.3](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.2.2...@adobe/spacecat-shared-html-analyzer-v1.2.3) (2026-02-04)
+### Bug Fixes
+* updated selectors for nav-footer to not exclude the breadcrumbs ([#1309](https://github.com/adobe/spacecat-shared/issues/1309)) ([2c9246c](https://github.com/adobe/spacecat-shared/commit/2c9246c295ab90ab8e2bae39fd5d11d71bba6546))
+# [@adobe/spacecat-shared-html-analyzer-v1.2.2](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.2.1...@adobe/spacecat-shared-html-analyzer-v1.2.2) (2026-01-22)
+### Bug Fixes
+* added option to include noscript tags in server-side html ([#1274](https://github.com/adobe/spacecat-shared/issues/1274)) ([f26e320](https://github.com/adobe/spacecat-shared/commit/f26e3200cc2b129237073da5c7cae1cbfb3ae4b1))
 # [@adobe/spacecat-shared-html-analyzer-v1.2.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.2.0...@adobe/spacecat-shared-html-analyzer-v1.2.1) (2026-01-15)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@adobe/spacecat-shared-html-analyzer",
-  "version": "1.2.1",
+  "version": "1.2.3",
   "description": "Analyze HTML content visibility for AI crawlers and citations - compare static HTML vs fully rendered content",
   "type": "module",
   "engines": {

package/src/analyzer.js CHANGED Viewed

@@ -25,12 +25,20 @@ import { hashDJB2, pct } from './utils.js';
  * @param {string} initHtml - Initial HTML content (what crawlers see)
  * @param {string} finHtml - Final HTML content (what users see)
  * @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer elements
+ * @param {boolean} [includeNoscriptInFinal=false] -
+ * Whether to include noscript content in final HTML
  * @returns {Promise<Object>} Comprehensive analysis results
  */
-export async function analyzeTextComparison(initHtml, finHtml, ignoreNavFooter = true) {
-  // Handle both sync (browser) and async (Node.js) stripTagsToText
-  const initTextResult = stripTagsToText(initHtml, ignoreNavFooter);
-  const finTextResult = stripTagsToText(finHtml, ignoreNavFooter);
+export async function analyzeTextComparison(
+  initHtml,
+  finHtml,
+  ignoreNavFooter = true,
+  includeNoscriptInFinal = false,
+) {
+  // Server-side (initial): Always includes noscript (true) - what crawlers see
+  const initTextResult = stripTagsToText(initHtml, ignoreNavFooter, true);
+  // Client-side (final): Configurable noscript inclusion - what users see
+  const finTextResult = stripTagsToText(finHtml, ignoreNavFooter, includeNoscriptInFinal);
   const initText = await Promise.resolve(initTextResult);
   const finText = await Promise.resolve(finTextResult);
@@ -61,12 +69,20 @@ export async function analyzeTextComparison(initHtml, finHtml, ignoreNavFooter =
  * @param {string} originalHTML - Initial HTML content
  * @param {string} currentHTML - Final HTML content
  * @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer elements
+ * @param {boolean} [includeNoscriptInCurrent=false] -
+ * Whether to include noscript content in current HTML
  * @returns {Promise<Object>} Basic statistics
  */
-export async function calculateStats(originalHTML, currentHTML, ignoreNavFooter = true) {
-  // Handle both sync (browser) and async (Node.js) stripTagsToText
-  const originalTextResult = stripTagsToText(originalHTML, ignoreNavFooter);
-  const currentTextResult = stripTagsToText(currentHTML, ignoreNavFooter);
+export async function calculateStats(
+  originalHTML,
+  currentHTML,
+  ignoreNavFooter = true,
+  includeNoscriptInCurrent = false,
+) {
+  // Server-side (original): Always includes noscript (true) - what crawlers see
+  const originalTextResult = stripTagsToText(originalHTML, ignoreNavFooter, true);
+  // Client-side (current): Configurable noscript inclusion - what users see
+  const currentTextResult = stripTagsToText(currentHTML, ignoreNavFooter, includeNoscriptInCurrent);
   const originalText = await Promise.resolve(originalTextResult);
   const currentText = await Promise.resolve(currentTextResult);
@@ -103,14 +119,30 @@ export async function calculateStats(originalHTML, currentHTML, ignoreNavFooter
  * Calculate stats for both nav/footer scenarios
  * @param {string} originalHTML - Initial HTML content
  * @param {string} currentHTML - Final HTML content
+ * @param {boolean} [includeNoscriptInCurrent=false] -
+ * Whether to include noscript content in current HTML
  * @returns {Promise<Object>} Analysis results for both scenarios
  */
-export async function calculateBothScenarioStats(originalHTML, currentHTML) {
+export async function calculateBothScenarioStats(
+  originalHTML,
+  currentHTML,
+  includeNoscriptInCurrent = false,
+) {
   // Calculate stats with nav/footer ignored
-  const statsIgnored = await calculateStats(originalHTML, currentHTML, true);
+  const statsIgnored = await calculateStats(
+    originalHTML,
+    currentHTML,
+    true,
+    includeNoscriptInCurrent,
+  );
   // Calculate stats without nav/footer ignored
-  const statsNotIgnored = await calculateStats(originalHTML, currentHTML, false);
+  const statsNotIgnored = await calculateStats(
+    originalHTML,
+    currentHTML,
+    false,
+    includeNoscriptInCurrent,
+  );
   return {
     withNavFooterIgnored: {
       wordCountBefore: statsIgnored.wordCountBefore,

package/src/html-filter.js CHANGED Viewed

@@ -30,8 +30,6 @@ const NAVIGATION_FOOTER_SELECTOR = [
   // Header/footer classes
   '.header', '.site-header', '.page-header', '.top-header', '.header-wrapper',
   '.footer', '.site-footer', '.page-footer', '.bottom-footer', '.footer-wrapper',
-  // Breadcrumb navigation
-  '.breadcrumb', '.breadcrumbs',
   // Common ID selectors
   '#nav', '#navigation', '#navbar', '#header', '#footer', '#menu', '#main-menu',
   '#site-header', '#site-footer', '#page-header', '#page-footer',
@@ -182,16 +180,17 @@ function filterNavigationAndFooterCheerio($) {
  * @param {string} htmlContent - Raw HTML content
  * @param {boolean} ignoreNavFooter - Whether to remove navigation/footer elements
  * @param {boolean} returnText - Whether to return text only
+ * @param {boolean} includeNoscript - Whether to include noscript elements (false excludes them)
  * @returns {string} Filtered content
  */
-function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
+function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText, includeNoscript) {
   const parser = new DOMParser(); // eslint-disable-line no-undef
   const doc = parser.parseFromString(htmlContent, 'text/html');
   // Process the entire document to capture JSON-LD in both head and body
   const documentElement = doc.documentElement || doc;
-  // Remove script elements except JSON-LD, also remove style, noscript, template
+  // Remove script elements except JSON-LD, also remove style, template
   documentElement.querySelectorAll('script').forEach((n) => {
     // Preserve JSON-LD structured data scripts by converting them to code blocks
     if (n.type === 'application/ld+json') {
@@ -234,7 +233,12 @@ function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
     }
     n.remove();
   });
-  documentElement.querySelectorAll('style,noscript,template').forEach((n) => n.remove());
+  if (includeNoscript) {
+    documentElement.querySelectorAll('style,template').forEach((n) => n.remove());
+  } else {
+    documentElement.querySelectorAll('noscript,style,template').forEach((n) => n.remove());
+  }
   // Remove all media elements (images, videos, audio, etc.) to keep only text
   const mediaSelector = 'img,video,audio,picture,svg,canvas,embed,object,iframe';
@@ -259,9 +263,10 @@ function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
  * @param {string} htmlContent - Raw HTML content
  * @param {boolean} ignoreNavFooter - Whether to remove navigation/footer elements
  * @param {boolean} returnText - Whether to return text only
+ * @param {boolean} includeNoscript - Whether to include noscript elements (false excludes them)
  * @returns {Promise<string>} Filtered content
  */
-async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {
+async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText, includeNoscript) {
   let cheerio;
   try {
     cheerio = await import('cheerio');
@@ -305,7 +310,12 @@ async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {
       $(this).remove();
     }
   });
-  $('style, noscript, template').remove();
+  if (includeNoscript) {
+    $('style, template').remove();
+  } else {
+    $('style, noscript, template').remove();
+  }
   // Remove all media elements (images, videos, audio, etc.) to keep only text
   $('img, video, audio, picture, svg, canvas, embed, object, iframe').remove();
@@ -330,45 +340,54 @@ async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {
 /**
  * Filter HTML content by removing unwanted elements
  * @param {string} htmlContent - Raw HTML content
- * @param {boolean} ignoreNavFooter - Whether to remove navigation/footer elements
- * @param {boolean} returnText - Whether to return text only (true) or filtered HTML (false)
+ * @param {boolean} [ignoreNavFooter=true] - Whether to remove navigation/footer elements
+ * @param {boolean} [returnText=true] - Whether to return text only (true) or filtered HTML (false)
+ * @param {boolean} [includeNoscript=false] - Whether to include noscript elements
  * @returns {string|Promise<string>} Filtered content (sync in browser, async in Node.js)
  */
-export function filterHtmlContent(htmlContent, ignoreNavFooter = true, returnText = true) {
+export function filterHtmlContent(
+  htmlContent,
+  ignoreNavFooter = true,
+  returnText = true,
+  includeNoscript = false,
+) {
   if (!htmlContent) return '';
   // Browser environment (DOMParser) - works in Chrome extensions too - SYNCHRONOUS
   if (isBrowser()) {
-    return filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText);
+    return filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText, includeNoscript);
   }
   // Node.js environment (cheerio) - dynamic import to avoid bundling issues - ASYNCHRONOUS
-  return filterHtmlNode(htmlContent, ignoreNavFooter, returnText);
+  return filterHtmlNode(htmlContent, ignoreNavFooter, returnText, includeNoscript);
 }
 /**
  * Strip HTML tags and return plain text
+ *
  * @param {string} htmlContent - Raw HTML content
- * @param {boolean} ignoreNavFooter - Whether to remove navigation/footer elements
+ * @param {boolean} [ignoreNavFooter=true] - Whether to remove navigation/footer elements
+ * @param {boolean} [includeNoscript=false] - Whether to include noscript elements
  * @returns {string|Promise<string>} Plain text content (sync in browser, async in Node.js)
  */
-export function stripTagsToText(htmlContent, ignoreNavFooter = true) {
-  return filterHtmlContent(htmlContent, ignoreNavFooter, true);
+export function stripTagsToText(htmlContent, ignoreNavFooter = true, includeNoscript = false) {
+  return filterHtmlContent(htmlContent, ignoreNavFooter, true, includeNoscript);
 }
 /**
  * Extract word count from HTML content
  * @param {string} htmlContent - Raw HTML content
- * @param {boolean} ignoreNavFooter - Whether to ignore navigation/footer
+ * @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer
+ * @param {boolean} [includeNoscript=false] - Whether to include noscript elements
  * @returns {Object|Promise<Object>} Object with word_count property
  *   (sync in browser, async in Node.js)
  */
-export function extractWordCount(htmlContent, ignoreNavFooter = true) {
+export function extractWordCount(htmlContent, ignoreNavFooter = true, includeNoscript = false) {
   if (!htmlContent) {
     return { word_count: 0 };
   }
-  const textContent = stripTagsToText(htmlContent, ignoreNavFooter);
+  const textContent = stripTagsToText(htmlContent, ignoreNavFooter, includeNoscript);
   // Handle both sync (browser) and async (Node.js) cases
   if (textContent && typeof textContent.then === 'function') {

package/src/index.d.ts CHANGED Viewed

@@ -90,17 +90,30 @@ export function generateDiffReport(initText: string, finText: string, mode?: "wo
 /**
  * Filter HTML content by removing unwanted elements
  */
-export function filterHtmlContent(htmlContent: string, ignoreNavFooter?: boolean, returnText?: boolean): Promise<string>;
+export function filterHtmlContent(
+  htmlContent: string,
+  ignoreNavFooter?: boolean,
+  returnText?: boolean,
+  includeNoscript?: boolean
+): Promise<string>;
 /**
  * Extract plain text from HTML content
  */
-export function stripTagsToText(htmlContent: string, ignoreNavFooter?: boolean): Promise<string>;
+export function stripTagsToText(
+  htmlContent: string,
+  ignoreNavFooter?: boolean,
+  includeNoscript?: boolean
+): Promise<string>;
 /**
  * Extract word count from HTML content
  */
-export function extractWordCount(htmlContent: string, ignoreNavFooter?: boolean): Promise<{ word_count: number }>;
+export function extractWordCount(
+  htmlContent: string,
+  ignoreNavFooter?: boolean,
+  includeNoscript?: boolean
+): Promise<{ word_count: number }>;
 /**
  * Remove navigation and footer elements from DOM element (browser environment)
@@ -150,28 +163,42 @@ interface BothScenariosStats {
 /**
  * Comprehensive text-only analysis between initial and final HTML (original chrome extension logic)
+ * @param initHtml - Initial HTML content (what crawlers/bots see - server-side rendered)
+ * @param finHtml - Final HTML content (what users see - client-side rendered)
+ * @param ignoreNavFooter - Whether to ignore navigation/footer elements
+ * @param includeNoscriptInFinal - Whether to include noscript content in final HTML (client-side)
  */
 export function analyzeTextComparison(
   initHtml: string,
   finHtml: string,
-  ignoreNavFooter?: boolean
+  ignoreNavFooter?: boolean,
+  includeNoscriptInFinal?: boolean
 ): Promise<TextComparison>;
 /**
  * Calculate basic stats from HTML comparison (original chrome extension logic)
+ * @param originalHTML - Initial HTML content (server-side)
+ * @param currentHTML - Final HTML content (client-side)
+ * @param ignoreNavFooter - Whether to ignore navigation/footer elements
+ * @param includeNoscriptInCurrent - Whether to include noscript content in current HTML (client-side)
  */
 export function calculateStats(
   originalHTML: string,
   currentHTML: string,
-  ignoreNavFooter?: boolean
+  ignoreNavFooter?: boolean,
+  includeNoscriptInCurrent?: boolean
 ): Promise<BasicStats>;
 /**
  * Calculate stats for both nav/footer scenarios (original chrome extension logic)
+ * @param originalHTML - Initial HTML content (server-side)
+ * @param currentHTML - Final HTML content (client-side)
+ * @param includeNoscriptInCurrent - Whether to include noscript content in current HTML (client-side)
  */
 export function calculateBothScenarioStats(
   originalHTML: string,
-  currentHTML: string
+  currentHTML: string,
+  includeNoscriptInCurrent?: boolean
 ): Promise<BothScenariosStats>;
 /** MARKDOWN DIFF FUNCTIONS */

package/test/index.test.js CHANGED Viewed

@@ -46,6 +46,35 @@ describe('HTML Visibility Analyzer', () => {
       expect(result.initialText).to.equal('');
       expect(result.finalText.length).to.be.greaterThan(0);
     });
+    it('should include noscript in initial HTML and exclude in final HTML by default', async () => {
+      const initHtml = '<html><body><h1>Title</h1><noscript>Enable JS</noscript><p>Content</p></body></html>';
+      const finHtml = '<html><body><h1>Title</h1><noscript>Enable JS</noscript><p>Content</p><div>Extra</div></body></html>';
+      const result = await analyzeTextComparison(initHtml, finHtml);
+      // Initial text should include noscript content
+      expect(result.initialText).to.include('Enable JS');
+      // Final text should NOT include noscript content by default
+      expect(result.finalText).to.not.include('Enable JS');
+      // Both should have the main content
+      expect(result.initialText).to.include('Title');
+      expect(result.finalText).to.include('Title');
+    });
+    it('should include noscript in final HTML when includeNoscriptInFinal is true', async () => {
+      const initHtml = '<html><body><h1>Title</h1><noscript>Enable JS</noscript><p>Content</p></body></html>';
+      const finHtml = '<html><body><h1>Title</h1><noscript>Enable JS</noscript><p>Content</p><div>Extra</div></body></html>';
+      const result = await analyzeTextComparison(initHtml, finHtml, true, true);
+      // Initial text should include noscript content
+      expect(result.initialText).to.include('Enable JS');
+      // Final text should ALSO include noscript content when flag is true
+      expect(result.finalText).to.include('Enable JS');
+      // Both should have the main content
+      expect(result.initialText).to.include('Title');
+      expect(result.finalText).to.include('Title');
+      expect(result.finalText).to.include('Extra');
+    });
   });
   describe('calculateStats', () => {
@@ -64,6 +93,41 @@ describe('HTML Visibility Analyzer', () => {
       expect(result.contentIncreaseRatio).to.be.a('number');
       expect(result.citationReadability).to.be.a('number');
     });
+    it('should handle noscript elements correctly in word counts by default', async () => {
+      const originalHtml = '<html><body><h1>Title</h1><noscript>Enable JavaScript</noscript><p>Original content</p></body></html>';
+      const currentHtml = '<html><body><h1>Title</h1><noscript>Enable JavaScript</noscript><p>Original content</p><p>New content</p></body></html>';
+      const result = await calculateStats(originalHtml, currentHtml);
+      // Word counts reflect includeNoscript; text is concatenated without spaces:
+      // originalText includes noscript (includeNoscript=true):
+      //     "TitleEnable JavaScriptOriginal content"
+      // currentText excludes noscript (includeNoscript=false):
+      //     "TitleOriginal contentNew content"
+      expect(result.wordCountBefore).to.be.greaterThan(0);
+      expect(result.wordCountAfter).to.be.greaterThan(0);
+      expect(result.contentIncreaseRatio).to.be.a('number');
+    });
+    it('should include noscript in current HTML when includeNoscriptInCurrent is true', async () => {
+      const originalHtml = '<html><body><h1>Title</h1><noscript>Enable JavaScript</noscript><p>Original content</p></body></html>';
+      const currentHtml = '<html><body><h1>Title</h1><noscript>Enable JavaScript</noscript><p>Original content</p><p>New content</p></body></html>';
+      const resultWithout = await calculateStats(originalHtml, currentHtml, true, false);
+      const resultWith = await calculateStats(originalHtml, currentHtml, true, true);
+      // When noscript is excluded from current, word count should be lower
+      expect(resultWithout.wordCountAfter).to.be.lessThan(resultWith.wordCountAfter);
+      // Note: Text extraction concatenates without spaces, so words merge
+      // originalHtml with noscript: "TitleEnable JavaScriptOriginal content" = 3 words
+      // originalHtml without noscript: "TitleOriginal content" = 2 words
+      // currentHtml without noscript: "TitleOriginal contentNew content" = 3 words
+      // currentHtml with noscript: "TitleEnable JavaScriptOriginal contentNew content" = 4 words
+      expect(resultWithout.wordCountBefore).to.equal(3);
+      expect(resultWithout.wordCountAfter).to.equal(3);
+      expect(resultWith.wordCountBefore).to.equal(3);
+      expect(resultWith.wordCountAfter).to.equal(4);
+    });
   });
   describe('calculateBothScenarioStats', () => {
@@ -118,5 +182,55 @@ describe('HTML Visibility Analyzer', () => {
       expect(text).to.include('Navigation');
       expect(text).to.include('Footer');
     });
+    it('should remove noscript elements by default', async () => {
+      const html = '<html><body><h1>Title</h1><noscript>Please enable JavaScript</noscript><p>Content</p></body></html>';
+      const text = await stripTagsToText(html);
+      expect(text).to.include('Title');
+      expect(text).to.include('Content');
+      expect(text).to.not.include('Please enable JavaScript');
+      expect(text).to.not.include('noscript');
+    });
+    it('should remove noscript elements when includeNoscript is false', async () => {
+      const html = '<html><body><h1>Title</h1><noscript>Noscript content</noscript><p>Regular content</p></body></html>';
+      const text = await stripTagsToText(html, true, false);
+      expect(text).to.include('Title');
+      expect(text).to.include('Regular content');
+      expect(text).to.not.include('Noscript content');
+    });
+    it('should keep noscript elements when includeNoscript is true', async () => {
+      const html = '<html><body><h1>Title</h1><noscript>Noscript fallback</noscript><p>Regular content</p></body></html>';
+      const text = await stripTagsToText(html, true, true);
+      expect(text).to.include('Title');
+      expect(text).to.include('Regular content');
+      expect(text).to.include('Noscript fallback');
+    });
+    it('should handle multiple noscript elements with includeNoscript', async () => {
+      const html = `<html><body>
+        <h1>Title</h1>
+        <noscript>First noscript</noscript>
+        <p>Content</p>
+        <noscript>Second noscript</noscript>
+      </body></html>`;
+      const textWithout = await stripTagsToText(html, true, false);
+      const textWith = await stripTagsToText(html, true, true);
+      expect(textWithout).to.include('Title');
+      expect(textWithout).to.include('Content');
+      expect(textWithout).to.not.include('First noscript');
+      expect(textWithout).to.not.include('Second noscript');
+      expect(textWith).to.include('Title');
+      expect(textWith).to.include('Content');
+      expect(textWith).to.include('First noscript');
+      expect(textWith).to.include('Second noscript');
+    });
   });
 });