npm - @adobe/spacecat-shared-html-analyzer - Versions diffs - 1.0.0 - Mend

@adobe/spacecat-shared-html-analyzer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/rollup.config.js ADDED Viewed

@@ -0,0 +1,52 @@
+/*
+ * Copyright 2025 Adobe. All rights reserved.
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. You may obtain a copy
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
+ * OF ANY KIND, either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+import { nodeResolve } from '@rollup/plugin-node-resolve';
+import terser from '@rollup/plugin-terser';
+export default {
+  input: 'src/browser-entry.js', // Special browser entry point
+  output: [
+    {
+      file: 'dist/html-analyzer.js',
+      format: 'iife', // Immediately Invoked Function Expression for browsers
+      name: 'HTMLAnalyzer',
+      globals: {
+        // No external dependencies in browser bundle
+      },
+    },
+    {
+      file: 'dist/html-analyzer.min.js',
+      format: 'iife',
+      name: 'HTMLAnalyzer',
+      plugins: [terser()], // Minified version
+      globals: {
+        // No external dependencies in browser bundle
+      },
+    },
+  ],
+  plugins: [
+    nodeResolve({
+      browser: true, // Use browser field in package.json
+      preferBuiltins: false, // Don't include Node.js built-ins
+    }),
+  ],
+  external: [
+    // Exclude cheerio from bundle - it won't work in browser anyway
+    'cheerio',
+  ],
+  onwarn(warning, warn) {
+    // Suppress warnings about dynamic imports that we'll handle
+    if (warning.code === 'UNRESOLVED_IMPORT') return;
+    warn(warning);
+  },
+};

package/src/analyzer.js ADDED Viewed

@@ -0,0 +1,126 @@
+/*
+ * Copyright 2025 Adobe. All rights reserved.
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. You may obtain a copy
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
+ * OF ANY KIND, either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+/**
+ * Content analysis and metrics calculation
+ * Provides comprehensive analysis of HTML content differences
+ */
+import { stripTagsToText } from './html-filter.js';
+import { tokenize } from './tokenizer.js';
+import { generateDiffReport } from './diff-engine.js';
+import { hashDJB2, pct } from './utils.js';
+/**
+ * Comprehensive text-only analysis between initial and final HTML
+ * @param {string} initHtml - Initial HTML content (what crawlers see)
+ * @param {string} finHtml - Final HTML content (what users see)
+ * @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer elements
+ * @returns {Promise<Object>} Comprehensive analysis results
+ */
+export async function analyzeTextComparison(initHtml, finHtml, ignoreNavFooter = true) {
+  // Handle both sync (browser) and async (Node.js) stripTagsToText
+  const initTextResult = stripTagsToText(initHtml, ignoreNavFooter);
+  const finTextResult = stripTagsToText(finHtml, ignoreNavFooter);
+  const initText = await Promise.resolve(initTextResult);
+  const finText = await Promise.resolve(finTextResult);
+  const initTextLength = initText.length;
+  const finTextLength = finText.length;
+  const textRetention = finTextLength > 0 ? initTextLength / finTextLength : 0;
+  const wordDiff = generateDiffReport(initText, finText, 'word');
+  const lineDiff = generateDiffReport(initText, finText, 'line');
+  return {
+    initialText: initText,
+    finalText: finText,
+    initialTextLength: initTextLength,
+    finalTextLength: finTextLength,
+    textRetention,
+    textRetentionPercent: pct(textRetention),
+    wordDiff,
+    lineDiff,
+    initialTextHash: hashDJB2(initText),
+    finalTextHash: hashDJB2(finText),
+  };
+}
+/**
+ * Calculate basic stats from HTML comparison
+ * @param {string} originalHTML - Initial HTML content
+ * @param {string} currentHTML - Final HTML content
+ * @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer elements
+ * @returns {Promise<Object>} Basic statistics
+ */
+export async function calculateStats(originalHTML, currentHTML, ignoreNavFooter = true) {
+  // Handle both sync (browser) and async (Node.js) stripTagsToText
+  const originalTextResult = stripTagsToText(originalHTML, ignoreNavFooter);
+  const currentTextResult = stripTagsToText(currentHTML, ignoreNavFooter);
+  const originalText = await Promise.resolve(originalTextResult);
+  const currentText = await Promise.resolve(currentTextResult);
+  // Calculate word counts using consistent tokenization
+  const originalTokens = tokenize(originalText, 'word');
+  const currentTokens = tokenize(currentText, 'word');
+  const wordDiff = Math.abs(currentTokens.length - originalTokens.length);
+  // Calculate content increase ratio (how many times content increased)
+  let contentIncreaseRatio;
+  if (originalTokens.length > 0) {
+    contentIncreaseRatio = currentTokens.length / originalTokens.length;
+  } else {
+    contentIncreaseRatio = currentTokens.length > 0 ? currentTokens.length : 1;
+  }
+  // Calculate citation readability (percentage of original content available in current)
+  const citationReadability = currentTokens.length > 0
+    ? Math.min(100, (originalTokens.length / currentTokens.length) * 100) : 100;
+  return {
+    wordDiff,
+    contentIncreaseRatio: Math.round(contentIncreaseRatio * 100) / 100, // Round to 1 decimal place
+    citationReadability: Math.round(citationReadability),
+  };
+}
+/**
+ * Calculate stats for both nav/footer scenarios
+ * @param {string} originalHTML - Initial HTML content
+ * @param {string} currentHTML - Final HTML content
+ * @returns {Promise<Object>} Analysis results for both scenarios
+ */
+export async function calculateBothScenarioStats(originalHTML, currentHTML) {
+  // Calculate stats with nav/footer ignored
+  const statsIgnored = await calculateStats(originalHTML, currentHTML, true);
+  // Calculate stats without nav/footer ignored
+  const statsNotIgnored = await calculateStats(originalHTML, currentHTML, false);
+  return {
+    withNavFooterIgnored: {
+      wordDiff: statsIgnored.wordDiff,
+      contentIncreaseRatio: statsIgnored.contentIncreaseRatio,
+      citationReadability: statsIgnored.citationReadability,
+      contentGain: `${Math.round(statsIgnored.contentIncreaseRatio * 10) / 10}x`,
+      missingWords: statsIgnored.wordDiff,
+    },
+    withoutNavFooterIgnored: {
+      wordDiff: statsNotIgnored.wordDiff,
+      contentIncreaseRatio: statsNotIgnored.contentIncreaseRatio,
+      citationReadability: statsNotIgnored.citationReadability,
+      contentGain: `${Math.round(statsNotIgnored.contentIncreaseRatio * 10) / 10}x`,
+      missingWords: statsNotIgnored.wordDiff,
+    },
+  };
+}

package/src/browser-entry.js ADDED Viewed

@@ -0,0 +1,92 @@
+/*
+ * Copyright 2025 Adobe. All rights reserved.
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. You may obtain a copy
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
+ * OF ANY KIND, either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+/**
+ * Browser Entry Point for Chrome Extension
+ * Exposes all necessary functions for Chrome extension usage
+ *
+ * This bundle excludes Node.js specific code (cheerio) and creates
+ * a browser-compatible version for Chrome extensions.
+ */
+// Import only browser-compatible functions
+import {
+  analyzeTextComparison,
+  calculateStats,
+  calculateBothScenarioStats,
+  stripTagsToText,
+  filterHtmlContent,
+  extractWordCount,
+  filterNavigationAndFooterBrowser,
+  tokenize,
+  countWords,
+  countLines,
+  diffTokens,
+  generateDiffReport,
+  hashDJB2,
+  pct,
+  formatNumberToK,
+  isBrowser,
+} from './index.js';
+// Create global object for Chrome extension
+const HTMLAnalyzer = {
+  // Core analysis functions (matching Chrome extension API)
+  analyzeTextComparison,
+  calculateStats,
+  calculateBothScenarioStats,
+  // HTML processing functions
+  stripTagsToText,
+  filterHtmlContent,
+  extractWordCount,
+  filterNavigationAndFooterBrowser,
+  // Text processing functions
+  tokenize,
+  countWords,
+  countLines,
+  // Diff engine functions
+  diffTokens,
+  generateDiffReport,
+  // Utility functions
+  hashDJB2,
+  pct,
+  formatNumberToK,
+  isBrowser,
+  // Version info
+  version: '1.0.0',
+  buildFor: 'chrome-extension',
+};
+// Make available globally for Chrome extension script tags
+// This needs to be executed immediately when the bundle loads
+/* eslint-env browser */
+/* global window, self */
+(function setGlobal() {
+  // Determine the global object (works in browser, Node.js, Web Workers)
+  const globalObject = (function getGlobalObject() {
+    if (typeof window !== 'undefined') return window;
+    if (typeof globalThis !== 'undefined') return globalThis;
+    if (typeof self !== 'undefined') return self;
+    return this || {};
+  }());
+  // Assign to global scope
+  globalObject.HTMLAnalyzer = HTMLAnalyzer;
+}());
+// Export for ES modules
+export default HTMLAnalyzer;

package/src/diff-engine.js ADDED Viewed

@@ -0,0 +1,184 @@
+/*
+ * Copyright 2025 Adobe. All rights reserved.
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. You may obtain a copy
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
+ * OF ANY KIND, either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+/**
+ * LCS-based diff engine for text comparison
+ * Provides efficient algorithms for finding differences between text content
+ */
+import { tokenize } from './tokenizer.js';
+/**
+ * Generate LCS-based diff between two strings
+ * @param {string} aStr - First string to compare
+ * @param {string} bStr - Second string to compare
+ * @param {string} [mode="word"] - Tokenization mode: "word" or "line"
+ * @returns {Array} Array of diff operations: {type: 'same'|'add'|'del', text: string}
+ */
+export function diffTokens(aStr, bStr, mode = 'word') {
+  const A = tokenize(aStr, mode);
+  const B = tokenize(bStr, mode);
+  // Map tokens to integers for faster LCS computation
+  const sym = new Map();
+  const mapTok = (t) => {
+    if (!sym.has(t)) sym.set(t, sym.size + 1);
+    return sym.get(t);
+  };
+  const a = A.map(mapTok);
+  const b = B.map(mapTok);
+  // Build LCS length table using space-optimized dynamic programming
+  const m = a.length;
+  const n = b.length;
+  // Optimize by using the smaller dimension for rolling arrays
+  let useTransposed = false;
+  let rows;
+  let cols;
+  let aTokens;
+  let bTokens;
+  let aMapped;
+  let bMapped;
+  if (m <= n) {
+    rows = m;
+    cols = n;
+    aTokens = A;
+    bTokens = B;
+    aMapped = a;
+    bMapped = b;
+  } else {
+    // Transpose to use smaller dimension (swap A and B)
+    rows = n;
+    cols = m;
+    aTokens = B;
+    bTokens = A;
+    aMapped = b;
+    bMapped = a;
+    useTransposed = true;
+  }
+  // Use rolling arrays: only need current and previous row
+  let prev = new Array(cols + 1).fill(0);
+  let curr = new Array(cols + 1).fill(0);
+  // Build LCS length table with rolling arrays
+  for (let i = 1; i <= rows; i += 1) {
+    for (let j = 1; j <= cols; j += 1) {
+      curr[j] = (aMapped[i - 1] === bMapped[j - 1])
+        ? prev[j - 1] + 1
+        : Math.max(prev[j], curr[j - 1]);
+    }
+    // Swap arrays for next iteration
+    const temp = curr;
+    curr = prev;
+    prev = temp;
+  }
+  // Rebuild LCS table for backtracking - using smaller dimension first
+  const dp = Array(rows + 1).fill(0).map(() => Array(cols + 1).fill(0));
+  for (let i = 1; i <= rows; i += 1) {
+    for (let j = 1; j <= cols; j += 1) {
+      dp[i][j] = (aMapped[i - 1] === bMapped[j - 1])
+        ? dp[i - 1][j - 1] + 1
+        : Math.max(dp[i - 1][j], dp[i][j - 1]);
+    }
+  }
+  // Backtrack to generate diff operations
+  const ops = [];
+  let i = rows;
+  let j = cols;
+  while (i > 0 && j > 0) {
+    if (aMapped[i - 1] === bMapped[j - 1]) {
+      ops.push({ type: 'same', text: aTokens[i - 1] });
+      i -= 1;
+      j -= 1;
+    } else if (dp[i - 1][j] >= dp[i][j - 1]) {
+      ops.push({ type: 'del', text: aTokens[i - 1] });
+      i -= 1;
+    } else {
+      ops.push({ type: 'add', text: bTokens[j - 1] });
+      j -= 1;
+    }
+  }
+  // Handle remaining tokens
+  while (i > 0) {
+    ops.push({ type: 'del', text: aTokens[i - 1] });
+    i -= 1;
+  }
+  while (j > 0) {
+    ops.push({ type: 'add', text: bTokens[j - 1] });
+    j -= 1;
+  }
+  // If we transposed, we need to swap add/del operations back
+  if (useTransposed) {
+    for (let opIndex = 0; opIndex < ops.length; opIndex += 1) {
+      const op = ops[opIndex];
+      if (op.type === 'add') {
+        ops[opIndex] = { ...op, type: 'del' };
+      } else if (op.type === 'del') {
+        ops[opIndex] = { ...op, type: 'add' };
+      }
+    }
+  }
+  ops.reverse();
+  return ops;
+}
+/**
+ * Generate comprehensive diff report with statistics
+ * @param {string} initText - Initial text (before changes)
+ * @param {string} finText - Final text (after changes)
+ * @param {string} [mode="word"] - Tokenization mode: "word" or "line"
+ * @returns {Object} Diff report with counts and operations
+ */
+export function generateDiffReport(initText, finText, mode = 'word') {
+  if (!initText || !finText) {
+    return {
+      addCount: 0,
+      delCount: 0,
+      sameCount: 0,
+      diffOps: [],
+      summary: 'No text to compare',
+    };
+  }
+  const ops = diffTokens(initText, finText, mode);
+  let addCount = 0;
+  let delCount = 0;
+  let sameCount = 0;
+  ops.forEach((op) => {
+    if (op.type === 'add') {
+      addCount += 1;
+    } else if (op.type === 'del') {
+      delCount += 1;
+    } else {
+      sameCount += 1;
+    }
+  });
+  return {
+    addCount,
+    delCount,
+    sameCount,
+    diffOps: ops,
+    summary: `Added: ${addCount.toLocaleString()} • Removed: ${delCount.toLocaleString()} • Same: ${sameCount.toLocaleString()} • Granularity: ${mode}`,
+  };
+}