@adobe/spacecat-shared-html-analyzer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,52 @@
1
+ /*
2
+ * Copyright 2025 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
+ import { nodeResolve } from '@rollup/plugin-node-resolve';
14
+ import terser from '@rollup/plugin-terser';
15
+
16
/**
 * Builds one IIFE output entry for the browser bundle.
 * Both bundles expose the same global name and declare no globals mapping;
 * only the target file and the optional output plugins differ.
 * @param {string} file - Destination path of the generated bundle
 * @param {Object} [extra={}] - Additional output options (e.g. output plugins)
 * @returns {Object} Rollup output options
 */
const browserBundle = (file, extra = {}) => ({
  file,
  format: 'iife', // Immediately Invoked Function Expression for browsers
  name: 'HTMLAnalyzer',
  ...extra,
  globals: {
    // No external dependencies in browser bundle
  },
});

export default {
  // Special browser entry point
  input: 'src/browser-entry.js',
  output: [
    // Readable build
    browserBundle('dist/html-analyzer.js'),
    // Minified build
    browserBundle('dist/html-analyzer.min.js', { plugins: [terser()] }),
  ],
  plugins: [
    nodeResolve({
      // Use browser field in package.json
      browser: true,
      // Don't include Node.js built-ins
      preferBuiltins: false,
    }),
  ],
  // Exclude cheerio from bundle - it won't work in browser anyway
  external: ['cheerio'],
  onwarn(warning, warn) {
    // Unresolved-import warnings are dropped on purpose (dynamic imports are
    // handled elsewhere); every other warning is forwarded unchanged.
    if (warning.code !== 'UNRESOLVED_IMPORT') {
      warn(warning);
    }
  },
};
@@ -0,0 +1,126 @@
1
+ /*
2
+ * Copyright 2025 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
+ /**
14
+ * Content analysis and metrics calculation
15
+ * Provides comprehensive analysis of HTML content differences
16
+ */
17
+
18
+ import { stripTagsToText } from './html-filter.js';
19
+ import { tokenize } from './tokenizer.js';
20
+ import { generateDiffReport } from './diff-engine.js';
21
+ import { hashDJB2, pct } from './utils.js';
22
+
23
/**
 * Comprehensive text-only analysis between initial and final HTML
 * @param {string} initHtml - Initial HTML content (what crawlers see)
 * @param {string} finHtml - Final HTML content (what users see)
 * @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer elements
 * @returns {Promise<Object>} Analysis results: both extracted texts, their lengths,
 *   the initial/final length ratio (raw and formatted via pct), word- and line-level
 *   diff reports, and a DJB2 hash of each text
 */
export async function analyzeTextComparison(initHtml, finHtml, ignoreNavFooter = true) {
  // stripTagsToText may be sync (browser) or async (Node.js). Promise.all accepts
  // plain values as well as promises, and it observes BOTH results: if the two
  // extractions reject, neither rejection is left unhandled.
  const [initText, finText] = await Promise.all([
    stripTagsToText(initHtml, ignoreNavFooter),
    stripTagsToText(finHtml, ignoreNavFooter),
  ]);

  const initTextLength = initText.length;
  const finTextLength = finText.length;
  // Ratio of initial to final text length; defined as 0 when the final text is empty
  const textRetention = finTextLength > 0 ? initTextLength / finTextLength : 0;

  const wordDiff = generateDiffReport(initText, finText, 'word');
  const lineDiff = generateDiffReport(initText, finText, 'line');

  return {
    initialText: initText,
    finalText: finText,
    initialTextLength: initTextLength,
    finalTextLength: finTextLength,
    textRetention,
    textRetentionPercent: pct(textRetention),
    wordDiff,
    lineDiff,
    initialTextHash: hashDJB2(initText),
    finalTextHash: hashDJB2(finText),
  };
}
58
+
59
/**
 * Calculate basic stats from HTML comparison
 * @param {string} originalHTML - Initial HTML content
 * @param {string} currentHTML - Final HTML content
 * @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer elements
 * @returns {Promise<Object>} Basic statistics: `wordDiff` (absolute word-count
 *   difference), `contentIncreaseRatio` (current/original word count, two decimals)
 *   and `citationReadability` (integer percentage, capped at 100)
 */
export async function calculateStats(originalHTML, currentHTML, ignoreNavFooter = true) {
  // stripTagsToText may be sync (browser) or async (Node.js). Promise.all accepts
  // plain values as well as promises, and it observes BOTH results: if the two
  // extractions reject, neither rejection is left unhandled.
  const [originalText, currentText] = await Promise.all([
    stripTagsToText(originalHTML, ignoreNavFooter),
    stripTagsToText(currentHTML, ignoreNavFooter),
  ]);

  // Calculate word counts using consistent tokenization
  const originalTokens = tokenize(originalText, 'word');
  const currentTokens = tokenize(currentText, 'word');
  const wordDiff = Math.abs(currentTokens.length - originalTokens.length);

  // Calculate content increase ratio (how many times content increased).
  // With no original words the ratio is undefined, so fall back to the current
  // word count, or to 1 when both sides are empty.
  let contentIncreaseRatio;
  if (originalTokens.length > 0) {
    contentIncreaseRatio = currentTokens.length / originalTokens.length;
  } else {
    contentIncreaseRatio = currentTokens.length > 0 ? currentTokens.length : 1;
  }

  // Calculate citation readability (original word count as a percentage of the
  // current word count, capped at 100; 100 when there is no current content)
  const citationReadability = currentTokens.length > 0
    ? Math.min(100, (originalTokens.length / currentTokens.length) * 100) : 100;

  return {
    wordDiff,
    contentIncreaseRatio: Math.round(contentIncreaseRatio * 100) / 100, // Round to 2 decimal places
    citationReadability: Math.round(citationReadability),
  };
}
97
+
98
/**
 * Run calculateStats twice - once ignoring navigation/footer content and once
 * keeping it - and present both outcomes side by side.
 * @param {string} originalHTML - Initial HTML content
 * @param {string} currentHTML - Final HTML content
 * @returns {Promise<Object>} `withNavFooterIgnored` and `withoutNavFooterIgnored`,
 *   each carrying the raw stats plus a formatted `contentGain` and `missingWords`
 */
export async function calculateBothScenarioStats(originalHTML, currentHTML) {
  // Shapes one calculateStats result into the per-scenario report entry
  const toScenario = ({ wordDiff, contentIncreaseRatio, citationReadability }) => ({
    wordDiff,
    contentIncreaseRatio,
    citationReadability,
    // Ratio shown with one decimal and an "x" suffix
    contentGain: `${Math.round(contentIncreaseRatio * 10) / 10}x`,
    missingWords: wordDiff,
  });

  // Nav/footer ignored first, then the unfiltered run
  const filteredStats = await calculateStats(originalHTML, currentHTML, true);
  const unfilteredStats = await calculateStats(originalHTML, currentHTML, false);

  return {
    withNavFooterIgnored: toScenario(filteredStats),
    withoutNavFooterIgnored: toScenario(unfilteredStats),
  };
}
@@ -0,0 +1,92 @@
1
+ /*
2
+ * Copyright 2025 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
+ /**
14
+ * Browser Entry Point for Chrome Extension
15
+ * Exposes all necessary functions for Chrome extension usage
16
+ *
17
+ * This bundle excludes Node.js specific code (cheerio) and creates
18
+ * a browser-compatible version for Chrome extensions.
19
+ */
20
+
21
+ // Import only browser-compatible functions
22
+ import {
23
+ analyzeTextComparison,
24
+ calculateStats,
25
+ calculateBothScenarioStats,
26
+ stripTagsToText,
27
+ filterHtmlContent,
28
+ extractWordCount,
29
+ filterNavigationAndFooterBrowser,
30
+ tokenize,
31
+ countWords,
32
+ countLines,
33
+ diffTokens,
34
+ generateDiffReport,
35
+ hashDJB2,
36
+ pct,
37
+ formatNumberToK,
38
+ isBrowser,
39
+ } from './index.js';
40
+
41
/**
 * API surface published to the Chrome extension: every function re-exported
 * from the package index plus two descriptive string fields.
 */
const HTMLAnalyzer = {
  // Analysis entry points (matching Chrome extension API)
  analyzeTextComparison,
  calculateStats,
  calculateBothScenarioStats,

  // HTML processing
  stripTagsToText,
  filterHtmlContent,
  extractWordCount,
  filterNavigationAndFooterBrowser,

  // Text processing
  tokenize,
  countWords,
  countLines,

  // Diff engine
  diffTokens,
  generateDiffReport,

  // Utilities
  hashDJB2,
  pct,
  formatNumberToK,
  isBrowser,

  // Build metadata
  version: '1.0.0',
  buildFor: 'chrome-extension',
};

/* eslint-env browser */
/* global window, self */

/**
 * Locate the object that should receive the global `HTMLAnalyzer` binding.
 * Candidates are probed in order: `window`, `globalThis`, `self`, and finally
 * the call-site `this` (or a throwaway object when that is not set).
 * @returns {Object} The object to attach the API to
 */
function resolveGlobalObject() {
  if (typeof window !== 'undefined') {
    return window;
  }
  if (typeof globalThis !== 'undefined') {
    return globalThis;
  }
  if (typeof self !== 'undefined') {
    return self;
  }
  return this || {};
}

// Runs as soon as the bundle is evaluated so script-tag consumers can use
// the API immediately.
resolveGlobalObject().HTMLAnalyzer = HTMLAnalyzer;

// Export for ES modules
export default HTMLAnalyzer;
@@ -0,0 +1,184 @@
1
+ /*
2
+ * Copyright 2025 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
+ /**
14
+ * LCS-based diff engine for text comparison
15
+ * Provides efficient algorithms for finding differences between text content
16
+ */
17
+
18
+ import { tokenize } from './tokenizer.js';
19
+
20
/**
 * Generate LCS-based diff between two strings
 *
 * Both inputs are tokenized, a longest-common-subsequence table is built over the
 * token sequences, and the table is walked backwards to emit the operations.
 * @param {string} aStr - First string to compare
 * @param {string} bStr - Second string to compare
 * @param {string} [mode="word"] - Tokenization mode: "word" or "line"
 * @returns {Array} Array of diff operations: {type: 'same'|'add'|'del', text: string}
 */
export function diffTokens(aStr, bStr, mode = 'word') {
  const A = tokenize(aStr, mode);
  const B = tokenize(bStr, mode);

  // Map tokens to integers so the inner loops compare numbers instead of strings
  const sym = new Map();
  const mapTok = (t) => {
    if (!sym.has(t)) sym.set(t, sym.size + 1);
    return sym.get(t);
  };
  const a = A.map(mapTok);
  const b = B.map(mapTok);

  // The shorter sequence always indexes the table rows. When that means swapping
  // the inputs, "row only" tokens belong to B (additions) rather than A (deletions).
  const useTransposed = a.length > b.length;
  const rowTokens = useTransposed ? B : A;
  const colTokens = useTransposed ? A : B;
  const rowIds = useTransposed ? b : a;
  const colIds = useTransposed ? a : b;
  const rows = rowIds.length;
  const cols = colIds.length;
  const rowOnlyType = useTransposed ? 'add' : 'del';
  const colOnlyType = useTransposed ? 'del' : 'add';

  // LCS length table, built once. Typed rows start zero-filled, and the stored
  // lengths never exceed the shorter sequence's length.
  const dp = Array.from({ length: rows + 1 }, () => new Uint32Array(cols + 1));
  for (let i = 1; i <= rows; i += 1) {
    for (let j = 1; j <= cols; j += 1) {
      dp[i][j] = (rowIds[i - 1] === colIds[j - 1])
        ? dp[i - 1][j - 1] + 1
        : Math.max(dp[i - 1][j], dp[i][j - 1]);
    }
  }

  // Backtrack from the bottom-right corner; operations are collected last-to-first
  const ops = [];
  let i = rows;
  let j = cols;

  while (i > 0 && j > 0) {
    if (rowIds[i - 1] === colIds[j - 1]) {
      ops.push({ type: 'same', text: rowTokens[i - 1] });
      i -= 1;
      j -= 1;
    } else if (dp[i - 1][j] >= dp[i][j - 1]) {
      ops.push({ type: rowOnlyType, text: rowTokens[i - 1] });
      i -= 1;
    } else {
      ops.push({ type: colOnlyType, text: colTokens[j - 1] });
      j -= 1;
    }
  }

  // Whatever is left on either side has no counterpart in the other sequence
  while (i > 0) {
    ops.push({ type: rowOnlyType, text: rowTokens[i - 1] });
    i -= 1;
  }
  while (j > 0) {
    ops.push({ type: colOnlyType, text: colTokens[j - 1] });
    j -= 1;
  }

  ops.reverse();
  return ops;
}
143
+
144
/**
 * Generate comprehensive diff report with statistics
 *
 * When only one side has text, the other side is compared as an empty string so
 * the report still counts everything that was added or removed. The placeholder
 * "No text to compare" report is returned only when both sides are empty/missing.
 * @param {string} initText - Initial text (before changes)
 * @param {string} finText - Final text (after changes)
 * @param {string} [mode="word"] - Tokenization mode: "word" or "line"
 * @returns {Object} Diff report with counts and operations
 */
export function generateDiffReport(initText, finText, mode = 'word') {
  if (!initText && !finText) {
    return {
      addCount: 0,
      delCount: 0,
      sameCount: 0,
      diffOps: [],
      summary: 'No text to compare',
    };
  }

  // Normalize a missing side to '' so diffTokens always receives strings
  const ops = diffTokens(initText || '', finText || '', mode);
  let addCount = 0;
  let delCount = 0;
  let sameCount = 0;

  for (const op of ops) {
    if (op.type === 'add') {
      addCount += 1;
    } else if (op.type === 'del') {
      delCount += 1;
    } else {
      sameCount += 1;
    }
  }

  return {
    addCount,
    delCount,
    sameCount,
    diffOps: ops,
    summary: `Added: ${addCount.toLocaleString()} • Removed: ${delCount.toLocaleString()} • Same: ${sameCount.toLocaleString()} • Granularity: ${mode}`,
  };
}