@adobe/spacecat-shared-html-analyzer 1.0.6 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,17 @@
1
+ # [@adobe/spacecat-shared-html-analyzer-v1.1.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.0.7...@adobe/spacecat-shared-html-analyzer-v1.1.0) (2025-12-01)
2
+
3
+
4
+ ### Features
5
+
6
+ * added utilities for markdown diff & conversion from LLMO chrome extension ([#1184](https://github.com/adobe/spacecat-shared/issues/1184)) ([dc9867e](https://github.com/adobe/spacecat-shared/commit/dc9867ea4ac0cf9f8bd2fdc3f22ab74cd3e1f12e))
7
+
8
+ # [@adobe/spacecat-shared-html-analyzer-v1.0.7](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.0.6...@adobe/spacecat-shared-html-analyzer-v1.0.7) (2025-11-28)
9
+
10
+
11
+ ### Bug Fixes
12
+
13
+ * update to node 24 ([#1179](https://github.com/adobe/spacecat-shared/issues/1179)) ([0e60c0a](https://github.com/adobe/spacecat-shared/commit/0e60c0ab791b47662d07822f7c93009a8f7048fd))
14
+
1
15
  # [@adobe/spacecat-shared-html-analyzer-v1.0.6](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.0.5...@adobe/spacecat-shared-html-analyzer-v1.0.6) (2025-11-15)
2
16
 
3
17
 
package/package.json CHANGED
@@ -1,10 +1,10 @@
1
1
  {
2
2
  "name": "@adobe/spacecat-shared-html-analyzer",
3
- "version": "1.0.6",
3
+ "version": "1.1.0",
4
4
  "description": "Analyze HTML content visibility for AI crawlers and citations - compare static HTML vs fully rendered content",
5
5
  "type": "module",
6
6
  "engines": {
7
- "node": ">=22.0.0 <23.0.0",
7
+ "node": ">=22.0.0 <25.0.0",
8
8
  "npm": ">=10.9.0 <12.0.0"
9
9
  },
10
10
  "main": "src/index.js",
@@ -12,6 +12,7 @@
12
12
  "scripts": {
13
13
  "test": "c8 mocha",
14
14
  "lint": "eslint .",
15
+ "lint:fix": "eslint --fix .",
15
16
  "clean": "rm -rf package-lock.json node_modules",
16
17
  "build": "rollup -c",
17
18
  "build:chrome": "rollup -c && echo '✅ Chrome extension bundle ready: dist/html-analyzer.min.js'"
@@ -36,7 +37,9 @@
36
37
  "access": "public"
37
38
  },
38
39
  "dependencies": {
39
- "cheerio": "^1.0.0-rc.12"
40
+ "cheerio": "^1.0.0-rc.12",
41
+ "turndown": "^7.2.0",
42
+ "marked": "^16.2.0"
40
43
  },
41
44
  "devDependencies": {
42
45
  "@rollup/plugin-node-resolve": "^16.0.1",
package/rollup.config.js CHANGED
@@ -57,8 +57,10 @@ export default {
57
57
  }),
58
58
  ],
59
59
  external: [
60
- // Exclude cheerio from bundle - it won't work in browser anyway
60
+ // Exclude externally provided dependencies from bundle - cheerio won't work in browser; turndown and marked are expected as browser globals
61
61
  'cheerio',
62
+ 'turndown',
63
+ 'marked',
62
64
  ],
63
65
  onwarn(warning, warn) {
64
66
  // Suppress warnings about dynamic imports that we'll handle
@@ -32,10 +32,18 @@ import {
32
32
  countLines,
33
33
  diffTokens,
34
34
  generateDiffReport,
35
+ htmlToMarkdown,
36
+ markdownToHtml,
37
+ htmlToMarkdownToHtml,
38
+ diffDOMBlocks,
39
+ createMarkdownTableDiff,
40
+ generateMarkdownDiff,
41
+ htmlToRenderedMarkdown,
35
42
  hashDJB2,
36
43
  pct,
37
44
  formatNumberToK,
38
45
  isBrowser,
46
+ getGlobalObject,
39
47
  } from './index.js';
40
48
 
41
49
  // Create global object for Chrome extension
@@ -60,6 +68,17 @@ const HTMLAnalyzer = {
60
68
  diffTokens,
61
69
  generateDiffReport,
62
70
 
71
+ // Markdown conversion functions
72
+ htmlToMarkdown,
73
+ markdownToHtml,
74
+ htmlToMarkdownToHtml,
75
+
76
+ // Markdown diff functions
77
+ diffDOMBlocks,
78
+ createMarkdownTableDiff,
79
+ generateMarkdownDiff,
80
+ htmlToRenderedMarkdown,
81
+
63
82
  // Utility functions
64
83
  hashDJB2,
65
84
  pct,
@@ -73,17 +92,8 @@ const HTMLAnalyzer = {
73
92
 
74
93
  // Make available globally for Chrome extension script tags
75
94
  // This needs to be executed immediately when the bundle loads
76
- /* eslint-env browser */
77
- /* global window, self */
78
95
  (function setGlobal() {
79
- // Determine the global object (works in browser, Node.js, Web Workers)
80
- const globalObject = (function getGlobalObject() {
81
- if (typeof window !== 'undefined') return window;
82
- if (typeof globalThis !== 'undefined') return globalThis;
83
- if (typeof self !== 'undefined') return self;
84
- return this || {};
85
- }());
86
-
96
+ const globalObject = getGlobalObject();
87
97
  // Assign to global scope
88
98
  globalObject.HTMLAnalyzer = HTMLAnalyzer;
89
99
  }());
package/src/index.js CHANGED
@@ -40,9 +40,23 @@ export {
40
40
  calculateBothScenarioStats,
41
41
  } from './analyzer.js';
42
42
 
43
+ export {
44
+ htmlToMarkdown,
45
+ markdownToHtml,
46
+ htmlToMarkdownToHtml,
47
+ } from './markdown-converter.js';
48
+
49
+ export {
50
+ diffDOMBlocks,
51
+ createMarkdownTableDiff,
52
+ generateMarkdownDiff,
53
+ htmlToRenderedMarkdown,
54
+ } from './markdown-diff.js';
55
+
43
56
  export {
44
57
  hashDJB2,
45
58
  pct,
46
59
  formatNumberToK,
47
60
  isBrowser,
61
+ getGlobalObject,
48
62
  } from './utils.js';
@@ -0,0 +1,105 @@
1
+ /*
2
+ * Copyright 2025 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
+ /**
14
+ * Markdown conversion utilities
15
+ * Provides HTML to Markdown and Markdown to HTML conversions
16
+ */
17
+
18
+ import { isBrowser, getGlobalObject } from './utils.js';
19
+
20
+ // Cache for the dynamically imported modules in Node.js; each stays null until first loaded
21
+ let TurndownServiceClass = null;
22
+ let markedParser = null;
23
+
/**
 * Create a new TurndownService instance for the current runtime.
 *
 * Outside a browser the `turndown` module is imported on first use and its
 * default export is cached in `TurndownServiceClass`. In a browser the
 * constructor is read from the global object instead.
 * @private
 * @returns {Promise<Object>} A freshly constructed TurndownService
 * @throws {Error} In a browser when no global `TurndownService` is present
 */
async function getTurndownService() {
  if (!isBrowser()) {
    // Node.js path: import lazily, then reuse the cached constructor
    if (!TurndownServiceClass) {
      const { default: LoadedClass } = await import('turndown');
      TurndownServiceClass = LoadedClass;
    }
    return new TurndownServiceClass();
  }

  // Browser path: the page is expected to have loaded turndown as a global
  const { TurndownService } = getGlobalObject();
  if (!TurndownService) {
    throw new Error('TurndownService must be loaded in browser environment');
  }
  return new TurndownService();
}
45
+
/**
 * Resolve the `marked` parser for the current runtime.
 *
 * Outside a browser the `marked` module is imported on first use and its
 * `marked` export is cached in `markedParser`. In a browser the parser is
 * read from the global object instead.
 * @private
 * @returns {Promise<Object>} The marked parser
 * @throws {Error} In a browser when no global `marked` is present
 */
async function getMarked() {
  if (!isBrowser()) {
    // Node.js path: import lazily, then reuse the cached parser
    if (!markedParser) {
      ({ marked: markedParser } = await import('marked'));
    }
    return markedParser;
  }

  // Browser path: the page is expected to have loaded marked as a global
  const { marked } = getGlobalObject();
  if (marked) {
    return marked;
  }
  throw new Error('marked must be loaded in browser environment');
}
67
+
/**
 * Convert an HTML string to Markdown using Turndown.
 * @param {string} html - HTML content to convert
 * @returns {Promise<string>} Markdown content; an empty string when `html`
 *   is not a non-empty string
 */
export async function htmlToMarkdown(html) {
  const isConvertible = typeof html === 'string' && html.length > 0;
  if (!isConvertible) return '';

  const service = await getTurndownService();
  return service.turndown(html);
}
81
+
/**
 * Render a Markdown string to HTML using marked.
 * @param {string} markdown - Markdown content to convert
 * @returns {Promise<string>} HTML content; an empty string when `markdown`
 *   is not a non-empty string
 */
export async function markdownToHtml(markdown) {
  const isConvertible = typeof markdown === 'string' && markdown.length > 0;
  if (!isConvertible) return '';

  const parser = await getMarked();
  return parser.parse(markdown);
}
95
+
/**
 * Round-trip HTML through Markdown: convert the HTML to Markdown, then
 * render that Markdown back to HTML.
 * Useful for normalizing HTML through its markdown representation.
 * @param {string} html - HTML content to convert
 * @returns {Promise<string>} HTML rendered from the intermediate markdown
 */
export async function htmlToMarkdownToHtml(html) {
  return markdownToHtml(await htmlToMarkdown(html));
}
@@ -0,0 +1,247 @@
1
+ /*
2
+ * Copyright 2025 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
+ /**
14
+ * Markdown diff utilities
15
+ * Provides DOM block-level diffing for markdown content
16
+ */
17
+
18
+ import { filterHtmlContent } from './html-filter.js';
19
+ import { htmlToMarkdown, markdownToHtml } from './markdown-converter.js';
20
+
/**
 * Diff two block lists with a longest-common-subsequence (LCS) alignment.
 * Blocks are matched by their `text` only; the emitted operations carry the
 * full block objects so callers keep the HTML structure.
 * @param {Array<{html: string, text: string, tagName: string}>} originalBlocks
 * - Original DOM blocks
 * @param {Array<{html: string, text: string, tagName: string}>} currentBlocks
 * - Current DOM blocks
 * @returns {Array<{type: 'same'|'del'|'add', originalBlock?: Object,
 * currentBlock?: Object}>} Diff operations in document order
 */
export function diffDOMBlocks(originalBlocks, currentBlocks) {
  // Intern every distinct text as a small integer so the LCS compares ints
  const idByText = new Map();
  const intern = ({ text }) => {
    let id = idByText.get(text);
    if (id === undefined) {
      id = idByText.size + 1;
      idByText.set(text, id);
    }
    return id;
  };
  const left = originalBlocks.map(intern);
  const right = currentBlocks.map(intern);

  // lcs[r][c] = LCS length of the first r left ids and the first c right ids
  const rows = left.length;
  const cols = right.length;
  const lcs = Array.from({ length: rows + 1 }, () => new Array(cols + 1).fill(0));
  for (let r = 1; r <= rows; r += 1) {
    for (let c = 1; c <= cols; c += 1) {
      if (left[r - 1] === right[c - 1]) {
        lcs[r][c] = lcs[r - 1][c - 1] + 1;
      } else {
        lcs[r][c] = Math.max(lcs[r - 1][c], lcs[r][c - 1]);
      }
    }
  }

  // Walk back from the bottom-right corner; operations come out last-to-first
  const reversedOps = [];
  let r = rows;
  let c = cols;
  while (r > 0 || c > 0) {
    const bothRemain = r > 0 && c > 0;
    if (bothRemain && left[r - 1] === right[c - 1]) {
      reversedOps.push({
        type: 'same',
        originalBlock: originalBlocks[r - 1],
        currentBlock: currentBlocks[c - 1],
      });
      r -= 1;
      c -= 1;
    } else if (c === 0 || (r > 0 && lcs[r - 1][c] >= lcs[r][c - 1])) {
      // Ties favor consuming the original side first while backtracking
      reversedOps.push({ type: 'del', originalBlock: originalBlocks[r - 1] });
      r -= 1;
    } else {
      reversedOps.push({ type: 'add', currentBlock: currentBlocks[c - 1] });
      c -= 1;
    }
  }

  return reversedOps.reverse();
}
102
+
/**
 * Upper-case tag names that count as block-level content when they are
 * direct children of a list item.
 * @private
 */
const NESTED_BLOCK_TAGS = new Set([
  'P', 'DIV', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'BLOCKQUOTE', 'PRE',
]);

/**
 * Extract diffable blocks from parsed HTML, breaking lists into individual items.
 *
 * - Non-list elements become one block each (`outerHTML`, trimmed text, lower-case tag).
 * - `ul`/`ol` elements are split per `li`; list items without text are skipped
 *   because they cause alignment issues in the diff.
 * - A list item whose direct children include non-empty block elements
 *   (see NESTED_BLOCK_TAGS) yields one block per such child, re-wrapped in
 *   `<ul|ol><li>…</li></ul|ol>` for display.
 * - Otherwise the whole list item is one block, wrapped in its list tag. This
 *   also covers an item that has text but only empty nested block children,
 *   so its content is not silently dropped.
 *
 * Tag names are compared case-insensitively: `tagName` is only guaranteed to
 * be upper-case for HTML elements in HTML documents.
 * @param {Array} children - Array of child elements
 * @returns {Array<{html: string, text: string, tagName: string}>} Extracted blocks
 * @private
 */
function extractBlocks(children) {
  const blocks = [];
  children.forEach((el) => {
    const elTag = el.tagName.toUpperCase();

    // For all non-list elements, add them as-is
    if (elTag !== 'UL' && elTag !== 'OL') {
      blocks.push({
        html: el.outerHTML,
        text: el.textContent?.trim() || '',
        tagName: el.tagName.toLowerCase(),
      });
      return;
    }

    const listType = elTag.toLowerCase();
    Array.from(el.children).forEach((li) => {
      if (li.tagName?.toUpperCase() !== 'LI') return;

      // Skip empty list items - they cause alignment issues
      const liText = li.textContent?.trim() || '';
      if (!liText) return;

      // Non-empty nested block elements, paired with their trimmed text
      const nestedBlocks = Array.from(li.children)
        .filter((child) => NESTED_BLOCK_TAGS.has(child.tagName?.toUpperCase()))
        .map((child) => ({ child, text: child.textContent?.trim() || '' }))
        .filter(({ text }) => text);

      if (nestedBlocks.length === 0) {
        // Nothing usable nested inside: treat the whole li as one block,
        // wrapped in ul/ol for proper display
        blocks.push({
          html: `<${listType}>${li.outerHTML}</${listType}>`,
          text: liText,
          tagName: 'li',
        });
        return;
      }

      // Extract nested blocks individually for better matching,
      // but wrap them in li/ul for proper display
      nestedBlocks.forEach(({ child, text }) => {
        blocks.push({
          html: `<${listType}><li>${child.outerHTML}</li></${listType}>`,
          text,
          tagName: child.tagName.toLowerCase(),
        });
      });
    });
  });
  return blocks;
}
164
+
/**
 * Build the rows of a side-by-side diff table from two lists of DOM children.
 *
 * Both lists are reduced to blocks, diffed once, and every diff operation is
 * rendered as one `<tr>`: unchanged blocks on both sides, deletions on the
 * left only, additions on the right only. Block HTML is inserted as-is.
 * @param {Array} originalChildren - Array of original DOM child elements
 * @param {Array} currentChildren - Array of current DOM child elements
 * @returns {{tableHtml: string, counters: string}} Newline-joined table rows
 *   and a human-readable summary of additions/deletions
 */
export function createMarkdownTableDiff(originalChildren, currentChildren) {
  const ops = diffDOMBlocks(
    extractBlocks(originalChildren),
    extractBlocks(currentChildren),
  );

  const tally = { add: 0, del: 0 };
  const rows = [];
  ops.forEach(({ type, originalBlock, currentBlock }) => {
    switch (type) {
      case 'same':
        // Unchanged block: shown on both sides
        rows.push(`<tr><td class="diff-line-same markdown-rendered">${originalBlock.html}</td><td class="diff-line-same markdown-rendered">${currentBlock.html}</td></tr>`);
        break;
      case 'del':
        // Deleted block: left side only
        tally.del += 1;
        rows.push(`<tr><td class="diff-line-del markdown-rendered">${originalBlock.html}</td><td class="diff-line-empty"></td></tr>`);
        break;
      case 'add':
        // Added block: right side only
        tally.add += 1;
        rows.push(`<tr><td class="diff-line-empty"></td><td class="diff-line-add markdown-rendered">${currentBlock.html}</td></tr>`);
        break;
      default:
        // Unknown operation types produce no row
        break;
    }
  });

  const unchanged = tally.add === 0 && tally.del === 0;
  const counters = unchanged
    ? 'No differences'
    : `${tally.add} block additions, ${tally.del} block deletions`;

  return { tableHtml: rows.join('\n'), counters };
}
212
+
/**
 * Turn HTML into display-ready HTML by routing it through Markdown.
 *
 * The input is first passed to `filterHtmlContent` (third argument `false`),
 * the result is converted to Markdown, and the Markdown is rendered to HTML.
 * @param {string} html - HTML content to convert
 * @param {boolean} [ignoreNavFooter=true] - Whether to filter nav/footer elements
 * @returns {Promise<string>} Rendered markdown HTML
 */
export async function htmlToRenderedMarkdown(html, ignoreNavFooter = true) {
  const filtered = await filterHtmlContent(html, ignoreNavFooter, false);
  return markdownToHtml(await htmlToMarkdown(filtered));
}
227
+
/**
 * Render both sides of a comparison as markdown-derived HTML.
 * The two conversions are started together and awaited as a pair.
 * @param {string} originalHtml - Original HTML content
 * @param {string} currentHtml - Current HTML content
 * @param {boolean} [ignoreNavFooter=true] - Whether to filter nav/footer elements
 * @returns {Promise<{originalRenderedHtml: string, currentRenderedHtml: string}>}
 * Rendered markdown HTML for both sides
 */
export async function generateMarkdownDiff(originalHtml, currentHtml, ignoreNavFooter = true) {
  const conversions = [originalHtml, currentHtml]
    .map((sourceHtml) => htmlToRenderedMarkdown(sourceHtml, ignoreNavFooter));
  const [originalRenderedHtml, currentRenderedHtml] = await Promise.all(conversions);

  return { originalRenderedHtml, currentRenderedHtml };
}
package/src/utils.js CHANGED
@@ -60,3 +60,19 @@ export function formatNumberToK(num) {
export function isBrowser() {
  // Both browser globals must be defined for the runtime to count as a browser
  const hasWindow = typeof window !== 'undefined';
  const hasDocument = typeof document !== 'undefined';
  return hasWindow && hasDocument;
}
63
+
/**
 * Resolve the global object of the current runtime.
 * Candidates are probed in order: `globalThis`, `self`, `window`, `global`.
 * A new empty object is returned when none of them is defined.
 * @returns {Object} Global object
 */
export function getGlobalObject() {
  let resolved = {};
  /* eslint-disable no-undef */
  if (typeof globalThis !== 'undefined') {
    resolved = globalThis;
  } else if (typeof self !== 'undefined') {
    resolved = self;
  } else if (typeof window !== 'undefined') {
    resolved = window;
  } else if (typeof global !== 'undefined') {
    resolved = global;
  }
  /* eslint-enable no-undef */
  return resolved;
}