@adobe/spacecat-shared-html-analyzer 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.releaserc.cjs ADDED
@@ -0,0 +1,17 @@
1
+ module.exports = {
2
+ extends: "semantic-release-monorepo",
3
+ plugins: [
4
+ "@semantic-release/commit-analyzer",
5
+ "@semantic-release/release-notes-generator",
6
+ ["@semantic-release/changelog", {
7
+ "changelogFile": "CHANGELOG.md",
8
+ }],
9
+ "@semantic-release/npm",
10
+ ["@semantic-release/git", {
11
+ "assets": ["package.json", "CHANGELOG.md"],
12
+ "message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}"
13
+ }],
14
+ ["@semantic-release/github", {}],
15
+ ],
16
+ branches: ['main'],
17
+ };
package/CHANGELOG.md CHANGED
@@ -1,3 +1,36 @@
1
+ # [@adobe/spacecat-shared-html-analyzer-v1.0.3](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.0.2...@adobe/spacecat-shared-html-analyzer-v1.0.3) (2025-10-29)
2
+
3
+
4
+ ### Bug Fixes
5
+
6
+ * version bump of html-analyzer package ([#1068](https://github.com/adobe/spacecat-shared/issues/1068)) ([9318cc5](https://github.com/adobe/spacecat-shared/commit/9318cc51d0572dfe5e659aea2d8548d0f92146bd))
7
+
8
+ # [@adobe/spacecat-shared-html-analyzer-v1.0.2](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.0.1...@adobe/spacecat-shared-html-analyzer-v1.0.2) (2025-10-29)
9
+
10
+
11
+ ### Bug Fixes
12
+
13
+ * json-ld inclusion, cc banner updates ([#1054](https://github.com/adobe/spacecat-shared/issues/1054)) ([9f993fe](https://github.com/adobe/spacecat-shared/commit/9f993fe3531334d6819112aa535465dcfd4ccfb1))
14
+
15
+ # [@adobe/spacecat-shared-html-analyzer-v1.0.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.0.0...@adobe/spacecat-shared-html-analyzer-v1.0.1) (2025-10-25)
16
+
17
+
18
+ ### Bug Fixes
19
+
20
+ * **deps:** update external fixes ([#1046](https://github.com/adobe/spacecat-shared/issues/1046)) ([bb6e118](https://github.com/adobe/spacecat-shared/commit/bb6e11886b323f73624fcb9e3c2b14d318aa00c9))
21
+
22
+ # @adobe/spacecat-shared-html-analyzer-v1.0.0 (2025-09-24)
23
+
24
+
25
+ ### Bug Fixes
26
+
27
+ * release package ([#984](https://github.com/adobe/spacecat-shared/issues/984)) ([15a620e](https://github.com/adobe/spacecat-shared/commit/15a620ecb276d37b14cc2b5e7ca787f7c478ca2b))
28
+
29
+
30
+ ### Features
31
+
32
+ * package for html comparison ([#968](https://github.com/adobe/spacecat-shared/issues/968)) ([5934c0a](https://github.com/adobe/spacecat-shared/commit/5934c0a1e04b91916a823a1835e65178391e0d76))
33
+
1
34
  # Changelog
2
35
 
3
36
  All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
package/README.md CHANGED
@@ -120,7 +120,7 @@ Generate a minified bundle for Chrome extensions:
120
120
  npm run build:chrome
121
121
  ```
122
122
 
123
- This creates `dist/html-analyzer.min.js` that can be included directly in Chrome extension manifest files. The bundle exposes `HTMLAnalyzer` and `HTMLComparisonUtils` globally.
123
+ This creates `dist/html-analyzer.min.js` that can be included directly in Chrome extension manifest files. The bundle exposes `HTMLAnalyzer` globally.
124
124
 
125
125
  ## Version Information
126
126
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/spacecat-shared-html-analyzer",
3
- "version": "1.0.2",
3
+ "version": "1.0.3",
4
4
  "description": "Analyze HTML content visibility for AI crawlers and citations - compare static HTML vs fully rendered content",
5
5
  "type": "module",
6
6
  "engines": {
@@ -41,7 +41,7 @@
41
41
  "devDependencies": {
42
42
  "@rollup/plugin-node-resolve": "^16.0.1",
43
43
  "@rollup/plugin-terser": "^0.4.4",
44
- "chai": "6.0.1",
44
+ "chai": "6.2.0",
45
45
  "chai-as-promised": "8.0.2",
46
46
  "rollup": "^4.52.2",
47
47
  "sinon": "21.0.0",
package/rollup.config.js CHANGED
@@ -12,6 +12,21 @@
12
12
 
13
13
  import { nodeResolve } from '@rollup/plugin-node-resolve';
14
14
  import terser from '@rollup/plugin-terser';
15
+ import { readFileSync } from 'fs';
16
+
17
+ // Read package.json version
18
+ const pkg = JSON.parse(readFileSync('./package.json', 'utf-8'));
19
+
20
+ // Simple plugin to inject package version
21
+ const injectVersion = () => ({
22
+ name: 'inject-version',
23
+ transform(code, id) {
24
+ if (id.endsWith('browser-entry.js')) {
25
+ return code.replace('__PACKAGE_VERSION__', pkg.version);
26
+ }
27
+ return null;
28
+ },
29
+ });
15
30
 
16
31
  export default {
17
32
  input: 'src/browser-entry.js', // Special browser entry point
@@ -35,6 +50,7 @@ export default {
35
50
  },
36
51
  ],
37
52
  plugins: [
53
+ injectVersion(), // Inject package version
38
54
  nodeResolve({
39
55
  browser: true, // Use browser field in package.json
40
56
  preferBuiltins: false, // Don't include Node.js built-ins
@@ -66,8 +66,8 @@ const HTMLAnalyzer = {
66
66
  formatNumberToK,
67
67
  isBrowser,
68
68
 
69
- // Version info
70
- version: '1.0.0',
69
+ // Version info (replaced during build from package.json)
70
+ version: '__PACKAGE_VERSION__',
71
71
  buildFor: 'chrome-extension',
72
72
  };
73
73
 
@@ -52,6 +52,35 @@ const COOKIE_KEYWORDS = new Set([
52
52
  'accept all', 'reject all', 'manage preferences',
53
53
  ]);
54
54
 
55
+ const COOKIE_BANNER_CLASS_SELECTORS = [
56
+ '.cc-banner', '.cc-grower', '.consent-banner', '.cookie-banner',
57
+ '.privacy-banner', '.gdpr-banner', '.cookie-consent', '.privacy-consent',
58
+ '.cookie-notice', '.privacy-notice', '.cookie-policy', '.privacy-policy',
59
+ '.cookie-bar', '.privacy-bar', '.consent-bar', '.gdpr-bar',
60
+ '.cookie-popup', '.privacy-popup', '.consent-popup', '.gdpr-popup',
61
+ '.cookie-modal', '.privacy-modal', '.consent-modal', '.gdpr-modal',
62
+ '.cookie-overlay', '.privacy-overlay', '.consent-overlay', '.gdpr-overlay',
63
+ ];
64
+
65
+ const COOKIE_BANNER_ID_SELECTORS = [
66
+ '#cookie-banner', '#privacy-banner', '#consent-banner', '#gdpr-banner',
67
+ '#cookie-notice', '#privacy-notice', '#cookie-consent', '#privacy-consent',
68
+ '#cookie-bar', '#privacy-bar', '#consent-bar', '#gdpr-bar', '#cookiemgmt',
69
+ '#cookie-popup', '#privacy-popup', '#consent-popup', '#gdpr-popup',
70
+ '#onetrust-consent-sdk', '#onetrust-banner-sdk',
71
+ ];
72
+
73
+ const COOKIE_BANNER_ARIA_SELECTORS = [
74
+ '[role="dialog"][aria-label="Consent Banner"]',
75
+ '[role="dialog"][aria-label*="cookie" i]',
76
+ '[role="dialog"][aria-label*="privacy" i]',
77
+ '[role="dialog"][aria-label*="consent" i]',
78
+ '[role="alertdialog"][aria-label*="cookie" i]',
79
+ '[role="alertdialog"][aria-label*="privacy" i]',
80
+ '[aria-describedby*="cookie" i]',
81
+ '[aria-describedby*="privacy" i]',
82
+ ];
83
+
55
84
  /**
56
85
  * Validates if an element is likely a cookie banner based on text content
57
86
  * Optimized: Set lookup + early exit for common keywords (3x faster)
@@ -73,35 +102,12 @@ function isCookieBannerElement(element) {
73
102
  * Uses multiple strategies to identify genuine cookie consent banners
74
103
  */
75
104
  function removeCookieBanners(element) {
76
- const classBasedSelectors = [
77
- '.cc-banner', '.cc-grower', '.consent-banner', '.cookie-banner',
78
- '.privacy-banner', '.gdpr-banner', '.cookie-consent', '.privacy-consent',
79
- '.cookie-notice', '.privacy-notice', '.cookie-policy', '.privacy-policy',
80
- '.cookie-bar', '.privacy-bar', '.consent-bar', '.gdpr-bar',
81
- '.cookie-popup', '.privacy-popup', '.consent-popup', '.gdpr-popup',
82
- '.cookie-modal', '.privacy-modal', '.consent-modal', '.gdpr-modal',
83
- '.cookie-overlay', '.privacy-overlay', '.consent-overlay', '.gdpr-overlay',
84
- ];
85
-
86
- const idBasedSelectors = [
87
- '#cookie-banner', '#privacy-banner', '#consent-banner', '#gdpr-banner',
88
- '#cookie-notice', '#privacy-notice', '#cookie-consent', '#privacy-consent',
89
- '#cookie-bar', '#privacy-bar', '#consent-bar', '#gdpr-bar',
90
- '#cookie-popup', '#privacy-popup', '#consent-popup', '#gdpr-popup',
91
- ];
92
-
93
- const ariaSelectors = [
94
- '[role="dialog"][aria-label*="cookie" i]',
95
- '[role="dialog"][aria-label*="privacy" i]',
96
- '[role="dialog"][aria-label*="consent" i]',
97
- '[role="alertdialog"][aria-label*="cookie" i]',
98
- '[role="alertdialog"][aria-label*="privacy" i]',
99
- '[aria-describedby*="cookie" i]',
100
- '[aria-describedby*="privacy" i]',
101
- ];
102
-
103
105
  // Combine all selectors
104
- const allSelectors = [...classBasedSelectors, ...idBasedSelectors, ...ariaSelectors];
106
+ const allSelectors = [
107
+ ...COOKIE_BANNER_CLASS_SELECTORS,
108
+ ...COOKIE_BANNER_ID_SELECTORS,
109
+ ...COOKIE_BANNER_ARIA_SELECTORS,
110
+ ];
105
111
 
106
112
  // Apply class/ID/ARIA based detection with text validation
107
113
  allSelectors.forEach((selector) => {
@@ -132,35 +138,12 @@ export function filterNavigationAndFooterBrowser(element) {
132
138
  * @param {CheerioAPI} $ - Cheerio instance
133
139
  */
134
140
  function removeCookieBannersCheerio($) {
135
- const classBasedSelectors = [
136
- '.cc-banner', '.cc-grower', '.consent-banner', '.cookie-banner',
137
- '.privacy-banner', '.gdpr-banner', '.cookie-consent', '.privacy-consent',
138
- '.cookie-notice', '.privacy-notice', '.cookie-policy', '.privacy-policy',
139
- '.cookie-bar', '.privacy-bar', '.consent-bar', '.gdpr-bar',
140
- '.cookie-popup', '.privacy-popup', '.consent-popup', '.gdpr-popup',
141
- '.cookie-modal', '.privacy-modal', '.consent-modal', '.gdpr-modal',
142
- '.cookie-overlay', '.privacy-overlay', '.consent-overlay', '.gdpr-overlay',
143
- ];
144
-
145
- const idBasedSelectors = [
146
- '#cookie-banner', '#privacy-banner', '#consent-banner', '#gdpr-banner',
147
- '#cookie-notice', '#privacy-notice', '#cookie-consent', '#privacy-consent',
148
- '#cookie-bar', '#privacy-bar', '#consent-bar', '#gdpr-bar',
149
- '#cookie-popup', '#privacy-popup', '#consent-popup', '#gdpr-popup',
150
- ];
151
-
152
- const ariaSelectors = [
153
- '[role="dialog"][aria-label*="cookie" i]',
154
- '[role="dialog"][aria-label*="privacy" i]',
155
- '[role="dialog"][aria-label*="consent" i]',
156
- '[role="alertdialog"][aria-label*="cookie" i]',
157
- '[role="alertdialog"][aria-label*="privacy" i]',
158
- '[aria-describedby*="cookie" i]',
159
- '[aria-describedby*="privacy" i]',
160
- ];
161
-
162
141
  // Combine all selectors for efficient removal
163
- const allSelectors = [...classBasedSelectors, ...idBasedSelectors, ...ariaSelectors];
142
+ const allSelectors = [
143
+ ...COOKIE_BANNER_CLASS_SELECTORS,
144
+ ...COOKIE_BANNER_ID_SELECTORS,
145
+ ...COOKIE_BANNER_ARIA_SELECTORS,
146
+ ];
164
147
 
165
148
  // Apply class/ID/ARIA based detection with text validation
166
149
  allSelectors.forEach((selector) => {
@@ -204,28 +187,70 @@ function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
204
187
  const parser = new DOMParser(); // eslint-disable-line no-undef
205
188
  const doc = parser.parseFromString(htmlContent, 'text/html');
206
189
 
207
- // Get the body element, if it doesn't exist, use the entire document
208
- const bodyElement = doc.body || doc.documentElement;
209
-
210
- // Always remove script, style, noscript, template elements
211
- bodyElement.querySelectorAll('script,style,noscript,template').forEach((n) => n.remove());
190
+ // Process the entire document to capture JSON-LD in both head and body
191
+ const documentElement = doc.documentElement || doc;
192
+
193
+ // Remove all script elements (JSON-LD ones are first copied into code blocks); style, noscript, template are removed afterwards
194
+ documentElement.querySelectorAll('script').forEach((n) => {
195
+ // Preserve JSON-LD structured data scripts by converting them to code blocks
196
+ if (n.type === 'application/ld+json') {
197
+ const jsonContent = n.textContent || n.innerText || '';
198
+ if (jsonContent.trim()) {
199
+ try {
200
+ // Parse and re-stringify JSON to ensure consistent formatting
201
+ // Handle both single and double quoted JSON
202
+ const cleanJsonContent = jsonContent.trim();
203
+ // Try to fix common JSON issues like single quotes
204
+ const startsValid = cleanJsonContent.startsWith('{')
205
+ || cleanJsonContent.startsWith('[');
206
+ const endsValid = cleanJsonContent.endsWith('}')
207
+ || cleanJsonContent.endsWith(']');
208
+
209
+ if (!startsValid || !endsValid) {
210
+ throw new Error('Not valid JSON structure');
211
+ }
212
+
213
+ const parsedJson = JSON.parse(cleanJsonContent);
214
+ const formattedJson = JSON.stringify(parsedJson, null, 2);
215
+
216
+ // Create a pre/code block to preserve JSON-LD for markdown conversion
217
+ const codeBlock = document.createElement('pre'); // eslint-disable-line no-undef
218
+ const code = document.createElement('code'); // eslint-disable-line no-undef
219
+ code.className = 'ld-json';
220
+ code.textContent = formattedJson;
221
+ codeBlock.appendChild(code);
222
+ n.parentNode.insertBefore(codeBlock, n);
223
+ } catch (e) {
224
+ // If JSON parsing fails, fall back to original content
225
+ const codeBlock = document.createElement('pre'); // eslint-disable-line no-undef
226
+ const code = document.createElement('code'); // eslint-disable-line no-undef
227
+ code.className = 'ld-json';
228
+ code.textContent = jsonContent.trim();
229
+ codeBlock.appendChild(code);
230
+ n.parentNode.insertBefore(codeBlock, n);
231
+ }
232
+ }
233
+ }
234
+ n.remove();
235
+ });
236
+ documentElement.querySelectorAll('style,noscript,template').forEach((n) => n.remove());
212
237
 
213
238
  // Remove all media elements (images, videos, audio, etc.) to keep only text
214
- bodyElement.querySelectorAll('img,video,audio,picture,svg,canvas,embed,object,iframe')
215
- .forEach((n) => n.remove());
239
+ const mediaSelector = 'img,video,audio,picture,svg,canvas,embed,object,iframe';
240
+ documentElement.querySelectorAll(mediaSelector).forEach((n) => n.remove());
216
241
 
217
242
  // Remove consent banners with intelligent detection
218
- removeCookieBanners(bodyElement);
243
+ removeCookieBanners(documentElement);
219
244
 
220
245
  // Conditionally remove navigation and footer elements
221
246
  if (ignoreNavFooter) {
222
- filterNavigationAndFooterBrowser(bodyElement);
247
+ filterNavigationAndFooterBrowser(documentElement);
223
248
  }
224
249
 
225
250
  if (returnText) {
226
- return (bodyElement && bodyElement.textContent) ? bodyElement.textContent : '';
251
+ return (documentElement && documentElement.textContent) ? documentElement.textContent : '';
227
252
  }
228
- return bodyElement.outerHTML;
253
+ return documentElement.outerHTML;
229
254
  }
230
255
 
231
256
  /**
@@ -245,8 +270,41 @@ async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {
245
270
 
246
271
  const $ = cheerio.load(htmlContent);
247
272
 
248
- // Always remove script, style, noscript, template tags
249
- $('script, style, noscript, template').remove();
273
+ // Remove all script elements (JSON-LD ones are first copied into code blocks); style, noscript, template are removed afterwards
274
+ $('script').each(function processScript() {
275
+ // Preserve JSON-LD structured data scripts by converting them to code blocks
276
+ if ($(this).attr('type') === 'application/ld+json') {
277
+ const jsonContent = $(this).text().trim();
278
+ if (jsonContent) {
279
+ try {
280
+ // Parse and re-stringify JSON to ensure consistent formatting
281
+ // Handle both single and double quoted JSON
282
+ const cleanJsonContent = jsonContent;
283
+ const startsValid = cleanJsonContent.startsWith('{')
284
+ || cleanJsonContent.startsWith('[');
285
+ const endsValid = cleanJsonContent.endsWith('}')
286
+ || cleanJsonContent.endsWith(']');
287
+
288
+ if (!startsValid || !endsValid) {
289
+ throw new Error('Not valid JSON structure');
290
+ }
291
+
292
+ const parsedJson = JSON.parse(cleanJsonContent);
293
+ const formattedJson = JSON.stringify(parsedJson, null, 2);
294
+ const codeBlock = `<pre><code class="ld-json">${formattedJson}</code></pre>`;
295
+ $(this).before(codeBlock);
296
+ } catch (e) {
297
+ // If JSON parsing fails, fall back to original content
298
+ const codeBlock = `<pre><code class="ld-json">${jsonContent}</code></pre>`;
299
+ $(this).before(codeBlock);
300
+ }
301
+ }
302
+ $(this).remove();
303
+ } else {
304
+ $(this).remove();
305
+ }
306
+ });
307
+ $('style, noscript, template').remove();
250
308
 
251
309
  // Remove all media elements (images, videos, audio, etc.) to keep only text
252
310
  $('img, video, audio, picture, svg, canvas, embed, object, iframe').remove();