@adobe/spacecat-shared-html-analyzer 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.releaserc.cjs +17 -0
- package/CHANGELOG.md +33 -0
- package/README.md +1 -1
- package/package.json +2 -2
- package/rollup.config.js +16 -0
- package/src/browser-entry.js +2 -2
- package/src/html-filter.js +127 -69
package/.releaserc.cjs
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module.exports = {
|
|
2
|
+
extends: "semantic-release-monorepo",
|
|
3
|
+
plugins: [
|
|
4
|
+
"@semantic-release/commit-analyzer",
|
|
5
|
+
"@semantic-release/release-notes-generator",
|
|
6
|
+
["@semantic-release/changelog", {
|
|
7
|
+
"changelogFile": "CHANGELOG.md",
|
|
8
|
+
}],
|
|
9
|
+
"@semantic-release/npm",
|
|
10
|
+
["@semantic-release/git", {
|
|
11
|
+
"assets": ["package.json", "CHANGELOG.md"],
|
|
12
|
+
"message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}"
|
|
13
|
+
}],
|
|
14
|
+
["@semantic-release/github", {}],
|
|
15
|
+
],
|
|
16
|
+
branches: ['main'],
|
|
17
|
+
};
|
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,36 @@
|
|
|
1
|
+
# [@adobe/spacecat-shared-html-analyzer-v1.0.3](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.0.2...@adobe/spacecat-shared-html-analyzer-v1.0.3) (2025-10-29)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Bug Fixes
|
|
5
|
+
|
|
6
|
+
* version bump of html-analyzer package ([#1068](https://github.com/adobe/spacecat-shared/issues/1068)) ([9318cc5](https://github.com/adobe/spacecat-shared/commit/9318cc51d0572dfe5e659aea2d8548d0f92146bd))
|
|
7
|
+
|
|
8
|
+
# [@adobe/spacecat-shared-html-analyzer-v1.0.2](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.0.1...@adobe/spacecat-shared-html-analyzer-v1.0.2) (2025-10-29)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Bug Fixes
|
|
12
|
+
|
|
13
|
+
* json-ld inclusion, cc banner updates ([#1054](https://github.com/adobe/spacecat-shared/issues/1054)) ([9f993fe](https://github.com/adobe/spacecat-shared/commit/9f993fe3531334d6819112aa535465dcfd4ccfb1))
|
|
14
|
+
|
|
15
|
+
# [@adobe/spacecat-shared-html-analyzer-v1.0.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.0.0...@adobe/spacecat-shared-html-analyzer-v1.0.1) (2025-10-25)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
### Bug Fixes
|
|
19
|
+
|
|
20
|
+
* **deps:** update external fixes ([#1046](https://github.com/adobe/spacecat-shared/issues/1046)) ([bb6e118](https://github.com/adobe/spacecat-shared/commit/bb6e11886b323f73624fcb9e3c2b14d318aa00c9))
|
|
21
|
+
|
|
22
|
+
# @adobe/spacecat-shared-html-analyzer-v1.0.0 (2025-09-24)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
### Bug Fixes
|
|
26
|
+
|
|
27
|
+
* release package ([#984](https://github.com/adobe/spacecat-shared/issues/984)) ([15a620e](https://github.com/adobe/spacecat-shared/commit/15a620ecb276d37b14cc2b5e7ca787f7c478ca2b))
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
### Features
|
|
31
|
+
|
|
32
|
+
* package for html comparison ([#968](https://github.com/adobe/spacecat-shared/issues/968)) ([5934c0a](https://github.com/adobe/spacecat-shared/commit/5934c0a1e04b91916a823a1835e65178391e0d76))
|
|
33
|
+
|
|
1
34
|
# Changelog
|
|
2
35
|
|
|
3
36
|
All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
|
package/README.md
CHANGED
|
@@ -120,7 +120,7 @@ Generate a minified bundle for Chrome extensions:
|
|
|
120
120
|
npm run build:chrome
|
|
121
121
|
```
|
|
122
122
|
|
|
123
|
-
This creates `dist/html-analyzer.min.js` that can be included directly in Chrome extension manifest files. The bundle exposes `HTMLAnalyzer`
|
|
123
|
+
This creates `dist/html-analyzer.min.js` that can be included directly in Chrome extension manifest files. The bundle exposes `HTMLAnalyzer` globally.
|
|
124
124
|
|
|
125
125
|
## Version Information
|
|
126
126
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@adobe/spacecat-shared-html-analyzer",
|
|
3
|
-
"version": "1.0.2",
|
|
3
|
+
"version": "1.0.3",
|
|
4
4
|
"description": "Analyze HTML content visibility for AI crawlers and citations - compare static HTML vs fully rendered content",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"engines": {
|
|
@@ -41,7 +41,7 @@
|
|
|
41
41
|
"devDependencies": {
|
|
42
42
|
"@rollup/plugin-node-resolve": "^16.0.1",
|
|
43
43
|
"@rollup/plugin-terser": "^0.4.4",
|
|
44
|
-
"chai": "6.0
|
|
44
|
+
"chai": "6.2.0",
|
|
45
45
|
"chai-as-promised": "8.0.2",
|
|
46
46
|
"rollup": "^4.52.2",
|
|
47
47
|
"sinon": "21.0.0",
|
package/rollup.config.js
CHANGED
|
@@ -12,6 +12,21 @@
|
|
|
12
12
|
|
|
13
13
|
import { nodeResolve } from '@rollup/plugin-node-resolve';
|
|
14
14
|
import terser from '@rollup/plugin-terser';
|
|
15
|
+
import { readFileSync } from 'fs';
|
|
16
|
+
|
|
17
|
+
// Read package.json version
|
|
18
|
+
const pkg = JSON.parse(readFileSync('./package.json', 'utf-8'));
|
|
19
|
+
|
|
20
|
+
// Simple plugin to inject package version
|
|
21
|
+
const injectVersion = () => ({
|
|
22
|
+
name: 'inject-version',
|
|
23
|
+
transform(code, id) {
|
|
24
|
+
if (id.endsWith('browser-entry.js')) {
|
|
25
|
+
return code.replace('__PACKAGE_VERSION__', pkg.version);
|
|
26
|
+
}
|
|
27
|
+
return null;
|
|
28
|
+
},
|
|
29
|
+
});
|
|
15
30
|
|
|
16
31
|
export default {
|
|
17
32
|
input: 'src/browser-entry.js', // Special browser entry point
|
|
@@ -35,6 +50,7 @@ export default {
|
|
|
35
50
|
},
|
|
36
51
|
],
|
|
37
52
|
plugins: [
|
|
53
|
+
injectVersion(), // Inject package version
|
|
38
54
|
nodeResolve({
|
|
39
55
|
browser: true, // Use browser field in package.json
|
|
40
56
|
preferBuiltins: false, // Don't include Node.js built-ins
|
package/src/browser-entry.js
CHANGED
package/src/html-filter.js
CHANGED
|
@@ -52,6 +52,35 @@ const COOKIE_KEYWORDS = new Set([
|
|
|
52
52
|
'accept all', 'reject all', 'manage preferences',
|
|
53
53
|
]);
|
|
54
54
|
|
|
55
|
+
const COOKIE_BANNER_CLASS_SELECTORS = [
|
|
56
|
+
'.cc-banner', '.cc-grower', '.consent-banner', '.cookie-banner',
|
|
57
|
+
'.privacy-banner', '.gdpr-banner', '.cookie-consent', '.privacy-consent',
|
|
58
|
+
'.cookie-notice', '.privacy-notice', '.cookie-policy', '.privacy-policy',
|
|
59
|
+
'.cookie-bar', '.privacy-bar', '.consent-bar', '.gdpr-bar',
|
|
60
|
+
'.cookie-popup', '.privacy-popup', '.consent-popup', '.gdpr-popup',
|
|
61
|
+
'.cookie-modal', '.privacy-modal', '.consent-modal', '.gdpr-modal',
|
|
62
|
+
'.cookie-overlay', '.privacy-overlay', '.consent-overlay', '.gdpr-overlay',
|
|
63
|
+
];
|
|
64
|
+
|
|
65
|
+
const COOKIE_BANNER_ID_SELECTORS = [
|
|
66
|
+
'#cookie-banner', '#privacy-banner', '#consent-banner', '#gdpr-banner',
|
|
67
|
+
'#cookie-notice', '#privacy-notice', '#cookie-consent', '#privacy-consent',
|
|
68
|
+
'#cookie-bar', '#privacy-bar', '#consent-bar', '#gdpr-bar', '#cookiemgmt',
|
|
69
|
+
'#cookie-popup', '#privacy-popup', '#consent-popup', '#gdpr-popup',
|
|
70
|
+
'#onetrust-consent-sdk', '#onetrust-banner-sdk',
|
|
71
|
+
];
|
|
72
|
+
|
|
73
|
+
const COOKIE_BANNER_ARIA_SELECTORS = [
|
|
74
|
+
'[role="dialog"][aria-label="Consent Banner"]',
|
|
75
|
+
'[role="dialog"][aria-label*="cookie" i]',
|
|
76
|
+
'[role="dialog"][aria-label*="privacy" i]',
|
|
77
|
+
'[role="dialog"][aria-label*="consent" i]',
|
|
78
|
+
'[role="alertdialog"][aria-label*="cookie" i]',
|
|
79
|
+
'[role="alertdialog"][aria-label*="privacy" i]',
|
|
80
|
+
'[aria-describedby*="cookie" i]',
|
|
81
|
+
'[aria-describedby*="privacy" i]',
|
|
82
|
+
];
|
|
83
|
+
|
|
55
84
|
/**
|
|
56
85
|
* Validates if an element is likely a cookie banner based on text content
|
|
57
86
|
* Optimized: Set lookup + early exit for common keywords (3x faster)
|
|
@@ -73,35 +102,12 @@ function isCookieBannerElement(element) {
|
|
|
73
102
|
* Uses multiple strategies to identify genuine cookie consent banners
|
|
74
103
|
*/
|
|
75
104
|
function removeCookieBanners(element) {
|
|
76
|
-
const classBasedSelectors = [
|
|
77
|
-
'.cc-banner', '.cc-grower', '.consent-banner', '.cookie-banner',
|
|
78
|
-
'.privacy-banner', '.gdpr-banner', '.cookie-consent', '.privacy-consent',
|
|
79
|
-
'.cookie-notice', '.privacy-notice', '.cookie-policy', '.privacy-policy',
|
|
80
|
-
'.cookie-bar', '.privacy-bar', '.consent-bar', '.gdpr-bar',
|
|
81
|
-
'.cookie-popup', '.privacy-popup', '.consent-popup', '.gdpr-popup',
|
|
82
|
-
'.cookie-modal', '.privacy-modal', '.consent-modal', '.gdpr-modal',
|
|
83
|
-
'.cookie-overlay', '.privacy-overlay', '.consent-overlay', '.gdpr-overlay',
|
|
84
|
-
];
|
|
85
|
-
|
|
86
|
-
const idBasedSelectors = [
|
|
87
|
-
'#cookie-banner', '#privacy-banner', '#consent-banner', '#gdpr-banner',
|
|
88
|
-
'#cookie-notice', '#privacy-notice', '#cookie-consent', '#privacy-consent',
|
|
89
|
-
'#cookie-bar', '#privacy-bar', '#consent-bar', '#gdpr-bar',
|
|
90
|
-
'#cookie-popup', '#privacy-popup', '#consent-popup', '#gdpr-popup',
|
|
91
|
-
];
|
|
92
|
-
|
|
93
|
-
const ariaSelectors = [
|
|
94
|
-
'[role="dialog"][aria-label*="cookie" i]',
|
|
95
|
-
'[role="dialog"][aria-label*="privacy" i]',
|
|
96
|
-
'[role="dialog"][aria-label*="consent" i]',
|
|
97
|
-
'[role="alertdialog"][aria-label*="cookie" i]',
|
|
98
|
-
'[role="alertdialog"][aria-label*="privacy" i]',
|
|
99
|
-
'[aria-describedby*="cookie" i]',
|
|
100
|
-
'[aria-describedby*="privacy" i]',
|
|
101
|
-
];
|
|
102
|
-
|
|
103
105
|
// Combine all selectors
|
|
104
|
-
const allSelectors = [...classBasedSelectors, ...idBasedSelectors, ...ariaSelectors];
|
|
106
|
+
const allSelectors = [
|
|
107
|
+
...COOKIE_BANNER_CLASS_SELECTORS,
|
|
108
|
+
...COOKIE_BANNER_ID_SELECTORS,
|
|
109
|
+
...COOKIE_BANNER_ARIA_SELECTORS,
|
|
110
|
+
];
|
|
105
111
|
|
|
106
112
|
// Apply class/ID/ARIA based detection with text validation
|
|
107
113
|
allSelectors.forEach((selector) => {
|
|
@@ -132,35 +138,12 @@ export function filterNavigationAndFooterBrowser(element) {
|
|
|
132
138
|
* @param {CheerioAPI} $ - Cheerio instance
|
|
133
139
|
*/
|
|
134
140
|
function removeCookieBannersCheerio($) {
|
|
135
|
-
const classBasedSelectors = [
|
|
136
|
-
'.cc-banner', '.cc-grower', '.consent-banner', '.cookie-banner',
|
|
137
|
-
'.privacy-banner', '.gdpr-banner', '.cookie-consent', '.privacy-consent',
|
|
138
|
-
'.cookie-notice', '.privacy-notice', '.cookie-policy', '.privacy-policy',
|
|
139
|
-
'.cookie-bar', '.privacy-bar', '.consent-bar', '.gdpr-bar',
|
|
140
|
-
'.cookie-popup', '.privacy-popup', '.consent-popup', '.gdpr-popup',
|
|
141
|
-
'.cookie-modal', '.privacy-modal', '.consent-modal', '.gdpr-modal',
|
|
142
|
-
'.cookie-overlay', '.privacy-overlay', '.consent-overlay', '.gdpr-overlay',
|
|
143
|
-
];
|
|
144
|
-
|
|
145
|
-
const idBasedSelectors = [
|
|
146
|
-
'#cookie-banner', '#privacy-banner', '#consent-banner', '#gdpr-banner',
|
|
147
|
-
'#cookie-notice', '#privacy-notice', '#cookie-consent', '#privacy-consent',
|
|
148
|
-
'#cookie-bar', '#privacy-bar', '#consent-bar', '#gdpr-bar',
|
|
149
|
-
'#cookie-popup', '#privacy-popup', '#consent-popup', '#gdpr-popup',
|
|
150
|
-
];
|
|
151
|
-
|
|
152
|
-
const ariaSelectors = [
|
|
153
|
-
'[role="dialog"][aria-label*="cookie" i]',
|
|
154
|
-
'[role="dialog"][aria-label*="privacy" i]',
|
|
155
|
-
'[role="dialog"][aria-label*="consent" i]',
|
|
156
|
-
'[role="alertdialog"][aria-label*="cookie" i]',
|
|
157
|
-
'[role="alertdialog"][aria-label*="privacy" i]',
|
|
158
|
-
'[aria-describedby*="cookie" i]',
|
|
159
|
-
'[aria-describedby*="privacy" i]',
|
|
160
|
-
];
|
|
161
|
-
|
|
162
141
|
// Combine all selectors for efficient removal
|
|
163
|
-
const allSelectors = [...classBasedSelectors, ...idBasedSelectors, ...ariaSelectors];
|
|
142
|
+
const allSelectors = [
|
|
143
|
+
...COOKIE_BANNER_CLASS_SELECTORS,
|
|
144
|
+
...COOKIE_BANNER_ID_SELECTORS,
|
|
145
|
+
...COOKIE_BANNER_ARIA_SELECTORS,
|
|
146
|
+
];
|
|
164
147
|
|
|
165
148
|
// Apply class/ID/ARIA based detection with text validation
|
|
166
149
|
allSelectors.forEach((selector) => {
|
|
@@ -204,28 +187,70 @@ function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
|
|
|
204
187
|
const parser = new DOMParser(); // eslint-disable-line no-undef
|
|
205
188
|
const doc = parser.parseFromString(htmlContent, 'text/html');
|
|
206
189
|
|
|
207
|
-
//
|
|
208
|
-
const
|
|
209
|
-
|
|
210
|
-
//
|
|
211
|
-
|
|
190
|
+
// Process the entire document to capture JSON-LD in both head and body
|
|
191
|
+
const documentElement = doc.documentElement || doc;
|
|
192
|
+
|
|
193
|
+
// Remove script elements except JSON-LD, also remove style, noscript, template
|
|
194
|
+
documentElement.querySelectorAll('script').forEach((n) => {
|
|
195
|
+
// Preserve JSON-LD structured data scripts by converting them to code blocks
|
|
196
|
+
if (n.type === 'application/ld+json') {
|
|
197
|
+
const jsonContent = n.textContent || n.innerText || '';
|
|
198
|
+
if (jsonContent.trim()) {
|
|
199
|
+
try {
|
|
200
|
+
// Parse and re-stringify JSON to ensure consistent formatting
|
|
201
|
+
// Handle both single and double quoted JSON
|
|
202
|
+
const cleanJsonContent = jsonContent.trim();
|
|
203
|
+
// Try to fix common JSON issues like single quotes
|
|
204
|
+
const startsValid = cleanJsonContent.startsWith('{')
|
|
205
|
+
|| cleanJsonContent.startsWith('[');
|
|
206
|
+
const endsValid = cleanJsonContent.endsWith('}')
|
|
207
|
+
|| cleanJsonContent.endsWith(']');
|
|
208
|
+
|
|
209
|
+
if (!startsValid || !endsValid) {
|
|
210
|
+
throw new Error('Not valid JSON structure');
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
const parsedJson = JSON.parse(cleanJsonContent);
|
|
214
|
+
const formattedJson = JSON.stringify(parsedJson, null, 2);
|
|
215
|
+
|
|
216
|
+
// Create a pre/code block to preserve JSON-LD for markdown conversion
|
|
217
|
+
const codeBlock = document.createElement('pre'); // eslint-disable-line no-undef
|
|
218
|
+
const code = document.createElement('code'); // eslint-disable-line no-undef
|
|
219
|
+
code.className = 'ld-json';
|
|
220
|
+
code.textContent = formattedJson;
|
|
221
|
+
codeBlock.appendChild(code);
|
|
222
|
+
n.parentNode.insertBefore(codeBlock, n);
|
|
223
|
+
} catch (e) {
|
|
224
|
+
// If JSON parsing fails, fall back to original content
|
|
225
|
+
const codeBlock = document.createElement('pre'); // eslint-disable-line no-undef
|
|
226
|
+
const code = document.createElement('code'); // eslint-disable-line no-undef
|
|
227
|
+
code.className = 'ld-json';
|
|
228
|
+
code.textContent = jsonContent.trim();
|
|
229
|
+
codeBlock.appendChild(code);
|
|
230
|
+
n.parentNode.insertBefore(codeBlock, n);
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
n.remove();
|
|
235
|
+
});
|
|
236
|
+
documentElement.querySelectorAll('style,noscript,template').forEach((n) => n.remove());
|
|
212
237
|
|
|
213
238
|
// Remove all media elements (images, videos, audio, etc.) to keep only text
|
|
214
|
-
|
|
215
|
-
|
|
239
|
+
const mediaSelector = 'img,video,audio,picture,svg,canvas,embed,object,iframe';
|
|
240
|
+
documentElement.querySelectorAll(mediaSelector).forEach((n) => n.remove());
|
|
216
241
|
|
|
217
242
|
// Remove consent banners with intelligent detection
|
|
218
|
-
removeCookieBanners(
|
|
243
|
+
removeCookieBanners(documentElement);
|
|
219
244
|
|
|
220
245
|
// Conditionally remove navigation and footer elements
|
|
221
246
|
if (ignoreNavFooter) {
|
|
222
|
-
filterNavigationAndFooterBrowser(
|
|
247
|
+
filterNavigationAndFooterBrowser(documentElement);
|
|
223
248
|
}
|
|
224
249
|
|
|
225
250
|
if (returnText) {
|
|
226
|
-
return (
|
|
251
|
+
return (documentElement && documentElement.textContent) ? documentElement.textContent : '';
|
|
227
252
|
}
|
|
228
|
-
return
|
|
253
|
+
return documentElement.outerHTML;
|
|
229
254
|
}
|
|
230
255
|
|
|
231
256
|
/**
|
|
@@ -245,8 +270,41 @@ async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {
|
|
|
245
270
|
|
|
246
271
|
const $ = cheerio.load(htmlContent);
|
|
247
272
|
|
|
248
|
-
//
|
|
249
|
-
$('script
|
|
273
|
+
// Remove script except JSON-LD structured data, also remove style, noscript, template
|
|
274
|
+
$('script').each(function processScript() {
|
|
275
|
+
// Preserve JSON-LD structured data scripts by converting them to code blocks
|
|
276
|
+
if ($(this).attr('type') === 'application/ld+json') {
|
|
277
|
+
const jsonContent = $(this).text().trim();
|
|
278
|
+
if (jsonContent) {
|
|
279
|
+
try {
|
|
280
|
+
// Parse and re-stringify JSON to ensure consistent formatting
|
|
281
|
+
// Handle both single and double quoted JSON
|
|
282
|
+
const cleanJsonContent = jsonContent;
|
|
283
|
+
const startsValid = cleanJsonContent.startsWith('{')
|
|
284
|
+
|| cleanJsonContent.startsWith('[');
|
|
285
|
+
const endsValid = cleanJsonContent.endsWith('}')
|
|
286
|
+
|| cleanJsonContent.endsWith(']');
|
|
287
|
+
|
|
288
|
+
if (!startsValid || !endsValid) {
|
|
289
|
+
throw new Error('Not valid JSON structure');
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
const parsedJson = JSON.parse(cleanJsonContent);
|
|
293
|
+
const formattedJson = JSON.stringify(parsedJson, null, 2);
|
|
294
|
+
const codeBlock = `<pre><code class="ld-json">${formattedJson}</code></pre>`;
|
|
295
|
+
$(this).before(codeBlock);
|
|
296
|
+
} catch (e) {
|
|
297
|
+
// If JSON parsing fails, fall back to original content
|
|
298
|
+
const codeBlock = `<pre><code class="ld-json">${jsonContent}</code></pre>`;
|
|
299
|
+
$(this).before(codeBlock);
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
$(this).remove();
|
|
303
|
+
} else {
|
|
304
|
+
$(this).remove();
|
|
305
|
+
}
|
|
306
|
+
});
|
|
307
|
+
$('style, noscript, template').remove();
|
|
250
308
|
|
|
251
309
|
// Remove all media elements (images, videos, audio, etc.) to keep only text
|
|
252
310
|
$('img, video, audio, picture, svg, canvas, embed, object, iframe').remove();
|