@adobe/spacecat-shared-html-analyzer 1.2.1 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/package.json +1 -1
- package/src/analyzer.js +43 -11
- package/src/html-filter.js +37 -18
- package/src/index.d.ts +33 -6
- package/test/index.test.js +114 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
# [@adobe/spacecat-shared-html-analyzer-v1.2.3](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.2.2...@adobe/spacecat-shared-html-analyzer-v1.2.3) (2026-02-04)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Bug Fixes
|
|
5
|
+
|
|
6
|
+
* updated selectors for nav-footer to not exclude the breadcrumbs ([#1309](https://github.com/adobe/spacecat-shared/issues/1309)) ([2c9246c](https://github.com/adobe/spacecat-shared/commit/2c9246c295ab90ab8e2bae39fd5d11d71bba6546))
|
|
7
|
+
|
|
8
|
+
# [@adobe/spacecat-shared-html-analyzer-v1.2.2](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.2.1...@adobe/spacecat-shared-html-analyzer-v1.2.2) (2026-01-22)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Bug Fixes
|
|
12
|
+
|
|
13
|
+
* added option to include noscript tags in server-side html ([#1274](https://github.com/adobe/spacecat-shared/issues/1274)) ([f26e320](https://github.com/adobe/spacecat-shared/commit/f26e3200cc2b129237073da5c7cae1cbfb3ae4b1))
|
|
14
|
+
|
|
1
15
|
# [@adobe/spacecat-shared-html-analyzer-v1.2.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.2.0...@adobe/spacecat-shared-html-analyzer-v1.2.1) (2026-01-15)
|
|
2
16
|
|
|
3
17
|
|
package/package.json
CHANGED
package/src/analyzer.js
CHANGED
|
@@ -25,12 +25,20 @@ import { hashDJB2, pct } from './utils.js';
|
|
|
25
25
|
* @param {string} initHtml - Initial HTML content (what crawlers see)
|
|
26
26
|
* @param {string} finHtml - Final HTML content (what users see)
|
|
27
27
|
* @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer elements
|
|
28
|
+
* @param {boolean} [includeNoscriptInFinal=false] -
|
|
29
|
+
* Whether to include noscript content in final HTML
|
|
28
30
|
* @returns {Promise<Object>} Comprehensive analysis results
|
|
29
31
|
*/
|
|
30
|
-
export async function analyzeTextComparison(
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
32
|
+
export async function analyzeTextComparison(
|
|
33
|
+
initHtml,
|
|
34
|
+
finHtml,
|
|
35
|
+
ignoreNavFooter = true,
|
|
36
|
+
includeNoscriptInFinal = false,
|
|
37
|
+
) {
|
|
38
|
+
// Server-side (initial): Always includes noscript (true) - what crawlers see
|
|
39
|
+
const initTextResult = stripTagsToText(initHtml, ignoreNavFooter, true);
|
|
40
|
+
// Client-side (final): Configurable noscript inclusion - what users see
|
|
41
|
+
const finTextResult = stripTagsToText(finHtml, ignoreNavFooter, includeNoscriptInFinal);
|
|
34
42
|
|
|
35
43
|
const initText = await Promise.resolve(initTextResult);
|
|
36
44
|
const finText = await Promise.resolve(finTextResult);
|
|
@@ -61,12 +69,20 @@ export async function analyzeTextComparison(initHtml, finHtml, ignoreNavFooter =
|
|
|
61
69
|
* @param {string} originalHTML - Initial HTML content
|
|
62
70
|
* @param {string} currentHTML - Final HTML content
|
|
63
71
|
* @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer elements
|
|
72
|
+
* @param {boolean} [includeNoscriptInCurrent=false] -
|
|
73
|
+
* Whether to include noscript content in current HTML
|
|
64
74
|
* @returns {Promise<Object>} Basic statistics
|
|
65
75
|
*/
|
|
66
|
-
export async function calculateStats(
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
76
|
+
export async function calculateStats(
|
|
77
|
+
originalHTML,
|
|
78
|
+
currentHTML,
|
|
79
|
+
ignoreNavFooter = true,
|
|
80
|
+
includeNoscriptInCurrent = false,
|
|
81
|
+
) {
|
|
82
|
+
// Server-side (original): Always includes noscript (true) - what crawlers see
|
|
83
|
+
const originalTextResult = stripTagsToText(originalHTML, ignoreNavFooter, true);
|
|
84
|
+
// Client-side (current): Configurable noscript inclusion - what users see
|
|
85
|
+
const currentTextResult = stripTagsToText(currentHTML, ignoreNavFooter, includeNoscriptInCurrent);
|
|
70
86
|
|
|
71
87
|
const originalText = await Promise.resolve(originalTextResult);
|
|
72
88
|
const currentText = await Promise.resolve(currentTextResult);
|
|
@@ -103,14 +119,30 @@ export async function calculateStats(originalHTML, currentHTML, ignoreNavFooter
|
|
|
103
119
|
* Calculate stats for both nav/footer scenarios
|
|
104
120
|
* @param {string} originalHTML - Initial HTML content
|
|
105
121
|
* @param {string} currentHTML - Final HTML content
|
|
122
|
+
* @param {boolean} [includeNoscriptInCurrent=false] -
|
|
123
|
+
* Whether to include noscript content in current HTML
|
|
106
124
|
* @returns {Promise<Object>} Analysis results for both scenarios
|
|
107
125
|
*/
|
|
108
|
-
export async function calculateBothScenarioStats(
|
|
126
|
+
export async function calculateBothScenarioStats(
|
|
127
|
+
originalHTML,
|
|
128
|
+
currentHTML,
|
|
129
|
+
includeNoscriptInCurrent = false,
|
|
130
|
+
) {
|
|
109
131
|
// Calculate stats with nav/footer ignored
|
|
110
|
-
const statsIgnored = await calculateStats(
|
|
132
|
+
const statsIgnored = await calculateStats(
|
|
133
|
+
originalHTML,
|
|
134
|
+
currentHTML,
|
|
135
|
+
true,
|
|
136
|
+
includeNoscriptInCurrent,
|
|
137
|
+
);
|
|
111
138
|
|
|
112
139
|
// Calculate stats without nav/footer ignored
|
|
113
|
-
const statsNotIgnored = await calculateStats(
|
|
140
|
+
const statsNotIgnored = await calculateStats(
|
|
141
|
+
originalHTML,
|
|
142
|
+
currentHTML,
|
|
143
|
+
false,
|
|
144
|
+
includeNoscriptInCurrent,
|
|
145
|
+
);
|
|
114
146
|
return {
|
|
115
147
|
withNavFooterIgnored: {
|
|
116
148
|
wordCountBefore: statsIgnored.wordCountBefore,
|
package/src/html-filter.js
CHANGED
|
@@ -30,8 +30,6 @@ const NAVIGATION_FOOTER_SELECTOR = [
|
|
|
30
30
|
// Header/footer classes
|
|
31
31
|
'.header', '.site-header', '.page-header', '.top-header', '.header-wrapper',
|
|
32
32
|
'.footer', '.site-footer', '.page-footer', '.bottom-footer', '.footer-wrapper',
|
|
33
|
-
// Breadcrumb navigation
|
|
34
|
-
'.breadcrumb', '.breadcrumbs',
|
|
35
33
|
// Common ID selectors
|
|
36
34
|
'#nav', '#navigation', '#navbar', '#header', '#footer', '#menu', '#main-menu',
|
|
37
35
|
'#site-header', '#site-footer', '#page-header', '#page-footer',
|
|
@@ -182,16 +180,17 @@ function filterNavigationAndFooterCheerio($) {
|
|
|
182
180
|
* @param {string} htmlContent - Raw HTML content
|
|
183
181
|
* @param {boolean} ignoreNavFooter - Whether to remove navigation/footer elements
|
|
184
182
|
* @param {boolean} returnText - Whether to return text only
|
|
183
|
+
* @param {boolean} includeNoscript - Whether to include noscript elements (false excludes them)
|
|
185
184
|
* @returns {string} Filtered content
|
|
186
185
|
*/
|
|
187
|
-
function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
|
|
186
|
+
function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText, includeNoscript) {
|
|
188
187
|
const parser = new DOMParser(); // eslint-disable-line no-undef
|
|
189
188
|
const doc = parser.parseFromString(htmlContent, 'text/html');
|
|
190
189
|
|
|
191
190
|
// Process the entire document to capture JSON-LD in both head and body
|
|
192
191
|
const documentElement = doc.documentElement || doc;
|
|
193
192
|
|
|
194
|
-
// Remove script elements except JSON-LD, also remove style,
|
|
193
|
+
// Remove script elements except JSON-LD, also remove style, template
|
|
195
194
|
documentElement.querySelectorAll('script').forEach((n) => {
|
|
196
195
|
// Preserve JSON-LD structured data scripts by converting them to code blocks
|
|
197
196
|
if (n.type === 'application/ld+json') {
|
|
@@ -234,7 +233,12 @@ function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
|
|
|
234
233
|
}
|
|
235
234
|
n.remove();
|
|
236
235
|
});
|
|
237
|
-
|
|
236
|
+
|
|
237
|
+
if (includeNoscript) {
|
|
238
|
+
documentElement.querySelectorAll('style,template').forEach((n) => n.remove());
|
|
239
|
+
} else {
|
|
240
|
+
documentElement.querySelectorAll('noscript,style,template').forEach((n) => n.remove());
|
|
241
|
+
}
|
|
238
242
|
|
|
239
243
|
// Remove all media elements (images, videos, audio, etc.) to keep only text
|
|
240
244
|
const mediaSelector = 'img,video,audio,picture,svg,canvas,embed,object,iframe';
|
|
@@ -259,9 +263,10 @@ function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
|
|
|
259
263
|
* @param {string} htmlContent - Raw HTML content
|
|
260
264
|
* @param {boolean} ignoreNavFooter - Whether to remove navigation/footer elements
|
|
261
265
|
* @param {boolean} returnText - Whether to return text only
|
|
266
|
+
* @param {boolean} includeNoscript - Whether to include noscript elements (false excludes them)
|
|
262
267
|
* @returns {Promise<string>} Filtered content
|
|
263
268
|
*/
|
|
264
|
-
async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {
|
|
269
|
+
async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText, includeNoscript) {
|
|
265
270
|
let cheerio;
|
|
266
271
|
try {
|
|
267
272
|
cheerio = await import('cheerio');
|
|
@@ -305,7 +310,12 @@ async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {
|
|
|
305
310
|
$(this).remove();
|
|
306
311
|
}
|
|
307
312
|
});
|
|
308
|
-
|
|
313
|
+
|
|
314
|
+
if (includeNoscript) {
|
|
315
|
+
$('style, template').remove();
|
|
316
|
+
} else {
|
|
317
|
+
$('style, noscript, template').remove();
|
|
318
|
+
}
|
|
309
319
|
|
|
310
320
|
// Remove all media elements (images, videos, audio, etc.) to keep only text
|
|
311
321
|
$('img, video, audio, picture, svg, canvas, embed, object, iframe').remove();
|
|
@@ -330,45 +340,54 @@ async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {
|
|
|
330
340
|
/**
|
|
331
341
|
* Filter HTML content by removing unwanted elements
|
|
332
342
|
* @param {string} htmlContent - Raw HTML content
|
|
333
|
-
* @param {boolean} ignoreNavFooter - Whether to remove navigation/footer elements
|
|
334
|
-
* @param {boolean} returnText - Whether to return text only (true) or filtered HTML (false)
|
|
343
|
+
* @param {boolean} [ignoreNavFooter=true] - Whether to remove navigation/footer elements
|
|
344
|
+
* @param {boolean} [returnText=true] - Whether to return text only (true) or filtered HTML (false)
|
|
345
|
+
* @param {boolean} [includeNoscript=false] - Whether to include noscript elements
|
|
335
346
|
* @returns {string|Promise<string>} Filtered content (sync in browser, async in Node.js)
|
|
336
347
|
*/
|
|
337
|
-
export function filterHtmlContent(
|
|
348
|
+
export function filterHtmlContent(
|
|
349
|
+
htmlContent,
|
|
350
|
+
ignoreNavFooter = true,
|
|
351
|
+
returnText = true,
|
|
352
|
+
includeNoscript = false,
|
|
353
|
+
) {
|
|
338
354
|
if (!htmlContent) return '';
|
|
339
355
|
|
|
340
356
|
// Browser environment (DOMParser) - works in Chrome extensions too - SYNCHRONOUS
|
|
341
357
|
if (isBrowser()) {
|
|
342
|
-
return filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText);
|
|
358
|
+
return filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText, includeNoscript);
|
|
343
359
|
}
|
|
344
360
|
|
|
345
361
|
// Node.js environment (cheerio) - dynamic import to avoid bundling issues - ASYNCHRONOUS
|
|
346
|
-
return filterHtmlNode(htmlContent, ignoreNavFooter, returnText);
|
|
362
|
+
return filterHtmlNode(htmlContent, ignoreNavFooter, returnText, includeNoscript);
|
|
347
363
|
}
|
|
348
364
|
|
|
349
365
|
/**
|
|
350
366
|
* Strip HTML tags and return plain text
|
|
367
|
+
*
|
|
351
368
|
* @param {string} htmlContent - Raw HTML content
|
|
352
|
-
* @param {boolean} ignoreNavFooter - Whether to remove navigation/footer elements
|
|
369
|
+
* @param {boolean} [ignoreNavFooter=true] - Whether to remove navigation/footer elements
|
|
370
|
+
* @param {boolean} [includeNoscript=false] - Whether to include noscript elements
|
|
353
371
|
* @returns {string|Promise<string>} Plain text content (sync in browser, async in Node.js)
|
|
354
372
|
*/
|
|
355
|
-
export function stripTagsToText(htmlContent, ignoreNavFooter = true) {
|
|
356
|
-
return filterHtmlContent(htmlContent, ignoreNavFooter, true);
|
|
373
|
+
export function stripTagsToText(htmlContent, ignoreNavFooter = true, includeNoscript = false) {
|
|
374
|
+
return filterHtmlContent(htmlContent, ignoreNavFooter, true, includeNoscript);
|
|
357
375
|
}
|
|
358
376
|
|
|
359
377
|
/**
|
|
360
378
|
* Extract word count from HTML content
|
|
361
379
|
* @param {string} htmlContent - Raw HTML content
|
|
362
|
-
* @param {boolean} ignoreNavFooter - Whether to ignore navigation/footer
|
|
380
|
+
* @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer
|
|
381
|
+
* @param {boolean} [includeNoscript=false] - Whether to include noscript elements
|
|
363
382
|
* @returns {Object|Promise<Object>} Object with word_count property
|
|
364
383
|
* (sync in browser, async in Node.js)
|
|
365
384
|
*/
|
|
366
|
-
export function extractWordCount(htmlContent, ignoreNavFooter = true) {
|
|
385
|
+
export function extractWordCount(htmlContent, ignoreNavFooter = true, includeNoscript = false) {
|
|
367
386
|
if (!htmlContent) {
|
|
368
387
|
return { word_count: 0 };
|
|
369
388
|
}
|
|
370
389
|
|
|
371
|
-
const textContent = stripTagsToText(htmlContent, ignoreNavFooter);
|
|
390
|
+
const textContent = stripTagsToText(htmlContent, ignoreNavFooter, includeNoscript);
|
|
372
391
|
|
|
373
392
|
// Handle both sync (browser) and async (Node.js) cases
|
|
374
393
|
if (textContent && typeof textContent.then === 'function') {
|
package/src/index.d.ts
CHANGED
|
@@ -90,17 +90,30 @@ export function generateDiffReport(initText: string, finText: string, mode?: "wo
|
|
|
90
90
|
/**
|
|
91
91
|
* Filter HTML content by removing unwanted elements
|
|
92
92
|
*/
|
|
93
|
-
export function filterHtmlContent(
|
|
93
|
+
export function filterHtmlContent(
|
|
94
|
+
htmlContent: string,
|
|
95
|
+
ignoreNavFooter?: boolean,
|
|
96
|
+
returnText?: boolean,
|
|
97
|
+
includeNoscript?: boolean
|
|
98
|
+
): Promise<string>;
|
|
94
99
|
|
|
95
100
|
/**
|
|
96
101
|
* Extract plain text from HTML content
|
|
97
102
|
*/
|
|
98
|
-
export function stripTagsToText(
|
|
103
|
+
export function stripTagsToText(
|
|
104
|
+
htmlContent: string,
|
|
105
|
+
ignoreNavFooter?: boolean,
|
|
106
|
+
includeNoscript?: boolean
|
|
107
|
+
): Promise<string>;
|
|
99
108
|
|
|
100
109
|
/**
|
|
101
110
|
* Extract word count from HTML content
|
|
102
111
|
*/
|
|
103
|
-
export function extractWordCount(
|
|
112
|
+
export function extractWordCount(
|
|
113
|
+
htmlContent: string,
|
|
114
|
+
ignoreNavFooter?: boolean,
|
|
115
|
+
includeNoscript?: boolean
|
|
116
|
+
): Promise<{ word_count: number }>;
|
|
104
117
|
|
|
105
118
|
/**
|
|
106
119
|
* Remove navigation and footer elements from DOM element (browser environment)
|
|
@@ -150,28 +163,42 @@ interface BothScenariosStats {
|
|
|
150
163
|
|
|
151
164
|
/**
|
|
152
165
|
* Comprehensive text-only analysis between initial and final HTML (original chrome extension logic)
|
|
166
|
+
* @param initHtml - Initial HTML content (what crawlers/bots see - server-side rendered)
|
|
167
|
+
* @param finHtml - Final HTML content (what users see - client-side rendered)
|
|
168
|
+
* @param ignoreNavFooter - Whether to ignore navigation/footer elements
|
|
169
|
+
* @param includeNoscriptInFinal - Whether to include noscript content in final HTML (client-side)
|
|
153
170
|
*/
|
|
154
171
|
export function analyzeTextComparison(
|
|
155
172
|
initHtml: string,
|
|
156
173
|
finHtml: string,
|
|
157
|
-
ignoreNavFooter?: boolean
|
|
174
|
+
ignoreNavFooter?: boolean,
|
|
175
|
+
includeNoscriptInFinal?: boolean
|
|
158
176
|
): Promise<TextComparison>;
|
|
159
177
|
|
|
160
178
|
/**
|
|
161
179
|
* Calculate basic stats from HTML comparison (original chrome extension logic)
|
|
180
|
+
* @param originalHTML - Initial HTML content (server-side)
|
|
181
|
+
* @param currentHTML - Final HTML content (client-side)
|
|
182
|
+
* @param ignoreNavFooter - Whether to ignore navigation/footer elements
|
|
183
|
+
* @param includeNoscriptInCurrent - Whether to include noscript content in current HTML (client-side)
|
|
162
184
|
*/
|
|
163
185
|
export function calculateStats(
|
|
164
186
|
originalHTML: string,
|
|
165
187
|
currentHTML: string,
|
|
166
|
-
ignoreNavFooter?: boolean
|
|
188
|
+
ignoreNavFooter?: boolean,
|
|
189
|
+
includeNoscriptInCurrent?: boolean
|
|
167
190
|
): Promise<BasicStats>;
|
|
168
191
|
|
|
169
192
|
/**
|
|
170
193
|
* Calculate stats for both nav/footer scenarios (original chrome extension logic)
|
|
194
|
+
* @param originalHTML - Initial HTML content (server-side)
|
|
195
|
+
* @param currentHTML - Final HTML content (client-side)
|
|
196
|
+
* @param includeNoscriptInCurrent - Whether to include noscript content in current HTML (client-side)
|
|
171
197
|
*/
|
|
172
198
|
export function calculateBothScenarioStats(
|
|
173
199
|
originalHTML: string,
|
|
174
|
-
currentHTML: string
|
|
200
|
+
currentHTML: string,
|
|
201
|
+
includeNoscriptInCurrent?: boolean
|
|
175
202
|
): Promise<BothScenariosStats>;
|
|
176
203
|
|
|
177
204
|
/** MARKDOWN DIFF FUNCTIONS */
|
package/test/index.test.js
CHANGED
|
@@ -46,6 +46,35 @@ describe('HTML Visibility Analyzer', () => {
|
|
|
46
46
|
expect(result.initialText).to.equal('');
|
|
47
47
|
expect(result.finalText.length).to.be.greaterThan(0);
|
|
48
48
|
});
|
|
49
|
+
|
|
50
|
+
it('should include noscript in initial HTML and exclude in final HTML by default', async () => {
|
|
51
|
+
const initHtml = '<html><body><h1>Title</h1><noscript>Enable JS</noscript><p>Content</p></body></html>';
|
|
52
|
+
const finHtml = '<html><body><h1>Title</h1><noscript>Enable JS</noscript><p>Content</p><div>Extra</div></body></html>';
|
|
53
|
+
const result = await analyzeTextComparison(initHtml, finHtml);
|
|
54
|
+
|
|
55
|
+
// Initial text should include noscript content
|
|
56
|
+
expect(result.initialText).to.include('Enable JS');
|
|
57
|
+
// Final text should NOT include noscript content by default
|
|
58
|
+
expect(result.finalText).to.not.include('Enable JS');
|
|
59
|
+
// Both should have the main content
|
|
60
|
+
expect(result.initialText).to.include('Title');
|
|
61
|
+
expect(result.finalText).to.include('Title');
|
|
62
|
+
});
|
|
63
|
+
|
|
64
|
+
it('should include noscript in final HTML when includeNoscriptInFinal is true', async () => {
|
|
65
|
+
const initHtml = '<html><body><h1>Title</h1><noscript>Enable JS</noscript><p>Content</p></body></html>';
|
|
66
|
+
const finHtml = '<html><body><h1>Title</h1><noscript>Enable JS</noscript><p>Content</p><div>Extra</div></body></html>';
|
|
67
|
+
const result = await analyzeTextComparison(initHtml, finHtml, true, true);
|
|
68
|
+
|
|
69
|
+
// Initial text should include noscript content
|
|
70
|
+
expect(result.initialText).to.include('Enable JS');
|
|
71
|
+
// Final text should ALSO include noscript content when flag is true
|
|
72
|
+
expect(result.finalText).to.include('Enable JS');
|
|
73
|
+
// Both should have the main content
|
|
74
|
+
expect(result.initialText).to.include('Title');
|
|
75
|
+
expect(result.finalText).to.include('Title');
|
|
76
|
+
expect(result.finalText).to.include('Extra');
|
|
77
|
+
});
|
|
49
78
|
});
|
|
50
79
|
|
|
51
80
|
describe('calculateStats', () => {
|
|
@@ -64,6 +93,41 @@ describe('HTML Visibility Analyzer', () => {
|
|
|
64
93
|
expect(result.contentIncreaseRatio).to.be.a('number');
|
|
65
94
|
expect(result.citationReadability).to.be.a('number');
|
|
66
95
|
});
|
|
96
|
+
|
|
97
|
+
it('should handle noscript elements correctly in word counts by default', async () => {
|
|
98
|
+
const originalHtml = '<html><body><h1>Title</h1><noscript>Enable JavaScript</noscript><p>Original content</p></body></html>';
|
|
99
|
+
const currentHtml = '<html><body><h1>Title</h1><noscript>Enable JavaScript</noscript><p>Original content</p><p>New content</p></body></html>';
|
|
100
|
+
const result = await calculateStats(originalHtml, currentHtml);
|
|
101
|
+
|
|
102
|
+
// Word counts should reflect the includeNoscript behavior
|
|
103
|
+
// originalText includes noscript (includeNoscript=true):
|
|
104
|
+
// "Title Enable JavaScript Original content"
|
|
105
|
+
// currentText excludes noscript (includeNoscript=false):
|
|
106
|
+
// "Title Original content New content"
|
|
107
|
+
expect(result.wordCountBefore).to.be.greaterThan(0);
|
|
108
|
+
expect(result.wordCountAfter).to.be.greaterThan(0);
|
|
109
|
+
expect(result.contentIncreaseRatio).to.be.a('number');
|
|
110
|
+
});
|
|
111
|
+
|
|
112
|
+
it('should include noscript in current HTML when includeNoscriptInCurrent is true', async () => {
|
|
113
|
+
const originalHtml = '<html><body><h1>Title</h1><noscript>Enable JavaScript</noscript><p>Original content</p></body></html>';
|
|
114
|
+
const currentHtml = '<html><body><h1>Title</h1><noscript>Enable JavaScript</noscript><p>Original content</p><p>New content</p></body></html>';
|
|
115
|
+
const resultWithout = await calculateStats(originalHtml, currentHtml, true, false);
|
|
116
|
+
const resultWith = await calculateStats(originalHtml, currentHtml, true, true);
|
|
117
|
+
|
|
118
|
+
// When noscript is excluded from current, word count should be lower
|
|
119
|
+
expect(resultWithout.wordCountAfter).to.be.lessThan(resultWith.wordCountAfter);
|
|
120
|
+
|
|
121
|
+
// Note: Text extraction concatenates without spaces, so words merge
|
|
122
|
+
// originalHtml with noscript: "TitleEnable JavaScriptOriginal content" = 3 words
|
|
123
|
+
// originalHtml without noscript: "TitleOriginal content" = 2 words
|
|
124
|
+
// currentHtml without noscript: "TitleOriginal contentNew content" = 3 words
|
|
125
|
+
// currentHtml with noscript: "TitleEnable JavaScriptOriginal contentNew content" = 4 words
|
|
126
|
+
expect(resultWithout.wordCountBefore).to.equal(3);
|
|
127
|
+
expect(resultWithout.wordCountAfter).to.equal(3);
|
|
128
|
+
expect(resultWith.wordCountBefore).to.equal(3);
|
|
129
|
+
expect(resultWith.wordCountAfter).to.equal(4);
|
|
130
|
+
});
|
|
67
131
|
});
|
|
68
132
|
|
|
69
133
|
describe('calculateBothScenarioStats', () => {
|
|
@@ -118,5 +182,55 @@ describe('HTML Visibility Analyzer', () => {
|
|
|
118
182
|
expect(text).to.include('Navigation');
|
|
119
183
|
expect(text).to.include('Footer');
|
|
120
184
|
});
|
|
185
|
+
|
|
186
|
+
it('should remove noscript elements by default', async () => {
|
|
187
|
+
const html = '<html><body><h1>Title</h1><noscript>Please enable JavaScript</noscript><p>Content</p></body></html>';
|
|
188
|
+
const text = await stripTagsToText(html);
|
|
189
|
+
|
|
190
|
+
expect(text).to.include('Title');
|
|
191
|
+
expect(text).to.include('Content');
|
|
192
|
+
expect(text).to.not.include('Please enable JavaScript');
|
|
193
|
+
expect(text).to.not.include('noscript');
|
|
194
|
+
});
|
|
195
|
+
|
|
196
|
+
it('should remove noscript elements when includeNoscript is false', async () => {
|
|
197
|
+
const html = '<html><body><h1>Title</h1><noscript>Noscript content</noscript><p>Regular content</p></body></html>';
|
|
198
|
+
const text = await stripTagsToText(html, true, false);
|
|
199
|
+
|
|
200
|
+
expect(text).to.include('Title');
|
|
201
|
+
expect(text).to.include('Regular content');
|
|
202
|
+
expect(text).to.not.include('Noscript content');
|
|
203
|
+
});
|
|
204
|
+
|
|
205
|
+
it('should keep noscript elements when includeNoscript is true', async () => {
|
|
206
|
+
const html = '<html><body><h1>Title</h1><noscript>Noscript fallback</noscript><p>Regular content</p></body></html>';
|
|
207
|
+
const text = await stripTagsToText(html, true, true);
|
|
208
|
+
|
|
209
|
+
expect(text).to.include('Title');
|
|
210
|
+
expect(text).to.include('Regular content');
|
|
211
|
+
expect(text).to.include('Noscript fallback');
|
|
212
|
+
});
|
|
213
|
+
|
|
214
|
+
it('should handle multiple noscript elements with includeNoscript', async () => {
|
|
215
|
+
const html = `<html><body>
|
|
216
|
+
<h1>Title</h1>
|
|
217
|
+
<noscript>First noscript</noscript>
|
|
218
|
+
<p>Content</p>
|
|
219
|
+
<noscript>Second noscript</noscript>
|
|
220
|
+
</body></html>`;
|
|
221
|
+
|
|
222
|
+
const textWithout = await stripTagsToText(html, true, false);
|
|
223
|
+
const textWith = await stripTagsToText(html, true, true);
|
|
224
|
+
|
|
225
|
+
expect(textWithout).to.include('Title');
|
|
226
|
+
expect(textWithout).to.include('Content');
|
|
227
|
+
expect(textWithout).to.not.include('First noscript');
|
|
228
|
+
expect(textWithout).to.not.include('Second noscript');
|
|
229
|
+
|
|
230
|
+
expect(textWith).to.include('Title');
|
|
231
|
+
expect(textWith).to.include('Content');
|
|
232
|
+
expect(textWith).to.include('First noscript');
|
|
233
|
+
expect(textWith).to.include('Second noscript');
|
|
234
|
+
});
|
|
121
235
|
});
|
|
122
236
|
});
|