@adobe/spacecat-shared-html-analyzer 1.2.0 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/package.json +1 -1
- package/src/analyzer.js +43 -11
- package/src/html-filter.js +37 -16
- package/src/index.d.ts +80 -6
- package/test/index.test.js +114 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
# [@adobe/spacecat-shared-html-analyzer-v1.2.2](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.2.1...@adobe/spacecat-shared-html-analyzer-v1.2.2) (2026-01-22)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Bug Fixes
|
|
5
|
+
|
|
6
|
+
* added option to include noscript tags in server-side html ([#1274](https://github.com/adobe/spacecat-shared/issues/1274)) ([f26e320](https://github.com/adobe/spacecat-shared/commit/f26e3200cc2b129237073da5c7cae1cbfb3ae4b1))
|
|
7
|
+
|
|
8
|
+
# [@adobe/spacecat-shared-html-analyzer-v1.2.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.2.0...@adobe/spacecat-shared-html-analyzer-v1.2.1) (2026-01-15)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Bug Fixes
|
|
12
|
+
|
|
13
|
+
* html-analyser pkg adding exported methods in typescript file ([#1265](https://github.com/adobe/spacecat-shared/issues/1265)) ([10d173b](https://github.com/adobe/spacecat-shared/commit/10d173b68c3c4158a49465a3f3d7a78b68dccc3b))
|
|
14
|
+
|
|
1
15
|
# [@adobe/spacecat-shared-html-analyzer-v1.2.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.1.0...@adobe/spacecat-shared-html-analyzer-v1.2.0) (2025-12-04)
|
|
2
16
|
|
|
3
17
|
|
package/package.json
CHANGED
package/src/analyzer.js
CHANGED
|
@@ -25,12 +25,20 @@ import { hashDJB2, pct } from './utils.js';
|
|
|
25
25
|
* @param {string} initHtml - Initial HTML content (what crawlers see)
|
|
26
26
|
* @param {string} finHtml - Final HTML content (what users see)
|
|
27
27
|
* @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer elements
|
|
28
|
+
* @param {boolean} [includeNoscriptInFinal=false] -
|
|
29
|
+
* Whether to include noscript content in final HTML
|
|
28
30
|
* @returns {Promise<Object>} Comprehensive analysis results
|
|
29
31
|
*/
|
|
30
|
-
export async function analyzeTextComparison(
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
32
|
+
export async function analyzeTextComparison(
|
|
33
|
+
initHtml,
|
|
34
|
+
finHtml,
|
|
35
|
+
ignoreNavFooter = true,
|
|
36
|
+
includeNoscriptInFinal = false,
|
|
37
|
+
) {
|
|
38
|
+
// Server-side (initial): Always includes noscript (true) - what crawlers see
|
|
39
|
+
const initTextResult = stripTagsToText(initHtml, ignoreNavFooter, true);
|
|
40
|
+
// Client-side (final): Configurable noscript inclusion - what users see
|
|
41
|
+
const finTextResult = stripTagsToText(finHtml, ignoreNavFooter, includeNoscriptInFinal);
|
|
34
42
|
|
|
35
43
|
const initText = await Promise.resolve(initTextResult);
|
|
36
44
|
const finText = await Promise.resolve(finTextResult);
|
|
@@ -61,12 +69,20 @@ export async function analyzeTextComparison(initHtml, finHtml, ignoreNavFooter =
|
|
|
61
69
|
* @param {string} originalHTML - Initial HTML content
|
|
62
70
|
* @param {string} currentHTML - Final HTML content
|
|
63
71
|
* @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer elements
|
|
72
|
+
* @param {boolean} [includeNoscriptInCurrent=false] -
|
|
73
|
+
* Whether to include noscript content in current HTML
|
|
64
74
|
* @returns {Promise<Object>} Basic statistics
|
|
65
75
|
*/
|
|
66
|
-
export async function calculateStats(
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
76
|
+
export async function calculateStats(
|
|
77
|
+
originalHTML,
|
|
78
|
+
currentHTML,
|
|
79
|
+
ignoreNavFooter = true,
|
|
80
|
+
includeNoscriptInCurrent = false,
|
|
81
|
+
) {
|
|
82
|
+
// Server-side (original): Always includes noscript (true) - what crawlers see
|
|
83
|
+
const originalTextResult = stripTagsToText(originalHTML, ignoreNavFooter, true);
|
|
84
|
+
// Client-side (current): Configurable noscript inclusion - what users see
|
|
85
|
+
const currentTextResult = stripTagsToText(currentHTML, ignoreNavFooter, includeNoscriptInCurrent);
|
|
70
86
|
|
|
71
87
|
const originalText = await Promise.resolve(originalTextResult);
|
|
72
88
|
const currentText = await Promise.resolve(currentTextResult);
|
|
@@ -103,14 +119,30 @@ export async function calculateStats(originalHTML, currentHTML, ignoreNavFooter
|
|
|
103
119
|
* Calculate stats for both nav/footer scenarios
|
|
104
120
|
* @param {string} originalHTML - Initial HTML content
|
|
105
121
|
* @param {string} currentHTML - Final HTML content
|
|
122
|
+
* @param {boolean} [includeNoscriptInCurrent=false] -
|
|
123
|
+
* Whether to include noscript content in current HTML
|
|
106
124
|
* @returns {Promise<Object>} Analysis results for both scenarios
|
|
107
125
|
*/
|
|
108
|
-
export async function calculateBothScenarioStats(
|
|
126
|
+
export async function calculateBothScenarioStats(
|
|
127
|
+
originalHTML,
|
|
128
|
+
currentHTML,
|
|
129
|
+
includeNoscriptInCurrent = false,
|
|
130
|
+
) {
|
|
109
131
|
// Calculate stats with nav/footer ignored
|
|
110
|
-
const statsIgnored = await calculateStats(
|
|
132
|
+
const statsIgnored = await calculateStats(
|
|
133
|
+
originalHTML,
|
|
134
|
+
currentHTML,
|
|
135
|
+
true,
|
|
136
|
+
includeNoscriptInCurrent,
|
|
137
|
+
);
|
|
111
138
|
|
|
112
139
|
// Calculate stats without nav/footer ignored
|
|
113
|
-
const statsNotIgnored = await calculateStats(
|
|
140
|
+
const statsNotIgnored = await calculateStats(
|
|
141
|
+
originalHTML,
|
|
142
|
+
currentHTML,
|
|
143
|
+
false,
|
|
144
|
+
includeNoscriptInCurrent,
|
|
145
|
+
);
|
|
114
146
|
return {
|
|
115
147
|
withNavFooterIgnored: {
|
|
116
148
|
wordCountBefore: statsIgnored.wordCountBefore,
|
package/src/html-filter.js
CHANGED
|
@@ -182,16 +182,17 @@ function filterNavigationAndFooterCheerio($) {
|
|
|
182
182
|
* @param {string} htmlContent - Raw HTML content
|
|
183
183
|
* @param {boolean} ignoreNavFooter - Whether to remove navigation/footer elements
|
|
184
184
|
* @param {boolean} returnText - Whether to return text only
|
|
185
|
+
* @param {boolean} includeNoscript - Whether to include noscript elements (false excludes them)
|
|
185
186
|
* @returns {string} Filtered content
|
|
186
187
|
*/
|
|
187
|
-
function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
|
|
188
|
+
function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText, includeNoscript) {
|
|
188
189
|
const parser = new DOMParser(); // eslint-disable-line no-undef
|
|
189
190
|
const doc = parser.parseFromString(htmlContent, 'text/html');
|
|
190
191
|
|
|
191
192
|
// Process the entire document to capture JSON-LD in both head and body
|
|
192
193
|
const documentElement = doc.documentElement || doc;
|
|
193
194
|
|
|
194
|
-
// Remove script elements except JSON-LD, also remove style,
|
|
195
|
+
// Remove script elements except JSON-LD; style, template and (unless includeNoscript is set) noscript are removed further below
|
|
195
196
|
documentElement.querySelectorAll('script').forEach((n) => {
|
|
196
197
|
// Preserve JSON-LD structured data scripts by converting them to code blocks
|
|
197
198
|
if (n.type === 'application/ld+json') {
|
|
@@ -234,7 +235,12 @@ function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
|
|
|
234
235
|
}
|
|
235
236
|
n.remove();
|
|
236
237
|
});
|
|
237
|
-
|
|
238
|
+
|
|
239
|
+
if (includeNoscript) {
|
|
240
|
+
documentElement.querySelectorAll('style,template').forEach((n) => n.remove());
|
|
241
|
+
} else {
|
|
242
|
+
documentElement.querySelectorAll('noscript,style,template').forEach((n) => n.remove());
|
|
243
|
+
}
|
|
238
244
|
|
|
239
245
|
// Remove all media elements (images, videos, audio, etc.) to keep only text
|
|
240
246
|
const mediaSelector = 'img,video,audio,picture,svg,canvas,embed,object,iframe';
|
|
@@ -259,9 +265,10 @@ function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
|
|
|
259
265
|
* @param {string} htmlContent - Raw HTML content
|
|
260
266
|
* @param {boolean} ignoreNavFooter - Whether to remove navigation/footer elements
|
|
261
267
|
* @param {boolean} returnText - Whether to return text only
|
|
268
|
+
* @param {boolean} includeNoscript - Whether to include noscript elements (false excludes them)
|
|
262
269
|
* @returns {Promise<string>} Filtered content
|
|
263
270
|
*/
|
|
264
|
-
async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {
|
|
271
|
+
async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText, includeNoscript) {
|
|
265
272
|
let cheerio;
|
|
266
273
|
try {
|
|
267
274
|
cheerio = await import('cheerio');
|
|
@@ -305,7 +312,12 @@ async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {
|
|
|
305
312
|
$(this).remove();
|
|
306
313
|
}
|
|
307
314
|
});
|
|
308
|
-
|
|
315
|
+
|
|
316
|
+
if (includeNoscript) {
|
|
317
|
+
$('style, template').remove();
|
|
318
|
+
} else {
|
|
319
|
+
$('style, noscript, template').remove();
|
|
320
|
+
}
|
|
309
321
|
|
|
310
322
|
// Remove all media elements (images, videos, audio, etc.) to keep only text
|
|
311
323
|
$('img, video, audio, picture, svg, canvas, embed, object, iframe').remove();
|
|
@@ -330,45 +342,54 @@ async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {
|
|
|
330
342
|
/**
|
|
331
343
|
* Filter HTML content by removing unwanted elements
|
|
332
344
|
* @param {string} htmlContent - Raw HTML content
|
|
333
|
-
* @param {boolean} ignoreNavFooter - Whether to remove navigation/footer elements
|
|
334
|
-
* @param {boolean} returnText - Whether to return text only (true) or filtered HTML (false)
|
|
345
|
+
* @param {boolean} [ignoreNavFooter=true] - Whether to remove navigation/footer elements
|
|
346
|
+
* @param {boolean} [returnText=true] - Whether to return text only (true) or filtered HTML (false)
|
|
347
|
+
* @param {boolean} [includeNoscript=false] - Whether to include noscript elements
|
|
335
348
|
* @returns {string|Promise<string>} Filtered content (sync in browser, async in Node.js)
|
|
336
349
|
*/
|
|
337
|
-
export function filterHtmlContent(
|
|
350
|
+
export function filterHtmlContent(
|
|
351
|
+
htmlContent,
|
|
352
|
+
ignoreNavFooter = true,
|
|
353
|
+
returnText = true,
|
|
354
|
+
includeNoscript = false,
|
|
355
|
+
) {
|
|
338
356
|
if (!htmlContent) return '';
|
|
339
357
|
|
|
340
358
|
// Browser environment (DOMParser) - works in Chrome extensions too - SYNCHRONOUS
|
|
341
359
|
if (isBrowser()) {
|
|
342
|
-
return filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText);
|
|
360
|
+
return filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText, includeNoscript);
|
|
343
361
|
}
|
|
344
362
|
|
|
345
363
|
// Node.js environment (cheerio) - dynamic import to avoid bundling issues - ASYNCHRONOUS
|
|
346
|
-
return filterHtmlNode(htmlContent, ignoreNavFooter, returnText);
|
|
364
|
+
return filterHtmlNode(htmlContent, ignoreNavFooter, returnText, includeNoscript);
|
|
347
365
|
}
|
|
348
366
|
|
|
349
367
|
/**
|
|
350
368
|
* Strip HTML tags and return plain text
|
|
369
|
+
*
|
|
351
370
|
* @param {string} htmlContent - Raw HTML content
|
|
352
|
-
* @param {boolean} ignoreNavFooter - Whether to remove navigation/footer elements
|
|
371
|
+
* @param {boolean} [ignoreNavFooter=true] - Whether to remove navigation/footer elements
|
|
372
|
+
* @param {boolean} [includeNoscript=false] - Whether to include noscript elements
|
|
353
373
|
* @returns {string|Promise<string>} Plain text content (sync in browser, async in Node.js)
|
|
354
374
|
*/
|
|
355
|
-
export function stripTagsToText(htmlContent, ignoreNavFooter = true) {
|
|
356
|
-
return filterHtmlContent(htmlContent, ignoreNavFooter, true);
|
|
375
|
+
export function stripTagsToText(htmlContent, ignoreNavFooter = true, includeNoscript = false) {
|
|
376
|
+
return filterHtmlContent(htmlContent, ignoreNavFooter, true, includeNoscript);
|
|
357
377
|
}
|
|
358
378
|
|
|
359
379
|
/**
|
|
360
380
|
* Extract word count from HTML content
|
|
361
381
|
* @param {string} htmlContent - Raw HTML content
|
|
362
|
-
* @param {boolean} ignoreNavFooter - Whether to ignore navigation/footer
|
|
382
|
+
* @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer
|
|
383
|
+
* @param {boolean} [includeNoscript=false] - Whether to include noscript elements
|
|
363
384
|
* @returns {Object|Promise<Object>} Object with word_count property
|
|
364
385
|
* (sync in browser, async in Node.js)
|
|
365
386
|
*/
|
|
366
|
-
export function extractWordCount(htmlContent, ignoreNavFooter = true) {
|
|
387
|
+
export function extractWordCount(htmlContent, ignoreNavFooter = true, includeNoscript = false) {
|
|
367
388
|
if (!htmlContent) {
|
|
368
389
|
return { word_count: 0 };
|
|
369
390
|
}
|
|
370
391
|
|
|
371
|
-
const textContent = stripTagsToText(htmlContent, ignoreNavFooter);
|
|
392
|
+
const textContent = stripTagsToText(htmlContent, ignoreNavFooter, includeNoscript);
|
|
372
393
|
|
|
373
394
|
// Handle both sync (browser) and async (Node.js) cases
|
|
374
395
|
if (textContent && typeof textContent.then === 'function') {
|
package/src/index.d.ts
CHANGED
|
@@ -90,17 +90,30 @@ export function generateDiffReport(initText: string, finText: string, mode?: "wo
|
|
|
90
90
|
/**
|
|
91
91
|
* Filter HTML content by removing unwanted elements
|
|
92
92
|
*/
|
|
93
|
-
export function filterHtmlContent(
|
|
93
|
+
export function filterHtmlContent(
|
|
94
|
+
htmlContent: string,
|
|
95
|
+
ignoreNavFooter?: boolean,
|
|
96
|
+
returnText?: boolean,
|
|
97
|
+
includeNoscript?: boolean
|
|
98
|
+
): Promise<string>;
|
|
94
99
|
|
|
95
100
|
/**
|
|
96
101
|
* Extract plain text from HTML content
|
|
97
102
|
*/
|
|
98
|
-
export function stripTagsToText(
|
|
103
|
+
export function stripTagsToText(
|
|
104
|
+
htmlContent: string,
|
|
105
|
+
ignoreNavFooter?: boolean,
|
|
106
|
+
includeNoscript?: boolean
|
|
107
|
+
): Promise<string>;
|
|
99
108
|
|
|
100
109
|
/**
|
|
101
110
|
* Extract word count from HTML content
|
|
102
111
|
*/
|
|
103
|
-
export function extractWordCount(
|
|
112
|
+
export function extractWordCount(
|
|
113
|
+
htmlContent: string,
|
|
114
|
+
ignoreNavFooter?: boolean,
|
|
115
|
+
includeNoscript?: boolean
|
|
116
|
+
): Promise<{ word_count: number }>;
|
|
104
117
|
|
|
105
118
|
/**
|
|
106
119
|
* Remove navigation and footer elements from DOM element (browser environment)
|
|
@@ -150,27 +163,88 @@ interface BothScenariosStats {
|
|
|
150
163
|
|
|
151
164
|
/**
|
|
152
165
|
* Comprehensive text-only analysis between initial and final HTML (original chrome extension logic)
|
|
166
|
+
* @param initHtml - Initial HTML content (what crawlers/bots see - server-side rendered)
|
|
167
|
+
* @param finHtml - Final HTML content (what users see - client-side rendered)
|
|
168
|
+
* @param ignoreNavFooter - Whether to ignore navigation/footer elements
|
|
169
|
+
* @param includeNoscriptInFinal - Whether to include noscript content in final HTML (client-side)
|
|
153
170
|
*/
|
|
154
171
|
export function analyzeTextComparison(
|
|
155
172
|
initHtml: string,
|
|
156
173
|
finHtml: string,
|
|
157
|
-
ignoreNavFooter?: boolean
|
|
174
|
+
ignoreNavFooter?: boolean,
|
|
175
|
+
includeNoscriptInFinal?: boolean
|
|
158
176
|
): Promise<TextComparison>;
|
|
159
177
|
|
|
160
178
|
/**
|
|
161
179
|
* Calculate basic stats from HTML comparison (original chrome extension logic)
|
|
180
|
+
* @param originalHTML - Initial HTML content (server-side)
|
|
181
|
+
* @param currentHTML - Final HTML content (client-side)
|
|
182
|
+
* @param ignoreNavFooter - Whether to ignore navigation/footer elements
|
|
183
|
+
* @param includeNoscriptInCurrent - Whether to include noscript content in current HTML (client-side)
|
|
162
184
|
*/
|
|
163
185
|
export function calculateStats(
|
|
164
186
|
originalHTML: string,
|
|
165
187
|
currentHTML: string,
|
|
166
|
-
ignoreNavFooter?: boolean
|
|
188
|
+
ignoreNavFooter?: boolean,
|
|
189
|
+
includeNoscriptInCurrent?: boolean
|
|
167
190
|
): Promise<BasicStats>;
|
|
168
191
|
|
|
169
192
|
/**
|
|
170
193
|
* Calculate stats for both nav/footer scenarios (original chrome extension logic)
|
|
194
|
+
* @param originalHTML - Initial HTML content (server-side)
|
|
195
|
+
* @param currentHTML - Final HTML content (client-side)
|
|
196
|
+
* @param includeNoscriptInCurrent - Whether to include noscript content in current HTML (client-side)
|
|
171
197
|
*/
|
|
172
198
|
export function calculateBothScenarioStats(
|
|
173
199
|
originalHTML: string,
|
|
174
|
-
currentHTML: string
|
|
200
|
+
currentHTML: string,
|
|
201
|
+
includeNoscriptInCurrent?: boolean
|
|
175
202
|
): Promise<BothScenariosStats>;
|
|
176
203
|
|
|
204
|
+
/** MARKDOWN DIFF FUNCTIONS */
|
|
205
|
+
interface MarkdownDiffBlock {
|
|
206
|
+
html: string;
|
|
207
|
+
text: string;
|
|
208
|
+
tagName: string;
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
interface MarkdownDiffOperation {
|
|
212
|
+
type: "same" | "add" | "del";
|
|
213
|
+
originalBlock?: MarkdownDiffBlock;
|
|
214
|
+
currentBlock?: MarkdownDiffBlock;
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
/**
|
|
218
|
+
* Diff DOM blocks using LCS algorithm
|
|
219
|
+
*/
|
|
220
|
+
export function diffDOMBlocks(
|
|
221
|
+
originalBlocks: MarkdownDiffBlock[],
|
|
222
|
+
currentBlocks: MarkdownDiffBlock[]
|
|
223
|
+
): MarkdownDiffOperation[];
|
|
224
|
+
|
|
225
|
+
/**
|
|
226
|
+
* Create markdown table diff from parsed DOM children
|
|
227
|
+
*/
|
|
228
|
+
export function createMarkdownTableDiff(
|
|
229
|
+
originalChildren: Element[],
|
|
230
|
+
currentChildren: Element[],
|
|
231
|
+
$?: unknown
|
|
232
|
+
): { tableHtml: string; counters: string };
|
|
233
|
+
|
|
234
|
+
/**
|
|
235
|
+
* Convert HTML to rendered markdown HTML (for display)
|
|
236
|
+
*/
|
|
237
|
+
export function htmlToRenderedMarkdown(
|
|
238
|
+
html: string,
|
|
239
|
+
ignoreNavFooter?: boolean
|
|
240
|
+
): Promise<string>;
|
|
241
|
+
|
|
242
|
+
/**
|
|
243
|
+
* Generate complete markdown diff with HTML to Markdown conversion
|
|
244
|
+
*/
|
|
245
|
+
export function generateMarkdownDiff(
|
|
246
|
+
originalHtml: string,
|
|
247
|
+
currentHtml: string,
|
|
248
|
+
ignoreNavFooter?: boolean
|
|
249
|
+
): Promise<{ originalRenderedHtml: string; currentRenderedHtml: string }>;
|
|
250
|
+
|
package/test/index.test.js
CHANGED
|
@@ -46,6 +46,35 @@ describe('HTML Visibility Analyzer', () => {
|
|
|
46
46
|
expect(result.initialText).to.equal('');
|
|
47
47
|
expect(result.finalText.length).to.be.greaterThan(0);
|
|
48
48
|
});
|
|
49
|
+
|
|
50
|
+
it('should include noscript in initial HTML and exclude in final HTML by default', async () => {
|
|
51
|
+
const initHtml = '<html><body><h1>Title</h1><noscript>Enable JS</noscript><p>Content</p></body></html>';
|
|
52
|
+
const finHtml = '<html><body><h1>Title</h1><noscript>Enable JS</noscript><p>Content</p><div>Extra</div></body></html>';
|
|
53
|
+
const result = await analyzeTextComparison(initHtml, finHtml);
|
|
54
|
+
|
|
55
|
+
// Initial text should include noscript content
|
|
56
|
+
expect(result.initialText).to.include('Enable JS');
|
|
57
|
+
// Final text should NOT include noscript content by default
|
|
58
|
+
expect(result.finalText).to.not.include('Enable JS');
|
|
59
|
+
// Both should have the main content
|
|
60
|
+
expect(result.initialText).to.include('Title');
|
|
61
|
+
expect(result.finalText).to.include('Title');
|
|
62
|
+
});
|
|
63
|
+
|
|
64
|
+
it('should include noscript in final HTML when includeNoscriptInFinal is true', async () => {
|
|
65
|
+
const initHtml = '<html><body><h1>Title</h1><noscript>Enable JS</noscript><p>Content</p></body></html>';
|
|
66
|
+
const finHtml = '<html><body><h1>Title</h1><noscript>Enable JS</noscript><p>Content</p><div>Extra</div></body></html>';
|
|
67
|
+
const result = await analyzeTextComparison(initHtml, finHtml, true, true);
|
|
68
|
+
|
|
69
|
+
// Initial text should include noscript content
|
|
70
|
+
expect(result.initialText).to.include('Enable JS');
|
|
71
|
+
// Final text should ALSO include noscript content when flag is true
|
|
72
|
+
expect(result.finalText).to.include('Enable JS');
|
|
73
|
+
// Both should have the main content
|
|
74
|
+
expect(result.initialText).to.include('Title');
|
|
75
|
+
expect(result.finalText).to.include('Title');
|
|
76
|
+
expect(result.finalText).to.include('Extra');
|
|
77
|
+
});
|
|
49
78
|
});
|
|
50
79
|
|
|
51
80
|
describe('calculateStats', () => {
|
|
@@ -64,6 +93,41 @@ describe('HTML Visibility Analyzer', () => {
|
|
|
64
93
|
expect(result.contentIncreaseRatio).to.be.a('number');
|
|
65
94
|
expect(result.citationReadability).to.be.a('number');
|
|
66
95
|
});
|
|
96
|
+
|
|
97
|
+
it('should handle noscript elements correctly in word counts by default', async () => {
|
|
98
|
+
const originalHtml = '<html><body><h1>Title</h1><noscript>Enable JavaScript</noscript><p>Original content</p></body></html>';
|
|
99
|
+
const currentHtml = '<html><body><h1>Title</h1><noscript>Enable JavaScript</noscript><p>Original content</p><p>New content</p></body></html>';
|
|
100
|
+
const result = await calculateStats(originalHtml, currentHtml);
|
|
101
|
+
|
|
102
|
+
// Word counts should reflect the includeNoscript behavior
|
|
103
|
+
// originalText includes noscript (includeNoscript=true);
|
|
104
|
+
// element text is concatenated without separators: "TitleEnable JavaScriptOriginal content"
|
|
105
|
+
// currentText excludes noscript (includeNoscript=false):
|
|
106
|
+
// "TitleOriginal contentNew content"
|
|
107
|
+
expect(result.wordCountBefore).to.be.greaterThan(0);
|
|
108
|
+
expect(result.wordCountAfter).to.be.greaterThan(0);
|
|
109
|
+
expect(result.contentIncreaseRatio).to.be.a('number');
|
|
110
|
+
});
|
|
111
|
+
|
|
112
|
+
it('should include noscript in current HTML when includeNoscriptInCurrent is true', async () => {
|
|
113
|
+
const originalHtml = '<html><body><h1>Title</h1><noscript>Enable JavaScript</noscript><p>Original content</p></body></html>';
|
|
114
|
+
const currentHtml = '<html><body><h1>Title</h1><noscript>Enable JavaScript</noscript><p>Original content</p><p>New content</p></body></html>';
|
|
115
|
+
const resultWithout = await calculateStats(originalHtml, currentHtml, true, false);
|
|
116
|
+
const resultWith = await calculateStats(originalHtml, currentHtml, true, true);
|
|
117
|
+
|
|
118
|
+
// When noscript is excluded from current, word count should be lower
|
|
119
|
+
expect(resultWithout.wordCountAfter).to.be.lessThan(resultWith.wordCountAfter);
|
|
120
|
+
|
|
121
|
+
// Note: Text extraction concatenates without spaces, so words merge
|
|
122
|
+
// originalHtml with noscript: "TitleEnable JavaScriptOriginal content" = 3 words
|
|
123
|
+
// originalHtml without noscript: "TitleOriginal content" = 2 words
|
|
124
|
+
// currentHtml without noscript: "TitleOriginal contentNew content" = 3 words
|
|
125
|
+
// currentHtml with noscript: "TitleEnable JavaScriptOriginal contentNew content" = 4 words
|
|
126
|
+
expect(resultWithout.wordCountBefore).to.equal(3);
|
|
127
|
+
expect(resultWithout.wordCountAfter).to.equal(3);
|
|
128
|
+
expect(resultWith.wordCountBefore).to.equal(3);
|
|
129
|
+
expect(resultWith.wordCountAfter).to.equal(4);
|
|
130
|
+
});
|
|
67
131
|
});
|
|
68
132
|
|
|
69
133
|
describe('calculateBothScenarioStats', () => {
|
|
@@ -118,5 +182,55 @@ describe('HTML Visibility Analyzer', () => {
|
|
|
118
182
|
expect(text).to.include('Navigation');
|
|
119
183
|
expect(text).to.include('Footer');
|
|
120
184
|
});
|
|
185
|
+
|
|
186
|
+
it('should remove noscript elements by default', async () => {
|
|
187
|
+
const html = '<html><body><h1>Title</h1><noscript>Please enable JavaScript</noscript><p>Content</p></body></html>';
|
|
188
|
+
const text = await stripTagsToText(html);
|
|
189
|
+
|
|
190
|
+
expect(text).to.include('Title');
|
|
191
|
+
expect(text).to.include('Content');
|
|
192
|
+
expect(text).to.not.include('Please enable JavaScript');
|
|
193
|
+
expect(text).to.not.include('noscript');
|
|
194
|
+
});
|
|
195
|
+
|
|
196
|
+
it('should remove noscript elements when includeNoscript is false', async () => {
|
|
197
|
+
const html = '<html><body><h1>Title</h1><noscript>Noscript content</noscript><p>Regular content</p></body></html>';
|
|
198
|
+
const text = await stripTagsToText(html, true, false);
|
|
199
|
+
|
|
200
|
+
expect(text).to.include('Title');
|
|
201
|
+
expect(text).to.include('Regular content');
|
|
202
|
+
expect(text).to.not.include('Noscript content');
|
|
203
|
+
});
|
|
204
|
+
|
|
205
|
+
it('should keep noscript elements when includeNoscript is true', async () => {
|
|
206
|
+
const html = '<html><body><h1>Title</h1><noscript>Noscript fallback</noscript><p>Regular content</p></body></html>';
|
|
207
|
+
const text = await stripTagsToText(html, true, true);
|
|
208
|
+
|
|
209
|
+
expect(text).to.include('Title');
|
|
210
|
+
expect(text).to.include('Regular content');
|
|
211
|
+
expect(text).to.include('Noscript fallback');
|
|
212
|
+
});
|
|
213
|
+
|
|
214
|
+
it('should handle multiple noscript elements with includeNoscript', async () => {
|
|
215
|
+
const html = `<html><body>
|
|
216
|
+
<h1>Title</h1>
|
|
217
|
+
<noscript>First noscript</noscript>
|
|
218
|
+
<p>Content</p>
|
|
219
|
+
<noscript>Second noscript</noscript>
|
|
220
|
+
</body></html>`;
|
|
221
|
+
|
|
222
|
+
const textWithout = await stripTagsToText(html, true, false);
|
|
223
|
+
const textWith = await stripTagsToText(html, true, true);
|
|
224
|
+
|
|
225
|
+
expect(textWithout).to.include('Title');
|
|
226
|
+
expect(textWithout).to.include('Content');
|
|
227
|
+
expect(textWithout).to.not.include('First noscript');
|
|
228
|
+
expect(textWithout).to.not.include('Second noscript');
|
|
229
|
+
|
|
230
|
+
expect(textWith).to.include('Title');
|
|
231
|
+
expect(textWith).to.include('Content');
|
|
232
|
+
expect(textWith).to.include('First noscript');
|
|
233
|
+
expect(textWith).to.include('Second noscript');
|
|
234
|
+
});
|
|
121
235
|
});
|
|
122
236
|
});
|