@adobe/spacecat-shared-html-analyzer 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ # [@adobe/spacecat-shared-html-analyzer-v1.2.2](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.2.1...@adobe/spacecat-shared-html-analyzer-v1.2.2) (2026-01-22)
2
+
3
+
4
+ ### Bug Fixes
5
+
6
+ * added option to include noscript tags in server-side html ([#1274](https://github.com/adobe/spacecat-shared/issues/1274)) ([f26e320](https://github.com/adobe/spacecat-shared/commit/f26e3200cc2b129237073da5c7cae1cbfb3ae4b1))
7
+
1
8
  # [@adobe/spacecat-shared-html-analyzer-v1.2.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.2.0...@adobe/spacecat-shared-html-analyzer-v1.2.1) (2026-01-15)
2
9
 
3
10
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/spacecat-shared-html-analyzer",
3
- "version": "1.2.1",
3
+ "version": "1.2.2",
4
4
  "description": "Analyze HTML content visibility for AI crawlers and citations - compare static HTML vs fully rendered content",
5
5
  "type": "module",
6
6
  "engines": {
package/src/analyzer.js CHANGED
@@ -25,12 +25,20 @@ import { hashDJB2, pct } from './utils.js';
25
25
  * @param {string} initHtml - Initial HTML content (what crawlers see)
26
26
  * @param {string} finHtml - Final HTML content (what users see)
27
27
  * @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer elements
28
+ * @param {boolean} [includeNoscriptInFinal=false] -
29
+ * Whether to include noscript content in final HTML
28
30
  * @returns {Promise<Object>} Comprehensive analysis results
29
31
  */
30
- export async function analyzeTextComparison(initHtml, finHtml, ignoreNavFooter = true) {
31
- // Handle both sync (browser) and async (Node.js) stripTagsToText
32
- const initTextResult = stripTagsToText(initHtml, ignoreNavFooter);
33
- const finTextResult = stripTagsToText(finHtml, ignoreNavFooter);
32
+ export async function analyzeTextComparison(
33
+ initHtml,
34
+ finHtml,
35
+ ignoreNavFooter = true,
36
+ includeNoscriptInFinal = false,
37
+ ) {
38
+ // Server-side (initial): Always includes noscript (true) - what crawlers see
39
+ const initTextResult = stripTagsToText(initHtml, ignoreNavFooter, true);
40
+ // Client-side (final): Configurable noscript inclusion - what users see
41
+ const finTextResult = stripTagsToText(finHtml, ignoreNavFooter, includeNoscriptInFinal);
34
42
 
35
43
  const initText = await Promise.resolve(initTextResult);
36
44
  const finText = await Promise.resolve(finTextResult);
@@ -61,12 +69,20 @@ export async function analyzeTextComparison(initHtml, finHtml, ignoreNavFooter =
61
69
  * @param {string} originalHTML - Initial HTML content
62
70
  * @param {string} currentHTML - Final HTML content
63
71
  * @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer elements
72
+ * @param {boolean} [includeNoscriptInCurrent=false] -
73
+ * Whether to include noscript content in current HTML
64
74
  * @returns {Promise<Object>} Basic statistics
65
75
  */
66
- export async function calculateStats(originalHTML, currentHTML, ignoreNavFooter = true) {
67
- // Handle both sync (browser) and async (Node.js) stripTagsToText
68
- const originalTextResult = stripTagsToText(originalHTML, ignoreNavFooter);
69
- const currentTextResult = stripTagsToText(currentHTML, ignoreNavFooter);
76
+ export async function calculateStats(
77
+ originalHTML,
78
+ currentHTML,
79
+ ignoreNavFooter = true,
80
+ includeNoscriptInCurrent = false,
81
+ ) {
82
+ // Server-side (original): Always includes noscript (true) - what crawlers see
83
+ const originalTextResult = stripTagsToText(originalHTML, ignoreNavFooter, true);
84
+ // Client-side (current): Configurable noscript inclusion - what users see
85
+ const currentTextResult = stripTagsToText(currentHTML, ignoreNavFooter, includeNoscriptInCurrent);
70
86
 
71
87
  const originalText = await Promise.resolve(originalTextResult);
72
88
  const currentText = await Promise.resolve(currentTextResult);
@@ -103,14 +119,30 @@ export async function calculateStats(originalHTML, currentHTML, ignoreNavFooter
103
119
  * Calculate stats for both nav/footer scenarios
104
120
  * @param {string} originalHTML - Initial HTML content
105
121
  * @param {string} currentHTML - Final HTML content
122
+ * @param {boolean} [includeNoscriptInCurrent=false] -
123
+ * Whether to include noscript content in current HTML
106
124
  * @returns {Promise<Object>} Analysis results for both scenarios
107
125
  */
108
- export async function calculateBothScenarioStats(originalHTML, currentHTML) {
126
+ export async function calculateBothScenarioStats(
127
+ originalHTML,
128
+ currentHTML,
129
+ includeNoscriptInCurrent = false,
130
+ ) {
109
131
  // Calculate stats with nav/footer ignored
110
- const statsIgnored = await calculateStats(originalHTML, currentHTML, true);
132
+ const statsIgnored = await calculateStats(
133
+ originalHTML,
134
+ currentHTML,
135
+ true,
136
+ includeNoscriptInCurrent,
137
+ );
111
138
 
112
139
  // Calculate stats without nav/footer ignored
113
- const statsNotIgnored = await calculateStats(originalHTML, currentHTML, false);
140
+ const statsNotIgnored = await calculateStats(
141
+ originalHTML,
142
+ currentHTML,
143
+ false,
144
+ includeNoscriptInCurrent,
145
+ );
114
146
  return {
115
147
  withNavFooterIgnored: {
116
148
  wordCountBefore: statsIgnored.wordCountBefore,
@@ -182,16 +182,17 @@ function filterNavigationAndFooterCheerio($) {
182
182
  * @param {string} htmlContent - Raw HTML content
183
183
  * @param {boolean} ignoreNavFooter - Whether to remove navigation/footer elements
184
184
  * @param {boolean} returnText - Whether to return text only
185
+ * @param {boolean} includeNoscript - Whether to include noscript elements (false excludes them)
185
186
  * @returns {string} Filtered content
186
187
  */
187
- function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
188
+ function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText, includeNoscript) {
188
189
  const parser = new DOMParser(); // eslint-disable-line no-undef
189
190
  const doc = parser.parseFromString(htmlContent, 'text/html');
190
191
 
191
192
  // Process the entire document to capture JSON-LD in both head and body
192
193
  const documentElement = doc.documentElement || doc;
193
194
 
194
- // Remove script elements except JSON-LD, also remove style, noscript, template
195
+ // Remove script elements except JSON-LD; style and template are always removed below, noscript only when includeNoscript is false
195
196
  documentElement.querySelectorAll('script').forEach((n) => {
196
197
  // Preserve JSON-LD structured data scripts by converting them to code blocks
197
198
  if (n.type === 'application/ld+json') {
@@ -234,7 +235,12 @@ function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
234
235
  }
235
236
  n.remove();
236
237
  });
237
- documentElement.querySelectorAll('style,noscript,template').forEach((n) => n.remove());
238
+
239
+ if (includeNoscript) {
240
+ documentElement.querySelectorAll('style,template').forEach((n) => n.remove());
241
+ } else {
242
+ documentElement.querySelectorAll('noscript,style,template').forEach((n) => n.remove());
243
+ }
238
244
 
239
245
  // Remove all media elements (images, videos, audio, etc.) to keep only text
240
246
  const mediaSelector = 'img,video,audio,picture,svg,canvas,embed,object,iframe';
@@ -259,9 +265,10 @@ function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
259
265
  * @param {string} htmlContent - Raw HTML content
260
266
  * @param {boolean} ignoreNavFooter - Whether to remove navigation/footer elements
261
267
  * @param {boolean} returnText - Whether to return text only
268
+ * @param {boolean} includeNoscript - Whether to include noscript elements (false excludes them)
262
269
  * @returns {Promise<string>} Filtered content
263
270
  */
264
- async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {
271
+ async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText, includeNoscript) {
265
272
  let cheerio;
266
273
  try {
267
274
  cheerio = await import('cheerio');
@@ -305,7 +312,12 @@ async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {
305
312
  $(this).remove();
306
313
  }
307
314
  });
308
- $('style, noscript, template').remove();
315
+
316
+ if (includeNoscript) {
317
+ $('style, template').remove();
318
+ } else {
319
+ $('style, noscript, template').remove();
320
+ }
309
321
 
310
322
  // Remove all media elements (images, videos, audio, etc.) to keep only text
311
323
  $('img, video, audio, picture, svg, canvas, embed, object, iframe').remove();
@@ -330,45 +342,54 @@ async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {
330
342
  /**
331
343
  * Filter HTML content by removing unwanted elements
332
344
  * @param {string} htmlContent - Raw HTML content
333
- * @param {boolean} ignoreNavFooter - Whether to remove navigation/footer elements
334
- * @param {boolean} returnText - Whether to return text only (true) or filtered HTML (false)
345
+ * @param {boolean} [ignoreNavFooter=true] - Whether to remove navigation/footer elements
346
+ * @param {boolean} [returnText=true] - Whether to return text only (true) or filtered HTML (false)
347
+ * @param {boolean} [includeNoscript=false] - Whether to include noscript elements
335
348
  * @returns {string|Promise<string>} Filtered content (sync in browser, async in Node.js)
336
349
  */
337
- export function filterHtmlContent(htmlContent, ignoreNavFooter = true, returnText = true) {
350
+ export function filterHtmlContent(
351
+ htmlContent,
352
+ ignoreNavFooter = true,
353
+ returnText = true,
354
+ includeNoscript = false,
355
+ ) {
338
356
  if (!htmlContent) return '';
339
357
 
340
358
  // Browser environment (DOMParser) - works in Chrome extensions too - SYNCHRONOUS
341
359
  if (isBrowser()) {
342
- return filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText);
360
+ return filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText, includeNoscript);
343
361
  }
344
362
 
345
363
  // Node.js environment (cheerio) - dynamic import to avoid bundling issues - ASYNCHRONOUS
346
- return filterHtmlNode(htmlContent, ignoreNavFooter, returnText);
364
+ return filterHtmlNode(htmlContent, ignoreNavFooter, returnText, includeNoscript);
347
365
  }
348
366
 
349
367
  /**
350
368
  * Strip HTML tags and return plain text
369
+ *
351
370
  * @param {string} htmlContent - Raw HTML content
352
- * @param {boolean} ignoreNavFooter - Whether to remove navigation/footer elements
371
+ * @param {boolean} [ignoreNavFooter=true] - Whether to remove navigation/footer elements
372
+ * @param {boolean} [includeNoscript=false] - Whether to include noscript elements
353
373
  * @returns {string|Promise<string>} Plain text content (sync in browser, async in Node.js)
354
374
  */
355
- export function stripTagsToText(htmlContent, ignoreNavFooter = true) {
356
- return filterHtmlContent(htmlContent, ignoreNavFooter, true);
375
+ export function stripTagsToText(htmlContent, ignoreNavFooter = true, includeNoscript = false) {
376
+ return filterHtmlContent(htmlContent, ignoreNavFooter, true, includeNoscript);
357
377
  }
358
378
 
359
379
  /**
360
380
  * Extract word count from HTML content
361
381
  * @param {string} htmlContent - Raw HTML content
362
- * @param {boolean} ignoreNavFooter - Whether to ignore navigation/footer
382
+ * @param {boolean} [ignoreNavFooter=true] - Whether to ignore navigation/footer
383
+ * @param {boolean} [includeNoscript=false] - Whether to include noscript elements
363
384
  * @returns {Object|Promise<Object>} Object with word_count property
364
385
  * (sync in browser, async in Node.js)
365
386
  */
366
- export function extractWordCount(htmlContent, ignoreNavFooter = true) {
387
+ export function extractWordCount(htmlContent, ignoreNavFooter = true, includeNoscript = false) {
367
388
  if (!htmlContent) {
368
389
  return { word_count: 0 };
369
390
  }
370
391
 
371
- const textContent = stripTagsToText(htmlContent, ignoreNavFooter);
392
+ const textContent = stripTagsToText(htmlContent, ignoreNavFooter, includeNoscript);
372
393
 
373
394
  // Handle both sync (browser) and async (Node.js) cases
374
395
  if (textContent && typeof textContent.then === 'function') {
package/src/index.d.ts CHANGED
@@ -90,17 +90,30 @@ export function generateDiffReport(initText: string, finText: string, mode?: "wo
90
90
  /**
91
91
  * Filter HTML content by removing unwanted elements
92
92
  */
93
- export function filterHtmlContent(htmlContent: string, ignoreNavFooter?: boolean, returnText?: boolean): Promise<string>;
93
+ export function filterHtmlContent(
94
+ htmlContent: string,
95
+ ignoreNavFooter?: boolean,
96
+ returnText?: boolean,
97
+ includeNoscript?: boolean
98
+ ): Promise<string>;
94
99
 
95
100
  /**
96
101
  * Extract plain text from HTML content
97
102
  */
98
- export function stripTagsToText(htmlContent: string, ignoreNavFooter?: boolean): Promise<string>;
103
+ export function stripTagsToText(
104
+ htmlContent: string,
105
+ ignoreNavFooter?: boolean,
106
+ includeNoscript?: boolean
107
+ ): Promise<string>;
99
108
 
100
109
  /**
101
110
  * Extract word count from HTML content
102
111
  */
103
- export function extractWordCount(htmlContent: string, ignoreNavFooter?: boolean): Promise<{ word_count: number }>;
112
+ export function extractWordCount(
113
+ htmlContent: string,
114
+ ignoreNavFooter?: boolean,
115
+ includeNoscript?: boolean
116
+ ): Promise<{ word_count: number }>;
104
117
 
105
118
  /**
106
119
  * Remove navigation and footer elements from DOM element (browser environment)
@@ -150,28 +163,42 @@ interface BothScenariosStats {
150
163
 
151
164
  /**
152
165
  * Comprehensive text-only analysis between initial and final HTML (original chrome extension logic)
166
+ * @param initHtml - Initial HTML content (what crawlers/bots see - server-side rendered)
167
+ * @param finHtml - Final HTML content (what users see - client-side rendered)
168
+ * @param ignoreNavFooter - Whether to ignore navigation/footer elements
169
+ * @param includeNoscriptInFinal - Whether to include noscript content in final HTML (client-side)
153
170
  */
154
171
  export function analyzeTextComparison(
155
172
  initHtml: string,
156
173
  finHtml: string,
157
- ignoreNavFooter?: boolean
174
+ ignoreNavFooter?: boolean,
175
+ includeNoscriptInFinal?: boolean
158
176
  ): Promise<TextComparison>;
159
177
 
160
178
  /**
161
179
  * Calculate basic stats from HTML comparison (original chrome extension logic)
180
+ * @param originalHTML - Initial HTML content (server-side)
181
+ * @param currentHTML - Final HTML content (client-side)
182
+ * @param ignoreNavFooter - Whether to ignore navigation/footer elements
183
+ * @param includeNoscriptInCurrent - Whether to include noscript content in current HTML (client-side)
162
184
  */
163
185
  export function calculateStats(
164
186
  originalHTML: string,
165
187
  currentHTML: string,
166
- ignoreNavFooter?: boolean
188
+ ignoreNavFooter?: boolean,
189
+ includeNoscriptInCurrent?: boolean
167
190
  ): Promise<BasicStats>;
168
191
 
169
192
  /**
170
193
  * Calculate stats for both nav/footer scenarios (original chrome extension logic)
194
+ * @param originalHTML - Initial HTML content (server-side)
195
+ * @param currentHTML - Final HTML content (client-side)
196
+ * @param includeNoscriptInCurrent - Whether to include noscript content in current HTML (client-side)
171
197
  */
172
198
  export function calculateBothScenarioStats(
173
199
  originalHTML: string,
174
- currentHTML: string
200
+ currentHTML: string,
201
+ includeNoscriptInCurrent?: boolean
175
202
  ): Promise<BothScenariosStats>;
176
203
 
177
204
  /** MARKDOWN DIFF FUNCTIONS */
@@ -46,6 +46,35 @@ describe('HTML Visibility Analyzer', () => {
46
46
  expect(result.initialText).to.equal('');
47
47
  expect(result.finalText.length).to.be.greaterThan(0);
48
48
  });
49
+
50
+ it('should include noscript in initial HTML and exclude in final HTML by default', async () => {
51
+ const initHtml = '<html><body><h1>Title</h1><noscript>Enable JS</noscript><p>Content</p></body></html>';
52
+ const finHtml = '<html><body><h1>Title</h1><noscript>Enable JS</noscript><p>Content</p><div>Extra</div></body></html>';
53
+ const result = await analyzeTextComparison(initHtml, finHtml);
54
+
55
+ // Initial text should include noscript content
56
+ expect(result.initialText).to.include('Enable JS');
57
+ // Final text should NOT include noscript content by default
58
+ expect(result.finalText).to.not.include('Enable JS');
59
+ // Both should have the main content
60
+ expect(result.initialText).to.include('Title');
61
+ expect(result.finalText).to.include('Title');
62
+ });
63
+
64
+ it('should include noscript in final HTML when includeNoscriptInFinal is true', async () => {
65
+ const initHtml = '<html><body><h1>Title</h1><noscript>Enable JS</noscript><p>Content</p></body></html>';
66
+ const finHtml = '<html><body><h1>Title</h1><noscript>Enable JS</noscript><p>Content</p><div>Extra</div></body></html>';
67
+ const result = await analyzeTextComparison(initHtml, finHtml, true, true);
68
+
69
+ // Initial text should include noscript content
70
+ expect(result.initialText).to.include('Enable JS');
71
+ // Final text should ALSO include noscript content when flag is true
72
+ expect(result.finalText).to.include('Enable JS');
73
+ // Both should have the main content
74
+ expect(result.initialText).to.include('Title');
75
+ expect(result.finalText).to.include('Title');
76
+ expect(result.finalText).to.include('Extra');
77
+ });
49
78
  });
50
79
 
51
80
  describe('calculateStats', () => {
@@ -64,6 +93,41 @@ describe('HTML Visibility Analyzer', () => {
64
93
  expect(result.contentIncreaseRatio).to.be.a('number');
65
94
  expect(result.citationReadability).to.be.a('number');
66
95
  });
96
+
97
+ it('should handle noscript elements correctly in word counts by default', async () => {
98
+ const originalHtml = '<html><body><h1>Title</h1><noscript>Enable JavaScript</noscript><p>Original content</p></body></html>';
99
+ const currentHtml = '<html><body><h1>Title</h1><noscript>Enable JavaScript</noscript><p>Original content</p><p>New content</p></body></html>';
100
+ const result = await calculateStats(originalHtml, currentHtml);
101
+
102
+ // Word counts should reflect the includeNoscript behavior
103
+ // originalText includes noscript (includeNoscript=true):
104
+ // "TitleEnable JavaScriptOriginal content" (text of adjacent elements merges without spaces)
105
+ // currentText excludes noscript (includeNoscript=false):
106
+ // "TitleOriginal contentNew content" (text of adjacent elements merges without spaces)
107
+ expect(result.wordCountBefore).to.be.greaterThan(0);
108
+ expect(result.wordCountAfter).to.be.greaterThan(0);
109
+ expect(result.contentIncreaseRatio).to.be.a('number');
110
+ });
111
+
112
+ it('should include noscript in current HTML when includeNoscriptInCurrent is true', async () => {
113
+ const originalHtml = '<html><body><h1>Title</h1><noscript>Enable JavaScript</noscript><p>Original content</p></body></html>';
114
+ const currentHtml = '<html><body><h1>Title</h1><noscript>Enable JavaScript</noscript><p>Original content</p><p>New content</p></body></html>';
115
+ const resultWithout = await calculateStats(originalHtml, currentHtml, true, false);
116
+ const resultWith = await calculateStats(originalHtml, currentHtml, true, true);
117
+
118
+ // When noscript is excluded from current, word count should be lower
119
+ expect(resultWithout.wordCountAfter).to.be.lessThan(resultWith.wordCountAfter);
120
+
121
+ // Note: Text extraction concatenates without spaces, so words merge
122
+ // originalHtml with noscript: "TitleEnable JavaScriptOriginal content" = 3 words
123
+ // originalHtml without noscript: "TitleOriginal content" = 2 words
124
+ // currentHtml without noscript: "TitleOriginal contentNew content" = 3 words
125
+ // currentHtml with noscript: "TitleEnable JavaScriptOriginal contentNew content" = 4 words
126
+ expect(resultWithout.wordCountBefore).to.equal(3);
127
+ expect(resultWithout.wordCountAfter).to.equal(3);
128
+ expect(resultWith.wordCountBefore).to.equal(3);
129
+ expect(resultWith.wordCountAfter).to.equal(4);
130
+ });
67
131
  });
68
132
 
69
133
  describe('calculateBothScenarioStats', () => {
@@ -118,5 +182,55 @@ describe('HTML Visibility Analyzer', () => {
118
182
  expect(text).to.include('Navigation');
119
183
  expect(text).to.include('Footer');
120
184
  });
185
+
186
+ it('should remove noscript elements by default', async () => {
187
+ const html = '<html><body><h1>Title</h1><noscript>Please enable JavaScript</noscript><p>Content</p></body></html>';
188
+ const text = await stripTagsToText(html);
189
+
190
+ expect(text).to.include('Title');
191
+ expect(text).to.include('Content');
192
+ expect(text).to.not.include('Please enable JavaScript');
193
+ expect(text).to.not.include('noscript');
194
+ });
195
+
196
+ it('should remove noscript elements when includeNoscript is false', async () => {
197
+ const html = '<html><body><h1>Title</h1><noscript>Noscript content</noscript><p>Regular content</p></body></html>';
198
+ const text = await stripTagsToText(html, true, false);
199
+
200
+ expect(text).to.include('Title');
201
+ expect(text).to.include('Regular content');
202
+ expect(text).to.not.include('Noscript content');
203
+ });
204
+
205
+ it('should keep noscript elements when includeNoscript is true', async () => {
206
+ const html = '<html><body><h1>Title</h1><noscript>Noscript fallback</noscript><p>Regular content</p></body></html>';
207
+ const text = await stripTagsToText(html, true, true);
208
+
209
+ expect(text).to.include('Title');
210
+ expect(text).to.include('Regular content');
211
+ expect(text).to.include('Noscript fallback');
212
+ });
213
+
214
+ it('should handle multiple noscript elements with includeNoscript', async () => {
215
+ const html = `<html><body>
216
+ <h1>Title</h1>
217
+ <noscript>First noscript</noscript>
218
+ <p>Content</p>
219
+ <noscript>Second noscript</noscript>
220
+ </body></html>`;
221
+
222
+ const textWithout = await stripTagsToText(html, true, false);
223
+ const textWith = await stripTagsToText(html, true, true);
224
+
225
+ expect(textWithout).to.include('Title');
226
+ expect(textWithout).to.include('Content');
227
+ expect(textWithout).to.not.include('First noscript');
228
+ expect(textWithout).to.not.include('Second noscript');
229
+
230
+ expect(textWith).to.include('Title');
231
+ expect(textWith).to.include('Content');
232
+ expect(textWith).to.include('First noscript');
233
+ expect(textWith).to.include('Second noscript');
234
+ });
121
235
  });
122
236
  });