@adobe/spacecat-shared-html-analyzer 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ # [@adobe/spacecat-shared-html-analyzer-v1.2.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.1.0...@adobe/spacecat-shared-html-analyzer-v1.2.0) (2025-12-04)
2
+
3
+
4
+ ### Features
5
+
6
+ * added utility method to get added blocks in markdown diff, made compatible for nodejs scripts ([#1213](https://github.com/adobe/spacecat-shared/issues/1213)) ([07d8b74](https://github.com/adobe/spacecat-shared/commit/07d8b7419a681f2c5e07870c526ae26d28108989))
7
+
1
8
  # [@adobe/spacecat-shared-html-analyzer-v1.1.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.0.7...@adobe/spacecat-shared-html-analyzer-v1.1.0) (2025-12-01)
2
9
 
3
10
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/spacecat-shared-html-analyzer",
3
- "version": "1.1.0",
3
+ "version": "1.2.0",
4
4
  "description": "Analyze HTML content visibility for AI crawlers and citations - compare static HTML vs fully rendered content",
5
5
  "type": "module",
6
6
  "engines": {
package/src/index.js CHANGED
@@ -49,6 +49,7 @@ export {
49
49
  export {
50
50
  diffDOMBlocks,
51
51
  createMarkdownTableDiff,
52
+ getAddedMarkdownBlocks,
52
53
  generateMarkdownDiff,
53
54
  htmlToRenderedMarkdown,
54
55
  } from './markdown-diff.js';
package/src/markdown-diff.js CHANGED
@@ -18,6 +18,16 @@
18
18
  import { filterHtmlContent } from './html-filter.js';
19
19
  import { htmlToMarkdown, markdownToHtml } from './markdown-converter.js';
20
20
 
21
+ /**
22
+ * Check if element is a browser DOM element (has outerHTML property)
23
+ * @param {Object} el - Element to check
24
+ * @returns {boolean} True if DOM element, false if cheerio element
25
+ * @private
26
+ */
27
+ function isDOMElement(el) {
28
+ return typeof el.outerHTML === 'string';
29
+ }
30
+
21
31
  /**
22
32
  * Diff DOM blocks using LCS algorithm
23
33
  * Compares blocks based on text content while preserving full HTML structure
@@ -100,51 +110,114 @@ export function diffDOMBlocks(originalBlocks, currentBlocks) {
100
110
  return ops;
101
111
  }
102
112
 
113
+ /**
114
+ * Get tag name from element
115
+ * @param {Object} el - Element (DOM or cheerio)
116
+ * @returns {string} Uppercase tag name
117
+ * @private
118
+ */
119
+ function getTagName(el) {
120
+ // DOM elements have uppercase tagName, cheerio has lowercase name
121
+ return (el.tagName || el.name || '').toUpperCase();
122
+ }
123
+
124
+ /**
125
+ * Get children elements
126
+ * @param {Object} el - Element (DOM or cheerio)
127
+ * @returns {Array} Array of child elements
128
+ * @private
129
+ */
130
+ function getChildren(el) {
131
+ if (isDOMElement(el)) {
132
+ return Array.from(el.children || []);
133
+ }
134
+ // Cheerio raw element has children array with type info
135
+ return (el.children || []).filter((c) => c.type === 'tag');
136
+ }
137
+
138
+ /**
139
+ * Get text content from element
140
+ * @param {Object} el - Element (DOM or cheerio)
141
+ * @param {Function} [$] - Cheerio instance (required for cheerio elements)
142
+ * @returns {string} Text content
143
+ * @private
144
+ */
145
+ function getTextContent(el, $) {
146
+ if (isDOMElement(el)) {
147
+ return el.textContent || '';
148
+ }
149
+ // Use cheerio's text() method
150
+ return $(el).text();
151
+ }
152
+
153
+ /**
154
+ * Get outer HTML from element
155
+ * @param {Object} el - Element (DOM or cheerio)
156
+ * @param {Function} [$] - Cheerio instance (required for cheerio elements)
157
+ * @returns {string} Outer HTML
158
+ * @private
159
+ */
160
+ function getOuterHTML(el, $) {
161
+ if (isDOMElement(el)) {
162
+ return el.outerHTML;
163
+ }
164
+ // Use cheerio's html() method
165
+ return $.html(el);
166
+ }
167
+
103
168
  /**
104
169
  * Extract blocks from parsed HTML, breaking down lists into individual items
105
- * @param {Array} children - Array of child elements
170
+ * Works with both browser DOM elements and cheerio raw elements
171
+ * @param {Array} children - Array of child elements (DOM or cheerio)
172
+ * @param {Function} [$] - Cheerio instance (required for Node.js, optional for browser)
106
173
  * @returns {Array<{html: string, text: string, tagName: string}>} Extracted blocks
107
174
  * @private
108
175
  */
109
- function extractBlocks(children) {
176
+ function extractBlocks(children, $) {
110
177
  const blocks = [];
111
178
  children.forEach((el) => {
179
+ const tagName = getTagName(el);
180
+
112
181
  // If it's a list (ul/ol), break it down into individual list items
113
- if (el.tagName === 'UL' || el.tagName === 'OL') {
114
- const listType = el.tagName.toLowerCase();
115
- Array.from(el.children).forEach((li) => {
116
- if (li.tagName === 'LI') {
182
+ if (tagName === 'UL' || tagName === 'OL') {
183
+ const listType = tagName.toLowerCase();
184
+ const listChildren = getChildren(el);
185
+
186
+ listChildren.forEach((li) => {
187
+ if (getTagName(li) === 'LI') {
117
188
  // Skip empty list items - they cause alignment issues
118
- const liText = li.textContent?.trim() || '';
189
+ const liText = getTextContent(li, $).trim();
119
190
  if (!liText) return;
120
191
 
121
192
  // Check if the list item contains nested block elements (p, div, h1-h6, etc.)
122
- const nestedBlocks = Array.from(li.children).filter((child) => {
123
- const tag = child.tagName;
124
- return tag === 'P' || tag === 'DIV' || tag === 'H1' || tag === 'H2'
125
- || tag === 'H3' || tag === 'H4' || tag === 'H5' || tag === 'H6'
126
- || tag === 'BLOCKQUOTE' || tag === 'PRE';
193
+ const liChildren = getChildren(li);
194
+ const nestedBlocks = liChildren.filter((child) => {
195
+ const childTag = getTagName(child);
196
+ return childTag === 'P' || childTag === 'DIV' || childTag === 'H1'
197
+ || childTag === 'H2' || childTag === 'H3' || childTag === 'H4'
198
+ || childTag === 'H5' || childTag === 'H6'
199
+ || childTag === 'BLOCKQUOTE' || childTag === 'PRE';
127
200
  });
128
201
 
129
202
  if (nestedBlocks.length > 0) {
130
203
  // Extract nested blocks individually for better matching
131
204
  // but wrap them in li/ul for proper display
132
205
  nestedBlocks.forEach((child) => {
133
- const childText = child.textContent?.trim() || '';
206
+ const childText = getTextContent(child, $).trim();
134
207
  if (!childText) return; // Skip empty nested blocks too
135
208
 
136
209
  blocks.push({
137
- html: `<${listType}><li>${child.outerHTML}</li></${listType}>`,
138
- text: child.textContent?.trim() || '',
139
- tagName: child.tagName.toLowerCase(),
210
+ html: `<${listType}><li>${getOuterHTML(child, $)}</li></${listType}>`,
211
+ text: childText,
212
+ tagName: getTagName(child).toLowerCase(),
140
213
  });
141
214
  });
142
215
  } else {
143
216
  // No nested blocks, treat the whole li as one block
144
217
  // wrap in ul/ol for proper display
145
218
  blocks.push({
146
- html: `<${listType}>${li.outerHTML}</${listType}>`,
147
- text: li.textContent?.trim() || '',
219
+ html: `<${listType}>${getOuterHTML(li, $)}</${listType}>`,
220
+ text: liText,
148
221
  tagName: 'li',
149
222
  });
150
223
  }
@@ -152,26 +225,58 @@ function extractBlocks(children) {
152
225
  });
153
226
  } else {
154
227
  // For all other elements, add them as-is
155
- blocks.push({
156
- html: el.outerHTML,
157
- text: el.textContent?.trim() || '',
158
- tagName: el.tagName.toLowerCase(),
159
- });
228
+ const text = getTextContent(el, $).trim();
229
+ if (text) {
230
+ blocks.push({
231
+ html: getOuterHTML(el, $),
232
+ text,
233
+ tagName: tagName.toLowerCase(),
234
+ });
235
+ }
160
236
  }
161
237
  });
162
238
  return blocks;
163
239
  }
164
240
 
241
+ /**
242
+ * Get only the added markdown blocks (content in current but not in original)
243
+ * @param {Array} originalChildren - Array of original DOM child elements
244
+ * @param {Array} currentChildren - Array of current DOM child elements
245
+ * @param {Function} [$] - Cheerio instance (required for Node.js, optional for browser)
246
+ * @returns {{addedBlocks: Array<{html: string, text: string}>, addedCount: number}}
247
+ * Added blocks with both HTML and text content
248
+ */
249
+ export function getAddedMarkdownBlocks(originalChildren, currentChildren, $) {
250
+ const originalBlocks = extractBlocks(originalChildren, $);
251
+ const currentBlocks = extractBlocks(currentChildren, $);
252
+
253
+ const ops = diffDOMBlocks(originalBlocks, currentBlocks);
254
+
255
+ // Extract both HTML and text content from added blocks
256
+ const addedBlocks = ops
257
+ .filter((op) => op.type === 'add')
258
+ .map((op) => ({
259
+ html: op.currentBlock.html,
260
+ text: op.currentBlock.text,
261
+ }));
262
+
263
+ return {
264
+ addedBlocks,
265
+ addedCount: addedBlocks.length,
266
+ };
267
+ }
268
+
165
269
  /**
166
270
  * Create markdown table diff from parsed DOM children
167
271
  * @param {Array} originalChildren - Array of original DOM child elements
168
272
  * @param {Array} currentChildren - Array of current DOM child elements
273
+ * @param {Function} [$] - Cheerio instance (required for Node.js, optional for browser)
169
274
  * @returns {{tableHtml: string, counters: string}} Diff table and counter information
170
275
  */
171
- export function createMarkdownTableDiff(originalChildren, currentChildren) {
276
+ export function createMarkdownTableDiff(originalChildren, currentChildren, $) {
172
277
  // Get all block-level elements from both sides and extract their text content
173
- const originalBlocks = extractBlocks(originalChildren);
174
- const currentBlocks = extractBlocks(currentChildren);
278
+ const originalBlocks = extractBlocks(originalChildren, $);
279
+ const currentBlocks = extractBlocks(currentChildren, $);
175
280
 
176
281
  // Run diff algorithm once and count changes
177
282
  const ops = diffDOMBlocks(originalBlocks, currentBlocks);