@adobe/spacecat-shared-html-analyzer 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/package.json +1 -1
- package/src/index.d.ts +47 -0
- package/src/index.js +1 -0
- package/src/markdown-diff.js +131 -26
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
# [@adobe/spacecat-shared-html-analyzer-v1.2.1](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.2.0...@adobe/spacecat-shared-html-analyzer-v1.2.1) (2026-01-15)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Bug Fixes
|
|
5
|
+
|
|
6
|
+
* html-analyser pkg adding exported methods in typescript file ([#1265](https://github.com/adobe/spacecat-shared/issues/1265)) ([10d173b](https://github.com/adobe/spacecat-shared/commit/10d173b68c3c4158a49465a3f3d7a78b68dccc3b))
|
|
7
|
+
|
|
8
|
+
# [@adobe/spacecat-shared-html-analyzer-v1.2.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.1.0...@adobe/spacecat-shared-html-analyzer-v1.2.0) (2025-12-04)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Features
|
|
12
|
+
|
|
13
|
+
* added utility method to get added blocks in markdown diff, made compatible for nodejs scripts ([#1213](https://github.com/adobe/spacecat-shared/issues/1213)) ([07d8b74](https://github.com/adobe/spacecat-shared/commit/07d8b7419a681f2c5e07870c526ae26d28108989))
|
|
14
|
+
|
|
1
15
|
# [@adobe/spacecat-shared-html-analyzer-v1.1.0](https://github.com/adobe/spacecat-shared/compare/@adobe/spacecat-shared-html-analyzer-v1.0.7...@adobe/spacecat-shared-html-analyzer-v1.1.0) (2025-12-01)
|
|
2
16
|
|
|
3
17
|
|
package/package.json
CHANGED
package/src/index.d.ts
CHANGED
|
@@ -174,3 +174,50 @@ export function calculateBothScenarioStats(
|
|
|
174
174
|
currentHTML: string
|
|
175
175
|
): Promise<BothScenariosStats>;
|
|
176
176
|
|
|
177
|
+
/** MARKDOWN DIFF FUNCTIONS */

/**
 * One comparable block extracted from parsed markdown HTML.
 */
interface MarkdownDiffBlock {
  /** HTML of the block; list items are wrapped in their parent `ul`/`ol` tag. */
  html: string;
  /** Trimmed text content of the block; blocks with empty text are skipped. */
  text: string;
  /** Lowercase tag name of the block (e.g. `p`, `li`). */
  tagName: string;
}

/**
 * One step of the block diff produced by `diffDOMBlocks`.
 */
interface MarkdownDiffOperation {
  type: "same" | "add" | "del";
  originalBlock?: MarkdownDiffBlock;
  /** Read by `getAddedMarkdownBlocks` for operations of type `add`. */
  currentBlock?: MarkdownDiffBlock;
}

/**
 * Diff DOM blocks using LCS algorithm
 */
export function diffDOMBlocks(
  originalBlocks: MarkdownDiffBlock[],
  currentBlocks: MarkdownDiffBlock[]
): MarkdownDiffOperation[];

/**
 * Create markdown table diff from parsed DOM children
 * @param $ Cheerio instance (required for Node.js, optional for browser)
 */
export function createMarkdownTableDiff(
  originalChildren: Element[],
  currentChildren: Element[],
  $?: unknown
): { tableHtml: string; counters: string };

/**
 * Get only the added markdown blocks (content in current but not in original)
 * NOTE(review): assumes the package entry point re-exports this function from
 * markdown-diff.js, where it is a named export - confirm against index.js.
 * @param $ Cheerio instance (required for Node.js, optional for browser)
 */
export function getAddedMarkdownBlocks(
  originalChildren: Element[],
  currentChildren: Element[],
  $?: unknown
): { addedBlocks: Array<{ html: string; text: string }>; addedCount: number };

/**
 * Convert HTML to rendered markdown HTML (for display)
 */
export function htmlToRenderedMarkdown(
  html: string,
  ignoreNavFooter?: boolean
): Promise<string>;

/**
 * Generate complete markdown diff with HTML to Markdown conversion
 */
export function generateMarkdownDiff(
  originalHtml: string,
  currentHtml: string,
  ignoreNavFooter?: boolean
): Promise<{ originalRenderedHtml: string; currentRenderedHtml: string }>;
|
|
223
|
+
|
package/src/index.js
CHANGED
package/src/markdown-diff.js
CHANGED
|
@@ -18,6 +18,16 @@
|
|
|
18
18
|
import { filterHtmlContent } from './html-filter.js';
|
|
19
19
|
import { htmlToMarkdown, markdownToHtml } from './markdown-converter.js';
|
|
20
20
|
|
|
21
|
+
/**
 * Check if element is a browser DOM element (has a string outerHTML property).
 * Detection is duck-typed; anything without a string `outerHTML` is treated
 * as a cheerio element by the callers in this module.
 * @param {Object} el - Element to check (null/undefined are tolerated)
 * @returns {boolean} True if DOM element, false if cheerio element or nullish
 * @private
 */
function isDOMElement(el) {
  // Guard first: reading a property of null/undefined would throw a TypeError.
  if (el === null || el === undefined) {
    return false;
  }
  return typeof el.outerHTML === 'string';
}
|
|
30
|
+
|
|
21
31
|
/**
|
|
22
32
|
* Diff DOM blocks using LCS algorithm
|
|
23
33
|
* Compares blocks based on text content while preserving full HTML structure
|
|
@@ -100,51 +110,114 @@ export function diffDOMBlocks(originalBlocks, currentBlocks) {
|
|
|
100
110
|
return ops;
|
|
101
111
|
}
|
|
102
112
|
|
|
113
|
+
/**
 * Get tag name from element
 * @param {Object} el - Element (DOM or cheerio); null/undefined are tolerated
 * @returns {string} Uppercase tag name, or an empty string when the element
 *   has no usable string `tagName`/`name`
 * @private
 */
function getTagName(el) {
  if (el === null || el === undefined) {
    return '';
  }
  // DOM elements have uppercase tagName, cheerio has lowercase name
  const rawName = el.tagName || el.name;
  // Only strings can be upper-cased; anything else counts as "no tag name".
  return typeof rawName === 'string' ? rawName.toUpperCase() : '';
}
|
|
123
|
+
|
|
124
|
+
/**
 * List the child elements of an element.
 * @param {Object} el - Element (DOM or cheerio)
 * @returns {Array} Child elements; empty when the element has no children
 * @private
 */
function getChildren(el) {
  const fromBrowser = isDOMElement(el);
  const childNodes = el.children || [];

  // Browser DOM: `children` is copied into a plain array.
  // Cheerio raw element: `children` carries type info, keep only 'tag' nodes.
  return fromBrowser
    ? Array.from(childNodes)
    : childNodes.filter((node) => node.type === 'tag');
}
|
|
137
|
+
|
|
138
|
+
/**
 * Get text content from element
 * @param {Object} el - Element (DOM or cheerio)
 * @param {Function} [$] - Cheerio instance (required for cheerio elements)
 * @returns {string} Text content (empty string when a DOM element has none)
 * @throws {TypeError} If `el` is not a DOM element and `$` is not a function
 * @private
 */
function getTextContent(el, $) {
  if (isDOMElement(el)) {
    return el.textContent || '';
  }
  // Fail with an actionable message instead of an opaque "$ is not a function".
  if (typeof $ !== 'function') {
    throw new TypeError(
      'getTextContent requires a cheerio instance ($) for non-DOM elements',
    );
  }
  // Use cheerio's text() method
  return $(el).text();
}
|
|
152
|
+
|
|
153
|
+
/**
 * Get outer HTML from element
 * @param {Object} el - Element (DOM or cheerio)
 * @param {Function} [$] - Cheerio instance (required for cheerio elements)
 * @returns {string} Outer HTML
 * @throws {TypeError} If `el` is not a DOM element and `$` has no `html` method
 * @private
 */
function getOuterHTML(el, $) {
  if (isDOMElement(el)) {
    return el.outerHTML;
  }
  // Fail with an actionable message instead of an opaque property-access error.
  if (!$ || typeof $.html !== 'function') {
    throw new TypeError(
      'getOuterHTML requires a cheerio instance ($) for non-DOM elements',
    );
  }
  // Use cheerio's html() method
  return $.html(el);
}
|
|
167
|
+
|
|
103
168
|
/**
|
|
104
169
|
* Extract blocks from parsed HTML, breaking down lists into individual items
|
|
105
|
-
*
|
|
170
|
+
* Works with both browser DOM elements and cheerio raw elements
|
|
171
|
+
* @param {Array} children - Array of child elements (DOM or cheerio)
|
|
172
|
+
* @param {Function} [$] - Cheerio instance (required for Node.js, optional for browser)
|
|
106
173
|
* @returns {Array<{html: string, text: string, tagName: string}>} Extracted blocks
|
|
107
174
|
* @private
|
|
108
175
|
*/
|
|
109
|
-
function extractBlocks(children) {
|
|
176
|
+
function extractBlocks(children, $) {
|
|
110
177
|
const blocks = [];
|
|
111
178
|
children.forEach((el) => {
|
|
179
|
+
const tagName = getTagName(el);
|
|
180
|
+
|
|
112
181
|
// If it's a list (ul/ol), break it down into individual list items
|
|
113
|
-
if (
|
|
114
|
-
const listType =
|
|
115
|
-
|
|
116
|
-
|
|
182
|
+
if (tagName === 'UL' || tagName === 'OL') {
|
|
183
|
+
const listType = tagName.toLowerCase();
|
|
184
|
+
const listChildren = getChildren(el);
|
|
185
|
+
|
|
186
|
+
listChildren.forEach((li) => {
|
|
187
|
+
if (getTagName(li) === 'LI') {
|
|
117
188
|
// Skip empty list items - they cause alignment issues
|
|
118
|
-
const liText = li.
|
|
189
|
+
const liText = getTextContent(li, $).trim();
|
|
119
190
|
if (!liText) return;
|
|
120
191
|
|
|
121
192
|
// Check if the list item contains nested block elements (p, div, h1-h6, etc.)
|
|
122
|
-
const
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
||
|
|
193
|
+
const liChildren = getChildren(li);
|
|
194
|
+
const nestedBlocks = liChildren.filter((child) => {
|
|
195
|
+
const childTag = getTagName(child);
|
|
196
|
+
return childTag === 'P' || childTag === 'DIV' || childTag === 'H1'
|
|
197
|
+
|| childTag === 'H2' || childTag === 'H3' || childTag === 'H4'
|
|
198
|
+
|| childTag === 'H5' || childTag === 'H6'
|
|
199
|
+
|| childTag === 'BLOCKQUOTE' || childTag === 'PRE';
|
|
127
200
|
});
|
|
128
201
|
|
|
129
202
|
if (nestedBlocks.length > 0) {
|
|
130
203
|
// Extract nested blocks individually for better matching
|
|
131
204
|
// but wrap them in li/ul for proper display
|
|
132
205
|
nestedBlocks.forEach((child) => {
|
|
133
|
-
const childText = child.
|
|
206
|
+
const childText = getTextContent(child, $).trim();
|
|
134
207
|
if (!childText) return; // Skip empty nested blocks too
|
|
135
208
|
|
|
136
209
|
blocks.push({
|
|
137
|
-
html: `<${listType}><li>${child
|
|
138
|
-
text:
|
|
139
|
-
tagName: child.
|
|
210
|
+
html: `<${listType}><li>${getOuterHTML(child, $)}</li></${listType}>`,
|
|
211
|
+
text: childText,
|
|
212
|
+
tagName: getTagName(child).toLowerCase(),
|
|
140
213
|
});
|
|
141
214
|
});
|
|
142
215
|
} else {
|
|
143
216
|
// No nested blocks, treat the whole li as one block
|
|
144
217
|
// wrap in ul/ol for proper display
|
|
145
218
|
blocks.push({
|
|
146
|
-
html: `<${listType}>${li
|
|
147
|
-
text:
|
|
219
|
+
html: `<${listType}>${getOuterHTML(li, $)}</${listType}>`,
|
|
220
|
+
text: liText,
|
|
148
221
|
tagName: 'li',
|
|
149
222
|
});
|
|
150
223
|
}
|
|
@@ -152,26 +225,58 @@ function extractBlocks(children) {
|
|
|
152
225
|
});
|
|
153
226
|
} else {
|
|
154
227
|
// For all other elements, add them as-is
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
228
|
+
const text = getTextContent(el, $).trim();
|
|
229
|
+
if (text) {
|
|
230
|
+
blocks.push({
|
|
231
|
+
html: getOuterHTML(el, $),
|
|
232
|
+
text,
|
|
233
|
+
tagName: tagName.toLowerCase(),
|
|
234
|
+
});
|
|
235
|
+
}
|
|
160
236
|
}
|
|
161
237
|
});
|
|
162
238
|
return blocks;
|
|
163
239
|
}
|
|
164
240
|
|
|
241
|
+
/**
 * Collect the blocks that the diff reports as added, i.e. content present in
 * the current children but not in the original ones.
 * @param {Array} originalChildren - Array of original DOM child elements
 * @param {Array} currentChildren - Array of current DOM child elements
 * @param {Function} [$] - Cheerio instance (required for Node.js, optional for browser)
 * @returns {{addedBlocks: Array<{html: string, text: string}>, addedCount: number}}
 * Added blocks with both HTML and text content, plus how many there are
 */
export function getAddedMarkdownBlocks(originalChildren, currentChildren, $) {
  const ops = diffDOMBlocks(
    extractBlocks(originalChildren, $),
    extractBlocks(currentChildren, $),
  );

  // Keep only the 'add' operations, reduced to their HTML and text content
  const addedBlocks = [];
  ops.forEach((op) => {
    if (op.type !== 'add') return;
    const { html, text } = op.currentBlock;
    addedBlocks.push({ html, text });
  });

  return { addedBlocks, addedCount: addedBlocks.length };
}
|
|
268
|
+
|
|
165
269
|
/**
|
|
166
270
|
* Create markdown table diff from parsed DOM children
|
|
167
271
|
* @param {Array} originalChildren - Array of original DOM child elements
|
|
168
272
|
* @param {Array} currentChildren - Array of current DOM child elements
|
|
273
|
+
* @param {Function} [$] - Cheerio instance (required for Node.js, optional for browser)
|
|
169
274
|
* @returns {{tableHtml: string, counters: string}} Diff table and counter information
|
|
170
275
|
*/
|
|
171
|
-
export function createMarkdownTableDiff(originalChildren, currentChildren) {
|
|
276
|
+
export function createMarkdownTableDiff(originalChildren, currentChildren, $) {
|
|
172
277
|
// Get all block-level elements from both sides and extract their text content
|
|
173
|
-
const originalBlocks = extractBlocks(originalChildren);
|
|
174
|
-
const currentBlocks = extractBlocks(currentChildren);
|
|
278
|
+
const originalBlocks = extractBlocks(originalChildren, $);
|
|
279
|
+
const currentBlocks = extractBlocks(currentChildren, $);
|
|
175
280
|
|
|
176
281
|
// Run diff algorithm once and count changes
|
|
177
282
|
const ops = diffDOMBlocks(originalBlocks, currentBlocks);
|