page-analyzer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,344 @@
1
// Selector text used for the document body and as the root of every generated path.
const BODY_SELECTOR = 'body';

// Lower-case tag names that may be addressed through their `name` attribute.
const FORM_ELEMENTS = new Set(['input', 'select', 'textarea', 'button']);

// Upper-case tag names that the V2 rules treat as structural containers.
const SELECTOR_V2_CONTAINER_TAGS = ['DIV', 'SECTION', 'ARTICLE', 'MAIN', 'ASIDE', 'NAV', 'HEADER', 'FOOTER'];

// Upper-case tag names that are always rejected by the V2 rules.
const SELECTOR_V2_EXCLUDE_TAGS = ['SCRIPT', 'STYLE', 'LINK', 'META', 'NOSCRIPT'];

// Element ids that are always rejected by the V2 rules.
const SELECTOR_V2_EXCLUDE_IDS = ['web-pixels-manager-sandbox-container'];

// Memo of generated selectors keyed by DOM node; weak so nodes can be collected.
const selectorCache = new WeakMap();
19
+
20
/**
 * Counters describing selector generation.
 *
 * `updateMetrics(time)` folds one more duration into `averageTime`, weighting
 * the previous average by the value `totalCalls` holds at the moment of the call.
 * It does not modify `totalCalls` itself.
 */
const selectorMetrics = {
  totalCalls: 0,
  cacheHits: 0,
  averageTime: 0,
  errors: 0,
  updateMetrics(time) {
    const weightedSum = this.averageTime * this.totalCalls + time;
    const sampleCount = this.totalCalls + 1;
    this.averageTime = weightedSum / sampleCount;
  }
};
29
+
30
/**
 * Returns a timestamp in milliseconds.
 * Uses `performance.now()` when that function exists, otherwise `Date.now()`.
 * @returns {number}
 */
function now() {
  const hasPerformanceClock =
    typeof performance !== 'undefined' && typeof performance.now === 'function';
  return hasPerformanceClock ? performance.now() : Date.now();
}
36
+
37
/**
 * Tells whether `node` reports itself as a ShadowRoot via its
 * `Object.prototype.toString` tag.
 * @param {*} node
 * @returns {boolean}
 */
function isShadowDom(node) {
  const typeTag = Object.prototype.toString.call(node);
  return typeTag === '[object ShadowRoot]';
}
40
+
41
/**
 * Returns the `host` of a shadow root, or `null` when the argument is
 * nullish or its `host` is falsy.
 * @param {*} shadowDom
 * @returns {*|null}
 */
function getShadowDomHost(shadowDom) {
  const host = shadowDom?.host;
  return host ? host : null;
}
44
+
45
/**
 * Tells whether `node` carries a truthy `shadowRoot` property.
 * @param {*} node
 * @returns {boolean}
 */
function hasShadowRoot(node) {
  return !!node?.shadowRoot;
}
49
+
50
/**
 * Escapes a value so it can be embedded as a CSS identifier (for example after `#`).
 *
 * Rules applied, in order:
 *  1. every character outside `[a-zA-Z0-9_-]` is prefixed with a backslash;
 *  2. a leading digit is rewritten as the hex escape `\3N ` (digit code point
 *     followed by the terminating space);
 *  3. a leading `-` followed by a digit or another `-` is prefixed with a backslash;
 *  4. a value that is exactly `-` is prefixed with a backslash, because a lone
 *     hyphen is not a valid identifier.
 *
 * @param {*} component - Value to escape; falsy values are treated as ''.
 * @returns {string} The escaped identifier text.
 */
function escapeCss(component) {
  return String(component || '')
    // The `u` flag makes the class match whole code points, so an astral
    // character (e.g. an emoji) gets one backslash instead of one per
    // surrogate half, which would produce an unparsable escape.
    .replace(/([^a-zA-Z0-9_-])/gu, '\\$1')
    .replace(/^([0-9])/, '\\3$1 ')
    .replace(/^(-[0-9-])/, '\\$1')
    .replace(/^-$/, '\\-');
}
56
+
57
/**
 * Reads one attribute from a parsed node's `attribs` map.
 *
 * @param {*} node - Node expected to expose an `attribs` object.
 * @param {string} attr - Attribute name to read.
 * @returns {string|null} The attribute value as a string ('' when the stored
 *   value is null/undefined), or `null` when the node has no `attribs` object
 *   or does not own that attribute.
 */
function getElementAttribute(node, attr) {
  const attribs = node?.attribs;
  if (!attribs || typeof attribs !== 'object') {
    return null;
  }
  // Own-property check: `attr in attribs` would also match inherited members
  // such as `toString` or `constructor` and report them as attributes.
  if (!Object.prototype.hasOwnProperty.call(attribs, attr)) {
    return null;
  }
  return String(attribs[attr] ?? '');
}
66
+
67
/**
 * Returns the node's `name` lower-cased, or '' when the node is nullish or
 * its `name` is falsy.
 * @param {*} node
 * @returns {string}
 */
function getTagName(node) {
  const rawName = node?.name || '';
  return String(rawName).toLowerCase();
}
70
+
71
/**
 * Lists the children of `node` that have a non-empty string `name`.
 *
 * @param {Function} $ - Query function; `$(node).children().toArray()` must
 *   yield the child nodes.
 * @param {*} node - Parent node; a falsy value yields an empty list.
 * @returns {Array} Children in the order returned by `$`.
 */
function getElementChildren($, node) {
  if (!node) {
    return [];
  }
  const isNamedElement = (candidate) =>
    typeof candidate?.name === 'string' && candidate.name.length > 0;
  const childNodes = $(node).children().toArray();
  return childNodes.filter(isNamedElement);
}
80
+
81
/**
 * Returns the parent of `node` when that parent has a string `name`.
 *
 * @param {Function} $ - Query function; `$(node).parent()` must yield an
 *   indexable collection with a `length`.
 * @param {*} node - Node whose parent is wanted.
 * @returns {*|null} The parent node, or `null` when `node` is falsy, the
 *   collection is empty, or the first entry has no string `name`.
 */
function getParentElement($, node) {
  if (!node) {
    return null;
  }
  const wrapped = $(node).parent();
  const candidate = wrapped && wrapped.length !== 0 ? wrapped[0] : null;
  return candidate && typeof candidate.name === 'string' ? candidate : null;
}
95
+
96
/**
 * Tells whether any element child of `parentNode` other than `node` has an
 * `id` attribute equal to `id`.
 *
 * @param {Function} $ - Query function passed through to getElementChildren.
 * @param {*} node - The node being described; skipped during the scan.
 * @param {*} parentNode - Node whose element children are scanned.
 * @param {string} id - Id value to look for.
 * @returns {boolean}
 */
function hasSameIdSibling($, node, parentNode, id) {
  for (const sibling of getElementChildren($, parentNode)) {
    if (sibling !== node && getElementAttribute(sibling, 'id') === id) {
      return true;
    }
  }
  return false;
}
104
+
105
/**
 * Decides whether `node` counts as an empty container.
 *
 * - With no element children, the node is empty when its trimmed text is ''.
 * - With element children, the node's own text is not consulted: it is empty
 *   unless some child is "valid". A child is valid when its tag is not in
 *   SELECTOR_V2_EXCLUDE_TAGS and it is either not a container tag or is a
 *   container that is itself non-empty (checked recursively).
 *
 * @param {Function} $ - Query function used for children and text lookup.
 * @param {*} node - Node to inspect.
 * @returns {boolean}
 */
function isEmptyContainer($, node) {
  const elementChildren = getElementChildren($, node);

  if (elementChildren.length === 0) {
    const trimmedText = String($(node).text() || '').trim();
    return trimmedText.length === 0;
  }

  for (const child of elementChildren) {
    const upperTag = String(child?.name || '').toUpperCase();
    if (SELECTOR_V2_EXCLUDE_TAGS.includes(upperTag)) {
      continue;
    }
    if (!SELECTOR_V2_CONTAINER_TAGS.includes(upperTag)) {
      return false;
    }
    if (!isEmptyContainer($, child)) {
      return false;
    }
  }
  return true;
}
124
+
125
/**
 * Decides whether `node` must be ignored by the V2 selector rules.
 *
 * A node is invalid when its tag is in SELECTOR_V2_EXCLUDE_TAGS, when its id
 * is in SELECTOR_V2_EXCLUDE_IDS, or when it is a container tag without a
 * shadow root that isEmptyContainer reports as empty.
 *
 * @param {Function} $ - Query function passed through to isEmptyContainer.
 * @param {*} node - Node to inspect.
 * @returns {boolean}
 */
function isInvalidContainer($, node) {
  const upperTag = String(node?.name || '').toUpperCase();
  if (SELECTOR_V2_EXCLUDE_TAGS.includes(upperTag)) {
    return true;
  }

  const id = getElementAttribute(node, 'id');
  if (id && SELECTOR_V2_EXCLUDE_IDS.includes(id)) {
    return true;
  }

  const isContainerTag = SELECTOR_V2_CONTAINER_TAGS.includes(upperTag);
  return isContainerTag && !hasShadowRoot(node) && isEmptyContainer($, node);
}
142
+
143
/**
 * Returns the first node matched by `$('body')`, or `null` when nothing matches.
 * @param {Function} $ - Query function; `$('body').first()` must yield an
 *   indexable collection with a `length`.
 * @returns {*|null}
 */
function getBodyNode($) {
  const firstBody = $('body').first();
  const found = Boolean(firstBody) && firstBody.length !== 0;
  return found ? firstBody[0] || null : null;
}
150
+
151
/**
 * Lists the element children of `<body>`.
 *
 * @param {Function} $ - Query function for the parsed document.
 * @param {boolean} selectorV2 - When truthy, children rejected by
 *   isInvalidContainer are removed from the list.
 * @returns {Array} The children; an empty array when there is no body or when
 *   any step throws (the error is logged with console.warn).
 */
function getValidBodyChildren($, selectorV2) {
  try {
    const bodyNode = getBodyNode($);
    if (!bodyNode) {
      return [];
    }

    const bodyChildren = getElementChildren($, bodyNode);
    return selectorV2
      ? bodyChildren.filter((child) => !isInvalidContainer($, child))
      : bodyChildren;
  } catch (error) {
    console.warn('Error in getValidBodyChildren:', error);
    return [];
  }
}
169
+
170
/**
 * Builds the selector for a node whose parent is `<body>`.
 *
 * Preference order:
 *  1. `body>#id` when the node has an id (and `ignoreId` is falsy) that no
 *     sibling shares;
 *  2. `body>tag:eq(n)` where `n` is the node's zero-based position among the
 *     same-tag entries of getValidBodyChildren.
 *
 * @param {Function} $ - Query function for the parsed document.
 * @param {*} node - The body child to describe.
 * @param {*} parentNode - The node's parent (the body element).
 * @param {boolean} ignoreId - When truthy, the id strategy is skipped.
 * @param {boolean} selectorV2 - Forwarded to getValidBodyChildren.
 * @returns {string} The selector text.
 */
function getBodyChildSelector($, node, parentNode, ignoreId, selectorV2) {
  const domNodeName = getTagName(node);
  const domId = ignoreId ? '' : getElementAttribute(node, 'id');

  if (domId && !hasSameIdSibling($, node, parentNode, domId)) {
    return `${BODY_SELECTOR}>#${escapeCss(domId)}`;
  }

  const hasSameTag = (sibling) => getTagName(sibling) === domNodeName;
  let position = getValidBodyChildren($, selectorV2).filter(hasSameTag).indexOf(node);

  // getValidBodyChildren always returns an array, so a truthiness-based
  // fallback can never trigger. When the node is missing from that list (it
  // was filtered out in V2 mode, or the lookup failed and returned []), count
  // among the parent's own element children instead of emitting `:eq(-1)`.
  if (position === -1) {
    position = getElementChildren($, parentNode).filter(hasSameTag).indexOf(node);
  }

  return `${BODY_SELECTOR}>${domNodeName}:eq(${position})`;
}
185
+
186
/**
 * Builds a name-based selector for a form control.
 *
 * Applies only when the node's tag is in FORM_ELEMENTS and it has a non-empty
 * `name` attribute. Candidates are all same-tag descendants of `parentNode`
 * with the same `name`:
 *  - exactly one candidate: `<parent>>tag:input[name='…']`;
 *  - several candidates: the same text plus `:eq(n)`, where `n` is the node's
 *    zero-based position among them.
 *
 * @param {Function} $ - Query function for the parsed document.
 * @param {*} node - The form control to describe.
 * @param {*} parentNode - The node's parent element.
 * @param {boolean} ignoreId - Forwarded to getSelector for the parent.
 * @param {boolean} selectorV2 - Forwarded to getSelector for the parent.
 * @param {boolean} useCache - Forwarded to getSelector for the parent.
 * @returns {string|null} The selector, or `null` when the strategy does not apply.
 */
function getFormElementSelector($, node, parentNode, ignoreId, selectorV2, useCache) {
  const domNodeName = getTagName(node);
  if (!FORM_ELEMENTS.has(domNodeName)) {
    return null;
  }

  const name = getElementAttribute(node, 'name');
  if (!name) {
    return null;
  }

  // Match the name by comparing attribute values rather than by splicing it
  // into a query string: a quote or backslash inside the name would otherwise
  // produce an unparsable query and make the lookup throw.
  const formNodes = $(parentNode)
    .find(domNodeName)
    .toArray()
    .filter(
      (candidate) =>
        getTagName(candidate) === domNodeName && getElementAttribute(candidate, 'name') === name
    );

  if (formNodes.length === 0) {
    return null;
  }

  const index = formNodes.indexOf(node);
  if (formNodes.length > 1 && index === -1) {
    return null;
  }

  const parentSelector = getSelector($, {
    dom: parentNode,
    ignoreId,
    selectorV2,
    useCache
  });
  // Keep the emitted attribute selector well-formed for names containing
  // quotes or backslashes; names without them are emitted unchanged.
  const quotedName = name.replace(/(['\\])/g, '\\$1');
  const base = `${parentSelector}>${domNodeName}:input[name='${quotedName}']`;

  return formNodes.length > 1 ? `${base}:eq(${index})` : base;
}
224
+
225
/**
 * Builds `<parent selector>>tag:eq(n)`, where `n` is the node's zero-based
 * position among the parent's element children with the same tag name.
 *
 * @param {Function} $ - Query function for the parsed document.
 * @param {*} node - The node to describe.
 * @param {*} parentNode - The node's parent element.
 * @param {boolean} ignoreId - Forwarded to getSelector for the parent.
 * @param {boolean} selectorV2 - Forwarded to getSelector for the parent.
 * @param {boolean} useCache - Forwarded to getSelector for the parent.
 * @returns {string} The selector, or '' when the node is not among those children.
 */
function getPositionalSelector($, node, parentNode, ignoreId, selectorV2, useCache) {
  const domNodeName = getTagName(node);
  const index = getElementChildren($, parentNode)
    .filter((child) => getTagName(child) === domNodeName)
    .indexOf(node);

  if (index === -1) {
    return '';
  }

  const parentSelector = getSelector($, {
    dom: parentNode,
    ignoreId,
    selectorV2,
    useCache
  });
  return `${parentSelector}>${domNodeName}:eq(${index})`;
}
242
+
243
/**
 * Builds a jQuery-style selector path (rooted at `body`) for a parsed node.
 *
 * Strategies, in order:
 *  1. shadow roots: `<host selector>>shadowRoot`;
 *  2. `body` / `html`: `body`;
 *  3. direct children of body: delegated to getBodyChildSelector;
 *  4. id unique among siblings: `<parent>>#id` (skipped when `ignoreId`);
 *  5. named form controls: delegated to getFormElementSelector;
 *  6. otherwise: delegated to getPositionalSelector.
 *
 * Results are memoized per node AND per (`ignoreId`, `selectorV2`) combination,
 * because both options change the generated text. Timing and counters are
 * recorded in `selectorMetrics`.
 *
 * @param {Function} $ - Query function for the parsed document.
 * @param {Object} [options]
 * @param {*} options.dom - Node to describe.
 * @param {boolean} [options.ignoreId=false] - Skip id-based strategies.
 * @param {boolean} [options.selectorV2=false] - Use the V2 body-child rules.
 * @param {boolean} [options.useCache=true] - Read from / write to the selector cache.
 * @returns {string} The selector, or '' when `$`/`dom` is missing, the node
 *   has no tag name or parent element, or generation throws (the error is logged).
 */
export function getSelector($, { dom, ignoreId = false, selectorV2 = false, useCache = true } = {}) {
  if (!$ || !dom) {
    return '';
  }

  const start = now();
  // The options are part of the cache identity: keying by node alone would
  // hand the first variant's text back to a later call with other options.
  const cacheKey = `${ignoreId ? 1 : 0}|${selectorV2 ? 1 : 0}`;

  const recurse = (target) =>
    getSelector($, {
      dom: target,
      ignoreId,
      selectorV2,
      useCache
    });

  const cacheAndReturn = (selector) => {
    if (useCache) {
      let variants = selectorCache.get(dom);
      if (!(variants instanceof Map)) {
        variants = new Map();
        selectorCache.set(dom, variants);
      }
      variants.set(cacheKey, selector);
    }
    return selector;
  };

  try {
    if (useCache) {
      const variants = selectorCache.get(dom);
      const cached = variants instanceof Map ? variants.get(cacheKey) : undefined;
      if (cached) {
        selectorMetrics.cacheHits += 1;
        return cached;
      }
    }

    // Must run before the tag-name guard: a shadow root has no `name`, so the
    // guard below would otherwise return '' before this branch is reached.
    if (isShadowDom(dom)) {
      return cacheAndReturn(`${recurse(getShadowDomHost(dom))}>shadowRoot`);
    }

    const domNodeName = getTagName(dom);
    if (!domNodeName) {
      return '';
    }

    if (domNodeName === BODY_SELECTOR || domNodeName === 'html') {
      return BODY_SELECTOR;
    }

    const parentNode = getParentElement($, dom);
    if (!parentNode) {
      return '';
    }

    if (getTagName(parentNode) === BODY_SELECTOR) {
      return cacheAndReturn(getBodyChildSelector($, dom, parentNode, ignoreId, selectorV2));
    }

    const domId = ignoreId ? null : getElementAttribute(dom, 'id');
    if (domId && !hasSameIdSibling($, dom, parentNode, domId)) {
      return cacheAndReturn(`${recurse(parentNode)}>#${escapeCss(domId)}`);
    }

    if (FORM_ELEMENTS.has(domNodeName)) {
      const formResult = getFormElementSelector($, dom, parentNode, ignoreId, selectorV2, useCache);
      if (formResult) {
        return cacheAndReturn(formResult);
      }
    }

    return cacheAndReturn(getPositionalSelector($, dom, parentNode, ignoreId, selectorV2, useCache));
  } catch (error) {
    console.warn('Failed to generate selector:', error);
    selectorMetrics.errors += 1;
    return '';
  } finally {
    // updateMetrics weights the previous average by the current totalCalls,
    // so the counter is bumped only after this call's duration is folded in.
    // Incrementing up front (including by nested recursive calls) would skew
    // the running average.
    selectorMetrics.updateMetrics(now() - start);
    selectorMetrics.totalCalls += 1;
  }
}
327
+
328
/**
 * Static facade over getSelector.
 */
export class PtSelectorBuilder {
  /**
   * Builds one selector for `elem`.
   * @param {Function} $ - Query function for the parsed document.
   * @param {*} elem - Node to describe.
   * @param {Object} [options] - `ignoreId` and `selectorV2` are enabled only
   *   when strictly `true`; `useCache` is disabled only when strictly `false`.
   * @returns {string} Whatever getSelector returns.
   */
  static build($, elem, options = {}) {
    const { ignoreId, selectorV2, useCache } = options;
    return getSelector($, {
      dom: elem,
      ignoreId: ignoreId === true,
      selectorV2: selectorV2 === true,
      useCache: useCache !== false
    });
  }

  /**
   * Builds both selector flavours for `elem`, ids allowed in each.
   * @param {Function} $ - Query function for the parsed document.
   * @param {*} elem - Node to describe.
   * @returns {{ptSelector: string, ptSelectorV2: string}}
   */
  static buildVariants($, elem) {
    const ptSelector = this.build($, elem, { ignoreId: false, selectorV2: false });
    const ptSelectorV2 = this.build($, elem, { ignoreId: false, selectorV2: true });
    return { ptSelector, ptSelectorV2 };
  }
}
package/html-parser.js ADDED
@@ -0,0 +1,206 @@
1
+ import * as cheerio from 'cheerio';
2
+ import { normalizeUrl } from './utils/url-utils.js';
3
+ import { ContextExtractor } from './extractors/context-extractor.js';
4
+ import { PtSelectorBuilder } from './extractors/pt-selector-builder.js';
5
+
6
/**
 * Parse HTML content and extract important DOM elements with context
 */
export class HtmlParser {
  /**
   * @param {Object} [config]
   * @param {string[]} [config.importantTags] - Tag names to extract.
   * @param {number} [config.maxTextLength=200] - Cap on each element's text.
   * @param {number} [config.maxParentTextLength=100] - Stored and passed to ContextExtractor.
   * @param {number} [config.maxAncestorDepth=5] - Stored and passed to ContextExtractor.
   * @param {number} [config.maxNearbyTexts=5] - Stored and passed to ContextExtractor.
   * @param {string} [config.contextLevel] - 'lean' (case/whitespace-insensitive)
   *   selects lean mode; any other value selects 'full'.
   */
  constructor(config = {}) {
    const requestedLevel = String(config.contextLevel || '').trim().toLowerCase();
    const contextLevel = requestedLevel === 'lean' ? 'lean' : 'full';

    // NOTE(review): `||` makes every falsy value (including an explicit 0)
    // fall back to the default; kept as-is so existing callers see no change.
    this.config = {
      importantTags: config.importantTags || [
        'a',
        'button',
        'form',
        'input',
        'select',
        'textarea',
        'h1',
        'h2',
        'h3',
        'h4',
        'h5',
        'h6',
        'img',
        'nav'
      ],
      maxTextLength: config.maxTextLength || 200,
      maxParentTextLength: config.maxParentTextLength || 100,
      maxAncestorDepth: config.maxAncestorDepth || 5,
      maxNearbyTexts: config.maxNearbyTexts || 5,
      contextLevel
    };

    this.contextExtractor = new ContextExtractor(this.config);
  }

  /**
   * Parse HTML and extract elements
   *
   * Elements are collected selector by selector (the important tags first,
   * then `[onclick]` and `[role="button"]`); a node matched by several
   * selectors is reported once, under the first selector that matched it.
   *
   * @param {string} html - Raw HTML content
   * @param {string} baseUrl - Base URL for resolving relative links
   * @returns {Object} Parsed page data { title, elements, links, metrics }
   */
  parse(html, baseUrl) {
    const parseStartedAt = Date.now();
    const $ = cheerio.load(html);
    const elements = [];
    const selectors = [...this.config.importantTags, '[onclick]', '[role="button"]'];
    const seenElements = new Set();
    let contextBuildMs = 0;

    selectors.forEach((selector) => {
      $(selector).each((i, elem) => {
        if (seenElements.has(elem)) {
          return;
        }
        seenElements.add(elem);

        // Attribute selectors have no tag of their own, hence the fallback.
        const tag = elem?.name || selector;
        const ptSelectors = PtSelectorBuilder.buildVariants($, elem);

        const contextStartedAt = Date.now();
        const context = this.contextExtractor.extract($, elem).toJSON();
        contextBuildMs += Date.now() - contextStartedAt;

        elements.push({
          tag,
          text: $(elem).text().trim().substring(0, this.config.maxTextLength),
          ...this.getAttributes(elem, tag, baseUrl, $),
          ptSelector: ptSelectors.ptSelector || '',
          ptSelectorV2: ptSelectors.ptSelectorV2 || '',
          context
        });
      });
    });

    const links = elements
      .filter((e) => e.tag === 'a' && e.href)
      .map((e) => ({
        href: e.href,
        text: e.text
      }));
    const parseMs = Date.now() - parseStartedAt;
    const heapUsedMB = Number((process.memoryUsage().heapUsed / 1024 / 1024).toFixed(1));

    return {
      title: $('title').text().trim() || 'No Title',
      elements,
      links,
      metrics: {
        parseMs,
        contextBuildMs,
        elementsCount: elements.length,
        linksCount: links.length,
        heapUsedMB,
        contextLevel: this.config.contextLevel
      }
    };
  }

  /**
   * Collects the tag-specific attributes reported for one element.
   *
   * Plain attributes are copied as strings ('' when absent); `aria-label` is
   * exposed as `ariaLabel`. URL attributes (`href`, `action`, `formaction`)
   * are added only when present and are passed through normalizeUrl; an
   * image's `src` is always normalized, even when absent.
   *
   * @param {*} elem - The parsed node.
   * @param {string} tag - Tag name (or the matching selector) choosing the attribute set.
   * @param {string} baseUrl - Base URL handed to normalizeUrl.
   * @param {Function} $ - Query function; `$(elem).attr(name)` reads attributes.
   * @returns {Object} Attribute map for the element.
   */
  getAttributes(elem, tag, baseUrl, $) {
    const $elem = $(elem);
    const attrs = {};

    // Copies each named attribute onto attrs, using '' when it is absent or empty.
    const copy = (...names) => {
      for (const name of names) {
        const key = name === 'aria-label' ? 'ariaLabel' : name;
        attrs[key] = $elem.attr(name) || '';
      }
    };

    // Stores the normalized URL under `key` only when the attribute has a value.
    const copyUrl = (name, key) => {
      const raw = $elem.attr(name);
      if (raw) {
        attrs[key] = normalizeUrl(raw, baseUrl);
      }
    };

    switch (tag) {
      case 'a': {
        copyUrl('href', 'href');
        copy('title', 'role', 'aria-label', 'onclick');
        break;
      }

      case 'img': {
        attrs.src = normalizeUrl($elem.attr('src') || '', baseUrl);
        copy('alt', 'role', 'aria-label', 'onclick');
        break;
      }

      case 'button': {
        // NOTE(review): HTML treats a button without `type` as a submit
        // button; 'button' is the value this code has always reported — confirm intent.
        attrs.type = $elem.attr('type') || 'button';
        copy('id', 'name', 'value', 'role', 'aria-label', 'onclick');
        copyUrl('formaction', 'formAction');
        break;
      }

      case 'form': {
        copy('id', 'class', 'name', 'role', 'aria-label', 'onsubmit');
        attrs.method = ($elem.attr('method') || 'get').toLowerCase();
        copyUrl('action', 'action');
        break;
      }

      case 'input': {
        attrs.type = ($elem.attr('type') || 'text').toLowerCase();
        copy('id', 'class', 'name', 'value', 'placeholder', 'role', 'aria-label', 'onclick', 'onchange');
        copyUrl('formaction', 'formAction');
        break;
      }

      case 'select': {
        copy('id', 'class', 'name', 'role', 'aria-label', 'onchange');
        break;
      }

      case 'textarea': {
        copy('id', 'class', 'name', 'placeholder', 'role', 'aria-label', 'onchange');
        break;
      }

      default: {
        copy('id', 'class', 'role', 'aria-label', 'onclick');
      }
    }

    return attrs;
  }
}