page-analyzer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,275 @@
1
+ import { CssSelectorBuilder } from './css-selector-builder.js';
2
+ import { Context } from '../models/context.js';
3
+ import { normalizeText } from '../utils/text-utils.js';
4
+
5
/**
 * Extracts enhanced context information for DOM elements.
 *
 * Every method takes `$`, which is called as a function on a node and is
 * expected to return a Cheerio/jQuery-style wrapper (`length`, index access,
 * `parent()`, `prev()`, `next()`, `find()`, `siblings()`, `prevAll()`,
 * `attr()`), and `elem`, a parsed DOM node exposing `name`, `attribs`,
 * `children`, `type` and `data`.
 */
export class ContextExtractor {
  /**
   * @param {object} [config] - Optional tuning knobs; `null` is treated like `{}`.
   * @param {number} [config.maxParentTextLength=100] - Character budget for parent text.
   * @param {number} [config.maxAncestorDepth=5] - How many ancestors to record.
   * @param {number} [config.maxAncestorTextLength=100] - Character budget per ancestor text.
   * @param {number} [config.maxNearbyTexts=5] - Maximum number of nearby text snippets.
   * @param {string} [config.contextLevel] - `'lean'` (case/whitespace-insensitive)
   *   selects lean mode; anything else selects `'full'`.
   */
  constructor(config = {}) {
    // The parameter default only covers `undefined`; guard explicit `null` too.
    const options = config ?? {};
    const requestedLevel = String(options.contextLevel || '').trim().toLowerCase();

    this.config = {
      maxParentTextLength: options.maxParentTextLength || 100,
      maxAncestorDepth: options.maxAncestorDepth || 5,
      maxAncestorTextLength: options.maxAncestorTextLength || 100,
      maxNearbyTexts: options.maxNearbyTexts || 5,
      contextLevel: requestedLevel === 'lean' ? 'lean' : 'full'
    };
  }

  /**
   * Builds a populated Context for `elem`.
   *
   * In lean mode the ancestor trail omits text and is capped at two levels.
   *
   * @param {Function} $ - DOM query function.
   * @param {object} elem - Element to describe.
   * @returns {Context} The filled-in context object.
   */
  extract($, elem) {
    const context = new Context();
    const selectors = CssSelectorBuilder.buildCandidates($, elem);
    const isLean = this.config.contextLevel === 'lean';

    context.parentTag = this.getParentTag($, elem);
    context.parentText = this.getParentText($, elem);
    context.ancestorTrail = this.getAncestorTrail($, elem, {
      includeText: this.config.contextLevel === 'full',
      maxDepth: isLean
        ? Math.min(2, this.config.maxAncestorDepth)
        : this.config.maxAncestorDepth
    });
    context.cssSelector = selectors.primary;
    context.selectorCandidates = selectors.candidates;
    context.nearbyText = this.getNearbyText($, elem);
    context.semanticAnchors = this.getSemanticAnchors($, elem);
    context.containerMeta = this.getContainerMeta($, elem);

    return context;
  }

  /**
   * Collects normalized text beneath `node`, breadth-first, stopping once
   * `maxChars` characters have been gathered.
   *
   * Direct text children of a node are consumed before any of its element
   * children are visited. `script`, `style` and `noscript` elements are not
   * descended into.
   *
   * @param {object} node - Root node to read from.
   * @param {number|string} [maxChars=100] - Character budget; non-numeric or
   *   non-positive values yield an empty string.
   * @param {object} [options]
   * @param {boolean} [options.includeDescendants=true] - When `false`, only the
   *   direct text children of `node` are read.
   * @returns {string} Space-joined text, at most `maxChars` characters long.
   */
  collectTextWithBudget(node, maxChars = 100, options = {}) {
    const limit = Math.max(0, Number.parseInt(maxChars, 10) || 0);
    if (!node || limit === 0) {
      return '';
    }

    const includeDescendants = options.includeDescendants !== false;
    const skippedTags = ['script', 'style', 'noscript'];
    const pending = [node];
    const parts = [];
    let remaining = limit;

    // A cursor is used instead of Array.shift() so each dequeue stays O(1).
    for (let cursor = 0; cursor < pending.length && remaining > 0; cursor += 1) {
      const current = pending[cursor];
      if (!current) {
        continue;
      }

      const children = Array.isArray(current.children) ? current.children : [];

      for (const child of children) {
        if (remaining <= 0) {
          break;
        }
        if (child.type !== 'text') {
          continue;
        }

        const normalized = normalizeText(child.data);
        if (!normalized) {
          continue;
        }

        // Take the whole snippet if it fits, otherwise only what is left of the budget.
        const taken = normalized.length <= remaining
          ? normalized
          : normalized.slice(0, remaining);
        parts.push(taken);
        remaining -= taken.length;
      }

      if (!includeDescendants) {
        continue;
      }

      for (const child of children) {
        if (child.type !== 'tag') {
          continue;
        }
        if (skippedTags.includes(String(child.name || '').toLowerCase())) {
          continue;
        }
        pending.push(child);
      }
    }

    // Joining adds separators that were not counted against the budget, so trim again.
    return normalizeText(parts.join(' ')).slice(0, limit);
  }

  /**
   * @param {Function} $ - DOM query function.
   * @param {object} elem - Element whose parent is wanted.
   * @returns {object|null} The raw parent node, or `null` when there is none.
   */
  getParentNode($, elem) {
    const parent = $(elem).parent();
    return parent.length > 0 ? (parent[0] || null) : null;
  }

  /**
   * @param {Function} $ - DOM query function.
   * @param {object} elem - Element whose parent is inspected.
   * @returns {string} Tag name of the parent, or `''` when there is no parent.
   */
  getParentTag($, elem) {
    return this.getParentNode($, elem)?.name || '';
  }

  /**
   * @param {Function} $ - DOM query function.
   * @param {object} elem - Element whose parent is inspected.
   * @returns {string} Text collected under the parent, limited to
   *   `config.maxParentTextLength` characters; `''` when there is no parent.
   */
  getParentText($, elem) {
    const parentNode = this.getParentNode($, elem);
    if (!parentNode) {
      return '';
    }
    return this.collectTextWithBudget(parentNode, this.config.maxParentTextLength);
  }

  /**
   * Picks the attributes recorded for an ancestor: non-empty `class`, `id`
   * and `role`, plus every `data-*` attribute.
   *
   * @param {object} node - Node whose `attribs` map is read.
   * @returns {object} A new object holding only the selected attributes.
   */
  _collectAttributes(node) {
    const source = node?.attribs || {};
    const attributes = {};

    for (const key of ['class', 'id', 'role']) {
      if (source[key]) {
        attributes[key] = source[key];
      }
    }

    for (const key of Object.keys(source)) {
      if (key.startsWith('data-')) {
        attributes[key] = source[key];
      }
    }

    return attributes;
  }

  /**
   * Walks up from `elem` and describes each ancestor, nearest first.
   * The walk stops at (and excludes) the `html` element.
   *
   * @param {Function} $ - DOM query function.
   * @param {object} elem - Starting element (not included in the trail).
   * @param {object} [options]
   * @param {boolean} [options.includeText=true] - Attach collected text to each entry.
   * @param {number|string} [options.maxDepth] - Maximum number of ancestors.
   *   `0` (or a negative value) yields an empty trail; a missing or
   *   non-numeric value falls back to `config.maxAncestorDepth`.
   * @returns {Array<{tag: string, attributes: object, depth: number, text?: string}>}
   */
  getAncestorTrail($, elem, options = {}) {
    const includeText = options.includeText !== false;
    // Fall back only when no usable number was supplied, so an explicit 0 is honoured.
    const requestedDepth = Number.parseInt(options.maxDepth, 10);
    const maxDepth = Math.max(
      0,
      Number.isNaN(requestedDepth) ? this.config.maxAncestorDepth : requestedDepth
    );
    const ancestors = [];
    let parent = $(elem).parent();

    for (
      let depth = 1;
      parent.length && parent[0].name !== 'html' && depth <= maxDepth;
      depth += 1
    ) {
      const node = parent[0];
      const ancestor = {
        tag: node?.name || '',
        attributes: this._collectAttributes(node),
        depth
      };

      if (includeText) {
        // The constructor always stores a truthy length, so the 80 fallback
        // only applies if `config` was altered after construction.
        const text = this.collectTextWithBudget(node, this.config.maxAncestorTextLength || 80);
        if (text) {
          ancestor.text = text;
        }
      }

      ancestors.push(ancestor);
      parent = parent.parent();
    }

    return ancestors;
  }

  /**
   * Gathers short text snippets around `elem`: first the direct text nodes of
   * its parent, then the text of its previous and next siblings.
   *
   * Snippets of five characters or fewer (after normalization) are dropped
   * and each snippet is truncated to 100 characters.
   *
   * @param {Function} $ - DOM query function.
   * @param {object} elem - Element whose surroundings are read.
   * @returns {string[]} At most `config.maxNearbyTexts` snippets.
   */
  getNearbyText($, elem) {
    const texts = [];
    const parentNode = this.getParentNode($, elem);
    const maxNearbyTexts = this.config.maxNearbyTexts;
    const appendText = (value) => {
      const normalized = normalizeText(value);
      if (!normalized || normalized.length <= 5) {
        return;
      }
      texts.push(normalized.slice(0, 100));
    };

    if (parentNode && Array.isArray(parentNode.children)) {
      for (const child of parentNode.children) {
        if (child.type !== 'text') {
          continue;
        }
        appendText(child.data);
        if (texts.length >= maxNearbyTexts) {
          return texts.slice(0, maxNearbyTexts);
        }
      }
    }

    for (const sibling of [$(elem).prev(), $(elem).next()]) {
      if (sibling.length) {
        appendText(this.collectTextWithBudget(sibling[0], 100));
      }
    }

    return texts.slice(0, maxNearbyTexts);
  }

  /**
   * Finds landmarks that help identify `elem`: the closest preceding heading
   * and the alt text of a related image.
   *
   * Heading: starting at `elem` and moving up until `body`, the first level
   * that has an `h1`–`h6` among its preceding siblings supplies the heading.
   *
   * Image: the first scope that contains any `img` is used — descendants of
   * `elem`, then its siblings, then its parent's descendants. If that image
   * has no `alt`, `imageAlt` stays `null`; later scopes are not consulted.
   *
   * @param {Function} $ - DOM query function.
   * @param {object} elem - Element to anchor.
   * @returns {{heading: (string|null), imageAlt: (string|null)}}
   */
  getSemanticAnchors($, elem) {
    const anchors = {
      heading: null,
      imageAlt: null
    };

    for (
      let current = $(elem);
      current.length && current[0].name !== 'body';
      current = current.parent()
    ) {
      const heading = current.prevAll('h1, h2, h3, h4, h5, h6').first();
      if (heading.length) {
        anchors.heading = this.collectTextWithBudget(heading[0], 120);
        break;
      }
    }

    // Evaluated lazily and in order; the first scope holding an image wins.
    const imageScopes = [
      () => $(elem).find('img').first(),
      () => $(elem).siblings('img').first(),
      () => $(elem).parent().find('img').first()
    ];

    for (const locate of imageScopes) {
      const image = locate();
      if (!image.length) {
        continue;
      }
      const alt = image.attr('alt');
      if (alt) {
        anchors.imageAlt = alt;
      }
      break;
    }

    return anchors;
  }

  /**
   * Describes the parent (container) of `elem`.
   *
   * @param {Function} $ - DOM query function.
   * @param {object} elem - Element whose container is described.
   * @returns {{cssSelector: string, attributes: object}} Selector built by
   *   `CssSelectorBuilder.build` plus the parent's `class`, `id` and `role`
   *   (empty strings when absent). Without a parent, the selector is `''`
   *   and `attributes` is an empty object.
   */
  getContainerMeta($, elem) {
    const parentNode = this.getParentNode($, elem);
    if (!parentNode) {
      return {
        cssSelector: '',
        attributes: {}
      };
    }

    const attribs = parentNode.attribs;
    return {
      cssSelector: CssSelectorBuilder.build($, parentNode),
      attributes: {
        class: attribs?.class || '',
        id: attribs?.id || '',
        role: attribs?.role || ''
      }
    };
  }
}
@@ -0,0 +1,202 @@
1
+ import { escapeAttributeValue, isLikelyStableClassToken } from '../utils/selector-utils.js';
2
+
3
/**
 * Builds CSS selector paths for DOM elements.
 *
 * All methods are static. `$` is called as a function with either a selector
 * string or a node and is expected to return a Cheerio/jQuery-style wrapper
 * (`length`, index access, `parent()`, `children()`, `index()`, `attr()`).
 */
export class CssSelectorBuilder {
  /**
   * Checks that `$` accepts `selector` without throwing.
   *
   * @param {Function} $ - DOM query function.
   * @param {string} selector - Selector to try.
   * @returns {boolean} `false` for an empty selector or one that makes `$` throw.
   */
  static validateSelector($, selector) {
    if (!selector) {
      return false;
    }

    try {
      $(selector);
      return true;
    } catch {
      // Best-effort probe: a selector the engine rejects is simply unusable.
      return false;
    }
  }

  /**
   * @param {Function} $ - DOM query function.
   * @param {string} selector - Selector to evaluate.
   * @returns {boolean} `true` only when the selector is accepted by `$` and
   *   matches exactly one element.
   */
  static isUniqueSelector($, selector) {
    if (!selector) {
      return false;
    }

    // One evaluation both validates the selector and counts its matches.
    try {
      return $(selector).length === 1;
    } catch {
      return false;
    }
  }

  /**
   * Appends `selector` to `candidates` (mutating the array) unless it is
   * blank after trimming, already present, or rejected by `validateSelector`.
   *
   * @param {string[]} candidates - Accumulator, modified in place.
   * @param {string} selector - Selector to add.
   * @param {Function} $ - DOM query function used for validation.
   * @returns {void}
   */
  static addCandidate(candidates, selector, $) {
    const normalized = (selector || '').trim();
    if (!normalized || candidates.includes(normalized)) {
      return;
    }

    if (this.validateSelector($, normalized)) {
      candidates.push(normalized);
    }
  }

  /**
   * Shared ancestor walk behind `build` and `buildNthOfType`.
   *
   * Climbs from `elem` towards the document root, asking `describeStep` for
   * one path segment per level. The walk stops at `body`, at any direct child
   * of `html` (for example `head`), at `html` itself, or when no parent
   * remains. The stopping element anchors the path, so elements under `head`
   * are rooted at `head` rather than `body`.
   *
   * @param {Function} $ - DOM query function.
   * @param {object} elem - Element to locate.
   * @param {Function} describeStep - `(parent, current, tagName) => string`
   *   returning the segment for `current` within `parent`.
   * @returns {string} ` > `-joined path; the bare anchor name when `elem` is
   *   itself `body` or a child of `html`; `''` when `elem` is missing, has no
   *   tag name, is `html`, or has no parent.
   */
  static _buildPath($, elem, describeStep) {
    if (!elem || !elem.name) {
      return '';
    }

    const segments = [];
    let anchor = '';
    let current = $(elem);

    while (current.length) {
      const tagName = current[0].name;
      if (tagName === 'html') {
        break;
      }
      if (tagName === 'body') {
        anchor = 'body';
        break;
      }

      const parent = current.parent();
      if (!parent.length) {
        break;
      }
      if (parent[0].name === 'html') {
        // A non-body child of <html>: root the path here instead of at body.
        anchor = tagName;
        break;
      }

      segments.unshift(describeStep(parent, current, tagName));
      current = parent;
    }

    if (segments.length === 0) {
      return anchor;
    }

    // When the walk ran out of parents without reaching <html>, the path is
    // still prefixed with `body`.
    return [anchor || 'body', ...segments].join(' > ');
  }

  /**
   * Builds a positional selector using `:nth-child`, e.g.
   * `body > div:nth-child(1) > span:nth-child(2)`.
   *
   * @param {Function} $ - DOM query function.
   * @param {object} elem - Element to locate.
   * @returns {string} The selector, or `''` when none can be built.
   */
  static build($, elem) {
    return this._buildPath($, elem, (parent, current, tagName) => {
      const position = parent.children().index(current) + 1;
      return `${tagName}:nth-child(${position})`;
    });
  }

  /**
   * Builds a positional selector using `:nth-of-type`, e.g.
   * `body > div:nth-of-type(1) > span:nth-of-type(1)`.
   *
   * @param {Function} $ - DOM query function.
   * @param {object} elem - Element to locate.
   * @returns {string} The selector, or `''` when none can be built.
   */
  static buildNthOfType($, elem) {
    return this._buildPath($, elem, (parent, current, tagName) => {
      const position = parent.children(tagName).index(current[0]) + 1;
      return `${tagName}:nth-of-type(${position})`;
    });
  }

  /**
   * Produces an ordered list of selector candidates for `elem` and picks a
   * primary one.
   *
   * Attribute-based selectors are generated first (id, a fixed set of test /
   * tracking `data-*` attributes, `href` on anchors, `name`, `aria-label`,
   * `title`, `alt`, then up to three class tokens accepted by
   * `isLikelyStableClassToken`). Those that match exactly one element are
   * listed before the rest, followed by the `:nth-of-type` and `:nth-child`
   * paths.
   *
   * @param {Function} $ - DOM query function.
   * @param {object} elem - Element to describe.
   * @returns {{primary: string, candidates: string[]}} `primary` is the first
   *   unique attribute selector, else the `:nth-of-type` path, else the
   *   `:nth-child` path, else the first candidate, else `''`.
   */
  static buildCandidates($, elem) {
    if (!elem || !elem.name) {
      return { primary: '', candidates: [] };
    }

    const $elem = $(elem);
    const tagName = elem.name;
    const candidates = [];
    const stableFirst = [];
    const escaped = (value) => escapeAttributeValue(value);

    const id = $elem.attr('id');
    if (id) {
      stableFirst.push(`[id="${escaped(id)}"]`);
    }

    const stableDataAttrs = [
      'data-testid',
      'data-test',
      'data-qa',
      'data-track',
      'data-tracking-id',
      'data-tid'
    ];
    for (const attrName of stableDataAttrs) {
      const attrValue = $elem.attr(attrName);
      if (attrValue) {
        stableFirst.push(`[${attrName}="${escaped(attrValue)}"]`);
      }
    }

    const href = $elem.attr('href');
    if (tagName === 'a' && href) {
      stableFirst.push(`a[href="${escaped(href)}"]`);
    }

    // These attributes are qualified with the tag name, in this priority order.
    for (const attrName of ['name', 'aria-label', 'title', 'alt']) {
      const attrValue = $elem.attr(attrName);
      if (attrValue) {
        stableFirst.push(`${tagName}[${attrName}="${escaped(attrValue)}"]`);
      }
    }

    const className = $elem.attr('class');
    if (className) {
      const stableClassTokens = className
        .split(/\s+/)
        .map((token) => token.trim())
        .filter((token) => isLikelyStableClassToken(token))
        .slice(0, 3);

      if (stableClassTokens.length > 0) {
        const classParts = stableClassTokens
          .map((token) => `[class~="${escaped(token)}"]`)
          .join('');
        stableFirst.push(`${tagName}${classParts}`);
      }
    }

    // Unique selectors go first; addCandidate de-duplicates the second pass.
    const uniqueStableSelectors = stableFirst.filter((selector) => this.isUniqueSelector($, selector));
    for (const selector of [...uniqueStableSelectors, ...stableFirst]) {
      this.addCandidate(candidates, selector, $);
    }

    const nthOfTypePath = this.buildNthOfType($, elem);
    const nthChildPath = this.build($, elem);

    this.addCandidate(candidates, nthOfTypePath, $);
    this.addCandidate(candidates, nthChildPath, $);

    return {
      primary: uniqueStableSelectors[0] || nthOfTypePath || nthChildPath || candidates[0] || '',
      candidates
    };
  }
}