page-analyzer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,192 @@
1
/**
 * `type` values for which `CsvExporter.isInteractiveInputType` answers true.
 * Built once at module load instead of on every call.
 */
const INTERACTIVE_INPUT_TYPES = new Set([
  'button',
  'submit',
  'reset',
  'image',
  'checkbox',
  'radio',
  'file',
  'range',
  'color',
  'date',
  'datetime-local',
  'month',
  'time',
  'week',
  'number'
]);

/**
 * CSV Exporter for crawled page elements
 * Exports element data in CSV format with enhanced fields for analysis
 */
export class CsvExporter {
  /**
   * @param {object} [config]
   * @param {number|string} [config.maxContextAncestorLevels] - How many leading
   *   entries of `context.ancestorTrail` are inspected when looking for context
   *   text. Anything that does not parse to a positive integer falls back to 3.
   */
  constructor(config = {}) {
    // `config?.` so an explicit `null` config behaves like an omitted one.
    const maxContextAncestorLevels = Number.parseInt(config?.maxContextAncestorLevels, 10);
    this.maxContextAncestorLevels = Number.isFinite(maxContextAncestorLevels) && maxContextAncestorLevels > 0
      ? maxContextAncestorLevels
      : 3;
  }

  /**
   * Return the longest ancestor text that contains the element's own text.
   *
   * Only the first `maxContextAncestorLevels` entries of
   * `element.context.ancestorTrail` are considered.
   *
   * @param {object} element - Element record with `text`/`content` and optional `context`.
   * @returns {string} The longest matching ancestor text, or '' when none matches.
   */
  extractLongestAncestorText(element) {
    const elementText = element?.text || element?.content || '';

    if (!elementText) {
      return '';
    }

    const context = element?.context && typeof element.context === 'object'
      ? element.context
      : {};
    const ancestorTrail = Array.isArray(context.ancestorTrail)
      ? context.ancestorTrail
      : [];
    const ancestors = ancestorTrail.slice(0, this.maxContextAncestorLevels);

    let longestText = '';
    for (const ancestor of ancestors) {
      // Skip missing entries and non-string text instead of throwing on them.
      const ancestorText = ancestor?.text;
      if (typeof ancestorText === 'string' &&
          ancestorText.includes(elementText) &&
          ancestorText.length > longestText.length) {
        longestText = ancestorText;
      }
    }

    return longestText;
  }

  /**
   * @param {object} element
   * @returns {string} The element's `tag`, trimmed and lower-cased ('' when absent).
   */
  normalizeTag(element) {
    return String(element?.tag || '').trim().toLowerCase();
  }

  /**
   * @param {object} element
   * @returns {string} The element's `type`, trimmed and lower-cased ('' when absent).
   */
  normalizeInputType(element) {
    return String(element?.type || '').trim().toLowerCase();
  }

  /**
   * @param {object} element
   * @returns {boolean} True when any of `onclick`, `onsubmit`, `onchange` is truthy.
   */
  hasDomEventHandler(element) {
    return Boolean(
      element?.onclick ||
      element?.onsubmit ||
      element?.onchange
    );
  }

  /**
   * @param {object} element
   * @returns {boolean} True when the element's `role` is "button" (case-insensitive).
   */
  hasButtonRole(element) {
    return String(element?.role || '').trim().toLowerCase() === 'button';
  }

  /**
   * @param {string} inputType - Input type; compared as given (callers in this
   *   class pass the result of `normalizeInputType`).
   * @returns {boolean} True when the type is in the interactive-type list.
   */
  isInteractiveInputType(inputType) {
    return INTERACTIVE_INPUT_TYPES.has(inputType);
  }

  /**
   * Decide whether an element record counts as interactive for export.
   *
   * - `a`: needs an href, a DOM event handler, or a button role.
   * - `button`, `form`, `select`, `textarea`: always interactive.
   * - `input`: never when hidden; otherwise needs a listed type, a DOM event
   *   handler, or a `formAction`.
   * - anything else: needs a DOM event handler or a button role.
   *
   * @param {object} element
   * @returns {boolean}
   */
  isInteractiveElement(element) {
    const tag = this.normalizeTag(element);
    const hasDomEvents = this.hasDomEventHandler(element);

    if (tag === 'a') {
      return Boolean(element?.href) || hasDomEvents || this.hasButtonRole(element);
    }

    if (tag === 'button' || tag === 'form' || tag === 'select' || tag === 'textarea') {
      return true;
    }

    if (tag === 'input') {
      const inputType = this.normalizeInputType(element);
      if (inputType === 'hidden') {
        return false;
      }
      return this.isInteractiveInputType(inputType) || hasDomEvents || Boolean(element?.formAction);
    }

    return hasDomEvents || this.hasButtonRole(element);
  }

  /**
   * Project an element record onto the CSV row shape.
   *
   * @param {object} element - Source element record.
   * @param {number} idx - Position of the element in the source list.
   * @returns {{idx: number, blockIdx: number, tag: string, imageAlt: string,
   *   text: string, context: string, href: string}}
   */
  transformElement(element, idx) {
    const text = element?.text || element?.content || '';

    // extractLongestAncestorText returns '' itself when there is no text.
    const context = text ? this.extractLongestAncestorText(element) : '';

    const imageAlt = element?.context?.semanticAnchors?.imageAlt || '';

    return {
      idx: idx,
      blockIdx: Number.isInteger(element?.blockIdx) ? element.blockIdx : -1,
      tag: element?.tag || '',
      imageAlt: imageAlt,
      text: text,
      context: context,
      href: element?.href || ''
    };
  }

  /**
   * @param {object} transformed - Row produced by `transformElement`.
   * @returns {boolean} True when imageAlt, text, context or href is non-blank.
   */
  hasMeaningfulContent(transformed) {
    return Boolean(
      String(transformed?.imageAlt || '').trim() ||
      String(transformed?.text || '').trim() ||
      String(transformed?.context || '').trim() ||
      String(transformed?.href || '').trim()
    );
  }

  /**
   * Render one value as a CSV field.
   *
   * Whitespace runs (including line breaks) collapse to a single space and the
   * result is trimmed, so a field never spans lines. Fields containing a comma
   * or a double quote are wrapped in quotes with inner quotes doubled.
   *
   * @param {*} field
   * @returns {string}
   */
  escapeCsvField(field) {
    // Numeric zero is real data; every other falsy value renders as empty.
    if (!field && field !== 0) return '';

    // `\s` already matches \r and \n, so one pass flattens line breaks too.
    const str = String(field).replace(/\s+/g, ' ').trim();

    if (str.includes(',') || str.includes('"')) {
      return `"${str.replace(/"/g, '""')}"`;
    }

    return str;
  }

  /**
   * Build the CSV document for a list of element records.
   *
   * Non-interactive elements and rows without meaningful content are skipped;
   * `idx` is the element's position in the input list, so it may have gaps.
   *
   * @param {Array} elements - Element records; a non-array is treated as empty.
   * @returns {string} Header line plus one line per exported element, joined by "\n".
   */
  elementsToCsv(elements) {
    const sourceElements = Array.isArray(elements) ? elements : [];
    const rows = [];

    rows.push('idx,blockIdx,tag,imageAlt,text,context,href');

    for (let i = 0; i < sourceElements.length; i++) {
      const element = sourceElements[i];
      if (!this.isInteractiveElement(element)) {
        continue;
      }
      const transformed = this.transformElement(element, i);
      if (!this.hasMeaningfulContent(transformed)) {
        continue;
      }

      const row = [
        transformed.idx,
        transformed.blockIdx,
        this.escapeCsvField(transformed.tag),
        this.escapeCsvField(transformed.imageAlt),
        this.escapeCsvField(transformed.text),
        this.escapeCsvField(transformed.context),
        this.escapeCsvField(transformed.href)
      ].join(',');

      rows.push(row);
    }

    return rows.join('\n');
  }

  /**
   * Build the CSV for a node and log a one-line summary.
   *
   * @param {*} nodeId - Identifier used only in the log line.
   * @param {Array} elements - Element records; a non-array is treated as empty.
   * @returns {string} CSV content as produced by `elementsToCsv`.
   */
  buildCsvContent(nodeId, elements) {
    const sourceElements = Array.isArray(elements) ? elements : [];
    const csvContent = this.elementsToCsv(sourceElements);
    // The count logged is the number of input elements, not exported rows.
    console.log(`[CSV] Built ${nodeId} (${sourceElements.length} elements)`);
    return csvContent;
  }
}
@@ -0,0 +1,281 @@
1
+ import { normalizeText } from '../utils/text-utils.js';
2
+ import { normalizeUrl } from '../utils/url-utils.js';
3
+
4
/**
 * Coerce a value with `Number()` and keep it only when the result is finite.
 *
 * @param {*} value - Value to coerce.
 * @param {number} [fallback=0] - Returned when coercion yields NaN or ±Infinity.
 * @returns {number}
 */
function toFiniteNumber(value, fallback = 0) {
  const numeric = Number(value);
  if (Number.isFinite(numeric)) {
    return numeric;
  }
  return fallback;
}
8
+
9
/**
 * Normalize an href-like value against a base URL.
 *
 * Blank input short-circuits to '' without calling `normalizeUrl`; otherwise
 * the trimmed value is passed to `normalizeUrl` with `skipFragments` and
 * `normalizeTrailingSlash` enabled and its result is returned as-is.
 *
 * @param {*} value - Raw href/action value.
 * @param {string} baseUrl - Base URL forwarded to `normalizeUrl`.
 * @returns {*} '' for blank input, else whatever `normalizeUrl` returns.
 */
function normalizeHrefLike(value, baseUrl) {
  const candidate = String(value || '').trim();
  return candidate
    ? normalizeUrl(candidate, baseUrl, { skipFragments: true, normalizeTrailingSlash: true })
    : '';
}
19
+
20
/**
 * Build a pipe-delimited identity string for a parsed element:
 * `tag|href|text|name|type|ariaLabel`.
 *
 * The href slot uses the first truthy of `href`, `action`, `formAction`;
 * text falls back to `content` and is capped at 240 characters; ariaLabel is
 * capped at 120.
 *
 * @param {object} element - Parsed element record.
 * @param {string} [baseUrl=''] - Base URL used when normalizing the href slot.
 * @returns {string}
 */
function buildElementSignature(element, baseUrl = '') {
  const lowered = (value) => String(value || '').trim().toLowerCase();

  const tag = lowered(element?.tag);
  const rawTarget = element?.href || element?.action || element?.formAction || '';
  const href = normalizeHrefLike(rawTarget, baseUrl);
  const rawText = element?.text || element?.content || '';
  const text = normalizeText(rawText).slice(0, 240);
  const name = lowered(element?.name);
  const type = lowered(element?.type);
  const ariaLabel = normalizeText(element?.ariaLabel || '').slice(0, 120);

  return `${tag}|${href}|${text}|${name}|${type}|${ariaLabel}`;
}
32
+
33
/**
 * Build a pipe-delimited identity string for a geometry record:
 * `tag|href|text|name|type|ariaLabel`.
 *
 * Uses the same slot layout and length caps (240 for text, 120 for ariaLabel)
 * as the element signature, except that text is read from `geometry.text`
 * only, with no `content` fallback.
 *
 * @param {object} geometry - Geometry record.
 * @param {string} [baseUrl=''] - Base URL used when normalizing the href slot.
 * @returns {string}
 */
function buildGeometrySignature(geometry, baseUrl = '') {
  const asKey = (raw) => String(raw || '').trim().toLowerCase();
  const target = geometry?.href || geometry?.action || geometry?.formAction || '';

  const tagPart = asKey(geometry?.tag);
  const hrefPart = normalizeHrefLike(target, baseUrl);
  const textPart = normalizeText(geometry?.text || '').slice(0, 240);
  const namePart = asKey(geometry?.name);
  const typePart = asKey(geometry?.type);
  const labelPart = normalizeText(geometry?.ariaLabel || '').slice(0, 120);

  return `${tagPart}|${hrefPart}|${textPart}|${namePart}|${typePart}|${labelPart}`;
}
45
+
46
/**
 * Area of the intersection of two axis-aligned rectangles.
 *
 * @param {{left: number, top: number, width: number, height: number}} rectA
 * @param {{left: number, top: number, width: number, height: number}} rectB
 * @returns {number} Intersection area, or 0 when the rectangles do not overlap
 *   (touching edges count as no overlap).
 */
function overlapArea(rectA, rectB) {
  const rightEdge = Math.min(rectA.left + rectA.width, rectB.left + rectB.width);
  const leftEdge = Math.max(rectA.left, rectB.left);
  const bottomEdge = Math.min(rectA.top + rectA.height, rectB.top + rectB.height);
  const topEdge = Math.max(rectA.top, rectB.top);

  const spanX = rightEdge - leftEdge;
  const spanY = bottomEdge - topEdge;

  return (spanX <= 0 || spanY <= 0) ? 0 : spanX * spanY;
}
58
+
59
/**
 * Pick the block index that best corresponds to a rectangle.
 *
 * Preference order:
 *   1. the block covering the largest fraction of the rectangle's area
 *      (first block wins on ties);
 *   2. otherwise the block whose centre is closest to the rectangle's centre.
 * Blocks without a positive width and height are ignored.
 *
 * @param {{top: number, left: number, width: number, height: number}} rect
 * @param {Array} [blocks=[]] - Block records with top/left/width/height/blockIdx.
 * @returns {number} The chosen block's `blockIdx` parsed as an integer, or -1
 *   when the rect is missing/empty, no block is usable, or the chosen block's
 *   `blockIdx` does not parse.
 */
function mapRectToBlock(rect, blocks = []) {
  if (!rect || !(rect.width > 0) || !(rect.height > 0)) {
    return -1;
  }
  if (!Array.isArray(blocks) || blocks.length === 0) {
    return -1;
  }

  const rectArea = rect.width * rect.height;
  const rectCenterX = rect.left + rect.width / 2;
  const rectCenterY = rect.top + rect.height / 2;

  let overlapWinner = null;
  let overlapWinnerRatio = 0;
  let distanceWinner = null;
  let distanceWinnerValue = Number.POSITIVE_INFINITY;

  for (const block of blocks) {
    const blockWidth = toFiniteNumber(block?.width, 0);
    const blockHeight = toFiniteNumber(block?.height, 0);
    if (blockWidth <= 0 || blockHeight <= 0) {
      continue;
    }

    const candidate = {
      top: toFiniteNumber(block?.top, 0),
      left: toFiniteNumber(block?.left, 0),
      width: blockWidth,
      height: blockHeight
    };

    const shared = overlapArea(rect, candidate);
    if (shared > 0 && shared / rectArea > overlapWinnerRatio) {
      overlapWinnerRatio = shared / rectArea;
      overlapWinner = block;
    }

    // Squared distance is enough for ranking; no square root needed.
    const deltaX = rectCenterX - (candidate.left + candidate.width / 2);
    const deltaY = rectCenterY - (candidate.top + candidate.height / 2);
    const squaredDistance = (deltaX ** 2) + (deltaY ** 2);
    if (squaredDistance < distanceWinnerValue) {
      distanceWinnerValue = squaredDistance;
      distanceWinner = block;
    }
  }

  const chosen = overlapWinner || distanceWinner;
  if (!chosen) {
    return -1;
  }

  const parsedIdx = Number.parseInt(String(chosen?.blockIdx), 10);
  return Number.isInteger(parsedIdx) ? parsedIdx : -1;
}
117
+
118
/**
 * Collect the distinct, non-blank selector strings attached to an element.
 *
 * `context.cssSelector` comes first, followed by the entries of
 * `context.selectorCandidates` (when it is an array). Each value is
 * stringified and trimmed; blanks and repeats are dropped, first occurrence
 * wins.
 *
 * @param {object} element - Element record with an optional `context` object.
 * @returns {string[]} Selectors in first-seen order.
 */
function normalizeSelectorCandidates(element) {
  const rawContext = element?.context;
  const context = rawContext && typeof rawContext === 'object' ? rawContext : {};
  const extras = Array.isArray(context.selectorCandidates) ? context.selectorCandidates : [];

  // A Set keeps insertion order and drops repeats for us.
  const unique = new Set();
  for (const raw of [context.cssSelector, ...extras]) {
    const selector = String(raw || '').trim();
    if (selector) {
      unique.add(selector);
    }
  }
  return [...unique];
}
137
+
138
/**
 * Resolve a block index for every element by first pairing the element with a
 * geometry record and then mapping that geometry's rectangle onto a block.
 *
 * Geometry matching is attempted in this order:
 *   1. selector match (`selectorNthOfType`, `selectorNthChild`, `selectorById`
 *      of the geometry against the element's selector candidates);
 *   2. exact signature match (tag/href/text/name/type/ariaLabel);
 *   3. fuzzy scoring over the geometries not yet claimed by an earlier element.
 * Elements are processed in order and each claims the geometry it resolves to,
 * so earlier elements take precedence.
 *
 * @param {Array} [elements=[]] - Parsed elements; a non-array is treated as empty.
 * @param {Array} [blocks=[]] - Block records; a non-array is treated as empty.
 * @param {Array} [geometries=[]] - Geometry records; a non-array is treated as empty.
 * @param {string} [pageUrl=''] - Base URL for href normalization.
 * @returns {number[]} One entry per element: the mapped block index, or -1.
 */
function collectBlockIndexByElements(elements = [], blocks = [], geometries = [], pageUrl = '') {
  const blockList = Array.isArray(blocks) ? blocks : [];
  const geometryList = Array.isArray(geometries) ? geometries : [];
  const bySelector = new Map();
  const bySignature = new Map();
  const usedGeometryIndices = new Set();
  // Normalized href/text per geometry for the fuzzy pass, filled on first use
  // so each geometry is normalized at most once, not once per unmatched element.
  const fuzzyFieldsByIndex = new Map();

  // Append `value` to the list stored under the trimmed key; blank keys are ignored.
  const pushMap = (map, key, value) => {
    const normalized = String(key || '').trim();
    if (!normalized) {
      return;
    }
    if (!map.has(normalized)) {
      map.set(normalized, []);
    }
    map.get(normalized).push(value);
  };

  for (let index = 0; index < geometryList.length; index += 1) {
    const geometry = geometryList[index];
    const selectors = [
      geometry?.selectorNthOfType,
      geometry?.selectorNthChild,
      geometry?.selectorById
    ];
    for (const selector of selectors) {
      pushMap(bySelector, selector, index);
    }
    pushMap(bySignature, buildGeometrySignature(geometry, pageUrl), index);
  }

  // First unclaimed index; when all are claimed the first candidate is reused.
  const pickUnused = (indices = []) => {
    for (const index of indices) {
      if (!usedGeometryIndices.has(index)) {
        return index;
      }
    }
    return indices.length > 0 ? indices[0] : -1;
  };

  // Lazily compute and cache the normalized href/text of one geometry.
  const getFuzzyFields = (index) => {
    let fields = fuzzyFieldsByIndex.get(index);
    if (!fields) {
      const geometry = geometryList[index];
      fields = {
        href: normalizeHrefLike(
          geometry?.href || geometry?.action || geometry?.formAction || '',
          pageUrl
        ),
        text: normalizeText(geometry?.text || '').slice(0, 200)
      };
      fuzzyFieldsByIndex.set(index, fields);
    }
    return fields;
  };

  const resolveGeometryIndex = (element) => {
    const selectors = normalizeSelectorCandidates(element);
    for (const selector of selectors) {
      const candidates = bySelector.get(selector);
      if (!Array.isArray(candidates) || candidates.length === 0) {
        continue;
      }
      const found = pickUnused(candidates);
      if (found >= 0) {
        return found;
      }
    }

    const signature = buildElementSignature(element, pageUrl);
    const signatureCandidates = bySignature.get(signature);
    if (Array.isArray(signatureCandidates) && signatureCandidates.length > 0) {
      return pickUnused(signatureCandidates);
    }

    const tag = String(element?.tag || '').trim().toLowerCase();
    const href = normalizeHrefLike(
      element?.href || element?.action || element?.formAction || '',
      pageUrl
    );
    const text = normalizeText(element?.text || element?.content || '').slice(0, 200);
    let best = -1;
    let bestScore = Number.NEGATIVE_INFINITY;

    for (let index = 0; index < geometryList.length; index += 1) {
      if (usedGeometryIndices.has(index)) {
        continue;
      }
      const geometry = geometryList[index];
      const gTag = String(geometry?.tag || '').trim().toLowerCase();
      // A tag mismatch disqualifies only when both sides actually have a tag.
      if (tag && gTag && tag !== gTag) {
        continue;
      }
      const { href: gHref, text: gText } = getFuzzyFields(index);

      let score = 0;
      if (href && gHref && href === gHref) {
        score += 10;
      }
      if (text && gText) {
        if (text === gText) {
          score += 8;
        } else if (text.includes(gText) || gText.includes(text)) {
          score += 4;
        }
      }
      if (!href && !text) {
        score += 1;
      }

      // NOTE(review): bestScore starts at -Infinity, so a tag-compatible
      // geometry with score 0 is still accepted; confirm a zero-evidence
      // match is intended.
      if (score > bestScore) {
        bestScore = score;
        best = index;
      }
    }

    return best;
  };

  const sourceElements = Array.isArray(elements) ? elements : [];
  return sourceElements.map((element) => {
    const geometryIndex = resolveGeometryIndex(element);
    if (geometryIndex >= 0) {
      // Claim the geometry so later elements prefer a different record.
      usedGeometryIndices.add(geometryIndex);
      const geometry = geometryList[geometryIndex];
      const rect = {
        top: toFiniteNumber(geometry?.top, 0),
        left: toFiniteNumber(geometry?.left, 0),
        width: toFiniteNumber(geometry?.width, 0),
        height: toFiniteNumber(geometry?.height, 0)
      };
      return mapRectToBlock(rect, blockList);
    }
    return -1;
  });
}
262
+
263
+ /**
264
+ * Assign block indices to elements based on geometry matching.
265
+ * Each element receives a `blockIdx` property (integer >= 0, or -1 if no block matched).
266
+ *
267
+ * @param {Array} elements - Parsed interactive elements
268
+ * @param {Array} blocks - Visual blocks from page extraction
269
+ * @param {Array} elementGeometries - Element geometry records with selectors and coordinates
270
+ * @param {string} pageUrl - Base URL for normalizing element signatures
271
+ * @returns {Array} The same elements array, mutated with blockIdx
272
+ */
273
+ export function assignBlocksToElements(elements = [], blocks = [], elementGeometries = [], pageUrl = '') {
274
+ const source = Array.isArray(elements) ? elements : [];
275
+ const blockIndices = collectBlockIndexByElements(source, blocks, elementGeometries, pageUrl);
276
+ for (let index = 0; index < source.length; index += 1) {
277
+ const value = Number.parseInt(String(blockIndices[index]), 10);
278
+ source[index].blockIdx = Number.isInteger(value) ? value : -1;
279
+ }
280
+ return source;
281
+ }