page-analyzer 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/csv-exporter.js +192 -0
- package/extractors/block-assigner.js +281 -0
- package/extractors/context-extractor.js +275 -0
- package/extractors/css-selector-builder.js +202 -0
- package/extractors/pt-selector-builder.js +344 -0
- package/html-parser.js +206 -0
- package/index.js +303 -0
- package/llm/analyzers/event-analyzer/event-analyzer-blocks.js +553 -0
- package/llm/analyzers/event-analyzer/event-analyzer-constants.js +22 -0
- package/llm/analyzers/event-analyzer/event-analyzer-events.js +97 -0
- package/llm/analyzers/event-analyzer/event-analyzer-input.js +168 -0
- package/llm/analyzers/event-analyzer/event-analyzer-metadata.js +15 -0
- package/llm/analyzers/event-analyzer/event-analyzer-prompt.js +71 -0
- package/llm/analyzers/event-analyzer/event-analyzer-response.js +290 -0
- package/llm/analyzers/event-analyzer/event-analyzer-utils.js +96 -0
- package/llm/analyzers/event-analyzer/event-analyzer.js +546 -0
- package/llm/analyzers/prompts/event-analysis.txt +52 -0
- package/llm/analyzers/prompts/special-block-confirmation.txt +127 -0
- package/llm/providers/base-provider.js +64 -0
- package/llm/providers/openai-provider.js +168 -0
- package/llm/utils/event-csv.js +276 -0
- package/models/context.js +44 -0
- package/package.json +16 -0
- package/page-extractor.js +215 -0
- package/utils/selector-utils.js +31 -0
- package/utils/text-utils.js +11 -0
- package/utils/url-utils.js +43 -0
- package/vendor/extract-blocks.js +903 -0
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Playwright-based page extractor.
|
|
3
|
+
* Launches headless Chromium, navigates to URL, scrolls, extracts blocks + element geometries + HTML.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
// In-browser block extraction function (serialized into page.evaluate)
|
|
7
|
+
// Imported from the project's extract-blocks script
|
|
8
|
+
import {
|
|
9
|
+
extractBlocksInBrowser,
|
|
10
|
+
scrollToBottom,
|
|
11
|
+
waitForStableHeight
|
|
12
|
+
} from './vendor/extract-blocks.js';
|
|
13
|
+
|
|
14
|
+
/**
 * Playwright-backed extractor: loads a URL in headless Chromium and returns the
 * serialized HTML, layout blocks, geometries of interactive elements and the
 * document size.
 */
export class PageExtractor {
  /**
   * Build an extractor. Every option is validated and clamped to a lower (and,
   * for `minBlockWidthRatio`, upper) bound; invalid or missing values fall back
   * to the defaults shown below.
   *
   * @param {Object} [config]
   * @param {number} [config.timeoutMs=30000] - Integer, at least 1000.
   * @param {number} [config.viewportWidth=1440] - Integer, at least 320.
   * @param {number} [config.viewportHeight=900] - Integer, at least 400.
   * @param {number} [config.minBlockHeight=40] - Integer, at least 1.
   * @param {number} [config.minBlockWidthRatio=0.25] - Finite, clamped to [0.05, 1].
   * @param {number} [config.blockMaxHeightRatio=1.5] - Finite, at least 0.5.
   * @param {number} [config.blockMaxDepth=15] - Integer, at least 1.
   * @param {number} [config.textPreviewMaxChars=1200] - Integer, at least 120.
   */
  constructor(config = {}) {
    this.config = {
      timeoutMs: Number.isInteger(config.timeoutMs) ? Math.max(1000, config.timeoutMs) : 30000,
      viewportWidth: Number.isInteger(config.viewportWidth) ? Math.max(320, config.viewportWidth) : 1440,
      viewportHeight: Number.isInteger(config.viewportHeight) ? Math.max(400, config.viewportHeight) : 900,
      minBlockHeight: Number.isInteger(config.minBlockHeight) ? Math.max(1, config.minBlockHeight) : 40,
      minBlockWidthRatio: Number.isFinite(config.minBlockWidthRatio)
        ? Math.min(1, Math.max(0.05, config.minBlockWidthRatio))
        : 0.25,
      blockMaxHeightRatio: Number.isFinite(config.blockMaxHeightRatio)
        ? Math.max(0.5, config.blockMaxHeightRatio)
        : 1.5,
      blockMaxDepth: Number.isInteger(config.blockMaxDepth) ? Math.max(1, config.blockMaxDepth) : 15,
      textPreviewMaxChars: Number.isInteger(config.textPreviewMaxChars)
        ? Math.max(120, config.textPreviewMaxChars)
        : 1200
    };
    this.playwrightModule = null;
  }

  /**
   * Lazily import the `playwright` package and cache it on the instance.
   *
   * @returns {Promise<Object>} The module's default export, or the namespace
   *   object when there is no default export.
   */
  async getPlaywright() {
    if (this.playwrightModule) {
      return this.playwrightModule;
    }
    const mod = await import('playwright');
    this.playwrightModule = mod.default || mod;
    return this.playwrightModule;
  }

  /**
   * Force content that is hidden via `opacity: 0` or `display: none` to become
   * visible, provided it carries meaningful content (at least 20 characters of
   * text, or media for the opacity case).
   *
   * The callback is executed through `page.evaluate`, so it must stay
   * self-contained and may only reference in-page globals.
   *
   * @param {Object} page - Object exposing `evaluate(fn)` (a Playwright page).
   * @returns {Promise<{opacityCount: number, displayCount: number}>} How many
   *   elements were revealed by each mechanism.
   */
  async revealHiddenContent(page) {
    return page.evaluate(() => {
      const CONTENT_THRESHOLD = 20;
      // Elements that are `display: none` by design. Forcing them to render
      // would turn script/style source or metadata into visible page text.
      const NON_CONTENT_TAGS = new Set([
        'SCRIPT', 'STYLE', 'NOSCRIPT', 'TEMPLATE', 'HEAD', 'TITLE', 'META', 'LINK', 'BASE'
      ]);
      let opacityCount = 0;
      let displayCount = 0;

      for (const el of document.querySelectorAll('*')) {
        const style = getComputedStyle(el);

        // Transparent but laid-out element: reveal only if it holds content.
        if (parseFloat(style.opacity) === 0 && el.getBoundingClientRect().height > 0) {
          const text = (el.innerText || '').trim();
          if (text.length >= CONTENT_THRESHOLD || el.querySelectorAll('img, video, picture').length > 0) {
            el.style.setProperty('opacity', '1', 'important');
            opacityCount += 1;
          }
          continue;
        }

        if (style.display !== 'none') {
          continue;
        }
        if (NON_CONTENT_TAGS.has(String(el.tagName).toUpperCase())) {
          continue;
        }

        // Only the outermost hidden element is toggled; descendants of a
        // still-hidden parent are left untouched.
        const parent = el.parentElement;
        if (parent && getComputedStyle(parent).display === 'none') {
          continue;
        }

        // innerText is only meaningful for rendered elements, so the element is
        // shown first and then reverted if it turns out to hold too little text.
        const originalDisplay = el.style.display;
        el.style.setProperty('display', 'block', 'important');
        const text = (el.innerText || '').trim();

        if (text.length >= CONTENT_THRESHOLD) {
          displayCount += 1;
        } else if (originalDisplay) {
          el.style.display = originalDisplay;
        } else {
          el.style.removeProperty('display');
        }
      }

      return { opacityCount, displayCount };
    });
  }

  /**
   * Collect one record per visible interactive element (links, buttons, forms,
   * form controls, `[onclick]`, `[role="button"]`): normalized text and URL
   * attributes, document-relative geometry, and three candidate selectors.
   *
   * The callback is executed through `page.evaluate`, so all helpers are
   * defined inline and may only reference in-page globals.
   *
   * @param {Object} page - Object exposing `evaluate(fn)` (a Playwright page).
   * @returns {Promise<Array<Object>>} Geometry records in document order.
   */
  async collectElementGeometries(page) {
    return page.evaluate(() => {
      const INTERACTIVE_SELECTOR = 'a, button, form, input, select, textarea, [onclick], [role="button"]';
      const records = [];

      // Collapse whitespace runs, trim, and cap the length.
      const normalizeTextInPage = (value, maxLength = 240) => String(value || '')
        .replace(/\s+/g, ' ')
        .trim()
        .slice(0, maxLength);

      // Resolve against the page URL, drop the fragment, and drop a trailing
      // slash only when it terminates a non-root path (never a query value).
      const normalizeHref = (value) => {
        const raw = String(value || '').trim();
        if (!raw) return '';
        try {
          const resolved = new URL(raw, location.href);
          resolved.hash = '';
          const href = resolved.href;
          const slashEndsPath = resolved.search === ''
            && resolved.pathname.length > 1
            && resolved.pathname.endsWith('/');
          return href.endsWith('/') && slashEndsPath
            ? href.slice(0, -1)
            : href;
        } catch {
          // Unparseable values are reported as-is rather than dropped.
          return raw;
        }
      };

      // Escape a value for a double-quoted CSS attribute selector string:
      // backslashes first so the escapes added for quotes are not doubled.
      const escapeAttr = (value) => String(value)
        .replace(/\\/g, '\\\\')
        .replace(/"/g, '\\"');

      // Build a `body > tag:nth-…(i) > …` path from the element up to <body>.
      const buildPath = (el, useNthOfType) => {
        if (!el || !el.tagName) return '';
        const parts = [];
        let current = el;
        while (current && current.tagName && current.tagName.toLowerCase() !== 'body') {
          const parent = current.parentElement;
          if (!parent || parent.tagName.toLowerCase() === 'html') break;
          const tag = current.tagName.toLowerCase();
          const siblings = useNthOfType
            ? Array.from(parent.children).filter((child) => child.tagName.toLowerCase() === tag)
            : Array.from(parent.children);
          const index = siblings.indexOf(current) + 1;
          const suffix = useNthOfType
            ? `:nth-of-type(${index})`
            : `:nth-child(${index})`;
          parts.unshift(`${tag}${suffix}`);
          current = parent;
        }
        if (parts.length === 0) return '';
        return `body > ${parts.join(' > ')}`;
      };

      for (const element of document.querySelectorAll(INTERACTIVE_SELECTOR)) {
        const rect = element.getBoundingClientRect();
        // Elements without a layout box cannot be matched to a block.
        if (rect.width <= 0 || rect.height <= 0) continue;

        records.push({
          tag: element.tagName.toLowerCase(),
          text: normalizeTextInPage(element.innerText || element.textContent || ''),
          href: normalizeHref(element.getAttribute('href') || ''),
          action: normalizeHref(element.getAttribute('action') || ''),
          formAction: normalizeHref(element.getAttribute('formaction') || ''),
          name: normalizeTextInPage(element.getAttribute('name') || '', 120),
          type: normalizeTextInPage(element.getAttribute('type') || '', 40),
          ariaLabel: normalizeTextInPage(element.getAttribute('aria-label') || '', 120),
          // Convert viewport-relative coordinates to document coordinates.
          top: rect.top + window.scrollY,
          left: rect.left + window.scrollX,
          width: rect.width,
          height: rect.height,
          selectorNthOfType: buildPath(element, true),
          selectorNthChild: buildPath(element, false),
          selectorById: element.id ? `[id="${escapeAttr(element.id)}"]` : ''
        });
      }

      return records;
    });
  }

  /**
   * Extract page data: html, blocks, elementGeometries, pageSize.
   *
   * Launches headless Chromium, navigates to the URL, scrolls to the bottom,
   * waits for the page height to settle, reveals hidden content, and then
   * gathers the results. The browser is always closed, even on failure.
   *
   * @param {string} url - URL to extract
   * @returns {Promise<{html, blocks, elementGeometries, pageSize}>}
   * @throws {Error} When `url` is empty or whitespace-only.
   */
  async extract(url) {
    const targetUrl = String(url || '').trim();
    if (!targetUrl) {
      throw new Error('PageExtractor requires a non-empty URL');
    }

    const viewport = {
      width: this.config.viewportWidth,
      height: this.config.viewportHeight
    };

    const playwright = await this.getPlaywright();
    const browser = await playwright.chromium.launch({ headless: true });
    try {
      const page = await browser.newPage({ viewport });
      await page.goto(targetUrl, {
        waitUntil: 'domcontentloaded',
        timeout: this.config.timeoutMs
      });
      await scrollToBottom(page);
      await waitForStableHeight(page, { maxWait: this.config.timeoutMs });
      await this.revealHiddenContent(page);

      const html = await page.content();
      const pageSize = await page.evaluate(() => ({
        width: document.documentElement.scrollWidth || 0,
        height: document.documentElement.scrollHeight || 0
      }));

      // Block size thresholds are expressed relative to the viewport.
      const minWidth = Math.round(viewport.width * this.config.minBlockWidthRatio);
      const blocksResult = await page.evaluate(extractBlocksInBrowser, {
        minHeight: this.config.minBlockHeight,
        minWidth,
        maxHeight: Math.round(viewport.height * this.config.blockMaxHeightRatio),
        maxDepth: this.config.blockMaxDepth,
        textPreviewMaxChars: this.config.textPreviewMaxChars,
        debug: false
      });
      const blocks = Array.isArray(blocksResult?.blocks) ? blocksResult.blocks : [];
      const elementGeometries = await this.collectElementGeometries(page);

      return { html, blocks, elementGeometries, pageSize };
    } finally {
      await browser.close();
    }
  }
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
 * Escape a value for use inside a double-quoted CSS attribute selector string.
 *
 * Backslashes are escaped first so that the escapes added afterwards are not
 * doubled, then double quotes. Line feed, carriage return and form feed are
 * emitted as hex escapes (e.g. `\a `) because a raw newline inside a CSS
 * string makes the whole selector invalid.
 *
 * @param {*} value - Coerced with `String(value)`.
 * @returns {string} The escaped text, without surrounding quotes.
 */
export function escapeAttributeValue(value) {
  return String(value)
    .replace(/\\/g, '\\\\')
    .replace(/"/g, '\\"')
    // The trailing space terminates the hex escape so that a following
    // hex-digit character is not absorbed into it.
    .replace(/[\n\r\f]/g, (ch) => `\\${ch.charCodeAt(0).toString(16)} `);
}
|
|
12
|
+
|
|
13
|
+
/**
 * Decide whether a CSS class token looks like a stable identifier that is safe
 * to use in a selector.
 *
 * A token is accepted only when, after trimming, it is 2 to 48 characters
 * long, is not a bare UI-state word (active, hidden, ...; case-insensitive),
 * contains no run of four or more digits, and consists solely of ASCII
 * letters, digits, `_` and `-`.
 *
 * @param {string} token
 * @returns {boolean}
 */
export function isLikelyStableClassToken(token) {
  const candidate = String(token || '').trim();

  const hasAcceptableLength = candidate.length >= 2 && candidate.length <= 48;
  const isTransientState =
    /^(active|selected|open|show|hide|visible|hidden|focus|hover|disabled|enabled)$/i.test(candidate);
  const hasLongDigitRun = /\d{4,}/.test(candidate);
  const isPlainIdentifier = /^[a-zA-Z0-9_-]+$/.test(candidate);

  return hasAcceptableLength && !isTransientState && !hasLongDigitRun && isPlainIdentifier;
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
 * Normalize whitespace-heavy text into a compact single-line string.
 * Collapses all whitespace sequences to a single space and trims.
 *
 * Only `null` and `undefined` are treated as "no text"; other falsy values
 * such as `0` are stringified so they are not silently dropped.
 *
 * @param {*} value
 * @returns {string}
 */
export function normalizeText(value) {
  return String(value ?? '')
    .replace(/\s+/g, ' ')
    .trim();
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import { URL } from 'url';
|
|
2
|
+
|
|
3
|
+
/**
 * Normalize URL (convert relative to absolute, remove fragments).
 *
 * Values starting with `javascript:`, `mailto:`, `tel:` (matched
 * case-insensitively) or `#` are returned trimmed but otherwise untouched.
 * Anything that cannot be parsed as a URL is returned as-is instead of
 * throwing.
 *
 * @param {string} urlStr - URL string to normalize
 * @param {string} baseUrl - Base URL for resolving relative links
 * @param {Object} options - Normalization options
 * @param {boolean} [options.skipFragments=true] - Remove the `#fragment`.
 * @param {boolean} [options.normalizeTrailingSlash=true] - Remove a trailing
 *   `/` when it terminates a non-root path.
 * @returns {string} Normalized absolute URL, or `''` for empty input
 */
export function normalizeUrl(urlStr, baseUrl, options = {}) {
  const {
    skipFragments = true,
    normalizeTrailingSlash = true
  } = options;

  if (!urlStr) return '';

  let candidate = urlStr;
  try {
    candidate = candidate.trim();

    // Whitespace-only input would otherwise resolve to the base URL itself.
    if (!candidate) return '';

    if (/^(?:javascript:|mailto:|tel:|#)/i.test(candidate)) {
      return candidate;
    }

    const absoluteUrl = new URL(candidate, baseUrl);

    if (skipFragments) {
      absoluteUrl.hash = '';
    }

    let normalized = absoluteUrl.href;

    // Strip the slash only when it really belongs to the path: a slash that
    // ends a query value or a fragment is data and must be preserved, and a
    // root path ("/") is never shortened.
    const slashEndsPath = absoluteUrl.search === ''
      && absoluteUrl.hash === ''
      && absoluteUrl.pathname.length > 1
      && absoluteUrl.pathname.endsWith('/');
    if (normalizeTrailingSlash && slashEndsPath && normalized.endsWith('/')) {
      normalized = normalized.slice(0, -1);
    }

    return normalized;
  } catch {
    // Best effort: non-string or unparseable input is handed back unchanged.
    return candidate;
  }
}
|