page-analyzer 1.2.1 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +36 -0
- package/README.md +29 -3
- package/extractors/block-assigner.js +1 -1
- package/index.d.ts +318 -0
- package/index.js +192 -36
- package/llm/analyzers/event-analyzer/event-analyzer-blocks.js +19 -0
- package/llm/providers/claude-cli-provider.js +137 -0
- package/llm/providers/cli-runner.js +129 -0
- package/llm/providers/codex-cli-provider.js +154 -0
- package/llm/providers/index.js +61 -0
- package/package.json +6 -1
- package/page-extractor.js +210 -17
- package/scripts/analyze.js +10 -5
- package/test/smoke.test.js +151 -12
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
import { BaseLlmProvider } from './base-provider.js';
|
|
2
|
+
import { runCli, makeTmpOutFile } from './cli-runner.js';
|
|
3
|
+
|
|
4
|
+
const FAST_MODEL_NAME = 'gpt-5.5';
|
|
5
|
+
|
|
6
|
+
/**
 * Codex CLI provider — invokes the locally installed `codex exec` command.
 *
 * Auth is whatever the user's local codex install already has; no API key needed.
 * Returns the final agent message (read from the `-o` outfile) as a plain string,
 * matching the BaseLlmProvider contract.
 */
export class CodexCliProvider extends BaseLlmProvider {
  /**
   * @param {Object} [config]
   * @param {string} config.model - Model name passed to `codex exec -m`.
   *   NOTE(review): `this.model` is presumably assigned from `config.model` by
   *   the BaseLlmProvider constructor — confirm against base-provider.js.
   * @param {string} [config.cliPath='codex'] - Executable to spawn.
   * @param {string} [config.cwd] - Working directory for the child process.
   * @param {boolean} [config.fast] - Force the fast service tier on or off.
   *   When not a boolean, it is enabled only for FAST_MODEL_NAME.
   * @param {boolean} [config.dangerouslyBypassSandbox=false] - Opt in to
   *   `--dangerously-bypass-approvals-and-sandbox`. Only `true` enables it.
   * @throws {Error} When no model is configured.
   */
  constructor(config = {}) {
    super(config);
    if (!this.model) {
      throw new Error('CodexCliProvider: model is required');
    }
    this.cliPath = config.cliPath || 'codex';
    this.cwd = config.cwd || null;
    this.fast = typeof config.fast === 'boolean'
      ? config.fast
      : this.model === FAST_MODEL_NAME;
    this.dangerouslyBypassSandbox = config.dangerouslyBypassSandbox === true;
  }

  /**
   * Build the argv (without the executable itself) for one `codex exec` run.
   *
   * @param {string} outFile - Path handed to `-o`; the final message is read back from it.
   * @returns {string[]} Argument list for the CLI.
   */
  buildArgs(outFile) {
    const args = [
      'exec',
      '-m', this.model,
      '--skip-git-repo-check',
      '--sandbox', 'read-only',
      '--ephemeral',
      '--color', 'never'
    ];
    // The bypass flag switches sandboxing off, which contradicts the
    // `--sandbox read-only` request above. Prompts may carry untrusted page
    // content, so the bypass is only sent when explicitly opted in.
    if (this.dangerouslyBypassSandbox) {
      args.push('--dangerously-bypass-approvals-and-sandbox');
    }
    args.push('-o', outFile);
    if (this.fast) {
      args.push('-c', 'service_tier="fast"');
    }
    return args;
  }

  /**
   * Normalise request metadata into the identifying fields of an interaction log entry.
   *
   * @param {Object} [metadata] - May carry domain, nodeId, operation and chunkLabel.
   * @returns {{domain: string, nodeId: string, operation: string, chunkLabel: (string|null)}|null}
   *   `null` when either domain or nodeId is missing or blank, which disables logging.
   */
  resolveInteractionContext(metadata) {
    const context = metadata && typeof metadata === 'object' ? metadata : {};
    const domain = String(context.domain || '').trim();
    const nodeId = String(context.nodeId || '').trim();
    if (!domain || !nodeId) {
      return null;
    }
    return {
      domain,
      nodeId,
      operation: String(context.operation || 'analysis').trim() || 'analysis',
      chunkLabel: String(context.chunkLabel || '').trim() || null
    };
  }

  /**
   * Run a single `codex exec` invocation and return the trimmed final message.
   *
   * @param {string} prompt - Prompt text handed to the CLI runner.
   * @param {Object} [options]
   * @param {Object} [options.metadata] - Interaction-log context; see resolveInteractionContext.
   * @returns {Promise<string>} Trimmed content of the `-o` output file.
   * @throws {Error} When the CLI exits non-zero or leaves the output file empty;
   *   errors from the runner or the interaction logger propagate unchanged.
   */
  async makeRequest(prompt, options = {}) {
    const metadata = options && typeof options.metadata === 'object' ? options.metadata : {};
    const interactionContext = this.resolveInteractionContext(metadata);
    const { filePath: outFile, cleanup } = await makeTmpOutFile('codex-cli', 'last-message.txt');
    const args = this.buildArgs(outFile);
    const requestPayload = { argv: [this.cliPath, ...args], model: this.model, fast: this.fast };

    // Emits one interaction log entry; resolves to true only if an entry was written.
    // The CLI reports no request id or token usage, so those fields are always null.
    const logInteraction = async (outputText, responsePayload) => {
      if (!interactionContext) {
        return false;
      }
      await this.emitInteractionLog({
        ...interactionContext,
        provider: 'Codex',
        model: this.model,
        requestId: null,
        inputText: prompt,
        outputText,
        requestPayload,
        responsePayload,
        usagePromptTokens: null,
        usageCompletionTokens: null,
        usageReasoningTokens: null,
        usageCost: null
      });
      return true;
    };

    // Prevents the catch block from logging a failure that was already recorded.
    let failureLogged = false;
    try {
      const result = await runCli({
        command: this.cliPath,
        args,
        prompt,
        timeoutMs: this.timeout,
        outFile,
        cwd: this.cwd || undefined
      });

      const outputText = String(result.outFileContent ?? '').trim();
      const responsePayload = {
        stdout: result.stdout,
        stderr: result.stderr,
        code: result.code,
        signal: result.signal
      };

      if (result.code !== 0 || !outputText) {
        failureLogged = await logInteraction(outputText || null, responsePayload);
        const reason = result.code !== 0
          ? `exited with code ${result.code}${result.signal ? ` (signal ${result.signal})` : ''}`
          : 'produced empty output file';
        // Only the tail of stderr is surfaced to keep the error message bounded.
        const stderrTail = String(result.stderr || '').slice(-500);
        throw new Error(`codex exec ${reason}${stderrTail ? `: ${stderrTail}` : ''}`);
      }

      await logInteraction(outputText, responsePayload);
      return outputText;
    } catch (error) {
      if (!failureLogged) {
        // No CLI result is available on this path, so output and response are null.
        await logInteraction(null, null);
      }
      throw error;
    } finally {
      await cleanup();
    }
  }

  /**
   * Analyze content through the retry wrapper.
   *
   * @param {*} content - Coerced to a string prompt; null/undefined become ''.
   * @param {Object} [options] - Forwarded to makeRequest with `parseJson` removed.
   * @returns {Promise<*>} Whatever makeRequestWithRetry resolves to.
   *   NOTE(review): presumably the string from makeRequest — confirm in base-provider.js.
   */
  async analyze(content, options = {}) {
    const requestOptions = { ...options };
    delete requestOptions.parseJson;
    return this.makeRequestWithRetry(() => this.makeRequest(String(content ?? ''), requestOptions));
  }

  /**
   * @throws {Error} When no model is configured.
   */
  validateConfig() {
    if (!this.model) {
      throw new Error('Model is required');
    }
  }
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { OpenAiProvider } from './openai-provider.js';
|
|
2
|
+
import { CodexCliProvider } from './codex-cli-provider.js';
|
|
3
|
+
import { ClaudeCliProvider } from './claude-cli-provider.js';
|
|
4
|
+
|
|
5
|
+
export const LLM_PROVIDER_TYPES = Object.freeze(['openai', 'codex', 'claude']);

/**
 * Instantiate the LLM provider selected by `llmConfig.type`.
 *
 * The type is matched case-insensitively and falls back to 'openai' when
 * absent. Every provider receives model, timeout, maxRetries, maxTokens and
 * interactionLogger; the remaining options are forwarded per provider.
 *
 * @param {Object} llmConfig
 * @param {'openai'|'codex'|'claude'} [llmConfig.type='openai']
 * @param {string} llmConfig.model
 * @param {string} [llmConfig.apiKey] - OpenAI only.
 * @param {string} [llmConfig.apiEndpoint] - OpenAI only.
 * @param {string} [llmConfig.cliPath] - Codex and Claude only.
 * @param {string} [llmConfig.cwd] - Codex and Claude only.
 * @param {boolean} [llmConfig.fast] - Codex only.
 * @param {number} [llmConfig.maxTokens]
 * @param {number} [llmConfig.temperature] - OpenAI only.
 * @param {number} [llmConfig.timeout]
 * @param {number} [llmConfig.maxRetries]
 * @param {Function} [llmConfig.interactionLogger]
 * @returns {OpenAiProvider|CodexCliProvider|ClaudeCliProvider}
 * @throws {Error} When the type is not one of LLM_PROVIDER_TYPES.
 */
export function createLlmProvider(llmConfig = {}) {
  const providerKind = String(llmConfig.type || 'openai').toLowerCase();
  const { model, timeout, maxRetries, maxTokens, interactionLogger } = llmConfig;
  const commonOptions = { model, timeout, maxRetries, maxTokens, interactionLogger };

  if (providerKind === 'openai') {
    const { apiKey, apiEndpoint, temperature } = llmConfig;
    return new OpenAiProvider({ ...commonOptions, apiKey, apiEndpoint, temperature });
  }

  if (providerKind === 'codex') {
    const { cliPath, cwd, fast } = llmConfig;
    return new CodexCliProvider({ ...commonOptions, cliPath, cwd, fast });
  }

  if (providerKind === 'claude') {
    const { cliPath, cwd } = llmConfig;
    return new ClaudeCliProvider({ ...commonOptions, cliPath, cwd });
  }

  throw new Error(`Unknown llm.type: ${llmConfig.type}. Expected one of: ${LLM_PROVIDER_TYPES.join(', ')}`);
}
|
|
59
|
+
|
|
60
|
+
export { OpenAiProvider, CodexCliProvider, ClaudeCliProvider };
|
|
61
|
+
export { BaseLlmProvider } from './base-provider.js';
|
package/package.json
CHANGED
|
@@ -1,10 +1,15 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "page-analyzer",
|
|
3
|
-
"version": "1.2.
|
|
3
|
+
"version": "1.2.3",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Standalone page analysis module.",
|
|
6
6
|
"license": "MIT",
|
|
7
|
+
"repository": {
|
|
8
|
+
"type": "git",
|
|
9
|
+
"url": "git+https://github.com/liuzemei/page-analyzer.git"
|
|
10
|
+
},
|
|
7
11
|
"main": "index.js",
|
|
12
|
+
"types": "index.d.ts",
|
|
8
13
|
"scripts": {
|
|
9
14
|
"test": "node test/smoke.test.js",
|
|
10
15
|
"analyze": "node scripts/analyze.js",
|
package/page-extractor.js
CHANGED
|
@@ -5,7 +5,8 @@
|
|
|
5
5
|
|
|
6
6
|
import fs from 'node:fs/promises';
|
|
7
7
|
import path from 'node:path';
|
|
8
|
-
import {
|
|
8
|
+
import { createHash } from 'node:crypto';
|
|
9
|
+
import { HeadObjectCommand, PutObjectCommand, S3Client } from '@aws-sdk/client-s3';
|
|
9
10
|
|
|
10
11
|
// In-browser block extraction function (serialized into page.evaluate)
|
|
11
12
|
// Imported from the project's extract-blocks script
|
|
@@ -41,6 +42,24 @@ function createSnapshotRunId() {
|
|
|
41
42
|
.replace(/^-+|-+$/g, '');
|
|
42
43
|
}
|
|
43
44
|
|
|
45
|
+
/**
 * Derive the per-site path segment used inside S3 keys from a page URL.
 *
 * @param {string} url - Page URL; coerced to a trimmed string.
 * @returns {string} The lower-cased hostname with every run of characters
 *   outside [a-z0-9.-] collapsed to '-' and edge dashes stripped, or 'page'
 *   when that leaves nothing. Input that does not parse as a URL is handed to
 *   createSnapshotSlug instead.
 */
function createS3DomainSegment(url) {
  const source = String(url || '').trim();

  let parsed;
  try {
    parsed = new URL(source);
  } catch {
    return createSnapshotSlug(source);
  }

  const lowered = parsed.hostname.toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9.-]+/g, '-');
  const trimmed = dashed.replace(/^-+|-+$/g, '');
  return trimmed || 'page';
}
|
|
58
|
+
|
|
59
|
+
/**
 * Compute the hex MD5 digest of a payload; the caller uses it as a
 * content-derived file name.
 *
 * @param {string|Buffer|Uint8Array} body - Data to hash.
 * @returns {string} 32-character lowercase hex digest.
 */
function createFileMd5(body) {
  const hasher = createHash('md5');
  hasher.update(body);
  return hasher.digest('hex');
}
|
|
62
|
+
|
|
44
63
|
function getBlockNumber(block, fallbackIndex) {
|
|
45
64
|
return Number.isInteger(block?.blockIdx) ? block.blockIdx : fallbackIndex;
|
|
46
65
|
}
|
|
@@ -96,8 +115,8 @@ function normalizeS3Config(config) {
|
|
|
96
115
|
};
|
|
97
116
|
}
|
|
98
117
|
|
|
99
|
-
function joinS3Key(
|
|
100
|
-
return
|
|
118
|
+
/**
 * Join key segments with '/', skipping any falsy segment (such as an empty prefix).
 *
 * @param {...(string|null|undefined)} parts - Key segments in order.
 * @returns {string} The joined key; '' when no segment is truthy.
 */
function joinS3Key(...parts) {
  const segments = [];
  for (const part of parts) {
    if (part) {
      segments.push(part);
    }
  }
  return segments.join('/');
}
|
|
102
121
|
|
|
103
122
|
function encodeS3Key(key) {
|
|
@@ -119,6 +138,12 @@ function getErrorMessage(error) {
|
|
|
119
138
|
return error instanceof Error ? error.message : String(error);
|
|
120
139
|
}
|
|
121
140
|
|
|
141
|
+
/**
 * Decide whether an error means "object does not exist".
 *
 * @param {*} error - Thrown value; may be null or undefined.
 * @returns {boolean} True for an HTTP 404 in `$metadata`, or when the first
 *   truthy value among `name`, `Code` and `code` is 'NotFound' or 'NoSuchKey'.
 */
function isS3NotFoundError(error) {
  if (error?.$metadata?.httpStatusCode === 404) {
    return true;
  }
  const errorName = String(error?.name || error?.Code || error?.code || '');
  return ['NotFound', 'NoSuchKey'].includes(errorName);
}
|
|
146
|
+
|
|
122
147
|
export class PageExtractor {
|
|
123
148
|
constructor(config = {}) {
|
|
124
149
|
this.config = {
|
|
@@ -136,6 +161,10 @@ export class PageExtractor {
|
|
|
136
161
|
textPreviewMaxChars: Number.isInteger(config.textPreviewMaxChars)
|
|
137
162
|
? Math.max(120, config.textPreviewMaxChars)
|
|
138
163
|
: 1200,
|
|
164
|
+
sizedElementsEnabled: Boolean(config.sizedElementsEnabled),
|
|
165
|
+
sizedElementMinSize: Number.isInteger(config.sizedElementMinSize)
|
|
166
|
+
? Math.max(0, config.sizedElementMinSize)
|
|
167
|
+
: 24,
|
|
139
168
|
waitForImagesLoaded: Boolean(config.waitForImagesLoaded),
|
|
140
169
|
fullPageScreenshot: Boolean(config.fullPageScreenshot),
|
|
141
170
|
blockScreenshots: Boolean(config.blockScreenshots),
|
|
@@ -176,13 +205,30 @@ export class PageExtractor {
|
|
|
176
205
|
return this.s3Client;
|
|
177
206
|
}
|
|
178
207
|
|
|
179
|
-
async
|
|
208
|
+
async s3ObjectExists(client, key) {
|
|
209
|
+
const s3Config = this.config.s3;
|
|
210
|
+
try {
|
|
211
|
+
await client.send(new HeadObjectCommand({
|
|
212
|
+
Bucket: s3Config.bucket,
|
|
213
|
+
Key: key
|
|
214
|
+
}));
|
|
215
|
+
return true;
|
|
216
|
+
} catch (error) {
|
|
217
|
+
if (isS3NotFoundError(error)) {
|
|
218
|
+
return false;
|
|
219
|
+
}
|
|
220
|
+
throw error;
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
async uploadScreenshotToS3(targetUrl, body) {
|
|
180
225
|
const s3Config = this.config.s3;
|
|
181
226
|
if (!s3Config) {
|
|
182
227
|
throw new Error('S3 is not configured');
|
|
183
228
|
}
|
|
184
229
|
|
|
185
|
-
const
|
|
230
|
+
const domain = createS3DomainSegment(targetUrl);
|
|
231
|
+
const key = joinS3Key(s3Config.prefix, domain, `${createFileMd5(body)}.png`);
|
|
186
232
|
const client = this.getS3Client();
|
|
187
233
|
const commandInput = {
|
|
188
234
|
Bucket: s3Config.bucket,
|
|
@@ -194,6 +240,9 @@ export class PageExtractor {
|
|
|
194
240
|
let lastError = null;
|
|
195
241
|
for (let attempt = 1; attempt <= s3Config.maxUploadAttempts; attempt += 1) {
|
|
196
242
|
try {
|
|
243
|
+
if (await this.s3ObjectExists(client, key)) {
|
|
244
|
+
return buildS3Url(s3Config, key);
|
|
245
|
+
}
|
|
197
246
|
const command = new PutObjectCommand(commandInput);
|
|
198
247
|
await client.send(command);
|
|
199
248
|
return buildS3Url(s3Config, key);
|
|
@@ -201,7 +250,7 @@ export class PageExtractor {
|
|
|
201
250
|
lastError = error;
|
|
202
251
|
if (attempt < s3Config.maxUploadAttempts) {
|
|
203
252
|
console.warn(
|
|
204
|
-
`[page-analyzer] Failed to upload ${key} to S3, retrying ` +
|
|
253
|
+
`[page-analyzer] Failed to check/upload ${key} to S3, retrying ` +
|
|
205
254
|
`(${attempt}/${s3Config.maxUploadAttempts}): ${getErrorMessage(error)}`
|
|
206
255
|
);
|
|
207
256
|
}
|
|
@@ -431,6 +480,147 @@ export class PageExtractor {
|
|
|
431
480
|
});
|
|
432
481
|
}
|
|
433
482
|
|
|
483
|
+
/**
|
|
484
|
+
* Collect ALL visible DOM elements that have "some size" (width > minSize OR
|
|
485
|
+
* height > minSize), each annotated with basic raw info. Broader than
|
|
486
|
+
* collectElementGeometries (which is interactive-only); used to nest element
|
|
487
|
+
* detail under each block in the analysis output.
|
|
488
|
+
*/
|
|
489
|
+
async collectSizedElements(page) {
|
|
490
|
+
const minSize = this.config.sizedElementMinSize;
|
|
491
|
+
const records = await page.evaluate((minSizePx) => {
|
|
492
|
+
const INTERACTIVE_SELECTOR = 'a, button, form, input, select, textarea, [onclick], [role="button"], [role="link"]';
|
|
493
|
+
const SKIP_TAGS = new Set([
|
|
494
|
+
'script', 'style', 'noscript', 'template', 'meta', 'link', 'br', 'wbr', 'head', 'title'
|
|
495
|
+
]);
|
|
496
|
+
const MEDIA_TAGS = new Set(['img', 'source', 'video', 'iframe', 'audio', 'embed']);
|
|
497
|
+
const records = [];
|
|
498
|
+
|
|
499
|
+
const normalizeTextInPage = (value, maxLength = 240) => String(value || '')
|
|
500
|
+
.replace(/\s+/g, ' ')
|
|
501
|
+
.trim()
|
|
502
|
+
.slice(0, maxLength);
|
|
503
|
+
|
|
504
|
+
const normalizeHref = (value) => {
|
|
505
|
+
const raw = String(value || '').trim();
|
|
506
|
+
if (!raw) return '';
|
|
507
|
+
try {
|
|
508
|
+
const resolved = new URL(raw, location.href);
|
|
509
|
+
resolved.hash = '';
|
|
510
|
+
const href = resolved.href;
|
|
511
|
+
return href.endsWith('/') && href !== `${resolved.origin}/`
|
|
512
|
+
? href.slice(0, -1)
|
|
513
|
+
: href;
|
|
514
|
+
} catch {
|
|
515
|
+
return raw;
|
|
516
|
+
}
|
|
517
|
+
};
|
|
518
|
+
|
|
519
|
+
const buildPath = (el, useNthOfType) => {
|
|
520
|
+
if (!el || !el.tagName) return '';
|
|
521
|
+
const parts = [];
|
|
522
|
+
let current = el;
|
|
523
|
+
while (current && current.tagName && current.tagName.toLowerCase() !== 'body') {
|
|
524
|
+
const parent = current.parentElement;
|
|
525
|
+
if (!parent || parent.tagName.toLowerCase() === 'html') break;
|
|
526
|
+
const tag = current.tagName.toLowerCase();
|
|
527
|
+
const siblings = Array.from(parent.children).filter((child) => {
|
|
528
|
+
if (!(child instanceof Element)) return false;
|
|
529
|
+
return useNthOfType ? child.tagName.toLowerCase() === tag : true;
|
|
530
|
+
});
|
|
531
|
+
const index = siblings.indexOf(current) + 1;
|
|
532
|
+
const suffix = useNthOfType
|
|
533
|
+
? `:nth-of-type(${index})`
|
|
534
|
+
: `:nth-child(${index})`;
|
|
535
|
+
parts.unshift(`${tag}${suffix}`);
|
|
536
|
+
current = parent;
|
|
537
|
+
}
|
|
538
|
+
if (parts.length === 0) return '';
|
|
539
|
+
return `body > ${parts.join(' > ')}`;
|
|
540
|
+
};
|
|
541
|
+
|
|
542
|
+
const pageWidth = Math.max(
|
|
543
|
+
document.documentElement?.scrollWidth || 0,
|
|
544
|
+
document.body?.scrollWidth || 0
|
|
545
|
+
);
|
|
546
|
+
const pageHeight = Math.max(
|
|
547
|
+
document.documentElement?.scrollHeight || 0,
|
|
548
|
+
document.body?.scrollHeight || 0
|
|
549
|
+
);
|
|
550
|
+
|
|
551
|
+
for (const element of document.querySelectorAll('body *')) {
|
|
552
|
+
if (!(element instanceof Element)) continue;
|
|
553
|
+
const tag = element.tagName.toLowerCase();
|
|
554
|
+
if (SKIP_TAGS.has(tag)) continue;
|
|
555
|
+
|
|
556
|
+
const style = getComputedStyle(element);
|
|
557
|
+
if (style.display === 'none') continue;
|
|
558
|
+
if (style.visibility === 'hidden' || style.visibility === 'collapse') continue;
|
|
559
|
+
if (parseFloat(style.opacity) === 0) continue;
|
|
560
|
+
|
|
561
|
+
const rect = element.getBoundingClientRect();
|
|
562
|
+
if (rect.width <= 0 || rect.height <= 0) continue;
|
|
563
|
+
if (!(rect.width > minSizePx || rect.height > minSizePx)) continue;
|
|
564
|
+
|
|
565
|
+
const top = rect.top + window.scrollY;
|
|
566
|
+
const left = rect.left + window.scrollX;
|
|
567
|
+
// Skip elements positioned fully outside the document (e.g. a11y skip-links at top:-17000)
|
|
568
|
+
if (top + rect.height <= 0 || left + rect.width <= 0) continue;
|
|
569
|
+
if (pageHeight > 0 && top >= pageHeight) continue;
|
|
570
|
+
if (pageWidth > 0 && left >= pageWidth) continue;
|
|
571
|
+
|
|
572
|
+
let imageAlt = '';
|
|
573
|
+
if (tag === 'img') {
|
|
574
|
+
imageAlt = normalizeTextInPage(element.getAttribute('alt') || '', 240);
|
|
575
|
+
} else {
|
|
576
|
+
const childImg = element.querySelector(':scope > img');
|
|
577
|
+
if (childImg) {
|
|
578
|
+
imageAlt = normalizeTextInPage(childImg.getAttribute('alt') || '', 240);
|
|
579
|
+
}
|
|
580
|
+
}
|
|
581
|
+
|
|
582
|
+
let src = '';
|
|
583
|
+
if (MEDIA_TAGS.has(tag)) {
|
|
584
|
+
src = normalizeHref(element.getAttribute('src') || element.currentSrc || '');
|
|
585
|
+
}
|
|
586
|
+
|
|
587
|
+
const role = element.getAttribute('role') || '';
|
|
588
|
+
const roleLower = role.toLowerCase();
|
|
589
|
+
|
|
590
|
+
records.push({
|
|
591
|
+
tag,
|
|
592
|
+
text: normalizeTextInPage(element.innerText || element.textContent || ''),
|
|
593
|
+
href: normalizeHref(
|
|
594
|
+
element.getAttribute('href')
|
|
595
|
+
|| element.getAttribute('action')
|
|
596
|
+
|| element.getAttribute('formaction')
|
|
597
|
+
|| ''
|
|
598
|
+
),
|
|
599
|
+
src,
|
|
600
|
+
width: rect.width,
|
|
601
|
+
height: rect.height,
|
|
602
|
+
top,
|
|
603
|
+
left,
|
|
604
|
+
cssSelector: buildPath(element, true),
|
|
605
|
+
id: element.id || '',
|
|
606
|
+
class: normalizeTextInPage(element.getAttribute('class') || '', 240),
|
|
607
|
+
role,
|
|
608
|
+
ariaLabel: normalizeTextInPage(element.getAttribute('aria-label') || '', 120),
|
|
609
|
+
imageAlt,
|
|
610
|
+
interactive: element.matches(INTERACTIVE_SELECTOR)
|
|
611
|
+
|| element.hasAttribute('onclick')
|
|
612
|
+
|| roleLower === 'button'
|
|
613
|
+
|| roleLower === 'link'
|
|
614
|
+
});
|
|
615
|
+
}
|
|
616
|
+
|
|
617
|
+
return records;
|
|
618
|
+
}, minSize);
|
|
619
|
+
|
|
620
|
+
console.log(`[page-analyzer] Collected ${records.length} sized elements (width or height > ${minSize}px)`);
|
|
621
|
+
return records;
|
|
622
|
+
}
|
|
623
|
+
|
|
434
624
|
async collectPageSize(page) {
|
|
435
625
|
return page.evaluate(() => {
|
|
436
626
|
const html = document.documentElement;
|
|
@@ -572,16 +762,16 @@ export class PageExtractor {
|
|
|
572
762
|
await fs.mkdir(this.config.snapshotDir, { recursive: true });
|
|
573
763
|
}
|
|
574
764
|
|
|
575
|
-
const
|
|
765
|
+
const localPrefix = `${createSnapshotSlug(targetUrl)}-${createSnapshotRunId()}`;
|
|
576
766
|
const screenshots = {};
|
|
577
767
|
|
|
578
768
|
if (fullPageScreenshot) {
|
|
579
|
-
const fullPageFilename = `${prefix}-full-page.png`;
|
|
580
769
|
try {
|
|
581
770
|
if (useS3) {
|
|
582
771
|
const body = await page.screenshot({ fullPage: true });
|
|
583
|
-
screenshots.fullPage = await this.uploadScreenshotToS3(
|
|
772
|
+
screenshots.fullPage = await this.uploadScreenshotToS3(targetUrl, body);
|
|
584
773
|
} else {
|
|
774
|
+
const fullPageFilename = `${localPrefix}-full-page.png`;
|
|
585
775
|
const fullPagePath = path.join(this.config.snapshotDir, fullPageFilename);
|
|
586
776
|
await page.screenshot({
|
|
587
777
|
path: fullPagePath,
|
|
@@ -603,15 +793,13 @@ export class PageExtractor {
|
|
|
603
793
|
const block = blocks[index];
|
|
604
794
|
const blockIdx = getBlockNumber(block, index);
|
|
605
795
|
|
|
606
|
-
const blockLabel = String(blockIdx).padStart(3, '0').replace(/[^0-9a-z-]+/gi, '-');
|
|
607
|
-
const blockFilename = `${prefix}-block-${blockLabel}.png`;
|
|
608
796
|
try {
|
|
609
797
|
if (useS3) {
|
|
610
798
|
const body = await this.captureBlockScreenshotData(page, block);
|
|
611
799
|
if (!body) {
|
|
612
800
|
continue;
|
|
613
801
|
}
|
|
614
|
-
const url = await this.uploadScreenshotToS3(
|
|
802
|
+
const url = await this.uploadScreenshotToS3(targetUrl, body);
|
|
615
803
|
const screenshotRecord = {
|
|
616
804
|
blockIdx,
|
|
617
805
|
path: url
|
|
@@ -626,6 +814,8 @@ export class PageExtractor {
|
|
|
626
814
|
continue;
|
|
627
815
|
}
|
|
628
816
|
|
|
817
|
+
const blockLabel = String(blockIdx).padStart(3, '0').replace(/[^0-9a-z-]+/gi, '-');
|
|
818
|
+
const blockFilename = `${localPrefix}-block-${blockLabel}.png`;
|
|
629
819
|
const blockPath = path.join(this.config.snapshotDir, blockFilename);
|
|
630
820
|
const captured = await this.captureBlockScreenshot(page, block, blockPath);
|
|
631
821
|
if (captured) {
|
|
@@ -696,11 +886,11 @@ export class PageExtractor {
|
|
|
696
886
|
|
|
697
887
|
/**
|
|
698
888
|
* Extract page data from an already prepared Playwright page:
|
|
699
|
-
* html, blocks, elementGeometries, screenshots.
|
|
889
|
+
* html, blocks, elementGeometries, sizedElements, screenshots.
|
|
700
890
|
* When config.s3 is provided, screenshots are uploaded to S3 and returned as URLs.
|
|
701
891
|
* @param {import('playwright').Page} page - Prepared Playwright page
|
|
702
892
|
* @param {string} targetUrl - URL loaded in the page
|
|
703
|
-
* @returns {Promise<{html, blocks, elementGeometries, screenshots, pageSize}>}
|
|
893
|
+
* @returns {Promise<{html, blocks, elementGeometries, sizedElements, screenshots, pageSize}>}
|
|
704
894
|
*/
|
|
705
895
|
async extractPreparedPage(page, targetUrl) {
|
|
706
896
|
const viewport = {
|
|
@@ -721,17 +911,20 @@ export class PageExtractor {
|
|
|
721
911
|
});
|
|
722
912
|
const blocks = Array.isArray(blocksResult?.blocks) ? blocksResult.blocks : [];
|
|
723
913
|
const elementGeometries = await this.collectElementGeometries(page);
|
|
914
|
+
const sizedElements = this.config.sizedElementsEnabled
|
|
915
|
+
? await this.collectSizedElements(page)
|
|
916
|
+
: [];
|
|
724
917
|
const finalPageSize = await this.collectPageSize(page);
|
|
725
918
|
const screenshots = await this.captureScreenshots(page, targetUrl, blocks);
|
|
726
919
|
|
|
727
|
-
return { html, blocks, elementGeometries, screenshots, pageSize: finalPageSize };
|
|
920
|
+
return { html, blocks, elementGeometries, sizedElements, screenshots, pageSize: finalPageSize };
|
|
728
921
|
}
|
|
729
922
|
|
|
730
923
|
/**
|
|
731
|
-
* Extract page data: html, blocks, elementGeometries, screenshots.
|
|
924
|
+
* Extract page data: html, blocks, elementGeometries, sizedElements, screenshots.
|
|
732
925
|
* When config.s3 is provided, screenshots are uploaded to S3 and returned as URLs.
|
|
733
926
|
* @param {string} url - URL to extract
|
|
734
|
-
* @returns {Promise<{html, blocks, elementGeometries, screenshots, pageSize}>}
|
|
927
|
+
* @returns {Promise<{html, blocks, elementGeometries, sizedElements, screenshots, pageSize}>}
|
|
735
928
|
*/
|
|
736
929
|
async extract(url) {
|
|
737
930
|
return this.withPreparedPage(url, async (page, targetUrl) => {
|
package/scripts/analyze.js
CHANGED
|
@@ -33,14 +33,19 @@ loadDotEnv(path.join(projectRoot, '.env'));
|
|
|
33
33
|
|
|
34
34
|
const url = process.argv[2] || 'https://www.jcb.co.jp/ordercard/kojin_card/os_card_w2.html';
|
|
35
35
|
|
|
36
|
+
// Provider selection comes from the environment. API credentials are attached
// only for the 'openai' type.
const llmType = (process.env.LLM_TYPE || 'openai').toLowerCase();
const openAiSettings = llmType === 'openai'
  ? { apiKey: process.env.LLM_API_KEY, apiEndpoint: process.env.LLM_API_ENDPOINT }
  : {};
const llmConfig = { type: llmType, model: process.env.LLM_MODEL, ...openAiSettings };
|
|
42
|
+
|
|
36
43
|
const result = await analyzeUrl(url, {
|
|
37
44
|
fullPageScreenshot: true,
|
|
38
45
|
blockScreenshots: true,
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
model: process.env.LLM_MODEL
|
|
43
|
-
}
|
|
46
|
+
showElement: true,
|
|
47
|
+
elementSize: 24,
|
|
48
|
+
llm: llmConfig
|
|
44
49
|
});
|
|
45
50
|
|
|
46
51
|
fs.writeFileSync(
|