page-analyzer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,127 @@
1
+ You are a strict web layout analyzer.
2
+
3
+ Task:
4
+ From blocks_index.csv, output:
5
+ 1. one short site summary
6
+ 2. logical UI blocks
7
+ 3. possible click events
8
+ 4. an optional second CSV only when a merged block with the same blockName still contains clearly different meanings
9
+
10
+ Be conservative. Do not guess without clear signals.
11
+
12
+ Input columns:
13
+ blockIdx,branchPath,depth,domOrder,tag,fixed,top,left,width,height,blockCssPath,blockPosition,textPreview,childInteractiveCount
14
+
15
+ Use these signals together:
16
+ - branchPath prefix
17
+ - depth
18
+ - domOrder
19
+ - top/left/width/height
20
+ - textPreview continuity
21
+
22
+ Block names:
23
+ Header, Footer, SideNavigation, Hero, TopNavigation, ContentSection, ArticleList, ProductGrid, FeatureCards, CTASection, Form, SearchBar, ImageGallery, ContentBlock
24
+
25
+ Merge only when most signals align:
26
+ - similar branchPath prefix
27
+ - close domOrder
28
+ - similar depth
29
+ - small vertical gap
30
+ - similar left and width
31
+ - continuous or related textPreview
32
+
33
+ Do not merge when:
34
+ - different columns
35
+ - large vertical gaps
36
+ - unrelated branchPath
37
+ - clearly different meanings
38
+
39
+ Events:
40
+ Use click-based snake_case labels only; when a block has several events, join the labels with dots (for example logo_click.nav_click).
41
+
42
+ Examples: logo_click nav_click article_click product_click card_click image_click cta_click form_submit_click search_click item_click
43
+
44
+ Rules:
45
+ - infer from textPreview, blockName, and childInteractiveCount
46
+ - if childInteractiveCount = 0, leave events empty
47
+ - if childInteractiveCount > 0, output all possible events.
48
+ - if unclear, use item_click
49
+ - any area may still have click events
50
+
51
+ Second CSV:
52
+ Output only if one merged first-CSV block:
53
+ - contains multiple original blocks
54
+ - has the same high-level blockName
55
+ - still contains clearly different internal meanings worth splitting
56
+
57
+ Use one short semantic label per split group.
58
+
59
+ Examples: product_info review faq testimonial promo signup download comparison filter_bar sort_bar pagination breadcrumb
60
+
61
+ Prefer the single most representative label. Do not assign both a parent label and its sub-aspect.
62
+
63
+ Output format:
64
+ Plain CSV only.
65
+
66
+ First CSV:
67
+ Line 1: site summary sentence
68
+ Line 2: blockIdxs,blockName,blockDescription,blockPossibleEvents
69
+ Line 3+: one row per logical block
70
+
71
+ Rules:
72
+ - blockIdxs must be dot-separated and sorted ascending
73
+ - blockDescription must be a concise LLM-written description of the logical UI block, based on the block text, role, and layout signals
74
+ - blockDescription must not contain commas
75
+ - blockPossibleEvents must be dot-separated
76
+ - no extra columns beyond blockIdxs, blockName, blockDescription, blockPossibleEvents
77
+ - no markdown
78
+ - no explanations
79
+ - each original blockIdx must appear exactly once in the first CSV
80
+
81
+ Optional second CSV:
82
+ If needed, add a separator line exactly as:
83
+ ---
84
+
85
+ Then output:
86
+ blockIdxs,blockSemantic
87
+
88
+ Rules:
89
+ - group nearby sub-blocks with the same meaning when possible
90
+ - use compact dot-separated indexes, one group per row (for example 1.2 on one row, 3.4 on the next, 5 on the next)
91
+ - each blockIdx here must belong to a merged block from the first CSV
92
+ - do not repeat the same blockIdx across rows
93
+ - do not output this CSV if not needed
94
+
95
+ Example 1:
96
+
97
+ ```text
98
+ Marketing page with header, hero, feature cards, and footer
99
+ blockIdxs,blockName,blockDescription,blockPossibleEvents
100
+ 0,Header,Global header with brand and navigation links,logo_click.nav_click
101
+ 1.2,Hero,Hero area introducing the offer with a primary CTA,cta_click
102
+ 3.4.5,FeatureCards,Feature card grid describing product benefits,card_click
103
+ 6,CTASection,Call-to-action section prompting conversion,cta_click
104
+ 7,Footer,Footer with secondary navigation links,nav_click
105
+ ```
106
+
107
+ Example 2:
108
+
109
+ ```text
110
+ Product page with navigation, product content, reviews, and footer
111
+ blockIdxs,blockName,blockDescription,blockPossibleEvents
112
+ 0,Header,Global header with logo and navigation,logo_click.nav_click
113
+ 1.2.3.4.5,ContentBlock,Product detail content with reviews and FAQ links,product_click.cta_click
114
+ 6,CTASection,Conversion CTA section for the product,cta_click
115
+ 7,Footer,Footer with site navigation,nav_click
116
+ ---
117
+ blockIdxs,blockSemantic
118
+ 1.2,product_info
119
+ 3.4,review
120
+ 5,faq
121
+ ```
122
+
123
+ Input:
124
+ blocks_index.csv
125
+ ```csv
126
+ <<<BLOCKS_INDEX_CSV>>>
127
+ ```
@@ -0,0 +1,64 @@
/**
 * Coerces `value` to a number and returns it only when it is finite and
 * strictly positive; otherwise returns `fallback`.
 */
function toPositiveNumber(value, fallback) {
  const parsed = Number(value);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
}

/**
 * Base class for LLM providers.
 * Defines the interface that all LLM providers must implement and supplies
 * the shared retry and interaction-logging helpers.
 */
export class BaseLlmProvider {
  /**
   * @param {object} [config]
   * @param {string} [config.apiKey] - Checked by validateConfig().
   * @param {string} [config.model] - Checked by validateConfig().
   * @param {Function} [config.interactionLogger] - Called by emitInteractionLog(); ignored unless it is a function.
   * @param {number} [config.timeout] - Positive number; anything else falls back to 600000.
   * @param {number} [config.maxRetries] - Total attempts made by makeRequestWithRetry(); values below 1 fall back to 3.
   * @param {number} [config.maxTokens] - Positive integer; anything else is stored as null.
   */
  constructor(config = {}) {
    this.config = config;
    this.apiKey = config.apiKey;
    this.model = config.model;
    this.interactionLogger = typeof config.interactionLogger === 'function'
      ? config.interactionLogger
      : null;
    // Negative, NaN or non-numeric values used to be stored as-is; a negative
    // maxRetries then skipped the retry loop entirely and surfaced `undefined`.
    this.timeout = toPositiveNumber(config.timeout, 600000);
    const attempts = Math.floor(toPositiveNumber(config.maxRetries, 3));
    this.maxRetries = attempts >= 1 ? attempts : 3;
    this.maxTokens = Number.isInteger(config.maxTokens) && config.maxTokens > 0
      ? config.maxTokens
      : null;
  }

  /**
   * Abstract entry point; subclasses must override it.
   * @throws {Error} Always, in the base class.
   */
  async analyze(content, options = {}) {
    throw new Error('analyze() must be implemented by subclass');
  }

  /**
   * Runs `requestFn` up to `this.maxRetries` times, waiting 1s, 2s, 4s...
   * (capped at 10s) between failed attempts.
   *
   * @param {Function} requestFn - Async function performing one attempt.
   * @returns {Promise<*>} The first successful result.
   * @throws The error of the last failed attempt.
   */
  async makeRequestWithRetry(requestFn) {
    // Re-validate here so a maxRetries mutated after construction still
    // guarantees at least one attempt.
    const maxAttempts = Math.max(1, Math.floor(toPositiveNumber(this.maxRetries, 3)));
    let lastError;

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        return await requestFn();
      } catch (err) {
        lastError = err;
        // A rejection value may be null/undefined or a non-Error; reading
        // `.message` directly would throw from inside this handler.
        const reason = err?.message ?? String(err);
        console.warn(`LLM request attempt ${attempt} failed: ${reason}`);

        if (attempt < maxAttempts) {
          const delay = Math.min(1000 * Math.pow(2, attempt - 1), 10000);
          await new Promise((resolve) => setTimeout(resolve, delay));
        }
      }
    }

    throw lastError;
  }

  /**
   * Forwards `payload` to the configured interaction logger, if any.
   * Logger failures are reported with console.warn and never propagated.
   */
  async emitInteractionLog(payload = {}) {
    if (!this.interactionLogger) {
      return;
    }

    try {
      await this.interactionLogger(payload);
    } catch (error) {
      const reason = error?.message ?? String(error);
      console.warn(`[LLM] Failed to persist interaction log: ${reason}`);
    }
  }

  /**
   * @throws {Error} When `apiKey` or `model` is missing.
   */
  validateConfig() {
    if (!this.apiKey) {
      throw new Error('API key is required');
    }
    if (!this.model) {
      throw new Error('Model is required');
    }
  }
}
@@ -0,0 +1,168 @@
1
+ import { BaseLlmProvider } from './base-provider.js';
2
+
/**
 * OpenAI API provider implementation.
 *
 * POSTs a chat-style payload to `config.apiEndpoint` with a Bearer token and,
 * when the request metadata names both a domain and a node, reports every
 * exchange through the interaction logger inherited from BaseLlmProvider.
 */
export class OpenAiProvider extends BaseLlmProvider {
  /**
   * @param {object} [config] - Base provider config plus the fields below.
   * @param {string} config.apiEndpoint - URL the request is POSTed to.
   * @param {number} [config.temperature=0] - Sampling temperature.
   * @throws {Error} When apiEndpoint, apiKey or model is missing.
   */
  constructor(config = {}) {
    super(config);
    this.apiEndpoint = config.apiEndpoint;
    // NaN/Infinity pass a typeof check but serialize to null in JSON, so only
    // finite numbers are accepted.
    this.temperature = Number.isFinite(config.temperature) ? config.temperature : 0;
    if (!this.apiEndpoint) {
      throw new Error('apiEndpoint is required');
    }
    this.validateConfig();
  }

  /**
   * Renders one message's content for the interaction log. Strings pass
   * through; structured content (arrays/objects) is JSON-encoded so it does
   * not collapse to "[object Object]".
   */
  stringifyContent(content) {
    if (typeof content === 'string') {
      return content;
    }
    if (content === null || content === undefined) {
      return '';
    }
    if (typeof content === 'object') {
      try {
        return JSON.stringify(content) ?? String(content);
      } catch {
        // Circular structures or BigInt values: fall back to plain coercion.
        return String(content);
      }
    }
    return String(content);
  }

  /**
   * Flattens a message list into "[n] role: content" paragraphs.
   * Non-array input is coerced to a string (null/undefined become '').
   */
  buildInputText(messages) {
    if (!Array.isArray(messages)) {
      return String(messages ?? '');
    }

    return messages
      .map((message, index) => {
        const role = String(message?.role || 'unknown').trim() || 'unknown';
        const content = this.stringifyContent(message?.content);
        return `[${index + 1}] ${role}: ${content}`;
      })
      .join('\n\n');
  }

  /** @returns {number|null} Base-10 integer parsed from `value`, or null. */
  normalizeInteger(value) {
    const parsed = Number.parseInt(value, 10);
    return Number.isFinite(parsed) ? parsed : null;
  }

  /** @returns {number|null} Float parsed from `value`, or null. */
  normalizeFloat(value) {
    const parsed = Number.parseFloat(value);
    return Number.isFinite(parsed) ? parsed : null;
  }

  /**
   * Extracts the logging context from request metadata.
   * @returns {object|null} null unless both `domain` and `nodeId` are non-empty.
   */
  resolveInteractionContext(metadata) {
    const context = metadata && typeof metadata === 'object' ? metadata : {};
    const domain = String(context.domain || '').trim();
    const nodeId = String(context.nodeId || '').trim();
    if (!domain || !nodeId) {
      return null;
    }

    return {
      domain,
      nodeId,
      operation: String(context.operation || 'analysis').trim() || 'analysis',
      chunkLabel: String(context.chunkLabel || '').trim() || null
    };
  }

  /** Builds the interaction-log record emitted when a request fails. */
  buildFailureLog(interactionContext, inputText, payload) {
    return {
      ...interactionContext,
      provider: 'OpenAI',
      model: this.model,
      requestId: null,
      inputText,
      outputText: null,
      requestPayload: payload,
      responsePayload: null,
      usagePromptTokens: null,
      usageCompletionTokens: null,
      usageReasoningTokens: null,
      usageCost: null
    };
  }

  /**
   * Sends one request and returns the first choice's message content.
   *
   * @param {Array|*} messages - Sent as the payload's `messages`.
   * @param {object} [options] - Extra payload fields; `metadata` is removed
   *   from the payload and used only for interaction logging.
   * @returns {Promise<string>} Message content, or '' when absent.
   * @throws {Error} On a non-2xx response, or whatever fetch/JSON parsing throws.
   */
  async makeRequest(messages, options = {}) {
    const { metadata: rawMetadata, ...requestOptions } =
      options && typeof options === 'object' ? options : {};
    const metadata = rawMetadata && typeof rawMetadata === 'object' ? rawMetadata : {};

    const payload = {
      model: this.model,
      messages,
      // NOTE(review): gpt-5-mini is always sent temperature 1, presumably
      // because that model rejects other values; confirm.
      temperature: this.model === 'gpt-5-mini' ? 1 : this.temperature,
      // Caller-supplied options come last so they can override the defaults.
      ...requestOptions
    };
    if (payload.max_tokens === undefined && Number.isInteger(this.maxTokens) && this.maxTokens > 0) {
      payload.max_tokens = this.maxTokens;
    }

    const interactionContext = this.resolveInteractionContext(metadata);
    const inputText = this.buildInputText(messages);

    try {
      const response = await fetch(this.apiEndpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${this.apiKey}`
        },
        body: JSON.stringify(payload),
        signal: AbortSignal.timeout(this.timeout)
      });

      if (!response.ok) {
        const errorBody = await response.text();
        // Thrown inside the try so the single catch below logs the failure.
        throw new Error(`OpenAI API error: ${response.status} - ${errorBody}`);
      }

      const data = await response.json();
      const outputText = String(data?.choices?.[0]?.message?.content ?? '');
      const usage = data?.usage || {};

      if (interactionContext) {
        await this.emitInteractionLog({
          ...interactionContext,
          provider: 'OpenAI',
          model: String(data?.model || this.model || ''),
          requestId: data?.id || null,
          inputText,
          outputText,
          requestPayload: payload,
          responsePayload: data,
          usagePromptTokens: this.normalizeInteger(usage?.prompt_tokens),
          usageCompletionTokens: this.normalizeInteger(usage?.completion_tokens),
          usageReasoningTokens: this.normalizeInteger(usage?.completion_tokens_details?.reasoning_tokens),
          usageCost: this.normalizeFloat(usage?.cost)
        });
      }

      return outputText;
    } catch (error) {
      // Every failure path (network, timeout, HTTP status, JSON parsing)
      // arrives here, so the failure record is emitted exactly once.
      if (interactionContext) {
        await this.emitInteractionLog(this.buildFailureLog(interactionContext, inputText, payload));
      }
      throw error;
    }
  }

  /**
   * Sends `content` as a single user message, retrying via
   * makeRequestWithRetry(). `options.parseJson` is stripped before the
   * remaining options are forwarded to makeRequest().
   *
   * @returns {Promise<string>} The model's text output.
   */
  async analyze(content, options = {}) {
    const requestOptions = { ...options };
    delete requestOptions.parseJson;

    const messages = [
      { role: 'user', content }
    ];

    return this.makeRequestWithRetry(() => this.makeRequest(messages, requestOptions));
  }
}
@@ -0,0 +1,276 @@
1
+ import { parse } from 'csv-parse/sync';
2
+
/** Header row prepended by parseEventsFromCsv when the CSV text has none. */
export const EVENT_CSV_HEADERS = [
  'csv_id',
  'event_type',
  'attributes_kv'
];

/** Header row prepended by parseNewEventTypesFromCsv when the CSV text has none. */
export const NEW_EVENT_TYPES_CSV_HEADERS = [
  'name',
  'why_webmasters_care'
];
5
+
/**
 * Percent-decodes `value`. When the input is not valid percent-encoding the
 * original value is handed back untouched instead of throwing.
 */
function safeDecodeURIComponent(value) {
  let decoded = value;
  try {
    decoded = decodeURIComponent(value);
  } catch {
    // Malformed escape sequence: keep the raw input.
  }
  return decoded;
}
13
+
/**
 * Extracts the usable text from a raw LLM response.
 *
 * When the response contains a Markdown code fence, the content of the first
 * fence is returned; otherwise the trimmed response itself is returned.
 * The fence's info string is dropped when it is `csv`, or when it is any
 * single tag followed by a line break (```text, ```plaintext, ...). Text that
 * starts right after the backticks and is not such a tag, for example a CSV
 * header like `csv_id,event_type`, is kept as content.
 *
 * @param {*} content - Raw response; falsy values yield ''.
 * @returns {string} Trimmed text.
 */
function sanitizeResponseText(content) {
  const raw = String(content || '').trim();
  if (!raw) {
    return '';
  }

  // `csv\b` refuses to eat the "csv" of a "csv_id" header; the second
  // alternative only treats a word as an info string when a newline follows.
  const fenced = raw.match(/```(?:csv\b|[\w+-]+(?=[ \t]*\r?\n))?\s*\n?([\s\S]*?)\n?```/i);
  return fenced ? fenced[1].trim() : raw;
}
27
+
/**
 * Decodes an `a=1&b=two%20words` style string into a plain object.
 *
 * Pairs are split on `&`, key and value on the first `=`. Both sides are
 * percent-decoded (malformed escapes are kept verbatim). Pairs with an empty
 * key are skipped, a pair without `=` gets the value '', and a repeated key
 * keeps its last value.
 *
 * @param {*} attributesKv - Encoded string; falsy values yield {}.
 * @returns {Object<string, string>} Decoded attributes.
 */
export function decodeAttributesKv(attributesKv) {
  const source = String(attributesKv || '').trim();
  if (!source) {
    return {};
  }

  const entries = [];
  for (const pair of source.split('&')) {
    const segment = pair.trim();
    if (!segment) {
      continue;
    }

    const equalsIndex = segment.indexOf('=');
    const hasValue = equalsIndex !== -1;
    const key = safeDecodeURIComponent(hasValue ? segment.slice(0, equalsIndex) : segment).trim();
    if (!key) {
      continue;
    }
    entries.push([key, safeDecodeURIComponent(hasValue ? segment.slice(equalsIndex + 1) : '')]);
  }

  // Object.fromEntries defines own data properties, so a key such as
  // "__proto__" is stored like any other key instead of being routed through
  // the prototype setter (which silently discards string values).
  return Object.fromEntries(entries);
}
54
+
/**
 * Canonicalizes an event type name: trimmed, lower-cased, every run of
 * characters outside [a-z0-9_] turned into a single underscore, and
 * leading/trailing underscores removed. Falsy input yields ''.
 */
function normalizeEventType(value) {
  const lowered = String(value || '').trim().toLowerCase();
  const underscored = lowered.replace(/[^a-z0-9_]+/g, '_');
  const collapsed = underscored.replace(/_+/g, '_');
  return collapsed.replace(/^_+|_+$/g, '');
}
63
+
/**
 * Splits a pipe-separated list into trimmed, non-empty, de-duplicated items,
 * keeping first-occurrence order. Falsy or blank input yields [].
 */
function normalizeReasonsPipe(value) {
  const raw = String(value || '').trim();
  if (!raw) {
    return [];
  }

  const unique = new Set();
  for (const piece of raw.split('|')) {
    const reason = String(piece || '').trim();
    if (reason) {
      unique.add(reason);
    }
  }
  return [...unique];
}
77
+
/**
 * Parses LLM-produced event CSV (columns csv_id, event_type, attributes_kv).
 *
 * The text is first unwrapped from any code fence. If its first line is not a
 * header row, EVENT_CSV_HEADERS is prepended. Header names are matched
 * case-insensitively, so `CSV_ID,Event_Type` is read the same way as the
 * lower-case form.
 *
 * @param {*} content - Raw LLM response.
 * @param {object} [options]
 * @param {{warn: Function}} [options.logger=console] - Receives invalid-row warnings.
 * @param {string} [options.sourceLabel='llm_csv'] - Label used in messages.
 * @param {boolean} [options.skipInvalidRows=true] - When false, the first invalid row throws.
 * @returns {{events: Array<{csv_id: string, event_type: string, attributes: object}>, invalidRows: number}}
 * @throws {Error} When the CSV cannot be parsed, or on an invalid row with skipInvalidRows === false.
 */
export function parseEventsFromCsv(content, options = {}) {
  const logger = options.logger || console;
  const sourceLabel = options.sourceLabel || 'llm_csv';
  const skipInvalidRows = options.skipInvalidRows !== false;

  const sanitized = sanitizeResponseText(content);
  const firstLine = sanitized.split(/\r?\n/).find((line) => line.trim().length > 0);
  if (firstLine === undefined) {
    return { events: [], invalidRows: 0 };
  }

  // Header cells are compared trimmed, unquoted and lower-cased. The same
  // normalization is applied to the column names handed to csv-parse, so a
  // header the detection accepts always yields `csv_id` / `event_type` keys.
  const normalizeHeaderCell = (cell) => String(cell ?? '')
    .trim()
    .replace(/^"(.*)"$/, '$1')
    .trim()
    .toLowerCase();
  const headerCells = firstLine.split(',').map(normalizeHeaderCell);
  const hasHeader = headerCells[0] === 'csv_id' && headerCells.includes('event_type');
  const csvText = hasHeader
    ? sanitized
    : `${EVENT_CSV_HEADERS.join(',')}\n${sanitized}`;

  let records;
  try {
    records = parse(csvText, {
      columns: (header) => header.map(normalizeHeaderCell),
      skip_empty_lines: true,
      trim: true,
      relax_column_count: true,
      relax_quotes: true
    });
  } catch (error) {
    throw new Error(`Failed to parse CSV content from ${sourceLabel}: ${error.message}`);
  }

  const events = [];
  let invalidRows = 0;

  records.forEach((record, index) => {
    // +2: one for the header row, one for 1-based numbering.
    const rowNumber = index + 2;
    const csvId = String(record.csv_id ?? '').trim();
    const eventType = normalizeEventType(record.event_type);

    if (!csvId || !eventType) {
      invalidRows++;
      const message = `[LLM][CSV] Invalid row ${rowNumber} in ${sourceLabel}: missing csv_id or event_type`;
      if (!skipInvalidRows) {
        throw new Error(message);
      }
      logger.warn(message);
      return;
    }

    let attributes;
    try {
      attributes = decodeAttributesKv(record.attributes_kv);
    } catch (error) {
      // Defensive: a decoding failure invalidates only this row.
      invalidRows++;
      const message = `[LLM][CSV] Invalid attributes_kv in row ${rowNumber} (${sourceLabel}): ${error.message}`;
      if (!skipInvalidRows) {
        throw new Error(message);
      }
      logger.warn(message);
      return;
    }

    events.push({
      csv_id: csvId,
      event_type: eventType,
      attributes
    });
  });

  return { events, invalidRows };
}
151
+
/**
 * Parses LLM-produced "new event types" CSV (columns name, why_webmasters_care).
 *
 * The text is first unwrapped from any code fence. If its first line is not a
 * header row, NEW_EVENT_TYPES_CSV_HEADERS is prepended. Header names are
 * matched case-insensitively. Rows with the same normalized name are merged
 * and their pipe-separated reasons are unioned.
 *
 * @param {*} content - Raw LLM response.
 * @param {object} [options]
 * @param {{warn: Function}} [options.logger=console] - Receives invalid-row warnings.
 * @param {string} [options.sourceLabel='llm_new_event_types_csv'] - Label used in messages.
 * @param {boolean} [options.skipInvalidRows=true] - When false, the first invalid row throws.
 * @returns {{newEventTypes: Array<{name: string, why_webmasters_care: string[]}>, invalidRows: number}}
 * @throws {Error} When the CSV cannot be parsed, or on an invalid row with skipInvalidRows === false.
 */
export function parseNewEventTypesFromCsv(content, options = {}) {
  const logger = options.logger || console;
  const sourceLabel = options.sourceLabel || 'llm_new_event_types_csv';
  const skipInvalidRows = options.skipInvalidRows !== false;

  const sanitized = sanitizeResponseText(content);
  const firstLine = sanitized.split(/\r?\n/).find((line) => line.trim().length > 0);
  if (firstLine === undefined) {
    return { newEventTypes: [], invalidRows: 0 };
  }

  // Header cells are compared trimmed, unquoted and lower-cased. The same
  // normalization is applied to the column names handed to csv-parse, so a
  // header the detection accepts always yields `name` / `why_webmasters_care`.
  const normalizeHeaderCell = (cell) => String(cell ?? '')
    .trim()
    .replace(/^"(.*)"$/, '$1')
    .trim()
    .toLowerCase();
  const headerCells = firstLine.split(',').map(normalizeHeaderCell);
  const hasHeader = headerCells[0] === 'name' && headerCells.includes('why_webmasters_care');
  const csvText = hasHeader
    ? sanitized
    : `${NEW_EVENT_TYPES_CSV_HEADERS.join(',')}\n${sanitized}`;

  let records;
  try {
    records = parse(csvText, {
      columns: (header) => header.map(normalizeHeaderCell),
      skip_empty_lines: true,
      trim: true,
      relax_column_count: true,
      relax_quotes: true
    });
  } catch (error) {
    throw new Error(`Failed to parse CSV content from ${sourceLabel}: ${error.message}`);
  }

  const byName = new Map();
  let invalidRows = 0;

  records.forEach((record, index) => {
    // +2: one for the header row, one for 1-based numbering.
    const rowNumber = index + 2;
    const name = normalizeEventType(record.name);
    if (!name) {
      invalidRows++;
      const message = `[LLM][CSV] Invalid row ${rowNumber} in ${sourceLabel}: missing name`;
      if (!skipInvalidRows) {
        throw new Error(message);
      }
      logger.warn(message);
      return;
    }

    const reasons = normalizeReasonsPipe(record.why_webmasters_care);
    const existing = byName.get(name);
    if (existing) {
      // Same event type listed twice: union the reasons, first-seen order.
      existing.why_webmasters_care = Array.from(new Set([
        ...existing.why_webmasters_care,
        ...reasons
      ]));
      return;
    }

    byName.set(name, {
      name,
      why_webmasters_care: reasons
    });
  });

  return {
    newEventTypes: Array.from(byName.values()),
    invalidRows
  };
}
222
+
/**
 * Summarizes parsed events by normalized event type.
 *
 * @param {Array<{event_type: *, csv_id: *}>} events - Non-array input is treated as empty.
 * @param {Array<string>} [knownEventTypes=[]] - Already-known type names; compared after normalization.
 * @returns {{eventTypesSummary: Array<object>, newEventTypes: Array<object>}}
 *   `eventTypesSummary` has one entry per type with its unique csv ids in
 *   first-seen order. `newEventTypes` lists the types absent from
 *   `knownEventTypes`; when at least one known type was supplied, each entry
 *   also carries `not_in_known_event_types_because`.
 */
export function deriveEventTypeMetadata(events, knownEventTypes = []) {
  const knownSet = new Set(
    (Array.isArray(knownEventTypes) ? knownEventTypes : [])
      .map((value) => normalizeEventType(value))
      .filter(Boolean)
  );

  // type -> unique csv ids, both in first-seen order.
  const csvIdsByType = new Map();
  for (const event of Array.isArray(events) ? events : []) {
    const type = normalizeEventType(event?.event_type);
    if (!type) {
      continue;
    }
    if (!csvIdsByType.has(type)) {
      csvIdsByType.set(type, new Set());
    }
    const csvId = String(event?.csv_id ?? '').trim();
    if (csvId) {
      csvIdsByType.get(type).add(csvId);
    }
  }

  const eventTypesSummary = [];
  const newEventTypes = [];

  for (const [name, csvIds] of csvIdsByType) {
    // Each entry gets its own reasons array; previously the summary entry and
    // the new-type entry shared one array, so filling in one changed both.
    eventTypesSummary.push({
      name,
      covered_csv_ids: [...csvIds],
      why_webmasters_care: []
    });

    if (!knownSet.has(name)) {
      const item = {
        name,
        why_webmasters_care: []
      };
      if (knownSet.size > 0) {
        item.not_in_known_event_types_because = `event_type "${name}" was not found in known_event_types`;
      }
      newEventTypes.push(item);
    }
  }

  return {
    eventTypesSummary,
    newEventTypes
  };
}
@@ -0,0 +1,44 @@
/**
 * Represents element context information.
 */
export class Context {
  /** Creates an empty context with every field at its default. */
  constructor() {
    this.contextVersion = 2;
    this.parentTag = '';
    this.parentText = '';
    this.ancestorTrail = [];
    this.cssSelector = '';
    this.selectorCandidates = [];
    this.nearbyText = [];
    this.semanticAnchors = {
      heading: null,
      imageAlt: null
    };
    this.containerMeta = {
      cssSelector: '',
      attributes: {}
    };
  }

  /**
   * Builds a Context from a plain object.
   *
   * All own properties of `obj` are copied onto the instance. The list and
   * nested-object fields are then normalized so a partial or malformed input
   * cannot erase their defaults: lists are copied (non-arrays become []), and
   * `semanticAnchors` / `containerMeta` are merged over their default shape.
   *
   * @param {object} [obj] - Source data; non-objects yield a default Context.
   * @returns {Context}
   */
  static fromObject(obj) {
    const context = new Context();
    if (!obj || typeof obj !== 'object') {
      return context;
    }

    const defaults = new Context();
    const asObject = (value) => (value && typeof value === 'object' ? value : {});
    Object.assign(context, obj);

    // Copy the lists so the Context does not share arrays with `obj`.
    for (const key of ['ancestorTrail', 'selectorCandidates', 'nearbyText']) {
      context[key] = Array.isArray(obj[key]) ? [...obj[key]] : [];
    }

    context.semanticAnchors = {
      ...defaults.semanticAnchors,
      ...asObject(obj.semanticAnchors)
    };

    const containerMeta = asObject(obj.containerMeta);
    context.containerMeta = {
      ...defaults.containerMeta,
      ...containerMeta,
      attributes: { ...asObject(containerMeta.attributes) }
    };

    return context;
  }

  /**
   * Serializes the known fields. `contextVersion` is always written as 2,
   * whatever value the instance currently holds.
   */
  toJSON() {
    return {
      contextVersion: 2,
      parentTag: this.parentTag,
      parentText: this.parentText,
      ancestorTrail: this.ancestorTrail,
      cssSelector: this.cssSelector,
      selectorCandidates: this.selectorCandidates,
      nearbyText: this.nearbyText,
      semanticAnchors: this.semanticAnchors,
      containerMeta: this.containerMeta
    };
  }
}
package/package.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "name": "page-analyzer",
3
+ "version": "1.0.0",
4
+ "type": "module",
5
+ "description": "Standalone page analysis module.",
6
+ "main": "index.js",
7
+ "scripts": {
8
+ "test": "node test.js",
9
+ "analyze": "node test.js"
10
+ },
11
+ "dependencies": {
12
+ "cheerio": "^1.2.0",
13
+ "csv-parse": "^5.6.0",
14
+ "playwright": "^1.58.2"
15
+ }
16
+ }