page-analyzer 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/csv-exporter.js +192 -0
- package/extractors/block-assigner.js +281 -0
- package/extractors/context-extractor.js +275 -0
- package/extractors/css-selector-builder.js +202 -0
- package/extractors/pt-selector-builder.js +344 -0
- package/html-parser.js +206 -0
- package/index.js +303 -0
- package/llm/analyzers/event-analyzer/event-analyzer-blocks.js +553 -0
- package/llm/analyzers/event-analyzer/event-analyzer-constants.js +22 -0
- package/llm/analyzers/event-analyzer/event-analyzer-events.js +97 -0
- package/llm/analyzers/event-analyzer/event-analyzer-input.js +168 -0
- package/llm/analyzers/event-analyzer/event-analyzer-metadata.js +15 -0
- package/llm/analyzers/event-analyzer/event-analyzer-prompt.js +71 -0
- package/llm/analyzers/event-analyzer/event-analyzer-response.js +290 -0
- package/llm/analyzers/event-analyzer/event-analyzer-utils.js +96 -0
- package/llm/analyzers/event-analyzer/event-analyzer.js +546 -0
- package/llm/analyzers/prompts/event-analysis.txt +52 -0
- package/llm/analyzers/prompts/special-block-confirmation.txt +127 -0
- package/llm/providers/base-provider.js +64 -0
- package/llm/providers/openai-provider.js +168 -0
- package/llm/utils/event-csv.js +276 -0
- package/models/context.js +44 -0
- package/package.json +16 -0
- package/page-extractor.js +215 -0
- package/utils/selector-utils.js +31 -0
- package/utils/text-utils.js +11 -0
- package/utils/url-utils.js +43 -0
- package/vendor/extract-blocks.js +903 -0
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
You are a strict web layout analyzer.
|
|
2
|
+
|
|
3
|
+
Task:
|
|
4
|
+
From blocks_index.csv, output:
|
|
5
|
+
1. one short site summary
|
|
6
|
+
2. logical UI blocks
|
|
7
|
+
3. possible click events
|
|
8
|
+
4. an optional second CSV only when a merged block with the same blockName still contains clearly different meanings
|
|
9
|
+
|
|
10
|
+
Be conservative. Do not guess without clear signals.
|
|
11
|
+
|
|
12
|
+
Input columns:
|
|
13
|
+
blockIdx,branchPath,depth,domOrder,tag,fixed,top,left,width,height,blockCssPath,blockPosition,textPreview,childInteractiveCount
|
|
14
|
+
|
|
15
|
+
Use these signals together:
|
|
16
|
+
- branchPath prefix
|
|
17
|
+
- depth
|
|
18
|
+
- domOrder
|
|
19
|
+
- top/left/width/height
|
|
20
|
+
- textPreview continuity
|
|
21
|
+
|
|
22
|
+
Block names:
|
|
23
|
+
Header, Footer, SideNavigation, Hero, TopNavigation, ContentSection, ArticleList, ProductGrid, FeatureCards, CTASection, Form, SearchBar, ImageGallery, ContentBlock
|
|
24
|
+
|
|
25
|
+
Merge only when most signals align:
|
|
26
|
+
- similar branchPath prefix
|
|
27
|
+
- close domOrder
|
|
28
|
+
- similar depth
|
|
29
|
+
- small vertical gap
|
|
30
|
+
- similar left and width
|
|
31
|
+
- continuous or related textPreview
|
|
32
|
+
|
|
33
|
+
Do not merge when:
|
|
34
|
+
- different columns
|
|
35
|
+
- large vertical gaps
|
|
36
|
+
- unrelated branchPath
|
|
37
|
+
- clearly different meanings
|
|
38
|
+
|
|
39
|
+
Events:
|
|
40
|
+
Use click-based snake_case labels only (see the examples below); when a block has several events, join the labels with dots.
|
|
41
|
+
|
|
42
|
+
Examples: logo_click nav_click article_click product_click card_click image_click cta_click form_submit_click search_click item_click
|
|
43
|
+
|
|
44
|
+
Rules:
|
|
45
|
+
- infer from textPreview, blockName, and childInteractiveCount
|
|
46
|
+
- if childInteractiveCount = 0, leave events empty
|
|
47
|
+
- if childInteractiveCount > 0, output all possible events.
|
|
48
|
+
- if unclear, use item_click
|
|
49
|
+
- any area may still have click events
|
|
50
|
+
|
|
51
|
+
Second CSV:
|
|
52
|
+
Output only if one merged first-CSV block:
|
|
53
|
+
- contains multiple original blocks
|
|
54
|
+
- has the same high-level blockName
|
|
55
|
+
- still contains clearly different internal meanings worth splitting
|
|
56
|
+
|
|
57
|
+
Use one short semantic label per split group.
|
|
58
|
+
|
|
59
|
+
Examples: product_info review faq testimonial promo signup download comparison filter_bar sort_bar pagination breadcrumb
|
|
60
|
+
|
|
61
|
+
Prefer the single most representative label. Do not assign both a parent label and its sub-aspect.
|
|
62
|
+
|
|
63
|
+
Output format:
|
|
64
|
+
Plain CSV only.
|
|
65
|
+
|
|
66
|
+
First CSV:
|
|
67
|
+
Line 1: site summary sentence
|
|
68
|
+
Line 2: blockIdxs,blockName,blockDescription,blockPossibleEvents
|
|
69
|
+
Line 3+: one row per logical block
|
|
70
|
+
|
|
71
|
+
Rules:
|
|
72
|
+
- blockIdxs must be dot-separated and sorted ascending
|
|
73
|
+
- blockDescription must be a concise LLM-written description of the logical UI block, based on the block text, role, and layout signals
|
|
74
|
+
- blockDescription must not contain commas
|
|
75
|
+
- blockPossibleEvents must be dot-separated
|
|
76
|
+
- no extra columns beyond blockIdxs, blockName, blockDescription, blockPossibleEvents
|
|
77
|
+
- no markdown
|
|
78
|
+
- no explanations
|
|
79
|
+
- each original blockIdx must appear exactly once in the first CSV
|
|
80
|
+
|
|
81
|
+
Optional second CSV:
|
|
82
|
+
If needed, add a separator line exactly as:
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
Then output:
|
|
86
|
+
blockIdxs,blockSemantic
|
|
87
|
+
|
|
88
|
+
Rules:
|
|
89
|
+
- group nearby sub-blocks with the same meaning when possible
|
|
90
|
+
- use compact indexes such as 1.2, 3.4, 5
|
|
91
|
+
- each blockIdx here must belong to a merged block from the first CSV
|
|
92
|
+
- do not repeat the same blockIdx across rows
|
|
93
|
+
- do not output this CSV if not needed
|
|
94
|
+
|
|
95
|
+
Example 1:
|
|
96
|
+
|
|
97
|
+
```text
|
|
98
|
+
Marketing page with header, hero, feature cards, and footer
|
|
99
|
+
blockIdxs,blockName,blockDescription,blockPossibleEvents
|
|
100
|
+
0,Header,Global header with brand and navigation links,logo_click.nav_click
|
|
101
|
+
1.2,Hero,Hero area introducing the offer with a primary CTA,cta_click
|
|
102
|
+
3.4.5,FeatureCards,Feature card grid describing product benefits,card_click
|
|
103
|
+
6,CTASection,Call-to-action section prompting conversion,cta_click
|
|
104
|
+
7,Footer,Footer with secondary navigation links,nav_click
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Example 2:
|
|
108
|
+
|
|
109
|
+
```text
|
|
110
|
+
Product page with navigation, product content, reviews, and footer
|
|
111
|
+
blockIdxs,blockName,blockDescription,blockPossibleEvents
|
|
112
|
+
0,Header,Global header with logo and navigation,logo_click.nav_click
|
|
113
|
+
1.2.3.4.5,ContentBlock,Product detail content with reviews and FAQ links,product_click.cta_click
|
|
114
|
+
6,CTASection,Conversion CTA section for the product,cta_click
|
|
115
|
+
7,Footer,Footer with site navigation,nav_click
|
|
116
|
+
---
|
|
117
|
+
blockIdxs,blockSemantic
|
|
118
|
+
1.2,product_info
|
|
119
|
+
3.4,review
|
|
120
|
+
5,faq
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Input:
|
|
124
|
+
blocks_index.csv
|
|
125
|
+
```csv
|
|
126
|
+
<<<BLOCKS_INDEX_CSV>>>
|
|
127
|
+
```
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
 * Base class for LLM providers.
 * Defines the interface that all LLM providers must implement and supplies
 * shared retry and interaction-logging helpers.
 */
export class BaseLlmProvider {
  /**
   * @param {object} [config]
   * @param {string} [config.apiKey] - Credential checked by validateConfig().
   * @param {string} [config.model] - Model identifier checked by validateConfig().
   * @param {Function} [config.interactionLogger] - Optional callback invoked by
   *   emitInteractionLog(); anything that is not a function is ignored.
   * @param {number} [config.timeout=600000] - Request timeout in milliseconds;
   *   values that are not positive and finite fall back to the default.
   * @param {number} [config.maxRetries=3] - Maximum number of attempts; values
   *   below 1 (or non-numeric) fall back to the default, fractions are floored.
   * @param {number} [config.maxTokens] - Positive integer token limit, else null.
   */
  constructor(config = {}) {
    this.config = config;
    this.apiKey = config.apiKey;
    this.model = config.model;
    this.interactionLogger = typeof config.interactionLogger === 'function'
      ? config.interactionLogger
      : null;

    // A plain `||` default would let negative or infinite values through.
    const timeout = Number(config.timeout);
    this.timeout = Number.isFinite(timeout) && timeout > 0 ? timeout : 600000;

    // Anything below one attempt would make makeRequestWithRetry() skip the
    // request entirely, so treat it as "not configured".
    const retries = Number(config.maxRetries);
    this.maxRetries = retries >= 1 ? Math.floor(retries) : 3;

    this.maxTokens = Number.isInteger(config.maxTokens) && config.maxTokens > 0
      ? config.maxTokens
      : null;
  }

  /**
   * Abstract entry point; subclasses must override it.
   *
   * @throws {Error} Always, in this base implementation.
   */
  async analyze(content, options = {}) {
    throw new Error('analyze() must be implemented by subclass');
  }

  /**
   * Call `requestFn` until it resolves or the attempt budget is spent.
   * Waits 1s, 2s, 4s, ... (capped at 10s) between failed attempts.
   *
   * @param {Function} requestFn - Async function performing one attempt.
   * @returns {Promise<*>} Whatever `requestFn` resolves to.
   * @throws The error raised by the final failed attempt.
   */
  async makeRequestWithRetry(requestFn) {
    // this.maxRetries may have been reassigned after construction; always make
    // at least one attempt so a real error (never `undefined`) is thrown.
    const configured = Number(this.maxRetries);
    const maxAttempts = configured >= 1 ? Math.floor(configured) : 1;
    let lastError;

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        return await requestFn();
      } catch (err) {
        lastError = err;
        // `err` may be a non-Error throwable (string, null, ...).
        console.warn(`LLM request attempt ${attempt} failed: ${err?.message ?? err}`);

        if (attempt < maxAttempts) {
          const delay = Math.min(1000 * 2 ** (attempt - 1), 10000);
          await new Promise((resolve) => setTimeout(resolve, delay));
        }
      }
    }

    throw lastError;
  }

  /**
   * Forward `payload` to the configured interaction logger, if there is one.
   * Best effort: a failing logger is reported with console.warn and never
   * rejects this call.
   *
   * @param {object} [payload]
   * @returns {Promise<void>}
   */
  async emitInteractionLog(payload = {}) {
    if (!this.interactionLogger) {
      return;
    }

    try {
      await this.interactionLogger(payload);
    } catch (error) {
      console.warn(`[LLM] Failed to persist interaction log: ${error?.message ?? error}`);
    }
  }

  /**
   * Ensure the mandatory settings are present.
   *
   * @throws {Error} When `apiKey` or `model` is missing.
   */
  validateConfig() {
    if (!this.apiKey) {
      throw new Error('API key is required');
    }
    if (!this.model) {
      throw new Error('Model is required');
    }
  }
}
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
import { BaseLlmProvider } from './base-provider.js';
|
|
2
|
+
|
|
3
|
+
/**
 * OpenAI API provider implementation.
 *
 * Posts chat-style payloads to the configured endpoint using a Bearer token.
 * When the request metadata carries both `domain` and `nodeId`, every attempt
 * (successful or not) is reported through `this.emitInteractionLog`.
 */
export class OpenAiProvider extends BaseLlmProvider {
  /**
   * @param {object} [config] - Provider settings, also passed to the base class.
   * @throws {Error} When `apiEndpoint` is missing or validateConfig() rejects.
   */
  constructor(config = {}) {
    super(config);

    const { apiEndpoint, temperature } = config;
    this.apiEndpoint = apiEndpoint;
    // Anything that is not a number falls back to 0.
    this.temperature = typeof temperature === 'number' ? temperature : 0;

    if (!this.apiEndpoint) {
      throw new Error('apiEndpoint is required');
    }
    this.validateConfig();
  }

  /**
   * Flatten a message list into one human-readable string for logging.
   *
   * @param {Array<{role?: string, content?: *}>|*} messages
   * @returns {string} `[n] role: content` entries separated by blank lines, or
   *   the stringified input when it is not an array.
   */
  buildInputText(messages) {
    if (Array.isArray(messages)) {
      const describe = (message, position) => {
        const role = String(message?.role || 'unknown').trim() || 'unknown';
        const content = String(message?.content ?? '');
        return `[${position + 1}] ${role}: ${content}`;
      };
      return messages.map(describe).join('\n\n');
    }

    return String(messages ?? '');
  }

  /**
   * @param {*} value
   * @returns {number|null} Base-10 integer parsed from `value`, or null.
   */
  normalizeInteger(value) {
    const asInteger = Number.parseInt(value, 10);
    if (!Number.isFinite(asInteger)) {
      return null;
    }
    return asInteger;
  }

  /**
   * @param {*} value
   * @returns {number|null} Float parsed from `value`, or null.
   */
  normalizeFloat(value) {
    const asFloat = Number.parseFloat(value);
    if (!Number.isFinite(asFloat)) {
      return null;
    }
    return asFloat;
  }

  /**
   * Extract the fields that identify an interaction-log entry.
   *
   * @param {*} metadata
   * @returns {{domain: string, nodeId: string, operation: string, chunkLabel: (string|null)}|null}
   *   null unless both `domain` and `nodeId` are non-blank.
   */
  resolveInteractionContext(metadata) {
    const source = metadata && typeof metadata === 'object' ? metadata : {};
    const clean = (value) => String(value || '').trim();

    const domain = clean(source.domain);
    const nodeId = clean(source.nodeId);
    if (!(domain && nodeId)) {
      return null;
    }

    const operation = clean(source.operation || 'analysis') || 'analysis';
    const chunkLabel = clean(source.chunkLabel) || null;
    return { domain, nodeId, operation, chunkLabel };
  }

  /**
   * Perform a single request (no retries).
   *
   * @param {Array} messages - Chat messages placed in the payload as-is.
   * @param {object} [options] - Extra payload fields; `metadata` is removed
   *   from the payload and used only for interaction logging.
   * @returns {Promise<string>} Content of the first choice, or '' if absent.
   * @throws {Error} On a non-OK HTTP status; fetch/JSON errors are rethrown.
   */
  async makeRequest(messages, options = {}) {
    const requestOptions = options && typeof options === 'object' ? { ...options } : {};
    const rawMetadata = requestOptions.metadata;
    delete requestOptions.metadata;
    const metadata = rawMetadata && typeof rawMetadata === 'object' ? rawMetadata : {};

    const payload = {
      model: this.model,
      messages,
      // 'gpt-5-mini' is always sent temperature 1; presumably that model
      // rejects other values — TODO confirm.
      temperature: this.model === 'gpt-5-mini' ? 1 : this.temperature,
      // Caller-supplied options win over the defaults above.
      ...requestOptions
    };
    const hasDefaultTokenLimit = Number.isInteger(this.maxTokens) && this.maxTokens > 0;
    if (payload.max_tokens === undefined && hasDefaultTokenLimit) {
      payload.max_tokens = this.maxTokens;
    }

    const interactionContext = this.resolveInteractionContext(metadata);
    const inputText = this.buildInputText(messages);

    // Log entry used for every failed attempt: no response data is recorded.
    const failureEntry = () => ({
      ...interactionContext,
      provider: 'OpenAI',
      model: this.model,
      requestId: null,
      inputText,
      outputText: null,
      requestPayload: payload,
      responsePayload: null,
      usagePromptTokens: null,
      usageCompletionTokens: null,
      usageReasoningTokens: null,
      usageCost: null
    });

    // Prevents the catch block from logging an HTTP failure a second time.
    let failureLogged = false;

    try {
      const response = await fetch(this.apiEndpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${this.apiKey}`
        },
        body: JSON.stringify(payload),
        signal: AbortSignal.timeout(this.timeout)
      });

      if (!response.ok) {
        const errorBody = await response.text();

        if (interactionContext) {
          await this.emitInteractionLog(failureEntry());
          failureLogged = true;
        }

        throw new Error(`OpenAI API error: ${response.status} - ${errorBody}`);
      }

      const data = await response.json();
      const outputText = String(data?.choices?.[0]?.message?.content ?? '');
      const usage = data?.usage || {};

      if (interactionContext) {
        await this.emitInteractionLog({
          ...interactionContext,
          provider: 'OpenAI',
          model: String(data?.model || this.model || ''),
          requestId: data?.id || null,
          inputText,
          outputText,
          requestPayload: payload,
          responsePayload: data,
          usagePromptTokens: this.normalizeInteger(usage?.prompt_tokens),
          usageCompletionTokens: this.normalizeInteger(usage?.completion_tokens),
          usageReasoningTokens: this.normalizeInteger(usage?.completion_tokens_details?.reasoning_tokens),
          usageCost: this.normalizeFloat(usage?.cost)
        });
      }

      return outputText;
    } catch (error) {
      if (interactionContext && !failureLogged) {
        await this.emitInteractionLog(failureEntry());
      }
      throw error;
    }
  }

  /**
   * Send `content` as a single user message, retrying via makeRequestWithRetry.
   *
   * @param {*} content - Body of the user message.
   * @param {object} [options] - Forwarded to makeRequest() minus `parseJson`.
   * @returns {Promise<string>} The model's text output.
   */
  async analyze(content, options = {}) {
    // `parseJson` is dropped here so it never reaches the request payload.
    const requestOptions = { ...options };
    delete requestOptions.parseJson;

    const messages = [{ role: 'user', content }];
    const attempt = () => this.makeRequest(messages, requestOptions);

    return this.makeRequestWithRetry(attempt);
  }
}
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
import { parse } from 'csv-parse/sync';
|
|
2
|
+
|
|
3
|
+
// Column names of the event CSV. parseEventsFromCsv prepends them as a header
// row when the text it receives does not start with one.
export const EVENT_CSV_HEADERS = ['csv_id', 'event_type', 'attributes_kv'];
|
|
4
|
+
// Column names of the new-event-types CSV. parseNewEventTypesFromCsv prepends
// them as a header row when the text it receives does not start with one.
export const NEW_EVENT_TYPES_CSV_HEADERS = ['name', 'why_webmasters_care'];
|
|
5
|
+
|
|
6
|
+
/**
 * Percent-decode `value`, tolerating malformed escape sequences.
 *
 * @param {string} value
 * @returns {string} The decoded text, or `value` itself when decoding throws.
 */
function safeDecodeURIComponent(value) {
  let decoded = value;
  try {
    decoded = decodeURIComponent(value);
  } catch {
    // Malformed input: keep the original value.
  }
  return decoded;
}
|
|
13
|
+
|
|
14
|
+
/**
 * Reduce a raw LLM response to its payload text.
 *
 * If the response contains a Markdown code fence, the content of the first
 * fenced block is returned; otherwise the whole response is returned. Both
 * are trimmed.
 *
 * @param {*} content - Raw response; falsy values are treated as empty.
 * @returns {string} The payload text, or '' when there is none.
 */
function sanitizeResponseText(content) {
  const raw = String(content || '').trim();
  if (!raw) {
    return '';
  }

  // The opening fence may carry a language tag. `csv` is accepted as before
  // (now only as a whole word, so a `csv_id,...` header written right after
  // the backticks is not truncated); any other tag such as `text` is accepted
  // only when it ends the fence line, so it no longer leaks into the payload.
  const fence = /```(?:csv\b|[\w+-]+(?=[ \t]*\r?\n))?\s*\n?([\s\S]*?)\n?```/i;
  const fenced = raw.match(fence);

  return fenced ? fenced[1].trim() : raw;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Decode a `key=value&key=value` string into a plain object.
 *
 * Keys and values are percent-decoded with safeDecodeURIComponent. Segments
 * that are blank or whose decoded key is blank are skipped; a segment without
 * `=` yields an empty-string value; a repeated key keeps its last value.
 *
 * @param {*} attributesKv - Encoded attribute string; falsy means "none".
 * @returns {Object<string, string>} Decoded attributes (possibly empty).
 */
export function decodeAttributesKv(attributesKv) {
  const attributes = {};

  const source = String(attributesKv || '').trim();
  if (source.length === 0) {
    return attributes;
  }

  source.split('&').forEach((pair) => {
    const segment = pair.trim();
    if (segment.length === 0) {
      return;
    }

    // Only the first '=' separates key from value; later ones stay in the value.
    const [keyPart, ...valuePieces] = segment.split('=');
    const key = safeDecodeURIComponent(keyPart).trim();
    if (key.length === 0) {
      return;
    }

    attributes[key] = safeDecodeURIComponent(valuePieces.join('='));
  });

  return attributes;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Canonicalise an event-type label.
 *
 * The value is trimmed and lower-cased, then its runs of `a-z0-9` characters
 * are joined with single underscores (everything else acts as a separator).
 *
 * @param {*} value - Raw label; falsy values are treated as empty.
 * @returns {string} The normalised label, or '' when nothing usable remains.
 */
function normalizeEventType(value) {
  const lowered = String(value || '').trim().toLowerCase();
  const words = lowered.split(/[^a-z0-9]+/).filter(Boolean);
  return words.join('_');
}
|
|
63
|
+
|
|
64
|
+
/**
 * Split a pipe-separated list into unique, trimmed, non-empty entries.
 *
 * @param {*} value - Text such as `"a | b | a"`; falsy values are treated as empty.
 * @returns {string[]} Entries in order of first appearance.
 */
function normalizeReasonsPipe(value) {
  const raw = String(value || '').trim();
  const unique = new Set();

  if (raw) {
    for (const piece of raw.split('|')) {
      const reason = piece.trim();
      if (reason) {
        unique.add(reason);
      }
    }
  }

  return [...unique];
}
|
|
77
|
+
|
|
78
|
+
/**
 * Parse the event CSV produced by the LLM.
 *
 * The text is first passed through sanitizeResponseText. If its first
 * non-blank line is not a `csv_id,...event_type...` header, the
 * EVENT_CSV_HEADERS row is prepended before parsing.
 *
 * @param {*} content - Raw LLM response.
 * @param {object} [options]
 * @param {{warn: Function}} [options.logger=console] - Receives skipped-row warnings.
 * @param {string} [options.sourceLabel='llm_csv'] - Name used in messages.
 * @param {boolean} [options.skipInvalidRows=true] - Only an explicit `false`
 *   turns an invalid row into a thrown error instead of a warning.
 * @returns {{events: Array<{csv_id: string, event_type: string, attributes: object}>, invalidRows: number}}
 * @throws {Error} When the CSV cannot be parsed, or on the first invalid row
 *   when `skipInvalidRows` is false.
 */
export function parseEventsFromCsv(content, options = {}) {
  const logger = options.logger || console;
  const sourceLabel = options.sourceLabel || 'llm_csv';
  const strict = options.skipInvalidRows === false;

  const text = sanitizeResponseText(content);
  if (!text) {
    return { events: [], invalidRows: 0 };
  }

  const firstContentLine = text.split(/\r?\n/).find((line) => line.trim().length > 0);
  if (firstContentLine === undefined) {
    return { events: [], invalidRows: 0 };
  }

  const headerProbe = firstContentLine.trim().toLowerCase();
  const headerPresent = headerProbe.startsWith('csv_id,') && headerProbe.includes('event_type');
  const csvText = headerPresent ? text : `${EVENT_CSV_HEADERS.join(',')}\n${text}`;

  let records;
  try {
    records = parse(csvText, {
      columns: true,
      skip_empty_lines: true,
      trim: true,
      relax_column_count: true,
      relax_quotes: true
    });
  } catch (error) {
    throw new Error(`Failed to parse CSV content from ${sourceLabel}: ${error.message}`);
  }

  const events = [];
  let invalidRows = 0;

  // Count the row as invalid, then either abort (strict) or warn and move on.
  const rejectRow = (message) => {
    invalidRows += 1;
    if (strict) {
      throw new Error(message);
    }
    logger.warn(message);
  };

  for (const [index, record] of records.entries()) {
    // Messages count the header line as row 1, so the first record is row 2.
    const rowNumber = index + 2;
    const csvId = String(record.csv_id ?? '').trim();
    const eventType = normalizeEventType(record.event_type);

    if (!(csvId && eventType)) {
      rejectRow(`[LLM][CSV] Invalid row ${rowNumber} in ${sourceLabel}: missing csv_id or event_type`);
      continue;
    }

    let attributes;
    try {
      attributes = decodeAttributesKv(record.attributes_kv);
    } catch (error) {
      rejectRow(`[LLM][CSV] Invalid attributes_kv in row ${rowNumber} (${sourceLabel}): ${error.message}`);
      continue;
    }

    events.push({ csv_id: csvId, event_type: eventType, attributes });
  }

  return { events, invalidRows };
}
|
|
151
|
+
|
|
152
|
+
/**
 * Parse the "new event types" CSV produced by the LLM.
 *
 * The text is first passed through sanitizeResponseText. If its first
 * non-blank line is not a `name,...why_webmasters_care...` header, the
 * NEW_EVENT_TYPES_CSV_HEADERS row is prepended before parsing. Rows whose
 * normalised name matches an earlier row are merged into it, with their
 * reasons unioned.
 *
 * @param {*} content - Raw LLM response.
 * @param {object} [options]
 * @param {{warn: Function}} [options.logger=console] - Receives skipped-row warnings.
 * @param {string} [options.sourceLabel='llm_new_event_types_csv'] - Name used in messages.
 * @param {boolean} [options.skipInvalidRows=true] - Only an explicit `false`
 *   turns an invalid row into a thrown error instead of a warning.
 * @returns {{newEventTypes: Array<{name: string, why_webmasters_care: string[]}>, invalidRows: number}}
 * @throws {Error} When the CSV cannot be parsed, or on the first invalid row
 *   when `skipInvalidRows` is false.
 */
export function parseNewEventTypesFromCsv(content, options = {}) {
  const logger = options.logger || console;
  const sourceLabel = options.sourceLabel || 'llm_new_event_types_csv';
  const strict = options.skipInvalidRows === false;

  const text = sanitizeResponseText(content);
  if (!text) {
    return { newEventTypes: [], invalidRows: 0 };
  }

  const firstContentLine = text.split(/\r?\n/).find((line) => line.trim().length > 0);
  if (firstContentLine === undefined) {
    return { newEventTypes: [], invalidRows: 0 };
  }

  const headerProbe = firstContentLine.trim().toLowerCase();
  const headerPresent = headerProbe.startsWith('name,') && headerProbe.includes('why_webmasters_care');
  const csvText = headerPresent ? text : `${NEW_EVENT_TYPES_CSV_HEADERS.join(',')}\n${text}`;

  let records;
  try {
    records = parse(csvText, {
      columns: true,
      skip_empty_lines: true,
      trim: true,
      relax_column_count: true,
      relax_quotes: true
    });
  } catch (error) {
    throw new Error(`Failed to parse CSV content from ${sourceLabel}: ${error.message}`);
  }

  // Keyed by normalised name; Map iteration keeps first-seen order.
  const byName = new Map();
  let invalidRows = 0;

  for (const [index, record] of records.entries()) {
    // Messages count the header line as row 1, so the first record is row 2.
    const rowNumber = index + 2;
    const name = normalizeEventType(record.name);

    if (!name) {
      invalidRows += 1;
      const message = `[LLM][CSV] Invalid row ${rowNumber} in ${sourceLabel}: missing name`;
      if (strict) {
        throw new Error(message);
      }
      logger.warn(message);
      continue;
    }

    const reasons = normalizeReasonsPipe(record.why_webmasters_care);
    const existing = byName.get(name);

    if (existing) {
      existing.why_webmasters_care = [...new Set([...existing.why_webmasters_care, ...reasons])];
    } else {
      byName.set(name, { name, why_webmasters_care: reasons });
    }
  }

  return {
    newEventTypes: [...byName.values()],
    invalidRows
  };
}
|
|
222
|
+
|
|
223
|
+
/**
 * Summarise parsed events by event type and flag the types not already known.
 *
 * Event types and known types are compared after normalizeEventType. Events
 * whose type normalises to '' are ignored.
 *
 * @param {Array<{event_type?: *, csv_id?: *}>} events - Non-arrays are treated as empty.
 * @param {string[]} [knownEventTypes=[]] - Non-arrays are treated as empty.
 * @returns {{eventTypesSummary: Array<object>, newEventTypes: Array<object>}}
 *   `eventTypesSummary` has one entry per type (name, unique covered_csv_ids,
 *   empty why_webmasters_care); `newEventTypes` lists the types missing from
 *   `knownEventTypes`, with an explanatory note when a known list was given.
 */
export function deriveEventTypeMetadata(events, knownEventTypes = []) {
  const knownSet = new Set();
  if (Array.isArray(knownEventTypes)) {
    for (const candidate of knownEventTypes) {
      const normalized = normalizeEventType(candidate);
      if (normalized) {
        knownSet.add(normalized);
      }
    }
  }

  // type -> unique csv ids, both in first-seen order.
  const idsByType = new Map();
  for (const event of Array.isArray(events) ? events : []) {
    const type = normalizeEventType(event?.event_type);
    if (!type) {
      continue;
    }

    let ids = idsByType.get(type);
    if (!ids) {
      ids = new Set();
      idsByType.set(type, ids);
    }

    const csvId = String(event?.csv_id ?? '').trim();
    if (csvId) {
      ids.add(csvId);
    }
  }

  const eventTypesSummary = [];
  const newEventTypes = [];

  idsByType.forEach((ids, name) => {
    // The summary entry and the new-type entry share this one array instance.
    const whyWebmastersCare = [];

    eventTypesSummary.push({
      name,
      covered_csv_ids: [...ids],
      why_webmasters_care: whyWebmastersCare
    });

    if (knownSet.has(name)) {
      return;
    }

    newEventTypes.push({
      name,
      why_webmasters_care: whyWebmastersCare,
      // The note is only attached when a known list was actually supplied.
      ...(knownSet.size > 0
        ? { not_in_known_event_types_because: `event_type "${name}" was not found in known_event_types` }
        : {})
    });
  });

  return { eventTypesSummary, newEventTypes };
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
 * Represents element context information.
 *
 * A new instance starts with empty strings, empty lists and null anchors;
 * fromObject() overlays the own enumerable properties of another object.
 */
export class Context {
  contextVersion = 2;
  parentTag = '';
  parentText = '';
  ancestorTrail = [];
  cssSelector = '';
  selectorCandidates = [];
  nearbyText = [];
  semanticAnchors = { heading: null, imageAlt: null };
  containerMeta = { cssSelector: '', attributes: {} };

  /**
   * Build a Context from a plain object.
   *
   * @param {*} obj - Source of properties; non-objects are ignored.
   * @returns {Context} A new instance with `obj`'s properties shallow-copied
   *   over the defaults (nested values are shared with `obj`, not cloned).
   */
  static fromObject(obj) {
    const context = new Context();
    const isObject = obj !== null && typeof obj === 'object';
    return isObject ? Object.assign(context, obj) : context;
  }

  /**
   * Serialise the known fields only; properties added by fromObject() that
   * are not listed here are left out.
   *
   * @returns {object} Plain object whose `contextVersion` is always 2,
   *   whatever the instance field holds — NOTE(review): confirm intentional.
   */
  toJSON() {
    const {
      parentTag,
      parentText,
      ancestorTrail,
      cssSelector,
      selectorCandidates,
      nearbyText,
      semanticAnchors,
      containerMeta
    } = this;

    return {
      contextVersion: 2,
      parentTag,
      parentText,
      ancestorTrail,
      cssSelector,
      selectorCandidates,
      nearbyText,
      semanticAnchors,
      containerMeta
    };
  }
}
|
package/package.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "page-analyzer",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"description": "Standalone page analysis module.",
|
|
6
|
+
"main": "index.js",
|
|
7
|
+
"scripts": {
|
|
8
|
+
"test": "node test.js",
|
|
9
|
+
"analyze": "node test.js"
|
|
10
|
+
},
|
|
11
|
+
"dependencies": {
|
|
12
|
+
"cheerio": "^1.2.0",
|
|
13
|
+
"csv-parse": "^5.6.0",
|
|
14
|
+
"playwright": "^1.58.2"
|
|
15
|
+
}
|
|
16
|
+
}
|