page-analyzer 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/csv-exporter.js +192 -0
- package/extractors/block-assigner.js +281 -0
- package/extractors/context-extractor.js +275 -0
- package/extractors/css-selector-builder.js +202 -0
- package/extractors/pt-selector-builder.js +344 -0
- package/html-parser.js +206 -0
- package/index.js +303 -0
- package/llm/analyzers/event-analyzer/event-analyzer-blocks.js +553 -0
- package/llm/analyzers/event-analyzer/event-analyzer-constants.js +22 -0
- package/llm/analyzers/event-analyzer/event-analyzer-events.js +97 -0
- package/llm/analyzers/event-analyzer/event-analyzer-input.js +168 -0
- package/llm/analyzers/event-analyzer/event-analyzer-metadata.js +15 -0
- package/llm/analyzers/event-analyzer/event-analyzer-prompt.js +71 -0
- package/llm/analyzers/event-analyzer/event-analyzer-response.js +290 -0
- package/llm/analyzers/event-analyzer/event-analyzer-utils.js +96 -0
- package/llm/analyzers/event-analyzer/event-analyzer.js +546 -0
- package/llm/analyzers/prompts/event-analysis.txt +52 -0
- package/llm/analyzers/prompts/special-block-confirmation.txt +127 -0
- package/llm/providers/base-provider.js +64 -0
- package/llm/providers/openai-provider.js +168 -0
- package/llm/utils/event-csv.js +276 -0
- package/models/context.js +44 -0
- package/package.json +16 -0
- package/page-extractor.js +215 -0
- package/utils/selector-utils.js +31 -0
- package/utils/text-utils.js +11 -0
- package/utils/url-utils.js +43 -0
- package/vendor/extract-blocks.js +903 -0
|
@@ -0,0 +1,546 @@
|
|
|
1
|
+
import fs from 'fs';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
import { fileURLToPath } from 'url';
|
|
4
|
+
import { deriveEventTypeMetadata } from '../../utils/event-csv.js';
|
|
5
|
+
import {
|
|
6
|
+
CSV_REPAIR_ATTEMPTS,
|
|
7
|
+
FULL_ANALYSIS_RETRY_ATTEMPTS,
|
|
8
|
+
MAX_CSV_REPAIR_INPUT_CHARS
|
|
9
|
+
} from './event-analyzer-constants.js';
|
|
10
|
+
import { EventAnalyzerResponse } from './event-analyzer-response.js';
|
|
11
|
+
import { buildRunContextMetadata } from './event-analyzer-metadata.js';
|
|
12
|
+
import {
|
|
13
|
+
buildEventAnalysisPrompt,
|
|
14
|
+
buildSpecialBlockPrompt,
|
|
15
|
+
buildCsvRepairPrompt as buildCsvRepairPromptTemplate
|
|
16
|
+
} from './event-analyzer-prompt.js';
|
|
17
|
+
import {
|
|
18
|
+
parseInputCsvRows as parseInputCsvRowsFromInput,
|
|
19
|
+
normalizeBlocks as normalizeBlocksFromInput,
|
|
20
|
+
serializeBlocksIndexCsv as serializeBlocksIndexCsvFromInput,
|
|
21
|
+
serializeBlockAnalysisCsv as serializeBlockAnalysisCsvFromInput,
|
|
22
|
+
serializeDomCsvRows as serializeDomCsvRowsFromInput
|
|
23
|
+
} from './event-analyzer-input.js';
|
|
24
|
+
import {
|
|
25
|
+
unwrapTextResponse as unwrapTextResponseFromBlocks,
|
|
26
|
+
defaultBlockNameForTag as defaultBlockNameForTagFromBlocks,
|
|
27
|
+
normalizePossibleEvents as normalizePossibleEventsFromBlocks,
|
|
28
|
+
parseSpecialBlockResponse as parseSpecialBlockResponseFromBlocks,
|
|
29
|
+
buildBlockContexts as buildBlockContextsFromBlocks,
|
|
30
|
+
buildLlmGroups as buildLlmGroupsFromBlocks,
|
|
31
|
+
buildBlockAnalysisArtifact as buildBlockAnalysisArtifactFromBlocks
|
|
32
|
+
} from './event-analyzer-blocks.js';
|
|
33
|
+
import {
|
|
34
|
+
buildDirectEvent as buildDirectEventFromEvents,
|
|
35
|
+
expandEventAttributes as expandEventAttributesFromEvents,
|
|
36
|
+
enforceAllowedEventType as enforceAllowedEventTypeFromEvents
|
|
37
|
+
} from './event-analyzer-events.js';
|
|
38
|
+
import { toCsvIdValue } from './event-analyzer-utils.js';
|
|
39
|
+
|
|
40
|
+
// ES modules have no built-in __filename; derive this module's file path from its URL.
const __filename = fileURLToPath(import.meta.url);
|
|
41
|
+
// Directory containing this module; the EventAnalyzer constructor resolves the prompt templates relative to it.
const __dirname = path.dirname(__filename);
|
|
42
|
+
|
|
43
|
+
/**
 * Orchestrates LLM-backed event analysis for a page.
 *
 * Most methods are thin wrappers that delegate to the helper modules imported
 * at the top of this file (input, blocks, events, prompt). The orchestration
 * itself lives in `analyzeEvents`, `analyzeLlmGroups`,
 * `runSpecialBlockConfirmation`, `runPromptAnalysis` and
 * `tryRepairCsvResponse`.
 */
class EventAnalyzer {
  /**
   * @param {object} provider - LLM provider; must expose `analyze(prompt, options)`.
   * @param {object} config - Analyzer configuration; forwarded to EventAnalyzerResponse.
   *   `config.knownEventTypes` is read by `analyzeEvents`.
   * @param {object} [runContext={}] - Context passed to `buildRunContextMetadata`
   *   for every provider call. Non-object values are replaced by `{}`.
   */
  constructor(provider, config, runContext = {}) {
    this.provider = provider;
    this.config = config;
    this.runContext = runContext && typeof runContext === 'object' ? runContext : {};
    this.response = new EventAnalyzerResponse(config);

    // Prompt templates are read synchronously once, at construction time.
    const eventTemplatePath = path.join(__dirname, '../prompts/event-analysis.txt');
    this.eventAnalysisTemplate = fs.readFileSync(eventTemplatePath, 'utf-8');
    const specialTemplatePath = path.join(__dirname, '../prompts/special-block-confirmation.txt');
    this.specialBlockTemplate = fs.readFileSync(specialTemplatePath, 'utf-8');
  }

  /** Delegates to `parseInputCsvRows` from event-analyzer-input.js. */
  parseInputCsvRows(csvData) {
    return parseInputCsvRowsFromInput(csvData);
  }

  /** Delegates to `normalizeBlocks` from event-analyzer-input.js. */
  normalizeBlocks(blocks = []) {
    return normalizeBlocksFromInput(blocks);
  }

  /** Delegates to `serializeBlocksIndexCsv` from event-analyzer-input.js. */
  serializeBlocksIndexCsv(blocks = []) {
    return serializeBlocksIndexCsvFromInput(blocks);
  }

  /** Delegates to `serializeBlockAnalysisCsv` from event-analyzer-input.js. */
  serializeBlockAnalysisCsv(blockContexts = []) {
    return serializeBlockAnalysisCsvFromInput(blockContexts);
  }

  /** Delegates to `serializeDomCsvRows` from event-analyzer-input.js. */
  serializeDomCsvRows(rows = []) {
    return serializeDomCsvRowsFromInput(rows);
  }

  /** Delegates to `unwrapTextResponse` from event-analyzer-blocks.js. */
  unwrapTextResponse(response) {
    return unwrapTextResponseFromBlocks(response);
  }

  /** Delegates to `defaultBlockNameForTag` from event-analyzer-blocks.js. */
  defaultBlockNameForTag(tag) {
    return defaultBlockNameForTagFromBlocks(tag);
  }

  /** Delegates to `normalizePossibleEvents`, passing this analyzer's response helper. */
  normalizePossibleEvents(value) {
    return normalizePossibleEventsFromBlocks(this.response, value);
  }

  /**
   * Delegates to `parseSpecialBlockResponse`, passing this analyzer's response helper.
   * The second parameter is accepted for call-site compatibility and is not used.
   */
  parseSpecialBlockResponse(response, _blocks = []) {
    return parseSpecialBlockResponseFromBlocks(this.response, response);
  }

  /**
   * Sends the special-block confirmation prompt to the provider and parses the reply.
   *
   * The provider call plus parsing is attempted up to
   * `FULL_ANALYSIS_RETRY_ATTEMPTS + 1` times; the first attempt that parses wins.
   *
   * @param {Array} blocks - Blocks to serialize into the prompt.
   * @returns {Promise<*>} Whatever `parseSpecialBlockResponse` returns.
   * @throws The error from the last failed attempt.
   */
  async runSpecialBlockConfirmation(blocks) {
    const blocksIndexCsv = this.serializeBlocksIndexCsv(blocks);
    const prompt = buildSpecialBlockPrompt({
      template: this.specialBlockTemplate,
      blocksIndexCsv
    });
    const attempts = FULL_ANALYSIS_RETRY_ATTEMPTS + 1;
    let lastError = null;

    for (let attempt = 1; attempt <= attempts; attempt += 1) {
      try {
        const startedAt = Date.now();
        const response = await this.provider.analyze(prompt, {
          parseJson: false,
          metadata: buildRunContextMetadata(this.runContext, {
            operation: 'special_block_confirmation',
            attempt
          })
        });
        const parsed = this.parseSpecialBlockResponse(response, blocks);
        console.log(`[LLM] Special block confirmation completed in ${Date.now() - startedAt}ms`);
        return parsed;
      } catch (error) {
        lastError = error;
        if (attempt < attempts) {
          console.warn(
            `[LLM] Special block confirmation attempt ${attempt}/${attempts} failed: ${error.message}. Retrying...`
          );
        }
      }
    }

    throw lastError || new Error('special block confirmation failed');
  }

  /** Delegates to `buildBlockContexts` from event-analyzer-blocks.js. */
  buildBlockContexts(blocks, csvRows, specialResult) {
    return buildBlockContextsFromBlocks(blocks, csvRows, specialResult);
  }

  /** Delegates to `buildLlmGroups` from event-analyzer-blocks.js. */
  buildLlmGroups(blockContexts = []) {
    return buildLlmGroupsFromBlocks(blockContexts);
  }

  /** Delegates to `buildDirectEvent` from event-analyzer-events.js. */
  buildDirectEvent(row, blockContext) {
    return buildDirectEventFromEvents(row, blockContext);
  }

  /** Delegates to `expandEventAttributes` from event-analyzer-events.js. */
  expandEventAttributes(event, row, blockContext, options = {}) {
    return expandEventAttributesFromEvents(event, row, blockContext, options);
  }

  /** Delegates to `enforceAllowedEventType`, passing this analyzer's response helper. */
  enforceAllowedEventType(event, blockContext) {
    return enforceAllowedEventTypeFromEvents(this.response, event, blockContext);
  }

  /**
   * Runs one event-analysis prompt per group, sequentially, and collects one
   * event per csv_id.
   *
   * For each group: rows of all its blocks are ordered by `row.order`, the
   * prompt is built and analysed, and returned events whose csv_id belongs to
   * the group are normalized and expanded. Rows the LLM did not cover get a
   * fallback from `buildDirectEvent` when that returns a value.
   *
   * @param {Array<{blocks: Array}>} groups - LLM groups; each block exposes `rows`.
   * @param {string} siteSummary - Site summary inserted into the prompt.
   * @param {string[]} rollingKnownEventTypes - Known event types. MUTATED in
   *   place: newly seen normalized event types are appended so later groups
   *   receive them.
   * @returns {Promise<{events: Array, parsedNewEventTypes: Array}>}
   */
  async analyzeLlmGroups(groups, siteSummary, rollingKnownEventTypes) {
    const eventsByCsvId = new Map();
    const parsedNewEventTypes = [];
    const attributeKeys = this.response.getAttributeKeys();
    const totalGroups = groups.length;

    for (let index = 0; index < groups.length; index += 1) {
      const group = groups[index];
      const label = `${index + 1}/${totalGroups}`;
      const groupBlocks = Array.isArray(group.blocks) ? group.blocks : [];
      // flatMap returns a fresh array, so the in-place sort does not touch block.rows.
      const groupRows = groupBlocks
        .flatMap((block) => block.rows)
        .sort((left, right) => left.order - right.order);
      const expectedCsvIds = new Set(groupRows.map((row) => row.csv_id));
      const blockContextByCsvId = new Map();
      const rowByCsvId = new Map();
      for (const block of groupBlocks) {
        for (const row of block.rows) {
          blockContextByCsvId.set(row.csv_id, block);
          rowByCsvId.set(row.csv_id, row);
        }
      }

      const prompt = buildEventAnalysisPrompt({
        template: this.eventAnalysisTemplate,
        siteSummary,
        blockAnalysisCsv: this.serializeBlockAnalysisCsv(groupBlocks),
        domCsv: this.serializeDomCsvRows(groupRows)
      });
      const parsed = await this.runPromptAnalysis(prompt, rollingKnownEventTypes, attributeKeys, {
        label
      });
      const rows = Array.isArray(parsed?.events_by_node) ? parsed.events_by_node : [];
      for (const event of rows) {
        const csvId = toCsvIdValue(event?.csv_id);
        // Ignore events that reference a csv_id outside this group's prompt.
        if (!csvId || !expectedCsvIds.has(csvId)) {
          continue;
        }
        const blockContext = blockContextByCsvId.get(csvId);
        if (!blockContext) {
          continue;
        }
        const normalizedEvent = this.enforceAllowedEventType(
          {
            ...event,
            csv_id: csvId
          },
          blockContext
        );
        const expandedEvent = this.expandEventAttributes(
          normalizedEvent,
          rowByCsvId.get(csvId),
          blockContext,
          { includeBlockIdxs: false }
        );
        eventsByCsvId.set(csvId, expandedEvent);
        const known = this.response.normalizeEventType(normalizedEvent.event_type);
        if (known && !rollingKnownEventTypes.includes(known)) {
          rollingKnownEventTypes.push(known);
        }
      }

      // Rows without an LLM event fall back to a directly built event.
      const fallbackCsvIds = [];
      for (const row of groupRows) {
        if (eventsByCsvId.has(row.csv_id)) {
          continue;
        }
        const blockContext = blockContextByCsvId.get(row.csv_id);
        const fallbackEvent = this.buildDirectEvent(row, blockContext);
        if (!fallbackEvent) {
          continue;
        }
        fallbackCsvIds.push(row.csv_id);
        eventsByCsvId.set(row.csv_id, fallbackEvent);
      }
      if (fallbackCsvIds.length > 0) {
        const preview = fallbackCsvIds.slice(0, 12).join(',');
        console.warn(
          `[LLM] Group ${label} missing ${fallbackCsvIds.length} csv_id(s). ` +
          `Applied block-allowed fallback for: ${preview}${fallbackCsvIds.length > 12 ? ',...' : ''}`
        );
      }

      for (const item of Array.isArray(parsed?.new_event_types) ? parsed.new_event_types : []) {
        parsedNewEventTypes.push(item);
      }
    }

    return {
      events: Array.from(eventsByCsvId.values()),
      parsedNewEventTypes
    };
  }

  /** Delegates to `buildBlockAnalysisArtifact` from event-analyzer-blocks.js. */
  buildBlockAnalysisArtifact(siteSummary, blockContexts = [], llmGroups = []) {
    return buildBlockAnalysisArtifactFromBlocks(siteSummary, blockContexts, llmGroups);
  }

  /**
   * Builds placeholder blocks when no blocks snapshot is available.
   *
   * If at least one row carries a non-negative integer `blockIdx`, one
   * synthetic block is produced per distinct index (ascending) and the rows
   * are returned unchanged. Otherwise every row is copied with `blockIdx: 0`
   * and a single synthetic block is produced.
   *
   * @param {Array<object>} [csvRows=[]] - Parsed CSV rows; non-arrays are treated as empty.
   * @returns {{blocks: *, rows: Array<object>}} `blocks` is the result of
   *   `normalizeBlocks` on the synthetic blocks.
   */
  buildSyntheticBlocksFromCsvRows(csvRows = []) {
    const rows = Array.isArray(csvRows) ? csvRows : [];

    // Group rows by valid blockIdx in one pass. Null-safe, so a null row is
    // skipped here instead of throwing.
    const rowsByBlockIdx = new Map();
    for (const row of rows) {
      const blockIdx = row?.blockIdx;
      if (!Number.isInteger(blockIdx) || blockIdx < 0) {
        continue;
      }
      const bucket = rowsByBlockIdx.get(blockIdx);
      if (bucket) {
        bucket.push(row);
      } else {
        rowsByBlockIdx.set(blockIdx, [row]);
      }
    }

    if (rowsByBlockIdx.size > 0) {
      const validBlockIdxs = [...rowsByBlockIdx.keys()].sort((left, right) => left - right);
      const blocks = validBlockIdxs.map((blockIdx, index) => {
        const blockRows = rowsByBlockIdx.get(blockIdx);
        // Preview: first four non-empty row texts, capped at 600 characters.
        const textPreview = blockRows
          .map((row) => String(row?.text || '').trim())
          .filter(Boolean)
          .slice(0, 4)
          .join(' | ')
          .slice(0, 600);

        return {
          blockIdx,
          branchPath: `fallback.${blockIdx}`,
          depth: 0,
          domOrder: index + 1,
          tag: 'section',
          fixed: false,
          top: index * 120,
          left: 0,
          width: 1200,
          height: 360,
          textPreview,
          childInteractiveCount: blockRows.length
        };
      });

      return {
        blocks: this.normalizeBlocks(blocks),
        rows
      };
    }

    // No usable blockIdx anywhere: put every row into a single block 0.
    const remappedRows = rows.map((row) => ({
      ...row,
      blockIdx: 0
    }));

    return {
      blocks: this.normalizeBlocks([{
        blockIdx: 0,
        branchPath: 'fallback.0',
        depth: 0,
        domOrder: 1,
        tag: 'section',
        fixed: false,
        top: 0,
        left: 0,
        width: 1200,
        height: 360,
        textPreview: 'Fallback synthetic block (snapshot missing)',
        childInteractiveCount: remappedRows.length
      }]),
      rows: remappedRows
    };
  }

  /**
   * Full analysis entry point.
   *
   * Steps: parse the CSV rows, obtain blocks (from `options.blocks` or a
   * synthetic fallback), run special-block confirmation, build block
   * contexts, build direct events for blocks with exactly one possible event,
   * analyse the LLM groups, then assemble the result in CSV row order.
   * Events whose block has no possible events are dropped.
   *
   * @param {*} csvData - Raw CSV input for `parseInputCsvRows`.
   * @param {*} _mdData - Unused; kept for signature compatibility.
   * @param {string[]} [knownEventTypes=[]] - Merged with `config.knownEventTypes`.
   * @param {object} [options={}]
   * @param {Array} [options.blocks] - Blocks snapshot.
   * @param {boolean} [options.analyzeNodeEvents] - Pass `false` to skip
   *   node-level event analysis and return only `block_analysis`. Any other
   *   value (including omitted) runs the full analysis.
   * @returns {Promise<{events_by_node: Array, event_types_summary: Array,
   *   new_event_types: Array, block_analysis: object}>}
   */
  async analyzeEvents(csvData, _mdData, knownEventTypes = [], options = {}) {
    // Enabled by default; only an explicit `false` disables it. The previous
    // `!options?.analyzeNodeEvents` inverted an explicit boolean.
    const analyzeNodeEvents = options?.analyzeNodeEvents !== false;
    const configuredKnownEventTypes = this.response.normalizeStringList(
      this.config?.knownEventTypes,
      { eventType: true }
    );
    const mergedKnownEventTypes = this.response.normalizeStringList(
      [...configuredKnownEventTypes, ...(knownEventTypes || [])],
      { eventType: true }
    );
    let csvRows = this.parseInputCsvRows(csvData);
    if (csvRows.length === 0) {
      return {
        events_by_node: [],
        event_types_summary: [],
        new_event_types: [],
        block_analysis: {
          site_summary: '',
          possible_event_types: [],
          blocks: [],
          stats: {
            total_blocks: 0,
            skipped_blocks: 0,
            direct_blocks: 0,
            llm_blocks: 0,
            llm_group_count: 0,
            llm_group_rows: 0
          }
        }
      };
    }

    let blocks = this.normalizeBlocks(options?.blocks || []);
    if (blocks.length === 0) {
      const fallback = this.buildSyntheticBlocksFromCsvRows(csvRows);
      blocks = fallback.blocks;
      csvRows = fallback.rows;
      console.warn(
        '[LLM] Missing blocks snapshot for event analysis. ' +
        'Using synthetic fallback blocks from CSV rows.'
      );
    }

    const specialResult = await this.runSpecialBlockConfirmation(blocks);
    const siteSummary = specialResult.siteSummary || `Page with ${blocks.length} visual blocks.`;
    const blockContexts = this.buildBlockContexts(blocks, csvRows, specialResult);
    if (!analyzeNodeEvents) {
      return {
        events_by_node: [],
        event_types_summary: [],
        new_event_types: [],
        block_analysis: this.buildBlockAnalysisArtifact(siteSummary, blockContexts, [])
      };
    }

    const blockContextByCsvId = new Map();
    for (const blockContext of blockContexts) {
      for (const row of blockContext.rows) {
        blockContextByCsvId.set(row.csv_id, blockContext);
      }
    }

    // Blocks with exactly one possible event get direct events, no LLM call.
    const eventsByCsvId = new Map();
    for (const blockContext of blockContexts) {
      if (blockContext.possibleEvents.length !== 1) {
        continue;
      }
      for (const row of blockContext.rows) {
        const event = this.buildDirectEvent(row, blockContext);
        if (event) {
          eventsByCsvId.set(event.csv_id, event);
        }
      }
    }

    const llmGroups = this.buildLlmGroups(blockContexts);
    // Copy: analyzeLlmGroups appends to this list as it discovers event types.
    const rollingKnownEventTypes = [...mergedKnownEventTypes];
    const llmResult = await this.analyzeLlmGroups(llmGroups, siteSummary, rollingKnownEventTypes);
    for (const event of llmResult.events) {
      eventsByCsvId.set(event.csv_id, event);
    }

    const orderedCsvIds = csvRows.map((row) => row.csv_id);
    const events = orderedCsvIds
      .filter((csvId) => eventsByCsvId.has(csvId))
      .map((csvId) => eventsByCsvId.get(csvId))
      .filter((event) => {
        const blockContext = blockContextByCsvId.get(event.csv_id);
        if (!blockContext) {
          return false;
        }
        if (blockContext.possibleEvents.length === 0) {
          return false;
        }
        return true;
      });

    const {
      eventTypesSummary,
      newEventTypes
    } = deriveEventTypeMetadata(events, mergedKnownEventTypes);
    const finalEventTypeSet = new Set(
      events
        .map((event) => this.response.normalizeEventType(event?.event_type))
        .filter(Boolean)
    );
    // Keep only LLM-declared new event types that are used by a final event.
    const scopedParsedNewEventTypes = llmResult.parsedNewEventTypes.filter((item) => {
      const normalized = this.response.normalizeEventType(item?.name);
      return normalized && finalEventTypeSet.has(normalized);
    });
    const resolvedNewEventTypes = this.response.resolveNewEventTypes({
      parsedNewEventTypes: scopedParsedNewEventTypes,
      derivedNewEventTypes: newEventTypes,
      knownEventTypes: mergedKnownEventTypes
    });
    const result = {
      events_by_node: events,
      event_types_summary: eventTypesSummary,
      new_event_types: resolvedNewEventTypes,
      block_analysis: this.buildBlockAnalysisArtifact(siteSummary, blockContexts, llmGroups)
    };
    this.response.logNewEventTypes(result);
    return result;
  }

  /**
   * Sends an analysis prompt to the provider and parses the CSV reply.
   *
   * Each of up to `FULL_ANALYSIS_RETRY_ATTEMPTS + 1` attempts calls the
   * provider, parses the response, and on a parse failure tries
   * `tryRepairCsvResponse`. If the repair yields nothing, the parse error
   * counts as a failed attempt.
   *
   * @param {string} prompt - Fully rendered prompt.
   * @param {string[]} knownEventTypes - Passed to the CSV parser and repair prompt.
   * @param {string[]} attributeKeys - Passed to the repair prompt.
   * @param {object} [options={}]
   * @param {string} [options.label] - Label used in log lines and request metadata.
   * @returns {Promise<*>} Parsed (or repaired) analysis result.
   * @throws The error from the last failed attempt.
   */
  async runPromptAnalysis(prompt, knownEventTypes, attributeKeys, options = {}) {
    const label = String(options?.label || '').trim();
    const labelPrefix = label ? ` [${label}]` : '';
    const totalAnalysisAttempts = FULL_ANALYSIS_RETRY_ATTEMPTS + 1;
    let lastError;

    for (let analysisAttempt = 1; analysisAttempt <= totalAnalysisAttempts; analysisAttempt++) {
      try {
        const startTime = Date.now();
        const response = await this.provider.analyze(prompt, {
          parseJson: false,
          metadata: buildRunContextMetadata(this.runContext, {
            operation: 'analysis',
            chunkLabel: label || null,
            analysisAttempt
          })
        });

        try {
          const result = this.response.parseCsvResponse(response, knownEventTypes, {
            logger: console
          });
          console.log(`[LLM] Event analysis completed${labelPrefix} in ${Date.now() - startTime}ms`);
          return result;
        } catch (parseError) {
          const repaired = await this.tryRepairCsvResponse(
            response,
            knownEventTypes,
            attributeKeys,
            {
              chunkLabel: label || null,
              parseErrorMessage: parseError?.message
            }
          );
          if (!repaired) {
            // Rethrown to the outer catch so it counts as a failed attempt.
            throw parseError;
          }
          console.log(`[LLM] Event analysis CSV repaired${labelPrefix} in ${Date.now() - startTime}ms`);
          return repaired;
        }
      } catch (error) {
        lastError = error;
        if (analysisAttempt < totalAnalysisAttempts) {
          console.warn(
            `[LLM] Event analysis attempt ${analysisAttempt}/${totalAnalysisAttempts} failed${labelPrefix}: ` +
            `${error.message}. Retrying...`
          );
        }
      }
    }

    console.error('[LLM] Event analysis error:', lastError?.message || 'unknown error');
    throw lastError || new Error('Event analysis failed');
  }

  /**
   * Renders the CSV repair prompt. Only `options.parseErrorMessage` is read
   * from `options`.
   */
  buildCsvRepairPrompt(rawResponse, knownEventTypes = [], attributeKeys = [], options = {}) {
    return buildCsvRepairPromptTemplate({
      rawResponse,
      knownEventTypes,
      attributeKeys,
      parseErrorMessage: options?.parseErrorMessage
    });
  }

  /**
   * Asks the provider to repair a response that failed CSV parsing.
   *
   * The raw response is stringified and trimmed; input longer than
   * `MAX_CSV_REPAIR_INPUT_CHARS` is cut and suffixed with a truncation marker.
   * Up to `CSV_REPAIR_ATTEMPTS` repair calls are made.
   *
   * @param {*} response - The response that failed to parse.
   * @param {string[]} [knownEventTypes=[]]
   * @param {string[]} [attributeKeys=[]]
   * @param {object} [options={}]
   * @param {string} [options.chunkLabel] - Included in request metadata.
   * @param {string} [options.parseErrorMessage] - Forwarded to the repair prompt.
   * @returns {Promise<*|null>} Parsed repaired result, or `null` when the input
   *   is empty or every attempt fails. This method does not throw on repair failure.
   */
  async tryRepairCsvResponse(response, knownEventTypes = [], attributeKeys = [], options = {}) {
    const raw = String(response || '').trim();
    if (!raw) {
      return null;
    }
    const parseErrorMessage = String(options?.parseErrorMessage || '').trim();

    const repairInput = raw.length > MAX_CSV_REPAIR_INPUT_CHARS
      ? `${raw.slice(0, MAX_CSV_REPAIR_INPUT_CHARS)}\n...[truncated]`
      : raw;

    for (let attempt = 1; attempt <= CSV_REPAIR_ATTEMPTS; attempt++) {
      try {
        console.warn(
          `[LLM][CSV] Invalid response format. Attempting repair (${attempt}/${CSV_REPAIR_ATTEMPTS})...`
        );
        const repairPrompt = this.buildCsvRepairPrompt(
          repairInput,
          knownEventTypes,
          attributeKeys,
          {
            ...options,
            parseErrorMessage
          }
        );
        const repairedResponse = await this.provider.analyze(repairPrompt, {
          parseJson: false,
          metadata: buildRunContextMetadata(this.runContext, {
            operation: 'repair',
            chunkLabel: String(options?.chunkLabel || '').trim() || null,
            repairAttempt: attempt
          })
        });
        return this.response.parseCsvResponse(repairedResponse, knownEventTypes, {
          logger: console
        });
      } catch (error) {
        // Best effort: log and try again; the loop exit below returns null.
        console.warn(`[LLM][CSV] Repair attempt ${attempt}/${CSV_REPAIR_ATTEMPTS} failed: ${error.message}`);
      }
    }

    return null;
  }
}
|
|
545
|
+
|
|
546
|
+
export { EventAnalyzer };
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
You are a revenue-focused analytics event designer.
|
|
2
|
+
|
|
3
|
+
INPUTS
|
|
4
|
+
|
|
5
|
+
1) site_summary
|
|
6
|
+
<<<SITE_SUMMARY>>>
|
|
7
|
+
|
|
8
|
+
2) block analysis CSV
|
|
9
|
+
<<<BLOCK_ANALYSIS_CSV>>>
|
|
10
|
+
|
|
11
|
+
3) DOM CSV
|
|
12
|
+
<<<DOM_CSV>>>
|
|
13
|
+
|
|
14
|
+
GOAL
|
|
15
|
+
|
|
16
|
+
Detect commercially meaningful interactions from DOM elements.
|
|
17
|
+
|
|
18
|
+
Map events 1:1 to DOM rows.
|
|
19
|
+
|
|
20
|
+
Focus on actions valuable to website owners:
|
|
21
|
+
signup, trial, demo, purchase, contact, subscribe, search, product interest, etc.
|
|
22
|
+
|
|
23
|
+
Use site_summary, blockName, and blockSemantic (if present) as context.
|
|
24
|
+
|
|
25
|
+
RULES
|
|
26
|
+
|
|
27
|
+
- Output CSV only.
|
|
28
|
+
- Each output row must reuse DOM csv_id.
|
|
29
|
+
- Exactly one event_type per csv_id.
|
|
30
|
+
- Try to cover every DOM row.
|
|
31
|
+
|
|
32
|
+
Use concise snake_case names such as:
|
|
33
|
+
signup, login, start_trial, purchase, subscribe, book_demo,
|
|
34
|
+
contact_submit, search, view_product, view_article,
|
|
35
|
+
add_to_cart, checkout_start, pricing_view, cta_click.
|
|
36
|
+
|
|
37
|
+
Create a new business event if needed.
|
|
38
|
+
|
|
39
|
+
ATTRIBUTES
|
|
40
|
+
|
|
41
|
+
attributes_kv should include concise business context extracted from the DOM.
|
|
42
|
+
|
|
43
|
+
Guidelines:
|
|
44
|
+
- Format: key=value&key=value
|
|
45
|
+
- No spaces
|
|
46
|
+
- Prefer semantic information (plan, product, tier, category, intent, action target)
|
|
47
|
+
- Do NOT include structural attributes that can be derived by engineering (e.g. block name, label text, URL)
|
|
48
|
+
- Leave empty only if no meaningful attributes exist
|
|
49
|
+
|
|
50
|
+
OUTPUT (plain CSV only)
|
|
51
|
+
|
|
52
|
+
csv_id,event_type,attributes_kv
|