page-analyzer 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/csv-exporter.js +192 -0
- package/extractors/block-assigner.js +281 -0
- package/extractors/context-extractor.js +275 -0
- package/extractors/css-selector-builder.js +202 -0
- package/extractors/pt-selector-builder.js +344 -0
- package/html-parser.js +206 -0
- package/index.js +303 -0
- package/llm/analyzers/event-analyzer/event-analyzer-blocks.js +553 -0
- package/llm/analyzers/event-analyzer/event-analyzer-constants.js +22 -0
- package/llm/analyzers/event-analyzer/event-analyzer-events.js +97 -0
- package/llm/analyzers/event-analyzer/event-analyzer-input.js +168 -0
- package/llm/analyzers/event-analyzer/event-analyzer-metadata.js +15 -0
- package/llm/analyzers/event-analyzer/event-analyzer-prompt.js +71 -0
- package/llm/analyzers/event-analyzer/event-analyzer-response.js +290 -0
- package/llm/analyzers/event-analyzer/event-analyzer-utils.js +96 -0
- package/llm/analyzers/event-analyzer/event-analyzer.js +546 -0
- package/llm/analyzers/prompts/event-analysis.txt +52 -0
- package/llm/analyzers/prompts/special-block-confirmation.txt +127 -0
- package/llm/providers/base-provider.js +64 -0
- package/llm/providers/openai-provider.js +168 -0
- package/llm/utils/event-csv.js +276 -0
- package/models/context.js +44 -0
- package/package.json +16 -0
- package/page-extractor.js +215 -0
- package/utils/selector-utils.js +31 -0
- package/utils/text-utils.js +11 -0
- package/utils/url-utils.js +43 -0
- package/vendor/extract-blocks.js +903 -0
|
@@ -0,0 +1,553 @@
|
|
|
1
|
+
import { parse } from 'csv-parse/sync';
|
|
2
|
+
import {
|
|
3
|
+
LLM_GROUP_MAX_ROWS,
|
|
4
|
+
STANDALONE_BLOCK_MIN_ROWS
|
|
5
|
+
} from './event-analyzer-constants.js';
|
|
6
|
+
import {
|
|
7
|
+
cleanText,
|
|
8
|
+
normalizeSpecialBlockName,
|
|
9
|
+
parseBlockIdxList
|
|
10
|
+
} from './event-analyzer-utils.js';
|
|
11
|
+
|
|
12
|
+
/**
 * Strips Markdown code fences from a model response.
 *
 * When the response contains no fenced block, the trimmed response is
 * returned unchanged. Otherwise the result is the fenced contents, in order,
 * joined by newlines; any standalone `---` line found between or after the
 * fences is kept so section separators survive, and all other text outside
 * the fences is dropped.
 *
 * @param {*} response - Raw response; falsy values are treated as empty.
 * @returns {string} The unwrapped text.
 */
function unwrapTextResponse(response) {
  const raw = String(response || '').trim();
  if (!raw) {
    return '';
  }

  const fences = [...raw.matchAll(/```(?:csv|text)?\s*\n?([\s\S]*?)\n?```/gi)];
  if (fences.length === 0) {
    return raw;
  }

  // Only bare `---` lines are preserved from the text outside the fences.
  const separatorLinesOf = (text) => {
    const kept = [];
    for (const line of String(text || '').replace(/\r\n/g, '\n').split('\n')) {
      const trimmed = line.trim();
      if (trimmed === '---') {
        kept.push(trimmed);
      }
    }
    return kept;
  };

  const pieces = [];
  let cursor = 0;
  for (const fence of fences) {
    pieces.push(...separatorLinesOf(raw.slice(cursor, fence.index)));

    const inner = String(fence[1] || '').trim();
    if (inner) {
      pieces.push(inner);
    }

    cursor = fence.index + fence[0].length;
  }
  pieces.push(...separatorLinesOf(raw.slice(cursor)));

  return pieces.join('\n').trim();
}
|
|
53
|
+
|
|
54
|
+
/**
 * Picks the default logical block name for an HTML tag.
 *
 * @param {*} tag - Tag name; compared after trimming and lower-casing.
 * @returns {string} 'ContentSection' for main/section/article, otherwise
 *   'ContentBlock'.
 */
function defaultBlockNameForTag(tag) {
  const sectionTags = ['main', 'section', 'article'];
  const key = String(tag || '').trim().toLowerCase();
  return sectionTags.includes(key) ? 'ContentSection' : 'ContentBlock';
}
|
|
60
|
+
|
|
61
|
+
/**
 * Coerces a value to a finite number.
 *
 * @param {*} value - Value passed through `Number()`.
 * @param {number} [fallback=0] - Returned when the coercion yields NaN or
 *   an infinity.
 * @returns {number} The finite number, or `fallback`.
 */
function toFiniteNumber(value, fallback = 0) {
  const parsed = Number(value);
  return Number.isFinite(parsed) ? parsed : fallback;
}

/**
 * Builds a `{ left, top, height, width }` rectangle from a position object,
 * taking each missing (null/undefined) coordinate from `fallback`.
 *
 * Non-finite coordinates become 0 and negative sizes are clamped to 0.
 * Both arguments are tolerated when they are null or not plain objects:
 * such an argument contributes no coordinates instead of throwing, so a
 * caller may pass a possibly-missing block as the fallback.
 *
 * @param {object} [position={}] - Preferred coordinate source.
 * @param {object} [fallback={}] - Secondary coordinate source.
 * @returns {{left: number, top: number, height: number, width: number}}
 */
function normalizeBlockPosition(position = {}, fallback = {}) {
  const asRecord = (candidate) => (
    candidate && typeof candidate === 'object' && !Array.isArray(candidate)
      ? candidate
      : {}
  );
  const source = asRecord(position);
  // Guarding the fallback as well: a null fallback used to raise a TypeError
  // on the first coordinate that `position` did not provide.
  const defaults = asRecord(fallback);

  return {
    left: toFiniteNumber(source.left ?? defaults.left, 0),
    top: toFiniteNumber(source.top ?? defaults.top, 0),
    height: Math.max(0, toFiniteNumber(source.height ?? defaults.height, 0)),
    width: Math.max(0, toFiniteNumber(source.width ?? defaults.width, 0))
  };
}
|
|
77
|
+
|
|
78
|
+
/**
 * Computes the bounding rectangle that encloses a set of source blocks.
 *
 * Each block's rectangle is read through `normalizeBlockPosition`, using
 * `block.blockPosition` first and the block itself as the fallback source.
 * Rectangles without a positive width and height are ignored.
 *
 * @param {Array<object>} [sourceBlocks=[]] - Blocks to enclose; a non-array
 *   is treated as empty.
 * @returns {{left: number, top: number, height: number, width: number}}
 *   The enclosing rectangle, or all zeros when no block has a usable area.
 */
function buildLogicalBlockPosition(sourceBlocks = []) {
  const candidates = Array.isArray(sourceBlocks) ? sourceBlocks : [];
  const usable = candidates
    .map((block) => normalizeBlockPosition(block?.blockPosition, block))
    .filter((rect) => rect.width > 0 && rect.height > 0);

  if (usable.length === 0) {
    return { left: 0, top: 0, height: 0, width: 0 };
  }

  const bounds = usable.reduce(
    (acc, rect) => ({
      left: Math.min(acc.left, rect.left),
      top: Math.min(acc.top, rect.top),
      right: Math.max(acc.right, rect.left + rect.width),
      bottom: Math.max(acc.bottom, rect.top + rect.height)
    }),
    {
      left: Number.POSITIVE_INFINITY,
      top: Number.POSITIVE_INFINITY,
      right: Number.NEGATIVE_INFINITY,
      bottom: Number.NEGATIVE_INFINITY
    }
  );

  return {
    left: bounds.left,
    top: bounds.top,
    height: Math.max(0, bounds.bottom - bounds.top),
    width: Math.max(0, bounds.right - bounds.left)
  };
}
|
|
116
|
+
|
|
117
|
+
/**
 * Returns the first non-empty CSS path found among the source blocks.
 *
 * For each block, `blockCssPath` is preferred over `cssPath`; the value is
 * passed through `cleanText` with a limit argument of 500.
 *
 * @param {Array<object>} [sourceBlocks=[]] - Blocks to inspect in order; a
 *   non-array is treated as empty.
 * @returns {string} The first non-empty cleaned path, or '' when none exists.
 */
function resolveLogicalBlockCssPath(sourceBlocks = []) {
  const candidates = Array.isArray(sourceBlocks) ? sourceBlocks : [];
  for (let i = 0; i < candidates.length; i += 1) {
    const candidate = candidates[i];
    const rawPath = candidate?.blockCssPath || candidate?.cssPath || '';
    const cleaned = cleanText(rawPath, 500);
    if (cleaned) {
      return cleaned;
    }
  }
  return '';
}
|
|
126
|
+
|
|
127
|
+
/**
 * Parses a delimited list of event types into at most three unique,
 * normalized entries.
 *
 * The raw value is split on commas and periods. Each piece is normalized
 * with `responseHelper.normalizeEventType`; pieces that normalize to a falsy
 * value and repeats are skipped. Parsing stops once three types are kept.
 *
 * @param {{normalizeEventType: function(*): *}} responseHelper - Provides
 *   the normalization routine.
 * @param {*} value - Raw list; falsy values yield an empty result.
 * @returns {Array} Normalized event types in first-seen order.
 */
function normalizePossibleEvents(responseHelper, value) {
  const maxEvents = 3;
  const text = String(value || '').trim();
  if (!text) {
    return [];
  }

  const unique = new Set();
  for (const candidate of text.split(/[,.,]/)) {
    const normalized = responseHelper.normalizeEventType(candidate);
    if (normalized) {
      unique.add(normalized);
      if (unique.size >= maxEvents) {
        break;
      }
    }
  }
  return [...unique];
}
|
|
148
|
+
|
|
149
|
+
/**
 * Converts a camelCase or free-form string into a snake_case segment.
 *
 * An underscore is inserted at every lower-case-or-digit to upper-case
 * boundary, runs of non-alphanumeric characters collapse into a single
 * underscore, leading and trailing underscores are removed, and the result
 * is lower-cased.
 *
 * @param {*} value - Input; falsy values yield ''.
 * @returns {string} The snake_case segment.
 */
function toSnakeCaseSegment(value) {
  const text = String(value || '');
  const withWordBreaks = text.replace(/([a-z0-9])([A-Z])/g, '$1_$2');
  const underscored = withWordBreaks.replace(/[^a-zA-Z0-9]+/g, '_');
  const trimmed = underscored.replace(/^_+|_+$/g, '');
  return trimmed.toLowerCase();
}
|
|
156
|
+
|
|
157
|
+
/**
 * Replaces a lone candidate event type with a structural click event when
 * the block is a header, footer, or navigation block.
 *
 * Blank entries are dropped from `possibleEvents` first. The substitution
 * only applies when exactly one entry remains; otherwise the filtered list
 * is returned as-is. The block name is normalized through
 * `normalizeSpecialBlockName(blockName, '')` before matching.
 *
 * @param {*} blockName - Block name to classify.
 * @param {Array} [possibleEvents=[]] - Candidate event types; a non-array
 *   is treated as empty.
 * @returns {Array} `['click_header']`, `['click_footer']`,
 *   `['click_<prefix>_nav']` / `['click_nav']` for names ending in
 *   'Navigation', or the filtered candidates.
 */
function resolveStructuralSingleEventType(blockName, possibleEvents = []) {
  const candidates = [];
  if (Array.isArray(possibleEvents)) {
    for (const eventType of possibleEvents) {
      if (String(eventType || '').trim()) {
        candidates.push(eventType);
      }
    }
  }
  if (candidates.length !== 1) {
    return candidates;
  }

  const structuralName = normalizeSpecialBlockName(blockName, '');
  switch (structuralName) {
    case 'Header':
      return ['click_header'];
    case 'Footer':
      return ['click_footer'];
    default:
      break;
  }

  const suffix = 'Navigation';
  if (!structuralName.endsWith(suffix)) {
    return candidates;
  }

  // Whatever precedes "Navigation" (e.g. "Main") names the specific nav.
  const prefix = toSnakeCaseSegment(
    structuralName.slice(0, structuralName.length - suffix.length)
  );
  return [prefix ? `click_${prefix}_nav` : 'click_nav'];
}
|
|
181
|
+
|
|
182
|
+
/**
 * Splits an unwrapped response at its first standalone `---` line.
 *
 * @param {*} response - Raw response, unwrapped via `unwrapTextResponse`.
 * @returns {{primarySection: string, semanticSection: string}} The trimmed
 *   text before and after the separator. Without a separator the whole
 *   unwrapped text is the primary section and the semantic section is ''.
 */
function splitResponseSections(response) {
  const unwrapped = unwrapTextResponse(response);
  const rows = String(unwrapped || '').split(/\r\n|\n/);

  let separatorAt = -1;
  for (let i = 0; i < rows.length; i += 1) {
    if (String(rows[i] || '').trim() === '---') {
      separatorAt = i;
      break;
    }
  }

  if (separatorAt < 0) {
    return { primarySection: unwrapped, semanticSection: '' };
  }

  const before = rows.slice(0, separatorAt);
  const after = rows.slice(separatorAt + 1);
  return {
    primarySection: before.join('\n').trim(),
    semanticSection: after.join('\n').trim()
  };
}
|
|
201
|
+
|
|
202
|
+
/**
 * Finds the first line that matches one of the expected CSV headers.
 *
 * Lines are compared after removing all whitespace and lower-casing, so
 * `expectedHeaders` entries must already be in that form.
 *
 * @param {Array} lines - Candidate lines.
 * @param {string[]} [expectedHeaders=[]] - Accepted normalized headers.
 * @returns {number} Index of the first matching line, or -1.
 */
function findHeaderIndex(lines, expectedHeaders = []) {
  for (let index = 0; index < lines.length; index += 1) {
    const compact = String(lines[index] || '').replace(/\s+/g, '').toLowerCase();
    if (expectedHeaders.includes(compact)) {
      return index;
    }
  }
  return -1;
}
|
|
208
|
+
|
|
209
|
+
/**
 * Parses the special-block-confirmation response into a site summary,
 * logical block definitions and optional semantic groups.
 *
 * The response is split at its first `---` line. The primary section must
 * contain a recognized CSV header; the first non-blank line before that
 * header is used as the site summary. The section after `---` is parsed
 * only when it contains a `blockIdxs,blockSemantic` style header.
 *
 * Header detection ignores case and whitespace, so column values are read
 * the same way: the exact spellings used historically are tried first and
 * any other casing of the same column name is accepted afterwards. Without
 * this, a lower-case `blockname` header silently turned every block name
 * into the 'ContentBlock' fallback.
 *
 * @param {{normalizeEventType: function(*): *}} responseHelper - Used to
 *   normalize the listed event types.
 * @param {*} response - Raw model response.
 * @returns {{siteSummary: *, logicalBlocks: Array<object>, semanticGroups: Array<object>}}
 * @throws {Error} When no recognized CSV header exists in the primary
 *   section. Errors raised by the CSV parser also propagate.
 */
function parseSpecialBlockResponse(responseHelper, response) {
  const csvOptions = {
    columns: true,
    skip_empty_lines: true,
    trim: true,
    relax_column_count: true,
    relax_quotes: true
  };
  const toLines = (text) => String(text || '').replace(/\r\n/g, '\n').split('\n');

  // Reads the first non-nullish column among `names`: exact keys first
  // (the historical behavior), then any key matching once whitespace and
  // case are ignored.
  const readField = (record, ...names) => {
    for (const name of names) {
      const exact = record?.[name];
      if (exact !== undefined && exact !== null) {
        return exact;
      }
    }
    if (!record || typeof record !== 'object') {
      return undefined;
    }
    const wanted = names.map((name) => name.toLowerCase());
    for (const [key, value] of Object.entries(record)) {
      if (value === undefined || value === null) {
        continue;
      }
      if (wanted.includes(String(key).replace(/\s+/g, '').toLowerCase())) {
        return value;
      }
    }
    return undefined;
  };
  const readBlockIdxs = (record) => parseBlockIdxList(
    readField(record, 'blockIdxs', 'blockidxs', 'blockIdx', 'blockidx')
  );

  const { primarySection, semanticSection } = splitResponseSections(response);
  const lines = toLines(primarySection);

  const headerIndex = findHeaderIndex(lines, [
    'blockidxs,blockname,blockdescription,blockpossibleevents',
    'blockidx,blockname,blockdescription,blockpossibleevents',
    'blockidxs,blockname,blockpossibleevents',
    'blockidx,blockname,blockpossibleevents'
  ]);
  if (headerIndex < 0) {
    throw new Error('special-block-confirmation CSV header not found');
  }

  const summaryLine = lines
    .slice(0, headerIndex)
    .find((line) => String(line || '').trim());
  const siteSummary = cleanText(summaryLine || '', 240);

  const records = parse(lines.slice(headerIndex).join('\n').trim(), csvOptions);

  const logicalBlocks = [];
  for (const record of records) {
    const blockIdxs = readBlockIdxs(record);
    if (blockIdxs.length === 0) {
      continue;
    }
    logicalBlocks.push({
      blockIdxs,
      blockIdxKey: blockIdxs.join('.'),
      blockName: normalizeSpecialBlockName(readField(record, 'blockName'), 'ContentBlock'),
      blockDescription: cleanText(
        readField(record, 'blockDescription', 'blockdescription', 'description'),
        300
      ),
      possibleEvents: normalizePossibleEvents(
        responseHelper,
        readField(record, 'blockPossibleEvents', 'blockpossibleevents')
      )
    });
  }

  const semanticGroups = [];
  const semanticLines = toLines(semanticSection);
  const semanticHeaderIndex = findHeaderIndex(semanticLines, [
    'blockidxs,blocksemantic',
    'blockidx,blocksemantic'
  ]);
  if (semanticHeaderIndex >= 0) {
    const semanticRecords = parse(
      semanticLines.slice(semanticHeaderIndex).join('\n').trim(),
      csvOptions
    );

    for (const record of semanticRecords) {
      const blockIdxs = readBlockIdxs(record);
      const blockSemantic = cleanText(
        readField(record, 'blockSemantic', 'blocksemantic'),
        80
      );
      if (blockIdxs.length === 0 || !blockSemantic) {
        continue;
      }
      semanticGroups.push({ blockIdxs, blockSemantic });
    }
  }

  return {
    siteSummary,
    logicalBlocks,
    semanticGroups
  };
}
|
|
306
|
+
|
|
307
|
+
/**
 * Groups physical blocks into logical block contexts and attaches CSV rows.
 *
 * Contexts are created first for every logical block in
 * `specialResult.logicalBlocks`, then for each physical block not claimed
 * by any of them (one context per block, named from its tag). A fallback
 * context with `blockIdx` -1 receives rows whose `blockIdx` maps to no
 * context. A logical block is dropped when none of its member indices
 * exists in `blocks`.
 *
 * Side effects: each row in `csvRows` gets `logicalBlockIdxKey` and
 * `blockSemantic` assigned and is pushed onto its context's `rows`.
 *
 * @param {Iterable<object>} blocks - Physical blocks; `blockIdx`, `order`,
 *   `tag` and `childInteractiveCount` are read.
 * @param {Iterable<object>} csvRows - Rows carrying a `blockIdx`; mutated.
 * @param {object} [specialResult] - Object with optional `logicalBlocks`
 *   and `semanticGroups` arrays.
 * @returns {Array<object>} Contexts sorted by ascending `order`.
 */
function buildBlockContexts(blocks, csvRows, specialResult) {
  const byOrder = (left, right) => left.order - right.order;
  const contexts = [];
  const mappedIndices = new Set();
  const byPhysicalBlockIdx = new Map();

  const blockByIdx = new Map();
  for (const block of blocks) {
    blockByIdx.set(block.blockIdx, block);
  }

  // Semantic label for each physical block; later groups overwrite earlier ones.
  const semanticByPhysicalBlockIdx = new Map();
  const declaredGroups = Array.isArray(specialResult?.semanticGroups)
    ? specialResult.semanticGroups
    : [];
  for (const group of declaredGroups) {
    const label = cleanText(group?.blockSemantic || '', 80);
    if (!label) {
      continue;
    }
    const members = Array.isArray(group?.blockIdxs) ? group.blockIdxs : [];
    for (const idx of members) {
      if (Number.isInteger(idx) && idx >= 0) {
        semanticByPhysicalBlockIdx.set(idx, label);
      }
    }
  }

  const createLogicalContext = (memberBlockIdxs, blockNameRaw, blockDescriptionRaw, possibleEventsRaw) => {
    const uniqueMemberIdxs = [...new Set(memberBlockIdxs)].sort((left, right) => left - right);

    const sourceBlocks = [];
    for (const idx of uniqueMemberIdxs) {
      const found = blockByIdx.get(idx);
      if (found) {
        sourceBlocks.push(found);
      }
    }
    sourceBlocks.sort(byOrder);
    if (sourceBlocks.length === 0) {
      return null;
    }

    for (const idx of uniqueMemberIdxs) {
      mappedIndices.add(idx);
    }

    const blockName = normalizeSpecialBlockName(
      blockNameRaw,
      defaultBlockNameForTag(sourceBlocks[0]?.tag || '')
    );
    const blockDescription = cleanText(blockDescriptionRaw || '', 300);

    // A block is non-interactive when its child count is <= 0; events are
    // cleared only when every source block is non-interactive.
    const requestedEvents = Array.isArray(possibleEventsRaw) ? [...possibleEventsRaw] : [];
    const hasInteractiveBlock = sourceBlocks.some(
      (block) => !(Number(block.childInteractiveCount || 0) <= 0)
    );
    const possibleEvents = hasInteractiveBlock
      ? resolveStructuralSingleEventType(blockName, requestedEvents)
      : [];

    // Member indices grouped by semantic label, in first-seen label order.
    const membersBySemantic = new Map();
    for (const idx of uniqueMemberIdxs) {
      const label = semanticByPhysicalBlockIdx.get(idx);
      if (!label) {
        continue;
      }
      const bucket = membersBySemantic.get(label);
      if (bucket) {
        bucket.push(idx);
      } else {
        membersBySemantic.set(label, [idx]);
      }
    }
    const semanticLabels = [...membersBySemantic.keys()];
    const semanticGroups = [...membersBySemantic].map(([blockSemantic, blockIdxs]) => ({
      blockIdxs: blockIdxs.join('.'),
      blockSemantic
    }));

    const context = {
      blockIdx: uniqueMemberIdxs[0],
      blockIdxs: uniqueMemberIdxs,
      blockIdxKey: uniqueMemberIdxs.join('.'),
      order: Math.min(...sourceBlocks.map((block) => block.order)),
      blockName,
      blockDescription,
      possibleEvents: possibleEvents.slice(0, 3),
      semanticLabels,
      semanticGroups,
      rows: [],
      sourceBlocks
    };

    for (const idx of uniqueMemberIdxs) {
      byPhysicalBlockIdx.set(idx, context);
    }
    return context;
  };

  const declaredBlocks = Array.isArray(specialResult?.logicalBlocks)
    ? specialResult.logicalBlocks
    : [];
  for (const { blockIdxs, blockName, blockDescription, possibleEvents } of declaredBlocks) {
    const context = createLogicalContext(blockIdxs, blockName, blockDescription, possibleEvents);
    if (context) {
      contexts.push(context);
    }
  }

  // Every physical block not claimed above becomes its own context.
  for (const block of blocks) {
    if (mappedIndices.has(block.blockIdx)) {
      continue;
    }
    const context = createLogicalContext([block.blockIdx], defaultBlockNameForTag(block.tag), '', []);
    if (context) {
      contexts.push(context);
    }
  }

  let fallbackContext = byPhysicalBlockIdx.get(-1);
  if (!fallbackContext) {
    fallbackContext = {
      blockIdx: -1,
      blockIdxs: [-1],
      blockIdxKey: '-1',
      order: Number.MAX_SAFE_INTEGER,
      blockName: 'ContentBlock',
      blockDescription: '',
      possibleEvents: ['item_click'],
      semanticLabels: [],
      semanticGroups: [],
      rows: [],
      sourceBlocks: []
    };
    contexts.push(fallbackContext);
    byPhysicalBlockIdx.set(-1, fallbackContext);
  }

  for (const row of csvRows) {
    const owner = byPhysicalBlockIdx.get(row.blockIdx) || fallbackContext;
    row.logicalBlockIdxKey = owner.blockIdxKey;
    row.blockSemantic = semanticByPhysicalBlockIdx.get(row.blockIdx) || '';
    owner.rows.push(row);
  }

  return contexts.sort(byOrder);
}
|
|
445
|
+
|
|
446
|
+
/**
 * Batches block contexts that need LLM disambiguation into row-limited
 * groups.
 *
 * Only contexts with more than one possible event and at least one row are
 * considered, in ascending `order`. A context with more than two possible
 * events and more than STANDALONE_BLOCK_MIN_ROWS rows forms a group by
 * itself. Other contexts accumulate into the current group, which is closed
 * before adding a context that would push it past LLM_GROUP_MAX_ROWS rows
 * (a single context is never split, so one group may exceed the limit).
 *
 * @param {Array<object>} [blockContexts=[]] - Contexts with
 *   `possibleEvents`, `rows` and `order`.
 * @returns {Array<{blocks: Array<object>, rowCount: number}>}
 */
function buildLlmGroups(blockContexts = []) {
  const candidates = blockContexts.filter(
    (context) => context.possibleEvents.length > 1 && context.rows.length > 0
  );
  candidates.sort((left, right) => left.order - right.order);

  const groups = [];
  let pending = { blocks: [], rowCount: 0 };

  const commitPending = () => {
    if (pending.blocks.length > 0) {
      groups.push(pending);
      pending = { blocks: [], rowCount: 0 };
    }
  };

  for (const candidate of candidates) {
    const rowCount = candidate.rows.length;
    const isStandalone = candidate.possibleEvents.length > 2
      && rowCount > STANDALONE_BLOCK_MIN_ROWS;

    if (isStandalone) {
      commitPending();
      groups.push({ blocks: [candidate], rowCount });
      continue;
    }

    const wouldOverflow = pending.rowCount > 0
      && pending.rowCount + rowCount > LLM_GROUP_MAX_ROWS;
    if (wouldOverflow) {
      commitPending();
    }
    pending.blocks.push(candidate);
    pending.rowCount += rowCount;
  }

  commitPending();
  return groups;
}
|
|
488
|
+
|
|
489
|
+
/**
 * Summarizes block contexts and LLM groups into a reporting artifact.
 *
 * Each context is classified by its number of possible events: none is
 * 'skipped', exactly one is 'direct', more is 'llm'. The fallback context
 * (`blockIdx` -1) is omitted when it holds no rows.
 *
 * @param {*} siteSummary - Summary text, passed through `cleanText` with a
 *   limit argument of 240.
 * @param {Array<object>} [blockContexts=[]] - Contexts to report.
 * @param {Array<object>} [llmGroups=[]] - Groups whose `rowCount` values
 *   are summed.
 * @returns {object} Object with `site_summary`, `possible_event_types`
 *   (first 24 distinct types in first-seen order), `blocks` and `stats`.
 */
function buildBlockAnalysisArtifact(siteSummary, blockContexts = [], llmGroups = []) {
  const modeCounts = { skipped: 0, direct: 0, llm: 0 };
  const distinctEventTypes = new Set();
  const blocks = [];

  for (const context of blockContexts) {
    const isUnusedFallback = context.blockIdx === -1 && context.rows.length === 0;
    if (isUnusedFallback) {
      continue;
    }

    const eventCount = context.possibleEvents.length;
    let mode = 'llm';
    if (eventCount === 0) {
      mode = 'skipped';
    } else if (eventCount === 1) {
      mode = 'direct';
    }
    modeCounts[mode] += 1;

    for (const eventType of context.possibleEvents) {
      distinctEventTypes.add(eventType);
    }

    blocks.push({
      blockIdxs: context.blockIdxKey,
      blockName: context.blockName,
      blockPossibleEvents: context.possibleEvents,
      blockSemantics: context.semanticLabels,
      blockSemanticGroups: context.semanticGroups,
      blockDescription: context.blockDescription,
      blockCssPath: resolveLogicalBlockCssPath(context.sourceBlocks),
      blockPosition: buildLogicalBlockPosition(context.sourceBlocks),
      rowCount: context.rows.length,
      mode
    });
  }

  let groupedRows = 0;
  for (const group of llmGroups) {
    groupedRows += group.rowCount || 0;
  }

  return {
    site_summary: cleanText(siteSummary, 240),
    possible_event_types: [...distinctEventTypes].slice(0, 24),
    blocks,
    stats: {
      total_blocks: blocks.length,
      skipped_blocks: modeCounts.skipped,
      direct_blocks: modeCounts.direct,
      llm_blocks: modeCounts.llm,
      llm_group_count: llmGroups.length,
      llm_group_rows: groupedRows
    }
  };
}
|
|
544
|
+
|
|
545
|
+
export {
|
|
546
|
+
unwrapTextResponse,
|
|
547
|
+
defaultBlockNameForTag,
|
|
548
|
+
normalizePossibleEvents,
|
|
549
|
+
parseSpecialBlockResponse,
|
|
550
|
+
buildBlockContexts,
|
|
551
|
+
buildLlmGroups,
|
|
552
|
+
buildBlockAnalysisArtifact
|
|
553
|
+
};
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
// Attribute keys used by default.
// NOTE(review): "producdt" in the third entry looks like a misspelling of
// "product"; it is a runtime string, so confirm no consumer depends on the
// current spelling before changing it.
const DEFAULT_ATTRIBUTE_KEYS = [
  'text',
  'page_area',
  'content_category(producdt/support/company/legal)',
  'is_external'
];

// Retry and input-size limits; presumably consumed by the CSV repair and
// full-analysis retry paths defined in other modules.
const CSV_REPAIR_ATTEMPTS = 3;
const FULL_ANALYSIS_RETRY_ATTEMPTS = 1;
const MAX_CSV_REPAIR_INPUT_CHARS = 20000;

// Row thresholds used when batching blocks into LLM groups.
const LLM_GROUP_MAX_ROWS = 60;
const STANDALONE_BLOCK_MIN_ROWS = 80;
|
|
14
|
+
|
|
15
|
+
export {
|
|
16
|
+
DEFAULT_ATTRIBUTE_KEYS,
|
|
17
|
+
CSV_REPAIR_ATTEMPTS,
|
|
18
|
+
FULL_ANALYSIS_RETRY_ATTEMPTS,
|
|
19
|
+
MAX_CSV_REPAIR_INPUT_CHARS,
|
|
20
|
+
LLM_GROUP_MAX_ROWS,
|
|
21
|
+
STANDALONE_BLOCK_MIN_ROWS
|
|
22
|
+
};
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import {
|
|
2
|
+
cleanText,
|
|
3
|
+
normalizeDestination
|
|
4
|
+
} from './event-analyzer-utils.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Derives event attributes from a CSV row and its block context.
 *
 * Only non-empty values are included. Keys appear in this order: `text`,
 * `block`, `destination`, `href`, `tag`, `context`. `destination` comes
 * from `normalizeDestination(row.href)`; the others are passed through
 * `cleanText` with a per-field limit argument.
 *
 * @param {object} row - Row providing `text`, `href`, `tag` and `context`.
 * @param {object} blockContext - Context providing `blockName`.
 * @param {object} [options={}] - Accepted but not read by this function.
 * @returns {object} The derived attributes.
 */
function buildDerivedAttributes(row, blockContext, options = {}) {
  // Producers run lazily and in order, so helper calls happen in a fixed sequence.
  const producers = [
    ['text', () => cleanText(row?.text || '', 120)],
    ['block', () => cleanText(blockContext?.blockName || '', 80)],
    ['destination', () => normalizeDestination(row?.href || '')],
    ['href', () => cleanText(row?.href || '', 240)],
    ['tag', () => cleanText(row?.tag || '', 40)],
    ['context', () => cleanText(row?.context || '', 200)]
  ];

  const attributes = {};
  for (const [key, produce] of producers) {
    const value = produce();
    if (value) {
      attributes[key] = value;
    }
  }
  return attributes;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Returns a copy of an event whose attributes are filled in from the row.
 *
 * Attributes already present on the event win; a derived attribute is only
 * applied when the event's value for that key is undefined, null, or blank
 * once stringified and trimmed. Attributes that are not a plain object
 * (including arrays) are discarded and replaced.
 *
 * @param {object} event - Event to expand; not mutated.
 * @param {object} row - Row used to derive attributes.
 * @param {object} blockContext - Block context used to derive attributes.
 * @param {object} [options={}] - Forwarded to `buildDerivedAttributes`.
 * @returns {object} Shallow copy of `event` with merged `attributes`.
 */
function expandEventAttributes(event, row, blockContext, options = {}) {
  const provided = event?.attributes;
  const hasUsableAttributes = provided
    && typeof provided === 'object'
    && !Array.isArray(provided);
  const attributes = hasUsableAttributes ? { ...provided } : {};

  const isBlank = (candidate) => (
    candidate === undefined || candidate === null || String(candidate).trim() === ''
  );

  const derived = buildDerivedAttributes(row, blockContext, options);
  Object.keys(derived).forEach((key) => {
    if (isBlank(attributes[key])) {
      attributes[key] = derived[key];
    }
  });

  return { ...event, attributes };
}
|
|
54
|
+
|
|
55
|
+
/**
 * Builds an event for a row from the first possible event type of its
 * block context, without consulting the LLM.
 *
 * @param {object} row - Row providing `csv_id` and attribute sources.
 * @param {object} blockContext - Context whose first `possibleEvents` entry
 *   is used.
 * @returns {?{csv_id: *, event_type: string, attributes: object}} The
 *   event, or null when the context has no usable first event type.
 */
function buildDirectEvent(row, blockContext) {
  const soleEventType = String(blockContext?.possibleEvents?.[0] || '').trim();
  return soleEventType
    ? {
      csv_id: row.csv_id,
      event_type: soleEventType,
      attributes: buildDerivedAttributes(row, blockContext)
    }
    : null;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Normalizes an event's type against its block context.
 *
 * When the type normalizes to a falsy value, the context's first possible
 * event (or '' when there is none) is used instead. A normalized type that
 * is not in the context's non-empty allowed list is kept, and a warning is
 * logged.
 *
 * @param {{normalizeEventType: function(*): *}} responseHelper - Provides
 *   the normalization routine.
 * @param {object} event - Event to check; not mutated.
 * @param {object} blockContext - Context providing `possibleEvents` and
 *   `blockIdxKey`.
 * @returns {object} Shallow copy of `event` with the resolved `event_type`.
 */
function enforceAllowedEventType(responseHelper, event, blockContext) {
  const eventType = responseHelper.normalizeEventType(event?.event_type);

  if (!eventType) {
    const firstAllowed = String(blockContext?.possibleEvents?.[0] || '').trim();
    return { ...event, event_type: firstAllowed };
  }

  const scope = Array.isArray(blockContext?.possibleEvents)
    ? blockContext.possibleEvents
    : [];
  const isOutOfScope = scope.length > 0 && !scope.includes(eventType);
  if (isOutOfScope) {
    // Deliberately advisory: the model's answer is kept rather than overridden.
    console.warn(
      `[LLM] Event type "${eventType}" is out of block scope for blockIdx=${blockContext.blockIdxKey}. Keeping analysis output.`
    );
  }

  return { ...event, event_type: eventType };
}
|
|
91
|
+
|
|
92
|
+
export {
|
|
93
|
+
buildDerivedAttributes,
|
|
94
|
+
expandEventAttributes,
|
|
95
|
+
buildDirectEvent,
|
|
96
|
+
enforceAllowedEventType
|
|
97
|
+
};
|