page-analyzer 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/csv-exporter.js +192 -0
- package/extractors/block-assigner.js +281 -0
- package/extractors/context-extractor.js +275 -0
- package/extractors/css-selector-builder.js +202 -0
- package/extractors/pt-selector-builder.js +344 -0
- package/html-parser.js +206 -0
- package/index.js +303 -0
- package/llm/analyzers/event-analyzer/event-analyzer-blocks.js +553 -0
- package/llm/analyzers/event-analyzer/event-analyzer-constants.js +22 -0
- package/llm/analyzers/event-analyzer/event-analyzer-events.js +97 -0
- package/llm/analyzers/event-analyzer/event-analyzer-input.js +168 -0
- package/llm/analyzers/event-analyzer/event-analyzer-metadata.js +15 -0
- package/llm/analyzers/event-analyzer/event-analyzer-prompt.js +71 -0
- package/llm/analyzers/event-analyzer/event-analyzer-response.js +290 -0
- package/llm/analyzers/event-analyzer/event-analyzer-utils.js +96 -0
- package/llm/analyzers/event-analyzer/event-analyzer.js +546 -0
- package/llm/analyzers/prompts/event-analysis.txt +52 -0
- package/llm/analyzers/prompts/special-block-confirmation.txt +127 -0
- package/llm/providers/base-provider.js +64 -0
- package/llm/providers/openai-provider.js +168 -0
- package/llm/utils/event-csv.js +276 -0
- package/models/context.js +44 -0
- package/package.json +16 -0
- package/page-extractor.js +215 -0
- package/utils/selector-utils.js +31 -0
- package/utils/text-utils.js +11 -0
- package/utils/url-utils.js +43 -0
- package/vendor/extract-blocks.js +903 -0
package/index.js
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* page-analyzer — Standalone module
|
|
3
|
+
*
|
|
4
|
+
* Simplest usage — just pass a URL:
|
|
5
|
+
*
|
|
6
|
+
* import { analyzeUrl } from './page-analyzer/index.js';
|
|
7
|
+
*
|
|
8
|
+
* const result = await analyzeUrl('https://example.com', {
|
|
9
|
+
* llm: {
|
|
10
|
+
* apiKey: 'sk-...',
|
|
11
|
+
* apiEndpoint: 'https://api.openai.com/v1/chat/completions',
|
|
12
|
+
* model: 'gpt-4',
|
|
13
|
+
* },
|
|
14
|
+
* showEvents: true
|
|
15
|
+
* });
|
|
16
|
+
*
|
|
17
|
+
* Or step-by-step with analyzePageEvents() for pre-fetched data.
|
|
18
|
+
*
|
|
19
|
+
* npm dependencies: cheerio, csv-parse, playwright
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import { HtmlParser } from './html-parser.js';
|
|
23
|
+
import { assignBlocksToElements } from './extractors/block-assigner.js';
|
|
24
|
+
import { CsvExporter } from './csv-exporter.js';
|
|
25
|
+
import { OpenAiProvider } from './llm/providers/openai-provider.js';
|
|
26
|
+
import { EventAnalyzer } from './llm/analyzers/event-analyzer/event-analyzer.js';
|
|
27
|
+
import { PageExtractor } from './page-extractor.js';
|
|
28
|
+
|
|
29
|
+
/**
 * Report whether `value` is a non-null, non-array object.
 *
 * Always yields a strict boolean: the previous `value && ...` chain leaked the
 * falsy operand itself (`null`, `undefined`, `0`, `''`) to the caller.
 * Truthiness is unchanged, so existing `isObject(x) ? a : b` call sites behave
 * exactly as before.
 *
 * @param {*} value - Candidate value of any type.
 * @returns {boolean} `true` when `typeof value === 'object'`, the value is not
 *   `null`, and it is not an array; `false` otherwise.
 */
function isObject(value) {
  return value !== null && typeof value === 'object' && !Array.isArray(value);
}
|
|
32
|
+
|
|
33
|
+
/**
 * Coerce caller-supplied display flags into a pair of strict booleans.
 *
 * Requesting events implies block indexes: `showBlockIdx` is forced to `true`
 * whenever `showEvents` is truthy.
 *
 * @param {Object} [options] - Raw flags; a nullish value is tolerated.
 * @param {*} [options.showEvents] - Truthy to include event data.
 * @param {*} [options.showBlockIdx] - Truthy to include block index fields.
 * @returns {{showEvents: boolean, showBlockIdx: boolean}} Normalized flags.
 */
function normalizeDisplayOptions(options = {}) {
  const wantsEvents = Boolean(options?.showEvents);
  const wantsBlockIdx = wantsEvents ? true : Boolean(options?.showBlockIdx);

  return { showEvents: wantsEvents, showBlockIdx: wantsBlockIdx };
}
|
|
40
|
+
|
|
41
|
+
/**
 * Project one block-analysis entry onto the fields selected by the display flags.
 *
 * The five descriptive fields are always copied. Index-alignment fields are
 * added when `displayOptions.showBlockIdx` is set, and event fields when
 * `displayOptions.showEvents` is set. Array-typed fields fall back to `[]`
 * when the source value is not an array; any other source property is dropped.
 *
 * @param {*} block - Raw block entry; non-objects are treated as `{}`.
 * @param {{showEvents: boolean, showBlockIdx: boolean}} displayOptions - Normalized flags.
 * @returns {Object} Newly built compact block record.
 */
function compactBlockAnalysisBlock(block, displayOptions) {
  const raw = isObject(block) ? block : {};
  const listOrEmpty = (candidate) => (Array.isArray(candidate) ? candidate : []);

  const compact = {
    blockName: raw.blockName,
    blockDescription: raw.blockDescription,
    blockSemantics: listOrEmpty(raw.blockSemantics),
    blockCssPath: raw.blockCssPath,
    blockPosition: raw.blockPosition
  };

  if (displayOptions.showBlockIdx) {
    Object.assign(compact, {
      blockIdxs: raw.blockIdxs,
      blockSemanticGroups: listOrEmpty(raw.blockSemanticGroups),
      rowCount: raw.rowCount
    });
  }

  if (displayOptions.showEvents) {
    Object.assign(compact, {
      blockPossibleEvents: listOrEmpty(raw.blockPossibleEvents),
      mode: raw.mode
    });
  }

  return compact;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Build the trimmed `block_analysis` payload for the requested display flags.
 *
 * Always emits `site_summary` and `blocks` (each entry compacted via
 * `compactBlockAnalysisBlock`). `possible_event_types` is added only when
 * events are shown. When the source has an object-valued `stats`, it is passed
 * through as-is with events shown, otherwise reduced to an integer
 * `total_blocks` (0 when missing or unparseable).
 *
 * @param {*} blockAnalysis - Raw block analysis; non-objects are treated as `{}`.
 * @param {{showEvents: boolean, showBlockIdx: boolean}} displayOptions - Normalized flags.
 * @returns {Object} Newly built compact block analysis.
 */
function compactBlockAnalysis(blockAnalysis, displayOptions) {
  const raw = isObject(blockAnalysis) ? blockAnalysis : {};

  let compactBlocks = [];
  if (Array.isArray(raw.blocks)) {
    compactBlocks = raw.blocks.map((entry) => compactBlockAnalysisBlock(entry, displayOptions));
  }

  const compact = {
    site_summary: raw.site_summary,
    blocks: compactBlocks
  };

  if (displayOptions.showEvents) {
    const eventTypes = raw.possible_event_types;
    compact.possible_event_types = Array.isArray(eventTypes) ? eventTypes : [];
  }

  if (isObject(raw.stats)) {
    if (displayOptions.showEvents) {
      compact.stats = raw.stats;
    } else {
      // Without events only the block count is exposed, coerced to an integer.
      const totalBlocks = Number.parseInt(String(raw.stats.total_blocks || 0), 10) || 0;
      compact.stats = { total_blocks: totalBlocks };
    }
  }

  return compact;
}
|
|
94
|
+
|
|
95
|
+
/**
 * Shape the analyzer output according to the display flags.
 *
 * With `showEvents` set, the analysis object is returned as-is (same
 * reference). Otherwise a new object is built that omits the event-only keys
 * (`events_by_node`, `event_types_summary`, `new_event_types`) and replaces
 * `block_analysis` with its compacted form; all other keys are copied over.
 *
 * @param {*} analysis - Raw analysis; non-objects are treated as `{}`.
 * @param {{showEvents: boolean, showBlockIdx: boolean}} displayOptions - Normalized flags.
 * @returns {Object} The original analysis or a trimmed copy of it.
 */
function buildAnalysisResult(analysis, displayOptions) {
  const raw = isObject(analysis) ? analysis : {};
  if (displayOptions.showEvents) {
    return raw;
  }

  const eventOnlyKeys = new Set(['events_by_node', 'event_types_summary', 'new_event_types']);
  const trimmed = {};

  Object.entries(raw).forEach(([key, value]) => {
    if (eventOnlyKeys.has(key)) {
      return;
    }
    if (key === 'block_analysis') {
      trimmed[key] = compactBlockAnalysis(value, displayOptions);
    } else {
      trimmed[key] = value;
    }
  });

  return trimmed;
}
|
|
118
|
+
|
|
119
|
+
/**
 * Assemble the object returned to callers of the analysis pipeline.
 *
 * `title`, `parseMetrics` and the shaped `analysis` are always present.
 * With `showEvents`, `elements`, `csvContent` and `links` are appended; with
 * only `showBlockIdx`, just `csvContent` is appended.
 *
 * @param {Object} parts
 * @param {Array} parts.elements - Parsed page elements.
 * @param {string} parts.csvContent - CSV built from the elements.
 * @param {Object} parts.pageData - Parser output; `title`, `metrics`, `links` are read.
 * @param {*} parts.analysis - Raw analyzer output.
 * @param {{showEvents: boolean, showBlockIdx: boolean}} parts.displayOptions - Normalized flags.
 * @returns {Object} Page analysis result.
 */
function buildPageAnalysisResult({
  elements,
  csvContent,
  pageData,
  analysis,
  displayOptions
}) {
  const base = {
    title: pageData.title,
    parseMetrics: pageData.metrics,
    analysis: buildAnalysisResult(analysis, displayOptions)
  };

  if (displayOptions.showEvents) {
    return { ...base, elements, csvContent, links: pageData.links };
  }

  if (displayOptions.showBlockIdx) {
    return { ...base, csvContent };
  }

  return base;
}
|
|
142
|
+
|
|
143
|
+
/**
 * One-call entry: pass a URL, get back everything.
 *
 * Playwright → HTML parse → block assign → CSV → LLM block/event analysis
 *
 * Arguments are validated before any extraction work starts. The domain
 * (hostname without a leading `www.`) is derived from the URL on a best-effort
 * basis; when the URL cannot be parsed, both `domain` and `nodeId` are passed
 * on as empty strings so that `analyzePageEvents` applies its own defaults.
 *
 * @param {string} url - URL to analyze
 * @param {Object} options
 * @param {Object} options.llm - { apiKey, apiEndpoint, model, ... }
 * @param {Array} [options.knownEventTypes] - Accumulated event types for consistency
 * @param {Object} [options.parserConfig] - HtmlParser config overrides
 * @param {Object} [options.extractorConfig] - PageExtractor config overrides
 * @param {boolean} [options.showEvents=false] - Include event arrays and full event-related metadata.
 *   Also enables node-level event classification.
 * @param {boolean} [options.showBlockIdx=false] - Include CSV/block index alignment fields.
 * @returns {Promise<Object>} Analysis result. Event and idx fields are omitted unless requested.
 * @throws {Error} When `url` is missing, or `options.llm` lacks `apiKey`, `apiEndpoint` or `model`.
 */
export async function analyzeUrl(url, options = {}) {
  // `?? {}` lets an explicit `null` reach the validation errors below instead
  // of failing with an opaque destructuring TypeError.
  const {
    llm: llmConfig,
    knownEventTypes,
    parserConfig,
    extractorConfig,
    showEvents,
    showBlockIdx
  } = options ?? {};

  if (!url) throw new Error('url is required');
  if (!llmConfig?.apiKey || !llmConfig?.apiEndpoint || !llmConfig?.model) {
    throw new Error('options.llm.apiKey, apiEndpoint, and model are required');
  }

  // Step 0: Playwright extraction
  console.log(`[page-analyzer] Extracting ${url} ...`);
  const extractor = new PageExtractor(extractorConfig);
  const bundle = await extractor.extract(url);

  // The progress log must not crash the pipeline: analyzePageEvents accepts
  // missing blocks/geometries (it defaults them to []), so count defensively.
  const blockCount = bundle.blocks?.length ?? 0;
  const geometryCount = bundle.elementGeometries?.length ?? 0;
  console.log(`[page-analyzer] Extracted: ${blockCount} blocks, ${geometryCount} geometries`);

  // Derive domain from URL (best effort — an unparseable URL leaves it empty).
  let domain = '';
  try {
    domain = new URL(url).hostname.replace(/^www\./, '');
  } catch {
    /* ignore: domain is only used as context for analyzePageEvents */
  }

  return analyzePageEvents({
    html: bundle.html,
    url,
    blocks: bundle.blocks,
    elementGeometries: bundle.elementGeometries,
    llm: llmConfig,
    knownEventTypes,
    parserConfig,
    showEvents,
    showBlockIdx,
    domain,
    // Avoid the meaningless "-root" id when no domain could be derived; an
    // empty nodeId lets analyzePageEvents fall back to its own default.
    nodeId: domain ? `${domain}-root` : ''
  });
}
|
|
191
|
+
|
|
192
|
+
/**
 * Run the full pipeline: HTML parse → block assign → CSV → LLM block/event analysis.
 *
 * Required fields are validated before any work is done. Array-typed and
 * config inputs that are missing, `null`, or of the wrong type are normalized
 * to empty defaults, mirroring how the parsed `elements` list is handled.
 *
 * @param {Object} input
 * @param {string} input.html - Raw HTML of the crawled page
 * @param {string} input.url - Page URL
 * @param {Array} [input.blocks] - Visual blocks from Playwright extraction
 * @param {Array} [input.elementGeometries] - Element geometry records
 * @param {string} [input.markdown] - Markdown content; passed through to
 *   EventAnalyzer.analyzeEvents (reserved for future use)
 * @param {Object} input.llm - LLM provider config
 * @param {string} input.llm.apiKey - API key
 * @param {string} input.llm.apiEndpoint - API endpoint URL
 * @param {string} input.llm.model - Model name
 * @param {number} [input.llm.maxTokens] - Max tokens
 * @param {number} [input.llm.temperature] - Temperature
 * @param {number} [input.llm.timeout] - Request timeout ms
 * @param {number} [input.llm.maxRetries] - Max retries
 * @param {Array} [input.llm.knownEventTypes] - Pre-configured known event types.
 *   Not read here; the whole `llm` object is handed to EventAnalyzer
 *   (presumably consumed there — verify against EventAnalyzer).
 * @param {Function} [input.llm.interactionLogger] - Optional interaction logger
 * @param {Array} [input.knownEventTypes] - Accumulated event types for consistency
 * @param {Object} [input.parserConfig] - HtmlParser config overrides
 * @param {boolean} [input.showEvents=false] - Include event arrays and full event-related metadata.
 *   Also enables node-level event classification.
 * @param {boolean} [input.showBlockIdx=false] - Include CSV/block index alignment fields.
 * @param {string} [input.nodeId] - Node ID for logging context; the CSV uses 'page' when empty
 * @param {string} [input.domain] - Domain for logging context
 * @returns {Promise<Object>} Analysis result. Event and idx fields are omitted unless requested.
 * @throws {Error} When `html`, `url`, or any of `llm.apiKey` / `llm.apiEndpoint` / `llm.model` is missing.
 */
export async function analyzePageEvents(input) {
  // `?? {}` turns a missing/null input into the descriptive validation errors
  // below rather than an opaque destructuring TypeError.
  const {
    html,
    url,
    blocks,
    elementGeometries,
    markdown = '',
    llm: llmConfig,
    knownEventTypes,
    parserConfig,
    showEvents = false,
    showBlockIdx = false,
    nodeId = '',
    domain = ''
  } = input ?? {};

  if (!html) {
    throw new Error('html is required');
  }
  if (!url) {
    throw new Error('url is required');
  }
  if (!llmConfig?.apiKey || !llmConfig?.apiEndpoint || !llmConfig?.model) {
    throw new Error('llm.apiKey, llm.apiEndpoint, and llm.model are required');
  }

  // Destructuring defaults only cover `undefined`; normalize explicitly so a
  // `null` (or otherwise non-array) value never reaches the downstream steps.
  const pageBlocks = Array.isArray(blocks) ? blocks : [];
  const geometries = Array.isArray(elementGeometries) ? elementGeometries : [];
  const knownTypes = Array.isArray(knownEventTypes) ? knownEventTypes : [];
  const displayOptions = normalizeDisplayOptions({ showEvents, showBlockIdx });

  // Step 1: Parse HTML → elements
  const htmlParser = new HtmlParser(parserConfig ?? {});
  const pageData = htmlParser.parse(html, url);
  const elements = Array.isArray(pageData.elements) ? pageData.elements : [];

  // Step 2: Assign blocks to elements
  assignBlocksToElements(elements, pageBlocks, geometries, url);

  // Step 3: Build CSV
  const csvExporter = new CsvExporter();
  const csvContent = csvExporter.buildCsvContent(nodeId || 'page', elements);

  // Step 4: LLM event analysis
  const provider = new OpenAiProvider({
    apiKey: llmConfig.apiKey,
    apiEndpoint: llmConfig.apiEndpoint,
    model: llmConfig.model,
    maxTokens: llmConfig.maxTokens,
    temperature: llmConfig.temperature,
    timeout: llmConfig.timeout,
    maxRetries: llmConfig.maxRetries,
    interactionLogger: llmConfig.interactionLogger
  });

  const eventAnalyzer = new EventAnalyzer(provider, llmConfig, {
    domain,
    nodeId,
    url
  });

  const analysis = await eventAnalyzer.analyzeEvents(
    csvContent,
    markdown,
    knownTypes,
    {
      blocks: pageBlocks,
      // Node-level event classification runs only when events are displayed.
      analyzeNodeEvents: displayOptions.showEvents
    }
  );

  return buildPageAnalysisResult({
    elements,
    csvContent,
    pageData,
    analysis,
    displayOptions
  });
}
|
|
295
|
+
|
|
296
|
+
// Re-export individual components for advanced usage
|
|
297
|
+
export { HtmlParser } from './html-parser.js';
|
|
298
|
+
export { assignBlocksToElements } from './extractors/block-assigner.js';
|
|
299
|
+
export { CsvExporter } from './csv-exporter.js';
|
|
300
|
+
export { OpenAiProvider } from './llm/providers/openai-provider.js';
|
|
301
|
+
export { BaseLlmProvider } from './llm/providers/base-provider.js';
|
|
302
|
+
export { EventAnalyzer } from './llm/analyzers/event-analyzer/event-analyzer.js';
|
|
303
|
+
export { PageExtractor } from './page-extractor.js';
|