page-analyzer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js ADDED
@@ -0,0 +1,303 @@
1
+ /**
2
+ * page-analyzer — Standalone module
3
+ *
4
+ * Simplest usage — just pass a URL:
5
+ *
6
+ * import { analyzeUrl } from './page-analyzer/index.js';
7
+ *
8
+ * const result = await analyzeUrl('https://example.com', {
9
+ * llm: {
10
+ * apiKey: 'sk-...',
11
+ * apiEndpoint: 'https://api.openai.com/v1/chat/completions',
12
+ * model: 'gpt-4',
13
+ * },
14
+ * showEvents: true
15
+ * });
16
+ *
17
+ * Or step-by-step with analyzePageEvents() for pre-fetched data.
18
+ *
19
+ * npm dependencies: cheerio, csv-parse, playwright
20
+ */
21
+
22
+ import { HtmlParser } from './html-parser.js';
23
+ import { assignBlocksToElements } from './extractors/block-assigner.js';
24
+ import { CsvExporter } from './csv-exporter.js';
25
+ import { OpenAiProvider } from './llm/providers/openai-provider.js';
26
+ import { EventAnalyzer } from './llm/analyzers/event-analyzer/event-analyzer.js';
27
+ import { PageExtractor } from './page-extractor.js';
28
+
29
/**
 * Report whether a value is a non-null object that is not an array.
 *
 * Always returns a strict boolean, so the result is safe to store or
 * compare as well as to use in a condition.
 *
 * @param {*} value - Value to inspect.
 * @returns {boolean} `true` for non-null, non-array objects; `false` otherwise.
 */
function isObject(value) {
  // `typeof null === 'object'`, so null must be ruled out explicitly.
  return value !== null && typeof value === 'object' && !Array.isArray(value);
}
32
+
33
/**
 * Coerce the caller-supplied display flags into strict booleans.
 *
 * Requesting events always turns block index output on as well.
 *
 * @param {Object} [options] - Raw flags; may be null or undefined.
 * @returns {{showEvents: boolean, showBlockIdx: boolean}} Normalized flags.
 */
function normalizeDisplayOptions(options = {}) {
  const eventsRequested = Boolean(options?.showEvents);
  const blockIdxRequested = eventsRequested
    ? true
    : Boolean(options?.showBlockIdx);

  return { showEvents: eventsRequested, showBlockIdx: blockIdxRequested };
}
40
+
41
/**
 * Build the trimmed view of one analyzed block.
 *
 * The base fields are always present. Index alignment fields are added when
 * `displayOptions.showBlockIdx` is set, and event fields are added when
 * `displayOptions.showEvents` is set. Array-valued fields fall back to an
 * empty array when the source value is not an array.
 *
 * @param {*} block - Raw block record; non-objects are treated as empty.
 * @param {{showEvents: boolean, showBlockIdx: boolean}} displayOptions - Normalized display flags.
 * @returns {Object} Compacted block.
 */
function compactBlockAnalysisBlock(block, displayOptions) {
  const raw = isObject(block) ? block : {};
  const listOrEmpty = (candidate) => (Array.isArray(candidate) ? candidate : []);

  const compact = {
    blockName: raw.blockName,
    blockDescription: raw.blockDescription,
    blockSemantics: listOrEmpty(raw.blockSemantics),
    blockCssPath: raw.blockCssPath,
    blockPosition: raw.blockPosition
  };

  if (displayOptions.showBlockIdx) {
    Object.assign(compact, {
      blockIdxs: raw.blockIdxs,
      blockSemanticGroups: listOrEmpty(raw.blockSemanticGroups),
      rowCount: raw.rowCount
    });
  }

  if (displayOptions.showEvents) {
    Object.assign(compact, {
      blockPossibleEvents: listOrEmpty(raw.blockPossibleEvents),
      mode: raw.mode
    });
  }

  return compact;
}
68
+
69
/**
 * Build the trimmed view of the whole block analysis.
 *
 * Always carries `site_summary` and the compacted `blocks`. With
 * `showEvents` the possible event types and the full `stats` object are
 * included; without it, `stats` (when present) is reduced to an integer
 * `total_blocks` count.
 *
 * @param {*} blockAnalysis - Raw block analysis; non-objects are treated as empty.
 * @param {{showEvents: boolean, showBlockIdx: boolean}} displayOptions - Normalized display flags.
 * @returns {Object} Compacted block analysis.
 */
function compactBlockAnalysis(blockAnalysis, displayOptions) {
  const raw = isObject(blockAnalysis) ? blockAnalysis : {};
  const toCompactBlock = (block) => compactBlockAnalysisBlock(block, displayOptions);

  const compact = {
    site_summary: raw.site_summary,
    blocks: Array.isArray(raw.blocks) ? raw.blocks.map(toCompactBlock) : []
  };

  if (displayOptions.showEvents) {
    compact.possible_event_types = Array.isArray(raw.possible_event_types)
      ? raw.possible_event_types
      : [];
  }

  if (isObject(raw.stats)) {
    if (displayOptions.showEvents) {
      compact.stats = raw.stats;
    } else {
      // Only the block count survives; anything unparsable collapses to 0.
      const parsedTotal = Number.parseInt(String(raw.stats.total_blocks || 0), 10);
      compact.stats = { total_blocks: parsedTotal || 0 };
    }
  }

  return compact;
}
94
+
95
/**
 * Shape the analyzer output for the caller.
 *
 * With `showEvents` the analysis object is returned untouched. Otherwise a
 * new object is built that drops the event-only keys and replaces
 * `block_analysis` with its compacted form; every other key is copied as-is.
 *
 * @param {*} analysis - Raw analysis; non-objects are treated as empty.
 * @param {{showEvents: boolean, showBlockIdx: boolean}} displayOptions - Normalized display flags.
 * @returns {Object} Analysis object to expose.
 */
function buildAnalysisResult(analysis, displayOptions) {
  const raw = isObject(analysis) ? analysis : {};
  if (displayOptions.showEvents) {
    return raw;
  }

  const eventOnlyKeys = ['events_by_node', 'event_types_summary', 'new_event_types'];
  const trimmed = {};

  Object.entries(raw).forEach(([key, value]) => {
    if (eventOnlyKeys.includes(key)) {
      return;
    }
    if (key === 'block_analysis') {
      trimmed[key] = compactBlockAnalysis(value, displayOptions);
    } else {
      trimmed[key] = value;
    }
  });

  return trimmed;
}
118
+
119
/**
 * Assemble the final result object returned to the caller.
 *
 * `title`, `parseMetrics` and `analysis` are always present. With
 * `showEvents`, `elements`, `csvContent` and `links` are added; with only
 * `showBlockIdx`, just `csvContent` is added.
 *
 * @param {Object} parts
 * @param {Array} parts.elements - Parsed page elements.
 * @param {string} parts.csvContent - CSV built from the elements.
 * @param {Object} parts.pageData - Parser output (`title`, `metrics`, `links` are read).
 * @param {*} parts.analysis - Raw analysis from the event analyzer.
 * @param {{showEvents: boolean, showBlockIdx: boolean}} parts.displayOptions - Normalized display flags.
 * @returns {Object} Page analysis result.
 */
function buildPageAnalysisResult({ elements, csvContent, pageData, analysis, displayOptions }) {
  const page = {
    title: pageData.title,
    parseMetrics: pageData.metrics,
    analysis: buildAnalysisResult(analysis, displayOptions)
  };

  if (!displayOptions.showEvents) {
    if (displayOptions.showBlockIdx) {
      page.csvContent = csvContent;
    }
    return page;
  }

  page.elements = elements;
  page.csvContent = csvContent;
  page.links = pageData.links;
  return page;
}
142
+
143
/**
 * One-call entry: pass a URL, get back everything.
 *
 * Playwright → HTML parse → block assign → CSV → LLM block/event analysis
 *
 * The URL and LLM settings are validated first; the page is then extracted
 * with PageExtractor and the extracted bundle is handed to
 * analyzePageEvents(), whose result is returned unchanged.
 *
 * @param {string} url - URL to analyze
 * @param {Object} options
 * @param {Object} options.llm - { apiKey, apiEndpoint, model, ... }
 * @param {Array} [options.knownEventTypes] - Accumulated event types for consistency
 * @param {Object} [options.parserConfig] - HtmlParser config overrides
 * @param {Object} [options.extractorConfig] - PageExtractor config overrides
 * @param {boolean} [options.showEvents=false] - Include event arrays and full event-related metadata.
 *   Also enables node-level event classification.
 * @param {boolean} [options.showBlockIdx=false] - Include CSV/block index alignment fields.
 * @returns {Promise<Object>} Analysis result. Event and idx fields are omitted unless requested.
 * @throws {Error} If `url` is empty or `options.llm` lacks apiKey, apiEndpoint or model.
 */
export async function analyzeUrl(url, options = {}) {
  // The `= {}` default only covers `undefined`; `?? {}` lets an explicit `null`
  // reach the validation below instead of failing inside the destructuring.
  const {
    llm: llmConfig,
    knownEventTypes,
    parserConfig,
    extractorConfig,
    showEvents,
    showBlockIdx
  } = options ?? {};

  if (!url) throw new Error('url is required');
  if (!llmConfig?.apiKey || !llmConfig?.apiEndpoint || !llmConfig?.model) {
    throw new Error('options.llm.apiKey, apiEndpoint, and model are required');
  }

  // Step 0: Playwright extraction
  console.log(`[page-analyzer] Extracting ${url} ...`);
  const extractor = new PageExtractor(extractorConfig);
  const bundle = await extractor.extract(url);

  // analyzePageEvents() treats both arrays as optional, so the progress log
  // must not be the thing that crashes when one of them is missing.
  const blockCount = bundle.blocks?.length ?? 0;
  const geometryCount = bundle.elementGeometries?.length ?? 0;
  console.log(`[page-analyzer] Extracted: ${blockCount} blocks, ${geometryCount} geometries`);

  // Derive domain from URL. Best effort: the domain is only passed along as
  // context, so an unparsable URL leaves it empty rather than aborting.
  let domain = '';
  try {
    domain = new URL(url).hostname.replace(/^www\./, '');
  } catch {
    /* ignore */
  }

  return analyzePageEvents({
    html: bundle.html,
    url,
    blocks: bundle.blocks,
    elementGeometries: bundle.elementGeometries,
    llm: llmConfig,
    knownEventTypes,
    parserConfig,
    showEvents,
    showBlockIdx,
    domain,
    nodeId: `${domain}-root`
  });
}
191
+
192
/**
 * Run the full pipeline: HTML parse → block assign → CSV → LLM block/event analysis.
 *
 * @param {Object} input
 * @param {string} input.html - Raw HTML of the crawled page
 * @param {string} input.url - Page URL
 * @param {Array} input.blocks - Visual blocks from Playwright extraction
 * @param {Array} input.elementGeometries - Element geometry records
 * @param {string} [input.markdown] - Markdown content (reserved for future use)
 * @param {Object} input.llm - LLM provider config
 * @param {string} input.llm.apiKey - API key
 * @param {string} input.llm.apiEndpoint - API endpoint URL
 * @param {string} input.llm.model - Model name
 * @param {number} [input.llm.maxTokens] - Max tokens
 * @param {number} [input.llm.temperature] - Temperature
 * @param {number} [input.llm.timeout] - Request timeout ms
 * @param {number} [input.llm.maxRetries] - Max retries
 * @param {Array} [input.llm.knownEventTypes] - Pre-configured known event types
 * @param {Function} [input.llm.interactionLogger] - Optional interaction logger
 * @param {Array} [input.knownEventTypes] - Accumulated event types for consistency
 * @param {Object} [input.parserConfig] - HtmlParser config overrides
 * @param {boolean} [input.showEvents=false] - Include event arrays and full event-related metadata.
 *   Also enables node-level event classification.
 * @param {boolean} [input.showBlockIdx=false] - Include CSV/block index alignment fields.
 * @param {string} [input.nodeId] - Node ID for logging context
 * @param {string} [input.domain] - Domain for logging context
 * @returns {Promise<Object>} Analysis result. Event and idx fields are omitted unless requested.
 * @throws {Error} If `html`, `url`, or any of llm.apiKey / llm.apiEndpoint / llm.model is missing.
 */
export async function analyzePageEvents(input) {
  // A missing or null input falls through to the explicit validation errors
  // below instead of a destructuring TypeError.
  const {
    html,
    url,
    blocks = [],
    elementGeometries = [],
    markdown = '',
    llm: llmConfig,
    knownEventTypes = [],
    parserConfig = {},
    showEvents = false,
    showBlockIdx = false,
    nodeId = '',
    domain = ''
  } = input ?? {};

  // Validate before doing any other work.
  if (!html) {
    throw new Error('html is required');
  }
  if (!url) {
    throw new Error('url is required');
  }
  if (!llmConfig?.apiKey || !llmConfig?.apiEndpoint || !llmConfig?.model) {
    throw new Error('llm.apiKey, llm.apiEndpoint, and llm.model are required');
  }

  const displayOptions = normalizeDisplayOptions({ showEvents, showBlockIdx });

  // Step 1: Parse HTML → elements
  const htmlParser = new HtmlParser(parserConfig);
  const pageData = htmlParser.parse(html, url);
  const elements = Array.isArray(pageData.elements) ? pageData.elements : [];

  // Step 2: Assign blocks to elements
  assignBlocksToElements(elements, blocks, elementGeometries, url);

  // Step 3: Build CSV (falls back to the label 'page' when no nodeId is given)
  const csvExporter = new CsvExporter();
  const csvContent = csvExporter.buildCsvContent(nodeId || 'page', elements);

  // Step 4: LLM event analysis
  const provider = new OpenAiProvider({
    apiKey: llmConfig.apiKey,
    apiEndpoint: llmConfig.apiEndpoint,
    model: llmConfig.model,
    maxTokens: llmConfig.maxTokens,
    temperature: llmConfig.temperature,
    timeout: llmConfig.timeout,
    maxRetries: llmConfig.maxRetries,
    interactionLogger: llmConfig.interactionLogger
  });

  const eventAnalyzer = new EventAnalyzer(provider, llmConfig, {
    domain,
    nodeId,
    url
  });

  const analysis = await eventAnalyzer.analyzeEvents(
    csvContent,
    markdown,
    knownEventTypes,
    {
      blocks,
      // Node-level event classification is only requested together with showEvents.
      analyzeNodeEvents: displayOptions.showEvents
    }
  );

  return buildPageAnalysisResult({
    elements,
    csvContent,
    pageData,
    analysis,
    displayOptions
  });
}
295
+
296
+ // Re-export individual components for advanced usage
297
+ export { HtmlParser } from './html-parser.js';
298
+ export { assignBlocksToElements } from './extractors/block-assigner.js';
299
+ export { CsvExporter } from './csv-exporter.js';
300
+ export { OpenAiProvider } from './llm/providers/openai-provider.js';
301
+ export { BaseLlmProvider } from './llm/providers/base-provider.js';
302
+ export { EventAnalyzer } from './llm/analyzers/event-analyzer/event-analyzer.js';
303
+ export { PageExtractor } from './page-extractor.js';