@monostate/node-scraper 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/browser-session.js +551 -0
- package/index.d.ts +97 -0
- package/index.js +4 -0
- package/lightpanda-server.js +151 -0
- package/package.json +6 -1
|
@@ -0,0 +1,551 @@
|
|
|
1
|
+
import { getLightPandaServer, stopLightPandaServer } from './lightpanda-server.js';
|
|
2
|
+
import browserPool from './browser-pool.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Reason codes recorded in the session history when a session switches from
 * LightPanda to Chrome. Frozen so that no caller can alter the shared table.
 */
const FALLBACK_REASONS = Object.freeze({
  SCREENSHOT: 'screenshot_requested',
  CDP_ERROR: 'cdp_protocol_error',
  BOT_DETECTION: 'bot_detection',
  NAVIGATION_FAILED: 'navigation_after_click_failed',
  METHOD_NOT_SUPPORTED: 'method_not_supported',
});
|
|
11
|
+
|
|
12
|
+
export class BrowserSession {
  /**
   * Stateful browser session that drives either LightPanda (over CDP) or a
   * pooled Chrome instance, and can switch from the former to the latter.
   *
   * @param {object} options
   * @param {'headless'|'visual'|'auto'} options.mode - 'headless' (LightPanda), 'visual' (Chrome), 'auto' (LP with Chrome fallback)
   * @param {number} options.timeout - Navigation timeout in ms (default: 15000)
   * @param {string} options.userAgent - Custom user agent
   * @param {string} options.lightpandaPath - Path to LightPanda binary
   * @param {boolean} options.verbose - Enable logging
   */
  constructor(options = {}) {
    this.mode = options.mode || 'auto';
    this.activeBackend = null; // 'lightpanda' | 'chrome'
    this.timeout = options.timeout || 15000;
    this.userAgent = options.userAgent || 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36';
    this.lightpandaPath = options.lightpandaPath;
    this.verbose = options.verbose || false;

    this.browser = null;
    this.context = null;
    this.page = null;
    this._chromeBrowser = null; // reference for pool release
    this._connected = false;
    this._fallbackCount = 0;

    this.history = []; // action log for debugging
  }

  // ── Connection ──────────────────────────────────────────────

  /**
   * Connect to the backend selected by `mode`. No-op when already connected.
   * In 'auto' mode a LightPanda connection failure falls back to Chrome; in
   * 'headless' mode it is rethrown.
   * @returns {Promise<BrowserSession>} this session
   */
  async connect() {
    if (this._connected) return this;

    if (this.mode === 'visual') {
      await this._connectChrome();
    } else {
      // 'headless' or 'auto' — start with LightPanda
      try {
        await this._connectLightPanda();
      } catch (err) {
        if (this.mode === 'auto') {
          this._log(`LightPanda unavailable (${err.message}), falling back to Chrome`);
          await this._connectChrome();
        } else {
          throw err;
        }
      }
    }

    this._connected = true;
    return this;
  }

  /**
   * Start (or reuse) the LightPanda server and open a page on it.
   * Session fields are only assigned once the page is fully set up; on failure
   * the CDP connection opened here is torn down again.
   */
  async _connectLightPanda() {
    const server = getLightPandaServer(this.lightpandaPath);
    const endpoint = await server.start();

    const puppeteer = await this._getPuppeteer();
    const browser = await puppeteer.connect({ browserWSEndpoint: endpoint });
    let context = null;
    try {
      context = await browser.createBrowserContext();
      const page = await context.newPage();
      await page.setUserAgent(this.userAgent);

      this.browser = browser;
      this.context = context;
      this.page = page;
    } catch (err) {
      // Do not leave a half-open connection behind.
      try { if (context) await context.close(); } catch { /* ignore cleanup errors */ }
      try { await browser.disconnect(); } catch { /* ignore cleanup errors */ }
      throw err;
    }

    this.activeBackend = 'lightpanda';
    this._log('Connected to LightPanda CDP');
  }

  /**
   * Borrow a Chrome browser from the pool and open a page on it.
   * If page setup fails the browser is handed back to the pool before the
   * error is rethrown, so a failed connect does not leak a pool slot.
   */
  async _connectChrome() {
    const browser = await browserPool.getBrowser();
    let page = null;
    try {
      page = await browser.newPage();
      await page.setUserAgent(this.userAgent);
      await page.setViewport({ width: 1280, height: 800 });
    } catch (err) {
      try { if (page) await page.close(); } catch { /* ignore cleanup errors */ }
      browserPool.releaseBrowser(browser);
      throw err;
    }

    this._chromeBrowser = browser;
    this.browser = browser;
    this.context = null; // Chrome pages live in the browser's default context
    this.page = page;
    this.activeBackend = 'chrome';
    this._log('Connected to Chrome');
  }

  // ── Navigation ──────────────────────────────────────────────

  /**
   * Navigate to `url` and wait for network idle. Retries once on Chrome when
   * the error qualifies for fallback.
   * @param {string} url
   * @returns {Promise<{success: boolean, url: string}>}
   */
  async goto(url) {
    this._ensureConnected();
    try {
      await this.page.goto(url, {
        waitUntil: 'networkidle0',
        timeout: this.timeout,
      });
      this._logAction('goto', { url });
      return { success: true, url: this.page.url() };
    } catch (err) {
      if (await this._shouldFallback(err)) {
        await this._fallbackToChrome(FALLBACK_REASONS.CDP_ERROR);
        return this.goto(url);
      }
      throw err;
    }
  }

  /**
   * Navigate one entry back in history.
   * @returns {Promise<{success: boolean, url: string}>}
   */
  async goBack() {
    this._ensureConnected();
    await this.page.goBack({ waitUntil: 'networkidle0', timeout: this.timeout });
    this._logAction('goBack');
    return { success: true, url: this.page.url() };
  }

  /**
   * Navigate one entry forward in history.
   * @returns {Promise<{success: boolean, url: string}>}
   */
  async goForward() {
    this._ensureConnected();
    await this.page.goForward({ waitUntil: 'networkidle0', timeout: this.timeout });
    this._logAction('goForward');
    return { success: true, url: this.page.url() };
  }

  // ── Page Interactions ───────────────────────────────────────

  /**
   * Click the first element matching `selector`.
   *
   * @param {string} selector
   * @param {object} [options]
   * @param {number} [options.timeout] - selector wait timeout (defaults to session timeout)
   * @param {boolean} [options.waitForNavigation] - pass false to skip the 3s navigation wait
   * @param {boolean} [options.expectNavigation] - on LightPanda in 'auto' mode, an unchanged
   *   URL after the click triggers a Chrome fallback and a retry
   * @returns {Promise<{success: boolean, url: string}>}
   */
  async click(selector, options = {}) {
    this._ensureConnected();
    try {
      await this.page.waitForSelector(selector, { timeout: options.timeout || this.timeout });
      const urlBefore = this.page.url();

      // Start waiting BEFORE clicking so a navigation that completes quickly
      // is not missed. A timeout simply means no navigation happened.
      const navigation = options.waitForNavigation !== false
        ? this.page.waitForNavigation({ timeout: 3000, waitUntil: 'networkidle0' }).catch(() => null)
        : null;

      await this.page.click(selector);
      if (navigation) await navigation;

      this._logAction('click', { selector });

      // Check for LP's known click-navigation bug
      const navigationMissing = this.activeBackend === 'lightpanda'
        && options.expectNavigation
        && urlBefore === this.page.url();
      if (navigationMissing) {
        if (this.mode === 'auto') {
          this._log('Click did not trigger expected navigation — falling back to Chrome');
          await this._fallbackToChrome(FALLBACK_REASONS.NAVIGATION_FAILED);
          return this.click(selector, { ...options, _retried: true });
        }
        // Explicit headless mode never switches backend on its own.
        this._log('Click did not trigger expected navigation (no Chrome fallback in headless mode)');
      }

      return { success: true, url: this.page.url() };
    } catch (err) {
      if (await this._shouldFallback(err)) {
        await this._fallbackToChrome(FALLBACK_REASONS.CDP_ERROR);
        return this.click(selector, options);
      }
      throw err;
    }
  }

  /**
   * Type `text` into the element matching `selector`.
   * @param {string} selector
   * @param {string} text
   * @param {object} [options]
   * @param {number} [options.timeout] - selector wait timeout
   * @param {boolean} [options.clear] - triple-click and press Backspace first
   * @param {number} [options.delay] - per-key delay in ms
   * @returns {Promise<{success: boolean}>}
   */
  async type(selector, text, options = {}) {
    this._ensureConnected();
    try {
      await this.page.waitForSelector(selector, { timeout: options.timeout || this.timeout });

      if (options.clear) {
        await this.page.click(selector, { clickCount: 3 });
        await this.page.keyboard.press('Backspace');
      }

      await this.page.type(selector, text, { delay: options.delay || 0 });
      // Only the first 20 characters are kept in the history entry.
      this._logAction('type', { selector, text: text.substring(0, 20) + (text.length > 20 ? '...' : '') });
      return { success: true };
    } catch (err) {
      if (await this._shouldFallback(err)) {
        await this._fallbackToChrome(FALLBACK_REASONS.CDP_ERROR);
        return this.type(selector, text, options);
      }
      throw err;
    }
  }

  /**
   * Scroll the window vertically by `amount` pixels.
   * @param {'up'|'down'} [direction='down']
   * @param {number} [amount=500]
   * @returns {Promise<{success: boolean}>}
   */
  async scroll(direction = 'down', amount = 500) {
    this._ensureConnected();
    const deltaY = direction === 'up' ? -amount : amount;
    await this.page.evaluate((dy) => window.scrollBy(0, dy), deltaY);
    this._logAction('scroll', { direction, amount });
    return { success: true };
  }

  /**
   * Hover over the element matching `selector`. On LightPanda this switches to
   * Chrome in 'auto' mode and throws otherwise.
   * @param {string} selector
   * @returns {Promise<{success: boolean}>}
   */
  async hover(selector) {
    this._ensureConnected();
    if (this.activeBackend === 'lightpanda') {
      // LP doesn't support mouseMoved — fall back
      if (this.mode === 'auto') {
        await this._fallbackToChrome(FALLBACK_REASONS.METHOD_NOT_SUPPORTED);
        return this.hover(selector);
      }
      throw new Error('hover() not supported in LightPanda mode');
    }

    await this.page.waitForSelector(selector, { timeout: this.timeout });
    await this.page.hover(selector);
    this._logAction('hover', { selector });
    return { success: true };
  }

  /**
   * Select one or more option values on a `<select>` element.
   * @param {string} selector
   * @param {...string} values
   * @returns {Promise<{success: boolean}>}
   */
  async select(selector, ...values) {
    this._ensureConnected();
    await this.page.waitForSelector(selector, { timeout: this.timeout });
    await this.page.select(selector, ...values);
    this._logAction('select', { selector, values });
    return { success: true };
  }

  /**
   * Press a single keyboard key on the page.
   * @param {string} key
   * @returns {Promise<{success: boolean}>}
   */
  async pressKey(key) {
    this._ensureConnected();
    await this.page.keyboard.press(key);
    this._logAction('pressKey', { key });
    return { success: true };
  }

  // ── Content Extraction ──────────────────────────────────────

  /**
   * Extract a bounded summary of the current document: title, meta
   * description, up to 20 headings, 15 paragraphs, 30 links and the first
   * 5000 characters of body text.
   */
  async extractContent() {
    this._ensureConnected();
    return this.page.evaluate(() => {
      const title = document.title;
      const metaDescription = document.querySelector('meta[name="description"]')?.content || '';
      const headings = Array.from(document.querySelectorAll('h1, h2, h3'))
        .map(h => ({ level: h.tagName.toLowerCase(), text: h.textContent.trim() }))
        .filter(h => h.text.length > 0)
        .slice(0, 20);
      const paragraphs = Array.from(document.querySelectorAll('p'))
        .map(p => p.textContent.trim())
        .filter(t => t.length > 20)
        .slice(0, 15);
      const links = Array.from(document.querySelectorAll('a[href]'))
        .map(a => ({ text: a.textContent.trim(), href: a.href }))
        .filter(l => l.text.length > 0 && l.href.startsWith('http'))
        .slice(0, 30);
      const bodyText = document.body?.innerText?.substring(0, 5000) || '';

      return { title, metaDescription, headings, paragraphs, links, bodyText, url: location.href };
    });
  }

  /**
   * Run `fn` in the page context and return its result.
   */
  async evaluate(fn, ...args) {
    this._ensureConnected();
    return this.page.evaluate(fn, ...args);
  }

  /**
   * Wait until an element matching `selector` exists.
   * @param {string} selector
   * @param {number} [timeout] - defaults to the session timeout
   * @returns {Promise<{success: boolean}>}
   */
  async waitFor(selector, timeout) {
    this._ensureConnected();
    await this.page.waitForSelector(selector, { timeout: timeout || this.timeout });
    return { success: true };
  }

  // ── Screenshot ──────────────────────────────────────────────

  /**
   * Capture a screenshot as a base64 data URL. On LightPanda this throws in
   * 'headless' mode and otherwise switches to Chrome first.
   * @param {object} [options]
   * @param {'png'|'jpeg'|'webp'} [options.type_] - image format (preferred key)
   * @param {string} [options.type] - image format; ignored when it is 'screenshot'
   * @param {boolean} [options.fullPage=true]
   * @returns {Promise<{success: boolean, screenshot: string, backend: string}>}
   */
  async screenshot(options = {}) {
    this._ensureConnected();

    // LightPanda can't screenshot — auto-fallback
    if (this.activeBackend === 'lightpanda') {
      if (this.mode === 'headless') {
        throw new Error('screenshot() not available in headless-only mode (LightPanda has no rendering engine)');
      }
      await this._fallbackToChrome(FALLBACK_REASONS.SCREENSHOT);
    }

    // type_ avoids collision with executeAction's { type: 'screenshot' }
    const imageFormat = options.type_ || (options.type !== 'screenshot' && options.type) || 'png';
    const buffer = await this.page.screenshot({
      type: imageFormat,
      fullPage: options.fullPage ?? true,
      encoding: 'base64',
    });

    this._logAction('screenshot');
    return {
      success: true,
      screenshot: `data:image/${imageFormat};base64,${buffer}`,
      backend: this.activeBackend,
    };
  }

  // ── AI Agent Interface ──────────────────────────────────────

  /**
   * Returns structured page state for AI decision-making.
   * Includes interactive elements with selectors for the AI to use.
   *
   * @param {object} [options]
   * @param {boolean} [options.includeScreenshot] - attach a screenshot; silently
   *   skipped when the session is on LightPanda in 'headless' mode
   */
  async getPageState(options = {}) {
    this._ensureConnected();

    const state = await this.page.evaluate(() => {
      // Everything in this callback is serialized into the page, so all
      // helpers have to be defined inside it.
      const quote = (value) => `"${String(value).replace(/["\\]/g, '\\$&').replace(/\n/g, '\\a ')}"`;
      // Plain identifiers keep the familiar #id form; anything else is
      // expressed as a quoted attribute selector so it cannot break parsing.
      const idSelector = (id) => (/^[A-Za-z_][\w-]*$/.test(id) ? `#${id}` : `[id=${quote(id)}]`);
      // Structural path from the nearest ancestor with an id (or from <body>).
      const pathTo = (el) => {
        const segments = [];
        for (let node = el; node && node.nodeType === 1 && node !== document.documentElement; node = node.parentElement) {
          if (node.id) {
            segments.unshift(idSelector(node.id));
            break;
          }
          let position = 1;
          for (let sib = node.previousElementSibling; sib; sib = sib.previousElementSibling) {
            if (sib.tagName === node.tagName) position += 1;
          }
          segments.unshift(`${node.tagName.toLowerCase()}:nth-of-type(${position})`);
        }
        return segments.join(' > ');
      };

      const interactiveElements = [];

      // Buttons
      document.querySelectorAll('button, [role="button"], input[type="submit"], input[type="button"]').forEach((el) => {
        if (el.offsetParent === null) return; // hidden
        const text = el.textContent?.trim() || el.value || el.getAttribute('aria-label') || '';
        if (!text) return;
        interactiveElements.push({
          type: 'button',
          text: text.substring(0, 100),
          selector: el.id ? idSelector(el.id) : pathTo(el),
          tag: el.tagName.toLowerCase(),
        });
      });

      // Links
      document.querySelectorAll('a[href]').forEach((el) => {
        if (el.offsetParent === null) return;
        const text = el.textContent?.trim();
        if (!text || text.length < 2) return;
        interactiveElements.push({
          type: 'link',
          text: text.substring(0, 100),
          href: el.href,
          selector: el.id ? idSelector(el.id) : `a[href=${quote(el.getAttribute('href'))}]`,
        });
      });

      // Inputs
      document.querySelectorAll('input, textarea, select').forEach((el) => {
        if (el.offsetParent === null || el.type === 'hidden') return;
        const labelFor = el.id ? document.querySelector(`label[for=${quote(el.id)}]`) : null;
        const label = el.getAttribute('aria-label')
          || el.placeholder
          || labelFor?.textContent?.trim()
          || el.name
          || '';
        let selector;
        if (el.id) selector = idSelector(el.id);
        else if (el.name) selector = `[name=${quote(el.name)}]`;
        else selector = pathTo(el);
        interactiveElements.push({
          type: el.tagName.toLowerCase() === 'select' ? 'select' : 'input',
          inputType: el.type || 'text',
          label: label.substring(0, 100),
          value: el.value?.substring(0, 50) || '',
          selector,
          tag: el.tagName.toLowerCase(),
        });
      });

      return {
        url: location.href,
        title: document.title,
        text: document.body?.innerText?.substring(0, 3000) || '',
        interactiveElements: interactiveElements.slice(0, 50),
      };
    });

    // Optionally include screenshot. screenshot() performs the Chrome
    // fallback itself; LightPanda in a non-auto mode is skipped silently.
    const canScreenshot = this.activeBackend !== 'lightpanda' || this.mode === 'auto';
    if (options.includeScreenshot && canScreenshot) {
      const { screenshot } = await this.screenshot();
      state.screenshot = screenshot;
    }

    state.backend = this.activeBackend;
    state.sessionHistory = this.history.slice(-10);

    return state;
  }

  /**
   * Execute a structured action from an AI agent.
   * @param {{ type: string, selector?: string, text?: string, url?: string, key?: string, direction?: string, amount?: number }} action
   */
  async executeAction(action) {
    if (!action || typeof action !== 'object') {
      throw new Error('executeAction() requires an action object');
    }
    switch (action.type) {
      case 'goto': return this.goto(action.url);
      case 'click': return this.click(action.selector, action);
      case 'type': return this.type(action.selector, action.text, action);
      case 'scroll': return this.scroll(action.direction, action.amount);
      case 'hover': return this.hover(action.selector);
      case 'select': return this.select(action.selector, ...(action.values || []));
      case 'pressKey': return this.pressKey(action.key);
      case 'goBack': return this.goBack();
      case 'goForward': return this.goForward();
      case 'screenshot': return this.screenshot(action);
      case 'extractContent': return this.extractContent();
      case 'waitFor': return this.waitFor(action.selector, action.timeout);
      default: throw new Error(`Unknown action type: ${action.type}`);
    }
  }

  // ── Cookies ─────────────────────────────────────────────────

  /** Return the cookies visible to the current page. */
  async getCookies() {
    this._ensureConnected();
    return this.page.cookies();
  }

  /**
   * Set cookies on the current page.
   * @param {Array<object>} cookies - cookie objects (name, value, optional domain/path, ...)
   */
  async setCookies(cookies) {
    this._ensureConnected();
    if (this.activeBackend === 'lightpanda') {
      // LP doesn't support Network.deleteCookies which Puppeteer's setCookie calls.
      // Use the page's internal CDP session to call Network.setCookies directly.
      try {
        // `_client` is a method in recent Puppeteer releases and a plain
        // property in older ones; accept both.
        const accessor = this.page._client;
        const client = typeof accessor === 'function' ? accessor.call(this.page) : accessor;
        await client.send('Network.setCookies', { cookies });
      } catch {
        // Fallback: set cookies via document.cookie (limited to non-httpOnly)
        for (const c of cookies) {
          const parts = [`${c.name}=${c.value}`];
          if (c.domain) parts.push(`domain=${c.domain}`);
          if (c.path) parts.push(`path=${c.path}`);
          await this.page.evaluate((cookieStr) => { document.cookie = cookieStr; }, parts.join('; '));
        }
      }
    } else {
      await this.page.setCookie(...cookies);
    }
  }

  // ── Fallback ────────────────────────────────────────────────

  /**
   * Switch the session from LightPanda to Chrome, carrying over cookies and
   * the current URL where they can be read. No-op when already on Chrome.
   * @param {string} reason - reason code stored in the history entry
   */
  async _fallbackToChrome(reason) {
    if (this.activeBackend === 'chrome') return; // already on Chrome
    if (this._fallbackCount > 2) throw new Error('Too many fallback attempts');

    this._log(`Falling back to Chrome: ${reason}`);
    this._fallbackCount++;

    // Save state from LP session. URL and cookies are read independently so a
    // failing cookie lookup does not also discard the URL.
    let cookies = [];
    let currentUrl = null;
    try {
      currentUrl = this.page.url();
    } catch {
      // LP might be in a bad state
    }
    try {
      cookies = await this.page.cookies();
    } catch {
      // LP might be in a bad state
    }

    // Close LP page (keep server alive for potential reuse). Each step is
    // attempted even when an earlier one fails.
    try {
      if (this.page && !this.page.isClosed()) await this.page.close();
    } catch { /* ignore cleanup errors */ }
    try {
      if (this.context) await this.context.close();
    } catch { /* ignore cleanup errors */ }
    try {
      if (this.browser) await this.browser.disconnect();
    } catch { /* ignore cleanup errors */ }

    // Drop the LightPanda handles so nothing touches them again, even if the
    // Chrome connection below fails.
    this.page = null;
    this.context = null;
    this.browser = null;

    // Connect to Chrome
    await this._connectChrome();

    // Restore state
    if (cookies.length > 0) {
      await this.page.setCookie(...cookies);
    }
    if (currentUrl && currentUrl !== 'about:blank') {
      await this.page.goto(currentUrl, {
        waitUntil: 'networkidle0',
        timeout: this.timeout,
      });
    }

    this._logAction('fallback', { reason, from: 'lightpanda', to: 'chrome' });
  }

  /**
   * Decide whether `error` should trigger a Chrome fallback. Always false on
   * Chrome and in explicit 'headless' mode; otherwise true for a fixed set of
   * CDP/connection error messages.
   * @param {Error} error
   * @returns {Promise<boolean>}
   */
  async _shouldFallback(error) {
    if (this.activeBackend === 'chrome') return false;
    if (this.mode === 'headless') return false; // no auto-fallback in explicit headless mode

    const msg = error?.message || '';
    return (
      msg.includes('Protocol error') ||
      msg.includes('not implemented') ||
      msg.includes('Target closed') ||
      msg.includes('Session closed') ||
      msg.includes('Connection closed') ||
      msg.includes('Execution context was destroyed')
    );
  }

  // ── Cleanup ─────────────────────────────────────────────────

  /**
   * Release everything the session holds. Cleanup errors are ignored. The
   * session can be connected again afterwards.
   */
  async close() {
    try {
      if (this.page && !this.page.isClosed()) await this.page.close();
    } catch { /* ignore */ }

    try {
      if (this.context) await this.context.close();
    } catch { /* ignore */ }

    if (this.activeBackend === 'lightpanda' && this.browser) {
      try { await this.browser.disconnect(); } catch { /* ignore */ }
    }

    if (this._chromeBrowser) {
      browserPool.releaseBrowser(this._chromeBrowser);
      this._chromeBrowser = null;
    }

    this.page = null;
    this.context = null;
    this.browser = null;
    this._connected = false;
    // A reconnected session starts with a fresh fallback budget.
    this._fallbackCount = 0;
    this._log('Session closed');
  }

  // ── Helpers ─────────────────────────────────────────────────

  /** Throw unless the session is connected and has a page. */
  _ensureConnected() {
    if (!this._connected || !this.page) {
      throw new Error('Session not connected. Call connect() first.');
    }
  }

  /** Load Puppeteer lazily so it is only required when a session is used. */
  async _getPuppeteer() {
    try {
      const puppeteer = await import('puppeteer');
      return puppeteer.default || puppeteer;
    } catch (err) {
      throw new Error('Puppeteer is required for BrowserSession. Install with: npm install puppeteer', { cause: err });
    }
  }

  /** Print a message when verbose logging is enabled. */
  _log(msg) {
    if (this.verbose) console.log(`[BrowserSession] ${msg}`);
  }

  /** Append an entry to the action history and log it when verbose. */
  _logAction(type, params = {}) {
    const entry = { type, ...params, timestamp: Date.now(), backend: this.activeBackend };
    this.history.push(entry);
    // Skip the JSON serialization entirely when nothing would be printed.
    if (this.verbose) this._log(`${type} ${JSON.stringify(params)}`);
  }

  /** @returns {Array<object>} the action history (the live array, not a copy) */
  getHistory() {
    return this.history;
  }

  /** @returns {'lightpanda'|'chrome'|null} the backend currently in use */
  getBackend() {
    return this.activeBackend;
  }
}
|
|
541
|
+
|
|
542
|
+
/**
 * Convenience function to create and connect a browser session.
 *
 * If connecting fails, the session is closed (best effort) before the error is
 * rethrown, because the caller never receives a handle to clean it up.
 *
 * @param {object} [options] - passed unchanged to the BrowserSession constructor
 * @returns {Promise<BrowserSession>} the connected session
 */
export async function createSession(options = {}) {
  const session = new BrowserSession(options);
  try {
    await session.connect();
  } catch (err) {
    try {
      await session.close();
    } catch {
      // Cleanup is best effort; the connection error is the one that matters.
    }
    throw err;
  }
  return session;
}
|
|
550
|
+
|
|
551
|
+
export default BrowserSession;
|
package/index.d.ts
CHANGED
|
@@ -455,6 +455,103 @@ export function bulkScrape(urls: string[], options?: BulkScrapeOptions): Promise
|
|
|
455
455
|
*/
|
|
456
456
|
export function bulkScrapeStream(urls: string[], options: BulkScrapeStreamOptions): Promise<BulkScrapeStreamStats>;
|
|
457
457
|
|
|
458
|
+
// ── Browser Session ───────────────────────────────────────────
|
|
459
|
+
|
|
460
|
+
/** Constructor options for {@link BrowserSession}. Every field is optional. */
export interface BrowserSessionOptions {
  /** Backend selection: 'headless' (LightPanda), 'visual' (Chrome), 'auto' (LightPanda with Chrome fallback). */
  mode?: 'headless' | 'visual' | 'auto';
  /** Navigation timeout in milliseconds. */
  timeout?: number;
  /** Custom user agent string. */
  userAgent?: string;
  /** Path to the LightPanda binary. */
  lightpandaPath?: string;
  /** Enable console logging. */
  verbose?: boolean;
}
|
|
467
|
+
|
|
468
|
+
/** Structured snapshot of the current page, as returned by `getPageState()`. */
export interface PageState {
  url: string;
  title: string;
  /** Leading portion of the page's body text. */
  text: string;
  /** Visible buttons, links and form controls, each with a selector to act on. */
  interactiveElements: {
    type: 'button' | 'link' | 'input' | 'select';
    text?: string;
    label?: string;
    href?: string;
    selector: string;
    tag?: string;
    inputType?: string;
    value?: string;
  }[];
  /** Base64 data URL; only present when a screenshot was requested and taken. */
  screenshot?: string;
  backend: 'lightpanda' | 'chrome';
  /** The most recent entries of the session's action history. */
  sessionHistory: { type: string; timestamp: number; backend: string }[];
}
|
|
486
|
+
|
|
487
|
+
/** Result object returned by the session's action methods. */
export interface ActionResult {
  success: boolean;
  /** Page URL after the action, for actions that report it. */
  url?: string;
  /** Base64 data URL, for screenshot actions. */
  screenshot?: string;
  /** Backend that served the action, for actions that report it. */
  backend?: string;
}
|
|
493
|
+
|
|
494
|
+
/** Structured action accepted by `BrowserSession.executeAction()`. */
export interface BrowserAction {
  /** Selects which session method is invoked. */
  type:
    | 'goto'
    | 'click'
    | 'type'
    | 'scroll'
    | 'hover'
    | 'select'
    | 'pressKey'
    | 'goBack'
    | 'goForward'
    | 'screenshot'
    | 'extractContent'
    | 'waitFor';
  url?: string;
  selector?: string;
  text?: string;
  key?: string;
  direction?: 'up' | 'down';
  amount?: number;
  values?: string[];
  timeout?: number;
  expectNavigation?: boolean;
  waitForNavigation?: boolean;
  clear?: boolean;
  delay?: number;
  fullPage?: boolean;
  /** Screenshot image format; named `type_` because `type` holds the action name. */
  type_?: 'png' | 'jpeg' | 'webp';
  includeScreenshot?: boolean;
}
|
|
512
|
+
|
|
513
|
+
/**
 * Stateful browser session backed by LightPanda or Chrome.
 * Call `connect()` (or use `createSession()`) before any page method.
 */
export declare class BrowserSession {
  constructor(options?: BrowserSessionOptions);

  /** Backend currently in use; `null` before the first connection. */
  readonly activeBackend: 'lightpanda' | 'chrome' | null;
  readonly mode: 'headless' | 'visual' | 'auto';

  connect(): Promise<BrowserSession>;
  goto(url: string): Promise<ActionResult>;
  goBack(): Promise<void>;
  goForward(): Promise<void>;
  click(selector: string, options?: { timeout?: number; expectNavigation?: boolean; waitForNavigation?: boolean }): Promise<ActionResult>;
  type(selector: string, text: string, options?: { timeout?: number; clear?: boolean; delay?: number }): Promise<ActionResult>;
  scroll(direction?: 'up' | 'down', amount?: number): Promise<ActionResult>;
  hover(selector: string): Promise<ActionResult>;
  select(selector: string, ...values: string[]): Promise<ActionResult>;
  pressKey(key: string): Promise<ActionResult>;
  /**
   * Capture a screenshot as a base64 data URL.
   * `type_` is an alias for `type` and takes precedence when both are given.
   */
  screenshot(options?: { type?: 'png' | 'jpeg' | 'webp'; type_?: 'png' | 'jpeg' | 'webp'; fullPage?: boolean }): Promise<{ success: boolean; screenshot: string; backend: string }>;
  extractContent(): Promise<{ title: string; metaDescription: string; headings: any[]; paragraphs: string[]; links: any[]; bodyText: string; url: string }>;
  evaluate<T>(fn: (...args: any[]) => T, ...args: any[]): Promise<T>;
  waitFor(selector: string, timeout?: number): Promise<ActionResult>;
  getPageState(options?: { includeScreenshot?: boolean }): Promise<PageState>;
  executeAction(action: BrowserAction): Promise<ActionResult>;
  getCookies(): Promise<any[]>;
  setCookies(cookies: any[]): Promise<void>;
  getHistory(): Array<{ type: string; timestamp: number; backend: string }>;
  getBackend(): 'lightpanda' | 'chrome' | null;
  close(): Promise<void>;
}
|
|
541
|
+
|
|
542
|
+
/** Create a {@link BrowserSession}, connect it, and resolve with the connected session. */
export function createSession(
  options?: BrowserSessionOptions,
): Promise<BrowserSession>;
|
|
543
|
+
|
|
544
|
+
/** Manages a local LightPanda process that serves CDP. */
export declare class LightPandaServer {
  /** @param binaryPath Path to the LightPanda binary; common locations are searched when omitted. */
  constructor(binaryPath?: string);
  /** Start the server (a free port is chosen when `port` is omitted) and resolve with its WebSocket endpoint. */
  start(port?: number): Promise<string>;
  /** WebSocket endpoint in the form `ws://host:port`. */
  getEndpoint(): string;
  /** True while the process exists and has signalled readiness. */
  isRunning(): boolean;
  /** Terminate the process, if any, and reset the server state. */
  stop(): void;
}
|
|
551
|
+
|
|
552
|
+
/** Return the shared LightPandaServer instance, creating it on first use. */
export function getLightPandaServer(binaryPath?: string): LightPandaServer;

/** Stop and discard the shared LightPandaServer instance, if one exists. */
export function stopLightPandaServer(): void;
|
|
554
|
+
|
|
458
555
|
/**
|
|
459
556
|
* Default export - same as BNCASmartScraper class
|
|
460
557
|
*/
|
package/index.js
CHANGED
|
@@ -1795,4 +1795,8 @@ export async function bulkScrapeStream(urls, options = {}) {
|
|
|
1795
1795
|
}
|
|
1796
1796
|
}
|
|
1797
1797
|
|
|
1798
|
+
// Browser session exports
|
|
1799
|
+
export { BrowserSession, createSession } from './browser-session.js';
|
|
1800
|
+
export { default as LightPandaServer, getLightPandaServer, stopLightPandaServer } from './lightpanda-server.js';
|
|
1801
|
+
|
|
1798
1802
|
export default BNCASmartScraper;
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import { spawn } from 'child_process';
|
|
2
|
+
import { createServer } from 'net';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
import fs from 'fs';
|
|
5
|
+
|
|
6
|
+
/**
 * Spawns and supervises a local LightPanda process that serves CDP on
 * 127.0.0.1.
 */
class LightPandaServer {
  /**
   * @param {string} [binaryPath] - path to the LightPanda binary; when omitted
   *   a few common locations are checked before falling back to PATH lookup
   */
  constructor(binaryPath) {
    this.binaryPath = binaryPath || this._findBinary();
    this.process = null;
    this.host = '127.0.0.1';
    this.port = null;
    this.ready = false;
    this._starting = null; // in-flight start() promise, shared by concurrent callers
  }

  /**
   * Start the server and resolve with its WebSocket endpoint.
   * Returns immediately when already running; concurrent calls made while a
   * start is in progress share that start instead of spawning a second process.
   *
   * @param {number} [port] - port to listen on; a free port is picked when omitted
   * @returns {Promise<string>} endpoint in the form ws://host:port
   */
  async start(port) {
    if (this.process && this.ready) return this.getEndpoint();

    if (!this._starting) {
      this._starting = this._launch(port).finally(() => {
        this._starting = null;
      });
    }
    return this._starting;
  }

  /**
   * Spawn the process and wait for the first readiness signal: a known line
   * on stderr, a successful /json/version probe, an early exit, a spawn
   * error, or the 5 s hard timeout. The promise settles exactly once and all
   * timers are cleared at that point.
   */
  async _launch(port) {
    this.port = port || await this._findAvailablePort();
    const listenPort = this.port;

    return new Promise((resolve, reject) => {
      const args = [
        'serve',
        '--host', this.host,
        '--port', String(listenPort),
        '--cdp_max_connections', '16',
      ];

      const child = spawn(this.binaryPath, args, {
        stdio: ['ignore', 'pipe', 'pipe'],
      });
      this.process = child;

      let stderr = '';
      let settled = false;
      let probeTimer = null;
      let hardTimer = null;

      const settle = (err) => {
        if (settled) return;
        settled = true;
        clearTimeout(probeTimer);
        clearTimeout(hardTimer);
        if (err) {
          reject(err);
        } else if (this.process !== child) {
          // stop() ran while the process was still starting.
          reject(new Error('LightPanda was stopped before it became ready'));
        } else {
          this.ready = true;
          resolve(this.getEndpoint());
        }
      };

      // Nothing reads stdout; keep it flowing so a full pipe cannot stall the child.
      child.stdout?.resume();

      // LP prints to stderr when ready — wait for it or poll /json/version
      child.stderr?.on('data', (data) => {
        if (settled) return; // keep draining, but stop buffering once started
        stderr += data.toString();
        // LightPanda logs server start to stderr
        if (stderr.includes('Listening on') || stderr.includes('server started')) {
          settle();
        }
      });

      child.on('error', (err) => {
        if (this.process === child) {
          this.process = null;
          this.ready = false;
        }
        settle(new Error(`Failed to start LightPanda: ${err.message}`));
      });

      child.on('exit', (code) => {
        // Only reset state that still belongs to this child; a newer process
        // may already have replaced it.
        if (this.process === child) {
          this.process = null;
          this.ready = false;
        }
        // No-op when the server had already become ready.
        settle(new Error(`LightPanda exited with code ${code}: ${stderr}`));
      });

      // Fallback: poll /json/version when no stderr signal arrives, starting
      // after 1.5s and repeating until the promise settles.
      const probe = async () => {
        if (settled) return;
        try {
          const res = await fetch(`http://${this.host}:${listenPort}/json/version`, {
            signal: AbortSignal.timeout(1000),
          });
          if (res.ok) {
            settle();
            return;
          }
        } catch {
          // Still starting up, give it more time
        }
        if (!settled) probeTimer = setTimeout(probe, 500);
      };
      probeTimer = setTimeout(probe, 1500);

      // Hard timeout
      hardTimer = setTimeout(() => {
        if (settled) return;
        if (this.process === child) {
          this.stop();
        } else {
          try { child.kill('SIGTERM'); } catch { /* already dead */ }
        }
        settle(new Error(`LightPanda failed to start within 5s. stderr: ${stderr}`));
      }, 5000);
    });
  }

  /** @returns {string} WebSocket endpoint built from the current host and port */
  getEndpoint() {
    return `ws://${this.host}:${this.port}`;
  }

  /** @returns {boolean} true while a process exists and has signalled readiness */
  isRunning() {
    return this.ready && this.process !== null;
  }

  /** Send SIGTERM to the process, if any, and reset process, ready flag and port. */
  stop() {
    if (this.process) {
      try {
        this.process.kill('SIGTERM');
      } catch {
        // already dead
      }
      this.process = null;
    }
    this.ready = false;
    this.port = null;
  }

  /**
   * Ask the OS for a free TCP port by briefly binding port 0 on 127.0.0.1.
   * @returns {Promise<number>}
   */
  async _findAvailablePort() {
    return new Promise((resolve, reject) => {
      const server = createServer();
      server.on('error', reject);
      server.listen(0, '127.0.0.1', () => {
        const port = server.address().port;
        server.close(() => resolve(port));
      });
    });
  }

  /**
   * Locate the LightPanda binary: next to this module under bin/, then two
   * system locations, then the bare command name for PATH lookup.
   * @returns {string}
   */
  _findBinary() {
    // URL pathnames are percent-encoded (a space becomes %20); decode before
    // using the value as a filesystem path.
    const moduleDir = path.dirname(decodeURIComponent(new URL(import.meta.url).pathname));

    // Check common locations
    const candidates = [
      path.join(moduleDir, 'bin', 'lightpanda'),
      '/usr/local/bin/lightpanda',
      '/usr/bin/lightpanda',
    ];

    for (const p of candidates) {
      if (fs.existsSync(p)) return p;
    }

    return 'lightpanda'; // hope it's on PATH
  }
}
|
|
129
|
+
|
|
130
|
+
// Singleton instance — shared across all sessions
|
|
131
|
+
let _instance = null;
|
|
132
|
+
|
|
133
|
+
/**
 * Return the shared LightPandaServer, creating it on first use.
 *
 * When `binaryPath` is given and differs from the existing instance's path,
 * the instance is replaced, but only while it has no process. Without this,
 * one bad path would stay in effect for every later session. An instance that
 * is starting or running is always kept, since other sessions may be using it.
 *
 * @param {string} [binaryPath] - path to the LightPanda binary
 * @returns {LightPandaServer}
 */
export function getLightPandaServer(binaryPath) {
  const replaceIdle = Boolean(_instance)
    && Boolean(binaryPath)
    && _instance.binaryPath !== binaryPath
    && !_instance.process;

  if (!_instance || replaceIdle) {
    _instance = new LightPandaServer(binaryPath);
  }
  return _instance;
}
|
|
139
|
+
|
|
140
|
+
/**
 * Stop the shared LightPandaServer, if one exists, and forget it so the next
 * getLightPandaServer() call builds a new instance.
 */
export function stopLightPandaServer() {
  const shared = _instance;
  if (!shared) return;

  shared.stop();
  _instance = null;
}
|
|
146
|
+
|
|
147
|
+
// Stop the shared server when the host process shuts down.
//
// Adding a SIGTERM/SIGINT listener turns off Node's default "terminate on
// signal" behavior. Each handler is therefore one-shot: after stopping the
// server it re-sends the signal, unless the application has its own listener
// for that signal and is handling shutdown itself.
for (const signal of ['SIGTERM', 'SIGINT']) {
  process.once(signal, () => {
    stopLightPandaServer();
    if (process.listenerCount(signal) === 0) {
      process.kill(process.pid, signal);
    }
  });
}
process.on('beforeExit', () => stopLightPandaServer());
// 'beforeExit' is not emitted for process.exit() or uncaught exceptions.
process.on('exit', () => stopLightPandaServer());
|
|
150
|
+
|
|
151
|
+
export default LightPandaServer;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@monostate/node-scraper",
|
|
3
|
-
"version": "2.0.0",
|
|
3
|
+
"version": "2.1.0",
|
|
4
4
|
"description": "Intelligent web scraping with AI Q&A, PDF support and multi-level fallback system - 11x faster than traditional scrapers",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
@@ -15,6 +15,8 @@
|
|
|
15
15
|
"index.js",
|
|
16
16
|
"index.d.ts",
|
|
17
17
|
"browser-pool.js",
|
|
18
|
+
"browser-session.js",
|
|
19
|
+
"lightpanda-server.js",
|
|
18
20
|
"README.md",
|
|
19
21
|
"BULK_SCRAPING.md",
|
|
20
22
|
"package.json",
|
|
@@ -34,6 +36,9 @@
|
|
|
34
36
|
"data-extraction",
|
|
35
37
|
"automation",
|
|
36
38
|
"browser",
|
|
39
|
+
"browser-use",
|
|
40
|
+
"cdp",
|
|
41
|
+
"ai-agent",
|
|
37
42
|
"ai-powered",
|
|
38
43
|
"question-answering",
|
|
39
44
|
"pdf-parsing",
|