@monostate/node-scraper 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,551 @@
1
+ import { getLightPandaServer, stopLightPandaServer } from './lightpanda-server.js';
2
+ import browserPool from './browser-pool.js';
3
+
4
/**
 * Reason codes recorded in the session history whenever a LightPanda session
 * is transparently replaced by a Chrome session.
 * Frozen so that no caller can accidentally mutate the shared table.
 */
const FALLBACK_REASONS = Object.freeze({
  SCREENSHOT: 'screenshot_requested',
  CDP_ERROR: 'cdp_protocol_error',
  BOT_DETECTION: 'bot_detection',
  NAVIGATION_FAILED: 'navigation_after_click_failed',
  METHOD_NOT_SUPPORTED: 'method_not_supported',
});
11
+
12
/**
 * Stateful browser session that drives a single page through Puppeteer.
 *
 * The session talks either to a LightPanda CDP server or to a pooled Chrome
 * instance. In 'auto' mode it starts on LightPanda and switches to Chrome
 * when an operation fails with a CDP-level error or needs a capability the
 * code treats as unavailable on LightPanda (screenshots, hover).
 */
export class BrowserSession {
  /**
   * @param {object} options
   * @param {'headless'|'visual'|'auto'} options.mode - 'headless' (LightPanda), 'visual' (Chrome), 'auto' (LP with Chrome fallback)
   * @param {number} options.timeout - Navigation timeout in ms (default: 15000); 0 is passed through to Puppeteer
   * @param {string} options.userAgent - Custom user agent
   * @param {string} options.lightpandaPath - Path to LightPanda binary
   * @param {boolean} options.verbose - Enable logging
   */
  constructor(options = {}) {
    this.mode = options.mode || 'auto';
    this.activeBackend = null; // 'lightpanda' | 'chrome'
    // `??` rather than `||`: an explicit 0 (Puppeteer's "disable timeout") must not be replaced by the default.
    this.timeout = options.timeout ?? 15000;
    this.userAgent = options.userAgent || 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36';
    this.lightpandaPath = options.lightpandaPath;
    this.verbose = options.verbose || false;

    this.browser = null;
    this.context = null;
    this.page = null;
    this._chromeBrowser = null; // reference for pool release
    this._connected = false;
    this._fallbackCount = 0;

    this.history = []; // action log for debugging
  }

  // ── Connection ──────────────────────────────────────────────

  /**
   * Open the backend selected by `mode`. Calling it again on a connected
   * session is a no-op.
   * @returns {Promise<BrowserSession>} this session, for chaining
   * @throws when the backend cannot be reached (in 'auto' mode only after
   *   the Chrome fallback has failed as well)
   */
  async connect() {
    if (this._connected) return this;

    if (this.mode === 'visual') {
      await this._connectChrome();
    } else {
      // 'headless' or 'auto' — start with LightPanda
      try {
        await this._connectLightPanda();
      } catch (err) {
        if (this.mode !== 'auto') throw err;
        this._log(`LightPanda unavailable (${err.message}), falling back to Chrome`);
        await this._connectChrome();
      }
    }

    this._connected = true;
    return this;
  }

  /** Start (or reuse) the shared LightPanda server and open a page on it. */
  async _connectLightPanda() {
    const server = getLightPandaServer(this.lightpandaPath);
    const endpoint = await server.start();

    const puppeteer = await this._getPuppeteer();
    const browser = await puppeteer.connect({ browserWSEndpoint: endpoint });
    try {
      const context = await browser.createBrowserContext();
      const page = await context.newPage();
      await page.setUserAgent(this.userAgent);

      this.browser = browser;
      this.context = context;
      this.page = page;
    } catch (err) {
      // Do not leave a half-open CDP connection behind when page setup fails.
      try { await browser.disconnect(); } catch { /* ignore */ }
      throw err;
    }

    this.activeBackend = 'lightpanda';
    this._log('Connected to LightPanda CDP');
  }

  /** Borrow a Chrome instance from the pool and open a page on it. */
  async _connectChrome() {
    const browser = await browserPool.getBrowser();
    let page = null;
    try {
      page = await browser.newPage();
      await page.setUserAgent(this.userAgent);
      await page.setViewport({ width: 1280, height: 800 });
    } catch (err) {
      // Hand the browser back so a failed setup does not drain the pool.
      try { if (page) await page.close(); } catch { /* ignore */ }
      browserPool.releaseBrowser(browser);
      throw err;
    }

    this._chromeBrowser = browser;
    this.browser = browser;
    this.context = null; // Chrome pages are opened directly on the pooled browser
    this.page = page;
    this.activeBackend = 'chrome';
    this._log('Connected to Chrome');
  }

  // ── Navigation ──────────────────────────────────────────────

  /**
   * Navigate to `url` and wait for the network to go idle.
   * @param {string} url
   * @returns {Promise<{success: boolean, url: string}>} final URL after redirects
   */
  async goto(url) {
    this._ensureConnected();
    try {
      await this.page.goto(url, {
        waitUntil: 'networkidle0',
        timeout: this.timeout,
      });
      this._logAction('goto', { url });
      return { success: true, url: this.page.url() };
    } catch (err) {
      if (await this._shouldFallback(err)) {
        await this._fallbackToChrome(FALLBACK_REASONS.CDP_ERROR);
        return this.goto(url);
      }
      throw err;
    }
  }

  /**
   * Go one entry back in the page history.
   * @returns {Promise<{success: boolean, url: string}>}
   */
  async goBack() {
    this._ensureConnected();
    await this.page.goBack({ waitUntil: 'networkidle0', timeout: this.timeout });
    this._logAction('goBack');
    // Same result shape as the other actions, so executeAction() is uniform.
    return { success: true, url: this.page.url() };
  }

  /**
   * Go one entry forward in the page history.
   * @returns {Promise<{success: boolean, url: string}>}
   */
  async goForward() {
    this._ensureConnected();
    await this.page.goForward({ waitUntil: 'networkidle0', timeout: this.timeout });
    this._logAction('goForward');
    return { success: true, url: this.page.url() };
  }

  // ── Page Interactions ───────────────────────────────────────

  /**
   * Click the first element matching `selector`.
   * @param {string} selector
   * @param {object} [options]
   * @param {number} [options.timeout] - wait for the selector (defaults to the session timeout)
   * @param {boolean} [options.waitForNavigation] - pass false to skip the post-click navigation wait
   * @param {boolean} [options.expectNavigation] - on LightPanda, treat an unchanged URL as a failure and retry on Chrome
   * @returns {Promise<{success: boolean, url: string}>}
   */
  async click(selector, options = {}) {
    this._ensureConnected();
    try {
      await this.page.waitForSelector(selector, { timeout: options.timeout ?? this.timeout });
      const urlBefore = this.page.url();

      // Arm the navigation wait BEFORE clicking: a navigation that completes
      // quickly would otherwise be missed and the call would always burn the
      // full 3s. A timeout simply means "no navigation happened".
      const navigation = options.waitForNavigation !== false
        ? this.page.waitForNavigation({ timeout: 3000, waitUntil: 'networkidle0' }).catch(() => null)
        : null;

      await this.page.click(selector);
      if (navigation) await navigation;

      this._logAction('click', { selector });

      // Check for LP's known click-navigation bug
      const stuckOnLightPanda = this.activeBackend === 'lightpanda'
        && options.expectNavigation
        && urlBefore === this.page.url();
      if (stuckOnLightPanda) {
        this._log('Click did not trigger expected navigation — falling back to Chrome');
        await this._fallbackToChrome(FALLBACK_REASONS.NAVIGATION_FAILED);
        // The backend is now Chrome, so this retry cannot loop.
        return this.click(selector, options);
      }

      return { success: true, url: this.page.url() };
    } catch (err) {
      if (await this._shouldFallback(err)) {
        await this._fallbackToChrome(FALLBACK_REASONS.CDP_ERROR);
        return this.click(selector, options);
      }
      throw err;
    }
  }

  /**
   * Type `text` into the element matching `selector`.
   * @param {string} selector
   * @param {string} text
   * @param {object} [options]
   * @param {number} [options.timeout] - wait for the selector (defaults to the session timeout)
   * @param {boolean} [options.clear] - select the current content and delete it first
   * @param {number} [options.delay] - per-keystroke delay in ms
   * @returns {Promise<{success: boolean}>}
   */
  async type(selector, text, options = {}) {
    this._ensureConnected();
    try {
      await this.page.waitForSelector(selector, { timeout: options.timeout ?? this.timeout });

      if (options.clear) {
        // Triple-click selects the existing value so Backspace removes it.
        await this.page.click(selector, { clickCount: 3 });
        await this.page.keyboard.press('Backspace');
      }

      await this.page.type(selector, text, { delay: options.delay || 0 });
      // Only a 20-character preview of the text goes into the history.
      this._logAction('type', { selector, text: text.substring(0, 20) + (text.length > 20 ? '...' : '') });
      return { success: true };
    } catch (err) {
      if (await this._shouldFallback(err)) {
        await this._fallbackToChrome(FALLBACK_REASONS.CDP_ERROR);
        return this.type(selector, text, options);
      }
      throw err;
    }
  }

  /**
   * Scroll the window vertically.
   * @param {'up'|'down'} [direction='down']
   * @param {number} [amount=500] - pixels
   * @returns {Promise<{success: boolean}>}
   */
  async scroll(direction = 'down', amount = 500) {
    this._ensureConnected();
    const deltaY = direction === 'up' ? -amount : amount;
    await this.page.evaluate((dy) => window.scrollBy(0, dy), deltaY);
    this._logAction('scroll', { direction, amount });
    return { success: true };
  }

  /**
   * Hover over the element matching `selector`. On LightPanda this switches
   * to Chrome in 'auto' mode and throws otherwise.
   * @param {string} selector
   * @returns {Promise<{success: boolean}>}
   */
  async hover(selector) {
    this._ensureConnected();
    if (this.activeBackend === 'lightpanda') {
      // LP doesn't support mouseMoved — fall back
      if (this.mode !== 'auto') {
        throw new Error('hover() not supported in LightPanda mode');
      }
      await this._fallbackToChrome(FALLBACK_REASONS.METHOD_NOT_SUPPORTED);
      return this.hover(selector);
    }

    await this.page.waitForSelector(selector, { timeout: this.timeout });
    await this.page.hover(selector);
    this._logAction('hover', { selector });
    return { success: true };
  }

  /**
   * Choose option value(s) in a `<select>` element.
   * @param {string} selector
   * @param {...string} values
   * @returns {Promise<{success: boolean}>}
   */
  async select(selector, ...values) {
    this._ensureConnected();
    await this.page.waitForSelector(selector, { timeout: this.timeout });
    await this.page.select(selector, ...values);
    this._logAction('select', { selector, values });
    return { success: true };
  }

  /**
   * Press a single keyboard key on the page.
   * @param {string} key - Puppeteer key name, e.g. 'Enter'
   * @returns {Promise<{success: boolean}>}
   */
  async pressKey(key) {
    this._ensureConnected();
    await this.page.keyboard.press(key);
    this._logAction('pressKey', { key });
    return { success: true };
  }

  // ── Content Extraction ──────────────────────────────────────

  /**
   * Extract a bounded summary of the current document: title, meta
   * description, up to 20 headings, 15 paragraphs, 30 absolute links and
   * the first 5000 characters of body text.
   */
  async extractContent() {
    this._ensureConnected();
    return this.page.evaluate(() => {
      const title = document.title;
      const metaDescription = document.querySelector('meta[name="description"]')?.content || '';
      const headings = Array.from(document.querySelectorAll('h1, h2, h3'))
        .map((h) => ({ level: h.tagName.toLowerCase(), text: h.textContent.trim() }))
        .filter((h) => h.text.length > 0)
        .slice(0, 20);
      const paragraphs = Array.from(document.querySelectorAll('p'))
        .map((p) => p.textContent.trim())
        .filter((t) => t.length > 20)
        .slice(0, 15);
      const links = Array.from(document.querySelectorAll('a[href]'))
        .map((a) => ({ text: a.textContent.trim(), href: a.href }))
        .filter((l) => l.text.length > 0 && l.href.startsWith('http'))
        .slice(0, 30);
      const bodyText = document.body?.innerText?.substring(0, 5000) || '';

      return { title, metaDescription, headings, paragraphs, links, bodyText, url: location.href };
    });
  }

  /**
   * Run `fn` inside the page and return its result.
   * @param {Function} fn
   * @param {...any} args - arguments forwarded to `fn`
   */
  async evaluate(fn, ...args) {
    this._ensureConnected();
    return this.page.evaluate(fn, ...args);
  }

  /**
   * Wait until an element matching `selector` exists.
   * @param {string} selector
   * @param {number} [timeout] - defaults to the session timeout
   * @returns {Promise<{success: boolean}>}
   */
  async waitFor(selector, timeout) {
    this._ensureConnected();
    await this.page.waitForSelector(selector, { timeout: timeout ?? this.timeout });
    return { success: true };
  }

  // ── Screenshot ──────────────────────────────────────────────

  /**
   * Capture the page as a base64 data URL. On LightPanda the session first
   * switches to Chrome, unless the mode is 'headless', in which case it throws.
   * @param {object} [options]
   * @param {'png'|'jpeg'|'webp'} [options.type_] - image format (preferred key)
   * @param {string} [options.type] - image format; the literal 'screenshot' is ignored
   * @param {boolean} [options.fullPage=true]
   * @returns {Promise<{success: boolean, screenshot: string, backend: string}>}
   */
  async screenshot(options = {}) {
    this._ensureConnected();

    // LightPanda can't screenshot — auto-fallback
    if (this.activeBackend === 'lightpanda') {
      if (this.mode === 'headless') {
        throw new Error('screenshot() not available in headless-only mode (LightPanda has no rendering engine)');
      }
      await this._fallbackToChrome(FALLBACK_REASONS.SCREENSHOT);
    }

    // type_ avoids collision with executeAction's { type: 'screenshot' }
    const imageFormat = options.type_ || (options.type !== 'screenshot' && options.type) || 'png';
    const buffer = await this.page.screenshot({
      type: imageFormat,
      fullPage: options.fullPage ?? true,
      encoding: 'base64',
    });

    this._logAction('screenshot');
    return {
      success: true,
      screenshot: `data:image/${imageFormat};base64,${buffer}`,
      backend: this.activeBackend,
    };
  }

  // ── AI Agent Interface ──────────────────────────────────────

  /**
   * Returns structured page state for AI decision-making.
   * Includes interactive elements with selectors for the AI to use.
   * @param {object} [options]
   * @param {boolean} [options.includeScreenshot] - attach a screenshot when the backend (or an 'auto' fallback) can produce one
   */
  async getPageState(options = {}) {
    this._ensureConnected();

    const state = await this.page.evaluate(() => {
      // Everything in this callback is serialized and run inside the page,
      // so the helpers have to be defined here rather than on the class.
      const escapeIdent = (value) => (
        typeof CSS !== 'undefined' && typeof CSS.escape === 'function'
          ? CSS.escape(value)
          : String(value).replace(/[^a-zA-Z0-9_-]/g, '\\$&')
      );
      const escapeAttr = (value) => String(value).replace(/["\\]/g, '\\$&');

      // Structural selector for elements without an id. nth-of-type is
      // counted among same-tag siblings, so the path is built level by level
      // up to the nearest ancestor with an id (or the document root).
      const cssPath = (el) => {
        const segments = [];
        let node = el;
        while (node && node.nodeType === 1 && node !== document.documentElement) {
          if (node.id) {
            segments.unshift(`#${escapeIdent(node.id)}`);
            break;
          }
          let position = 1;
          for (let sib = node.previousElementSibling; sib; sib = sib.previousElementSibling) {
            if (sib.tagName === node.tagName) position++;
          }
          segments.unshift(`${node.tagName.toLowerCase()}:nth-of-type(${position})`);
          node = node.parentElement;
        }
        return segments.join(' > ');
      };

      const interactiveElements = [];

      // Buttons
      document.querySelectorAll('button, [role="button"], input[type="submit"], input[type="button"]').forEach((el) => {
        if (el.offsetParent === null) return; // hidden
        const text = el.textContent?.trim() || el.value || el.getAttribute('aria-label') || '';
        if (!text) return;
        interactiveElements.push({
          type: 'button',
          text: text.substring(0, 100),
          selector: cssPath(el),
          tag: el.tagName.toLowerCase(),
        });
      });

      // Links
      document.querySelectorAll('a[href]').forEach((el) => {
        if (el.offsetParent === null) return;
        const text = el.textContent?.trim();
        if (!text || text.length < 2) return;
        interactiveElements.push({
          type: 'link',
          text: text.substring(0, 100),
          href: el.href,
          selector: el.id
            ? `#${escapeIdent(el.id)}`
            : `a[href="${escapeAttr(el.getAttribute('href'))}"]`,
        });
      });

      // Inputs
      document.querySelectorAll('input, textarea, select').forEach((el) => {
        if (el.offsetParent === null || el.type === 'hidden') return;
        // Only look for a <label for> when there is an id to match against.
        const labelFor = el.id
          ? document.querySelector(`label[for="${escapeAttr(el.id)}"]`)?.textContent?.trim()
          : '';
        const label = el.getAttribute('aria-label')
          || el.placeholder
          || labelFor
          || el.name
          || '';

        let selector;
        if (el.id) selector = `#${escapeIdent(el.id)}`;
        else if (el.name) selector = `[name="${escapeAttr(el.name)}"]`;
        else selector = cssPath(el); // `[name=""]` would not match this element

        interactiveElements.push({
          type: el.tagName.toLowerCase() === 'select' ? 'select' : 'input',
          inputType: el.type || 'text',
          label: label.substring(0, 100),
          value: el.value?.substring(0, 50) || '',
          selector,
          tag: el.tagName.toLowerCase(),
        });
      });

      return {
        url: location.href,
        title: document.title,
        text: document.body?.innerText?.substring(0, 3000) || '',
        interactiveElements: interactiveElements.slice(0, 50),
      };
    });

    // Optionally include screenshot. screenshot() performs the Chrome
    // fallback itself; a LightPanda session outside 'auto' mode is skipped.
    const canScreenshot = this.activeBackend !== 'lightpanda' || this.mode === 'auto';
    if (options.includeScreenshot && canScreenshot) {
      const { screenshot } = await this.screenshot();
      state.screenshot = screenshot;
    }

    state.backend = this.activeBackend;
    state.sessionHistory = this.history.slice(-10);

    return state;
  }

  /**
   * Execute a structured action from an AI agent.
   * @param {{ type: string, selector?: string, text?: string, url?: string, key?: string, direction?: string, amount?: number }} action
   * @throws {Error} when `action.type` is not one of the supported actions
   */
  async executeAction(action) {
    switch (action.type) {
      case 'goto': return this.goto(action.url);
      case 'click': return this.click(action.selector, action);
      case 'type': return this.type(action.selector, action.text, action);
      case 'scroll': return this.scroll(action.direction, action.amount);
      case 'hover': return this.hover(action.selector);
      case 'select': return this.select(action.selector, ...(action.values || []));
      case 'pressKey': return this.pressKey(action.key);
      case 'goBack': return this.goBack();
      case 'goForward': return this.goForward();
      case 'screenshot': return this.screenshot(action);
      case 'extractContent': return this.extractContent();
      case 'waitFor': return this.waitFor(action.selector, action.timeout);
      default: throw new Error(`Unknown action type: ${action.type}`);
    }
  }

  // ── Cookies ─────────────────────────────────────────────────

  /** @returns {Promise<object[]>} cookies visible to the current page */
  async getCookies() {
    this._ensureConnected();
    return this.page.cookies();
  }

  /**
   * Set cookies on the current page.
   * @param {object[]} cookies - Puppeteer/CDP cookie objects
   */
  async setCookies(cookies) {
    this._ensureConnected();
    if (this.activeBackend !== 'lightpanda') {
      await this.page.setCookie(...cookies);
      return;
    }

    // LP doesn't support Network.deleteCookies which Puppeteer's setCookie calls.
    // Use the page's internal CDP session to call Network.setCookies directly.
    try {
      const client = this.page._client();
      await client.send('Network.setCookies', { cookies });
    } catch {
      // Fallback: set cookies via document.cookie (limited to non-httpOnly)
      for (const c of cookies) {
        const parts = [`${c.name}=${c.value}`];
        if (c.domain) parts.push(`domain=${c.domain}`);
        if (c.path) parts.push(`path=${c.path}`);
        await this.page.evaluate((cookieStr) => { document.cookie = cookieStr; }, parts.join('; '));
      }
    }
  }

  // ── Fallback ────────────────────────────────────────────────

  /**
   * Replace the LightPanda page with a Chrome page, carrying over the
   * cookies and the current URL when they can still be read.
   * @param {string} reason - one of FALLBACK_REASONS, recorded in the history
   * @throws {Error} after more than three attempts on the same session
   */
  async _fallbackToChrome(reason) {
    if (this.activeBackend === 'chrome') return; // already on Chrome
    if (this._fallbackCount > 2) throw new Error('Too many fallback attempts');

    this._log(`Falling back to Chrome: ${reason}`);
    this._fallbackCount++;

    // Save state from LP session. The URL is read first so that a failing
    // cookie query does not also lose the location.
    let cookies = [];
    let currentUrl = null;
    try {
      currentUrl = this.page.url();
      cookies = await this.page.cookies();
    } catch {
      // LP might be in a bad state
    }

    // Close LP page (keep server alive for potential reuse). Each step gets
    // its own try so one failure cannot skip the remaining cleanup.
    const lpPage = this.page;
    const lpContext = this.context;
    const lpBrowser = this.browser;
    try { if (lpPage && !lpPage.isClosed()) await lpPage.close(); } catch { /* ignore cleanup errors */ }
    try { if (lpContext) await lpContext.close(); } catch { /* ignore cleanup errors */ }
    try { if (lpBrowser) await lpBrowser.disconnect(); } catch { /* ignore cleanup errors */ }
    // Drop the dead handles so nothing (including close()) touches them again.
    this.page = null;
    this.context = null;
    this.browser = null;

    // Connect to Chrome
    await this._connectChrome();

    // Restore state
    if (cookies.length > 0) {
      await this.page.setCookie(...cookies);
    }
    if (currentUrl && currentUrl !== 'about:blank') {
      await this.page.goto(currentUrl, {
        waitUntil: 'networkidle0',
        timeout: this.timeout,
      });
    }

    this._logAction('fallback', { reason, from: 'lightpanda', to: 'chrome' });
  }

  /**
   * Decide whether `error` looks like a LightPanda/CDP failure that a
   * retry on Chrome could fix.
   * @param {Error} error
   * @returns {Promise<boolean>}
   */
  async _shouldFallback(error) {
    if (this.activeBackend === 'chrome') return false;
    if (this.mode === 'headless') return false; // no auto-fallback in explicit headless mode

    const msg = error.message || '';
    const cdpFailureMarkers = [
      'Protocol error',
      'not implemented',
      'Target closed',
      'Session closed',
      'Connection closed',
      'Execution context was destroyed',
    ];
    return cdpFailureMarkers.some((marker) => msg.includes(marker));
  }

  // ── Cleanup ─────────────────────────────────────────────────

  /**
   * Close the page and context, disconnect from LightPanda or return the
   * Chrome browser to the pool, and mark the session as disconnected.
   * Errors from the page, context and disconnect steps are ignored.
   */
  async close() {
    try {
      if (this.page && !this.page.isClosed()) await this.page.close();
    } catch { /* ignore */ }

    try {
      if (this.context) await this.context.close();
    } catch { /* ignore */ }

    if (this.activeBackend === 'lightpanda' && this.browser) {
      try { await this.browser.disconnect(); } catch { /* ignore */ }
    }

    if (this._chromeBrowser) {
      browserPool.releaseBrowser(this._chromeBrowser);
      this._chromeBrowser = null;
    }

    this.page = null;
    this.context = null;
    this.browser = null;
    this._connected = false;
    this._log('Session closed');
  }

  // ── Helpers ─────────────────────────────────────────────────

  /** @throws {Error} when connect() has not completed or the page is gone */
  _ensureConnected() {
    if (!this._connected || !this.page) {
      throw new Error('Session not connected. Call connect() first.');
    }
  }

  /** Load Puppeteer on demand; fails with an install hint if it cannot be imported. */
  async _getPuppeteer() {
    try {
      const puppeteer = await import('puppeteer');
      return puppeteer.default || puppeteer;
    } catch {
      throw new Error('Puppeteer is required for BrowserSession. Install with: npm install puppeteer');
    }
  }

  /** Print a prefixed message when `verbose` is on. */
  _log(msg) {
    if (this.verbose) console.log(`[BrowserSession] ${msg}`);
  }

  /** Append an entry to the history, stamped with the time and active backend. */
  _logAction(type, params = {}) {
    const entry = { type, ...params, timestamp: Date.now(), backend: this.activeBackend };
    this.history.push(entry);
    this._log(`${type} ${JSON.stringify(params)}`);
  }

  /** @returns {object[]} the live action log (not a copy) */
  getHistory() {
    return this.history;
  }

  /** @returns {'lightpanda'|'chrome'|null} backend currently in use */
  getBackend() {
    return this.activeBackend;
  }
}
541
+
542
+ /**
543
+ * Convenience function to create and connect a browser session.
544
+ */
545
/**
 * Convenience function to create and connect a browser session.
 *
 * @param {object} [options] - forwarded unchanged to the BrowserSession constructor
 * @returns {Promise<BrowserSession>} a connected session
 * @throws whatever connect() throws; the half-built session is closed first
 */
export async function createSession(options = {}) {
  const session = new BrowserSession(options);
  try {
    await session.connect();
  } catch (err) {
    // The caller never receives the session on failure, so anything it
    // already acquired has to be released here.
    try { await session.close(); } catch { /* keep the original error */ }
    throw err;
  }
  return session;
}
550
+
551
+ export default BrowserSession;
package/index.d.ts CHANGED
@@ -455,6 +455,103 @@ export function bulkScrape(urls: string[], options?: BulkScrapeOptions): Promise
455
455
  */
456
456
  export function bulkScrapeStream(urls: string[], options: BulkScrapeStreamOptions): Promise<BulkScrapeStreamStats>;
457
457
 
458
+ // ── Browser Session ───────────────────────────────────────────
459
+
460
/** Constructor options for {@link BrowserSession}. Every field is optional. */
export interface BrowserSessionOptions {
  /** Backend policy: 'headless' = LightPanda, 'visual' = Chrome, 'auto' = LightPanda with Chrome fallback. */
  mode?: 'headless' | 'visual' | 'auto';
  /** Navigation and selector timeout in milliseconds. */
  timeout?: number;
  /** User-Agent string applied to the page. */
  userAgent?: string;
  /** Path to the LightPanda binary. */
  lightpandaPath?: string;
  /** Log session activity to the console. */
  verbose?: boolean;
}
467
+
468
/** Snapshot of the current page as returned by `BrowserSession.getPageState()`. */
export interface PageState {
  /** Location of the page when the snapshot was taken. */
  url: string;
  title: string;
  /** Leading part of the visible body text. */
  text: string;
  /** Visible buttons, links and form controls, each with a selector to act on. */
  interactiveElements: Array<{
    type: 'button' | 'link' | 'input' | 'select';
    text?: string;
    label?: string;
    href?: string;
    selector: string;
    tag?: string;
    inputType?: string;
    value?: string;
  }>;
  /** Base64 data URL; present only when a screenshot was requested and could be taken. */
  screenshot?: string;
  backend: 'lightpanda' | 'chrome';
  /**
   * The most recent history entries. Besides the common fields, each entry
   * carries the parameters of its action (`url`, `selector`, `reason`, …),
   * hence the index signature.
   */
  sessionHistory: Array<{ type: string; timestamp: number; backend: string; [param: string]: unknown }>;
}
486
+
487
/** Result object returned by session actions. */
export interface ActionResult {
  success: boolean;
  /** Page URL after the action, for actions that report it. */
  url?: string;
  /** Base64 data URL, set by the screenshot action. */
  screenshot?: string;
  /** Backend that served the action, set by the screenshot action. */
  backend?: string;
}
493
+
494
/**
 * Structured action accepted by `BrowserSession.executeAction()`.
 * `type` selects the operation; the other fields are read only by the
 * operations that need them.
 */
export interface BrowserAction {
  type:
    | 'goto'
    | 'click'
    | 'type'
    | 'scroll'
    | 'hover'
    | 'select'
    | 'pressKey'
    | 'goBack'
    | 'goForward'
    | 'screenshot'
    | 'extractContent'
    | 'waitFor';
  /** Target of 'goto'. */
  url?: string;
  /** Element selector for 'click', 'type', 'hover', 'select' and 'waitFor'. */
  selector?: string;
  /** Text entered by 'type'. */
  text?: string;
  /** Key name for 'pressKey'. */
  key?: string;
  /** Scroll direction for 'scroll'. */
  direction?: 'up' | 'down';
  /** Scroll distance in pixels for 'scroll'. */
  amount?: number;
  /** Option values for 'select'. */
  values?: string[];
  /** Selector wait timeout in milliseconds. */
  timeout?: number;
  /** 'click': treat an unchanged URL as a failed navigation. */
  expectNavigation?: boolean;
  /** 'click': set to false to skip the post-click navigation wait. */
  waitForNavigation?: boolean;
  /** 'type': clear the field before typing. */
  clear?: boolean;
  /** 'type': per-keystroke delay in milliseconds. */
  delay?: number;
  /** 'screenshot': capture the full scrollable page. */
  fullPage?: boolean;
  /** 'screenshot': image format (named `type_` because `type` holds the action name). */
  type_?: 'png' | 'jpeg' | 'webp';
  includeScreenshot?: boolean;
}
512
+
513
/**
 * Stateful browser session backed by LightPanda or Chrome.
 * Call `connect()` (or use `createSession`) before any other method.
 */
export declare class BrowserSession {
  constructor(options?: BrowserSessionOptions);

  readonly activeBackend: 'lightpanda' | 'chrome' | null;
  readonly mode: 'headless' | 'visual' | 'auto';

  connect(): Promise<BrowserSession>;
  goto(url: string): Promise<ActionResult>;
  goBack(): Promise<void>;
  goForward(): Promise<void>;
  click(selector: string, options?: { timeout?: number; expectNavigation?: boolean; waitForNavigation?: boolean }): Promise<ActionResult>;
  type(selector: string, text: string, options?: { timeout?: number; clear?: boolean; delay?: number }): Promise<ActionResult>;
  scroll(direction?: 'up' | 'down', amount?: number): Promise<ActionResult>;
  hover(selector: string): Promise<ActionResult>;
  select(selector: string, ...values: string[]): Promise<ActionResult>;
  pressKey(key: string): Promise<ActionResult>;
  /** `type_` is accepted as an alias of `type`, matching `BrowserAction`. */
  screenshot(options?: { type?: 'png' | 'jpeg' | 'webp'; type_?: 'png' | 'jpeg' | 'webp'; fullPage?: boolean }): Promise<{ success: boolean; screenshot: string; backend: string }>;
  extractContent(): Promise<{
    title: string;
    metaDescription: string;
    headings: Array<{ level: string; text: string }>;
    paragraphs: string[];
    links: Array<{ text: string; href: string }>;
    bodyText: string;
    url: string;
  }>;
  /** `fn` runs inside the page; it may be synchronous or return a promise. */
  evaluate<T>(fn: (...args: any[]) => T | Promise<T>, ...args: any[]): Promise<T>;
  waitFor(selector: string, timeout?: number): Promise<ActionResult>;
  getPageState(options?: { includeScreenshot?: boolean }): Promise<PageState>;
  executeAction(action: BrowserAction): Promise<ActionResult>;
  getCookies(): Promise<any[]>;
  setCookies(cookies: any[]): Promise<void>;
  /** Entries also carry the parameters of the logged action. */
  getHistory(): Array<{ type: string; timestamp: number; backend: string; [param: string]: unknown }>;
  getBackend(): 'lightpanda' | 'chrome' | null;
  close(): Promise<void>;
}
541
+
542
/** Create a {@link BrowserSession} and connect it before resolving. */
export function createSession(
  options?: BrowserSessionOptions,
): Promise<BrowserSession>;
543
+
544
/** Controller for a local LightPanda CDP server process. */
export declare class LightPandaServer {
  /** @param binaryPath Path to the LightPanda binary. */
  constructor(binaryPath?: string);

  /** Launch the server and resolve with its WebSocket endpoint. */
  start(port?: number): Promise<string>;

  /** Terminate the server process. */
  stop(): void;

  /** Whether the server process is alive and has reported ready. */
  isRunning(): boolean;

  /** WebSocket endpoint (`ws://host:port`) of the server. */
  getEndpoint(): string;
}
551
+
552
/** Return the shared {@link LightPandaServer}, creating it on first use. */
export function getLightPandaServer(binaryPath?: string): LightPandaServer;

/** Stop and discard the shared {@link LightPandaServer}, if one exists. */
export function stopLightPandaServer(): void;
554
+
458
555
  /**
459
556
  * Default export - same as BNCASmartScraper class
460
557
  */
package/index.js CHANGED
@@ -1795,4 +1795,8 @@ export async function bulkScrapeStream(urls, options = {}) {
1795
1795
  }
1796
1796
  }
1797
1797
 
1798
+ // Browser session exports
1799
+ export { BrowserSession, createSession } from './browser-session.js';
1800
+ export { default as LightPandaServer, getLightPandaServer, stopLightPandaServer } from './lightpanda-server.js';
1801
+
1798
1802
  export default BNCASmartScraper;
@@ -0,0 +1,151 @@
1
import { spawn } from 'child_process';
import fs from 'fs';
import { createServer } from 'net';
import path from 'path';
import { fileURLToPath } from 'url';
5
+
6
/**
 * Manages one local LightPanda process running in CDP server mode
 * (`lightpanda serve`) and exposes its WebSocket endpoint.
 */
class LightPandaServer {
  /**
   * @param {string} [binaryPath] - path to the LightPanda binary; when
   *   omitted, a few known locations are probed and `lightpanda` on PATH is
   *   the last resort
   */
  constructor(binaryPath) {
    this.binaryPath = binaryPath || this._findBinary();
    this.process = null;
    this.host = '127.0.0.1';
    this.port = null;
    this.ready = false;
    this._starting = null; // in-flight start() promise, shared by concurrent callers
  }

  /**
   * Start the server if it is not already running.
   * @param {number} [port] - port to listen on; a free port is picked when omitted
   * @returns {Promise<string>} the `ws://` endpoint to connect to
   * @throws {Error} when the binary cannot be spawned, exits during startup,
   *   or does not become ready within 5 seconds
   */
  async start(port) {
    if (this.process && this.ready) return this.getEndpoint();
    // A second caller arriving while the process is still booting must wait
    // for that boot instead of spawning another process.
    if (this._starting) return this._starting;

    const launch = this._launch(port);
    this._starting = launch;
    try {
      return await launch;
    } finally {
      if (this._starting === launch) this._starting = null;
    }
  }

  /** Spawn the process and settle once it is ready, has failed, or timed out. */
  async _launch(port) {
    this.port = port || await this._findAvailablePort();
    const listenPort = this.port; // stop() resets this.port; the poller needs a stable copy

    return new Promise((resolve, reject) => {
      const args = [
        'serve',
        '--host', this.host,
        '--port', String(listenPort),
        '--cdp_max_connections', '16',
      ];

      const child = spawn(this.binaryPath, args, {
        stdio: ['ignore', 'pipe', 'pipe'],
      });
      this.process = child;
      // stdout is piped but nothing reads it; drain it so a chatty child
      // cannot block on a full pipe.
      child.stdout?.resume();

      let stderr = '';
      let settled = false;
      let pollTimer = null;
      let hardTimer = null;

      const finish = () => {
        settled = true;
        clearTimeout(pollTimer);
        clearTimeout(hardTimer);
      };
      const onReady = () => {
        if (settled) return;
        finish();
        this.ready = true;
        resolve(this.getEndpoint());
      };
      const onFailure = (error) => {
        if (settled) return;
        finish();
        reject(error);
      };

      // LightPanda logs server start to stderr
      child.stderr.on('data', (data) => {
        if (settled) return; // keep consuming the stream, but stop buffering
        stderr += data.toString();
        if (stderr.includes('Listening on') || stderr.includes('server started')) {
          onReady();
        }
      });

      child.on('error', (err) => {
        // Only touch shared state if this child is still the current one.
        if (this.process === child) {
          this.ready = false;
          this.process = null;
        }
        onFailure(new Error(`Failed to start LightPanda: ${err.message}`));
      });

      child.on('exit', (code) => {
        if (this.process === child) {
          this.ready = false;
          this.process = null;
        }
        // No-op once the start promise has settled.
        onFailure(new Error(`LightPanda exited with code ${code}: ${stderr}`));
      });

      // Fallback: if no stderr signal arrived after 1.5s, poll /json/version
      // every 500ms until the server answers or the hard timeout fires.
      const poll = async () => {
        if (settled) return;
        try {
          const res = await fetch(`http://${this.host}:${listenPort}/json/version`);
          if (res.ok) {
            onReady();
            return;
          }
        } catch {
          // Still starting up, give it more time
        }
        if (!settled) pollTimer = setTimeout(poll, 500);
      };
      pollTimer = setTimeout(poll, 1500);

      // Hard timeout
      hardTimer = setTimeout(() => {
        if (settled) return;
        this.stop();
        onFailure(new Error(`LightPanda failed to start within 5s. stderr: ${stderr}`));
      }, 5000);
    });
  }

  /** @returns {string} WebSocket endpoint built from the current host and port */
  getEndpoint() {
    return `ws://${this.host}:${this.port}`;
  }

  /** @returns {boolean} true while a spawned process exists and has reported ready */
  isRunning() {
    return this.ready && this.process !== null;
  }

  /** Send SIGTERM to the process (if any) and reset the ready state and port. */
  stop() {
    if (this.process) {
      try {
        this.process.kill('SIGTERM');
      } catch {
        // already dead
      }
      this.process = null;
    }
    this.ready = false;
    this.port = null;
  }

  /** Ask the OS for a free loopback port by binding to port 0. */
  async _findAvailablePort() {
    return new Promise((resolve, reject) => {
      const probe = createServer();
      probe.on('error', reject);
      probe.listen(0, '127.0.0.1', () => {
        const { port } = probe.address();
        probe.close(() => resolve(port));
      });
    });
  }

  /** Locate the LightPanda binary in the known install locations. */
  _findBinary() {
    // fileURLToPath decodes percent-escapes and handles Windows drive
    // letters, which URL#pathname does not.
    const moduleDir = path.dirname(fileURLToPath(import.meta.url));
    const candidates = [
      path.join(moduleDir, 'bin', 'lightpanda'),
      '/usr/local/bin/lightpanda',
      '/usr/bin/lightpanda',
    ];

    const found = candidates.find((candidate) => fs.existsSync(candidate));
    return found ?? 'lightpanda'; // hope it's on PATH
  }
}
129
+
130
// Singleton instance — shared across all sessions
let _instance = null;

/**
 * Return the shared LightPandaServer, creating it on first use.
 *
 * @param {string} [binaryPath] - path to the LightPanda binary. It is applied
 *   when the instance is created, and also later as long as no process has
 *   been spawned; a server that is starting or running keeps its binary.
 * @returns {LightPandaServer}
 */
export function getLightPandaServer(binaryPath) {
  if (!_instance) {
    _instance = new LightPandaServer(binaryPath);
  } else if (binaryPath && binaryPath !== _instance.binaryPath && !_instance.process) {
    // Without this, a first call that resolved to a missing binary would pin
    // the bad path and every later explicit path would be silently ignored.
    _instance.binaryPath = binaryPath;
  }
  return _instance;
}
139
+
140
/**
 * Stop the shared server and discard the singleton, so the next
 * getLightPandaServer() call builds a fresh instance. Does nothing when no
 * instance exists.
 */
export function stopLightPandaServer() {
  if (!_instance) return;

  const server = _instance;
  server.stop();
  _instance = null;
}
146
+
147
// Make sure the LightPanda child does not outlive this process.
//
// Registering a SIGINT/SIGTERM listener disables Node's default "terminate
// on signal" behaviour. After stopping the server the signal is therefore
// re-raised, but only when the host application has no handler of its own.
// `once` removes this listener before it runs, so the listener count below
// reflects only the application's handlers.
for (const signal of ['SIGTERM', 'SIGINT']) {
  process.once(signal, () => {
    stopLightPandaServer();
    if (process.listenerCount(signal) === 0) {
      process.kill(process.pid, signal);
    }
  });
}
process.on('beforeExit', stopLightPandaServer);
// 'beforeExit' is not emitted on process.exit() or on an uncaught exception;
// 'exit' covers those paths, and stopping the server is synchronous.
process.on('exit', stopLightPandaServer);
150
+
151
+ export default LightPandaServer;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@monostate/node-scraper",
3
- "version": "2.0.0",
3
+ "version": "2.1.0",
4
4
  "description": "Intelligent web scraping with AI Q&A, PDF support and multi-level fallback system - 11x faster than traditional scrapers",
5
5
  "type": "module",
6
6
  "main": "index.js",
@@ -15,6 +15,8 @@
15
15
  "index.js",
16
16
  "index.d.ts",
17
17
  "browser-pool.js",
18
+ "browser-session.js",
19
+ "lightpanda-server.js",
18
20
  "README.md",
19
21
  "BULK_SCRAPING.md",
20
22
  "package.json",
@@ -34,6 +36,9 @@
34
36
  "data-extraction",
35
37
  "automation",
36
38
  "browser",
39
+ "browser-use",
40
+ "cdp",
41
+ "ai-agent",
37
42
  "ai-powered",
38
43
  "question-answering",
39
44
  "pdf-parsing",