brave-real-browser-mcp-server 2.14.8 → 2.14.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/handlers/advanced-scraping-handlers.js +58 -0
- package/dist/handlers/advanced-video-media-handlers.js +134 -1246
- package/dist/handlers/ai-powered-handlers.js +83 -184
- package/dist/handlers/data-extraction-handlers.js +0 -98
- package/dist/handlers/data-processing-handlers.js +0 -173
- package/dist/handlers/data-quality-handlers.js +39 -185
- package/dist/handlers/data-transform-handlers.js +66 -0
- package/dist/handlers/dom-handlers.js +206 -0
- package/dist/handlers/dynamic-session-handlers.js +0 -204
- package/dist/handlers/multi-element-handlers.js +0 -55
- package/dist/handlers/network-handlers.js +111 -0
- package/dist/handlers/pagination-handlers.js +0 -191
- package/dist/handlers/search-filter-handlers.js +15 -71
- package/dist/index.js +8 -59
- package/dist/mcp-server.js +135 -8
- package/dist/tool-definitions.js +113 -214
- package/package.json +2 -11
|
@@ -127,85 +127,6 @@ export async function handleCookieManager(args) {
|
|
|
127
127
|
/**
 * Session Persistence - save or restore the current page's browser session.
 *
 * `args.action` selects the mode (defaults to 'save'):
 *  - 'save': collects the page cookies, every localStorage and sessionStorage
 *    entry, the page URL and an ISO timestamp, and returns them as JSON text.
 *  - 'restore': re-applies `args.sessionData` (cookies, localStorage,
 *    sessionStorage — each part only when present) to the current page.
 *
 * The work runs inside `withErrorHandling` with the label
 * 'Failed session persistence'. Inside that wrapper an Error is thrown when
 * 'restore' is requested without `args.sessionData`, or when the action is
 * neither 'save' nor 'restore'.
 *
 * @param {object} args - Tool arguments (`action`, `sessionData`).
 * @returns {Promise<object>} Object with a `content` array holding one text item.
 */
export async function handleSessionPersistence(args) {
    // Copy one Web Storage area (looked up by its global name) into a plain object.
    const dumpStorage = (page, areaName) => page.evaluate((name) => {
        const area = window[name];
        const snapshot = {};
        for (let idx = 0; idx < area.length; idx++) {
            const entryKey = area.key(idx);
            if (entryKey) {
                snapshot[entryKey] = area.getItem(entryKey);
            }
        }
        return snapshot;
    }, areaName);
    // Write every key/value pair of `entries` into the named Web Storage area.
    const fillStorage = (page, areaName, entries) => page.evaluate(({ name, items }) => {
        const area = window[name];
        Object.entries(items).forEach(([entryKey, entryValue]) => {
            area.setItem(entryKey, entryValue);
        });
    }, { name: areaName, items: entries });
    return await withErrorHandling(async () => {
        validateWorkflow('session_persistence', {
            requireBrowser: true,
            requirePage: true,
        });
        const page = getCurrentPage();
        const action = args.action || 'save'; // save, restore
        switch (action) {
            case 'save': {
                const cookies = await page.cookies();
                const localEntries = await dumpStorage(page, 'localStorage');
                const sessionEntries = await dumpStorage(page, 'sessionStorage');
                const sessionData = {
                    url: page.url(),
                    cookies,
                    localStorage: localEntries,
                    sessionStorage: sessionEntries,
                    timestamp: new Date().toISOString(),
                };
                const text = `✅ Session saved\n\n${JSON.stringify(sessionData, null, 2)}`;
                return { content: [{ type: 'text', text }] };
            }
            case 'restore': {
                const sessionData = args.sessionData;
                if (!sessionData) {
                    throw new Error('Session data is required for restore');
                }
                if (sessionData.cookies) {
                    const cookieWrites = sessionData.cookies.map((cookie) => page.setCookie(cookie));
                    await Promise.all(cookieWrites);
                }
                if (sessionData.localStorage) {
                    await fillStorage(page, 'localStorage', sessionData.localStorage);
                }
                if (sessionData.sessionStorage) {
                    await fillStorage(page, 'sessionStorage', sessionData.sessionStorage);
                }
                const text = `✅ Session restored from ${sessionData.timestamp}`;
                return { content: [{ type: 'text', text }] };
            }
            default:
                throw new Error(`Unknown action: ${action}`);
        }
    }, 'Failed session persistence');
}
|
|
209
130
|
/**
|
|
210
131
|
* Form Auto Fill - Automatically fill form fields
|
|
211
132
|
*/
|
|
@@ -385,131 +306,6 @@ export async function handleAjaxContentWaiter(args) {
|
|
|
385
306
|
/**
 * Modal Popup Handler - detect or dismiss modal popups on the current page.
 *
 * `args.action` selects the mode (defaults to 'detect'):
 *  - 'detect': scans a fixed list of modal-like selectors and reports every
 *    match with a non-zero rendered size (selector, id, className, first 200
 *    characters of trimmed text). An element matching several selectors is
 *    reported once per selector.
 *  - 'close': clicks `args.closeSelector` (or a default set of close-button
 *    selectors); if the click throws for any reason, presses Escape instead.
 *
 * Any other action throws inside `withErrorHandling`
 * (label 'Failed modal popup handler').
 *
 * @param {object} args - Tool arguments (`action`, `closeSelector`).
 * @returns {Promise<object>} Object with a `content` array holding one text item.
 */
export async function handleModalPopupHandler(args) {
    const asTextResult = (text) => ({ content: [{ type: 'text', text }] });
    return await withErrorHandling(async () => {
        validateWorkflow('modal_popup_handler', {
            requireBrowser: true,
            requirePage: true,
        });
        const page = getCurrentPage();
        const action = args.action || 'detect'; // detect, close, interact
        if (action !== 'detect' && action !== 'close') {
            throw new Error(`Unknown action: ${action}`);
        }
        if (action === 'detect') {
            const modals = await page.evaluate(() => {
                const candidateSelectors = [
                    '.modal',
                    '[role="dialog"]',
                    '[class*="popup"]',
                    '[class*="overlay"]',
                    '.dialog',
                ];
                return candidateSelectors.flatMap((selector) => Array.from(document.querySelectorAll(selector))
                    // Visible here means the element has a non-zero layout box.
                    .filter((node) => node.offsetWidth > 0 && node.offsetHeight > 0)
                    .map((node) => ({
                        selector,
                        id: node.id || null,
                        className: node.className,
                        text: node.textContent?.trim().substring(0, 200) || '',
                    })));
            });
            return asTextResult(`✅ Found ${modals.length} visible modals\n\n${JSON.stringify(modals, null, 2)}`);
        }
        // action === 'close'
        const closeSelector = args.closeSelector || '.close, [aria-label="Close"], button[class*="close"]';
        try {
            await page.click(closeSelector);
        }
        catch (e) {
            // Clicking failed (e.g. nothing matched): fall back to the Escape key.
            await page.keyboard.press('Escape');
            return asTextResult(`✅ Pressed Escape key to close modal`);
        }
        return asTextResult(`✅ Modal closed`);
    }, 'Failed modal popup handler');
}
|
|
454
309
|
/**
 * Login Session Manager - inspect or establish a login session on the current page.
 *
 * `args.action` selects the mode (defaults to 'check'):
 *  - 'check': heuristically reports whether the page looks logged in (an
 *    element whose class contains "logout"/"profile"/"account", a cookie
 *    string containing "session" or "token", or a localStorage "token" entry)
 *    and lists cookies whose name contains "session", "token" or "auth".
 *  - 'login': types `args.username` / `args.password` into the matching
 *    fields and clicks the submit control. Selectors can be overridden with
 *    `usernameSelector`, `passwordSelector` and `submitSelector`.
 *
 * Inside `withErrorHandling` (label 'Failed login session manager') an Error
 * is thrown when 'login' lacks a username or password, or when the action is
 * unknown.
 *
 * @param {object} args - Tool arguments described above.
 * @returns {Promise<object>} Object with a `content` array holding one text item.
 */
export async function handleLoginSessionManager(args) {
    return await withErrorHandling(async () => {
        validateWorkflow('login_session_manager', {
            requireBrowser: true,
            requirePage: true,
        });
        const page = getCurrentPage();
        const action = args.action || 'check'; // check, login
        if (action === 'check') {
            const isLoggedIn = await page.evaluate(() => {
                // Common indicators of a logged-in state; any single hit counts.
                const indicators = [
                    document.querySelector('[class*="logout"]'),
                    document.querySelector('[class*="profile"]'),
                    document.querySelector('[class*="account"]'),
                    document.cookie.includes('session') || document.cookie.includes('token'),
                    localStorage.getItem('token') !== null,
                ];
                return indicators.some((indicator) => Boolean(indicator));
            });
            const cookies = await page.cookies();
            const sessionCookies = cookies.filter((c) => {
                const name = c.name.toLowerCase();
                return name.includes('session') || name.includes('token') || name.includes('auth');
            });
            return {
                content: [{
                        type: 'text',
                        text: `✅ Login Status Check\n\nLikely Logged In: ${isLoggedIn}\nSession Cookies: ${sessionCookies.length}\n\n${JSON.stringify(sessionCookies.map(c => ({ name: c.name, domain: c.domain })), null, 2)}`,
                    }],
            };
        }
        if (action === 'login') {
            const username = args.username;
            const password = args.password;
            const usernameSelector = args.usernameSelector || 'input[type="email"], input[type="text"], input[name*="user"], input[name*="email"]';
            const passwordSelector = args.passwordSelector || 'input[type="password"]';
            const submitSelector = args.submitSelector || 'button[type="submit"], input[type="submit"]';
            if (!username || !password) {
                throw new Error('Username and password are required');
            }
            // Fill username
            await page.waitForSelector(usernameSelector, { timeout: 5000 });
            await page.type(usernameSelector, username);
            // Fill password
            await page.waitForSelector(passwordSelector, { timeout: 5000 });
            await page.type(passwordSelector, password);
            // Submit. The navigation wait is armed BEFORE the click so a fast
            // navigation cannot complete unnoticed (which previously cost the
            // full 10 s timeout). A missing navigation is tolerated: the form
            // may submit without leaving the page.
            await Promise.all([
                page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 10000 }).catch(() => { }),
                page.click(submitSelector),
            ]);
            return {
                content: [{
                        type: 'text',
                        text: `✅ Login attempted\n\nUsername: ${username}\nCurrent URL: ${page.url()}`,
                    }],
            };
        }
        throw new Error(`Unknown action: ${action}`);
    }, 'Failed login session manager');
}
|
|
@@ -194,61 +194,6 @@ export async function handleAttributeHarvester(args) {
|
|
|
194
194
|
};
|
|
195
195
|
}, 'Failed to harvest attributes');
|
|
196
196
|
}
|
|
197
|
-
/**
 * Extracts all images with their URLs, alt text and dimensions.
 *
 * Reads every element matching `args.selector` (default 'img') on the current
 * page. For each one it reports the index within the matched list, `src`
 * (falling back to the `data-src` attribute), `alt` and `title`; optionally
 * the natural/rendered width and height; and `srcset` / `loading` when set.
 *
 * @param {object} args
 * @param {string} [args.selector] - CSS selector for the images.
 * @param {boolean} [args.includeDataUrls] - Keep `data:` sources (default false).
 * @param {boolean} [args.includeDimensions] - Report width/height unless
 *   explicitly `false`.
 * @returns {Promise<object>} Object with a `content` array holding one text item.
 */
export async function handleImageScraper(args) {
    return await withErrorHandling(async () => {
        validateWorkflow('image_scraper', {
            requireBrowser: true,
            requirePage: true,
        });
        const page = getCurrentPage();
        const options = {
            selector: args.selector || 'img',
            includeDataUrls: args.includeDataUrls || false,
            includeDimensions: args.includeDimensions !== false,
        };
        const imageData = await page.evaluate((opts) => {
            const collected = [];
            const nodes = Array.from(document.querySelectorAll(opts.selector));
            for (const [index, node] of nodes.entries()) {
                const src = node.src || node.getAttribute('data-src') || '';
                // Inline data: URLs are dropped unless explicitly requested.
                if (src.startsWith('data:') && !opts.includeDataUrls) {
                    continue;
                }
                const entry = { index, src, alt: node.alt || '', title: node.title || '' };
                if (opts.includeDimensions) {
                    entry.width = node.naturalWidth || node.width || 0;
                    entry.height = node.naturalHeight || node.height || 0;
                }
                const srcset = node.srcset || node.getAttribute('data-srcset');
                if (srcset) {
                    entry.srcset = srcset;
                }
                if (node.loading) {
                    entry.loading = node.loading;
                }
                collected.push(entry);
            }
            return collected;
        }, options);
        const text = `✅ Scraped ${imageData.length} images\n\n${JSON.stringify(imageData, null, 2)}`;
        return { content: [{ type: 'text', text }] };
    }, 'Failed to scrape images');
}
|
|
252
197
|
/**
|
|
253
198
|
* Internal/external links classification के साथ collect करता है
|
|
254
199
|
*/
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import { getPageInstance } from '../browser-manager.js';
|
|
2
|
+
/**
 * Records responses observed on `page` during a fixed time window.
 *
 * A 'response' listener is attached for `duration` milliseconds. Every
 * response for which `filter(request, response)` is truthy is recorded with
 * its URL, method, resource type, status, headers and (for text/JSON/XML
 * content types) the first 5000 characters of the body; other bodies are
 * replaced by the placeholder '[Binary or Too Large]'.
 *
 * Reading a body is asynchronous, so handlers may still be running when the
 * window closes. They are given a short grace period to finish so responses
 * seen inside the window are not dropped; the grace period is bounded because
 * a streaming body may never complete. Entries that finish after the function
 * has returned are discarded, so the returned array is never mutated later.
 *
 * Failures while inspecting a single response are ignored (best effort).
 *
 * @param {object} page - Object exposing `on(event, fn)` / `off(event, fn)`.
 * @param {number} duration - Capture window in milliseconds.
 * @param {(request: object, response: object) => boolean} filter - Keep predicate.
 * @returns {Promise<object[]>} Recorded entries in completion order.
 */
async function captureNetwork(page, duration, filter) {
    const DRAIN_GRACE_MS = 2000;
    const captured = [];
    const pending = new Set();
    let accepting = true;
    const record = async (response) => {
        try {
            const request = response.request();
            if (!filter(request, response)) {
                return;
            }
            let body = '[Binary or Too Large]';
            try {
                const type = response.headers()['content-type'] || '';
                if (type.includes('text') || type.includes('json') || type.includes('xml')) {
                    body = await response.text();
                }
            }
            catch (e) {
                // Body unavailable (e.g. redirect or evicted buffer): keep the placeholder.
            }
            if (!accepting) {
                return;
            }
            captured.push({
                url: response.url(),
                method: request.method(),
                type: request.resourceType(),
                status: response.status(),
                headers: response.headers(),
                body: body.slice(0, 5000)
            });
        }
        catch (e) {
            // Best effort: one unreadable response must not abort the capture.
        }
    };
    const responseHandler = (response) => {
        const task = record(response).finally(() => pending.delete(task));
        pending.add(task);
    };
    page.on('response', responseHandler);
    try {
        await new Promise((resolve) => setTimeout(resolve, duration));
    }
    finally {
        page.off('response', responseHandler);
    }
    // Let in-flight handlers finish, but never wait longer than the grace period.
    let graceTimer;
    await Promise.race([
        Promise.allSettled([...pending]),
        new Promise((resolve) => {
            graceTimer = setTimeout(resolve, DRAIN_GRACE_MS);
        }),
    ]);
    clearTimeout(graceTimer);
    accepting = false;
    return captured;
}
|
|
37
|
+
/**
 * Records network responses on the current page for a fixed time window.
 *
 * @param {object} args
 * @param {number} [args.duration] - Capture window in milliseconds; a missing
 *   or falsy value means 10000.
 * @param {string[]} [args.filterTypes] - Resource types to keep. Entries are
 *   compared case-insensitively (the observed type is lower-cased too). A
 *   missing or empty list keeps every response.
 * @returns {Promise<object>} Object with a `content` array holding one text
 *   item: the captured entries as pretty-printed JSON.
 * @throws {Error} 'Browser not initialized' when no page instance is available.
 */
export async function handleNetworkRecorder(args) {
    const page = getPageInstance();
    if (!page)
        throw new Error('Browser not initialized');
    const duration = args.duration || 10000;
    // Normalise once: the observed resource type is lower-cased below, so the
    // requested types must be lower-cased as well or e.g. 'XHR' never matches.
    const requestedTypes = Array.isArray(args.filterTypes)
        ? args.filterTypes.map((entry) => String(entry).toLowerCase())
        : args.filterTypes;
    const results = await captureNetwork(page, duration, (req, res) => {
        if (!requestedTypes || requestedTypes.length === 0)
            return true;
        const type = req.resourceType().toLowerCase();
        return requestedTypes.includes(type);
    });
    return {
        content: [{
                type: 'text',
                text: JSON.stringify(results, null, 2)
            }]
    };
}
|
|
55
|
+
/**
 * Captures XHR / fetch responses on the current page for a fixed time window.
 *
 * @param {object} args
 * @param {number} [args.duration] - Capture window in milliseconds; a missing
 *   or falsy value means 15000.
 * @param {string} [args.url] - When set, only requests whose URL contains this
 *   substring are kept.
 * @returns {Promise<object>} Object with a `content` array holding one text
 *   item: the captured entries as pretty-printed JSON.
 * @throws {Error} 'Browser not initialized' when no page instance is available.
 */
export async function handleAjaxExtractor(args) {
    const page = getPageInstance();
    if (!page) {
        throw new Error('Browser not initialized');
    }
    const windowMs = args.duration || 15000;
    // Keep only xhr/fetch traffic, optionally narrowed by URL substring.
    const keepAjax = (request) => {
        const kind = request.resourceType();
        if (kind !== 'xhr' && kind !== 'fetch') {
            return false;
        }
        return !(args.url && !request.url().includes(args.url));
    };
    const entries = await captureNetwork(page, windowMs, keepAjax);
    const text = JSON.stringify(entries, null, 2);
    return { content: [{ type: 'text', text }] };
}
|
|
76
|
+
/**
 * Captures XHR / fetch traffic by delegating to `handleAjaxExtractor`.
 * Only `args.duration` is forwarded; no URL filter is applied.
 *
 * @param {object} args - Tool arguments (`duration`).
 * @returns {Promise<object>} Whatever `handleAjaxExtractor` returns.
 */
export async function handleFetchXhr(args) {
    const { duration } = args;
    return handleAjaxExtractor({ duration });
}
|
|
79
|
+
/**
 * Looks for API endpoints used by the current page.
 *
 * Two passes are combined:
 *  1. Static: the `src` of every <script> and the `href` of every <a> are
 *     collected (de-duplicated) and kept when they contain "/api/", a
 *     "/v<digits>/" segment or "graphql", or end in ".json".
 *  2. Dynamic: responses are captured for 5000 ms and kept when the request is
 *     xhr/fetch and the response content type contains "json".
 *
 * @param {object} args - Tool arguments (not read by this handler).
 * @returns {Promise<object>} Object with a `content` array holding one text
 *   item: JSON with `staticAnalysis` (URLs) and `dynamicCapture` (URLs).
 * @throws {Error} 'Browser not initialized' when no page instance is available.
 */
export async function handleApiFinder(args) {
    const page = getPageInstance();
    if (!page) {
        throw new Error('Browser not initialized');
    }
    // 1. Static Analysis
    const staticApis = await page.evaluate(() => {
        const apiHints = [/\/api\//, /\/v\d+\//, /graphql/, /\.json$/];
        const scriptSources = Array.from(document.querySelectorAll('script'))
            .map((script) => script.src)
            .filter((src) => src);
        const linkTargets = Array.from(document.querySelectorAll('a')).map((anchor) => anchor.href);
        // Set keeps first-seen order while removing duplicates.
        const unique = [...new Set([...scriptSources, ...linkTargets])];
        return unique.filter((candidate) => apiHints.some((hint) => hint.test(candidate)));
    });
    // 2. Dynamic Analysis
    const isJsonAjax = (request, response) => {
        const kind = request.resourceType();
        const contentType = response.headers()['content-type'] || '';
        return (kind === 'xhr' || kind === 'fetch') && contentType.includes('json');
    };
    const dynamicApis = await captureNetwork(page, 5000, isJsonAjax);
    const report = {
        staticAnalysis: staticApis,
        dynamicCapture: dynamicApis.map((entry) => entry.url),
    };
    return { content: [{ type: 'text', text: JSON.stringify(report, null, 2) }] };
}
|
|
@@ -4,129 +4,6 @@
|
|
|
4
4
|
import { getCurrentPage } from '../browser-manager.js';
|
|
5
5
|
import { validateWorkflow } from '../workflow-validation.js';
|
|
6
6
|
import { withErrorHandling, sleep } from '../system-utils.js';
|
|
7
|
-
import * as xml2js from 'xml2js';
|
|
8
|
-
/**
 * Automatically detects and clicks the "Next" button to collect data from all pages.
 *
 * On each page, the text and inner HTML of every element matching
 * `args.dataSelector` (if given) are collected. The handler then clicks the
 * first element matching `args.nextButtonSelector` and repeats, stopping when
 * `args.maxPages` pages have been processed, no next button is found, or the
 * button is disabled (a `disabled` attribute or a `disabled` class).
 *
 * Fixes over the previous version:
 *  - The default selector no longer contains `button:contains("Next")`, which
 *    is jQuery syntax, not CSS, and makes `querySelector` throw.
 *  - The navigation wait is armed before the click instead of after a sleep,
 *    so a fast navigation is not missed.
 *  - The reported page count is the number of pages actually processed, and
 *    no extra click is made after the last page allowed by `maxPages`.
 *
 * @param {object} args
 * @param {string} [args.nextButtonSelector] - CSS selector of the next control.
 * @param {number} [args.maxPages] - Page limit; a missing or falsy value means 10.
 * @param {string} [args.dataSelector] - CSS selector of the items to collect.
 * @param {number} [args.waitBetweenPages] - Pause after each click in
 *   milliseconds; a missing or falsy value means 1000.
 * @returns {Promise<object>} Object with a `content` array holding one text
 *   item (summary plus the first 10 collected items).
 */
export async function handleAutoPagination(args) {
    return await withErrorHandling(async () => {
        validateWorkflow('auto_pagination', {
            requireBrowser: true,
            requirePage: true,
        });
        const page = getCurrentPage();
        const nextButtonSelector = args.nextButtonSelector || 'a[rel="next"], .next, .pagination-next';
        const maxPages = args.maxPages || 10;
        const dataSelector = args.dataSelector;
        const waitBetweenPages = args.waitBetweenPages || 1000;
        const allData = [];
        let pagesVisited = 0;
        while (pagesVisited < maxPages) {
            // Extract data from current page
            if (dataSelector) {
                const pageData = await page.evaluate((selector) => {
                    const elements = document.querySelectorAll(selector);
                    return Array.from(elements).map((el) => ({
                        text: el.textContent?.trim() || '',
                        html: el.innerHTML,
                    }));
                }, dataSelector);
                allData.push(...pageData);
            }
            pagesVisited++;
            // Limit reached: do not navigate to a page that will not be collected.
            if (pagesVisited >= maxPages) {
                break;
            }
            // Check for next button
            const nextButton = await page.$(nextButtonSelector);
            if (!nextButton) {
                break;
            }
            // Inspect the exact element that is about to be clicked.
            const isDisabled = await page.evaluate((btn) => btn.hasAttribute('disabled') || btn.classList.contains('disabled'), nextButton);
            if (isDisabled) {
                break;
            }
            // Arm the navigation wait before clicking. A timeout here means the
            // content was loaded dynamically without a navigation.
            await Promise.all([
                page.waitForNavigation({ timeout: 5000, waitUntil: 'domcontentloaded' }).catch(() => { }),
                nextButton.click(),
            ]);
            await sleep(waitBetweenPages);
        }
        return {
            content: [
                {
                    type: 'text',
                    text: `✅ Auto-paginated through ${pagesVisited} pages\n\nCollected ${allData.length} items\n\n${JSON.stringify(allData.slice(0, 10), null, 2)}${allData.length > 10 ? '\n\n... (showing first 10 items)' : ''}`,
                },
            ],
        };
    }, 'Failed to auto-paginate');
}
|
|
75
|
-
/**
 * Auto-scrolls lazy-loading pages.
 *
 * Repeatedly scrolls the window to the bottom of `document.body`, waiting
 * `args.scrollDelay` ms after each scroll, until `args.maxScrolls` scrolls
 * have been made or the body height stops growing. Before each scroll, the
 * trimmed text of every element matching `args.dataSelector` (if given) is
 * collected; an item is kept only the first time its text is seen.
 *
 * Seen texts are tracked in a Set, which replaces the former quadratic scan
 * and also drops duplicates occurring within a single batch (previously only
 * duplicates of earlier batches were filtered).
 *
 * @param {object} args
 * @param {number} [args.maxScrolls] - Scroll limit; missing or falsy means 10.
 * @param {number} [args.scrollDelay] - Wait after each scroll in milliseconds;
 *   missing or falsy means 1000.
 * @param {string} [args.dataSelector] - CSS selector of the items to collect.
 * @returns {Promise<object>} Object with a `content` array holding one text
 *   item (summary plus the first 10 collected items).
 */
export async function handleInfiniteScroll(args) {
    return await withErrorHandling(async () => {
        validateWorkflow('infinite_scroll', {
            requireBrowser: true,
            requirePage: true,
        });
        const page = getCurrentPage();
        const maxScrolls = args.maxScrolls || 10;
        const scrollDelay = args.scrollDelay || 1000;
        const dataSelector = args.dataSelector;
        const allData = [];
        const seenTexts = new Set();
        let scrollCount = 0;
        let previousHeight = 0;
        while (scrollCount < maxScrolls) {
            // Get current scroll height
            const currentHeight = await page.evaluate(() => document.body.scrollHeight);
            // If height hasn't changed, we've reached the end
            if (currentHeight === previousHeight && scrollCount > 0) {
                break;
            }
            previousHeight = currentHeight;
            // Collect data if selector provided
            if (dataSelector) {
                const pageData = await page.evaluate((selector) => {
                    const elements = document.querySelectorAll(selector);
                    return Array.from(elements).map((el, idx) => ({
                        index: idx,
                        text: el.textContent?.trim() || '',
                    }));
                }, dataSelector);
                // Add only items whose text has not been seen before.
                for (const item of pageData) {
                    if (!seenTexts.has(item.text)) {
                        seenTexts.add(item.text);
                        allData.push(item);
                    }
                }
            }
            // Scroll to bottom
            await page.evaluate(() => {
                window.scrollTo(0, document.body.scrollHeight);
            });
            // Wait for new content to load
            await sleep(scrollDelay);
            scrollCount++;
        }
        return {
            content: [
                {
                    type: 'text',
                    text: `✅ Infinite scroll completed (${scrollCount} scrolls)\n\nCollected ${allData.length} items\n\n${JSON.stringify(allData.slice(0, 10), null, 2)}${allData.length > 10 ? '\n\n... (showing first 10 items)' : ''}`,
                },
            ],
        };
    }, 'Failed to handle infinite scroll');
}
|
|
130
7
|
/**
|
|
131
8
|
* Multiple pages से data collect और merge करता है
|
|
132
9
|
*/
|
|
@@ -178,74 +55,6 @@ export async function handleMultiPageScraper(args) {
|
|
|
178
55
|
};
|
|
179
56
|
}, 'Failed to scrape multiple pages');
|
|
180
57
|
}
|
|
181
|
-
/**
 * Automatically extracts URLs from sitemap.xml.
 *
 * Navigates the current page to `args.sitemapUrl` (default: `/sitemap.xml` on
 * the current page's origin), reads the document text and parses it with
 * xml2js. NOTE: the page is left on the sitemap URL afterwards.
 *
 *  - For a `<urlset>`, each `<url>` yields `{ url, lastmod, changefreq,
 *    priority }`; entries whose location does not contain
 *    `args.filterPattern` (when given) are skipped.
 *  - For a `<sitemapindex>`, each `<sitemap>` yields
 *    `{ url, type: 'sitemap', lastmod }`; `filterPattern` is not applied.
 *
 * `args.maxUrls` (a missing or falsy value means 100) caps the result in both
 * branches; previously the sitemap-index branch ignored it.
 *
 * @param {object} args - Tool arguments (`sitemapUrl`, `maxUrls`, `filterPattern`).
 * @returns {Promise<object>} Object with a `content` array holding one text
 *   item: a summary plus the entries as pretty-printed JSON.
 */
export async function handleSitemapParser(args) {
    return await withErrorHandling(async () => {
        validateWorkflow('sitemap_parser', {
            requireBrowser: true,
            requirePage: true,
        });
        const page = getCurrentPage();
        const currentUrl = page.url();
        const baseUrl = new URL(currentUrl).origin;
        const sitemapUrl = args.sitemapUrl || `${baseUrl}/sitemap.xml`;
        const maxUrls = args.maxUrls || 100;
        const filterPattern = args.filterPattern;
        // Fetch sitemap
        await page.goto(sitemapUrl, { waitUntil: 'domcontentloaded' });
        // Get sitemap XML content
        const sitemapContent = await page.evaluate(() => {
            return document.body.textContent || document.documentElement.innerHTML;
        });
        // Parse XML
        const parser = new xml2js.Parser();
        const result = await parser.parseStringPromise(sitemapContent);
        const urls = [];
        // Extract URLs from sitemap
        if (result.urlset && result.urlset.url) {
            for (const urlEntry of result.urlset.url) {
                if (urls.length >= maxUrls)
                    break;
                const loc = urlEntry.loc?.[0];
                if (!loc)
                    continue;
                // Apply filter if specified
                if (filterPattern && !loc.includes(filterPattern)) {
                    continue;
                }
                urls.push({
                    url: loc,
                    lastmod: urlEntry.lastmod?.[0],
                    changefreq: urlEntry.changefreq?.[0],
                    priority: urlEntry.priority?.[0],
                });
            }
        }
        // Handle sitemap index
        else if (result.sitemapindex && result.sitemapindex.sitemap) {
            for (const sitemapEntry of result.sitemapindex.sitemap) {
                // Same cap as the urlset branch.
                if (urls.length >= maxUrls)
                    break;
                const loc = sitemapEntry.loc?.[0];
                if (loc) {
                    urls.push({
                        url: loc,
                        type: 'sitemap',
                        lastmod: sitemapEntry.lastmod?.[0],
                    });
                }
            }
        }
        return {
            content: [
                {
                    type: 'text',
                    text: `✅ Parsed sitemap: ${urls.length} URLs found\n\n${JSON.stringify(urls, null, 2)}`,
                },
            ],
        };
    }, 'Failed to parse sitemap');
}
|
|
249
58
|
/**
|
|
250
59
|
* Site structure follow करके pages scrape करता है
|
|
251
60
|
*/
|