brave-real-browser-mcp-server 2.28.1 → 2.29.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +127 -20
- package/dist/browser-manager.js +344 -0
- package/dist/handlers/advanced-tools.js +863 -170
- package/dist/handlers/navigation-handlers.js +185 -16
- package/dist/handlers/tool-executor.js +201 -0
- package/dist/tool-definitions.js +104 -6
- package/package.json +2 -2
|
@@ -2326,8 +2326,9 @@ export async function handleDeepAnalysis(page, args) {
|
|
|
2326
2326
|
}
|
|
2327
2327
|
}
|
|
2328
2328
|
/**
|
|
2329
|
-
*
|
|
2329
|
+
* Network recorder with API interception capabilities
|
|
2330
2330
|
* ULTRA POWERFUL: API detection, media URLs, smart categorization
|
|
2331
|
+
* NEW: Request interception, mocking, blocking, and header modification
|
|
2331
2332
|
*/
|
|
2332
2333
|
export async function handleNetworkRecorder(page, args) {
|
|
2333
2334
|
// Progress tracking
|
|
@@ -2341,7 +2342,15 @@ export async function handleNetworkRecorder(page, args) {
|
|
|
2341
2342
|
const apis = [];
|
|
2342
2343
|
const mediaUrls = [];
|
|
2343
2344
|
const seen = new Set();
|
|
2344
|
-
|
|
2345
|
+
const interceptedRequests = [];
|
|
2346
|
+
let blockedCount = 0;
|
|
2347
|
+
let mockedCount = 0;
|
|
2348
|
+
const interceptMode = args.interceptMode || 'record';
|
|
2349
|
+
const blockPatterns = args.blockPatterns || [];
|
|
2350
|
+
const mockResponses = args.mockResponses || [];
|
|
2351
|
+
const modifyHeaders = args.modifyHeaders || [];
|
|
2352
|
+
const capturePayloads = args.capturePayloads === true;
|
|
2353
|
+
tracker.setProgress(10, `⏱️ Recording for ${duration}ms (mode: ${interceptMode})...`);
|
|
2345
2354
|
// ============================================================
|
|
2346
2355
|
// SMART CATEGORIZATION HELPER
|
|
2347
2356
|
// ============================================================
|
|
@@ -2375,75 +2384,216 @@ export async function handleNetworkRecorder(page, args) {
|
|
|
2375
2384
|
return 'document';
|
|
2376
2385
|
return 'other';
|
|
2377
2386
|
};
|
|
2378
|
-
//
|
|
2379
|
-
const
|
|
2380
|
-
|
|
2381
|
-
|
|
2382
|
-
|
|
2383
|
-
|
|
2384
|
-
|
|
2385
|
-
|
|
2386
|
-
|
|
2387
|
-
|
|
2388
|
-
|
|
2389
|
-
const resourceType = response.request()?.resourceType?.() || 'unknown';
|
|
2390
|
-
const method = response.request()?.method?.() || 'GET';
|
|
2391
|
-
const category = categorizeUrl(url, resourceType);
|
|
2392
|
-
categories[category] = (categories[category] || 0) + 1;
|
|
2393
|
-
// Collect API endpoints
|
|
2394
|
-
if (category === 'api' || resourceType === 'xhr' || resourceType === 'fetch') {
|
|
2395
|
-
apis.push({ url, method, type: resourceType });
|
|
2387
|
+
// Helper to check URL against patterns
|
|
2388
|
+
const matchesPattern = (url, patterns) => {
|
|
2389
|
+
return patterns.some(pattern => {
|
|
2390
|
+
try {
|
|
2391
|
+
if (pattern.startsWith('/') && pattern.endsWith('/')) {
|
|
2392
|
+
// Regex pattern
|
|
2393
|
+
const regex = new RegExp(pattern.slice(1, -1));
|
|
2394
|
+
return regex.test(url);
|
|
2395
|
+
}
|
|
2396
|
+
// Simple includes check
|
|
2397
|
+
return url.includes(pattern);
|
|
2396
2398
|
}
|
|
2397
|
-
|
|
2398
|
-
|
|
2399
|
-
mediaUrls.push(url);
|
|
2399
|
+
catch {
|
|
2400
|
+
return url.includes(pattern);
|
|
2400
2401
|
}
|
|
2401
|
-
|
|
2402
|
-
|
|
2403
|
-
|
|
2404
|
-
|
|
2405
|
-
|
|
2406
|
-
|
|
2407
|
-
|
|
2408
|
-
|
|
2409
|
-
|
|
2410
|
-
|
|
2411
|
-
|
|
2402
|
+
});
|
|
2403
|
+
};
|
|
2404
|
+
// ============================================================
|
|
2405
|
+
// INTERCEPTION MODE - Uses request interception
|
|
2406
|
+
// ============================================================
|
|
2407
|
+
if (interceptMode === 'intercept' || interceptMode === 'mock') {
|
|
2408
|
+
try {
|
|
2409
|
+
await page.setRequestInterception(true);
|
|
2410
|
+
const requestHandler = async (request) => {
|
|
2411
|
+
const url = request.url();
|
|
2412
|
+
const method = request.method();
|
|
2413
|
+
const resourceType = request.resourceType();
|
|
2414
|
+
const category = categorizeUrl(url, resourceType);
|
|
2415
|
+
// Check if should block
|
|
2416
|
+
if (blockPatterns.length > 0 && matchesPattern(url, blockPatterns)) {
|
|
2417
|
+
blockedCount++;
|
|
2418
|
+
interceptedRequests.push({
|
|
2419
|
+
url,
|
|
2420
|
+
method,
|
|
2421
|
+
action: 'blocked',
|
|
2422
|
+
timestamp: Date.now()
|
|
2423
|
+
});
|
|
2424
|
+
await request.abort();
|
|
2425
|
+
return;
|
|
2412
2426
|
}
|
|
2413
|
-
|
|
2414
|
-
|
|
2427
|
+
// Check if should mock
|
|
2428
|
+
const mockConfig = mockResponses.find(m => matchesPattern(url, [m.urlPattern]));
|
|
2429
|
+
if (mockConfig) {
|
|
2430
|
+
mockedCount++;
|
|
2431
|
+
interceptedRequests.push({
|
|
2432
|
+
url,
|
|
2433
|
+
method,
|
|
2434
|
+
action: 'mocked',
|
|
2435
|
+
mockResponse: mockConfig.response,
|
|
2436
|
+
timestamp: Date.now()
|
|
2437
|
+
});
|
|
2438
|
+
await request.respond({
|
|
2439
|
+
status: mockConfig.statusCode || 200,
|
|
2440
|
+
contentType: 'application/json',
|
|
2441
|
+
body: typeof mockConfig.response === 'string'
|
|
2442
|
+
? mockConfig.response
|
|
2443
|
+
: JSON.stringify(mockConfig.response)
|
|
2444
|
+
});
|
|
2445
|
+
return;
|
|
2415
2446
|
}
|
|
2447
|
+
// Check if should modify headers
|
|
2448
|
+
const headerConfig = modifyHeaders.find(h => matchesPattern(url, [h.urlPattern]));
|
|
2449
|
+
if (headerConfig) {
|
|
2450
|
+
const headers = {
|
|
2451
|
+
...request.headers(),
|
|
2452
|
+
...headerConfig.headers
|
|
2453
|
+
};
|
|
2454
|
+
interceptedRequests.push({
|
|
2455
|
+
url,
|
|
2456
|
+
method,
|
|
2457
|
+
action: 'headers_modified',
|
|
2458
|
+
modifiedHeaders: headerConfig.headers,
|
|
2459
|
+
timestamp: Date.now()
|
|
2460
|
+
});
|
|
2461
|
+
await request.continue({ headers });
|
|
2462
|
+
return;
|
|
2463
|
+
}
|
|
2464
|
+
// Continue normally but record
|
|
2465
|
+
if (!seen.has(url)) {
|
|
2466
|
+
seen.add(url);
|
|
2467
|
+
categories[category] = (categories[category] || 0) + 1;
|
|
2468
|
+
const entry = {
|
|
2469
|
+
url,
|
|
2470
|
+
method,
|
|
2471
|
+
resourceType,
|
|
2472
|
+
category,
|
|
2473
|
+
timestamp: Date.now()
|
|
2474
|
+
};
|
|
2475
|
+
// Capture POST/PUT payloads
|
|
2476
|
+
if (capturePayloads && (method === 'POST' || method === 'PUT' || method === 'PATCH')) {
|
|
2477
|
+
try {
|
|
2478
|
+
entry.payload = request.postData();
|
|
2479
|
+
}
|
|
2480
|
+
catch {
|
|
2481
|
+
// Ignore
|
|
2482
|
+
}
|
|
2483
|
+
}
|
|
2484
|
+
requests.push(entry);
|
|
2485
|
+
// Collect API endpoints
|
|
2486
|
+
if (category === 'api' || resourceType === 'xhr' || resourceType === 'fetch') {
|
|
2487
|
+
apis.push({
|
|
2488
|
+
url,
|
|
2489
|
+
method,
|
|
2490
|
+
type: resourceType,
|
|
2491
|
+
payload: entry.payload
|
|
2492
|
+
});
|
|
2493
|
+
}
|
|
2494
|
+
// Collect media URLs
|
|
2495
|
+
if (category === 'media' || /\.(mp4|webm|m3u8|ts|mp3)/i.test(url)) {
|
|
2496
|
+
mediaUrls.push(url);
|
|
2497
|
+
}
|
|
2498
|
+
}
|
|
2499
|
+
await request.continue();
|
|
2500
|
+
};
|
|
2501
|
+
page.on('request', requestHandler);
|
|
2502
|
+
await new Promise(r => setTimeout(r, duration));
|
|
2503
|
+
page.off('request', requestHandler);
|
|
2504
|
+
await page.setRequestInterception(false);
|
|
2505
|
+
}
|
|
2506
|
+
catch (e) {
|
|
2507
|
+
// Cleanup on error
|
|
2508
|
+
try {
|
|
2509
|
+
await page.setRequestInterception(false);
|
|
2416
2510
|
}
|
|
2417
|
-
|
|
2418
|
-
|
|
2511
|
+
catch { }
|
|
2512
|
+
}
|
|
2513
|
+
}
|
|
2514
|
+
else {
|
|
2515
|
+
// ============================================================
|
|
2516
|
+
// RECORD MODE - Uses response events (safer)
|
|
2517
|
+
// ============================================================
|
|
2518
|
+
const responseHandler = (response) => {
|
|
2419
2519
|
try {
|
|
2420
|
-
const
|
|
2421
|
-
|
|
2422
|
-
|
|
2520
|
+
const url = response.url();
|
|
2521
|
+
// Dedup
|
|
2522
|
+
if (seen.has(url))
|
|
2523
|
+
return;
|
|
2524
|
+
seen.add(url);
|
|
2525
|
+
if (args.filterUrl && !url.includes(args.filterUrl)) {
|
|
2526
|
+
return;
|
|
2527
|
+
}
|
|
2528
|
+
const resourceType = response.request()?.resourceType?.() || 'unknown';
|
|
2529
|
+
const method = response.request()?.method?.() || 'GET';
|
|
2530
|
+
const category = categorizeUrl(url, resourceType);
|
|
2531
|
+
categories[category] = (categories[category] || 0) + 1;
|
|
2532
|
+
// Collect API endpoints
|
|
2533
|
+
if (category === 'api' || resourceType === 'xhr' || resourceType === 'fetch') {
|
|
2534
|
+
const apiEntry = { url, method, type: resourceType };
|
|
2535
|
+
// Capture POST data if enabled
|
|
2536
|
+
if (capturePayloads && (method === 'POST' || method === 'PUT' || method === 'PATCH')) {
|
|
2537
|
+
try {
|
|
2538
|
+
apiEntry.payload = response.request()?.postData?.();
|
|
2539
|
+
}
|
|
2540
|
+
catch { }
|
|
2541
|
+
}
|
|
2542
|
+
apis.push(apiEntry);
|
|
2543
|
+
}
|
|
2544
|
+
// Collect media URLs
|
|
2545
|
+
if (category === 'media' || /\.(mp4|webm|m3u8|ts|mp3)/i.test(url)) {
|
|
2546
|
+
mediaUrls.push(url);
|
|
2547
|
+
}
|
|
2548
|
+
const entry = {
|
|
2549
|
+
url,
|
|
2550
|
+
status: response.status(),
|
|
2551
|
+
resourceType,
|
|
2552
|
+
category,
|
|
2553
|
+
method,
|
|
2554
|
+
timestamp: Date.now(),
|
|
2555
|
+
};
|
|
2556
|
+
if (args.includeHeaders) {
|
|
2557
|
+
try {
|
|
2558
|
+
entry.headers = response.headers();
|
|
2559
|
+
}
|
|
2560
|
+
catch (e) {
|
|
2561
|
+
entry.headers = {};
|
|
2562
|
+
}
|
|
2563
|
+
}
|
|
2564
|
+
requests.push(entry);
|
|
2565
|
+
// Track size from headers
|
|
2566
|
+
try {
|
|
2567
|
+
const headers = response.headers();
|
|
2568
|
+
const size = parseInt(headers['content-length'] || '0', 10);
|
|
2569
|
+
totalSize += size;
|
|
2570
|
+
}
|
|
2571
|
+
catch {
|
|
2572
|
+
// Ignore
|
|
2573
|
+
}
|
|
2423
2574
|
}
|
|
2424
2575
|
catch {
|
|
2425
|
-
// Ignore
|
|
2576
|
+
// Ignore all errors in handler to prevent crash
|
|
2426
2577
|
}
|
|
2427
|
-
}
|
|
2428
|
-
catch {
|
|
2429
|
-
// Ignore all errors in handler to prevent crash
|
|
2430
|
-
}
|
|
2431
|
-
};
|
|
2432
|
-
try {
|
|
2433
|
-
page.on('response', responseHandler);
|
|
2434
|
-
await new Promise((r) => setTimeout(r, duration));
|
|
2435
|
-
}
|
|
2436
|
-
catch (e) {
|
|
2437
|
-
// Capture setup errors
|
|
2438
|
-
}
|
|
2439
|
-
finally {
|
|
2578
|
+
};
|
|
2440
2579
|
try {
|
|
2441
|
-
page.
|
|
2580
|
+
page.on('response', responseHandler);
|
|
2581
|
+
await new Promise((r) => setTimeout(r, duration));
|
|
2442
2582
|
}
|
|
2443
2583
|
catch (e) {
|
|
2444
|
-
//
|
|
2584
|
+
// Capture setup errors
|
|
2585
|
+
}
|
|
2586
|
+
finally {
|
|
2587
|
+
try {
|
|
2588
|
+
page.off('response', responseHandler);
|
|
2589
|
+
}
|
|
2590
|
+
catch (e) {
|
|
2591
|
+
// Ignore cleanup errors
|
|
2592
|
+
}
|
|
2445
2593
|
}
|
|
2446
2594
|
}
|
|
2595
|
+
tracker.setProgress(90, `✅ Recorded ${requests.length} requests`);
|
|
2596
|
+
tracker.complete(`🎉 Network recording complete`);
|
|
2447
2597
|
return {
|
|
2448
2598
|
requests: requests.slice(0, 500),
|
|
2449
2599
|
count: requests.length,
|
|
@@ -2451,7 +2601,12 @@ export async function handleNetworkRecorder(page, args) {
|
|
|
2451
2601
|
categories,
|
|
2452
2602
|
apis: apis.length > 0 ? apis : undefined,
|
|
2453
2603
|
mediaUrls: mediaUrls.length > 0 ? mediaUrls : undefined,
|
|
2454
|
-
|
|
2604
|
+
blockedCount: blockedCount > 0 ? blockedCount : undefined,
|
|
2605
|
+
mockedCount: mockedCount > 0 ? mockedCount : undefined,
|
|
2606
|
+
interceptedRequests: interceptedRequests.length > 0 ? interceptedRequests : undefined,
|
|
2607
|
+
message: `📡 Recorded ${requests.length} requests (${Math.round(totalSize / 1024)}KB) | APIs: ${apis.length} | Media: ${mediaUrls.length}` +
|
|
2608
|
+
(blockedCount > 0 ? ` | Blocked: ${blockedCount}` : '') +
|
|
2609
|
+
(mockedCount > 0 ? ` | Mocked: ${mockedCount}` : '')
|
|
2455
2610
|
};
|
|
2456
2611
|
}
|
|
2457
2612
|
/**
|
|
@@ -2925,6 +3080,7 @@ export async function handleVideoRecording(page, args, recorderState) {
|
|
|
2925
3080
|
/**
|
|
2926
3081
|
* Harvest all links from page
|
|
2927
3082
|
* ULTRA POWERFUL: Pagination detection, smart categorization, file types
|
|
3083
|
+
* NEW: Auto-follow pagination to scrape multiple pages
|
|
2928
3084
|
*/
|
|
2929
3085
|
export async function handleLinkHarvester(page, args) {
|
|
2930
3086
|
// Progress tracking for real-time updates
|
|
@@ -2933,76 +3089,110 @@ export async function handleLinkHarvester(page, args) {
|
|
|
2933
3089
|
tracker.start(100, '🔗 Starting link harvesting...');
|
|
2934
3090
|
const currentUrl = new URL(page.url());
|
|
2935
3091
|
tracker.setProgress(10, `📍 Analyzing page: ${currentUrl.hostname}`);
|
|
2936
|
-
//
|
|
2937
|
-
|
|
2938
|
-
//
|
|
2939
|
-
|
|
2940
|
-
const
|
|
2941
|
-
|
|
2942
|
-
|
|
2943
|
-
|
|
2944
|
-
links
|
|
2945
|
-
|
|
2946
|
-
|
|
2947
|
-
|
|
2948
|
-
|
|
2949
|
-
|
|
2950
|
-
|
|
2951
|
-
|
|
2952
|
-
|
|
2953
|
-
|
|
3092
|
+
// Pagination settings
|
|
3093
|
+
const followPagination = args.followPagination === true;
|
|
3094
|
+
const maxPages = Math.min(args.maxPages || 5, 20); // Max 20 pages
|
|
3095
|
+
const delayBetweenPages = args.delayBetweenPages || 1000;
|
|
3096
|
+
const paginationSelector = args.paginationSelector;
|
|
3097
|
+
// Helper function to extract links from current page
|
|
3098
|
+
const extractLinksFromPage = async () => {
|
|
3099
|
+
const allLinks = await page.evaluate(() => {
|
|
3100
|
+
const links = [];
|
|
3101
|
+
document.querySelectorAll('a[href]').forEach((a) => {
|
|
3102
|
+
const anchor = a;
|
|
3103
|
+
links.push({
|
|
3104
|
+
url: anchor.href,
|
|
3105
|
+
text: a.textContent?.trim()?.substring(0, 100) || '',
|
|
3106
|
+
attrs: {
|
|
3107
|
+
rel: anchor.rel || '',
|
|
3108
|
+
target: anchor.target || '',
|
|
3109
|
+
class: anchor.className || '',
|
|
3110
|
+
id: anchor.id || '',
|
|
3111
|
+
download: anchor.download || '',
|
|
3112
|
+
}
|
|
3113
|
+
});
|
|
2954
3114
|
});
|
|
3115
|
+
return links;
|
|
2955
3116
|
});
|
|
2956
|
-
|
|
2957
|
-
|
|
2958
|
-
|
|
2959
|
-
|
|
2960
|
-
|
|
2961
|
-
|
|
2962
|
-
|
|
2963
|
-
|
|
2964
|
-
|
|
2965
|
-
|
|
2966
|
-
|
|
2967
|
-
|
|
2968
|
-
|
|
2969
|
-
|
|
2970
|
-
|
|
2971
|
-
|
|
2972
|
-
|
|
2973
|
-
|
|
2974
|
-
|
|
2975
|
-
|
|
2976
|
-
|
|
2977
|
-
|
|
2978
|
-
|
|
2979
|
-
|
|
2980
|
-
|
|
3117
|
+
// Pagination detection
|
|
3118
|
+
const pagination = await page.evaluate((customSelector) => {
|
|
3119
|
+
let nextPage;
|
|
3120
|
+
let prevPage;
|
|
3121
|
+
let totalPages;
|
|
3122
|
+
let currentPage;
|
|
3123
|
+
// Custom selector first
|
|
3124
|
+
if (customSelector) {
|
|
3125
|
+
try {
|
|
3126
|
+
const el = document.querySelector(customSelector);
|
|
3127
|
+
if (el?.href)
|
|
3128
|
+
nextPage = el.href;
|
|
3129
|
+
}
|
|
3130
|
+
catch { /* invalid selector */ }
|
|
3131
|
+
}
|
|
3132
|
+
// Common pagination selectors
|
|
3133
|
+
const nextSelectors = [
|
|
3134
|
+
'a[rel="next"]', 'a.next', 'a.pagination-next',
|
|
3135
|
+
'[aria-label="Next"]', 'a.page-link.next', '.next a',
|
|
3136
|
+
'.pagination a:last-child', 'a[title="Next"]',
|
|
3137
|
+
'a[aria-label*="next" i]', 'button.next', '[data-testid="next"]'
|
|
3138
|
+
];
|
|
3139
|
+
const prevSelectors = [
|
|
3140
|
+
'a[rel="prev"]', 'a.prev', 'a.pagination-prev',
|
|
3141
|
+
'[aria-label="Previous"]', 'a.page-link.prev', '.prev a'
|
|
3142
|
+
];
|
|
3143
|
+
if (!nextPage) {
|
|
3144
|
+
for (const sel of nextSelectors) {
|
|
3145
|
+
try {
|
|
3146
|
+
const el = document.querySelector(sel);
|
|
3147
|
+
if (el?.href) {
|
|
3148
|
+
nextPage = el.href;
|
|
3149
|
+
break;
|
|
3150
|
+
}
|
|
3151
|
+
}
|
|
3152
|
+
catch { /* invalid selector */ }
|
|
2981
3153
|
}
|
|
2982
3154
|
}
|
|
2983
|
-
|
|
2984
|
-
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
|
|
2988
|
-
|
|
2989
|
-
|
|
2990
|
-
|
|
3155
|
+
// Text-based next detection
|
|
3156
|
+
if (!nextPage) {
|
|
3157
|
+
const links = Array.from(document.querySelectorAll('a'));
|
|
3158
|
+
for (const link of links) {
|
|
3159
|
+
const text = link.textContent?.toLowerCase().trim() || '';
|
|
3160
|
+
if (text === 'next' || text === 'next →' || text === '>' || text === '»' || text === 'next page') {
|
|
3161
|
+
nextPage = link.href;
|
|
3162
|
+
break;
|
|
3163
|
+
}
|
|
2991
3164
|
}
|
|
2992
3165
|
}
|
|
2993
|
-
|
|
2994
|
-
|
|
2995
|
-
|
|
2996
|
-
|
|
2997
|
-
|
|
2998
|
-
|
|
2999
|
-
|
|
3000
|
-
|
|
3001
|
-
|
|
3002
|
-
|
|
3003
|
-
|
|
3166
|
+
for (const sel of prevSelectors) {
|
|
3167
|
+
try {
|
|
3168
|
+
const el = document.querySelector(sel);
|
|
3169
|
+
if (el?.href) {
|
|
3170
|
+
prevPage = el.href;
|
|
3171
|
+
break;
|
|
3172
|
+
}
|
|
3173
|
+
}
|
|
3174
|
+
catch { /* invalid selector */ }
|
|
3175
|
+
}
|
|
3176
|
+
// Detect current page and total pages
|
|
3177
|
+
const pageNumbers = Array.from(document.querySelectorAll('.pagination a, .page-numbers a, nav a, .pager a'))
|
|
3178
|
+
.map(a => ({
|
|
3179
|
+
num: parseInt(a.textContent || '0', 10),
|
|
3180
|
+
isActive: a.classList.contains('active') || a.classList.contains('current') ||
|
|
3181
|
+
a.getAttribute('aria-current') === 'page'
|
|
3182
|
+
}))
|
|
3183
|
+
.filter(p => !isNaN(p.num) && p.num > 0);
|
|
3184
|
+
if (pageNumbers.length > 0) {
|
|
3185
|
+
totalPages = Math.max(...pageNumbers.map(p => p.num));
|
|
3186
|
+
const active = pageNumbers.find(p => p.isActive);
|
|
3187
|
+
if (active)
|
|
3188
|
+
currentPage = active.num;
|
|
3189
|
+
}
|
|
3190
|
+
return { nextPage, prevPage, totalPages, currentPage };
|
|
3191
|
+
}, paginationSelector);
|
|
3192
|
+
return { links: allLinks, pagination };
|
|
3193
|
+
};
|
|
3004
3194
|
// ============================================================
|
|
3005
|
-
//
|
|
3195
|
+
// SMART LINK CATEGORIZATION
|
|
3006
3196
|
// ============================================================
|
|
3007
3197
|
const categorizeLink = (url, text, attrs) => {
|
|
3008
3198
|
const urlLower = url.toLowerCase();
|
|
@@ -3037,55 +3227,107 @@ export async function handleLinkHarvester(page, args) {
|
|
|
3037
3227
|
return 'info';
|
|
3038
3228
|
return 'navigation';
|
|
3039
3229
|
};
|
|
3230
|
+
// ============================================================
|
|
3231
|
+
// MAIN SCRAPING LOGIC
|
|
3232
|
+
// ============================================================
|
|
3040
3233
|
const processedLinks = [];
|
|
3041
3234
|
const categories = {};
|
|
3042
3235
|
const seen = new Set();
|
|
3043
3236
|
let internal = 0;
|
|
3044
3237
|
let external = 0;
|
|
3045
|
-
|
|
3046
|
-
|
|
3047
|
-
|
|
3048
|
-
|
|
3049
|
-
|
|
3050
|
-
|
|
3051
|
-
|
|
3052
|
-
|
|
3053
|
-
|
|
3054
|
-
|
|
3055
|
-
|
|
3056
|
-
|
|
3057
|
-
|
|
3058
|
-
|
|
3059
|
-
|
|
3060
|
-
|
|
3061
|
-
|
|
3062
|
-
|
|
3063
|
-
|
|
3064
|
-
|
|
3065
|
-
|
|
3066
|
-
|
|
3067
|
-
|
|
3068
|
-
|
|
3069
|
-
|
|
3070
|
-
|
|
3071
|
-
|
|
3072
|
-
|
|
3073
|
-
|
|
3238
|
+
let pagesScraped = 0;
|
|
3239
|
+
let lastPagination = {};
|
|
3240
|
+
const visitedPages = new Set();
|
|
3241
|
+
// Process links from a page
|
|
3242
|
+
const processLinks = (allLinks, pageNum) => {
|
|
3243
|
+
for (const link of allLinks) {
|
|
3244
|
+
try {
|
|
3245
|
+
if (seen.has(link.url))
|
|
3246
|
+
continue;
|
|
3247
|
+
seen.add(link.url);
|
|
3248
|
+
const linkUrl = new URL(link.url);
|
|
3249
|
+
const isInternal = linkUrl.hostname === currentUrl.hostname;
|
|
3250
|
+
if (args.filter && !link.url.includes(args.filter) && !link.text.includes(args.filter)) {
|
|
3251
|
+
continue;
|
|
3252
|
+
}
|
|
3253
|
+
if (isInternal && args.includeInternal === false)
|
|
3254
|
+
continue;
|
|
3255
|
+
if (!isInternal && args.includeExternal === false)
|
|
3256
|
+
continue;
|
|
3257
|
+
const category = categorizeLink(link.url, link.text, link.attrs);
|
|
3258
|
+
categories[category] = (categories[category] || 0) + 1;
|
|
3259
|
+
processedLinks.push({
|
|
3260
|
+
url: link.url,
|
|
3261
|
+
text: link.text,
|
|
3262
|
+
type: isInternal ? 'internal' : 'external',
|
|
3263
|
+
category,
|
|
3264
|
+
page: pageNum,
|
|
3265
|
+
});
|
|
3266
|
+
if (isInternal)
|
|
3267
|
+
internal++;
|
|
3268
|
+
else
|
|
3269
|
+
external++;
|
|
3270
|
+
if (args.maxLinks && processedLinks.length >= args.maxLinks)
|
|
3271
|
+
return true; // Stop
|
|
3272
|
+
}
|
|
3273
|
+
catch {
|
|
3274
|
+
// Invalid URL, skip
|
|
3275
|
+
}
|
|
3074
3276
|
}
|
|
3075
|
-
|
|
3076
|
-
|
|
3277
|
+
return false; // Continue
|
|
3278
|
+
};
|
|
3279
|
+
// Scrape first page
|
|
3280
|
+
tracker.setProgress(20, '🔍 Extracting links from page 1...');
|
|
3281
|
+
const firstPage = await extractLinksFromPage();
|
|
3282
|
+
pagesScraped = 1;
|
|
3283
|
+
visitedPages.add(page.url());
|
|
3284
|
+
lastPagination = firstPage.pagination;
|
|
3285
|
+
const shouldStop = processLinks(firstPage.links, 1);
|
|
3286
|
+
// Follow pagination if enabled
|
|
3287
|
+
if (followPagination && !shouldStop && firstPage.pagination.nextPage) {
|
|
3288
|
+
let nextUrl = firstPage.pagination.nextPage;
|
|
3289
|
+
while (nextUrl && pagesScraped < maxPages && !(args.maxLinks && processedLinks.length >= args.maxLinks)) {
|
|
3290
|
+
// Check if we've already visited this page
|
|
3291
|
+
if (visitedPages.has(nextUrl)) {
|
|
3292
|
+
break;
|
|
3293
|
+
}
|
|
3294
|
+
visitedPages.add(nextUrl);
|
|
3295
|
+
tracker.setProgress(20 + (pagesScraped / maxPages) * 60, `📄 Scraping page ${pagesScraped + 1}...`);
|
|
3296
|
+
try {
|
|
3297
|
+
// Navigate to next page
|
|
3298
|
+
await page.goto(nextUrl, {
|
|
3299
|
+
waitUntil: 'domcontentloaded',
|
|
3300
|
+
timeout: 15000
|
|
3301
|
+
});
|
|
3302
|
+
// Wait for content to load
|
|
3303
|
+
await new Promise(r => setTimeout(r, delayBetweenPages));
|
|
3304
|
+
// Extract links from this page
|
|
3305
|
+
const pageData = await extractLinksFromPage();
|
|
3306
|
+
pagesScraped++;
|
|
3307
|
+
lastPagination = pageData.pagination;
|
|
3308
|
+
const stop = processLinks(pageData.links, pagesScraped);
|
|
3309
|
+
if (stop)
|
|
3310
|
+
break;
|
|
3311
|
+
// Get next page URL
|
|
3312
|
+
nextUrl = pageData.pagination.nextPage || undefined;
|
|
3313
|
+
}
|
|
3314
|
+
catch (error) {
|
|
3315
|
+
// Failed to navigate, stop pagination
|
|
3316
|
+
break;
|
|
3317
|
+
}
|
|
3077
3318
|
}
|
|
3078
3319
|
}
|
|
3079
|
-
tracker.setProgress(90, `✅ Processed ${processedLinks.length} links`);
|
|
3080
|
-
tracker.complete(`🎉 Link harvesting complete: ${processedLinks.length} links
|
|
3320
|
+
tracker.setProgress(90, `✅ Processed ${processedLinks.length} links from ${pagesScraped} pages`);
|
|
3321
|
+
tracker.complete(`🎉 Link harvesting complete: ${processedLinks.length} links from ${pagesScraped} pages`);
|
|
3081
3322
|
return {
|
|
3082
3323
|
links: processedLinks,
|
|
3083
3324
|
internal,
|
|
3084
3325
|
external,
|
|
3085
|
-
pagination: (
|
|
3326
|
+
pagination: (lastPagination.nextPage || lastPagination.prevPage || lastPagination.totalPages) ? lastPagination : undefined,
|
|
3086
3327
|
categories,
|
|
3087
|
-
|
|
3088
|
-
|
|
3328
|
+
pagesScraped,
|
|
3329
|
+
message: `🔗 Found ${processedLinks.length} links (${internal} internal, ${external} external) from ${pagesScraped} pages` +
|
|
3330
|
+
(lastPagination.nextPage && pagesScraped >= maxPages ? ` | More pages available: ${lastPagination.nextPage}` : '')
|
|
3089
3331
|
};
|
|
3090
3332
|
}
|
|
3091
3333
|
/**
|
|
@@ -3460,10 +3702,13 @@ export async function handleSolveCaptchaAdvanced(page, args) {
|
|
|
3460
3702
|
}
|
|
3461
3703
|
/**
|
|
3462
3704
|
* Parse and extract HLS/m3u8 streaming URLs
|
|
3705
|
+
* ENHANCED: Segment parsing, bandwidth extraction, playlist fetching
|
|
3463
3706
|
*/
|
|
3464
3707
|
export async function handleM3u8Parser(page, args) {
|
|
3465
3708
|
const streams = [];
|
|
3466
3709
|
const qualities = [];
|
|
3710
|
+
const variants = [];
|
|
3711
|
+
const segments = [];
|
|
3467
3712
|
let masterPlaylist;
|
|
3468
3713
|
// Intercept network requests to find m3u8 files
|
|
3469
3714
|
const m3u8Urls = [];
|
|
@@ -3564,6 +3809,109 @@ export async function handleM3u8Parser(page, args) {
|
|
|
3564
3809
|
streams.push(stream);
|
|
3565
3810
|
}
|
|
3566
3811
|
}
|
|
3812
|
+
// ============================================================
|
|
3813
|
+
// NEW: FETCH AND PARSE MASTER PLAYLIST FOR VARIANTS
|
|
3814
|
+
// ============================================================
|
|
3815
|
+
if ((args.fetchPlaylist || args.extractBandwidth) && masterPlaylist) {
|
|
3816
|
+
try {
|
|
3817
|
+
const playlistContent = await page.evaluate(async (url) => {
|
|
3818
|
+
try {
|
|
3819
|
+
const response = await fetch(url);
|
|
3820
|
+
return await response.text();
|
|
3821
|
+
}
|
|
3822
|
+
catch {
|
|
3823
|
+
return null;
|
|
3824
|
+
}
|
|
3825
|
+
}, masterPlaylist);
|
|
3826
|
+
if (playlistContent) {
|
|
3827
|
+
// Parse #EXT-X-STREAM-INF lines for variants
|
|
3828
|
+
const variantRegex = /#EXT-X-STREAM-INF:.*?BANDWIDTH=(\d+)(?:.*?RESOLUTION=(\d+x\d+))?[^\n]*\n([^\n]+)/g;
|
|
3829
|
+
let match;
|
|
3830
|
+
while ((match = variantRegex.exec(playlistContent)) !== null) {
|
|
3831
|
+
const bandwidth = parseInt(match[1], 10);
|
|
3832
|
+
const resolution = match[2] || undefined;
|
|
3833
|
+
let variantUrl = match[3].trim();
|
|
3834
|
+
// Make relative URLs absolute
|
|
3835
|
+
if (!variantUrl.startsWith('http')) {
|
|
3836
|
+
const baseUrl = masterPlaylist.substring(0, masterPlaylist.lastIndexOf('/') + 1);
|
|
3837
|
+
variantUrl = baseUrl + variantUrl;
|
|
3838
|
+
}
|
|
3839
|
+
// Determine quality from resolution or bandwidth
|
|
3840
|
+
let quality = 'unknown';
|
|
3841
|
+
if (resolution) {
|
|
3842
|
+
const height = parseInt(resolution.split('x')[1], 10);
|
|
3843
|
+
if (height >= 2160)
|
|
3844
|
+
quality = '4K';
|
|
3845
|
+
else if (height >= 1080)
|
|
3846
|
+
quality = '1080p';
|
|
3847
|
+
else if (height >= 720)
|
|
3848
|
+
quality = '720p';
|
|
3849
|
+
else if (height >= 480)
|
|
3850
|
+
quality = '480p';
|
|
3851
|
+
else if (height >= 360)
|
|
3852
|
+
quality = '360p';
|
|
3853
|
+
else
|
|
3854
|
+
quality = `${height}p`;
|
|
3855
|
+
}
|
|
3856
|
+
else if (bandwidth >= 5000000)
|
|
3857
|
+
quality = '1080p';
|
|
3858
|
+
else if (bandwidth >= 2500000)
|
|
3859
|
+
quality = '720p';
|
|
3860
|
+
else if (bandwidth >= 1000000)
|
|
3861
|
+
quality = '480p';
|
|
3862
|
+
else
|
|
3863
|
+
quality = '360p';
|
|
3864
|
+
variants.push({ quality, bandwidth, url: variantUrl, resolution });
|
|
3865
|
+
}
|
|
3866
|
+
// Sort variants by bandwidth (highest first)
|
|
3867
|
+
variants.sort((a, b) => b.bandwidth - a.bandwidth);
|
|
3868
|
+
}
|
|
3869
|
+
}
|
|
3870
|
+
catch (e) {
|
|
3871
|
+
// Ignore fetch errors
|
|
3872
|
+
}
|
|
3873
|
+
}
|
|
3874
|
+
// ============================================================
|
|
3875
|
+
// NEW: PARSE SEGMENTS FROM MEDIA PLAYLIST
|
|
3876
|
+
// ============================================================
|
|
3877
|
+
if (args.parseSegments && streams.length > 0) {
|
|
3878
|
+
const mediaPlaylistUrl = streams[0].url;
|
|
3879
|
+
try {
|
|
3880
|
+
const mediaContent = await page.evaluate(async (url) => {
|
|
3881
|
+
try {
|
|
3882
|
+
const response = await fetch(url);
|
|
3883
|
+
return await response.text();
|
|
3884
|
+
}
|
|
3885
|
+
catch {
|
|
3886
|
+
return null;
|
|
3887
|
+
}
|
|
3888
|
+
}, mediaPlaylistUrl);
|
|
3889
|
+
if (mediaContent) {
|
|
3890
|
+
const lines = mediaContent.split('\n');
|
|
3891
|
+
let segmentIndex = 0;
|
|
3892
|
+
let currentDuration = 0;
|
|
3893
|
+
for (let i = 0; i < lines.length; i++) {
|
|
3894
|
+
const line = lines[i].trim();
|
|
3895
|
+
// Parse duration from #EXTINF
|
|
3896
|
+
if (line.startsWith('#EXTINF:')) {
|
|
3897
|
+
currentDuration = parseFloat(line.replace('#EXTINF:', '').split(',')[0]);
|
|
3898
|
+
}
|
|
3899
|
+
// Capture segment URL
|
|
3900
|
+
else if (line && !line.startsWith('#') && (line.includes('.ts') || line.includes('.m4s'))) {
|
|
3901
|
+
let segmentUrl = line;
|
|
3902
|
+
if (!segmentUrl.startsWith('http')) {
|
|
3903
|
+
const baseUrl = mediaPlaylistUrl.substring(0, mediaPlaylistUrl.lastIndexOf('/') + 1);
|
|
3904
|
+
segmentUrl = baseUrl + segmentUrl;
|
|
3905
|
+
}
|
|
3906
|
+
segments.push({ url: segmentUrl, duration: currentDuration, index: segmentIndex++ });
|
|
3907
|
+
}
|
|
3908
|
+
}
|
|
3909
|
+
}
|
|
3910
|
+
}
|
|
3911
|
+
catch (e) {
|
|
3912
|
+
// Ignore segment parsing errors
|
|
3913
|
+
}
|
|
3914
|
+
}
|
|
3567
3915
|
// Filter audio if not wanted
|
|
3568
3916
|
const filteredStreams = args.includeAudio !== false
|
|
3569
3917
|
? streams
|
|
@@ -3573,6 +3921,8 @@ export async function handleM3u8Parser(page, args) {
|
|
|
3573
3921
|
streams: filteredStreams,
|
|
3574
3922
|
masterPlaylist,
|
|
3575
3923
|
qualities: [...new Set(qualities)],
|
|
3924
|
+
variants: variants.length > 0 ? variants : undefined,
|
|
3925
|
+
segments: segments.length > 0 ? segments : undefined,
|
|
3576
3926
|
};
|
|
3577
3927
|
}
|
|
3578
3928
|
/**
|
|
@@ -4434,11 +4784,76 @@ export async function handleCloudflareBypass(page, args) {
|
|
|
4434
4784
|
/**
|
|
4435
4785
|
* Master tool: Extract direct stream/download URLs
|
|
4436
4786
|
* ULTRA POWERFUL: Handles packed JS, JW Player, Video.js, HLS.js, obfuscated scripts
|
|
4787
|
+
* ENHANCED: Multi-Quality Selector, VidSrc, Filemoon, StreamWish support
|
|
4437
4788
|
*/
|
|
4438
4789
|
export async function handleStreamExtractor(page, args) {
|
|
4439
4790
|
const formats = args.formats || ['mp4', 'mkv', 'm3u8', 'mp3', 'webm', 'flv', 'avi'];
|
|
4440
4791
|
const maxRedirects = args.maxRedirects || 10;
|
|
4441
4792
|
const directUrls = [];
|
|
4793
|
+
const subtitles = [];
|
|
4794
|
+
// Quality priority for auto-selection
|
|
4795
|
+
const qualityPriority = {
|
|
4796
|
+
'2160p': 100, '4k': 100, 'uhd': 100,
|
|
4797
|
+
'1080p': 90, 'fhd': 90, 'full hd': 90,
|
|
4798
|
+
'720p': 80, 'hd': 80,
|
|
4799
|
+
'480p': 70, 'sd': 70,
|
|
4800
|
+
'360p': 60,
|
|
4801
|
+
'240p': 50,
|
|
4802
|
+
'144p': 40,
|
|
4803
|
+
'unknown': 10, 'auto': 10
|
|
4804
|
+
};
|
|
4805
|
+
// Site-specific extraction patterns
|
|
4806
|
+
const sitePatterns = {
|
|
4807
|
+
vidsrc: {
|
|
4808
|
+
urlPattern: /vidsrc|v2\.vidsrc/i,
|
|
4809
|
+
sourcePattern: [
|
|
4810
|
+
/source:\s*["']([^"']+\.m3u8[^"']*)/gi,
|
|
4811
|
+
/file:\s*["']([^"']+\.m3u8[^"']*)/gi
|
|
4812
|
+
]
|
|
4813
|
+
},
|
|
4814
|
+
filemoon: {
|
|
4815
|
+
urlPattern: /filemoon|moonplayer/i,
|
|
4816
|
+
sourcePattern: [
|
|
4817
|
+
/sources:\s*\[\s*\{[^}]*file:\s*["']([^"']+)/gi,
|
|
4818
|
+
/eval\(function\(p,a,c,k,e,[rd]\)/gi
|
|
4819
|
+
]
|
|
4820
|
+
},
|
|
4821
|
+
streamwish: {
|
|
4822
|
+
urlPattern: /streamwish|swish/i,
|
|
4823
|
+
sourcePattern: [
|
|
4824
|
+
/file:\s*["']([^"']+\.m3u8[^"']*)/gi,
|
|
4825
|
+
/sources:\s*\[.*?["']([^"']+\.m3u8[^"']*)/gi
|
|
4826
|
+
]
|
|
4827
|
+
},
|
|
4828
|
+
doodstream: {
|
|
4829
|
+
urlPattern: /dood|doodstream/i,
|
|
4830
|
+
sourcePattern: [
|
|
4831
|
+
/\/pass_md5\/[^"']+/gi,
|
|
4832
|
+
/\$.get\(['"]([^'"]+pass_md5[^'"]+)/gi
|
|
4833
|
+
]
|
|
4834
|
+
},
|
|
4835
|
+
mixdrop: {
|
|
4836
|
+
urlPattern: /mixdrop/i,
|
|
4837
|
+
sourcePattern: [
|
|
4838
|
+
/MDCore\.wurl\s*=\s*["']([^"']+)/gi,
|
|
4839
|
+
/wurl\s*=\s*["']([^"']+)/gi
|
|
4840
|
+
]
|
|
4841
|
+
},
|
|
4842
|
+
streamtape: {
|
|
4843
|
+
urlPattern: /streamtape/i,
|
|
4844
|
+
sourcePattern: [
|
|
4845
|
+
/id=.*?&token=/gi,
|
|
4846
|
+
/robotlink.*?=\s*['"]([^'"]+)/gi
|
|
4847
|
+
]
|
|
4848
|
+
},
|
|
4849
|
+
mp4upload: {
|
|
4850
|
+
urlPattern: /mp4upload/i,
|
|
4851
|
+
sourcePattern: [
|
|
4852
|
+
/player\.src\(\{src:\s*["']([^"']+)/gi,
|
|
4853
|
+
/src:\s*["']([^"']+\.mp4[^"']*)/gi
|
|
4854
|
+
]
|
|
4855
|
+
}
|
|
4856
|
+
};
|
|
4442
4857
|
// Navigate if URL provided
|
|
4443
4858
|
if (args.url) {
|
|
4444
4859
|
await page.goto(args.url, { waitUntil: 'domcontentloaded', timeout: 30000 });
|
|
@@ -4672,25 +5087,173 @@ export async function handleStreamExtractor(page, args) {
|
|
|
4672
5087
|
directUrls.push({ url, format, source: 'network' });
|
|
4673
5088
|
}
|
|
4674
5089
|
}
|
|
5090
|
+
// ============================================================
|
|
5091
|
+
// NEW: SITE-SPECIFIC EXTRACTION (VidSrc, Filemoon, StreamWish, etc.)
|
|
5092
|
+
// ============================================================
|
|
5093
|
+
if (args.siteType && args.siteType !== 'auto') {
|
|
5094
|
+
const siteConfig = sitePatterns[args.siteType];
|
|
5095
|
+
if (siteConfig) {
|
|
5096
|
+
const html = await page.content();
|
|
5097
|
+
for (const pattern of siteConfig.sourcePattern) {
|
|
5098
|
+
let match;
|
|
5099
|
+
while ((match = pattern.exec(html)) !== null) {
|
|
5100
|
+
if (match[1] && !directUrls.some(d => d.url === match[1])) {
|
|
5101
|
+
const format = formats.find(f => match[1].includes(`.${f}`)) || 'm3u8';
|
|
5102
|
+
directUrls.push({ url: match[1], format, source: args.siteType });
|
|
5103
|
+
}
|
|
5104
|
+
}
|
|
5105
|
+
}
|
|
5106
|
+
}
|
|
5107
|
+
}
|
|
5108
|
+
else {
|
|
5109
|
+
// Auto-detect site type from URL
|
|
5110
|
+
const currentUrl = page.url();
|
|
5111
|
+
for (const [siteName, config] of Object.entries(sitePatterns)) {
|
|
5112
|
+
if (config.urlPattern.test(currentUrl)) {
|
|
5113
|
+
const html = await page.content();
|
|
5114
|
+
for (const pattern of config.sourcePattern) {
|
|
5115
|
+
let match;
|
|
5116
|
+
while ((match = pattern.exec(html)) !== null) {
|
|
5117
|
+
if (match[1] && !directUrls.some(d => d.url === match[1])) {
|
|
5118
|
+
const format = formats.find(f => match[1].includes(`.${f}`)) || 'm3u8';
|
|
5119
|
+
directUrls.push({ url: match[1], format, source: siteName });
|
|
5120
|
+
}
|
|
5121
|
+
}
|
|
5122
|
+
}
|
|
5123
|
+
break;
|
|
5124
|
+
}
|
|
5125
|
+
}
|
|
5126
|
+
}
|
|
5127
|
+
// ============================================================
|
|
5128
|
+
// NEW: EXTRACT QUALITY FROM URLs
|
|
5129
|
+
// ============================================================
|
|
5130
|
+
for (const item of directUrls) {
|
|
5131
|
+
if (!item.quality || item.quality === 'auto') {
|
|
5132
|
+
const url = item.url.toLowerCase();
|
|
5133
|
+
if (url.includes('2160') || url.includes('4k') || url.includes('uhd'))
|
|
5134
|
+
item.quality = '2160p';
|
|
5135
|
+
else if (url.includes('1080'))
|
|
5136
|
+
item.quality = '1080p';
|
|
5137
|
+
else if (url.includes('720'))
|
|
5138
|
+
item.quality = '720p';
|
|
5139
|
+
else if (url.includes('480'))
|
|
5140
|
+
item.quality = '480p';
|
|
5141
|
+
else if (url.includes('360'))
|
|
5142
|
+
item.quality = '360p';
|
|
5143
|
+
else if (url.includes('240'))
|
|
5144
|
+
item.quality = '240p';
|
|
5145
|
+
else if (url.includes('144'))
|
|
5146
|
+
item.quality = '144p';
|
|
5147
|
+
else
|
|
5148
|
+
item.quality = 'unknown';
|
|
5149
|
+
}
|
|
5150
|
+
}
|
|
5151
|
+
// ============================================================
|
|
5152
|
+
// NEW: AUTO-SELECT BEST QUALITY
|
|
5153
|
+
// ============================================================
|
|
5154
|
+
let bestQuality;
|
|
5155
|
+
if (args.autoSelectBest || args.preferredQuality) {
|
|
5156
|
+
const preferredQ = args.preferredQuality || 'highest';
|
|
5157
|
+
if (preferredQ === 'highest') {
|
|
5158
|
+
// Sort by quality priority (highest first)
|
|
5159
|
+
const sorted = [...directUrls].sort((a, b) => {
|
|
5160
|
+
const aScore = qualityPriority[a.quality?.toLowerCase() || 'unknown'] || 0;
|
|
5161
|
+
const bScore = qualityPriority[b.quality?.toLowerCase() || 'unknown'] || 0;
|
|
5162
|
+
return bScore - aScore;
|
|
5163
|
+
});
|
|
5164
|
+
if (sorted.length > 0) {
|
|
5165
|
+
bestQuality = { url: sorted[0].url, format: sorted[0].format, quality: sorted[0].quality || 'unknown', source: sorted[0].source };
|
|
5166
|
+
}
|
|
5167
|
+
}
|
|
5168
|
+
else if (preferredQ === 'lowest') {
|
|
5169
|
+
// Sort by quality priority (lowest first)
|
|
5170
|
+
const sorted = [...directUrls].sort((a, b) => {
|
|
5171
|
+
const aScore = qualityPriority[a.quality?.toLowerCase() || 'unknown'] || 0;
|
|
5172
|
+
const bScore = qualityPriority[b.quality?.toLowerCase() || 'unknown'] || 0;
|
|
5173
|
+
return aScore - bScore;
|
|
5174
|
+
});
|
|
5175
|
+
if (sorted.length > 0) {
|
|
5176
|
+
bestQuality = { url: sorted[0].url, format: sorted[0].format, quality: sorted[0].quality || 'unknown', source: sorted[0].source };
|
|
5177
|
+
}
|
|
5178
|
+
}
|
|
5179
|
+
else {
|
|
5180
|
+
// Find exact match for preferred quality
|
|
5181
|
+
const exact = directUrls.find(d => d.quality?.toLowerCase() === preferredQ.toLowerCase());
|
|
5182
|
+
if (exact) {
|
|
5183
|
+
bestQuality = { url: exact.url, format: exact.format, quality: exact.quality || preferredQ, source: exact.source };
|
|
5184
|
+
}
|
|
5185
|
+
else {
|
|
5186
|
+
// Fallback to highest available
|
|
5187
|
+
const sorted = [...directUrls].sort((a, b) => {
|
|
5188
|
+
const aScore = qualityPriority[a.quality?.toLowerCase() || 'unknown'] || 0;
|
|
5189
|
+
const bScore = qualityPriority[b.quality?.toLowerCase() || 'unknown'] || 0;
|
|
5190
|
+
return bScore - aScore;
|
|
5191
|
+
});
|
|
5192
|
+
if (sorted.length > 0) {
|
|
5193
|
+
bestQuality = { url: sorted[0].url, format: sorted[0].format, quality: sorted[0].quality || 'unknown', source: sorted[0].source };
|
|
5194
|
+
}
|
|
5195
|
+
}
|
|
5196
|
+
}
|
|
5197
|
+
}
|
|
5198
|
+
// ============================================================
|
|
5199
|
+
// NEW: EXTRACT SUBTITLES
|
|
5200
|
+
// ============================================================
|
|
5201
|
+
if (args.extractSubtitles) {
|
|
5202
|
+
const subData = await page.evaluate(() => {
|
|
5203
|
+
const subs = [];
|
|
5204
|
+
// HTML5 track elements
|
|
5205
|
+
document.querySelectorAll('track[kind="subtitles"], track[kind="captions"]').forEach(track => {
|
|
5206
|
+
const src = track.getAttribute('src');
|
|
5207
|
+
if (src) {
|
|
5208
|
+
subs.push({
|
|
5209
|
+
url: src,
|
|
5210
|
+
language: track.getAttribute('srclang') || undefined,
|
|
5211
|
+
label: track.getAttribute('label') || undefined
|
|
5212
|
+
});
|
|
5213
|
+
}
|
|
5214
|
+
});
|
|
5215
|
+
// VTT/SRT links
|
|
5216
|
+
document.querySelectorAll('a[href*=".vtt"], a[href*=".srt"], a[href*=".ass"]').forEach(link => {
|
|
5217
|
+
const href = link.href;
|
|
5218
|
+
subs.push({ url: href, label: link.textContent?.trim() || undefined });
|
|
5219
|
+
});
|
|
5220
|
+
// Look in scripts for subtitle URLs
|
|
5221
|
+
const html = document.documentElement.innerHTML;
|
|
5222
|
+
const vttMatches = html.match(/https?:\/\/[^\s"']+\.vtt[^\s"']*/gi);
|
|
5223
|
+
const srtMatches = html.match(/https?:\/\/[^\s"']+\.srt[^\s"']*/gi);
|
|
5224
|
+
if (vttMatches)
|
|
5225
|
+
vttMatches.forEach(url => subs.push({ url }));
|
|
5226
|
+
if (srtMatches)
|
|
5227
|
+
srtMatches.forEach(url => subs.push({ url }));
|
|
5228
|
+
// Deduplicate
|
|
5229
|
+
const seen = new Set();
|
|
5230
|
+
return subs.filter(s => {
|
|
5231
|
+
if (seen.has(s.url))
|
|
5232
|
+
return false;
|
|
5233
|
+
seen.add(s.url);
|
|
5234
|
+
return true;
|
|
5235
|
+
});
|
|
5236
|
+
});
|
|
5237
|
+
subtitles.push(...subData);
|
|
5238
|
+
}
|
|
4675
5239
|
return {
|
|
4676
5240
|
success: directUrls.length > 0,
|
|
4677
5241
|
directUrls,
|
|
5242
|
+
bestQuality,
|
|
5243
|
+
subtitles: args.extractSubtitles ? subtitles : undefined,
|
|
4678
5244
|
message: directUrls.length > 0
|
|
4679
|
-
? `🎬 Found ${directUrls.length}
|
|
5245
|
+
? `🎬 Found ${directUrls.length} URL(s)${bestQuality ? ` | Best: ${bestQuality.quality}` : ''}${subtitles.length > 0 ? ` | ${subtitles.length} subtitles` : ''}`
|
|
4680
5246
|
: 'No direct URLs found',
|
|
4681
5247
|
};
|
|
4682
5248
|
}
|
|
4683
|
-
|
|
4684
|
-
|
|
4685
|
-
|
|
4686
|
-
* Perfect for scraping dynamic/AJAX-loaded content
|
|
4687
|
-
*/
|
|
4688
|
-
export async function handleJsScrape(page, args) {
|
|
5249
|
+
// Helper function to scrape a single URL
|
|
5250
|
+
async function scrapeSingleUrl(page, url, args) {
|
|
5251
|
+
const startTime = Date.now();
|
|
4689
5252
|
const waitForTimeout = args.waitForTimeout || 10000;
|
|
4690
5253
|
const returnType = args.returnType || 'html';
|
|
4691
5254
|
try {
|
|
4692
5255
|
// Step 1: Navigate to URL
|
|
4693
|
-
await page.goto(
|
|
5256
|
+
await page.goto(url, {
|
|
4694
5257
|
waitUntil: 'domcontentloaded',
|
|
4695
5258
|
timeout: waitForTimeout
|
|
4696
5259
|
});
|
|
@@ -4792,26 +5355,156 @@ export async function handleJsScrape(page, args) {
|
|
|
4792
5355
|
}
|
|
4793
5356
|
return {
|
|
4794
5357
|
success: true,
|
|
4795
|
-
url
|
|
5358
|
+
url,
|
|
4796
5359
|
finalUrl,
|
|
4797
5360
|
title,
|
|
4798
5361
|
html,
|
|
4799
5362
|
text,
|
|
4800
5363
|
elements,
|
|
4801
|
-
elementCount
|
|
5364
|
+
elementCount,
|
|
5365
|
+
duration: Date.now() - startTime
|
|
4802
5366
|
};
|
|
4803
5367
|
}
|
|
4804
5368
|
catch (error) {
|
|
4805
5369
|
return {
|
|
4806
5370
|
success: false,
|
|
4807
|
-
url
|
|
4808
|
-
finalUrl: page.url() ||
|
|
5371
|
+
url,
|
|
5372
|
+
finalUrl: page.url() || url,
|
|
4809
5373
|
title: '',
|
|
4810
5374
|
elementCount: 0,
|
|
4811
|
-
error: error instanceof Error ? error.message : String(error)
|
|
5375
|
+
error: error instanceof Error ? error.message : String(error),
|
|
5376
|
+
duration: Date.now() - startTime
|
|
4812
5377
|
};
|
|
4813
5378
|
}
|
|
4814
5379
|
}
|
|
5380
|
+
/**
 * JS Scrape - single-call JavaScript-rendered content extraction.
 * Combines navigation, waiting, scrolling, and content extraction in one call
 * (the per-URL work is delegated to scrapeSingleUrl).
 *
 * Modes:
 *  - Single URL: `args.url` is given and `args.urls` is absent/empty. The URL is
 *    scraped on the supplied `page` and the scrapeSingleUrl result is returned as-is.
 *  - Parallel: `args.urls` is given. Each URL is scraped on its own page obtained
 *    from `page.browser().newPage()`, with at most `concurrency` pages open at once.
 *
 * @param {object} page - Page used for single-URL mode; must expose `browser()` for parallel mode.
 * @param {object} args - Tool arguments (also forwarded unchanged to scrapeSingleUrl).
 * @param {string} [args.url] - Single URL to scrape.
 * @param {string[]|string} [args.urls] - URLs to scrape in parallel mode.
 * @param {number} [args.concurrency=3] - Max simultaneous pages; clamped to the range 1..10.
 * @param {boolean} [args.continueOnError=true] - When false, the first failed URL discards the remaining queue.
 * @param {number} [args.delayBetween=500] - Milliseconds a worker waits before starting its next URL; 0 disables the wait.
 * @returns {Promise<object>} Single mode: the scrapeSingleUrl result. Parallel mode:
 *   `{ success, isParallel, urls, results, totalUrls, successCount, failedCount, totalDuration, error }`
 *   where `results` is in completion order (not input order). With no URL, or no browser,
 *   `{ success: false, error, elementCount: 0 }`.
 */
export async function handleJsScrape(page, args) {
    const startTime = Date.now();
    const DEFAULT_CONCURRENCY = 3;
    const MAX_CONCURRENCY = 10;
    const DEFAULT_DELAY_MS = 500;
    // Normalise `urls`: accept an array or a lone string; an empty list counts as "not provided"
    // so that a valid `url` is not shadowed by `urls: []`.
    let requestedUrls = [];
    if (Array.isArray(args.urls)) {
        requestedUrls = args.urls;
    }
    else if (typeof args.urls === 'string' && args.urls) {
        requestedUrls = [args.urls];
    }
    const parallelMode = requestedUrls.length > 0;
    const urlList = parallelMode ? requestedUrls : (args.url ? [args.url] : []);
    if (urlList.length === 0) {
        return {
            success: false,
            error: 'No URL(s) provided. Use "url" for single URL or "urls" for multiple URLs.',
            elementCount: 0
        };
    }
    // Single URL mode - backward compatible
    if (!parallelMode) {
        return scrapeSingleUrl(page, urlList[0], args);
    }
    // Parallel scraping mode
    // Clamp to [1, MAX_CONCURRENCY]: a value below 1 would otherwise never start any task.
    const requestedConcurrency = Number(args.concurrency);
    const concurrency = Number.isFinite(requestedConcurrency) && requestedConcurrency !== 0
        ? Math.max(1, Math.min(Math.ceil(requestedConcurrency), MAX_CONCURRENCY))
        : DEFAULT_CONCURRENCY;
    const continueOnError = args.continueOnError !== false;
    // `??` rather than `||` so that an explicit 0 means "no delay".
    const delayBetween = args.delayBetween ?? DEFAULT_DELAY_MS;
    const browser = page.browser();
    if (!browser) {
        return {
            success: false,
            error: 'Browser not available for parallel scraping',
            elementCount: 0
        };
    }
    const results = [];
    const errors = [];
    const queue = [...urlList];
    const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
    // Scrape one URL on a dedicated page; never rejects, and always closes the page.
    const processUrl = async (url) => {
        const startedAt = Date.now();
        let newPage = null;
        try {
            newPage = await browser.newPage();
            await newPage.setViewport({ width: 1280, height: 720 });
            return await scrapeSingleUrl(newPage, url, args);
        }
        catch (error) {
            return {
                success: false,
                url,
                finalUrl: url,
                title: '',
                elementCount: 0,
                error: error instanceof Error ? error.message : String(error),
                duration: Date.now() - startedAt
            };
        }
        finally {
            if (newPage) {
                try {
                    await newPage.close();
                }
                catch (e) {
                    // Best-effort cleanup: a failed close must not mask the scrape result.
                }
            }
        }
    };
    // Each worker drains the shared queue sequentially; `concurrency` workers run side by side,
    // which bounds the number of open pages without any polling.
    const worker = async () => {
        let isFirstTask = true;
        while (queue.length > 0) {
            if (!isFirstTask && delayBetween > 0) {
                await sleep(delayBetween);
                // The queue may have been cleared by a failure while this worker was waiting.
                if (queue.length === 0) {
                    break;
                }
            }
            isFirstTask = false;
            const url = queue.shift();
            const result = await processUrl(url);
            if (!result) {
                continue;
            }
            results.push(result);
            if (!result.success && !continueOnError) {
                // Clear queue to stop processing
                queue.length = 0;
                errors.push(`Stopped at ${url}: ${result.error}`);
            }
        }
    };
    const workerCount = Math.min(concurrency, urlList.length);
    await Promise.all(Array.from({ length: workerCount }, () => worker()));
    const successCount = results.filter(r => r.success).length;
    const failedCount = results.length - successCount;
    return {
        success: successCount > 0,
        isParallel: true,
        urls: urlList,
        results,
        totalUrls: urlList.length,
        successCount,
        failedCount,
        totalDuration: Date.now() - startTime,
        error: errors.length > 0 ? errors.join('; ') : undefined
    };
}
|
|
4815
5508
|
/**
|
|
4816
5509
|
* Execute custom JavaScript on page
|
|
4817
5510
|
* ULTRA POWERFUL: Execute any JS code and get results
|