maxun-core 0.0.8 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.js +129 -19
- package/package.json +1 -1
package/build/interpret.js
CHANGED
|
@@ -404,6 +404,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
404
404
|
};
|
|
405
405
|
const executeAction = (invokee, methodName, args) => __awaiter(this, void 0, void 0, function* () {
|
|
406
406
|
console.log("Executing action:", methodName, args);
|
|
407
|
+
if (methodName === 'press' || methodName === 'type') {
|
|
408
|
+
// Extract only the first two arguments for these methods
|
|
409
|
+
const limitedArgs = Array.isArray(args) ? args.slice(0, 2) : [args];
|
|
410
|
+
yield invokee[methodName](...limitedArgs);
|
|
411
|
+
return;
|
|
412
|
+
}
|
|
407
413
|
if (!args || Array.isArray(args)) {
|
|
408
414
|
yield invokee[methodName](...(args !== null && args !== void 0 ? args : []));
|
|
409
415
|
}
|
|
@@ -461,6 +467,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
461
467
|
let previousHeight = 0;
|
|
462
468
|
// track unique items per page to avoid re-scraping
|
|
463
469
|
let scrapedItems = new Set();
|
|
470
|
+
let visitedUrls = [];
|
|
471
|
+
// Debug logging helper
|
|
472
|
+
const debugLog = (message, ...args) => {
|
|
473
|
+
console.log(`[Page ${visitedUrls.length + 1}] ${message}`, ...args);
|
|
474
|
+
};
|
|
475
|
+
let availableSelectors = config.pagination.selector.split(',');
|
|
464
476
|
while (true) {
|
|
465
477
|
switch (config.pagination.type) {
|
|
466
478
|
case 'scrollDown':
|
|
@@ -486,56 +498,154 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
486
498
|
previousHeight = currentTopHeight;
|
|
487
499
|
break;
|
|
488
500
|
case 'clickNext':
|
|
501
|
+
debugLog("Current URL:", page.url());
|
|
489
502
|
const pageResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
490
|
-
// console.log("Page results:", pageResults);
|
|
491
503
|
// Filter out already scraped items
|
|
492
504
|
const newResults = pageResults.filter(item => {
|
|
493
505
|
const uniqueKey = JSON.stringify(item);
|
|
494
506
|
if (scrapedItems.has(uniqueKey))
|
|
495
|
-
return false;
|
|
496
|
-
scrapedItems.add(uniqueKey);
|
|
507
|
+
return false;
|
|
508
|
+
scrapedItems.add(uniqueKey);
|
|
497
509
|
return true;
|
|
498
510
|
});
|
|
499
511
|
allResults = allResults.concat(newResults);
|
|
512
|
+
debugLog("Results collected so far:", allResults.length);
|
|
500
513
|
if (config.limit && allResults.length >= config.limit) {
|
|
501
514
|
return allResults.slice(0, config.limit);
|
|
502
515
|
}
|
|
503
|
-
|
|
516
|
+
yield page.waitForLoadState('networkidle', { timeout: 30000 });
|
|
517
|
+
yield page.waitForTimeout(2000);
|
|
518
|
+
let checkButton = null;
|
|
519
|
+
let workingSelector = null;
|
|
520
|
+
// Try each selector with explicit waiting
|
|
521
|
+
for (const selector of availableSelectors) {
|
|
522
|
+
try {
|
|
523
|
+
checkButton = yield page.waitForSelector(selector, {
|
|
524
|
+
state: 'attached',
|
|
525
|
+
timeout: 30000
|
|
526
|
+
});
|
|
527
|
+
if (checkButton) {
|
|
528
|
+
workingSelector = selector;
|
|
529
|
+
debugLog('Found working selector:', selector);
|
|
530
|
+
break;
|
|
531
|
+
}
|
|
532
|
+
}
|
|
533
|
+
catch (error) {
|
|
534
|
+
debugLog(`Selector failed: ${selector} - ${error.message}`);
|
|
535
|
+
}
|
|
536
|
+
}
|
|
537
|
+
if (!workingSelector) {
|
|
538
|
+
debugLog('No working selector found after trying all options');
|
|
539
|
+
return allResults;
|
|
540
|
+
}
|
|
541
|
+
const nextButton = yield page.$(workingSelector);
|
|
504
542
|
if (!nextButton) {
|
|
505
|
-
|
|
543
|
+
debugLog('Next button not found');
|
|
544
|
+
return allResults;
|
|
545
|
+
}
|
|
546
|
+
const selectorIndex = availableSelectors.indexOf(workingSelector);
|
|
547
|
+
availableSelectors = availableSelectors.slice(selectorIndex);
|
|
548
|
+
try {
|
|
549
|
+
// Store current URL to check if navigation succeeded
|
|
550
|
+
const previousUrl = page.url();
|
|
551
|
+
visitedUrls.push(previousUrl);
|
|
552
|
+
// Try both click methods in sequence
|
|
553
|
+
try {
|
|
554
|
+
yield Promise.all([
|
|
555
|
+
page.waitForNavigation({
|
|
556
|
+
waitUntil: 'networkidle',
|
|
557
|
+
timeout: 15000
|
|
558
|
+
}),
|
|
559
|
+
nextButton.click()
|
|
560
|
+
]);
|
|
561
|
+
}
|
|
562
|
+
catch (error) {
|
|
563
|
+
// If we're still on the same URL, try dispatch event
|
|
564
|
+
if (page.url() === previousUrl) {
|
|
565
|
+
yield Promise.all([
|
|
566
|
+
page.waitForNavigation({
|
|
567
|
+
waitUntil: 'networkidle',
|
|
568
|
+
timeout: 15000
|
|
569
|
+
}),
|
|
570
|
+
nextButton.dispatchEvent('click')
|
|
571
|
+
]);
|
|
572
|
+
}
|
|
573
|
+
}
|
|
574
|
+
yield page.waitForLoadState('domcontentloaded');
|
|
575
|
+
yield page.waitForLoadState('networkidle', { timeout: 30000 });
|
|
576
|
+
const currentUrl = page.url();
|
|
577
|
+
if (visitedUrls.includes(currentUrl)) {
|
|
578
|
+
debugLog(`Navigation failed/Detected navigation to previously visited URL: ${currentUrl}`);
|
|
579
|
+
return allResults;
|
|
580
|
+
}
|
|
581
|
+
// Give the page a moment to stabilize after navigation
|
|
582
|
+
yield page.waitForTimeout(1000);
|
|
583
|
+
}
|
|
584
|
+
catch (error) {
|
|
585
|
+
debugLog(`Navigation failed completely: ${error.message}`);
|
|
586
|
+
return allResults;
|
|
506
587
|
}
|
|
507
|
-
yield Promise.all([
|
|
508
|
-
nextButton.dispatchEvent('click'),
|
|
509
|
-
page.waitForNavigation({ waitUntil: 'networkidle' })
|
|
510
|
-
]);
|
|
511
|
-
yield page.waitForTimeout(1000);
|
|
512
588
|
break;
|
|
513
589
|
case 'clickLoadMore':
|
|
514
590
|
while (true) {
|
|
515
|
-
|
|
591
|
+
let checkButton = null;
|
|
592
|
+
let workingSelector = null;
|
|
593
|
+
for (const selector of availableSelectors) {
|
|
594
|
+
try {
|
|
595
|
+
checkButton = yield page.waitForSelector(selector, {
|
|
596
|
+
state: 'attached',
|
|
597
|
+
timeout: 30000
|
|
598
|
+
});
|
|
599
|
+
if (checkButton) {
|
|
600
|
+
workingSelector = selector;
|
|
601
|
+
debugLog('Found working selector:', selector);
|
|
602
|
+
break;
|
|
603
|
+
}
|
|
604
|
+
}
|
|
605
|
+
catch (error) {
|
|
606
|
+
debugLog(`Load More selector failed: ${selector}`);
|
|
607
|
+
}
|
|
608
|
+
}
|
|
609
|
+
if (!workingSelector) {
|
|
610
|
+
debugLog('No working Load More selector found');
|
|
611
|
+
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
612
|
+
allResults = allResults.concat(finalResults);
|
|
613
|
+
return allResults;
|
|
614
|
+
}
|
|
615
|
+
const loadMoreButton = yield page.$(workingSelector);
|
|
516
616
|
if (!loadMoreButton) {
|
|
517
|
-
|
|
617
|
+
debugLog('Load More button not found');
|
|
518
618
|
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
519
619
|
allResults = allResults.concat(finalResults);
|
|
520
620
|
return allResults;
|
|
521
621
|
}
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
622
|
+
const selectorIndex = availableSelectors.indexOf(workingSelector);
|
|
623
|
+
availableSelectors = availableSelectors.slice(selectorIndex);
|
|
624
|
+
try {
|
|
625
|
+
try {
|
|
626
|
+
yield loadMoreButton.click();
|
|
627
|
+
}
|
|
628
|
+
catch (error) {
|
|
629
|
+
yield loadMoreButton.dispatchEvent('click');
|
|
630
|
+
}
|
|
631
|
+
}
|
|
632
|
+
catch (error) {
|
|
633
|
+
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
634
|
+
allResults = allResults.concat(finalResults);
|
|
635
|
+
return allResults;
|
|
636
|
+
}
|
|
637
|
+
yield page.waitForTimeout(2000);
|
|
526
638
|
yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
|
527
639
|
yield page.waitForTimeout(2000);
|
|
528
|
-
// Check if more items are available
|
|
529
640
|
const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
|
|
530
641
|
if (currentHeight === previousHeight) {
|
|
531
|
-
|
|
642
|
+
debugLog('No more items loaded after Load More');
|
|
532
643
|
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
533
644
|
allResults = allResults.concat(finalResults);
|
|
534
645
|
return allResults;
|
|
535
646
|
}
|
|
536
647
|
previousHeight = currentHeight;
|
|
537
648
|
if (config.limit && allResults.length >= config.limit) {
|
|
538
|
-
// If limit is set and reached, return the limited results
|
|
539
649
|
allResults = allResults.slice(0, config.limit);
|
|
540
650
|
break;
|
|
541
651
|
}
|