maxun-core 0.0.9 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.js +251 -172
- package/package.json +1 -1
package/build/interpret.js
CHANGED
|
@@ -404,6 +404,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
404
404
|
};
|
|
405
405
|
const executeAction = (invokee, methodName, args) => __awaiter(this, void 0, void 0, function* () {
|
|
406
406
|
console.log("Executing action:", methodName, args);
|
|
407
|
+
if (methodName === 'press' || methodName === 'type') {
|
|
408
|
+
// Extract only the first two arguments for these methods
|
|
409
|
+
const limitedArgs = Array.isArray(args) ? args.slice(0, 2) : [args];
|
|
410
|
+
yield invokee[methodName](...limitedArgs);
|
|
411
|
+
return;
|
|
412
|
+
}
|
|
407
413
|
if (!args || Array.isArray(args)) {
|
|
408
414
|
yield invokee[methodName](...(args !== null && args !== void 0 ? args : []));
|
|
409
415
|
}
|
|
@@ -459,199 +465,272 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
459
465
|
return __awaiter(this, void 0, void 0, function* () {
|
|
460
466
|
let allResults = [];
|
|
461
467
|
let previousHeight = 0;
|
|
462
|
-
// track unique items per page to avoid re-scraping
|
|
463
468
|
let scrapedItems = new Set();
|
|
464
|
-
let visitedUrls =
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
const uniqueKey = JSON.stringify(item);
|
|
497
|
-
if (scrapedItems.has(uniqueKey))
|
|
498
|
-
return false; // Ignore if already scraped
|
|
499
|
-
scrapedItems.add(uniqueKey); // Mark as scraped
|
|
500
|
-
return true;
|
|
469
|
+
let visitedUrls = new Set();
|
|
470
|
+
const MAX_RETRIES = 3;
|
|
471
|
+
const RETRY_DELAY = 1000; // 1 second delay between retries
|
|
472
|
+
const debugLog = (message, ...args) => {
|
|
473
|
+
console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
|
|
474
|
+
};
|
|
475
|
+
const scrapeCurrentPage = () => __awaiter(this, void 0, void 0, function* () {
|
|
476
|
+
const results = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
477
|
+
const newResults = results.filter(item => {
|
|
478
|
+
const uniqueKey = JSON.stringify(item);
|
|
479
|
+
if (scrapedItems.has(uniqueKey))
|
|
480
|
+
return false;
|
|
481
|
+
scrapedItems.add(uniqueKey);
|
|
482
|
+
return true;
|
|
483
|
+
});
|
|
484
|
+
allResults = allResults.concat(newResults);
|
|
485
|
+
debugLog("Results collected:", allResults.length);
|
|
486
|
+
});
|
|
487
|
+
const checkLimit = () => {
|
|
488
|
+
if (config.limit && allResults.length >= config.limit) {
|
|
489
|
+
allResults = allResults.slice(0, config.limit);
|
|
490
|
+
return true;
|
|
491
|
+
}
|
|
492
|
+
return false;
|
|
493
|
+
};
|
|
494
|
+
// Enhanced button finder with retry mechanism
|
|
495
|
+
const findWorkingButton = (selectors, retryCount = 0) => __awaiter(this, void 0, void 0, function* () {
|
|
496
|
+
for (const selector of selectors) {
|
|
497
|
+
try {
|
|
498
|
+
const button = yield page.waitForSelector(selector, {
|
|
499
|
+
state: 'attached',
|
|
500
|
+
timeout: 10000 // Reduced timeout for faster checks
|
|
501
501
|
});
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
return allResults.slice(0, config.limit);
|
|
502
|
+
if (button) {
|
|
503
|
+
debugLog('Found working selector:', selector);
|
|
504
|
+
return { button, workingSelector: selector };
|
|
506
505
|
}
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
506
|
+
}
|
|
507
|
+
catch (error) {
|
|
508
|
+
debugLog(`Selector failed: ${selector}`);
|
|
509
|
+
}
|
|
510
|
+
}
|
|
511
|
+
// Implement retry mechanism when no selectors work
|
|
512
|
+
if (selectors.length > 0 && retryCount < MAX_RETRIES) {
|
|
513
|
+
debugLog(`Retry attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
|
514
|
+
yield page.waitForTimeout(RETRY_DELAY);
|
|
515
|
+
return findWorkingButton(selectors, retryCount + 1);
|
|
516
|
+
}
|
|
517
|
+
return { button: null, workingSelector: null };
|
|
518
|
+
});
|
|
519
|
+
const retryOperation = (operation, retryCount = 0) => __awaiter(this, void 0, void 0, function* () {
|
|
520
|
+
try {
|
|
521
|
+
return yield operation();
|
|
522
|
+
}
|
|
523
|
+
catch (error) {
|
|
524
|
+
if (retryCount < MAX_RETRIES) {
|
|
525
|
+
debugLog(`Retrying operation. Attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
|
526
|
+
yield page.waitForTimeout(RETRY_DELAY);
|
|
527
|
+
return retryOperation(operation, retryCount + 1);
|
|
528
|
+
}
|
|
529
|
+
debugLog(`Operation failed after ${MAX_RETRIES} retries`);
|
|
530
|
+
return false;
|
|
531
|
+
}
|
|
532
|
+
});
|
|
533
|
+
let availableSelectors = config.pagination.selector.split(',');
|
|
534
|
+
try {
|
|
535
|
+
while (true) {
|
|
536
|
+
// Reduced timeout for faster performance
|
|
537
|
+
yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => { });
|
|
538
|
+
switch (config.pagination.type) {
|
|
539
|
+
case 'scrollDown': {
|
|
540
|
+
yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
|
541
|
+
yield page.waitForTimeout(2000);
|
|
542
|
+
const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
|
|
543
|
+
if (currentHeight === previousHeight) {
|
|
544
|
+
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
545
|
+
allResults = allResults.concat(finalResults);
|
|
546
|
+
return allResults;
|
|
521
547
|
}
|
|
548
|
+
previousHeight = currentHeight;
|
|
549
|
+
break;
|
|
522
550
|
}
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
const selectorIndex = availableSelectors.indexOf(workingSelector);
|
|
532
|
-
availableSelectors = availableSelectors.slice(selectorIndex);
|
|
533
|
-
// await Promise.all([
|
|
534
|
-
// nextButton.dispatchEvent('click'),
|
|
535
|
-
// page.waitForNavigation({ waitUntil: 'networkidle' })
|
|
536
|
-
// ]);
|
|
537
|
-
const previousUrl = page.url();
|
|
538
|
-
visitedUrls.push(previousUrl);
|
|
539
|
-
try {
|
|
540
|
-
// Try both click methods simultaneously
|
|
541
|
-
yield Promise.race([
|
|
542
|
-
Promise.all([
|
|
543
|
-
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
|
|
544
|
-
nextButton.click()
|
|
545
|
-
]),
|
|
546
|
-
Promise.all([
|
|
547
|
-
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
|
|
548
|
-
nextButton.dispatchEvent('click')
|
|
549
|
-
])
|
|
550
|
-
]);
|
|
551
|
-
}
|
|
552
|
-
catch (error) {
|
|
553
|
-
// Verify if navigation actually succeeded
|
|
554
|
-
const currentUrl = page.url();
|
|
555
|
-
if (currentUrl === previousUrl) {
|
|
556
|
-
console.log("Previous URL same as current URL. Navigation failed.");
|
|
551
|
+
case 'scrollUp': {
|
|
552
|
+
yield page.evaluate(() => window.scrollTo(0, 0));
|
|
553
|
+
yield page.waitForTimeout(2000);
|
|
554
|
+
const currentTopHeight = yield page.evaluate(() => document.documentElement.scrollTop);
|
|
555
|
+
if (currentTopHeight === 0) {
|
|
556
|
+
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
557
|
+
allResults = allResults.concat(finalResults);
|
|
558
|
+
return allResults;
|
|
557
559
|
}
|
|
560
|
+
previousHeight = currentTopHeight;
|
|
561
|
+
break;
|
|
558
562
|
}
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
563
|
+
case 'clickNext': {
|
|
564
|
+
const currentUrl = page.url();
|
|
565
|
+
visitedUrls.add(currentUrl);
|
|
566
|
+
yield scrapeCurrentPage();
|
|
567
|
+
if (checkLimit())
|
|
568
|
+
return allResults;
|
|
569
|
+
const { button, workingSelector } = yield findWorkingButton(availableSelectors);
|
|
570
|
+
if (!button || !workingSelector) {
|
|
571
|
+
// Final retry for navigation when no selectors work
|
|
572
|
+
const success = yield retryOperation(() => __awaiter(this, void 0, void 0, function* () {
|
|
573
|
+
try {
|
|
574
|
+
yield page.evaluate(() => window.history.forward());
|
|
575
|
+
const newUrl = page.url();
|
|
576
|
+
return !visitedUrls.has(newUrl);
|
|
577
|
+
}
|
|
578
|
+
catch (_a) {
|
|
579
|
+
return false;
|
|
580
|
+
}
|
|
581
|
+
}));
|
|
582
|
+
if (!success)
|
|
583
|
+
return allResults;
|
|
584
|
+
break;
|
|
576
585
|
}
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
case 'clickLoadMore':
|
|
582
|
-
while (true) {
|
|
583
|
-
let checkButton = null;
|
|
584
|
-
let workingSelector = null;
|
|
585
|
-
for (let i = 0; i < availableSelectors.length; i++) {
|
|
586
|
-
const selector = availableSelectors[i];
|
|
586
|
+
availableSelectors = availableSelectors.slice(availableSelectors.indexOf(workingSelector));
|
|
587
|
+
let retryCount = 0;
|
|
588
|
+
let navigationSuccess = false;
|
|
589
|
+
while (retryCount < MAX_RETRIES && !navigationSuccess) {
|
|
587
590
|
try {
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
591
|
+
try {
|
|
592
|
+
yield Promise.all([
|
|
593
|
+
page.waitForNavigation({
|
|
594
|
+
waitUntil: 'networkidle',
|
|
595
|
+
timeout: 15000
|
|
596
|
+
}),
|
|
597
|
+
button.click()
|
|
598
|
+
]);
|
|
599
|
+
navigationSuccess = true;
|
|
600
|
+
}
|
|
601
|
+
catch (error) {
|
|
602
|
+
debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
|
|
603
|
+
// If regular click fails, try dispatchEvent
|
|
604
|
+
if (page.url() === currentUrl) {
|
|
605
|
+
try {
|
|
606
|
+
yield Promise.all([
|
|
607
|
+
page.waitForNavigation({
|
|
608
|
+
waitUntil: 'networkidle',
|
|
609
|
+
timeout: 15000
|
|
610
|
+
}),
|
|
611
|
+
button.dispatchEvent('click')
|
|
612
|
+
]);
|
|
613
|
+
navigationSuccess = true;
|
|
614
|
+
}
|
|
615
|
+
catch (dispatchError) {
|
|
616
|
+
debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
else {
|
|
620
|
+
navigationSuccess = true;
|
|
621
|
+
}
|
|
622
|
+
}
|
|
623
|
+
const newUrl = page.url();
|
|
624
|
+
if (visitedUrls.has(newUrl)) {
|
|
625
|
+
debugLog(`Detected navigation to previously visited URL ${newUrl} on attempt ${retryCount + 1}`);
|
|
626
|
+
navigationSuccess = false;
|
|
627
|
+
}
|
|
628
|
+
if (navigationSuccess) {
|
|
629
|
+
yield page.waitForTimeout(1000);
|
|
593
630
|
}
|
|
594
631
|
}
|
|
595
632
|
catch (error) {
|
|
596
|
-
|
|
633
|
+
debugLog(`Navigation attempt ${retryCount + 1} failed completely.`);
|
|
634
|
+
navigationSuccess = false;
|
|
635
|
+
}
|
|
636
|
+
if (!navigationSuccess) {
|
|
637
|
+
retryCount++;
|
|
638
|
+
if (retryCount < MAX_RETRIES) {
|
|
639
|
+
debugLog(`Retrying navigation - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
|
640
|
+
yield page.waitForTimeout(RETRY_DELAY);
|
|
641
|
+
}
|
|
597
642
|
}
|
|
598
643
|
}
|
|
599
|
-
if (!
|
|
600
|
-
|
|
601
|
-
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
602
|
-
allResults = allResults.concat(finalResults);
|
|
603
|
-
return allResults;
|
|
604
|
-
}
|
|
605
|
-
const loadMoreButton = yield page.$(workingSelector);
|
|
606
|
-
if (!loadMoreButton) {
|
|
607
|
-
// No more "Load More" button, so scrape the remaining items
|
|
608
|
-
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
609
|
-
allResults = allResults.concat(finalResults);
|
|
644
|
+
if (!navigationSuccess) {
|
|
645
|
+
debugLog(`Navigation failed after ${MAX_RETRIES} attempts`);
|
|
610
646
|
return allResults;
|
|
611
647
|
}
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
yield
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
648
|
+
break;
|
|
649
|
+
}
|
|
650
|
+
case 'clickLoadMore': {
|
|
651
|
+
while (true) {
|
|
652
|
+
// Find working button with retry mechanism, consistent with clickNext
|
|
653
|
+
const { button: loadMoreButton, workingSelector } = yield findWorkingButton(availableSelectors);
|
|
654
|
+
if (!workingSelector || !loadMoreButton) {
|
|
655
|
+
debugLog('No working Load More selector found after retries');
|
|
656
|
+
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
657
|
+
allResults = allResults.concat(finalResults);
|
|
658
|
+
return allResults;
|
|
659
|
+
}
|
|
660
|
+
// Update available selectors to start from the working one
|
|
661
|
+
availableSelectors = availableSelectors.slice(availableSelectors.indexOf(workingSelector));
|
|
662
|
+
// Implement retry mechanism for clicking the button
|
|
663
|
+
let retryCount = 0;
|
|
664
|
+
let clickSuccess = false;
|
|
665
|
+
while (retryCount < MAX_RETRIES && !clickSuccess) {
|
|
666
|
+
try {
|
|
667
|
+
try {
|
|
668
|
+
yield loadMoreButton.click();
|
|
669
|
+
clickSuccess = true;
|
|
670
|
+
}
|
|
671
|
+
catch (error) {
|
|
672
|
+
debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
|
|
673
|
+
// If regular click fails, try dispatchEvent
|
|
674
|
+
try {
|
|
675
|
+
yield loadMoreButton.dispatchEvent('click');
|
|
676
|
+
clickSuccess = true;
|
|
677
|
+
}
|
|
678
|
+
catch (dispatchError) {
|
|
679
|
+
debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
|
|
680
|
+
throw dispatchError; // Propagate error to trigger retry
|
|
681
|
+
}
|
|
682
|
+
}
|
|
683
|
+
if (clickSuccess) {
|
|
684
|
+
yield page.waitForTimeout(1000);
|
|
685
|
+
}
|
|
686
|
+
}
|
|
687
|
+
catch (error) {
|
|
688
|
+
debugLog(`Click attempt ${retryCount + 1} failed completely.`);
|
|
689
|
+
retryCount++;
|
|
690
|
+
if (retryCount < MAX_RETRIES) {
|
|
691
|
+
debugLog(`Retrying click - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
|
692
|
+
yield page.waitForTimeout(RETRY_DELAY);
|
|
693
|
+
}
|
|
694
|
+
}
|
|
695
|
+
}
|
|
696
|
+
if (!clickSuccess) {
|
|
697
|
+
debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
|
|
698
|
+
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
699
|
+
allResults = allResults.concat(finalResults);
|
|
700
|
+
return allResults;
|
|
701
|
+
}
|
|
702
|
+
// Wait for content to load and check scroll height
|
|
703
|
+
yield page.waitForTimeout(2000);
|
|
704
|
+
yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
|
705
|
+
yield page.waitForTimeout(2000);
|
|
706
|
+
const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
|
|
707
|
+
if (currentHeight === previousHeight) {
|
|
708
|
+
debugLog('No more items loaded after Load More');
|
|
709
|
+
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
710
|
+
allResults = allResults.concat(finalResults);
|
|
711
|
+
return allResults;
|
|
712
|
+
}
|
|
713
|
+
previousHeight = currentHeight;
|
|
714
|
+
if (config.limit && allResults.length >= config.limit) {
|
|
715
|
+
allResults = allResults.slice(0, config.limit);
|
|
716
|
+
break;
|
|
717
|
+
}
|
|
642
718
|
}
|
|
719
|
+
break;
|
|
643
720
|
}
|
|
721
|
+
default: {
|
|
722
|
+
yield scrapeCurrentPage();
|
|
723
|
+
return allResults;
|
|
724
|
+
}
|
|
725
|
+
}
|
|
726
|
+
if (checkLimit())
|
|
644
727
|
break;
|
|
645
|
-
default:
|
|
646
|
-
const results = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
647
|
-
allResults = allResults.concat(results);
|
|
648
|
-
return allResults;
|
|
649
|
-
}
|
|
650
|
-
if (config.limit && allResults.length >= config.limit) {
|
|
651
|
-
allResults = allResults.slice(0, config.limit);
|
|
652
|
-
break;
|
|
653
728
|
}
|
|
654
729
|
}
|
|
730
|
+
catch (error) {
|
|
731
|
+
debugLog(`Fatal error: ${error.message}`);
|
|
732
|
+
return allResults;
|
|
733
|
+
}
|
|
655
734
|
return allResults;
|
|
656
735
|
});
|
|
657
736
|
}
|