maxun-core 0.0.10 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.js +240 -170
- package/package.json +1 -1
package/build/interpret.js
CHANGED
|
@@ -465,202 +465,272 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
465
465
|
return __awaiter(this, void 0, void 0, function* () {
|
|
466
466
|
let allResults = [];
|
|
467
467
|
let previousHeight = 0;
|
|
468
|
-
// track unique items per page to avoid re-scraping
|
|
469
468
|
let scrapedItems = new Set();
|
|
470
|
-
let visitedUrls =
|
|
471
|
-
|
|
469
|
+
let visitedUrls = new Set();
|
|
470
|
+
const MAX_RETRIES = 3;
|
|
471
|
+
const RETRY_DELAY = 1000; // 1 second delay between retries
|
|
472
472
|
const debugLog = (message, ...args) => {
|
|
473
|
-
console.log(`[Page ${visitedUrls.
|
|
473
|
+
console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
|
|
474
474
|
};
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
debugLog("Current URL:", page.url());
|
|
502
|
-
const pageResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
503
|
-
// Filter out already scraped items
|
|
504
|
-
const newResults = pageResults.filter(item => {
|
|
505
|
-
const uniqueKey = JSON.stringify(item);
|
|
506
|
-
if (scrapedItems.has(uniqueKey))
|
|
507
|
-
return false;
|
|
508
|
-
scrapedItems.add(uniqueKey);
|
|
509
|
-
return true;
|
|
475
|
+
const scrapeCurrentPage = () => __awaiter(this, void 0, void 0, function* () {
|
|
476
|
+
const results = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
477
|
+
const newResults = results.filter(item => {
|
|
478
|
+
const uniqueKey = JSON.stringify(item);
|
|
479
|
+
if (scrapedItems.has(uniqueKey))
|
|
480
|
+
return false;
|
|
481
|
+
scrapedItems.add(uniqueKey);
|
|
482
|
+
return true;
|
|
483
|
+
});
|
|
484
|
+
allResults = allResults.concat(newResults);
|
|
485
|
+
debugLog("Results collected:", allResults.length);
|
|
486
|
+
});
|
|
487
|
+
const checkLimit = () => {
|
|
488
|
+
if (config.limit && allResults.length >= config.limit) {
|
|
489
|
+
allResults = allResults.slice(0, config.limit);
|
|
490
|
+
return true;
|
|
491
|
+
}
|
|
492
|
+
return false;
|
|
493
|
+
};
|
|
494
|
+
// Enhanced button finder with retry mechanism
|
|
495
|
+
const findWorkingButton = (selectors, retryCount = 0) => __awaiter(this, void 0, void 0, function* () {
|
|
496
|
+
for (const selector of selectors) {
|
|
497
|
+
try {
|
|
498
|
+
const button = yield page.waitForSelector(selector, {
|
|
499
|
+
state: 'attached',
|
|
500
|
+
timeout: 10000 // Reduced timeout for faster checks
|
|
510
501
|
});
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
return allResults.slice(0, config.limit);
|
|
502
|
+
if (button) {
|
|
503
|
+
debugLog('Found working selector:', selector);
|
|
504
|
+
return { button, workingSelector: selector };
|
|
515
505
|
}
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
yield
|
|
555
|
-
|
|
556
|
-
waitUntil: 'networkidle',
|
|
557
|
-
timeout: 15000
|
|
558
|
-
}),
|
|
559
|
-
nextButton.click()
|
|
560
|
-
]);
|
|
561
|
-
}
|
|
562
|
-
catch (error) {
|
|
563
|
-
// If we're still on the same URL, try dispatch event
|
|
564
|
-
if (page.url() === previousUrl) {
|
|
565
|
-
yield Promise.all([
|
|
566
|
-
page.waitForNavigation({
|
|
567
|
-
waitUntil: 'networkidle',
|
|
568
|
-
timeout: 15000
|
|
569
|
-
}),
|
|
570
|
-
nextButton.dispatchEvent('click')
|
|
571
|
-
]);
|
|
572
|
-
}
|
|
573
|
-
}
|
|
574
|
-
yield page.waitForLoadState('domcontentloaded');
|
|
575
|
-
yield page.waitForLoadState('networkidle', { timeout: 30000 });
|
|
576
|
-
const currentUrl = page.url();
|
|
577
|
-
if (visitedUrls.includes(currentUrl)) {
|
|
578
|
-
debugLog(`Navigation failed/Detected navigation to previously visited URL: ${currentUrl}`);
|
|
506
|
+
}
|
|
507
|
+
catch (error) {
|
|
508
|
+
debugLog(`Selector failed: ${selector}`);
|
|
509
|
+
}
|
|
510
|
+
}
|
|
511
|
+
// Implement retry mechanism when no selectors work
|
|
512
|
+
if (selectors.length > 0 && retryCount < MAX_RETRIES) {
|
|
513
|
+
debugLog(`Retry attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
|
514
|
+
yield page.waitForTimeout(RETRY_DELAY);
|
|
515
|
+
return findWorkingButton(selectors, retryCount + 1);
|
|
516
|
+
}
|
|
517
|
+
return { button: null, workingSelector: null };
|
|
518
|
+
});
|
|
519
|
+
const retryOperation = (operation, retryCount = 0) => __awaiter(this, void 0, void 0, function* () {
|
|
520
|
+
try {
|
|
521
|
+
return yield operation();
|
|
522
|
+
}
|
|
523
|
+
catch (error) {
|
|
524
|
+
if (retryCount < MAX_RETRIES) {
|
|
525
|
+
debugLog(`Retrying operation. Attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
|
526
|
+
yield page.waitForTimeout(RETRY_DELAY);
|
|
527
|
+
return retryOperation(operation, retryCount + 1);
|
|
528
|
+
}
|
|
529
|
+
debugLog(`Operation failed after ${MAX_RETRIES} retries`);
|
|
530
|
+
return false;
|
|
531
|
+
}
|
|
532
|
+
});
|
|
533
|
+
let availableSelectors = config.pagination.selector.split(',');
|
|
534
|
+
try {
|
|
535
|
+
while (true) {
|
|
536
|
+
// Reduced timeout for faster performance
|
|
537
|
+
yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => { });
|
|
538
|
+
switch (config.pagination.type) {
|
|
539
|
+
case 'scrollDown': {
|
|
540
|
+
yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
|
541
|
+
yield page.waitForTimeout(2000);
|
|
542
|
+
const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
|
|
543
|
+
if (currentHeight === previousHeight) {
|
|
544
|
+
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
545
|
+
allResults = allResults.concat(finalResults);
|
|
579
546
|
return allResults;
|
|
580
547
|
}
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
}
|
|
584
|
-
catch (error) {
|
|
585
|
-
debugLog(`Navigation failed completely: ${error.message}`);
|
|
586
|
-
return allResults;
|
|
548
|
+
previousHeight = currentHeight;
|
|
549
|
+
break;
|
|
587
550
|
}
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
for (const selector of availableSelectors) {
|
|
594
|
-
try {
|
|
595
|
-
checkButton = yield page.waitForSelector(selector, {
|
|
596
|
-
state: 'attached',
|
|
597
|
-
timeout: 30000
|
|
598
|
-
});
|
|
599
|
-
if (checkButton) {
|
|
600
|
-
workingSelector = selector;
|
|
601
|
-
debugLog('Found working selector:', selector);
|
|
602
|
-
break;
|
|
603
|
-
}
|
|
604
|
-
}
|
|
605
|
-
catch (error) {
|
|
606
|
-
debugLog(`Load More selector failed: ${selector}`);
|
|
607
|
-
}
|
|
608
|
-
}
|
|
609
|
-
if (!workingSelector) {
|
|
610
|
-
debugLog('No working Load More selector found');
|
|
551
|
+
case 'scrollUp': {
|
|
552
|
+
yield page.evaluate(() => window.scrollTo(0, 0));
|
|
553
|
+
yield page.waitForTimeout(2000);
|
|
554
|
+
const currentTopHeight = yield page.evaluate(() => document.documentElement.scrollTop);
|
|
555
|
+
if (currentTopHeight === 0) {
|
|
611
556
|
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
612
557
|
allResults = allResults.concat(finalResults);
|
|
613
558
|
return allResults;
|
|
614
559
|
}
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
560
|
+
previousHeight = currentTopHeight;
|
|
561
|
+
break;
|
|
562
|
+
}
|
|
563
|
+
case 'clickNext': {
|
|
564
|
+
const currentUrl = page.url();
|
|
565
|
+
visitedUrls.add(currentUrl);
|
|
566
|
+
yield scrapeCurrentPage();
|
|
567
|
+
if (checkLimit())
|
|
620
568
|
return allResults;
|
|
569
|
+
const { button, workingSelector } = yield findWorkingButton(availableSelectors);
|
|
570
|
+
if (!button || !workingSelector) {
|
|
571
|
+
// Final retry for navigation when no selectors work
|
|
572
|
+
const success = yield retryOperation(() => __awaiter(this, void 0, void 0, function* () {
|
|
573
|
+
try {
|
|
574
|
+
yield page.evaluate(() => window.history.forward());
|
|
575
|
+
const newUrl = page.url();
|
|
576
|
+
return !visitedUrls.has(newUrl);
|
|
577
|
+
}
|
|
578
|
+
catch (_a) {
|
|
579
|
+
return false;
|
|
580
|
+
}
|
|
581
|
+
}));
|
|
582
|
+
if (!success)
|
|
583
|
+
return allResults;
|
|
584
|
+
break;
|
|
621
585
|
}
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
586
|
+
availableSelectors = availableSelectors.slice(availableSelectors.indexOf(workingSelector));
|
|
587
|
+
let retryCount = 0;
|
|
588
|
+
let navigationSuccess = false;
|
|
589
|
+
while (retryCount < MAX_RETRIES && !navigationSuccess) {
|
|
625
590
|
try {
|
|
626
|
-
|
|
591
|
+
try {
|
|
592
|
+
yield Promise.all([
|
|
593
|
+
page.waitForNavigation({
|
|
594
|
+
waitUntil: 'networkidle',
|
|
595
|
+
timeout: 15000
|
|
596
|
+
}),
|
|
597
|
+
button.click()
|
|
598
|
+
]);
|
|
599
|
+
navigationSuccess = true;
|
|
600
|
+
}
|
|
601
|
+
catch (error) {
|
|
602
|
+
debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
|
|
603
|
+
// If regular click fails, try dispatchEvent
|
|
604
|
+
if (page.url() === currentUrl) {
|
|
605
|
+
try {
|
|
606
|
+
yield Promise.all([
|
|
607
|
+
page.waitForNavigation({
|
|
608
|
+
waitUntil: 'networkidle',
|
|
609
|
+
timeout: 15000
|
|
610
|
+
}),
|
|
611
|
+
button.dispatchEvent('click')
|
|
612
|
+
]);
|
|
613
|
+
navigationSuccess = true;
|
|
614
|
+
}
|
|
615
|
+
catch (dispatchError) {
|
|
616
|
+
debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
else {
|
|
620
|
+
navigationSuccess = true;
|
|
621
|
+
}
|
|
622
|
+
}
|
|
623
|
+
const newUrl = page.url();
|
|
624
|
+
if (visitedUrls.has(newUrl)) {
|
|
625
|
+
debugLog(`Detected navigation to previously visited URL ${newUrl} on attempt ${retryCount + 1}`);
|
|
626
|
+
navigationSuccess = false;
|
|
627
|
+
}
|
|
628
|
+
if (navigationSuccess) {
|
|
629
|
+
yield page.waitForTimeout(1000);
|
|
630
|
+
}
|
|
627
631
|
}
|
|
628
632
|
catch (error) {
|
|
629
|
-
|
|
633
|
+
debugLog(`Navigation attempt ${retryCount + 1} failed completely.`);
|
|
634
|
+
navigationSuccess = false;
|
|
635
|
+
}
|
|
636
|
+
if (!navigationSuccess) {
|
|
637
|
+
retryCount++;
|
|
638
|
+
if (retryCount < MAX_RETRIES) {
|
|
639
|
+
debugLog(`Retrying navigation - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
|
640
|
+
yield page.waitForTimeout(RETRY_DELAY);
|
|
641
|
+
}
|
|
630
642
|
}
|
|
631
643
|
}
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
allResults = allResults.concat(finalResults);
|
|
635
|
-
return allResults;
|
|
636
|
-
}
|
|
637
|
-
yield page.waitForTimeout(2000);
|
|
638
|
-
yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
|
639
|
-
yield page.waitForTimeout(2000);
|
|
640
|
-
const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
|
|
641
|
-
if (currentHeight === previousHeight) {
|
|
642
|
-
debugLog('No more items loaded after Load More');
|
|
643
|
-
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
644
|
-
allResults = allResults.concat(finalResults);
|
|
644
|
+
if (!navigationSuccess) {
|
|
645
|
+
debugLog(`Navigation failed after ${MAX_RETRIES} attempts`);
|
|
645
646
|
return allResults;
|
|
646
647
|
}
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
648
|
+
break;
|
|
649
|
+
}
|
|
650
|
+
case 'clickLoadMore': {
|
|
651
|
+
while (true) {
|
|
652
|
+
// Find working button with retry mechanism, consistent with clickNext
|
|
653
|
+
const { button: loadMoreButton, workingSelector } = yield findWorkingButton(availableSelectors);
|
|
654
|
+
if (!workingSelector || !loadMoreButton) {
|
|
655
|
+
debugLog('No working Load More selector found after retries');
|
|
656
|
+
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
657
|
+
allResults = allResults.concat(finalResults);
|
|
658
|
+
return allResults;
|
|
659
|
+
}
|
|
660
|
+
// Update available selectors to start from the working one
|
|
661
|
+
availableSelectors = availableSelectors.slice(availableSelectors.indexOf(workingSelector));
|
|
662
|
+
// Implement retry mechanism for clicking the button
|
|
663
|
+
let retryCount = 0;
|
|
664
|
+
let clickSuccess = false;
|
|
665
|
+
while (retryCount < MAX_RETRIES && !clickSuccess) {
|
|
666
|
+
try {
|
|
667
|
+
try {
|
|
668
|
+
yield loadMoreButton.click();
|
|
669
|
+
clickSuccess = true;
|
|
670
|
+
}
|
|
671
|
+
catch (error) {
|
|
672
|
+
debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
|
|
673
|
+
// If regular click fails, try dispatchEvent
|
|
674
|
+
try {
|
|
675
|
+
yield loadMoreButton.dispatchEvent('click');
|
|
676
|
+
clickSuccess = true;
|
|
677
|
+
}
|
|
678
|
+
catch (dispatchError) {
|
|
679
|
+
debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
|
|
680
|
+
throw dispatchError; // Propagate error to trigger retry
|
|
681
|
+
}
|
|
682
|
+
}
|
|
683
|
+
if (clickSuccess) {
|
|
684
|
+
yield page.waitForTimeout(1000);
|
|
685
|
+
}
|
|
686
|
+
}
|
|
687
|
+
catch (error) {
|
|
688
|
+
debugLog(`Click attempt ${retryCount + 1} failed completely.`);
|
|
689
|
+
retryCount++;
|
|
690
|
+
if (retryCount < MAX_RETRIES) {
|
|
691
|
+
debugLog(`Retrying click - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
|
692
|
+
yield page.waitForTimeout(RETRY_DELAY);
|
|
693
|
+
}
|
|
694
|
+
}
|
|
695
|
+
}
|
|
696
|
+
if (!clickSuccess) {
|
|
697
|
+
debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
|
|
698
|
+
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
699
|
+
allResults = allResults.concat(finalResults);
|
|
700
|
+
return allResults;
|
|
701
|
+
}
|
|
702
|
+
// Wait for content to load and check scroll height
|
|
703
|
+
yield page.waitForTimeout(2000);
|
|
704
|
+
yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
|
705
|
+
yield page.waitForTimeout(2000);
|
|
706
|
+
const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
|
|
707
|
+
if (currentHeight === previousHeight) {
|
|
708
|
+
debugLog('No more items loaded after Load More');
|
|
709
|
+
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
710
|
+
allResults = allResults.concat(finalResults);
|
|
711
|
+
return allResults;
|
|
712
|
+
}
|
|
713
|
+
previousHeight = currentHeight;
|
|
714
|
+
if (config.limit && allResults.length >= config.limit) {
|
|
715
|
+
allResults = allResults.slice(0, config.limit);
|
|
716
|
+
break;
|
|
717
|
+
}
|
|
651
718
|
}
|
|
719
|
+
break;
|
|
652
720
|
}
|
|
721
|
+
default: {
|
|
722
|
+
yield scrapeCurrentPage();
|
|
723
|
+
return allResults;
|
|
724
|
+
}
|
|
725
|
+
}
|
|
726
|
+
if (checkLimit())
|
|
653
727
|
break;
|
|
654
|
-
default:
|
|
655
|
-
const results = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
656
|
-
allResults = allResults.concat(results);
|
|
657
|
-
return allResults;
|
|
658
|
-
}
|
|
659
|
-
if (config.limit && allResults.length >= config.limit) {
|
|
660
|
-
allResults = allResults.slice(0, config.limit);
|
|
661
|
-
break;
|
|
662
728
|
}
|
|
663
729
|
}
|
|
730
|
+
catch (error) {
|
|
731
|
+
debugLog(`Fatal error: ${error.message}`);
|
|
732
|
+
return allResults;
|
|
733
|
+
}
|
|
664
734
|
return allResults;
|
|
665
735
|
});
|
|
666
736
|
}
|