maxun-core 0.0.10 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/build/interpret.js +240 -170
  2. package/package.json +1 -1
@@ -465,202 +465,272 @@ class Interpreter extends events_1.EventEmitter {
465
465
  return __awaiter(this, void 0, void 0, function* () {
466
466
  let allResults = [];
467
467
  let previousHeight = 0;
468
- // track unique items per page to avoid re-scraping
469
468
  let scrapedItems = new Set();
470
- let visitedUrls = [];
471
- // Debug logging helper
469
+ let visitedUrls = new Set();
470
+ const MAX_RETRIES = 3;
471
+ const RETRY_DELAY = 1000; // 1 second delay between retries
472
472
  const debugLog = (message, ...args) => {
473
- console.log(`[Page ${visitedUrls.length + 1}] ${message}`, ...args);
473
+ console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
474
474
  };
475
- let availableSelectors = config.pagination.selector.split(',');
476
- while (true) {
477
- switch (config.pagination.type) {
478
- case 'scrollDown':
479
- yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
480
- yield page.waitForTimeout(2000);
481
- const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
482
- if (currentHeight === previousHeight) {
483
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
484
- allResults = allResults.concat(finalResults);
485
- return allResults;
486
- }
487
- previousHeight = currentHeight;
488
- break;
489
- case 'scrollUp':
490
- yield page.evaluate(() => window.scrollTo(0, 0));
491
- yield page.waitForTimeout(2000);
492
- const currentTopHeight = yield page.evaluate(() => document.documentElement.scrollTop);
493
- if (currentTopHeight === 0) {
494
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
495
- allResults = allResults.concat(finalResults);
496
- return allResults;
497
- }
498
- previousHeight = currentTopHeight;
499
- break;
500
- case 'clickNext':
501
- debugLog("Current URL:", page.url());
502
- const pageResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
503
- // Filter out already scraped items
504
- const newResults = pageResults.filter(item => {
505
- const uniqueKey = JSON.stringify(item);
506
- if (scrapedItems.has(uniqueKey))
507
- return false;
508
- scrapedItems.add(uniqueKey);
509
- return true;
475
+ const scrapeCurrentPage = () => __awaiter(this, void 0, void 0, function* () {
476
+ const results = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
477
+ const newResults = results.filter(item => {
478
+ const uniqueKey = JSON.stringify(item);
479
+ if (scrapedItems.has(uniqueKey))
480
+ return false;
481
+ scrapedItems.add(uniqueKey);
482
+ return true;
483
+ });
484
+ allResults = allResults.concat(newResults);
485
+ debugLog("Results collected:", allResults.length);
486
+ });
487
+ const checkLimit = () => {
488
+ if (config.limit && allResults.length >= config.limit) {
489
+ allResults = allResults.slice(0, config.limit);
490
+ return true;
491
+ }
492
+ return false;
493
+ };
494
+ // Enhanced button finder with retry mechanism
495
+ const findWorkingButton = (selectors, retryCount = 0) => __awaiter(this, void 0, void 0, function* () {
496
+ for (const selector of selectors) {
497
+ try {
498
+ const button = yield page.waitForSelector(selector, {
499
+ state: 'attached',
500
+ timeout: 10000 // Reduced timeout for faster checks
510
501
  });
511
- allResults = allResults.concat(newResults);
512
- debugLog("Results collected so far:", allResults.length);
513
- if (config.limit && allResults.length >= config.limit) {
514
- return allResults.slice(0, config.limit);
502
+ if (button) {
503
+ debugLog('Found working selector:', selector);
504
+ return { button, workingSelector: selector };
515
505
  }
516
- yield page.waitForLoadState('networkidle', { timeout: 30000 });
517
- yield page.waitForTimeout(2000);
518
- let checkButton = null;
519
- let workingSelector = null;
520
- // Try each selector with explicit waiting
521
- for (const selector of availableSelectors) {
522
- try {
523
- checkButton = yield page.waitForSelector(selector, {
524
- state: 'attached',
525
- timeout: 30000
526
- });
527
- if (checkButton) {
528
- workingSelector = selector;
529
- debugLog('Found working selector:', selector);
530
- break;
531
- }
532
- }
533
- catch (error) {
534
- debugLog(`Selector failed: ${selector} - ${error.message}`);
535
- }
536
- }
537
- if (!workingSelector) {
538
- debugLog('No working selector found after trying all options');
539
- return allResults;
540
- }
541
- const nextButton = yield page.$(workingSelector);
542
- if (!nextButton) {
543
- debugLog('Next button not found');
544
- return allResults;
545
- }
546
- const selectorIndex = availableSelectors.indexOf(workingSelector);
547
- availableSelectors = availableSelectors.slice(selectorIndex);
548
- try {
549
- // Store current URL to check if navigation succeeded
550
- const previousUrl = page.url();
551
- visitedUrls.push(previousUrl);
552
- // Try both click methods in sequence
553
- try {
554
- yield Promise.all([
555
- page.waitForNavigation({
556
- waitUntil: 'networkidle',
557
- timeout: 15000
558
- }),
559
- nextButton.click()
560
- ]);
561
- }
562
- catch (error) {
563
- // If we're still on the same URL, try dispatch event
564
- if (page.url() === previousUrl) {
565
- yield Promise.all([
566
- page.waitForNavigation({
567
- waitUntil: 'networkidle',
568
- timeout: 15000
569
- }),
570
- nextButton.dispatchEvent('click')
571
- ]);
572
- }
573
- }
574
- yield page.waitForLoadState('domcontentloaded');
575
- yield page.waitForLoadState('networkidle', { timeout: 30000 });
576
- const currentUrl = page.url();
577
- if (visitedUrls.includes(currentUrl)) {
578
- debugLog(`Navigation failed/Detected navigation to previously visited URL: ${currentUrl}`);
506
+ }
507
+ catch (error) {
508
+ debugLog(`Selector failed: ${selector}`);
509
+ }
510
+ }
511
+ // Implement retry mechanism when no selectors work
512
+ if (selectors.length > 0 && retryCount < MAX_RETRIES) {
513
+ debugLog(`Retry attempt ${retryCount + 1} of ${MAX_RETRIES}`);
514
+ yield page.waitForTimeout(RETRY_DELAY);
515
+ return findWorkingButton(selectors, retryCount + 1);
516
+ }
517
+ return { button: null, workingSelector: null };
518
+ });
519
+ const retryOperation = (operation, retryCount = 0) => __awaiter(this, void 0, void 0, function* () {
520
+ try {
521
+ return yield operation();
522
+ }
523
+ catch (error) {
524
+ if (retryCount < MAX_RETRIES) {
525
+ debugLog(`Retrying operation. Attempt ${retryCount + 1} of ${MAX_RETRIES}`);
526
+ yield page.waitForTimeout(RETRY_DELAY);
527
+ return retryOperation(operation, retryCount + 1);
528
+ }
529
+ debugLog(`Operation failed after ${MAX_RETRIES} retries`);
530
+ return false;
531
+ }
532
+ });
533
+ let availableSelectors = config.pagination.selector.split(',');
534
+ try {
535
+ while (true) {
536
+ // Reduced timeout for faster performance
537
+ yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => { });
538
+ switch (config.pagination.type) {
539
+ case 'scrollDown': {
540
+ yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
541
+ yield page.waitForTimeout(2000);
542
+ const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
543
+ if (currentHeight === previousHeight) {
544
+ const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
545
+ allResults = allResults.concat(finalResults);
579
546
  return allResults;
580
547
  }
581
- // Give the page a moment to stabilize after navigation
582
- yield page.waitForTimeout(1000);
583
- }
584
- catch (error) {
585
- debugLog(`Navigation failed completely: ${error.message}`);
586
- return allResults;
548
+ previousHeight = currentHeight;
549
+ break;
587
550
  }
588
- break;
589
- case 'clickLoadMore':
590
- while (true) {
591
- let checkButton = null;
592
- let workingSelector = null;
593
- for (const selector of availableSelectors) {
594
- try {
595
- checkButton = yield page.waitForSelector(selector, {
596
- state: 'attached',
597
- timeout: 30000
598
- });
599
- if (checkButton) {
600
- workingSelector = selector;
601
- debugLog('Found working selector:', selector);
602
- break;
603
- }
604
- }
605
- catch (error) {
606
- debugLog(`Load More selector failed: ${selector}`);
607
- }
608
- }
609
- if (!workingSelector) {
610
- debugLog('No working Load More selector found');
551
+ case 'scrollUp': {
552
+ yield page.evaluate(() => window.scrollTo(0, 0));
553
+ yield page.waitForTimeout(2000);
554
+ const currentTopHeight = yield page.evaluate(() => document.documentElement.scrollTop);
555
+ if (currentTopHeight === 0) {
611
556
  const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
612
557
  allResults = allResults.concat(finalResults);
613
558
  return allResults;
614
559
  }
615
- const loadMoreButton = yield page.$(workingSelector);
616
- if (!loadMoreButton) {
617
- debugLog('Load More button not found');
618
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
619
- allResults = allResults.concat(finalResults);
560
+ previousHeight = currentTopHeight;
561
+ break;
562
+ }
563
+ case 'clickNext': {
564
+ const currentUrl = page.url();
565
+ visitedUrls.add(currentUrl);
566
+ yield scrapeCurrentPage();
567
+ if (checkLimit())
620
568
  return allResults;
569
+ const { button, workingSelector } = yield findWorkingButton(availableSelectors);
570
+ if (!button || !workingSelector) {
571
+ // Final retry for navigation when no selectors work
572
+ const success = yield retryOperation(() => __awaiter(this, void 0, void 0, function* () {
573
+ try {
574
+ yield page.evaluate(() => window.history.forward());
575
+ const newUrl = page.url();
576
+ return !visitedUrls.has(newUrl);
577
+ }
578
+ catch (_a) {
579
+ return false;
580
+ }
581
+ }));
582
+ if (!success)
583
+ return allResults;
584
+ break;
621
585
  }
622
- const selectorIndex = availableSelectors.indexOf(workingSelector);
623
- availableSelectors = availableSelectors.slice(selectorIndex);
624
- try {
586
+ availableSelectors = availableSelectors.slice(availableSelectors.indexOf(workingSelector));
587
+ let retryCount = 0;
588
+ let navigationSuccess = false;
589
+ while (retryCount < MAX_RETRIES && !navigationSuccess) {
625
590
  try {
626
- yield loadMoreButton.click();
591
+ try {
592
+ yield Promise.all([
593
+ page.waitForNavigation({
594
+ waitUntil: 'networkidle',
595
+ timeout: 15000
596
+ }),
597
+ button.click()
598
+ ]);
599
+ navigationSuccess = true;
600
+ }
601
+ catch (error) {
602
+ debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
603
+ // If regular click fails, try dispatchEvent
604
+ if (page.url() === currentUrl) {
605
+ try {
606
+ yield Promise.all([
607
+ page.waitForNavigation({
608
+ waitUntil: 'networkidle',
609
+ timeout: 15000
610
+ }),
611
+ button.dispatchEvent('click')
612
+ ]);
613
+ navigationSuccess = true;
614
+ }
615
+ catch (dispatchError) {
616
+ debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
617
+ }
618
+ }
619
+ else {
620
+ navigationSuccess = true;
621
+ }
622
+ }
623
+ const newUrl = page.url();
624
+ if (visitedUrls.has(newUrl)) {
625
+ debugLog(`Detected navigation to previously visited URL ${newUrl} on attempt ${retryCount + 1}`);
626
+ navigationSuccess = false;
627
+ }
628
+ if (navigationSuccess) {
629
+ yield page.waitForTimeout(1000);
630
+ }
627
631
  }
628
632
  catch (error) {
629
- yield loadMoreButton.dispatchEvent('click');
633
+ debugLog(`Navigation attempt ${retryCount + 1} failed completely.`);
634
+ navigationSuccess = false;
635
+ }
636
+ if (!navigationSuccess) {
637
+ retryCount++;
638
+ if (retryCount < MAX_RETRIES) {
639
+ debugLog(`Retrying navigation - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
640
+ yield page.waitForTimeout(RETRY_DELAY);
641
+ }
630
642
  }
631
643
  }
632
- catch (error) {
633
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
634
- allResults = allResults.concat(finalResults);
635
- return allResults;
636
- }
637
- yield page.waitForTimeout(2000);
638
- yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
639
- yield page.waitForTimeout(2000);
640
- const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
641
- if (currentHeight === previousHeight) {
642
- debugLog('No more items loaded after Load More');
643
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
644
- allResults = allResults.concat(finalResults);
644
+ if (!navigationSuccess) {
645
+ debugLog(`Navigation failed after ${MAX_RETRIES} attempts`);
645
646
  return allResults;
646
647
  }
647
- previousHeight = currentHeight;
648
- if (config.limit && allResults.length >= config.limit) {
649
- allResults = allResults.slice(0, config.limit);
650
- break;
648
+ break;
649
+ }
650
+ case 'clickLoadMore': {
651
+ while (true) {
652
+ // Find working button with retry mechanism, consistent with clickNext
653
+ const { button: loadMoreButton, workingSelector } = yield findWorkingButton(availableSelectors);
654
+ if (!workingSelector || !loadMoreButton) {
655
+ debugLog('No working Load More selector found after retries');
656
+ const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
657
+ allResults = allResults.concat(finalResults);
658
+ return allResults;
659
+ }
660
+ // Update available selectors to start from the working one
661
+ availableSelectors = availableSelectors.slice(availableSelectors.indexOf(workingSelector));
662
+ // Implement retry mechanism for clicking the button
663
+ let retryCount = 0;
664
+ let clickSuccess = false;
665
+ while (retryCount < MAX_RETRIES && !clickSuccess) {
666
+ try {
667
+ try {
668
+ yield loadMoreButton.click();
669
+ clickSuccess = true;
670
+ }
671
+ catch (error) {
672
+ debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
673
+ // If regular click fails, try dispatchEvent
674
+ try {
675
+ yield loadMoreButton.dispatchEvent('click');
676
+ clickSuccess = true;
677
+ }
678
+ catch (dispatchError) {
679
+ debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
680
+ throw dispatchError; // Propagate error to trigger retry
681
+ }
682
+ }
683
+ if (clickSuccess) {
684
+ yield page.waitForTimeout(1000);
685
+ }
686
+ }
687
+ catch (error) {
688
+ debugLog(`Click attempt ${retryCount + 1} failed completely.`);
689
+ retryCount++;
690
+ if (retryCount < MAX_RETRIES) {
691
+ debugLog(`Retrying click - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
692
+ yield page.waitForTimeout(RETRY_DELAY);
693
+ }
694
+ }
695
+ }
696
+ if (!clickSuccess) {
697
+ debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
698
+ const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
699
+ allResults = allResults.concat(finalResults);
700
+ return allResults;
701
+ }
702
+ // Wait for content to load and check scroll height
703
+ yield page.waitForTimeout(2000);
704
+ yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
705
+ yield page.waitForTimeout(2000);
706
+ const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
707
+ if (currentHeight === previousHeight) {
708
+ debugLog('No more items loaded after Load More');
709
+ const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
710
+ allResults = allResults.concat(finalResults);
711
+ return allResults;
712
+ }
713
+ previousHeight = currentHeight;
714
+ if (config.limit && allResults.length >= config.limit) {
715
+ allResults = allResults.slice(0, config.limit);
716
+ break;
717
+ }
651
718
  }
719
+ break;
652
720
  }
721
+ default: {
722
+ yield scrapeCurrentPage();
723
+ return allResults;
724
+ }
725
+ }
726
+ if (checkLimit())
653
727
  break;
654
- default:
655
- const results = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
656
- allResults = allResults.concat(results);
657
- return allResults;
658
- }
659
- if (config.limit && allResults.length >= config.limit) {
660
- allResults = allResults.slice(0, config.limit);
661
- break;
662
728
  }
663
729
  }
730
+ catch (error) {
731
+ debugLog(`Fatal error: ${error.message}`);
732
+ return allResults;
733
+ }
664
734
  return allResults;
665
735
  });
666
736
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.10",
3
+ "version": "0.0.11",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",