maxun-core 0.0.9 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/build/interpret.js +251 -172
  2. package/package.json +1 -1
@@ -404,6 +404,12 @@ class Interpreter extends events_1.EventEmitter {
404
404
  };
405
405
  const executeAction = (invokee, methodName, args) => __awaiter(this, void 0, void 0, function* () {
406
406
  console.log("Executing action:", methodName, args);
407
+ if (methodName === 'press' || methodName === 'type') {
408
+ // Extract only the first two arguments for these methods
409
+ const limitedArgs = Array.isArray(args) ? args.slice(0, 2) : [args];
410
+ yield invokee[methodName](...limitedArgs);
411
+ return;
412
+ }
407
413
  if (!args || Array.isArray(args)) {
408
414
  yield invokee[methodName](...(args !== null && args !== void 0 ? args : []));
409
415
  }
@@ -459,199 +465,272 @@ class Interpreter extends events_1.EventEmitter {
459
465
  return __awaiter(this, void 0, void 0, function* () {
460
466
  let allResults = [];
461
467
  let previousHeight = 0;
462
- // track unique items per page to avoid re-scraping
463
468
  let scrapedItems = new Set();
464
- let visitedUrls = [];
465
- let availableSelectors = config.pagination.selector.split(',');
466
- while (true) {
467
- switch (config.pagination.type) {
468
- case 'scrollDown':
469
- yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
470
- yield page.waitForTimeout(2000);
471
- const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
472
- if (currentHeight === previousHeight) {
473
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
474
- allResults = allResults.concat(finalResults);
475
- return allResults;
476
- }
477
- previousHeight = currentHeight;
478
- break;
479
- case 'scrollUp':
480
- yield page.evaluate(() => window.scrollTo(0, 0));
481
- yield page.waitForTimeout(2000);
482
- const currentTopHeight = yield page.evaluate(() => document.documentElement.scrollTop);
483
- if (currentTopHeight === 0) {
484
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
485
- allResults = allResults.concat(finalResults);
486
- return allResults;
487
- }
488
- previousHeight = currentTopHeight;
489
- break;
490
- case 'clickNext':
491
- console.log("Page URL:", page.url());
492
- const pageResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
493
- // console.log("Page results:", pageResults);
494
- // Filter out already scraped items
495
- const newResults = pageResults.filter(item => {
496
- const uniqueKey = JSON.stringify(item);
497
- if (scrapedItems.has(uniqueKey))
498
- return false; // Ignore if already scraped
499
- scrapedItems.add(uniqueKey); // Mark as scraped
500
- return true;
469
+ let visitedUrls = new Set();
470
+ const MAX_RETRIES = 3;
471
+ const RETRY_DELAY = 1000; // 1 second delay between retries
472
+ const debugLog = (message, ...args) => {
473
+ console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
474
+ };
475
+ const scrapeCurrentPage = () => __awaiter(this, void 0, void 0, function* () {
476
+ const results = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
477
+ const newResults = results.filter(item => {
478
+ const uniqueKey = JSON.stringify(item);
479
+ if (scrapedItems.has(uniqueKey))
480
+ return false;
481
+ scrapedItems.add(uniqueKey);
482
+ return true;
483
+ });
484
+ allResults = allResults.concat(newResults);
485
+ debugLog("Results collected:", allResults.length);
486
+ });
487
+ const checkLimit = () => {
488
+ if (config.limit && allResults.length >= config.limit) {
489
+ allResults = allResults.slice(0, config.limit);
490
+ return true;
491
+ }
492
+ return false;
493
+ };
494
+ // Enhanced button finder with retry mechanism
495
+ const findWorkingButton = (selectors, retryCount = 0) => __awaiter(this, void 0, void 0, function* () {
496
+ for (const selector of selectors) {
497
+ try {
498
+ const button = yield page.waitForSelector(selector, {
499
+ state: 'attached',
500
+ timeout: 10000 // Reduced timeout for faster checks
501
501
  });
502
- allResults = allResults.concat(newResults);
503
- console.log("Results so far:", allResults.length);
504
- if (config.limit && allResults.length >= config.limit) {
505
- return allResults.slice(0, config.limit);
502
+ if (button) {
503
+ debugLog('Found working selector:', selector);
504
+ return { button, workingSelector: selector };
506
505
  }
507
- let checkButton = null;
508
- let workingSelector = null;
509
- for (let i = 0; i < availableSelectors.length; i++) {
510
- const selector = availableSelectors[i];
511
- try {
512
- // Wait for selector with a short timeout
513
- checkButton = yield page.waitForSelector(selector, { state: 'attached' });
514
- if (checkButton) {
515
- workingSelector = selector;
516
- break;
517
- }
518
- }
519
- catch (error) {
520
- console.log(`Selector failed: ${selector}`);
506
+ }
507
+ catch (error) {
508
+ debugLog(`Selector failed: ${selector}`);
509
+ }
510
+ }
511
+ // Implement retry mechanism when no selectors work
512
+ if (selectors.length > 0 && retryCount < MAX_RETRIES) {
513
+ debugLog(`Retry attempt ${retryCount + 1} of ${MAX_RETRIES}`);
514
+ yield page.waitForTimeout(RETRY_DELAY);
515
+ return findWorkingButton(selectors, retryCount + 1);
516
+ }
517
+ return { button: null, workingSelector: null };
518
+ });
519
+ const retryOperation = (operation, retryCount = 0) => __awaiter(this, void 0, void 0, function* () {
520
+ try {
521
+ return yield operation();
522
+ }
523
+ catch (error) {
524
+ if (retryCount < MAX_RETRIES) {
525
+ debugLog(`Retrying operation. Attempt ${retryCount + 1} of ${MAX_RETRIES}`);
526
+ yield page.waitForTimeout(RETRY_DELAY);
527
+ return retryOperation(operation, retryCount + 1);
528
+ }
529
+ debugLog(`Operation failed after ${MAX_RETRIES} retries`);
530
+ return false;
531
+ }
532
+ });
533
+ let availableSelectors = config.pagination.selector.split(',');
534
+ try {
535
+ while (true) {
536
+ // Reduced timeout for faster performance
537
+ yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => { });
538
+ switch (config.pagination.type) {
539
+ case 'scrollDown': {
540
+ yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
541
+ yield page.waitForTimeout(2000);
542
+ const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
543
+ if (currentHeight === previousHeight) {
544
+ const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
545
+ allResults = allResults.concat(finalResults);
546
+ return allResults;
521
547
  }
548
+ previousHeight = currentHeight;
549
+ break;
522
550
  }
523
- if (!workingSelector) {
524
- return allResults;
525
- }
526
- // const nextButton = await page.$(config.pagination.selector);
527
- const nextButton = yield page.$(workingSelector);
528
- if (!nextButton) {
529
- return allResults; // No more pages to scrape
530
- }
531
- const selectorIndex = availableSelectors.indexOf(workingSelector);
532
- availableSelectors = availableSelectors.slice(selectorIndex);
533
- // await Promise.all([
534
- // nextButton.dispatchEvent('click'),
535
- // page.waitForNavigation({ waitUntil: 'networkidle' })
536
- // ]);
537
- const previousUrl = page.url();
538
- visitedUrls.push(previousUrl);
539
- try {
540
- // Try both click methods simultaneously
541
- yield Promise.race([
542
- Promise.all([
543
- page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
544
- nextButton.click()
545
- ]),
546
- Promise.all([
547
- page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
548
- nextButton.dispatchEvent('click')
549
- ])
550
- ]);
551
- }
552
- catch (error) {
553
- // Verify if navigation actually succeeded
554
- const currentUrl = page.url();
555
- if (currentUrl === previousUrl) {
556
- console.log("Previous URL same as current URL. Navigation failed.");
551
+ case 'scrollUp': {
552
+ yield page.evaluate(() => window.scrollTo(0, 0));
553
+ yield page.waitForTimeout(2000);
554
+ const currentTopHeight = yield page.evaluate(() => document.documentElement.scrollTop);
555
+ if (currentTopHeight === 0) {
556
+ const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
557
+ allResults = allResults.concat(finalResults);
558
+ return allResults;
557
559
  }
560
+ previousHeight = currentTopHeight;
561
+ break;
558
562
  }
559
- const currentUrl = page.url();
560
- if (visitedUrls.includes(currentUrl)) {
561
- console.log(`Detected navigation to a previously visited URL: ${currentUrl}`);
562
- // Extract the current page number from the URL
563
- const match = currentUrl.match(/\d+/);
564
- if (match) {
565
- const currentNumber = match[0];
566
- // Use visitedUrls.length + 1 as the next page number
567
- const nextNumber = visitedUrls.length + 1;
568
- // Create new URL by replacing the current number with the next number
569
- const nextUrl = currentUrl.replace(currentNumber, nextNumber.toString());
570
- console.log(`Navigating to constructed URL: ${nextUrl}`);
571
- // Navigate to the next page
572
- yield Promise.all([
573
- page.waitForNavigation({ waitUntil: 'networkidle' }),
574
- page.goto(nextUrl)
575
- ]);
563
+ case 'clickNext': {
564
+ const currentUrl = page.url();
565
+ visitedUrls.add(currentUrl);
566
+ yield scrapeCurrentPage();
567
+ if (checkLimit())
568
+ return allResults;
569
+ const { button, workingSelector } = yield findWorkingButton(availableSelectors);
570
+ if (!button || !workingSelector) {
571
+ // Final retry for navigation when no selectors work
572
+ const success = yield retryOperation(() => __awaiter(this, void 0, void 0, function* () {
573
+ try {
574
+ yield page.evaluate(() => window.history.forward());
575
+ const newUrl = page.url();
576
+ return !visitedUrls.has(newUrl);
577
+ }
578
+ catch (_a) {
579
+ return false;
580
+ }
581
+ }));
582
+ if (!success)
583
+ return allResults;
584
+ break;
576
585
  }
577
- }
578
- // Give the page a moment to stabilize after navigation
579
- yield page.waitForTimeout(1000);
580
- break;
581
- case 'clickLoadMore':
582
- while (true) {
583
- let checkButton = null;
584
- let workingSelector = null;
585
- for (let i = 0; i < availableSelectors.length; i++) {
586
- const selector = availableSelectors[i];
586
+ availableSelectors = availableSelectors.slice(availableSelectors.indexOf(workingSelector));
587
+ let retryCount = 0;
588
+ let navigationSuccess = false;
589
+ while (retryCount < MAX_RETRIES && !navigationSuccess) {
587
590
  try {
588
- // Wait for selector with a short timeout
589
- checkButton = yield page.waitForSelector(selector, { state: 'attached' });
590
- if (checkButton) {
591
- workingSelector = selector;
592
- break;
591
+ try {
592
+ yield Promise.all([
593
+ page.waitForNavigation({
594
+ waitUntil: 'networkidle',
595
+ timeout: 15000
596
+ }),
597
+ button.click()
598
+ ]);
599
+ navigationSuccess = true;
600
+ }
601
+ catch (error) {
602
+ debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
603
+ // If regular click fails, try dispatchEvent
604
+ if (page.url() === currentUrl) {
605
+ try {
606
+ yield Promise.all([
607
+ page.waitForNavigation({
608
+ waitUntil: 'networkidle',
609
+ timeout: 15000
610
+ }),
611
+ button.dispatchEvent('click')
612
+ ]);
613
+ navigationSuccess = true;
614
+ }
615
+ catch (dispatchError) {
616
+ debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
617
+ }
618
+ }
619
+ else {
620
+ navigationSuccess = true;
621
+ }
622
+ }
623
+ const newUrl = page.url();
624
+ if (visitedUrls.has(newUrl)) {
625
+ debugLog(`Detected navigation to previously visited URL ${newUrl} on attempt ${retryCount + 1}`);
626
+ navigationSuccess = false;
627
+ }
628
+ if (navigationSuccess) {
629
+ yield page.waitForTimeout(1000);
593
630
  }
594
631
  }
595
632
  catch (error) {
596
- console.log(`Selector failed: ${selector}`);
633
+ debugLog(`Navigation attempt ${retryCount + 1} failed completely.`);
634
+ navigationSuccess = false;
635
+ }
636
+ if (!navigationSuccess) {
637
+ retryCount++;
638
+ if (retryCount < MAX_RETRIES) {
639
+ debugLog(`Retrying navigation - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
640
+ yield page.waitForTimeout(RETRY_DELAY);
641
+ }
597
642
  }
598
643
  }
599
- if (!workingSelector) {
600
- // No more working selectors available, so scrape the remaining items
601
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
602
- allResults = allResults.concat(finalResults);
603
- return allResults;
604
- }
605
- const loadMoreButton = yield page.$(workingSelector);
606
- if (!loadMoreButton) {
607
- // No more "Load More" button, so scrape the remaining items
608
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
609
- allResults = allResults.concat(finalResults);
644
+ if (!navigationSuccess) {
645
+ debugLog(`Navigation failed after ${MAX_RETRIES} attempts`);
610
646
  return allResults;
611
647
  }
612
- const selectorIndex = availableSelectors.indexOf(workingSelector);
613
- availableSelectors = availableSelectors.slice(selectorIndex);
614
- // Click the 'Load More' button to load additional items
615
- // await loadMoreButton.dispatchEvent('click');
616
- try {
617
- yield Promise.race([
618
- loadMoreButton.click(),
619
- loadMoreButton.dispatchEvent('click')
620
- ]);
621
- }
622
- catch (error) {
623
- console.log('Both click attempts failed');
624
- }
625
- yield page.waitForTimeout(2000); // Wait for new items to load
626
- // After clicking 'Load More', scroll down to load more items
627
- yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
628
- yield page.waitForTimeout(2000);
629
- // Check if more items are available
630
- const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
631
- if (currentHeight === previousHeight) {
632
- // No more items loaded, return the scraped results
633
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
634
- allResults = allResults.concat(finalResults);
635
- return allResults;
636
- }
637
- previousHeight = currentHeight;
638
- if (config.limit && allResults.length >= config.limit) {
639
- // If limit is set and reached, return the limited results
640
- allResults = allResults.slice(0, config.limit);
641
- break;
648
+ break;
649
+ }
650
+ case 'clickLoadMore': {
651
+ while (true) {
652
+ // Find working button with retry mechanism, consistent with clickNext
653
+ const { button: loadMoreButton, workingSelector } = yield findWorkingButton(availableSelectors);
654
+ if (!workingSelector || !loadMoreButton) {
655
+ debugLog('No working Load More selector found after retries');
656
+ const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
657
+ allResults = allResults.concat(finalResults);
658
+ return allResults;
659
+ }
660
+ // Update available selectors to start from the working one
661
+ availableSelectors = availableSelectors.slice(availableSelectors.indexOf(workingSelector));
662
+ // Implement retry mechanism for clicking the button
663
+ let retryCount = 0;
664
+ let clickSuccess = false;
665
+ while (retryCount < MAX_RETRIES && !clickSuccess) {
666
+ try {
667
+ try {
668
+ yield loadMoreButton.click();
669
+ clickSuccess = true;
670
+ }
671
+ catch (error) {
672
+ debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
673
+ // If regular click fails, try dispatchEvent
674
+ try {
675
+ yield loadMoreButton.dispatchEvent('click');
676
+ clickSuccess = true;
677
+ }
678
+ catch (dispatchError) {
679
+ debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
680
+ throw dispatchError; // Propagate error to trigger retry
681
+ }
682
+ }
683
+ if (clickSuccess) {
684
+ yield page.waitForTimeout(1000);
685
+ }
686
+ }
687
+ catch (error) {
688
+ debugLog(`Click attempt ${retryCount + 1} failed completely.`);
689
+ retryCount++;
690
+ if (retryCount < MAX_RETRIES) {
691
+ debugLog(`Retrying click - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
692
+ yield page.waitForTimeout(RETRY_DELAY);
693
+ }
694
+ }
695
+ }
696
+ if (!clickSuccess) {
697
+ debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
698
+ const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
699
+ allResults = allResults.concat(finalResults);
700
+ return allResults;
701
+ }
702
+ // Wait for content to load and check scroll height
703
+ yield page.waitForTimeout(2000);
704
+ yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
705
+ yield page.waitForTimeout(2000);
706
+ const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
707
+ if (currentHeight === previousHeight) {
708
+ debugLog('No more items loaded after Load More');
709
+ const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
710
+ allResults = allResults.concat(finalResults);
711
+ return allResults;
712
+ }
713
+ previousHeight = currentHeight;
714
+ if (config.limit && allResults.length >= config.limit) {
715
+ allResults = allResults.slice(0, config.limit);
716
+ break;
717
+ }
642
718
  }
719
+ break;
643
720
  }
721
+ default: {
722
+ yield scrapeCurrentPage();
723
+ return allResults;
724
+ }
725
+ }
726
+ if (checkLimit())
644
727
  break;
645
- default:
646
- const results = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
647
- allResults = allResults.concat(results);
648
- return allResults;
649
- }
650
- if (config.limit && allResults.length >= config.limit) {
651
- allResults = allResults.slice(0, config.limit);
652
- break;
653
728
  }
654
729
  }
730
+ catch (error) {
731
+ debugLog(`Fatal error: ${error.message}`);
732
+ return allResults;
733
+ }
655
734
  return allResults;
656
735
  });
657
736
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.9",
3
+ "version": "0.0.11",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",