maxun-core 0.0.13 → 0.0.14

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -467,7 +467,25 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
467
467
  }
468
468
  else if (attribute === 'src' || attribute === 'href') {
469
469
  const attrValue = element.getAttribute(attribute);
470
- return attrValue ? new URL(attrValue, baseURL).href : null;
470
+ const dataAttr = attrValue || element.getAttribute('data-' + attribute);
471
+ if (!dataAttr || dataAttr.trim() === '') {
472
+ if (attribute === 'src') {
473
+ const style = window.getComputedStyle(element);
474
+ const bgImage = style.backgroundImage;
475
+ if (bgImage && bgImage !== 'none') {
476
+ const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
477
+ return matches ? new URL(matches[1], baseURL).href : null;
478
+ }
479
+ }
480
+ return null;
481
+ }
482
+ try {
483
+ return new URL(dataAttr, baseURL).href;
484
+ }
485
+ catch (e) {
486
+ console.warn('Error creating URL from', dataAttr, e);
487
+ return dataAttr; // Return the original value if URL construction fails
488
+ }
471
489
  }
472
490
  return element.getAttribute(attribute);
473
491
  }
@@ -609,64 +609,93 @@ class Interpreter extends events_1.EventEmitter {
609
609
  break;
610
610
  }
611
611
  let retryCount = 0;
612
- let navigationSuccess = false;
613
- while (retryCount < MAX_RETRIES && !navigationSuccess) {
612
+ let paginationSuccess = false;
613
+ // Capture basic content signature before click
614
+ const captureContentSignature = () => __awaiter(this, void 0, void 0, function* () {
615
+ return yield page.evaluate((selector) => {
616
+ const items = document.querySelectorAll(selector);
617
+ return {
618
+ url: window.location.href,
619
+ itemCount: items.length,
620
+ firstItems: Array.from(items).slice(0, 3).map(el => el.textContent || '').join('|')
621
+ };
622
+ }, config.listSelector);
623
+ });
624
+ const beforeSignature = yield captureContentSignature();
625
+ debugLog(`Before click: ${beforeSignature.itemCount} items`);
626
+ while (retryCount < MAX_RETRIES && !paginationSuccess) {
614
627
  try {
615
628
  try {
616
629
  yield Promise.all([
617
630
  page.waitForNavigation({
618
631
  waitUntil: 'networkidle',
619
632
  timeout: 15000
633
+ }).catch(e => {
634
+ throw e;
620
635
  }),
621
636
  button.click()
622
637
  ]);
623
- navigationSuccess = true;
638
+ debugLog("Navigation successful after regular click");
639
+ paginationSuccess = true;
624
640
  }
625
- catch (error) {
626
- debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
627
- // If regular click fails, try dispatchEvent
628
- if (page.url() === currentUrl) {
641
+ catch (navError) {
642
+ debugLog("Regular click with navigation failed, trying dispatch event with navigation");
643
+ try {
644
+ yield Promise.all([
645
+ page.waitForNavigation({
646
+ waitUntil: 'networkidle',
647
+ timeout: 15000
648
+ }).catch(e => {
649
+ throw e;
650
+ }),
651
+ button.dispatchEvent('click')
652
+ ]);
653
+ debugLog("Navigation successful after dispatch event");
654
+ paginationSuccess = true;
655
+ }
656
+ catch (dispatchNavError) {
629
657
  try {
630
- yield Promise.all([
631
- page.waitForNavigation({
632
- waitUntil: 'networkidle',
633
- timeout: 15000
634
- }),
635
- button.dispatchEvent('click')
636
- ]);
637
- navigationSuccess = true;
658
+ yield button.click();
659
+ yield page.waitForTimeout(2000);
638
660
  }
639
- catch (dispatchError) {
640
- debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
661
+ catch (clickError) {
662
+ yield button.dispatchEvent('click');
663
+ yield page.waitForTimeout(2000);
641
664
  }
642
665
  }
643
- else {
644
- navigationSuccess = true;
645
- }
646
- }
647
- const newUrl = page.url();
648
- if (visitedUrls.has(newUrl)) {
649
- debugLog(`Detected navigation to previously visited URL ${newUrl} on attempt ${retryCount + 1}`);
650
- navigationSuccess = false;
651
666
  }
652
- if (navigationSuccess) {
653
- yield page.waitForTimeout(1000);
667
+ yield page.waitForLoadState('networkidle', { timeout: 5000 }).catch(() => { });
668
+ if (!paginationSuccess) {
669
+ const newUrl = page.url();
670
+ const afterSignature = yield captureContentSignature();
671
+ if (newUrl !== currentUrl) {
672
+ debugLog(`URL changed to ${newUrl}`);
673
+ visitedUrls.add(newUrl);
674
+ paginationSuccess = true;
675
+ }
676
+ else if (afterSignature.firstItems !== beforeSignature.firstItems) {
677
+ debugLog("Content changed without URL change");
678
+ paginationSuccess = true;
679
+ }
680
+ else if (afterSignature.itemCount !== beforeSignature.itemCount) {
681
+ debugLog(`Item count changed from ${beforeSignature.itemCount} to ${afterSignature.itemCount}`);
682
+ paginationSuccess = true;
683
+ }
654
684
  }
655
685
  }
656
686
  catch (error) {
657
- debugLog(`Navigation attempt ${retryCount + 1} failed completely.`);
658
- navigationSuccess = false;
687
+ debugLog(`Pagination attempt ${retryCount + 1} failed: ${error.message}`);
659
688
  }
660
- if (!navigationSuccess) {
689
+ if (!paginationSuccess) {
661
690
  retryCount++;
662
691
  if (retryCount < MAX_RETRIES) {
663
- debugLog(`Retrying navigation - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
692
+ debugLog(`Retrying pagination - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
664
693
  yield page.waitForTimeout(RETRY_DELAY);
665
694
  }
666
695
  }
667
696
  }
668
- if (!navigationSuccess) {
669
- debugLog(`Navigation failed after ${MAX_RETRIES} attempts`);
697
+ if (!paginationSuccess) {
698
+ debugLog(`Pagination failed after ${MAX_RETRIES} attempts`);
670
699
  return allResults;
671
700
  }
672
701
  break;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.13",
3
+ "version": "0.0.14",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",