maxun-core 0.0.13 → 0.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +19 -1
- package/build/interpret.js +62 -33
- package/package.json +1 -1
|
@@ -467,7 +467,25 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
467
467
|
}
|
|
468
468
|
else if (attribute === 'src' || attribute === 'href') {
|
|
469
469
|
const attrValue = element.getAttribute(attribute);
|
|
470
|
-
|
|
470
|
+
const dataAttr = attrValue || element.getAttribute('data-' + attribute);
|
|
471
|
+
if (!dataAttr || dataAttr.trim() === '') {
|
|
472
|
+
if (attribute === 'src') {
|
|
473
|
+
const style = window.getComputedStyle(element);
|
|
474
|
+
const bgImage = style.backgroundImage;
|
|
475
|
+
if (bgImage && bgImage !== 'none') {
|
|
476
|
+
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
|
|
477
|
+
return matches ? new URL(matches[1], baseURL).href : null;
|
|
478
|
+
}
|
|
479
|
+
}
|
|
480
|
+
return null;
|
|
481
|
+
}
|
|
482
|
+
try {
|
|
483
|
+
return new URL(dataAttr, baseURL).href;
|
|
484
|
+
}
|
|
485
|
+
catch (e) {
|
|
486
|
+
console.warn('Error creating URL from', dataAttr, e);
|
|
487
|
+
return dataAttr; // Return the original value if URL construction fails
|
|
488
|
+
}
|
|
471
489
|
}
|
|
472
490
|
return element.getAttribute(attribute);
|
|
473
491
|
}
|
package/build/interpret.js
CHANGED
|
@@ -609,64 +609,93 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
609
609
|
break;
|
|
610
610
|
}
|
|
611
611
|
let retryCount = 0;
|
|
612
|
-
let
|
|
613
|
-
|
|
612
|
+
let paginationSuccess = false;
|
|
613
|
+
// Capture basic content signature before click
|
|
614
|
+
const captureContentSignature = () => __awaiter(this, void 0, void 0, function* () {
|
|
615
|
+
return yield page.evaluate((selector) => {
|
|
616
|
+
const items = document.querySelectorAll(selector);
|
|
617
|
+
return {
|
|
618
|
+
url: window.location.href,
|
|
619
|
+
itemCount: items.length,
|
|
620
|
+
firstItems: Array.from(items).slice(0, 3).map(el => el.textContent || '').join('|')
|
|
621
|
+
};
|
|
622
|
+
}, config.listSelector);
|
|
623
|
+
});
|
|
624
|
+
const beforeSignature = yield captureContentSignature();
|
|
625
|
+
debugLog(`Before click: ${beforeSignature.itemCount} items`);
|
|
626
|
+
while (retryCount < MAX_RETRIES && !paginationSuccess) {
|
|
614
627
|
try {
|
|
615
628
|
try {
|
|
616
629
|
yield Promise.all([
|
|
617
630
|
page.waitForNavigation({
|
|
618
631
|
waitUntil: 'networkidle',
|
|
619
632
|
timeout: 15000
|
|
633
|
+
}).catch(e => {
|
|
634
|
+
throw e;
|
|
620
635
|
}),
|
|
621
636
|
button.click()
|
|
622
637
|
]);
|
|
623
|
-
|
|
638
|
+
debugLog("Navigation successful after regular click");
|
|
639
|
+
paginationSuccess = true;
|
|
624
640
|
}
|
|
625
|
-
catch (
|
|
626
|
-
debugLog(
|
|
627
|
-
|
|
628
|
-
|
|
641
|
+
catch (navError) {
|
|
642
|
+
debugLog("Regular click with navigation failed, trying dispatch event with navigation");
|
|
643
|
+
try {
|
|
644
|
+
yield Promise.all([
|
|
645
|
+
page.waitForNavigation({
|
|
646
|
+
waitUntil: 'networkidle',
|
|
647
|
+
timeout: 15000
|
|
648
|
+
}).catch(e => {
|
|
649
|
+
throw e;
|
|
650
|
+
}),
|
|
651
|
+
button.dispatchEvent('click')
|
|
652
|
+
]);
|
|
653
|
+
debugLog("Navigation successful after dispatch event");
|
|
654
|
+
paginationSuccess = true;
|
|
655
|
+
}
|
|
656
|
+
catch (dispatchNavError) {
|
|
629
657
|
try {
|
|
630
|
-
yield
|
|
631
|
-
|
|
632
|
-
waitUntil: 'networkidle',
|
|
633
|
-
timeout: 15000
|
|
634
|
-
}),
|
|
635
|
-
button.dispatchEvent('click')
|
|
636
|
-
]);
|
|
637
|
-
navigationSuccess = true;
|
|
658
|
+
yield button.click();
|
|
659
|
+
yield page.waitForTimeout(2000);
|
|
638
660
|
}
|
|
639
|
-
catch (
|
|
640
|
-
|
|
661
|
+
catch (clickError) {
|
|
662
|
+
yield button.dispatchEvent('click');
|
|
663
|
+
yield page.waitForTimeout(2000);
|
|
641
664
|
}
|
|
642
665
|
}
|
|
643
|
-
else {
|
|
644
|
-
navigationSuccess = true;
|
|
645
|
-
}
|
|
646
|
-
}
|
|
647
|
-
const newUrl = page.url();
|
|
648
|
-
if (visitedUrls.has(newUrl)) {
|
|
649
|
-
debugLog(`Detected navigation to previously visited URL ${newUrl} on attempt ${retryCount + 1}`);
|
|
650
|
-
navigationSuccess = false;
|
|
651
666
|
}
|
|
652
|
-
|
|
653
|
-
|
|
667
|
+
yield page.waitForLoadState('networkidle', { timeout: 5000 }).catch(() => { });
|
|
668
|
+
if (!paginationSuccess) {
|
|
669
|
+
const newUrl = page.url();
|
|
670
|
+
const afterSignature = yield captureContentSignature();
|
|
671
|
+
if (newUrl !== currentUrl) {
|
|
672
|
+
debugLog(`URL changed to ${newUrl}`);
|
|
673
|
+
visitedUrls.add(newUrl);
|
|
674
|
+
paginationSuccess = true;
|
|
675
|
+
}
|
|
676
|
+
else if (afterSignature.firstItems !== beforeSignature.firstItems) {
|
|
677
|
+
debugLog("Content changed without URL change");
|
|
678
|
+
paginationSuccess = true;
|
|
679
|
+
}
|
|
680
|
+
else if (afterSignature.itemCount !== beforeSignature.itemCount) {
|
|
681
|
+
debugLog(`Item count changed from ${beforeSignature.itemCount} to ${afterSignature.itemCount}`);
|
|
682
|
+
paginationSuccess = true;
|
|
683
|
+
}
|
|
654
684
|
}
|
|
655
685
|
}
|
|
656
686
|
catch (error) {
|
|
657
|
-
debugLog(`
|
|
658
|
-
navigationSuccess = false;
|
|
687
|
+
debugLog(`Pagination attempt ${retryCount + 1} failed: ${error.message}`);
|
|
659
688
|
}
|
|
660
|
-
if (!
|
|
689
|
+
if (!paginationSuccess) {
|
|
661
690
|
retryCount++;
|
|
662
691
|
if (retryCount < MAX_RETRIES) {
|
|
663
|
-
debugLog(`Retrying
|
|
692
|
+
debugLog(`Retrying pagination - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
|
664
693
|
yield page.waitForTimeout(RETRY_DELAY);
|
|
665
694
|
}
|
|
666
695
|
}
|
|
667
696
|
}
|
|
668
|
-
if (!
|
|
669
|
-
debugLog(`
|
|
697
|
+
if (!paginationSuccess) {
|
|
698
|
+
debugLog(`Pagination failed after ${MAX_RETRIES} attempts`);
|
|
670
699
|
return allResults;
|
|
671
700
|
}
|
|
672
701
|
break;
|