maxun-core 0.0.13 → 0.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +19 -1
- package/build/interpret.js +94 -39
- package/package.json +1 -1
|
@@ -467,7 +467,25 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
467
467
|
}
|
|
468
468
|
else if (attribute === 'src' || attribute === 'href') {
|
|
469
469
|
const attrValue = element.getAttribute(attribute);
|
|
470
|
-
|
|
470
|
+
const dataAttr = attrValue || element.getAttribute('data-' + attribute);
|
|
471
|
+
if (!dataAttr || dataAttr.trim() === '') {
|
|
472
|
+
if (attribute === 'src') {
|
|
473
|
+
const style = window.getComputedStyle(element);
|
|
474
|
+
const bgImage = style.backgroundImage;
|
|
475
|
+
if (bgImage && bgImage !== 'none') {
|
|
476
|
+
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
|
|
477
|
+
return matches ? new URL(matches[1], baseURL).href : null;
|
|
478
|
+
}
|
|
479
|
+
}
|
|
480
|
+
return null;
|
|
481
|
+
}
|
|
482
|
+
try {
|
|
483
|
+
return new URL(dataAttr, baseURL).href;
|
|
484
|
+
}
|
|
485
|
+
catch (e) {
|
|
486
|
+
console.warn('Error creating URL from', dataAttr, e);
|
|
487
|
+
return dataAttr; // Return the original value if URL construction fails
|
|
488
|
+
}
|
|
471
489
|
}
|
|
472
490
|
return element.getAttribute(attribute);
|
|
473
491
|
}
|
package/build/interpret.js
CHANGED
|
@@ -477,6 +477,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
477
477
|
let visitedUrls = new Set();
|
|
478
478
|
const MAX_RETRIES = 3;
|
|
479
479
|
const RETRY_DELAY = 1000; // 1 second delay between retries
|
|
480
|
+
const MAX_UNCHANGED_RESULTS = 5;
|
|
480
481
|
const debugLog = (message, ...args) => {
|
|
481
482
|
console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
|
|
482
483
|
};
|
|
@@ -555,30 +556,55 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
555
556
|
}
|
|
556
557
|
});
|
|
557
558
|
let availableSelectors = config.pagination.selector.split(',');
|
|
559
|
+
let unchangedResultCounter = 0;
|
|
558
560
|
try {
|
|
559
561
|
while (true) {
|
|
560
|
-
// Reduced timeout for faster performance
|
|
561
|
-
yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => { });
|
|
562
562
|
switch (config.pagination.type) {
|
|
563
563
|
case 'scrollDown': {
|
|
564
|
+
let previousResultCount = allResults.length;
|
|
565
|
+
yield scrapeCurrentPage();
|
|
566
|
+
if (checkLimit()) {
|
|
567
|
+
return allResults;
|
|
568
|
+
}
|
|
564
569
|
yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
|
565
570
|
yield page.waitForTimeout(2000);
|
|
566
571
|
const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
|
|
572
|
+
const currentResultCount = allResults.length;
|
|
573
|
+
if (currentResultCount === previousResultCount) {
|
|
574
|
+
unchangedResultCounter++;
|
|
575
|
+
if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
|
|
576
|
+
return allResults;
|
|
577
|
+
}
|
|
578
|
+
}
|
|
579
|
+
else {
|
|
580
|
+
unchangedResultCounter = 0;
|
|
581
|
+
}
|
|
567
582
|
if (currentHeight === previousHeight) {
|
|
568
|
-
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
569
|
-
allResults = allResults.concat(finalResults);
|
|
570
583
|
return allResults;
|
|
571
584
|
}
|
|
572
585
|
previousHeight = currentHeight;
|
|
573
586
|
break;
|
|
574
587
|
}
|
|
575
588
|
case 'scrollUp': {
|
|
589
|
+
let previousResultCount = allResults.length;
|
|
590
|
+
yield scrapeCurrentPage();
|
|
591
|
+
if (checkLimit()) {
|
|
592
|
+
return allResults;
|
|
593
|
+
}
|
|
576
594
|
yield page.evaluate(() => window.scrollTo(0, 0));
|
|
577
595
|
yield page.waitForTimeout(2000);
|
|
578
596
|
const currentTopHeight = yield page.evaluate(() => document.documentElement.scrollTop);
|
|
597
|
+
const currentResultCount = allResults.length;
|
|
598
|
+
if (currentResultCount === previousResultCount) {
|
|
599
|
+
unchangedResultCounter++;
|
|
600
|
+
if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
|
|
601
|
+
return allResults;
|
|
602
|
+
}
|
|
603
|
+
}
|
|
604
|
+
else {
|
|
605
|
+
unchangedResultCounter = 0;
|
|
606
|
+
}
|
|
579
607
|
if (currentTopHeight === 0) {
|
|
580
|
-
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
581
|
-
allResults = allResults.concat(finalResults);
|
|
582
608
|
return allResults;
|
|
583
609
|
}
|
|
584
610
|
previousHeight = currentTopHeight;
|
|
@@ -609,64 +635,93 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
609
635
|
break;
|
|
610
636
|
}
|
|
611
637
|
let retryCount = 0;
|
|
612
|
-
let
|
|
613
|
-
|
|
638
|
+
let paginationSuccess = false;
|
|
639
|
+
// Capture basic content signature before click
|
|
640
|
+
const captureContentSignature = () => __awaiter(this, void 0, void 0, function* () {
|
|
641
|
+
return yield page.evaluate((selector) => {
|
|
642
|
+
const items = document.querySelectorAll(selector);
|
|
643
|
+
return {
|
|
644
|
+
url: window.location.href,
|
|
645
|
+
itemCount: items.length,
|
|
646
|
+
firstItems: Array.from(items).slice(0, 3).map(el => el.textContent || '').join('|')
|
|
647
|
+
};
|
|
648
|
+
}, config.listSelector);
|
|
649
|
+
});
|
|
650
|
+
const beforeSignature = yield captureContentSignature();
|
|
651
|
+
debugLog(`Before click: ${beforeSignature.itemCount} items`);
|
|
652
|
+
while (retryCount < MAX_RETRIES && !paginationSuccess) {
|
|
614
653
|
try {
|
|
615
654
|
try {
|
|
616
655
|
yield Promise.all([
|
|
617
656
|
page.waitForNavigation({
|
|
618
657
|
waitUntil: 'networkidle',
|
|
619
658
|
timeout: 15000
|
|
659
|
+
}).catch(e => {
|
|
660
|
+
throw e;
|
|
620
661
|
}),
|
|
621
662
|
button.click()
|
|
622
663
|
]);
|
|
623
|
-
|
|
664
|
+
debugLog("Navigation successful after regular click");
|
|
665
|
+
paginationSuccess = true;
|
|
624
666
|
}
|
|
625
|
-
catch (
|
|
626
|
-
debugLog(
|
|
627
|
-
|
|
628
|
-
|
|
667
|
+
catch (navError) {
|
|
668
|
+
debugLog("Regular click with navigation failed, trying dispatch event with navigation");
|
|
669
|
+
try {
|
|
670
|
+
yield Promise.all([
|
|
671
|
+
page.waitForNavigation({
|
|
672
|
+
waitUntil: 'networkidle',
|
|
673
|
+
timeout: 15000
|
|
674
|
+
}).catch(e => {
|
|
675
|
+
throw e;
|
|
676
|
+
}),
|
|
677
|
+
button.dispatchEvent('click')
|
|
678
|
+
]);
|
|
679
|
+
debugLog("Navigation successful after dispatch event");
|
|
680
|
+
paginationSuccess = true;
|
|
681
|
+
}
|
|
682
|
+
catch (dispatchNavError) {
|
|
629
683
|
try {
|
|
630
|
-
yield
|
|
631
|
-
|
|
632
|
-
waitUntil: 'networkidle',
|
|
633
|
-
timeout: 15000
|
|
634
|
-
}),
|
|
635
|
-
button.dispatchEvent('click')
|
|
636
|
-
]);
|
|
637
|
-
navigationSuccess = true;
|
|
684
|
+
yield button.click();
|
|
685
|
+
yield page.waitForTimeout(2000);
|
|
638
686
|
}
|
|
639
|
-
catch (
|
|
640
|
-
|
|
687
|
+
catch (clickError) {
|
|
688
|
+
yield button.dispatchEvent('click');
|
|
689
|
+
yield page.waitForTimeout(2000);
|
|
641
690
|
}
|
|
642
691
|
}
|
|
643
|
-
else {
|
|
644
|
-
navigationSuccess = true;
|
|
645
|
-
}
|
|
646
692
|
}
|
|
647
|
-
|
|
648
|
-
if (
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
693
|
+
yield page.waitForLoadState('networkidle', { timeout: 5000 }).catch(() => { });
|
|
694
|
+
if (!paginationSuccess) {
|
|
695
|
+
const newUrl = page.url();
|
|
696
|
+
const afterSignature = yield captureContentSignature();
|
|
697
|
+
if (newUrl !== currentUrl) {
|
|
698
|
+
debugLog(`URL changed to ${newUrl}`);
|
|
699
|
+
visitedUrls.add(newUrl);
|
|
700
|
+
paginationSuccess = true;
|
|
701
|
+
}
|
|
702
|
+
else if (afterSignature.firstItems !== beforeSignature.firstItems) {
|
|
703
|
+
debugLog("Content changed without URL change");
|
|
704
|
+
paginationSuccess = true;
|
|
705
|
+
}
|
|
706
|
+
else if (afterSignature.itemCount !== beforeSignature.itemCount) {
|
|
707
|
+
debugLog(`Item count changed from ${beforeSignature.itemCount} to ${afterSignature.itemCount}`);
|
|
708
|
+
paginationSuccess = true;
|
|
709
|
+
}
|
|
654
710
|
}
|
|
655
711
|
}
|
|
656
712
|
catch (error) {
|
|
657
|
-
debugLog(`
|
|
658
|
-
navigationSuccess = false;
|
|
713
|
+
debugLog(`Pagination attempt ${retryCount + 1} failed: ${error.message}`);
|
|
659
714
|
}
|
|
660
|
-
if (!
|
|
715
|
+
if (!paginationSuccess) {
|
|
661
716
|
retryCount++;
|
|
662
717
|
if (retryCount < MAX_RETRIES) {
|
|
663
|
-
debugLog(`Retrying
|
|
718
|
+
debugLog(`Retrying pagination - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
|
664
719
|
yield page.waitForTimeout(RETRY_DELAY);
|
|
665
720
|
}
|
|
666
721
|
}
|
|
667
722
|
}
|
|
668
|
-
if (!
|
|
669
|
-
debugLog(`
|
|
723
|
+
if (!paginationSuccess) {
|
|
724
|
+
debugLog(`Pagination failed after ${MAX_RETRIES} attempts`);
|
|
670
725
|
return allResults;
|
|
671
726
|
}
|
|
672
727
|
break;
|