mx-cloud 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +21 -1
- package/build/interpret.js +2 -0
- package/build/selector.js +9 -1
- package/package.json +1 -1
|
@@ -636,6 +636,26 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
636
636
|
return similarity >= similarityThreshold;
|
|
637
637
|
});
|
|
638
638
|
}
|
|
639
|
+
/**
 * Look up `originalSelector` under `rootElement`, relaxing a numeric
 * `nth-child(N)` step when the exact selector matches nothing.
 *
 * Lookup order (stops at the first truthy result from `queryElement`):
 *   1. the selector exactly as given;
 *   2. the selector with its first `nth-child(N)` rewritten to
 *      `nth-child(N-1)`, then `nth-child(N-2)`, ... down to `nth-child(1)`;
 *   3. the selector with its first `:nth-child(N)` removed altogether.
 *
 * Only the first purely numeric `nth-child(N)` is relaxed; selectors without
 * one (including formulas such as `nth-child(2n+1)`) get no fallback.
 *
 * @param rootElement - Root handed straight through to `queryElement`
 *   (presumably an Element or Document; confirm against `queryElement`).
 * @param {string} originalSelector - Selector to resolve.
 * @returns The first truthy `queryElement` result, otherwise the falsy result
 *   of the last lookup that was attempted.
 */
function tryFallbackSelector(rootElement, originalSelector) {
    const exactMatch = queryElement(rootElement, originalSelector);
    // Nothing to relax when the lookup succeeded; a non-string selector cannot
    // be pattern-matched, so return the miss instead of throwing a TypeError.
    if (exactMatch || typeof originalSelector !== 'string') {
        return exactMatch;
    }
    const nthChild = /nth-child\((\d+)\)/.exec(originalSelector);
    if (!nthChild) {
        return exactMatch;
    }
    const position = Number.parseInt(nthChild[1], 10);
    // Walk towards the first child: the wanted element may sit at a lower
    // index in this container than in the one the selector was recorded from.
    for (let i = position - 1; i >= 1; i--) {
        const fallbackSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${i})`);
        const candidate = queryElement(rootElement, fallbackSelector);
        if (candidate) {
            return candidate;
        }
    }
    // Last resort: drop the positional constraint entirely.
    const baseSelector = originalSelector.replace(/:nth-child\(\d+\)/, '');
    return queryElement(rootElement, baseSelector);
}
|
|
639
659
|
// Main scraping logic with context support
|
|
640
660
|
let containers = queryElementAll(document, listSelector);
|
|
641
661
|
containers = Array.from(containers);
|
|
@@ -786,7 +806,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
786
806
|
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
|
|
787
807
|
// Get the last part of the selector after any context delimiter
|
|
788
808
|
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
|
|
789
|
-
const element =
|
|
809
|
+
const element = tryFallbackSelector(container, relativeSelector);
|
|
790
810
|
if (element) {
|
|
791
811
|
record[label] = extractValue(element, attribute);
|
|
792
812
|
}
|
package/build/interpret.js
CHANGED
|
@@ -713,6 +713,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
713
713
|
button.click()
|
|
714
714
|
]);
|
|
715
715
|
debugLog("Navigation successful after regular click");
|
|
716
|
+
yield page.waitForTimeout(2000);
|
|
716
717
|
paginationSuccess = true;
|
|
717
718
|
}
|
|
718
719
|
catch (navError) {
|
|
@@ -728,6 +729,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
728
729
|
button.dispatchEvent('click')
|
|
729
730
|
]);
|
|
730
731
|
debugLog("Navigation successful after dispatch event");
|
|
732
|
+
yield page.waitForTimeout(2000);
|
|
731
733
|
paginationSuccess = true;
|
|
732
734
|
}
|
|
733
735
|
catch (dispatchNavError) {
|
package/build/selector.js
CHANGED
|
@@ -676,7 +676,10 @@ const generateNonUniqueSelectors = (page_1, elementHandle_1, ...args_1) => __awa
|
|
|
676
676
|
});
|
|
677
677
|
return selectorParts.join(contextPath[0].type === 'shadow' ? ' >> ' : ' :>> ');
|
|
678
678
|
}
|
|
679
|
-
|
|
679
|
+
const elementSelector = getNonUniqueSelector(element);
|
|
680
|
+
if (elementSelector.includes('.') && elementSelector.split('.').length > 1) {
|
|
681
|
+
return elementSelector;
|
|
682
|
+
}
|
|
680
683
|
const path = [];
|
|
681
684
|
let currentElement = element;
|
|
682
685
|
const MAX_DEPTH = 2;
|
|
@@ -798,6 +801,7 @@ const generateNonUniqueSelectors = (page_1, elementHandle_1, ...args_1) => __awa
|
|
|
798
801
|
function getSelectorPath(element) {
|
|
799
802
|
if (!element)
|
|
800
803
|
return '';
|
|
804
|
+
// Get the complete context path
|
|
801
805
|
const contextPath = getContextPath(element);
|
|
802
806
|
if (contextPath.length > 0) {
|
|
803
807
|
const selectorParts = [];
|
|
@@ -814,6 +818,10 @@ const generateNonUniqueSelectors = (page_1, elementHandle_1, ...args_1) => __awa
|
|
|
814
818
|
});
|
|
815
819
|
return selectorParts.join(contextPath[0].type === 'shadow' ? ' >> ' : ' :>> ');
|
|
816
820
|
}
|
|
821
|
+
const elementSelector = getNonUniqueSelector(element);
|
|
822
|
+
if (elementSelector.includes('.') && elementSelector.split('.').length > 1) {
|
|
823
|
+
return elementSelector;
|
|
824
|
+
}
|
|
817
825
|
const path = [];
|
|
818
826
|
let currentElement = element;
|
|
819
827
|
const MAX_DEPTH = 2;
|