mx-cloud 0.0.4 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +21 -1
- package/build/interpret.d.ts +1 -0
- package/build/interpret.js +10 -0
- package/build/selector.js +9 -1
- package/package.json +1 -1
|
@@ -636,6 +636,26 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
636
636
|
return similarity >= similarityThreshold;
|
|
637
637
|
});
|
|
638
638
|
}
|
|
639
|
+
function tryFallbackSelector(rootElement, originalSelector) {
|
|
640
|
+
let element = queryElement(rootElement, originalSelector);
|
|
641
|
+
if (!element && originalSelector.includes('nth-child')) {
|
|
642
|
+
const match = originalSelector.match(/nth-child\((\d+)\)/);
|
|
643
|
+
if (match) {
|
|
644
|
+
const position = parseInt(match[1], 10);
|
|
645
|
+
for (let i = position - 1; i >= 1; i--) {
|
|
646
|
+
const fallbackSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${i})`);
|
|
647
|
+
element = queryElement(rootElement, fallbackSelector);
|
|
648
|
+
if (element)
|
|
649
|
+
break;
|
|
650
|
+
}
|
|
651
|
+
if (!element) {
|
|
652
|
+
const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/, '');
|
|
653
|
+
element = queryElement(rootElement, baseSelector);
|
|
654
|
+
}
|
|
655
|
+
}
|
|
656
|
+
}
|
|
657
|
+
return element;
|
|
658
|
+
}
|
|
639
659
|
// Main scraping logic with context support
|
|
640
660
|
let containers = queryElementAll(document, listSelector);
|
|
641
661
|
containers = Array.from(containers);
|
|
@@ -786,7 +806,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
786
806
|
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
|
|
787
807
|
// Get the last part of the selector after any context delimiter
|
|
788
808
|
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
|
|
789
|
-
const element =
|
|
809
|
+
const element = tryFallbackSelector(container, relativeSelector);
|
|
790
810
|
if (element) {
|
|
791
811
|
record[label] = extractValue(element, attribute);
|
|
792
812
|
}
|
package/build/interpret.d.ts
CHANGED
package/build/interpret.js
CHANGED
|
@@ -378,6 +378,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
378
378
|
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
379
379
|
this.options.debugChannel.setActionType('scrapeSchema');
|
|
380
380
|
}
|
|
381
|
+
if (this.options.mode && this.options.mode === 'editor') {
|
|
382
|
+
yield this.options.serializableCallback({});
|
|
383
|
+
return;
|
|
384
|
+
}
|
|
381
385
|
yield this.ensureScriptsLoaded(page);
|
|
382
386
|
const scrapeResult = yield page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);
|
|
383
387
|
if (!this.cumulativeResults || !Array.isArray(this.cumulativeResults)) {
|
|
@@ -401,6 +405,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
401
405
|
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
402
406
|
this.options.debugChannel.setActionType('scrapeList');
|
|
403
407
|
}
|
|
408
|
+
if (this.options.mode && this.options.mode === 'editor') {
|
|
409
|
+
yield this.options.serializableCallback({});
|
|
410
|
+
return;
|
|
411
|
+
}
|
|
404
412
|
yield this.ensureScriptsLoaded(page);
|
|
405
413
|
let scrapeResults = [];
|
|
406
414
|
if (!config.pagination) {
|
|
@@ -705,6 +713,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
705
713
|
button.click()
|
|
706
714
|
]);
|
|
707
715
|
debugLog("Navigation successful after regular click");
|
|
716
|
+
yield page.waitForTimeout(2000);
|
|
708
717
|
paginationSuccess = true;
|
|
709
718
|
}
|
|
710
719
|
catch (navError) {
|
|
@@ -720,6 +729,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
720
729
|
button.dispatchEvent('click')
|
|
721
730
|
]);
|
|
722
731
|
debugLog("Navigation successful after dispatch event");
|
|
732
|
+
yield page.waitForTimeout(2000);
|
|
723
733
|
paginationSuccess = true;
|
|
724
734
|
}
|
|
725
735
|
catch (dispatchNavError) {
|
package/build/selector.js
CHANGED
|
@@ -676,7 +676,10 @@ const generateNonUniqueSelectors = (page_1, elementHandle_1, ...args_1) => __awa
|
|
|
676
676
|
});
|
|
677
677
|
return selectorParts.join(contextPath[0].type === 'shadow' ? ' >> ' : ' :>> ');
|
|
678
678
|
}
|
|
679
|
-
|
|
679
|
+
const elementSelector = getNonUniqueSelector(element);
|
|
680
|
+
if (elementSelector.includes('.') && elementSelector.split('.').length > 1) {
|
|
681
|
+
return elementSelector;
|
|
682
|
+
}
|
|
680
683
|
const path = [];
|
|
681
684
|
let currentElement = element;
|
|
682
685
|
const MAX_DEPTH = 2;
|
|
@@ -798,6 +801,7 @@ const generateNonUniqueSelectors = (page_1, elementHandle_1, ...args_1) => __awa
|
|
|
798
801
|
function getSelectorPath(element) {
|
|
799
802
|
if (!element)
|
|
800
803
|
return '';
|
|
804
|
+
// Get the complete context path
|
|
801
805
|
const contextPath = getContextPath(element);
|
|
802
806
|
if (contextPath.length > 0) {
|
|
803
807
|
const selectorParts = [];
|
|
@@ -814,6 +818,10 @@ const generateNonUniqueSelectors = (page_1, elementHandle_1, ...args_1) => __awa
|
|
|
814
818
|
});
|
|
815
819
|
return selectorParts.join(contextPath[0].type === 'shadow' ? ' >> ' : ' :>> ');
|
|
816
820
|
}
|
|
821
|
+
const elementSelector = getNonUniqueSelector(element);
|
|
822
|
+
if (elementSelector.includes('.') && elementSelector.split('.').length > 1) {
|
|
823
|
+
return elementSelector;
|
|
824
|
+
}
|
|
817
825
|
const path = [];
|
|
818
826
|
let currentElement = element;
|
|
819
827
|
const MAX_DEPTH = 2;
|