mx-cloud 0.0.5 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +21 -1
- package/build/interpret.js +10 -4
- package/build/selector.js +9 -1
- package/package.json +1 -1
|
@@ -636,6 +636,26 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
636
636
|
return similarity >= similarityThreshold;
|
|
637
637
|
});
|
|
638
638
|
}
|
|
639
|
+
/**
 * Looks up an element with `originalSelector`, relaxing a numeric
 * `nth-child(N)` part when the exact selector yields nothing.
 *
 * Lookup order (every lookup goes through `queryElement`):
 *   1. the selector exactly as given;
 *   2. the selector with its first `nth-child(N)` rewritten to
 *      N-1, N-2, ... down to 1, stopping at the first hit;
 *   3. the selector with its first `:nth-child(N)` removed altogether.
 *
 * Steps 2 and 3 only run when the selector contains a numeric
 * `nth-child(...)`; otherwise the result of step 1 is returned as-is.
 *
 * NOTE(review): `queryElement` is defined outside this block; it is assumed
 * to return a falsy value when nothing matches - confirm against its definition.
 *
 * @param {*} rootElement - Root passed through to `queryElement` unchanged.
 * @param {string} originalSelector - Selector to resolve.
 * @returns {*} The first truthy lookup result, or the result of the last
 *   lookup attempted when none succeeded.
 */
function tryFallbackSelector(rootElement, originalSelector) {
    const exactHit = queryElement(rootElement, originalSelector);
    if (exactHit || !originalSelector.includes('nth-child')) {
        return exactHit;
    }

    // Only a purely numeric index (e.g. nth-child(3)) can be stepped down.
    const indexed = /nth-child\((\d+)\)/.exec(originalSelector);
    if (!indexed) {
        return exactHit;
    }

    // Walk the position downwards, trying the nearest earlier positions first.
    let candidateIndex = parseInt(indexed[1], 10) - 1;
    while (candidateIndex >= 1) {
        const relaxedSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${candidateIndex})`);
        const relaxedHit = queryElement(rootElement, relaxedSelector);
        if (relaxedHit) {
            return relaxedHit;
        }
        candidateIndex -= 1;
    }

    // Last resort: drop the positional constraint entirely.
    const unindexedSelector = originalSelector.replace(/\:nth-child\(\d+\)/, '');
    return queryElement(rootElement, unindexedSelector);
}
|
|
639
659
|
// Main scraping logic with context support
|
|
640
660
|
let containers = queryElementAll(document, listSelector);
|
|
641
661
|
containers = Array.from(containers);
|
|
@@ -786,7 +806,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
786
806
|
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
|
|
787
807
|
// Get the last part of the selector after any context delimiter
|
|
788
808
|
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
|
|
789
|
-
const element =
|
|
809
|
+
const element = tryFallbackSelector(container, relativeSelector);
|
|
790
810
|
if (element) {
|
|
791
811
|
record[label] = extractValue(element, attribute);
|
|
792
812
|
}
|
package/build/interpret.js
CHANGED
|
@@ -318,6 +318,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
318
318
|
*/
|
|
319
319
|
carryOutSteps(page, steps) {
|
|
320
320
|
return __awaiter(this, void 0, void 0, function* () {
|
|
321
|
+
var _a;
|
|
321
322
|
/**
|
|
322
323
|
* Defines overloaded (or added) methods/actions usable in the workflow.
|
|
323
324
|
* If a method overloads any existing method of the Page class, it accepts the same set
|
|
@@ -410,14 +411,14 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
410
411
|
return;
|
|
411
412
|
}
|
|
412
413
|
yield this.ensureScriptsLoaded(page);
|
|
413
|
-
let scrapeResults = [];
|
|
414
414
|
if (!config.pagination) {
|
|
415
|
-
scrapeResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
415
|
+
const scrapeResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
416
|
+
yield this.options.serializableCallback(scrapeResults);
|
|
416
417
|
}
|
|
417
418
|
else {
|
|
418
|
-
scrapeResults = yield this.handlePagination(page, config);
|
|
419
|
+
const scrapeResults = yield this.handlePagination(page, config);
|
|
420
|
+
yield this.options.serializableCallback(scrapeResults);
|
|
419
421
|
}
|
|
420
|
-
yield this.options.serializableCallback(scrapeResults);
|
|
421
422
|
}),
|
|
422
423
|
scrapeListAuto: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
423
424
|
var _a;
|
|
@@ -484,6 +485,9 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
484
485
|
yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []));
|
|
485
486
|
}
|
|
486
487
|
else {
|
|
488
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
489
|
+
this.options.debugChannel.setActionType(String(step.action));
|
|
490
|
+
}
|
|
487
491
|
// Implements the dot notation for the "method name" in the workflow
|
|
488
492
|
const levels = String(step.action).split('.');
|
|
489
493
|
const methodName = levels[levels.length - 1];
|
|
@@ -713,6 +717,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
713
717
|
button.click()
|
|
714
718
|
]);
|
|
715
719
|
debugLog("Navigation successful after regular click");
|
|
720
|
+
yield page.waitForTimeout(2000);
|
|
716
721
|
paginationSuccess = true;
|
|
717
722
|
}
|
|
718
723
|
catch (navError) {
|
|
@@ -728,6 +733,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
728
733
|
button.dispatchEvent('click')
|
|
729
734
|
]);
|
|
730
735
|
debugLog("Navigation successful after dispatch event");
|
|
736
|
+
yield page.waitForTimeout(2000);
|
|
731
737
|
paginationSuccess = true;
|
|
732
738
|
}
|
|
733
739
|
catch (dispatchNavError) {
|
package/build/selector.js
CHANGED
|
@@ -676,7 +676,10 @@ const generateNonUniqueSelectors = (page_1, elementHandle_1, ...args_1) => __awa
|
|
|
676
676
|
});
|
|
677
677
|
return selectorParts.join(contextPath[0].type === 'shadow' ? ' >> ' : ' :>> ');
|
|
678
678
|
}
|
|
679
|
-
|
|
679
|
+
const elementSelector = getNonUniqueSelector(element);
|
|
680
|
+
if (elementSelector.includes('.') && elementSelector.split('.').length > 1) {
|
|
681
|
+
return elementSelector;
|
|
682
|
+
}
|
|
680
683
|
const path = [];
|
|
681
684
|
let currentElement = element;
|
|
682
685
|
const MAX_DEPTH = 2;
|
|
@@ -798,6 +801,7 @@ const generateNonUniqueSelectors = (page_1, elementHandle_1, ...args_1) => __awa
|
|
|
798
801
|
function getSelectorPath(element) {
|
|
799
802
|
if (!element)
|
|
800
803
|
return '';
|
|
804
|
+
// Get the complete context path
|
|
801
805
|
const contextPath = getContextPath(element);
|
|
802
806
|
if (contextPath.length > 0) {
|
|
803
807
|
const selectorParts = [];
|
|
@@ -814,6 +818,10 @@ const generateNonUniqueSelectors = (page_1, elementHandle_1, ...args_1) => __awa
|
|
|
814
818
|
});
|
|
815
819
|
return selectorParts.join(contextPath[0].type === 'shadow' ? ' >> ' : ' :>> ');
|
|
816
820
|
}
|
|
821
|
+
const elementSelector = getNonUniqueSelector(element);
|
|
822
|
+
if (elementSelector.includes('.') && elementSelector.split('.').length > 1) {
|
|
823
|
+
return elementSelector;
|
|
824
|
+
}
|
|
817
825
|
const path = [];
|
|
818
826
|
let currentElement = element;
|
|
819
827
|
const MAX_DEPTH = 2;
|