maxun-core 0.0.16 → 0.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +21 -1
- package/build/interpret.d.ts +1 -0
- package/build/interpret.js +13 -7
- package/package.json +1 -1
|
@@ -636,6 +636,26 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
636
636
|
return similarity >= similarityThreshold;
|
|
637
637
|
});
|
|
638
638
|
}
|
|
639
|
+
/**
 * Look up a field element, relaxing a positional `:nth-child(N)` constraint
 * when the selector as given matches nothing.
 *
 * Lookup order (the first truthy result wins):
 *   1. the selector exactly as given;
 *   2. the selector with its first `nth-child(N)` index lowered to
 *      N-1, N-2, ..., 1;
 *   3. the selector with that first `:nth-child(N)` dropped altogether.
 *
 * Only the first `nth-child(<digits>)` occurrence is relaxed; formula
 * arguments such as `2n+1` are not matched by the pattern and therefore get
 * no fallback.
 *
 * NOTE(review): `queryElement` is defined outside this block; it is assumed
 * to behave like a `querySelector` wrapper (element or a falsy value) —
 * confirm against its definition.
 *
 * @param {*} rootElement - Search root, forwarded unchanged to `queryElement`.
 * @param {string} originalSelector - Selector to resolve.
 * @returns {*} The first truthy `queryElement` result, otherwise the falsy
 *   result of the last lookup that was attempted.
 */
function tryFallbackSelector(rootElement, originalSelector) {
    const exactMatch = queryElement(rootElement, originalSelector);
    if (exactMatch) {
        return exactMatch;
    }

    const nthChildMatch = originalSelector.match(/nth-child\((\d+)\)/);
    if (!nthChildMatch) {
        // Nothing positional to relax: report the original miss as-is.
        return exactMatch;
    }

    // Walk backwards from the position just before the requested one.
    const position = Number.parseInt(nthChildMatch[1], 10);
    for (let index = position - 1; index >= 1; index--) {
        const fallbackSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${index})`);
        const candidate = queryElement(rootElement, fallbackSelector);
        if (candidate) {
            return candidate;
        }
    }

    // Last resort: drop the positional constraint. When the pseudo-class is
    // the whole compound selector (e.g. `div > :nth-child(2)`), deleting it
    // would leave an empty selector or a dangling combinator, which is not a
    // valid selector; the universal selector `*` is the equivalent relaxation.
    const baseSelector = originalSelector.replace(/:nth-child\(\d+\)/, (pseudo, offset) => {
        const before = originalSelector[offset - 1];
        const after = originalSelector[offset + pseudo.length];
        const opensCompound = before === undefined || /[\s>+~,(]/.test(before);
        const closesCompound = after === undefined || /[\s>+~,)]/.test(after);
        return opensCompound && closesCompound ? '*' : '';
    });
    return queryElement(rootElement, baseSelector);
}
|
|
639
659
|
// Main scraping logic with context support
|
|
640
660
|
let containers = queryElementAll(document, listSelector);
|
|
641
661
|
containers = Array.from(containers);
|
|
@@ -786,7 +806,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
786
806
|
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
|
|
787
807
|
// Get the last part of the selector after any context delimiter
|
|
788
808
|
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
|
|
789
|
-
const element = queryElement(container, relativeSelector);
|
|
809
|
+
const element = tryFallbackSelector(container, relativeSelector);
|
|
790
810
|
if (element) {
|
|
791
811
|
record[label] = extractValue(element, attribute);
|
|
792
812
|
}
|
package/build/interpret.d.ts
CHANGED
package/build/interpret.js
CHANGED
|
@@ -385,7 +385,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
385
385
|
yield this.options.serializableCallback([mergedResult]);
|
|
386
386
|
}),
|
|
387
387
|
scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
388
|
-
var _f;
|
|
388
|
+
var _f, _g;
|
|
389
389
|
if ((_f = this.options.debugChannel) === null || _f === void 0 ? void 0 : _f.setActionType) {
|
|
390
390
|
this.options.debugChannel.setActionType('scrapeList');
|
|
391
391
|
}
|
|
@@ -394,6 +394,9 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
394
394
|
return;
|
|
395
395
|
}
|
|
396
396
|
yield this.ensureScriptsLoaded(page);
|
|
397
|
+
if ((_g = this.options.debugChannel) === null || _g === void 0 ? void 0 : _g.incrementScrapeListIndex) {
|
|
398
|
+
this.options.debugChannel.incrementScrapeListIndex();
|
|
399
|
+
}
|
|
397
400
|
if (!config.pagination) {
|
|
398
401
|
const scrapeResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
399
402
|
yield this.options.serializableCallback(scrapeResults);
|
|
@@ -404,8 +407,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
404
407
|
}
|
|
405
408
|
}),
|
|
406
409
|
scrapeListAuto: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
407
|
-
var _g;
|
|
408
|
-
if ((_g = this.options.debugChannel) === null || _g === void 0 ? void 0 : _g.setActionType) {
|
|
410
|
+
var _h;
|
|
411
|
+
if ((_h = this.options.debugChannel) === null || _h === void 0 ? void 0 : _h.setActionType) {
|
|
409
412
|
this.options.debugChannel.setActionType('scrapeListAuto');
|
|
410
413
|
}
|
|
411
414
|
yield this.ensureScriptsLoaded(page);
|
|
@@ -415,8 +418,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
415
418
|
yield this.options.serializableCallback(scrapeResults);
|
|
416
419
|
}),
|
|
417
420
|
scroll: (pages) => __awaiter(this, void 0, void 0, function* () {
|
|
418
|
-
var _h;
|
|
419
|
-
if ((_h = this.options.debugChannel) === null || _h === void 0 ? void 0 : _h.setActionType) {
|
|
421
|
+
var _j;
|
|
422
|
+
if ((_j = this.options.debugChannel) === null || _j === void 0 ? void 0 : _j.setActionType) {
|
|
420
423
|
this.options.debugChannel.setActionType('scroll');
|
|
421
424
|
}
|
|
422
425
|
yield page.evaluate((pagesInternal) => __awaiter(this, void 0, void 0, function* () {
|
|
@@ -427,8 +430,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
427
430
|
}), pages !== null && pages !== void 0 ? pages : 1);
|
|
428
431
|
}),
|
|
429
432
|
script: (code) => __awaiter(this, void 0, void 0, function* () {
|
|
430
|
-
var _j;
|
|
431
|
-
if ((_j = this.options.debugChannel) === null || _j === void 0 ? void 0 : _j.setActionType) {
|
|
433
|
+
var _k;
|
|
434
|
+
if ((_k = this.options.debugChannel) === null || _k === void 0 ? void 0 : _k.setActionType) {
|
|
432
435
|
this.options.debugChannel.setActionType('script');
|
|
433
436
|
}
|
|
434
437
|
const AsyncFunction = Object.getPrototypeOf(() => __awaiter(this, void 0, void 0, function* () { })).constructor;
|
|
@@ -530,6 +533,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
530
533
|
});
|
|
531
534
|
allResults = allResults.concat(newResults);
|
|
532
535
|
debugLog("Results collected:", allResults.length);
|
|
536
|
+
yield this.options.serializableCallback(allResults);
|
|
533
537
|
});
|
|
534
538
|
const checkLimit = () => {
|
|
535
539
|
if (config.limit && allResults.length >= config.limit) {
|
|
@@ -700,6 +704,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
700
704
|
button.click()
|
|
701
705
|
]);
|
|
702
706
|
debugLog("Navigation successful after regular click");
|
|
707
|
+
yield page.waitForTimeout(2000);
|
|
703
708
|
paginationSuccess = true;
|
|
704
709
|
}
|
|
705
710
|
catch (navError) {
|
|
@@ -715,6 +720,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
715
720
|
button.dispatchEvent('click')
|
|
716
721
|
]);
|
|
717
722
|
debugLog("Navigation successful after dispatch event");
|
|
723
|
+
yield page.waitForTimeout(2000);
|
|
718
724
|
paginationSuccess = true;
|
|
719
725
|
}
|
|
720
726
|
catch (dispatchNavError) {
|