maxun-core 0.0.26 → 0.0.27
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/build/interpret.d.ts +1 -0
- package/build/interpret.js +65 -48
- package/package.json +1 -1
package/build/interpret.d.ts
CHANGED
|
@@ -59,6 +59,7 @@ export default class Interpreter extends EventEmitter {
|
|
|
59
59
|
private namedResults;
|
|
60
60
|
private screenshotCounter;
|
|
61
61
|
private serializableDataByType;
|
|
62
|
+
private scrapeListCounter;
|
|
62
63
|
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
|
|
63
64
|
/**
|
|
64
65
|
* Sets the abort flag to immediately stop all operations
|
package/build/interpret.js
CHANGED
|
@@ -71,6 +71,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
71
71
|
scrapeList: {},
|
|
72
72
|
scrapeSchema: {}
|
|
73
73
|
};
|
|
74
|
+
this.scrapeListCounter = 0;
|
|
74
75
|
this.workflow = workflow.workflow;
|
|
75
76
|
this.initializedWorkflow = null;
|
|
76
77
|
this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
|
|
@@ -405,7 +406,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
405
406
|
const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
|
|
406
407
|
yield this.options.serializableCallback(scrapeResults);
|
|
407
408
|
}),
|
|
408
|
-
scrapeSchema: (
|
|
409
|
+
scrapeSchema: (schema_1, ...args_1) => __awaiter(this, [schema_1, ...args_1], void 0, function* (schema, actionName = "") {
|
|
409
410
|
var _a;
|
|
410
411
|
if (this.isAborted) {
|
|
411
412
|
this.log('Workflow aborted, stopping scrapeSchema', logger_1.Level.WARN);
|
|
@@ -455,22 +456,22 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
455
456
|
}
|
|
456
457
|
}
|
|
457
458
|
const actionType = "scrapeSchema";
|
|
458
|
-
const
|
|
459
|
+
const name = actionName || "Texts";
|
|
459
460
|
if (!this.namedResults[actionType])
|
|
460
461
|
this.namedResults[actionType] = {};
|
|
461
|
-
this.namedResults[actionType][
|
|
462
|
+
this.namedResults[actionType][name] = this.cumulativeResults;
|
|
462
463
|
if (!this.serializableDataByType[actionType])
|
|
463
464
|
this.serializableDataByType[actionType] = {};
|
|
464
|
-
if (!this.serializableDataByType[actionType][
|
|
465
|
-
this.serializableDataByType[actionType][
|
|
465
|
+
if (!this.serializableDataByType[actionType][name]) {
|
|
466
|
+
this.serializableDataByType[actionType][name] = [];
|
|
466
467
|
}
|
|
467
|
-
this.serializableDataByType[actionType][
|
|
468
|
+
this.serializableDataByType[actionType][name] = [...this.cumulativeResults];
|
|
468
469
|
yield this.options.serializableCallback({
|
|
469
470
|
scrapeList: this.serializableDataByType.scrapeList,
|
|
470
471
|
scrapeSchema: this.serializableDataByType.scrapeSchema
|
|
471
472
|
});
|
|
472
473
|
}),
|
|
473
|
-
scrapeList: (
|
|
474
|
+
scrapeList: (config_1, ...args_1) => __awaiter(this, [config_1, ...args_1], void 0, function* (config, actionName = "") {
|
|
474
475
|
var _a, _b;
|
|
475
476
|
if (this.isAborted) {
|
|
476
477
|
this.log('Workflow aborted, stopping scrapeList', logger_1.Level.WARN);
|
|
@@ -489,6 +490,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
489
490
|
this.options.debugChannel.incrementScrapeListIndex();
|
|
490
491
|
}
|
|
491
492
|
let scrapeResults = [];
|
|
493
|
+
let paginationUsed = false;
|
|
492
494
|
if (!config.pagination) {
|
|
493
495
|
scrapeResults = yield page.evaluate((cfg) => {
|
|
494
496
|
try {
|
|
@@ -501,34 +503,46 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
501
503
|
}, config);
|
|
502
504
|
}
|
|
503
505
|
else {
|
|
504
|
-
|
|
506
|
+
paginationUsed = true;
|
|
507
|
+
scrapeResults = yield this.handlePagination(page, config, actionName);
|
|
505
508
|
}
|
|
506
509
|
if (!Array.isArray(scrapeResults)) {
|
|
507
510
|
scrapeResults = [];
|
|
508
511
|
}
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
512
|
+
console.log(`ScrapeList completed with ${scrapeResults.length} results`);
|
|
513
|
+
if (!paginationUsed) {
|
|
514
|
+
const actionType = "scrapeList";
|
|
515
|
+
let name = actionName || "";
|
|
516
|
+
if (!name || name.trim() === "") {
|
|
517
|
+
this.scrapeListCounter++;
|
|
518
|
+
name = `List ${this.scrapeListCounter}`;
|
|
519
|
+
}
|
|
520
|
+
if (!this.serializableDataByType[actionType])
|
|
521
|
+
this.serializableDataByType[actionType] = {};
|
|
522
|
+
if (!this.serializableDataByType[actionType][name]) {
|
|
523
|
+
this.serializableDataByType[actionType][name] = [];
|
|
524
|
+
}
|
|
525
|
+
this.serializableDataByType[actionType][name].push(...scrapeResults);
|
|
526
|
+
yield this.options.serializableCallback({
|
|
527
|
+
scrapeList: this.serializableDataByType.scrapeList,
|
|
528
|
+
scrapeSchema: this.serializableDataByType.scrapeSchema
|
|
529
|
+
});
|
|
515
530
|
}
|
|
516
|
-
this.serializableDataByType[actionType][actionName].push(...scrapeResults);
|
|
517
|
-
yield this.options.serializableCallback({
|
|
518
|
-
scrapeList: this.serializableDataByType.scrapeList,
|
|
519
|
-
scrapeSchema: this.serializableDataByType.scrapeSchema
|
|
520
|
-
});
|
|
521
531
|
}
|
|
522
532
|
catch (error) {
|
|
523
533
|
console.error('ScrapeList action failed completely:', error.message);
|
|
524
534
|
const actionType = "scrapeList";
|
|
525
|
-
|
|
535
|
+
let name = actionName || "";
|
|
536
|
+
if (!name || name.trim() === "") {
|
|
537
|
+
this.scrapeListCounter++;
|
|
538
|
+
name = `List ${this.scrapeListCounter}`;
|
|
539
|
+
}
|
|
526
540
|
if (!this.namedResults[actionType])
|
|
527
541
|
this.namedResults[actionType] = {};
|
|
528
|
-
this.namedResults[actionType][
|
|
542
|
+
this.namedResults[actionType][name] = [];
|
|
529
543
|
if (!this.serializableDataByType[actionType])
|
|
530
544
|
this.serializableDataByType[actionType] = {};
|
|
531
|
-
this.serializableDataByType[actionType][
|
|
545
|
+
this.serializableDataByType[actionType][name] = [];
|
|
532
546
|
yield this.options.serializableCallback({
|
|
533
547
|
scrapeList: this.serializableDataByType.scrapeList,
|
|
534
548
|
scrapeSchema: this.serializableDataByType.scrapeSchema
|
|
@@ -610,23 +624,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
610
624
|
if (debug === null || debug === void 0 ? void 0 : debug.setActionType) {
|
|
611
625
|
debug.setActionType(String(step.action));
|
|
612
626
|
}
|
|
613
|
-
|
|
614
|
-
stepName = step.name;
|
|
615
|
-
}
|
|
616
|
-
else if (Array.isArray(step === null || step === void 0 ? void 0 : step.args) &&
|
|
617
|
-
step.args.length > 0 &&
|
|
618
|
-
typeof step.args[0] === "object" &&
|
|
619
|
-
"__name" in step.args[0]) {
|
|
620
|
-
stepName = step.args[0].__name;
|
|
621
|
-
}
|
|
622
|
-
else if (typeof (step === null || step === void 0 ? void 0 : step.args) === "object" &&
|
|
623
|
-
(step === null || step === void 0 ? void 0 : step.args) !== null &&
|
|
624
|
-
"__name" in step.args) {
|
|
625
|
-
stepName = step.args.__name;
|
|
626
|
-
}
|
|
627
|
-
if (!stepName) {
|
|
628
|
-
stepName = String(step.action);
|
|
629
|
-
}
|
|
627
|
+
stepName = (step === null || step === void 0 ? void 0 : step.name) || String(step.action);
|
|
630
628
|
if (debug && typeof debug.setActionName === "function") {
|
|
631
629
|
debug.setActionName(stepName);
|
|
632
630
|
}
|
|
@@ -640,6 +638,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
640
638
|
if (step.action === 'screenshot') {
|
|
641
639
|
yield wawActions.screenshot(...(params !== null && params !== void 0 ? params : []), stepName !== null && stepName !== void 0 ? stepName : undefined);
|
|
642
640
|
}
|
|
641
|
+
else if (step.action === 'scrapeList' || step.action === 'scrapeSchema') {
|
|
642
|
+
const actionName = step.name || "";
|
|
643
|
+
yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []), actionName);
|
|
644
|
+
}
|
|
643
645
|
else {
|
|
644
646
|
yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []));
|
|
645
647
|
}
|
|
@@ -699,24 +701,35 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
699
701
|
}
|
|
700
702
|
});
|
|
701
703
|
}
|
|
702
|
-
handlePagination(
|
|
703
|
-
return __awaiter(this,
|
|
704
|
+
handlePagination(page_1, config_1) {
|
|
705
|
+
return __awaiter(this, arguments, void 0, function* (page, config, providedActionName = "") {
|
|
704
706
|
if (this.isAborted) {
|
|
705
707
|
this.log('Workflow aborted, stopping pagination', logger_1.Level.WARN);
|
|
706
708
|
return [];
|
|
707
709
|
}
|
|
710
|
+
const actionType = "scrapeList";
|
|
711
|
+
let actionName = providedActionName || "";
|
|
712
|
+
if (!actionName || actionName.trim() === "") {
|
|
713
|
+
this.scrapeListCounter++;
|
|
714
|
+
actionName = `List ${this.scrapeListCounter}`;
|
|
715
|
+
}
|
|
716
|
+
if (!this.serializableDataByType[actionType]) {
|
|
717
|
+
this.serializableDataByType[actionType] = {};
|
|
718
|
+
}
|
|
719
|
+
if (!this.serializableDataByType[actionType][actionName]) {
|
|
720
|
+
this.serializableDataByType[actionType][actionName] = [];
|
|
721
|
+
}
|
|
708
722
|
let allResults = [];
|
|
709
723
|
let previousHeight = 0;
|
|
710
724
|
let scrapedItems = new Set();
|
|
711
725
|
let visitedUrls = new Set();
|
|
712
726
|
const MAX_RETRIES = 3;
|
|
713
|
-
const RETRY_DELAY = 1000;
|
|
727
|
+
const RETRY_DELAY = 1000;
|
|
714
728
|
const MAX_UNCHANGED_RESULTS = 5;
|
|
715
729
|
const debugLog = (message, ...args) => {
|
|
716
730
|
console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
|
|
717
731
|
};
|
|
718
732
|
const scrapeCurrentPage = () => __awaiter(this, void 0, void 0, function* () {
|
|
719
|
-
// Check abort flag before scraping current page
|
|
720
733
|
if (this.isAborted) {
|
|
721
734
|
debugLog("Workflow aborted, stopping scrapeCurrentPage");
|
|
722
735
|
return;
|
|
@@ -740,7 +753,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
740
753
|
});
|
|
741
754
|
allResults = allResults.concat(newResults);
|
|
742
755
|
debugLog("Results collected:", allResults.length);
|
|
743
|
-
|
|
756
|
+
this.serializableDataByType[actionType][actionName] = [...allResults];
|
|
757
|
+
yield this.options.serializableCallback({
|
|
758
|
+
scrapeList: this.serializableDataByType.scrapeList,
|
|
759
|
+
scrapeSchema: this.serializableDataByType.scrapeSchema
|
|
760
|
+
});
|
|
744
761
|
});
|
|
745
762
|
const checkLimit = () => {
|
|
746
763
|
if (config.limit && allResults.length >= config.limit) {
|
|
@@ -1006,7 +1023,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1006
1023
|
}).catch(e => {
|
|
1007
1024
|
throw e;
|
|
1008
1025
|
}),
|
|
1009
|
-
|
|
1026
|
+
page.locator(workingSelector).first().click()
|
|
1010
1027
|
]);
|
|
1011
1028
|
debugLog("Navigation successful after regular click");
|
|
1012
1029
|
yield page.waitForTimeout(2000);
|
|
@@ -1022,7 +1039,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1022
1039
|
}).catch(e => {
|
|
1023
1040
|
throw e;
|
|
1024
1041
|
}),
|
|
1025
|
-
|
|
1042
|
+
page.locator(workingSelector).first().dispatchEvent('click')
|
|
1026
1043
|
]);
|
|
1027
1044
|
debugLog("Navigation successful after dispatch event");
|
|
1028
1045
|
yield page.waitForTimeout(2000);
|
|
@@ -1030,11 +1047,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1030
1047
|
}
|
|
1031
1048
|
catch (dispatchNavError) {
|
|
1032
1049
|
try {
|
|
1033
|
-
yield
|
|
1050
|
+
yield page.locator(workingSelector).first().click();
|
|
1034
1051
|
yield page.waitForTimeout(2000);
|
|
1035
1052
|
}
|
|
1036
1053
|
catch (clickError) {
|
|
1037
|
-
yield
|
|
1054
|
+
yield page.locator(workingSelector).first().dispatchEvent('click');
|
|
1038
1055
|
yield page.waitForTimeout(2000);
|
|
1039
1056
|
}
|
|
1040
1057
|
}
|