maxun-core 0.0.21 → 0.0.23
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.d.ts +5 -0
- package/build/interpret.js +96 -34
- package/package.json +1 -1
package/build/interpret.d.ts
CHANGED
|
@@ -53,10 +53,15 @@ export default class Interpreter extends EventEmitter {
|
|
|
53
53
|
private options;
|
|
54
54
|
private concurrency;
|
|
55
55
|
private stopper;
|
|
56
|
+
private isAborted;
|
|
56
57
|
private log;
|
|
57
58
|
private blocker;
|
|
58
59
|
private cumulativeResults;
|
|
59
60
|
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
|
|
61
|
+
/**
|
|
62
|
+
* Sets the abort flag to immediately stop all operations
|
|
63
|
+
*/
|
|
64
|
+
abort(): void;
|
|
60
65
|
private applyAdBlocker;
|
|
61
66
|
private disableAdBlocker;
|
|
62
67
|
private getSelectors;
|
package/build/interpret.js
CHANGED
|
@@ -52,6 +52,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
52
52
|
var _a;
|
|
53
53
|
super();
|
|
54
54
|
this.stopper = null;
|
|
55
|
+
this.isAborted = false;
|
|
55
56
|
this.blocker = null;
|
|
56
57
|
this.cumulativeResults = [];
|
|
57
58
|
this.workflow = workflow.workflow;
|
|
@@ -83,6 +84,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
83
84
|
this.blocker = null;
|
|
84
85
|
});
|
|
85
86
|
}
|
|
87
|
+
/**
|
|
88
|
+
* Sets the abort flag to immediately stop all operations
|
|
89
|
+
*/
|
|
90
|
+
abort() {
|
|
91
|
+
this.isAborted = true;
|
|
92
|
+
}
|
|
86
93
|
applyAdBlocker(page) {
|
|
87
94
|
return __awaiter(this, void 0, void 0, function* () {
|
|
88
95
|
if (this.blocker) {
|
|
@@ -304,6 +311,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
304
311
|
carryOutSteps(page, steps) {
|
|
305
312
|
var _a;
|
|
306
313
|
return __awaiter(this, void 0, void 0, function* () {
|
|
314
|
+
if (this.isAborted) {
|
|
315
|
+
this.log('Workflow aborted, stopping execution', logger_1.Level.WARN);
|
|
316
|
+
return;
|
|
317
|
+
}
|
|
307
318
|
/**
|
|
308
319
|
* Defines overloaded (or added) methods/actions usable in the workflow.
|
|
309
320
|
* If a method overloads any existing method of the Page class, it accepts the same set
|
|
@@ -361,6 +372,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
361
372
|
}),
|
|
362
373
|
scrapeSchema: (schema) => __awaiter(this, void 0, void 0, function* () {
|
|
363
374
|
var _e;
|
|
375
|
+
if (this.isAborted) {
|
|
376
|
+
this.log('Workflow aborted, stopping scrapeSchema', logger_1.Level.WARN);
|
|
377
|
+
return;
|
|
378
|
+
}
|
|
364
379
|
if ((_e = this.options.debugChannel) === null || _e === void 0 ? void 0 : _e.setActionType) {
|
|
365
380
|
this.options.debugChannel.setActionType('scrapeSchema');
|
|
366
381
|
}
|
|
@@ -388,6 +403,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
388
403
|
}),
|
|
389
404
|
scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
390
405
|
var _f, _g;
|
|
406
|
+
if (this.isAborted) {
|
|
407
|
+
this.log('Workflow aborted, stopping scrapeList', logger_1.Level.WARN);
|
|
408
|
+
return;
|
|
409
|
+
}
|
|
391
410
|
if ((_f = this.options.debugChannel) === null || _f === void 0 ? void 0 : _f.setActionType) {
|
|
392
411
|
this.options.debugChannel.setActionType('scrapeList');
|
|
393
412
|
}
|
|
@@ -528,6 +547,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
528
547
|
}
|
|
529
548
|
handlePagination(page, config) {
|
|
530
549
|
return __awaiter(this, void 0, void 0, function* () {
|
|
550
|
+
if (this.isAborted) {
|
|
551
|
+
this.log('Workflow aborted, stopping pagination', logger_1.Level.WARN);
|
|
552
|
+
return [];
|
|
553
|
+
}
|
|
531
554
|
let allResults = [];
|
|
532
555
|
let previousHeight = 0;
|
|
533
556
|
let scrapedItems = new Set();
|
|
@@ -539,6 +562,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
539
562
|
console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
|
|
540
563
|
};
|
|
541
564
|
const scrapeCurrentPage = () => __awaiter(this, void 0, void 0, function* () {
|
|
565
|
+
// Check abort flag before scraping current page
|
|
566
|
+
if (this.isAborted) {
|
|
567
|
+
debugLog("Workflow aborted, stopping scrapeCurrentPage");
|
|
568
|
+
return;
|
|
569
|
+
}
|
|
542
570
|
const results = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
543
571
|
const newResults = results.filter(item => {
|
|
544
572
|
const uniqueKey = JSON.stringify(item);
|
|
@@ -617,6 +645,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
617
645
|
let unchangedResultCounter = 0;
|
|
618
646
|
try {
|
|
619
647
|
while (true) {
|
|
648
|
+
if (this.isAborted) {
|
|
649
|
+
this.log('Workflow aborted during pagination loop', logger_1.Level.WARN);
|
|
650
|
+
return allResults;
|
|
651
|
+
}
|
|
620
652
|
switch (config.pagination.type) {
|
|
621
653
|
case 'scrollDown': {
|
|
622
654
|
let previousResultCount = allResults.length;
|
|
@@ -624,9 +656,14 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
624
656
|
if (checkLimit()) {
|
|
625
657
|
return allResults;
|
|
626
658
|
}
|
|
627
|
-
yield page.evaluate(() =>
|
|
659
|
+
yield page.evaluate(() => {
|
|
660
|
+
const scrollHeight = Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
|
|
661
|
+
window.scrollTo(0, scrollHeight);
|
|
662
|
+
});
|
|
628
663
|
yield page.waitForTimeout(2000);
|
|
629
|
-
const currentHeight = yield page.evaluate(() =>
|
|
664
|
+
const currentHeight = yield page.evaluate(() => {
|
|
665
|
+
return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
|
|
666
|
+
});
|
|
630
667
|
const currentResultCount = allResults.length;
|
|
631
668
|
if (currentResultCount === previousResultCount) {
|
|
632
669
|
unchangedResultCounter++;
|
|
@@ -832,6 +869,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
832
869
|
// let noNewItemsCounter = 0;
|
|
833
870
|
// const MAX_NO_NEW_ITEMS = 2;
|
|
834
871
|
while (true) {
|
|
872
|
+
if (this.isAborted) {
|
|
873
|
+
this.log('Workflow aborted during pagination loop', logger_1.Level.WARN);
|
|
874
|
+
return allResults;
|
|
875
|
+
}
|
|
835
876
|
// Find working button with retry mechanism
|
|
836
877
|
const { button: loadMoreButton, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
|
|
837
878
|
availableSelectors = updatedSelectors;
|
|
@@ -881,9 +922,14 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
881
922
|
}
|
|
882
923
|
// Wait for content to load and check scroll height
|
|
883
924
|
yield page.waitForTimeout(2000);
|
|
884
|
-
yield page.evaluate(() =>
|
|
925
|
+
yield page.evaluate(() => {
|
|
926
|
+
const scrollHeight = Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
|
|
927
|
+
window.scrollTo(0, scrollHeight);
|
|
928
|
+
});
|
|
885
929
|
yield page.waitForTimeout(2000);
|
|
886
|
-
const currentHeight = yield page.evaluate(() =>
|
|
930
|
+
const currentHeight = yield page.evaluate(() => {
|
|
931
|
+
return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
|
|
932
|
+
});
|
|
887
933
|
const heightChanged = currentHeight !== previousHeight;
|
|
888
934
|
previousHeight = currentHeight;
|
|
889
935
|
yield scrapeCurrentPage();
|
|
@@ -962,6 +1008,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
962
1008
|
runLoop(p, workflow) {
|
|
963
1009
|
var _a, _b;
|
|
964
1010
|
return __awaiter(this, void 0, void 0, function* () {
|
|
1011
|
+
if (this.isAborted) {
|
|
1012
|
+
this.log('Workflow aborted in runLoop', logger_1.Level.WARN);
|
|
1013
|
+
return;
|
|
1014
|
+
}
|
|
965
1015
|
let workflowCopy = JSON.parse(JSON.stringify(workflow));
|
|
966
1016
|
workflowCopy = this.removeSpecialSelectors(workflowCopy);
|
|
967
1017
|
// apply ad-blocker to the current page
|
|
@@ -988,6 +1038,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
988
1038
|
let loopIterations = 0;
|
|
989
1039
|
const MAX_LOOP_ITERATIONS = 1000; // Circuit breaker
|
|
990
1040
|
while (true) {
|
|
1041
|
+
if (this.isAborted) {
|
|
1042
|
+
this.log('Workflow aborted during step execution', logger_1.Level.WARN);
|
|
1043
|
+
return;
|
|
1044
|
+
}
|
|
991
1045
|
// Circuit breaker to prevent infinite loops
|
|
992
1046
|
if (++loopIterations > MAX_LOOP_ITERATIONS) {
|
|
993
1047
|
this.log('Maximum loop iterations reached, terminating to prevent infinite loop', logger_1.Level.ERROR);
|
|
@@ -1005,30 +1059,33 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1005
1059
|
yield p.close();
|
|
1006
1060
|
return;
|
|
1007
1061
|
}
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
if (p.isClosed()) {
|
|
1012
|
-
this.log('Page was closed during execution', logger_1.Level.WARN);
|
|
1013
|
-
return;
|
|
1014
|
-
}
|
|
1015
|
-
pageState = yield this.getState(p, workflowCopy, selectors);
|
|
1016
|
-
selectors = [];
|
|
1017
|
-
console.log("Empty selectors:", selectors);
|
|
1018
|
-
}
|
|
1019
|
-
catch (e) {
|
|
1020
|
-
this.log(`Failed to get page state: ${e.message}`, logger_1.Level.ERROR);
|
|
1021
|
-
// If state access fails, attempt graceful recovery
|
|
1022
|
-
if (p.isClosed()) {
|
|
1023
|
-
this.log('Browser has been closed, terminating workflow', logger_1.Level.WARN);
|
|
1024
|
-
return;
|
|
1025
|
-
}
|
|
1026
|
-
// For other errors, continue with empty state to avoid complete failure
|
|
1027
|
-
pageState = { url: p.url(), selectors: [], cookies: {} };
|
|
1028
|
-
}
|
|
1029
|
-
if (this.options.debug) {
|
|
1030
|
-
this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, logger_1.Level.WARN);
|
|
1062
|
+
if (workflowCopy.length === 0) {
|
|
1063
|
+
this.log('All actions completed. Workflow finished.', logger_1.Level.LOG);
|
|
1064
|
+
return;
|
|
1031
1065
|
}
|
|
1066
|
+
// let pageState = {};
|
|
1067
|
+
// try {
|
|
1068
|
+
// // Check if page is still valid before accessing state
|
|
1069
|
+
// if (p.isClosed()) {
|
|
1070
|
+
// this.log('Page was closed during execution', Level.WARN);
|
|
1071
|
+
// return;
|
|
1072
|
+
// }
|
|
1073
|
+
// pageState = await this.getState(p, workflowCopy, selectors);
|
|
1074
|
+
// selectors = [];
|
|
1075
|
+
// console.log("Empty selectors:", selectors)
|
|
1076
|
+
// } catch (e: any) {
|
|
1077
|
+
// this.log(`Failed to get page state: ${e.message}`, Level.ERROR);
|
|
1078
|
+
// // If state access fails, attempt graceful recovery
|
|
1079
|
+
// if (p.isClosed()) {
|
|
1080
|
+
// this.log('Browser has been closed, terminating workflow', Level.WARN);
|
|
1081
|
+
// return;
|
|
1082
|
+
// }
|
|
1083
|
+
// // For other errors, continue with empty state to avoid complete failure
|
|
1084
|
+
// pageState = { url: p.url(), selectors: [], cookies: {} };
|
|
1085
|
+
// }
|
|
1086
|
+
// if (this.options.debug) {
|
|
1087
|
+
// this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, Level.WARN);
|
|
1088
|
+
// }
|
|
1032
1089
|
// const actionId = workflow.findIndex((step) => {
|
|
1033
1090
|
// const isApplicable = this.applicable(step.where, pageState, usedActions);
|
|
1034
1091
|
// console.log("-------------------------------------------------------------");
|
|
@@ -1038,7 +1095,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1038
1095
|
// console.log("-------------------------------------------------------------");
|
|
1039
1096
|
// return isApplicable;
|
|
1040
1097
|
// });
|
|
1041
|
-
actionId = this.getMatchingActionId(workflowCopy, pageState, usedActions);
|
|
1098
|
+
// actionId = this.getMatchingActionId(workflowCopy, pageState, usedActions);
|
|
1099
|
+
const actionId = workflowCopy.length - 1;
|
|
1042
1100
|
const action = workflowCopy[actionId];
|
|
1043
1101
|
console.log("MATCHED ACTION:", action);
|
|
1044
1102
|
console.log("MATCHED ACTION ID:", actionId);
|
|
@@ -1053,6 +1111,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1053
1111
|
return;
|
|
1054
1112
|
}
|
|
1055
1113
|
lastAction = action;
|
|
1114
|
+
if (this.isAborted) {
|
|
1115
|
+
this.log('Workflow aborted before action execution', logger_1.Level.WARN);
|
|
1116
|
+
return;
|
|
1117
|
+
}
|
|
1056
1118
|
try {
|
|
1057
1119
|
console.log("Carrying out:", action.what);
|
|
1058
1120
|
yield this.carryOutSteps(p, action.what);
|
|
@@ -1060,12 +1122,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1060
1122
|
workflowCopy.splice(actionId, 1);
|
|
1061
1123
|
console.log(`Action with ID ${action.id} removed from the workflow copy.`);
|
|
1062
1124
|
// const newSelectors = this.getPreviousSelectors(workflow, actionId);
|
|
1063
|
-
const newSelectors = this.getSelectors(workflowCopy);
|
|
1064
|
-
newSelectors.forEach(selector => {
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
});
|
|
1125
|
+
// const newSelectors = this.getSelectors(workflowCopy);
|
|
1126
|
+
// newSelectors.forEach(selector => {
|
|
1127
|
+
// if (!selectors.includes(selector)) {
|
|
1128
|
+
// selectors.push(selector);
|
|
1129
|
+
// }
|
|
1130
|
+
// });
|
|
1069
1131
|
// Reset loop iteration counter on successful action
|
|
1070
1132
|
loopIterations = 0;
|
|
1071
1133
|
}
|