maxun-core 0.0.21 → 0.0.23

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
@@ -53,10 +53,15 @@ export default class Interpreter extends EventEmitter {
53
53
  private options;
54
54
  private concurrency;
55
55
  private stopper;
56
+ private isAborted;
56
57
  private log;
57
58
  private blocker;
58
59
  private cumulativeResults;
59
60
  constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
61
+ /**
62
+ * Sets the abort flag to immediately stop all operations
63
+ */
64
+ abort(): void;
60
65
  private applyAdBlocker;
61
66
  private disableAdBlocker;
62
67
  private getSelectors;
@@ -52,6 +52,7 @@ class Interpreter extends events_1.EventEmitter {
52
52
  var _a;
53
53
  super();
54
54
  this.stopper = null;
55
+ this.isAborted = false;
55
56
  this.blocker = null;
56
57
  this.cumulativeResults = [];
57
58
  this.workflow = workflow.workflow;
@@ -83,6 +84,12 @@ class Interpreter extends events_1.EventEmitter {
83
84
  this.blocker = null;
84
85
  });
85
86
  }
87
+ /**
88
+ * Sets the abort flag to immediately stop all operations
89
+ */
90
+ abort() {
91
+ this.isAborted = true;
92
+ }
86
93
  applyAdBlocker(page) {
87
94
  return __awaiter(this, void 0, void 0, function* () {
88
95
  if (this.blocker) {
@@ -304,6 +311,10 @@ class Interpreter extends events_1.EventEmitter {
304
311
  carryOutSteps(page, steps) {
305
312
  var _a;
306
313
  return __awaiter(this, void 0, void 0, function* () {
314
+ if (this.isAborted) {
315
+ this.log('Workflow aborted, stopping execution', logger_1.Level.WARN);
316
+ return;
317
+ }
307
318
  /**
308
319
  * Defines overloaded (or added) methods/actions usable in the workflow.
309
320
  * If a method overloads any existing method of the Page class, it accepts the same set
@@ -361,6 +372,10 @@ class Interpreter extends events_1.EventEmitter {
361
372
  }),
362
373
  scrapeSchema: (schema) => __awaiter(this, void 0, void 0, function* () {
363
374
  var _e;
375
+ if (this.isAborted) {
376
+ this.log('Workflow aborted, stopping scrapeSchema', logger_1.Level.WARN);
377
+ return;
378
+ }
364
379
  if ((_e = this.options.debugChannel) === null || _e === void 0 ? void 0 : _e.setActionType) {
365
380
  this.options.debugChannel.setActionType('scrapeSchema');
366
381
  }
@@ -388,6 +403,10 @@ class Interpreter extends events_1.EventEmitter {
388
403
  }),
389
404
  scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
390
405
  var _f, _g;
406
+ if (this.isAborted) {
407
+ this.log('Workflow aborted, stopping scrapeList', logger_1.Level.WARN);
408
+ return;
409
+ }
391
410
  if ((_f = this.options.debugChannel) === null || _f === void 0 ? void 0 : _f.setActionType) {
392
411
  this.options.debugChannel.setActionType('scrapeList');
393
412
  }
@@ -528,6 +547,10 @@ class Interpreter extends events_1.EventEmitter {
528
547
  }
529
548
  handlePagination(page, config) {
530
549
  return __awaiter(this, void 0, void 0, function* () {
550
+ if (this.isAborted) {
551
+ this.log('Workflow aborted, stopping pagination', logger_1.Level.WARN);
552
+ return [];
553
+ }
531
554
  let allResults = [];
532
555
  let previousHeight = 0;
533
556
  let scrapedItems = new Set();
@@ -539,6 +562,11 @@ class Interpreter extends events_1.EventEmitter {
539
562
  console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
540
563
  };
541
564
  const scrapeCurrentPage = () => __awaiter(this, void 0, void 0, function* () {
565
+ // Check abort flag before scraping current page
566
+ if (this.isAborted) {
567
+ debugLog("Workflow aborted, stopping scrapeCurrentPage");
568
+ return;
569
+ }
542
570
  const results = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
543
571
  const newResults = results.filter(item => {
544
572
  const uniqueKey = JSON.stringify(item);
@@ -617,6 +645,10 @@ class Interpreter extends events_1.EventEmitter {
617
645
  let unchangedResultCounter = 0;
618
646
  try {
619
647
  while (true) {
648
+ if (this.isAborted) {
649
+ this.log('Workflow aborted during pagination loop', logger_1.Level.WARN);
650
+ return allResults;
651
+ }
620
652
  switch (config.pagination.type) {
621
653
  case 'scrollDown': {
622
654
  let previousResultCount = allResults.length;
@@ -624,9 +656,14 @@ class Interpreter extends events_1.EventEmitter {
624
656
  if (checkLimit()) {
625
657
  return allResults;
626
658
  }
627
- yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
659
+ yield page.evaluate(() => {
660
+ const scrollHeight = Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
661
+ window.scrollTo(0, scrollHeight);
662
+ });
628
663
  yield page.waitForTimeout(2000);
629
- const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
664
+ const currentHeight = yield page.evaluate(() => {
665
+ return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
666
+ });
630
667
  const currentResultCount = allResults.length;
631
668
  if (currentResultCount === previousResultCount) {
632
669
  unchangedResultCounter++;
@@ -832,6 +869,10 @@ class Interpreter extends events_1.EventEmitter {
832
869
  // let noNewItemsCounter = 0;
833
870
  // const MAX_NO_NEW_ITEMS = 2;
834
871
  while (true) {
872
+ if (this.isAborted) {
873
+ this.log('Workflow aborted during pagination loop', logger_1.Level.WARN);
874
+ return allResults;
875
+ }
835
876
  // Find working button with retry mechanism
836
877
  const { button: loadMoreButton, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
837
878
  availableSelectors = updatedSelectors;
@@ -881,9 +922,14 @@ class Interpreter extends events_1.EventEmitter {
881
922
  }
882
923
  // Wait for content to load and check scroll height
883
924
  yield page.waitForTimeout(2000);
884
- yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
925
+ yield page.evaluate(() => {
926
+ const scrollHeight = Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
927
+ window.scrollTo(0, scrollHeight);
928
+ });
885
929
  yield page.waitForTimeout(2000);
886
- const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
930
+ const currentHeight = yield page.evaluate(() => {
931
+ return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
932
+ });
887
933
  const heightChanged = currentHeight !== previousHeight;
888
934
  previousHeight = currentHeight;
889
935
  yield scrapeCurrentPage();
@@ -962,6 +1008,10 @@ class Interpreter extends events_1.EventEmitter {
962
1008
  runLoop(p, workflow) {
963
1009
  var _a, _b;
964
1010
  return __awaiter(this, void 0, void 0, function* () {
1011
+ if (this.isAborted) {
1012
+ this.log('Workflow aborted in runLoop', logger_1.Level.WARN);
1013
+ return;
1014
+ }
965
1015
  let workflowCopy = JSON.parse(JSON.stringify(workflow));
966
1016
  workflowCopy = this.removeSpecialSelectors(workflowCopy);
967
1017
  // apply ad-blocker to the current page
@@ -988,6 +1038,10 @@ class Interpreter extends events_1.EventEmitter {
988
1038
  let loopIterations = 0;
989
1039
  const MAX_LOOP_ITERATIONS = 1000; // Circuit breaker
990
1040
  while (true) {
1041
+ if (this.isAborted) {
1042
+ this.log('Workflow aborted during step execution', logger_1.Level.WARN);
1043
+ return;
1044
+ }
991
1045
  // Circuit breaker to prevent infinite loops
992
1046
  if (++loopIterations > MAX_LOOP_ITERATIONS) {
993
1047
  this.log('Maximum loop iterations reached, terminating to prevent infinite loop', logger_1.Level.ERROR);
@@ -1005,30 +1059,33 @@ class Interpreter extends events_1.EventEmitter {
1005
1059
  yield p.close();
1006
1060
  return;
1007
1061
  }
1008
- let pageState = {};
1009
- try {
1010
- // Check if page is still valid before accessing state
1011
- if (p.isClosed()) {
1012
- this.log('Page was closed during execution', logger_1.Level.WARN);
1013
- return;
1014
- }
1015
- pageState = yield this.getState(p, workflowCopy, selectors);
1016
- selectors = [];
1017
- console.log("Empty selectors:", selectors);
1018
- }
1019
- catch (e) {
1020
- this.log(`Failed to get page state: ${e.message}`, logger_1.Level.ERROR);
1021
- // If state access fails, attempt graceful recovery
1022
- if (p.isClosed()) {
1023
- this.log('Browser has been closed, terminating workflow', logger_1.Level.WARN);
1024
- return;
1025
- }
1026
- // For other errors, continue with empty state to avoid complete failure
1027
- pageState = { url: p.url(), selectors: [], cookies: {} };
1028
- }
1029
- if (this.options.debug) {
1030
- this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, logger_1.Level.WARN);
1062
+ if (workflowCopy.length === 0) {
1063
+ this.log('All actions completed. Workflow finished.', logger_1.Level.LOG);
1064
+ return;
1031
1065
  }
1066
+ // let pageState = {};
1067
+ // try {
1068
+ // // Check if page is still valid before accessing state
1069
+ // if (p.isClosed()) {
1070
+ // this.log('Page was closed during execution', Level.WARN);
1071
+ // return;
1072
+ // }
1073
+ // pageState = await this.getState(p, workflowCopy, selectors);
1074
+ // selectors = [];
1075
+ // console.log("Empty selectors:", selectors)
1076
+ // } catch (e: any) {
1077
+ // this.log(`Failed to get page state: ${e.message}`, Level.ERROR);
1078
+ // // If state access fails, attempt graceful recovery
1079
+ // if (p.isClosed()) {
1080
+ // this.log('Browser has been closed, terminating workflow', Level.WARN);
1081
+ // return;
1082
+ // }
1083
+ // // For other errors, continue with empty state to avoid complete failure
1084
+ // pageState = { url: p.url(), selectors: [], cookies: {} };
1085
+ // }
1086
+ // if (this.options.debug) {
1087
+ // this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, Level.WARN);
1088
+ // }
1032
1089
  // const actionId = workflow.findIndex((step) => {
1033
1090
  // const isApplicable = this.applicable(step.where, pageState, usedActions);
1034
1091
  // console.log("-------------------------------------------------------------");
@@ -1038,7 +1095,8 @@ class Interpreter extends events_1.EventEmitter {
1038
1095
  // console.log("-------------------------------------------------------------");
1039
1096
  // return isApplicable;
1040
1097
  // });
1041
- actionId = this.getMatchingActionId(workflowCopy, pageState, usedActions);
1098
+ // actionId = this.getMatchingActionId(workflowCopy, pageState, usedActions);
1099
+ const actionId = workflowCopy.length - 1;
1042
1100
  const action = workflowCopy[actionId];
1043
1101
  console.log("MATCHED ACTION:", action);
1044
1102
  console.log("MATCHED ACTION ID:", actionId);
@@ -1053,6 +1111,10 @@ class Interpreter extends events_1.EventEmitter {
1053
1111
  return;
1054
1112
  }
1055
1113
  lastAction = action;
1114
+ if (this.isAborted) {
1115
+ this.log('Workflow aborted before action execution', logger_1.Level.WARN);
1116
+ return;
1117
+ }
1056
1118
  try {
1057
1119
  console.log("Carrying out:", action.what);
1058
1120
  yield this.carryOutSteps(p, action.what);
@@ -1060,12 +1122,12 @@ class Interpreter extends events_1.EventEmitter {
1060
1122
  workflowCopy.splice(actionId, 1);
1061
1123
  console.log(`Action with ID ${action.id} removed from the workflow copy.`);
1062
1124
  // const newSelectors = this.getPreviousSelectors(workflow, actionId);
1063
- const newSelectors = this.getSelectors(workflowCopy);
1064
- newSelectors.forEach(selector => {
1065
- if (!selectors.includes(selector)) {
1066
- selectors.push(selector);
1067
- }
1068
- });
1125
+ // const newSelectors = this.getSelectors(workflowCopy);
1126
+ // newSelectors.forEach(selector => {
1127
+ // if (!selectors.includes(selector)) {
1128
+ // selectors.push(selector);
1129
+ // }
1130
+ // });
1069
1131
  // Reset loop iteration counter on successful action
1070
1132
  loopIterations = 0;
1071
1133
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.21",
3
+ "version": "0.0.23",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",