maxun-core 0.0.6 → 0.0.7

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -220,8 +220,27 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
220
220
  return __awaiter(this, void 0, void 0, function* () {
221
221
  const scrapedData = [];
222
222
  while (scrapedData.length < limit) {
223
- // Get all parent elements matching the listSelector
224
- const parentElements = Array.from(document.querySelectorAll(listSelector));
223
+ let parentElements = Array.from(document.querySelectorAll(listSelector));
224
+ // If we only got one element or none, try a more generic approach
225
+ if (limit > 1 && parentElements.length <= 1) {
226
+ const [containerSelector, _] = listSelector.split('>').map(s => s.trim());
227
+ const container = document.querySelector(containerSelector);
228
+ if (container) {
229
+ const allChildren = Array.from(container.children);
230
+ const firstMatch = document.querySelector(listSelector);
231
+ if (firstMatch) {
232
+ // Get classes from the first matching element
233
+ const firstMatchClasses = Array.from(firstMatch.classList);
234
+ // Find similar elements by matching most of their classes
235
+ parentElements = allChildren.filter(element => {
236
+ const elementClasses = Array.from(element.classList);
237
+ // Element should share at least 70% of classes with the first match
238
+ const commonClasses = firstMatchClasses.filter(cls => elementClasses.includes(cls));
239
+ return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7);
240
+ });
241
+ }
242
+ }
243
+ }
225
244
  // Iterate through each parent element
226
245
  for (const parent of parentElements) {
227
246
  if (scrapedData.length >= limit)
@@ -254,6 +273,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
254
273
  }
255
274
  scrapedData.push(record);
256
275
  }
276
+ // If we've processed all available elements and still haven't reached the limit,
277
+ // break to avoid infinite loop
278
+ if (parentElements.length === 0 || scrapedData.length >= parentElements.length) {
279
+ break;
280
+ }
257
281
  }
258
282
  return scrapedData;
259
283
  });
@@ -84,14 +84,24 @@ class Interpreter extends events_1.EventEmitter {
84
84
  applyAdBlocker(page) {
85
85
  return __awaiter(this, void 0, void 0, function* () {
86
86
  if (this.blocker) {
87
- yield this.blocker.enableBlockingInPage(page);
87
+ try {
88
+ yield this.blocker.enableBlockingInPage(page);
89
+ }
90
+ catch (err) {
91
+ this.log(`Ad-blocker operation failed:`, logger_1.Level.ERROR);
92
+ }
88
93
  }
89
94
  });
90
95
  }
91
96
  disableAdBlocker(page) {
92
97
  return __awaiter(this, void 0, void 0, function* () {
93
98
  if (this.blocker) {
94
- yield this.blocker.disableBlockingInPage(page);
99
+ try {
100
+ yield this.blocker.disableBlockingInPage(page);
101
+ }
102
+ catch (err) {
103
+ this.log(`Ad-blocker operation failed:`, logger_1.Level.ERROR);
104
+ }
95
105
  }
96
106
  });
97
107
  }
@@ -156,8 +166,8 @@ class Interpreter extends events_1.EventEmitter {
156
166
  // const actionable = async (selector: string): Promise<boolean> => {
157
167
  // try {
158
168
  // const proms = [
159
- // page.isEnabled(selector, { timeout: 5000 }),
160
- // page.isVisible(selector, { timeout: 5000 }),
169
+ // page.isEnabled(selector, { timeout: 10000 }),
170
+ // page.isVisible(selector, { timeout: 10000 }),
161
171
  // ];
162
172
  // return await Promise.all(proms).then((bools) => bools.every((x) => x));
163
173
  // } catch (e) {
@@ -176,6 +186,15 @@ class Interpreter extends events_1.EventEmitter {
176
186
  // return [];
177
187
  // }),
178
188
  // ).then((x) => x.flat());
189
+ const presentSelectors = yield Promise.all(selectors.map((selector) => __awaiter(this, void 0, void 0, function* () {
190
+ try {
191
+ yield page.waitForSelector(selector, { state: 'attached' });
192
+ return [selector];
193
+ }
194
+ catch (e) {
195
+ return [];
196
+ }
197
+ }))).then((x) => x.flat());
179
198
  const action = workflowCopy[workflowCopy.length - 1];
180
199
  // console.log("Next action:", action)
181
200
  let url = page.url();
@@ -186,7 +205,7 @@ class Interpreter extends events_1.EventEmitter {
186
205
  url,
187
206
  cookies: (yield page.context().cookies([page.url()]))
188
207
  .reduce((p, cookie) => (Object.assign(Object.assign({}, p), { [cookie.name]: cookie.value })), {}),
189
- selectors,
208
+ selectors: presentSelectors,
190
209
  };
191
210
  });
192
211
  }
@@ -420,7 +439,12 @@ class Interpreter extends events_1.EventEmitter {
420
439
  yield executeAction(invokee, methodName, step.args);
421
440
  }
422
441
  catch (error) {
423
- yield executeAction(invokee, methodName, [step.args[0], { force: true }]);
442
+ try {
443
+ yield executeAction(invokee, methodName, [step.args[0], { force: true }]);
444
+ }
445
+ catch (error) {
446
+ continue;
447
+ }
424
448
  }
425
449
  }
426
450
  else {
@@ -549,7 +573,12 @@ class Interpreter extends events_1.EventEmitter {
549
573
  return __awaiter(this, void 0, void 0, function* () {
550
574
  const workflowCopy = JSON.parse(JSON.stringify(workflow));
551
575
  // apply ad-blocker to the current page
552
- yield this.applyAdBlocker(p);
576
+ try {
577
+ yield this.applyAdBlocker(p);
578
+ }
579
+ catch (error) {
580
+ this.log(`Failed to apply ad-blocker: ${error.message}`, logger_1.Level.ERROR);
581
+ }
553
582
  const usedActions = [];
554
583
  let selectors = [];
555
584
  let lastAction = null;
@@ -660,6 +689,7 @@ class Interpreter extends events_1.EventEmitter {
660
689
  return __awaiter(this, void 0, void 0, function* () {
661
690
  this.log('Starting the workflow.', logger_1.Level.LOG);
662
691
  const context = page.context();
692
+ page.setDefaultNavigationTimeout(100000);
663
693
  // Check proxy settings from context options
664
694
  const contextOptions = context._options;
665
695
  const hasProxy = !!(contextOptions === null || contextOptions === void 0 ? void 0 : contextOptions.proxy);
@@ -3,43 +3,43 @@
3
3
  */
4
4
  export default class Concurrency {
5
5
  /**
6
- * Maximum number of workers running in parallel. If set to `null`, there is no limit.
7
- */
6
+ * Maximum number of workers running in parallel. If set to `null`, there is no limit.
7
+ */
8
8
  maxConcurrency: number;
9
9
  /**
10
- * Number of currently active workers.
11
- */
10
+ * Number of currently active workers.
11
+ */
12
12
  activeWorkers: number;
13
13
  /**
14
- * Queue of jobs waiting to be completed.
15
- */
14
+ * Queue of jobs waiting to be completed.
15
+ */
16
16
  private jobQueue;
17
17
  /**
18
- * "Resolve" callbacks of the waitForCompletion() promises.
19
- */
18
+ * "Resolve" callbacks of the waitForCompletion() promises.
19
+ */
20
20
  private waiting;
21
21
  /**
22
- * Constructs a new instance of concurrency manager.
23
- * @param {number} maxConcurrency Maximum number of workers running in parallel.
24
- */
22
+ * Constructs a new instance of concurrency manager.
23
+ * @param {number} maxConcurrency Maximum number of workers running in parallel.
24
+ */
25
25
  constructor(maxConcurrency: number);
26
26
  /**
27
- * Takes a waiting job out of the queue and runs it.
28
- */
27
+ * Takes a waiting job out of the queue and runs it.
28
+ */
29
29
  private runNextJob;
30
30
  /**
31
- * Pass a job (a time-demanding async function) to the concurrency manager. \
32
- * The time of the job's execution depends on the concurrency manager itself
33
- * (given a generous enough `maxConcurrency` value, it might be immediate,
34
- * but this is not guaranteed).
35
- * @param worker Async function to be executed (job to be processed).
36
- */
31
+ * Pass a job (a time-demanding async function) to the concurrency manager. \
32
+ * The time of the job's execution depends on the concurrency manager itself
33
+ * (given a generous enough `maxConcurrency` value, it might be immediate,
34
+ * but this is not guaranteed).
35
+ * @param worker Async function to be executed (job to be processed).
36
+ */
37
37
  addJob(job: () => Promise<any>): void;
38
38
  /**
39
- * Waits until there is no running nor waiting job. \
40
- * If the concurrency manager is idle at the time of calling this function,
41
- * it waits until at least one job is completed (can be "presubscribed").
42
- * @returns Promise, resolved after there is no running/waiting worker.
43
- */
39
+ * Waits until there is no running nor waiting job. \
40
+ * If the concurrency manager is idle at the time of calling this function,
41
+ * it waits until at least one job is completed (can be "presubscribed").
42
+ * @returns Promise, resolved after there is no running/waiting worker.
43
+ */
44
44
  waitForCompletion(): Promise<void>;
45
45
  }
@@ -5,31 +5,31 @@ Object.defineProperty(exports, "__esModule", { value: true });
5
5
  */
6
6
  class Concurrency {
7
7
  /**
8
- * Constructs a new instance of concurrency manager.
9
- * @param {number} maxConcurrency Maximum number of workers running in parallel.
10
- */
8
+ * Constructs a new instance of concurrency manager.
9
+ * @param {number} maxConcurrency Maximum number of workers running in parallel.
10
+ */
11
11
  constructor(maxConcurrency) {
12
12
  /**
13
- * Maximum number of workers running in parallel. If set to `null`, there is no limit.
14
- */
13
+ * Maximum number of workers running in parallel. If set to `null`, there is no limit.
14
+ */
15
15
  this.maxConcurrency = 1;
16
16
  /**
17
- * Number of currently active workers.
18
- */
17
+ * Number of currently active workers.
18
+ */
19
19
  this.activeWorkers = 0;
20
20
  /**
21
- * Queue of jobs waiting to be completed.
22
- */
21
+ * Queue of jobs waiting to be completed.
22
+ */
23
23
  this.jobQueue = [];
24
24
  /**
25
- * "Resolve" callbacks of the waitForCompletion() promises.
26
- */
25
+ * "Resolve" callbacks of the waitForCompletion() promises.
26
+ */
27
27
  this.waiting = [];
28
28
  this.maxConcurrency = maxConcurrency;
29
29
  }
30
30
  /**
31
- * Takes a waiting job out of the queue and runs it.
32
- */
31
+ * Takes a waiting job out of the queue and runs it.
32
+ */
33
33
  runNextJob() {
34
34
  const job = this.jobQueue.pop();
35
35
  if (job) {
@@ -49,12 +49,12 @@ class Concurrency {
49
49
  }
50
50
  }
51
51
  /**
52
- * Pass a job (a time-demanding async function) to the concurrency manager. \
53
- * The time of the job's execution depends on the concurrency manager itself
54
- * (given a generous enough `maxConcurrency` value, it might be immediate,
55
- * but this is not guaranteed).
56
- * @param worker Async function to be executed (job to be processed).
57
- */
52
+ * Pass a job (a time-demanding async function) to the concurrency manager. \
53
+ * The time of the job's execution depends on the concurrency manager itself
54
+ * (given a generous enough `maxConcurrency` value, it might be immediate,
55
+ * but this is not guaranteed).
56
+ * @param worker Async function to be executed (job to be processed).
57
+ */
58
58
  addJob(job) {
59
59
  // console.debug("Adding a worker!");
60
60
  this.jobQueue.push(job);
@@ -67,11 +67,11 @@ class Concurrency {
67
67
  }
68
68
  }
69
69
  /**
70
- * Waits until there is no running nor waiting job. \
71
- * If the concurrency manager is idle at the time of calling this function,
72
- * it waits until at least one job is completed (can be "presubscribed").
73
- * @returns Promise, resolved after there is no running/waiting worker.
74
- */
70
+ * Waits until there is no running nor waiting job. \
71
+ * If the concurrency manager is idle at the time of calling this function,
72
+ * it waits until at least one job is completed (can be "presubscribed").
73
+ * @returns Promise, resolved after there is no running/waiting worker.
74
+ */
75
75
  waitForCompletion() {
76
76
  return new Promise((res) => {
77
77
  this.waiting.push(res);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.6",
3
+ "version": "0.0.7",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",