maxun-core 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -240,12 +240,12 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
240
240
  else if (attribute === 'src') {
241
241
  // Handle relative 'src' URLs
242
242
  const src = fieldElement.getAttribute('src');
243
- record[label] = src ? new URL(src, baseUrl).href : null;
243
+ record[label] = src ? new URL(src, window.location.origin).href : null;
244
244
  }
245
245
  else if (attribute === 'href') {
246
246
  // Handle relative 'href' URLs
247
247
  const href = fieldElement.getAttribute('href');
248
- record[label] = href ? new URL(href, baseUrl).href : null;
248
+ record[label] = href ? new URL(href, window.location.origin).href : null;
249
249
  }
250
250
  else {
251
251
  record[label] = fieldElement.getAttribute(attribute);
@@ -56,6 +56,7 @@ export default class Interpreter extends EventEmitter {
56
56
  constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
57
57
  private applyAdBlocker;
58
58
  private disableAdBlocker;
59
+ private getSelectors;
59
60
  /**
60
61
  * Returns the context object from given Page and the current workflow.\
61
62
  * \
@@ -84,6 +85,7 @@ export default class Interpreter extends EventEmitter {
84
85
  */
85
86
  private carryOutSteps;
86
87
  private handlePagination;
88
+ private getMatchingActionId;
87
89
  private runLoop;
88
90
  private ensureScriptsLoaded;
89
91
  /**
@@ -75,7 +75,7 @@ class Interpreter extends events_1.EventEmitter {
75
75
  oldLog(...args);
76
76
  };
77
77
  }
78
- adblocker_playwright_1.PlaywrightBlocker.fromPrebuiltAdsAndTracking(cross_fetch_1.default).then(blocker => {
78
+ adblocker_playwright_1.PlaywrightBlocker.fromLists(cross_fetch_1.default, ['https://easylist.to/easylist/easylist.txt']).then(blocker => {
79
79
  this.blocker = blocker;
80
80
  }).catch(err => {
81
81
  this.log(`Failed to initialize ad-blocker:`, logger_1.Level.ERROR);
@@ -95,6 +95,43 @@ class Interpreter extends events_1.EventEmitter {
95
95
  }
96
96
  });
97
97
  }
98
+ // private getSelectors(workflow: Workflow, actionId: number): string[] {
99
+ // const selectors: string[] = [];
100
+ // // Validate actionId
101
+ // if (actionId <= 0) {
102
+ // console.log("No previous selectors to collect.");
103
+ // return selectors; // Empty array as there are no previous steps
104
+ // }
105
+ // // Iterate from the start up to (but not including) actionId
106
+ // for (let index = 0; index < actionId; index++) {
107
+ // const currentSelectors = workflow[index]?.where?.selectors;
108
+ // console.log(`Selectors at step ${index}:`, currentSelectors);
109
+ // if (currentSelectors && currentSelectors.length > 0) {
110
+ // currentSelectors.forEach((selector) => {
111
+ // if (!selectors.includes(selector)) {
112
+ // selectors.push(selector); // Avoid duplicates
113
+ // }
114
+ // });
115
+ // }
116
+ // }
117
+ // console.log("Collected Selectors:", selectors);
118
+ // return selectors;
119
+ // }
120
+ getSelectors(workflow) {
121
+ var _a, _b;
122
+ const selectorsSet = new Set();
123
+ if (workflow.length === 0) {
124
+ return [];
125
+ }
126
+ for (let index = workflow.length - 1; index >= 0; index--) {
127
+ const currentSelectors = (_b = (_a = workflow[index]) === null || _a === void 0 ? void 0 : _a.where) === null || _b === void 0 ? void 0 : _b.selectors;
128
+ if (currentSelectors && currentSelectors.length > 0) {
129
+ currentSelectors.forEach((selector) => selectorsSet.add(selector));
130
+ return Array.from(selectorsSet);
131
+ }
132
+ }
133
+ return [];
134
+ }
98
135
  /**
99
136
  * Returns the context object from given Page and the current workflow.\
100
137
  * \
@@ -104,44 +141,52 @@ class Interpreter extends events_1.EventEmitter {
104
141
  * @param workflow Current **initialized** workflow (array of where-what pairs).
105
142
  * @returns {PageState} State of the current page.
106
143
  */
107
- getState(page, workflow) {
144
+ getState(page, workflowCopy, selectors) {
108
145
  return __awaiter(this, void 0, void 0, function* () {
109
146
  /**
110
147
  * All the selectors present in the current Workflow
111
148
  */
112
- const selectors = preprocessor_1.default.extractSelectors(workflow);
149
+ // const selectors = Preprocessor.extractSelectors(workflow);
150
+ // console.log("Current selectors:", selectors);
113
151
  /**
114
152
  * Determines whether the element targetted by the selector is [actionable](https://playwright.dev/docs/actionability).
115
153
  * @param selector Selector to be queried
116
154
  * @returns True if the targetted element is actionable, false otherwise.
117
155
  */
118
- const actionable = (selector) => __awaiter(this, void 0, void 0, function* () {
119
- try {
120
- const proms = [
121
- page.isEnabled(selector, { timeout: 500 }),
122
- page.isVisible(selector, { timeout: 500 }),
123
- ];
124
- return yield Promise.all(proms).then((bools) => bools.every((x) => x));
125
- }
126
- catch (e) {
127
- // log(<Error>e, Level.ERROR);
128
- return false;
129
- }
130
- });
156
+ // const actionable = async (selector: string): Promise<boolean> => {
157
+ // try {
158
+ // const proms = [
159
+ // page.isEnabled(selector, { timeout: 5000 }),
160
+ // page.isVisible(selector, { timeout: 5000 }),
161
+ // ];
162
+ // return await Promise.all(proms).then((bools) => bools.every((x) => x));
163
+ // } catch (e) {
164
+ // // log(<Error>e, Level.ERROR);
165
+ // return false;
166
+ // }
167
+ // };
131
168
  /**
132
169
  * Object of selectors present in the current page.
133
170
  */
134
- const presentSelectors = yield Promise.all(selectors.map((selector) => __awaiter(this, void 0, void 0, function* () {
135
- if (yield actionable(selector)) {
136
- return [selector];
137
- }
138
- return [];
139
- }))).then((x) => x.flat());
171
+ // const presentSelectors: SelectorArray = await Promise.all(
172
+ // selectors.map(async (selector) => {
173
+ // if (await actionable(selector)) {
174
+ // return [selector];
175
+ // }
176
+ // return [];
177
+ // }),
178
+ // ).then((x) => x.flat());
179
+ const action = workflowCopy[workflowCopy.length - 1];
180
+ // console.log("Next action:", action)
181
+ let url = page.url();
182
+ if (action && action.where.url !== url && action.where.url !== "about:blank") {
183
+ url = action.where.url;
184
+ }
140
185
  return {
141
- url: page.url(),
186
+ url,
142
187
  cookies: (yield page.context().cookies([page.url()]))
143
188
  .reduce((p, cookie) => (Object.assign(Object.assign({}, p), { [cookie.name]: cookie.value })), {}),
144
- selectors: presentSelectors,
189
+ selectors,
145
190
  };
146
191
  });
147
192
  }
@@ -228,7 +273,6 @@ class Interpreter extends events_1.EventEmitter {
228
273
  * @param steps Array of actions.
229
274
  */
230
275
  carryOutSteps(page, steps) {
231
- var _a;
232
276
  return __awaiter(this, void 0, void 0, function* () {
233
277
  /**
234
278
  * Defines overloaded (or added) methods/actions usable in the workflow.
@@ -300,6 +344,7 @@ class Interpreter extends events_1.EventEmitter {
300
344
  console.log("CUMULATIVE results:", this.cumulativeResults);
301
345
  console.log("MERGED results:", mergedResult);
302
346
  yield this.options.serializableCallback(mergedResult);
347
+ // await this.options.serializableCallback(scrapeResult);
303
348
  }),
304
349
  scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
305
350
  yield this.ensureScriptsLoaded(page);
@@ -338,6 +383,15 @@ class Interpreter extends events_1.EventEmitter {
338
383
  });
339
384
  }),
340
385
  };
386
+ const executeAction = (invokee, methodName, args) => __awaiter(this, void 0, void 0, function* () {
387
+ console.log("Executing action:", methodName, args);
388
+ if (!args || Array.isArray(args)) {
389
+ yield invokee[methodName](...(args !== null && args !== void 0 ? args : []));
390
+ }
391
+ else {
392
+ yield invokee[methodName](args);
393
+ }
394
+ });
341
395
  for (const step of steps) {
342
396
  this.log(`Launching ${String(step.action)}`, logger_1.Level.LOG);
343
397
  if (step.action in wawActions) {
@@ -353,11 +407,24 @@ class Interpreter extends events_1.EventEmitter {
353
407
  for (const level of levels.splice(0, levels.length - 1)) {
354
408
  invokee = invokee[level];
355
409
  }
356
- if (!step.args || Array.isArray(step.args)) {
357
- yield invokee[methodName](...((_a = step.args) !== null && _a !== void 0 ? _a : []));
410
+ if (methodName === 'waitForLoadState') {
411
+ try {
412
+ yield executeAction(invokee, methodName, step.args);
413
+ }
414
+ catch (error) {
415
+ yield executeAction(invokee, methodName, 'domcontentloaded');
416
+ }
417
+ }
418
+ else if (methodName === 'click') {
419
+ try {
420
+ yield executeAction(invokee, methodName, step.args);
421
+ }
422
+ catch (error) {
423
+ yield executeAction(invokee, methodName, [step.args[0], { force: true }]);
424
+ }
358
425
  }
359
426
  else {
360
- yield invokee[methodName](step.args);
427
+ yield executeAction(invokee, methodName, step.args);
361
428
  }
362
429
  }
363
430
  yield new Promise((res) => { setTimeout(res, 500); });
@@ -396,6 +463,7 @@ class Interpreter extends events_1.EventEmitter {
396
463
  break;
397
464
  case 'clickNext':
398
465
  const pageResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
466
+ // console.log("Page results:", pageResults);
399
467
  // Filter out already scraped items
400
468
  const newResults = pageResults.filter(item => {
401
469
  const uniqueKey = JSON.stringify(item);
@@ -413,7 +481,7 @@ class Interpreter extends events_1.EventEmitter {
413
481
  return allResults; // No more pages to scrape
414
482
  }
415
483
  yield Promise.all([
416
- nextButton.click(),
484
+ nextButton.dispatchEvent('click'),
417
485
  page.waitForNavigation({ waitUntil: 'networkidle' })
418
486
  ]);
419
487
  yield page.waitForTimeout(1000);
@@ -428,7 +496,7 @@ class Interpreter extends events_1.EventEmitter {
428
496
  return allResults;
429
497
  }
430
498
  // Click the 'Load More' button to load additional items
431
- yield loadMoreButton.click();
499
+ yield loadMoreButton.dispatchEvent('click');
432
500
  yield page.waitForTimeout(2000); // Wait for new items to load
433
501
  // After clicking 'Load More', scroll down to load more items
434
502
  yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
@@ -462,13 +530,30 @@ class Interpreter extends events_1.EventEmitter {
462
530
  return allResults;
463
531
  });
464
532
  }
533
+ getMatchingActionId(workflow, pageState, usedActions) {
534
+ for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
535
+ const step = workflow[actionId];
536
+ const isApplicable = this.applicable(step.where, pageState, usedActions);
537
+ console.log("-------------------------------------------------------------");
538
+ console.log(`Where:`, step.where);
539
+ console.log(`Page state:`, pageState);
540
+ console.log(`Match result: ${isApplicable}`);
541
+ console.log("-------------------------------------------------------------");
542
+ if (isApplicable) {
543
+ return actionId;
544
+ }
545
+ }
546
+ }
465
547
  runLoop(p, workflow) {
466
548
  var _a, _b;
467
549
  return __awaiter(this, void 0, void 0, function* () {
550
+ const workflowCopy = JSON.parse(JSON.stringify(workflow));
468
551
  // apply ad-blocker to the current page
469
552
  yield this.applyAdBlocker(p);
470
553
  const usedActions = [];
554
+ let selectors = [];
471
555
  let lastAction = null;
556
+ let actionId = -1;
472
557
  let repeatCount = 0;
473
558
  /**
474
559
  * Enables the interpreter functionality for popup windows.
@@ -476,7 +561,7 @@ class Interpreter extends events_1.EventEmitter {
476
561
  * e.g. via `enqueueLinks`.
477
562
  */
478
563
  p.on('popup', (popup) => {
479
- this.concurrency.addJob(() => this.runLoop(popup, workflow));
564
+ this.concurrency.addJob(() => this.runLoop(popup, workflowCopy));
480
565
  });
481
566
  /* eslint no-constant-condition: ["warn", { "checkLoops": false }] */
482
567
  while (true) {
@@ -493,8 +578,11 @@ class Interpreter extends events_1.EventEmitter {
493
578
  return;
494
579
  }
495
580
  let pageState = {};
581
+ let getStateTest = "Hello";
496
582
  try {
497
- pageState = yield this.getState(p, workflow);
583
+ pageState = yield this.getState(p, workflowCopy, selectors);
584
+ selectors = [];
585
+ console.log("Empty selectors:", selectors);
498
586
  }
499
587
  catch (e) {
500
588
  this.log('The browser has been closed.');
@@ -503,27 +591,43 @@ class Interpreter extends events_1.EventEmitter {
503
591
  if (this.options.debug) {
504
592
  this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, logger_1.Level.WARN);
505
593
  }
506
- const actionId = workflow.findIndex((step) => {
507
- const isApplicable = this.applicable(step.where, pageState, usedActions);
508
- console.log(`Where:`, step.where);
509
- console.log(`Page state:`, pageState);
510
- console.log(`Match result: ${isApplicable}`);
511
- return isApplicable;
512
- });
513
- const action = workflow[actionId];
594
+ // const actionId = workflow.findIndex((step) => {
595
+ // const isApplicable = this.applicable(step.where, pageState, usedActions);
596
+ // console.log("-------------------------------------------------------------");
597
+ // console.log(`Where:`, step.where);
598
+ // console.log(`Page state:`, pageState);
599
+ // console.log(`Match result: ${isApplicable}`);
600
+ // console.log("-------------------------------------------------------------");
601
+ // return isApplicable;
602
+ // });
603
+ actionId = this.getMatchingActionId(workflowCopy, pageState, usedActions);
604
+ const action = workflowCopy[actionId];
605
+ console.log("MATCHED ACTION:", action);
606
+ console.log("MATCHED ACTION ID:", actionId);
514
607
  this.log(`Matched ${JSON.stringify(action === null || action === void 0 ? void 0 : action.where)}`, logger_1.Level.LOG);
515
608
  if (action) { // action is matched
516
609
  if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.activeId) {
517
610
  this.options.debugChannel.activeId(actionId);
518
611
  }
519
612
  repeatCount = action === lastAction ? repeatCount + 1 : 0;
520
- if (this.options.maxRepeats && repeatCount >= this.options.maxRepeats) {
613
+ console.log("REPEAT COUNT", repeatCount);
614
+ if (this.options.maxRepeats && repeatCount > this.options.maxRepeats) {
521
615
  return;
522
616
  }
523
617
  lastAction = action;
524
618
  try {
619
+ console.log("Carrying out:", action.what);
525
620
  yield this.carryOutSteps(p, action.what);
526
621
  usedActions.push((_b = action.id) !== null && _b !== void 0 ? _b : 'undefined');
622
+ workflowCopy.splice(actionId, 1);
623
+ console.log(`Action with ID ${action.id} removed from the workflow copy.`);
624
+ // const newSelectors = this.getPreviousSelectors(workflow, actionId);
625
+ const newSelectors = this.getSelectors(workflowCopy);
626
+ newSelectors.forEach(selector => {
627
+ if (!selectors.includes(selector)) {
628
+ selectors.push(selector);
629
+ }
630
+ });
527
631
  }
528
632
  catch (e) {
529
633
  this.log(e, logger_1.Level.ERROR);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.5",
3
+ "version": "0.0.6",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",