maxun-core 0.0.4 → 0.0.6

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.
@@ -192,10 +192,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
192
192
  return undefined;
193
193
  switch (attribute) {
194
194
  case 'href':
195
- const relativeHref = elem.getAttribute('href'); // Get the href attribute
196
- return relativeHref ? new URL(relativeHref, window.location.origin).href : null; // Convert to full URL
195
+ const relativeHref = elem.getAttribute('href');
196
+ return relativeHref ? new URL(relativeHref, window.location.origin).href : null;
197
197
  case 'src':
198
- return elem.getAttribute('src');
198
+ const relativeSrc = elem.getAttribute('src');
199
+ return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null;
199
200
  case 'innerText':
200
201
  return elem.innerText;
201
202
  case 'textContent':
@@ -204,7 +205,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
204
205
  return elem.innerText;
205
206
  }
206
207
  }, (key) => key // Use the original key in the output
207
- ));
208
+ )) || [];
208
209
  };
209
210
  /**
210
211
  * Scrapes multiple lists of similar items based on a template item.
@@ -237,10 +238,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
237
238
  record[label] = fieldElement.innerHTML.trim();
238
239
  }
239
240
  else if (attribute === 'src') {
240
- record[label] = fieldElement.src;
241
+ // Handle relative 'src' URLs
242
+ const src = fieldElement.getAttribute('src');
243
+ record[label] = src ? new URL(src, window.location.origin).href : null;
241
244
  }
242
245
  else if (attribute === 'href') {
243
- record[label] = fieldElement.href;
246
+ // Handle relative 'href' URLs
247
+ const href = fieldElement.getAttribute('href');
248
+ record[label] = href ? new URL(href, window.location.origin).href : null;
244
249
  }
245
250
  else {
246
251
  record[label] = fieldElement.getAttribute(attribute);
@@ -2,6 +2,31 @@
2
2
  import { Page } from 'playwright';
3
3
  import { EventEmitter } from 'events';
4
4
  import { WorkflowFile, ParamType } from './types/workflow';
5
+ /**
6
+ * Extending the Window interface for custom scraping functions.
7
+ */
8
+ declare global {
9
+ interface Window {
10
+ scrape: (selector: string | null) => Record<string, string>[];
11
+ scrapeSchema: (schema: Record<string, {
12
+ selector: string;
13
+ tag: string;
14
+ attribute: string;
15
+ }>) => Record<string, any>;
16
+ scrapeList: (config: {
17
+ listSelector: string;
18
+ fields: any;
19
+ limit?: number;
20
+ pagination: any;
21
+ }) => Record<string, any>[];
22
+ scrapeListAuto: (listSelector: string) => {
23
+ selector: string;
24
+ innerText: string;
25
+ }[];
26
+ scrollDown: (pages?: number) => void;
27
+ scrollUp: (pages?: number) => void;
28
+ }
29
+ }
5
30
  /**
6
31
  * Defines optional interpreter options (passed in constructor)
7
32
  */
@@ -27,9 +52,11 @@ export default class Interpreter extends EventEmitter {
27
52
  private stopper;
28
53
  private log;
29
54
  private blocker;
55
+ private cumulativeResults;
30
56
  constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
31
57
  private applyAdBlocker;
32
58
  private disableAdBlocker;
59
+ private getSelectors;
33
60
  /**
34
61
  * Returns the context object from given Page and the current workflow.\
35
62
  * \
@@ -58,6 +85,7 @@ export default class Interpreter extends EventEmitter {
58
85
  */
59
86
  private carryOutSteps;
60
87
  private handlePagination;
88
+ private getMatchingActionId;
61
89
  private runLoop;
62
90
  private ensureScriptsLoaded;
63
91
  /**
@@ -53,9 +53,12 @@ class Interpreter extends events_1.EventEmitter {
53
53
  super();
54
54
  this.stopper = null;
55
55
  this.blocker = null;
56
+ this.cumulativeResults = [];
56
57
  this.workflow = workflow.workflow;
57
58
  this.initializedWorkflow = null;
58
- this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => { (0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN); }, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {} }, options);
59
+ this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
60
+ (0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN);
61
+ }, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {} }, options);
59
62
  this.concurrency = new concurrency_1.default(this.options.maxConcurrency);
60
63
  this.log = (...args) => (0, logger_1.default)(...args);
61
64
  const error = preprocessor_1.default.validateWorkflow(workflow);
@@ -72,7 +75,7 @@ class Interpreter extends events_1.EventEmitter {
72
75
  oldLog(...args);
73
76
  };
74
77
  }
75
- adblocker_playwright_1.PlaywrightBlocker.fromPrebuiltAdsAndTracking(cross_fetch_1.default).then(blocker => {
78
+ adblocker_playwright_1.PlaywrightBlocker.fromLists(cross_fetch_1.default, ['https://easylist.to/easylist/easylist.txt']).then(blocker => {
76
79
  this.blocker = blocker;
77
80
  }).catch(err => {
78
81
  this.log(`Failed to initialize ad-blocker:`, logger_1.Level.ERROR);
@@ -92,6 +95,43 @@ class Interpreter extends events_1.EventEmitter {
92
95
  }
93
96
  });
94
97
  }
98
+ // private getSelectors(workflow: Workflow, actionId: number): string[] {
99
+ // const selectors: string[] = [];
100
+ // // Validate actionId
101
+ // if (actionId <= 0) {
102
+ // console.log("No previous selectors to collect.");
103
+ // return selectors; // Empty array as there are no previous steps
104
+ // }
105
+ // // Iterate from the start up to (but not including) actionId
106
+ // for (let index = 0; index < actionId; index++) {
107
+ // const currentSelectors = workflow[index]?.where?.selectors;
108
+ // console.log(`Selectors at step ${index}:`, currentSelectors);
109
+ // if (currentSelectors && currentSelectors.length > 0) {
110
+ // currentSelectors.forEach((selector) => {
111
+ // if (!selectors.includes(selector)) {
112
+ // selectors.push(selector); // Avoid duplicates
113
+ // }
114
+ // });
115
+ // }
116
+ // }
117
+ // console.log("Collected Selectors:", selectors);
118
+ // return selectors;
119
+ // }
120
+ getSelectors(workflow) {
121
+ var _a, _b;
122
+ const selectorsSet = new Set();
123
+ if (workflow.length === 0) {
124
+ return [];
125
+ }
126
+ for (let index = workflow.length - 1; index >= 0; index--) {
127
+ const currentSelectors = (_b = (_a = workflow[index]) === null || _a === void 0 ? void 0 : _a.where) === null || _b === void 0 ? void 0 : _b.selectors;
128
+ if (currentSelectors && currentSelectors.length > 0) {
129
+ currentSelectors.forEach((selector) => selectorsSet.add(selector));
130
+ return Array.from(selectorsSet);
131
+ }
132
+ }
133
+ return [];
134
+ }
95
135
  /**
96
136
  * Returns the context object from given Page and the current workflow.\
97
137
  * \
@@ -101,44 +141,52 @@ class Interpreter extends events_1.EventEmitter {
101
141
  * @param workflow Current **initialized** workflow (array of where-what pairs).
102
142
  * @returns {PageState} State of the current page.
103
143
  */
104
- getState(page, workflow) {
144
+ getState(page, workflowCopy, selectors) {
105
145
  return __awaiter(this, void 0, void 0, function* () {
106
146
  /**
107
147
  * All the selectors present in the current Workflow
108
148
  */
109
- const selectors = preprocessor_1.default.extractSelectors(workflow);
149
+ // const selectors = Preprocessor.extractSelectors(workflow);
150
+ // console.log("Current selectors:", selectors);
110
151
  /**
111
152
  * Determines whether the element targeted by the selector is [actionable](https://playwright.dev/docs/actionability).
112
153
  * @param selector Selector to be queried
113
154
  * @returns True if the targeted element is actionable, false otherwise.
114
155
  */
115
- const actionable = (selector) => __awaiter(this, void 0, void 0, function* () {
116
- try {
117
- const proms = [
118
- page.isEnabled(selector, { timeout: 500 }),
119
- page.isVisible(selector, { timeout: 500 }),
120
- ];
121
- return yield Promise.all(proms).then((bools) => bools.every((x) => x));
122
- }
123
- catch (e) {
124
- // log(<Error>e, Level.ERROR);
125
- return false;
126
- }
127
- });
156
+ // const actionable = async (selector: string): Promise<boolean> => {
157
+ // try {
158
+ // const proms = [
159
+ // page.isEnabled(selector, { timeout: 5000 }),
160
+ // page.isVisible(selector, { timeout: 5000 }),
161
+ // ];
162
+ // return await Promise.all(proms).then((bools) => bools.every((x) => x));
163
+ // } catch (e) {
164
+ // // log(<Error>e, Level.ERROR);
165
+ // return false;
166
+ // }
167
+ // };
128
168
  /**
129
169
  * Object of selectors present in the current page.
130
170
  */
131
- const presentSelectors = yield Promise.all(selectors.map((selector) => __awaiter(this, void 0, void 0, function* () {
132
- if (yield actionable(selector)) {
133
- return [selector];
134
- }
135
- return [];
136
- }))).then((x) => x.flat());
171
+ // const presentSelectors: SelectorArray = await Promise.all(
172
+ // selectors.map(async (selector) => {
173
+ // if (await actionable(selector)) {
174
+ // return [selector];
175
+ // }
176
+ // return [];
177
+ // }),
178
+ // ).then((x) => x.flat());
179
+ const action = workflowCopy[workflowCopy.length - 1];
180
+ // console.log("Next action:", action)
181
+ let url = page.url();
182
+ if (action && action.where.url !== url && action.where.url !== "about:blank") {
183
+ url = action.where.url;
184
+ }
137
185
  return {
138
- url: page.url(),
186
+ url,
139
187
  cookies: (yield page.context().cookies([page.url()]))
140
188
  .reduce((p, cookie) => (Object.assign(Object.assign({}, p), { [cookie.name]: cookie.value })), {}),
141
- selectors: presentSelectors,
189
+ selectors,
142
190
  };
143
191
  });
144
192
  }
@@ -225,7 +273,6 @@ class Interpreter extends events_1.EventEmitter {
225
273
  * @param steps Array of actions.
226
274
  */
227
275
  carryOutSteps(page, steps) {
228
- var _a;
229
276
  return __awaiter(this, void 0, void 0, function* () {
230
277
  /**
231
278
  * Defines overloaded (or added) methods/actions usable in the workflow.
@@ -273,7 +320,31 @@ class Interpreter extends events_1.EventEmitter {
273
320
  scrapeSchema: (schema) => __awaiter(this, void 0, void 0, function* () {
274
321
  yield this.ensureScriptsLoaded(page);
275
322
  const scrapeResult = yield page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);
276
- yield this.options.serializableCallback(scrapeResult);
323
+ const newResults = Array.isArray(scrapeResult) ? scrapeResult : [scrapeResult];
324
+ newResults.forEach((result) => {
325
+ Object.entries(result).forEach(([key, value]) => {
326
+ const keyExists = this.cumulativeResults.some((item) => key in item && item[key] !== undefined);
327
+ if (!keyExists) {
328
+ this.cumulativeResults.push({ [key]: value });
329
+ }
330
+ });
331
+ });
332
+ const mergedResult = [
333
+ Object.fromEntries(Object.entries(this.cumulativeResults.reduce((acc, curr) => {
334
+ Object.entries(curr).forEach(([key, value]) => {
335
+ // If the key doesn't exist or the current value is not undefined, add/update it
336
+ if (value !== undefined) {
337
+ acc[key] = value;
338
+ }
339
+ });
340
+ return acc;
341
+ }, {})))
342
+ ];
343
+ // Log cumulative results after each action
344
+ console.log("CUMULATIVE results:", this.cumulativeResults);
345
+ console.log("MERGED results:", mergedResult);
346
+ yield this.options.serializableCallback(mergedResult);
347
+ // await this.options.serializableCallback(scrapeResult);
277
348
  }),
278
349
  scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
279
350
  yield this.ensureScriptsLoaded(page);
@@ -312,8 +383,17 @@ class Interpreter extends events_1.EventEmitter {
312
383
  });
313
384
  }),
314
385
  };
386
+ const executeAction = (invokee, methodName, args) => __awaiter(this, void 0, void 0, function* () {
387
+ console.log("Executing action:", methodName, args);
388
+ if (!args || Array.isArray(args)) {
389
+ yield invokee[methodName](...(args !== null && args !== void 0 ? args : []));
390
+ }
391
+ else {
392
+ yield invokee[methodName](args);
393
+ }
394
+ });
315
395
  for (const step of steps) {
316
- this.log(`Launching ${step.action}`, logger_1.Level.LOG);
396
+ this.log(`Launching ${String(step.action)}`, logger_1.Level.LOG);
317
397
  if (step.action in wawActions) {
318
398
  // "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
319
399
  const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
@@ -321,17 +401,30 @@ class Interpreter extends events_1.EventEmitter {
321
401
  }
322
402
  else {
323
403
  // Implements the dot notation for the "method name" in the workflow
324
- const levels = step.action.split('.');
404
+ const levels = String(step.action).split('.');
325
405
  const methodName = levels[levels.length - 1];
326
406
  let invokee = page;
327
407
  for (const level of levels.splice(0, levels.length - 1)) {
328
408
  invokee = invokee[level];
329
409
  }
330
- if (!step.args || Array.isArray(step.args)) {
331
- yield invokee[methodName](...((_a = step.args) !== null && _a !== void 0 ? _a : []));
410
+ if (methodName === 'waitForLoadState') {
411
+ try {
412
+ yield executeAction(invokee, methodName, step.args);
413
+ }
414
+ catch (error) {
415
+ yield executeAction(invokee, methodName, 'domcontentloaded');
416
+ }
417
+ }
418
+ else if (methodName === 'click') {
419
+ try {
420
+ yield executeAction(invokee, methodName, step.args);
421
+ }
422
+ catch (error) {
423
+ yield executeAction(invokee, methodName, [step.args[0], { force: true }]);
424
+ }
332
425
  }
333
426
  else {
334
- yield invokee[methodName](step.args);
427
+ yield executeAction(invokee, methodName, step.args);
335
428
  }
336
429
  }
337
430
  yield new Promise((res) => { setTimeout(res, 500); });
@@ -370,6 +463,7 @@ class Interpreter extends events_1.EventEmitter {
370
463
  break;
371
464
  case 'clickNext':
372
465
  const pageResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
466
+ // console.log("Page results:", pageResults);
373
467
  // Filter out already scraped items
374
468
  const newResults = pageResults.filter(item => {
375
469
  const uniqueKey = JSON.stringify(item);
@@ -387,7 +481,7 @@ class Interpreter extends events_1.EventEmitter {
387
481
  return allResults; // No more pages to scrape
388
482
  }
389
483
  yield Promise.all([
390
- nextButton.click(),
484
+ nextButton.dispatchEvent('click'),
391
485
  page.waitForNavigation({ waitUntil: 'networkidle' })
392
486
  ]);
393
487
  yield page.waitForTimeout(1000);
@@ -402,7 +496,7 @@ class Interpreter extends events_1.EventEmitter {
402
496
  return allResults;
403
497
  }
404
498
  // Click the 'Load More' button to load additional items
405
- yield loadMoreButton.click();
499
+ yield loadMoreButton.dispatchEvent('click');
406
500
  yield page.waitForTimeout(2000); // Wait for new items to load
407
501
  // After clicking 'Load More', scroll down to load more items
408
502
  yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
@@ -436,13 +530,30 @@ class Interpreter extends events_1.EventEmitter {
436
530
  return allResults;
437
531
  });
438
532
  }
533
+ getMatchingActionId(workflow, pageState, usedActions) {
534
+ for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
535
+ const step = workflow[actionId];
536
+ const isApplicable = this.applicable(step.where, pageState, usedActions);
537
+ console.log("-------------------------------------------------------------");
538
+ console.log(`Where:`, step.where);
539
+ console.log(`Page state:`, pageState);
540
+ console.log(`Match result: ${isApplicable}`);
541
+ console.log("-------------------------------------------------------------");
542
+ if (isApplicable) {
543
+ return actionId;
544
+ }
545
+ }
546
+ }
439
547
  runLoop(p, workflow) {
440
548
  var _a, _b;
441
549
  return __awaiter(this, void 0, void 0, function* () {
550
+ const workflowCopy = JSON.parse(JSON.stringify(workflow));
442
551
  // apply ad-blocker to the current page
443
552
  yield this.applyAdBlocker(p);
444
553
  const usedActions = [];
554
+ let selectors = [];
445
555
  let lastAction = null;
556
+ let actionId = -1;
446
557
  let repeatCount = 0;
447
558
  /**
448
559
  * Enables the interpreter functionality for popup windows.
@@ -450,7 +561,7 @@ class Interpreter extends events_1.EventEmitter {
450
561
  * e.g. via `enqueueLinks`.
451
562
  */
452
563
  p.on('popup', (popup) => {
453
- this.concurrency.addJob(() => this.runLoop(popup, workflow));
564
+ this.concurrency.addJob(() => this.runLoop(popup, workflowCopy));
454
565
  });
455
566
  /* eslint no-constant-condition: ["warn", { "checkLoops": false }] */
456
567
  while (true) {
@@ -467,8 +578,11 @@ class Interpreter extends events_1.EventEmitter {
467
578
  return;
468
579
  }
469
580
  let pageState = {};
581
+ let getStateTest = "Hello";
470
582
  try {
471
- pageState = yield this.getState(p, workflow);
583
+ pageState = yield this.getState(p, workflowCopy, selectors);
584
+ selectors = [];
585
+ console.log("Empty selectors:", selectors);
472
586
  }
473
587
  catch (e) {
474
588
  this.log('The browser has been closed.');
@@ -477,21 +591,43 @@ class Interpreter extends events_1.EventEmitter {
477
591
  if (this.options.debug) {
478
592
  this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, logger_1.Level.WARN);
479
593
  }
480
- const actionId = workflow.findIndex((step) => this.applicable(step.where, pageState, usedActions));
481
- const action = workflow[actionId];
594
+ // const actionId = workflow.findIndex((step) => {
595
+ // const isApplicable = this.applicable(step.where, pageState, usedActions);
596
+ // console.log("-------------------------------------------------------------");
597
+ // console.log(`Where:`, step.where);
598
+ // console.log(`Page state:`, pageState);
599
+ // console.log(`Match result: ${isApplicable}`);
600
+ // console.log("-------------------------------------------------------------");
601
+ // return isApplicable;
602
+ // });
603
+ actionId = this.getMatchingActionId(workflowCopy, pageState, usedActions);
604
+ const action = workflowCopy[actionId];
605
+ console.log("MATCHED ACTION:", action);
606
+ console.log("MATCHED ACTION ID:", actionId);
482
607
  this.log(`Matched ${JSON.stringify(action === null || action === void 0 ? void 0 : action.where)}`, logger_1.Level.LOG);
483
608
  if (action) { // action is matched
484
609
  if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.activeId) {
485
610
  this.options.debugChannel.activeId(actionId);
486
611
  }
487
612
  repeatCount = action === lastAction ? repeatCount + 1 : 0;
488
- if (this.options.maxRepeats && repeatCount >= this.options.maxRepeats) {
613
+ console.log("REPEAT COUNT", repeatCount);
614
+ if (this.options.maxRepeats && repeatCount > this.options.maxRepeats) {
489
615
  return;
490
616
  }
491
617
  lastAction = action;
492
618
  try {
619
+ console.log("Carrying out:", action.what);
493
620
  yield this.carryOutSteps(p, action.what);
494
621
  usedActions.push((_b = action.id) !== null && _b !== void 0 ? _b : 'undefined');
622
+ workflowCopy.splice(actionId, 1);
623
+ console.log(`Action with ID ${action.id} removed from the workflow copy.`);
624
+ // const newSelectors = this.getPreviousSelectors(workflow, actionId);
625
+ const newSelectors = this.getSelectors(workflowCopy);
626
+ newSelectors.forEach(selector => {
627
+ if (!selectors.includes(selector)) {
628
+ selectors.push(selector);
629
+ }
630
+ });
495
631
  }
496
632
  catch (e) {
497
633
  this.log(e, logger_1.Level.ERROR);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.4",
3
+ "version": "0.0.6",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",