maxun-core 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -192,10 +192,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
192
192
  return undefined;
193
193
  switch (attribute) {
194
194
  case 'href':
195
- const relativeHref = elem.getAttribute('href'); // Get the href attribute
196
- return relativeHref ? new URL(relativeHref, window.location.origin).href : null; // Convert to full URL
195
+ const relativeHref = elem.getAttribute('href');
196
+ return relativeHref ? new URL(relativeHref, window.location.origin).href : null;
197
197
  case 'src':
198
- return elem.getAttribute('src');
198
+ const relativeSrc = elem.getAttribute('src');
199
+ return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null;
199
200
  case 'innerText':
200
201
  return elem.innerText;
201
202
  case 'textContent':
@@ -204,7 +205,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
204
205
  return elem.innerText;
205
206
  }
206
207
  }, (key) => key // Use the original key in the output
207
- ));
208
+ )) || [];
208
209
  };
209
210
  /**
210
211
  * Scrapes multiple lists of similar items based on a template item.
@@ -237,10 +238,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
237
238
  record[label] = fieldElement.innerHTML.trim();
238
239
  }
239
240
  else if (attribute === 'src') {
240
- record[label] = fieldElement.src;
241
+ // Handle relative 'src' URLs
242
+ const src = fieldElement.getAttribute('src');
243
+ record[label] = src ? new URL(src, baseUrl).href : null;
241
244
  }
242
245
  else if (attribute === 'href') {
243
- record[label] = fieldElement.href;
246
+ // Handle relative 'href' URLs
247
+ const href = fieldElement.getAttribute('href');
248
+ record[label] = href ? new URL(href, baseUrl).href : null;
244
249
  }
245
250
  else {
246
251
  record[label] = fieldElement.getAttribute(attribute);
@@ -2,6 +2,31 @@
2
2
  import { Page } from 'playwright';
3
3
  import { EventEmitter } from 'events';
4
4
  import { WorkflowFile, ParamType } from './types/workflow';
5
+ /**
6
+ * Extending the Window interface for custom scraping functions.
7
+ */
8
+ declare global {
9
+ interface Window {
10
+ scrape: (selector: string | null) => Record<string, string>[];
11
+ scrapeSchema: (schema: Record<string, {
12
+ selector: string;
13
+ tag: string;
14
+ attribute: string;
15
+ }>) => Record<string, any>;
16
+ scrapeList: (config: {
17
+ listSelector: string;
18
+ fields: any;
19
+ limit?: number;
20
+ pagination: any;
21
+ }) => Record<string, any>[];
22
+ scrapeListAuto: (listSelector: string) => {
23
+ selector: string;
24
+ innerText: string;
25
+ }[];
26
+ scrollDown: (pages?: number) => void;
27
+ scrollUp: (pages?: number) => void;
28
+ }
29
+ }
5
30
  /**
6
31
  * Defines optional interpreter options (passed in constructor)
7
32
  */
@@ -27,6 +52,7 @@ export default class Interpreter extends EventEmitter {
27
52
  private stopper;
28
53
  private log;
29
54
  private blocker;
55
+ private cumulativeResults;
30
56
  constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
31
57
  private applyAdBlocker;
32
58
  private disableAdBlocker;
@@ -53,9 +53,12 @@ class Interpreter extends events_1.EventEmitter {
53
53
  super();
54
54
  this.stopper = null;
55
55
  this.blocker = null;
56
+ this.cumulativeResults = [];
56
57
  this.workflow = workflow.workflow;
57
58
  this.initializedWorkflow = null;
58
- this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => { (0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN); }, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {} }, options);
59
+ this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
60
+ (0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN);
61
+ }, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {} }, options);
59
62
  this.concurrency = new concurrency_1.default(this.options.maxConcurrency);
60
63
  this.log = (...args) => (0, logger_1.default)(...args);
61
64
  const error = preprocessor_1.default.validateWorkflow(workflow);
@@ -273,7 +276,30 @@ class Interpreter extends events_1.EventEmitter {
273
276
  scrapeSchema: (schema) => __awaiter(this, void 0, void 0, function* () {
274
277
  yield this.ensureScriptsLoaded(page);
275
278
  const scrapeResult = yield page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);
276
- yield this.options.serializableCallback(scrapeResult);
279
+ const newResults = Array.isArray(scrapeResult) ? scrapeResult : [scrapeResult];
280
+ newResults.forEach((result) => {
281
+ Object.entries(result).forEach(([key, value]) => {
282
+ const keyExists = this.cumulativeResults.some((item) => key in item && item[key] !== undefined);
283
+ if (!keyExists) {
284
+ this.cumulativeResults.push({ [key]: value });
285
+ }
286
+ });
287
+ });
288
+ const mergedResult = [
289
+ Object.fromEntries(Object.entries(this.cumulativeResults.reduce((acc, curr) => {
290
+ Object.entries(curr).forEach(([key, value]) => {
291
+ // Copy every defined value into the accumulator (later entries overwrite earlier ones); undefined values are skipped
292
+ if (value !== undefined) {
293
+ acc[key] = value;
294
+ }
295
+ });
296
+ return acc;
297
+ }, {})))
298
+ ];
299
+ // Log cumulative results after each action
300
+ console.log("CUMULATIVE results:", this.cumulativeResults);
301
+ console.log("MERGED results:", mergedResult);
302
+ yield this.options.serializableCallback(mergedResult);
277
303
  }),
278
304
  scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
279
305
  yield this.ensureScriptsLoaded(page);
@@ -313,7 +339,7 @@ class Interpreter extends events_1.EventEmitter {
313
339
  }),
314
340
  };
315
341
  for (const step of steps) {
316
- this.log(`Launching ${step.action}`, logger_1.Level.LOG);
342
+ this.log(`Launching ${String(step.action)}`, logger_1.Level.LOG);
317
343
  if (step.action in wawActions) {
318
344
  // "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
319
345
  const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
@@ -321,7 +347,7 @@ class Interpreter extends events_1.EventEmitter {
321
347
  }
322
348
  else {
323
349
  // Implements the dot notation for the "method name" in the workflow
324
- const levels = step.action.split('.');
350
+ const levels = String(step.action).split('.');
325
351
  const methodName = levels[levels.length - 1];
326
352
  let invokee = page;
327
353
  for (const level of levels.splice(0, levels.length - 1)) {
@@ -477,7 +503,13 @@ class Interpreter extends events_1.EventEmitter {
477
503
  if (this.options.debug) {
478
504
  this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, logger_1.Level.WARN);
479
505
  }
480
- const actionId = workflow.findIndex((step) => this.applicable(step.where, pageState, usedActions));
506
+ const actionId = workflow.findIndex((step) => {
507
+ const isApplicable = this.applicable(step.where, pageState, usedActions);
508
+ console.log(`Where:`, step.where);
509
+ console.log(`Page state:`, pageState);
510
+ console.log(`Match result: ${isApplicable}`);
511
+ return isApplicable;
512
+ });
481
513
  const action = workflow[actionId];
482
514
  this.log(`Matched ${JSON.stringify(action === null || action === void 0 ? void 0 : action.where)}`, logger_1.Level.LOG);
483
515
  if (action) { // action is matched
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.4",
3
+ "version": "0.0.5",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",