maxun-core 0.0.15 → 0.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -466,6 +466,20 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
466
466
  return element.innerHTML.trim();
467
467
  }
468
468
  else if (attribute === 'src' || attribute === 'href') {
469
+ if (attribute === 'href' && element.tagName !== 'A') {
470
+ const parentElement = element.parentElement;
471
+ if (parentElement && parentElement.tagName === 'A') {
472
+ const parentHref = parentElement.getAttribute('href');
473
+ if (parentHref) {
474
+ try {
475
+ return new URL(parentHref, baseURL).href;
476
+ }
477
+ catch (e) {
478
+ return parentHref;
479
+ }
480
+ }
481
+ }
482
+ }
469
483
  const attrValue = element.getAttribute(attribute);
470
484
  const dataAttr = attrValue || element.getAttribute('data-' + attribute);
471
485
  if (!dataAttr || dataAttr.trim() === '') {
@@ -31,14 +31,16 @@ declare global {
31
31
  * Defines optional interpreter options (passed in constructor)
32
32
  */
33
33
  interface InterpreterOptions {
34
+ mode?: string;
34
35
  maxRepeats: number;
35
36
  maxConcurrency: number;
36
37
  serializableCallback: (output: any) => (void | Promise<void>);
37
38
  binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
38
39
  debug: boolean;
39
40
  debugChannel: Partial<{
40
- activeId: Function;
41
- debugMessage: Function;
41
+ activeId: (id: number) => void;
42
+ debugMessage: (msg: string) => void;
43
+ setActionType: (type: string) => void;
42
44
  }>;
43
45
  }
44
46
  /**
@@ -300,6 +300,7 @@ class Interpreter extends events_1.EventEmitter {
300
300
  * @param steps Array of actions.
301
301
  */
302
302
  carryOutSteps(page, steps) {
303
+ var _a;
303
304
  return __awaiter(this, void 0, void 0, function* () {
304
305
  /**
305
306
  * Defines overloaded (or added) methods/actions usable in the workflow.
@@ -311,10 +312,18 @@ class Interpreter extends events_1.EventEmitter {
311
312
  */
312
313
  const wawActions = {
313
314
  screenshot: (params) => __awaiter(this, void 0, void 0, function* () {
315
+ var _b;
316
+ if ((_b = this.options.debugChannel) === null || _b === void 0 ? void 0 : _b.setActionType) {
317
+ this.options.debugChannel.setActionType('screenshot');
318
+ }
314
319
  const screenshotBuffer = yield page.screenshot(Object.assign(Object.assign({}, params), { path: undefined }));
315
320
  yield this.options.binaryCallback(screenshotBuffer, 'image/png');
316
321
  }),
317
322
  enqueueLinks: (selector) => __awaiter(this, void 0, void 0, function* () {
323
+ var _c;
324
+ if ((_c = this.options.debugChannel) === null || _c === void 0 ? void 0 : _c.setActionType) {
325
+ this.options.debugChannel.setActionType('enqueueLinks');
326
+ }
318
327
  const links = yield page.locator(selector)
319
328
  .evaluateAll(
320
329
  // @ts-ignore
@@ -340,40 +349,50 @@ class Interpreter extends events_1.EventEmitter {
340
349
  yield page.close();
341
350
  }),
342
351
  scrape: (selector) => __awaiter(this, void 0, void 0, function* () {
352
+ var _d;
353
+ if ((_d = this.options.debugChannel) === null || _d === void 0 ? void 0 : _d.setActionType) {
354
+ this.options.debugChannel.setActionType('scrape');
355
+ }
343
356
  yield this.ensureScriptsLoaded(page);
344
357
  const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
345
358
  yield this.options.serializableCallback(scrapeResults);
346
359
  }),
347
360
  scrapeSchema: (schema) => __awaiter(this, void 0, void 0, function* () {
361
+ var _e;
362
+ if ((_e = this.options.debugChannel) === null || _e === void 0 ? void 0 : _e.setActionType) {
363
+ this.options.debugChannel.setActionType('scrapeSchema');
364
+ }
365
+ if (this.options.mode && this.options.mode === 'editor') {
366
+ yield this.options.serializableCallback({});
367
+ return;
368
+ }
348
369
  yield this.ensureScriptsLoaded(page);
349
370
  const scrapeResult = yield page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);
350
- const newResults = Array.isArray(scrapeResult) ? scrapeResult : [scrapeResult];
351
- newResults.forEach((result) => {
352
- Object.entries(result).forEach(([key, value]) => {
353
- const keyExists = this.cumulativeResults.some((item) => key in item && item[key] !== undefined);
354
- if (!keyExists) {
355
- this.cumulativeResults.push({ [key]: value });
356
- }
357
- });
371
+ if (!this.cumulativeResults || !Array.isArray(this.cumulativeResults)) {
372
+ this.cumulativeResults = [];
373
+ }
374
+ if (this.cumulativeResults.length === 0) {
375
+ this.cumulativeResults.push({});
376
+ }
377
+ const mergedResult = this.cumulativeResults[0];
378
+ const resultToProcess = Array.isArray(scrapeResult) ? scrapeResult[0] : scrapeResult;
379
+ Object.entries(resultToProcess).forEach(([key, value]) => {
380
+ if (value !== undefined) {
381
+ mergedResult[key] = value;
382
+ }
358
383
  });
359
- const mergedResult = [
360
- Object.fromEntries(Object.entries(this.cumulativeResults.reduce((acc, curr) => {
361
- Object.entries(curr).forEach(([key, value]) => {
362
- // If the key doesn't exist or the current value is not undefined, add/update it
363
- if (value !== undefined) {
364
- acc[key] = value;
365
- }
366
- });
367
- return acc;
368
- }, {})))
369
- ];
370
- // Log cumulative results after each action
371
- console.log("CUMULATIVE results:", this.cumulativeResults);
372
- console.log("MERGED results:", mergedResult);
373
- yield this.options.serializableCallback(mergedResult);
374
- // await this.options.serializableCallback(scrapeResult);
384
+ console.log("Updated merged result:", mergedResult);
385
+ yield this.options.serializableCallback([mergedResult]);
375
386
  }),
376
387
  scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
388
+ var _f;
389
+ if ((_f = this.options.debugChannel) === null || _f === void 0 ? void 0 : _f.setActionType) {
390
+ this.options.debugChannel.setActionType('scrapeList');
391
+ }
392
+ if (this.options.mode && this.options.mode === 'editor') {
393
+ yield this.options.serializableCallback({});
394
+ return;
395
+ }
377
396
  yield this.ensureScriptsLoaded(page);
378
397
  if (!config.pagination) {
379
398
  const scrapeResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
@@ -385,6 +404,10 @@ class Interpreter extends events_1.EventEmitter {
385
404
  }
386
405
  }),
387
406
  scrapeListAuto: (config) => __awaiter(this, void 0, void 0, function* () {
407
+ var _g;
408
+ if ((_g = this.options.debugChannel) === null || _g === void 0 ? void 0 : _g.setActionType) {
409
+ this.options.debugChannel.setActionType('scrapeListAuto');
410
+ }
388
411
  yield this.ensureScriptsLoaded(page);
389
412
  const scrapeResults = yield page.evaluate((listSelector) => {
390
413
  return window.scrapeListAuto(listSelector);
@@ -392,6 +415,10 @@ class Interpreter extends events_1.EventEmitter {
392
415
  yield this.options.serializableCallback(scrapeResults);
393
416
  }),
394
417
  scroll: (pages) => __awaiter(this, void 0, void 0, function* () {
418
+ var _h;
419
+ if ((_h = this.options.debugChannel) === null || _h === void 0 ? void 0 : _h.setActionType) {
420
+ this.options.debugChannel.setActionType('scroll');
421
+ }
395
422
  yield page.evaluate((pagesInternal) => __awaiter(this, void 0, void 0, function* () {
396
423
  for (let i = 1; i <= (pagesInternal !== null && pagesInternal !== void 0 ? pagesInternal : 1); i += 1) {
397
424
  // @ts-ignore
@@ -400,12 +427,20 @@ class Interpreter extends events_1.EventEmitter {
400
427
  }), pages !== null && pages !== void 0 ? pages : 1);
401
428
  }),
402
429
  script: (code) => __awaiter(this, void 0, void 0, function* () {
430
+ var _j;
431
+ if ((_j = this.options.debugChannel) === null || _j === void 0 ? void 0 : _j.setActionType) {
432
+ this.options.debugChannel.setActionType('script');
433
+ }
403
434
  const AsyncFunction = Object.getPrototypeOf(() => __awaiter(this, void 0, void 0, function* () { })).constructor;
404
435
  const x = new AsyncFunction('page', 'log', code);
405
436
  yield x(page, this.log);
406
437
  }),
407
438
  flag: () => __awaiter(this, void 0, void 0, function* () {
408
439
  return new Promise((res) => {
440
+ var _a;
441
+ if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
442
+ this.options.debugChannel.setActionType('flag');
443
+ }
409
444
  this.emit('flag', page, res);
410
445
  });
411
446
  }),
@@ -433,6 +468,9 @@ class Interpreter extends events_1.EventEmitter {
433
468
  yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []));
434
469
  }
435
470
  else {
471
+ if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
472
+ this.options.debugChannel.setActionType(String(step.action));
473
+ }
436
474
  // Implements the dot notation for the "method name" in the workflow
437
475
  const levels = String(step.action).split('.');
438
476
  const methodName = levels[levels.length - 1];
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.15",
3
+ "version": "0.0.16",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",