mx-cloud 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,8 +36,9 @@ interface InterpreterOptions {
36
36
  binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
37
37
  debug: boolean;
38
38
  debugChannel: Partial<{
39
- activeId: Function;
40
- debugMessage: Function;
39
+ activeId: (id: number) => void;
40
+ debugMessage: (msg: string) => void;
41
+ setActionType: (type: string) => void;
41
42
  }>;
42
43
  }
43
44
  /**
@@ -328,10 +328,18 @@ class Interpreter extends events_1.EventEmitter {
328
328
  */
329
329
  const wawActions = {
330
330
  screenshot: (params) => __awaiter(this, void 0, void 0, function* () {
331
+ var _a;
332
+ if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
333
+ this.options.debugChannel.setActionType('screenshot');
334
+ }
331
335
  const screenshotBuffer = yield page.screenshot(Object.assign(Object.assign({}, params), { path: undefined }));
332
336
  yield this.options.binaryCallback(screenshotBuffer, 'image/png');
333
337
  }),
334
338
  enqueueLinks: (selector) => __awaiter(this, void 0, void 0, function* () {
339
+ var _a;
340
+ if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
341
+ this.options.debugChannel.setActionType('enqueueLinks');
342
+ }
335
343
  const links = yield page.locator(selector)
336
344
  .evaluateAll(
337
345
  // @ts-ignore
@@ -357,40 +365,42 @@ class Interpreter extends events_1.EventEmitter {
357
365
  yield page.close();
358
366
  }),
359
367
  scrape: (selector) => __awaiter(this, void 0, void 0, function* () {
368
+ var _a;
369
+ if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
370
+ this.options.debugChannel.setActionType('scrape');
371
+ }
360
372
  yield this.ensureScriptsLoaded(page);
361
373
  const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
362
374
  yield this.options.serializableCallback(scrapeResults);
363
375
  }),
364
376
  scrapeSchema: (schema) => __awaiter(this, void 0, void 0, function* () {
377
+ var _a;
378
+ if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
379
+ this.options.debugChannel.setActionType('scrapeSchema');
380
+ }
365
381
  yield this.ensureScriptsLoaded(page);
366
382
  const scrapeResult = yield page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);
367
- const newResults = Array.isArray(scrapeResult) ? scrapeResult : [scrapeResult];
368
- newResults.forEach((result) => {
369
- Object.entries(result).forEach(([key, value]) => {
370
- const keyExists = this.cumulativeResults.some((item) => key in item && item[key] !== undefined);
371
- if (!keyExists) {
372
- this.cumulativeResults.push({ [key]: value });
373
- }
374
- });
383
+ if (!this.cumulativeResults || !Array.isArray(this.cumulativeResults)) {
384
+ this.cumulativeResults = [];
385
+ }
386
+ if (this.cumulativeResults.length === 0) {
387
+ this.cumulativeResults.push({});
388
+ }
389
+ const mergedResult = this.cumulativeResults[0];
390
+ const resultToProcess = Array.isArray(scrapeResult) ? scrapeResult[0] : scrapeResult;
391
+ Object.entries(resultToProcess).forEach(([key, value]) => {
392
+ if (value !== undefined) {
393
+ mergedResult[key] = value;
394
+ }
375
395
  });
376
- const mergedResult = [
377
- Object.fromEntries(Object.entries(this.cumulativeResults.reduce((acc, curr) => {
378
- Object.entries(curr).forEach(([key, value]) => {
379
- // If the key doesn't exist or the current value is not undefined, add/update it
380
- if (value !== undefined) {
381
- acc[key] = value;
382
- }
383
- });
384
- return acc;
385
- }, {})))
386
- ];
387
- // Log cumulative results after each action
388
- console.log("CUMULATIVE results:", this.cumulativeResults);
389
- console.log("MERGED results:", mergedResult);
390
- yield this.options.serializableCallback(mergedResult);
391
- // await this.options.serializableCallback(scrapeResult);
396
+ console.log("Updated merged result:", mergedResult);
397
+ yield this.options.serializableCallback([mergedResult]);
392
398
  }),
393
399
  scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
400
+ var _a;
401
+ if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
402
+ this.options.debugChannel.setActionType('scrapeList');
403
+ }
394
404
  yield this.ensureScriptsLoaded(page);
395
405
  let scrapeResults = [];
396
406
  if (!config.pagination) {
@@ -402,6 +412,10 @@ class Interpreter extends events_1.EventEmitter {
402
412
  yield this.options.serializableCallback(scrapeResults);
403
413
  }),
404
414
  scrapeListAuto: (config) => __awaiter(this, void 0, void 0, function* () {
415
+ var _a;
416
+ if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
417
+ this.options.debugChannel.setActionType('scrapeListAuto');
418
+ }
405
419
  yield this.ensureScriptsLoaded(page);
406
420
  const scrapeResults = yield page.evaluate((listSelector) => {
407
421
  return window.scrapeListAuto(listSelector);
@@ -409,6 +423,10 @@ class Interpreter extends events_1.EventEmitter {
409
423
  yield this.options.serializableCallback(scrapeResults);
410
424
  }),
411
425
  scroll: (pages) => __awaiter(this, void 0, void 0, function* () {
426
+ var _a;
427
+ if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
428
+ this.options.debugChannel.setActionType('scroll');
429
+ }
412
430
  yield page.evaluate((pagesInternal) => __awaiter(this, void 0, void 0, function* () {
413
431
  for (let i = 1; i <= (pagesInternal !== null && pagesInternal !== void 0 ? pagesInternal : 1); i += 1) {
414
432
  // @ts-ignore
@@ -417,12 +435,20 @@ class Interpreter extends events_1.EventEmitter {
417
435
  }), pages !== null && pages !== void 0 ? pages : 1);
418
436
  }),
419
437
  script: (code) => __awaiter(this, void 0, void 0, function* () {
438
+ var _a;
439
+ if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
440
+ this.options.debugChannel.setActionType('script');
441
+ }
420
442
  const AsyncFunction = Object.getPrototypeOf(() => __awaiter(this, void 0, void 0, function* () { })).constructor;
421
443
  const x = new AsyncFunction('page', 'log', code);
422
444
  yield x(page, this.log);
423
445
  }),
424
446
  flag: () => __awaiter(this, void 0, void 0, function* () {
425
447
  return new Promise((res) => {
448
+ var _a;
449
+ if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
450
+ this.options.debugChannel.setActionType('flag');
451
+ }
426
452
  this.emit('flag', page, res);
427
453
  });
428
454
  }),
@@ -494,6 +520,7 @@ class Interpreter extends events_1.EventEmitter {
494
520
  let visitedUrls = new Set();
495
521
  const MAX_RETRIES = 3;
496
522
  const RETRY_DELAY = 1000; // 1 second delay between retries
523
+ const MAX_UNCHANGED_RESULTS = 5;
497
524
  const debugLog = (message, ...args) => {
498
525
  console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
499
526
  };
@@ -572,28 +599,55 @@ class Interpreter extends events_1.EventEmitter {
572
599
  }
573
600
  });
574
601
  let availableSelectors = config.pagination.selector.split(',');
602
+ let unchangedResultCounter = 0;
575
603
  try {
576
604
  while (true) {
577
605
  switch (config.pagination.type) {
578
606
  case 'scrollDown': {
607
+ let previousResultCount = allResults.length;
608
+ yield scrapeCurrentPage();
609
+ if (checkLimit()) {
610
+ return allResults;
611
+ }
579
612
  yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
580
613
  yield page.waitForTimeout(2000);
581
614
  const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
615
+ const currentResultCount = allResults.length;
616
+ if (currentResultCount === previousResultCount) {
617
+ unchangedResultCounter++;
618
+ if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
619
+ return allResults;
620
+ }
621
+ }
622
+ else {
623
+ unchangedResultCounter = 0;
624
+ }
582
625
  if (currentHeight === previousHeight) {
583
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
584
- allResults = allResults.concat(finalResults);
585
626
  return allResults;
586
627
  }
587
628
  previousHeight = currentHeight;
588
629
  break;
589
630
  }
590
631
  case 'scrollUp': {
632
+ let previousResultCount = allResults.length;
633
+ yield scrapeCurrentPage();
634
+ if (checkLimit()) {
635
+ return allResults;
636
+ }
591
637
  yield page.evaluate(() => window.scrollTo(0, 0));
592
638
  yield page.waitForTimeout(2000);
593
639
  const currentTopHeight = yield page.evaluate(() => document.documentElement.scrollTop);
640
+ const currentResultCount = allResults.length;
641
+ if (currentResultCount === previousResultCount) {
642
+ unchangedResultCounter++;
643
+ if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
644
+ return allResults;
645
+ }
646
+ }
647
+ else {
648
+ unchangedResultCounter = 0;
649
+ }
594
650
  if (currentTopHeight === 0) {
595
- const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
596
- allResults = allResults.concat(finalResults);
597
651
  return allResults;
598
652
  }
599
653
  previousHeight = currentTopHeight;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mx-cloud",
3
- "version": "0.0.3",
3
+ "version": "0.0.4",
4
4
  "description": "mx cloud",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",