mx-cloud 0.0.3 → 0.0.5
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- package/build/interpret.d.ts +4 -2
- package/build/interpret.js +90 -28
- package/package.json +1 -1
package/build/interpret.d.ts
CHANGED
|
@@ -30,14 +30,16 @@ declare global {
|
|
|
30
30
|
* Defines optional interpreter options (passed in constructor)
|
|
31
31
|
*/
|
|
32
32
|
interface InterpreterOptions {
|
|
33
|
+
mode?: string;
|
|
33
34
|
maxRepeats: number;
|
|
34
35
|
maxConcurrency: number;
|
|
35
36
|
serializableCallback: (output: any) => (void | Promise<void>);
|
|
36
37
|
binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
|
|
37
38
|
debug: boolean;
|
|
38
39
|
debugChannel: Partial<{
|
|
39
|
-
activeId:
|
|
40
|
-
debugMessage:
|
|
40
|
+
activeId: (id: number) => void;
|
|
41
|
+
debugMessage: (msg: string) => void;
|
|
42
|
+
setActionType: (type: string) => void;
|
|
41
43
|
}>;
|
|
42
44
|
}
|
|
43
45
|
/**
|
package/build/interpret.js
CHANGED
|
@@ -328,10 +328,18 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
328
328
|
*/
|
|
329
329
|
const wawActions = {
|
|
330
330
|
screenshot: (params) => __awaiter(this, void 0, void 0, function* () {
|
|
331
|
+
var _a;
|
|
332
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
333
|
+
this.options.debugChannel.setActionType('screenshot');
|
|
334
|
+
}
|
|
331
335
|
const screenshotBuffer = yield page.screenshot(Object.assign(Object.assign({}, params), { path: undefined }));
|
|
332
336
|
yield this.options.binaryCallback(screenshotBuffer, 'image/png');
|
|
333
337
|
}),
|
|
334
338
|
enqueueLinks: (selector) => __awaiter(this, void 0, void 0, function* () {
|
|
339
|
+
var _a;
|
|
340
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
341
|
+
this.options.debugChannel.setActionType('enqueueLinks');
|
|
342
|
+
}
|
|
335
343
|
const links = yield page.locator(selector)
|
|
336
344
|
.evaluateAll(
|
|
337
345
|
// @ts-ignore
|
|
@@ -357,40 +365,50 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
357
365
|
yield page.close();
|
|
358
366
|
}),
|
|
359
367
|
scrape: (selector) => __awaiter(this, void 0, void 0, function* () {
|
|
368
|
+
var _a;
|
|
369
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
370
|
+
this.options.debugChannel.setActionType('scrape');
|
|
371
|
+
}
|
|
360
372
|
yield this.ensureScriptsLoaded(page);
|
|
361
373
|
const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
|
|
362
374
|
yield this.options.serializableCallback(scrapeResults);
|
|
363
375
|
}),
|
|
364
376
|
scrapeSchema: (schema) => __awaiter(this, void 0, void 0, function* () {
|
|
377
|
+
var _a;
|
|
378
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
379
|
+
this.options.debugChannel.setActionType('scrapeSchema');
|
|
380
|
+
}
|
|
381
|
+
if (this.options.mode && this.options.mode === 'editor') {
|
|
382
|
+
yield this.options.serializableCallback({});
|
|
383
|
+
return;
|
|
384
|
+
}
|
|
365
385
|
yield this.ensureScriptsLoaded(page);
|
|
366
386
|
const scrapeResult = yield page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
387
|
+
if (!this.cumulativeResults || !Array.isArray(this.cumulativeResults)) {
|
|
388
|
+
this.cumulativeResults = [];
|
|
389
|
+
}
|
|
390
|
+
if (this.cumulativeResults.length === 0) {
|
|
391
|
+
this.cumulativeResults.push({});
|
|
392
|
+
}
|
|
393
|
+
const mergedResult = this.cumulativeResults[0];
|
|
394
|
+
const resultToProcess = Array.isArray(scrapeResult) ? scrapeResult[0] : scrapeResult;
|
|
395
|
+
Object.entries(resultToProcess).forEach(([key, value]) => {
|
|
396
|
+
if (value !== undefined) {
|
|
397
|
+
mergedResult[key] = value;
|
|
398
|
+
}
|
|
375
399
|
});
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
Object.entries(curr).forEach(([key, value]) => {
|
|
379
|
-
// If the key doesn't exist or the current value is not undefined, add/update it
|
|
380
|
-
if (value !== undefined) {
|
|
381
|
-
acc[key] = value;
|
|
382
|
-
}
|
|
383
|
-
});
|
|
384
|
-
return acc;
|
|
385
|
-
}, {})))
|
|
386
|
-
];
|
|
387
|
-
// Log cumulative results after each action
|
|
388
|
-
console.log("CUMULATIVE results:", this.cumulativeResults);
|
|
389
|
-
console.log("MERGED results:", mergedResult);
|
|
390
|
-
yield this.options.serializableCallback(mergedResult);
|
|
391
|
-
// await this.options.serializableCallback(scrapeResult);
|
|
400
|
+
console.log("Updated merged result:", mergedResult);
|
|
401
|
+
yield this.options.serializableCallback([mergedResult]);
|
|
392
402
|
}),
|
|
393
403
|
scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
404
|
+
var _a;
|
|
405
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
406
|
+
this.options.debugChannel.setActionType('scrapeList');
|
|
407
|
+
}
|
|
408
|
+
if (this.options.mode && this.options.mode === 'editor') {
|
|
409
|
+
yield this.options.serializableCallback({});
|
|
410
|
+
return;
|
|
411
|
+
}
|
|
394
412
|
yield this.ensureScriptsLoaded(page);
|
|
395
413
|
let scrapeResults = [];
|
|
396
414
|
if (!config.pagination) {
|
|
@@ -402,6 +420,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
402
420
|
yield this.options.serializableCallback(scrapeResults);
|
|
403
421
|
}),
|
|
404
422
|
scrapeListAuto: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
423
|
+
var _a;
|
|
424
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
425
|
+
this.options.debugChannel.setActionType('scrapeListAuto');
|
|
426
|
+
}
|
|
405
427
|
yield this.ensureScriptsLoaded(page);
|
|
406
428
|
const scrapeResults = yield page.evaluate((listSelector) => {
|
|
407
429
|
return window.scrapeListAuto(listSelector);
|
|
@@ -409,6 +431,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
409
431
|
yield this.options.serializableCallback(scrapeResults);
|
|
410
432
|
}),
|
|
411
433
|
scroll: (pages) => __awaiter(this, void 0, void 0, function* () {
|
|
434
|
+
var _a;
|
|
435
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
436
|
+
this.options.debugChannel.setActionType('scroll');
|
|
437
|
+
}
|
|
412
438
|
yield page.evaluate((pagesInternal) => __awaiter(this, void 0, void 0, function* () {
|
|
413
439
|
for (let i = 1; i <= (pagesInternal !== null && pagesInternal !== void 0 ? pagesInternal : 1); i += 1) {
|
|
414
440
|
// @ts-ignore
|
|
@@ -417,12 +443,20 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
417
443
|
}), pages !== null && pages !== void 0 ? pages : 1);
|
|
418
444
|
}),
|
|
419
445
|
script: (code) => __awaiter(this, void 0, void 0, function* () {
|
|
446
|
+
var _a;
|
|
447
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
448
|
+
this.options.debugChannel.setActionType('script');
|
|
449
|
+
}
|
|
420
450
|
const AsyncFunction = Object.getPrototypeOf(() => __awaiter(this, void 0, void 0, function* () { })).constructor;
|
|
421
451
|
const x = new AsyncFunction('page', 'log', code);
|
|
422
452
|
yield x(page, this.log);
|
|
423
453
|
}),
|
|
424
454
|
flag: () => __awaiter(this, void 0, void 0, function* () {
|
|
425
455
|
return new Promise((res) => {
|
|
456
|
+
var _a;
|
|
457
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
458
|
+
this.options.debugChannel.setActionType('flag');
|
|
459
|
+
}
|
|
426
460
|
this.emit('flag', page, res);
|
|
427
461
|
});
|
|
428
462
|
}),
|
|
@@ -494,6 +528,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
494
528
|
let visitedUrls = new Set();
|
|
495
529
|
const MAX_RETRIES = 3;
|
|
496
530
|
const RETRY_DELAY = 1000; // 1 second delay between retries
|
|
531
|
+
const MAX_UNCHANGED_RESULTS = 5;
|
|
497
532
|
const debugLog = (message, ...args) => {
|
|
498
533
|
console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
|
|
499
534
|
};
|
|
@@ -572,28 +607,55 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
572
607
|
}
|
|
573
608
|
});
|
|
574
609
|
let availableSelectors = config.pagination.selector.split(',');
|
|
610
|
+
let unchangedResultCounter = 0;
|
|
575
611
|
try {
|
|
576
612
|
while (true) {
|
|
577
613
|
switch (config.pagination.type) {
|
|
578
614
|
case 'scrollDown': {
|
|
615
|
+
let previousResultCount = allResults.length;
|
|
616
|
+
yield scrapeCurrentPage();
|
|
617
|
+
if (checkLimit()) {
|
|
618
|
+
return allResults;
|
|
619
|
+
}
|
|
579
620
|
yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
|
580
621
|
yield page.waitForTimeout(2000);
|
|
581
622
|
const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
|
|
623
|
+
const currentResultCount = allResults.length;
|
|
624
|
+
if (currentResultCount === previousResultCount) {
|
|
625
|
+
unchangedResultCounter++;
|
|
626
|
+
if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
|
|
627
|
+
return allResults;
|
|
628
|
+
}
|
|
629
|
+
}
|
|
630
|
+
else {
|
|
631
|
+
unchangedResultCounter = 0;
|
|
632
|
+
}
|
|
582
633
|
if (currentHeight === previousHeight) {
|
|
583
|
-
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
584
|
-
allResults = allResults.concat(finalResults);
|
|
585
634
|
return allResults;
|
|
586
635
|
}
|
|
587
636
|
previousHeight = currentHeight;
|
|
588
637
|
break;
|
|
589
638
|
}
|
|
590
639
|
case 'scrollUp': {
|
|
640
|
+
let previousResultCount = allResults.length;
|
|
641
|
+
yield scrapeCurrentPage();
|
|
642
|
+
if (checkLimit()) {
|
|
643
|
+
return allResults;
|
|
644
|
+
}
|
|
591
645
|
yield page.evaluate(() => window.scrollTo(0, 0));
|
|
592
646
|
yield page.waitForTimeout(2000);
|
|
593
647
|
const currentTopHeight = yield page.evaluate(() => document.documentElement.scrollTop);
|
|
648
|
+
const currentResultCount = allResults.length;
|
|
649
|
+
if (currentResultCount === previousResultCount) {
|
|
650
|
+
unchangedResultCounter++;
|
|
651
|
+
if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
|
|
652
|
+
return allResults;
|
|
653
|
+
}
|
|
654
|
+
}
|
|
655
|
+
else {
|
|
656
|
+
unchangedResultCounter = 0;
|
|
657
|
+
}
|
|
594
658
|
if (currentTopHeight === 0) {
|
|
595
|
-
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
596
|
-
allResults = allResults.concat(finalResults);
|
|
597
659
|
return allResults;
|
|
598
660
|
}
|
|
599
661
|
previousHeight = currentTopHeight;
|