mx-cloud 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +14 -0
- package/build/interpret.d.ts +3 -2
- package/build/interpret.js +82 -28
- package/package.json +1 -1
|
@@ -466,6 +466,20 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
466
466
|
return element.innerHTML.trim();
|
|
467
467
|
}
|
|
468
468
|
else if (attribute === 'src' || attribute === 'href') {
|
|
469
|
+
if (attribute === 'href' && element.tagName !== 'A') {
|
|
470
|
+
const parentElement = element.parentElement;
|
|
471
|
+
if (parentElement && parentElement.tagName === 'A') {
|
|
472
|
+
const parentHref = parentElement.getAttribute('href');
|
|
473
|
+
if (parentHref) {
|
|
474
|
+
try {
|
|
475
|
+
return new URL(parentHref, baseURL).href;
|
|
476
|
+
}
|
|
477
|
+
catch (e) {
|
|
478
|
+
return parentHref;
|
|
479
|
+
}
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
}
|
|
469
483
|
const attrValue = element.getAttribute(attribute);
|
|
470
484
|
const dataAttr = attrValue || element.getAttribute('data-' + attribute);
|
|
471
485
|
if (!dataAttr || dataAttr.trim() === '') {
|
package/build/interpret.d.ts
CHANGED
|
@@ -36,8 +36,9 @@ interface InterpreterOptions {
|
|
|
36
36
|
binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
|
|
37
37
|
debug: boolean;
|
|
38
38
|
debugChannel: Partial<{
|
|
39
|
-
activeId:
|
|
40
|
-
debugMessage:
|
|
39
|
+
activeId: (id: number) => void;
|
|
40
|
+
debugMessage: (msg: string) => void;
|
|
41
|
+
setActionType: (type: string) => void;
|
|
41
42
|
}>;
|
|
42
43
|
}
|
|
43
44
|
/**
|
package/build/interpret.js
CHANGED
|
@@ -328,10 +328,18 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
328
328
|
*/
|
|
329
329
|
const wawActions = {
|
|
330
330
|
screenshot: (params) => __awaiter(this, void 0, void 0, function* () {
|
|
331
|
+
var _a;
|
|
332
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
333
|
+
this.options.debugChannel.setActionType('screenshot');
|
|
334
|
+
}
|
|
331
335
|
const screenshotBuffer = yield page.screenshot(Object.assign(Object.assign({}, params), { path: undefined }));
|
|
332
336
|
yield this.options.binaryCallback(screenshotBuffer, 'image/png');
|
|
333
337
|
}),
|
|
334
338
|
enqueueLinks: (selector) => __awaiter(this, void 0, void 0, function* () {
|
|
339
|
+
var _a;
|
|
340
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
341
|
+
this.options.debugChannel.setActionType('enqueueLinks');
|
|
342
|
+
}
|
|
335
343
|
const links = yield page.locator(selector)
|
|
336
344
|
.evaluateAll(
|
|
337
345
|
// @ts-ignore
|
|
@@ -357,40 +365,42 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
357
365
|
yield page.close();
|
|
358
366
|
}),
|
|
359
367
|
scrape: (selector) => __awaiter(this, void 0, void 0, function* () {
|
|
368
|
+
var _a;
|
|
369
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
370
|
+
this.options.debugChannel.setActionType('scrape');
|
|
371
|
+
}
|
|
360
372
|
yield this.ensureScriptsLoaded(page);
|
|
361
373
|
const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
|
|
362
374
|
yield this.options.serializableCallback(scrapeResults);
|
|
363
375
|
}),
|
|
364
376
|
scrapeSchema: (schema) => __awaiter(this, void 0, void 0, function* () {
|
|
377
|
+
var _a;
|
|
378
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
379
|
+
this.options.debugChannel.setActionType('scrapeSchema');
|
|
380
|
+
}
|
|
365
381
|
yield this.ensureScriptsLoaded(page);
|
|
366
382
|
const scrapeResult = yield page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
383
|
+
if (!this.cumulativeResults || !Array.isArray(this.cumulativeResults)) {
|
|
384
|
+
this.cumulativeResults = [];
|
|
385
|
+
}
|
|
386
|
+
if (this.cumulativeResults.length === 0) {
|
|
387
|
+
this.cumulativeResults.push({});
|
|
388
|
+
}
|
|
389
|
+
const mergedResult = this.cumulativeResults[0];
|
|
390
|
+
const resultToProcess = Array.isArray(scrapeResult) ? scrapeResult[0] : scrapeResult;
|
|
391
|
+
Object.entries(resultToProcess).forEach(([key, value]) => {
|
|
392
|
+
if (value !== undefined) {
|
|
393
|
+
mergedResult[key] = value;
|
|
394
|
+
}
|
|
375
395
|
});
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
Object.entries(curr).forEach(([key, value]) => {
|
|
379
|
-
// If the key doesn't exist or the current value is not undefined, add/update it
|
|
380
|
-
if (value !== undefined) {
|
|
381
|
-
acc[key] = value;
|
|
382
|
-
}
|
|
383
|
-
});
|
|
384
|
-
return acc;
|
|
385
|
-
}, {})))
|
|
386
|
-
];
|
|
387
|
-
// Log cumulative results after each action
|
|
388
|
-
console.log("CUMULATIVE results:", this.cumulativeResults);
|
|
389
|
-
console.log("MERGED results:", mergedResult);
|
|
390
|
-
yield this.options.serializableCallback(mergedResult);
|
|
391
|
-
// await this.options.serializableCallback(scrapeResult);
|
|
396
|
+
console.log("Updated merged result:", mergedResult);
|
|
397
|
+
yield this.options.serializableCallback([mergedResult]);
|
|
392
398
|
}),
|
|
393
399
|
scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
400
|
+
var _a;
|
|
401
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
402
|
+
this.options.debugChannel.setActionType('scrapeList');
|
|
403
|
+
}
|
|
394
404
|
yield this.ensureScriptsLoaded(page);
|
|
395
405
|
let scrapeResults = [];
|
|
396
406
|
if (!config.pagination) {
|
|
@@ -402,6 +412,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
402
412
|
yield this.options.serializableCallback(scrapeResults);
|
|
403
413
|
}),
|
|
404
414
|
scrapeListAuto: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
415
|
+
var _a;
|
|
416
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
417
|
+
this.options.debugChannel.setActionType('scrapeListAuto');
|
|
418
|
+
}
|
|
405
419
|
yield this.ensureScriptsLoaded(page);
|
|
406
420
|
const scrapeResults = yield page.evaluate((listSelector) => {
|
|
407
421
|
return window.scrapeListAuto(listSelector);
|
|
@@ -409,6 +423,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
409
423
|
yield this.options.serializableCallback(scrapeResults);
|
|
410
424
|
}),
|
|
411
425
|
scroll: (pages) => __awaiter(this, void 0, void 0, function* () {
|
|
426
|
+
var _a;
|
|
427
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
428
|
+
this.options.debugChannel.setActionType('scroll');
|
|
429
|
+
}
|
|
412
430
|
yield page.evaluate((pagesInternal) => __awaiter(this, void 0, void 0, function* () {
|
|
413
431
|
for (let i = 1; i <= (pagesInternal !== null && pagesInternal !== void 0 ? pagesInternal : 1); i += 1) {
|
|
414
432
|
// @ts-ignore
|
|
@@ -417,12 +435,20 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
417
435
|
}), pages !== null && pages !== void 0 ? pages : 1);
|
|
418
436
|
}),
|
|
419
437
|
script: (code) => __awaiter(this, void 0, void 0, function* () {
|
|
438
|
+
var _a;
|
|
439
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
440
|
+
this.options.debugChannel.setActionType('script');
|
|
441
|
+
}
|
|
420
442
|
const AsyncFunction = Object.getPrototypeOf(() => __awaiter(this, void 0, void 0, function* () { })).constructor;
|
|
421
443
|
const x = new AsyncFunction('page', 'log', code);
|
|
422
444
|
yield x(page, this.log);
|
|
423
445
|
}),
|
|
424
446
|
flag: () => __awaiter(this, void 0, void 0, function* () {
|
|
425
447
|
return new Promise((res) => {
|
|
448
|
+
var _a;
|
|
449
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
450
|
+
this.options.debugChannel.setActionType('flag');
|
|
451
|
+
}
|
|
426
452
|
this.emit('flag', page, res);
|
|
427
453
|
});
|
|
428
454
|
}),
|
|
@@ -494,6 +520,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
494
520
|
let visitedUrls = new Set();
|
|
495
521
|
const MAX_RETRIES = 3;
|
|
496
522
|
const RETRY_DELAY = 1000; // 1 second delay between retries
|
|
523
|
+
const MAX_UNCHANGED_RESULTS = 5;
|
|
497
524
|
const debugLog = (message, ...args) => {
|
|
498
525
|
console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
|
|
499
526
|
};
|
|
@@ -572,28 +599,55 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
572
599
|
}
|
|
573
600
|
});
|
|
574
601
|
let availableSelectors = config.pagination.selector.split(',');
|
|
602
|
+
let unchangedResultCounter = 0;
|
|
575
603
|
try {
|
|
576
604
|
while (true) {
|
|
577
605
|
switch (config.pagination.type) {
|
|
578
606
|
case 'scrollDown': {
|
|
607
|
+
let previousResultCount = allResults.length;
|
|
608
|
+
yield scrapeCurrentPage();
|
|
609
|
+
if (checkLimit()) {
|
|
610
|
+
return allResults;
|
|
611
|
+
}
|
|
579
612
|
yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
|
580
613
|
yield page.waitForTimeout(2000);
|
|
581
614
|
const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
|
|
615
|
+
const currentResultCount = allResults.length;
|
|
616
|
+
if (currentResultCount === previousResultCount) {
|
|
617
|
+
unchangedResultCounter++;
|
|
618
|
+
if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
|
|
619
|
+
return allResults;
|
|
620
|
+
}
|
|
621
|
+
}
|
|
622
|
+
else {
|
|
623
|
+
unchangedResultCounter = 0;
|
|
624
|
+
}
|
|
582
625
|
if (currentHeight === previousHeight) {
|
|
583
|
-
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
584
|
-
allResults = allResults.concat(finalResults);
|
|
585
626
|
return allResults;
|
|
586
627
|
}
|
|
587
628
|
previousHeight = currentHeight;
|
|
588
629
|
break;
|
|
589
630
|
}
|
|
590
631
|
case 'scrollUp': {
|
|
632
|
+
let previousResultCount = allResults.length;
|
|
633
|
+
yield scrapeCurrentPage();
|
|
634
|
+
if (checkLimit()) {
|
|
635
|
+
return allResults;
|
|
636
|
+
}
|
|
591
637
|
yield page.evaluate(() => window.scrollTo(0, 0));
|
|
592
638
|
yield page.waitForTimeout(2000);
|
|
593
639
|
const currentTopHeight = yield page.evaluate(() => document.documentElement.scrollTop);
|
|
640
|
+
const currentResultCount = allResults.length;
|
|
641
|
+
if (currentResultCount === previousResultCount) {
|
|
642
|
+
unchangedResultCounter++;
|
|
643
|
+
if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
|
|
644
|
+
return allResults;
|
|
645
|
+
}
|
|
646
|
+
}
|
|
647
|
+
else {
|
|
648
|
+
unchangedResultCounter = 0;
|
|
649
|
+
}
|
|
594
650
|
if (currentTopHeight === 0) {
|
|
595
|
-
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
596
|
-
allResults = allResults.concat(finalResults);
|
|
597
651
|
return allResults;
|
|
598
652
|
}
|
|
599
653
|
previousHeight = currentTopHeight;
|