maxun-core 0.0.15 → 0.0.16
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +14 -0
- package/build/interpret.d.ts +4 -2
- package/build/interpret.js +62 -24
- package/package.json +1 -1
|
@@ -466,6 +466,20 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
466
466
|
return element.innerHTML.trim();
|
|
467
467
|
}
|
|
468
468
|
else if (attribute === 'src' || attribute === 'href') {
|
|
469
|
+
if (attribute === 'href' && element.tagName !== 'A') {
|
|
470
|
+
const parentElement = element.parentElement;
|
|
471
|
+
if (parentElement && parentElement.tagName === 'A') {
|
|
472
|
+
const parentHref = parentElement.getAttribute('href');
|
|
473
|
+
if (parentHref) {
|
|
474
|
+
try {
|
|
475
|
+
return new URL(parentHref, baseURL).href;
|
|
476
|
+
}
|
|
477
|
+
catch (e) {
|
|
478
|
+
return parentHref;
|
|
479
|
+
}
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
}
|
|
469
483
|
const attrValue = element.getAttribute(attribute);
|
|
470
484
|
const dataAttr = attrValue || element.getAttribute('data-' + attribute);
|
|
471
485
|
if (!dataAttr || dataAttr.trim() === '') {
|
package/build/interpret.d.ts
CHANGED
|
@@ -31,14 +31,16 @@ declare global {
|
|
|
31
31
|
* Defines optional interpreter options (passed in constructor)
|
|
32
32
|
*/
|
|
33
33
|
interface InterpreterOptions {
|
|
34
|
+
mode?: string;
|
|
34
35
|
maxRepeats: number;
|
|
35
36
|
maxConcurrency: number;
|
|
36
37
|
serializableCallback: (output: any) => (void | Promise<void>);
|
|
37
38
|
binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
|
|
38
39
|
debug: boolean;
|
|
39
40
|
debugChannel: Partial<{
|
|
40
|
-
activeId:
|
|
41
|
-
debugMessage:
|
|
41
|
+
activeId: (id: number) => void;
|
|
42
|
+
debugMessage: (msg: string) => void;
|
|
43
|
+
setActionType: (type: string) => void;
|
|
42
44
|
}>;
|
|
43
45
|
}
|
|
44
46
|
/**
|
package/build/interpret.js
CHANGED
|
@@ -300,6 +300,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
300
300
|
* @param steps Array of actions.
|
|
301
301
|
*/
|
|
302
302
|
carryOutSteps(page, steps) {
|
|
303
|
+
var _a;
|
|
303
304
|
return __awaiter(this, void 0, void 0, function* () {
|
|
304
305
|
/**
|
|
305
306
|
* Defines overloaded (or added) methods/actions usable in the workflow.
|
|
@@ -311,10 +312,18 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
311
312
|
*/
|
|
312
313
|
const wawActions = {
|
|
313
314
|
screenshot: (params) => __awaiter(this, void 0, void 0, function* () {
|
|
315
|
+
var _b;
|
|
316
|
+
if ((_b = this.options.debugChannel) === null || _b === void 0 ? void 0 : _b.setActionType) {
|
|
317
|
+
this.options.debugChannel.setActionType('screenshot');
|
|
318
|
+
}
|
|
314
319
|
const screenshotBuffer = yield page.screenshot(Object.assign(Object.assign({}, params), { path: undefined }));
|
|
315
320
|
yield this.options.binaryCallback(screenshotBuffer, 'image/png');
|
|
316
321
|
}),
|
|
317
322
|
enqueueLinks: (selector) => __awaiter(this, void 0, void 0, function* () {
|
|
323
|
+
var _c;
|
|
324
|
+
if ((_c = this.options.debugChannel) === null || _c === void 0 ? void 0 : _c.setActionType) {
|
|
325
|
+
this.options.debugChannel.setActionType('enqueueLinks');
|
|
326
|
+
}
|
|
318
327
|
const links = yield page.locator(selector)
|
|
319
328
|
.evaluateAll(
|
|
320
329
|
// @ts-ignore
|
|
@@ -340,40 +349,50 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
340
349
|
yield page.close();
|
|
341
350
|
}),
|
|
342
351
|
scrape: (selector) => __awaiter(this, void 0, void 0, function* () {
|
|
352
|
+
var _d;
|
|
353
|
+
if ((_d = this.options.debugChannel) === null || _d === void 0 ? void 0 : _d.setActionType) {
|
|
354
|
+
this.options.debugChannel.setActionType('scrape');
|
|
355
|
+
}
|
|
343
356
|
yield this.ensureScriptsLoaded(page);
|
|
344
357
|
const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
|
|
345
358
|
yield this.options.serializableCallback(scrapeResults);
|
|
346
359
|
}),
|
|
347
360
|
scrapeSchema: (schema) => __awaiter(this, void 0, void 0, function* () {
|
|
361
|
+
var _e;
|
|
362
|
+
if ((_e = this.options.debugChannel) === null || _e === void 0 ? void 0 : _e.setActionType) {
|
|
363
|
+
this.options.debugChannel.setActionType('scrapeSchema');
|
|
364
|
+
}
|
|
365
|
+
if (this.options.mode && this.options.mode === 'editor') {
|
|
366
|
+
yield this.options.serializableCallback({});
|
|
367
|
+
return;
|
|
368
|
+
}
|
|
348
369
|
yield this.ensureScriptsLoaded(page);
|
|
349
370
|
const scrapeResult = yield page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
371
|
+
if (!this.cumulativeResults || !Array.isArray(this.cumulativeResults)) {
|
|
372
|
+
this.cumulativeResults = [];
|
|
373
|
+
}
|
|
374
|
+
if (this.cumulativeResults.length === 0) {
|
|
375
|
+
this.cumulativeResults.push({});
|
|
376
|
+
}
|
|
377
|
+
const mergedResult = this.cumulativeResults[0];
|
|
378
|
+
const resultToProcess = Array.isArray(scrapeResult) ? scrapeResult[0] : scrapeResult;
|
|
379
|
+
Object.entries(resultToProcess).forEach(([key, value]) => {
|
|
380
|
+
if (value !== undefined) {
|
|
381
|
+
mergedResult[key] = value;
|
|
382
|
+
}
|
|
358
383
|
});
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
Object.entries(curr).forEach(([key, value]) => {
|
|
362
|
-
// If the key doesn't exist or the current value is not undefined, add/update it
|
|
363
|
-
if (value !== undefined) {
|
|
364
|
-
acc[key] = value;
|
|
365
|
-
}
|
|
366
|
-
});
|
|
367
|
-
return acc;
|
|
368
|
-
}, {})))
|
|
369
|
-
];
|
|
370
|
-
// Log cumulative results after each action
|
|
371
|
-
console.log("CUMULATIVE results:", this.cumulativeResults);
|
|
372
|
-
console.log("MERGED results:", mergedResult);
|
|
373
|
-
yield this.options.serializableCallback(mergedResult);
|
|
374
|
-
// await this.options.serializableCallback(scrapeResult);
|
|
384
|
+
console.log("Updated merged result:", mergedResult);
|
|
385
|
+
yield this.options.serializableCallback([mergedResult]);
|
|
375
386
|
}),
|
|
376
387
|
scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
388
|
+
var _f;
|
|
389
|
+
if ((_f = this.options.debugChannel) === null || _f === void 0 ? void 0 : _f.setActionType) {
|
|
390
|
+
this.options.debugChannel.setActionType('scrapeList');
|
|
391
|
+
}
|
|
392
|
+
if (this.options.mode && this.options.mode === 'editor') {
|
|
393
|
+
yield this.options.serializableCallback({});
|
|
394
|
+
return;
|
|
395
|
+
}
|
|
377
396
|
yield this.ensureScriptsLoaded(page);
|
|
378
397
|
if (!config.pagination) {
|
|
379
398
|
const scrapeResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
@@ -385,6 +404,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
385
404
|
}
|
|
386
405
|
}),
|
|
387
406
|
scrapeListAuto: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
407
|
+
var _g;
|
|
408
|
+
if ((_g = this.options.debugChannel) === null || _g === void 0 ? void 0 : _g.setActionType) {
|
|
409
|
+
this.options.debugChannel.setActionType('scrapeListAuto');
|
|
410
|
+
}
|
|
388
411
|
yield this.ensureScriptsLoaded(page);
|
|
389
412
|
const scrapeResults = yield page.evaluate((listSelector) => {
|
|
390
413
|
return window.scrapeListAuto(listSelector);
|
|
@@ -392,6 +415,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
392
415
|
yield this.options.serializableCallback(scrapeResults);
|
|
393
416
|
}),
|
|
394
417
|
scroll: (pages) => __awaiter(this, void 0, void 0, function* () {
|
|
418
|
+
var _h;
|
|
419
|
+
if ((_h = this.options.debugChannel) === null || _h === void 0 ? void 0 : _h.setActionType) {
|
|
420
|
+
this.options.debugChannel.setActionType('scroll');
|
|
421
|
+
}
|
|
395
422
|
yield page.evaluate((pagesInternal) => __awaiter(this, void 0, void 0, function* () {
|
|
396
423
|
for (let i = 1; i <= (pagesInternal !== null && pagesInternal !== void 0 ? pagesInternal : 1); i += 1) {
|
|
397
424
|
// @ts-ignore
|
|
@@ -400,12 +427,20 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
400
427
|
}), pages !== null && pages !== void 0 ? pages : 1);
|
|
401
428
|
}),
|
|
402
429
|
script: (code) => __awaiter(this, void 0, void 0, function* () {
|
|
430
|
+
var _j;
|
|
431
|
+
if ((_j = this.options.debugChannel) === null || _j === void 0 ? void 0 : _j.setActionType) {
|
|
432
|
+
this.options.debugChannel.setActionType('script');
|
|
433
|
+
}
|
|
403
434
|
const AsyncFunction = Object.getPrototypeOf(() => __awaiter(this, void 0, void 0, function* () { })).constructor;
|
|
404
435
|
const x = new AsyncFunction('page', 'log', code);
|
|
405
436
|
yield x(page, this.log);
|
|
406
437
|
}),
|
|
407
438
|
flag: () => __awaiter(this, void 0, void 0, function* () {
|
|
408
439
|
return new Promise((res) => {
|
|
440
|
+
var _a;
|
|
441
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
442
|
+
this.options.debugChannel.setActionType('flag');
|
|
443
|
+
}
|
|
409
444
|
this.emit('flag', page, res);
|
|
410
445
|
});
|
|
411
446
|
}),
|
|
@@ -433,6 +468,9 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
433
468
|
yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []));
|
|
434
469
|
}
|
|
435
470
|
else {
|
|
471
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
472
|
+
this.options.debugChannel.setActionType(String(step.action));
|
|
473
|
+
}
|
|
436
474
|
// Implements the dot notation for the "method name" in the workflow
|
|
437
475
|
const levels = String(step.action).split('.');
|
|
438
476
|
const methodName = levels[levels.length - 1];
|