maxun-core 0.0.26 → 0.0.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +2 -2
- package/build/interpret.d.ts +8 -1
- package/build/interpret.js +164 -88
- package/build/types/workflow.d.ts +1 -1
- package/build/utils/logger.js +2 -2
- package/build/utils/utils.js +2 -1
- package/package.json +5 -5
|
@@ -357,8 +357,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
357
357
|
* @param {boolean} [config.flexible=false] - Whether to use flexible matching for field selectors
|
|
358
358
|
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
|
|
359
359
|
*/
|
|
360
|
-
window.scrapeList = function (
|
|
361
|
-
return __awaiter(this,
|
|
360
|
+
window.scrapeList = function ({ listSelector, fields, limit = 10 }) {
|
|
361
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
362
362
|
// XPath evaluation functions
|
|
363
363
|
const queryInsideContext = (context, part) => {
|
|
364
364
|
try {
|
package/build/interpret.d.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
|
|
1
|
+
/// <reference types="node" />
|
|
2
|
+
import { Page } from 'playwright-core';
|
|
2
3
|
import { EventEmitter } from 'events';
|
|
3
4
|
import { WorkflowFile, ParamType } from './types/workflow';
|
|
4
5
|
/**
|
|
@@ -59,6 +60,7 @@ export default class Interpreter extends EventEmitter {
|
|
|
59
60
|
private namedResults;
|
|
60
61
|
private screenshotCounter;
|
|
61
62
|
private serializableDataByType;
|
|
63
|
+
private scrapeListCounter;
|
|
62
64
|
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
|
|
63
65
|
/**
|
|
64
66
|
* Sets the abort flag to immediately stop all operations
|
|
@@ -114,5 +116,10 @@ export default class Interpreter extends EventEmitter {
|
|
|
114
116
|
*/
|
|
115
117
|
run(page: Page, params?: ParamType): Promise<void>;
|
|
116
118
|
stop(): Promise<void>;
|
|
119
|
+
/**
|
|
120
|
+
* Cleanup method to release resources and prevent memory leaks
|
|
121
|
+
* Call this when the interpreter is no longer needed
|
|
122
|
+
*/
|
|
123
|
+
cleanup(): Promise<void>;
|
|
117
124
|
}
|
|
118
125
|
export {};
|
package/build/interpret.js
CHANGED
|
@@ -15,23 +15,13 @@ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (
|
|
|
15
15
|
}) : function(o, v) {
|
|
16
16
|
o["default"] = v;
|
|
17
17
|
});
|
|
18
|
-
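var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
-
var ownKeys = function(o) {
|
|
20
|
-
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
-
var ar = [];
|
|
22
|
-
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
-
return ar;
|
|
24
|
-
};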
|
|
25
|
-
return ownKeys(o);
|
|
26
|
-
};
|
|
27
|
-
return function (mod) {
|
|
28
|
-
if (mod && mod.__esModule) return mod;
|
|
29
|
-
var result = {};
|
|
30
|
-
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
-
__setModuleDefault(result, mod);
|
|
32
|
-
return result;
|
|
33
|
-
};
|
|
34
|
-
})();
|
|
18
|
+
var __importStar = (this && this.__importStar) || function (mod) {
|
|
19
|
+
if (mod && mod.__esModule) return mod;
|
|
20
|
+
var result = {};
|
|
21
|
+
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
|
|
22
|
+
__setModuleDefault(result, mod);
|
|
23
|
+
return result;
|
|
24
|
+
};
|
|
35
25
|
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
36
26
|
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
37
27
|
return new (P || (P = Promise))(function (resolve, reject) {
|
|
@@ -71,6 +61,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
71
61
|
scrapeList: {},
|
|
72
62
|
scrapeSchema: {}
|
|
73
63
|
};
|
|
64
|
+
this.scrapeListCounter = 0;
|
|
74
65
|
this.workflow = workflow.workflow;
|
|
75
66
|
this.initializedWorkflow = null;
|
|
76
67
|
this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
|
|
@@ -331,8 +322,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
331
322
|
* @param steps Array of actions.
|
|
332
323
|
*/
|
|
333
324
|
carryOutSteps(page, steps) {
|
|
325
|
+
var _a;
|
|
334
326
|
return __awaiter(this, void 0, void 0, function* () {
|
|
335
|
-
var _a;
|
|
336
327
|
if (this.isAborted) {
|
|
337
328
|
this.log('Workflow aborted, stopping execution', logger_1.Level.WARN);
|
|
338
329
|
return;
|
|
@@ -347,8 +338,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
347
338
|
*/
|
|
348
339
|
const wawActions = {
|
|
349
340
|
screenshot: (params, nameOverride) => __awaiter(this, void 0, void 0, function* () {
|
|
350
|
-
var
|
|
351
|
-
if ((
|
|
341
|
+
var _b;
|
|
342
|
+
if ((_b = this.options.debugChannel) === null || _b === void 0 ? void 0 : _b.setActionType) {
|
|
352
343
|
this.options.debugChannel.setActionType("screenshot");
|
|
353
344
|
}
|
|
354
345
|
const screenshotBuffer = yield page.screenshot(Object.assign(Object.assign({}, params), { path: undefined }));
|
|
@@ -368,8 +359,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
368
359
|
}, "image/png");
|
|
369
360
|
}),
|
|
370
361
|
enqueueLinks: (selector) => __awaiter(this, void 0, void 0, function* () {
|
|
371
|
-
var
|
|
372
|
-
if ((
|
|
362
|
+
var _c;
|
|
363
|
+
if ((_c = this.options.debugChannel) === null || _c === void 0 ? void 0 : _c.setActionType) {
|
|
373
364
|
this.options.debugChannel.setActionType('enqueueLinks');
|
|
374
365
|
}
|
|
375
366
|
const links = yield page.locator(selector)
|
|
@@ -380,8 +371,9 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
380
371
|
for (const link of links) {
|
|
381
372
|
// eslint-disable-next-line
|
|
382
373
|
this.concurrency.addJob(() => __awaiter(this, void 0, void 0, function* () {
|
|
374
|
+
let newPage = null;
|
|
383
375
|
try {
|
|
384
|
-
const newPage = yield context.newPage();
|
|
376
|
+
newPage = yield context.newPage();
|
|
385
377
|
yield newPage.goto(link);
|
|
386
378
|
yield newPage.waitForLoadState('networkidle');
|
|
387
379
|
yield this.runLoop(newPage, this.initializedWorkflow);
|
|
@@ -392,26 +384,36 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
392
384
|
// the interpreter by throwing).
|
|
393
385
|
this.log(e, logger_1.Level.ERROR);
|
|
394
386
|
}
|
|
387
|
+
finally {
|
|
388
|
+
if (newPage && !newPage.isClosed()) {
|
|
389
|
+
try {
|
|
390
|
+
yield newPage.close();
|
|
391
|
+
}
|
|
392
|
+
catch (closeError) {
|
|
393
|
+
this.log('Failed to close enqueued page', logger_1.Level.WARN);
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
}
|
|
395
397
|
}));
|
|
396
398
|
}
|
|
397
399
|
yield page.close();
|
|
398
400
|
}),
|
|
399
401
|
scrape: (selector) => __awaiter(this, void 0, void 0, function* () {
|
|
400
|
-
var
|
|
401
|
-
if ((
|
|
402
|
+
var _d;
|
|
403
|
+
if ((_d = this.options.debugChannel) === null || _d === void 0 ? void 0 : _d.setActionType) {
|
|
402
404
|
this.options.debugChannel.setActionType('scrape');
|
|
403
405
|
}
|
|
404
406
|
yield this.ensureScriptsLoaded(page);
|
|
405
407
|
const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
|
|
406
408
|
yield this.options.serializableCallback(scrapeResults);
|
|
407
409
|
}),
|
|
408
|
-
scrapeSchema: (schema) => __awaiter(this, void 0, void 0, function* () {
|
|
409
|
-
var
|
|
410
|
+
scrapeSchema: (schema, actionName = "") => __awaiter(this, void 0, void 0, function* () {
|
|
411
|
+
var _e;
|
|
410
412
|
if (this.isAborted) {
|
|
411
413
|
this.log('Workflow aborted, stopping scrapeSchema', logger_1.Level.WARN);
|
|
412
414
|
return;
|
|
413
415
|
}
|
|
414
|
-
if ((
|
|
416
|
+
if ((_e = this.options.debugChannel) === null || _e === void 0 ? void 0 : _e.setActionType) {
|
|
415
417
|
this.options.debugChannel.setActionType('scrapeSchema');
|
|
416
418
|
}
|
|
417
419
|
if (this.options.mode && this.options.mode === 'editor') {
|
|
@@ -455,28 +457,28 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
455
457
|
}
|
|
456
458
|
}
|
|
457
459
|
const actionType = "scrapeSchema";
|
|
458
|
-
const
|
|
460
|
+
const name = actionName || "Texts";
|
|
459
461
|
if (!this.namedResults[actionType])
|
|
460
462
|
this.namedResults[actionType] = {};
|
|
461
|
-
this.namedResults[actionType][
|
|
463
|
+
this.namedResults[actionType][name] = this.cumulativeResults;
|
|
462
464
|
if (!this.serializableDataByType[actionType])
|
|
463
465
|
this.serializableDataByType[actionType] = {};
|
|
464
|
-
if (!this.serializableDataByType[actionType][
|
|
465
|
-
this.serializableDataByType[actionType][
|
|
466
|
+
if (!this.serializableDataByType[actionType][name]) {
|
|
467
|
+
this.serializableDataByType[actionType][name] = [];
|
|
466
468
|
}
|
|
467
|
-
this.serializableDataByType[actionType][
|
|
469
|
+
this.serializableDataByType[actionType][name] = [...this.cumulativeResults];
|
|
468
470
|
yield this.options.serializableCallback({
|
|
469
471
|
scrapeList: this.serializableDataByType.scrapeList,
|
|
470
472
|
scrapeSchema: this.serializableDataByType.scrapeSchema
|
|
471
473
|
});
|
|
472
474
|
}),
|
|
473
|
-
scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
474
|
-
var
|
|
475
|
+
scrapeList: (config, actionName = "") => __awaiter(this, void 0, void 0, function* () {
|
|
476
|
+
var _f, _g;
|
|
475
477
|
if (this.isAborted) {
|
|
476
478
|
this.log('Workflow aborted, stopping scrapeList', logger_1.Level.WARN);
|
|
477
479
|
return;
|
|
478
480
|
}
|
|
479
|
-
if ((
|
|
481
|
+
if ((_f = this.options.debugChannel) === null || _f === void 0 ? void 0 : _f.setActionType) {
|
|
480
482
|
this.options.debugChannel.setActionType('scrapeList');
|
|
481
483
|
}
|
|
482
484
|
if (this.options.mode && this.options.mode === 'editor') {
|
|
@@ -485,10 +487,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
485
487
|
}
|
|
486
488
|
try {
|
|
487
489
|
yield this.ensureScriptsLoaded(page);
|
|
488
|
-
if ((
|
|
490
|
+
if ((_g = this.options.debugChannel) === null || _g === void 0 ? void 0 : _g.incrementScrapeListIndex) {
|
|
489
491
|
this.options.debugChannel.incrementScrapeListIndex();
|
|
490
492
|
}
|
|
491
493
|
let scrapeResults = [];
|
|
494
|
+
let paginationUsed = false;
|
|
492
495
|
if (!config.pagination) {
|
|
493
496
|
scrapeResults = yield page.evaluate((cfg) => {
|
|
494
497
|
try {
|
|
@@ -501,34 +504,46 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
501
504
|
}, config);
|
|
502
505
|
}
|
|
503
506
|
else {
|
|
504
|
-
scrapeResults = yield this.handlePagination(page, config);
|
|
507
|
+
paginationUsed = true;
|
|
508
|
+
scrapeResults = yield this.handlePagination(page, config, actionName);
|
|
505
509
|
}
|
|
506
510
|
if (!Array.isArray(scrapeResults)) {
|
|
507
511
|
scrapeResults = [];
|
|
508
512
|
}
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
513
|
+
console.log(`ScrapeList completed with ${scrapeResults.length} results`);
|
|
514
|
+
if (!paginationUsed) {
|
|
515
|
+
const actionType = "scrapeList";
|
|
516
|
+
let name = actionName || "";
|
|
517
|
+
if (!name || name.trim() === "") {
|
|
518
|
+
this.scrapeListCounter++;
|
|
519
|
+
name = `List ${this.scrapeListCounter}`;
|
|
520
|
+
}
|
|
521
|
+
if (!this.serializableDataByType[actionType])
|
|
522
|
+
this.serializableDataByType[actionType] = {};
|
|
523
|
+
if (!this.serializableDataByType[actionType][name]) {
|
|
524
|
+
this.serializableDataByType[actionType][name] = [];
|
|
525
|
+
}
|
|
526
|
+
this.serializableDataByType[actionType][name].push(...scrapeResults);
|
|
527
|
+
yield this.options.serializableCallback({
|
|
528
|
+
scrapeList: this.serializableDataByType.scrapeList,
|
|
529
|
+
scrapeSchema: this.serializableDataByType.scrapeSchema
|
|
530
|
+
});
|
|
515
531
|
}
|
|
516
|
-
this.serializableDataByType[actionType][actionName].push(...scrapeResults);
|
|
517
|
-
yield this.options.serializableCallback({
|
|
518
|
-
scrapeList: this.serializableDataByType.scrapeList,
|
|
519
|
-
scrapeSchema: this.serializableDataByType.scrapeSchema
|
|
520
|
-
});
|
|
521
532
|
}
|
|
522
533
|
catch (error) {
|
|
523
534
|
console.error('ScrapeList action failed completely:', error.message);
|
|
524
535
|
const actionType = "scrapeList";
|
|
525
|
-
|
|
536
|
+
let name = actionName || "";
|
|
537
|
+
if (!name || name.trim() === "") {
|
|
538
|
+
this.scrapeListCounter++;
|
|
539
|
+
name = `List ${this.scrapeListCounter}`;
|
|
540
|
+
}
|
|
526
541
|
if (!this.namedResults[actionType])
|
|
527
542
|
this.namedResults[actionType] = {};
|
|
528
|
-
this.namedResults[actionType][
|
|
543
|
+
this.namedResults[actionType][name] = [];
|
|
529
544
|
if (!this.serializableDataByType[actionType])
|
|
530
545
|
this.serializableDataByType[actionType] = {};
|
|
531
|
-
this.serializableDataByType[actionType][
|
|
546
|
+
this.serializableDataByType[actionType][name] = [];
|
|
532
547
|
yield this.options.serializableCallback({
|
|
533
548
|
scrapeList: this.serializableDataByType.scrapeList,
|
|
534
549
|
scrapeSchema: this.serializableDataByType.scrapeSchema
|
|
@@ -536,8 +551,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
536
551
|
}
|
|
537
552
|
}),
|
|
538
553
|
scrapeListAuto: (config) => __awaiter(this, void 0, void 0, function* () {
|
|
539
|
-
var
|
|
540
|
-
if ((
|
|
554
|
+
var _h;
|
|
555
|
+
if ((_h = this.options.debugChannel) === null || _h === void 0 ? void 0 : _h.setActionType) {
|
|
541
556
|
this.options.debugChannel.setActionType('scrapeListAuto');
|
|
542
557
|
}
|
|
543
558
|
yield this.ensureScriptsLoaded(page);
|
|
@@ -547,8 +562,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
547
562
|
yield this.options.serializableCallback(scrapeResults);
|
|
548
563
|
}),
|
|
549
564
|
scroll: (pages) => __awaiter(this, void 0, void 0, function* () {
|
|
550
|
-
var
|
|
551
|
-
if ((
|
|
565
|
+
var _j;
|
|
566
|
+
if ((_j = this.options.debugChannel) === null || _j === void 0 ? void 0 : _j.setActionType) {
|
|
552
567
|
this.options.debugChannel.setActionType('scroll');
|
|
553
568
|
}
|
|
554
569
|
yield page.evaluate((pagesInternal) => __awaiter(this, void 0, void 0, function* () {
|
|
@@ -559,8 +574,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
559
574
|
}), pages !== null && pages !== void 0 ? pages : 1);
|
|
560
575
|
}),
|
|
561
576
|
script: (code) => __awaiter(this, void 0, void 0, function* () {
|
|
562
|
-
var
|
|
563
|
-
if ((
|
|
577
|
+
var _k;
|
|
578
|
+
if ((_k = this.options.debugChannel) === null || _k === void 0 ? void 0 : _k.setActionType) {
|
|
564
579
|
this.options.debugChannel.setActionType('script');
|
|
565
580
|
}
|
|
566
581
|
try {
|
|
@@ -610,23 +625,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
610
625
|
if (debug === null || debug === void 0 ? void 0 : debug.setActionType) {
|
|
611
626
|
debug.setActionType(String(step.action));
|
|
612
627
|
}
|
|
613
|
-
|
|
614
|
-
stepName = step.name;
|
|
615
|
-
}
|
|
616
|
-
else if (Array.isArray(step === null || step === void 0 ? void 0 : step.args) &&
|
|
617
|
-
step.args.length > 0 &&
|
|
618
|
-
typeof step.args[0] === "object" &&
|
|
619
|
-
"__name" in step.args[0]) {
|
|
620
|
-
stepName = step.args[0].__name;
|
|
621
|
-
}
|
|
622
|
-
else if (typeof (step === null || step === void 0 ? void 0 : step.args) === "object" &&
|
|
623
|
-
(step === null || step === void 0 ? void 0 : step.args) !== null &&
|
|
624
|
-
"__name" in step.args) {
|
|
625
|
-
stepName = step.args.__name;
|
|
626
|
-
}
|
|
627
|
-
if (!stepName) {
|
|
628
|
-
stepName = String(step.action);
|
|
629
|
-
}
|
|
628
|
+
stepName = (step === null || step === void 0 ? void 0 : step.name) || String(step.action);
|
|
630
629
|
if (debug && typeof debug.setActionName === "function") {
|
|
631
630
|
debug.setActionName(stepName);
|
|
632
631
|
}
|
|
@@ -640,6 +639,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
640
639
|
if (step.action === 'screenshot') {
|
|
641
640
|
yield wawActions.screenshot(...(params !== null && params !== void 0 ? params : []), stepName !== null && stepName !== void 0 ? stepName : undefined);
|
|
642
641
|
}
|
|
642
|
+
else if (step.action === 'scrapeList' || step.action === 'scrapeSchema') {
|
|
643
|
+
const actionName = step.name || "";
|
|
644
|
+
yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []), actionName);
|
|
645
|
+
}
|
|
643
646
|
else {
|
|
644
647
|
yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []));
|
|
645
648
|
}
|
|
@@ -699,24 +702,35 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
699
702
|
}
|
|
700
703
|
});
|
|
701
704
|
}
|
|
702
|
-
handlePagination(page, config) {
|
|
705
|
+
handlePagination(page, config, providedActionName = "") {
|
|
703
706
|
return __awaiter(this, void 0, void 0, function* () {
|
|
704
707
|
if (this.isAborted) {
|
|
705
708
|
this.log('Workflow aborted, stopping pagination', logger_1.Level.WARN);
|
|
706
709
|
return [];
|
|
707
710
|
}
|
|
711
|
+
const actionType = "scrapeList";
|
|
712
|
+
let actionName = providedActionName || "";
|
|
713
|
+
if (!actionName || actionName.trim() === "") {
|
|
714
|
+
this.scrapeListCounter++;
|
|
715
|
+
actionName = `List ${this.scrapeListCounter}`;
|
|
716
|
+
}
|
|
717
|
+
if (!this.serializableDataByType[actionType]) {
|
|
718
|
+
this.serializableDataByType[actionType] = {};
|
|
719
|
+
}
|
|
720
|
+
if (!this.serializableDataByType[actionType][actionName]) {
|
|
721
|
+
this.serializableDataByType[actionType][actionName] = [];
|
|
722
|
+
}
|
|
708
723
|
let allResults = [];
|
|
709
724
|
let previousHeight = 0;
|
|
710
725
|
let scrapedItems = new Set();
|
|
711
726
|
let visitedUrls = new Set();
|
|
712
727
|
const MAX_RETRIES = 3;
|
|
713
|
-
const RETRY_DELAY = 1000;
|
|
728
|
+
const RETRY_DELAY = 1000;
|
|
714
729
|
const MAX_UNCHANGED_RESULTS = 5;
|
|
715
730
|
const debugLog = (message, ...args) => {
|
|
716
731
|
console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
|
|
717
732
|
};
|
|
718
733
|
const scrapeCurrentPage = () => __awaiter(this, void 0, void 0, function* () {
|
|
719
|
-
// Check abort flag before scraping current page
|
|
720
734
|
if (this.isAborted) {
|
|
721
735
|
debugLog("Workflow aborted, stopping scrapeCurrentPage");
|
|
722
736
|
return;
|
|
@@ -740,7 +754,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
740
754
|
});
|
|
741
755
|
allResults = allResults.concat(newResults);
|
|
742
756
|
debugLog("Results collected:", allResults.length);
|
|
743
|
-
|
|
757
|
+
this.serializableDataByType[actionType][actionName] = [...allResults];
|
|
758
|
+
yield this.options.serializableCallback({
|
|
759
|
+
scrapeList: this.serializableDataByType.scrapeList,
|
|
760
|
+
scrapeSchema: this.serializableDataByType.scrapeSchema
|
|
761
|
+
});
|
|
744
762
|
});
|
|
745
763
|
const checkLimit = () => {
|
|
746
764
|
if (config.limit && allResults.length >= config.limit) {
|
|
@@ -762,7 +780,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
762
780
|
selector.includes(' or ');
|
|
763
781
|
};
|
|
764
782
|
// Helper function to wait for selector (CSS or XPath)
|
|
765
|
-
const waitForSelectorUniversal = (
|
|
783
|
+
const waitForSelectorUniversal = (selector, options = {}) => __awaiter(this, void 0, void 0, function* () {
|
|
766
784
|
try {
|
|
767
785
|
if (isXPathSelector(selector)) {
|
|
768
786
|
// Use XPath locator
|
|
@@ -842,7 +860,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
842
860
|
updatedSelectors
|
|
843
861
|
};
|
|
844
862
|
});
|
|
845
|
-
const retryOperation = (
|
|
863
|
+
const retryOperation = (operation, retryCount = 0) => __awaiter(this, void 0, void 0, function* () {
|
|
846
864
|
try {
|
|
847
865
|
return yield operation();
|
|
848
866
|
}
|
|
@@ -1006,7 +1024,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1006
1024
|
}).catch(e => {
|
|
1007
1025
|
throw e;
|
|
1008
1026
|
}),
|
|
1009
|
-
|
|
1027
|
+
page.locator(workingSelector).first().click()
|
|
1010
1028
|
]);
|
|
1011
1029
|
debugLog("Navigation successful after regular click");
|
|
1012
1030
|
yield page.waitForTimeout(2000);
|
|
@@ -1022,7 +1040,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1022
1040
|
}).catch(e => {
|
|
1023
1041
|
throw e;
|
|
1024
1042
|
}),
|
|
1025
|
-
|
|
1043
|
+
page.locator(workingSelector).first().dispatchEvent('click')
|
|
1026
1044
|
]);
|
|
1027
1045
|
debugLog("Navigation successful after dispatch event");
|
|
1028
1046
|
yield page.waitForTimeout(2000);
|
|
@@ -1030,11 +1048,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1030
1048
|
}
|
|
1031
1049
|
catch (dispatchNavError) {
|
|
1032
1050
|
try {
|
|
1033
|
-
yield
|
|
1051
|
+
yield page.locator(workingSelector).first().click();
|
|
1034
1052
|
yield page.waitForTimeout(2000);
|
|
1035
1053
|
}
|
|
1036
1054
|
catch (clickError) {
|
|
1037
|
-
yield
|
|
1055
|
+
yield page.locator(workingSelector).first().dispatchEvent('click');
|
|
1038
1056
|
yield page.waitForTimeout(2000);
|
|
1039
1057
|
}
|
|
1040
1058
|
}
|
|
@@ -1222,8 +1240,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1222
1240
|
return workflow;
|
|
1223
1241
|
}
|
|
1224
1242
|
runLoop(p, workflow) {
|
|
1243
|
+
var _a, _b;
|
|
1225
1244
|
return __awaiter(this, void 0, void 0, function* () {
|
|
1226
|
-
var _a, _b;
|
|
1227
1245
|
if (this.isAborted) {
|
|
1228
1246
|
this.log('Workflow aborted in runLoop', logger_1.Level.WARN);
|
|
1229
1247
|
return;
|
|
@@ -1247,36 +1265,52 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1247
1265
|
* User-requested concurrency should be entirely managed by the concurrency manager,
|
|
1248
1266
|
* e.g. via `enqueueLinks`.
|
|
1249
1267
|
*/
|
|
1250
|
-
|
|
1268
|
+
const popupHandler = (popup) => {
|
|
1251
1269
|
this.concurrency.addJob(() => this.runLoop(popup, workflowCopy));
|
|
1252
|
-
}
|
|
1270
|
+
};
|
|
1271
|
+
p.on('popup', popupHandler);
|
|
1253
1272
|
/* eslint no-constant-condition: ["warn", { "checkLoops": false }] */
|
|
1254
1273
|
let loopIterations = 0;
|
|
1255
1274
|
const MAX_LOOP_ITERATIONS = 1000; // Circuit breaker
|
|
1275
|
+
// Cleanup function to remove popup listener
|
|
1276
|
+
const cleanup = () => {
|
|
1277
|
+
try {
|
|
1278
|
+
if (!p.isClosed()) {
|
|
1279
|
+
p.removeListener('popup', popupHandler);
|
|
1280
|
+
}
|
|
1281
|
+
}
|
|
1282
|
+
catch (cleanupError) {
|
|
1283
|
+
}
|
|
1284
|
+
};
|
|
1256
1285
|
while (true) {
|
|
1257
1286
|
if (this.isAborted) {
|
|
1258
1287
|
this.log('Workflow aborted during step execution', logger_1.Level.WARN);
|
|
1288
|
+
cleanup();
|
|
1259
1289
|
return;
|
|
1260
1290
|
}
|
|
1261
1291
|
// Circuit breaker to prevent infinite loops
|
|
1262
1292
|
if (++loopIterations > MAX_LOOP_ITERATIONS) {
|
|
1263
1293
|
this.log('Maximum loop iterations reached, terminating to prevent infinite loop', logger_1.Level.ERROR);
|
|
1294
|
+
cleanup();
|
|
1264
1295
|
return;
|
|
1265
1296
|
}
|
|
1266
1297
|
// Checks whether the page was closed from outside,
|
|
1267
1298
|
// or the workflow execution has been stopped via `interpreter.stop()`
|
|
1268
1299
|
if (p.isClosed() || !this.stopper) {
|
|
1300
|
+
cleanup();
|
|
1269
1301
|
return;
|
|
1270
1302
|
}
|
|
1271
1303
|
try {
|
|
1272
1304
|
yield p.waitForLoadState();
|
|
1273
1305
|
}
|
|
1274
1306
|
catch (e) {
|
|
1307
|
+
cleanup();
|
|
1275
1308
|
yield p.close();
|
|
1276
1309
|
return;
|
|
1277
1310
|
}
|
|
1278
1311
|
if (workflowCopy.length === 0) {
|
|
1279
1312
|
this.log('All actions completed. Workflow finished.', logger_1.Level.LOG);
|
|
1313
|
+
cleanup();
|
|
1280
1314
|
return;
|
|
1281
1315
|
}
|
|
1282
1316
|
// let pageState = {};
|
|
@@ -1355,6 +1389,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1355
1389
|
}
|
|
1356
1390
|
else {
|
|
1357
1391
|
//await this.disableAdBlocker(p);
|
|
1392
|
+
cleanup();
|
|
1358
1393
|
return;
|
|
1359
1394
|
}
|
|
1360
1395
|
}
|
|
@@ -1438,5 +1473,46 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1438
1473
|
}
|
|
1439
1474
|
});
|
|
1440
1475
|
}
|
|
1476
|
+
/**
|
|
1477
|
+
* Cleanup method to release resources and prevent memory leaks
|
|
1478
|
+
* Call this when the interpreter is no longer needed
|
|
1479
|
+
*/
|
|
1480
|
+
cleanup() {
|
|
1481
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
1482
|
+
try {
|
|
1483
|
+
// Stop any running workflows first
|
|
1484
|
+
if (this.stopper) {
|
|
1485
|
+
try {
|
|
1486
|
+
yield this.stop();
|
|
1487
|
+
}
|
|
1488
|
+
catch (error) {
|
|
1489
|
+
this.log(`Error stopping workflow during cleanup: ${error.message}`, logger_1.Level.WARN);
|
|
1490
|
+
}
|
|
1491
|
+
}
|
|
1492
|
+
// Clear ad-blocker resources
|
|
1493
|
+
if (this.blocker) {
|
|
1494
|
+
try {
|
|
1495
|
+
this.blocker = null;
|
|
1496
|
+
this.log('Ad-blocker resources cleared', logger_1.Level.DEBUG);
|
|
1497
|
+
}
|
|
1498
|
+
catch (error) {
|
|
1499
|
+
this.log(`Error cleaning up ad-blocker: ${error.message}`, logger_1.Level.WARN);
|
|
1500
|
+
}
|
|
1501
|
+
}
|
|
1502
|
+
// Clear accumulated data to free memory
|
|
1503
|
+
this.cumulativeResults = [];
|
|
1504
|
+
this.namedResults = {};
|
|
1505
|
+
this.serializableDataByType = { scrapeList: {}, scrapeSchema: {} };
|
|
1506
|
+
// Reset state
|
|
1507
|
+
this.isAborted = false;
|
|
1508
|
+
this.initializedWorkflow = null;
|
|
1509
|
+
this.log('Interpreter cleanup completed', logger_1.Level.DEBUG);
|
|
1510
|
+
}
|
|
1511
|
+
catch (error) {
|
|
1512
|
+
this.log(`Error during interpreter cleanup: ${error.message}`, logger_1.Level.ERROR);
|
|
1513
|
+
throw error;
|
|
1514
|
+
}
|
|
1515
|
+
});
|
|
1516
|
+
}
|
|
1441
1517
|
}
|
|
1442
1518
|
exports.default = Interpreter;
|
package/build/utils/logger.js
CHANGED
|
@@ -4,7 +4,6 @@
|
|
|
4
4
|
*/
|
|
5
5
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
6
|
exports.Level = void 0;
|
|
7
|
-
exports.default = logger;
|
|
8
7
|
var Level;
|
|
9
8
|
(function (Level) {
|
|
10
9
|
Level[Level["DATE"] = 36] = "DATE";
|
|
@@ -13,7 +12,7 @@ var Level;
|
|
|
13
12
|
Level[Level["ERROR"] = 31] = "ERROR";
|
|
14
13
|
Level[Level["DEBUG"] = 95] = "DEBUG";
|
|
15
14
|
Level[Level["RESET"] = 0] = "RESET";
|
|
16
|
-
})(Level || (exports.Level = Level = {}));
|
|
15
|
+
})(Level = exports.Level || (exports.Level = {}));
|
|
17
16
|
function logger(message, level = Level.LOG) {
|
|
18
17
|
let m = message;
|
|
19
18
|
if (message.constructor.name.includes('Error') && typeof message !== 'string') {
|
|
@@ -29,3 +28,4 @@ function logger(message, level = Level.LOG) {
|
|
|
29
28
|
}
|
|
30
29
|
process.stdout.write(`\x1b[${Level.RESET}m\n`);
|
|
31
30
|
}
|
|
31
|
+
exports.default = logger;
|
package/build/utils/utils.js
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* (it still does not represent the "utils" file)
|
|
5
5
|
*/
|
|
6
6
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
-
exports.arrayToObject = arrayToObject;
|
|
7
|
+
exports.arrayToObject = void 0;
|
|
8
8
|
/* eslint-disable import/prefer-default-export */
|
|
9
9
|
/**
|
|
10
10
|
* Converts an array of scalars to an object with **items** of the array **for keys**.
|
|
@@ -12,3 +12,4 @@ exports.arrayToObject = arrayToObject;
|
|
|
12
12
|
function arrayToObject(array) {
|
|
13
13
|
return array.reduce((p, x) => (Object.assign(Object.assign({}, p), { [x]: [] })), {});
|
|
14
14
|
}
|
|
15
|
+
exports.arrayToObject = arrayToObject;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "maxun-core",
|
|
3
|
-
"version": "0.0.26",
|
|
3
|
+
"version": "0.0.28",
|
|
4
4
|
"description": "Core package for Maxun, responsible for data extraction",
|
|
5
5
|
"main": "build/index.js",
|
|
6
6
|
"typings": "build/index.d.ts",
|
|
@@ -31,10 +31,10 @@
|
|
|
31
31
|
"license": "AGPL-3.0-or-later",
|
|
32
32
|
"dependencies": {
|
|
33
33
|
"@cliqz/adblocker-playwright": "^1.31.3",
|
|
34
|
+
"@types/node": "22.7.9",
|
|
34
35
|
"cross-fetch": "^4.0.0",
|
|
35
36
|
"joi": "^17.6.0",
|
|
36
|
-
"playwright": "
|
|
37
|
-
"
|
|
38
|
-
"puppeteer-extra-plugin-stealth": "^2.11.2"
|
|
37
|
+
"playwright-core": "1.57.0",
|
|
38
|
+
"turndown": "^7.2.2"
|
|
39
39
|
}
|
|
40
|
-
}
|
|
40
|
+
}
|