maxun-core 0.0.31 → 0.0.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.d.ts +39 -0
- package/build/interpret.js +270 -52
- package/build/utils/concurrency.d.ts +5 -1
- package/build/utils/concurrency.js +24 -5
- package/package.json +1 -1
package/build/interpret.d.ts
CHANGED
|
@@ -37,6 +37,7 @@ interface InterpreterOptions {
|
|
|
37
37
|
serializableCallback: (output: any) => (void | Promise<void>);
|
|
38
38
|
binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
|
|
39
39
|
debug: boolean;
|
|
40
|
+
type?: 'extract' | 'scrape' | 'crawl' | 'search';
|
|
40
41
|
debugChannel: Partial<{
|
|
41
42
|
activeId: (id: number) => void;
|
|
42
43
|
debugMessage: (msg: string) => void;
|
|
@@ -55,6 +56,7 @@ export default class Interpreter extends EventEmitter {
|
|
|
55
56
|
private concurrency;
|
|
56
57
|
private stopper;
|
|
57
58
|
private isAborted;
|
|
59
|
+
private visualRenderRequired;
|
|
58
60
|
private log;
|
|
59
61
|
private cumulativeResults;
|
|
60
62
|
private namedResults;
|
|
@@ -90,6 +92,43 @@ export default class Interpreter extends EventEmitter {
|
|
|
90
92
|
* @returns True if `where` is applicable in the given context, false otherwise
|
|
91
93
|
*/
|
|
92
94
|
private applicable;
|
|
95
|
+
/**
|
|
96
|
+
* Returns the optimal Playwright `waitUntil` navigation strategy based on
|
|
97
|
+
* whether the current operation requires visual rendering.
|
|
98
|
+
*
|
|
99
|
+
* - `'networkidle'` — used when screenshots are requested; waits for all
|
|
100
|
+
* sub-resources so the page renders correctly.
|
|
101
|
+
* - `'domcontentloaded'` — used for all DOM-only operations (scraping, crawling,
|
|
102
|
+
* extraction, search); does not wait for stylesheets/images to load, for
|
|
103
|
+
* maximum speed.
|
|
104
|
+
*
|
|
105
|
+
* @param blockOverride Pass `true` when the caller will take a screenshot
|
|
106
|
+
* or requires styled layout. When omitted (or nullish), the interpreter-wide `visualRenderRequired` flag is used instead.
|
|
107
|
+
*/
|
|
108
|
+
private getNavigationWaitStrategy;
|
|
109
|
+
/**
|
|
110
|
+
* Returns true if any step in the given `what` block requires a fully
|
|
111
|
+
* rendered page.
|
|
112
|
+
*/
|
|
113
|
+
private blockNeedsVisualRender;
|
|
114
|
+
/**
|
|
115
|
+
* Returns true if any of the remaining blocks in the workflow require a visual render
|
|
116
|
+
* before the next page navigation.
|
|
117
|
+
*/
|
|
118
|
+
private remainingWorkflowNeedsVisualRender;
|
|
119
|
+
/**
|
|
120
|
+
* Waits for a "network quiet window": resolves once no request has started, finished or failed for `quietWindow` ms, or once `timeout` ms have elapsed.
|
|
121
|
+
*/
|
|
122
|
+
private waitForNetworkQuiet;
|
|
123
|
+
/**
|
|
124
|
+
* Scans the remaining workflow to find the next meaningful extraction selector.
|
|
125
|
+
*/
|
|
126
|
+
private getUpcomingExtractionSelector;
|
|
127
|
+
/**
|
|
128
|
+
* Waits up to 5 seconds for every image in the document to report `complete`; a timeout is silently ignored.
|
|
129
|
+
*/
|
|
130
|
+
private waitForImagesLoaded;
|
|
131
|
+
private waitForDynamicStability;
|
|
93
132
|
/**
|
|
94
133
|
* Given a Playwright's page object and a "declarative" list of actions, this function
|
|
95
134
|
* calls all mentioned functions on the Page object.\
|
package/build/interpret.js
CHANGED
|
@@ -35,7 +35,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
35
35
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
36
36
|
};
|
|
37
37
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
38
|
-
const
|
|
38
|
+
const path = __importStar(require("path"));
|
|
39
39
|
const events_1 = require("events");
|
|
40
40
|
const logic_1 = require("./types/logic");
|
|
41
41
|
const utils_1 = require("./utils/utils");
|
|
@@ -51,6 +51,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
51
51
|
super();
|
|
52
52
|
this.stopper = null;
|
|
53
53
|
this.isAborted = false;
|
|
54
|
+
this.visualRenderRequired = false;
|
|
54
55
|
// private blocker: PlaywrightBlocker | null = null;
|
|
55
56
|
this.cumulativeResults = [];
|
|
56
57
|
this.namedResults = {};
|
|
@@ -69,6 +70,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
69
70
|
this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
|
|
70
71
|
(0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN);
|
|
71
72
|
}, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {} }, options);
|
|
73
|
+
this.visualRenderRequired = ((options === null || options === void 0 ? void 0 : options.type) === 'extract');
|
|
72
74
|
this.concurrency = new concurrency_1.default(this.options.maxConcurrency);
|
|
73
75
|
this.log = (...args) => (0, logger_1.default)(...args);
|
|
74
76
|
const error = preprocessor_1.default.validateWorkflow(workflow);
|
|
@@ -290,6 +292,167 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
290
292
|
}
|
|
291
293
|
});
|
|
292
294
|
}
|
|
295
|
+
/**
|
|
296
|
+
* Returns the optimal Playwright `waitUntil` navigation strategy based on
|
|
297
|
+
* whether the current operation requires visual rendering.
|
|
298
|
+
*
|
|
299
|
+
* - `'networkidle'` — used when screenshots are requested; waits for all
|
|
300
|
+
* sub-resources so the page renders correctly.
|
|
301
|
+
* - `'domcontentloaded'` — used for all DOM-only operations (scraping, crawling,
|
|
302
|
+
* extraction, search); does not wait for stylesheets/images to load, for
|
|
303
|
+
* maximum speed.
|
|
304
|
+
*
|
|
305
|
+
* @param blockOverride Pass `true` when the caller will take a screenshot
|
|
306
|
+
* or requires styled layout. When omitted (or nullish), the interpreter-wide `visualRenderRequired` flag is used instead.
|
|
307
|
+
*/
|
|
308
|
+
getNavigationWaitStrategy(blockOverride) {
|
|
309
|
+
const finalRequirement = blockOverride !== null && blockOverride !== void 0 ? blockOverride : this.visualRenderRequired;
|
|
310
|
+
return finalRequirement ? 'networkidle' : 'domcontentloaded';
|
|
311
|
+
}
|
|
312
|
+
/**
|
|
313
|
+
* Returns true if any step in the given `what` block requires a fully
|
|
314
|
+
* rendered page.
|
|
315
|
+
*/
|
|
316
|
+
blockNeedsVisualRender(steps) {
|
|
317
|
+
return steps.some((s) => {
|
|
318
|
+
var _a, _b;
|
|
319
|
+
if (s.action === 'screenshot')
|
|
320
|
+
return true;
|
|
321
|
+
if (s.action === 'scrapeList' || s.action === 'scrapeSchema')
|
|
322
|
+
return true;
|
|
323
|
+
const firstArg = Array.isArray(s.args) ? s.args[0] : s.args;
|
|
324
|
+
if (!firstArg || typeof firstArg !== 'object')
|
|
325
|
+
return false;
|
|
326
|
+
if (s.action === 'scrape') {
|
|
327
|
+
const formats = (_a = firstArg.formats) !== null && _a !== void 0 ? _a : [];
|
|
328
|
+
const heavyFormats = ['markdown', 'html', 'text', 'screenshot-visible', 'screenshot-full'];
|
|
329
|
+
return formats.some((f) => heavyFormats.includes(f));
|
|
330
|
+
}
|
|
331
|
+
if (s.action === 'crawl' || s.action === 'search') {
|
|
332
|
+
const outputFormats = (_b = firstArg.outputFormats) !== null && _b !== void 0 ? _b : [];
|
|
333
|
+
const heavyFormats = ['markdown', 'html', 'text', 'screenshot-visible', 'screenshot-full'];
|
|
334
|
+
return outputFormats.some((f) => heavyFormats.includes(f));
|
|
335
|
+
}
|
|
336
|
+
return false;
|
|
337
|
+
});
|
|
338
|
+
}
|
|
339
|
+
/**
|
|
340
|
+
* Returns true if any of the remaining blocks in the workflow require a visual render
|
|
341
|
+
* before the next page navigation.
|
|
342
|
+
*/
|
|
343
|
+
remainingWorkflowNeedsVisualRender(remainingWorkflow) {
|
|
344
|
+
if (!remainingWorkflow || remainingWorkflow.length === 0)
|
|
345
|
+
return false;
|
|
346
|
+
for (let i = remainingWorkflow.length - 1; i >= 0; i--) {
|
|
347
|
+
const pair = remainingWorkflow[i];
|
|
348
|
+
if (this.blockNeedsVisualRender(pair.what))
|
|
349
|
+
return true;
|
|
350
|
+
if (pair.what.some(s => s.action === 'goto'))
|
|
351
|
+
return false;
|
|
352
|
+
}
|
|
353
|
+
return false;
|
|
354
|
+
}
|
|
355
|
+
/**
|
|
356
|
+
* Waits for a "network quiet window": resolves once no request has started, finished or failed for `quietWindow` ms, or once `timeout` ms have elapsed.
|
|
357
|
+
*/
|
|
358
|
+
waitForNetworkQuiet(page, timeout = 4000, quietWindow = 600) {
|
|
359
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
360
|
+
let lastRequestTime = Date.now();
|
|
361
|
+
const onRequest = () => { lastRequestTime = Date.now(); };
|
|
362
|
+
page.on('request', onRequest);
|
|
363
|
+
page.on('requestfinished', onRequest);
|
|
364
|
+
page.on('requestfailed', onRequest);
|
|
365
|
+
try {
|
|
366
|
+
const checkInterval = 100;
|
|
367
|
+
const start = Date.now();
|
|
368
|
+
while (Date.now() - start < timeout) {
|
|
369
|
+
if (Date.now() - lastRequestTime > quietWindow)
|
|
370
|
+
return;
|
|
371
|
+
yield new Promise(r => setTimeout(r, checkInterval));
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
finally {
|
|
375
|
+
page.off('request', onRequest);
|
|
376
|
+
page.off('requestfinished', onRequest);
|
|
377
|
+
page.off('requestfailed', onRequest);
|
|
378
|
+
}
|
|
379
|
+
});
|
|
380
|
+
}
|
|
381
|
+
/**
|
|
382
|
+
* Scans the remaining workflow to find the next meaningful extraction selector.
|
|
383
|
+
*/
|
|
384
|
+
getUpcomingExtractionSelector(remainingWorkflow) {
|
|
385
|
+
if (!remainingWorkflow || remainingWorkflow.length === 0)
|
|
386
|
+
return null;
|
|
387
|
+
for (let i = remainingWorkflow.length - 1; i >= 0; i--) {
|
|
388
|
+
const pair = remainingWorkflow[i];
|
|
389
|
+
for (const s of pair.what) {
|
|
390
|
+
if (s.action === 'goto')
|
|
391
|
+
return null;
|
|
392
|
+
if (s.action === 'scrapeList' || s.action === 'scrapeSchema') {
|
|
393
|
+
const firstArg = Array.isArray(s.args) ? s.args[0] : s.args;
|
|
394
|
+
if (firstArg === null || firstArg === void 0 ? void 0 : firstArg.listSelector)
|
|
395
|
+
return firstArg.listSelector;
|
|
396
|
+
if (firstArg === null || firstArg === void 0 ? void 0 : firstArg.fields) {
|
|
397
|
+
const firstField = Object.values(firstArg.fields)[0];
|
|
398
|
+
if (firstField === null || firstField === void 0 ? void 0 : firstField.selector)
|
|
399
|
+
return firstField.selector;
|
|
400
|
+
}
|
|
401
|
+
if (firstArg === null || firstArg === void 0 ? void 0 : firstArg.selector)
|
|
402
|
+
return firstArg.selector;
|
|
403
|
+
}
|
|
404
|
+
}
|
|
405
|
+
}
|
|
406
|
+
return null;
|
|
407
|
+
}
|
|
408
|
+
/**
|
|
409
|
+
* Waits up to 5 seconds for every image in the document to report `complete`; a timeout is silently ignored.
|
|
410
|
+
*/
|
|
411
|
+
waitForImagesLoaded(page) {
|
|
412
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
413
|
+
yield page.waitForFunction(() => Array.from(document.images).every(img => img.complete), { timeout: 5000 }).catch(() => { });
|
|
414
|
+
});
|
|
415
|
+
}
|
|
416
|
+
waitForDynamicStability(page, upcomingWorkflow = []) {
|
|
417
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
418
|
+
try {
|
|
419
|
+
const targetSelector = this.getUpcomingExtractionSelector(upcomingWorkflow);
|
|
420
|
+
const signals = [
|
|
421
|
+
this.waitForNetworkQuiet(page, 10000, 1000),
|
|
422
|
+
page.evaluate(() => __awaiter(this, void 0, void 0, function* () {
|
|
423
|
+
let lastLen = 0;
|
|
424
|
+
let stableIterations = 0;
|
|
425
|
+
for (let i = 0; i < 60; i++) {
|
|
426
|
+
const currentLen = document.body.innerText.length;
|
|
427
|
+
if (currentLen > 200 && currentLen === lastLen) {
|
|
428
|
+
stableIterations++;
|
|
429
|
+
}
|
|
430
|
+
else {
|
|
431
|
+
stableIterations = 0;
|
|
432
|
+
}
|
|
433
|
+
if (stableIterations >= 8)
|
|
434
|
+
return true;
|
|
435
|
+
lastLen = currentLen;
|
|
436
|
+
yield new Promise(r => setTimeout(r, 100));
|
|
437
|
+
}
|
|
438
|
+
return false;
|
|
439
|
+
})).catch(() => { }),
|
|
440
|
+
new Promise(resolve => setTimeout(resolve, 10000))
|
|
441
|
+
];
|
|
442
|
+
if (targetSelector) {
|
|
443
|
+
const found = yield page.waitForSelector(targetSelector, { timeout: 8000 }).catch(() => null);
|
|
444
|
+
if (found) {
|
|
445
|
+
yield new Promise(resolve => setTimeout(resolve, 1000));
|
|
446
|
+
return;
|
|
447
|
+
}
|
|
448
|
+
}
|
|
449
|
+
yield Promise.race(signals);
|
|
450
|
+
yield new Promise(resolve => setTimeout(resolve, 1500));
|
|
451
|
+
}
|
|
452
|
+
catch (e) {
|
|
453
|
+
}
|
|
454
|
+
});
|
|
455
|
+
}
|
|
293
456
|
/**
|
|
294
457
|
* Given a Playwright's page object and a "declarative" list of actions, this function
|
|
295
458
|
* calls all mentioned functions on the Page object.\
|
|
@@ -299,7 +462,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
299
462
|
* @param page Playwright Page object
|
|
300
463
|
* @param steps Array of actions.
|
|
301
464
|
*/
|
|
302
|
-
carryOutSteps(page, steps) {
|
|
465
|
+
carryOutSteps(page, steps, currentWorkflow) {
|
|
303
466
|
var _a;
|
|
304
467
|
return __awaiter(this, void 0, void 0, function* () {
|
|
305
468
|
if (this.isAborted) {
|
|
@@ -320,6 +483,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
320
483
|
if ((_b = this.options.debugChannel) === null || _b === void 0 ? void 0 : _b.setActionType) {
|
|
321
484
|
this.options.debugChannel.setActionType("screenshot");
|
|
322
485
|
}
|
|
486
|
+
yield this.waitForImagesLoaded(page);
|
|
323
487
|
const screenshotBuffer = yield page.screenshot(Object.assign(Object.assign({}, params), { path: undefined }));
|
|
324
488
|
const explicitName = (typeof nameOverride === 'string' && nameOverride.trim().length > 0) ? nameOverride.trim() : null;
|
|
325
489
|
let screenshotName;
|
|
@@ -352,8 +516,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
352
516
|
let newPage = null;
|
|
353
517
|
try {
|
|
354
518
|
newPage = yield context.newPage();
|
|
355
|
-
yield newPage.goto(link);
|
|
356
|
-
yield newPage.waitForLoadState('networkidle');
|
|
519
|
+
yield newPage.goto(link, { waitUntil: this.getNavigationWaitStrategy() });
|
|
357
520
|
yield this.runLoop(newPage, this.initializedWorkflow);
|
|
358
521
|
}
|
|
359
522
|
catch (e) {
|
|
@@ -381,6 +544,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
381
544
|
if ((_d = this.options.debugChannel) === null || _d === void 0 ? void 0 : _d.setActionType) {
|
|
382
545
|
this.options.debugChannel.setActionType('scrape');
|
|
383
546
|
}
|
|
547
|
+
yield this.waitForDynamicStability(page, [{
|
|
548
|
+
action: 'scrape',
|
|
549
|
+
args: [selector]
|
|
550
|
+
}]);
|
|
384
551
|
yield this.ensureScriptsLoaded(page);
|
|
385
552
|
const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
|
|
386
553
|
yield this.options.serializableCallback(scrapeResults);
|
|
@@ -394,12 +561,22 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
394
561
|
if ((_e = this.options.debugChannel) === null || _e === void 0 ? void 0 : _e.setActionType) {
|
|
395
562
|
this.options.debugChannel.setActionType('scrapeSchema');
|
|
396
563
|
}
|
|
564
|
+
yield this.waitForDynamicStability(page, [{
|
|
565
|
+
action: 'scrapeSchema',
|
|
566
|
+
args: [schema]
|
|
567
|
+
}]);
|
|
397
568
|
if (this.options.mode && this.options.mode === 'editor') {
|
|
398
569
|
yield this.options.serializableCallback({});
|
|
399
570
|
return;
|
|
400
571
|
}
|
|
401
572
|
yield this.ensureScriptsLoaded(page);
|
|
402
|
-
const
|
|
573
|
+
const normalizedSchema = Object.fromEntries(Object.entries(schema).map(([key, value]) => [
|
|
574
|
+
key,
|
|
575
|
+
typeof value === 'string'
|
|
576
|
+
? { selector: value, tag: '', attribute: 'innerText', shadow: '' }
|
|
577
|
+
: value,
|
|
578
|
+
]));
|
|
579
|
+
const scrapeResult = yield page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), normalizedSchema);
|
|
403
580
|
if (!this.cumulativeResults || !Array.isArray(this.cumulativeResults)) {
|
|
404
581
|
this.cumulativeResults = [];
|
|
405
582
|
}
|
|
@@ -446,10 +623,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
446
623
|
}
|
|
447
624
|
this.serializableDataByType[actionType][name] = [...this.cumulativeResults];
|
|
448
625
|
yield this.options.serializableCallback({
|
|
449
|
-
scrapeList: this.serializableDataByType
|
|
450
|
-
scrapeSchema: this.serializableDataByType
|
|
451
|
-
crawl: this.serializableDataByType
|
|
452
|
-
search: this.serializableDataByType
|
|
626
|
+
scrapeList: this.serializableDataByType['scrapeList'],
|
|
627
|
+
scrapeSchema: this.serializableDataByType['scrapeSchema'],
|
|
628
|
+
crawl: this.serializableDataByType['crawl'] || {},
|
|
629
|
+
search: this.serializableDataByType['search'] || {}
|
|
453
630
|
});
|
|
454
631
|
}),
|
|
455
632
|
scrapeList: (config, actionName = "") => __awaiter(this, void 0, void 0, function* () {
|
|
@@ -505,8 +682,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
505
682
|
}
|
|
506
683
|
this.serializableDataByType[actionType][name].push(...scrapeResults);
|
|
507
684
|
yield this.options.serializableCallback({
|
|
508
|
-
scrapeList: this.serializableDataByType
|
|
509
|
-
scrapeSchema: this.serializableDataByType
|
|
685
|
+
scrapeList: this.serializableDataByType['scrapeList'],
|
|
686
|
+
scrapeSchema: this.serializableDataByType['scrapeSchema']
|
|
510
687
|
});
|
|
511
688
|
}
|
|
512
689
|
}
|
|
@@ -525,8 +702,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
525
702
|
this.serializableDataByType[actionType] = {};
|
|
526
703
|
this.serializableDataByType[actionType][name] = [];
|
|
527
704
|
yield this.options.serializableCallback({
|
|
528
|
-
scrapeList: this.serializableDataByType
|
|
529
|
-
scrapeSchema: this.serializableDataByType
|
|
705
|
+
scrapeList: this.serializableDataByType['scrapeList'],
|
|
706
|
+
scrapeSchema: this.serializableDataByType['scrapeSchema']
|
|
530
707
|
});
|
|
531
708
|
}
|
|
532
709
|
}),
|
|
@@ -765,8 +942,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
765
942
|
};
|
|
766
943
|
const extractLinksFromPage = () => __awaiter(this, void 0, void 0, function* () {
|
|
767
944
|
try {
|
|
768
|
-
yield page.waitForLoadState(
|
|
769
|
-
yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => { });
|
|
945
|
+
yield page.waitForLoadState(this.getNavigationWaitStrategy(), { timeout: 15000 }).catch(() => { });
|
|
770
946
|
yield new Promise(resolve => setTimeout(resolve, 1000));
|
|
771
947
|
const pageLinks = yield page.evaluate(() => {
|
|
772
948
|
const links = [];
|
|
@@ -937,12 +1113,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
937
1113
|
yield new Promise(resolve => setTimeout(resolve, robotRules.crawlDelay));
|
|
938
1114
|
}
|
|
939
1115
|
yield page.goto(url, {
|
|
940
|
-
waitUntil:
|
|
1116
|
+
waitUntil: this.getNavigationWaitStrategy(),
|
|
941
1117
|
timeout: 30000
|
|
942
1118
|
}).catch((err) => {
|
|
943
1119
|
throw new Error(`Navigation failed: ${err.message}`);
|
|
944
1120
|
});
|
|
945
|
-
yield
|
|
1121
|
+
yield this.waitForDynamicStability(page, currentWorkflow || []);
|
|
946
1122
|
const pageResult = yield scrapePageContent(url);
|
|
947
1123
|
pageResult.metadata.depth = depth;
|
|
948
1124
|
crawlResults.push(pageResult);
|
|
@@ -989,10 +1165,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
989
1165
|
}
|
|
990
1166
|
this.serializableDataByType[actionType][actionName] = crawlResults;
|
|
991
1167
|
yield this.options.serializableCallback({
|
|
992
|
-
scrapeList: this.serializableDataByType
|
|
993
|
-
scrapeSchema: this.serializableDataByType
|
|
994
|
-
crawl: this.serializableDataByType
|
|
995
|
-
search: this.serializableDataByType
|
|
1168
|
+
scrapeList: this.serializableDataByType['scrapeList'] || {},
|
|
1169
|
+
scrapeSchema: this.serializableDataByType['scrapeSchema'] || {},
|
|
1170
|
+
crawl: this.serializableDataByType['crawl'] || {},
|
|
1171
|
+
search: this.serializableDataByType['search'] || {}
|
|
996
1172
|
});
|
|
997
1173
|
}
|
|
998
1174
|
catch (error) {
|
|
@@ -1025,7 +1201,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1025
1201
|
const initialDelay = 500 + Math.random() * 1000;
|
|
1026
1202
|
yield new Promise(resolve => setTimeout(resolve, initialDelay));
|
|
1027
1203
|
yield page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
|
|
1028
|
-
yield page.waitForLoadState(
|
|
1204
|
+
yield page.waitForLoadState(this.getNavigationWaitStrategy(), { timeout: 15000 }).catch(() => {
|
|
1029
1205
|
this.log('Load state timeout, continuing anyway', logger_1.Level.WARN);
|
|
1030
1206
|
});
|
|
1031
1207
|
const pageLoadDelay = 2000 + Math.random() * 1500;
|
|
@@ -1214,10 +1390,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1214
1390
|
};
|
|
1215
1391
|
this.serializableDataByType[actionType][actionName] = searchData;
|
|
1216
1392
|
yield this.options.serializableCallback({
|
|
1217
|
-
scrapeList: this.serializableDataByType
|
|
1218
|
-
scrapeSchema: this.serializableDataByType
|
|
1219
|
-
crawl: this.serializableDataByType
|
|
1220
|
-
search: this.serializableDataByType
|
|
1393
|
+
scrapeList: this.serializableDataByType['scrapeList'] || {},
|
|
1394
|
+
scrapeSchema: this.serializableDataByType['scrapeSchema'] || {},
|
|
1395
|
+
crawl: this.serializableDataByType['crawl'] || {},
|
|
1396
|
+
search: this.serializableDataByType['search'] || {}
|
|
1221
1397
|
});
|
|
1222
1398
|
this.log(`Search completed in discover mode with ${searchResults.length} results`, logger_1.Level.LOG);
|
|
1223
1399
|
return;
|
|
@@ -1229,12 +1405,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1229
1405
|
try {
|
|
1230
1406
|
this.log(`[${i + 1}/${searchResults.length}] Scraping: ${result.url}`, logger_1.Level.LOG);
|
|
1231
1407
|
yield page.goto(result.url, {
|
|
1232
|
-
waitUntil:
|
|
1408
|
+
waitUntil: this.getNavigationWaitStrategy(),
|
|
1233
1409
|
timeout: 30000
|
|
1234
1410
|
}).catch(() => {
|
|
1235
1411
|
this.log(`Failed to navigate to ${result.url}, skipping...`, logger_1.Level.WARN);
|
|
1236
1412
|
});
|
|
1237
|
-
yield
|
|
1413
|
+
yield this.waitForDynamicStability(page, currentWorkflow || []);
|
|
1238
1414
|
const pageData = yield page.evaluate(() => {
|
|
1239
1415
|
var _a, _b;
|
|
1240
1416
|
const getMeta = (name) => {
|
|
@@ -1321,10 +1497,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1321
1497
|
};
|
|
1322
1498
|
this.serializableDataByType[actionType][actionName] = searchData;
|
|
1323
1499
|
yield this.options.serializableCallback({
|
|
1324
|
-
scrapeList: this.serializableDataByType
|
|
1325
|
-
scrapeSchema: this.serializableDataByType
|
|
1326
|
-
crawl: this.serializableDataByType
|
|
1327
|
-
search: this.serializableDataByType
|
|
1500
|
+
scrapeList: this.serializableDataByType['scrapeList'] || {},
|
|
1501
|
+
scrapeSchema: this.serializableDataByType['scrapeSchema'] || {},
|
|
1502
|
+
crawl: this.serializableDataByType['crawl'] || {},
|
|
1503
|
+
search: this.serializableDataByType['search'] || {}
|
|
1328
1504
|
});
|
|
1329
1505
|
}
|
|
1330
1506
|
catch (error) {
|
|
@@ -1402,19 +1578,52 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1402
1578
|
for (const level of levels.splice(0, levels.length - 1)) {
|
|
1403
1579
|
invokee = invokee[level];
|
|
1404
1580
|
}
|
|
1405
|
-
if (methodName === '
|
|
1581
|
+
if (methodName === 'goto') {
|
|
1582
|
+
try {
|
|
1583
|
+
const gotoArgs = step.args || [];
|
|
1584
|
+
const url = gotoArgs[0];
|
|
1585
|
+
const existingOpts = (typeof gotoArgs[1] === 'object' && gotoArgs[1] !== null)
|
|
1586
|
+
? Object.assign({}, gotoArgs[1]) : {};
|
|
1587
|
+
const requestedWait = existingOpts.waitUntil;
|
|
1588
|
+
const remaining = (currentWorkflow || []).slice(0, -1);
|
|
1589
|
+
const needsDataSoon = this.blockNeedsVisualRender(steps) || this.remainingWorkflowNeedsVisualRender(remaining);
|
|
1590
|
+
if (!requestedWait || requestedWait === 'networkidle' || requestedWait === 'load') {
|
|
1591
|
+
existingOpts.waitUntil = 'domcontentloaded';
|
|
1592
|
+
this.log(`goto: navigation speed-optimized to 'domcontentloaded' + surgical-ready midground`, logger_1.Level.LOG);
|
|
1593
|
+
}
|
|
1594
|
+
if (!existingOpts.timeout)
|
|
1595
|
+
existingOpts.timeout = 15000;
|
|
1596
|
+
yield executeAction(invokee, methodName, [url, existingOpts]);
|
|
1597
|
+
if (needsDataSoon) {
|
|
1598
|
+
yield this.waitForDynamicStability(page, (currentWorkflow || []).slice(0, -1));
|
|
1599
|
+
}
|
|
1600
|
+
}
|
|
1601
|
+
catch (error) {
|
|
1602
|
+
this.log(`goto failed: ${error.message}`, logger_1.Level.WARN);
|
|
1603
|
+
}
|
|
1604
|
+
}
|
|
1605
|
+
else if (methodName === 'waitForLoadState') {
|
|
1406
1606
|
try {
|
|
1407
1607
|
let args = step.args;
|
|
1408
|
-
if (Array.isArray(args)
|
|
1409
|
-
args = [args
|
|
1608
|
+
if (!Array.isArray(args)) {
|
|
1609
|
+
args = [args];
|
|
1410
1610
|
}
|
|
1411
|
-
|
|
1412
|
-
|
|
1611
|
+
const requestedState = args[0];
|
|
1612
|
+
const remaining = (currentWorkflow || []).slice(0, -1);
|
|
1613
|
+
const needsDataSoon = this.blockNeedsVisualRender(steps) || this.remainingWorkflowNeedsVisualRender(remaining);
|
|
1614
|
+
const optimalState = (requestedState === 'networkidle' || requestedState === 'load')
|
|
1615
|
+
? 'domcontentloaded'
|
|
1616
|
+
: requestedState;
|
|
1617
|
+
this.log(`waitForLoadState: workflow requested '${requestedState}', using 'domcontentloaded' + surgical-ready midground`, logger_1.Level.LOG);
|
|
1618
|
+
args = [optimalState, { timeout: 15000 }];
|
|
1619
|
+
yield executeAction(invokee, methodName, args);
|
|
1620
|
+
if (needsDataSoon) {
|
|
1621
|
+
yield this.waitForDynamicStability(page, (currentWorkflow || []).slice(0, -1));
|
|
1413
1622
|
}
|
|
1414
|
-
yield executeAction(invokee, methodName, step.args);
|
|
1415
1623
|
}
|
|
1416
1624
|
catch (error) {
|
|
1417
|
-
yield executeAction(invokee, methodName, 'domcontentloaded');
|
|
1625
|
+
yield executeAction(invokee, methodName, ['domcontentloaded', { timeout: 10000 }]);
|
|
1626
|
+
yield this.waitForDynamicStability(page, (currentWorkflow || []).slice(0, -1));
|
|
1418
1627
|
}
|
|
1419
1628
|
}
|
|
1420
1629
|
else if (methodName === 'click') {
|
|
@@ -1423,7 +1632,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1423
1632
|
}
|
|
1424
1633
|
catch (error) {
|
|
1425
1634
|
try {
|
|
1426
|
-
|
|
1635
|
+
const clickArgs = Array.isArray(step.args) ? step.args : [step.args];
|
|
1636
|
+
yield executeAction(invokee, methodName, [clickArgs[0], { force: true }]);
|
|
1427
1637
|
}
|
|
1428
1638
|
catch (error) {
|
|
1429
1639
|
this.log(`Click action failed: ${error.message}`, logger_1.Level.WARN);
|
|
@@ -1479,6 +1689,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1479
1689
|
debugLog("Workflow aborted, stopping scrapeCurrentPage");
|
|
1480
1690
|
return;
|
|
1481
1691
|
}
|
|
1692
|
+
yield this.waitForDynamicStability(page, [{
|
|
1693
|
+
action: 'scrapeList',
|
|
1694
|
+
args: [config]
|
|
1695
|
+
}]);
|
|
1482
1696
|
const evaluationPromise = page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
1483
1697
|
const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error('Page evaluation timeout')), 10000));
|
|
1484
1698
|
let results;
|
|
@@ -1509,10 +1723,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1509
1723
|
allResults = allResults.concat(itemsToAdd);
|
|
1510
1724
|
this.serializableDataByType[actionType][actionName] = [...allResults];
|
|
1511
1725
|
yield this.options.serializableCallback({
|
|
1512
|
-
scrapeList: this.serializableDataByType
|
|
1513
|
-
scrapeSchema: this.serializableDataByType
|
|
1514
|
-
crawl: this.serializableDataByType
|
|
1515
|
-
search: this.serializableDataByType
|
|
1726
|
+
scrapeList: this.serializableDataByType['scrapeList'],
|
|
1727
|
+
scrapeSchema: this.serializableDataByType['scrapeSchema'],
|
|
1728
|
+
crawl: this.serializableDataByType['crawl'] || {},
|
|
1729
|
+
search: this.serializableDataByType['search'] || {}
|
|
1516
1730
|
});
|
|
1517
1731
|
});
|
|
1518
1732
|
const checkLimit = () => {
|
|
@@ -1839,7 +2053,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1839
2053
|
}
|
|
1840
2054
|
}
|
|
1841
2055
|
}
|
|
1842
|
-
yield page.waitForLoadState(
|
|
2056
|
+
yield page.waitForLoadState(this.getNavigationWaitStrategy(), { timeout: 15000 }).catch(() => { });
|
|
1843
2057
|
if (!paginationSuccess) {
|
|
1844
2058
|
const newUrl = page.url();
|
|
1845
2059
|
const afterSignature = yield captureContentSignature();
|
|
@@ -2022,7 +2236,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2022
2236
|
return workflow;
|
|
2023
2237
|
}
|
|
2024
2238
|
runLoop(p, workflow) {
|
|
2025
|
-
var _a, _b, _c;
|
|
2239
|
+
var _a, _b, _c, _d, _e;
|
|
2026
2240
|
return __awaiter(this, void 0, void 0, function* () {
|
|
2027
2241
|
if (this.isAborted) {
|
|
2028
2242
|
this.log('Workflow aborted in runLoop', logger_1.Level.WARN);
|
|
@@ -2133,7 +2347,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2133
2347
|
repeatCount = action === lastAction ? repeatCount + 1 : 0;
|
|
2134
2348
|
console.log("REPEAT COUNT", repeatCount);
|
|
2135
2349
|
if (this.options.maxRepeats && repeatCount > this.options.maxRepeats) {
|
|
2136
|
-
|
|
2350
|
+
const failedAction = ((_c = (_b = action === null || action === void 0 ? void 0 : action.what) === null || _b === void 0 ? void 0 : _b.find((w) => (w === null || w === void 0 ? void 0 : w.action) !== 'flag')) === null || _c === void 0 ? void 0 : _c.action) || 'unknown';
|
|
2351
|
+
const maxRepeats = this.options.maxRepeats;
|
|
2352
|
+
this.log(`Action ${String(failedAction)} exceeded max retries (${maxRepeats})`, logger_1.Level.ERROR);
|
|
2353
|
+
cleanup();
|
|
2354
|
+
throw new Error(`Action ${String(failedAction)} exceeded max retries (${maxRepeats})`);
|
|
2137
2355
|
}
|
|
2138
2356
|
lastAction = action;
|
|
2139
2357
|
if (this.isAborted) {
|
|
@@ -2142,13 +2360,13 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2142
2360
|
}
|
|
2143
2361
|
try {
|
|
2144
2362
|
console.log("Carrying out:", action.what);
|
|
2145
|
-
yield this.carryOutSteps(p, action.what);
|
|
2146
|
-
usedActions.push((
|
|
2363
|
+
yield this.carryOutSteps(p, action.what, workflowCopy);
|
|
2364
|
+
usedActions.push((_d = action.id) !== null && _d !== void 0 ? _d : 'undefined');
|
|
2147
2365
|
workflowCopy.splice(actionId, 1);
|
|
2148
2366
|
console.log(`Action with ID ${action.id} removed from the workflow copy.`);
|
|
2149
2367
|
this.executedActions++;
|
|
2150
2368
|
const percentage = Math.round((this.executedActions / this.totalActions) * 100);
|
|
2151
|
-
if ((
|
|
2369
|
+
if ((_e = this.options.debugChannel) === null || _e === void 0 ? void 0 : _e.progressUpdate) {
|
|
2152
2370
|
this.options.debugChannel.progressUpdate(this.executedActions, this.totalActions, percentage);
|
|
2153
2371
|
}
|
|
2154
2372
|
// const newSelectors = this.getPreviousSelectors(workflow, actionId);
|
|
@@ -2190,13 +2408,13 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2190
2408
|
timeoutPromise
|
|
2191
2409
|
]);
|
|
2192
2410
|
if (!isScriptLoaded) {
|
|
2193
|
-
yield page.addInitScript({ path:
|
|
2411
|
+
yield page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
|
|
2194
2412
|
}
|
|
2195
2413
|
}
|
|
2196
2414
|
catch (error) {
|
|
2197
2415
|
this.log(`Script check failed, adding script anyway: ${error.message}`, logger_1.Level.WARN);
|
|
2198
2416
|
try {
|
|
2199
|
-
yield page.addInitScript({ path:
|
|
2417
|
+
yield page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
|
|
2200
2418
|
}
|
|
2201
2419
|
catch (scriptError) {
|
|
2202
2420
|
this.log(`Failed to add script: ${scriptError.message}`, logger_1.Level.ERROR);
|
|
package/build/utils/concurrency.d.ts
CHANGED
|
@@ -15,10 +15,14 @@ export default class Concurrency {
|
|
|
15
15
|
*/
|
|
16
16
|
private jobQueue;
|
|
17
17
|
/**
|
|
18
|
-
*
|
|
18
|
+
* Resolve/reject callbacks of the waitForCompletion() promises.
|
|
19
19
|
*/
|
|
20
20
|
private waiting;
|
|
21
21
|
/**
|
|
22
|
+
* First worker error captured during current execution wave.
|
|
23
|
+
*/
|
|
24
|
+
private firstError;
|
|
25
|
+
/**
|
|
22
26
|
* Constructs a new instance of concurrency manager.
|
|
23
27
|
* @param {number} maxConcurrency Maximum number of workers running in parallel.
|
|
24
28
|
*/
|
|
package/build/utils/concurrency.js
CHANGED
|
@@ -22,9 +22,13 @@ class Concurrency {
|
|
|
22
22
|
*/
|
|
23
23
|
this.jobQueue = [];
|
|
24
24
|
/**
|
|
25
|
-
*
|
|
25
|
+
* Resolve/reject callbacks of the waitForCompletion() promises.
|
|
26
26
|
*/
|
|
27
27
|
this.waiting = [];
|
|
28
|
+
/**
|
|
29
|
+
* First worker error captured during current execution wave.
|
|
30
|
+
*/
|
|
31
|
+
this.firstError = null;
|
|
28
32
|
this.maxConcurrency = maxConcurrency;
|
|
29
33
|
}
|
|
30
34
|
/**
|
|
@@ -38,7 +42,11 @@ class Concurrency {
|
|
|
38
42
|
// console.debug("Job finished, running the next waiting job...");
|
|
39
43
|
this.runNextJob();
|
|
40
44
|
}).catch((error) => {
|
|
41
|
-
|
|
45
|
+
const normalizedError = error instanceof Error ? error : new Error(String(error));
|
|
46
|
+
console.error(`Job failed with error: ${normalizedError.message}`);
|
|
47
|
+
if (!this.firstError) {
|
|
48
|
+
this.firstError = normalizedError;
|
|
49
|
+
}
|
|
42
50
|
// Continue processing other jobs even if one fails
|
|
43
51
|
this.runNextJob();
|
|
44
52
|
});
|
|
@@ -48,7 +56,18 @@ class Concurrency {
|
|
|
48
56
|
this.activeWorkers -= 1;
|
|
49
57
|
if (this.activeWorkers === 0) {
|
|
50
58
|
// console.debug("This concurrency manager is idle!");
|
|
51
|
-
this.waiting
|
|
59
|
+
const pending = [...this.waiting];
|
|
60
|
+
this.waiting = [];
|
|
61
|
+
const pendingError = this.firstError;
|
|
62
|
+
this.firstError = null;
|
|
63
|
+
pending.forEach(({ resolve, reject }) => {
|
|
64
|
+
if (pendingError) {
|
|
65
|
+
reject(pendingError);
|
|
66
|
+
}
|
|
67
|
+
else {
|
|
68
|
+
resolve();
|
|
69
|
+
}
|
|
70
|
+
});
|
|
52
71
|
}
|
|
53
72
|
}
|
|
54
73
|
}
|
|
@@ -77,8 +96,8 @@ class Concurrency {
|
|
|
77
96
|
* @returns Promise, resolved after there is no running/waiting worker.
|
|
78
97
|
*/
|
|
79
98
|
waitForCompletion() {
|
|
80
|
-
return new Promise((
|
|
81
|
-
this.waiting.push(
|
|
99
|
+
return new Promise((resolve, reject) => {
|
|
100
|
+
this.waiting.push({ resolve, reject });
|
|
82
101
|
});
|
|
83
102
|
}
|
|
84
103
|
}
|