maxun-core 0.0.32 → 0.0.34
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.d.ts +39 -0
- package/build/interpret.js +263 -51
- package/build/utils/concurrency.d.ts +5 -1
- package/build/utils/concurrency.js +24 -5
- package/package.json +1 -1
package/build/interpret.d.ts
CHANGED
|
@@ -37,6 +37,7 @@ interface InterpreterOptions {
|
|
|
37
37
|
serializableCallback: (output: any) => (void | Promise<void>);
|
|
38
38
|
binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
|
|
39
39
|
debug: boolean;
|
|
40
|
+
type?: 'extract' | 'scrape' | 'crawl' | 'search' | 'doc-extract' | 'doc-parse';
|
|
40
41
|
debugChannel: Partial<{
|
|
41
42
|
activeId: (id: number) => void;
|
|
42
43
|
debugMessage: (msg: string) => void;
|
|
@@ -55,6 +56,7 @@ export default class Interpreter extends EventEmitter {
|
|
|
55
56
|
private concurrency;
|
|
56
57
|
private stopper;
|
|
57
58
|
private isAborted;
|
|
59
|
+
private visualRenderRequired;
|
|
58
60
|
private log;
|
|
59
61
|
private cumulativeResults;
|
|
60
62
|
private namedResults;
|
|
@@ -90,6 +92,43 @@ export default class Interpreter extends EventEmitter {
|
|
|
90
92
|
* @returns True if `where` is applicable in the given context, false otherwise
|
|
91
93
|
*/
|
|
92
94
|
private applicable;
|
|
95
|
+
/**
|
|
96
|
+
* Returns the optimal Playwright `waitUntil` navigation strategy based on
|
|
97
|
+
* whether the current operation requires visual rendering.
|
|
98
|
+
*
|
|
99
|
+
* - `'networkidle'` — used when screenshots are requested; waits for all
|
|
100
|
+
* sub-resources so the page renders correctly.
|
|
101
|
+
* - `'domcontentloaded'` — used for all DOM-only operations (scraping, crawling,
|
|
102
|
+
* extraction, search); skips stylesheet/image loading for
|
|
103
|
+
* maximum speed.
|
|
104
|
+
*
|
|
105
|
+
* @param blockOverride Pass `true` when the caller will take a screenshot
|
|
106
|
+
* or requires styled layout. Defaults to `false`.
|
|
107
|
+
*/
|
|
108
|
+
private getNavigationWaitStrategy;
|
|
109
|
+
/**
|
|
110
|
+
* Returns true if any step in the given `what` block requires a fully
|
|
111
|
+
* rendered page.
|
|
112
|
+
*/
|
|
113
|
+
private blockNeedsVisualRender;
|
|
114
|
+
/**
|
|
115
|
+
* Returns true if any of the remaining blocks in the workflow require a visual render
|
|
116
|
+
* before the next page navigation.
|
|
117
|
+
*/
|
|
118
|
+
private remainingWorkflowNeedsVisualRender;
|
|
119
|
+
/**
|
|
120
|
+
* Helper to wait for a "Network Quiet Window" (no meaningful activity for X ms).
|
|
121
|
+
*/
|
|
122
|
+
private waitForNetworkQuiet;
|
|
123
|
+
/**
|
|
124
|
+
* Scans the remaining workflow to find the next meaningful extraction selector.
|
|
125
|
+
*/
|
|
126
|
+
private getUpcomingExtractionSelector;
|
|
127
|
+
/**
|
|
128
|
+
* Function to wait for images to load.
|
|
129
|
+
*/
|
|
130
|
+
private waitForImagesLoaded;
|
|
131
|
+
private waitForDynamicStability;
|
|
93
132
|
/**
|
|
94
133
|
* Given a Playwright's page object and a "declarative" list of actions, this function
|
|
95
134
|
* calls all mentioned functions on the Page object.\
|
package/build/interpret.js
CHANGED
|
@@ -35,7 +35,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
35
35
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
36
36
|
};
|
|
37
37
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
38
|
-
const
|
|
38
|
+
const path = __importStar(require("path"));
|
|
39
39
|
const events_1 = require("events");
|
|
40
40
|
const logic_1 = require("./types/logic");
|
|
41
41
|
const utils_1 = require("./utils/utils");
|
|
@@ -51,6 +51,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
51
51
|
super();
|
|
52
52
|
this.stopper = null;
|
|
53
53
|
this.isAborted = false;
|
|
54
|
+
this.visualRenderRequired = false;
|
|
54
55
|
// private blocker: PlaywrightBlocker | null = null;
|
|
55
56
|
this.cumulativeResults = [];
|
|
56
57
|
this.namedResults = {};
|
|
@@ -69,6 +70,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
69
70
|
this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
|
|
70
71
|
(0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN);
|
|
71
72
|
}, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {} }, options);
|
|
73
|
+
this.visualRenderRequired = ((options === null || options === void 0 ? void 0 : options.type) === 'extract');
|
|
72
74
|
this.concurrency = new concurrency_1.default(this.options.maxConcurrency);
|
|
73
75
|
this.log = (...args) => (0, logger_1.default)(...args);
|
|
74
76
|
const error = preprocessor_1.default.validateWorkflow(workflow);
|
|
@@ -290,6 +292,167 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
290
292
|
}
|
|
291
293
|
});
|
|
292
294
|
}
|
|
295
|
+
/**
|
|
296
|
+
* Returns the optimal Playwright `waitUntil` navigation strategy based on
|
|
297
|
+
* whether the current operation requires visual rendering.
|
|
298
|
+
*
|
|
299
|
+
* - `'networkidle'` — used when screenshots are requested; waits for all
|
|
300
|
+
* sub-resources so the page renders correctly.
|
|
301
|
+
* - `'domcontentloaded'` — used for all DOM-only operations (scraping, crawling,
|
|
302
|
+
* extraction, search); skips stylesheet/image loading for
|
|
303
|
+
* maximum speed.
|
|
304
|
+
*
|
|
305
|
+
* @param blockOverride Pass `true` when the caller will take a screenshot
|
|
306
|
+
* or requires styled layout. Defaults to `false`.
|
|
307
|
+
*/
|
|
308
|
+
getNavigationWaitStrategy(blockOverride) {
|
|
309
|
+
const finalRequirement = blockOverride !== null && blockOverride !== void 0 ? blockOverride : this.visualRenderRequired;
|
|
310
|
+
return finalRequirement ? 'networkidle' : 'domcontentloaded';
|
|
311
|
+
}
|
|
312
|
+
/**
|
|
313
|
+
* Returns true if any step in the given `what` block requires a fully
|
|
314
|
+
* rendered page.
|
|
315
|
+
*/
|
|
316
|
+
blockNeedsVisualRender(steps) {
|
|
317
|
+
return steps.some((s) => {
|
|
318
|
+
var _a, _b;
|
|
319
|
+
if (s.action === 'screenshot')
|
|
320
|
+
return true;
|
|
321
|
+
if (s.action === 'scrapeList' || s.action === 'scrapeSchema')
|
|
322
|
+
return true;
|
|
323
|
+
const firstArg = Array.isArray(s.args) ? s.args[0] : s.args;
|
|
324
|
+
if (!firstArg || typeof firstArg !== 'object')
|
|
325
|
+
return false;
|
|
326
|
+
if (s.action === 'scrape') {
|
|
327
|
+
const formats = (_a = firstArg.formats) !== null && _a !== void 0 ? _a : [];
|
|
328
|
+
const heavyFormats = ['markdown', 'html', 'text', 'screenshot-visible', 'screenshot-full'];
|
|
329
|
+
return formats.some((f) => heavyFormats.includes(f));
|
|
330
|
+
}
|
|
331
|
+
if (s.action === 'crawl' || s.action === 'search') {
|
|
332
|
+
const outputFormats = (_b = firstArg.outputFormats) !== null && _b !== void 0 ? _b : [];
|
|
333
|
+
const heavyFormats = ['markdown', 'html', 'text', 'screenshot-visible', 'screenshot-full'];
|
|
334
|
+
return outputFormats.some((f) => heavyFormats.includes(f));
|
|
335
|
+
}
|
|
336
|
+
return false;
|
|
337
|
+
});
|
|
338
|
+
}
|
|
339
|
+
/**
|
|
340
|
+
* Returns true if any of the remaining blocks in the workflow require a visual render
|
|
341
|
+
* before the next page navigation.
|
|
342
|
+
*/
|
|
343
|
+
remainingWorkflowNeedsVisualRender(remainingWorkflow) {
|
|
344
|
+
if (!remainingWorkflow || remainingWorkflow.length === 0)
|
|
345
|
+
return false;
|
|
346
|
+
for (let i = remainingWorkflow.length - 1; i >= 0; i--) {
|
|
347
|
+
const pair = remainingWorkflow[i];
|
|
348
|
+
if (this.blockNeedsVisualRender(pair.what))
|
|
349
|
+
return true;
|
|
350
|
+
if (pair.what.some(s => s.action === 'goto'))
|
|
351
|
+
return false;
|
|
352
|
+
}
|
|
353
|
+
return false;
|
|
354
|
+
}
|
|
355
|
+
/**
|
|
356
|
+
* Helper to wait for a "Network Quiet Window" (no meaningful activity for X ms).
|
|
357
|
+
*/
|
|
358
|
+
waitForNetworkQuiet(page, timeout = 4000, quietWindow = 600) {
|
|
359
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
360
|
+
let lastRequestTime = Date.now();
|
|
361
|
+
const onRequest = () => { lastRequestTime = Date.now(); };
|
|
362
|
+
page.on('request', onRequest);
|
|
363
|
+
page.on('requestfinished', onRequest);
|
|
364
|
+
page.on('requestfailed', onRequest);
|
|
365
|
+
try {
|
|
366
|
+
const checkInterval = 100;
|
|
367
|
+
const start = Date.now();
|
|
368
|
+
while (Date.now() - start < timeout) {
|
|
369
|
+
if (Date.now() - lastRequestTime > quietWindow)
|
|
370
|
+
return;
|
|
371
|
+
yield new Promise(r => setTimeout(r, checkInterval));
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
finally {
|
|
375
|
+
page.off('request', onRequest);
|
|
376
|
+
page.off('requestfinished', onRequest);
|
|
377
|
+
page.off('requestfailed', onRequest);
|
|
378
|
+
}
|
|
379
|
+
});
|
|
380
|
+
}
|
|
381
|
+
/**
|
|
382
|
+
* Scans the remaining workflow to find the next meaningful extraction selector.
|
|
383
|
+
*/
|
|
384
|
+
getUpcomingExtractionSelector(remainingWorkflow) {
|
|
385
|
+
if (!remainingWorkflow || remainingWorkflow.length === 0)
|
|
386
|
+
return null;
|
|
387
|
+
for (let i = remainingWorkflow.length - 1; i >= 0; i--) {
|
|
388
|
+
const pair = remainingWorkflow[i];
|
|
389
|
+
for (const s of pair.what) {
|
|
390
|
+
if (s.action === 'goto')
|
|
391
|
+
return null;
|
|
392
|
+
if (s.action === 'scrapeList' || s.action === 'scrapeSchema') {
|
|
393
|
+
const firstArg = Array.isArray(s.args) ? s.args[0] : s.args;
|
|
394
|
+
if (firstArg === null || firstArg === void 0 ? void 0 : firstArg.listSelector)
|
|
395
|
+
return firstArg.listSelector;
|
|
396
|
+
if (firstArg === null || firstArg === void 0 ? void 0 : firstArg.fields) {
|
|
397
|
+
const firstField = Object.values(firstArg.fields)[0];
|
|
398
|
+
if (firstField === null || firstField === void 0 ? void 0 : firstField.selector)
|
|
399
|
+
return firstField.selector;
|
|
400
|
+
}
|
|
401
|
+
if (firstArg === null || firstArg === void 0 ? void 0 : firstArg.selector)
|
|
402
|
+
return firstArg.selector;
|
|
403
|
+
}
|
|
404
|
+
}
|
|
405
|
+
}
|
|
406
|
+
return null;
|
|
407
|
+
}
|
|
408
|
+
/**
|
|
409
|
+
* Function to wait for images to load.
|
|
410
|
+
*/
|
|
411
|
+
waitForImagesLoaded(page) {
|
|
412
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
413
|
+
yield page.waitForFunction(() => Array.from(document.images).every(img => img.complete), { timeout: 5000 }).catch(() => { });
|
|
414
|
+
});
|
|
415
|
+
}
|
|
416
|
+
waitForDynamicStability(page, upcomingWorkflow = []) {
|
|
417
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
418
|
+
try {
|
|
419
|
+
const targetSelector = this.getUpcomingExtractionSelector(upcomingWorkflow);
|
|
420
|
+
const signals = [
|
|
421
|
+
this.waitForNetworkQuiet(page, 10000, 1000),
|
|
422
|
+
page.evaluate(() => __awaiter(this, void 0, void 0, function* () {
|
|
423
|
+
let lastLen = 0;
|
|
424
|
+
let stableIterations = 0;
|
|
425
|
+
for (let i = 0; i < 60; i++) {
|
|
426
|
+
const currentLen = document.body.innerText.length;
|
|
427
|
+
if (currentLen > 200 && currentLen === lastLen) {
|
|
428
|
+
stableIterations++;
|
|
429
|
+
}
|
|
430
|
+
else {
|
|
431
|
+
stableIterations = 0;
|
|
432
|
+
}
|
|
433
|
+
if (stableIterations >= 8)
|
|
434
|
+
return true;
|
|
435
|
+
lastLen = currentLen;
|
|
436
|
+
yield new Promise(r => setTimeout(r, 100));
|
|
437
|
+
}
|
|
438
|
+
return false;
|
|
439
|
+
})).catch(() => { }),
|
|
440
|
+
new Promise(resolve => setTimeout(resolve, 10000))
|
|
441
|
+
];
|
|
442
|
+
if (targetSelector) {
|
|
443
|
+
const found = yield page.waitForSelector(targetSelector, { timeout: 8000 }).catch(() => null);
|
|
444
|
+
if (found) {
|
|
445
|
+
yield new Promise(resolve => setTimeout(resolve, 1000));
|
|
446
|
+
return;
|
|
447
|
+
}
|
|
448
|
+
}
|
|
449
|
+
yield Promise.race(signals);
|
|
450
|
+
yield new Promise(resolve => setTimeout(resolve, 1500));
|
|
451
|
+
}
|
|
452
|
+
catch (e) {
|
|
453
|
+
}
|
|
454
|
+
});
|
|
455
|
+
}
|
|
293
456
|
/**
|
|
294
457
|
* Given a Playwright's page object and a "declarative" list of actions, this function
|
|
295
458
|
* calls all mentioned functions on the Page object.\
|
|
@@ -299,7 +462,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
299
462
|
* @param page Playwright Page object
|
|
300
463
|
* @param steps Array of actions.
|
|
301
464
|
*/
|
|
302
|
-
carryOutSteps(page, steps) {
|
|
465
|
+
carryOutSteps(page, steps, currentWorkflow) {
|
|
303
466
|
var _a;
|
|
304
467
|
return __awaiter(this, void 0, void 0, function* () {
|
|
305
468
|
if (this.isAborted) {
|
|
@@ -320,6 +483,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
320
483
|
if ((_b = this.options.debugChannel) === null || _b === void 0 ? void 0 : _b.setActionType) {
|
|
321
484
|
this.options.debugChannel.setActionType("screenshot");
|
|
322
485
|
}
|
|
486
|
+
yield this.waitForImagesLoaded(page);
|
|
323
487
|
const screenshotBuffer = yield page.screenshot(Object.assign(Object.assign({}, params), { path: undefined }));
|
|
324
488
|
const explicitName = (typeof nameOverride === 'string' && nameOverride.trim().length > 0) ? nameOverride.trim() : null;
|
|
325
489
|
let screenshotName;
|
|
@@ -352,8 +516,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
352
516
|
let newPage = null;
|
|
353
517
|
try {
|
|
354
518
|
newPage = yield context.newPage();
|
|
355
|
-
yield newPage.goto(link);
|
|
356
|
-
yield newPage.waitForLoadState('networkidle');
|
|
519
|
+
yield newPage.goto(link, { waitUntil: this.getNavigationWaitStrategy() });
|
|
357
520
|
yield this.runLoop(newPage, this.initializedWorkflow);
|
|
358
521
|
}
|
|
359
522
|
catch (e) {
|
|
@@ -381,6 +544,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
381
544
|
if ((_d = this.options.debugChannel) === null || _d === void 0 ? void 0 : _d.setActionType) {
|
|
382
545
|
this.options.debugChannel.setActionType('scrape');
|
|
383
546
|
}
|
|
547
|
+
yield this.waitForDynamicStability(page, [{
|
|
548
|
+
action: 'scrape',
|
|
549
|
+
args: [selector]
|
|
550
|
+
}]);
|
|
384
551
|
yield this.ensureScriptsLoaded(page);
|
|
385
552
|
const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
|
|
386
553
|
yield this.options.serializableCallback(scrapeResults);
|
|
@@ -394,6 +561,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
394
561
|
if ((_e = this.options.debugChannel) === null || _e === void 0 ? void 0 : _e.setActionType) {
|
|
395
562
|
this.options.debugChannel.setActionType('scrapeSchema');
|
|
396
563
|
}
|
|
564
|
+
yield this.waitForDynamicStability(page, [{
|
|
565
|
+
action: 'scrapeSchema',
|
|
566
|
+
args: [schema]
|
|
567
|
+
}]);
|
|
397
568
|
if (this.options.mode && this.options.mode === 'editor') {
|
|
398
569
|
yield this.options.serializableCallback({});
|
|
399
570
|
return;
|
|
@@ -452,10 +623,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
452
623
|
}
|
|
453
624
|
this.serializableDataByType[actionType][name] = [...this.cumulativeResults];
|
|
454
625
|
yield this.options.serializableCallback({
|
|
455
|
-
scrapeList: this.serializableDataByType
|
|
456
|
-
scrapeSchema: this.serializableDataByType
|
|
457
|
-
crawl: this.serializableDataByType
|
|
458
|
-
search: this.serializableDataByType
|
|
626
|
+
scrapeList: this.serializableDataByType['scrapeList'],
|
|
627
|
+
scrapeSchema: this.serializableDataByType['scrapeSchema'],
|
|
628
|
+
crawl: this.serializableDataByType['crawl'] || {},
|
|
629
|
+
search: this.serializableDataByType['search'] || {}
|
|
459
630
|
});
|
|
460
631
|
}),
|
|
461
632
|
scrapeList: (config, actionName = "") => __awaiter(this, void 0, void 0, function* () {
|
|
@@ -511,8 +682,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
511
682
|
}
|
|
512
683
|
this.serializableDataByType[actionType][name].push(...scrapeResults);
|
|
513
684
|
yield this.options.serializableCallback({
|
|
514
|
-
scrapeList: this.serializableDataByType
|
|
515
|
-
scrapeSchema: this.serializableDataByType
|
|
685
|
+
scrapeList: this.serializableDataByType['scrapeList'],
|
|
686
|
+
scrapeSchema: this.serializableDataByType['scrapeSchema']
|
|
516
687
|
});
|
|
517
688
|
}
|
|
518
689
|
}
|
|
@@ -531,8 +702,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
531
702
|
this.serializableDataByType[actionType] = {};
|
|
532
703
|
this.serializableDataByType[actionType][name] = [];
|
|
533
704
|
yield this.options.serializableCallback({
|
|
534
|
-
scrapeList: this.serializableDataByType
|
|
535
|
-
scrapeSchema: this.serializableDataByType
|
|
705
|
+
scrapeList: this.serializableDataByType['scrapeList'],
|
|
706
|
+
scrapeSchema: this.serializableDataByType['scrapeSchema']
|
|
536
707
|
});
|
|
537
708
|
}
|
|
538
709
|
}),
|
|
@@ -771,8 +942,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
771
942
|
};
|
|
772
943
|
const extractLinksFromPage = () => __awaiter(this, void 0, void 0, function* () {
|
|
773
944
|
try {
|
|
774
|
-
yield page.waitForLoadState(
|
|
775
|
-
yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => { });
|
|
945
|
+
yield page.waitForLoadState(this.getNavigationWaitStrategy(), { timeout: 15000 }).catch(() => { });
|
|
776
946
|
yield new Promise(resolve => setTimeout(resolve, 1000));
|
|
777
947
|
const pageLinks = yield page.evaluate(() => {
|
|
778
948
|
const links = [];
|
|
@@ -943,12 +1113,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
943
1113
|
yield new Promise(resolve => setTimeout(resolve, robotRules.crawlDelay));
|
|
944
1114
|
}
|
|
945
1115
|
yield page.goto(url, {
|
|
946
|
-
waitUntil:
|
|
1116
|
+
waitUntil: this.getNavigationWaitStrategy(),
|
|
947
1117
|
timeout: 30000
|
|
948
1118
|
}).catch((err) => {
|
|
949
1119
|
throw new Error(`Navigation failed: ${err.message}`);
|
|
950
1120
|
});
|
|
951
|
-
yield
|
|
1121
|
+
yield this.waitForDynamicStability(page, currentWorkflow || []);
|
|
952
1122
|
const pageResult = yield scrapePageContent(url);
|
|
953
1123
|
pageResult.metadata.depth = depth;
|
|
954
1124
|
crawlResults.push(pageResult);
|
|
@@ -995,10 +1165,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
995
1165
|
}
|
|
996
1166
|
this.serializableDataByType[actionType][actionName] = crawlResults;
|
|
997
1167
|
yield this.options.serializableCallback({
|
|
998
|
-
scrapeList: this.serializableDataByType
|
|
999
|
-
scrapeSchema: this.serializableDataByType
|
|
1000
|
-
crawl: this.serializableDataByType
|
|
1001
|
-
search: this.serializableDataByType
|
|
1168
|
+
scrapeList: this.serializableDataByType['scrapeList'] || {},
|
|
1169
|
+
scrapeSchema: this.serializableDataByType['scrapeSchema'] || {},
|
|
1170
|
+
crawl: this.serializableDataByType['crawl'] || {},
|
|
1171
|
+
search: this.serializableDataByType['search'] || {}
|
|
1002
1172
|
});
|
|
1003
1173
|
}
|
|
1004
1174
|
catch (error) {
|
|
@@ -1031,7 +1201,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1031
1201
|
const initialDelay = 500 + Math.random() * 1000;
|
|
1032
1202
|
yield new Promise(resolve => setTimeout(resolve, initialDelay));
|
|
1033
1203
|
yield page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
|
|
1034
|
-
yield page.waitForLoadState(
|
|
1204
|
+
yield page.waitForLoadState(this.getNavigationWaitStrategy(), { timeout: 15000 }).catch(() => {
|
|
1035
1205
|
this.log('Load state timeout, continuing anyway', logger_1.Level.WARN);
|
|
1036
1206
|
});
|
|
1037
1207
|
const pageLoadDelay = 2000 + Math.random() * 1500;
|
|
@@ -1220,10 +1390,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1220
1390
|
};
|
|
1221
1391
|
this.serializableDataByType[actionType][actionName] = searchData;
|
|
1222
1392
|
yield this.options.serializableCallback({
|
|
1223
|
-
scrapeList: this.serializableDataByType
|
|
1224
|
-
scrapeSchema: this.serializableDataByType
|
|
1225
|
-
crawl: this.serializableDataByType
|
|
1226
|
-
search: this.serializableDataByType
|
|
1393
|
+
scrapeList: this.serializableDataByType['scrapeList'] || {},
|
|
1394
|
+
scrapeSchema: this.serializableDataByType['scrapeSchema'] || {},
|
|
1395
|
+
crawl: this.serializableDataByType['crawl'] || {},
|
|
1396
|
+
search: this.serializableDataByType['search'] || {}
|
|
1227
1397
|
});
|
|
1228
1398
|
this.log(`Search completed in discover mode with ${searchResults.length} results`, logger_1.Level.LOG);
|
|
1229
1399
|
return;
|
|
@@ -1235,12 +1405,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1235
1405
|
try {
|
|
1236
1406
|
this.log(`[${i + 1}/${searchResults.length}] Scraping: ${result.url}`, logger_1.Level.LOG);
|
|
1237
1407
|
yield page.goto(result.url, {
|
|
1238
|
-
waitUntil:
|
|
1408
|
+
waitUntil: this.getNavigationWaitStrategy(),
|
|
1239
1409
|
timeout: 30000
|
|
1240
1410
|
}).catch(() => {
|
|
1241
1411
|
this.log(`Failed to navigate to ${result.url}, skipping...`, logger_1.Level.WARN);
|
|
1242
1412
|
});
|
|
1243
|
-
yield
|
|
1413
|
+
yield this.waitForDynamicStability(page, currentWorkflow || []);
|
|
1244
1414
|
const pageData = yield page.evaluate(() => {
|
|
1245
1415
|
var _a, _b;
|
|
1246
1416
|
const getMeta = (name) => {
|
|
@@ -1327,10 +1497,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1327
1497
|
};
|
|
1328
1498
|
this.serializableDataByType[actionType][actionName] = searchData;
|
|
1329
1499
|
yield this.options.serializableCallback({
|
|
1330
|
-
scrapeList: this.serializableDataByType
|
|
1331
|
-
scrapeSchema: this.serializableDataByType
|
|
1332
|
-
crawl: this.serializableDataByType
|
|
1333
|
-
search: this.serializableDataByType
|
|
1500
|
+
scrapeList: this.serializableDataByType['scrapeList'] || {},
|
|
1501
|
+
scrapeSchema: this.serializableDataByType['scrapeSchema'] || {},
|
|
1502
|
+
crawl: this.serializableDataByType['crawl'] || {},
|
|
1503
|
+
search: this.serializableDataByType['search'] || {}
|
|
1334
1504
|
});
|
|
1335
1505
|
}
|
|
1336
1506
|
catch (error) {
|
|
@@ -1408,19 +1578,52 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1408
1578
|
for (const level of levels.splice(0, levels.length - 1)) {
|
|
1409
1579
|
invokee = invokee[level];
|
|
1410
1580
|
}
|
|
1411
|
-
if (methodName === '
|
|
1581
|
+
if (methodName === 'goto') {
|
|
1582
|
+
try {
|
|
1583
|
+
const gotoArgs = step.args || [];
|
|
1584
|
+
const url = gotoArgs[0];
|
|
1585
|
+
const existingOpts = (typeof gotoArgs[1] === 'object' && gotoArgs[1] !== null)
|
|
1586
|
+
? Object.assign({}, gotoArgs[1]) : {};
|
|
1587
|
+
const requestedWait = existingOpts.waitUntil;
|
|
1588
|
+
const remaining = (currentWorkflow || []).slice(0, -1);
|
|
1589
|
+
const needsDataSoon = this.blockNeedsVisualRender(steps) || this.remainingWorkflowNeedsVisualRender(remaining);
|
|
1590
|
+
if (!requestedWait || requestedWait === 'networkidle' || requestedWait === 'load') {
|
|
1591
|
+
existingOpts.waitUntil = 'domcontentloaded';
|
|
1592
|
+
this.log(`goto: navigation speed-optimized to 'domcontentloaded' + surgical-ready midground`, logger_1.Level.LOG);
|
|
1593
|
+
}
|
|
1594
|
+
if (!existingOpts.timeout)
|
|
1595
|
+
existingOpts.timeout = 15000;
|
|
1596
|
+
yield executeAction(invokee, methodName, [url, existingOpts]);
|
|
1597
|
+
if (needsDataSoon) {
|
|
1598
|
+
yield this.waitForDynamicStability(page, (currentWorkflow || []).slice(0, -1));
|
|
1599
|
+
}
|
|
1600
|
+
}
|
|
1601
|
+
catch (error) {
|
|
1602
|
+
this.log(`goto failed: ${error.message}`, logger_1.Level.WARN);
|
|
1603
|
+
}
|
|
1604
|
+
}
|
|
1605
|
+
else if (methodName === 'waitForLoadState') {
|
|
1412
1606
|
try {
|
|
1413
1607
|
let args = step.args;
|
|
1414
|
-
if (Array.isArray(args)
|
|
1415
|
-
args = [args
|
|
1608
|
+
if (!Array.isArray(args)) {
|
|
1609
|
+
args = [args];
|
|
1416
1610
|
}
|
|
1417
|
-
|
|
1418
|
-
|
|
1611
|
+
const requestedState = args[0];
|
|
1612
|
+
const remaining = (currentWorkflow || []).slice(0, -1);
|
|
1613
|
+
const needsDataSoon = this.blockNeedsVisualRender(steps) || this.remainingWorkflowNeedsVisualRender(remaining);
|
|
1614
|
+
const optimalState = (requestedState === 'networkidle' || requestedState === 'load')
|
|
1615
|
+
? 'domcontentloaded'
|
|
1616
|
+
: requestedState;
|
|
1617
|
+
this.log(`waitForLoadState: workflow requested '${requestedState}', using 'domcontentloaded' + surgical-ready midground`, logger_1.Level.LOG);
|
|
1618
|
+
args = [optimalState, { timeout: 15000 }];
|
|
1619
|
+
yield executeAction(invokee, methodName, args);
|
|
1620
|
+
if (needsDataSoon) {
|
|
1621
|
+
yield this.waitForDynamicStability(page, (currentWorkflow || []).slice(0, -1));
|
|
1419
1622
|
}
|
|
1420
|
-
yield executeAction(invokee, methodName, step.args);
|
|
1421
1623
|
}
|
|
1422
1624
|
catch (error) {
|
|
1423
|
-
yield executeAction(invokee, methodName, 'domcontentloaded');
|
|
1625
|
+
yield executeAction(invokee, methodName, ['domcontentloaded', { timeout: 10000 }]);
|
|
1626
|
+
yield this.waitForDynamicStability(page, (currentWorkflow || []).slice(0, -1));
|
|
1424
1627
|
}
|
|
1425
1628
|
}
|
|
1426
1629
|
else if (methodName === 'click') {
|
|
@@ -1429,7 +1632,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1429
1632
|
}
|
|
1430
1633
|
catch (error) {
|
|
1431
1634
|
try {
|
|
1432
|
-
|
|
1635
|
+
const clickArgs = Array.isArray(step.args) ? step.args : [step.args];
|
|
1636
|
+
yield executeAction(invokee, methodName, [clickArgs[0], { force: true }]);
|
|
1433
1637
|
}
|
|
1434
1638
|
catch (error) {
|
|
1435
1639
|
this.log(`Click action failed: ${error.message}`, logger_1.Level.WARN);
|
|
@@ -1485,6 +1689,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1485
1689
|
debugLog("Workflow aborted, stopping scrapeCurrentPage");
|
|
1486
1690
|
return;
|
|
1487
1691
|
}
|
|
1692
|
+
yield this.waitForDynamicStability(page, [{
|
|
1693
|
+
action: 'scrapeList',
|
|
1694
|
+
args: [config]
|
|
1695
|
+
}]);
|
|
1488
1696
|
const evaluationPromise = page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
1489
1697
|
const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error('Page evaluation timeout')), 10000));
|
|
1490
1698
|
let results;
|
|
@@ -1515,10 +1723,10 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1515
1723
|
allResults = allResults.concat(itemsToAdd);
|
|
1516
1724
|
this.serializableDataByType[actionType][actionName] = [...allResults];
|
|
1517
1725
|
yield this.options.serializableCallback({
|
|
1518
|
-
scrapeList: this.serializableDataByType
|
|
1519
|
-
scrapeSchema: this.serializableDataByType
|
|
1520
|
-
crawl: this.serializableDataByType
|
|
1521
|
-
search: this.serializableDataByType
|
|
1726
|
+
scrapeList: this.serializableDataByType['scrapeList'],
|
|
1727
|
+
scrapeSchema: this.serializableDataByType['scrapeSchema'],
|
|
1728
|
+
crawl: this.serializableDataByType['crawl'] || {},
|
|
1729
|
+
search: this.serializableDataByType['search'] || {}
|
|
1522
1730
|
});
|
|
1523
1731
|
});
|
|
1524
1732
|
const checkLimit = () => {
|
|
@@ -1845,7 +2053,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1845
2053
|
}
|
|
1846
2054
|
}
|
|
1847
2055
|
}
|
|
1848
|
-
yield page.waitForLoadState(
|
|
2056
|
+
yield page.waitForLoadState(this.getNavigationWaitStrategy(), { timeout: 15000 }).catch(() => { });
|
|
1849
2057
|
if (!paginationSuccess) {
|
|
1850
2058
|
const newUrl = page.url();
|
|
1851
2059
|
const afterSignature = yield captureContentSignature();
|
|
@@ -2028,7 +2236,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2028
2236
|
return workflow;
|
|
2029
2237
|
}
|
|
2030
2238
|
runLoop(p, workflow) {
|
|
2031
|
-
var _a, _b, _c;
|
|
2239
|
+
var _a, _b, _c, _d, _e;
|
|
2032
2240
|
return __awaiter(this, void 0, void 0, function* () {
|
|
2033
2241
|
if (this.isAborted) {
|
|
2034
2242
|
this.log('Workflow aborted in runLoop', logger_1.Level.WARN);
|
|
@@ -2139,7 +2347,11 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2139
2347
|
repeatCount = action === lastAction ? repeatCount + 1 : 0;
|
|
2140
2348
|
console.log("REPEAT COUNT", repeatCount);
|
|
2141
2349
|
if (this.options.maxRepeats && repeatCount > this.options.maxRepeats) {
|
|
2142
|
-
|
|
2350
|
+
const failedAction = ((_c = (_b = action === null || action === void 0 ? void 0 : action.what) === null || _b === void 0 ? void 0 : _b.find((w) => (w === null || w === void 0 ? void 0 : w.action) !== 'flag')) === null || _c === void 0 ? void 0 : _c.action) || 'unknown';
|
|
2351
|
+
const maxRepeats = this.options.maxRepeats;
|
|
2352
|
+
this.log(`Action ${String(failedAction)} exceeded max retries (${maxRepeats})`, logger_1.Level.ERROR);
|
|
2353
|
+
cleanup();
|
|
2354
|
+
throw new Error(`Action ${String(failedAction)} exceeded max retries (${maxRepeats})`);
|
|
2143
2355
|
}
|
|
2144
2356
|
lastAction = action;
|
|
2145
2357
|
if (this.isAborted) {
|
|
@@ -2148,13 +2360,13 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2148
2360
|
}
|
|
2149
2361
|
try {
|
|
2150
2362
|
console.log("Carrying out:", action.what);
|
|
2151
|
-
yield this.carryOutSteps(p, action.what);
|
|
2152
|
-
usedActions.push((
|
|
2363
|
+
yield this.carryOutSteps(p, action.what, workflowCopy);
|
|
2364
|
+
usedActions.push((_d = action.id) !== null && _d !== void 0 ? _d : 'undefined');
|
|
2153
2365
|
workflowCopy.splice(actionId, 1);
|
|
2154
2366
|
console.log(`Action with ID ${action.id} removed from the workflow copy.`);
|
|
2155
2367
|
this.executedActions++;
|
|
2156
2368
|
const percentage = Math.round((this.executedActions / this.totalActions) * 100);
|
|
2157
|
-
if ((
|
|
2369
|
+
if ((_e = this.options.debugChannel) === null || _e === void 0 ? void 0 : _e.progressUpdate) {
|
|
2158
2370
|
this.options.debugChannel.progressUpdate(this.executedActions, this.totalActions, percentage);
|
|
2159
2371
|
}
|
|
2160
2372
|
// const newSelectors = this.getPreviousSelectors(workflow, actionId);
|
|
@@ -2196,13 +2408,13 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2196
2408
|
timeoutPromise
|
|
2197
2409
|
]);
|
|
2198
2410
|
if (!isScriptLoaded) {
|
|
2199
|
-
yield page.addInitScript({ path:
|
|
2411
|
+
yield page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
|
|
2200
2412
|
}
|
|
2201
2413
|
}
|
|
2202
2414
|
catch (error) {
|
|
2203
2415
|
this.log(`Script check failed, adding script anyway: ${error.message}`, logger_1.Level.WARN);
|
|
2204
2416
|
try {
|
|
2205
|
-
yield page.addInitScript({ path:
|
|
2417
|
+
yield page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
|
|
2206
2418
|
}
|
|
2207
2419
|
catch (scriptError) {
|
|
2208
2420
|
this.log(`Failed to add script: ${scriptError.message}`, logger_1.Level.ERROR);
|
package/build/utils/concurrency.d.ts
CHANGED
|
@@ -15,10 +15,14 @@ export default class Concurrency {
|
|
|
15
15
|
*/
|
|
16
16
|
private jobQueue;
|
|
17
17
|
/**
|
|
18
|
-
*
|
|
18
|
+
* Resolve/reject callbacks of the waitForCompletion() promises.
|
|
19
19
|
*/
|
|
20
20
|
private waiting;
|
|
21
21
|
/**
|
|
22
|
+
* First worker error captured during current execution wave.
|
|
23
|
+
*/
|
|
24
|
+
private firstError;
|
|
25
|
+
/**
|
|
22
26
|
* Constructs a new instance of concurrency manager.
|
|
23
27
|
* @param {number} maxConcurrency Maximum number of workers running in parallel.
|
|
24
28
|
*/
|
package/build/utils/concurrency.js
CHANGED
|
@@ -22,9 +22,13 @@ class Concurrency {
|
|
|
22
22
|
*/
|
|
23
23
|
this.jobQueue = [];
|
|
24
24
|
/**
|
|
25
|
-
*
|
|
25
|
+
* Resolve/reject callbacks of the waitForCompletion() promises.
|
|
26
26
|
*/
|
|
27
27
|
this.waiting = [];
|
|
28
|
+
/**
|
|
29
|
+
* First worker error captured during current execution wave.
|
|
30
|
+
*/
|
|
31
|
+
this.firstError = null;
|
|
28
32
|
this.maxConcurrency = maxConcurrency;
|
|
29
33
|
}
|
|
30
34
|
/**
|
|
@@ -38,7 +42,11 @@ class Concurrency {
|
|
|
38
42
|
// console.debug("Job finished, running the next waiting job...");
|
|
39
43
|
this.runNextJob();
|
|
40
44
|
}).catch((error) => {
|
|
41
|
-
|
|
45
|
+
const normalizedError = error instanceof Error ? error : new Error(String(error));
|
|
46
|
+
console.error(`Job failed with error: ${normalizedError.message}`);
|
|
47
|
+
if (!this.firstError) {
|
|
48
|
+
this.firstError = normalizedError;
|
|
49
|
+
}
|
|
42
50
|
// Continue processing other jobs even if one fails
|
|
43
51
|
this.runNextJob();
|
|
44
52
|
});
|
|
@@ -48,7 +56,18 @@ class Concurrency {
|
|
|
48
56
|
this.activeWorkers -= 1;
|
|
49
57
|
if (this.activeWorkers === 0) {
|
|
50
58
|
// console.debug("This concurrency manager is idle!");
|
|
51
|
-
this.waiting
|
|
59
|
+
const pending = [...this.waiting];
|
|
60
|
+
this.waiting = [];
|
|
61
|
+
const pendingError = this.firstError;
|
|
62
|
+
this.firstError = null;
|
|
63
|
+
pending.forEach(({ resolve, reject }) => {
|
|
64
|
+
if (pendingError) {
|
|
65
|
+
reject(pendingError);
|
|
66
|
+
}
|
|
67
|
+
else {
|
|
68
|
+
resolve();
|
|
69
|
+
}
|
|
70
|
+
});
|
|
52
71
|
}
|
|
53
72
|
}
|
|
54
73
|
}
|
|
@@ -77,8 +96,8 @@ class Concurrency {
|
|
|
77
96
|
* @returns Promise, resolved after there is no running/waiting worker.
|
|
78
97
|
*/
|
|
79
98
|
waitForCompletion() {
|
|
80
|
-
return new Promise((
|
|
81
|
-
this.waiting.push(
|
|
99
|
+
return new Promise((resolve, reject) => {
|
|
100
|
+
this.waiting.push({ resolve, reject });
|
|
82
101
|
});
|
|
83
102
|
}
|
|
84
103
|
}
|