maxun-core 0.0.31 → 0.0.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -37,6 +37,7 @@ interface InterpreterOptions {
37
37
  serializableCallback: (output: any) => (void | Promise<void>);
38
38
  binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
39
39
  debug: boolean;
40
+ type?: 'extract' | 'scrape' | 'crawl' | 'search';
40
41
  debugChannel: Partial<{
41
42
  activeId: (id: number) => void;
42
43
  debugMessage: (msg: string) => void;
@@ -55,6 +56,7 @@ export default class Interpreter extends EventEmitter {
55
56
  private concurrency;
56
57
  private stopper;
57
58
  private isAborted;
59
+ private visualRenderRequired;
58
60
  private log;
59
61
  private cumulativeResults;
60
62
  private namedResults;
@@ -90,6 +92,43 @@ export default class Interpreter extends EventEmitter {
90
92
  * @returns True if `where` is applicable in the given context, false otherwise
91
93
  */
92
94
  private applicable;
95
+ /**
96
+ * Returns the optimal Playwright `waitUntil` navigation strategy based on
97
+ * whether the current operation requires visual rendering.
98
+ *
99
+ * - `'networkidle'` — used when screenshots are requested; waits for all
100
+ * sub-resources so the page renders correctly.
101
+ * - `'domcontentloaded'` — used for all DOM-only operations (scraping, crawling,
102
+ * extraction, search); skips stylesheet/image loading for
103
+ * maximum speed.
104
+ *
105
+ * @param blockOverride Pass `true` when the caller will take a screenshot
106
+ * or requires styled layout. When omitted, falls back to the instance's `visualRenderRequired` flag.
107
+ */
108
+ private getNavigationWaitStrategy;
109
+ /**
110
+ * Returns true if any step in the given `what` block requires a fully
111
+ * rendered page.
112
+ */
113
+ private blockNeedsVisualRender;
114
+ /**
115
+ * Returns true if any of the remaining blocks in the workflow require a visual render
116
+ * before the next page navigation.
117
+ */
118
+ private remainingWorkflowNeedsVisualRender;
119
+ /**
120
+ * Helper to wait for a "Network Quiet Window" (no meaningful activity for X ms).
121
+ */
122
+ private waitForNetworkQuiet;
123
+ /**
124
+ * Scans the remaining workflow to find the next meaningful extraction selector.
125
+ */
126
+ private getUpcomingExtractionSelector;
127
+ /**
128
+ * Function to wait for images to load.
129
+ */
130
+ private waitForImagesLoaded;
131
+ private waitForDynamicStability;
93
132
  /**
94
133
  * Given a Playwright's page object and a "declarative" list of actions, this function
95
134
  * calls all mentioned functions on the Page object.\
@@ -35,7 +35,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
35
35
  return (mod && mod.__esModule) ? mod : { "default": mod };
36
36
  };
37
37
  Object.defineProperty(exports, "__esModule", { value: true });
38
- const path_1 = __importDefault(require("path"));
38
+ const path = __importStar(require("path"));
39
39
  const events_1 = require("events");
40
40
  const logic_1 = require("./types/logic");
41
41
  const utils_1 = require("./utils/utils");
@@ -51,6 +51,7 @@ class Interpreter extends events_1.EventEmitter {
51
51
  super();
52
52
  this.stopper = null;
53
53
  this.isAborted = false;
54
+ this.visualRenderRequired = false;
54
55
  // private blocker: PlaywrightBlocker | null = null;
55
56
  this.cumulativeResults = [];
56
57
  this.namedResults = {};
@@ -69,6 +70,7 @@ class Interpreter extends events_1.EventEmitter {
69
70
  this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
70
71
  (0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN);
71
72
  }, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {} }, options);
73
+ this.visualRenderRequired = ((options === null || options === void 0 ? void 0 : options.type) === 'extract');
72
74
  this.concurrency = new concurrency_1.default(this.options.maxConcurrency);
73
75
  this.log = (...args) => (0, logger_1.default)(...args);
74
76
  const error = preprocessor_1.default.validateWorkflow(workflow);
@@ -290,6 +292,167 @@ class Interpreter extends events_1.EventEmitter {
290
292
  }
291
293
  });
292
294
  }
295
+ /**
296
+ * Returns the optimal Playwright `waitUntil` navigation strategy based on
297
+ * whether the current operation requires visual rendering.
298
+ *
299
+ * - `'networkidle'` — used when screenshots are requested; waits for all
300
+ * sub-resources so the page renders correctly.
301
+ * - `'domcontentloaded'` — used for all DOM-only operations (scraping, crawling,
302
+ * extraction, search); skips stylesheet/image loading for
303
+ * maximum speed.
304
+ *
305
+ * @param blockOverride Pass `true` when the caller will take a screenshot
306
+ * or requires styled layout. When omitted, falls back to the instance's `visualRenderRequired` flag.
307
+ */
308
+ getNavigationWaitStrategy(blockOverride) {
309
+ const finalRequirement = blockOverride !== null && blockOverride !== void 0 ? blockOverride : this.visualRenderRequired;
310
+ return finalRequirement ? 'networkidle' : 'domcontentloaded';
311
+ }
312
+ /**
313
+ * Returns true if any step in the given `what` block requires a fully
314
+ * rendered page.
315
+ */
316
+ blockNeedsVisualRender(steps) {
317
+ return steps.some((s) => {
318
+ var _a, _b;
319
+ if (s.action === 'screenshot')
320
+ return true;
321
+ if (s.action === 'scrapeList' || s.action === 'scrapeSchema')
322
+ return true;
323
+ const firstArg = Array.isArray(s.args) ? s.args[0] : s.args;
324
+ if (!firstArg || typeof firstArg !== 'object')
325
+ return false;
326
+ if (s.action === 'scrape') {
327
+ const formats = (_a = firstArg.formats) !== null && _a !== void 0 ? _a : [];
328
+ const heavyFormats = ['markdown', 'html', 'text', 'screenshot-visible', 'screenshot-full'];
329
+ return formats.some((f) => heavyFormats.includes(f));
330
+ }
331
+ if (s.action === 'crawl' || s.action === 'search') {
332
+ const outputFormats = (_b = firstArg.outputFormats) !== null && _b !== void 0 ? _b : [];
333
+ const heavyFormats = ['markdown', 'html', 'text', 'screenshot-visible', 'screenshot-full'];
334
+ return outputFormats.some((f) => heavyFormats.includes(f));
335
+ }
336
+ return false;
337
+ });
338
+ }
339
+ /**
340
+ * Returns true if any of the remaining blocks in the workflow require a visual render
341
+ * before the next page navigation.
342
+ */
343
+ remainingWorkflowNeedsVisualRender(remainingWorkflow) {
344
+ if (!remainingWorkflow || remainingWorkflow.length === 0)
345
+ return false;
346
+ for (let i = remainingWorkflow.length - 1; i >= 0; i--) {
347
+ const pair = remainingWorkflow[i];
348
+ if (this.blockNeedsVisualRender(pair.what))
349
+ return true;
350
+ if (pair.what.some(s => s.action === 'goto'))
351
+ return false;
352
+ }
353
+ return false;
354
+ }
355
+ /**
356
+ * Helper to wait for a "Network Quiet Window" (no meaningful activity for X ms).
357
+ */
358
+ waitForNetworkQuiet(page, timeout = 4000, quietWindow = 600) {
359
+ return __awaiter(this, void 0, void 0, function* () {
360
+ let lastRequestTime = Date.now();
361
+ const onRequest = () => { lastRequestTime = Date.now(); };
362
+ page.on('request', onRequest);
363
+ page.on('requestfinished', onRequest);
364
+ page.on('requestfailed', onRequest);
365
+ try {
366
+ const checkInterval = 100;
367
+ const start = Date.now();
368
+ while (Date.now() - start < timeout) {
369
+ if (Date.now() - lastRequestTime > quietWindow)
370
+ return;
371
+ yield new Promise(r => setTimeout(r, checkInterval));
372
+ }
373
+ }
374
+ finally {
375
+ page.off('request', onRequest);
376
+ page.off('requestfinished', onRequest);
377
+ page.off('requestfailed', onRequest);
378
+ }
379
+ });
380
+ }
381
+ /**
382
+ * Scans the remaining workflow to find the next meaningful extraction selector.
383
+ */
384
+ getUpcomingExtractionSelector(remainingWorkflow) {
385
+ if (!remainingWorkflow || remainingWorkflow.length === 0)
386
+ return null;
387
+ for (let i = remainingWorkflow.length - 1; i >= 0; i--) {
388
+ const pair = remainingWorkflow[i];
389
+ for (const s of pair.what) {
390
+ if (s.action === 'goto')
391
+ return null;
392
+ if (s.action === 'scrapeList' || s.action === 'scrapeSchema') {
393
+ const firstArg = Array.isArray(s.args) ? s.args[0] : s.args;
394
+ if (firstArg === null || firstArg === void 0 ? void 0 : firstArg.listSelector)
395
+ return firstArg.listSelector;
396
+ if (firstArg === null || firstArg === void 0 ? void 0 : firstArg.fields) {
397
+ const firstField = Object.values(firstArg.fields)[0];
398
+ if (firstField === null || firstField === void 0 ? void 0 : firstField.selector)
399
+ return firstField.selector;
400
+ }
401
+ if (firstArg === null || firstArg === void 0 ? void 0 : firstArg.selector)
402
+ return firstArg.selector;
403
+ }
404
+ }
405
+ }
406
+ return null;
407
+ }
408
+ /**
409
+ * Function to wait for images to load.
410
+ */
411
+ waitForImagesLoaded(page) {
412
+ return __awaiter(this, void 0, void 0, function* () {
413
+ yield page.waitForFunction(() => Array.from(document.images).every(img => img.complete), { timeout: 5000 }).catch(() => { });
414
+ });
415
+ }
416
+ waitForDynamicStability(page, upcomingWorkflow = []) {
417
+ return __awaiter(this, void 0, void 0, function* () {
418
+ try {
419
+ const targetSelector = this.getUpcomingExtractionSelector(upcomingWorkflow);
420
+ const signals = [
421
+ this.waitForNetworkQuiet(page, 10000, 1000),
422
+ page.evaluate(() => __awaiter(this, void 0, void 0, function* () {
423
+ let lastLen = 0;
424
+ let stableIterations = 0;
425
+ for (let i = 0; i < 60; i++) {
426
+ const currentLen = document.body.innerText.length;
427
+ if (currentLen > 200 && currentLen === lastLen) {
428
+ stableIterations++;
429
+ }
430
+ else {
431
+ stableIterations = 0;
432
+ }
433
+ if (stableIterations >= 8)
434
+ return true;
435
+ lastLen = currentLen;
436
+ yield new Promise(r => setTimeout(r, 100));
437
+ }
438
+ return false;
439
+ })).catch(() => { }),
440
+ new Promise(resolve => setTimeout(resolve, 10000))
441
+ ];
442
+ if (targetSelector) {
443
+ const found = yield page.waitForSelector(targetSelector, { timeout: 8000 }).catch(() => null);
444
+ if (found) {
445
+ yield new Promise(resolve => setTimeout(resolve, 1000));
446
+ return;
447
+ }
448
+ }
449
+ yield Promise.race(signals);
450
+ yield new Promise(resolve => setTimeout(resolve, 1500));
451
+ }
452
+ catch (e) {
453
+ }
454
+ });
455
+ }
293
456
  /**
294
457
  * Given a Playwright's page object and a "declarative" list of actions, this function
295
458
  * calls all mentioned functions on the Page object.\
@@ -299,7 +462,7 @@ class Interpreter extends events_1.EventEmitter {
299
462
  * @param page Playwright Page object
300
463
  * @param steps Array of actions.
301
464
  */
302
- carryOutSteps(page, steps) {
465
+ carryOutSteps(page, steps, currentWorkflow) {
303
466
  var _a;
304
467
  return __awaiter(this, void 0, void 0, function* () {
305
468
  if (this.isAborted) {
@@ -320,6 +483,7 @@ class Interpreter extends events_1.EventEmitter {
320
483
  if ((_b = this.options.debugChannel) === null || _b === void 0 ? void 0 : _b.setActionType) {
321
484
  this.options.debugChannel.setActionType("screenshot");
322
485
  }
486
+ yield this.waitForImagesLoaded(page);
323
487
  const screenshotBuffer = yield page.screenshot(Object.assign(Object.assign({}, params), { path: undefined }));
324
488
  const explicitName = (typeof nameOverride === 'string' && nameOverride.trim().length > 0) ? nameOverride.trim() : null;
325
489
  let screenshotName;
@@ -352,8 +516,7 @@ class Interpreter extends events_1.EventEmitter {
352
516
  let newPage = null;
353
517
  try {
354
518
  newPage = yield context.newPage();
355
- yield newPage.goto(link);
356
- yield newPage.waitForLoadState('networkidle');
519
+ yield newPage.goto(link, { waitUntil: this.getNavigationWaitStrategy() });
357
520
  yield this.runLoop(newPage, this.initializedWorkflow);
358
521
  }
359
522
  catch (e) {
@@ -381,6 +544,10 @@ class Interpreter extends events_1.EventEmitter {
381
544
  if ((_d = this.options.debugChannel) === null || _d === void 0 ? void 0 : _d.setActionType) {
382
545
  this.options.debugChannel.setActionType('scrape');
383
546
  }
547
+ yield this.waitForDynamicStability(page, [{
548
+ action: 'scrape',
549
+ args: [selector]
550
+ }]);
384
551
  yield this.ensureScriptsLoaded(page);
385
552
  const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
386
553
  yield this.options.serializableCallback(scrapeResults);
@@ -394,12 +561,22 @@ class Interpreter extends events_1.EventEmitter {
394
561
  if ((_e = this.options.debugChannel) === null || _e === void 0 ? void 0 : _e.setActionType) {
395
562
  this.options.debugChannel.setActionType('scrapeSchema');
396
563
  }
564
+ yield this.waitForDynamicStability(page, [{
565
+ action: 'scrapeSchema',
566
+ args: [schema]
567
+ }]);
397
568
  if (this.options.mode && this.options.mode === 'editor') {
398
569
  yield this.options.serializableCallback({});
399
570
  return;
400
571
  }
401
572
  yield this.ensureScriptsLoaded(page);
402
- const scrapeResult = yield page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);
573
+ const normalizedSchema = Object.fromEntries(Object.entries(schema).map(([key, value]) => [
574
+ key,
575
+ typeof value === 'string'
576
+ ? { selector: value, tag: '', attribute: 'innerText', shadow: '' }
577
+ : value,
578
+ ]));
579
+ const scrapeResult = yield page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), normalizedSchema);
403
580
  if (!this.cumulativeResults || !Array.isArray(this.cumulativeResults)) {
404
581
  this.cumulativeResults = [];
405
582
  }
@@ -446,10 +623,10 @@ class Interpreter extends events_1.EventEmitter {
446
623
  }
447
624
  this.serializableDataByType[actionType][name] = [...this.cumulativeResults];
448
625
  yield this.options.serializableCallback({
449
- scrapeList: this.serializableDataByType.scrapeList,
450
- scrapeSchema: this.serializableDataByType.scrapeSchema,
451
- crawl: this.serializableDataByType.crawl || {},
452
- search: this.serializableDataByType.search || {}
626
+ scrapeList: this.serializableDataByType['scrapeList'],
627
+ scrapeSchema: this.serializableDataByType['scrapeSchema'],
628
+ crawl: this.serializableDataByType['crawl'] || {},
629
+ search: this.serializableDataByType['search'] || {}
453
630
  });
454
631
  }),
455
632
  scrapeList: (config, actionName = "") => __awaiter(this, void 0, void 0, function* () {
@@ -505,8 +682,8 @@ class Interpreter extends events_1.EventEmitter {
505
682
  }
506
683
  this.serializableDataByType[actionType][name].push(...scrapeResults);
507
684
  yield this.options.serializableCallback({
508
- scrapeList: this.serializableDataByType.scrapeList,
509
- scrapeSchema: this.serializableDataByType.scrapeSchema
685
+ scrapeList: this.serializableDataByType['scrapeList'],
686
+ scrapeSchema: this.serializableDataByType['scrapeSchema']
510
687
  });
511
688
  }
512
689
  }
@@ -525,8 +702,8 @@ class Interpreter extends events_1.EventEmitter {
525
702
  this.serializableDataByType[actionType] = {};
526
703
  this.serializableDataByType[actionType][name] = [];
527
704
  yield this.options.serializableCallback({
528
- scrapeList: this.serializableDataByType.scrapeList,
529
- scrapeSchema: this.serializableDataByType.scrapeSchema
705
+ scrapeList: this.serializableDataByType['scrapeList'],
706
+ scrapeSchema: this.serializableDataByType['scrapeSchema']
530
707
  });
531
708
  }
532
709
  }),
@@ -765,8 +942,7 @@ class Interpreter extends events_1.EventEmitter {
765
942
  };
766
943
  const extractLinksFromPage = () => __awaiter(this, void 0, void 0, function* () {
767
944
  try {
768
- yield page.waitForLoadState('load', { timeout: 15000 }).catch(() => { });
769
- yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => { });
945
+ yield page.waitForLoadState(this.getNavigationWaitStrategy(), { timeout: 15000 }).catch(() => { });
770
946
  yield new Promise(resolve => setTimeout(resolve, 1000));
771
947
  const pageLinks = yield page.evaluate(() => {
772
948
  const links = [];
@@ -937,12 +1113,12 @@ class Interpreter extends events_1.EventEmitter {
937
1113
  yield new Promise(resolve => setTimeout(resolve, robotRules.crawlDelay));
938
1114
  }
939
1115
  yield page.goto(url, {
940
- waitUntil: 'domcontentloaded',
1116
+ waitUntil: this.getNavigationWaitStrategy(),
941
1117
  timeout: 30000
942
1118
  }).catch((err) => {
943
1119
  throw new Error(`Navigation failed: ${err.message}`);
944
1120
  });
945
- yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
1121
+ yield this.waitForDynamicStability(page, currentWorkflow || []);
946
1122
  const pageResult = yield scrapePageContent(url);
947
1123
  pageResult.metadata.depth = depth;
948
1124
  crawlResults.push(pageResult);
@@ -989,10 +1165,10 @@ class Interpreter extends events_1.EventEmitter {
989
1165
  }
990
1166
  this.serializableDataByType[actionType][actionName] = crawlResults;
991
1167
  yield this.options.serializableCallback({
992
- scrapeList: this.serializableDataByType.scrapeList || {},
993
- scrapeSchema: this.serializableDataByType.scrapeSchema || {},
994
- crawl: this.serializableDataByType.crawl || {},
995
- search: this.serializableDataByType.search || {}
1168
+ scrapeList: this.serializableDataByType['scrapeList'] || {},
1169
+ scrapeSchema: this.serializableDataByType['scrapeSchema'] || {},
1170
+ crawl: this.serializableDataByType['crawl'] || {},
1171
+ search: this.serializableDataByType['search'] || {}
996
1172
  });
997
1173
  }
998
1174
  catch (error) {
@@ -1025,7 +1201,7 @@ class Interpreter extends events_1.EventEmitter {
1025
1201
  const initialDelay = 500 + Math.random() * 1000;
1026
1202
  yield new Promise(resolve => setTimeout(resolve, initialDelay));
1027
1203
  yield page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
1028
- yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => {
1204
+ yield page.waitForLoadState(this.getNavigationWaitStrategy(), { timeout: 15000 }).catch(() => {
1029
1205
  this.log('Load state timeout, continuing anyway', logger_1.Level.WARN);
1030
1206
  });
1031
1207
  const pageLoadDelay = 2000 + Math.random() * 1500;
@@ -1214,10 +1390,10 @@ class Interpreter extends events_1.EventEmitter {
1214
1390
  };
1215
1391
  this.serializableDataByType[actionType][actionName] = searchData;
1216
1392
  yield this.options.serializableCallback({
1217
- scrapeList: this.serializableDataByType.scrapeList || {},
1218
- scrapeSchema: this.serializableDataByType.scrapeSchema || {},
1219
- crawl: this.serializableDataByType.crawl || {},
1220
- search: this.serializableDataByType.search || {}
1393
+ scrapeList: this.serializableDataByType['scrapeList'] || {},
1394
+ scrapeSchema: this.serializableDataByType['scrapeSchema'] || {},
1395
+ crawl: this.serializableDataByType['crawl'] || {},
1396
+ search: this.serializableDataByType['search'] || {}
1221
1397
  });
1222
1398
  this.log(`Search completed in discover mode with ${searchResults.length} results`, logger_1.Level.LOG);
1223
1399
  return;
@@ -1229,12 +1405,12 @@ class Interpreter extends events_1.EventEmitter {
1229
1405
  try {
1230
1406
  this.log(`[${i + 1}/${searchResults.length}] Scraping: ${result.url}`, logger_1.Level.LOG);
1231
1407
  yield page.goto(result.url, {
1232
- waitUntil: 'domcontentloaded',
1408
+ waitUntil: this.getNavigationWaitStrategy(),
1233
1409
  timeout: 30000
1234
1410
  }).catch(() => {
1235
1411
  this.log(`Failed to navigate to ${result.url}, skipping...`, logger_1.Level.WARN);
1236
1412
  });
1237
- yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
1413
+ yield this.waitForDynamicStability(page, currentWorkflow || []);
1238
1414
  const pageData = yield page.evaluate(() => {
1239
1415
  var _a, _b;
1240
1416
  const getMeta = (name) => {
@@ -1321,10 +1497,10 @@ class Interpreter extends events_1.EventEmitter {
1321
1497
  };
1322
1498
  this.serializableDataByType[actionType][actionName] = searchData;
1323
1499
  yield this.options.serializableCallback({
1324
- scrapeList: this.serializableDataByType.scrapeList || {},
1325
- scrapeSchema: this.serializableDataByType.scrapeSchema || {},
1326
- crawl: this.serializableDataByType.crawl || {},
1327
- search: this.serializableDataByType.search || {}
1500
+ scrapeList: this.serializableDataByType['scrapeList'] || {},
1501
+ scrapeSchema: this.serializableDataByType['scrapeSchema'] || {},
1502
+ crawl: this.serializableDataByType['crawl'] || {},
1503
+ search: this.serializableDataByType['search'] || {}
1328
1504
  });
1329
1505
  }
1330
1506
  catch (error) {
@@ -1402,19 +1578,52 @@ class Interpreter extends events_1.EventEmitter {
1402
1578
  for (const level of levels.splice(0, levels.length - 1)) {
1403
1579
  invokee = invokee[level];
1404
1580
  }
1405
- if (methodName === 'waitForLoadState') {
1581
+ if (methodName === 'goto') {
1582
+ try {
1583
+ const gotoArgs = step.args || [];
1584
+ const url = gotoArgs[0];
1585
+ const existingOpts = (typeof gotoArgs[1] === 'object' && gotoArgs[1] !== null)
1586
+ ? Object.assign({}, gotoArgs[1]) : {};
1587
+ const requestedWait = existingOpts.waitUntil;
1588
+ const remaining = (currentWorkflow || []).slice(0, -1);
1589
+ const needsDataSoon = this.blockNeedsVisualRender(steps) || this.remainingWorkflowNeedsVisualRender(remaining);
1590
+ if (!requestedWait || requestedWait === 'networkidle' || requestedWait === 'load') {
1591
+ existingOpts.waitUntil = 'domcontentloaded';
1592
+ this.log(`goto: navigation speed-optimized to 'domcontentloaded' + surgical-ready midground`, logger_1.Level.LOG);
1593
+ }
1594
+ if (!existingOpts.timeout)
1595
+ existingOpts.timeout = 15000;
1596
+ yield executeAction(invokee, methodName, [url, existingOpts]);
1597
+ if (needsDataSoon) {
1598
+ yield this.waitForDynamicStability(page, (currentWorkflow || []).slice(0, -1));
1599
+ }
1600
+ }
1601
+ catch (error) {
1602
+ this.log(`goto failed: ${error.message}`, logger_1.Level.WARN);
1603
+ }
1604
+ }
1605
+ else if (methodName === 'waitForLoadState') {
1406
1606
  try {
1407
1607
  let args = step.args;
1408
- if (Array.isArray(args) && args.length === 1) {
1409
- args = [args[0], { timeout: 30000 }];
1608
+ if (!Array.isArray(args)) {
1609
+ args = [args];
1410
1610
  }
1411
- else if (!Array.isArray(args)) {
1412
- args = [args, { timeout: 30000 }];
1611
+ const requestedState = args[0];
1612
+ const remaining = (currentWorkflow || []).slice(0, -1);
1613
+ const needsDataSoon = this.blockNeedsVisualRender(steps) || this.remainingWorkflowNeedsVisualRender(remaining);
1614
+ const optimalState = (requestedState === 'networkidle' || requestedState === 'load')
1615
+ ? 'domcontentloaded'
1616
+ : requestedState;
1617
+ this.log(`waitForLoadState: workflow requested '${requestedState}', using 'domcontentloaded' + surgical-ready midground`, logger_1.Level.LOG);
1618
+ args = [optimalState, { timeout: 15000 }];
1619
+ yield executeAction(invokee, methodName, args);
1620
+ if (needsDataSoon) {
1621
+ yield this.waitForDynamicStability(page, (currentWorkflow || []).slice(0, -1));
1413
1622
  }
1414
- yield executeAction(invokee, methodName, step.args);
1415
1623
  }
1416
1624
  catch (error) {
1417
- yield executeAction(invokee, methodName, 'domcontentloaded');
1625
+ yield executeAction(invokee, methodName, ['domcontentloaded', { timeout: 10000 }]);
1626
+ yield this.waitForDynamicStability(page, (currentWorkflow || []).slice(0, -1));
1418
1627
  }
1419
1628
  }
1420
1629
  else if (methodName === 'click') {
@@ -1423,7 +1632,8 @@ class Interpreter extends events_1.EventEmitter {
1423
1632
  }
1424
1633
  catch (error) {
1425
1634
  try {
1426
- yield executeAction(invokee, methodName, [step.args[0], { force: true }]);
1635
+ const clickArgs = Array.isArray(step.args) ? step.args : [step.args];
1636
+ yield executeAction(invokee, methodName, [clickArgs[0], { force: true }]);
1427
1637
  }
1428
1638
  catch (error) {
1429
1639
  this.log(`Click action failed: ${error.message}`, logger_1.Level.WARN);
@@ -1479,6 +1689,10 @@ class Interpreter extends events_1.EventEmitter {
1479
1689
  debugLog("Workflow aborted, stopping scrapeCurrentPage");
1480
1690
  return;
1481
1691
  }
1692
+ yield this.waitForDynamicStability(page, [{
1693
+ action: 'scrapeList',
1694
+ args: [config]
1695
+ }]);
1482
1696
  const evaluationPromise = page.evaluate((cfg) => window.scrapeList(cfg), config);
1483
1697
  const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error('Page evaluation timeout')), 10000));
1484
1698
  let results;
@@ -1509,10 +1723,10 @@ class Interpreter extends events_1.EventEmitter {
1509
1723
  allResults = allResults.concat(itemsToAdd);
1510
1724
  this.serializableDataByType[actionType][actionName] = [...allResults];
1511
1725
  yield this.options.serializableCallback({
1512
- scrapeList: this.serializableDataByType.scrapeList,
1513
- scrapeSchema: this.serializableDataByType.scrapeSchema,
1514
- crawl: this.serializableDataByType.crawl || {},
1515
- search: this.serializableDataByType.search || {}
1726
+ scrapeList: this.serializableDataByType['scrapeList'],
1727
+ scrapeSchema: this.serializableDataByType['scrapeSchema'],
1728
+ crawl: this.serializableDataByType['crawl'] || {},
1729
+ search: this.serializableDataByType['search'] || {}
1516
1730
  });
1517
1731
  });
1518
1732
  const checkLimit = () => {
@@ -1839,7 +2053,7 @@ class Interpreter extends events_1.EventEmitter {
1839
2053
  }
1840
2054
  }
1841
2055
  }
1842
- yield page.waitForLoadState('networkidle', { timeout: 5000 }).catch(() => { });
2056
+ yield page.waitForLoadState(this.getNavigationWaitStrategy(), { timeout: 15000 }).catch(() => { });
1843
2057
  if (!paginationSuccess) {
1844
2058
  const newUrl = page.url();
1845
2059
  const afterSignature = yield captureContentSignature();
@@ -2022,7 +2236,7 @@ class Interpreter extends events_1.EventEmitter {
2022
2236
  return workflow;
2023
2237
  }
2024
2238
  runLoop(p, workflow) {
2025
- var _a, _b, _c;
2239
+ var _a, _b, _c, _d, _e;
2026
2240
  return __awaiter(this, void 0, void 0, function* () {
2027
2241
  if (this.isAborted) {
2028
2242
  this.log('Workflow aborted in runLoop', logger_1.Level.WARN);
@@ -2133,7 +2347,11 @@ class Interpreter extends events_1.EventEmitter {
2133
2347
  repeatCount = action === lastAction ? repeatCount + 1 : 0;
2134
2348
  console.log("REPEAT COUNT", repeatCount);
2135
2349
  if (this.options.maxRepeats && repeatCount > this.options.maxRepeats) {
2136
- return;
2350
+ const failedAction = ((_c = (_b = action === null || action === void 0 ? void 0 : action.what) === null || _b === void 0 ? void 0 : _b.find((w) => (w === null || w === void 0 ? void 0 : w.action) !== 'flag')) === null || _c === void 0 ? void 0 : _c.action) || 'unknown';
2351
+ const maxRepeats = this.options.maxRepeats;
2352
+ this.log(`Action ${String(failedAction)} exceeded max retries (${maxRepeats})`, logger_1.Level.ERROR);
2353
+ cleanup();
2354
+ throw new Error(`Action ${String(failedAction)} exceeded max retries (${maxRepeats})`);
2137
2355
  }
2138
2356
  lastAction = action;
2139
2357
  if (this.isAborted) {
@@ -2142,13 +2360,13 @@ class Interpreter extends events_1.EventEmitter {
2142
2360
  }
2143
2361
  try {
2144
2362
  console.log("Carrying out:", action.what);
2145
- yield this.carryOutSteps(p, action.what);
2146
- usedActions.push((_b = action.id) !== null && _b !== void 0 ? _b : 'undefined');
2363
+ yield this.carryOutSteps(p, action.what, workflowCopy);
2364
+ usedActions.push((_d = action.id) !== null && _d !== void 0 ? _d : 'undefined');
2147
2365
  workflowCopy.splice(actionId, 1);
2148
2366
  console.log(`Action with ID ${action.id} removed from the workflow copy.`);
2149
2367
  this.executedActions++;
2150
2368
  const percentage = Math.round((this.executedActions / this.totalActions) * 100);
2151
- if ((_c = this.options.debugChannel) === null || _c === void 0 ? void 0 : _c.progressUpdate) {
2369
+ if ((_e = this.options.debugChannel) === null || _e === void 0 ? void 0 : _e.progressUpdate) {
2152
2370
  this.options.debugChannel.progressUpdate(this.executedActions, this.totalActions, percentage);
2153
2371
  }
2154
2372
  // const newSelectors = this.getPreviousSelectors(workflow, actionId);
@@ -2190,13 +2408,13 @@ class Interpreter extends events_1.EventEmitter {
2190
2408
  timeoutPromise
2191
2409
  ]);
2192
2410
  if (!isScriptLoaded) {
2193
- yield page.addInitScript({ path: path_1.default.join(__dirname, 'browserSide', 'scraper.js') });
2411
+ yield page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
2194
2412
  }
2195
2413
  }
2196
2414
  catch (error) {
2197
2415
  this.log(`Script check failed, adding script anyway: ${error.message}`, logger_1.Level.WARN);
2198
2416
  try {
2199
- yield page.addInitScript({ path: path_1.default.join(__dirname, 'browserSide', 'scraper.js') });
2417
+ yield page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
2200
2418
  }
2201
2419
  catch (scriptError) {
2202
2420
  this.log(`Failed to add script: ${scriptError.message}`, logger_1.Level.ERROR);
@@ -15,10 +15,14 @@ export default class Concurrency {
15
15
  */
16
16
  private jobQueue;
17
17
  /**
18
- * "Resolve" callbacks of the waitForCompletion() promises.
18
+ * Resolve/reject callbacks of the waitForCompletion() promises.
19
19
  */
20
20
  private waiting;
21
21
  /**
22
+ * First worker error captured during current execution wave.
23
+ */
24
+ private firstError;
25
+ /**
22
26
  * Constructs a new instance of concurrency manager.
23
27
  * @param {number} maxConcurrency Maximum number of workers running in parallel.
24
28
  */
@@ -22,9 +22,13 @@ class Concurrency {
22
22
  */
23
23
  this.jobQueue = [];
24
24
  /**
25
- * "Resolve" callbacks of the waitForCompletion() promises.
25
+ * Resolve/reject callbacks of the waitForCompletion() promises.
26
26
  */
27
27
  this.waiting = [];
28
+ /**
29
+ * First worker error captured during current execution wave.
30
+ */
31
+ this.firstError = null;
28
32
  this.maxConcurrency = maxConcurrency;
29
33
  }
30
34
  /**
@@ -38,7 +42,11 @@ class Concurrency {
38
42
  // console.debug("Job finished, running the next waiting job...");
39
43
  this.runNextJob();
40
44
  }).catch((error) => {
41
- console.error(`Job failed with error: ${error.message}`);
45
+ const normalizedError = error instanceof Error ? error : new Error(String(error));
46
+ console.error(`Job failed with error: ${normalizedError.message}`);
47
+ if (!this.firstError) {
48
+ this.firstError = normalizedError;
49
+ }
42
50
  // Continue processing other jobs even if one fails
43
51
  this.runNextJob();
44
52
  });
@@ -48,7 +56,18 @@ class Concurrency {
48
56
  this.activeWorkers -= 1;
49
57
  if (this.activeWorkers === 0) {
50
58
  // console.debug("This concurrency manager is idle!");
51
- this.waiting.forEach((x) => x());
59
+ const pending = [...this.waiting];
60
+ this.waiting = [];
61
+ const pendingError = this.firstError;
62
+ this.firstError = null;
63
+ pending.forEach(({ resolve, reject }) => {
64
+ if (pendingError) {
65
+ reject(pendingError);
66
+ }
67
+ else {
68
+ resolve();
69
+ }
70
+ });
52
71
  }
53
72
  }
54
73
  }
@@ -77,8 +96,8 @@ class Concurrency {
77
96
  * @returns Promise, resolved after there is no running/waiting worker.
78
97
  */
79
98
  waitForCompletion() {
80
- return new Promise((res) => {
81
- this.waiting.push(res);
99
+ return new Promise((resolve, reject) => {
100
+ this.waiting.push({ resolve, reject });
82
101
  });
83
102
  }
84
103
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.31",
3
+ "version": "0.0.33",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",