maxun-core 0.0.32 → 0.0.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -37,6 +37,7 @@ interface InterpreterOptions {
37
37
  serializableCallback: (output: any) => (void | Promise<void>);
38
38
  binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
39
39
  debug: boolean;
40
+ type?: 'extract' | 'scrape' | 'crawl' | 'search' | 'doc-extract' | 'doc-parse';
40
41
  debugChannel: Partial<{
41
42
  activeId: (id: number) => void;
42
43
  debugMessage: (msg: string) => void;
@@ -55,6 +56,7 @@ export default class Interpreter extends EventEmitter {
55
56
  private concurrency;
56
57
  private stopper;
57
58
  private isAborted;
59
+ private visualRenderRequired;
58
60
  private log;
59
61
  private cumulativeResults;
60
62
  private namedResults;
@@ -90,6 +92,43 @@ export default class Interpreter extends EventEmitter {
90
92
  * @returns True if `where` is applicable in the given context, false otherwise
91
93
  */
92
94
  private applicable;
95
+ /**
96
+ * Returns the optimal Playwright `waitUntil` navigation strategy based on
97
+ * whether the current operation requires visual rendering.
98
+ *
99
+ * - `'networkidle'` — used when screenshots are requested; waits for all
100
+ * sub-resources so the page renders correctly.
101
+ * - `'domcontentloaded'` — used for all DOM-only operations (scraping, crawling,
102
+ * extraction, search); skips stylesheet/image loading for
103
+ * maximum speed.
104
+ *
105
+ * @param blockOverride Pass `true` when the caller will take a screenshot
106
+ * or requires styled layout. Defaults to `false`.
107
+ */
108
+ private getNavigationWaitStrategy;
109
+ /**
110
+ * Returns true if any step in the given `what` block requires a fully
111
+ * rendered page.
112
+ */
113
+ private blockNeedsVisualRender;
114
+ /**
115
+ * Returns true if any of the remaining blocks in the workflow require a visual render
116
+ * before the next page navigation.
117
+ */
118
+ private remainingWorkflowNeedsVisualRender;
119
+ /**
120
+ * Helper to wait for a "Network Quiet Window" (no meaningful activity for X ms).
121
+ */
122
+ private waitForNetworkQuiet;
123
+ /**
124
+ * Scans the remaining workflow to find the next meaningful extraction selector.
125
+ */
126
+ private getUpcomingExtractionSelector;
127
+ /**
128
+ * Function to wait for images to load.
129
+ */
130
+ private waitForImagesLoaded;
131
+ private waitForDynamicStability;
93
132
  /**
94
133
  * Given a Playwright's page object and a "declarative" list of actions, this function
95
134
  * calls all mentioned functions on the Page object.\
@@ -35,7 +35,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
35
35
  return (mod && mod.__esModule) ? mod : { "default": mod };
36
36
  };
37
37
  Object.defineProperty(exports, "__esModule", { value: true });
38
- const path_1 = __importDefault(require("path"));
38
+ const path = __importStar(require("path"));
39
39
  const events_1 = require("events");
40
40
  const logic_1 = require("./types/logic");
41
41
  const utils_1 = require("./utils/utils");
@@ -51,6 +51,7 @@ class Interpreter extends events_1.EventEmitter {
51
51
  super();
52
52
  this.stopper = null;
53
53
  this.isAborted = false;
54
+ this.visualRenderRequired = false;
54
55
  // private blocker: PlaywrightBlocker | null = null;
55
56
  this.cumulativeResults = [];
56
57
  this.namedResults = {};
@@ -69,6 +70,7 @@ class Interpreter extends events_1.EventEmitter {
69
70
  this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
70
71
  (0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN);
71
72
  }, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {} }, options);
73
+ this.visualRenderRequired = ((options === null || options === void 0 ? void 0 : options.type) === 'extract');
72
74
  this.concurrency = new concurrency_1.default(this.options.maxConcurrency);
73
75
  this.log = (...args) => (0, logger_1.default)(...args);
74
76
  const error = preprocessor_1.default.validateWorkflow(workflow);
@@ -290,6 +292,167 @@ class Interpreter extends events_1.EventEmitter {
290
292
  }
291
293
  });
292
294
  }
295
+ /**
296
+ * Returns the optimal Playwright `waitUntil` navigation strategy based on
297
+ * whether the current operation requires visual rendering.
298
+ *
299
+ * - `'networkidle'` — used when screenshots are requested; waits for all
300
+ * sub-resources so the page renders correctly.
301
+ * - `'domcontentloaded'` — used for all DOM-only operations (scraping, crawling,
302
+ * extraction, search); skips stylesheet/image loading for
303
+ * maximum speed.
304
+ *
305
+ * @param blockOverride Pass `true` when the caller will take a screenshot
306
+ * or requires styled layout. Defaults to `false`.
307
+ */
308
+ getNavigationWaitStrategy(blockOverride) {
309
+ const finalRequirement = blockOverride !== null && blockOverride !== void 0 ? blockOverride : this.visualRenderRequired;
310
+ return finalRequirement ? 'networkidle' : 'domcontentloaded';
311
+ }
312
+ /**
313
+ * Returns true if any step in the given `what` block requires a fully
314
+ * rendered page.
315
+ */
316
+ blockNeedsVisualRender(steps) {
317
+ return steps.some((s) => {
318
+ var _a, _b;
319
+ if (s.action === 'screenshot')
320
+ return true;
321
+ if (s.action === 'scrapeList' || s.action === 'scrapeSchema')
322
+ return true;
323
+ const firstArg = Array.isArray(s.args) ? s.args[0] : s.args;
324
+ if (!firstArg || typeof firstArg !== 'object')
325
+ return false;
326
+ if (s.action === 'scrape') {
327
+ const formats = (_a = firstArg.formats) !== null && _a !== void 0 ? _a : [];
328
+ const heavyFormats = ['markdown', 'html', 'text', 'screenshot-visible', 'screenshot-full'];
329
+ return formats.some((f) => heavyFormats.includes(f));
330
+ }
331
+ if (s.action === 'crawl' || s.action === 'search') {
332
+ const outputFormats = (_b = firstArg.outputFormats) !== null && _b !== void 0 ? _b : [];
333
+ const heavyFormats = ['markdown', 'html', 'text', 'screenshot-visible', 'screenshot-full'];
334
+ return outputFormats.some((f) => heavyFormats.includes(f));
335
+ }
336
+ return false;
337
+ });
338
+ }
339
+ /**
340
+ * Returns true if any of the remaining blocks in the workflow require a visual render
341
+ * before the next page navigation.
342
+ */
343
+ remainingWorkflowNeedsVisualRender(remainingWorkflow) {
344
+ if (!remainingWorkflow || remainingWorkflow.length === 0)
345
+ return false;
346
+ for (let i = remainingWorkflow.length - 1; i >= 0; i--) {
347
+ const pair = remainingWorkflow[i];
348
+ if (this.blockNeedsVisualRender(pair.what))
349
+ return true;
350
+ if (pair.what.some(s => s.action === 'goto'))
351
+ return false;
352
+ }
353
+ return false;
354
+ }
355
+ /**
356
+ * Helper to wait for a "Network Quiet Window" (no meaningful activity for X ms).
357
+ */
358
+ waitForNetworkQuiet(page, timeout = 4000, quietWindow = 600) {
359
+ return __awaiter(this, void 0, void 0, function* () {
360
+ let lastRequestTime = Date.now();
361
+ const onRequest = () => { lastRequestTime = Date.now(); };
362
+ page.on('request', onRequest);
363
+ page.on('requestfinished', onRequest);
364
+ page.on('requestfailed', onRequest);
365
+ try {
366
+ const checkInterval = 100;
367
+ const start = Date.now();
368
+ while (Date.now() - start < timeout) {
369
+ if (Date.now() - lastRequestTime > quietWindow)
370
+ return;
371
+ yield new Promise(r => setTimeout(r, checkInterval));
372
+ }
373
+ }
374
+ finally {
375
+ page.off('request', onRequest);
376
+ page.off('requestfinished', onRequest);
377
+ page.off('requestfailed', onRequest);
378
+ }
379
+ });
380
+ }
381
+ /**
382
+ * Scans the remaining workflow to find the next meaningful extraction selector.
383
+ */
384
+ getUpcomingExtractionSelector(remainingWorkflow) {
385
+ if (!remainingWorkflow || remainingWorkflow.length === 0)
386
+ return null;
387
+ for (let i = remainingWorkflow.length - 1; i >= 0; i--) {
388
+ const pair = remainingWorkflow[i];
389
+ for (const s of pair.what) {
390
+ if (s.action === 'goto')
391
+ return null;
392
+ if (s.action === 'scrapeList' || s.action === 'scrapeSchema') {
393
+ const firstArg = Array.isArray(s.args) ? s.args[0] : s.args;
394
+ if (firstArg === null || firstArg === void 0 ? void 0 : firstArg.listSelector)
395
+ return firstArg.listSelector;
396
+ if (firstArg === null || firstArg === void 0 ? void 0 : firstArg.fields) {
397
+ const firstField = Object.values(firstArg.fields)[0];
398
+ if (firstField === null || firstField === void 0 ? void 0 : firstField.selector)
399
+ return firstField.selector;
400
+ }
401
+ if (firstArg === null || firstArg === void 0 ? void 0 : firstArg.selector)
402
+ return firstArg.selector;
403
+ }
404
+ }
405
+ }
406
+ return null;
407
+ }
408
+ /**
409
+ * Function to wait for images to load.
410
+ */
411
+ waitForImagesLoaded(page) {
412
+ return __awaiter(this, void 0, void 0, function* () {
413
+ yield page.waitForFunction(() => Array.from(document.images).every(img => img.complete), { timeout: 5000 }).catch(() => { });
414
+ });
415
+ }
416
+ waitForDynamicStability(page, upcomingWorkflow = []) {
417
+ return __awaiter(this, void 0, void 0, function* () {
418
+ try {
419
+ const targetSelector = this.getUpcomingExtractionSelector(upcomingWorkflow);
420
+ const signals = [
421
+ this.waitForNetworkQuiet(page, 10000, 1000),
422
+ page.evaluate(() => __awaiter(this, void 0, void 0, function* () {
423
+ let lastLen = 0;
424
+ let stableIterations = 0;
425
+ for (let i = 0; i < 60; i++) {
426
+ const currentLen = document.body.innerText.length;
427
+ if (currentLen > 200 && currentLen === lastLen) {
428
+ stableIterations++;
429
+ }
430
+ else {
431
+ stableIterations = 0;
432
+ }
433
+ if (stableIterations >= 8)
434
+ return true;
435
+ lastLen = currentLen;
436
+ yield new Promise(r => setTimeout(r, 100));
437
+ }
438
+ return false;
439
+ })).catch(() => { }),
440
+ new Promise(resolve => setTimeout(resolve, 10000))
441
+ ];
442
+ if (targetSelector) {
443
+ const found = yield page.waitForSelector(targetSelector, { timeout: 8000 }).catch(() => null);
444
+ if (found) {
445
+ yield new Promise(resolve => setTimeout(resolve, 1000));
446
+ return;
447
+ }
448
+ }
449
+ yield Promise.race(signals);
450
+ yield new Promise(resolve => setTimeout(resolve, 1500));
451
+ }
452
+ catch (e) {
453
+ }
454
+ });
455
+ }
293
456
  /**
294
457
  * Given a Playwright's page object and a "declarative" list of actions, this function
295
458
  * calls all mentioned functions on the Page object.\
@@ -299,7 +462,7 @@ class Interpreter extends events_1.EventEmitter {
299
462
  * @param page Playwright Page object
300
463
  * @param steps Array of actions.
301
464
  */
302
- carryOutSteps(page, steps) {
465
+ carryOutSteps(page, steps, currentWorkflow) {
303
466
  var _a;
304
467
  return __awaiter(this, void 0, void 0, function* () {
305
468
  if (this.isAborted) {
@@ -320,6 +483,7 @@ class Interpreter extends events_1.EventEmitter {
320
483
  if ((_b = this.options.debugChannel) === null || _b === void 0 ? void 0 : _b.setActionType) {
321
484
  this.options.debugChannel.setActionType("screenshot");
322
485
  }
486
+ yield this.waitForImagesLoaded(page);
323
487
  const screenshotBuffer = yield page.screenshot(Object.assign(Object.assign({}, params), { path: undefined }));
324
488
  const explicitName = (typeof nameOverride === 'string' && nameOverride.trim().length > 0) ? nameOverride.trim() : null;
325
489
  let screenshotName;
@@ -352,8 +516,7 @@ class Interpreter extends events_1.EventEmitter {
352
516
  let newPage = null;
353
517
  try {
354
518
  newPage = yield context.newPage();
355
- yield newPage.goto(link);
356
- yield newPage.waitForLoadState('networkidle');
519
+ yield newPage.goto(link, { waitUntil: this.getNavigationWaitStrategy() });
357
520
  yield this.runLoop(newPage, this.initializedWorkflow);
358
521
  }
359
522
  catch (e) {
@@ -381,6 +544,10 @@ class Interpreter extends events_1.EventEmitter {
381
544
  if ((_d = this.options.debugChannel) === null || _d === void 0 ? void 0 : _d.setActionType) {
382
545
  this.options.debugChannel.setActionType('scrape');
383
546
  }
547
+ yield this.waitForDynamicStability(page, [{
548
+ action: 'scrape',
549
+ args: [selector]
550
+ }]);
384
551
  yield this.ensureScriptsLoaded(page);
385
552
  const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
386
553
  yield this.options.serializableCallback(scrapeResults);
@@ -394,6 +561,10 @@ class Interpreter extends events_1.EventEmitter {
394
561
  if ((_e = this.options.debugChannel) === null || _e === void 0 ? void 0 : _e.setActionType) {
395
562
  this.options.debugChannel.setActionType('scrapeSchema');
396
563
  }
564
+ yield this.waitForDynamicStability(page, [{
565
+ action: 'scrapeSchema',
566
+ args: [schema]
567
+ }]);
397
568
  if (this.options.mode && this.options.mode === 'editor') {
398
569
  yield this.options.serializableCallback({});
399
570
  return;
@@ -452,10 +623,10 @@ class Interpreter extends events_1.EventEmitter {
452
623
  }
453
624
  this.serializableDataByType[actionType][name] = [...this.cumulativeResults];
454
625
  yield this.options.serializableCallback({
455
- scrapeList: this.serializableDataByType.scrapeList,
456
- scrapeSchema: this.serializableDataByType.scrapeSchema,
457
- crawl: this.serializableDataByType.crawl || {},
458
- search: this.serializableDataByType.search || {}
626
+ scrapeList: this.serializableDataByType['scrapeList'],
627
+ scrapeSchema: this.serializableDataByType['scrapeSchema'],
628
+ crawl: this.serializableDataByType['crawl'] || {},
629
+ search: this.serializableDataByType['search'] || {}
459
630
  });
460
631
  }),
461
632
  scrapeList: (config, actionName = "") => __awaiter(this, void 0, void 0, function* () {
@@ -511,8 +682,8 @@ class Interpreter extends events_1.EventEmitter {
511
682
  }
512
683
  this.serializableDataByType[actionType][name].push(...scrapeResults);
513
684
  yield this.options.serializableCallback({
514
- scrapeList: this.serializableDataByType.scrapeList,
515
- scrapeSchema: this.serializableDataByType.scrapeSchema
685
+ scrapeList: this.serializableDataByType['scrapeList'],
686
+ scrapeSchema: this.serializableDataByType['scrapeSchema']
516
687
  });
517
688
  }
518
689
  }
@@ -531,8 +702,8 @@ class Interpreter extends events_1.EventEmitter {
531
702
  this.serializableDataByType[actionType] = {};
532
703
  this.serializableDataByType[actionType][name] = [];
533
704
  yield this.options.serializableCallback({
534
- scrapeList: this.serializableDataByType.scrapeList,
535
- scrapeSchema: this.serializableDataByType.scrapeSchema
705
+ scrapeList: this.serializableDataByType['scrapeList'],
706
+ scrapeSchema: this.serializableDataByType['scrapeSchema']
536
707
  });
537
708
  }
538
709
  }),
@@ -771,8 +942,7 @@ class Interpreter extends events_1.EventEmitter {
771
942
  };
772
943
  const extractLinksFromPage = () => __awaiter(this, void 0, void 0, function* () {
773
944
  try {
774
- yield page.waitForLoadState('load', { timeout: 15000 }).catch(() => { });
775
- yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => { });
945
+ yield page.waitForLoadState(this.getNavigationWaitStrategy(), { timeout: 15000 }).catch(() => { });
776
946
  yield new Promise(resolve => setTimeout(resolve, 1000));
777
947
  const pageLinks = yield page.evaluate(() => {
778
948
  const links = [];
@@ -943,12 +1113,12 @@ class Interpreter extends events_1.EventEmitter {
943
1113
  yield new Promise(resolve => setTimeout(resolve, robotRules.crawlDelay));
944
1114
  }
945
1115
  yield page.goto(url, {
946
- waitUntil: 'domcontentloaded',
1116
+ waitUntil: this.getNavigationWaitStrategy(),
947
1117
  timeout: 30000
948
1118
  }).catch((err) => {
949
1119
  throw new Error(`Navigation failed: ${err.message}`);
950
1120
  });
951
- yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
1121
+ yield this.waitForDynamicStability(page, currentWorkflow || []);
952
1122
  const pageResult = yield scrapePageContent(url);
953
1123
  pageResult.metadata.depth = depth;
954
1124
  crawlResults.push(pageResult);
@@ -995,10 +1165,10 @@ class Interpreter extends events_1.EventEmitter {
995
1165
  }
996
1166
  this.serializableDataByType[actionType][actionName] = crawlResults;
997
1167
  yield this.options.serializableCallback({
998
- scrapeList: this.serializableDataByType.scrapeList || {},
999
- scrapeSchema: this.serializableDataByType.scrapeSchema || {},
1000
- crawl: this.serializableDataByType.crawl || {},
1001
- search: this.serializableDataByType.search || {}
1168
+ scrapeList: this.serializableDataByType['scrapeList'] || {},
1169
+ scrapeSchema: this.serializableDataByType['scrapeSchema'] || {},
1170
+ crawl: this.serializableDataByType['crawl'] || {},
1171
+ search: this.serializableDataByType['search'] || {}
1002
1172
  });
1003
1173
  }
1004
1174
  catch (error) {
@@ -1031,7 +1201,7 @@ class Interpreter extends events_1.EventEmitter {
1031
1201
  const initialDelay = 500 + Math.random() * 1000;
1032
1202
  yield new Promise(resolve => setTimeout(resolve, initialDelay));
1033
1203
  yield page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
1034
- yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => {
1204
+ yield page.waitForLoadState(this.getNavigationWaitStrategy(), { timeout: 15000 }).catch(() => {
1035
1205
  this.log('Load state timeout, continuing anyway', logger_1.Level.WARN);
1036
1206
  });
1037
1207
  const pageLoadDelay = 2000 + Math.random() * 1500;
@@ -1220,10 +1390,10 @@ class Interpreter extends events_1.EventEmitter {
1220
1390
  };
1221
1391
  this.serializableDataByType[actionType][actionName] = searchData;
1222
1392
  yield this.options.serializableCallback({
1223
- scrapeList: this.serializableDataByType.scrapeList || {},
1224
- scrapeSchema: this.serializableDataByType.scrapeSchema || {},
1225
- crawl: this.serializableDataByType.crawl || {},
1226
- search: this.serializableDataByType.search || {}
1393
+ scrapeList: this.serializableDataByType['scrapeList'] || {},
1394
+ scrapeSchema: this.serializableDataByType['scrapeSchema'] || {},
1395
+ crawl: this.serializableDataByType['crawl'] || {},
1396
+ search: this.serializableDataByType['search'] || {}
1227
1397
  });
1228
1398
  this.log(`Search completed in discover mode with ${searchResults.length} results`, logger_1.Level.LOG);
1229
1399
  return;
@@ -1235,12 +1405,12 @@ class Interpreter extends events_1.EventEmitter {
1235
1405
  try {
1236
1406
  this.log(`[${i + 1}/${searchResults.length}] Scraping: ${result.url}`, logger_1.Level.LOG);
1237
1407
  yield page.goto(result.url, {
1238
- waitUntil: 'domcontentloaded',
1408
+ waitUntil: this.getNavigationWaitStrategy(),
1239
1409
  timeout: 30000
1240
1410
  }).catch(() => {
1241
1411
  this.log(`Failed to navigate to ${result.url}, skipping...`, logger_1.Level.WARN);
1242
1412
  });
1243
- yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
1413
+ yield this.waitForDynamicStability(page, currentWorkflow || []);
1244
1414
  const pageData = yield page.evaluate(() => {
1245
1415
  var _a, _b;
1246
1416
  const getMeta = (name) => {
@@ -1327,10 +1497,10 @@ class Interpreter extends events_1.EventEmitter {
1327
1497
  };
1328
1498
  this.serializableDataByType[actionType][actionName] = searchData;
1329
1499
  yield this.options.serializableCallback({
1330
- scrapeList: this.serializableDataByType.scrapeList || {},
1331
- scrapeSchema: this.serializableDataByType.scrapeSchema || {},
1332
- crawl: this.serializableDataByType.crawl || {},
1333
- search: this.serializableDataByType.search || {}
1500
+ scrapeList: this.serializableDataByType['scrapeList'] || {},
1501
+ scrapeSchema: this.serializableDataByType['scrapeSchema'] || {},
1502
+ crawl: this.serializableDataByType['crawl'] || {},
1503
+ search: this.serializableDataByType['search'] || {}
1334
1504
  });
1335
1505
  }
1336
1506
  catch (error) {
@@ -1408,19 +1578,52 @@ class Interpreter extends events_1.EventEmitter {
1408
1578
  for (const level of levels.splice(0, levels.length - 1)) {
1409
1579
  invokee = invokee[level];
1410
1580
  }
1411
- if (methodName === 'waitForLoadState') {
1581
+ if (methodName === 'goto') {
1582
+ try {
1583
+ const gotoArgs = step.args || [];
1584
+ const url = gotoArgs[0];
1585
+ const existingOpts = (typeof gotoArgs[1] === 'object' && gotoArgs[1] !== null)
1586
+ ? Object.assign({}, gotoArgs[1]) : {};
1587
+ const requestedWait = existingOpts.waitUntil;
1588
+ const remaining = (currentWorkflow || []).slice(0, -1);
1589
+ const needsDataSoon = this.blockNeedsVisualRender(steps) || this.remainingWorkflowNeedsVisualRender(remaining);
1590
+ if (!requestedWait || requestedWait === 'networkidle' || requestedWait === 'load') {
1591
+ existingOpts.waitUntil = 'domcontentloaded';
1592
+ this.log(`goto: navigation speed-optimized to 'domcontentloaded' + surgical-ready midground`, logger_1.Level.LOG);
1593
+ }
1594
+ if (!existingOpts.timeout)
1595
+ existingOpts.timeout = 15000;
1596
+ yield executeAction(invokee, methodName, [url, existingOpts]);
1597
+ if (needsDataSoon) {
1598
+ yield this.waitForDynamicStability(page, (currentWorkflow || []).slice(0, -1));
1599
+ }
1600
+ }
1601
+ catch (error) {
1602
+ this.log(`goto failed: ${error.message}`, logger_1.Level.WARN);
1603
+ }
1604
+ }
1605
+ else if (methodName === 'waitForLoadState') {
1412
1606
  try {
1413
1607
  let args = step.args;
1414
- if (Array.isArray(args) && args.length === 1) {
1415
- args = [args[0], { timeout: 30000 }];
1608
+ if (!Array.isArray(args)) {
1609
+ args = [args];
1416
1610
  }
1417
- else if (!Array.isArray(args)) {
1418
- args = [args, { timeout: 30000 }];
1611
+ const requestedState = args[0];
1612
+ const remaining = (currentWorkflow || []).slice(0, -1);
1613
+ const needsDataSoon = this.blockNeedsVisualRender(steps) || this.remainingWorkflowNeedsVisualRender(remaining);
1614
+ const optimalState = (requestedState === 'networkidle' || requestedState === 'load')
1615
+ ? 'domcontentloaded'
1616
+ : requestedState;
1617
+ this.log(`waitForLoadState: workflow requested '${requestedState}', using 'domcontentloaded' + surgical-ready midground`, logger_1.Level.LOG);
1618
+ args = [optimalState, { timeout: 15000 }];
1619
+ yield executeAction(invokee, methodName, args);
1620
+ if (needsDataSoon) {
1621
+ yield this.waitForDynamicStability(page, (currentWorkflow || []).slice(0, -1));
1419
1622
  }
1420
- yield executeAction(invokee, methodName, step.args);
1421
1623
  }
1422
1624
  catch (error) {
1423
- yield executeAction(invokee, methodName, 'domcontentloaded');
1625
+ yield executeAction(invokee, methodName, ['domcontentloaded', { timeout: 10000 }]);
1626
+ yield this.waitForDynamicStability(page, (currentWorkflow || []).slice(0, -1));
1424
1627
  }
1425
1628
  }
1426
1629
  else if (methodName === 'click') {
@@ -1429,7 +1632,8 @@ class Interpreter extends events_1.EventEmitter {
1429
1632
  }
1430
1633
  catch (error) {
1431
1634
  try {
1432
- yield executeAction(invokee, methodName, [step.args[0], { force: true }]);
1635
+ const clickArgs = Array.isArray(step.args) ? step.args : [step.args];
1636
+ yield executeAction(invokee, methodName, [clickArgs[0], { force: true }]);
1433
1637
  }
1434
1638
  catch (error) {
1435
1639
  this.log(`Click action failed: ${error.message}`, logger_1.Level.WARN);
@@ -1485,6 +1689,10 @@ class Interpreter extends events_1.EventEmitter {
1485
1689
  debugLog("Workflow aborted, stopping scrapeCurrentPage");
1486
1690
  return;
1487
1691
  }
1692
+ yield this.waitForDynamicStability(page, [{
1693
+ action: 'scrapeList',
1694
+ args: [config]
1695
+ }]);
1488
1696
  const evaluationPromise = page.evaluate((cfg) => window.scrapeList(cfg), config);
1489
1697
  const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error('Page evaluation timeout')), 10000));
1490
1698
  let results;
@@ -1515,10 +1723,10 @@ class Interpreter extends events_1.EventEmitter {
1515
1723
  allResults = allResults.concat(itemsToAdd);
1516
1724
  this.serializableDataByType[actionType][actionName] = [...allResults];
1517
1725
  yield this.options.serializableCallback({
1518
- scrapeList: this.serializableDataByType.scrapeList,
1519
- scrapeSchema: this.serializableDataByType.scrapeSchema,
1520
- crawl: this.serializableDataByType.crawl || {},
1521
- search: this.serializableDataByType.search || {}
1726
+ scrapeList: this.serializableDataByType['scrapeList'],
1727
+ scrapeSchema: this.serializableDataByType['scrapeSchema'],
1728
+ crawl: this.serializableDataByType['crawl'] || {},
1729
+ search: this.serializableDataByType['search'] || {}
1522
1730
  });
1523
1731
  });
1524
1732
  const checkLimit = () => {
@@ -1845,7 +2053,7 @@ class Interpreter extends events_1.EventEmitter {
1845
2053
  }
1846
2054
  }
1847
2055
  }
1848
- yield page.waitForLoadState('networkidle', { timeout: 5000 }).catch(() => { });
2056
+ yield page.waitForLoadState(this.getNavigationWaitStrategy(), { timeout: 15000 }).catch(() => { });
1849
2057
  if (!paginationSuccess) {
1850
2058
  const newUrl = page.url();
1851
2059
  const afterSignature = yield captureContentSignature();
@@ -2028,7 +2236,7 @@ class Interpreter extends events_1.EventEmitter {
2028
2236
  return workflow;
2029
2237
  }
2030
2238
  runLoop(p, workflow) {
2031
- var _a, _b, _c;
2239
+ var _a, _b, _c, _d, _e;
2032
2240
  return __awaiter(this, void 0, void 0, function* () {
2033
2241
  if (this.isAborted) {
2034
2242
  this.log('Workflow aborted in runLoop', logger_1.Level.WARN);
@@ -2139,7 +2347,11 @@ class Interpreter extends events_1.EventEmitter {
2139
2347
  repeatCount = action === lastAction ? repeatCount + 1 : 0;
2140
2348
  console.log("REPEAT COUNT", repeatCount);
2141
2349
  if (this.options.maxRepeats && repeatCount > this.options.maxRepeats) {
2142
- return;
2350
+ const failedAction = ((_c = (_b = action === null || action === void 0 ? void 0 : action.what) === null || _b === void 0 ? void 0 : _b.find((w) => (w === null || w === void 0 ? void 0 : w.action) !== 'flag')) === null || _c === void 0 ? void 0 : _c.action) || 'unknown';
2351
+ const maxRepeats = this.options.maxRepeats;
2352
+ this.log(`Action ${String(failedAction)} exceeded max retries (${maxRepeats})`, logger_1.Level.ERROR);
2353
+ cleanup();
2354
+ throw new Error(`Action ${String(failedAction)} exceeded max retries (${maxRepeats})`);
2143
2355
  }
2144
2356
  lastAction = action;
2145
2357
  if (this.isAborted) {
@@ -2148,13 +2360,13 @@ class Interpreter extends events_1.EventEmitter {
2148
2360
  }
2149
2361
  try {
2150
2362
  console.log("Carrying out:", action.what);
2151
- yield this.carryOutSteps(p, action.what);
2152
- usedActions.push((_b = action.id) !== null && _b !== void 0 ? _b : 'undefined');
2363
+ yield this.carryOutSteps(p, action.what, workflowCopy);
2364
+ usedActions.push((_d = action.id) !== null && _d !== void 0 ? _d : 'undefined');
2153
2365
  workflowCopy.splice(actionId, 1);
2154
2366
  console.log(`Action with ID ${action.id} removed from the workflow copy.`);
2155
2367
  this.executedActions++;
2156
2368
  const percentage = Math.round((this.executedActions / this.totalActions) * 100);
2157
- if ((_c = this.options.debugChannel) === null || _c === void 0 ? void 0 : _c.progressUpdate) {
2369
+ if ((_e = this.options.debugChannel) === null || _e === void 0 ? void 0 : _e.progressUpdate) {
2158
2370
  this.options.debugChannel.progressUpdate(this.executedActions, this.totalActions, percentage);
2159
2371
  }
2160
2372
  // const newSelectors = this.getPreviousSelectors(workflow, actionId);
@@ -2196,13 +2408,13 @@ class Interpreter extends events_1.EventEmitter {
2196
2408
  timeoutPromise
2197
2409
  ]);
2198
2410
  if (!isScriptLoaded) {
2199
- yield page.addInitScript({ path: path_1.default.join(__dirname, 'browserSide', 'scraper.js') });
2411
+ yield page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
2200
2412
  }
2201
2413
  }
2202
2414
  catch (error) {
2203
2415
  this.log(`Script check failed, adding script anyway: ${error.message}`, logger_1.Level.WARN);
2204
2416
  try {
2205
- yield page.addInitScript({ path: path_1.default.join(__dirname, 'browserSide', 'scraper.js') });
2417
+ yield page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
2206
2418
  }
2207
2419
  catch (scriptError) {
2208
2420
  this.log(`Failed to add script: ${scriptError.message}`, logger_1.Level.ERROR);
@@ -15,10 +15,14 @@ export default class Concurrency {
15
15
  */
16
16
  private jobQueue;
17
17
  /**
18
- * "Resolve" callbacks of the waitForCompletion() promises.
18
+ * Resolve/reject callbacks of the waitForCompletion() promises.
19
19
  */
20
20
  private waiting;
21
21
  /**
22
+ * First worker error captured during current execution wave.
23
+ */
24
+ private firstError;
25
+ /**
22
26
  * Constructs a new instance of concurrency manager.
23
27
  * @param {number} maxConcurrency Maximum number of workers running in parallel.
24
28
  */
@@ -22,9 +22,13 @@ class Concurrency {
22
22
  */
23
23
  this.jobQueue = [];
24
24
  /**
25
- * "Resolve" callbacks of the waitForCompletion() promises.
25
+ * Resolve/reject callbacks of the waitForCompletion() promises.
26
26
  */
27
27
  this.waiting = [];
28
+ /**
29
+ * First worker error captured during current execution wave.
30
+ */
31
+ this.firstError = null;
28
32
  this.maxConcurrency = maxConcurrency;
29
33
  }
30
34
  /**
@@ -38,7 +42,11 @@ class Concurrency {
38
42
  // console.debug("Job finished, running the next waiting job...");
39
43
  this.runNextJob();
40
44
  }).catch((error) => {
41
- console.error(`Job failed with error: ${error.message}`);
45
+ const normalizedError = error instanceof Error ? error : new Error(String(error));
46
+ console.error(`Job failed with error: ${normalizedError.message}`);
47
+ if (!this.firstError) {
48
+ this.firstError = normalizedError;
49
+ }
42
50
  // Continue processing other jobs even if one fails
43
51
  this.runNextJob();
44
52
  });
@@ -48,7 +56,18 @@ class Concurrency {
48
56
  this.activeWorkers -= 1;
49
57
  if (this.activeWorkers === 0) {
50
58
  // console.debug("This concurrency manager is idle!");
51
- this.waiting.forEach((x) => x());
59
+ const pending = [...this.waiting];
60
+ this.waiting = [];
61
+ const pendingError = this.firstError;
62
+ this.firstError = null;
63
+ pending.forEach(({ resolve, reject }) => {
64
+ if (pendingError) {
65
+ reject(pendingError);
66
+ }
67
+ else {
68
+ resolve();
69
+ }
70
+ });
52
71
  }
53
72
  }
54
73
  }
@@ -77,8 +96,8 @@ class Concurrency {
77
96
  * @returns Promise, resolved after there is no running/waiting worker.
78
97
  */
79
98
  waitForCompletion() {
80
- return new Promise((res) => {
81
- this.waiting.push(res);
99
+ return new Promise((resolve, reject) => {
100
+ this.waiting.push({ resolve, reject });
82
101
  });
83
102
  }
84
103
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.32",
3
+ "version": "0.0.34",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",