mx-cloud 0.0.25 → 0.0.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -49,7 +49,6 @@ const adblocker_playwright_1 = require("@cliqz/adblocker-playwright");
49
49
  const cross_fetch_1 = __importDefault(require("cross-fetch"));
50
50
  const path_1 = __importDefault(require("path"));
51
51
  const events_1 = require("events");
52
- const logic_1 = require("./types/logic");
53
52
  const utils_1 = require("./utils/utils");
54
53
  const concurrency_1 = __importDefault(require("./utils/concurrency"));
55
54
  const preprocessor_1 = __importDefault(require("./preprocessor"));
@@ -72,10 +71,21 @@ class Interpreter extends events_1.EventEmitter {
72
71
  this.scrapeListCounter = 0;
73
72
  this.serializableDataByType = {
74
73
  scrapeList: {},
75
- scrapeSchema: {}
74
+ scrapeSchema: {},
75
+ crawl: {},
76
+ search: {}
76
77
  };
77
78
  this.pendingDeepExtraction = null;
78
79
  this.isInDeepExtractionPhase = false;
80
+ this.deepExtractionStats = {
81
+ totalUrlsFound: 0,
82
+ matchedUrls: 0,
83
+ successfulExtractions: 0,
84
+ failedExtractions: 0,
85
+ skippedDueToPattern: 0
86
+ };
87
+ this.totalActions = 0;
88
+ this.executedActions = 0;
79
89
  this.workflow = workflow.workflow;
80
90
  this.initializedWorkflow = null;
81
91
  this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
@@ -134,189 +144,12 @@ class Interpreter extends events_1.EventEmitter {
134
144
  }
135
145
  });
136
146
  }
137
- // private getSelectors(workflow: Workflow, actionId: number): string[] {
138
- // const selectors: string[] = [];
139
- // // Validate actionId
140
- // if (actionId <= 0) {
141
- // console.log("No previous selectors to collect.");
142
- // return selectors; // Empty array as there are no previous steps
143
- // }
144
- // // Iterate from the start up to (but not including) actionId
145
- // for (let index = 0; index < actionId; index++) {
146
- // const currentSelectors = workflow[index]?.where?.selectors;
147
- // console.log(`Selectors at step ${index}:`, currentSelectors);
148
- // if (currentSelectors && currentSelectors.length > 0) {
149
- // currentSelectors.forEach((selector) => {
150
- // if (!selectors.includes(selector)) {
151
- // selectors.push(selector); // Avoid duplicates
152
- // }
153
- // });
154
- // }
155
- // }
156
- // console.log("Collected Selectors:", selectors);
157
- // return selectors;
158
- // }
159
- getSelectors(workflow) {
160
- var _a, _b;
161
- const selectorsSet = new Set();
162
- if (workflow.length === 0) {
163
- return [];
164
- }
165
- for (let index = workflow.length - 1; index >= 0; index--) {
166
- const currentSelectors = (_b = (_a = workflow[index]) === null || _a === void 0 ? void 0 : _a.where) === null || _b === void 0 ? void 0 : _b.selectors;
167
- if (currentSelectors && currentSelectors.length > 0) {
168
- currentSelectors.forEach((selector) => selectorsSet.add(selector));
169
- return Array.from(selectorsSet);
170
- }
171
- }
172
- return [];
173
- }
174
- /**
175
- * Returns the context object from given Page and the current workflow.\
176
- * \
177
- * `workflow` is used for selector extraction - function searches for used selectors to
178
- * look for later in the page's context.
179
- * @param page Playwright Page object
180
- * @param workflow Current **initialized** workflow (array of where-what pairs).
181
- * @returns {PageState} State of the current page.
182
- */
183
- getState(page, workflowCopy, selectors) {
184
- return __awaiter(this, void 0, void 0, function* () {
185
- /**
186
- * All the selectors present in the current Workflow
187
- */
188
- // const selectors = Preprocessor.extractSelectors(workflow);
189
- // console.log("Current selectors:", selectors);
190
- /**
191
- * Determines whether the element targetted by the selector is [actionable](https://playwright.dev/docs/actionability).
192
- * @param selector Selector to be queried
193
- * @returns True if the targetted element is actionable, false otherwise.
194
- */
195
- // const actionable = async (selector: string): Promise<boolean> => {
196
- // try {
197
- // const proms = [
198
- // page.isEnabled(selector, { timeout: 10000 }),
199
- // page.isVisible(selector, { timeout: 10000 }),
200
- // ];
201
- // return await Promise.all(proms).then((bools) => bools.every((x) => x));
202
- // } catch (e) {
203
- // // log(<Error>e, Level.ERROR);
204
- // return false;
205
- // }
206
- // };
207
- /**
208
- * Object of selectors present in the current page.
209
- */
210
- // const presentSelectors: SelectorArray = await Promise.all(
211
- // selectors.map(async (selector) => {
212
- // if (await actionable(selector)) {
213
- // return [selector];
214
- // }
215
- // return [];
216
- // }),
217
- // ).then((x) => x.flat());
218
- const presentSelectors = yield Promise.all(selectors.map((selector) => __awaiter(this, void 0, void 0, function* () {
219
- try {
220
- yield page.waitForSelector(selector, { state: 'attached' });
221
- return [selector];
222
- }
223
- catch (e) {
224
- return [];
225
- }
226
- }))).then((x) => x.flat());
227
- const action = workflowCopy[workflowCopy.length - 1];
228
- // console.log("Next action:", action)
229
- let url = page.url();
230
- if (action && action.where.url !== url && action.where.url !== "about:blank") {
231
- url = action.where.url;
232
- }
233
- return {
234
- url,
235
- cookies: (yield page.context().cookies([page.url()]))
236
- .reduce((p, cookie) => (Object.assign(Object.assign({}, p), { [cookie.name]: cookie.value })), {}),
237
- selectors: presentSelectors,
238
- };
239
- });
240
- }
241
- /**
242
- * Tests if the given action is applicable with the given context.
243
- * @param where Tested *where* condition
244
- * @param context Current browser context.
245
- * @returns True if `where` is applicable in the given context, false otherwise
246
- */
247
- applicable(where, context, usedActions = []) {
248
- /**
249
- * Given two arbitrary objects, determines whether `subset` is a subset of `superset`.\
250
- * \
251
- * For every key in `subset`, there must be a corresponding key with equal scalar
252
- * value in `superset`, or `inclusive(subset[key], superset[key])` must hold.
253
- * @param subset Arbitrary non-cyclic JS object (where clause)
254
- * @param superset Arbitrary non-cyclic JS object (browser context)
255
- * @returns `true` if `subset <= superset`, `false` otherwise.
256
- */
257
- const inclusive = (subset, superset) => (Object.entries(subset).every(([key, value]) => {
258
- /**
259
- * Arrays are compared without order (are transformed into objects before comparison).
260
- */
261
- const parsedValue = Array.isArray(value) ? (0, utils_1.arrayToObject)(value) : value;
262
- const parsedSuperset = {};
263
- parsedSuperset[key] = Array.isArray(superset[key])
264
- ? (0, utils_1.arrayToObject)(superset[key])
265
- : superset[key];
266
- if ((key === 'url' || key === 'selectors') &&
267
- Array.isArray(value) && Array.isArray(superset[key]) &&
268
- value.length === 0 && superset[key].length === 0) {
269
- return true;
270
- }
271
- if (key === 'selectors' && Array.isArray(value) && Array.isArray(superset[key])) {
272
- return value.some(selector => superset[key].includes(selector));
273
- }
274
- // Every `subset` key must exist in the `superset` and
275
- // have the same value (strict equality), or subset[key] <= superset[key]
276
- return parsedSuperset[key]
277
- && ((parsedSuperset[key] === parsedValue)
278
- || ((parsedValue).constructor.name === 'RegExp' && parsedValue.test(parsedSuperset[key]))
279
- || ((parsedValue).constructor.name !== 'RegExp'
280
- && typeof parsedValue === 'object' && inclusive(parsedValue, parsedSuperset[key])));
281
- }));
282
- // Every value in the "where" object should be compliant to the current state.
283
- return Object.entries(where).every(([key, value]) => {
284
- if (logic_1.operators.includes(key)) {
285
- const array = Array.isArray(value)
286
- ? value
287
- : Object.entries(value).map((a) => Object.fromEntries([a]));
288
- // every condition is treated as a single context
289
- switch (key) {
290
- case '$and':
291
- return array === null || array === void 0 ? void 0 : array.every((x) => this.applicable(x, context));
292
- case '$or':
293
- return array === null || array === void 0 ? void 0 : array.some((x) => this.applicable(x, context));
294
- case '$not':
295
- return !this.applicable(value, context); // $not should be a unary operator
296
- default:
297
- throw new Error('Undefined logic operator.');
298
- }
299
- }
300
- else if (logic_1.meta.includes(key)) {
301
- const testRegexString = (x) => {
302
- if (typeof value === 'string') {
303
- return x === value;
304
- }
305
- return value.test(x);
306
- };
307
- switch (key) {
308
- case '$before':
309
- return !usedActions.find(testRegexString);
310
- case '$after':
311
- return !!usedActions.find(testRegexString);
312
- default:
313
- throw new Error('Undefined meta operator.');
314
- }
315
- }
316
- else {
317
- // Current key is a base condition (url, cookies, selectors)
318
- return inclusive({ [key]: value }, context);
319
- }
147
+ callWithTimeout(callback_1) {
148
+ return __awaiter(this, arguments, void 0, function* (callback, timeoutMs = 30000, operationName = 'callback') {
149
+ return Promise.race([
150
+ Promise.resolve(callback()),
151
+ new Promise((_, reject) => setTimeout(() => reject(new Error(`${operationName} timeout after ${timeoutMs}ms`)), timeoutMs))
152
+ ]);
320
153
  });
321
154
  }
322
155
  /**
@@ -363,25 +196,20 @@ class Interpreter extends events_1.EventEmitter {
363
196
  this.options.debugChannel.setActionType("screenshot");
364
197
  }
365
198
  const screenshotBuffer = yield page.screenshot(Object.assign(Object.assign({}, params), { path: undefined }));
366
- // Prefer explicit nameOverride (from workflow step.name or computed action name)
367
- // If nameOverride is provided (non-empty) use it *as-is*.
368
- // Only use counter-appended name when no nameOverride is available.
369
199
  const explicitName = (typeof nameOverride === 'string' && nameOverride.trim().length > 0) ? nameOverride.trim() : null;
370
200
  let screenshotName;
371
201
  if (explicitName) {
372
202
  screenshotName = explicitName;
373
203
  }
374
204
  else {
375
- // If no explicit name, produce a readable generated name with a counter
376
205
  this.screenshotCounter += 1;
377
206
  screenshotName = `Screenshot ${this.screenshotCounter}`;
378
207
  }
379
- // Pass structured metadata (name included) to binaryCallback
380
- yield this.options.binaryCallback({
208
+ yield this.callWithTimeout(() => this.options.binaryCallback({
381
209
  name: screenshotName,
382
210
  data: screenshotBuffer,
383
211
  mimeType: "image/png",
384
- }, "image/png");
212
+ }, "image/png"), 30000, 'binaryCallback (screenshot)');
385
213
  }),
386
214
  enqueueLinks: (selector) => __awaiter(this, void 0, void 0, function* () {
387
215
  var _a;
@@ -430,7 +258,7 @@ class Interpreter extends events_1.EventEmitter {
430
258
  }
431
259
  yield this.ensureScriptsLoaded(page);
432
260
  const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
433
- yield this.options.serializableCallback(scrapeResults);
261
+ yield this.callWithTimeout(() => this.options.serializableCallback(scrapeResults), 30000, 'serializableCallback (scrape)');
434
262
  }),
435
263
  scrapeSchema: (schema_1, ...args_1) => __awaiter(this, [schema_1, ...args_1], void 0, function* (schema, actionName = "") {
436
264
  var _a;
@@ -442,7 +270,7 @@ class Interpreter extends events_1.EventEmitter {
442
270
  this.options.debugChannel.setActionType('scrapeSchema');
443
271
  }
444
272
  if (this.options.mode && this.options.mode === 'editor') {
445
- yield this.options.serializableCallback({});
273
+ yield this.callWithTimeout(() => this.options.serializableCallback({}), 30000, 'serializableCallback (scrapeSchema editor mode)');
446
274
  return;
447
275
  }
448
276
  yield this.ensureScriptsLoaded(page);
@@ -492,22 +320,32 @@ class Interpreter extends events_1.EventEmitter {
492
320
  this.serializableDataByType[actionType][name] = [];
493
321
  }
494
322
  this.serializableDataByType[actionType][name] = [...this.cumulativeResults];
495
- yield this.options.serializableCallback({
323
+ yield this.callWithTimeout(() => this.options.serializableCallback({
496
324
  scrapeList: this.serializableDataByType.scrapeList,
497
325
  scrapeSchema: this.serializableDataByType.scrapeSchema
498
- });
326
+ }), 30000, 'serializableCallback (scrapeSchema)');
327
+ const MAX_CUMULATIVE_RESULTS = 1000;
328
+ if (this.cumulativeResults.length > MAX_CUMULATIVE_RESULTS) {
329
+ this.cumulativeResults = this.cumulativeResults.slice(-500);
330
+ }
331
+ const MAX_STORED_SCHEMAS = 50;
332
+ const schemaKeys = Object.keys(this.serializableDataByType[actionType]);
333
+ if (schemaKeys.length > MAX_STORED_SCHEMAS) {
334
+ const sortedKeys = schemaKeys.sort();
335
+ const keysToRemove = sortedKeys.slice(0, schemaKeys.length - MAX_STORED_SCHEMAS);
336
+ keysToRemove.forEach(key => {
337
+ delete this.serializableDataByType[actionType][key];
338
+ });
339
+ }
499
340
  if (this.options.robotType === 'deep-extract' && !this.isInDeepExtractionPhase && this.initializedWorkflow) {
500
341
  if (!this.pendingDeepExtraction) {
501
- console.log('DEBUG: Building hierarchical deep extraction plan from scrapeSchema...');
502
342
  const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
503
343
  if (hierarchyData.length > 0) {
504
- const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
344
+ const nextLevelIndex = 0;
505
345
  const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
506
346
  this.log(`Root scrapeSchema will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
507
- // Extract URLs from schema fields
508
347
  const urls = yield this.extractHrefsFromPage(page, schema);
509
348
  this.log(`scrapeSchema extracted ${urls.length} URLs from field selectors`, logger_1.Level.LOG);
510
- // Filter URLs against pattern
511
349
  const rootUrlMappings = urls
512
350
  .map((url, index) => ({
513
351
  scrapeListIndex: index,
@@ -524,9 +362,6 @@ class Interpreter extends events_1.EventEmitter {
524
362
  }))
525
363
  };
526
364
  }
527
- else {
528
- console.log('DEBUG: No goto actions found, deep extraction skipped');
529
- }
530
365
  }
531
366
  else {
532
367
  this.log(`[Deep Extract] scrapeSchema "${name}" extracting URLs during workflow execution`, logger_1.Level.LOG);
@@ -585,7 +420,7 @@ class Interpreter extends events_1.EventEmitter {
585
420
  this.options.debugChannel.setActionType('scrapeList');
586
421
  }
587
422
  if (this.options.mode && this.options.mode === 'editor') {
588
- yield this.options.serializableCallback({});
423
+ yield this.callWithTimeout(() => this.options.serializableCallback({}), 30000, 'serializableCallback (scrapeList editor mode)');
589
424
  return;
590
425
  }
591
426
  try {
@@ -613,10 +448,9 @@ class Interpreter extends events_1.EventEmitter {
613
448
  const paginationUrls = paginationResult.urls;
614
449
  if (this.options.robotType === 'deep-extract' && this.initializedWorkflow && scrapeResults.length > 0) {
615
450
  if (!this.pendingDeepExtraction) {
616
- console.log('DEBUG: Building hierarchical deep extraction plan from pagination...');
617
451
  const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
618
452
  if (hierarchyData.length > 0) {
619
- const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
453
+ const nextLevelIndex = 0;
620
454
  const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
621
455
  this.log(`Root scrapeList (pagination) will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
622
456
  const rootUrlMappings = this.filterDeepExtractionUrlsFromExtracted(paginationUrls, scrapeResults, nextLevelGotoPattern);
@@ -625,6 +459,9 @@ class Interpreter extends events_1.EventEmitter {
625
459
  hierarchy: hierarchyData.map((level, idx) => ({
626
460
  gotoPattern: level.gotoPattern,
627
461
  actionsToExecute: level.actionsToExecute,
462
+ sourceActionName: level.sourceActionName,
463
+ sourceActionType: level.sourceActionType,
464
+ deepExtractionLimit: level.deepExtractionLimit,
628
465
  urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
629
466
  }))
630
467
  };
@@ -634,12 +471,30 @@ class Interpreter extends events_1.EventEmitter {
634
471
  this.log(`[Deep Extract] scrapeList (pagination) "${actionName}" extracting URLs`, logger_1.Level.LOG);
635
472
  const hierarchy = this.pendingDeepExtraction.hierarchy;
636
473
  if (hierarchy && hierarchy.length > 0) {
637
- const nextLevelIndex = hierarchy.length >= 3 ? hierarchy.length - 3 : 0;
638
- if (nextLevelIndex >= 0 && hierarchy[nextLevelIndex]) {
639
- const nextGotoPattern = hierarchy[nextLevelIndex].gotoPattern;
474
+ let targetLevelIndex = -1;
475
+ for (let i = hierarchy.length - 1; i >= 0; i--) {
476
+ if (hierarchy[i].urlMappings.length === 0) {
477
+ targetLevelIndex = i;
478
+ break;
479
+ }
480
+ }
481
+ if (targetLevelIndex >= 0) {
482
+ const nextGotoPattern = hierarchy[targetLevelIndex].gotoPattern;
640
483
  this.log(`[Deep Extract] Extracting URLs for pattern: ${nextGotoPattern}`, logger_1.Level.LOG);
641
484
  const urlMappings = this.filterDeepExtractionUrlsFromExtracted(paginationUrls, scrapeResults, nextGotoPattern);
642
485
  this.log(`[Deep Extract] Found ${urlMappings.filter(m => m.url !== null).length} matching URLs`, logger_1.Level.LOG);
486
+ if (hierarchy[targetLevelIndex].urlMappings.length > 0) {
487
+ const existingUrls = new Set(hierarchy[targetLevelIndex].urlMappings.map(m => m.url).filter(u => u !== null));
488
+ const newUrls = urlMappings.filter(m => m.url !== null && !existingUrls.has(m.url));
489
+ if (newUrls.length > 0) {
490
+ const startIndex = hierarchy[targetLevelIndex].urlMappings.length;
491
+ hierarchy[targetLevelIndex].urlMappings.push(...newUrls.map((m, idx) => ({ index: startIndex + idx, url: m.url })));
492
+ this.log(`[Deep Extract] Merged ${newUrls.length} new URLs from pagination`, logger_1.Level.LOG);
493
+ }
494
+ }
495
+ else {
496
+ hierarchy[targetLevelIndex].urlMappings = urlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url }));
497
+ }
643
498
  const validUrls = urlMappings.filter(m => m.url !== null);
644
499
  if (validUrls.length > 0) {
645
500
  const sampleSize = Math.min(3, validUrls.length);
@@ -669,17 +524,24 @@ class Interpreter extends events_1.EventEmitter {
669
524
  this.serializableDataByType[actionType][name] = [];
670
525
  }
671
526
  this.serializableDataByType[actionType][name].push(...scrapeResults);
672
- yield this.options.serializableCallback({
527
+ yield this.callWithTimeout(() => this.options.serializableCallback({
673
528
  scrapeList: this.serializableDataByType.scrapeList,
674
529
  scrapeSchema: this.serializableDataByType.scrapeSchema
675
- });
676
- console.log(`DEBUG: Checking deep extract condition: robotType=${this.options.robotType}, hasWorkflow=${!!currentWorkflow}, alreadyPending=${!!this.pendingDeepExtraction}`);
530
+ }), 30000, 'serializableCallback (scrapeList)');
531
+ const MAX_STORED_LISTS = 50;
532
+ const listKeys = Object.keys(this.serializableDataByType[actionType]);
533
+ if (listKeys.length > MAX_STORED_LISTS) {
534
+ const sortedKeys = listKeys.sort();
535
+ const keysToRemove = sortedKeys.slice(0, listKeys.length - MAX_STORED_LISTS);
536
+ keysToRemove.forEach(key => {
537
+ delete this.serializableDataByType[actionType][key];
538
+ });
539
+ }
677
540
  if (this.options.robotType === 'deep-extract' && !this.isInDeepExtractionPhase && this.initializedWorkflow) {
678
541
  if (!this.pendingDeepExtraction) {
679
- console.log('DEBUG: Building hierarchical deep extraction plan...');
680
542
  const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
681
543
  if (hierarchyData.length > 0) {
682
- const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
544
+ const nextLevelIndex = 0;
683
545
  const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
684
546
  this.log(`Root scrapeList will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
685
547
  const rootUrlMappings = yield this.filterDeepExtractionUrls(page, config.listSelector, scrapeResults, nextLevelGotoPattern);
@@ -688,13 +550,13 @@ class Interpreter extends events_1.EventEmitter {
688
550
  hierarchy: hierarchyData.map((level, idx) => ({
689
551
  gotoPattern: level.gotoPattern,
690
552
  actionsToExecute: level.actionsToExecute,
553
+ sourceActionName: level.sourceActionName,
554
+ sourceActionType: level.sourceActionType,
555
+ deepExtractionLimit: level.deepExtractionLimit,
691
556
  urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
692
557
  }))
693
558
  };
694
559
  }
695
- else {
696
- console.log('DEBUG: No goto actions found, deep extraction skipped');
697
- }
698
560
  }
699
561
  else {
700
562
  this.log(`[Deep Extract] scrapeList "${name}" extracting URLs during workflow execution`, logger_1.Level.LOG);
@@ -767,7 +629,7 @@ class Interpreter extends events_1.EventEmitter {
767
629
  const scrapeResults = yield page.evaluate((listSelector) => {
768
630
  return window.scrapeListAuto(listSelector);
769
631
  }, config.listSelector);
770
- yield this.options.serializableCallback(scrapeResults);
632
+ yield this.callWithTimeout(() => this.options.serializableCallback(scrapeResults), 30000, 'serializableCallback (scrapeListAuto)');
771
633
  }),
772
634
  scroll: (pages) => __awaiter(this, void 0, void 0, function* () {
773
635
  var _a;
@@ -805,11 +667,622 @@ class Interpreter extends events_1.EventEmitter {
805
667
  this.emit('flag', page, res);
806
668
  });
807
669
  }),
670
+ crawl: (crawlConfig) => __awaiter(this, void 0, void 0, function* () {
671
+ var _a;
672
+ if (this.isAborted) {
673
+ this.log('Workflow aborted, stopping crawl', logger_1.Level.WARN);
674
+ return;
675
+ }
676
+ if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
677
+ this.options.debugChannel.setActionType('crawl');
678
+ }
679
+ this.log('Starting crawl operation', logger_1.Level.LOG);
680
+ try {
681
+ // Get current page URL and log it
682
+ const currentUrl = page.url();
683
+ this.log(`Current page URL: ${currentUrl}`, logger_1.Level.LOG);
684
+ // If page is on about:blank or empty, we need to wait for navigation
685
+ if (!currentUrl || currentUrl === 'about:blank' || currentUrl === '') {
686
+ this.log('Page not yet navigated, waiting for navigation...', logger_1.Level.WARN);
687
+ yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
688
+ }
689
+ const baseUrl = page.url();
690
+ this.log(`Using base URL for crawl: ${baseUrl}`, logger_1.Level.LOG);
691
+ const parsedBase = new URL(baseUrl);
692
+ const baseDomain = parsedBase.hostname;
693
+ let discoveredUrls = [];
694
+ // Step 1: Sitemap discovery using XMLHttpRequest to avoid polyfills
695
+ if (crawlConfig.useSitemap) {
696
+ this.log('Fetching sitemap URLs...', logger_1.Level.LOG);
697
+ try {
698
+ const sitemapUrl = `${parsedBase.protocol}//${parsedBase.host}/sitemap.xml`;
699
+ // Use XMLHttpRequest instead of fetch to avoid polyfills
700
+ const sitemapUrls = yield page.evaluate((url) => {
701
+ return new Promise((resolve) => {
702
+ const xhr = new XMLHttpRequest();
703
+ xhr.open('GET', url, true);
704
+ xhr.onload = function () {
705
+ if (xhr.status === 200) {
706
+ const text = xhr.responseText;
707
+ const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
708
+ const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
709
+ resolve(urls);
710
+ }
711
+ else {
712
+ resolve([]);
713
+ }
714
+ };
715
+ xhr.onerror = function () {
716
+ resolve([]);
717
+ };
718
+ xhr.send();
719
+ });
720
+ }, sitemapUrl);
721
+ if (sitemapUrls.length > 0) {
722
+ const nestedSitemaps = sitemapUrls.filter(url => url.endsWith('/sitemap') || url.endsWith('sitemap.xml') || url.includes('/sitemap/'));
723
+ const regularUrls = sitemapUrls.filter(url => !url.endsWith('/sitemap') && !url.endsWith('sitemap.xml') && !url.includes('/sitemap/'));
724
+ discoveredUrls.push(...regularUrls);
725
+ this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, logger_1.Level.LOG);
726
+ for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
727
+ try {
728
+ this.log(`Fetching nested sitemap: ${nestedUrl}`, logger_1.Level.LOG);
729
+ const nestedUrls = yield page.evaluate((url) => {
730
+ return new Promise((resolve) => {
731
+ const xhr = new XMLHttpRequest();
732
+ xhr.open('GET', url, true);
733
+ xhr.onload = function () {
734
+ if (xhr.status === 200) {
735
+ const text = xhr.responseText;
736
+ const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
737
+ const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
738
+ resolve(urls);
739
+ }
740
+ else {
741
+ resolve([]);
742
+ }
743
+ };
744
+ xhr.onerror = function () {
745
+ resolve([]);
746
+ };
747
+ xhr.send();
748
+ });
749
+ }, nestedUrl);
750
+ if (nestedUrls.length > 0) {
751
+ discoveredUrls.push(...nestedUrls);
752
+ this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, logger_1.Level.LOG);
753
+ }
754
+ }
755
+ catch (error) {
756
+ this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${error.message}`, logger_1.Level.WARN);
757
+ }
758
+ }
759
+ this.log(`Total URLs from all sitemaps: ${discoveredUrls.length}`, logger_1.Level.LOG);
760
+ }
761
+ else {
762
+ this.log('No URLs found in sitemap or sitemap not available', logger_1.Level.WARN);
763
+ }
764
+ }
765
+ catch (error) {
766
+ this.log(`Sitemap fetch failed: ${error.message}`, logger_1.Level.WARN);
767
+ }
768
+ }
769
+ if (crawlConfig.followLinks) {
770
+ this.log('Extracting links from current page...', logger_1.Level.LOG);
771
+ try {
772
+ yield page.waitForLoadState('load', { timeout: 15000 }).catch(() => { });
773
+ yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {
774
+ this.log('Network did not become idle, continuing anyway', logger_1.Level.WARN);
775
+ });
776
+ yield new Promise(resolve => setTimeout(resolve, 5000));
777
+ const anchorCount = yield page.evaluate(() => {
778
+ return document.querySelectorAll('a').length;
779
+ });
780
+ this.log(`Page has ${anchorCount} total anchor tags`, logger_1.Level.LOG);
781
+ const pageLinks = yield page.evaluate(() => {
782
+ const links = [];
783
+ const allAnchors = document.querySelectorAll('a');
784
+ console.log('Total anchors found:', allAnchors.length);
785
+ for (let i = 0; i < allAnchors.length; i++) {
786
+ const anchor = allAnchors[i];
787
+ const href = anchor.getAttribute('href');
788
+ const fullHref = anchor.href;
789
+ if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
790
+ links.push(fullHref);
791
+ }
792
+ }
793
+ console.log('Links extracted:', links.length);
794
+ return links;
795
+ });
796
+ discoveredUrls.push(...pageLinks);
797
+ this.log(`Found ${pageLinks.length} links from page`, logger_1.Level.LOG);
798
+ }
799
+ catch (error) {
800
+ this.log(`Link extraction failed: ${error.message}`, logger_1.Level.WARN);
801
+ }
802
+ }
803
+ const filteredUrls = discoveredUrls.filter(url => {
804
+ try {
805
+ const urlObj = new URL(url);
806
+ if (crawlConfig.mode === 'domain') {
807
+ if (urlObj.hostname !== baseDomain)
808
+ return false;
809
+ }
810
+ else if (crawlConfig.mode === 'subdomain') {
811
+ if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain)
812
+ return false;
813
+ }
814
+ else if (crawlConfig.mode === 'path') {
815
+ if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname))
816
+ return false;
817
+ }
818
+ if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
819
+ const matches = crawlConfig.includePaths.some(pattern => {
820
+ const regex = new RegExp(pattern);
821
+ return regex.test(url);
822
+ });
823
+ if (!matches)
824
+ return false;
825
+ }
826
+ if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
827
+ const matches = crawlConfig.excludePaths.some(pattern => {
828
+ const regex = new RegExp(pattern);
829
+ return regex.test(url);
830
+ });
831
+ if (matches)
832
+ return false;
833
+ }
834
+ return true;
835
+ }
836
+ catch (error) {
837
+ return false;
838
+ }
839
+ });
840
+ const uniqueUrls = Array.from(new Set(filteredUrls.map(url => {
841
+ return url.replace(/#.*$/, '').replace(/\/$/, '');
842
+ })));
843
+ const basePathname = parsedBase.pathname;
844
+ const prioritizedUrls = uniqueUrls.sort((a, b) => {
845
+ try {
846
+ const aUrl = new URL(a);
847
+ const bUrl = new URL(b);
848
+ const aMatchesBase = aUrl.pathname.startsWith(basePathname);
849
+ const bMatchesBase = bUrl.pathname.startsWith(basePathname);
850
+ if (aMatchesBase && !bMatchesBase)
851
+ return -1;
852
+ if (!aMatchesBase && bMatchesBase)
853
+ return 1;
854
+ return 0;
855
+ }
856
+ catch (error) {
857
+ return 0;
858
+ }
859
+ });
860
+ const finalUrls = prioritizedUrls.slice(0, crawlConfig.limit);
861
+ this.log(`Crawl discovered ${finalUrls.length} URLs (from ${discoveredUrls.length} total)`, logger_1.Level.LOG);
862
+ this.log(`Starting to scrape content from ${finalUrls.length} discovered URLs...`, logger_1.Level.LOG);
863
+ const crawlResults = [];
864
+ for (let i = 0; i < finalUrls.length; i++) {
865
+ const url = finalUrls[i];
866
+ try {
867
+ this.log(`[${i + 1}/${finalUrls.length}] Scraping: ${url}`, logger_1.Level.LOG);
868
+ yield page.goto(url, {
869
+ waitUntil: 'domcontentloaded',
870
+ timeout: 30000
871
+ }).catch(() => {
872
+ this.log(`Failed to navigate to ${url}, skipping...`, logger_1.Level.WARN);
873
+ });
874
+ yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
875
+ const pageData = yield page.evaluate(() => {
876
+ var _a, _b;
877
+ const getMeta = (name) => {
878
+ const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
879
+ return (meta === null || meta === void 0 ? void 0 : meta.getAttribute('content')) || '';
880
+ };
881
+ const getAllMeta = () => {
882
+ const metadata = {};
883
+ const metaTags = document.querySelectorAll('meta');
884
+ metaTags.forEach(tag => {
885
+ const name = tag.getAttribute('name') || tag.getAttribute('property');
886
+ const content = tag.getAttribute('content');
887
+ if (name && content) {
888
+ metadata[name] = content;
889
+ }
890
+ });
891
+ return metadata;
892
+ };
893
+ const title = document.title || '';
894
+ const bodyText = ((_a = document.body) === null || _a === void 0 ? void 0 : _a.innerText) || '';
895
+ const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
896
+ elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
897
+ const html = document.documentElement.outerHTML;
898
+ const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
899
+ const allMetadata = getAllMeta();
900
+ return {
901
+ title,
902
+ description: getMeta('description'),
903
+ text: bodyText,
904
+ html: html,
905
+ links: links,
906
+ wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
907
+ metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
908
+ };
909
+ });
910
+ crawlResults.push({
911
+ metadata: Object.assign(Object.assign({}, pageData.metadata), { url: url, sourceURL: url }),
912
+ html: pageData.html,
913
+ text: pageData.text,
914
+ links: pageData.links,
915
+ wordCount: pageData.wordCount,
916
+ scrapedAt: new Date().toISOString()
917
+ });
918
+ this.log(`✓ Scraped ${url} (${pageData.wordCount} words)`, logger_1.Level.LOG);
919
+ }
920
+ catch (error) {
921
+ this.log(`Failed to scrape ${url}: ${error.message}`, logger_1.Level.WARN);
922
+ crawlResults.push({
923
+ url: url,
924
+ error: error.message,
925
+ scrapedAt: new Date().toISOString()
926
+ });
927
+ }
928
+ }
929
+ this.log(`Successfully scraped ${crawlResults.length} pages`, logger_1.Level.LOG);
930
+ const actionType = "crawl";
931
+ const actionName = "Crawl Results";
932
+ if (!this.serializableDataByType[actionType]) {
933
+ this.serializableDataByType[actionType] = {};
934
+ }
935
+ if (!this.serializableDataByType[actionType][actionName]) {
936
+ this.serializableDataByType[actionType][actionName] = [];
937
+ }
938
+ this.serializableDataByType[actionType][actionName] = crawlResults;
939
+ yield this.options.serializableCallback({
940
+ scrapeList: this.serializableDataByType.scrapeList || {},
941
+ scrapeSchema: this.serializableDataByType.scrapeSchema || {},
942
+ crawl: this.serializableDataByType.crawl || {}
943
+ });
944
+ }
945
+ catch (error) {
946
+ this.log(`Crawl action failed: ${error.message}`, logger_1.Level.ERROR);
947
+ throw new Error(`Crawl execution error: ${error.message}`);
948
+ }
949
+ }),
950
// Workflow action: perform a DuckDuckGo web search for searchConfig.query.
// In 'discover' mode it stores only the result list (url/title/description/position);
// in any other mode it additionally navigates to every result URL and scrapes the
// full page (html, text, links, metadata). Results are stored under
// serializableDataByType.search["Search Results"] and pushed through
// options.serializableCallback. Throws on unrecoverable failure.
// NOTE: this is transpiled output (__awaiter/yield == async/await; _a/_b are
// optional-chaining temporaries emitted by the compiler).
search: (searchConfig) => __awaiter(this, void 0, void 0, function* () {
    var _a, _b;
    // Bail out early if the workflow was aborted by the caller.
    if (this.isAborted) {
        this.log('Workflow aborted, stopping search', logger_1.Level.WARN);
        return;
    }
    // Report the current action type to the debug channel, if one is wired up.
    if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
        this.options.debugChannel.setActionType('search');
    }
    // NOTE(review): the caller-supplied provider is unconditionally overwritten —
    // only DuckDuckGo is implemented here. Confirm this mutation of the caller's
    // config object is intended.
    searchConfig.provider = 'duckduckgo';
    this.log(`Performing DuckDuckGo search for: ${searchConfig.query}`, logger_1.Level.LOG);
    try {
        // Build the search URL; optional time-range filter maps to DDG's `df` parameter.
        let searchUrl = `https://duckduckgo.com/?q=${encodeURIComponent(searchConfig.query)}`;
        if ((_b = searchConfig.filters) === null || _b === void 0 ? void 0 : _b.timeRange) {
            const timeMap = {
                'day': 'd',
                'week': 'w',
                'month': 'm',
                'year': 'y'
            };
            searchUrl += `&df=${timeMap[searchConfig.filters.timeRange]}`;
        }
        // Randomized delays before and after navigation — presumably to mimic human
        // pacing / reduce bot detection; TODO confirm the chosen ranges are deliberate.
        const initialDelay = 500 + Math.random() * 1000;
        yield new Promise(resolve => setTimeout(resolve, initialDelay));
        yield page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
        // Best-effort wait for full load; a timeout here is non-fatal.
        yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => {
            this.log('Load state timeout, continuing anyway', logger_1.Level.WARN);
        });
        const pageLoadDelay = 2000 + Math.random() * 1500;
        yield new Promise(resolve => setTimeout(resolve, pageLoadDelay));
        let searchResults = [];
        let retryCount = 0;
        const maxRetries = 2;
        // Outer retry loop: re-attempt extraction (with exponential backoff) while the
        // result list stays empty. Note it re-extracts from the current page without
        // re-navigating.
        while (searchResults.length === 0 && retryCount <= maxRetries) {
            if (retryCount > 0) {
                this.log(`Retry attempt ${retryCount}/${maxRetries} for DuckDuckGo search...`, logger_1.Level.LOG);
                // Exponential backoff (2s, 4s, ...) plus random jitter.
                const retryDelay = 1000 * Math.pow(2, retryCount) + Math.random() * 1000;
                yield new Promise(resolve => setTimeout(resolve, retryDelay));
            }
            this.log('Attempting to extract DuckDuckGo search results...', logger_1.Level.LOG);
            yield page.waitForSelector('[data-testid="result"], .result', { timeout: 5000 }).catch(() => {
                this.log('DuckDuckGo results not found on initial wait', logger_1.Level.WARN);
            });
            let currentResultCount = 0;
            // NOTE(review): searchConfig.limit is used with no default. If it is
            // undefined, maxLoadAttempts and the Math.min(..., limit) bound below both
            // become NaN, the loops never run and zero results are extracted — confirm
            // that callers always supply a numeric limit.
            const maxLoadAttempts = Math.ceil(searchConfig.limit / 10) * 2;
            let loadAttempts = 0;
            let noNewResultsCount = 0;
            // Pagination loop: scroll + click "More results" until we have enough result
            // elements, run out of attempts, or stall three times with no growth.
            while (currentResultCount < searchConfig.limit && loadAttempts < maxLoadAttempts && noNewResultsCount < 3) {
                const previousCount = currentResultCount;
                // Count result elements using the first selector (of several fallbacks)
                // that matches anything; DDG's markup varies between layouts.
                currentResultCount = yield page.evaluate(() => {
                    const selectors = [
                        '[data-testid="result"]',
                        'article[data-testid="result"]',
                        'li[data-layout="organic"]',
                        '.result',
                        'article[data-testid]'
                    ];
                    for (const selector of selectors) {
                        const elements = document.querySelectorAll(selector);
                        if (elements.length > 0) {
                            return elements.length;
                        }
                    }
                    return 0;
                });
                if (currentResultCount >= searchConfig.limit) {
                    this.log(`Reached desired result count: ${currentResultCount}`, logger_1.Level.LOG);
                    break;
                }
                // Track stalls: three consecutive iterations without growth end the loop.
                if (currentResultCount === previousCount) {
                    noNewResultsCount++;
                    this.log(`No new results after load more (attempt ${noNewResultsCount}/3)`, logger_1.Level.WARN);
                    if (noNewResultsCount >= 3)
                        break;
                }
                else {
                    noNewResultsCount = 0;
                    this.log(`Current results count: ${currentResultCount}/${searchConfig.limit}`, logger_1.Level.LOG);
                }
                // Scroll to the bottom so the "More results" button renders, then try to
                // click it via several selector fallbacks (visibility-checked via
                // offsetParent).
                yield page.evaluate(() => {
                    window.scrollTo(0, document.body.scrollHeight);
                });
                yield new Promise(resolve => setTimeout(resolve, 800));
                const loadMoreClicked = yield page.evaluate(() => {
                    const selectors = [
                        '#more-results',
                        'button:has-text("More results")',
                        'button:has-text("more results")',
                        'button[id*="more"]',
                        'button:has-text("Load more")'
                    ];
                    // NOTE(review): ':has-text(...)' is a Playwright-only pseudo-class; inside
                    // document.querySelector it throws and is swallowed by the catch below, so
                    // only '#more-results' and 'button[id*="more"]' can actually match here.
                    for (const selector of selectors) {
                        try {
                            const button = document.querySelector(selector);
                            if (button && button.offsetParent !== null) {
                                button.click();
                                console.log(`Clicked load more button with selector: ${selector}`);
                                return true;
                            }
                        }
                        catch (e) {
                            continue;
                        }
                    }
                    return false;
                });
                if (loadMoreClicked) {
                    this.log('Clicked "More results" button', logger_1.Level.LOG);
                    yield new Promise(resolve => setTimeout(resolve, 1500 + Math.random() * 1000));
                }
                else {
                    this.log('No "More results" button found, results may be limited', logger_1.Level.WARN);
                    break;
                }
                loadAttempts++;
            }
            this.log(`Finished pagination. Total results available: ${currentResultCount}`, logger_1.Level.LOG);
            // Extract up to `limit` results from the DOM: title, resolved URL,
            // cleaned snippet and 1-based position.
            searchResults = yield page.evaluate((limit) => {
                const results = [];
                // Strip leading "N units ago", "Mon DD, YYYY —" or ISO-date prefixes that
                // DDG prepends to snippets, and collapse whitespace.
                const cleanDescription = (text) => {
                    if (!text)
                        return '';
                    let cleaned = text.replace(/^\d+\s+(second|minute|hour|day|week|month|year)s?\s+ago\s*/i, '');
                    cleaned = cleaned.replace(/^[A-Z][a-z]{2}\s+\d{1,2},?\s+\d{4}\s*[—\-]\s*/i, '');
                    cleaned = cleaned.replace(/^\d{4}-\d{2}-\d{2}\s*[—\-]\s*/i, '');
                    cleaned = cleaned.trim().replace(/\s+/g, ' ');
                    return cleaned;
                };
                // Same selector fallback chain as the counting pass above.
                const selectors = [
                    '[data-testid="result"]',
                    'article[data-testid="result"]',
                    'li[data-layout="organic"]',
                    '.result',
                    'article[data-testid]'
                ];
                let allElements = [];
                for (const selector of selectors) {
                    const elements = Array.from(document.querySelectorAll(selector));
                    if (elements.length > 0) {
                        console.log(`Found ${elements.length} DDG elements with: ${selector}`);
                        allElements = elements;
                        break;
                    }
                }
                for (let i = 0; i < Math.min(allElements.length, limit); i++) {
                    const element = allElements[i];
                    // Prefer the link inside the title heading; fall back to any link in
                    // the result element.
                    const titleEl = element.querySelector('h2, [data-testid="result-title-a"], h3, [data-testid="result-title"]');
                    let linkEl = titleEl === null || titleEl === void 0 ? void 0 : titleEl.querySelector('a[href]');
                    if (!linkEl) {
                        linkEl = element.querySelector('a[href]');
                    }
                    if (!linkEl || !linkEl.href)
                        continue;
                    let actualUrl = linkEl.href;
                    // DDG sometimes wraps targets in a redirect URL; the real destination
                    // is URL-encoded in the `uddg` query parameter.
                    if (actualUrl.includes('uddg=')) {
                        try {
                            const urlParams = new URLSearchParams(actualUrl.split('?')[1]);
                            const uddgUrl = urlParams.get('uddg');
                            if (uddgUrl) {
                                actualUrl = decodeURIComponent(uddgUrl);
                            }
                        }
                        catch (e) {
                            console.log('Failed to parse uddg parameter:', e);
                        }
                    }
                    // Skip links that still point at duckduckgo.com (internal pages/ads).
                    if (actualUrl.includes('duckduckgo.com')) {
                        console.log(`Skipping DDG internal URL: ${actualUrl}`);
                        continue;
                    }
                    const descEl = element.querySelector('[data-result="snippet"], .result__snippet, [data-testid="result-snippet"]');
                    if (titleEl && titleEl.textContent && actualUrl) {
                        const rawDescription = ((descEl === null || descEl === void 0 ? void 0 : descEl.textContent) || '').trim();
                        const cleanedDescription = cleanDescription(rawDescription);
                        results.push({
                            url: actualUrl,
                            title: titleEl.textContent.trim(),
                            description: cleanedDescription,
                            position: results.length + 1
                        });
                    }
                }
                console.log(`Extracted ${results.length} DuckDuckGo search results`);
                return results;
            }, searchConfig.limit);
            if (searchResults.length === 0) {
                this.log(`No DuckDuckGo results found (attempt ${retryCount + 1}/${maxRetries + 1})`, logger_1.Level.WARN);
                retryCount++;
            }
            else {
                this.log(`Successfully extracted ${searchResults.length} results`, logger_1.Level.LOG);
                break;
            }
        }
        this.log(`Search found ${searchResults.length} results`, logger_1.Level.LOG);
        // Discover mode: persist the bare result list and return without visiting
        // any of the result URLs.
        if (searchConfig.mode === 'discover') {
            const actionType = "search";
            const actionName = "Search Results";
            if (!this.serializableDataByType[actionType]) {
                this.serializableDataByType[actionType] = {};
            }
            if (!this.serializableDataByType[actionType][actionName]) {
                this.serializableDataByType[actionType][actionName] = {};
            }
            const searchData = {
                query: searchConfig.query,
                provider: searchConfig.provider,
                filters: searchConfig.filters || {},
                resultsCount: searchResults.length,
                results: searchResults,
                searchedAt: new Date().toISOString()
            };
            this.serializableDataByType[actionType][actionName] = searchData;
            // Emit the full snapshot of every action type so the consumer always sees
            // a consistent shape.
            yield this.options.serializableCallback({
                scrapeList: this.serializableDataByType.scrapeList || {},
                scrapeSchema: this.serializableDataByType.scrapeSchema || {},
                crawl: this.serializableDataByType.crawl || {},
                search: this.serializableDataByType.search || {}
            });
            this.log(`Search completed in discover mode with ${searchResults.length} results`, logger_1.Level.LOG);
            return;
        }
        // Non-discover mode: visit each result sequentially and scrape its content.
        // Per-URL failures are recorded as error entries instead of aborting the run.
        this.log(`Starting to scrape content from ${searchResults.length} search results...`, logger_1.Level.LOG);
        const scrapedResults = [];
        for (let i = 0; i < searchResults.length; i++) {
            const result = searchResults[i];
            try {
                this.log(`[${i + 1}/${searchResults.length}] Scraping: ${result.url}`, logger_1.Level.LOG);
                // Navigation failure is logged and swallowed; the page.evaluate below
                // then runs against whatever page is currently loaded.
                yield page.goto(result.url, {
                    waitUntil: 'domcontentloaded',
                    timeout: 30000
                }).catch(() => {
                    this.log(`Failed to navigate to ${result.url}, skipping...`, logger_1.Level.WARN);
                });
                yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
                // Scrape the current page in one in-browser pass: title, meta tags,
                // visible text, full HTML (with internal data-mx-id markers stripped),
                // all link hrefs and a whitespace-token word count.
                const pageData = yield page.evaluate(() => {
                    var _a, _b;
                    // Read a single meta tag's content by name or property attribute.
                    const getMeta = (name) => {
                        const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
                        return (meta === null || meta === void 0 ? void 0 : meta.getAttribute('content')) || '';
                    };
                    // Collect all name/property -> content meta pairs into one object.
                    const getAllMeta = () => {
                        const metadata = {};
                        const metaTags = document.querySelectorAll('meta');
                        metaTags.forEach(tag => {
                            const name = tag.getAttribute('name') || tag.getAttribute('property');
                            const content = tag.getAttribute('content');
                            if (name && content) {
                                metadata[name] = content;
                            }
                        });
                        return metadata;
                    };
                    const title = document.title || '';
                    const bodyText = ((_a = document.body) === null || _a === void 0 ? void 0 : _a.innerText) || '';
                    // Remove this tool's own instrumentation attributes before capturing
                    // the HTML snapshot.
                    const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
                    elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
                    const html = document.documentElement.outerHTML;
                    const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
                    const allMetadata = getAllMeta();
                    return {
                        title,
                        description: getMeta('description'),
                        text: bodyText,
                        html: html,
                        links: links,
                        wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
                        // NOTE(review): statusCode is hard-coded to 200 — the real HTTP
                        // status is not observed here.
                        metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
                    };
                });
                // Combine the original search hit (query/position/title/snippet) with
                // the scraped page content.
                scrapedResults.push({
                    searchResult: {
                        query: searchConfig.query,
                        position: result.position,
                        searchTitle: result.title,
                        searchDescription: result.description,
                    },
                    metadata: Object.assign(Object.assign({}, pageData.metadata), { url: result.url, sourceURL: result.url }),
                    html: pageData.html,
                    text: pageData.text,
                    links: pageData.links,
                    wordCount: pageData.wordCount,
                    scrapedAt: new Date().toISOString()
                });
                this.log(`✓ Scraped ${result.url} (${pageData.wordCount} words)`, logger_1.Level.LOG);
            }
            catch (error) {
                // Record the failure but keep processing the remaining results.
                this.log(`Failed to scrape ${result.url}: ${error.message}`, logger_1.Level.WARN);
                scrapedResults.push({
                    searchResult: {
                        query: searchConfig.query,
                        position: result.position,
                        searchTitle: result.title,
                        searchDescription: result.description,
                    },
                    url: result.url,
                    error: error.message,
                    scrapedAt: new Date().toISOString()
                });
            }
        }
        this.log(`Successfully scraped ${scrapedResults.length} search results`, logger_1.Level.LOG);
        // Persist the scraped result set and emit the combined snapshot, mirroring
        // the discover branch above (plus the `mode` field).
        const actionType = "search";
        const actionName = "Search Results";
        if (!this.serializableDataByType[actionType]) {
            this.serializableDataByType[actionType] = {};
        }
        if (!this.serializableDataByType[actionType][actionName]) {
            this.serializableDataByType[actionType][actionName] = {};
        }
        const searchData = {
            query: searchConfig.query,
            provider: searchConfig.provider,
            filters: searchConfig.filters || {},
            mode: searchConfig.mode,
            resultsCount: scrapedResults.length,
            results: scrapedResults,
            searchedAt: new Date().toISOString()
        };
        this.serializableDataByType[actionType][actionName] = searchData;
        yield this.options.serializableCallback({
            scrapeList: this.serializableDataByType.scrapeList || {},
            scrapeSchema: this.serializableDataByType.scrapeSchema || {},
            crawl: this.serializableDataByType.crawl || {},
            search: this.serializableDataByType.search || {}
        });
    }
    catch (error) {
        // Unrecoverable failure: log and rethrow with action context for the caller.
        this.log(`Search action failed: ${error.message}`, logger_1.Level.ERROR);
        throw new Error(`Search execution error: ${error.message}`);
    }
}),
808
1282
  };
809
1283
  const executeAction = (invokee, methodName, args) => __awaiter(this, void 0, void 0, function* () {
810
1284
  console.log("Executing action:", methodName, args);
811
1285
  if (methodName === 'press' || methodName === 'type') {
812
- // Extract only the first two arguments for these methods
813
1286
  const limitedArgs = Array.isArray(args) ? args.slice(0, 2) : [args];
814
1287
  yield invokee[methodName](...limitedArgs);
815
1288
  return;
@@ -822,7 +1295,6 @@ class Interpreter extends events_1.EventEmitter {
822
1295
  }
823
1296
  });
824
1297
  for (const step of steps) {
825
- // Check abort flag before each step
826
1298
  if (this.isAborted) {
827
1299
  this.log('Workflow aborted during step execution', logger_1.Level.WARN);
828
1300
  return;
@@ -844,7 +1316,6 @@ class Interpreter extends events_1.EventEmitter {
844
1316
  }
845
1317
  try {
846
1318
  if (step.action in wawActions) {
847
- // "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
848
1319
  const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
849
1320
  if (step.action === 'screenshot') {
850
1321
  yield wawActions.screenshot(...(params !== null && params !== void 0 ? params : []), stepName !== null && stepName !== void 0 ? stepName : undefined);
@@ -861,7 +1332,6 @@ class Interpreter extends events_1.EventEmitter {
861
1332
  if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
862
1333
  this.options.debugChannel.setActionType(String(step.action));
863
1334
  }
864
- // Implements the dot notation for the "method name" in the workflow
865
1335
  const levels = String(step.action).split('.');
866
1336
  const methodName = levels[levels.length - 1];
867
1337
  let invokee = page;
@@ -870,7 +1340,6 @@ class Interpreter extends events_1.EventEmitter {
870
1340
  }
871
1341
  if (methodName === 'waitForLoadState') {
872
1342
  try {
873
- // Add timeout if not already specified
874
1343
  let args = step.args;
875
1344
  if (Array.isArray(args) && args.length === 1) {
876
1345
  args = [args[0], { timeout: 30000 }];
@@ -894,7 +1363,7 @@ class Interpreter extends events_1.EventEmitter {
894
1363
  }
895
1364
  catch (error) {
896
1365
  this.log(`Click action failed for selector ${(_b = step.args) === null || _b === void 0 ? void 0 : _b[0]}: ${error.message}`, logger_1.Level.WARN);
897
- continue; // Skip to next action
1366
+ continue;
898
1367
  }
899
1368
  }
900
1369
  }
@@ -905,7 +1374,6 @@ class Interpreter extends events_1.EventEmitter {
905
1374
  }
906
1375
  catch (error) {
907
1376
  this.log(`Action ${String(step.action)} failed: ${error.message}`, logger_1.Level.WARN);
908
- // Continue to next action instead of breaking
909
1377
  continue;
910
1378
  }
911
1379
  yield new Promise((res) => { setTimeout(res, 500); });
@@ -920,12 +1388,10 @@ class Interpreter extends events_1.EventEmitter {
920
1388
  }
921
1389
  const actionType = "scrapeList";
922
1390
  let actionName = providedActionName || "";
923
- // During deep extraction, ALWAYS auto-increment to create separate lists for each URL
924
1391
  if (!actionName || actionName.trim() === "" || this.isInDeepExtractionPhase) {
925
1392
  this.scrapeListCounter++;
926
1393
  actionName = `List ${this.scrapeListCounter}`;
927
1394
  }
928
- // Initialize storage for this action
929
1395
  if (!this.serializableDataByType[actionType]) {
930
1396
  this.serializableDataByType[actionType] = {};
931
1397
  }
@@ -933,23 +1399,21 @@ class Interpreter extends events_1.EventEmitter {
933
1399
  this.serializableDataByType[actionType][actionName] = [];
934
1400
  }
935
1401
  let allResults = [];
936
- let allUrls = []; // Track URLs alongside results for deep-extract
1402
+ let allUrls = [];
937
1403
  let previousHeight = 0;
938
1404
  let scrapedItems = new Set();
939
1405
  let visitedUrls = new Set();
940
1406
  const MAX_RETRIES = 3;
941
- const RETRY_DELAY = 1000; // 1 second delay between retries
1407
+ const RETRY_DELAY = 1000;
942
1408
  const MAX_UNCHANGED_RESULTS = 5;
943
1409
// Console helper: prefixes every message with the current page count and URL
// so interleaved scrape logs can be attributed to the page they came from.
const debugLog = (message, ...args) => {
    const contextPrefix = `[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`;
    console.log(contextPrefix, ...args);
};
946
1412
  const scrapeCurrentPage = () => __awaiter(this, void 0, void 0, function* () {
947
- // Check abort flag before scraping current page
948
1413
  if (this.isAborted) {
949
1414
  debugLog("Workflow aborted, stopping scrapeCurrentPage");
950
1415
  return;
951
1416
  }
952
- // Add timeout to prevent hanging on page evaluation
953
1417
  const evaluationPromise = page.evaluate((cfg) => window.scrapeList(cfg), config);
954
1418
  const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error('Page evaluation timeout')), 10000));
955
1419
  let results;
@@ -960,10 +1424,7 @@ class Interpreter extends events_1.EventEmitter {
960
1424
  debugLog(`Page evaluation failed: ${error.message}`);
961
1425
  return;
962
1426
  }
963
- // Extract URLs for ALL items BEFORE filtering duplicates
964
- // This ensures URL indices match result indices
965
1427
  const allItemUrls = yield this.extractUrlsFromCurrentPage(page, config.listSelector, results.length);
966
- // Filter results AND URLs together using the same uniqueness logic
967
1428
  const newResults = [];
968
1429
  const newUrls = [];
969
1430
  results.forEach((item, index) => {
@@ -971,28 +1432,35 @@ class Interpreter extends events_1.EventEmitter {
971
1432
  if (!scrapedItems.has(uniqueKey)) {
972
1433
  scrapedItems.add(uniqueKey);
973
1434
  newResults.push(item);
974
- newUrls.push(allItemUrls[index] || []); // Add corresponding URLs
1435
+ newUrls.push(allItemUrls[index] || []);
975
1436
  }
976
1437
  });
977
1438
  allResults = allResults.concat(newResults);
978
1439
  allUrls = allUrls.concat(newUrls);
979
1440
  debugLog("Results collected:", allResults.length);
980
- // Store in serializableDataByType and send structured callback
981
1441
  this.serializableDataByType[actionType][actionName] = [...allResults];
982
- yield this.options.serializableCallback({
1442
+ yield this.callWithTimeout(() => this.options.serializableCallback({
983
1443
  scrapeList: this.serializableDataByType.scrapeList,
984
1444
  scrapeSchema: this.serializableDataByType.scrapeSchema
985
- });
1445
+ }), 30000, 'serializableCallback (pagination scrapeList)');
1446
+ const MAX_STORED_LISTS = 50;
1447
+ const listKeys = Object.keys(this.serializableDataByType[actionType]);
1448
+ if (listKeys.length > MAX_STORED_LISTS) {
1449
+ const sortedKeys = listKeys.sort();
1450
+ const keysToRemove = sortedKeys.slice(0, listKeys.length - MAX_STORED_LISTS);
1451
+ keysToRemove.forEach(key => {
1452
+ delete this.serializableDataByType[actionType][key];
1453
+ });
1454
+ }
986
1455
  });
987
1456
// Returns true once the configured item limit has been reached, trimming both
// the result list and the parallel URL list down to exactly that limit so the
// two arrays stay index-aligned. Returns false when no limit is set or more
// items are still needed.
const checkLimit = () => {
    const maxItems = config.limit;
    if (!maxItems || allResults.length < maxItems) {
        return false;
    }
    allResults = allResults.slice(0, maxItems);
    allUrls = allUrls.slice(0, maxItems);
    return true;
};
995
- // Helper function to detect if a selector is XPath
996
1464
  const isXPathSelector = (selector) => {
997
1465
  return selector.startsWith('//') ||
998
1466
  selector.startsWith('/') ||
@@ -1004,11 +1472,9 @@ class Interpreter extends events_1.EventEmitter {
1004
1472
  selector.includes(' and ') ||
1005
1473
  selector.includes(' or ');
1006
1474
  };
1007
- // Helper function to wait for selector (CSS or XPath)
1008
1475
  const waitForSelectorUniversal = (selector_2, ...args_1) => __awaiter(this, [selector_2, ...args_1], void 0, function* (selector, options = {}) {
1009
1476
  try {
1010
1477
  if (isXPathSelector(selector)) {
1011
- // Use XPath locator
1012
1478
  const locator = page.locator(`xpath=${selector}`);
1013
1479
  yield locator.waitFor({
1014
1480
  state: 'attached',
@@ -1017,7 +1483,6 @@ class Interpreter extends events_1.EventEmitter {
1017
1483
  return yield locator.elementHandle();
1018
1484
  }
1019
1485
  else {
1020
- // Use CSS selector
1021
1486
  return yield page.waitForSelector(selector, {
1022
1487
  state: 'attached',
1023
1488
  timeout: options.timeout || 10000
@@ -1028,13 +1493,11 @@ class Interpreter extends events_1.EventEmitter {
1028
1493
  return null;
1029
1494
  }
1030
1495
  });
1031
- // Enhanced button finder with retry mechanism for both CSS and XPath selectors
1032
1496
  const findWorkingButton = (selectors) => __awaiter(this, void 0, void 0, function* () {
1033
1497
  const startTime = Date.now();
1034
1498
  const MAX_BUTTON_SEARCH_TIME = 15000;
1035
1499
  let updatedSelectors = [...selectors];
1036
1500
  for (let i = 0; i < selectors.length; i++) {
1037
- // Check overall timeout
1038
1501
  if (Date.now() - startTime > MAX_BUTTON_SEARCH_TIME) {
1039
1502
  debugLog(`Button search timeout reached (${MAX_BUTTON_SEARCH_TIME}ms), aborting`);
1040
1503
  break;
@@ -1044,7 +1507,6 @@ class Interpreter extends events_1.EventEmitter {
1044
1507
  let selectorSuccess = false;
1045
1508
  while (retryCount < MAX_RETRIES && !selectorSuccess) {
1046
1509
  try {
1047
- // Reduce timeout to prevent hanging on slow selectors
1048
1510
  const button = yield waitForSelectorUniversal(selector, { timeout: 2000 });
1049
1511
  if (button) {
1050
1512
  debugLog('Found working selector:', selector);
@@ -1055,7 +1517,6 @@ class Interpreter extends events_1.EventEmitter {
1055
1517
  };
1056
1518
  }
1057
1519
  else {
1058
- // Treat null result as failed attempt
1059
1520
  retryCount++;
1060
1521
  debugLog(`Selector "${selector}" not found: attempt ${retryCount}/${MAX_RETRIES}`);
1061
1522
  if (retryCount < MAX_RETRIES) {
@@ -1064,7 +1525,7 @@ class Interpreter extends events_1.EventEmitter {
1064
1525
  else {
1065
1526
  debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
1066
1527
  updatedSelectors = updatedSelectors.filter(s => s !== selector);
1067
- selectorSuccess = true; // Exit retry loop for this selector
1528
+ selectorSuccess = true;
1068
1529
  }
1069
1530
  }
1070
1531
  }
@@ -1077,7 +1538,7 @@ class Interpreter extends events_1.EventEmitter {
1077
1538
  else {
1078
1539
  debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
1079
1540
  updatedSelectors = updatedSelectors.filter(s => s !== selector);
1080
- selectorSuccess = true; // Exit retry loop for this selector
1541
+ selectorSuccess = true;
1081
1542
  }
1082
1543
  }
1083
1544
  }
@@ -1105,17 +1566,15 @@ class Interpreter extends events_1.EventEmitter {
1105
1566
  let availableSelectors = config.pagination.selector.split(',');
1106
1567
  let unchangedResultCounter = 0;
1107
1568
  let paginationIterations = 0;
1108
- const MAX_PAGINATION_ITERATIONS = 100; // Prevent infinite pagination
1569
+ const MAX_PAGINATION_ITERATIONS = 100;
1109
1570
  const paginationStartTime = Date.now();
1110
- const MAX_PAGINATION_TIME = 30 * 60 * 1000; // 30 minutes max for pagination
1571
+ const MAX_PAGINATION_TIME = 30 * 60 * 1000;
1111
1572
  try {
1112
1573
  while (true) {
1113
- // Check abort flag at start of each pagination iteration
1114
1574
  if (this.isAborted) {
1115
1575
  this.log('Workflow aborted during pagination loop', logger_1.Level.WARN);
1116
1576
  return { results: allResults, urls: allUrls };
1117
1577
  }
1118
- // Pagination circuit breakers
1119
1578
  if (++paginationIterations > MAX_PAGINATION_ITERATIONS) {
1120
1579
  debugLog(`Maximum pagination iterations reached (${MAX_PAGINATION_ITERATIONS}), stopping`);
1121
1580
  return { results: allResults, urls: allUrls };
@@ -1124,7 +1583,6 @@ class Interpreter extends events_1.EventEmitter {
1124
1583
  debugLog('Maximum pagination time reached (10 minutes), stopping');
1125
1584
  return { results: allResults, urls: allUrls };
1126
1585
  }
1127
- // Add async yield every 5 iterations to prevent event loop blocking
1128
1586
  if (paginationIterations % 5 === 0) {
1129
1587
  yield new Promise(resolve => setImmediate(resolve));
1130
1588
  }
@@ -1144,12 +1602,20 @@ class Interpreter extends events_1.EventEmitter {
1144
1602
  return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
1145
1603
  });
1146
1604
  const currentResultCount = allResults.length;
1147
- if (currentResultCount === previousResultCount) {
1605
+ const newItemsFound = currentResultCount - previousResultCount;
1606
+ if (newItemsFound === 0) {
1148
1607
  unchangedResultCounter++;
1149
1608
  if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
1150
1609
  return { results: allResults, urls: allUrls };
1151
1610
  }
1152
1611
  }
1612
+ else if (newItemsFound < 3) {
1613
+ unchangedResultCounter++;
1614
+ if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS * 2) {
1615
+ debugLog('Very slow pagination detected (< 3 items/scroll), stopping');
1616
+ return { results: allResults, urls: allUrls };
1617
+ }
1618
+ }
1153
1619
  else {
1154
1620
  unchangedResultCounter = 0;
1155
1621
  }
@@ -1169,12 +1635,20 @@ class Interpreter extends events_1.EventEmitter {
1169
1635
  yield page.waitForTimeout(2000);
1170
1636
  const currentTopHeight = yield page.evaluate(() => document.documentElement.scrollTop);
1171
1637
  const currentResultCount = allResults.length;
1172
- if (currentResultCount === previousResultCount) {
1638
+ const newItemsFound = currentResultCount - previousResultCount;
1639
+ if (newItemsFound === 0) {
1173
1640
  unchangedResultCounter++;
1174
1641
  if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
1175
1642
  return { results: allResults, urls: allUrls };
1176
1643
  }
1177
1644
  }
1645
+ else if (newItemsFound < 3) {
1646
+ unchangedResultCounter++;
1647
+ if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS * 2) {
1648
+ debugLog('Very slow pagination detected (< 3 items/scroll), stopping');
1649
+ return { results: allResults, urls: allUrls };
1650
+ }
1651
+ }
1178
1652
  else {
1179
1653
  unchangedResultCounter = 0;
1180
1654
  }
@@ -1191,9 +1665,14 @@ class Interpreter extends events_1.EventEmitter {
1191
1665
  if (checkLimit())
1192
1666
  return { results: allResults, urls: allUrls };
1193
1667
  const { button, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
1194
- availableSelectors = updatedSelectors;
1668
+ if (workingSelector) {
1669
+ config.pagination.selector = workingSelector;
1670
+ availableSelectors = [workingSelector];
1671
+ }
1672
+ else {
1673
+ availableSelectors = updatedSelectors;
1674
+ }
1195
1675
  if (!button || !workingSelector) {
1196
- // Final retry for navigation when no selectors work
1197
1676
  const success = yield retryOperation(() => __awaiter(this, void 0, void 0, function* () {
1198
1677
  try {
1199
1678
  yield page.evaluate(() => window.history.forward());
@@ -1210,7 +1689,6 @@ class Interpreter extends events_1.EventEmitter {
1210
1689
  }
1211
1690
  let retryCount = 0;
1212
1691
  let paginationSuccess = false;
1213
- // Capture basic content signature before click - with XPath support
1214
1692
  const captureContentSignature = () => __awaiter(this, void 0, void 0, function* () {
1215
1693
  return yield page.evaluate((listSelector) => {
1216
1694
  const isXPath = (selector) => {
@@ -1219,7 +1697,6 @@ class Interpreter extends events_1.EventEmitter {
1219
1697
  let items = [];
1220
1698
  if (isXPath(listSelector)) {
1221
1699
  try {
1222
- // Use XPath to find elements
1223
1700
  const xpathResult = document.evaluate(listSelector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
1224
1701
  items = [];
1225
1702
  for (let i = 0; i < xpathResult.snapshotLength; i++) {
@@ -1231,7 +1708,6 @@ class Interpreter extends events_1.EventEmitter {
1231
1708
  }
1232
1709
  catch (xpathError) {
1233
1710
  console.warn('XPath evaluation failed, trying CSS selector as fallback:', xpathError);
1234
- // Fallback to CSS selector
1235
1711
  try {
1236
1712
  items = document.querySelectorAll(listSelector);
1237
1713
  }
@@ -1243,7 +1719,6 @@ class Interpreter extends events_1.EventEmitter {
1243
1719
  }
1244
1720
  else {
1245
1721
  try {
1246
- // Use CSS selector
1247
1722
  items = document.querySelectorAll(listSelector);
1248
1723
  }
1249
1724
  catch (cssError) {
@@ -1344,11 +1819,10 @@ class Interpreter extends events_1.EventEmitter {
1344
1819
  if (checkLimit())
1345
1820
  return { results: allResults, urls: allUrls };
1346
1821
  let loadMoreCounter = 0;
1347
- const MAX_LOAD_MORE_ITERATIONS = 100; // Prevent infinite load more
1822
+ const MAX_LOAD_MORE_ITERATIONS = 100;
1348
1823
  const loadMoreStartTime = Date.now();
1349
- const MAX_LOAD_MORE_TIME = 30 * 60 * 1000; // 5 minutes max for load more
1824
+ const MAX_LOAD_MORE_TIME = 30 * 60 * 1000;
1350
1825
  while (true) {
1351
- // Load more circuit breakers
1352
1826
  if (loadMoreCounter >= MAX_LOAD_MORE_ITERATIONS) {
1353
1827
  debugLog(`Maximum load more iterations reached (${MAX_LOAD_MORE_ITERATIONS}), stopping`);
1354
1828
  return { results: allResults, urls: allUrls };
@@ -1357,18 +1831,21 @@ class Interpreter extends events_1.EventEmitter {
1357
1831
  debugLog('Maximum load more time reached (5 minutes), stopping');
1358
1832
  return { results: allResults, urls: allUrls };
1359
1833
  }
1360
- // Add async yield every 3 iterations
1361
1834
  if (loadMoreCounter % 3 === 0 && loadMoreCounter > 0) {
1362
1835
  yield new Promise(resolve => setImmediate(resolve));
1363
1836
  }
1364
- // Find working button with retry mechanism
1365
1837
  const { button: loadMoreButton, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
1366
- availableSelectors = updatedSelectors;
1838
+ if (workingSelector) {
1839
+ config.pagination.selector = workingSelector;
1840
+ availableSelectors = [workingSelector];
1841
+ }
1842
+ else {
1843
+ availableSelectors = updatedSelectors;
1844
+ }
1367
1845
  if (!workingSelector || !loadMoreButton) {
1368
1846
  debugLog('No working Load More selector found after retries');
1369
1847
  return { results: allResults, urls: allUrls };
1370
1848
  }
1371
- // Implement retry mechanism for clicking the button
1372
1849
  let retryCount = 0;
1373
1850
  let clickSuccess = false;
1374
1851
  while (retryCount < MAX_RETRIES && !clickSuccess) {
@@ -1379,14 +1856,13 @@ class Interpreter extends events_1.EventEmitter {
1379
1856
  }
1380
1857
  catch (error) {
1381
1858
  debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
1382
- // If regular click fails, try dispatchEvent
1383
1859
  try {
1384
1860
  yield loadMoreButton.dispatchEvent('click');
1385
1861
  clickSuccess = true;
1386
1862
  }
1387
1863
  catch (dispatchError) {
1388
1864
  debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
1389
- throw dispatchError; // Propagate error to trigger retry
1865
+ throw dispatchError;
1390
1866
  }
1391
1867
  }
1392
1868
  if (clickSuccess) {
@@ -1408,7 +1884,6 @@ class Interpreter extends events_1.EventEmitter {
1408
1884
  debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
1409
1885
  return { results: allResults, urls: allUrls };
1410
1886
  }
1411
- // Wait for content to load and check scroll height
1412
1887
  yield page.waitForTimeout(2000);
1413
1888
  yield page.evaluate(() => {
1414
1889
  const scrollHeight = Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
@@ -1458,644 +1933,15 @@ class Interpreter extends events_1.EventEmitter {
1458
1933
  return { results: allResults, urls: allUrls };
1459
1934
  });
1460
1935
  }
1461
- getMatchingActionId(workflow, pageState, usedActions) {
1462
- for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
1463
- const step = workflow[actionId];
1464
- const isApplicable = this.applicable(step.where, pageState, usedActions);
1465
- console.log("-------------------------------------------------------------");
1466
- console.log(`Where:`, step.where);
1467
- console.log(`Page state:`, pageState);
1468
- console.log(`Match result: ${isApplicable}`);
1469
- console.log("-------------------------------------------------------------");
1470
- if (isApplicable) {
1471
- return actionId;
1472
- }
1473
- }
1474
- }
1475
- removeShadowSelectors(workflow) {
1476
- for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
1477
- const step = workflow[actionId];
1478
- // Check if step has where and selectors
1479
- if (step.where && Array.isArray(step.where.selectors)) {
1480
- // Filter out selectors that contain ">>"
1481
- step.where.selectors = step.where.selectors.filter(selector => !selector.includes('>>'));
1482
- }
1483
- }
1484
- return workflow;
1485
- }
1486
1936
  removeSpecialSelectors(workflow) {
1487
1937
  for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
1488
1938
  const step = workflow[actionId];
1489
1939
  if (step.where && Array.isArray(step.where.selectors)) {
1490
- // Filter out if selector has EITHER ":>>" OR ">>"
1491
1940
  step.where.selectors = step.where.selectors.filter(selector => !(selector.includes(':>>') || selector.includes('>>')));
1492
1941
  }
1493
1942
  }
1494
1943
  return workflow;
1495
1944
  }
1496
- generatePageNodeInformation(page_1) {
1497
- return __awaiter(this, arguments, void 0, function* (page, selector = 'body') {
1498
- yield page.waitForTimeout(100);
1499
- const nodeInfo = yield page.evaluate((sel) => {
1500
- const serialize = (node) => {
1501
- var _a, _b;
1502
- // Handle text nodes
1503
- if (node.nodeType === Node.TEXT_NODE) {
1504
- // Get coordinates from parent element for text nodes
1505
- let coordinates;
1506
- if (node.parentElement) {
1507
- const rect = node.parentElement.getBoundingClientRect();
1508
- coordinates = {
1509
- x: rect.x,
1510
- y: rect.y + window.scrollY,
1511
- };
1512
- }
1513
- return {
1514
- type: 'TEXT_NODE',
1515
- textContent: ((_a = node.textContent) === null || _a === void 0 ? void 0 : _a.trim()) || '',
1516
- nodeType: node.nodeType,
1517
- coordinates
1518
- };
1519
- }
1520
- // Handle element nodes
1521
- const element = node;
1522
- const attributes = {};
1523
- for (const attr of Array.from(element.attributes || [])) {
1524
- attributes[attr.name] = attr.value;
1525
- }
1526
- // Get coordinates for element nodes
1527
- let coordinates;
1528
- if (node.nodeType === Node.ELEMENT_NODE) {
1529
- const rect = element.getBoundingClientRect();
1530
- coordinates = {
1531
- x: rect.left + window.scrollX,
1532
- y: rect.top + window.scrollY
1533
- };
1534
- }
1535
- // Get all child nodes (including text nodes)
1536
- const children = Array.from(node.childNodes)
1537
- .map(child => serialize(child))
1538
- .filter(child => {
1539
- return !(child.type === 'TEXT_NODE' && child.textContent === '');
1540
- });
1541
- return {
1542
- type: 'ELEMENT_NODE',
1543
- tagName: ((_b = element.tagName) === null || _b === void 0 ? void 0 : _b.toLowerCase()) || '',
1544
- nodeType: node.nodeType,
1545
- attributes,
1546
- coordinates,
1547
- children
1548
- };
1549
- };
1550
- const rootElement = document.querySelector(sel);
1551
- if (!rootElement) {
1552
- throw new Error(`Element not found: ${sel}`);
1553
- }
1554
- const exactRect = rootElement.getBoundingClientRect();
1555
- const exactCoordinates = {
1556
- x: exactRect.left + window.scrollX,
1557
- y: exactRect.top + window.scrollY
1558
- };
1559
- const nodeInfo = serialize(rootElement);
1560
- nodeInfo.coordinates = exactCoordinates;
1561
- return nodeInfo;
1562
- }, selector);
1563
- return nodeInfo;
1564
- });
1565
- }
1566
- detectElementChanges(page, schema) {
1567
- return __awaiter(this, void 0, void 0, function* () {
1568
- var _a;
1569
- const currentDomTree = yield this.generatePageNodeInformation(page, 'body');
1570
- const changes = [];
1571
- const isScrapeList = 'listSelector' in schema;
1572
- const findMatchingElement = (field, currentTree, targetInfo) => {
1573
- if (currentTree.type !== 'ELEMENT_NODE')
1574
- return { element: null, confidence: 0 };
1575
- const directMatchScore = () => {
1576
- let score = 0;
1577
- const weights = {
1578
- tag: 0.25,
1579
- attributes: 0.35,
1580
- content: 0.25,
1581
- structure: 0.15
1582
- };
1583
- if ('tagName' in targetInfo &&
1584
- currentTree.tagName === targetInfo.tagName) {
1585
- score += weights.tag;
1586
- }
1587
- const targetAttrs = 'attributes' in targetInfo ? targetInfo.attributes || {} : {};
1588
- const currentAttrs = currentTree.attributes || {};
1589
- const targetClasses = (targetAttrs.class || '').split(/\s+/).filter(Boolean);
1590
- const currentClasses = (currentAttrs.class || '').split(/\s+/).filter(Boolean);
1591
- let classScore = 0;
1592
- if (targetClasses.length === 0 && currentClasses.length === 0) {
1593
- classScore = 1; // Both have no classes
1594
- }
1595
- else if (targetClasses.length > 0 && currentClasses.length > 0) {
1596
- const commonClasses = targetClasses.filter(c => currentClasses.includes(c));
1597
- if (commonClasses.length > 0) {
1598
- classScore = commonClasses.length / Math.max(targetClasses.length, 1);
1599
- }
1600
- }
1601
- let idScore = 0;
1602
- if (targetAttrs.id && currentAttrs.id) {
1603
- idScore = targetAttrs.id === currentAttrs.id ? 1 : 0;
1604
- }
1605
- else if (!targetAttrs.id && !currentAttrs.id) {
1606
- idScore = 1; // Both have no id
1607
- }
1608
- const otherAttrs = Object.keys(Object.assign(Object.assign({}, targetAttrs), currentAttrs))
1609
- .filter(key => key !== 'class' && key !== 'id');
1610
- let otherAttrsScore = 0;
1611
- if (otherAttrs.length > 0) {
1612
- let matches = 0;
1613
- for (const key of otherAttrs) {
1614
- if (targetAttrs[key] === currentAttrs[key]) {
1615
- matches++;
1616
- }
1617
- else if (key in targetAttrs && key in currentAttrs) {
1618
- matches += 0.5;
1619
- }
1620
- }
1621
- otherAttrsScore = matches / otherAttrs.length;
1622
- }
1623
- else {
1624
- otherAttrsScore = 1;
1625
- }
1626
- const attributeWeights = { class: 0.5, id: 0.3, other: 0.2 };
1627
- const attrScore = (classScore * attributeWeights.class) +
1628
- (idScore * attributeWeights.id) +
1629
- (otherAttrsScore * attributeWeights.other);
1630
- score += weights.attributes * attrScore;
1631
- if ('children' in targetInfo) {
1632
- const targetText = targetInfo.children
1633
- .filter(child => child.type === 'TEXT_NODE')
1634
- .map(child => child.textContent)
1635
- .join(' ')
1636
- .trim();
1637
- const currentText = currentTree.children
1638
- .filter(child => child.type === 'TEXT_NODE')
1639
- .map(child => child.textContent)
1640
- .join(' ')
1641
- .trim();
1642
- let contentScore = 0;
1643
- if (targetText === currentText) {
1644
- contentScore = 1;
1645
- }
1646
- else if (!targetText && !currentText) {
1647
- contentScore = 1;
1648
- }
1649
- else if (targetText && currentText) {
1650
- if (targetText.includes(currentText) || currentText.includes(targetText)) {
1651
- const ratio = Math.min(targetText.length, currentText.length) /
1652
- Math.max(targetText.length, currentText.length);
1653
- contentScore = 0.7 + (ratio * 0.3);
1654
- }
1655
- else {
1656
- const targetWords = targetText.toLowerCase().split(/\s+/).filter(Boolean);
1657
- const currentWords = currentText.toLowerCase().split(/\s+/).filter(Boolean);
1658
- if (targetWords.length > 0 && currentWords.length > 0) {
1659
- const commonWords = targetWords.filter(word => currentWords.includes(word));
1660
- if (commonWords.length > 0) {
1661
- contentScore = commonWords.length / Math.max(targetWords.length, currentWords.length);
1662
- }
1663
- }
1664
- }
1665
- const numericPattern = /^\s*\$?[\d.,]+\s*$/;
1666
- if (numericPattern.test(targetText) && numericPattern.test(currentText)) {
1667
- contentScore = Math.max(contentScore, 0.8);
1668
- }
1669
- }
1670
- score += weights.content * contentScore;
1671
- }
1672
- if ('children' in targetInfo) {
1673
- const targetElementChildren = targetInfo.children
1674
- .filter(child => child.type === 'ELEMENT_NODE');
1675
- const currentElementChildren = currentTree.children
1676
- .filter(child => child.type === 'ELEMENT_NODE');
1677
- let structureScore = 0;
1678
- if (targetElementChildren.length === 0 && currentElementChildren.length === 0) {
1679
- structureScore = 1;
1680
- }
1681
- else if (targetElementChildren.length > 0 && currentElementChildren.length > 0) {
1682
- const targetChildTags = targetElementChildren
1683
- .map(child => child.tagName.toLowerCase());
1684
- const currentChildTags = currentElementChildren
1685
- .map(child => child.tagName.toLowerCase());
1686
- const tagMatches = targetChildTags.filter(tag => currentChildTags.includes(tag)).length;
1687
- const tagCountDiff = Math.abs(targetChildTags.length - currentChildTags.length);
1688
- const maxTags = Math.max(targetChildTags.length, currentChildTags.length);
1689
- if (maxTags > 0) {
1690
- structureScore =
1691
- (0.7 * (tagMatches / maxTags)) +
1692
- (0.3 * (1 - (tagCountDiff / maxTags)));
1693
- }
1694
- }
1695
- score += weights.structure * structureScore;
1696
- }
1697
- return score;
1698
- };
1699
- const score = directMatchScore();
1700
- if (field === 'listSelector') {
1701
- if (score >= 0.85) {
1702
- return { element: currentTree, confidence: score };
1703
- }
1704
- }
1705
- else {
1706
- if (score >= 0.7) {
1707
- return { element: currentTree, confidence: score };
1708
- }
1709
- }
1710
- let bestMatch = { element: null, confidence: 0 };
1711
- for (const child of currentTree.children) {
1712
- if (child.type === 'ELEMENT_NODE') {
1713
- const childMatch = findMatchingElement(field, child, targetInfo);
1714
- if (childMatch.confidence > bestMatch.confidence) {
1715
- bestMatch = childMatch;
1716
- }
1717
- }
1718
- }
1719
- return bestMatch;
1720
- };
1721
- const findMatchingParentElement = (field, currentDomTree, parentNodeInfo) => {
1722
- if (currentDomTree.type !== 'ELEMENT_NODE') {
1723
- return { element: null, confidence: 0, changes: [] };
1724
- }
1725
- const matchesParentElement = (element) => {
1726
- if (element.type !== 'ELEMENT_NODE' || parentNodeInfo.type !== 'ELEMENT_NODE') {
1727
- return 0;
1728
- }
1729
- let score = 0;
1730
- const weights = {
1731
- tagName: 0.3,
1732
- structure: 0.3,
1733
- children: 0.4
1734
- };
1735
- if (element.tagName === parentNodeInfo.tagName) {
1736
- score += weights.tagName;
1737
- }
1738
- const targetChildren = parentNodeInfo.children || [];
1739
- const currentChildren = element.children || [];
1740
- const targetChildTags = targetChildren
1741
- .filter(child => child.type === 'ELEMENT_NODE')
1742
- .map(child => child.tagName);
1743
- const currentChildTags = currentChildren
1744
- .filter(child => child.type === 'ELEMENT_NODE')
1745
- .map(child => child.tagName);
1746
- let matchingChildren = 0;
1747
- for (const tag of targetChildTags) {
1748
- if (currentChildTags.includes(tag)) {
1749
- matchingChildren++;
1750
- }
1751
- }
1752
- if (targetChildTags.length > 0) {
1753
- score += weights.structure * (matchingChildren / targetChildTags.length);
1754
- }
1755
- const targetChildElements = targetChildren.filter(child => child.type === 'ELEMENT_NODE');
1756
- const currentChildElements = currentChildren.filter(child => child.type === 'ELEMENT_NODE');
1757
- let childContentScore = 0;
1758
- let matchedChildrenCount = 0;
1759
- for (const targetChild of targetChildElements) {
1760
- for (const currentChild of currentChildElements) {
1761
- if (targetChild.tagName === currentChild.tagName &&
1762
- contentSimilarity(targetChild, currentChild) > 0.7) {
1763
- matchedChildrenCount++;
1764
- break;
1765
- }
1766
- }
1767
- }
1768
- if (targetChildElements.length > 0) {
1769
- childContentScore = matchedChildrenCount / targetChildElements.length;
1770
- score += weights.children * childContentScore;
1771
- }
1772
- return score;
1773
- };
1774
- const contentSimilarity = (element1, element2) => {
1775
- var _a, _b;
1776
- if (element1.type !== 'ELEMENT_NODE' || element2.type !== 'ELEMENT_NODE') {
1777
- return 0;
1778
- }
1779
- const text1 = ((_a = element1.children) === null || _a === void 0 ? void 0 : _a.filter(child => child.type === 'TEXT_NODE').map(child => child.textContent).join(' ').trim()) || '';
1780
- const text2 = ((_b = element2.children) === null || _b === void 0 ? void 0 : _b.filter(child => child.type === 'TEXT_NODE').map(child => child.textContent).join(' ').trim()) || '';
1781
- if (!text1 && !text2)
1782
- return 1;
1783
- if (!text1 || !text2)
1784
- return 0;
1785
- if (text1 === text2)
1786
- return 1;
1787
- if (text1.includes(text2) || text2.includes(text1)) {
1788
- return 0.8;
1789
- }
1790
- const words1 = text1.toLowerCase().split(/\s+/).filter(Boolean);
1791
- const words2 = text2.toLowerCase().split(/\s+/).filter(Boolean);
1792
- let commonCount = 0;
1793
- for (const word of words1) {
1794
- if (words2.includes(word)) {
1795
- commonCount++;
1796
- }
1797
- }
1798
- return commonCount / Math.max(words1.length, words2.length);
1799
- };
1800
- const detectAttributeChanges = (element) => {
1801
- const changes = [];
1802
- if (element.type !== 'ELEMENT_NODE' ||
1803
- parentNodeInfo.type !== 'ELEMENT_NODE') {
1804
- return changes;
1805
- }
1806
- const originalAttrs = parentNodeInfo.attributes || {};
1807
- const currentAttrs = element.attributes || {};
1808
- for (const key of Object.keys(Object.assign(Object.assign({}, originalAttrs), currentAttrs))) {
1809
- if (originalAttrs[key] !== currentAttrs[key]) {
1810
- changes.push({
1811
- attribute: key,
1812
- originalValue: originalAttrs[key] || '',
1813
- currentValue: currentAttrs[key] || ''
1814
- });
1815
- }
1816
- }
1817
- return changes;
1818
- };
1819
- const matchScore = matchesParentElement(currentDomTree);
1820
- if (field === 'listSelector') {
1821
- if (matchScore >= 0.85) {
1822
- const changes = detectAttributeChanges(currentDomTree);
1823
- return {
1824
- element: currentDomTree,
1825
- confidence: matchScore,
1826
- changes
1827
- };
1828
- }
1829
- }
1830
- else {
1831
- if (matchScore >= 0.7) {
1832
- const changes = detectAttributeChanges(currentDomTree);
1833
- return {
1834
- element: currentDomTree,
1835
- confidence: matchScore,
1836
- changes
1837
- };
1838
- }
1839
- }
1840
- let bestMatch = { element: null, confidence: 0, changes: [] };
1841
- for (const child of currentDomTree.children) {
1842
- if (child.type === 'ELEMENT_NODE') {
1843
- const childResult = findMatchingParentElement(field, child, parentNodeInfo);
1844
- if (childResult.confidence > bestMatch.confidence) {
1845
- bestMatch = childResult;
1846
- }
1847
- }
1848
- }
1849
- return bestMatch;
1850
- };
1851
- const processElement = (field, elementConfig) => __awaiter(this, void 0, void 0, function* () {
1852
- const checkChanges = (originalNode, currentNode, element, isParent = false) => {
1853
- const originalAttrs = 'attributes' in originalNode ? originalNode.attributes : {};
1854
- const currentAttrs = 'attributes' in currentNode ? currentNode.attributes : {};
1855
- if ((Object.keys(originalAttrs).length > 0 && Object.keys(currentAttrs).length === 0) ||
1856
- Object.keys(Object.assign(Object.assign({}, originalAttrs), currentAttrs)).some(key => {
1857
- if (key === 'class') {
1858
- const originalClasses = (originalAttrs[key] || '').split(/\s+/).filter(Boolean);
1859
- const currentClasses = (currentAttrs[key] || '').split(/\s+/).filter(Boolean);
1860
- return JSON.stringify(originalClasses.sort()) !== JSON.stringify(currentClasses.sort());
1861
- }
1862
- return originalAttrs[key] !== currentAttrs[key];
1863
- })) {
1864
- changes.push({
1865
- type: 'ATTRIBUTES_CHANGED',
1866
- field,
1867
- originalState: originalAttrs,
1868
- currentState: currentAttrs,
1869
- coordinates: currentNode.coordinates,
1870
- element: element.asElement(),
1871
- isParent,
1872
- confidence,
1873
- });
1874
- }
1875
- if ('tagName' in originalNode && originalNode.tagName && currentNode.tagName &&
1876
- originalNode.tagName.toLowerCase() !== currentNode.tagName.toLowerCase()) {
1877
- changes.push({
1878
- type: 'TAG_CHANGED',
1879
- field,
1880
- originalState: originalNode.tagName,
1881
- currentState: currentNode.tagName,
1882
- coordinates: currentNode.coordinates,
1883
- element: element.asElement(),
1884
- isParent,
1885
- confidence,
1886
- });
1887
- }
1888
- };
1889
- const { element: matchedElement, confidence } = findMatchingElement(field, currentDomTree, elementConfig.nodeInfo);
1890
- if (matchedElement) {
1891
- const actualElement = yield page.evaluateHandle((element) => {
1892
- function findExactElement(elementInfo) {
1893
- const candidatesAtCoords = document.elementsFromPoint(elementInfo.coordinates.x, elementInfo.coordinates.y);
1894
- const getNormalizedText = (el) => {
1895
- var _a;
1896
- if (el.nodeType === Node.TEXT_NODE)
1897
- return ((_a = el.textContent) === null || _a === void 0 ? void 0 : _a.trim()) || '';
1898
- const textNodes = Array.from(el.childNodes)
1899
- .filter((node) => node.nodeType === Node.TEXT_NODE);
1900
- return textNodes.map((node) => { var _a; return (_a = node.textContent) === null || _a === void 0 ? void 0 : _a.trim(); }).join('').trim();
1901
- };
1902
- const targetText = elementInfo.children
1903
- .filter(child => child.type === 'TEXT_NODE')
1904
- .map(child => child.textContent)
1905
- .join('')
1906
- .trim();
1907
- const exactMatch = Array.from(candidatesAtCoords).find(candidate => {
1908
- if (candidate.tagName.toLowerCase() !== elementInfo.tagName.toLowerCase()) {
1909
- return false;
1910
- }
1911
- const candidateText = getNormalizedText(candidate);
1912
- const textMatch = targetText === candidateText;
1913
- if (Object.keys(elementInfo.attributes || {}).length > 0) {
1914
- const attrsMatch = Object.entries(elementInfo.attributes).every(([key, value]) => {
1915
- if (key === 'class') {
1916
- const elementClasses = (value || '').split(/\s+/).filter(Boolean);
1917
- const candidateClasses = (candidate.getAttribute('class') || '')
1918
- .split(/\s+/)
1919
- .filter(Boolean);
1920
- return JSON.stringify(elementClasses.sort()) ===
1921
- JSON.stringify(candidateClasses.sort());
1922
- }
1923
- return candidate.getAttribute(key) === value;
1924
- });
1925
- return textMatch && attrsMatch;
1926
- }
1927
- const elementChildTags = elementInfo.children
1928
- .filter(child => child.type === 'ELEMENT_NODE')
1929
- .map(child => child.tagName.toLowerCase());
1930
- const candidateChildTags = Array.from(candidate.children)
1931
- .map(child => child.tagName.toLowerCase());
1932
- const structureMatch = JSON.stringify(elementChildTags) ===
1933
- JSON.stringify(candidateChildTags);
1934
- return textMatch && structureMatch;
1935
- });
1936
- if (exactMatch)
1937
- return exactMatch;
1938
- const allCandidates = document.getElementsByTagName(elementInfo.tagName);
1939
- return Array.from(allCandidates).find(candidate => {
1940
- const candidateText = getNormalizedText(candidate);
1941
- const textMatch = targetText === candidateText;
1942
- const rect = candidate.getBoundingClientRect();
1943
- const coordsMatch = Math.abs(rect.x - elementInfo.coordinates.x) < 5 &&
1944
- Math.abs(rect.y - elementInfo.coordinates.y) < 5;
1945
- return textMatch && coordsMatch;
1946
- });
1947
- }
1948
- return findExactElement(element);
1949
- }, matchedElement);
1950
- if (actualElement) {
1951
- checkChanges(elementConfig.nodeInfo, matchedElement, actualElement);
1952
- }
1953
- if ('parent' in elementConfig.nodeInfo && elementConfig.nodeInfo.parent) {
1954
- const { element: matchedParentElement, confidence: parentConfidence } = findMatchingParentElement(field, currentDomTree, elementConfig.nodeInfo.parent);
1955
- if (matchedParentElement && parentConfidence >= 0.5) {
1956
- checkChanges(elementConfig.nodeInfo.parent, matchedParentElement, actualElement, true);
1957
- }
1958
- }
1959
- }
1960
- });
1961
- if (isScrapeList) {
1962
- if (schema.listSelectorInfo) {
1963
- yield processElement('listSelector', schema.listSelectorInfo);
1964
- }
1965
- if (schema.fields) {
1966
- for (const [fieldName, fieldConfig] of Object.entries(schema.fields)) {
1967
- yield processElement(`fields.${fieldName}`, {
1968
- nodeInfo: fieldConfig.nodeInfo,
1969
- coordinates: fieldConfig.coordinates,
1970
- tag: fieldConfig.tag,
1971
- });
1972
- }
1973
- }
1974
- if ((_a = schema.pagination) === null || _a === void 0 ? void 0 : _a.nodeInfo) {
1975
- yield processElement('pagination', {
1976
- nodeInfo: schema.pagination.nodeInfo,
1977
- coordinates: schema.pagination.coordinates,
1978
- });
1979
- }
1980
- }
1981
- else {
1982
- const scrapeSchema = schema;
1983
- for (const [field, config] of Object.entries(scrapeSchema)) {
1984
- yield processElement(field, {
1985
- nodeInfo: config.nodeInfo,
1986
- coordinates: config.coordinates,
1987
- tag: config.tag,
1988
- });
1989
- }
1990
- }
1991
- return changes;
1992
- });
1993
- }
1994
- validateWorkflowAction(page, action) {
1995
- return __awaiter(this, void 0, void 0, function* () {
1996
- var _a, _b;
1997
- const modifiedAction = JSON.parse(JSON.stringify(action));
1998
- const actionIndex = modifiedAction.what.findIndex(item => item.action === "scrapeSchema" || item.action === "scrapeList");
1999
- if (actionIndex !== -1) {
2000
- const schema = modifiedAction.what[actionIndex].args[0];
2001
- const isScrapeList = modifiedAction.what[actionIndex].action === "scrapeList";
2002
- try {
2003
- const changes = yield this.detectElementChanges(page, schema);
2004
- const uniqueChanges = {};
2005
- changes.forEach(change => {
2006
- if (!uniqueChanges[change.field] || change.type === 'TAG_CHANGED') {
2007
- uniqueChanges[change.field] = change;
2008
- }
2009
- });
2010
- // console.log("Unique changes detected:", uniqueChanges);
2011
- console.log("Changes detected:", changes);
2012
- for (const fieldName in uniqueChanges) {
2013
- let schemaField = isScrapeList ?
2014
- fieldName === 'listSelector' ? schema[fieldName] :
2015
- fieldName.startsWith('fields.') ? schema.fields[fieldName.split('.')[1]] :
2016
- fieldName === 'pagination' ? schema.pagination : null
2017
- : schema[fieldName];
2018
- const change = uniqueChanges[fieldName];
2019
- if (change.element && schemaField) {
2020
- try {
2021
- let newSelectors;
2022
- if (fieldName === 'listSelector') {
2023
- newSelectors = yield (0, selector_1.generateNonUniqueSelectors)(page, change.element, '');
2024
- }
2025
- else if (isScrapeList && fieldName.startsWith('fields.')) {
2026
- newSelectors = yield (0, selector_1.generateNonUniqueSelectors)(page, change.element, schema.listSelector);
2027
- }
2028
- else {
2029
- newSelectors = yield (0, selector_1.generateSelectors)(page, change.element);
2030
- if (fieldName === 'pagination') {
2031
- let chainedSelectors = [
2032
- (_a = newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.iframeSelector) === null || _a === void 0 ? void 0 : _a.full,
2033
- (_b = newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.shadowSelector) === null || _b === void 0 ? void 0 : _b.full,
2034
- newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.testIdSelector,
2035
- newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.id,
2036
- newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.hrefSelector,
2037
- newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.accessibilitySelector,
2038
- newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.attrSelector,
2039
- ]
2040
- .filter(selector => selector !== null && selector !== undefined)
2041
- .join(',');
2042
- newSelectors = chainedSelectors;
2043
- }
2044
- }
2045
- if (!newSelectors) {
2046
- throw new Error('Failed to generate new selectors');
2047
- }
2048
- const bestSelector = fieldName !== 'pagination' ? yield (0, utils_1.getBestSelector)({
2049
- selectors: newSelectors,
2050
- tagName: change.type === 'TAG_CHANGED' ?
2051
- change.currentState :
2052
- schemaField.tag
2053
- }) : newSelectors;
2054
- if (!bestSelector) {
2055
- throw new Error('Failed to determine best selector');
2056
- }
2057
- // Update selectors
2058
- let previousSelector;
2059
- if (fieldName === 'listSelector') {
2060
- previousSelector = schema.listSelector;
2061
- schema.listSelector = bestSelector;
2062
- }
2063
- else if (isScrapeList && fieldName.startsWith('fields.')) {
2064
- const fieldKey = fieldName.split('.')[1];
2065
- previousSelector = schema.fields[fieldKey].selector;
2066
- schema.fields[fieldKey].selector = bestSelector;
2067
- }
2068
- else if (fieldName === 'pagination') {
2069
- previousSelector = schema.pagination.selector;
2070
- schema.pagination.selector = bestSelector;
2071
- }
2072
- else {
2073
- previousSelector = schema[fieldName].selector;
2074
- schema[fieldName].selector = bestSelector;
2075
- }
2076
- console.log(`Updated ${fieldName} from ${previousSelector} to ${bestSelector}`);
2077
- if (modifiedAction.where.selectors) {
2078
- const selectorIndex = modifiedAction.where.selectors.findIndex(s => s.includes(previousSelector));
2079
- if (selectorIndex !== -1) {
2080
- modifiedAction.where.selectors[selectorIndex] = bestSelector;
2081
- }
2082
- }
2083
- }
2084
- catch (error) {
2085
- console.error(`Auto-heal failed for field ${fieldName}:`, error);
2086
- this.trackAutohealFailure(`Failed to auto-heal field ${fieldName}: ${error.message}`);
2087
- }
2088
- }
2089
- }
2090
- }
2091
- catch (error) {
2092
- console.error('Complete auto-heal failure:', error);
2093
- this.trackAutohealFailure(`Complete auto-heal failure: ${error.message}`);
2094
- }
2095
- }
2096
- return modifiedAction;
2097
- });
2098
- }
2099
1945
  /**
2100
1946
  * Test if a selector is working on the current page
2101
1947
  * @param {Page} page - Playwright page object
@@ -2116,7 +1962,6 @@ class Interpreter extends events_1.EventEmitter {
2116
1962
  selector.includes('@id=');
2117
1963
  let count = 0;
2118
1964
  if (isXPath) {
2119
- // Add timeout to prevent XPath hanging
2120
1965
  const locator = page.locator(`xpath=${selector}`);
2121
1966
  const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error('XPath timeout')), 5000));
2122
1967
  try {
@@ -2126,12 +1971,10 @@ class Interpreter extends events_1.EventEmitter {
2126
1971
  ]);
2127
1972
  }
2128
1973
  catch (error) {
2129
- // XPath timed out or failed
2130
1974
  return false;
2131
1975
  }
2132
1976
  }
2133
1977
  else {
2134
- // Add timeout to CSS selector operations
2135
1978
  try {
2136
1979
  const elementsPromise = page.$$(selector);
2137
1980
  const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error('CSS selector timeout')), 5000));
@@ -2142,15 +1985,12 @@ class Interpreter extends events_1.EventEmitter {
2142
1985
  count = elements ? elements.length : 0;
2143
1986
  }
2144
1987
  catch (error) {
2145
- // CSS selector timed out or failed
2146
1988
  return false;
2147
1989
  }
2148
1990
  }
2149
- // For list selectors, we need multiple elements
2150
1991
  if (isListSelector) {
2151
1992
  return count >= 2;
2152
1993
  }
2153
- // For field selectors, we need at least one element
2154
1994
  return count >= 1;
2155
1995
  }
2156
1996
  catch (error) {
@@ -2170,12 +2010,10 @@ class Interpreter extends events_1.EventEmitter {
2170
2010
  return __awaiter(this, arguments, void 0, function* (page, fallbackSelector, isListSelector = false, listContext = '', isPagination = false) {
2171
2011
  var _a, _b;
2172
2012
  try {
2173
- // First check if fallback selector works
2174
2013
  const fallbackWorks = yield this.testSelectorWorks(page, fallbackSelector, isListSelector);
2175
2014
  if (!fallbackWorks) {
2176
2015
  return null;
2177
2016
  }
2178
- // Get element using fallback selector
2179
2017
  const isXPath = fallbackSelector.startsWith('//') ||
2180
2018
  fallbackSelector.startsWith('/') ||
2181
2019
  fallbackSelector.includes('contains(@');
@@ -2189,7 +2027,6 @@ class Interpreter extends events_1.EventEmitter {
2189
2027
  if (!element) {
2190
2028
  return null;
2191
2029
  }
2192
- // Generate new selectors
2193
2030
  let newSelectors;
2194
2031
  if (isListSelector) {
2195
2032
  return yield (0, selector_1.generateListSelectorFromFallback)(page, fallbackSelector);
@@ -2200,7 +2037,6 @@ class Interpreter extends events_1.EventEmitter {
2200
2037
  else {
2201
2038
  newSelectors = yield (0, selector_1.generateFieldSelectorFromFallback)(page, fallbackSelector);
2202
2039
  if (isPagination) {
2203
- // For pagination, chain selectors in priority order
2204
2040
  let chainedSelectors = [
2205
2041
  (_a = newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.iframeSelector) === null || _a === void 0 ? void 0 : _a.full,
2206
2042
  (_b = newSelectors === null || newSelectors === void 0 ? void 0 : newSelectors.shadowSelector) === null || _b === void 0 ? void 0 : _b.full,
@@ -2215,7 +2051,6 @@ class Interpreter extends events_1.EventEmitter {
2215
2051
  return chainedSelectors;
2216
2052
  }
2217
2053
  else {
2218
- // For non-pagination, use getBestSelector
2219
2054
  const tagName = yield element.evaluate(el => el.tagName.toLowerCase());
2220
2055
  return yield (0, utils_1.getBestSelector)({
2221
2056
  selectors: newSelectors,
@@ -2240,7 +2075,6 @@ class Interpreter extends events_1.EventEmitter {
2240
2075
  return __awaiter(this, void 0, void 0, function* () {
2241
2076
  let hasChanges = false;
2242
2077
  try {
2243
- // Validate listSelector
2244
2078
  const listSelectorWorks = yield this.testSelectorWorks(page, scrapeListConfig.listSelector, true);
2245
2079
  if (!listSelectorWorks && scrapeListConfig.listFallbackSelector) {
2246
2080
  console.log(`ListSelector "${scrapeListConfig.listSelector}" not working, trying fallback...`);
@@ -2251,11 +2085,17 @@ class Interpreter extends events_1.EventEmitter {
2251
2085
  hasChanges = true;
2252
2086
  }
2253
2087
  }
2254
- // Validate field selectors
2255
2088
  if (scrapeListConfig.fields) {
2256
- for (const [fieldName, fieldConfig] of Object.entries(scrapeListConfig.fields)) {
2257
- const fieldSelectorWorks = yield this.testSelectorWorks(page, fieldConfig.selector, false);
2258
- if (!fieldSelectorWorks && fieldConfig.fallbackSelector) {
2089
+ const fieldEntries = Object.entries(scrapeListConfig.fields);
2090
+ const selectorTests = yield Promise.all(fieldEntries.map((_a) => __awaiter(this, [_a], void 0, function* ([fieldName, fieldConfig]) {
2091
+ return ({
2092
+ fieldName,
2093
+ fieldConfig,
2094
+ works: yield this.testSelectorWorks(page, fieldConfig.selector, false)
2095
+ });
2096
+ })));
2097
+ for (const { fieldName, fieldConfig, works } of selectorTests) {
2098
+ if (!works && fieldConfig.fallbackSelector) {
2259
2099
  console.log(`Field selector "${fieldConfig.selector}" for ${fieldName} not working, trying fallback...`);
2260
2100
  const newFieldSelector = yield this.generateSelectorFromFallback(page, fieldConfig.fallbackSelector, false, scrapeListConfig.listSelector);
2261
2101
  if (newFieldSelector) {
@@ -2263,14 +2103,20 @@ class Interpreter extends events_1.EventEmitter {
2263
2103
  fieldConfig.selector = newFieldSelector;
2264
2104
  hasChanges = true;
2265
2105
  }
2106
+ else {
2107
+ this.log(`WARNING: Failed to autoheal field "${fieldName}" - selector broken and fallback failed. Data for this field may be incomplete.`, logger_1.Level.WARN);
2108
+ this.trackAutohealFailure(`Field "${fieldName}" autoheal failed - both primary selector and fallback selector failed`);
2109
+ }
2110
+ }
2111
+ else if (!works) {
2112
+ this.log(`WARNING: Field "${fieldName}" selector not working and no fallback available. Data for this field may be incomplete.`, logger_1.Level.WARN);
2113
+ this.trackAutohealFailure(`Field "${fieldName}" selector broken with no fallback defined`);
2266
2114
  }
2267
2115
  }
2268
2116
  }
2269
- // Validate pagination selector if it exists and is not empty
2270
2117
  if (scrapeListConfig.pagination &&
2271
2118
  scrapeListConfig.pagination.selector &&
2272
2119
  scrapeListConfig.pagination.selector.trim() !== '') {
2273
- // Handle comma-separated pagination selectors
2274
2120
  const paginationSelectors = scrapeListConfig.pagination.selector.split(',').map(s => s.trim());
2275
2121
  let workingSelector = null;
2276
2122
  for (const selector of paginationSelectors) {
@@ -2311,19 +2157,34 @@ class Interpreter extends events_1.EventEmitter {
2311
2157
  return __awaiter(this, void 0, void 0, function* () {
2312
2158
  let hasChanges = false;
2313
2159
  try {
2314
- for (const [fieldName, fieldConfig] of Object.entries(scrapeSchemaConfig)) {
2315
- if (fieldConfig.selector) {
2316
- const selectorWorks = yield this.testSelectorWorks(page, fieldConfig.selector, false);
2317
- if (!selectorWorks && fieldConfig.fallbackSelector) {
2318
- console.log(`Schema field selector "${fieldConfig.selector}" for ${fieldName} not working, trying fallback...`);
2319
- const newSelector = yield this.generateSelectorFromFallback(page, fieldConfig.fallbackSelector, false);
2320
- if (newSelector) {
2321
- console.log(`Updated schema field selector for ${fieldName}: ${fieldConfig.selector} -> ${newSelector}`);
2322
- fieldConfig.selector = newSelector;
2323
- hasChanges = true;
2324
- }
2160
+ const schemaEntries = Object.entries(scrapeSchemaConfig);
2161
+ const selectorTests = yield Promise.all(schemaEntries
2162
+ .filter(([_, fieldConfig]) => fieldConfig.selector)
2163
+ .map((_a) => __awaiter(this, [_a], void 0, function* ([fieldName, fieldConfig]) {
2164
+ return ({
2165
+ fieldName,
2166
+ fieldConfig,
2167
+ works: yield this.testSelectorWorks(page, fieldConfig.selector, false)
2168
+ });
2169
+ })));
2170
+ for (const { fieldName, fieldConfig, works } of selectorTests) {
2171
+ if (!works && fieldConfig.fallbackSelector) {
2172
+ console.log(`Schema field selector "${fieldConfig.selector}" for ${fieldName} not working, trying fallback...`);
2173
+ const newSelector = yield this.generateSelectorFromFallback(page, fieldConfig.fallbackSelector, false);
2174
+ if (newSelector) {
2175
+ console.log(`Updated schema field selector for ${fieldName}: ${fieldConfig.selector} -> ${newSelector}`);
2176
+ fieldConfig.selector = newSelector;
2177
+ hasChanges = true;
2178
+ }
2179
+ else {
2180
+ this.log(`WARNING: Failed to autoheal schema field "${fieldName}" - selector broken and fallback failed. Data for this field may be incomplete.`, logger_1.Level.WARN);
2181
+ this.trackAutohealFailure(`Schema field "${fieldName}" autoheal failed - both primary selector and fallback selector failed`);
2325
2182
  }
2326
2183
  }
2184
+ else if (!works) {
2185
+ this.log(`WARNING: Schema field "${fieldName}" selector not working and no fallback available. Data for this field may be incomplete.`, logger_1.Level.WARN);
2186
+ this.trackAutohealFailure(`Schema field "${fieldName}" selector broken with no fallback defined`);
2187
+ }
2327
2188
  }
2328
2189
  }
2329
2190
  catch (error) {
@@ -2343,10 +2204,8 @@ class Interpreter extends events_1.EventEmitter {
2343
2204
  const modifiedAction = JSON.parse(JSON.stringify(action));
2344
2205
  let totalChanges = 0;
2345
2206
  try {
2346
- // Process each action in the 'what' array
2347
2207
  for (let i = 0; i < modifiedAction.what.length; i++) {
2348
2208
  const whatAction = modifiedAction.what[i];
2349
- // Handle scrapeList actions
2350
2209
  if (whatAction.action === 'scrapeList' && whatAction.args && whatAction.args[0]) {
2351
2210
  console.log(`Validating scrapeList action...`);
2352
2211
  const hasChanges = yield this.validateScrapeListAction(whatAction.args[0], page);
@@ -2355,7 +2214,6 @@ class Interpreter extends events_1.EventEmitter {
2355
2214
  console.log(`Fixed scrapeList selectors`);
2356
2215
  }
2357
2216
  }
2358
- // Handle scrapeSchema actions
2359
2217
  if (whatAction.action === 'scrapeSchema' && whatAction.args && whatAction.args[0]) {
2360
2218
  console.log(`Validating scrapeSchema action...`);
2361
2219
  const hasChanges = yield this.validateScrapeSchemaAction(whatAction.args[0], page);
@@ -2405,8 +2263,6 @@ class Interpreter extends events_1.EventEmitter {
2405
2263
  else {
2406
2264
  listElements = Array.from(document.querySelectorAll(selector));
2407
2265
  }
2408
- // Extract URLs from the first 'limit' elements that match the selector
2409
- // The limit corresponds to the number of items that were scraped
2410
2266
  const elementsToProcess = listElements.slice(0, limit);
2411
2267
  elementsToProcess.forEach(element => {
2412
2268
  const urls = [];
@@ -2433,9 +2289,8 @@ class Interpreter extends events_1.EventEmitter {
2433
2289
  * Workflow is bottom-to-top, so we scan from end to start.
2434
2290
  */
2435
2291
  buildDeepExtractionHierarchy(currentWorkflow) {
2436
- var _a, _b;
2292
+ var _a, _b, _c;
2437
2293
  const hierarchy = [];
2438
- // Find all goto action indices with their patterns
2439
2294
  const gotoData = [];
2440
2295
  currentWorkflow.forEach((pair, index) => {
2441
2296
  var _a;
@@ -2482,12 +2337,16 @@ class Interpreter extends events_1.EventEmitter {
2482
2337
  }
2483
2338
  let sourceActionName = '';
2484
2339
  let sourceActionType = 'scrapeList';
2340
+ let deepExtractionLimit = undefined;
2485
2341
  if (i === uniqueGotos.length - 1) {
2486
- const scrapeListBefore = currentWorkflow.slice(gotoIndex + 1).find(pair => pair.what && pair.what.some(action => action.action === 'scrapeList'));
2342
+ const scrapeListBefore = currentWorkflow.slice(0, gotoIndex).reverse().find(pair => pair.what && pair.what.some(action => action.action === 'scrapeList'));
2487
2343
  if (scrapeListBefore) {
2488
2344
  const scrapeListAction = scrapeListBefore.what.find(action => action.action === 'scrapeList');
2489
2345
  sourceActionName = ((_b = (_a = scrapeListAction === null || scrapeListAction === void 0 ? void 0 : scrapeListAction.args) === null || _a === void 0 ? void 0 : _a[0]) === null || _b === void 0 ? void 0 : _b.name) || (scrapeListAction === null || scrapeListAction === void 0 ? void 0 : scrapeListAction.name) || '';
2490
2346
  sourceActionType = 'scrapeList';
2347
+ if (((_c = scrapeListAction === null || scrapeListAction === void 0 ? void 0 : scrapeListAction.args) === null || _c === void 0 ? void 0 : _c[0]) && typeof scrapeListAction.args[0] === 'object') {
2348
+ deepExtractionLimit = scrapeListAction.args[0].deepExtractionLimit;
2349
+ }
2491
2350
  }
2492
2351
  }
2493
2352
  else {
@@ -2499,9 +2358,10 @@ class Interpreter extends events_1.EventEmitter {
2499
2358
  gotoPattern: String(gotoPattern),
2500
2359
  actionsToExecute: dataExtractionActions,
2501
2360
  sourceActionName,
2502
- sourceActionType
2361
+ sourceActionType,
2362
+ deepExtractionLimit
2503
2363
  });
2504
- this.log(`Level ${i}: goto at index ${gotoIndex}, pattern=${gotoPattern}, actions=${dataExtractionActions.length}`, logger_1.Level.LOG);
2364
+ this.log(`Level ${i}: goto at index ${gotoIndex}, pattern=${gotoPattern}, actions=${dataExtractionActions.length}${deepExtractionLimit ? `, deepLimit=${deepExtractionLimit}` : ''}`, logger_1.Level.LOG);
2505
2365
  }
2506
2366
  return hierarchy;
2507
2367
  }
@@ -2663,9 +2523,11 @@ class Interpreter extends events_1.EventEmitter {
2663
2523
  scrapeListIndex,
2664
2524
  url: matchingUrl
2665
2525
  });
2526
+ if (!matchingUrl) {
2527
+ this.deepExtractionStats.skippedDueToPattern++;
2528
+ }
2666
2529
  });
2667
2530
  const matchedCount = urlMappings.filter(m => m.url !== null).length;
2668
- this.log(`Filtered to ${matchedCount} matching URLs for deep extraction (out of ${scrapeResults.length} total items)`, logger_1.Level.LOG);
2669
2531
  if (matchedCount > 0) {
2670
2532
  const matchedMappings = urlMappings.filter(m => m.url !== null);
2671
2533
  const sampleSize = Math.min(5, matchedMappings.length);
@@ -2727,20 +2589,17 @@ class Interpreter extends events_1.EventEmitter {
2727
2589
  continue;
2728
2590
  if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
2729
2591
  continue;
2730
- let pathMatches = true;
2731
- for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
2732
- if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
2733
- pathMatches = false;
2734
- break;
2735
- }
2736
- }
2737
- if (!pathMatches)
2738
- continue;
2739
2592
  const urlNormalized = url.replace(/\/$/, '').toLowerCase();
2740
2593
  if (urlNormalized === targetNormalized) {
2741
2594
  this.log(`Excluding already-visited URL: ${url}`, logger_1.Level.LOG);
2742
2595
  continue;
2743
2596
  }
2597
+ const matched = this.matchesGotoPattern(url, String(gotoTargetPattern));
2598
+ if (!matched) {
2599
+ this.log(` ❌ Pattern mismatch for: ${url}`, logger_1.Level.LOG);
2600
+ continue;
2601
+ }
2602
+ this.log(` ✅ MATCHED: ${url}`, logger_1.Level.LOG);
2744
2603
  matchingUrl = url;
2745
2604
  break;
2746
2605
  }
@@ -2748,9 +2607,11 @@ class Interpreter extends events_1.EventEmitter {
2748
2607
  scrapeListIndex,
2749
2608
  url: matchingUrl
2750
2609
  });
2610
+ if (!matchingUrl) {
2611
+ this.deepExtractionStats.skippedDueToPattern++;
2612
+ }
2751
2613
  });
2752
- const matchedCount = urlMappings.filter(m => m.url !== null).length;
2753
- this.log(`Filtered to ${matchedCount} matching URLs for deep extraction (out of ${scrapeResults.length} total items)`, logger_1.Level.LOG);
2614
+ const matchedCount = extractedUrls.filter(urls => urls.length > 0).length;
2754
2615
  if (matchedCount > 0) {
2755
2616
  const matchedMappings = urlMappings.filter(m => m.url !== null);
2756
2617
  const sampleSize = Math.min(5, matchedMappings.length);
@@ -2773,6 +2634,16 @@ class Interpreter extends events_1.EventEmitter {
2773
2634
  /**
2774
2635
  * Helper function to check if a URL matches a goto pattern.
2775
2636
  */
2637
+ /**
2638
+ * Generic pattern matching for deep extraction URLs.
2639
+ * Works across any website by analyzing URL structure rather than relying on keywords.
2640
+ *
2641
+ * Strategy:
2642
+ * 1. Match URLs with same origin and path length
2643
+ * 2. Identify "structural" segments (numbers, short words, etc.) that should match exactly
2644
+ * 3. Allow other segments to vary (dynamic content like IDs, slugs, names)
2645
+ * 4. Skip exact matches to avoid duplicates
2646
+ */
2776
2647
  matchesGotoPattern(url, gotoPattern) {
2777
2648
  try {
2778
2649
  const getUrlPattern = (urlStr) => {
@@ -2797,11 +2668,42 @@ class Interpreter extends events_1.EventEmitter {
2797
2668
  if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
2798
2669
  return false;
2799
2670
  if (urlNormalized === targetNormalized)
2800
- return false; // Skip exact matches
2801
- for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
2802
- if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
2671
+ return false;
2672
+ /**
2673
+ * Generic heuristic to identify "structural" segments that should match exactly.
2674
+ * These are segments that define the URL structure, not the content.
2675
+ */
2676
+ const isStructuralSegment = (segment, index, totalSegments) => {
2677
+ const normalized = segment.toLowerCase();
2678
+ if (/^\d+$/.test(normalized)) {
2679
+ return true;
2680
+ }
2681
+ if (normalized.length >= 2 && normalized.length <= 5 && /^[a-z0-9-]+$/.test(normalized)) {
2682
+ return true;
2683
+ }
2684
+ const wordCount = normalized.split(/[-_]/).length;
2685
+ if (wordCount === 2 && normalized.length <= 15) {
2686
+ return true;
2687
+ }
2688
+ if (/^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$/i.test(normalized)) {
2689
+ return false;
2690
+ }
2691
+ if (normalized.length > 20 && /^[a-z0-9-]+$/.test(normalized)) {
2803
2692
  return false;
2804
2693
  }
2694
+ if (normalized.length >= 6 && normalized.length <= 20) {
2695
+ return false;
2696
+ }
2697
+ return index < Math.ceil(totalSegments / 2);
2698
+ };
2699
+ for (let i = 0; i < targetPattern.pathSegments.length; i++) {
2700
+ const targetSegment = targetPattern.pathSegments[i];
2701
+ const urlSegment = urlPattern.pathSegments[i];
2702
+ if (isStructuralSegment(targetSegment, i, targetPattern.pathSegments.length)) {
2703
+ if (targetSegment.toLowerCase() !== urlSegment.toLowerCase()) {
2704
+ return false;
2705
+ }
2706
+ }
2805
2707
  }
2806
2708
  return true;
2807
2709
  }
@@ -2822,24 +2724,52 @@ class Interpreter extends events_1.EventEmitter {
2822
2724
  }
2823
2725
  this.log(`\n=== Starting Hierarchical Deep Extraction (${hierarchy.length} level${hierarchy.length > 1 ? 's' : ''}) ===`, logger_1.Level.LOG);
2824
2726
  this.isInDeepExtractionPhase = true;
2825
- const startLevel = hierarchy.length >= 2 ? hierarchy.length - 2 : hierarchy.length - 1;
2826
- for (let levelIndex = startLevel; levelIndex >= 0; levelIndex--) {
2727
+ const startLevel = 0;
2728
+ for (let levelIndex = startLevel; levelIndex < hierarchy.length; levelIndex++) {
2827
2729
  const level = hierarchy[levelIndex];
2828
2730
  const currentLevelUrls = level.urlMappings;
2829
- this.log(`\n=== Processing Deep Extraction Level ${startLevel - levelIndex + 1}/${startLevel + 1} ===`, logger_1.Level.LOG);
2731
+ const parentLevel = levelIndex + 1 < hierarchy.length ? hierarchy[levelIndex + 1] : null;
2732
+ const effectiveLimit = (parentLevel === null || parentLevel === void 0 ? void 0 : parentLevel.deepExtractionLimit) || level.deepExtractionLimit;
2733
+ this.log(`\n=== Processing Deep Extraction Level ${levelIndex + 1}/${hierarchy.length} ===`, logger_1.Level.LOG);
2830
2734
  this.log(`Goto pattern: ${level.gotoPattern}`, logger_1.Level.LOG);
2831
2735
  this.log(`Actions to execute: ${level.actionsToExecute.length}`, logger_1.Level.LOG);
2832
2736
  this.log(`URLs to process: ${currentLevelUrls.filter(m => m.url !== null).length}`, logger_1.Level.LOG);
2737
+ if (effectiveLimit) {
2738
+ this.log(`Deep extraction limit: ${effectiveLimit}`, logger_1.Level.LOG);
2739
+ }
2833
2740
  if (currentLevelUrls.length === 0 || currentLevelUrls.every(u => !u.url)) {
2834
2741
  this.log('No valid URLs at this level - stopping here', logger_1.Level.LOG);
2835
2742
  break;
2836
2743
  }
2837
- yield this.executeDeepExtractionLevel(page, level, currentLevelUrls);
2744
+ yield this.executeDeepExtractionLevel(page, Object.assign(Object.assign({}, level), { deepExtractionLimit: effectiveLimit }), currentLevelUrls);
2838
2745
  }
2839
2746
  this.log('\n=== Hierarchical Deep Extraction Completed ===', logger_1.Level.LOG);
2747
+ if (this.deepExtractionStats.totalUrlsFound > 0) {
2748
+ yield this.callWithTimeout(() => this.options.serializableCallback({
2749
+ deepExtractionStats: {
2750
+ totalUrlsFound: this.deepExtractionStats.totalUrlsFound,
2751
+ successfulExtractions: this.deepExtractionStats.successfulExtractions,
2752
+ failedExtractions: this.deepExtractionStats.failedExtractions
2753
+ }
2754
+ }), 30000, 'serializableCallback (deep extraction stats)');
2755
+ }
2840
2756
  }
2841
2757
  catch (error) {
2842
2758
  this.log(`Hierarchical deep extraction failed: ${error.message}`, logger_1.Level.ERROR);
2759
+ if (this.deepExtractionStats.totalUrlsFound > 0) {
2760
+ try {
2761
+ yield this.callWithTimeout(() => this.options.serializableCallback({
2762
+ deepExtractionStats: {
2763
+ totalUrlsFound: this.deepExtractionStats.totalUrlsFound,
2764
+ successfulExtractions: this.deepExtractionStats.successfulExtractions,
2765
+ failedExtractions: this.deepExtractionStats.failedExtractions
2766
+ }
2767
+ }), 30000, 'serializableCallback (deep extraction stats error)');
2768
+ }
2769
+ catch (callbackError) {
2770
+ this.log(`Failed to send stats on error: ${callbackError.message}`, logger_1.Level.ERROR);
2771
+ }
2772
+ }
2843
2773
  }
2844
2774
  finally {
2845
2775
  this.isInDeepExtractionPhase = false;
@@ -2854,17 +2784,54 @@ class Interpreter extends events_1.EventEmitter {
2854
2784
  executeDeepExtractionLevel(page, level, urlMappings) {
2855
2785
  return __awaiter(this, void 0, void 0, function* () {
2856
2786
  try {
2857
- const validMappings = urlMappings.filter(m => m.url !== null);
2787
+ let validMappings = urlMappings.filter(m => m.url !== null);
2788
+ const deepExtractionLimit = level.deepExtractionLimit;
2789
+ if (deepExtractionLimit && validMappings.length > deepExtractionLimit) {
2790
+ this.log(`Found deepExtractionLimit: ${deepExtractionLimit} from parent action`, logger_1.Level.LOG);
2791
+ this.log(`Deep extraction limit applied: ${validMappings.length} URLs found, limiting to ${deepExtractionLimit}`, logger_1.Level.LOG);
2792
+ validMappings = validMappings.slice(0, deepExtractionLimit);
2793
+ }
2794
+ else if (deepExtractionLimit) {
2795
+ this.log(`Deep extraction limit: ${deepExtractionLimit} URLs configured (found ${validMappings.length} URLs, all within limit)`, logger_1.Level.LOG);
2796
+ }
2797
+ else {
2798
+ this.log(`No deep extraction limit configured, processing all ${validMappings.length} URLs`, logger_1.Level.LOG);
2799
+ }
2858
2800
  if (validMappings.length === 0) {
2859
2801
  this.log('No URLs to process for this level', logger_1.Level.LOG);
2860
2802
  return;
2861
2803
  }
2862
- this.log(`Processing ${validMappings.length} URLs`, logger_1.Level.LOG);
2804
+ const isFirstLevel = this.deepExtractionStats.totalUrlsFound === 0;
2805
+ if (isFirstLevel) {
2806
+ this.deepExtractionStats.totalUrlsFound = validMappings.length;
2807
+ }
2808
+ else {
2809
+ this.deepExtractionStats.totalUrlsFound += validMappings.length;
2810
+ }
2811
+ this.log(`Processing ${validMappings.length} URLs for deep extraction`, logger_1.Level.LOG);
2863
2812
  for (const mapping of validMappings) {
2864
2813
  try {
2865
2814
  this.log(`[${mapping.index}] Navigating to: ${mapping.url}`, logger_1.Level.LOG);
2866
2815
  yield page.goto(mapping.url);
2867
2816
  yield page.waitForLoadState('networkidle', { timeout: 30000 });
2817
+ const getTotalItems = () => {
2818
+ let total = 0;
2819
+ for (const key in this.serializableDataByType.scrapeSchema || {}) {
2820
+ const schemaData = this.serializableDataByType.scrapeSchema[key];
2821
+ if (Array.isArray(schemaData)) {
2822
+ total += schemaData.length;
2823
+ }
2824
+ else if (schemaData && typeof schemaData === 'object') {
2825
+ total += Object.keys(schemaData).length > 0 ? 1 : 0;
2826
+ }
2827
+ }
2828
+ for (const key in this.serializableDataByType.scrapeList || {}) {
2829
+ const listData = this.serializableDataByType.scrapeList[key];
2830
+ total += Array.isArray(listData) ? listData.length : 0;
2831
+ }
2832
+ return total;
2833
+ };
2834
+ const itemCountBefore = getTotalItems();
2868
2835
  for (let i = level.actionsToExecute.length - 1; i >= 0; i--) {
2869
2836
  const actionPair = level.actionsToExecute[i];
2870
2837
  if (this.isAborted) {
@@ -2879,10 +2846,34 @@ class Interpreter extends events_1.EventEmitter {
2879
2846
  yield this.carryOutSteps(page, filteredActions);
2880
2847
  }
2881
2848
  }
2882
- this.log(`[${mapping.index}] Completed`, logger_1.Level.LOG);
2849
+ const itemCountAfter = getTotalItems();
2850
+ const dataWasExtracted = itemCountAfter > itemCountBefore;
2851
+ if (dataWasExtracted) {
2852
+ this.log(`[${mapping.index}] Completed - Data extracted successfully (${itemCountAfter - itemCountBefore} items)`, logger_1.Level.LOG);
2853
+ this.deepExtractionStats.successfulExtractions++;
2854
+ }
2855
+ else {
2856
+ this.log(`[${mapping.index}] Completed - No data extracted`, logger_1.Level.WARN);
2857
+ this.deepExtractionStats.failedExtractions++;
2858
+ }
2859
+ yield this.callWithTimeout(() => this.options.serializableCallback({
2860
+ deepExtractionStats: {
2861
+ totalUrlsFound: this.deepExtractionStats.totalUrlsFound,
2862
+ successfulExtractions: this.deepExtractionStats.successfulExtractions,
2863
+ failedExtractions: this.deepExtractionStats.failedExtractions
2864
+ }
2865
+ }), 30000, 'serializableCallback (deep extraction item stats)');
2883
2866
  }
2884
2867
  catch (error) {
2885
2868
  this.log(`[${mapping.index}] Failed: ${error.message}`, logger_1.Level.ERROR);
2869
+ this.deepExtractionStats.failedExtractions++;
2870
+ yield this.callWithTimeout(() => this.options.serializableCallback({
2871
+ deepExtractionStats: {
2872
+ totalUrlsFound: this.deepExtractionStats.totalUrlsFound,
2873
+ successfulExtractions: this.deepExtractionStats.successfulExtractions,
2874
+ failedExtractions: this.deepExtractionStats.failedExtractions
2875
+ }
2876
+ }), 30000, 'serializableCallback (deep extraction failed item stats)');
2886
2877
  }
2887
2878
  }
2888
2879
  }
@@ -2903,10 +2894,7 @@ class Interpreter extends events_1.EventEmitter {
2903
2894
  catch (error) {
2904
2895
  this.log(`Failed to apply ad-blocker: ${error.message}`, logger_1.Level.ERROR);
2905
2896
  }
2906
- const usedActions = [];
2907
- let selectors = [];
2908
2897
  let lastAction = null;
2909
- let actionId = -1;
2910
2898
  let repeatCount = 0;
2911
2899
  /**
2912
2900
  * Enables the interpreter functionality for popup windows.
@@ -2919,12 +2907,11 @@ class Interpreter extends events_1.EventEmitter {
2919
2907
  p.on('popup', popupHandler);
2920
2908
  /* eslint no-constant-condition: ["warn", { "checkLoops": false }] */
2921
2909
  let loopIterations = 0;
2922
- const MAX_LOOP_ITERATIONS = 1000; // Circuit breaker
2910
+ const MAX_LOOP_ITERATIONS = 1000;
2923
2911
  let consecutiveFailures = 0;
2924
2912
  const MAX_CONSECUTIVE_FAILURES = 10;
2925
2913
  const startTime = Date.now();
2926
- const MAX_EXECUTION_TIME = 30 * 60 * 1000; // 30 minutes max
2927
- // Cleanup function to remove popup listener
2914
+ const MAX_EXECUTION_TIME = 30 * 60 * 1000;
2928
2915
  const cleanup = () => {
2929
2916
  try {
2930
2917
  if (!p.isClosed()) {
@@ -2935,32 +2922,26 @@ class Interpreter extends events_1.EventEmitter {
2935
2922
  }
2936
2923
  };
2937
2924
  while (true) {
2938
- // Multiple circuit breakers to prevent infinite loops
2939
2925
  if (++loopIterations > MAX_LOOP_ITERATIONS) {
2940
2926
  this.log('Maximum loop iterations reached, terminating to prevent infinite loop', logger_1.Level.ERROR);
2941
2927
  cleanup();
2942
2928
  return;
2943
2929
  }
2944
- // Time-based circuit breaker
2945
2930
  if (Date.now() - startTime > MAX_EXECUTION_TIME) {
2946
2931
  this.log('Maximum execution time reached (30 minutes), terminating workflow', logger_1.Level.ERROR);
2947
2932
  cleanup();
2948
2933
  return;
2949
2934
  }
2950
- // Failure-based circuit breaker
2951
2935
  if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
2952
2936
  this.log('Too many consecutive failures, terminating to prevent hang', logger_1.Level.ERROR);
2953
2937
  cleanup();
2954
2938
  return;
2955
2939
  }
2956
- // Check abort flag immediately
2957
2940
  if (this.isAborted) {
2958
2941
  this.log('Workflow aborted in runLoop', logger_1.Level.WARN);
2959
2942
  cleanup();
2960
2943
  return;
2961
2944
  }
2962
- // Checks whether the page was closed from outside,
2963
- // or the workflow execution has been stopped via `interpreter.stop()`
2964
2945
  if (p.isClosed() || !this.stopper) {
2965
2946
  cleanup();
2966
2947
  return;
@@ -2992,55 +2973,12 @@ class Interpreter extends events_1.EventEmitter {
2992
2973
  cleanup();
2993
2974
  return;
2994
2975
  }
2995
- // const newSelectors = this.getSelectors(workflowCopy);
2996
- // newSelectors.forEach(selector => {
2997
- // if (!selectors.includes(selector)) {
2998
- // selectors.push(selector);
2999
- // }
3000
- // });
3001
- // let pageState = {};
3002
- // let getStateTest = "Hello";
3003
- // try {
3004
- // pageState = await this.getState(p, workflowCopy, selectors);
3005
- // selectors = [];
3006
- // console.log("Empty selectors:", selectors)
3007
- // } catch (e: any) {
3008
- // this.log('The browser has been closed.');
3009
- // return;
3010
- // }
3011
- // if (this.options.debug) {
3012
- // this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, Level.WARN);
3013
- // }
3014
- // const actionId = workflow.findIndex((step) => {
3015
- // const isApplicable = this.applicable(step.where, pageState, usedActions);
3016
- // console.log("-------------------------------------------------------------");
3017
- // console.log(`Where:`, step.where);
3018
- // console.log(`Page state:`, pageState);
3019
- // console.log(`Match result: ${isApplicable}`);
3020
- // console.log("-------------------------------------------------------------");
3021
- // return isApplicable;
3022
- // });
3023
- // if (workflowCopy[0]) {
3024
- // if (workflowCopy[0].what[1].action === 'scrapeSchema') {
3025
- // const schema = workflowCopy[0].what[1].args[0];
3026
- // await p.goto(workflowCopy[0].where.url.toString())
3027
- // await p.waitForLoadState();
3028
- // const changes = await this.detectElementChanges(p, schema);
3029
- // console.log("Page URL: ", workflowCopy[0].where.url.toString());
3030
- // console.log("SCHEMA CHANGES:", changes);
3031
- // }
3032
- // }
3033
- // actionId = this.getMatchingActionId(workflowCopy, pageState, usedActions);
3034
- // if (actionId !== -1 && workflowCopy[actionId]) {
3035
- // workflowCopy[actionId] = await this.validateWorkflowAction(p, workflowCopy[actionId]);
3036
- // }
3037
2976
  const actionId = workflowCopy.length - 1;
3038
2977
  const action = workflowCopy[actionId];
3039
2978
  console.log("MATCHED ACTION:", action);
3040
2979
  console.log("MATCHED ACTION ID:", actionId);
3041
2980
  this.log(`Matched ${JSON.stringify(action === null || action === void 0 ? void 0 : action.where)}`, logger_1.Level.LOG);
3042
- if (action) { // action is matched
3043
- // Check abort flag before executing action
2981
+ if (action) {
3044
2982
  if (this.isAborted) {
3045
2983
  this.log('Workflow aborted before action execution', logger_1.Level.WARN);
3046
2984
  return;
@@ -3054,27 +2992,43 @@ class Interpreter extends events_1.EventEmitter {
3054
2992
  return;
3055
2993
  }
3056
2994
  lastAction = action;
3057
- try {
3058
- const validatedAction = yield this.validateAndFixSelectors(p, action);
3059
- console.log("Carrying out:", validatedAction.what);
3060
- yield this.carryOutSteps(p, validatedAction.what, workflowCopy);
3061
- usedActions.push((_b = action.id) !== null && _b !== void 0 ? _b : 'undefined');
3062
- workflowCopy.splice(actionId, 1);
3063
- console.log(`Action with ID ${action.id} removed from the workflow copy.`);
3064
- // Reset counters on successful action (but keep some history to prevent infinite resets)
3065
- loopIterations = Math.max(0, loopIterations - 10);
3066
- consecutiveFailures = 0;
3067
- // Add async yield to prevent event loop blocking
3068
- if (loopIterations % 10 === 0) {
3069
- yield new Promise(resolve => setImmediate(resolve));
3070
- }
3071
- }
3072
- catch (e) {
3073
- this.log(e, logger_1.Level.ERROR);
3074
- consecutiveFailures++;
3075
- // Add delay on failures to prevent tight error loops
3076
- yield new Promise(resolve => setTimeout(resolve, Math.min(1000, consecutiveFailures * 200)));
3077
- // Don't crash on individual action failures - continue with next iteration
2995
+ const MAX_ACTION_RETRIES = 3;
2996
+ let actionRetries = 0;
2997
+ let actionSucceeded = false;
2998
+ while (actionRetries < MAX_ACTION_RETRIES && !actionSucceeded) {
2999
+ try {
3000
+ const validatedAction = yield this.validateAndFixSelectors(p, action);
3001
+ console.log("Carrying out:", validatedAction.what);
3002
+ yield this.carryOutSteps(p, validatedAction.what, workflowCopy);
3003
+ workflowCopy.splice(actionId, 1);
3004
+ console.log(`Action with ID ${action.id} removed from the workflow copy.`);
3005
+ this.executedActions++;
3006
+ const percentage = Math.round((this.executedActions / this.totalActions) * 100);
3007
+ if ((_b = this.options.debugChannel) === null || _b === void 0 ? void 0 : _b.progressUpdate) {
3008
+ this.options.debugChannel.progressUpdate(this.executedActions, this.totalActions, percentage);
3009
+ }
3010
+ actionSucceeded = true;
3011
+ consecutiveFailures = 0;
3012
+ loopIterations = Math.max(0, loopIterations - 10);
3013
+ if (loopIterations % 10 === 0) {
3014
+ yield new Promise(resolve => setImmediate(resolve));
3015
+ }
3016
+ }
3017
+ catch (e) {
3018
+ actionRetries++;
3019
+ this.log(e, logger_1.Level.ERROR);
3020
+ if (actionRetries < MAX_ACTION_RETRIES) {
3021
+ this.log(`Retrying action (attempt ${actionRetries + 1}/${MAX_ACTION_RETRIES})`, logger_1.Level.WARN);
3022
+ yield new Promise(resolve => setTimeout(resolve, 1000 * actionRetries));
3023
+ }
3024
+ else {
3025
+ this.log(`Action failed after ${MAX_ACTION_RETRIES} retries`, logger_1.Level.ERROR);
3026
+ consecutiveFailures++;
3027
+ yield new Promise(resolve => setTimeout(resolve, Math.min(1000, consecutiveFailures * 200)));
3028
+ }
3029
+ }
3030
+ }
3031
+ if (!actionSucceeded) {
3078
3032
  continue;
3079
3033
  }
3080
3034
  }
@@ -3088,8 +3042,8 @@ class Interpreter extends events_1.EventEmitter {
3088
3042
  }
3089
3043
  ensureScriptsLoaded(page) {
3090
3044
  return __awaiter(this, void 0, void 0, function* () {
3045
+ let scriptsLoaded = false;
3091
3046
  try {
3092
- // Add timeout to prevent hanging on script evaluation
3093
3047
  const evaluationPromise = page.evaluate(() => typeof window.scrape === 'function' &&
3094
3048
  typeof window.scrapeSchema === 'function' &&
3095
3049
  typeof window.scrapeList === 'function' &&
@@ -3097,22 +3051,21 @@ class Interpreter extends events_1.EventEmitter {
3097
3051
  typeof window.scrollDown === 'function' &&
3098
3052
  typeof window.scrollUp === 'function');
3099
3053
  const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error('Script check timeout')), 3000));
3100
- const isScriptLoaded = yield Promise.race([
3054
+ scriptsLoaded = yield Promise.race([
3101
3055
  evaluationPromise,
3102
3056
  timeoutPromise
3103
3057
  ]);
3104
- if (!isScriptLoaded) {
3105
- yield page.addInitScript({ path: path_1.default.join(__dirname, 'browserSide', 'scraper.js') });
3106
- }
3107
3058
  }
3108
3059
  catch (error) {
3109
- // If script check fails, try to add the script anyway
3110
- this.log(`Script check failed, adding script anyway: ${error.message}`, logger_1.Level.WARN);
3060
+ this.log(`Script check failed or timed out: ${error.message}`, logger_1.Level.WARN);
3061
+ scriptsLoaded = false;
3062
+ }
3063
+ if (!scriptsLoaded) {
3111
3064
  try {
3112
3065
  yield page.addInitScript({ path: path_1.default.join(__dirname, 'browserSide', 'scraper.js') });
3113
3066
  }
3114
3067
  catch (scriptError) {
3115
- this.log(`Failed to add script: ${scriptError.message}`, logger_1.Level.ERROR);
3068
+ this.log(`Failed to add scraper script: ${scriptError.message}`, logger_1.Level.ERROR);
3116
3069
  }
3117
3070
  }
3118
3071
  });
@@ -3127,10 +3080,10 @@ class Interpreter extends events_1.EventEmitter {
3127
3080
  */
3128
3081
  run(page, params) {
3129
3082
  return __awaiter(this, void 0, void 0, function* () {
3083
+ var _a;
3130
3084
  this.log('Starting the workflow.', logger_1.Level.LOG);
3131
3085
  const context = page.context();
3132
3086
  page.setDefaultNavigationTimeout(100000);
3133
- // Check proxy settings from context options
3134
3087
  const contextOptions = context._options;
3135
3088
  const hasProxy = !!(contextOptions === null || contextOptions === void 0 ? void 0 : contextOptions.proxy);
3136
3089
  this.log(`Proxy settings: ${hasProxy ? `Proxy is configured...` : 'No proxy configured...'}`);
@@ -3146,6 +3099,11 @@ class Interpreter extends events_1.EventEmitter {
3146
3099
  * `this.workflow` with the parameters initialized.
3147
3100
  */
3148
3101
  this.initializedWorkflow = preprocessor_1.default.initWorkflow(this.workflow, params);
3102
+ this.totalActions = this.initializedWorkflow.length;
3103
+ this.executedActions = 0;
3104
+ if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.progressUpdate) {
3105
+ this.options.debugChannel.progressUpdate(0, this.totalActions, 0);
3106
+ }
3149
3107
  yield this.ensureScriptsLoaded(page);
3150
3108
  this.stopper = () => {
3151
3109
  this.stopper = null;
@@ -3173,7 +3131,6 @@ class Interpreter extends events_1.EventEmitter {
3173
3131
  cleanup() {
3174
3132
  return __awaiter(this, void 0, void 0, function* () {
3175
3133
  try {
3176
- // Stop any running workflows first
3177
3134
  if (this.stopper) {
3178
3135
  try {
3179
3136
  yield this.stop();
@@ -3182,7 +3139,6 @@ class Interpreter extends events_1.EventEmitter {
3182
3139
  this.log(`Error stopping workflow during cleanup: ${error.message}`, logger_1.Level.WARN);
3183
3140
  }
3184
3141
  }
3185
- // Clear ad-blocker resources
3186
3142
  if (this.blocker) {
3187
3143
  try {
3188
3144
  this.blocker = null;
@@ -3192,12 +3148,10 @@ class Interpreter extends events_1.EventEmitter {
3192
3148
  this.log(`Error cleaning up ad-blocker: ${error.message}`, logger_1.Level.WARN);
3193
3149
  }
3194
3150
  }
3195
- // Clear accumulated data to free memory
3196
3151
  this.cumulativeResults = [];
3197
3152
  this.autohealFailures = [];
3198
3153
  this.namedResults = {};
3199
3154
  this.serializableDataByType = { scrapeList: {}, scrapeSchema: {} };
3200
- // Reset state
3201
3155
  this.isAborted = false;
3202
3156
  this.initializedWorkflow = null;
3203
3157
  this.log('Interpreter cleanup completed', logger_1.Level.DEBUG);