mx-cloud 0.0.23 → 0.0.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -38,6 +38,7 @@ interface InterpreterOptions {
  serializableCallback: (output: any) => (void | Promise<void>);
  binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
  debug: boolean;
+ robotType?: 'extract' | 'scrape' | 'deep-extract';
  debugChannel: Partial<{
  activeId: (id: number) => void;
  debugMessage: (msg: string) => void;
@@ -63,6 +64,8 @@ export default class Interpreter extends EventEmitter {
  private screenshotCounter;
  private scrapeListCounter;
  private serializableDataByType;
+ private pendingDeepExtraction;
+ private isInDeepExtractionPhase;
  constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
  trackAutohealFailure(error: string): void;
  private applyAdBlocker;
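The new `robotType` option is the switch for the deep-extract behaviour added in this release; the compiled code below only ever checks for `'deep-extract'`, so `'extract'` and `'scrape'` appear to leave behaviour unchanged. A minimal usage sketch, assuming the package's build entry default-exports `Interpreter` and that a `WorkflowFile` value is loaded elsewhere (both assumptions, not confirmed by this diff):

```ts
// Hypothetical usage sketch; the import path and the workflow value are
// assumptions based on the constructor signature shown in this diff.
import Interpreter from 'mx-cloud';

declare const workflow: any; // a WorkflowFile, loaded elsewhere

const interpreter = new Interpreter(workflow, {
  robotType: 'deep-extract', // new in 0.0.25
  serializableCallback: (output: any) => console.log(output),
});
```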
@@ -148,6 +151,56 @@ export default class Interpreter extends EventEmitter {
  * @returns {Promise<WhereWhatPair>} - The potentially modified action
  */
  private validateAndFixSelectors;
+ /**
+ * Extracts URLs from the current page's list elements.
+ * Used during pagination to maintain sync between scraped results and extracted URLs.
+ *
+ * @param page - Playwright page object
+ * @param listSelector - The selector used to identify list elements
+ * @param limit - Maximum number of elements to process (should match number of scraped items)
+ * @returns Array of URL arrays, one per list element
+ */
+ private extractUrlsFromCurrentPage;
+ /**
+ * Builds a hierarchical deep extraction plan by analyzing the workflow structure.
+ * Identifies goto actions and determines what actions to execute at each level.
+ * Workflow is bottom-to-top, so we scan from end to start.
+ */
+ private buildDeepExtractionHierarchy;
+ /**
+ * Extracts hrefs directly from the page based on scrapeSchema selectors.
+ * Checks ALL selectors from the schema config - if they point to anchor elements, extract href.
+ * This is called after scrapeSchema executes to capture hrefs for deep extraction.
+ */
+ private extractHrefsFromPage;
+ /**
+ * Filters URLs for deep extraction based on the goto action pattern.
+ * This is called immediately after the first capture action (scrapeList).
+ * Returns the filtered URL mappings that should be processed after workflow completion.
+ * Each mapping maintains alignment with the original scrapeList index.
+ */
+ private filterDeepExtractionUrls;
+ /**
+ * Filters pre-extracted URLs for deep extraction based on the goto action pattern.
+ * This is used for paginated lists where URLs were extracted during pagination.
+ * Returns the filtered URL mappings that maintain alignment with scrapeList indices.
+ */
+ private filterDeepExtractionUrlsFromExtracted;
+ /**
+ * Helper function to check if a URL matches a goto pattern.
+ */
+ private matchesGotoPattern;
+ /**
+ * Executes hierarchical deep extraction by processing each level recursively.
+ * URLs are already stored in each hierarchy level's urlMappings during workflow execution.
+ */
+ private executeHierarchicalDeepExtraction;
+ /**
+ * Executes deep extraction for a single level.
+ * URLs are already extracted and stored in hierarchy during workflow execution.
+ * This function just navigates to URLs and executes the capture actions.
+ */
+ private executeDeepExtractionLevel;
  private runLoop;
  private ensureScriptsLoaded;
  /**
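Taken together, the new private members describe a two-phase design: capture actions record candidate URLs into a pending plan while the workflow runs, and the plan is executed once the workflow finishes. The declarations are untyped here, but the compiled code further down implies roughly the following shapes; the interface names below are ours, inferred for illustration, not exported by the package:

```ts
// Shapes inferred from the compiled code in this diff; names are illustrative only.
// WhereWhatPair and Page stand for the package's workflow-pair type and the
// Playwright page type, respectively.
interface UrlMapping {
  index: number;       // position in the originating scrapeList results
  url: string | null;  // null when no URL matched the goto pattern
}

interface DeepExtractionLevel {
  gotoPattern: string;               // goto URL that defines this level
  actionsToExecute: unknown[];       // WhereWhatPair[]: capture actions replayed per URL
  urlMappings: UrlMapping[];         // filled in as the workflow executes
}

interface PendingDeepExtraction {
  page: unknown;                     // the Playwright Page the plan was built against
  hierarchy: DeepExtractionLevel[];
}
```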
@@ -74,6 +74,8 @@ class Interpreter extends events_1.EventEmitter {
  scrapeList: {},
  scrapeSchema: {}
  };
+ this.pendingDeepExtraction = null;
+ this.isInDeepExtractionPhase = false;
  this.workflow = workflow.workflow;
  this.initializedWorkflow = null;
  this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
@@ -338,7 +340,7 @@ class Interpreter extends events_1.EventEmitter {
  * @param page Playwright Page object
  * @param steps Array of actions.
  */
- carryOutSteps(page, steps) {
+ carryOutSteps(page, steps, currentWorkflow) {
  return __awaiter(this, void 0, void 0, function* () {
  var _a, _b;
  // Check abort flag at start of execution
@@ -430,9 +432,8 @@ class Interpreter extends events_1.EventEmitter {
  const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
  yield this.options.serializableCallback(scrapeResults);
  }),
- scrapeSchema: (schema) => __awaiter(this, void 0, void 0, function* () {
+ scrapeSchema: (schema_1, ...args_1) => __awaiter(this, [schema_1, ...args_1], void 0, function* (schema, actionName = "") {
  var _a;
- // Check abort flag at start of scraping
  if (this.isAborted) {
  this.log('Workflow aborted, stopping scrapeSchema', logger_1.Level.WARN);
  return;
@@ -451,7 +452,6 @@ class Interpreter extends events_1.EventEmitter {
  }
  const resultToProcess = Array.isArray(scrapeResult) ? scrapeResult[0] : scrapeResult;
  if (this.cumulativeResults.length === 0) {
- // First execution - create initial row
  const newRow = {};
  Object.entries(resultToProcess).forEach(([key, value]) => {
  if (value !== undefined) {
@@ -461,12 +461,10 @@ class Interpreter extends events_1.EventEmitter {
  this.cumulativeResults.push(newRow);
  }
  else {
- // Check if any keys from new result already exist in the last row
  const lastRow = this.cumulativeResults[this.cumulativeResults.length - 1];
  const newResultKeys = Object.keys(resultToProcess).filter(key => resultToProcess[key] !== undefined);
  const hasRepeatedKeys = newResultKeys.some(key => lastRow.hasOwnProperty(key));
  if (hasRepeatedKeys) {
- // Keys are repeated - create a new row
  const newRow = {};
  Object.entries(resultToProcess).forEach(([key, value]) => {
  if (value !== undefined) {
@@ -476,7 +474,6 @@ class Interpreter extends events_1.EventEmitter {
  this.cumulativeResults.push(newRow);
  }
  else {
- // No repeated keys - merge with the last row
  Object.entries(resultToProcess).forEach(([key, value]) => {
  if (value !== undefined) {
  lastRow[key] = value;
@@ -484,30 +481,102 @@ class Interpreter extends events_1.EventEmitter {
  });
  }
  }
- console.log("Total accumulated rows:", this.cumulativeResults.length);
- console.log("Current results:", this.cumulativeResults);
- // ✅ Append schema results under "scrapeSchema" → name
  const actionType = "scrapeSchema";
- const actionName = schema.__name || "Texts";
+ const name = actionName || "Texts";
  if (!this.namedResults[actionType])
  this.namedResults[actionType] = {};
- this.namedResults[actionType][actionName] = this.cumulativeResults;
+ this.namedResults[actionType][name] = this.cumulativeResults;
  if (!this.serializableDataByType[actionType])
  this.serializableDataByType[actionType] = {};
- if (!this.serializableDataByType[actionType][actionName]) {
- this.serializableDataByType[actionType][actionName] = [];
+ if (!this.serializableDataByType[actionType][name]) {
+ this.serializableDataByType[actionType][name] = [];
  }
- // Store as array (matching cumulativeResults structure)
- this.serializableDataByType[actionType][actionName] = [...this.cumulativeResults];
- // now emit full structured object
+ this.serializableDataByType[actionType][name] = [...this.cumulativeResults];
  yield this.options.serializableCallback({
  scrapeList: this.serializableDataByType.scrapeList,
  scrapeSchema: this.serializableDataByType.scrapeSchema
  });
+ if (this.options.robotType === 'deep-extract' && !this.isInDeepExtractionPhase && this.initializedWorkflow) {
+ if (!this.pendingDeepExtraction) {
+ console.log('DEBUG: Building hierarchical deep extraction plan from scrapeSchema...');
+ const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
+ if (hierarchyData.length > 0) {
+ const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
+ const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
+ this.log(`Root scrapeSchema will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
+ // Extract URLs from schema fields
+ const urls = yield this.extractHrefsFromPage(page, schema);
+ this.log(`scrapeSchema extracted ${urls.length} URLs from field selectors`, logger_1.Level.LOG);
+ // Filter URLs against pattern
+ const rootUrlMappings = urls
+ .map((url, index) => ({
+ scrapeListIndex: index,
+ url: this.matchesGotoPattern(url, nextLevelGotoPattern) ? url : null
+ }))
+ .filter(m => m.url !== null);
+ this.log(`Matched ${rootUrlMappings.length} URLs against pattern ${nextLevelGotoPattern}`, logger_1.Level.LOG);
+ this.pendingDeepExtraction = {
+ page,
+ hierarchy: hierarchyData.map((level, idx) => ({
+ gotoPattern: level.gotoPattern,
+ actionsToExecute: level.actionsToExecute,
+ urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
+ }))
+ };
+ }
+ else {
+ console.log('DEBUG: No goto actions found, deep extraction skipped');
+ }
+ }
+ else {
+ this.log(`[Deep Extract] scrapeSchema "${name}" extracting URLs during workflow execution`, logger_1.Level.LOG);
+ const hierarchy = this.pendingDeepExtraction.hierarchy;
+ if (hierarchy && hierarchy.length > 0) {
+ let targetLevelIndex = -1;
+ for (let i = hierarchy.length - 1; i >= 0; i--) {
+ if (hierarchy[i].urlMappings.length === 0) {
+ targetLevelIndex = i;
+ break;
+ }
+ }
+ if (targetLevelIndex >= 0) {
+ const targetGotoPattern = hierarchy[targetLevelIndex].gotoPattern;
+ this.log(`[Deep Extract] Storing URLs for level ${targetLevelIndex}, pattern: ${targetGotoPattern}`, logger_1.Level.LOG);
+ const urls = yield this.extractHrefsFromPage(page, schema);
+ this.log(`[Deep Extract] Extracted ${urls.length} URLs from scrapeSchema field selectors`, logger_1.Level.LOG);
+ const urlMappings = urls
+ .map((url, index) => ({
+ index,
+ url: this.matchesGotoPattern(url, targetGotoPattern) ? url : null
+ }))
+ .filter(m => m.url !== null);
+ if (hierarchy[targetLevelIndex].urlMappings.length > 0) {
+ const existingUrls = new Set(hierarchy[targetLevelIndex].urlMappings.map(m => m.url).filter(u => u !== null));
+ const newUrls = urlMappings.filter(m => m.url !== null && !existingUrls.has(m.url));
+ if (newUrls.length > 0) {
+ const startIndex = hierarchy[targetLevelIndex].urlMappings.length;
+ hierarchy[targetLevelIndex].urlMappings.push(...newUrls.map((m, idx) => ({ index: startIndex + idx, url: m.url })));
+ this.log(`[Deep Extract] Merged ${newUrls.length} new URLs from scrapeSchema`, logger_1.Level.LOG);
+ }
+ }
+ else {
+ hierarchy[targetLevelIndex].urlMappings = urlMappings;
+ }
+ this.log(`[Deep Extract] Stored ${urlMappings.length} matching URLs`, logger_1.Level.LOG);
+ if (urlMappings.length > 0) {
+ const sampleSize = Math.min(3, urlMappings.length);
+ this.log(`[Deep Extract] Sample URLs (showing ${sampleSize} of ${urlMappings.length}):`, logger_1.Level.LOG);
+ urlMappings.slice(0, sampleSize).forEach((mapping, idx) => {
+ this.log(`[Deep Extract] ${idx + 1}. ${mapping.url}`, logger_1.Level.LOG);
+ });
+ }
+ }
+ }
+ }
+ }
  }),
- scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
+ scrapeList: (config_1, ...args_1) => __awaiter(this, [config_1, ...args_1], void 0, function* (config, actionName = "") {
  var _a, _b;
- // Check abort flag at start of scraping
  if (this.isAborted) {
  this.log('Workflow aborted, stopping scrapeList', logger_1.Level.WARN);
  return;
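The index arithmetic in the hunk above is easy to misread: the first capture seeds `hierarchyData.length - 2` (when at least two levels exist), and each later capture fills the highest-indexed level whose `urlMappings` is still empty. A worked sketch of that selection with a hypothetical three-level hierarchy, using the same expressions as the compiled code:

```ts
// Worked example of the level-selection arithmetic above; URLs are hypothetical.
const hierarchy = [
  { gotoPattern: 'https://example.com/a/b/c', urlMappings: [] as { index: number; url: string | null }[] },
  { gotoPattern: 'https://example.com/a/b',   urlMappings: [] as { index: number; url: string | null }[] },
  { gotoPattern: 'https://example.com/a',     urlMappings: [] as { index: number; url: string | null }[] },
];

// Root capture: hierarchy.length >= 2, so it seeds index length - 2 = 1.
const nextLevelIndex = hierarchy.length >= 2 ? hierarchy.length - 2 : hierarchy.length - 1;
hierarchy[nextLevelIndex].urlMappings = [{ index: 0, url: 'https://example.com/a/b/x' }];

// A later capture scans from the end for the first level still lacking URLs.
let targetLevelIndex = -1;
for (let i = hierarchy.length - 1; i >= 0; i--) {
  if (hierarchy[i].urlMappings.length === 0) { targetLevelIndex = i; break; }
}
console.log(nextLevelIndex, targetLevelIndex); // 1 2
```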
@@ -533,53 +602,156 @@ class Interpreter extends events_1.EventEmitter {
  }
  catch (error) {
  console.warn('ScrapeList evaluation failed:', error.message);
- return []; // Return empty array instead of failing
+ return [];
  }
  }, config);
  }
  else {
  paginationUsed = true;
- scrapeResults = yield this.handlePagination(page, config);
+ const paginationResult = yield this.handlePagination(page, config, actionName);
+ scrapeResults = paginationResult.results;
+ const paginationUrls = paginationResult.urls;
+ if (this.options.robotType === 'deep-extract' && this.initializedWorkflow && scrapeResults.length > 0) {
+ if (!this.pendingDeepExtraction) {
+ console.log('DEBUG: Building hierarchical deep extraction plan from pagination...');
+ const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
+ if (hierarchyData.length > 0) {
+ const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
+ const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
+ this.log(`Root scrapeList (pagination) will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
+ const rootUrlMappings = this.filterDeepExtractionUrlsFromExtracted(paginationUrls, scrapeResults, nextLevelGotoPattern);
+ this.pendingDeepExtraction = {
+ page,
+ hierarchy: hierarchyData.map((level, idx) => ({
+ gotoPattern: level.gotoPattern,
+ actionsToExecute: level.actionsToExecute,
+ urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
+ }))
+ };
+ }
+ }
+ else {
+ this.log(`[Deep Extract] scrapeList (pagination) "${actionName}" extracting URLs`, logger_1.Level.LOG);
+ const hierarchy = this.pendingDeepExtraction.hierarchy;
+ if (hierarchy && hierarchy.length > 0) {
+ const nextLevelIndex = hierarchy.length >= 3 ? hierarchy.length - 3 : 0;
+ if (nextLevelIndex >= 0 && hierarchy[nextLevelIndex]) {
+ const nextGotoPattern = hierarchy[nextLevelIndex].gotoPattern;
+ this.log(`[Deep Extract] Extracting URLs for pattern: ${nextGotoPattern}`, logger_1.Level.LOG);
+ const urlMappings = this.filterDeepExtractionUrlsFromExtracted(paginationUrls, scrapeResults, nextGotoPattern);
+ this.log(`[Deep Extract] Found ${urlMappings.filter(m => m.url !== null).length} matching URLs`, logger_1.Level.LOG);
+ const validUrls = urlMappings.filter(m => m.url !== null);
+ if (validUrls.length > 0) {
+ const sampleSize = Math.min(3, validUrls.length);
+ this.log(`[Deep Extract] Sample URLs (showing ${sampleSize} of ${validUrls.length}):`, logger_1.Level.LOG);
+ validUrls.slice(0, sampleSize).forEach((mapping, idx) => {
+ this.log(`[Deep Extract] ${idx + 1}. ${mapping.url}`, logger_1.Level.LOG);
+ });
+ }
+ }
+ }
+ }
+ }
  }
- // Ensure we always have an array
  if (!Array.isArray(scrapeResults)) {
  scrapeResults = [];
  }
- console.log(`ScrapeList completed with ${scrapeResults.length} results`);
- // Only process and callback if pagination wasn't used
- // (handlePagination already handles storage and callbacks internally)
  if (!paginationUsed) {
- // ✅ Append list results under "scrapeList" → name
  const actionType = "scrapeList";
- let actionName = config.__name || "";
- // If no name provided, generate a unique one
- if (!actionName || actionName.trim() === "") {
+ let name = actionName || "";
+ if (!name || name.trim() === "" || this.isInDeepExtractionPhase) {
  this.scrapeListCounter++;
- actionName = `List ${this.scrapeListCounter}`;
+ name = `List ${this.scrapeListCounter}`;
  }
  if (!this.serializableDataByType[actionType])
  this.serializableDataByType[actionType] = {};
- if (!this.serializableDataByType[actionType][actionName]) {
- this.serializableDataByType[actionType][actionName] = [];
+ if (!this.serializableDataByType[actionType][name]) {
+ this.serializableDataByType[actionType][name] = [];
  }
- this.serializableDataByType[actionType][actionName].push(...scrapeResults);
+ this.serializableDataByType[actionType][name].push(...scrapeResults);
  yield this.options.serializableCallback({
  scrapeList: this.serializableDataByType.scrapeList,
  scrapeSchema: this.serializableDataByType.scrapeSchema
  });
+ console.log(`DEBUG: Checking deep extract condition: robotType=${this.options.robotType}, hasWorkflow=${!!currentWorkflow}, alreadyPending=${!!this.pendingDeepExtraction}`);
+ if (this.options.robotType === 'deep-extract' && !this.isInDeepExtractionPhase && this.initializedWorkflow) {
+ if (!this.pendingDeepExtraction) {
+ console.log('DEBUG: Building hierarchical deep extraction plan...');
+ const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
+ if (hierarchyData.length > 0) {
+ const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
+ const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
+ this.log(`Root scrapeList will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
+ const rootUrlMappings = yield this.filterDeepExtractionUrls(page, config.listSelector, scrapeResults, nextLevelGotoPattern);
+ this.pendingDeepExtraction = {
+ page,
+ hierarchy: hierarchyData.map((level, idx) => ({
+ gotoPattern: level.gotoPattern,
+ actionsToExecute: level.actionsToExecute,
+ urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
+ }))
+ };
+ }
+ else {
+ console.log('DEBUG: No goto actions found, deep extraction skipped');
+ }
+ }
+ else {
+ this.log(`[Deep Extract] scrapeList "${name}" extracting URLs during workflow execution`, logger_1.Level.LOG);
+ const hierarchy = this.pendingDeepExtraction.hierarchy;
+ if (hierarchy && hierarchy.length > 0) {
+ let targetLevelIndex = -1;
+ for (let i = hierarchy.length - 1; i >= 0; i--) {
+ if (hierarchy[i].urlMappings.length === 0) {
+ targetLevelIndex = i;
+ break;
+ }
+ }
+ if (targetLevelIndex >= 0) {
+ const nextGotoPattern = hierarchy[targetLevelIndex].gotoPattern;
+ this.log(`[Deep Extract] Storing URLs for level ${targetLevelIndex}, pattern: ${nextGotoPattern}`, logger_1.Level.LOG);
+ const urlMappings = yield this.filterDeepExtractionUrls(page, config.listSelector, scrapeResults, nextGotoPattern);
+ if (hierarchy[targetLevelIndex].urlMappings.length > 0) {
+ const existingUrls = new Set(hierarchy[targetLevelIndex].urlMappings.map(m => m.url).filter(u => u !== null));
+ const newUrls = urlMappings.filter(m => m.url !== null && !existingUrls.has(m.url));
+ if (newUrls.length > 0) {
+ const startIndex = hierarchy[targetLevelIndex].urlMappings.length;
+ hierarchy[targetLevelIndex].urlMappings.push(...newUrls.map((m, idx) => ({ index: startIndex + idx, url: m.url })));
+ this.log(`[Deep Extract] Merged ${newUrls.length} new URLs`, logger_1.Level.LOG);
+ }
+ }
+ else {
+ hierarchy[targetLevelIndex].urlMappings = urlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url }));
+ }
+ this.log(`[Deep Extract] Stored ${urlMappings.filter(m => m.url !== null).length} matching URLs`, logger_1.Level.LOG);
+ const validUrls = urlMappings.filter(m => m.url !== null);
+ if (validUrls.length > 0) {
+ const sampleSize = Math.min(3, validUrls.length);
+ this.log(`[Deep Extract] Sample URLs (showing ${sampleSize} of ${validUrls.length}):`, logger_1.Level.LOG);
+ validUrls.slice(0, sampleSize).forEach((mapping, idx) => {
+ this.log(`[Deep Extract] ${idx + 1}. ${mapping.url}`, logger_1.Level.LOG);
+ });
+ }
+ }
+ }
+ }
+ }
  }
  }
  catch (error) {
  console.error('ScrapeList action failed completely:', error.message);
- // Don't throw error, just return empty array
  const actionType = "scrapeList";
- const actionName = config.__name || "List";
+ let name = actionName || "";
+ if (!name || name.trim() === "") {
+ this.scrapeListCounter++;
+ name = `List ${this.scrapeListCounter}`;
+ }
  if (!this.namedResults[actionType])
  this.namedResults[actionType] = {};
- this.namedResults[actionType][actionName] = [];
+ this.namedResults[actionType][name] = [];
  if (!this.serializableDataByType[actionType])
  this.serializableDataByType[actionType] = {};
- this.serializableDataByType[actionType][actionName] = [];
+ this.serializableDataByType[actionType][name] = [];
  yield this.options.serializableCallback({
  scrapeList: this.serializableDataByType.scrapeList,
  scrapeSchema: this.serializableDataByType.scrapeSchema
@@ -662,25 +834,7 @@ class Interpreter extends events_1.EventEmitter {
  if (debug === null || debug === void 0 ? void 0 : debug.setActionType) {
  debug.setActionType(String(step.action));
  }
- // Safely extract name for this step
- if (step === null || step === void 0 ? void 0 : step.name) {
- stepName = step.name;
- }
- else if (Array.isArray(step === null || step === void 0 ? void 0 : step.args) &&
- step.args.length > 0 &&
- typeof step.args[0] === "object" &&
- "__name" in step.args[0]) {
- stepName = step.args[0].__name;
- }
- else if (typeof (step === null || step === void 0 ? void 0 : step.args) === "object" &&
- (step === null || step === void 0 ? void 0 : step.args) !== null &&
- "__name" in step.args) {
- stepName = step.args.__name;
- }
- // Default fallback
- if (!stepName) {
- stepName = String(step.action);
- }
+ stepName = (step === null || step === void 0 ? void 0 : step.name) || String(step.action);
  if (debug && typeof debug.setActionName === "function") {
  debug.setActionName(stepName);
  }
@@ -693,9 +847,12 @@ class Interpreter extends events_1.EventEmitter {
  // "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
  const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
  if (step.action === 'screenshot') {
- // call the screenshot handler directly to allow the extra name parameter
  yield wawActions.screenshot(...(params !== null && params !== void 0 ? params : []), stepName !== null && stepName !== void 0 ? stepName : undefined);
  }
+ else if (step.action === 'scrapeList' || step.action === 'scrapeSchema') {
+ const actionName = step.name || "";
+ yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []), actionName);
+ }
  else {
  yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []));
  }
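This dispatch change is the other half of the naming rework: instead of each handler digging a `__name` out of its config object, the step's own `name` is forwarded as an explicit trailing argument to `scrapeList` and `scrapeSchema`, mirroring what `screenshot` already did. A reduced sketch of the pattern (the handler map and step shape are simplified from the code above):

```ts
// Reduced sketch of the dispatch pattern above; handler bodies are elided.
type Handler = (config: unknown, actionName?: string) => Promise<void>;

const wawActions: Record<string, Handler> = {
  scrapeList: async (config, actionName = '') => { /* ... */ },
  scrapeSchema: async (config, actionName = '') => { /* ... */ },
};

async function dispatch(step: { action: string; args?: unknown[]; name?: string }) {
  const [config] = step.args ?? [];
  if (step.action === 'scrapeList' || step.action === 'scrapeSchema') {
    // Forward the step's name so the handler no longer reads config.__name.
    await wawActions[step.action](config, step.name || '');
  } else {
    await wawActions[step.action](config);
  }
}
```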
@@ -755,17 +912,16 @@ class Interpreter extends events_1.EventEmitter {
  }
  });
  }
- handlePagination(page, config) {
- return __awaiter(this, void 0, void 0, function* () {
- // Check abort flag at start of pagination
+ handlePagination(page_1, config_1) {
+ return __awaiter(this, arguments, void 0, function* (page, config, providedActionName = "") {
  if (this.isAborted) {
  this.log('Workflow aborted, stopping pagination', logger_1.Level.WARN);
- return [];
+ return { results: [], urls: [] };
  }
- // Generate action name for this scrapeList
  const actionType = "scrapeList";
- let actionName = config.__name || "";
- if (!actionName || actionName.trim() === "") {
+ let actionName = providedActionName || "";
+ // During deep extraction, ALWAYS auto-increment to create separate lists for each URL
+ if (!actionName || actionName.trim() === "" || this.isInDeepExtractionPhase) {
  this.scrapeListCounter++;
  actionName = `List ${this.scrapeListCounter}`;
  }
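Every return path in `handlePagination` now yields a results/urls pair instead of a bare array (the remaining hunks below update the other return sites to match), so the extracted URLs stay index-aligned with the scraped items even when pagination bails out early. The new contract as a caller sees it, with a type name of our own invention:

```ts
// The pagination contract inferred from this diff; the type name is illustrative.
interface PaginationResult<T> {
  results: T[];      // scraped items, deduplicated and capped at config.limit
  urls: string[][];  // per-item URL lists, trimmed in lockstep with results
}

// Caller side (compare the scrapeList handler above):
// const paginationResult = await this.handlePagination(page, config, actionName);
// const scrapeResults = paginationResult.results;
// const paginationUrls = paginationResult.urls;
```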
@@ -777,6 +933,7 @@ class Interpreter extends events_1.EventEmitter {
  this.serializableDataByType[actionType][actionName] = [];
  }
  let allResults = [];
+ let allUrls = []; // Track URLs alongside results for deep-extract
  let previousHeight = 0;
  let scrapedItems = new Set();
  let visitedUrls = new Set();
@@ -803,14 +960,22 @@ class Interpreter extends events_1.EventEmitter {
  debugLog(`Page evaluation failed: ${error.message}`);
  return;
  }
- const newResults = results.filter(item => {
+ // Extract URLs for ALL items BEFORE filtering duplicates
+ // This ensures URL indices match result indices
+ const allItemUrls = yield this.extractUrlsFromCurrentPage(page, config.listSelector, results.length);
+ // Filter results AND URLs together using the same uniqueness logic
+ const newResults = [];
+ const newUrls = [];
+ results.forEach((item, index) => {
  const uniqueKey = JSON.stringify(item);
- if (scrapedItems.has(uniqueKey))
- return false;
- scrapedItems.add(uniqueKey);
- return true;
+ if (!scrapedItems.has(uniqueKey)) {
+ scrapedItems.add(uniqueKey);
+ newResults.push(item);
+ newUrls.push(allItemUrls[index] || []); // Add corresponding URLs
+ }
  });
  allResults = allResults.concat(newResults);
+ allUrls = allUrls.concat(newUrls);
  debugLog("Results collected:", allResults.length);
  // Store in serializableDataByType and send structured callback
  this.serializableDataByType[actionType][actionName] = [...allResults];
@@ -822,6 +987,7 @@ class Interpreter extends events_1.EventEmitter {
  const checkLimit = () => {
  if (config.limit && allResults.length >= config.limit) {
  allResults = allResults.slice(0, config.limit);
+ allUrls = allUrls.slice(0, config.limit); // Also trim URLs to maintain sync
  return true;
  }
  return false;
@@ -947,16 +1113,16 @@ class Interpreter extends events_1.EventEmitter {
  // Check abort flag at start of each pagination iteration
  if (this.isAborted) {
  this.log('Workflow aborted during pagination loop', logger_1.Level.WARN);
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  // Pagination circuit breakers
  if (++paginationIterations > MAX_PAGINATION_ITERATIONS) {
  debugLog(`Maximum pagination iterations reached (${MAX_PAGINATION_ITERATIONS}), stopping`);
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  if (Date.now() - paginationStartTime > MAX_PAGINATION_TIME) {
  debugLog('Maximum pagination time reached (10 minutes), stopping');
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  // Add async yield every 5 iterations to prevent event loop blocking
  if (paginationIterations % 5 === 0) {
@@ -967,7 +1133,7 @@ class Interpreter extends events_1.EventEmitter {
  let previousResultCount = allResults.length;
  yield scrapeCurrentPage();
  if (checkLimit()) {
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  yield page.evaluate(() => {
  const scrollHeight = Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
@@ -981,14 +1147,14 @@ class Interpreter extends events_1.EventEmitter {
  if (currentResultCount === previousResultCount) {
  unchangedResultCounter++;
  if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  }
  else {
  unchangedResultCounter = 0;
  }
  if (currentHeight === previousHeight) {
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  previousHeight = currentHeight;
  break;
@@ -997,7 +1163,7 @@ class Interpreter extends events_1.EventEmitter {
  let previousResultCount = allResults.length;
  yield scrapeCurrentPage();
  if (checkLimit()) {
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  yield page.evaluate(() => window.scrollTo(0, 0));
  yield page.waitForTimeout(2000);
@@ -1006,14 +1172,14 @@ class Interpreter extends events_1.EventEmitter {
  if (currentResultCount === previousResultCount) {
  unchangedResultCounter++;
  if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  }
  else {
  unchangedResultCounter = 0;
  }
  if (currentTopHeight === 0) {
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  previousHeight = currentTopHeight;
  break;
@@ -1023,7 +1189,7 @@ class Interpreter extends events_1.EventEmitter {
  visitedUrls.add(currentUrl);
  yield scrapeCurrentPage();
  if (checkLimit())
- return allResults;
+ return { results: allResults, urls: allUrls };
  const { button, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
  availableSelectors = updatedSelectors;
  if (!button || !workingSelector) {
@@ -1039,7 +1205,7 @@ class Interpreter extends events_1.EventEmitter {
  }
  }
  }));
  if (!success)
- return allResults;
+ return { results: allResults, urls: allUrls };
  break;
  }
  let retryCount = 0;
@@ -1169,14 +1335,14 @@ class Interpreter extends events_1.EventEmitter {
  }
  if (!paginationSuccess) {
  debugLog(`Pagination failed after ${MAX_RETRIES} attempts`);
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  break;
  }
  case 'clickLoadMore': {
  yield scrapeCurrentPage();
  if (checkLimit())
- return allResults;
+ return { results: allResults, urls: allUrls };
  let loadMoreCounter = 0;
  const MAX_LOAD_MORE_ITERATIONS = 100; // Prevent infinite load more
  const loadMoreStartTime = Date.now();
@@ -1185,11 +1351,11 @@ class Interpreter extends events_1.EventEmitter {
  // Load more circuit breakers
  if (loadMoreCounter >= MAX_LOAD_MORE_ITERATIONS) {
  debugLog(`Maximum load more iterations reached (${MAX_LOAD_MORE_ITERATIONS}), stopping`);
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  if (Date.now() - loadMoreStartTime > MAX_LOAD_MORE_TIME) {
  debugLog('Maximum load more time reached (5 minutes), stopping');
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  // Add async yield every 3 iterations
  if (loadMoreCounter % 3 === 0 && loadMoreCounter > 0) {
@@ -1200,7 +1366,7 @@ class Interpreter extends events_1.EventEmitter {
  availableSelectors = updatedSelectors;
  if (!workingSelector || !loadMoreButton) {
  debugLog('No working Load More selector found after retries');
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  // Implement retry mechanism for clicking the button
  let retryCount = 0;
@@ -1240,7 +1406,7 @@ class Interpreter extends events_1.EventEmitter {
  }
  if (!clickSuccess) {
  debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  // Wait for content to load and check scroll height
  yield page.waitForTimeout(2000);
@@ -1269,16 +1435,16 @@ class Interpreter extends events_1.EventEmitter {
  // previousResultCount = currentResultCount;
  // }
  if (checkLimit())
- return allResults;
+ return { results: allResults, urls: allUrls };
  if (!heightChanged) {
  debugLog('No more items loaded after Load More');
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  }
  }
  default: {
  yield scrapeCurrentPage();
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  }
  if (checkLimit())
@@ -1287,9 +1453,9 @@ class Interpreter extends events_1.EventEmitter {
  }
  catch (error) {
  debugLog(`Fatal error: ${error.message}`);
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
- return allResults;
+ return { results: allResults, urls: allUrls };
  });
  }
  getMatchingActionId(workflow, pageState, usedActions) {
@@ -2213,6 +2379,518 @@ class Interpreter extends events_1.EventEmitter {
  return modifiedAction;
  });
  }
+ /**
+ * Extracts URLs from the current page's list elements.
+ * Used during pagination to maintain sync between scraped results and extracted URLs.
+ *
+ * @param page - Playwright page object
+ * @param listSelector - The selector used to identify list elements
+ * @param limit - Maximum number of elements to process (should match number of scraped items)
+ * @returns Array of URL arrays, one per list element
+ */
+ extractUrlsFromCurrentPage(page, listSelector, limit) {
+ return __awaiter(this, void 0, void 0, function* () {
+ const extractedUrls = yield page.evaluate(({ selector, limit }) => {
+ const urlsByElement = [];
+ let listElements = [];
+ if (selector.startsWith('//') || selector.startsWith('(//')) {
+ const xpathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
+ for (let i = 0; i < xpathResult.snapshotLength; i++) {
+ const node = xpathResult.snapshotItem(i);
+ if (node && node.nodeType === Node.ELEMENT_NODE) {
+ listElements.push(node);
+ }
+ }
+ }
+ else {
+ listElements = Array.from(document.querySelectorAll(selector));
+ }
+ // Extract URLs from the first 'limit' elements that match the selector
+ // The limit corresponds to the number of items that were scraped
+ const elementsToProcess = listElements.slice(0, limit);
+ elementsToProcess.forEach(element => {
+ const urls = [];
+ if (element.tagName === 'A' && element.href) {
+ urls.push(element.href);
+ }
+ const anchors = element.querySelectorAll('a[href]');
+ anchors.forEach(anchor => {
+ const href = anchor.href;
+ if (href && !urls.includes(href)) {
+ urls.push(href);
+ }
+ });
+ urlsByElement.push(urls);
+ });
+ return urlsByElement;
+ }, { selector: listSelector, limit });
+ return extractedUrls;
+ });
+ }
+ /**
+ * Builds a hierarchical deep extraction plan by analyzing the workflow structure.
+ * Identifies goto actions and determines what actions to execute at each level.
+ * Workflow is bottom-to-top, so we scan from end to start.
+ */
+ buildDeepExtractionHierarchy(currentWorkflow) {
+ var _a, _b;
+ const hierarchy = [];
+ // Find all goto action indices with their patterns
+ const gotoData = [];
+ currentWorkflow.forEach((pair, index) => {
+ var _a;
+ if (pair.what && pair.what.some(action => action.action === 'goto')) {
+ const gotoAction = pair.what.find(action => action.action === 'goto');
+ const pattern = (_a = gotoAction === null || gotoAction === void 0 ? void 0 : gotoAction.args) === null || _a === void 0 ? void 0 : _a[0];
+ if (pattern) {
+ gotoData.push({ index, pattern: String(pattern) });
+ }
+ }
+ });
+ if (gotoData.length === 0) {
+ this.log('No goto actions found in workflow', logger_1.Level.WARN);
+ return [];
+ }
+ this.log(`Found ${gotoData.length} goto action(s) at indices: ${gotoData.map(g => g.index).join(', ')}`, logger_1.Level.LOG);
+ const uniqueGotos = [];
+ for (let i = 0; i < gotoData.length; i++) {
+ const current = gotoData[i];
+ const next = gotoData[i + 1];
+ if (next && current.pattern === next.pattern) {
+ this.log(`Skipping duplicate goto at index ${next.index} (same as ${current.index})`, logger_1.Level.LOG);
+ i++;
+ }
+ uniqueGotos.push(current);
+ }
+ this.log(`After deduplication: ${uniqueGotos.length} unique goto(s)`, logger_1.Level.LOG);
+ for (let i = 0; i < uniqueGotos.length; i++) {
+ const gotoIndex = uniqueGotos[i].index;
+ const gotoPattern = uniqueGotos[i].pattern;
+ const nextGotoIndex = i > 0 ? uniqueGotos[i - 1].index : 0;
+ let actionsToExecute = currentWorkflow.slice(nextGotoIndex, gotoIndex);
+ actionsToExecute = actionsToExecute.filter(pair => {
+ return !pair.what || !pair.what.some(action => action.action === 'goto');
+ });
+ const dataExtractionActions = actionsToExecute.filter(pair => {
+ return pair.what && pair.what.some(action => action.action === 'scrapeSchema' ||
+ action.action === 'scrapeList' ||
+ action.action === 'screenshot');
+ });
+ if (dataExtractionActions.length === 0) {
+ this.log(`No data extraction actions found between goto at ${gotoIndex} and next level`, logger_1.Level.WARN);
+ continue;
+ }
+ let sourceActionName = '';
+ let sourceActionType = 'scrapeList';
+ if (i === uniqueGotos.length - 1) {
+ const scrapeListBefore = currentWorkflow.slice(gotoIndex + 1).find(pair => pair.what && pair.what.some(action => action.action === 'scrapeList'));
+ if (scrapeListBefore) {
+ const scrapeListAction = scrapeListBefore.what.find(action => action.action === 'scrapeList');
+ sourceActionName = ((_b = (_a = scrapeListAction === null || scrapeListAction === void 0 ? void 0 : scrapeListAction.args) === null || _a === void 0 ? void 0 : _a[0]) === null || _b === void 0 ? void 0 : _b.name) || (scrapeListAction === null || scrapeListAction === void 0 ? void 0 : scrapeListAction.name) || '';
+ sourceActionType = 'scrapeList';
+ }
+ }
+ else {
+ sourceActionName = '';
+ sourceActionType = 'scrapeSchema';
+ }
+ hierarchy.push({
+ gotoActionIndex: gotoIndex,
+ gotoPattern: String(gotoPattern),
+ actionsToExecute: dataExtractionActions,
+ sourceActionName,
+ sourceActionType
+ });
+ this.log(`Level ${i}: goto at index ${gotoIndex}, pattern=${gotoPattern}, actions=${dataExtractionActions.length}`, logger_1.Level.LOG);
+ }
+ return hierarchy;
+ }
+ /**
+ * Extracts hrefs directly from the page based on scrapeSchema selectors.
+ * Checks ALL selectors from the schema config - if they point to anchor elements, extract href.
+ * This is called after scrapeSchema executes to capture hrefs for deep extraction.
+ */
+ extractHrefsFromPage(page, schemaConfig) {
+ return __awaiter(this, void 0, void 0, function* () {
+ try {
+ const fields = schemaConfig.fields || schemaConfig;
+ const selectors = [];
+ for (const [fieldName, fieldConfig] of Object.entries(fields)) {
+ if (fieldConfig && typeof fieldConfig === 'object' && fieldConfig.selector) {
+ selectors.push(String(fieldConfig.selector));
+ }
+ }
+ if (selectors.length === 0) {
+ return [];
+ }
+ const extractedUrls = yield page.evaluate((selectorList) => {
+ const urls = [];
+ for (const selector of selectorList) {
+ if (!selector)
+ continue;
+ try {
+ let elements = [];
+ if (selector.startsWith('//') || selector.startsWith('(//')) {
+ const xpathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
+ for (let i = 0; i < xpathResult.snapshotLength; i++) {
+ const node = xpathResult.snapshotItem(i);
+ if (node && node.nodeType === Node.ELEMENT_NODE) {
+ elements.push(node);
+ }
+ }
+ }
+ else {
+ elements = Array.from(document.querySelectorAll(selector));
+ }
+ for (const element of elements) {
+ if (element.tagName === 'A' && element.href) {
+ const href = element.href;
+ if (href && !urls.includes(href)) {
+ urls.push(href);
+ }
+ }
+ }
+ }
+ catch (error) {
+ console.warn(`Failed to extract hrefs for selector ${selector}:`, error);
+ }
+ }
+ return urls;
+ }, selectors);
+ this.log(`Extracted ${extractedUrls.length} hrefs from page for schema selectors`, logger_1.Level.LOG);
+ return extractedUrls;
+ }
+ catch (error) {
+ this.log(`Failed to extract hrefs from page: ${error.message}`, logger_1.Level.ERROR);
+ return [];
+ }
+ });
+ }
+ /**
+ * Filters URLs for deep extraction based on the goto action pattern.
+ * This is called immediately after the first capture action (scrapeList).
+ * Returns the filtered URL mappings that should be processed after workflow completion.
+ * Each mapping maintains alignment with the original scrapeList index.
+ */
+ filterDeepExtractionUrls(page, listSelector, scrapeResults, gotoTargetPattern) {
+ return __awaiter(this, void 0, void 0, function* () {
+ try {
+ this.log(`Deep extraction: Filtering URLs from list structure (${scrapeResults.length} items)`, logger_1.Level.LOG);
+ const extractedUrls = yield page.evaluate(({ selector, limit }) => {
+ const urlsByElement = [];
+ let listElements = [];
+ if (selector.startsWith('//') || selector.startsWith('(//')) {
+ const xpathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
+ for (let i = 0; i < xpathResult.snapshotLength; i++) {
+ const node = xpathResult.snapshotItem(i);
+ if (node && node.nodeType === Node.ELEMENT_NODE) {
+ listElements.push(node);
+ }
+ }
+ }
+ else {
+ listElements = Array.from(document.querySelectorAll(selector));
+ }
+ const elementsToProcess = listElements.slice(0, limit);
+ elementsToProcess.forEach(element => {
+ const urls = [];
+ if (element.tagName === 'A' && element.href) {
+ urls.push(element.href);
+ }
+ const anchors = element.querySelectorAll('a[href]');
+ anchors.forEach(anchor => {
+ const href = anchor.href;
+ if (href && !urls.includes(href)) {
+ urls.push(href);
+ }
+ });
+ urlsByElement.push(urls);
+ });
+ return urlsByElement;
+ }, { selector: listSelector, limit: scrapeResults.length });
+ const totalUrlCount = extractedUrls.reduce((sum, urls) => sum + urls.length, 0);
+ this.log(`Extracted ${totalUrlCount} total URLs from ${scrapeResults.length} list items (avg ${(totalUrlCount / scrapeResults.length).toFixed(1)} URLs per item)`, logger_1.Level.LOG);
+ const getUrlPattern = (url) => {
+ try {
+ const urlObj = new URL(url);
+ const pathname = urlObj.pathname.replace(/\/$/, '');
+ const segments = pathname.split('/').filter(s => s.length > 0);
+ return {
+ origin: urlObj.origin,
+ pathSegments: segments
+ };
+ }
+ catch (_a) {
+ return null;
+ }
+ };
+ const targetPattern = getUrlPattern(String(gotoTargetPattern));
+ const targetNormalized = String(gotoTargetPattern).replace(/\/$/, '').toLowerCase();
+ if (!targetPattern) {
+ this.log('Could not parse goto URL pattern, skipping deep extraction', logger_1.Level.WARN);
+ return [];
+ }
+ this.log(`Target URL pattern: ${targetPattern.origin}/${targetPattern.pathSegments.join('/')}`, logger_1.Level.LOG);
+ const urlMappings = [];
+ extractedUrls.forEach((urlsFromElement, scrapeListIndex) => {
+ let matchingUrl = null;
+ for (const url of urlsFromElement) {
+ const urlPattern = getUrlPattern(url);
+ if (!urlPattern)
+ continue;
+ if (urlPattern.origin !== targetPattern.origin)
+ continue;
+ if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
+ continue;
+ let pathMatches = true;
+ for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
+ if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
+ pathMatches = false;
+ break;
+ }
+ }
+ if (!pathMatches)
+ continue;
+ const urlNormalized = url.replace(/\/$/, '').toLowerCase();
+ if (urlNormalized === targetNormalized) {
+ this.log(`Excluding already-visited URL: ${url}`, logger_1.Level.LOG);
+ continue;
+ }
+ matchingUrl = url;
+ break;
+ }
+ urlMappings.push({
+ scrapeListIndex,
+ url: matchingUrl
+ });
+ });
+ const matchedCount = urlMappings.filter(m => m.url !== null).length;
+ this.log(`Filtered to ${matchedCount} matching URLs for deep extraction (out of ${scrapeResults.length} total items)`, logger_1.Level.LOG);
+ if (matchedCount > 0) {
+ const matchedMappings = urlMappings.filter(m => m.url !== null);
+ const sampleSize = Math.min(5, matchedMappings.length);
+ const sample = matchedMappings.slice(0, sampleSize);
+ this.log(`Sample of matching URLs (showing ${sampleSize} of ${matchedMappings.length}):`, logger_1.Level.LOG);
+ sample.forEach((mapping, idx) => {
+ this.log(` ${idx + 1}. [Index ${mapping.scrapeListIndex}] ${mapping.url}`, logger_1.Level.LOG);
+ });
+ }
+ else {
+ this.log('No matching URLs found. Check if extracted URLs match the pattern.', logger_1.Level.WARN);
+ }
+ return urlMappings;
+ }
+ catch (error) {
+ this.log(`URL filtering failed: ${error.message}`, logger_1.Level.ERROR);
+ return [];
+ }
+ });
+ }
+ /**
+ * Filters pre-extracted URLs for deep extraction based on the goto action pattern.
+ * This is used for paginated lists where URLs were extracted during pagination.
+ * Returns the filtered URL mappings that maintain alignment with scrapeList indices.
+ */
+ filterDeepExtractionUrlsFromExtracted(extractedUrls, scrapeResults, gotoTargetPattern) {
+ try {
+ const totalUrlCount = extractedUrls.reduce((sum, urls) => sum + urls.length, 0);
+ this.log(`Deep extraction: Filtering ${totalUrlCount} pre-extracted URLs from ${scrapeResults.length} items`, logger_1.Level.LOG);
+ const getUrlPattern = (url) => {
+ try {
+ const urlObj = new URL(url);
+ const pathname = urlObj.pathname.replace(/\/$/, '');
+ const segments = pathname.split('/').filter(s => s.length > 0);
+ return {
+ origin: urlObj.origin,
+ pathSegments: segments
+ };
+ }
+ catch (_a) {
+ return null;
+ }
+ };
+ const targetPattern = getUrlPattern(String(gotoTargetPattern));
+ const targetNormalized = String(gotoTargetPattern).replace(/\/$/, '').toLowerCase();
+ if (!targetPattern) {
+ this.log('Could not parse goto URL pattern, skipping deep extraction', logger_1.Level.WARN);
+ return [];
+ }
+ this.log(`Target URL pattern: ${targetPattern.origin}/${targetPattern.pathSegments.join('/')}`, logger_1.Level.LOG);
+ const urlMappings = [];
+ extractedUrls.forEach((urlsFromElement, scrapeListIndex) => {
+ let matchingUrl = null;
+ for (const url of urlsFromElement) {
+ const urlPattern = getUrlPattern(url);
+ if (!urlPattern)
+ continue;
+ if (urlPattern.origin !== targetPattern.origin)
+ continue;
+ if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
+ continue;
+ let pathMatches = true;
+ for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
+ if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
+ pathMatches = false;
+ break;
+ }
+ }
+ if (!pathMatches)
+ continue;
+ const urlNormalized = url.replace(/\/$/, '').toLowerCase();
+ if (urlNormalized === targetNormalized) {
+ this.log(`Excluding already-visited URL: ${url}`, logger_1.Level.LOG);
+ continue;
+ }
+ matchingUrl = url;
+ break;
+ }
+ urlMappings.push({
+ scrapeListIndex,
+ url: matchingUrl
+ });
+ });
+ const matchedCount = urlMappings.filter(m => m.url !== null).length;
+ this.log(`Filtered to ${matchedCount} matching URLs for deep extraction (out of ${scrapeResults.length} total items)`, logger_1.Level.LOG);
+ if (matchedCount > 0) {
+ const matchedMappings = urlMappings.filter(m => m.url !== null);
+ const sampleSize = Math.min(5, matchedMappings.length);
+ const sample = matchedMappings.slice(0, sampleSize);
+ this.log(`Sample of matching URLs (showing ${sampleSize} of ${matchedMappings.length}):`, logger_1.Level.LOG);
+ sample.forEach((mapping, idx) => {
+ this.log(` ${idx + 1}. [Index ${mapping.scrapeListIndex}] ${mapping.url}`, logger_1.Level.LOG);
+ });
+ }
+ else {
+ this.log('No matching URLs found. Check if extracted URLs match the pattern.', logger_1.Level.WARN);
+ }
+ return urlMappings;
+ }
+ catch (error) {
+ this.log(`URL filtering failed: ${error.message}`, logger_1.Level.ERROR);
+ return [];
+ }
+ }
+ /**
+ * Helper function to check if a URL matches a goto pattern.
+ */
+ matchesGotoPattern(url, gotoPattern) {
+ try {
+ const getUrlPattern = (urlStr) => {
+ try {
+ const urlObj = new URL(urlStr);
+ const pathname = urlObj.pathname.replace(/\/$/, '');
+ const segments = pathname.split('/').filter(s => s.length > 0);
+ return { origin: urlObj.origin, pathSegments: segments };
+ }
+ catch (_a) {
+ return null;
+ }
+ };
+ const urlPattern = getUrlPattern(url);
+ const targetPattern = getUrlPattern(gotoPattern);
+ const targetNormalized = gotoPattern.replace(/\/$/, '').toLowerCase();
+ const urlNormalized = url.replace(/\/$/, '').toLowerCase();
+ if (!urlPattern || !targetPattern)
+ return false;
+ if (urlPattern.origin !== targetPattern.origin)
+ return false;
+ if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
+ return false;
+ if (urlNormalized === targetNormalized)
+ return false; // Skip exact matches
+ for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
+ if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+ catch (_a) {
+ return false;
+ }
+ }
+ /**
+ * Executes hierarchical deep extraction by processing each level recursively.
+ * URLs are already stored in each hierarchy level's urlMappings during workflow execution.
+ */
+ executeHierarchicalDeepExtraction(page, hierarchy) {
+ return __awaiter(this, void 0, void 0, function* () {
+ try {
+ if (hierarchy.length === 0) {
+ this.log('No hierarchy levels to process', logger_1.Level.LOG);
+ return;
+ }
+ this.log(`\n=== Starting Hierarchical Deep Extraction (${hierarchy.length} level${hierarchy.length > 1 ? 's' : ''}) ===`, logger_1.Level.LOG);
+ this.isInDeepExtractionPhase = true;
+ const startLevel = hierarchy.length >= 2 ? hierarchy.length - 2 : hierarchy.length - 1;
+ for (let levelIndex = startLevel; levelIndex >= 0; levelIndex--) {
+ const level = hierarchy[levelIndex];
+ const currentLevelUrls = level.urlMappings;
+ this.log(`\n=== Processing Deep Extraction Level ${startLevel - levelIndex + 1}/${startLevel + 1} ===`, logger_1.Level.LOG);
+ this.log(`Goto pattern: ${level.gotoPattern}`, logger_1.Level.LOG);
+ this.log(`Actions to execute: ${level.actionsToExecute.length}`, logger_1.Level.LOG);
+ this.log(`URLs to process: ${currentLevelUrls.filter(m => m.url !== null).length}`, logger_1.Level.LOG);
+ if (currentLevelUrls.length === 0 || currentLevelUrls.every(u => !u.url)) {
+ this.log('No valid URLs at this level - stopping here', logger_1.Level.LOG);
+ break;
+ }
+ yield this.executeDeepExtractionLevel(page, level, currentLevelUrls);
+ }
+ this.log('\n=== Hierarchical Deep Extraction Completed ===', logger_1.Level.LOG);
+ }
+ catch (error) {
+ this.log(`Hierarchical deep extraction failed: ${error.message}`, logger_1.Level.ERROR);
+ }
+ finally {
+ this.isInDeepExtractionPhase = false;
+ }
+ });
+ }
+ /**
+ * Executes deep extraction for a single level.
+ * URLs are already extracted and stored in hierarchy during workflow execution.
+ * This function just navigates to URLs and executes the capture actions.
+ */
+ executeDeepExtractionLevel(page, level, urlMappings) {
+ return __awaiter(this, void 0, void 0, function* () {
+ try {
+ const validMappings = urlMappings.filter(m => m.url !== null);
+ if (validMappings.length === 0) {
+ this.log('No URLs to process for this level', logger_1.Level.LOG);
+ return;
+ }
+ this.log(`Processing ${validMappings.length} URLs`, logger_1.Level.LOG);
+ for (const mapping of validMappings) {
+ try {
+ this.log(`[${mapping.index}] Navigating to: ${mapping.url}`, logger_1.Level.LOG);
+ yield page.goto(mapping.url);
+ yield page.waitForLoadState('networkidle', { timeout: 30000 });
+ for (let i = level.actionsToExecute.length - 1; i >= 0; i--) {
+ const actionPair = level.actionsToExecute[i];
+ if (this.isAborted) {
+ this.log('Workflow aborted during deep extraction', logger_1.Level.WARN);
+ return;
+ }
+ const validatedAction = yield this.validateAndFixSelectors(page, actionPair);
+ const filteredActions = validatedAction.what.filter(action => action.action === 'scrapeSchema' ||
+ action.action === 'scrapeList' ||
+ action.action === 'screenshot');
+ if (filteredActions.length > 0) {
+ yield this.carryOutSteps(page, filteredActions);
+ }
+ }
+ this.log(`[${mapping.index}] Completed`, logger_1.Level.LOG);
+ }
+ catch (error) {
+ this.log(`[${mapping.index}] Failed: ${error.message}`, logger_1.Level.ERROR);
+ }
+ }
+ }
+ catch (error) {
+ this.log(`Level execution failed: ${error.message}`, logger_1.Level.ERROR);
+ }
+ });
+ }
  runLoop(p, workflow) {
  return __awaiter(this, void 0, void 0, function* () {
  var _a, _b;
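The matching rule shared by `matchesGotoPattern` and both filter helpers is worth stating plainly: a candidate URL matches when it has the goto URL's origin and the same number of path segments, agrees on every segment except the last, and is not the goto URL itself (so the already-visited page is skipped). A standalone restatement of that rule, with hypothetical URLs:

```ts
// Standalone restatement of the matching rule implemented above; URLs are hypothetical.
function matchesGotoPattern(url: string, gotoPattern: string): boolean {
  const parse = (s: string) => {
    try {
      const u = new URL(s);
      return { origin: u.origin, segs: u.pathname.replace(/\/$/, '').split('/').filter(Boolean) };
    } catch { return null; }
  };
  const a = parse(url);
  const b = parse(gotoPattern);
  if (!a || !b) return false;
  if (a.origin !== b.origin) return false;
  if (a.segs.length !== b.segs.length) return false;
  // The exact goto URL (modulo trailing slash and case) is excluded.
  if (url.replace(/\/$/, '').toLowerCase() === gotoPattern.replace(/\/$/, '').toLowerCase()) return false;
  // All segments except the last must agree; the last acts as the wildcard.
  return b.segs.slice(0, -1).every((seg, i) => a.segs[i] === seg);
}

matchesGotoPattern('https://example.com/products/item-2', 'https://example.com/products/item-1'); // true
matchesGotoPattern('https://example.com/products/item-1', 'https://example.com/products/item-1'); // false (same URL)
matchesGotoPattern('https://example.com/blog/item-2', 'https://example.com/products/item-1');     // false (segment mismatch)
```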
@@ -2297,6 +2975,20 @@ class Interpreter extends events_1.EventEmitter {
  }
  if (workflowCopy.length === 0) {
  this.log('All actions completed. Workflow finished.', logger_1.Level.LOG);
+ if (this.pendingDeepExtraction) {
+ this.log('Starting deferred hierarchical deep extraction now that workflow has completed...', logger_1.Level.LOG);
+ const { page, hierarchy } = this.pendingDeepExtraction;
+ try {
+ yield this.executeHierarchicalDeepExtraction(page, hierarchy);
+ this.log('Hierarchical deep extraction completed successfully', logger_1.Level.LOG);
+ }
+ catch (error) {
+ this.log(`Hierarchical deep extraction failed: ${error.message}`, logger_1.Level.ERROR);
+ }
+ finally {
+ this.pendingDeepExtraction = null;
+ }
+ }
  cleanup();
  return;
  }
@@ -2365,7 +3057,7 @@ class Interpreter extends events_1.EventEmitter {
  try {
  const validatedAction = yield this.validateAndFixSelectors(p, action);
  console.log("Carrying out:", validatedAction.what);
- yield this.carryOutSteps(p, validatedAction.what);
+ yield this.carryOutSteps(p, validatedAction.what, workflowCopy);
  usedActions.push((_b = action.id) !== null && _b !== void 0 ? _b : 'undefined');
  workflowCopy.splice(actionId, 1);
  console.log(`Action with ID ${action.id} removed from the workflow copy.`);
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "mx-cloud",
- "version": "0.0.23",
+ "version": "0.0.25",
  "description": "mx cloud",
  "main": "build/index.js",
  "typings": "build/index.d.ts",