mx-cloud 0.0.24 → 0.0.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -38,6 +38,7 @@ interface InterpreterOptions {
  serializableCallback: (output: any) => (void | Promise<void>);
  binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
  debug: boolean;
+ robotType?: 'extract' | 'scrape' | 'deep-extract';
  debugChannel: Partial<{
  activeId: (id: number) => void;
  debugMessage: (msg: string) => void;
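The new robotType option is what switches on the deep-extraction pipeline shown in the hunks below. A minimal construction sketch, assuming a workflow object loaded elsewhere (loadWorkflow and saveBinary are hypothetical helpers; only the option names and the default export come from the typings in this diff):

    import Interpreter from 'mx-cloud';

    const workflow = loadWorkflow('recipe.json'); // hypothetical helper returning a WorkflowFile
    const interpreter = new Interpreter(workflow, {
      robotType: 'deep-extract', // new in 0.0.25: 'extract' | 'scrape' | 'deep-extract'
      debug: true,
      serializableCallback: (output) => console.log('captured:', output),
      binaryCallback: (output, mimeType) => saveBinary(output, mimeType), // hypothetical sink
    });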
@@ -63,6 +64,8 @@ export default class Interpreter extends EventEmitter {
  private screenshotCounter;
  private scrapeListCounter;
  private serializableDataByType;
+ private pendingDeepExtraction;
+ private isInDeepExtractionPhase;
  constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
  trackAutohealFailure(error: string): void;
  private applyAdBlocker;
@@ -148,6 +151,56 @@ export default class Interpreter extends EventEmitter {
  * @returns {Promise<WhereWhatPair>} - The potentially modified action
  */
  private validateAndFixSelectors;
+ /**
+ * Extracts URLs from the current page's list elements.
+ * Used during pagination to maintain sync between scraped results and extracted URLs.
+ *
+ * @param page - Playwright page object
+ * @param listSelector - The selector used to identify list elements
+ * @param limit - Maximum number of elements to process (should match number of scraped items)
+ * @returns Array of URL arrays, one per list element
+ */
+ private extractUrlsFromCurrentPage;
+ /**
+ * Builds a hierarchical deep extraction plan by analyzing the workflow structure.
+ * Identifies goto actions and determines what actions to execute at each level.
+ * Workflow is bottom-to-top, so we scan from end to start.
+ */
+ private buildDeepExtractionHierarchy;
+ /**
+ * Extracts hrefs directly from the page based on scrapeSchema selectors.
+ * Checks ALL selectors from the schema config - if they point to anchor elements, extract href.
+ * This is called after scrapeSchema executes to capture hrefs for deep extraction.
+ */
+ private extractHrefsFromPage;
+ /**
+ * Filters URLs for deep extraction based on the goto action pattern.
+ * This is called immediately after the first capture action (scrapeList).
+ * Returns the filtered URL mappings that should be processed after workflow completion.
+ * Each mapping maintains alignment with the original scrapeList index.
+ */
+ private filterDeepExtractionUrls;
+ /**
+ * Filters pre-extracted URLs for deep extraction based on the goto action pattern.
+ * This is used for paginated lists where URLs were extracted during pagination.
+ * Returns the filtered URL mappings that maintain alignment with scrapeList indices.
+ */
+ private filterDeepExtractionUrlsFromExtracted;
+ /**
+ * Helper function to check if a URL matches a goto pattern.
+ */
+ private matchesGotoPattern;
+ /**
+ * Executes hierarchical deep extraction by processing each level recursively.
+ * URLs are already stored in each hierarchy level's urlMappings during workflow execution.
+ */
+ private executeHierarchicalDeepExtraction;
+ /**
+ * Executes deep extraction for a single level.
+ * URLs are already extracted and stored in hierarchy during workflow execution.
+ * This function just navigates to URLs and executes the capture actions.
+ */
+ private executeDeepExtractionLevel;
  private runLoop;
  private ensureScriptsLoaded;
  /**
@@ -74,6 +74,8 @@ class Interpreter extends events_1.EventEmitter {
  scrapeList: {},
  scrapeSchema: {}
  };
+ this.pendingDeepExtraction = null;
+ this.isInDeepExtractionPhase = false;
  this.workflow = workflow.workflow;
  this.initializedWorkflow = null;
  this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
@@ -338,7 +340,7 @@ class Interpreter extends events_1.EventEmitter {
  * @param page Playwright Page object
  * @param steps Array of actions.
  */
- carryOutSteps(page, steps) {
+ carryOutSteps(page, steps, currentWorkflow) {
  return __awaiter(this, void 0, void 0, function* () {
  var _a, _b;
  // Check abort flag at start of execution
@@ -494,6 +496,84 @@ class Interpreter extends events_1.EventEmitter {
  scrapeList: this.serializableDataByType.scrapeList,
  scrapeSchema: this.serializableDataByType.scrapeSchema
  });
+ if (this.options.robotType === 'deep-extract' && !this.isInDeepExtractionPhase && this.initializedWorkflow) {
+ if (!this.pendingDeepExtraction) {
+ console.log('DEBUG: Building hierarchical deep extraction plan from scrapeSchema...');
+ const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
+ if (hierarchyData.length > 0) {
+ const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
+ const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
+ this.log(`Root scrapeSchema will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
+ // Extract URLs from schema fields
+ const urls = yield this.extractHrefsFromPage(page, schema);
+ this.log(`scrapeSchema extracted ${urls.length} URLs from field selectors`, logger_1.Level.LOG);
+ // Filter URLs against pattern
+ const rootUrlMappings = urls
+ .map((url, index) => ({
+ scrapeListIndex: index,
+ url: this.matchesGotoPattern(url, nextLevelGotoPattern) ? url : null
+ }))
+ .filter(m => m.url !== null);
+ this.log(`Matched ${rootUrlMappings.length} URLs against pattern ${nextLevelGotoPattern}`, logger_1.Level.LOG);
+ this.pendingDeepExtraction = {
+ page,
+ hierarchy: hierarchyData.map((level, idx) => ({
+ gotoPattern: level.gotoPattern,
+ actionsToExecute: level.actionsToExecute,
+ urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
+ }))
+ };
+ }
+ else {
+ console.log('DEBUG: No goto actions found, deep extraction skipped');
+ }
+ }
+ else {
+ this.log(`[Deep Extract] scrapeSchema "${name}" extracting URLs during workflow execution`, logger_1.Level.LOG);
+ const hierarchy = this.pendingDeepExtraction.hierarchy;
+ if (hierarchy && hierarchy.length > 0) {
+ let targetLevelIndex = -1;
+ for (let i = hierarchy.length - 1; i >= 0; i--) {
+ if (hierarchy[i].urlMappings.length === 0) {
+ targetLevelIndex = i;
+ break;
+ }
+ }
+ if (targetLevelIndex >= 0) {
+ const targetGotoPattern = hierarchy[targetLevelIndex].gotoPattern;
+ this.log(`[Deep Extract] Storing URLs for level ${targetLevelIndex}, pattern: ${targetGotoPattern}`, logger_1.Level.LOG);
+ const urls = yield this.extractHrefsFromPage(page, schema);
+ this.log(`[Deep Extract] Extracted ${urls.length} URLs from scrapeSchema field selectors`, logger_1.Level.LOG);
+ const urlMappings = urls
+ .map((url, index) => ({
+ index,
+ url: this.matchesGotoPattern(url, targetGotoPattern) ? url : null
+ }))
+ .filter(m => m.url !== null);
+ if (hierarchy[targetLevelIndex].urlMappings.length > 0) {
+ const existingUrls = new Set(hierarchy[targetLevelIndex].urlMappings.map(m => m.url).filter(u => u !== null));
+ const newUrls = urlMappings.filter(m => m.url !== null && !existingUrls.has(m.url));
+ if (newUrls.length > 0) {
+ const startIndex = hierarchy[targetLevelIndex].urlMappings.length;
+ hierarchy[targetLevelIndex].urlMappings.push(...newUrls.map((m, idx) => ({ index: startIndex + idx, url: m.url })));
+ this.log(`[Deep Extract] Merged ${newUrls.length} new URLs from scrapeSchema`, logger_1.Level.LOG);
+ }
+ }
+ else {
+ hierarchy[targetLevelIndex].urlMappings = urlMappings;
+ }
+ this.log(`[Deep Extract] Stored ${urlMappings.length} matching URLs`, logger_1.Level.LOG);
+ if (urlMappings.length > 0) {
+ const sampleSize = Math.min(3, urlMappings.length);
+ this.log(`[Deep Extract] Sample URLs (showing ${sampleSize} of ${urlMappings.length}):`, logger_1.Level.LOG);
+ urlMappings.slice(0, sampleSize).forEach((mapping, idx) => {
+ this.log(`[Deep Extract] ${idx + 1}. ${mapping.url}`, logger_1.Level.LOG);
+ });
+ }
+ }
+ }
+ }
+ }
  }),
  scrapeList: (config_1, ...args_1) => __awaiter(this, [config_1, ...args_1], void 0, function* (config, actionName = "") {
  var _a, _b;
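Read together, the object stashed on pendingDeepExtraction is easiest to follow as a type. A sketch inferred from the assignments above (the field names appear in the diff; the type names and annotations are ours, not exported by the package):

    // Inferred shape only - not part of the package's public typings.
    interface DeepExtractionLevel {
      gotoPattern: string;                                  // goto URL used as a match template
      actionsToExecute: WhereWhatPair[];                    // capture actions to replay per URL
      urlMappings: { index: number; url: string | null }[]; // index-aligned with scraped items
    }
    interface PendingDeepExtraction {
      page: Page;                       // the Playwright page the workflow ran on
      hierarchy: DeepExtractionLevel[];
    }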
@@ -528,16 +608,58 @@ class Interpreter extends events_1.EventEmitter {
  }
  else {
  paginationUsed = true;
- scrapeResults = yield this.handlePagination(page, config, actionName);
+ const paginationResult = yield this.handlePagination(page, config, actionName);
+ scrapeResults = paginationResult.results;
+ const paginationUrls = paginationResult.urls;
+ if (this.options.robotType === 'deep-extract' && this.initializedWorkflow && scrapeResults.length > 0) {
+ if (!this.pendingDeepExtraction) {
+ console.log('DEBUG: Building hierarchical deep extraction plan from pagination...');
+ const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
+ if (hierarchyData.length > 0) {
+ const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
+ const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
+ this.log(`Root scrapeList (pagination) will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
+ const rootUrlMappings = this.filterDeepExtractionUrlsFromExtracted(paginationUrls, scrapeResults, nextLevelGotoPattern);
+ this.pendingDeepExtraction = {
+ page,
+ hierarchy: hierarchyData.map((level, idx) => ({
+ gotoPattern: level.gotoPattern,
+ actionsToExecute: level.actionsToExecute,
+ urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
+ }))
+ };
+ }
+ }
+ else {
+ this.log(`[Deep Extract] scrapeList (pagination) "${actionName}" extracting URLs`, logger_1.Level.LOG);
+ const hierarchy = this.pendingDeepExtraction.hierarchy;
+ if (hierarchy && hierarchy.length > 0) {
+ const nextLevelIndex = hierarchy.length >= 3 ? hierarchy.length - 3 : 0;
+ if (nextLevelIndex >= 0 && hierarchy[nextLevelIndex]) {
+ const nextGotoPattern = hierarchy[nextLevelIndex].gotoPattern;
+ this.log(`[Deep Extract] Extracting URLs for pattern: ${nextGotoPattern}`, logger_1.Level.LOG);
+ const urlMappings = this.filterDeepExtractionUrlsFromExtracted(paginationUrls, scrapeResults, nextGotoPattern);
+ this.log(`[Deep Extract] Found ${urlMappings.filter(m => m.url !== null).length} matching URLs`, logger_1.Level.LOG);
+ const validUrls = urlMappings.filter(m => m.url !== null);
+ if (validUrls.length > 0) {
+ const sampleSize = Math.min(3, validUrls.length);
+ this.log(`[Deep Extract] Sample URLs (showing ${sampleSize} of ${validUrls.length}):`, logger_1.Level.LOG);
+ validUrls.slice(0, sampleSize).forEach((mapping, idx) => {
+ this.log(`[Deep Extract] ${idx + 1}. ${mapping.url}`, logger_1.Level.LOG);
+ });
+ }
+ }
+ }
+ }
+ }
  }
  if (!Array.isArray(scrapeResults)) {
  scrapeResults = [];
  }
- console.log(`ScrapeList completed with ${scrapeResults.length} results`);
  if (!paginationUsed) {
  const actionType = "scrapeList";
  let name = actionName || "";
- if (!name || name.trim() === "") {
+ if (!name || name.trim() === "" || this.isInDeepExtractionPhase) {
  this.scrapeListCounter++;
  name = `List ${this.scrapeListCounter}`;
  }
@@ -551,6 +673,69 @@ class Interpreter extends events_1.EventEmitter {
  scrapeList: this.serializableDataByType.scrapeList,
  scrapeSchema: this.serializableDataByType.scrapeSchema
  });
+ console.log(`DEBUG: Checking deep extract condition: robotType=${this.options.robotType}, hasWorkflow=${!!currentWorkflow}, alreadyPending=${!!this.pendingDeepExtraction}`);
+ if (this.options.robotType === 'deep-extract' && !this.isInDeepExtractionPhase && this.initializedWorkflow) {
+ if (!this.pendingDeepExtraction) {
+ console.log('DEBUG: Building hierarchical deep extraction plan...');
+ const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
+ if (hierarchyData.length > 0) {
+ const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
+ const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
+ this.log(`Root scrapeList will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
+ const rootUrlMappings = yield this.filterDeepExtractionUrls(page, config.listSelector, scrapeResults, nextLevelGotoPattern);
+ this.pendingDeepExtraction = {
+ page,
+ hierarchy: hierarchyData.map((level, idx) => ({
+ gotoPattern: level.gotoPattern,
+ actionsToExecute: level.actionsToExecute,
+ urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
+ }))
+ };
+ }
+ else {
+ console.log('DEBUG: No goto actions found, deep extraction skipped');
+ }
+ }
+ else {
+ this.log(`[Deep Extract] scrapeList "${name}" extracting URLs during workflow execution`, logger_1.Level.LOG);
+ const hierarchy = this.pendingDeepExtraction.hierarchy;
+ if (hierarchy && hierarchy.length > 0) {
+ let targetLevelIndex = -1;
+ for (let i = hierarchy.length - 1; i >= 0; i--) {
+ if (hierarchy[i].urlMappings.length === 0) {
+ targetLevelIndex = i;
+ break;
+ }
+ }
+ if (targetLevelIndex >= 0) {
+ const nextGotoPattern = hierarchy[targetLevelIndex].gotoPattern;
+ this.log(`[Deep Extract] Storing URLs for level ${targetLevelIndex}, pattern: ${nextGotoPattern}`, logger_1.Level.LOG);
+ const urlMappings = yield this.filterDeepExtractionUrls(page, config.listSelector, scrapeResults, nextGotoPattern);
+ if (hierarchy[targetLevelIndex].urlMappings.length > 0) {
+ const existingUrls = new Set(hierarchy[targetLevelIndex].urlMappings.map(m => m.url).filter(u => u !== null));
+ const newUrls = urlMappings.filter(m => m.url !== null && !existingUrls.has(m.url));
+ if (newUrls.length > 0) {
+ const startIndex = hierarchy[targetLevelIndex].urlMappings.length;
+ hierarchy[targetLevelIndex].urlMappings.push(...newUrls.map((m, idx) => ({ index: startIndex + idx, url: m.url })));
+ this.log(`[Deep Extract] Merged ${newUrls.length} new URLs`, logger_1.Level.LOG);
+ }
+ }
+ else {
+ hierarchy[targetLevelIndex].urlMappings = urlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url }));
+ }
+ this.log(`[Deep Extract] Stored ${urlMappings.filter(m => m.url !== null).length} matching URLs`, logger_1.Level.LOG);
+ const validUrls = urlMappings.filter(m => m.url !== null);
+ if (validUrls.length > 0) {
+ const sampleSize = Math.min(3, validUrls.length);
+ this.log(`[Deep Extract] Sample URLs (showing ${sampleSize} of ${validUrls.length}):`, logger_1.Level.LOG);
+ validUrls.slice(0, sampleSize).forEach((mapping, idx) => {
+ this.log(`[Deep Extract] ${idx + 1}. ${mapping.url}`, logger_1.Level.LOG);
+ });
+ }
+ }
+ }
+ }
+ }
  }
  }
  catch (error) {
@@ -731,11 +916,12 @@ class Interpreter extends events_1.EventEmitter {
  return __awaiter(this, arguments, void 0, function* (page, config, providedActionName = "") {
  if (this.isAborted) {
  this.log('Workflow aborted, stopping pagination', logger_1.Level.WARN);
- return [];
+ return { results: [], urls: [] };
  }
  const actionType = "scrapeList";
  let actionName = providedActionName || "";
- if (!actionName || actionName.trim() === "") {
+ // During deep extraction, ALWAYS auto-increment to create separate lists for each URL
+ if (!actionName || actionName.trim() === "" || this.isInDeepExtractionPhase) {
  this.scrapeListCounter++;
  actionName = `List ${this.scrapeListCounter}`;
  }
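handlePagination now returns the scraped rows and the per-item URLs as one pair, so every early return below keeps them index-aligned. A sketch of the new internal contract (types inferred from the code in this diff, not exported by the package):

    // results[i] is a scraped item; urls[i] is the string[] of hrefs found on that item's element.
    const { results, urls } = yield this.handlePagination(page, config, actionName);
    // Invariant maintained by the changes below: results.length === urls.length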
@@ -747,6 +933,7 @@ class Interpreter extends events_1.EventEmitter {
  this.serializableDataByType[actionType][actionName] = [];
  }
  let allResults = [];
+ let allUrls = []; // Track URLs alongside results for deep-extract
  let previousHeight = 0;
  let scrapedItems = new Set();
  let visitedUrls = new Set();
@@ -773,14 +960,22 @@ class Interpreter extends events_1.EventEmitter {
  debugLog(`Page evaluation failed: ${error.message}`);
  return;
  }
- const newResults = results.filter(item => {
+ // Extract URLs for ALL items BEFORE filtering duplicates
+ // This ensures URL indices match result indices
+ const allItemUrls = yield this.extractUrlsFromCurrentPage(page, config.listSelector, results.length);
+ // Filter results AND URLs together using the same uniqueness logic
+ const newResults = [];
+ const newUrls = [];
+ results.forEach((item, index) => {
  const uniqueKey = JSON.stringify(item);
- if (scrapedItems.has(uniqueKey))
- return false;
- scrapedItems.add(uniqueKey);
- return true;
+ if (!scrapedItems.has(uniqueKey)) {
+ scrapedItems.add(uniqueKey);
+ newResults.push(item);
+ newUrls.push(allItemUrls[index] || []); // Add corresponding URLs
+ }
  });
  allResults = allResults.concat(newResults);
+ allUrls = allUrls.concat(newUrls);
  debugLog("Results collected:", allResults.length);
  // Store in serializableDataByType and send structured callback
  this.serializableDataByType[actionType][actionName] = [...allResults];
@@ -792,6 +987,7 @@ class Interpreter extends events_1.EventEmitter {
  const checkLimit = () => {
  if (config.limit && allResults.length >= config.limit) {
  allResults = allResults.slice(0, config.limit);
+ allUrls = allUrls.slice(0, config.limit); // Also trim URLs to maintain sync
  return true;
  }
  return false;
@@ -917,16 +1113,16 @@ class Interpreter extends events_1.EventEmitter {
  // Check abort flag at start of each pagination iteration
  if (this.isAborted) {
  this.log('Workflow aborted during pagination loop', logger_1.Level.WARN);
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  // Pagination circuit breakers
  if (++paginationIterations > MAX_PAGINATION_ITERATIONS) {
  debugLog(`Maximum pagination iterations reached (${MAX_PAGINATION_ITERATIONS}), stopping`);
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  if (Date.now() - paginationStartTime > MAX_PAGINATION_TIME) {
  debugLog('Maximum pagination time reached (10 minutes), stopping');
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  // Add async yield every 5 iterations to prevent event loop blocking
  if (paginationIterations % 5 === 0) {
@@ -937,7 +1133,7 @@ class Interpreter extends events_1.EventEmitter {
  let previousResultCount = allResults.length;
  yield scrapeCurrentPage();
  if (checkLimit()) {
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  yield page.evaluate(() => {
  const scrollHeight = Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
@@ -951,14 +1147,14 @@ class Interpreter extends events_1.EventEmitter {
  if (currentResultCount === previousResultCount) {
  unchangedResultCounter++;
  if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  }
  else {
  unchangedResultCounter = 0;
  }
  if (currentHeight === previousHeight) {
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  previousHeight = currentHeight;
  break;
@@ -967,7 +1163,7 @@ class Interpreter extends events_1.EventEmitter {
  let previousResultCount = allResults.length;
  yield scrapeCurrentPage();
  if (checkLimit()) {
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  yield page.evaluate(() => window.scrollTo(0, 0));
  yield page.waitForTimeout(2000);
@@ -976,14 +1172,14 @@ class Interpreter extends events_1.EventEmitter {
  if (currentResultCount === previousResultCount) {
  unchangedResultCounter++;
  if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  }
  else {
  unchangedResultCounter = 0;
  }
  if (currentTopHeight === 0) {
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  previousHeight = currentTopHeight;
  break;
@@ -993,7 +1189,7 @@ class Interpreter extends events_1.EventEmitter {
  visitedUrls.add(currentUrl);
  yield scrapeCurrentPage();
  if (checkLimit())
- return allResults;
+ return { results: allResults, urls: allUrls };
  const { button, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
  availableSelectors = updatedSelectors;
  if (!button || !workingSelector) {
@@ -1009,7 +1205,7 @@ class Interpreter extends events_1.EventEmitter {
  }
  }));
  if (!success)
- return allResults;
+ return { results: allResults, urls: allUrls };
  break;
  }
  let retryCount = 0;
@@ -1139,14 +1335,14 @@ class Interpreter extends events_1.EventEmitter {
  }
  if (!paginationSuccess) {
  debugLog(`Pagination failed after ${MAX_RETRIES} attempts`);
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  break;
  }
  case 'clickLoadMore': {
  yield scrapeCurrentPage();
  if (checkLimit())
- return allResults;
+ return { results: allResults, urls: allUrls };
  let loadMoreCounter = 0;
  const MAX_LOAD_MORE_ITERATIONS = 100; // Prevent infinite load more
  const loadMoreStartTime = Date.now();
@@ -1155,11 +1351,11 @@ class Interpreter extends events_1.EventEmitter {
  // Load more circuit breakers
  if (loadMoreCounter >= MAX_LOAD_MORE_ITERATIONS) {
  debugLog(`Maximum load more iterations reached (${MAX_LOAD_MORE_ITERATIONS}), stopping`);
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  if (Date.now() - loadMoreStartTime > MAX_LOAD_MORE_TIME) {
  debugLog('Maximum load more time reached (5 minutes), stopping');
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  // Add async yield every 3 iterations
  if (loadMoreCounter % 3 === 0 && loadMoreCounter > 0) {
@@ -1170,7 +1366,7 @@ class Interpreter extends events_1.EventEmitter {
  availableSelectors = updatedSelectors;
  if (!workingSelector || !loadMoreButton) {
  debugLog('No working Load More selector found after retries');
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  // Implement retry mechanism for clicking the button
  let retryCount = 0;
@@ -1210,7 +1406,7 @@ class Interpreter extends events_1.EventEmitter {
  }
  if (!clickSuccess) {
  debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  // Wait for content to load and check scroll height
  yield page.waitForTimeout(2000);
@@ -1239,16 +1435,16 @@ class Interpreter extends events_1.EventEmitter {
  // previousResultCount = currentResultCount;
  // }
  if (checkLimit())
- return allResults;
+ return { results: allResults, urls: allUrls };
  if (!heightChanged) {
  debugLog('No more items loaded after Load More');
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  }
  }
  default: {
  yield scrapeCurrentPage();
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
  }
  if (checkLimit())
@@ -1257,9 +1453,9 @@ class Interpreter extends events_1.EventEmitter {
  }
  catch (error) {
  debugLog(`Fatal error: ${error.message}`);
- return allResults;
+ return { results: allResults, urls: allUrls };
  }
- return allResults;
+ return { results: allResults, urls: allUrls };
  });
  }
  getMatchingActionId(workflow, pageState, usedActions) {
@@ -2183,6 +2379,518 @@ class Interpreter extends events_1.EventEmitter {
  return modifiedAction;
  });
  }
+ /**
+ * Extracts URLs from the current page's list elements.
+ * Used during pagination to maintain sync between scraped results and extracted URLs.
+ *
+ * @param page - Playwright page object
+ * @param listSelector - The selector used to identify list elements
+ * @param limit - Maximum number of elements to process (should match number of scraped items)
+ * @returns Array of URL arrays, one per list element
+ */
+ extractUrlsFromCurrentPage(page, listSelector, limit) {
+ return __awaiter(this, void 0, void 0, function* () {
+ const extractedUrls = yield page.evaluate(({ selector, limit }) => {
+ const urlsByElement = [];
+ let listElements = [];
+ if (selector.startsWith('//') || selector.startsWith('(//')) {
+ const xpathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
+ for (let i = 0; i < xpathResult.snapshotLength; i++) {
+ const node = xpathResult.snapshotItem(i);
+ if (node && node.nodeType === Node.ELEMENT_NODE) {
+ listElements.push(node);
+ }
+ }
+ }
+ else {
+ listElements = Array.from(document.querySelectorAll(selector));
+ }
+ // Extract URLs from the first 'limit' elements that match the selector
+ // The limit corresponds to the number of items that were scraped
+ const elementsToProcess = listElements.slice(0, limit);
+ elementsToProcess.forEach(element => {
+ const urls = [];
+ if (element.tagName === 'A' && element.href) {
+ urls.push(element.href);
+ }
+ const anchors = element.querySelectorAll('a[href]');
+ anchors.forEach(anchor => {
+ const href = anchor.href;
+ if (href && !urls.includes(href)) {
+ urls.push(href);
+ }
+ });
+ urlsByElement.push(urls);
+ });
+ return urlsByElement;
+ }, { selector: listSelector, limit });
+ return extractedUrls;
+ });
+ }
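For intuition, extractUrlsFromCurrentPage resolves to one URL array per list element, in DOM order. A hypothetical result for a selector matching three cards (all values invented):

    // extractUrlsFromCurrentPage(page, '.product-card', 3) might resolve to:
    // [
    //   ['https://shop.example/item/1', 'https://shop.example/brand/acme'],
    //   ['https://shop.example/item/2'],
    //   []  // the third card contained no anchors
    // ]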
+ /**
+ * Builds a hierarchical deep extraction plan by analyzing the workflow structure.
+ * Identifies goto actions and determines what actions to execute at each level.
+ * Workflow is bottom-to-top, so we scan from end to start.
+ */
+ buildDeepExtractionHierarchy(currentWorkflow) {
+ var _a, _b;
+ const hierarchy = [];
+ // Find all goto action indices with their patterns
+ const gotoData = [];
+ currentWorkflow.forEach((pair, index) => {
+ var _a;
+ if (pair.what && pair.what.some(action => action.action === 'goto')) {
+ const gotoAction = pair.what.find(action => action.action === 'goto');
+ const pattern = (_a = gotoAction === null || gotoAction === void 0 ? void 0 : gotoAction.args) === null || _a === void 0 ? void 0 : _a[0];
+ if (pattern) {
+ gotoData.push({ index, pattern: String(pattern) });
+ }
+ }
+ });
+ if (gotoData.length === 0) {
+ this.log('No goto actions found in workflow', logger_1.Level.WARN);
+ return [];
+ }
+ this.log(`Found ${gotoData.length} goto action(s) at indices: ${gotoData.map(g => g.index).join(', ')}`, logger_1.Level.LOG);
+ const uniqueGotos = [];
+ for (let i = 0; i < gotoData.length; i++) {
+ const current = gotoData[i];
+ const next = gotoData[i + 1];
+ if (next && current.pattern === next.pattern) {
+ this.log(`Skipping duplicate goto at index ${next.index} (same as ${current.index})`, logger_1.Level.LOG);
+ i++;
+ }
+ uniqueGotos.push(current);
+ }
+ this.log(`After deduplication: ${uniqueGotos.length} unique goto(s)`, logger_1.Level.LOG);
+ for (let i = 0; i < uniqueGotos.length; i++) {
+ const gotoIndex = uniqueGotos[i].index;
+ const gotoPattern = uniqueGotos[i].pattern;
+ const nextGotoIndex = i > 0 ? uniqueGotos[i - 1].index : 0;
+ let actionsToExecute = currentWorkflow.slice(nextGotoIndex, gotoIndex);
+ actionsToExecute = actionsToExecute.filter(pair => {
+ return !pair.what || !pair.what.some(action => action.action === 'goto');
+ });
+ const dataExtractionActions = actionsToExecute.filter(pair => {
+ return pair.what && pair.what.some(action => action.action === 'scrapeSchema' ||
+ action.action === 'scrapeList' ||
+ action.action === 'screenshot');
+ });
+ if (dataExtractionActions.length === 0) {
+ this.log(`No data extraction actions found between goto at ${gotoIndex} and next level`, logger_1.Level.WARN);
+ continue;
+ }
+ let sourceActionName = '';
+ let sourceActionType = 'scrapeList';
+ if (i === uniqueGotos.length - 1) {
+ const scrapeListBefore = currentWorkflow.slice(gotoIndex + 1).find(pair => pair.what && pair.what.some(action => action.action === 'scrapeList'));
+ if (scrapeListBefore) {
+ const scrapeListAction = scrapeListBefore.what.find(action => action.action === 'scrapeList');
+ sourceActionName = ((_b = (_a = scrapeListAction === null || scrapeListAction === void 0 ? void 0 : scrapeListAction.args) === null || _a === void 0 ? void 0 : _a[0]) === null || _b === void 0 ? void 0 : _b.name) || (scrapeListAction === null || scrapeListAction === void 0 ? void 0 : scrapeListAction.name) || '';
+ sourceActionType = 'scrapeList';
+ }
+ }
+ else {
+ sourceActionName = '';
+ sourceActionType = 'scrapeSchema';
+ }
+ hierarchy.push({
+ gotoActionIndex: gotoIndex,
+ gotoPattern: String(gotoPattern),
+ actionsToExecute: dataExtractionActions,
+ sourceActionName,
+ sourceActionType
+ });
+ this.log(`Level ${i}: goto at index ${gotoIndex}, pattern=${gotoPattern}, actions=${dataExtractionActions.length}`, logger_1.Level.LOG);
+ }
+ return hierarchy;
+ }
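Concretely, for a hypothetical bottom-to-top workflow (the interpreter executes the last array entry first), the planner emits one level per unique goto. All URLs and indices below are invented for illustration:

    // Hypothetical initializedWorkflow, listed by array index:
    //   [0] what: [{ action: 'scrapeSchema', ... }]                              // detail-page capture
    //   [1] what: [{ action: 'goto', args: ['https://shop.example/item/1'] }]
    //   [2] what: [{ action: 'scrapeList', ... }]                                // list capture
    //   [3] what: [{ action: 'goto', args: ['https://shop.example/catalog'] }]
    //
    // buildDeepExtractionHierarchy would return, roughly:
    //   [{ gotoActionIndex: 1, gotoPattern: 'https://shop.example/item/1', actionsToExecute: [pair 0], ... },
    //    { gotoActionIndex: 3, gotoPattern: 'https://shop.example/catalog', actionsToExecute: [pair 2], ... }]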
+ /**
+ * Extracts hrefs directly from the page based on scrapeSchema selectors.
+ * Checks ALL selectors from the schema config - if they point to anchor elements, extract href.
+ * This is called after scrapeSchema executes to capture hrefs for deep extraction.
+ */
+ extractHrefsFromPage(page, schemaConfig) {
+ return __awaiter(this, void 0, void 0, function* () {
+ try {
+ const fields = schemaConfig.fields || schemaConfig;
+ const selectors = [];
+ for (const [fieldName, fieldConfig] of Object.entries(fields)) {
+ if (fieldConfig && typeof fieldConfig === 'object' && fieldConfig.selector) {
+ selectors.push(String(fieldConfig.selector));
+ }
+ }
+ if (selectors.length === 0) {
+ return [];
+ }
+ const extractedUrls = yield page.evaluate((selectorList) => {
+ const urls = [];
+ for (const selector of selectorList) {
+ if (!selector)
+ continue;
+ try {
+ let elements = [];
+ if (selector.startsWith('//') || selector.startsWith('(//')) {
+ const xpathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
+ for (let i = 0; i < xpathResult.snapshotLength; i++) {
+ const node = xpathResult.snapshotItem(i);
+ if (node && node.nodeType === Node.ELEMENT_NODE) {
+ elements.push(node);
+ }
+ }
+ }
+ else {
+ elements = Array.from(document.querySelectorAll(selector));
+ }
+ for (const element of elements) {
+ if (element.tagName === 'A' && element.href) {
+ const href = element.href;
+ if (href && !urls.includes(href)) {
+ urls.push(href);
+ }
+ }
+ }
+ }
+ catch (error) {
+ console.warn(`Failed to extract hrefs for selector ${selector}:`, error);
+ }
+ }
+ return urls;
+ }, selectors);
+ this.log(`Extracted ${extractedUrls.length} hrefs from page for schema selectors`, logger_1.Level.LOG);
+ return extractedUrls;
+ }
+ catch (error) {
+ this.log(`Failed to extract hrefs from page: ${error.message}`, logger_1.Level.ERROR);
+ return [];
+ }
+ });
+ }
+ /**
+ * Filters URLs for deep extraction based on the goto action pattern.
+ * This is called immediately after the first capture action (scrapeList).
+ * Returns the filtered URL mappings that should be processed after workflow completion.
+ * Each mapping maintains alignment with the original scrapeList index.
+ */
+ filterDeepExtractionUrls(page, listSelector, scrapeResults, gotoTargetPattern) {
+ return __awaiter(this, void 0, void 0, function* () {
+ try {
+ this.log(`Deep extraction: Filtering URLs from list structure (${scrapeResults.length} items)`, logger_1.Level.LOG);
+ const extractedUrls = yield page.evaluate(({ selector, limit }) => {
+ const urlsByElement = [];
+ let listElements = [];
+ if (selector.startsWith('//') || selector.startsWith('(//')) {
+ const xpathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
+ for (let i = 0; i < xpathResult.snapshotLength; i++) {
+ const node = xpathResult.snapshotItem(i);
+ if (node && node.nodeType === Node.ELEMENT_NODE) {
+ listElements.push(node);
+ }
+ }
+ }
+ else {
+ listElements = Array.from(document.querySelectorAll(selector));
+ }
+ const elementsToProcess = listElements.slice(0, limit);
+ elementsToProcess.forEach(element => {
+ const urls = [];
+ if (element.tagName === 'A' && element.href) {
+ urls.push(element.href);
+ }
+ const anchors = element.querySelectorAll('a[href]');
+ anchors.forEach(anchor => {
+ const href = anchor.href;
+ if (href && !urls.includes(href)) {
+ urls.push(href);
+ }
+ });
+ urlsByElement.push(urls);
+ });
+ return urlsByElement;
+ }, { selector: listSelector, limit: scrapeResults.length });
+ const totalUrlCount = extractedUrls.reduce((sum, urls) => sum + urls.length, 0);
+ this.log(`Extracted ${totalUrlCount} total URLs from ${scrapeResults.length} list items (avg ${(totalUrlCount / scrapeResults.length).toFixed(1)} URLs per item)`, logger_1.Level.LOG);
+ const getUrlPattern = (url) => {
+ try {
+ const urlObj = new URL(url);
+ const pathname = urlObj.pathname.replace(/\/$/, '');
+ const segments = pathname.split('/').filter(s => s.length > 0);
+ return {
+ origin: urlObj.origin,
+ pathSegments: segments
+ };
+ }
+ catch (_a) {
+ return null;
+ }
+ };
+ const targetPattern = getUrlPattern(String(gotoTargetPattern));
+ const targetNormalized = String(gotoTargetPattern).replace(/\/$/, '').toLowerCase();
+ if (!targetPattern) {
+ this.log('Could not parse goto URL pattern, skipping deep extraction', logger_1.Level.WARN);
+ return [];
+ }
+ this.log(`Target URL pattern: ${targetPattern.origin}/${targetPattern.pathSegments.join('/')}`, logger_1.Level.LOG);
+ const urlMappings = [];
+ extractedUrls.forEach((urlsFromElement, scrapeListIndex) => {
+ let matchingUrl = null;
+ for (const url of urlsFromElement) {
+ const urlPattern = getUrlPattern(url);
+ if (!urlPattern)
+ continue;
+ if (urlPattern.origin !== targetPattern.origin)
+ continue;
+ if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
+ continue;
+ let pathMatches = true;
+ for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
+ if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
+ pathMatches = false;
+ break;
+ }
+ }
+ if (!pathMatches)
+ continue;
+ const urlNormalized = url.replace(/\/$/, '').toLowerCase();
+ if (urlNormalized === targetNormalized) {
+ this.log(`Excluding already-visited URL: ${url}`, logger_1.Level.LOG);
+ continue;
+ }
+ matchingUrl = url;
+ break;
+ }
+ urlMappings.push({
+ scrapeListIndex,
+ url: matchingUrl
+ });
+ });
+ const matchedCount = urlMappings.filter(m => m.url !== null).length;
+ this.log(`Filtered to ${matchedCount} matching URLs for deep extraction (out of ${scrapeResults.length} total items)`, logger_1.Level.LOG);
+ if (matchedCount > 0) {
+ const matchedMappings = urlMappings.filter(m => m.url !== null);
+ const sampleSize = Math.min(5, matchedMappings.length);
+ const sample = matchedMappings.slice(0, sampleSize);
+ this.log(`Sample of matching URLs (showing ${sampleSize} of ${matchedMappings.length}):`, logger_1.Level.LOG);
+ sample.forEach((mapping, idx) => {
+ this.log(` ${idx + 1}. [Index ${mapping.scrapeListIndex}] ${mapping.url}`, logger_1.Level.LOG);
+ });
+ }
+ else {
+ this.log('No matching URLs found. Check if extracted URLs match the pattern.', logger_1.Level.WARN);
+ }
+ return urlMappings;
+ }
+ catch (error) {
+ this.log(`URL filtering failed: ${error.message}`, logger_1.Level.ERROR);
+ return [];
+ }
+ });
+ }
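The null slots are deliberate: the mapping array is never compacted here, so deep-extract output can later be re-joined to the original scraped rows by index. For a hypothetical three-item list where only items 0 and 2 link to a matching detail page (URLs invented):

    // filterDeepExtractionUrls(...) would resolve to something like:
    // [
    //   { scrapeListIndex: 0, url: 'https://shop.example/item/41' },
    //   { scrapeListIndex: 1, url: null },  // no anchor on item 1 matched the goto pattern
    //   { scrapeListIndex: 2, url: 'https://shop.example/item/57' }
    // ]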
+ /**
+ * Filters pre-extracted URLs for deep extraction based on the goto action pattern.
+ * This is used for paginated lists where URLs were extracted during pagination.
+ * Returns the filtered URL mappings that maintain alignment with scrapeList indices.
+ */
+ filterDeepExtractionUrlsFromExtracted(extractedUrls, scrapeResults, gotoTargetPattern) {
+ try {
+ const totalUrlCount = extractedUrls.reduce((sum, urls) => sum + urls.length, 0);
+ this.log(`Deep extraction: Filtering ${totalUrlCount} pre-extracted URLs from ${scrapeResults.length} items`, logger_1.Level.LOG);
+ const getUrlPattern = (url) => {
+ try {
+ const urlObj = new URL(url);
+ const pathname = urlObj.pathname.replace(/\/$/, '');
+ const segments = pathname.split('/').filter(s => s.length > 0);
+ return {
+ origin: urlObj.origin,
+ pathSegments: segments
+ };
+ }
+ catch (_a) {
+ return null;
+ }
+ };
+ const targetPattern = getUrlPattern(String(gotoTargetPattern));
+ const targetNormalized = String(gotoTargetPattern).replace(/\/$/, '').toLowerCase();
+ if (!targetPattern) {
+ this.log('Could not parse goto URL pattern, skipping deep extraction', logger_1.Level.WARN);
+ return [];
+ }
+ this.log(`Target URL pattern: ${targetPattern.origin}/${targetPattern.pathSegments.join('/')}`, logger_1.Level.LOG);
+ const urlMappings = [];
+ extractedUrls.forEach((urlsFromElement, scrapeListIndex) => {
+ let matchingUrl = null;
+ for (const url of urlsFromElement) {
+ const urlPattern = getUrlPattern(url);
+ if (!urlPattern)
+ continue;
+ if (urlPattern.origin !== targetPattern.origin)
+ continue;
+ if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
+ continue;
+ let pathMatches = true;
+ for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
+ if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
+ pathMatches = false;
+ break;
+ }
+ }
+ if (!pathMatches)
+ continue;
+ const urlNormalized = url.replace(/\/$/, '').toLowerCase();
+ if (urlNormalized === targetNormalized) {
+ this.log(`Excluding already-visited URL: ${url}`, logger_1.Level.LOG);
+ continue;
+ }
+ matchingUrl = url;
+ break;
+ }
+ urlMappings.push({
+ scrapeListIndex,
+ url: matchingUrl
+ });
+ });
+ const matchedCount = urlMappings.filter(m => m.url !== null).length;
+ this.log(`Filtered to ${matchedCount} matching URLs for deep extraction (out of ${scrapeResults.length} total items)`, logger_1.Level.LOG);
+ if (matchedCount > 0) {
+ const matchedMappings = urlMappings.filter(m => m.url !== null);
+ const sampleSize = Math.min(5, matchedMappings.length);
+ const sample = matchedMappings.slice(0, sampleSize);
+ this.log(`Sample of matching URLs (showing ${sampleSize} of ${matchedMappings.length}):`, logger_1.Level.LOG);
+ sample.forEach((mapping, idx) => {
+ this.log(` ${idx + 1}. [Index ${mapping.scrapeListIndex}] ${mapping.url}`, logger_1.Level.LOG);
+ });
+ }
+ else {
+ this.log('No matching URLs found. Check if extracted URLs match the pattern.', logger_1.Level.WARN);
+ }
+ return urlMappings;
+ }
+ catch (error) {
+ this.log(`URL filtering failed: ${error.message}`, logger_1.Level.ERROR);
+ return [];
+ }
+ }
2773
+ /**
2774
+ * Helper function to check if a URL matches a goto pattern.
2775
+ */
2776
+ matchesGotoPattern(url, gotoPattern) {
2777
+ try {
2778
+ const getUrlPattern = (urlStr) => {
2779
+ try {
2780
+ const urlObj = new URL(urlStr);
2781
+ const pathname = urlObj.pathname.replace(/\/$/, '');
2782
+ const segments = pathname.split('/').filter(s => s.length > 0);
2783
+ return { origin: urlObj.origin, pathSegments: segments };
2784
+ }
2785
+ catch (_a) {
2786
+ return null;
2787
+ }
2788
+ };
2789
+ const urlPattern = getUrlPattern(url);
2790
+ const targetPattern = getUrlPattern(gotoPattern);
2791
+ const targetNormalized = gotoPattern.replace(/\/$/, '').toLowerCase();
2792
+ const urlNormalized = url.replace(/\/$/, '').toLowerCase();
2793
+ if (!urlPattern || !targetPattern)
2794
+ return false;
2795
+ if (urlPattern.origin !== targetPattern.origin)
2796
+ return false;
2797
+ if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
2798
+ return false;
2799
+ if (urlNormalized === targetNormalized)
2800
+ return false; // Skip exact matches
2801
+ for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
2802
+ if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
2803
+ return false;
2804
+ }
2805
+ }
2806
+ return true;
2807
+ }
2808
+ catch (_a) {
2809
+ return false;
2810
+ }
2811
+ }
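The rule this implements: same origin, same number of path segments, every segment except the last equal, and not equal (after trailing-slash and case normalization) to the pattern itself, since that page was already visited. Illustrative calls with invented URLs, given pattern 'https://shop.example/item/1':

    // matchesGotoPattern('https://shop.example/item/42', pattern)         -> true  (only last segment differs)
    // matchesGotoPattern('https://shop.example/item/1', pattern)          -> false (exact match: already visited)
    // matchesGotoPattern('https://shop.example/blog/42', pattern)         -> false (earlier segment differs)
    // matchesGotoPattern('https://other.example/item/42', pattern)        -> false (different origin)
    // matchesGotoPattern('https://shop.example/item/42/reviews', pattern) -> false (segment count differs)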
2812
+ /**
2813
+ * Executes hierarchical deep extraction by processing each level recursively.
2814
+ * URLs are already stored in each hierarchy level's urlMappings during workflow execution.
2815
+ */
2816
+ executeHierarchicalDeepExtraction(page, hierarchy) {
2817
+ return __awaiter(this, void 0, void 0, function* () {
2818
+ try {
2819
+ if (hierarchy.length === 0) {
2820
+ this.log('No hierarchy levels to process', logger_1.Level.LOG);
2821
+ return;
2822
+ }
2823
+ this.log(`\n=== Starting Hierarchical Deep Extraction (${hierarchy.length} level${hierarchy.length > 1 ? 's' : ''}) ===`, logger_1.Level.LOG);
2824
+ this.isInDeepExtractionPhase = true;
2825
+ const startLevel = hierarchy.length >= 2 ? hierarchy.length - 2 : hierarchy.length - 1;
2826
+ for (let levelIndex = startLevel; levelIndex >= 0; levelIndex--) {
2827
+ const level = hierarchy[levelIndex];
2828
+ const currentLevelUrls = level.urlMappings;
2829
+ this.log(`\n=== Processing Deep Extraction Level ${startLevel - levelIndex + 1}/${startLevel + 1} ===`, logger_1.Level.LOG);
2830
+ this.log(`Goto pattern: ${level.gotoPattern}`, logger_1.Level.LOG);
2831
+ this.log(`Actions to execute: ${level.actionsToExecute.length}`, logger_1.Level.LOG);
2832
+ this.log(`URLs to process: ${currentLevelUrls.filter(m => m.url !== null).length}`, logger_1.Level.LOG);
2833
+ if (currentLevelUrls.length === 0 || currentLevelUrls.every(u => !u.url)) {
2834
+ this.log('No valid URLs at this level - stopping here', logger_1.Level.LOG);
2835
+ break;
2836
+ }
2837
+ yield this.executeDeepExtractionLevel(page, level, currentLevelUrls);
2838
+ }
2839
+ this.log('\n=== Hierarchical Deep Extraction Completed ===', logger_1.Level.LOG);
2840
+ }
2841
+ catch (error) {
2842
+ this.log(`Hierarchical deep extraction failed: ${error.message}`, logger_1.Level.ERROR);
2843
+ }
2844
+ finally {
2845
+ this.isInDeepExtractionPhase = false;
2846
+ }
2847
+ });
2848
+ }
2849
+ /**
2850
+ * Executes deep extraction for a single level.
2851
+ * URLs are already extracted and stored in hierarchy during workflow execution.
2852
+ * This function just navigates to URLs and executes the capture actions.
2853
+ */
2854
+ executeDeepExtractionLevel(page, level, urlMappings) {
2855
+ return __awaiter(this, void 0, void 0, function* () {
2856
+ try {
2857
+ const validMappings = urlMappings.filter(m => m.url !== null);
2858
+ if (validMappings.length === 0) {
2859
+ this.log('No URLs to process for this level', logger_1.Level.LOG);
2860
+ return;
2861
+ }
2862
+ this.log(`Processing ${validMappings.length} URLs`, logger_1.Level.LOG);
2863
+ for (const mapping of validMappings) {
2864
+ try {
2865
+ this.log(`[${mapping.index}] Navigating to: ${mapping.url}`, logger_1.Level.LOG);
2866
+ yield page.goto(mapping.url);
2867
+ yield page.waitForLoadState('networkidle', { timeout: 30000 });
2868
+ for (let i = level.actionsToExecute.length - 1; i >= 0; i--) {
2869
+ const actionPair = level.actionsToExecute[i];
2870
+ if (this.isAborted) {
2871
+ this.log('Workflow aborted during deep extraction', logger_1.Level.WARN);
2872
+ return;
2873
+ }
2874
+ const validatedAction = yield this.validateAndFixSelectors(page, actionPair);
2875
+ const filteredActions = validatedAction.what.filter(action => action.action === 'scrapeSchema' ||
2876
+ action.action === 'scrapeList' ||
2877
+ action.action === 'screenshot');
2878
+ if (filteredActions.length > 0) {
2879
+ yield this.carryOutSteps(page, filteredActions);
2880
+ }
2881
+ }
2882
+ this.log(`[${mapping.index}] Completed`, logger_1.Level.LOG);
2883
+ }
2884
+ catch (error) {
2885
+ this.log(`[${mapping.index}] Failed: ${error.message}`, logger_1.Level.ERROR);
2886
+ }
2887
+ }
2888
+ }
2889
+ catch (error) {
2890
+ this.log(`Level execution failed: ${error.message}`, logger_1.Level.ERROR);
2891
+ }
2892
+ });
2893
+ }
2186
2894
  runLoop(p, workflow) {
2187
2895
  return __awaiter(this, void 0, void 0, function* () {
2188
2896
  var _a, _b;
@@ -2267,6 +2975,20 @@ class Interpreter extends events_1.EventEmitter {
  }
  if (workflowCopy.length === 0) {
  this.log('All actions completed. Workflow finished.', logger_1.Level.LOG);
+ if (this.pendingDeepExtraction) {
+ this.log('Starting deferred hierarchical deep extraction now that workflow has completed...', logger_1.Level.LOG);
+ const { page, hierarchy } = this.pendingDeepExtraction;
+ try {
+ yield this.executeHierarchicalDeepExtraction(page, hierarchy);
+ this.log('Hierarchical deep extraction completed successfully', logger_1.Level.LOG);
+ }
+ catch (error) {
+ this.log(`Hierarchical deep extraction failed: ${error.message}`, logger_1.Level.ERROR);
+ }
+ finally {
+ this.pendingDeepExtraction = null;
+ }
+ }
  cleanup();
  return;
  }
@@ -2335,7 +3057,7 @@ class Interpreter extends events_1.EventEmitter {
  try {
  const validatedAction = yield this.validateAndFixSelectors(p, action);
  console.log("Carrying out:", validatedAction.what);
- yield this.carryOutSteps(p, validatedAction.what);
+ yield this.carryOutSteps(p, validatedAction.what, workflowCopy);
  usedActions.push((_b = action.id) !== null && _b !== void 0 ? _b : 'undefined');
  workflowCopy.splice(actionId, 1);
  console.log(`Action with ID ${action.id} removed from the workflow copy.`);
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "mx-cloud",
- "version": "0.0.24",
+ "version": "0.0.25",
  "description": "mx cloud",
  "main": "build/index.js",
  "typings": "build/index.d.ts",