mx-cloud 0.0.24 → 0.0.26

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -38,11 +38,13 @@ interface InterpreterOptions {
38
38
  serializableCallback: (output: any) => (void | Promise<void>);
39
39
  binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
40
40
  debug: boolean;
41
+ robotType?: 'extract' | 'scrape' | 'deep-extract';
41
42
  debugChannel: Partial<{
42
43
  activeId: (id: number) => void;
43
44
  debugMessage: (msg: string) => void;
44
45
  setActionType: (type: string) => void;
45
46
  incrementScrapeListIndex: () => void;
47
+ progressUpdate: (current: number, total: number, percentage: number) => void;
46
48
  }>;
47
49
  }
48
50
  /**
@@ -63,28 +65,16 @@ export default class Interpreter extends EventEmitter {
63
65
  private screenshotCounter;
64
66
  private scrapeListCounter;
65
67
  private serializableDataByType;
68
+ private pendingDeepExtraction;
69
+ private isInDeepExtractionPhase;
70
+ private deepExtractionStats;
71
+ private totalActions;
72
+ private executedActions;
66
73
  constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
67
74
  trackAutohealFailure(error: string): void;
68
75
  private applyAdBlocker;
69
76
  private disableAdBlocker;
70
- private getSelectors;
71
- /**
72
- * Returns the context object from given Page and the current workflow.\
73
- * \
74
- * `workflow` is used for selector extraction - function searches for used selectors to
75
- * look for later in the page's context.
76
- * @param page Playwright Page object
77
- * @param workflow Current **initialized** workflow (array of where-what pairs).
78
- * @returns {PageState} State of the current page.
79
- */
80
- private getState;
81
- /**
82
- * Tests if the given action is applicable with the given context.
83
- * @param where Tested *where* condition
84
- * @param context Current browser context.
85
- * @returns True if `where` is applicable in the given context, false otherwise
86
- */
87
- private applicable;
77
+ private callWithTimeout;
88
78
  /**
89
79
  * Sets the abort flag to immediately stop all operations
90
80
  */
@@ -104,12 +94,7 @@ export default class Interpreter extends EventEmitter {
104
94
  */
105
95
  private carryOutSteps;
106
96
  private handlePagination;
107
- private getMatchingActionId;
108
- private removeShadowSelectors;
109
97
  private removeSpecialSelectors;
110
- private generatePageNodeInformation;
111
- private detectElementChanges;
112
- private validateWorkflowAction;
113
98
  /**
114
99
  * Test if a selector is working on the current page
115
100
  * @param {Page} page - Playwright page object
@@ -148,6 +133,66 @@ export default class Interpreter extends EventEmitter {
148
133
  * @returns {Promise<WhereWhatPair>} - The potentially modified action
149
134
  */
150
135
  private validateAndFixSelectors;
136
+ /**
137
+ * Extracts URLs from the current page's list elements.
138
+ * Used during pagination to maintain sync between scraped results and extracted URLs.
139
+ *
140
+ * @param page - Playwright page object
141
+ * @param listSelector - The selector used to identify list elements
142
+ * @param limit - Maximum number of elements to process (should match number of scraped items)
143
+ * @returns Array of URL arrays, one per list element
144
+ */
145
+ private extractUrlsFromCurrentPage;
146
+ /**
147
+ * Builds a hierarchical deep extraction plan by analyzing the workflow structure.
148
+ * Identifies goto actions and determines what actions to execute at each level.
149
+ * Workflow is bottom-to-top, so we scan from end to start.
150
+ */
151
+ private buildDeepExtractionHierarchy;
152
+ /**
153
+ * Extracts hrefs directly from the page based on scrapeSchema selectors.
154
+ * Checks ALL selectors from the schema config - if they point to anchor elements, extract href.
155
+ * This is called after scrapeSchema executes to capture hrefs for deep extraction.
156
+ */
157
+ private extractHrefsFromPage;
158
+ /**
159
+ * Filters URLs for deep extraction based on the goto action pattern.
160
+ * This is called immediately after the first capture action (scrapeList).
161
+ * Returns the filtered URL mappings that should be processed after workflow completion.
162
+ * Each mapping maintains alignment with the original scrapeList index.
163
+ */
164
+ private filterDeepExtractionUrls;
165
+ /**
166
+ * Filters pre-extracted URLs for deep extraction based on the goto action pattern.
167
+ * This is used for paginated lists where URLs were extracted during pagination.
168
+ * Returns the filtered URL mappings that maintain alignment with scrapeList indices.
169
+ */
170
+ private filterDeepExtractionUrlsFromExtracted;
171
+ /**
172
+ * Helper function to check if a URL matches a goto pattern.
173
+ */
174
+ /**
175
+ * Generic pattern matching for deep extraction URLs.
176
+ * Works across any website by analyzing URL structure rather than relying on keywords.
177
+ *
178
+ * Strategy:
179
+ * 1. Match URLs with same origin and path length
180
+ * 2. Identify "structural" segments (numbers, short words, etc.) that should match exactly
181
+ * 3. Allow other segments to vary (dynamic content like IDs, slugs, names)
182
+ * 4. Skip exact matches to avoid duplicates
183
+ */
184
+ private matchesGotoPattern;
185
+ /**
186
+ * Executes hierarchical deep extraction by processing each level recursively.
187
+ * URLs are already stored in each hierarchy level's urlMappings during workflow execution.
188
+ */
189
+ private executeHierarchicalDeepExtraction;
190
+ /**
191
+ * Executes deep extraction for a single level.
192
+ * URLs are already extracted and stored in hierarchy during workflow execution.
193
+ * This function just navigates to URLs and executes the capture actions.
194
+ */
195
+ private executeDeepExtractionLevel;
151
196
  private runLoop;
152
197
  private ensureScriptsLoaded;
153
198
  /**