mx-cloud 0.0.24 → 0.0.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.d.ts +68 -23
- package/build/interpret.js +1709 -1033
- package/build/selector.d.ts +1 -32
- package/build/selector.js +1 -839
- package/build/types/workflow.d.ts +1 -1
- package/build/utils/utils.d.ts +0 -4
- package/build/utils/utils.js +0 -7
- package/package.json +1 -1
package/build/interpret.d.ts
CHANGED
|
@@ -38,11 +38,13 @@ interface InterpreterOptions {
|
|
|
38
38
|
serializableCallback: (output: any) => (void | Promise<void>);
|
|
39
39
|
binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
|
|
40
40
|
debug: boolean;
|
|
41
|
+
robotType?: 'extract' | 'scrape' | 'deep-extract';
|
|
41
42
|
debugChannel: Partial<{
|
|
42
43
|
activeId: (id: number) => void;
|
|
43
44
|
debugMessage: (msg: string) => void;
|
|
44
45
|
setActionType: (type: string) => void;
|
|
45
46
|
incrementScrapeListIndex: () => void;
|
|
47
|
+
progressUpdate: (current: number, total: number, percentage: number) => void;
|
|
46
48
|
}>;
|
|
47
49
|
}
|
|
48
50
|
/**
|
|
@@ -63,28 +65,16 @@ export default class Interpreter extends EventEmitter {
|
|
|
63
65
|
private screenshotCounter;
|
|
64
66
|
private scrapeListCounter;
|
|
65
67
|
private serializableDataByType;
|
|
68
|
+
private pendingDeepExtraction;
|
|
69
|
+
private isInDeepExtractionPhase;
|
|
70
|
+
private deepExtractionStats;
|
|
71
|
+
private totalActions;
|
|
72
|
+
private executedActions;
|
|
66
73
|
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
|
|
67
74
|
trackAutohealFailure(error: string): void;
|
|
68
75
|
private applyAdBlocker;
|
|
69
76
|
private disableAdBlocker;
|
|
70
|
-
private
|
|
71
|
-
/**
|
|
72
|
-
* Returns the context object from given Page and the current workflow.\
|
|
73
|
-
* \
|
|
74
|
-
* `workflow` is used for selector extraction - function searches for used selectors to
|
|
75
|
-
* look for later in the page's context.
|
|
76
|
-
* @param page Playwright Page object
|
|
77
|
-
* @param workflow Current **initialized** workflow (array of where-what pairs).
|
|
78
|
-
* @returns {PageState} State of the current page.
|
|
79
|
-
*/
|
|
80
|
-
private getState;
|
|
81
|
-
/**
|
|
82
|
-
* Tests if the given action is applicable with the given context.
|
|
83
|
-
* @param where Tested *where* condition
|
|
84
|
-
* @param context Current browser context.
|
|
85
|
-
* @returns True if `where` is applicable in the given context, false otherwise
|
|
86
|
-
*/
|
|
87
|
-
private applicable;
|
|
77
|
+
private callWithTimeout;
|
|
88
78
|
/**
|
|
89
79
|
* Sets the abort flag to immediately stop all operations
|
|
90
80
|
*/
|
|
@@ -104,12 +94,7 @@ export default class Interpreter extends EventEmitter {
|
|
|
104
94
|
*/
|
|
105
95
|
private carryOutSteps;
|
|
106
96
|
private handlePagination;
|
|
107
|
-
private getMatchingActionId;
|
|
108
|
-
private removeShadowSelectors;
|
|
109
97
|
private removeSpecialSelectors;
|
|
110
|
-
private generatePageNodeInformation;
|
|
111
|
-
private detectElementChanges;
|
|
112
|
-
private validateWorkflowAction;
|
|
113
98
|
/**
|
|
114
99
|
* Test if a selector is working on the current page
|
|
115
100
|
* @param {Page} page - Playwright page object
|
|
@@ -148,6 +133,66 @@ export default class Interpreter extends EventEmitter {
|
|
|
148
133
|
* @returns {Promise<WhereWhatPair>} - The potentially modified action
|
|
149
134
|
*/
|
|
150
135
|
private validateAndFixSelectors;
|
|
136
|
+
/**
|
|
137
|
+
* Extracts URLs from the current page's list elements.
|
|
138
|
+
* Used during pagination to maintain sync between scraped results and extracted URLs.
|
|
139
|
+
*
|
|
140
|
+
* @param page - Playwright page object
|
|
141
|
+
* @param listSelector - The selector used to identify list elements
|
|
142
|
+
* @param limit - Maximum number of elements to process (should match number of scraped items)
|
|
143
|
+
* @returns Array of URL arrays, one per list element
|
|
144
|
+
*/
|
|
145
|
+
private extractUrlsFromCurrentPage;
|
|
146
|
+
/**
|
|
147
|
+
* Builds a hierarchical deep extraction plan by analyzing the workflow structure.
|
|
148
|
+
* Identifies goto actions and determines what actions to execute at each level.
|
|
149
|
+
* Workflow is bottom-to-top, so we scan from end to start.
|
|
150
|
+
*/
|
|
151
|
+
private buildDeepExtractionHierarchy;
|
|
152
|
+
/**
|
|
153
|
+
* Extracts hrefs directly from the page based on scrapeSchema selectors.
|
|
154
|
+
* Checks ALL selectors from the schema config - if they point to anchor elements, extract href.
|
|
155
|
+
* This is called after scrapeSchema executes to capture hrefs for deep extraction.
|
|
156
|
+
*/
|
|
157
|
+
private extractHrefsFromPage;
|
|
158
|
+
/**
|
|
159
|
+
* Filters URLs for deep extraction based on the goto action pattern.
|
|
160
|
+
* This is called immediately after the first capture action (scrapeList).
|
|
161
|
+
* Returns the filtered URL mappings that should be processed after workflow completion.
|
|
162
|
+
* Each mapping maintains alignment with the original scrapeList index.
|
|
163
|
+
*/
|
|
164
|
+
private filterDeepExtractionUrls;
|
|
165
|
+
/**
|
|
166
|
+
* Filters pre-extracted URLs for deep extraction based on the goto action pattern.
|
|
167
|
+
* This is used for paginated lists where URLs were extracted during pagination.
|
|
168
|
+
* Returns the filtered URL mappings that maintain alignment with scrapeList indices.
|
|
169
|
+
*/
|
|
170
|
+
private filterDeepExtractionUrlsFromExtracted;
|
|
171
|
+
/**
|
|
172
|
+
* Helper function to check if a URL matches a goto pattern.
|
|
173
|
+
*/
|
|
174
|
+
/**
|
|
175
|
+
* Generic pattern matching for deep extraction URLs.
|
|
176
|
+
* Works across any website by analyzing URL structure rather than relying on keywords.
|
|
177
|
+
*
|
|
178
|
+
* Strategy:
|
|
179
|
+
* 1. Match URLs with same origin and path length
|
|
180
|
+
* 2. Identify "structural" segments (numbers, short words, etc.) that should match exactly
|
|
181
|
+
* 3. Allow other segments to vary (dynamic content like IDs, slugs, names)
|
|
182
|
+
* 4. Skip exact matches to avoid duplicates
|
|
183
|
+
*/
|
|
184
|
+
private matchesGotoPattern;
|
|
185
|
+
/**
|
|
186
|
+
* Executes hierarchical deep extraction by processing each level recursively.
|
|
187
|
+
* URLs are already stored in each hierarchy level's urlMappings during workflow execution.
|
|
188
|
+
*/
|
|
189
|
+
private executeHierarchicalDeepExtraction;
|
|
190
|
+
/**
|
|
191
|
+
* Executes deep extraction for a single level.
|
|
192
|
+
* URLs are already extracted and stored in hierarchy during workflow execution.
|
|
193
|
+
* This function just navigates to URLs and executes the capture actions.
|
|
194
|
+
*/
|
|
195
|
+
private executeDeepExtractionLevel;
|
|
151
196
|
private runLoop;
|
|
152
197
|
private ensureScriptsLoaded;
|
|
153
198
|
/**
|