mx-cloud 0.0.23 → 0.0.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.d.ts +53 -0
- package/build/interpret.js +784 -92
- package/package.json +1 -1
package/build/interpret.d.ts
CHANGED
@@ -38,6 +38,7 @@ interface InterpreterOptions {
 serializableCallback: (output: any) => (void | Promise<void>);
 binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
 debug: boolean;
+robotType?: 'extract' | 'scrape' | 'deep-extract';
 debugChannel: Partial<{
 activeId: (id: number) => void;
 debugMessage: (msg: string) => void;
@@ -63,6 +64,8 @@ export default class Interpreter extends EventEmitter {
 private screenshotCounter;
 private scrapeListCounter;
 private serializableDataByType;
+private pendingDeepExtraction;
+private isInDeepExtractionPhase;
 constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
 trackAutohealFailure(error: string): void;
 private applyAdBlocker;
@@ -148,6 +151,56 @@ export default class Interpreter extends EventEmitter {
 * @returns {Promise<WhereWhatPair>} - The potentially modified action
 */
 private validateAndFixSelectors;
+/**
+* Extracts URLs from the current page's list elements.
+* Used during pagination to maintain sync between scraped results and extracted URLs.
+*
+* @param page - Playwright page object
+* @param listSelector - The selector used to identify list elements
+* @param limit - Maximum number of elements to process (should match number of scraped items)
+* @returns Array of URL arrays, one per list element
+*/
+private extractUrlsFromCurrentPage;
+/**
+* Builds a hierarchical deep extraction plan by analyzing the workflow structure.
+* Identifies goto actions and determines what actions to execute at each level.
+* Workflow is bottom-to-top, so we scan from end to start.
+*/
+private buildDeepExtractionHierarchy;
+/**
+* Extracts hrefs directly from the page based on scrapeSchema selectors.
+* Checks ALL selectors from the schema config - if they point to anchor elements, extract href.
+* This is called after scrapeSchema executes to capture hrefs for deep extraction.
+*/
+private extractHrefsFromPage;
+/**
+* Filters URLs for deep extraction based on the goto action pattern.
+* This is called immediately after the first capture action (scrapeList).
+* Returns the filtered URL mappings that should be processed after workflow completion.
+* Each mapping maintains alignment with the original scrapeList index.
+*/
+private filterDeepExtractionUrls;
+/**
+* Filters pre-extracted URLs for deep extraction based on the goto action pattern.
+* This is used for paginated lists where URLs were extracted during pagination.
+* Returns the filtered URL mappings that maintain alignment with scrapeList indices.
+*/
+private filterDeepExtractionUrlsFromExtracted;
+/**
+* Helper function to check if a URL matches a goto pattern.
+*/
+private matchesGotoPattern;
+/**
+* Executes hierarchical deep extraction by processing each level recursively.
+* URLs are already stored in each hierarchy level's urlMappings during workflow execution.
+*/
+private executeHierarchicalDeepExtraction;
+/**
+* Executes deep extraction for a single level.
+* URLs are already extracted and stored in hierarchy during workflow execution.
+* This function just navigates to URLs and executes the capture actions.
+*/
+private executeDeepExtractionLevel;
 private runLoop;
 private ensureScriptsLoaded;
 /**
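The only public API change in interpret.d.ts is the optional robotType flag on InterpreterOptions; everything else added here is a private member. A minimal usage sketch, assuming a WorkflowFile value obtained elsewhere (the import path below is illustrative and not taken from this diff):

    import Interpreter from 'mx-cloud/build/interpret'; // hypothetical path

    const interpreter = new Interpreter(workflow, {
        // new in this release; omit it to keep pre-0.0.25 behaviour
        robotType: 'deep-extract',
        serializableCallback: (output) => console.log(output),
        binaryCallback: async (output, mimeType) => { /* persist binary output */ },
        debug: false,
    });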
package/build/interpret.js
CHANGED
@@ -74,6 +74,8 @@ class Interpreter extends events_1.EventEmitter {
 scrapeList: {},
 scrapeSchema: {}
 };
+this.pendingDeepExtraction = null;
+this.isInDeepExtractionPhase = false;
 this.workflow = workflow.workflow;
 this.initializedWorkflow = null;
 this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
@@ -338,7 +340,7 @@ class Interpreter extends events_1.EventEmitter {
 * @param page Playwright Page object
 * @param steps Array of actions.
 */
-carryOutSteps(page, steps) {
+carryOutSteps(page, steps, currentWorkflow) {
 return __awaiter(this, void 0, void 0, function* () {
 var _a, _b;
 // Check abort flag at start of execution
@@ -430,9 +432,8 @@ class Interpreter extends events_1.EventEmitter {
 const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
 yield this.options.serializableCallback(scrapeResults);
 }),
-scrapeSchema: (
+scrapeSchema: (schema_1, ...args_1) => __awaiter(this, [schema_1, ...args_1], void 0, function* (schema, actionName = "") {
 var _a;
-// Check abort flag at start of scraping
 if (this.isAborted) {
 this.log('Workflow aborted, stopping scrapeSchema', logger_1.Level.WARN);
 return;
@@ -451,7 +452,6 @@ class Interpreter extends events_1.EventEmitter {
 }
 const resultToProcess = Array.isArray(scrapeResult) ? scrapeResult[0] : scrapeResult;
 if (this.cumulativeResults.length === 0) {
-// First execution - create initial row
 const newRow = {};
 Object.entries(resultToProcess).forEach(([key, value]) => {
 if (value !== undefined) {
@@ -461,12 +461,10 @@ class Interpreter extends events_1.EventEmitter {
 this.cumulativeResults.push(newRow);
 }
 else {
-// Check if any keys from new result already exist in the last row
 const lastRow = this.cumulativeResults[this.cumulativeResults.length - 1];
 const newResultKeys = Object.keys(resultToProcess).filter(key => resultToProcess[key] !== undefined);
 const hasRepeatedKeys = newResultKeys.some(key => lastRow.hasOwnProperty(key));
 if (hasRepeatedKeys) {
-// Keys are repeated - create a new row
 const newRow = {};
 Object.entries(resultToProcess).forEach(([key, value]) => {
 if (value !== undefined) {
@@ -476,7 +474,6 @@ class Interpreter extends events_1.EventEmitter {
 this.cumulativeResults.push(newRow);
 }
 else {
-// No repeated keys - merge with the last row
 Object.entries(resultToProcess).forEach(([key, value]) => {
 if (value !== undefined) {
 lastRow[key] = value;
@@ -484,30 +481,102 @@ class Interpreter extends events_1.EventEmitter {
 });
 }
 }
-console.log("Total accumulated rows:", this.cumulativeResults.length);
-console.log("Current results:", this.cumulativeResults);
-// ✅ Append schema results under "scrapeSchema" → name
 const actionType = "scrapeSchema";
-const
+const name = actionName || "Texts";
 if (!this.namedResults[actionType])
 this.namedResults[actionType] = {};
-this.namedResults[actionType][
+this.namedResults[actionType][name] = this.cumulativeResults;
 if (!this.serializableDataByType[actionType])
 this.serializableDataByType[actionType] = {};
-if (!this.serializableDataByType[actionType][
-this.serializableDataByType[actionType][
+if (!this.serializableDataByType[actionType][name]) {
+this.serializableDataByType[actionType][name] = [];
 }
-
-this.serializableDataByType[actionType][actionName] = [...this.cumulativeResults];
-// now emit full structured object
+this.serializableDataByType[actionType][name] = [...this.cumulativeResults];
 yield this.options.serializableCallback({
 scrapeList: this.serializableDataByType.scrapeList,
 scrapeSchema: this.serializableDataByType.scrapeSchema
 });
+if (this.options.robotType === 'deep-extract' && !this.isInDeepExtractionPhase && this.initializedWorkflow) {
+if (!this.pendingDeepExtraction) {
+console.log('DEBUG: Building hierarchical deep extraction plan from scrapeSchema...');
+const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
+if (hierarchyData.length > 0) {
+const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
+const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
+this.log(`Root scrapeSchema will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
+// Extract URLs from schema fields
+const urls = yield this.extractHrefsFromPage(page, schema);
+this.log(`scrapeSchema extracted ${urls.length} URLs from field selectors`, logger_1.Level.LOG);
+// Filter URLs against pattern
+const rootUrlMappings = urls
+.map((url, index) => ({
+scrapeListIndex: index,
+url: this.matchesGotoPattern(url, nextLevelGotoPattern) ? url : null
+}))
+.filter(m => m.url !== null);
+this.log(`Matched ${rootUrlMappings.length} URLs against pattern ${nextLevelGotoPattern}`, logger_1.Level.LOG);
+this.pendingDeepExtraction = {
+page,
+hierarchy: hierarchyData.map((level, idx) => ({
+gotoPattern: level.gotoPattern,
+actionsToExecute: level.actionsToExecute,
+urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
+}))
+};
+}
+else {
+console.log('DEBUG: No goto actions found, deep extraction skipped');
+}
+}
+else {
+this.log(`[Deep Extract] scrapeSchema "${name}" extracting URLs during workflow execution`, logger_1.Level.LOG);
+const hierarchy = this.pendingDeepExtraction.hierarchy;
+if (hierarchy && hierarchy.length > 0) {
+let targetLevelIndex = -1;
+for (let i = hierarchy.length - 1; i >= 0; i--) {
+if (hierarchy[i].urlMappings.length === 0) {
+targetLevelIndex = i;
+break;
+}
+}
+if (targetLevelIndex >= 0) {
+const targetGotoPattern = hierarchy[targetLevelIndex].gotoPattern;
+this.log(`[Deep Extract] Storing URLs for level ${targetLevelIndex}, pattern: ${targetGotoPattern}`, logger_1.Level.LOG);
+const urls = yield this.extractHrefsFromPage(page, schema);
+this.log(`[Deep Extract] Extracted ${urls.length} URLs from scrapeSchema field selectors`, logger_1.Level.LOG);
+const urlMappings = urls
+.map((url, index) => ({
+index,
+url: this.matchesGotoPattern(url, targetGotoPattern) ? url : null
+}))
+.filter(m => m.url !== null);
+if (hierarchy[targetLevelIndex].urlMappings.length > 0) {
+const existingUrls = new Set(hierarchy[targetLevelIndex].urlMappings.map(m => m.url).filter(u => u !== null));
+const newUrls = urlMappings.filter(m => m.url !== null && !existingUrls.has(m.url));
+if (newUrls.length > 0) {
+const startIndex = hierarchy[targetLevelIndex].urlMappings.length;
+hierarchy[targetLevelIndex].urlMappings.push(...newUrls.map((m, idx) => ({ index: startIndex + idx, url: m.url })));
+this.log(`[Deep Extract] Merged ${newUrls.length} new URLs from scrapeSchema`, logger_1.Level.LOG);
+}
+}
+else {
+hierarchy[targetLevelIndex].urlMappings = urlMappings;
+}
+this.log(`[Deep Extract] Stored ${urlMappings.length} matching URLs`, logger_1.Level.LOG);
+if (urlMappings.length > 0) {
+const sampleSize = Math.min(3, urlMappings.length);
+this.log(`[Deep Extract] Sample URLs (showing ${sampleSize} of ${urlMappings.length}):`, logger_1.Level.LOG);
+urlMappings.slice(0, sampleSize).forEach((mapping, idx) => {
+this.log(`[Deep Extract] ${idx + 1}. ${mapping.url}`, logger_1.Level.LOG);
+});
+}
+}
+}
+}
+}
 }),
-scrapeList: (
+scrapeList: (config_1, ...args_1) => __awaiter(this, [config_1, ...args_1], void 0, function* (config, actionName = "") {
 var _a, _b;
-// Check abort flag at start of scraping
 if (this.isAborted) {
 this.log('Workflow aborted, stopping scrapeList', logger_1.Level.WARN);
 return;
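For readers following the new scrapeSchema branch above: the pending deep-extraction state it builds is a plain object whose shape can be read off the assignments in the diff. A rough sketch for orientation only (these types are not exported by the package):

    interface UrlMapping { index: number; url: string | null; }

    interface DeepExtractionLevel {
        gotoPattern: string;          // URL pattern taken from the goto action
        actionsToExecute: unknown[];  // where-what pairs to run at this level
        urlMappings: UrlMapping[];    // filled in as capture actions run
    }

    interface PendingDeepExtraction {
        page: unknown;                // Playwright Page the URLs were captured from
        hierarchy: DeepExtractionLevel[];
    }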
@@ -533,53 +602,156 @@ class Interpreter extends events_1.EventEmitter {
 }
 catch (error) {
 console.warn('ScrapeList evaluation failed:', error.message);
-return [];
+return [];
 }
 }, config);
 }
 else {
 paginationUsed = true;
-
+const paginationResult = yield this.handlePagination(page, config, actionName);
+scrapeResults = paginationResult.results;
+const paginationUrls = paginationResult.urls;
+if (this.options.robotType === 'deep-extract' && this.initializedWorkflow && scrapeResults.length > 0) {
+if (!this.pendingDeepExtraction) {
+console.log('DEBUG: Building hierarchical deep extraction plan from pagination...');
+const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
+if (hierarchyData.length > 0) {
+const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
+const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
+this.log(`Root scrapeList (pagination) will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
+const rootUrlMappings = this.filterDeepExtractionUrlsFromExtracted(paginationUrls, scrapeResults, nextLevelGotoPattern);
+this.pendingDeepExtraction = {
+page,
+hierarchy: hierarchyData.map((level, idx) => ({
+gotoPattern: level.gotoPattern,
+actionsToExecute: level.actionsToExecute,
+urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
+}))
+};
+}
+}
+else {
+this.log(`[Deep Extract] scrapeList (pagination) "${actionName}" extracting URLs`, logger_1.Level.LOG);
+const hierarchy = this.pendingDeepExtraction.hierarchy;
+if (hierarchy && hierarchy.length > 0) {
+const nextLevelIndex = hierarchy.length >= 3 ? hierarchy.length - 3 : 0;
+if (nextLevelIndex >= 0 && hierarchy[nextLevelIndex]) {
+const nextGotoPattern = hierarchy[nextLevelIndex].gotoPattern;
+this.log(`[Deep Extract] Extracting URLs for pattern: ${nextGotoPattern}`, logger_1.Level.LOG);
+const urlMappings = this.filterDeepExtractionUrlsFromExtracted(paginationUrls, scrapeResults, nextGotoPattern);
+this.log(`[Deep Extract] Found ${urlMappings.filter(m => m.url !== null).length} matching URLs`, logger_1.Level.LOG);
+const validUrls = urlMappings.filter(m => m.url !== null);
+if (validUrls.length > 0) {
+const sampleSize = Math.min(3, validUrls.length);
+this.log(`[Deep Extract] Sample URLs (showing ${sampleSize} of ${validUrls.length}):`, logger_1.Level.LOG);
+validUrls.slice(0, sampleSize).forEach((mapping, idx) => {
+this.log(`[Deep Extract] ${idx + 1}. ${mapping.url}`, logger_1.Level.LOG);
+});
+}
+}
+}
+}
+}
 }
-// Ensure we always have an array
 if (!Array.isArray(scrapeResults)) {
 scrapeResults = [];
 }
-console.log(`ScrapeList completed with ${scrapeResults.length} results`);
-// Only process and callback if pagination wasn't used
-// (handlePagination already handles storage and callbacks internally)
 if (!paginationUsed) {
-// ✅ Append list results under "scrapeList" → name
 const actionType = "scrapeList";
-let
-
-if (!actionName || actionName.trim() === "") {
+let name = actionName || "";
+if (!name || name.trim() === "" || this.isInDeepExtractionPhase) {
 this.scrapeListCounter++;
-
+name = `List ${this.scrapeListCounter}`;
 }
 if (!this.serializableDataByType[actionType])
 this.serializableDataByType[actionType] = {};
-if (!this.serializableDataByType[actionType][
-this.serializableDataByType[actionType][
+if (!this.serializableDataByType[actionType][name]) {
+this.serializableDataByType[actionType][name] = [];
 }
-this.serializableDataByType[actionType][
+this.serializableDataByType[actionType][name].push(...scrapeResults);
 yield this.options.serializableCallback({
 scrapeList: this.serializableDataByType.scrapeList,
 scrapeSchema: this.serializableDataByType.scrapeSchema
 });
+console.log(`DEBUG: Checking deep extract condition: robotType=${this.options.robotType}, hasWorkflow=${!!currentWorkflow}, alreadyPending=${!!this.pendingDeepExtraction}`);
+if (this.options.robotType === 'deep-extract' && !this.isInDeepExtractionPhase && this.initializedWorkflow) {
+if (!this.pendingDeepExtraction) {
+console.log('DEBUG: Building hierarchical deep extraction plan...');
+const hierarchyData = this.buildDeepExtractionHierarchy(this.initializedWorkflow);
+if (hierarchyData.length > 0) {
+const nextLevelIndex = hierarchyData.length >= 2 ? hierarchyData.length - 2 : hierarchyData.length - 1;
+const nextLevelGotoPattern = hierarchyData[nextLevelIndex].gotoPattern;
+this.log(`Root scrapeList will extract URLs matching pattern: ${nextLevelGotoPattern}`, logger_1.Level.LOG);
+const rootUrlMappings = yield this.filterDeepExtractionUrls(page, config.listSelector, scrapeResults, nextLevelGotoPattern);
+this.pendingDeepExtraction = {
+page,
+hierarchy: hierarchyData.map((level, idx) => ({
+gotoPattern: level.gotoPattern,
+actionsToExecute: level.actionsToExecute,
+urlMappings: idx === nextLevelIndex ? rootUrlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url })) : []
+}))
+};
+}
+else {
+console.log('DEBUG: No goto actions found, deep extraction skipped');
+}
+}
+else {
+this.log(`[Deep Extract] scrapeList "${name}" extracting URLs during workflow execution`, logger_1.Level.LOG);
+const hierarchy = this.pendingDeepExtraction.hierarchy;
+if (hierarchy && hierarchy.length > 0) {
+let targetLevelIndex = -1;
+for (let i = hierarchy.length - 1; i >= 0; i--) {
+if (hierarchy[i].urlMappings.length === 0) {
+targetLevelIndex = i;
+break;
+}
+}
+if (targetLevelIndex >= 0) {
+const nextGotoPattern = hierarchy[targetLevelIndex].gotoPattern;
+this.log(`[Deep Extract] Storing URLs for level ${targetLevelIndex}, pattern: ${nextGotoPattern}`, logger_1.Level.LOG);
+const urlMappings = yield this.filterDeepExtractionUrls(page, config.listSelector, scrapeResults, nextGotoPattern);
+if (hierarchy[targetLevelIndex].urlMappings.length > 0) {
+const existingUrls = new Set(hierarchy[targetLevelIndex].urlMappings.map(m => m.url).filter(u => u !== null));
+const newUrls = urlMappings.filter(m => m.url !== null && !existingUrls.has(m.url));
+if (newUrls.length > 0) {
+const startIndex = hierarchy[targetLevelIndex].urlMappings.length;
+hierarchy[targetLevelIndex].urlMappings.push(...newUrls.map((m, idx) => ({ index: startIndex + idx, url: m.url })));
+this.log(`[Deep Extract] Merged ${newUrls.length} new URLs`, logger_1.Level.LOG);
+}
+}
+else {
+hierarchy[targetLevelIndex].urlMappings = urlMappings.map(m => ({ index: m.scrapeListIndex, url: m.url }));
+}
+this.log(`[Deep Extract] Stored ${urlMappings.filter(m => m.url !== null).length} matching URLs`, logger_1.Level.LOG);
+const validUrls = urlMappings.filter(m => m.url !== null);
+if (validUrls.length > 0) {
+const sampleSize = Math.min(3, validUrls.length);
+this.log(`[Deep Extract] Sample URLs (showing ${sampleSize} of ${validUrls.length}):`, logger_1.Level.LOG);
+validUrls.slice(0, sampleSize).forEach((mapping, idx) => {
+this.log(`[Deep Extract] ${idx + 1}. ${mapping.url}`, logger_1.Level.LOG);
+});
+}
+}
+}
+}
+}
 }
 }
 catch (error) {
 console.error('ScrapeList action failed completely:', error.message);
-// Don't throw error, just return empty array
 const actionType = "scrapeList";
-
+let name = actionName || "";
+if (!name || name.trim() === "") {
+this.scrapeListCounter++;
+name = `List ${this.scrapeListCounter}`;
+}
 if (!this.namedResults[actionType])
 this.namedResults[actionType] = {};
-this.namedResults[actionType][
+this.namedResults[actionType][name] = [];
 if (!this.serializableDataByType[actionType])
 this.serializableDataByType[actionType] = {};
-this.serializableDataByType[actionType][
+this.serializableDataByType[actionType][name] = [];
 yield this.options.serializableCallback({
 scrapeList: this.serializableDataByType.scrapeList,
 scrapeSchema: this.serializableDataByType.scrapeSchema
@@ -662,25 +834,7 @@ class Interpreter extends events_1.EventEmitter {
 if (debug === null || debug === void 0 ? void 0 : debug.setActionType) {
 debug.setActionType(String(step.action));
 }
-
-if (step === null || step === void 0 ? void 0 : step.name) {
-stepName = step.name;
-}
-else if (Array.isArray(step === null || step === void 0 ? void 0 : step.args) &&
-step.args.length > 0 &&
-typeof step.args[0] === "object" &&
-"__name" in step.args[0]) {
-stepName = step.args[0].__name;
-}
-else if (typeof (step === null || step === void 0 ? void 0 : step.args) === "object" &&
-(step === null || step === void 0 ? void 0 : step.args) !== null &&
-"__name" in step.args) {
-stepName = step.args.__name;
-}
-// Default fallback
-if (!stepName) {
-stepName = String(step.action);
-}
+stepName = (step === null || step === void 0 ? void 0 : step.name) || String(step.action);
 if (debug && typeof debug.setActionName === "function") {
 debug.setActionName(stepName);
 }
@@ -693,9 +847,12 @@ class Interpreter extends events_1.EventEmitter {
 // "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
 const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
 if (step.action === 'screenshot') {
-// call the screenshot handler directly to allow the extra name parameter
 yield wawActions.screenshot(...(params !== null && params !== void 0 ? params : []), stepName !== null && stepName !== void 0 ? stepName : undefined);
 }
+else if (step.action === 'scrapeList' || step.action === 'scrapeSchema') {
+const actionName = step.name || "";
+yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []), actionName);
+}
 else {
 yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []));
 }
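With this change, scrapeList and scrapeSchema steps receive the step's name as a trailing argument, and their results are grouped under that name in the structured callback. A sketch of the payload passed to serializableCallback, inferred from the calls in the hunks above (the keys "List 1" and "Texts" are the defaults used when a step has no name):

    type StructuredResults = {
        scrapeList: Record<string, unknown[]>;   // e.g. { "List 1": [...] }
        scrapeSchema: Record<string, unknown[]>; // e.g. { "Texts": [...] }
    };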
@@ -755,17 +912,16 @@ class Interpreter extends events_1.EventEmitter {
 }
 });
 }
-handlePagination(
-return __awaiter(this,
-// Check abort flag at start of pagination
+handlePagination(page_1, config_1) {
+return __awaiter(this, arguments, void 0, function* (page, config, providedActionName = "") {
 if (this.isAborted) {
 this.log('Workflow aborted, stopping pagination', logger_1.Level.WARN);
-return [];
+return { results: [], urls: [] };
 }
-// Generate action name for this scrapeList
 const actionType = "scrapeList";
-let actionName =
-
+let actionName = providedActionName || "";
+// During deep extraction, ALWAYS auto-increment to create separate lists for each URL
+if (!actionName || actionName.trim() === "" || this.isInDeepExtractionPhase) {
 this.scrapeListCounter++;
 actionName = `List ${this.scrapeListCounter}`;
 }
@@ -777,6 +933,7 @@ class Interpreter extends events_1.EventEmitter {
 this.serializableDataByType[actionType][actionName] = [];
 }
 let allResults = [];
+let allUrls = []; // Track URLs alongside results for deep-extract
 let previousHeight = 0;
 let scrapedItems = new Set();
 let visitedUrls = new Set();
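handlePagination now resolves to an object instead of a bare results array, so callers can keep extracted URLs aligned with the scraped rows. The shape implied by the return statements in the following hunks (illustrative names, not exported types):

    interface PaginationOutcome {
        results: unknown[]; // scraped items, in collection order
        urls: string[][];   // per-item URL lists, index-aligned with results
    }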
@@ -803,14 +960,22 @@ class Interpreter extends events_1.EventEmitter {
 debugLog(`Page evaluation failed: ${error.message}`);
 return;
 }
-
+// Extract URLs for ALL items BEFORE filtering duplicates
+// This ensures URL indices match result indices
+const allItemUrls = yield this.extractUrlsFromCurrentPage(page, config.listSelector, results.length);
+// Filter results AND URLs together using the same uniqueness logic
+const newResults = [];
+const newUrls = [];
+results.forEach((item, index) => {
 const uniqueKey = JSON.stringify(item);
-if (scrapedItems.has(uniqueKey))
-
-
-
+if (!scrapedItems.has(uniqueKey)) {
+scrapedItems.add(uniqueKey);
+newResults.push(item);
+newUrls.push(allItemUrls[index] || []); // Add corresponding URLs
+}
 });
 allResults = allResults.concat(newResults);
+allUrls = allUrls.concat(newUrls);
 debugLog("Results collected:", allResults.length);
 // Store in serializableDataByType and send structured callback
 this.serializableDataByType[actionType][actionName] = [...allResults];
@@ -822,6 +987,7 @@ class Interpreter extends events_1.EventEmitter {
 const checkLimit = () => {
 if (config.limit && allResults.length >= config.limit) {
 allResults = allResults.slice(0, config.limit);
+allUrls = allUrls.slice(0, config.limit); // Also trim URLs to maintain sync
 return true;
 }
 return false;
@@ -947,16 +1113,16 @@ class Interpreter extends events_1.EventEmitter {
 // Check abort flag at start of each pagination iteration
 if (this.isAborted) {
 this.log('Workflow aborted during pagination loop', logger_1.Level.WARN);
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 // Pagination circuit breakers
 if (++paginationIterations > MAX_PAGINATION_ITERATIONS) {
 debugLog(`Maximum pagination iterations reached (${MAX_PAGINATION_ITERATIONS}), stopping`);
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 if (Date.now() - paginationStartTime > MAX_PAGINATION_TIME) {
 debugLog('Maximum pagination time reached (10 minutes), stopping');
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 // Add async yield every 5 iterations to prevent event loop blocking
 if (paginationIterations % 5 === 0) {
@@ -967,7 +1133,7 @@ class Interpreter extends events_1.EventEmitter {
 let previousResultCount = allResults.length;
 yield scrapeCurrentPage();
 if (checkLimit()) {
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 yield page.evaluate(() => {
 const scrollHeight = Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);
@@ -981,14 +1147,14 @@ class Interpreter extends events_1.EventEmitter {
 if (currentResultCount === previousResultCount) {
 unchangedResultCounter++;
 if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 }
 else {
 unchangedResultCounter = 0;
 }
 if (currentHeight === previousHeight) {
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 previousHeight = currentHeight;
 break;
@@ -997,7 +1163,7 @@ class Interpreter extends events_1.EventEmitter {
 let previousResultCount = allResults.length;
 yield scrapeCurrentPage();
 if (checkLimit()) {
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 yield page.evaluate(() => window.scrollTo(0, 0));
 yield page.waitForTimeout(2000);
@@ -1006,14 +1172,14 @@ class Interpreter extends events_1.EventEmitter {
 if (currentResultCount === previousResultCount) {
 unchangedResultCounter++;
 if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 }
 else {
 unchangedResultCounter = 0;
 }
 if (currentTopHeight === 0) {
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 previousHeight = currentTopHeight;
 break;
@@ -1023,7 +1189,7 @@ class Interpreter extends events_1.EventEmitter {
 visitedUrls.add(currentUrl);
 yield scrapeCurrentPage();
 if (checkLimit())
-return allResults;
+return { results: allResults, urls: allUrls };
 const { button, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
 availableSelectors = updatedSelectors;
 if (!button || !workingSelector) {
@@ -1039,7 +1205,7 @@ class Interpreter extends events_1.EventEmitter {
 }
 }));
 if (!success)
-return allResults;
+return { results: allResults, urls: allUrls };
 break;
 }
 let retryCount = 0;
@@ -1169,14 +1335,14 @@ class Interpreter extends events_1.EventEmitter {
 }
 if (!paginationSuccess) {
 debugLog(`Pagination failed after ${MAX_RETRIES} attempts`);
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 break;
 }
 case 'clickLoadMore': {
 yield scrapeCurrentPage();
 if (checkLimit())
-return allResults;
+return { results: allResults, urls: allUrls };
 let loadMoreCounter = 0;
 const MAX_LOAD_MORE_ITERATIONS = 100; // Prevent infinite load more
 const loadMoreStartTime = Date.now();
@@ -1185,11 +1351,11 @@ class Interpreter extends events_1.EventEmitter {
 // Load more circuit breakers
 if (loadMoreCounter >= MAX_LOAD_MORE_ITERATIONS) {
 debugLog(`Maximum load more iterations reached (${MAX_LOAD_MORE_ITERATIONS}), stopping`);
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 if (Date.now() - loadMoreStartTime > MAX_LOAD_MORE_TIME) {
 debugLog('Maximum load more time reached (5 minutes), stopping');
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 // Add async yield every 3 iterations
 if (loadMoreCounter % 3 === 0 && loadMoreCounter > 0) {
@@ -1200,7 +1366,7 @@ class Interpreter extends events_1.EventEmitter {
 availableSelectors = updatedSelectors;
 if (!workingSelector || !loadMoreButton) {
 debugLog('No working Load More selector found after retries');
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 // Implement retry mechanism for clicking the button
 let retryCount = 0;
@@ -1240,7 +1406,7 @@ class Interpreter extends events_1.EventEmitter {
 }
 if (!clickSuccess) {
 debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 // Wait for content to load and check scroll height
 yield page.waitForTimeout(2000);
@@ -1269,16 +1435,16 @@ class Interpreter extends events_1.EventEmitter {
 // previousResultCount = currentResultCount;
 // }
 if (checkLimit())
-return allResults;
+return { results: allResults, urls: allUrls };
 if (!heightChanged) {
 debugLog('No more items loaded after Load More');
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 }
 }
 default: {
 yield scrapeCurrentPage();
-return allResults;
+return { results: allResults, urls: allUrls };
 }
 }
 if (checkLimit())
@@ -1287,9 +1453,9 @@ class Interpreter extends events_1.EventEmitter {
 }
 catch (error) {
 debugLog(`Fatal error: ${error.message}`);
-return allResults;
+return { results: allResults, urls: allUrls };
 }
-return allResults;
+return { results: allResults, urls: allUrls };
 });
 }
 getMatchingActionId(workflow, pageState, usedActions) {
@@ -2213,6 +2379,518 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
2213
2379
|
return modifiedAction;
|
|
2214
2380
|
});
|
|
2215
2381
|
}
|
|
2382
|
+
/**
|
|
2383
|
+
* Extracts URLs from the current page's list elements.
|
|
2384
|
+
* Used during pagination to maintain sync between scraped results and extracted URLs.
|
|
2385
|
+
*
|
|
2386
|
+
* @param page - Playwright page object
|
|
2387
|
+
* @param listSelector - The selector used to identify list elements
|
|
2388
|
+
* @param limit - Maximum number of elements to process (should match number of scraped items)
|
|
2389
|
+
* @returns Array of URL arrays, one per list element
|
|
2390
|
+
*/
|
|
2391
|
+
extractUrlsFromCurrentPage(page, listSelector, limit) {
|
|
2392
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
2393
|
+
const extractedUrls = yield page.evaluate(({ selector, limit }) => {
|
|
2394
|
+
const urlsByElement = [];
|
|
2395
|
+
let listElements = [];
|
|
2396
|
+
if (selector.startsWith('//') || selector.startsWith('(//')) {
|
|
2397
|
+
const xpathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
|
2398
|
+
for (let i = 0; i < xpathResult.snapshotLength; i++) {
|
|
2399
|
+
const node = xpathResult.snapshotItem(i);
|
|
2400
|
+
if (node && node.nodeType === Node.ELEMENT_NODE) {
|
|
2401
|
+
listElements.push(node);
|
|
2402
|
+
}
|
|
2403
|
+
}
|
|
2404
|
+
}
|
|
2405
|
+
else {
|
|
2406
|
+
listElements = Array.from(document.querySelectorAll(selector));
|
|
2407
|
+
}
|
|
2408
|
+
// Extract URLs from the first 'limit' elements that match the selector
|
|
2409
|
+
// The limit corresponds to the number of items that were scraped
|
|
2410
|
+
const elementsToProcess = listElements.slice(0, limit);
|
|
2411
|
+
elementsToProcess.forEach(element => {
|
|
2412
|
+
const urls = [];
|
|
2413
|
+
if (element.tagName === 'A' && element.href) {
|
|
2414
|
+
urls.push(element.href);
|
|
2415
|
+
}
|
|
2416
|
+
const anchors = element.querySelectorAll('a[href]');
|
|
2417
|
+
anchors.forEach(anchor => {
|
|
2418
|
+
const href = anchor.href;
|
|
2419
|
+
if (href && !urls.includes(href)) {
|
|
2420
|
+
urls.push(href);
|
|
2421
|
+
}
|
|
2422
|
+
});
|
|
2423
|
+
urlsByElement.push(urls);
|
|
2424
|
+
});
|
|
2425
|
+
return urlsByElement;
|
|
2426
|
+
}, { selector: listSelector, limit });
|
|
2427
|
+
return extractedUrls;
|
|
2428
|
+
});
|
|
2429
|
+
}
|
|
2430
|
+
/**
|
|
2431
|
+
* Builds a hierarchical deep extraction plan by analyzing the workflow structure.
|
|
2432
|
+
* Identifies goto actions and determines what actions to execute at each level.
|
|
2433
|
+
* Workflow is bottom-to-top, so we scan from end to start.
|
|
2434
|
+
*/
|
|
2435
|
+
buildDeepExtractionHierarchy(currentWorkflow) {
|
|
2436
|
+
var _a, _b;
|
|
2437
|
+
const hierarchy = [];
|
|
2438
|
+
// Find all goto action indices with their patterns
|
|
2439
|
+
const gotoData = [];
|
|
2440
|
+
currentWorkflow.forEach((pair, index) => {
|
|
2441
|
+
var _a;
|
|
2442
|
+
if (pair.what && pair.what.some(action => action.action === 'goto')) {
|
|
2443
|
+
const gotoAction = pair.what.find(action => action.action === 'goto');
|
|
2444
|
+
const pattern = (_a = gotoAction === null || gotoAction === void 0 ? void 0 : gotoAction.args) === null || _a === void 0 ? void 0 : _a[0];
|
|
2445
|
+
if (pattern) {
|
|
2446
|
+
gotoData.push({ index, pattern: String(pattern) });
|
|
2447
|
+
}
|
|
2448
|
+
}
|
|
2449
|
+
});
|
|
2450
|
+
if (gotoData.length === 0) {
|
|
2451
|
+
this.log('No goto actions found in workflow', logger_1.Level.WARN);
|
|
2452
|
+
return [];
|
|
2453
|
+
}
|
|
2454
|
+
this.log(`Found ${gotoData.length} goto action(s) at indices: ${gotoData.map(g => g.index).join(', ')}`, logger_1.Level.LOG);
|
|
2455
|
+
const uniqueGotos = [];
|
|
2456
|
+
for (let i = 0; i < gotoData.length; i++) {
|
|
2457
|
+
const current = gotoData[i];
|
|
2458
|
+
const next = gotoData[i + 1];
|
|
2459
|
+
if (next && current.pattern === next.pattern) {
|
|
2460
|
+
this.log(`Skipping duplicate goto at index ${next.index} (same as ${current.index})`, logger_1.Level.LOG);
|
|
2461
|
+
i++;
|
|
2462
|
+
}
|
|
2463
|
+
uniqueGotos.push(current);
|
|
2464
|
+
}
|
|
2465
|
+
this.log(`After deduplication: ${uniqueGotos.length} unique goto(s)`, logger_1.Level.LOG);
|
|
2466
|
+
for (let i = 0; i < uniqueGotos.length; i++) {
|
|
2467
|
+
const gotoIndex = uniqueGotos[i].index;
|
|
2468
|
+
const gotoPattern = uniqueGotos[i].pattern;
|
|
2469
|
+
const nextGotoIndex = i > 0 ? uniqueGotos[i - 1].index : 0;
|
|
2470
|
+
let actionsToExecute = currentWorkflow.slice(nextGotoIndex, gotoIndex);
|
|
2471
|
+
actionsToExecute = actionsToExecute.filter(pair => {
|
|
2472
|
+
return !pair.what || !pair.what.some(action => action.action === 'goto');
|
|
2473
|
+
});
|
|
2474
|
+
const dataExtractionActions = actionsToExecute.filter(pair => {
|
|
2475
|
+
return pair.what && pair.what.some(action => action.action === 'scrapeSchema' ||
|
|
2476
|
+
action.action === 'scrapeList' ||
|
|
2477
|
+
action.action === 'screenshot');
|
|
2478
|
+
});
|
|
2479
|
+
if (dataExtractionActions.length === 0) {
|
|
2480
|
+
this.log(`No data extraction actions found between goto at ${gotoIndex} and next level`, logger_1.Level.WARN);
|
|
2481
|
+
continue;
|
|
2482
|
+
}
|
|
2483
|
+
let sourceActionName = '';
|
|
2484
|
+
let sourceActionType = 'scrapeList';
|
|
2485
|
+
if (i === uniqueGotos.length - 1) {
|
|
2486
|
+
const scrapeListBefore = currentWorkflow.slice(gotoIndex + 1).find(pair => pair.what && pair.what.some(action => action.action === 'scrapeList'));
|
|
2487
|
+
if (scrapeListBefore) {
|
|
2488
|
+
const scrapeListAction = scrapeListBefore.what.find(action => action.action === 'scrapeList');
|
|
2489
|
+
sourceActionName = ((_b = (_a = scrapeListAction === null || scrapeListAction === void 0 ? void 0 : scrapeListAction.args) === null || _a === void 0 ? void 0 : _a[0]) === null || _b === void 0 ? void 0 : _b.name) || (scrapeListAction === null || scrapeListAction === void 0 ? void 0 : scrapeListAction.name) || '';
|
|
2490
|
+
sourceActionType = 'scrapeList';
|
|
2491
|
+
}
|
|
2492
|
+
}
|
|
2493
|
+
else {
|
|
2494
|
+
sourceActionName = '';
|
|
2495
|
+
sourceActionType = 'scrapeSchema';
|
|
2496
|
+
}
|
|
2497
|
+
hierarchy.push({
|
|
2498
|
+
gotoActionIndex: gotoIndex,
|
|
2499
|
+
gotoPattern: String(gotoPattern),
|
|
2500
|
+
actionsToExecute: dataExtractionActions,
|
|
2501
|
+
sourceActionName,
|
|
2502
|
+
sourceActionType
|
|
2503
|
+
});
|
|
2504
|
+
this.log(`Level ${i}: goto at index ${gotoIndex}, pattern=${gotoPattern}, actions=${dataExtractionActions.length}`, logger_1.Level.LOG);
|
|
2505
|
+
}
|
|
2506
|
+
return hierarchy;
|
|
2507
|
+
}
|
|
2508
|
+
/**
|
|
2509
|
+
* Extracts hrefs directly from the page based on scrapeSchema selectors.
|
|
2510
|
+
* Checks ALL selectors from the schema config - if they point to anchor elements, extract href.
|
|
2511
|
+
* This is called after scrapeSchema executes to capture hrefs for deep extraction.
|
|
2512
|
+
*/
|
|
2513
|
+
extractHrefsFromPage(page, schemaConfig) {
|
|
2514
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
2515
|
+
try {
|
|
2516
|
+
const fields = schemaConfig.fields || schemaConfig;
|
|
2517
|
+
const selectors = [];
|
|
2518
|
+
for (const [fieldName, fieldConfig] of Object.entries(fields)) {
|
|
2519
|
+
if (fieldConfig && typeof fieldConfig === 'object' && fieldConfig.selector) {
|
|
2520
|
+
selectors.push(String(fieldConfig.selector));
|
|
2521
|
+
}
|
|
2522
|
+
}
|
|
2523
|
+
if (selectors.length === 0) {
|
|
2524
|
+
return [];
|
|
2525
|
+
}
|
|
2526
|
+
const extractedUrls = yield page.evaluate((selectorList) => {
|
|
2527
|
+
const urls = [];
|
|
2528
|
+
for (const selector of selectorList) {
|
|
2529
|
+
if (!selector)
|
|
2530
|
+
continue;
|
|
2531
|
+
try {
|
|
2532
|
+
let elements = [];
|
|
2533
|
+
if (selector.startsWith('//') || selector.startsWith('(//')) {
|
|
2534
|
+
const xpathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
|
2535
|
+
for (let i = 0; i < xpathResult.snapshotLength; i++) {
|
|
2536
|
+
const node = xpathResult.snapshotItem(i);
|
|
2537
|
+
if (node && node.nodeType === Node.ELEMENT_NODE) {
|
|
2538
|
+
elements.push(node);
|
|
2539
|
+
}
|
|
2540
|
+
}
|
|
2541
|
+
}
|
|
2542
|
+
else {
|
|
2543
|
+
elements = Array.from(document.querySelectorAll(selector));
|
|
2544
|
+
}
|
|
2545
|
+
for (const element of elements) {
|
|
2546
|
+
if (element.tagName === 'A' && element.href) {
|
|
2547
|
+
const href = element.href;
|
|
2548
|
+
if (href && !urls.includes(href)) {
|
|
2549
|
+
urls.push(href);
|
|
2550
|
+
}
|
|
2551
|
+
}
|
|
2552
|
+
}
|
|
2553
|
+
}
|
|
2554
|
+
catch (error) {
|
|
2555
|
+
console.warn(`Failed to extract hrefs for selector ${selector}:`, error);
|
|
2556
|
+
}
|
|
2557
|
+
}
|
|
2558
|
+
return urls;
|
|
2559
|
+
}, selectors);
|
|
2560
|
+
this.log(`Extracted ${extractedUrls.length} hrefs from page for schema selectors`, logger_1.Level.LOG);
|
|
2561
|
+
return extractedUrls;
|
|
2562
|
+
}
|
|
2563
|
+
catch (error) {
|
|
2564
|
+
this.log(`Failed to extract hrefs from page: ${error.message}`, logger_1.Level.ERROR);
|
|
2565
|
+
return [];
|
|
2566
|
+
}
|
|
2567
|
+
});
|
|
2568
|
+
}
|
|
2569
|
+
/**
|
|
2570
|
+
* Filters URLs for deep extraction based on the goto action pattern.
|
|
2571
|
+
* This is called immediately after the first capture action (scrapeList).
|
|
2572
|
+
* Returns the filtered URL mappings that should be processed after workflow completion.
|
|
2573
|
+
* Each mapping maintains alignment with the original scrapeList index.
|
|
2574
|
+
*/
|
|
2575
|
+
filterDeepExtractionUrls(page, listSelector, scrapeResults, gotoTargetPattern) {
|
|
2576
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
2577
|
+
try {
|
|
2578
|
+
this.log(`Deep extraction: Filtering URLs from list structure (${scrapeResults.length} items)`, logger_1.Level.LOG);
|
|
2579
|
+
const extractedUrls = yield page.evaluate(({ selector, limit }) => {
|
|
2580
|
+
const urlsByElement = [];
|
|
2581
|
+
let listElements = [];
|
|
2582
|
+
if (selector.startsWith('//') || selector.startsWith('(//')) {
|
|
2583
|
+
const xpathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
|
2584
|
+
for (let i = 0; i < xpathResult.snapshotLength; i++) {
|
|
2585
|
+
const node = xpathResult.snapshotItem(i);
|
|
2586
|
+
if (node && node.nodeType === Node.ELEMENT_NODE) {
|
|
2587
|
+
listElements.push(node);
|
|
2588
|
+
}
|
|
2589
|
+
}
|
|
2590
|
+
}
|
|
2591
|
+
else {
|
|
2592
|
+
listElements = Array.from(document.querySelectorAll(selector));
|
|
2593
|
+
}
|
|
2594
|
+
const elementsToProcess = listElements.slice(0, limit);
|
|
2595
|
+
elementsToProcess.forEach(element => {
|
|
2596
|
+
const urls = [];
|
|
2597
|
+
if (element.tagName === 'A' && element.href) {
|
|
2598
|
+
urls.push(element.href);
|
|
2599
|
+
}
|
|
2600
|
+
const anchors = element.querySelectorAll('a[href]');
|
|
2601
|
+
anchors.forEach(anchor => {
|
|
2602
|
+
const href = anchor.href;
|
|
2603
|
+
if (href && !urls.includes(href)) {
|
|
2604
|
+
urls.push(href);
|
|
2605
|
+
}
|
|
2606
|
+
});
|
|
2607
|
+
urlsByElement.push(urls);
|
|
2608
|
+
});
|
|
2609
|
+
return urlsByElement;
|
|
2610
|
+
}, { selector: listSelector, limit: scrapeResults.length });
|
|
2611
|
+
const totalUrlCount = extractedUrls.reduce((sum, urls) => sum + urls.length, 0);
|
|
2612
|
+
this.log(`Extracted ${totalUrlCount} total URLs from ${scrapeResults.length} list items (avg ${(totalUrlCount / scrapeResults.length).toFixed(1)} URLs per item)`, logger_1.Level.LOG);
|
|
2613
|
+
const getUrlPattern = (url) => {
|
|
2614
|
+
try {
|
|
2615
|
+
const urlObj = new URL(url);
|
|
2616
|
+
const pathname = urlObj.pathname.replace(/\/$/, '');
|
|
2617
|
+
const segments = pathname.split('/').filter(s => s.length > 0);
|
|
2618
|
+
return {
|
|
2619
|
+
origin: urlObj.origin,
|
|
2620
|
+
pathSegments: segments
|
|
2621
|
+
};
|
|
2622
|
+
}
|
|
2623
|
+
catch (_a) {
|
|
2624
|
+
return null;
|
|
2625
|
+
}
|
|
2626
|
+
};
|
|
2627
|
+
const targetPattern = getUrlPattern(String(gotoTargetPattern));
|
|
2628
|
+
const targetNormalized = String(gotoTargetPattern).replace(/\/$/, '').toLowerCase();
|
|
2629
|
+
if (!targetPattern) {
|
|
2630
|
+
this.log('Could not parse goto URL pattern, skipping deep extraction', logger_1.Level.WARN);
|
|
2631
|
+
return [];
|
|
2632
|
+
}
|
|
2633
|
+
this.log(`Target URL pattern: ${targetPattern.origin}/${targetPattern.pathSegments.join('/')}`, logger_1.Level.LOG);
|
|
2634
|
+
const urlMappings = [];
|
|
2635
|
+
extractedUrls.forEach((urlsFromElement, scrapeListIndex) => {
|
|
2636
|
+
let matchingUrl = null;
|
|
2637
|
+
for (const url of urlsFromElement) {
|
|
2638
|
+
const urlPattern = getUrlPattern(url);
|
|
2639
|
+
if (!urlPattern)
|
|
2640
|
+
continue;
|
|
2641
|
+
if (urlPattern.origin !== targetPattern.origin)
|
|
2642
|
+
continue;
|
|
2643
|
+
if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
|
|
2644
|
+
continue;
|
|
2645
|
+
let pathMatches = true;
|
|
2646
|
+
for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
|
|
2647
|
+
if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
|
|
2648
|
+
pathMatches = false;
|
|
2649
|
+
break;
|
|
2650
|
+
}
|
|
2651
|
+
}
|
|
2652
|
+
if (!pathMatches)
|
|
2653
|
+
continue;
|
|
2654
|
+
const urlNormalized = url.replace(/\/$/, '').toLowerCase();
|
|
2655
|
+
if (urlNormalized === targetNormalized) {
|
|
2656
|
+
this.log(`Excluding already-visited URL: ${url}`, logger_1.Level.LOG);
|
|
2657
|
+
continue;
|
|
2658
|
+
}
|
|
2659
|
+
matchingUrl = url;
|
|
2660
|
+
break;
|
|
2661
|
+
}
|
|
2662
|
+
urlMappings.push({
|
|
2663
|
+
scrapeListIndex,
|
|
2664
|
+
url: matchingUrl
|
|
2665
|
+
});
|
|
2666
|
+
});
|
|
2667
|
+
const matchedCount = urlMappings.filter(m => m.url !== null).length;
|
|
2668
|
+
this.log(`Filtered to ${matchedCount} matching URLs for deep extraction (out of ${scrapeResults.length} total items)`, logger_1.Level.LOG);
|
|
2669
|
+
if (matchedCount > 0) {
|
|
2670
|
+
const matchedMappings = urlMappings.filter(m => m.url !== null);
|
|
2671
|
+
const sampleSize = Math.min(5, matchedMappings.length);
|
|
2672
|
+
const sample = matchedMappings.slice(0, sampleSize);
|
|
2673
|
+
this.log(`Sample of matching URLs (showing ${sampleSize} of ${matchedMappings.length}):`, logger_1.Level.LOG);
|
|
2674
|
+
sample.forEach((mapping, idx) => {
|
|
2675
|
+
this.log(` ${idx + 1}. [Index ${mapping.scrapeListIndex}] ${mapping.url}`, logger_1.Level.LOG);
|
|
2676
|
+
});
|
|
2677
|
+
}
|
|
2678
|
+
else {
|
|
2679
|
+
this.log('No matching URLs found. Check if extracted URLs match the pattern.', logger_1.Level.WARN);
|
|
2680
|
+
}
|
|
2681
|
+
return urlMappings;
|
|
2682
|
+
}
|
|
2683
|
+
catch (error) {
|
|
2684
|
+
this.log(`URL filtering failed: ${error.message}`, logger_1.Level.ERROR);
|
|
2685
|
+
return [];
|
|
2686
|
+
}
|
|
2687
|
+
});
|
|
2688
|
+
}
|
|
2689
|
+
/**
|
|
2690
|
+
* Filters pre-extracted URLs for deep extraction based on the goto action pattern.
|
|
2691
|
+
* This is used for paginated lists where URLs were extracted during pagination.
|
|
2692
|
+
* Returns the filtered URL mappings that maintain alignment with scrapeList indices.
|
|
2693
|
+
*/
|
|
2694
|
+
filterDeepExtractionUrlsFromExtracted(extractedUrls, scrapeResults, gotoTargetPattern) {
|
|
2695
|
+
try {
|
|
2696
|
+
const totalUrlCount = extractedUrls.reduce((sum, urls) => sum + urls.length, 0);
|
|
2697
|
+
this.log(`Deep extraction: Filtering ${totalUrlCount} pre-extracted URLs from ${scrapeResults.length} items`, logger_1.Level.LOG);
|
|
2698
|
+
const getUrlPattern = (url) => {
|
|
2699
|
+
try {
|
|
2700
|
+
const urlObj = new URL(url);
|
|
2701
|
+
const pathname = urlObj.pathname.replace(/\/$/, '');
|
|
2702
|
+
const segments = pathname.split('/').filter(s => s.length > 0);
|
|
2703
|
+
return {
|
|
2704
|
+
origin: urlObj.origin,
|
|
2705
|
+
pathSegments: segments
|
|
2706
|
+
};
|
|
2707
|
+
}
|
|
2708
|
+
catch (_a) {
|
|
2709
|
+
return null;
|
|
2710
|
+
}
|
|
2711
|
+
};
|
|
2712
|
+
const targetPattern = getUrlPattern(String(gotoTargetPattern));
|
|
2713
|
+
const targetNormalized = String(gotoTargetPattern).replace(/\/$/, '').toLowerCase();
|
|
2714
|
+
if (!targetPattern) {
|
|
2715
|
+
this.log('Could not parse goto URL pattern, skipping deep extraction', logger_1.Level.WARN);
|
|
2716
|
+
return [];
|
|
2717
|
+
}
|
|
2718
|
+
this.log(`Target URL pattern: ${targetPattern.origin}/${targetPattern.pathSegments.join('/')}`, logger_1.Level.LOG);
|
|
2719
|
+
const urlMappings = [];
|
|
2720
|
+
extractedUrls.forEach((urlsFromElement, scrapeListIndex) => {
|
|
2721
|
+
let matchingUrl = null;
|
|
2722
|
+
for (const url of urlsFromElement) {
|
|
2723
|
+
const urlPattern = getUrlPattern(url);
|
|
2724
|
+
if (!urlPattern)
|
|
2725
|
+
continue;
|
|
2726
|
+
if (urlPattern.origin !== targetPattern.origin)
|
|
2727
|
+
continue;
|
|
2728
|
+
if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
|
|
2729
|
+
continue;
|
|
2730
|
+
let pathMatches = true;
|
|
2731
|
+
for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
|
|
2732
|
+
if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
|
|
2733
|
+
pathMatches = false;
|
|
2734
|
+
break;
|
|
2735
|
+
}
|
|
2736
|
+
}
|
|
2737
|
+
if (!pathMatches)
|
|
2738
|
+
continue;
|
|
2739
|
+
const urlNormalized = url.replace(/\/$/, '').toLowerCase();
|
|
2740
|
+
if (urlNormalized === targetNormalized) {
|
|
2741
|
+
this.log(`Excluding already-visited URL: ${url}`, logger_1.Level.LOG);
|
|
2742
|
+
continue;
|
|
2743
|
+
}
|
|
2744
|
+
matchingUrl = url;
|
|
2745
|
+
break;
|
|
2746
|
+
}
|
|
2747
|
+
urlMappings.push({
|
|
2748
|
+
scrapeListIndex,
|
|
2749
|
+
url: matchingUrl
|
|
2750
|
+
});
|
|
2751
|
+
});
|
|
2752
|
+
const matchedCount = urlMappings.filter(m => m.url !== null).length;
|
|
2753
|
+
this.log(`Filtered to ${matchedCount} matching URLs for deep extraction (out of ${scrapeResults.length} total items)`, logger_1.Level.LOG);
|
|
2754
|
+
if (matchedCount > 0) {
|
|
2755
|
+
const matchedMappings = urlMappings.filter(m => m.url !== null);
|
|
2756
|
+
const sampleSize = Math.min(5, matchedMappings.length);
|
|
2757
|
+
const sample = matchedMappings.slice(0, sampleSize);
|
|
2758
|
+
this.log(`Sample of matching URLs (showing ${sampleSize} of ${matchedMappings.length}):`, logger_1.Level.LOG);
|
|
2759
|
+
sample.forEach((mapping, idx) => {
|
|
2760
|
+
this.log(` ${idx + 1}. [Index ${mapping.scrapeListIndex}] ${mapping.url}`, logger_1.Level.LOG);
|
|
2761
|
+
});
|
|
2762
|
+
}
|
|
2763
|
+
else {
|
|
2764
|
+
this.log('No matching URLs found. Check if extracted URLs match the pattern.', logger_1.Level.WARN);
|
|
2765
|
+
}
|
|
2766
|
+
return urlMappings;
|
|
2767
|
+
}
|
|
2768
|
+
catch (error) {
|
|
2769
|
+
this.log(`URL filtering failed: ${error.message}`, logger_1.Level.ERROR);
|
|
2770
|
+
return [];
|
|
2771
|
+
}
|
|
2772
|
+
}
+    /**
+     * Helper function to check if a URL matches a goto pattern.
+     */
+    matchesGotoPattern(url, gotoPattern) {
+        try {
+            const getUrlPattern = (urlStr) => {
+                try {
+                    const urlObj = new URL(urlStr);
+                    const pathname = urlObj.pathname.replace(/\/$/, '');
+                    const segments = pathname.split('/').filter(s => s.length > 0);
+                    return { origin: urlObj.origin, pathSegments: segments };
+                }
+                catch (_a) {
+                    return null;
+                }
+            };
+            const urlPattern = getUrlPattern(url);
+            const targetPattern = getUrlPattern(gotoPattern);
+            const targetNormalized = gotoPattern.replace(/\/$/, '').toLowerCase();
+            const urlNormalized = url.replace(/\/$/, '').toLowerCase();
+            if (!urlPattern || !targetPattern)
+                return false;
+            if (urlPattern.origin !== targetPattern.origin)
+                return false;
+            if (urlPattern.pathSegments.length !== targetPattern.pathSegments.length)
+                return false;
+            if (urlNormalized === targetNormalized)
+                return false; // Skip exact matches
+            for (let i = 0; i < targetPattern.pathSegments.length - 1; i++) {
+                if (urlPattern.pathSegments[i] !== targetPattern.pathSegments[i]) {
+                    return false;
+                }
+            }
+            return true;
+        }
+        catch (_a) {
+            return false;
+        }
+    }
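
Note the normalization used by the "skip exact matches" check above: both URLs are compared with a trailing slash stripped and lower-cased, so case and trailing-slash variants of the goto URL are still treated as already visited. A tiny illustration of what that normalization equates (the URLs are made up):

// Normalization used before the "already visited" comparison: trailing slash removed, case-insensitive.
const normalize = (u: string): string => u.replace(/\/$/, '').toLowerCase();

console.log(normalize('https://Example.com/Products/Item-1/') === normalize('https://example.com/products/item-1')); // true
console.log(normalize('https://example.com/products/item-2') === normalize('https://example.com/products/item-1')); // false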
+    /**
+     * Executes hierarchical deep extraction by processing each level recursively.
+     * URLs are already stored in each hierarchy level's urlMappings during workflow execution.
+     */
+    executeHierarchicalDeepExtraction(page, hierarchy) {
+        return __awaiter(this, void 0, void 0, function* () {
+            try {
+                if (hierarchy.length === 0) {
+                    this.log('No hierarchy levels to process', logger_1.Level.LOG);
+                    return;
+                }
+                this.log(`\n=== Starting Hierarchical Deep Extraction (${hierarchy.length} level${hierarchy.length > 1 ? 's' : ''}) ===`, logger_1.Level.LOG);
+                this.isInDeepExtractionPhase = true;
+                const startLevel = hierarchy.length >= 2 ? hierarchy.length - 2 : hierarchy.length - 1;
+                for (let levelIndex = startLevel; levelIndex >= 0; levelIndex--) {
+                    const level = hierarchy[levelIndex];
+                    const currentLevelUrls = level.urlMappings;
+                    this.log(`\n=== Processing Deep Extraction Level ${startLevel - levelIndex + 1}/${startLevel + 1} ===`, logger_1.Level.LOG);
+                    this.log(`Goto pattern: ${level.gotoPattern}`, logger_1.Level.LOG);
+                    this.log(`Actions to execute: ${level.actionsToExecute.length}`, logger_1.Level.LOG);
+                    this.log(`URLs to process: ${currentLevelUrls.filter(m => m.url !== null).length}`, logger_1.Level.LOG);
+                    if (currentLevelUrls.length === 0 || currentLevelUrls.every(u => !u.url)) {
+                        this.log('No valid URLs at this level - stopping here', logger_1.Level.LOG);
+                        break;
+                    }
+                    yield this.executeDeepExtractionLevel(page, level, currentLevelUrls);
+                }
+                this.log('\n=== Hierarchical Deep Extraction Completed ===', logger_1.Level.LOG);
+            }
+            catch (error) {
+                this.log(`Hierarchical deep extraction failed: ${error.message}`, logger_1.Level.ERROR);
+            }
+            finally {
+                this.isInDeepExtractionPhase = false;
+            }
+        });
+    }
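
For readers skimming the diff: the hierarchy is walked bottom-up. The loop starts at hierarchy.length - 2 when there are two or more levels (which appears to skip the level already handled by the main workflow), moves toward index 0, and stops at the first level with no usable URLs. A minimal sketch of the data each level is assumed to carry and of that traversal order; the DeepExtractionLevel interface and levelsToProcess helper are illustrative, inferred from the compiled code above.

// Illustrative shape of one deep-extraction level, inferred from the compiled code.
interface DeepExtractionLevel {
  gotoPattern: string;                                  // goto URL template for this level
  actionsToExecute: { what: { action: string }[] }[];   // where-what pairs replayed per URL
  urlMappings: { url: string | null }[];                // filled in during workflow execution
}

// Bottom-up traversal order used by executeHierarchicalDeepExtraction.
function levelsToProcess(hierarchy: DeepExtractionLevel[]): number[] {
  if (hierarchy.length === 0) return [];
  const startLevel = hierarchy.length >= 2 ? hierarchy.length - 2 : hierarchy.length - 1;
  const order: number[] = [];
  for (let levelIndex = startLevel; levelIndex >= 0; levelIndex--) {
    const level = hierarchy[levelIndex];
    // Stop as soon as a level has no usable URLs, mirroring the break above.
    if (level.urlMappings.length === 0 || level.urlMappings.every(u => !u.url)) break;
    order.push(levelIndex);
  }
  return order;
}

// With three levels, processing starts at index 1 and then 0; index 2 is never visited here.
console.log(levelsToProcess([
  { gotoPattern: 'https://example.com/a', actionsToExecute: [], urlMappings: [{ url: 'https://example.com/a/1' }] },
  { gotoPattern: 'https://example.com/b', actionsToExecute: [], urlMappings: [{ url: 'https://example.com/b/1' }] },
  { gotoPattern: 'https://example.com/c', actionsToExecute: [], urlMappings: [{ url: null }] },
])); // [1, 0]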
+    /**
+     * Executes deep extraction for a single level.
+     * URLs are already extracted and stored in hierarchy during workflow execution.
+     * This function just navigates to URLs and executes the capture actions.
+     */
+    executeDeepExtractionLevel(page, level, urlMappings) {
+        return __awaiter(this, void 0, void 0, function* () {
+            try {
+                const validMappings = urlMappings.filter(m => m.url !== null);
+                if (validMappings.length === 0) {
+                    this.log('No URLs to process for this level', logger_1.Level.LOG);
+                    return;
+                }
+                this.log(`Processing ${validMappings.length} URLs`, logger_1.Level.LOG);
+                for (const mapping of validMappings) {
+                    try {
+                        this.log(`[${mapping.index}] Navigating to: ${mapping.url}`, logger_1.Level.LOG);
+                        yield page.goto(mapping.url);
+                        yield page.waitForLoadState('networkidle', { timeout: 30000 });
+                        for (let i = level.actionsToExecute.length - 1; i >= 0; i--) {
+                            const actionPair = level.actionsToExecute[i];
+                            if (this.isAborted) {
+                                this.log('Workflow aborted during deep extraction', logger_1.Level.WARN);
+                                return;
+                            }
+                            const validatedAction = yield this.validateAndFixSelectors(page, actionPair);
+                            const filteredActions = validatedAction.what.filter(action => action.action === 'scrapeSchema' ||
+                                action.action === 'scrapeList' ||
+                                action.action === 'screenshot');
+                            if (filteredActions.length > 0) {
+                                yield this.carryOutSteps(page, filteredActions);
+                            }
+                        }
+                        this.log(`[${mapping.index}] Completed`, logger_1.Level.LOG);
+                    }
+                    catch (error) {
+                        this.log(`[${mapping.index}] Failed: ${error.message}`, logger_1.Level.ERROR);
+                    }
+                }
+            }
+            catch (error) {
+                this.log(`Level execution failed: ${error.message}`, logger_1.Level.ERROR);
+            }
+        });
+    }
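
The per-URL loop above is plain Playwright: navigate, wait for network idle, then replay only the capture actions (scrapeSchema, scrapeList, screenshot) from the level's where-what pairs. Below is a hedged standalone sketch of that navigate-and-capture skeleton; processLevel and runCapture are stand-ins for the interpreter's internals, and the URLs are illustrative.

import { chromium, Page } from 'playwright';

type CaptureAction = { action: string };
const CAPTURE_ACTIONS = new Set(['scrapeSchema', 'scrapeList', 'screenshot']);

// Stand-in for Interpreter.carryOutSteps; here it only reports what would run.
async function runCapture(page: Page, actions: CaptureAction[]): Promise<void> {
  console.log(`would run ${actions.length} capture action(s) on ${page.url()}`);
}

async function processLevel(page: Page, urls: (string | null)[], actions: CaptureAction[][]): Promise<void> {
  for (const url of urls) {
    if (!url) continue;                         // nulls are skipped, keeping index alignment intact
    try {
      await page.goto(url);
      await page.waitForLoadState('networkidle', { timeout: 30000 });
      for (const pair of actions) {
        const captures = pair.filter(a => CAPTURE_ACTIONS.has(a.action));
        if (captures.length > 0) await runCapture(page, captures);   // non-capture actions are dropped
      }
    } catch (err) {
      console.error(`failed on ${url}:`, (err as Error).message);
    }
  }
}

// Usage (illustrative detail-page URLs):
(async () => {
  const browser = await chromium.launch();
  const page = await browser.newPage();
  await processLevel(page, ['https://example.com/products/item-2', null],
    [[{ action: 'scrapeSchema' }, { action: 'click' }]]);
  await browser.close();
})();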
     runLoop(p, workflow) {
         return __awaiter(this, void 0, void 0, function* () {
             var _a, _b;
@@ -2297,6 +2975,20 @@ class Interpreter extends events_1.EventEmitter {
             }
             if (workflowCopy.length === 0) {
                 this.log('All actions completed. Workflow finished.', logger_1.Level.LOG);
+                if (this.pendingDeepExtraction) {
+                    this.log('Starting deferred hierarchical deep extraction now that workflow has completed...', logger_1.Level.LOG);
+                    const { page, hierarchy } = this.pendingDeepExtraction;
+                    try {
+                        yield this.executeHierarchicalDeepExtraction(page, hierarchy);
+                        this.log('Hierarchical deep extraction completed successfully', logger_1.Level.LOG);
+                    }
+                    catch (error) {
+                        this.log(`Hierarchical deep extraction failed: ${error.message}`, logger_1.Level.ERROR);
+                    }
+                    finally {
+                        this.pendingDeepExtraction = null;
+                    }
+                }
                 cleanup();
                 return;
             }
@@ -2365,7 +3057,7 @@ class Interpreter extends events_1.EventEmitter {
             try {
                 const validatedAction = yield this.validateAndFixSelectors(p, action);
                 console.log("Carrying out:", validatedAction.what);
-                yield this.carryOutSteps(p, validatedAction.what);
+                yield this.carryOutSteps(p, validatedAction.what, workflowCopy);
                 usedActions.push((_b = action.id) !== null && _b !== void 0 ? _b : 'undefined');
                 workflowCopy.splice(actionId, 1);
                 console.log(`Action with ID ${action.id} removed from the workflow copy.`);
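
The runLoop change above defers deep extraction rather than running it inline: pendingDeepExtraction is consumed only once workflowCopy is empty, so the main workflow (including any pagination) finishes before detail pages are visited, and the job is cleared whether or not it succeeds. A minimal sketch of that defer-then-drain pattern; drainWhenDone and the job type are illustrative, not the interpreter's API.

// Defer-then-drain: the deep-extraction job is parked while actions remain,
// and executed exactly once after the last action, then always cleared.
type DeepExtractionJob<P, H> = { page: P; hierarchy: H } | null;

async function drainWhenDone<P, H>(
  remainingActions: number,
  pending: DeepExtractionJob<P, H>,
  execute: (page: P, hierarchy: H) => Promise<void>,
): Promise<DeepExtractionJob<P, H>> {
  if (remainingActions > 0 || !pending) return pending;   // workflow not finished yet
  try {
    await execute(pending.page, pending.hierarchy);
  } catch (error) {
    console.error(`deep extraction failed: ${(error as Error).message}`);
  }
  return null;                                            // cleared in all cases, mirroring the finally block
}

// Usage: nothing happens while 2 actions remain; the job runs (and is cleared) at 0.
(async () => {
  const job = { page: 'page', hierarchy: ['level-0'] };
  console.log(await drainWhenDone(2, job, async () => console.log('extract')));  // job returned untouched
  console.log(await drainWhenDone(0, job, async () => console.log('extract')));  // logs "extract", then null
})();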
|