npm - mx-cloud - Versions diffs - 0.0.11 → 0.0.12 - Mend

mx-cloud 0.0.11 → 0.0.12

This diff shows the differences between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (2) [hide | show]

package/build/browserSide/scraper.js +426 -121
package/package.json +1 -1

package/build/browserSide/scraper.js CHANGED Viewed

@@ -359,7 +359,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
    */
     window.scrapeList = function (_a) {
         return __awaiter(this, arguments, void 0, function* ({ listSelector, fields, limit = 10 }) {
-            var _b;
             // XPath evaluation functions
             const evaluateXPath = (rootElement, xpath) => {
                 try {
@@ -372,7 +371,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
                     return result.singleNodeValue;
                 }
                 catch (error) {
-                    console.warn('XPath evaluation failed:', xpath, error);
+                    console.warn("XPath evaluation failed:", xpath, error);
                     return null;
                 }
             };
@@ -394,33 +393,41 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
                     return elements;
                 }
                 catch (error) {
-                    console.warn('XPath evaluation failed:', xpath, error);
+                    console.warn("XPath evaluation failed:", xpath, error);
                     return [];
                 }
             };
-            // Enhanced query function to handle iframe, frame, shadow DOM, and XPath
+            // Helper function to detect selector type
+            const isXPathSelector = (selector) => {
+                return (selector.startsWith("//") ||
+                    selector.startsWith("/") ||
+                    selector.startsWith("./"));
+            };
+            // Enhanced query function to handle iframe, frame, shadow DOM, CSS selectors, and XPath
             const queryElement = (rootElement, selector) => {
-                if (!selector.includes('>>') && !selector.includes(':>>')) {
+                if (!selector.includes(">>") && !selector.includes(":>>")) {
                     // Check if it's an XPath selector
-                    if (selector.startsWith('//') || selector.startsWith('/') || selector.startsWith('./')) {
+                    if (isXPathSelector(selector)) {
                         return evaluateXPath(rootElement, selector);
                     }
                     else {
                         return rootElement.querySelector(selector);
                     }
                 }
-                const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
+                const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
                 let currentElement = rootElement;
                 for (let i = 0; i < parts.length; i++) {
                     if (!currentElement)
                         return null;
                     // Handle iframe and frame traversal
-                    if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
+                    if (currentElement.tagName === "IFRAME" ||
+                        currentElement.tagName === "FRAME") {
                         try {
-                            const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
+                            const frameDoc = currentElement.contentDocument ||
+                                currentElement.contentWindow.document;
                             if (!frameDoc)
                                 return null;
-                            if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
+                            if (isXPathSelector(parts[i])) {
                                 currentElement = evaluateXPath(frameDoc, parts[i]);
                             }
                             else {
@@ -434,9 +441,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
                         }
                     }
                     let nextElement = null;
-                    if ('querySelector' in currentElement) {
-                        // Handle XPath vs CSS selector
-                        if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
+                    // Try regular DOM first
+                    if ("querySelector" in currentElement) {
+                        if (isXPathSelector(parts[i])) {
                             nextElement = evaluateXPath(currentElement, parts[i]);
                         }
                         else {
@@ -444,8 +451,10 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
                         }
                     }
                     // Try shadow DOM if not found
-                    if (!nextElement && 'shadowRoot' in currentElement && currentElement.shadowRoot) {
-                        if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
+                    if (!nextElement &&
+                        "shadowRoot" in currentElement &&
+                        currentElement.shadowRoot) {
+                        if (isXPathSelector(parts[i])) {
                             nextElement = evaluateXPath(currentElement.shadowRoot, parts[i]);
                         }
                         else {
@@ -453,11 +462,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
                         }
                     }
                     // Check children's shadow roots if still not found
-                    if (!nextElement && 'children' in currentElement) {
+                    if (!nextElement && "children" in currentElement) {
                         const children = Array.from(currentElement.children || []);
                         for (const child of children) {
                             if (child.shadowRoot) {
-                                if (parts[i].startsWith('//') || parts[i].startsWith('/') || parts[i].startsWith('./')) {
+                                if (isXPathSelector(parts[i])) {
                                     nextElement = evaluateXPath(child.shadowRoot, parts[i]);
                                 }
                                 else {
@@ -472,28 +481,27 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
                 }
                 return currentElement;
             };
-            // Enhanced query all function for XPath and CSS selectors
+            // Enhanced query all function for both contexts
             const queryElementAll = (rootElement, selector) => {
-                if (!selector.includes('>>') && !selector.includes(':>>')) {
-                    // Check if it's an XPath selector
-                    if (selector.startsWith('//') || selector.startsWith('/') || selector.startsWith('./')) {
+                if (!selector.includes(">>") && !selector.includes(":>>")) {
+                    if (isXPathSelector(selector)) {
                         return evaluateXPathAll(rootElement, selector);
                     }
                     else {
                         return Array.from(rootElement.querySelectorAll(selector));
                     }
                 }
-                const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
+                const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
                 let currentElements = [rootElement];
                 for (const part of parts) {
                     const nextElements = [];
                     for (const element of currentElements) {
                         // Handle iframe and frame traversal
-                        if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
+                        if (element.tagName === "IFRAME" || element.tagName === "FRAME") {
                             try {
                                 const frameDoc = element.contentDocument || element.contentWindow.document;
                                 if (frameDoc) {
-                                    if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
+                                    if (isXPathSelector(part)) {
                                         nextElements.push(...evaluateXPathAll(frameDoc, part));
                                     }
                                     else {
@@ -509,7 +517,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
                         else {
                             // Regular DOM elements
                             if (element.querySelectorAll) {
-                                if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
+                                if (isXPathSelector(part)) {
                                     nextElements.push(...evaluateXPathAll(element, part));
                                 }
                                 else {
@@ -518,7 +526,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
                             }
                             // Shadow DOM elements
                             if (element.shadowRoot) {
-                                if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
+                                if (isXPathSelector(part)) {
                                     nextElements.push(...evaluateXPathAll(element.shadowRoot, part));
                                 }
                                 else {
@@ -529,7 +537,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
                             const children = Array.from(element.children || []);
                             for (const child of children) {
                                 if (child.shadowRoot) {
-                                    if (part.startsWith('//') || part.startsWith('/') || part.startsWith('./')) {
+                                    if (isXPathSelector(part)) {
                                         nextElements.push(...evaluateXPathAll(child.shadowRoot, part));
                                     }
                                     else {
@@ -545,7 +553,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
             };
             // Enhanced value extraction with context awareness
             const extractValue = (element, attribute) => {
-                var _a, _b, _c, _d, _e, _f;
+                var _a, _b, _c, _d, _e;
                 if (!element)
                     return null;
                 // Get context-aware base URL
@@ -557,17 +565,17 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
                         return shadowContent.trim();
                     }
                 }
-                if (attribute === 'innerText') {
+                if (attribute === "innerText") {
                     // First try standard innerText/textContent
                     let textContent = ((_c = element.innerText) === null || _c === void 0 ? void 0 : _c.trim()) || ((_d = element.textContent) === null || _d === void 0 ? void 0 : _d.trim());
                     // If empty, check for common data attributes that might contain the text
                     if (!textContent) {
                         const dataAttributes = [
-                            'data-600',
-                            'data-text',
-                            'data-label',
-                            'data-value',
-                            'data-content',
+                            "data-600",
+                            "data-text",
+                            "data-label",
+                            "data-value",
+                            "data-content",
                         ];
                         for (const attr of dataAttributes) {
                             const dataValue = element.getAttribute(attr);
@@ -579,140 +587,437 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
                     }
                     return textContent || null;
                 }
-                else if (attribute === 'innerHTML') {
+                else if (attribute === "innerHTML") {
                     return ((_e = element.innerHTML) === null || _e === void 0 ? void 0 : _e.trim()) || null;
                 }
-                else if (attribute === 'href') {
-                    // For href, we need to find the anchor tag if the current element isn't one
-                    let anchorElement = element;
-                    // If current element is not an anchor, look for parent anchor
-                    if (element.tagName !== 'A') {
-                        anchorElement = element.closest('a') || ((_f = element.parentElement) === null || _f === void 0 ? void 0 : _f.closest('a')) || element;
+                else if (attribute === "src" || attribute === "href") {
+                    if (attribute === "href" && element.tagName !== "A") {
+                        const parentElement = element.parentElement;
+                        if (parentElement && parentElement.tagName === "A") {
+                            const parentHref = parentElement.getAttribute("href");
+                            if (parentHref) {
+                                try {
+                                    return new URL(parentHref, baseURL).href;
+                                }
+                                catch (e) {
+                                    return parentHref;
+                                }
+                            }
+                        }
                     }
-                    const hrefValue = anchorElement.getAttribute('href');
-                    if (!hrefValue || hrefValue.trim() === '') {
+                    const attrValue = element.getAttribute(attribute);
+                    const dataAttr = attrValue || element.getAttribute("data-" + attribute);
+                    if (!dataAttr || dataAttr.trim() === "") {
+                        if (attribute === "src") {
+                            const style = window.getComputedStyle(element);
+                            const bgImage = style.backgroundImage;
+                            if (bgImage && bgImage !== "none") {
+                                const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
+                                return matches ? new URL(matches[1], baseURL).href : null;
+                            }
+                        }
                         return null;
                     }
                     try {
-                        return new URL(hrefValue, baseURL).href;
+                        return new URL(dataAttr, baseURL).href;
                     }
                     catch (e) {
-                        console.warn('Error creating URL from', hrefValue, e);
-                        return hrefValue;
+                        console.warn("Error creating URL from", dataAttr, e);
+                        return dataAttr;
                     }
                 }
-                else if (attribute === 'src') {
-                    const attrValue = element.getAttribute(attribute);
-                    const dataAttr = attrValue || element.getAttribute('data-' + attribute);
-                    if (!dataAttr || dataAttr.trim() === '') {
-                        const style = window.getComputedStyle(element);
-                        const bgImage = style.backgroundImage;
-                        if (bgImage && bgImage !== 'none') {
-                            const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
-                            return matches ? new URL(matches[1], baseURL).href : null;
+                return element.getAttribute(attribute);
+            };
+            // Enhanced table ancestor finding with context support
+            const findTableAncestor = (element) => {
+                let currentElement = element;
+                const MAX_DEPTH = 5;
+                let depth = 0;
+                while (currentElement && depth < MAX_DEPTH) {
+                    // Handle shadow DOM
+                    if (currentElement.getRootNode() instanceof ShadowRoot) {
+                        currentElement = currentElement.getRootNode().host;
+                        continue;
+                    }
+                    if (currentElement.tagName === "TD") {
+                        return { type: "TD", element: currentElement };
+                    }
+                    else if (currentElement.tagName === "TR") {
+                        return { type: "TR", element: currentElement };
+                    }
+                    // Handle iframe and frame crossing
+                    if (currentElement.tagName === "IFRAME" ||
+                        currentElement.tagName === "FRAME") {
+                        try {
+                            currentElement = currentElement.contentDocument.body;
                         }
-                        return null;
+                        catch (e) {
+                            return null;
+                        }
+                    }
+                    else {
+                        currentElement = currentElement.parentElement;
+                    }
+                    depth++;
+                }
+                return null;
+            };
+            // Helper function to get cell index
+            const getCellIndex = (td) => {
+                if (td.getRootNode() instanceof ShadowRoot) {
+                    const shadowRoot = td.getRootNode();
+                    const allCells = Array.from(shadowRoot.querySelectorAll("td"));
+                    return allCells.indexOf(td);
+                }
+                let index = 0;
+                let sibling = td;
+                while ((sibling = sibling.previousElementSibling)) {
+                    index++;
+                }
+                return index;
+            };
+            // Helper function to check for TH elements
+            const hasThElement = (row, tableFields) => {
+                for (const [_, { selector }] of Object.entries(tableFields)) {
+                    const element = queryElement(row, selector);
+                    if (element) {
+                        let current = element;
+                        while (current && current !== row) {
+                            if (current.getRootNode() instanceof ShadowRoot) {
+                                current = current.getRootNode().host;
+                                continue;
+                            }
+                            if (current.tagName === "TH")
+                                return true;
+                            if (current.tagName === "IFRAME" || current.tagName === "FRAME") {
+                                try {
+                                    current = current.contentDocument.body;
+                                }
+                                catch (e) {
+                                    break;
+                                }
+                            }
+                            else {
+                                current = current.parentElement;
+                            }
+                        }
+                    }
+                }
+                return false;
+            };
+            // Helper function to filter rows
+            const filterRowsBasedOnTag = (rows, tableFields) => {
+                for (const row of rows) {
+                    if (hasThElement(row, tableFields)) {
+                        return rows;
                     }
+                }
+                return rows.filter((row) => {
+                    const directTH = row.getElementsByTagName("TH").length === 0;
+                    const shadowTH = row.shadowRoot
+                        ? row.shadowRoot.querySelector("th") === null
+                        : true;
+                    return directTH && shadowTH;
+                });
+            };
+            // Class similarity comparison functions
+            const calculateClassSimilarity = (classList1, classList2) => {
+                const set1 = new Set(classList1);
+                const set2 = new Set(classList2);
+                const intersection = new Set([...set1].filter((x) => set2.has(x)));
+                const union = new Set([...set1, ...set2]);
+                return intersection.size / union.size;
+            };
+            // Enhanced similar elements finding with context support
+            const findSimilarElements = (baseElement, similarityThreshold = 0.7) => {
+                const baseClasses = Array.from(baseElement.classList);
+                if (baseClasses.length === 0)
+                    return [];
+                const allElements = [];
+                // Get elements from main document
+                allElements.push(...document.getElementsByTagName(baseElement.tagName));
+                // Get elements from shadow DOM
+                if (baseElement.getRootNode() instanceof ShadowRoot) {
+                    const shadowHost = baseElement.getRootNode().host;
+                    allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
+                }
+                // Get elements from iframes and frames
+                const frames = [
+                    ...Array.from(document.getElementsByTagName("iframe")),
+                    ...Array.from(document.getElementsByTagName("frame")),
+                ];
+                for (const frame of frames) {
                     try {
-                        return new URL(dataAttr, baseURL).href;
+                        const frameDoc = frame.contentDocument || frame.contentWindow.document;
+                        allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
                     }
                     catch (e) {
-                        console.warn('Error creating URL from', dataAttr, e);
-                        return dataAttr;
+                        console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
                     }
                 }
-                return element.getAttribute(attribute);
+                return allElements.filter((element) => {
+                    if (element === baseElement)
+                        return false;
+                    const similarity = calculateClassSimilarity(baseClasses, Array.from(element.classList));
+                    return similarity >= similarityThreshold;
+                });
+            };
+            const tryFallbackSelector = (rootElement, originalSelector) => {
+                let element = queryElement(rootElement, originalSelector);
+                if (!element && originalSelector.includes("nth-child")) {
+                    const match = originalSelector.match(/nth-child\((\d+)\)/);
+                    if (match) {
+                        const position = parseInt(match[1], 10);
+                        for (let i = position - 1; i >= 1; i--) {
+                            const fallbackSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${i})`);
+                            element = queryElement(rootElement, fallbackSelector);
+                            if (element)
+                                break;
+                        }
+                        if (!element) {
+                            const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/, "");
+                            element = queryElement(rootElement, baseSelector);
+                        }
+                    }
+                }
+                return element;
             };
             // Create indexed XPath for specific container instance
             const createIndexedXPath = (childSelector, listSelector, containerIndex) => {
-                console.log(`Creating indexed XPath for container ${containerIndex}`);
-                console.log(`Child selector: ${childSelector}`);
-                console.log(`List selector: ${listSelector}`);
                 // Check if the child selector contains the list selector pattern
-                if (childSelector.includes(listSelector.replace('//', ''))) {
+                if (childSelector.includes(listSelector.replace("//", ""))) {
                     // Replace the list selector part with indexed version
-                    const listPattern = listSelector.replace('//', '');
+                    const listPattern = listSelector.replace("//", "");
                     const indexedListSelector = `(${listSelector})[${containerIndex}]`;
                     const indexedSelector = childSelector.replace(`//${listPattern}`, indexedListSelector);
-                    console.log(`Generated indexed selector: ${indexedSelector}`);
                     return indexedSelector;
                 }
                 else {
                     // If pattern doesn't match, create a more generic indexed selector
-                    console.warn(`Pattern doesn't match, using fallback approach`);
-                    return `(${listSelector})[${containerIndex}]${childSelector.replace('//', '/')}`;
+                    return `(${listSelector})[${containerIndex}]${childSelector.replace("//", "/")}`;
                 }
             };
-            // Main scraping logic
-            console.log('🚀 Starting list data extraction');
-            console.log('List Selector:', listSelector);
-            console.log('Fields:', fields);
-            // Step 1: Get all container elements matching the list selector
-            const containers = queryElementAll(document, listSelector);
-            console.log(`📦 Found ${containers.length} list containers`);
+            // Main scraping logic with unified support for both CSS and XPath
+            console.log("🚀 Starting unified list data extraction");
+            console.log("List Selector:", listSelector);
+            console.log("Fields:", fields);
+            let containers = queryElementAll(document, listSelector);
+            containers = Array.from(containers);
             if (containers.length === 0) {
-                console.warn('❌ No containers found for listSelector:', listSelector);
+                console.warn("❌ No containers found for listSelector:", listSelector);
                 return [];
             }
-            // Step 2: Extract data from each container up to the limit
-            const extractedData = [];
-            const containersToProcess = Math.min(containers.length, limit);
-            console.log(`🔄 Processing ${containersToProcess} containers...`);
-            for (let containerIndex = 0; containerIndex < containersToProcess; containerIndex++) {
-                const container = containers[containerIndex];
-                const record = {};
-                console.log(`\n📋 Processing container ${containerIndex + 1}/${containersToProcess}`);
-                // Step 3: For each field, extract data from the current container
-                for (const [label, field] of Object.entries(fields)) {
-                    console.log(`\n  🔍 Extracting field "${label}"`);
-                    console.log(`    Original selector: ${field.selector}`);
-                    console.log(`    Attribute: ${field.attribute}`);
-                    let element = null;
-                    // Handle XPath selectors with container indexing
-                    if (field.selector.startsWith('//')) {
-                        // Create indexed absolute XPath
-                        const indexedSelector = createIndexedXPath(field.selector, listSelector, containerIndex + 1);
-                        console.log(`    📍 Indexed selector: ${indexedSelector}`);
-                        element = evaluateXPath(document, indexedSelector);
-                        console.log(`    📍 Indexed XPath result: ${element ? 'FOUND' : 'NOT FOUND'}`);
+            console.log(`📦 Found ${containers.length} list containers`);
+            // For CSS selectors, try to find similar containers if needed
+            if (!isXPathSelector(listSelector) &&
+                limit > 1 &&
+                containers.length === 1) {
+                const baseContainer = containers[0];
+                const similarContainers = findSimilarElements(baseContainer);
+                if (similarContainers.length > 0) {
+                    const newContainers = similarContainers.filter((container) => !container.matches(listSelector));
+                    containers = [...containers, ...newContainers];
+                }
+            }
+            const containerFields = containers.map(() => ({
+                tableFields: {},
+                nonTableFields: {},
+            }));
+            // For XPath selectors, use the new approach
+            if (isXPathSelector(listSelector)) {
+                const extractedData = [];
+                const containersToProcess = Math.min(containers.length, limit);
+                for (let containerIndex = 0; containerIndex < containersToProcess; containerIndex++) {
+                    const record = {};
+                    for (const [label, field] of Object.entries(fields)) {
+                        let element = null;
+                        if (isXPathSelector(field.selector)) {
+                            // Create indexed absolute XPath
+                            const indexedSelector = createIndexedXPath(field.selector, listSelector, containerIndex + 1);
+                            element = evaluateXPath(document, indexedSelector);
+                        }
+                        else {
+                            // Fallback for CSS selectors within XPath containers
+                            const container = containers[containerIndex];
+                            element = queryElement(container, field.selector);
+                        }
                         if (element) {
-                            console.log(`    📍 Found element text: "${(_b = element.textContent) === null || _b === void 0 ? void 0 : _b.trim()}"`);
+                            const value = extractValue(element, field.attribute);
+                            if (value !== null && value !== "") {
+                                record[label] = value;
+                            }
+                            else {
+                                record[label] = "";
+                            }
+                        }
+                        else {
+                            record[label] = "";
                         }
                     }
-                    else {
-                        // Fallback for non-XPath selectors - search within container
-                        element = queryElement(container, field.selector);
+                    if (Object.values(record).some((value) => value !== "")) {
+                        extractedData.push(record);
                     }
-                    // Step 4: Extract the value from the found element
-                    if (element) {
-                        const value = extractValue(element, field.attribute);
-                        if (value !== null && value !== '') {
-                            record[label] = value;
-                            console.log(`    ✅ Extracted "${label}": "${value}"`);
+                }
+                console.log(`📊 Total records extracted: ${extractedData.length}`);
+                return extractedData;
+            }
+            // For CSS selectors, use the original table-aware approach
+            containers.forEach((container, containerIndex) => {
+                for (const [label, field] of Object.entries(fields)) {
+                    const sampleElement = queryElement(container, field.selector);
+                    if (sampleElement) {
+                        const ancestor = findTableAncestor(sampleElement);
+                        if (ancestor) {
+                            containerFields[containerIndex].tableFields[label] = Object.assign(Object.assign({}, field), { tableContext: ancestor.type, cellIndex: ancestor.type === "TD" ? getCellIndex(ancestor.element) : -1 });
                         }
                         else {
-                            console.warn(`    ⚠️ Empty value for "${label}"`);
-                            record[label] = '';
+                            containerFields[containerIndex].nonTableFields[label] = field;
                         }
                     }
                     else {
-                        console.warn(`    ❌ Element not found for "${label}"`);
-                        record[label] = '';
+                        containerFields[containerIndex].nonTableFields[label] = field;
                     }
                 }
-                // Step 5: Add record if it has any non-empty values
-                if (Object.values(record).some(value => value !== '')) {
-                    extractedData.push(record);
-                    console.log(`  ✅ Added record ${containerIndex + 1}:`, record);
+            });
+            const tableData = [];
+            const nonTableData = [];
+            // Process table data with support for iframes, frames, and shadow DOM
+            for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
+                const container = containers[containerIndex];
+                const { tableFields } = containerFields[containerIndex];
+                if (Object.keys(tableFields).length > 0) {
+                    const firstField = Object.values(tableFields)[0];
+                    const firstElement = queryElement(container, firstField.selector);
+                    let tableContext = firstElement;
+                    // Find table context including iframe, frame and shadow DOM
+                    while (tableContext &&
+                        tableContext.tagName !== "TABLE" &&
+                        tableContext !== container) {
+                        if (tableContext.getRootNode() instanceof ShadowRoot) {
+                            tableContext = tableContext.getRootNode().host;
+                            continue;
+                        }
+                        if (tableContext.tagName === "IFRAME" ||
+                            tableContext.tagName === "FRAME") {
+                            try {
+                                tableContext = tableContext.contentDocument.body;
+                            }
+                            catch (e) {
+                                break;
+                            }
+                        }
+                        else {
+                            tableContext = tableContext.parentElement;
+                        }
+                    }
+                    if (tableContext) {
+                        // Get rows from all contexts
+                        const rows = [];
+                        // Get rows from regular DOM
+                        rows.push(...tableContext.getElementsByTagName("TR"));
+                        // Get rows from shadow DOM
+                        if (tableContext.shadowRoot) {
+                            rows.push(...tableContext.shadowRoot.getElementsByTagName("TR"));
+                        }
+                        // Get rows from iframes and frames
+                        if (tableContext.tagName === "IFRAME" ||
+                            tableContext.tagName === "FRAME") {
+                            try {
+                                const frameDoc = tableContext.contentDocument ||
+                                    tableContext.contentWindow.document;
+                                rows.push(...frameDoc.getElementsByTagName("TR"));
+                            }
+                            catch (e) {
+                                console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
+                            }
+                        }
+                        const processedRows = filterRowsBasedOnTag(rows, tableFields);
+                        for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
+                            const record = {};
+                            const currentRow = processedRows[rowIndex];
+                            for (const [label, { selector, attribute, cellIndex },] of Object.entries(tableFields)) {
+                                let element = null;
+                                if (cellIndex >= 0) {
+                                    // Get TD element considering both contexts
+                                    let td = currentRow.children[cellIndex];
+                                    // Check shadow DOM for td
+                                    if (!td && currentRow.shadowRoot) {
+                                        const shadowCells = currentRow.shadowRoot.children;
+                                        if (shadowCells && shadowCells.length > cellIndex) {
+                                            td = shadowCells[cellIndex];
+                                        }
+                                    }
+                                    if (td) {
+                                        element = queryElement(td, selector);
+                                        if (!element &&
+                                            selector
+                                                .split(/(?:>>|:>>)/)
+                                                .pop()
+                                                .includes("td:nth-child")) {
+                                            element = td;
+                                        }
+                                        if (!element) {
+                                            const tagOnlySelector = selector.split(".")[0];
+                                            element = queryElement(td, tagOnlySelector);
+                                        }
+                                        if (!element) {
+                                            let currentElement = td;
+                                            while (currentElement &&
+                                                currentElement.children.length > 0) {
+                                                let foundContentChild = false;
+                                                for (const child of currentElement.children) {
+                                                    if (extractValue(child, attribute)) {
+                                                        currentElement = child;
+                                                        foundContentChild = true;
+                                                        break;
+                                                    }
+                                                }
+                                                if (!foundContentChild)
+                                                    break;
+                                            }
+                                            element = currentElement;
+                                        }
+                                    }
+                                }
+                                else {
+                                    element = queryElement(currentRow, selector);
+                                }
+                                if (element) {
+                                    record[label] = extractValue(element, attribute);
+                                }
+                            }
+                            if (Object.keys(record).length > 0) {
+                                tableData.push(record);
+                            }
+                        }
+                    }
                 }
-                else {
-                    console.warn(`  ⚠️ Skipping empty record for container ${containerIndex + 1}`);
+            }
+            // Process non-table data with all contexts support
+            for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
+                if (nonTableData.length >= limit)
+                    break;
+                const container = containers[containerIndex];
+                const { nonTableFields } = containerFields[containerIndex];
+                if (Object.keys(nonTableFields).length > 0) {
+                    const record = {};
+                    for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
+                        // Get the last part of the selector after any context delimiter
+                        const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
+                        const element = tryFallbackSelector(container, relativeSelector);
+                        if (element) {
+                            record[label] = extractValue(element, attribute);
+                        }
+                    }
+                    if (Object.keys(record).length > 0) {
+                        nonTableData.push(record);
+                    }
                 }
             }
-            console.log('\n🎉 Extraction complete!');
-            console.log(`📊 Total records extracted: ${extractedData.length}`);
-            console.log('📋 All records:', extractedData);
-            return extractedData;
+            // Merge table and non-table results (note: no overall limit is applied to the combined list)
+            const scrapedData = [...tableData, ...nonTableData];
+            console.log(`📊 Total records extracted: ${scrapedData.length}`);
+            return scrapedData;
         });
     };
     /**

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "mx-cloud",
-  "version": "0.0.11",
+  "version": "0.0.12",
   "description": "mx cloud",
   "main": "build/index.js",
   "typings": "build/index.d.ts",