npm - mx-cloud - Versions diffs - 0.0.14 → 0.0.15 - Mend

mx-cloud 0.0.14 → 0.0.15

This diff compares the contents of two publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (2)

package/build/browserSide/scraper.js +72 -2
package/package.json +1 -1

package/build/browserSide/scraper.js CHANGED

@@ -287,8 +287,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
                     return (_e = element.textContent) === null || _e === void 0 ? void 0 : _e.trim();
                 case 'innerHTML':
                     return element.innerHTML;
-                case 'outerHTML':
-                    return element.outerHTML;
+                case 'outerHTML': {
+                    const clonedElement = element.cloneNode(true);
+                    const elementsWithMxId = clonedElement.querySelectorAll('[data-mx-id]');
+                    elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
+                    if (clonedElement.hasAttribute && clonedElement.hasAttribute('data-mx-id')) {
+                        clonedElement.removeAttribute('data-mx-id');
+                    }
+                    return clonedElement.outerHTML;
+                }
                 default:
                     return element.getAttribute(attribute) || ((_f = element.innerText) === null || _f === void 0 ? void 0 : _f.trim());
             }
@@ -359,6 +366,69 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
    */
     window.scrapeList = function (_a) {
         return __awaiter(this, arguments, void 0, function* ({ listSelector, fields, limit = 10 }) {
+            const isSitemapUrl = () => {
+                const url = window.location.href.toLowerCase();
+                return url.includes('sitemap') && url.includes('.xml');
+            };
+            const scrapeSitemapData = () => {
+                // Try to get the XML content from the page
+                let xmlContent = null;
+                // Method 1: Check if the page is already parsed as XML
+                if (document.documentElement.tagName.toLowerCase() === 'urlset') {
+                    xmlContent = document;
+                }
+                // Method 2: Try to get raw XML from pre tags (common browser display)
+                if (!xmlContent) {
+                    const preElement = document.querySelector('pre');
+                    if (preElement) {
+                        try {
+                            const parser = new DOMParser();
+                            xmlContent = parser.parseFromString(preElement.textContent, 'text/xml');
+                        }
+                        catch (e) {
+                            console.warn('Failed to parse XML from pre element:', e);
+                        }
+                    }
+                }
+                // Method 3: Try to parse the entire document as XML
+                if (!xmlContent) {
+                    try {
+                        const parser = new DOMParser();
+                        xmlContent = parser.parseFromString(document.documentElement.outerHTML, 'text/xml');
+                    }
+                    catch (e) {
+                        console.warn('Failed to parse document as XML:', e);
+                    }
+                }
+                if (!xmlContent) {
+                    console.error('Could not parse sitemap XML');
+                    return [];
+                }
+                // Extract URL entries from the sitemap
+                const urlElements = xmlContent.querySelectorAll('url');
+                const sitemapData = [];
+                urlElements.forEach((urlElement, index) => {
+                    if (limit && index >= limit)
+                        return;
+                    const locElement = urlElement.querySelector('loc');
+                    const lastmodElement = urlElement.querySelector('lastmod');
+                    const entry = {};
+                    if (locElement) {
+                        entry.loc = locElement.textContent.trim();
+                    }
+                    if (lastmodElement) {
+                        entry.lastmod = lastmodElement.textContent.trim();
+                    }
+                    // Only add entries that have at least a loc field
+                    if (entry.loc) {
+                        sitemapData.push(entry);
+                    }
+                });
+                return sitemapData;
+            };
+            if (isSitemapUrl()) {
+                return scrapeSitemapData();
+            }
             // XPath evaluation functions
             const queryInsideContext = (context, part) => {
                 try {

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "mx-cloud",
-  "version": "0.0.14",
+  "version": "0.0.15",
   "description": "mx cloud",
   "main": "build/index.js",
   "typings": "build/index.d.ts",