mx-cloud 0.0.26 → 0.0.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.d.ts +1 -1
- package/build/interpret.js +306 -155
- package/package.json +1 -1
package/build/interpret.d.ts
CHANGED
@@ -38,7 +38,7 @@ interface InterpreterOptions {
     serializableCallback: (output: any) => (void | Promise<void>);
     binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
     debug: boolean;
-    robotType?: 'extract' | 'scrape' | 'deep-extract';
+    robotType?: 'extract' | 'scrape' | 'crawl' | 'search' | 'deep-extract';
     debugChannel: Partial<{
         activeId: (id: number) => void;
         debugMessage: (msg: string) => void;
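The only declaration change is the widened robotType union, which registers the two operation modes added in this release range ('crawl' and 'search'). A hypothetical caller-side sketch, assuming options are passed as a plain object (only the union members themselves are taken from the package):

    // Sketch only: the union members come from interpret.d.ts; the
    // surrounding shape is assumed for illustration.
    type RobotType = 'extract' | 'scrape' | 'crawl' | 'search' | 'deep-extract';

    const options: { robotType?: RobotType; debug: boolean } = {
        robotType: 'crawl',   // newly accepted in this release range
        debug: false,
    };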
package/build/interpret.js
CHANGED
@@ -678,10 +678,8 @@ class Interpreter extends events_1.EventEmitter {
         }
         this.log('Starting crawl operation', logger_1.Level.LOG);
         try {
-            // Get current page URL and log it
             const currentUrl = page.url();
             this.log(`Current page URL: ${currentUrl}`, logger_1.Level.LOG);
-            // If page is on about:blank or empty, we need to wait for navigation
             if (!currentUrl || currentUrl === 'about:blank' || currentUrl === '') {
                 this.log('Page not yet navigated, waiting for navigation...', logger_1.Level.WARN);
                 yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
@@ -690,13 +688,260 @@ class Interpreter extends events_1.EventEmitter {
             this.log(`Using base URL for crawl: ${baseUrl}`, logger_1.Level.LOG);
             const parsedBase = new URL(baseUrl);
             const baseDomain = parsedBase.hostname;
-            let
-
+            let robotRules = {
+                disallowedPaths: [],
+                allowedPaths: [],
+                crawlDelay: null
+            };
+            if (crawlConfig.respectRobots) {
+                this.log('Fetching robots.txt...', logger_1.Level.LOG);
+                try {
+                    const robotsUrl = `${parsedBase.protocol}//${parsedBase.host}/robots.txt`;
+                    const robotsContent = yield page.evaluate((url) => {
+                        return new Promise((resolve) => {
+                            const xhr = new XMLHttpRequest();
+                            xhr.open('GET', url, true);
+                            xhr.onload = function () {
+                                if (xhr.status === 200) {
+                                    resolve(xhr.responseText);
+                                }
+                                else {
+                                    resolve('');
+                                }
+                            };
+                            xhr.onerror = function () {
+                                resolve('');
+                            };
+                            xhr.send();
+                        });
+                    }, robotsUrl);
+                    if (robotsContent) {
+                        const lines = robotsContent.split('\n');
+                        let isRelevantUserAgent = false;
+                        let foundSpecificUserAgent = false;
+                        for (const line of lines) {
+                            const trimmedLine = line.trim().toLowerCase();
+                            if (trimmedLine.startsWith('#') || trimmedLine === '') {
+                                continue;
+                            }
+                            const colonIndex = line.indexOf(':');
+                            if (colonIndex === -1)
+                                continue;
+                            const directive = line.substring(0, colonIndex).trim().toLowerCase();
+                            const value = line.substring(colonIndex + 1).trim();
+                            if (directive === 'user-agent') {
+                                const agent = value.toLowerCase();
+                                if (agent === '*' && !foundSpecificUserAgent) {
+                                    isRelevantUserAgent = true;
+                                }
+                                else if (agent.includes('bot') || agent.includes('crawler') || agent.includes('spider')) {
+                                    isRelevantUserAgent = true;
+                                    foundSpecificUserAgent = true;
+                                }
+                                else {
+                                    if (!foundSpecificUserAgent) {
+                                        isRelevantUserAgent = false;
+                                    }
+                                }
+                            }
+                            else if (isRelevantUserAgent) {
+                                if (directive === 'disallow' && value) {
+                                    robotRules.disallowedPaths.push(value);
+                                }
+                                else if (directive === 'allow' && value) {
+                                    robotRules.allowedPaths.push(value);
+                                }
+                                else if (directive === 'crawl-delay' && value) {
+                                    const delay = parseFloat(value);
+                                    if (!isNaN(delay) && delay > 0) {
+                                        robotRules.crawlDelay = delay * 1000;
+                                    }
+                                }
+                            }
+                        }
+                        this.log(`Robots.txt parsed: ${robotRules.disallowedPaths.length} disallowed paths, ${robotRules.allowedPaths.length} allowed paths, crawl-delay: ${robotRules.crawlDelay || 'none'}`, logger_1.Level.LOG);
+                    }
+                    else {
+                        this.log('No robots.txt found or not accessible, proceeding without restrictions', logger_1.Level.WARN);
+                    }
+                }
+                catch (error) {
+                    this.log(`Failed to fetch robots.txt: ${error.message}, proceeding without restrictions`, logger_1.Level.WARN);
+                }
+            }
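To make the user-agent grouping concrete: the parser merges rules from the generic `*` group with rules from any group whose agent name contains 'bot', 'crawler', or 'spider', and converts crawl-delay from seconds to milliseconds. For a hypothetical robots.txt such as:

    User-agent: *
    Disallow: /tmp/

    User-agent: somebot
    Disallow: /private/
    Crawl-delay: 2

the loop above leaves robotRules as { disallowedPaths: ['/tmp/', '/private/'], allowedPaths: [], crawlDelay: 2000 }. The hunk continues with the URL filter helpers built on these rules.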
+            const isUrlAllowedByRobots = (url) => {
+                if (!crawlConfig.respectRobots)
+                    return true;
+                try {
+                    const urlObj = new URL(url);
+                    const pathname = urlObj.pathname;
+                    for (const allowedPath of robotRules.allowedPaths) {
+                        if (allowedPath === pathname || pathname.startsWith(allowedPath)) {
+                            return true;
+                        }
+                        if (allowedPath.includes('*')) {
+                            const regex = new RegExp('^' + allowedPath.replace(/\*/g, '.*').replace(/\?/g, '.') + '$');
+                            if (regex.test(pathname)) {
+                                return true;
+                            }
+                        }
+                    }
+                    for (const disallowedPath of robotRules.disallowedPaths) {
+                        if (disallowedPath === '/') {
+                            return false;
+                        }
+                        if (pathname.startsWith(disallowedPath)) {
+                            return false;
+                        }
+                        if (disallowedPath.includes('*')) {
+                            const regex = new RegExp('^' + disallowedPath.replace(/\*/g, '.*').replace(/\?/g, '.') + '$');
+                            if (regex.test(pathname)) {
+                                return false;
+                            }
+                        }
+                        if (disallowedPath.endsWith('$')) {
+                            const pattern = disallowedPath.slice(0, -1);
+                            if (pathname === pattern || pathname.endsWith(pattern)) {
+                                return false;
+                            }
+                        }
+                    }
+                    return true;
+                }
+                catch (error) {
+                    return true;
+                }
+            };
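Wildcard rules are handled by converting the robots.txt glob into a regular expression, mapping `*` to `.*` and `?` to `.`; note that literal dots in a pattern are left unescaped, so they also match any character. A standalone sketch of that conversion, verifiable in isolation:

    // Standalone sketch of the wildcard conversion used by isUrlAllowedByRobots.
    const globToRegex = (pattern) =>
        new RegExp('^' + pattern.replace(/\*/g, '.*').replace(/\?/g, '.') + '$');

    globToRegex('/search/*').test('/search/results'); // true
    globToRegex('/search/*').test('/about');          // false
    globToRegex('/*.pdf').test('/docs/manual.pdf');   // true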
+            const isUrlAllowedByConfig = (url) => {
+                try {
+                    const urlObj = new URL(url);
+                    if (crawlConfig.mode === 'domain') {
+                        if (urlObj.hostname !== baseDomain)
+                            return false;
+                    }
+                    else if (crawlConfig.mode === 'subdomain') {
+                        if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain)
+                            return false;
+                    }
+                    else if (crawlConfig.mode === 'path') {
+                        if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname))
+                            return false;
+                    }
+                    if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
+                        const matches = crawlConfig.includePaths.some(pattern => {
+                            try {
+                                const regex = new RegExp(pattern);
+                                return regex.test(url);
+                            }
+                            catch (_a) {
+                                return url.includes(pattern);
+                            }
+                        });
+                        if (!matches)
+                            return false;
+                    }
+                    if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
+                        const matches = crawlConfig.excludePaths.some(pattern => {
+                            try {
+                                const regex = new RegExp(pattern);
+                                return regex.test(url);
+                            }
+                            catch (_a) {
+                                return url.includes(pattern);
+                            }
+                        });
+                        if (matches)
+                            return false;
+                    }
+                    return true;
+                }
+                catch (error) {
+                    return false;
+                }
+            };
+            const normalizeUrl = (url) => {
+                return url.replace(/#.*$/, '').replace(/\/$/, '');
+            };
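normalizeUrl is what keeps the visited-set deduplication stable: it strips the URL fragment and a single trailing slash, so common variants of the same page collapse to one key:

    // Behaviour of the normalizeUrl helper above:
    normalizeUrl('https://example.com/docs/#intro'); // 'https://example.com/docs'
    normalizeUrl('https://example.com/docs/');       // 'https://example.com/docs'
    normalizeUrl('https://example.com/docs');        // 'https://example.com/docs'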
+            const extractLinksFromPage = () => __awaiter(this, void 0, void 0, function* () {
+                try {
+                    yield page.waitForLoadState('load', { timeout: 15000 }).catch(() => { });
+                    yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => { });
+                    yield new Promise(resolve => setTimeout(resolve, 1000));
+                    const pageLinks = yield page.evaluate(() => {
+                        const links = [];
+                        const allAnchors = document.querySelectorAll('a');
+                        for (let i = 0; i < allAnchors.length; i++) {
+                            const anchor = allAnchors[i];
+                            const fullHref = anchor.href;
+                            if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
+                                links.push(fullHref);
+                            }
+                        }
+                        return links;
+                    });
+                    return pageLinks;
+                }
+                catch (error) {
+                    this.log(`Link extraction failed: ${error.message}`, logger_1.Level.WARN);
+                    return [];
+                }
+            });
+            const scrapePageContent = (url) => __awaiter(this, void 0, void 0, function* () {
+                const pageData = yield page.evaluate(() => {
+                    var _a, _b;
+                    const getMeta = (name) => {
+                        const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
+                        return (meta === null || meta === void 0 ? void 0 : meta.getAttribute('content')) || '';
+                    };
+                    const getAllMeta = () => {
+                        const metadata = {};
+                        const metaTags = document.querySelectorAll('meta');
+                        metaTags.forEach(tag => {
+                            const name = tag.getAttribute('name') || tag.getAttribute('property');
+                            const content = tag.getAttribute('content');
+                            if (name && content) {
+                                metadata[name] = content;
+                            }
+                        });
+                        return metadata;
+                    };
+                    const title = document.title || '';
+                    const bodyText = ((_a = document.body) === null || _a === void 0 ? void 0 : _a.innerText) || '';
+                    const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
+                    elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
+                    const html = document.documentElement.outerHTML;
+                    const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
+                    const allMetadata = getAllMeta();
+                    return {
+                        title,
+                        description: getMeta('description'),
+                        text: bodyText,
+                        html: html,
+                        links: links,
+                        wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
+                        metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
+                    };
+                });
+                return {
+                    metadata: Object.assign(Object.assign({}, pageData.metadata), { url: url, sourceURL: url }),
+                    html: pageData.html,
+                    text: pageData.text,
+                    links: pageData.links,
+                    wordCount: pageData.wordCount,
+                    scrapedAt: new Date().toISOString()
+                };
+            });
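Each crawled page therefore yields a record of roughly the following shape, reconstructed here from the return value above and the error branch later in this hunk (the package does not export such a type):

    // Inferred result shape; not part of the package's public API.
    interface CrawlPageResult {
        metadata: Record<string, any> & {
            url: string;
            sourceURL: string;
            depth?: number;      // stamped by the crawl loop
            statusCode?: number; // hard-coded to 200 in scrapePageContent
        };
        html?: string;
        text?: string;
        links?: string[];
        wordCount?: number;
        error?: string;          // present only on failed pages
        scrapedAt: string;       // ISO timestamp
    }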
+            const visitedUrls = new Set();
+            const crawlResults = [];
+            const crawlQueue = [];
+            const normalizedBaseUrl = normalizeUrl(baseUrl);
+            visitedUrls.add(normalizedBaseUrl);
+            crawlQueue.push({ url: baseUrl, depth: 0 });
+            this.log(`Starting breadth-first crawl with maxDepth: ${crawlConfig.maxDepth}, limit: ${crawlConfig.limit}`, logger_1.Level.LOG);
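This is a textbook breadth-first frontier: a FIFO queue of { url, depth } pairs plus a Set of normalized URLs, marked visited at enqueue time so a page can never be queued twice. Distilled to a self-contained skeleton (a sketch of the pattern, not the package's code; fetchLinks stands in for the real navigation and link extraction):

    // Minimal BFS crawl skeleton mirroring the queue discipline above.
    async function bfsCrawl(start, maxDepth, limit, fetchLinks) {
        const normalize = (u) => u.replace(/#.*$/, '').replace(/\/$/, '');
        const visited = new Set([normalize(start)]);
        const queue = [{ url: start, depth: 0 }];
        const results = [];
        while (queue.length > 0 && results.length < limit) {
            const { url, depth } = queue.shift();
            results.push(url);                // stand-in for scraping the page
            if (depth >= maxDepth) continue;  // do not expand past maxDepth
            for (const link of await fetchLinks(url)) {
                const key = normalize(link);
                if (!visited.has(key)) {
                    visited.add(key);         // mark at enqueue time
                    queue.push({ url: link, depth: depth + 1 });
                }
            }
        }
        return results;
    }

The hunk resumes below with the sitemap seeding that pre-populates this queue at depth 1.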
             if (crawlConfig.useSitemap) {
                 this.log('Fetching sitemap URLs...', logger_1.Level.LOG);
                 try {
                     const sitemapUrl = `${parsedBase.protocol}//${parsedBase.host}/sitemap.xml`;
-                    // Use XMLHttpRequest instead of fetch to avoid polyfills
                     const sitemapUrls = yield page.evaluate((url) => {
                         return new Promise((resolve) => {
                             const xhr = new XMLHttpRequest();
@@ -721,7 +966,13 @@ class Interpreter extends events_1.EventEmitter {
                     if (sitemapUrls.length > 0) {
                         const nestedSitemaps = sitemapUrls.filter(url => url.endsWith('/sitemap') || url.endsWith('sitemap.xml') || url.includes('/sitemap/'));
                         const regularUrls = sitemapUrls.filter(url => !url.endsWith('/sitemap') && !url.endsWith('sitemap.xml') && !url.includes('/sitemap/'));
-
+                        for (const sitemapPageUrl of regularUrls) {
+                            const normalized = normalizeUrl(sitemapPageUrl);
+                            if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(sitemapPageUrl) && isUrlAllowedByRobots(sitemapPageUrl)) {
+                                visitedUrls.add(normalized);
+                                crawlQueue.push({ url: sitemapPageUrl, depth: 1 });
+                            }
+                        }
                         this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, logger_1.Level.LOG);
                         for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
                             try {
@@ -747,16 +998,20 @@ class Interpreter extends events_1.EventEmitter {
                                         xhr.send();
                                     });
                                 }, nestedUrl);
-
-
-
+                                for (const nestedPageUrl of nestedUrls) {
+                                    const normalized = normalizeUrl(nestedPageUrl);
+                                    if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(nestedPageUrl) && isUrlAllowedByRobots(nestedPageUrl)) {
+                                        visitedUrls.add(normalized);
+                                        crawlQueue.push({ url: nestedPageUrl, depth: 1 });
+                                    }
                                 }
+                                this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, logger_1.Level.LOG);
                             }
                             catch (error) {
                                 this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${error.message}`, logger_1.Level.WARN);
                             }
                         }
-                        this.log(`Total URLs from
+                        this.log(`Total URLs queued from sitemaps: ${crawlQueue.length - 1}`, logger_1.Level.LOG);
                     }
                     else {
                         this.log('No URLs found in sitemap or sitemap not available', logger_1.Level.WARN);
@@ -766,167 +1021,63 @@ class Interpreter extends events_1.EventEmitter {
                     this.log(`Sitemap fetch failed: ${error.message}`, logger_1.Level.WARN);
                 }
             }
-
-
-
-
-
-                this.log('Network did not become idle, continuing anyway', logger_1.Level.WARN);
-            });
-            yield new Promise(resolve => setTimeout(resolve, 5000));
-            const anchorCount = yield page.evaluate(() => {
-                return document.querySelectorAll('a').length;
-            });
-            this.log(`Page has ${anchorCount} total anchor tags`, logger_1.Level.LOG);
-            const pageLinks = yield page.evaluate(() => {
-                const links = [];
-                const allAnchors = document.querySelectorAll('a');
-                console.log('Total anchors found:', allAnchors.length);
-                for (let i = 0; i < allAnchors.length; i++) {
-                    const anchor = allAnchors[i];
-                    const href = anchor.getAttribute('href');
-                    const fullHref = anchor.href;
-                    if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
-                        links.push(fullHref);
-                    }
-                }
-                console.log('Links extracted:', links.length);
-                return links;
-            });
-            discoveredUrls.push(...pageLinks);
-            this.log(`Found ${pageLinks.length} links from page`, logger_1.Level.LOG);
-            }
-            catch (error) {
-                this.log(`Link extraction failed: ${error.message}`, logger_1.Level.WARN);
+            let processedCount = 0;
+            while (crawlQueue.length > 0 && crawlResults.length < crawlConfig.limit) {
+                if (this.isAborted) {
+                    this.log('Workflow aborted during crawl', logger_1.Level.WARN);
+                    break;
                 }
-
-
+                const { url, depth } = crawlQueue.shift();
+                processedCount++;
+                this.log(`[${crawlResults.length + 1}/${crawlConfig.limit}] Crawling (depth ${depth}): ${url}`, logger_1.Level.LOG);
                 try {
-
-
-
-                        return false;
-                    }
-                    else if (crawlConfig.mode === 'subdomain') {
-                        if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain)
-                            return false;
-                    }
-                    else if (crawlConfig.mode === 'path') {
-                        if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname))
-                            return false;
+                    if (robotRules.crawlDelay && crawlResults.length > 0) {
+                        this.log(`Applying crawl delay: ${robotRules.crawlDelay}ms`, logger_1.Level.LOG);
+                        yield new Promise(resolve => setTimeout(resolve, robotRules.crawlDelay));
                     }
-                    if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
-                        const matches = crawlConfig.includePaths.some(pattern => {
-                            const regex = new RegExp(pattern);
-                            return regex.test(url);
-                        });
-                        if (!matches)
-                            return false;
-                    }
-                    if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
-                        const matches = crawlConfig.excludePaths.some(pattern => {
-                            const regex = new RegExp(pattern);
-                            return regex.test(url);
-                        });
-                        if (matches)
-                            return false;
-                    }
-                    return true;
-                }
-                catch (error) {
-                    return false;
-                }
-            });
-            const uniqueUrls = Array.from(new Set(filteredUrls.map(url => {
-                return url.replace(/#.*$/, '').replace(/\/$/, '');
-            })));
-            const basePathname = parsedBase.pathname;
-            const prioritizedUrls = uniqueUrls.sort((a, b) => {
-                try {
-                    const aUrl = new URL(a);
-                    const bUrl = new URL(b);
-                    const aMatchesBase = aUrl.pathname.startsWith(basePathname);
-                    const bMatchesBase = bUrl.pathname.startsWith(basePathname);
-                    if (aMatchesBase && !bMatchesBase)
-                        return -1;
-                    if (!aMatchesBase && bMatchesBase)
-                        return 1;
-                    return 0;
-                }
-                catch (error) {
-                    return 0;
-                }
-            });
-            const finalUrls = prioritizedUrls.slice(0, crawlConfig.limit);
-            this.log(`Crawl discovered ${finalUrls.length} URLs (from ${discoveredUrls.length} total)`, logger_1.Level.LOG);
-            this.log(`Starting to scrape content from ${finalUrls.length} discovered URLs...`, logger_1.Level.LOG);
-            const crawlResults = [];
-            for (let i = 0; i < finalUrls.length; i++) {
-                const url = finalUrls[i];
-                try {
-                    this.log(`[${i + 1}/${finalUrls.length}] Scraping: ${url}`, logger_1.Level.LOG);
                     yield page.goto(url, {
                         waitUntil: 'domcontentloaded',
                         timeout: 30000
-                    }).catch(() => {
-
+                    }).catch((err) => {
+                        throw new Error(`Navigation failed: ${err.message}`);
                     });
                     yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
-                    const
-
-
-
-
-
-
-
-                    const
-
-
-
-
-
-
-                    }
-
-
-
-
-
-                    elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
-                    const html = document.documentElement.outerHTML;
-                    const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
-                    const allMetadata = getAllMeta();
-                    return {
-                        title,
-                        description: getMeta('description'),
-                        text: bodyText,
-                        html: html,
-                        links: links,
-                        wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
-                        metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
-                    };
-                });
-                crawlResults.push({
-                    metadata: Object.assign(Object.assign({}, pageData.metadata), { url: url, sourceURL: url }),
-                    html: pageData.html,
-                    text: pageData.text,
-                    links: pageData.links,
-                    wordCount: pageData.wordCount,
-                    scrapedAt: new Date().toISOString()
-                });
-                this.log(`✓ Scraped ${url} (${pageData.wordCount} words)`, logger_1.Level.LOG);
+                    const pageResult = yield scrapePageContent(url);
+                    pageResult.metadata.depth = depth;
+                    crawlResults.push(pageResult);
+                    this.log(`✓ Scraped ${url} (${pageResult.wordCount} words, depth ${depth})`, logger_1.Level.LOG);
+                    if (crawlConfig.followLinks && depth < crawlConfig.maxDepth) {
+                        const newLinks = yield extractLinksFromPage();
+                        let addedCount = 0;
+                        for (const link of newLinks) {
+                            const normalized = normalizeUrl(link);
+                            if (!visitedUrls.has(normalized) &&
+                                isUrlAllowedByConfig(link) &&
+                                isUrlAllowedByRobots(link)) {
+                                visitedUrls.add(normalized);
+                                crawlQueue.push({ url: link, depth: depth + 1 });
+                                addedCount++;
+                            }
+                        }
+                        if (addedCount > 0) {
+                            this.log(`Added ${addedCount} new URLs to queue at depth ${depth + 1}`, logger_1.Level.LOG);
+                        }
+                    }
                 }
                 catch (error) {
-                    this.log(`Failed to
+                    this.log(`Failed to crawl ${url}: ${error.message}`, logger_1.Level.WARN);
                     crawlResults.push({
-
+                        metadata: {
+                            url: url,
+                            sourceURL: url,
+                            depth: depth
+                        },
                         error: error.message,
                         scrapedAt: new Date().toISOString()
                     });
                 }
             }
-            this.log(`
+            this.log(`Crawl completed: ${crawlResults.length} pages scraped (${processedCount} URLs processed, ${visitedUrls.size} URLs discovered)`, logger_1.Level.LOG);
             const actionType = "crawl";
             const actionName = "Crawl Results";
             if (!this.serializableDataByType[actionType]) {
|