npm - mx-cloud - Versions diffs - 0.0.30 → 0.0.31 - Mend

mx-cloud 0.0.30 → 0.0.31

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (3)

package/build/index.d.ts (changed)

@@ -1,5 +1,5 @@
 import Interpreter from './interpret';
 export default Interpreter;
 export { default as Preprocessor } from './preprocessor';
-export type { WorkflowFile, WhereWhatPair, Where, What, } from './types/workflow';
+export type { WorkflowFile, WhereWhatPair, Where, What, CustomFunctions, } from './types/workflow';
 export { unaryOperators, naryOperators, meta as metaOperators } from './types/logic';

package/build/interpret.js (changed)

@@ -252,15 +252,19 @@ class Interpreter extends events_1.EventEmitter {
                     }
                     yield page.close();
                 }),
-                scrape: (selector) => __awaiter(this, void 0, void 0, function* () {
-                    var _a;
-                    if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
-                        this.options.debugChannel.setActionType('scrape');
-                    }
-                    yield this.ensureScriptsLoaded(page);
-                    const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
-                    yield this.callWithTimeout(() => this.options.serializableCallback(scrapeResults), 30000, 'serializableCallback (scrape)');
-                }),
+                // DEPRECATED: Old scrape action - commented out in favor of new workflow-based scrape action
+                // scrape: async (selector?: string) => {
+                //   if (this.options.debugChannel?.setActionType) {
+                //     this.options.debugChannel.setActionType('scrape');
+                //   }
+                //   await this.ensureScriptsLoaded(page);
+                //   const scrapeResults: Record<string, string>[] = await page.evaluate((s) => window.scrape(s ?? null), selector);
+                //   await this.callWithTimeout(
+                //     () => this.options.serializableCallback(scrapeResults),
+                //     30000,
+                //     'serializableCallback (scrape)'
+                //   );
+                // },
                 scrapeSchema: (schema_1, ...args_1) => __awaiter(this, [schema_1, ...args_1], void 0, function* (schema, actionName = "") {
                     var _a;
                     if (this.isAborted) {
@@ -1567,6 +1571,169 @@ class Interpreter extends events_1.EventEmitter {
                         throw new Error(`Search execution error: ${error.message}`);
                     }
                 }),
+                /**
+                 * scrape action: Converts a webpage to text, markdown, HTML, and/or screenshots.
+                 * This is the workflow action for scrape robots.
+                 */
+                scrape: (scrapeConfig) => __awaiter(this, void 0, void 0, function* () {
+                    var _a;
+                    if (this.isAborted) {
+                        this.log('Workflow aborted, stopping scrape', logger_1.Level.WARN);
+                        return;
+                    }
+                    if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
+                        this.options.debugChannel.setActionType('scrape');
+                    }
+                    this.log(`Starting scrape for URL: ${scrapeConfig.url}`, logger_1.Level.LOG);
+                    try {
+                        const formats = scrapeConfig.formats || ['markdown', 'html', 'text'];
+                        const url = scrapeConfig.url;
+                        if (!url) {
+                            throw new Error('No URL specified for scrape action');
+                        }
+                        const currentUrl = page.url();
+                        if (currentUrl === 'about:blank' || currentUrl === '' || !currentUrl.includes(new URL(url).hostname)) {
+                            this.log(`Navigating to ${url}`, logger_1.Level.LOG);
+                            yield page.goto(url, { waitUntil: 'load', timeout: 60000 });
+                            yield page.waitForLoadState('networkidle', { timeout: 15000 }).catch(() => { });
+                        }
+                        const serializableOutput = {};
+                        const SCRAPE_TIMEOUT = 120000;
+                        if (formats.includes('text')) {
+                            try {
+                                const textPromise = page.evaluate(() => {
+                                    const body = document.body;
+                                    if (!body)
+                                        return '';
+                                    return body.innerText || body.textContent || '';
+                                });
+                                const timeoutPromise = new Promise((_, reject) => {
+                                    setTimeout(() => reject(new Error(`Text extraction timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
+                                });
+                                const text = yield Promise.race([textPromise, timeoutPromise]);
+                                if (text && text.trim().length > 0) {
+                                    serializableOutput.text = [{ content: text.trim() }];
+                                    this.log('Text extraction completed', logger_1.Level.LOG);
+                                }
+                            }
+                            catch (error) {
+                                this.log(`Text extraction failed: ${error.message}`, logger_1.Level.WARN);
+                            }
+                        }
+                        if (formats.includes('markdown')) {
+                            try {
+                                const html = yield page.evaluate(() => {
+                                    const selectors = [
+                                        "script", "style", "link[rel='stylesheet']", "noscript", "meta",
+                                        "svg", "img", "picture", "source", "video", "audio", "iframe", "object", "embed"
+                                    ];
+                                    selectors.forEach(sel => {
+                                        document.querySelectorAll(sel).forEach(e => e.remove());
+                                    });
+                                    const all = document.querySelectorAll("*");
+                                    all.forEach(el => {
+                                        [...el.attributes].forEach(attr => {
+                                            if (attr.name.startsWith("on") || attr.name === "data-mx-id") {
+                                                el.removeAttribute(attr.name);
+                                            }
+                                        });
+                                    });
+                                    return document.documentElement.outerHTML;
+                                });
+                                const markdownPromise = (0, markdown_1.parseMarkdown)(html, url);
+                                const timeoutPromise = new Promise((_, reject) => {
+                                    setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
+                                });
+                                const markdown = yield Promise.race([markdownPromise, timeoutPromise]);
+                                if (markdown && markdown.trim().length > 0) {
+                                    serializableOutput.markdown = [{ content: markdown }];
+                                    this.log('Markdown conversion completed', logger_1.Level.LOG);
+                                }
+                            }
+                            catch (error) {
+                                this.log(`Markdown conversion failed: ${error.message}`, logger_1.Level.WARN);
+                            }
+                        }
+                        if (formats.includes('html')) {
+                            try {
+                                const htmlPromise = page.evaluate(() => {
+                                    const selectors = [
+                                        "script", "style", "link[rel='stylesheet']", "noscript", "meta",
+                                        "svg", "img", "picture", "source", "video", "audio", "iframe", "object", "embed"
+                                    ];
+                                    selectors.forEach(sel => {
+                                        document.querySelectorAll(sel).forEach(e => e.remove());
+                                    });
+                                    const all = document.querySelectorAll("*");
+                                    all.forEach(el => {
+                                        [...el.attributes].forEach(attr => {
+                                            if (attr.name.startsWith("on") || attr.name === "data-mx-id") {
+                                                el.removeAttribute(attr.name);
+                                            }
+                                        });
+                                    });
+                                    return document.documentElement.outerHTML;
+                                });
+                                const timeoutPromise = new Promise((_, reject) => {
+                                    setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
+                                });
+                                const html = yield Promise.race([htmlPromise, timeoutPromise]);
+                                if (html && html.trim().length > 0) {
+                                    serializableOutput.html = [{ content: html }];
+                                    this.log('HTML conversion completed', logger_1.Level.LOG);
+                                }
+                            }
+                            catch (error) {
+                                this.log(`HTML conversion failed: ${error.message}`, logger_1.Level.WARN);
+                            }
+                        }
+                        if (formats.includes('screenshot-visible')) {
+                            try {
+                                const screenshotBuffer = yield page.screenshot({ fullPage: false, type: 'png' });
+                                if (screenshotBuffer && screenshotBuffer.length > 0) {
+                                    yield this.options.binaryCallback({
+                                        name: 'screenshot-visible',
+                                        data: screenshotBuffer,
+                                        mimeType: 'image/png'
+                                    }, 'image/png');
+                                    this.log('Visible screenshot captured', logger_1.Level.LOG);
+                                }
+                            }
+                            catch (error) {
+                                this.log(`Screenshot-visible failed: ${error.message}`, logger_1.Level.WARN);
+                            }
+                        }
+                        if (formats.includes('screenshot-fullpage')) {
+                            try {
+                                const screenshotBuffer = yield page.screenshot({ fullPage: true, type: 'png' });
+                                if (screenshotBuffer && screenshotBuffer.length > 0) {
+                                    yield this.options.binaryCallback({
+                                        name: 'screenshot-fullpage',
+                                        data: screenshotBuffer,
+                                        mimeType: 'image/png'
+                                    }, 'image/png');
+                                    this.log('Full page screenshot captured', logger_1.Level.LOG);
+                                }
+                            }
+                            catch (error) {
+                                this.log(`Screenshot-fullpage failed: ${error.message}`, logger_1.Level.WARN);
+                            }
+                        }
+                        const hasSerializableOutput = Object.keys(serializableOutput).length > 0 &&
+                            Object.values(serializableOutput).some((arr) => Array.isArray(arr) && arr.length > 0);
+                        if (hasSerializableOutput) {
+                            yield this.options.serializableCallback({ scrape: serializableOutput });
+                            this.log(`scrape completed successfully for ${url}`, logger_1.Level.LOG);
+                        }
+                        else {
+                            this.log(`scrape completed but no content could be extracted from ${url}`, logger_1.Level.WARN);
+                        }
+                    }
+                    catch (error) {
+                        this.log(`scrape action failed: ${error.message}`, logger_1.Level.ERROR);
+                        throw new Error(`scrape execution error: ${error.message}`);
+                    }
+                }),
             };
             const executeAction = (invokee, methodName, args) => __awaiter(this, void 0, void 0, function* () {
                 console.log("Executing action:", methodName, args);

package/package.json (changed)

@@ -1,6 +1,6 @@
 {
   "name": "mx-cloud",
-  "version": "0.0.30",
+  "version": "0.0.31",
   "description": "mx cloud",
   "main": "build/index.js",
   "typings": "build/index.d.ts",