mx-cloud 0.0.30 → 0.0.31

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/build/index.d.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import Interpreter from './interpret';
2
2
  export default Interpreter;
3
3
  export { default as Preprocessor } from './preprocessor';
4
- export type { WorkflowFile, WhereWhatPair, Where, What, } from './types/workflow';
4
+ export type { WorkflowFile, WhereWhatPair, Where, What, CustomFunctions, } from './types/workflow';
5
5
  export { unaryOperators, naryOperators, meta as metaOperators } from './types/logic';
@@ -252,15 +252,19 @@ class Interpreter extends events_1.EventEmitter {
252
252
  }
253
253
  yield page.close();
254
254
  }),
255
- scrape: (selector) => __awaiter(this, void 0, void 0, function* () {
256
- var _a;
257
- if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
258
- this.options.debugChannel.setActionType('scrape');
259
- }
260
- yield this.ensureScriptsLoaded(page);
261
- const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
262
- yield this.callWithTimeout(() => this.options.serializableCallback(scrapeResults), 30000, 'serializableCallback (scrape)');
263
- }),
255
+ // DEPRECATED: Old scrape action - commented out in favor of new workflow-based scrape action
256
+ // scrape: async (selector?: string) => {
257
+ // if (this.options.debugChannel?.setActionType) {
258
+ // this.options.debugChannel.setActionType('scrape');
259
+ // }
260
+ // await this.ensureScriptsLoaded(page);
261
+ // const scrapeResults: Record<string, string>[] = await page.evaluate((s) => window.scrape(s ?? null), selector);
262
+ // await this.callWithTimeout(
263
+ // () => this.options.serializableCallback(scrapeResults),
264
+ // 30000,
265
+ // 'serializableCallback (scrape)'
266
+ // );
267
+ // },
264
268
  scrapeSchema: (schema_1, ...args_1) => __awaiter(this, [schema_1, ...args_1], void 0, function* (schema, actionName = "") {
265
269
  var _a;
266
270
  if (this.isAborted) {
@@ -1567,6 +1571,169 @@ class Interpreter extends events_1.EventEmitter {
1567
1571
  throw new Error(`Search execution error: ${error.message}`);
1568
1572
  }
1569
1573
  }),
// ---------------------------------------------------------------------------
// Review notes (annotation only; every diff line below is left unchanged).
//
// scrape(scrapeConfig): reads scrapeConfig.url and scrapeConfig.formats,
// navigates the page when needed, then attempts each requested format inside
// its own try/catch.
//   - text, markdown and html results are collected in serializableOutput and
//     handed to this.options.serializableCallback as { scrape: serializableOutput }.
//   - screenshot-visible and screenshot-fullpage buffers are handed to
//     this.options.binaryCallback.
//   - A failure inside one format is logged at WARN and does not stop the
//     remaining formats.
//   - Anything thrown outside those inner try/catch blocks (missing url,
//     navigation error, serializableCallback error) is logged at ERROR and
//     rethrown as a new Error prefixed with: scrape execution error.
// page, logger_1 and markdown_1 are not defined in this hunk; they come from
// the enclosing scope, which is not visible here.
// ---------------------------------------------------------------------------
1574
+ /**
1575
+ * scrape action: Converts a webpage to text, markdown, HTML, and/or screenshots.
1576
+ * This is the workflow action for scrape robots.
1577
+ */
1578
+ scrape: (scrapeConfig) => __awaiter(this, void 0, void 0, function* () {
1579
+ var _a;
// Stop immediately, with a WARN log and no result, when the run was aborted.
1580
+ if (this.isAborted) {
1581
+ this.log('Workflow aborted, stopping scrape', logger_1.Level.WARN);
1582
+ return;
1583
+ }
// Report the action type to the debug channel when one exposing setActionType is configured.
1584
+ if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
1585
+ this.options.debugChannel.setActionType('scrape');
1586
+ }
// NOTE(review): scrapeConfig.url is read here, before the try block below, so a
// missing scrapeConfig throws a raw TypeError instead of the wrapped
// scrape execution error.
1587
+ this.log(`Starting scrape for URL: ${scrapeConfig.url}`, logger_1.Level.LOG);
1588
+ try {
// formats falls back to markdown, html and text when scrapeConfig.formats is falsy.
1589
+ const formats = scrapeConfig.formats || ['markdown', 'html', 'text'];
1590
+ const url = scrapeConfig.url;
1591
+ if (!url) {
1592
+ throw new Error('No URL specified for scrape action');
1593
+ }
// Navigate only when the page is blank or its current URL does not contain the
// hostname of the target url.
// NOTE(review): this is a substring test against the whole current URL, so a
// page on the same host but a different path is NOT reloaded, and a hostname
// that merely appears inside another URL also counts as a match.
// NOTE(review): new URL(url) throws on an unparsable url; that error is caught
// by the outer catch and rethrown as a scrape execution error.
1594
+ const currentUrl = page.url();
1595
+ if (currentUrl === 'about:blank' || currentUrl === '' || !currentUrl.includes(new URL(url).hostname)) {
1596
+ this.log(`Navigating to ${url}`, logger_1.Level.LOG);
1597
+ yield page.goto(url, { waitUntil: 'load', timeout: 60000 });
// Best-effort wait for network idle (15 s); a rejection is discarded by the empty catch.
1598
+ yield page.waitForLoadState('networkidle', { timeout: 15000 }).catch(() => { });
1599
+ }
1600
+ const serializableOutput = {};
// Time limit in milliseconds (120 s) used by the text, markdown and html steps below.
1601
+ const SCRAPE_TIMEOUT = 120000;
// --- text: body.innerText, falling back to body.textContent, or an empty string
// when the document has no body. Runs before any DOM stripping below.
1602
+ if (formats.includes('text')) {
1603
+ try {
1604
+ const textPromise = page.evaluate(() => {
1605
+ const body = document.body;
1606
+ if (!body)
1607
+ return '';
1608
+ return body.innerText || body.textContent || '';
1609
+ });
// NOTE(review): the setTimeout handle is not kept, so the timer is never
// cleared; it stays scheduled for the full 120 s even when the extraction
// wins the race. The same pattern repeats in the markdown and html steps.
1610
+ const timeoutPromise = new Promise((_, reject) => {
1611
+ setTimeout(() => reject(new Error(`Text extraction timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
1612
+ });
1613
+ const text = yield Promise.race([textPromise, timeoutPromise]);
// Text is stored trimmed; the markdown and html steps store their content untrimmed.
1614
+ if (text && text.trim().length > 0) {
1615
+ serializableOutput.text = [{ content: text.trim() }];
1616
+ this.log('Text extraction completed', logger_1.Level.LOG);
1617
+ }
1618
+ }
1619
+ catch (error) {
1620
+ this.log(`Text extraction failed: ${error.message}`, logger_1.Level.WARN);
1621
+ }
1622
+ }
// --- markdown: strip the page in place, take outerHTML, convert with parseMarkdown.
1623
+ if (formats.includes('markdown')) {
1624
+ try {
// NOTE(review): this callback mutates the LIVE document (elements are removed,
// attributes are stripped); nothing in this hunk restores them afterwards.
// NOTE(review): this page.evaluate is awaited directly, so only the
// parseMarkdown call further down is covered by SCRAPE_TIMEOUT.
1625
+ const html = yield page.evaluate(() => {
1626
+ const selectors = [
1627
+ "script", "style", "link[rel='stylesheet']", "noscript", "meta",
1628
+ "svg", "img", "picture", "source", "video", "audio", "iframe", "object", "embed"
1629
+ ];
1630
+ selectors.forEach(sel => {
1631
+ document.querySelectorAll(sel).forEach(e => e.remove());
1632
+ });
1633
+ const all = document.querySelectorAll("*");
1634
+ all.forEach(el => {
// Removes every attribute whose name starts with on (not only known event
// handlers) plus data-mx-id. The attribute list is copied with spread first
// because it is modified during the loop.
1635
+ [...el.attributes].forEach(attr => {
1636
+ if (attr.name.startsWith("on") || attr.name === "data-mx-id") {
1637
+ el.removeAttribute(attr.name);
1638
+ }
1639
+ });
1640
+ });
1641
+ return document.documentElement.outerHTML;
1642
+ });
1643
+ const markdownPromise = (0, markdown_1.parseMarkdown)(html, url);
1644
+ const timeoutPromise = new Promise((_, reject) => {
1645
+ setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
1646
+ });
1647
+ const markdown = yield Promise.race([markdownPromise, timeoutPromise]);
1648
+ if (markdown && markdown.trim().length > 0) {
1649
+ serializableOutput.markdown = [{ content: markdown }];
1650
+ this.log('Markdown conversion completed', logger_1.Level.LOG);
1651
+ }
1652
+ }
1653
+ catch (error) {
1654
+ this.log(`Markdown conversion failed: ${error.message}`, logger_1.Level.WARN);
1655
+ }
1656
+ }
// --- html: the same in-page stripping as the markdown step, duplicated verbatim.
// Here the page.evaluate call itself is raced against the timer.
1657
+ if (formats.includes('html')) {
1658
+ try {
1659
+ const htmlPromise = page.evaluate(() => {
1660
+ const selectors = [
1661
+ "script", "style", "link[rel='stylesheet']", "noscript", "meta",
1662
+ "svg", "img", "picture", "source", "video", "audio", "iframe", "object", "embed"
1663
+ ];
1664
+ selectors.forEach(sel => {
1665
+ document.querySelectorAll(sel).forEach(e => e.remove());
1666
+ });
1667
+ const all = document.querySelectorAll("*");
1668
+ all.forEach(el => {
1669
+ [...el.attributes].forEach(attr => {
1670
+ if (attr.name.startsWith("on") || attr.name === "data-mx-id") {
1671
+ el.removeAttribute(attr.name);
1672
+ }
1673
+ });
1674
+ });
1675
+ return document.documentElement.outerHTML;
1676
+ });
1677
+ const timeoutPromise = new Promise((_, reject) => {
1678
+ setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
1679
+ });
1680
+ const html = yield Promise.race([htmlPromise, timeoutPromise]);
1681
+ if (html && html.trim().length > 0) {
1682
+ serializableOutput.html = [{ content: html }];
1683
+ this.log('HTML conversion completed', logger_1.Level.LOG);
1684
+ }
1685
+ }
1686
+ catch (error) {
1687
+ this.log(`HTML conversion failed: ${error.message}`, logger_1.Level.WARN);
1688
+ }
1689
+ }
// --- screenshots: PNG of the viewport and/or of the full page, each passed to
// binaryCallback together with the mime type as a second argument.
// NOTE(review): both screenshots are taken AFTER the markdown and html steps.
// When either of those formats was requested, style, img, svg and the other
// listed elements have already been removed from the live page, so the
// capture shows the stripped document rather than the page as loaded.
// No timeout race is applied to the screenshot calls in this hunk.
1690
+ if (formats.includes('screenshot-visible')) {
1691
+ try {
1692
+ const screenshotBuffer = yield page.screenshot({ fullPage: false, type: 'png' });
1693
+ if (screenshotBuffer && screenshotBuffer.length > 0) {
1694
+ yield this.options.binaryCallback({
1695
+ name: 'screenshot-visible',
1696
+ data: screenshotBuffer,
1697
+ mimeType: 'image/png'
1698
+ }, 'image/png');
1699
+ this.log('Visible screenshot captured', logger_1.Level.LOG);
1700
+ }
1701
+ }
1702
+ catch (error) {
1703
+ this.log(`Screenshot-visible failed: ${error.message}`, logger_1.Level.WARN);
1704
+ }
1705
+ }
1706
+ if (formats.includes('screenshot-fullpage')) {
1707
+ try {
1708
+ const screenshotBuffer = yield page.screenshot({ fullPage: true, type: 'png' });
1709
+ if (screenshotBuffer && screenshotBuffer.length > 0) {
1710
+ yield this.options.binaryCallback({
1711
+ name: 'screenshot-fullpage',
1712
+ data: screenshotBuffer,
1713
+ mimeType: 'image/png'
1714
+ }, 'image/png');
1715
+ this.log('Full page screenshot captured', logger_1.Level.LOG);
1716
+ }
1717
+ }
1718
+ catch (error) {
1719
+ this.log(`Screenshot-fullpage failed: ${error.message}`, logger_1.Level.WARN);
1720
+ }
1721
+ }
// Invoke serializableCallback only when at least one of text, markdown or html
// was stored. Every key set above holds a one-element array, so the second
// half of this condition is already implied by the first. Screenshots do not
// count here: a screenshot-only run ends in the WARN branch below even when
// the captures succeeded.
1722
+ const hasSerializableOutput = Object.keys(serializableOutput).length > 0 &&
1723
+ Object.values(serializableOutput).some((arr) => Array.isArray(arr) && arr.length > 0);
1724
+ if (hasSerializableOutput) {
1725
+ yield this.options.serializableCallback({ scrape: serializableOutput });
1726
+ this.log(`scrape completed successfully for ${url}`, logger_1.Level.LOG);
1727
+ }
1728
+ else {
1729
+ this.log(`scrape completed but no content could be extracted from ${url}`, logger_1.Level.WARN);
1730
+ }
1731
+ }
// Outer handler: logs at ERROR, then rethrows a new Error carrying only the
// original message; the original error object and its stack are not attached.
1732
+ catch (error) {
1733
+ this.log(`scrape action failed: ${error.message}`, logger_1.Level.ERROR);
1734
+ throw new Error(`scrape execution error: ${error.message}`);
1735
+ }
1736
+ }),
1570
1737
  };
1571
1738
  const executeAction = (invokee, methodName, args) => __awaiter(this, void 0, void 0, function* () {
1572
1739
  console.log("Executing action:", methodName, args);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mx-cloud",
3
- "version": "0.0.30",
3
+ "version": "0.0.31",
4
4
  "description": "mx cloud",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",