mx-cloud 0.0.28 → 0.0.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.d.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import Interpreter from './interpret';
2
2
  export default Interpreter;
3
3
  export { default as Preprocessor } from './preprocessor';
4
- export type { WorkflowFile, WhereWhatPair, Where, What, } from './types/workflow';
4
+ export type { WorkflowFile, WhereWhatPair, Where, What, CustomFunctions, } from './types/workflow';
5
5
  export { unaryOperators, naryOperators, meta as metaOperators } from './types/logic';
@@ -54,6 +54,7 @@ const concurrency_1 = __importDefault(require("./utils/concurrency"));
54
54
  const preprocessor_1 = __importDefault(require("./preprocessor"));
55
55
  const logger_1 = __importStar(require("./utils/logger"));
56
56
  const selector_1 = require("./selector");
57
+ const markdown_1 = require("./utils/markdown");
57
58
  /**
58
59
  * Class for running the Smart Workflows.
59
60
  */
@@ -251,15 +252,19 @@ class Interpreter extends events_1.EventEmitter {
251
252
  }
252
253
  yield page.close();
253
254
  }),
254
- scrape: (selector) => __awaiter(this, void 0, void 0, function* () {
255
- var _a;
256
- if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
257
- this.options.debugChannel.setActionType('scrape');
258
- }
259
- yield this.ensureScriptsLoaded(page);
260
- const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
261
- yield this.callWithTimeout(() => this.options.serializableCallback(scrapeResults), 30000, 'serializableCallback (scrape)');
262
- }),
255
+ // DEPRECATED: Old scrape action - commented out in favor of new workflow-based scrape action
256
+ // scrape: async (selector?: string) => {
257
+ // if (this.options.debugChannel?.setActionType) {
258
+ // this.options.debugChannel.setActionType('scrape');
259
+ // }
260
+ // await this.ensureScriptsLoaded(page);
261
+ // const scrapeResults: Record<string, string>[] = await page.evaluate((s) => window.scrape(s ?? null), selector);
262
+ // await this.callWithTimeout(
263
+ // () => this.options.serializableCallback(scrapeResults),
264
+ // 30000,
265
+ // 'serializableCallback (scrape)'
266
+ // );
267
+ // },
263
268
  scrapeSchema: (schema_1, ...args_1) => __awaiter(this, [schema_1, ...args_1], void 0, function* (schema, actionName = "") {
264
269
  var _a;
265
270
  if (this.isAborted) {
@@ -922,14 +927,60 @@ class Interpreter extends events_1.EventEmitter {
922
927
  metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
923
928
  };
924
929
  });
925
- return {
930
+ const result = {
926
931
  metadata: Object.assign(Object.assign({}, pageData.metadata), { url: url, sourceURL: url }),
927
- html: pageData.html,
928
- text: pageData.text,
929
- links: pageData.links,
930
- wordCount: pageData.wordCount,
931
932
  scrapedAt: new Date().toISOString()
932
933
  };
934
+ const formats = crawlConfig.outputFormats || [];
935
+ if (formats.includes('text')) {
936
+ result.text = pageData.text;
937
+ result.wordCount = pageData.wordCount;
938
+ }
939
+ if (formats.includes('html')) {
940
+ result.html = pageData.html;
941
+ result.links = pageData.links;
942
+ }
943
+ if (formats.includes('markdown')) {
944
+ try {
945
+ const markdown = yield (0, markdown_1.parseMarkdown)(pageData.html, url);
946
+ result.markdown = markdown;
947
+ }
948
+ catch (err) {
949
+ this.log(`Markdown conversion failed for ${url}: ${err.message}`, logger_1.Level.WARN);
950
+ result.markdown = '';
951
+ }
952
+ }
953
+ if (formats.includes('screenshot-visible')) {
954
+ try {
955
+ const screenshotBuffer = yield page.screenshot({ fullPage: false });
956
+ const screenshotName = `Crawl - ${crawlResults.length} - Visible`;
957
+ yield this.options.binaryCallback({
958
+ name: screenshotName,
959
+ data: screenshotBuffer,
960
+ mimeType: 'image/png'
961
+ }, 'image/png');
962
+ result.screenshotVisible = screenshotName;
963
+ }
964
+ catch (err) {
965
+ this.log(`Screenshot-visible failed for ${url}: ${err.message}`, logger_1.Level.WARN);
966
+ }
967
+ }
968
+ if (formats.includes('screenshot-fullpage')) {
969
+ try {
970
+ const screenshotBuffer = yield page.screenshot({ fullPage: true });
971
+ const screenshotName = `Crawl - ${crawlResults.length} - Full Page`;
972
+ yield this.options.binaryCallback({
973
+ name: screenshotName,
974
+ data: screenshotBuffer,
975
+ mimeType: 'image/png'
976
+ }, 'image/png');
977
+ result.screenshotFullpage = screenshotName;
978
+ }
979
+ catch (err) {
980
+ this.log(`Screenshot-fullpage failed for ${url}: ${err.message}`, logger_1.Level.WARN);
981
+ }
982
+ }
983
+ return result;
933
984
  });
934
985
  const visitedUrls = new Set();
935
986
  const crawlResults = [];
@@ -1036,15 +1087,28 @@ class Interpreter extends events_1.EventEmitter {
1036
1087
  yield new Promise(resolve => setTimeout(resolve, robotRules.crawlDelay));
1037
1088
  }
1038
1089
  yield page.goto(url, {
1039
- waitUntil: 'domcontentloaded',
1040
- timeout: 30000
1090
+ waitUntil: 'load',
1091
+ timeout: 60000
1041
1092
  }).catch((err) => {
1042
1093
  throw new Error(`Navigation failed: ${err.message}`);
1043
1094
  });
1044
- yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
1095
+ yield page.waitForLoadState('networkidle', { timeout: 15000 }).catch(() => { });
1045
1096
  const pageResult = yield scrapePageContent(url);
1046
1097
  pageResult.metadata.depth = depth;
1047
1098
  crawlResults.push(pageResult);
1099
+ const actionType = "crawl";
1100
+ const actionName = "Crawl Results";
1101
+ if (!this.serializableDataByType[actionType]) {
1102
+ this.serializableDataByType[actionType] = {};
1103
+ }
1104
+ this.serializableDataByType[actionType][actionName] = [...crawlResults];
1105
+ yield this.options.serializableCallback({
1106
+ crawl: this.serializableDataByType.crawl
1107
+ });
1108
+ if (this.isAborted) {
1109
+ this.log(`Run aborted after scraping ${url}, stopping crawl`, logger_1.Level.WARN);
1110
+ break;
1111
+ }
1048
1112
  this.log(`✓ Scraped ${url} (${pageResult.wordCount} words, depth ${depth})`, logger_1.Level.LOG);
1049
1113
  if (crawlConfig.followLinks && depth < crawlConfig.maxDepth) {
1050
1114
  const newLinks = yield extractLinksFromPage();
@@ -1308,6 +1372,7 @@ class Interpreter extends events_1.EventEmitter {
1308
1372
  filters: searchConfig.filters || {},
1309
1373
  resultsCount: searchResults.length,
1310
1374
  results: searchResults,
1375
+ mode: searchConfig.mode,
1311
1376
  searchedAt: new Date().toISOString()
1312
1377
  };
1313
1378
  this.serializableDataByType[actionType][actionName] = searchData;
@@ -1323,16 +1388,25 @@ class Interpreter extends events_1.EventEmitter {
1323
1388
  this.log(`Starting to scrape content from ${searchResults.length} search results...`, logger_1.Level.LOG);
1324
1389
  const scrapedResults = [];
1325
1390
  for (let i = 0; i < searchResults.length; i++) {
1391
+ if (this.isAborted) {
1392
+ this.log(`Run aborted, stopping search scraping at result ${i + 1}/${searchResults.length}`, logger_1.Level.WARN);
1393
+ break;
1394
+ }
1326
1395
  const result = searchResults[i];
1327
1396
  try {
1328
1397
  this.log(`[${i + 1}/${searchResults.length}] Scraping: ${result.url}`, logger_1.Level.LOG);
1398
+ let navigationFailed = false;
1329
1399
  yield page.goto(result.url, {
1330
- waitUntil: 'domcontentloaded',
1331
- timeout: 30000
1400
+ waitUntil: 'load',
1401
+ timeout: 60000
1332
1402
  }).catch(() => {
1333
1403
  this.log(`Failed to navigate to ${result.url}, skipping...`, logger_1.Level.WARN);
1404
+ navigationFailed = true;
1334
1405
  });
1335
- yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
1406
+ if (navigationFailed) {
1407
+ continue;
1408
+ }
1409
+ yield page.waitForLoadState('networkidle', { timeout: 15000 }).catch(() => { });
1336
1410
  const pageData = yield page.evaluate(() => {
1337
1411
  var _a, _b;
1338
1412
  const getMeta = (name) => {
@@ -1368,7 +1442,7 @@ class Interpreter extends events_1.EventEmitter {
1368
1442
  metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
1369
1443
  };
1370
1444
  });
1371
- scrapedResults.push({
1445
+ const scrapedResult = {
1372
1446
  searchResult: {
1373
1447
  query: searchConfig.query,
1374
1448
  position: result.position,
@@ -1376,12 +1450,79 @@ class Interpreter extends events_1.EventEmitter {
1376
1450
  searchDescription: result.description,
1377
1451
  },
1378
1452
  metadata: Object.assign(Object.assign({}, pageData.metadata), { url: result.url, sourceURL: result.url }),
1379
- html: pageData.html,
1380
- text: pageData.text,
1381
- links: pageData.links,
1382
- wordCount: pageData.wordCount,
1383
1453
  scrapedAt: new Date().toISOString()
1454
+ };
1455
+ const formats = searchConfig.outputFormats || [];
1456
+ if (formats.includes('text')) {
1457
+ scrapedResult.text = pageData.text;
1458
+ scrapedResult.wordCount = pageData.wordCount;
1459
+ }
1460
+ if (formats.includes('html')) {
1461
+ scrapedResult.html = pageData.html;
1462
+ scrapedResult.links = pageData.links;
1463
+ }
1464
+ if (formats.includes('markdown')) {
1465
+ try {
1466
+ const markdown = yield (0, markdown_1.parseMarkdown)(pageData.html, result.url);
1467
+ scrapedResult.markdown = markdown;
1468
+ }
1469
+ catch (err) {
1470
+ this.log(`Markdown conversion failed for ${result.url}: ${err.message}`, logger_1.Level.WARN);
1471
+ scrapedResult.markdown = '';
1472
+ }
1473
+ }
1474
+ if (formats.includes('screenshot-visible')) {
1475
+ try {
1476
+ const screenshotBuffer = yield page.screenshot({ fullPage: false });
1477
+ const screenshotName = `Search - ${i} - Visible`;
1478
+ yield this.options.binaryCallback({
1479
+ name: screenshotName,
1480
+ data: screenshotBuffer,
1481
+ mimeType: 'image/png'
1482
+ }, 'image/png');
1483
+ scrapedResult.screenshotVisible = screenshotName;
1484
+ }
1485
+ catch (err) {
1486
+ this.log(`Screenshot-visible failed for ${result.url}: ${err.message}`, logger_1.Level.WARN);
1487
+ }
1488
+ }
1489
+ if (formats.includes('screenshot-fullpage')) {
1490
+ try {
1491
+ const screenshotBuffer = yield page.screenshot({ fullPage: true });
1492
+ const screenshotName = `Search - ${i} - Full Page`;
1493
+ yield this.options.binaryCallback({
1494
+ name: screenshotName,
1495
+ data: screenshotBuffer,
1496
+ mimeType: 'image/png'
1497
+ }, 'image/png');
1498
+ scrapedResult.screenshotFullpage = screenshotName;
1499
+ }
1500
+ catch (err) {
1501
+ this.log(`Screenshot-fullpage failed for ${result.url}: ${err.message}`, logger_1.Level.WARN);
1502
+ }
1503
+ }
1504
+ scrapedResults.push(scrapedResult);
1505
+ const actionType = "search";
1506
+ const actionName = "Search Results";
1507
+ if (!this.serializableDataByType[actionType]) {
1508
+ this.serializableDataByType[actionType] = {};
1509
+ }
1510
+ this.serializableDataByType[actionType][actionName] = {
1511
+ query: searchConfig.query,
1512
+ provider: searchConfig.provider,
1513
+ filters: searchConfig.filters || {},
1514
+ resultsCount: scrapedResults.length,
1515
+ results: scrapedResults,
1516
+ mode: searchConfig.mode,
1517
+ searchedAt: new Date().toISOString()
1518
+ };
1519
+ yield this.options.serializableCallback({
1520
+ search: this.serializableDataByType.search
1384
1521
  });
1522
+ if (this.isAborted) {
1523
+ this.log(`Run aborted after scraping ${result.url}, stopping search`, logger_1.Level.WARN);
1524
+ break;
1525
+ }
1385
1526
  this.log(`✓ Scraped ${result.url} (${pageData.wordCount} words)`, logger_1.Level.LOG);
1386
1527
  }
1387
1528
  catch (error) {
@@ -1430,6 +1571,169 @@ class Interpreter extends events_1.EventEmitter {
1430
1571
  throw new Error(`Search execution error: ${error.message}`);
1431
1572
  }
1432
1573
  }),
1574
+ /**
1575
+ * scrape action: Converts a webpage to text, markdown, HTML, and/or screenshots.
1576
+ * This is the workflow action for scrape robots.
1577
+ */
1578
+ scrape: (scrapeConfig) => __awaiter(this, void 0, void 0, function* () {
1579
+ var _a;
1580
+ if (this.isAborted) {
1581
+ this.log('Workflow aborted, stopping scrape', logger_1.Level.WARN);
1582
+ return;
1583
+ }
1584
+ if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
1585
+ this.options.debugChannel.setActionType('scrape');
1586
+ }
1587
+ this.log(`Starting scrape for URL: ${scrapeConfig.url}`, logger_1.Level.LOG);
1588
+ try {
1589
+ const formats = scrapeConfig.formats || ['markdown', 'html', 'text'];
1590
+ const url = scrapeConfig.url;
1591
+ if (!url) {
1592
+ throw new Error('No URL specified for scrape action');
1593
+ }
1594
+ const currentUrl = page.url();
1595
+ if (currentUrl === 'about:blank' || currentUrl === '' || !currentUrl.includes(new URL(url).hostname)) {
1596
+ this.log(`Navigating to ${url}`, logger_1.Level.LOG);
1597
+ yield page.goto(url, { waitUntil: 'load', timeout: 60000 });
1598
+ yield page.waitForLoadState('networkidle', { timeout: 15000 }).catch(() => { });
1599
+ }
1600
+ const serializableOutput = {};
1601
+ const SCRAPE_TIMEOUT = 120000;
1602
+ if (formats.includes('text')) {
1603
+ try {
1604
+ const textPromise = page.evaluate(() => {
1605
+ const body = document.body;
1606
+ if (!body)
1607
+ return '';
1608
+ return body.innerText || body.textContent || '';
1609
+ });
1610
+ const timeoutPromise = new Promise((_, reject) => {
1611
+ setTimeout(() => reject(new Error(`Text extraction timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
1612
+ });
1613
+ const text = yield Promise.race([textPromise, timeoutPromise]);
1614
+ if (text && text.trim().length > 0) {
1615
+ serializableOutput.text = [{ content: text.trim() }];
1616
+ this.log('Text extraction completed', logger_1.Level.LOG);
1617
+ }
1618
+ }
1619
+ catch (error) {
1620
+ this.log(`Text extraction failed: ${error.message}`, logger_1.Level.WARN);
1621
+ }
1622
+ }
1623
+ if (formats.includes('markdown')) {
1624
+ try {
1625
+ const html = yield page.evaluate(() => {
1626
+ const selectors = [
1627
+ "script", "style", "link[rel='stylesheet']", "noscript", "meta",
1628
+ "svg", "img", "picture", "source", "video", "audio", "iframe", "object", "embed"
1629
+ ];
1630
+ selectors.forEach(sel => {
1631
+ document.querySelectorAll(sel).forEach(e => e.remove());
1632
+ });
1633
+ const all = document.querySelectorAll("*");
1634
+ all.forEach(el => {
1635
+ [...el.attributes].forEach(attr => {
1636
+ if (attr.name.startsWith("on") || attr.name === "data-mx-id") {
1637
+ el.removeAttribute(attr.name);
1638
+ }
1639
+ });
1640
+ });
1641
+ return document.documentElement.outerHTML;
1642
+ });
1643
+ const markdownPromise = (0, markdown_1.parseMarkdown)(html, url);
1644
+ const timeoutPromise = new Promise((_, reject) => {
1645
+ setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
1646
+ });
1647
+ const markdown = yield Promise.race([markdownPromise, timeoutPromise]);
1648
+ if (markdown && markdown.trim().length > 0) {
1649
+ serializableOutput.markdown = [{ content: markdown }];
1650
+ this.log('Markdown conversion completed', logger_1.Level.LOG);
1651
+ }
1652
+ }
1653
+ catch (error) {
1654
+ this.log(`Markdown conversion failed: ${error.message}`, logger_1.Level.WARN);
1655
+ }
1656
+ }
1657
+ if (formats.includes('html')) {
1658
+ try {
1659
+ const htmlPromise = page.evaluate(() => {
1660
+ const selectors = [
1661
+ "script", "style", "link[rel='stylesheet']", "noscript", "meta",
1662
+ "svg", "img", "picture", "source", "video", "audio", "iframe", "object", "embed"
1663
+ ];
1664
+ selectors.forEach(sel => {
1665
+ document.querySelectorAll(sel).forEach(e => e.remove());
1666
+ });
1667
+ const all = document.querySelectorAll("*");
1668
+ all.forEach(el => {
1669
+ [...el.attributes].forEach(attr => {
1670
+ if (attr.name.startsWith("on") || attr.name === "data-mx-id") {
1671
+ el.removeAttribute(attr.name);
1672
+ }
1673
+ });
1674
+ });
1675
+ return document.documentElement.outerHTML;
1676
+ });
1677
+ const timeoutPromise = new Promise((_, reject) => {
1678
+ setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
1679
+ });
1680
+ const html = yield Promise.race([htmlPromise, timeoutPromise]);
1681
+ if (html && html.trim().length > 0) {
1682
+ serializableOutput.html = [{ content: html }];
1683
+ this.log('HTML conversion completed', logger_1.Level.LOG);
1684
+ }
1685
+ }
1686
+ catch (error) {
1687
+ this.log(`HTML conversion failed: ${error.message}`, logger_1.Level.WARN);
1688
+ }
1689
+ }
1690
+ if (formats.includes('screenshot-visible')) {
1691
+ try {
1692
+ const screenshotBuffer = yield page.screenshot({ fullPage: false, type: 'png' });
1693
+ if (screenshotBuffer && screenshotBuffer.length > 0) {
1694
+ yield this.options.binaryCallback({
1695
+ name: 'screenshot-visible',
1696
+ data: screenshotBuffer,
1697
+ mimeType: 'image/png'
1698
+ }, 'image/png');
1699
+ this.log('Visible screenshot captured', logger_1.Level.LOG);
1700
+ }
1701
+ }
1702
+ catch (error) {
1703
+ this.log(`Screenshot-visible failed: ${error.message}`, logger_1.Level.WARN);
1704
+ }
1705
+ }
1706
+ if (formats.includes('screenshot-fullpage')) {
1707
+ try {
1708
+ const screenshotBuffer = yield page.screenshot({ fullPage: true, type: 'png' });
1709
+ if (screenshotBuffer && screenshotBuffer.length > 0) {
1710
+ yield this.options.binaryCallback({
1711
+ name: 'screenshot-fullpage',
1712
+ data: screenshotBuffer,
1713
+ mimeType: 'image/png'
1714
+ }, 'image/png');
1715
+ this.log('Full page screenshot captured', logger_1.Level.LOG);
1716
+ }
1717
+ }
1718
+ catch (error) {
1719
+ this.log(`Screenshot-fullpage failed: ${error.message}`, logger_1.Level.WARN);
1720
+ }
1721
+ }
1722
+ const hasSerializableOutput = Object.keys(serializableOutput).length > 0 &&
1723
+ Object.values(serializableOutput).some((arr) => Array.isArray(arr) && arr.length > 0);
1724
+ if (hasSerializableOutput) {
1725
+ yield this.options.serializableCallback({ scrape: serializableOutput });
1726
+ this.log(`scrape completed successfully for ${url}`, logger_1.Level.LOG);
1727
+ }
1728
+ else {
1729
+ this.log(`scrape completed but no content could be extracted from ${url}`, logger_1.Level.WARN);
1730
+ }
1731
+ }
1732
+ catch (error) {
1733
+ this.log(`scrape action failed: ${error.message}`, logger_1.Level.ERROR);
1734
+ throw new Error(`scrape execution error: ${error.message}`);
1735
+ }
1736
+ }),
1433
1737
  };
1434
1738
  const executeAction = (invokee, methodName, args) => __awaiter(this, void 0, void 0, function* () {
1435
1739
  console.log("Executing action:", methodName, args);
@@ -0,0 +1 @@
1
+ export declare function parseMarkdown(html: string | null | undefined, baseUrl?: string | null): Promise<string>;
@@ -0,0 +1,153 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.parseMarkdown = parseMarkdown;
13
+ function parseMarkdown(html, baseUrl) {
14
+ return __awaiter(this, void 0, void 0, function* () {
15
+ const TurndownService = require("turndown");
16
+ const { gfm } = require("joplin-turndown-plugin-gfm");
17
+ const cheerio = require("cheerio");
18
+ const { URL } = require("url");
19
+ if (!html)
20
+ return "";
21
+ const tidiedHtml = tidyHtml(html);
22
+ const t = new TurndownService({
23
+ headingStyle: "atx", // ensures #### instead of ------
24
+ codeBlockStyle: "fenced",
25
+ });
26
+ // ---------------------------------------------
27
+ // Proper ATX headings #### instead of underline-style
28
+ // ---------------------------------------------
29
+ t.addRule("forceAtxHeadings", {
30
+ filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
31
+ replacement: (content, node) => {
32
+ const level = Number(node.nodeName.charAt(1));
33
+ const clean = content.trim();
34
+ return `\n${"#".repeat(level)} ${clean}\n`;
35
+ },
36
+ });
37
+ // ---------------------------------------------
38
+ // Remove SVGs
39
+ // ---------------------------------------------
40
+ t.addRule("truncate-svg", {
41
+ filter: "svg",
42
+ replacement: () => "",
43
+ });
44
+ // ---------------------------------------------
45
+ // Improved paragraph cleanup
46
+ // ---------------------------------------------
47
+ t.addRule("improved-paragraph", {
48
+ filter: "p",
49
+ replacement: (innerText) => {
50
+ const trimmed = innerText.trim();
51
+ if (!trimmed)
52
+ return "";
53
+ return `${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
54
+ },
55
+ });
56
+ // ---------------------------------------------
57
+ // Inline link with fallback text
58
+ // ---------------------------------------------
59
+ t.addRule("inlineLink", {
60
+ filter: (node, opts) => node.nodeName === "A" && node.getAttribute("href"),
61
+ replacement: (content, node) => {
62
+ var _a, _b;
63
+ let text = content.trim();
64
+ // Fallback: aria-label → title → domain
65
+ if (!text) {
66
+ text =
67
+ ((_a = node.getAttribute("aria-label")) === null || _a === void 0 ? void 0 : _a.trim()) ||
68
+ ((_b = node.getAttribute("title")) === null || _b === void 0 ? void 0 : _b.trim()) ||
69
+ getDomainFromUrl(node.getAttribute("href")) ||
70
+ "link";
71
+ }
72
+ let href = node.getAttribute("href").trim();
73
+ // relative → absolute
74
+ if (baseUrl && isRelativeUrl(href)) {
75
+ try {
76
+ const u = new URL(href, baseUrl);
77
+ href = u.toString();
78
+ }
79
+ catch (_c) { }
80
+ }
81
+ href = cleanUrl(href);
82
+ return `[${text}](${href})`;
83
+ },
84
+ });
85
+ t.use(gfm);
86
+ // Convert HTML → Markdown
87
+ try {
88
+ let out = yield t.turndown(tidiedHtml);
89
+ out = fixBrokenLinks(out);
90
+ out = stripSkipLinks(out);
91
+ return out.trim();
92
+ }
93
+ catch (err) {
94
+ console.error("HTML→Markdown failed", { err });
95
+ return "";
96
+ }
97
+ });
98
+ }
99
+ // -----------------------------------------------------
100
+ // Helpers
101
+ // -----------------------------------------------------
102
+ function isRelativeUrl(url) {
103
+ return !url.includes("://") && !url.startsWith("mailto:") && !url.startsWith("tel:");
104
+ }
105
+ function getDomainFromUrl(url) {
106
+ try {
107
+ const u = new URL(url);
108
+ return u.hostname.replace("www.", "");
109
+ }
110
+ catch (_a) {
111
+ return null;
112
+ }
113
+ }
114
+ function cleanUrl(u) {
115
+ return u;
116
+ }
117
+ function cleanAttribute(attr) {
118
+ return attr ? attr.replace(/(\n+\s*)+/g, "\n") : "";
119
+ }
120
+ function tidyHtml(html) {
121
+ const cheerio = require("cheerio");
122
+ const $ = cheerio.load(html);
123
+ const manuallyCleanedElements = [
124
+ "script",
125
+ "style",
126
+ "iframe",
127
+ "noscript",
128
+ "meta",
129
+ "link",
130
+ "object",
131
+ "embed",
132
+ "canvas",
133
+ "audio",
134
+ "video",
135
+ ];
136
+ manuallyCleanedElements.forEach((tag) => $(tag).remove());
137
+ return $("body").html();
138
+ }
139
+ function fixBrokenLinks(md) {
140
+ let depth = 0;
141
+ let result = "";
142
+ for (const ch of md) {
143
+ if (ch === "[")
144
+ depth++;
145
+ if (ch === "]")
146
+ depth = Math.max(0, depth - 1);
147
+ result += depth > 0 && ch === "\n" ? "\\\n" : ch;
148
+ }
149
+ return result;
150
+ }
151
+ function stripSkipLinks(md) {
152
+ return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
153
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mx-cloud",
3
- "version": "0.0.28",
3
+ "version": "0.0.31",
4
4
  "description": "mx cloud",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",
@@ -17,9 +17,16 @@
17
17
  "license": "AGPL-3.0-or-later",
18
18
  "dependencies": {
19
19
  "@cliqz/adblocker-playwright": "^1.31.3",
20
+ "cheerio": "^1.1.2",
20
21
  "cross-fetch": "^4.0.0",
21
22
  "joi": "^17.6.0",
23
+ "joplin-turndown-plugin-gfm": "^1.0.12",
22
24
  "nodemailer": "^6.10.0",
23
- "playwright-core": "^1.57.0"
25
+ "playwright-core": "^1.57.0",
26
+ "rimraf": "^6.1.2",
27
+ "turndown": "^7.2.0"
28
+ },
29
+ "devDependencies": {
30
+ "@types/turndown": "^5.0.6"
24
31
  }
25
32
  }