mx-cloud 0.0.28 → 0.0.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -54,6 +54,7 @@ const concurrency_1 = __importDefault(require("./utils/concurrency"));
54
54
  const preprocessor_1 = __importDefault(require("./preprocessor"));
55
55
  const logger_1 = __importStar(require("./utils/logger"));
56
56
  const selector_1 = require("./selector");
57
+ const markdown_1 = require("./utils/markdown");
57
58
  /**
58
59
  * Class for running the Smart Workflows.
59
60
  */
@@ -922,14 +923,60 @@ class Interpreter extends events_1.EventEmitter {
922
923
  metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
923
924
  };
924
925
  });
925
- return {
926
+ const result = {
926
927
  metadata: Object.assign(Object.assign({}, pageData.metadata), { url: url, sourceURL: url }),
927
- html: pageData.html,
928
- text: pageData.text,
929
- links: pageData.links,
930
- wordCount: pageData.wordCount,
931
928
  scrapedAt: new Date().toISOString()
932
929
  };
930
+ const formats = crawlConfig.outputFormats || [];
931
+ if (formats.includes('text')) {
932
+ result.text = pageData.text;
933
+ result.wordCount = pageData.wordCount;
934
+ }
935
+ if (formats.includes('html')) {
936
+ result.html = pageData.html;
937
+ result.links = pageData.links;
938
+ }
939
+ if (formats.includes('markdown')) {
940
+ try {
941
+ const markdown = yield (0, markdown_1.parseMarkdown)(pageData.html, url);
942
+ result.markdown = markdown;
943
+ }
944
+ catch (err) {
945
+ this.log(`Markdown conversion failed for ${url}: ${err.message}`, logger_1.Level.WARN);
946
+ result.markdown = '';
947
+ }
948
+ }
949
+ if (formats.includes('screenshot-visible')) {
950
+ try {
951
+ const screenshotBuffer = yield page.screenshot({ fullPage: false });
952
+ const screenshotName = `Crawl - ${crawlResults.length} - Visible`;
953
+ yield this.options.binaryCallback({
954
+ name: screenshotName,
955
+ data: screenshotBuffer,
956
+ mimeType: 'image/png'
957
+ }, 'image/png');
958
+ result.screenshotVisible = screenshotName;
959
+ }
960
+ catch (err) {
961
+ this.log(`Screenshot-visible failed for ${url}: ${err.message}`, logger_1.Level.WARN);
962
+ }
963
+ }
964
+ if (formats.includes('screenshot-fullpage')) {
965
+ try {
966
+ const screenshotBuffer = yield page.screenshot({ fullPage: true });
967
+ const screenshotName = `Crawl - ${crawlResults.length} - Full Page`;
968
+ yield this.options.binaryCallback({
969
+ name: screenshotName,
970
+ data: screenshotBuffer,
971
+ mimeType: 'image/png'
972
+ }, 'image/png');
973
+ result.screenshotFullpage = screenshotName;
974
+ }
975
+ catch (err) {
976
+ this.log(`Screenshot-fullpage failed for ${url}: ${err.message}`, logger_1.Level.WARN);
977
+ }
978
+ }
979
+ return result;
933
980
  });
934
981
  const visitedUrls = new Set();
935
982
  const crawlResults = [];
@@ -1036,15 +1083,28 @@ class Interpreter extends events_1.EventEmitter {
1036
1083
  yield new Promise(resolve => setTimeout(resolve, robotRules.crawlDelay));
1037
1084
  }
1038
1085
  yield page.goto(url, {
1039
- waitUntil: 'domcontentloaded',
1040
- timeout: 30000
1086
+ waitUntil: 'load',
1087
+ timeout: 60000
1041
1088
  }).catch((err) => {
1042
1089
  throw new Error(`Navigation failed: ${err.message}`);
1043
1090
  });
1044
- yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
1091
+ yield page.waitForLoadState('networkidle', { timeout: 15000 }).catch(() => { });
1045
1092
  const pageResult = yield scrapePageContent(url);
1046
1093
  pageResult.metadata.depth = depth;
1047
1094
  crawlResults.push(pageResult);
1095
+ const actionType = "crawl";
1096
+ const actionName = "Crawl Results";
1097
+ if (!this.serializableDataByType[actionType]) {
1098
+ this.serializableDataByType[actionType] = {};
1099
+ }
1100
+ this.serializableDataByType[actionType][actionName] = [...crawlResults];
1101
+ yield this.options.serializableCallback({
1102
+ crawl: this.serializableDataByType.crawl
1103
+ });
1104
+ if (this.isAborted) {
1105
+ this.log(`Run aborted after scraping ${url}, stopping crawl`, logger_1.Level.WARN);
1106
+ break;
1107
+ }
1048
1108
  this.log(`✓ Scraped ${url} (${pageResult.wordCount} words, depth ${depth})`, logger_1.Level.LOG);
1049
1109
  if (crawlConfig.followLinks && depth < crawlConfig.maxDepth) {
1050
1110
  const newLinks = yield extractLinksFromPage();
@@ -1308,6 +1368,7 @@ class Interpreter extends events_1.EventEmitter {
1308
1368
  filters: searchConfig.filters || {},
1309
1369
  resultsCount: searchResults.length,
1310
1370
  results: searchResults,
1371
+ mode: searchConfig.mode,
1311
1372
  searchedAt: new Date().toISOString()
1312
1373
  };
1313
1374
  this.serializableDataByType[actionType][actionName] = searchData;
@@ -1323,16 +1384,25 @@ class Interpreter extends events_1.EventEmitter {
1323
1384
  this.log(`Starting to scrape content from ${searchResults.length} search results...`, logger_1.Level.LOG);
1324
1385
  const scrapedResults = [];
1325
1386
  for (let i = 0; i < searchResults.length; i++) {
1387
+ if (this.isAborted) {
1388
+ this.log(`Run aborted, stopping search scraping at result ${i + 1}/${searchResults.length}`, logger_1.Level.WARN);
1389
+ break;
1390
+ }
1326
1391
  const result = searchResults[i];
1327
1392
  try {
1328
1393
  this.log(`[${i + 1}/${searchResults.length}] Scraping: ${result.url}`, logger_1.Level.LOG);
1394
+ let navigationFailed = false;
1329
1395
  yield page.goto(result.url, {
1330
- waitUntil: 'domcontentloaded',
1331
- timeout: 30000
1396
+ waitUntil: 'load',
1397
+ timeout: 60000
1332
1398
  }).catch(() => {
1333
1399
  this.log(`Failed to navigate to ${result.url}, skipping...`, logger_1.Level.WARN);
1400
+ navigationFailed = true;
1334
1401
  });
1335
- yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
1402
+ if (navigationFailed) {
1403
+ continue;
1404
+ }
1405
+ yield page.waitForLoadState('networkidle', { timeout: 15000 }).catch(() => { });
1336
1406
  const pageData = yield page.evaluate(() => {
1337
1407
  var _a, _b;
1338
1408
  const getMeta = (name) => {
@@ -1368,7 +1438,7 @@ class Interpreter extends events_1.EventEmitter {
1368
1438
  metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
1369
1439
  };
1370
1440
  });
1371
- scrapedResults.push({
1441
+ const scrapedResult = {
1372
1442
  searchResult: {
1373
1443
  query: searchConfig.query,
1374
1444
  position: result.position,
@@ -1376,12 +1446,79 @@ class Interpreter extends events_1.EventEmitter {
1376
1446
  searchDescription: result.description,
1377
1447
  },
1378
1448
  metadata: Object.assign(Object.assign({}, pageData.metadata), { url: result.url, sourceURL: result.url }),
1379
- html: pageData.html,
1380
- text: pageData.text,
1381
- links: pageData.links,
1382
- wordCount: pageData.wordCount,
1383
1449
  scrapedAt: new Date().toISOString()
1450
+ };
1451
+ const formats = searchConfig.outputFormats || [];
1452
+ if (formats.includes('text')) {
1453
+ scrapedResult.text = pageData.text;
1454
+ scrapedResult.wordCount = pageData.wordCount;
1455
+ }
1456
+ if (formats.includes('html')) {
1457
+ scrapedResult.html = pageData.html;
1458
+ scrapedResult.links = pageData.links;
1459
+ }
1460
+ if (formats.includes('markdown')) {
1461
+ try {
1462
+ const markdown = yield (0, markdown_1.parseMarkdown)(pageData.html, result.url);
1463
+ scrapedResult.markdown = markdown;
1464
+ }
1465
+ catch (err) {
1466
+ this.log(`Markdown conversion failed for ${result.url}: ${err.message}`, logger_1.Level.WARN);
1467
+ scrapedResult.markdown = '';
1468
+ }
1469
+ }
1470
+ if (formats.includes('screenshot-visible')) {
1471
+ try {
1472
+ const screenshotBuffer = yield page.screenshot({ fullPage: false });
1473
+ const screenshotName = `Search - ${i} - Visible`;
1474
+ yield this.options.binaryCallback({
1475
+ name: screenshotName,
1476
+ data: screenshotBuffer,
1477
+ mimeType: 'image/png'
1478
+ }, 'image/png');
1479
+ scrapedResult.screenshotVisible = screenshotName;
1480
+ }
1481
+ catch (err) {
1482
+ this.log(`Screenshot-visible failed for ${result.url}: ${err.message}`, logger_1.Level.WARN);
1483
+ }
1484
+ }
1485
+ if (formats.includes('screenshot-fullpage')) {
1486
+ try {
1487
+ const screenshotBuffer = yield page.screenshot({ fullPage: true });
1488
+ const screenshotName = `Search - ${i} - Full Page`;
1489
+ yield this.options.binaryCallback({
1490
+ name: screenshotName,
1491
+ data: screenshotBuffer,
1492
+ mimeType: 'image/png'
1493
+ }, 'image/png');
1494
+ scrapedResult.screenshotFullpage = screenshotName;
1495
+ }
1496
+ catch (err) {
1497
+ this.log(`Screenshot-fullpage failed for ${result.url}: ${err.message}`, logger_1.Level.WARN);
1498
+ }
1499
+ }
1500
+ scrapedResults.push(scrapedResult);
1501
+ const actionType = "search";
1502
+ const actionName = "Search Results";
1503
+ if (!this.serializableDataByType[actionType]) {
1504
+ this.serializableDataByType[actionType] = {};
1505
+ }
1506
+ this.serializableDataByType[actionType][actionName] = {
1507
+ query: searchConfig.query,
1508
+ provider: searchConfig.provider,
1509
+ filters: searchConfig.filters || {},
1510
+ resultsCount: scrapedResults.length,
1511
+ results: scrapedResults,
1512
+ mode: searchConfig.mode,
1513
+ searchedAt: new Date().toISOString()
1514
+ };
1515
+ yield this.options.serializableCallback({
1516
+ search: this.serializableDataByType.search
1384
1517
  });
1518
+ if (this.isAborted) {
1519
+ this.log(`Run aborted after scraping ${result.url}, stopping search`, logger_1.Level.WARN);
1520
+ break;
1521
+ }
1385
1522
  this.log(`✓ Scraped ${result.url} (${pageData.wordCount} words)`, logger_1.Level.LOG);
1386
1523
  }
1387
1524
  catch (error) {
@@ -0,0 +1 @@
1
/**
 * Converts an HTML string to Markdown.
 *
 * @param html    HTML to convert. A falsy value (null, undefined, "") resolves to "".
 * @param baseUrl Optional base against which relative link targets are resolved;
 *                when absent, hrefs are emitted as found.
 * @returns A promise for the trimmed Markdown. If the HTML→Markdown conversion
 *          step throws, the implementation logs the error and resolves to "".
 */
+ export declare function parseMarkdown(html: string | null | undefined, baseUrl?: string | null): Promise<string>;
@@ -0,0 +1,153 @@
1
+ "use strict";
2
/**
 * Drives a generator function as if it were an async function.
 *
 * Each value the generator yields is adopted into a promise; its fulfilment is
 * fed back through `next`, its rejection through `throw`. The returned promise
 * settles with the generator's return value or with the first uncaught error.
 *
 * @param thisArg    `this` binding for the generator function.
 * @param _arguments Arguments passed to the generator function (defaults to none).
 * @param P          Promise constructor to use (defaults to the global Promise).
 * @param generator  The generator function to run.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    var PromiseImpl = P || Promise;
    // Wrap plain values so every yielded value can be chained with `.then`.
    var toPromise = function (value) {
        if (value instanceof PromiseImpl) {
            return value;
        }
        return new PromiseImpl(function (resolve) { resolve(value); });
    };
    return new PromiseImpl(function (resolve, reject) {
        var iterator = generator.apply(thisArg, _arguments || []);
        // Advance the generator via `next` or `throw`, then either finish or wait on the yielded value.
        var resume = function (method, input) {
            try {
                var state = iterator[method](input);
                if (state.done) {
                    resolve(state.value);
                    return;
                }
                toPromise(state.value).then(onFulfilled, onRejected);
            }
            catch (err) {
                reject(err);
            }
        };
        var onFulfilled = function (value) { resume("next", value); };
        var onRejected = function (reason) { resume("throw", reason); };
        resume("next");
    });
};
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.parseMarkdown = parseMarkdown;
13
/**
 * Converts an HTML string to Markdown using turndown plus the `gfm` plugin
 * from joplin-turndown-plugin-gfm.
 *
 * The HTML is first passed through `tidyHtml`, then converted with custom rules
 * for headings, SVGs, paragraphs and links, and finally post-processed with
 * `fixBrokenLinks` and `stripSkipLinks`.
 *
 * @param html    HTML to convert; a falsy value resolves to "".
 * @param baseUrl Optional base used to resolve relative link targets.
 * @returns Promise resolving to the trimmed Markdown, or "" when there is
 *          nothing to convert or the conversion step throws.
 */
function parseMarkdown(html, baseUrl) {
    return __awaiter(this, void 0, void 0, function* () {
        // Nothing to convert: answer before loading the conversion libraries.
        if (!html)
            return "";
        const TurndownService = require("turndown");
        const { gfm } = require("joplin-turndown-plugin-gfm");
        const { URL } = require("url");
        const tidiedHtml = tidyHtml(html);
        // An empty or missing body has no content; skip turndown instead of
        // letting it throw on a non-string and logging a spurious error.
        if (!tidiedHtml)
            return "";
        const t = new TurndownService({
            headingStyle: "atx", // ensures #### instead of ------
            codeBlockStyle: "fenced",
        });
        // ---------------------------------------------
        // Proper ATX headings #### instead of underline-style
        // ---------------------------------------------
        t.addRule("forceAtxHeadings", {
            filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
            replacement: (content, node) => {
                // nodeName is "H1".."H6"; the digit is the heading level.
                const level = Number(node.nodeName.charAt(1));
                const clean = content.trim();
                return `\n${"#".repeat(level)} ${clean}\n`;
            },
        });
        // ---------------------------------------------
        // Remove SVGs
        // ---------------------------------------------
        t.addRule("truncate-svg", {
            filter: "svg",
            replacement: () => "",
        });
        // ---------------------------------------------
        // Improved paragraph cleanup
        // ---------------------------------------------
        t.addRule("improved-paragraph", {
            filter: "p",
            replacement: (innerText) => {
                const trimmed = innerText.trim();
                if (!trimmed)
                    return "";
                // Collapse runs of blank lines inside a paragraph to a single blank line.
                return `${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
            },
        });
        // ---------------------------------------------
        // Inline link with fallback text
        // ---------------------------------------------
        t.addRule("inlineLink", {
            filter: (node, opts) => node.nodeName === "A" && node.getAttribute("href"),
            replacement: (content, node) => {
                const rawHref = node.getAttribute("href");
                let text = content.trim();
                // Fallback: aria-label → title → domain
                if (!text) {
                    const ariaLabel = node.getAttribute("aria-label");
                    const title = node.getAttribute("title");
                    text =
                        (ariaLabel && ariaLabel.trim()) ||
                            (title && title.trim()) ||
                            getDomainFromUrl(rawHref) ||
                            "link";
                }
                let href = rawHref.trim();
                // relative → absolute
                if (baseUrl && isRelativeUrl(href)) {
                    try {
                        href = new URL(href, baseUrl).toString();
                    }
                    catch (_c) {
                        // Unresolvable href: keep it exactly as written.
                    }
                }
                href = cleanUrl(href);
                return `[${text}](${href})`;
            },
        });
        t.use(gfm);
        // Convert HTML → Markdown
        try {
            let out = yield t.turndown(tidiedHtml);
            out = fixBrokenLinks(out);
            out = stripSkipLinks(out);
            return out.trim();
        }
        catch (err) {
            console.error("HTML→Markdown failed", { err });
            return "";
        }
    });
}
99
+ // -----------------------------------------------------
100
+ // Helpers
101
+ // -----------------------------------------------------
102
/**
 * Reports whether `url` is a relative reference, i.e. it does not begin with a
 * URI scheme such as `https:`, `mailto:` or `tel:`.
 *
 * Only the start of the string is inspected, so a relative path that merely
 * embeds an absolute URL in its query or fragment (e.g. `/r?to=https://x`) is
 * still reported as relative.
 *
 * @param {string} url Link target to classify.
 * @returns {boolean} true when `url` has no leading scheme.
 */
function isRelativeUrl(url) {
    // Scheme syntax: a letter, then letters / digits / "+" / "-" / ".", then ":".
    return !/^[a-z][a-z0-9+.-]*:/i.test(url);
}
105
/**
 * Extracts the hostname of an absolute URL, without a leading `www.` label.
 *
 * @param {string} url Absolute URL to inspect.
 * @returns {string|null} The hostname, or null when `url` cannot be parsed.
 */
function getDomainFromUrl(url) {
    try {
        // Anchor the match so only a leading "www." is removed, not one that
        // happens to occur inside another label (e.g. "awww.example.com").
        return new URL(url).hostname.replace(/^www\./, "");
    }
    catch (_a) {
        return null;
    }
}
114
/**
 * Returns the given URL unchanged; no cleaning is currently applied.
 *
 * @param {string} u URL to pass through.
 * @returns {string} The same value that was passed in.
 */
function cleanUrl(u) {
    const passthrough = u;
    return passthrough;
}
117
/**
 * Collapses each run of newlines (and the whitespace following them) in an
 * attribute value to a single newline.
 *
 * @param {string|null|undefined} attr Attribute value to normalise.
 * @returns {string} The normalised value, or "" for a falsy input.
 */
function cleanAttribute(attr) {
    if (!attr) {
        return "";
    }
    return attr.replace(/(\n+\s*)+/g, "\n");
}
120
/**
 * Loads `html` with cheerio, removes a fixed list of non-content elements
 * (scripts, styles, embeds, media, ...) and returns the inner HTML of <body>.
 *
 * @param {string} html HTML to clean.
 * @returns The inner HTML of the body element as reported by cheerio.
 */
function tidyHtml(html) {
    const cheerio = require("cheerio");
    const $ = cheerio.load(html);
    const unwantedTags = [
        "script",
        "style",
        "iframe",
        "noscript",
        "meta",
        "link",
        "object",
        "embed",
        "canvas",
        "audio",
        "video",
    ];
    for (const tagName of unwantedTags) {
        $(tagName).remove();
    }
    const body = $("body");
    return body.html();
}
139
/**
 * Escapes newlines that fall inside square brackets so that link text
 * spanning several lines stays a single Markdown link.
 *
 * Bracket nesting is tracked character by character. Characters preceded by a
 * backslash are copied through untouched and do not affect nesting: `\[` and
 * `\]` are literal brackets in Markdown, not link delimiters, so an unbalanced
 * literal bracket must not cause every later newline to be escaped.
 *
 * @param {string} md Markdown text.
 * @returns {string} Markdown with in-bracket newlines written as "\\\n".
 */
function fixBrokenLinks(md) {
    let depth = 0;
    let escaped = false;
    let result = "";
    for (const ch of md) {
        if (escaped) {
            // Second half of a backslash escape: copy verbatim.
            escaped = false;
            result += ch;
            continue;
        }
        if (ch === "\\") {
            escaped = true;
            result += ch;
            continue;
        }
        if (ch === "[") {
            depth++;
        }
        else if (ch === "]") {
            depth = Math.max(0, depth - 1);
        }
        result += depth > 0 && ch === "\n" ? "\\\n" : ch;
    }
    return result;
}
151
/**
 * Removes "[Skip to Content](#...)" accessibility links (matched
 * case-insensitively, fragment targets only) from Markdown.
 *
 * @param {string} md Markdown text.
 * @returns {string} The text with every such link removed.
 */
function stripSkipLinks(md) {
    const skipLinkPattern = /\[Skip to Content\]\(#[^\)]*\)/gi;
    const withoutSkipLinks = md.replace(skipLinkPattern, "");
    return withoutSkipLinks;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mx-cloud",
3
- "version": "0.0.28",
3
+ "version": "0.0.30",
4
4
  "description": "mx cloud",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",
@@ -17,9 +17,16 @@
17
17
  "license": "AGPL-3.0-or-later",
18
18
  "dependencies": {
19
19
  "@cliqz/adblocker-playwright": "^1.31.3",
20
+ "cheerio": "^1.1.2",
20
21
  "cross-fetch": "^4.0.0",
21
22
  "joi": "^17.6.0",
23
+ "joplin-turndown-plugin-gfm": "^1.0.12",
22
24
  "nodemailer": "^6.10.0",
23
- "playwright-core": "^1.57.0"
25
+ "playwright-core": "^1.57.0",
26
+ "rimraf": "^6.1.2",
27
+ "turndown": "^7.2.0"
28
+ },
29
+ "devDependencies": {
30
+ "@types/turndown": "^5.0.6"
24
31
  }
25
32
  }