mx-cloud 0.0.31 → 0.0.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1599,6 +1599,38 @@ class Interpreter extends events_1.EventEmitter {
1599
1599
  }
1600
1600
  const serializableOutput = {};
1601
1601
  const SCRAPE_TIMEOUT = 120000;
1602
+ if (formats.includes('screenshot-visible')) {
1603
+ try {
1604
+ const screenshotBuffer = yield page.screenshot({ fullPage: false, type: 'png' });
1605
+ if (screenshotBuffer && screenshotBuffer.length > 0) {
1606
+ yield this.options.binaryCallback({
1607
+ name: 'screenshot-visible',
1608
+ data: screenshotBuffer,
1609
+ mimeType: 'image/png'
1610
+ }, 'image/png');
1611
+ this.log('Visible screenshot captured', logger_1.Level.LOG);
1612
+ }
1613
+ }
1614
+ catch (error) {
1615
+ this.log(`Screenshot-visible failed: ${error.message}`, logger_1.Level.WARN);
1616
+ }
1617
+ }
1618
+ if (formats.includes('screenshot-fullpage')) {
1619
+ try {
1620
+ const screenshotBuffer = yield page.screenshot({ fullPage: true, type: 'png' });
1621
+ if (screenshotBuffer && screenshotBuffer.length > 0) {
1622
+ yield this.options.binaryCallback({
1623
+ name: 'screenshot-fullpage',
1624
+ data: screenshotBuffer,
1625
+ mimeType: 'image/png'
1626
+ }, 'image/png');
1627
+ this.log('Full page screenshot captured', logger_1.Level.LOG);
1628
+ }
1629
+ }
1630
+ catch (error) {
1631
+ this.log(`Screenshot-fullpage failed: ${error.message}`, logger_1.Level.WARN);
1632
+ }
1633
+ }
1602
1634
  if (formats.includes('text')) {
1603
1635
  try {
1604
1636
  const textPromise = page.evaluate(() => {
@@ -1625,15 +1657,16 @@ class Interpreter extends events_1.EventEmitter {
1625
1657
  const html = yield page.evaluate(() => {
1626
1658
  const selectors = [
1627
1659
  "script", "style", "link[rel='stylesheet']", "noscript", "meta",
1628
- "svg", "img", "picture", "source", "video", "audio", "iframe", "object", "embed"
1660
+ "iframe", "object", "embed"
1629
1661
  ];
1630
1662
  selectors.forEach(sel => {
1631
1663
  document.querySelectorAll(sel).forEach(e => e.remove());
1632
1664
  });
1633
1665
  const all = document.querySelectorAll("*");
1634
1666
  all.forEach(el => {
1635
- [...el.attributes].forEach(attr => {
1636
- if (attr.name.startsWith("on") || attr.name === "data-mx-id") {
1667
+ Array.from(el.attributes).forEach(attr => {
1668
+ const name = attr.name.toLowerCase();
1669
+ if (name.startsWith("on") || name === "data-mx-id" || name === "jsaction" || name === "jsname") {
1637
1670
  el.removeAttribute(attr.name);
1638
1671
  }
1639
1672
  });
@@ -1659,15 +1692,16 @@ class Interpreter extends events_1.EventEmitter {
1659
1692
  const htmlPromise = page.evaluate(() => {
1660
1693
  const selectors = [
1661
1694
  "script", "style", "link[rel='stylesheet']", "noscript", "meta",
1662
- "svg", "img", "picture", "source", "video", "audio", "iframe", "object", "embed"
1695
+ "iframe", "object", "embed"
1663
1696
  ];
1664
1697
  selectors.forEach(sel => {
1665
1698
  document.querySelectorAll(sel).forEach(e => e.remove());
1666
1699
  });
1667
1700
  const all = document.querySelectorAll("*");
1668
1701
  all.forEach(el => {
1669
- [...el.attributes].forEach(attr => {
1670
- if (attr.name.startsWith("on") || attr.name === "data-mx-id") {
1702
+ Array.from(el.attributes).forEach(attr => {
1703
+ const name = attr.name.toLowerCase();
1704
+ if (name.startsWith("on") || name === "data-mx-id" || name === "jsaction" || name === "jsname") {
1671
1705
  el.removeAttribute(attr.name);
1672
1706
  }
1673
1707
  });
@@ -1687,38 +1721,6 @@ class Interpreter extends events_1.EventEmitter {
1687
1721
  this.log(`HTML conversion failed: ${error.message}`, logger_1.Level.WARN);
1688
1722
  }
1689
1723
  }
1690
- if (formats.includes('screenshot-visible')) {
1691
- try {
1692
- const screenshotBuffer = yield page.screenshot({ fullPage: false, type: 'png' });
1693
- if (screenshotBuffer && screenshotBuffer.length > 0) {
1694
- yield this.options.binaryCallback({
1695
- name: 'screenshot-visible',
1696
- data: screenshotBuffer,
1697
- mimeType: 'image/png'
1698
- }, 'image/png');
1699
- this.log('Visible screenshot captured', logger_1.Level.LOG);
1700
- }
1701
- }
1702
- catch (error) {
1703
- this.log(`Screenshot-visible failed: ${error.message}`, logger_1.Level.WARN);
1704
- }
1705
- }
1706
- if (formats.includes('screenshot-fullpage')) {
1707
- try {
1708
- const screenshotBuffer = yield page.screenshot({ fullPage: true, type: 'png' });
1709
- if (screenshotBuffer && screenshotBuffer.length > 0) {
1710
- yield this.options.binaryCallback({
1711
- name: 'screenshot-fullpage',
1712
- data: screenshotBuffer,
1713
- mimeType: 'image/png'
1714
- }, 'image/png');
1715
- this.log('Full page screenshot captured', logger_1.Level.LOG);
1716
- }
1717
- }
1718
- catch (error) {
1719
- this.log(`Screenshot-fullpage failed: ${error.message}`, logger_1.Level.WARN);
1720
- }
1721
- }
1722
1724
  const hasSerializableOutput = Object.keys(serializableOutput).length > 0 &&
1723
1725
  Object.values(serializableOutput).some((arr) => Array.isArray(arr) && arr.length > 0);
1724
1726
  if (hasSerializableOutput) {
@@ -1,4 +1,37 @@
1
1
  "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
2
35
  var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
36
  function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
37
  return new (P || (P = Promise))(function (resolve, reject) {
@@ -8,20 +41,25 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
8
41
  step((generator = generator.apply(thisArg, _arguments || [])).next());
9
42
  });
10
43
  };
44
+ var __importDefault = (this && this.__importDefault) || function (mod) {
45
+ return (mod && mod.__esModule) ? mod : { "default": mod };
46
+ };
11
47
  Object.defineProperty(exports, "__esModule", { value: true });
12
48
  exports.parseMarkdown = parseMarkdown;
49
+ const turndown_1 = __importDefault(require("turndown"));
50
+ // @ts-ignore
51
+ const joplin_turndown_plugin_gfm_1 = require("joplin-turndown-plugin-gfm");
52
+ const cheerio = __importStar(require("cheerio"));
53
+ const url_1 = require("url");
13
54
  function parseMarkdown(html, baseUrl) {
14
55
  return __awaiter(this, void 0, void 0, function* () {
15
- const TurndownService = require("turndown");
16
- const { gfm } = require("joplin-turndown-plugin-gfm");
17
- const cheerio = require("cheerio");
18
- const { URL } = require("url");
19
56
  if (!html)
20
57
  return "";
21
58
  const tidiedHtml = tidyHtml(html);
22
- const t = new TurndownService({
59
+ const t = new turndown_1.default({
23
60
  headingStyle: "atx", // ensures #### instead of ------
24
61
  codeBlockStyle: "fenced",
62
+ bulletListMarker: "-",
25
63
  });
26
64
  // ---------------------------------------------
27
65
  // Proper ATX headings #### instead of underline-style
@@ -31,18 +69,20 @@ function parseMarkdown(html, baseUrl) {
31
69
  replacement: (content, node) => {
32
70
  const level = Number(node.nodeName.charAt(1));
33
71
  const clean = content.trim();
72
+ if (!clean)
73
+ return "";
34
74
  return `\n${"#".repeat(level)} ${clean}\n`;
35
75
  },
36
76
  });
37
77
  // ---------------------------------------------
38
- // Remove SVGs
78
+ // Remove SVGs and other typically noisy elements
39
79
  // ---------------------------------------------
40
80
  t.addRule("truncate-svg", {
41
- filter: "svg",
81
+ filter: (node) => node.nodeName.toLowerCase() === "svg",
42
82
  replacement: () => "",
43
83
  });
44
84
  // ---------------------------------------------
45
- // Improved paragraph cleanup
85
+ // Paragraph cleanup
46
86
  // ---------------------------------------------
47
87
  t.addRule("improved-paragraph", {
48
88
  filter: "p",
@@ -50,17 +90,17 @@ function parseMarkdown(html, baseUrl) {
50
90
  const trimmed = innerText.trim();
51
91
  if (!trimmed)
52
92
  return "";
53
- return `${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
93
+ return `\n\n${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
54
94
  },
55
95
  });
56
96
  // ---------------------------------------------
57
97
  // Inline link with fallback text
58
98
  // ---------------------------------------------
59
99
  t.addRule("inlineLink", {
60
- filter: (node, opts) => node.nodeName === "A" && node.getAttribute("href"),
100
+ filter: (node) => node.nodeName === "A" && node.getAttribute("href"),
61
101
  replacement: (content, node) => {
62
102
  var _a, _b;
63
- let text = content.trim();
103
+ let text = content.trim().replace(/\n+/g, " ");
64
104
  // Fallback: aria-label → title → domain
65
105
  if (!text) {
66
106
  text =
@@ -73,7 +113,7 @@ function parseMarkdown(html, baseUrl) {
73
113
  // relative → absolute
74
114
  if (baseUrl && isRelativeUrl(href)) {
75
115
  try {
76
- const u = new URL(href, baseUrl);
116
+ const u = new url_1.URL(href, baseUrl);
77
117
  href = u.toString();
78
118
  }
79
119
  catch (_c) { }
@@ -82,12 +122,33 @@ function parseMarkdown(html, baseUrl) {
82
122
  return `[${text}](${href})`;
83
123
  },
84
124
  });
85
- t.use(gfm);
125
+ // ---------------------------------------------
126
+ // Image handling
127
+ // ---------------------------------------------
128
+ t.addRule("images", {
129
+ filter: "img",
130
+ replacement: (_content, node) => {
131
+ var _a, _b, _c;
132
+ const alt = ((_a = node.getAttribute("alt")) === null || _a === void 0 ? void 0 : _a.trim()) || ((_b = node.getAttribute("title")) === null || _b === void 0 ? void 0 : _b.trim()) || "";
133
+ let src = ((_c = node.getAttribute("src")) === null || _c === void 0 ? void 0 : _c.trim()) || "";
134
+ if (!src)
135
+ return "";
136
+ if (baseUrl && isRelativeUrl(src)) {
137
+ try {
138
+ src = new url_1.URL(src, baseUrl).toString();
139
+ }
140
+ catch (_d) { }
141
+ }
142
+ return alt ? `![${alt}](${src})` : `[Image](${src})`;
143
+ },
144
+ });
145
+ t.use(joplin_turndown_plugin_gfm_1.gfm);
86
146
  // Convert HTML → Markdown
87
147
  try {
88
- let out = yield t.turndown(tidiedHtml);
148
+ let out = t.turndown(tidiedHtml);
89
149
  out = fixBrokenLinks(out);
90
150
  out = stripSkipLinks(out);
151
+ out = cleanupExtraWhitespace(out);
91
152
  return out.trim();
92
153
  }
93
154
  catch (err) {
@@ -100,11 +161,13 @@ function parseMarkdown(html, baseUrl) {
100
161
  // Helpers
101
162
  // -----------------------------------------------------
102
163
  function isRelativeUrl(url) {
103
- return !url.includes("://") && !url.startsWith("mailto:") && !url.startsWith("tel:");
164
+ if (!url)
165
+ return false;
166
+ return !url.includes("://") && !url.startsWith("mailto:") && !url.startsWith("tel:") && !url.startsWith("data:");
104
167
  }
105
168
  function getDomainFromUrl(url) {
106
169
  try {
107
- const u = new URL(url);
170
+ const u = new url_1.URL(url);
108
171
  return u.hostname.replace("www.", "");
109
172
  }
110
173
  catch (_a) {
@@ -112,29 +175,86 @@ function getDomainFromUrl(url) {
112
175
  }
113
176
  }
114
177
  function cleanUrl(u) {
115
- return u;
116
- }
117
- function cleanAttribute(attr) {
118
- return attr ? attr.replace(/(\n+\s*)+/g, "\n") : "";
178
+ return u.split("#")[0];
119
179
  }
120
180
  function tidyHtml(html) {
121
- const cheerio = require("cheerio");
122
181
  const $ = cheerio.load(html);
123
- const manuallyCleanedElements = [
124
- "script",
125
- "style",
126
- "iframe",
127
- "noscript",
128
- "meta",
129
- "link",
130
- "object",
131
- "embed",
132
- "canvas",
133
- "audio",
134
- "video",
182
+ const technicalElements = [
183
+ "script", "style", "iframe", "noscript", "meta", "link", "object",
184
+ "embed", "canvas", "audio", "video", "svg", "map", "area"
185
+ ];
186
+ technicalElements.forEach((tag) => $(tag).remove());
187
+ const noiseSelectors = [
188
+ "nav", "header", "footer", "aside",
189
+ ".nav", ".header", ".footer", ".sidebar", ".menu", ".ads", ".ad", ".advertisement",
190
+ "#nav", "#header", "#footer", "#sidebar", ".breadcrumb", ".social-share",
191
+ ".comments", ".popup", ".modal", ".cookie-banner", ".location-widget",
192
+ ".keyboard-shortcuts", ".skip-link", ".banner", ".top-bar", ".nav-bar",
193
+ '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
194
+ '[role="complementary"]', "#shortcut-menu", ".nav-sprite", ".a-header", ".a-footer",
195
+ ".gb_wa", ".gb_xa",
196
+ "#nav-belt", "#nav-main", "#nav-footer"
135
197
  ];
136
- manuallyCleanedElements.forEach((tag) => $(tag).remove());
137
- return $("body").html();
198
+ noiseSelectors.forEach((sel) => $(sel).remove());
199
+ const uiArtifacts = ["Undo", "Done", "Edit", "Viewed categories", "Dismiss", "Close", "View detail", "View more"];
200
+ $("button, span, a, div").each((_i, el) => {
201
+ const text = $(el).text().trim();
202
+ if (uiArtifacts.includes(text) && $(el).children().length === 0) {
203
+ $(el).remove();
204
+ }
205
+ });
206
+ const mainSelectors = ["main", "article", "#main-content", "#content", ".main", ".content", ".article", ".post-content", "[role='main']"];
207
+ let bestContent = null;
208
+ for (const selector of mainSelectors) {
209
+ const el = $(selector);
210
+ if (el.length > 0) {
211
+ let candidate = el.first();
212
+ let maxLen = candidate.text().length;
213
+ el.each((_idx, elem) => {
214
+ const len = $(elem).text().length;
215
+ if (len > maxLen) {
216
+ maxLen = len;
217
+ candidate = $(elem);
218
+ }
219
+ });
220
+ if (maxLen > 100) {
221
+ bestContent = candidate;
222
+ break;
223
+ }
224
+ }
225
+ }
226
+ let contentToProcess = bestContent || $("body");
227
+ contentToProcess.find("div, ul, section").each((_i, el) => {
228
+ const $el = $(el);
229
+ const children = $el.children();
230
+ if (children.length > 10) {
231
+ const tagCounts = {};
232
+ children.each((_idx, child) => {
233
+ const tag = child.tagName || child.name;
234
+ if (tag) {
235
+ tagCounts[tag] = (tagCounts[tag] || 0) + 1;
236
+ }
237
+ });
238
+ const dominantTag = Object.keys(tagCounts).find(tag => tagCounts[tag] > 15);
239
+ if (dominantTag && $el.text().length / children.length < 30) {
240
+ $el.remove();
241
+ }
242
+ }
243
+ });
244
+ contentToProcess.find("ul, ol").each((_i, el) => {
245
+ const $el = $(el);
246
+ const items = $el.children("li");
247
+ if (items.length > 40) {
248
+ items.slice(40).remove();
249
+ $el.append("<li>... (further items truncated for readability)</li>");
250
+ }
251
+ });
252
+ const title = $("title").text().trim() || $("h1").first().text().trim();
253
+ let resultHtml = contentToProcess.html() || "";
254
+ if (title && !resultHtml.includes(title)) {
255
+ resultHtml = `<h1>${title}</h1>\n${resultHtml}`;
256
+ }
257
+ return resultHtml;
138
258
  }
139
259
  function fixBrokenLinks(md) {
140
260
  let depth = 0;
@@ -151,3 +271,9 @@ function fixBrokenLinks(md) {
151
271
  function stripSkipLinks(md) {
152
272
  return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
153
273
  }
274
/**
 * Normalises whitespace in generated Markdown.
 *
 * - Removes spaces/tabs at the end of every line.
 * - Removes spaces/tabs at the start of every line after the first (this
 *   also flattens any indentation-based structure).
 * - Collapses runs of three or more newlines into exactly one blank line.
 *
 * The collapse runs last on purpose: stripping whitespace-only lines can
 * itself create new runs of consecutive newlines, which would otherwise
 * survive.
 *
 * @param {string} md - Markdown text.
 * @returns {string} The text with normalised whitespace.
 */
function cleanupExtraWhitespace(md) {
    return md
        .replace(/[ \t]+\n/g, "\n")
        .replace(/\n[ \t]+/g, "\n")
        .replace(/\n{3,}/g, "\n\n");
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mx-cloud",
3
- "version": "0.0.31",
3
+ "version": "0.0.33",
4
4
  "description": "mx cloud",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",
@@ -27,6 +27,7 @@
27
27
  "turndown": "^7.2.0"
28
28
  },
29
29
  "devDependencies": {
30
+ "@types/node": "^25.2.0",
30
31
  "@types/turndown": "^5.0.6"
31
32
  }
32
33
  }