mx-cloud 0.0.32 → 0.0.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1657,15 +1657,16 @@ class Interpreter extends events_1.EventEmitter {
1657
1657
  const html = yield page.evaluate(() => {
1658
1658
  const selectors = [
1659
1659
  "script", "style", "link[rel='stylesheet']", "noscript", "meta",
1660
- "svg", "img", "picture", "source", "video", "audio", "iframe", "object", "embed"
1660
+ "iframe", "object", "embed"
1661
1661
  ];
1662
1662
  selectors.forEach(sel => {
1663
1663
  document.querySelectorAll(sel).forEach(e => e.remove());
1664
1664
  });
1665
1665
  const all = document.querySelectorAll("*");
1666
1666
  all.forEach(el => {
1667
- [...el.attributes].forEach(attr => {
1668
- if (attr.name.startsWith("on") || attr.name === "data-mx-id") {
1667
+ Array.from(el.attributes).forEach(attr => {
1668
+ const name = attr.name.toLowerCase();
1669
+ if (name.startsWith("on") || name === "data-mx-id" || name === "jsaction" || name === "jsname") {
1669
1670
  el.removeAttribute(attr.name);
1670
1671
  }
1671
1672
  });
@@ -1691,15 +1692,16 @@ class Interpreter extends events_1.EventEmitter {
1691
1692
  const htmlPromise = page.evaluate(() => {
1692
1693
  const selectors = [
1693
1694
  "script", "style", "link[rel='stylesheet']", "noscript", "meta",
1694
- "svg", "img", "picture", "source", "video", "audio", "iframe", "object", "embed"
1695
+ "iframe", "object", "embed"
1695
1696
  ];
1696
1697
  selectors.forEach(sel => {
1697
1698
  document.querySelectorAll(sel).forEach(e => e.remove());
1698
1699
  });
1699
1700
  const all = document.querySelectorAll("*");
1700
1701
  all.forEach(el => {
1701
- [...el.attributes].forEach(attr => {
1702
- if (attr.name.startsWith("on") || attr.name === "data-mx-id") {
1702
+ Array.from(el.attributes).forEach(attr => {
1703
+ const name = attr.name.toLowerCase();
1704
+ if (name.startsWith("on") || name === "data-mx-id" || name === "jsaction" || name === "jsname") {
1703
1705
  el.removeAttribute(attr.name);
1704
1706
  }
1705
1707
  });
@@ -1,4 +1,37 @@
1
1
  "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
2
35
  var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
36
  function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
37
  return new (P || (P = Promise))(function (resolve, reject) {
@@ -8,20 +41,25 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
8
41
  step((generator = generator.apply(thisArg, _arguments || [])).next());
9
42
  });
10
43
  };
44
+ var __importDefault = (this && this.__importDefault) || function (mod) {
45
+ return (mod && mod.__esModule) ? mod : { "default": mod };
46
+ };
11
47
  Object.defineProperty(exports, "__esModule", { value: true });
12
48
  exports.parseMarkdown = parseMarkdown;
49
+ const turndown_1 = __importDefault(require("turndown"));
50
+ // @ts-ignore
51
+ const joplin_turndown_plugin_gfm_1 = require("joplin-turndown-plugin-gfm");
52
+ const cheerio = __importStar(require("cheerio"));
53
+ const url_1 = require("url");
13
54
  function parseMarkdown(html, baseUrl) {
14
55
  return __awaiter(this, void 0, void 0, function* () {
15
- const TurndownService = require("turndown");
16
- const { gfm } = require("joplin-turndown-plugin-gfm");
17
- const cheerio = require("cheerio");
18
- const { URL } = require("url");
19
56
  if (!html)
20
57
  return "";
21
58
  const tidiedHtml = tidyHtml(html);
22
- const t = new TurndownService({
59
+ const t = new turndown_1.default({
23
60
  headingStyle: "atx", // ensures #### instead of ------
24
61
  codeBlockStyle: "fenced",
62
+ bulletListMarker: "-",
25
63
  });
26
64
  // ---------------------------------------------
27
65
  // Proper ATX headings #### instead of underline-style
@@ -31,18 +69,20 @@ function parseMarkdown(html, baseUrl) {
31
69
  replacement: (content, node) => {
32
70
  const level = Number(node.nodeName.charAt(1));
33
71
  const clean = content.trim();
72
+ if (!clean)
73
+ return "";
34
74
  return `\n${"#".repeat(level)} ${clean}\n`;
35
75
  },
36
76
  });
37
77
  // ---------------------------------------------
38
- // Remove SVGs
78
+ // Remove SVGs and other typically noisy elements
39
79
  // ---------------------------------------------
40
80
  t.addRule("truncate-svg", {
41
- filter: "svg",
81
+ filter: (node) => node.nodeName.toLowerCase() === "svg",
42
82
  replacement: () => "",
43
83
  });
44
84
  // ---------------------------------------------
45
- // Improved paragraph cleanup
85
+ // Paragraph cleanup
46
86
  // ---------------------------------------------
47
87
  t.addRule("improved-paragraph", {
48
88
  filter: "p",
@@ -50,17 +90,17 @@ function parseMarkdown(html, baseUrl) {
50
90
  const trimmed = innerText.trim();
51
91
  if (!trimmed)
52
92
  return "";
53
- return `${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
93
+ return `\n\n${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
54
94
  },
55
95
  });
56
96
  // ---------------------------------------------
57
97
  // Inline link with fallback text
58
98
  // ---------------------------------------------
59
99
  t.addRule("inlineLink", {
60
- filter: (node, opts) => node.nodeName === "A" && node.getAttribute("href"),
100
+ filter: (node) => node.nodeName === "A" && node.getAttribute("href"),
61
101
  replacement: (content, node) => {
62
102
  var _a, _b;
63
- let text = content.trim();
103
+ let text = content.trim().replace(/\n+/g, " ");
64
104
  // Fallback: aria-label → title → domain
65
105
  if (!text) {
66
106
  text =
@@ -73,7 +113,7 @@ function parseMarkdown(html, baseUrl) {
73
113
  // relative → absolute
74
114
  if (baseUrl && isRelativeUrl(href)) {
75
115
  try {
76
- const u = new URL(href, baseUrl);
116
+ const u = new url_1.URL(href, baseUrl);
77
117
  href = u.toString();
78
118
  }
79
119
  catch (_c) { }
@@ -82,12 +122,33 @@ function parseMarkdown(html, baseUrl) {
82
122
  return `[${text}](${href})`;
83
123
  },
84
124
  });
85
- t.use(gfm);
125
+ // ---------------------------------------------
126
+ // Image handling
127
+ // ---------------------------------------------
128
+ t.addRule("images", {
129
+ filter: "img",
130
+ replacement: (_content, node) => {
131
+ var _a, _b, _c;
132
+ const alt = ((_a = node.getAttribute("alt")) === null || _a === void 0 ? void 0 : _a.trim()) || ((_b = node.getAttribute("title")) === null || _b === void 0 ? void 0 : _b.trim()) || "";
133
+ let src = ((_c = node.getAttribute("src")) === null || _c === void 0 ? void 0 : _c.trim()) || "";
134
+ if (!src)
135
+ return "";
136
+ if (baseUrl && isRelativeUrl(src)) {
137
+ try {
138
+ src = new url_1.URL(src, baseUrl).toString();
139
+ }
140
+ catch (_d) { }
141
+ }
142
+ return alt ? `![${alt}](${src})` : `[Image](${src})`;
143
+ },
144
+ });
145
+ t.use(joplin_turndown_plugin_gfm_1.gfm);
86
146
  // Convert HTML → Markdown
87
147
  try {
88
- let out = yield t.turndown(tidiedHtml);
148
+ let out = t.turndown(tidiedHtml);
89
149
  out = fixBrokenLinks(out);
90
150
  out = stripSkipLinks(out);
151
+ out = cleanupExtraWhitespace(out);
91
152
  return out.trim();
92
153
  }
93
154
  catch (err) {
@@ -100,11 +161,13 @@ function parseMarkdown(html, baseUrl) {
100
161
  // Helpers
101
162
  // -----------------------------------------------------
102
163
  function isRelativeUrl(url) {
103
- return !url.includes("://") && !url.startsWith("mailto:") && !url.startsWith("tel:");
164
+ if (!url)
165
+ return false;
166
+ return !url.includes("://") && !url.startsWith("mailto:") && !url.startsWith("tel:") && !url.startsWith("data:");
104
167
  }
105
168
  function getDomainFromUrl(url) {
106
169
  try {
107
- const u = new URL(url);
170
+ const u = new url_1.URL(url);
108
171
  return u.hostname.replace("www.", "");
109
172
  }
110
173
  catch (_a) {
@@ -112,29 +175,86 @@ function getDomainFromUrl(url) {
112
175
  }
113
176
  }
114
177
  function cleanUrl(u) {
115
- return u;
116
- }
117
- function cleanAttribute(attr) {
118
- return attr ? attr.replace(/(\n+\s*)+/g, "\n") : "";
178
+ return u.split("#")[0];
119
179
  }
120
180
  function tidyHtml(html) {
121
- const cheerio = require("cheerio");
122
181
  const $ = cheerio.load(html);
123
- const manuallyCleanedElements = [
124
- "script",
125
- "style",
126
- "iframe",
127
- "noscript",
128
- "meta",
129
- "link",
130
- "object",
131
- "embed",
132
- "canvas",
133
- "audio",
134
- "video",
182
+ const technicalElements = [
183
+ "script", "style", "iframe", "noscript", "meta", "link", "object",
184
+ "embed", "canvas", "audio", "video", "svg", "map", "area"
185
+ ];
186
+ technicalElements.forEach((tag) => $(tag).remove());
187
+ const noiseSelectors = [
188
+ "nav", "header", "footer", "aside",
189
+ ".nav", ".header", ".footer", ".sidebar", ".menu", ".ads", ".ad", ".advertisement",
190
+ "#nav", "#header", "#footer", "#sidebar", ".breadcrumb", ".social-share",
191
+ ".comments", ".popup", ".modal", ".cookie-banner", ".location-widget",
192
+ ".keyboard-shortcuts", ".skip-link", ".banner", ".top-bar", ".nav-bar",
193
+ '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
194
+ '[role="complementary"]', "#shortcut-menu", ".nav-sprite", ".a-header", ".a-footer",
195
+ ".gb_wa", ".gb_xa",
196
+ "#nav-belt", "#nav-main", "#nav-footer"
135
197
  ];
136
- manuallyCleanedElements.forEach((tag) => $(tag).remove());
137
- return $("body").html();
198
+ noiseSelectors.forEach((sel) => $(sel).remove());
199
+ const uiArtifacts = ["Undo", "Done", "Edit", "Viewed categories", "Dismiss", "Close", "View detail", "View more"];
200
+ $("button, span, a, div").each((_i, el) => {
201
+ const text = $(el).text().trim();
202
+ if (uiArtifacts.includes(text) && $(el).children().length === 0) {
203
+ $(el).remove();
204
+ }
205
+ });
206
+ const mainSelectors = ["main", "article", "#main-content", "#content", ".main", ".content", ".article", ".post-content", "[role='main']"];
207
+ let bestContent = null;
208
+ for (const selector of mainSelectors) {
209
+ const el = $(selector);
210
+ if (el.length > 0) {
211
+ let candidate = el.first();
212
+ let maxLen = candidate.text().length;
213
+ el.each((_idx, elem) => {
214
+ const len = $(elem).text().length;
215
+ if (len > maxLen) {
216
+ maxLen = len;
217
+ candidate = $(elem);
218
+ }
219
+ });
220
+ if (maxLen > 100) {
221
+ bestContent = candidate;
222
+ break;
223
+ }
224
+ }
225
+ }
226
+ let contentToProcess = bestContent || $("body");
227
+ contentToProcess.find("div, ul, section").each((_i, el) => {
228
+ const $el = $(el);
229
+ const children = $el.children();
230
+ if (children.length > 10) {
231
+ const tagCounts = {};
232
+ children.each((_idx, child) => {
233
+ const tag = child.tagName || child.name;
234
+ if (tag) {
235
+ tagCounts[tag] = (tagCounts[tag] || 0) + 1;
236
+ }
237
+ });
238
+ const dominantTag = Object.keys(tagCounts).find(tag => tagCounts[tag] > 15);
239
+ if (dominantTag && $el.text().length / children.length < 30) {
240
+ $el.remove();
241
+ }
242
+ }
243
+ });
244
+ contentToProcess.find("ul, ol").each((_i, el) => {
245
+ const $el = $(el);
246
+ const items = $el.children("li");
247
+ if (items.length > 40) {
248
+ items.slice(40).remove();
249
+ $el.append("<li>... (further items truncated for readability)</li>");
250
+ }
251
+ });
252
+ const title = $("title").text().trim() || $("h1").first().text().trim();
253
+ let resultHtml = contentToProcess.html() || "";
254
+ if (title && !resultHtml.includes(title)) {
255
+ resultHtml = `<h1>${title}</h1>\n${resultHtml}`;
256
+ }
257
+ return resultHtml;
138
258
  }
139
259
  function fixBrokenLinks(md) {
140
260
  let depth = 0;
@@ -151,3 +271,9 @@ function fixBrokenLinks(md) {
151
271
  function stripSkipLinks(md) {
152
272
  return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
153
273
  }
274
+ function cleanupExtraWhitespace(md) {
275
+ return md
276
+ .replace(/\n{3,}/g, "\n\n")
277
+ .replace(/[ \t]+\n/g, "\n")
278
+ .replace(/\n[ \t]+/g, "\n");
279
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mx-cloud",
3
- "version": "0.0.32",
3
+ "version": "0.0.33",
4
4
  "description": "mx cloud",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",
@@ -27,6 +27,7 @@
27
27
  "turndown": "^7.2.0"
28
28
  },
29
29
  "devDependencies": {
30
+ "@types/node": "^25.2.0",
30
31
  "@types/turndown": "^5.0.6"
31
32
  }
32
33
  }