mx-cloud 0.0.32 → 0.0.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.js +8 -6
- package/build/utils/markdown.js +161 -35
- package/package.json +2 -1
package/build/interpret.js
CHANGED
|
@@ -1657,15 +1657,16 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1657
1657
|
const html = yield page.evaluate(() => {
|
|
1658
1658
|
const selectors = [
|
|
1659
1659
|
"script", "style", "link[rel='stylesheet']", "noscript", "meta",
|
|
1660
|
-
"
|
|
1660
|
+
"iframe", "object", "embed"
|
|
1661
1661
|
];
|
|
1662
1662
|
selectors.forEach(sel => {
|
|
1663
1663
|
document.querySelectorAll(sel).forEach(e => e.remove());
|
|
1664
1664
|
});
|
|
1665
1665
|
const all = document.querySelectorAll("*");
|
|
1666
1666
|
all.forEach(el => {
|
|
1667
|
-
|
|
1668
|
-
|
|
1667
|
+
Array.from(el.attributes).forEach(attr => {
|
|
1668
|
+
const name = attr.name.toLowerCase();
|
|
1669
|
+
if (name.startsWith("on") || name === "data-mx-id" || name === "jsaction" || name === "jsname") {
|
|
1669
1670
|
el.removeAttribute(attr.name);
|
|
1670
1671
|
}
|
|
1671
1672
|
});
|
|
@@ -1691,15 +1692,16 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1691
1692
|
const htmlPromise = page.evaluate(() => {
|
|
1692
1693
|
const selectors = [
|
|
1693
1694
|
"script", "style", "link[rel='stylesheet']", "noscript", "meta",
|
|
1694
|
-
"
|
|
1695
|
+
"iframe", "object", "embed"
|
|
1695
1696
|
];
|
|
1696
1697
|
selectors.forEach(sel => {
|
|
1697
1698
|
document.querySelectorAll(sel).forEach(e => e.remove());
|
|
1698
1699
|
});
|
|
1699
1700
|
const all = document.querySelectorAll("*");
|
|
1700
1701
|
all.forEach(el => {
|
|
1701
|
-
|
|
1702
|
-
|
|
1702
|
+
Array.from(el.attributes).forEach(attr => {
|
|
1703
|
+
const name = attr.name.toLowerCase();
|
|
1704
|
+
if (name.startsWith("on") || name === "data-mx-id" || name === "jsaction" || name === "jsname") {
|
|
1703
1705
|
el.removeAttribute(attr.name);
|
|
1704
1706
|
}
|
|
1705
1707
|
});
|
package/build/utils/markdown.js
CHANGED
|
@@ -1,4 +1,37 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
2
35
|
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
36
|
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
37
|
return new (P || (P = Promise))(function (resolve, reject) {
|
|
@@ -8,20 +41,25 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
8
41
|
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
42
|
});
|
|
10
43
|
};
|
|
44
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
45
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
46
|
+
};
|
|
11
47
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
48
|
exports.parseMarkdown = parseMarkdown;
|
|
49
|
+
const turndown_1 = __importDefault(require("turndown"));
|
|
50
|
+
// @ts-ignore
|
|
51
|
+
const joplin_turndown_plugin_gfm_1 = require("joplin-turndown-plugin-gfm");
|
|
52
|
+
const cheerio = __importStar(require("cheerio"));
|
|
53
|
+
const url_1 = require("url");
|
|
13
54
|
function parseMarkdown(html, baseUrl) {
|
|
14
55
|
return __awaiter(this, void 0, void 0, function* () {
|
|
15
|
-
const TurndownService = require("turndown");
|
|
16
|
-
const { gfm } = require("joplin-turndown-plugin-gfm");
|
|
17
|
-
const cheerio = require("cheerio");
|
|
18
|
-
const { URL } = require("url");
|
|
19
56
|
if (!html)
|
|
20
57
|
return "";
|
|
21
58
|
const tidiedHtml = tidyHtml(html);
|
|
22
|
-
const t = new TurndownService({
|
|
59
|
+
const t = new turndown_1.default({
|
|
23
60
|
headingStyle: "atx", // ensures #### instead of ------
|
|
24
61
|
codeBlockStyle: "fenced",
|
|
62
|
+
bulletListMarker: "-",
|
|
25
63
|
});
|
|
26
64
|
// ---------------------------------------------
|
|
27
65
|
// Proper ATX headings #### instead of underline-style
|
|
@@ -31,18 +69,20 @@ function parseMarkdown(html, baseUrl) {
|
|
|
31
69
|
replacement: (content, node) => {
|
|
32
70
|
const level = Number(node.nodeName.charAt(1));
|
|
33
71
|
const clean = content.trim();
|
|
72
|
+
if (!clean)
|
|
73
|
+
return "";
|
|
34
74
|
return `\n${"#".repeat(level)} ${clean}\n`;
|
|
35
75
|
},
|
|
36
76
|
});
|
|
37
77
|
// ---------------------------------------------
|
|
38
|
-
// Remove SVGs
|
|
78
|
+
// Remove SVGs and other typically noisy elements
|
|
39
79
|
// ---------------------------------------------
|
|
40
80
|
t.addRule("truncate-svg", {
|
|
41
|
-
filter: "svg",
|
|
81
|
+
filter: (node) => node.nodeName.toLowerCase() === "svg",
|
|
42
82
|
replacement: () => "",
|
|
43
83
|
});
|
|
44
84
|
// ---------------------------------------------
|
|
45
|
-
//
|
|
85
|
+
// Paragraph cleanup
|
|
46
86
|
// ---------------------------------------------
|
|
47
87
|
t.addRule("improved-paragraph", {
|
|
48
88
|
filter: "p",
|
|
@@ -50,17 +90,17 @@ function parseMarkdown(html, baseUrl) {
|
|
|
50
90
|
const trimmed = innerText.trim();
|
|
51
91
|
if (!trimmed)
|
|
52
92
|
return "";
|
|
53
|
-
return
|
|
93
|
+
return `\n\n${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
|
|
54
94
|
},
|
|
55
95
|
});
|
|
56
96
|
// ---------------------------------------------
|
|
57
97
|
// Inline link with fallback text
|
|
58
98
|
// ---------------------------------------------
|
|
59
99
|
t.addRule("inlineLink", {
|
|
60
|
-
filter: (node
|
|
100
|
+
filter: (node) => node.nodeName === "A" && node.getAttribute("href"),
|
|
61
101
|
replacement: (content, node) => {
|
|
62
102
|
var _a, _b;
|
|
63
|
-
let text = content.trim();
|
|
103
|
+
let text = content.trim().replace(/\n+/g, " ");
|
|
64
104
|
// Fallback: aria-label → title → domain
|
|
65
105
|
if (!text) {
|
|
66
106
|
text =
|
|
@@ -73,7 +113,7 @@ function parseMarkdown(html, baseUrl) {
|
|
|
73
113
|
// relative → absolute
|
|
74
114
|
if (baseUrl && isRelativeUrl(href)) {
|
|
75
115
|
try {
|
|
76
|
-
const u = new URL(href, baseUrl);
|
|
116
|
+
const u = new url_1.URL(href, baseUrl);
|
|
77
117
|
href = u.toString();
|
|
78
118
|
}
|
|
79
119
|
catch (_c) { }
|
|
@@ -82,12 +122,33 @@ function parseMarkdown(html, baseUrl) {
|
|
|
82
122
|
return `[${text}](${href})`;
|
|
83
123
|
},
|
|
84
124
|
});
|
|
85
|
-
|
|
125
|
+
// ---------------------------------------------
|
|
126
|
+
// Image handling
|
|
127
|
+
// ---------------------------------------------
|
|
128
|
+
t.addRule("images", {
|
|
129
|
+
filter: "img",
|
|
130
|
+
replacement: (_content, node) => {
|
|
131
|
+
var _a, _b, _c;
|
|
132
|
+
const alt = ((_a = node.getAttribute("alt")) === null || _a === void 0 ? void 0 : _a.trim()) || ((_b = node.getAttribute("title")) === null || _b === void 0 ? void 0 : _b.trim()) || "";
|
|
133
|
+
let src = ((_c = node.getAttribute("src")) === null || _c === void 0 ? void 0 : _c.trim()) || "";
|
|
134
|
+
if (!src)
|
|
135
|
+
return "";
|
|
136
|
+
if (baseUrl && isRelativeUrl(src)) {
|
|
137
|
+
try {
|
|
138
|
+
src = new url_1.URL(src, baseUrl).toString();
|
|
139
|
+
}
|
|
140
|
+
catch (_d) { }
|
|
141
|
+
}
|
|
142
|
+
return alt ? `![${alt}](${src})` : `[Image](${src})`;
|
|
143
|
+
},
|
|
144
|
+
});
|
|
145
|
+
t.use(joplin_turndown_plugin_gfm_1.gfm);
|
|
86
146
|
// Convert HTML → Markdown
|
|
87
147
|
try {
|
|
88
|
-
let out =
|
|
148
|
+
let out = t.turndown(tidiedHtml);
|
|
89
149
|
out = fixBrokenLinks(out);
|
|
90
150
|
out = stripSkipLinks(out);
|
|
151
|
+
out = cleanupExtraWhitespace(out);
|
|
91
152
|
return out.trim();
|
|
92
153
|
}
|
|
93
154
|
catch (err) {
|
|
@@ -100,11 +161,13 @@ function parseMarkdown(html, baseUrl) {
|
|
|
100
161
|
// Helpers
|
|
101
162
|
// -----------------------------------------------------
|
|
102
163
|
/**
 * Report whether `url` lacks a scheme and therefore has to be resolved
 * against a base URL before it can be used on its own.
 *
 * Path-relative ("a/b.png"), root-relative ("/a"), fragment-only ("#top") and
 * protocol-relative ("//cdn.example.com/x") references all count as relative.
 * Anything that starts with a URI scheme ("https:", "mailto:", "tel:",
 * "data:", ...) is absolute, regardless of letter case.
 *
 * @param {string} url - The raw href/src attribute value.
 * @returns {boolean} `true` when the value should be resolved against a base
 *   URL; `false` for absolute URLs and for empty/missing input.
 */
function isRelativeUrl(url) {
    if (!url)
        return false;
    // RFC 3986 scheme: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) followed by ":".
    // Anchoring at the start of the string matters: a plain search for "://"
    // would misclassify relative URLs that merely carry an absolute URL in
    // their query or path, e.g. "/redirect?to=https://example.com".
    return !/^[a-z][a-z\d+.-]*:/i.test(url.trim());
}
|
|
105
168
|
function getDomainFromUrl(url) {
|
|
106
169
|
try {
|
|
107
|
-
const u = new URL(url);
|
|
170
|
+
const u = new url_1.URL(url);
|
|
108
171
|
return u.hostname.replace("www.", "");
|
|
109
172
|
}
|
|
110
173
|
catch (_a) {
|
|
@@ -112,29 +175,86 @@ function getDomainFromUrl(url) {
|
|
|
112
175
|
}
|
|
113
176
|
}
|
|
114
177
|
/**
 * Drop the fragment part of a URL: everything from the first "#" onwards.
 *
 * @param {string} u - The URL to clean.
 * @returns {string} The URL without its fragment. Non-string input (for
 *   example a missing attribute) yields "" instead of throwing.
 */
function cleanUrl(u) {
    if (typeof u !== "string")
        return "";
    const hashAt = u.indexOf("#");
    return hashAt === -1 ? u : u.slice(0, hashAt);
}
|
|
120
180
|
/**
 * Strip boilerplate from an HTML document and return the inner HTML of the
 * element that most likely holds the main content.
 *
 * Steps, in order:
 *   1. remove non-textual elements (scripts, styles, embedded media, ...);
 *   2. remove page chrome matched by tag, class, id or ARIA role;
 *   3. remove leaf button/span/a/div elements whose entire text is a known
 *      UI label ("Undo", "Close", ...);
 *   4. pick the content root: the first selector in `mainSelectors` whose
 *      longest match has more than 100 characters of text, else `<body>`;
 *   5. inside that root, drop containers that look like link farms
 *      (> 10 children, one tag used > 15 times, < 30 chars of text per child);
 *   6. truncate lists to their first 40 items;
 *   7. prepend the page title as an `<h1>` when the content does not already
 *      contain it.
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {string} Cleaned HTML fragment ("" when nothing is left).
 */
function tidyHtml(html) {
    const $ = cheerio.load(html);
    // The title is read back as decoded text, so it must be re-escaped before
    // being interpolated into markup.
    const escapeHtmlText = (text) => text
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;");
    const technicalElements = [
        "script", "style", "iframe", "noscript", "meta", "link", "object",
        "embed", "canvas", "audio", "video", "svg", "map", "area"
    ];
    technicalElements.forEach((tag) => $(tag).remove());
    const noiseSelectors = [
        "nav", "header", "footer", "aside",
        ".nav", ".header", ".footer", ".sidebar", ".menu", ".ads", ".ad", ".advertisement",
        "#nav", "#header", "#footer", "#sidebar", ".breadcrumb", ".social-share",
        ".comments", ".popup", ".modal", ".cookie-banner", ".location-widget",
        ".keyboard-shortcuts", ".skip-link", ".banner", ".top-bar", ".nav-bar",
        '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
        '[role="complementary"]', "#shortcut-menu", ".nav-sprite", ".a-header", ".a-footer",
        ".gb_wa", ".gb_xa",
        "#nav-belt", "#nav-main", "#nav-footer"
    ];
    noiseSelectors.forEach((sel) => $(sel).remove());
    // Only leaf elements are removed, so a container that merely includes one
    // of these labels among other content is left alone.
    const uiArtifacts = new Set(["Undo", "Done", "Edit", "Viewed categories", "Dismiss", "Close", "View detail", "View more"]);
    $("button, span, a, div").each((_i, el) => {
        const $el = $(el);
        if ($el.children().length === 0 && uiArtifacts.has($el.text().trim())) {
            $el.remove();
        }
    });
    // Selectors are tried in priority order; within one selector the match
    // with the most text wins.
    const mainSelectors = ["main", "article", "#main-content", "#content", ".main", ".content", ".article", ".post-content", "[role='main']"];
    let bestContent = null;
    for (const selector of mainSelectors) {
        const matches = $(selector);
        if (matches.length === 0) {
            continue;
        }
        let candidate = matches.first();
        let maxLen = candidate.text().length;
        matches.each((_idx, elem) => {
            const len = $(elem).text().length;
            if (len > maxLen) {
                maxLen = len;
                candidate = $(elem);
            }
        });
        // Ignore near-empty wrappers and fall through to the next selector.
        if (maxLen > 100) {
            bestContent = candidate;
            break;
        }
    }
    const contentToProcess = bestContent || $("body");
    // Many short, same-tag children is the signature of menus and tag clouds.
    contentToProcess.find("div, ul, section").each((_i, el) => {
        const $el = $(el);
        const children = $el.children();
        if (children.length <= 10) {
            return;
        }
        const tagCounts = {};
        children.each((_idx, child) => {
            const tag = child.tagName || child.name;
            if (tag) {
                tagCounts[tag] = (tagCounts[tag] || 0) + 1;
            }
        });
        const dominantTag = Object.keys(tagCounts).find((tag) => tagCounts[tag] > 15);
        if (dominantTag && $el.text().length / children.length < 30) {
            $el.remove();
        }
    });
    contentToProcess.find("ul, ol").each((_i, el) => {
        const $el = $(el);
        const items = $el.children("li");
        if (items.length > 40) {
            items.slice(40).remove();
            $el.append("<li>... (further items truncated for readability)</li>");
        }
    });
    // Use the first <title> only, so stray <title> elements elsewhere in the
    // document are not concatenated into the heading.
    const title = $("title").first().text().trim() || $("h1").first().text().trim();
    let resultHtml = contentToProcess.html() || "";
    // Compare text with text: the serialized HTML is entity-encoded, so a
    // title such as "Tom & Jerry" would never be found in it and would be
    // prepended a second time.
    if (title && !contentToProcess.text().includes(title)) {
        resultHtml = `<h1>${escapeHtmlText(title)}</h1>\n${resultHtml}`;
    }
    return resultHtml;
}
|
|
139
259
|
function fixBrokenLinks(md) {
|
|
140
260
|
let depth = 0;
|
|
@@ -151,3 +271,9 @@ function fixBrokenLinks(md) {
|
|
|
151
271
|
/**
 * Remove accessibility "skip" links from converted Markdown.
 *
 * Matches `[Skip to content](#anchor)` and `[Skip to main content](#anchor)`,
 * case-insensitively. The target may also be empty, which is what remains
 * when the fragment has been stripped from the href before conversion.
 * Links with the same label but a non-fragment target are left untouched.
 *
 * @param {string} md - Markdown text.
 * @returns {string} The Markdown without skip links.
 */
function stripSkipLinks(md) {
    return md.replace(/\[Skip to (?:main )?content\]\((?:#[^)]*)?\)/gi, "");
}
|
|
274
|
+
/**
 * Normalize whitespace in converted Markdown: strip spaces/tabs at the end
 * and at the start of every line, then collapse runs of three or more
 * newlines into a single blank line.
 *
 * Blank-line collapsing runs last on purpose: stripping whitespace-only lines
 * turns "\n \n \n" into "\n\n\n", so collapsing first would leave those runs
 * in the output.
 *
 * NOTE(review): stripping leading whitespace also flattens nested-list
 * indentation and de-indents code inside fenced blocks — confirm that this is
 * acceptable for the consumers of this Markdown.
 *
 * @param {string} md - Markdown text.
 * @returns {string} The Markdown with normalized whitespace.
 */
function cleanupExtraWhitespace(md) {
    return md
        .replace(/[ \t]+\n/g, "\n")
        .replace(/\n[ \t]+/g, "\n")
        .replace(/\n{3,}/g, "\n\n");
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mx-cloud",
|
|
3
|
-
"version": "0.0.32",
|
|
3
|
+
"version": "0.0.33",
|
|
4
4
|
"description": "mx cloud",
|
|
5
5
|
"main": "build/index.js",
|
|
6
6
|
"typings": "build/index.d.ts",
|
|
@@ -27,6 +27,7 @@
|
|
|
27
27
|
"turndown": "^7.2.0"
|
|
28
28
|
},
|
|
29
29
|
"devDependencies": {
|
|
30
|
+
"@types/node": "^25.2.0",
|
|
30
31
|
"@types/turndown": "^5.0.6"
|
|
31
32
|
}
|
|
32
33
|
}
|