@jenslys/curldown 1.0.4 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/constants.js +10 -2
- package/dist/transform.js +25 -0
- package/package.json +1 -1
package/dist/constants.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
export const VERSION = "1.0.4";
|
|
1
|
+
export const VERSION = "1.0.5";
|
|
2
2
|
export const DEFAULT_STATIC_TIMEOUT_MS = 15_000;
|
|
3
3
|
export const DEFAULT_DYNAMIC_TIMEOUT_MS = 30_000;
|
|
4
4
|
export const DEFAULT_REMOVE_SELECTORS = [
|
|
@@ -8,5 +8,13 @@ export const DEFAULT_REMOVE_SELECTORS = [
|
|
|
8
8
|
"template",
|
|
9
9
|
"svg",
|
|
10
10
|
"canvas",
|
|
11
|
-
"iframe"
|
|
11
|
+
"iframe",
|
|
12
|
+
"wbr",
|
|
13
|
+
"button",
|
|
14
|
+
"input",
|
|
15
|
+
"select",
|
|
16
|
+
"textarea",
|
|
17
|
+
"[role='button']",
|
|
18
|
+
"[role='toolbar']",
|
|
19
|
+
"[role='separator']"
|
|
12
20
|
];
|
package/dist/transform.js
CHANGED
|
@@ -14,15 +14,24 @@ const turndown = new TurndownService({
|
|
|
14
14
|
emDelimiter: "_"
|
|
15
15
|
});
|
|
16
16
|
turndown.use(turndownPluginGfm.gfm);
|
|
17
|
+
const tableCellTurndown = new TurndownService({
|
|
18
|
+
headingStyle: "atx",
|
|
19
|
+
codeBlockStyle: "fenced",
|
|
20
|
+
bulletListMarker: "-",
|
|
21
|
+
emDelimiter: "_"
|
|
22
|
+
});
|
|
23
|
+
tableCellTurndown.use(turndownPluginGfm.gfm);
|
|
17
24
|
const FALLBACK_BASE_URL = "https://curldown.local/";
|
|
18
25
|
const PRIMARY_CONTENT_SELECTOR = "main, article, [role='main']";
|
|
19
26
|
const MIN_PRIMARY_CONTENT_TEXT_LENGTH = 200;
|
|
27
|
+
const COMPLEX_TABLE_CELL_SELECTOR = "ul, ol, blockquote, pre, h1, h2, h3, h4, h5, h6, hr";
|
|
20
28
|
function getNormalizedTextLength(value) {
|
|
21
29
|
return value?.replace(/\s+/g, " ").trim().length ?? 0;
|
|
22
30
|
}
|
|
23
31
|
function cleanupFragmentHtml(html) {
|
|
24
32
|
const $ = load(html);
|
|
25
33
|
$(DEFAULT_REMOVE_SELECTORS.join(",")).remove();
|
|
34
|
+
normalizeComplexTables($);
|
|
26
35
|
$("img").each((_, element) => {
|
|
27
36
|
const alt = $(element).attr("alt")?.trim() ?? "";
|
|
28
37
|
if (!alt) {
|
|
@@ -42,6 +51,22 @@ function cleanupFragmentHtml(html) {
|
|
|
42
51
|
});
|
|
43
52
|
return $.root().html() ?? "";
|
|
44
53
|
}
|
|
54
|
+
function normalizeComplexTables($) {
|
|
55
|
+
$("table")
|
|
56
|
+
.find(`th:has(${COMPLEX_TABLE_CELL_SELECTOR}), td:has(${COMPLEX_TABLE_CELL_SELECTOR})`)
|
|
57
|
+
.each((_, cell) => {
|
|
58
|
+
const markdown = normalizeTableCellMarkdown(tableCellTurndown.turndown($(cell).html() ?? ""));
|
|
59
|
+
$(cell).empty().text(markdown);
|
|
60
|
+
});
|
|
61
|
+
}
|
|
62
|
+
function normalizeTableCellMarkdown(markdown) {
|
|
63
|
+
return markdown
|
|
64
|
+
.replace(/\r\n/g, "\n")
|
|
65
|
+
.replace(/\n{3,}/g, "\n\n")
|
|
66
|
+
.replace(/^#{1,6}\s+/gm, "")
|
|
67
|
+
.replace(/^\s{0,3}>\s?/gm, "")
|
|
68
|
+
.trim();
|
|
69
|
+
}
|
|
45
70
|
function extractBodyHtml(document) {
|
|
46
71
|
return document.body?.innerHTML ?? document.documentElement?.innerHTML ?? "";
|
|
47
72
|
}
|