@jenslys/curldown 1.0.4 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/constants.js +10 -2
- package/dist/transform.js +64 -11
- package/package.json +2 -3
package/dist/constants.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
export const VERSION = "1.0.
|
|
1
|
+
export const VERSION = "1.0.5";
|
|
2
2
|
export const DEFAULT_STATIC_TIMEOUT_MS = 15_000;
|
|
3
3
|
export const DEFAULT_DYNAMIC_TIMEOUT_MS = 30_000;
|
|
4
4
|
export const DEFAULT_REMOVE_SELECTORS = [
|
|
@@ -8,5 +8,13 @@ export const DEFAULT_REMOVE_SELECTORS = [
|
|
|
8
8
|
"template",
|
|
9
9
|
"svg",
|
|
10
10
|
"canvas",
|
|
11
|
-
"iframe"
|
|
11
|
+
"iframe",
|
|
12
|
+
"wbr",
|
|
13
|
+
"button",
|
|
14
|
+
"input",
|
|
15
|
+
"select",
|
|
16
|
+
"textarea",
|
|
17
|
+
"[role='button']",
|
|
18
|
+
"[role='toolbar']",
|
|
19
|
+
"[role='separator']"
|
|
12
20
|
];
|
package/dist/transform.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { Readability } from "@mozilla/readability";
|
|
2
2
|
import { load } from "cheerio";
|
|
3
|
-
import { JSDOM } from "jsdom";
|
|
3
|
+
import { DOMParser } from "linkedom";
|
|
4
4
|
import { createRequire } from "node:module";
|
|
5
5
|
import TurndownService from "turndown";
|
|
6
6
|
import { DEFAULT_REMOVE_SELECTORS } from "./constants.js";
|
|
@@ -14,15 +14,50 @@ const turndown = new TurndownService({
|
|
|
14
14
|
emDelimiter: "_"
|
|
15
15
|
});
|
|
16
16
|
turndown.use(turndownPluginGfm.gfm);
|
|
17
|
+
const tableCellTurndown = new TurndownService({
|
|
18
|
+
headingStyle: "atx",
|
|
19
|
+
codeBlockStyle: "fenced",
|
|
20
|
+
bulletListMarker: "-",
|
|
21
|
+
emDelimiter: "_"
|
|
22
|
+
});
|
|
23
|
+
tableCellTurndown.use(turndownPluginGfm.gfm);
|
|
17
24
|
const FALLBACK_BASE_URL = "https://curldown.local/";
|
|
18
25
|
const PRIMARY_CONTENT_SELECTOR = "main, article, [role='main']";
|
|
19
26
|
const MIN_PRIMARY_CONTENT_TEXT_LENGTH = 200;
|
|
27
|
+
const COMPLEX_TABLE_CELL_SELECTOR = "ul, ol, blockquote, pre, h1, h2, h3, h4, h5, h6, hr";
|
|
20
28
|
function getNormalizedTextLength(value) {
|
|
21
29
|
return value?.replace(/\s+/g, " ").trim().length ?? 0;
|
|
22
30
|
}
|
|
23
|
-
function cleanupFragmentHtml(html) {
|
|
31
|
+
function resolveUrl(value, baseUrl) {
|
|
32
|
+
if (!value) {
|
|
33
|
+
return undefined;
|
|
34
|
+
}
|
|
35
|
+
if (!baseUrl) {
|
|
36
|
+
return value;
|
|
37
|
+
}
|
|
38
|
+
try {
|
|
39
|
+
return new URL(value, baseUrl).toString();
|
|
40
|
+
}
|
|
41
|
+
catch {
|
|
42
|
+
return value;
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
function cleanupFragmentHtml(html, baseUrl) {
|
|
24
46
|
const $ = load(html);
|
|
25
47
|
$(DEFAULT_REMOVE_SELECTORS.join(",")).remove();
|
|
48
|
+
normalizeComplexTables($);
|
|
49
|
+
$("img").each((_, element) => {
|
|
50
|
+
const src = resolveUrl($(element).attr("src"), baseUrl);
|
|
51
|
+
if (src) {
|
|
52
|
+
$(element).attr("src", src);
|
|
53
|
+
}
|
|
54
|
+
});
|
|
55
|
+
$("a").each((_, element) => {
|
|
56
|
+
const href = resolveUrl($(element).attr("href"), baseUrl);
|
|
57
|
+
if (href) {
|
|
58
|
+
$(element).attr("href", href);
|
|
59
|
+
}
|
|
60
|
+
});
|
|
26
61
|
$("img").each((_, element) => {
|
|
27
62
|
const alt = $(element).attr("alt")?.trim() ?? "";
|
|
28
63
|
if (!alt) {
|
|
@@ -42,6 +77,22 @@ function cleanupFragmentHtml(html) {
|
|
|
42
77
|
});
|
|
43
78
|
return $.root().html() ?? "";
|
|
44
79
|
}
|
|
80
|
+
function normalizeComplexTables($) {
|
|
81
|
+
$("table")
|
|
82
|
+
.find(`th:has(${COMPLEX_TABLE_CELL_SELECTOR}), td:has(${COMPLEX_TABLE_CELL_SELECTOR})`)
|
|
83
|
+
.each((_, cell) => {
|
|
84
|
+
const markdown = normalizeTableCellMarkdown(tableCellTurndown.turndown($(cell).html() ?? ""));
|
|
85
|
+
$(cell).empty().text(markdown);
|
|
86
|
+
});
|
|
87
|
+
}
|
|
88
|
+
function normalizeTableCellMarkdown(markdown) {
|
|
89
|
+
return markdown
|
|
90
|
+
.replace(/\r\n/g, "\n")
|
|
91
|
+
.replace(/\n{3,}/g, "\n\n")
|
|
92
|
+
.replace(/^#{1,6}\s+/gm, "")
|
|
93
|
+
.replace(/^\s{0,3}>\s?/gm, "")
|
|
94
|
+
.trim();
|
|
95
|
+
}
|
|
45
96
|
function extractBodyHtml(document) {
|
|
46
97
|
return document.body?.innerHTML ?? document.documentElement?.innerHTML ?? "";
|
|
47
98
|
}
|
|
@@ -66,11 +117,11 @@ function selectReadabilityHtml(document) {
|
|
|
66
117
|
}
|
|
67
118
|
return article.content ?? undefined;
|
|
68
119
|
}
|
|
69
|
-
function toMarkdownCandidate(html) {
|
|
120
|
+
function toMarkdownCandidate(html, baseUrl) {
|
|
70
121
|
if (!html) {
|
|
71
122
|
return undefined;
|
|
72
123
|
}
|
|
73
|
-
const cleanedHtml = cleanupFragmentHtml(html);
|
|
124
|
+
const cleanedHtml = cleanupFragmentHtml(html, baseUrl);
|
|
74
125
|
if (cleanedHtml.trim().length === 0) {
|
|
75
126
|
return undefined;
|
|
76
127
|
}
|
|
@@ -86,6 +137,10 @@ function getFirstMeaningfulMarkdownLine(markdown) {
|
|
|
86
137
|
function startsWithPrimaryHeading(markdown) {
|
|
87
138
|
return /^#\s+\S/.test(getFirstMeaningfulMarkdownLine(markdown) ?? "");
|
|
88
139
|
}
|
|
140
|
+
function parseDocument(html) {
|
|
141
|
+
const normalizedHtml = /<html[\s>]/i.test(html) ? html : `<!doctype html><html>${html}</html>`;
|
|
142
|
+
return new DOMParser().parseFromString(normalizedHtml, "text/html");
|
|
143
|
+
}
|
|
89
144
|
/**
|
|
90
145
|
* Convert fetched HTML into markdown.
|
|
91
146
|
* The function prefers semantic primary-content containers, falls back to
|
|
@@ -93,13 +148,11 @@ function startsWithPrimaryHeading(markdown) {
|
|
|
93
148
|
* no stronger content signal exists.
|
|
94
149
|
*/
|
|
95
150
|
export function transformHtmlToMarkdown(input) {
|
|
96
|
-
const
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
const
|
|
100
|
-
const
|
|
101
|
-
const readabilityMarkdown = toMarkdownCandidate(selectReadabilityHtml(new JSDOM(input.html, { url: input.url ?? FALLBACK_BASE_URL }).window.document));
|
|
102
|
-
const fallbackMarkdown = toMarkdownCandidate(extractBodyHtml(document));
|
|
151
|
+
const baseUrl = input.url ?? FALLBACK_BASE_URL;
|
|
152
|
+
const document = parseDocument(input.html);
|
|
153
|
+
const semanticMarkdown = toMarkdownCandidate(selectSemanticPrimaryHtml(document), baseUrl);
|
|
154
|
+
const readabilityMarkdown = toMarkdownCandidate(selectReadabilityHtml(parseDocument(input.html)), baseUrl);
|
|
155
|
+
const fallbackMarkdown = toMarkdownCandidate(extractBodyHtml(document), baseUrl);
|
|
103
156
|
const markdown = semanticMarkdown && startsWithPrimaryHeading(semanticMarkdown) && !startsWithPrimaryHeading(readabilityMarkdown ?? "")
|
|
104
157
|
? semanticMarkdown
|
|
105
158
|
: readabilityMarkdown ?? semanticMarkdown ?? fallbackMarkdown;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@jenslys/curldown",
|
|
3
|
-
"version": "1.0.4",
|
|
3
|
+
"version": "1.0.6",
|
|
4
4
|
"description": "Fetch URL content and convert it to markdown.",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -33,12 +33,11 @@
|
|
|
33
33
|
"@mozilla/readability": "^0.6.0",
|
|
34
34
|
"cheerio": "^1.2.0",
|
|
35
35
|
"commander": "^14.0.3",
|
|
36
|
-
"
|
|
36
|
+
"linkedom": "^0.18.12",
|
|
37
37
|
"playwright": "^1.58.2",
|
|
38
38
|
"turndown": "^7.2.2"
|
|
39
39
|
},
|
|
40
40
|
"devDependencies": {
|
|
41
|
-
"@types/jsdom": "^28.0.1",
|
|
42
41
|
"@types/node": "^25.3.3",
|
|
43
42
|
"@types/turndown": "^5.0.6",
|
|
44
43
|
"typescript": "^5.9.3",
|