@jenslys/curldown 1.0.5 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/transform.js +39 -11
- package/package.json +2 -3
package/dist/transform.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { Readability } from "@mozilla/readability";
|
|
2
2
|
import { load } from "cheerio";
|
|
3
|
-
import { JSDOM } from "jsdom";
|
|
3
|
+
import { DOMParser } from "linkedom";
|
|
4
4
|
import { createRequire } from "node:module";
|
|
5
5
|
import TurndownService from "turndown";
|
|
6
6
|
import { DEFAULT_REMOVE_SELECTORS } from "./constants.js";
|
|
@@ -28,10 +28,36 @@ const COMPLEX_TABLE_CELL_SELECTOR = "ul, ol, blockquote, pre, h1, h2, h3, h4, h5
|
|
|
28
28
|
function getNormalizedTextLength(value) {
|
|
29
29
|
return value?.replace(/\s+/g, " ").trim().length ?? 0;
|
|
30
30
|
}
|
|
31
|
-
function cleanupFragmentHtml(html) {
|
|
31
|
+
function resolveUrl(value, baseUrl) {
|
|
32
|
+
if (!value) {
|
|
33
|
+
return undefined;
|
|
34
|
+
}
|
|
35
|
+
if (!baseUrl) {
|
|
36
|
+
return value;
|
|
37
|
+
}
|
|
38
|
+
try {
|
|
39
|
+
return new URL(value, baseUrl).toString();
|
|
40
|
+
}
|
|
41
|
+
catch {
|
|
42
|
+
return value;
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
function cleanupFragmentHtml(html, baseUrl) {
|
|
32
46
|
const $ = load(html);
|
|
33
47
|
$(DEFAULT_REMOVE_SELECTORS.join(",")).remove();
|
|
34
48
|
normalizeComplexTables($);
|
|
49
|
+
$("img").each((_, element) => {
|
|
50
|
+
const src = resolveUrl($(element).attr("src"), baseUrl);
|
|
51
|
+
if (src) {
|
|
52
|
+
$(element).attr("src", src);
|
|
53
|
+
}
|
|
54
|
+
});
|
|
55
|
+
$("a").each((_, element) => {
|
|
56
|
+
const href = resolveUrl($(element).attr("href"), baseUrl);
|
|
57
|
+
if (href) {
|
|
58
|
+
$(element).attr("href", href);
|
|
59
|
+
}
|
|
60
|
+
});
|
|
35
61
|
$("img").each((_, element) => {
|
|
36
62
|
const alt = $(element).attr("alt")?.trim() ?? "";
|
|
37
63
|
if (!alt) {
|
|
@@ -91,11 +117,11 @@ function selectReadabilityHtml(document) {
|
|
|
91
117
|
}
|
|
92
118
|
return article.content ?? undefined;
|
|
93
119
|
}
|
|
94
|
-
function toMarkdownCandidate(html) {
|
|
120
|
+
function toMarkdownCandidate(html, baseUrl) {
|
|
95
121
|
if (!html) {
|
|
96
122
|
return undefined;
|
|
97
123
|
}
|
|
98
|
-
const cleanedHtml = cleanupFragmentHtml(html);
|
|
124
|
+
const cleanedHtml = cleanupFragmentHtml(html, baseUrl);
|
|
99
125
|
if (cleanedHtml.trim().length === 0) {
|
|
100
126
|
return undefined;
|
|
101
127
|
}
|
|
@@ -111,6 +137,10 @@ function getFirstMeaningfulMarkdownLine(markdown) {
|
|
|
111
137
|
function startsWithPrimaryHeading(markdown) {
|
|
112
138
|
return /^#\s+\S/.test(getFirstMeaningfulMarkdownLine(markdown) ?? "");
|
|
113
139
|
}
|
|
140
|
+
function parseDocument(html) {
|
|
141
|
+
const normalizedHtml = /<html[\s>]/i.test(html) ? html : `<!doctype html><html>${html}</html>`;
|
|
142
|
+
return new DOMParser().parseFromString(normalizedHtml, "text/html");
|
|
143
|
+
}
|
|
114
144
|
/**
|
|
115
145
|
* Convert fetched HTML into markdown.
|
|
116
146
|
* The function prefers semantic primary-content containers, falls back to
|
|
@@ -118,13 +148,11 @@ function startsWithPrimaryHeading(markdown) {
|
|
|
118
148
|
* no stronger content signal exists.
|
|
119
149
|
*/
|
|
120
150
|
export function transformHtmlToMarkdown(input) {
|
|
121
|
-
const
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
const
|
|
125
|
-
const
|
|
126
|
-
const readabilityMarkdown = toMarkdownCandidate(selectReadabilityHtml(new JSDOM(input.html, { url: input.url ?? FALLBACK_BASE_URL }).window.document));
|
|
127
|
-
const fallbackMarkdown = toMarkdownCandidate(extractBodyHtml(document));
|
|
151
|
+
const baseUrl = input.url ?? FALLBACK_BASE_URL;
|
|
152
|
+
const document = parseDocument(input.html);
|
|
153
|
+
const semanticMarkdown = toMarkdownCandidate(selectSemanticPrimaryHtml(document), baseUrl);
|
|
154
|
+
const readabilityMarkdown = toMarkdownCandidate(selectReadabilityHtml(parseDocument(input.html)), baseUrl);
|
|
155
|
+
const fallbackMarkdown = toMarkdownCandidate(extractBodyHtml(document), baseUrl);
|
|
128
156
|
const markdown = semanticMarkdown && startsWithPrimaryHeading(semanticMarkdown) && !startsWithPrimaryHeading(readabilityMarkdown ?? "")
|
|
129
157
|
? semanticMarkdown
|
|
130
158
|
: readabilityMarkdown ?? semanticMarkdown ?? fallbackMarkdown;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@jenslys/curldown",
|
|
3
|
-
"version": "1.0.5",
|
|
3
|
+
"version": "1.0.6",
|
|
4
4
|
"description": "Fetch URL content and convert it to markdown.",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -33,12 +33,11 @@
|
|
|
33
33
|
"@mozilla/readability": "^0.6.0",
|
|
34
34
|
"cheerio": "^1.2.0",
|
|
35
35
|
"commander": "^14.0.3",
|
|
36
|
-
"
|
|
36
|
+
"linkedom": "^0.18.12",
|
|
37
37
|
"playwright": "^1.58.2",
|
|
38
38
|
"turndown": "^7.2.2"
|
|
39
39
|
},
|
|
40
40
|
"devDependencies": {
|
|
41
|
-
"@types/jsdom": "^28.0.1",
|
|
42
41
|
"@types/node": "^25.3.3",
|
|
43
42
|
"@types/turndown": "^5.0.6",
|
|
44
43
|
"typescript": "^5.9.3",
|