@jenslys/curldown 1.0.5 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/transform.js +39 -11
  2. package/package.json +2 -3
package/dist/transform.js CHANGED
@@ -1,6 +1,6 @@
1
1
  import { Readability } from "@mozilla/readability";
2
2
  import { load } from "cheerio";
3
- import { JSDOM } from "jsdom";
3
+ import { DOMParser } from "linkedom";
4
4
  import { createRequire } from "node:module";
5
5
  import TurndownService from "turndown";
6
6
  import { DEFAULT_REMOVE_SELECTORS } from "./constants.js";
@@ -28,10 +28,36 @@ const COMPLEX_TABLE_CELL_SELECTOR = "ul, ol, blockquote, pre, h1, h2, h3, h4, h5
28
28
  function getNormalizedTextLength(value) {
29
29
  return value?.replace(/\s+/g, " ").trim().length ?? 0;
30
30
  }
31
- function cleanupFragmentHtml(html) {
31
+ function resolveUrl(value, baseUrl) {
32
+ if (!value) {
33
+ return undefined;
34
+ }
35
+ if (!baseUrl) {
36
+ return value;
37
+ }
38
+ try {
39
+ return new URL(value, baseUrl).toString();
40
+ }
41
+ catch {
42
+ return value;
43
+ }
44
+ }
45
+ function cleanupFragmentHtml(html, baseUrl) {
32
46
  const $ = load(html);
33
47
  $(DEFAULT_REMOVE_SELECTORS.join(",")).remove();
34
48
  normalizeComplexTables($);
49
+ $("img").each((_, element) => {
50
+ const src = resolveUrl($(element).attr("src"), baseUrl);
51
+ if (src) {
52
+ $(element).attr("src", src);
53
+ }
54
+ });
55
+ $("a").each((_, element) => {
56
+ const href = resolveUrl($(element).attr("href"), baseUrl);
57
+ if (href) {
58
+ $(element).attr("href", href);
59
+ }
60
+ });
35
61
  $("img").each((_, element) => {
36
62
  const alt = $(element).attr("alt")?.trim() ?? "";
37
63
  if (!alt) {
@@ -91,11 +117,11 @@ function selectReadabilityHtml(document) {
91
117
  }
92
118
  return article.content ?? undefined;
93
119
  }
94
- function toMarkdownCandidate(html) {
120
+ function toMarkdownCandidate(html, baseUrl) {
95
121
  if (!html) {
96
122
  return undefined;
97
123
  }
98
- const cleanedHtml = cleanupFragmentHtml(html);
124
+ const cleanedHtml = cleanupFragmentHtml(html, baseUrl);
99
125
  if (cleanedHtml.trim().length === 0) {
100
126
  return undefined;
101
127
  }
@@ -111,6 +137,10 @@ function getFirstMeaningfulMarkdownLine(markdown) {
111
137
  function startsWithPrimaryHeading(markdown) {
112
138
  return /^#\s+\S/.test(getFirstMeaningfulMarkdownLine(markdown) ?? "");
113
139
  }
140
+ function parseDocument(html) {
141
+ const normalizedHtml = /<html[\s>]/i.test(html) ? html : `<!doctype html><html>${html}</html>`;
142
+ return new DOMParser().parseFromString(normalizedHtml, "text/html");
143
+ }
114
144
  /**
115
145
  * Convert fetched HTML into markdown.
116
146
  * The function prefers semantic primary-content containers, falls back to
@@ -118,13 +148,11 @@ function startsWithPrimaryHeading(markdown) {
118
148
  * no stronger content signal exists.
119
149
  */
120
150
  export function transformHtmlToMarkdown(input) {
121
- const dom = new JSDOM(input.html, {
122
- url: input.url ?? FALLBACK_BASE_URL
123
- });
124
- const { document } = dom.window;
125
- const semanticMarkdown = toMarkdownCandidate(selectSemanticPrimaryHtml(document));
126
- const readabilityMarkdown = toMarkdownCandidate(selectReadabilityHtml(new JSDOM(input.html, { url: input.url ?? FALLBACK_BASE_URL }).window.document));
127
- const fallbackMarkdown = toMarkdownCandidate(extractBodyHtml(document));
151
+ const baseUrl = input.url ?? FALLBACK_BASE_URL;
152
+ const document = parseDocument(input.html);
153
+ const semanticMarkdown = toMarkdownCandidate(selectSemanticPrimaryHtml(document), baseUrl);
154
+ const readabilityMarkdown = toMarkdownCandidate(selectReadabilityHtml(parseDocument(input.html)), baseUrl);
155
+ const fallbackMarkdown = toMarkdownCandidate(extractBodyHtml(document), baseUrl);
128
156
  const markdown = semanticMarkdown && startsWithPrimaryHeading(semanticMarkdown) && !startsWithPrimaryHeading(readabilityMarkdown ?? "")
129
157
  ? semanticMarkdown
130
158
  : readabilityMarkdown ?? semanticMarkdown ?? fallbackMarkdown;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@jenslys/curldown",
3
- "version": "1.0.5",
3
+ "version": "1.0.6",
4
4
  "description": "Fetch URL content and convert it to markdown.",
5
5
  "repository": {
6
6
  "type": "git",
@@ -33,12 +33,11 @@
33
33
  "@mozilla/readability": "^0.6.0",
34
34
  "cheerio": "^1.2.0",
35
35
  "commander": "^14.0.3",
36
- "jsdom": "^29.0.0",
36
+ "linkedom": "^0.18.12",
37
37
  "playwright": "^1.58.2",
38
38
  "turndown": "^7.2.2"
39
39
  },
40
40
  "devDependencies": {
41
- "@types/jsdom": "^28.0.1",
42
41
  "@types/node": "^25.3.3",
43
42
  "@types/turndown": "^5.0.6",
44
43
  "typescript": "^5.9.3",