@jenslys/curldown 1.0.4 → 1.0.6

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
package/dist/constants.js CHANGED
@@ -1,4 +1,4 @@
1
- export const VERSION = "1.0.1";
1
+ export const VERSION = "1.0.5";
2
2
  export const DEFAULT_STATIC_TIMEOUT_MS = 15_000;
3
3
  export const DEFAULT_DYNAMIC_TIMEOUT_MS = 30_000;
4
4
  export const DEFAULT_REMOVE_SELECTORS = [
@@ -8,5 +8,13 @@ export const DEFAULT_REMOVE_SELECTORS = [
8
8
  "template",
9
9
  "svg",
10
10
  "canvas",
11
- "iframe"
11
+ "iframe",
12
+ "wbr",
13
+ "button",
14
+ "input",
15
+ "select",
16
+ "textarea",
17
+ "[role='button']",
18
+ "[role='toolbar']",
19
+ "[role='separator']"
12
20
  ];
package/dist/transform.js CHANGED
@@ -1,6 +1,6 @@
1
1
  import { Readability } from "@mozilla/readability";
2
2
  import { load } from "cheerio";
3
- import { JSDOM } from "jsdom";
3
+ import { DOMParser } from "linkedom";
4
4
  import { createRequire } from "node:module";
5
5
  import TurndownService from "turndown";
6
6
  import { DEFAULT_REMOVE_SELECTORS } from "./constants.js";
@@ -14,15 +14,50 @@ const turndown = new TurndownService({
14
14
  emDelimiter: "_"
15
15
  });
16
16
  turndown.use(turndownPluginGfm.gfm);
17
+ const tableCellTurndown = new TurndownService({
18
+ headingStyle: "atx",
19
+ codeBlockStyle: "fenced",
20
+ bulletListMarker: "-",
21
+ emDelimiter: "_"
22
+ });
23
+ tableCellTurndown.use(turndownPluginGfm.gfm);
17
24
  const FALLBACK_BASE_URL = "https://curldown.local/";
18
25
  const PRIMARY_CONTENT_SELECTOR = "main, article, [role='main']";
19
26
  const MIN_PRIMARY_CONTENT_TEXT_LENGTH = 200;
27
+ const COMPLEX_TABLE_CELL_SELECTOR = "ul, ol, blockquote, pre, h1, h2, h3, h4, h5, h6, hr";
20
28
  function getNormalizedTextLength(value) {
21
29
  return value?.replace(/\s+/g, " ").trim().length ?? 0;
22
30
  }
23
- function cleanupFragmentHtml(html) {
31
+ function resolveUrl(value, baseUrl) {
32
+ if (!value) {
33
+ return undefined;
34
+ }
35
+ if (!baseUrl) {
36
+ return value;
37
+ }
38
+ try {
39
+ return new URL(value, baseUrl).toString();
40
+ }
41
+ catch {
42
+ return value;
43
+ }
44
+ }
45
+ function cleanupFragmentHtml(html, baseUrl) {
24
46
  const $ = load(html);
25
47
  $(DEFAULT_REMOVE_SELECTORS.join(",")).remove();
48
+ normalizeComplexTables($);
49
+ $("img").each((_, element) => {
50
+ const src = resolveUrl($(element).attr("src"), baseUrl);
51
+ if (src) {
52
+ $(element).attr("src", src);
53
+ }
54
+ });
55
+ $("a").each((_, element) => {
56
+ const href = resolveUrl($(element).attr("href"), baseUrl);
57
+ if (href) {
58
+ $(element).attr("href", href);
59
+ }
60
+ });
26
61
  $("img").each((_, element) => {
27
62
  const alt = $(element).attr("alt")?.trim() ?? "";
28
63
  if (!alt) {
@@ -42,6 +77,22 @@ function cleanupFragmentHtml(html) {
42
77
  });
43
78
  return $.root().html() ?? "";
44
79
  }
80
+ function normalizeComplexTables($) {
81
+ $("table")
82
+ .find(`th:has(${COMPLEX_TABLE_CELL_SELECTOR}), td:has(${COMPLEX_TABLE_CELL_SELECTOR})`)
83
+ .each((_, cell) => {
84
+ const markdown = normalizeTableCellMarkdown(tableCellTurndown.turndown($(cell).html() ?? ""));
85
+ $(cell).empty().text(markdown);
86
+ });
87
+ }
88
+ function normalizeTableCellMarkdown(markdown) {
89
+ return markdown
90
+ .replace(/\r\n/g, "\n")
91
+ .replace(/\n{3,}/g, "\n\n")
92
+ .replace(/^#{1,6}\s+/gm, "")
93
+ .replace(/^\s{0,3}>\s?/gm, "")
94
+ .trim();
95
+ }
45
96
  function extractBodyHtml(document) {
46
97
  return document.body?.innerHTML ?? document.documentElement?.innerHTML ?? "";
47
98
  }
@@ -66,11 +117,11 @@ function selectReadabilityHtml(document) {
66
117
  }
67
118
  return article.content ?? undefined;
68
119
  }
69
- function toMarkdownCandidate(html) {
120
+ function toMarkdownCandidate(html, baseUrl) {
70
121
  if (!html) {
71
122
  return undefined;
72
123
  }
73
- const cleanedHtml = cleanupFragmentHtml(html);
124
+ const cleanedHtml = cleanupFragmentHtml(html, baseUrl);
74
125
  if (cleanedHtml.trim().length === 0) {
75
126
  return undefined;
76
127
  }
@@ -86,6 +137,10 @@ function getFirstMeaningfulMarkdownLine(markdown) {
86
137
  function startsWithPrimaryHeading(markdown) {
87
138
  return /^#\s+\S/.test(getFirstMeaningfulMarkdownLine(markdown) ?? "");
88
139
  }
140
+ function parseDocument(html) {
141
+ const normalizedHtml = /<html[\s>]/i.test(html) ? html : `<!doctype html><html>${html}</html>`;
142
+ return new DOMParser().parseFromString(normalizedHtml, "text/html");
143
+ }
89
144
  /**
90
145
  * Convert fetched HTML into markdown.
91
146
  * The function prefers semantic primary-content containers, falls back to
@@ -93,13 +148,11 @@ function startsWithPrimaryHeading(markdown) {
93
148
  * no stronger content signal exists.
94
149
  */
95
150
  export function transformHtmlToMarkdown(input) {
96
- const dom = new JSDOM(input.html, {
97
- url: input.url ?? FALLBACK_BASE_URL
98
- });
99
- const { document } = dom.window;
100
- const semanticMarkdown = toMarkdownCandidate(selectSemanticPrimaryHtml(document));
101
- const readabilityMarkdown = toMarkdownCandidate(selectReadabilityHtml(new JSDOM(input.html, { url: input.url ?? FALLBACK_BASE_URL }).window.document));
102
- const fallbackMarkdown = toMarkdownCandidate(extractBodyHtml(document));
151
+ const baseUrl = input.url ?? FALLBACK_BASE_URL;
152
+ const document = parseDocument(input.html);
153
+ const semanticMarkdown = toMarkdownCandidate(selectSemanticPrimaryHtml(document), baseUrl);
154
+ const readabilityMarkdown = toMarkdownCandidate(selectReadabilityHtml(parseDocument(input.html)), baseUrl);
155
+ const fallbackMarkdown = toMarkdownCandidate(extractBodyHtml(document), baseUrl);
103
156
  const markdown = semanticMarkdown && startsWithPrimaryHeading(semanticMarkdown) && !startsWithPrimaryHeading(readabilityMarkdown ?? "")
104
157
  ? semanticMarkdown
105
158
  : readabilityMarkdown ?? semanticMarkdown ?? fallbackMarkdown;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@jenslys/curldown",
3
- "version": "1.0.4",
3
+ "version": "1.0.6",
4
4
  "description": "Fetch URL content and convert it to markdown.",
5
5
  "repository": {
6
6
  "type": "git",
@@ -33,12 +33,11 @@
33
33
  "@mozilla/readability": "^0.6.0",
34
34
  "cheerio": "^1.2.0",
35
35
  "commander": "^14.0.3",
36
- "jsdom": "^29.0.0",
36
+ "linkedom": "^0.18.12",
37
37
  "playwright": "^1.58.2",
38
38
  "turndown": "^7.2.2"
39
39
  },
40
40
  "devDependencies": {
41
- "@types/jsdom": "^28.0.1",
42
41
  "@types/node": "^25.3.3",
43
42
  "@types/turndown": "^5.0.6",
44
43
  "typescript": "^5.9.3",