@jenslys/curldown 1.0.3 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,108 @@
1
+ import { Command } from "commander";
2
+ import { DEFAULT_DYNAMIC_TIMEOUT_MS, DEFAULT_STATIC_TIMEOUT_MS, VERSION } from "./constants.js";
3
+ import { InputError } from "./errors.js";
4
/**
 * Accumulator for options that may be given more than once.
 * Returns a fresh array with `value` appended; `previous` is left untouched.
 */
function collectRepeatable(value, previous = []) {
    const collected = Array.from(previous);
    collected.push(value);
    return collected;
}
7
/**
 * Build the commander program describing the curldown command line:
 * one required `<url>` argument plus the fetch-mode, format, output,
 * timeout and repeatable header options. `showHelpAfterError()` and
 * `exitOverride()` are both enabled on the returned program.
 */
export function buildProgram() {
    const program = new Command();
    program.name("curldown");
    program.description("Fetch URL content and convert it to markdown.");
    program.version(VERSION);
    program.argument("<url>", "The URL to fetch");
    program.option("--dynamic", "Use headless Chromium (Playwright) to render the page");
    program.option("--auto", "Try static first and fallback to dynamic when static output is thin");
    program.option("--format <type>", "Output format: markdown|json", "markdown");
    program.option("-o, --output <path>", "Write output to a file instead of stdout");
    program.option("--timeout-ms <number>", "Timeout in milliseconds");
    // Each occurrence of --header is appended to the list, starting from [].
    program.option("--header <key:value>", "Set custom request header", collectRepeatable, []);
    program.showHelpAfterError();
    program.exitOverride();
    return program;
}
22
/**
 * Parse repeated `--header key:value` strings into a header object.
 *
 * The first ":" separates the name from the value, so values may themselves
 * contain colons. Name and value are trimmed. When a name is repeated, the
 * last value wins.
 *
 * @param {string[]} rawHeaders Raw option values as collected from the CLI.
 * @returns {Record<string, string>} Header names mapped to their values.
 * @throws {InputError} When an entry has no separator, no name, or no value.
 */
function parseHeaders(rawHeaders) {
    const entries = [];
    for (const rawHeader of rawHeaders) {
        const separatorIndex = rawHeader.indexOf(":");
        if (separatorIndex <= 0 || separatorIndex === rawHeader.length - 1) {
            throw new InputError(`Invalid --header value \"${rawHeader}\". Use key:value format.`);
        }
        const key = rawHeader.slice(0, separatorIndex).trim();
        const value = rawHeader.slice(separatorIndex + 1).trim();
        if (!key || !value) {
            throw new InputError(`Invalid --header value \"${rawHeader}\". Header key and value are required.`);
        }
        entries.push([key, value]);
    }
    // Object.fromEntries defines own data properties. Plain assignment
    // (`headers[key] = value`) would route a header named "__proto__" through
    // the prototype setter and silently drop it.
    return Object.fromEntries(entries);
}
38
/**
 * Validate the `--format` value.
 * Returns it unchanged when it is "markdown" or "json"; any other value
 * raises an InputError.
 */
function parseFormat(rawFormat) {
    switch (rawFormat) {
        case "markdown":
        case "json":
            return rawFormat;
        default:
            throw new InputError(`Invalid --format value \"${rawFormat}\". Use \"markdown\" or \"json\".`);
    }
}
44
/**
 * Resolve the static and dynamic fetch timeouts from the raw `--timeout-ms` value.
 *
 * Without an explicit value, `timeoutMs` is the dynamic default in `--dynamic`
 * mode and the static default otherwise, while `dynamicTimeoutMs` is always the
 * dynamic default. An explicit value is applied to both fields.
 *
 * @param {string | number | undefined} rawTimeout Raw option value, if any.
 * @param {boolean} dynamic Whether `--dynamic` was requested.
 * @param {boolean} auto Whether `--auto` was requested. Auto mode and plain
 *   static mode resolve to the same defaults, so it does not change the result.
 * @returns {{ timeoutMs: number, dynamicTimeoutMs: number }}
 * @throws {InputError} When the value is not a positive integer.
 */
function parseTimeouts(rawTimeout, dynamic, auto) {
    if (rawTimeout === undefined) {
        return {
            timeoutMs: dynamic ? DEFAULT_DYNAMIC_TIMEOUT_MS : DEFAULT_STATIC_TIMEOUT_MS,
            dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
        };
    }
    // Number.parseInt stops at the first non-digit, so "10abc", "1.5" and "1e3"
    // used to be accepted as 10, 1 and 1. Require the whole value to be digits.
    const text = String(rawTimeout).trim();
    const parsed = /^\+?\d+$/.test(text) ? Number(text) : Number.NaN;
    if (!Number.isSafeInteger(parsed) || parsed <= 0) {
        throw new InputError(`Invalid --timeout-ms value \"${rawTimeout}\". Must be a positive integer.`);
    }
    return {
        timeoutMs: parsed,
        dynamicTimeoutMs: parsed
    };
}
72
/**
 * Turn the raw URL argument and option bag into the validated argument
 * object used at runtime.
 * Throws {@link InputError} as soon as a value turns out to be malformed:
 * missing or unparsable URL, a non-http(s) protocol, `--dynamic` combined
 * with `--auto`, or an invalid timeout, format or header.
 */
export function normalizeArgs(urlInput, options) {
    if (!urlInput) {
        throw new InputError("A URL argument is required.");
    }
    const parsedUrl = (() => {
        try {
            return new URL(urlInput);
        }
        catch (error) {
            throw new InputError(`Invalid URL \"${urlInput}\".`, {
                cause: error instanceof Error ? error : undefined
            });
        }
    })();
    const isHttpProtocol = parsedUrl.protocol === "http:" || parsedUrl.protocol === "https:";
    if (!isHttpProtocol) {
        throw new InputError(`Unsupported URL protocol \"${parsedUrl.protocol}\". Only http:// and https:// are supported.`);
    }
    const dynamic = options.dynamic ?? false;
    const auto = options.auto ?? false;
    if (auto && dynamic) {
        throw new InputError("--dynamic and --auto cannot be used together.");
    }
    // Validation order is timeouts, then format, then headers.
    const { timeoutMs, dynamicTimeoutMs } = parseTimeouts(options.timeoutMs, dynamic, auto);
    const format = parseFormat(options.format ?? "markdown");
    const headers = parseHeaders(options.header ?? []);
    return {
        url: parsedUrl.toString(),
        auto,
        dynamic,
        format,
        outputPath: options.output,
        timeoutMs,
        dynamicTimeoutMs,
        headers
    };
}
@@ -0,0 +1,119 @@
1
+ import { createHash } from "node:crypto";
2
+ import { ConversionError } from "./errors.js";
3
+ import { extractHtmlTitle } from "./transform.js";
4
/** Media types that are treated as markdown already. */
const MARKDOWN_CONTENT_TYPES = new Set(["text/markdown", "text/x-markdown", "application/markdown", "application/x-markdown"]);
/** Media type that counts as markdown only together with a markdown file extension. */
const PLAINTEXT_CONTENT_TYPE = "text/plain";
/** Lower-case path suffixes recognised as markdown files. */
const MARKDOWN_FILE_EXTENSIONS = [".md", ".markdown", ".mdown", ".mkd", ".mkdn", ".mdtxt", ".mdx"];
20
/**
 * Trim surrounding whitespace from `markdown` and terminate it with exactly
 * one newline. Raises a ConversionError when nothing but whitespace remains.
 */
function normalizeMarkdown(markdown) {
    const content = markdown.trim();
    if (content.length === 0) {
        throw new ConversionError("Content was fetched but markdown output is empty.");
    }
    return content + "\n";
}
27
/**
 * Infer a document title from the first level-one ATX heading (`# Title`).
 *
 * Lines inside fenced code blocks are skipped so a shell or Python comment
 * such as `# install deps` is not mistaken for the title.
 *
 * @param {string} markdown Markdown source text.
 * @returns {string | undefined} The trimmed heading text, or `undefined` when
 *   no level-one heading with text exists.
 */
function inferTitleFromMarkdown(markdown) {
    // Marker of the currently open fenced code block, e.g. "```" or "~~~~".
    let openFence;
    for (const line of markdown.split(/\r\n|\r|\n/)) {
        const fence = /^ {0,3}(`{3,}|~{3,})/.exec(line)?.[1];
        if (openFence !== undefined) {
            // A fence closes on a line holding only the same marker character,
            // repeated at least as many times as the opening marker.
            const closesFence = fence !== undefined
                && fence[0] === openFence[0]
                && fence.length >= openFence.length
                && line.trim() === fence;
            if (closesFence) {
                openFence = undefined;
            }
            continue;
        }
        if (fence !== undefined) {
            openFence = fence;
            continue;
        }
        // Only spaces/tabs may follow the "#". The previous `\s+` also matched
        // newlines, so an empty "#" line promoted the next paragraph to the title.
        const heading = /^#[ \t]+(.+)$/.exec(line)?.[1]?.trim();
        if (heading) {
            return heading;
        }
    }
    return undefined;
}
31
/**
 * Report whether a Content-Type header value names one of the markdown media
 * types. Parameters after ";" and letter case are ignored; a missing value
 * yields `false`.
 */
function isMarkdownContentType(contentType) {
    if (!contentType) {
        return false;
    }
    const [mediaType = ""] = contentType.toLowerCase().split(";");
    return MARKDOWN_CONTENT_TYPES.has(mediaType.trim());
}
38
/**
 * Report whether a Content-Type header value is `text/plain`, ignoring
 * letter case and any parameters after ";". A missing value yields `false`.
 */
function isPlainTextContentType(contentType) {
    if (!contentType) {
        return false;
    }
    const [mediaType = ""] = contentType.toLowerCase().split(";");
    return mediaType.trim() === PLAINTEXT_CONTENT_TYPE;
}
45
/**
 * Report whether the path component of `urlValue` ends with a known markdown
 * file extension (case-insensitive). Query string and fragment are not
 * considered; a value that is not a parsable URL yields `false`.
 */
function hasMarkdownFileExtension(urlValue) {
    let url;
    try {
        url = new URL(urlValue);
    }
    catch {
        return false;
    }
    const lowerCasePath = url.pathname.toLowerCase();
    for (const extension of MARKDOWN_FILE_EXTENSIONS) {
        if (lowerCasePath.endsWith(extension)) {
            return true;
        }
    }
    return false;
}
56
/**
 * Decide whether a fetch result is treated as markdown already: either its
 * content type is a markdown media type, or it is plain text served from a
 * URL whose path has a markdown file extension.
 */
function shouldTreatAsMarkdownPassthrough(result) {
    const { contentType, finalUrl } = result;
    return isMarkdownContentType(contentType)
        || (isPlainTextContentType(contentType) && hasMarkdownFileExtension(finalUrl));
}
62
/**
 * Count whitespace-separated words in `value`.
 * Empty or whitespace-only input counts as zero words.
 */
function countWords(value) {
    const words = value.match(/\S+/g);
    return words === null ? 0 : words.length;
}
69
/**
 * Heuristic deciding whether markdown output looks too thin to be real page
 * content. Returns `true` when the text is empty, contains one of the known
 * "JavaScript required / browser check" phrases, or has fewer than 30 words
 * on at most two non-empty lines.
 */
export function shouldAutoFallback(markdown) {
    const text = markdown.trim();
    if (text === "") {
        return true;
    }
    const blockerPhrases = /enable javascript|javascript is required|checking your browser|just a moment|please wait/;
    if (blockerPhrases.test(text.toLowerCase())) {
        return true;
    }
    let nonEmptyLineCount = 0;
    for (const line of text.split(/\r?\n/)) {
        if (line.trim().length > 0) {
            nonEmptyLineCount += 1;
        }
    }
    return nonEmptyLineCount <= 2 && countWords(text) < 30;
}
81
/**
 * Produce markdown content from a fetch result.
 * Markdown responses are normalized and passed through with a title taken
 * from the markdown itself; everything else is converted through
 * `deps.transformHtmlToMarkdown` (given the body and final URL) with the
 * title extracted from the HTML.
 */
export function prepareContentFromFetchResult(result, deps) {
    const passthrough = shouldTreatAsMarkdownPassthrough(result);
    const markdown = passthrough
        ? normalizeMarkdown(result.body)
        : deps.transformHtmlToMarkdown({ html: result.body, url: result.finalUrl });
    const title = passthrough
        ? inferTitleFromMarkdown(markdown)
        : extractHtmlTitle(result.body);
    return { markdown, title, source: result, passthrough };
}
102
/**
 * Render prepared content in the requested output format.
 * "markdown" returns the markdown text as-is. Any other format produces a
 * pretty-printed JSON document (newline-terminated) holding the markdown
 * plus request/response metadata, a word count, a SHA-256 of the markdown
 * and the timestamp at which the payload was built.
 */
export function formatOutput(args, content, usedDynamic) {
    const { markdown, source, title } = content;
    if (args.format === "markdown") {
        return markdown;
    }
    const sha256 = createHash("sha256").update(markdown).digest("hex");
    const payload = {
        url: args.url,
        final_url: source.finalUrl,
        title: title ?? null,
        markdown,
        content_type: source.contentType ?? null,
        status: source.status,
        fetched_at: new Date().toISOString(),
        word_count: countWords(markdown),
        sha256,
        used_dynamic: usedDynamic
    };
    return JSON.stringify(payload, null, 2) + "\n";
}
@@ -0,0 +1,22 @@
1
+ import { realpathSync } from "node:fs";
2
+ import { fileURLToPath, pathToFileURL } from "node:url";
3
/** Resolve `pathInput` through `realpathSync`; throws when resolution fails. */
function resolvePathStrict(pathInput) {
    const canonicalPath = realpathSync(pathInput);
    return canonicalPath;
}
/**
 * Tell whether `argvPath` (by default `process.argv[1]`) refers to the module
 * identified by `moduleUrl`.
 * Both sides are compared after symlink resolution, so a symlinked bin that
 * points at the module still counts as the entrypoint. If resolution fails,
 * the comparison falls back to matching the file URL of `argvPath` against
 * `moduleUrl` directly.
 */
export function isMainModuleFor(moduleUrl, argvPath = process.argv[1]) {
    if (argvPath === undefined) {
        return false;
    }
    let isSameFile;
    try {
        isSameFile = resolvePathStrict(argvPath) === resolvePathStrict(fileURLToPath(moduleUrl));
    }
    catch {
        // Best-effort fallback when either path cannot be resolved.
        isSameFile = pathToFileURL(argvPath).href === moduleUrl;
    }
    return isSameFile;
}
package/dist/cli.js CHANGED
@@ -1,30 +1,13 @@
1
1
  #!/usr/bin/env node
2
- import { createHash } from "node:crypto";
3
- import { realpathSync } from "node:fs";
4
- import { Command, CommanderError } from "commander";
5
- import { fileURLToPath, pathToFileURL } from "node:url";
6
- import { DEFAULT_DYNAMIC_TIMEOUT_MS, DEFAULT_STATIC_TIMEOUT_MS, VERSION } from "./constants.js";
7
- import { asCurldownError, ConversionError, InputError } from "./errors.js";
2
+ import { CommanderError } from "commander";
3
+ import { buildProgram, normalizeArgs } from "./cli-args.js";
4
+ import { formatOutput, prepareContentFromFetchResult, shouldAutoFallback } from "./cli-content.js";
5
+ import { isMainModuleFor } from "./cli-main-module.js";
6
+ import { asCurldownError } from "./errors.js";
8
7
  import { fetchDynamicHtml } from "./fetch-dynamic.js";
9
8
  import { fetchStaticHtml } from "./fetch-static.js";
10
9
  import { writeOutput } from "./output.js";
11
- import { extractHtmlTitle, transformHtmlToMarkdown } from "./transform.js";
12
- const MARKDOWN_CONTENT_TYPES = new Set([
13
- "text/markdown",
14
- "text/x-markdown",
15
- "application/markdown",
16
- "application/x-markdown"
17
- ]);
18
- const PLAINTEXT_CONTENT_TYPE = "text/plain";
19
- const MARKDOWN_FILE_EXTENSIONS = [
20
- ".md",
21
- ".markdown",
22
- ".mdown",
23
- ".mkd",
24
- ".mkdn",
25
- ".mdtxt",
26
- ".mdx"
27
- ];
10
+ import { transformHtmlToMarkdown } from "./transform.js";
28
11
  const defaultDependencies = {
29
12
  fetchStatic: fetchStaticHtml,
30
13
  fetchDynamic: fetchDynamicHtml,
@@ -32,208 +15,6 @@ const defaultDependencies = {
32
15
  writeOutput,
33
16
  stderrWrite: (message) => process.stderr.write(message)
34
17
  };
35
- function collectRepeatable(value, previous = []) {
36
- return [...previous, value];
37
- }
38
- function buildProgram() {
39
- return new Command()
40
- .name("curldown")
41
- .description("Fetch URL content and convert it to markdown.")
42
- .version(VERSION)
43
- .argument("<url>", "The URL to fetch")
44
- .option("--dynamic", "Use headless Chromium (Playwright) to render the page")
45
- .option("--auto", "Try static first and fallback to dynamic when static output is thin")
46
- .option("--format <type>", "Output format: markdown|json", "markdown")
47
- .option("-o, --output <path>", "Write output to a file instead of stdout")
48
- .option("--timeout-ms <number>", "Timeout in milliseconds")
49
- .option("--header <key:value>", "Set custom request header", collectRepeatable, [])
50
- .showHelpAfterError()
51
- .exitOverride();
52
- }
53
- function parseHeaders(rawHeaders) {
54
- const headers = {};
55
- for (const rawHeader of rawHeaders) {
56
- const separatorIndex = rawHeader.indexOf(":");
57
- if (separatorIndex <= 0 || separatorIndex === rawHeader.length - 1) {
58
- throw new InputError(`Invalid --header value \"${rawHeader}\". Use key:value format.`);
59
- }
60
- const key = rawHeader.slice(0, separatorIndex).trim();
61
- const value = rawHeader.slice(separatorIndex + 1).trim();
62
- if (!key || !value) {
63
- throw new InputError(`Invalid --header value \"${rawHeader}\". Header key and value are required.`);
64
- }
65
- headers[key] = value;
66
- }
67
- return headers;
68
- }
69
- function parseFormat(rawFormat) {
70
- if (rawFormat === "markdown" || rawFormat === "json") {
71
- return rawFormat;
72
- }
73
- throw new InputError(`Invalid --format value \"${rawFormat}\". Use \"markdown\" or \"json\".`);
74
- }
75
- function parseTimeouts(rawTimeout, dynamic, auto) {
76
- if (rawTimeout === undefined) {
77
- if (dynamic) {
78
- return {
79
- timeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS,
80
- dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
81
- };
82
- }
83
- if (auto) {
84
- return {
85
- timeoutMs: DEFAULT_STATIC_TIMEOUT_MS,
86
- dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
87
- };
88
- }
89
- return {
90
- timeoutMs: DEFAULT_STATIC_TIMEOUT_MS,
91
- dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
92
- };
93
- }
94
- const parsed = Number.parseInt(rawTimeout, 10);
95
- if (!Number.isInteger(parsed) || parsed <= 0) {
96
- throw new InputError(`Invalid --timeout-ms value \"${rawTimeout}\". Must be a positive integer.`);
97
- }
98
- return {
99
- timeoutMs: parsed,
100
- dynamicTimeoutMs: parsed
101
- };
102
- }
103
- function normalizeMarkdown(markdown) {
104
- const trimmed = markdown.trim();
105
- if (!trimmed) {
106
- throw new ConversionError("Content was fetched but markdown output is empty.");
107
- }
108
- return `${trimmed}\n`;
109
- }
110
- function inferTitleFromMarkdown(markdown) {
111
- const firstHeading = markdown.match(/^#\s+(.+)$/m)?.[1]?.trim();
112
- return firstHeading || undefined;
113
- }
114
- function isMarkdownContentType(contentType) {
115
- if (!contentType) {
116
- return false;
117
- }
118
- const normalized = contentType.toLowerCase().split(";")[0]?.trim() ?? "";
119
- return MARKDOWN_CONTENT_TYPES.has(normalized);
120
- }
121
- function isPlainTextContentType(contentType) {
122
- if (!contentType) {
123
- return false;
124
- }
125
- const normalized = contentType.toLowerCase().split(";")[0]?.trim() ?? "";
126
- return normalized === PLAINTEXT_CONTENT_TYPE;
127
- }
128
- function hasMarkdownFileExtension(urlValue) {
129
- let pathname;
130
- try {
131
- pathname = new URL(urlValue).pathname;
132
- }
133
- catch {
134
- return false;
135
- }
136
- const normalizedPath = pathname.toLowerCase();
137
- return MARKDOWN_FILE_EXTENSIONS.some((extension) => normalizedPath.endsWith(extension));
138
- }
139
- function shouldTreatAsMarkdownPassthrough(result) {
140
- if (isMarkdownContentType(result.contentType)) {
141
- return true;
142
- }
143
- return isPlainTextContentType(result.contentType) && hasMarkdownFileExtension(result.finalUrl);
144
- }
145
- function countWords(value) {
146
- const trimmed = value.trim();
147
- if (!trimmed) {
148
- return 0;
149
- }
150
- return trimmed.split(/\s+/).length;
151
- }
152
- function shouldAutoFallback(markdown) {
153
- const trimmed = markdown.trim();
154
- if (!trimmed) {
155
- return true;
156
- }
157
- const lower = trimmed.toLowerCase();
158
- if (/enable javascript|javascript is required|checking your browser|just a moment|please wait/.test(lower)) {
159
- return true;
160
- }
161
- const nonEmptyLines = trimmed.split(/\r?\n/).filter((line) => line.trim().length > 0).length;
162
- return countWords(trimmed) < 30 && nonEmptyLines <= 2;
163
- }
164
- /**
165
- * Validate and normalize parsed CLI arguments into the canonical runtime shape.
166
- * Fails fast with {@link InputError} on malformed input.
167
- */
168
- function normalizeArgs(urlInput, options) {
169
- if (!urlInput) {
170
- throw new InputError("A URL argument is required.");
171
- }
172
- let parsedUrl;
173
- try {
174
- parsedUrl = new URL(urlInput);
175
- }
176
- catch (error) {
177
- throw new InputError(`Invalid URL \"${urlInput}\".`, {
178
- cause: error instanceof Error ? error : undefined
179
- });
180
- }
181
- if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") {
182
- throw new InputError(`Unsupported URL protocol \"${parsedUrl.protocol}\". Only http:// and https:// are supported.`);
183
- }
184
- const dynamic = options.dynamic ?? false;
185
- const auto = options.auto ?? false;
186
- if (dynamic && auto) {
187
- throw new InputError("--dynamic and --auto cannot be used together.");
188
- }
189
- const timeouts = parseTimeouts(options.timeoutMs, dynamic, auto);
190
- return {
191
- url: parsedUrl.toString(),
192
- auto,
193
- dynamic,
194
- format: parseFormat(options.format ?? "markdown"),
195
- outputPath: options.output,
196
- timeoutMs: timeouts.timeoutMs,
197
- dynamicTimeoutMs: timeouts.dynamicTimeoutMs,
198
- headers: parseHeaders(options.header ?? [])
199
- };
200
- }
201
- function prepareContentFromFetchResult(result, deps) {
202
- if (shouldTreatAsMarkdownPassthrough(result)) {
203
- const markdown = normalizeMarkdown(result.body);
204
- return {
205
- markdown,
206
- title: inferTitleFromMarkdown(markdown),
207
- source: result,
208
- passthrough: true
209
- };
210
- }
211
- const markdown = deps.transformHtmlToMarkdown({ html: result.body });
212
- return {
213
- markdown,
214
- title: extractHtmlTitle(result.body),
215
- source: result,
216
- passthrough: false
217
- };
218
- }
219
- function formatOutput(args, content, usedDynamic) {
220
- if (args.format === "markdown") {
221
- return content.markdown;
222
- }
223
- const payload = {
224
- url: args.url,
225
- final_url: content.source.finalUrl,
226
- title: content.title ?? null,
227
- markdown: content.markdown,
228
- content_type: content.source.contentType ?? null,
229
- status: content.source.status,
230
- fetched_at: new Date().toISOString(),
231
- word_count: countWords(content.markdown),
232
- sha256: createHash("sha256").update(content.markdown).digest("hex"),
233
- used_dynamic: usedDynamic
234
- };
235
- return `${JSON.stringify(payload, null, 2)}\n`;
236
- }
237
18
  /**
238
19
  * Execute one curldown CLI invocation and return process exit code.
239
20
  * `argv` should not include the Node executable or script path.
@@ -310,28 +91,10 @@ export async function run(argv, deps = defaultDependencies) {
310
91
  return curldownError.exitCode;
311
92
  }
312
93
  }
313
- function resolvePathStrict(pathInput) {
314
- return realpathSync(pathInput);
315
- }
316
- /**
317
- * Determine whether this module was invoked as the CLI entrypoint.
318
- * Resolves symlinks for both paths so global installs that expose a symlinked bin still execute.
319
- */
320
94
  export function isMainModule(argvPath = process.argv[1]) {
321
- if (argvPath === undefined) {
322
- return false;
323
- }
324
- try {
325
- const invokedPath = resolvePathStrict(argvPath);
326
- const modulePath = resolvePathStrict(fileURLToPath(import.meta.url));
327
- return invokedPath === modulePath;
328
- }
329
- catch {
330
- return pathToFileURL(argvPath).href === import.meta.url;
331
- }
95
+ return isMainModuleFor(import.meta.url, argvPath);
332
96
  }
333
- const isMain = isMainModule();
334
- if (isMain) {
97
+ if (isMainModule()) {
335
98
  void run(process.argv.slice(2)).then((exitCode) => {
336
99
  process.exitCode = exitCode;
337
100
  });
package/dist/transform.js CHANGED
@@ -1,4 +1,6 @@
1
+ import { Readability } from "@mozilla/readability";
1
2
  import { load } from "cheerio";
3
+ import { JSDOM } from "jsdom";
2
4
  import { createRequire } from "node:module";
3
5
  import TurndownService from "turndown";
4
6
  import { DEFAULT_REMOVE_SELECTORS } from "./constants.js";
@@ -12,22 +14,98 @@ const turndown = new TurndownService({
12
14
  emDelimiter: "_"
13
15
  });
14
16
  turndown.use(turndownPluginGfm.gfm);
17
/** Base URL used for parsing when the caller does not supply a page URL. */
const FALLBACK_BASE_URL = "https://curldown.local/";
/** CSS selector for elements considered candidates for the page's primary content. */
const PRIMARY_CONTENT_SELECTOR = ["main", "article", "[role='main']"].join(", ");
/** Minimum normalized text length a primary-content candidate must reach to be used. */
const MIN_PRIMARY_CONTENT_TEXT_LENGTH = 200;
20
/**
 * Length of `value` after collapsing every whitespace run to a single space
 * and trimming both ends. `null` and `undefined` count as length 0.
 */
function getNormalizedTextLength(value) {
    if (value === undefined || value === null) {
        return 0;
    }
    const collapsed = value.replace(/\s+/g, " ");
    return collapsed.trim().length;
}
23
/**
 * Strip non-content markup from an HTML fragment and return the resulting HTML.
 * Removes everything matching the default removal selectors, images without
 * alt text, and links that have neither visible text nor an image with alt text.
 */
function cleanupFragmentHtml(html) {
    const $ = load(html);
    $(DEFAULT_REMOVE_SELECTORS.join(",")).remove();
    for (const image of $("img").toArray()) {
        const altText = $(image).attr("alt")?.trim() ?? "";
        if (altText === "") {
            $(image).remove();
        }
    }
    // Runs after the image pass, so only images that carry alt text remain.
    for (const anchor of $("a").toArray()) {
        const link = $(anchor);
        if (getNormalizedTextLength(link.text()) !== 0) {
            continue;
        }
        const wrapsLabelledImage = link
            .find("img")
            .toArray()
            .some((image) => getNormalizedTextLength($(image).attr("alt")) > 0);
        if (!wrapsLabelledImage) {
            link.remove();
        }
    }
    return $.root().html() ?? "";
}
45
/**
 * Return the inner HTML of the document body, falling back to the inner HTML
 * of the root element and finally to an empty string.
 */
function extractBodyHtml(document) {
    const bodyHtml = document.body?.innerHTML;
    if (bodyHtml !== undefined && bodyHtml !== null) {
        return bodyHtml;
    }
    return document.documentElement?.innerHTML ?? "";
}
48
/**
 * Pick the inner HTML of the primary-content element with the most text.
 * Returns `undefined` when no candidate has text or the best one stays below
 * the minimum text length. On equal length the earlier element wins.
 */
function selectSemanticPrimaryHtml(document) {
    let best;
    for (const element of Array.from(document.querySelectorAll(PRIMARY_CONTENT_SELECTOR))) {
        const textLength = getNormalizedTextLength(element.textContent);
        if (textLength === 0) {
            continue;
        }
        // Strict comparison keeps the first of several equally long candidates.
        if (best === undefined || textLength > best.textLength) {
            best = { html: element.innerHTML, textLength };
        }
    }
    if (best === undefined || best.textLength < MIN_PRIMARY_CONTENT_TEXT_LENGTH) {
        return undefined;
    }
    return best.html;
}
62
/**
 * Run Readability over `document` and return the extracted article HTML.
 * Yields `undefined` when no article is produced, when the article has no
 * text, or when it carries no content.
 */
function selectReadabilityHtml(document) {
    const article = new Readability(document).parse();
    const hasText = Boolean(article) && getNormalizedTextLength(article.textContent) > 0;
    if (!hasText) {
        return undefined;
    }
    return article.content ?? undefined;
}
69
/**
 * Convert an HTML fragment into a trimmed markdown string.
 * Returns `undefined` when the input is empty, when cleanup leaves only
 * whitespace, or when the conversion produces no markdown.
 */
function toMarkdownCandidate(html) {
    if (!html) {
        return undefined;
    }
    const cleanedHtml = cleanupFragmentHtml(html);
    if (cleanedHtml.trim() === "") {
        return undefined;
    }
    const markdown = turndown.turndown(cleanedHtml).trim();
    return markdown === "" ? undefined : markdown;
}
80
/**
 * Return the first line of `markdown` that is non-empty after trimming,
 * in its trimmed form, or `undefined` when every line is blank.
 */
function getFirstMeaningfulMarkdownLine(markdown) {
    for (const rawLine of markdown.split(/\r?\n/)) {
        const line = rawLine.trim();
        if (line.length > 0) {
            return line;
        }
    }
    return undefined;
}
86
/**
 * Report whether the first non-blank line of `markdown` is a level-one
 * heading, i.e. "#" followed by whitespace and at least one other character.
 */
function startsWithPrimaryHeading(markdown) {
    const firstLine = getFirstMeaningfulMarkdownLine(markdown) ?? "";
    return /^#\s+\S/.test(firstLine);
}
15
89
  /**
16
90
  * Convert fetched HTML into markdown.
17
- * The function removes default non-content nodes before running Turndown
18
- * with GitHub Flavored Markdown extensions.
91
+ * The function prefers semantic primary-content containers, falls back to
92
+ * Readability for unstructured pages, and only converts the full body when
93
+ * no stronger content signal exists.
19
94
  */
20
95
  export function transformHtmlToMarkdown(input) {
21
- const $ = load(input.html);
22
- $(DEFAULT_REMOVE_SELECTORS.join(",")).remove();
23
- const bodyHtml = $("body").length > 0 ? $("body").html() ?? "" : $.root().html() ?? "";
24
- if (bodyHtml.trim().length === 0) {
96
+ const dom = new JSDOM(input.html, {
97
+ url: input.url ?? FALLBACK_BASE_URL
98
+ });
99
+ const { document } = dom.window;
100
+ const semanticMarkdown = toMarkdownCandidate(selectSemanticPrimaryHtml(document));
101
+ const readabilityMarkdown = toMarkdownCandidate(selectReadabilityHtml(new JSDOM(input.html, { url: input.url ?? FALLBACK_BASE_URL }).window.document));
102
+ const fallbackMarkdown = toMarkdownCandidate(extractBodyHtml(document));
103
+ const markdown = semanticMarkdown && startsWithPrimaryHeading(semanticMarkdown) && !startsWithPrimaryHeading(readabilityMarkdown ?? "")
104
+ ? semanticMarkdown
105
+ : readabilityMarkdown ?? semanticMarkdown ?? fallbackMarkdown;
106
+ if (!markdown) {
25
107
  throw new ConversionError("No HTML body content found to convert.");
26
108
  }
27
- const markdown = turndown.turndown(bodyHtml).trim();
28
- if (markdown.length === 0) {
29
- throw new ConversionError("HTML was fetched but produced empty markdown output.");
30
- }
31
109
  return `${markdown}\n`;
32
110
  }
33
111
  /** Extract document title from HTML head when available. */
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@jenslys/curldown",
3
- "version": "1.0.3",
3
+ "version": "1.0.4",
4
4
  "description": "Fetch URL content and convert it to markdown.",
5
5
  "repository": {
6
6
  "type": "git",
@@ -30,12 +30,15 @@
30
30
  },
31
31
  "dependencies": {
32
32
  "@joplin/turndown-plugin-gfm": "^1.0.64",
33
+ "@mozilla/readability": "^0.6.0",
33
34
  "cheerio": "^1.2.0",
34
35
  "commander": "^14.0.3",
36
+ "jsdom": "^29.0.0",
35
37
  "playwright": "^1.58.2",
36
38
  "turndown": "^7.2.2"
37
39
  },
38
40
  "devDependencies": {
41
+ "@types/jsdom": "^28.0.1",
39
42
  "@types/node": "^25.3.3",
40
43
  "@types/turndown": "^5.0.6",
41
44
  "typescript": "^5.9.3",