@jenslys/curldown 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-args.js +108 -0
- package/dist/cli-content.js +119 -0
- package/dist/cli-main-module.js +22 -0
- package/dist/cli.js +8 -245
- package/dist/transform.js +88 -10
- package/package.json +4 -1
package/dist/cli-args.js
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import { Command } from "commander";
|
|
2
|
+
import { DEFAULT_DYNAMIC_TIMEOUT_MS, DEFAULT_STATIC_TIMEOUT_MS, VERSION } from "./constants.js";
|
|
3
|
+
import { InputError } from "./errors.js";
|
|
4
|
+
/**
 * Accumulator used for repeatable CLI options.
 * Produces a fresh array holding everything in `previous` followed by `value`;
 * the array passed in is never modified.
 */
function collectRepeatable(value, previous = []) {
    const collected = Array.from(previous);
    collected.push(value);
    return collected;
}
|
|
7
|
+
/**
 * Create the Commander program describing the curldown command line:
 * one required `<url>` argument plus the fetch-mode, format, output,
 * timeout and header options. Help is shown after errors and Commander's
 * process exit is overridden.
 */
export function buildProgram() {
    const program = new Command();
    program.name("curldown");
    program.description("Fetch URL content and convert it to markdown.");
    program.version(VERSION);
    program.argument("<url>", "The URL to fetch");
    program.option("--dynamic", "Use headless Chromium (Playwright) to render the page");
    program.option("--auto", "Try static first and fallback to dynamic when static output is thin");
    program.option("--format <type>", "Output format: markdown|json", "markdown");
    program.option("-o, --output <path>", "Write output to a file instead of stdout");
    program.option("--timeout-ms <number>", "Timeout in milliseconds");
    // Repeatable: every --header occurrence is appended to the collected list.
    program.option("--header <key:value>", "Set custom request header", collectRepeatable, []);
    program.showHelpAfterError();
    return program.exitOverride();
}
|
|
22
|
+
/**
 * Turn raw `key:value` strings into a header map.
 * The split happens at the first colon; key and value are trimmed.
 * Later entries with the same key overwrite earlier ones.
 * Throws InputError when an entry has no colon, starts or ends with the
 * colon, or has a blank key or value after trimming.
 */
function parseHeaders(rawHeaders) {
    const headers = {};
    for (const rawHeader of rawHeaders) {
        const colonAt = rawHeader.indexOf(":");
        const lacksKey = colonAt <= 0;
        const lacksValue = colonAt === rawHeader.length - 1;
        if (lacksKey || lacksValue) {
            throw new InputError(`Invalid --header value \"${rawHeader}\". Use key:value format.`);
        }
        const name = rawHeader.slice(0, colonAt).trim();
        const content = rawHeader.slice(colonAt + 1).trim();
        if (name === "" || content === "") {
            throw new InputError(`Invalid --header value \"${rawHeader}\". Header key and value are required.`);
        }
        headers[name] = content;
    }
    return headers;
}
|
|
38
|
+
/**
 * Validate the requested output format.
 * Returns the value unchanged when it is "markdown" or "json";
 * anything else raises InputError.
 */
function parseFormat(rawFormat) {
    switch (rawFormat) {
        case "markdown":
        case "json":
            return rawFormat;
        default:
            throw new InputError(`Invalid --format value \"${rawFormat}\". Use \"markdown\" or \"json\".`);
    }
}
|
|
44
|
+
/**
 * Resolve the static and dynamic timeouts for one invocation.
 *
 * With no explicit `--timeout-ms`, the static timeout defaults to
 * DEFAULT_STATIC_TIMEOUT_MS (or DEFAULT_DYNAMIC_TIMEOUT_MS when `dynamic`
 * is set) and the dynamic timeout to DEFAULT_DYNAMIC_TIMEOUT_MS.
 * An explicit value applies to both timeouts.
 *
 * @param {string|undefined} rawTimeout Raw option value, if provided.
 * @param {boolean} dynamic Whether `--dynamic` was requested.
 * @param {boolean} auto Whether `--auto` was requested. It yields the same
 *   defaults as the plain static mode and is kept for call compatibility.
 * @returns {{timeoutMs: number, dynamicTimeoutMs: number}}
 * @throws {InputError} When the value is not a positive integer.
 */
function parseTimeouts(rawTimeout, dynamic, auto) {
    if (rawTimeout === undefined) {
        return {
            timeoutMs: dynamic ? DEFAULT_DYNAMIC_TIMEOUT_MS : DEFAULT_STATIC_TIMEOUT_MS,
            dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
        };
    }
    // Number.parseInt stops at the first invalid character, so inputs such as
    // "10abc", "1.5" or "1e3" used to be accepted silently. Require the whole
    // (trimmed) value to be decimal digits before converting.
    const text = String(rawTimeout).trim();
    const parsed = /^\+?\d+$/.test(text) ? Number(text) : Number.NaN;
    if (!Number.isSafeInteger(parsed) || parsed <= 0) {
        throw new InputError(`Invalid --timeout-ms value \"${rawTimeout}\". Must be a positive integer.`);
    }
    return {
        timeoutMs: parsed,
        dynamicTimeoutMs: parsed
    };
}
|
|
72
|
+
/**
 * Check the URL argument and parsed options, then build the runtime argument
 * object used by the rest of the CLI.
 * Raises InputError for a missing or unparsable URL, a non-http(s) protocol,
 * `--dynamic` combined with `--auto`, or any invalid option value.
 */
export function normalizeArgs(urlInput, options) {
    if (!urlInput) {
        throw new InputError("A URL argument is required.");
    }
    const parsedUrl = (() => {
        try {
            return new URL(urlInput);
        }
        catch (error) {
            const cause = error instanceof Error ? error : undefined;
            throw new InputError(`Invalid URL \"${urlInput}\".`, { cause });
        }
    })();
    const isHttp = parsedUrl.protocol === "http:" || parsedUrl.protocol === "https:";
    if (!isHttp) {
        throw new InputError(`Unsupported URL protocol \"${parsedUrl.protocol}\". Only http:// and https:// are supported.`);
    }
    const dynamic = options.dynamic ?? false;
    const auto = options.auto ?? false;
    if (dynamic && auto) {
        throw new InputError("--dynamic and --auto cannot be used together.");
    }
    // Validation order matters for which error is reported first:
    // timeouts, then format, then headers.
    const { timeoutMs, dynamicTimeoutMs } = parseTimeouts(options.timeoutMs, dynamic, auto);
    const url = parsedUrl.toString();
    const format = parseFormat(options.format ?? "markdown");
    const outputPath = options.output;
    const headers = parseHeaders(options.header ?? []);
    return { url, auto, dynamic, format, outputPath, timeoutMs, dynamicTimeoutMs, headers };
}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { ConversionError } from "./errors.js";
|
|
3
|
+
import { extractHtmlTitle } from "./transform.js";
|
|
4
|
+
// Media types whose body is already markdown.
const MARKDOWN_CONTENT_TYPES = new Set(["text/markdown", "text/x-markdown", "application/markdown", "application/x-markdown"]);
// Media type that counts as markdown only when the URL path has a markdown extension.
const PLAINTEXT_CONTENT_TYPE = "text/plain";
// Lower-case path suffixes recognised as markdown files.
const MARKDOWN_FILE_EXTENSIONS = [".md", ".markdown", ".mdown", ".mkd", ".mkdn", ".mdtxt", ".mdx"];
|
|
20
|
+
/**
 * Strip surrounding whitespace from fetched markdown and end it with exactly
 * one newline. Raises ConversionError when nothing but whitespace remains.
 */
function normalizeMarkdown(markdown) {
    const body = markdown.trim();
    if (body.length === 0) {
        throw new ConversionError("Content was fetched but markdown output is empty.");
    }
    return `${body}\n`;
}
|
|
27
|
+
/**
 * Derive a title from the first level-one ATX heading (`# Title`).
 *
 * The separator between `#` and the text must be spaces or tabs on the same
 * line. The previous `\s+` also matched newlines, so an empty `#` line made
 * the following line (even another heading, hashes included) the title.
 *
 * @param {string} markdown Markdown text to scan.
 * @returns {string|undefined} Trimmed heading text, or undefined when no
 *   level-one heading with text exists.
 */
function inferTitleFromMarkdown(markdown) {
    const firstHeading = markdown.match(/^#[ \t]+(\S.*)$/m)?.[1]?.trim();
    return firstHeading || undefined;
}
|
|
31
|
+
/**
 * Report whether a Content-Type header value names a markdown media type.
 * Parameters after `;` and letter case are ignored; a missing header is false.
 */
function isMarkdownContentType(contentType) {
    if (!contentType) {
        return false;
    }
    const [mediaType = ""] = contentType.toLowerCase().split(";");
    return MARKDOWN_CONTENT_TYPES.has(mediaType.trim());
}
|
|
38
|
+
/**
 * Report whether a Content-Type header value is plain text.
 * Parameters after `;` and letter case are ignored; a missing header is false.
 */
function isPlainTextContentType(contentType) {
    if (!contentType) {
        return false;
    }
    const [mediaType = ""] = contentType.toLowerCase().split(";");
    return mediaType.trim() === PLAINTEXT_CONTENT_TYPE;
}
|
|
45
|
+
/**
 * Report whether the path component of a URL ends with a known markdown
 * extension (case-insensitive). Query and fragment are ignored because only
 * `pathname` is inspected. A value that is not a parsable URL yields false.
 */
function hasMarkdownFileExtension(urlValue) {
    let parsed;
    try {
        parsed = new URL(urlValue);
    }
    catch {
        return false;
    }
    const lowerPath = parsed.pathname.toLowerCase();
    for (const extension of MARKDOWN_FILE_EXTENSIONS) {
        if (lowerPath.endsWith(extension)) {
            return true;
        }
    }
    return false;
}
|
|
56
|
+
/**
 * Decide whether a fetch result already is markdown and can skip HTML
 * conversion: either the content type is a markdown media type, or it is
 * plain text served from a URL whose path has a markdown extension.
 */
function shouldTreatAsMarkdownPassthrough(result) {
    const { contentType } = result;
    if (isMarkdownContentType(contentType)) {
        return true;
    }
    if (!isPlainTextContentType(contentType)) {
        return false;
    }
    return hasMarkdownFileExtension(result.finalUrl);
}
|
|
62
|
+
/**
 * Count whitespace-separated words in `value`.
 * Empty or whitespace-only input counts as zero words.
 */
function countWords(value) {
    const words = value.match(/\S+/g);
    return words === null ? 0 : words.length;
}
|
|
69
|
+
/**
 * Heuristic deciding whether statically fetched markdown is too thin and a
 * dynamic (rendered) fetch should be attempted instead.
 * Returns true when the text is empty, contains a known "enable JavaScript" /
 * browser-check phrase, or has fewer than 30 words on at most two non-empty
 * lines.
 */
export function shouldAutoFallback(markdown) {
    const content = markdown.trim();
    if (content.length === 0) {
        return true;
    }
    const interstitialPattern = /enable javascript|javascript is required|checking your browser|just a moment|please wait/;
    if (interstitialPattern.test(content.toLowerCase())) {
        return true;
    }
    let populatedLines = 0;
    for (const line of content.split(/\r?\n/)) {
        if (line.trim().length > 0) {
            populatedLines += 1;
        }
    }
    const isShort = countWords(content) < 30;
    return isShort && populatedLines <= 2;
}
|
|
81
|
+
/**
 * Turn a fetch result into markdown content plus metadata.
 * Markdown responses are passed through after normalization, with the title
 * inferred from the markdown. Everything else is converted through
 * `deps.transformHtmlToMarkdown`, with the title taken from the HTML.
 * The returned object records the source result and whether passthrough
 * was used.
 */
export function prepareContentFromFetchResult(result, deps) {
    if (!shouldTreatAsMarkdownPassthrough(result)) {
        const converted = deps.transformHtmlToMarkdown({
            html: result.body,
            url: result.finalUrl
        });
        const htmlTitle = extractHtmlTitle(result.body);
        return { markdown: converted, title: htmlTitle, source: result, passthrough: false };
    }
    const normalized = normalizeMarkdown(result.body);
    const inferredTitle = inferTitleFromMarkdown(normalized);
    return { markdown: normalized, title: inferredTitle, source: result, passthrough: true };
}
|
|
102
|
+
/**
 * Render the final CLI output.
 * In markdown format the markdown text is returned as is. Otherwise a
 * pretty-printed JSON document (2-space indent, trailing newline) is produced
 * containing the markdown plus request/response metadata, a word count, a
 * SHA-256 hex digest of the markdown and the current timestamp.
 */
export function formatOutput(args, content, usedDynamic) {
    const { markdown, source } = content;
    if (args.format === "markdown") {
        return markdown;
    }
    const fetchedAt = new Date().toISOString();
    const wordCount = countWords(markdown);
    const digest = createHash("sha256").update(markdown).digest("hex");
    // Key order below defines the property order of the emitted JSON.
    const payload = {
        url: args.url,
        final_url: source.finalUrl,
        title: content.title ?? null,
        markdown,
        content_type: source.contentType ?? null,
        status: source.status,
        fetched_at: fetchedAt,
        word_count: wordCount,
        sha256: digest,
        used_dynamic: usedDynamic
    };
    const serialized = JSON.stringify(payload, null, 2);
    return `${serialized}\n`;
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { realpathSync } from "node:fs";
|
|
2
|
+
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
3
|
+
/**
 * Resolve `pathInput` to its canonical path via `realpathSync`.
 * Any error raised by `realpathSync` propagates to the caller.
 */
function resolvePathStrict(pathInput) {
    const canonicalPath = realpathSync(pathInput);
    return canonicalPath;
}
|
|
6
|
+
/**
 * Tell whether `argvPath` refers to the module identified by `moduleUrl`.
 * Both sides are compared after symlink resolution, so a symlinked bin
 * created by a global install still matches. If either path cannot be
 * resolved, the comparison falls back to matching the file URL of
 * `argvPath` against `moduleUrl`. Returns false when no path is available.
 */
export function isMainModuleFor(moduleUrl, argvPath = process.argv[1]) {
    if (argvPath === undefined) {
        return false;
    }
    let isSameModule;
    try {
        isSameModule = resolvePathStrict(argvPath) === resolvePathStrict(fileURLToPath(moduleUrl));
    }
    catch {
        // Resolution failed; compare URLs without touching the filesystem.
        isSameModule = pathToFileURL(argvPath).href === moduleUrl;
    }
    return isSameModule;
}
|
package/dist/cli.js
CHANGED
|
@@ -1,30 +1,13 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
6
|
-
import {
|
|
7
|
-
import { asCurldownError, ConversionError, InputError } from "./errors.js";
|
|
2
|
+
import { CommanderError } from "commander";
|
|
3
|
+
import { buildProgram, normalizeArgs } from "./cli-args.js";
|
|
4
|
+
import { formatOutput, prepareContentFromFetchResult, shouldAutoFallback } from "./cli-content.js";
|
|
5
|
+
import { isMainModuleFor } from "./cli-main-module.js";
|
|
6
|
+
import { asCurldownError } from "./errors.js";
|
|
8
7
|
import { fetchDynamicHtml } from "./fetch-dynamic.js";
|
|
9
8
|
import { fetchStaticHtml } from "./fetch-static.js";
|
|
10
9
|
import { writeOutput } from "./output.js";
|
|
11
|
-
import {
|
|
12
|
-
const MARKDOWN_CONTENT_TYPES = new Set([
|
|
13
|
-
"text/markdown",
|
|
14
|
-
"text/x-markdown",
|
|
15
|
-
"application/markdown",
|
|
16
|
-
"application/x-markdown"
|
|
17
|
-
]);
|
|
18
|
-
const PLAINTEXT_CONTENT_TYPE = "text/plain";
|
|
19
|
-
const MARKDOWN_FILE_EXTENSIONS = [
|
|
20
|
-
".md",
|
|
21
|
-
".markdown",
|
|
22
|
-
".mdown",
|
|
23
|
-
".mkd",
|
|
24
|
-
".mkdn",
|
|
25
|
-
".mdtxt",
|
|
26
|
-
".mdx"
|
|
27
|
-
];
|
|
10
|
+
import { transformHtmlToMarkdown } from "./transform.js";
|
|
28
11
|
const defaultDependencies = {
|
|
29
12
|
fetchStatic: fetchStaticHtml,
|
|
30
13
|
fetchDynamic: fetchDynamicHtml,
|
|
@@ -32,208 +15,6 @@ const defaultDependencies = {
|
|
|
32
15
|
writeOutput,
|
|
33
16
|
stderrWrite: (message) => process.stderr.write(message)
|
|
34
17
|
};
|
|
35
|
-
function collectRepeatable(value, previous = []) {
|
|
36
|
-
return [...previous, value];
|
|
37
|
-
}
|
|
38
|
-
function buildProgram() {
|
|
39
|
-
return new Command()
|
|
40
|
-
.name("curldown")
|
|
41
|
-
.description("Fetch URL content and convert it to markdown.")
|
|
42
|
-
.version(VERSION)
|
|
43
|
-
.argument("<url>", "The URL to fetch")
|
|
44
|
-
.option("--dynamic", "Use headless Chromium (Playwright) to render the page")
|
|
45
|
-
.option("--auto", "Try static first and fallback to dynamic when static output is thin")
|
|
46
|
-
.option("--format <type>", "Output format: markdown|json", "markdown")
|
|
47
|
-
.option("-o, --output <path>", "Write output to a file instead of stdout")
|
|
48
|
-
.option("--timeout-ms <number>", "Timeout in milliseconds")
|
|
49
|
-
.option("--header <key:value>", "Set custom request header", collectRepeatable, [])
|
|
50
|
-
.showHelpAfterError()
|
|
51
|
-
.exitOverride();
|
|
52
|
-
}
|
|
53
|
-
function parseHeaders(rawHeaders) {
|
|
54
|
-
const headers = {};
|
|
55
|
-
for (const rawHeader of rawHeaders) {
|
|
56
|
-
const separatorIndex = rawHeader.indexOf(":");
|
|
57
|
-
if (separatorIndex <= 0 || separatorIndex === rawHeader.length - 1) {
|
|
58
|
-
throw new InputError(`Invalid --header value \"${rawHeader}\". Use key:value format.`);
|
|
59
|
-
}
|
|
60
|
-
const key = rawHeader.slice(0, separatorIndex).trim();
|
|
61
|
-
const value = rawHeader.slice(separatorIndex + 1).trim();
|
|
62
|
-
if (!key || !value) {
|
|
63
|
-
throw new InputError(`Invalid --header value \"${rawHeader}\". Header key and value are required.`);
|
|
64
|
-
}
|
|
65
|
-
headers[key] = value;
|
|
66
|
-
}
|
|
67
|
-
return headers;
|
|
68
|
-
}
|
|
69
|
-
function parseFormat(rawFormat) {
|
|
70
|
-
if (rawFormat === "markdown" || rawFormat === "json") {
|
|
71
|
-
return rawFormat;
|
|
72
|
-
}
|
|
73
|
-
throw new InputError(`Invalid --format value \"${rawFormat}\". Use \"markdown\" or \"json\".`);
|
|
74
|
-
}
|
|
75
|
-
function parseTimeouts(rawTimeout, dynamic, auto) {
|
|
76
|
-
if (rawTimeout === undefined) {
|
|
77
|
-
if (dynamic) {
|
|
78
|
-
return {
|
|
79
|
-
timeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS,
|
|
80
|
-
dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
|
|
81
|
-
};
|
|
82
|
-
}
|
|
83
|
-
if (auto) {
|
|
84
|
-
return {
|
|
85
|
-
timeoutMs: DEFAULT_STATIC_TIMEOUT_MS,
|
|
86
|
-
dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
|
|
87
|
-
};
|
|
88
|
-
}
|
|
89
|
-
return {
|
|
90
|
-
timeoutMs: DEFAULT_STATIC_TIMEOUT_MS,
|
|
91
|
-
dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
|
|
92
|
-
};
|
|
93
|
-
}
|
|
94
|
-
const parsed = Number.parseInt(rawTimeout, 10);
|
|
95
|
-
if (!Number.isInteger(parsed) || parsed <= 0) {
|
|
96
|
-
throw new InputError(`Invalid --timeout-ms value \"${rawTimeout}\". Must be a positive integer.`);
|
|
97
|
-
}
|
|
98
|
-
return {
|
|
99
|
-
timeoutMs: parsed,
|
|
100
|
-
dynamicTimeoutMs: parsed
|
|
101
|
-
};
|
|
102
|
-
}
|
|
103
|
-
function normalizeMarkdown(markdown) {
|
|
104
|
-
const trimmed = markdown.trim();
|
|
105
|
-
if (!trimmed) {
|
|
106
|
-
throw new ConversionError("Content was fetched but markdown output is empty.");
|
|
107
|
-
}
|
|
108
|
-
return `${trimmed}\n`;
|
|
109
|
-
}
|
|
110
|
-
function inferTitleFromMarkdown(markdown) {
|
|
111
|
-
const firstHeading = markdown.match(/^#\s+(.+)$/m)?.[1]?.trim();
|
|
112
|
-
return firstHeading || undefined;
|
|
113
|
-
}
|
|
114
|
-
function isMarkdownContentType(contentType) {
|
|
115
|
-
if (!contentType) {
|
|
116
|
-
return false;
|
|
117
|
-
}
|
|
118
|
-
const normalized = contentType.toLowerCase().split(";")[0]?.trim() ?? "";
|
|
119
|
-
return MARKDOWN_CONTENT_TYPES.has(normalized);
|
|
120
|
-
}
|
|
121
|
-
function isPlainTextContentType(contentType) {
|
|
122
|
-
if (!contentType) {
|
|
123
|
-
return false;
|
|
124
|
-
}
|
|
125
|
-
const normalized = contentType.toLowerCase().split(";")[0]?.trim() ?? "";
|
|
126
|
-
return normalized === PLAINTEXT_CONTENT_TYPE;
|
|
127
|
-
}
|
|
128
|
-
function hasMarkdownFileExtension(urlValue) {
|
|
129
|
-
let pathname;
|
|
130
|
-
try {
|
|
131
|
-
pathname = new URL(urlValue).pathname;
|
|
132
|
-
}
|
|
133
|
-
catch {
|
|
134
|
-
return false;
|
|
135
|
-
}
|
|
136
|
-
const normalizedPath = pathname.toLowerCase();
|
|
137
|
-
return MARKDOWN_FILE_EXTENSIONS.some((extension) => normalizedPath.endsWith(extension));
|
|
138
|
-
}
|
|
139
|
-
function shouldTreatAsMarkdownPassthrough(result) {
|
|
140
|
-
if (isMarkdownContentType(result.contentType)) {
|
|
141
|
-
return true;
|
|
142
|
-
}
|
|
143
|
-
return isPlainTextContentType(result.contentType) && hasMarkdownFileExtension(result.finalUrl);
|
|
144
|
-
}
|
|
145
|
-
function countWords(value) {
|
|
146
|
-
const trimmed = value.trim();
|
|
147
|
-
if (!trimmed) {
|
|
148
|
-
return 0;
|
|
149
|
-
}
|
|
150
|
-
return trimmed.split(/\s+/).length;
|
|
151
|
-
}
|
|
152
|
-
function shouldAutoFallback(markdown) {
|
|
153
|
-
const trimmed = markdown.trim();
|
|
154
|
-
if (!trimmed) {
|
|
155
|
-
return true;
|
|
156
|
-
}
|
|
157
|
-
const lower = trimmed.toLowerCase();
|
|
158
|
-
if (/enable javascript|javascript is required|checking your browser|just a moment|please wait/.test(lower)) {
|
|
159
|
-
return true;
|
|
160
|
-
}
|
|
161
|
-
const nonEmptyLines = trimmed.split(/\r?\n/).filter((line) => line.trim().length > 0).length;
|
|
162
|
-
return countWords(trimmed) < 30 && nonEmptyLines <= 2;
|
|
163
|
-
}
|
|
164
|
-
/**
|
|
165
|
-
* Validate and normalize parsed CLI arguments into the canonical runtime shape.
|
|
166
|
-
* Fails fast with {@link InputError} on malformed input.
|
|
167
|
-
*/
|
|
168
|
-
function normalizeArgs(urlInput, options) {
|
|
169
|
-
if (!urlInput) {
|
|
170
|
-
throw new InputError("A URL argument is required.");
|
|
171
|
-
}
|
|
172
|
-
let parsedUrl;
|
|
173
|
-
try {
|
|
174
|
-
parsedUrl = new URL(urlInput);
|
|
175
|
-
}
|
|
176
|
-
catch (error) {
|
|
177
|
-
throw new InputError(`Invalid URL \"${urlInput}\".`, {
|
|
178
|
-
cause: error instanceof Error ? error : undefined
|
|
179
|
-
});
|
|
180
|
-
}
|
|
181
|
-
if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") {
|
|
182
|
-
throw new InputError(`Unsupported URL protocol \"${parsedUrl.protocol}\". Only http:// and https:// are supported.`);
|
|
183
|
-
}
|
|
184
|
-
const dynamic = options.dynamic ?? false;
|
|
185
|
-
const auto = options.auto ?? false;
|
|
186
|
-
if (dynamic && auto) {
|
|
187
|
-
throw new InputError("--dynamic and --auto cannot be used together.");
|
|
188
|
-
}
|
|
189
|
-
const timeouts = parseTimeouts(options.timeoutMs, dynamic, auto);
|
|
190
|
-
return {
|
|
191
|
-
url: parsedUrl.toString(),
|
|
192
|
-
auto,
|
|
193
|
-
dynamic,
|
|
194
|
-
format: parseFormat(options.format ?? "markdown"),
|
|
195
|
-
outputPath: options.output,
|
|
196
|
-
timeoutMs: timeouts.timeoutMs,
|
|
197
|
-
dynamicTimeoutMs: timeouts.dynamicTimeoutMs,
|
|
198
|
-
headers: parseHeaders(options.header ?? [])
|
|
199
|
-
};
|
|
200
|
-
}
|
|
201
|
-
function prepareContentFromFetchResult(result, deps) {
|
|
202
|
-
if (shouldTreatAsMarkdownPassthrough(result)) {
|
|
203
|
-
const markdown = normalizeMarkdown(result.body);
|
|
204
|
-
return {
|
|
205
|
-
markdown,
|
|
206
|
-
title: inferTitleFromMarkdown(markdown),
|
|
207
|
-
source: result,
|
|
208
|
-
passthrough: true
|
|
209
|
-
};
|
|
210
|
-
}
|
|
211
|
-
const markdown = deps.transformHtmlToMarkdown({ html: result.body });
|
|
212
|
-
return {
|
|
213
|
-
markdown,
|
|
214
|
-
title: extractHtmlTitle(result.body),
|
|
215
|
-
source: result,
|
|
216
|
-
passthrough: false
|
|
217
|
-
};
|
|
218
|
-
}
|
|
219
|
-
function formatOutput(args, content, usedDynamic) {
|
|
220
|
-
if (args.format === "markdown") {
|
|
221
|
-
return content.markdown;
|
|
222
|
-
}
|
|
223
|
-
const payload = {
|
|
224
|
-
url: args.url,
|
|
225
|
-
final_url: content.source.finalUrl,
|
|
226
|
-
title: content.title ?? null,
|
|
227
|
-
markdown: content.markdown,
|
|
228
|
-
content_type: content.source.contentType ?? null,
|
|
229
|
-
status: content.source.status,
|
|
230
|
-
fetched_at: new Date().toISOString(),
|
|
231
|
-
word_count: countWords(content.markdown),
|
|
232
|
-
sha256: createHash("sha256").update(content.markdown).digest("hex"),
|
|
233
|
-
used_dynamic: usedDynamic
|
|
234
|
-
};
|
|
235
|
-
return `${JSON.stringify(payload, null, 2)}\n`;
|
|
236
|
-
}
|
|
237
18
|
/**
|
|
238
19
|
* Execute one curldown CLI invocation and return process exit code.
|
|
239
20
|
* `argv` should not include the Node executable or script path.
|
|
@@ -310,28 +91,10 @@ export async function run(argv, deps = defaultDependencies) {
|
|
|
310
91
|
return curldownError.exitCode;
|
|
311
92
|
}
|
|
312
93
|
}
|
|
313
|
-
function resolvePathStrict(pathInput) {
|
|
314
|
-
return realpathSync(pathInput);
|
|
315
|
-
}
|
|
316
|
-
/**
|
|
317
|
-
* Determine whether this module was invoked as the CLI entrypoint.
|
|
318
|
-
* Resolves symlinks for both paths so global installs that expose a symlinked bin still execute.
|
|
319
|
-
*/
|
|
320
94
|
export function isMainModule(argvPath = process.argv[1]) {
|
|
321
|
-
|
|
322
|
-
return false;
|
|
323
|
-
}
|
|
324
|
-
try {
|
|
325
|
-
const invokedPath = resolvePathStrict(argvPath);
|
|
326
|
-
const modulePath = resolvePathStrict(fileURLToPath(import.meta.url));
|
|
327
|
-
return invokedPath === modulePath;
|
|
328
|
-
}
|
|
329
|
-
catch {
|
|
330
|
-
return pathToFileURL(argvPath).href === import.meta.url;
|
|
331
|
-
}
|
|
95
|
+
return isMainModuleFor(import.meta.url, argvPath);
|
|
332
96
|
}
|
|
333
|
-
|
|
334
|
-
if (isMain) {
|
|
97
|
+
if (isMainModule()) {
|
|
335
98
|
void run(process.argv.slice(2)).then((exitCode) => {
|
|
336
99
|
process.exitCode = exitCode;
|
|
337
100
|
});
|
package/dist/transform.js
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
+
import { Readability } from "@mozilla/readability";
|
|
1
2
|
import { load } from "cheerio";
|
|
3
|
+
import { JSDOM } from "jsdom";
|
|
2
4
|
import { createRequire } from "node:module";
|
|
3
5
|
import TurndownService from "turndown";
|
|
4
6
|
import { DEFAULT_REMOVE_SELECTORS } from "./constants.js";
|
|
@@ -12,22 +14,98 @@ const turndown = new TurndownService({
|
|
|
12
14
|
emDelimiter: "_"
|
|
13
15
|
});
|
|
14
16
|
turndown.use(turndownPluginGfm.gfm);
|
|
17
|
+
// Document URL handed to JSDOM when the caller supplies no page URL.
const FALLBACK_BASE_URL = "https://curldown.local/";
// CSS selector listing the elements considered as primary-content candidates.
const PRIMARY_CONTENT_SELECTOR = "main, article, [role='main']";
// Minimum whitespace-normalized text length a candidate needs to be accepted.
const MIN_PRIMARY_CONTENT_TEXT_LENGTH = 200;
|
|
20
|
+
/**
 * Length of `value` after collapsing whitespace runs to single spaces and
 * trimming both ends. Null or undefined input has length 0.
 */
function getNormalizedTextLength(value) {
    if (value == null) {
        return 0;
    }
    const collapsed = value.replace(/\s+/g, " ");
    return collapsed.trim().length;
}
|
|
23
|
+
/**
 * Strip noise from an HTML fragment before markdown conversion:
 * elements matching DEFAULT_REMOVE_SELECTORS, images without alt text, and
 * links that have neither visible text nor an image with alt text.
 * Returns the serialized HTML of the cleaned document ("" if none).
 */
function cleanupFragmentHtml(html) {
    const $ = load(html);
    const removalSelector = DEFAULT_REMOVE_SELECTORS.join(",");
    $(removalSelector).remove();
    $("img").each((_, image) => {
        const altText = $(image).attr("alt")?.trim() ?? "";
        if (altText === "") {
            $(image).remove();
        }
    });
    $("a").each((_, anchor) => {
        const link = $(anchor);
        if (getNormalizedTextLength(link.text()) !== 0) {
            return;
        }
        const describedImages = link
            .find("img")
            .toArray()
            .filter((image) => getNormalizedTextLength($(image).attr("alt")) > 0);
        if (describedImages.length === 0) {
            link.remove();
        }
    });
    const serialized = $.root().html();
    return serialized ?? "";
}
|
|
45
|
+
/**
 * Inner HTML of the document body, falling back to the root element's inner
 * HTML and finally to an empty string.
 */
function extractBodyHtml(document) {
    const bodyHtml = document.body?.innerHTML;
    if (bodyHtml != null) {
        return bodyHtml;
    }
    const rootHtml = document.documentElement?.innerHTML;
    return rootHtml ?? "";
}
|
|
48
|
+
/**
 * Pick the inner HTML of the primary-content element with the most text.
 * Candidates are the elements matching PRIMARY_CONTENT_SELECTOR; on equal
 * text length the earliest match wins. Returns undefined when no candidate
 * has text or the best one is shorter than MIN_PRIMARY_CONTENT_TEXT_LENGTH.
 */
function selectSemanticPrimaryHtml(document) {
    let winner;
    for (const element of document.querySelectorAll(PRIMARY_CONTENT_SELECTOR)) {
        const html = element.innerHTML;
        const textLength = getNormalizedTextLength(element.textContent);
        if (textLength === 0) {
            continue;
        }
        // Strict comparison keeps the first element when lengths tie.
        if (winner === undefined || textLength > winner.textLength) {
            winner = { html, textLength };
        }
    }
    if (winner === undefined || winner.textLength < MIN_PRIMARY_CONTENT_TEXT_LENGTH) {
        return undefined;
    }
    return winner.html;
}
|
|
62
|
+
/**
 * Run Readability on `document` and return the extracted article HTML.
 * Returns undefined when parsing yields nothing, the article has no text, or
 * the article carries no content.
 */
function selectReadabilityHtml(document) {
    const parsedArticle = new Readability(document).parse();
    if (!parsedArticle) {
        return undefined;
    }
    const hasText = getNormalizedTextLength(parsedArticle.textContent) !== 0;
    return hasText ? (parsedArticle.content ?? undefined) : undefined;
}
|
|
69
|
+
/**
 * Clean an HTML fragment and convert it to trimmed markdown.
 * Returns undefined when the input is empty, nothing is left after cleanup,
 * or the conversion yields no text.
 */
function toMarkdownCandidate(html) {
    const cleanedHtml = html ? cleanupFragmentHtml(html) : "";
    if (cleanedHtml.trim() === "") {
        return undefined;
    }
    const converted = turndown.turndown(cleanedHtml).trim();
    return converted === "" ? undefined : converted;
}
|
|
80
|
+
/**
 * Return the first line of `markdown` that is non-empty after trimming
 * (the trimmed line itself), or undefined when every line is blank.
 */
function getFirstMeaningfulMarkdownLine(markdown) {
    for (const rawLine of markdown.split(/\r?\n/)) {
        const line = rawLine.trim();
        if (line.length > 0) {
            return line;
        }
    }
    return undefined;
}
|
|
86
|
+
/**
 * Tell whether the first non-blank line of `markdown` is a level-one
 * heading, i.e. `#` followed by whitespace and some text.
 */
function startsWithPrimaryHeading(markdown) {
    const firstLine = getFirstMeaningfulMarkdownLine(markdown) ?? "";
    return /^#\s+\S/.test(firstLine);
}
|
|
15
89
|
/**
|
|
16
90
|
* Convert fetched HTML into markdown.
|
|
17
|
-
* The function
|
|
18
|
-
*
|
|
91
|
+
* The function prefers semantic primary-content containers, falls back to
|
|
92
|
+
* Readability for unstructured pages, and only converts the full body when
|
|
93
|
+
* no stronger content signal exists.
|
|
19
94
|
*/
|
|
20
95
|
export function transformHtmlToMarkdown(input) {
|
|
21
|
-
const
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
96
|
+
const dom = new JSDOM(input.html, {
|
|
97
|
+
url: input.url ?? FALLBACK_BASE_URL
|
|
98
|
+
});
|
|
99
|
+
const { document } = dom.window;
|
|
100
|
+
const semanticMarkdown = toMarkdownCandidate(selectSemanticPrimaryHtml(document));
|
|
101
|
+
const readabilityMarkdown = toMarkdownCandidate(selectReadabilityHtml(new JSDOM(input.html, { url: input.url ?? FALLBACK_BASE_URL }).window.document));
|
|
102
|
+
const fallbackMarkdown = toMarkdownCandidate(extractBodyHtml(document));
|
|
103
|
+
const markdown = semanticMarkdown && startsWithPrimaryHeading(semanticMarkdown) && !startsWithPrimaryHeading(readabilityMarkdown ?? "")
|
|
104
|
+
? semanticMarkdown
|
|
105
|
+
: readabilityMarkdown ?? semanticMarkdown ?? fallbackMarkdown;
|
|
106
|
+
if (!markdown) {
|
|
25
107
|
throw new ConversionError("No HTML body content found to convert.");
|
|
26
108
|
}
|
|
27
|
-
const markdown = turndown.turndown(bodyHtml).trim();
|
|
28
|
-
if (markdown.length === 0) {
|
|
29
|
-
throw new ConversionError("HTML was fetched but produced empty markdown output.");
|
|
30
|
-
}
|
|
31
109
|
return `${markdown}\n`;
|
|
32
110
|
}
|
|
33
111
|
/** Extract document title from HTML head when available. */
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@jenslys/curldown",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.4",
|
|
4
4
|
"description": "Fetch URL content and convert it to markdown.",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -30,12 +30,15 @@
|
|
|
30
30
|
},
|
|
31
31
|
"dependencies": {
|
|
32
32
|
"@joplin/turndown-plugin-gfm": "^1.0.64",
|
|
33
|
+
"@mozilla/readability": "^0.6.0",
|
|
33
34
|
"cheerio": "^1.2.0",
|
|
34
35
|
"commander": "^14.0.3",
|
|
36
|
+
"jsdom": "^29.0.0",
|
|
35
37
|
"playwright": "^1.58.2",
|
|
36
38
|
"turndown": "^7.2.2"
|
|
37
39
|
},
|
|
38
40
|
"devDependencies": {
|
|
41
|
+
"@types/jsdom": "^28.0.1",
|
|
39
42
|
"@types/node": "^25.3.3",
|
|
40
43
|
"@types/turndown": "^5.0.6",
|
|
41
44
|
"typescript": "^5.9.3",
|