@jenslys/curldown 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -1
- package/dist/cli-args.js +108 -0
- package/dist/cli-content.js +119 -0
- package/dist/cli-main-module.js +22 -0
- package/dist/cli.js +8 -211
- package/dist/transform.js +88 -10
- package/package.json +4 -1
package/README.md
CHANGED
|
@@ -7,6 +7,7 @@ Fetch a webpage and return clean Markdown for AI workflows.
|
|
|
7
7
|
- Static mode: `fetch` HTML -> Cheerio cleanup -> Turndown markdown.
|
|
8
8
|
- Dynamic mode: headless Chromium (Playwright) -> HTML -> markdown.
|
|
9
9
|
- `--auto` tries static first and falls back to dynamic when static output is thin.
|
|
10
|
+
- Direct markdown responses are passed through (including `.md` URLs served as `text/plain`).
|
|
10
11
|
- `--format json` emits markdown plus metadata for agent pipelines.
|
|
11
12
|
|
|
12
13
|
## Install
|
|
@@ -82,7 +83,8 @@ Paste this into your `AGENTS.md` if you want agents to always use `curldown` for
|
|
|
82
83
|
```md
|
|
83
84
|
## Website Content Retrieval
|
|
84
85
|
|
|
85
|
-
-
|
|
86
|
+
- Use `curldown` for website/article page retrieval in agent workflows.
|
|
87
|
+
- Do not use `curldown` for raw code files or repository file blobs (for those, fetch the file directly).
|
|
86
88
|
- Default command: `curldown <url>`.
|
|
87
89
|
- Prefer `curldown <url> --auto` when page rendering might be uncertain.
|
|
88
90
|
- Use `curldown <url> --format json` when downstream steps need structured metadata.
|
package/dist/cli-args.js
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import { Command } from "commander";
|
|
2
|
+
import { DEFAULT_DYNAMIC_TIMEOUT_MS, DEFAULT_STATIC_TIMEOUT_MS, VERSION } from "./constants.js";
|
|
3
|
+
import { InputError } from "./errors.js";
|
|
4
|
+
/**
 * Accumulator for repeatable CLI options.
 * Returns a fresh array containing every previously collected value followed
 * by the newest one; the `previous` array itself is left untouched.
 */
function collectRepeatable(value, previous = []) {
    const collected = [...previous];
    collected.push(value);
    return collected;
}
|
|
7
|
+
/**
 * Build the commander program that describes the curldown command line:
 * one required `<url>` argument plus the mode, format, output, timeout and
 * repeatable header options.
 *
 * @returns {Command} The configured (not yet parsed) program.
 */
export function buildProgram() {
    const program = new Command();
    program.name("curldown");
    program.description("Fetch URL content and convert it to markdown.");
    program.version(VERSION);
    program.argument("<url>", "The URL to fetch");
    program.option("--dynamic", "Use headless Chromium (Playwright) to render the page");
    program.option("--auto", "Try static first and fallback to dynamic when static output is thin");
    program.option("--format <type>", "Output format: markdown|json", "markdown");
    program.option("-o, --output <path>", "Write output to a file instead of stdout");
    program.option("--timeout-ms <number>", "Timeout in milliseconds");
    // Each --header occurrence is appended to the array by collectRepeatable.
    program.option("--header <key:value>", "Set custom request header", collectRepeatable, []);
    program.showHelpAfterError();
    // NOTE(review): per commander's docs exitOverride() makes it throw instead
    // of calling process.exit — confirm the caller handles that error.
    program.exitOverride();
    return program;
}
|
|
22
|
+
/**
 * Convert raw `key:value` strings into a header object.
 * The first colon separates key from value, so values may contain colons.
 * Later entries overwrite earlier ones that share the same key.
 *
 * @param rawHeaders Iterable of raw `key:value` strings.
 * @returns Plain object mapping trimmed header names to trimmed values.
 * @throws {InputError} When an entry lacks a colon, or its key or value is empty.
 */
function parseHeaders(rawHeaders) {
    const result = {};
    for (const entry of rawHeaders) {
        const colonAt = entry.indexOf(":");
        const nameIsPresent = colonAt > 0;
        const valueIsPresent = colonAt !== entry.length - 1;
        if (!nameIsPresent || !valueIsPresent) {
            throw new InputError(`Invalid --header value \"${entry}\". Use key:value format.`);
        }
        const name = entry.slice(0, colonAt).trim();
        const content = entry.slice(colonAt + 1).trim();
        // Whitespace-only halves survive the positional check above, so re-check after trimming.
        if (name === "" || content === "") {
            throw new InputError(`Invalid --header value \"${entry}\". Header key and value are required.`);
        }
        result[name] = content;
    }
    return result;
}
|
|
38
|
+
/**
 * Validate the `--format` value.
 *
 * @returns The value unchanged when it is "markdown" or "json".
 * @throws {InputError} For any other value.
 */
function parseFormat(rawFormat) {
    const supportedFormats = ["markdown", "json"];
    if (!supportedFormats.includes(rawFormat)) {
        throw new InputError(`Invalid --format value \"${rawFormat}\". Use \"markdown\" or \"json\".`);
    }
    return rawFormat;
}
|
|
44
|
+
/**
 * Resolve the static and dynamic timeouts for one invocation.
 *
 * Without an explicit `--timeout-ms`, the dynamic timeout is always the
 * dynamic default and the primary timeout is the dynamic default in
 * `--dynamic` mode, otherwise the static default. With an explicit value,
 * both timeouts take that value.
 *
 * @param rawTimeout Raw `--timeout-ms` value (string or number), or undefined.
 * @param dynamic Whether `--dynamic` was requested.
 * @param auto Whether `--auto` was requested; kept for signature
 *   compatibility — auto mode uses the same defaults as static mode.
 * @returns {{timeoutMs: number, dynamicTimeoutMs: number}}
 * @throws {InputError} When the value is not a positive whole number.
 */
function parseTimeouts(rawTimeout, dynamic, auto) {
    if (rawTimeout === undefined) {
        return {
            timeoutMs: dynamic ? DEFAULT_DYNAMIC_TIMEOUT_MS : DEFAULT_STATIC_TIMEOUT_MS,
            dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
        };
    }
    // Number.parseInt alone silently accepts trailing garbage ("10abc" -> 10,
    // "1.5" -> 1, "5e3" -> 5), so require the whole token to be digits first.
    const text = String(rawTimeout).trim();
    const parsed = /^\+?\d+$/.test(text) ? Number(text) : Number.NaN;
    if (!Number.isSafeInteger(parsed) || parsed <= 0) {
        throw new InputError(`Invalid --timeout-ms value \"${rawTimeout}\". Must be a positive integer.`);
    }
    return {
        timeoutMs: parsed,
        dynamicTimeoutMs: parsed
    };
}
|
|
72
|
+
/**
 * Turn the positional URL and the parsed option bag into the canonical
 * runtime argument object used by the rest of the CLI.
 *
 * Checks, in order: a URL was supplied, it parses, its protocol is http(s),
 * `--dynamic` and `--auto` are not combined, then the timeout, format and
 * header values.
 *
 * @throws {InputError} On the first check that fails.
 */
export function normalizeArgs(urlInput, options) {
    if (!urlInput) {
        throw new InputError("A URL argument is required.");
    }
    let target;
    try {
        target = new URL(urlInput);
    }
    catch (error) {
        const cause = error instanceof Error ? error : undefined;
        throw new InputError(`Invalid URL \"${urlInput}\".`, { cause });
    }
    const isHttp = target.protocol === "http:" || target.protocol === "https:";
    if (!isHttp) {
        throw new InputError(`Unsupported URL protocol \"${target.protocol}\". Only http:// and https:// are supported.`);
    }
    const useDynamic = options.dynamic ?? false;
    const useAuto = options.auto ?? false;
    if (useDynamic && useAuto) {
        throw new InputError("--dynamic and --auto cannot be used together.");
    }
    const { timeoutMs, dynamicTimeoutMs } = parseTimeouts(options.timeoutMs, useDynamic, useAuto);
    const format = parseFormat(options.format ?? "markdown");
    const headers = parseHeaders(options.header ?? []);
    return {
        url: target.toString(),
        auto: useAuto,
        dynamic: useDynamic,
        format,
        outputPath: options.output,
        timeoutMs,
        dynamicTimeoutMs,
        headers
    };
}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { ConversionError } from "./errors.js";
|
|
3
|
+
import { extractHtmlTitle } from "./transform.js";
|
|
4
|
+
// Media types that are treated as markdown without inspecting the URL.
const MARKDOWN_CONTENT_TYPES = new Set(["text/markdown", "text/x-markdown", "application/markdown", "application/x-markdown"]);
// Plain-text responses only count as markdown when the URL path also has a markdown extension.
const PLAINTEXT_CONTENT_TYPE = "text/plain";
// Lower-case path suffixes recognised as markdown files.
const MARKDOWN_FILE_EXTENSIONS = [".md", ".markdown", ".mdown", ".mkd", ".mkdn", ".mdtxt", ".mdx"];
|
|
20
|
+
/**
 * Strip surrounding whitespace from markdown and terminate it with exactly
 * one newline.
 *
 * @throws {ConversionError} When nothing but whitespace remains.
 */
function normalizeMarkdown(markdown) {
    const body = markdown.trim();
    if (body.length === 0) {
        throw new ConversionError("Content was fetched but markdown output is empty.");
    }
    return body + "\n";
}
|
|
27
|
+
/**
 * Infer a document title from the first level-one ATX heading (`# Title`).
 *
 * The gap between `#` and the heading text must be spaces or tabs on the same
 * line. Matching general whitespace there would let an empty `#` line swallow
 * the following newline and report the next line of body text as the title.
 *
 * @param markdown Markdown source to scan.
 * @returns The trimmed heading text, or undefined when no non-empty
 *   level-one heading exists.
 */
function inferTitleFromMarkdown(markdown) {
    const match = /^#[ \t]+(\S.*)$/m.exec(markdown);
    const title = match?.[1].trim();
    return title || undefined;
}
|
|
31
|
+
/**
 * Report whether a Content-Type header value names a markdown media type.
 * Parameters after the first `;` (such as charset) and letter case are ignored.
 * A missing or empty header yields false.
 */
function isMarkdownContentType(contentType) {
    if (!contentType) {
        return false;
    }
    const [mediaType = ""] = contentType.toLowerCase().split(";");
    return MARKDOWN_CONTENT_TYPES.has(mediaType.trim());
}
|
|
38
|
+
/**
 * Report whether a Content-Type header value is `text/plain`.
 * Parameters after the first `;` and letter case are ignored.
 * A missing or empty header yields false.
 */
function isPlainTextContentType(contentType) {
    if (!contentType) {
        return false;
    }
    const [mediaType = ""] = contentType.toLowerCase().split(";");
    return mediaType.trim() === PLAINTEXT_CONTENT_TYPE;
}
|
|
45
|
+
/**
 * Report whether the path component of a URL ends with a known markdown
 * extension (case-insensitive). Query string and fragment are not considered
 * because only `pathname` is inspected. Unparseable URLs yield false.
 */
function hasMarkdownFileExtension(urlValue) {
    let lowerCasePath;
    try {
        lowerCasePath = new URL(urlValue).pathname.toLowerCase();
    }
    catch {
        return false;
    }
    for (const extension of MARKDOWN_FILE_EXTENSIONS) {
        if (lowerCasePath.endsWith(extension)) {
            return true;
        }
    }
    return false;
}
|
|
56
|
+
/**
 * Decide whether a fetch result already is markdown and can skip HTML
 * conversion: either its content type is a markdown media type, or it is
 * `text/plain` and the final URL path has a markdown file extension.
 */
function shouldTreatAsMarkdownPassthrough(result) {
    const { contentType, finalUrl } = result;
    return isMarkdownContentType(contentType)
        || (isPlainTextContentType(contentType) && hasMarkdownFileExtension(finalUrl));
}
|
|
62
|
+
/**
 * Count whitespace-delimited tokens in a string.
 * Empty and whitespace-only strings count as zero words.
 */
function countWords(value) {
    const tokens = value.match(/\S+/g);
    return tokens === null ? 0 : tokens.length;
}
|
|
69
|
+
/**
 * Heuristic used by `--auto`: decide whether statically fetched markdown is
 * too thin to trust, so the dynamic fetch should be attempted.
 *
 * Returns true when the markdown is empty, contains one of the known
 * "JavaScript required / interstitial" phrases (case-insensitive), or has
 * fewer than 30 words spread over at most two non-empty lines.
 */
export function shouldAutoFallback(markdown) {
    const content = markdown.trim();
    if (content.length === 0) {
        return true;
    }
    const interstitialPattern = /enable javascript|javascript is required|checking your browser|just a moment|please wait/;
    if (interstitialPattern.test(content.toLowerCase())) {
        return true;
    }
    const populatedLines = content
        .split(/\r?\n/)
        .filter((line) => line.trim().length > 0);
    const isShort = countWords(content) < 30;
    return isShort && populatedLines.length <= 2;
}
|
|
81
|
+
/**
 * Produce markdown and a title from a fetch result.
 *
 * Markdown responses are normalized and passed through, with the title taken
 * from the markdown itself. Everything else is converted through
 * `deps.transformHtmlToMarkdown`, with the title taken from the HTML.
 *
 * @returns {{markdown: string, title: (string|undefined), source: object, passthrough: boolean}}
 */
export function prepareContentFromFetchResult(result, deps) {
    const passthrough = shouldTreatAsMarkdownPassthrough(result);
    let markdown;
    let title;
    if (passthrough) {
        markdown = normalizeMarkdown(result.body);
        title = inferTitleFromMarkdown(markdown);
    }
    else {
        markdown = deps.transformHtmlToMarkdown({
            html: result.body,
            url: result.finalUrl
        });
        title = extractHtmlTitle(result.body);
    }
    return { markdown, title, source: result, passthrough };
}
|
|
102
|
+
/**
 * Render prepared content in the requested output format.
 *
 * For "markdown" the markdown string is returned as-is. Otherwise a
 * pretty-printed JSON document (2-space indent, trailing newline) is
 * returned, carrying the markdown plus metadata: requested and final URL,
 * title, content type, HTTP status, timestamp, word count, SHA-256 of the
 * markdown, and whether the dynamic fetcher was used.
 */
export function formatOutput(args, content, usedDynamic) {
    const { markdown, source } = content;
    if (args.format === "markdown") {
        return markdown;
    }
    const fetchedAt = new Date().toISOString();
    const wordCount = countWords(markdown);
    const digest = createHash("sha256").update(markdown).digest("hex");
    const payload = {
        url: args.url,
        final_url: source.finalUrl,
        title: content.title ?? null,
        markdown,
        content_type: source.contentType ?? null,
        status: source.status,
        fetched_at: fetchedAt,
        word_count: wordCount,
        sha256: digest,
        used_dynamic: usedDynamic
    };
    return JSON.stringify(payload, null, 2) + "\n";
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { realpathSync } from "node:fs";
|
|
2
|
+
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
3
|
+
/**
 * Resolve a path to its canonical form via `realpathSync`.
 * Any error raised by `realpathSync` propagates to the caller.
 */
function resolvePathStrict(pathInput) {
    const canonicalPath = realpathSync(pathInput);
    return canonicalPath;
}
|
|
6
|
+
/**
 * Report whether `argvPath` refers to the module identified by `moduleUrl`.
 *
 * Both sides are resolved to real paths first, so an entrypoint reached
 * through a symlink (for example a globally installed bin) still matches.
 * If either path cannot be resolved, the comparison falls back to matching
 * the file URL of `argvPath` against `moduleUrl` literally.
 *
 * @param moduleUrl The module's `import.meta.url`.
 * @param argvPath Script path to compare; defaults to `process.argv[1]`.
 * @returns {boolean} False when no script path is available.
 */
export function isMainModuleFor(moduleUrl, argvPath = process.argv[1]) {
    if (argvPath === undefined) {
        return false;
    }
    let pathsMatch;
    try {
        pathsMatch = resolvePathStrict(argvPath) === resolvePathStrict(fileURLToPath(moduleUrl));
    }
    catch {
        return pathToFileURL(argvPath).href === moduleUrl;
    }
    return pathsMatch;
}
|
package/dist/cli.js
CHANGED
|
@@ -1,20 +1,13 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
6
|
-
import {
|
|
7
|
-
import { asCurldownError, ConversionError, InputError } from "./errors.js";
|
|
2
|
+
import { CommanderError } from "commander";
|
|
3
|
+
import { buildProgram, normalizeArgs } from "./cli-args.js";
|
|
4
|
+
import { formatOutput, prepareContentFromFetchResult, shouldAutoFallback } from "./cli-content.js";
|
|
5
|
+
import { isMainModuleFor } from "./cli-main-module.js";
|
|
6
|
+
import { asCurldownError } from "./errors.js";
|
|
8
7
|
import { fetchDynamicHtml } from "./fetch-dynamic.js";
|
|
9
8
|
import { fetchStaticHtml } from "./fetch-static.js";
|
|
10
9
|
import { writeOutput } from "./output.js";
|
|
11
|
-
import {
|
|
12
|
-
const MARKDOWN_CONTENT_TYPES = new Set([
|
|
13
|
-
"text/markdown",
|
|
14
|
-
"text/x-markdown",
|
|
15
|
-
"application/markdown",
|
|
16
|
-
"application/x-markdown"
|
|
17
|
-
]);
|
|
10
|
+
import { transformHtmlToMarkdown } from "./transform.js";
|
|
18
11
|
const defaultDependencies = {
|
|
19
12
|
fetchStatic: fetchStaticHtml,
|
|
20
13
|
fetchDynamic: fetchDynamicHtml,
|
|
@@ -22,184 +15,6 @@ const defaultDependencies = {
|
|
|
22
15
|
writeOutput,
|
|
23
16
|
stderrWrite: (message) => process.stderr.write(message)
|
|
24
17
|
};
|
|
25
|
-
function collectRepeatable(value, previous = []) {
|
|
26
|
-
return [...previous, value];
|
|
27
|
-
}
|
|
28
|
-
function buildProgram() {
|
|
29
|
-
return new Command()
|
|
30
|
-
.name("curldown")
|
|
31
|
-
.description("Fetch URL content and convert it to markdown.")
|
|
32
|
-
.version(VERSION)
|
|
33
|
-
.argument("<url>", "The URL to fetch")
|
|
34
|
-
.option("--dynamic", "Use headless Chromium (Playwright) to render the page")
|
|
35
|
-
.option("--auto", "Try static first and fallback to dynamic when static output is thin")
|
|
36
|
-
.option("--format <type>", "Output format: markdown|json", "markdown")
|
|
37
|
-
.option("-o, --output <path>", "Write output to a file instead of stdout")
|
|
38
|
-
.option("--timeout-ms <number>", "Timeout in milliseconds")
|
|
39
|
-
.option("--header <key:value>", "Set custom request header", collectRepeatable, [])
|
|
40
|
-
.showHelpAfterError()
|
|
41
|
-
.exitOverride();
|
|
42
|
-
}
|
|
43
|
-
function parseHeaders(rawHeaders) {
|
|
44
|
-
const headers = {};
|
|
45
|
-
for (const rawHeader of rawHeaders) {
|
|
46
|
-
const separatorIndex = rawHeader.indexOf(":");
|
|
47
|
-
if (separatorIndex <= 0 || separatorIndex === rawHeader.length - 1) {
|
|
48
|
-
throw new InputError(`Invalid --header value \"${rawHeader}\". Use key:value format.`);
|
|
49
|
-
}
|
|
50
|
-
const key = rawHeader.slice(0, separatorIndex).trim();
|
|
51
|
-
const value = rawHeader.slice(separatorIndex + 1).trim();
|
|
52
|
-
if (!key || !value) {
|
|
53
|
-
throw new InputError(`Invalid --header value \"${rawHeader}\". Header key and value are required.`);
|
|
54
|
-
}
|
|
55
|
-
headers[key] = value;
|
|
56
|
-
}
|
|
57
|
-
return headers;
|
|
58
|
-
}
|
|
59
|
-
function parseFormat(rawFormat) {
|
|
60
|
-
if (rawFormat === "markdown" || rawFormat === "json") {
|
|
61
|
-
return rawFormat;
|
|
62
|
-
}
|
|
63
|
-
throw new InputError(`Invalid --format value \"${rawFormat}\". Use \"markdown\" or \"json\".`);
|
|
64
|
-
}
|
|
65
|
-
function parseTimeouts(rawTimeout, dynamic, auto) {
|
|
66
|
-
if (rawTimeout === undefined) {
|
|
67
|
-
if (dynamic) {
|
|
68
|
-
return {
|
|
69
|
-
timeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS,
|
|
70
|
-
dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
|
|
71
|
-
};
|
|
72
|
-
}
|
|
73
|
-
if (auto) {
|
|
74
|
-
return {
|
|
75
|
-
timeoutMs: DEFAULT_STATIC_TIMEOUT_MS,
|
|
76
|
-
dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
|
|
77
|
-
};
|
|
78
|
-
}
|
|
79
|
-
return {
|
|
80
|
-
timeoutMs: DEFAULT_STATIC_TIMEOUT_MS,
|
|
81
|
-
dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
|
|
82
|
-
};
|
|
83
|
-
}
|
|
84
|
-
const parsed = Number.parseInt(rawTimeout, 10);
|
|
85
|
-
if (!Number.isInteger(parsed) || parsed <= 0) {
|
|
86
|
-
throw new InputError(`Invalid --timeout-ms value \"${rawTimeout}\". Must be a positive integer.`);
|
|
87
|
-
}
|
|
88
|
-
return {
|
|
89
|
-
timeoutMs: parsed,
|
|
90
|
-
dynamicTimeoutMs: parsed
|
|
91
|
-
};
|
|
92
|
-
}
|
|
93
|
-
function normalizeMarkdown(markdown) {
|
|
94
|
-
const trimmed = markdown.trim();
|
|
95
|
-
if (!trimmed) {
|
|
96
|
-
throw new ConversionError("Content was fetched but markdown output is empty.");
|
|
97
|
-
}
|
|
98
|
-
return `${trimmed}\n`;
|
|
99
|
-
}
|
|
100
|
-
function inferTitleFromMarkdown(markdown) {
|
|
101
|
-
const firstHeading = markdown.match(/^#\s+(.+)$/m)?.[1]?.trim();
|
|
102
|
-
return firstHeading || undefined;
|
|
103
|
-
}
|
|
104
|
-
function isMarkdownContentType(contentType) {
|
|
105
|
-
if (!contentType) {
|
|
106
|
-
return false;
|
|
107
|
-
}
|
|
108
|
-
const normalized = contentType.toLowerCase().split(";")[0]?.trim() ?? "";
|
|
109
|
-
return MARKDOWN_CONTENT_TYPES.has(normalized);
|
|
110
|
-
}
|
|
111
|
-
function countWords(value) {
|
|
112
|
-
const trimmed = value.trim();
|
|
113
|
-
if (!trimmed) {
|
|
114
|
-
return 0;
|
|
115
|
-
}
|
|
116
|
-
return trimmed.split(/\s+/).length;
|
|
117
|
-
}
|
|
118
|
-
function shouldAutoFallback(markdown) {
|
|
119
|
-
const trimmed = markdown.trim();
|
|
120
|
-
if (!trimmed) {
|
|
121
|
-
return true;
|
|
122
|
-
}
|
|
123
|
-
const lower = trimmed.toLowerCase();
|
|
124
|
-
if (/enable javascript|javascript is required|checking your browser|just a moment|please wait/.test(lower)) {
|
|
125
|
-
return true;
|
|
126
|
-
}
|
|
127
|
-
const nonEmptyLines = trimmed.split(/\r?\n/).filter((line) => line.trim().length > 0).length;
|
|
128
|
-
return countWords(trimmed) < 30 && nonEmptyLines <= 2;
|
|
129
|
-
}
|
|
130
|
-
/**
|
|
131
|
-
* Validate and normalize parsed CLI arguments into the canonical runtime shape.
|
|
132
|
-
* Fails fast with {@link InputError} on malformed input.
|
|
133
|
-
*/
|
|
134
|
-
function normalizeArgs(urlInput, options) {
|
|
135
|
-
if (!urlInput) {
|
|
136
|
-
throw new InputError("A URL argument is required.");
|
|
137
|
-
}
|
|
138
|
-
let parsedUrl;
|
|
139
|
-
try {
|
|
140
|
-
parsedUrl = new URL(urlInput);
|
|
141
|
-
}
|
|
142
|
-
catch (error) {
|
|
143
|
-
throw new InputError(`Invalid URL \"${urlInput}\".`, {
|
|
144
|
-
cause: error instanceof Error ? error : undefined
|
|
145
|
-
});
|
|
146
|
-
}
|
|
147
|
-
if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") {
|
|
148
|
-
throw new InputError(`Unsupported URL protocol \"${parsedUrl.protocol}\". Only http:// and https:// are supported.`);
|
|
149
|
-
}
|
|
150
|
-
const dynamic = options.dynamic ?? false;
|
|
151
|
-
const auto = options.auto ?? false;
|
|
152
|
-
if (dynamic && auto) {
|
|
153
|
-
throw new InputError("--dynamic and --auto cannot be used together.");
|
|
154
|
-
}
|
|
155
|
-
const timeouts = parseTimeouts(options.timeoutMs, dynamic, auto);
|
|
156
|
-
return {
|
|
157
|
-
url: parsedUrl.toString(),
|
|
158
|
-
auto,
|
|
159
|
-
dynamic,
|
|
160
|
-
format: parseFormat(options.format ?? "markdown"),
|
|
161
|
-
outputPath: options.output,
|
|
162
|
-
timeoutMs: timeouts.timeoutMs,
|
|
163
|
-
dynamicTimeoutMs: timeouts.dynamicTimeoutMs,
|
|
164
|
-
headers: parseHeaders(options.header ?? [])
|
|
165
|
-
};
|
|
166
|
-
}
|
|
167
|
-
function prepareContentFromFetchResult(result, deps) {
|
|
168
|
-
if (isMarkdownContentType(result.contentType)) {
|
|
169
|
-
const markdown = normalizeMarkdown(result.body);
|
|
170
|
-
return {
|
|
171
|
-
markdown,
|
|
172
|
-
title: inferTitleFromMarkdown(markdown),
|
|
173
|
-
source: result,
|
|
174
|
-
passthrough: true
|
|
175
|
-
};
|
|
176
|
-
}
|
|
177
|
-
const markdown = deps.transformHtmlToMarkdown({ html: result.body });
|
|
178
|
-
return {
|
|
179
|
-
markdown,
|
|
180
|
-
title: extractHtmlTitle(result.body),
|
|
181
|
-
source: result,
|
|
182
|
-
passthrough: false
|
|
183
|
-
};
|
|
184
|
-
}
|
|
185
|
-
function formatOutput(args, content, usedDynamic) {
|
|
186
|
-
if (args.format === "markdown") {
|
|
187
|
-
return content.markdown;
|
|
188
|
-
}
|
|
189
|
-
const payload = {
|
|
190
|
-
url: args.url,
|
|
191
|
-
final_url: content.source.finalUrl,
|
|
192
|
-
title: content.title ?? null,
|
|
193
|
-
markdown: content.markdown,
|
|
194
|
-
content_type: content.source.contentType ?? null,
|
|
195
|
-
status: content.source.status,
|
|
196
|
-
fetched_at: new Date().toISOString(),
|
|
197
|
-
word_count: countWords(content.markdown),
|
|
198
|
-
sha256: createHash("sha256").update(content.markdown).digest("hex"),
|
|
199
|
-
used_dynamic: usedDynamic
|
|
200
|
-
};
|
|
201
|
-
return `${JSON.stringify(payload, null, 2)}\n`;
|
|
202
|
-
}
|
|
203
18
|
/**
|
|
204
19
|
* Execute one curldown CLI invocation and return process exit code.
|
|
205
20
|
* `argv` should not include the Node executable or script path.
|
|
@@ -276,28 +91,10 @@ export async function run(argv, deps = defaultDependencies) {
|
|
|
276
91
|
return curldownError.exitCode;
|
|
277
92
|
}
|
|
278
93
|
}
|
|
279
|
-
function resolvePathStrict(pathInput) {
|
|
280
|
-
return realpathSync(pathInput);
|
|
281
|
-
}
|
|
282
|
-
/**
|
|
283
|
-
* Determine whether this module was invoked as the CLI entrypoint.
|
|
284
|
-
* Resolves symlinks for both paths so global installs that expose a symlinked bin still execute.
|
|
285
|
-
*/
|
|
286
94
|
/**
 * Report whether this CLI module is the script identified by `argvPath`
 * (by default `process.argv[1]`), delegating the comparison to
 * `isMainModuleFor` with this module's own URL.
 */
export function isMainModule(argvPath = process.argv[1]) {
    const thisModuleUrl = import.meta.url;
    return isMainModuleFor(thisModuleUrl, argvPath);
}
|
|
299
|
-
|
|
300
|
-
if (isMain) {
|
|
97
|
+
if (isMainModule()) {
|
|
301
98
|
void run(process.argv.slice(2)).then((exitCode) => {
|
|
302
99
|
process.exitCode = exitCode;
|
|
303
100
|
});
|
package/dist/transform.js
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
+
import { Readability } from "@mozilla/readability";
|
|
1
2
|
import { load } from "cheerio";
|
|
3
|
+
import { JSDOM } from "jsdom";
|
|
2
4
|
import { createRequire } from "node:module";
|
|
3
5
|
import TurndownService from "turndown";
|
|
4
6
|
import { DEFAULT_REMOVE_SELECTORS } from "./constants.js";
|
|
@@ -12,22 +14,98 @@ const turndown = new TurndownService({
|
|
|
12
14
|
emDelimiter: "_"
|
|
13
15
|
});
|
|
14
16
|
turndown.use(turndownPluginGfm.gfm);
|
|
17
|
+
// Document URL handed to JSDOM when the caller does not supply one.
const FALLBACK_BASE_URL = "https://curldown.local/";
// Elements considered candidates for a page's primary content.
const PRIMARY_CONTENT_SELECTOR = ["main", "article", "[role='main']"].join(", ");
// A semantic candidate with less normalized text than this is ignored.
const MIN_PRIMARY_CONTENT_TEXT_LENGTH = 200;
|
|
20
|
+
/**
 * Length of a string after collapsing every whitespace run to one space and
 * trimming both ends. `null` and `undefined` count as length 0.
 */
function getNormalizedTextLength(value) {
    if (value === null || value === undefined) {
        return 0;
    }
    const collapsed = value.replace(/\s+/g, " ");
    return collapsed.trim().length;
}
|
|
23
|
+
/**
 * Strip noise from an HTML fragment before markdown conversion:
 *  - everything matching DEFAULT_REMOVE_SELECTORS,
 *  - images whose alt text is missing or blank,
 *  - links that contain neither text nor an image with alt text.
 *
 * @returns {string} The cleaned HTML as serialized by cheerio.
 */
function cleanupFragmentHtml(html) {
    const $ = load(html);
    const removalSelector = DEFAULT_REMOVE_SELECTORS.join(",");
    $(removalSelector).remove();
    for (const image of $("img").toArray()) {
        const altText = $(image).attr("alt")?.trim() ?? "";
        if (altText === "") {
            $(image).remove();
        }
    }
    for (const anchor of $("a").toArray()) {
        const link = $(anchor);
        const hasText = getNormalizedTextLength(link.text()) > 0;
        const hasDescribedImage = link
            .find("img")
            .toArray()
            .some((image) => getNormalizedTextLength($(image).attr("alt")) > 0);
        if (!hasText && !hasDescribedImage) {
            link.remove();
        }
    }
    return $.root().html() ?? "";
}
|
|
45
|
+
/**
 * Return the inner HTML of the document body, falling back to the root
 * element's inner HTML, and finally to an empty string.
 */
function extractBodyHtml(document) {
    const bodyHtml = document.body?.innerHTML;
    if (bodyHtml !== null && bodyHtml !== undefined) {
        return bodyHtml;
    }
    const rootHtml = document.documentElement?.innerHTML;
    return rootHtml ?? "";
}
|
|
48
|
+
/**
 * Pick the primary-content element (per PRIMARY_CONTENT_SELECTOR) holding the
 * most normalized text and return its inner HTML. On a tie the element that
 * comes first in document order wins.
 *
 * @returns {string|undefined} Undefined when no candidate has text or the
 *   best one is shorter than MIN_PRIMARY_CONTENT_TEXT_LENGTH.
 */
function selectSemanticPrimaryHtml(document) {
    let best;
    for (const element of document.querySelectorAll(PRIMARY_CONTENT_SELECTOR)) {
        const textLength = getNormalizedTextLength(element.textContent);
        // Strict comparison keeps the earliest element on ties and skips empty ones.
        if (textLength > (best?.textLength ?? 0)) {
            best = { element, textLength };
        }
    }
    if (best === undefined || best.textLength < MIN_PRIMARY_CONTENT_TEXT_LENGTH) {
        return undefined;
    }
    return best.element.innerHTML;
}
|
|
62
|
+
/**
 * Run Readability over the document and return the extracted article HTML.
 *
 * @returns {string|undefined} Undefined when Readability finds no article,
 *   the article has no text, or it carries no content.
 */
function selectReadabilityHtml(document) {
    const reader = new Readability(document);
    const article = reader.parse();
    if (!article) {
        return undefined;
    }
    if (getNormalizedTextLength(article.textContent) === 0) {
        return undefined;
    }
    return article.content ?? undefined;
}
|
|
69
|
+
/**
 * Clean an HTML fragment and convert it to trimmed markdown.
 *
 * @returns {string|undefined} Undefined when the input is empty/missing, the
 *   cleaned HTML is blank, or the conversion yields no text.
 */
function toMarkdownCandidate(html) {
    if (!html) {
        return undefined;
    }
    const cleaned = cleanupFragmentHtml(html);
    if (cleaned.trim() === "") {
        return undefined;
    }
    const converted = turndown.turndown(cleaned).trim();
    return converted === "" ? undefined : converted;
}
|
|
80
|
+
/**
 * Return the first line that is non-empty after trimming, in trimmed form,
 * or undefined when every line is blank.
 */
function getFirstMeaningfulMarkdownLine(markdown) {
    for (const rawLine of markdown.split(/\r?\n/)) {
        const line = rawLine.trim();
        if (line.length > 0) {
            return line;
        }
    }
    return undefined;
}
|
|
86
|
+
/**
 * Report whether the first non-blank line of the markdown is a level-one
 * heading (`#` followed by whitespace and at least one visible character).
 */
function startsWithPrimaryHeading(markdown) {
    const firstLine = getFirstMeaningfulMarkdownLine(markdown) ?? "";
    return /^#\s+\S/.test(firstLine);
}
|
|
15
89
|
/**
 * Convert fetched HTML into markdown.
 *
 * Three candidates are considered:
 *  1. semantic: the richest `main` / `article` / `[role='main']` container,
 *  2. readability: the article extracted by Readability,
 *  3. fallback: the whole document body.
 *
 * The semantic candidate wins only when it opens with a level-one heading and
 * the Readability candidate does not. Otherwise the first available of
 * readability, semantic, fallback is used. The full-body conversion runs only
 * when both other candidates are missing, and every JSDOM window created here
 * is closed before returning.
 *
 * @param input `{ html, url? }`; `url` becomes the document URL, defaulting to
 *   FALLBACK_BASE_URL.
 * @returns {string} Markdown terminated by a single newline.
 * @throws {ConversionError} When no candidate yields any markdown.
 */
export function transformHtmlToMarkdown(input) {
    const baseUrl = input.url ?? FALLBACK_BASE_URL;
    const openWindows = [];
    const parse = () => {
        const { window } = new JSDOM(input.html, { url: baseUrl });
        openWindows.push(window);
        return window.document;
    };
    try {
        const document = parse();
        const semanticMarkdown = toMarkdownCandidate(selectSemanticPrimaryHtml(document));
        // Readability gets its own parse of the same HTML.
        // NOTE(review): presumably because Readability mutates the document it
        // is given — confirm against the Readability documentation.
        const readabilityMarkdown = toMarkdownCandidate(selectReadabilityHtml(parse()));
        const preferSemantic = Boolean(semanticMarkdown)
            && startsWithPrimaryHeading(semanticMarkdown)
            && !startsWithPrimaryHeading(readabilityMarkdown ?? "");
        const markdown = preferSemantic
            ? semanticMarkdown
            : readabilityMarkdown
                ?? semanticMarkdown
                // Only pay for the full-body conversion when nothing better exists.
                ?? toMarkdownCandidate(extractBodyHtml(document));
        if (!markdown) {
            throw new ConversionError("No HTML body content found to convert.");
        }
        return `${markdown}\n`;
    }
    finally {
        for (const window of openWindows) {
            window.close();
        }
    }
}
|
|
33
111
|
/** Extract document title from HTML head when available. */
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@jenslys/curldown",
|
|
3
|
-
"version": "1.0.2",
|
|
3
|
+
"version": "1.0.4",
|
|
4
4
|
"description": "Fetch URL content and convert it to markdown.",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -30,12 +30,15 @@
|
|
|
30
30
|
},
|
|
31
31
|
"dependencies": {
|
|
32
32
|
"@joplin/turndown-plugin-gfm": "^1.0.64",
|
|
33
|
+
"@mozilla/readability": "^0.6.0",
|
|
33
34
|
"cheerio": "^1.2.0",
|
|
34
35
|
"commander": "^14.0.3",
|
|
36
|
+
"jsdom": "^29.0.0",
|
|
35
37
|
"playwright": "^1.58.2",
|
|
36
38
|
"turndown": "^7.2.2"
|
|
37
39
|
},
|
|
38
40
|
"devDependencies": {
|
|
41
|
+
"@types/jsdom": "^28.0.1",
|
|
39
42
|
"@types/node": "^25.3.3",
|
|
40
43
|
"@types/turndown": "^5.0.6",
|
|
41
44
|
"typescript": "^5.9.3",
|