@jenslys/curldown 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +80 -0
- package/dist/cli.js +149 -0
- package/dist/constants.js +13 -0
- package/dist/errors.js +55 -0
- package/dist/fetch-dynamic.js +35 -0
- package/dist/fetch-static.js +31 -0
- package/dist/output.js +23 -0
- package/dist/transform.js +38 -0
- package/dist/types.js +1 -0
- package/package.json +43 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jens
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# curldown
|
|
2
|
+
|
|
3
|
+
Fetch a webpage and return clean Markdown.
|
|
4
|
+
|
|
5
|
+
`curldown` is a CLI-first tool for AI agents and scripts:
|
|
6
|
+
|
|
7
|
+
- Static mode: `fetch` HTML -> Cheerio cleanup -> Turndown markdown.
|
|
8
|
+
- Dynamic mode: headless Chromium (Playwright) -> HTML -> markdown.
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
npm install -g @jenslys/curldown
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
bun add -g @jenslys/curldown
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Quick Start
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
# Print markdown to stdout
|
|
24
|
+
curldown https://example.com
|
|
25
|
+
|
|
26
|
+
# JS-heavy pages
|
|
27
|
+
curldown https://example.com --dynamic
|
|
28
|
+
|
|
29
|
+
# Write to file
|
|
30
|
+
curldown https://example.com --output page.md
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## CLI
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
curldown <url> [options]
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Options
|
|
40
|
+
|
|
41
|
+
- `--dynamic` Use Playwright Chromium to render before extraction.
|
|
42
|
+
- `-o, --output <path>` Write markdown to file instead of stdout.
|
|
43
|
+
- `--timeout-ms <number>` Request/render timeout in milliseconds (default: 15000 for static mode, 30000 with `--dynamic`).
|
|
44
|
+
- `--user-agent <string>` Override request user-agent.
|
|
45
|
+
- `--header <key:value>` Custom request header (repeatable).
|
|
46
|
+
- `--remove-selector <css>` Remove selector(s) before conversion (repeatable).
|
|
47
|
+
- `--help` Show help.
|
|
48
|
+
- `--version` Show version.
|
|
49
|
+
|
|
50
|
+
## Local Development
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
bun install
|
|
54
|
+
bun run build
|
|
55
|
+
bun run test
|
|
56
|
+
node dist/cli.js https://example.com
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## AGENTS.md Snippet (Optional)
|
|
60
|
+
|
|
61
|
+
Paste this into your `AGENTS.md` if you want agents to always use `curldown` for website content retrieval:
|
|
62
|
+
|
|
63
|
+
```md
|
|
64
|
+
## Website Content Retrieval
|
|
65
|
+
|
|
66
|
+
- Always use `curldown` to fetch web pages for agent workflows.
|
|
67
|
+
- Default command: `curldown <url>`.
|
|
68
|
+
- If the page is JS-rendered or incomplete, retry with: `curldown <url> --dynamic`.
|
|
69
|
+
- Prefer stdout output unless a task explicitly requires a file (`--output <path>`).
|
|
70
|
+
- Do not use ad-hoc HTML scraping or direct browser automation when `curldown` can handle it.
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Exit Codes
|
|
74
|
+
|
|
75
|
+
- `0` success
|
|
76
|
+
- `1` input/usage error
|
|
77
|
+
- `2` static fetch/network error
|
|
78
|
+
- `3` dynamic render/browser error
|
|
79
|
+
- `4` output write error
|
|
80
|
+
- `5` conversion pipeline error
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { realpathSync } from "node:fs";
import { pathToFileURL } from "node:url";
import { Command, CommanderError } from "commander";
import { DEFAULT_DYNAMIC_TIMEOUT_MS, DEFAULT_STATIC_TIMEOUT_MS, DEFAULT_USER_AGENT, VERSION } from "./constants.js";
import { asCurldownError, InputError } from "./errors.js";
import { fetchDynamicHtml } from "./fetch-dynamic.js";
import { fetchStaticHtml } from "./fetch-static.js";
import { writeOutput } from "./output.js";
import { transformHtmlToMarkdown } from "./transform.js";
|
|
10
|
+
/**
 * Production wiring for {@link run}: the real fetchers, the HTML-to-markdown
 * transform, the output writer, and a stderr sink. `run` accepts a replacement
 * object with the same keys.
 */
const defaultDependencies = {
    fetchStatic: fetchStaticHtml,
    fetchDynamic: fetchDynamicHtml,
    transformHtmlToMarkdown: transformHtmlToMarkdown,
    writeOutput: writeOutput,
    stderrWrite(message) {
        return process.stderr.write(message);
    }
};
|
|
17
|
+
/**
 * Accumulator for repeatable CLI options: returns a new array holding every
 * previously collected value followed by `value`. `previous` is not mutated.
 */
function collectRepeatable(value, previous = []) {
    const collected = previous.slice();
    collected.push(value);
    return collected;
}
|
|
20
|
+
/**
 * Build the Commander program describing the curldown command line:
 * one required `<url>` argument plus the supported options.
 * `exitOverride()` is enabled so Commander throws instead of exiting the process.
 */
function buildProgram() {
    const program = new Command();
    program.name("curldown");
    program.description("Fetch URL content and convert it to markdown.");
    program.version(VERSION);
    program.argument("<url>", "The URL to fetch");
    program.option("--dynamic", "Use headless Chromium (Playwright) to render the page");
    program.option("-o, --output <path>", "Write markdown to a file instead of stdout");
    program.option("--timeout-ms <number>", "Timeout in milliseconds");
    program.option("--user-agent <string>", "Override request user-agent");
    // Repeatable options accumulate into arrays via collectRepeatable.
    program.option("--header <key:value>", "Set custom request header", collectRepeatable, []);
    program.option("--remove-selector <css>", "Remove matching selector(s) before markdown conversion", collectRepeatable, []);
    program.showHelpAfterError();
    program.exitOverride();
    return program;
}
|
|
35
|
+
/**
 * Convert raw `key:value` strings into a header-name -> value object.
 * Only the first colon separates key from value, so values may contain colons.
 * Key and value are trimmed; a later entry with the same key replaces an earlier one.
 *
 * @param {string[]} rawHeaders - Raw `--header` values.
 * @returns {Record<string, string>} Parsed headers.
 * @throws {InputError} When an entry has no colon, or an empty key or value.
 */
function parseHeaders(rawHeaders) {
    const headers = {};
    rawHeaders.forEach((entry) => {
        const colonAt = entry.indexOf(":");
        const hasKeyPart = colonAt > 0;
        const hasValuePart = colonAt !== entry.length - 1;
        if (!(hasKeyPart && hasValuePart)) {
            throw new InputError(`Invalid --header value \"${entry}\". Use key:value format.`);
        }
        const key = entry.slice(0, colonAt).trim();
        const value = entry.slice(colonAt + 1).trim();
        // Whitespace-only parts survive the structural check above but are empty after trimming.
        if (key === "" || value === "") {
            throw new InputError(`Invalid --header value \"${entry}\". Header key and value are required.`);
        }
        headers[key] = value;
    });
    return headers;
}
|
|
51
|
+
/**
 * Resolve the effective timeout in milliseconds.
 *
 * @param {string | undefined} rawTimeout - Raw `--timeout-ms` value; `undefined`
 *   selects the default for the chosen mode.
 * @param {boolean} dynamic - Whether dynamic (browser) mode is active; picks
 *   which default applies when `rawTimeout` is undefined.
 * @returns {number} A positive integer number of milliseconds.
 * @throws {InputError} When the value is not a plain positive decimal integer.
 */
function parseTimeout(rawTimeout, dynamic) {
    if (rawTimeout === undefined) {
        return dynamic ? DEFAULT_DYNAMIC_TIMEOUT_MS : DEFAULT_STATIC_TIMEOUT_MS;
    }
    const text = String(rawTimeout).trim();
    // Number.parseInt truncates instead of rejecting ("10abc" -> 10, "1.5" -> 1,
    // "1e3" -> 1), so the whole string is validated before conversion.
    const parsed = /^\+?\d+$/.test(text) ? Number(text) : Number.NaN;
    if (!Number.isSafeInteger(parsed) || parsed <= 0) {
        throw new InputError(`Invalid --timeout-ms value \"${rawTimeout}\". Must be a positive integer.`);
    }
    return parsed;
}
|
|
61
|
+
/**
 * Turn the raw URL argument and parsed option bag into the runtime argument
 * object used by `run`.
 *
 * Checks performed, in order: URL present, URL parseable, protocol is http(s),
 * timeout valid, headers valid. Remove-selectors are trimmed and empty ones dropped.
 *
 * @throws {InputError} On the first check that fails.
 */
function normalizeArgs(urlInput, options) {
    if (!urlInput) {
        throw new InputError("A URL argument is required.");
    }
    let parsedUrl;
    try {
        parsedUrl = new URL(urlInput);
    }
    catch (error) {
        const cause = error instanceof Error ? error : undefined;
        throw new InputError(`Invalid URL \"${urlInput}\".`, { cause });
    }
    const supportedProtocols = ["http:", "https:"];
    if (!supportedProtocols.includes(parsedUrl.protocol)) {
        throw new InputError(`Unsupported URL protocol \"${parsedUrl.protocol}\". Only http:// and https:// are supported.`);
    }
    const dynamic = options.dynamic ?? false;
    // Timeout is validated before headers so the first reported error stays the same.
    const timeoutMs = parseTimeout(options.timeoutMs, dynamic);
    const userAgent = options.userAgent?.trim() || DEFAULT_USER_AGENT;
    const headers = parseHeaders(options.header ?? []);
    const removeSelectors = (options.removeSelector ?? [])
        .map((selector) => selector.trim())
        .filter((selector) => selector.length > 0);
    return {
        url: parsedUrl.toString(),
        dynamic,
        outputPath: options.output,
        timeoutMs,
        userAgent,
        headers,
        removeSelectors
    };
}
|
|
92
|
+
/**
 * Run a single curldown invocation end to end and resolve to its exit code.
 *
 * `argv` holds user arguments only (no Node executable, no script path).
 * `deps` supplies the fetchers, transform, output writer and stderr sink, and
 * defaults to the production implementations.
 *
 * The function reports failures through `deps.stderrWrite` and returns the
 * matching exit code rather than throwing for them.
 */
export async function run(argv, deps = defaultDependencies) {
    // Shared failure path: map to a curldown error, print it, return its exit code.
    const reportFailure = (error) => {
        const failure = asCurldownError(error);
        deps.stderrWrite(`${failure.message}\n`);
        return failure.exitCode;
    };
    const program = buildProgram();
    let options;
    let urlArg;
    try {
        const parsed = program.parse(argv, { from: "user" });
        options = parsed.opts();
        urlArg = parsed.args[0];
    }
    catch (error) {
        if (!(error instanceof CommanderError)) {
            return reportFailure(error);
        }
        // Help and version output are successful outcomes, not usage errors.
        const successCodes = ["commander.helpDisplayed", "commander.version"];
        if (successCodes.includes(error.code)) {
            return 0;
        }
        deps.stderrWrite(`${error.message}\n`);
        return 1;
    }
    try {
        const args = normalizeArgs(urlArg, options);
        const fetchInput = {
            url: args.url,
            timeoutMs: args.timeoutMs,
            userAgent: args.userAgent,
            headers: args.headers
        };
        let html;
        if (args.dynamic) {
            html = await deps.fetchDynamic(fetchInput);
        }
        else {
            html = await deps.fetchStatic(fetchInput);
        }
        const markdown = deps.transformHtmlToMarkdown({ html, removeSelectors: args.removeSelectors });
        await deps.writeOutput({ markdown, outputPath: args.outputPath });
        return 0;
    }
    catch (error) {
        return reportFailure(error);
    }
}
|
|
144
|
+
/**
 * Compute the file URL of the invoked script with symlinks resolved.
 *
 * Package managers commonly expose a CLI through a symlink in a bin directory.
 * `process.argv[1]` then names the symlink while `import.meta.url` names the
 * real file, so the path is canonicalized before the two are compared.
 *
 * @param {string | undefined} scriptPath - Usually `process.argv[1]`.
 * @returns {string | undefined} The `file://` href, or `undefined` when no
 *   script path was given.
 */
function resolveEntryHref(scriptPath) {
    if (scriptPath === undefined) {
        return undefined;
    }
    try {
        return pathToFileURL(realpathSync(scriptPath)).href;
    }
    catch {
        // The path could not be resolved on disk; fall back to the literal path.
        return pathToFileURL(scriptPath).href;
    }
}
// True only when this module is the process entry point, not when it is imported.
const isMain = resolveEntryHref(process.argv[1]) === import.meta.url;
if (isMain) {
    // Set exitCode instead of calling process.exit() so pending stdout writes can flush.
    void run(process.argv.slice(2)).then((exitCode) => {
        process.exitCode = exitCode;
    });
}
|
package/dist/constants.js
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/** Version string used for `--version` output and embedded in the default user-agent. */
export const VERSION = "1.0.0";
/** Default timeout for static (plain fetch) mode, in milliseconds. */
export const DEFAULT_STATIC_TIMEOUT_MS = 15_000;
/** Default timeout for dynamic (browser-rendered) mode, in milliseconds. */
export const DEFAULT_DYNAMIC_TIMEOUT_MS = 30_000;
/** User-agent sent when the caller does not supply one. */
export const DEFAULT_USER_AGENT = `curldown/${VERSION} (+https://www.npmjs.com/package/@jenslys/curldown)`;
/**
 * Selectors always removed before markdown conversion.
 * Frozen so that one importer cannot alter the list for every other importer;
 * copy it (e.g. with spread) to extend it.
 */
export const DEFAULT_REMOVE_SELECTORS = Object.freeze([
    "script",
    "style",
    "noscript",
    "template",
    "svg",
    "canvas",
    "iframe"
]);
|
package/dist/errors.js
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
 * Root of the curldown error hierarchy.
 * Every instance carries the exit code the CLI should return for that failure.
 */
export class CurldownError extends Error {
    exitCode;
    constructor(message, exitCode, options) {
        super(message, options);
        this.exitCode = exitCode;
        // new.target makes `name` match the concrete subclass being constructed.
        this.name = new.target.name;
    }
}
/** Bad command-line usage or an invalid input value (exit code 1). */
export class InputError extends CurldownError {
    constructor(message, options) {
        super(message, 1, options);
    }
}
/** Static fetch failure (exit code 2). */
export class FetchError extends CurldownError {
    constructor(message, options) {
        super(message, 2, options);
    }
}
/** Dynamic-mode browser rendering failure (exit code 3). */
export class DynamicError extends CurldownError {
    constructor(message, options) {
        super(message, 3, options);
    }
}
/** Failure writing markdown to stdout or to a file (exit code 4). */
export class OutputError extends CurldownError {
    constructor(message, options) {
        super(message, 4, options);
    }
}
/** Failure during HTML cleanup or markdown conversion (exit code 5). */
export class ConversionError extends CurldownError {
    constructor(message, options) {
        super(message, 5, options);
    }
}
/**
 * Coerce any thrown value into a {@link CurldownError}.
 *
 * - A CurldownError is returned as-is.
 * - Any other Error becomes a ConversionError with the same message and the
 *   original error as `cause`.
 * - A non-Error value becomes a ConversionError with a generic message.
 */
export function asCurldownError(error) {
    if (error instanceof CurldownError) {
        return error;
    }
    return error instanceof Error
        ? new ConversionError(error.message, { cause: error })
        : new ConversionError("Unknown error while processing page content.");
}
|
package/dist/fetch-dynamic.js
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import { chromium } from "playwright";
|
|
2
|
+
import { DynamicError } from "./errors.js";
|
|
3
|
+
/**
 * Load `input.url` in headless Chromium and return the page's HTML as
 * serialized once navigation reaches `domcontentloaded`.
 *
 * The browser context is created with `input.userAgent` and `input.headers`;
 * navigation is bounded by `input.timeoutMs`. Context and browser are closed
 * whether or not the capture succeeds.
 *
 * @throws {DynamicError} When launch, context creation, navigation, capture,
 *   or closing the context fails.
 */
export async function fetchDynamicHtml(input) {
    // Navigate and serialize inside one context, always closing that context.
    const captureIn = async (context) => {
        try {
            const page = await context.newPage();
            const navigation = { timeout: input.timeoutMs, waitUntil: "domcontentloaded" };
            await page.goto(input.url, navigation);
            const html = await page.content();
            return html;
        }
        finally {
            await context.close();
        }
    };
    let browser;
    try {
        browser = await chromium.launch({ headless: true });
        const contextOptions = {
            userAgent: input.userAgent,
            extraHTTPHeaders: input.headers
        };
        const context = await browser.newContext(contextOptions);
        return await captureIn(context);
    }
    catch (error) {
        const isError = error instanceof Error;
        const reason = isError ? error.message : String(error);
        throw new DynamicError(`Dynamic fetch failed for ${input.url}: ${reason}`, { cause: isError ? error : undefined });
    }
    finally {
        if (browser) {
            await browser.close();
        }
    }
}
|
package/dist/fetch-static.js
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { FetchError } from "./errors.js";
|
|
2
|
+
/**
 * Download a page's body as text with the global `fetch`.
 *
 * Request headers come from `input.headers`; a truthy `input.userAgent`
 * replaces any `user-agent` entry among them. Redirects are followed and the
 * request carries an abort signal that fires after `input.timeoutMs`.
 *
 * @throws {FetchError} When the request fails, the response status is not OK,
 *   or the body cannot be read.
 */
export async function fetchStaticHtml(input) {
    const { url, timeoutMs, userAgent } = input;
    const requestHeaders = new Headers(input.headers);
    if (userAgent) {
        requestHeaders.set("user-agent", userAgent);
    }
    const reasonOf = (error) => (error instanceof Error ? error.message : String(error));
    const causeOf = (error) => ({ cause: error instanceof Error ? error : undefined });
    let response;
    try {
        const init = {
            headers: requestHeaders,
            redirect: "follow",
            signal: AbortSignal.timeout(timeoutMs)
        };
        response = await fetch(url, init);
    }
    catch (error) {
        throw new FetchError(`Static fetch failed for ${url}: ${reasonOf(error)}`, causeOf(error));
    }
    if (!response.ok) {
        throw new FetchError(`Static fetch failed for ${url}: HTTP ${response.status} ${response.statusText}`);
    }
    try {
        const body = await response.text();
        return body;
    }
    catch (error) {
        throw new FetchError(`Failed reading response body for ${url}: ${reasonOf(error)}`, causeOf(error));
    }
}
|
package/dist/output.js
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { writeFile } from "node:fs/promises";
|
|
2
|
+
import { OutputError } from "./errors.js";
|
|
3
|
+
/**
 * Deliver the markdown: written as UTF-8 to `input.outputPath` when one is
 * given, otherwise written to `process.stdout`.
 *
 * @throws {OutputError} When the file write rejects or the stdout write throws.
 */
export async function writeOutput(input) {
    const { markdown, outputPath } = input;
    const reasonOf = (error) => (error instanceof Error ? error.message : String(error));
    const causeOf = (error) => ({ cause: error instanceof Error ? error : undefined });
    if (!outputPath) {
        try {
            process.stdout.write(markdown);
        }
        catch (error) {
            throw new OutputError(`Failed writing markdown to stdout: ${reasonOf(error)}`, causeOf(error));
        }
        return;
    }
    try {
        await writeFile(outputPath, markdown, "utf8");
    }
    catch (error) {
        throw new OutputError(`Failed writing markdown to ${outputPath}: ${reasonOf(error)}`, causeOf(error));
    }
}
|
package/dist/transform.js
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { load } from "cheerio";
|
|
2
|
+
import TurndownService from "turndown";
|
|
3
|
+
import { DEFAULT_REMOVE_SELECTORS } from "./constants.js";
|
|
4
|
+
import { ConversionError } from "./errors.js";
|
|
5
|
+
/**
 * Module-level Turndown converter shared by every conversion:
 * ATX (`#`) headings, fenced code blocks, `-` list bullets, `_` emphasis.
 */
const turndown = new TurndownService({
    bulletListMarker: "-",
    codeBlockStyle: "fenced",
    emDelimiter: "_",
    headingStyle: "atx"
});
|
|
11
|
+
/**
 * Trim each selector, discard those that are empty after trimming, and remove
 * duplicates while keeping first-seen order.
 */
function uniqueSelectors(selectors) {
    const seen = new Set();
    for (const raw of selectors) {
        const cleaned = raw.trim();
        if (cleaned) {
            seen.add(cleaned);
        }
    }
    return Array.from(seen);
}
|
|
15
|
+
/**
 * Produce markdown from an HTML document.
 *
 * Elements matching the default remove-selectors or `input.removeSelectors`
 * are deleted first. The `<body>` inner HTML is then converted (falling back to
 * the document root when no body element exists), and the trimmed result is
 * returned with a single trailing newline.
 *
 * @throws {ConversionError} When no HTML remains to convert, or the conversion
 *   yields empty markdown.
 */
export function transformHtmlToMarkdown(input) {
    const $ = load(input.html);
    const removalList = uniqueSelectors([
        ...DEFAULT_REMOVE_SELECTORS,
        ...input.removeSelectors
    ]);
    if (removalList.length > 0) {
        const combinedSelector = removalList.join(",");
        $(combinedSelector).remove();
    }
    const container = $("body").length > 0 ? $("body") : $.root();
    const bodyHtml = container.html() ?? "";
    if (bodyHtml.trim().length === 0) {
        throw new ConversionError("No HTML body content found to convert.");
    }
    const markdown = turndown.turndown(bodyHtml).trim();
    if (markdown.length === 0) {
        throw new ConversionError("HTML was fetched but produced empty markdown output.");
    }
    return `${markdown}\n`;
}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/package.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@jenslys/curldown",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Fetch URL content and convert it to markdown.",
|
|
5
|
+
"repository": {
|
|
6
|
+
"type": "git",
|
|
7
|
+
"url": "https://github.com/jenslys/curldown"
|
|
8
|
+
},
|
|
9
|
+
"homepage": "https://github.com/jenslys/curldown",
|
|
10
|
+
"bugs": {
|
|
11
|
+
"url": "https://github.com/jenslys/curldown/issues"
|
|
12
|
+
},
|
|
13
|
+
"type": "module",
|
|
14
|
+
"bin": {
|
|
15
|
+
"curldown": "dist/cli.js"
|
|
16
|
+
},
|
|
17
|
+
"files": [
|
|
18
|
+
"dist",
|
|
19
|
+
"README.md"
|
|
20
|
+
],
|
|
21
|
+
"engines": {
|
|
22
|
+
"node": ">=20"
|
|
23
|
+
},
|
|
24
|
+
"scripts": {
|
|
25
|
+
"build": "tsc -p tsconfig.build.json",
|
|
26
|
+
"typecheck": "tsc -p tsconfig.json --noEmit",
|
|
27
|
+
"lint": "npm run typecheck",
|
|
28
|
+
"test": "vitest run",
|
|
29
|
+
"prepublishOnly": "npm run build && npm run test"
|
|
30
|
+
},
|
|
31
|
+
"dependencies": {
|
|
32
|
+
"cheerio": "^1.2.0",
|
|
33
|
+
"commander": "^14.0.3",
|
|
34
|
+
"playwright": "^1.58.2",
|
|
35
|
+
"turndown": "^7.2.2"
|
|
36
|
+
},
|
|
37
|
+
"devDependencies": {
|
|
38
|
+
"@types/node": "^25.3.3",
|
|
39
|
+
"@types/turndown": "^5.0.6",
|
|
40
|
+
"typescript": "^5.9.3",
|
|
41
|
+
"vitest": "^4.0.18"
|
|
42
|
+
}
|
|
43
|
+
}
|