@jenslys/curldown 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jens
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,80 @@
1
+ # curldown
2
+
3
+ Fetch a webpage and return clean Markdown.
4
+
5
+ `curldown` is a CLI-first tool for AI agents and scripts:
6
+
7
+ - Static mode: `fetch` HTML -> Cheerio cleanup -> Turndown markdown.
8
+ - Dynamic mode: headless Chromium (Playwright) -> HTML -> markdown.
9
+
10
+ ## Install
11
+
12
+ ```bash
13
+ npm install -g @jenslys/curldown
14
+ ```
15
+
16
+ ```bash
17
+ bun add -g @jenslys/curldown
18
+ ```
19
+
20
+ ## Quick Start
21
+
22
+ ```bash
23
+ # Print markdown to stdout
24
+ curldown https://example.com
25
+
26
+ # JS-heavy pages
27
+ curldown https://example.com --dynamic
28
+
29
+ # Write to file
30
+ curldown https://example.com --output page.md
31
+ ```
32
+
33
+ ## CLI
34
+
35
+ ```bash
36
+ curldown <url> [options]
37
+ ```
38
+
39
+ ## Options
40
+
41
+ - `--dynamic` Use Playwright Chromium to render before extraction.
42
+ - `-o, --output <path>` Write markdown to file instead of stdout.
43
+ - `--timeout-ms <number>` Request/render timeout in milliseconds; must be a positive integer (default: 15000 in static mode, 30000 with `--dynamic`).
44
+ - `--user-agent <string>` Override request user-agent.
45
+ - `--header <key:value>` Custom request header (repeatable).
46
+ - `--remove-selector <css>` Remove selector(s) before conversion (repeatable).
47
+ - `--help` Show help.
48
+ - `--version` Show version.
49
+
50
+ ## Local Development
51
+
52
+ ```bash
53
+ bun install
54
+ bun run build
55
+ bun run test
56
+ node dist/cli.js https://example.com
57
+ ```
58
+
59
+ ## AGENTS.md Snippet (Optional)
60
+
61
+ Paste this into your `AGENTS.md` if you want agents to always use `curldown` for website content retrieval:
62
+
63
+ ```md
64
+ ## Website Content Retrieval
65
+
66
+ - Always use `curldown` to fetch web pages for agent workflows.
67
+ - Default command: `curldown <url>`.
68
+ - If the page is JS-rendered or incomplete, retry with: `curldown <url> --dynamic`.
69
+ - Prefer stdout output unless a task explicitly requires a file (`--output <path>`).
70
+ - Do not use ad-hoc HTML scraping or direct browser automation when `curldown` can handle it.
71
+ ```
72
+
73
+ ## Exit Codes
74
+
75
+ - `0` success
76
+ - `1` input/usage error
77
+ - `2` static fetch/network error
78
+ - `3` dynamic render/browser error
79
+ - `4` output write error
80
+ - `5` conversion pipeline error (also returned for any unexpected, unclassified error)
package/dist/cli.js ADDED
@@ -0,0 +1,149 @@
1
+ #!/usr/bin/env node
2
import { Command, CommanderError } from "commander";
import { realpathSync } from "node:fs";
import { pathToFileURL } from "node:url";
import { DEFAULT_DYNAMIC_TIMEOUT_MS, DEFAULT_STATIC_TIMEOUT_MS, DEFAULT_USER_AGENT, VERSION } from "./constants.js";
import { asCurldownError, InputError } from "./errors.js";
import { fetchDynamicHtml } from "./fetch-dynamic.js";
import { fetchStaticHtml } from "./fetch-static.js";
import { writeOutput } from "./output.js";
import { transformHtmlToMarkdown } from "./transform.js";
10
/**
 * Production wiring used by `run` when the caller supplies no dependencies:
 * the real static and dynamic fetchers, the HTML-to-markdown transformer,
 * the output writer, and a sink that forwards messages to `process.stderr`.
 */
const defaultDependencies = {
  fetchStatic: fetchStaticHtml,
  fetchDynamic: fetchDynamicHtml,
  transformHtmlToMarkdown: transformHtmlToMarkdown,
  writeOutput: writeOutput,
  stderrWrite(message) {
    return process.stderr.write(message);
  }
};
17
/**
 * Accumulator for repeatable CLI options.
 * Returns a new array holding every previously collected value followed by
 * `value`; `previous` itself is never mutated.
 */
function collectRepeatable(value, previous = []) {
  return previous.concat([value]);
}
20
/**
 * Create the commander program describing the curldown command line:
 * one required `<url>` argument plus the supported options.
 * `exitOverride()` is enabled, so commander throws instead of exiting the
 * process; the caller is expected to handle those errors.
 */
function buildProgram() {
  const program = new Command();
  program
    .name("curldown")
    .description("Fetch URL content and convert it to markdown.")
    .version(VERSION);
  program.argument("<url>", "The URL to fetch");
  program
    .option("--dynamic", "Use headless Chromium (Playwright) to render the page")
    .option("-o, --output <path>", "Write markdown to a file instead of stdout")
    .option("--timeout-ms <number>", "Timeout in milliseconds")
    .option("--user-agent <string>", "Override request user-agent")
    // Repeatable options start from an empty list and grow per occurrence.
    .option("--header <key:value>", "Set custom request header", collectRepeatable, [])
    .option("--remove-selector <css>", "Remove matching selector(s) before markdown conversion", collectRepeatable, []);
  program.showHelpAfterError();
  program.exitOverride();
  return program;
}
35
/**
 * Turn raw `key:value` strings (one per `--header` occurrence) into a
 * header-name -> header-value object.
 * The split happens at the first colon, so values may contain colons.
 * A later entry with the same name overwrites an earlier one.
 * Throws InputError when an entry has no colon, or an empty name or value.
 */
function parseHeaders(rawHeaders) {
  const collected = {};
  for (const entry of rawHeaders) {
    const colonAt = entry.indexOf(":");
    const hasName = colonAt > 0;
    const hasValue = colonAt !== entry.length - 1;
    if (!(hasName && hasValue)) {
      throw new InputError(`Invalid --header value \"${entry}\". Use key:value format.`);
    }
    const name = entry.slice(0, colonAt).trim();
    const content = entry.slice(colonAt + 1).trim();
    // Whitespace-only halves survive the positional check above.
    if (name === "" || content === "") {
      throw new InputError(`Invalid --header value \"${entry}\". Header key and value are required.`);
    }
    collected[name] = content;
  }
  return collected;
}
51
/**
 * Resolve the effective timeout in milliseconds.
 *
 * @param {string | undefined} rawTimeout Raw `--timeout-ms` value, or
 *   `undefined` when the option was not given.
 * @param {boolean} dynamic Whether dynamic (browser) mode is active; selects
 *   which default applies when `rawTimeout` is undefined.
 * @returns {number} A positive integer number of milliseconds.
 * @throws {InputError} When the value is not a positive integer.
 */
function parseTimeout(rawTimeout, dynamic) {
  if (rawTimeout === undefined) {
    return dynamic ? DEFAULT_DYNAMIC_TIMEOUT_MS : DEFAULT_STATIC_TIMEOUT_MS;
  }
  // Validate the whole string: Number.parseInt would silently accept inputs
  // such as "10abc", "1.5" or "5e3" by stopping at the first non-digit.
  const text = String(rawTimeout).trim();
  const parsed = /^\+?\d+$/.test(text) ? Number(text) : Number.NaN;
  if (!Number.isSafeInteger(parsed) || parsed <= 0) {
    throw new InputError(`Invalid --timeout-ms value \"${rawTimeout}\". Must be a positive integer.`);
  }
  return parsed;
}
61
/**
 * Turn the raw URL argument and commander options into the runtime
 * argument object consumed by `run`.
 *
 * Checks, in order: a URL was supplied, it parses, its protocol is http or
 * https, then the timeout and headers. Throws InputError on the first
 * failing check.
 */
function normalizeArgs(urlInput, options) {
  if (!urlInput) {
    throw new InputError("A URL argument is required.");
  }

  let target;
  try {
    target = new URL(urlInput);
  } catch (error) {
    const cause = error instanceof Error ? error : undefined;
    throw new InputError(`Invalid URL \"${urlInput}\".`, { cause });
  }

  const { protocol } = target;
  if (!["http:", "https:"].includes(protocol)) {
    throw new InputError(`Unsupported URL protocol \"${protocol}\". Only http:// and https:// are supported.`);
  }

  const dynamic = options.dynamic ?? false;
  const timeoutMs = parseTimeout(options.timeoutMs, dynamic);
  // A blank or whitespace-only user agent falls back to the default.
  const userAgent = options.userAgent?.trim() || DEFAULT_USER_AGENT;
  const headers = parseHeaders(options.header ?? []);

  const removeSelectors = [];
  for (const selector of options.removeSelector ?? []) {
    const trimmed = selector.trim();
    if (trimmed) {
      removeSelectors.push(trimmed);
    }
  }

  return {
    url: target.toString(),
    dynamic,
    outputPath: options.output,
    timeoutMs,
    userAgent,
    headers,
    removeSelectors
  };
}
92
/**
 * Run a single curldown invocation and resolve to the process exit code.
 *
 * `argv` holds only the user-supplied arguments (no Node executable, no
 * script path). `deps` supplies the fetchers, transformer, output writer and
 * stderr sink. The function does not reject: every failure is written via
 * `deps.stderrWrite` and turned into an exit code.
 */
export async function run(argv, deps = defaultDependencies) {
  // Shared failure path: report the message and surface the mapped exit code.
  const fail = (error) => {
    const known = asCurldownError(error);
    deps.stderrWrite(`${known.message}\n`);
    return known.exitCode;
  };

  let options;
  let urlArg;
  try {
    const parsed = buildProgram().parse(argv, { from: "user" });
    options = parsed.opts();
    [urlArg] = parsed.args;
  } catch (error) {
    if (!(error instanceof CommanderError)) {
      return fail(error);
    }
    // --help and --version surface as CommanderError but are successes.
    const informational = ["commander.helpDisplayed", "commander.version"].includes(error.code);
    if (informational) {
      return 0;
    }
    deps.stderrWrite(`${error.message}\n`);
    return 1;
  }

  try {
    const args = normalizeArgs(urlArg, options);
    const { url, timeoutMs, userAgent, headers } = args;
    const fetchInput = { url, timeoutMs, userAgent, headers };

    let html;
    if (args.dynamic) {
      html = await deps.fetchDynamic(fetchInput);
    } else {
      html = await deps.fetchStatic(fetchInput);
    }

    const markdown = deps.transformHtmlToMarkdown({ html, removeSelectors: args.removeSelectors });
    await deps.writeOutput({ markdown, outputPath: args.outputPath });
    return 0;
  } catch (error) {
    return fail(error);
  }
}
144
/**
 * Report whether this module is the process entry point.
 *
 * `process.argv[1]` is made absolute but is NOT symlink-resolved, while
 * `import.meta.url` of the entry module normally is. Package managers
 * install bin entries as symlinks, so comparing only the raw path makes the
 * check fail and the CLI would silently do nothing. Both the raw path and
 * its realpath are therefore compared; the raw path still covers runs where
 * symlinks are preserved for the main module.
 */
function isMainModule() {
  const entryPath = process.argv[1];
  if (entryPath === undefined) {
    return false;
  }
  const candidates = [entryPath];
  try {
    candidates.push(realpathSync(entryPath));
  } catch {
    // Entry path is not resolvable on disk; fall back to the raw path only.
  }
  return candidates.some((candidate) => pathToFileURL(candidate).href === import.meta.url);
}
const isMain = isMainModule();
if (isMain) {
  // `run` reports failures itself and resolves to an exit code, so the
  // promise is intentionally not awaited here.
  void run(process.argv.slice(2)).then((exitCode) => {
    process.exitCode = exitCode;
  });
}
@@ -0,0 +1,13 @@
1
/** Version string reported by `--version` and embedded in the default user agent. */
export const VERSION = "1.0.0";

/** Default timeout, in milliseconds, for static mode. */
export const DEFAULT_STATIC_TIMEOUT_MS = 15000;

/** Default timeout, in milliseconds, for dynamic mode. */
export const DEFAULT_DYNAMIC_TIMEOUT_MS = 30000;

/** User agent used when the caller does not override it. */
export const DEFAULT_USER_AGENT = `curldown/${VERSION} (+https://www.npmjs.com/package/@jenslys/curldown)`;

/** Selectors for elements that are always removed before markdown conversion. */
export const DEFAULT_REMOVE_SELECTORS = ["script", "style", "noscript", "template", "svg", "canvas", "iframe"];
package/dist/errors.js ADDED
@@ -0,0 +1,55 @@
1
/**
 * Root of the curldown error hierarchy.
 * Carries the CLI exit code that the failure maps to; `name` is set to the
 * name of the concrete class that was constructed.
 */
export class CurldownError extends Error {
  exitCode;

  constructor(message, exitCode, options) {
    super(message, options);
    const { name } = new.target;
    this.name = name;
    this.exitCode = exitCode;
  }
}

/** Bad CLI usage or invalid input values (exit code 1). */
export class InputError extends CurldownError {
  constructor(message, options) {
    super(message, 1, options);
  }
}

/** Static fetch failure (exit code 2). */
export class FetchError extends CurldownError {
  constructor(message, options) {
    super(message, 2, options);
  }
}

/** Browser-rendering failure in dynamic mode (exit code 3). */
export class DynamicError extends CurldownError {
  constructor(message, options) {
    super(message, 3, options);
  }
}

/** Failure writing markdown to stdout or to a file (exit code 4). */
export class OutputError extends CurldownError {
  constructor(message, options) {
    super(message, 4, options);
  }
}

/** Failure during HTML cleanup or markdown conversion (exit code 5). */
export class ConversionError extends CurldownError {
  constructor(message, options) {
    super(message, 5, options);
  }
}

/**
 * Coerce any thrown value into a CurldownError so callers always have an
 * exit code. CurldownError instances pass through unchanged; other Error
 * objects are wrapped in a ConversionError that keeps the message and the
 * original as `cause`; anything else becomes a generic ConversionError.
 */
export function asCurldownError(error) {
  if (error instanceof CurldownError) {
    return error;
  }
  return error instanceof Error
    ? new ConversionError(error.message, { cause: error })
    : new ConversionError("Unknown error while processing page content.");
}
@@ -0,0 +1,35 @@
1
+ import { chromium } from "playwright";
2
+ import { DynamicError } from "./errors.js";
3
/**
 * Render a page in headless Chromium and return the resulting HTML snapshot.
 *
 * @param {{ url: string, timeoutMs: number, userAgent: string, headers: Record<string, string> }} input
 *   Target URL, navigation timeout, and the user agent / extra headers applied
 *   to the browser context.
 * @returns {Promise<string>} The page HTML once `domcontentloaded` has fired.
 * @throws {DynamicError} If browser startup, navigation, capture, or context
 *   shutdown fails.
 */
export async function fetchDynamicHtml(input) {
  let browser;
  try {
    browser = await chromium.launch({ headless: true });
    const context = await browser.newContext({
      userAgent: input.userAgent,
      extraHTTPHeaders: input.headers
    });
    try {
      const page = await context.newPage();
      await page.goto(input.url, {
        timeout: input.timeoutMs,
        waitUntil: "domcontentloaded"
      });
      return await page.content();
    } finally {
      await context.close();
    }
  } catch (error) {
    throw new DynamicError(`Dynamic fetch failed for ${input.url}: ${error instanceof Error ? error.message : String(error)}`, { cause: error instanceof Error ? error : undefined });
  } finally {
    if (browser) {
      // Best-effort shutdown: a throw from this `finally` would replace the
      // captured HTML or the DynamicError above with an unclassified error
      // (wrong exit code), so a failing close is deliberately ignored.
      await browser.close().catch(() => undefined);
    }
  }
}
@@ -0,0 +1,31 @@
1
+ import { FetchError } from "./errors.js";
2
/**
 * Download a page with the global `fetch` and resolve to its body text.
 * Redirects are followed and the request is aborted after `input.timeoutMs`.
 * Throws FetchError when the request fails, the response status is not OK,
 * or the body cannot be read.
 */
export async function fetchStaticHtml(input) {
  // Normalizes any thrown value into a message plus an optional cause.
  const describe = (error) => ({
    reason: error instanceof Error ? error.message : String(error),
    options: { cause: error instanceof Error ? error : undefined }
  });

  const requestHeaders = new Headers(input.headers);
  if (input.userAgent) {
    // The explicit user agent wins over a user-agent custom header.
    requestHeaders.set("user-agent", input.userAgent);
  }

  let response;
  try {
    response = await fetch(input.url, {
      headers: requestHeaders,
      redirect: "follow",
      signal: AbortSignal.timeout(input.timeoutMs)
    });
  } catch (error) {
    const { reason, options } = describe(error);
    throw new FetchError(`Static fetch failed for ${input.url}: ${reason}`, options);
  }

  if (!response.ok) {
    throw new FetchError(`Static fetch failed for ${input.url}: HTTP ${response.status} ${response.statusText}`);
  }

  try {
    const body = await response.text();
    return body;
  } catch (error) {
    const { reason, options } = describe(error);
    throw new FetchError(`Failed reading response body for ${input.url}: ${reason}`, options);
  }
}
package/dist/output.js ADDED
@@ -0,0 +1,23 @@
1
+ import { writeFile } from "node:fs/promises";
2
+ import { OutputError } from "./errors.js";
3
/**
 * Deliver the markdown: to `input.outputPath` (UTF-8) when a path is given,
 * otherwise to stdout.
 * Throws OutputError when the write fails.
 */
export async function writeOutput(input) {
  const { markdown, outputPath } = input;
  // Normalizes any thrown value into a message plus an optional cause.
  const describe = (error) => ({
    reason: error instanceof Error ? error.message : String(error),
    options: { cause: error instanceof Error ? error : undefined }
  });

  if (!outputPath) {
    try {
      process.stdout.write(markdown);
    } catch (error) {
      const { reason, options } = describe(error);
      throw new OutputError(`Failed writing markdown to stdout: ${reason}`, options);
    }
    return;
  }

  try {
    await writeFile(outputPath, markdown, "utf8");
  } catch (error) {
    const { reason, options } = describe(error);
    throw new OutputError(`Failed writing markdown to ${outputPath}: ${reason}`, options);
  }
}
@@ -0,0 +1,38 @@
1
+ import { load } from "cheerio";
2
+ import TurndownService from "turndown";
3
+ import { DEFAULT_REMOVE_SELECTORS } from "./constants.js";
4
+ import { ConversionError } from "./errors.js";
5
/**
 * Module-wide Turndown converter: ATX headings, fenced code blocks, "-" list
 * bullets and "_" emphasis delimiters.
 */
const turndownOptions = {
  headingStyle: "atx",
  codeBlockStyle: "fenced",
  bulletListMarker: "-",
  emDelimiter: "_"
};
const turndown = new TurndownService(turndownOptions);
11
/**
 * Clean up a selector list: trim each entry, drop blanks, and keep only the
 * first occurrence of each remaining selector, in original order.
 */
function uniqueSelectors(selectors) {
  const seen = new Set();
  for (const raw of selectors) {
    const cleaned = raw.trim();
    if (cleaned) {
      seen.add(cleaned);
    }
  }
  return Array.from(seen);
}
15
/**
 * Convert fetched HTML into markdown.
 *
 * Removes the default non-content elements plus any caller-provided
 * selectors, then converts the remaining `<body>` markup (or the document
 * root when there is no body) with Turndown.
 *
 * @param {{ html: string, removeSelectors?: string[] }} input HTML to convert
 *   and optional extra CSS selectors to strip first.
 * @returns {string} Trimmed markdown terminated by a single newline.
 * @throws {ConversionError} When a selector cannot be applied, when no body
 *   content remains, or when the conversion yields empty markdown.
 */
export function transformHtmlToMarkdown(input) {
  const $ = load(input.html);
  // `removeSelectors` is optional; spreading `undefined` would throw.
  const selectorsToRemove = uniqueSelectors([
    ...DEFAULT_REMOVE_SELECTORS,
    ...(input.removeSelectors ?? [])
  ]);
  // Selectors are applied one at a time so that a malformed one can be named
  // in the error instead of failing the whole joined selector list opaquely.
  for (const selector of selectorsToRemove) {
    try {
      $(selector).remove();
    } catch (error) {
      throw new ConversionError(`Failed removing selector "${selector}": ${error instanceof Error ? error.message : String(error)}`, { cause: error instanceof Error ? error : undefined });
    }
  }
  const body = $("body");
  const bodyHtml = (body.length > 0 ? body.html() : $.root().html()) ?? "";
  if (bodyHtml.trim().length === 0) {
    throw new ConversionError("No HTML body content found to convert.");
  }
  const markdown = turndown.turndown(bodyHtml).trim();
  if (markdown.length === 0) {
    throw new ConversionError("HTML was fetched but produced empty markdown output.");
  }
  return `${markdown}\n`;
}
package/dist/types.js ADDED
@@ -0,0 +1 @@
1
+ export {};
package/package.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "name": "@jenslys/curldown",
3
+ "version": "1.0.0",
4
+ "description": "Fetch URL content and convert it to markdown.",
5
+ "repository": {
6
+ "type": "git",
7
+ "url": "https://github.com/jenslys/curldown"
8
+ },
9
+ "homepage": "https://github.com/jenslys/curldown",
10
+ "bugs": {
11
+ "url": "https://github.com/jenslys/curldown/issues"
12
+ },
13
+ "type": "module",
14
+ "bin": {
15
+ "curldown": "dist/cli.js"
16
+ },
17
+ "files": [
18
+ "dist",
19
+ "README.md"
20
+ ],
21
+ "engines": {
22
+ "node": ">=20"
23
+ },
24
+ "scripts": {
25
+ "build": "tsc -p tsconfig.build.json",
26
+ "typecheck": "tsc -p tsconfig.json --noEmit",
27
+ "lint": "npm run typecheck",
28
+ "test": "vitest run",
29
+ "prepublishOnly": "npm run build && npm run test"
30
+ },
31
+ "dependencies": {
32
+ "cheerio": "^1.2.0",
33
+ "commander": "^14.0.3",
34
+ "playwright": "^1.58.2",
35
+ "turndown": "^7.2.2"
36
+ },
37
+ "devDependencies": {
38
+ "@types/node": "^25.3.3",
39
+ "@types/turndown": "^5.0.6",
40
+ "typescript": "^5.9.3",
41
+ "vitest": "^4.0.18"
42
+ }
43
+ }