@jenslys/curldown 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,11 +1,13 @@
1
1
  # curldown
2
2
 
3
- Fetch a webpage and return clean Markdown.
3
+ Fetch a webpage and return clean Markdown for AI workflows.
4
4
 
5
- `curldown` is a CLI-first tool for AI agents and scripts:
5
+ `curldown` is CLI-first:
6
6
 
7
7
  - Static mode: `fetch` HTML -> Cheerio cleanup -> Turndown markdown.
8
8
  - Dynamic mode: headless Chromium (Playwright) -> HTML -> markdown.
9
+ - `--auto` tries static first and falls back to dynamic when static output is thin.
10
+ - `--format json` emits markdown plus metadata for agent pipelines.
9
11
 
10
12
  ## Install
11
13
 
@@ -13,10 +15,6 @@ Fetch a webpage and return clean Markdown.
13
15
  npm install -g @jenslys/curldown
14
16
  ```
15
17
 
16
- ```bash
17
- bun add -g @jenslys/curldown
18
- ```
19
-
20
18
  ## Quick Start
21
19
 
22
20
  ```bash
@@ -26,7 +24,13 @@ curldown https://example.com
26
24
  # JS-heavy pages
27
25
  curldown https://example.com --dynamic
28
26
 
29
- # Write to file
27
+ # Auto fallback to dynamic when static output looks incomplete
28
+ curldown https://example.com --auto
29
+
30
+ # JSON output for AI pipelines
31
+ curldown https://example.com --format json
32
+
33
+ # Write output to a file
30
34
  curldown https://example.com --output page.md
31
35
  ```
32
36
 
@@ -38,15 +42,30 @@ curldown <url> [options]
38
42
 
39
43
  ## Options
40
44
 
45
+ - `--auto` Try static first and fall back to dynamic when static output is thin.
41
46
  - `--dynamic` Use Playwright Chromium to render before extraction.
42
- - `-o, --output <path>` Write markdown to file instead of stdout.
47
+ - `--format <type>` Output format: `markdown` (default) or `json`.
48
+ - `-o, --output <path>` Write output to file instead of stdout.
43
49
  - `--timeout-ms <number>` Request/render timeout in milliseconds.
44
- - `--user-agent <string>` Override request user-agent.
45
50
  - `--header <key:value>` Custom request header (repeatable).
46
- - `--remove-selector <css>` Remove selector(s) before conversion (repeatable).
47
51
  - `--help` Show help.
48
52
  - `--version` Show version.
49
53
 
54
+ ## JSON Output Shape
55
+
56
+ `--format json` returns:
57
+
58
+ - `url`
59
+ - `final_url`
60
+ - `title`
61
+ - `markdown`
62
+ - `content_type`
63
+ - `status`
64
+ - `fetched_at`
65
+ - `word_count`
66
+ - `sha256`
67
+ - `used_dynamic`
68
+
50
69
  ## Local Development
51
70
 
52
71
  ```bash
@@ -65,16 +84,8 @@ Paste this into your `AGENTS.md` if you want agents to always use `curldown` for
65
84
 
66
85
  - Always use `curldown` to fetch web pages for agent workflows.
67
86
  - Default command: `curldown <url>`.
68
- - If the page is JS-rendered or incomplete, retry with: `curldown <url> --dynamic`.
87
+ - Prefer `curldown <url> --auto` when it is unclear whether the page needs JavaScript rendering.
88
+ - Use `curldown <url> --format json` when downstream steps need structured metadata.
69
89
  - Prefer stdout output unless a task explicitly requires a file (`--output <path>`).
70
90
  - Do not use ad-hoc HTML scraping or direct browser automation when `curldown` can handle it.
71
91
  ```
72
-
73
- ## Exit Codes
74
-
75
- - `0` success
76
- - `1` input/usage error
77
- - `2` static fetch/network error
78
- - `3` dynamic render/browser error
79
- - `4` output write error
80
- - `5` conversion pipeline error
package/dist/cli.js CHANGED
@@ -1,12 +1,20 @@
1
1
  #!/usr/bin/env node
2
+ import { createHash } from "node:crypto";
3
+ import { realpathSync } from "node:fs";
2
4
  import { Command, CommanderError } from "commander";
3
- import { pathToFileURL } from "node:url";
4
- import { DEFAULT_DYNAMIC_TIMEOUT_MS, DEFAULT_STATIC_TIMEOUT_MS, DEFAULT_USER_AGENT, VERSION } from "./constants.js";
5
- import { asCurldownError, InputError } from "./errors.js";
5
+ import { fileURLToPath, pathToFileURL } from "node:url";
6
+ import { DEFAULT_DYNAMIC_TIMEOUT_MS, DEFAULT_STATIC_TIMEOUT_MS, VERSION } from "./constants.js";
7
+ import { asCurldownError, ConversionError, InputError } from "./errors.js";
6
8
  import { fetchDynamicHtml } from "./fetch-dynamic.js";
7
9
  import { fetchStaticHtml } from "./fetch-static.js";
8
10
  import { writeOutput } from "./output.js";
9
- import { transformHtmlToMarkdown } from "./transform.js";
11
+ import { extractHtmlTitle, transformHtmlToMarkdown } from "./transform.js";
12
+ const MARKDOWN_CONTENT_TYPES = new Set([
13
+ "text/markdown",
14
+ "text/x-markdown",
15
+ "application/markdown",
16
+ "application/x-markdown"
17
+ ]);
10
18
  const defaultDependencies = {
11
19
  fetchStatic: fetchStaticHtml,
12
20
  fetchDynamic: fetchDynamicHtml,
@@ -24,11 +32,11 @@ function buildProgram() {
24
32
  .version(VERSION)
25
33
  .argument("<url>", "The URL to fetch")
26
34
  .option("--dynamic", "Use headless Chromium (Playwright) to render the page")
27
- .option("-o, --output <path>", "Write markdown to a file instead of stdout")
35
+ .option("--auto", "Try static first and fallback to dynamic when static output is thin")
36
+ .option("--format <type>", "Output format: markdown|json", "markdown")
37
+ .option("-o, --output <path>", "Write output to a file instead of stdout")
28
38
  .option("--timeout-ms <number>", "Timeout in milliseconds")
29
- .option("--user-agent <string>", "Override request user-agent")
30
39
  .option("--header <key:value>", "Set custom request header", collectRepeatable, [])
31
- .option("--remove-selector <css>", "Remove matching selector(s) before markdown conversion", collectRepeatable, [])
32
40
  .showHelpAfterError()
33
41
  .exitOverride();
34
42
  }
@@ -48,15 +56,76 @@ function parseHeaders(rawHeaders) {
48
56
  }
49
57
  return headers;
50
58
  }
51
- function parseTimeout(rawTimeout, dynamic) {
59
+ function parseFormat(rawFormat) {
60
+ if (rawFormat === "markdown" || rawFormat === "json") {
61
+ return rawFormat;
62
+ }
63
+ throw new InputError(`Invalid --format value \"${rawFormat}\". Use \"markdown\" or \"json\".`);
64
+ }
65
+ function parseTimeouts(rawTimeout, dynamic, auto) {
52
66
  if (rawTimeout === undefined) {
53
- return dynamic ? DEFAULT_DYNAMIC_TIMEOUT_MS : DEFAULT_STATIC_TIMEOUT_MS;
67
+ if (dynamic) {
68
+ return {
69
+ timeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS,
70
+ dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
71
+ };
72
+ }
73
+ if (auto) {
74
+ return {
75
+ timeoutMs: DEFAULT_STATIC_TIMEOUT_MS,
76
+ dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
77
+ };
78
+ }
79
+ return {
80
+ timeoutMs: DEFAULT_STATIC_TIMEOUT_MS,
81
+ dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
82
+ };
54
83
  }
55
84
  const parsed = Number.parseInt(rawTimeout, 10);
56
85
  if (!Number.isInteger(parsed) || parsed <= 0) {
57
86
  throw new InputError(`Invalid --timeout-ms value \"${rawTimeout}\". Must be a positive integer.`);
58
87
  }
59
- return parsed;
88
+ return {
89
+ timeoutMs: parsed,
90
+ dynamicTimeoutMs: parsed
91
+ };
92
+ }
93
+ function normalizeMarkdown(markdown) {
94
+ const trimmed = markdown.trim();
95
+ if (!trimmed) {
96
+ throw new ConversionError("Content was fetched but markdown output is empty.");
97
+ }
98
+ return `${trimmed}\n`;
99
+ }
100
+ function inferTitleFromMarkdown(markdown) {
101
+ const firstHeading = markdown.match(/^#\s+(.+)$/m)?.[1]?.trim();
102
+ return firstHeading || undefined;
103
+ }
104
+ function isMarkdownContentType(contentType) {
105
+ if (!contentType) {
106
+ return false;
107
+ }
108
+ const normalized = contentType.toLowerCase().split(";")[0]?.trim() ?? "";
109
+ return MARKDOWN_CONTENT_TYPES.has(normalized);
110
+ }
111
+ function countWords(value) {
112
+ const trimmed = value.trim();
113
+ if (!trimmed) {
114
+ return 0;
115
+ }
116
+ return trimmed.split(/\s+/).length;
117
+ }
118
+ function shouldAutoFallback(markdown) {
119
+ const trimmed = markdown.trim();
120
+ if (!trimmed) {
121
+ return true;
122
+ }
123
+ const lower = trimmed.toLowerCase();
124
+ if (/enable javascript|javascript is required|checking your browser|just a moment|please wait/.test(lower)) {
125
+ return true;
126
+ }
127
+ const nonEmptyLines = trimmed.split(/\r?\n/).filter((line) => line.trim().length > 0).length;
128
+ return countWords(trimmed) < 30 && nonEmptyLines <= 2;
60
129
  }
61
130
  /**
62
131
  * Validate and normalize parsed CLI arguments into the canonical runtime shape.
@@ -79,16 +148,58 @@ function normalizeArgs(urlInput, options) {
79
148
  throw new InputError(`Unsupported URL protocol \"${parsedUrl.protocol}\". Only http:// and https:// are supported.`);
80
149
  }
81
150
  const dynamic = options.dynamic ?? false;
151
+ const auto = options.auto ?? false;
152
+ if (dynamic && auto) {
153
+ throw new InputError("--dynamic and --auto cannot be used together.");
154
+ }
155
+ const timeouts = parseTimeouts(options.timeoutMs, dynamic, auto);
82
156
  return {
83
157
  url: parsedUrl.toString(),
158
+ auto,
84
159
  dynamic,
160
+ format: parseFormat(options.format ?? "markdown"),
85
161
  outputPath: options.output,
86
- timeoutMs: parseTimeout(options.timeoutMs, dynamic),
87
- userAgent: options.userAgent?.trim() || DEFAULT_USER_AGENT,
88
- headers: parseHeaders(options.header ?? []),
89
- removeSelectors: (options.removeSelector ?? []).map((selector) => selector.trim()).filter(Boolean)
162
+ timeoutMs: timeouts.timeoutMs,
163
+ dynamicTimeoutMs: timeouts.dynamicTimeoutMs,
164
+ headers: parseHeaders(options.header ?? [])
90
165
  };
91
166
  }
167
+ function prepareContentFromFetchResult(result, deps) {
168
+ if (isMarkdownContentType(result.contentType)) {
169
+ const markdown = normalizeMarkdown(result.body);
170
+ return {
171
+ markdown,
172
+ title: inferTitleFromMarkdown(markdown),
173
+ source: result,
174
+ passthrough: true
175
+ };
176
+ }
177
+ const markdown = deps.transformHtmlToMarkdown({ html: result.body });
178
+ return {
179
+ markdown,
180
+ title: extractHtmlTitle(result.body),
181
+ source: result,
182
+ passthrough: false
183
+ };
184
+ }
185
+ function formatOutput(args, content, usedDynamic) {
186
+ if (args.format === "markdown") {
187
+ return content.markdown;
188
+ }
189
+ const payload = {
190
+ url: args.url,
191
+ final_url: content.source.finalUrl,
192
+ title: content.title ?? null,
193
+ markdown: content.markdown,
194
+ content_type: content.source.contentType ?? null,
195
+ status: content.source.status,
196
+ fetched_at: new Date().toISOString(),
197
+ word_count: countWords(content.markdown),
198
+ sha256: createHash("sha256").update(content.markdown).digest("hex"),
199
+ used_dynamic: usedDynamic
200
+ };
201
+ return `${JSON.stringify(payload, null, 2)}\n`;
202
+ }
92
203
  /**
93
204
  * Execute one curldown CLI invocation and return process exit code.
94
205
  * `argv` should not include the Node executable or script path.
@@ -116,21 +227,45 @@ export async function run(argv, deps = defaultDependencies) {
116
227
  }
117
228
  try {
118
229
  const args = normalizeArgs(urlArg, options);
119
- const fetchInput = {
120
- url: args.url,
121
- timeoutMs: args.timeoutMs,
122
- userAgent: args.userAgent,
123
- headers: args.headers
124
- };
125
- const html = args.dynamic
126
- ? await deps.fetchDynamic(fetchInput)
127
- : await deps.fetchStatic(fetchInput);
128
- const markdown = deps.transformHtmlToMarkdown({
129
- html,
130
- removeSelectors: args.removeSelectors
131
- });
230
+ let usedDynamic = false;
231
+ let content;
232
+ if (args.auto) {
233
+ const staticResult = await deps.fetchStatic({
234
+ url: args.url,
235
+ timeoutMs: args.timeoutMs,
236
+ headers: args.headers
237
+ });
238
+ content = prepareContentFromFetchResult(staticResult, deps);
239
+ if (!content.passthrough && shouldAutoFallback(content.markdown)) {
240
+ const dynamicResult = await deps.fetchDynamic({
241
+ url: args.url,
242
+ timeoutMs: args.dynamicTimeoutMs,
243
+ headers: args.headers
244
+ });
245
+ content = prepareContentFromFetchResult(dynamicResult, deps);
246
+ usedDynamic = true;
247
+ }
248
+ }
249
+ else if (args.dynamic) {
250
+ const dynamicResult = await deps.fetchDynamic({
251
+ url: args.url,
252
+ timeoutMs: args.dynamicTimeoutMs,
253
+ headers: args.headers
254
+ });
255
+ content = prepareContentFromFetchResult(dynamicResult, deps);
256
+ usedDynamic = true;
257
+ }
258
+ else {
259
+ const staticResult = await deps.fetchStatic({
260
+ url: args.url,
261
+ timeoutMs: args.timeoutMs,
262
+ headers: args.headers
263
+ });
264
+ content = prepareContentFromFetchResult(staticResult, deps);
265
+ }
266
+ const output = formatOutput(args, content, usedDynamic);
132
267
  await deps.writeOutput({
133
- markdown,
268
+ content: output,
134
269
  outputPath: args.outputPath
135
270
  });
136
271
  return 0;
@@ -141,7 +276,27 @@ export async function run(argv, deps = defaultDependencies) {
141
276
  return curldownError.exitCode;
142
277
  }
143
278
  }
144
- const isMain = process.argv[1] !== undefined && pathToFileURL(process.argv[1]).href === import.meta.url;
279
+ function resolvePathStrict(pathInput) {
280
+ return realpathSync(pathInput);
281
+ }
282
+ /**
283
+ * Determine whether this module was invoked as the CLI entrypoint.
284
+ * Resolves symlinks for both paths so global installs that expose a symlinked bin still execute.
285
+ */
286
+ export function isMainModule(argvPath = process.argv[1]) {
287
+ if (argvPath === undefined) {
288
+ return false;
289
+ }
290
+ try {
291
+ const invokedPath = resolvePathStrict(argvPath);
292
+ const modulePath = resolvePathStrict(fileURLToPath(import.meta.url));
293
+ return invokedPath === modulePath;
294
+ }
295
+ catch {
296
+ return pathToFileURL(argvPath).href === import.meta.url;
297
+ }
298
+ }
299
+ const isMain = isMainModule();
145
300
  if (isMain) {
146
301
  void run(process.argv.slice(2)).then((exitCode) => {
147
302
  process.exitCode = exitCode;
package/dist/constants.js CHANGED
@@ -1,7 +1,6 @@
1
- export const VERSION = "1.0.0";
1
+ export const VERSION = "1.0.1";
2
2
  export const DEFAULT_STATIC_TIMEOUT_MS = 15_000;
3
3
  export const DEFAULT_DYNAMIC_TIMEOUT_MS = 30_000;
4
- export const DEFAULT_USER_AGENT = `curldown/${VERSION} (+https://www.npmjs.com/package/@jenslys/curldown)`;
5
4
  export const DEFAULT_REMOVE_SELECTORS = [
6
5
  "script",
7
6
  "style",
@@ -9,16 +9,21 @@ export async function fetchDynamicHtml(input) {
9
9
  try {
10
10
  browser = await chromium.launch({ headless: true });
11
11
  const context = await browser.newContext({
12
- userAgent: input.userAgent,
13
12
  extraHTTPHeaders: input.headers
14
13
  });
15
14
  try {
16
15
  const page = await context.newPage();
17
- await page.goto(input.url, {
16
+ const response = await page.goto(input.url, {
18
17
  timeout: input.timeoutMs,
19
18
  waitUntil: "domcontentloaded"
20
19
  });
21
- return await page.content();
20
+ const body = await page.content();
21
+ return {
22
+ body,
23
+ finalUrl: page.url(),
24
+ status: response?.status() ?? 200,
25
+ contentType: response?.headers()["content-type"]
26
+ };
22
27
  }
23
28
  finally {
24
29
  await context.close();
@@ -5,9 +5,6 @@ import { FetchError } from "./errors.js";
5
5
  */
6
6
  export async function fetchStaticHtml(input) {
7
7
  const headers = new Headers(input.headers);
8
- if (input.userAgent) {
9
- headers.set("user-agent", input.userAgent);
10
- }
11
8
  let response;
12
9
  try {
13
10
  response = await fetch(input.url, {
@@ -23,7 +20,13 @@ export async function fetchStaticHtml(input) {
23
20
  throw new FetchError(`Static fetch failed for ${input.url}: HTTP ${response.status} ${response.statusText}`);
24
21
  }
25
22
  try {
26
- return await response.text();
23
+ const body = await response.text();
24
+ return {
25
+ body,
26
+ finalUrl: response.url || input.url,
27
+ status: response.status,
28
+ contentType: response.headers.get("content-type") ?? undefined
29
+ };
27
30
  }
28
31
  catch (error) {
29
32
  throw new FetchError(`Failed reading response body for ${input.url}: ${error instanceof Error ? error.message : String(error)}`, { cause: error instanceof Error ? error : undefined });
package/dist/output.js CHANGED
@@ -7,7 +7,7 @@ import { OutputError } from "./errors.js";
7
7
  export async function writeOutput(input) {
8
8
  if (input.outputPath) {
9
9
  try {
10
- await writeFile(input.outputPath, input.markdown, "utf8");
10
+ await writeFile(input.outputPath, input.content, "utf8");
11
11
  return;
12
12
  }
13
13
  catch (error) {
@@ -15,7 +15,7 @@ export async function writeOutput(input) {
15
15
  }
16
16
  }
17
17
  try {
18
- process.stdout.write(input.markdown);
18
+ process.stdout.write(input.content);
19
19
  }
20
20
  catch (error) {
21
21
  throw new OutputError(`Failed writing markdown to stdout: ${error instanceof Error ? error.message : String(error)}`, { cause: error instanceof Error ? error : undefined });
package/dist/transform.js CHANGED
@@ -1,31 +1,25 @@
1
1
  import { load } from "cheerio";
2
+ import { createRequire } from "node:module";
2
3
  import TurndownService from "turndown";
3
4
  import { DEFAULT_REMOVE_SELECTORS } from "./constants.js";
4
5
  import { ConversionError } from "./errors.js";
6
+ const require = createRequire(import.meta.url);
7
+ const turndownPluginGfm = require("@joplin/turndown-plugin-gfm");
5
8
  const turndown = new TurndownService({
6
9
  headingStyle: "atx",
7
10
  codeBlockStyle: "fenced",
8
11
  bulletListMarker: "-",
9
12
  emDelimiter: "_"
10
13
  });
11
- /** Normalize selector input by trimming, dropping empties, and removing duplicates. */
12
- function uniqueSelectors(selectors) {
13
- return [...new Set(selectors.map((selector) => selector.trim()).filter(Boolean))];
14
- }
14
+ turndown.use(turndownPluginGfm.gfm);
15
15
  /**
16
16
  * Convert fetched HTML into markdown.
17
- * The function removes default non-content nodes and optional caller-provided
18
- * selectors before running Turndown conversion.
17
+ * The function removes default non-content nodes before running Turndown
18
+ * with GitHub Flavored Markdown extensions.
19
19
  */
20
20
  export function transformHtmlToMarkdown(input) {
21
21
  const $ = load(input.html);
22
- const selectorsToRemove = uniqueSelectors([
23
- ...DEFAULT_REMOVE_SELECTORS,
24
- ...input.removeSelectors
25
- ]);
26
- if (selectorsToRemove.length > 0) {
27
- $(selectorsToRemove.join(",")).remove();
28
- }
22
+ $(DEFAULT_REMOVE_SELECTORS.join(",")).remove();
29
23
  const bodyHtml = $("body").length > 0 ? $("body").html() ?? "" : $.root().html() ?? "";
30
24
  if (bodyHtml.trim().length === 0) {
31
25
  throw new ConversionError("No HTML body content found to convert.");
@@ -36,3 +30,9 @@ export function transformHtmlToMarkdown(input) {
36
30
  }
37
31
  return `${markdown}\n`;
38
32
  }
33
+ /** Extract document title from HTML head when available. */
34
+ export function extractHtmlTitle(html) {
35
+ const $ = load(html);
36
+ const title = $("title").first().text().trim();
37
+ return title || undefined;
38
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@jenslys/curldown",
3
- "version": "1.0.0",
3
+ "version": "1.0.2",
4
4
  "description": "Fetch URL content and convert it to markdown.",
5
5
  "repository": {
6
6
  "type": "git",
@@ -29,6 +29,7 @@
29
29
  "prepublishOnly": "npm run build && npm run test"
30
30
  },
31
31
  "dependencies": {
32
+ "@joplin/turndown-plugin-gfm": "^1.0.64",
32
33
  "cheerio": "^1.2.0",
33
34
  "commander": "^14.0.3",
34
35
  "playwright": "^1.58.2",