@jenslys/curldown 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -20
- package/dist/cli.js +184 -29
- package/dist/constants.js +1 -2
- package/dist/fetch-dynamic.js +8 -3
- package/dist/fetch-static.js +7 -4
- package/dist/output.js +2 -2
- package/dist/transform.js +13 -13
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
# curldown
|
|
2
2
|
|
|
3
|
-
Fetch a webpage and return clean Markdown.
|
|
3
|
+
Fetch a webpage and return clean Markdown for AI workflows.
|
|
4
4
|
|
|
5
|
-
`curldown` is
|
|
5
|
+
`curldown` is CLI-first:
|
|
6
6
|
|
|
7
7
|
- Static mode: `fetch` HTML -> Cheerio cleanup -> Turndown markdown.
|
|
8
8
|
- Dynamic mode: headless Chromium (Playwright) -> HTML -> markdown.
|
|
9
|
+
- `--auto` tries static first and falls back to dynamic when static output is thin.
|
|
10
|
+
- `--format json` emits markdown plus metadata for agent pipelines.
|
|
9
11
|
|
|
10
12
|
## Install
|
|
11
13
|
|
|
@@ -13,10 +15,6 @@ Fetch a webpage and return clean Markdown.
|
|
|
13
15
|
npm install -g @jenslys/curldown
|
|
14
16
|
```
|
|
15
17
|
|
|
16
|
-
```bash
|
|
17
|
-
bun add -g @jenslys/curldown
|
|
18
|
-
```
|
|
19
|
-
|
|
20
18
|
## Quick Start
|
|
21
19
|
|
|
22
20
|
```bash
|
|
@@ -26,7 +24,13 @@ curldown https://example.com
|
|
|
26
24
|
# JS-heavy pages
|
|
27
25
|
curldown https://example.com --dynamic
|
|
28
26
|
|
|
29
|
-
#
|
|
27
|
+
# Auto fallback to dynamic when static output looks incomplete
|
|
28
|
+
curldown https://example.com --auto
|
|
29
|
+
|
|
30
|
+
# JSON output for AI pipelines
|
|
31
|
+
curldown https://example.com --format json
|
|
32
|
+
|
|
33
|
+
# Write output to a file
|
|
30
34
|
curldown https://example.com --output page.md
|
|
31
35
|
```
|
|
32
36
|
|
|
@@ -38,15 +42,30 @@ curldown <url> [options]
|
|
|
38
42
|
|
|
39
43
|
## Options
|
|
40
44
|
|
|
45
|
+
- `--auto` Try static first and fall back to dynamic when static output is thin.
|
|
41
46
|
- `--dynamic` Use Playwright Chromium to render before extraction.
|
|
42
|
-
-
|
|
47
|
+
- `--format <type>` Output format: `markdown` (default) or `json`.
|
|
48
|
+
- `-o, --output <path>` Write output to file instead of stdout.
|
|
43
49
|
- `--timeout-ms <number>` Request/render timeout in milliseconds.
|
|
44
|
-
- `--user-agent <string>` Override request user-agent.
|
|
45
50
|
- `--header <key:value>` Custom request header (repeatable).
|
|
46
|
-
- `--remove-selector <css>` Remove selector(s) before conversion (repeatable).
|
|
47
51
|
- `--help` Show help.
|
|
48
52
|
- `--version` Show version.
|
|
49
53
|
|
|
54
|
+
## JSON Output Shape
|
|
55
|
+
|
|
56
|
+
`--format json` returns:
|
|
57
|
+
|
|
58
|
+
- `url`
|
|
59
|
+
- `final_url`
|
|
60
|
+
- `title`
|
|
61
|
+
- `markdown`
|
|
62
|
+
- `content_type`
|
|
63
|
+
- `status`
|
|
64
|
+
- `fetched_at`
|
|
65
|
+
- `word_count`
|
|
66
|
+
- `sha256`
|
|
67
|
+
- `used_dynamic`
|
|
68
|
+
|
|
50
69
|
## Local Development
|
|
51
70
|
|
|
52
71
|
```bash
|
|
@@ -65,16 +84,8 @@ Paste this into your `AGENTS.md` if you want agents to always use `curldown` for
|
|
|
65
84
|
|
|
66
85
|
- Always use `curldown` to fetch web pages for agent workflows.
|
|
67
86
|
- Default command: `curldown <url>`.
|
|
68
|
-
-
|
|
87
|
+
- Prefer `curldown <url> --auto` when page rendering might be uncertain.
|
|
88
|
+
- Use `curldown <url> --format json` when downstream steps need structured metadata.
|
|
69
89
|
- Prefer stdout output unless a task explicitly requires a file (`--output <path>`).
|
|
70
90
|
- Do not use ad-hoc HTML scraping or direct browser automation when `curldown` can handle it.
|
|
71
91
|
```
|
|
72
|
-
|
|
73
|
-
## Exit Codes
|
|
74
|
-
|
|
75
|
-
- `0` success
|
|
76
|
-
- `1` input/usage error
|
|
77
|
-
- `2` static fetch/network error
|
|
78
|
-
- `3` dynamic render/browser error
|
|
79
|
-
- `4` output write error
|
|
80
|
-
- `5` conversion pipeline error
|
package/dist/cli.js
CHANGED
|
@@ -1,12 +1,20 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
import { createHash } from "node:crypto";
|
|
3
|
+
import { realpathSync } from "node:fs";
|
|
2
4
|
import { Command, CommanderError } from "commander";
|
|
3
|
-
import { pathToFileURL } from "node:url";
|
|
4
|
-
import { DEFAULT_DYNAMIC_TIMEOUT_MS, DEFAULT_STATIC_TIMEOUT_MS,
|
|
5
|
-
import { asCurldownError, InputError } from "./errors.js";
|
|
5
|
+
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
6
|
+
import { DEFAULT_DYNAMIC_TIMEOUT_MS, DEFAULT_STATIC_TIMEOUT_MS, VERSION } from "./constants.js";
|
|
7
|
+
import { asCurldownError, ConversionError, InputError } from "./errors.js";
|
|
6
8
|
import { fetchDynamicHtml } from "./fetch-dynamic.js";
|
|
7
9
|
import { fetchStaticHtml } from "./fetch-static.js";
|
|
8
10
|
import { writeOutput } from "./output.js";
|
|
9
|
-
import { transformHtmlToMarkdown } from "./transform.js";
|
|
11
|
+
import { extractHtmlTitle, transformHtmlToMarkdown } from "./transform.js";
|
|
12
|
+
const MARKDOWN_CONTENT_TYPES = new Set([
|
|
13
|
+
"text/markdown",
|
|
14
|
+
"text/x-markdown",
|
|
15
|
+
"application/markdown",
|
|
16
|
+
"application/x-markdown"
|
|
17
|
+
]);
|
|
10
18
|
const defaultDependencies = {
|
|
11
19
|
fetchStatic: fetchStaticHtml,
|
|
12
20
|
fetchDynamic: fetchDynamicHtml,
|
|
@@ -24,11 +32,11 @@ function buildProgram() {
|
|
|
24
32
|
.version(VERSION)
|
|
25
33
|
.argument("<url>", "The URL to fetch")
|
|
26
34
|
.option("--dynamic", "Use headless Chromium (Playwright) to render the page")
|
|
27
|
-
.option("
|
|
35
|
+
.option("--auto", "Try static first and fallback to dynamic when static output is thin")
|
|
36
|
+
.option("--format <type>", "Output format: markdown|json", "markdown")
|
|
37
|
+
.option("-o, --output <path>", "Write output to a file instead of stdout")
|
|
28
38
|
.option("--timeout-ms <number>", "Timeout in milliseconds")
|
|
29
|
-
.option("--user-agent <string>", "Override request user-agent")
|
|
30
39
|
.option("--header <key:value>", "Set custom request header", collectRepeatable, [])
|
|
31
|
-
.option("--remove-selector <css>", "Remove matching selector(s) before markdown conversion", collectRepeatable, [])
|
|
32
40
|
.showHelpAfterError()
|
|
33
41
|
.exitOverride();
|
|
34
42
|
}
|
|
@@ -48,15 +56,76 @@ function parseHeaders(rawHeaders) {
|
|
|
48
56
|
}
|
|
49
57
|
return headers;
|
|
50
58
|
}
|
|
51
|
-
function
|
|
59
|
+
function parseFormat(rawFormat) {
|
|
60
|
+
if (rawFormat === "markdown" || rawFormat === "json") {
|
|
61
|
+
return rawFormat;
|
|
62
|
+
}
|
|
63
|
+
throw new InputError(`Invalid --format value \"${rawFormat}\". Use \"markdown\" or \"json\".`);
|
|
64
|
+
}
|
|
65
|
+
function parseTimeouts(rawTimeout, dynamic, auto) {
|
|
52
66
|
if (rawTimeout === undefined) {
|
|
53
|
-
|
|
67
|
+
if (dynamic) {
|
|
68
|
+
return {
|
|
69
|
+
timeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS,
|
|
70
|
+
dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
|
|
71
|
+
};
|
|
72
|
+
}
|
|
73
|
+
if (auto) {
|
|
74
|
+
return {
|
|
75
|
+
timeoutMs: DEFAULT_STATIC_TIMEOUT_MS,
|
|
76
|
+
dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
|
|
77
|
+
};
|
|
78
|
+
}
|
|
79
|
+
return {
|
|
80
|
+
timeoutMs: DEFAULT_STATIC_TIMEOUT_MS,
|
|
81
|
+
dynamicTimeoutMs: DEFAULT_DYNAMIC_TIMEOUT_MS
|
|
82
|
+
};
|
|
54
83
|
}
|
|
55
84
|
const parsed = Number.parseInt(rawTimeout, 10);
|
|
56
85
|
if (!Number.isInteger(parsed) || parsed <= 0) {
|
|
57
86
|
throw new InputError(`Invalid --timeout-ms value \"${rawTimeout}\". Must be a positive integer.`);
|
|
58
87
|
}
|
|
59
|
-
return
|
|
88
|
+
return {
|
|
89
|
+
timeoutMs: parsed,
|
|
90
|
+
dynamicTimeoutMs: parsed
|
|
91
|
+
};
|
|
92
|
+
}
|
|
93
|
+
function normalizeMarkdown(markdown) {
|
|
94
|
+
const trimmed = markdown.trim();
|
|
95
|
+
if (!trimmed) {
|
|
96
|
+
throw new ConversionError("Content was fetched but markdown output is empty.");
|
|
97
|
+
}
|
|
98
|
+
return `${trimmed}\n`;
|
|
99
|
+
}
|
|
100
|
+
function inferTitleFromMarkdown(markdown) {
|
|
101
|
+
const firstHeading = markdown.match(/^#\s+(.+)$/m)?.[1]?.trim();
|
|
102
|
+
return firstHeading || undefined;
|
|
103
|
+
}
|
|
104
|
+
function isMarkdownContentType(contentType) {
|
|
105
|
+
if (!contentType) {
|
|
106
|
+
return false;
|
|
107
|
+
}
|
|
108
|
+
const normalized = contentType.toLowerCase().split(";")[0]?.trim() ?? "";
|
|
109
|
+
return MARKDOWN_CONTENT_TYPES.has(normalized);
|
|
110
|
+
}
|
|
111
|
+
function countWords(value) {
|
|
112
|
+
const trimmed = value.trim();
|
|
113
|
+
if (!trimmed) {
|
|
114
|
+
return 0;
|
|
115
|
+
}
|
|
116
|
+
return trimmed.split(/\s+/).length;
|
|
117
|
+
}
|
|
118
|
+
function shouldAutoFallback(markdown) {
|
|
119
|
+
const trimmed = markdown.trim();
|
|
120
|
+
if (!trimmed) {
|
|
121
|
+
return true;
|
|
122
|
+
}
|
|
123
|
+
const lower = trimmed.toLowerCase();
|
|
124
|
+
if (/enable javascript|javascript is required|checking your browser|just a moment|please wait/.test(lower)) {
|
|
125
|
+
return true;
|
|
126
|
+
}
|
|
127
|
+
const nonEmptyLines = trimmed.split(/\r?\n/).filter((line) => line.trim().length > 0).length;
|
|
128
|
+
return countWords(trimmed) < 30 && nonEmptyLines <= 2;
|
|
60
129
|
}
|
|
61
130
|
/**
|
|
62
131
|
* Validate and normalize parsed CLI arguments into the canonical runtime shape.
|
|
@@ -79,16 +148,58 @@ function normalizeArgs(urlInput, options) {
|
|
|
79
148
|
throw new InputError(`Unsupported URL protocol \"${parsedUrl.protocol}\". Only http:// and https:// are supported.`);
|
|
80
149
|
}
|
|
81
150
|
const dynamic = options.dynamic ?? false;
|
|
151
|
+
const auto = options.auto ?? false;
|
|
152
|
+
if (dynamic && auto) {
|
|
153
|
+
throw new InputError("--dynamic and --auto cannot be used together.");
|
|
154
|
+
}
|
|
155
|
+
const timeouts = parseTimeouts(options.timeoutMs, dynamic, auto);
|
|
82
156
|
return {
|
|
83
157
|
url: parsedUrl.toString(),
|
|
158
|
+
auto,
|
|
84
159
|
dynamic,
|
|
160
|
+
format: parseFormat(options.format ?? "markdown"),
|
|
85
161
|
outputPath: options.output,
|
|
86
|
-
timeoutMs:
|
|
87
|
-
|
|
88
|
-
headers: parseHeaders(options.header ?? [])
|
|
89
|
-
removeSelectors: (options.removeSelector ?? []).map((selector) => selector.trim()).filter(Boolean)
|
|
162
|
+
timeoutMs: timeouts.timeoutMs,
|
|
163
|
+
dynamicTimeoutMs: timeouts.dynamicTimeoutMs,
|
|
164
|
+
headers: parseHeaders(options.header ?? [])
|
|
90
165
|
};
|
|
91
166
|
}
|
|
167
|
+
function prepareContentFromFetchResult(result, deps) {
|
|
168
|
+
if (isMarkdownContentType(result.contentType)) {
|
|
169
|
+
const markdown = normalizeMarkdown(result.body);
|
|
170
|
+
return {
|
|
171
|
+
markdown,
|
|
172
|
+
title: inferTitleFromMarkdown(markdown),
|
|
173
|
+
source: result,
|
|
174
|
+
passthrough: true
|
|
175
|
+
};
|
|
176
|
+
}
|
|
177
|
+
const markdown = deps.transformHtmlToMarkdown({ html: result.body });
|
|
178
|
+
return {
|
|
179
|
+
markdown,
|
|
180
|
+
title: extractHtmlTitle(result.body),
|
|
181
|
+
source: result,
|
|
182
|
+
passthrough: false
|
|
183
|
+
};
|
|
184
|
+
}
|
|
185
|
+
function formatOutput(args, content, usedDynamic) {
|
|
186
|
+
if (args.format === "markdown") {
|
|
187
|
+
return content.markdown;
|
|
188
|
+
}
|
|
189
|
+
const payload = {
|
|
190
|
+
url: args.url,
|
|
191
|
+
final_url: content.source.finalUrl,
|
|
192
|
+
title: content.title ?? null,
|
|
193
|
+
markdown: content.markdown,
|
|
194
|
+
content_type: content.source.contentType ?? null,
|
|
195
|
+
status: content.source.status,
|
|
196
|
+
fetched_at: new Date().toISOString(),
|
|
197
|
+
word_count: countWords(content.markdown),
|
|
198
|
+
sha256: createHash("sha256").update(content.markdown).digest("hex"),
|
|
199
|
+
used_dynamic: usedDynamic
|
|
200
|
+
};
|
|
201
|
+
return `${JSON.stringify(payload, null, 2)}\n`;
|
|
202
|
+
}
|
|
92
203
|
/**
|
|
93
204
|
* Execute one curldown CLI invocation and return process exit code.
|
|
94
205
|
* `argv` should not include the Node executable or script path.
|
|
@@ -116,21 +227,45 @@ export async function run(argv, deps = defaultDependencies) {
|
|
|
116
227
|
}
|
|
117
228
|
try {
|
|
118
229
|
const args = normalizeArgs(urlArg, options);
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
230
|
+
let usedDynamic = false;
|
|
231
|
+
let content;
|
|
232
|
+
if (args.auto) {
|
|
233
|
+
const staticResult = await deps.fetchStatic({
|
|
234
|
+
url: args.url,
|
|
235
|
+
timeoutMs: args.timeoutMs,
|
|
236
|
+
headers: args.headers
|
|
237
|
+
});
|
|
238
|
+
content = prepareContentFromFetchResult(staticResult, deps);
|
|
239
|
+
if (!content.passthrough && shouldAutoFallback(content.markdown)) {
|
|
240
|
+
const dynamicResult = await deps.fetchDynamic({
|
|
241
|
+
url: args.url,
|
|
242
|
+
timeoutMs: args.dynamicTimeoutMs,
|
|
243
|
+
headers: args.headers
|
|
244
|
+
});
|
|
245
|
+
content = prepareContentFromFetchResult(dynamicResult, deps);
|
|
246
|
+
usedDynamic = true;
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
else if (args.dynamic) {
|
|
250
|
+
const dynamicResult = await deps.fetchDynamic({
|
|
251
|
+
url: args.url,
|
|
252
|
+
timeoutMs: args.dynamicTimeoutMs,
|
|
253
|
+
headers: args.headers
|
|
254
|
+
});
|
|
255
|
+
content = prepareContentFromFetchResult(dynamicResult, deps);
|
|
256
|
+
usedDynamic = true;
|
|
257
|
+
}
|
|
258
|
+
else {
|
|
259
|
+
const staticResult = await deps.fetchStatic({
|
|
260
|
+
url: args.url,
|
|
261
|
+
timeoutMs: args.timeoutMs,
|
|
262
|
+
headers: args.headers
|
|
263
|
+
});
|
|
264
|
+
content = prepareContentFromFetchResult(staticResult, deps);
|
|
265
|
+
}
|
|
266
|
+
const output = formatOutput(args, content, usedDynamic);
|
|
132
267
|
await deps.writeOutput({
|
|
133
|
-
|
|
268
|
+
content: output,
|
|
134
269
|
outputPath: args.outputPath
|
|
135
270
|
});
|
|
136
271
|
return 0;
|
|
@@ -141,7 +276,27 @@ export async function run(argv, deps = defaultDependencies) {
|
|
|
141
276
|
return curldownError.exitCode;
|
|
142
277
|
}
|
|
143
278
|
}
|
|
144
|
-
|
|
279
|
+
function resolvePathStrict(pathInput) {
|
|
280
|
+
return realpathSync(pathInput);
|
|
281
|
+
}
|
|
282
|
+
/**
|
|
283
|
+
* Determine whether this module was invoked as the CLI entrypoint.
|
|
284
|
+
* Resolves symlinks for both paths so global installs that expose a symlinked bin still execute.
|
|
285
|
+
*/
|
|
286
|
+
export function isMainModule(argvPath = process.argv[1]) {
|
|
287
|
+
if (argvPath === undefined) {
|
|
288
|
+
return false;
|
|
289
|
+
}
|
|
290
|
+
try {
|
|
291
|
+
const invokedPath = resolvePathStrict(argvPath);
|
|
292
|
+
const modulePath = resolvePathStrict(fileURLToPath(import.meta.url));
|
|
293
|
+
return invokedPath === modulePath;
|
|
294
|
+
}
|
|
295
|
+
catch {
|
|
296
|
+
return pathToFileURL(argvPath).href === import.meta.url;
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
const isMain = isMainModule();
|
|
145
300
|
if (isMain) {
|
|
146
301
|
void run(process.argv.slice(2)).then((exitCode) => {
|
|
147
302
|
process.exitCode = exitCode;
|
package/dist/constants.js
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
|
-
export const VERSION = "1.0.
|
|
1
|
+
export const VERSION = "1.0.1";
|
|
2
2
|
export const DEFAULT_STATIC_TIMEOUT_MS = 15_000;
|
|
3
3
|
export const DEFAULT_DYNAMIC_TIMEOUT_MS = 30_000;
|
|
4
|
-
export const DEFAULT_USER_AGENT = `curldown/${VERSION} (+https://www.npmjs.com/package/@jenslys/curldown)`;
|
|
5
4
|
export const DEFAULT_REMOVE_SELECTORS = [
|
|
6
5
|
"script",
|
|
7
6
|
"style",
|
package/dist/fetch-dynamic.js
CHANGED
|
@@ -9,16 +9,21 @@ export async function fetchDynamicHtml(input) {
|
|
|
9
9
|
try {
|
|
10
10
|
browser = await chromium.launch({ headless: true });
|
|
11
11
|
const context = await browser.newContext({
|
|
12
|
-
userAgent: input.userAgent,
|
|
13
12
|
extraHTTPHeaders: input.headers
|
|
14
13
|
});
|
|
15
14
|
try {
|
|
16
15
|
const page = await context.newPage();
|
|
17
|
-
await page.goto(input.url, {
|
|
16
|
+
const response = await page.goto(input.url, {
|
|
18
17
|
timeout: input.timeoutMs,
|
|
19
18
|
waitUntil: "domcontentloaded"
|
|
20
19
|
});
|
|
21
|
-
|
|
20
|
+
const body = await page.content();
|
|
21
|
+
return {
|
|
22
|
+
body,
|
|
23
|
+
finalUrl: page.url(),
|
|
24
|
+
status: response?.status() ?? 200,
|
|
25
|
+
contentType: response?.headers()["content-type"]
|
|
26
|
+
};
|
|
22
27
|
}
|
|
23
28
|
finally {
|
|
24
29
|
await context.close();
|
package/dist/fetch-static.js
CHANGED
|
@@ -5,9 +5,6 @@ import { FetchError } from "./errors.js";
|
|
|
5
5
|
*/
|
|
6
6
|
export async function fetchStaticHtml(input) {
|
|
7
7
|
const headers = new Headers(input.headers);
|
|
8
|
-
if (input.userAgent) {
|
|
9
|
-
headers.set("user-agent", input.userAgent);
|
|
10
|
-
}
|
|
11
8
|
let response;
|
|
12
9
|
try {
|
|
13
10
|
response = await fetch(input.url, {
|
|
@@ -23,7 +20,13 @@ export async function fetchStaticHtml(input) {
|
|
|
23
20
|
throw new FetchError(`Static fetch failed for ${input.url}: HTTP ${response.status} ${response.statusText}`);
|
|
24
21
|
}
|
|
25
22
|
try {
|
|
26
|
-
|
|
23
|
+
const body = await response.text();
|
|
24
|
+
return {
|
|
25
|
+
body,
|
|
26
|
+
finalUrl: response.url || input.url,
|
|
27
|
+
status: response.status,
|
|
28
|
+
contentType: response.headers.get("content-type") ?? undefined
|
|
29
|
+
};
|
|
27
30
|
}
|
|
28
31
|
catch (error) {
|
|
29
32
|
throw new FetchError(`Failed reading response body for ${input.url}: ${error instanceof Error ? error.message : String(error)}`, { cause: error instanceof Error ? error : undefined });
|
package/dist/output.js
CHANGED
|
@@ -7,7 +7,7 @@ import { OutputError } from "./errors.js";
|
|
|
7
7
|
export async function writeOutput(input) {
|
|
8
8
|
if (input.outputPath) {
|
|
9
9
|
try {
|
|
10
|
-
await writeFile(input.outputPath, input.
|
|
10
|
+
await writeFile(input.outputPath, input.content, "utf8");
|
|
11
11
|
return;
|
|
12
12
|
}
|
|
13
13
|
catch (error) {
|
|
@@ -15,7 +15,7 @@ export async function writeOutput(input) {
|
|
|
15
15
|
}
|
|
16
16
|
}
|
|
17
17
|
try {
|
|
18
|
-
process.stdout.write(input.
|
|
18
|
+
process.stdout.write(input.content);
|
|
19
19
|
}
|
|
20
20
|
catch (error) {
|
|
21
21
|
throw new OutputError(`Failed writing markdown to stdout: ${error instanceof Error ? error.message : String(error)}`, { cause: error instanceof Error ? error : undefined });
|
package/dist/transform.js
CHANGED
|
@@ -1,31 +1,25 @@
|
|
|
1
1
|
import { load } from "cheerio";
|
|
2
|
+
import { createRequire } from "node:module";
|
|
2
3
|
import TurndownService from "turndown";
|
|
3
4
|
import { DEFAULT_REMOVE_SELECTORS } from "./constants.js";
|
|
4
5
|
import { ConversionError } from "./errors.js";
|
|
6
|
+
const require = createRequire(import.meta.url);
|
|
7
|
+
const turndownPluginGfm = require("@joplin/turndown-plugin-gfm");
|
|
5
8
|
const turndown = new TurndownService({
|
|
6
9
|
headingStyle: "atx",
|
|
7
10
|
codeBlockStyle: "fenced",
|
|
8
11
|
bulletListMarker: "-",
|
|
9
12
|
emDelimiter: "_"
|
|
10
13
|
});
|
|
11
|
-
|
|
12
|
-
function uniqueSelectors(selectors) {
|
|
13
|
-
return [...new Set(selectors.map((selector) => selector.trim()).filter(Boolean))];
|
|
14
|
-
}
|
|
14
|
+
turndown.use(turndownPluginGfm.gfm);
|
|
15
15
|
/**
|
|
16
16
|
* Convert fetched HTML into markdown.
|
|
17
|
-
* The function removes default non-content nodes
|
|
18
|
-
*
|
|
17
|
+
* The function removes default non-content nodes before running Turndown
|
|
18
|
+
* with GitHub Flavored Markdown extensions.
|
|
19
19
|
*/
|
|
20
20
|
export function transformHtmlToMarkdown(input) {
|
|
21
21
|
const $ = load(input.html);
|
|
22
|
-
|
|
23
|
-
...DEFAULT_REMOVE_SELECTORS,
|
|
24
|
-
...input.removeSelectors
|
|
25
|
-
]);
|
|
26
|
-
if (selectorsToRemove.length > 0) {
|
|
27
|
-
$(selectorsToRemove.join(",")).remove();
|
|
28
|
-
}
|
|
22
|
+
$(DEFAULT_REMOVE_SELECTORS.join(",")).remove();
|
|
29
23
|
const bodyHtml = $("body").length > 0 ? $("body").html() ?? "" : $.root().html() ?? "";
|
|
30
24
|
if (bodyHtml.trim().length === 0) {
|
|
31
25
|
throw new ConversionError("No HTML body content found to convert.");
|
|
@@ -36,3 +30,9 @@ export function transformHtmlToMarkdown(input) {
|
|
|
36
30
|
}
|
|
37
31
|
return `${markdown}\n`;
|
|
38
32
|
}
|
|
33
|
+
/** Extract document title from HTML head when available. */
|
|
34
|
+
export function extractHtmlTitle(html) {
|
|
35
|
+
const $ = load(html);
|
|
36
|
+
const title = $("title").first().text().trim();
|
|
37
|
+
return title || undefined;
|
|
38
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@jenslys/curldown",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.2",
|
|
4
4
|
"description": "Fetch URL content and convert it to markdown.",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -29,6 +29,7 @@
|
|
|
29
29
|
"prepublishOnly": "npm run build && npm run test"
|
|
30
30
|
},
|
|
31
31
|
"dependencies": {
|
|
32
|
+
"@joplin/turndown-plugin-gfm": "^1.0.64",
|
|
32
33
|
"cheerio": "^1.2.0",
|
|
33
34
|
"commander": "^14.0.3",
|
|
34
35
|
"playwright": "^1.58.2",
|