@jenslys/curldown 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -1
- package/dist/cli.js +58 -3
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -7,6 +7,7 @@ Fetch a webpage and return clean Markdown for AI workflows.
|
|
|
7
7
|
- Static mode: `fetch` HTML -> Cheerio cleanup -> Turndown markdown.
|
|
8
8
|
- Dynamic mode: headless Chromium (Playwright) -> HTML -> markdown.
|
|
9
9
|
- `--auto` tries static first and falls back to dynamic when static output is thin.
|
|
10
|
+
- Direct markdown responses are passed through (including `.md` URLs served as `text/plain`).
|
|
10
11
|
- `--format json` emits markdown plus metadata for agent pipelines.
|
|
11
12
|
|
|
12
13
|
## Install
|
|
@@ -82,7 +83,8 @@ Paste this into your `AGENTS.md` if you want agents to always use `curldown` for
|
|
|
82
83
|
```md
|
|
83
84
|
## Website Content Retrieval
|
|
84
85
|
|
|
85
|
-
-
|
|
86
|
+
- Use `curldown` for website/article page retrieval in agent workflows.
|
|
87
|
+
- Do not use `curldown` for raw code files or repository file blobs (for those, fetch the file directly).
|
|
86
88
|
- Default command: `curldown <url>`.
|
|
87
89
|
- Prefer `curldown <url> --auto` when page rendering might be uncertain.
|
|
88
90
|
- Use `curldown <url> --format json` when downstream steps need structured metadata.
|
package/dist/cli.js
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
|
+
import { realpathSync } from "node:fs";
|
|
3
4
|
import { Command, CommanderError } from "commander";
|
|
4
|
-
import { pathToFileURL } from "node:url";
|
|
5
|
+
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
5
6
|
import { DEFAULT_DYNAMIC_TIMEOUT_MS, DEFAULT_STATIC_TIMEOUT_MS, VERSION } from "./constants.js";
|
|
6
7
|
import { asCurldownError, ConversionError, InputError } from "./errors.js";
|
|
7
8
|
import { fetchDynamicHtml } from "./fetch-dynamic.js";
|
|
@@ -14,6 +15,16 @@ const MARKDOWN_CONTENT_TYPES = new Set([
|
|
|
14
15
|
"application/markdown",
|
|
15
16
|
"application/x-markdown"
|
|
16
17
|
]);
|
|
18
|
+
const PLAINTEXT_CONTENT_TYPE = "text/plain";
|
|
19
|
+
const MARKDOWN_FILE_EXTENSIONS = [
|
|
20
|
+
".md",
|
|
21
|
+
".markdown",
|
|
22
|
+
".mdown",
|
|
23
|
+
".mkd",
|
|
24
|
+
".mkdn",
|
|
25
|
+
".mdtxt",
|
|
26
|
+
".mdx"
|
|
27
|
+
];
|
|
17
28
|
const defaultDependencies = {
|
|
18
29
|
fetchStatic: fetchStaticHtml,
|
|
19
30
|
fetchDynamic: fetchDynamicHtml,
|
|
@@ -107,6 +118,30 @@ function isMarkdownContentType(contentType) {
|
|
|
107
118
|
const normalized = contentType.toLowerCase().split(";")[0]?.trim() ?? "";
|
|
108
119
|
return MARKDOWN_CONTENT_TYPES.has(normalized);
|
|
109
120
|
}
|
|
121
|
+
function isPlainTextContentType(contentType) {
|
|
122
|
+
if (!contentType) {
|
|
123
|
+
return false;
|
|
124
|
+
}
|
|
125
|
+
const normalized = contentType.toLowerCase().split(";")[0]?.trim() ?? "";
|
|
126
|
+
return normalized === PLAINTEXT_CONTENT_TYPE;
|
|
127
|
+
}
|
|
128
|
+
function hasMarkdownFileExtension(urlValue) {
|
|
129
|
+
let pathname;
|
|
130
|
+
try {
|
|
131
|
+
pathname = new URL(urlValue).pathname;
|
|
132
|
+
}
|
|
133
|
+
catch {
|
|
134
|
+
return false;
|
|
135
|
+
}
|
|
136
|
+
const normalizedPath = pathname.toLowerCase();
|
|
137
|
+
return MARKDOWN_FILE_EXTENSIONS.some((extension) => normalizedPath.endsWith(extension));
|
|
138
|
+
}
|
|
139
|
+
function shouldTreatAsMarkdownPassthrough(result) {
|
|
140
|
+
if (isMarkdownContentType(result.contentType)) {
|
|
141
|
+
return true;
|
|
142
|
+
}
|
|
143
|
+
return isPlainTextContentType(result.contentType) && hasMarkdownFileExtension(result.finalUrl);
|
|
144
|
+
}
|
|
110
145
|
function countWords(value) {
|
|
111
146
|
const trimmed = value.trim();
|
|
112
147
|
if (!trimmed) {
|
|
@@ -164,7 +199,7 @@ function normalizeArgs(urlInput, options) {
|
|
|
164
199
|
};
|
|
165
200
|
}
|
|
166
201
|
function prepareContentFromFetchResult(result, deps) {
|
|
167
|
-
if (isMarkdownContentType(result.contentType)) {
|
|
202
|
+
if (shouldTreatAsMarkdownPassthrough(result)) {
|
|
168
203
|
const markdown = normalizeMarkdown(result.body);
|
|
169
204
|
return {
|
|
170
205
|
markdown,
|
|
@@ -275,7 +310,27 @@ export async function run(argv, deps = defaultDependencies) {
|
|
|
275
310
|
return curldownError.exitCode;
|
|
276
311
|
}
|
|
277
312
|
}
|
|
278
|
-
|
|
313
|
+
function resolvePathStrict(pathInput) {
|
|
314
|
+
return realpathSync(pathInput);
|
|
315
|
+
}
|
|
316
|
+
/**
|
|
317
|
+
* Determine whether this module was invoked as the CLI entrypoint.
|
|
318
|
+
* Resolves symlinks for both paths so global installs that expose a symlinked bin still execute.
|
|
319
|
+
*/
|
|
320
|
+
export function isMainModule(argvPath = process.argv[1]) {
|
|
321
|
+
if (argvPath === undefined) {
|
|
322
|
+
return false;
|
|
323
|
+
}
|
|
324
|
+
try {
|
|
325
|
+
const invokedPath = resolvePathStrict(argvPath);
|
|
326
|
+
const modulePath = resolvePathStrict(fileURLToPath(import.meta.url));
|
|
327
|
+
return invokedPath === modulePath;
|
|
328
|
+
}
|
|
329
|
+
catch {
|
|
330
|
+
return pathToFileURL(argvPath).href === import.meta.url;
|
|
331
|
+
}
|
|
332
|
+
}
|
|
333
|
+
const isMain = isMainModule();
|
|
279
334
|
if (isMain) {
|
|
280
335
|
void run(process.argv.slice(2)).then((exitCode) => {
|
|
281
336
|
process.exitCode = exitCode;
|