@jenslys/curldown 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -1
- package/dist/cli.js +35 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -7,6 +7,7 @@ Fetch a webpage and return clean Markdown for AI workflows.
|
|
|
7
7
|
- Static mode: `fetch` HTML -> Cheerio cleanup -> Turndown markdown.
|
|
8
8
|
- Dynamic mode: headless Chromium (Playwright) -> HTML -> markdown.
|
|
9
9
|
- `--auto` tries static first and falls back to dynamic when static output is thin.
|
|
10
|
+
- Direct markdown responses are passed through (including `.md` URLs served as `text/plain`).
|
|
10
11
|
- `--format json` emits markdown plus metadata for agent pipelines.
|
|
11
12
|
|
|
12
13
|
## Install
|
|
@@ -82,7 +83,8 @@ Paste this into your `AGENTS.md` if you want agents to always use `curldown` for
|
|
|
82
83
|
```md
|
|
83
84
|
## Website Content Retrieval
|
|
84
85
|
|
|
85
|
-
-
|
|
86
|
+
- Use `curldown` for website/article page retrieval in agent workflows.
|
|
87
|
+
- Do not use `curldown` for raw code files or repository file blobs (for those, fetch the file directly).
|
|
86
88
|
- Default command: `curldown <url>`.
|
|
87
89
|
- Prefer `curldown <url> --auto` when page rendering might be uncertain.
|
|
88
90
|
- Use `curldown <url> --format json` when downstream steps need structured metadata.
|
package/dist/cli.js
CHANGED
|
@@ -15,6 +15,16 @@ const MARKDOWN_CONTENT_TYPES = new Set([
|
|
|
15
15
|
"application/markdown",
|
|
16
16
|
"application/x-markdown"
|
|
17
17
|
]);
|
|
18
|
+
const PLAINTEXT_CONTENT_TYPE = "text/plain";
|
|
19
|
+
const MARKDOWN_FILE_EXTENSIONS = [
|
|
20
|
+
".md",
|
|
21
|
+
".markdown",
|
|
22
|
+
".mdown",
|
|
23
|
+
".mkd",
|
|
24
|
+
".mkdn",
|
|
25
|
+
".mdtxt",
|
|
26
|
+
".mdx"
|
|
27
|
+
];
|
|
18
28
|
const defaultDependencies = {
|
|
19
29
|
fetchStatic: fetchStaticHtml,
|
|
20
30
|
fetchDynamic: fetchDynamicHtml,
|
|
@@ -108,6 +118,30 @@ function isMarkdownContentType(contentType) {
|
|
|
108
118
|
const normalized = contentType.toLowerCase().split(";")[0]?.trim() ?? "";
|
|
109
119
|
return MARKDOWN_CONTENT_TYPES.has(normalized);
|
|
110
120
|
}
|
|
121
|
+
function isPlainTextContentType(contentType) {
|
|
122
|
+
if (!contentType) {
|
|
123
|
+
return false;
|
|
124
|
+
}
|
|
125
|
+
const normalized = contentType.toLowerCase().split(";")[0]?.trim() ?? "";
|
|
126
|
+
return normalized === PLAINTEXT_CONTENT_TYPE;
|
|
127
|
+
}
|
|
128
|
+
function hasMarkdownFileExtension(urlValue) {
|
|
129
|
+
let pathname;
|
|
130
|
+
try {
|
|
131
|
+
pathname = new URL(urlValue).pathname;
|
|
132
|
+
}
|
|
133
|
+
catch {
|
|
134
|
+
return false;
|
|
135
|
+
}
|
|
136
|
+
const normalizedPath = pathname.toLowerCase();
|
|
137
|
+
return MARKDOWN_FILE_EXTENSIONS.some((extension) => normalizedPath.endsWith(extension));
|
|
138
|
+
}
|
|
139
|
+
function shouldTreatAsMarkdownPassthrough(result) {
|
|
140
|
+
if (isMarkdownContentType(result.contentType)) {
|
|
141
|
+
return true;
|
|
142
|
+
}
|
|
143
|
+
return isPlainTextContentType(result.contentType) && hasMarkdownFileExtension(result.finalUrl);
|
|
144
|
+
}
|
|
111
145
|
function countWords(value) {
|
|
112
146
|
const trimmed = value.trim();
|
|
113
147
|
if (!trimmed) {
|
|
@@ -165,7 +199,7 @@ function normalizeArgs(urlInput, options) {
|
|
|
165
199
|
};
|
|
166
200
|
}
|
|
167
201
|
function prepareContentFromFetchResult(result, deps) {
|
|
168
|
-
if (
|
|
202
|
+
if (shouldTreatAsMarkdownPassthrough(result)) {
|
|
169
203
|
const markdown = normalizeMarkdown(result.body);
|
|
170
204
|
return {
|
|
171
205
|
markdown,
|