md-fetch 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/AGENTS.md +212 -0
  2. package/LICENSE +21 -0
  3. package/README.md +449 -0
  4. package/README.zh-CN.md +449 -0
  5. package/dist/cli.d.ts +27 -0
  6. package/dist/cli.d.ts.map +1 -0
  7. package/dist/cli.js +158 -0
  8. package/dist/cli.js.map +1 -0
  9. package/dist/constants.d.ts +9 -0
  10. package/dist/constants.d.ts.map +1 -0
  11. package/dist/constants.js +15 -0
  12. package/dist/constants.js.map +1 -0
  13. package/dist/core/browser.d.ts +23 -0
  14. package/dist/core/browser.d.ts.map +1 -0
  15. package/dist/core/browser.js +125 -0
  16. package/dist/core/browser.js.map +1 -0
  17. package/dist/core/converter.d.ts +18 -0
  18. package/dist/core/converter.d.ts.map +1 -0
  19. package/dist/core/converter.js +74 -0
  20. package/dist/core/converter.js.map +1 -0
  21. package/dist/core/extractor.d.ts +28 -0
  22. package/dist/core/extractor.d.ts.map +1 -0
  23. package/dist/core/extractor.js +151 -0
  24. package/dist/core/extractor.js.map +1 -0
  25. package/dist/core/fetcher.d.ts +24 -0
  26. package/dist/core/fetcher.d.ts.map +1 -0
  27. package/dist/core/fetcher.js +111 -0
  28. package/dist/core/fetcher.js.map +1 -0
  29. package/dist/core/processor.d.ts +22 -0
  30. package/dist/core/processor.d.ts.map +1 -0
  31. package/dist/core/processor.js +104 -0
  32. package/dist/core/processor.js.map +1 -0
  33. package/dist/core/screenshotter.d.ts +31 -0
  34. package/dist/core/screenshotter.d.ts.map +1 -0
  35. package/dist/core/screenshotter.js +222 -0
  36. package/dist/core/screenshotter.js.map +1 -0
  37. package/dist/index.d.ts +3 -0
  38. package/dist/index.d.ts.map +1 -0
  39. package/dist/index.js +14 -0
  40. package/dist/index.js.map +1 -0
  41. package/dist/screen-cli.d.ts +26 -0
  42. package/dist/screen-cli.d.ts.map +1 -0
  43. package/dist/screen-cli.js +196 -0
  44. package/dist/screen-cli.js.map +1 -0
  45. package/dist/screen.d.ts +3 -0
  46. package/dist/screen.d.ts.map +1 -0
  47. package/dist/screen.js +14 -0
  48. package/dist/screen.js.map +1 -0
  49. package/dist/types/index.d.ts +151 -0
  50. package/dist/types/index.d.ts.map +1 -0
  51. package/dist/types/index.js +42 -0
  52. package/dist/types/index.js.map +1 -0
  53. package/dist/utils/filename-sanitizer.d.ts +38 -0
  54. package/dist/utils/filename-sanitizer.d.ts.map +1 -0
  55. package/dist/utils/filename-sanitizer.js +79 -0
  56. package/dist/utils/filename-sanitizer.js.map +1 -0
  57. package/dist/utils/frontmatter.d.ts +6 -0
  58. package/dist/utils/frontmatter.d.ts.map +1 -0
  59. package/dist/utils/frontmatter.js +65 -0
  60. package/dist/utils/frontmatter.js.map +1 -0
  61. package/package.json +56 -0
  62. package/skills/md-fetch/SKILL.md +133 -0
  63. package/skills/md-fetch/references/cli-reference.md +257 -0
  64. package/src/cli.ts +169 -0
  65. package/src/constants.ts +17 -0
  66. package/src/core/browser.ts +161 -0
  67. package/src/core/converter.ts +82 -0
  68. package/src/core/extractor.ts +172 -0
  69. package/src/core/fetcher.ts +143 -0
  70. package/src/core/processor.ts +124 -0
  71. package/src/core/screenshotter.ts +289 -0
  72. package/src/index.ts +15 -0
  73. package/src/screen-cli.ts +216 -0
  74. package/src/screen.ts +15 -0
  75. package/src/types/index.ts +227 -0
  76. package/src/utils/filename-sanitizer.ts +88 -0
  77. package/src/utils/frontmatter.ts +81 -0
  78. package/tsconfig.json +20 -0
@@ -0,0 +1,449 @@
1
+ # md-fetch
2
+
3
+ [English](./README.md)
4
+
5
+ 一套网页内容处理的命令行工具:
6
+ - **md-fetch** - 将网页转换为干净的 Markdown 格式
7
+ - **md-fetch-screen** - 对网页进行高质量截图
8
+
9
+ ## 作者
10
+
11
+ 由 **Claude Code** 和 **Claude Sonnet** 开发
12
+
13
+ ## 目录
14
+
15
+ - [md-fetch - Markdown 转换器](#md-fetch---markdown-转换器)
16
+ - [特性](#特性)
17
+ - [安装](#安装)
18
+ - [使用](#使用)
19
+ - [CLI 选项](#cli-选项)
20
+ - [md-fetch-screen - 截图工具](#md-fetch-screen---截图工具)
21
+ - [截图功能](#截图功能)
22
+ - [截图使用方法](#截图使用方法)
23
+ - [截图 CLI 选项](#截图-cli-选项)
24
+ - [技术栈](#技术栈)
25
+ - [开发](#开发)
26
+
27
+ ---
28
+
29
+ # md-fetch - Markdown 转换器
30
+
31
+ ## 特性
32
+
33
+ - 🚀 使用原生 fetch API 获取网页内容
34
+ - 🌐 支持无头浏览器模式(Puppeteer)处理 SPA 页面
35
+ - 📄 使用 Mozilla Readability 提取主要内容
36
+ - ✨ 使用 Turndown 将 HTML 转换为 Markdown
37
+ - 📋 **自动生成 YAML frontmatter**(包含标题、URL、作者、发布时间等元数据)
38
+ - 🎯 支持自定义 CSS 选择器提取内容
39
+ - 🔒 代理支持(HTTP_PROXY/HTTPS_PROXY 环境变量)
40
+ - ⚙️ 可配置的超时、headers 等选项
41
+ - 🔄 自动重试(3 次,指数退避)
42
+ - 📦 最小化依赖
43
+
44
+ ## 安装
45
+
46
+ ### 开发环境安装
47
+
48
+ ```bash
49
+ # 克隆项目(如果还没有)
50
+ git clone <repo-url>
51
+ cd md-fetch
52
+
53
+ # 安装依赖
54
+ pnpm install
55
+ ```
56
+
57
+ ### 全局安装
58
+
59
+ **使用 pnpm:**
60
+
61
+ ```bash
62
+ # 1. 构建项目
63
+ pnpm build
64
+
65
+ # 2. 配置 pnpm(首次使用需要)
66
+ pnpm setup
67
+
68
+ # 3. 全局链接(开发时推荐)
69
+ pnpm link --global
70
+
71
+ # 4. 现在可以在任何位置使用 md-fetch 命令
72
+ md-fetch https://example.com
73
+ ```
74
+
75
+ **使用 npm:**
76
+
77
+ ```bash
78
+ # 1. 构建项目
79
+ pnpm build
80
+
81
+ # 2. 全局链接
82
+ npm link
83
+
84
+ # 3. 现在可以在任何位置使用 md-fetch 命令
85
+ md-fetch https://example.com
86
+ ```
87
+
88
+ ### 修改代码后重新安装
89
+
90
+ ```bash
91
+ # 1. 重新构建
92
+ pnpm build
93
+
94
+ # 2. 无需重新 link,构建后会自动生效
95
+ md-fetch https://example.com
96
+ ```
97
+
98
+ ### 卸载
99
+
100
+ **使用 pnpm:**
101
+
102
+ ```bash
103
+ # 取消全局链接
104
+ pnpm unlink --global
105
+
106
+ # 可选:清理 pnpm 全局 store 中未被引用的包(释放磁盘空间)
107
+ pnpm store prune
108
+ ```
109
+
110
+ **使用 npm:**
111
+
112
+ ```bash
113
+ # 取消全局链接
114
+ npm unlink -g md-fetch
115
+ ```
116
+
117
+ **删除项目:**
118
+
119
+ ```bash
120
+ # 如果想完全删除项目,直接删除项目目录即可
121
+ cd ..
122
+ rm -rf md-fetch # 或在 Windows 上使用 rmdir /s md-fetch
123
+ ```
124
+
125
+ ## 使用
126
+
127
+ ### 开发模式
128
+
129
+ ```bash
130
+ # 基本使用 - 输出到 stdout
131
+ pnpm dev -- https://example.com
132
+
133
+ # 保存到文件
134
+ pnpm dev -- https://example.com -o output.md
135
+
136
+ # 浏览器模式(用于 SPA 页面)
137
+ pnpm dev -- -b https://react-app.example.com
138
+
139
+ # 禁用 readability,保留完整内容
140
+ pnpm dev -- https://example.com -R
141
+ # 或使用完整选项名
142
+ pnpm dev -- https://example.com --no-readability
143
+
144
+ # 使用自定义选择器
145
+ pnpm dev -- https://example.com -s "article.main-content"
146
+
147
+ # 处理多个 URL
148
+ pnpm dev -- https://example.com https://httpbin.org/html
149
+
150
+ # 自定义 HTTP headers
151
+ pnpm dev -- https://example.com -H "Authorization: Bearer token"
152
+
153
+ # 使用代理
154
+ pnpm dev -- https://example.com --proxy http://proxy.example.com:8080
155
+
156
+ # 详细日志
157
+ pnpm dev -- https://example.com --verbose
158
+
159
+ # 查看所有选项
160
+ pnpm dev -- --help
161
+ ```
162
+
163
+ ### 生产使用(全局安装后)
164
+
165
+ ```bash
166
+ # 基本使用
167
+ md-fetch https://example.com
168
+
169
+ # 保存到文件
170
+ md-fetch https://example.com -o article.md
171
+
172
+ # 浏览器模式
173
+ md-fetch -b https://react-app.example.com
174
+
175
+ # 使用代理(从环境变量)
176
+ export HTTPS_PROXY=http://proxy.example.com:8080
177
+ md-fetch https://example.com
178
+ ```
179
+
180
+ ## 输出示例
181
+
182
+ md-fetch 会自动在 Markdown 文件开头添加 YAML frontmatter,包含页面的元数据:
183
+
184
+ ```markdown
185
+ ---
186
+ title: "Example Domain"
187
+ url: https://example.com
188
+ description: "Example Domain description"
189
+ author: "John Doe"
190
+ siteName: "Example"
191
+ publishedTime: 2024-01-01T00:00:00Z
192
+ modifiedTime: 2024-01-15T10:30:00Z
193
+ keywords:
194
+ - example
195
+ - demo
196
+ - test
197
+ image: https://example.com/og-image.jpg
198
+ lang: en
199
+ ---
200
+
201
+ # Example Domain
202
+
203
+ This domain is for use in illustrative examples...
204
+ ```
205
+
206
+ ### Frontmatter 字段说明
207
+
208
+ - `title` - 页面标题(优先从 Readability、Open Graph、Twitter Cards 或 `<title>` 标签提取)
209
+ - `url` - 原始 URL
210
+ - `description` - 页面描述或摘要
211
+ - `author` - 作者信息
212
+ - `siteName` - 站点名称
213
+ - `publishedTime` - 发布时间(ISO 8601 格式)
214
+ - `modifiedTime` - 最后修改时间(ISO 8601 格式)
215
+ - `keywords` - 关键词数组
216
+ - `image` - 页面主图片(Open Graph 或 Twitter Cards)
217
+ - `lang` - 页面语言代码
218
+
219
+ ## CLI 选项
220
+
221
+ ```
222
+ Usage: md-fetch <urls...> [options]
223
+
224
+ Arguments:
225
+ urls URLs to convert to Markdown
226
+
227
+ Options:
228
+ -V, --version output the version number
229
+ -o, --output <file> Output to file instead of stdout
230
+ -b, --browser Use headless browser mode (for SPA pages)
231
+ --browser-path <path> Custom Chrome/Chromium executable path
232
+ -R, --no-readability Disable readability, keep full HTML content
233
+ -s, --selector <selector> Custom CSS selector to extract content
234
+ -H, --header <header> Custom HTTP header (can be repeated)
235
+ --proxy <url> Proxy server URL (also reads HTTP_PROXY/HTTPS_PROXY env vars)
236
+ -t, --timeout <ms> Request timeout in milliseconds (default: 30000)
237
+ --user-agent <string> Custom user agent (default: "md-fetch/1.0.0")
238
+ --wait-until <event> Browser wait condition (load|domcontentloaded|networkidle0|networkidle2)
239
+ --verbose Enable verbose logging
240
+ -h, --help display help for command
241
+ ```
242
+
243
+ ## 技术栈
244
+
245
+ - **TypeScript** - 类型安全
246
+ - **Node.js ≥18** - 使用原生 fetch API
247
+ - **ES 模块** - 现代 JavaScript
248
+ - **Commander** - CLI 参数解析
249
+ - **Mozilla Readability** - 智能内容提取
250
+ - **Turndown** - HTML 转 Markdown
251
+ - **JSDOM** - DOM 解析
252
+ - **Puppeteer-core** - 无头浏览器支持
253
+ - **Undici** - 代理支持
254
+
255
+ ## 开发
256
+
257
+ ```bash
258
+ # 安装依赖
259
+ pnpm install
260
+
261
+ # 开发模式
262
+ pnpm dev -- <url>
263
+
264
+ # 构建
265
+ pnpm build
266
+
267
+ # 运行测试
268
+ pnpm test
269
+ ```
270
+
271
+ ## 工作原理
272
+
273
+ 1. **Fetch** - 使用原生 fetch 或 Puppeteer 无头浏览器获取 HTML 内容
274
+ 2. **Extract** - 使用 Readability 或自定义选择器提取主要内容,同时提取页面元数据
275
+ 3. **Convert** - 使用 Turndown 转换为 Markdown
276
+ 4. **Generate Frontmatter** - 从提取的元数据生成 YAML frontmatter
277
+ 5. **Output** - 将 frontmatter 和 Markdown 内容输出到 stdout 或保存到文件
278
+
279
+ ## 代理支持
280
+
281
+ md-fetch 自动从环境变量读取代理配置:
282
+
283
+ ```bash
284
+ # 设置代理
285
+ export HTTP_PROXY=http://proxy.example.com:8080
286
+ export HTTPS_PROXY=http://proxy.example.com:8080
287
+
288
+ # 排除某些域名
289
+ export NO_PROXY=localhost,127.0.0.1,.example.com
290
+
291
+ # 或通过命令行参数
292
+ md-fetch https://example.com --proxy http://proxy.example.com:8080
293
+ ```
294
+
295
+ ---
296
+
297
+ # md-fetch-screen - 截图工具
298
+
299
+ ## 截图功能
300
+
301
+ - 📸 对网页进行高质量截图
302
+ - 🖥️ 全页截图或仅视口截图模式
303
+ - 📐 可自定义视口尺寸(宽度/高度)
304
+ - ✨ 支持设备像素比例,可生成高清截图(Retina 显示屏)
305
+ - 🎨 多种图片格式(PNG、JPEG、WebP)
306
+ - 🎯 使用 CSS 选择器截取特定元素
307
+ - 🙈 隐藏不需要的元素(广告、弹窗等)
308
+ - ⏱️ 可配置截图前延迟
309
+ - 🔒 代理支持
310
+ - 🌐 使用 Puppeteer 的无头浏览器模式
311
+ - 📁 从 URL 和时间戳自动生成文件名
312
+ - 🔄 批量截图多个 URL
313
+
314
+ ## 截图使用方法
315
+
316
+ ### 基本用法
317
+
318
+ ```bash
319
+ # 基本截图(全页,标准分辨率)
320
+ md-fetch-screen https://example.com
321
+
322
+ # 仅视口截图,自定义尺寸
323
+ md-fetch-screen https://example.com --viewport -W 1440 -H 900
324
+
325
+ # 高清截图(2倍像素比例,适合 Retina 显示屏)
326
+ md-fetch-screen https://example.com --scale 2
327
+
328
+ # 带详细日志的截图
329
+ md-fetch-screen https://example.com --verbose
330
+ ```
331
+
332
+ ### 高级用法
333
+
334
+ ```bash
335
+ # 截取特定元素
336
+ md-fetch-screen https://example.com --selector "#main-content"
337
+
338
+ # 隐藏广告和弹窗
339
+ md-fetch-screen https://example.com --hide ".ad,.popup,.cookie-banner"
340
+
341
+ # JPEG 格式,自定义质量
342
+ md-fetch-screen https://example.com --format jpeg --quality 85
343
+
344
+ # 保存到指定目录
345
+ md-fetch-screen https://example.com --output ./screenshots
346
+
347
+ # 等待页面加载完成后延迟 2 秒再截图
348
+ md-fetch-screen https://example.com --wait-until networkidle0 --delay 2000
349
+
350
+ # 批量截图多个 URL
351
+ md-fetch-screen https://site1.com https://site2.com https://site3.com
352
+ ```
353
+
354
+ ### 理解宽度、高度和像素比例参数
355
+
356
+ **全页模式(默认):**
357
+ - 宽度/高度控制浏览器视口大小
358
+ - 截图会捕获整个页面内容
359
+ - 最终图片尺寸取决于页面的实际高度
360
+
361
+ ```bash
362
+ # 全页截图,视口宽度 1920px
363
+ md-fetch-screen https://example.com -W 1920 -H 1080
364
+ ```
365
+
366
+ **视口模式:**
367
+ - 宽度/高度直接控制截图尺寸
368
+ - 只捕获视口内可见的内容
369
+
370
+ ```bash
371
+ # 精确 1440x900 的截图
372
+ md-fetch-screen https://example.com --viewport -W 1440 -H 900
373
+ ```
374
+
375
+ **像素比例(设备像素比):**
376
+ - `--scale 1`(默认):标准分辨率
377
+ - 视口 1920x1080 → 图片 1920x1080 像素
378
+ - `--scale 2`:高清(Retina)
379
+ - 视口 1920x1080 → 图片 3840x2160 像素
380
+ - `--scale 3`:超高清
381
+ - 视口 1920x1080 → 图片 5760x3240 像素
382
+
383
+ ```bash
384
+ # 高质量 Retina 截图
385
+ md-fetch-screen https://example.com --scale 2
386
+
387
+ # 视口模式 + 2倍像素比例 = 2880x1800 最终图片
388
+ md-fetch-screen https://example.com --viewport -W 1440 -H 900 --scale 2
389
+ ```
390
+
391
+ ## 截图 CLI 选项
392
+
393
+ ```
394
+ 用法: md-fetch-screen [options] <urls...>
395
+
396
+ 参数:
397
+ urls 要截图的 URL
398
+
399
+ 选项:
400
+ -V, --version 输出版本号
401
+
402
+ 视口和尺寸:
403
+ -f, --full-page 全页截图(默认)
404
+ --viewport 仅视口截图
405
+ -W, --width <pixels> 视口宽度(像素)(默认:1920)
406
+ -H, --height <pixels> 视口高度(像素)(默认:1080)
407
+ --scale <number> 设备像素比例,用于高清截图(1/2/3,默认:1)
408
+
409
+ 输出:
410
+ --output <dir> 输出目录(默认:".")
411
+ --format <type> 图片格式:png|jpeg|webp(默认:"png")
412
+ --quality <number> JPEG/WebP 质量 0-100(默认:90)
413
+
414
+ 浏览器:
415
+ --browser-path <path> 自定义 Chrome/Chromium 可执行文件路径
416
+ --wait-until <event> 等待条件:load|domcontentloaded|networkidle0|networkidle2
417
+ --timeout <ms> 超时时间(毫秒)(默认:30000)
418
+ --user-agent <string> 自定义 user agent
419
+ --proxy <url> 代理服务器 URL
420
+
421
+ 内容:
422
+ --delay <ms> 截图前延迟时间(毫秒)(默认:0)
423
+ --selector <css> CSS 选择器,用于截取特定元素
424
+ --hide <selectors> 要隐藏的 CSS 选择器(逗号分隔)
425
+
426
+ 其他:
427
+ --verbose 启用详细日志
428
+ -h, --help 显示帮助信息
429
+ ```
430
+
431
+ ### 文件名格式
432
+
433
+ 截图会自动使用以下格式命名:
434
+ ```
435
+ <域名_路径_前50字符>_<时间戳>.<扩展名>
436
+ ```
437
+
438
+ 示例:
439
+ - `example.com_20251229153045.png`
440
+ - `github.com_user_repo_issues_123_20251229153045.png`
441
+
442
+ 文件名包含:
443
+ - 域名和路径(最多 50 个字符,已进行文件系统安全化处理)
444
+ - 时间戳格式:`YYYYMMDDHHmmss`
445
+ - 基于格式的文件扩展名
446
+
447
+ ## 许可
448
+
449
+ MIT
package/dist/cli.d.ts ADDED
@@ -0,0 +1,27 @@
1
+ export declare class CLI {
2
+ private program; // commander `Command` instance that holds the md-fetch option definitions
3
+ private processor; // `ContentProcessor` used to turn URLs into Markdown
4
+ constructor();
5
+ private setupCommands; // registers name, description, version, arguments, options and the action handler
6
+ /**
7
+ * Accumulator for the repeatable -H/--header option: returns the previous values with the new one appended
8
+ */
9
+ private collectHeaders;
10
+ /**
11
+ * Parse an array of "Key: Value" strings into a header map, splitting on the first colon; entries without a colon are skipped with a warning
12
+ */
13
+ private parseHeaders;
14
+ /**
15
+ * Main handler for the fetch command: builds the processing options, converts one URL or a batch, then prints the Markdown or writes it to --output
16
+ */
17
+ private handleFetch;
18
+ /**
19
+ * Write content to a file as UTF-8, creating the parent directory first if it does not exist
20
+ */
21
+ private writeToFile;
22
+ /**
23
+ * Run the CLI: parses `argv` asynchronously and resolves once the command action has finished
24
+ */
25
+ run(argv: string[]): Promise<void>;
26
+ }
27
+ //# sourceMappingURL=cli.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":"AAOA,qBAAa,GAAG;IACd,OAAO,CAAC,OAAO,CAAU;IACzB,OAAO,CAAC,SAAS,CAAmB;;IAQpC,OAAO,CAAC,aAAa;IAyBrB;;OAEG;IACH,OAAO,CAAC,cAAc;IAItB;;OAEG;IACH,OAAO,CAAC,YAAY;IAkBpB;;OAEG;YACW,WAAW;IAyEzB;;OAEG;YACW,WAAW;IAazB;;OAEG;IACG,GAAG,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;CAGzC"}
package/dist/cli.js ADDED
@@ -0,0 +1,158 @@
1
+ import { Command } from 'commander';
2
+ import { ContentProcessor } from './core/processor.js';
3
+ import { DEFAULT_TIMEOUT, DEFAULT_USER_AGENT } from './constants.js';
4
+ import { writeFile, mkdir } from 'fs/promises';
5
+ import { dirname } from 'path';
6
/**
 * Option parser for integer-valued flags (`--timeout`, `--concurrent`).
 *
 * Commander calls option parsers as `(value, previous)`. Handing it the global
 * `parseInt` therefore used the previous/default value as the radix
 * (e.g. `parseInt('5000', 30000)`), which evaluates to NaN and silently
 * discarded the user's value. This helper always parses in base 10.
 *
 * @param {string} value - Raw command-line value.
 * @param {number} previous - Previously stored value (the option default).
 * @returns {number} The parsed integer, or `previous` when `value` is not a number.
 */
function parseIntegerOption(value, previous) {
  const parsed = Number.parseInt(value, 10);
  if (Number.isNaN(parsed)) {
    console.warn(`Warning: Invalid number "${value}", using ${previous}`);
    return previous;
  }
  return parsed;
}

/**
 * Command-line front end for md-fetch.
 *
 * Wires the commander option definitions to a `ContentProcessor` and handles
 * output (stdout or file) and error reporting.
 */
export class CLI {
  program;
  processor;

  constructor() {
    this.program = new Command();
    this.processor = new ContentProcessor();
    this.setupCommands();
  }

  /**
   * Register the program metadata, arguments, options and action handler.
   */
  setupCommands() {
    this.program
      .name('md-fetch')
      .description('Convert web pages to clean Markdown using fetch, readability, and turndown')
      .version('1.0.0')
      .argument('<urls...>', 'URLs to convert to Markdown')
      .option('-o, --output <file>', 'Output to file instead of stdout')
      .option('-b, --browser', 'Use headless browser mode (for SPA pages)')
      .option('--browser-path <path>', 'Custom Chrome/Chromium executable path')
      .option('-R, --no-readability', 'Disable readability, keep full HTML content')
      .option('-s, --selector <selector>', 'Custom CSS selector to extract content')
      .option('-f, --file <path>', 'Read URLs from file (one per line) - not yet implemented')
      .option('-H, --header <header>', 'Custom HTTP header (can be repeated)', this.collectHeaders, [])
      .option('--proxy <url>', 'Proxy server URL (also reads HTTP_PROXY/HTTPS_PROXY env vars)')
      // Never pass the global parseInt here: commander would supply the default as the radix.
      .option('-t, --timeout <ms>', 'Request timeout in milliseconds', parseIntegerOption, DEFAULT_TIMEOUT)
      .option('--config <path>', 'Custom config file path - not yet implemented')
      .option('--user-agent <string>', 'Custom user agent', DEFAULT_USER_AGENT)
      .option('--wait-until <event>', 'Browser wait condition (load|domcontentloaded|networkidle0|networkidle2)')
      .option('--concurrent <num>', 'Concurrent requests for batch mode', parseIntegerOption, 3)
      .option('--verbose', 'Enable verbose logging', false)
      .action(async (urls, options) => {
        await this.handleFetch(urls, options);
      });
  }

  /**
   * Accumulator for the repeatable `-H/--header` option.
   *
   * @param {string} value - Header string supplied on the command line.
   * @param {string[]} previous - Header strings collected so far.
   * @returns {string[]} A new array with `value` appended.
   */
  collectHeaders(value, previous) {
    return previous.concat([value]);
  }

  /**
   * Parse header strings of the form "Key: Value" into a header map.
   *
   * The string is split on the first colon only, so values may themselves
   * contain colons. Entries without a colon or with an empty name are
   * skipped with a warning.
   *
   * @param {string[]} headerStrings - Raw header strings.
   * @returns {Record<string, string>} Map of header name to header value.
   */
  parseHeaders(headerStrings) {
    const headers = {};
    for (const header of headerStrings) {
      const colonIndex = header.indexOf(':');
      const key = colonIndex === -1 ? '' : header.slice(0, colonIndex).trim();
      if (key === '') {
        console.warn(`Warning: Invalid header format "${header}", expected "Key: Value"`);
        continue;
      }
      headers[key] = header.slice(colonIndex + 1).trim();
    }
    return headers;
  }

  /**
   * Main handler for the fetch command.
   *
   * Converts a single URL, or a batch of URLs joined into one document, and
   * prints the Markdown or writes it to `options.output`. On failure the error
   * message is printed and the process exits with code 1, but only after the
   * processor has been cleaned up.
   *
   * @param {string[]} urls - URLs given on the command line.
   * @param {object} options - Parsed commander options.
   * @returns {Promise<void>}
   */
  async handleFetch(urls, options) {
    let failed = false;
    try {
      // `||` is kept for timeout on purpose: a value of 0 falls back to the default.
      const processOptions = {
        useBrowser: options.browser || false,
        useReadability: options.readability !== false, // default true
        selector: options.selector,
        fetchOptions: {
          timeout: options.timeout || DEFAULT_TIMEOUT,
          userAgent: options.userAgent || DEFAULT_USER_AGENT,
          headers: this.parseHeaders(options.header || []),
          proxy: options.proxy
        },
        browserOptions: options.browser ? {
          executablePath: options.browserPath,
          waitUntil: options.waitUntil,
          timeout: options.timeout || DEFAULT_TIMEOUT,
          userAgent: options.userAgent || DEFAULT_USER_AGENT,
          proxy: options.proxy
        } : undefined,
        verbose: options.verbose || false
      };

      let markdown;
      if (urls.length === 1) {
        // Single URL
        markdown = await this.processor.process(urls[0], processOptions);
      } else {
        // Multiple URLs (batch mode)
        const results = await this.processor.processBatch(urls, processOptions);

        // Combine results; failures are kept inline as HTML comments.
        markdown = results
          .map((result, index) => {
            if (result.success) {
              const separator = index === 0 ? '' : '\n\n---\n\n';
              return `${separator}<!-- Source: ${result.url} -->\n\n${result.markdown}`;
            }
            return `<!-- Error processing ${result.url}: ${result.error?.message} -->`;
          })
          .join('\n');

        // Report errors on stderr so they do not pollute the Markdown output.
        const errors = results.filter((r) => !r.success);
        if (errors.length > 0) {
          console.error(`\nWarning: ${errors.length} of ${results.length} URLs failed to process:`);
          for (const e of errors) {
            console.error(`  - ${e.url}: ${e.error?.message}`);
          }
        }
      }

      // Output
      if (options.output) {
        await this.writeToFile(options.output, markdown);
        if (options.verbose) {
          console.log(`\nOutput written to: ${options.output}`);
        }
      } else {
        console.log(markdown);
      }
    } catch (error) {
      console.error('Error:', error.message);
      // Do not call process.exit() here: it would end the process before the
      // finally block runs and skip the cleanup below.
      failed = true;
    } finally {
      // Cleanup resources (close browser if opened)
      await this.processor.cleanup();
    }
    if (failed) {
      process.exit(1);
    }
  }

  /**
   * Write content to a file as UTF-8, creating the parent directory if needed.
   *
   * @param {string} filepath - Destination path.
   * @param {string} content - Text to write.
   * @returns {Promise<void>}
   * @throws {Error} When the directory cannot be created or the file cannot be
   *   written; the underlying error is attached as `cause`.
   */
  async writeToFile(filepath, content) {
    try {
      // Ensure directory exists
      const dir = dirname(filepath);
      await mkdir(dir, { recursive: true });
      // Write file
      await writeFile(filepath, content, 'utf-8');
    } catch (error) {
      throw new Error(`Failed to write to file ${filepath}: ${error.message}`, { cause: error });
    }
  }

  /**
   * Run the CLI.
   *
   * @param {string[]} argv - Argument vector, passed to commander's `parseAsync`.
   * @returns {Promise<void>}
   */
  async run(argv) {
    await this.program.parseAsync(argv);
  }
}
158
+ //# sourceMappingURL=cli.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAEvD,OAAO,EAAE,eAAe,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAC;AACrE,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,aAAa,CAAC;AAC/C,OAAO,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AAE/B,MAAM,OAAO,GAAG;IACN,OAAO,CAAU;IACjB,SAAS,CAAmB;IAEpC;QACE,IAAI,CAAC,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;QAC7B,IAAI,CAAC,SAAS,GAAG,IAAI,gBAAgB,EAAE,CAAC;QACxC,IAAI,CAAC,aAAa,EAAE,CAAC;IACvB,CAAC;IAEO,aAAa;QACnB,IAAI,CAAC,OAAO;aACT,IAAI,CAAC,UAAU,CAAC;aAChB,WAAW,CAAC,4EAA4E,CAAC;aACzF,OAAO,CAAC,OAAO,CAAC;aAChB,QAAQ,CAAC,WAAW,EAAE,6BAA6B,CAAC;aACpD,MAAM,CAAC,qBAAqB,EAAE,kCAAkC,CAAC;aACjE,MAAM,CAAC,eAAe,EAAE,2CAA2C,CAAC;aACpE,MAAM,CAAC,uBAAuB,EAAE,wCAAwC,CAAC;aACzE,MAAM,CAAC,sBAAsB,EAAE,6CAA6C,CAAC;aAC7E,MAAM,CAAC,2BAA2B,EAAE,wCAAwC,CAAC;aAC7E,MAAM,CAAC,mBAAmB,EAAE,0DAA0D,CAAC;aACvF,MAAM,CAAC,uBAAuB,EAAE,sCAAsC,EAAE,IAAI,CAAC,cAAc,EAAE,EAAE,CAAC;aAChG,MAAM,CAAC,eAAe,EAAE,+DAA+D,CAAC;aACxF,MAAM,CAAC,oBAAoB,EAAE,iCAAiC,EAAE,QAAQ,EAAE,eAAe,CAAC;aAC1F,MAAM,CAAC,iBAAiB,EAAE,+CAA+C,CAAC;aAC1E,MAAM,CAAC,uBAAuB,EAAE,mBAAmB,EAAE,kBAAkB,CAAC;aACxE,MAAM,CAAC,sBAAsB,EAAE,0EAA0E,CAAC;aAC1G,MAAM,CAAC,oBAAoB,EAAE,oCAAoC,EAAE,QAAQ,EAAE,CAAC,CAAC;aAC/E,MAAM,CAAC,WAAW,EAAE,wBAAwB,EAAE,KAAK,CAAC;aACpD,MAAM,CAAC,KAAK,EAAE,IAAc,EAAE,OAAmB,EAAE,EAAE;YACpD,MAAM,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QACxC,CAAC,CAAC,CAAC;IACP,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,KAAa,EAAE,QAAkB;QACtD,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC;IAClC,CAAC;IAED;;OAEG;IACK,YAAY,CAAC,aAAuB;QAC1C,MAAM,OAAO,GAA2B,EAAE,CAAC;QAE3C,KAAK,MAAM,MAAM,IAAI,aAAa,EAAE,CAAC;YACnC,MAAM,UAAU,GAAG,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YACvC,IAAI,UAAU,KAAK,CAAC,CAAC,EAAE,CAAC;gBACtB,OAAO,CAAC,IAAI,CAAC,mCAAmC,MAAM,0BAA0B,CAAC,CAAC;gBAClF,SAAS;YACX,CAAC;YAED,MAAM,GAAG,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC,IAAI,EAAE,CAAC;YAC/C,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,CAAC,CAAC,IAAI,EAA
E,CAAC;YAClD,OAAO,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;QACvB,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,WAAW,CAAC,IAAc,EAAE,OAAmB;QAC3D,IAAI,CAAC;YACH,wBAAwB;YACxB,MAAM,cAAc,GAAmB;gBACrC,UAAU,EAAE,OAAO,CAAC,OAAO,IAAI,KAAK;gBACpC,cAAc,EAAE,OAAO,CAAC,WAAW,KAAK,KAAK,EAAE,eAAe;gBAC9D,QAAQ,EAAE,OAAO,CAAC,QAAQ;gBAC1B,YAAY,EAAE;oBACZ,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,eAAe;oBAC3C,SAAS,EAAE,OAAO,CAAC,SAAS,IAAI,kBAAkB;oBAClD,OAAO,EAAE,IAAI,CAAC,YAAY,CAAC,OAAO,CAAC,MAAM,IAAI,EAAE,CAAC;oBAChD,KAAK,EAAE,OAAO,CAAC,KAAK;iBACrB;gBACD,cAAc,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC;oBAChC,cAAc,EAAE,OAAO,CAAC,WAAW;oBACnC,SAAS,EAAE,OAAO,CAAC,SAAgB;oBACnC,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,eAAe;oBAC3C,SAAS,EAAE,OAAO,CAAC,SAAS,IAAI,kBAAkB;oBAClD,KAAK,EAAE,OAAO,CAAC,KAAK;iBACrB,CAAC,CAAC,CAAC,SAAS;gBACb,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,KAAK;aAClC,CAAC;YAEF,eAAe;YACf,IAAI,QAAgB,CAAC;YAErB,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACtB,aAAa;gBACb,QAAQ,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,cAAc,CAAC,CAAC;YACnE,CAAC;iBAAM,CAAC;gBACN,6BAA6B;gBAC7B,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,YAAY,CAAC,IAAI,EAAE,cAAc,CAAC,CAAC;gBAExE,kBAAkB;gBAClB,QAAQ,GAAG,OAAO;qBACf,GAAG,CAAC,CAAC,MAAM,EAAE,KAAK,EAAE,EAAE;oBACrB,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;wBACnB,MAAM,SAAS,GAAG,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,aAAa,CAAC;wBACnD,OAAO,GAAG,SAAS,gBAAgB,MAAM,CAAC,GAAG,WAAW,MAAM,CAAC,QAAQ,EAAE,CAAC;oBAC5E,CAAC;yBAAM,CAAC;wBACN,OAAO,yBAAyB,MAAM,CAAC,GAAG,KAAK,MAAM,CAAC,KAAK,EAAE,OAAO,MAAM,CAAC;oBAC7E,CAAC;gBACH,CAAC,CAAC;qBACD,IAAI,CAAC,IAAI,CAAC,CAAC;gBAEd,gBAAgB;gBAChB,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;gBAC/C,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACtB,OAAO,CAAC,KAAK,CAAC,cAAc,MAAM,CAAC,MAAM,OAAO,OAAO,CAAC,MAAM,0BAA0B,CAAC,CAAC;oBAC1F,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE;wBACjB,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,GAAG,KAAK,CAAC,CAAC,KAAK,EAAE,OAAO,EAAE,CAAC,CAAC;oBACrD,CAAC,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;YAED,SAAS;YACT,IAAI,OAAO,C
AAC,MAAM,EAAE,CAAC;gBACnB,MAAM,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC;gBACjD,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;oBACpB,OAAO,CAAC,GAAG,CAAC,wBAAwB,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;gBACxD,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,QAAQ,EAAG,KAAe,CAAC,OAAO,CAAC,CAAC;YAClD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;gBAAS,CAAC;YACT,8CAA8C;YAC9C,MAAM,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,CAAC;QACjC,CAAC;IACH,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,WAAW,CAAC,QAAgB,EAAE,OAAe;QACzD,IAAI,CAAC;YACH,0BAA0B;YAC1B,MAAM,GAAG,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;YAC9B,MAAM,KAAK,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YAEtC,aAAa;YACb,MAAM,SAAS,CAAC,QAAQ,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;QAC9C,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,KAAK,CAAC,2BAA2B,QAAQ,KAAM,KAAe,CAAC,OAAO,EAAE,CAAC,CAAC;QACtF,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,GAAG,CAAC,IAAc;QACtB,MAAM,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IACtC,CAAC;CACF"}
@@ -0,0 +1,9 @@
1
+ export declare const DEFAULT_TIMEOUT = 30000; // milliseconds; default of the CLI's -t/--timeout option
2
+ export declare const DEFAULT_CONCURRENT = 3; // presumably the default batch concurrency (the CLI's --concurrent default is also 3) - TODO confirm usage
3
+ export declare const DEFAULT_USER_AGENT = "md-fetch/1.0.0"; // default of the CLI's --user-agent option
4
+ export declare const DEFAULT_WAIT_UNTIL: "networkidle2"; // one of WAIT_UNTIL_OPTIONS; presumably the default browser wait condition - TODO confirm
5
+ export declare const WAIT_UNTIL_OPTIONS: readonly ["load", "domcontentloaded", "networkidle0", "networkidle2"]; // same values the --wait-until help text lists
6
+ export declare const CONFIG_FILE_NAMES: readonly [".md-fetchrc", ".md-fetchrc.json", ".md-fetchrc.yaml", ".md-fetchrc.yml", "md-fetch.config.js"]; // NOTE(review): looks like config file names to search for; --config is marked "not yet implemented" in the CLI
7
+ export declare const RETRY_ATTEMPTS = 3; // presumably the number of attempts for the automatic retry - TODO confirm in fetcher
8
+ export declare const RETRY_DELAY = 1000; // milliseconds
9
+ //# sourceMappingURL=constants.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"constants.d.ts","sourceRoot":"","sources":["../src/constants.ts"],"names":[],"mappings":"AAAA,eAAO,MAAM,eAAe,QAAQ,CAAC;AACrC,eAAO,MAAM,kBAAkB,IAAI,CAAC;AACpC,eAAO,MAAM,kBAAkB,mBAAmB,CAAC;AACnD,eAAO,MAAM,kBAAkB,EAAG,cAAuB,CAAC;AAE1D,eAAO,MAAM,kBAAkB,uEAAwE,CAAC;AAExG,eAAO,MAAM,iBAAiB,2GAMpB,CAAC;AAEX,eAAO,MAAM,cAAc,IAAI,CAAC;AAChC,eAAO,MAAM,WAAW,OAAO,CAAC"}
@@ -0,0 +1,15 @@
1
/** Default request timeout in milliseconds (default of the CLI's --timeout option). */
export const DEFAULT_TIMEOUT = 30000;

// NOTE(review): presumably the default number of concurrent requests in batch mode - confirm against the processor.
export const DEFAULT_CONCURRENT = 3;

/** Default User-Agent string (default of the CLI's --user-agent option). */
export const DEFAULT_USER_AGENT = 'md-fetch/1.0.0';

// NOTE(review): presumably the wait condition used in browser mode when none is given - confirm against the browser module.
export const DEFAULT_WAIT_UNTIL = 'networkidle2';

/**
 * Wait conditions listed by the --wait-until option.
 * Frozen because the typings declare this as a readonly tuple; the shared
 * array must not be mutated at runtime either.
 */
export const WAIT_UNTIL_OPTIONS = Object.freeze(['load', 'domcontentloaded', 'networkidle0', 'networkidle2']);

/**
 * Configuration file names.
 * NOTE(review): config loading is marked "not yet implemented" in the CLI - confirm how this list is used.
 * Frozen for the same reason as WAIT_UNTIL_OPTIONS.
 */
export const CONFIG_FILE_NAMES = Object.freeze([
  '.md-fetchrc',
  '.md-fetchrc.json',
  '.md-fetchrc.yaml',
  '.md-fetchrc.yml',
  'md-fetch.config.js'
]);

// NOTE(review): presumably the number of attempts for the automatic retry - confirm against the fetcher.
export const RETRY_ATTEMPTS = 3;

export const RETRY_DELAY = 1000; // ms
15
+ //# sourceMappingURL=constants.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"constants.js","sourceRoot":"","sources":["../src/constants.ts"],"names":[],"mappings":"AAAA,MAAM,CAAC,MAAM,eAAe,GAAG,KAAK,CAAC;AACrC,MAAM,CAAC,MAAM,kBAAkB,GAAG,CAAC,CAAC;AACpC,MAAM,CAAC,MAAM,kBAAkB,GAAG,gBAAgB,CAAC;AACnD,MAAM,CAAC,MAAM,kBAAkB,GAAG,cAAuB,CAAC;AAE1D,MAAM,CAAC,MAAM,kBAAkB,GAAG,CAAC,MAAM,EAAE,kBAAkB,EAAE,cAAc,EAAE,cAAc,CAAU,CAAC;AAExG,MAAM,CAAC,MAAM,iBAAiB,GAAG;IAC/B,aAAa;IACb,kBAAkB;IAClB,kBAAkB;IAClB,iBAAiB;IACjB,oBAAoB;CACZ,CAAC;AAEX,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC;AAChC,MAAM,CAAC,MAAM,WAAW,GAAG,IAAI,CAAC,CAAC,KAAK"}