magicmd 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magicmd-0.1.0/.github/workflows/ci.yml +32 -0
- magicmd-0.1.0/.gitignore +11 -0
- magicmd-0.1.0/.magicmd.example.toml +41 -0
- magicmd-0.1.0/CHANGELOG.md +51 -0
- magicmd-0.1.0/LICENSE +21 -0
- magicmd-0.1.0/PKG-INFO +315 -0
- magicmd-0.1.0/README.md +292 -0
- magicmd-0.1.0/README_EN.md +292 -0
- magicmd-0.1.0/SKILL.md +31 -0
- magicmd-0.1.0/docs/MagicMD-v0.1-design.md +425 -0
- magicmd-0.1.0/docs/MagicMD-v0.1-implementation-plan.md +846 -0
- magicmd-0.1.0/docs/development.md +145 -0
- magicmd-0.1.0/docs/releases/v0.1.0.md +199 -0
- magicmd-0.1.0/docs/supported-sites.md +46 -0
- magicmd-0.1.0/docs/wechat-regression-corpus.md +47 -0
- magicmd-0.1.0/pyproject.toml +49 -0
- magicmd-0.1.0/samples/csdn-complex-10.txt +10 -0
- magicmd-0.1.0/samples/juejin-complex-5.txt +17 -0
- magicmd-0.1.0/samples/juejin-homepage-5.txt +17 -0
- magicmd-0.1.0/src/magicmd/__init__.py +2 -0
- magicmd-0.1.0/src/magicmd/assets.py +149 -0
- magicmd-0.1.0/src/magicmd/cli.py +422 -0
- magicmd-0.1.0/src/magicmd/config.py +78 -0
- magicmd-0.1.0/src/magicmd/detect.py +8 -0
- magicmd-0.1.0/src/magicmd/diagnostics.py +118 -0
- magicmd-0.1.0/src/magicmd/fetchers/__init__.py +1 -0
- magicmd-0.1.0/src/magicmd/fetchers/browser.py +51 -0
- magicmd-0.1.0/src/magicmd/fetchers/http.py +17 -0
- magicmd-0.1.0/src/magicmd/models.py +53 -0
- magicmd-0.1.0/src/magicmd/output.py +58 -0
- magicmd-0.1.0/src/magicmd/platforms/__init__.py +1 -0
- magicmd-0.1.0/src/magicmd/platforms/base.py +19 -0
- magicmd-0.1.0/src/magicmd/platforms/csdn.py +76 -0
- magicmd-0.1.0/src/magicmd/platforms/generic.py +54 -0
- magicmd-0.1.0/src/magicmd/platforms/juejin.py +95 -0
- magicmd-0.1.0/src/magicmd/platforms/registry.py +73 -0
- magicmd-0.1.0/src/magicmd/platforms/shared/__init__.py +0 -0
- magicmd-0.1.0/src/magicmd/platforms/shared/content.py +440 -0
- magicmd-0.1.0/src/magicmd/platforms/shared/markdown.py +95 -0
- magicmd-0.1.0/src/magicmd/platforms/shared/metadata.py +38 -0
- magicmd-0.1.0/src/magicmd/platforms/wechat.py +57 -0
- magicmd-0.1.0/src/magicmd/quality.py +199 -0
- magicmd-0.1.0/src/magicmd/renderers/__init__.py +1 -0
- magicmd-0.1.0/src/magicmd/renderers/markdown.py +62 -0
- magicmd-0.1.0/src/magicmd/templates/magicmd.example.toml +41 -0
- magicmd-0.1.0/tests/fixtures/csdn/basic.html +14 -0
- magicmd-0.1.0/tests/fixtures/generic/basic.html +12 -0
- magicmd-0.1.0/tests/fixtures/juejin/basic.html +14 -0
- magicmd-0.1.0/tests/fixtures/site_validation_manifest.json +119 -0
- magicmd-0.1.0/tests/fixtures/wechat/basic.html +19 -0
- magicmd-0.1.0/tests/fixtures/wechat_regression_manifest.json +56 -0
- magicmd-0.1.0/tests/test_assets.py +107 -0
- magicmd-0.1.0/tests/test_browser_fetcher.py +81 -0
- magicmd-0.1.0/tests/test_cli.py +654 -0
- magicmd-0.1.0/tests/test_config.py +67 -0
- magicmd-0.1.0/tests/test_detect.py +17 -0
- magicmd-0.1.0/tests/test_markdown.py +23 -0
- magicmd-0.1.0/tests/test_models.py +38 -0
- magicmd-0.1.0/tests/test_output.py +21 -0
- magicmd-0.1.0/tests/test_platform_csdn.py +209 -0
- magicmd-0.1.0/tests/test_platform_generic.py +36 -0
- magicmd-0.1.0/tests/test_platform_juejin.py +154 -0
- magicmd-0.1.0/tests/test_platform_registry.py +15 -0
- magicmd-0.1.0/tests/test_platform_wechat.py +343 -0
- magicmd-0.1.0/tests/test_project_metadata.py +27 -0
- magicmd-0.1.0/tests/test_quality.py +177 -0
- magicmd-0.1.0/tests/test_regression_manifest.py +15 -0
- magicmd-0.1.0/tests/test_site_validation_manifest.py +26 -0
- magicmd-0.1.0/uv.lock +1880 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
  push:
|
|
5
|
+
  pull_request:
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
  test:
|
|
9
|
+
    runs-on: ubuntu-latest
|
|
10
|
+
|
|
11
|
+
    steps:
|
|
12
|
+
      - uses: actions/checkout@v4
|
|
13
|
+
|
|
14
|
+
      - name: Install uv
|
|
15
|
+
        uses: astral-sh/setup-uv@v5
|
|
16
|
+
        with:
|
|
17
|
+
          enable-cache: true
|
|
18
|
+
|
|
19
|
+
      - name: Set up Python
|
|
20
|
+
        run: uv python install 3.11
|
|
21
|
+
|
|
22
|
+
      - name: Install dependencies
|
|
23
|
+
        run: uv sync --extra dev
|
|
24
|
+
|
|
25
|
+
      - name: Run tests
|
|
26
|
+
        run: uv run pytest -q
|
|
27
|
+
|
|
28
|
+
      - name: Run ruff
|
|
29
|
+
        run: uv run ruff check .
|
|
30
|
+
|
|
31
|
+
      - name: Build package
|
|
32
|
+
        run: uv build
|
magicmd-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[output]
|
|
2
|
+
directory = "output"
|
|
3
|
+
overwrite = false
|
|
4
|
+
save_debug_html = "on_failure"
|
|
5
|
+
|
|
6
|
+
[markdown]
|
|
7
|
+
template = "default"
|
|
8
|
+
front_matter = "yaml"
|
|
9
|
+
include_source_block = true
|
|
10
|
+
heading_offset = 0
|
|
11
|
+
|
|
12
|
+
[images]
|
|
13
|
+
download = true
|
|
14
|
+
directory = "images"
|
|
15
|
+
filename_pattern = "img_{index:03d}.{ext}"
|
|
16
|
+
concurrency = 5
|
|
17
|
+
|
|
18
|
+
[fetch]
|
|
19
|
+
timeout_seconds = 20
|
|
20
|
+
browser_timeout_seconds = 15
|
|
21
|
+
browser_attempts = 2
|
|
22
|
+
user_agent = "default"
|
|
23
|
+
|
|
24
|
+
[platforms.wechat]
|
|
25
|
+
enabled = true
|
|
26
|
+
browser = "camoufox"
|
|
27
|
+
wait_selector = "#js_content"
|
|
28
|
+
|
|
29
|
+
[platforms.juejin]
|
|
30
|
+
enabled = true
|
|
31
|
+
browser = "camoufox"
|
|
32
|
+
wait_selector = "article"
|
|
33
|
+
|
|
34
|
+
[platforms.csdn]
|
|
35
|
+
enabled = true
|
|
36
|
+
browser = "camoufox"
|
|
37
|
+
wait_selector = "#content_views"
|
|
38
|
+
|
|
39
|
+
[platforms.generic]
|
|
40
|
+
enabled = true
|
|
41
|
+
browser = "http"
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## v0.1.0 - 2026-06-06
|
|
4
|
+
|
|
5
|
+
### 中文
|
|
6
|
+
|
|
7
|
+
MagicMD v0.1.0 固化为可用的独立 CLI 基线,重点覆盖公开文章链接到 Markdown 内容包的转换流程。
|
|
8
|
+
|
|
9
|
+
- 支持单篇 URL 转换和批量 URL 转换。
|
|
10
|
+
- 支持微信公众号、掘金、CSDN 和通用公开文章页面。
|
|
11
|
+
- 微信公众号解析已覆盖视频占位与本地下载、图片懒加载、动图占位过滤、富文本加粗清洗、非代码 `pre` 文本、块级链接边界、推荐阅读区等真实样例问题。
|
|
12
|
+
- 掘金解析已覆盖真实首页样本和复杂技术文章样本,包含图片下载、外链还原、代码块保留和标题层级归一化。
|
|
13
|
+
- CSDN 解析已覆盖 10 篇复杂真实样本,并人工确认代码块错位、孤立数字、Mermaid/SVG 图、站内目录死链和代码控件噪声等问题。
|
|
14
|
+
- 输出 `article.md`、`metadata.json`、`extraction-report.json` 和本地媒体目录。
|
|
15
|
+
- 平台注册表集中管理 URL 匹配、默认抓取方式、等待选择器和解析器入口,便于后续扩展新站点。
|
|
16
|
+
- 浏览器抓取配置已真实生效,支持通过配置控制 Camoufox 等待超时和最大尝试次数。
|
|
17
|
+
- 平台通用转换逻辑已从 `platforms/base.py` 拆分到 `platforms/shared/`,并保留 `base.py` 兼容入口,降低后续维护成本。
|
|
18
|
+
- 平台测试已按 WeChat、Juejin、CSDN 和 Generic 拆分,便于单站点回归和定位问题。
|
|
19
|
+
- 批量转换后自动生成 `batch-report.json` 和 `batch-report.md`,用于快速定位失败链接、解析 warning 和 Markdown 质量疑点。
|
|
20
|
+
- 批量报告补充 `platform`、`fetcher`、`stage`、`elapsed_ms`、`max_attempts` 和 `retry_enabled` 等诊断字段,方便定位失败阶段和抓取上下文。
|
|
21
|
+
- 批量命令新增 `--skip-existing` 和 `--overwrite`,支持重复跑回归集时跳过已有内容包,或显式覆盖同名输出包。
|
|
22
|
+
- 浏览器抓取层默认会对瞬时 Camoufox/Playwright 失败自动重试一次(总尝试次数可通过 `fetch.browser_attempts` 配置),降低批量转换中的偶发中断。
|
|
23
|
+
- `magicmd doctor` 已升级为环境诊断命令,可检查 Python 版本、MagicMD 版本、配置解析、输出目录可写性、Camoufox 可用性和平台默认抓取方式。
|
|
24
|
+
- 根命令新增 `--version`,方便安装后快速确认当前 CLI 版本。
|
|
25
|
+
- 建立微信公众号回归样本清单:`tests/fixtures/wechat_regression_manifest.json`。
|
|
26
|
+
- 建立跨站点验证清单:`tests/fixtures/site_validation_manifest.json`。
|
|
27
|
+
- 发布前构建检查通过:`uv build`、wheel/sdist 内容检查、临时 wheel 安装 smoke test 和 `twine check` 均已验证。
|
|
28
|
+
|
|
29
|
+
### English
|
|
30
|
+
|
|
31
|
+
MagicMD v0.1.0 is the first usable standalone CLI baseline for converting public article URLs into Markdown content packages.
|
|
32
|
+
|
|
33
|
+
- Supports single URL conversion and batch URL conversion.
|
|
34
|
+
- Supports WeChat public account articles, Juejin, CSDN, and generic public article pages.
|
|
35
|
+
- The WeChat parser covers real-world formatting issues around video placeholders and local downloads, lazy images, decorative GIF filtering, rich-text bold cleanup, non-code `pre` text, block-link boundaries, and recommendation sections.
|
|
36
|
+
- The Juejin parser has been validated against live homepage samples and complex technical articles, covering image download, external-link restoration, code blocks, and heading-depth normalization.
|
|
37
|
+
- The CSDN parser has been validated against ten complex live samples, with manual review for code-block misalignment, stray numeric markers, Mermaid/SVG diagrams, dead in-site table-of-contents links, and code-widget noise.
|
|
38
|
+
- Outputs `article.md`, `metadata.json`, `extraction-report.json`, and local media directories.
|
|
39
|
+
- Centralizes URL matching, default fetch mode, wait selectors, and parser entrypoints in a platform registry to make new-site support easier to add.
|
|
40
|
+
- Browser fetch configuration is now effective, allowing Camoufox wait timeouts and maximum attempts to be controlled from config.
|
|
41
|
+
- Shared platform conversion logic has been split from `platforms/base.py` into `platforms/shared/`, while `base.py` remains as a compatibility entrypoint.
|
|
42
|
+
- Platform tests are now split by WeChat, Juejin, CSDN, and Generic coverage to make site-specific regression easier.
|
|
43
|
+
- Batch conversion now generates `batch-report.json` and `batch-report.md` for failed URLs, extraction warnings, and Markdown quality signals.
|
|
44
|
+
- Batch reports now include diagnostic fields such as `platform`, `fetcher`, `stage`, `elapsed_ms`, `max_attempts`, and `retry_enabled` to make failures easier to locate.
|
|
45
|
+
- The batch command now supports `--skip-existing` and `--overwrite` for repeated regression runs and explicit output replacement.
|
|
46
|
+
- Browser fetching now retries transient Camoufox/Playwright failures once by default (the total number of attempts is configurable via `fetch.browser_attempts`), reducing intermittent interruptions during batch conversion.
|
|
47
|
+
- `magicmd doctor` is now a real runtime diagnostic command that checks Python version, MagicMD version, config parsing, output writability, Camoufox availability, and platform defaults.
|
|
48
|
+
- Adds a root `--version` option so installed CLI environments can quickly confirm the active MagicMD version.
|
|
49
|
+
- Adds the WeChat regression corpus manifest: `tests/fixtures/wechat_regression_manifest.json`.
|
|
50
|
+
- Adds the cross-site validation manifest: `tests/fixtures/site_validation_manifest.json`.
|
|
51
|
+
- Pre-release build checks passed: `uv build`, wheel/sdist content inspection, temporary wheel-install smoke tests, and `twine check` have all been verified.
|
magicmd-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 MagicMD Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
magicmd-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: magicmd
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert public article links into configurable Markdown packages.
|
|
5
|
+
Author: MagicMD Contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Keywords: article,crawler,juejin,markdown,wechat
|
|
9
|
+
Requires-Python: >=3.11
|
|
10
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
11
|
+
Requires-Dist: camoufox[geoip]>=0.4
|
|
12
|
+
Requires-Dist: chardet>=5.2
|
|
13
|
+
Requires-Dist: httpx>=0.27
|
|
14
|
+
Requires-Dist: markdownify>=0.13
|
|
15
|
+
Requires-Dist: pydantic>=2.8
|
|
16
|
+
Requires-Dist: rich>=13.7
|
|
17
|
+
Requires-Dist: typer>=0.12
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
20
|
+
Requires-Dist: pytest>=8.2; extra == 'dev'
|
|
21
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# MagicMD
|
|
25
|
+
|
|
26
|
+
中文 | [English](./README_EN.md)
|
|
27
|
+
|
|
28
|
+
把散落在网页里的好文章,变成你仓库里的 Markdown。
|
|
29
|
+
|
|
30
|
+
MagicMD 是一个面向公开文章链接的 Markdown 转换工具。你给它一条 URL,或者一整个 URL 列表,它把文章正文、图片、来源信息和转换报告整理成一个可长期保存的内容包。
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
magicmd "https://mp.weixin.qq.com/s/example"
|
|
34
|
+
magicmd batch urls.txt -o output/
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
输出不是一段临时文本,而是一份可以直接进入内容工作流的目录:
|
|
38
|
+
|
|
39
|
+
```text
|
|
40
|
+
output/article-title/
|
|
41
|
+
├── article.md
|
|
42
|
+
├── metadata.json
|
|
43
|
+
├── extraction-report.json
|
|
44
|
+
└── images/
|
|
45
|
+
    ├── img_001.png
|
|
46
|
+
    └── img_002.png
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
MagicMD 可以当 CLI 用,也可以作为 Agent Skill 使用。人负责给链接,工具负责落盘;Agent 负责批量整理时,也走同一套规则。
|
|
50
|
+
|
|
51
|
+
## 适合做什么
|
|
52
|
+
|
|
53
|
+
- 把微信公众号文章保存为 Markdown。
|
|
54
|
+
- 把掘金、CSDN 技术文章沉淀到本地知识库。
|
|
55
|
+
- 把公开文章批量整理到 GitHub、Hugo、Docusaurus、HaoGit 或自己的站点。
|
|
56
|
+
- 给 Agent 一个稳定的“文章链接转 Markdown”能力,而不是每次重新写提示词。
|
|
57
|
+
|
|
58
|
+
MagicMD 不是浏览器收藏夹,也不是通用爬虫框架。它的目标更窄:**把公开文章页面尽量干净、可追踪、可复用地转成 Markdown 内容包。**
|
|
59
|
+
|
|
60
|
+
## 和同类工具有什么不同
|
|
61
|
+
|
|
62
|
+
很多工具可以把网页或文件转成 Markdown。MagicMD 更关心另一个问题:**中文内容平台里的文章,怎么稳定变成能长期维护的 Markdown。**
|
|
63
|
+
|
|
64
|
+
| 类型 | 常见特点 | MagicMD 的差异 |
|
|
65
|
+
| --- | --- | --- |
|
|
66
|
+
| 通用网页转 Markdown 工具 | 更适合标准网页、文档页、英文站点或 LLM 输入清洗。 | MagicMD 针对微信公众号、掘金、CSDN 做平台适配,重点处理中文内容平台常见的富文本、跳转链接、代码控件和编辑器噪声。 |
|
|
67
|
+
| 微信文章转换脚本 | 往往能抓正文和图片,但配置、批量、报告和多平台扩展有限。 | MagicMD 不只转微信,还保留 `metadata.json`、`extraction-report.json`、批量报告和可配置 Markdown 输出,方便后续发布或自动化处理。 |
|
|
68
|
+
| 爬虫框架 | 能力强,但通常需要自己写解析逻辑、清洗规则和输出结构。 | MagicMD 直接给文章采集场景一个可用 CLI:链接进去,内容包落盘。 |
|
|
69
|
+
| 手动复制到 Markdown | 可控,但慢,图片、链接、代码块和来源信息很容易丢。 | MagicMD 自动处理图片本地化、标题层级、代码块、链接、来源信息和失败 warning。 |
|
|
70
|
+
|
|
71
|
+
MagicMD 的优势不是“抓全网”,而是把中文技术内容归档这件事做细:微信视频会提取链接并尝试下载;掘金外链会尽量还原真实目标地址;CSDN 代码块会清理复制按钮、行号和编辑器控件;批量转换会留下报告,方便你知道哪篇文章需要人工复核。
|
|
72
|
+
|
|
73
|
+
## 安装
|
|
74
|
+
|
|
75
|
+
当前 v0.1 推荐从源码安装或开发安装。Clone 仓库后运行:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
cd magicmd
|
|
79
|
+
uv sync --extra dev
|
|
80
|
+
uv run magicmd doctor
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
安装成全局命令后,可以直接运行 `magicmd`:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
uv tool install --editable .
|
|
87
|
+
magicmd doctor
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
如果你习惯 `pipx`:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
pipx install .
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
还没有全局安装时,把下面命令里的 `magicmd` 换成 `uv run magicmd` 即可。
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
uv run magicmd batch urls.txt -o output/
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### PyPI 和 npm
|
|
103
|
+
|
|
104
|
+
MagicMD 目前还没有发布到 PyPI,所以暂时不能直接运行:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
uv tool install magicmd
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
MagicMD 目前也不是 npm 包,所以暂时不支持:
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
npm install -g magicmd
|
|
114
|
+
npx magicmd
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
npm 入口适合后续做成轻量 wrapper:用户通过 npm 安装,底层仍调用 MagicMD CLI。v0.1 先把 Python CLI 稳住,再评估这个入口。
|
|
118
|
+
|
|
119
|
+
## 快速使用
|
|
120
|
+
|
|
121
|
+
转换单篇文章:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
magicmd "https://mp.weixin.qq.com/s/example"
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
指定输出目录:
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
magicmd convert "https://juejin.cn/post/example" -o output/
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
批量转换:
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
magicmd batch urls.txt -o output/
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
`urls.txt` 一行一个链接:
|
|
140
|
+
|
|
141
|
+
```text
|
|
142
|
+
https://mp.weixin.qq.com/s/example
|
|
143
|
+
https://juejin.cn/post/example
|
|
144
|
+
https://blog.csdn.net/user/article/details/123
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
重复跑同一批链接时,跳过已经生成过的内容包:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
magicmd batch urls.txt -o output/ --skip-existing
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
确认要重新生成时,覆盖同名输出包:
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
magicmd batch urls.txt -o output/ --overwrite
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
只要正文,不下载图片:
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
magicmd convert "https://blog.csdn.net/user/article/details/123" --no-images
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## 支持站点
|
|
166
|
+
|
|
167
|
+
| 站点 | 状态 | 默认抓取 | 说明 |
|
|
168
|
+
| --- | --- | --- | --- |
|
|
169
|
+
| 微信公众号 `mp.weixin.qq.com` | 稳定主目标 | `camoufox` | v0.1 最主要的验证对象,已做多轮真实样本格式修复。 |
|
|
170
|
+
| 掘金 `juejin.cn` | 实验支持 | `camoufox` | 已验证首页样本和复杂技术文章,重点看图片、代码块、外链和标题层级。 |
|
|
171
|
+
| CSDN `blog.csdn.net` | 实验支持 | `camoufox` | 已人工检查 10 篇复杂样本,重点修过代码块、Mermaid/SVG、目录链接和控件噪声。 |
|
|
172
|
+
| 通用网页 | 尽力支持 | `http` | 对标准 `article`、`main` 或 Open Graph 元信息页面做基础提取。 |
|
|
173
|
+
|
|
174
|
+
更多站点说明见 [docs/supported-sites.md](./docs/supported-sites.md)。
|
|
175
|
+
|
|
176
|
+
## MagicMD 会生成什么
|
|
177
|
+
|
|
178
|
+
单篇文章会生成一个内容包:
|
|
179
|
+
|
|
180
|
+
```text
|
|
181
|
+
output/
|
|
182
|
+
└── article-title/
|
|
183
|
+
    ├── article.md # Markdown 正文
|
|
184
|
+
    ├── metadata.json # 标题、作者、时间、来源、hash 等
|
|
185
|
+
    ├── extraction-report.json # 抓取、解析、媒体和 warning
|
|
186
|
+
    └── images/ # 下载后的本地图片
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
批量转换会额外生成:
|
|
190
|
+
|
|
191
|
+
```text
|
|
192
|
+
output/
|
|
193
|
+
├── batch-report.json # 适合程序读取
|
|
194
|
+
└── batch-report.md # 适合人工检查
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
默认的 `article.md` 类似这样:
|
|
198
|
+
|
|
199
|
+
```md
|
|
200
|
+
---
|
|
201
|
+
title: "Example Article"
|
|
202
|
+
author: "Example Author"
|
|
203
|
+
platform: "wechat"
|
|
204
|
+
source_url: "https://mp.weixin.qq.com/s/example"
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
# Example Article
|
|
208
|
+
|
|
209
|
+
> Source: wechat
|
|
210
|
+
> Author: Example Author
|
|
211
|
+
> Original: https://mp.weixin.qq.com/s/example
|
|
212
|
+
|
|
213
|
+
正文内容...
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
## CLI 和 Skill
|
|
217
|
+
|
|
218
|
+
MagicMD 有两个入口。
|
|
219
|
+
|
|
220
|
+
第一个是给人用的 CLI:
|
|
221
|
+
|
|
222
|
+
```bash
|
|
223
|
+
magicmd batch urls.txt -o output/
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
第二个是给 Agent 用的 [SKILL.md](./SKILL.md)。Skill 把“什么时候使用 MagicMD、怎么运行、检查哪些文件、遇到失败看什么报告”写成固定流程。这样 Agent 不需要每次猜命令,也不会把登录页、付费墙、验证码页面当成正常文章处理。
|
|
227
|
+
|
|
228
|
+
如果你未来要把 MagicMD 接入 HaoGit,建议让 Agent 先调用 Skill 完成采集和转换,再把 `article.md`、`metadata.json` 和图片交给发布流程。
|
|
229
|
+
|
|
230
|
+
## 配置
|
|
231
|
+
|
|
232
|
+
生成配置文件:
|
|
233
|
+
|
|
234
|
+
```bash
|
|
235
|
+
magicmd config init
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
配置文件示例见 [.magicmd.example.toml](./.magicmd.example.toml)。
|
|
239
|
+
|
|
240
|
+
常用配置:
|
|
241
|
+
|
|
242
|
+
```toml
|
|
243
|
+
[output]
|
|
244
|
+
directory = "output"
|
|
245
|
+
overwrite = false
|
|
246
|
+
save_debug_html = "on_failure"
|
|
247
|
+
|
|
248
|
+
[markdown]
|
|
249
|
+
template = "default"
|
|
250
|
+
front_matter = "yaml"
|
|
251
|
+
include_source_block = true
|
|
252
|
+
heading_offset = 0
|
|
253
|
+
|
|
254
|
+
[images]
|
|
255
|
+
download = true
|
|
256
|
+
directory = "images"
|
|
257
|
+
filename_pattern = "img_{index:03d}.{ext}"
|
|
258
|
+
|
|
259
|
+
[fetch]
|
|
260
|
+
timeout_seconds = 20
|
|
261
|
+
browser_timeout_seconds = 15
|
|
262
|
+
browser_attempts = 2
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
常见选项:
|
|
266
|
+
|
|
267
|
+
| 配置 | 说明 |
|
|
268
|
+
| --- | --- |
|
|
269
|
+
| `output.directory` | 默认输出目录。 |
|
|
270
|
+
| `output.overwrite` | 是否覆盖同名内容包。 |
|
|
271
|
+
| `output.save_debug_html` | `always`、`on_failure`、`never`,控制是否保存 `debug.html`。 |
|
|
272
|
+
| `markdown.front_matter` | `yaml` 或 `none`。 |
|
|
273
|
+
| `markdown.template` | `default` 或 `clean`。 |
|
|
274
|
+
| `markdown.heading_offset` | 统一调整 Markdown 标题层级。 |
|
|
275
|
+
| `images.download` | 是否下载图片。 |
|
|
276
|
+
| `fetch.browser_attempts` | 浏览器模式抓取的最大总尝试次数(含首次尝试)。 |
|
|
277
|
+
| `platforms.<name>.browser` | 使用 `http` 或 `camoufox`。 |
|
|
278
|
+
| `platforms.<name>.wait_selector` | 浏览器抓取时等待的选择器。 |
|
|
279
|
+
|
|
280
|
+
检查环境:
|
|
281
|
+
|
|
282
|
+
```bash
|
|
283
|
+
magicmd doctor
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
`doctor` 会检查 Python 版本、MagicMD 版本、配置文件解析、输出目录可写性、Camoufox 是否可用,以及各平台默认抓取方式。
|
|
287
|
+
|
|
288
|
+
## 使用前须知
|
|
289
|
+
|
|
290
|
+
MagicMD 只处理公开文章页面。它不会绕过登录、付费墙、私有内容、验证码或平台访问限制。
|
|
291
|
+
|
|
292
|
+
如果遇到 403、验证码、登录限制、视频防盗链或动态资源失效,MagicMD 会尽量保留已经能提取的内容,并在报告里记录 warning 或失败原因。
|
|
293
|
+
|
|
294
|
+
如果某个平台的页面结构变化导致转换效果下降,建议先保留输出目录里的 `extraction-report.json`,再用同一链接复现问题。真实样本记录放在 [docs/wechat-regression-corpus.md](./docs/wechat-regression-corpus.md) 和 [tests/fixtures/site_validation_manifest.json](./tests/fixtures/site_validation_manifest.json),不放在首页展开。
|
|
295
|
+
|
|
296
|
+
## 开发文档
|
|
297
|
+
|
|
298
|
+
- [docs/development.md](./docs/development.md):项目结构、核心模块、转换流程和验证命令。
|
|
299
|
+
- [docs/supported-sites.md](./docs/supported-sites.md):当前支持站点和注意事项。
|
|
300
|
+
- [docs/wechat-regression-corpus.md](./docs/wechat-regression-corpus.md):微信公众号真实样本回归说明。
|
|
301
|
+
- [docs/MagicMD-v0.1-design.md](./docs/MagicMD-v0.1-design.md):v0.1 设计说明。
|
|
302
|
+
|
|
303
|
+
## 接下来
|
|
304
|
+
|
|
305
|
+
- 发布到 PyPI,支持 `uv tool install magicmd`。
|
|
306
|
+
- 评估 npm wrapper,支持 `npm install -g magicmd` 或 `npx magicmd`。
|
|
307
|
+
- 增加 Markdown 模板系统。
|
|
308
|
+
- 增加 GitHub 发布能力。
|
|
309
|
+
- 增加 HaoGit 导入能力。
|
|
310
|
+
- 扩充微信公众号、掘金、CSDN 真实样本回归集。
|
|
311
|
+
- 增加更多站点适配器。
|
|
312
|
+
|
|
313
|
+
## License
|
|
314
|
+
|
|
315
|
+
[MIT](./LICENSE)
|