any2md 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- any2md-0.4.0/.gitignore +40 -0
- any2md-0.4.0/CHANGELOG.md +26 -0
- any2md-0.4.0/LICENSE +21 -0
- any2md-0.4.0/PKG-INFO +256 -0
- any2md-0.4.0/README.md +234 -0
- any2md-0.4.0/any2md/__init__.py +3 -0
- any2md-0.4.0/any2md/__main__.py +4 -0
- any2md-0.4.0/any2md/cli.py +145 -0
- any2md-0.4.0/any2md/converters/__init__.py +32 -0
- any2md-0.4.0/any2md/converters/docx.py +72 -0
- any2md-0.4.0/any2md/converters/html.py +162 -0
- any2md-0.4.0/any2md/converters/pdf.py +73 -0
- any2md-0.4.0/any2md/utils.py +86 -0
- any2md-0.4.0/pyproject.toml +54 -0
- any2md-0.4.0/requirements.txt +6 -0
any2md-0.4.0/.gitignore
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
|
|
7
|
+
# Virtual environments
|
|
8
|
+
venv/
|
|
9
|
+
.venv/
|
|
10
|
+
env/
|
|
11
|
+
|
|
12
|
+
# Distribution / packaging
|
|
13
|
+
dist/
|
|
14
|
+
build/
|
|
15
|
+
*.egg-info/
|
|
16
|
+
*.egg
|
|
17
|
+
|
|
18
|
+
# IDE
|
|
19
|
+
.idea/
|
|
20
|
+
.vscode/
|
|
21
|
+
*.swp
|
|
22
|
+
*.swo
|
|
23
|
+
|
|
24
|
+
# OS
|
|
25
|
+
.DS_Store
|
|
26
|
+
Thumbs.db
|
|
27
|
+
|
|
28
|
+
# Environment variables
|
|
29
|
+
.env
|
|
30
|
+
.env.local
|
|
31
|
+
|
|
32
|
+
# Generated output
|
|
33
|
+
output/
|
|
34
|
+
Text/
|
|
35
|
+
input/
|
|
36
|
+
|
|
37
|
+
# ZERG / dev tooling
|
|
38
|
+
.gsd/
|
|
39
|
+
.zerg/
|
|
40
|
+
.devcontainer/
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
|
6
|
+
|
|
7
|
+
## [Unreleased]
|
|
8
|
+
|
|
9
|
+
### Added
|
|
10
|
+
- HTML file conversion (.html, .htm) with BeautifulSoup pre-cleaning, trafilatura content extraction, and markdownify fallback
|
|
11
|
+
- URL fetching — convert web pages directly by passing a URL as a positional argument
|
|
12
|
+
- `--strip-links` flag to remove markdown hyperlinks from output, keeping only link text
|
|
13
|
+
- .html/.htm support in `--input-dir` batch scanning
|
|
14
|
+
- Package architecture (`any2md/` package with `converters` subpackage)
|
|
15
|
+
- `python -m any2md` entry point
|
|
16
|
+
- Shared utilities module (`any2md/utils.py`) with `strip_links()` and `url_to_filename()`
|
|
17
|
+
- YAML frontmatter for HTML outputs includes `source_url` when converted from a URL
|
|
18
|
+
|
|
19
|
+
### Changed
|
|
20
|
+
- Refactored from single-file (`any2md.py`) to package architecture
|
|
21
|
+
- `any2md.py` is now a thin wrapper for backward compatibility
|
|
22
|
+
- Updated `SUPPORTED_EXTENSIONS` to include `.html` and `.htm`
|
|
23
|
+
|
|
24
|
+
### Dependencies
|
|
25
|
+
- Added `trafilatura` for HTML content extraction and URL fetching
|
|
26
|
+
- Added `beautifulsoup4` for HTML pre-cleaning
|
any2md-0.4.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 rocklambros
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
any2md-0.4.0/PKG-INFO
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: any2md
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Convert PDF, DOCX, and HTML files — or web pages by URL — to clean, LLM-optimized Markdown with YAML frontmatter.
|
|
5
|
+
Project-URL: Homepage, https://github.com/rocklambros/any2md
|
|
6
|
+
Project-URL: Issues, https://github.com/rocklambros/any2md/issues
|
|
7
|
+
Author-email: rocklambros <rock@rockcyber.com>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
14
|
+
Requires-Python: >=3.8
|
|
15
|
+
Requires-Dist: beautifulsoup4
|
|
16
|
+
Requires-Dist: mammoth
|
|
17
|
+
Requires-Dist: markdownify
|
|
18
|
+
Requires-Dist: pymupdf
|
|
19
|
+
Requires-Dist: pymupdf4llm
|
|
20
|
+
Requires-Dist: trafilatura
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# any2md
|
|
24
|
+
|
|
25
|
+
Convert PDF, DOCX, and HTML files — or web pages by URL — to clean, LLM-optimized Markdown with YAML frontmatter.
|
|
26
|
+
|
|
27
|
+
One command. Any format. Consistent, structured output ready for language models.
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install any2md
|
|
33
|
+
|
|
34
|
+
any2md report.pdf
|
|
35
|
+
any2md https://example.com/article
|
|
36
|
+
any2md --help
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Output lands in `./Text/` by default:
|
|
40
|
+
|
|
41
|
+
```markdown
|
|
42
|
+
---
|
|
43
|
+
title: "Quarterly Financial Report"
|
|
44
|
+
source_file: "report.pdf"
|
|
45
|
+
pages: 12
|
|
46
|
+
type: pdf
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
# Quarterly Financial Report
|
|
50
|
+
|
|
51
|
+
Document content here...
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Features
|
|
55
|
+
|
|
56
|
+
| Feature | Description |
|
|
57
|
+
|---------|-------------|
|
|
58
|
+
| **Multi-format** | PDF, DOCX, HTML (.html, .htm) |
|
|
59
|
+
| **URL fetching** | Pass any http/https URL as input |
|
|
60
|
+
| **YAML frontmatter** | Title, source, page/word count, type |
|
|
61
|
+
| **Batch processing** | Single file, directory scan, or mixed inputs |
|
|
62
|
+
| **Auto-routing** | Dispatches to the correct converter by extension |
|
|
63
|
+
| **Smart skip** | Won't overwrite existing files unless `--force` |
|
|
64
|
+
| **Filename sanitization** | Spaces, special characters, unicode dashes handled |
|
|
65
|
+
| **Title extraction** | Pulls the first H1–H3 heading automatically |
|
|
66
|
+
| **Link stripping** | `--strip-links` removes hyperlinks, keeps text |
|
|
67
|
+
|
|
68
|
+
## Installation
|
|
69
|
+
|
|
70
|
+
Requires **Python 3.8+**.
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pip install any2md
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### From source
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
git clone https://github.com/rocklambros/any2md.git
|
|
80
|
+
cd any2md
|
|
81
|
+
pip install .
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Dependencies
|
|
85
|
+
|
|
86
|
+
| Library | Purpose |
|
|
87
|
+
|---------|---------|
|
|
88
|
+
| [PyMuPDF](https://pymupdf.readthedocs.io/) + [pymupdf4llm](https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/) | PDF extraction |
|
|
89
|
+
| [mammoth](https://github.com/mwilliamson/python-mammoth) + [markdownify](https://github.com/matthewwithanm/python-markdownify) | DOCX conversion |
|
|
90
|
+
| [trafilatura](https://trafilatura.readthedocs.io/) + [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) | HTML/URL extraction |
|
|
91
|
+
|
|
92
|
+
## Usage
|
|
93
|
+
|
|
94
|
+
### Basic conversion
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
# Single file
|
|
98
|
+
any2md report.pdf
|
|
99
|
+
|
|
100
|
+
# Multiple files
|
|
101
|
+
any2md report.pdf proposal.docx "meeting notes.pdf"
|
|
102
|
+
|
|
103
|
+
# HTML file
|
|
104
|
+
any2md page.html
|
|
105
|
+
|
|
106
|
+
# Web page by URL
|
|
107
|
+
any2md https://example.com/article
|
|
108
|
+
|
|
109
|
+
# Mixed batch — PDFs, DOCX, HTML, and URLs together
|
|
110
|
+
any2md doc.pdf page.html https://example.com
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Directory scanning
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
# Scan a specific directory
|
|
117
|
+
any2md --input-dir ./documents
|
|
118
|
+
|
|
119
|
+
# Convert everything in the current directory (default behavior)
|
|
120
|
+
any2md
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Options
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
# Custom output directory
|
|
127
|
+
any2md -o ./converted report.pdf
|
|
128
|
+
|
|
129
|
+
# Overwrite existing files
|
|
130
|
+
any2md --force
|
|
131
|
+
|
|
132
|
+
# Strip hyperlinks from output
|
|
133
|
+
any2md --strip-links doc.pdf
|
|
134
|
+
|
|
135
|
+
# Combine options
|
|
136
|
+
any2md -f -o ./out --strip-links docs/*.pdf docs/*.docx
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Alternative invocations
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
# Module mode (works without installing via pip)
|
|
143
|
+
python -m any2md report.pdf
|
|
144
|
+
|
|
145
|
+
# Legacy script (backward compatibility)
|
|
146
|
+
python3 mdconv.py report.pdf
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Output Format
|
|
150
|
+
|
|
151
|
+
Every converted file has YAML frontmatter followed by cleaned Markdown. The frontmatter fields vary by source format:
|
|
152
|
+
|
|
153
|
+
**PDF** — includes page count:
|
|
154
|
+
|
|
155
|
+
```markdown
|
|
156
|
+
---
|
|
157
|
+
title: "Quarterly Financial Report"
|
|
158
|
+
source_file: "Q3 Report 2024.pdf"
|
|
159
|
+
pages: 12
|
|
160
|
+
type: pdf
|
|
161
|
+
---
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
**DOCX** — includes word count:
|
|
165
|
+
|
|
166
|
+
```markdown
|
|
167
|
+
---
|
|
168
|
+
title: "Project Proposal"
|
|
169
|
+
source_file: "proposal.docx"
|
|
170
|
+
word_count: 3847
|
|
171
|
+
type: docx
|
|
172
|
+
---
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
**HTML file** — includes word count:
|
|
176
|
+
|
|
177
|
+
```markdown
|
|
178
|
+
---
|
|
179
|
+
title: "Page Title"
|
|
180
|
+
source_file: "page.html"
|
|
181
|
+
word_count: 1234
|
|
182
|
+
type: html
|
|
183
|
+
---
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
**URL** — records source URL instead of filename:
|
|
187
|
+
|
|
188
|
+
```markdown
|
|
189
|
+
---
|
|
190
|
+
title: "Article Title"
|
|
191
|
+
source_url: "https://example.com/article"
|
|
192
|
+
word_count: 567
|
|
193
|
+
type: html
|
|
194
|
+
---
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
## CLI Reference
|
|
198
|
+
|
|
199
|
+
```
|
|
200
|
+
usage: any2md [-h] [--input-dir PATH] [--force] [--output-dir PATH] [--strip-links] [files ...]
|
|
201
|
+
|
|
202
|
+
Convert PDF, DOCX, and HTML files to LLM-optimized Markdown.
|
|
203
|
+
|
|
204
|
+
positional arguments:
|
|
205
|
+
files Files or URLs to convert. Supports PDF, DOCX, HTML
|
|
206
|
+
files and http(s) URLs. If omitted, converts all
|
|
207
|
+
supported files in the current directory.
|
|
208
|
+
|
|
209
|
+
options:
|
|
210
|
+
-h, --help show this help message and exit
|
|
211
|
+
--input-dir, -i PATH Directory to scan for supported files (PDF, DOCX, HTML)
|
|
212
|
+
--force, -f Overwrite existing .md files
|
|
213
|
+
--output-dir, -o PATH Output directory (default: ./Text)
|
|
214
|
+
--strip-links Remove markdown links, keeping only the link text
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## Architecture
|
|
218
|
+
|
|
219
|
+
```
|
|
220
|
+
User Input (files, URLs, flags)
|
|
221
|
+
│
|
|
222
|
+
▼
|
|
223
|
+
cli.py ─── parse args, classify URLs vs file paths
|
|
224
|
+
│
|
|
225
|
+
▼
|
|
226
|
+
converters/__init__.py ─── dispatch by extension
|
|
227
|
+
│
|
|
228
|
+
┌────┼────┐
|
|
229
|
+
▼ ▼ ▼
|
|
230
|
+
pdf docx html ─── format-specific extraction
|
|
231
|
+
│ │ │
|
|
232
|
+
└────┼────┘
|
|
233
|
+
▼
|
|
234
|
+
utils.py ─── clean, title-extract, sanitize, frontmatter
|
|
235
|
+
│
|
|
236
|
+
▼
|
|
237
|
+
Output ─── YAML frontmatter + Markdown → output_dir/
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
### Extraction pipelines
|
|
241
|
+
|
|
242
|
+
| Format | Pipeline |
|
|
243
|
+
|--------|----------|
|
|
244
|
+
| **PDF** | `pymupdf4llm.to_markdown()` → clean → frontmatter |
|
|
245
|
+
| **DOCX** | `mammoth` (DOCX → HTML) → `markdownify` (HTML → Markdown) → clean → frontmatter |
|
|
246
|
+
| **HTML/URL** | BS4 pre-clean → `trafilatura` extract (fallback: `markdownify`) → clean → frontmatter |
|
|
247
|
+
|
|
248
|
+
### Adding a new format
|
|
249
|
+
|
|
250
|
+
1. Create `any2md/converters/newformat.py` with a `convert_newformat(path, output_dir, force, strip_links_flag) → bool` function
|
|
251
|
+
2. Add the extension and function to `CONVERTERS` in `any2md/converters/__init__.py`
|
|
252
|
+
3. Add the extension to `SUPPORTED_EXTENSIONS`
|
|
253
|
+
|
|
254
|
+
## License
|
|
255
|
+
|
|
256
|
+
MIT
|
any2md-0.4.0/README.md
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
# any2md
|
|
2
|
+
|
|
3
|
+
Convert PDF, DOCX, and HTML files — or web pages by URL — to clean, LLM-optimized Markdown with YAML frontmatter.
|
|
4
|
+
|
|
5
|
+
One command. Any format. Consistent, structured output ready for language models.
|
|
6
|
+
|
|
7
|
+
## Quick Start
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install any2md
|
|
11
|
+
|
|
12
|
+
any2md report.pdf
|
|
13
|
+
any2md https://example.com/article
|
|
14
|
+
any2md --help
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Output lands in `./Text/` by default:
|
|
18
|
+
|
|
19
|
+
```markdown
|
|
20
|
+
---
|
|
21
|
+
title: "Quarterly Financial Report"
|
|
22
|
+
source_file: "report.pdf"
|
|
23
|
+
pages: 12
|
|
24
|
+
type: pdf
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
# Quarterly Financial Report
|
|
28
|
+
|
|
29
|
+
Document content here...
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Features
|
|
33
|
+
|
|
34
|
+
| Feature | Description |
|
|
35
|
+
|---------|-------------|
|
|
36
|
+
| **Multi-format** | PDF, DOCX, HTML (.html, .htm) |
|
|
37
|
+
| **URL fetching** | Pass any http/https URL as input |
|
|
38
|
+
| **YAML frontmatter** | Title, source, page/word count, type |
|
|
39
|
+
| **Batch processing** | Single file, directory scan, or mixed inputs |
|
|
40
|
+
| **Auto-routing** | Dispatches to the correct converter by extension |
|
|
41
|
+
| **Smart skip** | Won't overwrite existing files unless `--force` |
|
|
42
|
+
| **Filename sanitization** | Spaces, special characters, unicode dashes handled |
|
|
43
|
+
| **Title extraction** | Pulls the first H1–H3 heading automatically |
|
|
44
|
+
| **Link stripping** | `--strip-links` removes hyperlinks, keeps text |
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
47
|
+
|
|
48
|
+
Requires **Python 3.8+**.
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install any2md
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### From source
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
git clone https://github.com/rocklambros/any2md.git
|
|
58
|
+
cd any2md
|
|
59
|
+
pip install .
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Dependencies
|
|
63
|
+
|
|
64
|
+
| Library | Purpose |
|
|
65
|
+
|---------|---------|
|
|
66
|
+
| [PyMuPDF](https://pymupdf.readthedocs.io/) + [pymupdf4llm](https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/) | PDF extraction |
|
|
67
|
+
| [mammoth](https://github.com/mwilliamson/python-mammoth) + [markdownify](https://github.com/matthewwithanm/python-markdownify) | DOCX conversion |
|
|
68
|
+
| [trafilatura](https://trafilatura.readthedocs.io/) + [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) | HTML/URL extraction |
|
|
69
|
+
|
|
70
|
+
## Usage
|
|
71
|
+
|
|
72
|
+
### Basic conversion
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
# Single file
|
|
76
|
+
any2md report.pdf
|
|
77
|
+
|
|
78
|
+
# Multiple files
|
|
79
|
+
any2md report.pdf proposal.docx "meeting notes.pdf"
|
|
80
|
+
|
|
81
|
+
# HTML file
|
|
82
|
+
any2md page.html
|
|
83
|
+
|
|
84
|
+
# Web page by URL
|
|
85
|
+
any2md https://example.com/article
|
|
86
|
+
|
|
87
|
+
# Mixed batch — PDFs, DOCX, HTML, and URLs together
|
|
88
|
+
any2md doc.pdf page.html https://example.com
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Directory scanning
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
# Scan a specific directory
|
|
95
|
+
any2md --input-dir ./documents
|
|
96
|
+
|
|
97
|
+
# Convert everything in the current directory (default behavior)
|
|
98
|
+
any2md
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Options
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
# Custom output directory
|
|
105
|
+
any2md -o ./converted report.pdf
|
|
106
|
+
|
|
107
|
+
# Overwrite existing files
|
|
108
|
+
any2md --force
|
|
109
|
+
|
|
110
|
+
# Strip hyperlinks from output
|
|
111
|
+
any2md --strip-links doc.pdf
|
|
112
|
+
|
|
113
|
+
# Combine options
|
|
114
|
+
any2md -f -o ./out --strip-links docs/*.pdf docs/*.docx
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Alternative invocations
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
# Module mode (works without installing via pip)
|
|
121
|
+
python -m any2md report.pdf
|
|
122
|
+
|
|
123
|
+
# Legacy script (backward compatibility)
|
|
124
|
+
python3 mdconv.py report.pdf
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Output Format
|
|
128
|
+
|
|
129
|
+
Every converted file has YAML frontmatter followed by cleaned Markdown. The frontmatter fields vary by source format:
|
|
130
|
+
|
|
131
|
+
**PDF** — includes page count:
|
|
132
|
+
|
|
133
|
+
```markdown
|
|
134
|
+
---
|
|
135
|
+
title: "Quarterly Financial Report"
|
|
136
|
+
source_file: "Q3 Report 2024.pdf"
|
|
137
|
+
pages: 12
|
|
138
|
+
type: pdf
|
|
139
|
+
---
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
**DOCX** — includes word count:
|
|
143
|
+
|
|
144
|
+
```markdown
|
|
145
|
+
---
|
|
146
|
+
title: "Project Proposal"
|
|
147
|
+
source_file: "proposal.docx"
|
|
148
|
+
word_count: 3847
|
|
149
|
+
type: docx
|
|
150
|
+
---
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
**HTML file** — includes word count:
|
|
154
|
+
|
|
155
|
+
```markdown
|
|
156
|
+
---
|
|
157
|
+
title: "Page Title"
|
|
158
|
+
source_file: "page.html"
|
|
159
|
+
word_count: 1234
|
|
160
|
+
type: html
|
|
161
|
+
---
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
**URL** — records source URL instead of filename:
|
|
165
|
+
|
|
166
|
+
```markdown
|
|
167
|
+
---
|
|
168
|
+
title: "Article Title"
|
|
169
|
+
source_url: "https://example.com/article"
|
|
170
|
+
word_count: 567
|
|
171
|
+
type: html
|
|
172
|
+
---
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## CLI Reference
|
|
176
|
+
|
|
177
|
+
```
|
|
178
|
+
usage: any2md [-h] [--input-dir PATH] [--force] [--output-dir PATH] [--strip-links] [files ...]
|
|
179
|
+
|
|
180
|
+
Convert PDF, DOCX, and HTML files to LLM-optimized Markdown.
|
|
181
|
+
|
|
182
|
+
positional arguments:
|
|
183
|
+
files Files or URLs to convert. Supports PDF, DOCX, HTML
|
|
184
|
+
files and http(s) URLs. If omitted, converts all
|
|
185
|
+
supported files in the current directory.
|
|
186
|
+
|
|
187
|
+
options:
|
|
188
|
+
-h, --help show this help message and exit
|
|
189
|
+
--input-dir, -i PATH Directory to scan for supported files (PDF, DOCX, HTML)
|
|
190
|
+
--force, -f Overwrite existing .md files
|
|
191
|
+
--output-dir, -o PATH Output directory (default: ./Text)
|
|
192
|
+
--strip-links Remove markdown links, keeping only the link text
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Architecture
|
|
196
|
+
|
|
197
|
+
```
|
|
198
|
+
User Input (files, URLs, flags)
|
|
199
|
+
│
|
|
200
|
+
▼
|
|
201
|
+
cli.py ─── parse args, classify URLs vs file paths
|
|
202
|
+
│
|
|
203
|
+
▼
|
|
204
|
+
converters/__init__.py ─── dispatch by extension
|
|
205
|
+
│
|
|
206
|
+
┌────┼────┐
|
|
207
|
+
▼ ▼ ▼
|
|
208
|
+
pdf docx html ─── format-specific extraction
|
|
209
|
+
│ │ │
|
|
210
|
+
└────┼────┘
|
|
211
|
+
▼
|
|
212
|
+
utils.py ─── clean, title-extract, sanitize, frontmatter
|
|
213
|
+
│
|
|
214
|
+
▼
|
|
215
|
+
Output ─── YAML frontmatter + Markdown → output_dir/
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
### Extraction pipelines
|
|
219
|
+
|
|
220
|
+
| Format | Pipeline |
|
|
221
|
+
|--------|----------|
|
|
222
|
+
| **PDF** | `pymupdf4llm.to_markdown()` → clean → frontmatter |
|
|
223
|
+
| **DOCX** | `mammoth` (DOCX → HTML) → `markdownify` (HTML → Markdown) → clean → frontmatter |
|
|
224
|
+
| **HTML/URL** | BS4 pre-clean → `trafilatura` extract (fallback: `markdownify`) → clean → frontmatter |
|
|
225
|
+
|
|
226
|
+
### Adding a new format
|
|
227
|
+
|
|
228
|
+
1. Create `any2md/converters/newformat.py` with a `convert_newformat(path, output_dir, force, strip_links_flag) → bool` function
|
|
229
|
+
2. Add the extension and function to `CONVERTERS` in `any2md/converters/__init__.py`
|
|
230
|
+
3. Add the extension to `SUPPORTED_EXTENSIONS`
|
|
231
|
+
|
|
232
|
+
## License
|
|
233
|
+
|
|
234
|
+
MIT
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""CLI entry point for any2md."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
import time
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from any2md.converters import convert_file, SUPPORTED_EXTENSIONS
|
|
9
|
+
from any2md.converters.html import convert_html, fetch_url
|
|
10
|
+
from any2md.utils import sanitize_filename, url_to_filename
|
|
11
|
+
|
|
12
|
+
# Working directory captured once at import time; used as the scan root when
# neither positional inputs nor --input-dir are given.
SCRIPT_DIR = Path.cwd()
# Destination for generated Markdown when --output-dir is not supplied.
DEFAULT_OUTPUT_DIR = SCRIPT_DIR / "Text"


def _scan_directory(directory, extensions):
    """Return the files directly inside *directory* whose suffix is supported.

    The suffix is compared case-insensitively so that ``REPORT.PDF`` is found
    by a directory scan exactly as it would be accepted as a positional
    argument. Entries that are not regular files (for example a directory
    named ``archive.pdf``) are ignored. The result is sorted by path.
    """
    return sorted(
        entry
        for entry in directory.iterdir()
        if entry.is_file() and entry.suffix.lower() in extensions
    )


def main():
    """Parse command-line arguments and convert every requested input.

    Inputs are taken from the positional arguments (local paths or http(s)
    URLs), from ``--input-dir``, or, when neither is given, from SCRIPT_DIR.
    Each input ends up in exactly one of three counters (converted, skipped,
    failed) which are reported in a final summary line.

    Exits with status 1 when positional inputs are combined with
    ``--input-dir`` or when ``--input-dir`` is not a directory, and with
    status 0 when there is nothing to process.
    """
    parser = argparse.ArgumentParser(
        description="Convert PDF, DOCX, and HTML files to LLM-optimized Markdown."
    )
    parser.add_argument(
        "files",
        nargs="*",
        help="Files or URLs to convert. Supports PDF, DOCX, HTML files and http(s) URLs. "
        "If omitted, converts all supported files in the current directory.",
    )
    parser.add_argument(
        "--input-dir", "-i",
        type=Path,
        help="Directory to scan for supported files (PDF, DOCX, HTML).",
    )
    parser.add_argument(
        "--force", "-f",
        action="store_true",
        help="Overwrite existing .md files.",
    )
    parser.add_argument(
        "--output-dir", "-o",
        type=Path,
        default=DEFAULT_OUTPUT_DIR,
        help=f"Output directory (default: {DEFAULT_OUTPUT_DIR}).",
    )
    parser.add_argument(
        "--strip-links",
        action="store_true",
        help="Remove markdown links, keeping only the link text.",
    )
    args = parser.parse_args()

    # Determine which files to process
    if args.files and args.input_dir:
        print("Error: cannot use both positional files and --input-dir.", file=sys.stderr)
        sys.exit(1)

    urls = []
    file_paths = []

    if args.files:
        for f in args.files:
            # URL detection
            if f.startswith(("http://", "https://")):
                urls.append(f)
                continue

            p = Path(f)
            if not p.is_absolute():
                p = Path.cwd() / p
            if not p.exists():
                print(f" NOT FOUND: {f}", file=sys.stderr)
                continue
            if p.suffix.lower() not in SUPPORTED_EXTENSIONS:
                print(f" UNSUPPORTED FORMAT: {f}", file=sys.stderr)
                continue
            file_paths.append(p)
    elif args.input_dir:
        if not args.input_dir.is_dir():
            print(f"Error: not a directory: {args.input_dir}", file=sys.stderr)
            sys.exit(1)
        file_paths = _scan_directory(args.input_dir, SUPPORTED_EXTENSIONS)
    else:
        file_paths = _scan_directory(SCRIPT_DIR, SUPPORTED_EXTENSIONS)

    if not file_paths and not urls:
        print("No supported files to process.")
        sys.exit(0)

    total = len(file_paths) + len(urls)
    print(f"Processing {total} file(s) → {args.output_dir}/\n")
    start = time.time()
    ok = 0
    fail = 0
    skip = 0

    # Process URLs
    for url in urls:
        out_name = url_to_filename(url)
        if (args.output_dir / out_name).exists() and not args.force:
            # Decide before fetching: a skipped URL should not cost a network
            # round trip, nor be reported as failed because the fetch broke.
            print(f" SKIP (exists): {out_name}")
            skip += 1
            continue

        html_content, error = fetch_url(url)
        if error:
            print(f" FAIL: {url} -- {error}", file=sys.stderr)
            fail += 1
            continue

        result = convert_html(
            None,
            args.output_dir,
            force=args.force,
            strip_links_flag=args.strip_links,
            source_url=url,
            html_content=html_content,
        )
        if result:
            ok += 1
        else:
            fail += 1

    # Process local files
    for file_path in file_paths:
        out_name = sanitize_filename(file_path.name)
        will_skip = (args.output_dir / out_name).exists() and not args.force
        # The converter is still called for skipped files: it prints the SKIP
        # line itself and returns True without doing any conversion work.
        result = convert_file(
            file_path,
            args.output_dir,
            force=args.force,
            strip_links_flag=args.strip_links,
        )
        # Exactly one counter per input, so the summary always adds up.
        if not result:
            fail += 1
        elif will_skip:
            skip += 1
        else:
            ok += 1

    elapsed = time.time() - start
    print(f"\nDone in {elapsed:.1f}s: {ok} converted, {skip} skipped, {fail} failed.")
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Converter dispatcher for any2md."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from any2md.converters.pdf import convert_pdf
|
|
7
|
+
from any2md.converters.docx import convert_docx
|
|
8
|
+
from any2md.converters.html import convert_html
|
|
9
|
+
|
|
10
|
+
# Lower-cased file suffixes accepted as input.
SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".html", ".htm"}

# Lower-cased file suffix -> converter callable used by convert_file().
CONVERTERS = {
    ".pdf": convert_pdf,
    ".docx": convert_docx,
    ".html": convert_html,
    ".htm": convert_html,
}


def convert_file(
    file_path: Path,
    output_dir: Path,
    force: bool = False,
    strip_links_flag: bool = False,
) -> bool:
    """Route *file_path* to the converter registered for its suffix.

    The suffix is lower-cased before the lookup in CONVERTERS. When no
    converter is registered, a message is written to stderr and False is
    returned; otherwise the converter's own return value is passed through.
    """
    suffix = file_path.suffix.lower()
    handler = CONVERTERS.get(suffix)
    if handler is not None:
        return handler(file_path, output_dir, force=force, strip_links_flag=strip_links_flag)

    print(f" UNSUPPORTED: {file_path.name} (no converter for {suffix})", file=sys.stderr)
    return False
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""DOCX to Markdown converter module."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import mammoth
|
|
7
|
+
import markdownify
|
|
8
|
+
|
|
9
|
+
from any2md.utils import sanitize_filename, extract_title, clean_markdown, strip_links, escape_yaml_string
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def convert_docx(
    docx_path: Path,
    output_dir: Path,
    force: bool = False,
    strip_links_flag: bool = False,
) -> bool:
    """Convert one DOCX file to Markdown with YAML frontmatter.

    The document is converted to HTML with mammoth, turned into Markdown with
    markdownify (ATX headings, images stripped), cleaned, optionally stripped
    of links, and written to *output_dir* under the sanitized file name.

    Returns True when the file was written or skipped because the output
    already exists and *force* is false; returns False after reporting the
    error on stderr when anything in the conversion raises.
    """
    target_name = sanitize_filename(docx_path.name)
    target = output_dir / target_name

    if target.exists() and not force:
        print(f" SKIP (exists): {target_name}")
        return True

    try:
        with open(docx_path, "rb") as stream:
            html = mammoth.convert_to_html(stream).value

        body = clean_markdown(
            markdownify.markdownify(html, heading_style="ATX", strip=["img"])
        )

        # Links are removed before the title and word count are derived.
        if strip_links_flag:
            body = strip_links(body)

        title = extract_title(body, docx_path.stem)

        # Word count is recorded for DOCX instead of a page count.
        word_count = len(body.split())

        # String values are passed through escape_yaml_string before quoting.
        header_lines = [
            "---",
            f'title: "{escape_yaml_string(title)}"',
            f'source_file: "{escape_yaml_string(docx_path.name)}"',
            f"word_count: {word_count}",
            "type: docx",
            "---",
        ]
        document = "\n".join(header_lines) + "\n\n" + body

        output_dir.mkdir(parents=True, exist_ok=True)
        target.write_text(document, encoding="utf-8")
        print(f" OK: {target_name} ({word_count} words)")
        return True

    except Exception as exc:
        print(f" FAIL: {docx_path.name} -- {exc}", file=sys.stderr)
        return False
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""HTML to Markdown converter module."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
import urllib.parse
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import trafilatura
|
|
10
|
+
import markdownify
|
|
11
|
+
from bs4 import BeautifulSoup
|
|
12
|
+
|
|
13
|
+
from any2md.utils import (
|
|
14
|
+
sanitize_filename,
|
|
15
|
+
extract_title,
|
|
16
|
+
clean_markdown,
|
|
17
|
+
strip_links,
|
|
18
|
+
url_to_filename,
|
|
19
|
+
escape_yaml_string,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def fetch_url(url: str) -> tuple[str | None, str | None]:
    """Download the HTML behind *url*.

    URLs whose scheme is not http or https are rejected without any network
    access.

    Returns a ``(html, error)`` pair: ``(html_string, None)`` on success,
    ``(None, error_message)`` when the scheme is rejected, the fetch yields
    nothing, or the fetch raises.
    """
    scheme = urllib.parse.urlparse(url).scheme
    if scheme not in ("http", "https"):
        return None, f"Unsupported URL scheme: {scheme!r} (only http/https allowed)"

    try:
        fetched = trafilatura.fetch_url(url)
    except Exception as exc:
        return None, f"Error fetching URL: {exc}"

    if fetched is None:
        return None, f"Failed to fetch URL: {url}"
    return fetched, None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _bs4_preclean(html: str) -> str:
    """Drop boilerplate elements from *html* and return the remaining markup.

    Every script, style, nav, header, footer, aside and iframe element is
    removed together with its contents; the rest of the document is
    serialized back to a string.
    """
    boilerplate = ["script", "style", "nav", "header", "footer", "aside", "iframe"]
    document = BeautifulSoup(html, "html.parser")
    for element in document.find_all(boilerplate):
        element.decompose()
    return str(document)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def convert_html(
    html_path: Path | None,
    output_dir: Path,
    force: bool = False,
    strip_links_flag: bool = False,
    source_url: str | None = None,
    html_content: str | None = None,
) -> bool:
    """Turn an HTML document into LLM-optimized Markdown with frontmatter.

    The HTML comes from *html_content* when that is given, otherwise from
    the file at *html_path*. If *source_url* is set, the output filename is
    derived from the URL and the frontmatter records ``source_url``;
    otherwise both are based on the name of *html_path*.

    Returns True on success (including when an existing output file is
    skipped) and False when conversion fails; failures are reported on
    stderr. Raises ValueError when neither *source_url* nor *html_path* is
    supplied.
    """
    # Choose the output name and the label used in failure messages.
    if source_url:
        out_name, name_for_error = url_to_filename(source_url), source_url
    elif html_path is not None:
        out_name, name_for_error = sanitize_filename(html_path.name), html_path.name
    else:
        raise ValueError("Either source_url or html_path must be provided")

    out_path = output_dir / out_name

    already_there = out_path.exists()
    if already_there and not force:
        print(f" SKIP (exists): {out_name}")
        return True

    try:
        # Step 1: obtain the raw HTML.
        if html_content is not None:
            raw_html = html_content
        elif html_path is None:
            raise ValueError("Either html_content or html_path must be provided")
        else:
            try:
                raw_html = html_path.read_text(encoding="utf-8")
            except UnicodeDecodeError:
                # Not valid UTF-8: retry with latin-1, which accepts any byte.
                raw_html = html_path.read_text(encoding="latin-1")

        # Step 2: remove boilerplate elements.
        cleaned_html = _bs4_preclean(raw_html)

        # Steps 3-4: prefer trafilatura; use markdownify when it yields nothing.
        extracted = trafilatura.extract(
            cleaned_html,
            include_formatting=True,
            include_links=True,
        )
        md_text = extracted or markdownify.markdownify(
            cleaned_html,
            heading_style="ATX",
            strip=["img"],
        )

        # Steps 5-6: tidy whitespace, then optionally drop link targets.
        md_text = clean_markdown(md_text)
        if strip_links_flag:
            md_text = strip_links(md_text)

        # Step 7: title, falling back to the host name or the file stem.
        fallback = (
            urllib.parse.urlparse(source_url).netloc
            if source_url
            else html_path.stem
            if html_path is not None
            else "untitled"
        )
        title = extract_title(md_text, fallback)

        # Step 8: word count over the final Markdown body.
        word_count = len(md_text.split())

        # Step 9: frontmatter, with values escaped for double-quoted YAML.
        if source_url:
            source_field = f'source_url: "{escape_yaml_string(source_url)}"'
        elif html_path is not None:
            source_field = f'source_file: "{escape_yaml_string(html_path.name)}"'
        else:
            source_field = 'source_file: "unknown"'

        header_lines = [
            "---",
            f'title: "{escape_yaml_string(title)}"',
            source_field,
            f"word_count: {word_count}",
            "type: html",
            "---",
        ]
        frontmatter = "\n".join(header_lines) + "\n\n"

        # Steps 10-11: write the result, creating the directory if needed.
        output_dir.mkdir(parents=True, exist_ok=True)
        out_path.write_text(frontmatter + md_text, encoding="utf-8")
        print(f" OK: {out_name} ({word_count} words)")
        return True

    except Exception as e:
        print(f" FAIL: {name_for_error} -- {e}", file=sys.stderr)
        return False
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""PDF to Markdown converter."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import pymupdf
|
|
7
|
+
import pymupdf4llm
|
|
8
|
+
|
|
9
|
+
from any2md.utils import sanitize_filename, extract_title, clean_markdown, strip_links, escape_yaml_string
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def convert_pdf(
    pdf_path: Path,
    output_dir: Path,
    force: bool = False,
    strip_links_flag: bool = False,
) -> bool:
    """Convert a single PDF to LLM-optimized Markdown.

    The output file is named after the sanitized PDF filename and written
    into *output_dir*, with YAML frontmatter (title, source file, page
    count, type) placed in front of the converted text.

    Args:
        pdf_path: Path of the PDF to convert.
        output_dir: Directory receiving the ``.md`` file; created if missing.
        force: Overwrite an existing output file instead of skipping it.
        strip_links_flag: Replace Markdown links with their display text.

    Returns:
        True on success (including when an existing output is skipped),
        False when conversion fails; the failure is reported on stderr.
    """
    out_name = sanitize_filename(pdf_path.name)
    out_path = output_dir / out_name

    if out_path.exists() and not force:
        print(f" SKIP (exists): {out_name}")
        return True

    try:
        # Open the document only to read its page count. The context manager
        # closes the handle even if reading the count raises.
        with pymupdf.open(str(pdf_path)) as doc:
            page_count = len(doc)

        # Convert to markdown
        md_text = pymupdf4llm.to_markdown(
            str(pdf_path),
            write_images=False,
            show_progress=False,
            force_text=True,
        )

        # Clean markdown content
        md_text = clean_markdown(md_text)

        # Optionally strip links (before frontmatter, so it is left untouched)
        if strip_links_flag:
            md_text = strip_links(md_text)

        # Extract title, falling back to the PDF's file stem
        title = extract_title(md_text, pdf_path.stem)

        # Build frontmatter (escape values for valid YAML)
        frontmatter = (
            f'---\n'
            f'title: "{escape_yaml_string(title)}"\n'
            f'source_file: "{escape_yaml_string(pdf_path.name)}"\n'
            f'pages: {page_count}\n'
            f'type: pdf\n'
            f'---\n\n'
        )

        full_text = frontmatter + md_text

        # Write output
        output_dir.mkdir(parents=True, exist_ok=True)
        out_path.write_text(full_text, encoding="utf-8")
        print(f" OK: {out_name} ({page_count} pages)")
        return True

    except Exception as e:
        print(f" FAIL: {pdf_path.name} -- {e}", file=sys.stderr)
        return False
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Shared utility functions for any2md."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import urllib.parse
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def sanitize_filename(name: str) -> str:
    """Derive a sanitized ``.md`` filename from a source filename.

    The final extension is dropped, spaces become underscores, a small set
    of punctuation characters (comma, semicolon, colon, quotes, em/en dash)
    is removed, runs of underscores are collapsed, and leading/trailing
    underscores are trimmed before ``.md`` is appended.
    """
    base = Path(name).stem.replace(" ", "_")
    # Drop punctuation that is awkward in filenames.
    base = re.sub(r"[,;:'\"\u2014\u2013]", "", base)
    # Collapse underscore runs, then trim them from both ends.
    return re.sub(r"_+", "_", base).strip("_") + ".md"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def extract_title(markdown_text: str, fallback: str) -> str:
    """Return a document title taken from the first level 1-3 heading.

    Asterisks are removed from the heading text and underscore runs become
    spaces. The heading is used only if the cleaned text is longer than 10
    characters; otherwise *fallback* is returned with its underscores
    turned into spaces.
    """
    heading = re.search(r"^#{1,3}\s+(.+)", markdown_text, re.MULTILINE)
    if heading is not None:
        candidate = heading.group(1).strip()
        # Remove emphasis markers left over from the Markdown heading.
        candidate = re.sub(r"\*+", "", candidate)
        candidate = re.sub(r"_+", " ", candidate).strip()
        if len(candidate) > 10:
            return candidate

    # No usable heading: derive the title from the fallback name.
    return fallback.replace("_", " ").strip()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def clean_markdown(text: str) -> str:
    """Clean up markdown for LLM consumption.

    Reduces excessive whitespace while preserving structure: trailing
    spaces and tabs are removed from every line, runs of three or more
    blank lines are collapsed to two, and the text is made to end with
    exactly one newline.
    """
    # Remove trailing whitespace on each line. This must run before the
    # blank-line collapse: a line holding only spaces/tabs becomes a truly
    # empty line here and is then counted as part of a blank-line run.
    text = re.sub(r"[ \t]+$", "", text, flags=re.MULTILINE)
    # Collapse 3+ consecutive blank lines to 2
    text = re.sub(r"\n{4,}", "\n\n\n", text)
    # Ensure file ends with single newline
    return text.rstrip() + "\n"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def escape_yaml_string(value: str) -> str:
    """Escape a string for safe inclusion in double-quoted YAML values.

    Backslashes and double quotes are backslash-escaped. Newline, carriage
    return, and tab are written as ``\\n``, ``\\r``, and ``\\t``; any other
    ASCII control character is written as a ``\\xNN`` escape. Without this,
    a line break inside the value would be folded by a YAML parser and
    other control characters would make the scalar invalid.
    """
    named_escapes = {
        "\\": "\\\\",
        '"': '\\"',
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    }
    pieces = []
    for ch in value:
        if ch in named_escapes:
            pieces.append(named_escapes[ch])
        elif ord(ch) < 0x20 or ord(ch) == 0x7F:
            # Remaining C0 controls and DEL have no short form; use \xNN.
            pieces.append(f"\\x{ord(ch):02x}")
        else:
            pieces.append(ch)
    return "".join(pieces)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def strip_links(text: str) -> str:
    """Replace markdown links with their display text.

    Converts ``[text](url)`` to ``text``. The URL may contain one level of
    balanced parentheses, e.g. ``https://en.wikipedia.org/wiki/Foo_(bar)``,
    so that such links are removed whole instead of leaving a stray ``)``.
    """
    # The destination is a run of non-parenthesis characters and/or
    # single-level "(...)" groups. The two alternatives start with different
    # characters, so the pattern cannot backtrack ambiguously.
    link_pattern = r"\[([^\]]+)\]\((?:[^()]|\([^()]*\))+\)"
    return re.sub(link_pattern, r"\1", text)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def url_to_filename(url: str) -> str:
    """Build a sanitized ``.md`` filename from a URL.

    The host (netloc) and path are concatenated, every dot and slash is
    turned into an underscore, leading/trailing underscores are trimmed,
    and runs of underscores are collapsed. Query string and fragment are
    not part of the result.

    Example::

        >>> url_to_filename("https://example.com/blog/my-post")
        'example_com_blog_my-post.md'
    """
    parts = urllib.parse.urlparse(url)
    # Map "." and "/" to "_" in a single pass.
    flattened = (parts.netloc + parts.path).translate(str.maketrans("./", "__"))
    return re.sub(r"_+", "_", flattened.strip("_")) + ".md"
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "any2md"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Convert PDF, DOCX, and HTML files — or web pages by URL — to clean, LLM-optimized Markdown with YAML frontmatter."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.8"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "rocklambros", email = "rock@rockcyber.com" },
|
|
14
|
+
]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Topic :: Text Processing :: Markup :: Markdown",
|
|
20
|
+
]
|
|
21
|
+
dependencies = [
|
|
22
|
+
"pymupdf",
|
|
23
|
+
"pymupdf4llm",
|
|
24
|
+
"mammoth",
|
|
25
|
+
"markdownify",
|
|
26
|
+
"trafilatura",
|
|
27
|
+
"beautifulsoup4",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.scripts]
|
|
31
|
+
any2md = "any2md.cli:main"
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Homepage = "https://github.com/rocklambros/any2md"
|
|
35
|
+
Issues = "https://github.com/rocklambros/any2md/issues"
|
|
36
|
+
|
|
37
|
+
[tool.hatch.version]
|
|
38
|
+
path = "any2md/__init__.py"
|
|
39
|
+
|
|
40
|
+
[tool.hatch.build.targets.sdist]
|
|
41
|
+
exclude = [
|
|
42
|
+
"/.github",
|
|
43
|
+
"/.claude",
|
|
44
|
+
"/.devcontainer",
|
|
45
|
+
"/.gsd",
|
|
46
|
+
"/.zerg",
|
|
47
|
+
"/input",
|
|
48
|
+
"/output",
|
|
49
|
+
"/Text",
|
|
50
|
+
"mdconv.py",
|
|
51
|
+
]
|
|
52
|
+
|
|
53
|
+
[tool.hatch.build.targets.wheel]
|
|
54
|
+
packages = ["any2md"]
|