any2md 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,40 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+
7
+ # Virtual environments
8
+ venv/
9
+ .venv/
10
+ env/
11
+
12
+ # Distribution / packaging
13
+ dist/
14
+ build/
15
+ *.egg-info/
16
+ *.egg
17
+
18
+ # IDE
19
+ .idea/
20
+ .vscode/
21
+ *.swp
22
+ *.swo
23
+
24
+ # OS
25
+ .DS_Store
26
+ Thumbs.db
27
+
28
+ # Environment variables
29
+ .env
30
+ .env.local
31
+
32
+ # Generated output
33
+ output/
34
+ Text/
35
+ input/
36
+
37
+ # ZERG / dev tooling
38
+ .gsd/
39
+ .zerg/
40
+ .devcontainer/
@@ -0,0 +1,26 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
6
+
7
+ ## [Unreleased]
8
+
9
+ ### Added
10
+ - HTML file conversion (.html, .htm) with BeautifulSoup pre-cleaning, trafilatura content extraction, and markdownify fallback
11
+ - URL fetching — convert web pages directly by passing a URL as a positional argument
12
+ - `--strip-links` flag to remove markdown hyperlinks from output, keeping only link text
13
+ - .html/.htm support in `--input-dir` batch scanning
14
+ - Package architecture (`any2md/` package with `converters` subpackage)
15
+ - `python -m any2md` entry point
16
+ - Shared utilities module (`any2md/utils.py`) with `strip_links()` and `url_to_filename()`
17
+ - YAML frontmatter for HTML outputs includes `source_url` when converted from a URL
18
+
19
+ ### Changed
20
+ - Refactored from single-file (`any2md.py`) to package architecture
21
+ - `any2md.py` is now a thin wrapper for backward compatibility
22
+ - Updated `SUPPORTED_EXTENSIONS` to include `.html` and `.htm`
23
+
24
+ ### Dependencies
25
+ - Added `trafilatura` for HTML content extraction and URL fetching
26
+ - Added `beautifulsoup4` for HTML pre-cleaning
any2md-0.4.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 rocklambros
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
any2md-0.4.0/PKG-INFO ADDED
@@ -0,0 +1,256 @@
1
+ Metadata-Version: 2.4
2
+ Name: any2md
3
+ Version: 0.4.0
4
+ Summary: Convert PDF, DOCX, and HTML files — or web pages by URL — to clean, LLM-optimized Markdown with YAML frontmatter.
5
+ Project-URL: Homepage, https://github.com/rocklambros/any2md
6
+ Project-URL: Issues, https://github.com/rocklambros/any2md/issues
7
+ Author-email: rocklambros <rock@rockcyber.com>
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
14
+ Requires-Python: >=3.8
15
+ Requires-Dist: beautifulsoup4
16
+ Requires-Dist: mammoth
17
+ Requires-Dist: markdownify
18
+ Requires-Dist: pymupdf
19
+ Requires-Dist: pymupdf4llm
20
+ Requires-Dist: trafilatura
21
+ Description-Content-Type: text/markdown
22
+
23
+ # any2md
24
+
25
+ Convert PDF, DOCX, and HTML files — or web pages by URL — to clean, LLM-optimized Markdown with YAML frontmatter.
26
+
27
+ One command. Any format. Consistent, structured output ready for language models.
28
+
29
+ ## Quick Start
30
+
31
+ ```bash
32
+ pip install any2md
33
+
34
+ any2md report.pdf
35
+ any2md https://example.com/article
36
+ any2md --help
37
+ ```
38
+
39
+ Output lands in `./Text/` by default:
40
+
41
+ ```markdown
42
+ ---
43
+ title: "Quarterly Financial Report"
44
+ source_file: "report.pdf"
45
+ pages: 12
46
+ type: pdf
47
+ ---
48
+
49
+ # Quarterly Financial Report
50
+
51
+ Document content here...
52
+ ```
53
+
54
+ ## Features
55
+
56
+ | Feature | Description |
57
+ |---------|-------------|
58
+ | **Multi-format** | PDF, DOCX, HTML (.html, .htm) |
59
+ | **URL fetching** | Pass any http/https URL as input |
60
+ | **YAML frontmatter** | Title, source, page/word count, type |
61
+ | **Batch processing** | Single file, directory scan, or mixed inputs |
62
+ | **Auto-routing** | Dispatches to the correct converter by extension |
63
+ | **Smart skip** | Won't overwrite existing files unless `--force` |
64
+ | **Filename sanitization** | Spaces, special characters, unicode dashes handled |
65
+ | **Title extraction** | Pulls the first H1–H3 heading automatically |
66
+ | **Link stripping** | `--strip-links` removes hyperlinks, keeps text |
67
+
68
+ ## Installation
69
+
70
+ Requires **Python 3.8+**.
71
+
72
+ ```bash
73
+ pip install any2md
74
+ ```
75
+
76
+ ### From source
77
+
78
+ ```bash
79
+ git clone https://github.com/rocklambros/any2md.git
80
+ cd any2md
81
+ pip install .
82
+ ```
83
+
84
+ ### Dependencies
85
+
86
+ | Library | Purpose |
87
+ |---------|---------|
88
+ | [PyMuPDF](https://pymupdf.readthedocs.io/) + [pymupdf4llm](https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/) | PDF extraction |
89
+ | [mammoth](https://github.com/mwilliamson/python-mammoth) + [markdownify](https://github.com/matthewwithanm/python-markdownify) | DOCX conversion |
90
+ | [trafilatura](https://trafilatura.readthedocs.io/) + [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) | HTML/URL extraction |
91
+
92
+ ## Usage
93
+
94
+ ### Basic conversion
95
+
96
+ ```bash
97
+ # Single file
98
+ any2md report.pdf
99
+
100
+ # Multiple files
101
+ any2md report.pdf proposal.docx "meeting notes.pdf"
102
+
103
+ # HTML file
104
+ any2md page.html
105
+
106
+ # Web page by URL
107
+ any2md https://example.com/article
108
+
109
+ # Mixed batch — PDFs, DOCX, HTML, and URLs together
110
+ any2md doc.pdf page.html https://example.com
111
+ ```
112
+
113
+ ### Directory scanning
114
+
115
+ ```bash
116
+ # Scan a specific directory
117
+ any2md --input-dir ./documents
118
+
119
+ # Convert everything in the current directory (default behavior)
120
+ any2md
121
+ ```
122
+
123
+ ### Options
124
+
125
+ ```bash
126
+ # Custom output directory
127
+ any2md -o ./converted report.pdf
128
+
129
+ # Overwrite existing files
130
+ any2md --force
131
+
132
+ # Strip hyperlinks from output
133
+ any2md --strip-links doc.pdf
134
+
135
+ # Combine options
136
+ any2md -f -o ./out --strip-links docs/*.pdf docs/*.docx
137
+ ```
138
+
139
+ ### Alternative invocations
140
+
141
+ ```bash
142
+ # Module mode (works without installing via pip)
143
+ python -m any2md report.pdf
144
+
145
+ # Legacy script (backward compatibility)
146
+ python3 mdconv.py report.pdf
147
+ ```
148
+
149
+ ## Output Format
150
+
151
+ Every converted file has YAML frontmatter followed by cleaned Markdown. The frontmatter fields vary by source format:
152
+
153
+ **PDF** — includes page count:
154
+
155
+ ```markdown
156
+ ---
157
+ title: "Quarterly Financial Report"
158
+ source_file: "Q3 Report 2024.pdf"
159
+ pages: 12
160
+ type: pdf
161
+ ---
162
+ ```
163
+
164
+ **DOCX** — includes word count:
165
+
166
+ ```markdown
167
+ ---
168
+ title: "Project Proposal"
169
+ source_file: "proposal.docx"
170
+ word_count: 3847
171
+ type: docx
172
+ ---
173
+ ```
174
+
175
+ **HTML file** — includes word count:
176
+
177
+ ```markdown
178
+ ---
179
+ title: "Page Title"
180
+ source_file: "page.html"
181
+ word_count: 1234
182
+ type: html
183
+ ---
184
+ ```
185
+
186
+ **URL** — records source URL instead of filename:
187
+
188
+ ```markdown
189
+ ---
190
+ title: "Article Title"
191
+ source_url: "https://example.com/article"
192
+ word_count: 567
193
+ type: html
194
+ ---
195
+ ```
196
+
197
+ ## CLI Reference
198
+
199
+ ```
200
+ usage: any2md [-h] [--input-dir PATH] [--force] [--output-dir PATH] [--strip-links] [files ...]
201
+
202
+ Convert PDF, DOCX, and HTML files to LLM-optimized Markdown.
203
+
204
+ positional arguments:
205
+ files Files or URLs to convert. Supports PDF, DOCX, HTML
206
+ files and http(s) URLs. If omitted, converts all
207
+ supported files in the current directory.
208
+
209
+ options:
210
+ -h, --help show this help message and exit
211
+ --input-dir, -i PATH Directory to scan for supported files (PDF, DOCX, HTML)
212
+ --force, -f Overwrite existing .md files
213
+ --output-dir, -o PATH Output directory (default: ./Text)
214
+ --strip-links Remove markdown links, keeping only the link text
215
+ ```
216
+
217
+ ## Architecture
218
+
219
+ ```
220
+ User Input (files, URLs, flags)
221
+
222
+
223
+ cli.py ─── parse args, classify URLs vs file paths
224
+
225
+
226
+ converters/__init__.py ─── dispatch by extension
227
+
228
+ ┌────┼────┐
229
+ ▼ ▼ ▼
230
+ pdf docx html ─── format-specific extraction
231
+ │ │ │
232
+ └────┼────┘
233
+
234
+ utils.py ─── clean, title-extract, sanitize, frontmatter
235
+
236
+
237
+ Output ─── YAML frontmatter + Markdown → output_dir/
238
+ ```
239
+
240
+ ### Extraction pipelines
241
+
242
+ | Format | Pipeline |
243
+ |--------|----------|
244
+ | **PDF** | `pymupdf4llm.to_markdown()` → clean → frontmatter |
245
+ | **DOCX** | `mammoth` (DOCX → HTML) → `markdownify` (HTML → Markdown) → clean → frontmatter |
246
+ | **HTML/URL** | BS4 pre-clean → `trafilatura` extract (fallback: `markdownify`) → clean → frontmatter |
247
+
248
+ ### Adding a new format
249
+
250
+ 1. Create `any2md/converters/newformat.py` with a `convert_newformat(path, output_dir, force, strip_links_flag) → bool` function
251
+ 2. Add the extension and function to `CONVERTERS` in `any2md/converters/__init__.py`
252
+ 3. Add the extension to `SUPPORTED_EXTENSIONS`
253
+
254
+ ## License
255
+
256
+ MIT
any2md-0.4.0/README.md ADDED
@@ -0,0 +1,234 @@
1
+ # any2md
2
+
3
+ Convert PDF, DOCX, and HTML files — or web pages by URL — to clean, LLM-optimized Markdown with YAML frontmatter.
4
+
5
+ One command. Any format. Consistent, structured output ready for language models.
6
+
7
+ ## Quick Start
8
+
9
+ ```bash
10
+ pip install any2md
11
+
12
+ any2md report.pdf
13
+ any2md https://example.com/article
14
+ any2md --help
15
+ ```
16
+
17
+ Output lands in `./Text/` by default:
18
+
19
+ ```markdown
20
+ ---
21
+ title: "Quarterly Financial Report"
22
+ source_file: "report.pdf"
23
+ pages: 12
24
+ type: pdf
25
+ ---
26
+
27
+ # Quarterly Financial Report
28
+
29
+ Document content here...
30
+ ```
31
+
32
+ ## Features
33
+
34
+ | Feature | Description |
35
+ |---------|-------------|
36
+ | **Multi-format** | PDF, DOCX, HTML (.html, .htm) |
37
+ | **URL fetching** | Pass any http/https URL as input |
38
+ | **YAML frontmatter** | Title, source, page/word count, type |
39
+ | **Batch processing** | Single file, directory scan, or mixed inputs |
40
+ | **Auto-routing** | Dispatches to the correct converter by extension |
41
+ | **Smart skip** | Won't overwrite existing files unless `--force` |
42
+ | **Filename sanitization** | Spaces, special characters, unicode dashes handled |
43
+ | **Title extraction** | Pulls the first H1–H3 heading automatically |
44
+ | **Link stripping** | `--strip-links` removes hyperlinks, keeps text |
45
+
46
+ ## Installation
47
+
48
+ Requires **Python 3.8+**.
49
+
50
+ ```bash
51
+ pip install any2md
52
+ ```
53
+
54
+ ### From source
55
+
56
+ ```bash
57
+ git clone https://github.com/rocklambros/any2md.git
58
+ cd any2md
59
+ pip install .
60
+ ```
61
+
62
+ ### Dependencies
63
+
64
+ | Library | Purpose |
65
+ |---------|---------|
66
+ | [PyMuPDF](https://pymupdf.readthedocs.io/) + [pymupdf4llm](https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/) | PDF extraction |
67
+ | [mammoth](https://github.com/mwilliamson/python-mammoth) + [markdownify](https://github.com/matthewwithanm/python-markdownify) | DOCX conversion |
68
+ | [trafilatura](https://trafilatura.readthedocs.io/) + [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) | HTML/URL extraction |
69
+
70
+ ## Usage
71
+
72
+ ### Basic conversion
73
+
74
+ ```bash
75
+ # Single file
76
+ any2md report.pdf
77
+
78
+ # Multiple files
79
+ any2md report.pdf proposal.docx "meeting notes.pdf"
80
+
81
+ # HTML file
82
+ any2md page.html
83
+
84
+ # Web page by URL
85
+ any2md https://example.com/article
86
+
87
+ # Mixed batch — PDFs, DOCX, HTML, and URLs together
88
+ any2md doc.pdf page.html https://example.com
89
+ ```
90
+
91
+ ### Directory scanning
92
+
93
+ ```bash
94
+ # Scan a specific directory
95
+ any2md --input-dir ./documents
96
+
97
+ # Convert everything in the current directory (default behavior)
98
+ any2md
99
+ ```
100
+
101
+ ### Options
102
+
103
+ ```bash
104
+ # Custom output directory
105
+ any2md -o ./converted report.pdf
106
+
107
+ # Overwrite existing files
108
+ any2md --force
109
+
110
+ # Strip hyperlinks from output
111
+ any2md --strip-links doc.pdf
112
+
113
+ # Combine options
114
+ any2md -f -o ./out --strip-links docs/*.pdf docs/*.docx
115
+ ```
116
+
117
+ ### Alternative invocations
118
+
119
+ ```bash
120
+ # Module mode (works without installing via pip)
121
+ python -m any2md report.pdf
122
+
123
+ # Legacy script (backward compatibility)
124
+ python3 mdconv.py report.pdf
125
+ ```
126
+
127
+ ## Output Format
128
+
129
+ Every converted file has YAML frontmatter followed by cleaned Markdown. The frontmatter fields vary by source format:
130
+
131
+ **PDF** — includes page count:
132
+
133
+ ```markdown
134
+ ---
135
+ title: "Quarterly Financial Report"
136
+ source_file: "Q3 Report 2024.pdf"
137
+ pages: 12
138
+ type: pdf
139
+ ---
140
+ ```
141
+
142
+ **DOCX** — includes word count:
143
+
144
+ ```markdown
145
+ ---
146
+ title: "Project Proposal"
147
+ source_file: "proposal.docx"
148
+ word_count: 3847
149
+ type: docx
150
+ ---
151
+ ```
152
+
153
+ **HTML file** — includes word count:
154
+
155
+ ```markdown
156
+ ---
157
+ title: "Page Title"
158
+ source_file: "page.html"
159
+ word_count: 1234
160
+ type: html
161
+ ---
162
+ ```
163
+
164
+ **URL** — records source URL instead of filename:
165
+
166
+ ```markdown
167
+ ---
168
+ title: "Article Title"
169
+ source_url: "https://example.com/article"
170
+ word_count: 567
171
+ type: html
172
+ ---
173
+ ```
174
+
175
+ ## CLI Reference
176
+
177
+ ```
178
+ usage: any2md [-h] [--input-dir PATH] [--force] [--output-dir PATH] [--strip-links] [files ...]
179
+
180
+ Convert PDF, DOCX, and HTML files to LLM-optimized Markdown.
181
+
182
+ positional arguments:
183
+ files Files or URLs to convert. Supports PDF, DOCX, HTML
184
+ files and http(s) URLs. If omitted, converts all
185
+ supported files in the current directory.
186
+
187
+ options:
188
+ -h, --help show this help message and exit
189
+ --input-dir, -i PATH Directory to scan for supported files (PDF, DOCX, HTML)
190
+ --force, -f Overwrite existing .md files
191
+ --output-dir, -o PATH Output directory (default: ./Text)
192
+ --strip-links Remove markdown links, keeping only the link text
193
+ ```
194
+
195
+ ## Architecture
196
+
197
+ ```
198
+ User Input (files, URLs, flags)
199
+
200
+
201
+ cli.py ─── parse args, classify URLs vs file paths
202
+
203
+
204
+ converters/__init__.py ─── dispatch by extension
205
+
206
+ ┌────┼────┐
207
+ ▼ ▼ ▼
208
+ pdf docx html ─── format-specific extraction
209
+ │ │ │
210
+ └────┼────┘
211
+
212
+ utils.py ─── clean, title-extract, sanitize, frontmatter
213
+
214
+
215
+ Output ─── YAML frontmatter + Markdown → output_dir/
216
+ ```
217
+
218
+ ### Extraction pipelines
219
+
220
+ | Format | Pipeline |
221
+ |--------|----------|
222
+ | **PDF** | `pymupdf4llm.to_markdown()` → clean → frontmatter |
223
+ | **DOCX** | `mammoth` (DOCX → HTML) → `markdownify` (HTML → Markdown) → clean → frontmatter |
224
+ | **HTML/URL** | BS4 pre-clean → `trafilatura` extract (fallback: `markdownify`) → clean → frontmatter |
225
+
226
+ ### Adding a new format
227
+
228
+ 1. Create `any2md/converters/newformat.py` with a `convert_newformat(path, output_dir, force, strip_links_flag) → bool` function
229
+ 2. Add the extension and function to `CONVERTERS` in `any2md/converters/__init__.py`
230
+ 3. Add the extension to `SUPPORTED_EXTENSIONS`
231
+
232
+ ## License
233
+
234
+ MIT
@@ -0,0 +1,3 @@
1
+ """Convert PDF, DOCX, and HTML files to LLM-optimized Markdown."""
2
+
3
+ __version__ = "0.4.0"
@@ -0,0 +1,4 @@
1
+ """Allow running any2md as a module: python -m any2md."""
2
+ from any2md.cli import main
3
+
4
+ main()
@@ -0,0 +1,145 @@
1
+ """CLI entry point for any2md."""
2
+
3
+ import argparse
4
+ import sys
5
+ import time
6
+ from pathlib import Path
7
+
8
+ from any2md.converters import convert_file, SUPPORTED_EXTENSIONS
9
+ from any2md.converters.html import convert_html, fetch_url
10
+ from any2md.utils import sanitize_filename, url_to_filename
11
+
12
# Working directory at import time; used for the default scan and output paths.
SCRIPT_DIR = Path.cwd()
DEFAULT_OUTPUT_DIR = SCRIPT_DIR / "Text"


def _scan_directory(directory):
    """Return the supported files directly inside *directory*, sorted by path.

    The suffix comparison is case-insensitive so that ``REPORT.PDF`` is picked
    up just like ``report.pdf`` (explicitly listed files are already matched
    case-insensitively). Sub-directories are ignored.
    """
    return sorted(
        p for p in directory.iterdir()
        if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS
    )


def main():
    """Parse command-line arguments and convert the requested inputs.

    Inputs are either positional files/URLs, the contents of ``--input-dir``,
    or (when neither is given) the supported files in the working directory.
    Positional files and ``--input-dir`` are mutually exclusive; using both
    exits with status 1. A summary line with converted / skipped / failed
    counts is printed at the end.
    """
    parser = argparse.ArgumentParser(
        description="Convert PDF, DOCX, and HTML files to LLM-optimized Markdown."
    )
    parser.add_argument(
        "files",
        nargs="*",
        help="Files or URLs to convert. Supports PDF, DOCX, HTML files and http(s) URLs. "
        "If omitted, converts all supported files in the current directory.",
    )
    parser.add_argument(
        "--input-dir", "-i",
        type=Path,
        help="Directory to scan for supported files (PDF, DOCX, HTML).",
    )
    parser.add_argument(
        "--force", "-f",
        action="store_true",
        help="Overwrite existing .md files.",
    )
    parser.add_argument(
        "--output-dir", "-o",
        type=Path,
        default=DEFAULT_OUTPUT_DIR,
        help=f"Output directory (default: {DEFAULT_OUTPUT_DIR}).",
    )
    parser.add_argument(
        "--strip-links",
        action="store_true",
        help="Remove markdown links, keeping only the link text.",
    )
    args = parser.parse_args()

    # Determine which files to process
    if args.files and args.input_dir:
        print("Error: cannot use both positional files and --input-dir.", file=sys.stderr)
        sys.exit(1)

    urls = []
    file_paths = []

    if args.files:
        for f in args.files:
            # URL detection
            if f.startswith(("http://", "https://")):
                urls.append(f)
                continue

            p = Path(f)
            if not p.is_absolute():
                p = Path.cwd() / p
            if not p.exists():
                print(f" NOT FOUND: {f}", file=sys.stderr)
                continue
            if p.suffix.lower() not in SUPPORTED_EXTENSIONS:
                print(f" UNSUPPORTED FORMAT: {f}", file=sys.stderr)
                continue
            file_paths.append(p)
    elif args.input_dir:
        if not args.input_dir.is_dir():
            print(f"Error: not a directory: {args.input_dir}", file=sys.stderr)
            sys.exit(1)
        file_paths = _scan_directory(args.input_dir)
    else:
        file_paths = _scan_directory(SCRIPT_DIR)

    if not file_paths and not urls:
        print("No supported files to process.")
        sys.exit(0)

    total = len(file_paths) + len(urls)
    print(f"Processing {total} file(s) → {args.output_dir}/\n")
    start = time.time()
    ok = 0
    fail = 0
    skip = 0

    # Process URLs
    for url in urls:
        out_name = url_to_filename(url)
        # Check for an existing output *before* touching the network so a
        # skipped URL does not cost a download.
        if (args.output_dir / out_name).exists() and not args.force:
            print(f" SKIP (exists): {out_name}")
            skip += 1
            continue

        html_content, error = fetch_url(url)
        if error:
            print(f" FAIL: {url} -- {error}", file=sys.stderr)
            fail += 1
            continue

        converted = convert_html(
            None,
            args.output_dir,
            force=args.force,
            strip_links_flag=args.strip_links,
            source_url=url,
            html_content=html_content,
        )
        if converted:
            ok += 1
        else:
            fail += 1

    # Process local files
    for file_path in file_paths:
        out_name = sanitize_filename(file_path.name)
        will_skip = (args.output_dir / out_name).exists() and not args.force
        # The converter itself prints the SKIP line and returns True when the
        # output already exists, so it is still called in that case.
        converted = convert_file(
            file_path,
            args.output_dir,
            force=args.force,
            strip_links_flag=args.strip_links,
        )
        if will_skip:
            skip += 1
        if not converted:
            fail += 1
        elif not will_skip:
            ok += 1

    elapsed = time.time() - start
    print(f"\nDone in {elapsed:.1f}s: {ok} converted, {skip} skipped, {fail} failed.")
@@ -0,0 +1,32 @@
1
+ """Converter dispatcher for any2md."""
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ from any2md.converters.pdf import convert_pdf
7
+ from any2md.converters.docx import convert_docx
8
+ from any2md.converters.html import convert_html
9
+
10
# File suffixes (lower-case, with leading dot) that any2md accepts.
SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".html", ".htm"}

# Suffix -> converter callable used by convert_file().
CONVERTERS = {
    ".pdf": convert_pdf,
    ".docx": convert_docx,
    ".html": convert_html,
    ".htm": convert_html,
}


def convert_file(
    file_path: Path,
    output_dir: Path,
    force: bool = False,
    strip_links_flag: bool = False,
) -> bool:
    """Route *file_path* to the converter registered for its suffix.

    The suffix is compared in lower case. Returns the converter's result, or
    False (after printing a message to stderr) when no converter is
    registered for the suffix.
    """
    suffix = file_path.suffix.lower()
    handler = CONVERTERS.get(suffix)
    if handler is not None:
        return handler(
            file_path,
            output_dir,
            force=force,
            strip_links_flag=strip_links_flag,
        )

    print(f" UNSUPPORTED: {file_path.name} (no converter for {suffix})", file=sys.stderr)
    return False
@@ -0,0 +1,72 @@
1
+ """DOCX to Markdown converter module."""
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ import mammoth
7
+ import markdownify
8
+
9
+ from any2md.utils import sanitize_filename, extract_title, clean_markdown, strip_links, escape_yaml_string
10
+
11
+
12
def convert_docx(
    docx_path: Path,
    output_dir: Path,
    force: bool = False,
    strip_links_flag: bool = False,
) -> bool:
    """Turn one DOCX file into a Markdown file with YAML frontmatter.

    The document is converted to HTML with mammoth, then to Markdown with
    markdownify (ATX headings, images stripped). An existing output file is
    left alone unless *force* is set.

    Returns True on success (including the skip case), False on failure.
    """
    target_name = sanitize_filename(docx_path.name)
    target = output_dir / target_name

    if target.exists() and not force:
        print(f" SKIP (exists): {target_name}")
        return True

    try:
        with open(docx_path, "rb") as handle:
            html = mammoth.convert_to_html(handle).value

        body = markdownify.markdownify(html, heading_style="ATX", strip=["img"])
        body = clean_markdown(body)

        # Links are removed before the title and word count are derived.
        if strip_links_flag:
            body = strip_links(body)

        title = extract_title(body, docx_path.stem)

        # DOCX has no reliable page count, so a word count is recorded instead.
        word_count = len(body.split())

        # Values are escaped so the frontmatter stays valid YAML.
        header = "".join(
            (
                f'---\n',
                f'title: "{escape_yaml_string(title)}"\n',
                f'source_file: "{escape_yaml_string(docx_path.name)}"\n',
                f'word_count: {word_count}\n',
                f'type: docx\n',
                f'---\n\n',
            )
        )

        output_dir.mkdir(parents=True, exist_ok=True)
        target.write_text(header + body, encoding="utf-8")
        print(f" OK: {target_name} ({word_count} words)")
        return True

    except Exception as exc:
        print(f" FAIL: {docx_path.name} -- {exc}", file=sys.stderr)
        return False
@@ -0,0 +1,162 @@
1
+ """HTML to Markdown converter module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ import urllib.parse
7
+ from pathlib import Path
8
+
9
+ import trafilatura
10
+ import markdownify
11
+ from bs4 import BeautifulSoup
12
+
13
+ from any2md.utils import (
14
+ sanitize_filename,
15
+ extract_title,
16
+ clean_markdown,
17
+ strip_links,
18
+ url_to_filename,
19
+ escape_yaml_string,
20
+ )
21
+
22
+
23
def fetch_url(url: str) -> tuple[str | None, str | None]:
    """Download the HTML behind *url* using trafilatura.

    URLs whose scheme is not http or https are rejected without any
    network access.

    Returns ``(html, None)`` when the download succeeds and
    ``(None, message)`` when it does not.
    """
    scheme = urllib.parse.urlparse(url).scheme
    if scheme != "http" and scheme != "https":
        return None, f"Unsupported URL scheme: {scheme!r} (only http/https allowed)"

    try:
        fetched = trafilatura.fetch_url(url)
    except Exception as exc:
        return None, f"Error fetching URL: {exc}"

    # trafilatura signals a failed download by returning None.
    if fetched is None:
        return None, f"Failed to fetch URL: {url}"
    return fetched, None
41
+
42
+
43
def _bs4_preclean(html: str) -> str:
    """Drop boilerplate elements from *html* and return the remaining markup.

    Every script, style, nav, header, footer, aside and iframe element is
    removed together with its contents.
    """
    boilerplate_tags = ["script", "style", "nav", "header", "footer", "aside", "iframe"]
    document = BeautifulSoup(html, "html.parser")
    for element in document.find_all(boilerplate_tags):
        element.decompose()
    return str(document)
53
+
54
+
55
def convert_html(
    html_path: Path | None,
    output_dir: Path,
    force: bool = False,
    strip_links_flag: bool = False,
    source_url: str | None = None,
    html_content: str | None = None,
) -> bool:
    """Convert HTML to LLM-optimized Markdown.

    When *html_content* is provided it is used directly; otherwise the file
    at *html_path* is read (UTF-8 first, latin-1 as a fallback). When
    *source_url* is set, the output filename and frontmatter are derived from
    the URL instead of a local filename.

    Raises ValueError when neither *source_url* nor *html_path* is given.

    Returns True on success (including the skip case), False on failure.
    """
    # Determine output filename
    if source_url:
        out_name = url_to_filename(source_url)
        name_for_error = source_url
    elif html_path is not None:
        out_name = sanitize_filename(html_path.name)
        name_for_error = html_path.name
    else:
        raise ValueError("Either source_url or html_path must be provided")

    out_path = output_dir / out_name

    if out_path.exists() and not force:
        print(f" SKIP (exists): {out_name}")
        return True

    try:
        # 1. Acquire HTML
        if html_content is not None:
            raw_html = html_content
        elif html_path is not None:
            try:
                raw_html = html_path.read_text(encoding="utf-8")
            except UnicodeDecodeError:
                # latin-1 can decode any byte sequence, so this never fails.
                raw_html = html_path.read_text(encoding="latin-1")
        else:
            # Reached when only source_url was supplied without content.
            raise ValueError("Either html_content or html_path must be provided")

        # 2. BS4 pre-clean
        cleaned_html = _bs4_preclean(raw_html)

        # 3. trafilatura extract
        md_text = trafilatura.extract(
            cleaned_html,
            include_formatting=True,
            include_links=True,
        )

        # 4. Fallback to markdownify if trafilatura returned nothing
        if not md_text:
            md_text = markdownify.markdownify(
                cleaned_html,
                heading_style="ATX",
                strip=["img"],
            )

        # 5. Clean markdown
        md_text = clean_markdown(md_text)

        # 6. Optionally strip links
        if strip_links_flag:
            md_text = strip_links(md_text)

        # 7. Title fallback and frontmatter source field. The check at the top
        #    of the function guarantees that html_path is not None whenever
        #    source_url is falsy, so no third case exists here.
        if source_url:
            fallback = urllib.parse.urlparse(source_url).netloc
            source_field = f'source_url: "{escape_yaml_string(source_url)}"'
        else:
            fallback = html_path.stem
            source_field = f'source_file: "{escape_yaml_string(html_path.name)}"'
        title = extract_title(md_text, fallback)

        # 8. Word count
        word_count = len(md_text.split())

        # 9. Build frontmatter (escape values for valid YAML)
        frontmatter = (
            f'---\n'
            f'title: "{escape_yaml_string(title)}"\n'
            f'{source_field}\n'
            f'word_count: {word_count}\n'
            f'type: html\n'
            f'---\n\n'
        )

        # 10. Write output
        full_text = frontmatter + md_text
        output_dir.mkdir(parents=True, exist_ok=True)
        out_path.write_text(full_text, encoding="utf-8")
        print(f" OK: {out_name} ({word_count} words)")
        return True

    except Exception as e:
        print(f" FAIL: {name_for_error} -- {e}", file=sys.stderr)
        return False
@@ -0,0 +1,73 @@
1
+ """PDF to Markdown converter."""
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ import pymupdf
7
+ import pymupdf4llm
8
+
9
+ from any2md.utils import sanitize_filename, extract_title, clean_markdown, strip_links, escape_yaml_string
10
+
11
+
12
def convert_pdf(
    pdf_path: Path,
    output_dir: Path,
    force: bool = False,
    strip_links_flag: bool = False,
) -> bool:
    """Convert a single PDF to LLM-optimized Markdown.

    The page count is read with PyMuPDF and the body is produced by
    ``pymupdf4llm.to_markdown``. An existing output file is left alone unless
    *force* is set.

    Returns True on success (including the skip case), False on failure.
    """
    out_name = sanitize_filename(pdf_path.name)
    out_path = output_dir / out_name

    if out_path.exists() and not force:
        print(f" SKIP (exists): {out_name}")
        return True

    try:
        # Get page count. The context manager closes the document even if
        # reading the page count raises, so the handle is never leaked.
        with pymupdf.open(str(pdf_path)) as doc:
            page_count = len(doc)

        # Convert to markdown
        md_text = pymupdf4llm.to_markdown(
            str(pdf_path),
            write_images=False,
            show_progress=False,
            force_text=True,
        )

        # Clean markdown content
        md_text = clean_markdown(md_text)

        # Optionally strip links (before frontmatter)
        if strip_links_flag:
            md_text = strip_links(md_text)

        # Extract title
        title = extract_title(md_text, pdf_path.stem)

        # Build frontmatter (escape values for valid YAML)
        frontmatter = (
            f'---\n'
            f'title: "{escape_yaml_string(title)}"\n'
            f'source_file: "{escape_yaml_string(pdf_path.name)}"\n'
            f'pages: {page_count}\n'
            f'type: pdf\n'
            f'---\n\n'
        )

        full_text = frontmatter + md_text

        # Write output
        output_dir.mkdir(parents=True, exist_ok=True)
        out_path.write_text(full_text, encoding="utf-8")
        print(f" OK: {out_name} ({page_count} pages)")
        return True

    except Exception as e:
        print(f" FAIL: {pdf_path.name} -- {e}", file=sys.stderr)
        return False
@@ -0,0 +1,86 @@
1
+ """Shared utility functions for any2md."""
2
+
3
+ import re
4
+ import urllib.parse
5
+ from pathlib import Path
6
+
7
+
8
def sanitize_filename(name: str) -> str:
    """Derive the output ``.md`` filename for a source filename.

    The extension is dropped, spaces become underscores, a small set of
    punctuation characters (comma, semicolon, colon, quotes, em/en dash) is
    removed, underscore runs are collapsed, and leading/trailing underscores
    are trimmed before ``.md`` is appended.
    """
    base = Path(name).stem.replace(" ", "_")
    # Punctuation that is awkward in filenames is dropped outright.
    base = re.sub(r"[,;:'\"\u2014\u2013]", "", base)
    # Removing characters can leave underscore runs behind; squeeze and trim.
    base = re.sub(r"_+", "_", base).strip("_")
    return f"{base}.md"
23
+
24
+
25
def extract_title(markdown_text: str, fallback: str) -> str:
    """Extract the first markdown heading (levels 1-3) as the document title.

    Asterisks are removed from the heading and underscores become spaces.
    The heading is only used when the cleaned text is longer than 10
    characters; otherwise *fallback* (typically a file stem or host name) is
    returned with underscores replaced by spaces.
    """
    # Only spaces/tabs may separate the hashes from the heading text. Using
    # ``\s`` here would let an empty heading line ("#") match across the
    # newline and capture the following paragraph as the title.
    match = re.search(r"^#{1,3}[ \t]+(.+)", markdown_text, re.MULTILINE)
    if match:
        title = match.group(1).strip()
        # Clean markdown formatting from title
        title = re.sub(r"\*+", "", title)
        title = re.sub(r"_+", " ", title)
        title = title.strip()
        if len(title) > 10:
            return title
    # Fallback: derive from filename
    return fallback.replace("_", " ").strip()
38
+
39
+
40
def clean_markdown(text: str) -> str:
    """Clean up markdown for LLM consumption.

    Removes trailing whitespace from every line, limits runs of blank lines
    to two, and makes the text end with exactly one newline.
    """
    # Remove trailing whitespace on each line. This must happen first:
    # whitespace-only lines only become truly blank after stripping, and the
    # blank-line collapse below can only see consecutive newlines.
    text = re.sub(r"[ \t]+$", "", text, flags=re.MULTILINE)
    # Collapse 3+ consecutive blank lines to 2
    text = re.sub(r"\n{4,}", "\n\n\n", text)
    # Ensure file ends with single newline
    text = text.rstrip() + "\n"
    return text
52
+
53
+
54
def escape_yaml_string(value: str) -> str:
    """Escape a string for safe inclusion in a double-quoted YAML scalar.

    Backslashes and double quotes are backslash-escaped. Line breaks, tabs
    and other control characters are written as YAML escape sequences so the
    value stays on one line and parses back to the original text; left raw, a
    newline would be folded into a space and most control characters are not
    permitted in YAML at all.
    """
    named = {
        "\\": "\\\\",
        '"': '\\"',
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    }
    pieces = []
    for ch in value:
        code = ord(ch)
        if ch in named:
            pieces.append(named[ch])
        elif code < 0x20 or code in (0x7F, 0x85):
            # Remaining C0 controls, DEL and NEL -> \xNN
            pieces.append(f"\\x{code:02x}")
        elif code in (0x2028, 0x2029):
            # Unicode line/paragraph separators -> \uNNNN
            pieces.append(f"\\u{code:04x}")
        else:
            pieces.append(ch)
    return "".join(pieces)
57
+
58
+
59
def strip_links(text: str) -> str:
    """Replace markdown links with their display text.

    Converts ``[text](url)`` to ``text``. The link destination may contain
    one level of balanced parentheses (as in many Wikipedia URLs), so no
    stray ``)`` is left behind. Image syntax ``![alt](src)`` is left
    untouched rather than being reduced to ``!alt``.
    """
    link = re.compile(
        r"(?<!!)"                        # not an image ("![alt](src)")
        r"\[([^\]]+)\]"                  # [display text]
        r"\((?:[^()]|\([^()]*\))+\)"     # (destination, balanced parens allowed)
    )
    return link.sub(r"\1", text)
65
+
66
+
67
def url_to_filename(url: str) -> str:
    """Convert a URL to a sanitized .md filename.

    Uses the netloc and path components. Dots, slashes and characters that
    are not allowed in filenames on common platforms (such as the ``:`` of a
    port number) are replaced with underscores, duplicates are collapsed and
    leading/trailing underscores are trimmed. The query string and fragment
    are not part of the name. If nothing usable remains, ``untitled.md`` is
    returned.

    Example::

        >>> url_to_filename("https://example.com/blog/my-post")
        'example_com_blog_my-post.md'
        >>> url_to_filename("http://localhost:8000/docs/")
        'localhost_8000_docs.md'
    """
    parsed = urllib.parse.urlparse(url)
    raw = parsed.netloc + parsed.path
    # Replace dots, path separators, whitespace/control characters and the
    # characters Windows forbids in filenames with underscores.
    raw = re.sub(r'[./\\:*?"<>|\s\x00-\x1f]', "_", raw)
    # Collapse multiple underscores, then strip leading/trailing ones
    raw = re.sub(r"_+", "_", raw).strip("_")
    return (raw or "untitled") + ".md"
@@ -0,0 +1,54 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "any2md"
7
+ dynamic = ["version"]
8
+ description = "Convert PDF, DOCX, and HTML files — or web pages by URL — to clean, LLM-optimized Markdown with YAML frontmatter."
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.8"
12
+ authors = [
13
+ { name = "rocklambros", email = "rock@rockcyber.com" },
14
+ ]
15
+ classifiers = [
16
+ "Programming Language :: Python :: 3",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Operating System :: OS Independent",
19
+ "Topic :: Text Processing :: Markup :: Markdown",
20
+ ]
21
+ dependencies = [
22
+ "pymupdf",
23
+ "pymupdf4llm",
24
+ "mammoth",
25
+ "markdownify",
26
+ "trafilatura",
27
+ "beautifulsoup4",
28
+ ]
29
+
30
+ [project.scripts]
31
+ any2md = "any2md.cli:main"
32
+
33
+ [project.urls]
34
+ Homepage = "https://github.com/rocklambros/any2md"
35
+ Issues = "https://github.com/rocklambros/any2md/issues"
36
+
37
+ [tool.hatch.version]
38
+ path = "any2md/__init__.py"
39
+
40
+ [tool.hatch.build.targets.sdist]
41
+ exclude = [
42
+ "/.github",
43
+ "/.claude",
44
+ "/.devcontainer",
45
+ "/.gsd",
46
+ "/.zerg",
47
+ "/input",
48
+ "/output",
49
+ "/Text",
50
+ "mdconv.py",
51
+ ]
52
+
53
+ [tool.hatch.build.targets.wheel]
54
+ packages = ["any2md"]
@@ -0,0 +1,6 @@
1
+ pymupdf
2
+ pymupdf4llm
3
+ mammoth
4
+ markdownify
5
+ trafilatura
6
+ beautifulsoup4