documint2md 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- documint2md-1.0.0/LICENSE +21 -0
- documint2md-1.0.0/PKG-INFO +259 -0
- documint2md-1.0.0/README.md +214 -0
- documint2md-1.0.0/doc2md/__init__.py +25 -0
- documint2md-1.0.0/doc2md/__main__.py +8 -0
- documint2md-1.0.0/doc2md/cli.py +3675 -0
- documint2md-1.0.0/doc2md/converter.py +241 -0
- documint2md-1.0.0/doc2md/history_db.py +514 -0
- documint2md-1.0.0/doc2md/ocr.py +364 -0
- documint2md-1.0.0/documint2md.egg-info/PKG-INFO +259 -0
- documint2md-1.0.0/documint2md.egg-info/SOURCES.txt +31 -0
- documint2md-1.0.0/documint2md.egg-info/dependency_links.txt +1 -0
- documint2md-1.0.0/documint2md.egg-info/entry_points.txt +2 -0
- documint2md-1.0.0/documint2md.egg-info/requires.txt +34 -0
- documint2md-1.0.0/documint2md.egg-info/top_level.txt +1 -0
- documint2md-1.0.0/pyproject.toml +58 -0
- documint2md-1.0.0/setup.cfg +4 -0
- documint2md-1.0.0/tests/test_cli.py +585 -0
- documint2md-1.0.0/tests/test_cli_contract.py +74 -0
- documint2md-1.0.0/tests/test_converter.py +81 -0
- documint2md-1.0.0/tests/test_converter_ocr.py +66 -0
- documint2md-1.0.0/tests/test_explain_command.py +90 -0
- documint2md-1.0.0/tests/test_history_commands.py +122 -0
- documint2md-1.0.0/tests/test_history_db.py +96 -0
- documint2md-1.0.0/tests/test_history_logging.py +108 -0
- documint2md-1.0.0/tests/test_jump_command.py +64 -0
- documint2md-1.0.0/tests/test_ocr.py +114 -0
- documint2md-1.0.0/tests/test_ocr_integration.py +61 -0
- documint2md-1.0.0/tests/test_profile_commands.py +95 -0
- documint2md-1.0.0/tests/test_recent_command.py +54 -0
- documint2md-1.0.0/tests/test_slash_history_command.py +84 -0
- documint2md-1.0.0/tests/test_ui_actions.py +56 -0
- documint2md-1.0.0/tests/test_ui_command.py +56 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 DocuMint Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: documint2md
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Convert PDF, DOCX, CSV, and image files to Markdown.
|
|
5
|
+
Author: DocuMint Contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Keywords: markdown,pdf,docx,csv,cli
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
13
|
+
Requires-Python: >=3.11
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: beautifulsoup4==4.14.3
|
|
17
|
+
Requires-Dist: lxml==6.0.2
|
|
18
|
+
Requires-Dist: mammoth==1.11.0
|
|
19
|
+
Requires-Dist: markdownify==1.2.2
|
|
20
|
+
Requires-Dist: pandas==2.3.3
|
|
21
|
+
Requires-Dist: pdfminer.six==20251107
|
|
22
|
+
Requires-Dist: prompt-toolkit==3.0.48
|
|
23
|
+
Requires-Dist: tabulate==0.9.0
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest==9.0.2; extra == "dev"
|
|
26
|
+
Requires-Dist: pip-tools==7.5.2; extra == "dev"
|
|
27
|
+
Requires-Dist: build==1.4.0; extra == "dev"
|
|
28
|
+
Requires-Dist: setuptools==65.5.0; extra == "dev"
|
|
29
|
+
Requires-Dist: twine==5.1.1; extra == "dev"
|
|
30
|
+
Provides-Extra: pdftext
|
|
31
|
+
Requires-Dist: pdftext==0.6.3; extra == "pdftext"
|
|
32
|
+
Provides-Extra: marker
|
|
33
|
+
Requires-Dist: marker-pdf==1.10.1; extra == "marker"
|
|
34
|
+
Provides-Extra: pypdfium2
|
|
35
|
+
Requires-Dist: pypdfium2==4.30.0; extra == "pypdfium2"
|
|
36
|
+
Provides-Extra: ocr
|
|
37
|
+
Requires-Dist: paddleocr==3.4.0; extra == "ocr"
|
|
38
|
+
Requires-Dist: pypdfium2==4.30.0; extra == "ocr"
|
|
39
|
+
Provides-Extra: all
|
|
40
|
+
Requires-Dist: pdftext==0.6.3; extra == "all"
|
|
41
|
+
Requires-Dist: marker-pdf==1.10.1; extra == "all"
|
|
42
|
+
Requires-Dist: pypdfium2==4.30.0; extra == "all"
|
|
43
|
+
Requires-Dist: paddleocr==3.4.0; extra == "all"
|
|
44
|
+
Dynamic: license-file
|
|
45
|
+
|
|
46
|
+
# DocuMint - Convert PDF, DOCX and CSV to Markdown
|
|
47
|
+
|
|
48
|
+
DocuMint is a small Python CLI and library (package `doc2md`) that turns PDF, DOCX, CSV, and image files into consistent, deterministic Markdown. It is built for documentation flows where the same source should always produce the same Markdown output, even when run on different machines or in CI.
|
|
49
|
+
|
|
50
|
+
## Highlights
|
|
51
|
+
|
|
52
|
+
- Text-first conversions for PDF (`pdfminer.six`), DOCX (Mammoth → BeautifulSoup → `markdownify`), and CSV (Pandas + Markdown table) cover the formats you care about.
|
|
53
|
+
- OCR support for images and scanned PDFs (opt-in for PDFs).
|
|
54
|
+
- Small CLI plus a library API that can drop right into scripts, CI, or exploratory sessions.
|
|
55
|
+
- Deterministic normalization (newline, whitespace, blank lines) and CLI contracts that keep automation predictable.
|
|
56
|
+
- Interactive terminal UI with `/files`, `/format`, `/engine`, `/output`, and OCR/toggle commands when you launch `doc2md` without inputs.
|
|
57
|
+
|
|
58
|
+
## Quick start
|
|
59
|
+
|
|
60
|
+
1. Create a virtualenv, activate it, and install reproducible dependencies (Python 3.11+):
|
|
61
|
+
```powershell
|
|
62
|
+
Set-Location 'C:\path\to\DocuMint'
|
|
63
|
+
py -m venv .venv
|
|
64
|
+
& .\.venv\Scripts\Activate.ps1
|
|
65
|
+
python -m pip install --upgrade pip
|
|
66
|
+
python -m pip install --require-hashes -r requirements.txt
|
|
67
|
+
```
|
|
68
|
+
2. Convert a few sample files to confirm that it works:
|
|
69
|
+
```powershell
|
|
70
|
+
doc2md .\docs_in\sample.docx
|
|
71
|
+
python -m doc2md.cli file.pdf
|
|
72
|
+
python -m doc2md.cli table.csv
|
|
73
|
+
python -m doc2md.cli scan.png
|
|
74
|
+
```
|
|
75
|
+
3. Drop into interactive mode (no inputs) to explore `/files`, `/format`, and `/output`.
|
|
76
|
+
|
|
77
|
+
### Reproducible installs (Windows)
|
|
78
|
+
|
|
79
|
+
- Core runtime:
|
|
80
|
+
```powershell
|
|
81
|
+
python -m pip install --require-hashes -r requirements.txt
|
|
82
|
+
```
|
|
83
|
+
- Full feature set (PDF engines + OCR):
|
|
84
|
+
```powershell
|
|
85
|
+
python -m pip install --require-hashes -r requirements-all.txt
|
|
86
|
+
```
|
|
87
|
+
- Dev/test dependencies:
|
|
88
|
+
```powershell
|
|
89
|
+
python -m pip install --require-hashes -r requirements-dev.txt
|
|
90
|
+
```
|
|
91
|
+
- Regenerate lock files when dependencies change:
|
|
92
|
+
```powershell
|
|
93
|
+
.\scripts\lock_requirements.ps1
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Installation
|
|
97
|
+
|
|
98
|
+
### From TestPyPI (for testing)
|
|
99
|
+
|
|
100
|
+
```powershell
|
|
101
|
+
py -m pip install --upgrade pip
|
|
102
|
+
py -m pip install -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ documint2md
|
|
103
|
+
doc2md --help
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### From PyPI (production)
|
|
107
|
+
|
|
108
|
+
```powershell
|
|
109
|
+
py -m pip install --upgrade pip
|
|
110
|
+
py -m pip install documint2md
|
|
111
|
+
doc2md --help
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Optional extras (PDF engines + OCR) when installing from PyPI:
|
|
115
|
+
|
|
116
|
+
```powershell
|
|
117
|
+
py -m pip install "documint2md[all]"
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## CLI usage
|
|
121
|
+
|
|
122
|
+
Run `doc2md <file>` (or `python -m doc2md.cli <file>`) to convert a single input. By default the Markdown lands in `docs_out/<input filename>.md`. Use `-o <file>` to force a path and `-o -` to stream to stdout. Omit inputs to open the interactive picker, or pass `--interactive` for the picker even inside scripts.
|
|
123
|
+
|
|
124
|
+
```
|
|
125
|
+
python -m doc2md.cli file.docx -o file.md
|
|
126
|
+
python -m doc2md.cli file.pdf
|
|
127
|
+
python -m doc2md.cli table.csv
|
|
128
|
+
python -m doc2md.cli scan.png
|
|
129
|
+
doc2md # interactive mode
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### CLI contract
|
|
133
|
+
|
|
134
|
+
- Default output is `docs_out/<input filename>.md`; `-o <file>` overrides the destination, `-o -` writes to stdout.
|
|
135
|
+
- Interactive mode (no input) opens a curses-like UI tied to `docs_in`; `/files` loads the list and `/more` exposes advanced commands (history, OCR, toggles).
|
|
136
|
+
- Errors and diagnostics stream to stderr.
|
|
137
|
+
- Exit codes: `2` usage/argument error, `3` unsupported format, `4` conversion failure, `5` output write failure.
|
|
138
|
+
|
|
139
|
+
### CLI options
|
|
140
|
+
|
|
141
|
+
- `--format pdf|docx|csv|image` forces the parser instead of inferring from the extension.
|
|
142
|
+
- `--engine pdfminer|pdftext|marker` selects the PDF engine (default `pdfminer`; `marker` stays text-only unless assets are enabled explicitly).
|
|
143
|
+
- `--ocr` or `--ocr-mode auto` enables OCR fallback for PDFs when text extraction is empty.
|
|
144
|
+
- `--ocr-mode never|auto|always` controls OCR behavior for PDFs (default `never`).
|
|
145
|
+
- `--ocr-lang es` sets OCR language (default `es`).
|
|
146
|
+
- `--ocr-device cpu|gpu:0` overrides OCR device selection.
|
|
147
|
+
- `--ocr-render-scale 2.0` controls PDF render scale for OCR.
|
|
148
|
+
- `--ocr-min-score 0.5` filters low-confidence OCR text.
|
|
149
|
+
- `--csv-na ""` controls how empty values render.
|
|
150
|
+
- `--csv-float-format "%.6g"` stabilizes floating-point output when needed.
|
|
151
|
+
- `--profile <name>` loads defaults from `doc2md.toml`.
|
|
152
|
+
- `--stats`, `--profile-report`, `--quiet`, `--debug`, `--version`, `--theme`, `--interactive`, `--no-input` toggle output, logging, and interactivity.
|
|
153
|
+
|
|
154
|
+
### OCR setup (optional)
|
|
155
|
+
|
|
156
|
+
Recommended (CPU + GPU side-by-side):
|
|
157
|
+
|
|
158
|
+
```powershell
|
|
159
|
+
.\scripts\setup_ocr_envs.ps1
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
See `docs/OCR Dual Environment Setup.md` for GPU verification, fallback index, and usage.
|
|
163
|
+
|
|
164
|
+
Quick run (GPU):
|
|
165
|
+
|
|
166
|
+
```powershell
|
|
167
|
+
.\scripts\doc2md-gpu.cmd docs_in\ocr_samples\sample_text.png --ocr-lang en --ocr-device gpu:0 --yes -o docs_out\sample_text.gpu.md
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
Quick run (CPU):
|
|
171
|
+
|
|
172
|
+
```powershell
|
|
173
|
+
.\scripts\doc2md-cpu.cmd docs_in\ocr_samples\sample_text.png --ocr-lang en --ocr-device cpu --yes -o docs_out\sample_text.cpu.md
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
CPU:
|
|
177
|
+
```powershell
|
|
178
|
+
python -m pip install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
|
179
|
+
python -m pip install paddleocr==3.4.0
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
GPU (Windows; choose one CUDA index):
|
|
183
|
+
```powershell
|
|
184
|
+
python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
|
|
185
|
+
python -m pip install paddleocr==3.4.0
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
If you run into model download issues:
|
|
189
|
+
```powershell
|
|
190
|
+
$env:PADDLE_PDX_MODEL_SOURCE = "BOS"
|
|
191
|
+
$env:PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK = "True"
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
Performance tips:
|
|
195
|
+
- Batch multiple files in one command to reuse OCR initialization.
|
|
196
|
+
- For scanned PDFs, use `--ocr-render-scale 1.0` to trade accuracy for speed.
|
|
197
|
+
- Prefer `--ocr-mode auto` for PDFs so OCR runs only on textless pages.
|
|
198
|
+
- First OCR run is slow due to model downloads; subsequent runs are faster.
|
|
199
|
+
|
|
200
|
+
## Interactive mode
|
|
201
|
+
|
|
202
|
+
When you run `doc2md` without inputs, the CLI opens a full-screen picker. Interact with `/files` (space to select, enter to convert), type `/` to see the short command list, and use `/more` for advanced tools (history, profiles, UI theme, session toggles). OCR is configured via `/ocr` subcommands (e.g. `/ocr mode auto`, `/ocr lang es`). The footer keeps the current format/engine/output in view while the header shows version + cwd. Use `Ctrl+P/Ctrl+N` for command history.
|
|
203
|
+
|
|
204
|
+
## Library API
|
|
205
|
+
|
|
206
|
+
- `doc2md.pdf_to_markdown(path)` – extracts text-only Markdown from PDFs (OCR optional via `ocr_mode`).
|
|
207
|
+
- `doc2md.docx_to_markdown(path)` – converts DOCX → Mammoth HTML → Markdown via `markdownify` with deterministic heading/list settings.
|
|
208
|
+
- `doc2md.csv_to_markdown(path)` – parses CSV files with `pandas` and emits clean Markdown tables.
|
|
209
|
+
- `doc2md.image_to_markdown(path)` – runs OCR on image files and returns Markdown text.
|
|
210
|
+
- Input types: `str | PathLike`; return type: `str`.
|
|
211
|
+
- Exceptions: `ConversionError` for failures, `UnsupportedFormatError` for unsupported formats/engines.
|
|
212
|
+
|
|
213
|
+
## Normalization rules
|
|
214
|
+
|
|
215
|
+
- Normalize newlines to `\n`.
|
|
216
|
+
- Strip trailing whitespace per line.
|
|
217
|
+
- Cap consecutive blank lines at two.
|
|
218
|
+
- Remove trailing blank lines and end every non-empty output with a single newline.
|
|
219
|
+
|
|
220
|
+
## Testing & fixtures
|
|
221
|
+
|
|
222
|
+
```powershell
|
|
223
|
+
python -m pip install --require-hashes -r requirements-dev.txt
|
|
224
|
+
python -m pytest
|
|
225
|
+
python -m compileall .
|
|
226
|
+
python -m doc2md.cli .\docs_in\sample.docx -o .\docs_out\sample.docx.md
|
|
227
|
+
python -m doc2md.cli .\docs_in\sample.pdf > .\docs_out\sample.pdf.md
|
|
228
|
+
python -m doc2md.cli .\docs_in\sample.csv -o .\docs_out\sample.csv.md
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
Edge-case fixtures live in `tests/fixtures/in` with golden Markdown in `tests/fixtures/out`; `docs_in` mirrors a subset for quick manual runs.
|
|
232
|
+
|
|
233
|
+
## Publishing
|
|
234
|
+
|
|
235
|
+
```powershell
|
|
236
|
+
python -m pip install --require-hashes -r requirements-dev.txt
|
|
237
|
+
py -m pip install --upgrade build twine
|
|
238
|
+
py -m build
|
|
239
|
+
twine check dist/*
|
|
240
|
+
twine upload --repository testpypi dist/* # for test releases
|
|
241
|
+
twine upload dist/* # for PyPI when ready
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
## Release checklist
|
|
245
|
+
|
|
246
|
+
- Update `pyproject.toml` version.
|
|
247
|
+
- Regenerate `requirements.txt`, `requirements-all.txt`, and `requirements-dev.txt`.
|
|
248
|
+
- Run tests and CLI smoke conversions.
|
|
249
|
+
- Build and check distributions before upload.
|
|
250
|
+
|
|
251
|
+
## Contributing
|
|
252
|
+
|
|
253
|
+
Drop samples into `docs_in` and run the CLI to confirm conversions. Read `.github/copilot-instructions.md` for repo-specific guidance, keep diffs small, and explain fixture changes when extraction output shifts.
|
|
254
|
+
|
|
255
|
+
## Notes
|
|
256
|
+
|
|
257
|
+
- The interactive UI pauses ~2 seconds after success so the confirmation stays on screen unless you pass `--quiet`.
|
|
258
|
+
- History helpers: `doc2md history`, `search`, `rerun`, `jump`, `recent`, `explain`, and `ui`.
|
|
259
|
+
- The CLI exposes both quick (`/files`, `/format`, `/output`) and advanced (`/more`) helpers to explore settings without re-running the command.
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
# DocuMint - Convert PDF, DOCX and CSV to Markdown
|
|
2
|
+
|
|
3
|
+
DocuMint is a small Python CLI and library (package `doc2md`) that turns PDF, DOCX, CSV, and image files into consistent, deterministic Markdown. It is built for documentation flows where the same source should always produce the same Markdown output, even when run on different machines or in CI.
|
|
4
|
+
|
|
5
|
+
## Highlights
|
|
6
|
+
|
|
7
|
+
- Text-first conversions for PDF (`pdfminer.six`), DOCX (Mammoth → BeautifulSoup → `markdownify`), and CSV (Pandas + Markdown table) cover the formats you care about.
|
|
8
|
+
- OCR support for images and scanned PDFs (opt-in for PDFs).
|
|
9
|
+
- Small CLI plus a library API that can drop right into scripts, CI, or exploratory sessions.
|
|
10
|
+
- Deterministic normalization (newline, whitespace, blank lines) and CLI contracts that keep automation predictable.
|
|
11
|
+
- Interactive terminal UI with `/files`, `/format`, `/engine`, `/output`, and OCR/toggle commands when you launch `doc2md` without inputs.
|
|
12
|
+
|
|
13
|
+
## Quick start
|
|
14
|
+
|
|
15
|
+
1. Create a virtualenv, activate it, and install reproducible dependencies (Python 3.11+):
|
|
16
|
+
```powershell
|
|
17
|
+
Set-Location 'C:\path\to\DocuMint'
|
|
18
|
+
py -m venv .venv
|
|
19
|
+
& .\.venv\Scripts\Activate.ps1
|
|
20
|
+
python -m pip install --upgrade pip
|
|
21
|
+
python -m pip install --require-hashes -r requirements.txt
|
|
22
|
+
```
|
|
23
|
+
2. Convert a few sample files to confirm that it works:
|
|
24
|
+
```powershell
|
|
25
|
+
doc2md .\docs_in\sample.docx
|
|
26
|
+
python -m doc2md.cli file.pdf
|
|
27
|
+
python -m doc2md.cli table.csv
|
|
28
|
+
python -m doc2md.cli scan.png
|
|
29
|
+
```
|
|
30
|
+
3. Drop into interactive mode (no inputs) to explore `/files`, `/format`, and `/output`.
|
|
31
|
+
|
|
32
|
+
### Reproducible installs (Windows)
|
|
33
|
+
|
|
34
|
+
- Core runtime:
|
|
35
|
+
```powershell
|
|
36
|
+
python -m pip install --require-hashes -r requirements.txt
|
|
37
|
+
```
|
|
38
|
+
- Full feature set (PDF engines + OCR):
|
|
39
|
+
```powershell
|
|
40
|
+
python -m pip install --require-hashes -r requirements-all.txt
|
|
41
|
+
```
|
|
42
|
+
- Dev/test dependencies:
|
|
43
|
+
```powershell
|
|
44
|
+
python -m pip install --require-hashes -r requirements-dev.txt
|
|
45
|
+
```
|
|
46
|
+
- Regenerate lock files when dependencies change:
|
|
47
|
+
```powershell
|
|
48
|
+
.\scripts\lock_requirements.ps1
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
### From TestPyPI (for testing)
|
|
54
|
+
|
|
55
|
+
```powershell
|
|
56
|
+
py -m pip install --upgrade pip
|
|
57
|
+
py -m pip install -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ documint2md
|
|
58
|
+
doc2md --help
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### From PyPI (production)
|
|
62
|
+
|
|
63
|
+
```powershell
|
|
64
|
+
py -m pip install --upgrade pip
|
|
65
|
+
py -m pip install documint2md
|
|
66
|
+
doc2md --help
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Optional extras (PDF engines + OCR) when installing from PyPI:
|
|
70
|
+
|
|
71
|
+
```powershell
|
|
72
|
+
py -m pip install "documint2md[all]"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## CLI usage
|
|
76
|
+
|
|
77
|
+
Run `doc2md <file>` (or `python -m doc2md.cli <file>`) to convert a single input. By default the Markdown lands in `docs_out/<input filename>.md`. Use `-o <file>` to force a path and `-o -` to stream to stdout. Omit inputs to open the interactive picker, or pass `--interactive` for the picker even inside scripts.
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
python -m doc2md.cli file.docx -o file.md
|
|
81
|
+
python -m doc2md.cli file.pdf
|
|
82
|
+
python -m doc2md.cli table.csv
|
|
83
|
+
python -m doc2md.cli scan.png
|
|
84
|
+
doc2md # interactive mode
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### CLI contract
|
|
88
|
+
|
|
89
|
+
- Default output is `docs_out/<input filename>.md`; `-o <file>` overrides the destination, `-o -` writes to stdout.
|
|
90
|
+
- Interactive mode (no input) opens a curses-like UI tied to `docs_in`; `/files` loads the list and `/more` exposes advanced commands (history, OCR, toggles).
|
|
91
|
+
- Errors and diagnostics stream to stderr.
|
|
92
|
+
- Exit codes: `2` usage/argument error, `3` unsupported format, `4` conversion failure, `5` output write failure.
|
|
93
|
+
|
|
94
|
+
### CLI options
|
|
95
|
+
|
|
96
|
+
- `--format pdf|docx|csv|image` forces the parser instead of inferring from the extension.
|
|
97
|
+
- `--engine pdfminer|pdftext|marker` selects the PDF engine (default `pdfminer`; `marker` stays text-only unless assets are enabled explicitly).
|
|
98
|
+
- `--ocr` or `--ocr-mode auto` enables OCR fallback for PDFs when text extraction is empty.
|
|
99
|
+
- `--ocr-mode never|auto|always` controls OCR behavior for PDFs (default `never`).
|
|
100
|
+
- `--ocr-lang es` sets OCR language (default `es`).
|
|
101
|
+
- `--ocr-device cpu|gpu:0` overrides OCR device selection.
|
|
102
|
+
- `--ocr-render-scale 2.0` controls PDF render scale for OCR.
|
|
103
|
+
- `--ocr-min-score 0.5` filters low-confidence OCR text.
|
|
104
|
+
- `--csv-na ""` controls how empty values render.
|
|
105
|
+
- `--csv-float-format "%.6g"` stabilizes floating-point output when needed.
|
|
106
|
+
- `--profile <name>` loads defaults from `doc2md.toml`.
|
|
107
|
+
- `--stats`, `--profile-report`, `--quiet`, `--debug`, `--version`, `--theme`, `--interactive`, `--no-input` toggle output, logging, and interactivity.
|
|
108
|
+
|
|
109
|
+
### OCR setup (optional)
|
|
110
|
+
|
|
111
|
+
Recommended (CPU + GPU side-by-side):
|
|
112
|
+
|
|
113
|
+
```powershell
|
|
114
|
+
.\scripts\setup_ocr_envs.ps1
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
See `docs/OCR Dual Environment Setup.md` for GPU verification, fallback index, and usage.
|
|
118
|
+
|
|
119
|
+
Quick run (GPU):
|
|
120
|
+
|
|
121
|
+
```powershell
|
|
122
|
+
.\scripts\doc2md-gpu.cmd docs_in\ocr_samples\sample_text.png --ocr-lang en --ocr-device gpu:0 --yes -o docs_out\sample_text.gpu.md
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
Quick run (CPU):
|
|
126
|
+
|
|
127
|
+
```powershell
|
|
128
|
+
.\scripts\doc2md-cpu.cmd docs_in\ocr_samples\sample_text.png --ocr-lang en --ocr-device cpu --yes -o docs_out\sample_text.cpu.md
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
CPU:
|
|
132
|
+
```powershell
|
|
133
|
+
python -m pip install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
|
134
|
+
python -m pip install paddleocr==3.4.0
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
GPU (Windows; choose one CUDA index):
|
|
138
|
+
```powershell
|
|
139
|
+
python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
|
|
140
|
+
python -m pip install paddleocr==3.4.0
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
If you run into model download issues:
|
|
144
|
+
```powershell
|
|
145
|
+
$env:PADDLE_PDX_MODEL_SOURCE = "BOS"
|
|
146
|
+
$env:PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK = "True"
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Performance tips:
|
|
150
|
+
- Batch multiple files in one command to reuse OCR initialization.
|
|
151
|
+
- For scanned PDFs, use `--ocr-render-scale 1.0` to trade accuracy for speed.
|
|
152
|
+
- Prefer `--ocr-mode auto` for PDFs so OCR runs only on textless pages.
|
|
153
|
+
- First OCR run is slow due to model downloads; subsequent runs are faster.
|
|
154
|
+
|
|
155
|
+
## Interactive mode
|
|
156
|
+
|
|
157
|
+
When you run `doc2md` without inputs, the CLI opens a full-screen picker. Interact with `/files` (space to select, enter to convert), type `/` to see the short command list, and use `/more` for advanced tools (history, profiles, UI theme, session toggles). OCR is configured via `/ocr` subcommands (e.g. `/ocr mode auto`, `/ocr lang es`). The footer keeps the current format/engine/output in view while the header shows version + cwd. Use `Ctrl+P/Ctrl+N` for command history.
|
|
158
|
+
|
|
159
|
+
## Library API
|
|
160
|
+
|
|
161
|
+
- `doc2md.pdf_to_markdown(path)` – extracts text-only Markdown from PDFs (OCR optional via `ocr_mode`).
|
|
162
|
+
- `doc2md.docx_to_markdown(path)` – converts DOCX → Mammoth HTML → Markdown via `markdownify` with deterministic heading/list settings.
|
|
163
|
+
- `doc2md.csv_to_markdown(path)` – parses CSV files with `pandas` and emits clean Markdown tables.
|
|
164
|
+
- `doc2md.image_to_markdown(path)` – runs OCR on image files and returns Markdown text.
|
|
165
|
+
- Input types: `str | PathLike`; return type: `str`.
|
|
166
|
+
- Exceptions: `ConversionError` for failures, `UnsupportedFormatError` for unsupported formats/engines.
|
|
167
|
+
|
|
168
|
+
## Normalization rules
|
|
169
|
+
|
|
170
|
+
- Normalize newlines to `\n`.
|
|
171
|
+
- Strip trailing whitespace per line.
|
|
172
|
+
- Cap consecutive blank lines at two.
|
|
173
|
+
- Remove trailing blank lines and end every non-empty output with a single newline.
|
|
174
|
+
|
|
175
|
+
## Testing & fixtures
|
|
176
|
+
|
|
177
|
+
```powershell
|
|
178
|
+
python -m pip install --require-hashes -r requirements-dev.txt
|
|
179
|
+
python -m pytest
|
|
180
|
+
python -m compileall .
|
|
181
|
+
python -m doc2md.cli .\docs_in\sample.docx -o .\docs_out\sample.docx.md
|
|
182
|
+
python -m doc2md.cli .\docs_in\sample.pdf > .\docs_out\sample.pdf.md
|
|
183
|
+
python -m doc2md.cli .\docs_in\sample.csv -o .\docs_out\sample.csv.md
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
Edge-case fixtures live in `tests/fixtures/in` with golden Markdown in `tests/fixtures/out`; `docs_in` mirrors a subset for quick manual runs.
|
|
187
|
+
|
|
188
|
+
## Publishing
|
|
189
|
+
|
|
190
|
+
```powershell
|
|
191
|
+
python -m pip install --require-hashes -r requirements-dev.txt
|
|
192
|
+
py -m pip install --upgrade build twine
|
|
193
|
+
py -m build
|
|
194
|
+
twine check dist/*
|
|
195
|
+
twine upload --repository testpypi dist/* # for test releases
|
|
196
|
+
twine upload dist/* # for PyPI when ready
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## Release checklist
|
|
200
|
+
|
|
201
|
+
- Update `pyproject.toml` version.
|
|
202
|
+
- Regenerate `requirements.txt`, `requirements-all.txt`, and `requirements-dev.txt`.
|
|
203
|
+
- Run tests and CLI smoke conversions.
|
|
204
|
+
- Build and check distributions before upload.
|
|
205
|
+
|
|
206
|
+
## Contributing
|
|
207
|
+
|
|
208
|
+
Drop samples into `docs_in` and run the CLI to confirm conversions. Read `.github/copilot-instructions.md` for repo-specific guidance, keep diffs small, and explain fixture changes when extraction output shifts.
|
|
209
|
+
|
|
210
|
+
## Notes
|
|
211
|
+
|
|
212
|
+
- The interactive UI pauses ~2 seconds after success so the confirmation stays on screen unless you pass `--quiet`.
|
|
213
|
+
- History helpers: `doc2md history`, `search`, `rerun`, `jump`, `recent`, `explain`, and `ui`.
|
|
214
|
+
- The CLI exposes both quick (`/files`, `/format`, `/output`) and advanced (`/more`) helpers to explore settings without re-running the command.
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Simple document -> Markdown converter utilities."""
|
|
2
|
+
from importlib import metadata
|
|
3
|
+
|
|
4
|
+
try:
|
|
5
|
+
__version__ = metadata.version("documint2md")
|
|
6
|
+
except metadata.PackageNotFoundError:
|
|
7
|
+
__version__ = "0.0.0+local"
|
|
8
|
+
from .converter import (
|
|
9
|
+
ConversionError,
|
|
10
|
+
UnsupportedFormatError,
|
|
11
|
+
csv_to_markdown,
|
|
12
|
+
docx_to_markdown,
|
|
13
|
+
image_to_markdown,
|
|
14
|
+
pdf_to_markdown,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"ConversionError",
|
|
19
|
+
"UnsupportedFormatError",
|
|
20
|
+
"pdf_to_markdown",
|
|
21
|
+
"docx_to_markdown",
|
|
22
|
+
"csv_to_markdown",
|
|
23
|
+
"image_to_markdown",
|
|
24
|
+
"__version__",
|
|
25
|
+
]
|