documint2md 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. documint2md-1.0.0/LICENSE +21 -0
  2. documint2md-1.0.0/PKG-INFO +259 -0
  3. documint2md-1.0.0/README.md +214 -0
  4. documint2md-1.0.0/doc2md/__init__.py +25 -0
  5. documint2md-1.0.0/doc2md/__main__.py +8 -0
  6. documint2md-1.0.0/doc2md/cli.py +3675 -0
  7. documint2md-1.0.0/doc2md/converter.py +241 -0
  8. documint2md-1.0.0/doc2md/history_db.py +514 -0
  9. documint2md-1.0.0/doc2md/ocr.py +364 -0
  10. documint2md-1.0.0/documint2md.egg-info/PKG-INFO +259 -0
  11. documint2md-1.0.0/documint2md.egg-info/SOURCES.txt +31 -0
  12. documint2md-1.0.0/documint2md.egg-info/dependency_links.txt +1 -0
  13. documint2md-1.0.0/documint2md.egg-info/entry_points.txt +2 -0
  14. documint2md-1.0.0/documint2md.egg-info/requires.txt +34 -0
  15. documint2md-1.0.0/documint2md.egg-info/top_level.txt +1 -0
  16. documint2md-1.0.0/pyproject.toml +58 -0
  17. documint2md-1.0.0/setup.cfg +4 -0
  18. documint2md-1.0.0/tests/test_cli.py +585 -0
  19. documint2md-1.0.0/tests/test_cli_contract.py +74 -0
  20. documint2md-1.0.0/tests/test_converter.py +81 -0
  21. documint2md-1.0.0/tests/test_converter_ocr.py +66 -0
  22. documint2md-1.0.0/tests/test_explain_command.py +90 -0
  23. documint2md-1.0.0/tests/test_history_commands.py +122 -0
  24. documint2md-1.0.0/tests/test_history_db.py +96 -0
  25. documint2md-1.0.0/tests/test_history_logging.py +108 -0
  26. documint2md-1.0.0/tests/test_jump_command.py +64 -0
  27. documint2md-1.0.0/tests/test_ocr.py +114 -0
  28. documint2md-1.0.0/tests/test_ocr_integration.py +61 -0
  29. documint2md-1.0.0/tests/test_profile_commands.py +95 -0
  30. documint2md-1.0.0/tests/test_recent_command.py +54 -0
  31. documint2md-1.0.0/tests/test_slash_history_command.py +84 -0
  32. documint2md-1.0.0/tests/test_ui_actions.py +56 -0
  33. documint2md-1.0.0/tests/test_ui_command.py +56 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 DocuMint Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,259 @@
1
+ Metadata-Version: 2.4
2
+ Name: documint2md
3
+ Version: 1.0.0
4
+ Summary: Convert PDF, DOCX, CSV, and image files to Markdown.
5
+ Author: DocuMint Contributors
6
+ License-Expression: MIT
7
+ Keywords: markdown,pdf,docx,csv,cli
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3 :: Only
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Operating System :: Microsoft :: Windows
13
+ Requires-Python: >=3.11
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: beautifulsoup4==4.14.3
17
+ Requires-Dist: lxml==6.0.2
18
+ Requires-Dist: mammoth==1.11.0
19
+ Requires-Dist: markdownify==1.2.2
20
+ Requires-Dist: pandas==2.3.3
21
+ Requires-Dist: pdfminer.six==20251107
22
+ Requires-Dist: prompt-toolkit==3.0.48
23
+ Requires-Dist: tabulate==0.9.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest==9.0.2; extra == "dev"
26
+ Requires-Dist: pip-tools==7.5.2; extra == "dev"
27
+ Requires-Dist: build==1.4.0; extra == "dev"
28
+ Requires-Dist: setuptools==65.5.0; extra == "dev"
29
+ Requires-Dist: twine==5.1.1; extra == "dev"
30
+ Provides-Extra: pdftext
31
+ Requires-Dist: pdftext==0.6.3; extra == "pdftext"
32
+ Provides-Extra: marker
33
+ Requires-Dist: marker-pdf==1.10.1; extra == "marker"
34
+ Provides-Extra: pypdfium2
35
+ Requires-Dist: pypdfium2==4.30.0; extra == "pypdfium2"
36
+ Provides-Extra: ocr
37
+ Requires-Dist: paddleocr==3.4.0; extra == "ocr"
38
+ Requires-Dist: pypdfium2==4.30.0; extra == "ocr"
39
+ Provides-Extra: all
40
+ Requires-Dist: pdftext==0.6.3; extra == "all"
41
+ Requires-Dist: marker-pdf==1.10.1; extra == "all"
42
+ Requires-Dist: pypdfium2==4.30.0; extra == "all"
43
+ Requires-Dist: paddleocr==3.4.0; extra == "all"
44
+ Dynamic: license-file
45
+
46
+ # DocuMint - Convert PDF, DOCX and CSV to Markdown
47
+
48
+ DocuMint is a small Python CLI and library (package `doc2md`) that turns PDF, DOCX, CSV, and image files into consistent, deterministic Markdown. It is built for documentation flows where the same source should always produce the same Markdown output, even when run on different machines or in CI.
49
+
50
+ ## Highlights
51
+
52
+ - Text-first conversions for PDF (`pdfminer.six`), DOCX (Mammoth → BeautifulSoup → `markdownify`), and CSV (Pandas + Markdown table) give you control over the formats you care about.
53
+ - OCR support for images and scanned PDFs (opt-in for PDFs).
54
+ - Small CLI plus a library API that can drop right into scripts, CI, or exploratory sessions.
55
+ - Deterministic normalization (newline, whitespace, blank lines) and CLI contracts that keep automation predictable.
56
+ - Interactive terminal UI with `/files`, `/format`, `/engine`, `/output`, and OCR/toggle commands when you launch `doc2md` without inputs.
57
+
58
+ ## Quick start
59
+
60
+ 1. Create a virtualenv, activate it, and install the reproducible dependencies (Python 3.11+):
61
+ ```powershell
62
+ Set-Location 'C:\path\to\DocuMint'
63
+ py -m venv .venv
64
+ & .\.venv\Scripts\Activate.ps1
65
+ python -m pip install --upgrade pip
66
+ python -m pip install --require-hashes -r requirements.txt
67
+ ```
68
+ 2. Convert a few sample files to confirm that everything works:
69
+ ```powershell
70
+ doc2md .\docs_in\sample.docx
71
+ python -m doc2md.cli file.pdf
72
+ python -m doc2md.cli table.csv
73
+ python -m doc2md.cli scan.png
74
+ ```
75
+ 3. Drop into interactive mode (no inputs) to explore `/files`, `/format`, and `/output`.
76
+
77
+ ### Reproducible installs (Windows)
78
+
79
+ - Core runtime:
80
+ ```powershell
81
+ python -m pip install --require-hashes -r requirements.txt
82
+ ```
83
+ - Full feature set (PDF engines + OCR):
84
+ ```powershell
85
+ python -m pip install --require-hashes -r requirements-all.txt
86
+ ```
87
+ - Dev/test dependencies:
88
+ ```powershell
89
+ python -m pip install --require-hashes -r requirements-dev.txt
90
+ ```
91
+ - Regenerate lock files when dependencies change:
92
+ ```powershell
93
+ .\scripts\lock_requirements.ps1
94
+ ```
95
+
96
+ ## Installation
97
+
98
+ ### From TestPyPI (for testing)
99
+
100
+ ```powershell
101
+ py -m pip install --upgrade pip
102
+ py -m pip install -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ documint2md
103
+ doc2md --help
104
+ ```
105
+
106
+ ### From PyPI (production)
107
+
108
+ ```powershell
109
+ py -m pip install --upgrade pip
110
+ py -m pip install documint2md
111
+ doc2md --help
112
+ ```
113
+
114
+ Optional extras (PDF engines + OCR) when installing from PyPI:
115
+
116
+ ```powershell
117
+ py -m pip install "documint2md[all]"
118
+ ```
119
+
120
+ ## CLI usage
121
+
122
+ Run `doc2md <file>` (or `python -m doc2md.cli <file>`) to convert a single input. By default the Markdown lands in `docs_out/<input filename>.md`. Use `-o <file>` to force a path and `-o -` to stream to stdout. Omit inputs to open the interactive picker, or pass `--interactive` for the picker even inside scripts.
123
+
124
+ ```
125
+ python -m doc2md.cli file.docx -o file.md
126
+ python -m doc2md.cli file.pdf
127
+ python -m doc2md.cli table.csv
128
+ python -m doc2md.cli scan.png
129
+ doc2md # interactive mode
130
+ ```
131
+
132
+ ### CLI contract
133
+
134
+ - Default output is `docs_out/<input filename>.md`; `-o <file>` overrides the destination, `-o -` writes to stdout.
135
+ - Interactive mode (no input) opens a curses-like UI tied to `docs_in`; `/files` loads the list and `/more` exposes advanced commands (history, OCR, toggles).
136
+ - Errors and diagnostics stream to stderr.
137
+ - Exit codes: `2` usage/argument error, `3` unsupported format, `4` conversion failure, `5` output write failure.
138
+
139
+ ### CLI options
140
+
141
+ - `--format pdf|docx|csv|image` forces the parser instead of inferring from the extension.
142
+ - `--engine pdfminer|pdftext|marker` selects the PDF engine (default `pdfminer`; `marker` stays text-only unless assets are enabled explicitly).
143
+ - `--ocr` or `--ocr-mode auto` enables OCR fallback for PDFs when text extraction is empty.
144
+ - `--ocr-mode never|auto|always` controls OCR behavior for PDFs (default `never`).
145
+ - `--ocr-lang es` sets OCR language (default `es`).
146
+ - `--ocr-device cpu|gpu:0` overrides OCR device selection.
147
+ - `--ocr-render-scale 2.0` controls PDF render scale for OCR.
148
+ - `--ocr-min-score 0.5` filters low-confidence OCR text.
149
+ - `--csv-na ""` controls how empty values render.
150
+ - `--csv-float-format "%.6g"` stabilizes floating-point output when needed.
151
+ - `--profile <name>` loads defaults from `doc2md.toml`.
152
+ - `--stats`, `--profile-report`, `--quiet`, `--debug`, `--version`, `--theme`, `--interactive`, `--no-input` toggle output, logging, and interactivity.
153
+
154
+ ### OCR setup (optional)
155
+
156
+ Recommended (CPU + GPU side-by-side):
157
+
158
+ ```powershell
159
+ .\scripts\setup_ocr_envs.ps1
160
+ ```
161
+
162
+ See `docs/OCR Dual Environment Setup.md` for GPU verification, fallback index, and usage.
163
+
164
+ Quick run (GPU):
165
+
166
+ ```powershell
167
+ .\scripts\doc2md-gpu.cmd docs_in\ocr_samples\sample_text.png --ocr-lang en --ocr-device gpu:0 --yes -o docs_out\sample_text.gpu.md
168
+ ```
169
+
170
+ Quick run (CPU):
171
+
172
+ ```powershell
173
+ .\scripts\doc2md-cpu.cmd docs_in\ocr_samples\sample_text.png --ocr-lang en --ocr-device cpu --yes -o docs_out\sample_text.cpu.md
174
+ ```
175
+
176
+ CPU:
177
+ ```powershell
178
+ python -m pip install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
179
+ python -m pip install paddleocr==3.4.0
180
+ ```
181
+
182
+ GPU (Windows; choose one CUDA index):
183
+ ```powershell
184
+ python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
185
+ python -m pip install paddleocr==3.4.0
186
+ ```
187
+
188
+ If you run into model download issues, set these environment variables:
189
+ ```powershell
190
+ $env:PADDLE_PDX_MODEL_SOURCE = "BOS"
191
+ $env:PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK = "True"
192
+ ```
193
+
194
+ Performance tips:
195
+ - Batch multiple files in one command to reuse OCR initialization.
196
+ - For scanned PDFs, use `--ocr-render-scale 1.0` to trade accuracy for speed.
197
+ - Prefer `--ocr-mode auto` for PDFs so OCR runs only on textless pages.
198
+ - First OCR run is slow due to model downloads; subsequent runs are faster.
199
+
200
+ ## Interactive mode
201
+
202
+ When you run `doc2md` without inputs, the CLI opens a full-screen picker. Interact with `/files` (space to select, enter to convert), type `/` to see the short command list, and use `/more` for advanced tools (history, profiles, UI theme, session toggles). OCR is configured via `/ocr` subcommands (e.g. `/ocr mode auto`, `/ocr lang es`). The footer keeps the current format/engine/output in view while the header shows version + cwd. Use `Ctrl+P/Ctrl+N` for command history.
203
+
204
+ ## Library API
205
+
206
+ - `doc2md.pdf_to_markdown(path)` – extracts text-only Markdown from PDFs (OCR optional via `ocr_mode`).
207
+ - `doc2md.docx_to_markdown(path)` – converts DOCX → Mammoth HTML → Markdown via `markdownify` with deterministic heading/list settings.
208
+ - `doc2md.csv_to_markdown(path)` – parses CSV files with `pandas` and emits clean Markdown tables.
209
+ - `doc2md.image_to_markdown(path)` – runs OCR on image files and returns Markdown text.
210
+ - Input types: `str | PathLike`; return type: `str`.
211
+ - Exceptions: `ConversionError` for failures, `UnsupportedFormatError` for unsupported formats/engines.
212
+
213
+ ## Normalization rules
214
+
215
+ - Normalize newlines to `\n`.
216
+ - Strip trailing whitespace per line.
217
+ - Cap consecutive blank lines at two.
218
+ - Remove trailing blank lines and end every non-empty output with a single newline.
219
+
220
+ ## Testing & fixtures
221
+
222
+ ```powershell
223
+ python -m pip install --require-hashes -r requirements-dev.txt
224
+ python -m pytest
225
+ python -m compileall .
226
+ python -m doc2md.cli .\docs_in\sample.docx -o .\docs_out\sample.docx.md
227
+ python -m doc2md.cli .\docs_in\sample.pdf > .\docs_out\sample.pdf.md
228
+ python -m doc2md.cli .\docs_in\sample.csv -o .\docs_out\sample.csv.md
229
+ ```
230
+
231
+ Edge-case fixtures live in `tests/fixtures/in` with golden Markdown in `tests/fixtures/out`; `docs_in` mirrors a subset for quick manual runs.
232
+
233
+ ## Publishing
234
+
235
+ ```powershell
236
+ python -m pip install --require-hashes -r requirements-dev.txt
237
+ py -m pip install --upgrade build twine
238
+ py -m build
239
+ twine check dist/*
240
+ twine upload --repository testpypi dist/* # for test releases
241
+ twine upload dist/* # for PyPI when ready
242
+ ```
243
+
244
+ ## Release checklist
245
+
246
+ - Update `pyproject.toml` version.
247
+ - Regenerate `requirements.txt`, `requirements-all.txt`, and `requirements-dev.txt`.
248
+ - Run tests and CLI smoke conversions.
249
+ - Build and check distributions before upload.
250
+
251
+ ## Contributing
252
+
253
+ Drop samples into `docs_in` and run the CLI to confirm conversions. Read `.github/copilot-instructions.md` for repo-specific guidance, keep diffs small, and explain fixture changes when extraction output shifts.
254
+
255
+ ## Notes
256
+
257
+ - The interactive UI pauses ~2 seconds after success so the confirmation stays on screen unless you pass `--quiet`.
258
+ - History helpers: `doc2md history`, `search`, `rerun`, `jump`, `recent`, `explain`, and `ui`.
259
+ - The CLI exposes both quick (`/files`, `/format`, `/output`) and advanced (`/more`) helpers to explore settings without re-running the command.
@@ -0,0 +1,214 @@
1
+ # DocuMint - Convert PDF, DOCX and CSV to Markdown
2
+
3
+ DocuMint is a small Python CLI and library (package `doc2md`) that turns PDF, DOCX, CSV, and image files into consistent, deterministic Markdown. It is built for documentation flows where the same source should always produce the same Markdown output, even when run on different machines or in CI.
4
+
5
+ ## Highlights
6
+
7
+ - Text-first conversions for PDF (`pdfminer.six`), DOCX (Mammoth → BeautifulSoup → `markdownify`), and CSV (Pandas + Markdown table) give you control over the formats you care about.
8
+ - OCR support for images and scanned PDFs (opt-in for PDFs).
9
+ - Small CLI plus a library API that can drop right into scripts, CI, or exploratory sessions.
10
+ - Deterministic normalization (newline, whitespace, blank lines) and CLI contracts that keep automation predictable.
11
+ - Interactive terminal UI with `/files`, `/format`, `/engine`, `/output`, and OCR/toggle commands when you launch `doc2md` without inputs.
12
+
13
+ ## Quick start
14
+
15
+ 1. Create a virtualenv, activate it, and install the reproducible dependencies (Python 3.11+):
16
+ ```powershell
17
+ Set-Location 'C:\path\to\DocuMint'
18
+ py -m venv .venv
19
+ & .\.venv\Scripts\Activate.ps1
20
+ python -m pip install --upgrade pip
21
+ python -m pip install --require-hashes -r requirements.txt
22
+ ```
23
+ 2. Convert a few sample files to confirm that everything works:
24
+ ```powershell
25
+ doc2md .\docs_in\sample.docx
26
+ python -m doc2md.cli file.pdf
27
+ python -m doc2md.cli table.csv
28
+ python -m doc2md.cli scan.png
29
+ ```
30
+ 3. Drop into interactive mode (no inputs) to explore `/files`, `/format`, and `/output`.
31
+
32
+ ### Reproducible installs (Windows)
33
+
34
+ - Core runtime:
35
+ ```powershell
36
+ python -m pip install --require-hashes -r requirements.txt
37
+ ```
38
+ - Full feature set (PDF engines + OCR):
39
+ ```powershell
40
+ python -m pip install --require-hashes -r requirements-all.txt
41
+ ```
42
+ - Dev/test dependencies:
43
+ ```powershell
44
+ python -m pip install --require-hashes -r requirements-dev.txt
45
+ ```
46
+ - Regenerate lock files when dependencies change:
47
+ ```powershell
48
+ .\scripts\lock_requirements.ps1
49
+ ```
50
+
51
+ ## Installation
52
+
53
+ ### From TestPyPI (for testing)
54
+
55
+ ```powershell
56
+ py -m pip install --upgrade pip
57
+ py -m pip install -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ documint2md
58
+ doc2md --help
59
+ ```
60
+
61
+ ### From PyPI (production)
62
+
63
+ ```powershell
64
+ py -m pip install --upgrade pip
65
+ py -m pip install documint2md
66
+ doc2md --help
67
+ ```
68
+
69
+ Optional extras (PDF engines + OCR) when installing from PyPI:
70
+
71
+ ```powershell
72
+ py -m pip install "documint2md[all]"
73
+ ```
74
+
75
+ ## CLI usage
76
+
77
+ Run `doc2md <file>` (or `python -m doc2md.cli <file>`) to convert a single input. By default the Markdown lands in `docs_out/<input filename>.md`. Use `-o <file>` to force a path and `-o -` to stream to stdout. Omit inputs to open the interactive picker, or pass `--interactive` for the picker even inside scripts.
78
+
79
+ ```
80
+ python -m doc2md.cli file.docx -o file.md
81
+ python -m doc2md.cli file.pdf
82
+ python -m doc2md.cli table.csv
83
+ python -m doc2md.cli scan.png
84
+ doc2md # interactive mode
85
+ ```
86
+
87
+ ### CLI contract
88
+
89
+ - Default output is `docs_out/<input filename>.md`; `-o <file>` overrides the destination, `-o -` writes to stdout.
90
+ - Interactive mode (no input) opens a curses-like UI tied to `docs_in`; `/files` loads the list and `/more` exposes advanced commands (history, OCR, toggles).
91
+ - Errors and diagnostics stream to stderr.
92
+ - Exit codes: `2` usage/argument error, `3` unsupported format, `4` conversion failure, `5` output write failure.
93
+
94
+ ### CLI options
95
+
96
+ - `--format pdf|docx|csv|image` forces the parser instead of inferring from the extension.
97
+ - `--engine pdfminer|pdftext|marker` selects the PDF engine (default `pdfminer`; `marker` stays text-only unless assets are enabled explicitly).
98
+ - `--ocr` or `--ocr-mode auto` enables OCR fallback for PDFs when text extraction is empty.
99
+ - `--ocr-mode never|auto|always` controls OCR behavior for PDFs (default `never`).
100
+ - `--ocr-lang es` sets OCR language (default `es`).
101
+ - `--ocr-device cpu|gpu:0` overrides OCR device selection.
102
+ - `--ocr-render-scale 2.0` controls PDF render scale for OCR.
103
+ - `--ocr-min-score 0.5` filters low-confidence OCR text.
104
+ - `--csv-na ""` controls how empty values render.
105
+ - `--csv-float-format "%.6g"` stabilizes floating-point output when needed.
106
+ - `--profile <name>` loads defaults from `doc2md.toml`.
107
+ - `--stats`, `--profile-report`, `--quiet`, `--debug`, `--version`, `--theme`, `--interactive`, `--no-input` toggle output, logging, and interactivity.
108
+
109
+ ### OCR setup (optional)
110
+
111
+ Recommended (CPU + GPU side-by-side):
112
+
113
+ ```powershell
114
+ .\scripts\setup_ocr_envs.ps1
115
+ ```
116
+
117
+ See `docs/OCR Dual Environment Setup.md` for GPU verification, fallback index, and usage.
118
+
119
+ Quick run (GPU):
120
+
121
+ ```powershell
122
+ .\scripts\doc2md-gpu.cmd docs_in\ocr_samples\sample_text.png --ocr-lang en --ocr-device gpu:0 --yes -o docs_out\sample_text.gpu.md
123
+ ```
124
+
125
+ Quick run (CPU):
126
+
127
+ ```powershell
128
+ .\scripts\doc2md-cpu.cmd docs_in\ocr_samples\sample_text.png --ocr-lang en --ocr-device cpu --yes -o docs_out\sample_text.cpu.md
129
+ ```
130
+
131
+ CPU:
132
+ ```powershell
133
+ python -m pip install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
134
+ python -m pip install paddleocr==3.4.0
135
+ ```
136
+
137
+ GPU (Windows; choose one CUDA index):
138
+ ```powershell
139
+ python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
140
+ python -m pip install paddleocr==3.4.0
141
+ ```
142
+
143
+ If you run into model download issues, set these environment variables:
144
+ ```powershell
145
+ $env:PADDLE_PDX_MODEL_SOURCE = "BOS"
146
+ $env:PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK = "True"
147
+ ```
148
+
149
+ Performance tips:
150
+ - Batch multiple files in one command to reuse OCR initialization.
151
+ - For scanned PDFs, use `--ocr-render-scale 1.0` to trade accuracy for speed.
152
+ - Prefer `--ocr-mode auto` for PDFs so OCR runs only on textless pages.
153
+ - First OCR run is slow due to model downloads; subsequent runs are faster.
154
+
155
+ ## Interactive mode
156
+
157
+ When you run `doc2md` without inputs, the CLI opens a full-screen picker. Interact with `/files` (space to select, enter to convert), type `/` to see the short command list, and use `/more` for advanced tools (history, profiles, UI theme, session toggles). OCR is configured via `/ocr` subcommands (e.g. `/ocr mode auto`, `/ocr lang es`). The footer keeps the current format/engine/output in view while the header shows version + cwd. Use `Ctrl+P/Ctrl+N` for command history.
158
+
159
+ ## Library API
160
+
161
+ - `doc2md.pdf_to_markdown(path)` – extracts text-only Markdown from PDFs (OCR optional via `ocr_mode`).
162
+ - `doc2md.docx_to_markdown(path)` – converts DOCX → Mammoth HTML → Markdown via `markdownify` with deterministic heading/list settings.
163
+ - `doc2md.csv_to_markdown(path)` – parses CSV files with `pandas` and emits clean Markdown tables.
164
+ - `doc2md.image_to_markdown(path)` – runs OCR on image files and returns Markdown text.
165
+ - Input types: `str | PathLike`; return type: `str`.
166
+ - Exceptions: `ConversionError` for failures, `UnsupportedFormatError` for unsupported formats/engines.
167
+
168
+ ## Normalization rules
169
+
170
+ - Normalize newlines to `\n`.
171
+ - Strip trailing whitespace per line.
172
+ - Cap consecutive blank lines at two.
173
+ - Remove trailing blank lines and end every non-empty output with a single newline.
174
+
175
+ ## Testing & fixtures
176
+
177
+ ```powershell
178
+ python -m pip install --require-hashes -r requirements-dev.txt
179
+ python -m pytest
180
+ python -m compileall .
181
+ python -m doc2md.cli .\docs_in\sample.docx -o .\docs_out\sample.docx.md
182
+ python -m doc2md.cli .\docs_in\sample.pdf > .\docs_out\sample.pdf.md
183
+ python -m doc2md.cli .\docs_in\sample.csv -o .\docs_out\sample.csv.md
184
+ ```
185
+
186
+ Edge-case fixtures live in `tests/fixtures/in` with golden Markdown in `tests/fixtures/out`; `docs_in` mirrors a subset for quick manual runs.
187
+
188
+ ## Publishing
189
+
190
+ ```powershell
191
+ python -m pip install --require-hashes -r requirements-dev.txt
192
+ py -m pip install --upgrade build twine
193
+ py -m build
194
+ twine check dist/*
195
+ twine upload --repository testpypi dist/* # for test releases
196
+ twine upload dist/* # for PyPI when ready
197
+ ```
198
+
199
+ ## Release checklist
200
+
201
+ - Update `pyproject.toml` version.
202
+ - Regenerate `requirements.txt`, `requirements-all.txt`, and `requirements-dev.txt`.
203
+ - Run tests and CLI smoke conversions.
204
+ - Build and check distributions before upload.
205
+
206
+ ## Contributing
207
+
208
+ Drop samples into `docs_in` and run the CLI to confirm conversions. Read `.github/copilot-instructions.md` for repo-specific guidance, keep diffs small, and explain fixture changes when extraction output shifts.
209
+
210
+ ## Notes
211
+
212
+ - The interactive UI pauses ~2 seconds after success so the confirmation stays on screen unless you pass `--quiet`.
213
+ - History helpers: `doc2md history`, `search`, `rerun`, `jump`, `recent`, `explain`, and `ui`.
214
+ - The CLI exposes both quick (`/files`, `/format`, `/output`) and advanced (`/more`) helpers to explore settings without re-running the command.
@@ -0,0 +1,25 @@
1
+ """Simple document -> Markdown converter utilities."""
2
+ from importlib import metadata
3
+
4
+ try:
5
+ __version__ = metadata.version("documint2md")
6
+ except metadata.PackageNotFoundError:
7
+ __version__ = "0.0.0+local"
8
+ from .converter import (
9
+ ConversionError,
10
+ UnsupportedFormatError,
11
+ csv_to_markdown,
12
+ docx_to_markdown,
13
+ image_to_markdown,
14
+ pdf_to_markdown,
15
+ )
16
+
17
+ __all__ = [
18
+ "ConversionError",
19
+ "UnsupportedFormatError",
20
+ "pdf_to_markdown",
21
+ "docx_to_markdown",
22
+ "csv_to_markdown",
23
+ "image_to_markdown",
24
+ "__version__",
25
+ ]
@@ -0,0 +1,8 @@
1
+ """Module entrypoint for doc2md."""
2
+ from __future__ import annotations
3
+
4
+ from .cli import main
5
+
6
+
7
+ if __name__ == "__main__":
8
+ raise SystemExit(main())