libmumd 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- libmumd-0.1.0/LICENSE +21 -0
- libmumd-0.1.0/PKG-INFO +199 -0
- libmumd-0.1.0/README.md +167 -0
- libmumd-0.1.0/libmumd/__init__.py +4 -0
- libmumd-0.1.0/libmumd/cli.py +26 -0
- libmumd-0.1.0/libmumd/convert.py +116 -0
- libmumd-0.1.0/libmumd.egg-info/PKG-INFO +199 -0
- libmumd-0.1.0/libmumd.egg-info/SOURCES.txt +12 -0
- libmumd-0.1.0/libmumd.egg-info/dependency_links.txt +1 -0
- libmumd-0.1.0/libmumd.egg-info/entry_points.txt +2 -0
- libmumd-0.1.0/libmumd.egg-info/requires.txt +5 -0
- libmumd-0.1.0/libmumd.egg-info/top_level.txt +1 -0
- libmumd-0.1.0/pyproject.toml +50 -0
- libmumd-0.1.0/setup.cfg +4 -0
libmumd-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Erfan Ashtari
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
libmumd-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: libmumd
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert documents (Office, PDF) to Markdown — optimized for Persian/Farsi and multilingual content
|
|
5
|
+
Author: Erfan Ashtari
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/erfan-ashtari/libmumd
|
|
8
|
+
Project-URL: Repository, https://github.com/erfan-ashtari/libmumd
|
|
9
|
+
Project-URL: Issues, https://github.com/erfan-ashtari/libmumd/issues
|
|
10
|
+
Keywords: markdown,pdf,document-conversion,persian,farsi,llm
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
23
|
+
Classifier: Topic :: Utilities
|
|
24
|
+
Requires-Python: >=3.8
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: pymupdf4llm
|
|
28
|
+
Requires-Dist: markitdown
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
<div align="center">
|
|
34
|
+
|
|
35
|
+
# libmumd
|
|
36
|
+
|
|
37
|
+
**Convert documents to clean, LLM-ready Markdown.**
|
|
38
|
+
|
|
39
|
+
[](https://www.python.org/downloads/)
|
|
40
|
+
[](LICENSE)
|
|
41
|
+
[](https://github.com/erfan-ashtari/libmumd/releases)
|
|
42
|
+
|
|
43
|
+
</div>
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Why libmumd?
|
|
48
|
+
|
|
49
|
+
Most document converters produce messy output — broken tables, lost formatting, garbled non-English text. **libmumd** is different:
|
|
50
|
+
|
|
51
|
+
- **Persian & Arabic first-class support** — Correctly handles RTL text, Persian typography, and Arabic script
|
|
52
|
+
- **Better than markitdown alone** — Uses PyMuPDF's layout engine for cleaner, more accurate conversion
|
|
53
|
+
- **Multi-language support** — Handles Persian, Arabic, Chinese, Japanese, Korean, and European languages
|
|
54
|
+
- **Table detection** — Automatically converts complex tables to Markdown format
|
|
55
|
+
- **Figure & image handling** — Extracts and references images properly
|
|
56
|
+
- **Layout-aware** — Preserves reading order, headers, and document structure
|
|
57
|
+
- **No GPU required** — Runs on any machine with Python
|
|
58
|
+
|
|
59
|
+
## Features
|
|
60
|
+
|
|
61
|
+
| Feature | Description |
|
|
62
|
+
|---------|-------------|
|
|
63
|
+
| **PDF → Markdown** | High-quality extraction with layout preservation |
|
|
64
|
+
| **Office → Markdown** | Convert `.docx`, `.pptx`, `.xlsx`, and more via LibreOffice |
|
|
65
|
+
| **Smart table parsing** | Complex tables become clean Markdown tables |
|
|
66
|
+
| **Image extraction** | Embedded images are saved and referenced |
|
|
67
|
+
| **Header detection** | Font sizes map to `#` heading levels automatically |
|
|
68
|
+
| **Inline formatting** | Preserves **bold**, *italic*, and `code` |
|
|
69
|
+
| **Multi-column layouts** | Reconstructs natural reading order |
|
|
70
|
+
| **OCR fallback** | Handles scanned documents when text layer is missing |
|
|
71
|
+
|
|
72
|
+
## Installation
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
pip install git+https://github.com/erfan-ashtari/libmumd.git
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Or install from source:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
git clone https://github.com/erfan-ashtari/libmumd.git
|
|
82
|
+
cd libmumd
|
|
83
|
+
pip install .
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Quick Start
|
|
87
|
+
|
|
88
|
+
### Command Line
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
# Convert PDF to Markdown
|
|
92
|
+
libmumd document.pdf
|
|
93
|
+
|
|
94
|
+
# Convert Office document
|
|
95
|
+
libmumd report.docx output.md
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Python
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from libmumd import convert_file
|
|
102
|
+
|
|
103
|
+
# Basic usage
|
|
104
|
+
result = convert_file("document.pdf")
|
|
105
|
+
print(result)
|
|
106
|
+
# {'status': 'ok', 'chars': 4523, 'output': 'document.md'}
|
|
107
|
+
|
|
108
|
+
# Custom output path
|
|
109
|
+
result = convert_file("presentation.pptx", "slides.md")
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Supported Formats
|
|
113
|
+
|
|
114
|
+
| Format | Extensions | Conversion Method |
|
|
115
|
+
|--------|------------|-------------------|
|
|
116
|
+
| PDF | `.pdf` | PyMuPDF (native) |
|
|
117
|
+
| Word | `.docx`, `.doc` | LibreOffice |
|
|
118
|
+
| PowerPoint | `.pptx`, `.ppt` | LibreOffice |
|
|
119
|
+
| Excel | `.xlsx`, `.xls` | LibreOffice |
|
|
120
|
+
| OpenDocument | `.odt`, `.odp`, `.ods` | LibreOffice |
|
|
121
|
+
| Rich Text | `.rtf` | LibreOffice |
|
|
122
|
+
| Other | Any | markitdown fallback |
|
|
123
|
+
|
|
124
|
+
## Requirements
|
|
125
|
+
|
|
126
|
+
### Python Packages (Auto-installed)
|
|
127
|
+
|
|
128
|
+
- `pymupdf4llm` — PDF extraction engine
|
|
129
|
+
- `markitdown` — Fallback converter
|
|
130
|
+
|
|
131
|
+
### LibreOffice (Required for Office Files)
|
|
132
|
+
|
|
133
|
+
LibreOffice is needed to convert Word, PowerPoint, and Excel files.
|
|
134
|
+
|
|
135
|
+
| OS | Installation |
|
|
136
|
+
|----|--------------|
|
|
137
|
+
| **Windows** | `winget install --id TheDocumentFoundation.LibreOffice` |
|
|
138
|
+
| **macOS** | `brew install --cask libreoffice` |
|
|
139
|
+
| **Linux** | `sudo apt-get install libreoffice` |
|
|
140
|
+
|
|
141
|
+
Or download from [libreoffice.org](https://www.libreoffice.org/).
|
|
142
|
+
|
|
143
|
+
> **Note:** PDF conversion works without LibreOffice. Only Office document conversion requires it.
|
|
144
|
+
|
|
145
|
+
## Output Quality Comparison
|
|
146
|
+
|
|
147
|
+
| Aspect | markitdown only | libmumd |
|
|
148
|
+
|--------|-----------------|---------|
|
|
149
|
+
| Table formatting | Inconsistent | Clean Markdown tables |
|
|
150
|
+
| Multi-language | Basic | Full Unicode support |
|
|
151
|
+
| Layout preservation | None | Reading order preserved |
|
|
152
|
+
| Image handling | Limited | Extracted and referenced |
|
|
153
|
+
| Header detection | None | Automatic heading levels |
|
|
154
|
+
|
|
155
|
+
## Persian (Farsi) & Arabic Support
|
|
156
|
+
|
|
157
|
+
libmumd is built with **Persian and Arabic documents in mind**:
|
|
158
|
+
|
|
159
|
+
- **RTL text handling** — Correctly processes right-to-left text
|
|
160
|
+
- **Persian typography** — Preserves proper character connections and diacritics
|
|
161
|
+
- **Mixed content** — Handles documents with both Persian/Arabic and English text
|
|
162
|
+
- **PDF extraction** — Extracts Persian text without garbling or losing characters
|
|
163
|
+
- **Font support** — Works with Persian fonts like IRANSans, Vazirmatn, and more
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
from libmumd import convert_file
|
|
167
|
+
|
|
168
|
+
# Convert a Persian PDF document
|
|
169
|
+
result = convert_file("persian-document.pdf")
|
|
170
|
+
# Output preserves RTL text and Persian characters correctly
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Contributing
|
|
174
|
+
|
|
175
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
176
|
+
|
|
177
|
+
1. Fork the repository
|
|
178
|
+
2. Create your feature branch (`git checkout -b feature/amazing-feature`)
|
|
179
|
+
3. Commit your changes (`git commit -m 'Add amazing feature'`)
|
|
180
|
+
4. Push to the branch (`git push origin feature/amazing-feature`)
|
|
181
|
+
5. Open a Pull Request
|
|
182
|
+
|
|
183
|
+
## License
|
|
184
|
+
|
|
185
|
+
This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
|
|
186
|
+
|
|
187
|
+
### Dependency Licenses
|
|
188
|
+
|
|
189
|
+
- **pymupdf4llm** — [AGPL-3.0](https://github.com/pymupdf/pymupdf4llm/blob/main/LICENSE) (required for PDF conversion)
|
|
190
|
+
- **markitdown** — MIT
|
|
191
|
+
- **LibreOffice** — [MPL-2.0](https://www.libreoffice.org/license/mpl-2.0/)
|
|
192
|
+
|
|
193
|
+
Users of this package must comply with the AGPL-3.0 license for pymupdf4llm.
|
|
194
|
+
|
|
195
|
+
## Acknowledgments
|
|
196
|
+
|
|
197
|
+
- [PyMuPDF4LLM](https://github.com/pymupdf/pymupdf4llm) — PDF extraction engine
|
|
198
|
+
- [markitdown](https://github.com/microsoft/markitdown) — Fallback converter
|
|
199
|
+
- [LibreOffice](https://www.libreoffice.org/) — Office document handling
|
libmumd-0.1.0/README.md
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
# libmumd
|
|
4
|
+
|
|
5
|
+
**Convert documents to clean, LLM-ready Markdown.**
|
|
6
|
+
|
|
7
|
+
[Python](https://www.python.org/downloads/)
|
|
8
|
+
[License: MIT](LICENSE)
|
|
9
|
+
[Releases](https://github.com/erfan-ashtari/libmumd/releases)
|
|
10
|
+
|
|
11
|
+
</div>
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
## Why libmumd?
|
|
16
|
+
|
|
17
|
+
Most document converters produce messy output — broken tables, lost formatting, garbled non-English text. **libmumd** is different:
|
|
18
|
+
|
|
19
|
+
- **Persian & Arabic first-class support** — Correctly handles RTL text, Persian typography, and Arabic script
|
|
20
|
+
- **Better than markitdown alone** — Uses PyMuPDF's layout engine for cleaner, more accurate conversion
|
|
21
|
+
- **Multi-language support** — Handles Persian, Arabic, Chinese, Japanese, Korean, and European languages
|
|
22
|
+
- **Table detection** — Automatically converts complex tables to Markdown format
|
|
23
|
+
- **Figure & image handling** — Extracts and references images properly
|
|
24
|
+
- **Layout-aware** — Preserves reading order, headers, and document structure
|
|
25
|
+
- **No GPU required** — Runs on any machine with Python
|
|
26
|
+
|
|
27
|
+
## Features
|
|
28
|
+
|
|
29
|
+
| Feature | Description |
|
|
30
|
+
|---------|-------------|
|
|
31
|
+
| **PDF → Markdown** | High-quality extraction with layout preservation |
|
|
32
|
+
| **Office → Markdown** | Convert `.docx`, `.pptx`, `.xlsx`, and more via LibreOffice |
|
|
33
|
+
| **Smart table parsing** | Complex tables become clean Markdown tables |
|
|
34
|
+
| **Image extraction** | Embedded images are saved and referenced |
|
|
35
|
+
| **Header detection** | Font sizes map to `#` heading levels automatically |
|
|
36
|
+
| **Inline formatting** | Preserves **bold**, *italic*, and `code` |
|
|
37
|
+
| **Multi-column layouts** | Reconstructs natural reading order |
|
|
38
|
+
| **OCR fallback** | Handles scanned documents when text layer is missing |
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install git+https://github.com/erfan-ashtari/libmumd.git
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Or install from source:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
git clone https://github.com/erfan-ashtari/libmumd.git
|
|
50
|
+
cd libmumd
|
|
51
|
+
pip install .
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Quick Start
|
|
55
|
+
|
|
56
|
+
### Command Line
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
# Convert PDF to Markdown
|
|
60
|
+
libmumd document.pdf
|
|
61
|
+
|
|
62
|
+
# Convert Office document
|
|
63
|
+
libmumd report.docx output.md
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Python
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from libmumd import convert_file
|
|
70
|
+
|
|
71
|
+
# Basic usage
|
|
72
|
+
result = convert_file("document.pdf")
|
|
73
|
+
print(result)
|
|
74
|
+
# {'status': 'ok', 'chars': 4523, 'output': '/absolute/path/to/document.md'}
|
|
75
|
+
|
|
76
|
+
# Custom output path
|
|
77
|
+
result = convert_file("presentation.pptx", "slides.md")
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Supported Formats
|
|
81
|
+
|
|
82
|
+
| Format | Extensions | Conversion Method |
|
|
83
|
+
|--------|------------|-------------------|
|
|
84
|
+
| PDF | `.pdf` | PyMuPDF (native) |
|
|
85
|
+
| Word | `.docx`, `.doc` | LibreOffice |
|
|
86
|
+
| PowerPoint | `.pptx`, `.ppt` | LibreOffice |
|
|
87
|
+
| Excel | `.xlsx`, `.xls` | LibreOffice |
|
|
88
|
+
| OpenDocument | `.odt`, `.odp`, `.ods` | LibreOffice |
|
|
89
|
+
| Rich Text | `.rtf` | LibreOffice |
|
|
90
|
+
| Other | Any | markitdown fallback |
|
|
91
|
+
|
|
92
|
+
## Requirements
|
|
93
|
+
|
|
94
|
+
### Python Packages (Auto-installed)
|
|
95
|
+
|
|
96
|
+
- `pymupdf4llm` — PDF extraction engine
|
|
97
|
+
- `markitdown` — Fallback converter
|
|
98
|
+
|
|
99
|
+
### LibreOffice (Required for Office Files)
|
|
100
|
+
|
|
101
|
+
LibreOffice is needed to convert Word, PowerPoint, and Excel files.
|
|
102
|
+
|
|
103
|
+
| OS | Installation |
|
|
104
|
+
|----|--------------|
|
|
105
|
+
| **Windows** | `winget install --id TheDocumentFoundation.LibreOffice` |
|
|
106
|
+
| **macOS** | `brew install --cask libreoffice` |
|
|
107
|
+
| **Linux** | `sudo apt-get install libreoffice` |
|
|
108
|
+
|
|
109
|
+
Or download from [libreoffice.org](https://www.libreoffice.org/).
|
|
110
|
+
|
|
111
|
+
> **Note:** PDF conversion works without LibreOffice. Only Office document conversion requires it.
|
|
112
|
+
|
|
113
|
+
## Output Quality Comparison
|
|
114
|
+
|
|
115
|
+
| Aspect | markitdown only | libmumd |
|
|
116
|
+
|--------|-----------------|---------|
|
|
117
|
+
| Table formatting | Inconsistent | Clean Markdown tables |
|
|
118
|
+
| Multi-language | Basic | Full Unicode support |
|
|
119
|
+
| Layout preservation | None | Reading order preserved |
|
|
120
|
+
| Image handling | Limited | Extracted and referenced |
|
|
121
|
+
| Header detection | None | Automatic heading levels |
|
|
122
|
+
|
|
123
|
+
## Persian (Farsi) & Arabic Support
|
|
124
|
+
|
|
125
|
+
libmumd is built with **Persian and Arabic documents in mind**:
|
|
126
|
+
|
|
127
|
+
- **RTL text handling** — Correctly processes right-to-left text
|
|
128
|
+
- **Persian typography** — Preserves proper character connections and diacritics
|
|
129
|
+
- **Mixed content** — Handles documents with both Persian/Arabic and English text
|
|
130
|
+
- **PDF extraction** — Extracts Persian text without garbling or losing characters
|
|
131
|
+
- **Font support** — Works with Persian fonts like IRANSans, Vazirmatn, and more
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from libmumd import convert_file
|
|
135
|
+
|
|
136
|
+
# Convert a Persian PDF document
|
|
137
|
+
result = convert_file("persian-document.pdf")
|
|
138
|
+
# Output preserves RTL text and Persian characters correctly
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## Contributing
|
|
142
|
+
|
|
143
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
144
|
+
|
|
145
|
+
1. Fork the repository
|
|
146
|
+
2. Create your feature branch (`git checkout -b feature/amazing-feature`)
|
|
147
|
+
3. Commit your changes (`git commit -m 'Add amazing feature'`)
|
|
148
|
+
4. Push to the branch (`git push origin feature/amazing-feature`)
|
|
149
|
+
5. Open a Pull Request
|
|
150
|
+
|
|
151
|
+
## License
|
|
152
|
+
|
|
153
|
+
This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
|
|
154
|
+
|
|
155
|
+
### Dependency Licenses
|
|
156
|
+
|
|
157
|
+
- **pymupdf4llm** — [AGPL-3.0](https://github.com/pymupdf/pymupdf4llm/blob/main/LICENSE) (required for PDF conversion)
|
|
158
|
+
- **markitdown** — MIT
|
|
159
|
+
- **LibreOffice** — [MPL-2.0](https://www.libreoffice.org/license/mpl-2.0/)
|
|
160
|
+
|
|
161
|
+
Users of this package must comply with the AGPL-3.0 license for pymupdf4llm.
|
|
162
|
+
|
|
163
|
+
## Acknowledgments
|
|
164
|
+
|
|
165
|
+
- [PyMuPDF4LLM](https://github.com/pymupdf/pymupdf4llm) — PDF extraction engine
|
|
166
|
+
- [markitdown](https://github.com/microsoft/markitdown) — Fallback converter
|
|
167
|
+
- [LibreOffice](https://www.libreoffice.org/) — Office document handling
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from .convert import convert_file
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def main():
    """Command-line entry point: convert one document to Markdown.

    Reads ``sys.argv``. The first argument is the input file; the optional
    second argument is the output path (``None`` is passed to
    ``convert_file`` when it is omitted).

    Exit status:
        0 -- conversion succeeded, or help was requested with -h/--help.
        1 -- ``convert_file`` raised; ``FAIL:<error>`` is written to stderr.
        2 -- no input file was given; usage is written to stderr.
    """
    usage = "\n".join([
        "Usage: libmumd <input_file> [output_file]",
        "\nConvert documents (Office, PDF) to Markdown.",
        "\nExamples:",
        " libmumd document.pdf",
        " libmumd document.docx output.md",
    ])

    args = sys.argv[1:]
    if args and args[0] in ('-h', '--help'):
        print(usage)
        sys.exit(0)
    if not args:
        # A missing operand is a usage error, not a success: report it on
        # stderr with a non-zero status so shell scripts can detect it.
        print(usage, file=sys.stderr)
        sys.exit(2)

    input_file = args[0]
    output_file = args[1] if len(args) > 1 else None

    try:
        result = convert_file(input_file, output_file)
        # The markitdown path returns no 'chars' key, hence the default.
        print(f"OK:{result.get('chars', 'fallback')}")
    except Exception as e:
        print(f"FAIL:{e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import shutil
|
|
3
|
+
import tempfile
|
|
4
|
+
import platform
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
# Lower-case file suffixes that convert_file routes through LibreOffice
# (office document -> PDF -> Markdown).
OFFICE_EXTS = {'.docx', '.doc', '.pptx', '.ppt', '.xlsx', '.xls', '.odt', '.odp', '.ods', '.rtf'}
# Suffix that convert_file hands directly to pymupdf4llm.
PDF_EXT = '.pdf'
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def find_soffice():
    """Locate the LibreOffice command-line executable.

    Search order:
        1. ``soffice``, then ``libreoffice``, on ``PATH``. Some installs
           expose only the ``libreoffice`` launcher, so both are tried.
        2. The default install locations on Windows and macOS.

    Returns:
        The executable path as a string, or ``None`` when nothing is found.
    """
    for launcher in ('soffice', 'libreoffice'):
        found = shutil.which(launcher)
        if found:
            return found

    system = platform.system()
    if system == 'Windows':
        candidates = [
            r'C:\Program Files\LibreOffice\program\soffice.exe',
            r'C:\Program Files (x86)\LibreOffice\program\soffice.exe',
        ]
    elif system == 'Darwin':
        candidates = ['/Applications/LibreOffice.app/Contents/MacOS/soffice']
    else:
        candidates = []

    for candidate in candidates:
        if Path(candidate).exists():
            return candidate
    return None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def install_soffice():
    """Try to install LibreOffice with the platform's package manager.

    Runs ``winget`` on Windows, ``brew`` on macOS and ``sudo apt-get`` on
    Linux, then looks the executable up again with ``find_soffice``.

    Returns:
        The path reported by ``find_soffice`` after a successful install, or
        ``None`` when the platform is not one of the three above, the
        installer exits non-zero, or the installer binary is missing.
    """
    # NOTE(review): this runs a system package manager (with sudo on Linux)
    # as a side effect of a conversion call -- confirm that is intended for
    # non-interactive use.
    installers = {
        'Windows': ['winget', 'install', '--id', 'TheDocumentFoundation.LibreOffice', '--accept-package-agreements', '--accept-source-agreements'],
        'Darwin': ['brew', 'install', '--cask', 'libreoffice'],
        'Linux': ['sudo', 'apt-get', 'install', '-y', 'libreoffice'],
    }

    command = installers.get(platform.system())
    if command is None:
        return None

    try:
        subprocess.run(command, check=True)
        return find_soffice()
    except (subprocess.CalledProcessError, FileNotFoundError):
        return None
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def convert_office_to_pdf(office_file, tmp_dir):
    """Convert an office document to PDF with headless LibreOffice.

    Args:
        office_file: Path (``str`` or ``Path``) of the document to convert.
        tmp_dir: Existing directory (``str`` or ``Path``) that receives the
            PDF and a throw-away LibreOffice profile.

    Returns:
        ``Path`` of the generated PDF, ``<tmp_dir>/<office_file stem>.pdf``.

    Raises:
        RuntimeError: LibreOffice cannot be found or installed, or it ran
            but did not produce the expected PDF.
        subprocess.TimeoutExpired: the conversion exceeded 120 seconds.
    """
    soffice = find_soffice() or install_soffice()
    if not soffice:
        raise RuntimeError("LibreOffice is not installed and could not be auto-installed")

    office_file = Path(office_file)
    # Resolve so the profile location can be expressed as a file:// URI.
    tmp_dir = Path(tmp_dir).resolve()
    expected_pdf = tmp_dir / (office_file.stem + '.pdf')

    # Use a private profile: with the default profile a headless run can
    # produce no output when another LibreOffice instance already has it open.
    profile_uri = (tmp_dir / 'lo_profile').as_uri()
    cmd = [
        soffice,
        f'-env:UserInstallation={profile_uri}',
        '--headless',
        '--convert-to', 'pdf',
        '--outdir', str(tmp_dir),
        str(office_file),
    ]
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)

    # Success is judged by the output file, not by the return code.
    if expected_pdf.exists():
        return expected_pdf

    # Fall back to stdout when stderr is empty so the error is not lost.
    detail = (result.stderr or result.stdout or '').strip()[:200]
    err = detail or 'unknown error'
    raise RuntimeError(f"LibreOffice failed: {err}")
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def convert_with_markitdown(input_file, output_file):
    """Convert ``input_file`` to Markdown by running markitdown as a subprocess.

    markitdown is started as ``<current interpreter> -m markitdown`` so the
    copy installed next to this package is used even when its console script
    is not on ``PATH`` (for example a virtualenv that is not activated).

    Args:
        input_file: Path of the document to convert.
        output_file: Path the Markdown is written to (markitdown's ``-o``).

    Raises:
        RuntimeError: markitdown exited non-zero; the message is the first
            200 characters of its stderr, or the exit code if stderr is empty.
        subprocess.TimeoutExpired: the run exceeded 60 seconds.
    """
    import sys  # local import: only this function needs the interpreter path

    # sys.executable can be empty in embedded interpreters; fall back to the
    # console script in that case.
    launcher = [sys.executable, '-m', 'markitdown'] if sys.executable else ['markitdown']
    cmd = launcher + [str(input_file), '-o', str(output_file)]

    result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    if result.returncode != 0:
        raise RuntimeError((result.stderr or '').strip()[:200] or f"markitdown exit code {result.returncode}")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _pdf_to_markdown(pdf_path, output_path):
    """Render ``pdf_path`` with pymupdf4llm, write it to ``output_path``, return the character count."""
    try:
        # Imported lazily so formats handled by markitdown alone do not
        # require pymupdf4llm to be importable.
        import pymupdf4llm
    except ImportError as exc:
        raise ImportError("pymupdf4llm not installed. Run: pip install pymupdf4llm") from exc
    md_text = pymupdf4llm.to_markdown(str(pdf_path))
    output_path.write_text(md_text, encoding='utf-8')
    return len(md_text)


def convert_file(input_file, output_file=None):
    """Convert a document to a Markdown file.

    Routing by (lower-cased) file suffix:
        * ``PDF_EXT``      -- pymupdf4llm directly.
        * ``OFFICE_EXTS``  -- LibreOffice to PDF, then pymupdf4llm; if any
          step fails, markitdown is tried on the original file.
        * anything else    -- markitdown.

    Args:
        input_file: Path of the document to convert.
        output_file: Destination path. Defaults to the input path with its
            suffix replaced by ``.md``. Missing parent directories are created.

    Returns:
        A dict with ``"status": "ok"`` and ``"output"`` (absolute output
        path as a string), plus ``"chars"`` (length of the Markdown text)
        on the pymupdf4llm path or ``"method": "markitdown"`` on the
        markitdown path.

    Raises:
        FileNotFoundError: ``input_file`` is not an existing file.
        ImportError: a PDF was given but pymupdf4llm is not installed.
        RuntimeError: every applicable converter failed.
    """
    input_path = Path(input_file).resolve()
    # Validate the input first so a bad path is reported as such and no
    # output directory is created for it.
    if not input_path.is_file():
        raise FileNotFoundError(f"File not found: {input_path}")

    if output_file is None:
        output_path = input_path.with_suffix('.md')
    else:
        output_path = Path(output_file).resolve()
    output_path.parent.mkdir(parents=True, exist_ok=True)

    ext = input_path.suffix.lower()

    if ext == PDF_EXT:
        chars = _pdf_to_markdown(input_path, output_path)
        return {"status": "ok", "chars": chars, "output": str(output_path)}

    if ext in OFFICE_EXTS:
        tmp_dir = Path(tempfile.mkdtemp(prefix="office2md_"))
        try:
            pdf_file = convert_office_to_pdf(input_path, tmp_dir)
            chars = _pdf_to_markdown(pdf_file, output_path)
            return {"status": "ok", "chars": chars, "output": str(output_path)}
        except Exception as e:
            # Best effort: any failure on the LibreOffice route falls back
            # to markitdown on the original document.
            try:
                convert_with_markitdown(input_path, output_path)
                return {"status": "ok", "method": "markitdown", "output": str(output_path)}
            except Exception as e2:
                raise RuntimeError(f"{e} (markitdown fallback: {e2})") from e2
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)

    try:
        convert_with_markitdown(input_path, output_path)
    except Exception as e:
        raise RuntimeError(f"Unsupported extension {ext}: {e}") from e
    return {"status": "ok", "method": "markitdown", "output": str(output_path)}
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def convert(input_file, output_file=None):
    """Alias for ``convert_file``: same arguments, same return value."""
    outcome = convert_file(input_file, output_file)
    return outcome
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: libmumd
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert documents (Office, PDF) to Markdown — optimized for Persian/Farsi and multilingual content
|
|
5
|
+
Author: Erfan Ashtari
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/erfan-ashtari/libmumd
|
|
8
|
+
Project-URL: Repository, https://github.com/erfan-ashtari/libmumd
|
|
9
|
+
Project-URL: Issues, https://github.com/erfan-ashtari/libmumd/issues
|
|
10
|
+
Keywords: markdown,pdf,document-conversion,persian,farsi,llm
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
23
|
+
Classifier: Topic :: Utilities
|
|
24
|
+
Requires-Python: >=3.8
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: pymupdf4llm
|
|
28
|
+
Requires-Dist: markitdown
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
<div align="center">
|
|
34
|
+
|
|
35
|
+
# libmumd
|
|
36
|
+
|
|
37
|
+
**Convert documents to clean, LLM-ready Markdown.**
|
|
38
|
+
|
|
39
|
+
[](https://www.python.org/downloads/)
|
|
40
|
+
[](LICENSE)
|
|
41
|
+
[](https://github.com/erfan-ashtari/libmumd/releases)
|
|
42
|
+
|
|
43
|
+
</div>
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Why libmumd?
|
|
48
|
+
|
|
49
|
+
Most document converters produce messy output — broken tables, lost formatting, garbled non-English text. **libmumd** is different:
|
|
50
|
+
|
|
51
|
+
- **Persian & Arabic first-class support** — Correctly handles RTL text, Persian typography, and Arabic script
|
|
52
|
+
- **Better than markitdown alone** — Uses PyMuPDF's layout engine for cleaner, more accurate conversion
|
|
53
|
+
- **Multi-language support** — Handles Persian, Arabic, Chinese, Japanese, Korean, and European languages
|
|
54
|
+
- **Table detection** — Automatically converts complex tables to Markdown format
|
|
55
|
+
- **Figure & image handling** — Extracts and references images properly
|
|
56
|
+
- **Layout-aware** — Preserves reading order, headers, and document structure
|
|
57
|
+
- **No GPU required** — Runs on any machine with Python
|
|
58
|
+
|
|
59
|
+
## Features
|
|
60
|
+
|
|
61
|
+
| Feature | Description |
|
|
62
|
+
|---------|-------------|
|
|
63
|
+
| **PDF → Markdown** | High-quality extraction with layout preservation |
|
|
64
|
+
| **Office → Markdown** | Convert `.docx`, `.pptx`, `.xlsx`, and more via LibreOffice |
|
|
65
|
+
| **Smart table parsing** | Complex tables become clean Markdown tables |
|
|
66
|
+
| **Image extraction** | Embedded images are saved and referenced |
|
|
67
|
+
| **Header detection** | Font sizes map to `#` heading levels automatically |
|
|
68
|
+
| **Inline formatting** | Preserves **bold**, *italic*, and `code` |
|
|
69
|
+
| **Multi-column layouts** | Reconstructs natural reading order |
|
|
70
|
+
| **OCR fallback** | Handles scanned documents when text layer is missing |
|
|
71
|
+
|
|
72
|
+
## Installation
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
pip install git+https://github.com/erfan-ashtari/libmumd.git
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Or install from source:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
git clone https://github.com/erfan-ashtari/libmumd.git
|
|
82
|
+
cd libmumd
|
|
83
|
+
pip install .
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Quick Start
|
|
87
|
+
|
|
88
|
+
### Command Line
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
# Convert PDF to Markdown
|
|
92
|
+
libmumd document.pdf
|
|
93
|
+
|
|
94
|
+
# Convert Office document
|
|
95
|
+
libmumd report.docx output.md
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Python
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from libmumd import convert_file
|
|
102
|
+
|
|
103
|
+
# Basic usage
|
|
104
|
+
result = convert_file("document.pdf")
|
|
105
|
+
print(result)
|
|
106
|
+
# {'status': 'ok', 'chars': 4523, 'output': 'document.md'}
|
|
107
|
+
|
|
108
|
+
# Custom output path
|
|
109
|
+
result = convert_file("presentation.pptx", "slides.md")
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Supported Formats
|
|
113
|
+
|
|
114
|
+
| Format | Extensions | Conversion Method |
|
|
115
|
+
|--------|------------|-------------------|
|
|
116
|
+
| PDF | `.pdf` | PyMuPDF (native) |
|
|
117
|
+
| Word | `.docx`, `.doc` | LibreOffice |
|
|
118
|
+
| PowerPoint | `.pptx`, `.ppt` | LibreOffice |
|
|
119
|
+
| Excel | `.xlsx`, `.xls` | LibreOffice |
|
|
120
|
+
| OpenDocument | `.odt`, `.odp`, `.ods` | LibreOffice |
|
|
121
|
+
| Rich Text | `.rtf` | LibreOffice |
|
|
122
|
+
| Other | Any | markitdown fallback |
|
|
123
|
+
|
|
124
|
+
## Requirements
|
|
125
|
+
|
|
126
|
+
### Python Packages (Auto-installed)
|
|
127
|
+
|
|
128
|
+
- `pymupdf4llm` — PDF extraction engine
|
|
129
|
+
- `markitdown` — Fallback converter
|
|
130
|
+
|
|
131
|
+
### LibreOffice (Required for Office Files)
|
|
132
|
+
|
|
133
|
+
LibreOffice is needed to convert Word, PowerPoint, and Excel files.
|
|
134
|
+
|
|
135
|
+
| OS | Installation |
|
|
136
|
+
|----|--------------|
|
|
137
|
+
| **Windows** | `winget install --id TheDocumentFoundation.LibreOffice` |
|
|
138
|
+
| **macOS** | `brew install --cask libreoffice` |
|
|
139
|
+
| **Linux** | `sudo apt-get install libreoffice` |
|
|
140
|
+
|
|
141
|
+
Or download from [libreoffice.org](https://www.libreoffice.org/).
|
|
142
|
+
|
|
143
|
+
> **Note:** PDF conversion works without LibreOffice. Only Office document conversion requires it.
|
|
144
|
+
|
|
145
|
+
## Output Quality Comparison
|
|
146
|
+
|
|
147
|
+
| Aspect | markitdown only | libmumd |
|
|
148
|
+
|--------|-----------------|---------|
|
|
149
|
+
| Table formatting | Inconsistent | Clean Markdown tables |
|
|
150
|
+
| Multi-language | Basic | Full Unicode support |
|
|
151
|
+
| Layout preservation | None | Reading order preserved |
|
|
152
|
+
| Image handling | Limited | Extracted and referenced |
|
|
153
|
+
| Header detection | None | Automatic heading levels |
|
|
154
|
+
|
|
155
|
+
## Persian (Farsi) & Arabic Support
|
|
156
|
+
|
|
157
|
+
libmumd is built with **Persian and Arabic documents in mind**:
|
|
158
|
+
|
|
159
|
+
- **RTL text handling** — Correctly processes right-to-left text
|
|
160
|
+
- **Persian typography** — Preserves proper character connections and diacritics
|
|
161
|
+
- **Mixed content** — Handles documents with both Persian/Arabic and English text
|
|
162
|
+
- **PDF extraction** — Extracts Persian text without garbling or losing characters
|
|
163
|
+
- **Font support** — Works with Persian fonts like IRANSans, Vazirmatn, and more
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
from libmumd import convert_file
|
|
167
|
+
|
|
168
|
+
# Convert a Persian PDF document
|
|
169
|
+
result = convert_file("persian-document.pdf")
|
|
170
|
+
# Output preserves RTL text and Persian characters correctly
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Contributing
|
|
174
|
+
|
|
175
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
176
|
+
|
|
177
|
+
1. Fork the repository
|
|
178
|
+
2. Create your feature branch (`git checkout -b feature/amazing-feature`)
|
|
179
|
+
3. Commit your changes (`git commit -m 'Add amazing feature'`)
|
|
180
|
+
4. Push to the branch (`git push origin feature/amazing-feature`)
|
|
181
|
+
5. Open a Pull Request
|
|
182
|
+
|
|
183
|
+
## License
|
|
184
|
+
|
|
185
|
+
This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
|
|
186
|
+
|
|
187
|
+
### Dependency Licenses
|
|
188
|
+
|
|
189
|
+
- **pymupdf4llm** — [AGPL-3.0](https://github.com/pymupdf/pymupdf4llm/blob/main/LICENSE) (required for PDF conversion)
|
|
190
|
+
- **markitdown** — MIT
|
|
191
|
+
- **LibreOffice** — [MPL-2.0](https://www.libreoffice.org/license/mpl-2.0/)
|
|
192
|
+
|
|
193
|
+
Users of this package must comply with the AGPL-3.0 license for pymupdf4llm.
|
|
194
|
+
|
|
195
|
+
## Acknowledgments
|
|
196
|
+
|
|
197
|
+
- [PyMuPDF4LLM](https://github.com/pymupdf/pymupdf4llm) — PDF extraction engine
|
|
198
|
+
- [markitdown](https://github.com/microsoft/markitdown) — Fallback converter
|
|
199
|
+
- [LibreOffice](https://www.libreoffice.org/) — Office document handling
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
libmumd/__init__.py
|
|
5
|
+
libmumd/cli.py
|
|
6
|
+
libmumd/convert.py
|
|
7
|
+
libmumd.egg-info/PKG-INFO
|
|
8
|
+
libmumd.egg-info/SOURCES.txt
|
|
9
|
+
libmumd.egg-info/dependency_links.txt
|
|
10
|
+
libmumd.egg-info/entry_points.txt
|
|
11
|
+
libmumd.egg-info/requires.txt
|
|
12
|
+
libmumd.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
libmumd
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "libmumd"
version = "0.1.0"
description = "Convert documents (Office, PDF) to Markdown — optimized for Persian/Farsi and multilingual content"
readme = "README.md"
license = {text = "MIT"}
# markitdown requires Python >= 3.10, so older interpreters cannot resolve
# this package's dependencies.
requires-python = ">=3.10"
authors = [
    {name = "Erfan Ashtari"},
]
keywords = ["markdown", "pdf", "document-conversion", "persian", "farsi", "llm"]
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Text Processing :: Markup :: Markdown",
    "Topic :: Utilities",
]
dependencies = [
    # The Office extras pull in the converters the markitdown fallback needs
    # for .docx/.pptx/.xlsx/.xls; a bare install does not include them.
    "markitdown[docx,pptx,xlsx,xls]",
    # pymupdf4llm is AGPL-3.0 licensed (see "Dependency Licenses" in README.md).
    "pymupdf4llm",
]

[project.optional-dependencies]
dev = [
    "pytest",
]

[project.scripts]
libmumd = "libmumd.cli:main"

[project.urls]
Homepage = "https://github.com/erfan-ashtari/libmumd"
Repository = "https://github.com/erfan-ashtari/libmumd"
Issues = "https://github.com/erfan-ashtari/libmumd/issues"

[tool.setuptools.packages.find]
include = ["libmumd*"]
|
libmumd-0.1.0/setup.cfg
ADDED