pdf2mj 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf2mj-0.1.0/.gitignore +33 -0
- pdf2mj-0.1.0/PKG-INFO +241 -0
- pdf2mj-0.1.0/README.md +205 -0
- pdf2mj-0.1.0/pyproject.toml +70 -0
- pdf2mj-0.1.0/sample_pdfs/README.md +22 -0
- pdf2mj-0.1.0/src/pdf2mj/__init__.py +3 -0
- pdf2mj-0.1.0/src/pdf2mj/chunker.py +58 -0
- pdf2mj-0.1.0/src/pdf2mj/cli.py +207 -0
- pdf2mj-0.1.0/src/pdf2mj/config.py +70 -0
- pdf2mj-0.1.0/src/pdf2mj/console_util.py +26 -0
- pdf2mj-0.1.0/src/pdf2mj/converter.py +197 -0
- pdf2mj-0.1.0/src/pdf2mj/doctor.py +165 -0
- pdf2mj-0.1.0/src/pdf2mj/image_extractor.py +48 -0
- pdf2mj-0.1.0/src/pdf2mj/json_export.py +57 -0
- pdf2mj-0.1.0/src/pdf2mj/markdown.py +75 -0
- pdf2mj-0.1.0/src/pdf2mj/metadata.py +31 -0
- pdf2mj-0.1.0/src/pdf2mj/models.py +52 -0
- pdf2mj-0.1.0/src/pdf2mj/ocr.py +42 -0
- pdf2mj-0.1.0/src/pdf2mj/table_extractor.py +64 -0
- pdf2mj-0.1.0/src/pdf2mj/welcome.py +147 -0
- pdf2mj-0.1.0/tests/conftest.py +79 -0
- pdf2mj-0.1.0/tests/test_cli.py +59 -0
- pdf2mj-0.1.0/tests/test_cli_errors.py +18 -0
- pdf2mj-0.1.0/tests/test_config.py +70 -0
- pdf2mj-0.1.0/tests/test_converter.py +76 -0
- pdf2mj-0.1.0/tests/test_doctor.py +43 -0
- pdf2mj-0.1.0/tests/test_exports.py +101 -0
- pdf2mj-0.1.0/tests/test_image_extractor.py +18 -0
- pdf2mj-0.1.0/tests/test_json_blocks.py +32 -0
- pdf2mj-0.1.0/tests/test_markdown_blocks.py +29 -0
- pdf2mj-0.1.0/tests/test_metadata.py +21 -0
- pdf2mj-0.1.0/tests/test_models.py +26 -0
- pdf2mj-0.1.0/tests/test_ocr.py +7 -0
- pdf2mj-0.1.0/tests/test_table_extractor.py +12 -0
- pdf2mj-0.1.0/tests/test_welcome.py +76 -0
pdf2mj-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
.eggs/
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
*.egg
|
|
10
|
+
.venv/
|
|
11
|
+
venv/
|
|
12
|
+
.env
|
|
13
|
+
|
|
14
|
+
# Testing
|
|
15
|
+
.coverage
|
|
16
|
+
htmlcov/
|
|
17
|
+
.pytest_cache/
|
|
18
|
+
.mypy_cache/
|
|
19
|
+
.ruff_cache/
|
|
20
|
+
|
|
21
|
+
# IDE
|
|
22
|
+
.idea/
|
|
23
|
+
.vscode/
|
|
24
|
+
*.swp
|
|
25
|
+
|
|
26
|
+
# Output
|
|
27
|
+
output/
|
|
28
|
+
*.md
|
|
29
|
+
!README.md
|
|
30
|
+
|
|
31
|
+
# OS
|
|
32
|
+
.DS_Store
|
|
33
|
+
Thumbs.db
|
pdf2mj-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdf2mj
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert PDF documents to Markdown and structured JSON for RAG and LLM pipelines
|
|
5
|
+
Project-URL: Homepage, https://github.com/Ronit-Pai/pdf2mj
|
|
6
|
+
Project-URL: Documentation, https://github.com/Ronit-Pai/pdf2mj#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/Ronit-Pai/pdf2mj
|
|
8
|
+
Author: PDF2MJ Contributors
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
Keywords: document-conversion,json,llm,markdown,pdf,rag
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Text Processing
|
|
19
|
+
Classifier: Topic :: Utilities
|
|
20
|
+
Requires-Python: >=3.12
|
|
21
|
+
Requires-Dist: pandas>=2.0.0
|
|
22
|
+
Requires-Dist: pillow>=10.0.0
|
|
23
|
+
Requires-Dist: platformdirs>=4.0.0
|
|
24
|
+
Requires-Dist: pydantic>=2.0.0
|
|
25
|
+
Requires-Dist: pymupdf4llm>=0.0.17
|
|
26
|
+
Requires-Dist: pymupdf>=1.24.0
|
|
27
|
+
Requires-Dist: rich>=13.7.0
|
|
28
|
+
Requires-Dist: typer>=0.12.0
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
32
|
+
Provides-Extra: ocr
|
|
33
|
+
Requires-Dist: opencv-python-headless>=4.8.0; extra == 'ocr'
|
|
34
|
+
Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# PDF2MJ
|
|
38
|
+
|
|
39
|
+
Convert PDF documents to **Markdown** and **structured JSON** for RAG pipelines, LLM preprocessing, and knowledge bases.
|
|
40
|
+
|
|
41
|
+
## Installation (For Users)
|
|
42
|
+
|
|
43
|
+
**PyPI:** `pdf2mj` is not published on PyPI yet. Install from source (see [Development Setup](#development-setup-for-contributors)) or publish the package first.
|
|
44
|
+
|
|
45
|
+
When available on PyPI:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install pdf2mj
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
With OCR support:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install "pdf2mj[ocr]"
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### OCR Requirements
|
|
58
|
+
|
|
59
|
+
OCR is optional and requires:
|
|
60
|
+
|
|
61
|
+
* Tesseract OCR installed on your system
|
|
62
|
+
* OCR extras installed via:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install "pdf2mj[ocr]"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### First Run
|
|
69
|
+
|
|
70
|
+
On the first `pdf2mj` invocation (no arguments), a Rich-powered welcome screen is shown once. State is stored in:
|
|
71
|
+
|
|
72
|
+
- Linux/macOS: `~/.config/pdf2mj/config.json`
|
|
73
|
+
- Windows: `%APPDATA%\pdf2mj\config.json`
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pdf2mj welcome # show the welcome screen again
|
|
77
|
+
pdf2mj doctor # verify dependencies and environment
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Quick Start
|
|
81
|
+
|
|
82
|
+
Convert a PDF to Markdown and JSON:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pdf2mj document.pdf
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Output files are generated next to the source PDF:
|
|
89
|
+
|
|
90
|
+
```text
|
|
91
|
+
document.md
|
|
92
|
+
document.json
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Specify an output directory:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
pdf2mj document.pdf --output ./output
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Common Examples
|
|
102
|
+
|
|
103
|
+
Generate all outputs:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
pdf2mj document.pdf --all --output ./output
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Extract images:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
pdf2mj document.pdf --extract-images
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Generate RAG chunks:
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
pdf2mj document.pdf --chunk-size 1000
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
Use OCR for scanned PDFs:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
pdf2mj document.pdf --ocr
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### CLI Options
|
|
128
|
+
|
|
129
|
+
| Flag | Description |
|
|
130
|
+
| ------------------------------ | -------------------------------------- |
|
|
131
|
+
| `--markdown` / `--no-markdown` | Generate Markdown (default: on) |
|
|
132
|
+
| `--json` / `--no-json` | Generate structured JSON (default: on) |
|
|
133
|
+
| `--ocr` | OCR scanned pages |
|
|
134
|
+
| `--extract-images` | Extract embedded images |
|
|
135
|
+
| `--figures` | Alias for `--extract-images` |
|
|
136
|
+
| `--chunk-size N` | Generate RAG chunks |
|
|
137
|
+
| `--chunk-overlap N` | Chunk overlap (default: 200) |
|
|
138
|
+
| `--output`, `-o` | Output directory |
|
|
139
|
+
| `--verbose`, `-v` | Detailed logging |
|
|
140
|
+
| `--metadata` | Export metadata JSON |
|
|
141
|
+
| `--tables` / `--no-tables` | Extract tables |
|
|
142
|
+
| `--all` | Enable all supported outputs |
|
|
143
|
+
|
|
144
|
+
### Utility Commands
|
|
145
|
+
|
|
146
|
+
| Command | Description |
|
|
147
|
+
| ----------------- | ------------------------------------------------ |
|
|
148
|
+
| `pdf2mj welcome` | Show the onboarding welcome screen |
|
|
149
|
+
| `pdf2mj doctor` | Check Python, dependencies, OCR, and write access |
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# Development Setup (For Contributors)
|
|
153
|
+
|
|
154
|
+
## Prerequisites
|
|
155
|
+
|
|
156
|
+
* Python 3.12+
|
|
157
|
+
* Git
|
|
158
|
+
* Optional: Tesseract OCR
|
|
159
|
+
|
|
160
|
+
## Clone the Repository
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
git clone https://github.com/Ronit-Pai/pdf2mj.git
|
|
164
|
+
cd pdf2mj
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Create a Development Environment
|
|
168
|
+
|
|
169
|
+
Using pip:
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
python -m venv .venv
|
|
173
|
+
source .venv/bin/activate # Linux/macOS
|
|
174
|
+
# .venv\Scripts\activate # Windows
|
|
175
|
+
|
|
176
|
+
pip install -e ".[dev]"
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
Using uv:
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
uv venv
|
|
183
|
+
source .venv/bin/activate
|
|
184
|
+
|
|
185
|
+
uv pip install -e ".[dev]"
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
With OCR support:
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
pip install -e ".[dev,ocr]"
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Running Tests
|
|
195
|
+
|
|
196
|
+
```bash
|
|
197
|
+
pytest
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
Coverage:
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
pytest --cov=pdf2mj --cov-report=html
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## Project Structure
|
|
207
|
+
|
|
208
|
+
```text
|
|
209
|
+
src/pdf2mj/
|
|
210
|
+
cli.py
|
|
211
|
+
config.py
|
|
212
|
+
welcome.py
|
|
213
|
+
doctor.py
|
|
214
|
+
converter.py
|
|
215
|
+
models.py
|
|
216
|
+
markdown.py
|
|
217
|
+
json_export.py
|
|
218
|
+
metadata.py
|
|
219
|
+
table_extractor.py
|
|
220
|
+
image_extractor.py
|
|
221
|
+
ocr.py
|
|
222
|
+
chunker.py
|
|
223
|
+
console_util.py
|
|
224
|
+
|
|
225
|
+
tests/
|
|
226
|
+
sample_pdfs/
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
## Local Development
|
|
230
|
+
|
|
231
|
+
Run directly from source:
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
pdf2mj sample.pdf
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
or
|
|
238
|
+
|
|
239
|
+
```bash
|
|
240
|
+
python -m pdf2mj sample.pdf
|
|
241
|
+
```
|
pdf2mj-0.1.0/README.md
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
# PDF2MJ
|
|
2
|
+
|
|
3
|
+
Convert PDF documents to **Markdown** and **structured JSON** for RAG pipelines, LLM preprocessing, and knowledge bases.
|
|
4
|
+
|
|
5
|
+
## Installation (For Users)
|
|
6
|
+
|
|
7
|
+
**PyPI:** `pdf2mj` is not published on PyPI yet. Until it is, install from source (see [Development Setup](#development-setup-for-contributors)); the `pip install` commands below will only work once maintainers publish the package.
|
|
8
|
+
|
|
9
|
+
When available on PyPI:
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install pdf2mj
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
With OCR support:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install "pdf2mj[ocr]"
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
### OCR Requirements
|
|
22
|
+
|
|
23
|
+
OCR is optional and requires:
|
|
24
|
+
|
|
25
|
+
* Tesseract OCR installed on your system
|
|
26
|
+
* OCR extras installed via:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install "pdf2mj[ocr]"
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### First Run
|
|
33
|
+
|
|
34
|
+
On the first `pdf2mj` invocation (no arguments), a Rich-powered welcome screen is shown once. State is stored in:
|
|
35
|
+
|
|
36
|
+
- Linux/macOS: `~/.config/pdf2mj/config.json`
|
|
37
|
+
- Windows: `%APPDATA%\pdf2mj\config.json`
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pdf2mj welcome # show the welcome screen again
|
|
41
|
+
pdf2mj doctor # verify dependencies and environment
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Quick Start
|
|
45
|
+
|
|
46
|
+
Convert a PDF to Markdown and JSON:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pdf2mj document.pdf
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Output files are generated next to the source PDF:
|
|
53
|
+
|
|
54
|
+
```text
|
|
55
|
+
document.md
|
|
56
|
+
document.json
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Specify an output directory:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pdf2mj document.pdf --output ./output
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Common Examples
|
|
66
|
+
|
|
67
|
+
Generate all outputs:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pdf2mj document.pdf --all --output ./output
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Extract images:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pdf2mj document.pdf --extract-images
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Generate RAG chunks:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
pdf2mj document.pdf --chunk-size 1000
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Use OCR for scanned PDFs:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
pdf2mj document.pdf --ocr
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### CLI Options
|
|
92
|
+
|
|
93
|
+
| Flag | Description |
|
|
94
|
+
| ------------------------------ | -------------------------------------- |
|
|
95
|
+
| `--markdown` / `--no-markdown` | Generate Markdown (default: on) |
|
|
96
|
+
| `--json` / `--no-json` | Generate structured JSON (default: on) |
|
|
97
|
+
| `--ocr` | OCR scanned pages |
|
|
98
|
+
| `--extract-images` | Extract embedded images |
|
|
99
|
+
| `--figures` | Alias for `--extract-images` |
|
|
100
|
+
| `--chunk-size N` | Generate RAG chunks |
|
|
101
|
+
| `--chunk-overlap N` | Chunk overlap (default: 200) |
|
|
102
|
+
| `--output`, `-o` | Output directory |
|
|
103
|
+
| `--verbose`, `-v` | Detailed logging |
|
|
104
|
+
| `--metadata` | Export metadata JSON |
|
|
105
|
+
| `--tables` / `--no-tables` | Extract tables |
|
|
106
|
+
| `--all` | Enable all supported outputs |
|
|
107
|
+
|
|
108
|
+
### Utility Commands
|
|
109
|
+
|
|
110
|
+
| Command | Description |
|
|
111
|
+
| ----------------- | ------------------------------------------------ |
|
|
112
|
+
| `pdf2mj welcome` | Show the onboarding welcome screen |
|
|
113
|
+
| `pdf2mj doctor` | Check Python, dependencies, OCR, and write access |
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# Development Setup (For Contributors)
|
|
117
|
+
|
|
118
|
+
## Prerequisites
|
|
119
|
+
|
|
120
|
+
* Python 3.12+
|
|
121
|
+
* Git
|
|
122
|
+
* Optional: Tesseract OCR
|
|
123
|
+
|
|
124
|
+
## Clone the Repository
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
git clone https://github.com/Ronit-Pai/pdf2mj.git
|
|
128
|
+
cd pdf2mj
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Create a Development Environment
|
|
132
|
+
|
|
133
|
+
Using pip:
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
python -m venv .venv
|
|
137
|
+
source .venv/bin/activate # Linux/macOS
|
|
138
|
+
# .venv\Scripts\activate # Windows
|
|
139
|
+
|
|
140
|
+
pip install -e ".[dev]"
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Using uv:
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
uv venv
|
|
147
|
+
source .venv/bin/activate
|
|
148
|
+
|
|
149
|
+
uv pip install -e ".[dev]"
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
With OCR support:
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
pip install -e ".[dev,ocr]"
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Running Tests
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
pytest
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
Coverage:
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
pytest --cov=pdf2mj --cov-report=html
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## Project Structure
|
|
171
|
+
|
|
172
|
+
```text
|
|
173
|
+
src/pdf2mj/
|
|
174
|
+
cli.py
|
|
175
|
+
config.py
|
|
176
|
+
welcome.py
|
|
177
|
+
doctor.py
|
|
178
|
+
converter.py
|
|
179
|
+
models.py
|
|
180
|
+
markdown.py
|
|
181
|
+
json_export.py
|
|
182
|
+
metadata.py
|
|
183
|
+
table_extractor.py
|
|
184
|
+
image_extractor.py
|
|
185
|
+
ocr.py
|
|
186
|
+
chunker.py
|
|
187
|
+
console_util.py
|
|
188
|
+
|
|
189
|
+
tests/
|
|
190
|
+
sample_pdfs/
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## Local Development
|
|
194
|
+
|
|
195
|
+
Run directly from source:
|
|
196
|
+
|
|
197
|
+
```bash
|
|
198
|
+
pdf2mj sample.pdf
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
or
|
|
202
|
+
|
|
203
|
+
```bash
|
|
204
|
+
python -c "from pdf2mj.cli import run; run()" sample.pdf
|
|
205
|
+
```
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pdf2mj"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Convert PDF documents to Markdown and structured JSON for RAG and LLM pipelines"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
authors = [{ name = "PDF2MJ Contributors" }]
|
|
13
|
+
keywords = ["pdf", "markdown", "json", "rag", "llm", "document-conversion"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Environment :: Console",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Programming Language :: Python :: 3.13",
|
|
22
|
+
"Topic :: Text Processing",
|
|
23
|
+
"Topic :: Utilities",
|
|
24
|
+
]
|
|
25
|
+
dependencies = [
|
|
26
|
+
"typer>=0.12.0",
|
|
27
|
+
"pymupdf>=1.24.0",
|
|
28
|
+
"pymupdf4llm>=0.0.17",
|
|
29
|
+
"rich>=13.7.0",
|
|
30
|
+
"pydantic>=2.0.0",
|
|
31
|
+
"pandas>=2.0.0",
|
|
32
|
+
"pillow>=10.0.0",
|
|
33
|
+
"platformdirs>=4.0.0",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.optional-dependencies]
|
|
37
|
+
ocr = ["pytesseract>=0.3.10", "opencv-python-headless>=4.8.0"]
|
|
38
|
+
dev = ["pytest>=8.0.0", "pytest-cov>=5.0.0"]
|
|
39
|
+
|
|
40
|
+
[project.scripts]
|
|
41
|
+
pdf2mj = "pdf2mj.cli:run"
|
|
42
|
+
|
|
43
|
+
[project.urls]
|
|
44
|
+
Homepage = "https://github.com/Ronit-Pai/pdf2mj"
|
|
45
|
+
Documentation = "https://github.com/Ronit-Pai/pdf2mj#readme"
|
|
46
|
+
Repository = "https://github.com/Ronit-Pai/pdf2mj"
|
|
47
|
+
|
|
48
|
+
[tool.hatch.build.targets.wheel]
|
|
49
|
+
packages = ["src/pdf2mj"]
|
|
50
|
+
|
|
51
|
+
[tool.hatch.build.targets.sdist]
|
|
52
|
+
include = ["src/pdf2mj", "tests", "README.md", "plan.md"]
|
|
53
|
+
|
|
54
|
+
[tool.pytest.ini_options]
|
|
55
|
+
testpaths = ["tests"]
|
|
56
|
+
pythonpath = ["src"]
|
|
57
|
+
addopts = "-v --tb=short"
|
|
58
|
+
|
|
59
|
+
[tool.coverage.run]
|
|
60
|
+
source = ["pdf2mj"]
|
|
61
|
+
branch = true
|
|
62
|
+
|
|
63
|
+
[tool.coverage.report]
|
|
64
|
+
fail_under = 73
|
|
65
|
+
show_missing = true
|
|
66
|
+
exclude_lines = [
|
|
67
|
+
"pragma: no cover",
|
|
68
|
+
"if TYPE_CHECKING:",
|
|
69
|
+
"raise NotImplementedError",
|
|
70
|
+
]
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Sample PDFs
|
|
2
|
+
|
|
3
|
+
Place sample PDF files here for manual testing, for example:
|
|
4
|
+
|
|
5
|
+
- `text_sample.pdf` — digital text PDF
|
|
6
|
+
- `scan_sample.pdf` — scanned document (use `--ocr`)
|
|
7
|
+
- `tables_sample.pdf` — document with tables
|
|
8
|
+
- `images_sample.pdf` — document with embedded images
|
|
9
|
+
|
|
10
|
+
Generate a quick test PDF with PyMuPDF and convert it with the CLI after install:
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
python -c "
|
|
14
|
+
import fitz
|
|
15
|
+
doc = fitz.open()
|
|
16
|
+
p = doc.new_page()
|
|
17
|
+
p.insert_text((72, 72), 'Hello PDF2MJ', fontsize=14)
|
|
18
|
+
doc.save('sample_pdfs/demo.pdf')
|
|
19
|
+
doc.close()
|
|
20
|
+
"
|
|
21
|
+
pdf2mj sample_pdfs/demo.pdf --output sample_pdfs/output --all
|
|
22
|
+
```
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Text chunking for RAG pipelines."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from pdf2mj.models import Chunk, Document
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _estimate_tokens(text: str) -> int:
|
|
11
|
+
return len(re.findall(r"\S+", text))
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def chunk_document(
    document: Document,
    chunk_size: int = 1000,
    overlap: int = 200,
) -> list[Chunk]:
    """Split document text into overlapping word-based chunks, page by page.

    For every page, the stripped text of each non-image block is joined and
    split on whitespace. A window of ``chunk_size`` words then slides over the
    page's words; consecutive windows share ``overlap`` words. Chunks never
    span pages, and chunk ids (``c1``, ``c2``, ...) are numbered across the
    whole document.

    Args:
        document: Document whose ``pages`` (and their ``blocks``) are chunked.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks on a page.
            Values outside ``0 .. chunk_size - 1`` are clamped into that
            range so the window always moves forward and no words are
            skipped.

    Returns:
        The chunks in page order. Pages with no text produce no chunks.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # The window has to advance by at least one word per step. Without the
    # clamp, overlap >= chunk_size (e.g. chunk_size=100 with the default
    # overlap of 200) would reset the window to the same start forever, and a
    # negative overlap would leave gaps of unchunked words between chunks.
    effective_overlap = max(0, min(overlap, chunk_size - 1))
    stride = chunk_size - effective_overlap

    chunks: list[Chunk] = []
    for page in document.pages:
        # Image blocks carry no text worth chunking; blank blocks are dropped.
        stripped_texts = (
            block.text.strip() for block in page.blocks if block.type != "image"
        )
        words = "\n\n".join(text for text in stripped_texts if text).split()

        for start in range(0, len(words), stride):
            end = min(start + chunk_size, len(words))
            chunk_text = " ".join(words[start:end])
            chunks.append(
                Chunk(
                    chunk_id=f"c{len(chunks) + 1}",
                    page=page.page_number,
                    text=chunk_text,
                    tokens=_estimate_tokens(chunk_text),
                )
            )
            # Once a window reaches the end of the page, later start positions
            # would only repeat words that are already covered.
            if end >= len(words):
                break

    return chunks
|