botlpdf 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- botlpdf-0.1.2/PKG-INFO +681 -0
- botlpdf-0.1.2/README.md +629 -0
- botlpdf-0.1.2/pyproject.toml +79 -0
- botlpdf-0.1.2/python/botl_pdf/__init__.py +29 -0
- botlpdf-0.1.2/python/botl_pdf/_core.pyi +139 -0
- botlpdf-0.1.2/python/botl_pdf/cli/__init__.py +1 -0
- botlpdf-0.1.2/python/botl_pdf/cli/main.py +190 -0
- botlpdf-0.1.2/python/botl_pdf/debug.py +114 -0
- botlpdf-0.1.2/python/botl_pdf/document.py +75 -0
- botlpdf-0.1.2/python/botl_pdf/export.py +50 -0
- botlpdf-0.1.2/python/botl_pdf/ocr/__init__.py +5 -0
- botlpdf-0.1.2/python/botl_pdf/ocr/base.py +43 -0
- botlpdf-0.1.2/python/botl_pdf/page.py +67 -0
- botlpdf-0.1.2/python/botl_pdf/plugins/__init__.py +14 -0
- botlpdf-0.1.2/python/botl_pdf/plugins/registry.py +33 -0
- botlpdf-0.1.2/python/botl_pdf/tables.py +69 -0
- botlpdf-0.1.2/rust/Cargo.lock +1157 -0
- botlpdf-0.1.2/rust/Cargo.toml +32 -0
- botlpdf-0.1.2/rust/botl-pdf-core/Cargo.toml +47 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/codecs/ascii85.rs +104 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/codecs/asciihex.rs +59 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/codecs/dct.rs +13 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/codecs/flate.rs +41 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/codecs/jpx.rs +11 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/codecs/lzw.rs +141 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/codecs/mod.rs +90 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/codecs/runlength.rs +61 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/error.rs +44 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/geometry/bbox.rs +114 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/geometry/matrix.rs +128 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/geometry/mod.rs +6 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/geometry/spatial.rs +57 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/layout/elements.rs +180 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/layout/grouping.rs +231 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/layout/mod.rs +4 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/layout/ordering.rs +155 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/layout/strategy.rs +506 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/lib.rs +8 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/parser/document.rs +460 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/parser/incremental.rs +65 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/parser/lexer.rs +545 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/parser/mod.rs +5 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/parser/objects.rs +598 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/parser/xref.rs +422 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/text/cmap.rs +219 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/text/fonts.rs +388 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/text/mod.rs +4 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/text/operator.rs +869 -0
- botlpdf-0.1.2/rust/botl-pdf-core/src/text/unicode.rs +385 -0
- botlpdf-0.1.2/rust/botl-pdf-csys/Cargo.toml +17 -0
- botlpdf-0.1.2/rust/botl-pdf-csys/build.rs +4 -0
- botlpdf-0.1.2/rust/botl-pdf-csys/src/image.rs +31 -0
- botlpdf-0.1.2/rust/botl-pdf-csys/src/jpeg.rs +101 -0
- botlpdf-0.1.2/rust/botl-pdf-csys/src/jpx.rs +207 -0
- botlpdf-0.1.2/rust/botl-pdf-csys/src/lib.rs +11 -0
- botlpdf-0.1.2/rust/botl-pdf-python/Cargo.toml +16 -0
- botlpdf-0.1.2/rust/botl-pdf-python/src/codecs_reexport.rs +10 -0
- botlpdf-0.1.2/rust/botl-pdf-python/src/document.rs +320 -0
- botlpdf-0.1.2/rust/botl-pdf-python/src/elements.rs +519 -0
- botlpdf-0.1.2/rust/botl-pdf-python/src/errors.rs +49 -0
- botlpdf-0.1.2/rust/botl-pdf-python/src/lib.rs +42 -0
- botlpdf-0.1.2/rust/botl-pdf-python/src/page.rs +502 -0
- botlpdf-0.1.2/rust/botl-pdf-python/src/writer.rs +65 -0
botlpdf-0.1.2/PKG-INFO
ADDED
|
@@ -0,0 +1,681 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: botlpdf
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Classifier: Development Status :: 3 - Alpha
|
|
5
|
+
Classifier: Intended Audience :: Developers
|
|
6
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Classifier: Programming Language :: Rust
|
|
14
|
+
Classifier: Topic :: Text Processing :: General
|
|
15
|
+
Classifier: Typing :: Typed
|
|
16
|
+
Requires-Dist: pillow >=10.0 ; extra == 'render'
|
|
17
|
+
Requires-Dist: pillow >=10.0 ; extra == 'debug'
|
|
18
|
+
Requires-Dist: matplotlib >=3.8 ; extra == 'debug'
|
|
19
|
+
Requires-Dist: pytesseract >=0.3 ; extra == 'ocr-tesseract'
|
|
20
|
+
Requires-Dist: pillow >=10.0 ; extra == 'ocr-tesseract'
|
|
21
|
+
Requires-Dist: easyocr >=1.7 ; extra == 'ocr-easyocr'
|
|
22
|
+
Requires-Dist: pillow >=10.0 ; extra == 'ocr-easyocr'
|
|
23
|
+
Requires-Dist: pandas >=2.0 ; extra == 'pandas'
|
|
24
|
+
Requires-Dist: typer >=0.12 ; extra == 'cli'
|
|
25
|
+
Requires-Dist: rich >=13.0 ; extra == 'cli'
|
|
26
|
+
Requires-Dist: botlpdf[render,debug,ocr-tesseract,pandas,cli] ; extra == 'all'
|
|
27
|
+
Requires-Dist: pytest >=8.0 ; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest-cov >=5.0 ; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest-benchmark >=4.0 ; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff >=0.5 ; extra == 'dev'
|
|
31
|
+
Requires-Dist: mypy >=1.10 ; extra == 'dev'
|
|
32
|
+
Requires-Dist: pre-commit >=3.0 ; extra == 'dev'
|
|
33
|
+
Requires-Dist: hypothesis >=6.0 ; extra == 'dev'
|
|
34
|
+
Provides-Extra: render
|
|
35
|
+
Provides-Extra: debug
|
|
36
|
+
Provides-Extra: ocr-tesseract
|
|
37
|
+
Provides-Extra: ocr-easyocr
|
|
38
|
+
Provides-Extra: pandas
|
|
39
|
+
Provides-Extra: cli
|
|
40
|
+
Provides-Extra: all
|
|
41
|
+
Provides-Extra: dev
|
|
42
|
+
License-File: LICENSE
|
|
43
|
+
Summary: High-performance PDF processing: extract text, tables, images with a Rust + C core.
|
|
44
|
+
Keywords: pdf,text-extraction,tables,layout-analysis,rust
|
|
45
|
+
Author: botl-pdf Contributors
|
|
46
|
+
License: Apache-2.0
|
|
47
|
+
Requires-Python: >=3.10
|
|
48
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
49
|
+
Project-URL: Homepage, https://github.com/Shivamjohri247/botl-pdf
|
|
50
|
+
Project-URL: Repository, https://github.com/Shivamjohri247/botl-pdf
|
|
51
|
+
|
|
52
|
+
# botl-pdf
|
|
53
|
+
|
|
54
|
+
High-performance PDF text extraction library with a custom Rust core and Python bindings. No dependency on poppler, pdfium, or pdfbox — the entire PDF parsing and text extraction pipeline is written from scratch.
|
|
55
|
+
|
|
56
|
+
## Features
|
|
57
|
+
|
|
58
|
+
- Fast text extraction with layout analysis
|
|
59
|
+
- Character-level output with bounding boxes, fonts, colors, and styles
|
|
60
|
+
- Layout-preserving text extraction (spatial whitespace)
|
|
61
|
+
- Table of contents (TOC/outline) extraction with page numbers
|
|
62
|
+
- Document metadata extraction (title, author, dates, etc.)
|
|
63
|
+
- Geometric element extraction (lines, rectangles)
|
|
64
|
+
- Configurable layout parameters (word spacing, line grouping, reading order)
|
|
65
|
+
- Run-aware de-interleaving for correct reading order on complex PDFs
|
|
66
|
+
- Pythonic API with type hints throughout
|
|
67
|
+
- CLI for common operations
|
|
68
|
+
- Zero external PDF library dependencies
|
|
69
|
+
|
|
70
|
+
## Install
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pip install botlpdf
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Build from source (requires Rust toolchain):
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pip install maturin
|
|
80
|
+
git clone https://github.com/Shivamjohri247/botl-pdf.git
|
|
81
|
+
cd botl-pdf
|
|
82
|
+
maturin develop --release
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## Quick Start
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
import botl_pdf
|
|
91
|
+
|
|
92
|
+
doc = botl_pdf.open("report.pdf")
|
|
93
|
+
text = doc.pages[0].extract_text()
|
|
94
|
+
print(text)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Opening Documents
|
|
100
|
+
|
|
101
|
+
### From a file path
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
import botl_pdf
|
|
105
|
+
|
|
106
|
+
doc = botl_pdf.open("report.pdf")
|
|
107
|
+
print(f"Pages: {doc.num_pages}")
|
|
108
|
+
print(f"Encrypted: {doc.is_encrypted}")
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### From bytes
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
with open("report.pdf", "rb") as f:
|
|
115
|
+
data = f.read()
|
|
116
|
+
|
|
117
|
+
doc = botl_pdf.open(data)
|
|
118
|
+
print(f"Pages: {doc.num_pages}")
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### As a context manager
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
with botl_pdf.open("report.pdf") as doc:
|
|
125
|
+
text = doc.pages[0].extract_text()
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## Text Extraction
|
|
131
|
+
|
|
132
|
+
### Plain text (default)
|
|
133
|
+
|
|
134
|
+
Returns clean, readable text. Blocks are separated by double newlines, lines by single newlines, words by spaces.
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
doc = botl_pdf.open("report.pdf")
|
|
138
|
+
|
|
139
|
+
# Single page
|
|
140
|
+
text = doc.pages[0].extract_text()
|
|
141
|
+
print(text)
|
|
142
|
+
|
|
143
|
+
# All pages
|
|
144
|
+
for page in doc.pages:
|
|
145
|
+
print(page.extract_text())
|
|
146
|
+
|
|
147
|
+
# Subscript access (0-based, supports negative)
|
|
148
|
+
text_last = doc.pages[-1].extract_text()
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### Layout-preserving text
|
|
152
|
+
|
|
153
|
+
Maintains spatial positioning using proportional spaces between words. Useful when you need to preserve visual alignment of columns, tables, or indented text.
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
doc = botl_pdf.open("financial_report.pdf")
|
|
157
|
+
page = doc.pages[0]
|
|
158
|
+
|
|
159
|
+
# Layout mode preserves spatial whitespace
|
|
160
|
+
layout_text = page.extract_text(layout=True)
|
|
161
|
+
print(layout_text)
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
### Tuning extraction parameters
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
import botl_pdf
|
|
168
|
+
|
|
169
|
+
doc = botl_pdf.open("two_column.pdf")
|
|
170
|
+
|
|
171
|
+
# Tighter word grouping (merge chars closer together)
|
|
172
|
+
params = botl_pdf.LayoutParams(
|
|
173
|
+
word_margin=1.5, # max horizontal gap in same word (× font_size), default 2.0
|
|
174
|
+
line_margin=0.5, # max vertical gap in same block (× line height), default 0.5
|
|
175
|
+
boxes_flow=0.5, # reading order: 0.0=horizontal, 1.0=vertical, default 0.5
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
text = doc.pages[0].extract_text(layout=True, layout_params=params)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### Exporting entire documents
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
from botl_pdf.export import to_text, to_markdown
|
|
185
|
+
|
|
186
|
+
# Plain text for all pages
|
|
187
|
+
full_text = to_text("report.pdf")
|
|
188
|
+
|
|
189
|
+
# Layout-preserved text
|
|
190
|
+
full_text_layout = to_text("report.pdf", layout=True)
|
|
191
|
+
|
|
192
|
+
# Markdown (pages separated by horizontal rules)
|
|
193
|
+
markdown = to_markdown("report.pdf")
|
|
194
|
+
|
|
195
|
+
# Specific page range only
|
|
196
|
+
markdown_subset = to_markdown("report.pdf", pages=range(0, 5))
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Character-Level Access
|
|
202
|
+
|
|
203
|
+
Each page exposes individual characters with full style information: bounding box, font name, font size, bold/italic flags, fill and stroke colors, rotation, and run ID.
|
|
204
|
+
|
|
205
|
+
### Inspecting individual characters
|
|
206
|
+
|
|
207
|
+
```python
|
|
208
|
+
doc = botl_pdf.open("report.pdf")
|
|
209
|
+
page = doc.pages[0]
|
|
210
|
+
|
|
211
|
+
for char in page.chars[:5]:
|
|
212
|
+
print(f" char={char.text!r} "
|
|
213
|
+
f"pos=({char.bbox.x0:.1f}, {char.bbox.y0:.1f}) "
|
|
214
|
+
f"size={char.font_size:.1f} "
|
|
215
|
+
f"font={char.font_name}")
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
Output:
|
|
219
|
+
```
|
|
220
|
+
char='H' pos=(100.0, 700.0) size=12.0 font=F1
|
|
221
|
+
char='e' pos=(108.0, 700.0) size=12.0 font=F1
|
|
222
|
+
char='l' pos=(115.0, 700.0) size=12.0 font=F1
|
|
223
|
+
char='l' pos=(120.0, 700.0) size=12.0 font=F1
|
|
224
|
+
char='o' pos=(125.0, 700.0) size=12.0 font=F1
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### Finding text by style
|
|
228
|
+
|
|
229
|
+
```python
|
|
230
|
+
# Find all bold characters on page 0
|
|
231
|
+
bold_chars = [c for c in doc.pages[0].chars if c.bold]
|
|
232
|
+
bold_text = "".join(c.text for c in bold_chars)
|
|
233
|
+
|
|
234
|
+
# Find characters in a specific color (e.g., red links)
|
|
235
|
+
red_chars = [
|
|
236
|
+
c for c in doc.pages[0].chars
|
|
237
|
+
if c.color and c.color[0] > 0.8 and c.color[1] < 0.2 and c.color[2] < 0.2
|
|
238
|
+
]
|
|
239
|
+
|
|
240
|
+
# Find large decorative initials (font size > 30)
|
|
241
|
+
initials = [c for c in doc.pages[0].chars if c.font_size > 30]
|
|
242
|
+
for c in initials:
|
|
243
|
+
print(f"Decorative initial: {c.text!r} at size {c.font_size:.0f}")
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
### Extracting text from a region
|
|
247
|
+
|
|
248
|
+
```python
|
|
249
|
+
# Get all text in a specific rectangular area
|
|
250
|
+
x0, y0, x1, y1 = 100.0, 600.0, 400.0, 700.0
|
|
251
|
+
|
|
252
|
+
region_chars = [
|
|
253
|
+
c for c in doc.pages[0].chars
|
|
254
|
+
if c.bbox.x0 >= x0 and c.bbox.x1 <= x1
|
|
255
|
+
and c.bbox.y0 >= y0 and c.bbox.y1 <= y1
|
|
256
|
+
]
|
|
257
|
+
region_text = "".join(c.text for c in region_chars)
|
|
258
|
+
print(region_text)
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
### Run ID tracking
|
|
262
|
+
|
|
263
|
+
Characters from the same text-showing operation (Tj/TJ) share a `run_id`. This lets you group characters by their PDF text operation — useful for debugging extraction issues or understanding the PDF's internal structure.
|
|
264
|
+
|
|
265
|
+
```python
|
|
266
|
+
from collections import defaultdict
|
|
267
|
+
|
|
268
|
+
# Group characters by their source text operation
|
|
269
|
+
runs = defaultdict(str)
|
|
270
|
+
for c in doc.pages[0].chars:
|
|
271
|
+
runs[c.run_id] += c.text
|
|
272
|
+
|
|
273
|
+
for run_id, text in sorted(runs.items()):
|
|
274
|
+
print(f" Run {run_id}: {text[:60]!r}")
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
---
|
|
278
|
+
|
|
279
|
+
## Document Metadata
|
|
280
|
+
|
|
281
|
+
```python
|
|
282
|
+
doc = botl_pdf.open("report.pdf")
|
|
283
|
+
|
|
284
|
+
meta = doc.metadata
|
|
285
|
+
print(f"Title: {meta.get('title')}")
|
|
286
|
+
print(f"Author: {meta.get('author')}")
|
|
287
|
+
print(f"Subject: {meta.get('subject')}")
|
|
288
|
+
print(f"Creator: {meta.get('creator')}")
|
|
289
|
+
print(f"Producer: {meta.get('producer')}")
|
|
290
|
+
print(f"Created: {meta.get('creation_date')}")
|
|
291
|
+
print(f"Modified: {meta.get('mod_date')}")
|
|
292
|
+
print(f"Version: {meta.get('version')}")
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
---
|
|
296
|
+
|
|
297
|
+
## Table of Contents
|
|
298
|
+
|
|
299
|
+
```python
|
|
300
|
+
doc = botl_pdf.open("book.pdf")
|
|
301
|
+
|
|
302
|
+
toc = doc.toc
|
|
303
|
+
for entry in toc:
|
|
304
|
+
indent = " " * entry.level
|
|
305
|
+
page = entry.page_number
|
|
306
|
+
print(f"{indent}{entry.title} → page {page}")
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
Output:
|
|
310
|
+
```
|
|
311
|
+
Preface → page 5
|
|
312
|
+
Acknowledgments → page 7
|
|
313
|
+
Part I. Foundations → page 11
|
|
314
|
+
Chapter 1. Introduction → page 13
|
|
315
|
+
Chapter 2. Methods → page 27
|
|
316
|
+
Part II. Results → page 45
|
|
317
|
+
Chapter 3. Analysis → page 47
|
|
318
|
+
```
|
|
319
|
+
|
|
320
|
+
### Building a page lookup from TOC
|
|
321
|
+
|
|
322
|
+
```python
|
|
323
|
+
# Map page numbers to their chapter titles
|
|
324
|
+
chapters = {}
|
|
325
|
+
current_chapter = None
|
|
326
|
+
for entry in doc.toc:
|
|
327
|
+
if entry.level == 0 and entry.page_number is not None:
|
|
328
|
+
current_chapter = entry.title
|
|
329
|
+
if current_chapter and entry.page_number is not None:
|
|
330
|
+
chapters[entry.page_number] = current_chapter
|
|
331
|
+
|
|
332
|
+
# Find which chapter a page belongs to
|
|
333
|
+
def chapter_for_page(page_idx):
|
|
334
|
+
page_nums = sorted(chapters.keys())
|
|
335
|
+
for i, p in enumerate(page_nums):
|
|
336
|
+
if page_idx < p:
|
|
337
|
+
return chapters[page_nums[max(0, i - 1)]] if i > 0 else None
|
|
338
|
+
return chapters[page_nums[-1]]
|
|
339
|
+
|
|
340
|
+
print(f"Page 30 is in: {chapter_for_page(30)}")
|
|
341
|
+
```
|
|
342
|
+
|
|
343
|
+
---
|
|
344
|
+
|
|
345
|
+
## Geometric Elements
|
|
346
|
+
|
|
347
|
+
Pages expose geometric lines and rectangles drawn on the PDF canvas — useful for detecting table borders, rules, decorative elements, and form fields.
|
|
348
|
+
|
|
349
|
+
### Lines
|
|
350
|
+
|
|
351
|
+
```python
|
|
352
|
+
page = doc.pages[0]
|
|
353
|
+
|
|
354
|
+
for line in page.lines:
|
|
355
|
+
print(f" Line ({line.x0:.1f},{line.y0:.1f}) → ({line.x1:.1f},{line.y1:.1f}) "
|
|
356
|
+
f"width={line.line_width:.1f}")
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
### Rectangles
|
|
360
|
+
|
|
361
|
+
```python
|
|
362
|
+
for rect in page.rects:
|
|
363
|
+
fill = rect.fill_color
|
|
364
|
+
stroke = rect.stroke_color
|
|
365
|
+
print(f" Rect ({rect.bbox.x0:.1f},{rect.bbox.y0:.1f})-"
|
|
366
|
+
f"({rect.bbox.x1:.1f},{rect.bbox.y1:.1f}) "
|
|
367
|
+
f"stroke={stroke} fill={fill}")
|
|
368
|
+
```
|
|
369
|
+
|
|
370
|
+
### Detecting horizontal rules
|
|
371
|
+
|
|
372
|
+
```python
|
|
373
|
+
# Find horizontal lines (useful for detecting separators/tables)
|
|
374
|
+
h_rules = [
|
|
375
|
+
line for line in page.lines
|
|
376
|
+
if abs(line.y1 - line.y0) < 1.0 and (line.x1 - line.x0) > 50.0
|
|
377
|
+
]
|
|
378
|
+
|
|
379
|
+
for rule in h_rules:
|
|
380
|
+
print(f"Horizontal rule at y={rule.y0:.1f} from x={rule.x0:.1f} to x={rule.x1:.1f}")
|
|
381
|
+
```
|
|
382
|
+
|
|
383
|
+
---
|
|
384
|
+
|
|
385
|
+
## Page Properties
|
|
386
|
+
|
|
387
|
+
```python
|
|
388
|
+
doc = botl_pdf.open("report.pdf")
|
|
389
|
+
|
|
390
|
+
for i, page in enumerate(doc.pages):
|
|
391
|
+
print(f"Page {i}: {page.width:.0f}×{page.height:.0f}pt "
|
|
392
|
+
f"rotation={page.rotation}° "
|
|
393
|
+
f"label={page.label!r}")
|
|
394
|
+
```
|
|
395
|
+
|
|
396
|
+
Output:
|
|
397
|
+
```
|
|
398
|
+
Page 0: 612×792pt rotation=0° label='1'
|
|
399
|
+
Page 1: 612×792pt rotation=0° label='2'
|
|
400
|
+
```
|
|
401
|
+
|
|
402
|
+
Common page sizes:
|
|
403
|
+
- Letter: 612 × 792 pt (8.5" × 11")
|
|
404
|
+
- A4: 595 × 842 pt (210mm × 297mm)
|
|
405
|
+
|
|
406
|
+
---
|
|
407
|
+
|
|
408
|
+
## Visual Debugging
|
|
409
|
+
|
|
410
|
+
Requires `Pillow`. Draws bounding boxes and geometric elements on a rendered page image — useful for debugging extraction issues or understanding PDF layout.
|
|
411
|
+
|
|
412
|
+
```bash
|
|
413
|
+
pip install "botlpdf[debug]"
|
|
414
|
+
```
|
|
415
|
+
|
|
416
|
+
```python
|
|
417
|
+
from botl_pdf.debug import VisualDebugger
|
|
418
|
+
import botl_pdf
|
|
419
|
+
|
|
420
|
+
doc = botl_pdf.open("report.pdf")
|
|
421
|
+
page = doc.pages[0]
|
|
422
|
+
|
|
423
|
+
debugger = VisualDebugger(page)
|
|
424
|
+
|
|
425
|
+
# Draw character bounding boxes (red)
|
|
426
|
+
img = debugger.draw_chars(resolution=150)
|
|
427
|
+
img.save("debug_chars.png")
|
|
428
|
+
|
|
429
|
+
# Draw geometric lines (blue)
|
|
430
|
+
img = debugger.draw_lines(resolution=150)
|
|
431
|
+
img.save("debug_lines.png")
|
|
432
|
+
|
|
433
|
+
# Draw geometric rectangles (green)
|
|
434
|
+
img = debugger.draw_rects(resolution=150)
|
|
435
|
+
img.save("debug_rects.png")
|
|
436
|
+
|
|
437
|
+
# All elements layered together
|
|
438
|
+
img = debugger.draw_all(resolution=150)
|
|
439
|
+
img.save("debug_all.png")
|
|
440
|
+
```
|
|
441
|
+
|
|
442
|
+
---
|
|
443
|
+
|
|
444
|
+
## CLI
|
|
445
|
+
|
|
446
|
+
```bash
|
|
447
|
+
pip install "botlpdf[cli]"
|
|
448
|
+
```
|
|
449
|
+
|
|
450
|
+
### Extract text
|
|
451
|
+
|
|
452
|
+
```bash
|
|
453
|
+
# To stdout
|
|
454
|
+
botl-pdf text report.pdf
|
|
455
|
+
|
|
456
|
+
# To file
|
|
457
|
+
botl-pdf text report.pdf --output text.txt
|
|
458
|
+
|
|
459
|
+
# Specific pages
|
|
460
|
+
botl-pdf text report.pdf --pages 1-5
|
|
461
|
+
|
|
462
|
+
# Layout-preserved
|
|
463
|
+
botl-pdf text report.pdf --layout
|
|
464
|
+
```
|
|
465
|
+
|
|
466
|
+
### Show metadata
|
|
467
|
+
|
|
468
|
+
```bash
|
|
469
|
+
botl-pdf info report.pdf
|
|
470
|
+
```
|
|
471
|
+
|
|
472
|
+
Output:
|
|
473
|
+
```json
|
|
474
|
+
{
|
|
475
|
+
"version": "1.4",
|
|
476
|
+
"page_count": 42,
|
|
477
|
+
"encrypted": false,
|
|
478
|
+
"title": "Annual Report 2024",
|
|
479
|
+
"author": "Acme Corp",
|
|
480
|
+
"creator": "LaTeX",
|
|
481
|
+
"producer": "pdfTeX-1.40"
|
|
482
|
+
}
|
|
483
|
+
```
|
|
484
|
+
|
|
485
|
+
### Export
|
|
486
|
+
|
|
487
|
+
```bash
|
|
488
|
+
# Markdown
|
|
489
|
+
botl-pdf export report.pdf --format markdown --output report.md
|
|
490
|
+
|
|
491
|
+
# Plain text
|
|
492
|
+
botl-pdf export report.pdf --format text --output report.txt
|
|
493
|
+
```
|
|
494
|
+
|
|
495
|
+
---
|
|
496
|
+
|
|
497
|
+
## API Reference
|
|
498
|
+
|
|
499
|
+
### `botl_pdf.open(path_or_bytes, *, password=None, lazy=True) -> Document`
|
|
500
|
+
|
|
501
|
+
Open a PDF from a file path (str) or raw bytes.
|
|
502
|
+
|
|
503
|
+
### `Document`
|
|
504
|
+
|
|
505
|
+
| Property / Method | Type | Description |
|
|
506
|
+
|---|---|---|
|
|
507
|
+
| `.metadata` | `dict` | Metadata fields: title, author, subject, keywords, creator, producer, creation_date, mod_date, version, page_count |
|
|
508
|
+
| `.num_pages` | `int` | Number of pages |
|
|
509
|
+
| `.is_encrypted` | `bool` | Whether the document is encrypted |
|
|
510
|
+
| `.toc` | `list[TOCEntry]` | Table of contents / outline bookmarks |
|
|
511
|
+
| `.pages` | `PageCollection` | Iterable, subscriptable page access |
|
|
512
|
+
| `doc[i]` | `Page` | Shortcut for `doc.pages[i]` (supports negative indices) |
|
|
513
|
+
| `len(doc)` | `int` | Same as `.num_pages` |
|
|
514
|
+
|
|
515
|
+
### `Page` (via `doc.pages[i]`)
|
|
516
|
+
|
|
517
|
+
| Property / Method | Type | Description |
|
|
518
|
+
|---|---|---|
|
|
519
|
+
| `.extract_text(layout=False, layout_params=None)` | `str` | Extract text (plain or layout-preserved) |
|
|
520
|
+
| `.chars` | `list[Char]` | All characters with full style info |
|
|
521
|
+
| `.lines` | `list[GeomLine]` | Geometric lines on the page |
|
|
522
|
+
| `.rects` | `list[GeomRect]` | Geometric rectangles on the page |
|
|
523
|
+
| `.width` | `float` | Page width in points |
|
|
524
|
+
| `.height` | `float` | Page height in points |
|
|
525
|
+
| `.rotation` | `int` | Rotation in degrees (0, 90, 180, 270) |
|
|
526
|
+
| `.page_number` | `int` | Zero-based page index |
|
|
527
|
+
| `.label` | `str` | Page label string (e.g. "iii", "A-1") |
|
|
528
|
+
|
|
529
|
+
### `Char`
|
|
530
|
+
|
|
531
|
+
| Property | Type | Description |
|
|
532
|
+
|---|---|---|
|
|
533
|
+
| `.text` | `str` | Unicode character |
|
|
534
|
+
| `.bbox` | `BBox` | Bounding box |
|
|
535
|
+
| `.font_name` | `str` | Font resource name (e.g. "F1") |
|
|
536
|
+
| `.font_size` | `float` | Font size in points |
|
|
537
|
+
| `.bold` | `bool` | Bold flag |
|
|
538
|
+
| `.italic` | `bool` | Italic flag |
|
|
539
|
+
| `.color` | `tuple[float, float, float] or None` | Fill color (RGB, 0.0-1.0) |
|
|
540
|
+
| `.stroking_color` | `tuple[float, float, float] or None` | Stroke color (RGB, 0.0-1.0) |
|
|
541
|
+
| `.rotation` | `float` | Rotation in degrees |
|
|
542
|
+
| `.run_id` | `int` | Text operation ID (chars from same Tj/TJ share this) |
|
|
543
|
+
|
|
544
|
+
### `BBox`
|
|
545
|
+
|
|
546
|
+
| Property / Method | Type | Description |
|
|
547
|
+
|---|---|---|
|
|
548
|
+
| `.x0`, `.y0` | `float` | Top-left corner |
|
|
549
|
+
| `.x1`, `.y1` | `float` | Bottom-right corner |
|
|
550
|
+
| `.width` | `float` | Width (x1 - x0) |
|
|
551
|
+
| `.height` | `float` | Height (y1 - y0) |
|
|
552
|
+
| `.center()` | `(float, float)` | Center point |
|
|
553
|
+
| `.area()` | `float` | Area |
|
|
554
|
+
|
|
555
|
+
### `TOCEntry`
|
|
556
|
+
|
|
557
|
+
| Property | Type | Description |
|
|
558
|
+
|---|---|---|
|
|
559
|
+
| `.title` | `str` | Outline entry title |
|
|
560
|
+
| `.level` | `int` | Nesting depth (0 = top-level) |
|
|
561
|
+
| `.page_number` | `int or None` | 0-indexed destination page (None if unresolvable) |
|
|
562
|
+
| `.dest` | `str or None` | Raw destination string |
|
|
563
|
+
|
|
564
|
+
### `GeomLine`
|
|
565
|
+
|
|
566
|
+
| Property | Type | Description |
|
|
567
|
+
|---|---|---|
|
|
568
|
+
| `.x0`, `.y0` | `float` | Start point |
|
|
569
|
+
| `.x1`, `.y1` | `float` | End point |
|
|
570
|
+
| `.line_width` | `float` | Stroke width |
|
|
571
|
+
| `.color` | `tuple or None` | RGB color (0.0-1.0) |
|
|
572
|
+
|
|
573
|
+
### `GeomRect`
|
|
574
|
+
|
|
575
|
+
| Property | Type | Description |
|
|
576
|
+
|---|---|---|
|
|
577
|
+
| `.bbox` | `BBox` | Bounding box |
|
|
578
|
+
| `.line_width` | `float` | Stroke width |
|
|
579
|
+
| `.stroke_color` | `tuple or None` | Stroke RGB color |
|
|
580
|
+
| `.fill_color` | `tuple or None` | Fill RGB color |
|
|
581
|
+
|
|
582
|
+
### `LayoutParams`
|
|
583
|
+
|
|
584
|
+
| Parameter | Type | Default | Description |
|
|
585
|
+
|---|---|---|---|
|
|
586
|
+
| `word_margin` | `float` | `2.0` | Max horizontal gap between chars in same word, as a multiple of font size |
|
|
587
|
+
| `line_margin` | `float` | `0.5` | Max vertical gap between lines in same block, as a multiple of line height |
|
|
588
|
+
| `boxes_flow` | `float` | `0.5` | Reading-order direction (0.0 = strict horizontal, 1.0 = strict vertical) |
|
|
589
|
+
|
|
590
|
+
```python
|
|
591
|
+
params = botl_pdf.LayoutParams(word_margin=1.5, line_margin=0.3, boxes_flow=0.0)
|
|
592
|
+
text = page.extract_text(layout=True, layout_params=params)
|
|
593
|
+
```
|
|
594
|
+
|
|
595
|
+
---
|
|
596
|
+
|
|
597
|
+
## Architecture
|
|
598
|
+
|
|
599
|
+
```
|
|
600
|
+
PDF bytes
|
|
601
|
+
→ Parser (nom tokenizer + recursive-descent objects)
|
|
602
|
+
→ Content stream interpreter (Tj/TJ/q/Q/cm operators)
|
|
603
|
+
→ Character extraction (CMap, fonts, glyph widths)
|
|
604
|
+
→ Layout analysis (chars → words → lines → blocks)
|
|
605
|
+
→ Reading order (column detection, run de-interleaving)
|
|
606
|
+
→ Text output (plain or layout-preserved)
|
|
607
|
+
```
|
|
608
|
+
|
|
609
|
+
The pipeline is entirely custom Rust — no dependency on poppler, pdfium, pdfbox, or any other PDF library.
|
|
610
|
+
|
|
611
|
+
**Key design decisions:**
|
|
612
|
+
|
|
613
|
+
- **Run-aware de-interleaving** — Each Tj/TJ text operation tags characters with a `run_id`. When PDF producers interleave characters from different operations at alternating x-positions, the layout engine detects this and groups by run, preserving correct reading order.
|
|
614
|
+
- **Font-band separation** — Within a line, characters are grouped by font size to handle decorative initials and mixed-size text on the same visual line.
|
|
615
|
+
- **Lazy extraction** — Page content is decoded on first access and cached. The parsed `Document` is shared across pages via `Arc<Mutex>`, so there's no per-page re-parsing.
|
|
616
|
+
|
|
617
|
+
---
|
|
618
|
+
|
|
619
|
+
## Benchmarks
|
|
620
|
+
|
|
621
|
+
Tested against PyMuPDF on real-world PDFs (textbooks, novels, academic papers):
|
|
622
|
+
|
|
623
|
+
| PDF | Pages | botl-pdf words | PyMuPDF words | botl-pdf time | PyMuPDF time |
|
|
624
|
+
|---|---|---|---|---|---|
|
|
625
|
+
| Electrical engineering textbook | 100 | 35,435 | 34,708 | 238ms | 174ms |
|
|
626
|
+
| Discrete math textbook | 200 | 89,291 | 89,968 | 526ms | 426ms |
|
|
627
|
+
| French novel | 130 | 45,355 | 45,337 | 293ms | 214ms |
|
|
628
|
+
| American Revolution history | 293 | 100,954 | 99,897 | 591ms | 377ms |
|
|
629
|
+
| Rust Programming Language 3E | 560 | 200,177 | 196,748 | 1262ms | 873ms |
|
|
630
|
+
| Mystery novel | 300 | 89,610 | 88,604 | 568ms | 445ms |
|
|
631
|
+
| **Total** | **1583** | **560,822** | **555,262** | **3478ms** | **2509ms** |
|
|
632
|
+
|
|
633
|
+
Total word counts are within ~1% of PyMuPDF (individual documents differ by up to ~2%). botl-pdf is ~1.4x slower than PyMuPDF overall.
|
|
634
|
+
|
|
635
|
+
---
|
|
636
|
+
|
|
637
|
+
## Development
|
|
638
|
+
|
|
639
|
+
```bash
|
|
640
|
+
# Set up environment
|
|
641
|
+
python -m venv .venv && source .venv/bin/activate
|
|
642
|
+
pip install maturin pytest
|
|
643
|
+
|
|
644
|
+
# Build Rust extension in release mode
|
|
645
|
+
maturin develop --release
|
|
646
|
+
|
|
647
|
+
# Run Rust tests (198 tests)
|
|
648
|
+
cd rust && cargo test
|
|
649
|
+
|
|
650
|
+
# Run Python tests
|
|
651
|
+
pytest tests/python/
|
|
652
|
+
|
|
653
|
+
# Run benchmarks
|
|
654
|
+
pytest tests/python/benchmarks/ --benchmark-only
|
|
655
|
+
```
|
|
656
|
+
|
|
657
|
+
### Project structure
|
|
658
|
+
|
|
659
|
+
```
|
|
660
|
+
botl-pdf/
|
|
661
|
+
├── rust/
|
|
662
|
+
│ ├── botl-pdf-core/ # Core engine (parser, text, layout, codecs)
|
|
663
|
+
│ ├── botl-pdf-python/ # PyO3 bindings → _core native module
|
|
664
|
+
│ └── botl-pdf-csys/ # Image codec FFI (JPEG, JPEG2000)
|
|
665
|
+
├── python/botl_pdf/ # High-level Python API
|
|
666
|
+
│ ├── document.py # Document, PageCollection
|
|
667
|
+
│ ├── page.py # Page wrapper
|
|
668
|
+
│ ├── export.py # to_text(), to_markdown()
|
|
669
|
+
│ ├── debug.py # VisualDebugger (Pillow overlays)
|
|
670
|
+
│ ├── tables.py # Table/TableCell dataclasses
|
|
671
|
+
│ └── cli/main.py # CLI: text, info, export
|
|
672
|
+
├── tests/
|
|
673
|
+
│ ├── rust/ # Integration tests (parser, text, layout, geometry)
|
|
674
|
+
│ └── python/ # Unit + integration tests
|
|
675
|
+
└── docs/ # Sphinx docs
|
|
676
|
+
```
|
|
677
|
+
|
|
678
|
+
## License
|
|
679
|
+
|
|
680
|
+
Apache 2.0
|
|
681
|
+
|