botlpdf 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. botlpdf-0.1.2/PKG-INFO +681 -0
  2. botlpdf-0.1.2/README.md +629 -0
  3. botlpdf-0.1.2/pyproject.toml +79 -0
  4. botlpdf-0.1.2/python/botl_pdf/__init__.py +29 -0
  5. botlpdf-0.1.2/python/botl_pdf/_core.pyi +139 -0
  6. botlpdf-0.1.2/python/botl_pdf/cli/__init__.py +1 -0
  7. botlpdf-0.1.2/python/botl_pdf/cli/main.py +190 -0
  8. botlpdf-0.1.2/python/botl_pdf/debug.py +114 -0
  9. botlpdf-0.1.2/python/botl_pdf/document.py +75 -0
  10. botlpdf-0.1.2/python/botl_pdf/export.py +50 -0
  11. botlpdf-0.1.2/python/botl_pdf/ocr/__init__.py +5 -0
  12. botlpdf-0.1.2/python/botl_pdf/ocr/base.py +43 -0
  13. botlpdf-0.1.2/python/botl_pdf/page.py +67 -0
  14. botlpdf-0.1.2/python/botl_pdf/plugins/__init__.py +14 -0
  15. botlpdf-0.1.2/python/botl_pdf/plugins/registry.py +33 -0
  16. botlpdf-0.1.2/python/botl_pdf/tables.py +69 -0
  17. botlpdf-0.1.2/rust/Cargo.lock +1157 -0
  18. botlpdf-0.1.2/rust/Cargo.toml +32 -0
  19. botlpdf-0.1.2/rust/botl-pdf-core/Cargo.toml +47 -0
  20. botlpdf-0.1.2/rust/botl-pdf-core/src/codecs/ascii85.rs +104 -0
  21. botlpdf-0.1.2/rust/botl-pdf-core/src/codecs/asciihex.rs +59 -0
  22. botlpdf-0.1.2/rust/botl-pdf-core/src/codecs/dct.rs +13 -0
  23. botlpdf-0.1.2/rust/botl-pdf-core/src/codecs/flate.rs +41 -0
  24. botlpdf-0.1.2/rust/botl-pdf-core/src/codecs/jpx.rs +11 -0
  25. botlpdf-0.1.2/rust/botl-pdf-core/src/codecs/lzw.rs +141 -0
  26. botlpdf-0.1.2/rust/botl-pdf-core/src/codecs/mod.rs +90 -0
  27. botlpdf-0.1.2/rust/botl-pdf-core/src/codecs/runlength.rs +61 -0
  28. botlpdf-0.1.2/rust/botl-pdf-core/src/error.rs +44 -0
  29. botlpdf-0.1.2/rust/botl-pdf-core/src/geometry/bbox.rs +114 -0
  30. botlpdf-0.1.2/rust/botl-pdf-core/src/geometry/matrix.rs +128 -0
  31. botlpdf-0.1.2/rust/botl-pdf-core/src/geometry/mod.rs +6 -0
  32. botlpdf-0.1.2/rust/botl-pdf-core/src/geometry/spatial.rs +57 -0
  33. botlpdf-0.1.2/rust/botl-pdf-core/src/layout/elements.rs +180 -0
  34. botlpdf-0.1.2/rust/botl-pdf-core/src/layout/grouping.rs +231 -0
  35. botlpdf-0.1.2/rust/botl-pdf-core/src/layout/mod.rs +4 -0
  36. botlpdf-0.1.2/rust/botl-pdf-core/src/layout/ordering.rs +155 -0
  37. botlpdf-0.1.2/rust/botl-pdf-core/src/layout/strategy.rs +506 -0
  38. botlpdf-0.1.2/rust/botl-pdf-core/src/lib.rs +8 -0
  39. botlpdf-0.1.2/rust/botl-pdf-core/src/parser/document.rs +460 -0
  40. botlpdf-0.1.2/rust/botl-pdf-core/src/parser/incremental.rs +65 -0
  41. botlpdf-0.1.2/rust/botl-pdf-core/src/parser/lexer.rs +545 -0
  42. botlpdf-0.1.2/rust/botl-pdf-core/src/parser/mod.rs +5 -0
  43. botlpdf-0.1.2/rust/botl-pdf-core/src/parser/objects.rs +598 -0
  44. botlpdf-0.1.2/rust/botl-pdf-core/src/parser/xref.rs +422 -0
  45. botlpdf-0.1.2/rust/botl-pdf-core/src/text/cmap.rs +219 -0
  46. botlpdf-0.1.2/rust/botl-pdf-core/src/text/fonts.rs +388 -0
  47. botlpdf-0.1.2/rust/botl-pdf-core/src/text/mod.rs +4 -0
  48. botlpdf-0.1.2/rust/botl-pdf-core/src/text/operator.rs +869 -0
  49. botlpdf-0.1.2/rust/botl-pdf-core/src/text/unicode.rs +385 -0
  50. botlpdf-0.1.2/rust/botl-pdf-csys/Cargo.toml +17 -0
  51. botlpdf-0.1.2/rust/botl-pdf-csys/build.rs +4 -0
  52. botlpdf-0.1.2/rust/botl-pdf-csys/src/image.rs +31 -0
  53. botlpdf-0.1.2/rust/botl-pdf-csys/src/jpeg.rs +101 -0
  54. botlpdf-0.1.2/rust/botl-pdf-csys/src/jpx.rs +207 -0
  55. botlpdf-0.1.2/rust/botl-pdf-csys/src/lib.rs +11 -0
  56. botlpdf-0.1.2/rust/botl-pdf-python/Cargo.toml +16 -0
  57. botlpdf-0.1.2/rust/botl-pdf-python/src/codecs_reexport.rs +10 -0
  58. botlpdf-0.1.2/rust/botl-pdf-python/src/document.rs +320 -0
  59. botlpdf-0.1.2/rust/botl-pdf-python/src/elements.rs +519 -0
  60. botlpdf-0.1.2/rust/botl-pdf-python/src/errors.rs +49 -0
  61. botlpdf-0.1.2/rust/botl-pdf-python/src/lib.rs +42 -0
  62. botlpdf-0.1.2/rust/botl-pdf-python/src/page.rs +502 -0
  63. botlpdf-0.1.2/rust/botl-pdf-python/src/writer.rs +65 -0
botlpdf-0.1.2/PKG-INFO ADDED
@@ -0,0 +1,681 @@
1
+ Metadata-Version: 2.3
2
+ Name: botlpdf
3
+ Version: 0.1.2
4
+ Classifier: Development Status :: 3 - Alpha
5
+ Classifier: Intended Audience :: Developers
6
+ Classifier: License :: OSI Approved :: Apache Software License
7
+ Classifier: Operating System :: OS Independent
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Classifier: Programming Language :: Rust
14
+ Classifier: Topic :: Text Processing :: General
15
+ Classifier: Typing :: Typed
16
+ Requires-Dist: pillow >=10.0 ; extra == 'render'
17
+ Requires-Dist: pillow >=10.0 ; extra == 'debug'
18
+ Requires-Dist: matplotlib >=3.8 ; extra == 'debug'
19
+ Requires-Dist: pytesseract >=0.3 ; extra == 'ocr-tesseract'
20
+ Requires-Dist: pillow >=10.0 ; extra == 'ocr-tesseract'
21
+ Requires-Dist: easyocr >=1.7 ; extra == 'ocr-easyocr'
22
+ Requires-Dist: pillow >=10.0 ; extra == 'ocr-easyocr'
23
+ Requires-Dist: pandas >=2.0 ; extra == 'pandas'
24
+ Requires-Dist: typer >=0.12 ; extra == 'cli'
25
+ Requires-Dist: rich >=13.0 ; extra == 'cli'
26
+ Requires-Dist: botlpdf[render,debug,ocr-tesseract,pandas,cli] ; extra == 'all'
27
+ Requires-Dist: pytest >=8.0 ; extra == 'dev'
28
+ Requires-Dist: pytest-cov >=5.0 ; extra == 'dev'
29
+ Requires-Dist: pytest-benchmark >=4.0 ; extra == 'dev'
30
+ Requires-Dist: ruff >=0.5 ; extra == 'dev'
31
+ Requires-Dist: mypy >=1.10 ; extra == 'dev'
32
+ Requires-Dist: pre-commit >=3.0 ; extra == 'dev'
33
+ Requires-Dist: hypothesis >=6.0 ; extra == 'dev'
34
+ Provides-Extra: render
35
+ Provides-Extra: debug
36
+ Provides-Extra: ocr-tesseract
37
+ Provides-Extra: ocr-easyocr
38
+ Provides-Extra: pandas
39
+ Provides-Extra: cli
40
+ Provides-Extra: all
41
+ Provides-Extra: dev
42
+ License-File: LICENSE
43
+ Summary: High-performance PDF processing: extract text, tables, images with a Rust + C core.
44
+ Keywords: pdf,text-extraction,tables,layout-analysis,rust
45
+ Author: botl-pdf Contributors
46
+ License: Apache-2.0
47
+ Requires-Python: >=3.10
48
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
49
+ Project-URL: Homepage, https://github.com/Shivamjohri247/botl-pdf
50
+ Project-URL: Repository, https://github.com/Shivamjohri247/botl-pdf
51
+
52
+ # botl-pdf
53
+
54
+ High-performance PDF text extraction library with a custom Rust core and Python bindings. No dependency on poppler, pdfium, or pdfbox — the entire PDF parsing and text extraction pipeline is written from scratch.
55
+
56
+ ## Features
57
+
58
+ - Fast text extraction with layout analysis
59
+ - Character-level output with bounding boxes, fonts, colors, and styles
60
+ - Layout-preserving text extraction (spatial whitespace)
61
+ - Table of contents (TOC/outline) extraction with page numbers
62
+ - Document metadata extraction (title, author, dates, etc.)
63
+ - Geometric element extraction (lines, rectangles)
64
+ - Configurable layout parameters (word spacing, line grouping, reading order)
65
+ - Run-aware de-interleaving for correct reading order on complex PDFs
66
+ - Pythonic API with type hints throughout
67
+ - CLI for common operations
68
+ - Zero external PDF library dependencies
69
+
70
+ ## Install
71
+
72
+ ```bash
73
+ pip install botlpdf
74
+ ```
75
+
76
+ Build from source (requires Rust toolchain):
77
+
78
+ ```bash
79
+ pip install maturin
80
+ git clone https://github.com/Shivamjohri247/botl-pdf.git
81
+ cd botl-pdf
82
+ maturin develop --release
83
+ ```
84
+
85
+ ---
86
+
87
+ ## Quick Start
88
+
89
+ ```python
90
+ import botl_pdf
91
+
92
+ doc = botl_pdf.open("report.pdf")
93
+ text = doc.pages[0].extract_text()
94
+ print(text)
95
+ ```
96
+
97
+ ---
98
+
99
+ ## Opening Documents
100
+
101
+ ### From a file path
102
+
103
+ ```python
104
+ import botl_pdf
105
+
106
+ doc = botl_pdf.open("report.pdf")
107
+ print(f"Pages: {doc.num_pages}")
108
+ print(f"Encrypted: {doc.is_encrypted}")
109
+ ```
110
+
111
+ ### From bytes
112
+
113
+ ```python
114
+ with open("report.pdf", "rb") as f:
115
+ data = f.read()
116
+
117
+ doc = botl_pdf.open(data)
118
+ print(f"Pages: {doc.num_pages}")
119
+ ```
120
+
121
+ ### As a context manager
122
+
123
+ ```python
124
+ with botl_pdf.open("report.pdf") as doc:
125
+ text = doc.pages[0].extract_text()
126
+ ```
127
+
128
+ ---
129
+
130
+ ## Text Extraction
131
+
132
+ ### Plain text (default)
133
+
134
+ Returns clean, readable text. Blocks are separated by double newlines, lines by single newlines, words by spaces.
135
+
136
+ ```python
137
+ doc = botl_pdf.open("report.pdf")
138
+
139
+ # Single page
140
+ text = doc.pages[0].extract_text()
141
+ print(text)
142
+
143
+ # All pages
144
+ for page in doc.pages:
145
+ print(page.extract_text())
146
+
147
+ # Subscript access (0-based, supports negative)
148
+ text_last = doc.pages[-1].extract_text()
149
+ ```
150
+
151
+ ### Layout-preserving text
152
+
153
+ Maintains spatial positioning using proportional spaces between words. Useful when you need to preserve visual alignment of columns, tables, or indented text.
154
+
155
+ ```python
156
+ doc = botl_pdf.open("financial_report.pdf")
157
+ page = doc.pages[0]
158
+
159
+ # Layout mode preserves spatial whitespace
160
+ layout_text = page.extract_text(layout=True)
161
+ print(layout_text)
162
+ ```
163
+
164
+ ### Tuning extraction parameters
165
+
166
+ ```python
167
+ import botl_pdf
168
+
169
+ doc = botl_pdf.open("two_column.pdf")
170
+
171
+ # Tighter word grouping (merge chars closer together)
172
+ params = botl_pdf.LayoutParams(
173
+ word_margin=1.5, # max horizontal gap in same word (× font_size), default 2.0
174
+ line_margin=0.5, # max vertical gap in same block (× line height), default 0.5
175
+ boxes_flow=0.5, # reading order: 0.0=horizontal, 1.0=vertical, default 0.5
176
+ )
177
+
178
+ text = doc.pages[0].extract_text(layout=True, layout_params=params)
179
+ ```
180
+
181
+ ### Exporting entire documents
182
+
183
+ ```python
184
+ from botl_pdf.export import to_text, to_markdown
185
+
186
+ # Plain text for all pages
187
+ full_text = to_text("report.pdf")
188
+
189
+ # Layout-preserved text
190
+ full_text_layout = to_text("report.pdf", layout=True)
191
+
192
+ # Markdown (pages separated by horizontal rules)
193
+ markdown = to_markdown("report.pdf")
194
+
195
+ # Specific page range only
196
+ markdown_subset = to_markdown("report.pdf", pages=range(0, 5))
197
+ ```
198
+
199
+ ---
200
+
201
+ ## Character-Level Access
202
+
203
+ Each page exposes individual characters with full style information: bounding box, font name, font size, bold/italic flags, fill and stroke colors, rotation, and run ID.
204
+
205
+ ### Inspecting individual characters
206
+
207
+ ```python
208
+ doc = botl_pdf.open("report.pdf")
209
+ page = doc.pages[0]
210
+
211
+ for char in page.chars[:5]:
212
+ print(f" char={char.text!r} "
213
+ f"pos=({char.bbox.x0:.1f}, {char.bbox.y0:.1f}) "
214
+ f"size={char.font_size:.1f} "
215
+ f"font={char.font_name}")
216
+ ```
217
+
218
+ Output:
219
+ ```
220
+ char='H' pos=(100.0, 700.0) size=12.0 font=F1
221
+ char='e' pos=(108.0, 700.0) size=12.0 font=F1
222
+ char='l' pos=(115.0, 700.0) size=12.0 font=F1
223
+ char='l' pos=(120.0, 700.0) size=12.0 font=F1
224
+ char='o' pos=(125.0, 700.0) size=12.0 font=F1
225
+ ```
226
+
227
+ ### Finding text by style
228
+
229
+ ```python
230
+ # Find all bold characters on page 0
231
+ bold_chars = [c for c in doc.pages[0].chars if c.bold]
232
+ bold_text = "".join(c.text for c in bold_chars)
233
+
234
+ # Find characters in a specific color (e.g., red links)
235
+ red_chars = [
236
+ c for c in doc.pages[0].chars
237
+ if c.color and c.color[0] > 0.8 and c.color[1] < 0.2 and c.color[2] < 0.2
238
+ ]
239
+
240
+ # Find large decorative initials (font size > 30)
241
+ initials = [c for c in doc.pages[0].chars if c.font_size > 30]
242
+ for c in initials:
243
+ print(f"Decorative initial: {c.text!r} at size {c.font_size:.0f}")
244
+ ```
245
+
246
+ ### Extracting text from a region
247
+
248
+ ```python
249
+ # Get all text in a specific rectangular area
250
+ x0, y0, x1, y1 = 100.0, 600.0, 400.0, 700.0
251
+
252
+ region_chars = [
253
+ c for c in doc.pages[0].chars
254
+ if c.bbox.x0 >= x0 and c.bbox.x1 <= x1
255
+ and c.bbox.y0 >= y0 and c.bbox.y1 <= y1
256
+ ]
257
+ region_text = "".join(c.text for c in region_chars)
258
+ print(region_text)
259
+ ```
260
+
261
+ ### Run ID tracking
262
+
263
+ Characters from the same text-showing operation (Tj/TJ) share a `run_id`. This lets you group characters by their PDF text operation — useful for debugging extraction issues or understanding the PDF's internal structure.
264
+
265
+ ```python
266
+ from collections import defaultdict
267
+
268
+ # Group characters by their source text operation
269
+ runs = defaultdict(str)
270
+ for c in doc.pages[0].chars:
271
+ runs[c.run_id] += c.text
272
+
273
+ for run_id, text in sorted(runs.items()):
274
+ print(f" Run {run_id}: {text[:60]!r}")
275
+ ```
276
+
277
+ ---
278
+
279
+ ## Document Metadata
280
+
281
+ ```python
282
+ doc = botl_pdf.open("report.pdf")
283
+
284
+ meta = doc.metadata
285
+ print(f"Title: {meta.get('title')}")
286
+ print(f"Author: {meta.get('author')}")
287
+ print(f"Subject: {meta.get('subject')}")
288
+ print(f"Creator: {meta.get('creator')}")
289
+ print(f"Producer: {meta.get('producer')}")
290
+ print(f"Created: {meta.get('creation_date')}")
291
+ print(f"Modified: {meta.get('mod_date')}")
292
+ print(f"Version: {meta.get('version')}")
293
+ ```
294
+
295
+ ---
296
+
297
+ ## Table of Contents
298
+
299
+ ```python
300
+ doc = botl_pdf.open("book.pdf")
301
+
302
+ toc = doc.toc
303
+ for entry in toc:
304
+ indent = " " * entry.level
305
+ page = entry.page_number
306
+ print(f"{indent}{entry.title} → page {page}")
307
+ ```
308
+
309
+ Output:
310
+ ```
311
+ Preface → page 5
312
+ Acknowledgments → page 7
313
+ Part I. Foundations → page 11
314
+ Chapter 1. Introduction → page 13
315
+ Chapter 2. Methods → page 27
316
+ Part II. Results → page 45
317
+ Chapter 3. Analysis → page 47
318
+ ```
319
+
320
+ ### Building a page lookup from TOC
321
+
322
+ ```python
323
+ # Map page numbers to their chapter titles
324
+ chapters = {}
325
+ current_chapter = None
326
+ for entry in doc.toc:
327
+ if entry.level == 0 and entry.page_number is not None:
328
+ current_chapter = entry.title
329
+ if current_chapter and entry.page_number is not None:
330
+ chapters[entry.page_number] = current_chapter
331
+
332
+ # Find which chapter a page belongs to
333
+ def chapter_for_page(page_idx):
334
+ page_nums = sorted(chapters.keys())
335
+ for i, p in enumerate(page_nums):
336
+ if page_idx < p:
337
+ return chapters[page_nums[max(0, i - 1)]] if i > 0 else None
338
+ return chapters[page_nums[-1]]
339
+
340
+ print(f"Page 30 is in: {chapter_for_page(30)}")
341
+ ```
342
+
343
+ ---
344
+
345
+ ## Geometric Elements
346
+
347
+ Pages expose geometric lines and rectangles drawn on the PDF canvas — useful for detecting table borders, rules, decorative elements, and form fields.
348
+
349
+ ### Lines
350
+
351
+ ```python
352
+ page = doc.pages[0]
353
+
354
+ for line in page.lines:
355
+ print(f" Line ({line.x0:.1f},{line.y0:.1f}) → ({line.x1:.1f},{line.y1:.1f}) "
356
+ f"width={line.line_width:.1f}")
357
+ ```
358
+
359
+ ### Rectangles
360
+
361
+ ```python
362
+ for rect in page.rects:
363
+ fill = rect.fill_color
364
+ stroke = rect.stroke_color
365
+ print(f" Rect ({rect.bbox.x0:.1f},{rect.bbox.y0:.1f})-"
366
+ f"({rect.bbox.x1:.1f},{rect.bbox.y1:.1f}) "
367
+ f"stroke={stroke} fill={fill}")
368
+ ```
369
+
370
+ ### Detecting horizontal rules
371
+
372
+ ```python
373
+ # Find horizontal lines (useful for detecting separators/tables)
374
+ h_rules = [
375
+ line for line in page.lines
376
+ if abs(line.y1 - line.y0) < 1.0 and (line.x1 - line.x0) > 50.0
377
+ ]
378
+
379
+ for rule in h_rules:
380
+ print(f"Horizontal rule at y={rule.y0:.1f} from x={rule.x0:.1f} to x={rule.x1:.1f}")
381
+ ```
382
+
383
+ ---
384
+
385
+ ## Page Properties
386
+
387
+ ```python
388
+ doc = botl_pdf.open("report.pdf")
389
+
390
+ for i, page in enumerate(doc.pages):
391
+ print(f"Page {i}: {page.width:.0f}×{page.height:.0f}pt "
392
+ f"rotation={page.rotation}° "
393
+ f"label={page.label!r}")
394
+ ```
395
+
396
+ Output:
397
+ ```
398
+ Page 0: 612×792pt rotation=0° label='1'
399
+ Page 1: 612×792pt rotation=0° label='2'
400
+ ```
401
+
402
+ Common page sizes:
403
+ - Letter: 612 × 792 pt (8.5" × 11")
404
+ - A4: 595 × 842 pt (210mm × 297mm)
405
+
406
+ ---
407
+
408
+ ## Visual Debugging
409
+
410
+ Requires `Pillow`. Draws bounding boxes and geometric elements on a rendered page image — useful for debugging extraction issues or understanding PDF layout.
411
+
412
+ ```bash
413
+ pip install "botlpdf[debug]"
414
+ ```
415
+
416
+ ```python
417
+ from botl_pdf.debug import VisualDebugger
418
+ import botl_pdf
419
+
420
+ doc = botl_pdf.open("report.pdf")
421
+ page = doc.pages[0]
422
+
423
+ debugger = VisualDebugger(page)
424
+
425
+ # Draw character bounding boxes (red)
426
+ img = debugger.draw_chars(resolution=150)
427
+ img.save("debug_chars.png")
428
+
429
+ # Draw geometric lines (blue)
430
+ img = debugger.draw_lines(resolution=150)
431
+ img.save("debug_lines.png")
432
+
433
+ # Draw geometric rectangles (green)
434
+ img = debugger.draw_rects(resolution=150)
435
+ img.save("debug_rects.png")
436
+
437
+ # All elements layered together
438
+ img = debugger.draw_all(resolution=150)
439
+ img.save("debug_all.png")
440
+ ```
441
+
442
+ ---
443
+
444
+ ## CLI
445
+
446
+ ```bash
447
+ pip install "botlpdf[cli]"
448
+ ```
449
+
450
+ ### Extract text
451
+
452
+ ```bash
453
+ # To stdout
454
+ botl-pdf text report.pdf
455
+
456
+ # To file
457
+ botl-pdf text report.pdf --output text.txt
458
+
459
+ # Specific pages
460
+ botl-pdf text report.pdf --pages 1-5
461
+
462
+ # Layout-preserved
463
+ botl-pdf text report.pdf --layout
464
+ ```
465
+
466
+ ### Show metadata
467
+
468
+ ```bash
469
+ botl-pdf info report.pdf
470
+ ```
471
+
472
+ Output:
473
+ ```json
474
+ {
475
+ "version": "1.4",
476
+ "page_count": 42,
477
+ "encrypted": false,
478
+ "title": "Annual Report 2024",
479
+ "author": "Acme Corp",
480
+ "creator": "LaTeX",
481
+ "producer": "pdfTeX-1.40"
482
+ }
483
+ ```
484
+
485
+ ### Export
486
+
487
+ ```bash
488
+ # Markdown
489
+ botl-pdf export report.pdf --format markdown --output report.md
490
+
491
+ # Plain text
492
+ botl-pdf export report.pdf --format text --output report.txt
493
+ ```
494
+
495
+ ---
496
+
497
+ ## API Reference
498
+
499
+ ### `botl_pdf.open(path_or_bytes, *, password=None, lazy=True) -> Document`
500
+
501
+ Open a PDF from a file path (str) or raw bytes.
502
+
503
+ ### `Document`
504
+
505
+ | Property / Method | Type | Description |
506
+ |---|---|---|
507
+ | `.metadata` | `dict` | Metadata fields: title, author, subject, keywords, creator, producer, creation_date, mod_date, version, page_count |
508
+ | `.num_pages` | `int` | Number of pages |
509
+ | `.is_encrypted` | `bool` | Whether the document is encrypted |
510
+ | `.toc` | `list[TOCEntry]` | Table of contents / outline bookmarks |
511
+ | `.pages` | `PageCollection` | Iterable, subscriptable page access |
512
+ | `doc[i]` | `PyPage` | Shortcut for `doc.pages[i]` (supports negative indices) |
513
+ | `len(doc)` | `int` | Same as `.num_pages` |
514
+
515
+ ### `Page` (via `doc.pages[i]`)
516
+
517
+ | Property / Method | Type | Description |
518
+ |---|---|---|
519
+ | `.extract_text(layout=False, layout_params=None)` | `str` | Extract text (plain or layout-preserved) |
520
+ | `.chars` | `list[Char]` | All characters with full style info |
521
+ | `.lines` | `list[GeomLine]` | Geometric lines on the page |
522
+ | `.rects` | `list[GeomRect]` | Geometric rectangles on the page |
523
+ | `.width` | `float` | Page width in points |
524
+ | `.height` | `float` | Page height in points |
525
+ | `.rotation` | `int` | Rotation in degrees (0, 90, 180, 270) |
526
+ | `.page_number` | `int` | Zero-based page index |
527
+ | `.label` | `str` | Page label string (e.g. "iii", "A-1") |
528
+
529
+ ### `Char`
530
+
531
+ | Property | Type | Description |
532
+ |---|---|---|
533
+ | `.text` | `str` | Unicode character |
534
+ | `.bbox` | `BBox` | Bounding box |
535
+ | `.font_name` | `str` | Font resource name (e.g. "F1") |
536
+ | `.font_size` | `float` | Font size in points |
537
+ | `.bold` | `bool` | Bold flag |
538
+ | `.italic` | `bool` | Italic flag |
539
+ | `.color` | `tuple[float, float, float] or None` | Fill color (RGB, 0.0-1.0) |
540
+ | `.stroking_color` | `tuple[float, float, float] or None` | Stroke color (RGB, 0.0-1.0) |
541
+ | `.rotation` | `float` | Rotation in degrees |
542
+ | `.run_id` | `int` | Text operation ID (chars from same Tj/TJ share this) |
543
+
544
+ ### `BBox`
545
+
546
+ | Property / Method | Type | Description |
547
+ |---|---|---|
548
+ | `.x0`, `.y0` | `float` | Top-left corner |
549
+ | `.x1`, `.y1` | `float` | Bottom-right corner |
550
+ | `.width` | `float` | Width (x1 - x0) |
551
+ | `.height` | `float` | Height (y1 - y0) |
552
+ | `.center()` | `(float, float)` | Center point |
553
+ | `.area()` | `float` | Area |
554
+
555
+ ### `TOCEntry`
556
+
557
+ | Property | Type | Description |
558
+ |---|---|---|
559
+ | `.title` | `str` | Outline entry title |
560
+ | `.level` | `int` | Nesting depth (0 = top-level) |
561
+ | `.page_number` | `int or None` | 0-indexed destination page (None if unresolvable) |
562
+ | `.dest` | `str or None` | Raw destination string |
563
+
564
+ ### `GeomLine`
565
+
566
+ | Property | Type | Description |
567
+ |---|---|---|
568
+ | `.x0`, `.y0` | `float` | Start point |
569
+ | `.x1`, `.y1` | `float` | End point |
570
+ | `.line_width` | `float` | Stroke width |
571
+ | `.color` | `tuple or None` | RGB color (0.0-1.0) |
572
+
573
+ ### `GeomRect`
574
+
575
+ | Property | Type | Description |
576
+ |---|---|---|
577
+ | `.bbox` | `BBox` | Bounding box |
578
+ | `.line_width` | `float` | Stroke width |
579
+ | `.stroke_color` | `tuple or None` | Stroke RGB color |
580
+ | `.fill_color` | `tuple or None` | Fill RGB color |
581
+
582
+ ### `LayoutParams`
583
+
584
+ | Parameter | Type | Default | Description |
585
+ |---|---|---|---|
586
+ | `word_margin` | `float` | `2.0` | Max horizontal gap between chars in same word, as a multiple of font size |
587
+ | `line_margin` | `float` | `0.5` | Max vertical gap between lines in same block, as a multiple of line height |
588
+ | `boxes_flow` | `float` | `0.5` | Reading-order direction (0.0 = strict horizontal, 1.0 = strict vertical) |
589
+
590
+ ```python
591
+ params = botl_pdf.LayoutParams(word_margin=1.5, line_margin=0.3, boxes_flow=0.0)
592
+ text = page.extract_text(layout=True, layout_params=params)
593
+ ```
594
+
595
+ ---
596
+
597
+ ## Architecture
598
+
599
+ ```
600
+ PDF bytes
601
+ → Parser (nom tokenizer + recursive-descent objects)
602
+ → Content stream interpreter (Tj/TJ/q/Q/cm operators)
603
+ → Character extraction (CMap, fonts, glyph widths)
604
+ → Layout analysis (chars → words → lines → blocks)
605
+ → Reading order (column detection, run de-interleaving)
606
+ → Text output (plain or layout-preserved)
607
+ ```
608
+
609
+ The pipeline is entirely custom Rust — no dependency on poppler, pdfium, pdfbox, or any other PDF library.
610
+
611
+ **Key design decisions:**
612
+
613
+ - **Run-aware de-interleaving** — Each Tj/TJ text operation tags characters with a `run_id`. When PDF producers interleave characters from different operations at alternating x-positions, the layout engine detects this and groups by run, preserving correct reading order.
614
+ - **Font-band separation** — Within a line, characters are grouped by font size to handle decorative initials and mixed-size text on the same visual line.
615
+ - **Lazy extraction** — Page content is decoded on first access and cached. The parsed `Document` is shared across pages via `Arc<Mutex>`, so there's no per-page re-parsing.
616
+
617
+ ---
618
+
619
+ ## Benchmarks
620
+
621
+ Tested against PyMuPDF on real-world PDFs (textbooks, novels, academic papers):
622
+
623
+ | PDF | Pages | botl-pdf words | PyMuPDF words | botl-pdf time | PyMuPDF time |
624
+ |---|---|---|---|---|---|
625
+ | Electrical engineering textbook | 100 | 35,435 | 34,708 | 238ms | 174ms |
626
+ | Discrete math textbook | 200 | 89,291 | 89,968 | 526ms | 426ms |
627
+ | French novel | 130 | 45,355 | 45,337 | 293ms | 214ms |
628
+ | American Revolution history | 293 | 100,954 | 99,897 | 591ms | 377ms |
629
+ | Rust Programming Language 3E | 560 | 200,177 | 196,748 | 1262ms | 873ms |
630
+ | Mystery novel | 300 | 89,610 | 88,604 | 568ms | 445ms |
631
+ | **Total** | **1583** | **560,822** | **555,262** | **3478ms** | **2509ms** |
632
+
633
+ Total word counts are within ~1% of PyMuPDF (individual documents differ by up to ~2%). botl-pdf is ~1.4x slower overall.
634
+
635
+ ---
636
+
637
+ ## Development
638
+
639
+ ```bash
640
+ # Set up environment
641
+ python -m venv .venv && source .venv/bin/activate
642
+ pip install maturin pytest
643
+
644
+ # Build Rust extension in release mode
645
+ maturin develop --release
646
+
647
+ # Run Rust tests (198 tests)
648
+ cd rust && cargo test
649
+
650
+ # Run Python tests
651
+ pytest tests/python/
652
+
653
+ # Run benchmarks
654
+ pytest tests/python/benchmarks/ --benchmark-only
655
+ ```
656
+
657
+ ### Project structure
658
+
659
+ ```
660
+ botl-pdf/
661
+ ├── rust/
662
+ │ ├── botl-pdf-core/ # Core engine (parser, text, layout, codecs)
663
+ │ ├── botl-pdf-python/ # PyO3 bindings → _core native module
664
+ │ └── botl-pdf-csys/ # Image codec FFI (JPEG, JPEG2000)
665
+ ├── python/botl_pdf/ # High-level Python API
666
+ │ ├── document.py # Document, PageCollection
667
+ │ ├── page.py # Page wrapper
668
+ │ ├── export.py # to_text(), to_markdown()
669
+ │ ├── debug.py # VisualDebugger (Pillow overlays)
670
+ │ ├── tables.py # Table/TableCell dataclasses
671
+ │ └── cli/main.py # CLI: text, info, export
672
+ ├── tests/
673
+ │ ├── rust/ # Integration tests (parser, text, layout, geometry)
674
+ │ └── python/ # Unit + integration tests
675
+ └── docs/ # Sphinx docs
676
+ ```
677
+
678
+ ## License
679
+
680
+ Apache 2.0
681
+