doctra 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. doctra/__init__.py +19 -0
  2. doctra/cli/__init__.py +27 -0
  3. doctra/cli/main.py +856 -0
  4. doctra/cli/utils.py +340 -0
  5. doctra/engines/__init__.py +0 -0
  6. doctra/engines/layout/__init__.py +0 -0
  7. doctra/engines/layout/layout_models.py +90 -0
  8. doctra/engines/layout/paddle_layout.py +225 -0
  9. doctra/engines/ocr/__init__.py +4 -0
  10. doctra/engines/ocr/api.py +36 -0
  11. doctra/engines/ocr/path_resolver.py +48 -0
  12. doctra/engines/ocr/pytesseract_engine.py +76 -0
  13. doctra/engines/vlm/__init__.py +0 -0
  14. doctra/engines/vlm/outlines_types.py +31 -0
  15. doctra/engines/vlm/provider.py +58 -0
  16. doctra/engines/vlm/service.py +117 -0
  17. doctra/exporters/__init__.py +0 -0
  18. doctra/exporters/excel_writer.py +197 -0
  19. doctra/exporters/image_saver.py +42 -0
  20. doctra/exporters/markdown_table.py +56 -0
  21. doctra/exporters/markdown_writer.py +29 -0
  22. doctra/parsers/__init__.py +6 -0
  23. doctra/parsers/layout_order.py +16 -0
  24. doctra/parsers/structured_pdf_parser.py +434 -0
  25. doctra/parsers/table_chart_extractor.py +283 -0
  26. doctra/utils/__init__.py +0 -0
  27. doctra/utils/bbox.py +18 -0
  28. doctra/utils/constants.py +8 -0
  29. doctra/utils/file_ops.py +26 -0
  30. doctra/utils/io_utils.py +10 -0
  31. doctra/utils/ocr_utils.py +20 -0
  32. doctra/utils/pdf_io.py +19 -0
  33. doctra/utils/quiet.py +13 -0
  34. doctra/utils/structured_utils.py +49 -0
  35. doctra/version.py +2 -0
  36. doctra-0.1.0.dist-info/METADATA +626 -0
  37. doctra-0.1.0.dist-info/RECORD +40 -0
  38. doctra-0.1.0.dist-info/WHEEL +5 -0
  39. doctra-0.1.0.dist-info/licenses/LICENSE +201 -0
  40. doctra-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,626 @@
1
+ Metadata-Version: 2.4
2
+ Name: doctra
3
+ Version: 0.1.0
4
+ Summary: Parse, extract, and analyze documents with ease
5
+ Home-page: https://github.com/AdemBoukhris457/Doctra
6
+ Author: Adem Boukhris
7
+ Author-email: Adem Boukhris <boukhrisadam98@gmail.com>
8
+ License: Apache License
9
+ Version 2.0, January 2004
10
+ http://www.apache.org/licenses/
11
+
12
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
13
+
14
+ 1. Definitions.
15
+
16
+ "License" shall mean the terms and conditions for use, reproduction,
17
+ and distribution as defined by Sections 1 through 9 of this document.
18
+
19
+ "Licensor" shall mean the copyright owner or entity authorized by
20
+ the copyright owner that is granting the License.
21
+
22
+ "Legal Entity" shall mean the union of the acting entity and all
23
+ other entities that control, are controlled by, or are under common
24
+ control with that entity. For the purposes of this definition,
25
+ "control" means (i) the power, direct or indirect, to cause the
26
+ direction or management of such entity, whether by contract or
27
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
28
+ outstanding shares, or (iii) beneficial ownership of such entity.
29
+
30
+ "You" (or "Your") shall mean an individual or Legal Entity
31
+ exercising permissions granted by this License.
32
+
33
+ "Source" form shall mean the preferred form for making modifications,
34
+ including but not limited to software source code, documentation
35
+ source, and configuration files.
36
+
37
+ "Object" form shall mean any form resulting from mechanical
38
+ transformation or translation of a Source form, including but
39
+ not limited to compiled object code, generated documentation,
40
+ and conversions to other media types.
41
+
42
+ "Work" shall mean the work of authorship, whether in Source or
43
+ Object form, made available under the License, as indicated by a
44
+ copyright notice that is included in or attached to the work
45
+ (an example is provided in the Appendix below).
46
+
47
+ "Derivative Works" shall mean any work, whether in Source or Object
48
+ form, that is based on (or derived from) the Work and for which the
49
+ editorial revisions, annotations, elaborations, or other modifications
50
+ represent, as a whole, an original work of authorship. For the purposes
51
+ of this License, Derivative Works shall not include works that remain
52
+ separable from, or merely link (or bind by name) to the interfaces of,
53
+ the Work and Derivative Works thereof.
54
+
55
+ "Contribution" shall mean any work of authorship, including
56
+ the original version of the Work and any modifications or additions
57
+ to that Work or Derivative Works thereof, that is intentionally
58
+ submitted to Licensor for inclusion in the Work by the copyright owner
59
+ or by an individual or Legal Entity authorized to submit on behalf of
60
+ the copyright owner. For the purposes of this definition, "submitted"
61
+ means any form of electronic, verbal, or written communication sent
62
+ to the Licensor or its representatives, including but not limited to
63
+ communication on electronic mailing lists, source code control systems,
64
+ and issue tracking systems that are managed by, or on behalf of, the
65
+ Licensor for the purpose of discussing and improving the Work, but
66
+ excluding communication that is conspicuously marked or otherwise
67
+ designated in writing by the copyright owner as "Not a Contribution."
68
+
69
+ "Contributor" shall mean Licensor and any individual or Legal Entity
70
+ on behalf of whom a Contribution has been received by Licensor and
71
+ subsequently incorporated within the Work.
72
+
73
+ 2. Grant of Copyright License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ copyright license to reproduce, prepare Derivative Works of,
77
+ publicly display, publicly perform, sublicense, and distribute the
78
+ Work and such Derivative Works in Source or Object form.
79
+
80
+ 3. Grant of Patent License. Subject to the terms and conditions of
81
+ this License, each Contributor hereby grants to You a perpetual,
82
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
83
+ (except as stated in this section) patent license to make, have made,
84
+ use, offer to sell, sell, import, and otherwise transfer the Work,
85
+ where such license applies only to those patent claims licensable
86
+ by such Contributor that are necessarily infringed by their
87
+ Contribution(s) alone or by combination of their Contribution(s)
88
+ with the Work to which such Contribution(s) was submitted. If You
89
+ institute patent litigation against any entity (including a
90
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
91
+ or a Contribution incorporated within the Work constitutes direct
92
+ or contributory patent infringement, then any patent licenses
93
+ granted to You under this License for that Work shall terminate
94
+ as of the date such litigation is filed.
95
+
96
+ 4. Redistribution. You may reproduce and distribute copies of the
97
+ Work or Derivative Works thereof in any medium, with or without
98
+ modifications, and in Source or Object form, provided that You
99
+ meet the following conditions:
100
+
101
+ (a) You must give any other recipients of the Work or
102
+ Derivative Works a copy of this License; and
103
+
104
+ (b) You must cause any modified files to carry prominent notices
105
+ stating that You changed the files; and
106
+
107
+ (c) You must retain, in the Source form of any Derivative Works
108
+ that You distribute, all copyright, patent, trademark, and
109
+ attribution notices from the Source form of the Work,
110
+ excluding those notices that do not pertain to any part of
111
+ the Derivative Works; and
112
+
113
+ (d) If the Work includes a "NOTICE" text file as part of its
114
+ distribution, then any Derivative Works that You distribute must
115
+ include a readable copy of the attribution notices contained
116
+ within such NOTICE file, excluding those notices that do not
117
+ pertain to any part of the Derivative Works, in at least one
118
+ of the following places: within a NOTICE text file distributed
119
+ as part of the Derivative Works; within the Source form or
120
+ documentation, if provided along with the Derivative Works; or,
121
+ within a display generated by the Derivative Works, if and
122
+ wherever such third-party notices normally appear. The contents
123
+ of the NOTICE file are for informational purposes only and
124
+ do not modify the License. You may add Your own attribution
125
+ notices within Derivative Works that You distribute, alongside
126
+ or as an addendum to the NOTICE text from the Work, provided
127
+ that such additional attribution notices cannot be construed
128
+ as modifying the License.
129
+
130
+ You may add Your own copyright statement to Your modifications and
131
+ may provide additional or different license terms and conditions
132
+ for use, reproduction, or distribution of Your modifications, or
133
+ for any such Derivative Works as a whole, provided Your use,
134
+ reproduction, and distribution of the Work otherwise complies with
135
+ the conditions stated in this License.
136
+
137
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
138
+ any Contribution intentionally submitted for inclusion in the Work
139
+ by You to the Licensor shall be under the terms and conditions of
140
+ this License, without any additional terms or conditions.
141
+ Notwithstanding the above, nothing herein shall supersede or modify
142
+ the terms of any separate license agreement you may have executed
143
+ with Licensor regarding such Contributions.
144
+
145
+ 6. Trademarks. This License does not grant permission to use the trade
146
+ names, trademarks, service marks, or product names of the Licensor,
147
+ except as required for reasonable and customary use in describing the
148
+ origin of the Work and reproducing the content of the NOTICE file.
149
+
150
+ 7. Disclaimer of Warranty. Unless required by applicable law or
151
+ agreed to in writing, Licensor provides the Work (and each
152
+ Contributor provides its Contributions) on an "AS IS" BASIS,
153
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
154
+ implied, including, without limitation, any warranties or conditions
155
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
156
+ PARTICULAR PURPOSE. You are solely responsible for determining the
157
+ appropriateness of using or redistributing the Work and assume any
158
+ risks associated with Your exercise of permissions under this License.
159
+
160
+ 8. Limitation of Liability. In no event and under no legal theory,
161
+ whether in tort (including negligence), contract, or otherwise,
162
+ unless required by applicable law (such as deliberate and grossly
163
+ negligent acts) or agreed to in writing, shall any Contributor be
164
+ liable to You for damages, including any direct, indirect, special,
165
+ incidental, or consequential damages of any character arising as a
166
+ result of this License or out of the use or inability to use the
167
+ Work (including but not limited to damages for loss of goodwill,
168
+ work stoppage, computer failure or malfunction, or any and all
169
+ other commercial damages or losses), even if such Contributor
170
+ has been advised of the possibility of such damages.
171
+
172
+ 9. Accepting Warranty or Additional Liability. While redistributing
173
+ the Work or Derivative Works thereof, You may choose to offer,
174
+ and charge a fee for, acceptance of support, warranty, indemnity,
175
+ or other liability obligations and/or rights consistent with this
176
+ License. However, in accepting such obligations, You may act only
177
+ on Your own behalf and on Your sole responsibility, not on behalf
178
+ of any other Contributor, and only if You agree to indemnify,
179
+ defend, and hold each Contributor harmless for any liability
180
+ incurred by, or claims asserted against, such Contributor by reason
181
+ of your accepting any such warranty or additional liability.
182
+
183
+ END OF TERMS AND CONDITIONS
184
+
185
+ APPENDIX: How to apply the Apache License to your work.
186
+
187
+ To apply the Apache License to your work, attach the following
188
+ boilerplate notice, with the fields enclosed by brackets "[]"
189
+ replaced with your own identifying information. (Don't include
190
+ the brackets!) The text should be enclosed in the appropriate
191
+ comment syntax for the file format. We also recommend that a
192
+ file or class name and description of purpose be included on the
193
+ same "printed page" as the copyright notice for easier
194
+ identification within third-party archives.
195
+
196
+ Copyright [yyyy] [name of copyright owner]
197
+
198
+ Licensed under the Apache License, Version 2.0 (the "License");
199
+ you may not use this file except in compliance with the License.
200
+ You may obtain a copy of the License at
201
+
202
+ http://www.apache.org/licenses/LICENSE-2.0
203
+
204
+ Unless required by applicable law or agreed to in writing, software
205
+ distributed under the License is distributed on an "AS IS" BASIS,
206
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
207
+ See the License for the specific language governing permissions and
208
+ limitations under the License.
209
+
210
+ Project-URL: Homepage, https://github.com/AdemBoukhris457/Doctra
211
+ Project-URL: Repository, https://github.com/AdemBoukhris457/Doctra
212
+ Project-URL: Issues, https://github.com/AdemBoukhris457/Doctra/issues
213
+ Keywords: pdf,parser,document,extraction,ocr,layout-detection
214
+ Classifier: Development Status :: 4 - Beta
215
+ Classifier: Intended Audience :: Developers
216
+ Classifier: License :: OSI Approved :: Apache Software License
217
+ Classifier: Operating System :: OS Independent
218
+ Classifier: Programming Language :: Python :: 3
219
+ Classifier: Programming Language :: Python :: 3.8
220
+ Classifier: Programming Language :: Python :: 3.9
221
+ Classifier: Programming Language :: Python :: 3.10
222
+ Classifier: Programming Language :: Python :: 3.11
223
+ Classifier: Programming Language :: Python :: 3.12
224
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
225
+ Classifier: Topic :: Text Processing :: Markup
226
+ Classifier: Topic :: Multimedia :: Graphics :: Graphics Conversion
227
+ Requires-Python: >=3.8
228
+ Description-Content-Type: text/markdown
229
+ License-File: LICENSE
230
+ Requires-Dist: paddlepaddle>=2.4.0
231
+ Requires-Dist: paddleocr>=2.6.0
232
+ Requires-Dist: pillow>=8.0.0
233
+ Requires-Dist: opencv-python>=4.5.0
234
+ Requires-Dist: pandas>=1.3.0
235
+ Requires-Dist: openpyxl>=3.0.0
236
+ Requires-Dist: pytesseract>=0.1.3
237
+ Requires-Dist: outlines>=0.0.34
238
+ Requires-Dist: tqdm>=4.62.0
239
+ Requires-Dist: matplotlib>=3.5.0
240
+ Provides-Extra: openai
241
+ Requires-Dist: openai>=1.0.0; extra == "openai"
242
+ Provides-Extra: gemini
243
+ Requires-Dist: google-generativeai>=0.3.0; extra == "gemini"
244
+ Provides-Extra: dev
245
+ Requires-Dist: pytest>=6.0; extra == "dev"
246
+ Requires-Dist: pytest-cov>=2.0; extra == "dev"
247
+ Requires-Dist: black>=21.0; extra == "dev"
248
+ Requires-Dist: isort>=5.0; extra == "dev"
249
+ Requires-Dist: flake8>=3.9; extra == "dev"
250
+ Requires-Dist: mypy>=0.910; extra == "dev"
251
+ Requires-Dist: pre-commit>=2.15.0; extra == "dev"
252
+ Dynamic: author
253
+ Dynamic: home-page
254
+ Dynamic: license-file
255
+ Dynamic: requires-python
256
+
257
+ # 🚀 **Doctra - Document Parser Library** 📑🔎
258
+
259
+ ![Doctra_Logo.png](assets/Doctra_Logo.png)
260
+
261
+ <div align="center">
262
+
263
+ [![GitHub Stars](https://img.shields.io/github/stars/AdemBoukhris457/Doctra?style=social)](https://github.com/AdemBoukhris457/Doctra/stargazers)
264
+ [![GitHub Forks](https://img.shields.io/github/forks/AdemBoukhris457/Doctra?style=social)](https://github.com/AdemBoukhris457/Doctra/forks)
265
+
266
+ </div>
267
+
268
+ ## 📋 Table of Contents
269
+
270
+ - [Installation](#installation)
271
+ - [Quick Start](#quick-start)
272
+ - [Core Components](#core-components)
273
+   - [StructuredPDFParser](#structuredpdfparser)
274
+   - [ChartTablePDFParser](#charttablepdfparser)
275
+ - [Visualization](#visualization)
276
+ - [Usage Examples](#usage-examples)
277
+ - [Features](#features)
278
+ - [Requirements](#requirements)
279
+
280
+ ## 🛠️ Installation
281
+
282
+ ### From PyPI (recommended)
283
+
284
+ ```bash
285
+ pip install doctra
286
+ ```
287
+
288
+ ### From source
289
+
290
+ ```bash
291
+ git clone https://github.com/AdemBoukhris457/Doctra.git
292
+ cd Doctra
293
+ pip install .
294
+ ```
295
+
296
+ ## ⚡ Quick Start
297
+
298
+ ```python
299
+ from doctra.parsers.structured_pdf_parser import StructuredPDFParser
300
+
301
+ # Initialize the parser
302
+ parser = StructuredPDFParser()
303
+
304
+ # Parse a PDF document
305
+ parser.parse("path/to/your/document.pdf")
306
+ ```
307
+
308
+ ## 🔧 Core Components
309
+
310
+ ### StructuredPDFParser
311
+
312
+ The `StructuredPDFParser` is a comprehensive PDF parser that extracts all types of content from PDF documents. It processes PDFs through layout detection, extracts text using OCR, saves images for visual elements, and optionally converts charts/tables to structured data using Vision Language Models (VLM).
313
+
314
+ #### Key Features:
315
+ - **Layout Detection**: Uses PaddleOCR for accurate document layout analysis
316
+ - **OCR Processing**: Extracts text from all document elements
317
+ - **Visual Element Extraction**: Saves figures, charts, and tables as images
318
+ - **VLM Integration**: Optional conversion of visual elements to structured data
319
+ - **Multiple Output Formats**: Generates Markdown, Excel, and structured JSON
320
+
321
+ #### Basic Usage:
322
+
323
+ ```python
324
+ from doctra.parsers.structured_pdf_parser import StructuredPDFParser
325
+
326
+ # Basic parser without VLM
327
+ parser = StructuredPDFParser()
328
+
329
+ # Parser with VLM for structured data extraction
330
+ parser = StructuredPDFParser(
331
+ use_vlm=True,
332
+ vlm_provider="openai", # or "gemini"
333
+ vlm_api_key="your_api_key_here"
334
+ )
335
+
336
+ # Parse document
337
+ parser.parse("document.pdf")
338
+ ```
339
+
340
+ #### Advanced Configuration:
341
+
342
+ ```python
343
+ parser = StructuredPDFParser(
344
+ # VLM Settings
345
+ use_vlm=True,
346
+ vlm_provider="openai",
347
+ vlm_model="gpt-4o",
348
+ vlm_api_key="your_api_key",
349
+
350
+ # Layout Detection Settings
351
+ layout_model_name="PP-DocLayout_plus-L",
352
+ dpi=200,
353
+ min_score=0.0,
354
+
355
+ # OCR Settings
356
+ ocr_lang="eng",
357
+ ocr_psm=4,
358
+ ocr_oem=3,
359
+ ocr_extra_config="",
360
+
361
+ # Output Settings
362
+ box_separator="\n"
363
+ )
364
+ ```
365
+
366
+ ### ChartTablePDFParser
367
+
368
+ The `ChartTablePDFParser` is a specialized parser focused specifically on extracting charts and tables from PDF documents. It's optimized for scenarios where you only need these specific elements, providing faster processing and more targeted output.
369
+
370
+ #### Key Features:
371
+ - **Focused Extraction**: Extracts only charts and/or tables
372
+ - **Selective Processing**: Choose to extract charts, tables, or both
373
+ - **VLM Integration**: Optional conversion to structured data
374
+ - **Organized Output**: Separate directories for charts and tables
375
+ - **Progress Tracking**: Real-time progress bars for extraction
376
+
377
+ #### Basic Usage:
378
+
379
+ ```python
380
+ from doctra.parsers.table_chart_extractor import ChartTablePDFParser
381
+
382
+ # Extract both charts and tables
383
+ parser = ChartTablePDFParser(
384
+ extract_charts=True,
385
+ extract_tables=True
386
+ )
387
+
388
+ # Extract only charts
389
+ parser = ChartTablePDFParser(
390
+ extract_charts=True,
391
+ extract_tables=False
392
+ )
393
+
394
+ # Parse with custom output directory
395
+ parser.parse("document.pdf", output_base_dir="my_outputs")
396
+ ```
397
+
398
+ #### Advanced Configuration:
399
+
400
+ ```python
401
+ parser = ChartTablePDFParser(
402
+ # Extraction Settings
403
+ extract_charts=True,
404
+ extract_tables=True,
405
+
406
+ # VLM Settings
407
+ use_vlm=True,
408
+ vlm_provider="openai",
409
+ vlm_model="gpt-4o",
410
+ vlm_api_key="your_api_key",
411
+
412
+ # Layout Detection Settings
413
+ layout_model_name="PP-DocLayout_plus-L",
414
+ dpi=200,
415
+ min_score=0.0
416
+ )
417
+ ```
418
+
419
+ ## 🎨 Visualization
420
+
421
+ Doctra provides powerful visualization capabilities to help you understand how the layout detection works and verify the accuracy of element extraction.
422
+
423
+ ### Layout Detection Visualization
424
+
425
+ The `StructuredPDFParser` includes a built-in visualization method that displays PDF pages with bounding boxes overlaid on detected elements. This is perfect for:
426
+
427
+ - **Debugging**: Verify that layout detection is working correctly
428
+ - **Quality Assurance**: Check the accuracy of element identification
429
+ - **Documentation**: Create visual documentation of extraction results
430
+ - **Analysis**: Understand document structure and layout patterns
431
+
432
+ #### Basic Visualization:
433
+
434
+ ```python
435
+ from doctra.parsers.structured_pdf_parser import StructuredPDFParser
436
+
437
+ # Initialize parser
438
+ parser = StructuredPDFParser()
439
+
440
+ # Display visualization (opens in default image viewer)
441
+ parser.display_pages_with_boxes("document.pdf")
442
+ ```
443
+
444
+ #### Advanced Visualization with Custom Settings:
445
+
446
+ ```python
447
+ # Custom visualization configuration
448
+ parser.display_pages_with_boxes(
449
+ pdf_path="document.pdf",
450
+ num_pages=5, # Number of pages to visualize
451
+ cols=3, # Number of columns in grid
452
+ page_width=600, # Width of each page in pixels
453
+ spacing=30, # Spacing between pages
454
+ save_path="layout_visualization.png" # Save to file instead of displaying
455
+ )
456
+ ```
457
+
458
+ #### Visualization Features:
459
+
460
+ - **Color-coded Elements**: Each element type (text, table, chart, figure) has a distinct color
461
+ - **Confidence Scores**: Shows detection confidence for each element
462
+ - **Grid Layout**: Multiple pages displayed in an organized grid
463
+ - **Interactive Legend**: Color legend showing all detected element types
464
+ - **High Quality**: High-resolution output suitable for documentation
465
+ - **Flexible Output**: Display on screen or save to file
466
+
467
+ #### Example Output:
468
+
469
+ The visualization shows:
470
+ - **Blue boxes**: Text elements
471
+ - **Red boxes**: Tables
472
+ - **Green boxes**: Charts
473
+ - **Orange boxes**: Figures
474
+ - **Labels**: Element type and confidence score (e.g., "table (0.95)")
475
+ - **Page titles**: Page number and element count
476
+ - **Summary statistics**: Total elements detected by type
477
+
478
+ ### Use Cases for Visualization:
479
+
480
+ 1. **Document Analysis**: Quickly assess document structure and complexity
481
+ 2. **Quality Control**: Verify extraction accuracy before processing
482
+ 3. **Debugging**: Identify issues with layout detection
483
+ 4. **Documentation**: Create visual reports of extraction results
484
+ 5. **Training**: Help users understand how the system works
485
+
486
+ ### Visualization Configuration Options:
487
+
488
+ | Parameter | Default | Description |
489
+ |-----------|---------|-------------|
490
+ | `num_pages` | 3 | Number of pages to visualize |
491
+ | `cols` | 2 | Number of columns in grid layout |
492
+ | `page_width` | 800 | Width of each page in pixels |
493
+ | `spacing` | 40 | Spacing between pages in pixels |
494
+ | `save_path` | None | Path to save visualization (if None, displays on screen) |
495
+
496
+ ## 📖 Usage Examples
497
+
498
+ ### Example 1: Basic Document Processing
499
+
500
+ ```python
501
+ from doctra.parsers.structured_pdf_parser import StructuredPDFParser
502
+
503
+ # Initialize parser
504
+ parser = StructuredPDFParser()
505
+
506
+ # Process document
507
+ parser.parse("financial_report.pdf")
508
+
509
+ # Output will be saved to: outputs/financial_report/
510
+ # - Extracted text content
511
+ # - Cropped images of figures, charts, and tables
512
+ # - Markdown file with all content
513
+ ```
514
+
515
+ ### Example 2: Chart and Table Extraction with VLM
516
+
517
+ ```python
518
+ from doctra.parsers.table_chart_extractor import ChartTablePDFParser
519
+
520
+ # Initialize parser with VLM
521
+ parser = ChartTablePDFParser(
522
+ extract_charts=True,
523
+ extract_tables=True,
524
+ use_vlm=True,
525
+ vlm_provider="openai",
526
+ vlm_api_key="your_openai_api_key"
527
+ )
528
+
529
+ # Process document
530
+ parser.parse("data_report.pdf", output_base_dir="extracted_data")
531
+
532
+ # Output will include:
533
+ # - Cropped chart and table images
534
+ # - Structured data in Excel format
535
+ # - Markdown tables with extracted data
536
+ ```
537
+
538
+ ### Example 3: Custom Configuration
539
+
540
+ ```python
541
+ from doctra.parsers.structured_pdf_parser import StructuredPDFParser
542
+
543
+ # Custom configuration for high-quality processing
544
+ parser = StructuredPDFParser(
545
+ use_vlm=True,
546
+ vlm_provider="openai",
547
+ vlm_api_key="your_openai_api_key",
548
+ vlm_model="gpt-4o",
549
+ layout_model_name="PP-DocLayout_plus-L",
550
+ dpi=300, # Higher DPI for better quality
551
+ min_score=0.5, # Higher confidence threshold
552
+ ocr_lang="eng",
553
+ ocr_psm=6, # Uniform block of text
554
+ box_separator="\n\n" # Double line breaks between elements
555
+ )
556
+
557
+ parser.parse("complex_document.pdf")
558
+ ```
559
+
560
+ ### Example 4: Layout Visualization
561
+
562
+ ```python
563
+ from doctra.parsers.structured_pdf_parser import StructuredPDFParser
564
+
565
+ # Initialize parser
566
+ parser = StructuredPDFParser()
567
+
568
+ # Create a comprehensive visualization
569
+ parser.display_pages_with_boxes(
570
+ pdf_path="research_paper.pdf",
571
+ num_pages=6, # Visualize first 6 pages
572
+ cols=2, # 2 columns layout
573
+ page_width=700, # Larger pages for better detail
574
+ spacing=50, # More spacing between pages
575
+ save_path="research_paper_layout.png" # Save for documentation
576
+ )
577
+
578
+ # For quick preview (displays on screen)
579
+ parser.display_pages_with_boxes("document.pdf")
580
+ ```
581
+
582
+ ## ✨ Features
583
+
584
+ ### 🔍 Layout Detection
585
+ - Advanced document layout analysis using PaddleOCR
586
+ - Accurate identification of text, tables, charts, and figures
587
+ - Configurable confidence thresholds
588
+
589
+ ### 📝 OCR Processing
590
+ - High-quality text extraction using Tesseract
591
+ - Support for multiple languages
592
+ - Configurable OCR parameters
593
+
594
+ ### 🖼️ Visual Element Extraction
595
+ - Automatic cropping and saving of figures, charts, and tables
596
+ - Organized output directory structure
597
+ - High-resolution image preservation
598
+
599
+ ### 🤖 VLM Integration
600
+ - Vision Language Model support for structured data extraction
601
+ - Multiple provider options (Gemini, OpenAI)
602
+ - Automatic conversion of charts and tables to structured formats
603
+
604
+ ### 📊 Multiple Output Formats
605
+ - **Markdown**: Human-readable document with embedded images and tables
606
+ - **Excel**: Structured data in spreadsheet format
607
+ - **JSON**: Programmatically accessible structured data
608
+ - **Images**: High-quality cropped visual elements
609
+
610
+ ### ⚙️ Flexible Configuration
611
+ - Extensive customization options
612
+ - Performance tuning parameters
613
+ - Output format selection
614
+
615
+ ## 📋 Requirements
616
+
617
+ ### Core Dependencies
618
+ - **PaddleOCR**: Document layout detection
619
+ - **Outlines**: Structured output generation
620
+ - **Tesseract**: OCR text extraction
621
+ - **Pillow**: Image processing
622
+ - **OpenCV**: Computer vision operations
623
+ - **Pandas**: Data manipulation
624
+ - **OpenPyXL**: Excel file generation
625
+ - **Google Generative AI** (optional, `pip install "doctra[gemini]"`): For Gemini VLM integration
626
+ - **OpenAI** (optional, `pip install "doctra[openai]"`): For GPT-4 VLM integration
@@ -0,0 +1,40 @@
1
+ doctra/__init__.py,sha256=-Pkx0Vh4Hz3EQvLaxlL6Mo4lVig59FTN5LvUcxThn4U,519
2
+ doctra/version.py,sha256=GPuOaof41_BpIl3xsohGQ10EKqKhbb0ZGSUcSV09uMI,60
3
+ doctra/cli/__init__.py,sha256=4PTujjYRShOOUlZ7PwuWckShPWLC4v4CYIhJpzgyv1k,911
4
+ doctra/cli/main.py,sha256=O3Bgov3rtf58AJHmuojJaptrH17X1mw19iTplId3gGo,35327
5
+ doctra/cli/utils.py,sha256=QuttjEtBiFrOHmqZz4mjbf3GWZe26lYChPwa23Loz_4,11314
6
+ doctra/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ doctra/engines/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ doctra/engines/layout/layout_models.py,sha256=vuTzjWd3FD-SkFPngktmUVhOJ6Xvff6ufwFEq796PQs,3162
9
+ doctra/engines/layout/paddle_layout.py,sha256=Yf6_OtBq_RSup8CSDofJUZxM_bJMBlCC0eSv5ib1uNk,9364
10
+ doctra/engines/ocr/__init__.py,sha256=h6bFiveGXdI59fsKzCqOXki3C74DCndEmvloOtMqnR0,133
11
+ doctra/engines/ocr/api.py,sha256=YOBKDLExXpvSiOsc_TDJasaMPxzdVx1llQCtYlsruWo,1280
12
+ doctra/engines/ocr/path_resolver.py,sha256=2_7Nsekt3dCDU3oVsgdr62iMrlAhbGNfYwgh4G7S3pA,1492
13
+ doctra/engines/ocr/pytesseract_engine.py,sha256=Imz2uwju6himkBiS8CH7DLxBRe-LtmMYZiOdb_6PoQw,2911
14
+ doctra/engines/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ doctra/engines/vlm/outlines_types.py,sha256=qL-G6MNiA5mxp1qAPVEFhOANp4NqVt_MQKseJCr_xXE,970
16
+ doctra/engines/vlm/provider.py,sha256=ws-04Jhuvg0a3vXzz8cfMWIiwldoIFs3i_qSb2Q6enA,2137
17
+ doctra/engines/vlm/service.py,sha256=cONhekqKfGo2fe-2g7YT89BHxytdjGhCSFyU3sJUzWI,4966
18
+ doctra/exporters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
+ doctra/exporters/excel_writer.py,sha256=U5Eb5SF7_ll1QveUapSWSkCRt3OEoisKEVUQ_7X8Wjo,7762
20
+ doctra/exporters/image_saver.py,sha256=zsPoQ0CwoE643ui4iZMdXk96kv5mU8L_zC2JfF22N1A,1639
21
+ doctra/exporters/markdown_table.py,sha256=4_OJIwG_WoIPYBzJx1njy_3tNVdkK6QKSP-P9r-b0zw,2030
22
+ doctra/exporters/markdown_writer.py,sha256=L7EjF2MB8jYX7XkZ3a3NeeEC8gnb0qzRPTzIN9tdfuw,1027
23
+ doctra/parsers/__init__.py,sha256=8M6LVzcWGpuTIK_1SMXML3ll7zK1CTHXGI5qXvqdm-A,206
24
+ doctra/parsers/layout_order.py,sha256=W6b-T11H907RZ2FaZwNvnYhmvH11rpUzxC5yLkdf28k,640
25
+ doctra/parsers/structured_pdf_parser.py,sha256=g0k9XsSJRVnJg4omrEC1Ef1MWZZ3Ve2OnXjMoc6IScU,19953
26
+ doctra/parsers/table_chart_extractor.py,sha256=A-rjazOmx6d_8CbZXdebE4NsYYqiQP0wQktTfCp_pwI,12669
27
+ doctra/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
+ doctra/utils/bbox.py,sha256=R2-95p0KiWvet3TH27TQVvCar7WJg6z0u3L21iEDF-A,674
29
+ doctra/utils/constants.py,sha256=ZWOvNDrvETbQ_pxHiX7vUW4J5Oj8_qnov0QacUOBizI,189
30
+ doctra/utils/file_ops.py,sha256=3IS0EQncs6Kaj27fcg2zxQX3xRSvtItIsyKGLYgeOgw,815
31
+ doctra/utils/io_utils.py,sha256=L1bWV4-ybs2j_3ZEN7GfQVgdC73JKVECVnpwKbP0dy0,219
32
+ doctra/utils/ocr_utils.py,sha256=Doa1uYBg3kRgRYd2aPq9fICHgHfrM_efdhZfI7jl6OM,780
33
+ doctra/utils/pdf_io.py,sha256=c8EY47Z1iqVtlLFHS_n0qGuXJ5ERFaMUd84ivXV0b9E,706
34
+ doctra/utils/quiet.py,sha256=5XPS-1CtJ0sVk6qgSQctdhr_wR8mP1xoJLoUbmkXROA,387
35
+ doctra/utils/structured_utils.py,sha256=EdNhCUDLKvYcLqXbTGveNtIRGyQ3yzYhTh-zy_awwM4,1450
36
+ doctra-0.1.0.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
37
+ doctra-0.1.0.dist-info/METADATA,sha256=vaZojx28o38pv-3FoEzmI5uJpl_NAGKkbmKr_yWVOig,26653
38
+ doctra-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
39
+ doctra-0.1.0.dist-info/top_level.txt,sha256=jI7E8jHci2gP9y0GYaWxlg9jG0O5n3FjHJJPLXDXMds,7
40
+ doctra-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+