natural-pdf 25.3.16__tar.gz → 25.3.17.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {natural_pdf-25.3.16/natural_pdf.egg-info → natural_pdf-25.3.17.2}/PKG-INFO +25 -3
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/README.md +23 -2
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/index.md +2 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/installation/index.md +0 -10
- natural_pdf-25.3.17.2/examples/direct_qa_example.py +71 -0
- natural_pdf-25.3.17.2/examples/docling_comprehensive_test.py +325 -0
- natural_pdf-25.3.17.2/examples/docling_example.py +192 -0
- natural_pdf-25.3.17.2/examples/docling_hierarchy_example.py +230 -0
- natural_pdf-25.3.17.2/examples/docling_text_sources.py +241 -0
- natural_pdf-25.3.17.2/examples/improved_qa_example.py +66 -0
- natural_pdf-25.3.17.2/examples/url_pdf_example.py +45 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/analyzers/document_layout.py +276 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/core/page.py +72 -21
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/core/pdf.py +102 -71
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/elements/region.py +174 -19
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/qa/document_qa.py +29 -38
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/selectors/parser.py +6 -2
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2/natural_pdf.egg-info}/PKG-INFO +25 -3
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf.egg-info/SOURCES.txt +6 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf.egg-info/requires.txt +1 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/setup.py +4 -2
- natural_pdf-25.3.16/examples/direct_qa_example.py +0 -165
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/LICENSE +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/MANIFEST.in +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/api/index.md +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/assets/favicon.png +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/assets/social-preview.png +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/document-qa/index.md +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/element-selection/index.md +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/explanations/index.md +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/explanations/ocr-challenges.md +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/explanations/pdf-extraction-challenges.md +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/explanations/pdf-fonts.md +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/layout-analysis/index.md +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/ocr/index.md +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/pdf-navigation/index.md +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/regions/index.md +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/tables/index.md +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/text-extraction/index.md +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/visual-debugging/index.md +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/__init__.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/another_exclusion_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/basic_usage.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/boundary_exclusion_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/boundary_inclusion_fix_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/chainable_layout_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/color_basic_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/color_name_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/color_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/debug_ocr.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/direct_ocr_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/direct_paddle_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/document_layout_analysis.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/document_qa_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/exclusion_count_debug.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/exclusion_debug.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/exclusion_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/exclusion_optimization_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/extract_text_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/font_aware_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/font_variant_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/footer_overlap_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/highlight_all_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/highlight_attributes_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/highlight_confidence_display.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/highlight_demo.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/highlight_float_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/highlight_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/highlighting_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/image_width_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/improved_api_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/layout_confidence_display_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/layout_confidence_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/layout_coordinate_debug.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/layout_highlight_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/logging_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/ocr_comprehensive.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/ocr_debug_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/ocr_default_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/ocr_engine_comparison.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/ocr_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/ocr_simplified_params.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/ocr_visualization.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/ocr_visualization_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/paddle_layout_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/paddle_layout_simple.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/paddleocr_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/page_collection_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/polygon_highlight_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/position_methods_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/region_boundary_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/region_exclusion_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/region_expand_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/region_image_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/region_ocr_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/region_sections_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/school_books.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/school_books_all.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/scouring.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/section_extraction_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/simple_document_qa.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/spatial_navigation_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/table_extraction_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/table_structure_detection.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/tatr_cells_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/tatr_ocr_table_test.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/text_search_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/text_style_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/tiny-text.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/until_boundaries_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/until_example.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/very_basics.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/__init__.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/analyzers/__init__.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/analyzers/text_structure.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/core/__init__.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/elements/__init__.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/elements/base.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/elements/collections.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/elements/line.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/elements/rect.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/elements/text.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/ocr/__init__.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/ocr/easyocr_engine.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/ocr/engine.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/ocr/paddleocr_engine.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/qa/__init__.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/selectors/__init__.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/templates/__init__.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/templates/ocr_debug.html +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/utils/__init__.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/utils/highlighting.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/utils/ocr.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/utils/reading_order.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/utils/visualization.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf.egg-info/dependency_links.txt +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf.egg-info/top_level.txt +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/pyproject.toml +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/setup.cfg +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/tests/__init__.py +0 -0
- {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/tests/test_pdf.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 25.3.16
|
3
|
+
Version: 25.3.17.2
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Home-page: https://github.com/jsoma/natural-pdf
|
6
6
|
Author: Jonathan Soma
|
@@ -15,6 +15,7 @@ Requires-Dist: pdfplumber>=0.7.0
|
|
15
15
|
Requires-Dist: Pillow>=8.0.0
|
16
16
|
Requires-Dist: colour>=0.1.5
|
17
17
|
Requires-Dist: numpy>=1.20.0
|
18
|
+
Requires-Dist: urllib3>=1.26.0
|
18
19
|
Requires-Dist: doclayout_yolo>=0.0.3
|
19
20
|
Requires-Dist: torch>=2.0.0
|
20
21
|
Requires-Dist: torchvision>=0.15.0
|
@@ -58,7 +59,8 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
|
|
58
59
|
|
59
60
|
Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
|
60
61
|
|
61
|
-
[Complete documentation here](https://jsoma.github.io/natural-pdf)
|
62
|
+
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
|
63
|
+
- [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
|
62
64
|
|
63
65
|
## Features
|
64
66
|
|
@@ -96,9 +98,12 @@ pip install natural-pdf[easyocr,paddle]
|
|
96
98
|
```python
|
97
99
|
from natural_pdf import PDF
|
98
100
|
|
99
|
-
# Open a PDF
|
101
|
+
# Open a local PDF
|
100
102
|
pdf = PDF('document.pdf')
|
101
103
|
|
104
|
+
# Or open a PDF from a URL
|
105
|
+
pdf = PDF('https://example.com/document.pdf')
|
106
|
+
|
102
107
|
# Get the first page
|
103
108
|
page = pdf.pages[0]
|
104
109
|
|
@@ -263,6 +268,23 @@ Logs follow a hierarchical structure matching the library's module organization:
|
|
263
268
|
- `natural_pdf.analyzers` - Layout analysis operations
|
264
269
|
- `natural_pdf.ocr` - OCR engine operations
|
265
270
|
|
271
|
+
## Document QA
|
272
|
+
|
273
|
+
Ask questions directly to your documents:
|
274
|
+
|
275
|
+
```python
|
276
|
+
# Ask questions about the document content
|
277
|
+
result = pdf.ask("What was the company's revenue in 2022?")
|
278
|
+
print(f"Answer: {result['answer']}")
|
279
|
+
print(f"Confidence: {result['confidence']:.2f}")
|
280
|
+
|
281
|
+
# Access more details in the result dictionary
|
282
|
+
result = pdf.ask("Who is the CEO?")
|
283
|
+
print(f"Answer: {result['answer']}")
|
284
|
+
print(f"Found on page: {result['page_num']}")
|
285
|
+
print(f"Source text: {result.get('source_text', 'N/A')}")
|
286
|
+
```
|
287
|
+
|
266
288
|
## More details
|
267
289
|
|
268
290
|
[Complete documentation here](https://jsoma.github.io/natural-pdf)
|
@@ -4,7 +4,8 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
|
|
4
4
|
|
5
5
|
Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
|
6
6
|
|
7
|
-
[Complete documentation here](https://jsoma.github.io/natural-pdf)
|
7
|
+
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
|
8
|
+
- [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
|
8
9
|
|
9
10
|
## Features
|
10
11
|
|
@@ -42,9 +43,12 @@ pip install natural-pdf[easyocr,paddle]
|
|
42
43
|
```python
|
43
44
|
from natural_pdf import PDF
|
44
45
|
|
45
|
-
# Open a PDF
|
46
|
+
# Open a local PDF
|
46
47
|
pdf = PDF('document.pdf')
|
47
48
|
|
49
|
+
# Or open a PDF from a URL
|
50
|
+
pdf = PDF('https://example.com/document.pdf')
|
51
|
+
|
48
52
|
# Get the first page
|
49
53
|
page = pdf.pages[0]
|
50
54
|
|
@@ -209,6 +213,23 @@ Logs follow a hierarchical structure matching the library's module organization:
|
|
209
213
|
- `natural_pdf.analyzers` - Layout analysis operations
|
210
214
|
- `natural_pdf.ocr` - OCR engine operations
|
211
215
|
|
216
|
+
## Document QA
|
217
|
+
|
218
|
+
Ask questions directly to your documents:
|
219
|
+
|
220
|
+
```python
|
221
|
+
# Ask questions about the document content
|
222
|
+
result = pdf.ask("What was the company's revenue in 2022?")
|
223
|
+
print(f"Answer: {result['answer']}")
|
224
|
+
print(f"Confidence: {result['confidence']:.2f}")
|
225
|
+
|
226
|
+
# Access more details in the result dictionary
|
227
|
+
result = pdf.ask("Who is the CEO?")
|
228
|
+
print(f"Answer: {result['answer']}")
|
229
|
+
print(f"Found on page: {result['page_num']}")
|
230
|
+
print(f"Source text: {result.get('source_text', 'N/A')}")
|
231
|
+
```
|
232
|
+
|
212
233
|
## More details
|
213
234
|
|
214
235
|
[Complete documentation here](https://jsoma.github.io/natural-pdf)
|
@@ -4,6 +4,8 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
|
|
4
4
|
|
5
5
|
Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
|
6
6
|
|
7
|
+
- [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
|
8
|
+
|
7
9
|
## Quick Example
|
8
10
|
|
9
11
|
```python
|
@@ -4,20 +4,10 @@ Let's get Natural PDF installed and run your first extraction.
|
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
7
|
-
Natural PDF is available on PyPI. The simplest way to install it is with pip:
|
8
|
-
|
9
7
|
```bash
|
10
8
|
pip install natural-pdf
|
11
9
|
```
|
12
10
|
|
13
|
-
You can also install from source:
|
14
|
-
|
15
|
-
```bash
|
16
|
-
git clone https://github.com/jsoma/natural-pdf.git
|
17
|
-
cd natural-pdf
|
18
|
-
pip install -e .
|
19
|
-
```
|
20
|
-
|
21
11
|
### Optional Dependencies
|
22
12
|
|
23
13
|
Natural PDF has modular dependencies for different features:
|
@@ -0,0 +1,71 @@
|
|
1
|
+
"""
|
2
|
+
Direct Document QA example that closely mirrors the original pdfplumber implementation.
|
3
|
+
|
4
|
+
This example shows how to:
|
5
|
+
1. Use pdfplumber directly to extract words and images
|
6
|
+
2. Use transformers pipelines for document QA
|
7
|
+
3. Compare with the Natural PDF implementation
|
8
|
+
|
9
|
+
It's intentionally similar to the original code provided by the user.
|
10
|
+
"""
|
11
|
+
|
12
|
+
import os
|
13
|
+
import sys
|
14
|
+
import argparse
|
15
|
+
import pdfplumber
|
16
|
+
from PIL import Image, ImageDraw
|
17
|
+
import numpy as np
|
18
|
+
|
19
|
+
# Add parent directory to path to run without installing
|
20
|
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
21
|
+
|
22
|
+
# For comparison
|
23
|
+
from natural_pdf import PDF, configure_logging
|
24
|
+
import logging
|
25
|
+
|
26
|
+
def main():
|
27
|
+
parser = argparse.ArgumentParser(description="Direct Document QA Example")
|
28
|
+
parser.add_argument("pdf_path", nargs="?", default="../pdfs/0500000US42001.pdf",
|
29
|
+
help="Path to PDF document")
|
30
|
+
parser.add_argument("--question", default="How many votes for Harris and Walz?",
|
31
|
+
help="Question to ask about the document")
|
32
|
+
parser.add_argument("--debug", action="store_true",
|
33
|
+
help="Save debug information for troubleshooting")
|
34
|
+
|
35
|
+
args = parser.parse_args()
|
36
|
+
|
37
|
+
# Configure logging for Natural PDF
|
38
|
+
if args.debug:
|
39
|
+
configure_logging(level=logging.DEBUG)
|
40
|
+
else:
|
41
|
+
configure_logging(level=logging.INFO)
|
42
|
+
|
43
|
+
print(f"Document: {args.pdf_path}")
|
44
|
+
print(f"Question: {args.question}")
|
45
|
+
|
46
|
+
print("\n=== Natural PDF implementation ===")
|
47
|
+
|
48
|
+
# Use Natural PDF
|
49
|
+
pdf = PDF(args.pdf_path)
|
50
|
+
page = pdf.pages[0]
|
51
|
+
|
52
|
+
# Ask the question
|
53
|
+
result = page.ask(args.question, debug=args.debug)
|
54
|
+
|
55
|
+
if result.get("found", False):
|
56
|
+
print(f"Answer: {result['answer']}")
|
57
|
+
print(f"Confidence: {result['confidence']:.2f}")
|
58
|
+
|
59
|
+
# Highlight the answer
|
60
|
+
if result.get("source_elements"):
|
61
|
+
for element in result["source_elements"]:
|
62
|
+
element.highlight(color=(1, 0.5, 0, 0.5))
|
63
|
+
|
64
|
+
# Save the image
|
65
|
+
page.save_image("output/natural_pdf_answer.png")
|
66
|
+
print("Saved highlighted answer to output/natural_pdf_answer.png")
|
67
|
+
else:
|
68
|
+
print(f"No answer found: {result.get('error', '')}")
|
69
|
+
|
70
|
+
if __name__ == "__main__":
|
71
|
+
main()
|
@@ -0,0 +1,325 @@
|
|
1
|
+
"""
|
2
|
+
Comprehensive test of the Docling integration with Natural PDF.
|
3
|
+
|
4
|
+
This script tests all aspects of the Docling integration:
|
5
|
+
1. Basic document layout detection
|
6
|
+
2. Hierarchical document navigation
|
7
|
+
3. Text extraction from complex structures
|
8
|
+
4. Integration with other layout models
|
9
|
+
5. Performance and edge cases
|
10
|
+
|
11
|
+
Usage:
|
12
|
+
python examples/docling_comprehensive_test.py [pdf_path]
|
13
|
+
|
14
|
+
Dependencies:
|
15
|
+
- torch
|
16
|
+
- transformers
|
17
|
+
- docling_core
|
18
|
+
"""
|
19
|
+
|
20
|
+
import os
|
21
|
+
import sys
|
22
|
+
import time
|
23
|
+
import logging
|
24
|
+
from pathlib import Path
|
25
|
+
|
26
|
+
# Import the library
|
27
|
+
from natural_pdf import PDF, configure_logging
|
28
|
+
|
29
|
+
# Configure detailed logging for debugging
|
30
|
+
configure_logging(level=logging.INFO)
|
31
|
+
logger = logging.getLogger("docling_test")
|
32
|
+
logger.setLevel(logging.INFO)
|
33
|
+
|
34
|
+
# Get PDF path from command line or use demo file
|
35
|
+
if len(sys.argv) > 1:
|
36
|
+
pdf_path = sys.argv[1]
|
37
|
+
else:
|
38
|
+
# Default to a sample PDF in the pdfs directory
|
39
|
+
script_dir = os.path.dirname(os.path.abspath(__file__))
|
40
|
+
repo_root = os.path.dirname(script_dir)
|
41
|
+
pdf_path = os.path.join(repo_root, "pdfs", "01-practice.pdf")
|
42
|
+
|
43
|
+
# Check if required packages are installed
|
44
|
+
try:
|
45
|
+
from docling.document_converter import DocumentConverter
|
46
|
+
except ImportError:
|
47
|
+
logger.error("Missing required packages. Please install with:")
|
48
|
+
logger.error("pip install docling")
|
49
|
+
sys.exit(1)
|
50
|
+
|
51
|
+
# Create output directory for test results
|
52
|
+
output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output", "docling_tests")
|
53
|
+
os.makedirs(output_dir, exist_ok=True)
|
54
|
+
|
55
|
+
# Load the PDF
|
56
|
+
logger.info(f"Loading PDF: {pdf_path}")
|
57
|
+
pdf = PDF(pdf_path)
|
58
|
+
logger.info(f"PDF has {len(pdf.pages)} pages")
|
59
|
+
|
60
|
+
# Process only the first page for tests
|
61
|
+
page = pdf.pages[0]
|
62
|
+
|
63
|
+
# SECTION 1: Basic Docling Detection
|
64
|
+
logger.info("\n*** SECTION 1: Basic Docling Detection ***")
|
65
|
+
|
66
|
+
# Time the Docling analysis
|
67
|
+
start_time = time.time()
|
68
|
+
page.analyze_layout(
|
69
|
+
model="docling",
|
70
|
+
confidence=0.2, # This parameter isn't used by Docling but kept for API consistency
|
71
|
+
model_params={
|
72
|
+
"verbose": True
|
73
|
+
# Any other parameters would be passed directly to DocumentConverter
|
74
|
+
}
|
75
|
+
)
|
76
|
+
docling_time = time.time() - start_time
|
77
|
+
logger.info(f"Docling analysis completed in {docling_time:.2f} seconds")
|
78
|
+
|
79
|
+
# Verify that docling_document was created
|
80
|
+
if hasattr(page, 'docling_document'):
|
81
|
+
logger.info("✅ Docling document created successfully")
|
82
|
+
else:
|
83
|
+
logger.error("❌ Docling document not created")
|
84
|
+
|
85
|
+
# Count detected regions
|
86
|
+
docling_regions = page.find_all('region[model=docling]')
|
87
|
+
logger.info(f"Found {len(docling_regions)} total Docling regions")
|
88
|
+
|
89
|
+
# Get regions by type
|
90
|
+
section_headers = page.find_all('section-header')
|
91
|
+
text_regions = page.find_all('region[model=docling][type=text]')
|
92
|
+
figures = page.find_all('region[model=docling][type=figure]')
|
93
|
+
|
94
|
+
logger.info(f"- Section headers: {len(section_headers)}")
|
95
|
+
logger.info(f"- Text regions: {len(text_regions)}")
|
96
|
+
logger.info(f"- Figures: {len(figures)}")
|
97
|
+
|
98
|
+
# SECTION 2: Hierarchical Navigation
|
99
|
+
logger.info("\n*** SECTION 2: Hierarchical Navigation ***")
|
100
|
+
|
101
|
+
# Test if regions have child_regions attribute
|
102
|
+
has_children_attr = all(hasattr(region, 'child_regions') for region in docling_regions)
|
103
|
+
logger.info(f"All regions have child_regions attribute: {has_children_attr}")
|
104
|
+
|
105
|
+
# Count top-level regions (no parent)
|
106
|
+
top_level_regions = [r for r in docling_regions if not r.parent_region]
|
107
|
+
logger.info(f"Top-level regions: {len(top_level_regions)}")
|
108
|
+
|
109
|
+
# Test child traversal for section headers
|
110
|
+
if section_headers:
|
111
|
+
header = section_headers[0]
|
112
|
+
logger.info(f"Testing section header: '{header.extract_text()[:30]}...'")
|
113
|
+
|
114
|
+
# Test get_children method
|
115
|
+
if hasattr(header, 'get_children'):
|
116
|
+
children = header.get_children()
|
117
|
+
logger.info(f"- Direct children: {len(children)}")
|
118
|
+
|
119
|
+
# Test filtered get_children
|
120
|
+
text_children = header.get_children('text')
|
121
|
+
logger.info(f"- Direct text children: {len(text_children)}")
|
122
|
+
else:
|
123
|
+
logger.error("❌ get_children method not found")
|
124
|
+
|
125
|
+
# Test get_descendants method
|
126
|
+
if hasattr(header, 'get_descendants'):
|
127
|
+
descendants = header.get_descendants()
|
128
|
+
logger.info(f"- All descendants: {len(descendants)}")
|
129
|
+
|
130
|
+
# Test filtered get_descendants
|
131
|
+
text_descendants = header.get_descendants('text')
|
132
|
+
logger.info(f"- Text descendants: {len(text_descendants)}")
|
133
|
+
else:
|
134
|
+
logger.error("❌ get_descendants method not found")
|
135
|
+
|
136
|
+
# Test find_all with recursive option
|
137
|
+
children_find = header.find_all('text', recursive=False)
|
138
|
+
logger.info(f"- Children via find_all(recursive=False): {len(children_find)}")
|
139
|
+
|
140
|
+
all_find = header.find_all('text', recursive=True)
|
141
|
+
logger.info(f"- All text via find_all(recursive=True): {len(all_find)}")
|
142
|
+
|
143
|
+
# SECTION 3: Text Extraction
|
144
|
+
logger.info("\n*** SECTION 3: Text Extraction ***")
|
145
|
+
|
146
|
+
# Test basic text extraction
|
147
|
+
if section_headers:
|
148
|
+
header = section_headers[0]
|
149
|
+
header_text = header.extract_text()
|
150
|
+
logger.info(f"Section header text: '{header_text[:50]}...'")
|
151
|
+
|
152
|
+
# Test extraction from hierarchy
|
153
|
+
if hasattr(header, 'get_children') and header.get_children():
|
154
|
+
child = header.get_children()[0]
|
155
|
+
child_text = child.extract_text()
|
156
|
+
logger.info(f"First child text: '{child_text[:50]}...'")
|
157
|
+
|
158
|
+
# Compare with standard extraction
|
159
|
+
# In a real document, the header's extract_text might include the child text too
|
160
|
+
combined_len = len(header_text) + len(child_text)
|
161
|
+
logger.info(f"Combined text length: {combined_len} characters")
|
162
|
+
|
163
|
+
# Test text extraction with and without OCR
|
164
|
+
# This is a simplified test - in a real scenario, we'd compare with known text
|
165
|
+
extracted_text = page.extract_text()
|
166
|
+
logger.info(f"Extracted page text: {len(extracted_text)} characters")
|
167
|
+
|
168
|
+
# SECTION 4: Integration with Other Models
|
169
|
+
logger.info("\n*** SECTION 4: Integration with Other Models ***")
|
170
|
+
|
171
|
+
# Store current regions for comparison
|
172
|
+
original_region_count = len(page._regions['detected'])
|
173
|
+
|
174
|
+
# Add YOLO analysis
|
175
|
+
page.analyze_layout(
|
176
|
+
model="yolo",
|
177
|
+
confidence=0.3,
|
178
|
+
existing="append" # Important: don't replace Docling regions
|
179
|
+
)
|
180
|
+
|
181
|
+
# Count new regions
|
182
|
+
all_regions = page._regions['detected']
|
183
|
+
logger.info(f"Total regions after adding YOLO: {len(all_regions)}")
|
184
|
+
logger.info(f"New regions added: {len(all_regions) - original_region_count}")
|
185
|
+
|
186
|
+
# Test filtering by model
|
187
|
+
yolo_regions = page.find_all('region[model=yolo]')
|
188
|
+
docling_regions_after = page.find_all('region[model=docling]')
|
189
|
+
|
190
|
+
logger.info(f"YOLO regions: {len(yolo_regions)}")
|
191
|
+
logger.info(f"Docling regions after YOLO: {len(docling_regions_after)}")
|
192
|
+
logger.info(f"Docling regions preserved: {len(docling_regions_after) == len(docling_regions)}")
|
193
|
+
|
194
|
+
# SECTION 5: Visualization
|
195
|
+
logger.info("\n*** SECTION 5: Visualization ***")
|
196
|
+
|
197
|
+
# Clear previous highlights
|
198
|
+
page.clear_highlights()
|
199
|
+
|
200
|
+
# Highlight different models and region types
|
201
|
+
if section_headers:
|
202
|
+
section_headers.highlight(
|
203
|
+
color=(1, 0, 0, 0.3),
|
204
|
+
label="Docling Headers",
|
205
|
+
include_attrs=['region_type']
|
206
|
+
)
|
207
|
+
|
208
|
+
if text_regions:
|
209
|
+
text_regions.highlight(
|
210
|
+
color=(0, 0, 1, 0.3),
|
211
|
+
label="Docling Text",
|
212
|
+
include_attrs=['region_type']
|
213
|
+
)
|
214
|
+
|
215
|
+
if yolo_regions:
|
216
|
+
yolo_regions.highlight(
|
217
|
+
color=(0, 1, 0, 0.3),
|
218
|
+
label="YOLO Regions",
|
219
|
+
include_attrs=['region_type']
|
220
|
+
)
|
221
|
+
|
222
|
+
# Save highlighted image
|
223
|
+
highlight_path = os.path.join(output_dir, "model_comparison.png")
|
224
|
+
page.save_image(highlight_path, labels=True)
|
225
|
+
logger.info(f"Saved visualization to {highlight_path}")
|
226
|
+
|
227
|
+
# Test hierarchical highlighting
|
228
|
+
if section_headers and len(section_headers) > 0:
|
229
|
+
# Clear previous highlights
|
230
|
+
page.clear_highlights()
|
231
|
+
|
232
|
+
# Select a section to visualize
|
233
|
+
header = section_headers[0]
|
234
|
+
|
235
|
+
# Highlight header
|
236
|
+
header.highlight(
|
237
|
+
color=(1, 0, 0, 0.3),
|
238
|
+
label="Section Header"
|
239
|
+
)
|
240
|
+
|
241
|
+
# Highlight direct children
|
242
|
+
if hasattr(header, 'get_children') and header.get_children():
|
243
|
+
children = header.get_children()
|
244
|
+
for child in children:
|
245
|
+
child.highlight(
|
246
|
+
color=(0, 1, 0, 0.3),
|
247
|
+
label="Direct Children",
|
248
|
+
include_attrs=['region_type']
|
249
|
+
)
|
250
|
+
|
251
|
+
# Save hierarchy visualization
|
252
|
+
hierarchy_path = os.path.join(output_dir, "hierarchy_visualization.png")
|
253
|
+
page.save_image(hierarchy_path, labels=True)
|
254
|
+
logger.info(f"Saved hierarchy visualization to {hierarchy_path}")
|
255
|
+
|
256
|
+
# SECTION 6: Text Source Testing (OCR vs Native)
|
257
|
+
logger.info("\n*** SECTION 6: Text Source Testing ***")
|
258
|
+
|
259
|
+
# Find text elements by source
|
260
|
+
native_text = page.find_all('text[source=native]')
|
261
|
+
ocr_text = page.find_all('text[source=ocr]')
|
262
|
+
docling_text = page.find_all('region[model=docling][type=text]')
|
263
|
+
|
264
|
+
logger.info(f"Text elements by source:")
|
265
|
+
logger.info(f"- Native PDF text: {len(native_text)} elements")
|
266
|
+
logger.info(f"- OCR text: {len(ocr_text)} elements")
|
267
|
+
logger.info(f"- Docling text: {len(docling_text)} elements")
|
268
|
+
|
269
|
+
# Test specific text element queries
|
270
|
+
if native_text:
|
271
|
+
sample_native = native_text[0]
|
272
|
+
logger.info(f"Sample native text: '{sample_native.text[:30]}...'")
|
273
|
+
logger.info(f"Has source='native' attribute: {getattr(sample_native, 'source', None) == 'native'}")
|
274
|
+
|
275
|
+
# Test if text_content attribute is set
|
276
|
+
has_text_content = False
|
277
|
+
for region in docling_regions:
|
278
|
+
if hasattr(region, 'text_content') and region.text_content:
|
279
|
+
has_text_content = True
|
280
|
+
logger.info(f"Found region with text_content: '{region.text_content[:30]}...'")
|
281
|
+
break
|
282
|
+
|
283
|
+
logger.info(f"Regions have text_content attribute: {has_text_content}")
|
284
|
+
|
285
|
+
# Test if associated_text_elements is used
|
286
|
+
has_associated_text = False
|
287
|
+
for region in docling_regions:
|
288
|
+
if hasattr(region, 'associated_text_elements') and region.associated_text_elements:
|
289
|
+
has_associated_text = True
|
290
|
+
logger.info(f"Found region with associated_text_elements: {len(region.associated_text_elements)} elements")
|
291
|
+
break
|
292
|
+
|
293
|
+
logger.info(f"Regions have associated_text_elements: {has_associated_text}")
|
294
|
+
|
295
|
+
# Highlight different text sources
|
296
|
+
page.clear_highlights()
|
297
|
+
if native_text:
|
298
|
+
native_text.highlight(
|
299
|
+
color=(0, 0, 0.7, 0.3),
|
300
|
+
label="Native Text Elements",
|
301
|
+
include_attrs=['source']
|
302
|
+
)
|
303
|
+
|
304
|
+
if docling_text:
|
305
|
+
docling_text.highlight(
|
306
|
+
color=(0.7, 0, 0, 0.3),
|
307
|
+
label="Docling Text Elements",
|
308
|
+
include_attrs=['model']
|
309
|
+
)
|
310
|
+
|
311
|
+
# Save source visualization
|
312
|
+
source_path = os.path.join(output_dir, "text_sources.png")
|
313
|
+
page.save_image(source_path, labels=True)
|
314
|
+
logger.info(f"Saved text source visualization to {source_path}")
|
315
|
+
|
316
|
+
# Log final summary
|
317
|
+
print("\n*** TEST SUMMARY ***")
|
318
|
+
print(f"Total Docling regions: {len(docling_regions)}")
|
319
|
+
print(f"Hierarchical navigation: {'✅ Working' if has_children_attr else '❌ Not working'}")
|
320
|
+
print(f"Text extraction: {'✅ Working' if len(extracted_text) > 0 else '❌ Not working'}")
|
321
|
+
print(f"Multi-model integration: {'✅ Working' if len(yolo_regions) > 0 else '❌ Not working'}")
|
322
|
+
print(f"Test artifacts saved to: {output_dir}")
|
323
|
+
|
324
|
+
print("\nAll tests completed with no errors!")
|
325
|
+
logger.info("\nAll tests completed.")
|