natural-pdf 25.3.16__tar.gz → 25.3.17.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. {natural_pdf-25.3.16/natural_pdf.egg-info → natural_pdf-25.3.17.2}/PKG-INFO +25 -3
  2. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/README.md +23 -2
  3. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/index.md +2 -0
  4. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/installation/index.md +0 -10
  5. natural_pdf-25.3.17.2/examples/direct_qa_example.py +71 -0
  6. natural_pdf-25.3.17.2/examples/docling_comprehensive_test.py +325 -0
  7. natural_pdf-25.3.17.2/examples/docling_example.py +192 -0
  8. natural_pdf-25.3.17.2/examples/docling_hierarchy_example.py +230 -0
  9. natural_pdf-25.3.17.2/examples/docling_text_sources.py +241 -0
  10. natural_pdf-25.3.17.2/examples/improved_qa_example.py +66 -0
  11. natural_pdf-25.3.17.2/examples/url_pdf_example.py +45 -0
  12. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/analyzers/document_layout.py +276 -0
  13. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/core/page.py +72 -21
  14. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/core/pdf.py +102 -71
  15. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/elements/region.py +174 -19
  16. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/qa/document_qa.py +29 -38
  17. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/selectors/parser.py +6 -2
  18. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2/natural_pdf.egg-info}/PKG-INFO +25 -3
  19. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf.egg-info/SOURCES.txt +6 -0
  20. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf.egg-info/requires.txt +1 -0
  21. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/setup.py +4 -2
  22. natural_pdf-25.3.16/examples/direct_qa_example.py +0 -165
  23. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/LICENSE +0 -0
  24. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/MANIFEST.in +0 -0
  25. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/api/index.md +0 -0
  26. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/assets/favicon.png +0 -0
  27. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/assets/social-preview.png +0 -0
  28. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/document-qa/index.md +0 -0
  29. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/element-selection/index.md +0 -0
  30. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/explanations/index.md +0 -0
  31. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/explanations/ocr-challenges.md +0 -0
  32. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/explanations/pdf-extraction-challenges.md +0 -0
  33. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/explanations/pdf-fonts.md +0 -0
  34. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/layout-analysis/index.md +0 -0
  35. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/ocr/index.md +0 -0
  36. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/pdf-navigation/index.md +0 -0
  37. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/regions/index.md +0 -0
  38. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/tables/index.md +0 -0
  39. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/text-extraction/index.md +0 -0
  40. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/docs/visual-debugging/index.md +0 -0
  41. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/__init__.py +0 -0
  42. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/another_exclusion_example.py +0 -0
  43. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/basic_usage.py +0 -0
  44. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/boundary_exclusion_test.py +0 -0
  45. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/boundary_inclusion_fix_test.py +0 -0
  46. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/chainable_layout_example.py +0 -0
  47. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/color_basic_test.py +0 -0
  48. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/color_name_example.py +0 -0
  49. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/color_test.py +0 -0
  50. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/debug_ocr.py +0 -0
  51. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/direct_ocr_test.py +0 -0
  52. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/direct_paddle_test.py +0 -0
  53. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/document_layout_analysis.py +0 -0
  54. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/document_qa_example.py +0 -0
  55. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/exclusion_count_debug.py +0 -0
  56. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/exclusion_debug.py +0 -0
  57. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/exclusion_example.py +0 -0
  58. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/exclusion_optimization_example.py +0 -0
  59. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/extract_text_test.py +0 -0
  60. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/font_aware_example.py +0 -0
  61. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/font_variant_example.py +0 -0
  62. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/footer_overlap_test.py +0 -0
  63. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/highlight_all_example.py +0 -0
  64. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/highlight_attributes_test.py +0 -0
  65. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/highlight_confidence_display.py +0 -0
  66. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/highlight_demo.py +0 -0
  67. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/highlight_float_test.py +0 -0
  68. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/highlight_test.py +0 -0
  69. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/highlighting_example.py +0 -0
  70. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/image_width_example.py +0 -0
  71. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/improved_api_example.py +0 -0
  72. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/layout_confidence_display_test.py +0 -0
  73. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/layout_confidence_test.py +0 -0
  74. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/layout_coordinate_debug.py +0 -0
  75. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/layout_highlight_test.py +0 -0
  76. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/logging_example.py +0 -0
  77. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/ocr_comprehensive.py +0 -0
  78. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/ocr_debug_example.py +0 -0
  79. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/ocr_default_test.py +0 -0
  80. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/ocr_engine_comparison.py +0 -0
  81. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/ocr_example.py +0 -0
  82. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/ocr_simplified_params.py +0 -0
  83. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/ocr_visualization.py +0 -0
  84. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/ocr_visualization_test.py +0 -0
  85. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/paddle_layout_example.py +0 -0
  86. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/paddle_layout_simple.py +0 -0
  87. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/paddleocr_example.py +0 -0
  88. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/page_collection_example.py +0 -0
  89. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/polygon_highlight_example.py +0 -0
  90. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/position_methods_example.py +0 -0
  91. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/region_boundary_test.py +0 -0
  92. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/region_exclusion_test.py +0 -0
  93. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/region_expand_example.py +0 -0
  94. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/region_image_example.py +0 -0
  95. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/region_ocr_test.py +0 -0
  96. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/region_sections_example.py +0 -0
  97. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/school_books.py +0 -0
  98. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/school_books_all.py +0 -0
  99. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/scouring.py +0 -0
  100. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/section_extraction_example.py +0 -0
  101. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/simple_document_qa.py +0 -0
  102. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/spatial_navigation_example.py +0 -0
  103. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/table_extraction_example.py +0 -0
  104. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/table_structure_detection.py +0 -0
  105. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/tatr_cells_test.py +0 -0
  106. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/tatr_ocr_table_test.py +0 -0
  107. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/text_search_example.py +0 -0
  108. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/text_style_example.py +0 -0
  109. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/tiny-text.py +0 -0
  110. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/until_boundaries_example.py +0 -0
  111. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/until_example.py +0 -0
  112. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/examples/very_basics.py +0 -0
  113. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/__init__.py +0 -0
  114. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/analyzers/__init__.py +0 -0
  115. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/analyzers/text_structure.py +0 -0
  116. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/core/__init__.py +0 -0
  117. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/elements/__init__.py +0 -0
  118. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/elements/base.py +0 -0
  119. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/elements/collections.py +0 -0
  120. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/elements/line.py +0 -0
  121. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/elements/rect.py +0 -0
  122. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/elements/text.py +0 -0
  123. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/ocr/__init__.py +0 -0
  124. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/ocr/easyocr_engine.py +0 -0
  125. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/ocr/engine.py +0 -0
  126. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/ocr/paddleocr_engine.py +0 -0
  127. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/qa/__init__.py +0 -0
  128. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/selectors/__init__.py +0 -0
  129. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/templates/__init__.py +0 -0
  130. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/templates/ocr_debug.html +0 -0
  131. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/utils/__init__.py +0 -0
  132. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/utils/highlighting.py +0 -0
  133. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/utils/ocr.py +0 -0
  134. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/utils/reading_order.py +0 -0
  135. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf/utils/visualization.py +0 -0
  136. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf.egg-info/dependency_links.txt +0 -0
  137. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/natural_pdf.egg-info/top_level.txt +0 -0
  138. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/pyproject.toml +0 -0
  139. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/setup.cfg +0 -0
  140. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/tests/__init__.py +0 -0
  141. {natural_pdf-25.3.16 → natural_pdf-25.3.17.2}/tests/test_pdf.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: natural-pdf
3
- Version: 25.3.16
3
+ Version: 25.3.17.2
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Home-page: https://github.com/jsoma/natural-pdf
6
6
  Author: Jonathan Soma
@@ -15,6 +15,7 @@ Requires-Dist: pdfplumber>=0.7.0
15
15
  Requires-Dist: Pillow>=8.0.0
16
16
  Requires-Dist: colour>=0.1.5
17
17
  Requires-Dist: numpy>=1.20.0
18
+ Requires-Dist: urllib3>=1.26.0
18
19
  Requires-Dist: doclayout_yolo>=0.0.3
19
20
  Requires-Dist: torch>=2.0.0
20
21
  Requires-Dist: torchvision>=0.15.0
@@ -58,7 +59,8 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
58
59
 
59
60
  Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
60
61
 
61
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
62
+ - [Complete documentation here](https://jsoma.github.io/natural-pdf)
63
+ - [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
62
64
 
63
65
  ## Features
64
66
 
@@ -96,9 +98,12 @@ pip install natural-pdf[easyocr,paddle]
96
98
  ```python
97
99
  from natural_pdf import PDF
98
100
 
99
- # Open a PDF
101
+ # Open a local PDF
100
102
  pdf = PDF('document.pdf')
101
103
 
104
+ # Or open a PDF from a URL
105
+ pdf = PDF('https://example.com/document.pdf')
106
+
102
107
  # Get the first page
103
108
  page = pdf.pages[0]
104
109
 
@@ -263,6 +268,23 @@ Logs follow a hierarchical structure matching the library's module organization:
263
268
  - `natural_pdf.analyzers` - Layout analysis operations
264
269
  - `natural_pdf.ocr` - OCR engine operations
265
270
 
271
+ ## Document QA
272
+
273
+ Ask questions directly to your documents:
274
+
275
+ ```python
276
+ # Ask questions about the document content
277
+ result = pdf.ask("What was the company's revenue in 2022?")
278
+ print(f"Answer: {result['answer']}")
279
+ print(f"Confidence: {result['confidence']:.2f}")
280
+
281
+ # Access more details in the result dictionary
282
+ result = pdf.ask("Who is the CEO?")
283
+ print(f"Answer: {result['answer']}")
284
+ print(f"Found on page: {result['page_num']}")
285
+ print(f"Source text: {result.get('source_text', 'N/A')}")
286
+ ```
287
+
266
288
  ## More details
267
289
 
268
290
  [Complete documentation here](https://jsoma.github.io/natural-pdf)
@@ -4,7 +4,8 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
4
4
 
5
5
  Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
6
6
 
7
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
7
+ - [Complete documentation here](https://jsoma.github.io/natural-pdf)
8
+ - [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
8
9
 
9
10
  ## Features
10
11
 
@@ -42,9 +43,12 @@ pip install natural-pdf[easyocr,paddle]
42
43
  ```python
43
44
  from natural_pdf import PDF
44
45
 
45
- # Open a PDF
46
+ # Open a local PDF
46
47
  pdf = PDF('document.pdf')
47
48
 
49
+ # Or open a PDF from a URL
50
+ pdf = PDF('https://example.com/document.pdf')
51
+
48
52
  # Get the first page
49
53
  page = pdf.pages[0]
50
54
 
@@ -209,6 +213,23 @@ Logs follow a hierarchical structure matching the library's module organization:
209
213
  - `natural_pdf.analyzers` - Layout analysis operations
210
214
  - `natural_pdf.ocr` - OCR engine operations
211
215
 
216
+ ## Document QA
217
+
218
+ Ask questions directly to your documents:
219
+
220
+ ```python
221
+ # Ask questions about the document content
222
+ result = pdf.ask("What was the company's revenue in 2022?")
223
+ print(f"Answer: {result['answer']}")
224
+ print(f"Confidence: {result['confidence']:.2f}")
225
+
226
+ # Access more details in the result dictionary
227
+ result = pdf.ask("Who is the CEO?")
228
+ print(f"Answer: {result['answer']}")
229
+ print(f"Found on page: {result['page_num']}")
230
+ print(f"Source text: {result.get('source_text', 'N/A')}")
231
+ ```
232
+
212
233
  ## More details
213
234
 
214
235
  [Complete documentation here](https://jsoma.github.io/natural-pdf)
@@ -4,6 +4,8 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
4
4
 
5
5
  Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
6
6
 
7
+ - [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
8
+
7
9
  ## Quick Example
8
10
 
9
11
  ```python
@@ -4,20 +4,10 @@ Let's get Natural PDF installed and run your first extraction.
4
4
 
5
5
  ## Installation
6
6
 
7
- Natural PDF is available on PyPI. The simplest way to install it is with pip:
8
-
9
7
  ```bash
10
8
  pip install natural-pdf
11
9
  ```
12
10
 
13
- You can also install from source:
14
-
15
- ```bash
16
- git clone https://github.com/jsoma/natural-pdf.git
17
- cd natural-pdf
18
- pip install -e .
19
- ```
20
-
21
11
  ### Optional Dependencies
22
12
 
23
13
  Natural PDF has modular dependencies for different features:
@@ -0,0 +1,71 @@
1
+ """
2
+ Direct Document QA example that closely mirrors the original pdfplumber implementation.
3
+
4
+ This example shows how to:
5
+ 1. Use pdfplumber directly to extract words and images
6
+ 2. Use transformers pipelines for document QA
7
+ 3. Compare with the Natural PDF implementation
8
+
9
+ It's intentionally similar to the original code provided by the user.
10
+ """
11
+
12
+ import os
13
+ import sys
14
+ import argparse
15
+ import pdfplumber
16
+ from PIL import Image, ImageDraw
17
+ import numpy as np
18
+
19
+ # Add parent directory to path to run without installing
20
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
21
+
22
+ # For comparison
23
+ from natural_pdf import PDF, configure_logging
24
+ import logging
25
+
26
+ def main():
27
+ parser = argparse.ArgumentParser(description="Direct Document QA Example")
28
+ parser.add_argument("pdf_path", nargs="?", default="../pdfs/0500000US42001.pdf",
29
+ help="Path to PDF document")
30
+ parser.add_argument("--question", default="How many votes for Harris and Walz?",
31
+ help="Question to ask about the document")
32
+ parser.add_argument("--debug", action="store_true",
33
+ help="Save debug information for troubleshooting")
34
+
35
+ args = parser.parse_args()
36
+
37
+ # Configure logging for Natural PDF
38
+ if args.debug:
39
+ configure_logging(level=logging.DEBUG)
40
+ else:
41
+ configure_logging(level=logging.INFO)
42
+
43
+ print(f"Document: {args.pdf_path}")
44
+ print(f"Question: {args.question}")
45
+
46
+ print("\n=== Natural PDF implementation ===")
47
+
48
+ # Use Natural PDF
49
+ pdf = PDF(args.pdf_path)
50
+ page = pdf.pages[0]
51
+
52
+ # Ask the question
53
+ result = page.ask(args.question, debug=args.debug)
54
+
55
+ if result.get("found", False):
56
+ print(f"Answer: {result['answer']}")
57
+ print(f"Confidence: {result['confidence']:.2f}")
58
+
59
+ # Highlight the answer
60
+ if result.get("source_elements"):
61
+ for element in result["source_elements"]:
62
+ element.highlight(color=(1, 0.5, 0, 0.5))
63
+
64
+ # Save the image
65
+ page.save_image("output/natural_pdf_answer.png")
66
+ print("Saved highlighted answer to output/natural_pdf_answer.png")
67
+ else:
68
+ print(f"No answer found: {result.get('error', '')}")
69
+
70
+ if __name__ == "__main__":
71
+ main()
@@ -0,0 +1,325 @@
1
+ """
2
+ Comprehensive test of the Docling integration with Natural PDF.
3
+
4
+ This script tests all aspects of the Docling integration:
5
+ 1. Basic document layout detection
6
+ 2. Hierarchical document navigation
7
+ 3. Text extraction from complex structures
8
+ 4. Integration with other layout models
9
+ 5. Performance and edge cases
10
+
11
+ Usage:
12
+ python examples/docling_comprehensive_test.py [pdf_path]
13
+
14
+ Dependencies:
15
+ - torch
16
+ - transformers
17
+ - docling_core
18
+ """
19
+
20
+ import os
21
+ import sys
22
+ import time
23
+ import logging
24
+ from pathlib import Path
25
+
26
+ # Import the library
27
+ from natural_pdf import PDF, configure_logging
28
+
29
+ # Configure detailed logging for debugging
30
+ configure_logging(level=logging.INFO)
31
+ logger = logging.getLogger("docling_test")
32
+ logger.setLevel(logging.INFO)
33
+
34
+ # Get PDF path from command line or use demo file
35
+ if len(sys.argv) > 1:
36
+ pdf_path = sys.argv[1]
37
+ else:
38
+ # Default to a sample PDF in the pdfs directory
39
+ script_dir = os.path.dirname(os.path.abspath(__file__))
40
+ repo_root = os.path.dirname(script_dir)
41
+ pdf_path = os.path.join(repo_root, "pdfs", "01-practice.pdf")
42
+
43
+ # Check if required packages are installed
44
+ try:
45
+ from docling.document_converter import DocumentConverter
46
+ except ImportError:
47
+ logger.error("Missing required packages. Please install with:")
48
+ logger.error("pip install docling")
49
+ sys.exit(1)
50
+
51
+ # Create output directory for test results
52
+ output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output", "docling_tests")
53
+ os.makedirs(output_dir, exist_ok=True)
54
+
55
+ # Load the PDF
56
+ logger.info(f"Loading PDF: {pdf_path}")
57
+ pdf = PDF(pdf_path)
58
+ logger.info(f"PDF has {len(pdf.pages)} pages")
59
+
60
+ # Process only the first page for tests
61
+ page = pdf.pages[0]
62
+
63
+ # SECTION 1: Basic Docling Detection
64
+ logger.info("\n*** SECTION 1: Basic Docling Detection ***")
65
+
66
+ # Time the Docling analysis
67
+ start_time = time.time()
68
+ page.analyze_layout(
69
+ model="docling",
70
+ confidence=0.2, # This parameter isn't used by Docling but kept for API consistency
71
+ model_params={
72
+ "verbose": True
73
+ # Any other parameters would be passed directly to DocumentConverter
74
+ }
75
+ )
76
+ docling_time = time.time() - start_time
77
+ logger.info(f"Docling analysis completed in {docling_time:.2f} seconds")
78
+
79
+ # Verify that docling_document was created
80
+ if hasattr(page, 'docling_document'):
81
+ logger.info("✅ Docling document created successfully")
82
+ else:
83
+ logger.error("❌ Docling document not created")
84
+
85
+ # Count detected regions
86
+ docling_regions = page.find_all('region[model=docling]')
87
+ logger.info(f"Found {len(docling_regions)} total Docling regions")
88
+
89
+ # Get regions by type
90
+ section_headers = page.find_all('section-header')
91
+ text_regions = page.find_all('region[model=docling][type=text]')
92
+ figures = page.find_all('region[model=docling][type=figure]')
93
+
94
+ logger.info(f"- Section headers: {len(section_headers)}")
95
+ logger.info(f"- Text regions: {len(text_regions)}")
96
+ logger.info(f"- Figures: {len(figures)}")
97
+
98
+ # SECTION 2: Hierarchical Navigation
99
+ logger.info("\n*** SECTION 2: Hierarchical Navigation ***")
100
+
101
+ # Test if regions have child_regions attribute
102
+ has_children_attr = all(hasattr(region, 'child_regions') for region in docling_regions)
103
+ logger.info(f"All regions have child_regions attribute: {has_children_attr}")
104
+
105
+ # Count top-level regions (no parent)
106
+ top_level_regions = [r for r in docling_regions if not r.parent_region]
107
+ logger.info(f"Top-level regions: {len(top_level_regions)}")
108
+
109
+ # Test child traversal for section headers
110
+ if section_headers:
111
+ header = section_headers[0]
112
+ logger.info(f"Testing section header: '{header.extract_text()[:30]}...'")
113
+
114
+ # Test get_children method
115
+ if hasattr(header, 'get_children'):
116
+ children = header.get_children()
117
+ logger.info(f"- Direct children: {len(children)}")
118
+
119
+ # Test filtered get_children
120
+ text_children = header.get_children('text')
121
+ logger.info(f"- Direct text children: {len(text_children)}")
122
+ else:
123
+ logger.error("❌ get_children method not found")
124
+
125
+ # Test get_descendants method
126
+ if hasattr(header, 'get_descendants'):
127
+ descendants = header.get_descendants()
128
+ logger.info(f"- All descendants: {len(descendants)}")
129
+
130
+ # Test filtered get_descendants
131
+ text_descendants = header.get_descendants('text')
132
+ logger.info(f"- Text descendants: {len(text_descendants)}")
133
+ else:
134
+ logger.error("❌ get_descendants method not found")
135
+
136
+ # Test find_all with recursive option
137
+ children_find = header.find_all('text', recursive=False)
138
+ logger.info(f"- Children via find_all(recursive=False): {len(children_find)}")
139
+
140
+ all_find = header.find_all('text', recursive=True)
141
+ logger.info(f"- All text via find_all(recursive=True): {len(all_find)}")
142
+
143
+ # SECTION 3: Text Extraction
144
+ logger.info("\n*** SECTION 3: Text Extraction ***")
145
+
146
+ # Test basic text extraction
147
+ if section_headers:
148
+ header = section_headers[0]
149
+ header_text = header.extract_text()
150
+ logger.info(f"Section header text: '{header_text[:50]}...'")
151
+
152
+ # Test extraction from hierarchy
153
+ if hasattr(header, 'get_children') and header.get_children():
154
+ child = header.get_children()[0]
155
+ child_text = child.extract_text()
156
+ logger.info(f"First child text: '{child_text[:50]}...'")
157
+
158
+ # Compare with standard extraction
159
+ # In a real document, the header's extract_text might include the child text too
160
+ combined_len = len(header_text) + len(child_text)
161
+ logger.info(f"Combined text length: {combined_len} characters")
162
+
163
+ # Test text extraction with and without OCR
164
+ # This is a simplified test - in a real scenario, we'd compare with known text
165
+ extracted_text = page.extract_text()
166
+ logger.info(f"Extracted page text: {len(extracted_text)} characters")
167
+
168
+ # SECTION 4: Integration with Other Models
169
+ logger.info("\n*** SECTION 4: Integration with Other Models ***")
170
+
171
+ # Store current regions for comparison
172
+ original_region_count = len(page._regions['detected'])
173
+
174
+ # Add YOLO analysis
175
+ page.analyze_layout(
176
+ model="yolo",
177
+ confidence=0.3,
178
+ existing="append" # Important: don't replace Docling regions
179
+ )
180
+
181
+ # Count new regions
182
+ all_regions = page._regions['detected']
183
+ logger.info(f"Total regions after adding YOLO: {len(all_regions)}")
184
+ logger.info(f"New regions added: {len(all_regions) - original_region_count}")
185
+
186
+ # Test filtering by model
187
+ yolo_regions = page.find_all('region[model=yolo]')
188
+ docling_regions_after = page.find_all('region[model=docling]')
189
+
190
+ logger.info(f"YOLO regions: {len(yolo_regions)}")
191
+ logger.info(f"Docling regions after YOLO: {len(docling_regions_after)}")
192
+ logger.info(f"Docling regions preserved: {len(docling_regions_after) == len(docling_regions)}")
193
+
194
+ # SECTION 5: Visualization
195
+ logger.info("\n*** SECTION 5: Visualization ***")
196
+
197
+ # Clear previous highlights
198
+ page.clear_highlights()
199
+
200
+ # Highlight different models and region types
201
+ if section_headers:
202
+ section_headers.highlight(
203
+ color=(1, 0, 0, 0.3),
204
+ label="Docling Headers",
205
+ include_attrs=['region_type']
206
+ )
207
+
208
+ if text_regions:
209
+ text_regions.highlight(
210
+ color=(0, 0, 1, 0.3),
211
+ label="Docling Text",
212
+ include_attrs=['region_type']
213
+ )
214
+
215
+ if yolo_regions:
216
+ yolo_regions.highlight(
217
+ color=(0, 1, 0, 0.3),
218
+ label="YOLO Regions",
219
+ include_attrs=['region_type']
220
+ )
221
+
222
+ # Save highlighted image
223
+ highlight_path = os.path.join(output_dir, "model_comparison.png")
224
+ page.save_image(highlight_path, labels=True)
225
+ logger.info(f"Saved visualization to {highlight_path}")
226
+
227
+ # Test hierarchical highlighting
228
+ if section_headers and len(section_headers) > 0:
229
+ # Clear previous highlights
230
+ page.clear_highlights()
231
+
232
+ # Select a section to visualize
233
+ header = section_headers[0]
234
+
235
+ # Highlight header
236
+ header.highlight(
237
+ color=(1, 0, 0, 0.3),
238
+ label="Section Header"
239
+ )
240
+
241
+ # Highlight direct children
242
+ if hasattr(header, 'get_children') and header.get_children():
243
+ children = header.get_children()
244
+ for child in children:
245
+ child.highlight(
246
+ color=(0, 1, 0, 0.3),
247
+ label="Direct Children",
248
+ include_attrs=['region_type']
249
+ )
250
+
251
+ # Save hierarchy visualization
252
+ hierarchy_path = os.path.join(output_dir, "hierarchy_visualization.png")
253
+ page.save_image(hierarchy_path, labels=True)
254
+ logger.info(f"Saved hierarchy visualization to {hierarchy_path}")
255
+
256
+ # SECTION 6: Text Source Testing (OCR vs Native)
257
+ logger.info("\n*** SECTION 6: Text Source Testing ***")
258
+
259
+ # Find text elements by source
260
+ native_text = page.find_all('text[source=native]')
261
+ ocr_text = page.find_all('text[source=ocr]')
262
+ docling_text = page.find_all('region[model=docling][type=text]')
263
+
264
+ logger.info(f"Text elements by source:")
265
+ logger.info(f"- Native PDF text: {len(native_text)} elements")
266
+ logger.info(f"- OCR text: {len(ocr_text)} elements")
267
+ logger.info(f"- Docling text: {len(docling_text)} elements")
268
+
269
+ # Test specific text element queries
270
+ if native_text:
271
+ sample_native = native_text[0]
272
+ logger.info(f"Sample native text: '{sample_native.text[:30]}...'")
273
+ logger.info(f"Has source='native' attribute: {getattr(sample_native, 'source', None) == 'native'}")
274
+
275
+ # Test if text_content attribute is set
276
+ has_text_content = False
277
+ for region in docling_regions:
278
+ if hasattr(region, 'text_content') and region.text_content:
279
+ has_text_content = True
280
+ logger.info(f"Found region with text_content: '{region.text_content[:30]}...'")
281
+ break
282
+
283
+ logger.info(f"Regions have text_content attribute: {has_text_content}")
284
+
285
+ # Test if associated_text_elements is used
286
+ has_associated_text = False
287
+ for region in docling_regions:
288
+ if hasattr(region, 'associated_text_elements') and region.associated_text_elements:
289
+ has_associated_text = True
290
+ logger.info(f"Found region with associated_text_elements: {len(region.associated_text_elements)} elements")
291
+ break
292
+
293
+ logger.info(f"Regions have associated_text_elements: {has_associated_text}")
294
+
295
+ # Highlight different text sources
296
+ page.clear_highlights()
297
+ if native_text:
298
+ native_text.highlight(
299
+ color=(0, 0, 0.7, 0.3),
300
+ label="Native Text Elements",
301
+ include_attrs=['source']
302
+ )
303
+
304
+ if docling_text:
305
+ docling_text.highlight(
306
+ color=(0.7, 0, 0, 0.3),
307
+ label="Docling Text Elements",
308
+ include_attrs=['model']
309
+ )
310
+
311
+ # Save source visualization
312
+ source_path = os.path.join(output_dir, "text_sources.png")
313
+ page.save_image(source_path, labels=True)
314
+ logger.info(f"Saved text source visualization to {source_path}")
315
+
316
+ # Log final summary
317
+ print("\n*** TEST SUMMARY ***")
318
+ print(f"Total Docling regions: {len(docling_regions)}")
319
+ print(f"Hierarchical navigation: {'✅ Working' if has_children_attr else '❌ Not working'}")
320
+ print(f"Text extraction: {'✅ Working' if len(extracted_text) > 0 else '❌ Not working'}")
321
+ print(f"Multi-model integration: {'✅ Working' if len(yolo_regions) > 0 else '❌ Not working'}")
322
+ print(f"Test artifacts saved to: {output_dir}")
323
+
324
+ print("\nAll tests completed with no errors!")
325
+ logger.info("\nAll tests completed.")