natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +209 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +288 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +413 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +512 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +604 -0
  56. docs/tutorials/12-ocr-integration.md +175 -0
  57. docs/tutorials/13-semantic-search.ipynb +1328 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +50 -33
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/gemini.py +264 -0
  67. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  68. natural_pdf/analyzers/layout/layout_manager.py +125 -58
  69. natural_pdf/analyzers/layout/layout_options.py +43 -17
  70. natural_pdf/analyzers/layout/paddle.py +152 -95
  71. natural_pdf/analyzers/layout/surya.py +164 -92
  72. natural_pdf/analyzers/layout/tatr.py +149 -84
  73. natural_pdf/analyzers/layout/yolo.py +89 -45
  74. natural_pdf/analyzers/text_options.py +22 -15
  75. natural_pdf/analyzers/text_structure.py +131 -85
  76. natural_pdf/analyzers/utils.py +30 -23
  77. natural_pdf/collections/pdf_collection.py +146 -97
  78. natural_pdf/core/__init__.py +1 -1
  79. natural_pdf/core/element_manager.py +419 -337
  80. natural_pdf/core/highlighting_service.py +268 -196
  81. natural_pdf/core/page.py +1044 -521
  82. natural_pdf/core/pdf.py +516 -313
  83. natural_pdf/elements/__init__.py +1 -1
  84. natural_pdf/elements/base.py +307 -225
  85. natural_pdf/elements/collections.py +805 -543
  86. natural_pdf/elements/line.py +39 -36
  87. natural_pdf/elements/rect.py +32 -30
  88. natural_pdf/elements/region.py +889 -879
  89. natural_pdf/elements/text.py +127 -99
  90. natural_pdf/exporters/__init__.py +0 -1
  91. natural_pdf/exporters/searchable_pdf.py +261 -102
  92. natural_pdf/ocr/__init__.py +57 -35
  93. natural_pdf/ocr/engine.py +150 -46
  94. natural_pdf/ocr/engine_easyocr.py +146 -150
  95. natural_pdf/ocr/engine_paddle.py +118 -175
  96. natural_pdf/ocr/engine_surya.py +78 -141
  97. natural_pdf/ocr/ocr_factory.py +114 -0
  98. natural_pdf/ocr/ocr_manager.py +122 -124
  99. natural_pdf/ocr/ocr_options.py +16 -20
  100. natural_pdf/ocr/utils.py +98 -0
  101. natural_pdf/qa/__init__.py +1 -1
  102. natural_pdf/qa/document_qa.py +119 -111
  103. natural_pdf/search/__init__.py +37 -31
  104. natural_pdf/search/haystack_search_service.py +312 -189
  105. natural_pdf/search/haystack_utils.py +186 -122
  106. natural_pdf/search/search_options.py +25 -14
  107. natural_pdf/search/search_service_protocol.py +12 -6
  108. natural_pdf/search/searchable_mixin.py +261 -176
  109. natural_pdf/selectors/__init__.py +2 -1
  110. natural_pdf/selectors/parser.py +159 -316
  111. natural_pdf/templates/__init__.py +1 -1
  112. natural_pdf/templates/spa/css/style.css +334 -0
  113. natural_pdf/templates/spa/index.html +31 -0
  114. natural_pdf/templates/spa/js/app.js +472 -0
  115. natural_pdf/templates/spa/words.txt +235976 -0
  116. natural_pdf/utils/debug.py +32 -0
  117. natural_pdf/utils/highlighting.py +8 -2
  118. natural_pdf/utils/identifiers.py +29 -0
  119. natural_pdf/utils/packaging.py +418 -0
  120. natural_pdf/utils/reading_order.py +65 -63
  121. natural_pdf/utils/text_extraction.py +195 -0
  122. natural_pdf/utils/visualization.py +70 -61
  123. natural_pdf/widgets/__init__.py +2 -3
  124. natural_pdf/widgets/viewer.py +749 -718
  125. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
  126. natural_pdf-0.1.6.dist-info/RECORD +141 -0
  127. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  128. natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
  129. notebooks/Examples.ipynb +1293 -0
  130. pdfs/.gitkeep +0 -0
  131. pdfs/01-practice.pdf +543 -0
  132. pdfs/0500000US42001.pdf +0 -0
  133. pdfs/0500000US42007.pdf +0 -0
  134. pdfs/2014 Statistics.pdf +0 -0
  135. pdfs/2019 Statistics.pdf +0 -0
  136. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  137. pdfs/needs-ocr.pdf +0 -0
  138. natural_pdf/templates/ocr_debug.html +0 -517
  139. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  140. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  141. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
docs/tables/index.md ADDED
@@ -0,0 +1,144 @@
1
+ # Table Extraction
2
+
3
+ Extracting tables from PDFs can range from straightforward to complex. Natural PDF provides several tools and methods to handle different scenarios, leveraging both rule-based (`pdfplumber`) and model-based (`TATR`) approaches.
4
+
5
+ ## Setup
6
+
7
+ Let's load a PDF containing tables.
8
+
9
+ ```python
10
+ from natural_pdf import PDF
11
+
12
+ # Load the PDF
13
+ pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
14
+
15
+ # Select the first page
16
+ page = pdf.pages[0]
17
+
18
+ # Display the page
19
+ page.show()
20
+ ```
21
+
22
+ ## Basic Table Extraction (No Detection)
23
+
24
+ If you know a table exists, you can try `extract_table()` directly on the page or a region. This uses `pdfplumber` behind the scenes.
25
+
26
+ ```python
27
+ # Extract the first table found on the page using pdfplumber
28
+ # This works best for simple tables with clear lines
29
+ table_data = page.extract_table() # Returns a list of lists
30
+ table_data
31
+ ```
32
+
33
+ *This might fail or give poor results if there are multiple tables or the table structure is complex.*
34
+
35
+ ## Layout Analysis for Table Detection
36
+
37
+ A more robust approach can be to first *detect* the table boundaries using layout analysis.
38
+
39
+ ### Using YOLO (Default)
40
+
41
+ The default YOLO model finds the overall bounding box of tables.
42
+
43
+ ```python
44
+ # Detect layout elements using YOLO (default)
45
+ page.analyze_layout(engine='yolo')
46
+
47
+ # Find regions detected as tables
48
+ table_regions_yolo = page.find_all('region[type=table][model=yolo]')
49
+ table_regions_yolo.show()
50
+ ```
51
+
52
+ ```python
53
+ table_regions_yolo[0].extract_table()
54
+ ```
55
+
56
+ ### Using TATR (Table Transformer)
57
+
58
+ The TATR model provides detailed table structure (rows, columns, headers).
59
+
60
+ ```python
61
+ page.clear_detected_layout_regions() # Clear previous YOLO regions for clarity
62
+ page.analyze_layout(engine='tatr')
63
+ ```
64
+
65
+ ```python
66
+ # Find the main table region(s) detected by TATR
67
+ tatr_table = page.find('region[type=table][model=tatr]')
68
+ tatr_table.show()
69
+ ```
70
+
71
+ ```python
72
+ # Find rows, columns, headers detected by TATR
73
+ rows = page.find_all('region[type=table-row][model=tatr]')
74
+ cols = page.find_all('region[type=table-column][model=tatr]')
75
+ hdrs = page.find_all('region[type=table-column-header][model=tatr]')
76
+ f"TATR found: {len(rows)} rows, {len(cols)} columns, {len(hdrs)} headers"
77
+ ```
78
+
79
+ ## Controlling Extraction Method (`plumber` vs `tatr`)
80
+
81
+ When you call `extract_table()` on a region:
82
+ - If the region was detected by **YOLO** (or not detected at all), it uses the `plumber` method, which is backed by `pdfplumber`.
83
+ - If the region was detected by **TATR**, it defaults to the `tatr` method, which uses the detected row/column structure.
84
+
85
+ You can override this using the `method` argument.
86
+
87
+ ```python
88
+ tatr_table = page.find('region[type=table][model=tatr]')
89
+ tatr_table.extract_table(method='tatr')
90
+ ```
91
+
92
+ ```python
93
+ # Force using pdfplumber even on a TATR-detected region
94
+ # (Might be useful for comparison or if TATR structure is flawed)
95
+ tatr_table = page.find('region[type=table][model=tatr]')
96
+ tatr_table.extract_table(method='pdfplumber')
97
+ ```
98
+
99
+ ### When to Use Which Method?
100
+
101
+ - **`pdfplumber`**: Good for simple tables with clear grid lines. Faster.
102
+ - **`tatr`**: Better for tables without clear lines, complex cell merging, or irregular layouts. Leverages the model's understanding of rows and columns.
103
+
104
+ ## Customizing `pdfplumber` Settings
105
+
106
+ If using the `pdfplumber` method (explicitly or implicitly), you can pass `pdfplumber` settings via `table_settings`.
107
+
108
+ ```python
109
+ # Example: Use text alignment for vertical lines, lines drawn on the page for horizontal
110
+ # See pdfplumber documentation for all settings
111
+ table_settings = {
112
+ "vertical_strategy": "text",
113
+ "horizontal_strategy": "lines",
114
+ "intersection_x_tolerance": 5, # Increase tolerance for intersections
115
+ }
116
+
117
+ results = page.extract_table(
118
+ table_settings=table_settings
119
+ )
120
+ ```
121
+
122
+ ## Saving Extracted Tables
123
+
124
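+ You can easily save the extracted data (a list of lists) to common formats by first loading it into a pandas `DataFrame`, which can then be written out with methods such as `to_csv()` or `to_excel()`.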
125
+
126
+ ```python
127
+ import pandas as pd
128
+
129
+ pd.DataFrame(page.extract_table())
130
+ ```
131
+
132
+ ## Working Directly with TATR Cells
133
+
134
+ The TATR engine implicitly creates cell regions at the intersection of detected rows and columns. These are intended to be accessible for fine-grained control, although the example below does not currently work.
135
+
136
+ ```python
137
+ # NOTE: Accessing `.cells` does not work yet; the cause is unknown and still needs troubleshooting.
138
+ # tatr_table.cells
139
+ ```
140
+
141
+ ## Next Steps
142
+
143
+ - [Layout Analysis](../layout-analysis/index.ipynb): Understand how table detection fits into overall document structure analysis.
144
+ - [Working with Regions](../regions/index.ipynb): Manually define table areas if detection fails.