natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +209 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +288 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +413 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +512 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +604 -0
  56. docs/tutorials/12-ocr-integration.md +175 -0
  57. docs/tutorials/13-semantic-search.ipynb +1328 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +50 -33
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/gemini.py +264 -0
  67. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  68. natural_pdf/analyzers/layout/layout_manager.py +125 -58
  69. natural_pdf/analyzers/layout/layout_options.py +43 -17
  70. natural_pdf/analyzers/layout/paddle.py +152 -95
  71. natural_pdf/analyzers/layout/surya.py +164 -92
  72. natural_pdf/analyzers/layout/tatr.py +149 -84
  73. natural_pdf/analyzers/layout/yolo.py +89 -45
  74. natural_pdf/analyzers/text_options.py +22 -15
  75. natural_pdf/analyzers/text_structure.py +131 -85
  76. natural_pdf/analyzers/utils.py +30 -23
  77. natural_pdf/collections/pdf_collection.py +146 -97
  78. natural_pdf/core/__init__.py +1 -1
  79. natural_pdf/core/element_manager.py +419 -337
  80. natural_pdf/core/highlighting_service.py +268 -196
  81. natural_pdf/core/page.py +1044 -521
  82. natural_pdf/core/pdf.py +516 -313
  83. natural_pdf/elements/__init__.py +1 -1
  84. natural_pdf/elements/base.py +307 -225
  85. natural_pdf/elements/collections.py +805 -543
  86. natural_pdf/elements/line.py +39 -36
  87. natural_pdf/elements/rect.py +32 -30
  88. natural_pdf/elements/region.py +889 -879
  89. natural_pdf/elements/text.py +127 -99
  90. natural_pdf/exporters/__init__.py +0 -1
  91. natural_pdf/exporters/searchable_pdf.py +261 -102
  92. natural_pdf/ocr/__init__.py +57 -35
  93. natural_pdf/ocr/engine.py +150 -46
  94. natural_pdf/ocr/engine_easyocr.py +146 -150
  95. natural_pdf/ocr/engine_paddle.py +118 -175
  96. natural_pdf/ocr/engine_surya.py +78 -141
  97. natural_pdf/ocr/ocr_factory.py +114 -0
  98. natural_pdf/ocr/ocr_manager.py +122 -124
  99. natural_pdf/ocr/ocr_options.py +16 -20
  100. natural_pdf/ocr/utils.py +98 -0
  101. natural_pdf/qa/__init__.py +1 -1
  102. natural_pdf/qa/document_qa.py +119 -111
  103. natural_pdf/search/__init__.py +37 -31
  104. natural_pdf/search/haystack_search_service.py +312 -189
  105. natural_pdf/search/haystack_utils.py +186 -122
  106. natural_pdf/search/search_options.py +25 -14
  107. natural_pdf/search/search_service_protocol.py +12 -6
  108. natural_pdf/search/searchable_mixin.py +261 -176
  109. natural_pdf/selectors/__init__.py +2 -1
  110. natural_pdf/selectors/parser.py +159 -316
  111. natural_pdf/templates/__init__.py +1 -1
  112. natural_pdf/templates/spa/css/style.css +334 -0
  113. natural_pdf/templates/spa/index.html +31 -0
  114. natural_pdf/templates/spa/js/app.js +472 -0
  115. natural_pdf/templates/spa/words.txt +235976 -0
  116. natural_pdf/utils/debug.py +32 -0
  117. natural_pdf/utils/highlighting.py +8 -2
  118. natural_pdf/utils/identifiers.py +29 -0
  119. natural_pdf/utils/packaging.py +418 -0
  120. natural_pdf/utils/reading_order.py +65 -63
  121. natural_pdf/utils/text_extraction.py +195 -0
  122. natural_pdf/utils/visualization.py +70 -61
  123. natural_pdf/widgets/__init__.py +2 -3
  124. natural_pdf/widgets/viewer.py +749 -718
  125. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
  126. natural_pdf-0.1.6.dist-info/RECORD +141 -0
  127. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  128. natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
  129. notebooks/Examples.ipynb +1293 -0
  130. pdfs/.gitkeep +0 -0
  131. pdfs/01-practice.pdf +543 -0
  132. pdfs/0500000US42001.pdf +0 -0
  133. pdfs/0500000US42007.pdf +0 -0
  134. pdfs/2014 Statistics.pdf +0 -0
  135. pdfs/2019 Statistics.pdf +0 -0
  136. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  137. pdfs/needs-ocr.pdf +0 -0
  138. natural_pdf/templates/ocr_debug.html +0 -517
  139. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  140. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  141. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,40 +1,44 @@
1
1
  """
2
2
  Visualization utilities for natural-pdf.
3
3
  """
4
- from typing import List, Dict, Tuple, Optional, Union, Any, Set
4
+
5
5
  import io
6
+ import itertools # Added for cycling
6
7
  import math
7
8
  import random
8
- import itertools # Added for cycling
9
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
10
+
9
11
  from PIL import Image, ImageDraw, ImageFont
10
12
 
11
13
  # Define a base list of visually distinct colors for highlighting
12
14
  # Format: (R, G, B)
13
15
  _BASE_HIGHLIGHT_COLORS = [
14
- (255, 0, 0), # Red
15
- (0, 255, 0), # Green
16
- (0, 0, 255), # Blue
17
- (255, 0, 255), # Magenta
18
- (0, 255, 255), # Cyan
19
- (255, 165, 0), # Orange
20
- (128, 0, 128), # Purple
21
- (0, 128, 0), # Dark Green
22
- (0, 0, 128), # Navy
23
- (255, 215, 0), # Gold
24
- (75, 0, 130), # Indigo
16
+ (255, 0, 0), # Red
17
+ (0, 255, 0), # Green
18
+ (0, 0, 255), # Blue
19
+ (255, 0, 255), # Magenta
20
+ (0, 255, 255), # Cyan
21
+ (255, 165, 0), # Orange
22
+ (128, 0, 128), # Purple
23
+ (0, 128, 0), # Dark Green
24
+ (0, 0, 128), # Navy
25
+ (255, 215, 0), # Gold
26
+ (75, 0, 130), # Indigo
25
27
  (240, 128, 128), # Light Coral
26
- (32, 178, 170), # Light Sea Green
27
- (138, 43, 226), # Blue Violet
28
- (160, 82, 45), # Sienna
28
+ (32, 178, 170), # Light Sea Green
29
+ (138, 43, 226), # Blue Violet
30
+ (160, 82, 45), # Sienna
29
31
  ]
30
32
 
31
33
  # Default Alpha for highlight fills
32
34
  DEFAULT_FILL_ALPHA = 100
33
35
 
36
+
34
37
  class ColorManager:
35
38
  """
36
39
  Manages color assignment for highlights, ensuring consistency for labels.
37
40
  """
41
+
38
42
  def __init__(self, alpha: int = DEFAULT_FILL_ALPHA):
39
43
  """
40
44
  Initializes the ColorManager.
@@ -52,8 +56,9 @@ class ColorManager:
52
56
  """Applies the instance's alpha to an RGB tuple."""
53
57
  return (*rgb, self._alpha)
54
58
 
55
- def get_color(self, label: Optional[str] = None,
56
- force_cycle: bool = False) -> Tuple[int, int, int, int]:
59
+ def get_color(
60
+ self, label: Optional[str] = None, force_cycle: bool = False
61
+ ) -> Tuple[int, int, int, int]:
57
62
  """
58
63
  Gets an RGBA color tuple.
59
64
 
@@ -72,7 +77,7 @@ class ColorManager:
72
77
  # Always get the next color, don't store by label
73
78
  rgb = next(self._color_cycle)
74
79
  return self._get_rgba_color(rgb)
75
-
80
+
76
81
  if label is not None:
77
82
  if label in self._labels_colors:
78
83
  # Return existing color for this label
@@ -99,41 +104,43 @@ class ColorManager:
99
104
  self._color_cycle = itertools.cycle(self._available_colors)
100
105
  self._labels_colors = {}
101
106
 
102
- # --- Global color state and functions removed ---
107
+
108
+ # --- Global color state and functions removed ---
103
109
  # HIGHLIGHT_COLORS, _color_cycle, _current_labels_colors, _used_colors_iterator
104
110
  # get_next_highlight_color(), reset_highlight_colors()
105
111
 
106
- def create_legend(labels_colors: Dict[str, Tuple[int, int, int, int]],
107
- width: int = 200,
108
- item_height: int = 30) -> Image.Image:
112
+
113
+ def create_legend(
114
+ labels_colors: Dict[str, Tuple[int, int, int, int]], width: int = 200, item_height: int = 30
115
+ ) -> Image.Image:
109
116
  """
110
117
  Create a legend image for the highlighted elements.
111
-
118
+
112
119
  Args:
113
120
  labels_colors: Dictionary mapping labels to colors
114
121
  width: Width of the legend image
115
122
  item_height: Height of each legend item
116
-
123
+
117
124
  Returns:
118
125
  PIL Image with the legend
119
126
  """
120
127
  # Calculate the height based on the number of labels
121
128
  height = len(labels_colors) * item_height + 10 # 10px padding
122
-
129
+
123
130
  # Create a white image
124
- legend = Image.new('RGBA', (width, height), (255, 255, 255, 255))
131
+ legend = Image.new("RGBA", (width, height), (255, 255, 255, 255))
125
132
  draw = ImageDraw.Draw(legend)
126
-
133
+
127
134
  # Try to load a font, use default if not available
128
135
  try:
129
136
  # Use a commonly available font, adjust size
130
- font = ImageFont.truetype("DejaVuSans.ttf", 14)
137
+ font = ImageFont.truetype("DejaVuSans.ttf", 14)
131
138
  except IOError:
132
139
  try:
133
- font = ImageFont.truetype("Arial.ttf", 14)
140
+ font = ImageFont.truetype("Arial.ttf", 14)
134
141
  except IOError:
135
142
  font = ImageFont.load_default()
136
-
143
+
137
144
  # Draw each legend item
138
145
  y = 5 # Start with 5px padding
139
146
  for label, color in labels_colors.items():
@@ -141,10 +148,10 @@ def create_legend(labels_colors: Dict[str, Tuple[int, int, int, int]],
141
148
  # Handle potential case where alpha isn't provided (use default 255)
142
149
  if len(color) == 3:
143
150
  r, g, b = color
144
- alpha = 255 # Assume opaque if alpha is missing
151
+ alpha = 255 # Assume opaque if alpha is missing
145
152
  else:
146
153
  r, g, b, alpha = color
147
-
154
+
148
155
  # Calculate the apparent color when drawn on white background
149
156
  # Alpha blending formula: result = (source * alpha) + (dest * (1-alpha))
150
157
  # Where alpha is normalized to 0-1 range
@@ -152,72 +159,74 @@ def create_legend(labels_colors: Dict[str, Tuple[int, int, int, int]],
152
159
  apparent_r = int(r * alpha_norm + 255 * (1 - alpha_norm))
153
160
  apparent_g = int(g * alpha_norm + 255 * (1 - alpha_norm))
154
161
  apparent_b = int(b * alpha_norm + 255 * (1 - alpha_norm))
155
-
162
+
156
163
  # Use solid color that matches the apparent color of the semi-transparent highlight
157
164
  legend_color = (apparent_r, apparent_g, apparent_b, 255)
158
-
165
+
159
166
  # Draw the color box
160
167
  draw.rectangle([(10, y), (30, y + item_height - 5)], fill=legend_color)
161
-
168
+
162
169
  # Draw the label text
163
170
  draw.text((40, y + (item_height // 2) - 6), label, fill=(0, 0, 0, 255), font=font)
164
-
171
+
165
172
  # Move to the next position
166
173
  y += item_height
167
-
174
+
168
175
  return legend
169
176
 
170
- def merge_images_with_legend(image: Image.Image,
171
- legend: Image.Image,
172
- position: str = 'right') -> Image.Image:
177
+
178
+ def merge_images_with_legend(
179
+ image: Image.Image, legend: Image.Image, position: str = "right"
180
+ ) -> Image.Image:
173
181
  """
174
182
  Merge an image with a legend.
175
-
183
+
176
184
  Args:
177
185
  image: Main image
178
186
  legend: Legend image
179
187
  position: Position of the legend ('right', 'bottom', 'top', 'left')
180
-
188
+
181
189
  Returns:
182
190
  Merged image
183
191
  """
184
192
  if not legend:
185
- return image # Return original image if legend is None or empty
186
-
187
- # Determine background color from top-left pixel (safer than assuming white)
188
- bg_color = image.getpixel((0,0)) if image.mode == 'RGBA' else (255, 255, 255, 255)
193
+ return image # Return original image if legend is None or empty
189
194
 
190
- if position == 'right':
195
+ bg_color = (255, 255, 255, 255) # Always use white for the merged background
196
+
197
+ if position == "right":
191
198
  # Create a new image with extra width for the legend
192
199
  merged_width = image.width + legend.width
193
200
  merged_height = max(image.height, legend.height)
194
- merged = Image.new('RGBA', (merged_width, merged_height), bg_color)
201
+ merged = Image.new("RGBA", (merged_width, merged_height), bg_color)
195
202
  merged.paste(image, (0, 0))
196
- merged.paste(legend, (image.width, 0), legend if legend.mode == 'RGBA' else None) # Handle transparency
197
- elif position == 'bottom':
203
+ merged.paste(
204
+ legend, (image.width, 0), legend if legend.mode == "RGBA" else None
205
+ ) # Handle transparency
206
+ elif position == "bottom":
198
207
  # Create a new image with extra height for the legend
199
208
  merged_width = max(image.width, legend.width)
200
209
  merged_height = image.height + legend.height
201
- merged = Image.new('RGBA', (merged_width, merged_height), bg_color)
210
+ merged = Image.new("RGBA", (merged_width, merged_height), bg_color)
202
211
  merged.paste(image, (0, 0))
203
- merged.paste(legend, (0, image.height), legend if legend.mode == 'RGBA' else None)
204
- elif position == 'top':
212
+ merged.paste(legend, (0, image.height), legend if legend.mode == "RGBA" else None)
213
+ elif position == "top":
205
214
  # Create a new image with extra height for the legend
206
215
  merged_width = max(image.width, legend.width)
207
216
  merged_height = image.height + legend.height
208
- merged = Image.new('RGBA', (merged_width, merged_height), bg_color)
209
- merged.paste(legend, (0, 0), legend if legend.mode == 'RGBA' else None)
217
+ merged = Image.new("RGBA", (merged_width, merged_height), bg_color)
218
+ merged.paste(legend, (0, 0), legend if legend.mode == "RGBA" else None)
210
219
  merged.paste(image, (0, legend.height))
211
- elif position == 'left':
220
+ elif position == "left":
212
221
  # Create a new image with extra width for the legend
213
222
  merged_width = image.width + legend.width
214
223
  merged_height = max(image.height, legend.height)
215
- merged = Image.new('RGBA', (merged_width, merged_height), bg_color)
216
- merged.paste(legend, (0, 0), legend if legend.mode == 'RGBA' else None)
224
+ merged = Image.new("RGBA", (merged_width, merged_height), bg_color)
225
+ merged.paste(legend, (0, 0), legend if legend.mode == "RGBA" else None)
217
226
  merged.paste(image, (legend.width, 0))
218
227
  else:
219
228
  # Invalid position, return the original image
220
229
  print(f"Warning: Invalid legend position '{position}'. Returning original image.")
221
230
  merged = image
222
-
223
- return merged
231
+
232
+ return merged
@@ -1,4 +1,3 @@
1
- from .viewer import SimpleInteractiveViewerWidget as InteractiveViewerWidget
2
-
3
1
  # Also provide the original implementation for reference
4
- from .viewer import InteractiveViewerWidget as _OriginalInteractiveViewerWidget
2
+ from .viewer import InteractiveViewerWidget as _OriginalInteractiveViewerWidget
3
+ from .viewer import SimpleInteractiveViewerWidget as InteractiveViewerWidget