natural-pdf 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +117 -75
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/elements/base.py +9 -9
- natural_pdf/elements/collections.py +105 -50
- natural_pdf/elements/region.py +200 -126
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
natural_pdf/flows/flow.py
CHANGED
@@ -3,11 +3,12 @@ from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union
|
|
3
3
|
|
4
4
|
if TYPE_CHECKING:
|
5
5
|
from natural_pdf.core.page import Page
|
6
|
-
from natural_pdf.elements.region import Region as PhysicalRegion
|
7
6
|
from natural_pdf.elements.base import Element as PhysicalElement
|
8
7
|
from natural_pdf.elements.collections import ElementCollection as PhysicalElementCollection
|
9
|
-
from .
|
8
|
+
from natural_pdf.elements.region import Region as PhysicalRegion
|
9
|
+
|
10
10
|
from .collections import FlowElementCollection
|
11
|
+
from .element import FlowElement
|
11
12
|
|
12
13
|
logger = logging.getLogger(__name__)
|
13
14
|
|
@@ -53,14 +54,18 @@ class Flow:
|
|
53
54
|
|
54
55
|
self.segments: List["PhysicalRegion"] = self._normalize_segments(segments)
|
55
56
|
self.arrangement: Literal["vertical", "horizontal"] = arrangement
|
56
|
-
self.alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] =
|
57
|
+
self.alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = (
|
58
|
+
alignment
|
59
|
+
)
|
57
60
|
self.segment_gap: float = segment_gap
|
58
61
|
|
59
62
|
self._validate_alignment()
|
60
63
|
|
61
64
|
# TODO: Pre-calculate segment offsets for faster lookups if needed
|
62
65
|
|
63
|
-
def _normalize_segments(
|
66
|
+
def _normalize_segments(
|
67
|
+
self, segments: List[Union["Page", "PhysicalRegion"]]
|
68
|
+
) -> List["PhysicalRegion"]:
|
64
69
|
"""Converts all Page segments to full-page Region objects for uniform processing."""
|
65
70
|
normalized = []
|
66
71
|
from natural_pdf.core.page import Page as CorePage
|
@@ -71,13 +76,17 @@ class Flow:
|
|
71
76
|
normalized.append(segment.region(0, 0, segment.width, segment.height))
|
72
77
|
elif isinstance(segment, ElementsRegion):
|
73
78
|
normalized.append(segment)
|
74
|
-
elif hasattr(segment,
|
79
|
+
elif hasattr(segment, "object_type") and segment.object_type == "page":
|
75
80
|
if not isinstance(segment, CorePage):
|
76
|
-
raise TypeError(
|
81
|
+
raise TypeError(
|
82
|
+
f"Segment {i} has object_type 'page' but is not an instance of natural_pdf.core.page.Page. Got {type(segment)}"
|
83
|
+
)
|
77
84
|
normalized.append(segment.region(0, 0, segment.width, segment.height))
|
78
|
-
elif hasattr(segment,
|
85
|
+
elif hasattr(segment, "object_type") and segment.object_type == "region":
|
79
86
|
if not isinstance(segment, ElementsRegion):
|
80
|
-
raise TypeError(
|
87
|
+
raise TypeError(
|
88
|
+
f"Segment {i} has object_type 'region' but is not an instance of natural_pdf.elements.region.Region. Got {type(segment)}"
|
89
|
+
)
|
81
90
|
normalized.append(segment)
|
82
91
|
else:
|
83
92
|
raise TypeError(
|
@@ -129,7 +138,7 @@ class Flow:
|
|
129
138
|
apply_exclusions=apply_exclusions,
|
130
139
|
regex=regex,
|
131
140
|
case=case,
|
132
|
-
**kwargs
|
141
|
+
**kwargs,
|
133
142
|
)
|
134
143
|
return results.first if results else None
|
135
144
|
|
@@ -172,7 +181,7 @@ class Flow:
|
|
172
181
|
# This preserves the order from matches_in_segment.elements
|
173
182
|
for phys_elem in matches_in_segment.elements:
|
174
183
|
all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
|
175
|
-
|
184
|
+
|
176
185
|
# The global sort that was here previously has been removed.
|
177
186
|
# The order is now determined by segment sequence, then by local order within each segment.
|
178
187
|
|
@@ -184,10 +193,12 @@ class Flow:
|
|
184
193
|
f"arrangement='{self.arrangement}', alignment='{self.alignment}', gap={self.segment_gap}>"
|
185
194
|
)
|
186
195
|
|
187
|
-
# --- Helper methods for coordinate transformations and segment iteration ---
|
196
|
+
# --- Helper methods for coordinate transformations and segment iteration ---
|
188
197
|
# These will be crucial for FlowElement's directional methods.
|
189
198
|
|
190
|
-
def get_segment_bounding_box_in_flow(
|
199
|
+
def get_segment_bounding_box_in_flow(
|
200
|
+
self, segment_index: int
|
201
|
+
) -> Optional[tuple[float, float, float, float]]:
|
191
202
|
"""
|
192
203
|
Calculates the conceptual bounding box of a segment within the flow's coordinate system.
|
193
204
|
This considers arrangement, alignment, and segment gaps.
|
@@ -196,15 +207,19 @@ class Flow:
|
|
196
207
|
"""
|
197
208
|
if segment_index < 0 or segment_index >= len(self.segments):
|
198
209
|
return None
|
199
|
-
|
210
|
+
|
200
211
|
# This is a simplified version. A full implementation would calculate offsets.
|
201
212
|
# For now, we assume FlowElement directional logic handles segment traversal and uses physical coords.
|
202
213
|
# If we were to *draw* the flow or get a FlowRegion bbox that spans gaps, this would be critical.
|
203
214
|
# physical_segment = self.segments[segment_index]
|
204
215
|
# return physical_segment.bbox
|
205
|
-
raise NotImplementedError(
|
216
|
+
raise NotImplementedError(
|
217
|
+
"Calculating a segment's bbox *within the flow's virtual coordinate system* is not yet fully implemented."
|
218
|
+
)
|
206
219
|
|
207
|
-
def get_element_flow_coordinates(
|
220
|
+
def get_element_flow_coordinates(
|
221
|
+
self, physical_element: "PhysicalElement"
|
222
|
+
) -> Optional[tuple[float, float, float, float]]:
|
208
223
|
"""
|
209
224
|
Translates a physical element's coordinates into the flow's virtual coordinate system.
|
210
225
|
(Placeholder - very complex if segment_gap > 0 or complex alignments)
|
@@ -213,4 +228,6 @@ class Flow:
|
|
213
228
|
# if FlowRegion.bbox or other operations needed to present a unified coordinate space.
|
214
229
|
# As per our discussion, elements *within* a FlowRegion retain original physical coordinates.
|
215
230
|
# So, this might not be strictly necessary for the current design's core functionality.
|
216
|
-
raise NotImplementedError(
|
231
|
+
raise NotImplementedError(
|
232
|
+
"Translating element coordinates to a unified flow coordinate system is not yet implemented."
|
233
|
+
)
|
natural_pdf/flows/region.py
CHANGED
@@ -1,19 +1,21 @@
|
|
1
1
|
import logging
|
2
2
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
3
3
|
|
4
|
-
from pdfplumber.utils.geometry import objects_to_bbox
|
4
|
+
from pdfplumber.utils.geometry import objects_to_bbox # For calculating combined bbox
|
5
5
|
|
6
6
|
# For runtime image manipulation
|
7
7
|
from PIL import Image as PIL_Image_Runtime
|
8
8
|
|
9
9
|
if TYPE_CHECKING:
|
10
|
-
from PIL.Image import Image as PIL_Image
|
10
|
+
from PIL.Image import Image as PIL_Image # For type hints
|
11
|
+
|
12
|
+
from natural_pdf.core.page import Page as PhysicalPage
|
11
13
|
from natural_pdf.elements.base import Element as PhysicalElement
|
12
|
-
from natural_pdf.elements.region import Region as PhysicalRegion
|
13
14
|
from natural_pdf.elements.collections import ElementCollection
|
14
|
-
from natural_pdf.
|
15
|
-
|
15
|
+
from natural_pdf.elements.region import Region as PhysicalRegion
|
16
|
+
|
16
17
|
from .element import FlowElement
|
18
|
+
from .flow import Flow
|
17
19
|
|
18
20
|
logger = logging.getLogger(__name__)
|
19
21
|
|
@@ -53,7 +55,7 @@ class FlowRegion:
|
|
53
55
|
|
54
56
|
# Cache for expensive operations
|
55
57
|
self._cached_text: Optional[str] = None
|
56
|
-
self._cached_elements: Optional["ElementCollection"] = None
|
58
|
+
self._cached_elements: Optional["ElementCollection"] = None # Stringized
|
57
59
|
self._cached_bbox: Optional[Tuple[float, float, float, float]] = None
|
58
60
|
|
59
61
|
@property
|
@@ -68,7 +70,7 @@ class FlowRegion:
|
|
68
70
|
return self._cached_bbox
|
69
71
|
if not self.constituent_regions:
|
70
72
|
return None
|
71
|
-
|
73
|
+
|
72
74
|
# Use objects_to_bbox from pdfplumber.utils.geometry to merge bboxes
|
73
75
|
# This helper expects a list of objects that have .x0, .top, .x1, .bottom attributes.
|
74
76
|
# Our PhysicalRegion objects satisfy this.
|
@@ -113,7 +115,9 @@ class FlowRegion:
|
|
113
115
|
Returns:
|
114
116
|
The combined text content as a string.
|
115
117
|
"""
|
116
|
-
if
|
118
|
+
if (
|
119
|
+
self._cached_text is not None and apply_exclusions
|
120
|
+
): # Simple cache check, might need refinement if kwargs change behavior
|
117
121
|
return self._cached_text
|
118
122
|
|
119
123
|
if not self.constituent_regions:
|
@@ -124,17 +128,19 @@ class FlowRegion:
|
|
124
128
|
# The FlowElement._flow_direction method is responsible for ordering constituent_regions correctly.
|
125
129
|
for region in self.constituent_regions:
|
126
130
|
texts.append(region.extract_text(apply_exclusions=apply_exclusions, **kwargs))
|
127
|
-
|
131
|
+
|
128
132
|
# Join based on flow arrangement (e.g., newline for vertical, space for horizontal)
|
129
133
|
# This is a simplification; true layout-aware joining would be more complex.
|
130
|
-
joiner =
|
134
|
+
joiner = (
|
135
|
+
"\n" if self.flow.arrangement == "vertical" else " "
|
136
|
+
) # TODO: Make this smarter, consider segment_gap
|
131
137
|
extracted = joiner.join(t for t in texts if t)
|
132
|
-
|
133
|
-
if apply_exclusions:
|
134
|
-
|
138
|
+
|
139
|
+
if apply_exclusions: # Only cache if standard exclusion behavior
|
140
|
+
self._cached_text = extracted
|
135
141
|
return extracted
|
136
142
|
|
137
|
-
def elements(self, apply_exclusions: bool = True) -> "ElementCollection":
|
143
|
+
def elements(self, apply_exclusions: bool = True) -> "ElementCollection": # Stringized return
|
138
144
|
"""
|
139
145
|
Collects all unique physical elements from all constituent physical regions.
|
140
146
|
|
@@ -145,36 +151,44 @@ class FlowRegion:
|
|
145
151
|
Returns:
|
146
152
|
An ElementCollection containing all unique elements.
|
147
153
|
"""
|
148
|
-
from natural_pdf.elements.collections import
|
154
|
+
from natural_pdf.elements.collections import (
|
155
|
+
ElementCollection as RuntimeElementCollection, # Local import
|
156
|
+
)
|
149
157
|
|
150
|
-
if self._cached_elements is not None and apply_exclusions:
|
158
|
+
if self._cached_elements is not None and apply_exclusions: # Simple cache check
|
151
159
|
return self._cached_elements
|
152
160
|
|
153
161
|
if not self.constituent_regions:
|
154
162
|
return RuntimeElementCollection([])
|
155
163
|
|
156
|
-
all_physical_elements: List["PhysicalElement"] = []
|
157
|
-
seen_elements =
|
164
|
+
all_physical_elements: List["PhysicalElement"] = [] # Stringized item type
|
165
|
+
seen_elements = (
|
166
|
+
set()
|
167
|
+
) # To ensure uniqueness if elements are shared or duplicated by region definitions
|
158
168
|
|
159
169
|
for region in self.constituent_regions:
|
160
170
|
# Region.get_elements() returns a list, not ElementCollection
|
161
|
-
elements_in_region: List["PhysicalElement"] = region.get_elements(
|
171
|
+
elements_in_region: List["PhysicalElement"] = region.get_elements(
|
172
|
+
apply_exclusions=apply_exclusions
|
173
|
+
)
|
162
174
|
for elem in elements_in_region:
|
163
|
-
if elem not in seen_elements:
|
175
|
+
if elem not in seen_elements: # Check for uniqueness based on object identity
|
164
176
|
all_physical_elements.append(elem)
|
165
177
|
seen_elements.add(elem)
|
166
178
|
|
167
179
|
# Basic reading order sort based on original page and coordinates.
|
168
|
-
def get_sort_key(phys_elem: "PhysicalElement"):
|
180
|
+
def get_sort_key(phys_elem: "PhysicalElement"): # Stringized param type
|
169
181
|
page_idx = -1
|
170
|
-
if hasattr(phys_elem,
|
182
|
+
if hasattr(phys_elem, "page") and hasattr(phys_elem.page, "index"):
|
171
183
|
page_idx = phys_elem.page.index
|
172
184
|
return (page_idx, phys_elem.top, phys_elem.x0)
|
173
185
|
|
174
186
|
try:
|
175
187
|
sorted_physical_elements = sorted(all_physical_elements, key=get_sort_key)
|
176
188
|
except AttributeError:
|
177
|
-
logger.warning(
|
189
|
+
logger.warning(
|
190
|
+
"Could not sort elements in FlowRegion by reading order; some elements might be missing page, top or x0 attributes."
|
191
|
+
)
|
178
192
|
sorted_physical_elements = all_physical_elements
|
179
193
|
|
180
194
|
result_collection = RuntimeElementCollection(sorted_physical_elements)
|
@@ -182,22 +196,30 @@ class FlowRegion:
|
|
182
196
|
self._cached_elements = result_collection
|
183
197
|
return result_collection
|
184
198
|
|
185
|
-
def find(
|
199
|
+
def find(
|
200
|
+
self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
|
201
|
+
) -> Optional["PhysicalElement"]: # Stringized
|
186
202
|
"""
|
187
203
|
Finds the first physical element within this FlowRegion that matches the selector or text.
|
188
204
|
"""
|
189
205
|
# Uses self.elements() which respects exclusions if apply_exclusions=True by default
|
190
206
|
all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
|
191
|
-
return all_elems.find(selector=selector, text=text, **kwargs)
|
207
|
+
return all_elems.find(selector=selector, text=text, **kwargs) # ElementCollection.find
|
192
208
|
|
193
|
-
def find_all(
|
209
|
+
def find_all(
|
210
|
+
self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
|
211
|
+
) -> "ElementCollection": # Stringized
|
194
212
|
"""
|
195
213
|
Finds all physical elements within this FlowRegion that match the selector or text.
|
196
214
|
"""
|
197
215
|
all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
|
198
|
-
return all_elems.find_all(
|
216
|
+
return all_elems.find_all(
|
217
|
+
selector=selector, text=text, **kwargs
|
218
|
+
) # ElementCollection.find_all
|
199
219
|
|
200
|
-
def highlight(
|
220
|
+
def highlight(
|
221
|
+
self, label: Optional[str] = None, color: Optional[Union[Tuple, str]] = None, **kwargs
|
222
|
+
) -> "FlowRegion": # Stringized
|
201
223
|
"""
|
202
224
|
Highlights all constituent physical regions on their respective pages.
|
203
225
|
|
@@ -214,7 +236,9 @@ class FlowRegion:
|
|
214
236
|
|
215
237
|
base_label = label if label else "FlowRegionPart"
|
216
238
|
for i, region in enumerate(self.constituent_regions):
|
217
|
-
current_label =
|
239
|
+
current_label = (
|
240
|
+
f"{base_label}_{i+1}" if len(self.constituent_regions) > 1 else base_label
|
241
|
+
)
|
218
242
|
region.highlight(label=current_label, color=color, **kwargs)
|
219
243
|
return self
|
220
244
|
|
@@ -229,7 +253,7 @@ class FlowRegion:
|
|
229
253
|
stack_direction: str = "vertical",
|
230
254
|
stack_gap: int = 5,
|
231
255
|
stack_background_color: Tuple[int, int, int] = (255, 255, 255),
|
232
|
-
**kwargs
|
256
|
+
**kwargs,
|
233
257
|
) -> Optional["PIL_Image"]:
|
234
258
|
"""
|
235
259
|
Generates and returns a PIL Image of relevant pages with constituent regions highlighted.
|
@@ -256,9 +280,9 @@ class FlowRegion:
|
|
256
280
|
# 2. Get a highlighter service (e.g., from the first page involved)
|
257
281
|
first_page_with_regions = next(iter(regions_by_page.keys()), None)
|
258
282
|
highlighter_service = None
|
259
|
-
if first_page_with_regions and hasattr(first_page_with_regions,
|
283
|
+
if first_page_with_regions and hasattr(first_page_with_regions, "_highlighter"):
|
260
284
|
highlighter_service = first_page_with_regions._highlighter
|
261
|
-
|
285
|
+
|
262
286
|
if not highlighter_service:
|
263
287
|
raise ValueError(
|
264
288
|
"Cannot get highlighter service for FlowRegion.show(). "
|
@@ -266,9 +290,12 @@ class FlowRegion:
|
|
266
290
|
)
|
267
291
|
|
268
292
|
output_page_images: List["PIL_Image_Runtime"] = []
|
269
|
-
|
293
|
+
|
270
294
|
# Sort pages by index for consistent output order
|
271
|
-
sorted_pages = sorted(
|
295
|
+
sorted_pages = sorted(
|
296
|
+
regions_by_page.keys(),
|
297
|
+
key=lambda p: p.index if hasattr(p, "index") else getattr(p, "page_number", 0),
|
298
|
+
)
|
272
299
|
|
273
300
|
# 3. Render each page with its relevant constituent regions highlighted
|
274
301
|
for page_idx, page_obj in enumerate(sorted_pages):
|
@@ -279,41 +306,55 @@ class FlowRegion:
|
|
279
306
|
temp_highlights_for_page = []
|
280
307
|
for i, region_part in enumerate(constituent_regions_on_this_page):
|
281
308
|
part_label = None
|
282
|
-
if labels and label_prefix:
|
309
|
+
if labels and label_prefix: # Ensure labels is True for label_prefix to apply
|
283
310
|
# If FlowRegion consists of multiple parts on this page, or overall
|
284
311
|
count_indicator = ""
|
285
|
-
if
|
312
|
+
if (
|
313
|
+
len(self.constituent_regions) > 1
|
314
|
+
): # If flow region has multiple parts overall
|
286
315
|
# Find global index of this region_part in self.constituent_regions
|
287
316
|
try:
|
288
317
|
global_idx = self.constituent_regions.index(region_part)
|
289
318
|
count_indicator = f"_{global_idx + 1}"
|
290
|
-
except ValueError:
|
291
|
-
count_indicator = f"_p{page_idx}i{i+1}"
|
292
|
-
elif
|
293
|
-
|
319
|
+
except ValueError: # Should not happen if region_part is from the list
|
320
|
+
count_indicator = f"_p{page_idx}i{i+1}" # fallback local index
|
321
|
+
elif (
|
322
|
+
len(constituent_regions_on_this_page) > 1
|
323
|
+
): # If multiple parts on *this* page, but FR is single part overall
|
324
|
+
count_indicator = f"_{i+1}"
|
294
325
|
|
295
326
|
part_label = f"{label_prefix}{count_indicator}" if label_prefix else None
|
296
|
-
|
297
|
-
temp_highlights_for_page.append(
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
327
|
+
|
328
|
+
temp_highlights_for_page.append(
|
329
|
+
{
|
330
|
+
"page_index": (
|
331
|
+
page_obj.index
|
332
|
+
if hasattr(page_obj, "index")
|
333
|
+
else getattr(page_obj, "page_number", 1) - 1
|
334
|
+
),
|
335
|
+
"bbox": region_part.bbox,
|
336
|
+
"polygon": region_part.polygon if region_part.has_polygon else None,
|
337
|
+
"color": color, # Use the passed color
|
338
|
+
"label": part_label,
|
339
|
+
"use_color_cycling": False, # Keep specific color
|
340
|
+
}
|
341
|
+
)
|
342
|
+
|
306
343
|
if not temp_highlights_for_page:
|
307
344
|
continue
|
308
345
|
|
309
346
|
page_image = highlighter_service.render_preview(
|
310
|
-
page_index=
|
347
|
+
page_index=(
|
348
|
+
page_obj.index
|
349
|
+
if hasattr(page_obj, "index")
|
350
|
+
else getattr(page_obj, "page_number", 1) - 1
|
351
|
+
),
|
311
352
|
temporary_highlights=temp_highlights_for_page,
|
312
353
|
scale=scale,
|
313
354
|
width=width,
|
314
|
-
labels=labels,
|
355
|
+
labels=labels, # Pass through labels
|
315
356
|
legend_position=legend_position,
|
316
|
-
**kwargs
|
357
|
+
**kwargs,
|
317
358
|
)
|
318
359
|
if page_image:
|
319
360
|
output_page_images.append(page_image)
|
@@ -322,18 +363,23 @@ class FlowRegion:
|
|
322
363
|
if not output_page_images:
|
323
364
|
logger.info("FlowRegion.show() produced no page images to concatenate.")
|
324
365
|
return None
|
325
|
-
|
366
|
+
|
326
367
|
if len(output_page_images) == 1:
|
327
368
|
return output_page_images[0]
|
328
369
|
|
329
370
|
# Stacking logic (same as in FlowRegionCollection.show)
|
330
371
|
if stack_direction == "vertical":
|
331
372
|
final_width = max(img.width for img in output_page_images)
|
332
|
-
final_height =
|
333
|
-
|
373
|
+
final_height = (
|
374
|
+
sum(img.height for img in output_page_images)
|
375
|
+
+ (len(output_page_images) - 1) * stack_gap
|
376
|
+
)
|
377
|
+
if final_width == 0 or final_height == 0:
|
334
378
|
raise ValueError("Cannot create concatenated image with zero width or height.")
|
335
|
-
|
336
|
-
concatenated_image = PIL_Image_Runtime.new(
|
379
|
+
|
380
|
+
concatenated_image = PIL_Image_Runtime.new(
|
381
|
+
"RGB", (final_width, final_height), stack_background_color
|
382
|
+
)
|
337
383
|
current_y = 0
|
338
384
|
for img in output_page_images:
|
339
385
|
paste_x = (final_width - img.width) // 2
|
@@ -341,12 +387,17 @@ class FlowRegion:
|
|
341
387
|
current_y += img.height + stack_gap
|
342
388
|
return concatenated_image
|
343
389
|
elif stack_direction == "horizontal":
|
344
|
-
final_width =
|
390
|
+
final_width = (
|
391
|
+
sum(img.width for img in output_page_images)
|
392
|
+
+ (len(output_page_images) - 1) * stack_gap
|
393
|
+
)
|
345
394
|
final_height = max(img.height for img in output_page_images)
|
346
395
|
if final_width == 0 or final_height == 0:
|
347
396
|
raise ValueError("Cannot create concatenated image with zero width or height.")
|
348
397
|
|
349
|
-
concatenated_image = PIL_Image_Runtime.new(
|
398
|
+
concatenated_image = PIL_Image_Runtime.new(
|
399
|
+
"RGB", (final_width, final_height), stack_background_color
|
400
|
+
)
|
350
401
|
current_x = 0
|
351
402
|
for img in output_page_images:
|
352
403
|
paste_y = (final_height - img.height) // 2
|
@@ -354,15 +405,17 @@ class FlowRegion:
|
|
354
405
|
current_x += img.width + stack_gap
|
355
406
|
return concatenated_image
|
356
407
|
else:
|
357
|
-
raise ValueError(
|
408
|
+
raise ValueError(
|
409
|
+
f"Invalid stack_direction '{stack_direction}' for FlowRegion.show(). Must be 'vertical' or 'horizontal'."
|
410
|
+
)
|
358
411
|
|
359
412
|
def to_images(
|
360
413
|
self,
|
361
414
|
resolution: float = 150,
|
362
|
-
**kwargs,
|
363
|
-
) -> List["PIL_Image"]:
|
415
|
+
**kwargs,
|
416
|
+
) -> List["PIL_Image"]:
|
364
417
|
"""
|
365
|
-
Generates and returns a list of cropped PIL Images,
|
418
|
+
Generates and returns a list of cropped PIL Images,
|
366
419
|
one for each constituent physical region of this FlowRegion.
|
367
420
|
"""
|
368
421
|
if not self.constituent_regions:
|
@@ -373,19 +426,19 @@ class FlowRegion:
|
|
373
426
|
for region_part in self.constituent_regions:
|
374
427
|
try:
|
375
428
|
img = region_part.to_image(
|
376
|
-
resolution=resolution,
|
377
|
-
crop_only=True,
|
378
|
-
include_highlights=False,
|
379
|
-
**kwargs
|
429
|
+
resolution=resolution, crop_only=True, include_highlights=False, **kwargs
|
380
430
|
)
|
381
431
|
if img:
|
382
432
|
cropped_images.append(img)
|
383
433
|
except Exception as e:
|
384
|
-
logger.error(
|
385
|
-
|
434
|
+
logger.error(
|
435
|
+
f"Error generating image for constituent region {region_part.bbox}: {e}",
|
436
|
+
exc_info=True,
|
437
|
+
)
|
438
|
+
|
386
439
|
return cropped_images
|
387
440
|
|
388
|
-
def to_image(self, background_color=(255,255,255), **kwargs) -> Optional["PIL_Image"]:
|
441
|
+
def to_image(self, background_color=(255, 255, 255), **kwargs) -> Optional["PIL_Image"]:
|
389
442
|
"""
|
390
443
|
Creates a single composite image by stacking the images of its constituent regions.
|
391
444
|
Stacking direction is based on the Flow's arrangement.
|
@@ -410,23 +463,29 @@ class FlowRegion:
|
|
410
463
|
# Stack vertically
|
411
464
|
composite_width = max(img.width for img in images)
|
412
465
|
composite_height = sum(img.height for img in images)
|
413
|
-
if composite_width == 0 or composite_height == 0:
|
466
|
+
if composite_width == 0 or composite_height == 0:
|
467
|
+
return None # Avoid zero-size image
|
414
468
|
|
415
|
-
new_image = PIL_Image_Runtime.new(
|
469
|
+
new_image = PIL_Image_Runtime.new(
|
470
|
+
"RGB", (composite_width, composite_height), background_color
|
471
|
+
)
|
416
472
|
current_y = 0
|
417
473
|
for img in images:
|
418
474
|
# Default to left alignment for vertical stacking
|
419
475
|
new_image.paste(img, (0, current_y))
|
420
476
|
current_y += img.height
|
421
477
|
return new_image
|
422
|
-
|
478
|
+
|
423
479
|
elif self.flow.arrangement == "horizontal":
|
424
480
|
# Stack horizontally
|
425
481
|
composite_width = sum(img.width for img in images)
|
426
482
|
composite_height = max(img.height for img in images)
|
427
|
-
if composite_width == 0 or composite_height == 0:
|
483
|
+
if composite_width == 0 or composite_height == 0:
|
484
|
+
return None
|
428
485
|
|
429
|
-
new_image = PIL_Image_Runtime.new(
|
486
|
+
new_image = PIL_Image_Runtime.new(
|
487
|
+
"RGB", (composite_width, composite_height), background_color
|
488
|
+
)
|
430
489
|
current_x = 0
|
431
490
|
for img in images:
|
432
491
|
# Default to top alignment for horizontal stacking
|
@@ -435,7 +494,9 @@ class FlowRegion:
|
|
435
494
|
return new_image
|
436
495
|
else:
|
437
496
|
# Should not happen if flow.arrangement is validated
|
438
|
-
logger.warning(
|
497
|
+
logger.warning(
|
498
|
+
f"Unknown flow arrangement: {self.flow.arrangement}. Cannot stack images."
|
499
|
+
)
|
439
500
|
return None
|
440
501
|
|
441
502
|
def __repr__(self) -> str:
|
@@ -453,6 +514,8 @@ class FlowRegion:
|
|
453
514
|
# For now, if it has regions, it's not considered empty by this simple check.
|
454
515
|
# User Point 4: FlowRegion can be empty (no text, no elements). This implies checking content.
|
455
516
|
try:
|
456
|
-
return not bool(self.extract_text(apply_exclusions=False).strip()) and not bool(
|
517
|
+
return not bool(self.extract_text(apply_exclusions=False).strip()) and not bool(
|
518
|
+
self.elements(apply_exclusions=False)
|
519
|
+
)
|
457
520
|
except Exception:
|
458
|
-
return True
|
521
|
+
return True # If error during check, assume empty to be safe
|
natural_pdf/ocr/engine_doctr.py
CHANGED
@@ -60,10 +60,22 @@ class DoctrOCREngine(OCREngine):
|
|
60
60
|
# Filter out None values
|
61
61
|
predictor_args = {k: v for k, v in predictor_args.items() if v is not None}
|
62
62
|
|
63
|
-
|
63
|
+
# Filter only allowed doctr ocr_predictor args
|
64
|
+
allowed_ocr_args = {
|
65
|
+
"det_arch",
|
66
|
+
"reco_arch",
|
67
|
+
"pretrained",
|
68
|
+
"assume_straight_pages",
|
69
|
+
"export_as_straight_boxes",
|
70
|
+
}
|
71
|
+
filtered_ocr_args = {k: v for k, v in predictor_args.items() if k in allowed_ocr_args}
|
72
|
+
dropped_ocr = set(predictor_args) - allowed_ocr_args
|
73
|
+
if dropped_ocr:
|
74
|
+
self.logger.warning(f"Dropped unsupported doctr ocr_predictor args: {dropped_ocr}")
|
75
|
+
|
76
|
+
self.logger.debug(f"doctr ocr_predictor constructor args: {filtered_ocr_args}")
|
64
77
|
try:
|
65
|
-
|
66
|
-
self._model = doctr.models.ocr_predictor(**predictor_args)
|
78
|
+
self._model = doctr.models.ocr_predictor(**filtered_ocr_args)
|
67
79
|
|
68
80
|
# Apply CUDA if available
|
69
81
|
if use_cuda:
|
@@ -81,7 +93,28 @@ class DoctrOCREngine(OCREngine):
|
|
81
93
|
"preserve_aspect_ratio": doctr_opts.preserve_aspect_ratio,
|
82
94
|
"batch_size": doctr_opts.batch_size,
|
83
95
|
}
|
84
|
-
|
96
|
+
# Filter out None values
|
97
|
+
detection_args = {k: v for k, v in detection_args.items() if v is not None}
|
98
|
+
allowed_det_args = {
|
99
|
+
"arch",
|
100
|
+
"pretrained",
|
101
|
+
"assume_straight_pages",
|
102
|
+
"symmetric_pad",
|
103
|
+
"preserve_aspect_ratio",
|
104
|
+
"batch_size",
|
105
|
+
}
|
106
|
+
filtered_det_args = {
|
107
|
+
k: v for k, v in detection_args.items() if k in allowed_det_args
|
108
|
+
}
|
109
|
+
dropped_det = set(detection_args) - allowed_det_args
|
110
|
+
if dropped_det:
|
111
|
+
self.logger.warning(
|
112
|
+
f"Dropped unsupported doctr detection_predictor args: {dropped_det}"
|
113
|
+
)
|
114
|
+
self.logger.debug(
|
115
|
+
f"doctr detection_predictor constructor args: {filtered_det_args}"
|
116
|
+
)
|
117
|
+
self._detection_model = doctr.models.detection_predictor(**filtered_det_args)
|
85
118
|
|
86
119
|
# Apply CUDA if available
|
87
120
|
if use_cuda:
|
@@ -59,14 +59,34 @@ class EasyOCREngine(OCREngine):
|
|
59
59
|
"cudnn_benchmark": easy_options.cudnn_benchmark,
|
60
60
|
}
|
61
61
|
|
62
|
-
# Filter out None values
|
62
|
+
# Filter out None values
|
63
63
|
constructor_args = {k: v for k, v in constructor_args.items() if v is not None}
|
64
64
|
|
65
|
-
|
65
|
+
# Filter only allowed EasyOCR args
|
66
|
+
allowed_args = {
|
67
|
+
"lang_list",
|
68
|
+
"gpu",
|
69
|
+
"model_storage_directory",
|
70
|
+
"user_network_directory",
|
71
|
+
"recog_network",
|
72
|
+
"detect_network",
|
73
|
+
"download_enabled",
|
74
|
+
"detector",
|
75
|
+
"recognizer",
|
76
|
+
"verbose",
|
77
|
+
"quantize",
|
78
|
+
"cudnn_benchmark",
|
79
|
+
}
|
80
|
+
filtered_args = {k: v for k, v in constructor_args.items() if k in allowed_args}
|
81
|
+
dropped = set(constructor_args) - allowed_args
|
82
|
+
if dropped:
|
83
|
+
self.logger.warning(f"Dropped unsupported EasyOCR args: {dropped}")
|
84
|
+
|
85
|
+
self.logger.debug(f"EasyOCR Reader constructor args: {filtered_args}")
|
66
86
|
|
67
87
|
# Create the reader
|
68
88
|
try:
|
69
|
-
self._model = easyocr.Reader(**
|
89
|
+
self._model = easyocr.Reader(**filtered_args)
|
70
90
|
self.logger.info("EasyOCR reader created successfully")
|
71
91
|
except Exception as e:
|
72
92
|
self.logger.error(f"Failed to create EasyOCR reader: {e}")
|