natural-pdf 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +1373 -0
- natural_pdf/classification/manager.py +2 -3
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/highlighting_service.py +29 -38
- natural_pdf/core/page.py +284 -187
- natural_pdf/core/pdf.py +4 -4
- natural_pdf/elements/base.py +54 -20
- natural_pdf/elements/collections.py +160 -9
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +380 -38
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/selectors/parser.py +163 -8
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/RECORD +22 -17
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/WHEEL +1 -1
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,382 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union, cast
|
3
|
+
|
4
|
+
if TYPE_CHECKING:
|
5
|
+
from natural_pdf.elements.base import Element as PhysicalElement
|
6
|
+
from natural_pdf.elements.region import Region as PhysicalRegion
|
7
|
+
from natural_pdf.core.page import Page as PhysicalPage # For type checking physical_object.page
|
8
|
+
from .flow import Flow
|
9
|
+
from .region import FlowRegion
|
10
|
+
|
11
|
+
logger = logging.getLogger(__name__)
|
12
|
+
|
13
|
+
|
14
|
+
class FlowElement:
    """
    Represents a physical PDF Element or Region that is anchored within a Flow.

    This class provides methods for flow-aware directional navigation (e.g., below, above)
    that operate across the segments defined in its associated Flow.
    """

    def __init__(self, physical_object: Union["PhysicalElement", "PhysicalRegion"], flow: "Flow"):
        """
        Initializes a FlowElement.

        Args:
            physical_object: The actual natural_pdf.elements.base.Element or
                             natural_pdf.elements.region.Region object.
            flow: The Flow instance this element is part of.

        Raises:
            TypeError: If physical_object has no 'bbox' or no 'page' attribute.
        """
        if not (hasattr(physical_object, 'bbox') and hasattr(physical_object, 'page')):
            raise TypeError(
                f"physical_object must be a valid PDF element-like object with 'bbox' and 'page' attributes. Got {type(physical_object)}"
            )
        self.physical_object: Union["PhysicalElement", "PhysicalRegion"] = physical_object
        self.flow: "Flow" = flow

    # --- Properties to delegate to the physical_object ---
    @property
    def bbox(self) -> Tuple[float, float, float, float]:
        """Bounding box of the underlying physical object."""
        return self.physical_object.bbox

    @property
    def x0(self) -> float:
        """The ``x0`` attribute of the underlying physical object."""
        return self.physical_object.x0

    @property
    def top(self) -> float:
        """The ``top`` attribute of the underlying physical object."""
        return self.physical_object.top

    @property
    def x1(self) -> float:
        """The ``x1`` attribute of the underlying physical object."""
        return self.physical_object.x1

    @property
    def bottom(self) -> float:
        """The ``bottom`` attribute of the underlying physical object."""
        return self.physical_object.bottom

    @property
    def width(self) -> float:
        """The ``width`` attribute of the underlying physical object."""
        return self.physical_object.width

    @property
    def height(self) -> float:
        """The ``height`` attribute of the underlying physical object."""
        return self.physical_object.height

    @property
    def text(self) -> Optional[str]:
        """The ``text`` attribute of the underlying object, or None if it has none."""
        return getattr(self.physical_object, 'text', None)

    @property
    def page(self) -> Optional["PhysicalPage"]:
        """Returns the physical page of the underlying element."""
        return getattr(self.physical_object, 'page', None)

    # --- Private helpers for _flow_direction ---

    def _find_start_segment_index(self) -> int:
        """Return the index of the flow segment that holds the physical object, or -1.

        A segment on the same page whose ``is_point_inside`` accepts the object's
        centre wins immediately. Otherwise the first same-page segment whose bbox
        is not disjoint from the object's bbox is used as a fallback.
        """
        physical = self.physical_object
        center_x = (physical.x0 + physical.x1) / 2
        center_y = (physical.top + physical.bottom) / 2
        obj_bbox = physical.bbox

        fallback_index = -1
        for index, segment in enumerate(self.flow.segments):
            if physical.page != segment.page:
                continue
            if segment.is_point_inside(center_x, center_y):
                return index
            seg_bbox = segment.bbox
            is_disjoint = (
                obj_bbox[2] < seg_bbox[0]
                or obj_bbox[0] > seg_bbox[2]
                or obj_bbox[3] < seg_bbox[1]
                or obj_bbox[1] > seg_bbox[3]
            )
            # Remember only the first overlapping segment; keep scanning in case a
            # later segment actually contains the centre point.
            if not is_disjoint and fallback_index == -1:
                fallback_index = index
        return fallback_index

    @staticmethod
    def _nearest_boundary(matches: Any, direction: str) -> Optional["PhysicalElement"]:
        """Sort ``matches`` along the travel direction and return its ``first`` item."""
        if direction == "below":
            return matches.sort(key=lambda m: m.top).first
        if direction == "above":
            return matches.sort(key=lambda m: m.bottom, reverse=True).first
        if direction == "right":
            return matches.sort(key=lambda m: m.x0).first
        if direction == "left":
            return matches.sort(key=lambda m: m.x1, reverse=True).first
        return None

    @staticmethod
    def _clip_at_boundary(region: Any, boundary: Any, direction: str, include_endpoint: bool) -> Any:
        """Clip ``region`` so that it stops at ``boundary`` in the travel direction.

        With ``include_endpoint`` the far edge of the boundary is used; without it
        the clip stops one unit short of the boundary's near edge.
        """
        if direction == "below":
            edge = boundary.bottom if include_endpoint else (boundary.top - 1)
            return region.clip(bottom=edge, top=None)
        if direction == "above":
            edge = boundary.top if include_endpoint else (boundary.bottom + 1)
            return region.clip(bottom=None, top=edge)
        if direction == "right":
            edge = boundary.x1 if include_endpoint else (boundary.x0 - 1)
            return region.clip(right=edge, left=None)
        # direction == "left"
        edge = boundary.x0 if include_endpoint else (boundary.x1 + 1)
        return region.clip(right=None, left=edge)

    def _stop_at_until(
        self,
        region: Any,
        until: str,
        direction: str,
        include_endpoint: bool,
        find_kwargs: dict,
    ) -> Tuple[Any, Optional["PhysicalElement"]]:
        """Search ``region`` for ``until`` and clip the region at the nearest match.

        Returns:
            ``(region, hit)`` where ``hit`` is the boundary element, or None when
            nothing matched (in which case ``region`` is returned unchanged).
        """
        matches = region.find_all(until, **find_kwargs)
        if not matches:
            return region, None
        hit = self._nearest_boundary(matches, direction)
        if not hit:
            return region, None
        return self._clip_at_boundary(region, hit, direction, include_endpoint), hit

    @staticmethod
    def _trim_to_remaining(region: Any, remaining: float, is_vertical: bool, is_forward: bool) -> Tuple[Any, float]:
        """Shrink ``region`` along the flow axis if it is longer than ``remaining``.

        Returns:
            ``(region, consumed)`` where ``consumed`` is the extent along the flow
            axis that this region uses up.
        """
        extent = region.height if is_vertical else region.width
        if not extent > remaining:
            return region, extent

        if is_vertical:
            if is_forward:
                region = region.clip(bottom=region.top + remaining, top=None)
            else:
                region = region.clip(bottom=None, top=region.bottom - remaining)
        else:
            if is_forward:
                region = region.clip(right=region.x0 + remaining, left=None)
            else:
                region = region.clip(right=None, left=region.x1 - remaining)
        return region, remaining

    def _flow_direction(
        self,
        direction: str,  # "above", "below", "left", "right"
        size: Optional[float] = None,
        cross_size_ratio: Optional[float] = None,  # Default to None for full flow width
        cross_size_absolute: Optional[float] = None,
        cross_alignment: str = "center",  # "start", "center", "end"
        until: Optional[str] = None,
        include_endpoint: bool = True,
        **kwargs,
    ) -> "FlowRegion":
        """
        Collects, segment by segment, the regions lying in ``direction`` from this element.

        Args:
            direction: One of "above", "below", "left", "right".
            size: Maximum extent along the flow axis; None means unlimited.
            cross_size_ratio: Cross-axis size as a multiple of the physical object's
                width (vertical flows) or height (horizontal flows).
            cross_size_absolute: Cross-axis size; takes precedence over the ratio.
                When neither is given, "full" is used.
            cross_alignment: Forwarded to the physical region's ``_direction`` call.
            until: Selector for a boundary element at which collection stops.
            include_endpoint: Whether the boundary element is inside the result.
            **kwargs: Forwarded in full to ``find_all`` when searching for ``until``;
                only 'strict_type' and 'first_match_only' are forwarded to ``_direction``.

        Returns:
            A FlowRegion built from the collected regions and the boundary element
            (if one was hit). It has no constituent regions when the physical object
            is not located in any flow segment.

        Raises:
            NotImplementedError: If ``direction`` does not fit the flow's arrangement.
            ValueError: If ``direction`` is not one of the four supported values.
        """
        # Imported at call time rather than module level (the module only imports
        # these names under TYPE_CHECKING).
        from .region import FlowRegion
        from natural_pdf.elements.region import Region as PhysicalRegion_Class

        collected_constituent_regions: List[PhysicalRegion_Class] = []
        boundary_element_hit: Optional["PhysicalElement"] = None
        # Ensure remaining_size is float, even if size is int.
        remaining_size = float(size) if size is not None else float('inf')

        # 1. Identify the starting segment.
        start_segment_index = self._find_start_segment_index()
        if start_segment_index == -1:
            page_num_str = str(self.physical_object.page.page_number) if self.physical_object.page else 'N/A'
            logger.warning(
                f"FlowElement's physical object {self.physical_object.bbox} on page {page_num_str} "
                f"not found within any flow segment. Cannot perform directional operation '{direction}'."
            )
            return FlowRegion(
                flow=self.flow,
                constituent_regions=[],
                source_flow_element=self,
                boundary_element_found=None,
            )

        # 2. Validate the direction against the arrangement and pick the traversal order.
        is_primary_vertical = self.flow.arrangement == "vertical"
        if direction in ("below", "above"):
            if not is_primary_vertical:
                raise NotImplementedError(f"'{direction}' is for vertical flows.")
        elif direction in ("right", "left"):
            if is_primary_vertical:
                raise NotImplementedError(f"'{direction}' is for horizontal flows.")
        else:
            raise ValueError(f"Internal error: Invalid direction '{direction}' for _flow_direction.")

        is_forward = direction in ("below", "right")
        segment_count = len(self.flow.segments)
        if is_forward:
            segment_iterator = range(start_segment_index, segment_count)
        else:
            segment_iterator = range(start_segment_index, -1, -1)

        # 3. Cross-size: absolute wins over ratio; default to "full" when neither is given.
        cross_size_for_op: Union[str, float]
        if cross_size_absolute is not None:
            cross_size_for_op = cross_size_absolute
        elif cross_size_ratio is not None:
            base_cross_dim = self.physical_object.width if is_primary_vertical else self.physical_object.height
            cross_size_for_op = base_cross_dim * cross_size_ratio
        else:
            cross_size_for_op = "full"

        # Only these extra keyword arguments are passed on to Region._direction.
        direction_passthrough = {
            k: v for k, v in kwargs.items() if k in ['strict_type', 'first_match_only']
        }

        # 4. Walk the segments, collecting each one's contribution.
        for current_segment_idx in segment_iterator:
            if size is not None and remaining_size <= 0:
                break
            if boundary_element_hit:
                break

            current_segment: PhysicalRegion_Class = self.flow.segments[current_segment_idx]
            segment_contribution: Optional[PhysicalRegion_Class]

            if current_segment_idx == start_segment_index:
                source_for_op_call = self.physical_object
                if not isinstance(source_for_op_call, PhysicalRegion_Class):
                    if hasattr(source_for_op_call, 'to_region'):
                        source_for_op_call = source_for_op_call.to_region()
                    else:
                        logger.error(f"FlowElement: Cannot convert op_source {type(self.physical_object)} to region.")
                        continue

                # Perform the directional operation *without* 'until' to get the basic
                # shape, then clip it to the current flow segment's boundaries.
                initial_region_from_op = source_for_op_call._direction(
                    direction=direction,
                    size=remaining_size if size is not None else None,
                    cross_size=cross_size_for_op,
                    cross_alignment=cross_alignment,
                    include_source=False,
                    **direction_passthrough,
                )
                segment_contribution = current_segment.clip(initial_region_from_op)

                # If 'until' is specified, search for it *within* the clipped area.
                if until and segment_contribution and segment_contribution.width > 0 and segment_contribution.height > 0:
                    segment_contribution, boundary_element_hit = self._stop_at_until(
                        segment_contribution, until, direction, include_endpoint, kwargs
                    )
            else:
                # Later segments contribute their whole area unless 'until' cuts them
                # short. No boundary can have been hit yet: the guard at the top of
                # the loop would have stopped the walk.
                segment_contribution = current_segment
                if until:
                    segment_contribution, boundary_element_hit = self._stop_at_until(
                        segment_contribution, until, direction, include_endpoint, kwargs
                    )

            has_area = bool(
                segment_contribution and segment_contribution.width > 0 and segment_contribution.height > 0
            )

            if has_area and size is not None:
                segment_contribution, consumed = self._trim_to_remaining(
                    segment_contribution, remaining_size, is_primary_vertical, is_forward
                )
                remaining_size -= consumed
                has_area = bool(
                    segment_contribution and segment_contribution.width > 0 and segment_contribution.height > 0
                )

            if has_area:
                collected_constituent_regions.append(segment_contribution)

            # Stop once a boundary was hit outside the start segment, or in the start
            # segment when nothing was left to contribute (e.g. 'until' was immediate).
            if boundary_element_hit and (current_segment_idx != start_segment_index or not has_area):
                break

            is_logically_last_segment = (is_forward and current_segment_idx == segment_count - 1) or \
                                        (not is_forward and current_segment_idx == 0)
            if not is_logically_last_segment and self.flow.segment_gap > 0 and size is not None:
                # Crossing into the next segment consumes the configured gap.
                if remaining_size > 0:
                    remaining_size -= self.flow.segment_gap

        return FlowRegion(
            flow=self.flow,
            constituent_regions=collected_constituent_regions,
            source_flow_element=self,
            boundary_element_found=boundary_element_hit,
        )

    # --- Public Directional Methods ---
    # Thin wrappers that map axis-specific parameter names onto _flow_direction.

    def above(
        self,
        height: Optional[float] = None,
        width_ratio: Optional[float] = None,
        width_absolute: Optional[float] = None,
        width_alignment: str = "center",
        until: Optional[str] = None,
        include_endpoint: bool = True,
        **kwargs,
    ) -> "FlowRegion":
        """Return the FlowRegion above this element; only valid in a vertical flow.

        Raises:
            NotImplementedError: If the flow's arrangement is not "vertical".
        """
        if self.flow.arrangement != "vertical":
            raise NotImplementedError("'above' in a horizontal flow is ambiguous with current 1D flow logic and not yet implemented.")
        return self._flow_direction(
            direction="above", size=height, cross_size_ratio=width_ratio,
            cross_size_absolute=width_absolute, cross_alignment=width_alignment,
            until=until, include_endpoint=include_endpoint, **kwargs,
        )

    def below(
        self,
        height: Optional[float] = None,
        width_ratio: Optional[float] = None,
        width_absolute: Optional[float] = None,
        width_alignment: str = "center",
        until: Optional[str] = None,
        include_endpoint: bool = True,
        **kwargs,
    ) -> "FlowRegion":
        """Return the FlowRegion below this element; only valid in a vertical flow.

        Raises:
            NotImplementedError: If the flow's arrangement is not "vertical".
        """
        if self.flow.arrangement != "vertical":
            raise NotImplementedError("'below' in a horizontal flow is ambiguous with current 1D flow logic and not yet implemented.")
        return self._flow_direction(
            direction="below", size=height, cross_size_ratio=width_ratio,
            cross_size_absolute=width_absolute, cross_alignment=width_alignment,
            until=until, include_endpoint=include_endpoint, **kwargs,
        )

    def left(
        self,
        width: Optional[float] = None,
        height_ratio: Optional[float] = None,
        height_absolute: Optional[float] = None,
        height_alignment: str = "center",
        until: Optional[str] = None,
        include_endpoint: bool = True,
        **kwargs,
    ) -> "FlowRegion":
        """Return the FlowRegion left of this element; only valid in a horizontal flow.

        Raises:
            NotImplementedError: If the flow's arrangement is not "horizontal".
        """
        if self.flow.arrangement != "horizontal":
            raise NotImplementedError("'left' in a vertical flow is ambiguous with current 1D flow logic and not yet implemented.")
        return self._flow_direction(
            direction="left", size=width, cross_size_ratio=height_ratio,
            cross_size_absolute=height_absolute, cross_alignment=height_alignment,
            until=until, include_endpoint=include_endpoint, **kwargs,
        )

    def right(
        self,
        width: Optional[float] = None,
        height_ratio: Optional[float] = None,
        height_absolute: Optional[float] = None,
        height_alignment: str = "center",
        until: Optional[str] = None,
        include_endpoint: bool = True,
        **kwargs,
    ) -> "FlowRegion":
        """Return the FlowRegion right of this element; only valid in a horizontal flow.

        Raises:
            NotImplementedError: If the flow's arrangement is not "horizontal".
        """
        if self.flow.arrangement != "horizontal":
            raise NotImplementedError("'right' in a vertical flow is ambiguous with current 1D flow logic and not yet implemented.")
        return self._flow_direction(
            direction="right", size=width, cross_size_ratio=height_ratio,
            cross_size_absolute=height_absolute, cross_alignment=height_alignment,
            until=until, include_endpoint=include_endpoint, **kwargs,
        )

    def __repr__(self) -> str:
        return f"<FlowElement for {self.physical_object.__class__.__name__} {self.bbox} in {self.flow}>"
|
@@ -0,0 +1,216 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union
|
3
|
+
|
4
|
+
if TYPE_CHECKING:
|
5
|
+
from natural_pdf.core.page import Page
|
6
|
+
from natural_pdf.elements.region import Region as PhysicalRegion
|
7
|
+
from natural_pdf.elements.base import Element as PhysicalElement
|
8
|
+
from natural_pdf.elements.collections import ElementCollection as PhysicalElementCollection
|
9
|
+
from .element import FlowElement
|
10
|
+
from .collections import FlowElementCollection
|
11
|
+
|
12
|
+
logger = logging.getLogger(__name__)
|
13
|
+
|
14
|
+
|
15
|
+
class Flow:
    """
    Defines a logical flow or sequence of physical Page or Region objects,
    specifying their arrangement and alignment to enable operations that
    span across these segments as if they were a continuous area.
    """

    def __init__(
        self,
        segments: List[Union["Page", "PhysicalRegion"]],
        arrangement: Literal["vertical", "horizontal"],
        alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = "start",
        segment_gap: float = 0.0,
    ):
        """
        Initializes a Flow object.

        Args:
            segments: An ordered list of natural_pdf.core.page.Page or
                      natural_pdf.elements.region.Region objects that constitute the flow.
            arrangement: The primary direction of the flow.
                         - "vertical": Segments are stacked top-to-bottom.
                         - "horizontal": Segments are arranged left-to-right.
            alignment: How segments are aligned on their cross-axis if they have
                       differing dimensions. For a "vertical" arrangement:
                       - "left" (or "start"): Align left edges.
                       - "center": Align centers.
                       - "right" (or "end"): Align right edges.
                       For a "horizontal" arrangement:
                       - "top" (or "start"): Align top edges.
                       - "center": Align centers.
                       - "bottom" (or "end"): Align bottom edges.
            segment_gap: The virtual gap (in PDF points) between segments.

        Raises:
            ValueError: If segments is empty, arrangement is not "vertical" or
                "horizontal", or alignment is not valid for the arrangement.
            TypeError: If a segment is neither a Page nor a Region.
        """
        if not segments:
            raise ValueError("Flow segments cannot be empty.")
        if arrangement not in ["vertical", "horizontal"]:
            raise ValueError("Arrangement must be 'vertical' or 'horizontal'.")

        self.segments: List["PhysicalRegion"] = self._normalize_segments(segments)
        self.arrangement: Literal["vertical", "horizontal"] = arrangement
        self.alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = alignment
        self.segment_gap: float = segment_gap

        self._validate_alignment()

        # TODO: Pre-calculate segment offsets for faster lookups if needed

    def _normalize_segments(self, segments: List[Union["Page", "PhysicalRegion"]]) -> List["PhysicalRegion"]:
        """Converts all Page segments to full-page Region objects for uniform processing.

        Raises:
            TypeError: If a segment is neither a Page nor a Region instance. The
                message is more specific when the segment advertises an
                ``object_type`` of "page" or "region" without being the real class.
        """
        # Imported at call time rather than module level (the module only imports
        # these names under TYPE_CHECKING).
        from natural_pdf.core.page import Page as CorePage
        from natural_pdf.elements.region import Region as ElementsRegion

        normalized: List["PhysicalRegion"] = []
        for i, segment in enumerate(segments):
            if isinstance(segment, CorePage):
                normalized.append(segment.region(0, 0, segment.width, segment.height))
                continue
            if isinstance(segment, ElementsRegion):
                normalized.append(segment)
                continue

            # Neither isinstance check passed, so the segment is always rejected;
            # only the wording of the error depends on what it claims to be.
            claimed_type = getattr(segment, 'object_type', None)
            if claimed_type == "page":
                raise TypeError(f"Segment {i} has object_type 'page' but is not an instance of natural_pdf.core.page.Page. Got {type(segment)}")
            if claimed_type == "region":
                raise TypeError(f"Segment {i} has object_type 'region' but is not an instance of natural_pdf.elements.region.Region. Got {type(segment)}")
            raise TypeError(
                f"Segment {i} is not a valid Page or Region object. Got {type(segment)}."
            )
        return normalized

    def _validate_alignment(self) -> None:
        """Validates the alignment based on the arrangement.

        Raises:
            ValueError: If ``self.alignment`` is not allowed for ``self.arrangement``.
        """
        valid_alignments = {
            "vertical": ["start", "center", "end", "left", "right"],
            "horizontal": ["start", "center", "end", "top", "bottom"],
        }
        allowed = valid_alignments[self.arrangement]
        if self.alignment not in allowed:
            raise ValueError(
                f"Invalid alignment '{self.alignment}' for '{self.arrangement}' arrangement. "
                f"Valid options are: {allowed}"
            )

    def find(
        self,
        selector: Optional[str] = None,
        *,
        text: Optional[str] = None,
        apply_exclusions: bool = True,
        regex: bool = False,
        case: bool = True,
        **kwargs,
    ) -> Optional["FlowElement"]:
        """
        Finds the first element within the flow that matches the given selector or text criteria.

        Elements found are wrapped as FlowElement objects, anchored to this Flow.

        Args:
            selector: CSS-like selector string.
            text: Text content to search for.
            apply_exclusions: Whether to respect exclusion zones on the original pages/regions.
            regex: Whether the text search uses regex.
            case: Whether the text search is case-sensitive.
            **kwargs: Additional filter parameters for the underlying find operation.

        Returns:
            A FlowElement if a match is found, otherwise None.
        """
        results = self.find_all(
            selector=selector,
            text=text,
            apply_exclusions=apply_exclusions,
            regex=regex,
            case=case,
            **kwargs,
        )
        return results.first if results else None

    def find_all(
        self,
        selector: Optional[str] = None,
        *,
        text: Optional[str] = None,
        apply_exclusions: bool = True,
        regex: bool = False,
        case: bool = True,
        **kwargs,
    ) -> "FlowElementCollection":
        """
        Finds all elements within the flow that match the given selector or text criteria.
        Elements are collected segment by segment, preserving the flow order.

        Elements found are wrapped as FlowElement objects, anchored to this Flow,
        and returned in a FlowElementCollection.

        Args:
            selector: CSS-like selector string.
            text: Text content to search for.
            apply_exclusions: Whether to respect exclusion zones on the original pages/regions.
            regex: Whether the text search uses regex.
            case: Whether the text search is case-sensitive.
            **kwargs: Additional filter parameters passed to each segment's find_all.

        Returns:
            A FlowElementCollection; it wraps an empty list when nothing matched.
        """
        # Imported at call time rather than module level (the module only imports
        # these names under TYPE_CHECKING).
        from .collections import FlowElementCollection
        from .element import FlowElement

        all_flow_elements: List["FlowElement"] = []

        # Result order is segment order first, then whatever order each segment's
        # own find_all returned; no global sort is applied afterwards.
        for physical_segment in self.segments:
            matches_in_segment: "PhysicalElementCollection" = physical_segment.find_all(
                selector=selector,
                text=text,
                apply_exclusions=apply_exclusions,
                regex=regex,
                case=case,
                **kwargs,
            )
            if matches_in_segment:
                all_flow_elements.extend(
                    FlowElement(physical_object=phys_elem, flow=self)
                    for phys_elem in matches_in_segment.elements
                )

        return FlowElementCollection(all_flow_elements)

    def __repr__(self) -> str:
        return (
            f"<Flow segments={len(self.segments)}, "
            f"arrangement='{self.arrangement}', alignment='{self.alignment}', gap={self.segment_gap}>"
        )

    # --- Helper methods for coordinate transformations and segment iteration ---
    # Placeholders: both currently raise NotImplementedError for valid input.

    def get_segment_bounding_box_in_flow(self, segment_index: int) -> Optional[tuple[float, float, float, float]]:
        """
        Calculates the conceptual bounding box of a segment within the flow's coordinate system.
        This considers arrangement, alignment, and segment gaps.
        (This is a placeholder for more complex logic if a true virtual coordinate system is needed)

        Returns:
            None when ``segment_index`` is out of range.

        Raises:
            NotImplementedError: For every in-range index; the calculation is not written yet.
        """
        if not 0 <= segment_index < len(self.segments):
            return None

        # A full implementation would calculate per-segment offsets. For now,
        # FlowElement's directional logic handles segment traversal itself and
        # works in physical coordinates.
        raise NotImplementedError("Calculating a segment's bbox *within the flow's virtual coordinate system* is not yet fully implemented.")

    def get_element_flow_coordinates(self, physical_element: "PhysicalElement") -> Optional[tuple[float, float, float, float]]:
        """
        Translates a physical element's coordinates into the flow's virtual coordinate system.
        (Placeholder - very complex if segment_gap > 0 or complex alignments)

        Raises:
            NotImplementedError: Always; elements currently keep their physical coordinates.
        """
        raise NotImplementedError("Translating element coordinates to a unified flow coordinate system is not yet implemented.")
|