natural-pdf 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
- natural_pdf/classification/manager.py +2 -3
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/highlighting_service.py +29 -38
- natural_pdf/core/page.py +283 -186
- natural_pdf/core/pdf.py +4 -4
- natural_pdf/elements/base.py +34 -0
- natural_pdf/elements/collections.py +160 -9
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +353 -12
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/selectors/parser.py +163 -8
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +22 -17
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,458 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
3
|
+
|
4
|
+
from pdfplumber.utils.geometry import objects_to_bbox # For calculating combined bbox
|
5
|
+
|
6
|
+
# For runtime image manipulation
|
7
|
+
from PIL import Image as PIL_Image_Runtime
|
8
|
+
|
9
|
+
if TYPE_CHECKING:
|
10
|
+
from PIL.Image import Image as PIL_Image # For type hints
|
11
|
+
from natural_pdf.elements.base import Element as PhysicalElement
|
12
|
+
from natural_pdf.elements.region import Region as PhysicalRegion
|
13
|
+
from natural_pdf.elements.collections import ElementCollection
|
14
|
+
from natural_pdf.core.page import Page as PhysicalPage
|
15
|
+
from .flow import Flow
|
16
|
+
from .element import FlowElement
|
17
|
+
|
18
|
+
logger = logging.getLogger(__name__)
|
19
|
+
|
20
|
+
|
21
|
+
class FlowRegion:
    """
    Represents a selected area within a Flow, potentially composed of multiple
    physical Region objects (constituent_regions) that might span across
    different original pages or disjoint physical regions defined in the Flow.

    A FlowRegion is the result of a directional operation (e.g., .below(), .above())
    on a FlowElement.
    """

    def __init__(
        self,
        flow: "Flow",
        constituent_regions: List["PhysicalRegion"],
        source_flow_element: "FlowElement",
        boundary_element_found: Optional["PhysicalElement"] = None,
    ):
        """
        Initializes a FlowRegion.

        Args:
            flow: The Flow instance this region belongs to.
            constituent_regions: A list of physical natural_pdf.elements.region.Region
                objects that make up this FlowRegion.
            source_flow_element: The FlowElement that created this FlowRegion.
            boundary_element_found: The physical element that stopped an 'until' search,
                if applicable.
        """
        self.flow: "Flow" = flow
        self.constituent_regions: List["PhysicalRegion"] = constituent_regions
        self.source_flow_element: "FlowElement" = source_flow_element
        self.boundary_element_found: Optional["PhysicalElement"] = boundary_element_found

        # Caches for expensive operations. The text/element caches are only
        # populated for the default exclusion behavior (apply_exclusions=True).
        self._cached_text: Optional[str] = None
        self._cached_elements: Optional["ElementCollection"] = None  # Stringized
        self._cached_bbox: Optional[Tuple[float, float, float, float]] = None

    @property
    def bbox(self) -> Optional[Tuple[float, float, float, float]]:
        """
        Calculates a conceptual bounding box that encompasses all constituent physical regions.
        This is the union of the bounding boxes of the constituent regions in their
        original physical coordinates.
        Returns None if there are no constituent regions.
        """
        if self._cached_bbox is not None:
            return self._cached_bbox
        if not self.constituent_regions:
            return None

        # objects_to_bbox (pdfplumber.utils.geometry) merges bboxes; it expects
        # objects with .x0, .top, .x1, .bottom attributes, which PhysicalRegion
        # objects provide.
        self._cached_bbox = objects_to_bbox(self.constituent_regions)
        return self._cached_bbox

    @property
    def x0(self) -> Optional[float]:
        """Left edge of the merged bounding box, or None if empty."""
        return self.bbox[0] if self.bbox else None

    @property
    def top(self) -> Optional[float]:
        """Top edge of the merged bounding box, or None if empty."""
        return self.bbox[1] if self.bbox else None

    @property
    def x1(self) -> Optional[float]:
        """Right edge of the merged bounding box, or None if empty."""
        return self.bbox[2] if self.bbox else None

    @property
    def bottom(self) -> Optional[float]:
        """Bottom edge of the merged bounding box, or None if empty."""
        return self.bbox[3] if self.bbox else None

    @property
    def width(self) -> Optional[float]:
        """Width of the merged bounding box, or None if empty."""
        bbox = self.bbox
        return (bbox[2] - bbox[0]) if bbox else None

    @property
    def height(self) -> Optional[float]:
        """Height of the merged bounding box, or None if empty."""
        bbox = self.bbox
        return (bbox[3] - bbox[1]) if bbox else None

    def extract_text(self, apply_exclusions: bool = True, **kwargs) -> str:
        """
        Extracts and concatenates text from all constituent physical regions.
        The order of concatenation respects the flow's arrangement.

        Args:
            apply_exclusions: Whether to respect PDF exclusion zones within each
                constituent physical region during text extraction.
            **kwargs: Additional arguments passed to the underlying extract_text method
                of each constituent region.

        Returns:
            The combined text content as a string.
        """
        # Simple cache check: only the default exclusion behavior is cached.
        # NOTE(review): kwargs variations are intentionally not part of the
        # cache key, so a cached result may ignore differing kwargs.
        if self._cached_text is not None and apply_exclusions:
            return self._cached_text

        if not self.constituent_regions:
            return ""

        texts: List[str] = []
        # Simple concatenation; FlowElement._flow_direction is responsible for
        # ordering constituent_regions correctly.
        for region in self.constituent_regions:
            texts.append(region.extract_text(apply_exclusions=apply_exclusions, **kwargs))

        # Join based on flow arrangement (newline for vertical, space for horizontal).
        # This is a simplification; true layout-aware joining would be more complex.
        joiner = "\n" if self.flow.arrangement == "vertical" else " "  # TODO: Make this smarter, consider segment_gap
        extracted = joiner.join(t for t in texts if t)

        if apply_exclusions:  # Only cache if standard exclusion behavior
            self._cached_text = extracted
        return extracted

    def elements(self, apply_exclusions: bool = True) -> "ElementCollection":
        """
        Collects all unique physical elements from all constituent physical regions.

        Args:
            apply_exclusions: Whether to respect PDF exclusion zones within each
                constituent physical region when gathering elements.

        Returns:
            An ElementCollection containing all unique elements, sorted in a
            basic reading order (page index, then top, then x0) when possible.
        """
        from natural_pdf.elements.collections import ElementCollection as RuntimeElementCollection  # Local import

        if self._cached_elements is not None and apply_exclusions:  # Simple cache check
            return self._cached_elements

        if not self.constituent_regions:
            return RuntimeElementCollection([])

        all_physical_elements: List["PhysicalElement"] = []
        # Deduplicate by object identity (id). Using a set of the elements
        # themselves would deduplicate by __eq__/__hash__, which can silently
        # drop distinct-but-equal elements shared across region definitions.
        seen_ids = set()

        for region in self.constituent_regions:
            # Region.get_elements() returns a list, not ElementCollection
            elements_in_region: List["PhysicalElement"] = region.get_elements(
                apply_exclusions=apply_exclusions
            )
            for elem in elements_in_region:
                if id(elem) not in seen_ids:
                    all_physical_elements.append(elem)
                    seen_ids.add(id(elem))

        # Basic reading order sort based on original page and coordinates.
        def get_sort_key(phys_elem: "PhysicalElement"):
            page_idx = -1
            if hasattr(phys_elem, 'page') and hasattr(phys_elem.page, 'index'):
                page_idx = phys_elem.page.index
            return (page_idx, phys_elem.top, phys_elem.x0)

        try:
            sorted_physical_elements = sorted(all_physical_elements, key=get_sort_key)
        except AttributeError:
            logger.warning("Could not sort elements in FlowRegion by reading order; some elements might be missing page, top or x0 attributes.")
            sorted_physical_elements = all_physical_elements

        result_collection = RuntimeElementCollection(sorted_physical_elements)
        if apply_exclusions:
            self._cached_elements = result_collection
        return result_collection

    def find(self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs) -> Optional["PhysicalElement"]:
        """
        Finds the first physical element within this FlowRegion that matches the selector or text.
        """
        # Uses self.elements() which respects exclusions if apply_exclusions=True by default
        all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
        return all_elems.find(selector=selector, text=text, **kwargs)  # ElementCollection.find

    def find_all(self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs) -> "ElementCollection":
        """
        Finds all physical elements within this FlowRegion that match the selector or text.
        """
        all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
        return all_elems.find_all(selector=selector, text=text, **kwargs)  # ElementCollection.find_all

    def highlight(self, label: Optional[str] = None, color: Optional[Union[Tuple, str]] = None, **kwargs) -> "FlowRegion":
        """
        Highlights all constituent physical regions on their respective pages.

        Args:
            label: A base label for the highlights. Each constituent region might get an indexed label.
            color: Color for the highlight.
            **kwargs: Additional arguments for the underlying highlight method.

        Returns:
            Self for method chaining.
        """
        if not self.constituent_regions:
            return self

        base_label = label if label else "FlowRegionPart"
        for i, region in enumerate(self.constituent_regions):
            # Only suffix with an index when there is more than one part.
            current_label = f"{base_label}_{i+1}" if len(self.constituent_regions) > 1 else base_label
            region.highlight(label=current_label, color=color, **kwargs)
        return self

    def show(
        self,
        scale: float = 2.0,
        labels: bool = True,
        legend_position: str = "right",
        color: Optional[Union[Tuple, str]] = "fuchsia",
        label_prefix: Optional[str] = "FlowPart",
        width: Optional[int] = None,
        stack_direction: str = "vertical",
        stack_gap: int = 5,
        stack_background_color: Tuple[int, int, int] = (255, 255, 255),
        **kwargs
    ) -> Optional["PIL_Image"]:
        """
        Generates and returns a PIL Image of relevant pages with constituent regions highlighted.
        If multiple pages are involved, they are stacked into a single image.

        Args:
            scale: Rendering scale factor passed to the highlighter preview.
            labels: Whether to draw labels/legend for the highlighted parts.
            legend_position: Where to place the legend (passed through).
            color: Highlight color for all constituent parts.
            label_prefix: Base label for parts; indexed when multiple parts exist.
            width: Optional fixed output width for each rendered page.
            stack_direction: 'vertical' or 'horizontal' stacking of page images.
            stack_gap: Pixel gap between stacked page images.
            stack_background_color: RGB background for the stacked composite.
            **kwargs: Extra arguments forwarded to the highlighter's render_preview.

        Returns:
            A PIL Image, or None if there is nothing to render.

        Raises:
            ValueError: If a constituent region has no page, no highlighter
                service is available, the composite would have zero size, or
                stack_direction is invalid.
        """
        if not self.constituent_regions:
            logger.info("FlowRegion.show() called with no constituent regions.")
            return None

        # 1. Group constituent regions by their physical page
        regions_by_page: Dict["PhysicalPage", List["PhysicalRegion"]] = {}
        for region in self.constituent_regions:
            if region.page:
                if region.page not in regions_by_page:
                    regions_by_page[region.page] = []
                regions_by_page[region.page].append(region)
            else:
                raise ValueError(f"Constituent region {region.bbox} has no page.")

        if not regions_by_page:
            logger.info("FlowRegion.show() found no constituent regions with associated pages.")
            return None

        # 2. Get a highlighter service (e.g., from the first page involved)
        first_page_with_regions = next(iter(regions_by_page.keys()), None)
        highlighter_service = None
        if first_page_with_regions and hasattr(first_page_with_regions, '_highlighter'):
            highlighter_service = first_page_with_regions._highlighter

        if not highlighter_service:
            raise ValueError(
                "Cannot get highlighter service for FlowRegion.show(). "
                "Ensure constituent regions' pages are initialized with a highlighter."
            )

        output_page_images: List["PIL_Image_Runtime"] = []

        # Sort pages by index for consistent output order
        sorted_pages = sorted(regions_by_page.keys(), key=lambda p: p.index if hasattr(p, 'index') else getattr(p, 'page_number', 0))

        # 3. Render each page with its relevant constituent regions highlighted
        for page_idx, page_obj in enumerate(sorted_pages):
            constituent_regions_on_this_page = regions_by_page[page_obj]
            if not constituent_regions_on_this_page:
                continue

            temp_highlights_for_page = []
            for i, region_part in enumerate(constituent_regions_on_this_page):
                part_label = None
                if labels and label_prefix:  # Ensure labels is True for label_prefix to apply
                    # If FlowRegion consists of multiple parts on this page, or overall
                    count_indicator = ""
                    if len(self.constituent_regions) > 1:  # If flow region has multiple parts overall
                        # Find global index of this region_part in self.constituent_regions
                        try:
                            global_idx = self.constituent_regions.index(region_part)
                            count_indicator = f"_{global_idx + 1}"
                        except ValueError:  # Should not happen if region_part is from the list
                            count_indicator = f"_p{page_idx}i{i+1}"  # fallback local index
                    elif len(constituent_regions_on_this_page) > 1:  # If multiple parts on *this* page, but FR is single part overall
                        count_indicator = f"_{i+1}"

                    part_label = f"{label_prefix}{count_indicator}" if label_prefix else None

                temp_highlights_for_page.append({
                    "page_index": page_obj.index if hasattr(page_obj, 'index') else getattr(page_obj, 'page_number', 1) - 1,
                    "bbox": region_part.bbox,
                    "polygon": region_part.polygon if region_part.has_polygon else None,
                    "color": color,  # Use the passed color
                    "label": part_label,
                    "use_color_cycling": False,  # Keep specific color
                })

            if not temp_highlights_for_page:
                continue

            page_image = highlighter_service.render_preview(
                page_index=page_obj.index if hasattr(page_obj, 'index') else getattr(page_obj, 'page_number', 1) - 1,
                temporary_highlights=temp_highlights_for_page,
                scale=scale,
                width=width,
                labels=labels,  # Pass through labels
                legend_position=legend_position,
                **kwargs
            )
            if page_image:
                output_page_images.append(page_image)

        # 4. Stack the generated page images if multiple
        if not output_page_images:
            logger.info("FlowRegion.show() produced no page images to concatenate.")
            return None

        if len(output_page_images) == 1:
            return output_page_images[0]

        # Stacking logic (same as in FlowRegionCollection.show)
        if stack_direction == "vertical":
            final_width = max(img.width for img in output_page_images)
            final_height = sum(img.height for img in output_page_images) + (len(output_page_images) - 1) * stack_gap
            if final_width == 0 or final_height == 0:
                raise ValueError("Cannot create concatenated image with zero width or height.")

            concatenated_image = PIL_Image_Runtime.new("RGB", (final_width, final_height), stack_background_color)
            current_y = 0
            for img in output_page_images:
                # Center each page image horizontally within the composite.
                paste_x = (final_width - img.width) // 2
                concatenated_image.paste(img, (paste_x, current_y))
                current_y += img.height + stack_gap
            return concatenated_image
        elif stack_direction == "horizontal":
            final_width = sum(img.width for img in output_page_images) + (len(output_page_images) - 1) * stack_gap
            final_height = max(img.height for img in output_page_images)
            if final_width == 0 or final_height == 0:
                raise ValueError("Cannot create concatenated image with zero width or height.")

            concatenated_image = PIL_Image_Runtime.new("RGB", (final_width, final_height), stack_background_color)
            current_x = 0
            for img in output_page_images:
                # Center each page image vertically within the composite.
                paste_y = (final_height - img.height) // 2
                concatenated_image.paste(img, (current_x, paste_y))
                current_x += img.width + stack_gap
            return concatenated_image
        else:
            raise ValueError(f"Invalid stack_direction '{stack_direction}' for FlowRegion.show(). Must be 'vertical' or 'horizontal'.")

    def to_images(
        self,
        resolution: float = 150,
        **kwargs,
    ) -> List["PIL_Image"]:
        """
        Generates and returns a list of cropped PIL Images,
        one for each constituent physical region of this FlowRegion.

        Args:
            resolution: Render resolution (DPI) for each cropped image.
            **kwargs: Extra arguments forwarded to Region.to_image.

        Returns:
            List of PIL Images; regions that fail to render are skipped
            (the error is logged rather than raised).
        """
        if not self.constituent_regions:
            logger.info("FlowRegion.to_images() called on an empty FlowRegion.")
            return []

        cropped_images: List["PIL_Image"] = []
        for region_part in self.constituent_regions:
            try:
                img = region_part.to_image(
                    resolution=resolution,
                    crop_only=True,
                    include_highlights=False,
                    **kwargs
                )
                if img:
                    cropped_images.append(img)
            except Exception as e:
                # Best-effort: keep going so one bad region doesn't lose the rest.
                logger.error(f"Error generating image for constituent region {region_part.bbox}: {e}", exc_info=True)

        return cropped_images

    def to_image(self, background_color=(255, 255, 255), **kwargs) -> Optional["PIL_Image"]:
        """
        Creates a single composite image by stacking the images of its constituent regions.
        Stacking direction is based on the Flow's arrangement.
        Individual region images are obtained by calling to_images(**kwargs).

        Args:
            background_color: Tuple for RGB background color of the composite image.
            **kwargs: Additional arguments passed to to_images() for rendering individual parts
                (e.g., resolution).

        Returns:
            A single PIL.Image.Image object, or None if no constituent images.
        """
        # Use PIL_Image_Runtime for creating new images at runtime
        images = self.to_images(**kwargs)
        if not images:
            return None
        if len(images) == 1:
            return images[0]

        if self.flow.arrangement == "vertical":
            # Stack vertically
            composite_width = max(img.width for img in images)
            composite_height = sum(img.height for img in images)
            if composite_width == 0 or composite_height == 0:
                return None  # Avoid zero-size image

            new_image = PIL_Image_Runtime.new("RGB", (composite_width, composite_height), background_color)
            current_y = 0
            for img in images:
                # Default to left alignment for vertical stacking
                new_image.paste(img, (0, current_y))
                current_y += img.height
            return new_image

        elif self.flow.arrangement == "horizontal":
            # Stack horizontally
            composite_width = sum(img.width for img in images)
            composite_height = max(img.height for img in images)
            if composite_width == 0 or composite_height == 0:
                return None

            new_image = PIL_Image_Runtime.new("RGB", (composite_width, composite_height), background_color)
            current_x = 0
            for img in images:
                # Default to top alignment for horizontal stacking
                new_image.paste(img, (current_x, 0))
                current_x += img.width
            return new_image
        else:
            # Should not happen if flow.arrangement is validated
            logger.warning(f"Unknown flow arrangement: {self.flow.arrangement}. Cannot stack images.")
            return None

    def __repr__(self) -> str:
        return (
            f"<FlowRegion constituents={len(self.constituent_regions)}, flow={self.flow}, "
            f"source_bbox={self.source_flow_element.bbox if self.source_flow_element else 'N/A'}>"
        )

    @property
    def is_empty(self) -> bool:
        """Checks if the FlowRegion contains no constituent regions or if all are empty."""
        if not self.constituent_regions:
            return True
        # A FlowRegion can be "empty" despite having regions if they contain
        # no text and no elements; check actual content.
        try:
            return not bool(self.extract_text(apply_exclusions=False).strip()) and not bool(self.elements(apply_exclusions=False))
        except Exception:
            return True  # If error during check, assume empty to be safe
natural_pdf/selectors/parser.py
CHANGED
@@ -71,6 +71,91 @@ def safe_parse_color(value_str: str) -> tuple:
|
|
71
71
|
return (0, 0, 0)
|
72
72
|
|
73
73
|
|
74
|
+
def _split_top_level_or(selector: str) -> List[str]:
|
75
|
+
"""
|
76
|
+
Split a selector string on top-level OR operators (| or ,) only.
|
77
|
+
|
78
|
+
Respects parsing contexts and does not split when | or , appear inside:
|
79
|
+
- Quoted strings (both single and double quotes)
|
80
|
+
- Parentheses (for pseudo-class arguments like :not(...))
|
81
|
+
- Square brackets (for attribute selectors like [attr="value"])
|
82
|
+
|
83
|
+
Args:
|
84
|
+
selector: The selector string to split
|
85
|
+
|
86
|
+
Returns:
|
87
|
+
List of selector parts. If no top-level OR operators found, returns [selector].
|
88
|
+
|
89
|
+
Examples:
|
90
|
+
>>> _split_top_level_or('text:contains("a|b")|text:bold')
|
91
|
+
['text:contains("a|b")', 'text:bold']
|
92
|
+
|
93
|
+
>>> _split_top_level_or('text:contains("hello,world")')
|
94
|
+
['text:contains("hello,world")']
|
95
|
+
"""
|
96
|
+
if not selector or not isinstance(selector, str):
|
97
|
+
return [selector] if selector else []
|
98
|
+
|
99
|
+
parts = []
|
100
|
+
current_part = ""
|
101
|
+
i = 0
|
102
|
+
|
103
|
+
# Parsing state
|
104
|
+
in_double_quotes = False
|
105
|
+
in_single_quotes = False
|
106
|
+
paren_depth = 0
|
107
|
+
bracket_depth = 0
|
108
|
+
|
109
|
+
while i < len(selector):
|
110
|
+
char = selector[i]
|
111
|
+
|
112
|
+
# Handle escape sequences in quotes
|
113
|
+
if i > 0 and selector[i-1] == '\\':
|
114
|
+
current_part += char
|
115
|
+
i += 1
|
116
|
+
continue
|
117
|
+
|
118
|
+
# Handle quote state changes
|
119
|
+
if char == '"' and not in_single_quotes:
|
120
|
+
in_double_quotes = not in_double_quotes
|
121
|
+
elif char == "'" and not in_double_quotes:
|
122
|
+
in_single_quotes = not in_single_quotes
|
123
|
+
|
124
|
+
# Handle parentheses and brackets only when not in quotes
|
125
|
+
elif not in_double_quotes and not in_single_quotes:
|
126
|
+
if char == '(':
|
127
|
+
paren_depth += 1
|
128
|
+
elif char == ')':
|
129
|
+
paren_depth -= 1
|
130
|
+
elif char == '[':
|
131
|
+
bracket_depth += 1
|
132
|
+
elif char == ']':
|
133
|
+
bracket_depth -= 1
|
134
|
+
|
135
|
+
# Check for top-level OR operators
|
136
|
+
elif (char == '|' or char == ',') and paren_depth == 0 and bracket_depth == 0:
|
137
|
+
# Found a top-level OR operator
|
138
|
+
part = current_part.strip()
|
139
|
+
if part: # Only add non-empty parts
|
140
|
+
parts.append(part)
|
141
|
+
current_part = ""
|
142
|
+
i += 1
|
143
|
+
continue
|
144
|
+
|
145
|
+
# Add character to current part
|
146
|
+
current_part += char
|
147
|
+
i += 1
|
148
|
+
|
149
|
+
# Add the final part
|
150
|
+
final_part = current_part.strip()
|
151
|
+
if final_part:
|
152
|
+
parts.append(final_part)
|
153
|
+
|
154
|
+
# If we only found one part, return it as a single-element list
|
155
|
+
# If we found multiple parts, those are the OR-separated parts
|
156
|
+
return parts if parts else [selector]
|
157
|
+
|
158
|
+
|
74
159
|
def parse_selector(selector: str) -> Dict[str, Any]:
|
75
160
|
"""
|
76
161
|
Parse a CSS-like selector string into a structured selector object.
|
@@ -80,12 +165,28 @@ def parse_selector(selector: str) -> Dict[str, Any]:
|
|
80
165
|
- Attribute presence (e.g., '[data-id]')
|
81
166
|
- Attribute value checks with various operators (e.g., '[count=5]', '[name*="bold"]'')
|
82
167
|
- Pseudo-classes (e.g., ':contains("Total")', ':empty', ':not(...)')
|
168
|
+
- OR operators (e.g., 'text:contains("A")|text:bold', 'sel1,sel2')
|
83
169
|
|
84
170
|
Args:
|
85
171
|
selector: CSS-like selector string
|
86
172
|
|
87
173
|
Returns:
|
88
|
-
Dict representing the parsed selector
|
174
|
+
Dict representing the parsed selector, or compound selector with OR logic
|
175
|
+
|
176
|
+
Examples:
|
177
|
+
>>> parse_selector('text:contains("hello")') # Single selector
|
178
|
+
{'type': 'text', 'pseudo_classes': [{'name': 'contains', 'args': 'hello'}], ...}
|
179
|
+
|
180
|
+
>>> parse_selector('text:contains("A")|text:bold') # OR with pipe
|
181
|
+
{'type': 'or', 'selectors': [...]}
|
182
|
+
|
183
|
+
>>> parse_selector('text:contains("A"),line[width>5]') # OR with comma
|
184
|
+
{'type': 'or', 'selectors': [...]}
|
185
|
+
|
186
|
+
Note:
|
187
|
+
OR operators work with all selector types except spatial pseudo-classes
|
188
|
+
(:above, :below, :near, :left-of, :right-of) which require page context.
|
189
|
+
Spatial relationships within OR selectors are not currently supported.
|
89
190
|
"""
|
90
191
|
result = {
|
91
192
|
"type": "any",
|
@@ -100,6 +201,36 @@ def parse_selector(selector: str) -> Dict[str, Any]:
|
|
100
201
|
|
101
202
|
selector = selector.strip()
|
102
203
|
|
204
|
+
# --- Handle OR operators first (| or ,) ---
|
205
|
+
# Check if selector contains OR operators at the top level only
|
206
|
+
# (not inside quotes, parentheses, or brackets)
|
207
|
+
or_parts = _split_top_level_or(selector)
|
208
|
+
|
209
|
+
# If we found OR parts, parse each one recursively and return compound selector
|
210
|
+
if len(or_parts) > 1:
|
211
|
+
parsed_selectors = []
|
212
|
+
for part in or_parts:
|
213
|
+
try:
|
214
|
+
parsed_selectors.append(parse_selector(part))
|
215
|
+
except (ValueError, TypeError) as e:
|
216
|
+
logger.warning(f"Skipping invalid OR selector part '{part}': {e}")
|
217
|
+
continue
|
218
|
+
|
219
|
+
if len(parsed_selectors) > 1:
|
220
|
+
return {
|
221
|
+
"type": "or",
|
222
|
+
"selectors": parsed_selectors
|
223
|
+
}
|
224
|
+
elif len(parsed_selectors) == 1:
|
225
|
+
# Only one valid part, return it directly
|
226
|
+
return parsed_selectors[0]
|
227
|
+
else:
|
228
|
+
# No valid parts, return default
|
229
|
+
logger.warning(f"No valid parts found in OR selector '{original_selector_for_error}', returning default selector")
|
230
|
+
return result
|
231
|
+
|
232
|
+
# --- Continue with single selector parsing (existing logic) ---
|
233
|
+
|
103
234
|
# --- Handle wildcard selector explicitly ---
|
104
235
|
if selector == "*":
|
105
236
|
# Wildcard matches any type, already the default.
|
@@ -109,12 +240,6 @@ def parse_selector(selector: str) -> Dict[str, Any]:
|
|
109
240
|
|
110
241
|
# 1. Extract type (optional, at the beginning)
|
111
242
|
# Only run if selector wasn't '*'
|
112
|
-
if selector:
|
113
|
-
type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
|
114
|
-
if type_match:
|
115
|
-
result["type"] = type_match.group(1).lower()
|
116
|
-
selector = selector[len(type_match.group(0)) :].strip()
|
117
|
-
# Only run if selector wasn't '*'
|
118
243
|
if selector:
|
119
244
|
type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
|
120
245
|
if type_match:
|
@@ -597,12 +722,42 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
|
|
597
722
|
To inspect the individual filters, call `_build_filter_list` directly.
|
598
723
|
|
599
724
|
Args:
|
600
|
-
selector: Parsed selector dictionary
|
725
|
+
selector: Parsed selector dictionary (single or compound OR selector)
|
601
726
|
**kwargs: Additional filter parameters (e.g., regex, case).
|
602
727
|
|
603
728
|
Returns:
|
604
729
|
Function that takes an element and returns True if it matches the selector.
|
605
730
|
"""
|
731
|
+
# Handle compound OR selectors
|
732
|
+
if selector.get("type") == "or":
|
733
|
+
sub_selectors = selector.get("selectors", [])
|
734
|
+
if not sub_selectors:
|
735
|
+
# Empty OR selector, return a function that never matches
|
736
|
+
return lambda element: False
|
737
|
+
|
738
|
+
# Create filter functions for each sub-selector
|
739
|
+
sub_filter_funcs = []
|
740
|
+
for sub_selector in sub_selectors:
|
741
|
+
sub_filter_funcs.append(selector_to_filter_func(sub_selector, **kwargs))
|
742
|
+
|
743
|
+
if logger.isEnabledFor(logging.DEBUG):
|
744
|
+
logger.debug(f"Creating OR filter with {len(sub_filter_funcs)} sub-selectors")
|
745
|
+
|
746
|
+
# Return OR combination - element matches if ANY sub-selector matches
|
747
|
+
def or_filter(element):
|
748
|
+
for func in sub_filter_funcs:
|
749
|
+
try:
|
750
|
+
if func(element):
|
751
|
+
return True
|
752
|
+
except Exception as e:
|
753
|
+
logger.error(f"Error applying OR sub-filter to element: {e}", exc_info=True)
|
754
|
+
# Continue to next sub-filter on error
|
755
|
+
continue
|
756
|
+
return False
|
757
|
+
|
758
|
+
return or_filter
|
759
|
+
|
760
|
+
# Handle single selectors (existing logic)
|
606
761
|
filter_list = _build_filter_list(selector, **kwargs)
|
607
762
|
|
608
763
|
if logger.isEnabledFor(logging.DEBUG):
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.1.12
|
3
|
+
Version: 0.1.13
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -20,6 +20,7 @@ Requires-Dist: tqdm
|
|
20
20
|
Requires-Dist: pydantic
|
21
21
|
Requires-Dist: jenkspy
|
22
22
|
Requires-Dist: pikepdf>=9.7.0
|
23
|
+
Requires-Dist: scipy
|
23
24
|
Provides-Extra: viewer
|
24
25
|
Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "viewer"
|
25
26
|
Provides-Extra: easyocr
|