natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +7 -2
- natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +3 -4
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +146 -75
- natural_pdf/core/page.py +287 -188
- natural_pdf/core/pdf.py +57 -42
- natural_pdf/elements/base.py +51 -0
- natural_pdf/elements/collections.py +362 -67
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +396 -23
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +40 -61
- natural_pdf/exporters/hocr_font.py +7 -13
- natural_pdf/exporters/original_pdf.py +10 -13
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/exporters/searchable_pdf.py +0 -10
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/selectors/parser.py +163 -8
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
natural_pdf/flows/region.py ADDED
@@ -0,0 +1,458 @@
+import logging
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+
+from pdfplumber.utils.geometry import objects_to_bbox  # For calculating combined bbox
+
+# For runtime image manipulation
+from PIL import Image as PIL_Image_Runtime
+
+if TYPE_CHECKING:
+    from PIL.Image import Image as PIL_Image  # For type hints
+    from natural_pdf.elements.base import Element as PhysicalElement
+    from natural_pdf.elements.region import Region as PhysicalRegion
+    from natural_pdf.elements.collections import ElementCollection
+    from natural_pdf.core.page import Page as PhysicalPage
+    from .flow import Flow
+    from .element import FlowElement
+
+logger = logging.getLogger(__name__)
+
+
+class FlowRegion:
+    """
+    Represents a selected area within a Flow, potentially composed of multiple
+    physical Region objects (constituent_regions) that might span across
+    different original pages or disjoint physical regions defined in the Flow.
+
+    A FlowRegion is the result of a directional operation (e.g., .below(), .above())
+    on a FlowElement.
+    """
+
+    def __init__(
+        self,
+        flow: "Flow",
+        constituent_regions: List["PhysicalRegion"],
+        source_flow_element: "FlowElement",
+        boundary_element_found: Optional["PhysicalElement"] = None,
+    ):
+        """
+        Initializes a FlowRegion.
+
+        Args:
+            flow: The Flow instance this region belongs to.
+            constituent_regions: A list of physical natural_pdf.elements.region.Region
+                objects that make up this FlowRegion.
+            source_flow_element: The FlowElement that created this FlowRegion.
+            boundary_element_found: The physical element that stopped an 'until' search,
+                if applicable.
+        """
+        self.flow: "Flow" = flow
+        self.constituent_regions: List["PhysicalRegion"] = constituent_regions
+        self.source_flow_element: "FlowElement" = source_flow_element
+        self.boundary_element_found: Optional["PhysicalElement"] = boundary_element_found
+
+        # Cache for expensive operations
+        self._cached_text: Optional[str] = None
+        self._cached_elements: Optional["ElementCollection"] = None  # Stringized
+        self._cached_bbox: Optional[Tuple[float, float, float, float]] = None
+
+    @property
+    def bbox(self) -> Optional[Tuple[float, float, float, float]]:
+        """
+        Calculates a conceptual bounding box that encompasses all constituent physical regions.
+        This is the union of the bounding boxes of the constituent regions in their
+        original physical coordinates.
+        Returns None if there are no constituent regions.
+        """
+        if self._cached_bbox is not None:
+            return self._cached_bbox
+        if not self.constituent_regions:
+            return None
+
+        # Use objects_to_bbox from pdfplumber.utils.geometry to merge bboxes.
+        # This helper expects a list of objects that have .x0, .top, .x1, .bottom attributes.
+        # Our PhysicalRegion objects satisfy this.
+        self._cached_bbox = objects_to_bbox(self.constituent_regions)
+        return self._cached_bbox
+
+    @property
+    def x0(self) -> Optional[float]:
+        return self.bbox[0] if self.bbox else None
+
+    @property
+    def top(self) -> Optional[float]:
+        return self.bbox[1] if self.bbox else None
+
+    @property
+    def x1(self) -> Optional[float]:
+        return self.bbox[2] if self.bbox else None
+
+    @property
+    def bottom(self) -> Optional[float]:
+        return self.bbox[3] if self.bbox else None
+
+    @property
+    def width(self) -> Optional[float]:
+        return self.x1 - self.x0 if self.bbox else None
+
+    @property
+    def height(self) -> Optional[float]:
+        return self.bottom - self.top if self.bbox else None
+
+    def extract_text(self, apply_exclusions: bool = True, **kwargs) -> str:
+        """
+        Extracts and concatenates text from all constituent physical regions.
+        The order of concatenation respects the flow's arrangement.
+
+        Args:
+            apply_exclusions: Whether to respect PDF exclusion zones within each
+                constituent physical region during text extraction.
+            **kwargs: Additional arguments passed to the underlying extract_text method
+                of each constituent region.
+
+        Returns:
+            The combined text content as a string.
+        """
+        if self._cached_text is not None and apply_exclusions:  # Simple cache check, might need refinement if kwargs change behavior
+            return self._cached_text
+
+        if not self.constituent_regions:
+            return ""
+
+        texts: List[str] = []
+        # For now, simple concatenation. Order depends on how constituent_regions were added.
+        # The FlowElement._flow_direction method is responsible for ordering constituent_regions correctly.
+        for region in self.constituent_regions:
+            texts.append(region.extract_text(apply_exclusions=apply_exclusions, **kwargs))
+
+        # Join based on flow arrangement (e.g., newline for vertical, space for horizontal).
+        # This is a simplification; true layout-aware joining would be more complex.
+        joiner = "\n" if self.flow.arrangement == "vertical" else " "  # TODO: Make this smarter, consider segment_gap
+        extracted = joiner.join(t for t in texts if t)
+
+        if apply_exclusions:  # Only cache if standard exclusion behavior
+            self._cached_text = extracted
+        return extracted
+
+    def elements(self, apply_exclusions: bool = True) -> "ElementCollection":  # Stringized return
+        """
+        Collects all unique physical elements from all constituent physical regions.
+
+        Args:
+            apply_exclusions: Whether to respect PDF exclusion zones within each
+                constituent physical region when gathering elements.
+
+        Returns:
+            An ElementCollection containing all unique elements.
+        """
+        from natural_pdf.elements.collections import ElementCollection as RuntimeElementCollection  # Local import
+
+        if self._cached_elements is not None and apply_exclusions:  # Simple cache check
+            return self._cached_elements
+
+        if not self.constituent_regions:
+            return RuntimeElementCollection([])
+
+        all_physical_elements: List["PhysicalElement"] = []  # Stringized item type
+        seen_elements = set()  # To ensure uniqueness if elements are shared or duplicated by region definitions
+
+        for region in self.constituent_regions:
+            # Region.get_elements() returns a list, not ElementCollection
+            elements_in_region: List["PhysicalElement"] = region.get_elements(apply_exclusions=apply_exclusions)
+            for elem in elements_in_region:
+                if elem not in seen_elements:  # Check for uniqueness based on object identity
+                    all_physical_elements.append(elem)
+                    seen_elements.add(elem)
+
+        # Basic reading-order sort based on original page and coordinates.
+        def get_sort_key(phys_elem: "PhysicalElement"):  # Stringized param type
+            page_idx = -1
+            if hasattr(phys_elem, 'page') and hasattr(phys_elem.page, 'index'):
+                page_idx = phys_elem.page.index
+            return (page_idx, phys_elem.top, phys_elem.x0)
+
+        try:
+            sorted_physical_elements = sorted(all_physical_elements, key=get_sort_key)
+        except AttributeError:
+            logger.warning("Could not sort elements in FlowRegion by reading order; some elements might be missing page, top, or x0 attributes.")
+            sorted_physical_elements = all_physical_elements
+
+        result_collection = RuntimeElementCollection(sorted_physical_elements)
+        if apply_exclusions:
+            self._cached_elements = result_collection
+        return result_collection
+
+    def find(self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs) -> Optional["PhysicalElement"]:  # Stringized
+        """
+        Finds the first physical element within this FlowRegion that matches the selector or text.
+        """
+        # Uses self.elements(), which respects exclusions if apply_exclusions=True by default
+        all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
+        return all_elems.find(selector=selector, text=text, **kwargs)  # ElementCollection.find
+
+    def find_all(self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs) -> "ElementCollection":  # Stringized
+        """
+        Finds all physical elements within this FlowRegion that match the selector or text.
+        """
+        all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
+        return all_elems.find_all(selector=selector, text=text, **kwargs)  # ElementCollection.find_all
+
+    def highlight(self, label: Optional[str] = None, color: Optional[Union[Tuple, str]] = None, **kwargs) -> "FlowRegion":  # Stringized
+        """
+        Highlights all constituent physical regions on their respective pages.
+
+        Args:
+            label: A base label for the highlights. Each constituent region might get an indexed label.
+            color: Color for the highlight.
+            **kwargs: Additional arguments for the underlying highlight method.
+
+        Returns:
+            Self for method chaining.
+        """
+        if not self.constituent_regions:
+            return self
+
+        base_label = label if label else "FlowRegionPart"
+        for i, region in enumerate(self.constituent_regions):
+            current_label = f"{base_label}_{i+1}" if len(self.constituent_regions) > 1 else base_label
+            region.highlight(label=current_label, color=color, **kwargs)
+        return self
+
+    def show(
+        self,
+        scale: float = 2.0,
+        labels: bool = True,
+        legend_position: str = "right",
+        color: Optional[Union[Tuple, str]] = "fuchsia",
+        label_prefix: Optional[str] = "FlowPart",
+        width: Optional[int] = None,
+        stack_direction: str = "vertical",
+        stack_gap: int = 5,
+        stack_background_color: Tuple[int, int, int] = (255, 255, 255),
+        **kwargs
+    ) -> Optional["PIL_Image"]:
+        """
+        Generates and returns a PIL Image of relevant pages with constituent regions highlighted.
+        If multiple pages are involved, they are stacked into a single image.
+        """
+        if not self.constituent_regions:
+            logger.info("FlowRegion.show() called with no constituent regions.")
+            return None
+
+        # 1. Group constituent regions by their physical page
+        regions_by_page: Dict["PhysicalPage", List["PhysicalRegion"]] = {}
+        for region in self.constituent_regions:
+            if region.page:
+                if region.page not in regions_by_page:
+                    regions_by_page[region.page] = []
+                regions_by_page[region.page].append(region)
+            else:
+                raise ValueError(f"Constituent region {region.bbox} has no page.")
+
+        if not regions_by_page:
+            logger.info("FlowRegion.show() found no constituent regions with associated pages.")
+            return None
+
+        # 2. Get a highlighter service (e.g., from the first page involved)
+        first_page_with_regions = next(iter(regions_by_page.keys()), None)
+        highlighter_service = None
+        if first_page_with_regions and hasattr(first_page_with_regions, '_highlighter'):
+            highlighter_service = first_page_with_regions._highlighter
+
+        if not highlighter_service:
+            raise ValueError(
+                "Cannot get highlighter service for FlowRegion.show(). "
+                "Ensure constituent regions' pages are initialized with a highlighter."
+            )
+
+        output_page_images: List["PIL_Image_Runtime"] = []
+
+        # Sort pages by index for consistent output order
+        sorted_pages = sorted(regions_by_page.keys(), key=lambda p: p.index if hasattr(p, 'index') else getattr(p, 'page_number', 0))
+
+        # 3. Render each page with its relevant constituent regions highlighted
+        for page_idx, page_obj in enumerate(sorted_pages):
+            constituent_regions_on_this_page = regions_by_page[page_obj]
+            if not constituent_regions_on_this_page:
+                continue
+
+            temp_highlights_for_page = []
+            for i, region_part in enumerate(constituent_regions_on_this_page):
+                part_label = None
+                if labels and label_prefix:  # Ensure labels is True for label_prefix to apply
+                    # If FlowRegion consists of multiple parts on this page, or overall
+                    count_indicator = ""
+                    if len(self.constituent_regions) > 1:  # If flow region has multiple parts overall
+                        # Find global index of this region_part in self.constituent_regions
+                        try:
+                            global_idx = self.constituent_regions.index(region_part)
+                            count_indicator = f"_{global_idx + 1}"
+                        except ValueError:  # Should not happen if region_part is from the list
+                            count_indicator = f"_p{page_idx}i{i+1}"  # fallback local index
+                    elif len(constituent_regions_on_this_page) > 1:  # If multiple parts on *this* page, but FR is single part overall
+                        count_indicator = f"_{i+1}"
+
+                    part_label = f"{label_prefix}{count_indicator}" if label_prefix else None
+
+                temp_highlights_for_page.append({
+                    "page_index": page_obj.index if hasattr(page_obj, 'index') else getattr(page_obj, 'page_number', 1) - 1,
+                    "bbox": region_part.bbox,
+                    "polygon": region_part.polygon if region_part.has_polygon else None,
+                    "color": color,  # Use the passed color
+                    "label": part_label,
+                    "use_color_cycling": False,  # Keep specific color
+                })
+
+            if not temp_highlights_for_page:
+                continue
+
+            page_image = highlighter_service.render_preview(
+                page_index=page_obj.index if hasattr(page_obj, 'index') else getattr(page_obj, 'page_number', 1) - 1,
+                temporary_highlights=temp_highlights_for_page,
+                scale=scale,
+                width=width,
+                labels=labels,  # Pass through labels
+                legend_position=legend_position,
+                **kwargs
+            )
+            if page_image:
+                output_page_images.append(page_image)
+
+        # 4. Stack the generated page images if multiple
+        if not output_page_images:
+            logger.info("FlowRegion.show() produced no page images to concatenate.")
+            return None
+
+        if len(output_page_images) == 1:
+            return output_page_images[0]
+
+        # Stacking logic (same as in FlowRegionCollection.show)
+        if stack_direction == "vertical":
+            final_width = max(img.width for img in output_page_images)
+            final_height = sum(img.height for img in output_page_images) + (len(output_page_images) - 1) * stack_gap
+            if final_width == 0 or final_height == 0:
+                raise ValueError("Cannot create concatenated image with zero width or height.")
+
+            concatenated_image = PIL_Image_Runtime.new("RGB", (final_width, final_height), stack_background_color)
+            current_y = 0
+            for img in output_page_images:
+                paste_x = (final_width - img.width) // 2
+                concatenated_image.paste(img, (paste_x, current_y))
+                current_y += img.height + stack_gap
+            return concatenated_image
+        elif stack_direction == "horizontal":
+            final_width = sum(img.width for img in output_page_images) + (len(output_page_images) - 1) * stack_gap
+            final_height = max(img.height for img in output_page_images)
+            if final_width == 0 or final_height == 0:
+                raise ValueError("Cannot create concatenated image with zero width or height.")
+
+            concatenated_image = PIL_Image_Runtime.new("RGB", (final_width, final_height), stack_background_color)
+            current_x = 0
+            for img in output_page_images:
+                paste_y = (final_height - img.height) // 2
+                concatenated_image.paste(img, (current_x, paste_y))
+                current_x += img.width + stack_gap
+            return concatenated_image
+        else:
+            raise ValueError(f"Invalid stack_direction '{stack_direction}' for FlowRegion.show(). Must be 'vertical' or 'horizontal'.")
+
+    def to_images(
+        self,
+        resolution: float = 150,
+        **kwargs,
+    ) -> List["PIL_Image"]:
+        """
+        Generates and returns a list of cropped PIL Images,
+        one for each constituent physical region of this FlowRegion.
+        """
+        if not self.constituent_regions:
+            logger.info("FlowRegion.to_images() called on an empty FlowRegion.")
+            return []
+
+        cropped_images: List["PIL_Image"] = []
+        for region_part in self.constituent_regions:
+            try:
+                img = region_part.to_image(
+                    resolution=resolution,
+                    crop_only=True,
+                    include_highlights=False,
+                    **kwargs
+                )
+                if img:
+                    cropped_images.append(img)
+            except Exception as e:
+                logger.error(f"Error generating image for constituent region {region_part.bbox}: {e}", exc_info=True)
+
+        return cropped_images
+
+    def to_image(self, background_color=(255, 255, 255), **kwargs) -> Optional["PIL_Image"]:
+        """
+        Creates a single composite image by stacking the images of its constituent regions.
+        Stacking direction is based on the Flow's arrangement.
+        Individual region images are obtained by calling to_images(**kwargs).
+
+        Args:
+            background_color: Tuple for RGB background color of the composite image.
+            **kwargs: Additional arguments passed to to_images() for rendering individual parts
+                (e.g., resolution).
+
+        Returns:
+            A single PIL.Image.Image object, or None if no constituent images.
+        """
+        # Use PIL_Image_Runtime for creating new images at runtime
+        images = self.to_images(**kwargs)
+        if not images:
+            return None
+        if len(images) == 1:
+            return images[0]
+
+        if self.flow.arrangement == "vertical":
+            # Stack vertically
+            composite_width = max(img.width for img in images)
+            composite_height = sum(img.height for img in images)
+            if composite_width == 0 or composite_height == 0: return None  # Avoid zero-size image
+
+            new_image = PIL_Image_Runtime.new("RGB", (composite_width, composite_height), background_color)
+            current_y = 0
+            for img in images:
+                # Default to left alignment for vertical stacking
+                new_image.paste(img, (0, current_y))
+                current_y += img.height
+            return new_image
+
+        elif self.flow.arrangement == "horizontal":
+            # Stack horizontally
+            composite_width = sum(img.width for img in images)
+            composite_height = max(img.height for img in images)
+            if composite_width == 0 or composite_height == 0: return None
+
+            new_image = PIL_Image_Runtime.new("RGB", (composite_width, composite_height), background_color)
+            current_x = 0
+            for img in images:
+                # Default to top alignment for horizontal stacking
+                new_image.paste(img, (current_x, 0))
+                current_x += img.width
+            return new_image
+        else:
+            # Should not happen if flow.arrangement is validated
+            logger.warning(f"Unknown flow arrangement: {self.flow.arrangement}. Cannot stack images.")
+            return None
+
+    def __repr__(self) -> str:
+        return (
+            f"<FlowRegion constituents={len(self.constituent_regions)}, flow={self.flow}, "
+            f"source_bbox={self.source_flow_element.bbox if self.source_flow_element else 'N/A'}>"
+        )
+
+    @property
+    def is_empty(self) -> bool:
+        """Checks if the FlowRegion contains no constituent regions or if all are empty."""
+        if not self.constituent_regions:
+            return True
+        # A more robust check might see if extract_text() is empty and elements() is empty.
+        # For now, if it has regions, it's not considered empty by this simple check.
+        # User Point 4: FlowRegion can be empty (no text, no elements). This implies checking content.
+        try:
+            return not bool(self.extract_text(apply_exclusions=False).strip()) and not bool(self.elements(apply_exclusions=False))
+        except Exception:
+            return True  # If error during check, assume empty to be safe
natural_pdf/search/__init__.py CHANGED
@@ -3,29 +3,46 @@
 import logging
 from typing import Optional
 
-#
-
-from .haystack_search_service import HaystackSearchService
-
-# --- Utils Import ---
-from .haystack_utils import ( # Re-export flag and helper
-    HAS_HAYSTACK_EXTRAS,
-    check_haystack_availability,
-)
-
-# --- Option Imports (for convenience) ---
-# Make options easily available via `from natural_pdf.search import ...`
-from .search_options import SearchOptions # Alias for TextSearchOptions for simplicity?
+# Import constants
+from .search_options import SearchOptions
 from .search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
-
-# --- Protocol Import ---
-# Import the protocol for type hinting
 from .search_service_protocol import Indexable, IndexConfigurationError, SearchServiceProtocol
 
+# Check search extras availability
+LANCEDB_AVAILABLE = False
+SEARCH_DEPENDENCIES_AVAILABLE = False
+
+try:
+    import sentence_transformers
+    import numpy as np
+    # Basic search dependencies are available
+    SEARCH_DEPENDENCIES_AVAILABLE = True
+
+    # Check if LanceDB is available
+    try:
+        import lancedb
+        import pyarrow
+        LANCEDB_AVAILABLE = True
+        from .lancedb_search_service import LanceDBSearchService, DEFAULT_LANCEDB_PERSIST_PATH, DEFAULT_EMBEDDING_MODEL
+    except ImportError:
+        # LanceDB not available, we'll use NumPy fallback
+        LANCEDB_AVAILABLE = False
+        from .numpy_search_service import NumpySearchService, DEFAULT_EMBEDDING_MODEL
+except ImportError:
+    # Basic dependencies missing
+    SEARCH_DEPENDENCIES_AVAILABLE = False
+    LANCEDB_AVAILABLE = False
+
 logger = logging.getLogger(__name__)
 
+def check_search_availability():
+    """Check if required search dependencies are available."""
+    if not SEARCH_DEPENDENCIES_AVAILABLE:
+        raise ImportError(
+            "Search functionality requires 'sentence-transformers' and NumPy. "
+            "Install with: pip install natural-pdf[search] (or pip install sentence-transformers numpy)"
+        )
 
-# Factory Function
 def get_search_service(
     collection_name: str,
     persist: bool = False,
@@ -34,53 +51,49 @@ def get_search_service(
 ) -> SearchServiceProtocol:
     """
     Factory function to get an instance of the configured search service.
-
-
-
-
+
+    Automatically selects the best available implementation:
+    - LanceDB if installed (recommended for both in-memory and persistent)
+    - Numpy fallback for in-memory only
 
     Args:
-        collection_name: The logical name for the index this service instance manages
-            (used as table_name for LanceDB).
+        collection_name: The logical name for the index/table this service instance manages.
         persist: If True, creates a service instance configured for persistent
-            storage
-        uri: Override the default path
+            storage. If False (default), uses InMemory (via temp dir for LanceDB).
+        uri: Override the default path for persistent storage.
         default_embedding_model: Override the default embedding model used by the service.
-        **kwargs: Reserved for future configuration options.
 
     Returns:
-        An instance conforming to the SearchServiceProtocol
+        An instance conforming to the SearchServiceProtocol.
     """
     logger.debug(
-        f"Calling get_search_service factory for
+        f"Calling get_search_service factory for collection '{collection_name}' (persist={persist}, uri={uri})..."
     )
+    check_search_availability()
 
-
-
-
-
+    service_args = {
+        "collection_name": collection_name,
+        "persist": persist,
+    }
     if uri is not None:
        service_args["uri"] = uri
-    if default_embedding_model is not None:
-        service_args["embedding_model"] = default_embedding_model
 
-
+    if default_embedding_model is not None:
+        service_args["embedding_model_name"] = default_embedding_model
 
-
-
-
-
-
-        # Error message remains valid
-        logger.error(
-            f"Failed to instantiate Search Service due to missing dependencies: {e}", exc_info=True
+    # If persistence is requested, LanceDB is required
+    if persist and not LANCEDB_AVAILABLE:
+        raise RuntimeError(
+            "Persistent vector search requires LanceDB. "
+            "Please install: pip install lancedb"
        )
-
-
-
-
-
-
-
-
-
+
+    # Select the appropriate implementation
+    if LANCEDB_AVAILABLE:
+        logger.info(f"Using LanceDB for vector search (collection: {collection_name})")
+        service_instance = LanceDBSearchService(**service_args)
+    else:
+        logger.info(f"Using NumPy fallback for in-memory vector search (collection: {collection_name})")
+        service_instance = NumpySearchService(**service_args)
+
+    return service_instance