natural-pdf 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1092 @@
1
+ import logging
2
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
3
+
4
+ import cv2
5
+ import numpy as np
6
+ from PIL import Image, ImageDraw
7
+ from scipy.signal import find_peaks
8
+ from scipy.ndimage import gaussian_filter1d
9
+
10
+ if TYPE_CHECKING:
11
+ from natural_pdf.core.page import Page
12
+ from natural_pdf.core.pdf import PDF
13
+ from natural_pdf.elements.collections import ElementCollection, PageCollection
14
+ from natural_pdf.elements.line import LineElement
15
+ # from natural_pdf.elements.rect import RectangleElement # Removed
16
+ from natural_pdf.elements.region import Region
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
# Fallback values for the less frequently tuned line detection parameters.
# The projection-profile detection methods use these as keyword defaults.
LINE_DETECTION_PARAM_DEFAULTS = dict(
    # Binarization: "otsu" or "adaptive". The adaptive_* values are only
    # handed to cv2.adaptiveThreshold when the method is "adaptive".
    binarization_method="otsu",
    adaptive_thresh_block_size=21,
    adaptive_thresh_C_val=5,
    # Optional morphology per direction; kernels are (columns, rows).
    morph_op_h="none",
    morph_kernel_h=(1, 2),
    morph_op_v="none",
    morph_kernel_v=(2, 1),
    # Gaussian sigma applied to each projection profile before peak finding.
    smoothing_sigma_h=0.6,
    smoothing_sigma_v=0.6,
    # rel_height forwarded to scipy.signal.find_peaks for width measurement.
    peak_width_rel_height=0.5,
)
33
+
34
+ class ShapeDetectionMixin:
35
+ """
36
+ Mixin class to provide shape detection capabilities (lines)
37
+ for Page, Region, PDFCollection, and PageCollection objects.
38
+ """
39
+
40
+ def _get_image_for_detection(self, resolution: int) -> Tuple[Optional[np.ndarray], float, Tuple[float, float], Optional['Page']]:
41
+ """
42
+ Gets the image for detection, scale factor, PDF origin offset, and the relevant page object.
43
+
44
+ Returns:
45
+ Tuple containing:
46
+ - cv_image (np.ndarray, optional): The OpenCV image array.
47
+ - scale_factor (float): Factor to convert image pixels to PDF points.
48
+ - origin_offset_pdf (Tuple[float, float]): (x0, top) offset in PDF points.
49
+ - page_obj (Page, optional): The page object this detection pertains to.
50
+ """
51
+ pil_image = None
52
+ page_obj = None
53
+ origin_offset_pdf = (0.0, 0.0)
54
+
55
+ # Determine the type of self and get the appropriate image and page context
56
+ if hasattr(self, 'to_image') and hasattr(self, 'width') and hasattr(self, 'height'): # Page or Region
57
+ if hasattr(self, 'x0') and hasattr(self, 'top') and hasattr(self, '_page'): # Region
58
+ logger.debug(f"Shape detection on Region: {self}")
59
+ page_obj = self._page
60
+ pil_image = self.to_image(resolution=resolution, crop_only=True, include_highlights=False)
61
+ if pil_image: # Ensure pil_image is not None before accessing attributes
62
+ origin_offset_pdf = (self.x0, self.top)
63
+ logger.debug(f"Region image rendered successfully: {pil_image.width}x{pil_image.height}, origin_offset: {origin_offset_pdf}")
64
+ else: # Page
65
+ logger.debug(f"Shape detection on Page: {self}")
66
+ page_obj = self
67
+ pil_image = self.to_image(resolution=resolution, include_highlights=False)
68
+ logger.debug(f"Page image rendered successfully: {pil_image.width}x{pil_image.height}")
69
+ else:
70
+ logger.error(f"Instance of type {type(self)} does not support to_image for detection.")
71
+ return None, 1.0, (0.0, 0.0), None
72
+
73
+ if not pil_image:
74
+ logger.warning("Failed to render image for shape detection.")
75
+ return None, 1.0, (0.0, 0.0), page_obj
76
+
77
+ if pil_image.mode != "RGB":
78
+ pil_image = pil_image.convert("RGB")
79
+ cv_image = np.array(pil_image)
80
+
81
+ # Calculate scale_factor: points_per_pixel
82
+ # For a Page, self.width/height are PDF points. pil_image.width/height are pixels.
83
+ # For a Region, self.width/height are PDF points of the region. pil_image.width/height are pixels of the cropped image.
84
+ # The scale factor should always relate the dimensions of the *processed image* to the *PDF dimensions* of that same area.
85
+
86
+ if page_obj and pil_image.width > 0 and pil_image.height > 0:
87
+ # If it's a region, its self.width/height are its dimensions in PDF points.
88
+ # pil_image.width/height are the pixel dimensions of the cropped image of that region.
89
+ # So, the scale factor remains consistent.
90
+ # We need to convert pixel distances on the image back to PDF point distances.
91
+ # If 100 PDF points span 200 pixels, then 1 pixel = 0.5 PDF points. scale_factor = points/pixels
92
+ # Example: Page width 500pt, image width 1000px. Scale = 500/1000 = 0.5 pt/px
93
+ # Region width 50pt, cropped image width 100px. Scale = 50/100 = 0.5 pt/px
94
+
95
+ # Use self.width/height for scale factor calculation because these correspond to the PDF dimensions of the area imaged.
96
+ # This ensures that if self is a Region, its specific dimensions are used for scaling its own cropped image.
97
+
98
+ # We need two scale factors if aspect ratio is not preserved by to_image,
99
+ # but to_image generally aims to preserve it when only resolution is changed.
100
+ # Assuming uniform scaling for now.
101
+ # A robust way: scale_x = self.width / pil_image.width; scale_y = self.height / pil_image.height
102
+ # For simplicity, let's assume uniform scaling or average it.
103
+ # Average scale factor:
104
+ scale_factor = ( (self.width / pil_image.width) + (self.height / pil_image.height) ) / 2.0
105
+ logger.debug(f"Calculated scale_factor: {scale_factor:.4f} (PDF dimensions: {self.width:.1f}x{self.height:.1f}, Image: {pil_image.width}x{pil_image.height})")
106
+
107
+ else:
108
+ logger.warning("Could not determine page object or image dimensions for scaling.")
109
+ scale_factor = 1.0 # Default to no scaling if info is missing
110
+
111
+ return cv_image, scale_factor, origin_offset_pdf, page_obj
112
+
113
+
114
+ def _process_image_for_lines(
115
+ self,
116
+ cv_image: np.ndarray,
117
+ off_angle: int,
118
+ min_line_length: int,
119
+ merge_angle_tolerance: int,
120
+ merge_distance_tolerance: int,
121
+ merge_endpoint_tolerance: int,
122
+ initial_min_line_length: int,
123
+ min_nfa_score_horizontal: float,
124
+ min_nfa_score_vertical: float,
125
+ ) -> List[Dict]:
126
+ """Processes an image to detect lines using OpenCV LSD and merging logic."""
127
+ if cv_image is None:
128
+ return []
129
+
130
+ gray_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2GRAY)
131
+ lsd = cv2.createLineSegmentDetector(cv2.LSD_REFINE_ADV)
132
+ coords_arr, widths_arr, precs_arr, nfa_scores_arr = lsd.detect(gray_image)
133
+
134
+ lines_raw = []
135
+ if coords_arr is not None: # nfa_scores_arr can be None if no lines are found
136
+ nfa_scores_list = nfa_scores_arr.flatten() if nfa_scores_arr is not None else [0.0] * len(coords_arr)
137
+ widths_list = widths_arr.flatten() if widths_arr is not None else [1.0] * len(coords_arr)
138
+ precs_list = precs_arr.flatten() if precs_arr is not None else [0.0] * len(coords_arr)
139
+
140
+ for i in range(len(coords_arr)):
141
+ lines_raw.append((
142
+ coords_arr[i][0],
143
+ widths_list[i] if i < len(widths_list) else 1.0,
144
+ precs_list[i] if i < len(precs_list) else 0.0,
145
+ nfa_scores_list[i] if i < len(nfa_scores_list) else 0.0
146
+ ))
147
+
148
+ def get_line_properties(line_data_item):
149
+ l_coords, l_width, l_prec, l_nfa_score = line_data_item
150
+ x1, y1, x2, y2 = l_coords
151
+ angle_rad = np.arctan2(y2 - y1, x2 - x1)
152
+ angle_deg = np.degrees(angle_rad)
153
+ normalized_angle_deg = angle_deg % 180
154
+ if normalized_angle_deg < 0:
155
+ normalized_angle_deg += 180
156
+
157
+ is_h = abs(normalized_angle_deg) <= off_angle or abs(normalized_angle_deg - 180) <= off_angle
158
+ is_v = abs(normalized_angle_deg - 90) <= off_angle
159
+
160
+ if is_h and x1 > x2: x1, x2, y1, y2 = x2, x1, y2, y1
161
+ elif is_v and y1 > y2: y1, y2, x1, x2 = y2, y1, x2, x1
162
+
163
+ length = np.sqrt((x2 - x1)**2 + (y2 - y1)**2)
164
+ return {'coords': (x1, y1, x2, y2), 'width': l_width, 'prec': l_prec,
165
+ 'angle_deg': normalized_angle_deg, 'is_horizontal': is_h, 'is_vertical': is_v,
166
+ 'length': length, 'nfa_score': l_nfa_score}
167
+
168
+ processed_lines = [get_line_properties(ld) for ld in lines_raw]
169
+
170
+ filtered_lines = []
171
+ for p in processed_lines:
172
+ if p['length'] <= initial_min_line_length: continue
173
+ if p['is_horizontal'] and p['nfa_score'] >= min_nfa_score_horizontal:
174
+ filtered_lines.append(p)
175
+ elif p['is_vertical'] and p['nfa_score'] >= min_nfa_score_vertical:
176
+ filtered_lines.append(p)
177
+
178
+ horizontal_lines = [p for p in filtered_lines if p['is_horizontal']]
179
+ vertical_lines = [p for p in filtered_lines if p['is_vertical']]
180
+
181
+ def merge_lines_list(lines_list, is_horizontal_merge):
182
+ if not lines_list: return []
183
+ key_sort = (lambda p: (p['coords'][1], p['coords'][0])) if is_horizontal_merge else (lambda p: (p['coords'][0], p['coords'][1]))
184
+ lines_list.sort(key=key_sort)
185
+
186
+ merged_results = []
187
+ merged_flags = [False] * len(lines_list)
188
+
189
+ for i, current_line_props in enumerate(lines_list):
190
+ if merged_flags[i]: continue
191
+ group = [current_line_props]; merged_flags[i] = True
192
+
193
+ # Keep trying to expand the group until no more lines can be added
194
+ # Use multiple passes to ensure transitive merging works properly
195
+ for merge_pass in range(10): # Up to 3 passes to catch complex merging scenarios
196
+ group_changed = False
197
+
198
+ # Calculate current group boundaries
199
+ group_x1, group_y1 = min(p['coords'][0] for p in group), min(p['coords'][1] for p in group)
200
+ group_x2, group_y2 = max(p['coords'][2] for p in group), max(p['coords'][3] for p in group)
201
+ total_len_in_group = sum(p['length'] for p in group)
202
+ if total_len_in_group == 0: continue # Should not happen
203
+
204
+ # Calculate weighted averages for the group
205
+ group_avg_angle = sum(p['angle_deg'] * p['length'] for p in group) / total_len_in_group
206
+
207
+ if is_horizontal_merge:
208
+ group_avg_perp_coord = sum(((p['coords'][1] + p['coords'][3]) / 2) * p['length'] for p in group) / total_len_in_group
209
+ else:
210
+ group_avg_perp_coord = sum(((p['coords'][0] + p['coords'][2]) / 2) * p['length'] for p in group) / total_len_in_group
211
+
212
+ # Check all unmerged lines for potential merging
213
+ for j, candidate_props in enumerate(lines_list):
214
+ if merged_flags[j]: continue
215
+
216
+ # 1. Check for parallelism (angle similarity)
217
+ angle_diff = abs(group_avg_angle - candidate_props['angle_deg'])
218
+ # Handle wraparound for angles near 0/180
219
+ if angle_diff > 90:
220
+ angle_diff = 180 - angle_diff
221
+ if angle_diff > merge_angle_tolerance: continue
222
+
223
+ # 2. Check for closeness (perpendicular distance)
224
+ if is_horizontal_merge:
225
+ cand_perp_coord = (candidate_props['coords'][1] + candidate_props['coords'][3]) / 2
226
+ else:
227
+ cand_perp_coord = (candidate_props['coords'][0] + candidate_props['coords'][2]) / 2
228
+
229
+ perp_distance = abs(group_avg_perp_coord - cand_perp_coord)
230
+ if perp_distance > merge_distance_tolerance: continue
231
+
232
+ # 3. Check for reasonable proximity along the primary axis
233
+ if is_horizontal_merge:
234
+ # For horizontal lines, check x-axis relationship
235
+ cand_x1, cand_x2 = candidate_props['coords'][0], candidate_props['coords'][2]
236
+ # Check if there's overlap OR if the gap is reasonable
237
+ overlap = max(0, min(group_x2, cand_x2) - max(group_x1, cand_x1))
238
+ gap_to_group = min(abs(group_x1 - cand_x2), abs(group_x2 - cand_x1))
239
+
240
+ # Accept if there's overlap OR the gap is reasonable OR the candidate is contained within group span
241
+ if not (overlap > 0 or gap_to_group <= merge_endpoint_tolerance or (cand_x1 >= group_x1 and cand_x2 <= group_x2)):
242
+ continue
243
+ else:
244
+ # For vertical lines, check y-axis relationship
245
+ cand_y1, cand_y2 = candidate_props['coords'][1], candidate_props['coords'][3]
246
+ overlap = max(0, min(group_y2, cand_y2) - max(group_y1, cand_y1))
247
+ gap_to_group = min(abs(group_y1 - cand_y2), abs(group_y2 - cand_y1))
248
+
249
+ if not (overlap > 0 or gap_to_group <= merge_endpoint_tolerance or (cand_y1 >= group_y1 and cand_y2 <= group_y2)):
250
+ continue
251
+
252
+ # If we reach here, lines should be merged
253
+ group.append(candidate_props)
254
+ merged_flags[j] = True
255
+ group_changed = True
256
+
257
+ if not group_changed:
258
+ break # No more lines added in this pass, stop trying
259
+
260
+ # Create final merged line from the group
261
+ final_x1, final_y1 = min(p['coords'][0] for p in group), min(p['coords'][1] for p in group)
262
+ final_x2, final_y2 = max(p['coords'][2] for p in group), max(p['coords'][3] for p in group)
263
+ final_total_len = sum(p['length'] for p in group)
264
+ if final_total_len == 0: continue
265
+
266
+ final_width = sum(p['width'] * p['length'] for p in group) / final_total_len
267
+ final_nfa = sum(p['nfa_score'] * p['length'] for p in group) / final_total_len
268
+
269
+ if is_horizontal_merge:
270
+ final_y = sum(((p['coords'][1] + p['coords'][3]) / 2) * p['length'] for p in group) / final_total_len
271
+ merged_line_data = (final_x1, final_y, final_x2, final_y, final_width, final_nfa)
272
+ else:
273
+ final_x = sum(((p['coords'][0] + p['coords'][2]) / 2) * p['length'] for p in group) / final_total_len
274
+ merged_line_data = (final_x, final_y1, final_x, final_y2, final_width, final_nfa)
275
+ merged_results.append(merged_line_data)
276
+ return merged_results
277
+
278
+ merged_h_lines = merge_lines_list(horizontal_lines, True)
279
+ merged_v_lines = merge_lines_list(vertical_lines, False)
280
+ all_merged = merged_h_lines + merged_v_lines
281
+
282
+ final_lines_data = []
283
+ for line_data_item in all_merged:
284
+ x1, y1, x2, y2, width, nfa = line_data_item
285
+ length = np.sqrt((x2 - x1)**2 + (y2 - y1)**2)
286
+ if length > min_line_length:
287
+ # Ensure x1 <= x2 for horizontal, y1 <= y2 for vertical
288
+ if abs(y2 - y1) < abs(x2-x1): # Horizontal-ish
289
+ if x1 > x2: x1_out, y1_out, x2_out, y2_out = x2, y2, x1, y1
290
+ else: x1_out, y1_out, x2_out, y2_out = x1, y1, x2, y2
291
+ else: # Vertical-ish
292
+ if y1 > y2: x1_out, y1_out, x2_out, y2_out = x2, y2, x1, y1
293
+ else: x1_out, y1_out, x2_out, y2_out = x1, y1, x2, y2
294
+
295
+ final_lines_data.append({
296
+ 'x1': x1_out, 'y1': y1_out, 'x2': x2_out, 'y2': y2_out,
297
+ 'width': width, 'nfa_score': nfa, 'length': length
298
+ })
299
+ return final_lines_data
300
+
301
+ def _convert_line_to_element_data(
302
+ self, line_data_img: Dict, scale_factor: float, origin_offset_pdf: Tuple[float, float], page_obj: 'Page', source_label: str
303
+ ) -> Dict:
304
+ """Converts line data from image coordinates to PDF element data."""
305
+ # Ensure scale_factor is not zero to prevent division by zero or incorrect scaling
306
+ if scale_factor == 0:
307
+ logger.warning("Scale factor is zero, cannot convert line coordinates correctly.")
308
+ # Return something or raise error, for now, try to proceed with unscaled if possible (won't be right)
309
+ # This situation ideally shouldn't happen if _get_image_for_detection is robust.
310
+ effective_scale = 1.0
311
+ else:
312
+ effective_scale = scale_factor
313
+
314
+ x0 = origin_offset_pdf[0] + line_data_img['x1'] * effective_scale
315
+ top = origin_offset_pdf[1] + line_data_img['y1'] * effective_scale
316
+ x1 = origin_offset_pdf[0] + line_data_img['x2'] * effective_scale
317
+ bottom = origin_offset_pdf[1] + line_data_img['y2'] * effective_scale # y2 is the second y-coord
318
+
319
+ # For lines, width attribute in PDF points
320
+ line_width_pdf = line_data_img['width'] * effective_scale
321
+
322
+ # initial_doctop might not be loaded if page object is minimal
323
+ initial_doctop = getattr(page_obj._page, 'initial_doctop', 0) if hasattr(page_obj, '_page') else 0
324
+
325
+ return {
326
+ "x0": x0, "top": top, "x1": x1, "bottom": bottom, # bottom here is y2_pdf
327
+ "width": abs(x1 - x0), # This is bounding box width
328
+ "height": abs(bottom - top), # This is bounding box height
329
+ "linewidth": line_width_pdf, # Actual stroke width of the line
330
+ "object_type": "line",
331
+ "page_number": page_obj.page_number,
332
+ "doctop": top + initial_doctop,
333
+ "source": source_label,
334
+ "stroking_color": (0,0,0), # Default, can be enhanced
335
+ "non_stroking_color": (0,0,0), # Default
336
+ # Add other raw data if useful
337
+ "raw_line_thickness_px": line_data_img.get('line_thickness_px'), # Renamed from raw_nfa_score
338
+ "raw_line_position_px": line_data_img.get('line_position_px'), # Added for clarity
339
+ }
340
+
341
def _find_lines_on_image_data(
    self,
    cv_image: np.ndarray,
    pil_image_rgb: Image.Image,  # For original dimensions
    horizontal: bool = True,
    vertical: bool = True,
    peak_threshold_h: float = 0.5,
    min_gap_h: int = 5,
    peak_threshold_v: float = 0.5,
    min_gap_v: int = 5,
    max_lines_h: Optional[int] = None,
    max_lines_v: Optional[int] = None,
    binarization_method: str = LINE_DETECTION_PARAM_DEFAULTS["binarization_method"],
    adaptive_thresh_block_size: int = LINE_DETECTION_PARAM_DEFAULTS["adaptive_thresh_block_size"],
    adaptive_thresh_C_val: int = LINE_DETECTION_PARAM_DEFAULTS["adaptive_thresh_C_val"],
    morph_op_h: str = LINE_DETECTION_PARAM_DEFAULTS["morph_op_h"],
    morph_kernel_h: Tuple[int, int] = LINE_DETECTION_PARAM_DEFAULTS["morph_kernel_h"],
    morph_op_v: str = LINE_DETECTION_PARAM_DEFAULTS["morph_op_v"],
    morph_kernel_v: Tuple[int, int] = LINE_DETECTION_PARAM_DEFAULTS["morph_kernel_v"],
    smoothing_sigma_h: float = LINE_DETECTION_PARAM_DEFAULTS["smoothing_sigma_h"],
    smoothing_sigma_v: float = LINE_DETECTION_PARAM_DEFAULTS["smoothing_sigma_v"],
    peak_width_rel_height: float = LINE_DETECTION_PARAM_DEFAULTS["peak_width_rel_height"],
) -> Tuple[List[Dict], Optional[np.ndarray], Optional[np.ndarray]]:
    """
    Detect full-span lines in an image by projection profiling.

    The image is binarized (ink becomes 1.0), summed along rows and/or
    columns, smoothed, and the peaks of each profile become lines that span
    the whole image in that direction.

    Returns:
        ``(lines, profile_h, profile_v)`` where ``lines`` holds raw line dicts
        in image coordinates and the profiles are the smoothed row/column
        profiles (``None`` for a direction that was not requested). A ``None``
        image yields ``([], None, None)``.
    """
    if cv_image is None:
        return [], None, None

    gray = cv2.cvtColor(cv_image, cv2.COLOR_RGB2GRAY)
    gray_rows, gray_cols = gray.shape
    logger.debug(f"Line detection - Image dimensions: {gray_cols}x{gray_rows}")

    if binarization_method == "adaptive":
        binary = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV,
            adaptive_thresh_block_size, adaptive_thresh_C_val,
        )
    else:
        # "otsu", or anything unrecognised (reported, then treated as otsu).
        recognised = binarization_method == "otsu"
        if not recognised:
            logger.error(f"Invalid binarization_method: {binarization_method}. Supported: 'otsu', 'adaptive'. Defaulting to 'otsu'.")
        otsu_level, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        if recognised:
            logger.debug(f"Otsu's threshold applied. Value: {otsu_level}")

    ink = binary / 255.0

    def prepared_plane(op_name, kernel_size):
        # Fresh copy of the binarized image, optionally opened/closed.
        plane = ink.copy()
        if op_name != "none":
            structuring = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
            mode = cv2.MORPH_OPEN if op_name == "open" else cv2.MORPH_CLOSE
            plane = cv2.morphologyEx(plane, mode, structuring)
        return plane

    def lines_from_profile(profile, ratio_base, tag, along_rows):
        # Turn one projection profile into line dicts; always returns the
        # smoothed profile alongside them.
        if along_rows:
            sigma, rel_threshold, gap, cap = smoothing_sigma_h, peak_threshold_h, min_gap_h, max_lines_h
        else:
            sigma, rel_threshold, gap, cap = smoothing_sigma_v, peak_threshold_v, min_gap_v, max_lines_v

        smoothed = gaussian_filter1d(profile.astype(float), sigma=sigma)

        height_floor = rel_threshold * ratio_base
        spacing = gap
        if cap is not None:
            # With a cap, collect every candidate and rank by prominence below.
            height_floor, spacing = 1.0, 1

        peaks, props = find_peaks(
            smoothed, height=height_floor, distance=spacing,
            width=1, prominence=1, rel_height=peak_width_rel_height,
        )

        if cap is not None:
            if len(peaks) > 0 and 'prominences' in props:
                by_prominence = np.argsort(props["prominences"])[::-1]
                blocked = np.zeros(len(smoothed), dtype=bool)
                kept = []
                for cand in by_prominence:
                    position = peaks[cand]
                    if blocked[position]:
                        continue
                    kept.append(cand)
                    # Suppress everything within the gap of an accepted peak.
                    start = max(0, position - gap)
                    stop = min(len(smoothed), position + gap + 1)
                    blocked[start:stop] = True
                    if len(kept) >= cap:
                        break
                peaks = peaks[kept]
                props = {name: values[kept] for name, values in props.items()}
                logger.debug(f"Selected {len(peaks)} {tag.upper()}-lines for max_lines={cap}.")
            else:
                peaks = np.array([])
                props = {}
                logger.debug(f"No {tag.upper()}-peaks for max_lines selection.")
        elif not peaks.size:
            props = {}
            logger.debug(f"No {tag.upper()}-lines found using threshold.")
        else:
            logger.debug(f"Found {len(peaks)} {tag.upper()}-lines using threshold.")

        if peaks.size > 0:
            # Report lines in positional order regardless of how they were picked.
            positional = np.argsort(peaks)
            peaks = peaks[positional]
            props = {name: values[positional] for name, values in props.items()}

        span_w = pil_image_rgb.width  # Use actual passed image dimensions
        span_h = pil_image_rgb.height
        peak_widths = props.get("widths")

        found = []
        for idx, peak in enumerate(peaks):
            center = int(peak)
            if peak_widths is not None and idx < len(peak_widths):
                thickness = peak_widths[idx]
            else:
                thickness = 1.0
            thickness = max(1, int(round(thickness)))

            if along_rows:
                found.append({
                    'x1': 0, 'y1': center,
                    'x2': span_w - 1, 'y2': center,
                    'width': thickness,
                    'length': span_w,
                    'line_thickness_px': thickness,
                    'line_position_px': center,
                })
            else:
                found.append({
                    'x1': center, 'y1': 0,
                    'x2': center, 'y2': span_h - 1,
                    'width': thickness,
                    'length': span_h,
                    'line_thickness_px': thickness,
                    'line_position_px': center,
                })
        return found, smoothed

    all_lines = []
    smoothed_rows: Optional[np.ndarray] = None
    smoothed_cols: Optional[np.ndarray] = None

    if horizontal:
        row_profile = np.sum(prepared_plane(morph_op_h, morph_kernel_h), axis=1)
        horizontal_lines, smoothed_rows = lines_from_profile(row_profile, pil_image_rgb.width, 'h', True)
        all_lines.extend(horizontal_lines)
        logger.info(f"Detected {len(horizontal_lines)} horizontal lines.")

    if vertical:
        col_profile = np.sum(prepared_plane(morph_op_v, morph_kernel_v), axis=0)
        vertical_lines, smoothed_cols = lines_from_profile(col_profile, pil_image_rgb.height, 'v', False)
        all_lines.extend(vertical_lines)
        logger.info(f"Detected {len(vertical_lines)} vertical lines.")

    return all_lines, smoothed_rows, smoothed_cols
507
+
508
def detect_lines(
    self,
    resolution: int = 192,
    source_label: str = "detected",
    horizontal: bool = True,
    vertical: bool = True,
    peak_threshold_h: float = 0.5,
    min_gap_h: int = 5,
    peak_threshold_v: float = 0.5,
    min_gap_v: int = 5,
    max_lines_h: Optional[int] = None,
    max_lines_v: Optional[int] = None,
    replace: bool = True,
    binarization_method: str = LINE_DETECTION_PARAM_DEFAULTS["binarization_method"],
    adaptive_thresh_block_size: int = LINE_DETECTION_PARAM_DEFAULTS["adaptive_thresh_block_size"],
    adaptive_thresh_C_val: int = LINE_DETECTION_PARAM_DEFAULTS["adaptive_thresh_C_val"],
    morph_op_h: str = LINE_DETECTION_PARAM_DEFAULTS["morph_op_h"],
    morph_kernel_h: Tuple[int, int] = LINE_DETECTION_PARAM_DEFAULTS["morph_kernel_h"],
    morph_op_v: str = LINE_DETECTION_PARAM_DEFAULTS["morph_op_v"],
    morph_kernel_v: Tuple[int, int] = LINE_DETECTION_PARAM_DEFAULTS["morph_kernel_v"],
    smoothing_sigma_h: float = LINE_DETECTION_PARAM_DEFAULTS["smoothing_sigma_h"],
    smoothing_sigma_v: float = LINE_DETECTION_PARAM_DEFAULTS["smoothing_sigma_v"],
    peak_width_rel_height: float = LINE_DETECTION_PARAM_DEFAULTS["peak_width_rel_height"],
) -> "ShapeDetectionMixin":
    """
    Detects lines on the Page or Region, or on all pages within a Collection
    using projection profiling and peak detection.
    Adds detected lines as LineElement objects to the ElementManager.

    Args:
        resolution: DPI for image rendering before detection.
        source_label: Label assigned to the 'source' attribute of created LineElements.
        horizontal: If True, detect horizontal lines.
        vertical: If True, detect vertical lines.
        peak_threshold_h: Threshold for peak detection in horizontal profile (ratio of image width).
        min_gap_h: Minimum gap between horizontal lines (pixels).
        peak_threshold_v: Threshold for peak detection in vertical profile (ratio of image height).
        min_gap_v: Minimum gap between vertical lines (pixels).
        max_lines_h: If set, limits the number of horizontal lines to the top N by prominence.
        max_lines_v: If set, limits the number of vertical lines to the top N by prominence.
        replace: If True, remove existing detected lines with the same source_label.
        binarization_method: "adaptive" or "otsu".
        adaptive_thresh_block_size: Block size for adaptive thresholding (if method is "adaptive").
        adaptive_thresh_C_val: Constant subtracted from the mean for adaptive thresholding (if method is "adaptive").
        morph_op_h: Morphological operation for horizontal lines ("open", "close", "none").
        morph_kernel_h: Kernel tuple (cols, rows) for horizontal morphology. Example: (1, 2).
        morph_op_v: Morphological operation for vertical lines ("open", "close", "none").
        morph_kernel_v: Kernel tuple (cols, rows) for vertical morphology. Example: (2, 1).
        smoothing_sigma_h: Gaussian smoothing sigma for horizontal profile.
        smoothing_sigma_v: Gaussian smoothing sigma for vertical profile.
        peak_width_rel_height: ``rel_height`` passed to ``scipy.signal.find_peaks``
            (relative height at which each peak's width is measured).

    Returns:
        Self for method chaining.
    """
    if not horizontal and not vertical:
        logger.info("Line detection skipped as both horizontal and vertical are False.")
        return self

    # Parameters understood by the image-level detector.
    detection_params = {
        "horizontal": horizontal, "vertical": vertical,
        "peak_threshold_h": peak_threshold_h, "min_gap_h": min_gap_h,
        "peak_threshold_v": peak_threshold_v, "min_gap_v": min_gap_v,
        "max_lines_h": max_lines_h, "max_lines_v": max_lines_v,
        "binarization_method": binarization_method,
        "adaptive_thresh_block_size": adaptive_thresh_block_size,
        "adaptive_thresh_C_val": adaptive_thresh_C_val,
        "morph_op_h": morph_op_h, "morph_kernel_h": morph_kernel_h,
        "morph_op_v": morph_op_v, "morph_kernel_v": morph_kernel_v,
        "smoothing_sigma_h": smoothing_sigma_h, "smoothing_sigma_v": smoothing_sigma_v,
        "peak_width_rel_height": peak_width_rel_height,
    }
    # Full argument set forwarded when delegating to each page of a collection.
    collection_params = {
        "resolution": resolution, "source_label": source_label,
        "replace": replace, **detection_params,
    }

    if hasattr(self, 'pdfs'):
        for pdf_doc in self.pdfs:
            for page_obj in pdf_doc.pages:
                page_obj.detect_lines(**collection_params)
        return self
    if hasattr(self, 'pages') and not hasattr(self, '_page'):
        for page_obj in self.pages:
            page_obj.detect_lines(**collection_params)
        return self

    cv_image, scale_factor, origin_offset_pdf, page_object_ctx = self._get_image_for_detection(resolution)
    if cv_image is None or page_object_ctx is None:
        logger.warning(f"Skipping line detection for {self} due to image error.")
        return self

    # The detector only needs the image dimensions from the PIL object, and
    # cv_image already is the RGB pixel array of the rendered image, so wrap
    # it instead of rendering the page/region a second time.
    pil_image_for_dims = Image.fromarray(cv_image)

    from natural_pdf.elements.line import LineElement
    element_manager = page_object_ctx._element_mgr

    if replace and hasattr(element_manager, '_elements') and 'lines' in element_manager._elements:
        existing_lines = element_manager._elements['lines']
        kept_lines = [
            line for line in existing_lines
            if getattr(line, 'source', None) != source_label
        ]
        removed_count = len(existing_lines) - len(kept_lines)
        element_manager._elements['lines'] = kept_lines
        if removed_count > 0:
            logger.info(f"Removed {removed_count} existing lines with source '{source_label}' from {page_object_ctx}")

    lines_data_img, _profile_h_smoothed, _profile_v_smoothed = self._find_lines_on_image_data(
        cv_image=cv_image,
        pil_image_rgb=pil_image_for_dims,
        **detection_params,
    )

    added_count = 0
    for line_data_item_img in lines_data_img:
        element_constructor_data = self._convert_line_to_element_data(
            line_data_item_img, scale_factor, origin_offset_pdf, page_object_ctx, source_label
        )
        # Best effort: one bad line is logged and skipped, the rest are kept.
        try:
            line_element = LineElement(element_constructor_data, page_object_ctx)
            element_manager.add_element(line_element, element_type="lines")
        except Exception as e:
            logger.error(f"Failed to create or add LineElement: {e}. Data: {element_constructor_data}", exc_info=True)
        else:
            added_count += 1

    # Report only the lines that actually made it into the element manager.
    logger.info(f"Detected and added {added_count} lines to {page_object_ctx} with source '{source_label}'.")
    return self
659
+
660
def detect_lines_preview(
    self,
    resolution: int = 72,  # Preview typically uses lower resolution
    horizontal: bool = True,
    vertical: bool = True,
    peak_threshold_h: float = 0.5,
    min_gap_h: int = 5,
    peak_threshold_v: float = 0.5,
    min_gap_v: int = 5,
    max_lines_h: Optional[int] = None,
    max_lines_v: Optional[int] = None,
    binarization_method: str = LINE_DETECTION_PARAM_DEFAULTS["binarization_method"],
    adaptive_thresh_block_size: int = LINE_DETECTION_PARAM_DEFAULTS["adaptive_thresh_block_size"],
    adaptive_thresh_C_val: int = LINE_DETECTION_PARAM_DEFAULTS["adaptive_thresh_C_val"],
    morph_op_h: str = LINE_DETECTION_PARAM_DEFAULTS["morph_op_h"],
    morph_kernel_h: Tuple[int, int] = LINE_DETECTION_PARAM_DEFAULTS["morph_kernel_h"],
    morph_op_v: str = LINE_DETECTION_PARAM_DEFAULTS["morph_op_v"],
    morph_kernel_v: Tuple[int, int] = LINE_DETECTION_PARAM_DEFAULTS["morph_kernel_v"],
    smoothing_sigma_h: float = LINE_DETECTION_PARAM_DEFAULTS["smoothing_sigma_h"],
    smoothing_sigma_v: float = LINE_DETECTION_PARAM_DEFAULTS["smoothing_sigma_v"],
    peak_width_rel_height: float = LINE_DETECTION_PARAM_DEFAULTS["peak_width_rel_height"],
) -> Optional[Image.Image]:
    """
    Preview detected lines on a Page or Region without adding them to the PDF elements.

    Runs the same detection as `detect_lines` (see that method for parameter
    descriptions; the main difference is a lower default `resolution`) and
    renders a debug visualization instead of creating line elements.

    Returns:
        - None when called on a collection, when both `horizontal` and
          `vertical` are False, or when no image could be obtained.
        - The plain rendered image (RGBA) when no lines were detected.
        - Otherwise a composite RGBA image: the rendered image with detected
          lines drawn on it (top-left), a histogram of the horizontal profile
          to its right, and a histogram of the vertical profile below it.
          A green threshold marker is drawn on a histogram only when the
          corresponding `max_lines_*` argument is None.
    """
    if hasattr(self, "pdfs") or (hasattr(self, "pages") and not hasattr(self, "_page")):
        logger.warning("detect_lines_preview is intended for single Page/Region objects. For collections, process pages individually.")
        return None

    if not horizontal and not vertical:  # Nothing to detect, bail out before rendering
        logger.info("Line preview skipped as both horizontal and vertical are False.")
        return None

    # scale_factor and origin_offset are not needed for a preview
    cv_image, _, _, page_object_ctx = self._get_image_for_detection(resolution)
    if cv_image is None or page_object_ctx is None:
        logger.warning(f"Skipping line preview for {self} due to image error.")
        return None

    # Render a PIL image to draw on; objects exposing x0/top/_page are rendered cropped.
    pil_image_for_dims = None
    if hasattr(self, "to_image") and hasattr(self, "width") and hasattr(self, "height"):
        if hasattr(self, "x0") and hasattr(self, "top") and hasattr(self, "_page"):
            pil_image_for_dims = self.to_image(resolution=resolution, crop_only=True, include_highlights=False)
        else:
            pil_image_for_dims = self.to_image(resolution=resolution, include_highlights=False)

    if pil_image_for_dims is None:
        logger.warning(f"Could not render PIL image for preview for {self}. Using cv_image to create one.")
        pil_image_for_dims = Image.fromarray(cv_image)

    if pil_image_for_dims.mode != "RGB":
        pil_image_for_dims = pil_image_for_dims.convert("RGB")

    lines_data_img, profile_h_smoothed, profile_v_smoothed = self._find_lines_on_image_data(
        cv_image=cv_image,
        pil_image_rgb=pil_image_for_dims,
        horizontal=horizontal,
        vertical=vertical,
        peak_threshold_h=peak_threshold_h,
        min_gap_h=min_gap_h,
        peak_threshold_v=peak_threshold_v,
        min_gap_v=min_gap_v,
        max_lines_h=max_lines_h,
        max_lines_v=max_lines_v,
        binarization_method=binarization_method,
        adaptive_thresh_block_size=adaptive_thresh_block_size,
        adaptive_thresh_C_val=adaptive_thresh_C_val,
        morph_op_h=morph_op_h,
        morph_kernel_h=morph_kernel_h,
        morph_op_v=morph_op_v,
        morph_kernel_v=morph_kernel_v,
        smoothing_sigma_h=smoothing_sigma_h,
        smoothing_sigma_v=smoothing_sigma_v,
        peak_width_rel_height=peak_width_rel_height,
    )

    if not lines_data_img:
        logger.info(f"No lines detected for preview on {page_object_ctx or self}")
        # Return the base image so that something is shown
        return pil_image_for_dims.convert("RGBA")

    # --- Visualization settings ---
    line_thickness = 1
    hist_size = 100  # Length (px) of the longest histogram bar
    line_color_h = (255, 0, 0, 200)
    line_color_v = (0, 0, 255, 200)
    hist_bar_color_h = (200, 0, 0, 200)
    hist_bar_color_v = (0, 0, 200, 200)
    hist_bg_color = (240, 240, 240, 255)
    threshold_marker_color = (0, 255, 0, 100)
    padding = 10  # Gap between the page image and each histogram

    viz_image_base = pil_image_for_dims.convert("RGBA")
    draw = ImageDraw.Draw(viz_image_base)
    img_width, img_height = viz_image_base.size

    # Draw detected lines; a line is treated as horizontal when its x-extent exceeds its y-extent.
    for line_info in lines_data_img:
        is_h_line = abs(line_info["y1"] - line_info["y2"]) < abs(line_info["x1"] - line_info["x2"])
        draw.line(
            [(line_info["x1"], line_info["y1"]), (line_info["x2"], line_info["y2"])],
            fill=line_color_h if is_h_line else line_color_v,
            width=line_thickness,
        )

    # --- Horizontal profile histogram: one bar per image row, growing to the right ---
    hist_h_img = Image.new("RGBA", (hist_size, img_height), hist_bg_color)
    hist_h_draw = ImageDraw.Draw(hist_h_img)
    if profile_h_smoothed is not None and profile_h_smoothed.size > 0:
        peak_h = profile_h_smoothed.max()
        scale_max_h = peak_h if peak_h > 0 else img_width
        # NOTE(review): threshold is displayed as a fraction of the image width;
        # presumably this mirrors how _find_lines_on_image_data applies it - confirm.
        threshold_px_h = peak_threshold_h * img_width
        # Marker position and visibility do not depend on the row, so compute them once.
        marker_x = 0
        if scale_max_h > 0 and threshold_px_h >= 0:
            marker_x = int((threshold_px_h / scale_max_h) * hist_size)
        show_marker_h = max_lines_h is None and threshold_px_h >= 0 and 0 < marker_x < hist_size

        for y_coord, val in enumerate(profile_h_smoothed):
            bar_len = int((val / scale_max_h) * hist_size) if scale_max_h > 0 else 0
            bar_len = min(max(0, bar_len), hist_size)
            if bar_len > 0:
                hist_h_draw.line([(0, y_coord), (bar_len - 1, y_coord)], fill=hist_bar_color_h, width=1)
            if show_marker_h:
                # Drawn after the bar so the marker stays visible on top of it
                marker_end_y = y_coord + 1 if y_coord + 1 < img_height else y_coord
                hist_h_draw.line([(marker_x, y_coord), (marker_x, marker_end_y)], fill=threshold_marker_color, width=1)

    # --- Vertical profile histogram: one bar per image column, growing upwards ---
    hist_v_img = Image.new("RGBA", (img_width, hist_size), hist_bg_color)
    hist_v_draw = ImageDraw.Draw(hist_v_img)
    if profile_v_smoothed is not None and profile_v_smoothed.size > 0:
        peak_v = profile_v_smoothed.max()
        scale_max_v = peak_v if peak_v > 0 else img_height
        # NOTE(review): same assumption as above, relative to the image height.
        threshold_px_v = peak_threshold_v * img_height
        marker_h = 0
        if scale_max_v > 0 and threshold_px_v >= 0:
            marker_h = int((threshold_px_v / scale_max_v) * hist_size)
        show_marker_v = max_lines_v is None and threshold_px_v >= 0 and 0 < marker_h < hist_size

        for x_coord, val in enumerate(profile_v_smoothed):
            bar_height = int((val / scale_max_v) * hist_size) if scale_max_v > 0 else 0
            bar_height = min(max(0, bar_height), hist_size)
            if bar_height > 0:
                hist_v_draw.line([(x_coord, hist_size - 1), (x_coord, hist_size - bar_height)], fill=hist_bar_color_v, width=1)
            if show_marker_v:
                marker_end_x = x_coord + 1 if x_coord + 1 < img_width else x_coord
                hist_v_draw.line([(x_coord, hist_size - marker_h), (marker_end_x, hist_size - marker_h)], fill=threshold_marker_color, width=1)

    # --- Compose: page image top-left, horizontal histogram on the right, vertical one below ---
    total_width = img_width + padding + hist_size
    total_height = img_height + padding + hist_size
    final_viz_image = Image.new("RGBA", (total_width, total_height), (255, 255, 255, 255))
    final_viz_image.paste(viz_image_base, (0, 0))
    final_viz_image.paste(hist_h_img, (img_width + padding, 0))
    final_viz_image.paste(hist_v_img, (0, img_height + padding))
    logger.info(f"Generated line preview visualization for {page_object_ctx or self}")
    return final_viz_image
814
+
815
def detect_table_structure_from_lines(
    self,
    source_label: str = "detected",
    ignore_outer_regions: bool = True,
    cell_padding: float = 0.5,  # Small padding inside cells, default to 0.5px
    replace: bool = False,
) -> "ShapeDetectionMixin":
    """
    Create table structure (rows, columns, cells) from previously detected lines.

    This method analyzes horizontal and vertical lines to create a grid structure,
    then generates Region objects for:
    - An overall table region that encompasses the entire table structure
    - Individual row regions spanning the width of the table
    - Individual column regions spanning the height of the table
    - Individual cell regions at each row/column intersection

    Every created region gets `source = source_label`, a `region_type` /
    `normalized_type` of "table", "table_row", "table_column" or "table_cell",
    and is registered with the page's element manager under "regions".
    Failures while creating a single region are logged and do not abort the run.

    Args:
        source_label: Filter lines by this source label (from detect_lines)
        ignore_outer_regions: If True, don't create regions outside the grid defined by the lines.
            If False, include regions from page/object edges to the first/last lines.
        cell_padding: Internal padding for cell regions
        replace: If True, first remove table/row/column/cell regions with the same
            `source_label` already registered on the page, so repeated calls do not
            accumulate duplicates. Defaults to False (regions are only added).

    Returns:
        Self for method chaining
    """
    # Collections: delegate to every page, nothing to do on the collection itself.
    if hasattr(self, "pdfs"):
        for pdf_doc in self.pdfs:
            for page_obj in pdf_doc.pages:
                page_obj.detect_table_structure_from_lines(
                    source_label=source_label,
                    ignore_outer_regions=ignore_outer_regions,
                    cell_padding=cell_padding,
                    replace=replace,
                )
        return self
    if hasattr(self, "pages") and not hasattr(self, "_page"):  # PageCollection
        for page_obj in self.pages:
            page_obj.detect_table_structure_from_lines(
                source_label=source_label,
                ignore_outer_regions=ignore_outer_regions,
                cell_padding=cell_padding,
                replace=replace,
            )
        return self

    # Determine context (Page or Region) for coordinates and element management
    origin_x, origin_y = 0.0, 0.0
    if hasattr(self, "_element_mgr") and hasattr(self, "width") and hasattr(self, "height"):  # Likely a Page
        page_object_for_elements = self
        context_width = self.width
        context_height = self.height
        logger.debug(f"Operating on Page context: {self}")
    elif hasattr(self, "_page") and hasattr(self, "x0") and hasattr(self, "width"):  # Likely a Region
        page_object_for_elements = self._page
        origin_x = self.x0
        origin_y = self.top
        context_width = self.width  # Region's own width/height for its boundary calculations
        context_height = self.height
        logger.debug(f"Operating on Region context: {self}, origin: ({origin_x}, {origin_y})")
    else:
        logger.warning(f"Could not determine valid page/region context for {self}. Aborting table structure detection.")
        return self

    element_manager = page_object_for_elements._element_mgr

    if replace and hasattr(element_manager, "_elements") and "regions" in element_manager._elements:
        # Same purge strategy detect_lines uses for lines, restricted to the region
        # types this method creates. The purge is page-wide, also for a Region context.
        table_region_types = {"table", "table_row", "table_column", "table_cell"}
        existing_regions = element_manager._elements["regions"]
        kept_regions = [
            region
            for region in existing_regions
            if not (
                getattr(region, "source", None) == source_label
                and getattr(region, "region_type", None) in table_region_types
            )
        ]
        removed_count = len(existing_regions) - len(kept_regions)
        element_manager._elements["regions"] = kept_regions
        if removed_count > 0:
            logger.info(f"Removed {removed_count} existing table regions with source '{source_label}' from {page_object_for_elements}")

    # Get lines with the specified source.
    # NOTE(review): lines come from the whole page's element manager and are filtered
    # by source label only; for a Region context, lines outside the region are not excluded.
    filtered_lines = [line for line in element_manager.lines if getattr(line, "source", None) == source_label]

    if not filtered_lines:
        logger.info(f"No lines found with source '{source_label}' for table structure detection on {self}.")
        return self

    # Line coordinates are absolute to the page, also when operating on a Region.
    horizontal_lines = [line for line in filtered_lines if line.is_horizontal]
    vertical_lines = [line for line in filtered_lines if line.is_vertical]

    logger.info(f"Found {len(horizontal_lines)} horizontal and {len(vertical_lines)} vertical lines for {self} with source '{source_label}'.")

    def _grid_boundaries(line_positions, start, extent):
        """Return sorted unique boundaries: line positions plus, optionally, the context edges."""
        if ignore_outer_regions:
            return sorted(set(line_positions))
        return sorted({start, *line_positions, start + extent})

    def _add_region(bbox, region_type, metadata, error_kind, coords_label):
        """Create, tag and register one region; log and return False on failure."""
        left, top, right, bottom = bbox
        try:
            region = page_object_for_elements.create_region(left, top, right, bottom)
            region.source = source_label
            region.region_type = region_type
            region.normalized_type = region_type  # normalized_type for selector compatibility
            region.metadata.update(metadata)
            element_manager.add_element(region, element_type="regions")
        except Exception as e:
            # Best effort: one bad region must not abort the remaining ones.
            logger.error(f"Failed to create or add {error_kind} Region: {e}. {coords_label} abs coords: L{left} T{top} R{right} B{bottom}", exc_info=True)
            return False
        return True

    # Boundaries use each line's mid-point (its y for horizontal, x for vertical lines),
    # in page coordinates.
    row_boundaries = _grid_boundaries(
        [(line.top + line.bottom) / 2 for line in horizontal_lines], origin_y, context_height
    )
    col_boundaries = _grid_boundaries(
        [(line.x0 + line.x1) / 2 for line in vertical_lines], origin_x, context_width
    )

    logger.debug(f"Row boundaries for {self}: {row_boundaries}")
    logger.debug(f"Col boundaries for {self}: {col_boundaries}")

    has_rows = len(row_boundaries) >= 2
    has_cols = len(col_boundaries) >= 2

    tables_created = 0
    rows_created = 0
    cols_created = 0
    cells_created = 0

    # Overall table region wrapping the entire structure
    if has_rows and has_cols:
        table_left, table_right = col_boundaries[0], col_boundaries[-1]
        table_top, table_bottom = row_boundaries[0], row_boundaries[-1]
        if table_right > table_left and table_bottom > table_top:
            table_metadata = {
                "source_lines_label": source_label,
                "num_rows": len(row_boundaries) - 1,
                "num_cols": len(col_boundaries) - 1,
                "boundaries": {"rows": row_boundaries, "cols": col_boundaries},
            }
            if _add_region((table_left, table_top, table_right, table_bottom), "table", table_metadata, "table", "Table"):
                tables_created += 1
                logger.debug(f"Created table region: L{table_left:.1f} T{table_top:.1f} R{table_right:.1f} B{table_bottom:.1f}")

    # Row regions: span the column grid when it exists, otherwise the whole context width
    if has_rows:
        if has_cols:
            row_extent_x0, row_extent_x1 = col_boundaries[0], col_boundaries[-1]
        else:
            row_extent_x0, row_extent_x1 = origin_x, origin_x + context_width

        for i, (top_abs, bottom_abs) in enumerate(zip(row_boundaries, row_boundaries[1:])):
            if bottom_abs > top_abs and row_extent_x1 > row_extent_x0:  # Ensure valid region
                row_metadata = {"row_index": i, "source_lines_label": source_label}
                if _add_region((row_extent_x0, top_abs, row_extent_x1, bottom_abs), "table_row", row_metadata, "table_row", "Row"):
                    rows_created += 1

    # Column regions: span the row grid when it exists, otherwise the whole context height
    if has_cols:
        if has_rows:
            col_extent_y0, col_extent_y1 = row_boundaries[0], row_boundaries[-1]
        else:
            col_extent_y0, col_extent_y1 = origin_y, origin_y + context_height

        for j, (left_abs, right_abs) in enumerate(zip(col_boundaries, col_boundaries[1:])):
            if right_abs > left_abs and col_extent_y1 > col_extent_y0:  # Ensure valid region
                col_metadata = {"col_index": j, "source_lines_label": source_label}
                if _add_region((left_abs, col_extent_y0, right_abs, col_extent_y1), "table_column", col_metadata, "table_column", "Col"):
                    cols_created += 1

    # Cell regions: one per row/column intersection, shrunk by cell_padding on every side
    if not (has_rows and has_cols):
        logger.info(f"Not enough boundaries to form cells for {self}. Rows: {len(row_boundaries)}, Cols: {len(col_boundaries)}")
    else:
        for i, (top_abs, bottom_abs) in enumerate(zip(row_boundaries, row_boundaries[1:])):
            for j, (left_abs, right_abs) in enumerate(zip(col_boundaries, col_boundaries[1:])):
                cell_left_abs = left_abs + cell_padding
                cell_top_abs = top_abs + cell_padding
                cell_right_abs = right_abs - cell_padding
                cell_bottom_abs = bottom_abs - cell_padding

                cell_width = cell_right_abs - cell_left_abs
                cell_height = cell_bottom_abs - cell_top_abs

                if cell_width <= 0 or cell_height <= 0:
                    logger.debug(f"Skipping cell (zero or negative dimension after padding): L{left_abs:.1f} T{top_abs:.1f} R{right_abs:.1f} B{bottom_abs:.1f} -> W{cell_width:.1f} H{cell_height:.1f}")
                    continue

                cell_metadata = {
                    "row_index": i,
                    "col_index": j,
                    "source_lines_label": source_label,
                    "original_boundaries_abs": {
                        "left": left_abs, "top": top_abs,
                        "right": right_abs, "bottom": bottom_abs,
                    },
                }
                if _add_region((cell_left_abs, cell_top_abs, cell_right_abs, cell_bottom_abs), "table_cell", cell_metadata, "cell", "Cell"):
                    cells_created += 1

    logger.info(f"Created {tables_created} table, {rows_created} rows, {cols_created} columns, and {cells_created} table cells from detected lines (source: '{source_label}') for {self}.")
    return self
1083
+
1084
# Example usage:
# page.detect_lines(source_label="my_table_lines")
# page.detect_table_structure_from_lines(source_label="my_table_lines", cell_padding=0.5)
#
# The created regions carry the same source label as the lines, so both selector styles work equivalently:
# table = page.find('table[source*="my_table_lines"]')  # Direct type selector
# table = page.find('region[type="table"][source*="my_table_lines"]')  # Region attribute selector
# cells = page.find_all('table-cell[source*="my_table_lines"]')  # Direct type selector
# cells = page.find_all('region[type="table-cell"][source*="my_table_lines"]')  # Region attribute selector