natural-pdf 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1129,645 +1129,6 @@ class ShapeDetectionMixin:
1129
1129
  )
1130
1130
  return final_lines_data
1131
1131
 
1132
def detect_lines_preview(
    self,
    resolution: int = 72,  # Preview typically uses lower resolution
    method: str = "projection",
    horizontal: bool = True,
    vertical: bool = True,
    peak_threshold_h: float = 0.5,
    min_gap_h: int = 5,
    peak_threshold_v: float = 0.5,
    min_gap_v: int = 5,
    max_lines_h: Optional[int] = None,
    max_lines_v: Optional[int] = None,
    binarization_method: str = LINE_DETECTION_PARAM_DEFAULTS["binarization_method"],
    adaptive_thresh_block_size: int = LINE_DETECTION_PARAM_DEFAULTS[
        "adaptive_thresh_block_size"
    ],
    adaptive_thresh_C_val: int = LINE_DETECTION_PARAM_DEFAULTS["adaptive_thresh_C_val"],
    morph_op_h: str = LINE_DETECTION_PARAM_DEFAULTS["morph_op_h"],
    morph_kernel_h: Tuple[int, int] = LINE_DETECTION_PARAM_DEFAULTS["morph_kernel_h"],
    morph_op_v: str = LINE_DETECTION_PARAM_DEFAULTS["morph_op_v"],
    morph_kernel_v: Tuple[int, int] = LINE_DETECTION_PARAM_DEFAULTS["morph_kernel_v"],
    smoothing_sigma_h: float = LINE_DETECTION_PARAM_DEFAULTS["smoothing_sigma_h"],
    smoothing_sigma_v: float = LINE_DETECTION_PARAM_DEFAULTS["smoothing_sigma_v"],
    peak_width_rel_height: float = LINE_DETECTION_PARAM_DEFAULTS["peak_width_rel_height"],
    # LSD-specific parameters
    off_angle: int = 5,
    min_line_length: int = 30,
    merge_angle_tolerance: int = 5,
    merge_distance_tolerance: int = 3,
    merge_endpoint_tolerance: int = 10,
    initial_min_line_length: int = 10,
    min_nfa_score_horizontal: float = -10.0,
    min_nfa_score_vertical: float = -10.0,
) -> Optional[Image.Image]:
    """
    Preview detected lines on a Page or Region without adding them to the PDF elements.

    Runs line detection on a rendered image of this object and returns a debug
    visualization. Nothing is stored on the page; only an image is produced.

    Args:
        resolution: Resolution passed to the image renderer for the preview.
        method: Detection method - "projection" (default) or "lsd" (requires
            opencv-python).
        horizontal: Passed to the projection detector; if both ``horizontal`` and
            ``vertical`` are False the preview is skipped.
        vertical: See ``horizontal``.
        peak_threshold_h, peak_threshold_v: Passed to the projection detector and
            also used to place the threshold marker in the histograms.
        max_lines_h, max_lines_v: Passed to the projection detector. The threshold
            marker is only drawn for a direction whose ``max_lines_*`` is None.
        min_gap_h, min_gap_v, binarization_method, adaptive_thresh_block_size,
        adaptive_thresh_C_val, morph_op_h, morph_kernel_h, morph_op_v,
        morph_kernel_v, smoothing_sigma_h, smoothing_sigma_v,
        peak_width_rel_height: Forwarded unchanged to the projection detector.
            See `detect_lines` for their descriptions.
        off_angle, min_line_length, merge_angle_tolerance,
        merge_distance_tolerance, merge_endpoint_tolerance,
        initial_min_line_length, min_nfa_score_horizontal,
        min_nfa_score_vertical: Forwarded unchanged to the LSD detector.

    Returns:
        An RGBA PIL image. With lines found, mostly-horizontal lines are drawn in
        red and the others in blue; for the projection method the two profile
        histograms are appended to the right of and below the page image. If no
        lines are found the plain rendered image is returned. Returns None when
        called on a collection, when both directions are disabled, or when the
        detection image could not be obtained.

    Raises:
        ValueError: If ``method`` is not "projection" or "lsd".
        ImportError: If ``method`` is "lsd" and OpenCV is not installed.

    Note:
        Only the projection method supports histogram visualization. The LSD
        method shows detected lines overlaid on the original image.
    """
    # Collections are not supported: the preview is a single image.
    if hasattr(self, "pdfs") or (hasattr(self, "pages") and not hasattr(self, "_page")):
        logger.warning(
            "detect_lines_preview is intended for single Page/Region objects. For collections, process pages individually."
        )
        return None

    if not horizontal and not vertical:  # Check this early
        logger.info("Line preview skipped as both horizontal and vertical are False.")
        return None

    if method not in ("projection", "lsd"):
        raise ValueError(f"Invalid method '{method}'. Supported methods: 'projection', 'lsd'")

    # scale_factor and origin_offset are not needed for a preview.
    cv_image, _, _, page_object_ctx = self._get_image_for_detection(resolution)
    if cv_image is None or page_object_ctx is None:
        logger.warning(f"Skipping line preview for {self} due to image error.")
        return None

    # Render the PIL image used as the drawing surface. Objects that expose
    # x0/top/_page are rendered cropped; everything else is rendered whole.
    pil_image_for_dims = None
    if hasattr(self, "to_image") and hasattr(self, "width") and hasattr(self, "height"):
        if hasattr(self, "x0") and hasattr(self, "top") and hasattr(self, "_page"):
            pil_image_for_dims = self.to_image(
                resolution=resolution, crop=True, include_highlights=False
            )
        else:
            pil_image_for_dims = self.to_image(resolution=resolution, include_highlights=False)

    if pil_image_for_dims is None:
        logger.warning(
            f"Could not render PIL image for preview for {self}. Using cv_image to create one."
        )
        pil_image_for_dims = Image.fromarray(cv_image)

    if pil_image_for_dims.mode != "RGB":
        pil_image_for_dims = pil_image_for_dims.convert("RGB")

    # --- Detection ---
    if method == "projection":
        lines_data_img, profile_h_smoothed, profile_v_smoothed = self._find_lines_on_image_data(
            cv_image=cv_image,
            pil_image_rgb=pil_image_for_dims,
            horizontal=horizontal,
            vertical=vertical,
            peak_threshold_h=peak_threshold_h,
            min_gap_h=min_gap_h,
            peak_threshold_v=peak_threshold_v,
            min_gap_v=min_gap_v,
            max_lines_h=max_lines_h,
            max_lines_v=max_lines_v,
            binarization_method=binarization_method,
            adaptive_thresh_block_size=adaptive_thresh_block_size,
            adaptive_thresh_C_val=adaptive_thresh_C_val,
            morph_op_h=morph_op_h,
            morph_kernel_h=morph_kernel_h,
            morph_op_v=morph_op_v,
            morph_kernel_v=morph_kernel_v,
            smoothing_sigma_h=smoothing_sigma_h,
            smoothing_sigma_v=smoothing_sigma_v,
            peak_width_rel_height=peak_width_rel_height,
        )
    else:  # method == "lsd" (validated above)
        try:
            # Imported only to fail early with a helpful message if OpenCV is missing.
            import cv2  # noqa: F401
        except ImportError as err:
            raise ImportError(
                "OpenCV (cv2) is required for LSD line detection preview. "
                "Install it with: pip install opencv-python\n"
                "Alternatively, use method='projection' for preview."
            ) from err
        # NOTE(review): `horizontal` / `vertical` are not forwarded to the LSD helper.
        lines_data_img = self._process_image_for_lines_lsd(
            cv_image,
            off_angle,
            min_line_length,
            merge_angle_tolerance,
            merge_distance_tolerance,
            merge_endpoint_tolerance,
            initial_min_line_length,
            min_nfa_score_horizontal,
            min_nfa_score_vertical,
        )
        profile_h_smoothed, profile_v_smoothed = None, None  # LSD doesn't use profiles

    if not lines_data_img:
        logger.info(f"No lines detected for preview on {page_object_ctx or self}")
        # Return the base image so that something is still shown.
        return pil_image_for_dims.convert("RGBA")

    # --- Visualization ---
    line_thickness = 2  # Slightly thicker for better visibility
    hist_size = 100
    line_color_h = (255, 0, 0, 200)
    line_color_v = (0, 0, 255, 200)
    bar_color_h = (200, 0, 0, 200)
    bar_color_v = (0, 0, 200, 200)
    hist_bg_color = (240, 240, 240, 255)
    threshold_color = (0, 255, 0, 100)
    padding = 10

    viz_image_base = pil_image_for_dims.convert("RGBA")
    draw = ImageDraw.Draw(viz_image_base)
    img_width, img_height = viz_image_base.size

    for line_info in lines_data_img:
        # A line counts as horizontal when it spans more in x than in y.
        is_h_line = abs(line_info["y1"] - line_info["y2"]) < abs(
            line_info["x1"] - line_info["x2"]
        )
        draw.line(
            [(line_info["x1"], line_info["y1"]), (line_info["x2"], line_info["y2"])],
            fill=line_color_h if is_h_line else line_color_v,
            width=line_thickness,
        )

    def _draw_profile(
        hist_draw, profile, threshold_val, empty_scale, extent, bar_color, show_threshold, per_row
    ):
        """Draw one bar per profile entry plus an optional threshold marker.

        ``per_row=True`` draws bars growing rightwards, one per image row;
        ``per_row=False`` draws bars growing upwards, one per image column.
        ``extent`` is the image size along the profile axis and is used to clamp
        the one-pixel threshold segment at the last entry.
        """
        if profile is None or profile.size <= 0:
            return

        peak = profile.max()
        # Scale by the larger of peak and threshold so both are always visible.
        scale = max(peak, threshold_val) if peak > 0 else empty_scale

        # The threshold marker position does not depend on the entry, so it is
        # computed once rather than for every bar.
        thresh_len = 0
        if scale > 0 and threshold_val >= 0:
            thresh_len = int((threshold_val / scale) * hist_size)
        draw_threshold = show_threshold and threshold_val >= 0 and 0 < thresh_len <= hist_size
        thresh_pos = min(thresh_len, hist_size - 1)  # keep the marker inside the histogram

        for pos, val in enumerate(profile):
            bar_len = int((val / scale) * hist_size) if scale > 0 else 0
            bar_len = min(max(0, bar_len), hist_size)
            next_pos = pos + 1 if pos + 1 < extent else pos

            if per_row:
                if bar_len > 0:
                    hist_draw.line([(0, pos), (bar_len - 1, pos)], fill=bar_color, width=1)
                if draw_threshold:
                    hist_draw.line(
                        [(thresh_pos, pos), (thresh_pos, next_pos)],
                        fill=threshold_color,
                        width=1,
                    )
            else:
                if bar_len > 0:
                    hist_draw.line(
                        [(pos, hist_size - 1), (pos, hist_size - bar_len)],
                        fill=bar_color,
                        width=1,
                    )
                if draw_threshold:
                    hist_draw.line(
                        [(pos, hist_size - thresh_pos), (next_pos, hist_size - thresh_pos)],
                        fill=threshold_color,
                        width=1,
                    )

    if method == "projection" and (
        profile_h_smoothed is not None or profile_v_smoothed is not None
    ):
        # Horizontal-line profile: one bar per image row, placed right of the page.
        hist_h_img = Image.new("RGBA", (hist_size, img_height), hist_bg_color)
        _draw_profile(
            ImageDraw.Draw(hist_h_img),
            profile_h_smoothed,
            peak_threshold_h * img_width,
            img_width,
            img_height,
            bar_color_h,
            max_lines_h is None,
            True,
        )

        # Vertical-line profile: one bar per image column, placed below the page.
        hist_v_img = Image.new("RGBA", (img_width, hist_size), hist_bg_color)
        _draw_profile(
            ImageDraw.Draw(hist_v_img),
            profile_v_smoothed,
            peak_threshold_v * img_height,
            img_height,
            img_width,
            bar_color_v,
            max_lines_v is None,
            False,
        )

        total_width = img_width + padding + hist_size
        total_height = img_height + padding + hist_size
        final_viz_image = Image.new("RGBA", (total_width, total_height), (255, 255, 255, 255))
        final_viz_image.paste(viz_image_base, (0, 0))
        final_viz_image.paste(hist_h_img, (img_width + padding, 0))
        final_viz_image.paste(hist_v_img, (0, img_height + padding))
    else:
        # For LSD method, just return the image with lines overlaid
        final_viz_image = viz_image_base

    logger.info(f"Generated line preview visualization for {page_object_ctx or self}")
    return final_viz_image
1421
-
1422
def detect_table_structure_from_lines(
    self,
    source_label: str = "detected",
    ignore_outer_regions: bool = True,
    cell_padding: float = 0.5,  # Small padding inside cells, default to 0.5px
) -> "ShapeDetectionMixin":
    """
    Create table structure (rows, columns, cells) from previously detected lines.

    Lines whose ``source`` equals ``source_label`` are split into horizontal and
    vertical sets; their mid-positions become row and column boundaries. From
    those boundaries the method creates regions on the owning page for:
    - An overall table region that encompasses the entire table structure
    - Individual row regions spanning the width of the table
    - Individual column regions spanning the height of the table
    - Individual cell regions at each row/column intersection

    Before creating anything, existing regions with the same ``source_label`` and
    a table-related ``region_type`` are removed so repeated calls do not
    accumulate duplicates.

    When called on a collection (an object with ``pdfs`` or ``pages``), the
    method is applied to every page in turn.

    Args:
        source_label: Filter lines by this source label (from detect_lines). The
            created regions get the same value as their ``source``.
        ignore_outer_regions: If True, do not create regions outside the grid
            defined by the lines. If False, the edges of the page/region are
            added as extra boundaries, so regions from the edges to the
            first/last lines are included.
        cell_padding: Amount by which every cell region is shrunk on each side.
            Cells left with a non-positive width or height are skipped.

    Returns:
        Self for method chaining
    """
    # Collections: delegate to each page.
    if hasattr(self, "pdfs"):
        for pdf_doc in self.pdfs:
            for page_obj in pdf_doc.pages:
                page_obj.detect_table_structure_from_lines(
                    source_label=source_label,
                    ignore_outer_regions=ignore_outer_regions,
                    cell_padding=cell_padding,
                )
        return self
    if hasattr(self, "pages") and not hasattr(self, "_page"):  # PageCollection
        for page_obj in self.pages:
            page_obj.detect_table_structure_from_lines(
                source_label=source_label,
                ignore_outer_regions=ignore_outer_regions,
                cell_padding=cell_padding,
            )
        return self

    # Determine context (Page or Region) for coordinates and element management.
    origin_x, origin_y = 0.0, 0.0
    if (
        hasattr(self, "_element_mgr") and hasattr(self, "width") and hasattr(self, "height")
    ):  # Likely a Page
        page_object_for_elements = self
        context_width = self.width
        context_height = self.height
        logger.debug(f"Operating on Page context: {self}")
    elif (
        hasattr(self, "_page") and hasattr(self, "x0") and hasattr(self, "width")
    ):  # Likely a Region
        page_object_for_elements = self._page
        origin_x = self.x0
        origin_y = self.top
        context_width = self.width  # Region's own width/height for its boundary calculations
        context_height = self.height
        logger.debug(f"Operating on Region context: {self}, origin: ({origin_x}, {origin_y})")
    else:
        logger.warning(
            f"Could not determine valid page/region context for {self}. Aborting table structure detection."
        )
        return self

    element_manager = page_object_for_elements._element_mgr

    # ------------------------------------------------------------------
    # CLEAN-UP existing table-related regions from earlier runs to avoid duplicates
    # ------------------------------------------------------------------
    purge_types = {"table", "table_row", "table_column", "table_cell"}

    def _is_stale(region) -> bool:
        """Return True for a table-related region produced with this source label."""
        return (
            getattr(region, "source", None) == source_label
            and getattr(region, "region_type", None) in purge_types
        )

    try:
        if hasattr(element_manager, "_elements") and "regions" in element_manager._elements:
            existing = element_manager._elements["regions"]
            kept = [r for r in existing if not _is_stale(r)]
            removed = len(existing) - len(kept)
            element_manager._elements["regions"] = kept
            if removed:
                logger.info(
                    f"Removed {removed} previous table-related regions (source='{source_label}') before regeneration."
                )

        if (
            hasattr(page_object_for_elements, "_regions")
            and "detected" in page_object_for_elements._regions
        ):
            page_object_for_elements._regions["detected"] = [
                r for r in page_object_for_elements._regions["detected"] if not _is_stale(r)
            ]
    except Exception as cleanup_err:
        # Cleanup is best-effort; a failure here must not block regeneration.
        logger.warning(f"Table-region cleanup failed: {cleanup_err}", exc_info=True)

    # Get lines with the specified source from the page's element manager.
    filtered_lines = [
        line for line in element_manager.lines if getattr(line, "source", None) == source_label
    ]
    if not filtered_lines:
        logger.info(
            f"No lines found with source '{source_label}' for table structure detection on {self}."
        )
        return self

    # NOTE(review): for a Region context the lines are taken from the whole page
    # and are not restricted to the region's bounds - confirm this is intended.
    horizontal_lines = [line for line in filtered_lines if line.is_horizontal]
    vertical_lines = [line for line in filtered_lines if line.is_vertical]

    logger.info(
        f"Found {len(horizontal_lines)} horizontal and {len(vertical_lines)} vertical lines for {self} with source '{source_label}'."
    )

    # Mid-points of the lines serve as their positions; duplicates collapse.
    h_line_ys = sorted({(line.top + line.bottom) / 2 for line in horizontal_lines})
    v_line_xs = sorted({(line.x0 + line.x1) / 2 for line in vertical_lines})

    def _boundaries(line_positions, start, extent):
        """Return sorted unique boundaries, adding the context edges if requested."""
        edges = set(line_positions)
        if not ignore_outer_regions:
            edges.update((start, start + extent))
        return sorted(edges)

    row_boundaries = _boundaries(h_line_ys, origin_y, context_height)
    col_boundaries = _boundaries(v_line_xs, origin_x, context_width)

    logger.debug(f"Row boundaries for {self}: {row_boundaries}")
    logger.debug(f"Col boundaries for {self}: {col_boundaries}")

    def _add_region(bbox, region_type, metadata, error_kind, coords_kind) -> bool:
        """Create one region on the page, tag it and register it; log on failure."""
        x0, top, x1, bottom = bbox
        try:
            region = page_object_for_elements.create_region(x0, top, x1, bottom)
            region.source = source_label
            region.region_type = region_type
            region.normalized_type = region_type  # for selector compatibility
            region.metadata.update(metadata)
            element_manager.add_element(region, element_type="regions")
        except Exception as e:
            # One failed region should not abort the rest of the grid.
            logger.error(
                f"Failed to create or add {error_kind} Region: {e}. {coords_kind} abs coords: L{x0} T{top} R{x1} B{bottom}",
                exc_info=True,
            )
            return False
        return True

    have_rows = len(row_boundaries) >= 2
    have_cols = len(col_boundaries) >= 2

    tables_created = 0
    rows_created = 0
    cols_created = 0
    cells_created = 0

    # Overall table region that wraps the entire structure.
    if have_rows and have_cols:
        table_left, table_right = col_boundaries[0], col_boundaries[-1]
        table_top, table_bottom = row_boundaries[0], row_boundaries[-1]
        if table_right > table_left and table_bottom > table_top:
            table_metadata = {
                "source_lines_label": source_label,
                "num_rows": len(row_boundaries) - 1,
                "num_cols": len(col_boundaries) - 1,
                "boundaries": {"rows": row_boundaries, "cols": col_boundaries},
            }
            if _add_region(
                (table_left, table_top, table_right, table_bottom),
                "table",
                table_metadata,
                "table",
                "Table",
            ):
                tables_created += 1
                logger.debug(
                    f"Created table region: L{table_left:.1f} T{table_top:.1f} R{table_right:.1f} B{table_bottom:.1f}"
                )

    # Row regions: span the columns when a column grid exists, else the full context width.
    if have_rows:
        if have_cols:
            row_extent_x0, row_extent_x1 = col_boundaries[0], col_boundaries[-1]
        else:
            row_extent_x0, row_extent_x1 = origin_x, origin_x + context_width

        for i, (top_abs, bottom_abs) in enumerate(zip(row_boundaries, row_boundaries[1:])):
            if bottom_abs > top_abs and row_extent_x1 > row_extent_x0:  # Ensure valid region
                if _add_region(
                    (row_extent_x0, top_abs, row_extent_x1, bottom_abs),
                    "table_row",
                    {"row_index": i, "source_lines_label": source_label},
                    "table_row",
                    "Row",
                ):
                    rows_created += 1

    # Column regions: span the rows when a row grid exists, else the full context height.
    if have_cols:
        if have_rows:
            col_extent_y0, col_extent_y1 = row_boundaries[0], row_boundaries[-1]
        else:
            col_extent_y0, col_extent_y1 = origin_y, origin_y + context_height

        for j, (left_abs, right_abs) in enumerate(zip(col_boundaries, col_boundaries[1:])):
            if right_abs > left_abs and col_extent_y1 > col_extent_y0:  # Ensure valid region
                if _add_region(
                    (left_abs, col_extent_y0, right_abs, col_extent_y1),
                    "table_column",
                    {"col_index": j, "source_lines_label": source_label},
                    "table_column",
                    "Col",
                ):
                    cols_created += 1

    # Cell regions: one per row/column intersection, shrunk by cell_padding.
    if not (have_rows and have_cols):
        logger.info(
            f"Not enough boundaries to form cells for {self}. Rows: {len(row_boundaries)}, Cols: {len(col_boundaries)}"
        )
    else:
        for i, (top_abs, bottom_abs) in enumerate(zip(row_boundaries, row_boundaries[1:])):
            for j, (left_abs, right_abs) in enumerate(zip(col_boundaries, col_boundaries[1:])):
                cell_left_abs = left_abs + cell_padding
                cell_top_abs = top_abs + cell_padding
                cell_right_abs = right_abs - cell_padding
                cell_bottom_abs = bottom_abs - cell_padding

                cell_width = cell_right_abs - cell_left_abs
                cell_height = cell_bottom_abs - cell_top_abs

                if cell_width <= 0 or cell_height <= 0:
                    logger.debug(
                        f"Skipping cell (zero or negative dimension after padding): L{left_abs:.1f} T{top_abs:.1f} R{right_abs:.1f} B{bottom_abs:.1f} -> W{cell_width:.1f} H{cell_height:.1f}"
                    )
                    continue

                cell_metadata = {
                    "row_index": i,
                    "col_index": j,
                    "source_lines_label": source_label,
                    "original_boundaries_abs": {
                        "left": left_abs,
                        "top": top_abs,
                        "right": right_abs,
                        "bottom": bottom_abs,
                    },
                }
                if _add_region(
                    (cell_left_abs, cell_top_abs, cell_right_abs, cell_bottom_abs),
                    "table_cell",
                    cell_metadata,
                    "cell",
                    "Cell",
                ):
                    cells_created += 1

    logger.info(
        f"Created {tables_created} table, {rows_created} rows, {cols_created} columns, and {cells_created} table cells from detected lines (source: '{source_label}') for {self}."
    )

    return self
1770
-
1771
1132
  def detect_blobs(
1772
1133
  self,
1773
1134
  k: Optional[int] = None,
@@ -1993,14 +1354,3 @@ class ShapeDetectionMixin:
1993
1354
  page_obj._element_mgr.add_region(region)
1994
1355
 
1995
1356
  return self
1996
-
1997
-
1998
- # Example usage would be:
1999
- # page.detect_lines(source_label="my_table_lines")
2000
- # page.detect_table_structure_from_lines(source_label="my_table_lines", cell_padding=0.5)
2001
- #
2002
- # Now both selector styles work equivalently:
2003
- # table = page.find('table[source*="my_table_lines"]') # Direct type selector
2004
- # table = page.find('region[type="table"][source*="my_table_lines"]') # Region attribute selector
2005
- # cells = page.find_all('table-cell[source*="my_table_lines"]') # Direct type selector
2006
- # cells = page.find_all('region[type="table-cell"][source*="my_table_lines"]') # Region attribute selector