natural-pdf 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +18 -4
- natural_pdf/analyzers/guides.py +2176 -0
- natural_pdf/analyzers/shape_detection_mixin.py +0 -650
- natural_pdf/core/element_manager.py +86 -27
- natural_pdf/core/page.py +49 -1
- natural_pdf/core/pdf.py +22 -0
- natural_pdf/elements/collections.py +61 -0
- natural_pdf/elements/region.py +257 -14
- natural_pdf/elements/text.py +29 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/RECORD +15 -19
- bad_pdf_analysis/analyze_10_more.py +0 -300
- bad_pdf_analysis/analyze_final_10.py +0 -552
- bad_pdf_analysis/analyze_specific_pages.py +0 -394
- bad_pdf_analysis/analyze_specific_pages_direct.py +0 -382
- tools/rtl_smoke_test.py +0 -80
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/top_level.txt +0 -0
@@ -1129,645 +1129,6 @@ class ShapeDetectionMixin:
|
|
1129
1129
|
)
|
1130
1130
|
return final_lines_data
|
1131
1131
|
|
1132
|
-
def detect_lines_preview(
|
1133
|
-
self,
|
1134
|
-
resolution: int = 72, # Preview typically uses lower resolution
|
1135
|
-
method: str = "projection",
|
1136
|
-
horizontal: bool = True,
|
1137
|
-
vertical: bool = True,
|
1138
|
-
peak_threshold_h: float = 0.5,
|
1139
|
-
min_gap_h: int = 5,
|
1140
|
-
peak_threshold_v: float = 0.5,
|
1141
|
-
min_gap_v: int = 5,
|
1142
|
-
max_lines_h: Optional[int] = None,
|
1143
|
-
max_lines_v: Optional[int] = None,
|
1144
|
-
binarization_method: str = LINE_DETECTION_PARAM_DEFAULTS["binarization_method"],
|
1145
|
-
adaptive_thresh_block_size: int = LINE_DETECTION_PARAM_DEFAULTS[
|
1146
|
-
"adaptive_thresh_block_size"
|
1147
|
-
],
|
1148
|
-
adaptive_thresh_C_val: int = LINE_DETECTION_PARAM_DEFAULTS["adaptive_thresh_C_val"],
|
1149
|
-
morph_op_h: str = LINE_DETECTION_PARAM_DEFAULTS["morph_op_h"],
|
1150
|
-
morph_kernel_h: Tuple[int, int] = LINE_DETECTION_PARAM_DEFAULTS["morph_kernel_h"],
|
1151
|
-
morph_op_v: str = LINE_DETECTION_PARAM_DEFAULTS["morph_op_v"],
|
1152
|
-
morph_kernel_v: Tuple[int, int] = LINE_DETECTION_PARAM_DEFAULTS["morph_kernel_v"],
|
1153
|
-
smoothing_sigma_h: float = LINE_DETECTION_PARAM_DEFAULTS["smoothing_sigma_h"],
|
1154
|
-
smoothing_sigma_v: float = LINE_DETECTION_PARAM_DEFAULTS["smoothing_sigma_v"],
|
1155
|
-
peak_width_rel_height: float = LINE_DETECTION_PARAM_DEFAULTS["peak_width_rel_height"],
|
1156
|
-
# LSD-specific parameters
|
1157
|
-
off_angle: int = 5,
|
1158
|
-
min_line_length: int = 30,
|
1159
|
-
merge_angle_tolerance: int = 5,
|
1160
|
-
merge_distance_tolerance: int = 3,
|
1161
|
-
merge_endpoint_tolerance: int = 10,
|
1162
|
-
initial_min_line_length: int = 10,
|
1163
|
-
min_nfa_score_horizontal: float = -10.0,
|
1164
|
-
min_nfa_score_vertical: float = -10.0,
|
1165
|
-
) -> Optional[Image.Image]:
|
1166
|
-
"""
|
1167
|
-
Previews detected lines on a Page or Region without adding them to the PDF elements.
|
1168
|
-
Generates and returns a debug visualization image.
|
1169
|
-
This method is intended for Page or Region objects.
|
1170
|
-
|
1171
|
-
Args:
|
1172
|
-
method: Detection method - "projection" (default) or "lsd" (requires opencv-python).
|
1173
|
-
See `detect_lines` for other parameter descriptions. The main difference is a lower default `resolution`.
|
1174
|
-
|
1175
|
-
Returns:
|
1176
|
-
PIL Image with line detection visualization, or None if preview failed.
|
1177
|
-
|
1178
|
-
Note:
|
1179
|
-
Only projection profiling method supports histogram visualization.
|
1180
|
-
LSD method will show detected lines overlaid on the original image.
|
1181
|
-
"""
|
1182
|
-
if hasattr(self, "pdfs") or (hasattr(self, "pages") and not hasattr(self, "_page")):
|
1183
|
-
logger.warning(
|
1184
|
-
"preview_detected_lines is intended for single Page/Region objects. For collections, process pages individually."
|
1185
|
-
)
|
1186
|
-
return None
|
1187
|
-
|
1188
|
-
if not horizontal and not vertical: # Check this early
|
1189
|
-
logger.info("Line preview skipped as both horizontal and vertical are False.")
|
1190
|
-
return None
|
1191
|
-
|
1192
|
-
# Validate method parameter
|
1193
|
-
if method not in ["projection", "lsd"]:
|
1194
|
-
raise ValueError(f"Invalid method '{method}'. Supported methods: 'projection', 'lsd'")
|
1195
|
-
|
1196
|
-
cv_image, _, _, page_object_ctx = self._get_image_for_detection(
|
1197
|
-
resolution
|
1198
|
-
) # scale_factor and origin_offset not needed for preview
|
1199
|
-
if (
|
1200
|
-
cv_image is None or page_object_ctx is None
|
1201
|
-
): # page_object_ctx for logging context mostly
|
1202
|
-
logger.warning(f"Skipping line preview for {self} due to image error.")
|
1203
|
-
return None
|
1204
|
-
|
1205
|
-
pil_image_for_dims = None
|
1206
|
-
if hasattr(self, "to_image") and hasattr(self, "width") and hasattr(self, "height"):
|
1207
|
-
if hasattr(self, "x0") and hasattr(self, "top") and hasattr(self, "_page"):
|
1208
|
-
pil_image_for_dims = self.to_image(
|
1209
|
-
resolution=resolution, crop=True, include_highlights=False
|
1210
|
-
)
|
1211
|
-
else:
|
1212
|
-
pil_image_for_dims = self.to_image(resolution=resolution, include_highlights=False)
|
1213
|
-
|
1214
|
-
if pil_image_for_dims is None:
|
1215
|
-
logger.warning(
|
1216
|
-
f"Could not render PIL image for preview for {self}. Using cv_image to create one."
|
1217
|
-
)
|
1218
|
-
pil_image_for_dims = Image.fromarray(cv_image)
|
1219
|
-
|
1220
|
-
if pil_image_for_dims.mode != "RGB":
|
1221
|
-
pil_image_for_dims = pil_image_for_dims.convert("RGB")
|
1222
|
-
|
1223
|
-
# Get lines data based on method
|
1224
|
-
if method == "projection":
|
1225
|
-
lines_data_img, profile_h_smoothed, profile_v_smoothed = self._find_lines_on_image_data(
|
1226
|
-
cv_image=cv_image,
|
1227
|
-
pil_image_rgb=pil_image_for_dims,
|
1228
|
-
horizontal=horizontal,
|
1229
|
-
vertical=vertical,
|
1230
|
-
peak_threshold_h=peak_threshold_h,
|
1231
|
-
min_gap_h=min_gap_h,
|
1232
|
-
peak_threshold_v=peak_threshold_v,
|
1233
|
-
min_gap_v=min_gap_v,
|
1234
|
-
max_lines_h=max_lines_h,
|
1235
|
-
max_lines_v=max_lines_v,
|
1236
|
-
binarization_method=binarization_method,
|
1237
|
-
adaptive_thresh_block_size=adaptive_thresh_block_size,
|
1238
|
-
adaptive_thresh_C_val=adaptive_thresh_C_val,
|
1239
|
-
morph_op_h=morph_op_h,
|
1240
|
-
morph_kernel_h=morph_kernel_h,
|
1241
|
-
morph_op_v=morph_op_v,
|
1242
|
-
morph_kernel_v=morph_kernel_v,
|
1243
|
-
smoothing_sigma_h=smoothing_sigma_h,
|
1244
|
-
smoothing_sigma_v=smoothing_sigma_v,
|
1245
|
-
peak_width_rel_height=peak_width_rel_height,
|
1246
|
-
)
|
1247
|
-
elif method == "lsd":
|
1248
|
-
try:
|
1249
|
-
import cv2
|
1250
|
-
except ImportError:
|
1251
|
-
raise ImportError(
|
1252
|
-
"OpenCV (cv2) is required for LSD line detection preview. "
|
1253
|
-
"Install it with: pip install opencv-python\n"
|
1254
|
-
"Alternatively, use method='projection' for preview."
|
1255
|
-
)
|
1256
|
-
lines_data_img = self._process_image_for_lines_lsd(
|
1257
|
-
cv_image,
|
1258
|
-
off_angle,
|
1259
|
-
min_line_length,
|
1260
|
-
merge_angle_tolerance,
|
1261
|
-
merge_distance_tolerance,
|
1262
|
-
merge_endpoint_tolerance,
|
1263
|
-
initial_min_line_length,
|
1264
|
-
min_nfa_score_horizontal,
|
1265
|
-
min_nfa_score_vertical,
|
1266
|
-
)
|
1267
|
-
profile_h_smoothed, profile_v_smoothed = None, None # LSD doesn't use profiles
|
1268
|
-
|
1269
|
-
if not lines_data_img: # Check if any lines were detected before visualization
|
1270
|
-
logger.info(f"No lines detected for preview on {page_object_ctx or self}")
|
1271
|
-
# Optionally return the base image if no lines, or None
|
1272
|
-
return pil_image_for_dims.convert("RGBA") # Return base image so something is shown
|
1273
|
-
|
1274
|
-
# --- Visualization Logic ---
|
1275
|
-
final_viz_image: Optional[Image.Image] = None
|
1276
|
-
viz_image_base = pil_image_for_dims.convert("RGBA")
|
1277
|
-
draw = ImageDraw.Draw(viz_image_base)
|
1278
|
-
img_width, img_height = viz_image_base.size
|
1279
|
-
|
1280
|
-
viz_params = {
|
1281
|
-
"draw_line_thickness_viz": 2, # Slightly thicker for better visibility
|
1282
|
-
"debug_histogram_size": 100,
|
1283
|
-
"line_color_h": (255, 0, 0, 200),
|
1284
|
-
"line_color_v": (0, 0, 255, 200),
|
1285
|
-
"histogram_bar_color_h": (200, 0, 0, 200),
|
1286
|
-
"histogram_bar_color_v": (0, 0, 200, 200),
|
1287
|
-
"histogram_bg_color": (240, 240, 240, 255),
|
1288
|
-
"padding_between_viz": 10,
|
1289
|
-
"peak_threshold_h": peak_threshold_h,
|
1290
|
-
"peak_threshold_v": peak_threshold_v,
|
1291
|
-
"max_lines_h": max_lines_h,
|
1292
|
-
"max_lines_v": max_lines_v,
|
1293
|
-
}
|
1294
|
-
|
1295
|
-
# Draw detected lines on the image
|
1296
|
-
for line_info in lines_data_img:
|
1297
|
-
is_h_line = abs(line_info["y1"] - line_info["y2"]) < abs(
|
1298
|
-
line_info["x1"] - line_info["x2"]
|
1299
|
-
)
|
1300
|
-
line_color = viz_params["line_color_h"] if is_h_line else viz_params["line_color_v"]
|
1301
|
-
draw.line(
|
1302
|
-
[(line_info["x1"], line_info["y1"]), (line_info["x2"], line_info["y2"])],
|
1303
|
-
fill=line_color,
|
1304
|
-
width=viz_params["draw_line_thickness_viz"],
|
1305
|
-
)
|
1306
|
-
|
1307
|
-
# For projection method, add histogram visualization
|
1308
|
-
if method == "projection" and (
|
1309
|
-
profile_h_smoothed is not None or profile_v_smoothed is not None
|
1310
|
-
):
|
1311
|
-
hist_size = viz_params["debug_histogram_size"]
|
1312
|
-
hist_h_img = Image.new(
|
1313
|
-
"RGBA", (hist_size, img_height), viz_params["histogram_bg_color"]
|
1314
|
-
)
|
1315
|
-
hist_h_draw = ImageDraw.Draw(hist_h_img)
|
1316
|
-
|
1317
|
-
if profile_h_smoothed is not None and profile_h_smoothed.size > 0:
|
1318
|
-
actual_max_h_profile = profile_h_smoothed.max()
|
1319
|
-
display_threshold_val_h = peak_threshold_h * img_width
|
1320
|
-
# Use the maximum of either the profile max or threshold for scaling, so both are always visible
|
1321
|
-
max_h_profile_val_for_scaling = (
|
1322
|
-
max(actual_max_h_profile, display_threshold_val_h)
|
1323
|
-
if actual_max_h_profile > 0
|
1324
|
-
else img_width
|
1325
|
-
)
|
1326
|
-
for y_coord, val in enumerate(profile_h_smoothed):
|
1327
|
-
bar_len = 0
|
1328
|
-
thresh_bar_len = 0
|
1329
|
-
if max_h_profile_val_for_scaling > 0:
|
1330
|
-
bar_len = int((val / max_h_profile_val_for_scaling) * hist_size)
|
1331
|
-
if display_threshold_val_h >= 0:
|
1332
|
-
thresh_bar_len = int(
|
1333
|
-
(display_threshold_val_h / max_h_profile_val_for_scaling)
|
1334
|
-
* hist_size
|
1335
|
-
)
|
1336
|
-
bar_len = min(max(0, bar_len), hist_size)
|
1337
|
-
if bar_len > 0:
|
1338
|
-
hist_h_draw.line(
|
1339
|
-
[(0, y_coord), (bar_len - 1, y_coord)],
|
1340
|
-
fill=viz_params["histogram_bar_color_h"],
|
1341
|
-
width=1,
|
1342
|
-
)
|
1343
|
-
if (
|
1344
|
-
viz_params["max_lines_h"] is None
|
1345
|
-
and display_threshold_val_h >= 0
|
1346
|
-
and thresh_bar_len > 0
|
1347
|
-
and thresh_bar_len <= hist_size
|
1348
|
-
):
|
1349
|
-
# Ensure threshold line is within bounds
|
1350
|
-
thresh_x = min(thresh_bar_len, hist_size - 1)
|
1351
|
-
hist_h_draw.line(
|
1352
|
-
[
|
1353
|
-
(thresh_x, y_coord),
|
1354
|
-
(thresh_x, y_coord + 1 if y_coord + 1 < img_height else y_coord),
|
1355
|
-
],
|
1356
|
-
fill=(0, 255, 0, 100),
|
1357
|
-
width=1,
|
1358
|
-
)
|
1359
|
-
|
1360
|
-
hist_v_img = Image.new("RGBA", (img_width, hist_size), viz_params["histogram_bg_color"])
|
1361
|
-
hist_v_draw = ImageDraw.Draw(hist_v_img)
|
1362
|
-
if profile_v_smoothed is not None and profile_v_smoothed.size > 0:
|
1363
|
-
actual_max_v_profile = profile_v_smoothed.max()
|
1364
|
-
display_threshold_val_v = peak_threshold_v * img_height
|
1365
|
-
# Use the maximum of either the profile max or threshold for scaling, so both are always visible
|
1366
|
-
max_v_profile_val_for_scaling = (
|
1367
|
-
max(actual_max_v_profile, display_threshold_val_v)
|
1368
|
-
if actual_max_v_profile > 0
|
1369
|
-
else img_height
|
1370
|
-
)
|
1371
|
-
for x_coord, val in enumerate(profile_v_smoothed):
|
1372
|
-
bar_height = 0
|
1373
|
-
thresh_bar_h = 0
|
1374
|
-
if max_v_profile_val_for_scaling > 0:
|
1375
|
-
bar_height = int((val / max_v_profile_val_for_scaling) * hist_size)
|
1376
|
-
if display_threshold_val_v >= 0:
|
1377
|
-
thresh_bar_h = int(
|
1378
|
-
(display_threshold_val_v / max_v_profile_val_for_scaling)
|
1379
|
-
* hist_size
|
1380
|
-
)
|
1381
|
-
bar_height = min(max(0, bar_height), hist_size)
|
1382
|
-
if bar_height > 0:
|
1383
|
-
hist_v_draw.line(
|
1384
|
-
[(x_coord, hist_size - 1), (x_coord, hist_size - bar_height)],
|
1385
|
-
fill=viz_params["histogram_bar_color_v"],
|
1386
|
-
width=1,
|
1387
|
-
)
|
1388
|
-
if (
|
1389
|
-
viz_params["max_lines_v"] is None
|
1390
|
-
and display_threshold_val_v >= 0
|
1391
|
-
and thresh_bar_h > 0
|
1392
|
-
and thresh_bar_h <= hist_size
|
1393
|
-
):
|
1394
|
-
# Ensure threshold line is within bounds
|
1395
|
-
thresh_y = min(thresh_bar_h, hist_size - 1)
|
1396
|
-
hist_v_draw.line(
|
1397
|
-
[
|
1398
|
-
(x_coord, hist_size - thresh_y),
|
1399
|
-
(
|
1400
|
-
x_coord + 1 if x_coord + 1 < img_width else x_coord,
|
1401
|
-
hist_size - thresh_y,
|
1402
|
-
),
|
1403
|
-
],
|
1404
|
-
fill=(0, 255, 0, 100),
|
1405
|
-
width=1,
|
1406
|
-
)
|
1407
|
-
|
1408
|
-
padding = viz_params["padding_between_viz"]
|
1409
|
-
total_width = img_width + padding + hist_size
|
1410
|
-
total_height = img_height + padding + hist_size
|
1411
|
-
final_viz_image = Image.new("RGBA", (total_width, total_height), (255, 255, 255, 255))
|
1412
|
-
final_viz_image.paste(viz_image_base, (0, 0))
|
1413
|
-
final_viz_image.paste(hist_h_img, (img_width + padding, 0))
|
1414
|
-
final_viz_image.paste(hist_v_img, (0, img_height + padding))
|
1415
|
-
else:
|
1416
|
-
# For LSD method, just return the image with lines overlaid
|
1417
|
-
final_viz_image = viz_image_base
|
1418
|
-
|
1419
|
-
logger.info(f"Generated line preview visualization for {page_object_ctx or self}")
|
1420
|
-
return final_viz_image
|
1421
|
-
|
1422
|
-
def detect_table_structure_from_lines(
|
1423
|
-
self,
|
1424
|
-
source_label: str = "detected",
|
1425
|
-
ignore_outer_regions: bool = True,
|
1426
|
-
cell_padding: float = 0.5, # Small padding inside cells, default to 0.5px
|
1427
|
-
) -> "ShapeDetectionMixin":
|
1428
|
-
"""
|
1429
|
-
Create table structure (rows, columns, cells) from previously detected lines.
|
1430
|
-
|
1431
|
-
This method analyzes horizontal and vertical lines to create a grid structure,
|
1432
|
-
then generates Region objects for:
|
1433
|
-
- An overall table region that encompasses the entire table structure
|
1434
|
-
- Individual row regions spanning the width of the table
|
1435
|
-
- Individual column regions spanning the height of the table
|
1436
|
-
- Individual cell regions at each row/column intersection
|
1437
|
-
|
1438
|
-
Args:
|
1439
|
-
source_label: Filter lines by this source label (from detect_lines)
|
1440
|
-
ignore_outer_regions: If True, don't create regions outside the defined by lines grid.
|
1441
|
-
If False, include regions from page/object edges to the first/last lines.
|
1442
|
-
cell_padding: Internal padding for cell regions
|
1443
|
-
|
1444
|
-
Returns:
|
1445
|
-
Self for method chaining
|
1446
|
-
"""
|
1447
|
-
# Handle collections
|
1448
|
-
if hasattr(self, "pdfs"):
|
1449
|
-
for pdf_doc in self.pdfs:
|
1450
|
-
for page_obj in pdf_doc.pages:
|
1451
|
-
page_obj.detect_table_structure_from_lines(
|
1452
|
-
source_label=source_label,
|
1453
|
-
ignore_outer_regions=ignore_outer_regions,
|
1454
|
-
cell_padding=cell_padding,
|
1455
|
-
)
|
1456
|
-
return self
|
1457
|
-
elif hasattr(self, "pages") and not hasattr(self, "_page"): # PageCollection
|
1458
|
-
for page_obj in self.pages:
|
1459
|
-
page_obj.detect_table_structure_from_lines(
|
1460
|
-
source_label=source_label,
|
1461
|
-
ignore_outer_regions=ignore_outer_regions,
|
1462
|
-
cell_padding=cell_padding,
|
1463
|
-
)
|
1464
|
-
return self
|
1465
|
-
|
1466
|
-
# Determine context (Page or Region) for coordinates and element management
|
1467
|
-
page_object_for_elements = None
|
1468
|
-
origin_x, origin_y = 0.0, 0.0
|
1469
|
-
context_width, context_height = 0.0, 0.0
|
1470
|
-
|
1471
|
-
if (
|
1472
|
-
hasattr(self, "_element_mgr") and hasattr(self, "width") and hasattr(self, "height")
|
1473
|
-
): # Likely a Page
|
1474
|
-
page_object_for_elements = self
|
1475
|
-
context_width = self.width
|
1476
|
-
context_height = self.height
|
1477
|
-
logger.debug(f"Operating on Page context: {self}")
|
1478
|
-
elif (
|
1479
|
-
hasattr(self, "_page") and hasattr(self, "x0") and hasattr(self, "width")
|
1480
|
-
): # Likely a Region
|
1481
|
-
page_object_for_elements = self._page
|
1482
|
-
origin_x = self.x0
|
1483
|
-
origin_y = self.top
|
1484
|
-
context_width = self.width # Region's own width/height for its boundary calculations
|
1485
|
-
context_height = self.height
|
1486
|
-
logger.debug(f"Operating on Region context: {self}, origin: ({origin_x}, {origin_y})")
|
1487
|
-
else:
|
1488
|
-
logger.warning(
|
1489
|
-
f"Could not determine valid page/region context for {self}. Aborting table structure detection."
|
1490
|
-
)
|
1491
|
-
return self
|
1492
|
-
|
1493
|
-
element_manager = page_object_for_elements._element_mgr
|
1494
|
-
|
1495
|
-
# ------------------------------------------------------------------
|
1496
|
-
# CLEAN-UP existing table-related regions from earlier runs to avoid duplicates
|
1497
|
-
# ------------------------------------------------------------------
|
1498
|
-
try:
|
1499
|
-
_purge_types = {"table", "table_row", "table_column", "table_cell"}
|
1500
|
-
|
1501
|
-
if (
|
1502
|
-
hasattr(element_manager, "_elements")
|
1503
|
-
and "regions" in element_manager._elements
|
1504
|
-
):
|
1505
|
-
_orig_len = len(element_manager._elements["regions"])
|
1506
|
-
element_manager._elements["regions"] = [
|
1507
|
-
r
|
1508
|
-
for r in element_manager._elements["regions"]
|
1509
|
-
if not (
|
1510
|
-
getattr(r, "source", None) == source_label
|
1511
|
-
and getattr(r, "region_type", None) in _purge_types
|
1512
|
-
)
|
1513
|
-
]
|
1514
|
-
_removed = _orig_len - len(element_manager._elements["regions"])
|
1515
|
-
if _removed:
|
1516
|
-
logger.info(
|
1517
|
-
f"Removed {_removed} previous table-related regions (source='{source_label}') before regeneration."
|
1518
|
-
)
|
1519
|
-
|
1520
|
-
if hasattr(page_object_for_elements, "_regions") and "detected" in page_object_for_elements._regions:
|
1521
|
-
page_object_for_elements._regions["detected"] = [
|
1522
|
-
r
|
1523
|
-
for r in page_object_for_elements._regions["detected"]
|
1524
|
-
if not (
|
1525
|
-
getattr(r, "source", None) == source_label
|
1526
|
-
and getattr(r, "region_type", None) in _purge_types
|
1527
|
-
)
|
1528
|
-
]
|
1529
|
-
except Exception as _cleanup_err:
|
1530
|
-
logger.warning(
|
1531
|
-
f"Table-region cleanup failed: {_cleanup_err}", exc_info=True
|
1532
|
-
)
|
1533
|
-
|
1534
|
-
# Get lines with the specified source
|
1535
|
-
all_lines = element_manager.lines # Access lines from the correct element manager
|
1536
|
-
filtered_lines = [
|
1537
|
-
line for line in all_lines if getattr(line, "source", None) == source_label
|
1538
|
-
]
|
1539
|
-
|
1540
|
-
if not filtered_lines:
|
1541
|
-
logger.info(
|
1542
|
-
f"No lines found with source '{source_label}' for table structure detection on {self}."
|
1543
|
-
)
|
1544
|
-
return self
|
1545
|
-
|
1546
|
-
# Separate horizontal and vertical lines
|
1547
|
-
# For regions, line coordinates are already absolute to the page.
|
1548
|
-
horizontal_lines = [line for line in filtered_lines if line.is_horizontal]
|
1549
|
-
vertical_lines = [line for line in filtered_lines if line.is_vertical]
|
1550
|
-
|
1551
|
-
logger.info(
|
1552
|
-
f"Found {len(horizontal_lines)} horizontal and {len(vertical_lines)} vertical lines for {self} with source '{source_label}'."
|
1553
|
-
)
|
1554
|
-
|
1555
|
-
# Define boundaries based on line positions (mid-points for sorting, actual edges for boundaries)
|
1556
|
-
# These coordinates are relative to the page_object_for_elements (which is always a Page)
|
1557
|
-
|
1558
|
-
# Horizontal line Y-coordinates (use average y, effectively the line's y-position)
|
1559
|
-
h_line_ys = sorted(list(set([(line.top + line.bottom) / 2 for line in horizontal_lines])))
|
1560
|
-
|
1561
|
-
# Vertical line X-coordinates (use average x, effectively the line's x-position)
|
1562
|
-
v_line_xs = sorted(list(set([(line.x0 + line.x1) / 2 for line in vertical_lines])))
|
1563
|
-
|
1564
|
-
row_boundaries = []
|
1565
|
-
if horizontal_lines:
|
1566
|
-
if not ignore_outer_regions:
|
1567
|
-
row_boundaries.append(origin_y) # Region's top or Page's 0
|
1568
|
-
row_boundaries.extend(h_line_ys)
|
1569
|
-
if not ignore_outer_regions:
|
1570
|
-
row_boundaries.append(origin_y + context_height) # Region's bottom or Page's height
|
1571
|
-
elif not ignore_outer_regions: # No horizontal lines, but we might want full height cells
|
1572
|
-
row_boundaries.extend([origin_y, origin_y + context_height])
|
1573
|
-
row_boundaries = sorted(list(set(row_boundaries)))
|
1574
|
-
|
1575
|
-
col_boundaries = []
|
1576
|
-
if vertical_lines:
|
1577
|
-
if not ignore_outer_regions:
|
1578
|
-
col_boundaries.append(origin_x) # Region's left or Page's 0
|
1579
|
-
col_boundaries.extend(v_line_xs)
|
1580
|
-
if not ignore_outer_regions:
|
1581
|
-
col_boundaries.append(origin_x + context_width) # Region's right or Page's width
|
1582
|
-
elif not ignore_outer_regions: # No vertical lines, but we might want full width cells
|
1583
|
-
col_boundaries.extend([origin_x, origin_x + context_width])
|
1584
|
-
col_boundaries = sorted(list(set(col_boundaries)))
|
1585
|
-
|
1586
|
-
logger.debug(f"Row boundaries for {self}: {row_boundaries}")
|
1587
|
-
logger.debug(f"Col boundaries for {self}: {col_boundaries}")
|
1588
|
-
|
1589
|
-
# Create overall table region that wraps the entire structure
|
1590
|
-
tables_created = 0
|
1591
|
-
if len(row_boundaries) >= 2 and len(col_boundaries) >= 2:
|
1592
|
-
table_left = col_boundaries[0]
|
1593
|
-
table_top = row_boundaries[0]
|
1594
|
-
table_right = col_boundaries[-1]
|
1595
|
-
table_bottom = row_boundaries[-1]
|
1596
|
-
|
1597
|
-
if table_right > table_left and table_bottom > table_top:
|
1598
|
-
try:
|
1599
|
-
table_region = page_object_for_elements.create_region(
|
1600
|
-
table_left, table_top, table_right, table_bottom
|
1601
|
-
)
|
1602
|
-
table_region.source = source_label
|
1603
|
-
table_region.region_type = "table"
|
1604
|
-
table_region.normalized_type = (
|
1605
|
-
"table" # Add normalized_type for selector compatibility
|
1606
|
-
)
|
1607
|
-
table_region.metadata.update(
|
1608
|
-
{
|
1609
|
-
"source_lines_label": source_label,
|
1610
|
-
"num_rows": len(row_boundaries) - 1,
|
1611
|
-
"num_cols": len(col_boundaries) - 1,
|
1612
|
-
"boundaries": {"rows": row_boundaries, "cols": col_boundaries},
|
1613
|
-
}
|
1614
|
-
)
|
1615
|
-
element_manager.add_element(table_region, element_type="regions")
|
1616
|
-
tables_created += 1
|
1617
|
-
logger.debug(
|
1618
|
-
f"Created table region: L{table_left:.1f} T{table_top:.1f} R{table_right:.1f} B{table_bottom:.1f}"
|
1619
|
-
)
|
1620
|
-
except Exception as e:
|
1621
|
-
logger.error(
|
1622
|
-
f"Failed to create or add table Region: {e}. Table abs coords: L{table_left} T{table_top} R{table_right} B{table_bottom}",
|
1623
|
-
exc_info=True,
|
1624
|
-
)
|
1625
|
-
|
1626
|
-
# Create cell regions
|
1627
|
-
cells_created = 0
|
1628
|
-
rows_created = 0
|
1629
|
-
cols_created = 0
|
1630
|
-
|
1631
|
-
# Create Row Regions
|
1632
|
-
if len(row_boundaries) >= 2:
|
1633
|
-
# Determine horizontal extent for rows
|
1634
|
-
row_extent_x0 = origin_x
|
1635
|
-
row_extent_x1 = origin_x + context_width
|
1636
|
-
if col_boundaries: # If columns are defined, rows should span only across them
|
1637
|
-
if len(col_boundaries) >= 2:
|
1638
|
-
row_extent_x0 = col_boundaries[0]
|
1639
|
-
row_extent_x1 = col_boundaries[-1]
|
1640
|
-
# If only one col_boundary (e.g. from ignore_outer_regions=False and one line), use context width
|
1641
|
-
# This case should be rare if lines are properly detected to form a grid.
|
1642
|
-
|
1643
|
-
for i in range(len(row_boundaries) - 1):
|
1644
|
-
top_abs = row_boundaries[i]
|
1645
|
-
bottom_abs = row_boundaries[i + 1]
|
1646
|
-
|
1647
|
-
# Use calculated row_extent_x0 and row_extent_x1
|
1648
|
-
if bottom_abs > top_abs and row_extent_x1 > row_extent_x0: # Ensure valid region
|
1649
|
-
try:
|
1650
|
-
row_region = page_object_for_elements.create_region(
|
1651
|
-
row_extent_x0, top_abs, row_extent_x1, bottom_abs
|
1652
|
-
)
|
1653
|
-
row_region.source = source_label
|
1654
|
-
row_region.region_type = "table_row"
|
1655
|
-
row_region.normalized_type = (
|
1656
|
-
"table_row" # Add normalized_type for selector compatibility
|
1657
|
-
)
|
1658
|
-
row_region.metadata.update(
|
1659
|
-
{"row_index": i, "source_lines_label": source_label}
|
1660
|
-
)
|
1661
|
-
element_manager.add_element(row_region, element_type="regions")
|
1662
|
-
rows_created += 1
|
1663
|
-
except Exception as e:
|
1664
|
-
logger.error(
|
1665
|
-
f"Failed to create or add table_row Region: {e}. Row abs coords: L{row_extent_x0} T{top_abs} R{row_extent_x1} B{bottom_abs}",
|
1666
|
-
exc_info=True,
|
1667
|
-
)
|
1668
|
-
|
1669
|
-
# Create Column Regions
|
1670
|
-
if len(col_boundaries) >= 2:
|
1671
|
-
# Determine vertical extent for columns
|
1672
|
-
col_extent_y0 = origin_y
|
1673
|
-
col_extent_y1 = origin_y + context_height
|
1674
|
-
if row_boundaries: # If rows are defined, columns should span only across them
|
1675
|
-
if len(row_boundaries) >= 2:
|
1676
|
-
col_extent_y0 = row_boundaries[0]
|
1677
|
-
col_extent_y1 = row_boundaries[-1]
|
1678
|
-
# If only one row_boundary, use context height - similar logic to rows
|
1679
|
-
|
1680
|
-
for j in range(len(col_boundaries) - 1):
|
1681
|
-
left_abs = col_boundaries[j]
|
1682
|
-
right_abs = col_boundaries[j + 1]
|
1683
|
-
|
1684
|
-
# Use calculated col_extent_y0 and col_extent_y1
|
1685
|
-
if right_abs > left_abs and col_extent_y1 > col_extent_y0: # Ensure valid region
|
1686
|
-
try:
|
1687
|
-
col_region = page_object_for_elements.create_region(
|
1688
|
-
left_abs, col_extent_y0, right_abs, col_extent_y1
|
1689
|
-
)
|
1690
|
-
col_region.source = source_label
|
1691
|
-
col_region.region_type = "table_column"
|
1692
|
-
col_region.normalized_type = (
|
1693
|
-
"table_column" # Add normalized_type for selector compatibility
|
1694
|
-
)
|
1695
|
-
col_region.metadata.update(
|
1696
|
-
{"col_index": j, "source_lines_label": source_label}
|
1697
|
-
)
|
1698
|
-
element_manager.add_element(col_region, element_type="regions")
|
1699
|
-
cols_created += 1
|
1700
|
-
except Exception as e:
|
1701
|
-
logger.error(
|
1702
|
-
f"Failed to create or add table_column Region: {e}. Col abs coords: L{left_abs} T{col_extent_y0} R{right_abs} B{col_extent_y1}",
|
1703
|
-
exc_info=True,
|
1704
|
-
)
|
1705
|
-
|
1706
|
-
# Create Cell Regions (existing logic)
|
1707
|
-
if len(row_boundaries) < 2 or len(col_boundaries) < 2:
|
1708
|
-
logger.info(
|
1709
|
-
f"Not enough boundaries to form cells for {self}. Rows: {len(row_boundaries)}, Cols: {len(col_boundaries)}"
|
1710
|
-
)
|
1711
|
-
# return self # Return will be at the end
|
1712
|
-
else:
|
1713
|
-
for i in range(len(row_boundaries) - 1):
|
1714
|
-
top_abs = row_boundaries[i]
|
1715
|
-
bottom_abs = row_boundaries[i + 1]
|
1716
|
-
|
1717
|
-
for j in range(len(col_boundaries) - 1):
|
1718
|
-
left_abs = col_boundaries[j]
|
1719
|
-
right_abs = col_boundaries[j + 1]
|
1720
|
-
|
1721
|
-
cell_left_abs = left_abs + cell_padding
|
1722
|
-
cell_top_abs = top_abs + cell_padding
|
1723
|
-
cell_right_abs = right_abs - cell_padding
|
1724
|
-
cell_bottom_abs = bottom_abs - cell_padding
|
1725
|
-
|
1726
|
-
cell_width = cell_right_abs - cell_left_abs
|
1727
|
-
cell_height = cell_bottom_abs - cell_top_abs
|
1728
|
-
|
1729
|
-
if cell_width <= 0 or cell_height <= 0:
|
1730
|
-
logger.debug(
|
1731
|
-
f"Skipping cell (zero or negative dimension after padding): L{left_abs:.1f} T{top_abs:.1f} R{right_abs:.1f} B{bottom_abs:.1f} -> W{cell_width:.1f} H{cell_height:.1f}"
|
1732
|
-
)
|
1733
|
-
continue
|
1734
|
-
|
1735
|
-
try:
|
1736
|
-
cell_region = page_object_for_elements.create_region(
|
1737
|
-
cell_left_abs, cell_top_abs, cell_right_abs, cell_bottom_abs
|
1738
|
-
)
|
1739
|
-
cell_region.source = source_label
|
1740
|
-
cell_region.region_type = "table_cell"
|
1741
|
-
cell_region.normalized_type = (
|
1742
|
-
"table_cell" # Add normalized_type for selector compatibility
|
1743
|
-
)
|
1744
|
-
cell_region.metadata.update(
|
1745
|
-
{
|
1746
|
-
"row_index": i,
|
1747
|
-
"col_index": j,
|
1748
|
-
"source_lines_label": source_label,
|
1749
|
-
"original_boundaries_abs": {
|
1750
|
-
"left": left_abs,
|
1751
|
-
"top": top_abs,
|
1752
|
-
"right": right_abs,
|
1753
|
-
"bottom": bottom_abs,
|
1754
|
-
},
|
1755
|
-
}
|
1756
|
-
)
|
1757
|
-
element_manager.add_element(cell_region, element_type="regions")
|
1758
|
-
cells_created += 1
|
1759
|
-
except Exception as e:
|
1760
|
-
logger.error(
|
1761
|
-
f"Failed to create or add cell Region: {e}. Cell abs coords: L{cell_left_abs} T{cell_top_abs} R{cell_right_abs} B{cell_bottom_abs}",
|
1762
|
-
exc_info=True,
|
1763
|
-
)
|
1764
|
-
|
1765
|
-
logger.info(
|
1766
|
-
f"Created {tables_created} table, {rows_created} rows, {cols_created} columns, and {cells_created} table cells from detected lines (source: '{source_label}') for {self}."
|
1767
|
-
)
|
1768
|
-
|
1769
|
-
return self
|
1770
|
-
|
1771
1132
|
def detect_blobs(
|
1772
1133
|
self,
|
1773
1134
|
k: Optional[int] = None,
|
@@ -1993,14 +1354,3 @@ class ShapeDetectionMixin:
|
|
1993
1354
|
page_obj._element_mgr.add_region(region)
|
1994
1355
|
|
1995
1356
|
return self
|
1996
|
-
|
1997
|
-
|
1998
|
-
# Example usage would be:
|
1999
|
-
# page.detect_lines(source_label="my_table_lines")
|
2000
|
-
# page.detect_table_structure_from_lines(source_label="my_table_lines", cell_padding=0.5)
|
2001
|
-
#
|
2002
|
-
# Now both selector styles work equivalently:
|
2003
|
-
# table = page.find('table[source*="table_from"]') # Direct type selector
|
2004
|
-
# table = page.find('region[type="table"][source*="table_from"]') # Region attribute selector
|
2005
|
-
# cells = page.find_all('table-cell[source*="table_cells_from"]') # Direct type selector
|
2006
|
-
# cells = page.find_all('region[type="table-cell"][source*="table_cells_from"]') # Region attribute selector
|