natural-pdf 0.1.40__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff compares publicly released package versions from a supported registry. It is provided for informational purposes only and reflects the changes between the versions as published in that registry.
- natural_pdf/__init__.py +6 -7
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +236 -383
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +172 -83
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +318 -243
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +4 -4
- natural_pdf/flows/flow.py +1200 -243
- natural_pdf/flows/region.py +707 -261
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +7 -3
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -53
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
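The renames above (natural_pdf/elements/collections.py → element_collection.py, natural_pdf/collections/pdf_collection.py → core/pdf_collection.py) imply an import-path migration for downstream code. A minimal sketch of the update; the 0.1.40 path is an assumption inferred from the old file name, while the 0.2.0 path is confirmed by the guides.py diff below:

    # 0.1.40 (assumed old path, matching the renamed file):
    # from natural_pdf.elements.collections import ElementCollection
    # 0.2.0:
    from natural_pdf.elements.element_collection import ElementCollection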
natural_pdf/analyzers/guides.py
CHANGED
@@ -13,7 +13,7 @@ from natural_pdf.utils.layout import merge_bboxes
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
     from natural_pdf.elements.base import Element
-    from natural_pdf.elements.…
+    from natural_pdf.elements.element_collection import ElementCollection
     from natural_pdf.elements.region import Region
     from natural_pdf.flows.region import FlowRegion
@@ -21,7 +21,8 @@ logger = logging.getLogger(__name__)
 def _normalize_markers(
-    markers: Union[str, List[str], "ElementCollection", None],
+    markers: Union[str, List[str], "ElementCollection", None],
+    obj: Union["Page", "Region", "FlowRegion"],
 ) -> List[str]:
     """
     Normalize markers parameter to a list of text strings for guide creation.
@@ -168,7 +169,7 @@ class GuidesList(UserList):
             for region in self._parent.context.constituent_regions:
                 # Normalize markers for this region
                 marker_texts = _normalize_markers(markers, region)
-
+
                 # Create guides for this region
                 region_guides = Guides.from_content(
                     obj=region,
@@ -178,24 +179,29 @@ class GuidesList(UserList):
                     outer=outer,
                     tolerance=tolerance,
                 )
-
+
                 # Collect guides from this region
                 if self._axis == "vertical":
                     all_guides.extend(region_guides.vertical)
                 else:
                     all_guides.extend(region_guides.horizontal)
-
+
             # Update parent's flow guides structure
             if append:
                 # Append to existing
-                existing = [
-                    …
-                    …
+                existing = [
+                    coord
+                    for coord, _ in (
+                        self._parent._unified_vertical
+                        if self._axis == "vertical"
+                        else self._parent._unified_horizontal
+                    )
+                ]
                 all_guides = existing + all_guides
-
+
             # Remove duplicates and sort
             unique_guides = sorted(list(set(all_guides)))
-
+
             # Clear and rebuild unified view
             if self._axis == "vertical":
                 self._parent._unified_vertical = []
@@ -221,27 +227,27 @@ class GuidesList(UserList):
                         break
             self._parent._horizontal_cache = None
             self.data = unique_guides
-
+
             # Update per-region guides
             for region in self._parent.context.constituent_regions:
                 region_verticals = []
                 region_horizontals = []
-
+
                 for coord, r in self._parent._unified_vertical:
                     if r == region:
                         region_verticals.append(coord)
-
+
                 for coord, r in self._parent._unified_horizontal:
                     if r == region:
                         region_horizontals.append(coord)
-
+
                 self._parent._flow_guides[region] = (
                     sorted(region_verticals),
-                    sorted(region_horizontals)
+                    sorted(region_horizontals),
                 )
-
+
             return self._parent
-
+
         # Original single-region logic
         # Normalize markers to list of text strings
         marker_texts = _normalize_markers(markers, target_obj)
@@ -286,7 +292,7 @@ class GuidesList(UserList):
         source_label: Optional[str] = None,
         max_lines: Optional[int] = None,
         outer: bool = False,
-        detection_method: str = "…
+        detection_method: str = "pixels",
         resolution: int = 192,
         *,
         n: Optional[int] = None,
@@ -340,7 +346,7 @@ class GuidesList(UserList):
         if self._parent.is_flow_region:
             # Create guides across all constituent regions
             all_guides = []
-
+
             for region in self._parent.context.constituent_regions:
                 # Create guides for this specific region
                 region_guides = Guides.from_lines(
@@ -353,26 +359,31 @@ class GuidesList(UserList):
                     outer=outer,
                     detection_method=detection_method,
                     resolution=resolution,
-                    **detect_kwargs
+                    **detect_kwargs,
                 )
-
+
                 # Collect guides from this region
                 if self._axis == "vertical":
                     all_guides.extend(region_guides.vertical)
                 else:
                     all_guides.extend(region_guides.horizontal)
-
+
             # Update parent's flow guides structure
             if append:
                 # Append to existing
-                existing = [
-                    …
-                    …
+                existing = [
+                    coord
+                    for coord, _ in (
+                        self._parent._unified_vertical
+                        if self._axis == "vertical"
+                        else self._parent._unified_horizontal
+                    )
+                ]
                 all_guides = existing + all_guides
-
+
             # Remove duplicates and sort
             unique_guides = sorted(list(set(all_guides)))
-
+
             # Clear and rebuild unified view
             if self._axis == "vertical":
                 self._parent._unified_vertical = []
@@ -398,25 +409,25 @@ class GuidesList(UserList):
                         break
             self._parent._horizontal_cache = None
             self.data = unique_guides
-
+
             # Update per-region guides
             for region in self._parent.context.constituent_regions:
                 region_verticals = []
                 region_horizontals = []
-
+
                 for coord, r in self._parent._unified_vertical:
                     if r == region:
                         region_verticals.append(coord)
-
+
                 for coord, r in self._parent._unified_horizontal:
                     if r == region:
                         region_horizontals.append(coord)
-
+
                 self._parent._flow_guides[region] = (
                     sorted(region_verticals),
-                    sorted(region_horizontals)
+                    sorted(region_horizontals),
                 )
-
+
             return self._parent

         # Original single-region logic
@@ -458,8 +469,11 @@ class GuidesList(UserList):
         return self._parent

     def from_whitespace(
-        self,
-        …
+        self,
+        obj: Optional[Union["Page", "Region", "FlowRegion"]] = None,
+        min_gap: float = 10,
+        *,
+        append: bool = False,
     ) -> "Guides":
         """
         Create guides from whitespace gaps.
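The new GuidesList.from_whitespace signature above takes an optional target object, a min_gap, and a keyword-only append flag. A minimal usage sketch, assuming guides is a Guides object whose .vertical attribute is this GuidesList:

    # Detect vertical whitespace gaps at least 12 pt wide and replace existing guides
    guides.vertical.from_whitespace(min_gap=12)
    # Keep existing guides and append the newly detected ones (keyword-only flag)
    guides.vertical.from_whitespace(min_gap=12, append=True)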
@@ -479,32 +493,33 @@ class GuidesList(UserList):
         if self._parent.is_flow_region:
             # Create guides across all constituent regions
             all_guides = []
-
+
             for region in self._parent.context.constituent_regions:
                 # Create guides for this specific region
-                region_guides = Guides.from_whitespace(
-                    …
-                    axis=self._axis,
-                    min_gap=min_gap
-                )
-
+                region_guides = Guides.from_whitespace(obj=region, axis=self._axis, min_gap=min_gap)
+
                 # Collect guides from this region
                 if self._axis == "vertical":
                     all_guides.extend(region_guides.vertical)
                 else:
                     all_guides.extend(region_guides.horizontal)
-
+
             # Update parent's flow guides structure
             if append:
                 # Append to existing
-                existing = [
-                    …
-                    …
+                existing = [
+                    coord
+                    for coord, _ in (
+                        self._parent._unified_vertical
+                        if self._axis == "vertical"
+                        else self._parent._unified_horizontal
+                    )
+                ]
                 all_guides = existing + all_guides
-
+
             # Remove duplicates and sort
             unique_guides = sorted(list(set(all_guides)))
-
+
             # Clear and rebuild unified view
             if self._axis == "vertical":
                 self._parent._unified_vertical = []
@@ -530,25 +545,25 @@ class GuidesList(UserList):
                         break
             self._parent._horizontal_cache = None
             self.data = unique_guides
-
+
             # Update per-region guides
             for region in self._parent.context.constituent_regions:
                 region_verticals = []
                 region_horizontals = []
-
+
                 for coord, r in self._parent._unified_vertical:
                     if r == region:
                         region_verticals.append(coord)
-
+
                 for coord, r in self._parent._unified_horizontal:
                     if r == region:
                         region_horizontals.append(coord)
-
+
                 self._parent._flow_guides[region] = (
                     sorted(region_verticals),
-                    sorted(region_horizontals)
+                    sorted(region_horizontals),
                 )
-
+
             return self._parent

         # Original single-region logic
@@ -915,7 +930,7 @@ class Guides:
         # Check if we're dealing with a FlowRegion
         self.is_flow_region = hasattr(context, "constituent_regions")
-
+
         # If FlowRegion, we'll store guides per constituent region
         if self.is_flow_region:
            self._flow_guides: Dict["Region", Tuple[List[float], List[float]]] = {}
@@ -976,7 +991,7 @@ class Guides:
         if self.is_flow_region:
             # Invalidate cache when setting new values
             self._vertical_cache = None
-
+
             if value is None:
                 self._vertical.data = []
             elif isinstance(value, Guides):
@@ -1018,7 +1033,7 @@ class Guides:
         if self.is_flow_region:
             # Invalidate cache when setting new values
             self._horizontal_cache = None
-
+
             if value is None:
                 self._horizontal.data = []
             elif isinstance(value, Guides):
@@ -1132,7 +1147,7 @@ class Guides:
         max_lines_h: Optional[int] = None,
         max_lines_v: Optional[int] = None,
         outer: bool = False,
-        detection_method: str = "…
+        detection_method: str = "pixels",
         resolution: int = 192,
         **detect_kwargs,
     ) -> "Guides":
@@ -1163,7 +1178,7 @@ class Guides:
         # Handle FlowRegion
         if hasattr(obj, "constituent_regions"):
             guides = cls(context=obj)
-
+
             # Process each constituent region
             for region in obj.constituent_regions:
                 # Create guides for this specific region
@@ -1177,27 +1192,27 @@ class Guides:
                     outer=outer,
                     detection_method=detection_method,
                     resolution=resolution,
-                    **detect_kwargs
+                    **detect_kwargs,
                 )
-
+
                 # Store in flow guides
                 guides._flow_guides[region] = (
                     list(region_guides.vertical),
-                    list(region_guides.horizontal)
+                    list(region_guides.horizontal),
                 )
-
+
                 # Add to unified view
                 for v in region_guides.vertical:
                     guides._unified_vertical.append((v, region))
                 for h in region_guides.horizontal:
                     guides._unified_horizontal.append((h, region))
-
+
             # Invalidate caches to force rebuild on next access
             guides._vertical_cache = None
             guides._horizontal_cache = None
-
+
             return guides
-
+
         # Original single-region logic follows...
         # Get bounds for potential outer guides
         if hasattr(obj, "bbox"):
@@ -1228,12 +1243,17 @@ class Guides:
         }

         # Handle threshold parameter
-        if threshold == "auto":
+        if threshold == "auto" and detection_method == "vector":
             # Auto mode: use very low thresholds with max_lines constraints
             detect_params["peak_threshold_h"] = 0.0
             detect_params["peak_threshold_v"] = 0.0
             detect_params["max_lines_h"] = max_lines_h
             detect_params["max_lines_v"] = max_lines_v
+        if threshold == "auto" and detection_method == "pixels":
+            detect_params["peak_threshold_h"] = 0.5
+            detect_params["peak_threshold_v"] = 0.5
+            detect_params["max_lines_h"] = max_lines_h
+            detect_params["max_lines_v"] = max_lines_v
         else:
             # Fixed threshold mode
             detect_params["peak_threshold_h"] = (
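The hunk above shows how threshold="auto" is resolved per detection method in 0.2.0: the vector path keeps the 0.0 peak thresholds plus the max_lines caps, while the new default pixels path uses 0.5. A hedged sketch of calling from_lines under that behavior; the page variable and any parameters not visible in this diff are assumptions:

    # Default in 0.2.0: pixel-based line detection at 192 DPI
    guides = Guides.from_lines(page, detection_method="pixels", resolution=192)
    # Vector-based detection, capped to at most 10 horizontal and 6 vertical lines
    guides = Guides.from_lines(page, detection_method="vector", max_lines_h=10, max_lines_v=6)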
@@ -1275,6 +1295,7 @@ class Guides:
         lines = []

         # Filter by the source we just used
+
         lines = [
             l for l in lines if getattr(l, "source", None) == detect_params["source_label"]
         ]
@@ -1399,7 +1420,7 @@ class Guides:
         # Handle FlowRegion
         if hasattr(obj, "constituent_regions"):
             guides = cls(context=obj)
-
+
             # Process each constituent region
             for region in obj.constituent_regions:
                 # Create guides for this specific region
@@ -1409,27 +1430,27 @@ class Guides:
                     markers=markers,
                     align=align,
                     outer=outer,
-                    tolerance=tolerance
+                    tolerance=tolerance,
                 )
-
+
                 # Store in flow guides
                 guides._flow_guides[region] = (
                     list(region_guides.vertical),
-                    list(region_guides.horizontal)
+                    list(region_guides.horizontal),
                 )
-
+
                 # Add to unified view
                 for v in region_guides.vertical:
                     guides._unified_vertical.append((v, region))
                 for h in region_guides.horizontal:
                     guides._unified_horizontal.append((h, region))
-
+
             # Invalidate caches
             guides._vertical_cache = None
             guides._horizontal_cache = None
-
+
             return guides
-
+
         # Original single-region logic follows...
         guides_coords = []
         bounds = None
@@ -1594,27 +1615,38 @@ class Guides:
         if self.is_flow_region:
             all_text_elements = []
             region_bounds = {}
-
+
             for region in self.context.constituent_regions:
                 # Get text elements from this region
                 if hasattr(region, "find_all"):
                     try:
                         text_elements = region.find_all("text", apply_exclusions=False)
-                        elements = …
+                        elements = (
+                            text_elements.elements
+                            if hasattr(text_elements, "elements")
+                            else text_elements
+                        )
                         all_text_elements.extend(elements)
-
+
                         # Store bounds for each region
                         if hasattr(region, "bbox"):
                             region_bounds[region] = region.bbox
                         elif hasattr(region, "x0"):
-                            region_bounds[region] = (…
+                            region_bounds[region] = (
+                                region.x0,
+                                region.top,
+                                region.x1,
+                                region.bottom,
+                            )
                     except Exception as e:
                         logger.warning(f"Error getting text elements from region: {e}")
-
+
             if not all_text_elements:
-                logger.warning(…
+                logger.warning(
+                    "No text elements found across flow regions for whitespace detection"
+                )
                 return self
-
+
             # Find whitespace gaps across all regions
             if axis == "vertical":
                 gaps = self._find_vertical_whitespace_gaps(all_text_elements, min_gap, threshold)
@@ -1624,14 +1656,14 @@ class Guides:
                 for coord, region in self._unified_vertical:
                     all_guides.append(coord)
                     guide_to_region_map.setdefault(coord, []).append(region)
-
+
                 if gaps and all_guides:
                     # Keep a copy of original guides to maintain mapping
                     original_guides = all_guides.copy()
-
+
                     # Snap guides to gaps
                     self._snap_guides_to_gaps(all_guides, gaps, axis)
-
+
                     # Update the unified view with snapped positions
                     self._unified_vertical = []
                     for i, new_coord in enumerate(all_guides):
@@ -1641,7 +1673,7 @@ class Guides:
                         regions = guide_to_region_map.get(original_coord, [])
                         for region in regions:
                             self._unified_vertical.append((new_coord, region))
-
+
                     # Update individual region guides
                     for region in self._flow_guides:
                         region_verticals = []
@@ -1649,13 +1681,13 @@ class Guides:
                             if r == region:
                                 region_verticals.append(coord)
                         self._flow_guides[region] = (
-                            sorted(list(set(region_verticals))),
-                            self._flow_guides[region][1]
+                            sorted(list(set(region_verticals))),  # Deduplicate here
+                            self._flow_guides[region][1],
                         )
-
+
                 # Invalidate cache
                 self._vertical_cache = None
-
+
             elif axis == "horizontal":
                 gaps = self._find_horizontal_whitespace_gaps(all_text_elements, min_gap, threshold)
                 # Get all horizontal guides across regions
@@ -1664,14 +1696,14 @@ class Guides:
                 for coord, region in self._unified_horizontal:
                     all_guides.append(coord)
                     guide_to_region_map.setdefault(coord, []).append(region)
-
+
                 if gaps and all_guides:
                     # Keep a copy of original guides to maintain mapping
                     original_guides = all_guides.copy()
-
+
                     # Snap guides to gaps
                     self._snap_guides_to_gaps(all_guides, gaps, axis)
-
+
                     # Update the unified view with snapped positions
                     self._unified_horizontal = []
                     for i, new_coord in enumerate(all_guides):
@@ -1680,7 +1712,7 @@ class Guides:
                         regions = guide_to_region_map.get(original_coord, [])
                         for region in regions:
                             self._unified_horizontal.append((new_coord, region))
-
+
                     # Update individual region guides
                     for region in self._flow_guides:
                         region_horizontals = []
@@ -1689,17 +1721,17 @@ class Guides:
                                 region_horizontals.append(coord)
                         self._flow_guides[region] = (
                             self._flow_guides[region][0],
-                            sorted(list(set(region_horizontals)))
+                            sorted(list(set(region_horizontals))),  # Deduplicate here
                         )
-
+
                 # Invalidate cache
                 self._horizontal_cache = None
-
+
             else:
                 raise ValueError("axis must be 'vertical' or 'horizontal'")
-
+
             return self
-
+
         # Original single-region logic
         # Get elements for trough detection
         text_elements = self._get_text_elements()
@@ -1794,10 +1826,10 @@ class Guides:

         # Handle FlowRegion context merging
         new_context = self.context or other.context
-
+
         # If both are flow regions, we might need a more complex merge,
         # but for now, just picking one context is sufficient.
-
+
         # Create the new Guides object
         new_guides = Guides(
             verticals=combined_verticals,
@@ -1815,7 +1847,7 @@ class Guides:
             new_guides._flow_guides.update(self._flow_guides)
         if hasattr(other, "_flow_guides"):
             new_guides._flow_guides.update(other._flow_guides)
-
+
         # Re-initialize unified views
         if hasattr(self, "_unified_vertical"):
             new_guides._unified_vertical.extend(self._unified_vertical)
@@ -1826,7 +1858,7 @@ class Guides:
             new_guides._unified_horizontal.extend(self._unified_horizontal)
         if hasattr(other, "_unified_horizontal"):
             new_guides._unified_horizontal.extend(other._unified_horizontal)
-
+
         # Invalidate caches to force rebuild
         new_guides._vertical_cache = None
         new_guides._horizontal_cache = None
@@ -1850,65 +1882,73 @@ class Guides:
         if self.is_flow_region and (on is None or on == self.context):
            if not self._flow_guides:
                 raise ValueError("No guides to show for FlowRegion")
-
+
             # Get stacking parameters from kwargs or use defaults
-            stack_direction = kwargs.get(…
-            stack_gap = kwargs.get(…
-            stack_background_color = kwargs.get(…
-
+            stack_direction = kwargs.get("stack_direction", "vertical")
+            stack_gap = kwargs.get("stack_gap", 5)
+            stack_background_color = kwargs.get("stack_background_color", (255, 255, 255))
+
             # First, render all constituent regions without guides to get base images
             base_images = []
             region_infos = []  # Store region info for guide coordinate mapping
-
+
             for region in self.context.constituent_regions:
                 try:
-                    # Render region without guides
-                    …
+                    # Render region without guides using new system
+                    if hasattr(region, "render"):
+                        img = region.render(
+                            resolution=kwargs.get("resolution", 150),
+                            width=kwargs.get("width", None),
+                            crop=True,  # Always crop regions to their bounds
+                        )
+                    else:
+                        # Fallback to old method
+                        img = region.render(**kwargs)
                     if img:
                         base_images.append(img)
-
+
                         # Calculate scaling factors for this region
                         scale_x = img.width / region.width
                         scale_y = img.height / region.height
-
-                        region_infos.append(…
-                            …
+
+                        region_infos.append(
+                            {
+                                "region": region,
+                                "img_width": img.width,
+                                "img_height": img.height,
+                                "scale_x": scale_x,
+                                "scale_y": scale_y,
+                                "pdf_x0": region.x0,
+                                "pdf_top": region.top,
+                                "pdf_x1": region.x1,
+                                "pdf_bottom": region.bottom,
+                            }
+                        )
                 except Exception as e:
                     logger.warning(f"Failed to render region: {e}")
-
+
             if not base_images:
                 raise ValueError("Failed to render any images for FlowRegion")
-
+
             # Calculate final canvas size based on stacking direction
             if stack_direction == "vertical":
                 final_width = max(img.width for img in base_images)
                 final_height = (
-                    sum(img.height for img in base_images)
-                    + (len(base_images) - 1) * stack_gap
+                    sum(img.height for img in base_images) + (len(base_images) - 1) * stack_gap
                 )
             else:  # horizontal
                 final_width = (
-                    sum(img.width for img in base_images)
-                    + (len(base_images) - 1) * stack_gap
+                    sum(img.width for img in base_images) + (len(base_images) - 1) * stack_gap
                 )
                 final_height = max(img.height for img in base_images)
-
+
             # Create unified canvas
             canvas = Image.new("RGB", (final_width, final_height), stack_background_color)
             draw = ImageDraw.Draw(canvas)
-
+
             # Paste base images and track positions
             region_positions = []  # (region_info, paste_x, paste_y)
-
+
             if stack_direction == "vertical":
                 current_y = 0
                 for i, (img, info) in enumerate(zip(base_images, region_infos)):
@@ -1923,44 +1963,50 @@ class Guides:
                     canvas.paste(img, (current_x, paste_y))
                     region_positions.append((info, current_x, paste_y))
                     current_x += img.width + stack_gap
-
+
             # Now draw guides on the unified canvas
             # Draw vertical guides (blue) - these extend through the full canvas height
             for v_coord in self.vertical:
                 # Find which region(s) this guide intersects
                 for info, paste_x, paste_y in region_positions:
-                    if info[…
+                    if info["pdf_x0"] <= v_coord <= info["pdf_x1"]:
                         # This guide is within this region's x-bounds
                         # Convert PDF coordinate to pixel coordinate relative to the region
-                        adjusted_x = v_coord - info[…
-                        pixel_x = adjusted_x * info[…
-
+                        adjusted_x = v_coord - info["pdf_x0"]
+                        pixel_x = adjusted_x * info["scale_x"] + paste_x
+
                         # Draw full-height line on canvas (not clipped to region)
                         if 0 <= pixel_x <= final_width:
                             x_pixel = int(pixel_x)
-                            draw.line(…
-
+                            draw.line(
+                                [(x_pixel, 0), (x_pixel, final_height - 1)],
+                                fill=(0, 0, 255, 200),
+                                width=2,
+                            )
                         break  # Only draw once per guide
-
+
             # Draw horizontal guides (red) - these extend through the full canvas width
             for h_coord in self.horizontal:
                 # Find which region(s) this guide intersects
                 for info, paste_x, paste_y in region_positions:
-                    if info[…
+                    if info["pdf_top"] <= h_coord <= info["pdf_bottom"]:
                         # This guide is within this region's y-bounds
                         # Convert PDF coordinate to pixel coordinate relative to the region
-                        adjusted_y = h_coord - info[…
-                        pixel_y = adjusted_y * info[…
-
+                        adjusted_y = h_coord - info["pdf_top"]
+                        pixel_y = adjusted_y * info["scale_y"] + paste_y
+
                         # Draw full-width line on canvas (not clipped to region)
                         if 0 <= pixel_y <= final_height:
                             y_pixel = int(pixel_y)
-                            draw.line(…
-
+                            draw.line(
+                                [(0, y_pixel), (final_width - 1, y_pixel)],
+                                fill=(255, 0, 0, 200),
+                                width=2,
+                            )
                         break  # Only draw once per guide
-
+
             return canvas
-
+
         # Original single-region logic follows...
         # Determine what to display guides on
         target = on if on is not None else self.context
@@ -1981,10 +2027,15 @@ class Guides:
             raise ValueError("No target specified and no context available for guides display")

         # Prepare kwargs for image generation
-        image_kwargs = …
+        image_kwargs = {}

-        # …
-        …
+        # Extract only the parameters that the new render() method accepts
+        if "resolution" in kwargs:
+            image_kwargs["resolution"] = kwargs["resolution"]
+        if "width" in kwargs:
+            image_kwargs["width"] = kwargs["width"]
+        if "crop" in kwargs:
+            image_kwargs["crop"] = kwargs["crop"]

         # If target is a region-like object, crop to just that region
         if hasattr(target, "bbox") and hasattr(target, "page"):
@@ -1992,13 +2043,17 @@ class Guides:
             image_kwargs["crop"] = True

         # Get base image
-        if hasattr(target, "…
-            …
+        if hasattr(target, "render"):
+            # Use the new unified rendering system
+            img = target.render(**image_kwargs)
+        elif hasattr(target, "render"):
+            # Fallback to old method if available
+            img = target.render(**image_kwargs)
         elif hasattr(target, "mode") and hasattr(target, "size"):
             # It's already a PIL Image
             img = target
         else:
-            raise ValueError(f"Object {target} does not support …
+            raise ValueError(f"Object {target} does not support render() and is not a PIL Image")

         if img is None:
             raise ValueError("Failed to generate base image")
@@ -2599,64 +2654,70 @@ class Guides:
             source: Source label for created regions (for identification)
             cell_padding: Internal padding for cell regions in points
             include_outer_boundaries: Whether to add boundaries at edges if missing
-            multi_page: Controls multi-…
-                - "auto": (default) Creates a …
-                - True: Forces creation of a multi-…
-                - False: Creates separate grids for each …
+            multi_page: Controls multi-region table creation for FlowRegions.
+                - "auto": (default) Creates a unified grid if there are multiple regions or guides span pages.
+                - True: Forces creation of a unified multi-region grid.
+                - False: Creates separate grids for each region.

         Returns:
             Dictionary with 'counts' and 'regions' created.
         """
         # Dispatch to appropriate implementation based on context and flags
         if self.is_flow_region:
+            # Check if we should create a unified multi-region grid
+            has_multiple_regions = len(self.context.constituent_regions) > 1
             spans_pages = self._spans_pages()
-
+
+            # Create unified grid if:
+            # - multi_page is explicitly True, OR
+            # - multi_page is "auto" AND (spans pages OR has multiple regions)
+            if multi_page is True or (
+                multi_page == "auto" and (spans_pages or has_multiple_regions)
+            ):
                 return self._build_grid_multi_page(
                     source=source,
                     cell_padding=cell_padding,
                     include_outer_boundaries=include_outer_boundaries,
                 )
             else:
-                # FlowRegion …
+                # Single region FlowRegion or multi_page=False: create separate tables per region
                 total_counts = {"table": 0, "rows": 0, "columns": 0, "cells": 0}
                 all_regions = {"table": [], "rows": [], "columns": [], "cells": []}
-
+
                 for region in self.context.constituent_regions:
                     if region in self._flow_guides:
                         verticals, horizontals = self._flow_guides[region]
-
+
                         region_guides = Guides(
-                            verticals=verticals,
-                            horizontals=horizontals,
-                            context=region
+                            verticals=verticals, horizontals=horizontals, context=region
                         )
-
+
                         try:
                             result = region_guides._build_grid_single_page(
                                 target=region,
                                 source=source,
                                 cell_padding=cell_padding,
-                                include_outer_boundaries=include_outer_boundaries
+                                include_outer_boundaries=include_outer_boundaries,
                             )
-
+
                             for key in total_counts:
                                 total_counts[key] += result["counts"][key]
-
+
                             if result["regions"]["table"]:
                                 all_regions["table"].append(result["regions"]["table"])
                             all_regions["rows"].extend(result["regions"]["rows"])
                             all_regions["columns"].extend(result["regions"]["columns"])
                             all_regions["cells"].extend(result["regions"]["cells"])
-
+
                         except Exception as e:
                             logger.warning(f"Failed to build grid on region: {e}")
-
+
                 logger.info(
                     f"Created {total_counts['table']} tables, {total_counts['rows']} rows, "
                     f"{total_counts['columns']} columns, and {total_counts['cells']} cells "
                     f"from guides across {len(self._flow_guides)} regions"
                 )
-
+
                 return {"counts": total_counts, "regions": all_regions}

         # Fallback for single page/region
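build_grid's multi_page parameter, documented in the hunk above, controls whether a FlowRegion produces one unified grid or one grid per constituent region. A brief sketch, assuming guides was created on a multi-page FlowRegion:

    # "auto" (default): unified grid when the guides span pages or multiple regions
    result = guides.build_grid(multi_page="auto")
    counts, regions = result["counts"], result["regions"]
    # Force separate per-region grids instead
    per_region = guides.build_grid(multi_page=False)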
@@ -2673,7 +2734,16 @@ class Guides:
         cell_padding: float,
         include_outer_boundaries: bool,
     ) -> Dict[str, Any]:
-        """
+        """
+        Builds a single, coherent grid across multiple regions of a FlowRegion.
+
+        Creates physical Region objects for each constituent region with _fragment
+        region types (e.g., table_column_fragment), then stitches them into logical
+        FlowRegion objects. Both are registered with pages, but the fragment types
+        allow easy differentiation:
+        - find_all('table_column') returns only logical columns
+        - find_all('table_column_fragment') returns only physical fragments
+        """
         from natural_pdf.flows.region import FlowRegion

         if not self.is_flow_region or not hasattr(self.context, "flow") or not self.context.flow:
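The docstring above distinguishes logical FlowRegion pieces from per-page physical fragments by region type. A short query sketch, assuming page is a natural-pdf Page that hosts part of such a grid:

    logical_columns = page.find_all("table_column")            # stitched multi-page columns
    physical_pieces = page.find_all("table_column_fragment")   # per-page fragments only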
@@ -2699,9 +2769,9 @@ class Guides:
             # Ensure the region's own boundaries are included to close off cells at page breaks
             clipped_verticals = sorted(list(set([bounds[0], bounds[2]] + clipped_verticals)))
             clipped_horizontals = sorted(list(set([bounds[1], bounds[3]] + clipped_horizontals)))
-
+
             if len(clipped_verticals) < 2 or len(clipped_horizontals) < 2:
-                continue
+                continue  # Not enough guides to form a cell

             region_guides = Guides(
                 verticals=clipped_verticals,
@@ -2713,10 +2783,30 @@ class Guides:
                 target=region,
                 source=source,
                 cell_padding=cell_padding,
-                include_outer_boundaries=False,
+                include_outer_boundaries=False,  # Boundaries are already handled
             )

             if grid_parts["counts"]["table"] > 0:
+                # Mark physical regions as fragments by updating their region_type
+                # This happens before stitching into logical FlowRegions
+                if len(self.context.constituent_regions) > 1:
+                    # Update region types to indicate these are fragments
+                    if grid_parts["regions"]["table"]:
+                        grid_parts["regions"]["table"].region_type = "table_fragment"
+                        grid_parts["regions"]["table"].metadata["is_fragment"] = True
+
+                    for row in grid_parts["regions"]["rows"]:
+                        row.region_type = "table_row_fragment"
+                        row.metadata["is_fragment"] = True
+
+                    for col in grid_parts["regions"]["columns"]:
+                        col.region_type = "table_column_fragment"
+                        col.metadata["is_fragment"] = True
+
+                    for cell in grid_parts["regions"]["cells"]:
+                        cell.region_type = "table_cell_fragment"
+                        cell.metadata["is_fragment"] = True
+
                 results_by_region.append(grid_parts)

         if not results_by_region:
@@ -2738,7 +2828,7 @@ class Guides:
         multi_page_table.metadata.update(
             {"is_multi_page": True, "num_rows": self.n_rows, "num_cols": self.n_cols}
         )
-
+
         # Initialize final region collections
         final_rows = []
         final_cols = []
@@ -2756,32 +2846,53 @@ class Guides:
             # Iterate through page breaks to merge split rows/cells
             for i in range(len(results_by_region) - 1):
                 region_A = self.context.constituent_regions[i]
-
+
                 # Check if a guide exists at the boundary
-                is_break_bounded = any(…
+                is_break_bounded = any(
+                    abs(h - region_A.bottom) < 0.1 for h in self.horizontal.data
+                )

-                if not is_break_bounded and page_rows[i] and page_rows[i+1]:
+                if not is_break_bounded and page_rows[i] and page_rows[i + 1]:
                     # No guide at break -> merge last row of A with first row of B
                     last_row_A = page_rows[i].pop(-1)
-                    first_row_B = page_rows[i+1].pop(0)
-
-                    merged_row = FlowRegion(
+                    first_row_B = page_rows[i + 1].pop(0)
+
+                    merged_row = FlowRegion(
+                        flow, [last_row_A, first_row_B], source_flow_element=None
+                    )
                     merged_row.source = source
                     merged_row.region_type = "table_row"
-                    merged_row.metadata.update(
-                        …
+                    merged_row.metadata.update(
+                        {
+                            "row_index": last_row_A.metadata.get("row_index"),
+                            "is_multi_page": True,
+                        }
+                    )
+                    page_rows[i].append(merged_row)  # Add merged row back in place of A's last

                     # Merge the corresponding cells using explicit row/col indices
                     last_row_idx = last_row_A.metadata.get("row_index")
                     first_row_idx = first_row_B.metadata.get("row_index")

                     # Cells belonging to those rows
-                    last_cells_A = [
-                        …
+                    last_cells_A = [
+                        c for c in page_cells[i] if c.metadata.get("row_index") == last_row_idx
+                    ]
+                    first_cells_B = [
+                        c
+                        for c in page_cells[i + 1]
+                        if c.metadata.get("row_index") == first_row_idx
+                    ]

                     # Remove them from their page lists
-                    page_cells[i]…
-                    …
+                    page_cells[i] = [
+                        c for c in page_cells[i] if c.metadata.get("row_index") != last_row_idx
+                    ]
+                    page_cells[i + 1] = [
+                        c
+                        for c in page_cells[i + 1]
+                        if c.metadata.get("row_index") != first_row_idx
+                    ]

                     # Sort both lists by column index to keep alignment stable
                     last_cells_A.sort(key=lambda c: c.metadata.get("col_index", 0))
@@ -2789,14 +2900,18 @@ class Guides:

                     # Pair-wise merge
                     for cell_A, cell_B in zip(last_cells_A, first_cells_B):
-                        merged_cell = FlowRegion(
+                        merged_cell = FlowRegion(
+                            flow, [cell_A, cell_B], source_flow_element=None
+                        )
                         merged_cell.source = source
                         merged_cell.region_type = "table_cell"
-                        merged_cell.metadata.update(
-                            …
-                            …
-                            …
+                        merged_cell.metadata.update(
+                            {
+                                "row_index": cell_A.metadata.get("row_index"),
+                                "col_index": cell_A.metadata.get("col_index"),
+                                "is_multi_page": True,
+                            }
+                        )
                         page_cells[i].append(merged_cell)

             # Flatten the potentially modified lists of rows and cells
@@ -2804,23 +2919,27 @@ class Guides:
             final_cells = [cell for cells_list in page_cells for cell in cells_list]

             # Stitch columns, which always span vertically
-            physical_cols_by_index = zip(…
+            physical_cols_by_index = zip(
+                *(res["regions"]["columns"] for res in results_by_region)
+            )
             for j, physical_cols in enumerate(physical_cols_by_index):
-                col_fr = FlowRegion(…
+                col_fr = FlowRegion(
+                    flow=flow, constituent_regions=list(physical_cols), source_flow_element=None
+                )
                 col_fr.source = source
                 col_fr.region_type = "table_column"
                 col_fr.metadata.update({"col_index": j, "is_multi_page": True})
                 final_cols.append(col_fr)

         elif orientation == "horizontal":
-            …
-            …
+            # Symmetric logic for horizontal flow (not fully implemented here for brevity)
+            # This would merge last column of A with first column of B if no vertical guide exists
             logger.warning("Horizontal table stitching not fully implemented.")
             final_rows = [row for res in results_by_region for row in res["regions"]["rows"]]
             final_cols = [col for res in results_by_region for col in res["regions"]["columns"]]
             final_cells = [cell for res in results_by_region for cell in res["regions"]["cells"]]
-
-        else:
+
+        else:  # Unknown orientation, just flatten everything
             final_rows = [row for res in results_by_region for row in res["regions"]["rows"]]
             final_cols = [col for res in results_by_region for col in res["regions"]["columns"]]
             final_cells = [cell for res in results_by_region for cell in res["regions"]["cells"]]
@@ -2829,65 +2948,40 @@ class Guides:
         # This ensures that page.find('table') finds the logical multi-page table, not fragments
         constituent_pages = set()
         for region in self.context.constituent_regions:
-            if hasattr(region, …
+            if hasattr(region, "page") and hasattr(region.page, "_element_mgr"):
                 constituent_pages.add(region.page)
-
-        # …
-        # …
-        physical_tables_to_remove = set(physical_tables)  # Convert to set for fast lookup
-
+
+        # Register the logical multi-page table with all constituent pages
+        # Note: Physical table fragments are already registered with region_type="table_fragment"
         for page in constituent_pages:
             try:
-                # Find and remove only the specific physical tables that are part of this multi-page table
-                existing_tables = page.find_all('table')
-                tables_to_remove = [
-                    table for table in existing_tables
-                    if (table in physical_tables_to_remove and
-                        not isinstance(table, FlowRegion))  # Only remove the specific Region tables we created
-                ]
-
-                for table in tables_to_remove:
-                    page._element_mgr.remove_element(table, element_type="regions")
-                    logger.debug(f"Removed physical table fragment from page {page.page_number}")
-
-                # Now register the multi-page table
                 page._element_mgr.add_element(multi_page_table, element_type="regions")
                 logger.debug(f"Registered multi-page table with page {page.page_number}")
-
+
             except Exception as e:
-                logger.warning(…
-                …
-        for cell in final_cells:
-            if hasattr(cell, 'constituent_regions'):
-                # This is a FlowRegion cell spanning multiple pages
-                for constituent_region in cell.constituent_regions:
-                    if hasattr(constituent_region, 'page') and hasattr(constituent_region.page, '_element_mgr'):
-                        try:
-                            constituent_region.page._element_mgr.add_element(cell, element_type="regions")
-                        except Exception as e:
-                            logger.warning(f"Failed to register multi-page cell: {e}")
+                logger.warning(
+                    f"Failed to register multi-page table with page {page.page_number}: {e}"
+                )
+
+        # SMART PAGE-LEVEL REGISTRY: Register logical FlowRegion elements.
+        # Physical fragments are already registered with their pages with _fragment region types,
+        # so users can differentiate between logical regions and physical fragments.
+        for page in constituent_pages:
+            try:
+                # Register all logical rows with this page
+                for row in final_rows:
+                    page._element_mgr.add_element(row, element_type="regions")
+
+                # Register all logical columns with this page
+                for col in final_cols:
+                    page._element_mgr.add_element(col, element_type="regions")
+
+                # Register all logical cells with this page
+                for cell in final_cells:
+                    page._element_mgr.add_element(cell, element_type="regions")
+
+            except Exception as e:
+                logger.warning(f"Failed to register multi-region table elements with page: {e}")

         final_counts = {
             "table": 1,
@@ -3130,7 +3224,9 @@ class Guides:
                 try:
                     text_elements = region.find_all("text", apply_exclusions=False)
                     elements = (
-                        text_elements.elements
+                        text_elements.elements
+                        if hasattr(text_elements, "elements")
+                        else text_elements
                     )
                     all_text_elements.extend(elements)
                 except Exception as e:
@@ -3161,7 +3257,7 @@ class Guides:
         v_guide_pages = {}
         for coord, region in self._unified_vertical:
             v_guide_pages.setdefault(coord, set()).add(region.page.page_number)
-
+
         for pages in v_guide_pages.values():
             if len(pages) > 1:
                 return True
@@ -3331,7 +3427,7 @@ class Guides:
             return "unknown"

         r1 = self.context.constituent_regions[0]
-        r2 = self.context.constituent_regions[1]
+        r2 = self.context.constituent_regions[1]  # Compare first two regions

         if not r1.bbox or not r2.bbox:
             return "unknown"