natural-pdf 0.1.27__py3-none-any.whl → 0.1.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bad_pdf_analysis/analyze_10_more.py +300 -0
- bad_pdf_analysis/analyze_final_10.py +552 -0
- bad_pdf_analysis/analyze_specific_pages.py +394 -0
- bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +45 -1
- natural_pdf/analyzers/layout/surya.py +1 -1
- natural_pdf/analyzers/layout/yolo.py +2 -2
- natural_pdf/analyzers/shape_detection_mixin.py +228 -0
- natural_pdf/classification/manager.py +67 -0
- natural_pdf/core/element_manager.py +556 -25
- natural_pdf/core/highlighting_service.py +98 -43
- natural_pdf/core/page.py +86 -20
- natural_pdf/core/pdf.py +0 -2
- natural_pdf/describe/base.py +40 -9
- natural_pdf/describe/elements.py +11 -6
- natural_pdf/elements/base.py +134 -20
- natural_pdf/elements/collections.py +43 -11
- natural_pdf/elements/image.py +43 -0
- natural_pdf/elements/region.py +64 -19
- natural_pdf/elements/text.py +89 -11
- natural_pdf/flows/collections.py +4 -4
- natural_pdf/flows/region.py +17 -2
- natural_pdf/ocr/engine_paddle.py +1 -1
- natural_pdf/ocr/ocr_factory.py +8 -8
- natural_pdf/ocr/ocr_manager.py +51 -1
- natural_pdf/selectors/parser.py +27 -7
- natural_pdf/tables/__init__.py +5 -0
- natural_pdf/tables/result.py +101 -0
- natural_pdf/utils/bidi_mirror.py +36 -0
- natural_pdf/utils/visualization.py +15 -1
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +51 -29
- natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
- optimization/memory_comparison.py +172 -0
- optimization/pdf_analyzer.py +410 -0
- optimization/performance_analysis.py +397 -0
- optimization/test_cleanup_methods.py +155 -0
- optimization/test_memory_fix.py +162 -0
- tools/bad_pdf_eval/__init__.py +1 -0
- tools/bad_pdf_eval/analyser.py +302 -0
- tools/bad_pdf_eval/collate_summaries.py +130 -0
- tools/bad_pdf_eval/eval_suite.py +116 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
- tools/bad_pdf_eval/llm_enrich.py +273 -0
- tools/bad_pdf_eval/reporter.py +17 -0
- tools/bad_pdf_eval/utils.py +127 -0
- tools/rtl_smoke_test.py +80 -0
- natural_pdf-0.1.27.dist-info/top_level.txt +0 -2
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
natural_pdf/describe/elements.py
CHANGED
@@ -279,7 +279,7 @@ def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
|
|
279
279
|
"""Analyze typography patterns in text elements."""
|
280
280
|
fonts = Counter()
|
281
281
|
sizes = Counter()
|
282
|
-
styles = {'bold': 0, 'italic': 0}
|
282
|
+
styles = {'bold': 0, 'italic': 0, 'strikeout': 0, 'underline': 0, 'highlight': 0}
|
283
283
|
colors = Counter()
|
284
284
|
|
285
285
|
for element in elements:
|
@@ -302,6 +302,12 @@ def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
|
|
302
302
|
styles['bold'] += 1
|
303
303
|
if getattr(element, 'italic', False):
|
304
304
|
styles['italic'] += 1
|
305
|
+
if getattr(element, 'strikeout', False):
|
306
|
+
styles['strikeout'] += 1
|
307
|
+
if getattr(element, 'underline', False):
|
308
|
+
styles['underline'] += 1
|
309
|
+
if getattr(element, 'highlight', False):
|
310
|
+
styles['highlight'] += 1
|
305
311
|
|
306
312
|
# Color - use TextElement's color property
|
307
313
|
color = getattr(element, 'color', None)
|
@@ -328,13 +334,12 @@ def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
|
|
328
334
|
|
329
335
|
# Styles
|
330
336
|
style_list = []
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
style_list.append(f"{styles['italic']} italic")
|
337
|
+
for style, count in styles.items():
|
338
|
+
if count > 0:
|
339
|
+
style_list.append(f"{count} {style}")
|
335
340
|
if style_list:
|
336
341
|
result['styles'] = ", ".join(style_list)
|
337
|
-
|
342
|
+
|
338
343
|
# Colors
|
339
344
|
if colors and len(colors) > 1: # Only show if there are multiple colors
|
340
345
|
result['colors'] = dict(colors.most_common())
|
natural_pdf/elements/base.py
CHANGED
@@ -414,6 +414,114 @@ class DirectionalMixin:
|
|
414
414
|
|
415
415
|
return new_region
|
416
416
|
|
417
|
+
# ------------------------------------------------------------------
|
418
|
+
# Spatial parent lookup
|
419
|
+
# ------------------------------------------------------------------
|
420
|
+
|
421
|
+
def parent(
    self,
    selector: Optional[str] = None,
    *,
    mode: str = "contains",  # "contains" | "center" | "overlap"
) -> Optional["Element"]:
    """Return the *smallest* element/region that encloses this one.

    The search is purely geometric – no pre-existing hierarchy is assumed.

    Parameters
    ----------
    selector : str, optional
        CSS-style selector used to filter candidate containers first.
    mode : str, default "contains"
        How to decide if a candidate encloses this element.

        • ``"contains"`` – candidate bbox fully contains *self* bbox.
        • ``"center"``   – candidate contains the centroid of *self*.
        • ``"overlap"``  – any bbox intersection > 0 pt².

    Returns
    -------
    Element | Region | None
        The smallest-area container that matches, or *None* if none found
        (also *None* when this object has no ``page`` or no bbox can be
        extracted for it).

    Raises
    ------
    ValueError
        If *mode* is not one of ``"contains"``, ``"center"`` or ``"overlap"``.
    """
    # Reject unknown modes up front: otherwise a typo matches no branch below
    # and the call would return None, indistinguishable from "no parent".
    if mode not in ("contains", "center", "overlap"):
        raise ValueError(
            f"Unknown mode '{mode}'. Choose from 'contains', 'center', 'overlap'."
        )

    # --- Gather candidates ------------------------------------------------
    page = getattr(self, "page", None)
    if page is None:
        return None

    # All basic elements. Best-effort: if the page cannot list its elements we
    # still fall through and consider detected regions.
    try:
        candidates: List["Element"] = list(page.get_elements(apply_exclusions=False))
    except Exception:
        candidates = []

    # Add detected regions if present
    if hasattr(page, "_element_mgr") and hasattr(page._element_mgr, "regions"):
        candidates.extend(page._element_mgr.regions)

    # Remove self from pool (identity, not equality)
    candidates = [c for c in candidates if c is not self]

    # Apply selector filtering early if provided
    if selector:
        # Imported lazily: only needed when a selector is actually supplied.
        from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

        keep = selector_to_filter_func(parse_selector(selector))
        candidates = [c for c in candidates if keep(c)]

    if not candidates:
        return None

    # Self metrics, unpacked as (x0, top, x1, bottom)
    self_bbox = extract_bbox(self)
    if self_bbox is None:
        return None
    s_x0, s_y0, s_x1, s_y1 = self_bbox
    s_cx = (s_x0 + s_x1) / 2
    s_cy = (s_y0 + s_y1) / 2

    # Single pass: test each candidate and remember the smallest-area hit.
    # Strict "<" keeps the first candidate on ties, matching a stable sort.
    best: Optional["Element"] = None
    best_area = None

    for cand in candidates:
        c_bbox = extract_bbox(cand)
        if c_bbox is None:
            continue
        c_x0, c_y0, c_x1, c_y1 = c_bbox

        if mode == "contains":
            hit = c_x0 <= s_x0 and c_y0 <= s_y0 and c_x1 >= s_x1 and c_y1 >= s_y1
        elif mode == "center":
            hit = c_x0 <= s_cx <= c_x1 and c_y0 <= s_cy <= c_y1
        else:  # "overlap": the intersection rectangle must have positive area
            hit = min(c_x1, s_x1) > max(c_x0, s_x0) and min(c_y1, s_y1) > max(c_y0, s_y0)

        if not hit:
            continue

        area = (c_x1 - c_x0) * (c_y1 - c_y0)
        if best is None or area < best_area:
            best, best_area = cand, area

    return best
|
524
|
+
|
417
525
|
|
418
526
|
class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
419
527
|
"""
|
@@ -805,25 +913,17 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
|
805
913
|
|
806
914
|
def highlight(
|
807
915
|
self,
|
808
|
-
label:
|
809
|
-
color: Optional[
|
810
|
-
use_color_cycling: bool =
|
916
|
+
label: str = "",
|
917
|
+
color: Optional[Tuple[float, float, float]] = None,
|
918
|
+
use_color_cycling: bool = True,
|
811
919
|
include_attrs: Optional[List[str]] = None,
|
812
920
|
existing: str = "append",
|
813
921
|
) -> "Element":
|
814
|
-
"""
|
815
|
-
Highlight this element on the page.
|
816
|
-
|
817
|
-
Args:
|
818
|
-
label: Optional label for the highlight
|
819
|
-
color: Color tuple/string for the highlight, or None to use automatic color
|
820
|
-
use_color_cycling: Force color cycling even with no label (default: False)
|
821
|
-
include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
|
822
|
-
existing: How to handle existing highlights - 'append' (default) or 'replace'
|
922
|
+
"""Highlight the element with the specified colour.
|
823
923
|
|
824
|
-
|
825
|
-
Self for method chaining
|
924
|
+
Highlight the element on the page.
|
826
925
|
"""
|
926
|
+
|
827
927
|
# Access the correct highlighter service
|
828
928
|
highlighter = self.page._highlighter
|
829
929
|
|
@@ -850,7 +950,7 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
|
850
950
|
|
851
951
|
def show(
|
852
952
|
self,
|
853
|
-
|
953
|
+
resolution: Optional[float] = None,
|
854
954
|
labels: bool = True,
|
855
955
|
legend_position: str = "right",
|
856
956
|
color: Optional[Union[Tuple, str]] = "red", # Default color for single element
|
@@ -862,7 +962,7 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
|
862
962
|
Show the page with only this element highlighted temporarily.
|
863
963
|
|
864
964
|
Args:
|
865
|
-
|
965
|
+
resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
|
866
966
|
labels: Whether to include a legend for the highlight
|
867
967
|
legend_position: Position of the legend
|
868
968
|
color: Color to highlight this element (default: red)
|
@@ -874,6 +974,13 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
|
874
974
|
Returns:
|
875
975
|
PIL Image of the page with only this element highlighted, or None if error.
|
876
976
|
"""
|
977
|
+
# Apply global options as defaults
|
978
|
+
import natural_pdf
|
979
|
+
if resolution is None:
|
980
|
+
if natural_pdf.options.image.resolution is not None:
|
981
|
+
resolution = natural_pdf.options.image.resolution
|
982
|
+
else:
|
983
|
+
resolution = 144 # Default resolution when none specified
|
877
984
|
if not hasattr(self, "page") or not self.page:
|
878
985
|
logger.warning(f"Cannot show element, missing 'page' attribute: {self}")
|
879
986
|
return None
|
@@ -909,7 +1016,7 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
|
909
1016
|
return service.render_preview(
|
910
1017
|
page_index=self.page.index,
|
911
1018
|
temporary_highlights=[temp_highlight_data],
|
912
|
-
|
1019
|
+
resolution=resolution,
|
913
1020
|
width=width, # Pass the width parameter
|
914
1021
|
labels=labels,
|
915
1022
|
legend_position=legend_position,
|
@@ -920,22 +1027,29 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
|
920
1027
|
return None
|
921
1028
|
|
922
1029
|
def save(
    self,
    filename: str,
    resolution: Optional[float] = None,
    labels: bool = True,
    legend_position: str = "right",
) -> "Element":
    """
    Save the page containing this element to an image file.

    Rendering is delegated to ``self.page.save_image``.
    NOTE(review): this method does not add a highlight itself; whatever
    highlights already exist on the page are what the page renders — confirm
    this is intended.

    Args:
        filename: Path to save the image to
        resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
        labels: Whether to include a legend for labels
        legend_position: Position of the legend

    Returns:
        Self for method chaining
    """
    # Apply global options as defaults
    if resolution is None:
        # Imported lazily: the package-level options are only needed when the
        # caller did not pass an explicit resolution.
        import natural_pdf

        configured = natural_pdf.options.image.resolution
        # Default resolution when none specified
        resolution = configured if configured is not None else 144

    # Save the page image
    self.page.save_image(
        filename,
        resolution=resolution,
        labels=labels,
        legend_position=legend_position,
    )
    return self
|
940
1054
|
|
941
1055
|
# Note: save_image method removed in favor of save()
|
@@ -859,7 +859,7 @@ class ElementCollection(
|
|
859
859
|
distinct: bool = False,
|
860
860
|
include_attrs: Optional[List[str]] = None,
|
861
861
|
# --- Rendering Parameters ---
|
862
|
-
|
862
|
+
resolution: Optional[float] = None,
|
863
863
|
labels: bool = True, # Use 'labels' consistent with service
|
864
864
|
legend_position: str = "right",
|
865
865
|
render_ocr: bool = False,
|
@@ -884,7 +884,7 @@ class ElementCollection(
|
|
884
884
|
label_format: F-string to format group labels if group_by is used.
|
885
885
|
distinct: Highlight each element distinctly (overrides group_by/label).
|
886
886
|
include_attrs: Attributes to display on individual highlights.
|
887
|
-
|
887
|
+
resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI).
|
888
888
|
labels: Whether to include a legend for the temporary highlights.
|
889
889
|
legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
|
890
890
|
render_ocr: Whether to render OCR text.
|
@@ -900,6 +900,18 @@ class ElementCollection(
|
|
900
900
|
Raises:
|
901
901
|
ValueError: If the collection is empty or elements are on different pages/PDFs.
|
902
902
|
"""
|
903
|
+
# Apply global options as defaults, but allow explicit parameters to override
|
904
|
+
import natural_pdf
|
905
|
+
|
906
|
+
# Use global options if parameters are not explicitly set
|
907
|
+
if width is None:
|
908
|
+
width = natural_pdf.options.image.width
|
909
|
+
if resolution is None:
|
910
|
+
if natural_pdf.options.image.resolution is not None:
|
911
|
+
resolution = natural_pdf.options.image.resolution
|
912
|
+
else:
|
913
|
+
resolution = 144 # Default resolution when none specified
|
914
|
+
|
903
915
|
if not self._elements:
|
904
916
|
raise ValueError("Cannot show an empty collection.")
|
905
917
|
|
@@ -967,7 +979,7 @@ class ElementCollection(
|
|
967
979
|
img = service.render_preview(
|
968
980
|
page_index=page.index,
|
969
981
|
temporary_highlights=highlight_data_list,
|
970
|
-
|
982
|
+
resolution=resolution,
|
971
983
|
width=width, # Pass the width parameter
|
972
984
|
labels=labels, # Use 'labels'
|
973
985
|
legend_position=legend_position,
|
@@ -982,7 +994,7 @@ class ElementCollection(
|
|
982
994
|
def save(
|
983
995
|
self,
|
984
996
|
filename: str,
|
985
|
-
|
997
|
+
resolution: Optional[float] = None,
|
986
998
|
width: Optional[int] = None,
|
987
999
|
labels: bool = True,
|
988
1000
|
legend_position: str = "right",
|
@@ -993,7 +1005,7 @@ class ElementCollection(
|
|
993
1005
|
|
994
1006
|
Args:
|
995
1007
|
filename: Path to save the image to
|
996
|
-
|
1008
|
+
resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI)
|
997
1009
|
width: Optional width for the output image in pixels
|
998
1010
|
labels: Whether to include a legend for labels
|
999
1011
|
legend_position: Position of the legend
|
@@ -1002,10 +1014,22 @@ class ElementCollection(
|
|
1002
1014
|
Returns:
|
1003
1015
|
Self for method chaining
|
1004
1016
|
"""
|
1017
|
+
# Apply global options as defaults, but allow explicit parameters to override
|
1018
|
+
import natural_pdf
|
1019
|
+
|
1020
|
+
# Use global options if parameters are not explicitly set
|
1021
|
+
if width is None:
|
1022
|
+
width = natural_pdf.options.image.width
|
1023
|
+
if resolution is None:
|
1024
|
+
if natural_pdf.options.image.resolution is not None:
|
1025
|
+
resolution = natural_pdf.options.image.resolution
|
1026
|
+
else:
|
1027
|
+
resolution = 144 # Default resolution when none specified
|
1028
|
+
|
1005
1029
|
# Use to_image to generate and save the image
|
1006
1030
|
self.to_image(
|
1007
1031
|
path=filename,
|
1008
|
-
|
1032
|
+
resolution=resolution,
|
1009
1033
|
width=width,
|
1010
1034
|
labels=labels,
|
1011
1035
|
legend_position=legend_position,
|
@@ -1016,7 +1040,7 @@ class ElementCollection(
|
|
1016
1040
|
def to_image(
|
1017
1041
|
self,
|
1018
1042
|
path: Optional[str] = None,
|
1019
|
-
|
1043
|
+
resolution: Optional[float] = None,
|
1020
1044
|
width: Optional[int] = None,
|
1021
1045
|
labels: bool = True,
|
1022
1046
|
legend_position: str = "right",
|
@@ -1028,7 +1052,7 @@ class ElementCollection(
|
|
1028
1052
|
|
1029
1053
|
Args:
|
1030
1054
|
path: Optional path to save the image to
|
1031
|
-
|
1055
|
+
resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI)
|
1032
1056
|
width: Optional width for the output image in pixels (height calculated to maintain aspect ratio)
|
1033
1057
|
labels: Whether to include a legend for labels
|
1034
1058
|
legend_position: Position of the legend
|
@@ -1043,7 +1067,7 @@ class ElementCollection(
|
|
1043
1067
|
# Generate the image using to_image
|
1044
1068
|
return page.to_image(
|
1045
1069
|
path=path,
|
1046
|
-
|
1070
|
+
resolution=resolution,
|
1047
1071
|
width=width,
|
1048
1072
|
labels=labels,
|
1049
1073
|
legend_position=legend_position,
|
@@ -1774,7 +1798,7 @@ class ElementCollection(
|
|
1774
1798
|
self,
|
1775
1799
|
padding: int = 1,
|
1776
1800
|
threshold: float = 0.95,
|
1777
|
-
resolution: float =
|
1801
|
+
resolution: Optional[float] = None,
|
1778
1802
|
show_progress: bool = True,
|
1779
1803
|
) -> "ElementCollection":
|
1780
1804
|
"""
|
@@ -1786,12 +1810,20 @@ class ElementCollection(
|
|
1786
1810
|
Args:
|
1787
1811
|
padding: Number of pixels to keep as padding after trimming (default: 1)
|
1788
1812
|
threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
|
1789
|
-
resolution: Resolution for image rendering in DPI (default:
|
1813
|
+
resolution: Resolution for image rendering in DPI (default: uses global options, fallback to 144 DPI)
|
1790
1814
|
show_progress: Whether to show a progress bar for the trimming operation
|
1791
1815
|
|
1792
1816
|
Returns:
|
1793
1817
|
New ElementCollection with trimmed regions
|
1794
1818
|
"""
|
1819
|
+
# Apply global options as defaults
|
1820
|
+
import natural_pdf
|
1821
|
+
if resolution is None:
|
1822
|
+
if natural_pdf.options.image.resolution is not None:
|
1823
|
+
resolution = natural_pdf.options.image.resolution
|
1824
|
+
else:
|
1825
|
+
resolution = 144 # Default resolution when none specified
|
1826
|
+
|
1795
1827
|
return self.apply(
|
1796
1828
|
lambda element: element.trim(
|
1797
1829
|
padding=padding, threshold=threshold, resolution=resolution
|
@@ -0,0 +1,43 @@
|
|
1
|
+
from typing import TYPE_CHECKING, Any, Dict, Tuple
|
2
|
+
|
3
|
+
from natural_pdf.elements.base import Element
|
4
|
+
|
5
|
+
if TYPE_CHECKING:
|
6
|
+
from natural_pdf.core.page import Page
|
7
|
+
|
8
|
+
class ImageElement(Element):
    """Element wrapper for a raster XObject (embedded image) on a PDF page."""

    def __init__(self, obj: Dict[str, Any], page: "Page"):
        # Nothing image-specific to set up; defer entirely to Element.
        super().__init__(obj, page)

    # ------------------------------------------------------------------
    # Read-only views over the underlying object dict
    # ------------------------------------------------------------------
    @property
    def type(self) -> str:  # noqa: D401
        """Element type tag, always ``"image"``."""
        return "image"

    @property
    def width(self) -> float:
        """The ``"width"`` entry of the object dict as a float (0 if absent)."""
        raw_width = self._obj.get("width", 0)
        return float(raw_width)

    @property
    def height(self) -> float:
        """The ``"height"`` entry of the object dict as a float (0 if absent)."""
        raw_height = self._obj.get("height", 0)
        return float(raw_height)

    @property
    def srcsize(self) -> Tuple[float, float]:
        """Original pixel dimensions of the embedded image (width, height).

        Falls back to ``(None, None)`` when the object dict has no
        ``"srcsize"`` entry.
        """
        missing = (None, None)
        return self._obj.get("srcsize", missing)

    @property
    def colorspace(self):
        """The ``"colorspace"`` entry exactly as stored (raw pdfminer data), or None."""
        return self._obj.get("colorspace")

    def extract_text(self, *args, **kwargs) -> str:  # noqa: D401
        """Images carry no text: accept any arguments and return an empty string."""
        return ""

    def __repr__(self):
        """Short debug representation showing the bbox and source pixel size."""
        return f"<ImageElement bbox={self.bbox} srcsize={self.srcsize}>"
|
natural_pdf/elements/region.py
CHANGED
@@ -26,6 +26,11 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
|
26
26
|
# Import new utils
|
27
27
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
28
28
|
|
29
|
+
# ------------------------------------------------------------------
|
30
|
+
# Table utilities
|
31
|
+
# ------------------------------------------------------------------
|
32
|
+
from natural_pdf.tables import TableResult
|
33
|
+
|
29
34
|
# --- End Classification Imports --- #
|
30
35
|
|
31
36
|
|
@@ -590,8 +595,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
590
595
|
|
591
596
|
def to_image(
|
592
597
|
self,
|
593
|
-
|
594
|
-
resolution: float = 150,
|
598
|
+
resolution: Optional[float] = None,
|
595
599
|
crop: bool = False,
|
596
600
|
include_highlights: bool = True,
|
597
601
|
**kwargs,
|
@@ -600,7 +604,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
600
604
|
Generate an image of just this region.
|
601
605
|
|
602
606
|
Args:
|
603
|
-
resolution: Resolution in DPI for rendering (default:
|
607
|
+
resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
|
604
608
|
crop: If True, only crop the region without highlighting its boundaries
|
605
609
|
include_highlights: Whether to include existing highlights (default: True)
|
606
610
|
**kwargs: Additional parameters for page.to_image()
|
@@ -608,6 +612,14 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
608
612
|
Returns:
|
609
613
|
PIL Image of just this region
|
610
614
|
"""
|
615
|
+
# Apply global options as defaults
|
616
|
+
import natural_pdf
|
617
|
+
if resolution is None:
|
618
|
+
if natural_pdf.options.image.resolution is not None:
|
619
|
+
resolution = natural_pdf.options.image.resolution
|
620
|
+
else:
|
621
|
+
resolution = 144 # Default resolution when none specified
|
622
|
+
|
611
623
|
# Handle the case where user wants the cropped region to have a specific width
|
612
624
|
page_kwargs = kwargs.copy()
|
613
625
|
effective_resolution = resolution # Start with the provided resolution
|
@@ -633,7 +645,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
633
645
|
|
634
646
|
# First get the full page image with highlights if requested
|
635
647
|
page_image = self._page.to_image(
|
636
|
-
scale=scale,
|
637
648
|
resolution=effective_resolution,
|
638
649
|
include_highlights=include_highlights,
|
639
650
|
**page_kwargs,
|
@@ -683,7 +694,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
683
694
|
|
684
695
|
def show(
|
685
696
|
self,
|
686
|
-
|
697
|
+
resolution: Optional[float] = None,
|
687
698
|
labels: bool = True,
|
688
699
|
legend_position: str = "right",
|
689
700
|
# Add a default color for standalone show
|
@@ -696,7 +707,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
696
707
|
Show the page with just this region highlighted temporarily.
|
697
708
|
|
698
709
|
Args:
|
699
|
-
|
710
|
+
resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
|
700
711
|
labels: Whether to include a legend for labels
|
701
712
|
legend_position: Position of the legend
|
702
713
|
color: Color to highlight this region (default: blue)
|
@@ -709,6 +720,14 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
709
720
|
Returns:
|
710
721
|
PIL Image of the page with only this region highlighted
|
711
722
|
"""
|
723
|
+
# Apply global options as defaults
|
724
|
+
import natural_pdf
|
725
|
+
if resolution is None:
|
726
|
+
if natural_pdf.options.image.resolution is not None:
|
727
|
+
resolution = natural_pdf.options.image.resolution
|
728
|
+
else:
|
729
|
+
resolution = 144 # Default resolution when none specified
|
730
|
+
|
712
731
|
if not self._page:
|
713
732
|
raise ValueError("Region must be associated with a page to show.")
|
714
733
|
|
@@ -737,7 +756,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
737
756
|
return service.render_preview(
|
738
757
|
page_index=self._page.index,
|
739
758
|
temporary_highlights=[temp_highlight_data],
|
740
|
-
|
759
|
+
resolution=resolution,
|
741
760
|
width=width, # Pass the width parameter
|
742
761
|
labels=labels,
|
743
762
|
legend_position=legend_position,
|
@@ -745,31 +764,39 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
745
764
|
)
|
746
765
|
|
747
766
|
def save(
    self,
    filename: str,
    resolution: Optional[float] = None,
    labels: bool = True,
    legend_position: str = "right",
) -> "Region":
    """
    Highlight this region and write its page to an image file.

    Args:
        filename: Path to save the image to
        resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
        labels: Whether to include a legend for labels
        legend_position: Position of the legend

    Returns:
        Self for method chaining
    """
    import natural_pdf

    # Resolve the DPI: explicit argument, then the global option, then 144.
    if resolution is None:
        configured = natural_pdf.options.image.resolution
        resolution = 144 if configured is None else configured

    # Add this region's highlight before the page is rendered.
    self.highlight()

    # Let the owning page render and write the image.
    self._page.save_image(
        filename,
        resolution=resolution,
        labels=labels,
        legend_position=legend_position,
    )
    return self
|
768
795
|
|
769
796
|
def save_image(
|
770
797
|
self,
|
771
798
|
filename: str,
|
772
|
-
resolution: float =
|
799
|
+
resolution: Optional[float] = None,
|
773
800
|
crop: bool = False,
|
774
801
|
include_highlights: bool = True,
|
775
802
|
**kwargs,
|
@@ -779,7 +806,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
779
806
|
|
780
807
|
Args:
|
781
808
|
filename: Path to save the image to
|
782
|
-
resolution: Resolution in DPI for rendering (default:
|
809
|
+
resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
|
783
810
|
crop: If True, only crop the region without highlighting its boundaries
|
784
811
|
include_highlights: Whether to include existing highlights (default: True)
|
785
812
|
**kwargs: Additional parameters for page.to_image()
|
@@ -787,6 +814,14 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
787
814
|
Returns:
|
788
815
|
Self for method chaining
|
789
816
|
"""
|
817
|
+
# Apply global options as defaults
|
818
|
+
import natural_pdf
|
819
|
+
if resolution is None:
|
820
|
+
if natural_pdf.options.image.resolution is not None:
|
821
|
+
resolution = natural_pdf.options.image.resolution
|
822
|
+
else:
|
823
|
+
resolution = 144 # Default resolution when none specified
|
824
|
+
|
790
825
|
# Get the region image
|
791
826
|
image = self.to_image(
|
792
827
|
resolution=resolution,
|
@@ -803,7 +838,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
803
838
|
self,
|
804
839
|
padding: int = 1,
|
805
840
|
threshold: float = 0.95,
|
806
|
-
resolution: float =
|
841
|
+
resolution: Optional[float] = None,
|
807
842
|
pre_shrink: float = 0.5,
|
808
843
|
) -> "Region":
|
809
844
|
"""
|
@@ -817,7 +852,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
817
852
|
threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
|
818
853
|
Higher values mean more strict whitespace detection.
|
819
854
|
E.g., 0.95 means if 95% of pixels in a row/column are white, consider it whitespace.
|
820
|
-
resolution: Resolution for image rendering in DPI (default:
|
855
|
+
resolution: Resolution for image rendering in DPI (default: uses global options, fallback to 144 DPI)
|
821
856
|
pre_shrink: Amount to shrink region before trimming, then expand back after (default: 0.5)
|
822
857
|
This helps avoid detecting box borders/slivers as content.
|
823
858
|
|
@@ -834,6 +869,14 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
834
869
|
# Conservative trimming with more padding
|
835
870
|
loose = region.trim(padding=3, threshold=0.98)
|
836
871
|
"""
|
872
|
+
# Apply global options as defaults
|
873
|
+
import natural_pdf
|
874
|
+
if resolution is None:
|
875
|
+
if natural_pdf.options.image.resolution is not None:
|
876
|
+
resolution = natural_pdf.options.image.resolution
|
877
|
+
else:
|
878
|
+
resolution = 144 # Default resolution when none specified
|
879
|
+
|
837
880
|
# Pre-shrink the region to avoid box slivers
|
838
881
|
work_region = (
|
839
882
|
self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink)
|
@@ -1172,7 +1215,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1172
1215
|
cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
1173
1216
|
# --- NEW: Add tqdm control option --- #
|
1174
1217
|
show_progress: bool = False, # Controls progress bar for text method
|
1175
|
-
) ->
|
1218
|
+
) -> TableResult: # Return type allows Optional[str] for cells
|
1176
1219
|
"""
|
1177
1220
|
Extract a table from this region.
|
1178
1221
|
|
@@ -1224,7 +1267,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1224
1267
|
logger.debug(
|
1225
1268
|
f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
|
1226
1269
|
)
|
1227
|
-
return self._extract_table_from_cells(cell_regions_in_table)
|
1270
|
+
return TableResult(self._extract_table_from_cells(cell_regions_in_table))
|
1228
1271
|
|
1229
1272
|
# --------------------------------------------------------------- #
|
1230
1273
|
|
@@ -1280,19 +1323,21 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1280
1323
|
|
1281
1324
|
# Use the selected method
|
1282
1325
|
if effective_method == "tatr":
|
1283
|
-
|
1326
|
+
table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
|
1284
1327
|
elif effective_method == "text":
|
1285
1328
|
current_text_options = text_options.copy()
|
1286
1329
|
current_text_options["cell_extraction_func"] = cell_extraction_func
|
1287
1330
|
current_text_options["show_progress"] = show_progress
|
1288
|
-
|
1331
|
+
table_rows = self._extract_table_text(**current_text_options)
|
1289
1332
|
elif effective_method == "pdfplumber":
|
1290
|
-
|
1333
|
+
table_rows = self._extract_table_plumber(table_settings)
|
1291
1334
|
else:
|
1292
1335
|
raise ValueError(
|
1293
1336
|
f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
|
1294
1337
|
)
|
1295
1338
|
|
1339
|
+
return TableResult(table_rows)
|
1340
|
+
|
1296
1341
|
def extract_tables(
|
1297
1342
|
self,
|
1298
1343
|
method: Optional[str] = None,
|