natural-pdf 0.1.40__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +6 -7
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +236 -383
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +172 -83
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +318 -243
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +4 -4
- natural_pdf/flows/flow.py +1200 -243
- natural_pdf/flows/region.py +707 -261
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +7 -3
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/RECORD +55 -53
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/top_level.txt +0 -2
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/page.py
CHANGED
@@ -16,6 +16,7 @@ from typing import ( # Added overload
|
|
16
16
|
Callable,
|
17
17
|
Dict,
|
18
18
|
List,
|
19
|
+
Literal,
|
19
20
|
Optional,
|
20
21
|
Tuple,
|
21
22
|
Union,
|
@@ -26,7 +27,7 @@ import pdfplumber
|
|
26
27
|
from PIL import Image, ImageDraw
|
27
28
|
from tqdm.auto import tqdm # Added tqdm import
|
28
29
|
|
29
|
-
from natural_pdf.elements.
|
30
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
30
31
|
from natural_pdf.elements.region import Region
|
31
32
|
from natural_pdf.selectors.parser import parse_selector
|
32
33
|
from natural_pdf.utils.locks import pdf_render_lock # Import from utils instead
|
@@ -38,7 +39,6 @@ if TYPE_CHECKING:
|
|
38
39
|
from natural_pdf.core.highlighting_service import HighlightingService
|
39
40
|
from natural_pdf.core.pdf import PDF
|
40
41
|
from natural_pdf.elements.base import Element
|
41
|
-
from natural_pdf.elements.collections import ElementCollection
|
42
42
|
|
43
43
|
# # New Imports
|
44
44
|
import itertools
|
@@ -61,12 +61,19 @@ from natural_pdf.classification.manager import ClassificationManager # For type
|
|
61
61
|
# # --- Classification Imports --- #
|
62
62
|
from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
|
63
63
|
from natural_pdf.core.element_manager import ElementManager
|
64
|
+
|
65
|
+
# Add new import
|
66
|
+
from natural_pdf.core.render_spec import RenderSpec, Visualizable
|
64
67
|
from natural_pdf.describe.mixin import DescribeMixin # Import describe mixin
|
65
68
|
from natural_pdf.elements.base import Element # Import base element
|
66
69
|
from natural_pdf.elements.text import TextElement
|
70
|
+
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
67
71
|
from natural_pdf.ocr import OCRManager, OCROptions
|
68
72
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
69
73
|
from natural_pdf.qa import DocumentQA, get_qa_engine
|
74
|
+
|
75
|
+
# --- Text update mixin import --- #
|
76
|
+
from natural_pdf.text_mixin import TextMixin
|
70
77
|
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
71
78
|
|
72
79
|
# # Import new utils
|
@@ -75,10 +82,6 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerW
|
|
75
82
|
|
76
83
|
# --- End Classification Imports --- #
|
77
84
|
|
78
|
-
# --- Text update mixin import --- #
|
79
|
-
from natural_pdf.text_mixin import TextMixin
|
80
|
-
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
81
|
-
|
82
85
|
|
83
86
|
try:
|
84
87
|
from deskew import determine_skew
|
@@ -92,7 +95,14 @@ except ImportError:
|
|
92
95
|
logger = logging.getLogger(__name__)
|
93
96
|
|
94
97
|
|
95
|
-
class Page(
|
98
|
+
class Page(
|
99
|
+
TextMixin,
|
100
|
+
ClassificationMixin,
|
101
|
+
ExtractionMixin,
|
102
|
+
ShapeDetectionMixin,
|
103
|
+
DescribeMixin,
|
104
|
+
Visualizable,
|
105
|
+
):
|
96
106
|
"""Enhanced Page wrapper built on top of pdfplumber.Page.
|
97
107
|
|
98
108
|
This class provides a fluent interface for working with PDF pages,
|
@@ -262,6 +272,77 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
262
272
|
self._load_elements()
|
263
273
|
self._to_image_cache: Dict[tuple, Optional["Image.Image"]] = {}
|
264
274
|
|
275
|
+
def _get_render_specs(
|
276
|
+
self,
|
277
|
+
mode: Literal["show", "render"] = "show",
|
278
|
+
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
279
|
+
highlights: Optional[List[Dict[str, Any]]] = None,
|
280
|
+
crop: Union[bool, Literal["content"]] = False,
|
281
|
+
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
282
|
+
**kwargs,
|
283
|
+
) -> List[RenderSpec]:
|
284
|
+
"""Get render specifications for this page.
|
285
|
+
|
286
|
+
Args:
|
287
|
+
mode: Rendering mode - 'show' includes page highlights, 'render' is clean
|
288
|
+
color: Default color for highlights in show mode
|
289
|
+
highlights: Additional highlight groups to show
|
290
|
+
crop: Whether to crop the page
|
291
|
+
crop_bbox: Explicit crop bounds
|
292
|
+
**kwargs: Additional parameters
|
293
|
+
|
294
|
+
Returns:
|
295
|
+
List containing a single RenderSpec for this page
|
296
|
+
"""
|
297
|
+
spec = RenderSpec(page=self)
|
298
|
+
|
299
|
+
# Handle cropping
|
300
|
+
if crop_bbox:
|
301
|
+
spec.crop_bbox = crop_bbox
|
302
|
+
elif crop == "content":
|
303
|
+
# Calculate content bounds from all elements
|
304
|
+
elements = self.get_elements(apply_exclusions=False)
|
305
|
+
if elements:
|
306
|
+
# Get bounding box of all elements
|
307
|
+
x_coords = []
|
308
|
+
y_coords = []
|
309
|
+
for elem in elements:
|
310
|
+
if hasattr(elem, "bbox") and elem.bbox:
|
311
|
+
x0, y0, x1, y1 = elem.bbox
|
312
|
+
x_coords.extend([x0, x1])
|
313
|
+
y_coords.extend([y0, y1])
|
314
|
+
|
315
|
+
if x_coords and y_coords:
|
316
|
+
spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
|
317
|
+
elif crop is True:
|
318
|
+
# Crop to full page (no-op, but included for consistency)
|
319
|
+
spec.crop_bbox = (0, 0, self.width, self.height)
|
320
|
+
|
321
|
+
# Add highlights in show mode
|
322
|
+
if mode == "show":
|
323
|
+
# Add page's persistent highlights if any
|
324
|
+
page_highlights = self._highlighter.get_highlights_for_page(self.index)
|
325
|
+
for highlight in page_highlights:
|
326
|
+
spec.add_highlight(
|
327
|
+
bbox=highlight.bbox,
|
328
|
+
polygon=highlight.polygon,
|
329
|
+
color=highlight.color,
|
330
|
+
label=highlight.label,
|
331
|
+
element=None, # Persistent highlights don't have element refs
|
332
|
+
)
|
333
|
+
|
334
|
+
# Add additional highlight groups if provided
|
335
|
+
if highlights:
|
336
|
+
for group in highlights:
|
337
|
+
elements = group.get("elements", [])
|
338
|
+
group_color = group.get("color", color)
|
339
|
+
group_label = group.get("label")
|
340
|
+
|
341
|
+
for elem in elements:
|
342
|
+
spec.add_highlight(element=elem, color=group_color, label=group_label)
|
343
|
+
|
344
|
+
return [spec]
|
345
|
+
|
265
346
|
@property
|
266
347
|
def pdf(self) -> "PDF":
|
267
348
|
"""Provides public access to the parent PDF object."""
|
@@ -322,7 +403,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
322
403
|
exclusion_func_or_region: Either a callable function returning a Region,
|
323
404
|
a Region object, or another object with a valid .bbox attribute.
|
324
405
|
label: Optional label for this exclusion (e.g., 'header', 'footer').
|
325
|
-
method: Exclusion method - 'region' (exclude all elements in bounding box) or
|
406
|
+
method: Exclusion method - 'region' (exclude all elements in bounding box) or
|
326
407
|
'element' (exclude only the specific elements). Default: 'region'.
|
327
408
|
|
328
409
|
Returns:
|
@@ -346,7 +427,8 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
346
427
|
# Likewise, if an ElementCollection is passed we iterate over its
|
347
428
|
# elements and create Regions for each one.
|
348
429
|
# ------------------------------------------------------------------
|
349
|
-
from
|
430
|
+
# Import ElementCollection from the new module path (old path removed)
|
431
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
350
432
|
|
351
433
|
# Selector string ---------------------------------------------------
|
352
434
|
if isinstance(exclusion_func_or_region, str):
|
@@ -368,7 +450,12 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
368
450
|
else: # method == "region"
|
369
451
|
for el in matching_elements:
|
370
452
|
try:
|
371
|
-
bbox_coords = (
|
453
|
+
bbox_coords = (
|
454
|
+
float(el.x0),
|
455
|
+
float(el.top),
|
456
|
+
float(el.x1),
|
457
|
+
float(el.bottom),
|
458
|
+
)
|
372
459
|
region = Region(self, bbox_coords, label=label)
|
373
460
|
# Store directly as a Region tuple so we don't recurse endlessly
|
374
461
|
self._exclusions.append((region, label, method))
|
@@ -376,9 +463,12 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
376
463
|
f"Page {self.index}: Added exclusion region from selector '{selector_str}' -> {bbox_coords}"
|
377
464
|
)
|
378
465
|
except Exception as e:
|
379
|
-
|
380
|
-
|
466
|
+
# Re-raise so calling code/test sees the failure immediately
|
467
|
+
logger.error(
|
468
|
+
f"Page {self.index}: Failed to create exclusion region from element {el}: {e}",
|
469
|
+
exc_info=False,
|
381
470
|
)
|
471
|
+
raise
|
382
472
|
return self # Completed processing for selector input
|
383
473
|
|
384
474
|
# ElementCollection -----------------------------------------------
|
@@ -406,9 +496,11 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
406
496
|
f"Page {self.index}: Added exclusion region from ElementCollection element {bbox_coords}"
|
407
497
|
)
|
408
498
|
except Exception as e:
|
409
|
-
logger.
|
410
|
-
f"Page {self.index}: Failed to convert ElementCollection element to Region: {e}"
|
499
|
+
logger.error(
|
500
|
+
f"Page {self.index}: Failed to convert ElementCollection element to Region: {e}",
|
501
|
+
exc_info=False,
|
411
502
|
)
|
503
|
+
raise
|
412
504
|
return self # Completed processing for ElementCollection input
|
413
505
|
|
414
506
|
# ------------------------------------------------------------------
|
@@ -425,7 +517,11 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
425
517
|
elif isinstance(exclusion_func_or_region, Region):
|
426
518
|
# Store Region objects directly, assigning the label
|
427
519
|
exclusion_func_or_region.label = label # Assign label
|
428
|
-
exclusion_data = (
|
520
|
+
exclusion_data = (
|
521
|
+
exclusion_func_or_region,
|
522
|
+
label,
|
523
|
+
method,
|
524
|
+
) # Store as tuple for consistency
|
429
525
|
logger.debug(
|
430
526
|
f"Page {self.index}: Added Region exclusion '{label}' with method '{method}': {exclusion_func_or_region}"
|
431
527
|
)
|
@@ -547,7 +643,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
547
643
|
else:
|
548
644
|
# New format: (exclusion_item, label, method)
|
549
645
|
exclusion_item, label, method = exclusion_data
|
550
|
-
|
646
|
+
|
551
647
|
exclusion_label = label if label else f"exclusion {i}"
|
552
648
|
|
553
649
|
# Process callable exclusion functions
|
@@ -609,7 +705,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
609
705
|
) -> List["Element"]:
|
610
706
|
"""
|
611
707
|
Filters a list of elements, removing those based on exclusion rules.
|
612
|
-
Handles both region-based exclusions (exclude all in area) and
|
708
|
+
Handles both region-based exclusions (exclude all in area) and
|
613
709
|
element-based exclusions (exclude only specific elements).
|
614
710
|
|
615
711
|
Args:
|
@@ -633,7 +729,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
633
729
|
|
634
730
|
# Collect element-based exclusions
|
635
731
|
excluded_elements = set() # Use set for O(1) lookup
|
636
|
-
|
732
|
+
|
637
733
|
for exclusion_data in self._exclusions:
|
638
734
|
# Handle both old format (2-tuple) and new format (3-tuple)
|
639
735
|
if len(exclusion_data) == 2:
|
@@ -641,15 +737,15 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
641
737
|
method = "region"
|
642
738
|
else:
|
643
739
|
exclusion_item, label, method = exclusion_data
|
644
|
-
|
740
|
+
|
645
741
|
# Skip callables (already handled in _get_exclusion_regions)
|
646
742
|
if callable(exclusion_item):
|
647
743
|
continue
|
648
|
-
|
744
|
+
|
649
745
|
# Skip regions (already in exclusion_regions)
|
650
746
|
if isinstance(exclusion_item, Region):
|
651
747
|
continue
|
652
|
-
|
748
|
+
|
653
749
|
# Handle element-based exclusions
|
654
750
|
if method == "element" and hasattr(exclusion_item, "bbox"):
|
655
751
|
excluded_elements.add(id(exclusion_item))
|
@@ -665,10 +761,10 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
665
761
|
filtered_elements = []
|
666
762
|
region_excluded_count = 0
|
667
763
|
element_excluded_count = 0
|
668
|
-
|
764
|
+
|
669
765
|
for element in elements:
|
670
766
|
exclude = False
|
671
|
-
|
767
|
+
|
672
768
|
# Check element-based exclusions first (faster)
|
673
769
|
if id(element) in excluded_elements:
|
674
770
|
exclude = True
|
@@ -685,7 +781,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
685
781
|
if debug_exclusions:
|
686
782
|
print(f" Element {element} excluded by region {region}")
|
687
783
|
break # No need to check other regions for this element
|
688
|
-
|
784
|
+
|
689
785
|
if not exclude:
|
690
786
|
filtered_elements.append(element)
|
691
787
|
|
@@ -837,7 +933,9 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
837
933
|
Returns:
|
838
934
|
ElementCollection with matching elements.
|
839
935
|
"""
|
840
|
-
from natural_pdf.elements.
|
936
|
+
from natural_pdf.elements.element_collection import ( # Import here for type hint
|
937
|
+
ElementCollection,
|
938
|
+
)
|
841
939
|
|
842
940
|
if selector is not None and text is not None:
|
843
941
|
raise ValueError("Provide either 'selector' or 'text', not both.")
|
@@ -1324,7 +1422,12 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
1324
1422
|
return self._page.crop(bbox, **kwargs)
|
1325
1423
|
|
1326
1424
|
def extract_text(
|
1327
|
-
self,
|
1425
|
+
self,
|
1426
|
+
preserve_whitespace=True,
|
1427
|
+
use_exclusions=True,
|
1428
|
+
debug_exclusions=False,
|
1429
|
+
content_filter=None,
|
1430
|
+
**kwargs,
|
1328
1431
|
) -> str:
|
1329
1432
|
"""
|
1330
1433
|
Extract text from this page, respecting exclusions and using pdfplumber's
|
@@ -1363,11 +1466,15 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
1363
1466
|
|
1364
1467
|
# 2. Apply element-based exclusions if enabled
|
1365
1468
|
if use_exclusions and self._exclusions:
|
1366
|
-
# Filter word elements through _filter_elements_by_exclusions
|
1469
|
+
# Filter word elements through _filter_elements_by_exclusions
|
1367
1470
|
# This handles both element-based and region-based exclusions
|
1368
|
-
word_elements = self._filter_elements_by_exclusions(
|
1471
|
+
word_elements = self._filter_elements_by_exclusions(
|
1472
|
+
word_elements, debug_exclusions=debug
|
1473
|
+
)
|
1369
1474
|
if debug:
|
1370
|
-
logger.debug(
|
1475
|
+
logger.debug(
|
1476
|
+
f"Page {self.number}: {len(word_elements)} words remaining after exclusion filtering."
|
1477
|
+
)
|
1371
1478
|
|
1372
1479
|
# 3. Get region-based exclusions for spatial filtering
|
1373
1480
|
apply_exclusions_flag = kwargs.get("use_exclusions", use_exclusions)
|
@@ -1375,7 +1482,9 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
1375
1482
|
if apply_exclusions_flag and self._exclusions:
|
1376
1483
|
exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug)
|
1377
1484
|
if debug:
|
1378
|
-
logger.debug(
|
1485
|
+
logger.debug(
|
1486
|
+
f"Page {self.number}: Found {len(exclusion_regions)} region exclusions for spatial filtering."
|
1487
|
+
)
|
1379
1488
|
elif debug:
|
1380
1489
|
logger.debug(f"Page {self.number}: Not applying exclusions.")
|
1381
1490
|
|
@@ -1656,7 +1765,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
1656
1765
|
table_settings.setdefault("join_y_tolerance", join)
|
1657
1766
|
|
1658
1767
|
raw_tables = self._page.extract_tables(table_settings)
|
1659
|
-
|
1768
|
+
|
1660
1769
|
# Apply RTL text processing to all extracted tables
|
1661
1770
|
if raw_tables:
|
1662
1771
|
processed_tables = []
|
@@ -1674,7 +1783,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
1674
1783
|
processed_table.append(processed_row)
|
1675
1784
|
processed_tables.append(processed_table)
|
1676
1785
|
return processed_tables
|
1677
|
-
|
1786
|
+
|
1678
1787
|
return raw_tables
|
1679
1788
|
else:
|
1680
1789
|
raise ValueError(
|
@@ -1743,7 +1852,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
1743
1852
|
label: Optional[str] = None,
|
1744
1853
|
use_color_cycling: bool = False,
|
1745
1854
|
element: Optional[Any] = None,
|
1746
|
-
|
1855
|
+
annotate: Optional[List[str]] = None,
|
1747
1856
|
existing: str = "append",
|
1748
1857
|
) -> "Page":
|
1749
1858
|
"""
|
@@ -1756,7 +1865,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
1756
1865
|
label: Optional label for the highlight.
|
1757
1866
|
use_color_cycling: If True and no label/color, use next cycle color.
|
1758
1867
|
element: Optional original element being highlighted (for attribute extraction).
|
1759
|
-
|
1868
|
+
annotate: List of attribute names from 'element' to display.
|
1760
1869
|
existing: How to handle existing highlights ('append' or 'replace').
|
1761
1870
|
|
1762
1871
|
Returns:
|
@@ -1770,7 +1879,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
1770
1879
|
label=label,
|
1771
1880
|
use_color_cycling=use_color_cycling,
|
1772
1881
|
element=element,
|
1773
|
-
|
1882
|
+
annotate=annotate,
|
1774
1883
|
existing=existing,
|
1775
1884
|
)
|
1776
1885
|
return self
|
@@ -1782,7 +1891,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
1782
1891
|
label: Optional[str] = None,
|
1783
1892
|
use_color_cycling: bool = False,
|
1784
1893
|
element: Optional[Any] = None,
|
1785
|
-
|
1894
|
+
annotate: Optional[List[str]] = None,
|
1786
1895
|
existing: str = "append",
|
1787
1896
|
) -> "Page":
|
1788
1897
|
"""
|
@@ -1795,7 +1904,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
1795
1904
|
label: Optional label for the highlight.
|
1796
1905
|
use_color_cycling: If True and no label/color, use next cycle color.
|
1797
1906
|
element: Optional original element being highlighted (for attribute extraction).
|
1798
|
-
|
1907
|
+
annotate: List of attribute names from 'element' to display.
|
1799
1908
|
existing: How to handle existing highlights ('append' or 'replace').
|
1800
1909
|
|
1801
1910
|
Returns:
|
@@ -1808,41 +1917,11 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
1808
1917
|
label=label,
|
1809
1918
|
use_color_cycling=use_color_cycling,
|
1810
1919
|
element=element,
|
1811
|
-
|
1920
|
+
annotate=annotate,
|
1812
1921
|
existing=existing,
|
1813
1922
|
)
|
1814
1923
|
return self
|
1815
1924
|
|
1816
|
-
def show(
|
1817
|
-
self,
|
1818
|
-
resolution: float = 144,
|
1819
|
-
width: Optional[int] = None,
|
1820
|
-
labels: bool = True,
|
1821
|
-
legend_position: str = "right",
|
1822
|
-
render_ocr: bool = False,
|
1823
|
-
) -> Optional[Image.Image]:
|
1824
|
-
"""
|
1825
|
-
Generates and returns an image of the page with persistent highlights rendered.
|
1826
|
-
|
1827
|
-
Args:
|
1828
|
-
resolution: Resolution in DPI for rendering (default: 144 DPI, equivalent to previous scale=2.0).
|
1829
|
-
width: Optional width for the output image.
|
1830
|
-
labels: Whether to include a legend for labels.
|
1831
|
-
legend_position: Position of the legend.
|
1832
|
-
render_ocr: Whether to render OCR text.
|
1833
|
-
|
1834
|
-
Returns:
|
1835
|
-
PIL Image object of the page with highlights, or None if rendering fails.
|
1836
|
-
"""
|
1837
|
-
return self.to_image(
|
1838
|
-
resolution=resolution,
|
1839
|
-
width=width,
|
1840
|
-
labels=labels,
|
1841
|
-
legend_position=legend_position,
|
1842
|
-
render_ocr=render_ocr,
|
1843
|
-
include_highlights=True, # Ensure highlights are requested
|
1844
|
-
)
|
1845
|
-
|
1846
1925
|
def save_image(
|
1847
1926
|
self,
|
1848
1927
|
filename: str,
|
@@ -1870,17 +1949,38 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
1870
1949
|
Returns:
|
1871
1950
|
Self for method chaining.
|
1872
1951
|
"""
|
1873
|
-
# Use
|
1874
|
-
|
1875
|
-
|
1876
|
-
|
1877
|
-
|
1878
|
-
|
1879
|
-
|
1880
|
-
|
1881
|
-
|
1882
|
-
|
1883
|
-
|
1952
|
+
# Use export() to save the image
|
1953
|
+
if include_highlights:
|
1954
|
+
self.export(
|
1955
|
+
path=filename,
|
1956
|
+
resolution=resolution,
|
1957
|
+
width=width,
|
1958
|
+
labels=labels,
|
1959
|
+
legend_position=legend_position,
|
1960
|
+
render_ocr=render_ocr,
|
1961
|
+
**kwargs,
|
1962
|
+
)
|
1963
|
+
else:
|
1964
|
+
# For saving without highlights, use render() and save manually
|
1965
|
+
img = self.render(resolution=resolution, **kwargs)
|
1966
|
+
if img:
|
1967
|
+
# Resize if width is specified
|
1968
|
+
if width is not None and width > 0 and img.width > 0:
|
1969
|
+
aspect_ratio = img.height / img.width
|
1970
|
+
height = int(width * aspect_ratio)
|
1971
|
+
try:
|
1972
|
+
img = img.resize((width, height), Image.Resampling.LANCZOS)
|
1973
|
+
except Exception as e:
|
1974
|
+
logger.warning(f"Could not resize image: {e}")
|
1975
|
+
|
1976
|
+
# Save the image
|
1977
|
+
try:
|
1978
|
+
if os.path.dirname(filename):
|
1979
|
+
os.makedirs(os.path.dirname(filename), exist_ok=True)
|
1980
|
+
img.save(filename)
|
1981
|
+
except Exception as e:
|
1982
|
+
logger.error(f"Failed to save image to {filename}: {e}")
|
1983
|
+
|
1884
1984
|
return self
|
1885
1985
|
|
1886
1986
|
def clear_highlights(self) -> "Page":
|
@@ -1923,280 +2023,6 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
1923
2023
|
# Return the collection of elements which now have style attributes
|
1924
2024
|
return processed_elements_collection
|
1925
2025
|
|
1926
|
-
def to_image(
|
1927
|
-
self,
|
1928
|
-
path: Optional[str] = None,
|
1929
|
-
width: Optional[int] = None,
|
1930
|
-
labels: bool = True,
|
1931
|
-
legend_position: str = "right",
|
1932
|
-
render_ocr: bool = False,
|
1933
|
-
resolution: Optional[float] = None,
|
1934
|
-
include_highlights: bool = True,
|
1935
|
-
exclusions: Optional[str] = None, # New parameter
|
1936
|
-
**kwargs,
|
1937
|
-
) -> Optional[Image.Image]:
|
1938
|
-
"""
|
1939
|
-
Generate a PIL image of the page, using HighlightingService if needed.
|
1940
|
-
|
1941
|
-
Args:
|
1942
|
-
path: Optional path to save the image to.
|
1943
|
-
width: Optional width for the output image.
|
1944
|
-
labels: Whether to include a legend for highlights.
|
1945
|
-
legend_position: Position of the legend.
|
1946
|
-
render_ocr: Whether to render OCR text on highlights.
|
1947
|
-
resolution: Resolution in DPI for base page image. If None, uses global setting or defaults to 144 DPI.
|
1948
|
-
include_highlights: Whether to render highlights.
|
1949
|
-
exclusions: Accepts one of the following:
|
1950
|
-
• None – no masking (default)
|
1951
|
-
• "mask" – mask using solid white (back-compat)
|
1952
|
-
• CSS/HTML colour string (e.g. "red", "#ff0000", "#ff000080")
|
1953
|
-
• Tuple of RGB or RGBA values (ints 0-255 or floats 0-1)
|
1954
|
-
All excluded regions are filled with this colour.
|
1955
|
-
**kwargs: Additional parameters for pdfplumber.to_image.
|
1956
|
-
|
1957
|
-
Returns:
|
1958
|
-
PIL Image of the page, or None if rendering fails.
|
1959
|
-
"""
|
1960
|
-
# Apply global options as defaults, but allow explicit parameters to override
|
1961
|
-
import natural_pdf
|
1962
|
-
|
1963
|
-
# Determine if this is likely a computational use (OCR, analysis, etc.)
|
1964
|
-
# If resolution is explicitly provided but width is not, assume computational use
|
1965
|
-
# and don't apply global display width settings
|
1966
|
-
is_computational_use = (resolution is not None and width is None and
|
1967
|
-
kwargs.get('include_highlights', True) is False)
|
1968
|
-
|
1969
|
-
# Use global options if parameters are not explicitly set
|
1970
|
-
if width is None and not is_computational_use:
|
1971
|
-
width = natural_pdf.options.image.width
|
1972
|
-
if resolution is None:
|
1973
|
-
if natural_pdf.options.image.resolution is not None:
|
1974
|
-
resolution = natural_pdf.options.image.resolution
|
1975
|
-
else:
|
1976
|
-
resolution = 144 # Default resolution when none specified
|
1977
|
-
# 1. Create cache key (excluding path)
|
1978
|
-
cache_key_parts = [
|
1979
|
-
width,
|
1980
|
-
labels,
|
1981
|
-
legend_position,
|
1982
|
-
render_ocr,
|
1983
|
-
resolution,
|
1984
|
-
include_highlights,
|
1985
|
-
exclusions,
|
1986
|
-
]
|
1987
|
-
# Convert kwargs to a stable, hashable representation
|
1988
|
-
sorted_kwargs_list = []
|
1989
|
-
for k, v in sorted(kwargs.items()):
|
1990
|
-
if isinstance(v, list):
|
1991
|
-
try:
|
1992
|
-
v = tuple(v) # Convert lists to tuples
|
1993
|
-
except TypeError: # pragma: no cover
|
1994
|
-
# If list contains unhashable items, fall back to repr or skip
|
1995
|
-
# For simplicity, we'll try to proceed; hashing will fail if v remains unhashable
|
1996
|
-
logger.warning(
|
1997
|
-
f"Cache key generation: List item in kwargs['{k}'] could not be converted to tuple due to unhashable elements."
|
1998
|
-
)
|
1999
|
-
sorted_kwargs_list.append((k, v))
|
2000
|
-
|
2001
|
-
cache_key_parts.append(tuple(sorted_kwargs_list))
|
2002
|
-
|
2003
|
-
try:
|
2004
|
-
cache_key = tuple(cache_key_parts)
|
2005
|
-
except TypeError as e: # pragma: no cover
|
2006
|
-
logger.warning(
|
2007
|
-
f"Page {self.index}: Could not create cache key for to_image due to unhashable item: {e}. Proceeding without cache for this call."
|
2008
|
-
)
|
2009
|
-
cache_key = None # Fallback to not using cache for this call
|
2010
|
-
|
2011
|
-
image_to_return: Optional[Image.Image] = None
|
2012
|
-
|
2013
|
-
# 2. Check cache
|
2014
|
-
if cache_key is not None and cache_key in self._to_image_cache:
|
2015
|
-
image_to_return = self._to_image_cache[cache_key]
|
2016
|
-
logger.debug(f"Page {self.index}: Returning cached image for key: {cache_key}")
|
2017
|
-
else:
|
2018
|
-
# --- This is the original logic to generate the image ---
|
2019
|
-
rendered_image_component: Optional[Image.Image] = (
|
2020
|
-
None # Renamed from 'image' in original
|
2021
|
-
)
|
2022
|
-
render_resolution = resolution
|
2023
|
-
thread_id = threading.current_thread().name
|
2024
|
-
logger.debug(
|
2025
|
-
f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
|
2026
|
-
)
|
2027
|
-
lock_wait_start = time.monotonic()
|
2028
|
-
try:
|
2029
|
-
# Acquire the global PDF rendering lock
|
2030
|
-
with pdf_render_lock:
|
2031
|
-
lock_acquired_time = time.monotonic()
|
2032
|
-
logger.debug(
|
2033
|
-
f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
|
2034
|
-
)
|
2035
|
-
if include_highlights:
|
2036
|
-
# Delegate rendering to the central service
|
2037
|
-
rendered_image_component = self._highlighter.render_page(
|
2038
|
-
page_index=self.index,
|
2039
|
-
resolution=render_resolution,
|
2040
|
-
labels=labels,
|
2041
|
-
legend_position=legend_position,
|
2042
|
-
render_ocr=render_ocr,
|
2043
|
-
**kwargs,
|
2044
|
-
)
|
2045
|
-
else:
|
2046
|
-
rendered_image_component = render_plain_page(self, render_resolution)
|
2047
|
-
except Exception as e:
|
2048
|
-
logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
|
2049
|
-
# rendered_image_component remains None
|
2050
|
-
finally:
|
2051
|
-
render_end_time = time.monotonic()
|
2052
|
-
logger.debug(
|
2053
|
-
f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
|
2054
|
-
)
|
2055
|
-
|
2056
|
-
if rendered_image_component is None:
|
2057
|
-
if cache_key is not None:
|
2058
|
-
self._to_image_cache[cache_key] = None # Cache the failure
|
2059
|
-
# Save the image if path is provided (will try to save None, handled by PIL/OS)
|
2060
|
-
if path:
|
2061
|
-
try:
|
2062
|
-
if os.path.dirname(path):
|
2063
|
-
os.makedirs(os.path.dirname(path), exist_ok=True)
|
2064
|
-
if rendered_image_component is not None: # Should be None here
|
2065
|
-
rendered_image_component.save(path) # This line won't be hit if None
|
2066
|
-
# else: logger.debug("Not saving None image") # Not strictly needed
|
2067
|
-
except Exception as save_error: # pragma: no cover
|
2068
|
-
logger.error(f"Failed to save image to {path}: {save_error}")
|
2069
|
-
return None
|
2070
|
-
|
2071
|
-
# --- Apply exclusion masking if requested ---
|
2072
|
-
# This modifies 'rendered_image_component'
|
2073
|
-
image_after_masking = rendered_image_component # Start with the rendered image
|
2074
|
-
|
2075
|
-
# Determine if masking is requested and establish the fill colour
|
2076
|
-
mask_requested = exclusions is not None and self._exclusions
|
2077
|
-
mask_color: Union[str, Tuple[int, int, int, int]] = "white" # default
|
2078
|
-
|
2079
|
-
if mask_requested:
|
2080
|
-
if exclusions != "mask":
|
2081
|
-
# Attempt to parse custom colour input
|
2082
|
-
try:
|
2083
|
-
if isinstance(exclusions, tuple):
|
2084
|
-
# Handle RGB/RGBA tuples with ints 0-255 or floats 0-1
|
2085
|
-
processed = []
|
2086
|
-
all_float = all(isinstance(c, float) for c in exclusions)
|
2087
|
-
for i, c in enumerate(exclusions):
|
2088
|
-
if isinstance(c, float):
|
2089
|
-
val = int(c * 255) if all_float or i == 3 else int(c)
|
2090
|
-
else:
|
2091
|
-
val = int(c)
|
2092
|
-
processed.append(max(0, min(255, val)))
|
2093
|
-
if len(processed) == 3:
|
2094
|
-
processed.append(255) # add full alpha
|
2095
|
-
mask_color = tuple(processed) # type: ignore[assignment]
|
2096
|
-
elif isinstance(exclusions, str):
|
2097
|
-
# Try using the optional 'colour' library for rich parsing
|
2098
|
-
try:
|
2099
|
-
from colour import Color # type: ignore
|
2100
|
-
|
2101
|
-
color_obj = Color(exclusions)
|
2102
|
-
mask_color = (
|
2103
|
-
int(color_obj.red * 255),
|
2104
|
-
int(color_obj.green * 255),
|
2105
|
-
int(color_obj.blue * 255),
|
2106
|
-
255,
|
2107
|
-
)
|
2108
|
-
except Exception:
|
2109
|
-
# Fallback: if parsing fails, treat as plain string accepted by PIL
|
2110
|
-
mask_color = exclusions # e.g. "red"
|
2111
|
-
else:
|
2112
|
-
logger.warning(
|
2113
|
-
f"Unsupported exclusions colour spec: {exclusions!r}. Using white."
|
2114
|
-
)
|
2115
|
-
except Exception as colour_parse_err: # pragma: no cover
|
2116
|
-
logger.warning(
|
2117
|
-
f"Failed to parse exclusions colour {exclusions!r}: {colour_parse_err}. Using white."
|
2118
|
-
)
|
2119
|
-
|
2120
|
-
try:
|
2121
|
-
# Ensure image is mutable (RGB or RGBA)
|
2122
|
-
if image_after_masking.mode not in ("RGB", "RGBA"):
|
2123
|
-
image_after_masking = image_after_masking.convert("RGB")
|
2124
|
-
|
2125
|
-
exclusion_regions = self._get_exclusion_regions(
|
2126
|
-
include_callable=True, debug=False
|
2127
|
-
)
|
2128
|
-
if exclusion_regions:
|
2129
|
-
draw = ImageDraw.Draw(image_after_masking)
|
2130
|
-
# Scaling factor for converting PDF pts → image px
|
2131
|
-
img_scale = render_resolution / 72.0
|
2132
|
-
|
2133
|
-
# Determine fill colour compatible with current mode
|
2134
|
-
def _mode_compatible(colour):
|
2135
|
-
if isinstance(colour, tuple) and image_after_masking.mode != "RGBA":
|
2136
|
-
return colour[:3] # drop alpha for RGB images
|
2137
|
-
return colour
|
2138
|
-
|
2139
|
-
fill_colour = _mode_compatible(mask_color)
|
2140
|
-
|
2141
|
-
for region in exclusion_regions:
|
2142
|
-
img_x0 = region.x0 * img_scale
|
2143
|
-
img_top = region.top * img_scale
|
2144
|
-
img_x1 = region.x1 * img_scale
|
2145
|
-
img_bottom = region.bottom * img_scale
|
2146
|
-
|
2147
|
-
img_coords = (
|
2148
|
-
max(0, img_x0),
|
2149
|
-
max(0, img_top),
|
2150
|
-
min(image_after_masking.width, img_x1),
|
2151
|
-
min(image_after_masking.height, img_bottom),
|
2152
|
-
)
|
2153
|
-
if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
|
2154
|
-
draw.rectangle(img_coords, fill=fill_colour)
|
2155
|
-
else: # pragma: no cover
|
2156
|
-
logger.warning(
|
2157
|
-
f"Skipping invalid exclusion rect for masking: {img_coords}"
|
2158
|
-
)
|
2159
|
-
del draw # Release drawing context
|
2160
|
-
except Exception as mask_error: # pragma: no cover
|
2161
|
-
logger.error(
|
2162
|
-
f"Error applying exclusion mask to page {self.index}: {mask_error}",
|
2163
|
-
exc_info=True,
|
2164
|
-
)
|
2165
|
-
# Continue with potentially unmasked or partially masked image
|
2166
|
-
|
2167
|
-
# --- Resize the final image if width is provided ---
|
2168
|
-
image_final_content = image_after_masking # Start with image after masking
|
2169
|
-
if width is not None and width > 0 and image_final_content.width > 0:
|
2170
|
-
aspect_ratio = image_final_content.height / image_final_content.width
|
2171
|
-
height = int(width * aspect_ratio)
|
2172
|
-
try:
|
2173
|
-
image_final_content = image_final_content.resize(
|
2174
|
-
(width, height), Image.Resampling.LANCZOS
|
2175
|
-
)
|
2176
|
-
except Exception as resize_error: # pragma: no cover
|
2177
|
-
logger.warning(f"Could not resize image: {resize_error}")
|
2178
|
-
# image_final_content remains the un-resized version if resize fails
|
2179
|
-
|
2180
|
-
# Store in cache
|
2181
|
-
if cache_key is not None:
|
2182
|
-
self._to_image_cache[cache_key] = image_final_content
|
2183
|
-
logger.debug(f"Page {self.index}: Cached image for key: {cache_key}")
|
2184
|
-
image_to_return = image_final_content
|
2185
|
-
# --- End of cache miss block ---
|
2186
|
-
|
2187
|
-
# Save the image (either from cache or newly generated) if path is provided
|
2188
|
-
if path and image_to_return:
|
2189
|
-
try:
|
2190
|
-
# Ensure directory exists
|
2191
|
-
if os.path.dirname(path): # Only call makedirs if there's a directory part
|
2192
|
-
os.makedirs(os.path.dirname(path), exist_ok=True)
|
2193
|
-
image_to_return.save(path)
|
2194
|
-
logger.debug(f"Saved page image to: {path}")
|
2195
|
-
except Exception as save_error: # pragma: no cover
|
2196
|
-
logger.error(f"Failed to save image to {path}: {save_error}")
|
2197
|
-
|
2198
|
-
return image_to_return
|
2199
|
-
|
2200
2026
|
def _create_text_elements_from_ocr(
|
2201
2027
|
self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
|
2202
2028
|
) -> List["TextElement"]:
|
@@ -2309,7 +2135,8 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
2309
2135
|
# Get base image without highlights using the determined resolution
|
2310
2136
|
# Use the global PDF rendering lock
|
2311
2137
|
with pdf_render_lock:
|
2312
|
-
|
2138
|
+
# Use render() for clean image without highlights
|
2139
|
+
image = self.render(resolution=final_resolution)
|
2313
2140
|
if not image:
|
2314
2141
|
logger.error(
|
2315
2142
|
f" Failed to render page {self.number} to image for OCR extraction."
|
@@ -2491,7 +2318,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
2491
2318
|
return self
|
2492
2319
|
|
2493
2320
|
def get_section_between(
|
2494
|
-
self, start_element=None, end_element=None,
|
2321
|
+
self, start_element=None, end_element=None, include_boundaries="both"
|
2495
2322
|
) -> Optional["Region"]: # Return Optional
|
2496
2323
|
"""
|
2497
2324
|
Get a section between two elements on this page.
|
@@ -2504,7 +2331,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
2504
2331
|
return page_region.get_section_between(
|
2505
2332
|
start_element=start_element,
|
2506
2333
|
end_element=end_element,
|
2507
|
-
|
2334
|
+
include_boundaries=include_boundaries,
|
2508
2335
|
)
|
2509
2336
|
except Exception as e:
|
2510
2337
|
logger.error(
|
@@ -2526,7 +2353,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
2526
2353
|
self,
|
2527
2354
|
start_elements=None,
|
2528
2355
|
end_elements=None,
|
2529
|
-
|
2356
|
+
include_boundaries="start",
|
2530
2357
|
y_threshold=5.0,
|
2531
2358
|
bounding_box=None,
|
2532
2359
|
) -> "ElementCollection[Region]":
|
@@ -2567,8 +2394,8 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
2567
2394
|
end_elements = []
|
2568
2395
|
|
2569
2396
|
valid_inclusions = ["start", "end", "both", "none"]
|
2570
|
-
if
|
2571
|
-
raise ValueError(f"
|
2397
|
+
if include_boundaries not in valid_inclusions:
|
2398
|
+
raise ValueError(f"include_boundaries must be one of {valid_inclusions}")
|
2572
2399
|
|
2573
2400
|
if not start_elements:
|
2574
2401
|
# Return an empty ElementCollection if no start elements
|
@@ -2600,12 +2427,12 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
2600
2427
|
# Determine region boundaries
|
2601
2428
|
sec_top = (
|
2602
2429
|
current_start_element.top
|
2603
|
-
if
|
2430
|
+
if include_boundaries in ["start", "both"]
|
2604
2431
|
else current_start_element.bottom
|
2605
2432
|
)
|
2606
2433
|
sec_bottom = (
|
2607
2434
|
end_boundary_el.top
|
2608
|
-
if
|
2435
|
+
if include_boundaries not in ["end", "both"]
|
2609
2436
|
else end_boundary_el.bottom
|
2610
2437
|
)
|
2611
2438
|
|
@@ -2627,12 +2454,12 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
2627
2454
|
end_boundary_el = element
|
2628
2455
|
sec_top = (
|
2629
2456
|
current_start_element.top
|
2630
|
-
if
|
2457
|
+
if include_boundaries in ["start", "both"]
|
2631
2458
|
else current_start_element.bottom
|
2632
2459
|
)
|
2633
2460
|
sec_bottom = (
|
2634
2461
|
end_boundary_el.bottom
|
2635
|
-
if
|
2462
|
+
if include_boundaries in ["end", "both"]
|
2636
2463
|
else end_boundary_el.top
|
2637
2464
|
)
|
2638
2465
|
|
@@ -2652,7 +2479,7 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
2652
2479
|
if active_section_started:
|
2653
2480
|
sec_top = (
|
2654
2481
|
current_start_element.top
|
2655
|
-
if
|
2482
|
+
if include_boundaries in ["start", "both"]
|
2656
2483
|
else current_start_element.bottom
|
2657
2484
|
)
|
2658
2485
|
x0, _, x1, page_bottom = get_bounds()
|
@@ -3069,13 +2896,8 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
3069
2896
|
else default_resolution
|
3070
2897
|
)
|
3071
2898
|
|
3072
|
-
# Use
|
3073
|
-
img = self.
|
3074
|
-
resolution=resolution,
|
3075
|
-
include_highlights=False,
|
3076
|
-
labels=False,
|
3077
|
-
exclusions=None, # Don't mask exclusions for classification input image
|
3078
|
-
)
|
2899
|
+
# Use render() for clean image without highlights
|
2900
|
+
img = self.render(resolution=resolution)
|
3079
2901
|
if img is None:
|
3080
2902
|
raise ValueError(
|
3081
2903
|
"Cannot classify page with 'vision' model: Failed to render image."
|
@@ -3134,7 +2956,8 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
3134
2956
|
logger.debug(f"Page {self.number}: Detecting skew angle (resolution={resolution} DPI)...")
|
3135
2957
|
try:
|
3136
2958
|
# Render the page at the specified detection resolution
|
3137
|
-
|
2959
|
+
# Use render() for clean image without highlights
|
2960
|
+
img = self.render(resolution=resolution)
|
3138
2961
|
if not img:
|
3139
2962
|
logger.warning(f"Page {self.number}: Failed to render image for skew detection.")
|
3140
2963
|
self._skew_angle = None
|
@@ -3213,7 +3036,8 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
3213
3036
|
|
3214
3037
|
try:
|
3215
3038
|
# Render the original page at the desired output resolution
|
3216
|
-
|
3039
|
+
# Use render() for clean image without highlights
|
3040
|
+
img = self.render(resolution=resolution)
|
3217
3041
|
if not img:
|
3218
3042
|
logger.error(f"Page {self.number}: Failed to render image for deskewing.")
|
3219
3043
|
return None
|
@@ -3303,32 +3127,33 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
3303
3127
|
def _apply_rtl_processing_to_text(self, text: str) -> str:
|
3304
3128
|
"""
|
3305
3129
|
Apply RTL (Right-to-Left) text processing to a string.
|
3306
|
-
|
3130
|
+
|
3307
3131
|
This converts visual order text (as stored in PDFs) to logical order
|
3308
3132
|
for proper display of Arabic, Hebrew, and other RTL scripts.
|
3309
|
-
|
3133
|
+
|
3310
3134
|
Args:
|
3311
3135
|
text: Input text string in visual order
|
3312
|
-
|
3136
|
+
|
3313
3137
|
Returns:
|
3314
3138
|
Text string in logical order
|
3315
3139
|
"""
|
3316
3140
|
if not text or not text.strip():
|
3317
3141
|
return text
|
3318
|
-
|
3142
|
+
|
3319
3143
|
# Quick check for RTL characters - if none found, return as-is
|
3320
3144
|
import unicodedata
|
3321
|
-
|
3145
|
+
|
3322
3146
|
def _contains_rtl(s):
|
3323
3147
|
return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
|
3324
|
-
|
3148
|
+
|
3325
3149
|
if not _contains_rtl(text):
|
3326
3150
|
return text
|
3327
|
-
|
3151
|
+
|
3328
3152
|
try:
|
3329
3153
|
from bidi.algorithm import get_display # type: ignore
|
3154
|
+
|
3330
3155
|
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
3331
|
-
|
3156
|
+
|
3332
3157
|
# Apply BiDi algorithm to convert from visual to logical order
|
3333
3158
|
# Process line by line to handle mixed content properly
|
3334
3159
|
processed_lines = []
|
@@ -3341,9 +3166,9 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
3341
3166
|
processed_lines.append(mirror_brackets(logical_line))
|
3342
3167
|
else:
|
3343
3168
|
processed_lines.append(line)
|
3344
|
-
|
3169
|
+
|
3345
3170
|
return "\n".join(processed_lines)
|
3346
|
-
|
3171
|
+
|
3347
3172
|
except (ImportError, Exception):
|
3348
3173
|
# If bidi library is not available or fails, return original text
|
3349
3174
|
return text
|
@@ -3361,3 +3186,31 @@ class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin,
|
|
3361
3186
|
def images(self) -> List[Any]:
|
3362
3187
|
"""Get all embedded raster images on this page."""
|
3363
3188
|
return self._element_mgr.images
|
3189
|
+
|
3190
|
+
def highlights(self, show: bool = False) -> "HighlightContext":
|
3191
|
+
"""
|
3192
|
+
Create a highlight context for accumulating highlights.
|
3193
|
+
|
3194
|
+
This allows for clean syntax to show multiple highlight groups:
|
3195
|
+
|
3196
|
+
Example:
|
3197
|
+
with page.highlights() as h:
|
3198
|
+
h.add(page.find_all('table'), label='tables', color='blue')
|
3199
|
+
h.add(page.find_all('text:bold'), label='bold text', color='red')
|
3200
|
+
h.show()
|
3201
|
+
|
3202
|
+
Or with automatic display:
|
3203
|
+
with page.highlights(show=True) as h:
|
3204
|
+
h.add(page.find_all('table'), label='tables')
|
3205
|
+
h.add(page.find_all('text:bold'), label='bold')
|
3206
|
+
# Automatically shows when exiting the context
|
3207
|
+
|
3208
|
+
Args:
|
3209
|
+
show: If True, automatically show highlights when exiting context
|
3210
|
+
|
3211
|
+
Returns:
|
3212
|
+
HighlightContext for accumulating highlights
|
3213
|
+
"""
|
3214
|
+
from natural_pdf.core.highlighting_service import HighlightContext
|
3215
|
+
|
3216
|
+
return HighlightContext(self, show_on_exit=show)
|