natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/page.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import base64
2
+ import concurrent.futures # Added import
2
3
  import hashlib
3
4
  import io
4
5
  import json
@@ -6,14 +7,30 @@ import logging
6
7
  import os
7
8
  import re
8
9
  import tempfile
10
+ import threading
11
+ import time # Import time
9
12
  from pathlib import Path
10
- from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
13
+ from typing import ( # Added overload
14
+ TYPE_CHECKING,
15
+ Any,
16
+ Callable,
17
+ Dict,
18
+ List,
19
+ Optional,
20
+ Tuple,
21
+ Union,
22
+ overload,
23
+ )
11
24
 
12
25
  import pdfplumber
13
26
  from PIL import Image, ImageDraw
27
+ from tqdm.auto import tqdm # Added tqdm import
14
28
 
15
29
  from natural_pdf.elements.collections import ElementCollection
16
30
  from natural_pdf.elements.region import Region
31
+ from natural_pdf.selectors.parser import parse_selector
32
+ from natural_pdf.utils.locks import pdf_render_lock # Import from utils instead
33
+ from natural_pdf.utils.visualization import render_plain_page
17
34
 
18
35
  if TYPE_CHECKING:
19
36
  import pdfplumber
@@ -26,6 +43,8 @@ if TYPE_CHECKING:
26
43
  # New Imports
27
44
  import itertools
28
45
 
46
+ # Deskew Imports (Conditional)
47
+ import numpy as np
29
48
  from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
30
49
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
31
50
 
@@ -34,22 +53,40 @@ from natural_pdf.analyzers.layout.layout_manager import LayoutManager
34
53
  from natural_pdf.analyzers.layout.layout_options import LayoutOptions
35
54
  from natural_pdf.analyzers.text_options import TextStyleOptions
36
55
  from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
56
+ from natural_pdf.classification.manager import ClassificationManager # For type hint
57
+
58
+ # --- Classification Imports --- #
59
+ from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
37
60
  from natural_pdf.core.element_manager import ElementManager
61
+ from natural_pdf.elements.base import Element # Import base element
38
62
  from natural_pdf.elements.text import TextElement
63
+ from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
39
64
  from natural_pdf.ocr import OCRManager, OCROptions
65
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
66
+ from natural_pdf.qa import DocumentQA, get_qa_engine
67
+ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
40
68
 
41
69
  # Import new utils
42
70
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
43
71
  from natural_pdf.widgets import InteractiveViewerWidget
44
72
  from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
45
73
 
46
- from natural_pdf.qa import DocumentQA, get_qa_engine
47
- from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
74
+ # --- End Classification Imports --- #
75
+
76
+
77
+ try:
78
+ from deskew import determine_skew
79
+
80
+ DESKEW_AVAILABLE = True
81
+ except ImportError:
82
+ DESKEW_AVAILABLE = False
83
+ determine_skew = None
84
+ # End Deskew Imports
48
85
 
49
86
  logger = logging.getLogger(__name__)
50
87
 
51
88
 
52
- class Page:
89
+ class Page(ClassificationMixin, ExtractionMixin):
53
90
  """
54
91
  Enhanced Page wrapper built on top of pdfplumber.Page.
55
92
 
@@ -72,6 +109,11 @@ class Page:
72
109
  self._index = index
73
110
  self._text_styles = None # Lazy-loaded text style analyzer results
74
111
  self._exclusions = [] # List to store exclusion functions/regions
112
+ self._skew_angle: Optional[float] = None # Stores detected skew angle
113
+
114
+ # --- ADDED --- Metadata store for mixins
115
+ self.metadata: Dict[str, Any] = {}
116
+ # --- END ADDED ---
75
117
 
76
118
  # Region management
77
119
  self._regions = {
@@ -79,8 +121,11 @@ class Page:
79
121
  "named": {}, # Named regions (name -> region)
80
122
  }
81
123
 
82
- # Initialize ElementManager
83
- self._element_mgr = ElementManager(self, font_attrs)
124
+ # Initialize ElementManager, passing font_attrs
125
+ self._element_mgr = ElementManager(self, font_attrs=font_attrs)
126
+ # self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
127
+ # --- NEW --- Central registry for analysis results
128
+ self.analyses: Dict[str, Any] = {}
84
129
 
85
130
  # --- Get OCR Manager Instance ---
86
131
  if (
@@ -115,6 +160,8 @@ class Page:
115
160
  # Initialize the internal variable with a single underscore
116
161
  self._layout_analyzer = None
117
162
 
163
+ self._load_elements()
164
+
118
165
  @property
119
166
  def pdf(self) -> "PDF":
120
167
  """Provides public access to the parent PDF object."""
@@ -412,25 +459,79 @@ class Page:
412
459
 
413
460
  return filtered_elements
414
461
 
415
- def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Any:
462
+ @overload
463
+ def find(
464
+ self,
465
+ *,
466
+ text: str,
467
+ apply_exclusions: bool = True,
468
+ regex: bool = False,
469
+ case: bool = True,
470
+ **kwargs,
471
+ ) -> Optional[Any]: ...
472
+
473
+ @overload
474
+ def find(
475
+ self,
476
+ selector: str,
477
+ *,
478
+ apply_exclusions: bool = True,
479
+ regex: bool = False,
480
+ case: bool = True,
481
+ **kwargs,
482
+ ) -> Optional[Any]: ...
483
+
484
+ def find(
485
+ self,
486
+ selector: Optional[str] = None, # Now optional
487
+ *, # Force subsequent args to be keyword-only
488
+ text: Optional[str] = None, # New text parameter
489
+ apply_exclusions: bool = True,
490
+ regex: bool = False,
491
+ case: bool = True,
492
+ **kwargs,
493
+ ) -> Optional[Any]:
416
494
  """
417
- Find first element on this page matching selector.
495
+ Find first element on this page matching selector OR text content.
496
+
497
+ Provide EITHER `selector` OR `text`, but not both.
418
498
 
419
499
  Args:
420
- selector: CSS-like selector string
421
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
422
- regex: Whether to use regex for text search in :contains (default: False)
423
- case: Whether to do case-sensitive text search (default: True)
424
- **kwargs: Additional filter parameters
500
+ selector: CSS-like selector string.
501
+ text: Text content to search for (equivalent to 'text:contains(...)').
502
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
503
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
504
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
505
+ **kwargs: Additional filter parameters.
425
506
 
426
507
  Returns:
427
- Element object or None if not found
428
- """
429
- from natural_pdf.selectors.parser import parse_selector
508
+ Element object or None if not found.
509
+ """
510
+ if selector is not None and text is not None:
511
+ raise ValueError("Provide either 'selector' or 'text', not both.")
512
+ if selector is None and text is None:
513
+ raise ValueError("Provide either 'selector' or 'text'.")
514
+
515
+ # Construct selector if 'text' is provided
516
+ effective_selector = ""
517
+ if text is not None:
518
+ # Escape quotes within the text for the selector string
519
+ escaped_text = text.replace('"', '\\"').replace("'", "\\'")
520
+ # Default to 'text:contains(...)'
521
+ effective_selector = f'text:contains("{escaped_text}")'
522
+ # Note: regex/case handled by kwargs passed down
523
+ logger.debug(
524
+ f"Using text shortcut: find(text='{text}') -> find('{effective_selector}')"
525
+ )
526
+ elif selector is not None:
527
+ effective_selector = selector
528
+ else:
529
+ # Should be unreachable due to checks above
530
+ raise ValueError("Internal error: No selector or text provided.")
430
531
 
431
- selector_obj = parse_selector(selector)
532
+ selector_obj = parse_selector(effective_selector)
432
533
 
433
- # Pass regex and case flags to selector function
534
+ # Pass regex and case flags to selector function via kwargs
434
535
  kwargs["regex"] = regex
435
536
  kwargs["case"] = case
436
537
 
@@ -450,27 +551,80 @@ class Page:
450
551
  else:
451
552
  return None
452
553
 
554
+ @overload
555
+ def find_all(
556
+ self,
557
+ *,
558
+ text: str,
559
+ apply_exclusions: bool = True,
560
+ regex: bool = False,
561
+ case: bool = True,
562
+ **kwargs,
563
+ ) -> "ElementCollection": ...
564
+
565
+ @overload
566
+ def find_all(
567
+ self,
568
+ selector: str,
569
+ *,
570
+ apply_exclusions: bool = True,
571
+ regex: bool = False,
572
+ case: bool = True,
573
+ **kwargs,
574
+ ) -> "ElementCollection": ...
575
+
453
576
  def find_all(
454
- self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
577
+ self,
578
+ selector: Optional[str] = None, # Now optional
579
+ *, # Force subsequent args to be keyword-only
580
+ text: Optional[str] = None, # New text parameter
581
+ apply_exclusions: bool = True,
582
+ regex: bool = False,
583
+ case: bool = True,
584
+ **kwargs,
455
585
  ) -> "ElementCollection":
456
586
  """
457
- Find all elements on this page matching selector.
587
+ Find all elements on this page matching selector OR text content.
588
+
589
+ Provide EITHER `selector` OR `text`, but not both.
458
590
 
459
591
  Args:
460
- selector: CSS-like selector string
461
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
462
- regex: Whether to use regex for text search in :contains (default: False)
463
- case: Whether to do case-sensitive text search (default: True)
464
- **kwargs: Additional filter parameters
592
+ selector: CSS-like selector string.
593
+ text: Text content to search for (equivalent to 'text:contains(...)').
594
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
595
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
596
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
597
+ **kwargs: Additional filter parameters.
465
598
 
466
599
  Returns:
467
- ElementCollection with matching elements
468
- """
469
- from natural_pdf.selectors.parser import parse_selector
600
+ ElementCollection with matching elements.
601
+ """
602
+ from natural_pdf.elements.collections import ElementCollection # Import here for type hint
603
+
604
+ if selector is not None and text is not None:
605
+ raise ValueError("Provide either 'selector' or 'text', not both.")
606
+ if selector is None and text is None:
607
+ raise ValueError("Provide either 'selector' or 'text'.")
608
+
609
+ # Construct selector if 'text' is provided
610
+ effective_selector = ""
611
+ if text is not None:
612
+ # Escape quotes within the text for the selector string
613
+ escaped_text = text.replace('"', '\\"').replace("'", "\\'")
614
+ # Default to 'text:contains(...)'
615
+ effective_selector = f'text:contains("{escaped_text}")'
616
+ logger.debug(
617
+ f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
618
+ )
619
+ elif selector is not None:
620
+ effective_selector = selector
621
+ else:
622
+ # Should be unreachable due to checks above
623
+ raise ValueError("Internal error: No selector or text provided.")
470
624
 
471
- selector_obj = parse_selector(selector)
625
+ selector_obj = parse_selector(effective_selector)
472
626
 
473
- # Pass regex and case flags to selector function
627
+ # Pass regex and case flags to selector function via kwargs
474
628
  kwargs["regex"] = regex
475
629
  kwargs["case"] = case
476
630
 
@@ -1257,38 +1411,39 @@ class Page:
1257
1411
  """
1258
1412
  image = None
1259
1413
  render_resolution = resolution if resolution is not None else scale * 72
1414
+ thread_id = threading.current_thread().name
1415
+ logger.debug(
1416
+ f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
1417
+ )
1418
+ lock_wait_start = time.monotonic()
1260
1419
  try:
1261
- if include_highlights:
1262
- # Delegate rendering to the central service
1263
- image = self._highlighter.render_page(
1264
- page_index=self.index,
1265
- scale=scale, # Note: scale is used by highlighter internally for drawing
1266
- labels=labels,
1267
- legend_position=legend_position,
1268
- render_ocr=render_ocr,
1269
- resolution=render_resolution, # Pass the calculated resolution
1270
- **kwargs,
1271
- )
1272
- else:
1273
- # Get the base page image directly from pdfplumber if no highlights needed
1274
- # Use the underlying pdfplumber page object
1275
- img_object = self._page.to_image(resolution=render_resolution, **kwargs)
1276
- # Access the PIL image directly (assuming pdfplumber structure)
1277
- image = (
1278
- img_object.annotated
1279
- if hasattr(img_object, "annotated")
1280
- else img_object._repr_png_()
1420
+ # Acquire the global PDF rendering lock
1421
+ with pdf_render_lock:
1422
+ lock_acquired_time = time.monotonic()
1423
+ logger.debug(
1424
+ f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
1281
1425
  )
1282
- if isinstance(image, bytes): # Handle cases where it returns bytes
1283
- from io import BytesIO
1284
-
1285
- image = Image.open(BytesIO(image)).convert(
1286
- "RGB"
1287
- ) # Convert to RGB for consistency
1288
-
1426
+ if include_highlights:
1427
+ # Delegate rendering to the central service
1428
+ image = self._highlighter.render_page(
1429
+ page_index=self.index,
1430
+ scale=scale,
1431
+ labels=labels,
1432
+ legend_position=legend_position,
1433
+ render_ocr=render_ocr,
1434
+ resolution=render_resolution, # Pass the calculated resolution
1435
+ **kwargs,
1436
+ )
1437
+ else:
1438
+ image = render_plain_page(self, render_resolution)
1289
1439
  except Exception as e:
1290
1440
  logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
1291
1441
  return None # Return None on error
1442
+ finally:
1443
+ render_end_time = time.monotonic()
1444
+ logger.debug(
1445
+ f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
1446
+ )
1292
1447
 
1293
1448
  if image is None:
1294
1449
  return None
@@ -1384,6 +1539,7 @@ class Page:
1384
1539
  resolution: Optional[int] = None,
1385
1540
  detect_only: bool = False,
1386
1541
  apply_exclusions: bool = True,
1542
+ replace: bool = True,
1387
1543
  ) -> "Page":
1388
1544
  """
1389
1545
  Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
@@ -1397,13 +1553,23 @@ class Page:
1397
1553
  resolution: DPI resolution for rendering page image before OCR.
1398
1554
  apply_exclusions: If True (default), render page image for OCR
1399
1555
  with excluded areas masked (whited out).
1556
+ detect_only: If True, only detect text bounding boxes, don't perform OCR.
1557
+ replace: If True (default), remove any existing OCR elements before
1558
+ adding new ones. If False, add new OCR elements to existing ones.
1400
1559
 
1401
1560
  Returns:
1402
- List of created TextElements derived from OCR results for this page.
1561
+ Self for method chaining.
1403
1562
  """
1404
1563
  if not hasattr(self._parent, "apply_ocr"):
1405
1564
  logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
1406
- return [] # Return empty list for consistency
1565
+ return self # Return self for chaining
1566
+
1567
+ # Remove existing OCR elements if replace is True
1568
+ if replace and hasattr(self, "_element_mgr"):
1569
+ logger.info(
1570
+ f"Page {self.number}: Removing existing OCR elements before applying new OCR."
1571
+ )
1572
+ self._element_mgr.remove_ocr_elements()
1407
1573
 
1408
1574
  logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
1409
1575
  try:
@@ -1419,18 +1585,13 @@ class Page:
1419
1585
  resolution=resolution,
1420
1586
  detect_only=detect_only,
1421
1587
  apply_exclusions=apply_exclusions,
1588
+ replace=replace, # Pass the replace parameter to PDF.apply_ocr
1422
1589
  )
1423
1590
  except Exception as e:
1424
1591
  logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
1425
- return []
1592
+ return self # Return self for chaining
1426
1593
 
1427
- # Return the OCR elements specifically added to this page
1428
- ocr_elements = [el for el in self.words if getattr(el, "source", None) == "ocr"]
1429
- logger.debug(
1430
- f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements."
1431
- )
1432
- # Note: The method is typed to return Page for chaining, but the log indicates
1433
- # finding elements. Let's stick to returning self for chaining consistency.
1594
+ # Return self for chaining
1434
1595
  return self
1435
1596
 
1436
1597
  def extract_ocr_elements(
@@ -1471,11 +1632,15 @@ class Page:
1471
1632
 
1472
1633
  try:
1473
1634
  # Get base image without highlights using the determined resolution
1474
- image = self.to_image(resolution=final_resolution, include_highlights=False)
1475
- if not image:
1476
- logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
1477
- return []
1478
- logger.debug(f" Rendered image size: {image.width}x{image.height}")
1635
+ # Use the global PDF rendering lock
1636
+ with pdf_render_lock:
1637
+ image = self.to_image(resolution=final_resolution, include_highlights=False)
1638
+ if not image:
1639
+ logger.error(
1640
+ f" Failed to render page {self.number} to image for OCR extraction."
1641
+ )
1642
+ return []
1643
+ logger.debug(f" Rendered image size: {image.width}x{image.height}")
1479
1644
  except Exception as e:
1480
1645
  logger.error(f" Failed to render page {self.number} to image: {e}", exc_info=True)
1481
1646
  return []
@@ -1545,6 +1710,11 @@ class Page:
1545
1710
  logger.info(f" Created {len(temp_elements)} TextElements from OCR (extract only).")
1546
1711
  return temp_elements
1547
1712
 
1713
+ @property
1714
+ def size(self) -> Tuple[float, float]:
1715
+ """Get the size of the page in points."""
1716
+ return (self._page.width, self._page.height)
1717
+
1548
1718
  @property
1549
1719
  def layout_analyzer(self) -> LayoutAnalyzer:
1550
1720
  """Get or create the layout analyzer for this page."""
@@ -1564,6 +1734,8 @@ class Page:
1564
1734
  exclude_classes: Optional[List[str]] = None,
1565
1735
  device: Optional[str] = None,
1566
1736
  existing: str = "replace",
1737
+ model_name: Optional[str] = None,
1738
+ client: Optional[Any] = None, # Add client parameter
1567
1739
  ) -> ElementCollection[Region]:
1568
1740
  """
1569
1741
  Analyze the page layout using the configured LayoutManager.
@@ -1589,6 +1761,8 @@ class Page:
1589
1761
  exclude_classes=exclude_classes,
1590
1762
  device=device,
1591
1763
  existing=existing,
1764
+ model_name=model_name,
1765
+ client=client, # Pass client down
1592
1766
  )
1593
1767
 
1594
1768
  # Retrieve the detected regions from the element manager
@@ -1659,14 +1833,24 @@ class Page:
1659
1833
  )
1660
1834
  return None
1661
1835
 
1836
+ def split(self, divider, **kwargs) -> "ElementCollection[Region]":
1837
+ """
1838
+ Divides the page into sections based on the provided divider elements.
1839
+ """
1840
+ sections = self.get_sections(start_elements=divider, **kwargs)
1841
+ top = self.region(0, 0, self.width, sections[0].top)
1842
+ sections.append(top)
1843
+
1844
+ return sections
1845
+
1662
1846
  def get_sections(
1663
1847
  self,
1664
1848
  start_elements=None,
1665
1849
  end_elements=None,
1666
- boundary_inclusion="both",
1850
+ boundary_inclusion="start",
1667
1851
  y_threshold=5.0,
1668
1852
  bounding_box=None,
1669
- ) -> "ElementCollection[Region]": # Updated type hint
1853
+ ) -> "ElementCollection[Region]":
1670
1854
  """
1671
1855
  Get sections of a page defined by start/end elements.
1672
1856
  Uses the page-level implementation.
@@ -2027,43 +2211,344 @@ class Page:
2027
2211
def correct_ocr(
    self,
    correction_callback: Callable[[Any], Optional[str]],
    max_workers: Optional[int] = None,
    progress_callback: Optional[Callable[[], None]] = None,
) -> "Page":  # Return self for chaining
    """
    Apply corrections to OCR-generated text elements on this page using a
    user-provided callback, optionally running the callback in a thread pool.

    Target elements are selected with ``text[source=ocr]`` (exclusions are
    not applied). `correction_callback` is called once per element:

    * a returned string that differs from the element's current ``text``
      overwrites ``element.text``;
    * ``None`` leaves the element untouched;
    * any other return type is logged as a warning and ignored;
    * an exception raised by the callback is logged and counted, and does
      not abort processing of the remaining elements.

    In parallel mode only the callbacks run on worker threads; element text
    is written from the calling thread as results complete.

    Args:
        correction_callback: A callable accepting an element and returning
            `Optional[str]` (new text or None). It does not need a
            ``__name__`` attribute (``functools.partial`` objects and
            callable instances are accepted).
        max_workers: The maximum number of threads to use for parallel execution.
            If None or 0 or 1, runs sequentially.
        progress_callback: Optional zero-argument callable invoked after each
            element has been processed (from the worker thread in parallel
            mode). Exceptions it raises are logged and swallowed.

    Returns:
        Self for method chaining.
    """
    # functools.partial objects and callable instances have no __name__.
    callback_name = getattr(correction_callback, "__name__", repr(correction_callback))
    logger.info(
        f"Page {self.number}: Starting OCR correction with callback '{callback_name}' (max_workers={max_workers})"
    )

    target_elements = self.find_all(
        selector="text[source=ocr]", apply_exclusions=False
    ).elements

    if not target_elements:
        logger.info(f"Page {self.number}: No OCR elements found to correct.")
        return self

    processed_count = 0
    updated_count = 0
    error_count = 0

    def _text_preview(element, limit):
        """Return up to `limit` characters of the element text for log messages."""
        return (getattr(element, "text", None) or "")[:limit]

    def _process_element_task(element):
        """Run the callback for one element; return (element, new_text, error)."""
        try:
            corrected_text = correction_callback(element)

            # Validate result type
            if corrected_text is not None and not isinstance(corrected_text, str):
                logger.warning(
                    f"Page {self.number}: Correction callback for element '{_text_preview(element, 20)}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update."
                )
                return element, None, None  # Treat as no correction

            return element, corrected_text, None
        except Exception as e:
            logger.error(
                f"Page {self.number}: Error applying correction callback to element '{_text_preview(element, 30)}...' ({getattr(element, 'bbox', None)}): {e}",
                exc_info=False,  # Keep log concise
            )
            return element, None, e
        finally:
            # Progress is reported whether the callback succeeded or failed.
            if progress_callback:
                try:
                    progress_callback()
                except Exception as cb_e:
                    # A broken progress callback must not stop processing.
                    logger.error(
                        f"Page {self.number}: Error executing progress_callback: {cb_e}",
                        exc_info=False,
                    )

    def _apply_correction(element, corrected_text):
        """Write `corrected_text` to the element; return True if the text changed."""
        if corrected_text is None or corrected_text == getattr(element, "text", None):
            return False
        element.text = corrected_text
        return True

    if max_workers is not None and max_workers > 1:
        # --- Parallel execution --- #
        logger.info(
            f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers."
        )
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_element = {
                executor.submit(_process_element_task, element): element
                for element in target_elements
            }

            # progress_callback has already been called by the worker for each result.
            for future in concurrent.futures.as_completed(future_to_element):
                processed_count += 1
                try:
                    element, corrected_text, error = future.result()
                    if error is not None:
                        error_count += 1  # Already logged in the worker
                    elif _apply_correction(element, corrected_text):
                        updated_count += 1
                except Exception as exc:
                    # Errors from future.result() itself or from assigning the text
                    element = future_to_element[future]
                    logger.error(
                        f"Page {self.number}: Internal error retrieving correction result for element {getattr(element, 'bbox', None)}: {exc}",
                        exc_info=True,
                    )
                    error_count += 1
    else:
        # --- Sequential execution --- #
        logger.info(f"Page {self.number}: Running OCR correction sequentially.")
        for element in target_elements:
            processed_count += 1
            _element, corrected_text, error = _process_element_task(element)
            if error is not None:
                error_count += 1
            elif _apply_correction(_element, corrected_text):
                updated_count += 1

    logger.info(
        f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
    )

    return self  # Return self for chaining
2345
+
2346
+ # --- Classification Mixin Implementation --- #
2347
def _get_classification_manager(self) -> "ClassificationManager":
    """
    Return the classification manager registered on the parent PDF.

    Raises:
        AttributeError: If this page has no ``pdf`` attribute, if that object
            has no ``get_manager`` attribute, or if ``get_manager`` raises
            ``ValueError``, ``RuntimeError`` or ``AttributeError`` (the
            original exception is chained as the cause).
    """
    accessor_available = hasattr(self, "pdf") and hasattr(self.pdf, "get_manager")
    if not accessor_available:
        raise AttributeError(
            "ClassificationManager cannot be accessed: Parent PDF or get_manager method missing."
        )

    try:
        # Look the manager up through the PDF's manager registry accessor.
        manager = self.pdf.get_manager("classification")
    except (ValueError, RuntimeError, AttributeError) as err:
        # Re-raise under a single exception type with a clearer message.
        raise AttributeError(f"Failed to get ClassificationManager from PDF: {err}") from err
    return manager
2358
+
2359
def _get_classification_content(
    self, model_type: str, **kwargs
) -> Union[str, "Image"]:  # "Image" kept as a string annotation (lazy import)
    """
    Produce the input content used to classify this page.

    Args:
        model_type: ``"text"`` to return the page's extracted text, or
            ``"vision"`` to return a rendered image of the page.
        **kwargs: Only ``resolution`` is read (vision mode); it defaults to 150.

    Returns:
        The extracted text for ``"text"``, or the object returned by
        ``to_image`` for ``"vision"``.

    Raises:
        ValueError: If the text is empty or whitespace-only, if the image
            could not be rendered (``to_image`` returned None), or if
            `model_type` is neither ``"text"`` nor ``"vision"``.
    """
    if model_type == "text":
        # Simple join, ignore exclusions for classification
        text_content = self.extract_text(layout=False, use_exclusions=False)
        if not text_content or text_content.isspace():
            raise ValueError("Cannot classify page with 'text' model: No text content found.")
        return text_content

    if model_type == "vision":
        # NOTE(review): the manager itself is not used below; the lookup is
        # retained so that a missing manager still raises exactly as before.
        self._get_classification_manager()

        resolution = kwargs.get("resolution", 150)

        # Render without highlights, labels or exclusion masking so that
        # nothing is drawn over the classification input image.
        img = self.to_image(
            resolution=resolution,
            include_highlights=False,
            labels=False,
            exclusions=None,
        )
        if img is None:
            raise ValueError(
                "Cannot classify page with 'vision' model: Failed to render image."
            )
        return img

    raise ValueError(f"Unsupported model_type for classification: {model_type}")
2394
+
2395
def _get_metadata_storage(self) -> Dict[str, Any]:
    """Return ``self.metadata``, first setting it to ``{}`` if it is missing or None."""
    if getattr(self, "metadata", None) is None:
        self.metadata = {}
    return self.metadata
2400
+
2401
+ # --- Content Extraction ---
2402
+
2403
+ # --- Skew Detection and Correction --- #
2404
+
2405
@property
def skew_angle(self) -> Optional[float]:
    """Return the skew angle currently stored on this page (``_skew_angle``), which may be None."""
    stored_angle = self._skew_angle
    return stored_angle
2409
+
2410
def detect_skew_angle(
    self,
    resolution: int = 72,
    grayscale: bool = True,
    force_recalculate: bool = False,
    **deskew_kwargs,
) -> Optional[float]:
    """
    Detect the skew angle of the rendered page image and cache it on the page.

    The page is rendered with ``to_image``, converted to a numpy array,
    optionally reduced to a single channel, and handed to ``determine_skew``.
    The result is stored in ``self._skew_angle``; on any failure that
    attribute is reset to None.

    Args:
        resolution: DPI resolution for rendering the page image for detection.
        grayscale: Whether to convert the image to grayscale before detection.
        force_recalculate: If True, recalculate even if an angle exists.
        **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
                         (e.g., `max_angle`, `num_peaks`).

    Returns:
        The detected skew angle in degrees, or None if detection failed.

    Raises:
        ImportError: If the 'deskew' library is not installed.
    """
    if not DESKEW_AVAILABLE:
        raise ImportError(
            "Deskew library not found. Install with: pip install natural-pdf[deskew]"
        )

    use_cached = not force_recalculate and self._skew_angle is not None
    if use_cached:
        logger.debug(f"Page {self.number}: Returning cached skew angle: {self._skew_angle:.2f}")
        return self._skew_angle

    logger.debug(f"Page {self.number}: Detecting skew angle (resolution={resolution} DPI)...")
    try:
        # Render the page at the detection resolution.
        rendered = self.to_image(resolution=resolution, include_highlights=False)
        if not rendered:
            logger.warning(f"Page {self.number}: Failed to render image for skew detection.")
            self._skew_angle = None
            return None

        pixels = np.array(rendered)

        # Default to the raw array; only a 3-D array with >= 3 channels is averaged.
        detection_input = pixels
        if grayscale:
            dims = len(pixels.shape)
            if dims == 3 and pixels.shape[2] >= 3:
                # Unweighted mean of the first three channels.
                detection_input = np.mean(pixels[:, :, :3], axis=2).astype(np.uint8)
            elif dims != 2:
                # Neither multi-channel nor already 2-D: warn and try it as-is.
                logger.warning(
                    f"Page {self.number}: Unexpected image shape {pixels.shape} for grayscale conversion."
                )

        # Determine skew angle using the deskew library
        detected = determine_skew(detection_input, **deskew_kwargs)
        self._skew_angle = detected
        logger.debug(f"Page {self.number}: Detected skew angle = {detected}")
        return detected

    except Exception as e:
        logger.warning(f"Page {self.number}: Failed during skew detection: {e}", exc_info=True)
        self._skew_angle = None
        return None
2478
+
2479
def deskew(
    self,
    resolution: int = 300,
    angle: Optional[float] = None,
    detection_resolution: int = 72,
    **deskew_kwargs,
) -> Optional[Image.Image]:
    """
    Render the page and return a deskewed PIL image of it.

    When `angle` is None the angle comes from `detect_skew_angle` (which may
    return a cached value). The page is then rendered at `resolution`; it is
    rotated only if the angle's magnitude exceeds 0.05 degrees, otherwise the
    unrotated render is returned.

    Args:
        resolution: DPI resolution for the output deskewed image.
        angle: The specific angle (in degrees) to rotate by. If None, detects automatically.
        detection_resolution: DPI resolution used for detection if `angle` is None.
        **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
                         if automatic detection is performed.

    Returns:
        A deskewed PIL.Image.Image object, or None if rendering/rotation fails.

    Raises:
        ImportError: If the 'deskew' library is not installed.
    """
    if not DESKEW_AVAILABLE:
        raise ImportError(
            "Deskew library not found. Install with: pip install natural-pdf[deskew]"
        )

    # An explicit angle wins; otherwise detect it (or reuse the cached one).
    if angle is None:
        rotation_angle = self.detect_skew_angle(
            resolution=detection_resolution, **deskew_kwargs
        )
    else:
        rotation_angle = angle

    logger.debug(
        f"Page {self.number}: Preparing to deskew (output resolution={resolution} DPI). Using angle: {rotation_angle}"
    )

    try:
        # Render the original page at the desired output resolution
        img = self.to_image(resolution=resolution, include_highlights=False)
        if not img:
            logger.error(f"Page {self.number}: Failed to render image for deskewing.")
            return None

        needs_rotation = rotation_angle is not None and abs(rotation_angle) > 0.05
        if not needs_rotation:
            logger.debug(
                f"Page {self.number}: No significant rotation needed (angle={rotation_angle}). Returning original render."
            )
            return img

        logger.debug(f"Page {self.number}: Rotating by {rotation_angle:.2f} degrees.")

        # White background for the corners exposed by the rotation.
        fill = 255
        if img.mode == "RGB":
            fill = (255, 255, 255)

        # PIL rotates counter-clockwise; expand=True grows the canvas to fit.
        return img.rotate(
            rotation_angle,
            resample=Image.Resampling.BILINEAR,
            expand=True,
            fillcolor=fill,
        )

    except Exception as e:
        logger.error(
            f"Page {self.number}: Error during deskewing image generation: {e}", exc_info=True
        )
        return None
2553
+
2554
+ # --- End Skew Detection and Correction --- #