natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +230 -151
  13. natural_pdf/classification/mixin.py +49 -35
  14. natural_pdf/classification/results.py +64 -46
  15. natural_pdf/collections/mixins.py +68 -20
  16. natural_pdf/collections/pdf_collection.py +177 -64
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +633 -190
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +503 -131
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/page.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import base64
2
+ import concurrent.futures # Added import
2
3
  import hashlib
3
4
  import io
4
5
  import json
@@ -6,19 +7,30 @@ import logging
6
7
  import os
7
8
  import re
8
9
  import tempfile
9
- import time # Import time
10
- from pathlib import Path
11
- from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
12
- import concurrent.futures # Added import
13
- from tqdm.auto import tqdm # Added tqdm import
14
10
  import threading
11
+ import time # Import time
12
+ from pathlib import Path
13
+ from typing import ( # Added overload
14
+ TYPE_CHECKING,
15
+ Any,
16
+ Callable,
17
+ Dict,
18
+ List,
19
+ Optional,
20
+ Tuple,
21
+ Union,
22
+ overload,
23
+ )
15
24
 
16
25
  import pdfplumber
17
26
  from PIL import Image, ImageDraw
27
+ from tqdm.auto import tqdm # Added tqdm import
18
28
 
19
29
  from natural_pdf.elements.collections import ElementCollection
20
30
  from natural_pdf.elements.region import Region
31
+ from natural_pdf.selectors.parser import parse_selector
21
32
  from natural_pdf.utils.locks import pdf_render_lock # Import from utils instead
33
+ from natural_pdf.utils.visualization import render_plain_page
22
34
 
23
35
  if TYPE_CHECKING:
24
36
  import pdfplumber
@@ -31,6 +43,8 @@ if TYPE_CHECKING:
31
43
  # New Imports
32
44
  import itertools
33
45
 
46
+ # Deskew Imports (Conditional)
47
+ import numpy as np
34
48
  from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
35
49
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
36
50
 
@@ -39,27 +53,35 @@ from natural_pdf.analyzers.layout.layout_manager import LayoutManager
39
53
  from natural_pdf.analyzers.layout.layout_options import LayoutOptions
40
54
  from natural_pdf.analyzers.text_options import TextStyleOptions
41
55
  from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
56
+ from natural_pdf.classification.manager import ClassificationManager # For type hint
57
+
58
+ # --- Classification Imports --- #
59
+ from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
42
60
  from natural_pdf.core.element_manager import ElementManager
61
+ from natural_pdf.elements.base import Element # Import base element
43
62
  from natural_pdf.elements.text import TextElement
63
+ from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
44
64
  from natural_pdf.ocr import OCRManager, OCROptions
65
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
66
+ from natural_pdf.qa import DocumentQA, get_qa_engine
67
+ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
45
68
 
46
69
  # Import new utils
47
70
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
48
71
  from natural_pdf.widgets import InteractiveViewerWidget
49
72
  from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
50
73
 
51
- from natural_pdf.qa import DocumentQA, get_qa_engine
52
- from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
53
-
54
- # --- Classification Imports --- #
55
- from natural_pdf.classification.mixin import ClassificationMixin
56
- from natural_pdf.classification.manager import ClassificationManager # For type hint
57
74
  # --- End Classification Imports --- #
58
75
 
59
- from natural_pdf.utils.locks import pdf_render_lock # Import the lock
60
- from natural_pdf.elements.base import Element # Import base element
61
- from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
62
- from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
76
+
77
+ try:
78
+ from deskew import determine_skew
79
+
80
+ DESKEW_AVAILABLE = True
81
+ except ImportError:
82
+ DESKEW_AVAILABLE = False
83
+ determine_skew = None
84
+ # End Deskew Imports
63
85
 
64
86
  logger = logging.getLogger(__name__)
65
87
 
@@ -87,6 +109,7 @@ class Page(ClassificationMixin, ExtractionMixin):
87
109
  self._index = index
88
110
  self._text_styles = None # Lazy-loaded text style analyzer results
89
111
  self._exclusions = [] # List to store exclusion functions/regions
112
+ self._skew_angle: Optional[float] = None # Stores detected skew angle
90
113
 
91
114
  # --- ADDED --- Metadata store for mixins
92
115
  self.metadata: Dict[str, Any] = {}
@@ -436,25 +459,79 @@ class Page(ClassificationMixin, ExtractionMixin):
436
459
 
437
460
  return filtered_elements
438
461
 
439
- def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Any:
462
+ @overload
463
+ def find(
464
+ self,
465
+ *,
466
+ text: str,
467
+ apply_exclusions: bool = True,
468
+ regex: bool = False,
469
+ case: bool = True,
470
+ **kwargs,
471
+ ) -> Optional[Any]: ...
472
+
473
+ @overload
474
+ def find(
475
+ self,
476
+ selector: str,
477
+ *,
478
+ apply_exclusions: bool = True,
479
+ regex: bool = False,
480
+ case: bool = True,
481
+ **kwargs,
482
+ ) -> Optional[Any]: ...
483
+
484
+ def find(
485
+ self,
486
+ selector: Optional[str] = None, # Now optional
487
+ *, # Force subsequent args to be keyword-only
488
+ text: Optional[str] = None, # New text parameter
489
+ apply_exclusions: bool = True,
490
+ regex: bool = False,
491
+ case: bool = True,
492
+ **kwargs,
493
+ ) -> Optional[Any]:
440
494
  """
441
- Find first element on this page matching selector.
495
+ Find first element on this page matching selector OR text content.
496
+
497
+ Provide EITHER `selector` OR `text`, but not both.
442
498
 
443
499
  Args:
444
- selector: CSS-like selector string
445
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
446
- regex: Whether to use regex for text search in :contains (default: False)
447
- case: Whether to do case-sensitive text search (default: True)
448
- **kwargs: Additional filter parameters
500
+ selector: CSS-like selector string.
501
+ text: Text content to search for (equivalent to 'text:contains(...)').
502
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
503
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
504
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
505
+ **kwargs: Additional filter parameters.
449
506
 
450
507
  Returns:
451
- Element object or None if not found
452
- """
453
- from natural_pdf.selectors.parser import parse_selector
508
+ Element object or None if not found.
509
+ """
510
+ if selector is not None and text is not None:
511
+ raise ValueError("Provide either 'selector' or 'text', not both.")
512
+ if selector is None and text is None:
513
+ raise ValueError("Provide either 'selector' or 'text'.")
514
+
515
+ # Construct selector if 'text' is provided
516
+ effective_selector = ""
517
+ if text is not None:
518
+ # Escape quotes within the text for the selector string
519
+ escaped_text = text.replace('"', '\\"').replace("'", "\\'")
520
+ # Default to 'text:contains(...)'
521
+ effective_selector = f'text:contains("{escaped_text}")'
522
+ # Note: regex/case handled by kwargs passed down
523
+ logger.debug(
524
+ f"Using text shortcut: find(text='{text}') -> find('{effective_selector}')"
525
+ )
526
+ elif selector is not None:
527
+ effective_selector = selector
528
+ else:
529
+ # Should be unreachable due to checks above
530
+ raise ValueError("Internal error: No selector or text provided.")
454
531
 
455
- selector_obj = parse_selector(selector)
532
+ selector_obj = parse_selector(effective_selector)
456
533
 
457
- # Pass regex and case flags to selector function
534
+ # Pass regex and case flags to selector function via kwargs
458
535
  kwargs["regex"] = regex
459
536
  kwargs["case"] = case
460
537
 
@@ -474,27 +551,80 @@ class Page(ClassificationMixin, ExtractionMixin):
474
551
  else:
475
552
  return None
476
553
 
554
+ @overload
555
+ def find_all(
556
+ self,
557
+ *,
558
+ text: str,
559
+ apply_exclusions: bool = True,
560
+ regex: bool = False,
561
+ case: bool = True,
562
+ **kwargs,
563
+ ) -> "ElementCollection": ...
564
+
565
+ @overload
477
566
  def find_all(
478
- self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
567
+ self,
568
+ selector: str,
569
+ *,
570
+ apply_exclusions: bool = True,
571
+ regex: bool = False,
572
+ case: bool = True,
573
+ **kwargs,
574
+ ) -> "ElementCollection": ...
575
+
576
+ def find_all(
577
+ self,
578
+ selector: Optional[str] = None, # Now optional
579
+ *, # Force subsequent args to be keyword-only
580
+ text: Optional[str] = None, # New text parameter
581
+ apply_exclusions: bool = True,
582
+ regex: bool = False,
583
+ case: bool = True,
584
+ **kwargs,
479
585
  ) -> "ElementCollection":
480
586
  """
481
- Find all elements on this page matching selector.
587
+ Find all elements on this page matching selector OR text content.
588
+
589
+ Provide EITHER `selector` OR `text`, but not both.
482
590
 
483
591
  Args:
484
- selector: CSS-like selector string
485
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
486
- regex: Whether to use regex for text search in :contains (default: False)
487
- case: Whether to do case-sensitive text search (default: True)
488
- **kwargs: Additional filter parameters
592
+ selector: CSS-like selector string.
593
+ text: Text content to search for (equivalent to 'text:contains(...)').
594
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
595
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
596
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
597
+ **kwargs: Additional filter parameters.
489
598
 
490
599
  Returns:
491
- ElementCollection with matching elements
492
- """
493
- from natural_pdf.selectors.parser import parse_selector
600
+ ElementCollection with matching elements.
601
+ """
602
+ from natural_pdf.elements.collections import ElementCollection # Import here for type hint
603
+
604
+ if selector is not None and text is not None:
605
+ raise ValueError("Provide either 'selector' or 'text', not both.")
606
+ if selector is None and text is None:
607
+ raise ValueError("Provide either 'selector' or 'text'.")
608
+
609
+ # Construct selector if 'text' is provided
610
+ effective_selector = ""
611
+ if text is not None:
612
+ # Escape quotes within the text for the selector string
613
+ escaped_text = text.replace('"', '\\"').replace("'", "\\'")
614
+ # Default to 'text:contains(...)'
615
+ effective_selector = f'text:contains("{escaped_text}")'
616
+ logger.debug(
617
+ f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
618
+ )
619
+ elif selector is not None:
620
+ effective_selector = selector
621
+ else:
622
+ # Should be unreachable due to checks above
623
+ raise ValueError("Internal error: No selector or text provided.")
494
624
 
495
- selector_obj = parse_selector(selector)
625
+ selector_obj = parse_selector(effective_selector)
496
626
 
497
- # Pass regex and case flags to selector function
627
+ # Pass regex and case flags to selector function via kwargs
498
628
  kwargs["regex"] = regex
499
629
  kwargs["case"] = case
500
630
 
@@ -1282,18 +1412,22 @@ class Page(ClassificationMixin, ExtractionMixin):
1282
1412
  image = None
1283
1413
  render_resolution = resolution if resolution is not None else scale * 72
1284
1414
  thread_id = threading.current_thread().name
1285
- logger.debug(f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image...")
1415
+ logger.debug(
1416
+ f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
1417
+ )
1286
1418
  lock_wait_start = time.monotonic()
1287
1419
  try:
1288
1420
  # Acquire the global PDF rendering lock
1289
1421
  with pdf_render_lock:
1290
1422
  lock_acquired_time = time.monotonic()
1291
- logger.debug(f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render...")
1423
+ logger.debug(
1424
+ f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
1425
+ )
1292
1426
  if include_highlights:
1293
1427
  # Delegate rendering to the central service
1294
1428
  image = self._highlighter.render_page(
1295
1429
  page_index=self.index,
1296
- scale=scale, # Note: scale is used by highlighter internally for drawing
1430
+ scale=scale,
1297
1431
  labels=labels,
1298
1432
  legend_position=legend_position,
1299
1433
  render_ocr=render_ocr,
@@ -1301,28 +1435,15 @@ class Page(ClassificationMixin, ExtractionMixin):
1301
1435
  **kwargs,
1302
1436
  )
1303
1437
  else:
1304
- # Get the base page image directly from pdfplumber if no highlights needed
1305
- # Use the underlying pdfplumber page object
1306
- img_object = self._page.to_image(resolution=render_resolution, **kwargs)
1307
- # Access the PIL image directly (assuming pdfplumber structure)
1308
- image = (
1309
- img_object.annotated
1310
- if hasattr(img_object, "annotated")
1311
- else img_object._repr_png_()
1312
- )
1313
- if isinstance(image, bytes): # Handle cases where it returns bytes
1314
- from io import BytesIO
1315
-
1316
- image = Image.open(BytesIO(image)).convert(
1317
- "RGB"
1318
- ) # Convert to RGB for consistency
1319
-
1438
+ image = render_plain_page(self, render_resolution)
1320
1439
  except Exception as e:
1321
1440
  logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
1322
1441
  return None # Return None on error
1323
1442
  finally:
1324
1443
  render_end_time = time.monotonic()
1325
- logger.debug(f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s")
1444
+ logger.debug(
1445
+ f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
1446
+ )
1326
1447
 
1327
1448
  if image is None:
1328
1449
  return None
@@ -1445,7 +1566,9 @@ class Page(ClassificationMixin, ExtractionMixin):
1445
1566
 
1446
1567
  # Remove existing OCR elements if replace is True
1447
1568
  if replace and hasattr(self, "_element_mgr"):
1448
- logger.info(f"Page {self.number}: Removing existing OCR elements before applying new OCR.")
1569
+ logger.info(
1570
+ f"Page {self.number}: Removing existing OCR elements before applying new OCR."
1571
+ )
1449
1572
  self._element_mgr.remove_ocr_elements()
1450
1573
 
1451
1574
  logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
@@ -1513,7 +1636,9 @@ class Page(ClassificationMixin, ExtractionMixin):
1513
1636
  with pdf_render_lock:
1514
1637
  image = self.to_image(resolution=final_resolution, include_highlights=False)
1515
1638
  if not image:
1516
- logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
1639
+ logger.error(
1640
+ f" Failed to render page {self.number} to image for OCR extraction."
1641
+ )
1517
1642
  return []
1518
1643
  logger.debug(f" Rendered image size: {image.width}x{image.height}")
1519
1644
  except Exception as e:
@@ -1585,6 +1710,11 @@ class Page(ClassificationMixin, ExtractionMixin):
1585
1710
  logger.info(f" Created {len(temp_elements)} TextElements from OCR (extract only).")
1586
1711
  return temp_elements
1587
1712
 
1713
+ @property
1714
+ def size(self) -> Tuple[float, float]:
1715
+ """Get the size of the page in points."""
1716
+ return (self._page.width, self._page.height)
1717
+
1588
1718
  @property
1589
1719
  def layout_analyzer(self) -> LayoutAnalyzer:
1590
1720
  """Get or create the layout analyzer for this page."""
@@ -1604,6 +1734,8 @@ class Page(ClassificationMixin, ExtractionMixin):
1604
1734
  exclude_classes: Optional[List[str]] = None,
1605
1735
  device: Optional[str] = None,
1606
1736
  existing: str = "replace",
1737
+ model_name: Optional[str] = None,
1738
+ client: Optional[Any] = None, # Add client parameter
1607
1739
  ) -> ElementCollection[Region]:
1608
1740
  """
1609
1741
  Analyze the page layout using the configured LayoutManager.
@@ -1629,6 +1761,8 @@ class Page(ClassificationMixin, ExtractionMixin):
1629
1761
  exclude_classes=exclude_classes,
1630
1762
  device=device,
1631
1763
  existing=existing,
1764
+ model_name=model_name,
1765
+ client=client, # Pass client down
1632
1766
  )
1633
1767
 
1634
1768
  # Retrieve the detected regions from the element manager
@@ -1699,14 +1833,24 @@ class Page(ClassificationMixin, ExtractionMixin):
1699
1833
  )
1700
1834
  return None
1701
1835
 
1836
+ def split(self, divider, **kwargs) -> "ElementCollection[Region]":
1837
+ """
1838
+ Divides the page into sections based on the provided divider elements.
1839
+ """
1840
+ sections = self.get_sections(start_elements=divider, **kwargs)
1841
+ top = self.region(0, 0, self.width, sections[0].top)
1842
+ sections.append(top)
1843
+
1844
+ return sections
1845
+
1702
1846
  def get_sections(
1703
1847
  self,
1704
1848
  start_elements=None,
1705
1849
  end_elements=None,
1706
- boundary_inclusion="both",
1850
+ boundary_inclusion="start",
1707
1851
  y_threshold=5.0,
1708
1852
  bounding_box=None,
1709
- ) -> "ElementCollection[Region]": # Updated type hint
1853
+ ) -> "ElementCollection[Region]":
1710
1854
  """
1711
1855
  Get sections of a page defined by start/end elements.
1712
1856
  Uses the page-level implementation.
@@ -2068,7 +2212,7 @@ class Page(ClassificationMixin, ExtractionMixin):
2068
2212
  self,
2069
2213
  correction_callback: Callable[[Any], Optional[str]],
2070
2214
  max_workers: Optional[int] = None,
2071
- progress_callback: Optional[Callable[[], None]] = None, # Added progress callback
2215
+ progress_callback: Optional[Callable[[], None]] = None, # Added progress callback
2072
2216
  ) -> "Page": # Return self for chaining
2073
2217
  """
2074
2218
  Applies corrections to OCR-generated text elements on this page
@@ -2096,7 +2240,7 @@ class Page(ClassificationMixin, ExtractionMixin):
2096
2240
  target_elements_collection = self.find_all(
2097
2241
  selector="text[source=ocr]", apply_exclusions=False
2098
2242
  )
2099
- target_elements = target_elements_collection.elements # Get the list
2243
+ target_elements = target_elements_collection.elements # Get the list
2100
2244
 
2101
2245
  if not target_elements:
2102
2246
  logger.info(f"Page {self.number}: No OCR elements found to correct.")
@@ -2109,22 +2253,24 @@ class Page(ClassificationMixin, ExtractionMixin):
2109
2253
  # Define the task to be run by the worker thread or sequentially
2110
2254
  def _process_element_task(element):
2111
2255
  try:
2112
- current_text = getattr(element, 'text', None)
2256
+ current_text = getattr(element, "text", None)
2113
2257
  # Call the user-provided callback
2114
2258
  corrected_text = correction_callback(element)
2115
2259
 
2116
2260
  # Validate result type
2117
2261
  if corrected_text is not None and not isinstance(corrected_text, str):
2118
- logger.warning(f"Page {self.number}: Correction callback for element '{getattr(element, 'text', '')[:20]}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update.")
2119
- return element, None, None # Treat as no correction
2262
+ logger.warning(
2263
+ f"Page {self.number}: Correction callback for element '{getattr(element, 'text', '')[:20]}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update."
2264
+ )
2265
+ return element, None, None # Treat as no correction
2120
2266
 
2121
2267
  return element, corrected_text, None # Return element, result, no error
2122
2268
  except Exception as e:
2123
2269
  logger.error(
2124
2270
  f"Page {self.number}: Error applying correction callback to element '{getattr(element, 'text', '')[:30]}...' ({element.bbox}): {e}",
2125
- exc_info=False # Keep log concise
2271
+ exc_info=False, # Keep log concise
2126
2272
  )
2127
- return element, None, e # Return element, no result, error
2273
+ return element, None, e # Return element, no result, error
2128
2274
  finally:
2129
2275
  # --- Call progress callback here --- #
2130
2276
  if progress_callback:
@@ -2132,16 +2278,24 @@ class Page(ClassificationMixin, ExtractionMixin):
2132
2278
  progress_callback()
2133
2279
  except Exception as cb_e:
2134
2280
  # Log error in callback itself, but don't stop processing
2135
- logger.error(f"Page {self.number}: Error executing progress_callback: {cb_e}", exc_info=False)
2281
+ logger.error(
2282
+ f"Page {self.number}: Error executing progress_callback: {cb_e}",
2283
+ exc_info=False,
2284
+ )
2136
2285
 
2137
2286
  # Choose execution strategy based on max_workers
2138
2287
  if max_workers is not None and max_workers > 1:
2139
2288
  # --- Parallel execution --- #
2140
- logger.info(f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers.")
2289
+ logger.info(
2290
+ f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers."
2291
+ )
2141
2292
  futures = []
2142
2293
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
2143
2294
  # Submit all tasks
2144
- future_to_element = {executor.submit(_process_element_task, element): element for element in target_elements}
2295
+ future_to_element = {
2296
+ executor.submit(_process_element_task, element): element
2297
+ for element in target_elements
2298
+ }
2145
2299
 
2146
2300
  # Process results as they complete (progress_callback called by worker)
2147
2301
  for future in concurrent.futures.as_completed(future_to_element):
@@ -2153,14 +2307,17 @@ class Page(ClassificationMixin, ExtractionMixin):
2153
2307
  # Error already logged in worker
2154
2308
  elif corrected_text is not None:
2155
2309
  # Apply correction if text changed
2156
- current_text = getattr(element, 'text', None)
2310
+ current_text = getattr(element, "text", None)
2157
2311
  if corrected_text != current_text:
2158
2312
  element.text = corrected_text
2159
2313
  updated_count += 1
2160
2314
  except Exception as exc:
2161
2315
  # Catch errors from future.result() itself
2162
- element = future_to_element[future] # Find original element
2163
- logger.error(f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}", exc_info=True)
2316
+ element = future_to_element[future] # Find original element
2317
+ logger.error(
2318
+ f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}",
2319
+ exc_info=True,
2320
+ )
2164
2321
  error_count += 1
2165
2322
  # Note: progress_callback was already called in the worker's finally block
2166
2323
 
@@ -2168,65 +2325,230 @@ class Page(ClassificationMixin, ExtractionMixin):
2168
2325
  # --- Sequential execution --- #
2169
2326
  logger.info(f"Page {self.number}: Running OCR correction sequentially.")
2170
2327
  for element in target_elements:
2171
- # Call the task function directly (it handles progress_callback)
2172
- processed_count += 1
2173
- _element, corrected_text, error = _process_element_task(element)
2174
- if error:
2175
- error_count += 1
2176
- elif corrected_text is not None:
2177
- # Apply correction if text changed
2178
- current_text = getattr(_element, 'text', None)
2179
- if corrected_text != current_text:
2180
- _element.text = corrected_text
2181
- updated_count += 1
2328
+ # Call the task function directly (it handles progress_callback)
2329
+ processed_count += 1
2330
+ _element, corrected_text, error = _process_element_task(element)
2331
+ if error:
2332
+ error_count += 1
2333
+ elif corrected_text is not None:
2334
+ # Apply correction if text changed
2335
+ current_text = getattr(_element, "text", None)
2336
+ if corrected_text != current_text:
2337
+ _element.text = corrected_text
2338
+ updated_count += 1
2182
2339
 
2183
2340
  logger.info(
2184
- f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
2341
+ f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
2185
2342
  )
2186
2343
 
2187
- return self # Return self for chaining
2344
+ return self # Return self for chaining
2188
2345
 
2189
2346
  # --- Classification Mixin Implementation --- #
2190
2347
  def _get_classification_manager(self) -> "ClassificationManager":
2191
- if not hasattr(self, 'pdf') or not hasattr(self.pdf, 'get_manager'):
2192
- raise AttributeError("ClassificationManager cannot be accessed: Parent PDF or get_manager method missing.")
2348
+ if not hasattr(self, "pdf") or not hasattr(self.pdf, "get_manager"):
2349
+ raise AttributeError(
2350
+ "ClassificationManager cannot be accessed: Parent PDF or get_manager method missing."
2351
+ )
2193
2352
  try:
2194
- # Use the PDF's manager registry accessor
2195
- return self.pdf.get_manager('classification')
2353
+ # Use the PDF's manager registry accessor
2354
+ return self.pdf.get_manager("classification")
2196
2355
  except (ValueError, RuntimeError, AttributeError) as e:
2197
2356
  # Wrap potential errors from get_manager for clarity
2198
2357
  raise AttributeError(f"Failed to get ClassificationManager from PDF: {e}") from e
2199
2358
 
2200
- def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, "Image"]: # Use "Image" for lazy import
2201
- if model_type == 'text':
2202
- text_content = self.extract_text(layout=False, use_exclusions=False) # Simple join, ignore exclusions for classification
2359
+ def _get_classification_content(
2360
+ self, model_type: str, **kwargs
2361
+ ) -> Union[str, "Image"]: # Use "Image" for lazy import
2362
+ if model_type == "text":
2363
+ text_content = self.extract_text(
2364
+ layout=False, use_exclusions=False
2365
+ ) # Simple join, ignore exclusions for classification
2203
2366
  if not text_content or text_content.isspace():
2204
2367
  raise ValueError("Cannot classify page with 'text' model: No text content found.")
2205
2368
  return text_content
2206
- elif model_type == 'vision':
2369
+ elif model_type == "vision":
2207
2370
  # Get resolution from manager/kwargs if possible, else default
2208
2371
  manager = self._get_classification_manager()
2209
2372
  default_resolution = 150
2210
2373
  # Access kwargs passed to classify method if needed
2211
- resolution = kwargs.get('resolution', default_resolution) if 'kwargs' in locals() else default_resolution
2374
+ resolution = (
2375
+ kwargs.get("resolution", default_resolution)
2376
+ if "kwargs" in locals()
2377
+ else default_resolution
2378
+ )
2212
2379
 
2213
2380
  # Use to_image, ensuring no highlights interfere
2214
2381
  img = self.to_image(
2215
2382
  resolution=resolution,
2216
2383
  include_highlights=False,
2217
2384
  labels=False,
2218
- exclusions=None # Don't mask exclusions for classification input image
2385
+ exclusions=None, # Don't mask exclusions for classification input image
2219
2386
  )
2220
2387
  if img is None:
2221
- raise ValueError("Cannot classify page with 'vision' model: Failed to render image.")
2388
+ raise ValueError(
2389
+ "Cannot classify page with 'vision' model: Failed to render image."
2390
+ )
2222
2391
  return img
2223
2392
  else:
2224
2393
  raise ValueError(f"Unsupported model_type for classification: {model_type}")
2225
2394
 
2226
2395
  def _get_metadata_storage(self) -> Dict[str, Any]:
2227
2396
  # Ensure metadata exists
2228
- if not hasattr(self, 'metadata') or self.metadata is None:
2397
+ if not hasattr(self, "metadata") or self.metadata is None:
2229
2398
  self.metadata = {}
2230
2399
  return self.metadata
2231
2400
 
2232
2401
  # --- Content Extraction ---
2402
+
2403
+ # --- Skew Detection and Correction --- #
2404
+
2405
+ @property
2406
+ def skew_angle(self) -> Optional[float]:
2407
+ """Get the detected skew angle for this page (if calculated)."""
2408
+ return self._skew_angle
2409
+
2410
+ def detect_skew_angle(
2411
+ self,
2412
+ resolution: int = 72,
2413
+ grayscale: bool = True,
2414
+ force_recalculate: bool = False,
2415
+ **deskew_kwargs,
2416
+ ) -> Optional[float]:
2417
+ """
2418
+ Detects the skew angle of the page image and stores it.
2419
+
2420
+ Args:
2421
+ resolution: DPI resolution for rendering the page image for detection.
2422
+ grayscale: Whether to convert the image to grayscale before detection.
2423
+ force_recalculate: If True, recalculate even if an angle exists.
2424
+ **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
2425
+ (e.g., `max_angle`, `num_peaks`).
2426
+
2427
+ Returns:
2428
+ The detected skew angle in degrees, or None if detection failed.
2429
+
2430
+ Raises:
2431
+ ImportError: If the 'deskew' library is not installed.
2432
+ """
2433
+ if not DESKEW_AVAILABLE:
2434
+ raise ImportError(
2435
+ "Deskew library not found. Install with: pip install natural-pdf[deskew]"
2436
+ )
2437
+
2438
+ if self._skew_angle is not None and not force_recalculate:
2439
+ logger.debug(f"Page {self.number}: Returning cached skew angle: {self._skew_angle:.2f}")
2440
+ return self._skew_angle
2441
+
2442
+ logger.debug(f"Page {self.number}: Detecting skew angle (resolution={resolution} DPI)...")
2443
+ try:
2444
+ # Render the page at the specified detection resolution
2445
+ img = self.to_image(resolution=resolution, include_highlights=False)
2446
+ if not img:
2447
+ logger.warning(f"Page {self.number}: Failed to render image for skew detection.")
2448
+ self._skew_angle = None
2449
+ return None
2450
+
2451
+ # Convert to numpy array
2452
+ img_np = np.array(img)
2453
+
2454
+ # Convert to grayscale if needed
2455
+ if grayscale:
2456
+ if len(img_np.shape) == 3 and img_np.shape[2] >= 3:
2457
+ gray_np = np.mean(img_np[:, :, :3], axis=2).astype(np.uint8)
2458
+ elif len(img_np.shape) == 2:
2459
+ gray_np = img_np # Already grayscale
2460
+ else:
2461
+ logger.warning(
2462
+ f"Page {self.number}: Unexpected image shape {img_np.shape} for grayscale conversion."
2463
+ )
2464
+ gray_np = img_np # Try using it anyway
2465
+ else:
2466
+ gray_np = img_np # Use original if grayscale=False
2467
+
2468
+ # Determine skew angle using the deskew library
2469
+ angle = determine_skew(gray_np, **deskew_kwargs)
2470
+ self._skew_angle = angle
2471
+ logger.debug(f"Page {self.number}: Detected skew angle = {angle}")
2472
+ return angle
2473
+
2474
+ except Exception as e:
2475
+ logger.warning(f"Page {self.number}: Failed during skew detection: {e}", exc_info=True)
2476
+ self._skew_angle = None
2477
+ return None
2478
+
2479
+ def deskew(
2480
+ self,
2481
+ resolution: int = 300,
2482
+ angle: Optional[float] = None,
2483
+ detection_resolution: int = 72,
2484
+ **deskew_kwargs,
2485
+ ) -> Optional[Image.Image]:
2486
+ """
2487
+ Creates and returns a deskewed PIL image of the page.
2488
+
2489
+ If `angle` is not provided, it will first try to detect the skew angle
2490
+ using `detect_skew_angle` (or use the cached angle if available).
2491
+
2492
+ Args:
2493
+ resolution: DPI resolution for the output deskewed image.
2494
+ angle: The specific angle (in degrees) to rotate by. If None, detects automatically.
2495
+ detection_resolution: DPI resolution used for detection if `angle` is None.
2496
+ **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
2497
+ if automatic detection is performed.
2498
+
2499
+ Returns:
2500
+ A deskewed PIL.Image.Image object, or None if rendering/rotation fails.
2501
+
2502
+ Raises:
2503
+ ImportError: If the 'deskew' library is not installed.
2504
+ """
2505
+ if not DESKEW_AVAILABLE:
2506
+ raise ImportError(
2507
+ "Deskew library not found. Install with: pip install natural-pdf[deskew]"
2508
+ )
2509
+
2510
+ # Determine the angle to use
2511
+ rotation_angle = angle
2512
+ if rotation_angle is None:
2513
+ # Detect angle (or use cached) if not explicitly provided
2514
+ rotation_angle = self.detect_skew_angle(
2515
+ resolution=detection_resolution, **deskew_kwargs
2516
+ )
2517
+
2518
+ logger.debug(
2519
+ f"Page {self.number}: Preparing to deskew (output resolution={resolution} DPI). Using angle: {rotation_angle}"
2520
+ )
2521
+
2522
+ try:
2523
+ # Render the original page at the desired output resolution
2524
+ img = self.to_image(resolution=resolution, include_highlights=False)
2525
+ if not img:
2526
+ logger.error(f"Page {self.number}: Failed to render image for deskewing.")
2527
+ return None
2528
+
2529
+ # Rotate if a significant angle was found/provided
2530
+ if rotation_angle is not None and abs(rotation_angle) > 0.05:
2531
+ logger.debug(f"Page {self.number}: Rotating by {rotation_angle:.2f} degrees.")
2532
+ # Determine fill color based on image mode
2533
+ fill = (255, 255, 255) if img.mode == "RGB" else 255 # White background
2534
+ # Rotate the image using PIL
2535
+ rotated_img = img.rotate(
2536
+ rotation_angle, # deskew provides angle, PIL rotates counter-clockwise
2537
+ resample=Image.Resampling.BILINEAR,
2538
+ expand=True, # Expand image to fit rotated content
2539
+ fillcolor=fill,
2540
+ )
2541
+ return rotated_img
2542
+ else:
2543
+ logger.debug(
2544
+ f"Page {self.number}: No significant rotation needed (angle={rotation_angle}). Returning original render."
2545
+ )
2546
+ return img # Return the original rendered image if no rotation needed
2547
+
2548
+ except Exception as e:
2549
+ logger.error(
2550
+ f"Page {self.number}: Error during deskewing image generation: {e}", exc_info=True
2551
+ )
2552
+ return None
2553
+
2554
+ # --- End Skew Detection and Correction --- #