natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. natural_pdf/__init__.py +11 -6
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +252 -399
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +231 -89
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +405 -280
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +25 -0
  33. natural_pdf/flows/flow.py +1658 -19
  34. natural_pdf/flows/region.py +757 -263
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +35 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +101 -0
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
  50. optimization/memory_comparison.py +1 -1
  51. optimization/pdf_analyzer.py +2 -2
  52. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
  53. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
  54. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
  55. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py CHANGED
@@ -16,6 +16,7 @@ from typing import (
16
16
  Dict,
17
17
  Iterable,
18
18
  List,
19
+ Literal,
19
20
  Optional,
20
21
  Tuple,
21
22
  Type,
@@ -31,6 +32,7 @@ from natural_pdf.classification.manager import ClassificationError
31
32
  from natural_pdf.classification.mixin import ClassificationMixin
32
33
  from natural_pdf.classification.results import ClassificationResult
33
34
  from natural_pdf.core.highlighting_service import HighlightingService
35
+ from natural_pdf.core.render_spec import RenderSpec, Visualizable
34
36
  from natural_pdf.elements.base import Element
35
37
  from natural_pdf.elements.region import Region
36
38
  from natural_pdf.export.mixin import ExportMixin
@@ -38,8 +40,12 @@ from natural_pdf.extraction.manager import StructuredDataManager
38
40
  from natural_pdf.extraction.mixin import ExtractionMixin
39
41
  from natural_pdf.ocr import OCRManager, OCROptions
40
42
  from natural_pdf.selectors.parser import parse_selector
43
+ from natural_pdf.text_mixin import TextMixin
41
44
  from natural_pdf.utils.locks import pdf_render_lock
42
45
 
46
+ if TYPE_CHECKING:
47
+ from natural_pdf.elements.element_collection import ElementCollection
48
+
43
49
  try:
44
50
  from typing import Any as TypingAny
45
51
 
@@ -103,7 +109,6 @@ except ImportError:
103
109
  from collections.abc import Sequence
104
110
 
105
111
 
106
-
107
112
  class _LazyPageList(Sequence):
108
113
  """A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
109
114
 
@@ -141,18 +146,18 @@ class _LazyPageList(Sequence):
141
146
  """
142
147
 
143
148
  def __init__(
144
- self,
145
- parent_pdf: "PDF",
146
- plumber_pdf: "pdfplumber.PDF",
147
- font_attrs=None,
149
+ self,
150
+ parent_pdf: "PDF",
151
+ plumber_pdf: "pdfplumber.PDF",
152
+ font_attrs=None,
148
153
  load_text=True,
149
- indices: Optional[List[int]] = None
154
+ indices: Optional[List[int]] = None,
150
155
  ):
151
156
  self._parent_pdf = parent_pdf
152
157
  self._plumber_pdf = plumber_pdf
153
158
  self._font_attrs = font_attrs
154
159
  self._load_text = load_text
155
-
160
+
156
161
  # If indices is provided, this is a sliced view
157
162
  if indices is not None:
158
163
  self._indices = indices
@@ -180,23 +185,23 @@ class _LazyPageList(Sequence):
180
185
  font_attrs=self._font_attrs,
181
186
  load_text=self._load_text,
182
187
  )
183
-
188
+
184
189
  # Apply any stored exclusions to the newly created page
185
- if hasattr(self._parent_pdf, '_exclusions'):
190
+ if hasattr(self._parent_pdf, "_exclusions"):
186
191
  for exclusion_data in self._parent_pdf._exclusions:
187
192
  exclusion_func, label = exclusion_data
188
193
  try:
189
194
  cached.add_exclusion(exclusion_func, label=label)
190
195
  except Exception as e:
191
196
  logger.warning(f"Failed to apply exclusion to page {cached.number}: {e}")
192
-
197
+
193
198
  # Apply any stored regions to the newly created page
194
- if hasattr(self._parent_pdf, '_regions'):
199
+ if hasattr(self._parent_pdf, "_regions"):
195
200
  for region_data in self._parent_pdf._regions:
196
201
  region_func, name = region_data
197
202
  try:
198
203
  region_instance = region_func(cached)
199
- if region_instance and hasattr(region_instance, '__class__'):
204
+ if region_instance and hasattr(region_instance, "__class__"):
200
205
  # Check if it's a Region-like object (avoid importing Region here)
201
206
  cached.add_region(region_instance, name=name, source="named")
202
207
  elif region_instance is not None:
@@ -205,7 +210,7 @@ class _LazyPageList(Sequence):
205
210
  )
206
211
  except Exception as e:
207
212
  logger.warning(f"Failed to apply region to page {cached.number}: {e}")
208
-
213
+
209
214
  self._cache[index] = cached
210
215
  return cached
211
216
 
@@ -215,7 +220,7 @@ class _LazyPageList(Sequence):
215
220
 
216
221
  def __getitem__(self, key):
217
222
  if isinstance(key, slice):
218
- # Get the slice of our current indices
223
+ # Get the slice of our current indices
219
224
  slice_indices = range(*key.indices(len(self)))
220
225
  # Extract the actual page indices for this slice
221
226
  actual_indices = [self._indices[i] for i in slice_indices]
@@ -225,7 +230,7 @@ class _LazyPageList(Sequence):
225
230
  self._plumber_pdf,
226
231
  font_attrs=self._font_attrs,
227
232
  load_text=self._load_text,
228
- indices=actual_indices
233
+ indices=actual_indices,
229
234
  )
230
235
  elif isinstance(key, int):
231
236
  if key < 0:
@@ -247,7 +252,7 @@ class _LazyPageList(Sequence):
247
252
  # --- End Lazy Page List Helper --- #
248
253
 
249
254
 
250
- class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
255
+ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, Visualizable):
251
256
  """Enhanced PDF wrapper built on top of pdfplumber.
252
257
 
253
258
  This class provides a fluent interface for working with PDF documents,
@@ -576,7 +581,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
576
581
  print(f"Page {page.index} has {len(page.chars)} characters")
577
582
  ```
578
583
  """
579
- from natural_pdf.elements.collections import PageCollection
584
+ from natural_pdf.core.page_collection import PageCollection
580
585
 
581
586
  if not hasattr(self, "_pages"):
582
587
  raise AttributeError("PDF pages not yet initialized.")
@@ -608,7 +613,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
608
613
  raise AttributeError("PDF pages not yet initialized.")
609
614
 
610
615
  self._exclusions = []
611
-
616
+
612
617
  # Clear exclusions only from already-created (cached) pages to avoid forcing page creation
613
618
  for i in range(len(self._pages)):
614
619
  if self._pages._cache[i] is not None: # Only clear from existing pages
@@ -618,9 +623,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
618
623
  logger.warning(f"Failed to clear exclusions from existing page {i}: {e}")
619
624
  return self
620
625
 
621
- def add_exclusion(
622
- self, exclusion_func, label: str = None
623
- ) -> "PDF":
626
+ def add_exclusion(self, exclusion_func, label: str = None) -> "PDF":
624
627
  """Add an exclusion function to the PDF.
625
628
 
626
629
  Exclusion functions define regions of each page that should be ignored during
@@ -669,12 +672,12 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
669
672
  # Support selector strings and ElementCollection objects directly.
670
673
  # Store exclusion and apply only to already-created pages.
671
674
  # ------------------------------------------------------------------
672
- from natural_pdf.elements.collections import ElementCollection # local import
675
+ from natural_pdf.elements.element_collection import ElementCollection # local import
673
676
 
674
677
  if isinstance(exclusion_func, str) or isinstance(exclusion_func, ElementCollection):
675
678
  # Store for bookkeeping and lazy application
676
679
  self._exclusions.append((exclusion_func, label))
677
-
680
+
678
681
  # Apply only to already-created (cached) pages to avoid forcing page creation
679
682
  for i in range(len(self._pages)):
680
683
  if self._pages._cache[i] is not None: # Only apply to existing pages
@@ -842,11 +845,11 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
842
845
  "include_highlights": False,
843
846
  "exclusions": "mask" if apply_exclusions else None,
844
847
  }
845
- img = page.to_image(**to_image_kwargs)
848
+ # Use render() for clean image without highlights
849
+ img = page.render(resolution=final_resolution)
846
850
  if img is None:
847
851
  logger.error(f" Failed to render page {page.number} to image.")
848
852
  continue
849
- continue
850
853
  images_pil.append(img)
851
854
  page_image_map.append((page, img))
852
855
  except Exception as e:
@@ -1140,7 +1143,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1140
1143
  if page_elements:
1141
1144
  all_elements.extend(page_elements.elements)
1142
1145
 
1143
- from natural_pdf.elements.collections import ElementCollection
1146
+ from natural_pdf.elements.element_collection import ElementCollection
1144
1147
 
1145
1148
  return ElementCollection(all_elements)
1146
1149
 
@@ -1229,6 +1232,62 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1229
1232
 
1230
1233
  return all_tables
1231
1234
 
1235
+ def get_sections(
1236
+ self,
1237
+ start_elements=None,
1238
+ end_elements=None,
1239
+ new_section_on_page_break=False,
1240
+ include_boundaries="both",
1241
+ ) -> "ElementCollection":
1242
+ """
1243
+ Extract sections from the entire PDF based on start/end elements.
1244
+
1245
+ This method delegates to the PageCollection.get_sections() method,
1246
+ providing a convenient way to extract document sections across all pages.
1247
+
1248
+ Args:
1249
+ start_elements: Elements or selector string that mark the start of sections (optional)
1250
+ end_elements: Elements or selector string that mark the end of sections (optional)
1251
+ new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
1252
+ include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
1253
+
1254
+ Returns:
1255
+ ElementCollection of Region objects representing the extracted sections
1256
+
1257
+ Example:
1258
+ Extract sections between headers:
1259
+ ```python
1260
+ pdf = npdf.PDF("document.pdf")
1261
+
1262
+ # Get sections between headers
1263
+ sections = pdf.get_sections(
1264
+ start_elements='text[size>14]:bold',
1265
+ end_elements='text[size>14]:bold'
1266
+ )
1267
+
1268
+ # Get sections that break at page boundaries
1269
+ sections = pdf.get_sections(
1270
+ start_elements='text:contains("Chapter")',
1271
+ new_section_on_page_break=True
1272
+ )
1273
+ ```
1274
+
1275
+ Note:
1276
+ You can provide only start_elements, only end_elements, or both.
1277
+ - With only start_elements: sections go from each start to the next start (or end of document)
1278
+ - With only end_elements: sections go from beginning of document to each end
1279
+ - With both: sections go from each start to the corresponding end
1280
+ """
1281
+ if not hasattr(self, "_pages"):
1282
+ raise AttributeError("PDF pages not yet initialized.")
1283
+
1284
+ return self.pages.get_sections(
1285
+ start_elements=start_elements,
1286
+ end_elements=end_elements,
1287
+ new_section_on_page_break=new_section_on_page_break,
1288
+ include_boundaries=include_boundaries,
1289
+ )
1290
+
1232
1291
  def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
1233
1292
  """
1234
1293
  DEPRECATED: Use save_pdf(..., ocr=True) instead.
@@ -1363,6 +1422,36 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1363
1422
  # Re-raise exception from exporter
1364
1423
  raise e
1365
1424
 
1425
+ def _get_render_specs(
1426
+ self,
1427
+ mode: Literal["show", "render"] = "show",
1428
+ color: Optional[Union[str, Tuple[int, int, int]]] = None,
1429
+ highlights: Optional[List[Dict[str, Any]]] = None,
1430
+ crop: Union[bool, Literal["content"]] = False,
1431
+ crop_bbox: Optional[Tuple[float, float, float, float]] = None,
1432
+ **kwargs,
1433
+ ) -> List[RenderSpec]:
1434
+ """Get render specifications for this PDF.
1435
+
1436
+ For PDF objects, this delegates to the pages collection to handle
1437
+ multi-page rendering.
1438
+
1439
+ Args:
1440
+ mode: Rendering mode - 'show' includes highlights, 'render' is clean
1441
+ color: Color for highlighting pages in show mode
1442
+ highlights: Additional highlight groups to show
1443
+ crop: Whether to crop pages
1444
+ crop_bbox: Explicit crop bounds
1445
+ **kwargs: Additional parameters
1446
+
1447
+ Returns:
1448
+ List of RenderSpec objects, one per page
1449
+ """
1450
+ # Delegate to pages collection
1451
+ return self.pages._get_render_specs(
1452
+ mode=mode, color=color, highlights=highlights, crop=crop, crop_bbox=crop_bbox, **kwargs
1453
+ )
1454
+
1366
1455
  def ask(
1367
1456
  self,
1368
1457
  question: str,
@@ -1387,14 +1476,20 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1387
1476
  Dict containing: answer, confidence, found, page_num, source_elements, etc.
1388
1477
  """
1389
1478
  # Delegate to ask_batch and return the first result
1390
- results = self.ask_batch([question], mode=mode, pages=pages, min_confidence=min_confidence, model=model, **kwargs)
1391
- return results[0] if results else {
1392
- "answer": None,
1393
- "confidence": 0.0,
1394
- "found": False,
1395
- "page_num": None,
1396
- "source_elements": [],
1397
- }
1479
+ results = self.ask_batch(
1480
+ [question], mode=mode, pages=pages, min_confidence=min_confidence, model=model, **kwargs
1481
+ )
1482
+ return (
1483
+ results[0]
1484
+ if results
1485
+ else {
1486
+ "answer": None,
1487
+ "confidence": 0.0,
1488
+ "found": False,
1489
+ "page_num": None,
1490
+ "source_elements": [],
1491
+ }
1492
+ )
1398
1493
 
1399
1494
  def ask_batch(
1400
1495
  self,
@@ -1464,7 +1559,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1464
1559
  for _ in questions
1465
1560
  ]
1466
1561
 
1467
- logger.info(f"Processing {len(questions)} question(s) across {len(target_pages)} page(s) using batch QA...")
1562
+ logger.info(
1563
+ f"Processing {len(questions)} question(s) across {len(target_pages)} page(s) using batch QA..."
1564
+ )
1468
1565
 
1469
1566
  # Collect all page images and metadata for batch processing
1470
1567
  page_images = []
@@ -1474,26 +1571,26 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1474
1571
  for page in target_pages:
1475
1572
  # Get page image
1476
1573
  try:
1477
- page_image = page.to_image(resolution=150, include_highlights=False)
1574
+ # Use render() for clean image without highlights
1575
+ page_image = page.render(resolution=150)
1478
1576
  if page_image is None:
1479
1577
  logger.warning(f"Failed to render image for page {page.number}, skipping")
1480
1578
  continue
1481
-
1579
+
1482
1580
  # Get text elements for word boxes
1483
1581
  elements = page.find_all("text")
1484
1582
  if not elements:
1485
1583
  logger.warning(f"No text elements found on page {page.number}")
1486
1584
  word_boxes = []
1487
1585
  else:
1488
- word_boxes = qa_engine._get_word_boxes_from_elements(elements, offset_x=0, offset_y=0)
1489
-
1586
+ word_boxes = qa_engine._get_word_boxes_from_elements(
1587
+ elements, offset_x=0, offset_y=0
1588
+ )
1589
+
1490
1590
  page_images.append(page_image)
1491
1591
  page_word_boxes.append(word_boxes)
1492
- page_metadata.append({
1493
- "page_number": page.number,
1494
- "page_object": page
1495
- })
1496
-
1592
+ page_metadata.append({"page_number": page.number, "page_object": page})
1593
+
1497
1594
  except Exception as e:
1498
1595
  logger.warning(f"Error processing page {page.number}: {e}")
1499
1596
  continue
@@ -1513,22 +1610,24 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1513
1610
 
1514
1611
  # Process all questions against all pages in batch
1515
1612
  all_results = []
1516
-
1613
+
1517
1614
  for question_text in questions:
1518
1615
  question_results = []
1519
-
1616
+
1520
1617
  # Ask this question against each page (but in batch per page)
1521
- for i, (page_image, word_boxes, page_meta) in enumerate(zip(page_images, page_word_boxes, page_metadata)):
1618
+ for i, (page_image, word_boxes, page_meta) in enumerate(
1619
+ zip(page_images, page_word_boxes, page_metadata)
1620
+ ):
1522
1621
  try:
1523
- # Use the DocumentQA batch interface
1622
+ # Use the DocumentQA batch interface
1524
1623
  page_result = qa_engine.ask(
1525
1624
  image=page_image,
1526
1625
  question=question_text,
1527
1626
  word_boxes=word_boxes,
1528
1627
  min_confidence=min_confidence,
1529
- **kwargs
1628
+ **kwargs,
1530
1629
  )
1531
-
1630
+
1532
1631
  if page_result and page_result.found:
1533
1632
  # Add page metadata to result
1534
1633
  page_result_dict = {
@@ -1536,30 +1635,34 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1536
1635
  "confidence": page_result.confidence,
1537
1636
  "found": page_result.found,
1538
1637
  "page_num": page_meta["page_number"],
1539
- "source_elements": getattr(page_result, 'source_elements', []),
1540
- "start": getattr(page_result, 'start', -1),
1541
- "end": getattr(page_result, 'end', -1),
1638
+ "source_elements": getattr(page_result, "source_elements", []),
1639
+ "start": getattr(page_result, "start", -1),
1640
+ "end": getattr(page_result, "end", -1),
1542
1641
  }
1543
1642
  question_results.append(page_result_dict)
1544
-
1643
+
1545
1644
  except Exception as e:
1546
- logger.warning(f"Error processing question '{question_text}' on page {page_meta['page_number']}: {e}")
1645
+ logger.warning(
1646
+ f"Error processing question '{question_text}' on page {page_meta['page_number']}: {e}"
1647
+ )
1547
1648
  continue
1548
-
1649
+
1549
1650
  # Sort results by confidence and take the best one for this question
1550
1651
  question_results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
1551
-
1652
+
1552
1653
  if question_results:
1553
1654
  all_results.append(question_results[0])
1554
1655
  else:
1555
1656
  # No results found for this question
1556
- all_results.append({
1557
- "answer": None,
1558
- "confidence": 0.0,
1559
- "found": False,
1560
- "page_num": None,
1561
- "source_elements": [],
1562
- })
1657
+ all_results.append(
1658
+ {
1659
+ "answer": None,
1660
+ "confidence": 0.0,
1661
+ "found": False,
1662
+ "page_num": None,
1663
+ "source_elements": [],
1664
+ }
1665
+ )
1563
1666
 
1564
1667
  return all_results
1565
1668
 
@@ -1703,32 +1806,28 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1703
1806
  logger.error(f"Failed to export correction task: {e}")
1704
1807
  raise
1705
1808
 
1706
- def correct_ocr(
1809
+ def update_text(
1707
1810
  self,
1708
- correction_callback: Callable[[Any], Optional[str]],
1811
+ transform: Callable[[Any], Optional[str]],
1709
1812
  pages: Optional[Union[Iterable[int], range, slice]] = None,
1813
+ selector: str = "text",
1710
1814
  max_workers: Optional[int] = None,
1711
1815
  progress_callback: Optional[Callable[[], None]] = None,
1712
1816
  ) -> "PDF":
1713
1817
  """
1714
- Applies corrections to OCR text elements using a callback function.
1715
- Applies corrections to OCR text elements using a callback function.
1818
+ Applies corrections to text elements using a callback function.
1716
1819
 
1717
1820
  Args:
1718
- correction_callback: Function that takes an element and returns corrected text or None
1719
1821
  correction_callback: Function that takes an element and returns corrected text or None
1720
1822
  pages: Optional page indices/slice to limit the scope of correction
1721
- max_workers: Maximum number of threads to use for parallel execution
1722
- progress_callback: Optional callback function for progress updates
1823
+ selector: Selector to apply corrections to (default: "text")
1723
1824
  max_workers: Maximum number of threads to use for parallel execution
1724
1825
  progress_callback: Optional callback function for progress updates
1725
1826
 
1726
1827
  Returns:
1727
1828
  Self for method chaining
1728
- Self for method chaining
1729
1829
  """
1730
1830
  target_page_indices = []
1731
- target_page_indices = []
1732
1831
  if pages is None:
1733
1832
  target_page_indices = list(range(len(self._pages)))
1734
1833
  elif isinstance(pages, slice):
@@ -1741,32 +1840,31 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1741
1840
  raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
1742
1841
  except (IndexError, TypeError, ValueError) as e:
1743
1842
  raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
1744
- raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
1745
1843
  else:
1746
1844
  raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1747
- raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
1748
1845
 
1749
1846
  if not target_page_indices:
1750
- logger.warning("No pages selected for OCR correction.")
1847
+ logger.warning("No pages selected for text update.")
1751
1848
  return self
1752
1849
 
1753
- logger.info(f"Starting OCR correction for pages: {target_page_indices}")
1754
- logger.info(f"Starting OCR correction for pages: {target_page_indices}")
1850
+ logger.info(
1851
+ f"Starting text update for pages: {target_page_indices} with selector='{selector}'"
1852
+ )
1755
1853
 
1756
1854
  for page_idx in target_page_indices:
1757
1855
  page = self._pages[page_idx]
1758
1856
  try:
1759
- page.correct_ocr(
1760
- correction_callback=correction_callback,
1857
+ page.update_text(
1858
+ transform=transform,
1859
+ selector=selector,
1761
1860
  max_workers=max_workers,
1762
1861
  progress_callback=progress_callback,
1763
1862
  )
1764
1863
  except Exception as e:
1765
- logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
1766
- logger.error(f"Error during correct_ocr on page {page_idx}: {e}")
1864
+ logger.error(f"Error during text update on page {page_idx}: {e}")
1865
+ logger.error(f"Error during text update on page {page_idx}: {e}")
1767
1866
 
1768
- logger.info("OCR correction process finished.")
1769
- logger.info("OCR correction process finished.")
1867
+ logger.info("Text update process finished.")
1770
1868
  return self
1771
1869
 
1772
1870
  def __len__(self) -> int:
@@ -1781,9 +1879,10 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1781
1879
  raise AttributeError("PDF pages not initialized yet.")
1782
1880
 
1783
1881
  if isinstance(key, slice):
1784
- from natural_pdf.elements.collections import PageCollection
1882
+ from natural_pdf.core.page_collection import PageCollection
1883
+
1785
1884
  # Use the lazy page list's slicing which returns another _LazyPageList
1786
- lazy_slice = self._pages[key]
1885
+ lazy_slice = self._pages[key]
1787
1886
  # Wrap in PageCollection for compatibility
1788
1887
  return PageCollection(lazy_slice)
1789
1888
  elif isinstance(key, int):
@@ -2126,10 +2225,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
2126
2225
 
2127
2226
  try:
2128
2227
  for page in tqdm(self.pages, desc="Rendering Pages"):
2129
- img = page.to_image(
2228
+ # Use render() for clean images
2229
+ img = page.render(
2130
2230
  resolution=resolution,
2131
- include_highlights=include_highlights,
2132
- labels=labels,
2133
2231
  **kwargs,
2134
2232
  )
2135
2233
  if img:
@@ -2359,3 +2457,47 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
2359
2457
  os.unlink(path)
2360
2458
  except Exception as e:
2361
2459
  logger.warning(f"Failed to clean up temporary file '{path}': {e}")
2460
+
2461
+ def analyze_layout(self, *args, **kwargs) -> "ElementCollection[Region]":
2462
+ """
2463
+ Analyzes the layout of all pages in the PDF.
2464
+
2465
+ This is a convenience method that calls analyze_layout on the PDF's
2466
+ page collection.
2467
+
2468
+ Args:
2469
+ *args: Positional arguments passed to pages.analyze_layout().
2470
+ **kwargs: Keyword arguments passed to pages.analyze_layout().
2471
+
2472
+ Returns:
2473
+ An ElementCollection of all detected Region objects.
2474
+ """
2475
+ return self.pages.analyze_layout(*args, **kwargs)
2476
+
2477
+ def highlights(self, show: bool = False) -> "HighlightContext":
2478
+ """
2479
+ Create a highlight context for accumulating highlights.
2480
+
2481
+ This allows for clean syntax to show multiple highlight groups:
2482
+
2483
+ Example:
2484
+ with pdf.highlights() as h:
2485
+ h.add(pdf.find_all('table'), label='tables', color='blue')
2486
+ h.add(pdf.find_all('text:bold'), label='bold text', color='red')
2487
+ h.show()
2488
+
2489
+ Or with automatic display:
2490
+ with pdf.highlights(show=True) as h:
2491
+ h.add(pdf.find_all('table'), label='tables')
2492
+ h.add(pdf.find_all('text:bold'), label='bold')
2493
+ # Automatically shows when exiting the context
2494
+
2495
+ Args:
2496
+ show: If True, automatically show highlights when exiting context
2497
+
2498
+ Returns:
2499
+ HighlightContext for accumulating highlights
2500
+ """
2501
+ from natural_pdf.core.highlighting_service import HighlightContext
2502
+
2503
+ return HighlightContext(self, show_on_exit=show)