natural-pdf 0.1.40__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. natural_pdf/__init__.py +6 -7
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +236 -383
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +172 -83
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +318 -243
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +4 -4
  33. natural_pdf/flows/flow.py +1200 -243
  34. natural_pdf/flows/region.py +707 -261
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +2 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +7 -3
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -53
  50. optimization/memory_comparison.py +1 -1
  51. optimization/pdf_analyzer.py +2 -2
  52. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
  53. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
  54. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
  55. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py CHANGED
@@ -16,6 +16,7 @@ from typing import (
16
16
  Dict,
17
17
  Iterable,
18
18
  List,
19
+ Literal,
19
20
  Optional,
20
21
  Tuple,
21
22
  Type,
@@ -31,6 +32,7 @@ from natural_pdf.classification.manager import ClassificationError
31
32
  from natural_pdf.classification.mixin import ClassificationMixin
32
33
  from natural_pdf.classification.results import ClassificationResult
33
34
  from natural_pdf.core.highlighting_service import HighlightingService
35
+ from natural_pdf.core.render_spec import RenderSpec, Visualizable
34
36
  from natural_pdf.elements.base import Element
35
37
  from natural_pdf.elements.region import Region
36
38
  from natural_pdf.export.mixin import ExportMixin
@@ -38,11 +40,11 @@ from natural_pdf.extraction.manager import StructuredDataManager
38
40
  from natural_pdf.extraction.mixin import ExtractionMixin
39
41
  from natural_pdf.ocr import OCRManager, OCROptions
40
42
  from natural_pdf.selectors.parser import parse_selector
41
- from natural_pdf.utils.locks import pdf_render_lock
42
43
  from natural_pdf.text_mixin import TextMixin
44
+ from natural_pdf.utils.locks import pdf_render_lock
43
45
 
44
46
  if TYPE_CHECKING:
45
- from natural_pdf.elements.collections import ElementCollection
47
+ from natural_pdf.elements.element_collection import ElementCollection
46
48
 
47
49
  try:
48
50
  from typing import Any as TypingAny
@@ -107,7 +109,6 @@ except ImportError:
107
109
  from collections.abc import Sequence
108
110
 
109
111
 
110
-
111
112
  class _LazyPageList(Sequence):
112
113
  """A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
113
114
 
@@ -145,18 +146,18 @@ class _LazyPageList(Sequence):
145
146
  """
146
147
 
147
148
  def __init__(
148
- self,
149
- parent_pdf: "PDF",
150
- plumber_pdf: "pdfplumber.PDF",
151
- font_attrs=None,
149
+ self,
150
+ parent_pdf: "PDF",
151
+ plumber_pdf: "pdfplumber.PDF",
152
+ font_attrs=None,
152
153
  load_text=True,
153
- indices: Optional[List[int]] = None
154
+ indices: Optional[List[int]] = None,
154
155
  ):
155
156
  self._parent_pdf = parent_pdf
156
157
  self._plumber_pdf = plumber_pdf
157
158
  self._font_attrs = font_attrs
158
159
  self._load_text = load_text
159
-
160
+
160
161
  # If indices is provided, this is a sliced view
161
162
  if indices is not None:
162
163
  self._indices = indices
@@ -184,23 +185,23 @@ class _LazyPageList(Sequence):
184
185
  font_attrs=self._font_attrs,
185
186
  load_text=self._load_text,
186
187
  )
187
-
188
+
188
189
  # Apply any stored exclusions to the newly created page
189
- if hasattr(self._parent_pdf, '_exclusions'):
190
+ if hasattr(self._parent_pdf, "_exclusions"):
190
191
  for exclusion_data in self._parent_pdf._exclusions:
191
192
  exclusion_func, label = exclusion_data
192
193
  try:
193
194
  cached.add_exclusion(exclusion_func, label=label)
194
195
  except Exception as e:
195
196
  logger.warning(f"Failed to apply exclusion to page {cached.number}: {e}")
196
-
197
+
197
198
  # Apply any stored regions to the newly created page
198
- if hasattr(self._parent_pdf, '_regions'):
199
+ if hasattr(self._parent_pdf, "_regions"):
199
200
  for region_data in self._parent_pdf._regions:
200
201
  region_func, name = region_data
201
202
  try:
202
203
  region_instance = region_func(cached)
203
- if region_instance and hasattr(region_instance, '__class__'):
204
+ if region_instance and hasattr(region_instance, "__class__"):
204
205
  # Check if it's a Region-like object (avoid importing Region here)
205
206
  cached.add_region(region_instance, name=name, source="named")
206
207
  elif region_instance is not None:
@@ -209,7 +210,7 @@ class _LazyPageList(Sequence):
209
210
  )
210
211
  except Exception as e:
211
212
  logger.warning(f"Failed to apply region to page {cached.number}: {e}")
212
-
213
+
213
214
  self._cache[index] = cached
214
215
  return cached
215
216
 
@@ -219,7 +220,7 @@ class _LazyPageList(Sequence):
219
220
 
220
221
  def __getitem__(self, key):
221
222
  if isinstance(key, slice):
222
- # Get the slice of our current indices
223
+ # Get the slice of our current indices
223
224
  slice_indices = range(*key.indices(len(self)))
224
225
  # Extract the actual page indices for this slice
225
226
  actual_indices = [self._indices[i] for i in slice_indices]
@@ -229,7 +230,7 @@ class _LazyPageList(Sequence):
229
230
  self._plumber_pdf,
230
231
  font_attrs=self._font_attrs,
231
232
  load_text=self._load_text,
232
- indices=actual_indices
233
+ indices=actual_indices,
233
234
  )
234
235
  elif isinstance(key, int):
235
236
  if key < 0:
@@ -251,7 +252,7 @@ class _LazyPageList(Sequence):
251
252
  # --- End Lazy Page List Helper --- #
252
253
 
253
254
 
254
- class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
255
+ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, Visualizable):
255
256
  """Enhanced PDF wrapper built on top of pdfplumber.
256
257
 
257
258
  This class provides a fluent interface for working with PDF documents,
@@ -580,7 +581,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
580
581
  print(f"Page {page.index} has {len(page.chars)} characters")
581
582
  ```
582
583
  """
583
- from natural_pdf.elements.collections import PageCollection
584
+ from natural_pdf.core.page_collection import PageCollection
584
585
 
585
586
  if not hasattr(self, "_pages"):
586
587
  raise AttributeError("PDF pages not yet initialized.")
@@ -612,7 +613,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
612
613
  raise AttributeError("PDF pages not yet initialized.")
613
614
 
614
615
  self._exclusions = []
615
-
616
+
616
617
  # Clear exclusions only from already-created (cached) pages to avoid forcing page creation
617
618
  for i in range(len(self._pages)):
618
619
  if self._pages._cache[i] is not None: # Only clear from existing pages
@@ -622,9 +623,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
622
623
  logger.warning(f"Failed to clear exclusions from existing page {i}: {e}")
623
624
  return self
624
625
 
625
- def add_exclusion(
626
- self, exclusion_func, label: str = None
627
- ) -> "PDF":
626
+ def add_exclusion(self, exclusion_func, label: str = None) -> "PDF":
628
627
  """Add an exclusion function to the PDF.
629
628
 
630
629
  Exclusion functions define regions of each page that should be ignored during
@@ -673,12 +672,12 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
673
672
  # Support selector strings and ElementCollection objects directly.
674
673
  # Store exclusion and apply only to already-created pages.
675
674
  # ------------------------------------------------------------------
676
- from natural_pdf.elements.collections import ElementCollection # local import
675
+ from natural_pdf.elements.element_collection import ElementCollection # local import
677
676
 
678
677
  if isinstance(exclusion_func, str) or isinstance(exclusion_func, ElementCollection):
679
678
  # Store for bookkeeping and lazy application
680
679
  self._exclusions.append((exclusion_func, label))
681
-
680
+
682
681
  # Apply only to already-created (cached) pages to avoid forcing page creation
683
682
  for i in range(len(self._pages)):
684
683
  if self._pages._cache[i] is not None: # Only apply to existing pages
@@ -846,11 +845,11 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
846
845
  "include_highlights": False,
847
846
  "exclusions": "mask" if apply_exclusions else None,
848
847
  }
849
- img = page.to_image(**to_image_kwargs)
848
+ # Use render() for clean image without highlights
849
+ img = page.render(resolution=final_resolution)
850
850
  if img is None:
851
851
  logger.error(f" Failed to render page {page.number} to image.")
852
852
  continue
853
- continue
854
853
  images_pil.append(img)
855
854
  page_image_map.append((page, img))
856
855
  except Exception as e:
@@ -1144,7 +1143,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
1144
1143
  if page_elements:
1145
1144
  all_elements.extend(page_elements.elements)
1146
1145
 
1147
- from natural_pdf.elements.collections import ElementCollection
1146
+ from natural_pdf.elements.element_collection import ElementCollection
1148
1147
 
1149
1148
  return ElementCollection(all_elements)
1150
1149
 
@@ -1238,7 +1237,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
1238
1237
  start_elements=None,
1239
1238
  end_elements=None,
1240
1239
  new_section_on_page_break=False,
1241
- boundary_inclusion="both",
1240
+ include_boundaries="both",
1242
1241
  ) -> "ElementCollection":
1243
1242
  """
1244
1243
  Extract sections from the entire PDF based on start/end elements.
@@ -1250,7 +1249,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
1250
1249
  start_elements: Elements or selector string that mark the start of sections (optional)
1251
1250
  end_elements: Elements or selector string that mark the end of sections (optional)
1252
1251
  new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
1253
- boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
1252
+ include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
1254
1253
 
1255
1254
  Returns:
1256
1255
  ElementCollection of Region objects representing the extracted sections
@@ -1259,13 +1258,13 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
1259
1258
  Extract sections between headers:
1260
1259
  ```python
1261
1260
  pdf = npdf.PDF("document.pdf")
1262
-
1261
+
1263
1262
  # Get sections between headers
1264
1263
  sections = pdf.get_sections(
1265
1264
  start_elements='text[size>14]:bold',
1266
1265
  end_elements='text[size>14]:bold'
1267
1266
  )
1268
-
1267
+
1269
1268
  # Get sections that break at page boundaries
1270
1269
  sections = pdf.get_sections(
1271
1270
  start_elements='text:contains("Chapter")',
@@ -1286,7 +1285,7 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
1286
1285
  start_elements=start_elements,
1287
1286
  end_elements=end_elements,
1288
1287
  new_section_on_page_break=new_section_on_page_break,
1289
- boundary_inclusion=boundary_inclusion,
1288
+ include_boundaries=include_boundaries,
1290
1289
  )
1291
1290
 
1292
1291
  def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
@@ -1423,6 +1422,36 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
1423
1422
  # Re-raise exception from exporter
1424
1423
  raise e
1425
1424
 
1425
+ def _get_render_specs(
1426
+ self,
1427
+ mode: Literal["show", "render"] = "show",
1428
+ color: Optional[Union[str, Tuple[int, int, int]]] = None,
1429
+ highlights: Optional[List[Dict[str, Any]]] = None,
1430
+ crop: Union[bool, Literal["content"]] = False,
1431
+ crop_bbox: Optional[Tuple[float, float, float, float]] = None,
1432
+ **kwargs,
1433
+ ) -> List[RenderSpec]:
1434
+ """Get render specifications for this PDF.
1435
+
1436
+ For PDF objects, this delegates to the pages collection to handle
1437
+ multi-page rendering.
1438
+
1439
+ Args:
1440
+ mode: Rendering mode - 'show' includes highlights, 'render' is clean
1441
+ color: Color for highlighting pages in show mode
1442
+ highlights: Additional highlight groups to show
1443
+ crop: Whether to crop pages
1444
+ crop_bbox: Explicit crop bounds
1445
+ **kwargs: Additional parameters
1446
+
1447
+ Returns:
1448
+ List of RenderSpec objects, one per page
1449
+ """
1450
+ # Delegate to pages collection
1451
+ return self.pages._get_render_specs(
1452
+ mode=mode, color=color, highlights=highlights, crop=crop, crop_bbox=crop_bbox, **kwargs
1453
+ )
1454
+
1426
1455
  def ask(
1427
1456
  self,
1428
1457
  question: str,
@@ -1447,14 +1476,20 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
1447
1476
  Dict containing: answer, confidence, found, page_num, source_elements, etc.
1448
1477
  """
1449
1478
  # Delegate to ask_batch and return the first result
1450
- results = self.ask_batch([question], mode=mode, pages=pages, min_confidence=min_confidence, model=model, **kwargs)
1451
- return results[0] if results else {
1452
- "answer": None,
1453
- "confidence": 0.0,
1454
- "found": False,
1455
- "page_num": None,
1456
- "source_elements": [],
1457
- }
1479
+ results = self.ask_batch(
1480
+ [question], mode=mode, pages=pages, min_confidence=min_confidence, model=model, **kwargs
1481
+ )
1482
+ return (
1483
+ results[0]
1484
+ if results
1485
+ else {
1486
+ "answer": None,
1487
+ "confidence": 0.0,
1488
+ "found": False,
1489
+ "page_num": None,
1490
+ "source_elements": [],
1491
+ }
1492
+ )
1458
1493
 
1459
1494
  def ask_batch(
1460
1495
  self,
@@ -1524,7 +1559,9 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
1524
1559
  for _ in questions
1525
1560
  ]
1526
1561
 
1527
- logger.info(f"Processing {len(questions)} question(s) across {len(target_pages)} page(s) using batch QA...")
1562
+ logger.info(
1563
+ f"Processing {len(questions)} question(s) across {len(target_pages)} page(s) using batch QA..."
1564
+ )
1528
1565
 
1529
1566
  # Collect all page images and metadata for batch processing
1530
1567
  page_images = []
@@ -1534,26 +1571,26 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
1534
1571
  for page in target_pages:
1535
1572
  # Get page image
1536
1573
  try:
1537
- page_image = page.to_image(resolution=150, include_highlights=False)
1574
+ # Use render() for clean image without highlights
1575
+ page_image = page.render(resolution=150)
1538
1576
  if page_image is None:
1539
1577
  logger.warning(f"Failed to render image for page {page.number}, skipping")
1540
1578
  continue
1541
-
1579
+
1542
1580
  # Get text elements for word boxes
1543
1581
  elements = page.find_all("text")
1544
1582
  if not elements:
1545
1583
  logger.warning(f"No text elements found on page {page.number}")
1546
1584
  word_boxes = []
1547
1585
  else:
1548
- word_boxes = qa_engine._get_word_boxes_from_elements(elements, offset_x=0, offset_y=0)
1549
-
1586
+ word_boxes = qa_engine._get_word_boxes_from_elements(
1587
+ elements, offset_x=0, offset_y=0
1588
+ )
1589
+
1550
1590
  page_images.append(page_image)
1551
1591
  page_word_boxes.append(word_boxes)
1552
- page_metadata.append({
1553
- "page_number": page.number,
1554
- "page_object": page
1555
- })
1556
-
1592
+ page_metadata.append({"page_number": page.number, "page_object": page})
1593
+
1557
1594
  except Exception as e:
1558
1595
  logger.warning(f"Error processing page {page.number}: {e}")
1559
1596
  continue
@@ -1573,22 +1610,24 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
1573
1610
 
1574
1611
  # Process all questions against all pages in batch
1575
1612
  all_results = []
1576
-
1613
+
1577
1614
  for question_text in questions:
1578
1615
  question_results = []
1579
-
1616
+
1580
1617
  # Ask this question against each page (but in batch per page)
1581
- for i, (page_image, word_boxes, page_meta) in enumerate(zip(page_images, page_word_boxes, page_metadata)):
1618
+ for i, (page_image, word_boxes, page_meta) in enumerate(
1619
+ zip(page_images, page_word_boxes, page_metadata)
1620
+ ):
1582
1621
  try:
1583
- # Use the DocumentQA batch interface
1622
+ # Use the DocumentQA batch interface
1584
1623
  page_result = qa_engine.ask(
1585
1624
  image=page_image,
1586
1625
  question=question_text,
1587
1626
  word_boxes=word_boxes,
1588
1627
  min_confidence=min_confidence,
1589
- **kwargs
1628
+ **kwargs,
1590
1629
  )
1591
-
1630
+
1592
1631
  if page_result and page_result.found:
1593
1632
  # Add page metadata to result
1594
1633
  page_result_dict = {
@@ -1596,30 +1635,34 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
1596
1635
  "confidence": page_result.confidence,
1597
1636
  "found": page_result.found,
1598
1637
  "page_num": page_meta["page_number"],
1599
- "source_elements": getattr(page_result, 'source_elements', []),
1600
- "start": getattr(page_result, 'start', -1),
1601
- "end": getattr(page_result, 'end', -1),
1638
+ "source_elements": getattr(page_result, "source_elements", []),
1639
+ "start": getattr(page_result, "start", -1),
1640
+ "end": getattr(page_result, "end", -1),
1602
1641
  }
1603
1642
  question_results.append(page_result_dict)
1604
-
1643
+
1605
1644
  except Exception as e:
1606
- logger.warning(f"Error processing question '{question_text}' on page {page_meta['page_number']}: {e}")
1645
+ logger.warning(
1646
+ f"Error processing question '{question_text}' on page {page_meta['page_number']}: {e}"
1647
+ )
1607
1648
  continue
1608
-
1649
+
1609
1650
  # Sort results by confidence and take the best one for this question
1610
1651
  question_results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
1611
-
1652
+
1612
1653
  if question_results:
1613
1654
  all_results.append(question_results[0])
1614
1655
  else:
1615
1656
  # No results found for this question
1616
- all_results.append({
1617
- "answer": None,
1618
- "confidence": 0.0,
1619
- "found": False,
1620
- "page_num": None,
1621
- "source_elements": [],
1622
- })
1657
+ all_results.append(
1658
+ {
1659
+ "answer": None,
1660
+ "confidence": 0.0,
1661
+ "found": False,
1662
+ "page_num": None,
1663
+ "source_elements": [],
1664
+ }
1665
+ )
1623
1666
 
1624
1667
  return all_results
1625
1668
 
@@ -1804,17 +1847,19 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
1804
1847
  logger.warning("No pages selected for text update.")
1805
1848
  return self
1806
1849
 
1807
- logger.info(f"Starting text update for pages: {target_page_indices} with selector='{selector}'")
1850
+ logger.info(
1851
+ f"Starting text update for pages: {target_page_indices} with selector='{selector}'"
1852
+ )
1808
1853
 
1809
1854
  for page_idx in target_page_indices:
1810
1855
  page = self._pages[page_idx]
1811
1856
  try:
1812
- page.update_text(
1813
- transform=transform,
1814
- selector=selector,
1815
- max_workers=max_workers,
1816
- progress_callback=progress_callback,
1817
- )
1857
+ page.update_text(
1858
+ transform=transform,
1859
+ selector=selector,
1860
+ max_workers=max_workers,
1861
+ progress_callback=progress_callback,
1862
+ )
1818
1863
  except Exception as e:
1819
1864
  logger.error(f"Error during text update on page {page_idx}: {e}")
1820
1865
  logger.error(f"Error during text update on page {page_idx}: {e}")
@@ -1834,9 +1879,10 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
1834
1879
  raise AttributeError("PDF pages not initialized yet.")
1835
1880
 
1836
1881
  if isinstance(key, slice):
1837
- from natural_pdf.elements.collections import PageCollection
1882
+ from natural_pdf.core.page_collection import PageCollection
1883
+
1838
1884
  # Use the lazy page list's slicing which returns another _LazyPageList
1839
- lazy_slice = self._pages[key]
1885
+ lazy_slice = self._pages[key]
1840
1886
  # Wrap in PageCollection for compatibility
1841
1887
  return PageCollection(lazy_slice)
1842
1888
  elif isinstance(key, int):
@@ -2179,10 +2225,9 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
2179
2225
 
2180
2226
  try:
2181
2227
  for page in tqdm(self.pages, desc="Rendering Pages"):
2182
- img = page.to_image(
2228
+ # Use render() for clean images
2229
+ img = page.render(
2183
2230
  resolution=resolution,
2184
- include_highlights=include_highlights,
2185
- labels=labels,
2186
2231
  **kwargs,
2187
2232
  )
2188
2233
  if img:
@@ -2412,3 +2457,47 @@ class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
2412
2457
  os.unlink(path)
2413
2458
  except Exception as e:
2414
2459
  logger.warning(f"Failed to clean up temporary file '{path}': {e}")
2460
+
2461
+ def analyze_layout(self, *args, **kwargs) -> "ElementCollection[Region]":
2462
+ """
2463
+ Analyzes the layout of all pages in the PDF.
2464
+
2465
+ This is a convenience method that calls analyze_layout on the PDF's
2466
+ page collection.
2467
+
2468
+ Args:
2469
+ *args: Positional arguments passed to pages.analyze_layout().
2470
+ **kwargs: Keyword arguments passed to pages.analyze_layout().
2471
+
2472
+ Returns:
2473
+ An ElementCollection of all detected Region objects.
2474
+ """
2475
+ return self.pages.analyze_layout(*args, **kwargs)
2476
+
2477
+ def highlights(self, show: bool = False) -> "HighlightContext":
2478
+ """
2479
+ Create a highlight context for accumulating highlights.
2480
+
2481
+ This allows for clean syntax to show multiple highlight groups:
2482
+
2483
+ Example:
2484
+ with pdf.highlights() as h:
2485
+ h.add(pdf.find_all('table'), label='tables', color='blue')
2486
+ h.add(pdf.find_all('text:bold'), label='bold text', color='red')
2487
+ h.show()
2488
+
2489
+ Or with automatic display:
2490
+ with pdf.highlights(show=True) as h:
2491
+ h.add(pdf.find_all('table'), label='tables')
2492
+ h.add(pdf.find_all('text:bold'), label='bold')
2493
+ # Automatically shows when exiting the context
2494
+
2495
+ Args:
2496
+ show: If True, automatically show highlights when exiting context
2497
+
2498
+ Returns:
2499
+ HighlightContext for accumulating highlights
2500
+ """
2501
+ from natural_pdf.core.highlighting_service import HighlightContext
2502
+
2503
+ return HighlightContext(self, show_on_exit=show)
natural_pdf/{collections → core}/pdf_collection.py CHANGED
@@ -588,24 +588,25 @@ class PDFCollection(
588
588
  # Get classification manager from first PDF
589
589
  try:
590
590
  first_pdf = self._pdfs[0]
591
- if not hasattr(first_pdf, 'get_manager'):
591
+ if not hasattr(first_pdf, "get_manager"):
592
592
  raise RuntimeError("PDFs do not support classification manager")
593
- manager = first_pdf.get_manager('classification')
593
+ manager = first_pdf.get_manager("classification")
594
594
  if not manager or not manager.is_available():
595
595
  raise RuntimeError("ClassificationManager is not available")
596
596
  except Exception as e:
597
597
  from natural_pdf.classification.manager import ClassificationError
598
+
598
599
  raise ClassificationError(f"Cannot access ClassificationManager: {e}") from e
599
600
 
600
601
  # Determine processing mode early
601
602
  inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
602
-
603
+
603
604
  # Gather content from all PDFs
604
605
  pdf_contents = []
605
606
  valid_pdfs = []
606
-
607
+
607
608
  logger.info(f"Gathering content from {len(self._pdfs)} PDFs for batch classification...")
608
-
609
+
609
610
  for pdf in self._pdfs:
610
611
  try:
611
612
  # Get the content for classification - use the same logic as individual PDF classify
@@ -618,16 +619,18 @@ class PDFCollection(
618
619
  elif inferred_using == "vision":
619
620
  # For vision, we need single-page PDFs only
620
621
  if len(pdf.pages) != 1:
621
- logger.warning(f"Skipping PDF {pdf.path}: Vision classification requires single-page PDFs")
622
+ logger.warning(
623
+ f"Skipping PDF {pdf.path}: Vision classification requires single-page PDFs"
624
+ )
622
625
  continue
623
626
  # Get first page image
624
- content = pdf.pages[0].to_image()
627
+ content = pdf.pages[0].render()
625
628
  else:
626
629
  raise ValueError(f"Unsupported using mode: {inferred_using}")
627
-
630
+
628
631
  pdf_contents.append(content)
629
632
  valid_pdfs.append(pdf)
630
-
633
+
631
634
  except Exception as e:
632
635
  logger.warning(f"Skipping PDF {pdf.path}: Error getting content - {e}")
633
636
  continue
@@ -636,7 +639,9 @@ class PDFCollection(
636
639
  logger.warning("No valid content could be gathered from PDFs for classification.")
637
640
  return self
638
641
 
639
- logger.info(f"Gathered content from {len(valid_pdfs)} PDFs. Running batch classification...")
642
+ logger.info(
643
+ f"Gathered content from {len(valid_pdfs)} PDFs. Running batch classification..."
644
+ )
640
645
 
641
646
  # Run batch classification
642
647
  try:
@@ -651,6 +656,7 @@ class PDFCollection(
651
656
  except Exception as e:
652
657
  logger.error(f"Batch classification failed: {e}")
653
658
  from natural_pdf.classification.manager import ClassificationError
659
+
654
660
  raise ClassificationError(f"Batch classification failed: {e}") from e
655
661
 
656
662
  # Assign results back to PDFs
@@ -660,10 +666,11 @@ class PDFCollection(
660
666
  f"with PDFs processed ({len(valid_pdfs)}). Cannot assign results."
661
667
  )
662
668
  from natural_pdf.classification.manager import ClassificationError
669
+
663
670
  raise ClassificationError("Batch result count mismatch with input PDFs")
664
671
 
665
672
  logger.info(f"Assigning {len(batch_results)} results to PDFs under key '{analysis_key}'.")
666
-
673
+
667
674
  processed_count = 0
668
675
  for pdf, result_obj in zip(valid_pdfs, batch_results):
669
676
  try: