natural-pdf 0.1.37__py3-none-any.whl → 0.1.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/core/page.py CHANGED
@@ -1655,7 +1655,27 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1655
1655
  table_settings.setdefault("join_x_tolerance", join)
1656
1656
  table_settings.setdefault("join_y_tolerance", join)
1657
1657
 
1658
- return self._page.extract_tables(table_settings)
1658
+ raw_tables = self._page.extract_tables(table_settings)
1659
+
1660
+ # Apply RTL text processing to all extracted tables
1661
+ if raw_tables:
1662
+ processed_tables = []
1663
+ for table in raw_tables:
1664
+ processed_table = []
1665
+ for row in table:
1666
+ processed_row = []
1667
+ for cell in row:
1668
+ if cell is not None:
1669
+ # Apply RTL text processing to each cell
1670
+ rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
1671
+ processed_row.append(rtl_processed_cell)
1672
+ else:
1673
+ processed_row.append(cell)
1674
+ processed_table.append(processed_row)
1675
+ processed_tables.append(processed_table)
1676
+ return processed_tables
1677
+
1678
+ return raw_tables
1659
1679
  else:
1660
1680
  raise ValueError(
1661
1681
  f"Unknown tables extraction method: '{method}'. Choose from 'pdfplumber', 'stream', 'lattice'."
@@ -3280,6 +3300,54 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
3280
3300
  )
3281
3301
  return self
3282
3302
 
3303
def _apply_rtl_processing_to_text(self, text: str) -> str:
    """
    Apply RTL (Right-to-Left) text processing to a string.

    Text that contains at least one character whose Unicode bidirectional
    class is "R", "AL" or "AN" is passed, line by line, through
    ``bidi.algorithm.get_display`` and then ``mirror_brackets`` to convert
    visual order text (as stored in PDFs) to logical order.

    Args:
        text: Input text string in visual order.

    Returns:
        Text string in logical order. The input is returned unchanged when
        it is not a string, is empty or whitespace-only, contains no RTL
        characters, or when the BiDi helpers are unavailable or fail.
    """
    # Table cells are expected to be strings; anything else has nothing to
    # reorder, so hand it back instead of failing on ``.strip()``.
    if not isinstance(text, str) or not text.strip():
        return text

    import unicodedata

    rtl_classes = ("R", "AL", "AN")

    def _contains_rtl(s):
        return any(unicodedata.bidirectional(ch) in rtl_classes for ch in s)

    # Quick check for RTL characters - if none found, return as-is
    if not _contains_rtl(text):
        return text

    try:
        from bidi.algorithm import get_display  # type: ignore
        from natural_pdf.utils.bidi_mirror import mirror_brackets
    except ImportError:
        # Optional dependency missing: best-effort fallback to the raw text.
        return text

    try:
        # Process line by line so each line gets its own base direction.
        processed_lines = []
        for line in text.split("\n"):
            if line.strip():
                base_dir = "R" if _contains_rtl(line) else "L"
                logical_line = get_display(line, base_dir=base_dir)
                # Apply bracket mirroring after reordering
                processed_lines.append(mirror_brackets(logical_line))
            else:
                # Preserve blank / whitespace-only lines verbatim
                processed_lines.append(line)
        return "\n".join(processed_lines)
    except Exception:
        # Still best-effort, but leave a trace instead of hiding the failure.
        import logging

        logging.getLogger(__name__).debug(
            "RTL processing failed; returning original text", exc_info=True
        )
        return text
3350
+
3283
3351
  @property
3284
3352
  def lines(self) -> List[Any]:
3285
3353
  """Get all line elements on this page."""
natural_pdf/core/pdf.py CHANGED
@@ -103,6 +103,7 @@ except ImportError:
103
103
  from collections.abc import Sequence
104
104
 
105
105
 
106
+
106
107
  class _LazyPageList(Sequence):
107
108
  """A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
108
109
 
@@ -121,6 +122,7 @@ class _LazyPageList(Sequence):
121
122
  _font_attrs: Font attributes to use when creating pages.
122
123
  _cache: List of cached Page objects (None until accessed).
123
124
  _load_text: Whether to load text layer when creating pages.
125
+ _indices: Optional range of indices this list represents (for slices).
124
126
 
125
127
  Example:
126
128
  ```python
@@ -130,7 +132,7 @@ class _LazyPageList(Sequence):
130
132
  last_page = pdf.pages[-1] # Creates another Page object
131
133
 
132
134
  # Slicing works too
133
- first_three = pdf.pages[0:3] # Creates 3 Page objects
135
+ first_three = pdf.pages[0:3] # Returns another lazy list
134
136
 
135
137
  # Iteration creates all pages
136
138
  for page in pdf.pages: # Each page created as needed
@@ -139,30 +141,71 @@ class _LazyPageList(Sequence):
139
141
  """
140
142
 
141
143
def __init__(
    self,
    parent_pdf: "PDF",
    plumber_pdf: "pdfplumber.PDF",
    font_attrs=None,
    load_text=True,
    indices: Optional[List[int]] = None,
):
    """Set up a lazy page list over ``plumber_pdf``.

    Args:
        parent_pdf: Object stored as the parent of this list.
        plumber_pdf: Object whose ``pages`` sequence backs this list.
        font_attrs: Font attributes stored for later page creation.
        load_text: Flag stored for later page creation.
        indices: Page indices this list represents. When omitted, the list
            covers every entry of ``plumber_pdf.pages``.
    """
    self._parent_pdf = parent_pdf
    self._plumber_pdf = plumber_pdf
    self._font_attrs = font_attrs
    self._load_text = load_text

    # No explicit indices means "the full document": one index per page.
    if indices is None:
        indices = list(range(len(plumber_pdf.pages)))

    # One cache slot per represented page, all empty until first access.
    self._indices = indices
    self._cache = [None] * len(indices)
150
164
 
151
165
  # Internal helper -----------------------------------------------------
152
166
def _create_page(self, index: int) -> "Page":
    """Create and cache a page at the given index within this list.

    On a cache miss the page is built from the matching pdfplumber page,
    then every exclusion and region function stored on the parent PDF is
    applied to it before it is cached.

    Args:
        index: Position within this list (not the page index in the PDF).

    Returns:
        The cached Page for ``index``.
    """
    cached = self._cache[index]
    if cached is not None:
        return cached

    # Import here to avoid circular import problems
    from natural_pdf.core.page import Page
    from natural_pdf.elements.region import Region

    # Translate the list position into the page index in the full PDF
    actual_page_index = self._indices[index]
    plumber_page = self._plumber_pdf.pages[actual_page_index]
    page = Page(
        plumber_page,
        parent=self._parent_pdf,
        index=actual_page_index,
        font_attrs=self._font_attrs,
        load_text=self._load_text,
    )

    # Apply any stored exclusions to the newly created page
    for exclusion_func, label in getattr(self._parent_pdf, "_exclusions", ()):
        try:
            page.add_exclusion(exclusion_func, label=label)
        except Exception as e:
            logger.warning(f"Failed to apply exclusion to page {page.number}: {e}")

    # Apply any stored regions to the newly created page
    for region_func, name in getattr(self._parent_pdf, "_regions", ()):
        try:
            region_instance = region_func(page)
            # Every object has ``__class__``, so check the actual type to
            # keep non-Region return values out of ``add_region``.
            if region_instance and isinstance(region_instance, Region):
                page.add_region(region_instance, name=name, source="named")
            elif region_instance is not None:
                logger.warning(
                    f"Region function did not return a valid Region for page {page.number}"
                )
        except Exception as e:
            logger.warning(f"Failed to apply region to page {page.number}: {e}")

    self._cache[index] = page
    return page
168
211
 
@@ -172,9 +215,18 @@ class _LazyPageList(Sequence):
172
215
 
173
216
  def __getitem__(self, key):
174
217
  if isinstance(key, slice):
175
- # Materialise pages for slice lazily as well
176
- indices = range(*key.indices(len(self)))
177
- return [self._create_page(i) for i in indices]
218
+ # Get the slice of our current indices
219
+ slice_indices = range(*key.indices(len(self)))
220
+ # Extract the actual page indices for this slice
221
+ actual_indices = [self._indices[i] for i in slice_indices]
222
+ # Return a new lazy list for the slice
223
+ return _LazyPageList(
224
+ self._parent_pdf,
225
+ self._plumber_pdf,
226
+ font_attrs=self._font_attrs,
227
+ load_text=self._load_text,
228
+ indices=actual_indices
229
+ )
178
230
  elif isinstance(key, int):
179
231
  if key < 0:
180
232
  key += len(self)
@@ -556,8 +608,14 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
556
608
  raise AttributeError("PDF pages not yet initialized.")
557
609
 
558
610
  self._exclusions = []
559
- for page in self._pages:
560
- page.clear_exclusions()
611
+
612
+ # Clear exclusions only from already-created (cached) pages to avoid forcing page creation
613
+ for i in range(len(self._pages)):
614
+ if self._pages._cache[i] is not None: # Only clear from existing pages
615
+ try:
616
+ self._pages._cache[i].clear_exclusions()
617
+ except Exception as e:
618
+ logger.warning(f"Failed to clear exclusions from existing page {i}: {e}")
561
619
  return self
562
620
 
563
621
  def add_exclusion(
@@ -608,25 +666,35 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
608
666
  raise AttributeError("PDF pages not yet initialized.")
609
667
 
610
668
  # ------------------------------------------------------------------
611
- # NEW: Support selector strings and ElementCollection objects directly.
612
- # We simply forward the same object to each page's add_exclusion which
613
- # now knows how to interpret these inputs.
669
+ # Support selector strings and ElementCollection objects directly.
670
+ # Store exclusion and apply only to already-created pages.
614
671
  # ------------------------------------------------------------------
615
672
  from natural_pdf.elements.collections import ElementCollection # local import
616
673
 
617
674
  if isinstance(exclusion_func, str) or isinstance(exclusion_func, ElementCollection):
618
- # Store for bookkeeping
675
+ # Store for bookkeeping and lazy application
619
676
  self._exclusions.append((exclusion_func, label))
620
- for page in self._pages:
621
- page.add_exclusion(exclusion_func, label=label)
677
+
678
+ # Apply only to already-created (cached) pages to avoid forcing page creation
679
+ for i in range(len(self._pages)):
680
+ if self._pages._cache[i] is not None: # Only apply to existing pages
681
+ try:
682
+ self._pages._cache[i].add_exclusion(exclusion_func, label=label)
683
+ except Exception as e:
684
+ logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
622
685
  return self
623
686
 
624
687
  # Fallback to original callable / Region behaviour ------------------
625
688
  exclusion_data = (exclusion_func, label)
626
689
  self._exclusions.append(exclusion_data)
627
690
 
628
- for page in self._pages:
629
- page.add_exclusion(exclusion_func, label=label)
691
+ # Apply only to already-created (cached) pages to avoid forcing page creation
692
+ for i in range(len(self._pages)):
693
+ if self._pages._cache[i] is not None: # Only apply to existing pages
694
+ try:
695
+ self._pages._cache[i].add_exclusion(exclusion_func, label=label)
696
+ except Exception as e:
697
+ logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
630
698
 
631
699
  return self
632
700
 
@@ -868,7 +936,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
868
936
  Add a region function to the PDF.
869
937
 
870
938
  Args:
871
- region_func: A function that takes a Page and returns a Region, or None
872
939
  region_func: A function that takes a Page and returns a Region, or None
873
940
  name: Optional name for the region
874
941
 
@@ -881,17 +948,20 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
881
948
  region_data = (region_func, name)
882
949
  self._regions.append(region_data)
883
950
 
884
- for page in self._pages:
885
- try:
886
- region_instance = region_func(page)
887
- if region_instance and isinstance(region_instance, Region):
888
- page.add_region(region_instance, name=name, source="named")
889
- elif region_instance is not None:
890
- logger.warning(
891
- f"Region function did not return a valid Region for page {page.number}"
892
- )
893
- except Exception as e:
894
- logger.error(f"Error adding region for page {page.number}: {e}")
951
+ # Apply only to already-created (cached) pages to avoid forcing page creation
952
+ for i in range(len(self._pages)):
953
+ if self._pages._cache[i] is not None: # Only apply to existing pages
954
+ page = self._pages._cache[i]
955
+ try:
956
+ region_instance = region_func(page)
957
+ if region_instance and isinstance(region_instance, Region):
958
+ page.add_region(region_instance, name=name, source="named")
959
+ elif region_instance is not None:
960
+ logger.warning(
961
+ f"Region function did not return a valid Region for page {page.number}"
962
+ )
963
+ except Exception as e:
964
+ logger.error(f"Error adding region for page {page.number}: {e}")
895
965
 
896
966
  return self
897
967
 
@@ -1712,10 +1782,11 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1712
1782
 
1713
1783
  if isinstance(key, slice):
1714
1784
  from natural_pdf.elements.collections import PageCollection
1715
-
1716
- return PageCollection(self._pages[key])
1717
-
1718
- if isinstance(key, int):
1785
+ # Use the lazy page list's slicing which returns another _LazyPageList
1786
+ lazy_slice = self._pages[key]
1787
+ # Wrap in PageCollection for compatibility
1788
+ return PageCollection(lazy_slice)
1789
+ elif isinstance(key, int):
1719
1790
  if 0 <= key < len(self._pages):
1720
1791
  return self._pages[key]
1721
1792
  else:
@@ -1,6 +1,6 @@
1
1
  import hashlib
2
2
  import logging
3
- from collections.abc import MutableSequence
3
+ from collections.abc import MutableSequence, Sequence
4
4
  from pathlib import Path
5
5
  from typing import (
6
6
  TYPE_CHECKING,
@@ -2051,14 +2051,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2051
2051
  Provides methods for batch operations on these pages.
2052
2052
  """
2053
2053
 
2054
def __init__(self, pages: Union[List[P], Sequence[P]]):
    """
    Initialize a page collection.

    Args:
        pages: List or sequence of Page objects (can be lazy)
    """
    # Anything that is both iterable and sized is kept as-is so a lazy
    # sequence stays lazy; other iterables are materialised into a list.
    is_sized_iterable = hasattr(pages, "__iter__") and hasattr(pages, "__len__")
    self.pages = pages if is_sized_iterable else list(pages)
2062
2068
 
2063
2069
  def __len__(self) -> int:
2064
2070
  """Return the number of pages in the collection."""
@@ -2078,6 +2084,31 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2078
2084
  """Return a string representation showing the page count."""
2079
2085
  return f"<PageCollection(count={len(self)})>"
2080
2086
 
2087
def _get_items_for_apply(self) -> Iterator[P]:
    """
    Yield the pages of this collection one at a time.

    Overrides ApplyMixin's ``_get_items_for_apply``. The result is a plain
    iterator over ``self.pages``, so pages are requested from the
    underlying sequence only as the iterator is consumed.
    """
    page_sequence = self.pages
    return iter(page_sequence)
2095
+
2096
def _get_page_indices(self) -> List[int]:
    """
    Collect the page indices of this collection.

    Returns:
        A new list of indices. When ``self.pages`` exposes an ``_indices``
        attribute (as a lazy page list does) that attribute is copied
        directly; otherwise each page's ``index`` is read, which iterates
        over the pages.
    """
    pages = self.pages
    try:
        stored_indices = pages._indices
    except AttributeError:
        # No precomputed indices: fall back to asking every page.
        return [page.index for page in self.pages]
    return list(stored_indices)
2111
+
2081
2112
  def extract_text(
2082
2113
  self,
2083
2114
  keep_blank_chars: bool = True,
@@ -2172,7 +2203,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2172
2203
  raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
2173
2204
 
2174
2205
  # Get the 0-based indices of the pages in this collection
2175
- page_indices = [p.index for p in self.pages]
2206
+ page_indices = self._get_page_indices()
2176
2207
 
2177
2208
  logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
2178
2209
 
@@ -2374,7 +2405,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2374
2405
  "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
2375
2406
  )
2376
2407
 
2377
- page_indices = [p.index for p in self.pages]
2408
+ page_indices = self._get_page_indices()
2378
2409
  logger.info(
2379
2410
  f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
2380
2411
  )
@@ -2800,7 +2831,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2800
2831
  )
2801
2832
 
2802
2833
  # Get the 0-based indices of the pages in this collection
2803
- page_indices = [p.index for p in self.pages]
2834
+ page_indices = self._get_page_indices()
2804
2835
  logger.info(
2805
2836
  f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
2806
2837
  )
@@ -1616,8 +1616,26 @@ class Region(
1616
1616
  # Extract all tables from the cropped area
1617
1617
  tables = cropped.extract_tables(table_settings)
1618
1618
 
1619
- # Return the tables or an empty list if none found
1620
- return tables if tables else []
1619
+ # Apply RTL text processing to all tables
1620
+ if tables:
1621
+ processed_tables = []
1622
+ for table in tables:
1623
+ processed_table = []
1624
+ for row in table:
1625
+ processed_row = []
1626
+ for cell in row:
1627
+ if cell is not None:
1628
+ # Apply RTL text processing to each cell
1629
+ rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
1630
+ processed_row.append(rtl_processed_cell)
1631
+ else:
1632
+ processed_row.append(cell)
1633
+ processed_table.append(processed_row)
1634
+ processed_tables.append(processed_table)
1635
+ return processed_tables
1636
+
1637
+ # Return empty list if no tables found
1638
+ return []
1621
1639
 
1622
1640
  def _extract_table_plumber(self, table_settings: dict, content_filter=None) -> List[List[str]]:
1623
1641
  """
@@ -1662,21 +1680,25 @@ class Region(
1662
1680
 
1663
1681
  # Return the table or an empty list if none found
1664
1682
  if table:
1665
- # Apply content filtering if provided
1666
- if content_filter is not None:
1667
- filtered_table = []
1668
- for row in table:
1669
- filtered_row = []
1670
- for cell in row:
1671
- if cell is not None:
1672
- # Apply content filter to cell text
1673
- filtered_cell = self._apply_content_filter_to_text(cell, content_filter)
1674
- filtered_row.append(filtered_cell)
1683
+ # Apply RTL text processing and content filtering if provided
1684
+ processed_table = []
1685
+ for row in table:
1686
+ processed_row = []
1687
+ for cell in row:
1688
+ if cell is not None:
1689
+ # Apply RTL text processing first
1690
+ rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
1691
+
1692
+ # Then apply content filter if provided
1693
+ if content_filter is not None:
1694
+ filtered_cell = self._apply_content_filter_to_text(rtl_processed_cell, content_filter)
1695
+ processed_row.append(filtered_cell)
1675
1696
  else:
1676
- filtered_row.append(cell)
1677
- filtered_table.append(filtered_row)
1678
- return filtered_table
1679
- return table
1697
+ processed_row.append(rtl_processed_cell)
1698
+ else:
1699
+ processed_row.append(cell)
1700
+ processed_table.append(processed_row)
1701
+ return processed_table
1680
1702
  return []
1681
1703
 
1682
1704
  def _extract_table_tatr(self, use_ocr=False, ocr_config=None, content_filter=None) -> List[List[str]]:
@@ -3490,6 +3512,54 @@ class Region(
3490
3512
 
3491
3513
  return table_grid
3492
3514
 
3515
def _apply_rtl_processing_to_text(self, text: str) -> str:
    """
    Apply RTL (Right-to-Left) text processing to a string.

    Text that contains at least one character whose Unicode bidirectional
    class is "R", "AL" or "AN" is passed, line by line, through
    ``bidi.algorithm.get_display`` and then ``mirror_brackets`` to convert
    visual order text (as stored in PDFs) to logical order.

    Args:
        text: Input text string in visual order.

    Returns:
        Text string in logical order. The input is returned unchanged when
        it is not a string, is empty or whitespace-only, contains no RTL
        characters, or when the BiDi helpers are unavailable or fail.
    """
    # Table cells are expected to be strings; anything else has nothing to
    # reorder, so hand it back instead of failing on ``.strip()``.
    if not isinstance(text, str) or not text.strip():
        return text

    import unicodedata

    rtl_classes = ("R", "AL", "AN")

    def _contains_rtl(s):
        return any(unicodedata.bidirectional(ch) in rtl_classes for ch in s)

    # Quick check for RTL characters - if none found, return as-is
    if not _contains_rtl(text):
        return text

    try:
        from bidi.algorithm import get_display  # type: ignore
        from natural_pdf.utils.bidi_mirror import mirror_brackets
    except ImportError:
        # Optional dependency missing: best-effort fallback to the raw text.
        return text

    try:
        # Process line by line so each line gets its own base direction.
        processed_lines = []
        for line in text.split("\n"):
            if line.strip():
                base_dir = "R" if _contains_rtl(line) else "L"
                logical_line = get_display(line, base_dir=base_dir)
                # Apply bracket mirroring after reordering
                processed_lines.append(mirror_brackets(logical_line))
            else:
                # Preserve blank / whitespace-only lines verbatim
                processed_lines.append(line)
        return "\n".join(processed_lines)
    except Exception:
        # Still best-effort, but leave a trace instead of hiding the failure.
        import logging

        logging.getLogger(__name__).debug(
            "RTL processing failed; returning original text", exc_info=True
        )
        return text
3562
+
3493
3563
  def _apply_content_filter_to_text(self, text: str, content_filter) -> str:
3494
3564
  """
3495
3565
  Apply content filter to a text string.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.37
3
+ Version: 0.1.38
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -27,8 +27,8 @@ natural_pdf/collections/pdf_collection.py,sha256=sDVEbFMNME_2OaHIsCoR_W7V1cAATNw
27
27
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
28
28
  natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
29
29
  natural_pdf/core/highlighting_service.py,sha256=2tBrrEq6d6hz5f6Yf7z5TysJdlTyuHTURBnQxokJnDM,40645
30
- natural_pdf/core/page.py,sha256=MwIENkMjEKStC6RlD3SBrqmyZt_MKzrIY7vLBFIvrwY,142529
31
- natural_pdf/core/pdf.py,sha256=2hK3yRVRxEQMVy1v4w6P26VGoDpCu_3FNkYgN-LO4hA,93221
30
+ natural_pdf/core/page.py,sha256=iWokHLuSrQ71kxB_tTWkCp_O-i72urR4iGFUIzKoH8k,145351
31
+ natural_pdf/core/pdf.py,sha256=5M1gB9psqwJCgE0w7PQ_G1XVa_XCmyNNmluZO7pIyZ4,97112
32
32
  natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
33
33
  natural_pdf/describe/base.py,sha256=CLhZXYQO6SOPUVWLt6VwZ7MK48t_6wgPMyFMLtTCKRc,18166
34
34
  natural_pdf/describe/elements.py,sha256=JicXC9SJmmasqxalpCXA47-kVwv-6JnR3Xiu778aNHM,12634
@@ -36,11 +36,11 @@ natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ
36
36
  natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
37
37
  natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
38
38
  natural_pdf/elements/base.py,sha256=-ZAcc8lb2aSWTKcprwKTvnR6hsDGDm7T8a1Y9V38E_A,52042
39
- natural_pdf/elements/collections.py,sha256=_B03lJA1n147alE4xvn6qQ9uZWI8kb8VGxpchghqxqg,131834
39
+ natural_pdf/elements/collections.py,sha256=qtHEaLPxZ6i3zPQsbSOw_KMAr9oDMWR1516ilSMSDeY,133189
40
40
  natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
41
41
  natural_pdf/elements/line.py,sha256=mHSeV-ZABY-Cc_K_NpFL53OGtTWlexYDlMvZc8_Vrx8,3845
42
42
  natural_pdf/elements/rect.py,sha256=QuQg0Qo7XYQKBac-3Ss0n0ELV6icdPcrygWM2VWzeX8,3325
43
- natural_pdf/elements/region.py,sha256=ewY9HmV_VN6tN_VKtHj7dtk6nh7hrot-pW5Soz5iMg0,148150
43
+ natural_pdf/elements/region.py,sha256=s3iFTq6QNiEgSAEV9ywt-3oQW5_swTvB6FNMgANpvmA,151055
44
44
  natural_pdf/elements/text.py,sha256=giPJQaXuOBCviQ7QKVx_ZMrKFVpgQAsaCS2-kn-8mp0,20530
45
45
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
46
46
  natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
@@ -98,7 +98,7 @@ natural_pdf/utils/text_extraction.py,sha256=HYWlYGPfafwzsuMyfL5oQhvcD4NobbvC_aCp
98
98
  natural_pdf/utils/visualization.py,sha256=olDkWtuVzP0NxRg0CP0DL-eXNCY7Bs-SH-2Xn-cjbo0,9370
99
99
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
100
100
  natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
101
- natural_pdf-0.1.37.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
101
+ natural_pdf-0.1.38.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
102
102
  optimization/memory_comparison.py,sha256=F90D_5WhliSGAct_lyx93xd4q4F-jeo8QpGyDr8tmNw,6543
103
103
  optimization/pdf_analyzer.py,sha256=xf6h-FNlqCpsm8NriXcs_bQZOB8eQkxgGGKVRL_jgCM,19347
104
104
  optimization/performance_analysis.py,sha256=RjAqeE3YS1r_7qTWkY6Ng5YMbb6MXJXfXX6LoVjg_xQ,13035
@@ -115,8 +115,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
115
115
  tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
116
116
  tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
117
117
  tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
118
- natural_pdf-0.1.37.dist-info/METADATA,sha256=1POawL7Edgjod2Qt1TO-2DhUkVesip-OnB0KkQCgGQ0,6739
119
- natural_pdf-0.1.37.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
120
- natural_pdf-0.1.37.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
121
- natural_pdf-0.1.37.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
122
- natural_pdf-0.1.37.dist-info/RECORD,,
118
+ natural_pdf-0.1.38.dist-info/METADATA,sha256=7a2BfP1oBRbUDUm_9t-3jCsw9BGjIiGyoFwGQyDvcVo,6739
119
+ natural_pdf-0.1.38.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
120
+ natural_pdf-0.1.38.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
121
+ natural_pdf-0.1.38.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
122
+ natural_pdf-0.1.38.dist-info/RECORD,,