natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,39 +1,50 @@
1
- import os
1
+ import copy # Added for copying options
2
2
  import glob as py_glob
3
3
  import logging
4
- from typing import List, Optional, Dict, Any, Union, Iterable, Set, TYPE_CHECKING, Type
4
+ import os
5
+ import re # Added for safe path generation
5
6
  from pathlib import Path
7
+ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Type, Union
8
+
6
9
  from PIL import Image
7
- import re # Added for safe path generation
8
- import copy # Added for copying options
9
10
  from tqdm import tqdm
10
11
 
11
12
  # Set up logger early
12
13
  logger = logging.getLogger(__name__)
13
14
 
14
15
  from natural_pdf.core.pdf import PDF
15
- from natural_pdf.elements.region import Region
16
+ from natural_pdf.elements.region import Region
16
17
 
17
18
  # --- Search Imports ---
18
19
  try:
19
20
  from natural_pdf.search.search_service_protocol import (
20
- SearchServiceProtocol, SearchOptions, Indexable
21
- )
21
+ Indexable,
22
+ SearchOptions,
23
+ SearchServiceProtocol,
24
+ )
22
25
  from natural_pdf.search.searchable_mixin import SearchableMixin
23
26
  except ImportError as e:
24
27
  logger_init = logging.getLogger(__name__)
25
- logger_init.error(f"Failed to import search components. Search functionality disabled. Error: {e}", exc_info=True)
28
+ logger_init.warning(
29
+ f"Failed to import Haystack components. Semantic search functionality disabled.",
30
+ )
31
+
26
32
  # Dummy definitions
27
- class SearchableMixin: pass
33
+ class SearchableMixin:
34
+ pass
35
+
28
36
  SearchServiceProtocol, SearchOptions, Indexable = object, object, object
29
37
 
30
- from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
38
+ from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
39
+
31
40
 
32
- class PDFCollection(SearchableMixin): # Inherit from the mixin
33
- def __init__(self,
34
- source: Union[str, Iterable[Union[str, 'PDF']]],
35
- recursive: bool = True,
36
- **pdf_options: Any):
41
+ class PDFCollection(SearchableMixin): # Inherit from the mixin
42
+ def __init__(
43
+ self,
44
+ source: Union[str, Iterable[Union[str, "PDF"]]],
45
+ recursive: bool = True,
46
+ **pdf_options: Any,
47
+ ):
37
48
  """
38
49
  Initializes a collection of PDF documents from various sources.
39
50
 
@@ -46,27 +57,29 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
46
57
  whether to search recursively (default: True).
47
58
  **pdf_options: Keyword arguments passed to the PDF constructor.
48
59
  """
49
- self._pdfs: List['PDF'] = []
50
- self._pdf_options = pdf_options # Store options for potential slicing later
51
- self._recursive = recursive # Store setting for potential slicing
60
+ self._pdfs: List["PDF"] = []
61
+ self._pdf_options = pdf_options # Store options for potential slicing later
62
+ self._recursive = recursive # Store setting for potential slicing
52
63
 
53
64
  # Dynamically import PDF class within methods to avoid circular import at module load time
54
65
  PDF = self._get_pdf_class()
55
66
 
56
- if hasattr(source, '__iter__') and not isinstance(source, str):
57
- source_list = list(source)
58
- if not source_list: return # Empty list source
59
- if isinstance(source_list[0], PDF):
60
- if all(isinstance(item, PDF) for item in source_list):
61
- self._pdfs = source_list # Direct assignment
62
- # Don't adopt search context anymore
63
- return
64
- else: raise TypeError("Iterable source has mixed PDF/non-PDF objects.")
65
- # If it's an iterable but not PDFs, fall through to resolve sources
67
+ if hasattr(source, "__iter__") and not isinstance(source, str):
68
+ source_list = list(source)
69
+ if not source_list:
70
+ return # Empty list source
71
+ if isinstance(source_list[0], PDF):
72
+ if all(isinstance(item, PDF) for item in source_list):
73
+ self._pdfs = source_list # Direct assignment
74
+ # Don't adopt search context anymore
75
+ return
76
+ else:
77
+ raise TypeError("Iterable source has mixed PDF/non-PDF objects.")
78
+ # If it's an iterable but not PDFs, fall through to resolve sources
66
79
 
67
80
  # Resolve string, iterable of strings, or single string source to paths/URLs
68
81
  resolved_paths_or_urls = self._resolve_sources_to_paths(source)
69
- self._initialize_pdfs(resolved_paths_or_urls, PDF) # Pass PDF class
82
+ self._initialize_pdfs(resolved_paths_or_urls, PDF) # Pass PDF class
70
83
 
71
84
  self._iter_index = 0
72
85
 
@@ -79,15 +92,21 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
79
92
  try:
80
93
  # Import needs to resolve path correctly
81
94
  from natural_pdf.core.pdf import PDF
95
+
82
96
  return PDF
83
97
  except ImportError as e:
84
- logger.error("Could not import PDF class from natural_pdf.core.pdf. Ensure it exists and there are no circular imports at runtime.")
98
+ logger.error(
99
+ "Could not import PDF class from natural_pdf.core.pdf. Ensure it exists and there are no circular imports at runtime."
100
+ )
85
101
  raise ImportError("PDF class is required but could not be imported.") from e
86
102
 
87
103
  # --- Internal Helpers ---
88
104
 
89
- def _is_url(self, s: str) -> bool: return s.startswith(('http://', 'https://'))
90
- def _has_glob_magic(self, s: str) -> bool: return py_glob.has_magic(s)
105
+ def _is_url(self, s: str) -> bool:
106
+ return s.startswith(("http://", "https://"))
107
+
108
+ def _has_glob_magic(self, s: str) -> bool:
109
+ return py_glob.has_magic(s)
91
110
 
92
111
  def _execute_glob(self, pattern: str) -> Set[str]:
93
112
  """Glob for paths and return a set of valid PDF paths."""
@@ -96,10 +115,10 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
96
115
  # Use iglob for potentially large directories/matches
97
116
  paths_iter = py_glob.iglob(pattern, recursive=self._recursive)
98
117
  for path_str in paths_iter:
99
- # Use Path object for easier checking
100
- p = Path(path_str)
101
- if p.is_file() and p.suffix.lower() == ".pdf":
102
- found_paths.add(str(p.resolve())) # Store resolved absolute path
118
+ # Use Path object for easier checking
119
+ p = Path(path_str)
120
+ if p.is_file() and p.suffix.lower() == ".pdf":
121
+ found_paths.add(str(p.resolve())) # Store resolved absolute path
103
122
  except Exception as e:
104
123
  logger.error(f"Error processing glob pattern '{pattern}': {e}")
105
124
  return found_paths
@@ -111,33 +130,37 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
111
130
 
112
131
  if isinstance(source, str):
113
132
  sources_to_process.append(source)
114
- elif hasattr(source, '__iter__'):
133
+ elif hasattr(source, "__iter__"):
115
134
  sources_to_process.extend(list(source))
116
- else: # Should not happen based on __init__ checks, but safeguard
117
- raise TypeError(f"Unexpected source type in _resolve_sources_to_paths: {type(source)}")
135
+ else: # Should not happen based on __init__ checks, but safeguard
136
+ raise TypeError(f"Unexpected source type in _resolve_sources_to_paths: {type(source)}")
118
137
 
119
138
  for item in sources_to_process:
120
- if not isinstance(item, str):
121
- logger.warning(f"Skipping non-string item in source list: {type(item)}")
122
- continue
123
-
124
- item_path = Path(item)
125
-
126
- if self._is_url(item):
127
- final_paths.add(item) # Add URL directly
128
- elif self._has_glob_magic(item):
129
- glob_results = self._execute_glob(item)
130
- final_paths.update(glob_results)
131
- elif item_path.is_dir():
132
- # Use glob to find PDFs in directory, respecting recursive flag
133
- dir_pattern = str(item_path / "**" / "*.pdf") if self._recursive else str(item_path / "*.pdf")
134
- dir_glob_results = self._execute_glob(dir_pattern)
135
- final_paths.update(dir_glob_results)
136
- elif item_path.is_file() and item_path.suffix.lower() == ".pdf":
137
- final_paths.add(str(item_path.resolve())) # Add resolved file path
138
- else:
139
- logger.warning(f"Source item ignored (not a valid URL, directory, file, or glob): {item}")
140
-
139
+ if not isinstance(item, str):
140
+ logger.warning(f"Skipping non-string item in source list: {type(item)}")
141
+ continue
142
+
143
+ item_path = Path(item)
144
+
145
+ if self._is_url(item):
146
+ final_paths.add(item) # Add URL directly
147
+ elif self._has_glob_magic(item):
148
+ glob_results = self._execute_glob(item)
149
+ final_paths.update(glob_results)
150
+ elif item_path.is_dir():
151
+ # Use glob to find PDFs in directory, respecting recursive flag
152
+ dir_pattern = (
153
+ str(item_path / "**" / "*.pdf") if self._recursive else str(item_path / "*.pdf")
154
+ )
155
+ dir_glob_results = self._execute_glob(dir_pattern)
156
+ final_paths.update(dir_glob_results)
157
+ elif item_path.is_file() and item_path.suffix.lower() == ".pdf":
158
+ final_paths.add(str(item_path.resolve())) # Add resolved file path
159
+ else:
160
+ logger.warning(
161
+ f"Source item ignored (not a valid URL, directory, file, or glob): {item}"
162
+ )
163
+
141
164
  return sorted(list(final_paths))
142
165
 
143
166
  def _initialize_pdfs(self, paths_or_urls: List[str], PDF_cls: Type):
@@ -149,32 +172,38 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
149
172
  pdf_instance = PDF_cls(path_or_url, **self._pdf_options)
150
173
  self._pdfs.append(pdf_instance)
151
174
  except Exception as e:
152
- logger.error(f"Failed to load PDF: {path_or_url}. Error: {e}", exc_info=False) # Keep log concise
153
- failed_count += 1
175
+ logger.error(
176
+ f"Failed to load PDF: {path_or_url}. Error: {e}", exc_info=False
177
+ ) # Keep log concise
178
+ failed_count += 1
154
179
  logger.info(f"Successfully initialized {len(self._pdfs)} PDFs. Failed: {failed_count}")
155
180
 
156
181
  # --- Public Factory Class Methods (Simplified) ---
157
182
 
158
183
  @classmethod
159
- def from_paths(cls, paths_or_urls: List[str], **pdf_options: Any) -> 'PDFCollection':
184
+ def from_paths(cls, paths_or_urls: List[str], **pdf_options: Any) -> "PDFCollection":
160
185
  """Creates a PDFCollection explicitly from a list of file paths or URLs."""
161
186
  # __init__ can handle List[str] directly now
162
187
  return cls(paths_or_urls, **pdf_options)
163
188
 
164
189
  @classmethod
165
- def from_glob(cls, pattern: str, recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
190
+ def from_glob(cls, pattern: str, recursive: bool = True, **pdf_options: Any) -> "PDFCollection":
166
191
  """Creates a PDFCollection explicitly from a single glob pattern."""
167
192
  # __init__ can handle single glob string directly
168
193
  return cls(pattern, recursive=recursive, **pdf_options)
169
194
 
170
195
  @classmethod
171
- def from_globs(cls, patterns: List[str], recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
196
+ def from_globs(
197
+ cls, patterns: List[str], recursive: bool = True, **pdf_options: Any
198
+ ) -> "PDFCollection":
172
199
  """Creates a PDFCollection explicitly from a list of glob patterns."""
173
- # __init__ can handle List[str] containing globs directly
200
+ # __init__ can handle List[str] containing globs directly
174
201
  return cls(patterns, recursive=recursive, **pdf_options)
175
202
 
176
203
  @classmethod
177
- def from_directory(cls, directory_path: str, recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
204
+ def from_directory(
205
+ cls, directory_path: str, recursive: bool = True, **pdf_options: Any
206
+ ) -> "PDFCollection":
178
207
  """Creates a PDFCollection explicitly from PDF files within a directory."""
179
208
  # __init__ can handle single directory string directly
180
209
  return cls(directory_path, recursive=recursive, **pdf_options)
@@ -183,12 +212,12 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
183
212
  def __len__(self) -> int:
184
213
  return len(self._pdfs)
185
214
 
186
- def __getitem__(self, key) -> Union['PDF', 'PDFCollection']:
215
+ def __getitem__(self, key) -> Union["PDF", "PDFCollection"]:
187
216
  # Use dynamic import here as well
188
217
  PDF = self._get_pdf_class()
189
218
  if isinstance(key, slice):
190
219
  # Create a new collection with the sliced PDFs and original options
191
- new_collection = PDFCollection.__new__(PDFCollection) # Create blank instance
220
+ new_collection = PDFCollection.__new__(PDFCollection) # Create blank instance
192
221
  new_collection._pdfs = self._pdfs[key]
193
222
  new_collection._pdf_options = self._pdf_options
194
223
  new_collection._recursive = self._recursive
@@ -199,9 +228,9 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
199
228
  if 0 <= key < len(self._pdfs):
200
229
  return self._pdfs[key]
201
230
  else:
202
- raise IndexError(f"PDF index {key} out of range (0-{len(self._pdfs)-1}).")
231
+ raise IndexError(f"PDF index {key} out of range (0-{len(self._pdfs)-1}).")
203
232
  else:
204
- raise TypeError(f"PDF indices must be integers or slices, not {type(key)}.")
233
+ raise TypeError(f"PDF indices must be integers or slices, not {type(key)}.")
205
234
 
206
235
  def __iter__(self):
207
236
  return iter(self._pdfs)
@@ -211,24 +240,23 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
211
240
  return f"<PDFCollection(count={len(self)})>"
212
241
 
213
242
  @property
214
- def pdfs(self) -> List['PDF']:
215
- """Returns the list of PDF objects held by the collection."""
216
- return self._pdfs
243
+ def pdfs(self) -> List["PDF"]:
244
+ """Returns the list of PDF objects held by the collection."""
245
+ return self._pdfs
217
246
 
218
- # --- Other Methods (e.g., apply_ocr_to_pages - could leverage service in future?) ---
219
- def apply_ocr_to_pages(self, *args, **kwargs):
247
+ def apply_ocr(self, *args, **kwargs):
220
248
  PDF = self._get_pdf_class()
221
249
  # Delegate to individual PDF objects
222
250
  logger.info("Applying OCR to relevant PDFs in collection...")
223
251
  results = []
224
252
  for pdf in self._pdfs:
225
- # We need to figure out which pages belong to which PDF if batching here
226
- # For now, simpler to call on each PDF
227
- try:
228
- # Assume apply_ocr_to_pages exists on PDF and accepts similar args
229
- pdf.apply_ocr_to_pages(*args, **kwargs)
230
- except Exception as e:
231
- logger.error(f"Failed applying OCR to {pdf.path}: {e}", exc_info=True)
253
+ # We need to figure out which pages belong to which PDF if batching here
254
+ # For now, simpler to call on each PDF
255
+ try:
256
+ # Assume apply_ocr exists on PDF and accepts similar args
257
+ pdf.apply_ocr(*args, **kwargs)
258
+ except Exception as e:
259
+ logger.error(f"Failed applying OCR to {pdf.path}: {e}", exc_info=True)
232
260
  return self
233
261
 
234
262
  # --- Advanced Method Placeholders ---
@@ -237,23 +265,23 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
237
265
  def categorize(self, categories: List[str], **kwargs):
238
266
  """Categorizes PDFs in the collection based on content or features."""
239
267
  # Implementation requires integrating with classification models or logic
240
- raise NotImplementedError("categorize requires classification implementation.")
268
+ raise NotImplementedError("categorize requires classification implementation.")
241
269
 
242
- # --- Mixin Required Implementation ---
270
+ # --- Mixin Required Implementation ---
243
271
  def get_indexable_items(self) -> Iterable[Indexable]:
244
272
  """Yields Page objects from the collection, conforming to Indexable."""
245
273
  if not self._pdfs:
246
- return # Return empty iterator if no PDFs
247
-
274
+ return # Return empty iterator if no PDFs
275
+
248
276
  for pdf in self._pdfs:
249
- if not pdf.pages: # Handle case where a PDF might have 0 pages after loading
250
- logger.warning(f"PDF '{pdf.path}' has no pages. Skipping.")
251
- continue
252
- for page in pdf.pages:
253
- # Optional: Add filtering here if needed (e.g., skip empty pages)
254
- # Assuming Page object conforms to Indexable
255
- # We might still want the empty page check here for efficiency
256
- # if not page.extract_text(use_exclusions=False).strip():
257
- # logger.debug(f"Skipping empty page {page.page_number} from PDF '{pdf.path}'.")
258
- # continue
259
- yield page
277
+ if not pdf.pages: # Handle case where a PDF might have 0 pages after loading
278
+ logger.warning(f"PDF '{pdf.path}' has no pages. Skipping.")
279
+ continue
280
+ for page in pdf.pages:
281
+ # Optional: Add filtering here if needed (e.g., skip empty pages)
282
+ # Assuming Page object conforms to Indexable
283
+ # We might still want the empty page check here for efficiency
284
+ # if not page.extract_text(use_exclusions=False).strip():
285
+ # logger.debug(f"Skipping empty page {page.page_number} from PDF '{pdf.path}'.")
286
+ # continue
287
+ yield page
@@ -1,3 +1,3 @@
1
1
  """
2
2
  Core classes for Natural PDF.
3
- """
3
+ """