natural-pdf 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,7 +25,7 @@ except ImportError:
25
25
  AutoModelForSequenceClassification = object
26
26
  torch = None
27
27
 
28
- from natural_pdf.utils.tqdm_utils import get_tqdm
28
+ from tqdm.auto import tqdm
29
29
 
30
30
  # Import result classes
31
31
  from .results import CategoryScore, ClassificationResult
@@ -343,8 +343,7 @@ class ClassificationManager:
343
343
  total_items = len(item_contents)
344
344
  if progress_bar:
345
345
  # Get the appropriate tqdm class
346
- tqdm_class = get_tqdm()
347
- results_iterator = tqdm_class(
346
+ results_iterator = tqdm(
348
347
  results_iterator,
349
348
  total=total_items,
350
349
  desc=f"Classifying batch ({model_id})",
@@ -25,14 +25,12 @@ from typing import (
25
25
  )
26
26
 
27
27
  from PIL import Image
28
- from tqdm import tqdm
29
- from tqdm.auto import tqdm as auto_tqdm
30
- from tqdm.notebook import tqdm as notebook_tqdm
28
+ from tqdm.auto import tqdm
31
29
 
32
- from natural_pdf.utils.tqdm_utils import get_tqdm
30
+ from natural_pdf.exporters.base import FinetuneExporter
33
31
 
34
- # Get the appropriate tqdm class once
35
- tqdm = get_tqdm()
32
+ # Need to import this utility
33
+ from natural_pdf.utils.identifiers import generate_short_path_hash
36
34
 
37
35
  # Set up logger early
38
36
  # Configure logging to include thread information
@@ -67,8 +65,10 @@ except ImportError as e:
67
65
  from natural_pdf.collections.mixins import ApplyMixin
68
66
  from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
69
67
 
68
+ from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
70
69
 
71
- class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixin
70
+
71
+ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin): # Add ExportMixin and ShapeDetectionMixin
72
72
  def __init__(
73
73
  self,
74
74
  source: Union[str, Iterable[Union[str, "PDF"]]],
@@ -119,16 +119,8 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixi
119
119
  @staticmethod
120
120
  def _get_pdf_class():
121
121
  """Helper method to dynamically import the PDF class."""
122
- try:
123
- # Import needs to resolve path correctly
124
- from natural_pdf.core.pdf import PDF
125
-
126
- return PDF
127
- except ImportError as e:
128
- logger.error(
129
- "Could not import PDF class from natural_pdf.core.pdf. Ensure it exists and there are no circular imports at runtime."
130
- )
131
- raise ImportError("PDF class is required but could not be imported.") from e
122
+ from natural_pdf.core.pdf import PDF
123
+ return PDF
132
124
 
133
125
  # --- Internal Helpers ---
134
126
 
@@ -141,16 +133,13 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixi
141
133
  def _execute_glob(self, pattern: str) -> Set[str]:
142
134
  """Glob for paths and return a set of valid PDF paths."""
143
135
  found_paths = set()
144
- try:
145
- # Use iglob for potentially large directories/matches
146
- paths_iter = py_glob.iglob(pattern, recursive=self._recursive)
147
- for path_str in paths_iter:
148
- # Use Path object for easier checking
149
- p = Path(path_str)
150
- if p.is_file() and p.suffix.lower() == ".pdf":
151
- found_paths.add(str(p.resolve())) # Store resolved absolute path
152
- except Exception as e:
153
- logger.error(f"Error processing glob pattern '{pattern}': {e}")
136
+ # Use iglob for potentially large directories/matches
137
+ paths_iter = py_glob.iglob(pattern, recursive=self._recursive)
138
+ for path_str in paths_iter:
139
+ # Use Path object for easier checking
140
+ p = Path(path_str)
141
+ if p.is_file() and p.suffix.lower() == ".pdf":
142
+ found_paths.add(str(p.resolve())) # Store resolved absolute path
154
143
  return found_paths
155
144
 
156
145
  def _resolve_sources_to_paths(self, source: Union[str, Iterable[str]]) -> List[str]:
@@ -534,19 +523,10 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixi
534
523
  **kwargs: Additional arguments passed to create_correction_task_package
535
524
  (e.g., image_render_scale, overwrite).
536
525
  """
537
- try:
538
- from natural_pdf.utils.packaging import create_correction_task_package
526
+ from natural_pdf.utils.packaging import create_correction_task_package
539
527
 
540
- # Pass the collection itself (self) as the source
541
- create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
542
- except ImportError:
543
- logger.error(
544
- "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
545
- )
546
- # Or raise
547
- except Exception as e:
548
- logger.error(f"Failed to export correction task for collection: {e}", exc_info=True)
549
- raise # Re-raise the exception from the utility function
528
+ # Pass the collection itself (self) as the source
529
+ create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
550
530
 
551
531
  # --- Mixin Required Implementation ---
552
532
  def get_indexable_items(self) -> Iterable[Indexable]:
@@ -215,21 +215,14 @@ class HighlightRenderer:
215
215
  def _render_ocr_text(self):
216
216
  """Renders OCR text onto the image. (Adapted from old HighlightManager)"""
217
217
  # Use the page reference to get OCR elements
218
- try:
219
- # Try finding first, then extracting if necessary
220
- ocr_elements = self.page.find_all("text[source=ocr]")
221
- if not ocr_elements:
222
- # Don't run full OCR here, just extract if already run
223
- ocr_elements = [
224
- el for el in self.page.words if getattr(el, "source", None) == "ocr"
225
- ]
226
- # Alternative: self.page.extract_ocr_elements() - but might be slow
227
-
228
- except Exception as e:
229
- logger.warning(
230
- f"Could not get OCR elements for page {self.page.number}: {e}", exc_info=True
231
- )
232
- return # Don't modify image if OCR elements aren't available
218
+ # Try finding first, then extracting if necessary
219
+ ocr_elements = self.page.find_all("text[source=ocr]")
220
+ if not ocr_elements:
221
+ # Don't run full OCR here, just extract if already run
222
+ ocr_elements = [
223
+ el for el in self.page.words if getattr(el, "source", None) == "ocr"
224
+ ]
225
+ # Alternative: self.page.extract_ocr_elements() - but might be slow
233
226
 
234
227
  if not ocr_elements:
235
228
  logger.debug(f"No OCR elements found for page {self.page.number} to render.")
@@ -293,20 +286,15 @@ class HighlightRenderer:
293
286
  )
294
287
 
295
288
  # Calculate text position (centered vertically, slightly offset from left)
296
- try:
297
- if hasattr(sized_font, "getbbox"): # Modern PIL
298
- _, text_top_offset, _, text_bottom_offset = sized_font.getbbox(element.text)
299
- text_h = text_bottom_offset - text_top_offset
300
- else: # Older PIL approximation
301
- text_h = font_size
302
- text_y = top_s + (box_h - text_h) / 2
303
- # Adjust for vertical offset in some fonts
304
- text_y -= text_top_offset if hasattr(sized_font, "getbbox") else 0
305
- text_x = x0_s + padding # Start near left edge with padding
306
-
307
- except Exception:
308
- # Fallback positioning
309
- text_x, text_y = x0_s + padding, top_s + padding
289
+ if hasattr(sized_font, "getbbox"): # Modern PIL
290
+ _, text_top_offset, _, text_bottom_offset = sized_font.getbbox(element.text)
291
+ text_h = text_bottom_offset - text_top_offset
292
+ else: # Older PIL approximation
293
+ text_h = font_size
294
+ text_y = top_s + (box_h - text_h) / 2
295
+ # Adjust for vertical offset in some fonts
296
+ text_y -= text_top_offset if hasattr(sized_font, "getbbox") else 0
297
+ text_x = x0_s + padding # Start near left edge with padding
310
298
 
311
299
  draw.text((text_x, text_y), element.text, fill=(0, 0, 0, 255), font=sized_font)
312
300
 
@@ -392,9 +380,6 @@ class HighlightingService:
392
380
  except ValueError:
393
381
  logger.warning(f"Invalid color string: '{color_input}'")
394
382
  return None
395
- except Exception as e:
396
- logger.error(f"Error processing color string '{color_input}': {e}")
397
- return None
398
383
  else:
399
384
  logger.warning(f"Invalid color input type: {type(color_input)}")
400
385
  return None
@@ -677,9 +662,12 @@ class HighlightingService:
677
662
  actual_scale_y = scale # Fallback
678
663
  logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f}")
679
664
 
680
- except Exception as e:
681
- logger.error(f"Error creating base image for page {page_index}: {e}", exc_info=True)
682
- return None
665
+ except IOError as e:
666
+ logger.error(f"IOError creating base image for page {page_index}: {e}")
667
+ raise
668
+ except AttributeError as e:
669
+ logger.error(f"AttributeError creating base image for page {page_index}: {e}")
670
+ raise
683
671
 
684
672
  renderer_scale = actual_scale_x # Assuming aspect ratio maintained, use x_scale
685
673
 
@@ -865,8 +853,11 @@ class HighlightingService:
865
853
  else:
866
854
  final_image = rendered_image
867
855
 
868
- except Exception as e:
869
- logger.error(f"Error rendering preview for page {page_index}: {e}", exc_info=True)
870
- return None
856
+ except IOError as e:
857
+ logger.error(f"IOError rendering preview for page {page_index}: {e}")
858
+ raise
859
+ except AttributeError as e:
860
+ logger.error(f"AttributeError rendering preview for page {page_index}: {e}")
861
+ raise
871
862
 
872
863
  return final_image