PyPI - natural-pdf - Versions diffs - 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl - Mend

natural-pdf 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

natural_pdf/__init__.py +24 -40
natural_pdf/classification/manager.py +26 -22
natural_pdf/classification/mixin.py +7 -7
natural_pdf/classification/results.py +17 -9
natural_pdf/collections/mixins.py +17 -0
natural_pdf/collections/pdf_collection.py +78 -46
natural_pdf/core/page.py +17 -17
natural_pdf/core/pdf.py +192 -18
natural_pdf/elements/collections.py +307 -3
natural_pdf/elements/region.py +2 -3
natural_pdf/exporters/hocr.py +540 -0
natural_pdf/exporters/hocr_font.py +142 -0
natural_pdf/exporters/original_pdf.py +130 -0
natural_pdf/exporters/searchable_pdf.py +3 -3
natural_pdf/ocr/engine_surya.py +1 -1
{natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/METADATA +1 -2
{natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/RECORD +20 -17
{natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.9.dist-info → natural_pdf-0.1.11.dist-info}/top_level.txt +0 -0

natural_pdf/__init__.py CHANGED Viewed

@@ -37,72 +37,56 @@ def configure_logging(level=logging.INFO, handler=None):
     logger.propagate = False
+# Version
+__version__ = "0.1.1"
+# Core imports
+from natural_pdf.collections.pdf_collection import PDFCollection
 from natural_pdf.core.page import Page
 from natural_pdf.core.pdf import PDF
 from natural_pdf.elements.collections import ElementCollection
 from natural_pdf.elements.region import Region
-# Import QA module if available
-try:
-    from natural_pdf.qa import DocumentQA, get_qa_engine
-    HAS_QA = True
-except ImportError:
-    HAS_QA = False
-__version__ = "0.1.1"
-__all__ = [
-    "PDF",
-    "PDFCollection",
-    "Page",
-    "Region",
-    "ElementCollection",
-    "TextSearchOptions",
-    "MultiModalSearchOptions",
-    "BaseSearchOptions",
-    "configure_logging",
-]
-if HAS_QA:
-    __all__.extend(["DocumentQA", "get_qa_engine"])
-from .collections.pdf_collection import PDFCollection
-# Core classes
-from .core.pdf import PDF
-from .elements.region import Region
+ElementCollection = None
 # Search options (if extras installed)
 try:
-    from .search.search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
+    from natural_pdf.search.search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
 except ImportError:
     # Define dummy classes if extras not installed, so imports don't break
     # but using them will raise the ImportError from check_haystack_availability
-    class TextSearchOptions:
+    class BaseSearchOptions:
         def __init__(self, *args, **kwargs):
             pass
-    class MultiModalSearchOptions:
+    class TextSearchOptions:
         def __init__(self, *args, **kwargs):
             pass
-    class BaseSearchOptions:
+    class MultiModalSearchOptions:
         def __init__(self, *args, **kwargs):
             pass
-# Expose logging setup? (Optional)
-# from . import logging_config
-# logging_config.setup_logging()
+# Import QA module if available
+try:
+    from natural_pdf.qa import DocumentQA, get_qa_engine
+    HAS_QA = True
+except ImportError:
+    HAS_QA = False
 # Explicitly define what gets imported with 'from natural_pdf import *'
 __all__ = [
     "PDF",
     "PDFCollection",
+    "Page",
     "Region",
-    "TextSearchOptions",  # Include search options
+    "ElementCollection",
+    "TextSearchOptions",
     "MultiModalSearchOptions",
     "BaseSearchOptions",
+    "configure_logging",
 ]
+# Add QA components to __all__ if available
+if HAS_QA:
+    __all__.extend(["DocumentQA", "get_qa_engine"])

natural_pdf/classification/manager.py CHANGED Viewed

@@ -161,7 +161,7 @@ class ClassificationManager:
     def classify_item(
         self,
         item_content: Union[str, Image.Image],
-        categories: List[str],
+        labels: List[str],
         model_id: Optional[str] = None,
         using: Optional[str] = None,
         min_confidence: float = 0.0,
@@ -193,13 +193,13 @@ class ClassificationManager:
                     else self.DEFAULT_VISION_MODEL
                 )
-        if not categories:
-            raise ValueError("Categories list cannot be empty.")
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")
         pipeline_instance = self._get_pipeline(model_id, effective_using)
         timestamp = datetime.now()
         parameters = {  # Store parameters used for this run
-            "categories": categories,
+            "labels": labels,
             "model_id": model_id,
             "using": effective_using,
             "min_confidence": min_confidence,
@@ -214,7 +214,7 @@ class ClassificationManager:
             # Handle potential kwargs for specific pipelines if needed
             # The zero-shot pipelines expect `candidate_labels`
             result_raw = pipeline_instance(
-                item_content, candidate_labels=categories, multi_label=multi_label, **kwargs
+                item_content, candidate_labels=labels, multi_label=multi_label, **kwargs
             )
             logger.debug(f"Raw pipeline result: {result_raw}")
@@ -226,7 +226,7 @@ class ClassificationManager:
                 for label, score_val in zip(result_raw["labels"], result_raw["scores"]):
                     if score_val >= min_confidence:
                         try:
-                            scores_list.append(CategoryScore(label=label, confidence=score_val))
+                            scores_list.append(CategoryScore(label, score_val))
                         except (ValueError, TypeError) as score_err:
                             logger.warning(
                                 f"Skipping invalid score from text pipeline: label='{label}', score={score_val}. Error: {score_err}"
@@ -241,7 +241,7 @@ class ClassificationManager:
                     label = item["label"]
                     if score_val >= min_confidence:
                         try:
-                            scores_list.append(CategoryScore(label=label, confidence=score_val))
+                            scores_list.append(CategoryScore(label, score_val))
                         except (ValueError, TypeError) as score_err:
                             logger.warning(
                                 f"Skipping invalid score from vision pipeline: label='{label}', score={score_val}. Error: {score_err}"
@@ -253,13 +253,15 @@ class ClassificationManager:
                 # Return empty result?
                 # scores_list = []
-            return ClassificationResult(
+            # ClassificationResult now calculates top score/category internally
+            result_obj = ClassificationResult(
+                scores=scores_list,  # Pass the filtered list
                 model_id=model_id,
                 using=effective_using,
-                timestamp=timestamp,
                 parameters=parameters,
-                scores=scores_list,
+                timestamp=timestamp,
             )
+            return result_obj
             # --- End Processing --- #
         except Exception as e:
@@ -273,7 +275,7 @@ class ClassificationManager:
     def classify_batch(
         self,
         item_contents: List[Union[str, Image.Image]],
-        categories: List[str],
+        labels: List[str],
         model_id: Optional[str] = None,
         using: Optional[str] = None,
         min_confidence: float = 0.0,
@@ -307,13 +309,13 @@ class ClassificationManager:
                     else self.DEFAULT_VISION_MODEL
                 )
-        if not categories:
-            raise ValueError("Categories list cannot be empty.")
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")
         pipeline_instance = self._get_pipeline(model_id, effective_using)
         timestamp = datetime.now()  # Single timestamp for the batch run
         parameters = {  # Parameters for the whole batch
-            "categories": categories,
+            "labels": labels,
             "model_id": model_id,
             "using": effective_using,
             "min_confidence": min_confidence,
@@ -331,7 +333,7 @@ class ClassificationManager:
             # Use pipeline directly for batching
             results_iterator = pipeline_instance(
                 item_contents,
-                candidate_labels=categories,
+                candidate_labels=labels,
                 multi_label=multi_label,
                 batch_size=batch_size,
                 **kwargs,
@@ -362,9 +364,7 @@ class ClassificationManager:
                         for label, score_val in zip(raw_result["labels"], raw_result["scores"]):
                             if score_val >= min_confidence:
                                 try:
-                                    scores_list.append(
-                                        CategoryScore(label=label, confidence=score_val)
-                                    )
+                                    scores_list.append(CategoryScore(label, score_val))
                                 except (ValueError, TypeError) as score_err:
                                     logger.warning(
                                         f"Skipping invalid score from text pipeline batch: label='{label}', score={score_val}. Error: {score_err}"
@@ -376,9 +376,7 @@ class ClassificationManager:
                                 score_val = item["score"]
                                 label = item["label"]
                                 if score_val >= min_confidence:
-                                    scores_list.append(
-                                        CategoryScore(label=label, confidence=score_val)
-                                    )
+                                    scores_list.append(CategoryScore(label, score_val))
                             except (KeyError, ValueError, TypeError) as item_err:
                                 logger.warning(
                                     f"Skipping invalid item in vision result list from batch: {item}. Error: {item_err}"
@@ -394,14 +392,20 @@ class ClassificationManager:
                     )
                     # scores_list remains empty for this item
+                # --- Determine top category and score ---
+                scores_list.sort(key=lambda s: s.score, reverse=True)
+                top_category = scores_list[0].label
+                top_score = scores_list[0].score
+                # --- End Determine top category ---
                 # Append result object for this item
                 batch_results_list.append(
                     ClassificationResult(
+                        scores=scores_list,  # Pass the full list, init will sort/filter
                         model_id=model_id,
                         using=effective_using,
                         timestamp=timestamp,  # Use same timestamp for batch
                         parameters=parameters,  # Use same params for batch
-                        scores=scores_list,
                     )
                 )
                 # --- End Processing --- #

natural_pdf/classification/mixin.py CHANGED Viewed

@@ -44,9 +44,9 @@ class ClassificationMixin:
     def classify(
         self,
-        categories: List[str],
-        model: Optional[str] = None,  # Default handled by manager
-        using: Optional[str] = None,  # Renamed parameter
+        labels: List[str],
+        model: Optional[str] = None,
+        using: Optional[str] = None,
         min_confidence: float = 0.0,
         analysis_key: str = "classification",  # Default key
         multi_label: bool = False,
@@ -60,7 +60,7 @@ class ClassificationMixin:
         result under that key.
         Args:
-            categories: A list of string category names.
+            labels: A list of string category names.
             model: Model identifier (e.g., 'text', 'vision', HF ID). Defaults handled by manager.
             using: Optional processing mode ('text' or 'vision'). If None, inferred by manager.
             min_confidence: Minimum confidence threshold for results (0.0-1.0).
@@ -103,9 +103,9 @@ class ClassificationMixin:
             # Manager now returns a ClassificationResult object
             result_obj: ClassificationResult = manager.classify_item(
                 item_content=content,
-                categories=categories,
-                model_id=effective_model_id,  # Pass the resolved model ID
-                using=inferred_using,  # Pass renamed argument
+                labels=labels,
+                model_id=effective_model_id,
+                using=inferred_using,
                 min_confidence=min_confidence,
                 multi_label=multi_label,
                 **kwargs,

natural_pdf/classification/results.py CHANGED Viewed

@@ -11,19 +11,19 @@ logger = logging.getLogger(__name__)
 class CategoryScore:
     """Represents a category and its confidence score from classification."""
-    category: str
+    label: str
     score: float
     def to_dict(self) -> Dict[str, Any]:
         """Convert to dictionary for serialization."""
-        return {"category": self.category, "score": self.score}
+        return {"category": self.label, "score": self.score}
 @dataclass
 class ClassificationResult:
     """Results from a classification operation."""
-    category: str
+    category: Optional[str]  # Can be None if scores are empty
     score: float
     scores: List[CategoryScore]
     model_id: str
@@ -33,17 +33,25 @@ class ClassificationResult:
     def __init__(
         self,
-        category: str,
-        score: float,
-        scores: List[CategoryScore],
+        scores: List[CategoryScore],  # Now the primary source
         model_id: str,
         using: str,
         parameters: Optional[Dict[str, Any]] = None,
         timestamp: Optional[datetime] = None,
     ):
-        self.category = category
-        self.score = score
-        self.scores = scores
+        # Determine top category and score from the scores list
+        if scores:
+            # Sort scores descending by score to find the top one
+            sorted_scores = sorted(scores, key=lambda s: s.score, reverse=True)
+            self.category = sorted_scores[0].label
+            self.score = sorted_scores[0].score
+            self.scores = sorted_scores  # Store the sorted list
+        else:
+            # Handle empty scores list
+            self.category = None
+            self.score = 0.0
+            self.scores = []  # Store empty list
         self.model_id = model_id
         self.using = using
         self.parameters = parameters or {}

natural_pdf/collections/mixins.py CHANGED Viewed

@@ -109,3 +109,20 @@ class ApplyMixin:
             return PageCollection(results)
         return results
+    def filter(self: Any, predicate: Callable[[Any], bool]) -> Any:
+        """
+        Filters the collection based on a predicate function.
+        Args:
+            predicate: A function that takes an item and returns True if the item
+                       should be included in the result, False otherwise.
+        Returns:
+            A new collection of the same type containing only the items
+            for which the predicate returned True.
+        """
+        items_iterable = self._get_items_for_apply()
+        filtered_items = [item for item in items_iterable if predicate(item)]
+        return type(self)(filtered_items)

natural_pdf/collections/pdf_collection.py CHANGED Viewed

@@ -519,7 +519,7 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixi
         return self
-    def categorize(self, categories: List[str], **kwargs):
+    def categorize(self, labels: List[str], **kwargs):
         """Categorizes PDFs in the collection based on content or features."""
         # Implementation requires integrating with classification models or logic
         raise NotImplementedError("categorize requires classification implementation.")
@@ -570,85 +570,101 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixi
     # --- Classification Method --- #
     def classify_all(
         self,
-        categories: List[str],
-        model: str = "text",
+        labels: List[str],
+        using: Optional[str] = None,  # Default handled by PDF.classify -> manager
+        model: Optional[str] = None,  # Optional model ID
         max_workers: Optional[int] = None,
+        analysis_key: str = "classification",  # Key for storing result in PDF.analyses
         **kwargs,
     ) -> "PDFCollection":
         """
-        Classify all pages across all PDFs in the collection, potentially in parallel.
+        Classify each PDF document in the collection, potentially in parallel.
-        This method uses the unified `classify_all` approach, delegating page
-        classification to each PDF's `classify_pages` method.
-        It displays a progress bar tracking individual pages.
+        This method delegates classification to each PDF object's `classify` method.
+        By default, uses the full extracted text of the PDF.
+        If `using='vision'`, it classifies the first page's image, but ONLY if
+        the PDF has a single page (raises ValueError otherwise).
         Args:
-            categories: A list of string category names.
-            model: Model identifier ('text', 'vision', or specific HF ID).
+            labels: A list of string category names.
+            using: Processing mode ('text', 'vision'). If None, manager infers (defaulting to text).
+            model: Optional specific model identifier (e.g., HF ID). If None, manager uses default for 'using' mode.
             max_workers: Maximum number of threads to process PDFs concurrently.
                          If None or 1, processing is sequential.
-            **kwargs: Additional arguments passed down to `pdf.classify_pages` and
-                      subsequently to `page.classify` (e.g., device,
-                      confidence_threshold, resolution).
+            analysis_key: Key under which to store the ClassificationResult in each PDF's `analyses` dict.
+            **kwargs: Additional arguments passed down to `pdf.classify` (e.g., device,
+                      min_confidence, multi_label, text extraction options).
         Returns:
             Self for method chaining.
         Raises:
-            ValueError: If categories list is empty.
-            ClassificationError: If classification fails for any page (will stop processing).
+            ValueError: If labels list is empty, or if using='vision' on a multi-page PDF.
+            ClassificationError: If classification fails for any PDF (will stop processing).
             ImportError: If classification dependencies are missing.
         """
         PDF = self._get_pdf_class()
-        if not categories:
-            raise ValueError("Categories list cannot be empty.")
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")
+        if not self._pdfs:
+            logger.warning("PDFCollection is empty, skipping classification.")
+            return self
+        mode_desc = f"using='{using}'" if using else f"model='{model}'" if model else "default text"
         logger.info(
-            f"Starting classification for {len(self._pdfs)} PDFs in collection (model: '{model}')..."
+            f"Starting classification for {len(self._pdfs)} PDFs in collection ({mode_desc})..."
         )
-        # Calculate total pages for the progress bar
-        total_pages = sum(len(pdf.pages) for pdf in self._pdfs if pdf.pages)
-        if total_pages == 0:
-            logger.warning("No pages found in the PDF collection to classify.")
-            return self
         progress_bar = tqdm(
-            total=total_pages, desc=f"Classifying Pages (model: {model})", unit="page"
+            total=len(self._pdfs), desc=f"Classifying PDFs ({mode_desc})", unit="pdf"
         )
         # Worker function
         def _process_pdf_classification(pdf: PDF):
             thread_id = threading.current_thread().name
             pdf_path = pdf.path
-            logger.debug(f"[{thread_id}] Starting classification process for: {pdf_path}")
+            logger.debug(f"[{thread_id}] Starting classification process for PDF: {pdf_path}")
             start_time = time.monotonic()
             try:
-                # Call classify_pages on the PDF, passing the progress callback
-                pdf.classify_pages(
-                    categories=categories,
+                # Call classify directly on the PDF object
+                pdf.classify(
+                    labels=labels,
+                    using=using,
                     model=model,
-                    progress_callback=progress_bar.update,
-                    **kwargs,
+                    analysis_key=analysis_key,
+                    **kwargs,  # Pass other relevant args like min_confidence, multi_label
                 )
                 end_time = time.monotonic()
                 logger.debug(
-                    f"[{thread_id}] Finished classification for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
+                    f"[{thread_id}] Finished classification for PDF: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
                 )
+                progress_bar.update(1)  # Update progress bar upon success
                 return pdf_path, None  # Return path and no error
-            except Exception as e:
+            except ValueError as ve:
+                # Catch specific error for vision on multi-page PDF
                 end_time = time.monotonic()
-                # Error is logged within classify_pages, but log summary here
                 logger.error(
-                    f"[{thread_id}] Failed classification process for {pdf_path} after {end_time - start_time:.2f}s: {e}",
+                    f"[{thread_id}] Skipped classification for {pdf_path} after {end_time - start_time:.2f}s: {ve}",
                     exc_info=False,
                 )
-                # Close progress bar immediately on error to avoid hanging
-                progress_bar.close()
+                progress_bar.update(1)  # Still update progress bar
+                return pdf_path, ve  # Return the specific ValueError
+            except Exception as e:
+                end_time = time.monotonic()
+                logger.error(
+                    f"[{thread_id}] Failed classification process for PDF {pdf_path} after {end_time - start_time:.2f}s: {e}",
+                    exc_info=True,  # Log full traceback for unexpected errors
+                )
+                # Close progress bar immediately on critical error to avoid hanging
+                if not progress_bar.disable:
+                    progress_bar.close()
                 # Re-raise the exception to stop the entire collection processing
-                raise
+                raise ClassificationError(f"Classification failed for {pdf_path}: {e}") from e
         # Use ThreadPoolExecutor for parallel processing if max_workers > 1
+        processed_count = 0
+        skipped_count = 0
         try:
             if max_workers is not None and max_workers > 1:
                 logger.info(f"Classifying PDFs in parallel with {max_workers} workers.")
@@ -659,23 +675,39 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixi
                     for pdf in self._pdfs:
                         futures.append(executor.submit(_process_pdf_classification, pdf))
-                    # Wait for all futures to complete (progress updated by callback)
-                    # Exceptions are raised by future.result() if worker failed
+                    # Wait for all futures to complete
+                    # Progress updated within worker
                     for future in concurrent.futures.as_completed(futures):
-                        future.result()  # Raise exception if worker failed
+                        processed_count += 1
+                        pdf_path, error = (
+                            future.result()
+                        )  # Raise ClassificationError if worker failed critically
+                        if isinstance(error, ValueError):
+                            # Logged in worker, just count as skipped
+                            skipped_count += 1
             else:  # Sequential processing
                 logger.info("Classifying PDFs sequentially.")
                 for pdf in self._pdfs:
-                    _process_pdf_classification(pdf)
-            logger.info("Finished classification across the collection.")
+                    processed_count += 1
+                    pdf_path, error = _process_pdf_classification(
+                        pdf
+                    )  # Raise ClassificationError if worker failed critically
+                    if isinstance(error, ValueError):
+                        skipped_count += 1
+            final_message = (
+                f"Finished classification across the collection. Processed: {processed_count}"
+            )
+            if skipped_count > 0:
+                final_message += f", Skipped (e.g., vision on multi-page): {skipped_count}"
+            logger.info(final_message + ".")
         finally:
-            # Ensure progress bar is closed even if errors occurred elsewhere
+            # Ensure progress bar is closed properly
             if not progress_bar.disable and progress_bar.n < progress_bar.total:
-                progress_bar.close()
-            elif progress_bar.disable is False:
+                progress_bar.n = progress_bar.total  # Ensure it reaches 100%
+            if not progress_bar.disable:
                 progress_bar.close()
         return self

natural-pdf 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

natural-pdf 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl