PyPI - natural-pdf - Versions diffs - 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl - Mend

natural-pdf 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

natural_pdf/__init__.py +31 -0
natural_pdf/analyzers/layout/gemini.py +137 -162
natural_pdf/analyzers/layout/layout_manager.py +9 -5
natural_pdf/analyzers/layout/layout_options.py +77 -7
natural_pdf/analyzers/layout/paddle.py +318 -165
natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
natural_pdf/analyzers/shape_detection_mixin.py +770 -405
natural_pdf/classification/mixin.py +2 -8
natural_pdf/collections/pdf_collection.py +25 -30
natural_pdf/core/highlighting_service.py +47 -32
natural_pdf/core/page.py +117 -75
natural_pdf/core/pdf.py +19 -22
natural_pdf/elements/base.py +9 -9
natural_pdf/elements/collections.py +105 -50
natural_pdf/elements/region.py +200 -126
natural_pdf/exporters/paddleocr.py +38 -13
natural_pdf/flows/__init__.py +3 -3
natural_pdf/flows/collections.py +303 -132
natural_pdf/flows/element.py +277 -132
natural_pdf/flows/flow.py +33 -16
natural_pdf/flows/region.py +142 -79
natural_pdf/ocr/engine_doctr.py +37 -4
natural_pdf/ocr/engine_easyocr.py +23 -3
natural_pdf/ocr/engine_paddle.py +281 -30
natural_pdf/ocr/engine_surya.py +8 -3
natural_pdf/ocr/ocr_manager.py +75 -76
natural_pdf/ocr/ocr_options.py +52 -87
natural_pdf/search/__init__.py +25 -12
natural_pdf/search/lancedb_search_service.py +91 -54
natural_pdf/search/numpy_search_service.py +86 -65
natural_pdf/search/searchable_mixin.py +2 -2
natural_pdf/selectors/parser.py +125 -81
natural_pdf/widgets/__init__.py +1 -1
natural_pdf/widgets/viewer.py +205 -449
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0

natural_pdf/exporters/paddleocr.py CHANGED Viewed

@@ -1,9 +1,9 @@
+import collections
 import logging
 import os
 import random
 import shutil
 from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
-import collections
 from tqdm.auto import tqdm
@@ -144,16 +144,34 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
                     )
                     continue
-                elements = pdf.find_all(
-                    self.selector, apply_exclusions=False
-                )  # Usually want all text, even if excluded
+                elements = pdf.find_all(self.selector, apply_exclusions=False)
                 if not elements:
                     logger.debug(f"No elements matching '{self.selector}' found in {pdf.path}")
                     continue
+                # --- FILTER BASED ON CHARACTER FREQUENCY BEFORE EXPORT ---
+                filtered_elements = []
+                if self.min_char_freq > 1:
+                    # First, count all characters in all elements
+                    char_counts = collections.Counter()
+                    for element in elements:
+                        if hasattr(element, "text") and isinstance(element.text, str):
+                            char_counts.update(element.text)
+                    rare_chars = {
+                        char for char, count in char_counts.items() if count < self.min_char_freq
+                    }
+                    for element in elements:
+                        if hasattr(element, "text") and isinstance(element.text, str):
+                            if any(char in rare_chars for char in element.text):
+                                elements_skipped += 1
+                                continue
+                        filtered_elements.append(element)
+                else:
+                    filtered_elements = elements
                 for i, element in enumerate(
                     tqdm(
-                        elements,
+                        filtered_elements,
                         desc=f"Exporting '{os.path.basename(pdf.path)}'",
                         leave=False,
                         position=1,
@@ -243,16 +261,20 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
                 filtered_labels = []
                 for img_path, text in labels:
                     if any(char in rare_chars for char in text):
-                        elements_skipped += 1 # Count these as skipped due to rare chars
-                        elements_processed -=1 # Decrement from processed as it's now being skipped
+                        elements_skipped += 1  # Count these as skipped due to rare chars
+                        elements_processed -= (
+                            1  # Decrement from processed as it's now being skipped
+                        )
                     else:
                         filtered_labels.append((img_path, text))
                 labels_removed_count = original_label_count - len(filtered_labels)
                 if labels_removed_count > 0:
-                    logger.info(f"Removed {labels_removed_count} elements containing rare characters.")
+                    logger.info(
+                        f"Removed {labels_removed_count} elements containing rare characters."
+                    )
                 labels = filtered_labels
                 # Recalculate char_counts based on filtered_labels to update the dictionary
                 char_counts.clear()
                 for _, text in labels:
@@ -266,15 +288,18 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
             else:
                 logger.info("No rare characters found below the frequency threshold.")
         # --- 3. Generate Dictionary File (`dict.txt`) ---
         dict_path = os.path.join(output_dir, "dict.txt")
         try:
             # Log the character set before sorting/writing
-            final_chars_for_dict = set(char_counts.keys()) # Use keys from potentially filtered char_counts
+            final_chars_for_dict = set(
+                char_counts.keys()
+            )  # Use keys from potentially filtered char_counts
             logger.debug(f"Exporter final char_set for dict: {repr(final_chars_for_dict)}")
-            sorted_chars = sorted(list(final_chars_for_dict)) # No specific sorting order needed, just make it consistent
+            sorted_chars = sorted(
+                list(final_chars_for_dict)
+            )  # No specific sorting order needed, just make it consistent
             with open(dict_path, "w", encoding="utf-8") as f_dict:
                 for char in sorted_chars:
                     # Ensure we don't write empty strings or just newlines as dictionary entries

natural_pdf/flows/__init__.py CHANGED Viewed

@@ -1,7 +1,7 @@
-from .flow import Flow
+from .collections import FlowElementCollection, FlowRegionCollection
 from .element import FlowElement
+from .flow import Flow
 from .region import FlowRegion
-from .collections import FlowElementCollection, FlowRegionCollection
 __all__ = [
     "Flow",
@@ -9,4 +9,4 @@ __all__ = [
     "FlowRegion",
     "FlowElementCollection",
     "FlowRegionCollection",
-]
+]

natural-pdf 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl

natural-pdf 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl