PyPI - natural-pdf - Versions diffs - 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl - Mend

natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

natural_pdf/__init__.py +7 -2
natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
natural_pdf/analyzers/text_options.py +9 -1
natural_pdf/analyzers/text_structure.py +371 -58
natural_pdf/classification/manager.py +3 -4
natural_pdf/collections/pdf_collection.py +19 -39
natural_pdf/core/element_manager.py +11 -1
natural_pdf/core/highlighting_service.py +146 -75
natural_pdf/core/page.py +287 -188
natural_pdf/core/pdf.py +57 -42
natural_pdf/elements/base.py +51 -0
natural_pdf/elements/collections.py +362 -67
natural_pdf/elements/line.py +5 -0
natural_pdf/elements/region.py +396 -23
natural_pdf/exporters/data/__init__.py +0 -0
natural_pdf/exporters/data/pdf.ttf +0 -0
natural_pdf/exporters/data/sRGB.icc +0 -0
natural_pdf/exporters/hocr.py +40 -61
natural_pdf/exporters/hocr_font.py +7 -13
natural_pdf/exporters/original_pdf.py +10 -13
natural_pdf/exporters/paddleocr.py +51 -11
natural_pdf/exporters/searchable_pdf.py +0 -10
natural_pdf/flows/__init__.py +12 -0
natural_pdf/flows/collections.py +533 -0
natural_pdf/flows/element.py +382 -0
natural_pdf/flows/flow.py +216 -0
natural_pdf/flows/region.py +458 -0
natural_pdf/search/__init__.py +65 -52
natural_pdf/search/lancedb_search_service.py +325 -0
natural_pdf/search/numpy_search_service.py +255 -0
natural_pdf/search/searchable_mixin.py +25 -71
natural_pdf/selectors/parser.py +163 -8
natural_pdf/widgets/viewer.py +22 -31
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
natural_pdf/search/haystack_search_service.py +0 -687
natural_pdf/search/haystack_utils.py +0 -474
natural_pdf/utils/tqdm_utils.py +0 -51
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0

natural_pdf/exporters/hocr.py CHANGED Viewed

@@ -66,28 +66,28 @@ class HocrTransform:
     """
     box_pattern = re.compile(
-        r'''
+        r"""
         bbox \s+
         (\d+) \s+   # left: uint
         (\d+) \s+   # top: uint
         (\d+) \s+   # right: uint
         (\d+)       # bottom: uint
-        ''',
+        """,
         re.VERBOSE,
     )
     baseline_pattern = re.compile(
-        r'''
+        r"""
         baseline \s+
         ([\-\+]?\d*\.?\d*) \s+  # +/- decimal float
         ([\-\+]?\d+)            # +/- int
-        ''',
+        """,
         re.VERBOSE,
     )
     textangle_pattern = re.compile(
-        r'''
+        r"""
         textangle \s+
         ([\-\+]?\d*\.?\d*)  # +/- decimal float
-        ''',
+        """,
         re.VERBOSE,
     )
@@ -121,12 +121,12 @@ class HocrTransform:
         # if the hOCR file has a namespace, ElementTree requires its use to
         # find elements
-        matches = re.match(r'({.*})html', self.hocr.getroot().tag)
-        self.xmlns = ''
+        matches = re.match(r"({.*})html", self.hocr.getroot().tag)
+        self.xmlns = ""
         if matches:
             self.xmlns = matches.group(1)
-        for div in self.hocr.findall(self._child_xpath('div', 'ocr_page')):
+        for div in self.hocr.findall(self._child_xpath("div", "ocr_page")):
             coords = self.element_coordinates(div)
             if not coords:
                 raise HocrTransformError("hocr file is missing page dimensions")
@@ -137,16 +137,16 @@ class HocrTransform:
     def _get_element_text(self, element: Element) -> str:
         """Return the textual content of the element and its children."""
-        text = element.text if element.text is not None else ''
+        text = element.text if element.text is not None else ""
         for child in element:
             text += self._get_element_text(child)
-        text += element.tail if element.tail is not None else ''
+        text += element.tail if element.tail is not None else ""
         return text
     @classmethod
     def element_coordinates(cls, element: Element) -> Rectangle | None:
         """Get coordinates of the bounding box around an element."""
-        matches = cls.box_pattern.search(element.attrib.get('title', ''))
+        matches = cls.box_pattern.search(element.attrib.get("title", ""))
         if not matches:
             return None
         return Rectangle(
@@ -159,7 +159,7 @@ class HocrTransform:
     @classmethod
     def baseline(cls, element: Element) -> tuple[float, float]:
         """Get baseline's slope and intercept."""
-        matches = cls.baseline_pattern.search(element.attrib.get('title', ''))
+        matches = cls.baseline_pattern.search(element.attrib.get("title", ""))
         if not matches:
             return (0.0, 0.0)
         return float(matches.group(1)), int(matches.group(2))
@@ -167,7 +167,7 @@ class HocrTransform:
     @classmethod
     def textangle(cls, element: Element) -> float:
         """Get text angle of an element."""
-        matches = cls.textangle_pattern.search(element.attrib.get('title', ''))
+        matches = cls.textangle_pattern.search(element.attrib.get("title", ""))
         if not matches:
             return 0.0
         return float(matches.group(1))
@@ -220,13 +220,13 @@ class HocrTransform:
         with canvas.do.save_state(cm=page_matrix):
             self._debug_draw_paragraph_boxes(canvas)
             found_lines = False
-            for par in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
+            for par in self.hocr.iterfind(self._child_xpath("p", "ocr_par")):
                 for line in (
                     element
-                    for element in par.iterfind(self._child_xpath('span'))
-                    if 'class' in element.attrib
-                    and element.attrib['class']
-                    in {'ocr_header', 'ocr_line', 'ocr_textfloat', 'ocr_caption'}
+                    for element in par.iterfind(self._child_xpath("span"))
+                    if "class" in element.attrib
+                    and element.attrib["class"]
+                    in {"ocr_header", "ocr_line", "ocr_textfloat", "ocr_caption"}
                 ):
                     found_lines = True
                     direction = self._get_text_direction(par)
@@ -242,7 +242,7 @@ class HocrTransform:
             if not found_lines:
                 # Tesseract did not report any lines (just words)
-                root = self.hocr.find(self._child_xpath('div', 'ocr_page'))
+                root = self.hocr.find(self._child_xpath("div", "ocr_page"))
                 direction = self._get_text_direction(root)
                 self._do_line(
                     canvas,
@@ -254,27 +254,21 @@ class HocrTransform:
                 )
         # put the image on the page, scaled to fill the page
         if image_filename is not None:
-            canvas.do.draw_image(
-                image_filename, 0, 0, width=self.width, height=self.height
-            )
+            canvas.do.draw_image(image_filename, 0, 0, width=self.width, height=self.height)
         # finish up the page and save it
         canvas.to_pdf().save(out_filename)
     def _get_text_direction(self, par):
         """Get the text direction of the paragraph.
         Arabic, Hebrew, Persian, are right-to-left languages.
         When the paragraph element is None, defaults to left-to-right.
         """
         if par is None:
             return TextDirection.LTR
-        return (
-            TextDirection.RTL
-            if par.attrib.get('dir', 'ltr') == 'rtl'
-            else TextDirection.LTR
-        )
+        return TextDirection.RTL if par.attrib.get("dir", "ltr") == "rtl" else TextDirection.LTR
     def _get_inject_word_breaks(self, par):
         """Determine whether word breaks should be injected.
@@ -283,9 +277,9 @@ class HocrTransform:
         words are usually one or two characters and separators are usually explicit.
         In all other languages, we inject word breaks to help word segmentation.
         """
-        lang = par.attrib.get('lang', '')
+        lang = par.attrib.get("lang", "")
         log.debug(lang)
-        if lang in {'chi_sim', 'chi_tra', 'jpn', 'kor'}:
+        if lang in {"chi_sim", "chi_tra", "jpn", "kor"}:
             return False
         return True
@@ -339,8 +333,7 @@ class HocrTransform:
         # size as the true bounding box of the line.
         top_left_corner = (line_min_aabb.llx, line_min_aabb.lly)
         line_size_aabb_matrix = (
-            Matrix()
-            .translated(*top_left_corner)
+            Matrix().translated(*top_left_corner)
             # Note: negative sign (textangle is counter-clockwise, see hOCR spec)
             .rotated(-self.textangle(line))
         )
@@ -371,12 +364,10 @@ class HocrTransform:
             text.font(self._fontname, fontsize)
             text.render_mode(3 if invisible_text else 0)
-            self._debug_draw_baseline(
-                canvas, baseline_matrix.inverse().transform(line_min_aabb), 0
-            )
+            self._debug_draw_baseline(canvas, baseline_matrix.inverse().transform(line_min_aabb), 0)
             canvas.do.fill_color(BLACK)  # text in black
-            elements = line.findall(self._child_xpath('span', elemclass))
+            elements = line.findall(self._child_xpath("span", elemclass))
             for elem, next_elem in pairwise(elements + [None]):
                 self._do_line_word(
                     canvas,
@@ -405,7 +396,7 @@ class HocrTransform:
         if elem is None:
             return
         elemtxt = self.normalize_text(self._get_element_text(elem).strip())
-        if elemtxt == '':
+        if elemtxt == "":
             return
         hocr_box = self.element_coordinates(elem)
@@ -430,9 +421,7 @@ class HocrTransform:
             text.show(self._font.text_encode(elemtxt))
         # Get coordinates of the next word (if there is one)
-        hocr_next_box = (
-            self.element_coordinates(next_elem) if next_elem is not None else None
-        )
+        hocr_next_box = self.element_coordinates(next_elem) if next_elem is not None else None
         if hocr_next_box is None:
             return
         # Render a space between this word and the next word. The explicit space helps
@@ -447,16 +436,14 @@ class HocrTransform:
         elif text_direction == TextDirection.RTL:
             space_box = Rectangle(next_box.urx, box.lly, box.llx, next_box.ury)
         self._debug_draw_space_bbox(canvas, space_box)
-        space_width = self._font.text_width(' ', fontsize)
+        space_width = self._font.text_width(" ", fontsize)
         if space_width > 0 and space_box.width > 0:
             if text_direction == TextDirection.LTR:
                 text.text_transform(Matrix(1, 0, 0, -1, space_box.llx, 0))
             elif text_direction == TextDirection.RTL:
-                text.text_transform(
-                    Matrix(-1, 0, 0, -1, space_box.llx + space_box.width, 0)
-                )
+                text.text_transform(Matrix(-1, 0, 0, -1, space_box.llx + space_box.width, 0))
             text.horiz_scale(100 * space_box.width / space_width)
-            text.show(self._font.text_encode(' '))
+            text.show(self._font.text_encode(" "))
     def _debug_draw_paragraph_boxes(self, canvas: Canvas, color=CYAN):
         """Draw boxes around paragraphs in the document."""
@@ -465,16 +452,14 @@ class HocrTransform:
         with canvas.do.save_state():
             # draw box around paragraph
             canvas.do.stroke_color(color).line_width(0.1)
-            for elem in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
+            for elem in self.hocr.iterfind(self._child_xpath("p", "ocr_par")):
                 elemtxt = self._get_element_text(elem).strip()
                 if len(elemtxt) == 0:
                     continue
                 ocr_par = self.element_coordinates(elem)
                 if ocr_par is None:
                     continue
-                canvas.do.rect(
-                    ocr_par.llx, ocr_par.lly, ocr_par.width, ocr_par.height, fill=False
-                )
+                canvas.do.rect(ocr_par.llx, ocr_par.lly, ocr_par.width, ocr_par.height, fill=False)
     def _debug_draw_line_bbox(self, canvas: Canvas, line_box: Rectangle, color=BLUE):
         """Render the bounding box of a text line."""
@@ -485,22 +470,16 @@ class HocrTransform:
                 line_box.llx, line_box.lly, line_box.width, line_box.height, fill=False
             )
-    def _debug_draw_word_triangle(
-        self, canvas: Canvas, box: Rectangle, color=RED, line_width=0.1
-    ):
+    def _debug_draw_word_triangle(self, canvas: Canvas, box: Rectangle, color=RED, line_width=0.1):
         """Render a triangle that conveys word height and drawing direction."""
         if not self.render_options.render_triangle:  # pragma: no cover
             return
         with canvas.do.save_state():
             canvas.do.stroke_color(color).line_width(line_width).line(
                 box.llx, box.lly, box.urx, box.lly
-            ).line(box.urx, box.lly, box.llx, box.ury).line(
-                box.llx, box.lly, box.llx, box.ury
-            )
+            ).line(box.urx, box.lly, box.llx, box.ury).line(box.llx, box.lly, box.llx, box.ury)
-    def _debug_draw_word_bbox(
-        self, canvas: Canvas, box: Rectangle, color=GREEN, line_width=0.1
-    ):
+    def _debug_draw_word_bbox(self, canvas: Canvas, box: Rectangle, color=GREEN, line_width=0.1):
         """Render a box depicting the word."""
         if not self.render_options.render_word_bbox:  # pragma: no cover
             return
@@ -537,4 +516,4 @@ class HocrTransform:
                 baseline_lly,
                 line_box.urx,
                 baseline_lly,
-            )
+            )

natural_pdf/exporters/hocr_font.py CHANGED Viewed

@@ -8,11 +8,7 @@ import unicodedata
 import zlib
 from importlib.resources import files as package_files
-from pikepdf import (
-    Dictionary,
-    Name,
-    Pdf,
-)
+from pikepdf import Dictionary, Name, Pdf
 from pikepdf.canvas import Font
 log = logging.getLogger(__name__)
@@ -25,8 +21,8 @@ class EncodableFont(Font):
 class GlyphlessFont(EncodableFont):
     CID_TO_GID_DATA = zlib.compress(b"\x00\x01" * 65536)
-    GLYPHLESS_FONT_NAME = 'pdf.ttf'
-    GLYPHLESS_FONT_PACKAGE_PATH = 'natural_pdf.exporters.data'
+    GLYPHLESS_FONT_NAME = "pdf.ttf"
+    GLYPHLESS_FONT_PACKAGE_PATH = "natural_pdf.exporters.data"
     GLYPHLESS_FONT = (package_files(GLYPHLESS_FONT_PACKAGE_PATH) / GLYPHLESS_FONT_NAME).read_bytes()
     CHAR_ASPECT = 2
@@ -39,7 +35,7 @@ class GlyphlessFont(EncodableFont):
         return len(unicodedata.normalize("NFKC", text)) * (fontsize / self.CHAR_ASPECT)
     def text_encode(self, text: str) -> bytes:
-        return text.encode('utf-16be')
+        return text.encode("utf-16be")
     def register(self, pdf: Pdf):
         """Register the glyphless font.
@@ -76,9 +72,7 @@ class GlyphlessFont(EncodableFont):
             )
         )
         basefont.DescendantFonts = [cid_font_type2]
-        cid_font_type2.CIDToGIDMap = pdf.make_stream(
-            self.CID_TO_GID_DATA, Filter=Name.FlateDecode
-        )
+        cid_font_type2.CIDToGIDMap = pdf.make_stream(self.CID_TO_GID_DATA, Filter=Name.FlateDecode)
         basefont.ToUnicode = pdf.make_stream(
             b"/CIDInit /ProcSet findresource begin\n"
             b"12 dict begin\n"
@@ -129,7 +123,7 @@ class Courier(EncodableFont):
         return len(text) * fontsize
     def text_encode(self, text: str) -> bytes:
-        return text.encode('pdfdoc', errors='ignore')
+        return text.encode("pdfdoc", errors="ignore")
     def register(self, pdf: Pdf) -> Dictionary:
         """Register the font."""
@@ -139,4 +133,4 @@ class Courier(EncodableFont):
                 Type=Name.Font,
                 Subtype=Name.Type1,
             )
-        )
+        )

natural_pdf/exporters/original_pdf.py CHANGED Viewed

@@ -44,7 +44,7 @@ def create_original_pdf(
     if pikepdf is None:
         raise ImportError(
             "Saving original PDF pages requires 'pikepdf'. "
-            "Install with: pip install \"natural-pdf[ocr-export]\""
+            'Install with: pip install "natural-pdf[ocr-export]"'
         )
     output_path_str = str(output_path)
@@ -55,18 +55,17 @@ def create_original_pdf(
         if not source.pages:
             raise ValueError("Cannot save an empty collection/PDF.")
         pages_to_extract = source.pages
-    elif hasattr(source, "page") and hasattr(source, "number"): # Single Page object
-         # Check if it's a natural_pdf.core.page.Page or similar duck-typed object
-         if hasattr(source, 'pdf') and source.pdf and hasattr(source.pdf, 'path'):
-             pages_to_extract = [source]
-         else:
+    elif hasattr(source, "page") and hasattr(source, "number"):  # Single Page object
+        # Check if it's a natural_pdf.core.page.Page or similar duck-typed object
+        if hasattr(source, "pdf") and source.pdf and hasattr(source.pdf, "path"):
+            pages_to_extract = [source]
+        else:
             raise ValueError("Input Page object does not have a valid PDF reference with a path.")
     else:
         raise TypeError(f"Unsupported source type for create_original_pdf: {type(source)}")
     if not pages_to_extract:
-         raise ValueError("No valid pages found in the source object.")
+        raise ValueError("No valid pages found in the source object.")
     # Verify all pages come from the same PDF and get path
     first_page_pdf_path = None
@@ -115,16 +114,14 @@ def create_original_pdf(
             )
     except pikepdf.PasswordError:
-        logger.error(
-            f"Failed to open password-protected source PDF: {first_page_pdf_path}"
-        )
+        logger.error(f"Failed to open password-protected source PDF: {first_page_pdf_path}")
         raise RuntimeError(
             f"Source PDF '{first_page_pdf_path}' is password-protected."
-        ) from None # Raise specific error without chaining the generic Exception
+        ) from None  # Raise specific error without chaining the generic Exception
     except Exception as e:
         logger.error(
             f"Failed to save original pages PDF to '{output_path_str}': {e}",
             exc_info=True,
         )
         # Re-raise as RuntimeError for consistent API error handling
-        raise RuntimeError(f"Failed to save original pages PDF: {e}") from e
+        raise RuntimeError(f"Failed to save original pages PDF: {e}") from e

natural_pdf/exporters/paddleocr.py CHANGED Viewed

@@ -3,8 +3,9 @@ import os
 import random
 import shutil
 from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
+import collections
-from tqdm import tqdm
+from tqdm.auto import tqdm
 from natural_pdf.exporters.base import FinetuneExporter
@@ -33,19 +34,20 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
     def __init__(
         self,
         resolution: int = 150,
-        padding: int = 2,
+        padding: int = 0,
         selector: Optional[str] = None,
         corrected_only: bool = False,
         split_ratio: Optional[float] = 0.9,
         include_guide: bool = True,
         random_seed: Optional[int] = 42,
+        min_char_freq: int = 3,
     ):
         """
         Initialize the PaddleOCR Recognition Exporter.
         Args:
             resolution: DPI resolution for rendering text region images (default: 150).
-            padding: Padding (in points) to add around text element bbox before cropping (default: 2).
+            padding: Padding (in points) to add around text element bbox before cropping (default: 0).
             selector: CSS-like selector to filter which TextElements to export.
                       If None and corrected_only is False, all 'text' elements are considered.
             corrected_only: If True, overrides selector and exports only elements likely
@@ -57,6 +59,9 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
                            in the output directory (default: True).
             random_seed: Seed for the random number generator used for train/val split shuffling,
                          ensuring reproducibility (default: 42).
+            min_char_freq: Minimum frequency for a character to be included in the dictionary.
+                           Text elements containing characters below this frequency will be removed.
+                           (default: 1, meaning no filtering based on frequency).
         """
         if corrected_only and selector:
             logger.warning(
@@ -76,10 +81,12 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
         self.split_ratio = split_ratio
         self.include_guide = include_guide
         self.random_seed = random_seed
+        self.min_char_freq = min_char_freq
         logger.info(
             f"Initialized PaddleOCRRecognitionExporter: selector='{self.selector}', resolution={resolution}, "
-            f"padding={padding}, split_ratio={split_ratio}, include_guide={include_guide}"
+            f"padding={padding}, split_ratio={split_ratio}, include_guide={include_guide}, "
+            f"min_char_freq={min_char_freq}"
         )
     def export(
@@ -114,7 +121,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
         # --- 2. Collect Elements and Render Images ---
         labels: List[Tuple[str, str]] = []  # List of (relative_image_path, text_label)
-        char_set: Set[str] = set()
+        char_counts: collections.Counter = collections.Counter()
         elements_processed = 0
         elements_skipped = 0
@@ -200,7 +207,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
                         labels.append(
                             (relative_image_path.replace(os.path.sep, "/"), element_text)
                         )  # Use forward slashes for labels
-                        char_set.update(element_text)
+                        char_counts.update(element_text)
                         elements_processed += 1
                     except Exception as e:
@@ -226,15 +233,48 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
         logger.info(f"Processed {elements_processed} text elements, skipped {elements_skipped}.")
+        # --- 2.5 Filter based on character frequency ---
+        if self.min_char_freq > 1:
+            logger.info(f"Filtering elements based on min_char_freq: {self.min_char_freq}")
+            original_label_count = len(labels)
+            rare_chars = {char for char, count in char_counts.items() if count < self.min_char_freq}
+            if rare_chars:
+                logger.info(f"Identified {len(rare_chars)} rare characters: {rare_chars}")
+                filtered_labels = []
+                for img_path, text in labels:
+                    if any(char in rare_chars for char in text):
+                        elements_skipped += 1 # Count these as skipped due to rare chars
+                        elements_processed -=1 # Decrement from processed as it's now being skipped
+                    else:
+                        filtered_labels.append((img_path, text))
+                labels_removed_count = original_label_count - len(filtered_labels)
+                if labels_removed_count > 0:
+                    logger.info(f"Removed {labels_removed_count} elements containing rare characters.")
+                labels = filtered_labels
+                # Recalculate char_counts based on filtered_labels to update the dictionary
+                char_counts.clear()
+                for _, text in labels:
+                    char_counts.update(text)
+                if not labels:
+                    logger.error(
+                        "All elements were removed after character frequency filtering. Aborting."
+                    )
+                    return
+            else:
+                logger.info("No rare characters found below the frequency threshold.")
         # --- 3. Generate Dictionary File (`dict.txt`) ---
         dict_path = os.path.join(output_dir, "dict.txt")
         try:
             # Log the character set before sorting/writing
-            logger.debug(f"Exporter final char_set before sorting: {repr(char_set)}")
-            # PaddleOCR typically doesn't require special tokens like <UNK> or <BLK> in the dict
-            # for recognition models, but this might depend on the specific base model.
-            # Start with just the characters found.
-            sorted_chars = sorted(list(char_set), reverse=True)
+            final_chars_for_dict = set(char_counts.keys()) # Use keys from potentially filtered char_counts
+            logger.debug(f"Exporter final char_set for dict: {repr(final_chars_for_dict)}")
+            sorted_chars = sorted(list(final_chars_for_dict)) # No specific sorting order needed, just make it consistent
             with open(dict_path, "w", encoding="utf-8") as f_dict:
                 for char in sorted_chars:
                     # Ensure we don't write empty strings or just newlines as dictionary entries

natural_pdf/exporters/searchable_pdf.py CHANGED Viewed

@@ -318,16 +318,6 @@ def create_searchable_pdf(
         dpi: The resolution (dots per inch) for rendering page images and hOCR.
     """
-    # --- Ensure dependencies are loaded (they should be if installed) ---
-    if Image is None or pikepdf is None or HocrTransform is None:
-        # This should ideally not happen if dependencies are in main install,
-        # but serves as a safeguard during development or if install is broken.
-        raise ImportError(
-            "Required dependencies (Pillow, pikepdf) are missing. "
-            "Please ensure natural-pdf is installed correctly with all dependencies."
-        )
-    # --- End Safeguard Check ---
     # duck type to see if source has .pages, to populate pages =
     if hasattr(source, "pages"):
         pages = source.pages

natural_pdf/flows/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+from .flow import Flow
+from .element import FlowElement
+from .region import FlowRegion
+from .collections import FlowElementCollection, FlowRegionCollection
+__all__ = [
+    "Flow",
+    "FlowElement",
+    "FlowRegion",
+    "FlowElementCollection",
+    "FlowRegionCollection",
+]

natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl