PyPI - natural-pdf - Versions diffs - 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl - Mend

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132) hide show

docs/api/index.md +386 -0
docs/assets/favicon.png +3 -0
docs/assets/favicon.svg +3 -0
docs/assets/javascripts/custom.js +17 -0
docs/assets/logo.svg +3 -0
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +17 -0
docs/assets/social-preview.svg +17 -0
docs/assets/stylesheets/custom.css +65 -0
docs/document-qa/index.ipynb +435 -0
docs/document-qa/index.md +79 -0
docs/element-selection/index.ipynb +915 -0
docs/element-selection/index.md +229 -0
docs/index.md +170 -0
docs/installation/index.md +69 -0
docs/interactive-widget/index.ipynb +962 -0
docs/interactive-widget/index.md +12 -0
docs/layout-analysis/index.ipynb +818 -0
docs/layout-analysis/index.md +185 -0
docs/ocr/index.md +222 -0
docs/pdf-navigation/index.ipynb +314 -0
docs/pdf-navigation/index.md +97 -0
docs/regions/index.ipynb +816 -0
docs/regions/index.md +294 -0
docs/tables/index.ipynb +658 -0
docs/tables/index.md +144 -0
docs/text-analysis/index.ipynb +370 -0
docs/text-analysis/index.md +105 -0
docs/text-extraction/index.ipynb +1478 -0
docs/text-extraction/index.md +292 -0
docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
docs/tutorials/01-loading-and-extraction.md +95 -0
docs/tutorials/02-finding-elements.ipynb +340 -0
docs/tutorials/02-finding-elements.md +149 -0
docs/tutorials/03-extracting-blocks.ipynb +147 -0
docs/tutorials/03-extracting-blocks.md +48 -0
docs/tutorials/04-table-extraction.ipynb +114 -0
docs/tutorials/04-table-extraction.md +50 -0
docs/tutorials/05-excluding-content.ipynb +270 -0
docs/tutorials/05-excluding-content.md +109 -0
docs/tutorials/06-document-qa.ipynb +332 -0
docs/tutorials/06-document-qa.md +91 -0
docs/tutorials/07-layout-analysis.ipynb +260 -0
docs/tutorials/07-layout-analysis.md +66 -0
docs/tutorials/07-working-with-regions.ipynb +409 -0
docs/tutorials/07-working-with-regions.md +151 -0
docs/tutorials/08-spatial-navigation.ipynb +508 -0
docs/tutorials/08-spatial-navigation.md +190 -0
docs/tutorials/09-section-extraction.ipynb +2434 -0
docs/tutorials/09-section-extraction.md +256 -0
docs/tutorials/10-form-field-extraction.ipynb +484 -0
docs/tutorials/10-form-field-extraction.md +201 -0
docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
docs/tutorials/11-enhanced-table-processing.md +9 -0
docs/tutorials/12-ocr-integration.ipynb +586 -0
docs/tutorials/12-ocr-integration.md +188 -0
docs/tutorials/13-semantic-search.ipynb +1888 -0
docs/tutorials/13-semantic-search.md +77 -0
docs/visual-debugging/index.ipynb +2970 -0
docs/visual-debugging/index.md +157 -0
docs/visual-debugging/region.png +0 -0
natural_pdf/__init__.py +39 -20
natural_pdf/analyzers/__init__.py +2 -1
natural_pdf/analyzers/layout/base.py +32 -24
natural_pdf/analyzers/layout/docling.py +131 -72
natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
natural_pdf/analyzers/layout/layout_manager.py +98 -58
natural_pdf/analyzers/layout/layout_options.py +32 -17
natural_pdf/analyzers/layout/paddle.py +152 -95
natural_pdf/analyzers/layout/surya.py +164 -92
natural_pdf/analyzers/layout/tatr.py +149 -84
natural_pdf/analyzers/layout/yolo.py +84 -44
natural_pdf/analyzers/text_options.py +22 -15
natural_pdf/analyzers/text_structure.py +131 -85
natural_pdf/analyzers/utils.py +30 -23
natural_pdf/collections/pdf_collection.py +126 -98
natural_pdf/core/__init__.py +1 -1
natural_pdf/core/element_manager.py +416 -337
natural_pdf/core/highlighting_service.py +268 -196
natural_pdf/core/page.py +910 -516
natural_pdf/core/pdf.py +387 -289
natural_pdf/elements/__init__.py +1 -1
natural_pdf/elements/base.py +302 -214
natural_pdf/elements/collections.py +714 -514
natural_pdf/elements/line.py +39 -36
natural_pdf/elements/rect.py +32 -30
natural_pdf/elements/region.py +854 -883
natural_pdf/elements/text.py +122 -99
natural_pdf/exporters/__init__.py +0 -1
natural_pdf/exporters/searchable_pdf.py +261 -102
natural_pdf/ocr/__init__.py +23 -14
natural_pdf/ocr/engine.py +17 -8
natural_pdf/ocr/engine_easyocr.py +63 -47
natural_pdf/ocr/engine_paddle.py +97 -68
natural_pdf/ocr/engine_surya.py +54 -44
natural_pdf/ocr/ocr_manager.py +88 -62
natural_pdf/ocr/ocr_options.py +16 -10
natural_pdf/qa/__init__.py +1 -1
natural_pdf/qa/document_qa.py +119 -111
natural_pdf/search/__init__.py +37 -31
natural_pdf/search/haystack_search_service.py +312 -189
natural_pdf/search/haystack_utils.py +186 -122
natural_pdf/search/search_options.py +25 -14
natural_pdf/search/search_service_protocol.py +12 -6
natural_pdf/search/searchable_mixin.py +261 -176
natural_pdf/selectors/__init__.py +2 -1
natural_pdf/selectors/parser.py +159 -316
natural_pdf/templates/__init__.py +1 -1
natural_pdf/utils/highlighting.py +8 -2
natural_pdf/utils/reading_order.py +65 -63
natural_pdf/utils/text_extraction.py +195 -0
natural_pdf/utils/visualization.py +70 -61
natural_pdf/widgets/__init__.py +2 -3
natural_pdf/widgets/viewer.py +749 -718
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
natural_pdf-0.1.5.dist-info/RECORD +134 -0
natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
notebooks/Examples.ipynb +1293 -0
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +543 -0
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
tests/test_loading.py +50 -0
tests/test_optional_deps.py +298 -0
natural_pdf-0.1.3.dist-info/RECORD +0 -61
natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0

natural_pdf/exporters/searchable_pdf.py CHANGED Viewed

@@ -5,33 +5,37 @@ Module for exporting PDF content to various formats.
 import logging
 import os
 import tempfile
-from typing import TYPE_CHECKING, List, Dict, Any, Tuple
+import xml.etree.ElementTree as ET
+from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
+from xml.etree.ElementTree import Element as ETElement
+from xml.etree.ElementTree import SubElement
 # Lazy imports for optional dependencies
 try:
     from PIL import Image
 except ImportError:
-    Image = None # type: ignore
+    Image = None  # type: ignore
 try:
     import pikepdf
 except ImportError:
-    pikepdf = None # type: ignore
+    pikepdf = None  # type: ignore
 try:
     from ocrmypdf.hocrtransform import HocrTransform
 except ImportError:
-    HocrTransform = None # type: ignore
+    HocrTransform = None  # type: ignore
 if TYPE_CHECKING:
-    from natural_pdf.core.pdf import PDF
     from natural_pdf.core.page import Page
+    from natural_pdf.core.pdf import PDF
+    from natural_pdf.elements.collections import PageCollection
 logger = logging.getLogger(__name__)
 # --- Constants ---
-HOCR_TEMPLATE_HEADER = '''<?xml version="1.0" encoding="UTF-8"?>
+HOCR_TEMPLATE_HEADER = """<?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
@@ -42,27 +46,27 @@ HOCR_TEMPLATE_HEADER = '''<?xml version="1.0" encoding="UTF-8"?>
   <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
  </head>
  <body>
-'''
+"""
-HOCR_TEMPLATE_PAGE = '''  <div class='ocr_page' id='page_{page_num}' title='image "{image_path}"; bbox 0 0 {width} {height}; ppageno {page_num}'>
-'''
+HOCR_TEMPLATE_PAGE = """  <div class='ocr_page' id='page_{page_num}' title='image "{image_path}"; bbox 0 0 {width} {height}; ppageno {page_num}'>
+"""
-HOCR_TEMPLATE_WORD = '''   <span class='ocrx_word' id='word_{page_num}_{word_id}' title='bbox {x0} {y0} {x1} {y1}; x_wconf {confidence}'>{text}</span>
-'''
+HOCR_TEMPLATE_WORD = """   <span class='ocrx_word' id='word_{page_num}_{word_id}' title='bbox {x0} {y0} {x1} {y1}; x_wconf {confidence}'>{text}</span>
+"""
-HOCR_TEMPLATE_LINE_START = '''   <span class='ocr_line' id='line_{page_num}_{line_id}' title='bbox {x0} {y0} {x1} {y1}'>
-'''
-HOCR_TEMPLATE_LINE_END = '''   </span>
-'''
+HOCR_TEMPLATE_LINE_START = """   <span class='ocr_line' id='line_{page_num}_{line_id}' title='bbox {x0} {y0} {x1} {y1}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0'>
+"""
+HOCR_TEMPLATE_LINE_END = """   </span>
+"""
-HOCR_TEMPLATE_FOOTER = '''  </div>
+HOCR_TEMPLATE_FOOTER = """  </div>
  </body>
 </html>
-'''
+"""
 # --- End Constants ---
-def _generate_hocr_for_page(page: 'Page', image_width: int, image_height: int) -> str:
+def _generate_hocr_for_page(page: "Page", image_width: int, image_height: int) -> str:
     """
     Generates an hOCR string for a given Page object based on its OCR elements.
@@ -79,96 +83,240 @@ def _generate_hocr_for_page(page: 'Page', image_width: int, image_height: int) -
     """
     # Attempt to get OCR elements (words) using find_all with selector
     # Use find_all which returns an ElementCollection
-    ocr_elements_collection = page.find_all('text[source=ocr]')
-    ocr_elements = ocr_elements_collection.elements # Get the list of elements
+    ocr_elements_collection = page.find_all("text[source=ocr]")
+    ocr_elements = ocr_elements_collection.elements  # Get the list of elements
     if not ocr_elements:
-        logger.warning(f"Page {page.number} has no OCR elements (text[source=ocr]) to generate hOCR from.")
+        logger.warning(
+            f"Page {page.number} has no OCR elements (text[source=ocr]) to generate hOCR from."
+        )
         # Return minimal valid hOCR for an empty page
         hocr_content = HOCR_TEMPLATE_HEADER
-        hocr_content += HOCR_TEMPLATE_PAGE.format(page_num=page.index, image_path="", width=image_width, height=image_height)
+        hocr_content += HOCR_TEMPLATE_PAGE.format(
+            page_num=page.index, image_path="", width=image_width, height=image_height
+        )
         hocr_content += HOCR_TEMPLATE_FOOTER
         return hocr_content
-    # --- TODO: Implement logic to group words into lines if necessary ---
-    # For now, just output words directly. A more advanced implementation
-    # might group words geometrically into lines first.
-    # Example (simple, assuming elements are somewhat sorted):
-    # lines = []
-    # current_line = []
-    # last_y = -1
-    # for word in ocr_elements:
-    #     if not current_line or abs(word.y0 - last_y) < threshold: # Simple Y-based grouping
-    #         current_line.append(word)
-    #         last_y = word.y0
-    #     else:
-    #         lines.append(current_line)
-    #         current_line = [word]
-    #         last_y = word.y0
-    # if current_line:
-    #     lines.append(current_line)
-    # --- End Line Grouping Placeholder ---
-    hocr_content = HOCR_TEMPLATE_HEADER
-    hocr_content += HOCR_TEMPLATE_PAGE.format(page_num=page.index, image_path="", width=image_width, height=image_height) # image_path is often unused
-    # Scale factors from PDF points (page dims) to image pixels (rendered image dims)
-    # Note: Assumes OCR element coordinates are in PDF points (page.width/height)
+    # --- Start Line Grouping Logic ---
+    logger.debug(f"Page {page.index}: Grouping {len(ocr_elements)} words into lines.")
+    ocr_elements.sort(key=lambda el: (el.bbox[1], el.bbox[0]))
+    lines = []
+    current_line = []
+    if ocr_elements:
+        current_line.append(ocr_elements[0])
+        for i in range(1, len(ocr_elements)):
+            current_word = ocr_elements[i]
+            last_word = current_line[-1]
+            last_word_y0, last_word_y1 = last_word.bbox[1], last_word.bbox[3]
+            current_word_y0, current_word_y1 = current_word.bbox[1], current_word.bbox[3]
+            last_word_center_y = (last_word_y0 + last_word_y1) / 2
+            current_word_center_y = (current_word_y0 + current_word_y1) / 2
+            last_word_height = last_word_y1 - last_word_y0
+            current_word_height = current_word_y1 - current_word_y0
+            avg_height = (last_word_height + current_word_height) / 2
+            if avg_height <= 0:
+                avg_height = 1
+            tolerance_factor = 0.7
+            threshold = avg_height * tolerance_factor
+            delta_y = abs(current_word_center_y - last_word_center_y)
+            # if delta_y < threshold:
+            #     current_line.append(current_word)
+            # else:
+            lines.append(current_line)
+            current_line = [current_word]
+        if current_line:
+            lines.append(current_line)
+    logger.debug(f"Page {page.index}: Grouped into {len(lines)} lines.")
+    # --- End Line Grouping Logic ---
+    # --- Start ElementTree hOCR Generation ---
     scale_x = image_width / page.width if page.width > 0 else 1
     scale_y = image_height / page.height if page.height > 0 else 1
+    # Create root element
+    page_hocr = ETElement(
+        "html", attrib={"xmlns": "http://www.w3.org/1999/xhtml", "xml:lang": "en"}
+    )
+    # Head
+    head = SubElement(page_hocr, "head")
+    SubElement(head, "title").text = ""
+    SubElement(
+        head, "meta", attrib={"http-equiv": "Content-Type", "content": "text/html;charset=utf-8"}
+    )
+    SubElement(head, "meta", attrib={"name": "ocr-system", "content": "natural-pdf"})
+    SubElement(
+        head,
+        "meta",
+        attrib={
+            "name": "ocr-capabilities",
+            "content": "ocr_page ocr_carea ocr_par ocr_line ocrx_word",
+        },
+    )
+    # Body and Page
+    body = SubElement(page_hocr, "body")
+    page_div = SubElement(
+        body,
+        "div",
+        attrib={
+            "class": "ocr_page",
+            "id": f"page_{page.index}",
+            "title": f"image; bbox 0 0 {image_width} {image_height}; ppageno {page.index}",
+        },
+    )
+    # Calculate overall bbox for carea/par (image coords)
+    min_area_x0, min_area_y0 = image_width, image_height
+    max_area_x1, max_area_y1 = 0, 0
+    if lines:
+        for line_words in lines:
+            for word in line_words:
+                (x0, y0, x1, y1) = word.bbox
+                img_x0 = int(x0 * scale_x)
+                img_y0 = int(y0 * scale_y)
+                img_x1 = int(x1 * scale_x)
+                img_y1 = int(y1 * scale_y)
+                min_area_x0 = min(min_area_x0, img_x0)
+                min_area_y0 = min(min_area_y0, img_y0)
+                max_area_x1 = max(max_area_x1, img_x1)
+                max_area_y1 = max(max_area_y1, img_y1)
+        area_img_x0, area_img_y0 = max(0, min_area_x0), max(0, min_area_y0)
+        area_img_x1, area_img_y1 = min(image_width, max_area_x1), min(image_height, max_area_y1)
+        if area_img_x0 >= area_img_x1 or area_img_y0 >= area_img_y1:
+            area_img_x0, area_img_y0, area_img_x1, area_img_y1 = 0, 0, image_width, image_height
+    else:
+        area_img_x0, area_img_y0, area_img_x1, area_img_y1 = 0, 0, image_width, image_height
+    # Add Carea and Par wrappers (assuming one block/paragraph per page for simplicity)
+    block_div = SubElement(
+        page_div,  # Attach to page_div now
+        "div",
+        attrib={
+            "class": "ocr_carea",
+            "id": "block_0_1",  # Simple ID
+            "title": f"bbox {area_img_x0} {area_img_y0} {area_img_x1} {area_img_y1}",
+        },
+    )
+    par_div = SubElement(
+        block_div,
+        "p",
+        attrib={
+            "class": "ocr_par",
+            "id": "par_0_1",  # Simple ID
+            "title": f"bbox {area_img_x0} {area_img_y0} {area_img_x1} {area_img_y1}",
+        },
+    )
+    # Loop through lines and words
     word_id_counter = 0
-    for word in ocr_elements:
-        # Scale coordinates to image dimensions
-        img_x0 = int(word.x0 * scale_x)
-        img_y0 = int(word.y0 * scale_y)
-        img_x1 = int(word.x1 * scale_x)
-        img_y1 = int(word.y1 * scale_y)
-        # Ensure coordinates are within image bounds
-        img_x0 = max(0, img_x0)
-        img_y0 = max(0, img_y0)
-        img_x1 = min(image_width, img_x1)
-        img_y1 = min(image_height, img_y1)
-        # Basic escaping for XML - might need more robust escaping
-        text = word.text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
-        # Confidence (assuming it exists, default to 99 if not)
-        confidence = getattr(word, 'confidence', 0.99) * 100 # hOCR often uses 0-100
-        hocr_content += HOCR_TEMPLATE_WORD.format(
-            page_num=page.index,
-            word_id=word_id_counter,
-            x0=img_x0,
-            y0=img_y0,
-            x1=img_x1,
-            y1=img_y1,
-            confidence=int(confidence),
-            text=text
+    line_id_counter = 0
+    for current_line_words in lines:
+        if not current_line_words:
+            continue
+        # Sort words in line by x0
+        current_line_words.sort(key=lambda el: el.bbox[0])
+        # Calculate line bbox (image coords)
+        min_line_x0, min_line_y0 = image_width, image_height
+        max_line_x1, max_line_y1 = 0, 0
+        for word in current_line_words:
+            (x0, y0, x1, y1) = word.bbox
+            img_x0, img_y0 = int(x0 * scale_x), int(y0 * scale_y)
+            img_x1, img_y1 = int(x1 * scale_x), int(y1 * scale_y)
+            min_line_x0, min_line_y0 = min(min_line_x0, img_x0), min(min_line_y0, img_y0)
+            max_line_x1, max_line_y1 = max(max_line_x1, img_x1), max(max_line_y1, img_y1)
+        line_img_x0, line_img_y0 = max(0, min_line_x0), max(0, min_line_y0)
+        line_img_x1, line_img_y1 = min(image_width, max_line_x1), min(image_height, max_line_y1)
+        if line_img_x0 >= line_img_x1 or line_img_y0 >= line_img_y1:
+            line_img_x0, line_img_y0, line_img_x1, line_img_y1 = 0, 0, 1, 1
+        # Create ocr_line span
+        line_span = SubElement(
+            par_div,  # Attach line to paragraph
+            "span",
+            attrib={
+                "class": "ocr_line",
+                "id": f"line_{page.index}_{line_id_counter}",
+                "title": f"bbox {line_img_x0} {line_img_y0} {line_img_x1} {line_img_y1}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0",
+            },
         )
-        word_id_counter += 1
-        hocr_content += "\n" # Add newline for readability
+        # Add words to line
+        for word in current_line_words:
+            (x0, y0, x1, y1) = word.bbox
+            img_x0, img_y0 = int(x0 * scale_x), int(y0 * scale_y)
+            img_x1, img_y1 = int(x1 * scale_x), int(y1 * scale_y)
+            img_x0, img_y0 = max(0, img_x0), max(0, img_y0)
+            img_x1, img_y1 = min(image_width, img_x1), min(image_height, img_y1)
+            if img_x1 <= img_x0:
+                img_x1 = img_x0 + 1
+            if img_y1 <= img_y0:
+                img_y1 = img_y0 + 1
+            # --- Strip whitespace and check if word is empty --- #
+            text = word.text.strip().replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
+            if not text:
+                continue  # Skip adding this word if it becomes empty after stripping
+            # --- End strip ---
+            confidence = getattr(word, "confidence", 1.00)
+            word_span = SubElement(
+                line_span,  # Attach word to line
+                "span",
+                attrib={
+                    "class": "ocrx_word",
+                    "id": f"word_{page.index}_{word_id_counter}",
+                    "title": f"bbox {img_x0} {img_y0} {img_x1} {img_y1}; x_wconf {confidence}",
+                },
+            )
+            word_span.text = text
+            word_id_counter += 1
+        line_id_counter += 1
+    # Convert ElementTree to string
+    # xml_declaration = '<?xml version="1.0" encoding="UTF-8"?>\n' # No longer needed
+    # doctype_declaration = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+    # "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n''' # No longer needed
+    # ET.indent(page_hocr) # Optional: for pretty printing, requires Python 3.9+
+    # Need bytes for writing, then decode for HocrTransform if it needs str
+    # Let's stick to unicode string output for now, as the file write expects it.
+    hocr_content = ET.tostring(
+        page_hocr, encoding="unicode", method="xml"
+    )  # Revert back to method='xml'
+    # hocr_content = xml_declaration + doctype_declaration + hocr_string_content # Removed string addition
+    # --- End ElementTree hOCR Generation ---
+    # --- Add code to save hOCR output for inspection ---
+    try:
+        hocr_output_path = "natural_pdf_hocr_output.hocr"
+        with open(hocr_output_path, "w", encoding="utf-8") as f_out:
+            f_out.write(hocr_content)
+        logger.info(f"Saved hOCR content for page {page.index} to: {hocr_output_path}")
+    except Exception as e:
+        logger.error(f"Failed to save hOCR output to file: {e}")
+    # --- End save hOCR ---
-    hocr_content += HOCR_TEMPLATE_FOOTER
     return hocr_content
-def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
+def create_searchable_pdf(
+    source: Union["Page", "PageCollection", "PDF"], output_path: str, dpi: int = 300
+):
     """
     Creates a searchable PDF from a natural_pdf.PDF object using OCR results.
     Relies on ocrmypdf for hOCR transformation. Requires optional dependencies.
     Args:
-        pdf_object: The natural_pdf.PDF instance (OCR should have been run).
+        source: The natural_pdf.PDF, PageCollection, or Page object
         output_path: The path to save the resulting searchable PDF.
         dpi: The resolution (dots per inch) for rendering page images and hOCR.
     """
-    # _check_dependencies() # Removed check
     # --- Ensure dependencies are loaded (they should be if installed) ---
     if Image is None or pikepdf is None or HocrTransform is None:
@@ -180,7 +328,13 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
         )
     # --- End Safeguard Check ---
-    logger.info(f"Starting searchable PDF creation for '{pdf_object.source_path}' -> '{output_path}' at {dpi} DPI.")
+    # duck type to see if source has .pages, to populate pages =
+    if hasattr(source, "pages"):
+        pages = source.pages
+    else:
+        pages = [source]
+    logger.info(f"Starting searchable PDF creation '{output_path}' at {dpi} DPI.")
     temp_pdf_pages: List[str] = []
     output_abs_path = os.path.abspath(output_path)
@@ -188,10 +342,12 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
     with tempfile.TemporaryDirectory() as tmpdir:
         logger.debug(f"Using temporary directory: {tmpdir}")
-        for i, page in enumerate(pdf_object.pages):
-            logger.debug(f"Processing page {page.number} (index {i})...")
+        for i, page in enumerate(pages):
+            logger.debug(f"Processing page {i+1} of {len(pages)}...")
             page_base_name = f"page_{i}"
-            img_path = os.path.join(tmpdir, f"{page_base_name}.png") # Use PNG for potentially better quality
+            img_path = os.path.join(
+                tmpdir, f"{page_base_name}.png"
+            )  # Use PNG for potentially better quality
             hocr_path = os.path.join(tmpdir, f"{page_base_name}.hocr")
             pdf_page_path = os.path.join(tmpdir, f"{page_base_name}.pdf")
@@ -200,18 +356,17 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
                 logger.debug(f"  Rendering page {i} to image ({dpi} DPI)...")
                 # Use the Page's to_image method
                 pil_image = page.to_image(resolution=dpi, include_highlights=False)
-                pil_image.save(img_path, format='PNG')
+                pil_image.save(img_path, format="PNG")
                 img_width, img_height = pil_image.size
                 logger.debug(f"  Image saved to {img_path} ({img_width}x{img_height})")
                 # 2. Generate hOCR
                 logger.debug(f"  Generating hOCR...")
                 hocr_content = _generate_hocr_for_page(page, img_width, img_height)
-                with open(hocr_path, 'w', encoding='utf-8') as f:
+                with open(hocr_path, "w", encoding="utf-8") as f:
                     f.write(hocr_content)
                 logger.debug(f"  hOCR saved to {hocr_path}")
                 # 3. Use HocrTransform to create searchable PDF page
                 logger.debug(f"  Running HocrTransform...")
                 hocr_transform = HocrTransform(hocr_filename=hocr_path, dpi=dpi)
@@ -221,11 +376,11 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
                 logger.debug(f"  Temporary PDF page saved to {pdf_page_path}")
             except Exception as e:
-                 logger.error(f"  Failed to process page {page.number}: {e}", exc_info=True)
-                 # Decide whether to skip or raise error
-                 # For now, let's skip and continue
-                 logger.warning(f"  Skipping page {page.number} due to error.")
-                 continue # Skip to the next page
+                logger.error(f"  Failed to process page {page.number}: {e}", exc_info=True)
+                # Decide whether to skip or raise error
+                # For now, let's skip and continue
+                logger.warning(f"  Skipping page {page.number} due to error.")
+                continue  # Skip to the next page
         # 4. Merge temporary PDF pages
         if not temp_pdf_pages:
@@ -237,16 +392,20 @@ def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
             # Use pikepdf for merging
             output_pdf = pikepdf.Pdf.new()
             for temp_pdf_path in temp_pdf_pages:
-                 with pikepdf.Pdf.open(temp_pdf_path) as src_page_pdf:
-                      # Assuming each temp PDF has exactly one page
-                      if len(src_page_pdf.pages) == 1:
-                           output_pdf.pages.append(src_page_pdf.pages[0])
-                      else:
-                           logger.warning(f"Temporary PDF '{temp_pdf_path}' had unexpected number of pages ({len(src_page_pdf.pages)}). Skipping.")
+                with pikepdf.Pdf.open(temp_pdf_path) as src_page_pdf:
+                    # Assuming each temp PDF has exactly one page
+                    if len(src_page_pdf.pages) == 1:
+                        output_pdf.pages.append(src_page_pdf.pages[0])
+                    else:
+                        logger.warning(
+                            f"Temporary PDF '{temp_pdf_path}' had unexpected number of pages ({len(src_page_pdf.pages)}). Skipping."
+                        )
             output_pdf.save(output_abs_path)
             logger.info(f"Successfully saved merged searchable PDF to: {output_abs_path}")
         except Exception as e:
-            logger.error(f"Failed to merge temporary PDFs into '{output_abs_path}': {e}", exc_info=True)
+            logger.error(
+                f"Failed to merge temporary PDFs into '{output_abs_path}': {e}", exc_info=True
+            )
             raise RuntimeError(f"Failed to save final PDF: {e}") from e
-    logger.debug("Temporary directory cleaned up.")
+    logger.debug("Temporary directory cleaned up.")

natural_pdf/ocr/__init__.py CHANGED Viewed

@@ -3,47 +3,56 @@ OCR engines for natural-pdf.
 This module provides different OCR engines that can be used with natural-pdf.
 """
 import logging
 # Set up module logger
 logger = logging.getLogger("natural_pdf.ocr")
-from .ocr_manager import OCRManager
-from .engine import OCREngine
-from .ocr_options import OCROptions
 from .engine import OCREngine
 from .engine_paddle import PaddleOCREngine
 from .engine_surya import SuryaOCREngine
+from .ocr_manager import OCRManager
+from .ocr_options import OCROptions
-__all__ = ['OCRManager', 'OCREngine', 'OCROptions', 'EasyOCREngine', 'PaddleOCREngine', 'SuryaOCREngine']
+__all__ = [
+    "OCRManager",
+    "OCREngine",
+    "OCROptions",
+    "EasyOCREngine",
+    "PaddleOCREngine",
+    "SuryaOCREngine",
+]
 DEFAULT_ENGINE = SuryaOCREngine
 def get_engine(engine_name=None, **kwargs):
     """
     Get OCR engine by name.
     Args:
         engine_name: Name of the engine to use ('easyocr', 'paddleocr', etc.)
                      If None, the default engine is used (PaddleOCR if available, otherwise EasyOCR)
         **kwargs: Additional arguments to pass to the engine constructor
     Returns:
         OCREngine instance
     """
     logger.debug(f"Initializing OCR engine: {engine_name or 'default'}")
-    if engine_name is None or engine_name == 'default':
+    if engine_name is None or engine_name == "default":
         engine = DEFAULT_ENGINE(**kwargs)
         logger.info(f"Using default OCR engine: {engine.__class__.__name__}")
         return engine
-    if engine_name.lower() == 'easyocr':
+    if engine_name.lower() == "easyocr":
         logger.info("Initializing EasyOCR engine")
         return EasyOCREngine(**kwargs)
-    if engine_name.lower() == 'paddleocr':
+    if engine_name.lower() == "paddleocr":
         try:
             from .engine_paddle import PaddleOCREngine
             logger.info("Initializing PaddleOCR engine")
             return PaddleOCREngine(**kwargs)
         except ImportError:
@@ -51,6 +60,6 @@ def get_engine(engine_name=None, **kwargs):
             raise ImportError(
                 "PaddleOCR is not installed. Please install it with: pip install paddlepaddle paddleocr"
             )
     logger.error(f"Unknown OCR engine: {engine_name}")
-    raise ValueError(f"Unknown OCR engine: {engine_name}")
+    raise ValueError(f"Unknown OCR engine: {engine_name}")

natural_pdf/ocr/engine.py CHANGED Viewed

@@ -1,7 +1,8 @@
 # ocr_engine_base.py
 import logging
 from abc import ABC, abstractmethod
-from typing import Dict, List, Any, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 from PIL import Image
 # Assuming ocr_options defines BaseOCROptions
@@ -9,6 +10,7 @@ from .ocr_options import BaseOCROptions
 logger = logging.getLogger(__name__)
 class OCREngine(ABC):
     """Abstract Base Class for OCR engines."""
@@ -16,14 +18,14 @@ class OCREngine(ABC):
         """Initializes the base OCR engine."""
         self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
         self.logger.info(f"Initializing {self.__class__.__name__}")
-        self._reader_cache = {} # Cache for initialized models/readers
+        self._reader_cache = {}  # Cache for initialized models/readers
     @abstractmethod
     def process_image(
         self,
-        images: Union[Image.Image, List[Image.Image]], # Accept single or list
-        options: BaseOCROptions
-    ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]: # Return single or list of lists
+        images: Union[Image.Image, List[Image.Image]],  # Accept single or list
+        options: BaseOCROptions,
+    ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:  # Return single or list of lists
         """
         Processes a single image or a batch of images using the specific engine and options.
@@ -80,10 +82,18 @@ class OCREngine(ABC):
             Tuple[float, float, float, float] or None if conversion fails.
         """
         try:
-            if isinstance(bbox, (list, tuple)) and len(bbox) == 4 and all(isinstance(n, (int, float)) for n in bbox):
+            if (
+                isinstance(bbox, (list, tuple))
+                and len(bbox) == 4
+                and all(isinstance(n, (int, float)) for n in bbox)
+            ):
                 # Already in (x0, y0, x1, y1) format (or similar)
                 return tuple(float(c) for c in bbox[:4])
-            elif isinstance(bbox, (list, tuple)) and len(bbox) > 0 and isinstance(bbox[0], (list, tuple)):
+            elif (
+                isinstance(bbox, (list, tuple))
+                and len(bbox) > 0
+                and isinstance(bbox[0], (list, tuple))
+            ):
                 # Polygon format [[x1,y1],[x2,y2],...]
                 x_coords = [float(point[0]) for point in bbox]
                 y_coords = [float(point[1]) for point in bbox]
@@ -101,4 +111,3 @@ class OCREngine(ABC):
         self.logger.info(f"Cleaning up {self.__class__.__name__} resources.")
         # Clear reader cache to free up memory/GPU resources
         self._reader_cache.clear()

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl