PyPI - natural-pdf - Versions diffs - 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl - Mend

natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

docs/finetuning/index.md +176 -0
docs/tutorials/01-loading-and-extraction.ipynb +34 -1550
natural_pdf/__init__.py +1 -0
natural_pdf/analyzers/layout/gemini.py +63 -47
natural_pdf/collections/pdf_collection.py +5 -2
natural_pdf/core/element_manager.py +6 -4
natural_pdf/core/page.py +36 -27
natural_pdf/core/pdf.py +25 -16
natural_pdf/elements/base.py +1 -3
natural_pdf/elements/collections.py +13 -14
natural_pdf/elements/region.py +7 -6
natural_pdf/exporters/__init__.py +4 -0
natural_pdf/exporters/base.py +61 -0
natural_pdf/exporters/paddleocr.py +345 -0
natural_pdf/ocr/__init__.py +16 -8
natural_pdf/ocr/engine.py +46 -30
natural_pdf/ocr/engine_easyocr.py +81 -40
natural_pdf/ocr/engine_paddle.py +39 -28
natural_pdf/ocr/engine_surya.py +32 -16
natural_pdf/ocr/ocr_factory.py +34 -23
natural_pdf/ocr/ocr_manager.py +15 -11
natural_pdf/ocr/ocr_options.py +5 -0
natural_pdf/ocr/utils.py +46 -31
natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
natural_pdf/utils/debug.py +4 -2
natural_pdf/utils/identifiers.py +9 -5
natural_pdf/utils/packaging.py +172 -105
natural_pdf/utils/text_extraction.py +44 -64
natural_pdf/utils/visualization.py +1 -1
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +5 -3
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +34 -30
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -0

natural_pdf/utils/packaging.py CHANGED Viewed

@@ -28,11 +28,12 @@ from natural_pdf.utils.identifiers import generate_short_path_hash
 logger = logging.getLogger(__name__)
 def create_correction_task_package(
-    source: Union['PDF', 'PDFCollection', List['PDF']],
+    source: Union["PDF", "PDFCollection", List["PDF"]],
     output_zip_path: str,
     overwrite: bool = False,
-    suggest = None,
+    suggest=None,
     resolution: int = 150,
 ) -> None:
     """
@@ -55,27 +56,32 @@ def create_correction_task_package(
         ValueError: If no valid pages with OCR data are found in the source.
     """
     if os.path.exists(output_zip_path) and not overwrite:
-        raise FileExistsError(f"Output file already exists: {output_zip_path}. Set overwrite=True to replace it.")
+        raise FileExistsError(
+            f"Output file already exists: {output_zip_path}. Set overwrite=True to replace it."
+        )
     # --- Resolve source to a list of PDF objects ---
-    pdfs_to_process: List['PDF'] = []
-    if hasattr(source, '__class__') and source.__class__.__name__ == 'PDF': # Check type without direct import
+    pdfs_to_process: List["PDF"] = []
+    if (
+        hasattr(source, "__class__") and source.__class__.__name__ == "PDF"
+    ):  # Check type without direct import
         pdfs_to_process = [source]
-    elif hasattr(source, '__class__') and source.__class__.__name__ == 'PDFCollection':
-        pdfs_to_process = source.pdfs # Assuming PDFCollection has a .pdfs property
-    elif isinstance(source, list) and all(hasattr(p, '__class__') and p.__class__.__name__ == 'PDF' for p in source):
+    elif hasattr(source, "__class__") and source.__class__.__name__ == "PDFCollection":
+        pdfs_to_process = source.pdfs  # Assuming PDFCollection has a .pdfs property
+    elif isinstance(source, list) and all(
+        hasattr(p, "__class__") and p.__class__.__name__ == "PDF" for p in source
+    ):
         pdfs_to_process = source
     else:
-        raise TypeError(f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF].")
+        raise TypeError(
+            f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF]."
+        )
     if not pdfs_to_process:
         logger.warning("No PDF documents provided in the source.")
         return
-    manifest_data = {
-        "pdfs": [], # Store pdf-level info if needed later?
-        "pages": []
-    }
+    manifest_data = {"pdfs": [], "pages": []}  # Store pdf-level info if needed later?
     total_regions_found = 0
     # Use a temporary directory for staging files before zipping
@@ -84,38 +90,52 @@ def create_correction_task_package(
         os.makedirs(images_dir)
         logger.info(f"Using temporary directory for staging: {temp_dir}")
-        # --- Process each PDF ---
+        # --- Process each PDF ---
         for pdf in pdfs_to_process:
-            if not hasattr(pdf, 'path') or not hasattr(pdf, 'pages'):
+            if not hasattr(pdf, "path") or not hasattr(pdf, "pages"):
                 logger.warning(f"Skipping invalid PDF object: {pdf}")
                 continue
-            pdf_path = pdf.path # Should be the resolved, absolute path
+            pdf_path = pdf.path  # Should be the resolved, absolute path
             pdf_short_id = generate_short_path_hash(pdf_path)
             logger.debug(f"Processing PDF: {pdf_path} (ID: {pdf_short_id})")
             pdf_has_ocr_regions = False
             for page in pdf.pages:
-                if not hasattr(page, 'index') or not hasattr(page, 'number') or \
-                    not hasattr(page, 'width') or not hasattr(page, 'height') or \
-                    not hasattr(page, 'find_all') or not hasattr(page, 'to_image'):
-                    logger.warning(f"Skipping invalid Page object in {pdf_path}: page index {getattr(page, 'index', 'unknown')}")
+                if (
+                    not hasattr(page, "index")
+                    or not hasattr(page, "number")
+                    or not hasattr(page, "width")
+                    or not hasattr(page, "height")
+                    or not hasattr(page, "find_all")
+                    or not hasattr(page, "to_image")
+                ):
+                    logger.warning(
+                        f"Skipping invalid Page object in {pdf_path}: page index {getattr(page, 'index', 'unknown')}"
+                    )
                     continue
                 # 1. Extract OCR elements for this page
                 try:
                     # Important: apply_exclusions=False ensures we get *all* OCR data
                     # regardless of user exclusions set on the PDF/page object.
-                    ocr_elements = page.find_all('text[source=ocr]', apply_exclusions=False).elements
+                    ocr_elements = page.find_all(
+                        "text[source=ocr]", apply_exclusions=False
+                    ).elements
                 except Exception as e:
-                    logger.error(f"Failed to extract OCR elements for {pdf_path} page {page.number}: {e}", exc_info=True)
-                    continue # Skip this page if element extraction fails
+                    logger.error(
+                        f"Failed to extract OCR elements for {pdf_path} page {page.number}: {e}",
+                        exc_info=True,
+                    )
+                    continue  # Skip this page if element extraction fails
                 if not ocr_elements:
-                    logger.debug(f"No OCR elements found for {pdf_path} page {page.number}. Skipping page in manifest.")
-                    continue # Skip page if no OCR elements
+                    logger.debug(
+                        f"No OCR elements found for {pdf_path} page {page.number}. Skipping page in manifest."
+                    )
+                    continue  # Skip page if no OCR elements
-                pdf_has_ocr_regions = True # Mark that this PDF is relevant
+                pdf_has_ocr_regions = True  # Mark that this PDF is relevant
                 logger.debug(f"  Found {len(ocr_elements)} OCR elements on page {page.number}")
                 total_regions_found += len(ocr_elements)
@@ -128,9 +148,12 @@ def create_correction_task_package(
                         raise ValueError("page.to_image returned None")
                     img.save(image_save_path, "PNG")
                 except Exception as e:
-                    logger.error(f"Failed to render/save image for {pdf_path} page {page.number}: {e}", exc_info=True)
+                    logger.error(
+                        f"Failed to render/save image for {pdf_path} page {page.number}: {e}",
+                        exc_info=True,
+                    )
                     # If image fails, we cannot proceed with this page for the task
-                    pdf_has_ocr_regions = False # Reset flag for this page
+                    pdf_has_ocr_regions = False  # Reset flag for this page
                     continue
                 # 3. Prepare region data for manifest
@@ -142,72 +165,85 @@ def create_correction_task_package(
                 for elem in tqdm(ocr_elements):
                     i += 1
                     # Basic check for necessary attributes
-                    if not all(hasattr(elem, attr) for attr in ['x0', 'top', 'x1', 'bottom', 'text']):
-                        logger.warning(f"Skipping invalid OCR element {i} on {pdf_path} page {page.number}")
+                    if not all(
+                        hasattr(elem, attr) for attr in ["x0", "top", "x1", "bottom", "text"]
+                    ):
+                        logger.warning(
+                            f"Skipping invalid OCR element {i} on {pdf_path} page {page.number}"
+                        )
                         continue
-                    region_id = f"r_{page.index}_{i}" # ID unique within page
+                    region_id = f"r_{page.index}_{i}"  # ID unique within page
                     # Scale coordinates to match the 300 DPI image
                     scaled_bbox = [
                         elem.x0 * coord_scale_factor,
                         elem.top * coord_scale_factor,
                         elem.x1 * coord_scale_factor,
-                        elem.bottom * coord_scale_factor
+                        elem.bottom * coord_scale_factor,
                     ]
                     corrected = elem.text
                     if suggest:
-                        corrected = suggest(elem.to_region(), getattr(elem, 'confidence', None))
-                    page_regions_data.append({
-                        "resolution": resolution,
-                        "id": region_id,
-                        "bbox": scaled_bbox,
-                        "ocr_text": elem.text,
-                        "confidence": getattr(elem, 'confidence', None), # Include confidence if available
-                        "corrected_text": corrected,
-                        "modified": False
-                    })
+                        corrected = suggest(elem.to_region(), getattr(elem, "confidence", None))
+                    page_regions_data.append(
+                        {
+                            "resolution": resolution,
+                            "id": region_id,
+                            "bbox": scaled_bbox,
+                            "ocr_text": elem.text,
+                            "confidence": getattr(
+                                elem, "confidence", None
+                            ),  # Include confidence if available
+                            "corrected_text": corrected,
+                            "modified": False,
+                        }
+                    )
                 # 4. Add page data to manifest if it has regions
                 if page_regions_data:
-                    manifest_data["pages"].append({
-                        "pdf_source": pdf_path,
-                        "pdf_short_id": pdf_short_id,
-                        "page_number": page.number,
-                        "page_index": page.index,
-                        "image_path": f"images/{image_filename}", # Relative path within zip
-                        "width": page.width,
-                        "height": page.height,
-                        "regions": page_regions_data
-                    })
+                    manifest_data["pages"].append(
+                        {
+                            "pdf_source": pdf_path,
+                            "pdf_short_id": pdf_short_id,
+                            "page_number": page.number,
+                            "page_index": page.index,
+                            "image_path": f"images/{image_filename}",  # Relative path within zip
+                            "width": page.width,
+                            "height": page.height,
+                            "regions": page_regions_data,
+                        }
+                    )
                 else:
                     # If, after checks, no valid regions remain, ensure flag is correct
                     pdf_has_ocr_regions = False
-        # --- Final Checks and Zipping ---
+        # --- Final Checks and Zipping ---
         if not manifest_data["pages"] or total_regions_found == 0:
-             logger.error("No pages with valid OCR regions and successfully rendered images found in the source PDFs. Cannot create task package.")
-             # Consider raising ValueError here instead of just returning
-             raise ValueError("No valid pages with OCR data found to create a task package.")
+            logger.error(
+                "No pages with valid OCR regions and successfully rendered images found in the source PDFs. Cannot create task package."
+            )
+            # Consider raising ValueError here instead of just returning
+            raise ValueError("No valid pages with OCR data found to create a task package.")
         manifest_path = os.path.join(temp_dir, "manifest.json")
         try:
-            with open(manifest_path, 'w', encoding='utf-8') as f_manifest:
+            with open(manifest_path, "w", encoding="utf-8") as f_manifest:
                 json.dump(manifest_data, f_manifest, indent=2)
         except Exception as e:
             logger.error(f"Failed to write manifest.json: {e}", exc_info=True)
-            raise # Re-raise error, cannot proceed
+            raise  # Re-raise error, cannot proceed
         # --- Copy SPA files into temp dir ---
         try:
             # Find the path to the spa template directory relative to this file
             # Using __file__ assumes this script is installed alongside the templates
             utils_dir = os.path.dirname(os.path.abspath(__file__))
-            templates_dir = os.path.join(os.path.dirname(utils_dir), 'templates') # Go up one level from utils
-            spa_template_dir = os.path.join(templates_dir, 'spa')
+            templates_dir = os.path.join(
+                os.path.dirname(utils_dir), "templates"
+            )  # Go up one level from utils
+            spa_template_dir = os.path.join(templates_dir, "spa")
             if not os.path.isdir(spa_template_dir):
                 raise FileNotFoundError(f"SPA template directory not found at {spa_template_dir}")
@@ -224,32 +260,34 @@ def create_correction_task_package(
         # --- Create the final zip file ---
         try:
             logger.info(f"Creating zip package at: {output_zip_path}")
-            with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+            with zipfile.ZipFile(output_zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
                 # Add manifest.json
                 zipf.write(manifest_path, arcname="manifest.json")
                 # Add images directory
                 for root, _, files in os.walk(images_dir):
                     for file in files:
                         full_path = os.path.join(root, file)
-                        # Create the correct archive name (e.g., images/...)
+                        # Create the correct archive name (e.g., images/...)
                         arcname = os.path.relpath(full_path, temp_dir)
                         zipf.write(full_path, arcname=arcname)
-            logger.info(f"Successfully created correction task package: {output_zip_path} ({total_regions_found} regions total)")
+            logger.info(
+                f"Successfully created correction task package: {output_zip_path} ({total_regions_found} regions total)"
+            )
         except Exception as e:
             logger.error(f"Failed to create zip file {output_zip_path}: {e}", exc_info=True)
             # Attempt to clean up existing zip if creation failed partially
             if os.path.exists(output_zip_path):
-                try: os.remove(output_zip_path)
-                except: pass
-            raise # Re-raise error
+                try:
+                    os.remove(output_zip_path)
+                except:
+                    pass
+            raise  # Re-raise error
-    # Temporary directory is automatically cleaned up by context manager
+    # Temporary directory is automatically cleaned up by context manager
-def import_ocr_from_manifest(
-    pdf: 'PDF',
-    manifest_path: str
-) -> Dict[str, int]:
+def import_ocr_from_manifest(pdf: "PDF", manifest_path: str) -> Dict[str, int]:
     """
     Imports OCR data into a PDF object from a manifest file.
@@ -275,8 +313,8 @@ def import_ocr_from_manifest(
         ValueError: If the manifest is invalid or contains data for a different PDF.
         TypeError: If the input pdf object is not a valid PDF instance.
     """
-    if not (hasattr(pdf, '__class__') and pdf.__class__.__name__ == 'PDF'):
-         raise TypeError(f"Input must be a natural_pdf PDF object, got {type(pdf)}")
+    if not (hasattr(pdf, "__class__") and pdf.__class__.__name__ == "PDF"):
+        raise TypeError(f"Input must be a natural_pdf PDF object, got {type(pdf)}")
     if not os.path.exists(manifest_path):
         raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
@@ -284,7 +322,7 @@ def import_ocr_from_manifest(
     logger.info(f"Importing OCR data into PDF '{pdf.path}' from manifest '{manifest_path}'")
     try:
-        with open(manifest_path, 'r', encoding='utf-8') as f:
+        with open(manifest_path, "r", encoding="utf-8") as f:
             manifest_data = json.load(f)
     except json.JSONDecodeError as e:
         logger.error(f"Failed to parse manifest file: {e}")
@@ -300,18 +338,21 @@ def import_ocr_from_manifest(
     manifest_pages = manifest_data.get("pages", [])
     if not manifest_pages:
         logger.warning("Manifest contains no page data.")
-        return {'imported': 0, 'skipped': 0}
+        return {"imported": 0, "skipped": 0}
     # --- Pre-check PDF source consistency ---
     first_manifest_pdf_path = manifest_pages[0].get("pdf_source")
     if first_manifest_pdf_path != pdf.path:
         # Allow matching based on just the filename if paths differ (e.g., absolute vs relative)
         if os.path.basename(first_manifest_pdf_path) != os.path.basename(pdf.path):
-            logger.error(f"Manifest PDF source ('{first_manifest_pdf_path}') does not match target PDF path ('{pdf.path}'). Aborting.")
+            logger.error(
+                f"Manifest PDF source ('{first_manifest_pdf_path}') does not match target PDF path ('{pdf.path}'). Aborting."
+            )
             raise ValueError("Manifest source PDF does not match the provided PDF object.")
         else:
-            logger.warning(f"Manifest PDF source path ('{first_manifest_pdf_path}') differs from target PDF path ('{pdf.path}'), but filenames match. Proceeding cautiously.")
+            logger.warning(
+                f"Manifest PDF source path ('{first_manifest_pdf_path}') differs from target PDF path ('{pdf.path}'), but filenames match. Proceeding cautiously."
+            )
     pdf_pages_by_index = {page.index: page for page in pdf.pages}
@@ -320,19 +361,27 @@ def import_ocr_from_manifest(
         manifest_pdf_path = page_data.get("pdf_source")
         # Check consistency for every page? (Maybe overkill if pre-checked)
-        if manifest_pdf_path != pdf.path and os.path.basename(manifest_pdf_path) != os.path.basename(pdf.path):
-             logger.warning(f"Skipping page index {page_index} due to PDF source mismatch ('{manifest_pdf_path}' vs '{pdf.path}')")
-             skipped_count += len(page_data.get("regions", [])) # Count all regions as skipped
-             continue
+        if manifest_pdf_path != pdf.path and os.path.basename(
+            manifest_pdf_path
+        ) != os.path.basename(pdf.path):
+            logger.warning(
+                f"Skipping page index {page_index} due to PDF source mismatch ('{manifest_pdf_path}' vs '{pdf.path}')"
+            )
+            skipped_count += len(page_data.get("regions", []))  # Count all regions as skipped
+            continue
         if page_index is None:
-            logger.warning(f"Skipping page entry with missing 'page_index': {page_data.get('page_number')}")
+            logger.warning(
+                f"Skipping page entry with missing 'page_index': {page_data.get('page_number')}"
+            )
             skipped_count += len(page_data.get("regions", []))
             continue
         page = pdf_pages_by_index.get(page_index)
         if page is None:
-            logger.warning(f"Could not find page with index {page_index} in the target PDF. Skipping.")
+            logger.warning(
+                f"Could not find page with index {page_index} in the target PDF. Skipping."
+            )
             skipped_count += len(page_data.get("regions", []))
             continue
@@ -353,11 +402,13 @@ def import_ocr_from_manifest(
             if text_to_import is None:
                 text_to_import = region_data.get("ocr_text")
-            resolution = region_data.get("resolution") # Mandatory from export
-            confidence = region_data.get("confidence") # Optional
+            resolution = region_data.get("resolution")  # Mandatory from export
+            confidence = region_data.get("confidence")  # Optional
             if not all([manifest_bbox, text_to_import is not None, resolution]):
-                logger.warning(f"Skipping incomplete/invalid region data on page {page_index}, region id '{region_id}': Missing bbox, text, or resolution.")
+                logger.warning(
+                    f"Skipping incomplete/invalid region data on page {page_index}, region id '{region_id}': Missing bbox, text, or resolution."
+                )
                 skipped_count += 1
                 continue
@@ -369,11 +420,13 @@ def import_ocr_from_manifest(
                 pdf_x1 = manifest_bbox[2] * scale_factor
                 pdf_bottom = manifest_bbox[3] * scale_factor
             except (ValueError, TypeError, IndexError, ZeroDivisionError):
-                logger.warning(f"Invalid bbox or resolution for region '{region_id}' on page {page_index}. Skipping.")
+                logger.warning(
+                    f"Invalid bbox or resolution for region '{region_id}' on page {page_index}. Skipping."
+                )
                 skipped_count += 1
                 continue
-            # --- Create New Element ---
+            # --- Create New Element ---
             try:
                 new_element = TextElement(
                     text=text_to_import,
@@ -381,31 +434,44 @@ def import_ocr_from_manifest(
                     top=pdf_top,
                     x1=pdf_x1,
                     bottom=pdf_bottom,
-                    page=page, # Reference to the parent Page object
-                    source='manifest-import', # Indicate origin
-                    confidence=confidence, # Pass confidence if available
+                    page=page,  # Reference to the parent Page object
+                    source="manifest-import",  # Indicate origin
+                    confidence=confidence,  # Pass confidence if available
                     # Add metadata from manifest if needed? Maybe original_ocr?
-                    metadata={'original_ocr': region_data.get("ocr_text")} if region_data.get("ocr_text") != text_to_import else {}
+                    metadata=(
+                        {"original_ocr": region_data.get("ocr_text")}
+                        if region_data.get("ocr_text") != text_to_import
+                        else {}
+                    ),
                 )
                 regions_to_add.append(new_element)
                 imported_count += 1
             except Exception as e:
-                 logger.error(f"Error creating TextElement for region '{region_id}' on page {page_index}: {e}", exc_info=True)
-                 skipped_count += 1
+                logger.error(
+                    f"Error creating TextElement for region '{region_id}' on page {page_index}: {e}",
+                    exc_info=True,
+                )
+                skipped_count += 1
         # --- Add Elements to Page ---
         # Add all created elements for this page in one go
         if regions_to_add:
             try:
                 # Accessing _elements directly; use manager if a public add method exists
-                if hasattr(page, '_elements') and hasattr(page._elements, 'elements') and isinstance(page._elements.elements, list):
+                if (
+                    hasattr(page, "_elements")
+                    and hasattr(page._elements, "elements")
+                    and isinstance(page._elements.elements, list)
+                ):
                     page._elements.elements.extend(regions_to_add)
                     # TODO: Should potentially invalidate page element cache if exists
                 else:
-                     logger.error(f"Could not add elements to page {page.index}, page._elements structure unexpected.")
-                     # Decrement count as they weren't actually added
-                     imported_count -= len(regions_to_add)
-                     skipped_count += len(regions_to_add)
+                    logger.error(
+                        f"Could not add elements to page {page.index}, page._elements structure unexpected."
+                    )
+                    # Decrement count as they weren't actually added
+                    imported_count -= len(regions_to_add)
+                    skipped_count += len(regions_to_add)
             except Exception as e:
                 logger.error(f"Error adding elements to page {page.index}: {e}", exc_info=True)
@@ -413,6 +479,7 @@ def import_ocr_from_manifest(
                 imported_count -= len(regions_to_add)
                 skipped_count += len(regions_to_add)
-    logger.info(f"Import process finished. Imported: {imported_count}, Skipped: {skipped_count}. Processed {processed_pages} pages from manifest.")
-    return {'imported': imported_count, 'skipped': skipped_count}
+    logger.info(
+        f"Import process finished. Imported: {imported_count}, Skipped: {skipped_count}. Processed {processed_pages} pages from manifest."
+    )
+    return {"imported": imported_count, "skipped": skipped_count}

natural_pdf/utils/text_extraction.py CHANGED Viewed

@@ -116,80 +116,60 @@ def filter_chars_spatially(
 def generate_text_layout(
     char_dicts: List[Dict[str, Any]],
-    layout_context_bbox: Tuple[float, float, float, float],
-    user_kwargs: Dict[str, Any],
+    layout_context_bbox: Optional[Tuple[float, float, float, float]] = None,
+    user_kwargs: Optional[Dict[str, Any]] = None,
 ) -> str:
     """
-    Takes a list of filtered character dictionaries and generates
-    text output using pdfplumber's layout engine.
+    Generates a string representation of text from character dictionaries,
+    attempting to reconstruct layout using pdfplumber's utilities.
     Args:
-        char_dicts: The final list of character dictionaries to include.
-        layout_context_bbox: The bounding box (x0, top, x1, bottom) to use for
-                             calculating default layout width/height/shifts.
-        user_kwargs: Dictionary of user-provided keyword arguments.
+        char_dicts: List of character dictionary objects.
+        layout_context_bbox: Optional bounding box for layout context.
+        user_kwargs: User-provided kwargs, potentially overriding defaults.
     Returns:
-        The formatted text string.
+        String representation of the text.
     """
-    if not char_dicts:
-        logger.debug("generate_text_layout: No characters provided.")
+    # --- Filter out invalid char dicts early ---
+    initial_count = len(char_dicts)
+    valid_char_dicts = [c for c in char_dicts if isinstance(c.get("text"), str)]
+    filtered_count = initial_count - len(valid_char_dicts)
+    if filtered_count > 0:
+        logger.debug(
+            f"generate_text_layout: Filtered out {filtered_count} char dicts with non-string/None text."
+        )
+    if not valid_char_dicts:  # Return empty if no valid chars remain
+        logger.debug("generate_text_layout: No valid character dicts found after filtering.")
         return ""
-    # Prepare layout kwargs, prioritizing user input
-    layout_kwargs = {}
-    allowed_keys = set(WORD_EXTRACTOR_KWARGS) | set(TEXTMAP_KWARGS)
-    for key, value in user_kwargs.items():
-        if key in allowed_keys:
-            layout_kwargs[key] = value
-    # Default to layout=True unless explicitly False
-    use_layout = layout_kwargs.get("layout", True)  # Default to layout if called
-    layout_kwargs["layout"] = use_layout
-    if use_layout:
-        ctx_x0, ctx_top, ctx_x1, ctx_bottom = layout_context_bbox
-        ctx_width = ctx_x1 - ctx_x0
-        ctx_height = ctx_bottom - ctx_top
-        # Set layout defaults based on context_bbox if not overridden by user
-        if "layout_bbox" not in layout_kwargs:
-            layout_kwargs["layout_bbox"] = layout_context_bbox
-        # Only set default layout_width if neither width specifier is present
-        if "layout_width_chars" not in layout_kwargs and "layout_width" not in layout_kwargs:
-            layout_kwargs["layout_width"] = ctx_width
-        if "layout_height" not in layout_kwargs:
-            layout_kwargs["layout_height"] = ctx_height
-        # Adjust shift based on context's top-left corner
-        if "x_shift" not in layout_kwargs:
-            layout_kwargs["x_shift"] = ctx_x0
-        if "y_shift" not in layout_kwargs:
-            layout_kwargs["y_shift"] = ctx_top
+    # Prepare layout arguments
+    layout_kwargs = _get_layout_kwargs(layout_context_bbox, user_kwargs)
+    use_layout = layout_kwargs.pop("layout", True)  # Extract layout flag, default True
-        logger.debug(
-            f"generate_text_layout: Calling chars_to_textmap with {len(char_dicts)} chars and kwargs: {layout_kwargs}"
-        )
-        try:
-            # Sort final list by reading order before passing to textmap
-            # TODO: Make sorting key dynamic based on layout_kwargs directions?
-            char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
-            textmap = chars_to_textmap(char_dicts, **layout_kwargs)
-            result = textmap.as_string
-        except Exception as e:
-            logger.error(
-                f"generate_text_layout: Error calling chars_to_textmap: {e}", exc_info=True
-            )
-            logger.warning(
-                "generate_text_layout: Falling back to simple character join due to layout error."
-            )
-            # Ensure chars are sorted before fallback join
-            fallback_chars = sorted(char_dicts, key=lambda c: (c.get("top", 0), c.get("x0", 0)))
-            result = "".join(c.get("text", "") for c in fallback_chars)
-    else:
+    if not use_layout:
         # Simple join if layout=False
-        logger.debug("generate_text_layout: Using simple join (layout=False).")
-        # Sort by document order for simple join as well
-        char_dicts.sort(key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0)))
-        result = "".join(c.get("text", "") for c in char_dicts)
+        logger.debug("generate_text_layout: Using simple join (layout=False requested).")
+        # Sort before joining if layout is off
+        valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
+        result = "".join(c.get("text", "") for c in valid_char_dicts)  # Use valid chars
+        return result
+    try:
+        # Sort chars primarily by top, then x0 before layout analysis
+        # This helps pdfplumber group lines correctly
+        valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
+        textmap = chars_to_textmap(valid_char_dicts, **layout_kwargs)
+        result = textmap.as_string
+    except Exception as e:
+        # Fallback to simple join on error
+        logger.error(f"generate_text_layout: Error calling chars_to_textmap: {e}", exc_info=False)
+        logger.warning(
+            "generate_text_layout: Falling back to simple character join due to layout error."
+        )
+        # Fallback already has sorted characters if layout was attempted
+        # Need to use the valid_char_dicts here too
+        result = "".join(c.get("text", "") for c in valid_char_dicts)
     return result

natural_pdf/utils/visualization.py CHANGED Viewed

@@ -192,7 +192,7 @@ def merge_images_with_legend(
     if not legend:
         return image  # Return original image if legend is None or empty
-    bg_color = (255, 255, 255, 255) # Always use white for the merged background
+    bg_color = (255, 255, 255, 255)  # Always use white for the merged background
     if position == "right":
         # Create a new image with extra width for the legend

natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl