PyPI - natural-pdf - Versions diffs - 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl - Mend

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132)

docs/api/index.md +386 -0
docs/assets/favicon.png +3 -0
docs/assets/favicon.svg +3 -0
docs/assets/javascripts/custom.js +17 -0
docs/assets/logo.svg +3 -0
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +17 -0
docs/assets/social-preview.svg +17 -0
docs/assets/stylesheets/custom.css +65 -0
docs/document-qa/index.ipynb +435 -0
docs/document-qa/index.md +79 -0
docs/element-selection/index.ipynb +915 -0
docs/element-selection/index.md +229 -0
docs/index.md +170 -0
docs/installation/index.md +69 -0
docs/interactive-widget/index.ipynb +962 -0
docs/interactive-widget/index.md +12 -0
docs/layout-analysis/index.ipynb +818 -0
docs/layout-analysis/index.md +185 -0
docs/ocr/index.md +222 -0
docs/pdf-navigation/index.ipynb +314 -0
docs/pdf-navigation/index.md +97 -0
docs/regions/index.ipynb +816 -0
docs/regions/index.md +294 -0
docs/tables/index.ipynb +658 -0
docs/tables/index.md +144 -0
docs/text-analysis/index.ipynb +370 -0
docs/text-analysis/index.md +105 -0
docs/text-extraction/index.ipynb +1478 -0
docs/text-extraction/index.md +292 -0
docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
docs/tutorials/01-loading-and-extraction.md +95 -0
docs/tutorials/02-finding-elements.ipynb +340 -0
docs/tutorials/02-finding-elements.md +149 -0
docs/tutorials/03-extracting-blocks.ipynb +147 -0
docs/tutorials/03-extracting-blocks.md +48 -0
docs/tutorials/04-table-extraction.ipynb +114 -0
docs/tutorials/04-table-extraction.md +50 -0
docs/tutorials/05-excluding-content.ipynb +270 -0
docs/tutorials/05-excluding-content.md +109 -0
docs/tutorials/06-document-qa.ipynb +332 -0
docs/tutorials/06-document-qa.md +91 -0
docs/tutorials/07-layout-analysis.ipynb +260 -0
docs/tutorials/07-layout-analysis.md +66 -0
docs/tutorials/07-working-with-regions.ipynb +409 -0
docs/tutorials/07-working-with-regions.md +151 -0
docs/tutorials/08-spatial-navigation.ipynb +508 -0
docs/tutorials/08-spatial-navigation.md +190 -0
docs/tutorials/09-section-extraction.ipynb +2434 -0
docs/tutorials/09-section-extraction.md +256 -0
docs/tutorials/10-form-field-extraction.ipynb +484 -0
docs/tutorials/10-form-field-extraction.md +201 -0
docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
docs/tutorials/11-enhanced-table-processing.md +9 -0
docs/tutorials/12-ocr-integration.ipynb +586 -0
docs/tutorials/12-ocr-integration.md +188 -0
docs/tutorials/13-semantic-search.ipynb +1888 -0
docs/tutorials/13-semantic-search.md +77 -0
docs/visual-debugging/index.ipynb +2970 -0
docs/visual-debugging/index.md +157 -0
docs/visual-debugging/region.png +0 -0
natural_pdf/__init__.py +39 -20
natural_pdf/analyzers/__init__.py +2 -1
natural_pdf/analyzers/layout/base.py +32 -24
natural_pdf/analyzers/layout/docling.py +131 -72
natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
natural_pdf/analyzers/layout/layout_manager.py +98 -58
natural_pdf/analyzers/layout/layout_options.py +32 -17
natural_pdf/analyzers/layout/paddle.py +152 -95
natural_pdf/analyzers/layout/surya.py +164 -92
natural_pdf/analyzers/layout/tatr.py +149 -84
natural_pdf/analyzers/layout/yolo.py +84 -44
natural_pdf/analyzers/text_options.py +22 -15
natural_pdf/analyzers/text_structure.py +131 -85
natural_pdf/analyzers/utils.py +30 -23
natural_pdf/collections/pdf_collection.py +126 -98
natural_pdf/core/__init__.py +1 -1
natural_pdf/core/element_manager.py +416 -337
natural_pdf/core/highlighting_service.py +268 -196
natural_pdf/core/page.py +910 -516
natural_pdf/core/pdf.py +387 -289
natural_pdf/elements/__init__.py +1 -1
natural_pdf/elements/base.py +302 -214
natural_pdf/elements/collections.py +714 -514
natural_pdf/elements/line.py +39 -36
natural_pdf/elements/rect.py +32 -30
natural_pdf/elements/region.py +854 -883
natural_pdf/elements/text.py +122 -99
natural_pdf/exporters/__init__.py +0 -1
natural_pdf/exporters/searchable_pdf.py +261 -102
natural_pdf/ocr/__init__.py +23 -14
natural_pdf/ocr/engine.py +17 -8
natural_pdf/ocr/engine_easyocr.py +63 -47
natural_pdf/ocr/engine_paddle.py +97 -68
natural_pdf/ocr/engine_surya.py +54 -44
natural_pdf/ocr/ocr_manager.py +88 -62
natural_pdf/ocr/ocr_options.py +16 -10
natural_pdf/qa/__init__.py +1 -1
natural_pdf/qa/document_qa.py +119 -111
natural_pdf/search/__init__.py +37 -31
natural_pdf/search/haystack_search_service.py +312 -189
natural_pdf/search/haystack_utils.py +186 -122
natural_pdf/search/search_options.py +25 -14
natural_pdf/search/search_service_protocol.py +12 -6
natural_pdf/search/searchable_mixin.py +261 -176
natural_pdf/selectors/__init__.py +2 -1
natural_pdf/selectors/parser.py +159 -316
natural_pdf/templates/__init__.py +1 -1
natural_pdf/utils/highlighting.py +8 -2
natural_pdf/utils/reading_order.py +65 -63
natural_pdf/utils/text_extraction.py +195 -0
natural_pdf/utils/visualization.py +70 -61
natural_pdf/widgets/__init__.py +2 -3
natural_pdf/widgets/viewer.py +749 -718
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
natural_pdf-0.1.5.dist-info/RECORD +134 -0
natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
notebooks/Examples.ipynb +1293 -0
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +543 -0
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
tests/test_loading.py +50 -0
tests/test_optional_deps.py +298 -0
natural_pdf-0.1.3.dist-info/RECORD +0 -61
natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0

natural_pdf/templates/__init__.py CHANGED Viewed

	@@ -1 +1 @@
1	- # Templates package
1	+ # Templates package

natural_pdf/utils/highlighting.py CHANGED Viewed

@@ -4,9 +4,15 @@ Highlighting utilities for natural-pdf.
 This module primarily re-exports core highlighting utilities from the visualization module.
 The main highlighting logic is now centralized in `natural_pdf.core.highlighting_service.HighlightingService`.
 """
 # Re-export necessary functions from visualization
-from .visualization import get_next_highlight_color, create_legend, merge_images_with_legend, reset_highlight_colors
+from .visualization import (
+    create_legend,
+    get_next_highlight_color,
+    merge_images_with_legend,
+    reset_highlight_colors,
+)
 # --- The Highlight class and HighlightManager class previously defined here have been removed ---
 # --- The functionality is now handled by natural_pdf.core.highlighting_service.HighlightingService ---
-# --- and its internal HighlightRenderer class. ---
+# --- and its internal HighlightRenderer class. ---

natural_pdf/utils/reading_order.py CHANGED Viewed

@@ -1,26 +1,28 @@
 """
 Reading order utilities for natural-pdf.
 """
-from typing import List, Dict, Any, Callable, Optional
+from typing import Any, Callable, Dict, List, Optional
-def establish_reading_order(elements: List[Dict[str, Any]],
-                           algorithm: str = 'basic') -> List[Dict[str, Any]]:
+def establish_reading_order(
+    elements: List[Dict[str, Any]], algorithm: str = "basic"
+) -> List[Dict[str, Any]]:
     """
     Establish reading order for a collection of elements.
     Args:
         elements: List of elements to order
         algorithm: Algorithm to use ('basic', 'column', 'complex')
     Returns:
         List of elements in reading order
     """
-    if algorithm == 'basic':
+    if algorithm == "basic":
         return _basic_reading_order(elements)
-    elif algorithm == 'column':
+    elif algorithm == "column":
         return _column_reading_order(elements)
-    elif algorithm == 'complex':
+    elif algorithm == "complex":
         return _complex_reading_order(elements)
     else:
         # Default to basic
@@ -30,55 +32,52 @@ def establish_reading_order(elements: List[Dict[str, Any]],
 def _basic_reading_order(elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """
     Basic top-to-bottom, left-to-right reading order.
     Args:
         elements: List of elements to order
     Returns:
         List of elements in reading order
     """
     # Simple sort by y0 (top), then by x0 (left)
-    return sorted(elements, key=lambda e: (
-        e.get('top', e.get('y0', 0)),
-        e.get('x0', 0)
-    ))
+    return sorted(elements, key=lambda e: (e.get("top", e.get("y0", 0)), e.get("x0", 0)))
-def _column_reading_order(elements: List[Dict[str, Any]],
-                         column_threshold: float = 0.2,
-                         x_tolerance: float = 10.0) -> List[Dict[str, Any]]:
+def _column_reading_order(
+    elements: List[Dict[str, Any]], column_threshold: float = 0.2, x_tolerance: float = 10.0
+) -> List[Dict[str, Any]]:
     """
     Reading order that accounts for columns.
     This is more complex as it needs to detect columns first,
     then read each column in order.
     Args:
         elements: List of elements to order
         column_threshold: Percentage overlap threshold for column detection (0.0 to 1.0)
         x_tolerance: Horizontal tolerance for determining column edges
     Returns:
         List of elements in reading order
     """
     if not elements:
         return []
     # 1. Group elements by line
     lines = group_elements_by_line(elements)
     # 2. For each line, find the x-coordinate ranges (potential column boundaries)
     line_x_ranges = []
     for line in lines:
         for el in line:
-            x0 = el.get('x0', 0)
-            x1 = el.get('x1', 0)
+            x0 = el.get("x0", 0)
+            x1 = el.get("x1", 0)
             line_x_ranges.append((x0, x1))
     # If we don't have enough ranges to detect columns, just use basic ordering
     if len(line_x_ranges) < 3:
         return _basic_reading_order(elements)
     # 3. Detect columns by clustering x-coordinate ranges
     def overlaps(range1, range2, threshold=column_threshold):
         """Determine if two ranges overlap by more than threshold percentage."""
@@ -86,25 +85,25 @@ def _column_reading_order(elements: List[Dict[str, Any]],
         overlap_start = max(range1[0], range2[0])
         overlap_end = min(range1[1], range2[1])
         overlap = max(0, overlap_end - overlap_start)
         # Calculate lengths
         len1 = range1[1] - range1[0]
         len2 = range2[1] - range2[0]
         # Calculate overlap as percentage of the shorter range
         shorter_len = min(len1, len2)
         if shorter_len == 0:
             return False
         return overlap / shorter_len >= threshold
     # Cluster x-ranges into columns
     columns = []
     for x_range in line_x_ranges:
         # Skip zero-width ranges
         if x_range[1] - x_range[0] <= 0:
             continue
         # Try to find an existing column to add to
         added = False
         for col in columns:
@@ -112,68 +111,70 @@ def _column_reading_order(elements: List[Dict[str, Any]],
                 col.append(x_range)
                 added = True
                 break
         # If not added to an existing column, create a new one
         if not added:
             columns.append([x_range])
     # 4. Get column boundaries by averaging x-ranges in each column
     column_bounds = []
     for col in columns:
         left = sum(r[0] for r in col) / len(col)
         right = sum(r[1] for r in col) / len(col)
         column_bounds.append((left, right))
     # Sort columns by x-coordinate (left to right)
     column_bounds.sort(key=lambda b: b[0])
     # 5. Assign each element to a column
     element_columns = {}
     for el in elements:
         # Get element x-coordinates
-        el_x0 = el.get('x0', 0)
-        el_x1 = el.get('x1', 0)
+        el_x0 = el.get("x0", 0)
+        el_x1 = el.get("x1", 0)
         el_center = (el_x0 + el_x1) / 2
         # Find the column this element belongs to
         for i, (left, right) in enumerate(column_bounds):
             # Extend bounds by tolerance
             extended_left = left - x_tolerance
             extended_right = right + x_tolerance
             # Check if center point is within extended column bounds
             if extended_left <= el_center <= extended_right:
                 element_columns[el] = i
                 break
         else:
             # If no column found, assign to nearest column
-            distances = [(i, min(abs(el_center - left), abs(el_center - right)))
-                         for i, (left, right) in enumerate(column_bounds)]
+            distances = [
+                (i, min(abs(el_center - left), abs(el_center - right)))
+                for i, (left, right) in enumerate(column_bounds)
+            ]
             nearest_col = min(distances, key=lambda d: d[1])[0]
             element_columns[el] = nearest_col
     # 6. Sort elements by column, then by vertical position
     sorted_elements = []
     for col_idx, _ in enumerate(column_bounds):
         # Get elements in this column
         col_elements = [el for el in elements if element_columns.get(el) == col_idx]
         # Sort by top coordinate
-        col_elements.sort(key=lambda e: e.get('top', e.get('y0', 0)))
+        col_elements.sort(key=lambda e: e.get("top", e.get("y0", 0)))
         # Add to final list
         sorted_elements.extend(col_elements)
     return sorted_elements
 def _complex_reading_order(elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """
     Complex reading order that accounts for various document structures.
     This considers columns, text flow around images, tables, etc.
     Args:
         elements: List of elements to order
     Returns:
         List of elements in reading order
     """
@@ -182,31 +183,32 @@ def _complex_reading_order(elements: List[Dict[str, Any]]) -> List[Dict[str, Any
     return _column_reading_order(elements)
-def group_elements_by_line(elements: List[Dict[str, Any]],
-                          tolerance: float = 3.0) -> List[List[Dict[str, Any]]]:
+def group_elements_by_line(
+    elements: List[Dict[str, Any]], tolerance: float = 3.0
+) -> List[List[Dict[str, Any]]]:
     """
     Group elements into lines based on vertical position.
     Args:
         elements: List of elements to group
         tolerance: Maximum vertical distance for elements to be considered on the same line
     Returns:
         List of lists, where each sublist contains elements on the same line
     """
     if not elements:
         return []
     # Sort by top coordinate
-    sorted_elements = sorted(elements, key=lambda e: e.get('top', e.get('y0', 0)))
+    sorted_elements = sorted(elements, key=lambda e: e.get("top", e.get("y0", 0)))
     lines = []
     current_line = [sorted_elements[0]]
-    current_top = sorted_elements[0].get('top', sorted_elements[0].get('y0', 0))
+    current_top = sorted_elements[0].get("top", sorted_elements[0].get("y0", 0))
     for element in sorted_elements[1:]:
-        element_top = element.get('top', element.get('y0', 0))
+        element_top = element.get("top", element.get("y0", 0))
         # If element is close enough to current line's top, add to current line
         if abs(element_top - current_top) <= tolerance:
             current_line.append(element)
@@ -215,13 +217,13 @@ def group_elements_by_line(elements: List[Dict[str, Any]],
             lines.append(current_line)
             current_line = [element]
             current_top = element_top
     # Add the last line
     if current_line:
         lines.append(current_line)
     # Sort elements within each line by x0
     for line in lines:
-        line.sort(key=lambda e: e.get('x0', 0))
-    return lines
+        line.sort(key=lambda e: e.get("x0", 0))
+    return lines

natural_pdf/utils/text_extraction.py ADDED Viewed

@@ -0,0 +1,195 @@
+# natural_pdf/utils/text_extraction.py
+import logging
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
+from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
+if TYPE_CHECKING:
+    from natural_pdf.elements.region import Region  # Use type hint
+logger = logging.getLogger(__name__)
+def filter_chars_spatially(
+    char_dicts: List[Dict[str, Any]],
+    exclusion_regions: List["Region"],
+    target_region: Optional["Region"] = None,
+    debug: bool = False,
+) -> List[Dict[str, Any]]:
+    """
+    Filters a list of character dictionaries spatially based on exclusions
+    and an optional target region.
+    Args:
+        char_dicts: List of character dictionaries to filter.
+        exclusion_regions: List of Region objects to exclude characters from.
+        target_region: Optional Region object. If provided, only characters within
+                       this region (respecting polygons) are kept.
+        debug: Enable debug logging.
+    Returns:
+        Filtered list of character dictionaries.
+    """
+    if not char_dicts:
+        return []
+    initial_count = len(char_dicts)
+    filtered_chars = char_dicts
+    # 1. Filter by Target Region (if provided)
+    if target_region:
+        target_bbox = target_region.bbox
+        target_is_polygon = target_region.has_polygon  # Check once
+        region_filtered_chars = []
+        for char_dict in filtered_chars:
+            # Ensure basic geometry keys exist before processing
+            if not all(k in char_dict for k in ["x0", "top", "x1", "bottom"]):
+                if debug:
+                    logger.warning(
+                        f"Skipping char due to missing geometry: {char_dict.get('text', '?')}"
+                    )
+                continue
+            char_bbox = (char_dict["x0"], char_dict["top"], char_dict["x1"], char_dict["bottom"])
+            # BBox pre-filter first
+            if get_bbox_overlap(char_bbox, target_bbox) is None:
+                continue
+            # Precise check if needed
+            char_center_x = (char_dict["x0"] + char_dict["x1"]) / 2
+            char_center_y = (char_dict["top"] + char_dict["bottom"]) / 2
+            if target_is_polygon:
+                if target_region.is_point_inside(char_center_x, char_center_y):
+                    region_filtered_chars.append(char_dict)
+                # else: # Optionally log discarded by polygon
+                #     if debug: logger.debug(...)
+            else:  # Rectangular region, bbox overlap was sufficient
+                region_filtered_chars.append(char_dict)
+        filtered_chars = region_filtered_chars
+        if debug:
+            logger.debug(
+                f"filter_chars_spatially: {len(filtered_chars)}/{initial_count} chars remaining after target region filter."
+            )
+        if not filtered_chars:
+            return []
+    # 2. Filter by Exclusions (if any)
+    if exclusion_regions:
+        final_chars = []
+        # Only calculate union_bbox if there are exclusions AND chars remaining
+        union_bbox = merge_bboxes(excl.bbox for excl in exclusion_regions)
+        for char_dict in filtered_chars:  # Process only chars within target
+            # Ensure basic geometry keys exist before processing
+            if not all(k in char_dict for k in ["x0", "top", "x1", "bottom"]):
+                # Already warned in target region filter if applicable
+                continue
+            char_bbox = (char_dict["x0"], char_dict["top"], char_dict["x1"], char_dict["bottom"])
+            # BBox pre-filter vs exclusion union
+            if get_bbox_overlap(char_bbox, union_bbox) is None:
+                final_chars.append(char_dict)  # Cannot be excluded
+                continue
+            # Precise check against individual overlapping exclusions
+            is_excluded = False
+            char_center_x = (char_dict["x0"] + char_dict["x1"]) / 2
+            char_center_y = (char_dict["top"] + char_dict["bottom"]) / 2
+            for exclusion in exclusion_regions:
+                # Optional: Add bbox overlap check here too before point_inside
+                if get_bbox_overlap(char_bbox, exclusion.bbox) is not None:
+                    if exclusion.is_point_inside(char_center_x, char_center_y):
+                        is_excluded = True
+                        if debug:
+                            char_text = char_dict.get("text", "?")
+                            log_msg = f"  - Excluding char '{char_text}' at {char_bbox} due to overlap with exclusion {exclusion.bbox}"
+                            logger.debug(log_msg)
+                        break
+            if not is_excluded:
+                final_chars.append(char_dict)
+        filtered_chars = final_chars
+        if debug:
+            logger.debug(
+                f"filter_chars_spatially: {len(filtered_chars)}/{initial_count} chars remaining after exclusion filter."
+            )
+        if not filtered_chars:
+            return []
+    return filtered_chars
+def generate_text_layout(
+    char_dicts: List[Dict[str, Any]],
+    layout_context_bbox: Tuple[float, float, float, float],
+    user_kwargs: Dict[str, Any],
+) -> str:
+    """
+    Takes a list of filtered character dictionaries and generates
+    text output using pdfplumber's layout engine.
+    Args:
+        char_dicts: The final list of character dictionaries to include.
+        layout_context_bbox: The bounding box (x0, top, x1, bottom) to use for
+                             calculating default layout width/height/shifts.
+        user_kwargs: Dictionary of user-provided keyword arguments.
+    Returns:
+        The formatted text string.
+    """
+    if not char_dicts:
+        logger.debug("generate_text_layout: No characters provided.")
+        return ""
+    # Prepare layout kwargs, prioritizing user input
+    layout_kwargs = {}
+    allowed_keys = set(WORD_EXTRACTOR_KWARGS) | set(TEXTMAP_KWARGS)
+    for key, value in user_kwargs.items():
+        if key in allowed_keys:
+            layout_kwargs[key] = value
+    # Default to layout=True unless explicitly False
+    use_layout = layout_kwargs.get("layout", True)  # Default to layout if called
+    layout_kwargs["layout"] = use_layout
+    if use_layout:
+        ctx_x0, ctx_top, ctx_x1, ctx_bottom = layout_context_bbox
+        ctx_width = ctx_x1 - ctx_x0
+        ctx_height = ctx_bottom - ctx_top
+        # Set layout defaults based on context_bbox if not overridden by user
+        if "layout_bbox" not in layout_kwargs:
+            layout_kwargs["layout_bbox"] = layout_context_bbox
+        # Only set default layout_width if neither width specifier is present
+        if "layout_width_chars" not in layout_kwargs and "layout_width" not in layout_kwargs:
+            layout_kwargs["layout_width"] = ctx_width
+        if "layout_height" not in layout_kwargs:
+            layout_kwargs["layout_height"] = ctx_height
+        # Adjust shift based on context's top-left corner
+        if "x_shift" not in layout_kwargs:
+            layout_kwargs["x_shift"] = ctx_x0
+        if "y_shift" not in layout_kwargs:
+            layout_kwargs["y_shift"] = ctx_top
+        logger.debug(
+            f"generate_text_layout: Calling chars_to_textmap with {len(char_dicts)} chars and kwargs: {layout_kwargs}"
+        )
+        try:
+            # Sort final list by reading order before passing to textmap
+            # TODO: Make sorting key dynamic based on layout_kwargs directions?
+            char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
+            textmap = chars_to_textmap(char_dicts, **layout_kwargs)
+            result = textmap.as_string
+        except Exception as e:
+            logger.error(
+                f"generate_text_layout: Error calling chars_to_textmap: {e}", exc_info=True
+            )
+            logger.warning(
+                "generate_text_layout: Falling back to simple character join due to layout error."
+            )
+            # Ensure chars are sorted before fallback join
+            fallback_chars = sorted(char_dicts, key=lambda c: (c.get("top", 0), c.get("x0", 0)))
+            result = "".join(c.get("text", "") for c in fallback_chars)
+    else:
+        # Simple join if layout=False
+        logger.debug("generate_text_layout: Using simple join (layout=False).")
+        # Sort by document order for simple join as well
+        char_dicts.sort(key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0)))
+        result = "".join(c.get("text", "") for c in char_dicts)
+    return result

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl