PyPI - natural-pdf - Versions diffs - 0.1.23__py3-none-any.whl → 0.1.26.dev0__py3-none-any.whl - Mend

natural-pdf 0.1.23__py3-none-any.whl → 0.1.26.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

natural_pdf/analyzers/shape_detection_mixin.py +40 -0
natural_pdf/core/highlighting_service.py +4 -4
natural_pdf/core/page.py +82 -9
natural_pdf/describe/base.py +11 -1
natural_pdf/describe/summary.py +28 -2
natural_pdf/elements/base.py +2 -2
natural_pdf/elements/collections.py +139 -100
natural_pdf/elements/line.py +9 -4
natural_pdf/elements/region.py +173 -16
natural_pdf/elements/text.py +65 -8
natural_pdf/flows/region.py +116 -1
natural_pdf/qa/document_qa.py +224 -113
natural_pdf/utils/packaging.py +23 -9
natural_pdf/utils/text_extraction.py +34 -14
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.26.dev0.dist-info}/METADATA +2 -1
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.26.dev0.dist-info}/RECORD +20 -20
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.26.dev0.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.26.dev0.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.26.dev0.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.26.dev0.dist-info}/top_level.txt +0 -0

natural_pdf/qa/document_qa.py CHANGED Viewed

@@ -2,6 +2,7 @@ import json
 import logging
 import os
 import tempfile
+import warnings
 from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
@@ -119,29 +120,52 @@ class DocumentQA:
     def ask(
         self,
         image: Union[str, Image.Image, np.ndarray],
-        question: str,
+        question: Union[str, List[str], Tuple[str, ...]],
         word_boxes: List = None,
         min_confidence: float = 0.1,
         debug: bool = False,
         debug_output_dir: str = "output",
-    ) -> QAResult:
+    ) -> Union[QAResult, List[QAResult]]:
         """
-        Ask a question about document content.
+        Ask one or more natural-language questions about the supplied document image.
+        This method now accepts a single *question* (``str``) **or** an
+        iterable of questions (``list``/``tuple`` of ``str``).  When multiple
+        questions are provided they are executed in a single batch through the
+        underlying transformers pipeline which is considerably faster than
+        looping and calling :py:meth:`ask` repeatedly.
         Args:
-            image: PIL Image, numpy array, or path to image file
-            question: Question to ask about the document
-            word_boxes: Optional pre-extracted word boxes [[text, [x0, y0, x1, y1]], ...]
-            min_confidence: Minimum confidence threshold for answers
-            debug: Whether to save debug information
-            debug_output_dir: Directory to save debug files
+            image: PIL ``Image``, ``numpy`` array, or path to an image file.
+            question: A question string *or* a list/tuple of question strings.
+            word_boxes: Optional pre-extracted word-boxes in the LayoutLMv3
+                format ``[[text, [x0, y0, x1, y1]], …]``.
+            min_confidence: Minimum confidence threshold below which an answer
+                will be marked as ``found = False``.
+            debug: If ``True`` intermediate artefacts will be written to
+                *debug_output_dir* to aid troubleshooting.
+            debug_output_dir: Directory where debug artefacts should be saved.
         Returns:
-            QAResult instance with answer details
+            • A single :class:`QAResult` when *question* is a string.
+            • A ``list`` of :class:`QAResult` objects (one per question) when
+              *question* is a list/tuple.
         """
         if not self._is_initialized:
             raise RuntimeError("DocumentQA is not properly initialized")
+        # Normalise *questions* to a list so we can treat batch and single
+        # uniformly.  We'll remember if the caller supplied a single question
+        # so that we can preserve the original return type.
+        single_question = False
+        if isinstance(question, str):
+            questions = [question]
+            single_question = True
+        elif isinstance(question, (list, tuple)) and all(isinstance(q, str) for q in question):
+            questions = list(question)
+        else:
+            raise TypeError("'question' must be a string or a list/tuple of strings")
         # Process the image
         if isinstance(image, str):
             # It's a file path
@@ -157,12 +181,16 @@ class DocumentQA:
         else:
             raise TypeError("Image must be a PIL Image, numpy array, or file path")
-        # Prepare the query
-        query = {"image": image_obj, "question": question}
+        # ------------------------------------------------------------------
+        # Build the queries for the pipeline (either single dict or list).
+        # ------------------------------------------------------------------
+        def _build_query_dict(q: str):
+            d = {"image": image_obj, "question": q}
+            if word_boxes:
+                d["word_boxes"] = word_boxes
+            return d
-        # Add word boxes if provided
-        if word_boxes:
-            query["word_boxes"] = word_boxes
+        queries = [_build_query_dict(q) for q in questions]
         # Save debug information if requested
         if debug:
@@ -198,48 +226,79 @@ class DocumentQA:
                 logger.info(f"Word boxes: {word_boxes_path}")
                 logger.info(f"Visualization: {vis_path}")
-        # Run the query through the pipeline
-        logger.info(f"Running document QA pipeline with question: {question}")
-        result = self.pipe(query)[0]
-        logger.info(f"Raw result: {result}")
-        # Save the result if debugging
-        if debug:
-            result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
-            with open(result_path, "w") as f:
-                # Convert any non-serializable data
-                serializable_result = {
-                    k: (
-                        str(v)
-                        if not isinstance(v, (str, int, float, bool, list, dict, type(None)))
-                        else v
-                    )
-                    for k, v in result.items()
-                }
-                json.dump(serializable_result, f, indent=2)
-        # Check confidence against threshold
-        if result["score"] < min_confidence:
-            logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
-            return QAResult(
-                answer="",
-                confidence=result["score"],
-                start=result.get("start", -1),
-                end=result.get("end", -1),
-                found=False,
-            )
-        return QAResult(
-            answer=result["answer"],
-            confidence=result["score"],
-            start=result.get("start", 0),
-            end=result.get("end", 0),
-            found=True,
+        # ------------------------------------------------------------------
+        # Run the queries through the pipeline (batch or single) and collect
+        # *only the top answer* for each, mirroring the original behaviour.
+        # ------------------------------------------------------------------
+        logger.info(
+            f"Running document QA pipeline with {len(queries)} question{'s' if len(queries) != 1 else ''}."
         )
+        # When we pass a list the pipeline returns a list of per-question
+        # results; each per-question result is itself a list (top-k answers).
+        # We keep only the best answer (index 0) to maintain backwards
+        # compatibility.
+        raw_results = self.pipe(queries if len(queries) > 1 else queries[0])
+        # Ensure we always have a list aligned with *questions*
+        if len(queries) == 1:
+            raw_results = [raw_results]
+        processed_results: List[QAResult] = []
+        for q, res in zip(questions, raw_results):
+            top_res = res[0] if isinstance(res, list) else res  # pipeline may or may not nest
+            # Save per-question result in debug mode
+            if debug:
+                # File name is built from the first 30 characters of the question (spaces → underscores)
+                result_path = os.path.join(debug_output_dir, f"debug_qa_result_{q[:30].replace(' ', '_')}.json")
+                try:
+                    with open(result_path, "w") as f:
+                        serializable = {
+                            k: (
+                                str(v)
+                                if not isinstance(v, (str, int, float, bool, list, dict, type(None)))
+                                else v
+                            )
+                            for k, v in top_res.items()
+                        }
+                        json.dump(serializable, f, indent=2)
+                except Exception as e:
+                    logger.warning(f"Failed to save debug QA result for question '{q}': {e}")
+            # Apply confidence threshold
+            if top_res["score"] < min_confidence:
+                qa_res = QAResult(
+                    question=q,
+                    answer="",
+                    confidence=top_res["score"],
+                    start=top_res.get("start", -1),
+                    end=top_res.get("end", -1),
+                    found=False,
+                )
+            else:
+                qa_res = QAResult(
+                    question=q,
+                    answer=top_res["answer"],
+                    confidence=top_res["score"],
+                    start=top_res.get("start", 0),
+                    end=top_res.get("end", 0),
+                    found=True,
+                )
+            processed_results.append(qa_res)
+        # Return appropriately typed result (single item or list)
+        return processed_results[0] if single_question else processed_results
     def ask_pdf_page(
-        self, page, question: str, min_confidence: float = 0.1, debug: bool = False
-    ) -> QAResult:
+        self,
+        page,
+        question: Union[str, List[str], Tuple[str, ...]],
+        min_confidence: float = 0.1,
+        debug: bool = False,
+    ) -> Union[QAResult, List[QAResult]]:
         """
         Ask a question about a specific PDF page.
@@ -252,13 +311,39 @@ class DocumentQA:
             QAResult instance with answer details
         """
         # Ensure we have text elements on the page
-        if not page.find_all("text"):
-            # Apply OCR if no text is available
-            logger.info(f"No text elements found on page {page.index}, applying OCR")
-            page.apply_ocr()
+        elements = page.find_all("text")
+        if not elements:
+            # Warn that no text was found and recommend OCR
+            warnings.warn(
+                f"No text elements found on page {page.index}. "
+                "Consider applying OCR first using page.apply_ocr() to extract text from images.",
+                UserWarning
+            )
+            # Return appropriate "not found" result(s)
+            if isinstance(question, (list, tuple)):
+                return [
+                    QAResult(
+                        question=q,
+                        answer="",
+                        confidence=0.0,
+                        start=-1,
+                        end=-1,
+                        found=False,
+                    )
+                    for q in question
+                ]
+            else:
+                return QAResult(
+                    question=question,
+                    answer="",
+                    confidence=0.0,
+                    start=-1,
+                    end=-1,
+                    found=False,
+                )
         # Extract word boxes
-        elements = page.find_all("text")
         word_boxes = self._get_word_boxes_from_elements(elements, offset_x=0, offset_y=0)
         # Generate a high-resolution image of the page
@@ -270,8 +355,8 @@ class DocumentQA:
         page_image.save(temp_path)
         try:
-            # Ask the question
-            result = self.ask(
+            # Ask the question(s)
+            result_obj = self.ask(
                 image=temp_path,
                 question=question,
                 word_boxes=word_boxes,
@@ -279,34 +364,35 @@ class DocumentQA:
                 debug=debug,
             )
-            # Add page reference to the result
-            result.page_num = page.index
+            # Ensure we have a list for uniform processing
+            results = result_obj if isinstance(result_obj, list) else [result_obj]
-            # Add element references if possible
-            if result.found and "start" in result and "end" in result:
-                start_idx = result.start
-                end_idx = result.end
+            for res in results:
+                # Attach page reference
+                res.page_num = page.index
-                # Make sure we have valid indices and elements to work with
-                if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
-                    # Find the actual source elements in the original list
-                    # Since word_boxes may have filtered out some elements, we need to map indices
+                # Map answer span back to source elements
+                if res.found and "start" in res and "end" in res:
+                    start_idx = res.start
+                    end_idx = res.end
-                    # Get the text from result word boxes
-                    matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
+                    if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
+                        matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
-                    # Find corresponding elements in the full element list
-                    source_elements = []
-                    for element in elements:
-                        if hasattr(element, "text") and element.text in matched_texts:
-                            source_elements.append(element)
-                            # Remove from matched texts to avoid duplicates
-                            if element.text in matched_texts:
-                                matched_texts.remove(element.text)
+                        source_elements = []
+                        for element in elements:
+                            if hasattr(element, "text") and element.text in matched_texts:
+                                source_elements.append(element)
+                                if element.text in matched_texts:
+                                    matched_texts.remove(element.text)
-                    result.source_elements = ElementCollection(source_elements)
+                        res.source_elements = ElementCollection(source_elements)
-            return result
+            # Return result(s) preserving original input type
+            if isinstance(question, (list, tuple)):
+                return results
+            else:
+                return results[0]
         finally:
             # Clean up temporary file
@@ -314,8 +400,12 @@ class DocumentQA:
                 os.remove(temp_path)
     def ask_pdf_region(
-        self, region, question: str, min_confidence: float = 0.1, debug: bool = False
-    ) -> QAResult:
+        self,
+        region,
+        question: Union[str, List[str], Tuple[str, ...]],
+        min_confidence: float = 0.1,
+        debug: bool = False,
+    ) -> Union[QAResult, List[QAResult]]:
         """
         Ask a question about a specific region of a PDF page.
@@ -330,10 +420,37 @@ class DocumentQA:
         # Get all text elements within the region
         elements = region.find_all("text")
-        # Apply OCR if needed
+        # Check if we have text elements
         if not elements:
-            logger.info(f"No text elements found in region, applying OCR")
-            elements = region.apply_ocr()
+            # Warn that no text was found and recommend OCR
+            warnings.warn(
+                f"No text elements found in region on page {region.page.index}. "
+                "Consider applying OCR first using region.apply_ocr() to extract text from images.",
+                UserWarning
+            )
+            # Return appropriate "not found" result(s)
+            if isinstance(question, (list, tuple)):
+                return [
+                    QAResult(
+                        question=q,
+                        answer="",
+                        confidence=0.0,
+                        start=-1,
+                        end=-1,
+                        found=False,
+                    )
+                    for q in question
+                ]
+            else:
+                return QAResult(
+                    question=question,
+                    answer="",
+                    confidence=0.0,
+                    start=-1,
+                    end=-1,
+                    found=False,
+                )
         # Extract word boxes adjusted for the cropped region
         x0, top = int(region.x0), int(region.top)
@@ -352,8 +469,8 @@ class DocumentQA:
         region_image.save(temp_path)
         try:
-            # Ask the question
-            result = self.ask(
+            # Ask the question(s)
+            result_obj = self.ask(
                 image=temp_path,
                 question=question,
                 word_boxes=word_boxes,
@@ -361,35 +478,29 @@ class DocumentQA:
                 debug=debug,
             )
-            # Add region reference to the result
-            result.region = region
-            result.page_num = region.page.index
+            results = result_obj if isinstance(result_obj, list) else [result_obj]
-            # Add element references if possible
-            if result.found and "start" in result and "end" in result:
-                start_idx = result.start
-                end_idx = result.end
+            for res in results:
+                res.region = region
+                res.page_num = region.page.index
-                # Make sure we have valid indices and elements to work with
-                if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
-                    # Find the actual source elements in the original list
-                    # Since word_boxes may have filtered out some elements, we need to map indices
+                if res.found and "start" in res and "end" in res:
+                    start_idx = res.start
+                    end_idx = res.end
-                    # Get the text from result word boxes
-                    matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
+                    if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
+                        matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
-                    # Find corresponding elements in the full element list
-                    source_elements = []
-                    for element in elements:
-                        if hasattr(element, "text") and element.text in matched_texts:
-                            source_elements.append(element)
-                            # Remove from matched texts to avoid duplicates
-                            if element.text in matched_texts:
-                                matched_texts.remove(element.text)
+                        source_elements = []
+                        for element in elements:
+                            if hasattr(element, "text") and element.text in matched_texts:
+                                source_elements.append(element)
+                                if element.text in matched_texts:
+                                    matched_texts.remove(element.text)
-                    result.source_elements = ElementCollection(source_elements)
+                        res.source_elements = ElementCollection(source_elements)
-            return result
+            return results if isinstance(question, (list, tuple)) else results[0]
         finally:
             # Clean up temporary file

natural_pdf/utils/packaging.py CHANGED Viewed

@@ -36,7 +36,7 @@ def create_correction_task_package(
     output_zip_path: str,
     overwrite: bool = False,
     suggest=None,
-    resolution: int = 150,
+    resolution: int = 300,
 ) -> None:
     """
     Creates a zip package containing data for an OCR correction task.
@@ -160,8 +160,22 @@ def create_correction_task_package(
                 # 3. Prepare region data for manifest
                 page_regions_data = []
-                # Calculate scaling factor from PDF coordinates (72 DPI) to image pixels
-                coord_scale_factor = resolution / 72.0
+                # Calculate scaling factor *from PDF points* to *actual image pixels*.
+                # We prefer using the rendered image dimensions rather than the nominal
+                # resolution value, because the image might have been resized (e.g. via
+                # global `natural_pdf.options.image.width`). This guarantees that the
+                # bounding boxes we write to the manifest always align with the exact
+                # pixel grid of the exported image.
+                try:
+                    scale_x = img.width / float(page.width) if page.width else 1.0
+                    scale_y = img.height / float(page.height) if page.height else 1.0
+                except Exception as e:
+                    logger.warning(
+                        f"Could not compute per-axis scale factors for page {page.number}: {e}. "
+                        "Falling back to resolution-based scaling."
+                    )
+                    scale_x = scale_y = resolution / 72.0
                 i = -1
                 for elem in tqdm(ocr_elements):
@@ -176,12 +190,12 @@ def create_correction_task_package(
                         continue
                     region_id = f"r_{page.index}_{i}"  # ID unique within page
-                    # Scale coordinates to match the 300 DPI image
+                    # Scale coordinates to match the **actual** image dimensions.
                     scaled_bbox = [
-                        elem.x0 * coord_scale_factor,
-                        elem.top * coord_scale_factor,
-                        elem.x1 * coord_scale_factor,
-                        elem.bottom * coord_scale_factor,
+                        elem.x0 * scale_x,
+                        elem.top * scale_y,
+                        elem.x1 * scale_x,
+                        elem.bottom * scale_y,
                     ]
                     corrected = elem.text
@@ -191,7 +205,7 @@ def create_correction_task_package(
                     page_regions_data.append(
                         {
-                            "resolution": resolution,
+                            "resolution": scale_x * 72.0,
                             "id": region_id,
                             "bbox": scaled_bbox,
                             "ocr_text": elem.text,

natural_pdf/utils/text_extraction.py CHANGED Viewed

@@ -63,9 +63,9 @@ def _get_layout_kwargs(
             else:
                 logger.warning(f"Ignoring unsupported layout keyword argument: '{key}'")
-    # 4. Ensure layout flag is present, defaulting to True
+    # 4. Ensure layout flag is present, defaulting to False (caller can override)
     if "layout" not in layout_kwargs:
-        layout_kwargs["layout"] = True
+        layout_kwargs["layout"] = False
     return layout_kwargs
@@ -203,24 +203,42 @@ def generate_text_layout(
         logger.debug("generate_text_layout: No valid character dicts found after filtering.")
         return ""
-    # Prepare layout arguments
-    layout_kwargs = _get_layout_kwargs(layout_context_bbox, user_kwargs)
-    use_layout = layout_kwargs.pop("layout", True)  # Extract layout flag, default True
+    # Make a working copy of user_kwargs so we can safely pop custom keys
+    incoming_kwargs = user_kwargs.copy() if user_kwargs else {}
-    if not use_layout:
-        # Simple join if layout=False
-        logger.debug("generate_text_layout: Using simple join (layout=False requested).")
-        # Sort before joining if layout is off
-        valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
-        result = "".join(c.get("text", "") for c in valid_char_dicts)  # Use valid chars
-        return result
+    # --- Handle custom 'strip' option ------------------------------------
+    # * strip=True  – post-process the final string to remove leading/trailing
+    #                 whitespace (typically used when layout=False)
+    # * strip=False – preserve whitespace exactly as produced.
+    # Default behaviour depends on the layout flag (see below).
+    explicit_strip_flag = incoming_kwargs.pop("strip", None)  # May be None
+    # Prepare layout arguments now that we've removed the non-pdfplumber key
+    layout_kwargs = _get_layout_kwargs(layout_context_bbox, incoming_kwargs)
+    use_layout = layout_kwargs.get("layout", False)
+    # Determine final strip behaviour: if caller specified override, honour it;
+    # otherwise default to !use_layout (True when layout=False, False when
+    # layout=True) per user request.
+    strip_result = explicit_strip_flag if explicit_strip_flag is not None else (not use_layout)
     try:
-        # Sort chars primarily by top, then x0 before layout analysis
-        # This helps pdfplumber group lines correctly
+        # Sort chars primarily by top, then x0 before layout analysis – required by
+        # pdfplumber so that grouping into lines works deterministically.
         valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
+        # Build the text map. `layout_kwargs` still contains the caller-specified or
+        # default "layout" flag, which chars_to_textmap will respect.
         textmap = chars_to_textmap(valid_char_dicts, **layout_kwargs)
         result = textmap.as_string
+        # ----------------------------------------------------------------
+        # Optional post-processing strip
+        # ----------------------------------------------------------------
+        if strip_result and isinstance(result, str):
+            # Remove trailing spaces on each line then trim leading/trailing
+            # blank lines for a cleaner output while keeping internal newlines.
+            result = "\n".join(line.rstrip() for line in result.splitlines()).strip()
     except Exception as e:
         # Fallback to simple join on error
         logger.error(f"generate_text_layout: Error calling chars_to_textmap: {e}", exc_info=False)
@@ -230,5 +248,7 @@ def generate_text_layout(
         # Fallback already has sorted characters if layout was attempted
         # Need to use the valid_char_dicts here too
         result = "".join(c.get("text", "") for c in valid_char_dicts)
+        if strip_result:
+            result = result.strip()
     return result

{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.26.dev0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.1.23
+Version: 0.1.26.dev0
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: markdown
 Requires-Dist: pandas
 Requires-Dist: pdfplumber
 Requires-Dist: colormath2

natural-pdf 0.1.23__py3-none-any.whl → 0.1.26.dev0__py3-none-any.whl

natural-pdf 0.1.23__py3-none-any.whl → 0.1.26.dev0__py3-none-any.whl