PyPI - natural-pdf - Versions diffs - 0.1.23__py3-none-any.whl → 0.1.24__py3-none-any.whl - Mend

natural-pdf 0.1.23__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

natural_pdf/analyzers/shape_detection_mixin.py +40 -0
natural_pdf/core/highlighting_service.py +4 -4
natural_pdf/core/page.py +16 -2
natural_pdf/describe/base.py +11 -1
natural_pdf/describe/summary.py +26 -0
natural_pdf/elements/base.py +2 -2
natural_pdf/elements/collections.py +139 -100
natural_pdf/elements/region.py +133 -12
natural_pdf/elements/text.py +15 -7
natural_pdf/flows/region.py +116 -1
natural_pdf/qa/document_qa.py +162 -105
natural_pdf/utils/text_extraction.py +34 -14
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/METADATA +2 -1
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/RECORD +18 -18
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/top_level.txt +0 -0

natural_pdf/qa/document_qa.py CHANGED Viewed

@@ -119,29 +119,52 @@ class DocumentQA:
     def ask(
         self,
         image: Union[str, Image.Image, np.ndarray],
-        question: str,
+        question: Union[str, List[str], Tuple[str, ...]],
         word_boxes: List = None,
         min_confidence: float = 0.1,
         debug: bool = False,
         debug_output_dir: str = "output",
-    ) -> QAResult:
+    ) -> Union[QAResult, List[QAResult]]:
         """
-        Ask a question about document content.
+        Ask one or more natural-language questions about the supplied document image.
+        This method now accepts a single *question* (``str``) **or** an
+        iterable of questions (``list``/``tuple`` of ``str``).  When multiple
+        questions are provided they are executed in a single batch through the
+        underlying transformers pipeline which is considerably faster than
+        looping and calling :py:meth:`ask` repeatedly.
         Args:
-            image: PIL Image, numpy array, or path to image file
-            question: Question to ask about the document
-            word_boxes: Optional pre-extracted word boxes [[text, [x0, y0, x1, y1]], ...]
-            min_confidence: Minimum confidence threshold for answers
-            debug: Whether to save debug information
-            debug_output_dir: Directory to save debug files
+            image: PIL ``Image``, ``numpy`` array, or path to an image file.
+            question: A question string *or* a list/tuple of question strings.
+            word_boxes: Optional pre-extracted word-boxes in the LayoutLMv3
+                format ``[[text, [x0, y0, x1, y1]], …]``.
+            min_confidence: Minimum confidence threshold below which an answer
+                will be marked as ``found = False``.
+            debug: If ``True`` intermediate artefacts will be written to
+                *debug_output_dir* to aid troubleshooting.
+            debug_output_dir: Directory where debug artefacts should be saved.
         Returns:
-            QAResult instance with answer details
+            • A single :class:`QAResult` when *question* is a string.
+            • A ``list`` of :class:`QAResult`` objects (one per question) when
+              *question* is a list/tuple.
         """
         if not self._is_initialized:
             raise RuntimeError("DocumentQA is not properly initialized")
+        # Normalise *questions* to a list so we can treat batch and single
+        # uniformly.  We'll remember if the caller supplied a single question
+        # so that we can preserve the original return type.
+        single_question = False
+        if isinstance(question, str):
+            questions = [question]
+            single_question = True
+        elif isinstance(question, (list, tuple)) and all(isinstance(q, str) for q in question):
+            questions = list(question)
+        else:
+            raise TypeError("'question' must be a string or a list/tuple of strings")
         # Process the image
         if isinstance(image, str):
             # It's a file path
@@ -157,12 +180,16 @@ class DocumentQA:
         else:
             raise TypeError("Image must be a PIL Image, numpy array, or file path")
-        # Prepare the query
-        query = {"image": image_obj, "question": question}
+        # ------------------------------------------------------------------
+        # Build the queries for the pipeline (either single dict or list).
+        # ------------------------------------------------------------------
+        def _build_query_dict(q: str):
+            d = {"image": image_obj, "question": q}
+            if word_boxes:
+                d["word_boxes"] = word_boxes
+            return d
-        # Add word boxes if provided
-        if word_boxes:
-            query["word_boxes"] = word_boxes
+        queries = [_build_query_dict(q) for q in questions]
         # Save debug information if requested
         if debug:
@@ -198,48 +225,79 @@ class DocumentQA:
                 logger.info(f"Word boxes: {word_boxes_path}")
                 logger.info(f"Visualization: {vis_path}")
-        # Run the query through the pipeline
-        logger.info(f"Running document QA pipeline with question: {question}")
-        result = self.pipe(query)[0]
-        logger.info(f"Raw result: {result}")
-        # Save the result if debugging
-        if debug:
-            result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
-            with open(result_path, "w") as f:
-                # Convert any non-serializable data
-                serializable_result = {
-                    k: (
-                        str(v)
-                        if not isinstance(v, (str, int, float, bool, list, dict, type(None)))
-                        else v
-                    )
-                    for k, v in result.items()
-                }
-                json.dump(serializable_result, f, indent=2)
-        # Check confidence against threshold
-        if result["score"] < min_confidence:
-            logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
-            return QAResult(
-                answer="",
-                confidence=result["score"],
-                start=result.get("start", -1),
-                end=result.get("end", -1),
-                found=False,
-            )
-        return QAResult(
-            answer=result["answer"],
-            confidence=result["score"],
-            start=result.get("start", 0),
-            end=result.get("end", 0),
-            found=True,
+        # ------------------------------------------------------------------
+        # Run the queries through the pipeline (batch or single) and collect
+        # *only the top answer* for each, mirroring the original behaviour.
+        # ------------------------------------------------------------------
+        logger.info(
+            f"Running document QA pipeline with {len(queries)} question{'s' if len(queries) != 1 else ''}."
         )
+        # When we pass a list the pipeline returns a list of per-question
+        # results; each per-question result is itself a list (top-k answers).
+        # We keep only the best answer (index 0) to maintain backwards
+        # compatibility.
+        raw_results = self.pipe(queries if len(queries) > 1 else queries[0])
+        # Ensure we always have a list aligned with *questions*
+        if len(queries) == 1:
+            raw_results = [raw_results]
+        processed_results: List[QAResult] = []
+        for q, res in zip(questions, raw_results):
+            top_res = res[0] if isinstance(res, list) else res  # pipeline may or may not nest
+            # Save per-question result in debug mode
+            if debug:
+                # File names: debug_qa_result_0.json, …
+                result_path = os.path.join(debug_output_dir, f"debug_qa_result_{q[:30].replace(' ', '_')}.json")
+                try:
+                    with open(result_path, "w") as f:
+                        serializable = {
+                            k: (
+                                str(v)
+                                if not isinstance(v, (str, int, float, bool, list, dict, type(None)))
+                                else v
+                            )
+                            for k, v in top_res.items()
+                        }
+                        json.dump(serializable, f, indent=2)
+                except Exception as e:
+                    logger.warning(f"Failed to save debug QA result for question '{q}': {e}")
+            # Apply confidence threshold
+            if top_res["score"] < min_confidence:
+                qa_res = QAResult(
+                    question=q,
+                    answer="",
+                    confidence=top_res["score"],
+                    start=top_res.get("start", -1),
+                    end=top_res.get("end", -1),
+                    found=False,
+                )
+            else:
+                qa_res = QAResult(
+                    question=q,
+                    answer=top_res["answer"],
+                    confidence=top_res["score"],
+                    start=top_res.get("start", 0),
+                    end=top_res.get("end", 0),
+                    found=True,
+                )
+            processed_results.append(qa_res)
+        # Return appropriately typed result (single item or list)
+        return processed_results[0] if single_question else processed_results
     def ask_pdf_page(
-        self, page, question: str, min_confidence: float = 0.1, debug: bool = False
-    ) -> QAResult:
+        self,
+        page,
+        question: Union[str, List[str], Tuple[str, ...]],
+        min_confidence: float = 0.1,
+        debug: bool = False,
+    ) -> Union[QAResult, List[QAResult]]:
         """
         Ask a question about a specific PDF page.
@@ -270,8 +328,8 @@ class DocumentQA:
         page_image.save(temp_path)
         try:
-            # Ask the question
-            result = self.ask(
+            # Ask the question(s)
+            result_obj = self.ask(
                 image=temp_path,
                 question=question,
                 word_boxes=word_boxes,
@@ -279,34 +337,35 @@ class DocumentQA:
                 debug=debug,
             )
-            # Add page reference to the result
-            result.page_num = page.index
+            # Ensure we have a list for uniform processing
+            results = result_obj if isinstance(result_obj, list) else [result_obj]
-            # Add element references if possible
-            if result.found and "start" in result and "end" in result:
-                start_idx = result.start
-                end_idx = result.end
+            for res in results:
+                # Attach page reference
+                res.page_num = page.index
-                # Make sure we have valid indices and elements to work with
-                if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
-                    # Find the actual source elements in the original list
-                    # Since word_boxes may have filtered out some elements, we need to map indices
+                # Map answer span back to source elements
+                if res.found and "start" in res and "end" in res:
+                    start_idx = res.start
+                    end_idx = res.end
-                    # Get the text from result word boxes
-                    matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
+                    if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
+                        matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
-                    # Find corresponding elements in the full element list
-                    source_elements = []
-                    for element in elements:
-                        if hasattr(element, "text") and element.text in matched_texts:
-                            source_elements.append(element)
-                            # Remove from matched texts to avoid duplicates
-                            if element.text in matched_texts:
-                                matched_texts.remove(element.text)
+                        source_elements = []
+                        for element in elements:
+                            if hasattr(element, "text") and element.text in matched_texts:
+                                source_elements.append(element)
+                                if element.text in matched_texts:
+                                    matched_texts.remove(element.text)
-                    result.source_elements = ElementCollection(source_elements)
+                        res.source_elements = ElementCollection(source_elements)
-            return result
+            # Return result(s) preserving original input type
+            if isinstance(question, (list, tuple)):
+                return results
+            else:
+                return results[0]
         finally:
             # Clean up temporary file
@@ -314,8 +373,12 @@ class DocumentQA:
                 os.remove(temp_path)
     def ask_pdf_region(
-        self, region, question: str, min_confidence: float = 0.1, debug: bool = False
-    ) -> QAResult:
+        self,
+        region,
+        question: Union[str, List[str], Tuple[str, ...]],
+        min_confidence: float = 0.1,
+        debug: bool = False,
+    ) -> Union[QAResult, List[QAResult]]:
         """
         Ask a question about a specific region of a PDF page.
@@ -352,8 +415,8 @@ class DocumentQA:
         region_image.save(temp_path)
         try:
-            # Ask the question
-            result = self.ask(
+            # Ask the question(s)
+            result_obj = self.ask(
                 image=temp_path,
                 question=question,
                 word_boxes=word_boxes,
@@ -361,35 +424,29 @@ class DocumentQA:
                 debug=debug,
             )
-            # Add region reference to the result
-            result.region = region
-            result.page_num = region.page.index
+            results = result_obj if isinstance(result_obj, list) else [result_obj]
-            # Add element references if possible
-            if result.found and "start" in result and "end" in result:
-                start_idx = result.start
-                end_idx = result.end
+            for res in results:
+                res.region = region
+                res.page_num = region.page.index
-                # Make sure we have valid indices and elements to work with
-                if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
-                    # Find the actual source elements in the original list
-                    # Since word_boxes may have filtered out some elements, we need to map indices
+                if res.found and "start" in res and "end" in res:
+                    start_idx = res.start
+                    end_idx = res.end
-                    # Get the text from result word boxes
-                    matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
+                    if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
+                        matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
-                    # Find corresponding elements in the full element list
-                    source_elements = []
-                    for element in elements:
-                        if hasattr(element, "text") and element.text in matched_texts:
-                            source_elements.append(element)
-                            # Remove from matched texts to avoid duplicates
-                            if element.text in matched_texts:
-                                matched_texts.remove(element.text)
+                        source_elements = []
+                        for element in elements:
+                            if hasattr(element, "text") and element.text in matched_texts:
+                                source_elements.append(element)
+                                if element.text in matched_texts:
+                                    matched_texts.remove(element.text)
-                    result.source_elements = ElementCollection(source_elements)
+                        res.source_elements = ElementCollection(source_elements)
-            return result
+            return results if isinstance(question, (list, tuple)) else results[0]
         finally:
             # Clean up temporary file

natural_pdf/utils/text_extraction.py CHANGED Viewed

@@ -63,9 +63,9 @@ def _get_layout_kwargs(
             else:
                 logger.warning(f"Ignoring unsupported layout keyword argument: '{key}'")
-    # 4. Ensure layout flag is present, defaulting to True
+    # 4. Ensure layout flag is present, defaulting to False (caller can override)
     if "layout" not in layout_kwargs:
-        layout_kwargs["layout"] = True
+        layout_kwargs["layout"] = False
     return layout_kwargs
@@ -203,24 +203,42 @@ def generate_text_layout(
         logger.debug("generate_text_layout: No valid character dicts found after filtering.")
         return ""
-    # Prepare layout arguments
-    layout_kwargs = _get_layout_kwargs(layout_context_bbox, user_kwargs)
-    use_layout = layout_kwargs.pop("layout", True)  # Extract layout flag, default True
+    # Make a working copy of user_kwargs so we can safely pop custom keys
+    incoming_kwargs = user_kwargs.copy() if user_kwargs else {}
-    if not use_layout:
-        # Simple join if layout=False
-        logger.debug("generate_text_layout: Using simple join (layout=False requested).")
-        # Sort before joining if layout is off
-        valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
-        result = "".join(c.get("text", "") for c in valid_char_dicts)  # Use valid chars
-        return result
+    # --- Handle custom 'strip' option ------------------------------------
+    # * strip=True  – post-process the final string to remove leading/trailing
+    #                 whitespace (typically used when layout=False)
+    # * strip=False – preserve whitespace exactly as produced.
+    # Default behaviour depends on the layout flag (see below).
+    explicit_strip_flag = incoming_kwargs.pop("strip", None)  # May be None
+    # Prepare layout arguments now that we've removed the non-pdfplumber key
+    layout_kwargs = _get_layout_kwargs(layout_context_bbox, incoming_kwargs)
+    use_layout = layout_kwargs.get("layout", False)
+    # Determine final strip behaviour: if caller specified override, honour it;
+    # otherwise default to !use_layout (True when layout=False, False when
+    # layout=True) per user request.
+    strip_result = explicit_strip_flag if explicit_strip_flag is not None else (not use_layout)
     try:
-        # Sort chars primarily by top, then x0 before layout analysis
-        # This helps pdfplumber group lines correctly
+        # Sort chars primarily by top, then x0 before layout analysis – required by
+        # pdfplumber so that grouping into lines works deterministically.
         valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
+        # Build the text map. `layout_kwargs` still contains the caller-specified or
+        # default "layout" flag, which chars_to_textmap will respect.
         textmap = chars_to_textmap(valid_char_dicts, **layout_kwargs)
         result = textmap.as_string
+        # ----------------------------------------------------------------
+        # Optional post-processing strip
+        # ----------------------------------------------------------------
+        if strip_result and isinstance(result, str):
+            # Remove trailing spaces on each line then trim leading/trailing
+            # blank lines for a cleaner output while keeping internal newlines.
+            result = "\n".join(line.rstrip() for line in result.splitlines()).strip()
     except Exception as e:
         # Fallback to simple join on error
         logger.error(f"generate_text_layout: Error calling chars_to_textmap: {e}", exc_info=False)
@@ -230,5 +248,7 @@ def generate_text_layout(
         # Fallback already has sorted characters if layout was attempted
         # Need to use the valid_char_dicts here too
         result = "".join(c.get("text", "") for c in valid_char_dicts)
+        if strip_result:
+            result = result.strip()
     return result

{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.1.23
+Version: 0.1.24
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: markdown
 Requires-Dist: pandas
 Requires-Dist: pdfplumber
 Requires-Dist: colormath2

{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/RECORD RENAMED Viewed

@@ -1,7 +1,7 @@
 natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
 natural_pdf/cli.py,sha256=IXrP2lCHihr-ed-CFiDbMTnSsutQa1j1PYALOLGbpsc,4019
 natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
-natural_pdf/analyzers/shape_detection_mixin.py,sha256=blpeHMWl6nXlutAByfdi6zjfmcyaDpdv2S7IR4l0WO0,81783
+natural_pdf/analyzers/shape_detection_mixin.py,sha256=aHn4EMdbwOe8VWECPceGs5wN7gJP_kIxyAbmbNlNPSs,83634
 natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
 natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
 natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
@@ -25,21 +25,21 @@ natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiY
 natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm0SSoqJwxRc2E,30744
 natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
 natural_pdf/core/element_manager.py,sha256=_UdXu51sLi6STzc8Pj4k8R721G3yJixXDLuRHn3hmr8,25731
-natural_pdf/core/highlighting_service.py,sha256=wWoU2kJ_JBbxKV3NWEjqU6DLvmlwME9sTntk-TDqOfs,38223
-natural_pdf/core/page.py,sha256=U4GRy_zdoTB4sx4EPrAIKg4beIQ8atJsY5HX_jWfDjg,118953
+natural_pdf/core/highlighting_service.py,sha256=DKoaxiiuQsWgtf6wSroMAIcFiqJOOF7dXhciYdQKdCw,38223
+natural_pdf/core/page.py,sha256=TOtpUp5lRhDj32wv3yvRaS8kxPX6R9904OCC6uHFi84,119512
 natural_pdf/core/pdf.py,sha256=qsSW4RxOJRmCnweLPMs0NhzkRfiAVdghTgnh4D_wuO4,74295
 natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
-natural_pdf/describe/base.py,sha256=LAZLc_thK2u2surgGd0Pk7CN2uVaZK9AbMOE3-1RmQ4,16842
+natural_pdf/describe/base.py,sha256=mUvEydumXXPJ2FkWAYm1BbWrRWY81I0dMyQrEU32rmc,17256
 natural_pdf/describe/elements.py,sha256=xD8wwR1z5IKat7RIwoAwQRUEL6zJTEwcOKorF4F-xPg,12717
 natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo,3116
-natural_pdf/describe/summary.py,sha256=dPtjrn6fQ8nL0F74RITX2vXlDX7ZgaX9JQPnJB-S_XQ,6735
+natural_pdf/describe/summary.py,sha256=h5zy9zG7t27wFnJ2hEguGSoURtN2IR4x6WBO3aXB4eo,7980
 natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
-natural_pdf/elements/base.py,sha256=iTIy6FfQj48llZkm7wERnTky3VTmUgkYfQytRuyueZo,43304
-natural_pdf/elements/collections.py,sha256=zaqJ8pr0dmYwv1gPBs24oXfZExpSIX4URDRox-QLj98,123173
+natural_pdf/elements/base.py,sha256=iw-Ab0o7eI69npt0gAxQvA14GPWHAAhkLrJ_JeKvIos,43309
+natural_pdf/elements/collections.py,sha256=JrM42VPRtDOJ9Q9KIR3SrcbamiiCHXI4nzTq2BBkeEk,124223
 natural_pdf/elements/line.py,sha256=300kSFBDUBIudfeQtH_tzW9gTYRgRKUDPiTABw6J-BE,4782
 natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
-natural_pdf/elements/region.py,sha256=BAOriJuQYovppV0S5xI6tq5YEuzffiMQneDvHuT22Uo,118562
-natural_pdf/elements/text.py,sha256=13HvVZGinj2Vm_fFCAnqi7hohtoKvnpCp3VCfkpeAbc,11146
+natural_pdf/elements/region.py,sha256=CVncbiCk8ivn04CI7Ob93O7UY0ANVpCJwikBt-jVWgg,123698
+natural_pdf/elements/text.py,sha256=yshGrvdiBZSkYhQfdi6Yz6NN0kWvmqKHSSC82D829os,11470
 natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
 natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
 natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
@@ -58,7 +58,7 @@ natural_pdf/flows/__init__.py,sha256=cUN4A8hTDLZSRr4PO2W_lR4z6hWpbNG8Seox-IIcrLU
 natural_pdf/flows/collections.py,sha256=qGuSPFSPQF-wiYquG6STiSzg_o951MSsFEq_B44Jef8,28441
 natural_pdf/flows/element.py,sha256=mKzk3B7A7sWNvu4CDvAjLr3_ZFLt--ktrSNoLfLpFxU,23940
 natural_pdf/flows/flow.py,sha256=I61BpFVDQyo6ORsmoqoYiOEP1DBRp0vgDJjm_V8frhc,10562
-natural_pdf/flows/region.py,sha256=hucKKmjjmLt__x-RiX6S1Amsp88yweyjcgWJ7PQtTgY,22187
+natural_pdf/flows/region.py,sha256=4U3S7pLEa3oCyPfS-hpD0lSXf8MWT-MdF9AsVvMJbWU,26670
 natural_pdf/ocr/__init__.py,sha256=VY8hhvDPf7Gh2lB-d2QRmghLLyTy6ydxlgo1cS4dOSk,2482
 natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,8741
 natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
@@ -70,7 +70,7 @@ natural_pdf/ocr/ocr_manager.py,sha256=K2gpFo3e6RB1ouXOstlEAAYd14DbjBNt5RH6J7ZdDQ
 natural_pdf/ocr/ocr_options.py,sha256=l33QKu_93r-uwi3t_v8UH8pEgHo6HTVzP4tfmQFRF1w,5488
 natural_pdf/ocr/utils.py,sha256=OxuHwDbHWj6setvnC0QYwMHrAjxGkhmLzWHpMqqGupA,4397
 natural_pdf/qa/__init__.py,sha256=2u2KJcA71g1I0HnLD-j6yvDw1moAjo9kkLhhfoYRURM,166
-natural_pdf/qa/document_qa.py,sha256=bwOrO_bq_9wEaLu7j7h8EkN3ya5xMxDoE7oNurEb6-E,14889
+natural_pdf/qa/document_qa.py,sha256=6-XuIEFf5BcVA_e85FBmAeXpNZgzZhTBDkNUMPAl-tc,17803
 natural_pdf/qa/qa_result.py,sha256=_q4dlSqsjtgomcI8-pqbOT69lqQKnEMkhZNydoxEkkE,2227
 natural_pdf/search/__init__.py,sha256=0Xa7tT_2q57wHObFMQLQLd4gd9AV0oyS-svV6BmmdMI,4276
 natural_pdf/search/lancedb_search_service.py,sha256=6dz2IEZUWk3hFW28C-LF_85pWohd7Sr5k44bM0pBdm4,14472
@@ -88,13 +88,13 @@ natural_pdf/utils/identifiers.py,sha256=P7n6owcubnF8oAMa_UfYtENmIaJQdH_AMC9Jbs2b
 natural_pdf/utils/locks.py,sha256=7HJqV0VsNcOfISnbw8goCKWP5ck11uSJo6T_x9XIPKI,215
 natural_pdf/utils/packaging.py,sha256=Jshxp6S1zfcqoZmFhdd7WOpL--b6rBSz-Y9mYqELXIY,21581
 natural_pdf/utils/reading_order.py,sha256=s3DsYq_3g_1YA07qhd4BGEjeIRTeyGtnwc_hNtSzwBY,7290
-natural_pdf/utils/text_extraction.py,sha256=z6Jhy11pakYCsEpkvh8ldw6DkUFsYF1hCL9YDmfXWL4,9605
+natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6lSjBaOk,10854
 natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
 natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
 natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
-natural_pdf-0.1.23.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
-natural_pdf-0.1.23.dist-info/METADATA,sha256=z7Mq5yr_sckn7pFR1KqBz_fG2sG-jBBSb2czsRrzC_k,6660
-natural_pdf-0.1.23.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-natural_pdf-0.1.23.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
-natural_pdf-0.1.23.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
-natural_pdf-0.1.23.dist-info/RECORD,,
+natural_pdf-0.1.24.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.1.24.dist-info/METADATA,sha256=qcyQUXKXciLsomzdsdkQ4inSw_MJbczyj8oPq4KVGZQ,6684
+natural_pdf-0.1.24.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+natural_pdf-0.1.24.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
+natural_pdf-0.1.24.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
+natural_pdf-0.1.24.dist-info/RECORD,,

{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/WHEEL RENAMED Viewed

File without changes

{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{natural_pdf-0.1.23.dist-info → natural_pdf-0.1.24.dist-info}/top_level.txt RENAMED Viewed

File without changes

natural-pdf 0.1.23__py3-none-any.whl → 0.1.24__py3-none-any.whl

natural-pdf 0.1.23__py3-none-any.whl → 0.1.24__py3-none-any.whl