natural-pdf 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +43 -3
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/classification/mixin.py +35 -14
- natural_pdf/classification/results.py +16 -1
- natural_pdf/cli.py +1 -0
- natural_pdf/core/highlighting_service.py +23 -0
- natural_pdf/core/page.py +32 -2
- natural_pdf/core/pdf.py +24 -4
- natural_pdf/describe/base.py +11 -1
- natural_pdf/describe/summary.py +26 -0
- natural_pdf/elements/base.py +81 -3
- natural_pdf/elements/collections.py +162 -101
- natural_pdf/elements/region.py +187 -160
- natural_pdf/elements/text.py +15 -7
- natural_pdf/exporters/paddleocr.py +1 -1
- natural_pdf/extraction/manager.py +2 -2
- natural_pdf/extraction/mixin.py +295 -11
- natural_pdf/extraction/result.py +28 -1
- natural_pdf/flows/region.py +117 -2
- natural_pdf/ocr/engine_surya.py +25 -5
- natural_pdf/qa/__init__.py +2 -1
- natural_pdf/qa/document_qa.py +166 -113
- natural_pdf/qa/qa_result.py +55 -0
- natural_pdf/selectors/parser.py +22 -0
- natural_pdf/utils/text_extraction.py +34 -14
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/METADATA +22 -13
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/RECORD +31 -30
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/top_level.txt +0 -0
natural_pdf/qa/document_qa.py
CHANGED
@@ -8,6 +8,7 @@ import numpy as np
 from PIL import Image, ImageDraw
 
 from natural_pdf.elements.collections import ElementCollection
+from .qa_result import QAResult
 
 logger = logging.getLogger("natural_pdf.qa.document_qa")
 
@@ -118,34 +119,52 @@ class DocumentQA:
     def ask(
         self,
         image: Union[str, Image.Image, np.ndarray],
-        question: str,
+        question: Union[str, List[str], Tuple[str, ...]],
         word_boxes: List = None,
         min_confidence: float = 0.1,
        debug: bool = False,
         debug_output_dir: str = "output",
-    ) ->
+    ) -> Union[QAResult, List[QAResult]]:
         """
-        Ask
+        Ask one or more natural-language questions about the supplied document image.
+
+        This method now accepts a single *question* (``str``) **or** an
+        iterable of questions (``list``/``tuple`` of ``str``). When multiple
+        questions are provided they are executed in a single batch through the
+        underlying transformers pipeline which is considerably faster than
+        looping and calling :py:meth:`ask` repeatedly.
 
         Args:
-            image: PIL Image
-            question:
-            word_boxes: Optional pre-extracted word
+            image: PIL ``Image``, ``numpy`` array, or path to an image file.
+            question: A question string *or* a list/tuple of question strings.
+            word_boxes: Optional pre-extracted word-boxes in the LayoutLMv3
+                format ``[[text, [x0, y0, x1, y1]], …]``.
+            min_confidence: Minimum confidence threshold below which an answer
+                will be marked as ``found = False``.
+            debug: If ``True`` intermediate artefacts will be written to
+                *debug_output_dir* to aid troubleshooting.
+            debug_output_dir: Directory where debug artefacts should be saved.
 
         Returns:
-                "start": start word index,
-                "end": end word index
-            }
+            • A single :class:`QAResult` when *question* is a string.
+            • A ``list`` of :class:`QAResult`` objects (one per question) when
+              *question* is a list/tuple.
         """
         if not self._is_initialized:
             raise RuntimeError("DocumentQA is not properly initialized")
 
+        # Normalise *questions* to a list so we can treat batch and single
+        # uniformly. We'll remember if the caller supplied a single question
+        # so that we can preserve the original return type.
+        single_question = False
+        if isinstance(question, str):
+            questions = [question]
+            single_question = True
+        elif isinstance(question, (list, tuple)) and all(isinstance(q, str) for q in question):
+            questions = list(question)
+        else:
+            raise TypeError("'question' must be a string or a list/tuple of strings")
+
         # Process the image
         if isinstance(image, str):
             # It's a file path
@@ -161,12 +180,16 @@ class DocumentQA:
         else:
             raise TypeError("Image must be a PIL Image, numpy array, or file path")
 
+        # ------------------------------------------------------------------
+        # Build the queries for the pipeline (either single dict or list).
+        # ------------------------------------------------------------------
+        def _build_query_dict(q: str):
+            d = {"image": image_obj, "question": q}
+            if word_boxes:
+                d["word_boxes"] = word_boxes
+            return d
 
-        if word_boxes:
-            query["word_boxes"] = word_boxes
+        queries = [_build_query_dict(q) for q in questions]
 
         # Save debug information if requested
         if debug:
@@ -202,48 +225,79 @@ class DocumentQA:
             logger.info(f"Word boxes: {word_boxes_path}")
             logger.info(f"Visualization: {vis_path}")
 
+        # ------------------------------------------------------------------
+        # Run the queries through the pipeline (batch or single) and collect
+        # *only the top answer* for each, mirroring the original behaviour.
+        # ------------------------------------------------------------------
+        logger.info(
+            f"Running document QA pipeline with {len(queries)} question{'s' if len(queries) != 1 else ''}."
+        )
+
+        # When we pass a list the pipeline returns a list of per-question
+        # results; each per-question result is itself a list (top-k answers).
+        # We keep only the best answer (index 0) to maintain backwards
+        # compatibility.
+        raw_results = self.pipe(queries if len(queries) > 1 else queries[0])
+
+        # Ensure we always have a list aligned with *questions*
+        if len(queries) == 1:
+            raw_results = [raw_results]
+
+        processed_results: List[QAResult] = []
+
+        for q, res in zip(questions, raw_results):
+            top_res = res[0] if isinstance(res, list) else res  # pipeline may or may not nest
+
+            # Save per-question result in debug mode
+            if debug:
+                # File names: debug_qa_result_0.json, …
+                result_path = os.path.join(debug_output_dir, f"debug_qa_result_{q[:30].replace(' ', '_')}.json")
+                try:
+                    with open(result_path, "w") as f:
+                        serializable = {
+                            k: (
+                                str(v)
+                                if not isinstance(v, (str, int, float, bool, list, dict, type(None)))
+                                else v
+                            )
+                            for k, v in top_res.items()
+                        }
+                        json.dump(serializable, f, indent=2)
+                except Exception as e:
+                    logger.warning(f"Failed to save debug QA result for question '{q}': {e}")
+
+            # Apply confidence threshold
+            if top_res["score"] < min_confidence:
+                qa_res = QAResult(
+                    question=q,
+                    answer="",
+                    confidence=top_res["score"],
+                    start=top_res.get("start", -1),
+                    end=top_res.get("end", -1),
+                    found=False,
+                )
+            else:
+                qa_res = QAResult(
+                    question=q,
+                    answer=top_res["answer"],
+                    confidence=top_res["score"],
+                    start=top_res.get("start", 0),
+                    end=top_res.get("end", 0),
+                    found=True,
+                )
+
+            processed_results.append(qa_res)
+
+        # Return appropriately typed result (single item or list)
+        return processed_results[0] if single_question else processed_results
 
     def ask_pdf_page(
-        self,
+        self,
+        page,
+        question: Union[str, List[str], Tuple[str, ...]],
+        min_confidence: float = 0.1,
+        debug: bool = False,
+    ) -> Union[QAResult, List[QAResult]]:
         """
         Ask a question about a specific PDF page.
 
@@ -253,7 +307,7 @@ class DocumentQA:
             min_confidence: Minimum confidence threshold for answers
 
         Returns:
-
+            QAResult instance with answer details
         """
         # Ensure we have text elements on the page
         if not page.find_all("text"):
@@ -274,8 +328,8 @@ class DocumentQA:
         page_image.save(temp_path)
 
         try:
-            # Ask the question
+            # Ask the question(s)
+            result_obj = self.ask(
                 image=temp_path,
                 question=question,
                 word_boxes=word_boxes,
@@ -283,34 +337,35 @@ class DocumentQA:
                 debug=debug,
            )
 
+            # Ensure we have a list for uniform processing
+            results = result_obj if isinstance(result_obj, list) else [result_obj]
 
-            end_idx = result["end"]
+            for res in results:
+                # Attach page reference
+                res.page_num = page.index
 
+                # Map answer span back to source elements
+                if res.found and "start" in res and "end" in res:
+                    start_idx = res.start
+                    end_idx = res.end
 
+                    if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
+                        matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
 
-                if element.text in matched_texts:
-                    matched_texts.remove(element.text)
+                        source_elements = []
+                        for element in elements:
+                            if hasattr(element, "text") and element.text in matched_texts:
+                                source_elements.append(element)
+                                if element.text in matched_texts:
+                                    matched_texts.remove(element.text)
 
+                        res.source_elements = ElementCollection(source_elements)
 
+            # Return result(s) preserving original input type
+            if isinstance(question, (list, tuple)):
+                return results
+            else:
+                return results[0]
 
         finally:
             # Clean up temporary file
@@ -318,8 +373,12 @@ class DocumentQA:
             os.remove(temp_path)
 
     def ask_pdf_region(
-        self,
+        self,
+        region,
+        question: Union[str, List[str], Tuple[str, ...]],
+        min_confidence: float = 0.1,
+        debug: bool = False,
+    ) -> Union[QAResult, List[QAResult]]:
         """
         Ask a question about a specific region of a PDF page.
 
@@ -329,7 +388,7 @@ class DocumentQA:
             min_confidence: Minimum confidence threshold for answers
 
         Returns:
-
+            QAResult instance with answer details
         """
         # Get all text elements within the region
         elements = region.find_all("text")
@@ -356,8 +415,8 @@ class DocumentQA:
         region_image.save(temp_path)
 
         try:
-            # Ask the question
+            # Ask the question(s)
+            result_obj = self.ask(
                 image=temp_path,
                 question=question,
                 word_boxes=word_boxes,
@@ -365,35 +424,29 @@ class DocumentQA:
                 debug=debug,
            )
 
-        result["region"] = region
-        result["page_num"] = region.page.index
+            results = result_obj if isinstance(result_obj, list) else [result_obj]
 
-            end_idx = result["end"]
+            for res in results:
+                res.region = region
+                res.page_num = region.page.index
 
-            # Since word_boxes may have filtered out some elements, we need to map indices
+                if res.found and "start" in res and "end" in res:
+                    start_idx = res.start
+                    end_idx = res.end
 
+                    if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
+                        matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
 
-                if element.text in matched_texts:
-                    matched_texts.remove(element.text)
+                        source_elements = []
+                        for element in elements:
+                            if hasattr(element, "text") and element.text in matched_texts:
+                                source_elements.append(element)
+                                if element.text in matched_texts:
+                                    matched_texts.remove(element.text)
 
+                        res.source_elements = ElementCollection(source_elements)
 
-            return
+            return results if isinstance(question, (list, tuple)) else results[0]
 
         finally:
             # Clean up temporary file
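To make the new batch behaviour concrete, here is a minimal sketch of how the updated `ask_pdf_page` might be called. The file name, the questions, and the bare `DocumentQA()` constructor are illustrative assumptions; only the method signatures and `QAResult` fields shown in the diff above come from the package.

```python
from natural_pdf import PDF
from natural_pdf.qa.document_qa import DocumentQA

pdf = PDF("invoice.pdf")  # hypothetical input file
page = pdf.pages[0]

qa = DocumentQA()  # assumption: the default constructor loads the bundled QA pipeline

# A single string still returns a single QAResult, as before.
single = qa.ask_pdf_page(page, "What is the invoice number?")
print(single.answer, single.confidence, single.found)

# A list of strings is pushed through the pipeline as one batch and
# returns one QAResult per question, in the same order.
results = qa.ask_pdf_page(
    page,
    ["What is the invoice number?", "What is the total amount due?"],
    min_confidence=0.2,
)
for res in results:
    print(f"{res.question!r} -> {res.answer!r} (found={res.found})")
```

Per the docstring above, the batch path runs all questions through one transformers call, so asking several questions about the same page should be noticeably cheaper than looping over single calls.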
natural_pdf/qa/qa_result.py
ADDED
@@ -0,0 +1,55 @@
+class QAResult(dict):
+    """Dictionary-like container for Document QA results with a convenient ``show`` method.
+
+    This class behaves exactly like a regular ``dict`` so existing code that
+    expects a mapping will continue to work. In addition it exposes:
+
+    • ``show()`` – delegates to the underlying ``source_elements.show`` if those
+      elements are present (added automatically by ``ask_pdf_page`` and
+      ``ask_pdf_region``). This provides a quick way to visualise where an
+      answer was found in the document.
+
+    • Attribute access (e.g. ``result.answer``) as sugar for the usual
+      ``result["answer"]``.
+    """
+
+    # ---------------------------------------------------------------------
+    # Convenience helpers
+    # ---------------------------------------------------------------------
+    def show(self, *args, **kwargs):
+        """Display the answer region by delegating to ``source_elements.show``.
+
+        Any positional or keyword arguments are forwarded to
+        ``ElementCollection.show``.
+        """
+        source = self.get("source_elements")
+        if source is None:
+            raise AttributeError(
+                "QAResult does not contain 'source_elements'; nothing to show()."
+            )
+        if not hasattr(source, "show"):
+            raise AttributeError(
+                "'source_elements' object has no 'show' method; cannot visualise."
+            )
+        return source.show(*args, **kwargs)
+
+    # ------------------------------------------------------------------
+    # Attribute <-> key delegation so ``result.answer`` works
+    # ------------------------------------------------------------------
+    def __getattr__(self, item):
+        try:
+            return self[item]
+        except KeyError as exc:
+            raise AttributeError(item) from exc
+
+    def __setattr__(self, key, value):
+        # Store all non-dunder attributes in the underlying mapping so that
+        # they remain serialisable.
+        if key.startswith("__") and key.endswith("__"):
+            super().__setattr__(key, value)
+        else:
+            self[key] = value
+
+    # Ensure ``copy`` keeps the subclass type
+    def copy(self):
+        return QAResult(self)
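Because `QAResult` is a plain `dict` subclass, its added behaviour can be sketched in isolation; the values below are invented purely for illustration.

```python
from natural_pdf.qa.qa_result import QAResult

res = QAResult(question="What is the total?", answer="$1,280.00", confidence=0.93, found=True)

# Dict behaviour is unchanged, so existing code that expects a mapping keeps working.
print(res["answer"])         # $1,280.00
print(sorted(res.keys()))    # ['answer', 'confidence', 'found', 'question']

# Attribute access is sugar for the same keys...
print(res.answer, res.confidence)

# ...and attribute assignment writes through to the mapping, keeping it serialisable.
res.page_num = 0
assert res["page_num"] == 0

# copy() preserves the subclass type.
assert isinstance(res.copy(), QAResult)

# show() only works on results returned by ask_pdf_page/ask_pdf_region, which
# attach source_elements; calling it here would raise AttributeError.
```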
natural_pdf/selectors/parser.py
CHANGED
@@ -698,6 +698,28 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
 
             filter_lambda = contains_check
 
+        # --- Handle :startswith and :starts-with (alias) --- #
+        elif name in ("starts-with", "startswith") and args is not None:
+            filter_name = f"pseudo-class :{name}({args!r})"
+
+            def startswith_check(element, arg=args):
+                if not hasattr(element, "text") or not element.text:
+                    return False
+                return str(element.text).startswith(str(arg))
+
+            filter_lambda = startswith_check
+
+        # --- Handle :endswith and :ends-with (alias) --- #
+        elif name in ("ends-with", "endswith") and args is not None:
+            filter_name = f"pseudo-class :{name}({args!r})"
+
+            def endswith_check(element, arg=args):
+                if not hasattr(element, "text") or not element.text:
+                    return False
+                return str(element.text).endswith(str(arg))
+
+            filter_lambda = endswith_check
+
         elif name == "starts-with" and args is not None:
             filter_lambda = (
                 lambda el, arg=args: hasattr(el, "text")
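A short sketch of how the new pseudo-classes might be used from the selector API. It assumes the same quoted-argument syntax as the existing `:contains()` pseudo-class and a hypothetical `report.pdf`; the alias spellings come straight from the branch conditions above.

```python
from natural_pdf import PDF

pdf = PDF("report.pdf")  # hypothetical file
page = pdf.pages[0]

# Text elements whose text starts with "Total"; :starts-with(...) is the alias.
totals = page.find_all('text:startswith("Total")')

# Text elements whose text ends with "%"; :ends-with(...) also works.
percentages = page.find_all('text:endswith("%")')

print(len(totals), len(percentages))
```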
natural_pdf/utils/text_extraction.py
CHANGED
@@ -63,9 +63,9 @@ def _get_layout_kwargs(
         else:
             logger.warning(f"Ignoring unsupported layout keyword argument: '{key}'")
 
-    # 4. Ensure layout flag is present, defaulting to
+    # 4. Ensure layout flag is present, defaulting to False (caller can override)
     if "layout" not in layout_kwargs:
-        layout_kwargs["layout"] =
+        layout_kwargs["layout"] = False
 
     return layout_kwargs
 
@@ -203,24 +203,42 @@ def generate_text_layout(
         logger.debug("generate_text_layout: No valid character dicts found after filtering.")
         return ""
 
-    use_layout = layout_kwargs.pop("layout", True)  # Extract layout flag, default True
+    # Make a working copy of user_kwargs so we can safely pop custom keys
+    incoming_kwargs = user_kwargs.copy() if user_kwargs else {}
 
+    # --- Handle custom 'strip' option ------------------------------------
+    # * strip=True  – post-process the final string to remove leading/trailing
+    #   whitespace (typically used when layout=False)
+    # * strip=False – preserve whitespace exactly as produced.
+    # Default behaviour depends on the layout flag (see below).
+    explicit_strip_flag = incoming_kwargs.pop("strip", None)  # May be None
+
+    # Prepare layout arguments now that we've removed the non-pdfplumber key
+    layout_kwargs = _get_layout_kwargs(layout_context_bbox, incoming_kwargs)
+    use_layout = layout_kwargs.get("layout", False)
+
+    # Determine final strip behaviour: if caller specified override, honour it;
+    # otherwise default to !use_layout (True when layout=False, False when
+    # layout=True) per user request.
+    strip_result = explicit_strip_flag if explicit_strip_flag is not None else (not use_layout)
 
     try:
-        # Sort chars primarily by top, then x0 before layout analysis
-        #
+        # Sort chars primarily by top, then x0 before layout analysis – required by
+        # pdfplumber so that grouping into lines works deterministically.
         valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
+
+        # Build the text map. `layout_kwargs` still contains the caller-specified or
+        # default "layout" flag, which chars_to_textmap will respect.
         textmap = chars_to_textmap(valid_char_dicts, **layout_kwargs)
         result = textmap.as_string
+
+        # ----------------------------------------------------------------
+        # Optional post-processing strip
+        # ----------------------------------------------------------------
+        if strip_result and isinstance(result, str):
+            # Remove trailing spaces on each line then trim leading/trailing
+            # blank lines for a cleaner output while keeping internal newlines.
+            result = "\n".join(line.rstrip() for line in result.splitlines()).strip()
     except Exception as e:
         # Fallback to simple join on error
         logger.error(f"generate_text_layout: Error calling chars_to_textmap: {e}", exc_info=False)
@@ -230,5 +248,7 @@ def generate_text_layout(
     # Fallback already has sorted characters if layout was attempted
     # Need to use the valid_char_dicts here too
     result = "".join(c.get("text", "") for c in valid_char_dicts)
+    if strip_result:
+        result = result.strip()
 
     return result
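The practical effect is that text generated without layout mode is now stripped of leading/trailing whitespace by default, while an explicit `strip` flag overrides that choice. A sketch of what this could look like from the page-level API, assuming `extract_text` forwards `layout` and `strip` down to `generate_text_layout` (the diff above only shows the lower-level helper):

```python
from natural_pdf import PDF

pdf = PDF("report.pdf")  # hypothetical file
page = pdf.pages[0]

# Default: layout mode off, so the result is stripped of surrounding
# whitespace and of trailing spaces on each line.
plain = page.extract_text()

# Layout mode on: spacing is preserved and nothing is stripped by default.
layout_text = page.extract_text(layout=True)

# Explicit override: layout off but raw whitespace kept as produced.
raw = page.extract_text(strip=False)

print(repr(plain[:80]))
```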
{natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.1.22
+Version: 0.1.24
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: markdown
 Requires-Dist: pandas
 Requires-Dist: pdfplumber
 Requires-Dist: colormath2
@@ -22,12 +23,6 @@ Requires-Dist: tqdm
 Requires-Dist: pydantic
 Requires-Dist: jenkspy
 Requires-Dist: scipy
-Requires-Dist: torch
-Requires-Dist: torchvision
-Requires-Dist: transformers[sentencepiece]
-Requires-Dist: huggingface_hub>=0.29.3
-Requires-Dist: sentence-transformers
-Requires-Dist: timm
 Requires-Dist: ipywidgets>=7.0.0
 Provides-Extra: test
 Requires-Dist: pytest; extra == "test"
@@ -57,6 +52,7 @@ Requires-Dist: natural-pdf[test]; extra == "all"
 Requires-Dist: natural-pdf[search]; extra == "all"
 Requires-Dist: natural-pdf[favorites]; extra == "all"
 Requires-Dist: natural-pdf[export-extras]; extra == "all"
+Requires-Dist: natural-pdf[ai]; extra == "all"
 Provides-Extra: deskew
 Requires-Dist: deskew>=1.5; extra == "deskew"
 Requires-Dist: img2pdf; extra == "deskew"
@@ -68,6 +64,15 @@ Requires-Dist: pikepdf; extra == "ocr-export"
 Provides-Extra: export-extras
 Requires-Dist: jupytext; extra == "export-extras"
 Requires-Dist: nbformat; extra == "export-extras"
+Provides-Extra: ai
+Requires-Dist: sentence-transformers; extra == "ai"
+Requires-Dist: torch; extra == "ai"
+Requires-Dist: torchvision; extra == "ai"
+Requires-Dist: transformers[sentencepiece]; extra == "ai"
+Requires-Dist: huggingface_hub>=0.29.3; extra == "ai"
+Requires-Dist: timm; extra == "ai"
+Requires-Dist: doclayout_yolo; extra == "ai"
+Requires-Dist: easyocr; extra == "ai"
 Dynamic: license-file
 
 # Natural PDF
@@ -87,25 +92,29 @@ Natural PDF lets you find and extract content from PDFs using simple code that m
 pip install natural-pdf
 ```
 
-Need OCR engines, layout models, or other heavy add-ons? Install the **core** once, then use the helper
+Need OCR engines, layout models, or other heavy add-ons? Install the **core** once, then use the helper `npdf` command to pull in exactly what you need:
 
 ```bash
-#
-npdf install
+# Everything you need for classification, document-QA, semantic search, etc.
+npdf install ai
 
 # Surya OCR and the YOLO Doc-Layout detector in one go
 npdf install surya yolo
 
+# add PaddleOCR (+paddlex) after the fact
+npdf install paddle
+
 # see what's already on your machine
 npdf list
 ```
 
-classic
+Lightweight extras such as `deskew` or `search` can still be added with
+classic `pip install`:
 
 ```bash
 pip install "natural-pdf[deskew]"
 pip install "natural-pdf[search]"
+pip install "natural-pdf[ai]"
 ```
 
 More details in the [installation guide](https://jsoma.github.io/natural-pdf/installation/).
@@ -116,7 +125,7 @@ More details in the [installation guide](https://jsoma.github.io/natural-pdf/ins
 from natural_pdf import PDF
 
 # Open a PDF
-pdf = PDF('
+pdf = PDF('https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf')
 page = pdf.pages[0]
 
 # Extract all of the text on the page