PyPI - natural-pdf - Versions diffs - 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl - Mend

natural-pdf 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

natural_pdf/core/page.py CHANGED Viewed

@@ -1138,31 +1138,145 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
         logger.debug(f"Page {self.number}: extract_text finished, result length: {len(result)}.")
         return result
-    def extract_table(self, table_settings={}) -> List[Any]:
+    def extract_table(
+        self,
+        method: Optional[str] = None,
+        table_settings: Optional[dict] = None,
+        use_ocr: bool = False,
+        ocr_config: Optional[dict] = None,
+        text_options: Optional[Dict] = None,
+        cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
+        show_progress: bool = False,
+    ) -> List[List[Optional[str]]]:
         """
-        Extract the largest table from this page.
+        Extract the largest table from this page using enhanced region-based extraction.
         Args:
-            table_settings: Additional extraction parameters
+            method: Method to use: 'tatr', 'pdfplumber', 'text', 'stream', 'lattice', or None (auto-detect).
+            table_settings: Settings for pdfplumber table extraction.
+            use_ocr: Whether to use OCR for text extraction (currently only applicable with 'tatr' method).
+            ocr_config: OCR configuration parameters.
+            text_options: Dictionary of options for the 'text' method.
+            cell_extraction_func: Optional callable function that takes a cell Region object
+                                  and returns its string content. For 'text' method only.
+            show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
         Returns:
-            List of extracted tables (or None if no table found)
+            Table data as a list of rows, where each row is a list of cell values (str or None).
         """
-        # pdfplumber returns None if no table found
-        return self._page.extract_table(table_settings)
+        # Create a full-page region and delegate to its enhanced extract_table method
+        page_region = self.create_region(0, 0, self.width, self.height)
+        return page_region.extract_table(
+            method=method,
+            table_settings=table_settings,
+            use_ocr=use_ocr,
+            ocr_config=ocr_config,
+            text_options=text_options,
+            cell_extraction_func=cell_extraction_func,
+            show_progress=show_progress,
+        )
-    def extract_tables(self, table_settings={}) -> List[Any]:
+    def extract_tables(
+        self,
+        method: Optional[str] = None,
+        table_settings: Optional[dict] = None,
+        check_tatr: bool = True,
+    ) -> List[List[List[str]]]:
         """
-        Extract tables from this page.
+        Extract all tables from this page with enhanced method support.
         Args:
-            table_settings: Additional extraction parameters
+            method: Method to use: 'pdfplumber', 'stream', 'lattice', or None (auto-detect).
+                    'stream' uses text-based strategies, 'lattice' uses line-based strategies.
+                    Note: 'tatr' and 'text' methods are not supported for extract_tables.
+            table_settings: Settings for pdfplumber table extraction.
+            check_tatr: If True (default), first check for TATR-detected table regions
+                        and extract from those before falling back to pdfplumber methods.
         Returns:
-            List of extracted tables
+            List of tables, where each table is a list of rows, and each row is a list of cell values.
         """
-        # pdfplumber returns list of tables
-        return self._page.extract_tables(table_settings)
+        if table_settings is None:
+            table_settings = {}
+        # Check for TATR-detected table regions first if enabled
+        if check_tatr:
+            try:
+                tatr_tables = self.find_all("region[type=table][model=tatr]")
+                if tatr_tables:
+                    logger.debug(f"Page {self.number}: Found {len(tatr_tables)} TATR table regions, extracting from those...")
+                    extracted_tables = []
+                    for table_region in tatr_tables:
+                        try:
+                            table_data = table_region.extract_table(method="tatr")
+                            if table_data:  # Only add non-empty tables
+                                extracted_tables.append(table_data)
+                        except Exception as e:
+                            logger.warning(f"Failed to extract table from TATR region {table_region.bbox}: {e}")
+                    if extracted_tables:
+                        logger.debug(f"Page {self.number}: Successfully extracted {len(extracted_tables)} tables from TATR regions")
+                        return extracted_tables
+                    else:
+                        logger.debug(f"Page {self.number}: TATR regions found but no tables extracted, falling back to pdfplumber")
+                else:
+                    logger.debug(f"Page {self.number}: No TATR table regions found, using pdfplumber methods")
+            except Exception as e:
+                logger.debug(f"Page {self.number}: Error checking TATR regions: {e}, falling back to pdfplumber")
+        # Auto-detect method if not specified (try lattice first, then stream)
+        if method is None:
+            logger.debug(f"Page {self.number}: Auto-detecting tables extraction method...")
+            # Try lattice first
+            try:
+                lattice_settings = table_settings.copy()
+                lattice_settings.setdefault("vertical_strategy", "lines")
+                lattice_settings.setdefault("horizontal_strategy", "lines")
+                logger.debug(f"Page {self.number}: Trying 'lattice' method first for tables...")
+                lattice_result = self._page.extract_tables(lattice_settings)
+                # Check if lattice found meaningful tables
+                if (lattice_result and len(lattice_result) > 0 and
+                    any(any(any(cell and cell.strip() for cell in row if cell) for row in table if table) for table in lattice_result)):
+                    logger.debug(f"Page {self.number}: 'lattice' method found {len(lattice_result)} tables")
+                    return lattice_result
+                else:
+                    logger.debug(f"Page {self.number}: 'lattice' method found no meaningful tables")
+            except Exception as e:
+                logger.debug(f"Page {self.number}: 'lattice' method failed: {e}")
+            # Fall back to stream
+            logger.debug(f"Page {self.number}: Falling back to 'stream' method for tables...")
+            stream_settings = table_settings.copy()
+            stream_settings.setdefault("vertical_strategy", "text")
+            stream_settings.setdefault("horizontal_strategy", "text")
+            return self._page.extract_tables(stream_settings)
+        effective_method = method
+        # Handle method aliases
+        if effective_method == "stream":
+            logger.debug("Using 'stream' method alias for 'pdfplumber' with text-based strategies.")
+            effective_method = "pdfplumber"
+            table_settings.setdefault("vertical_strategy", "text")
+            table_settings.setdefault("horizontal_strategy", "text")
+        elif effective_method == "lattice":
+            logger.debug("Using 'lattice' method alias for 'pdfplumber' with line-based strategies.")
+            effective_method = "pdfplumber"
+            table_settings.setdefault("vertical_strategy", "lines")
+            table_settings.setdefault("horizontal_strategy", "lines")
+        # Use the selected method
+        if effective_method == "pdfplumber":
+            return self._page.extract_tables(table_settings)
+        else:
+            raise ValueError(
+                f"Unknown tables extraction method: '{method}'. Choose from 'pdfplumber', 'stream', 'lattice'."
+            )
     def _load_elements(self):
         """Load all elements from the page via ElementManager."""

natural_pdf/elements/region.py CHANGED Viewed

@@ -1247,8 +1247,12 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         Extract a table from this region.
         Args:
-            method: Method to use: 'tatr', 'plumber', 'text', or None (auto-detect).
-            table_settings: Settings for pdfplumber table extraction (used only with 'plumber' method).
+            method: Method to use: 'tatr', 'pdfplumber', 'text', 'stream', 'lattice', or None (auto-detect).
+                    'stream' is an alias for 'pdfplumber' with text-based strategies (equivalent to
+                    setting `vertical_strategy` and `horizontal_strategy` to 'text').
+                    'lattice' is an alias for 'pdfplumber' with line-based strategies (equivalent to
+                    setting `vertical_strategy` and `horizontal_strategy` to 'lines').
+            table_settings: Settings for pdfplumber table extraction (used with 'pdfplumber', 'stream', or 'lattice' methods).
             use_ocr: Whether to use OCR for text extraction (currently only applicable with 'tatr' method).
             ocr_config: OCR configuration parameters.
             text_options: Dictionary of options for the 'text' method, corresponding to arguments
@@ -1268,13 +1272,47 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             text_options = {}  # Initialize empty dict
         # Auto-detect method if not specified
-        effective_method = method
-        if effective_method is None:
+        if method is None:
             # If this is a TATR-detected region, use TATR method
             if hasattr(self, "model") and self.model == "tatr" and self.region_type == "table":
                 effective_method = "tatr"
             else:
-                effective_method = "plumber"
+                # Try lattice first, then fall back to stream if no meaningful results
+                logger.debug(f"Region {self.bbox}: Auto-detecting table extraction method...")
+                try:
+                    logger.debug(f"Region {self.bbox}: Trying 'lattice' method first...")
+                    lattice_result = self.extract_table('lattice', table_settings=table_settings.copy())
+                    # Check if lattice found meaningful content
+                    if (lattice_result and len(lattice_result) > 0 and
+                        any(any(cell and cell.strip() for cell in row if cell) for row in lattice_result)):
+                        logger.debug(f"Region {self.bbox}: 'lattice' method found table with {len(lattice_result)} rows")
+                        return lattice_result
+                    else:
+                        logger.debug(f"Region {self.bbox}: 'lattice' method found no meaningful content")
+                except Exception as e:
+                    logger.debug(f"Region {self.bbox}: 'lattice' method failed: {e}")
+                # Fall back to stream
+                logger.debug(f"Region {self.bbox}: Falling back to 'stream' method...")
+                return self.extract_table('stream', table_settings=table_settings.copy())
+        else:
+            effective_method = method
+        # Handle method aliases for pdfplumber
+        if effective_method == "stream":
+            logger.debug("Using 'stream' method alias for 'pdfplumber' with text-based strategies.")
+            effective_method = "pdfplumber"
+            # Set default text strategies if not already provided by the user
+            table_settings.setdefault("vertical_strategy", "text")
+            table_settings.setdefault("horizontal_strategy", "text")
+        elif effective_method == "lattice":
+            logger.debug("Using 'lattice' method alias for 'pdfplumber' with line-based strategies.")
+            effective_method = "pdfplumber"
+            # Set default line strategies if not already provided by the user
+            table_settings.setdefault("vertical_strategy", "lines")
+            table_settings.setdefault("horizontal_strategy", "lines")
         logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
@@ -1284,16 +1322,111 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         elif effective_method == "text":
             current_text_options = text_options.copy()
             current_text_options["cell_extraction_func"] = cell_extraction_func
-            # --- Pass show_progress to the helper --- #
             current_text_options["show_progress"] = show_progress
             return self._extract_table_text(**current_text_options)
-        elif effective_method == "plumber":
+        elif effective_method == "pdfplumber":
             return self._extract_table_plumber(table_settings)
         else:
             raise ValueError(
-                f"Unknown table extraction method: '{effective_method}'. Choose from 'tatr', 'plumber', 'text'."
+                f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
             )
+    def extract_tables(
+        self,
+        method: Optional[str] = None,
+        table_settings: Optional[dict] = None,
+    ) -> List[List[List[str]]]:
+        """
+        Extract all tables from this region using pdfplumber-based methods.
+        Note: Only 'pdfplumber', 'stream', and 'lattice' methods are supported for extract_tables.
+        'tatr' and 'text' methods are designed for single table extraction only.
+        Args:
+            method: Method to use: 'pdfplumber', 'stream', 'lattice', or None (auto-detect).
+                    'stream' uses text-based strategies, 'lattice' uses line-based strategies.
+            table_settings: Settings for pdfplumber table extraction.
+        Returns:
+            List of tables, where each table is a list of rows, and each row is a list of cell values.
+        """
+        if table_settings is None:
+            table_settings = {}
+        # Auto-detect method if not specified (try lattice first, then stream)
+        if method is None:
+            logger.debug(f"Region {self.bbox}: Auto-detecting tables extraction method...")
+            # Try lattice first
+            try:
+                lattice_settings = table_settings.copy()
+                lattice_settings.setdefault("vertical_strategy", "lines")
+                lattice_settings.setdefault("horizontal_strategy", "lines")
+                logger.debug(f"Region {self.bbox}: Trying 'lattice' method first for tables...")
+                lattice_result = self._extract_tables_plumber(lattice_settings)
+                # Check if lattice found meaningful tables
+                if (lattice_result and len(lattice_result) > 0 and
+                    any(any(any(cell and cell.strip() for cell in row if cell) for row in table if table) for table in lattice_result)):
+                    logger.debug(f"Region {self.bbox}: 'lattice' method found {len(lattice_result)} tables")
+                    return lattice_result
+                else:
+                    logger.debug(f"Region {self.bbox}: 'lattice' method found no meaningful tables")
+            except Exception as e:
+                logger.debug(f"Region {self.bbox}: 'lattice' method failed: {e}")
+            # Fall back to stream
+            logger.debug(f"Region {self.bbox}: Falling back to 'stream' method for tables...")
+            stream_settings = table_settings.copy()
+            stream_settings.setdefault("vertical_strategy", "text")
+            stream_settings.setdefault("horizontal_strategy", "text")
+            return self._extract_tables_plumber(stream_settings)
+        effective_method = method
+        # Handle method aliases
+        if effective_method == "stream":
+            logger.debug("Using 'stream' method alias for 'pdfplumber' with text-based strategies.")
+            effective_method = "pdfplumber"
+            table_settings.setdefault("vertical_strategy", "text")
+            table_settings.setdefault("horizontal_strategy", "text")
+        elif effective_method == "lattice":
+            logger.debug("Using 'lattice' method alias for 'pdfplumber' with line-based strategies.")
+            effective_method = "pdfplumber"
+            table_settings.setdefault("vertical_strategy", "lines")
+            table_settings.setdefault("horizontal_strategy", "lines")
+        # Use the selected method
+        if effective_method == "pdfplumber":
+            return self._extract_tables_plumber(table_settings)
+        else:
+            raise ValueError(
+                f"Unknown tables extraction method: '{method}'. Choose from 'pdfplumber', 'stream', 'lattice'."
+            )
+    def _extract_tables_plumber(self, table_settings: dict) -> List[List[List[str]]]:
+        """
+        Extract all tables using pdfplumber's table extraction.
+        Args:
+            table_settings: Settings for pdfplumber table extraction
+        Returns:
+            List of tables, where each table is a list of rows, and each row is a list of cell values
+        """
+        # Create a crop of the page for this region
+        cropped = self.page._page.crop(self.bbox)
+        # Extract all tables from the cropped area
+        tables = cropped.extract_tables(table_settings)
+        # Return the tables or an empty list if none found
+        return tables if tables else []
     def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
         """
         Extract table using pdfplumber's table extraction.

{natural_pdf-0.1.14.dist-info → natural_pdf-0.1.15.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.1.14
+Version: 0.1.15
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT

{natural_pdf-0.1.14.dist-info → natural_pdf-0.1.15.dist-info}/RECORD RENAMED Viewed

@@ -24,14 +24,14 @@ natural_pdf/collections/pdf_collection.py,sha256=nsbrzcsXAD2qVLLXhDYpljAb_WnjMNa
 natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
 natural_pdf/core/element_manager.py,sha256=_UdXu51sLi6STzc8Pj4k8R721G3yJixXDLuRHn3hmr8,25731
 natural_pdf/core/highlighting_service.py,sha256=tjMJpdJj2oaMGpdqiNHPcTJqID4nd-uBZ5v7KtPmoc0,36762
-natural_pdf/core/page.py,sha256=hg7EoYMbvgo9dXivBl6xb6dENobhSHt0Wuu36O5J900,111119
+natural_pdf/core/page.py,sha256=M-KgTxceFebw0n1BehFAeQ0sxnCpIr9dZX10k2OJzUY,117518
 natural_pdf/core/pdf.py,sha256=395aBTg4Le4vABvQWgBhPm669nGJ8JdMToTs1UtQ2Vg,69575
 natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
 natural_pdf/elements/base.py,sha256=NNF-iUzkip0UgfKTuqLE1jVJsq2yD7LUTvOQWMi_Jpc,39631
 natural_pdf/elements/collections.py,sha256=qd58tD3f-eojz90ICytlqu4Ej0OQoWgsxV4umQDhUvA,120809
 natural_pdf/elements/line.py,sha256=300kSFBDUBIudfeQtH_tzW9gTYRgRKUDPiTABw6J-BE,4782
 natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
-natural_pdf/elements/region.py,sha256=wBBAcuudRqL1b9ojLdrXiwUIcQbTWEWTky_RbBuCgnU,115798
+natural_pdf/elements/region.py,sha256=l9J6E7bAkxZoA603cfPKG1LuU7uRUPl4PArUBkuk7VI,122719
 natural_pdf/elements/text.py,sha256=13HvVZGinj2Vm_fFCAnqi7hohtoKvnpCp3VCfkpeAbc,11146
 natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
 natural_pdf/exporters/__init__.py,sha256=7MnvRLLQdwtg-ULu-8uK8C84GsKiJamyhRw_GgWhw7k,151
@@ -84,8 +84,8 @@ natural_pdf/utils/text_extraction.py,sha256=z6Jhy11pakYCsEpkvh8ldw6DkUFsYF1hCL9Y
 natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
 natural_pdf/widgets/__init__.py,sha256=O2fSDo604wDAP6UwUkmBq3eT91RSqHwBpAOQXq92S8s,214
 natural_pdf/widgets/viewer.py,sha256=ekgXTEfA48GrR-JjpCpgyBCXdf4IubV0pAXDJozcU7A,39196
-natural_pdf-0.1.14.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
-natural_pdf-0.1.14.dist-info/METADATA,sha256=NzaR_hcSyFH22knKZ-NMCct_XOo2nPUk83XHspTncyE,7674
-natural_pdf-0.1.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-natural_pdf-0.1.14.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
-natural_pdf-0.1.14.dist-info/RECORD,,
+natural_pdf-0.1.15.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.1.15.dist-info/METADATA,sha256=O8RUOiFgln7unuRhKey0Z6l90K71ktMY7WwpaiEyZdc,7674
+natural_pdf-0.1.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+natural_pdf-0.1.15.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
+natural_pdf-0.1.15.dist-info/RECORD,,

{natural_pdf-0.1.14.dist-info → natural_pdf-0.1.15.dist-info}/WHEEL RENAMED Viewed

File without changes

{natural_pdf-0.1.14.dist-info → natural_pdf-0.1.15.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{natural_pdf-0.1.14.dist-info → natural_pdf-0.1.15.dist-info}/top_level.txt RENAMED Viewed

File without changes

natural-pdf 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl

natural-pdf 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl