PyPI - pdfalyzer - Versions diffs - 1.17.3__tar.gz → 1.17.5__tar.gz - Mend

pdfalyzer 1.17.3 (tar.gz) → 1.17.5 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pdfalyzer might be problematic. Click here for more details.

Files changed (52) [hide | show]

{pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/CHANGELOG.md RENAMED Viewed

@@ -1,5 +1,11 @@
 # NEXT RELEASE
+### 1.17.5
+* Fix `PIL` lazy import
+### 1.17.4
+* Make `PIL` a lazy import so installs without `[extract]` extras don't fail
 ### 1.17.3
 * Put back `--debug` arg for CLI tools

{pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pdfalyzer
-Version: 1.17.3
+Version: 1.17.5
 Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
 Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
 License: GPL-3.0-or-later

{pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/decorators/pdf_file.py RENAMED Viewed

@@ -20,6 +20,8 @@ from pdfalyzer.helpers.rich_text_helper import (attention_getting_panel, error_t
 from pdfalyzer.helpers.string_helper import exception_str
 from pdfalyzer.util.page_range import PageRange
+DEPENDENCY_ERROR_MSG = "Pdfalyzer is missing an optional dependency required to extract text. " + \
+                       "Try 'pip install pdfalyzer[extract]'"
 DEFAULT_PDF_ERRORS_DIR = Path.cwd().joinpath('pdf_errors')
 MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR = 1024 * 1024 * 20
@@ -54,11 +56,11 @@ class PdfFile:
         self.file_size = self.file_path.stat().st_size
     def extract_page_range(
-            self,
-            page_range: PageRange,
-            destination_dir: Optional[Path] = None,
-            extra_file_suffix: Optional[str] = None
-        ) -> Path:
+        self,
+        page_range: PageRange,
+        destination_dir: Optional[Path] = None,
+        extra_file_suffix: Optional[str] = None
+    ) -> Path:
         """
         Extract a range of pages to a new PDF file.
@@ -86,7 +88,7 @@ class PdfFile:
         extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
         extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
-        console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'...")
+        console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'")
         pdf_writer = PdfWriter()
         with open(self.file_path, 'rb') as source_pdf:
@@ -99,11 +101,11 @@ class PdfFile:
         return extracted_pages_pdf_path
     def extract_text(
-            self,
-            page_range: Optional[PageRange] = None,
-            logger: Optional[Logger] = None,
-            print_as_parsed: bool = False
-        ) -> Optional[str]:
+        self,
+        page_range: Optional[PageRange] = None,
+        logger: Optional[Logger] = None,
+        print_as_parsed: bool = False
+    ) -> Optional[str]:
         """
         Use PyPDF to extract text page by page and use Tesseract to OCR any embedded images.
@@ -167,7 +169,7 @@ class PdfFile:
                 if print_as_parsed:
                     print(f"{page_text}")
         except DependencyError:
-            log.error("Pdfalyzer is missing an optional dependency required to extract text. Try 'pip install pdfalyzer[extract]'")
+            log.error(DEPENDENCY_ERROR_MSG)
         except EmptyFileError:
             log.warning("Skipping empty file!")
         except PdfStreamError as e:
@@ -190,7 +192,8 @@ class PdfFile:
         try:
             extracted_file = self.extract_page_range(PageRange(str(page_number)), destination_dir, error_msg)
-        except Exception as e:
+        except Exception:
+            stderr_console.print_exception()
             stderr_console.print(error_text(f"Failed to extract a page for submission to PyPDF team."))
             extracted_file = None

{pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/helpers/image_helper.py RENAMED Viewed

@@ -1,19 +1,19 @@
 from typing import Optional
-from PIL import Image
 from yaralyzer.output.rich_console import console
 from pdfalyzer.helpers.rich_text_helper import warning_text
-def ocr_text(image: Image.Image, image_name: str) -> Optional[str]:
+def ocr_text(image: "Image.Image", image_name: str) -> Optional[str]:
     """Use pytesseract to OCR the text in the image and return it as a string."""
     import pytesseract
+    from PIL import Image
     text = None
     try:
         text = pytesseract.image_to_string(image)
-    except pytesseract.pytesseract.TesseractError as e:
+    except pytesseract.pytesseract.TesseractError:
         console.print_exception()
         console.print(warning_text(f"Tesseract OCR failure '{image_name}'! No OCR text extracted..."))
     except OSError as e:

{pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/util/cli_tools_argument_parser.py RENAMED Viewed

@@ -133,7 +133,7 @@ extract_text_parser.add_argument('--page-range', '-r',
 extract_text_parser.add_argument('--print-as-parsed', '-p',
                                  action='store_true',
-                                 help='print pages as they are parsed instead of waiting until document is fully parsed')
+                                 help='print pages as they are parsed instead of waiting until parsing complete')
 def parse_text_extraction_args() -> Namespace:

{pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdfalyzer"
-version = "1.17.3"
+version = "1.17.5"
 description = "Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more."
 authors = ["Michel de Cryptadamus <michel@cryptadamus.com>"]
 license = "GPL-3.0-or-later"