PyPI - pdfalyzer - Versions diffs - 1.17.2__py3-none-any.whl → 1.17.4__py3-none-any.whl - Mend

pdfalyzer 1.17.2__py3-none-any.whl → 1.17.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pdfalyzer might be problematic. Click here for more details.

Files changed (9) hide show

CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,11 @@
 # NEXT RELEASE
+### 1.17.4
+* Make `PIL` a lazy import so installs without `[extract]` extras don't fail
+### 1.17.3
+* Put back `--debug` arg for CLI tools
 ### 1.17.2
 * Remove unused `--debug` args for CLI tools
 * Rename `extract_text_from_pdfs` to `extract_pdf_text`

pdfalyzer/decorators/pdf_file.py CHANGED Viewed

@@ -20,6 +20,8 @@ from pdfalyzer.helpers.rich_text_helper import (attention_getting_panel, error_t
 from pdfalyzer.helpers.string_helper import exception_str
 from pdfalyzer.util.page_range import PageRange
+DEPENDENCY_ERROR_MSG = "Pdfalyzer is missing an optional dependency required to extract text. " + \
+                       "Try 'pip install pdfalyzer[extract]'"
 DEFAULT_PDF_ERRORS_DIR = Path.cwd().joinpath('pdf_errors')
 MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR = 1024 * 1024 * 20
@@ -54,11 +56,11 @@ class PdfFile:
         self.file_size = self.file_path.stat().st_size
     def extract_page_range(
-            self,
-            page_range: PageRange,
-            destination_dir: Optional[Path] = None,
-            extra_file_suffix: Optional[str] = None
-        ) -> Path:
+        self,
+        page_range: PageRange,
+        destination_dir: Optional[Path] = None,
+        extra_file_suffix: Optional[str] = None
+    ) -> Path:
         """
         Extract a range of pages to a new PDF file.
@@ -86,7 +88,7 @@ class PdfFile:
         extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
         extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
-        console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'...")
+        console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'")
         pdf_writer = PdfWriter()
         with open(self.file_path, 'rb') as source_pdf:
@@ -99,11 +101,11 @@ class PdfFile:
         return extracted_pages_pdf_path
     def extract_text(
-            self,
-            page_range: Optional[PageRange] = None,
-            logger: Optional[Logger] = None,
-            print_as_parsed: bool = False
-        ) -> Optional[str]:
+        self,
+        page_range: Optional[PageRange] = None,
+        logger: Optional[Logger] = None,
+        print_as_parsed: bool = False
+    ) -> Optional[str]:
         """
         Use PyPDF to extract text page by page and use Tesseract to OCR any embedded images.
@@ -167,7 +169,7 @@ class PdfFile:
                 if print_as_parsed:
                     print(f"{page_text}")
         except DependencyError:
-            log.error("Pdfalyzer is missing an optional dependency required to extract text. Try 'pip install pdfalyzer[extract]'")
+            log.error(DEPENDENCY_ERROR_MSG)
         except EmptyFileError:
             log.warning("Skipping empty file!")
         except PdfStreamError as e:
@@ -190,7 +192,8 @@ class PdfFile:
         try:
             extracted_file = self.extract_page_range(PageRange(str(page_number)), destination_dir, error_msg)
-        except Exception as e:
+        except Exception:
+            stderr_console.print_exception()
             stderr_console.print(error_text(f"Failed to extract a page for submission to PyPDF team."))
             extracted_file = None
@@ -215,10 +218,3 @@ class PdfFile:
             return
         stderr_console.print(msg, style=style or "")
-    # def _num_pages(self) -> int:
-    #     pdf_reader = PdfReader(self.file_path)
-    #     page_count = len(pdf_reader.pages)
-    #         log.debug(f"PDF Page count: {page_count}")
-    #         for page_number, page in enumerate(pdf_reader.pages, start=1):

pdfalyzer/helpers/image_helper.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from typing import Optional
-from PIL import Image
 from yaralyzer.output.rich_console import console
 from pdfalyzer.helpers.rich_text_helper import warning_text
@@ -9,11 +8,12 @@ from pdfalyzer.helpers.rich_text_helper import warning_text
 def ocr_text(image: Image.Image, image_name: str) -> Optional[str]:
     """Use pytesseract to OCR the text in the image and return it as a string."""
     import pytesseract
+    from PIL import Image
     text = None
     try:
         text = pytesseract.image_to_string(image)
-    except pytesseract.pytesseract.TesseractError as e:
+    except pytesseract.pytesseract.TesseractError:
         console.print_exception()
         console.print(warning_text(f"Tesseract OCR failure '{image_name}'! No OCR text extracted..."))
     except OSError as e:

pdfalyzer/util/cli_tools_argument_parser.py CHANGED Viewed

@@ -5,6 +5,7 @@ Argument parsers for the command line tools other than `pdfalyze` that are inclu
 2. extract_pdf_pages
 3. extract_pdf_text
 """
+import logging
 import sys
 from argparse import ArgumentParser, Namespace
 from pathlib import Path
@@ -88,6 +89,7 @@ extract_pdf_parser = ArgumentParser(
 )
 extract_pdf_parser.add_argument('pdf_file', metavar='PDF_FILE', help='PDF to extract pages from')
+extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
 extract_pdf_parser.add_argument('--page-range', '-r',
                                 type=page_range_validator,
@@ -109,6 +111,7 @@ def parse_pdf_page_extraction_args() -> Namespace:
         log.error(f"Destination dir '{args.destination_dir}' does not exist.")
         sys.exit(1)
+    _set_log_level(args)
     return args
@@ -122,6 +125,7 @@ extract_text_parser = ArgumentParser(
 )
 extract_text_parser.add_argument('file_or_dir', nargs='+', metavar='FILE_OR_DIR')
+extract_text_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
 extract_text_parser.add_argument('--page-range', '-r',
                                  type=page_range_validator,
@@ -129,7 +133,7 @@ extract_text_parser.add_argument('--page-range', '-r',
 extract_text_parser.add_argument('--print-as-parsed', '-p',
                                  action='store_true',
-                                 help='print pages as they are parsed instead of waiting until document is fully parsed')
+                                 help='print pages as they are parsed instead of waiting until parsing complete')
 def parse_text_extraction_args() -> Namespace:
@@ -151,4 +155,10 @@ def parse_text_extraction_args() -> Namespace:
         log.error(f"--page-range can only be specified for a single PDF")
         sys.exit(-1)
+    _set_log_level(args)
     return args
+def _set_log_level(args: Namespace):
+    if args.debug:
+        log.setLevel(logging.DEBUG)

{pdfalyzer-1.17.2.dist-info → pdfalyzer-1.17.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pdfalyzer
-Version: 1.17.2
+Version: 1.17.4
 Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
 Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
 License: GPL-3.0-or-later

{pdfalyzer-1.17.2.dist-info → pdfalyzer-1.17.4.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
 .pdfalyzer.example,sha256=sh_qkUBw4hfJia_Dx2wB-fsqJInhx2sSgA7WJz3MHYo,3917
-CHANGELOG.md,sha256=Zzq920SR4HeZjCDlqqVoLCPrc1hd5rqJQxih5p3aK1I,13250
+CHANGELOG.md,sha256=dyXJVhpeNYDdeh8Ugfl7co6v86ksu_AtNOYKEm2U5TI,13390
 LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
 pdfalyzer/__init__.py,sha256=2Gikt_-OSXZqeQij4wSwb65g7jycVAupjeFmXBf51lo,6159
 pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
@@ -7,7 +7,7 @@ pdfalyzer/binary/binary_scanner.py,sha256=gEHHdQ3lKe1P1441fk2OLO8GxJF7FxF8Tq04El
 pdfalyzer/config.py,sha256=RBtp3Q6IvOW922rcanB4mVceJEM0BEjNybc6_vC7efY,2122
 pdfalyzer/decorators/document_model_printer.py,sha256=2tjJItZltukmOD2wjGvl2IsBM7Gug59wMHGLpzGDbV0,2682
 pdfalyzer/decorators/indeterminate_node.py,sha256=QLJr-nGKih8gPZcIqxLU028OwWWD5VjNHYMUjniwT_k,6586
-pdfalyzer/decorators/pdf_file.py,sha256=KstGLrBHGio867l-ieVWR_Sps9Yy5z29Q0jBun9g68o,10746
+pdfalyzer/decorators/pdf_file.py,sha256=ryAYzzsO8Fw5_ZMoomruW0Bal8pTb5C0VlLOTjdVqNI,10552
 pdfalyzer/decorators/pdf_object_properties.py,sha256=Il3RObxQ4XUf0Ei-nd4tjJO0LeaxC6u7yFa3cQs_jVY,5485
 pdfalyzer/decorators/pdf_tree_node.py,sha256=4LReGJUtG8iEcLUQD1jW-yp3xPWsHrC-3Anbkt7XZ3A,11134
 pdfalyzer/decorators/pdf_tree_verifier.py,sha256=2hVe9APsAWQZ7ra8AGndHQnGWmmxmb3ZwfJHZuLvsvc,4714
@@ -18,7 +18,7 @@ pdfalyzer/detection/yaralyzer_helper.py,sha256=bfIa18f0zdUoOAJIQcwVDjF52sJNV47Nd
 pdfalyzer/font_info.py,sha256=2R85iETY_1eKCeRrkqeIxfPDqXZyWfCNcHx_-aTyF0s,6682
 pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
 pdfalyzer/helpers/filesystem_helper.py,sha256=QCdUcZlufBWaV38LlrQEGUGOGUtZEozMVSRkoTjwKlU,5046
-pdfalyzer/helpers/image_helper.py,sha256=QjoAUcKKEtpmuEyOmEfmaUN-lzNykQ1SzqgNn9-R4Y0,1120
+pdfalyzer/helpers/image_helper.py,sha256=E3Mby-KG-1eIYThuYqXEkwG1mnhY0imvrpiO8N8otfQ,1119
 pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
 pdfalyzer/helpers/pdf_object_helper.py,sha256=65BlUgnDM9brxJFw_WF8QLomWHaNh-pj88NWoxcMkoQ,1160
 pdfalyzer/helpers/rich_text_helper.py,sha256=Q5Zj0I96ymQmDWHkOX4lWEvkizOMMgzYNx4CF35t_7w,3561
@@ -36,7 +36,7 @@ pdfalyzer/pdf_object_relationship.py,sha256=tOJTp73m82oNZJB7NxvTLv167kqthH8PRbVn
 pdfalyzer/pdfalyzer.py,sha256=T5U8MZRlL0Kn-GJVqfIIoL7eBUdQeteu50VhQ-IoYh0,11214
 pdfalyzer/util/adobe_strings.py,sha256=eF4K1RhhR_qgMBT58MzCxWkqQj17OWLCNmG3SjZ9BUs,5045
 pdfalyzer/util/argument_parser.py,sha256=9ixQMEZz00IPK8xRxuS7DMrQgXIrqhB2ve5W8XTZ1S8,9732
-pdfalyzer/util/cli_tools_argument_parser.py,sha256=8bW0B4UXXkwuxNwDRe9MyXucTFm0pZRyOq98CO8D_Hs,5925
+pdfalyzer/util/cli_tools_argument_parser.py,sha256=HyZhztyrPtbvOswmG975M0tK5KPon37lV3fxVA0OwYo,6277
 pdfalyzer/util/debugging.py,sha256=hjYGxptJmal9TTaAUkkoo0oNu2tdx6ZYSyC0WjvzHh0,156
 pdfalyzer/util/exceptions.py,sha256=XLFFTdx1n6i_VCmvuzvIOCa-djJvGEitfo9lhy3zq0k,98
 pdfalyzer/util/page_range.py,sha256=NMNh3_TojxTxBIpvUYK1AmvID_m8qOP6AihZrLWZF2I,1652
@@ -47,8 +47,8 @@ pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
 pdfalyzer/yara_rules/didier_stevens.yara,sha256=4XhqafU09xzYUP7LCygHHBXOpAXUblJf6Tkn37MUy0w,7253
 pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
 pdfalyzer/yara_rules/pdf_malware.yara,sha256=jDqSTP5BQSi2I_1xZiFZdy68I4oVWDat2j08-qdfbto,91063
-pdfalyzer-1.17.2.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-pdfalyzer-1.17.2.dist-info/METADATA,sha256=y8x6ka9yoCPuU98_oLBzaNK4wXBWXy_2NLA7jYp4l9U,27294
-pdfalyzer-1.17.2.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
-pdfalyzer-1.17.2.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
-pdfalyzer-1.17.2.dist-info/RECORD,,
+pdfalyzer-1.17.4.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+pdfalyzer-1.17.4.dist-info/METADATA,sha256=plr6KKGy51GfRWhsqIku4u4nkMoHwM5xMLmV9Lm38ak,27294
+pdfalyzer-1.17.4.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+pdfalyzer-1.17.4.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
+pdfalyzer-1.17.4.dist-info/RECORD,,

{pdfalyzer-1.17.2.dist-info → pdfalyzer-1.17.4.dist-info}/LICENSE RENAMED Viewed

File without changes

{pdfalyzer-1.17.2.dist-info → pdfalyzer-1.17.4.dist-info}/WHEEL RENAMED Viewed

File without changes

{pdfalyzer-1.17.2.dist-info → pdfalyzer-1.17.4.dist-info}/entry_points.txt RENAMED Viewed

File without changes

pdfalyzer 1.17.2__py3-none-any.whl → 1.17.4__py3-none-any.whl

Potentially problematic release.

pdfalyzer 1.17.2__py3-none-any.whl → 1.17.4__py3-none-any.whl