PyPI - pdfalyzer - Versions diffs - 1.17.4__py3-none-any.whl → 1.17.6__py3-none-any.whl - Mend

pdfalyzer 1.17.4__py3-none-any.whl → 1.17.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pdfalyzer might be problematic. Click here for more details.

Files changed (9) hide show

CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,12 @@
 # NEXT RELEASE
+### 1.17.6
+* Better handling for errors resulting from bugs in PyPDF
+* Properly close file handle when pdfalyzing is complete
+### 1.17.5
+* Fix `PIL` lazy import
 ### 1.17.4
 * Make `PIL` a lazy import so installs without `[extract]` extras don't fail

pdfalyzer/__init__.py CHANGED Viewed

@@ -43,7 +43,7 @@ MAX_THEME_COL_SIZE = 35
 def pdfalyze():
     args = parse_arguments()
     pdfalyzer = Pdfalyzer(args.file_to_scan_path)
-    pdfalyzer = PdfalyzerPresenter(pdfalyzer)
+    presenter = PdfalyzerPresenter(pdfalyzer)
     output_basepath = None
     # Binary stream extraction is a special case
@@ -55,7 +55,7 @@ def pdfalyze():
     # The method that gets called is related to the argument name. See 'possible_output_sections' list in
     # argument_parser.py. Analysis exports wrap themselves around the methods that actually generate the analyses.
-    for (arg, method) in output_sections(args, pdfalyzer):
+    for (arg, method) in output_sections(args, presenter):
         if args.output_dir:
             output_basepath = PdfalyzerConfig.get_output_basepath(method)
             print(f'Exporting {arg} data to {output_basepath}...')
@@ -80,6 +80,8 @@ def pdfalyze():
     if args.interact:
         code.interact(local=locals())
+    pdfalyzer.pdf_filehandle.close()
 def pdfalyzer_show_color_theme() -> None:
     """Utility method to show pdfalyzer's color theme. Invocable with 'pdfalyzer_show_color_theme'."""

pdfalyzer/helpers/image_helper.py CHANGED Viewed

@@ -5,7 +5,7 @@ from yaralyzer.output.rich_console import console
 from pdfalyzer.helpers.rich_text_helper import warning_text
-def ocr_text(image: Image.Image, image_name: str) -> Optional[str]:
+def ocr_text(image: "Image.Image", image_name: str) -> Optional[str]:  # noqa F821
     """Use pytesseract to OCR the text in the image and return it as a string."""
     import pytesseract
     from PIL import Image

pdfalyzer/pdfalyzer.py CHANGED Viewed

@@ -7,10 +7,11 @@ from typing import Dict, Iterator, List, Optional
 from anytree import LevelOrderIter, SymlinkNode
 from anytree.search import findall, findall_by_attr
 from pypdf import PdfReader
+from pypdf.errors import PdfReadError
 from pypdf.generic import IndirectObject
 from yaralyzer.helpers.file_helper import load_binary_data
 from yaralyzer.output.file_hashes_table import compute_file_hashes
-from yaralyzer.output.rich_console import console
+from yaralyzer.output.rich_console import console, print_fatal_error_and_exit
 from yaralyzer.util.logging import log
 from pdfalyzer.decorators.document_model_printer import print_with_header
@@ -22,7 +23,8 @@ from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
 from pdfalyzer.util.adobe_strings import *
 from pdfalyzer.util.exceptions import PdfWalkError
-TRAILER_FALLBACK_ID = 10000000
+TRAILER_FALLBACK_ID = 10_000_000
+PYPDF_ERROR_MSG = "Failed to open file with PyPDF. Consider filing a PyPDF bug report: https://github.com/py-pdf/pypdf/issues"
 class Pdfalyzer:
@@ -32,6 +34,19 @@ class Pdfalyzer:
     Each of the PDF's internal objects is wrapped in a `PdfTreeNode` object. The tree is managed
     by the `anytree` library. Information about the tree as a whole is stored in this class.
     Once the PDF is parsed this class provides access to info about or from the underlying PDF tree.
+    Attributes:
+        font_infos (List[FontInfo]): Font summary objects
+        max_generation (int): Max revision number ("generation") encountered in this PDF.
+        nodes_encountered (Dict[int, PdfTreeNode]): Nodes we've traversed already.
+        pdf_basename (str): The base name of the PDF file (with extension).
+        pdf_bytes (bytes): PDF binary data.
+        pdf_bytes_info (BytesInfo): File size, hashes, and other data points about the PDF's raw bytes.
+        pdf_filehandle (BufferedReader): File handle that reads the PDF.
+        pdf_path (str): The path to the PDF file.
+        pdf_size (int): Number of nodes as extracted from the PDF's Trailer node.
+        pdf_tree (PdfTreeNode): The top node of the PDF data structure tree.
+        verifier (PdfTreeVerifier): PdfTreeVerifier that can validate the PDF has been walked successfully.
     """
     def __init__(self, pdf_path: str):
@@ -43,14 +58,21 @@ class Pdfalyzer:
         self.pdf_basename = basename(pdf_path)
         self.pdf_bytes = load_binary_data(pdf_path)
         self.pdf_bytes_info = compute_file_hashes(self.pdf_bytes)
-        pdf_file = open(pdf_path, 'rb')  # Filehandle must be left open for PyPDF to perform seeks
-        self.pdf_reader = PdfReader(pdf_file)
+        self.pdf_filehandle = open(pdf_path, 'rb')  # Filehandle must be left open for PyPDF to perform seeks
+        try:
+            self.pdf_reader = PdfReader(self.pdf_filehandle)
+        except PdfReadError:
+            self._handle_fatal_error(f'PdfReadError: "{pdf_path}" doesn\'t seem to be a valid PDF file.')
+        except Exception as e:
+            console.print_exception()
+            self._handle_fatal_error(f"{PYPDF_ERROR_MSG}\n{e}")
         # Initialize tracking variables
-        self.indeterminate_ids = set()  # See INDETERMINATE_REF_KEYS comment
-        self.nodes_encountered: Dict[int, PdfTreeNode] = {}  # Nodes we've seen already
         self.font_infos: List[FontInfo] = []  # Font summary objects
         self.max_generation = 0  # PDF revisions are "generations"; this is the max generation encountered
+        self.nodes_encountered: Dict[int, PdfTreeNode] = {}  # Nodes we've seen already
+        self._indeterminate_ids = set()  # See INDETERMINATE_REF_KEYS comment
         # Bootstrap the root of the tree with the trailer. PDFs are always read trailer first.
         # Technically the trailer has no PDF Object ID but we set it to the /Size of the PDF.
@@ -148,9 +170,9 @@ class Pdfalyzer:
                 from_node.add_child(to_node)
             # Remove this to_node from indeterminacy now that it's got a child or parent
-            if relationship.to_obj.idnum in self.indeterminate_ids:
+            if relationship.to_obj.idnum in self._indeterminate_ids:
                 log.info(f"  Found {relationship} => {to_node} was marked indeterminate but now placed")
-                self.indeterminate_ids.remove(relationship.to_obj.idnum)
+                self._indeterminate_ids.remove(relationship.to_obj.idnum)
         # If the relationship is indeterminate or we've seen the PDF object before, add it as
         # a non-tree relationship for now. An attempt to place the node will be made at the end.
@@ -159,7 +181,7 @@ class Pdfalyzer:
             # If we already encountered 'to_node' then skip adding it to the queue of nodes to walk
             if was_seen_before:
-                if relationship.to_obj.idnum not in self.indeterminate_ids and to_node.parent is None:
+                if relationship.to_obj.idnum not in self._indeterminate_ids and to_node.parent is None:
                     raise PdfWalkError(f"{relationship} - ref has no parent and is not indeterminate")
                 else:
                     log.debug(f"  Already saw {relationship}; not scanning next")
@@ -167,7 +189,7 @@ class Pdfalyzer:
             # Indeterminate relationships need to wait until everything has been scanned to be placed
             elif relationship.is_indeterminate or (relationship.is_link and not self.is_in_tree(to_node)):
                 log.info(f'  Indeterminate ref {relationship}')
-                self.indeterminate_ids.add(to_node.idnum)
+                self._indeterminate_ids.add(to_node.idnum)
             # Link nodes like /Dest are usually just links between nodes
             elif relationship.is_link:
                 log.debug(f"  Link ref {relationship}")
@@ -178,9 +200,13 @@ class Pdfalyzer:
         return to_node
+    def _handle_fatal_error(self, msg: str) -> None:
+        self.pdf_filehandle.close()
+        print_fatal_error_and_exit(msg)
     def _resolve_indeterminate_nodes(self) -> None:
         """Place all indeterminate nodes in the tree. Called after all nodes have been walked."""
-        indeterminate_nodes = [self.nodes_encountered[idnum] for idnum in self.indeterminate_ids]
+        indeterminate_nodes = [self.nodes_encountered[idnum] for idnum in self._indeterminate_ids]
         indeterminate_nodes_string = "\n   ".join([f"{node}" for node in indeterminate_nodes])
         log.info(f"Resolving {len(indeterminate_nodes)} indeterminate nodes: {indeterminate_nodes_string}")

{pdfalyzer-1.17.4.dist-info → pdfalyzer-1.17.6.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pdfalyzer
-Version: 1.17.4
+Version: 1.17.6
 Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
 Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
 License: GPL-3.0-or-later

{pdfalyzer-1.17.4.dist-info → pdfalyzer-1.17.6.dist-info}/RECORD RENAMED Viewed

@@ -1,7 +1,7 @@
 .pdfalyzer.example,sha256=sh_qkUBw4hfJia_Dx2wB-fsqJInhx2sSgA7WJz3MHYo,3917
-CHANGELOG.md,sha256=dyXJVhpeNYDdeh8Ugfl7co6v86ksu_AtNOYKEm2U5TI,13390
+CHANGELOG.md,sha256=pQmLiE-WvF72lDYysezquoZ-dOj-LxtU3gisei9Nxvo,13553
 LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-pdfalyzer/__init__.py,sha256=2Gikt_-OSXZqeQij4wSwb65g7jycVAupjeFmXBf51lo,6159
+pdfalyzer/__init__.py,sha256=3ylD-19PcG1bJ-rMa6ruP06QaM9Q1BitaMOA2ppugM8,6197
 pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
 pdfalyzer/binary/binary_scanner.py,sha256=gEHHdQ3lKe1P1441fk2OLO8GxJF7FxF8Tq04ElB8ilU,10748
 pdfalyzer/config.py,sha256=RBtp3Q6IvOW922rcanB4mVceJEM0BEjNybc6_vC7efY,2122
@@ -18,7 +18,7 @@ pdfalyzer/detection/yaralyzer_helper.py,sha256=bfIa18f0zdUoOAJIQcwVDjF52sJNV47Nd
 pdfalyzer/font_info.py,sha256=2R85iETY_1eKCeRrkqeIxfPDqXZyWfCNcHx_-aTyF0s,6682
 pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
 pdfalyzer/helpers/filesystem_helper.py,sha256=QCdUcZlufBWaV38LlrQEGUGOGUtZEozMVSRkoTjwKlU,5046
-pdfalyzer/helpers/image_helper.py,sha256=E3Mby-KG-1eIYThuYqXEkwG1mnhY0imvrpiO8N8otfQ,1119
+pdfalyzer/helpers/image_helper.py,sha256=mDiscZZ7yrsFa-bxFqIEz9gH3WGhz8455yhXd4_QfAY,1134
 pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
 pdfalyzer/helpers/pdf_object_helper.py,sha256=65BlUgnDM9brxJFw_WF8QLomWHaNh-pj88NWoxcMkoQ,1160
 pdfalyzer/helpers/rich_text_helper.py,sha256=Q5Zj0I96ymQmDWHkOX4lWEvkizOMMgzYNx4CF35t_7w,3561
@@ -33,7 +33,7 @@ pdfalyzer/output/tables/font_summary_table.py,sha256=TyCwcvqn99LXTWnmtk6MBPdc_33
 pdfalyzer/output/tables/pdf_node_rich_table.py,sha256=7G-FLb_EUP50kZmYCTbo8Q6taU4xKp2QIGNOnQtYbNg,5908
 pdfalyzer/output/tables/stream_objects_table.py,sha256=PgQj8oTtW5_X8SMQb3FvCWDS-d4Zl6QiE44Qhiv7lTY,706
 pdfalyzer/pdf_object_relationship.py,sha256=tOJTp73m82oNZJB7NxvTLv167kqthH8PRbVnev_4uEk,5291
-pdfalyzer/pdfalyzer.py,sha256=T5U8MZRlL0Kn-GJVqfIIoL7eBUdQeteu50VhQ-IoYh0,11214
+pdfalyzer/pdfalyzer.py,sha256=iu4D3Y9qlKP0D_k883ji4U6LLzelQkHONlzAed0QUx4,12713
 pdfalyzer/util/adobe_strings.py,sha256=eF4K1RhhR_qgMBT58MzCxWkqQj17OWLCNmG3SjZ9BUs,5045
 pdfalyzer/util/argument_parser.py,sha256=9ixQMEZz00IPK8xRxuS7DMrQgXIrqhB2ve5W8XTZ1S8,9732
 pdfalyzer/util/cli_tools_argument_parser.py,sha256=HyZhztyrPtbvOswmG975M0tK5KPon37lV3fxVA0OwYo,6277
@@ -47,8 +47,8 @@ pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
 pdfalyzer/yara_rules/didier_stevens.yara,sha256=4XhqafU09xzYUP7LCygHHBXOpAXUblJf6Tkn37MUy0w,7253
 pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
 pdfalyzer/yara_rules/pdf_malware.yara,sha256=jDqSTP5BQSi2I_1xZiFZdy68I4oVWDat2j08-qdfbto,91063
-pdfalyzer-1.17.4.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-pdfalyzer-1.17.4.dist-info/METADATA,sha256=plr6KKGy51GfRWhsqIku4u4nkMoHwM5xMLmV9Lm38ak,27294
-pdfalyzer-1.17.4.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
-pdfalyzer-1.17.4.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
-pdfalyzer-1.17.4.dist-info/RECORD,,
+pdfalyzer-1.17.6.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+pdfalyzer-1.17.6.dist-info/METADATA,sha256=Gm-8GMSrvAtkydftH93qVhxMXB-JwyUJnsgz87dsRm8,27294
+pdfalyzer-1.17.6.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+pdfalyzer-1.17.6.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
+pdfalyzer-1.17.6.dist-info/RECORD,,

{pdfalyzer-1.17.4.dist-info → pdfalyzer-1.17.6.dist-info}/LICENSE RENAMED Viewed

File without changes

{pdfalyzer-1.17.4.dist-info → pdfalyzer-1.17.6.dist-info}/WHEEL RENAMED Viewed

File without changes

{pdfalyzer-1.17.4.dist-info → pdfalyzer-1.17.6.dist-info}/entry_points.txt RENAMED Viewed

File without changes

pdfalyzer 1.17.4__py3-none-any.whl → 1.17.6__py3-none-any.whl

Potentially problematic release.

pdfalyzer 1.17.4__py3-none-any.whl → 1.17.6__py3-none-any.whl