pdfalyzer 1.17.4__py3-none-any.whl → 1.17.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pdfalyzer might be problematic. Click here for more details.

CHANGELOG.md CHANGED
@@ -1,5 +1,12 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ ### 1.17.6
4
+ * Better handling for errors resulting from bugs in PyPDF
5
+ * Properly close file handle when pdfalyzing is complete
6
+
7
+ ### 1.17.5
8
+ * Fix `PIL` lazy import
9
+
3
10
  ### 1.17.4
4
11
  * Make `PIL` a lazy import so installs without `[extract]` extras don't fail
5
12
 
pdfalyzer/__init__.py CHANGED
@@ -43,7 +43,7 @@ MAX_THEME_COL_SIZE = 35
43
43
  def pdfalyze():
44
44
  args = parse_arguments()
45
45
  pdfalyzer = Pdfalyzer(args.file_to_scan_path)
46
- pdfalyzer = PdfalyzerPresenter(pdfalyzer)
46
+ presenter = PdfalyzerPresenter(pdfalyzer)
47
47
  output_basepath = None
48
48
 
49
49
  # Binary stream extraction is a special case
@@ -55,7 +55,7 @@ def pdfalyze():
55
55
 
56
56
  # The method that gets called is related to the argument name. See 'possible_output_sections' list in
57
57
  # argument_parser.py. Analysis exports wrap themselves around the methods that actually generate the analyses.
58
- for (arg, method) in output_sections(args, pdfalyzer):
58
+ for (arg, method) in output_sections(args, presenter):
59
59
  if args.output_dir:
60
60
  output_basepath = PdfalyzerConfig.get_output_basepath(method)
61
61
  print(f'Exporting {arg} data to {output_basepath}...')
@@ -80,6 +80,8 @@ def pdfalyze():
80
80
  if args.interact:
81
81
  code.interact(local=locals())
82
82
 
83
+ pdfalyzer.pdf_filehandle.close()
84
+
83
85
 
84
86
  def pdfalyzer_show_color_theme() -> None:
85
87
  """Utility method to show pdfalyzer's color theme. Invocable with 'pdfalyzer_show_color_theme'."""
@@ -5,7 +5,7 @@ from yaralyzer.output.rich_console import console
5
5
  from pdfalyzer.helpers.rich_text_helper import warning_text
6
6
 
7
7
 
8
- def ocr_text(image: Image.Image, image_name: str) -> Optional[str]:
8
+ def ocr_text(image: "Image.Image", image_name: str) -> Optional[str]: # noqa F821
9
9
  """Use pytesseract to OCR the text in the image and return it as a string."""
10
10
  import pytesseract
11
11
  from PIL import Image
pdfalyzer/pdfalyzer.py CHANGED
@@ -7,10 +7,11 @@ from typing import Dict, Iterator, List, Optional
7
7
  from anytree import LevelOrderIter, SymlinkNode
8
8
  from anytree.search import findall, findall_by_attr
9
9
  from pypdf import PdfReader
10
+ from pypdf.errors import PdfReadError
10
11
  from pypdf.generic import IndirectObject
11
12
  from yaralyzer.helpers.file_helper import load_binary_data
12
13
  from yaralyzer.output.file_hashes_table import compute_file_hashes
13
- from yaralyzer.output.rich_console import console
14
+ from yaralyzer.output.rich_console import console, print_fatal_error_and_exit
14
15
  from yaralyzer.util.logging import log
15
16
 
16
17
  from pdfalyzer.decorators.document_model_printer import print_with_header
@@ -22,7 +23,8 @@ from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
22
23
  from pdfalyzer.util.adobe_strings import *
23
24
  from pdfalyzer.util.exceptions import PdfWalkError
24
25
 
25
- TRAILER_FALLBACK_ID = 10000000
26
+ TRAILER_FALLBACK_ID = 10_000_000
27
+ PYPDF_ERROR_MSG = "Failed to open file with PyPDF. Consider filing a PyPDF bug report: https://github.com/py-pdf/pypdf/issues"
26
28
 
27
29
 
28
30
  class Pdfalyzer:
@@ -32,6 +34,19 @@ class Pdfalyzer:
32
34
  Each of the PDF's internal objects is wrapped in a `PdfTreeNode` object. The tree is managed
33
35
  by the `anytree` library. Information about the tree as a whole is stored in this class.
34
36
  Once the PDF is parsed this class provides access to info about or from the underlying PDF tree.
37
+
38
+ Attributes:
39
+ font_infos (List[FontInfo]): Font summary objects
40
+ max_generation (int): Max revision number ("generation") encountered in this PDF.
41
+ nodes_encountered (Dict[int, PdfTreeNode]): Nodes we've traversed already.
42
+ pdf_basename (str): The base name of the PDF file (with extension).
43
+ pdf_bytes (bytes): PDF binary data.
44
+ pdf_bytes_info (BytesInfo): File size, hashes, and other data points about the PDF's raw bytes.
45
+ pdf_filehandle (BufferedReader): File handle that reads the PDF.
46
+ pdf_path (str): The path to the PDF file.
47
+ pdf_size (int): Number of nodes as extracted from the PDF's Trailer node.
48
+ pdf_tree (PdfTreeNode): The top node of the PDF data structure tree.
49
+ verifier (PdfTreeVerifier): PdfTreeVerifier that can validate the PDF has been walked successfully.
35
50
  """
36
51
 
37
52
  def __init__(self, pdf_path: str):
@@ -43,14 +58,21 @@ class Pdfalyzer:
43
58
  self.pdf_basename = basename(pdf_path)
44
59
  self.pdf_bytes = load_binary_data(pdf_path)
45
60
  self.pdf_bytes_info = compute_file_hashes(self.pdf_bytes)
46
- pdf_file = open(pdf_path, 'rb') # Filehandle must be left open for PyPDF to perform seeks
47
- self.pdf_reader = PdfReader(pdf_file)
61
+ self.pdf_filehandle = open(pdf_path, 'rb') # Filehandle must be left open for PyPDF to perform seeks
62
+
63
+ try:
64
+ self.pdf_reader = PdfReader(self.pdf_filehandle)
65
+ except PdfReadError:
66
+ self._handle_fatal_error(f'PdfReadError: "{pdf_path}" doesn\'t seem to be a valid PDF file.')
67
+ except Exception as e:
68
+ console.print_exception()
69
+ self._handle_fatal_error(f"{PYPDF_ERROR_MSG}\n{e}")
48
70
 
49
71
  # Initialize tracking variables
50
- self.indeterminate_ids = set() # See INDETERMINATE_REF_KEYS comment
51
- self.nodes_encountered: Dict[int, PdfTreeNode] = {} # Nodes we've seen already
52
72
  self.font_infos: List[FontInfo] = [] # Font summary objects
53
73
  self.max_generation = 0 # PDF revisions are "generations"; this is the max generation encountered
74
+ self.nodes_encountered: Dict[int, PdfTreeNode] = {} # Nodes we've seen already
75
+ self._indeterminate_ids = set() # See INDETERMINATE_REF_KEYS comment
54
76
 
55
77
  # Bootstrap the root of the tree with the trailer. PDFs are always read trailer first.
56
78
  # Technically the trailer has no PDF Object ID but we set it to the /Size of the PDF.
@@ -148,9 +170,9 @@ class Pdfalyzer:
148
170
  from_node.add_child(to_node)
149
171
 
150
172
  # Remove this to_node from indeterminacy now that it's got a child or parent
151
- if relationship.to_obj.idnum in self.indeterminate_ids:
173
+ if relationship.to_obj.idnum in self._indeterminate_ids:
152
174
  log.info(f" Found {relationship} => {to_node} was marked indeterminate but now placed")
153
- self.indeterminate_ids.remove(relationship.to_obj.idnum)
175
+ self._indeterminate_ids.remove(relationship.to_obj.idnum)
154
176
 
155
177
  # If the relationship is indeterminate or we've seen the PDF object before, add it as
156
178
  # a non-tree relationship for now. An attempt to place the node will be made at the end.
@@ -159,7 +181,7 @@ class Pdfalyzer:
159
181
 
160
182
  # If we already encountered 'to_node' then skip adding it to the queue of nodes to walk
161
183
  if was_seen_before:
162
- if relationship.to_obj.idnum not in self.indeterminate_ids and to_node.parent is None:
184
+ if relationship.to_obj.idnum not in self._indeterminate_ids and to_node.parent is None:
163
185
  raise PdfWalkError(f"{relationship} - ref has no parent and is not indeterminate")
164
186
  else:
165
187
  log.debug(f" Already saw {relationship}; not scanning next")
@@ -167,7 +189,7 @@ class Pdfalyzer:
167
189
  # Indeterminate relationships need to wait until everything has been scanned to be placed
168
190
  elif relationship.is_indeterminate or (relationship.is_link and not self.is_in_tree(to_node)):
169
191
  log.info(f' Indeterminate ref {relationship}')
170
- self.indeterminate_ids.add(to_node.idnum)
192
+ self._indeterminate_ids.add(to_node.idnum)
171
193
  # Link nodes like /Dest are usually just links between nodes
172
194
  elif relationship.is_link:
173
195
  log.debug(f" Link ref {relationship}")
@@ -178,9 +200,13 @@ class Pdfalyzer:
178
200
 
179
201
  return to_node
180
202
 
203
+ def _handle_fatal_error(self, msg: str) -> None:
204
+ self.pdf_filehandle.close()
205
+ print_fatal_error_and_exit(msg)
206
+
181
207
  def _resolve_indeterminate_nodes(self) -> None:
182
208
  """Place all indeterminate nodes in the tree. Called after all nodes have been walked."""
183
- indeterminate_nodes = [self.nodes_encountered[idnum] for idnum in self.indeterminate_ids]
209
+ indeterminate_nodes = [self.nodes_encountered[idnum] for idnum in self._indeterminate_ids]
184
210
  indeterminate_nodes_string = "\n ".join([f"{node}" for node in indeterminate_nodes])
185
211
  log.info(f"Resolving {len(indeterminate_nodes)} indeterminate nodes: {indeterminate_nodes_string}")
186
212
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pdfalyzer
3
- Version: 1.17.4
3
+ Version: 1.17.6
4
4
  Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
5
5
  Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
6
6
  License: GPL-3.0-or-later
@@ -1,7 +1,7 @@
1
1
  .pdfalyzer.example,sha256=sh_qkUBw4hfJia_Dx2wB-fsqJInhx2sSgA7WJz3MHYo,3917
2
- CHANGELOG.md,sha256=dyXJVhpeNYDdeh8Ugfl7co6v86ksu_AtNOYKEm2U5TI,13390
2
+ CHANGELOG.md,sha256=pQmLiE-WvF72lDYysezquoZ-dOj-LxtU3gisei9Nxvo,13553
3
3
  LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
4
- pdfalyzer/__init__.py,sha256=2Gikt_-OSXZqeQij4wSwb65g7jycVAupjeFmXBf51lo,6159
4
+ pdfalyzer/__init__.py,sha256=3ylD-19PcG1bJ-rMa6ruP06QaM9Q1BitaMOA2ppugM8,6197
5
5
  pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
6
6
  pdfalyzer/binary/binary_scanner.py,sha256=gEHHdQ3lKe1P1441fk2OLO8GxJF7FxF8Tq04ElB8ilU,10748
7
7
  pdfalyzer/config.py,sha256=RBtp3Q6IvOW922rcanB4mVceJEM0BEjNybc6_vC7efY,2122
@@ -18,7 +18,7 @@ pdfalyzer/detection/yaralyzer_helper.py,sha256=bfIa18f0zdUoOAJIQcwVDjF52sJNV47Nd
18
18
  pdfalyzer/font_info.py,sha256=2R85iETY_1eKCeRrkqeIxfPDqXZyWfCNcHx_-aTyF0s,6682
19
19
  pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
20
20
  pdfalyzer/helpers/filesystem_helper.py,sha256=QCdUcZlufBWaV38LlrQEGUGOGUtZEozMVSRkoTjwKlU,5046
21
- pdfalyzer/helpers/image_helper.py,sha256=E3Mby-KG-1eIYThuYqXEkwG1mnhY0imvrpiO8N8otfQ,1119
21
+ pdfalyzer/helpers/image_helper.py,sha256=mDiscZZ7yrsFa-bxFqIEz9gH3WGhz8455yhXd4_QfAY,1134
22
22
  pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
23
23
  pdfalyzer/helpers/pdf_object_helper.py,sha256=65BlUgnDM9brxJFw_WF8QLomWHaNh-pj88NWoxcMkoQ,1160
24
24
  pdfalyzer/helpers/rich_text_helper.py,sha256=Q5Zj0I96ymQmDWHkOX4lWEvkizOMMgzYNx4CF35t_7w,3561
@@ -33,7 +33,7 @@ pdfalyzer/output/tables/font_summary_table.py,sha256=TyCwcvqn99LXTWnmtk6MBPdc_33
33
33
  pdfalyzer/output/tables/pdf_node_rich_table.py,sha256=7G-FLb_EUP50kZmYCTbo8Q6taU4xKp2QIGNOnQtYbNg,5908
34
34
  pdfalyzer/output/tables/stream_objects_table.py,sha256=PgQj8oTtW5_X8SMQb3FvCWDS-d4Zl6QiE44Qhiv7lTY,706
35
35
  pdfalyzer/pdf_object_relationship.py,sha256=tOJTp73m82oNZJB7NxvTLv167kqthH8PRbVnev_4uEk,5291
36
- pdfalyzer/pdfalyzer.py,sha256=T5U8MZRlL0Kn-GJVqfIIoL7eBUdQeteu50VhQ-IoYh0,11214
36
+ pdfalyzer/pdfalyzer.py,sha256=iu4D3Y9qlKP0D_k883ji4U6LLzelQkHONlzAed0QUx4,12713
37
37
  pdfalyzer/util/adobe_strings.py,sha256=eF4K1RhhR_qgMBT58MzCxWkqQj17OWLCNmG3SjZ9BUs,5045
38
38
  pdfalyzer/util/argument_parser.py,sha256=9ixQMEZz00IPK8xRxuS7DMrQgXIrqhB2ve5W8XTZ1S8,9732
39
39
  pdfalyzer/util/cli_tools_argument_parser.py,sha256=HyZhztyrPtbvOswmG975M0tK5KPon37lV3fxVA0OwYo,6277
@@ -47,8 +47,8 @@ pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
47
47
  pdfalyzer/yara_rules/didier_stevens.yara,sha256=4XhqafU09xzYUP7LCygHHBXOpAXUblJf6Tkn37MUy0w,7253
48
48
  pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
49
49
  pdfalyzer/yara_rules/pdf_malware.yara,sha256=jDqSTP5BQSi2I_1xZiFZdy68I4oVWDat2j08-qdfbto,91063
50
- pdfalyzer-1.17.4.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
51
- pdfalyzer-1.17.4.dist-info/METADATA,sha256=plr6KKGy51GfRWhsqIku4u4nkMoHwM5xMLmV9Lm38ak,27294
52
- pdfalyzer-1.17.4.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
53
- pdfalyzer-1.17.4.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
54
- pdfalyzer-1.17.4.dist-info/RECORD,,
50
+ pdfalyzer-1.17.6.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
51
+ pdfalyzer-1.17.6.dist-info/METADATA,sha256=Gm-8GMSrvAtkydftH93qVhxMXB-JwyUJnsgz87dsRm8,27294
52
+ pdfalyzer-1.17.6.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
53
+ pdfalyzer-1.17.6.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
54
+ pdfalyzer-1.17.6.dist-info/RECORD,,