pdfalyzer 1.17.4__py3-none-any.whl → 1.17.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pdfalyzer might be problematic. Click here for more details.
- CHANGELOG.md +7 -0
- pdfalyzer/__init__.py +4 -2
- pdfalyzer/helpers/image_helper.py +1 -1
- pdfalyzer/pdfalyzer.py +37 -11
- {pdfalyzer-1.17.4.dist-info → pdfalyzer-1.17.6.dist-info}/METADATA +1 -1
- {pdfalyzer-1.17.4.dist-info → pdfalyzer-1.17.6.dist-info}/RECORD +9 -9
- {pdfalyzer-1.17.4.dist-info → pdfalyzer-1.17.6.dist-info}/LICENSE +0 -0
- {pdfalyzer-1.17.4.dist-info → pdfalyzer-1.17.6.dist-info}/WHEEL +0 -0
- {pdfalyzer-1.17.4.dist-info → pdfalyzer-1.17.6.dist-info}/entry_points.txt +0 -0
CHANGELOG.md
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
# NEXT RELEASE
|
|
2
2
|
|
|
3
|
+
### 1.17.6
|
|
4
|
+
* Better handling for errors resulting from bugs in PyPDF
|
|
5
|
+
* Properly close file handle when pdfalyzing is complete
|
|
6
|
+
|
|
7
|
+
### 1.17.5
|
|
8
|
+
* Fix `PIL` lazy import
|
|
9
|
+
|
|
3
10
|
### 1.17.4
|
|
4
11
|
* Make `PIL` a lazy import so installs without `[extract]` extras don't fail
|
|
5
12
|
|
pdfalyzer/__init__.py
CHANGED
|
@@ -43,7 +43,7 @@ MAX_THEME_COL_SIZE = 35
|
|
|
43
43
|
def pdfalyze():
|
|
44
44
|
args = parse_arguments()
|
|
45
45
|
pdfalyzer = Pdfalyzer(args.file_to_scan_path)
|
|
46
|
-
|
|
46
|
+
presenter = PdfalyzerPresenter(pdfalyzer)
|
|
47
47
|
output_basepath = None
|
|
48
48
|
|
|
49
49
|
# Binary stream extraction is a special case
|
|
@@ -55,7 +55,7 @@ def pdfalyze():
|
|
|
55
55
|
|
|
56
56
|
# The method that gets called is related to the argument name. See 'possible_output_sections' list in
|
|
57
57
|
# argument_parser.py. Analysis exports wrap themselves around the methods that actually generate the analyses.
|
|
58
|
-
for (arg, method) in output_sections(args,
|
|
58
|
+
for (arg, method) in output_sections(args, presenter):
|
|
59
59
|
if args.output_dir:
|
|
60
60
|
output_basepath = PdfalyzerConfig.get_output_basepath(method)
|
|
61
61
|
print(f'Exporting {arg} data to {output_basepath}...')
|
|
@@ -80,6 +80,8 @@ def pdfalyze():
|
|
|
80
80
|
if args.interact:
|
|
81
81
|
code.interact(local=locals())
|
|
82
82
|
|
|
83
|
+
pdfalyzer.pdf_filehandle.close()
|
|
84
|
+
|
|
83
85
|
|
|
84
86
|
def pdfalyzer_show_color_theme() -> None:
|
|
85
87
|
"""Utility method to show pdfalyzer's color theme. Invocable with 'pdfalyzer_show_color_theme'."""
|
|
@@ -5,7 +5,7 @@ from yaralyzer.output.rich_console import console
|
|
|
5
5
|
from pdfalyzer.helpers.rich_text_helper import warning_text
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
def ocr_text(image: Image.Image, image_name: str) -> Optional[str]:
|
|
8
|
+
def ocr_text(image: "Image.Image", image_name: str) -> Optional[str]: # noqa F821
|
|
9
9
|
"""Use pytesseract to OCR the text in the image and return it as a string."""
|
|
10
10
|
import pytesseract
|
|
11
11
|
from PIL import Image
|
pdfalyzer/pdfalyzer.py
CHANGED
|
@@ -7,10 +7,11 @@ from typing import Dict, Iterator, List, Optional
|
|
|
7
7
|
from anytree import LevelOrderIter, SymlinkNode
|
|
8
8
|
from anytree.search import findall, findall_by_attr
|
|
9
9
|
from pypdf import PdfReader
|
|
10
|
+
from pypdf.errors import PdfReadError
|
|
10
11
|
from pypdf.generic import IndirectObject
|
|
11
12
|
from yaralyzer.helpers.file_helper import load_binary_data
|
|
12
13
|
from yaralyzer.output.file_hashes_table import compute_file_hashes
|
|
13
|
-
from yaralyzer.output.rich_console import console
|
|
14
|
+
from yaralyzer.output.rich_console import console, print_fatal_error_and_exit
|
|
14
15
|
from yaralyzer.util.logging import log
|
|
15
16
|
|
|
16
17
|
from pdfalyzer.decorators.document_model_printer import print_with_header
|
|
@@ -22,7 +23,8 @@ from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
|
|
|
22
23
|
from pdfalyzer.util.adobe_strings import *
|
|
23
24
|
from pdfalyzer.util.exceptions import PdfWalkError
|
|
24
25
|
|
|
25
|
-
TRAILER_FALLBACK_ID =
|
|
26
|
+
TRAILER_FALLBACK_ID = 10_000_000
|
|
27
|
+
PYPDF_ERROR_MSG = "Failed to open file with PyPDF. Consider filing a PyPDF bug report: https://github.com/py-pdf/pypdf/issues"
|
|
26
28
|
|
|
27
29
|
|
|
28
30
|
class Pdfalyzer:
|
|
@@ -32,6 +34,19 @@ class Pdfalyzer:
|
|
|
32
34
|
Each of the PDF's internal objects is wrapped in a `PdfTreeNode` object. The tree is managed
|
|
33
35
|
by the `anytree` library. Information about the tree as a whole is stored in this class.
|
|
34
36
|
Once the PDF is parsed this class provides access to info about or from the underlying PDF tree.
|
|
37
|
+
|
|
38
|
+
Attributes:
|
|
39
|
+
font_infos (List[FontInfo]): Font summary objects
|
|
40
|
+
max_generation (int): Max revision number ("generation") encountered in this PDF.
|
|
41
|
+
nodes_encountered (Dict[int, PdfTreeNode]): Nodes we've traversed already.
|
|
42
|
+
pdf_basename (str): The base name of the PDF file (with extension).
|
|
43
|
+
pdf_bytes (bytes): PDF binary data.
|
|
44
|
+
pdf_bytes_info (BytesInfo): File size, hashes, and other data points about the PDF's raw bytes.
|
|
45
|
+
pdf_filehandle (BufferedReader): File handle that reads the PDF.
|
|
46
|
+
pdf_path (str): The path to the PDF file.
|
|
47
|
+
pdf_size (int): Number of nodes as extracted from the PDF's Trailer node.
|
|
48
|
+
pdf_tree (PdfTreeNode): The top node of the PDF data structure tree.
|
|
49
|
+
verifier (PdfTreeVerifier): PdfTreeVerifier that can validate the PDF has been walked successfully.
|
|
35
50
|
"""
|
|
36
51
|
|
|
37
52
|
def __init__(self, pdf_path: str):
|
|
@@ -43,14 +58,21 @@ class Pdfalyzer:
|
|
|
43
58
|
self.pdf_basename = basename(pdf_path)
|
|
44
59
|
self.pdf_bytes = load_binary_data(pdf_path)
|
|
45
60
|
self.pdf_bytes_info = compute_file_hashes(self.pdf_bytes)
|
|
46
|
-
|
|
47
|
-
|
|
61
|
+
self.pdf_filehandle = open(pdf_path, 'rb') # Filehandle must be left open for PyPDF to perform seeks
|
|
62
|
+
|
|
63
|
+
try:
|
|
64
|
+
self.pdf_reader = PdfReader(self.pdf_filehandle)
|
|
65
|
+
except PdfReadError:
|
|
66
|
+
self._handle_fatal_error(f'PdfReadError: "{pdf_path}" doesn\'t seem to be a valid PDF file.')
|
|
67
|
+
except Exception as e:
|
|
68
|
+
console.print_exception()
|
|
69
|
+
self._handle_fatal_error(f"{PYPDF_ERROR_MSG}\n{e}")
|
|
48
70
|
|
|
49
71
|
# Initialize tracking variables
|
|
50
|
-
self.indeterminate_ids = set() # See INDETERMINATE_REF_KEYS comment
|
|
51
|
-
self.nodes_encountered: Dict[int, PdfTreeNode] = {} # Nodes we've seen already
|
|
52
72
|
self.font_infos: List[FontInfo] = [] # Font summary objects
|
|
53
73
|
self.max_generation = 0 # PDF revisions are "generations"; this is the max generation encountered
|
|
74
|
+
self.nodes_encountered: Dict[int, PdfTreeNode] = {} # Nodes we've seen already
|
|
75
|
+
self._indeterminate_ids = set() # See INDETERMINATE_REF_KEYS comment
|
|
54
76
|
|
|
55
77
|
# Bootstrap the root of the tree with the trailer. PDFs are always read trailer first.
|
|
56
78
|
# Technically the trailer has no PDF Object ID but we set it to the /Size of the PDF.
|
|
@@ -148,9 +170,9 @@ class Pdfalyzer:
|
|
|
148
170
|
from_node.add_child(to_node)
|
|
149
171
|
|
|
150
172
|
# Remove this to_node from indeterminacy now that it's got a child or parent
|
|
151
|
-
if relationship.to_obj.idnum in self.
|
|
173
|
+
if relationship.to_obj.idnum in self._indeterminate_ids:
|
|
152
174
|
log.info(f" Found {relationship} => {to_node} was marked indeterminate but now placed")
|
|
153
|
-
self.
|
|
175
|
+
self._indeterminate_ids.remove(relationship.to_obj.idnum)
|
|
154
176
|
|
|
155
177
|
# If the relationship is indeterminate or we've seen the PDF object before, add it as
|
|
156
178
|
# a non-tree relationship for now. An attempt to place the node will be made at the end.
|
|
@@ -159,7 +181,7 @@ class Pdfalyzer:
|
|
|
159
181
|
|
|
160
182
|
# If we already encountered 'to_node' then skip adding it to the queue of nodes to walk
|
|
161
183
|
if was_seen_before:
|
|
162
|
-
if relationship.to_obj.idnum not in self.
|
|
184
|
+
if relationship.to_obj.idnum not in self._indeterminate_ids and to_node.parent is None:
|
|
163
185
|
raise PdfWalkError(f"{relationship} - ref has no parent and is not indeterminate")
|
|
164
186
|
else:
|
|
165
187
|
log.debug(f" Already saw {relationship}; not scanning next")
|
|
@@ -167,7 +189,7 @@ class Pdfalyzer:
|
|
|
167
189
|
# Indeterminate relationships need to wait until everything has been scanned to be placed
|
|
168
190
|
elif relationship.is_indeterminate or (relationship.is_link and not self.is_in_tree(to_node)):
|
|
169
191
|
log.info(f' Indeterminate ref {relationship}')
|
|
170
|
-
self.
|
|
192
|
+
self._indeterminate_ids.add(to_node.idnum)
|
|
171
193
|
# Link nodes like /Dest are usually just links between nodes
|
|
172
194
|
elif relationship.is_link:
|
|
173
195
|
log.debug(f" Link ref {relationship}")
|
|
@@ -178,9 +200,13 @@ class Pdfalyzer:
|
|
|
178
200
|
|
|
179
201
|
return to_node
|
|
180
202
|
|
|
203
|
+
def _handle_fatal_error(self, msg: str) -> None:
|
|
204
|
+
self.pdf_filehandle.close()
|
|
205
|
+
print_fatal_error_and_exit(msg)
|
|
206
|
+
|
|
181
207
|
def _resolve_indeterminate_nodes(self) -> None:
|
|
182
208
|
"""Place all indeterminate nodes in the tree. Called after all nodes have been walked."""
|
|
183
|
-
indeterminate_nodes = [self.nodes_encountered[idnum] for idnum in self.
|
|
209
|
+
indeterminate_nodes = [self.nodes_encountered[idnum] for idnum in self._indeterminate_ids]
|
|
184
210
|
indeterminate_nodes_string = "\n ".join([f"{node}" for node in indeterminate_nodes])
|
|
185
211
|
log.info(f"Resolving {len(indeterminate_nodes)} indeterminate nodes: {indeterminate_nodes_string}")
|
|
186
212
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pdfalyzer
|
|
3
|
-
Version: 1.17.4
|
|
3
|
+
Version: 1.17.6
|
|
4
4
|
Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
|
|
5
5
|
Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
6
6
|
License: GPL-3.0-or-later
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
.pdfalyzer.example,sha256=sh_qkUBw4hfJia_Dx2wB-fsqJInhx2sSgA7WJz3MHYo,3917
|
|
2
|
-
CHANGELOG.md,sha256=
|
|
2
|
+
CHANGELOG.md,sha256=pQmLiE-WvF72lDYysezquoZ-dOj-LxtU3gisei9Nxvo,13553
|
|
3
3
|
LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
4
|
-
pdfalyzer/__init__.py,sha256=
|
|
4
|
+
pdfalyzer/__init__.py,sha256=3ylD-19PcG1bJ-rMa6ruP06QaM9Q1BitaMOA2ppugM8,6197
|
|
5
5
|
pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
|
|
6
6
|
pdfalyzer/binary/binary_scanner.py,sha256=gEHHdQ3lKe1P1441fk2OLO8GxJF7FxF8Tq04ElB8ilU,10748
|
|
7
7
|
pdfalyzer/config.py,sha256=RBtp3Q6IvOW922rcanB4mVceJEM0BEjNybc6_vC7efY,2122
|
|
@@ -18,7 +18,7 @@ pdfalyzer/detection/yaralyzer_helper.py,sha256=bfIa18f0zdUoOAJIQcwVDjF52sJNV47Nd
|
|
|
18
18
|
pdfalyzer/font_info.py,sha256=2R85iETY_1eKCeRrkqeIxfPDqXZyWfCNcHx_-aTyF0s,6682
|
|
19
19
|
pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
|
|
20
20
|
pdfalyzer/helpers/filesystem_helper.py,sha256=QCdUcZlufBWaV38LlrQEGUGOGUtZEozMVSRkoTjwKlU,5046
|
|
21
|
-
pdfalyzer/helpers/image_helper.py,sha256=
|
|
21
|
+
pdfalyzer/helpers/image_helper.py,sha256=mDiscZZ7yrsFa-bxFqIEz9gH3WGhz8455yhXd4_QfAY,1134
|
|
22
22
|
pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
|
|
23
23
|
pdfalyzer/helpers/pdf_object_helper.py,sha256=65BlUgnDM9brxJFw_WF8QLomWHaNh-pj88NWoxcMkoQ,1160
|
|
24
24
|
pdfalyzer/helpers/rich_text_helper.py,sha256=Q5Zj0I96ymQmDWHkOX4lWEvkizOMMgzYNx4CF35t_7w,3561
|
|
@@ -33,7 +33,7 @@ pdfalyzer/output/tables/font_summary_table.py,sha256=TyCwcvqn99LXTWnmtk6MBPdc_33
|
|
|
33
33
|
pdfalyzer/output/tables/pdf_node_rich_table.py,sha256=7G-FLb_EUP50kZmYCTbo8Q6taU4xKp2QIGNOnQtYbNg,5908
|
|
34
34
|
pdfalyzer/output/tables/stream_objects_table.py,sha256=PgQj8oTtW5_X8SMQb3FvCWDS-d4Zl6QiE44Qhiv7lTY,706
|
|
35
35
|
pdfalyzer/pdf_object_relationship.py,sha256=tOJTp73m82oNZJB7NxvTLv167kqthH8PRbVnev_4uEk,5291
|
|
36
|
-
pdfalyzer/pdfalyzer.py,sha256=
|
|
36
|
+
pdfalyzer/pdfalyzer.py,sha256=iu4D3Y9qlKP0D_k883ji4U6LLzelQkHONlzAed0QUx4,12713
|
|
37
37
|
pdfalyzer/util/adobe_strings.py,sha256=eF4K1RhhR_qgMBT58MzCxWkqQj17OWLCNmG3SjZ9BUs,5045
|
|
38
38
|
pdfalyzer/util/argument_parser.py,sha256=9ixQMEZz00IPK8xRxuS7DMrQgXIrqhB2ve5W8XTZ1S8,9732
|
|
39
39
|
pdfalyzer/util/cli_tools_argument_parser.py,sha256=HyZhztyrPtbvOswmG975M0tK5KPon37lV3fxVA0OwYo,6277
|
|
@@ -47,8 +47,8 @@ pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
|
|
|
47
47
|
pdfalyzer/yara_rules/didier_stevens.yara,sha256=4XhqafU09xzYUP7LCygHHBXOpAXUblJf6Tkn37MUy0w,7253
|
|
48
48
|
pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
|
|
49
49
|
pdfalyzer/yara_rules/pdf_malware.yara,sha256=jDqSTP5BQSi2I_1xZiFZdy68I4oVWDat2j08-qdfbto,91063
|
|
50
|
-
pdfalyzer-1.17.
|
|
51
|
-
pdfalyzer-1.17.
|
|
52
|
-
pdfalyzer-1.17.
|
|
53
|
-
pdfalyzer-1.17.
|
|
54
|
-
pdfalyzer-1.17.
|
|
50
|
+
pdfalyzer-1.17.6.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
51
|
+
pdfalyzer-1.17.6.dist-info/METADATA,sha256=Gm-8GMSrvAtkydftH93qVhxMXB-JwyUJnsgz87dsRm8,27294
|
|
52
|
+
pdfalyzer-1.17.6.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
53
|
+
pdfalyzer-1.17.6.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
|
|
54
|
+
pdfalyzer-1.17.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|