pdfalyzer 1.16.13__tar.gz → 1.16.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pdfalyzer might be problematic. Click here for more details.
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/CHANGELOG.md +4 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/PKG-INFO +10 -7
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/README.md +7 -3
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/binary/binary_scanner.py +28 -12
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/config.py +2 -1
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/decorators/indeterminate_node.py +11 -11
- pdfalyzer-1.16.14/pdfalyzer/decorators/pdf_file.py +50 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/decorators/pdf_object_properties.py +3 -3
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/decorators/pdf_tree_node.py +17 -11
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/decorators/pdf_tree_verifier.py +2 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/detection/yaralyzer_helper.py +9 -10
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/helpers/rich_text_helper.py +1 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/output/character_mapping.py +1 -1
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/output/layout.py +13 -3
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/output/tables/decoding_stats_table.py +4 -4
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/output/tables/font_summary_table.py +2 -2
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/pdfalyzer.py +20 -13
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/util/argument_parser.py +14 -7
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pyproject.toml +3 -4
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/.pdfalyzer.example +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/LICENSE +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/__init__.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/__main__.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/decorators/document_model_printer.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/detection/constants/binary_regexes.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/detection/constants/javascript_reserved_keywords.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/detection/javascript_hunter.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/font_info.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/helpers/dict_helper.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/helpers/filesystem_helper.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/helpers/number_helper.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/helpers/pdf_object_helper.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/helpers/string_helper.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/output/pdfalyzer_presenter.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/output/styles/node_colors.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/output/styles/rich_theme.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/output/tables/pdf_node_rich_table.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/output/tables/stream_objects_table.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/pdf_object_relationship.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/util/adobe_strings.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/util/debugging.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/util/exceptions.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/util/pdf_parser_manager.py +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/yara_rules/PDF.yara +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/yara_rules/PDF_binary_stream.yara +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/yara_rules/__init.py__ +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/yara_rules/didier_stevens.yara +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/yara_rules/lprat.static_file_analysis.yara +0 -0
- {pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/yara_rules/pdf_malware.yara +0 -0
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pdfalyzer
|
|
3
|
-
Version: 1.16.13
|
|
3
|
+
Version: 1.16.14
|
|
4
4
|
Summary: PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
|
|
5
5
|
Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
6
6
|
License: GPL-3.0-or-later
|
|
7
7
|
Keywords: ascii art,binary,color,cybersecurity,DFIR,encoding,font,infosec,maldoc,malicious pdf,malware,malware analysis,pdf,pdfs,pdf analysis,pypdf,threat assessment,threat hunting,threat intelligence,threat research,threatintel,visualization,yara
|
|
8
8
|
Author: Michel de Cryptadamus
|
|
9
9
|
Author-email: michel@cryptadamus.com
|
|
10
|
-
Requires-Python: >=3.
|
|
10
|
+
Requires-Python: >=3.10,<4.0
|
|
11
11
|
Classifier: Development Status :: 5 - Production/Stable
|
|
12
12
|
Classifier: Environment :: Console
|
|
13
13
|
Classifier: Intended Audience :: Information Technology
|
|
@@ -18,13 +18,12 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
18
18
|
Classifier: Programming Language :: Python :: 3.11
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.12
|
|
20
20
|
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
22
21
|
Classifier: Topic :: Artistic Software
|
|
23
22
|
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
24
23
|
Classifier: Topic :: Security
|
|
25
24
|
Requires-Dist: anytree (>=2.13,<3.0)
|
|
26
25
|
Requires-Dist: pypdf (>=6.0.0,<7.0.0)
|
|
27
|
-
Requires-Dist: yaralyzer (>=1.0.
|
|
26
|
+
Requires-Dist: yaralyzer (>=1.0.9,<2.0.0)
|
|
28
27
|
Project-URL: Changelog, https://github.com/michelcrypt4d4mus/pdfalyzer/blob/master/CHANGELOG.md
|
|
29
28
|
Project-URL: Documentation, https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
30
29
|
Project-URL: Repository, https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
@@ -114,7 +113,12 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
|
|
|
114
113
|
The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
|
|
115
114
|
|
|
116
115
|
### Setting Command Line Options Permanently With A `.pdfalyzer` File
|
|
117
|
-
When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer`
|
|
116
|
+
When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
|
|
117
|
+
|
|
118
|
+
1. the current directory
|
|
119
|
+
2. the user's home directory
|
|
120
|
+
|
|
121
|
+
If it finds a `.pdfalyzer` file in either of those places it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example), which doubles as an example file you can copy into place and edit to your needs. This is handy if you find yourself typing the same command line options over and over again.
|
|
118
122
|
|
|
119
123
|
### Environment Variables
|
|
120
124
|
Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
|
|
@@ -125,10 +129,9 @@ Run `pdfalyzer_show_color_theme` to see the color theme employed.
|
|
|
125
129
|
### Guarantees
|
|
126
130
|
Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
|
|
127
131
|
|
|
128
|
-
## Example
|
|
132
|
+
## Example Malicious PDF Investigation
|
|
129
133
|
[BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
|
|
130
134
|
|
|
131
|
-
-------------
|
|
132
135
|
|
|
133
136
|
## Use As A Code Library
|
|
134
137
|
For info about setting up a dev environment see [Contributing](#contributing) below.
|
|
@@ -82,7 +82,12 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
|
|
|
82
82
|
The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
|
|
83
83
|
|
|
84
84
|
### Setting Command Line Options Permanently With A `.pdfalyzer` File
|
|
85
|
-
When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer`
|
|
85
|
+
When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
|
|
86
|
+
|
|
87
|
+
1. the current directory
|
|
88
|
+
2. the user's home directory
|
|
89
|
+
|
|
90
|
+
If it finds a `.pdfalyzer` file in either of those places it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example), which doubles as an example file you can copy into place and edit to your needs. This is handy if you find yourself typing the same command line options over and over again.
|
|
86
91
|
|
|
87
92
|
### Environment Variables
|
|
88
93
|
Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
|
|
@@ -93,10 +98,9 @@ Run `pdfalyzer_show_color_theme` to see the color theme employed.
|
|
|
93
98
|
### Guarantees
|
|
94
99
|
Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
|
|
95
100
|
|
|
96
|
-
## Example
|
|
101
|
+
## Example Malicious PDF Investigation
|
|
97
102
|
[BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
|
|
98
103
|
|
|
99
|
-
-------------
|
|
100
104
|
|
|
101
105
|
## Use As A Code Library
|
|
102
106
|
For info about setting up a dev environment see [Contributing](#contributing) below.
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
3
|
-
various character encodings upon it to see what comes out.
|
|
2
|
+
`BinaryScanner` class.
|
|
4
3
|
"""
|
|
5
4
|
from collections import defaultdict
|
|
6
5
|
from typing import Iterator, Optional, Tuple
|
|
@@ -28,8 +27,18 @@ from pdfalyzer.util.adobe_strings import CONTENTS, CURRENTFILE_EEXEC, FONT_FILE_
|
|
|
28
27
|
|
|
29
28
|
|
|
30
29
|
class BinaryScanner:
|
|
30
|
+
"""
|
|
31
|
+
Class for handling binary data - scanning through it for various suspicious patterns as well as forcing
|
|
32
|
+
various character encodings upon it to see what comes out.
|
|
33
|
+
"""
|
|
34
|
+
|
|
31
35
|
def __init__(self, _bytes: bytes, owner: PdfTreeNode, label: Optional[Text] = None):
|
|
32
|
-
"""
|
|
36
|
+
"""
|
|
37
|
+
Args:
|
|
38
|
+
_bytes (bytes): The binary data to be scanned.
|
|
39
|
+
owner (PdfTreeNode): The `PdfTreeNode` that contains this binary data.
|
|
40
|
+
label (Optional[Text]): A rich `Text` label for the binary data (e.g. the PDF object's address).
|
|
41
|
+
"""
|
|
33
42
|
self.bytes = _bytes
|
|
34
43
|
self.label = label
|
|
35
44
|
self.owner = owner
|
|
@@ -42,7 +51,7 @@ class BinaryScanner:
|
|
|
42
51
|
self.regex_extraction_stats = defaultdict(lambda: RegexMatchMetrics())
|
|
43
52
|
|
|
44
53
|
def check_for_dangerous_instructions(self) -> None:
|
|
45
|
-
"""Scan for all the strings in DANGEROUS_INSTRUCTIONS list and decode bytes around them."""
|
|
54
|
+
"""Scan for all the strings in `DANGEROUS_INSTRUCTIONS` list and decode bytes around them."""
|
|
46
55
|
subheader = "Scanning Binary For Anything That Could Be Described As 'sus'..."
|
|
47
56
|
print_section_sub_subheader(subheader, style=f"bright_red")
|
|
48
57
|
|
|
@@ -71,8 +80,8 @@ class BinaryScanner:
|
|
|
71
80
|
|
|
72
81
|
def force_decode_quoted_bytes(self) -> None:
|
|
73
82
|
"""
|
|
74
|
-
Find all strings matching QUOTE_PATTERNS (AKA between quote chars) and decode them with various
|
|
75
|
-
The
|
|
83
|
+
Find all strings matching `QUOTE_PATTERNS` (AKA between quote chars) and decode them with various
|
|
84
|
+
encodings. The `--quote-type` arg will limit this decode to just one kind of quote.
|
|
76
85
|
"""
|
|
77
86
|
quote_selections = PdfalyzerConfig._args.extract_quoteds
|
|
78
87
|
|
|
@@ -100,11 +109,11 @@ class BinaryScanner:
|
|
|
100
109
|
# YARA rules are written on the fly and then YARA does the matching.
|
|
101
110
|
# -------------------------------------------------------------------------------
|
|
102
111
|
def extract_guillemet_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]:
|
|
103
|
-
"""Iterate on all strings surrounded by Guillemet quotes, e.g. «string
|
|
112
|
+
"""Iterate on all strings surrounded by Guillemet quotes, e.g. «string»."""
|
|
104
113
|
return self._quote_yaralyzer(QUOTE_PATTERNS[GUILLEMET], GUILLEMET).match_iterator()
|
|
105
114
|
|
|
106
115
|
def extract_backtick_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]:
|
|
107
|
-
"""Returns an interator over all strings surrounded by backticks"""
|
|
116
|
+
"""Returns an interator over all strings surrounded by backticks."""
|
|
108
117
|
return self._quote_yaralyzer(QUOTE_PATTERNS[BACKTICK], BACKTICK).match_iterator()
|
|
109
118
|
|
|
110
119
|
def extract_front_slash_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]:
|
|
@@ -137,7 +146,14 @@ class BinaryScanner:
|
|
|
137
146
|
console.line()
|
|
138
147
|
|
|
139
148
|
def process_yara_matches(self, yaralyzer: Yaralyzer, pattern: str, force: bool = False) -> None:
|
|
140
|
-
"""
|
|
149
|
+
"""
|
|
150
|
+
Decide whether to attempt to decode the matched bytes and track stats.
|
|
151
|
+
|
|
152
|
+
Args:
|
|
153
|
+
yaralyzer (Yaralyzer): The `Yaralyzer` instance to use for finding matches.
|
|
154
|
+
pattern (str): The pattern being searched for (used for stats tracking).
|
|
155
|
+
force (bool): If `True`, decode all matches even if they are very short or very long.
|
|
156
|
+
"""
|
|
141
157
|
for bytes_match, decoder in yaralyzer.match_iterator():
|
|
142
158
|
log.debug(f"Trackings match stats for {pattern}, bytes_match: {bytes_match}, is_decodable: {bytes_match.is_decodable()}") # noqa: E501
|
|
143
159
|
|
|
@@ -162,7 +178,7 @@ class BinaryScanner:
|
|
|
162
178
|
return self.bytes.split(CURRENTFILE_EEXEC)[1] if CURRENTFILE_EEXEC in self.bytes else self.bytes
|
|
163
179
|
|
|
164
180
|
def _quote_yaralyzer(self, quote_pattern: str, quote_type: str):
|
|
165
|
-
"""Helper method to build a Yaralyzer for a quote_pattern"""
|
|
181
|
+
"""Helper method to build a Yaralyzer for a `quote_pattern`."""
|
|
166
182
|
label = f"{quote_type}_Quoted"
|
|
167
183
|
|
|
168
184
|
if quote_type == GUILLEMET:
|
|
@@ -177,7 +193,7 @@ class BinaryScanner:
|
|
|
177
193
|
rules_label: Optional[str] = None,
|
|
178
194
|
pattern_label: Optional[str] = None
|
|
179
195
|
) -> Yaralyzer:
|
|
180
|
-
"""Build a yaralyzer to scan self.bytes"""
|
|
196
|
+
"""Build a `yaralyzer` to scan `self.bytes`."""
|
|
181
197
|
return Yaralyzer.for_patterns(
|
|
182
198
|
patterns=[escape_yara_pattern(pattern)],
|
|
183
199
|
patterns_type=pattern_type,
|
|
@@ -198,5 +214,5 @@ class BinaryScanner:
|
|
|
198
214
|
self.suppression_notice_queue = []
|
|
199
215
|
|
|
200
216
|
def _eexec_idx(self) -> int:
|
|
201
|
-
"""Returns the location of CURRENTFILES_EEXEC within the binary stream data (or 0 if it's not there)."""
|
|
217
|
+
"""Returns the location of `CURRENTFILES_EEXEC` within the binary stream data (or 0 if it's not there)."""
|
|
202
218
|
return self.bytes.find(CURRENTFILE_EEXEC) if CURRENTFILE_EEXEC in self.bytes else 0
|
|
@@ -9,9 +9,10 @@ from os import environ, pardir, path
|
|
|
9
9
|
from yaralyzer.config import YaralyzerConfig, is_env_var_set_and_not_false, is_invoked_by_pytest
|
|
10
10
|
|
|
11
11
|
PDFALYZE = 'pdfalyze'
|
|
12
|
+
PDFALYZER = f"{PDFALYZE}r"
|
|
12
13
|
ALL_STREAMS = -1
|
|
13
14
|
PYTEST_FLAG = 'INVOKED_BY_PYTEST'
|
|
14
|
-
PROJECT_ROOT = path.join(str(importlib.resources.files(
|
|
15
|
+
PROJECT_ROOT = path.join(str(importlib.resources.files(PDFALYZER)), pardir)
|
|
15
16
|
|
|
16
17
|
# 3rd party pdf-parser.py
|
|
17
18
|
PDF_PARSER_EXECUTABLE_ENV_VAR = 'PDFALYZER_PDF_PARSER_PY_PATH'
|
|
@@ -1,11 +1,3 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Some nodes cannot be placed until we have walked the rest of the tree. For instance
|
|
3
|
-
if we encounter a /Page that relationships /Resources we need to know if there's a
|
|
4
|
-
/Pages parent of the /Page before committing to a tree structure.
|
|
5
|
-
|
|
6
|
-
This class handles choosing among the candidates for a given PDF object's parent node
|
|
7
|
-
(AKA "figuring out where to place the node in the PDF object tree").
|
|
8
|
-
"""
|
|
9
1
|
from typing import Callable, List, Optional
|
|
10
2
|
|
|
11
3
|
from rich.markup import escape
|
|
@@ -18,6 +10,14 @@ from pdfalyzer.util.adobe_strings import *
|
|
|
18
10
|
|
|
19
11
|
|
|
20
12
|
class IndeterminateNode:
|
|
13
|
+
"""
|
|
14
|
+
Class to handle choosing among the candidates for a given PDF object's parent node.
|
|
15
|
+
|
|
16
|
+
Some nodes cannot be placed until we have walked the rest of the tree. For instance
|
|
17
|
+
if we encounter a /Page that relationships /Resources we need to know if there's a
|
|
18
|
+
/Pages parent of the /Page before committing to a tree structure.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
21
|
def __init__(self, node: PdfTreeNode) -> None:
|
|
22
22
|
self.node = node
|
|
23
23
|
|
|
@@ -56,7 +56,7 @@ class IndeterminateNode:
|
|
|
56
56
|
|
|
57
57
|
self.node.set_parent(parent)
|
|
58
58
|
|
|
59
|
-
def find_node_with_most_descendants(self, list_of_nodes: List[PdfTreeNode] = None) -> PdfTreeNode:
|
|
59
|
+
def find_node_with_most_descendants(self, list_of_nodes: Optional[List[PdfTreeNode]] = None) -> PdfTreeNode:
|
|
60
60
|
"""Find node with a reference to this one that has the most descendants"""
|
|
61
61
|
list_of_nodes = list_of_nodes or [r.from_node for r in self.node.non_tree_relationships]
|
|
62
62
|
max_descendants = max([node.descendants_count() for node in list_of_nodes])
|
|
@@ -64,7 +64,7 @@ class IndeterminateNode:
|
|
|
64
64
|
|
|
65
65
|
def _has_only_similar_relationships(self) -> bool:
|
|
66
66
|
"""
|
|
67
|
-
Returns True if all the nodes w/references to this one have the same type or if all the
|
|
67
|
+
Returns `True` if all the nodes w/references to this one have the same type or if all the
|
|
68
68
|
reference_keys that point to this node are the same.
|
|
69
69
|
"""
|
|
70
70
|
unique_refferer_labels = self.node.unique_labels_of_referring_nodes()
|
|
@@ -125,6 +125,6 @@ class IndeterminateNode:
|
|
|
125
125
|
|
|
126
126
|
|
|
127
127
|
def find_node_with_lowest_id(list_of_nodes: List[PdfTreeNode]) -> PdfTreeNode:
|
|
128
|
-
"""
|
|
128
|
+
"""Return node in `list_of_nodes` with lowest ID."""
|
|
129
129
|
lowest_idnum = min([n.idnum for n in list_of_nodes])
|
|
130
130
|
return next(n for n in list_of_nodes if n.idnum == lowest_idnum)
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from os import path
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import List, Optional, Union
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class PdfFile:
|
|
7
|
+
"""
|
|
8
|
+
Wrapper for a PDF file path that provides useful methods and properties.
|
|
9
|
+
"""
|
|
10
|
+
def __init__(self, file_path: Union[str, Path]) -> None:
|
|
11
|
+
self.file_path: Path = Path(file_path)
|
|
12
|
+
|
|
13
|
+
if not self.file_path.exists():
|
|
14
|
+
raise FileNotFoundError(f"File '{file_path}' does not exist.")
|
|
15
|
+
|
|
16
|
+
self.dirname = self.file_path.parent
|
|
17
|
+
self.basename: str = path.basename(file_path)
|
|
18
|
+
self.basename_without_ext: str = str(Path(self.basename).with_suffix(''))
|
|
19
|
+
self.extname: str = self.file_path.suffix
|
|
20
|
+
self.text_extraction_attempted: bool = False
|
|
21
|
+
|
|
22
|
+
def extract_page_range(
|
|
23
|
+
self,
|
|
24
|
+
page_range: PageRange,
|
|
25
|
+
destination_dir: Optional[Path] = None,
|
|
26
|
+
extra_file_suffix: Optional[str] = None
|
|
27
|
+
) -> Path:
|
|
28
|
+
"""Extract a range of pages to a new PDF file (or 1 page if last_page_number not provided.)"""
|
|
29
|
+
destination_dir = destination_dir or DEFAULT_PDF_ERRORS_DIR
|
|
30
|
+
create_dir_if_it_does_not_exist(destination_dir)
|
|
31
|
+
|
|
32
|
+
if extra_file_suffix is None:
|
|
33
|
+
file_suffix = page_range.file_suffix()
|
|
34
|
+
else:
|
|
35
|
+
file_suffix = f"{page_range.file_suffix()}__{extra_file_suffix}"
|
|
36
|
+
|
|
37
|
+
extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
|
|
38
|
+
extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
|
|
39
|
+
stderr_console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'...")
|
|
40
|
+
pdf_writer = PdfWriter()
|
|
41
|
+
|
|
42
|
+
with open(self.file_path, 'rb') as source_pdf:
|
|
43
|
+
pdf_writer.append(fileobj=source_pdf, pages=page_range.to_tuple())
|
|
44
|
+
|
|
45
|
+
if SortableFile.confirm_file_overwrite(extracted_pages_pdf_path):
|
|
46
|
+
with open(extracted_pages_pdf_path, 'wb') as extracted_pages_pdf:
|
|
47
|
+
pdf_writer.write(extracted_pages_pdf)
|
|
48
|
+
|
|
49
|
+
stderr_console.print(f"Wrote new PDF '{extracted_pages_pdf_path}'.")
|
|
50
|
+
return extracted_pages_pdf_path
|
|
@@ -15,7 +15,7 @@ from pdfalyzer.util.adobe_strings import *
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class PdfObjectProperties:
|
|
18
|
-
"""Simple class to extract critical features of a PdfObject
|
|
18
|
+
"""Simple class to extract critical features of a `PdfObject`."""
|
|
19
19
|
|
|
20
20
|
def __init__(
|
|
21
21
|
self,
|
|
@@ -86,7 +86,7 @@ class PdfObjectProperties:
|
|
|
86
86
|
obj: PdfObject,
|
|
87
87
|
is_single_row_table: bool = False
|
|
88
88
|
) -> List[Union[Text, str]]:
|
|
89
|
-
"""PDF object property at reference_key becomes a formatted 3-tuple for use in Rich tables."""
|
|
89
|
+
"""PDF object property at `reference_key` becomes a formatted 3-tuple for use in Rich tables."""
|
|
90
90
|
with_resolved_refs = cls.resolve_references(reference_key, obj)
|
|
91
91
|
|
|
92
92
|
return [
|
|
@@ -101,7 +101,7 @@ class PdfObjectProperties:
|
|
|
101
101
|
# TODO: this doesn't recurse...
|
|
102
102
|
@classmethod
|
|
103
103
|
def _obj_to_rich_text(cls, obj: Any) -> Text:
|
|
104
|
-
"""Recurse through obj and build a Text object."""
|
|
104
|
+
"""Recurse through `obj` and build a `Text` object."""
|
|
105
105
|
if isinstance(obj, dict):
|
|
106
106
|
key_value_pairs = [Text(f"{k}: ").append_text(cls._obj_to_rich_text(v)) for k, v in obj.items()]
|
|
107
107
|
return Text('{').append_text(comma_join_txt(key_value_pairs)).append('}')
|
|
@@ -1,10 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
3
|
-
Also adds decorators/generators for Rich text representation.
|
|
4
|
-
|
|
5
|
-
Child/parent relationships should be set using the add_child() and set_parent()
|
|
6
|
-
methods and not set directly. (TODO: this could be done better with anytree
|
|
7
|
-
hooks)
|
|
2
|
+
`PdfTreeNode` decorates a `PdfObject` with tree structure information.
|
|
8
3
|
"""
|
|
9
4
|
from typing import Callable, List, Optional
|
|
10
5
|
|
|
@@ -27,11 +22,22 @@ DECODE_FAILURE_LEN = -1
|
|
|
27
22
|
|
|
28
23
|
|
|
29
24
|
class PdfTreeNode(NodeMixin, PdfObjectProperties):
|
|
25
|
+
"""
|
|
26
|
+
PDF node decorator - wraps actual PDF objects to make them `anytree` nodes.
|
|
27
|
+
Also adds decorators/generators for Rich text representation.
|
|
28
|
+
|
|
29
|
+
Child/parent relationships should be set using the `add_child()` and `set_parent()`
|
|
30
|
+
methods and not set directly.
|
|
31
|
+
|
|
32
|
+
TODO: this could be done better with anytree hooks.
|
|
33
|
+
"""
|
|
34
|
+
|
|
30
35
|
def __init__(self, obj: PdfObject, address: str, idnum: int):
|
|
31
36
|
"""
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
37
|
+
Args:
|
|
38
|
+
obj (PdfObject): The underlying PDF object
|
|
39
|
+
address (str): The first address that points from some node to this one
|
|
40
|
+
idnum (int): ID used in the reference
|
|
35
41
|
"""
|
|
36
42
|
PdfObjectProperties.__init__(self, obj, address, idnum)
|
|
37
43
|
self.non_tree_relationships: List[PdfObjectRelationship] = []
|
|
@@ -54,7 +60,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
|
|
|
54
60
|
|
|
55
61
|
@classmethod
|
|
56
62
|
def from_reference(cls, ref: IndirectObject, address: str) -> 'PdfTreeNode':
|
|
57
|
-
"""
|
|
63
|
+
"""Alternate constructor to Build a `PdfTreeNode` from an `IndirectObject`."""
|
|
58
64
|
try:
|
|
59
65
|
return cls(ref.get_object(), address, ref.idnum)
|
|
60
66
|
except PdfReadError as e:
|
|
@@ -82,7 +88,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
|
|
|
82
88
|
child.set_parent(self)
|
|
83
89
|
|
|
84
90
|
def add_non_tree_relationship(self, relationship: PdfObjectRelationship) -> None:
|
|
85
|
-
"""Add a relationship that points at this node's PDF object. TODO: doesn't include parent/child"""
|
|
91
|
+
"""Add a relationship that points at this node's PDF object. TODO: doesn't include parent/child."""
|
|
86
92
|
if relationship in self.non_tree_relationships:
|
|
87
93
|
return
|
|
88
94
|
|
|
@@ -11,6 +11,8 @@ from pdfalyzer.util.adobe_strings import *
|
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class PdfTreeVerifier:
|
|
14
|
+
"""Class to verify that the PDF tree is complete/contains all the nodes in the PDF file."""
|
|
15
|
+
|
|
14
16
|
def __init__(self, pdfalyzer: 'Pdfalyzer') -> None:
|
|
15
17
|
self.pdfalyzer = pdfalyzer
|
|
16
18
|
|
|
@@ -1,14 +1,17 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
Functions to help with the pre-configured YARA rules in the /yara directory.
|
|
3
3
|
"""
|
|
4
4
|
from importlib.resources import as_file, files
|
|
5
5
|
from sys import exit
|
|
6
6
|
from typing import Optional, Union
|
|
7
7
|
|
|
8
8
|
from yaralyzer.config import YaralyzerConfig
|
|
9
|
+
from yaralyzer.output.rich_console import print_fatal_error_and_exit
|
|
9
10
|
from yaralyzer.yaralyzer import Yaralyzer
|
|
10
11
|
|
|
11
|
-
|
|
12
|
+
from pdfalyzer.config import PDFALYZER
|
|
13
|
+
|
|
14
|
+
YARA_RULES_DIR = files(PDFALYZER).joinpath('yara_rules')
|
|
12
15
|
|
|
13
16
|
YARA_RULES_FILES = [
|
|
14
17
|
'didier_stevens.yara',
|
|
@@ -20,11 +23,12 @@ YARA_RULES_FILES = [
|
|
|
20
23
|
|
|
21
24
|
|
|
22
25
|
def get_file_yaralyzer(file_path_to_scan: str) -> Yaralyzer:
|
|
23
|
-
"""Get a yaralyzer for a file path"""
|
|
26
|
+
"""Get a yaralyzer for a file path."""
|
|
24
27
|
return _build_yaralyzer(file_path_to_scan)
|
|
25
28
|
|
|
26
29
|
|
|
27
30
|
def get_bytes_yaralyzer(scannable: bytes, label: str) -> Yaralyzer:
|
|
31
|
+
"""Get a yaralyzer for a `scannable` bytes."""
|
|
28
32
|
return _build_yaralyzer(scannable, label)
|
|
29
33
|
|
|
30
34
|
|
|
@@ -44,10 +48,5 @@ def _build_yaralyzer(scannable: Union[bytes, str], label: Optional[str] = None)
|
|
|
44
48
|
|
|
45
49
|
try:
|
|
46
50
|
return Yaralyzer.for_rules_files(rules_paths, scannable, label)
|
|
47
|
-
except
|
|
48
|
-
|
|
49
|
-
if "it doesn't exist" in str(e):
|
|
50
|
-
print(str(e))
|
|
51
|
-
exit(1)
|
|
52
|
-
else:
|
|
53
|
-
raise e
|
|
51
|
+
except FileNotFoundError as e:
|
|
52
|
+
print_fatal_error_and_exit(str(e))
|
|
@@ -55,5 +55,6 @@ def number_and_pct(_number: int, total: int, digits: int = 1) -> Text:
|
|
|
55
55
|
|
|
56
56
|
|
|
57
57
|
def pct_txt(_number: int, total: int, digits: int = 1) -> Text:
|
|
58
|
+
"""Return nicely formatted percentage, e.g. '(80%)'."""
|
|
58
59
|
pct = (100 * float(_number) / float(total)).__round__(digits)
|
|
59
60
|
return Text(f"({pct}%)", style='blue')
|
|
@@ -19,7 +19,7 @@ CHARMAP_PADDING = (0, 2, 0, 10)
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
def print_character_mapping(font: 'FontInfo') -> None: # noqa: F821
|
|
22
|
-
"""Prints the character mapping extracted by PyPDF._charmap in tidy columns"""
|
|
22
|
+
"""Prints the character mapping extracted by PyPDF._charmap in tidy columns."""
|
|
23
23
|
if font.character_mapping is None or len(font.character_mapping) == 0:
|
|
24
24
|
log.info(f"No character map found in {font}")
|
|
25
25
|
return
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Methods to help with the formatting of the output tables, headers, panels, etc.
|
|
3
3
|
"""
|
|
4
|
+
from typing import List
|
|
5
|
+
|
|
4
6
|
from rich import box
|
|
5
7
|
from rich.padding import Padding
|
|
6
8
|
from rich.panel import Panel
|
|
@@ -11,7 +13,7 @@ DEFAULT_SUBTABLE_COL_STYLES = ['white', 'bright_white']
|
|
|
11
13
|
HEADER_PADDING = (1, 1)
|
|
12
14
|
|
|
13
15
|
|
|
14
|
-
def generate_subtable(cols, header_style='subtable') -> Table:
|
|
16
|
+
def generate_subtable(cols: List[str], header_style: str = 'subtable') -> Table:
|
|
15
17
|
"""Suited for placement in larger tables."""
|
|
16
18
|
table = Table(
|
|
17
19
|
box=box.SIMPLE,
|
|
@@ -33,10 +35,12 @@ def generate_subtable(cols, header_style='subtable') -> Table:
|
|
|
33
35
|
|
|
34
36
|
|
|
35
37
|
def subheading_width() -> int:
|
|
38
|
+
"""Return 75% of the console width."""
|
|
36
39
|
return int(console_width() * 0.75)
|
|
37
40
|
|
|
38
41
|
|
|
39
42
|
def half_width() -> int:
|
|
43
|
+
"""Return 50% of the console width."""
|
|
40
44
|
return int(console_width() * 0.5)
|
|
41
45
|
|
|
42
46
|
|
|
@@ -46,28 +50,34 @@ def pad_header(header: str) -> Padding:
|
|
|
46
50
|
|
|
47
51
|
|
|
48
52
|
def print_section_header(headline: str, style: str = '') -> None:
|
|
53
|
+
"""Prints a full-width section header with padding above and below."""
|
|
49
54
|
console.line(2)
|
|
50
55
|
_print_header_panel(headline, f"{style} reverse", True, console_width(), HEADER_PADDING)
|
|
51
56
|
console.line()
|
|
52
57
|
|
|
53
58
|
|
|
54
59
|
def print_section_subheader(headline: str, style: str = '') -> None:
|
|
60
|
+
"""Prints a section subheader at 75% of the console width, preceded by a blank line."""
|
|
55
61
|
console.line()
|
|
56
62
|
_print_header_panel(headline, style, True, subheading_width(), HEADER_PADDING)
|
|
57
63
|
|
|
58
64
|
|
|
59
65
|
def print_section_sub_subheader(headline: str, style: str = ''):
|
|
66
|
+
"""Prints a half-width section sub-subheader preceded by a blank line, with no padding inside the panel."""
|
|
60
67
|
console.line()
|
|
61
68
|
_print_header_panel(headline, style, True, half_width())
|
|
62
69
|
|
|
63
70
|
|
|
64
|
-
def print_headline_panel(headline, style: str = ''):
|
|
71
|
+
def print_headline_panel(headline: str, style: str = ''):
|
|
72
|
+
"""Prints a full-width headline panel with no padding above or below."""
|
|
65
73
|
_print_header_panel(headline, style, False, console_width())
|
|
66
74
|
|
|
67
75
|
|
|
68
|
-
def print_fatal_error_panel(headline):
|
|
76
|
+
def print_fatal_error_panel(headline: str):
|
|
77
|
+
"""Prints a full-width red blinking panel for fatal errors."""
|
|
69
78
|
print_headline_panel(headline, style='red blink')
|
|
70
79
|
|
|
71
80
|
|
|
72
81
|
def _print_header_panel(headline: str, style: str, expand: bool, width: int, padding: tuple = (0,)) -> None:
|
|
82
|
+
"""Helper to print a rich `Panel` with the given style, width, and padding."""
|
|
73
83
|
console.print(Panel(headline, style=style, expand=expand, width=width or subheading_width(), padding=padding))
|
|
@@ -19,7 +19,7 @@ DECODES_SUBTABLE_COLS = ['Encoding', '#', 'Decoded', '#', 'Forced', '#', 'Failed
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
def build_decoding_stats_table(scanner: BinaryScanner) -> Table:
|
|
22
|
-
"""Diplay aggregate results on the decoding attempts we made on subsets of scanner.bytes"""
|
|
22
|
+
"""Display aggregate results on the decoding attempts we made on subsets of `scanner.bytes`."""
|
|
23
23
|
stats_table = _new_decoding_stats_table(scanner.label.plain if scanner.label else '')
|
|
24
24
|
regexes_not_found_in_stream = []
|
|
25
25
|
|
|
@@ -58,9 +58,9 @@ def build_decoding_stats_table(scanner: BinaryScanner) -> Table:
|
|
|
58
58
|
return stats_table
|
|
59
59
|
|
|
60
60
|
|
|
61
|
-
def _new_decoding_stats_table(
|
|
62
|
-
"""Build an empty table for displaying decoding stats"""
|
|
63
|
-
title = prefix_with_style(
|
|
61
|
+
def _new_decoding_stats_table(title_str: str) -> Table:
|
|
62
|
+
"""Build an empty table for displaying decoding stats."""
|
|
63
|
+
title = prefix_with_style(title_str, style='blue underline')
|
|
64
64
|
title.append(": Decoding Attempts Summary Statistics", style='bright_white bold')
|
|
65
65
|
|
|
66
66
|
table = Table(
|
|
@@ -15,8 +15,8 @@ ATTRIBUTES_TO_SHOW_IN_SUMMARY_TABLE = [
|
|
|
15
15
|
]
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
def font_summary_table(font):
|
|
19
|
-
"""Build a Rich Table with important info about the font"""
|
|
18
|
+
def font_summary_table(font: 'FontInfo') -> Table: # noqa: F821
|
|
19
|
+
"""Build a Rich `Table` with important info about the font."""
|
|
20
20
|
table = Table('', '', show_header=False)
|
|
21
21
|
table.columns[0].style = 'font.property'
|
|
22
22
|
table.columns[0].justify = 'right'
|
|
@@ -1,10 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
3
|
-
wrapping each internal PDF object in a PdfTreeNode. Tree is managed by
|
|
4
|
-
the anytree library. Information about the tree as a whole is stored
|
|
5
|
-
in this class.
|
|
6
|
-
Once the PDF is parsed this class manages access to
|
|
7
|
-
information about or from the underlying PDF tree.
|
|
2
|
+
PDFalyzer: Analyze and explore the structure of PDF files.
|
|
8
3
|
"""
|
|
9
4
|
from os.path import basename
|
|
10
5
|
from typing import Dict, Iterator, List, Optional
|
|
@@ -31,7 +26,19 @@ TRAILER_FALLBACK_ID = 10000000
|
|
|
31
26
|
|
|
32
27
|
|
|
33
28
|
class Pdfalyzer:
|
|
29
|
+
"""
|
|
30
|
+
Walks a PDF's internals and builds the PDF logical structure tree.
|
|
31
|
+
|
|
32
|
+
Each of the PDF's internal objects is wrapped in a `PdfTreeNode` object. The tree is managed
|
|
33
|
+
by the `anytree` library. Information about the tree as a whole is stored in this class.
|
|
34
|
+
Once the PDF is parsed this class provides access to info about or from the underlying PDF tree.
|
|
35
|
+
"""
|
|
36
|
+
|
|
34
37
|
def __init__(self, pdf_path: str):
|
|
38
|
+
"""
|
|
39
|
+
Args:
|
|
40
|
+
pdf_path: Path to the PDF file to analyze
|
|
41
|
+
"""
|
|
35
42
|
self.pdf_path = pdf_path
|
|
36
43
|
self.pdf_basename = basename(pdf_path)
|
|
37
44
|
self.pdf_bytes = load_binary_data(pdf_path)
|
|
@@ -72,7 +79,7 @@ class Pdfalyzer:
|
|
|
72
79
|
log.info(f"Walk complete.")
|
|
73
80
|
|
|
74
81
|
def walk_node(self, node: PdfTreeNode) -> None:
|
|
75
|
-
"""Recursively walk the PDF's tree structure starting at a given node"""
|
|
82
|
+
"""Recursively walk the PDF's tree structure starting at a given node."""
|
|
76
83
|
log.info(f'walk_node() called with {node}. Object dump:\n{print_with_header(node.obj, node.label)}')
|
|
77
84
|
nodes_to_walk_next = [self._add_relationship_to_pdf_tree(r) for r in node.references_to_other_nodes()]
|
|
78
85
|
node.all_references_processed = True
|
|
@@ -82,7 +89,7 @@ class Pdfalyzer:
|
|
|
82
89
|
self.walk_node(next_node)
|
|
83
90
|
|
|
84
91
|
def find_node_by_idnum(self, idnum) -> Optional[PdfTreeNode]:
|
|
85
|
-
"""Find node with idnum in the tree. Return None if that node is not reachable from the root."""
|
|
92
|
+
"""Find node with `idnum` in the tree. Return `None` if that node is not reachable from the root."""
|
|
86
93
|
nodes = [
|
|
87
94
|
node for node in findall_by_attr(self.pdf_tree, name='idnum', value=idnum)
|
|
88
95
|
if not isinstance(node, SymlinkNode)
|
|
@@ -96,7 +103,7 @@ class Pdfalyzer:
|
|
|
96
103
|
raise PdfWalkError(f"Too many nodes had id {idnum}: {nodes}")
|
|
97
104
|
|
|
98
105
|
def is_in_tree(self, search_for_node: PdfTreeNode) -> bool:
|
|
99
|
-
"""Returns true if search_for_node is in the tree already."""
|
|
106
|
+
"""Returns true if `search_for_node` is in the tree already."""
|
|
100
107
|
return any([node == search_for_node for node in self.node_iterator()])
|
|
101
108
|
|
|
102
109
|
def node_iterator(self) -> Iterator[PdfTreeNode]:
|
|
@@ -110,7 +117,7 @@ class Pdfalyzer:
|
|
|
110
117
|
|
|
111
118
|
def _add_relationship_to_pdf_tree(self, relationship: PdfObjectRelationship) -> Optional[PdfTreeNode]:
|
|
112
119
|
"""
|
|
113
|
-
Place the relationship
|
|
120
|
+
Place the `relationship` node in the tree. Returns an optional node that should be
|
|
114
121
|
placed in the PDF node processing queue.
|
|
115
122
|
"""
|
|
116
123
|
log.info(f'Assessing relationship {relationship}...')
|
|
@@ -172,7 +179,7 @@ class Pdfalyzer:
|
|
|
172
179
|
return to_node
|
|
173
180
|
|
|
174
181
|
def _resolve_indeterminate_nodes(self) -> None:
|
|
175
|
-
"""Place all indeterminate nodes in the tree."""
|
|
182
|
+
"""Place all indeterminate nodes in the tree. Called after all nodes have been walked."""
|
|
176
183
|
indeterminate_nodes = [self.nodes_encountered[idnum] for idnum in self.indeterminate_ids]
|
|
177
184
|
indeterminate_nodes_string = "\n ".join([f"{node}" for node in indeterminate_nodes])
|
|
178
185
|
log.info(f"Resolving {len(indeterminate_nodes)} indeterminate nodes: {indeterminate_nodes_string}")
|
|
@@ -185,7 +192,7 @@ class Pdfalyzer:
|
|
|
185
192
|
IndeterminateNode(node).place_node()
|
|
186
193
|
|
|
187
194
|
def _extract_font_infos(self) -> None:
|
|
188
|
-
"""Extract information about fonts in the tree and place it in self.font_infos"""
|
|
195
|
+
"""Extract information about fonts in the tree and place it in `self.font_infos`."""
|
|
189
196
|
for node in self.node_iterator():
|
|
190
197
|
if isinstance(node.obj, dict) and RESOURCES in node.obj:
|
|
191
198
|
log.debug(f"Extracting fonts from node with '{RESOURCES}' key: {node}...")
|
|
@@ -207,6 +214,6 @@ class Pdfalyzer:
|
|
|
207
214
|
return new_node
|
|
208
215
|
|
|
209
216
|
def _print_nodes_encountered(self) -> None:
|
|
210
|
-
"""Debug method that displays which nodes have already been walked"""
|
|
217
|
+
"""Debug method that displays which nodes have already been walked."""
|
|
211
218
|
for i in sorted(self.nodes_encountered.keys()):
|
|
212
219
|
console.print(f'{i}: {self.nodes_encountered[i]}')
|
|
@@ -15,7 +15,7 @@ from rich.text import Text
|
|
|
15
15
|
from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args, source
|
|
16
16
|
from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation
|
|
17
17
|
|
|
18
|
-
from pdfalyzer.config import ALL_STREAMS, PdfalyzerConfig
|
|
18
|
+
from pdfalyzer.config import ALL_STREAMS, PDFALYZER, PdfalyzerConfig
|
|
19
19
|
from pdfalyzer.detection.constants.binary_regexes import QUOTE_PATTERNS
|
|
20
20
|
from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
|
|
21
21
|
with_pdf_extension)
|
|
@@ -124,9 +124,9 @@ parser._action_groups = parser._action_groups[:2] + [parser._action_groups[-1]]
|
|
|
124
124
|
# Main argument parsing begins #
|
|
125
125
|
################################
|
|
126
126
|
def parse_arguments():
|
|
127
|
-
"""Parse command line args. Most
|
|
127
|
+
"""Parse command line args. Most args can also be communicated to the app by setting env vars."""
|
|
128
128
|
if '--version' in sys.argv:
|
|
129
|
-
print(f"pdfalyzer {version(
|
|
129
|
+
print(f"pdfalyzer {version(PDFALYZER)}")
|
|
130
130
|
sys.exit()
|
|
131
131
|
|
|
132
132
|
args = parser.parse_args()
|
|
@@ -158,10 +158,16 @@ def parse_arguments():
|
|
|
158
158
|
return args
|
|
159
159
|
|
|
160
160
|
|
|
161
|
-
def output_sections(args, pdfalyzer) -> List[OutputSection]:
|
|
161
|
+
def output_sections(args: Namespace, pdfalyzer: 'Pdfalyzer') -> List[OutputSection]: # noqa: F821
|
|
162
162
|
"""
|
|
163
163
|
Determine which of the tree visualizations, font scans, etc should be run.
|
|
164
164
|
If nothing is specified output ALL sections other than --streams which is v. slow/verbose.
|
|
165
|
+
|
|
166
|
+
Args:
|
|
167
|
+
args: parsed command line arguments
|
|
168
|
+
pdfalyzer: the `Pdfalyzer` instance whose methods will be called to produce output
|
|
169
|
+
Returns:
|
|
170
|
+
List[OutputSection]: List of `OutputSection` namedtuples with 'argument' and 'method' fields
|
|
165
171
|
"""
|
|
166
172
|
# Create a partial for print_font_info() because it's the only one that can take an argument
|
|
167
173
|
# partials have no __name__ so update_wrapper() propagates the 'print_font_info' as this partial's name
|
|
@@ -196,9 +202,10 @@ def all_sections_chosen(args):
|
|
|
196
202
|
return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)
|
|
197
203
|
|
|
198
204
|
|
|
199
|
-
|
|
200
|
-
#
|
|
201
|
-
|
|
205
|
+
#############################################################
|
|
206
|
+
# Separate arg parsers for combine_pdfs and other scripts #
|
|
207
|
+
#############################################################
|
|
208
|
+
|
|
202
209
|
MAX_QUALITY = 10
|
|
203
210
|
|
|
204
211
|
combine_pdfs_parser = ArgumentParser(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "pdfalyzer"
|
|
3
|
-
version = "1.16.
|
|
3
|
+
version = "1.16.14"
|
|
4
4
|
description = "PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more."
|
|
5
5
|
authors = ["Michel de Cryptadamus <michel@cryptadamus.com>"]
|
|
6
6
|
license = "GPL-3.0-or-later"
|
|
@@ -15,7 +15,6 @@ classifiers = [
|
|
|
15
15
|
"Intended Audience :: Information Technology",
|
|
16
16
|
"License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
|
|
17
17
|
"Programming Language :: Python",
|
|
18
|
-
"Programming Language :: Python :: 3.9",
|
|
19
18
|
"Programming Language :: Python :: 3.10",
|
|
20
19
|
"Programming Language :: Python :: 3.11",
|
|
21
20
|
"Programming Language :: Python :: 3.12",
|
|
@@ -66,10 +65,10 @@ packages = [
|
|
|
66
65
|
# Dependencies #
|
|
67
66
|
#####################
|
|
68
67
|
[tool.poetry.dependencies]
|
|
69
|
-
python = "^3.
|
|
68
|
+
python = "^3.10"
|
|
70
69
|
anytree = "~=2.13"
|
|
71
70
|
pypdf = "^6.0.0"
|
|
72
|
-
yaralyzer = "^1.0.
|
|
71
|
+
yaralyzer = "^1.0.9"
|
|
73
72
|
|
|
74
73
|
[tool.poetry.group.dev.dependencies]
|
|
75
74
|
flake8 = "^7.3.0"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{pdfalyzer-1.16.13 → pdfalyzer-1.16.14}/pdfalyzer/yara_rules/lprat.static_file_analysis.yara
RENAMED
|
File without changes
|
|
File without changes
|