pdfalyzer 1.14.9__tar.gz → 1.15.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pdfalyzer might be problematic.
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/CHANGELOG.md +7 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/PKG-INFO +30 -16
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/README.md +29 -14
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/__init__.py +42 -2
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/binary/binary_scanner.py +11 -12
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/decorators/document_model_printer.py +1 -1
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/decorators/indeterminate_node.py +8 -6
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/decorators/pdf_object_properties.py +1 -1
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/decorators/pdf_tree_node.py +15 -13
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/decorators/pdf_tree_verifier.py +1 -1
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/detection/constants/binary_regexes.py +1 -7
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/detection/javascript_hunter.py +1 -1
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/detection/yaralyzer_helper.py +1 -1
- pdfalyzer-1.15.0/pdfalyzer/helpers/filesystem_helper.py +102 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/helpers/number_helper.py +3 -1
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/helpers/rich_text_helper.py +12 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/pdf_object_relationship.py +0 -1
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/util/adobe_strings.py +2 -1
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/util/argument_parser.py +78 -3
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/yara_rules/PDF.yara +25 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pyproject.toml +2 -2
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/LICENSE +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/__main__.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/config.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/detection/constants/javascript_reserved_keywords.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/font_info.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/helpers/dict_helper.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/helpers/pdf_object_helper.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/helpers/string_helper.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/output/character_mapping.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/output/layout.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/output/pdfalyzer_presenter.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/output/styles/node_colors.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/output/styles/rich_theme.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/output/tables/decoding_stats_table.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/output/tables/font_summary_table.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/output/tables/pdf_node_rich_table.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/output/tables/stream_objects_table.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/pdfalyzer.py +1 -1
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/util/debugging.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/util/exceptions.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/util/pdf_parser_manager.py +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/yara_rules/PDF_binary_stream.yara +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/yara_rules/__init.py__ +0 -0
- {pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/yara_rules/lprat.static_file_analysis.yara +0 -0
{pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/CHANGELOG.md

@@ -1,5 +1,12 @@
 # NEXT RELEASE

+# 1.15.0
+* Add `combine_pdfs` command line script to merge a bunch of PDFs into one
+* Remove unused `Deprecated` dependency
+
+### 1.14.10
+* Add `malware_MaldocinPDF` YARA rule
+
 ### 1.14.9
 * Add [ActiveMime YARA rule](https://blog.didierstevens.com/2023/08/29/quickpost-pdf-activemime-maldocs-yara-rule/)
{pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pdfalyzer
-Version: 1.14.9
+Version: 1.15.0
 Summary: A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
 Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
 License: GPL-3.0-or-later

@@ -16,7 +16,6 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Topic :: Artistic Software
 Classifier: Topic :: Scientific/Engineering :: Visualization
 Classifier: Topic :: Security
-Requires-Dist: Deprecated (>=1.2.13,<2.0.0)
 Requires-Dist: PyPDF2 (>=2.10,<3.0)
 Requires-Dist: anytree (>=2.8,<3.0)
 Requires-Dist: chardet (>=5.0.0,<6.0.0)

@@ -63,25 +62,32 @@ If you're looking for one of these things this may be the tool for you.
 ### What It Don't Do
 This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.

+-------------

 # Installation

-Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `pip3` should also work.
+Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `pip3` / `pip` should also work.
 ```sh
 pipx install pdfalyzer
 ```

 See [PyPDF2 installation notes](https://github.com/py-pdf/PyPDF2#installation) about `PyCryptodome` if you plan to `pdfalyze` any files that use AES encryption.

-
+If you are on macOS someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so `brew install pdfalyzer` should work.
+
+### Troubleshooting
 1. If you used `pip3` instead of `pipx` and have an issue you should try to install with `pipx`.
 1. If you run into an issue about missing YARA try to install [yara-python](https://pypi.org/project/yara-python/).
 1. If you encounter an error building the python `cryptography` package check your `pip` version (`pip --version`). If it's less than 22.0, upgrade `pip` with `pip install --upgrade pip`.
+1. If you get a YARA internal error number you can look up what it actually means [here](https://github.com/VirusTotal/yara/blob/master/libyara/include/yara/error.h).
+1. If you can't get the `pdfalyze` command to work try `python -m pdfalyzer`. It's an equivalent but more portable version of the same command that does not rely on your python script paths being set up in a sane way.
+1. While The Pdfalyzer has been tested on quite a few large and very complicated PDFs there are no doubt a bunch of edge cases that will trip up the code. Sifting through the various interconnected internal PDF objects and building the correct tree representation is much, much harder than it should be and requires multiple scans and a little bit of educated guessing. If a PDF fails to parse and you hit an error please open [a GitHub issue](https://github.com/michelcrypt4d4mus/pdfalyzer/issues) with the compressed (`.zip`, `.gz`, whatever) PDF that is causing the problem attached (if possible) and I'll take a look when I can. I will _not_ take a look at any uncompressed PDFs due to the security risks so make sure you zip it before you ship it.
 1. On Linux if you encounter an error building `wheel` or `cffi` you may need to install some packages:
 ```bash
 sudo apt-get install build-essential libssl-dev libffi-dev rustc
 ```
-
+
+-------------

 # Usage

@@ -92,8 +98,8 @@ Run `pdfalyze --help` to see usage instructions. As of right now these are the o
 ## Runtime Options
 If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--help` then all of the analyses will be done _except_ the `--streams`. In other words, these two commands are equivalent:

-1. `
-1. `
+1. `pdfalyze lacan_buys_the_dip.pdf`
+1. `pdfalyze lacan_buys_the_dip.pdf -d -t -r -f -y -c`

 The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.

@@ -106,15 +112,11 @@ Even if you don't configure your own `.pdfalyzer` file you may still glean some
 ### Colors And Themes
 Run `pdfalyzer_show_color_theme` to see the color theme employed.

-
-## Guarantees
+### Guarantees
 Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.

-
-
-1. If you can't get the `pdfalyze` command to work try `python -m pdfalyzer`. It's an equivalent but more portable version of the same command that does not rely on your python script paths being set up in a sane way.
-1. While The Pdfalyzer has been tested on quite a few large and very complicated PDFs there are no doubt a bunch of edge cases that will trip up the code. If that does happen and you hit an error, please open [a GitHub issue](https://github.com/michelcrypt4d4mus/pdfalyzer/issues) with the compressed (`.zip`, `.gz`, whatever) PDF that is causing the problem attached (if possible) and I'll take a look when I can. I will _not_ take a look at any uncompressed PDFs due to the security risks so make sure you zip it before you ship it.
-
+## Example Usage
+[BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).

 -------------

@@ -135,6 +137,7 @@ pdfalyzer = Pdfalyzer("/path/to/the/evil_or_non_evil.pdf")
 actual_pdf_tree: PdfTreeNode = pdfalyzer.pdf_tree

 # The PdfalyzerPresenter handles formatting/prettifying output
+from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
 PdfalyzerPresenter(pdfalyzer).print_everything()

 # Iterate over all nodes in the PDF tree

@@ -164,6 +167,7 @@ for backtick_quoted_string in font.binary_scanner.extract_backtick_quoted_bytes(
     do_stuff(backtick_quoted_string)
 ```

+-------------

 # Example Output
 The Pdfalyzer can export visualizations to HTML, ANSI colored text, and SVG images using the file export functionality that comes with [Rich](https://github.com/Textualize/rich). SVGs can be turned into `png` format images with a tool like Inkscape or `cairosvg` (Inkscape works a lot better in our experience). See `pdfalyze --help` for the specifics.

@@ -188,7 +192,7 @@ This image shows a more in-depth view of of the PDF tree for the same document s

 ## Fonts

-#### **Extract character mappings from ancient Adobe font formats
+#### **Extract character mappings from ancient Adobe font formats**. It's actually `PyPDF2` doing the lifting here but we're happy to take the credit.

 


@@ -223,8 +227,11 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed

 

+-------------

 # PDF Resources
+## Included PDF Tools
+The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.

 ## 3rd Party PDF Tools
 ### Installing Didier Stevens's PDF Analysis Tools

@@ -247,7 +254,7 @@ There's [a script](scripts/install_t1utils.sh) to help you install the suite if
 scripts/install_t1utils.sh
 ```

-## Documentation
+## External Documentation
 ### Official Adobe Documentation
 * [Official Adobe PDF 1.7 Specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf) - Indispensable map when navigating a PDF forest.
 * [Adobe Type 1 Font Format Specification](https://adobe-type-tools.github.io/font-tech-notes/pdfs/T1_SPEC.pdf) - Official spec for Adobe's original font description language and file format. Useful if you have suspicions about malicious fonts. Type1 seems to be the attack vector of choice recently which isn't so surprising when you consider that it's a 30 year old technology and the code that renders these fonts probably hasn't been extensively tested in decades because almost no one uses them anymore outside of people who want to use them as attack vectors.

@@ -270,6 +277,8 @@ This tool was built to fill a gap in the PDF assessment landscape following [my

 Thus I felt the world might be slightly improved if I strung together a couple of more stable/well known/actively maintained open source projects ([AnyTree](https://github.com/c0fec0de/anytree), [PyPDF2](https://github.com/py-pdf/PyPDF2), [Rich](https://github.com/Textualize/rich), and [YARA](https://github.com/VirusTotal/yara-python) via [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer)) into this tool.

+-------------
+
 # Contributing
 One easy way of contributing is to run [the script to test against all the PDFs in your `~/Documents` folder](scripts/test_against_all_pdfs_in_Documents_folder.sh) and report any issues.

@@ -290,7 +299,12 @@ These are the naming conventions at play in The Pdfalyzer code base:
 | **`indeterminate_node`** | any node whose place in the tree cannot be decided until every node has been seen |
 | **`link_node`** | nodes like `/Dest` that just contain a pointer to another node |

+### Reference
+* [`PyPDF2 2.12.0` documentation](https://pypdf2.readthedocs.io/en/2.12.0/) (latest is 4.x or something so these are the relevant docs for `pdfalyze`)
+
+
 # TODO
+* Upgrade `PyPDF` to latest and expand `combine_pdfs` compression command line option
 * Highlight decodes with a lot of Javascript keywords
 * https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
 * https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
{pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/README.md

@@ -33,25 +33,32 @@ If you're looking for one of these things this may be the tool for you.
 ### What It Don't Do
 This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.

+-------------

 # Installation

-Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `pip3` should also work.
+Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `pip3` / `pip` should also work.
 ```sh
 pipx install pdfalyzer
 ```

 See [PyPDF2 installation notes](https://github.com/py-pdf/PyPDF2#installation) about `PyCryptodome` if you plan to `pdfalyze` any files that use AES encryption.

-
+If you are on macOS someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so `brew install pdfalyzer` should work.
+
+### Troubleshooting
 1. If you used `pip3` instead of `pipx` and have an issue you should try to install with `pipx`.
 1. If you run into an issue about missing YARA try to install [yara-python](https://pypi.org/project/yara-python/).
 1. If you encounter an error building the python `cryptography` package check your `pip` version (`pip --version`). If it's less than 22.0, upgrade `pip` with `pip install --upgrade pip`.
+1. If you get a YARA internal error number you can look up what it actually means [here](https://github.com/VirusTotal/yara/blob/master/libyara/include/yara/error.h).
+1. If you can't get the `pdfalyze` command to work try `python -m pdfalyzer`. It's an equivalent but more portable version of the same command that does not rely on your python script paths being set up in a sane way.
+1. While The Pdfalyzer has been tested on quite a few large and very complicated PDFs there are no doubt a bunch of edge cases that will trip up the code. Sifting through the various interconnected internal PDF objects and building the correct tree representation is much, much harder than it should be and requires multiple scans and a little bit of educated guessing. If a PDF fails to parse and you hit an error please open [a GitHub issue](https://github.com/michelcrypt4d4mus/pdfalyzer/issues) with the compressed (`.zip`, `.gz`, whatever) PDF that is causing the problem attached (if possible) and I'll take a look when I can. I will _not_ take a look at any uncompressed PDFs due to the security risks so make sure you zip it before you ship it.
 1. On Linux if you encounter an error building `wheel` or `cffi` you may need to install some packages:
 ```bash
 sudo apt-get install build-essential libssl-dev libffi-dev rustc
 ```
-
+
+-------------

 # Usage

@@ -62,8 +69,8 @@ Run `pdfalyze --help` to see usage instructions. As of right now these are the o
 ## Runtime Options
 If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--help` then all of the analyses will be done _except_ the `--streams`. In other words, these two commands are equivalent:

-1. `
-1. `
+1. `pdfalyze lacan_buys_the_dip.pdf`
+1. `pdfalyze lacan_buys_the_dip.pdf -d -t -r -f -y -c`

 The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.

@@ -76,15 +83,11 @@ Even if you don't configure your own `.pdfalyzer` file you may still glean some
 ### Colors And Themes
 Run `pdfalyzer_show_color_theme` to see the color theme employed.

-
-## Guarantees
+### Guarantees
 Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.

-
-
-1. If you can't get the `pdfalyze` command to work try `python -m pdfalyzer`. It's an equivalent but more portable version of the same command that does not rely on your python script paths being set up in a sane way.
-1. While The Pdfalyzer has been tested on quite a few large and very complicated PDFs there are no doubt a bunch of edge cases that will trip up the code. If that does happen and you hit an error, please open [a GitHub issue](https://github.com/michelcrypt4d4mus/pdfalyzer/issues) with the compressed (`.zip`, `.gz`, whatever) PDF that is causing the problem attached (if possible) and I'll take a look when I can. I will _not_ take a look at any uncompressed PDFs due to the security risks so make sure you zip it before you ship it.
-
+## Example Usage
+[BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).

 -------------

@@ -105,6 +108,7 @@ pdfalyzer = Pdfalyzer("/path/to/the/evil_or_non_evil.pdf")
 actual_pdf_tree: PdfTreeNode = pdfalyzer.pdf_tree

 # The PdfalyzerPresenter handles formatting/prettifying output
+from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
 PdfalyzerPresenter(pdfalyzer).print_everything()

 # Iterate over all nodes in the PDF tree

@@ -134,6 +138,7 @@ for backtick_quoted_string in font.binary_scanner.extract_backtick_quoted_bytes(
     do_stuff(backtick_quoted_string)
 ```

+-------------

 # Example Output
 The Pdfalyzer can export visualizations to HTML, ANSI colored text, and SVG images using the file export functionality that comes with [Rich](https://github.com/Textualize/rich). SVGs can be turned into `png` format images with a tool like Inkscape or `cairosvg` (Inkscape works a lot better in our experience). See `pdfalyze --help` for the specifics.

@@ -158,7 +163,7 @@ This image shows a more in-depth view of of the PDF tree for the same document s

 ## Fonts

-#### **Extract character mappings from ancient Adobe font formats
+#### **Extract character mappings from ancient Adobe font formats**. It's actually `PyPDF2` doing the lifting here but we're happy to take the credit.

 


@@ -193,8 +198,11 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed

 

+-------------

 # PDF Resources
+## Included PDF Tools
+The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.

 ## 3rd Party PDF Tools
 ### Installing Didier Stevens's PDF Analysis Tools

@@ -217,7 +225,7 @@ There's [a script](scripts/install_t1utils.sh) to help you install the suite if
 scripts/install_t1utils.sh
 ```

-## Documentation
+## External Documentation
 ### Official Adobe Documentation
 * [Official Adobe PDF 1.7 Specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf) - Indispensable map when navigating a PDF forest.
 * [Adobe Type 1 Font Format Specification](https://adobe-type-tools.github.io/font-tech-notes/pdfs/T1_SPEC.pdf) - Official spec for Adobe's original font description language and file format. Useful if you have suspicions about malicious fonts. Type1 seems to be the attack vector of choice recently which isn't so surprising when you consider that it's a 30 year old technology and the code that renders these fonts probably hasn't been extensively tested in decades because almost no one uses them anymore outside of people who want to use them as attack vectors.

@@ -240,6 +248,8 @@ This tool was built to fill a gap in the PDF assessment landscape following [my

 Thus I felt the world might be slightly improved if I strung together a couple of more stable/well known/actively maintained open source projects ([AnyTree](https://github.com/c0fec0de/anytree), [PyPDF2](https://github.com/py-pdf/PyPDF2), [Rich](https://github.com/Textualize/rich), and [YARA](https://github.com/VirusTotal/yara-python) via [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer)) into this tool.

+-------------
+
 # Contributing
 One easy way of contributing is to run [the script to test against all the PDFs in your `~/Documents` folder](scripts/test_against_all_pdfs_in_Documents_folder.sh) and report any issues.

@@ -260,7 +270,12 @@ These are the naming conventions at play in The Pdfalyzer code base:
 | **`indeterminate_node`** | any node whose place in the tree cannot be decided until every node has been seen |
 | **`link_node`** | nodes like `/Dest` that just contain a pointer to another node |

+### Reference
+* [`PyPDF2 2.12.0` documentation](https://pypdf2.readthedocs.io/en/2.12.0/) (latest is 4.x or something so these are the relevant docs for `pdfalyze`)
+
+
 # TODO
+* Upgrade `PyPDF` to latest and expand `combine_pdfs` compression command line option
 * Highlight decodes with a lot of Javascript keywords
 * https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
 * https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
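As a quick orientation for the README changes above: the Python usage example now imports `PdfalyzerPresenter` explicitly before calling it. A minimal sketch of that documented flow, with a hypothetical PDF path, might look like this:

```python
# Minimal sketch based on the README excerpts in this diff; the file path is hypothetical.
from pdfalyzer.pdfalyzer import Pdfalyzer
from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter  # import added to the 1.15.0 example

pdfalyzer = Pdfalyzer("/path/to/some_suspicious.pdf")  # parses the PDF and builds the object tree
actual_pdf_tree = pdfalyzer.pdf_tree                   # root PdfTreeNode of the reconstructed tree

# The PdfalyzerPresenter handles formatting/prettifying output
PdfalyzerPresenter(pdfalyzer).print_everything()
```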
{pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/__init__.py

@@ -1,10 +1,14 @@
 import code
-import logging
 import sys
 from os import environ, getcwd, path
+from pathlib import Path

 from dotenv import load_dotenv
+# TODO: PdfMerger is deprecated in favor of PdfWriter at v3.9.1 (see https://pypdf.readthedocs.io/en/latest/user/merging-pdfs.html#basic-example)
+from PyPDF2 import PdfMerger
+from PyPDF2.errors import PdfReadError

+# Should be first local import before load_dotenv() (or at least I think it needs to come first)
 from pdfalyzer.config import PdfalyzerConfig

 # load_dotenv() should be called as soon as possible (before parsing local classes) but not for pytest

@@ -16,16 +20,19 @@ if not environ.get('INVOKED_BY_PYTEST', False):

 from rich.columns import Columns
 from rich.panel import Panel
+from rich.text import Text
 from yaralyzer.helpers.rich_text_helper import prefix_with_plain_text_obj
 from yaralyzer.output.file_export import invoke_rich_export
 from yaralyzer.output.rich_console import console
 from yaralyzer.util.logging import log, log_and_print

+from pdfalyzer.helpers.filesystem_helper import file_size_in_mb, set_max_open_files
+from pdfalyzer.helpers.rich_text_helper import print_highlighted
 from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
 from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
 from pdfalyzer.pdfalyzer import Pdfalyzer
+from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments, parse_combine_pdfs_args
 from pdfalyzer.util.pdf_parser_manager import PdfParserManager
-from pdfalyzer.util.argument_parser import output_sections, parse_arguments

 # For the table shown by running pdfalyzer_show_color_theme
 MAX_THEME_COL_SIZE = 35

@@ -82,3 +89,36 @@ def pdfalyzer_show_color_theme() -> None:
     ]

     console.print(Columns(colors, column_first=True, padding=(0,3)))
+
+
+def combine_pdfs():
+    """Utility method to combine multiple PDFs into one. Invocable with 'combine_pdfs PDF1 [PDF2...]'."""
+    args = parse_combine_pdfs_args()
+    set_max_open_files(args.number_of_pdfs)
+    merger = PdfMerger()
+
+    for pdf in args.pdfs:
+        try:
+            print_highlighted(f" -> Merging '{pdf}'...", style='dim')
+            merger.append(pdf)
+        except PdfReadError as e:
+            print_highlighted(f" -> Failed to merge '{pdf}'! {e}", style='red')
+            ask_to_proceed()
+
+    if args.compression_level == 0:
+        print_highlighted("\nSkipping content stream compression...")
+    else:
+        print_highlighted(f"\nCompressing content streams with zlib level {args.compression_level}...")
+
+        for i, page in enumerate(merger.pages):
+            # TODO: enable image quality reduction + zlib level once PyPDF is upgraded to 4.x and option is available
+            # See https://pypdf.readthedocs.io/en/latest/user/file-size.html#reducing-image-quality
+            print_highlighted(f" -> Compressing page {i + 1}...", style='dim')
+            page.pagedata.compress_content_streams()  # This is CPU intensive!
+
+    print_highlighted(f"\nWriting '{args.output_file}'...", style='cyan')
+    merger.write(args.output_file)
+    merger.close()
+    txt = Text('').append(f" -> Wrote ")
+    txt.append(str(file_size_in_mb(args.output_file)), style='cyan').append(" megabytes\n")
+    print_highlighted(txt)
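The new `combine_pdfs()` entry point above is mostly argument handling and Rich output wrapped around a small PyPDF2 merge loop. A condensed sketch of that core flow (hypothetical file names, no CLI parsing or confirmation prompts) would be:

```python
# Condensed sketch of the merge-and-compress flow shown in the diff above; file names are hypothetical.
# Note the diff's own TODO: PdfMerger is deprecated in favor of PdfWriter in newer pypdf releases.
from PyPDF2 import PdfMerger

merger = PdfMerger()

for pdf in ["report_1.pdf", "report_2.pdf", "report_3.pdf"]:
    merger.append(pdf)                         # queue each input PDF for merging

for page in merger.pages:
    page.pagedata.compress_content_streams()   # zlib-compress content streams (CPU intensive)

merger.write("combined.pdf")                   # write the merged document
merger.close()
```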
{pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/binary/binary_scanner.py

@@ -20,9 +20,8 @@ from yaralyzer.util.logging import log

 from pdfalyzer.config import PdfalyzerConfig
 from pdfalyzer.decorators.pdf_tree_node import PdfTreeNode
-from pdfalyzer.detection.constants.binary_regexes import (BACKTICK,
-    DANGEROUS_PDF_KEYS_TO_HUNT_ONLY_IN_FONTS, DANGEROUS_STRINGS, FRONTSLASH, GUILLEMET,
-    QUOTE_PATTERNS)
+from pdfalyzer.detection.constants.binary_regexes import (BACKTICK, DANGEROUS_PDF_KEYS_TO_HUNT_ONLY_IN_FONTS,
+    DANGEROUS_PDF_KEYS_TO_HUNT_ONLY_IN_FONTS, DANGEROUS_STRINGS, FRONTSLASH, GUILLEMET, QUOTE_PATTERNS)
 from pdfalyzer.helpers.string_helper import generate_hyphen_line
 from pdfalyzer.output.layout import print_headline_panel, print_section_sub_subheader
 from pdfalyzer.util.adobe_strings import CONTENTS, CURRENTFILE_EEXEC, FONT_FILE_KEYS

@@ -30,7 +29,7 @@ from pdfalyzer.util.adobe_strings import CONTENTS, CURRENTFILE_EEXEC, FONT_FILE_

 class BinaryScanner:
     def __init__(self, _bytes: bytes, owner: PdfTreeNode, label: Optional[Text] = None):
-        """owner is an optional link back to the object containing this binary"""
+        """'owner' arg is an optional link back to the object containing this binary."""
         self.bytes = _bytes
         self.label = label
         self.owner = owner

@@ -43,8 +42,8 @@ class BinaryScanner:
         self.regex_extraction_stats = defaultdict(lambda: RegexMatchMetrics())

     def check_for_dangerous_instructions(self) -> None:
-        """Scan for all the strings in DANGEROUS_INSTRUCTIONS list and decode bytes around them"""
-        subheader = "Scanning Binary For Anything That Could Be Described As '
+        """Scan for all the strings in DANGEROUS_INSTRUCTIONS list and decode bytes around them."""
+        subheader = "Scanning Binary For Anything That Could Be Described As 'sus'..."
         print_section_sub_subheader(subheader, style=f"bright_red")

         for instruction in DANGEROUS_STRINGS:

@@ -62,7 +61,7 @@ class BinaryScanner:
             self.process_yara_matches(yaralyzer, instruction, force=True)

     def check_for_boms(self) -> None:
-        """Check the binary data for BOMs"""
+        """Check the binary data for BOMs."""
         print_section_sub_subheader("Scanning Binary for any BOMs...", style='BOM')

         for bom_bytes, bom_name in BOMS.items():

@@ -105,11 +104,11 @@ class BinaryScanner:
         return self._quote_yaralyzer(QUOTE_PATTERNS[BACKTICK], BACKTICK).match_iterator()

     def extract_front_slash_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]:
-        """Returns an interator over all strings surrounded by front_slashes (hint: regular expressions)"""
+        """Returns an interator over all strings surrounded by front_slashes (hint: regular expressions)."""
         return self._quote_yaralyzer(QUOTE_PATTERNS[FRONTSLASH], FRONTSLASH).match_iterator()

     def print_stream_preview(self, num_bytes=None, title_suffix=None) -> None:
-        """Print a preview showing the beginning and end of the embedded stream data"""
+        """Print a preview showing the beginning and end of the embedded stream data."""
         num_bytes = num_bytes or PdfalyzerConfig._args.preview_stream_length or console_width()
         snipped_byte_count = self.stream_length - (num_bytes * 2)
         console.line()

@@ -134,7 +133,7 @@ class BinaryScanner:
         console.line()

     def process_yara_matches(self, yaralyzer: Yaralyzer, pattern: str, force: bool = False) -> None:
-        """Decide whether to attempt to decode the matched bytes, track stats. force param ignores min/max length"""
+        """Decide whether to attempt to decode the matched bytes, track stats. force param ignores min/max length."""
         for bytes_match, decoder in yaralyzer.match_iterator():
             log.debug(f"Trackings stats for match: {pattern}, bytes_match: {bytes_match}, is_decodable: {bytes_match.is_decodable()}")


@@ -185,7 +184,7 @@ class BinaryScanner:
         )

     def _print_suppression_notices(self) -> None:
-        """Print notices in queue in a single panel
+        """Print the notices in queue in a single display panel and then empty the queue."""
         if len(self.suppression_notice_queue) == 0:
             return


@@ -195,5 +194,5 @@ class BinaryScanner:
         self.suppression_notice_queue = []

     def _eexec_idx(self) -> int:
-        """Returns the location of CURRENTFILES_EEXEC within the binary stream
+        """Returns the location of CURRENTFILES_EEXEC within the binary stream data (or 0 if it's not there)."""
         return self.bytes.find(CURRENTFILE_EEXEC) if CURRENTFILE_EEXEC in self.bytes else 0
{pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/decorators/indeterminate_node.py

@@ -22,6 +22,7 @@ class IndeterminateNode:
         self.node = node

     def place_node(self) -> None:
+        """Attempt to find the appropriate parent/child relationships for this node."""
         log.debug(f"Attempting to resolve indeterminate node: {self.node}")

         if self._check_for_common_ancestor():

@@ -34,7 +35,7 @@ class IndeterminateNode:
         parent = self.find_node_with_most_descendants()
         parent_str = escape(str(parent))

-        # Any branch that doesn't return or raise will
+        # Any if/else branch that doesn't return or raise will decide parent to be the node w/most descendants
         if self._has_only_similar_relationships():
             log.info(f" Fuzzy match addresses or labels; placing under node w/most descendants: {parent_str}")
         elif self._make_parent_if_one_remains(lambda r: r.from_node.type in PAGE_AND_PAGES):

@@ -43,7 +44,8 @@ class IndeterminateNode:
         elif self.node.type == COLOR_SPACE:
             log.info(f" Color space node found; placing under node w/most descendants: {parent_str}")
         elif set(self.node.unique_labels_of_referring_nodes()) == set(PAGE_AND_PAGES):
-            #
+            # Handle an edge case seen in the wild involving a PDF that doesn't conform to the PDF spec
+            # in a particular way.
             log.warning(f" {self.node} seems to be a loose {PAGE}. Linking to first {PAGES}")
             pages_nodes = [n for n in self.node.nodes_with_here_references() if self.node.type == PAGES]
             self.node.set_parent(self.find_node_with_most_descendants(pages_nodes))

@@ -63,7 +65,7 @@ class IndeterminateNode:
     def _has_only_similar_relationships(self) -> bool:
         """
         Returns True if all the nodes w/references to this one have the same type or if all the
-        reference_keys that point to this node are the same
+        reference_keys that point to this node are the same.
         """
         unique_refferer_labels = self.node.unique_labels_of_referring_nodes()
         unique_addresses = self.node.unique_addresses()

@@ -99,7 +101,7 @@ class IndeterminateNode:
             log.info(f"{possible_ancestor} is the common ancestor of {other_nodes_str}")
             return possible_ancestor

-    def _check_single_relation_rules(self):
+    def _check_single_relation_rules(self) -> bool:
         """Check various ways of narrowing down the list of potential parents to one node."""
         if self._make_parent_if_one_remains(lambda r: r.reference_key in [K, KIDS]):
             log.info(" Found single explicit /K or /Kids ref")

@@ -111,7 +113,7 @@ class IndeterminateNode:
             return True

     def _make_parent_if_one_remains(self, is_possible_parent: Callable) -> bool:
-        """Relationships are filtered w/
+        """Relationships are filtered w/is_possible_parent(); if there's only one possibility it's made the parent."""
         remaining_relationships = [r for r in self.node.non_tree_relationships if is_possible_parent(r)]

         if len(remaining_relationships) == 1:

@@ -123,6 +125,6 @@ class IndeterminateNode:


 def find_node_with_lowest_id(list_of_nodes: List[PdfTreeNode]) -> PdfTreeNode:
-    """Find node in list_of_nodes_with_lowest ID"""
+    """Find node in list_of_nodes_with_lowest ID."""
     lowest_idnum = min([n.idnum for n in list_of_nodes])
     return next(n for n in list_of_nodes if n.idnum == lowest_idnum)
{pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/decorators/pdf_object_properties.py

@@ -47,7 +47,7 @@ class PdfObjectProperties:
         self.label = address
         self.type = root_address(address) if isinstance(address, str) else None

-        # Force a string. TODO this sucks.
+        # Force self.label to be a string. TODO this sucks.
         if isinstance(self.label, int):
             self.label = f"{UNLABELED}[{self.label}]"

{pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/decorators/pdf_tree_node.py

@@ -29,9 +29,9 @@ DECODE_FAILURE_LEN = -1
 class PdfTreeNode(NodeMixin, PdfObjectProperties):
     def __init__(self, obj: PdfObject, address: str, idnum: int):
         """
-        obj:
-        address:
-        idnum:
+        obj: The underlying PDF object
+        address: The first address that points from some node to this one
+        idnum: ID used in the reference
         """
         PdfObjectProperties.__init__(self, obj, address, idnum)
         self.non_tree_relationships: List[PdfObjectRelationship] = []

@@ -54,7 +54,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):

     @classmethod
     def from_reference(cls, ref: IndirectObject, address: str) -> 'PdfTreeNode':
-        """Builds a PdfTreeDecorator from an IndirectObject"""
+        """Builds a PdfTreeDecorator from an IndirectObject."""
         try:
             return cls(ref.get_object(), address, ref.idnum)
         except PdfReadError as e:

@@ -90,7 +90,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
             log.info(f'Added other relationship: {relationship} {self}')

     def remove_non_tree_relationship(self, from_node: 'PdfTreeNode') -> None:
-        """Remove all non_tree_relationships from from_node to this node"""
+        """Remove all non_tree_relationships from from_node to this node."""
         relationships_to_remove = [r for r in self.non_tree_relationships if r.from_node == from_node]

         if len(relationships_to_remove) == 0:

@@ -104,10 +104,11 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
             self.non_tree_relationships.remove(relationship)

     def nodes_with_here_references(self) -> List['PdfTreeNode']:
-        """Return a list of nodes that contain this
+        """Return a list of nodes that contain this node's PDF object as an IndirectObject reference."""
         return [r.from_node for r in self.non_tree_relationships if r.from_node]

     def non_tree_relationship_count(self) -> int:
+        """Number of non parent/child relationships containing this node."""
         return len(self.non_tree_relationships)

     def unique_addresses(self) -> List[str]:

@@ -120,15 +121,15 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
         return list(addresses)

     def references_to_other_nodes(self) -> List[PdfObjectRelationship]:
-        """Returns all nodes referenced from node.obj (see PdfObjectRelationship definition)"""
+        """Returns all nodes referenced from node.obj (see PdfObjectRelationship definition)."""
         return PdfObjectRelationship.build_node_references(from_node=self)

     def contains_stream(self) -> bool:
-        """Returns True for ContentStream, DecodedStream, and EncodedStream objects"""
+        """Returns True for ContentStream, DecodedStream, and EncodedStream objects."""
         return isinstance(self.obj, StreamObject)

     def tree_address(self, max_length: Optional[int] = DEFAULT_MAX_ADDRESS_LENGTH) -> str:
-        """Creates a string like '/Catalog/Pages/Resources[2]/Font' truncated to max_length (if given)"""
+        """Creates a string like '/Catalog/Pages/Resources[2]/Font' truncated to max_length (if given)."""
         if self.label == TRAILER:
             return '/'
         elif self.parent is None:

@@ -144,7 +145,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
             return '...' + address[-max_length:][3:]

     def address_of_this_node_in_other(self, from_node: 'PdfTreeNode') -> Optional[str]:
-        """Find the local address used in from_node to refer to this node"""
+        """Find the local address used in 'from_node' to refer to this node."""
         refs_to_this_node = [
             ref for ref in from_node.references_to_other_nodes()
             if ref.to_obj.idnum == self.idnum

@@ -163,7 +164,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
         else:
             address = refs_to_this_node[0].address
             # If other node's label doesn't start with a NON_STANDARD_ADDRESS string
-            # and any of the relationships pointing at this
+            # and any of the relationships pointing at this node use something other than a
             # NON_STANDARD_ADDRESS_NODES string to refer here, print a warning about multiple refs.
             if not (is_prefixed_by_any(from_node.label, NON_STANDARD_ADDRESS_NODES) or \
                 all(ref.address in NON_STANDARD_ADDRESS_NODES for ref in refs_to_this_node)):

@@ -189,10 +190,11 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
             SymlinkNode(self, parent=relationship.from_node)

     def descendants_count(self) -> int:
-        """
+        """Count nodes in the tree that are children/grandchildren/great grandchildren/etc of this one."""
         return len(self.children) + sum([child.descendants_count() for child in self.children])

     def unique_labels_of_referring_nodes(self) -> List[str]:
+        """Unique label strings of nodes referring here outside the parent/child hierarchy."""
         return list(set([r.from_node.label for r in self.non_tree_relationships]))

     def print_non_tree_relationships(self) -> None:

@@ -211,7 +213,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
             write_method(f" {i + 1}. {escape(str(r))}, Descendant Count: {r.from_node.descendants_count()}")

     def _colored_address(self, max_length: Optional[int] = None) -> Text:
-        """Rich text version of tree_address()"""
+        """Rich text version of tree_address()."""
         text = Text('@', style='bright_white')
         return text.append(self.tree_address(max_length), style='address')

{pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/decorators/pdf_tree_verifier.py

@@ -29,7 +29,7 @@ class PdfTreeVerifier:
         log.warning(msg)

     def verify_unencountered_are_untraversable(self) -> None:
-        """Make sure any PDF object IDs we can't find in tree are /ObjStm or /Xref nodes"""
+        """Make sure any PDF object IDs we can't find in tree are /ObjStm or /Xref nodes."""
         if self.pdfalyzer.pdf_size is None:
             log.warning(f"{SIZE} not found in PDF trailer; cannot verify all nodes are in tree")
             return
{pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/detection/constants/binary_regexes.py

@@ -1,13 +1,7 @@
 """
 Configuration of what to scan for in binary data. Regexes here will be matched against binary streams
-and then force decoded
+and then force decoded.
 """
-
-import re
-from typing import Union
-
-from deprecated import deprecated
-
 from pdfalyzer.util.adobe_strings import DANGEROUS_PDF_KEYS

 DANGEROUS_JAVASCRIPT_INSTRUCTIONS = ['eval']
pdfalyzer-1.15.0/pdfalyzer/helpers/filesystem_helper.py

@@ -0,0 +1,102 @@
+"""
+Some helpers for stuff with the local filesystem.
+"""
+import re
+from pathlib import Path
+from typing import Union
+
+from yaralyzer.output.rich_console import console
+
+from pdfalyzer.helpers.rich_text_helper import print_highlighted
+
+NUMBERED_PAGE_REGEX = re.compile(r'.*_(\d+)\.\w{3,4}$')
+DEFAULT_MAX_OPEN_FILES = 256  # macOS default
+OPEN_FILES_BUFFER = 30  # we might have some files open already so we need to go beyond DEFAULT_MAX_OPEN_FILES
+PDF_EXT = '.pdf'
+
+# TODO: this kind of type alias is not supported until Python 3.12
+#type StrOrPath = Union[str, Path]
+
+
+def with_pdf_extension(file_path: Union[str, Path]) -> str:
+    """Append '.pdf' to 'file_path' if it doesn't already end with '.pdf'."""
+    return str(file_path) + ('' if is_pdf(file_path) else PDF_EXT)
+
+
+def is_pdf(file_path: Union[str, Path]) -> bool:
+    """Return True if 'file_path' ends with '.pdf'."""
+    return str(file_path).endswith(PDF_EXT)
+
+
+def file_exists(file_path: Union[str, Path]) -> bool:
+    """Return True if 'file_path' exists."""
+    return Path(file_path).exists()
+
+
+def do_all_files_exist(file_paths: list[Union[str, Path]]) -> bool:
+    """Print an error for each element of 'file_paths' that's not a file. Return True if all 'file_paths' exist."""
+    all_files_exist = True
+
+    for file_path in file_paths:
+        if not file_exists(file_path):
+            console.print(f"File not found: '{file_path}'", style='error')
+            all_files_exist = False
+
+    return all_files_exist
+
+
+def extract_page_number(file_path: Union[str, Path]) -> int|None:
+    """Extract the page number from the end of a filename if it exists."""
+    match = NUMBERED_PAGE_REGEX.match(str(file_path))
+    return int(match.group(1)) if match else None
+
+
+def file_size_in_mb(file_path: Union[str, Path], decimal_places: int = 2) -> float:
+    """Return the size of 'file_path' in MB rounded to 2 decimal places,"""
+    return round(Path(file_path).stat().st_size / 1024.0 / 1024.0, decimal_places)
+
+
+def set_max_open_files(num_filehandles: int = DEFAULT_MAX_OPEN_FILES) -> tuple[int | None, int | None]:
+    """
+    Sets the OS level max open files to at least 'num_filehandles'. Current value can be seen with 'ulimit -a'.
+    Required when you might be opening more than DEFAULT_MAX_OPEN_FILES file handles simultaneously
+    (e.g. when you are merging a lot of small images or PDFs). Equivalent of something like
+    'default ulimit -n 1024' on macOS.
+
+    NOTE: Does nothing on Windows (I think).
+    NOTE: This mostly came from somewhere on stackoverflow but I lost the link.
+    """
+    try:
+        import resource  # Windows doesn't have this package / doesn't need to bump up the ulimit (??)
+    except ImportError:
+        resource = None
+
+    if resource is None:
+        print_highlighted(f"No resource module; cannot set max open files on this platform...", style='yellow')
+        return (None, None)
+    elif num_filehandles <= DEFAULT_MAX_OPEN_FILES:
+        # Then the OS max open files value is already sufficient.
+        return (DEFAULT_MAX_OPEN_FILES, DEFAULT_MAX_OPEN_FILES)
+
+    # %% (0) what is current ulimit -n setting?
+    (soft, hard) = resource.getrlimit(resource.RLIMIT_NOFILE)
+    num_filehandles = num_filehandles + OPEN_FILES_BUFFER
+
+    # %% (1) increase limit (soft and even hard) if needed
+    if soft < num_filehandles:
+        soft = num_filehandles
+        hard = max(soft, hard)
+        print_highlighted(f"Increasing max open files soft & hard 'ulimit -n {soft} {hard}'...")
+
+        try:
+            resource.setrlimit(resource.RLIMIT_NOFILE, (soft, hard))
+        except (ValueError, resource.error):
+            try:
+                hard = soft
+                print_highlighted(f"Retrying setting max open files (soft, hard)=({soft}, {hard})", style='yellow')
+                resource.setrlimit(resource.RLIMIT_NOFILE, (soft, hard))
+            except Exception:
+                print_highlighted('Failed to set max open files / ulimit, giving up!', style='error')
+                soft,hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+
+    return (soft, hard)
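To make the behavior of the new path helpers concrete, here is a small illustrative sketch (hypothetical file names) of what the pure string/path functions above return, following directly from the regex and constants defined in the new module:

```python
# Illustrative expectations for the new helpers, assuming the definitions shown above.
from pdfalyzer.helpers.filesystem_helper import extract_page_number, is_pdf, with_pdf_extension

assert is_pdf("scan_12.pdf") is True                  # ends with '.pdf'
assert is_pdf("scan_12.tiff") is False
assert with_pdf_extension("merged") == "merged.pdf"   # appends '.pdf' only when missing
assert with_pdf_extension("merged.pdf") == "merged.pdf"
assert extract_page_number("chapter_3.pdf") == 3      # NUMBERED_PAGE_REGEX captures the trailing digits
assert extract_page_number("chapter.pdf") is None     # no trailing '_<number>' suffix
```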
{pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/helpers/rich_text_helper.py

@@ -1,14 +1,26 @@
 """
 Functions for miscellaneous Rich text/string operations.
 """
+from functools import partial
 from typing import List

 from PyPDF2.generic import PdfObject
+from rich.console import Console
+from rich.highlighter import RegexHighlighter, JSONHighlighter
 from rich.text import Text
+from yaralyzer.output.rich_console import console

 from pdfalyzer.helpers.pdf_object_helper import pypdf_class_name
 from pdfalyzer.output.styles.node_colors import get_label_style, get_class_style_italic

+# Usually we use the yaralyzer console but that has no highlighter
+pdfalyzer_console = Console(color_system='256')
+
+
+def print_highlighted(msg: str|Text, **kwargs) -> None:
+    """Print 'msg' with Rich highlighting."""
+    pdfalyzer_console.print(msg, highlight=True, **kwargs)
+

 def quoted_text(
     _string: str,
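The new `print_highlighted()` helper is what the `combine_pdfs` code paths use for status output; a tiny usage sketch (message text and numbers are hypothetical):

```python
# Minimal sketch of the helper added above; Rich's automatic highlighting applies to plain strings.
from rich.text import Text
from pdfalyzer.helpers.rich_text_helper import print_highlighted

print_highlighted("Merging 'page_1.pdf' (1 of 3)...", style='dim')
print_highlighted(Text("Wrote ").append("2.41", style='cyan').append(" megabytes"))
```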
{pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/pdf_object_relationship.py

@@ -6,7 +6,6 @@ from typing import List, Optional, Union
 from PyPDF2.generic import IndirectObject, PdfObject
 from yaralyzer.util.logging import log

-#from pdfalyzer.he import has_indeterminate_prefix
 from pdfalyzer.helpers.string_helper import bracketed, is_prefixed_by_any
 from pdfalyzer.util.adobe_strings import *

{pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/util/adobe_strings.py

@@ -79,7 +79,8 @@ XREF_STREAM = '/XRefStm'
 FONT_LENGTHS = [f'/Length{i + 1}' for i in range(3)]
 FONT_FILE_KEYS = [FONT_FILE, FONT_FILE2, FONT_FILE3]

-# Instructions to flag when scanning stream data for malicious content.
+# Instructions to flag when scanning stream data for malicious content. The leading
+# front slash will be removed when pattern matching.
 DANGEROUS_PDF_KEYS = [
     # AA,  # AA is too generic; can't afford to remove the frontslash
     ACRO_FORM,
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import sys
|
|
2
|
-
from argparse import ArgumentError, ArgumentParser
|
|
2
|
+
from argparse import ArgumentError, ArgumentParser, Namespace
|
|
3
3
|
from collections import namedtuple
|
|
4
4
|
from functools import partial, update_wrapper
|
|
5
5
|
from importlib.metadata import version
|
|
@@ -7,11 +7,16 @@ from os import getcwd, path
|
|
|
7
7
|
from typing import List
|
|
8
8
|
|
|
9
9
|
from rich_argparse_plus import RichHelpFormatterPlus
|
|
10
|
+
from rich.prompt import Confirm
|
|
11
|
+
from rich.text import Text
|
|
10
12
|
from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args
|
|
11
13
|
from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation
|
|
12
14
|
|
|
13
15
|
from pdfalyzer.config import ALL_STREAMS, PdfalyzerConfig
|
|
14
16
|
from pdfalyzer.detection.constants.binary_regexes import QUOTE_PATTERNS
|
|
17
|
+
from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
|
|
18
|
+
with_pdf_extension)
|
|
19
|
+
from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
15
20
|
|
|
16
21
|
# NamedTuple to keep our argument selection orderly
|
|
17
22
|
OutputSection = namedtuple('OutputSection', ['argument', 'method'])
|
|
@@ -25,7 +30,7 @@ DESCRIPTION = "Explore PDF's inner data structure with absurdly large and in dep
|
|
|
25
30
|
|
|
26
31
|
EPILOG = "Values for various config options can be set permanently by a .pdfalyzer file in your home directory; " + \
|
|
27
32
|
"see the documentation for details. " + \
|
|
28
|
-
f"A registry of previous pdfalyzer invocations will be
|
|
33
|
+
f"A registry of previous pdfalyzer invocations will be inscribed to a file if the " + \
|
|
29
34
|
"{YaralyzerConfig.LOG_DIR_ENV_VAR} environment variable is configured."
|
|
30
35
|
|
|
31
36
|
# Analysis selection sections
|
|
@@ -107,7 +112,9 @@ select.add_argument('--preview-stream-length',
  parser._action_groups = parser._action_groups[:2] + [parser._action_groups[-1]] + parser._action_groups[2:-1]


-
+ ################################
+ # Main argument parsing begins #
+ ################################
  def parse_arguments():
      """Parse command line args. Most settings are communicated to the app by setting env vars"""
      if '--version' in sys.argv:
@@ -175,3 +182,71 @@ def output_sections(args, pdfalyzer) -> List[OutputSection]:
  def all_sections_chosen(args):
      """Returns true if all flags are set or no flags are set."""
      return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)
+
+
+ ###############################################
+ # Separate arg parser for combine_pdfs script #
+ ###############################################
+ combine_pdfs_parser = ArgumentParser(
+     description="Combine multiple PDFs into one.",
+     epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc.) sort the files as if those were" \
+            " page numbers prior to merging.",
+     formatter_class=RichHelpFormatterPlus)
+
+ combine_pdfs_parser.add_argument('pdfs',
+                                  help='two or more PDFs to combine',
+                                  metavar='PDF_PATH',
+                                  nargs='+')
+
+ combine_pdfs_parser.add_argument('-c', '--compression-level',
+                                  help='zlib image compression level (0=none, max=1 until PyPDF is upgraded)',
+                                  choices=range(0, 2),
+                                  default=1,
+                                  type=int)
+
+ combine_pdfs_parser.add_argument('-o', '--output-file',
+                                  help='path to write the combined PDFs to',
+                                  required=True)
+
+
+ def parse_combine_pdfs_args() -> Namespace:
+     """Parse command line args for combine_pdfs script."""
+     args = combine_pdfs_parser.parse_args()
+     args.output_file = with_pdf_extension(args.output_file)
+     confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
+     args.number_of_pdfs = len(args.pdfs)
+
+     if args.number_of_pdfs < 2:
+         exit_with_error(f"Need at least 2 PDFs to merge.")
+     elif not do_all_files_exist(args.pdfs):
+         exit_with_error()
+     elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
+         exit_with_error()
+
+     if all(is_pdf(pdf) for pdf in args.pdfs):
+         if all(extract_page_number(pdf) for pdf in args.pdfs):
+             print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
+             args.pdfs.sort(key=lambda pdf: extract_page_number(pdf))
+         else:
+             print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
+     else:
+         print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
+         ask_to_proceed()
+
+     print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
+     return args
+
+
+ def ask_to_proceed() -> None:
+     """Exit if user doesn't confirm they want to proceed."""
+     if not Confirm.ask(Text("Proceed anyway?")):
+         exit_with_error()
+
+
+ def exit_with_error(error_message: str|None = None) -> None:
+     """Print 'error_message' and exit with status code 1."""
+     if error_message:
+         print_highlighted(error_message, style='bold red')
+
+     print_highlighted('Exiting...', style='red')
+     sys.exit(1)
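The sort step in parse_combine_pdfs_args() relies on extract_page_number() from the new filesystem_helper module, whose body is not shown in this hunk. A stand-in sketch of that numeric-suffix sort (illustrative only; this extract_page_number() is an assumption, not necessarily pdfalyzer's implementation):

import re
from typing import Optional

def extract_page_number(pdf_path: str) -> Optional[int]:
    """Stand-in helper: return the trailing integer of 'xyz_12.pdf', or None if there isn't one."""
    match = re.search(r'(\d+)\.pdf$', pdf_path, re.IGNORECASE)
    return int(match.group(1)) if match else None

pdfs = ['report_10.pdf', 'report_2.pdf', 'report_1.pdf']

if all(extract_page_number(pdf) for pdf in pdfs):
    # Numeric sort yields 1, 2, 10 instead of the lexicographic 1, 10, 2
    pdfs.sort(key=extract_page_number)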
@@ -1019,3 +1019,28 @@ rule rule_pdf_activemime {
      condition:
          $pdf at 0 and any of ($base64_ActiveMim*)
  }
+
+
+ rule malware_MaldocinPDF {
+     meta:
+         author = "Yuma Masubuchi and Kota Kino"
+         description = "Search for embeddings of malicious Word files into a PDF file."
+         created_date = "2023-08-15"
+         blog_reference = "https://blogs.jpcert.or.jp/en/2023/08/maldocinpdf.html"
+         labs_reference = "N/A"
+         labs_pivot = "N/A"
+         samples = "ef59d7038cfd565fd65bae12588810d5361df938244ebad33b71882dcf683058"
+
+     strings:
+         $docfile2 = "<w:WordDocument>" ascii nocase
+         $xlsfile2 = "<x:ExcelWorkbook>" ascii nocase
+         $mhtfile0 = "mime" ascii nocase
+         $mhtfile1 = "content-location:" ascii nocase
+         $mhtfile2 = "content-type:" ascii nocase
+
+     condition:
+         (uint32(0) == 0x46445025) and
+         (1 of ($mhtfile*)) and
+         ( (1 of ($docfile*)) or
+           (1 of ($xlsfile*)) )
+ }
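The uint32(0) == 0x46445025 condition is the little-endian encoding of the "%PDF" magic bytes, so the rule only fires on files that open like a PDF while also containing MHTML and Office markers. A quick way to try the new rule outside of pdfalyzer (illustrative only; assumes the yara-python package is installed and uses a placeholder sample path):

import yara

# Compile the bundled rules file and scan a suspect document
rules = yara.compile(filepath='pdfalyzer/yara_rules/PDF.yara')
matches = rules.match(filepath='suspect.pdf')  # placeholder path

if any(match.rule == 'malware_MaldocinPDF' for match in matches):
    print('Possible MalDoc-in-PDF polyglot detected')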
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "pdfalyzer"
- version = "1.14.9"
+ version = "1.15.0"
  description = "A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more."
  authors = ["Michel de Cryptadamus <michel@cryptadamus.com>"]
  license = "GPL-3.0-or-later"
@@ -42,6 +42,7 @@ packages = [


  [tool.poetry.scripts]
+ combine_pdfs = 'pdfalyzer:combine_pdfs'
  pdfalyze = 'pdfalyzer:pdfalyze'
  pdfalyzer_show_color_theme = 'pdfalyzer:pdfalyzer_show_color_theme'

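The new combine_pdfs console script points at a combine_pdfs() callable in pdfalyzer/__init__.py, which is not part of this section of the diff. A hypothetical sketch of what such an entry point could look like, wired to the argument parser added above (illustrative only; the PdfMerger usage and the omission of --compression-level handling are assumptions, not pdfalyzer's actual code):

from PyPDF2 import PdfMerger

from pdfalyzer.util.argument_parser import parse_combine_pdfs_args

def combine_pdfs() -> None:
    """Hypothetical entry point: append each input PDF in order and write the merged file."""
    args = parse_combine_pdfs_args()
    merger = PdfMerger()

    for pdf in args.pdfs:
        merger.append(pdf)

    merger.write(args.output_file)
    merger.close()

Once the package is installed, such a script would be invoked from the shell as, e.g., combine_pdfs -o merged.pdf chapter_1.pdf chapter_2.pdf.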
@@ -50,7 +51,6 @@ pdfalyzer_show_color_theme = 'pdfalyzer:pdfalyzer_show_color_theme'
  python = "^3.9"
  anytree = "~=2.8"
  chardet = ">=5.0.0,<6.0.0"
- Deprecated = "^1.2.13"
  PyPDF2 = "^2.10"
  python-dotenv = "^0.21.0"
  rich = "^12.5.1"
{pdfalyzer-1.14.9 → pdfalyzer-1.15.0}/pdfalyzer/detection/constants/javascript_reserved_keywords.py RENAMED
File without changes
@@ -13,8 +13,8 @@ from anytree import LevelOrderIter, SymlinkNode
  from anytree.search import findall, findall_by_attr
  from PyPDF2 import PdfReader
  from PyPDF2.generic import IndirectObject
- from yaralyzer.output.file_hashes_table import compute_file_hashes
  from yaralyzer.helpers.file_helper import load_binary_data
+ from yaralyzer.output.file_hashes_table import compute_file_hashes
  from yaralyzer.output.rich_console import console
  from yaralyzer.util.logging import log
