pdfalyzer 1.14.10__tar.gz → 1.15.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pdfalyzer might be problematic. Click here for more details.

Files changed (45) hide show
  1. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/CHANGELOG.md +4 -0
  2. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/PKG-INFO +30 -16
  3. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/README.md +29 -14
  4. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/__init__.py +42 -2
  5. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/binary/binary_scanner.py +2 -3
  6. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/decorators/document_model_printer.py +1 -1
  7. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/decorators/indeterminate_node.py +8 -6
  8. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/decorators/pdf_tree_node.py +5 -3
  9. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/detection/constants/binary_regexes.py +1 -7
  10. pdfalyzer-1.15.0/pdfalyzer/helpers/filesystem_helper.py +102 -0
  11. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/helpers/rich_text_helper.py +12 -0
  12. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/util/adobe_strings.py +2 -1
  13. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/util/argument_parser.py +78 -3
  14. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/yara_rules/PDF.yara +1 -1
  15. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pyproject.toml +2 -2
  16. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/LICENSE +0 -0
  17. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/__main__.py +0 -0
  18. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/config.py +0 -0
  19. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/decorators/pdf_object_properties.py +0 -0
  20. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/decorators/pdf_tree_verifier.py +0 -0
  21. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/detection/constants/javascript_reserved_keywords.py +0 -0
  22. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/detection/javascript_hunter.py +0 -0
  23. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/detection/yaralyzer_helper.py +0 -0
  24. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/font_info.py +0 -0
  25. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/helpers/dict_helper.py +0 -0
  26. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/helpers/number_helper.py +0 -0
  27. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/helpers/pdf_object_helper.py +0 -0
  28. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/helpers/string_helper.py +0 -0
  29. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/output/character_mapping.py +0 -0
  30. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/output/layout.py +0 -0
  31. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/output/pdfalyzer_presenter.py +0 -0
  32. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/output/styles/node_colors.py +0 -0
  33. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/output/styles/rich_theme.py +0 -0
  34. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/output/tables/decoding_stats_table.py +0 -0
  35. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/output/tables/font_summary_table.py +0 -0
  36. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/output/tables/pdf_node_rich_table.py +0 -0
  37. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/output/tables/stream_objects_table.py +0 -0
  38. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/pdf_object_relationship.py +0 -0
  39. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/pdfalyzer.py +0 -0
  40. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/util/debugging.py +0 -0
  41. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/util/exceptions.py +0 -0
  42. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/util/pdf_parser_manager.py +0 -0
  43. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/yara_rules/PDF_binary_stream.yara +0 -0
  44. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/yara_rules/__init.py__ +0 -0
  45. {pdfalyzer-1.14.10 → pdfalyzer-1.15.0}/pdfalyzer/yara_rules/lprat.static_file_analysis.yara +0 -0
@@ -1,5 +1,9 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ # 1.15.0
4
+ * Add `combine_pdfs` command line script to merge a bunch of PDFs into one
5
+ * Remove unused `Deprecated` dependency
6
+
3
7
  ### 1.14.10
4
8
  * Add `malware_MaldocinPDF` YARA rule
5
9
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pdfalyzer
3
- Version: 1.14.10
3
+ Version: 1.15.0
4
4
  Summary: A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
5
5
  Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
6
6
  License: GPL-3.0-or-later
@@ -16,7 +16,6 @@ Classifier: Programming Language :: Python :: 3.11
16
16
  Classifier: Topic :: Artistic Software
17
17
  Classifier: Topic :: Scientific/Engineering :: Visualization
18
18
  Classifier: Topic :: Security
19
- Requires-Dist: Deprecated (>=1.2.13,<2.0.0)
20
19
  Requires-Dist: PyPDF2 (>=2.10,<3.0)
21
20
  Requires-Dist: anytree (>=2.8,<3.0)
22
21
  Requires-Dist: chardet (>=5.0.0,<6.0.0)
@@ -63,25 +62,32 @@ If you're looking for one of these things this may be the tool for you.
63
62
  ### What It Don't Do
64
63
  This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.
65
64
 
65
+ -------------
66
66
 
67
67
  # Installation
68
68
 
69
- Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `pip3` should also work.
69
+ Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `pip3` / `pip` should also work.
70
70
  ```sh
71
71
  pipx install pdfalyzer
72
72
  ```
73
73
 
74
74
  See [PyPDF2 installation notes](https://github.com/py-pdf/PyPDF2#installation) about `PyCryptodome` if you plan to `pdfalyze` any files that use AES encryption.
75
75
 
76
- ### Troubleshooting The Installation
76
+ If you are on macOS someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so `brew install pdfalyzer` should work.
77
+
78
+ ### Troubleshooting
77
79
  1. If you used `pip3` instead of `pipx` and have an issue you should try to install with `pipx`.
78
80
  1. If you run into an issue about missing YARA try to install [yara-python](https://pypi.org/project/yara-python/).
79
81
  1. If you encounter an error building the python `cryptography` package check your `pip` version (`pip --version`). If it's less than 22.0, upgrade `pip` with `pip install --upgrade pip`.
82
+ 1. If you get a YARA internal error number you can look up what it actually means [here](https://github.com/VirusTotal/yara/blob/master/libyara/include/yara/error.h).
83
+ 1. If you can't get the `pdfalyze` command to work try `python -m pdfalyzer`. It's an equivalent but more portable version of the same command that does not rely on your python script paths being set up in a sane way.
84
+ 1. While The Pdfalyzer has been tested on quite a few large and very complicated PDFs there are no doubt a bunch of edge cases that will trip up the code. Sifting through the various interconnected internal PDF objects and building the correct tree representation is much, much harder than it should be and requires multiple scans and a little bit of educated guessing. If a PDF fails to parse and you hit an error please open [a GitHub issue](https://github.com/michelcrypt4d4mus/pdfalyzer/issues) with the compressed (`.zip`, `.gz`, whatever) PDF that is causing the problem attached (if possible) and I'll take a look when I can. I will _not_ take a look at any uncompressed PDFs due to the security risks so make sure you zip it before you ship it.
80
85
  1. On Linux if you encounter an error building `wheel` or `cffi` you may need to install some packages:
81
86
  ```bash
82
87
  sudo apt-get install build-essential libssl-dev libffi-dev rustc
83
88
  ```
84
- 1. If you get a YARA internal error number you can look up what it actually means [here](https://github.com/VirusTotal/yara/blob/master/libyara/include/yara/error.h).
89
+
90
+ -------------
85
91
 
86
92
  # Usage
87
93
 
@@ -92,8 +98,8 @@ Run `pdfalyze --help` to see usage instructions. As of right now these are the o
92
98
  ## Runtime Options
93
99
  If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--help` then all of the analyses will be done _except_ the `--streams`. In other words, these two commands are equivalent:
94
100
 
95
- 1. `pdfalyzer lacan_buys_the_dip.pdf`
96
- 1. `pdfalyzer lacan_buys_the_dip.pdf -d -t -r -f -y -c`
101
+ 1. `pdfalyze lacan_buys_the_dip.pdf`
102
+ 1. `pdfalyze lacan_buys_the_dip.pdf -d -t -r -f -y -c`
97
103
 
98
104
  The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
99
105
 
@@ -106,15 +112,11 @@ Even if you don't configure your own `.pdfalyzer` file you may still glean some
106
112
  ### Colors And Themes
107
113
  Run `pdfalyzer_show_color_theme` to see the color theme employed.
108
114
 
109
-
110
- ## Guarantees
115
+ ### Guarantees
111
116
  Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
112
117
 
113
-
114
- ## Troubleshooting
115
- 1. If you can't get the `pdfalyze` command to work try `python -m pdfalyzer`. It's an equivalent but more portable version of the same command that does not rely on your python script paths being set up in a sane way.
116
- 1. While The Pdfalyzer has been tested on quite a few large and very complicated PDFs there are no doubt a bunch of edge cases that will trip up the code. If that does happen and you hit an error, please open [a GitHub issue](https://github.com/michelcrypt4d4mus/pdfalyzer/issues) with the compressed (`.zip`, `.gz`, whatever) PDF that is causing the problem attached (if possible) and I'll take a look when I can. I will _not_ take a look at any uncompressed PDFs due to the security risks so make sure you zip it before you ship it.
117
-
118
+ ## Example Usage
119
+ [BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
118
120
 
119
121
  -------------
120
122
 
@@ -135,6 +137,7 @@ pdfalyzer = Pdfalyzer("/path/to/the/evil_or_non_evil.pdf")
135
137
  actual_pdf_tree: PdfTreeNode = pdfalyzer.pdf_tree
136
138
 
137
139
  # The PdfalyzerPresenter handles formatting/prettifying output
140
+ from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
138
141
  PdfalyzerPresenter(pdfalyzer).print_everything()
139
142
 
140
143
  # Iterate over all nodes in the PDF tree
@@ -164,6 +167,7 @@ for backtick_quoted_string in font.binary_scanner.extract_backtick_quoted_bytes(
164
167
  do_stuff(backtick_quoted_string)
165
168
  ```
166
169
 
170
+ -------------
167
171
 
168
172
  # Example Output
169
173
  The Pdfalyzer can export visualizations to HTML, ANSI colored text, and SVG images using the file export functionality that comes with [Rich](https://github.com/Textualize/rich). SVGs can be turned into `png` format images with a tool like Inkscape or `cairosvg` (Inkscape works a lot better in our experience). See `pdfalyze --help` for the specifics.
@@ -188,7 +192,7 @@ This image shows a more in-depth view of of the PDF tree for the same document s
188
192
 
189
193
  ## Fonts
190
194
 
191
- #### **Extract character mappings from ancient Adobe font formats:** It's actually `PyPDF2` doing the lifting here but we're happy to take the credit.
195
+ #### **Extract character mappings from ancient Adobe font formats**. It's actually `PyPDF2` doing the lifting here but we're happy to take the credit.
192
196
 
193
197
  ![](https://github.com/michelcrypt4d4mus/pdfalyzer/raw/master/doc/svgs/rendered_images/font_character_mapping.png)
194
198
 
@@ -223,8 +227,11 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
223
227
 
224
228
  ![](https://github.com/michelcrypt4d4mus/pdfalyzer/raw/master/doc/svgs/rendered_images/decoding_and_chardet_table_2.png)
225
229
 
230
+ -------------
226
231
 
227
232
  # PDF Resources
233
+ ## Included PDF Tools
234
+ The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
228
235
 
229
236
  ## 3rd Party PDF Tools
230
237
  ### Installing Didier Stevens's PDF Analysis Tools
@@ -247,7 +254,7 @@ There's [a script](scripts/install_t1utils.sh) to help you install the suite if
247
254
  scripts/install_t1utils.sh
248
255
  ```
249
256
 
250
- ## Documentation
257
+ ## External Documentation
251
258
  ### Official Adobe Documentation
252
259
  * [Official Adobe PDF 1.7 Specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf) - Indispensable map when navigating a PDF forest.
253
260
  * [Adobe Type 1 Font Format Specification](https://adobe-type-tools.github.io/font-tech-notes/pdfs/T1_SPEC.pdf) - Official spec for Adobe's original font description language and file format. Useful if you have suspicions about malicious fonts. Type1 seems to be the attack vector of choice recently which isn't so surprising when you consider that it's a 30 year old technology and the code that renders these fonts probably hasn't been extensively tested in decades because almost no one uses them anymore outside of people who want to use them as attack vectors.
@@ -270,6 +277,8 @@ This tool was built to fill a gap in the PDF assessment landscape following [my
270
277
 
271
278
  Thus I felt the world might be slightly improved if I strung together a couple of more stable/well known/actively maintained open source projects ([AnyTree](https://github.com/c0fec0de/anytree), [PyPDF2](https://github.com/py-pdf/PyPDF2), [Rich](https://github.com/Textualize/rich), and [YARA](https://github.com/VirusTotal/yara-python) via [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer)) into this tool.
272
279
 
280
+ -------------
281
+
273
282
  # Contributing
274
283
  One easy way of contributing is to run [the script to test against all the PDFs in your `~/Documents` folder](scripts/test_against_all_pdfs_in_Documents_folder.sh) and report any issues.
275
284
 
@@ -290,7 +299,12 @@ These are the naming conventions at play in The Pdfalyzer code base:
290
299
  | **`indeterminate_node`** | any node whose place in the tree cannot be decided until every node has been seen |
291
300
  | **`link_node`** | nodes like `/Dest` that just contain a pointer to another node |
292
301
 
302
+ ### Reference
303
+ * [`PyPDF2 2.12.0` documentation](https://pypdf2.readthedocs.io/en/2.12.0/) (latest is 4.x or something so these are the relevant docs for `pdfalyze`)
304
+
305
+
293
306
  # TODO
307
+ * Upgrade `PyPDF` to latest and expand `combine_pdfs` compression command line option
294
308
  * Highlight decodes with a lot of Javascript keywords
295
309
  * https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
296
310
  * https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
@@ -33,25 +33,32 @@ If you're looking for one of these things this may be the tool for you.
33
33
  ### What It Don't Do
34
34
  This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.
35
35
 
36
+ -------------
36
37
 
37
38
  # Installation
38
39
 
39
- Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `pip3` should also work.
40
+ Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `pip3` / `pip` should also work.
40
41
  ```sh
41
42
  pipx install pdfalyzer
42
43
  ```
43
44
 
44
45
  See [PyPDF2 installation notes](https://github.com/py-pdf/PyPDF2#installation) about `PyCryptodome` if you plan to `pdfalyze` any files that use AES encryption.
45
46
 
46
- ### Troubleshooting The Installation
47
+ If you are on macOS someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so `brew install pdfalyzer` should work.
48
+
49
+ ### Troubleshooting
47
50
  1. If you used `pip3` instead of `pipx` and have an issue you should try to install with `pipx`.
48
51
  1. If you run into an issue about missing YARA try to install [yara-python](https://pypi.org/project/yara-python/).
49
52
  1. If you encounter an error building the python `cryptography` package check your `pip` version (`pip --version`). If it's less than 22.0, upgrade `pip` with `pip install --upgrade pip`.
53
+ 1. If you get a YARA internal error number you can look up what it actually means [here](https://github.com/VirusTotal/yara/blob/master/libyara/include/yara/error.h).
54
+ 1. If you can't get the `pdfalyze` command to work try `python -m pdfalyzer`. It's an equivalent but more portable version of the same command that does not rely on your python script paths being set up in a sane way.
55
+ 1. While The Pdfalyzer has been tested on quite a few large and very complicated PDFs there are no doubt a bunch of edge cases that will trip up the code. Sifting through the various interconnected internal PDF objects and building the correct tree representation is much, much harder than it should be and requires multiple scans and a little bit of educated guessing. If a PDF fails to parse and you hit an error please open [a GitHub issue](https://github.com/michelcrypt4d4mus/pdfalyzer/issues) with the compressed (`.zip`, `.gz`, whatever) PDF that is causing the problem attached (if possible) and I'll take a look when I can. I will _not_ take a look at any uncompressed PDFs due to the security risks so make sure you zip it before you ship it.
50
56
  1. On Linux if you encounter an error building `wheel` or `cffi` you may need to install some packages:
51
57
  ```bash
52
58
  sudo apt-get install build-essential libssl-dev libffi-dev rustc
53
59
  ```
54
- 1. If you get a YARA internal error number you can look up what it actually means [here](https://github.com/VirusTotal/yara/blob/master/libyara/include/yara/error.h).
60
+
61
+ -------------
55
62
 
56
63
  # Usage
57
64
 
@@ -62,8 +69,8 @@ Run `pdfalyze --help` to see usage instructions. As of right now these are the o
62
69
  ## Runtime Options
63
70
  If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--help` then all of the analyses will be done _except_ the `--streams`. In other words, these two commands are equivalent:
64
71
 
65
- 1. `pdfalyzer lacan_buys_the_dip.pdf`
66
- 1. `pdfalyzer lacan_buys_the_dip.pdf -d -t -r -f -y -c`
72
+ 1. `pdfalyze lacan_buys_the_dip.pdf`
73
+ 1. `pdfalyze lacan_buys_the_dip.pdf -d -t -r -f -y -c`
67
74
 
68
75
  The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
69
76
 
@@ -76,15 +83,11 @@ Even if you don't configure your own `.pdfalyzer` file you may still glean some
76
83
  ### Colors And Themes
77
84
  Run `pdfalyzer_show_color_theme` to see the color theme employed.
78
85
 
79
-
80
- ## Guarantees
86
+ ### Guarantees
81
87
  Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
82
88
 
83
-
84
- ## Troubleshooting
85
- 1. If you can't get the `pdfalyze` command to work try `python -m pdfalyzer`. It's an equivalent but more portable version of the same command that does not rely on your python script paths being set up in a sane way.
86
- 1. While The Pdfalyzer has been tested on quite a few large and very complicated PDFs there are no doubt a bunch of edge cases that will trip up the code. If that does happen and you hit an error, please open [a GitHub issue](https://github.com/michelcrypt4d4mus/pdfalyzer/issues) with the compressed (`.zip`, `.gz`, whatever) PDF that is causing the problem attached (if possible) and I'll take a look when I can. I will _not_ take a look at any uncompressed PDFs due to the security risks so make sure you zip it before you ship it.
87
-
89
+ ## Example Usage
90
+ [BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
88
91
 
89
92
  -------------
90
93
 
@@ -105,6 +108,7 @@ pdfalyzer = Pdfalyzer("/path/to/the/evil_or_non_evil.pdf")
105
108
  actual_pdf_tree: PdfTreeNode = pdfalyzer.pdf_tree
106
109
 
107
110
  # The PdfalyzerPresenter handles formatting/prettifying output
111
+ from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
108
112
  PdfalyzerPresenter(pdfalyzer).print_everything()
109
113
 
110
114
  # Iterate over all nodes in the PDF tree
@@ -134,6 +138,7 @@ for backtick_quoted_string in font.binary_scanner.extract_backtick_quoted_bytes(
134
138
  do_stuff(backtick_quoted_string)
135
139
  ```
136
140
 
141
+ -------------
137
142
 
138
143
  # Example Output
139
144
  The Pdfalyzer can export visualizations to HTML, ANSI colored text, and SVG images using the file export functionality that comes with [Rich](https://github.com/Textualize/rich). SVGs can be turned into `png` format images with a tool like Inkscape or `cairosvg` (Inkscape works a lot better in our experience). See `pdfalyze --help` for the specifics.
@@ -158,7 +163,7 @@ This image shows a more in-depth view of of the PDF tree for the same document s
158
163
 
159
164
  ## Fonts
160
165
 
161
- #### **Extract character mappings from ancient Adobe font formats:** It's actually `PyPDF2` doing the lifting here but we're happy to take the credit.
166
+ #### **Extract character mappings from ancient Adobe font formats**. It's actually `PyPDF2` doing the lifting here but we're happy to take the credit.
162
167
 
163
168
  ![](https://github.com/michelcrypt4d4mus/pdfalyzer/raw/master/doc/svgs/rendered_images/font_character_mapping.png)
164
169
 
@@ -193,8 +198,11 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
193
198
 
194
199
  ![](https://github.com/michelcrypt4d4mus/pdfalyzer/raw/master/doc/svgs/rendered_images/decoding_and_chardet_table_2.png)
195
200
 
201
+ -------------
196
202
 
197
203
  # PDF Resources
204
+ ## Included PDF Tools
205
+ The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
198
206
 
199
207
  ## 3rd Party PDF Tools
200
208
  ### Installing Didier Stevens's PDF Analysis Tools
@@ -217,7 +225,7 @@ There's [a script](scripts/install_t1utils.sh) to help you install the suite if
217
225
  scripts/install_t1utils.sh
218
226
  ```
219
227
 
220
- ## Documentation
228
+ ## External Documentation
221
229
  ### Official Adobe Documentation
222
230
  * [Official Adobe PDF 1.7 Specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf) - Indispensable map when navigating a PDF forest.
223
231
  * [Adobe Type 1 Font Format Specification](https://adobe-type-tools.github.io/font-tech-notes/pdfs/T1_SPEC.pdf) - Official spec for Adobe's original font description language and file format. Useful if you have suspicions about malicious fonts. Type1 seems to be the attack vector of choice recently which isn't so surprising when you consider that it's a 30 year old technology and the code that renders these fonts probably hasn't been extensively tested in decades because almost no one uses them anymore outside of people who want to use them as attack vectors.
@@ -240,6 +248,8 @@ This tool was built to fill a gap in the PDF assessment landscape following [my
240
248
 
241
249
  Thus I felt the world might be slightly improved if I strung together a couple of more stable/well known/actively maintained open source projects ([AnyTree](https://github.com/c0fec0de/anytree), [PyPDF2](https://github.com/py-pdf/PyPDF2), [Rich](https://github.com/Textualize/rich), and [YARA](https://github.com/VirusTotal/yara-python) via [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer)) into this tool.
242
250
 
251
+ -------------
252
+
243
253
  # Contributing
244
254
  One easy way of contributing is to run [the script to test against all the PDFs in your `~/Documents` folder](scripts/test_against_all_pdfs_in_Documents_folder.sh) and report any issues.
245
255
 
@@ -260,7 +270,12 @@ These are the naming conventions at play in The Pdfalyzer code base:
260
270
  | **`indeterminate_node`** | any node whose place in the tree cannot be decided until every node has been seen |
261
271
  | **`link_node`** | nodes like `/Dest` that just contain a pointer to another node |
262
272
 
273
+ ### Reference
274
+ * [`PyPDF2 2.12.0` documentation](https://pypdf2.readthedocs.io/en/2.12.0/) (latest is 4.x or something so these are the relevant docs for `pdfalyze`)
275
+
276
+
263
277
  # TODO
278
+ * Upgrade `PyPDF` to latest and expand `combine_pdfs` compression command line option
264
279
  * Highlight decodes with a lot of Javascript keywords
265
280
  * https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
266
281
  * https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
@@ -1,10 +1,14 @@
1
1
  import code
2
- import logging
3
2
  import sys
4
3
  from os import environ, getcwd, path
4
+ from pathlib import Path
5
5
 
6
6
  from dotenv import load_dotenv
7
+ # TODO: PdfMerger is deprecated in favor of PdfWriter at v3.9.1 (see https://pypdf.readthedocs.io/en/latest/user/merging-pdfs.html#basic-example)
8
+ from PyPDF2 import PdfMerger
9
+ from PyPDF2.errors import PdfReadError
7
10
 
11
+ # Should be first local import before load_dotenv() (or at least I think it needs to come first)
8
12
  from pdfalyzer.config import PdfalyzerConfig
9
13
 
10
14
  # load_dotenv() should be called as soon as possible (before parsing local classes) but not for pytest
@@ -16,16 +20,19 @@ if not environ.get('INVOKED_BY_PYTEST', False):
16
20
 
17
21
  from rich.columns import Columns
18
22
  from rich.panel import Panel
23
+ from rich.text import Text
19
24
  from yaralyzer.helpers.rich_text_helper import prefix_with_plain_text_obj
20
25
  from yaralyzer.output.file_export import invoke_rich_export
21
26
  from yaralyzer.output.rich_console import console
22
27
  from yaralyzer.util.logging import log, log_and_print
23
28
 
29
+ from pdfalyzer.helpers.filesystem_helper import file_size_in_mb, set_max_open_files
30
+ from pdfalyzer.helpers.rich_text_helper import print_highlighted
24
31
  from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
25
32
  from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
26
33
  from pdfalyzer.pdfalyzer import Pdfalyzer
34
+ from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments, parse_combine_pdfs_args
27
35
  from pdfalyzer.util.pdf_parser_manager import PdfParserManager
28
- from pdfalyzer.util.argument_parser import output_sections, parse_arguments
29
36
 
30
37
  # For the table shown by running pdfalyzer_show_color_theme
31
38
  MAX_THEME_COL_SIZE = 35
@@ -82,3 +89,36 @@ def pdfalyzer_show_color_theme() -> None:
82
89
  ]
83
90
 
84
91
  console.print(Columns(colors, column_first=True, padding=(0,3)))
92
+
93
+
94
+ def combine_pdfs():
95
+ """Utility method to combine multiple PDFs into one. Invocable with 'combine_pdfs PDF1 [PDF2...]'."""
96
+ args = parse_combine_pdfs_args()
97
+ set_max_open_files(args.number_of_pdfs)
98
+ merger = PdfMerger()
99
+
100
+ for pdf in args.pdfs:
101
+ try:
102
+ print_highlighted(f" -> Merging '{pdf}'...", style='dim')
103
+ merger.append(pdf)
104
+ except PdfReadError as e:
105
+ print_highlighted(f" -> Failed to merge '{pdf}'! {e}", style='red')
106
+ ask_to_proceed()
107
+
108
+ if args.compression_level == 0:
109
+ print_highlighted("\nSkipping content stream compression...")
110
+ else:
111
+ print_highlighted(f"\nCompressing content streams with zlib level {args.compression_level}...")
112
+
113
+ for i, page in enumerate(merger.pages):
114
+ # TODO: enable image quality reduction + zlib level once PyPDF is upgraded to 4.x and option is available
115
+ # See https://pypdf.readthedocs.io/en/latest/user/file-size.html#reducing-image-quality
116
+ print_highlighted(f" -> Compressing page {i + 1}...", style='dim')
117
+ page.pagedata.compress_content_streams() # This is CPU intensive!
118
+
119
+ print_highlighted(f"\nWriting '{args.output_file}'...", style='cyan')
120
+ merger.write(args.output_file)
121
+ merger.close()
122
+ txt = Text('').append(f" -> Wrote ")
123
+ txt.append(str(file_size_in_mb(args.output_file)), style='cyan').append(" megabytes\n")
124
+ print_highlighted(txt)
@@ -20,9 +20,8 @@ from yaralyzer.util.logging import log
20
20
 
21
21
  from pdfalyzer.config import PdfalyzerConfig
22
22
  from pdfalyzer.decorators.pdf_tree_node import PdfTreeNode
23
- from pdfalyzer.detection.constants.binary_regexes import (BACKTICK,
24
- DANGEROUS_PDF_KEYS_TO_HUNT_ONLY_IN_FONTS, DANGEROUS_STRINGS, FRONTSLASH, GUILLEMET,
25
- QUOTE_PATTERNS)
23
+ from pdfalyzer.detection.constants.binary_regexes import (BACKTICK, DANGEROUS_PDF_KEYS_TO_HUNT_ONLY_IN_FONTS,
24
+ DANGEROUS_PDF_KEYS_TO_HUNT_ONLY_IN_FONTS, DANGEROUS_STRINGS, FRONTSLASH, GUILLEMET, QUOTE_PATTERNS)
26
25
  from pdfalyzer.helpers.string_helper import generate_hyphen_line
27
26
  from pdfalyzer.output.layout import print_headline_panel, print_section_sub_subheader
28
27
  from pdfalyzer.util.adobe_strings import CONTENTS, CURRENTFILE_EEXEC, FONT_FILE_KEYS
@@ -1,5 +1,5 @@
1
1
  """
2
- Deprecated old, pre-tree, more rawformat reader.
2
+ Deprecated old, pre-tree, more rawformat reader. Only used for debugging these days.
3
3
  """
4
4
  from io import StringIO
5
5
 
@@ -22,6 +22,7 @@ class IndeterminateNode:
22
22
  self.node = node
23
23
 
24
24
  def place_node(self) -> None:
25
+ """Attempt to find the appropriate parent/child relationships for this node."""
25
26
  log.debug(f"Attempting to resolve indeterminate node: {self.node}")
26
27
 
27
28
  if self._check_for_common_ancestor():
@@ -34,7 +35,7 @@ class IndeterminateNode:
34
35
  parent = self.find_node_with_most_descendants()
35
36
  parent_str = escape(str(parent))
36
37
 
37
- # Any branch that doesn't return or raise will end with parent being node w/most descendants
38
+ # Any if/else branch that doesn't return or raise will decide parent to be the node w/most descendants
38
39
  if self._has_only_similar_relationships():
39
40
  log.info(f" Fuzzy match addresses or labels; placing under node w/most descendants: {parent_str}")
40
41
  elif self._make_parent_if_one_remains(lambda r: r.from_node.type in PAGE_AND_PAGES):
@@ -43,7 +44,8 @@ class IndeterminateNode:
43
44
  elif self.node.type == COLOR_SPACE:
44
45
  log.info(f" Color space node found; placing under node w/most descendants: {parent_str}")
45
46
  elif set(self.node.unique_labels_of_referring_nodes()) == set(PAGE_AND_PAGES):
46
- # An edge case seen in the wild involving a PDF that doesn't conform to the PDF spec
47
+ # Handle an edge case seen in the wild involving a PDF that doesn't conform to the PDF spec
48
+ # in a particular way.
47
49
  log.warning(f" {self.node} seems to be a loose {PAGE}. Linking to first {PAGES}")
48
50
  pages_nodes = [n for n in self.node.nodes_with_here_references() if self.node.type == PAGES]
49
51
  self.node.set_parent(self.find_node_with_most_descendants(pages_nodes))
@@ -63,7 +65,7 @@ class IndeterminateNode:
63
65
  def _has_only_similar_relationships(self) -> bool:
64
66
  """
65
67
  Returns True if all the nodes w/references to this one have the same type or if all the
66
- reference_keys that point to this node are the same
68
+ reference_keys that point to this node are the same.
67
69
  """
68
70
  unique_refferer_labels = self.node.unique_labels_of_referring_nodes()
69
71
  unique_addresses = self.node.unique_addresses()
@@ -99,7 +101,7 @@ class IndeterminateNode:
99
101
  log.info(f"{possible_ancestor} is the common ancestor of {other_nodes_str}")
100
102
  return possible_ancestor
101
103
 
102
- def _check_single_relation_rules(self):
104
+ def _check_single_relation_rules(self) -> bool:
103
105
  """Check various ways of narrowing down the list of potential parents to one node."""
104
106
  if self._make_parent_if_one_remains(lambda r: r.reference_key in [K, KIDS]):
105
107
  log.info(" Found single explicit /K or /Kids ref")
@@ -111,7 +113,7 @@ class IndeterminateNode:
111
113
  return True
112
114
 
113
115
  def _make_parent_if_one_remains(self, is_possible_parent: Callable) -> bool:
114
- """Relationships are filtered w/filter_parents(). If only one remains it's made the parent"""
116
+ """Relationships are filtered w/is_possible_parent(); if there's only one possibility it's made the parent."""
115
117
  remaining_relationships = [r for r in self.node.non_tree_relationships if is_possible_parent(r)]
116
118
 
117
119
  if len(remaining_relationships) == 1:
@@ -123,6 +125,6 @@ class IndeterminateNode:
123
125
 
124
126
 
125
127
  def find_node_with_lowest_id(list_of_nodes: List[PdfTreeNode]) -> PdfTreeNode:
126
- """Find node in list_of_nodes_with_lowest ID"""
128
+ """Find node in list_of_nodes_with_lowest ID."""
127
129
  lowest_idnum = min([n.idnum for n in list_of_nodes])
128
130
  return next(n for n in list_of_nodes if n.idnum == lowest_idnum)
@@ -104,10 +104,11 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
104
104
  self.non_tree_relationships.remove(relationship)
105
105
 
106
106
  def nodes_with_here_references(self) -> List['PdfTreeNode']:
107
- """Return a list of nodes that contain this nodes PDF object as an IndirectObject reference."""
107
+ """Return a list of nodes that contain this node's PDF object as an IndirectObject reference."""
108
108
  return [r.from_node for r in self.non_tree_relationships if r.from_node]
109
109
 
110
110
  def non_tree_relationship_count(self) -> int:
111
+ """Number of non parent/child relationships containing this node."""
111
112
  return len(self.non_tree_relationships)
112
113
 
113
114
  def unique_addresses(self) -> List[str]:
@@ -128,7 +129,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
128
129
  return isinstance(self.obj, StreamObject)
129
130
 
130
131
  def tree_address(self, max_length: Optional[int] = DEFAULT_MAX_ADDRESS_LENGTH) -> str:
131
- """Creates a string like '/Catalog/Pages/Resources[2]/Font' truncated to max_length (if given)"""
132
+ """Creates a string like '/Catalog/Pages/Resources[2]/Font' truncated to max_length (if given)."""
132
133
  if self.label == TRAILER:
133
134
  return '/'
134
135
  elif self.parent is None:
@@ -163,7 +164,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
163
164
  else:
164
165
  address = refs_to_this_node[0].address
165
166
  # If other node's label doesn't start with a NON_STANDARD_ADDRESS string
166
- # and any of the relationships pointing at this nod use something other than a
167
+ # and any of the relationships pointing at this node use something other than a
167
168
  # NON_STANDARD_ADDRESS_NODES string to refer here, print a warning about multiple refs.
168
169
  if not (is_prefixed_by_any(from_node.label, NON_STANDARD_ADDRESS_NODES) or \
169
170
  all(ref.address in NON_STANDARD_ADDRESS_NODES for ref in refs_to_this_node)):
@@ -193,6 +194,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
193
194
  return len(self.children) + sum([child.descendants_count() for child in self.children])
194
195
 
195
196
  def unique_labels_of_referring_nodes(self) -> List[str]:
197
+ """Unique label strings of nodes referring here outside the parent/child hierarchy."""
196
198
  return list(set([r.from_node.label for r in self.non_tree_relationships]))
197
199
 
198
200
  def print_non_tree_relationships(self) -> None:
@@ -1,13 +1,7 @@
1
1
  """
2
2
  Configuration of what to scan for in binary data. Regexes here will be matched against binary streams
3
- and then force decoded
3
+ and then force decoded.
4
4
  """
5
-
6
- import re
7
- from typing import Union
8
-
9
- from deprecated import deprecated
10
-
11
5
  from pdfalyzer.util.adobe_strings import DANGEROUS_PDF_KEYS
12
6
 
13
7
  DANGEROUS_JAVASCRIPT_INSTRUCTIONS = ['eval']
@@ -0,0 +1,102 @@
1
+ """
2
+ Some helpers for stuff with the local filesystem.
3
+ """
4
+ import re
5
+ from pathlib import Path
6
+ from typing import Union
7
+
8
+ from yaralyzer.output.rich_console import console
9
+
10
+ from pdfalyzer.helpers.rich_text_helper import print_highlighted
11
+
12
# Matches a string ending in '_<digits>.<3 or 4 character extension>'; group 1 captures the digits.
NUMBERED_PAGE_REGEX = re.compile(r'.*_(\d+)\.\w{3,4}$')
DEFAULT_MAX_OPEN_FILES = 256  # macOS default
OPEN_FILES_BUFFER = 30  # we might have some files open already so we need to go beyond DEFAULT_MAX_OPEN_FILES
PDF_EXT = '.pdf'  # Suffix (including the dot) used to recognize and to name PDF files
16
+
17
+ # TODO: this kind of type alias is not supported until Python 3.12
18
+ #type StrOrPath = Union[str, Path]
19
+
20
+
21
def with_pdf_extension(file_path: Union[str, Path]) -> str:
    """Return 'file_path' as a string, adding the '.pdf' suffix unless it is already present."""
    path_str = str(file_path)

    if is_pdf(file_path):
        return path_str

    return path_str + PDF_EXT
24
+
25
+
26
def is_pdf(file_path: Union[str, Path]) -> bool:
    """Report whether the string form of 'file_path' ends with the '.pdf' suffix (case-sensitive)."""
    path_str = str(file_path)
    return path_str.endswith(PDF_EXT)
29
+
30
+
31
def file_exists(file_path: Union[str, Path]) -> bool:
    """Report whether anything exists on the filesystem at 'file_path'."""
    candidate = Path(file_path)
    return candidate.exists()
34
+
35
+
36
def do_all_files_exist(file_paths: list[Union[str, Path]]) -> bool:
    """Print an error for each element of 'file_paths' that does not exist. Return True if none are missing."""
    missing_paths = [candidate for candidate in file_paths if not file_exists(candidate)]

    # Report every missing path (in the order given) rather than stopping at the first one.
    for missing_path in missing_paths:
        console.print(f"File not found: '{missing_path}'", style='error')

    return not missing_paths
46
+
47
+
48
def extract_page_number(file_path: Union[str, Path]) -> Union[int, None]:
    """
    Extract the page number from the end of a filename if it exists.

    The page number is the run of digits that NUMBERED_PAGE_REGEX captures between an
    underscore and the file extension (e.g. 'xyz_12.pdf' yields 12). Returns None when
    'file_path' does not match that pattern. A page number of 0 is a valid but falsy
    result, so callers should compare the return value against None.
    """
    # 'Union[int, None]' instead of 'int|None': the PEP 604 form is evaluated when the
    # function is defined and raises TypeError on Python versions before 3.10.
    match = NUMBERED_PAGE_REGEX.match(str(file_path))
    return int(match.group(1)) if match else None
52
+
53
+
54
def file_size_in_mb(file_path: Union[str, Path], decimal_places: int = 2) -> float:
    """Return the size of 'file_path' in MB (1024 * 1024 bytes) rounded to 'decimal_places' decimal places."""
    size_in_bytes = Path(file_path).stat().st_size
    size_in_mb = size_in_bytes / 1024.0 / 1024.0
    return round(size_in_mb, decimal_places)
57
+
58
+
59
def set_max_open_files(num_filehandles: int = DEFAULT_MAX_OPEN_FILES) -> tuple[Union[int, None], Union[int, None]]:
    """
    Sets the OS level max open files to at least 'num_filehandles'. Current value can be seen with 'ulimit -a'.
    Required when you might be opening more than DEFAULT_MAX_OPEN_FILES file handles simultaneously
    (e.g. when you are merging a lot of small images or PDFs). Equivalent of something like
    'default ulimit -n 1024' on macOS.

    Returns a (soft, hard) tuple:
      * (None, None) if the 'resource' module cannot be imported (nothing is changed).
      * (DEFAULT_MAX_OPEN_FILES, DEFAULT_MAX_OPEN_FILES) if 'num_filehandles' does not exceed
        DEFAULT_MAX_OPEN_FILES. The OS limits are neither read nor changed in this case, so these
        are the assumed defaults rather than measured values.
      * Otherwise the limits requested from (or, if raising them failed, re-read from) the OS.

    NOTE: Does nothing on Windows (I think).
    NOTE: This mostly came from somewhere on stackoverflow but I lost the link.
    """
    # The return annotation uses Union rather than 'int | None' because the PEP 604 form is
    # evaluated when the function is defined and raises TypeError before Python 3.10.
    try:
        import resource  # Windows doesn't have this package / doesn't need to bump up the ulimit (??)
    except ImportError:
        resource = None

    if resource is None:
        print_highlighted("No resource module; cannot set max open files on this platform...", style='yellow')
        return (None, None)
    elif num_filehandles <= DEFAULT_MAX_OPEN_FILES:
        # Then the OS max open files value is already sufficient.
        return (DEFAULT_MAX_OPEN_FILES, DEFAULT_MAX_OPEN_FILES)

    # %% (0) what is current ulimit -n setting?
    (soft, hard) = resource.getrlimit(resource.RLIMIT_NOFILE)
    # Leave headroom for file handles this process may already have open.
    num_filehandles = num_filehandles + OPEN_FILES_BUFFER

    # %% (1) increase limit (soft and even hard) if needed
    if soft < num_filehandles:
        soft = num_filehandles
        hard = max(soft, hard)
        print_highlighted(f"Increasing max open files soft & hard 'ulimit -n {soft} {hard}'...")

        try:
            resource.setrlimit(resource.RLIMIT_NOFILE, (soft, hard))
        except (ValueError, OSError):  # 'resource.error' is just a deprecated alias of OSError
            try:
                # Second attempt: ask for a hard limit no higher than the soft limit.
                hard = soft
                print_highlighted(f"Retrying setting max open files (soft, hard)=({soft}, {hard})", style='yellow')
                resource.setrlimit(resource.RLIMIT_NOFILE, (soft, hard))
            except Exception:
                # Best effort only: report the failure and return whatever limits are really in force.
                print_highlighted('Failed to set max open files / ulimit, giving up!', style='error')
                soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)

    return (soft, hard)
@@ -1,14 +1,26 @@
1
1
  """
2
2
  Functions for miscellaneous Rich text/string operations.
3
3
  """
4
+ from functools import partial
4
5
  from typing import List
5
6
 
6
7
  from PyPDF2.generic import PdfObject
8
+ from rich.console import Console
9
+ from rich.highlighter import RegexHighlighter, JSONHighlighter
7
10
  from rich.text import Text
11
+ from yaralyzer.output.rich_console import console
8
12
 
9
13
  from pdfalyzer.helpers.pdf_object_helper import pypdf_class_name
10
14
  from pdfalyzer.output.styles.node_colors import get_label_style, get_class_style_italic
11
15
 
16
# Usually we use the yaralyzer console but that has no highlighter
pdfalyzer_console = Console(color_system='256')


def print_highlighted(msg: 'str | Text', **kwargs) -> None:
    """
    Print 'msg' on pdfalyzer_console with Rich highlighting turned on.

    Any extra keyword arguments (e.g. style='cyan') are passed straight through to the
    console's print() method.
    """
    # The annotation is quoted because an unquoted 'str | Text' is evaluated when the
    # function is defined and raises TypeError on Python versions before 3.10.
    pdfalyzer_console.print(msg, highlight=True, **kwargs)
23
+
12
24
 
13
25
  def quoted_text(
14
26
  _string: str,
@@ -79,7 +79,8 @@ XREF_STREAM = '/XRefStm'
79
79
  FONT_LENGTHS = [f'/Length{i + 1}' for i in range(3)]
80
80
  FONT_FILE_KEYS = [FONT_FILE, FONT_FILE2, FONT_FILE3]
81
81
 
82
- # Instructions to flag when scanning stream data for malicious content.
82
+ # Instructions to flag when scanning stream data for malicious content. The leading
83
+ # front slash will be removed when pattern matching.
83
84
  DANGEROUS_PDF_KEYS = [
84
85
  # AA, # AA is too generic; can't afford to remove the frontslash
85
86
  ACRO_FORM,
@@ -1,5 +1,5 @@
1
1
  import sys
2
- from argparse import ArgumentError, ArgumentParser
2
+ from argparse import ArgumentError, ArgumentParser, Namespace
3
3
  from collections import namedtuple
4
4
  from functools import partial, update_wrapper
5
5
  from importlib.metadata import version
@@ -7,11 +7,16 @@ from os import getcwd, path
7
7
  from typing import List
8
8
 
9
9
  from rich_argparse_plus import RichHelpFormatterPlus
10
+ from rich.prompt import Confirm
11
+ from rich.text import Text
10
12
  from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args
11
13
  from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation
12
14
 
13
15
  from pdfalyzer.config import ALL_STREAMS, PdfalyzerConfig
14
16
  from pdfalyzer.detection.constants.binary_regexes import QUOTE_PATTERNS
17
+ from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
18
+ with_pdf_extension)
19
+ from pdfalyzer.helpers.rich_text_helper import print_highlighted
15
20
 
16
21
  # NamedTuple to keep our argument selection orderly
17
22
  OutputSection = namedtuple('OutputSection', ['argument', 'method'])
@@ -25,7 +30,7 @@ DESCRIPTION = "Explore PDF's inner data structure with absurdly large and in dep
25
30
 
26
31
  EPILOG = "Values for various config options can be set permanently by a .pdfalyzer file in your home directory; " + \
27
32
  "see the documentation for details. " + \
28
- f"A registry of previous pdfalyzer invocations will be incribed to a file if the " + \
33
+ f"A registry of previous pdfalyzer invocations will be inscribed to a file if the " + \
29
34
  "{YaralyzerConfig.LOG_DIR_ENV_VAR} environment variable is configured."
30
35
 
31
36
  # Analysis selection sections
@@ -107,7 +112,9 @@ select.add_argument('--preview-stream-length',
107
112
  parser._action_groups = parser._action_groups[:2] + [parser._action_groups[-1]] + parser._action_groups[2:-1]
108
113
 
109
114
 
110
- # The Parsening Begins
115
+ ################################
116
+ # Main argument parsing begins #
117
+ ################################
111
118
  def parse_arguments():
112
119
  """Parse command line args. Most settings are communicated to the app by setting env vars"""
113
120
  if '--version' in sys.argv:
@@ -175,3 +182,71 @@ def output_sections(args, pdfalyzer) -> List[OutputSection]:
175
182
  def all_sections_chosen(args):
176
183
  """Returns true if all flags are set or no flags are set."""
177
184
  return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)
185
+
186
+
187
+ ###############################################
188
+ # Separate arg parser for combine_pdfs script #
189
+ ###############################################
190
# Parser for the combine_pdfs script; independent of the main pdfalyze argument parser.
combine_pdfs_parser = ArgumentParser(
    description="Combine multiple PDFs into one.",
    epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc.) the files are sorted as if those "
           "were page numbers prior to merging.",
    formatter_class=RichHelpFormatterPlus)

# Positional: the input files. argparse itself only requires one; the "two or more" rule
# is enforced after parsing.
combine_pdfs_parser.add_argument('pdfs',
                                 help='two or more PDFs to combine',
                                 metavar='PDF_PATH',
                                 nargs='+')

# Only levels 0 and 1 are accepted for now (see the help text).
combine_pdfs_parser.add_argument('-c', '--compression-level',
                                 help='zlib image compression level (0=none, max=1 until PyPDF is upgraded)',
                                 choices=range(0, 2),
                                 default=1,
                                 type=int)

combine_pdfs_parser.add_argument('-o', '--output-file',
                                 help='path to write the combined PDFs to',
                                 required=True)
210
+
211
+
212
def parse_combine_pdfs_args() -> Namespace:
    """
    Parse and validate command line args for combine_pdfs script.

    On top of what argparse provides, the returned Namespace has:
      * 'output_file' with a '.pdf' extension appended if it was missing
      * 'number_of_pdfs' set to the number of input PDFs
      * 'pdfs' sorted by page number if every input ends in '.pdf' and has a numeric
        page suffix (e.g. 'xyz_1.pdf'); otherwise the order given on the command line is kept

    Exits with status code 1 if fewer than 2 PDFs were given, if any input file is missing,
    or if the user declines to overwrite an existing output file or to proceed after a warning.
    """
    args = combine_pdfs_parser.parse_args()
    args.output_file = with_pdf_extension(args.output_file)
    confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
    args.number_of_pdfs = len(args.pdfs)

    if args.number_of_pdfs < 2:
        exit_with_error("Need at least 2 PDFs to merge.")
    elif not do_all_files_exist(args.pdfs):
        exit_with_error()
    elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
        exit_with_error()

    if all(is_pdf(pdf) for pdf in args.pdfs):
        # Compare against None instead of relying on truthiness: page number 0 (e.g. 'xyz_0.pdf')
        # is falsy but is still a valid page number.
        if all(extract_page_number(pdf) is not None for pdf in args.pdfs):
            print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
            args.pdfs.sort(key=extract_page_number)
        else:
            print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
    else:
        print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
        ask_to_proceed()

    print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
    return args
238
+
239
+
240
def ask_to_proceed() -> None:
    """Ask the user whether to continue; hand off to exit_with_error() if they decline."""
    user_confirmed = Confirm.ask(Text("Proceed anyway?"))

    if user_confirmed:
        return

    exit_with_error()
244
+
245
+
246
def exit_with_error(error_message: 'str | None' = None) -> None:
    """
    Print 'error_message' (if one is given) and an 'Exiting...' notice, then exit with status code 1.

    This function does not return normally: sys.exit(1) raises SystemExit.
    """
    # The annotation is quoted because an unquoted 'str | None' is evaluated when the
    # function is defined and raises TypeError on Python versions before 3.10.
    if error_message:
        print_highlighted(error_message, style='bold red')

    print_highlighted('Exiting...', style='red')
    sys.exit(1)
@@ -1026,7 +1026,7 @@ rule malware_MaldocinPDF {
1026
1026
  author = "Yuma Masubuchi and Kota Kino"
1027
1027
  description = "Search for embeddings of malicious Word files into a PDF file."
1028
1028
  created_date = "2023-08-15"
1029
- blog_reference = "https://malware.news/t/maldoc-in-pdf-detection-bypass-by-embedding-a-malicious-word-file-into-a-pdf-file/72815"
1029
+ blog_reference = "https://blogs.jpcert.or.jp/en/2023/08/maldocinpdf.html"
1030
1030
  labs_reference = "N/A"
1031
1031
  labs_pivot = "N/A"
1032
1032
  samples = "ef59d7038cfd565fd65bae12588810d5361df938244ebad33b71882dcf683058"
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "pdfalyzer"
3
- version = "1.14.10"
3
+ version = "1.15.0"
4
4
  description = "A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more."
5
5
  authors = ["Michel de Cryptadamus <michel@cryptadamus.com>"]
6
6
  license = "GPL-3.0-or-later"
@@ -42,6 +42,7 @@ packages = [
42
42
 
43
43
 
44
44
  [tool.poetry.scripts]
45
+ combine_pdfs = 'pdfalyzer:combine_pdfs'
45
46
  pdfalyze = 'pdfalyzer:pdfalyze'
46
47
  pdfalyzer_show_color_theme = 'pdfalyzer:pdfalyzer_show_color_theme'
47
48
 
@@ -50,7 +51,6 @@ pdfalyzer_show_color_theme = 'pdfalyzer:pdfalyzer_show_color_theme'
50
51
  python = "^3.9"
51
52
  anytree = "~=2.8"
52
53
  chardet = ">=5.0.0,<6.0.0"
53
- Deprecated = "^1.2.13"
54
54
  PyPDF2 = "^2.10"
55
55
  python-dotenv = "^0.21.0"
56
56
  rich = "^12.5.1"
File without changes