pdfalyzer 1.15.0__tar.gz → 1.16.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pdfalyzer might be problematic. Click here for more details.
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/CHANGELOG.md +7 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/PKG-INFO +8 -9
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/README.md +6 -7
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/__init__.py +20 -15
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/decorators/document_model_printer.py +1 -1
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/decorators/pdf_object_properties.py +2 -2
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/decorators/pdf_tree_node.py +3 -3
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/decorators/pdf_tree_verifier.py +2 -2
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/detection/yaralyzer_helper.py +7 -2
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/font_info.py +2 -2
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/helpers/pdf_object_helper.py +3 -3
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/helpers/rich_text_helper.py +1 -1
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/output/character_mapping.py +4 -4
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/output/pdfalyzer_presenter.py +2 -2
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/output/styles/node_colors.py +1 -1
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/output/tables/pdf_node_rich_table.py +1 -1
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/pdf_object_relationship.py +1 -1
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/pdfalyzer.py +3 -3
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/util/adobe_strings.py +2 -2
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/util/argument_parser.py +24 -13
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pyproject.toml +2 -2
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/LICENSE +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/__main__.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/binary/binary_scanner.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/config.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/decorators/indeterminate_node.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/detection/constants/binary_regexes.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/detection/constants/javascript_reserved_keywords.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/detection/javascript_hunter.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/helpers/dict_helper.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/helpers/filesystem_helper.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/helpers/number_helper.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/helpers/string_helper.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/output/layout.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/output/styles/rich_theme.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/output/tables/decoding_stats_table.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/output/tables/font_summary_table.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/output/tables/stream_objects_table.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/util/debugging.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/util/exceptions.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/util/pdf_parser_manager.py +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/yara_rules/PDF.yara +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/yara_rules/PDF_binary_stream.yara +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/yara_rules/__init.py__ +0 -0
- {pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/yara_rules/lprat.static_file_analysis.yara +0 -0
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
# NEXT RELEASE
|
|
2
2
|
|
|
3
|
+
# 1.16.0
|
|
4
|
+
* Upgrade `PyPDF2` 2.x to `pypdf` 5.0.1 (new name, same package)
|
|
5
|
+
* Add `--image-quality` option to `combine_pdfs` tool
|
|
6
|
+
|
|
7
|
+
### 1.15.1
|
|
8
|
+
* Add `--no-default-yara-rules` command line option so users can use _only_ their own custom YARA rules files if they want. Previously you could only use custom YARA rules _in addition to_ the default rules; now you can just skip the default rules.
|
|
9
|
+
|
|
3
10
|
# 1.15.0
|
|
4
11
|
* Add `combine_pdfs` command line script to merge a bunch of PDFs into one
|
|
5
12
|
* Remove unused `Deprecated` dependency
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pdfalyzer
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.16.0
|
|
4
4
|
Summary: A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
|
|
5
5
|
Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
6
6
|
License: GPL-3.0-or-later
|
|
@@ -16,9 +16,9 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
16
16
|
Classifier: Topic :: Artistic Software
|
|
17
17
|
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
18
18
|
Classifier: Topic :: Security
|
|
19
|
-
Requires-Dist: PyPDF2 (>=2.10,<3.0)
|
|
20
19
|
Requires-Dist: anytree (>=2.8,<3.0)
|
|
21
20
|
Requires-Dist: chardet (>=5.0.0,<6.0.0)
|
|
21
|
+
Requires-Dist: pypdf (>=5.0.1,<6.0.0)
|
|
22
22
|
Requires-Dist: python-dotenv (>=0.21.0,<0.22.0)
|
|
23
23
|
Requires-Dist: rich (>=12.5.1,<13.0.0)
|
|
24
24
|
Requires-Dist: rich-argparse-plus (>=0.3.1,<0.4.0)
|
|
@@ -71,7 +71,7 @@ Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `
|
|
|
71
71
|
pipx install pdfalyzer
|
|
72
72
|
```
|
|
73
73
|
|
|
74
|
-
See [
|
|
74
|
+
See [PyPDF installation notes](https://github.com/py-pdf/pypdf#installation) about `PyCryptodome` if you plan to `pdfalyze` any files that use AES encryption.
|
|
75
75
|
|
|
76
76
|
If you are on macOS someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so `brew install pdfalyzer` should work.
|
|
77
77
|
|
|
@@ -123,7 +123,7 @@ Warnings will be printed if any PDF object ID between 1 and the `/Size` reported
|
|
|
123
123
|
## Use As A Code Library
|
|
124
124
|
For info about setting up a dev environment see [Contributing](#contributing) below.
|
|
125
125
|
|
|
126
|
-
At its core The Pdfalyzer is taking PDF internal objects gathered by [
|
|
126
|
+
At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class. Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
|
|
127
127
|
|
|
128
128
|
As far as The Pdfalyzer's unique functionality goes, [`Pdfalyzer`](pdfalyzer/pdfalyzer.py) is the class at the heart of the operation. It holds the PDF's logical tree as well as a few other data structures. Chief among these are the [`FontInfo`](pdfalyzer/font_info.py) class which pulls together various properties of a font strewn across 3 or 4 different PDF objects and the [`BinaryScanner`](pdfalyzer/binary/binary_scanner.py) class which lets you dig through the embedded streams' bytes looking for suspicious patterns.
|
|
129
129
|
|
|
@@ -192,7 +192,7 @@ This image shows a more in-depth view of the PDF tree for the same document s
|
|
|
192
192
|
|
|
193
193
|
## Fonts
|
|
194
194
|
|
|
195
|
-
#### **Extract character mappings from ancient Adobe font formats**. It's actually `
|
|
195
|
+
#### **Extract character mappings from ancient Adobe font formats**. It's actually `PyPDF` doing the lifting here but we're happy to take the credit.
|
|
196
196
|
|
|
197
197
|

|
|
198
198
|
|
|
@@ -275,7 +275,7 @@ scripts/install_t1utils.sh
|
|
|
275
275
|
## Did The World Really Need Another PDF Tool?
|
|
276
276
|
This tool was built to fill a gap in the PDF assessment landscape following [my own recent experience trying to find malicious content in a PDF file](https://twitter.com/Cryptadamist/status/1570167937381826560). Didier Stevens's [pdfid.py](https://github.com/DidierStevens/DidierStevensSuite/blob/master/pdfid.py) and [pdf-parser.py](https://github.com/DidierStevens/DidierStevensSuite/blob/master/pdf-parser.py) are still the best game in town when it comes to PDF analysis tools but they lack in the visualization department and also don't give you much to work with as far as giving you a data model you can write your own code around. [Peepdf](https://github.com/jesparza/peepdf) seemed promising but turned out to be in a buggy, out of date, and more or less unfixable state. And neither of them offered much in the way of tooling for embedded binary analysis.
|
|
277
277
|
|
|
278
|
-
Thus I felt the world might be slightly improved if I strung together a couple of more stable/well known/actively maintained open source projects ([AnyTree](https://github.com/c0fec0de/anytree), [
|
|
278
|
+
Thus I felt the world might be slightly improved if I strung together a couple of more stable/well known/actively maintained open source projects ([AnyTree](https://github.com/c0fec0de/anytree), [PyPDF](https://github.com/py-pdf/pypdf), [Rich](https://github.com/Textualize/rich), and [YARA](https://github.com/VirusTotal/yara-python) via [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer)) into this tool.
|
|
279
279
|
|
|
280
280
|
-------------
|
|
281
281
|
|
|
@@ -289,7 +289,7 @@ These are the naming conventions at play in The Pdfalyzer code base:
|
|
|
289
289
|
|
|
290
290
|
| Term | Meaning |
|
|
291
291
|
| ----------------- | ---------------- |
|
|
292
|
-
| **`PDF Object`** | Instance of a `
|
|
292
|
+
| **`PDF Object`** | Instance of a `PyPDF` class that represents the information stored in the PDF binary between open and close guillemet quotes (« and ») |
|
|
293
293
|
| **`reference_key`** | String found in a PDF object that names a property (e.g. `/BaseFont` or `/Subtype`) |
|
|
294
294
|
| **`reference`** | Link _from_ a PDF object _to_ another node. Outward facing relationships, basically. |
|
|
295
295
|
| **`address`** | `reference_key` plus a hash key or numerical array index if that's how the reference works. e.g. if node A has a reference key `/Resources` pointing to a dict `{'/Font2': [IndirectObject(55), IndirectObject(2)]}` the address of `IndirectObject(55)` from node A would be `/Resources[/Font2][0]` |
|
|
@@ -300,11 +300,10 @@ These are the naming conventions at play in The Pdfalyzer code base:
|
|
|
300
300
|
| **`link_node`** | nodes like `/Dest` that just contain a pointer to another node |
|
|
301
301
|
|
|
302
302
|
### Reference
|
|
303
|
-
* [`
|
|
303
|
+
* [`PyPDF` documentation](https://pypdf.readthedocs.io/en/stable/) (`pdfalyzer` now requires `pypdf` 5.x, so make sure you are reading the docs for that major version when working with `pdfalyze`)
|
|
304
304
|
|
|
305
305
|
|
|
306
306
|
# TODO
|
|
307
|
-
* Upgrade `PyPDF` to latest and expand `combine_pdfs` compression command line option
|
|
308
307
|
* Highlight decodes with a lot of Javascript keywords
|
|
309
308
|
* https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
|
|
310
309
|
* https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
|
|
@@ -42,7 +42,7 @@ Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `
|
|
|
42
42
|
pipx install pdfalyzer
|
|
43
43
|
```
|
|
44
44
|
|
|
45
|
-
See [
|
|
45
|
+
See [PyPDF installation notes](https://github.com/py-pdf/pypdf#installation) about `PyCryptodome` if you plan to `pdfalyze` any files that use AES encryption.
|
|
46
46
|
|
|
47
47
|
If you are on macOS someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so `brew install pdfalyzer` should work.
|
|
48
48
|
|
|
@@ -94,7 +94,7 @@ Warnings will be printed if any PDF object ID between 1 and the `/Size` reported
|
|
|
94
94
|
## Use As A Code Library
|
|
95
95
|
For info about setting up a dev environment see [Contributing](#contributing) below.
|
|
96
96
|
|
|
97
|
-
At its core The Pdfalyzer is taking PDF internal objects gathered by [
|
|
97
|
+
At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class. Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
|
|
98
98
|
|
|
99
99
|
As far as The Pdfalyzer's unique functionality goes, [`Pdfalyzer`](pdfalyzer/pdfalyzer.py) is the class at the heart of the operation. It holds the PDF's logical tree as well as a few other data structures. Chief among these are the [`FontInfo`](pdfalyzer/font_info.py) class which pulls together various properties of a font strewn across 3 or 4 different PDF objects and the [`BinaryScanner`](pdfalyzer/binary/binary_scanner.py) class which lets you dig through the embedded streams' bytes looking for suspicious patterns.
|
|
100
100
|
|
|
@@ -163,7 +163,7 @@ This image shows a more in-depth view of the PDF tree for the same document s
|
|
|
163
163
|
|
|
164
164
|
## Fonts
|
|
165
165
|
|
|
166
|
-
#### **Extract character mappings from ancient Adobe font formats**. It's actually `
|
|
166
|
+
#### **Extract character mappings from ancient Adobe font formats**. It's actually `PyPDF` doing the lifting here but we're happy to take the credit.
|
|
167
167
|
|
|
168
168
|

|
|
169
169
|
|
|
@@ -246,7 +246,7 @@ scripts/install_t1utils.sh
|
|
|
246
246
|
## Did The World Really Need Another PDF Tool?
|
|
247
247
|
This tool was built to fill a gap in the PDF assessment landscape following [my own recent experience trying to find malicious content in a PDF file](https://twitter.com/Cryptadamist/status/1570167937381826560). Didier Stevens's [pdfid.py](https://github.com/DidierStevens/DidierStevensSuite/blob/master/pdfid.py) and [pdf-parser.py](https://github.com/DidierStevens/DidierStevensSuite/blob/master/pdf-parser.py) are still the best game in town when it comes to PDF analysis tools but they lack in the visualization department and also don't give you much to work with as far as giving you a data model you can write your own code around. [Peepdf](https://github.com/jesparza/peepdf) seemed promising but turned out to be in a buggy, out of date, and more or less unfixable state. And neither of them offered much in the way of tooling for embedded binary analysis.
|
|
248
248
|
|
|
249
|
-
Thus I felt the world might be slightly improved if I strung together a couple of more stable/well known/actively maintained open source projects ([AnyTree](https://github.com/c0fec0de/anytree), [
|
|
249
|
+
Thus I felt the world might be slightly improved if I strung together a couple of more stable/well known/actively maintained open source projects ([AnyTree](https://github.com/c0fec0de/anytree), [PyPDF](https://github.com/py-pdf/pypdf), [Rich](https://github.com/Textualize/rich), and [YARA](https://github.com/VirusTotal/yara-python) via [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer)) into this tool.
|
|
250
250
|
|
|
251
251
|
-------------
|
|
252
252
|
|
|
@@ -260,7 +260,7 @@ These are the naming conventions at play in The Pdfalyzer code base:
|
|
|
260
260
|
|
|
261
261
|
| Term | Meaning |
|
|
262
262
|
| ----------------- | ---------------- |
|
|
263
|
-
| **`PDF Object`** | Instance of a `
|
|
263
|
+
| **`PDF Object`** | Instance of a `PyPDF` class that represents the information stored in the PDF binary between open and close guillemet quotes (« and ») |
|
|
264
264
|
| **`reference_key`** | String found in a PDF object that names a property (e.g. `/BaseFont` or `/Subtype`) |
|
|
265
265
|
| **`reference`** | Link _from_ a PDF object _to_ another node. Outward facing relationships, basically. |
|
|
266
266
|
| **`address`** | `reference_key` plus a hash key or numerical array index if that's how the reference works. e.g. if node A has a reference key `/Resources` pointing to a dict `{'/Font2': [IndirectObject(55), IndirectObject(2)]}` the address of `IndirectObject(55)` from node A would be `/Resources[/Font2][0]` |
|
|
@@ -271,11 +271,10 @@ These are the naming conventions at play in The Pdfalyzer code base:
|
|
|
271
271
|
| **`link_node`** | nodes like `/Dest` that just contain a pointer to another node |
|
|
272
272
|
|
|
273
273
|
### Reference
|
|
274
|
-
* [`
|
|
274
|
+
* [`PyPDF` documentation](https://pypdf.readthedocs.io/en/stable/) (`pdfalyzer` now requires `pypdf` 5.x, so make sure you are reading the docs for that major version when working with `pdfalyze`)
|
|
275
275
|
|
|
276
276
|
|
|
277
277
|
# TODO
|
|
278
|
-
* Upgrade `PyPDF` to latest and expand `combine_pdfs` compression command line option
|
|
279
278
|
* Highlight decodes with a lot of Javascript keywords
|
|
280
279
|
* https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
|
|
281
280
|
* https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
|
|
@@ -4,9 +4,8 @@ from os import environ, getcwd, path
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
|
|
6
6
|
from dotenv import load_dotenv
|
|
7
|
-
|
|
8
|
-
from
|
|
9
|
-
from PyPDF2.errors import PdfReadError
|
|
7
|
+
from pypdf import PdfWriter
|
|
8
|
+
from pypdf.errors import PdfReadError
|
|
10
9
|
|
|
11
10
|
# Should be first local import before load_dotenv() (or at least I think it needs to come first)
|
|
12
11
|
from pdfalyzer.config import PdfalyzerConfig
|
|
@@ -31,7 +30,8 @@ from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
|
31
30
|
from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
|
|
32
31
|
from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
|
|
33
32
|
from pdfalyzer.pdfalyzer import Pdfalyzer
|
|
34
|
-
from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments,
|
|
33
|
+
from pdfalyzer.util.argument_parser import (MAX_QUALITY, ask_to_proceed, output_sections, parse_arguments,
|
|
34
|
+
parse_combine_pdfs_args)
|
|
35
35
|
from pdfalyzer.util.pdf_parser_manager import PdfParserManager
|
|
36
36
|
|
|
37
37
|
# For the table shown by running pdfalyzer_show_color_theme
|
|
@@ -51,6 +51,7 @@ def pdfalyze():
|
|
|
51
51
|
log_and_print(f"Binary stream extraction complete, files written to '{args.output_dir}'.\nExiting.\n")
|
|
52
52
|
sys.exit()
|
|
53
53
|
|
|
54
|
+
# The method that gets called is related to the argument name. See 'possible_output_sections' list in argument_parser.py
|
|
54
55
|
# Analysis exports wrap themselves around the methods that actually generate the analyses
|
|
55
56
|
for (arg, method) in output_sections(args, pdfalyzer):
|
|
56
57
|
if args.output_dir:
|
|
@@ -92,10 +93,13 @@ def pdfalyzer_show_color_theme() -> None:
|
|
|
92
93
|
|
|
93
94
|
|
|
94
95
|
def combine_pdfs():
|
|
95
|
-
"""
|
|
96
|
+
"""
|
|
97
|
+
Utility method to combine multiple PDFs into one. Invocable with 'combine_pdfs PDF1 [PDF2...]'.
|
|
98
|
+
Example: https://github.com/py-pdf/pypdf/blob/main/docs/user/merging-pdfs.md
|
|
99
|
+
"""
|
|
96
100
|
args = parse_combine_pdfs_args()
|
|
97
101
|
set_max_open_files(args.number_of_pdfs)
|
|
98
|
-
merger =
|
|
102
|
+
merger = PdfWriter()
|
|
99
103
|
|
|
100
104
|
for pdf in args.pdfs:
|
|
101
105
|
try:
|
|
@@ -105,18 +109,19 @@ def combine_pdfs():
|
|
|
105
109
|
print_highlighted(f" -> Failed to merge '{pdf}'! {e}", style='red')
|
|
106
110
|
ask_to_proceed()
|
|
107
111
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
+
# Iterate through pages and compress, lowering image quality if requested
|
|
113
|
+
# See https://pypdf.readthedocs.io/en/latest/user/file-size.html#reducing-image-quality
|
|
114
|
+
for i, page in enumerate(merger.pages):
|
|
115
|
+
if args.image_quality < MAX_QUALITY:
|
|
116
|
+
for j, img in enumerate(page.images):
|
|
117
|
+
print_highlighted(f" -> Reducing image #{j + 1} quality on page {i + 1} to {args.image_quality}...", style='dim')
|
|
118
|
+
img.replace(img.image, quality=args.image_quality)
|
|
112
119
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
# See https://pypdf.readthedocs.io/en/latest/user/file-size.html#reducing-image-quality
|
|
116
|
-
print_highlighted(f" -> Compressing page {i + 1}...", style='dim')
|
|
117
|
-
page.pagedata.compress_content_streams() # This is CPU intensive!
|
|
120
|
+
print_highlighted(f" -> Compressing page {i + 1}...", style='dim')
|
|
121
|
+
page.compress_content_streams() # This is CPU intensive!
|
|
118
122
|
|
|
119
123
|
print_highlighted(f"\nWriting '{args.output_file}'...", style='cyan')
|
|
124
|
+
merger.compress_identical_objects(remove_identicals=True, remove_orphans=True)
|
|
120
125
|
merger.write(args.output_file)
|
|
121
126
|
merger.close()
|
|
122
127
|
txt = Text('').append(f" -> Wrote ")
|
|
@@ -3,7 +3,7 @@ Deprecated old, pre-tree, more raw format reader. Only used for debugging these d
|
|
|
3
3
|
"""
|
|
4
4
|
from io import StringIO
|
|
5
5
|
|
|
6
|
-
from
|
|
6
|
+
from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject
|
|
7
7
|
from rich.console import Console
|
|
8
8
|
from rich.markup import escape
|
|
9
9
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Decorator for
|
|
2
|
+
Decorator for PyPDF PdfObject that extracts a couple of properties (type, label, etc).
|
|
3
3
|
"""
|
|
4
4
|
from typing import Any, List, Optional, Union
|
|
5
5
|
|
|
6
|
-
from
|
|
6
|
+
from pypdf.generic import DictionaryObject, IndirectObject, NumberObject, PdfObject
|
|
7
7
|
from rich.text import Text
|
|
8
8
|
from yaralyzer.util.logging import log
|
|
9
9
|
|
|
@@ -9,8 +9,8 @@ hooks)
|
|
|
9
9
|
from typing import Callable, List, Optional, Set
|
|
10
10
|
|
|
11
11
|
from anytree import NodeMixin, SymlinkNode
|
|
12
|
-
from
|
|
13
|
-
from
|
|
12
|
+
from pypdf.errors import PdfReadError
|
|
13
|
+
from pypdf.generic import IndirectObject, PdfObject, StreamObject
|
|
14
14
|
from rich.markup import escape
|
|
15
15
|
from rich.text import Text
|
|
16
16
|
from yaralyzer.output.rich_console import console
|
|
@@ -41,7 +41,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
|
|
|
41
41
|
self.stream_data = self.obj.get_data()
|
|
42
42
|
self.stream_length = len(self.stream_data)
|
|
43
43
|
except (NotImplementedError, PdfReadError) as e:
|
|
44
|
-
msg = f"
|
|
44
|
+
msg = f"PyPDF failed to decode stream in {self}: {e}.\n" + \
|
|
45
45
|
"Trees will be unaffected but scans/extractions will not be able to check this stream."
|
|
46
46
|
console.print_exception()
|
|
47
47
|
log.warning(msg)
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Verify that the PDF tree is complete/contains all the nodes in the PDF file.
|
|
3
3
|
"""
|
|
4
|
-
from
|
|
5
|
-
from
|
|
4
|
+
from pypdf.errors import PdfReadError
|
|
5
|
+
from pypdf.generic import IndirectObject, NameObject, NumberObject
|
|
6
6
|
from rich.markup import escape
|
|
7
7
|
from yaralyzer.output.rich_console import console
|
|
8
8
|
from yaralyzer.util.logging import log
|
|
@@ -8,6 +8,8 @@ from typing import Optional, Union
|
|
|
8
8
|
from yaralyzer.config import YaralyzerConfig
|
|
9
9
|
from yaralyzer.yaralyzer import Yaralyzer
|
|
10
10
|
|
|
11
|
+
from pdfalyzer.config import PdfalyzerConfig
|
|
12
|
+
|
|
11
13
|
YARA_RULES_DIR = files('pdfalyzer').joinpath('yara_rules')
|
|
12
14
|
|
|
13
15
|
YARA_RULES_FILES = [
|
|
@@ -32,8 +34,11 @@ def _build_yaralyzer(scannable: Union[bytes, str], label: Optional[str] = None)
|
|
|
32
34
|
with as_file(YARA_RULES_DIR.joinpath(YARA_RULES_FILES[0])) as yara0:
|
|
33
35
|
with as_file(YARA_RULES_DIR.joinpath(YARA_RULES_FILES[1])) as yara1:
|
|
34
36
|
with as_file(YARA_RULES_DIR.joinpath(YARA_RULES_FILES[2])) as yara2:
|
|
35
|
-
|
|
36
|
-
rules_paths
|
|
37
|
+
# Start with any custom YARA rules files passed as arguments; the default files in the yara_rules/ dir are appended unless --no-default-yara-rules is set
|
|
38
|
+
rules_paths = YaralyzerConfig.args.yara_rules_files or []
|
|
39
|
+
|
|
40
|
+
if not YaralyzerConfig.args.no_default_yara_rules:
|
|
41
|
+
rules_paths += [str(y) for y in [yara0, yara1, yara2]]
|
|
37
42
|
|
|
38
43
|
try:
|
|
39
44
|
return Yaralyzer.for_rules_files(rules_paths, scannable, label)
|
|
@@ -3,8 +3,8 @@ Unify font information spread across a bunch of PdfObjects (Font, FontDescriptor
|
|
|
3
3
|
and FontFile) into a single class.
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
-
from
|
|
7
|
-
from
|
|
6
|
+
from pypdf._cmap import build_char_map, prepare_cm
|
|
7
|
+
from pypdf.generic import IndirectObject, PdfObject
|
|
8
8
|
from rich.text import Text
|
|
9
9
|
from yaralyzer.output.rich_console import console
|
|
10
10
|
from yaralyzer.util.logging import log
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Some methods to help with the direct manipulation/processing of
|
|
2
|
+
Some methods to help with the direct manipulation/processing of PyPDF's PdfObjects
|
|
3
3
|
"""
|
|
4
4
|
from typing import List, Optional
|
|
5
5
|
|
|
6
|
-
from
|
|
6
|
+
from pypdf.generic import IndirectObject, PdfObject
|
|
7
7
|
|
|
8
8
|
from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
|
|
9
9
|
from pdfalyzer.util.adobe_strings import *
|
|
@@ -24,7 +24,7 @@ def _sort_pdf_object_refs(refs: List[PdfObjectRelationship]) -> List[PdfObjectRe
|
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
def pypdf_class_name(obj: PdfObject) -> str:
|
|
27
|
-
"""Shortened name of type(obj), e.g.
|
|
27
|
+
"""Shortened name of type(obj), e.g. PyPDF.generic._data_structures.ArrayObject becomes Array"""
|
|
28
28
|
class_pkgs = type(obj).__name__.split('.')
|
|
29
29
|
class_pkgs.reverse()
|
|
30
30
|
return class_pkgs[0].removesuffix('Object')
|
|
@@ -4,7 +4,7 @@ Functions for miscellaneous Rich text/string operations.
|
|
|
4
4
|
from functools import partial
|
|
5
5
|
from typing import List
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from pypdf.generic import PdfObject
|
|
8
8
|
from rich.console import Console
|
|
9
9
|
from rich.highlighter import RegexHighlighter, JSONHighlighter
|
|
10
10
|
from rich.text import Text
|
|
@@ -12,13 +12,13 @@ from pdfalyzer.helpers.rich_text_helper import quoted_text
|
|
|
12
12
|
from pdfalyzer.helpers.string_helper import pp
|
|
13
13
|
from pdfalyzer.output.layout import print_headline_panel, subheading_width
|
|
14
14
|
|
|
15
|
-
CHARMAP_TITLE = 'Character Mapping (As Extracted By
|
|
15
|
+
CHARMAP_TITLE = 'Character Mapping (As Extracted By PyPDF)'
|
|
16
16
|
CHARMAP_TITLE_PADDING = (1, 0, 0, 2)
|
|
17
17
|
CHARMAP_PADDING = (0, 2, 0, 10)
|
|
18
18
|
|
|
19
19
|
|
|
20
20
|
def print_character_mapping(font: 'FontInfo') -> None:
|
|
21
|
-
"""Prints the character mapping extracted by
|
|
21
|
+
"""Prints the character mapping extracted by PyPDF._charmap in tidy columns"""
|
|
22
22
|
if font.character_mapping is None or len(font.character_mapping) == 0:
|
|
23
23
|
log.info(f"No character map found in {font}")
|
|
24
24
|
return
|
|
@@ -38,12 +38,12 @@ def print_character_mapping(font: 'FontInfo') -> None:
|
|
|
38
38
|
|
|
39
39
|
|
|
40
40
|
def print_prepared_charmap(font: 'FontInfo'):
|
|
41
|
-
"""Prints the prepared_charmap returned by
|
|
41
|
+
"""Prints the prepared_charmap returned by PyPDF."""
|
|
42
42
|
if font.prepared_char_map is None:
|
|
43
43
|
log.info(f"No prepared_charmap found in {font}")
|
|
44
44
|
return
|
|
45
45
|
|
|
46
|
-
headline = f"{font} Adobe PostScript charmap prepared by
|
|
46
|
+
headline = f"{font} Adobe PostScript charmap prepared by PyPDF"
|
|
47
47
|
print_headline_panel(headline, style='charmap.prepared_title')
|
|
48
48
|
print_bytes(font.prepared_char_map, style='charmap.prepared')
|
|
49
49
|
console.line()
|
|
@@ -47,7 +47,7 @@ class PdfalyzerPresenter:
|
|
|
47
47
|
def print_document_info(self) -> None:
|
|
48
48
|
"""Print the embedded document info (author, timestamps, version, etc)."""
|
|
49
49
|
print_section_header(f'Document Info for {self.pdfalyzer.pdf_basename}')
|
|
50
|
-
console.print(pp.pformat(self.pdfalyzer.pdf_reader.
|
|
50
|
+
console.print(pp.pformat(self.pdfalyzer.pdf_reader.metadata))
|
|
51
51
|
console.line()
|
|
52
52
|
console.print(bytes_hashes_table(self.pdfalyzer.pdf_bytes, self.pdfalyzer.pdf_basename))
|
|
53
53
|
console.line()
|
|
@@ -124,7 +124,7 @@ class PdfalyzerPresenter:
|
|
|
124
124
|
console.print(build_decoding_stats_table(binary_scanner), justify='center')
|
|
125
125
|
|
|
126
126
|
def print_yara_results(self) -> None:
|
|
127
|
-
"""Scan the
|
|
127
|
+
"""Scan the main PDF and each individual binary stream in it with yara_rules/*.yara files"""
|
|
128
128
|
print_section_header(f"YARA Scan of PDF rules for '{self.pdfalyzer.pdf_basename}'")
|
|
129
129
|
YaralyzerConfig.args.standalone_mode = True # TODO: using 'standalone mode' like this kind of sucks
|
|
130
130
|
|
|
@@ -6,7 +6,7 @@ from collections import namedtuple
|
|
|
6
6
|
from numbers import Number
|
|
7
7
|
from typing import Any
|
|
8
8
|
|
|
9
|
-
from
|
|
9
|
+
from pypdf.generic import (ArrayObject, ByteStringObject, EncodedStreamObject, IndirectObject,
|
|
10
10
|
StreamObject, TextStringObject)
|
|
11
11
|
from yaralyzer.output.rich_console import YARALYZER_THEME_DICT
|
|
12
12
|
|
|
@@ -5,7 +5,7 @@ from collections import namedtuple
|
|
|
5
5
|
from typing import List, Optional
|
|
6
6
|
|
|
7
7
|
from anytree import SymlinkNode
|
|
8
|
-
from
|
|
8
|
+
from pypdf.generic import StreamObject
|
|
9
9
|
from rich.markup import escape
|
|
10
10
|
from rich.panel import Panel
|
|
11
11
|
from rich.table import Table
|
|
@@ -3,7 +3,7 @@ Simple container class for information about a link between two PDF objects.
|
|
|
3
3
|
"""
|
|
4
4
|
from typing import List, Optional, Union
|
|
5
5
|
|
|
6
|
-
from
|
|
6
|
+
from pypdf.generic import IndirectObject, PdfObject
|
|
7
7
|
from yaralyzer.util.logging import log
|
|
8
8
|
|
|
9
9
|
from pdfalyzer.helpers.string_helper import bracketed, is_prefixed_by_any
|
|
@@ -11,8 +11,8 @@ from typing import Dict, Iterator, List, Optional
|
|
|
11
11
|
|
|
12
12
|
from anytree import LevelOrderIter, SymlinkNode
|
|
13
13
|
from anytree.search import findall, findall_by_attr
|
|
14
|
-
from
|
|
15
|
-
from
|
|
14
|
+
from pypdf import PdfReader
|
|
15
|
+
from pypdf.generic import IndirectObject
|
|
16
16
|
from yaralyzer.helpers.file_helper import load_binary_data
|
|
17
17
|
from yaralyzer.output.file_hashes_table import compute_file_hashes
|
|
18
18
|
from yaralyzer.output.rich_console import console
|
|
@@ -36,7 +36,7 @@ class Pdfalyzer:
|
|
|
36
36
|
self.pdf_basename = basename(pdf_path)
|
|
37
37
|
self.pdf_bytes = load_binary_data(pdf_path)
|
|
38
38
|
self.pdf_bytes_info = compute_file_hashes(self.pdf_bytes)
|
|
39
|
-
pdf_file = open(pdf_path, 'rb') # Filehandle must be left open for
|
|
39
|
+
pdf_file = open(pdf_path, 'rb') # Filehandle must be left open for PyPDF to perform seeks
|
|
40
40
|
self.pdf_reader = PdfReader(pdf_file)
|
|
41
41
|
|
|
42
42
|
# Initialize tracking variables
|
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
String constants specified in the Adobe specs for PDFs, fonts, etc.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
from
|
|
6
|
-
PagesAttributes,
|
|
5
|
+
from pypdf.constants import (CatalogDictionary, ImageAttributes, PageAttributes,
|
|
6
|
+
PagesAttributes, Resources)
|
|
7
7
|
|
|
8
8
|
from pdfalyzer.helpers.string_helper import is_prefixed_by_any
|
|
9
9
|
|
|
@@ -9,7 +9,7 @@ from typing import List
|
|
|
9
9
|
from rich_argparse_plus import RichHelpFormatterPlus
|
|
10
10
|
from rich.prompt import Confirm
|
|
11
11
|
from rich.text import Text
|
|
12
|
-
from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args
|
|
12
|
+
from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args, source
|
|
13
13
|
from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation
|
|
14
14
|
|
|
15
15
|
from pdfalyzer.config import ALL_STREAMS, PdfalyzerConfig
|
|
@@ -50,8 +50,13 @@ export.add_argument('-bin', '--extract-binary-streams',
|
|
|
50
50
|
const='bin',
|
|
51
51
|
help='extract all binary streams in the PDF to separate files (requires pdf-parser.py)')
|
|
52
52
|
|
|
53
|
+
# Add one more option to the YARA rules section
|
|
54
|
+
source.add_argument('--no-default-yara-rules',
|
|
55
|
+
action='store_true',
|
|
56
|
+
help='if --yara is selected use only custom rules from --yara-file arg and not the default included YARA rules')
|
|
53
57
|
|
|
54
|
-
|
|
58
|
+
|
|
59
|
+
# Note that we extend the yaralyzer's parser and export
|
|
55
60
|
parser = ArgumentParser(
|
|
56
61
|
formatter_class=RichHelpFormatterPlus,
|
|
57
62
|
description=DESCRIPTION,
|
|
@@ -78,7 +83,7 @@ select.add_argument('-f', '--fonts', action='store_true',
|
|
|
78
83
|
help="show info about fonts included character mappings for embedded font binaries")
|
|
79
84
|
|
|
80
85
|
select.add_argument('-y', '--yara', action='store_true',
|
|
81
|
-
help="scan the PDF with YARA rules")
|
|
86
|
+
help="scan the PDF with the included malicious PDF YARA rules and/or your custom YARA rules")
|
|
82
87
|
|
|
83
88
|
select.add_argument('-c', '--counts', action='store_true',
|
|
84
89
|
help='show counts of some of the properties of the objects in the PDF')
|
|
@@ -127,10 +132,13 @@ def parse_arguments():
|
|
|
127
132
|
|
|
128
133
|
if not args.streams:
|
|
129
134
|
if args.extract_quoteds:
|
|
130
|
-
|
|
135
|
+
exit_with_error("--extract-quoted does nothing if --streams is not selected")
|
|
131
136
|
if args.suppress_boms:
|
|
132
137
|
log.warning("--suppress-boms has nothing to suppress if --streams is not selected")
|
|
133
138
|
|
|
139
|
+
if args.no_default_yara_rules and not args.yara_rules_files:
|
|
140
|
+
exit_with_error("--no-default-yara-rules requires at least one --yara-file argument")
|
|
141
|
+
|
|
134
142
|
# File export options
|
|
135
143
|
if args.export_svg or args.export_txt or args.export_html or args.extract_binary_streams:
|
|
136
144
|
args.output_dir = args.output_dir or getcwd()
|
|
@@ -149,8 +157,8 @@ def parse_arguments():
|
|
|
149
157
|
|
|
150
158
|
def output_sections(args, pdfalyzer) -> List[OutputSection]:
|
|
151
159
|
"""
|
|
152
|
-
Determine which of the tree visualizations, font scans, etc
|
|
153
|
-
If nothing
|
|
160
|
+
Determine which of the tree visualizations, font scans, etc should be run.
|
|
161
|
+
If nothing is specified output ALL sections other than --streams which is v. slow/verbose.
|
|
154
162
|
"""
|
|
155
163
|
# Create a partial for print_font_info() because it's the only one that can take an argument
|
|
156
164
|
# partials have no __name__ so update_wrapper() propagates the 'print_font_info' as this partial's name
|
|
@@ -158,7 +166,8 @@ def output_sections(args, pdfalyzer) -> List[OutputSection]:
|
|
|
158
166
|
stream_scan = partial(pdfalyzer.print_streams_analysis, idnum=stream_id)
|
|
159
167
|
update_wrapper(stream_scan, pdfalyzer.print_streams_analysis)
|
|
160
168
|
|
|
161
|
-
#
|
|
169
|
+
# 1st element string matches the argument in 'select' group
|
|
170
|
+
# 2nd is fxn to call if selected.
|
|
162
171
|
# Top to bottom is the default order of output.
|
|
163
172
|
possible_output_sections = [
|
|
164
173
|
OutputSection(DOCINFO, pdfalyzer.print_document_info),
|
|
@@ -187,6 +196,8 @@ def all_sections_chosen(args):
|
|
|
187
196
|
###############################################
|
|
188
197
|
# Separate arg parser for combine_pdfs script #
|
|
189
198
|
###############################################
|
|
199
|
+
MAX_QUALITY = 10
|
|
200
|
+
|
|
190
201
|
combine_pdfs_parser = ArgumentParser(
|
|
191
202
|
description="Combine multiple PDFs into one.",
|
|
192
203
|
epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" \
|
|
@@ -198,10 +209,10 @@ combine_pdfs_parser.add_argument('pdfs',
|
|
|
198
209
|
metavar='PDF_PATH',
|
|
199
210
|
nargs='+')
|
|
200
211
|
|
|
201
|
-
combine_pdfs_parser.add_argument('-
|
|
202
|
-
help='
|
|
203
|
-
choices=range(
|
|
204
|
-
default=
|
|
212
|
+
combine_pdfs_parser.add_argument('-iq', '--image-quality',
|
|
213
|
+
help='image quality for embedded images (can compress PDF at loss of quality)',
|
|
214
|
+
choices=range(1, MAX_QUALITY + 1),
|
|
215
|
+
default=MAX_QUALITY,
|
|
205
216
|
type=int)
|
|
206
217
|
|
|
207
218
|
combine_pdfs_parser.add_argument('-o', '--output-file',
|
|
@@ -246,7 +257,7 @@ def ask_to_proceed() -> None:
|
|
|
246
257
|
def exit_with_error(error_message: str|None = None) -> None:
|
|
247
258
|
"""Print 'error_message' and exit with status code 1."""
|
|
248
259
|
if error_message:
|
|
249
|
-
print_highlighted(
|
|
260
|
+
print_highlighted(Text('').append('ERROR', style='bold red').append(f': {error_message}'))
|
|
250
261
|
|
|
251
|
-
print_highlighted('Exiting...', style='red')
|
|
262
|
+
print_highlighted('Exiting...', style='dim red')
|
|
252
263
|
sys.exit(1)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "pdfalyzer"
|
|
3
|
-
version = "1.15.0"
|
|
3
|
+
version = "1.16.0"
|
|
4
4
|
description = "A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more."
|
|
5
5
|
authors = ["Michel de Cryptadamus <michel@cryptadamus.com>"]
|
|
6
6
|
license = "GPL-3.0-or-later"
|
|
@@ -51,7 +51,7 @@ pdfalyzer_show_color_theme = 'pdfalyzer:pdfalyzer_show_color_theme'
|
|
|
51
51
|
python = "^3.9"
|
|
52
52
|
anytree = "~=2.8"
|
|
53
53
|
chardet = ">=5.0.0,<6.0.0"
|
|
54
|
-
|
|
54
|
+
pypdf = "^5.0.1"
|
|
55
55
|
python-dotenv = "^0.21.0"
|
|
56
56
|
rich = "^12.5.1"
|
|
57
57
|
rich-argparse-plus = "^0.3.1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{pdfalyzer-1.15.0 → pdfalyzer-1.16.0}/pdfalyzer/detection/constants/javascript_reserved_keywords.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|