pdfalyzer 1.17.6__tar.gz → 1.17.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/CHANGELOG.md +9 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/PKG-INFO +22 -26
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/README.md +19 -23
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/helpers/image_helper.py +3 -3
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/helpers/rich_text_helper.py +46 -41
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/pdfalyzer_presenter.py +12 -1
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/pdfalyzer.py +14 -3
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pyproject.toml +3 -3
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/.pdfalyzer.example +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/LICENSE +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/__init__.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/__main__.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/binary/binary_scanner.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/config.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/decorators/document_model_printer.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/decorators/indeterminate_node.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/decorators/pdf_file.py +1 -1
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/decorators/pdf_object_properties.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/decorators/pdf_tree_node.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/decorators/pdf_tree_verifier.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/detection/constants/binary_regexes.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/detection/constants/javascript_reserved_keywords.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/detection/javascript_hunter.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/detection/yaralyzer_helper.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/font_info.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/helpers/dict_helper.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/helpers/filesystem_helper.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/helpers/number_helper.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/helpers/pdf_object_helper.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/helpers/string_helper.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/character_mapping.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/layout.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/styles/node_colors.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/styles/rich_theme.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/tables/decoding_stats_table.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/tables/font_summary_table.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/tables/pdf_node_rich_table.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/tables/stream_objects_table.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/pdf_object_relationship.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/util/adobe_strings.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/util/argument_parser.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/util/cli_tools_argument_parser.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/util/debugging.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/util/exceptions.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/util/page_range.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/util/pdf_parser_manager.py +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/PDF.yara +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/PDF_binary_stream.yara +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/__init.py__ +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/didier_stevens.yara +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/lprat.static_file_analysis.yara +0 -0
- {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/pdf_malware.yara +0 -0
|
@@ -1,5 +1,14 @@
|
|
|
1
1
|
# NEXT RELEASE
|
|
2
2
|
|
|
3
|
+
### 1.17.9
|
|
4
|
+
* Broaden exception handling in `FontInfo` extraction
|
|
5
|
+
|
|
6
|
+
### 1.17.8
|
|
7
|
+
* Handle `AttributeError` in `FontInfo` extraction
|
|
8
|
+
|
|
9
|
+
### 1.17.7
|
|
10
|
+
* Bump `pypdf` to 6.1.3 (fixes [#31](https://github.com/michelcrypt4d4mus/pdfalyzer/issues/31)), `PyMuPDF` to 1.26.5
|
|
11
|
+
|
|
3
12
|
### 1.17.6
|
|
4
13
|
* Better handling for errors resulting from bugs in PyPDF
|
|
5
14
|
* Properly close file handle when pdfalyzing is complete
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pdfalyzer
|
|
3
|
-
Version: 1.17.6
|
|
3
|
+
Version: 1.17.9
|
|
4
4
|
Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
|
|
5
5
|
Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
6
6
|
License: GPL-3.0-or-later
|
|
@@ -22,9 +22,9 @@ Classifier: Topic :: Artistic Software
|
|
|
22
22
|
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
23
23
|
Classifier: Topic :: Security
|
|
24
24
|
Provides-Extra: extract
|
|
25
|
-
Requires-Dist: PyMuPDF (>=1.26.
|
|
25
|
+
Requires-Dist: PyMuPDF (>=1.26.5,<2.0.0) ; extra == "extract"
|
|
26
26
|
Requires-Dist: anytree (>=2.13,<3.0)
|
|
27
|
-
Requires-Dist: pypdf (>=6.
|
|
27
|
+
Requires-Dist: pypdf (>=6.1.3,<7.0.0)
|
|
28
28
|
Requires-Dist: pytesseract (>=0.3.13,<0.4.0) ; extra == "extract"
|
|
29
29
|
Requires-Dist: yaralyzer (>=1.0.9,<2.0.0)
|
|
30
30
|
Project-URL: Changelog, https://github.com/michelcrypt4d4mus/pdfalyzer/blob/master/CHANGELOG.md
|
|
@@ -67,9 +67,8 @@ If you're looking for one of these things this may be the tool for you.
|
|
|
67
67
|
### What It Don't Do
|
|
68
68
|
This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.
|
|
69
69
|
|
|
70
|
-
If you suspect you are dealing with a malicious PDF you can safely run `pdfalyze` on it
|
|
70
|
+
If you suspect you are dealing with a malicious PDF you can safely run `pdfalyze` on it. Embedded JavaScript and `/OpenAction` nodes etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
|
|
71
71
|
|
|
72
|
-
-------------
|
|
73
72
|
|
|
74
73
|
# Installation
|
|
75
74
|
#### All Platforms
|
|
@@ -99,7 +98,6 @@ brew install pdfalyzer
|
|
|
99
98
|
sudo apt-get install build-essential libssl-dev libffi-dev rustc
|
|
100
99
|
```
|
|
101
100
|
|
|
102
|
-
-------------
|
|
103
101
|
|
|
104
102
|
# Usage
|
|
105
103
|
|
|
@@ -115,7 +113,7 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
|
|
|
115
113
|
|
|
116
114
|
The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
|
|
117
115
|
|
|
118
|
-
|
|
116
|
+
#### Setting Command Line Options Permanently With A `.pdfalyzer` File
|
|
119
117
|
When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
|
|
120
118
|
|
|
121
119
|
1. the current directory
|
|
@@ -123,12 +121,9 @@ When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfa
|
|
|
123
121
|
|
|
124
122
|
If it finds a `.pdfalyzer` file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
|
|
125
123
|
|
|
126
|
-
|
|
124
|
+
#### Environment Variables
|
|
127
125
|
Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
|
|
128
126
|
|
|
129
|
-
### Colors And Themes
|
|
130
|
-
Run `pdfalyzer_show_color_theme` to see the color theme employed.
|
|
131
|
-
|
|
132
127
|
### Guarantees
|
|
133
128
|
Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
|
|
134
129
|
|
|
@@ -136,7 +131,22 @@ Warnings will be printed if any PDF object ID between 1 and the `/Size` reported
|
|
|
136
131
|
[BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
|
|
137
132
|
|
|
138
133
|
|
|
139
|
-
##
|
|
134
|
+
## Included Command Line Tools
|
|
135
|
+
The Pdfalyzer comes with a few command line tools for doing stuff with PDFs:
|
|
136
|
+
|
|
137
|
+
* `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
|
|
138
|
+
* `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
|
|
139
|
+
* `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
|
|
140
|
+
* `pdfalyzer_show_color_theme` - Run to see the color theme employed in Pdfalyzer's output.
|
|
141
|
+
|
|
142
|
+
Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
pipx install pdfalyzer[extract]
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
## As A Python Library
|
|
140
150
|
For info about setting up a dev environment see [Contributing](#contributing) below.
|
|
141
151
|
|
|
142
152
|
At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class. Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
|
|
@@ -247,20 +257,6 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
|
|
|
247
257
|
|
|
248
258
|
|
|
249
259
|
# PDF Resources
|
|
250
|
-
## Included PDF Tools
|
|
251
|
-
The Pdfalyzer comes with a few command line tools:
|
|
252
|
-
|
|
253
|
-
* `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
|
|
254
|
-
* `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
|
|
255
|
-
* `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
|
|
256
|
-
|
|
257
|
-
Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
|
|
258
|
-
|
|
259
|
-
```bash
|
|
260
|
-
pipx install pdfalyzer[extract]
|
|
261
|
-
```
|
|
262
|
-
|
|
263
|
-
|
|
264
260
|
## 3rd Party PDF Tools
|
|
265
261
|
### Installing Didier Stevens's PDF Analysis Tools
|
|
266
262
|
Stevens's tools provide comprehensive info about the contents of a PDF, are guaranteed not to trigger the rendering of any malicious content (especially `pdfid.py`), and have been battle tested for well over a decade. It would probably be a good idea to analyze your PDF with his tools before you start working with this one.
|
|
@@ -33,9 +33,8 @@ If you're looking for one of these things this may be the tool for you.
|
|
|
33
33
|
### What It Don't Do
|
|
34
34
|
This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.
|
|
35
35
|
|
|
36
|
-
If you suspect you are dealing with a malicious PDF you can safely run `pdfalyze` on it
|
|
36
|
+
If you suspect you are dealing with a malicious PDF you can safely run `pdfalyze` on it. Embedded JavaScript and `/OpenAction` nodes etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
|
|
37
37
|
|
|
38
|
-
-------------
|
|
39
38
|
|
|
40
39
|
# Installation
|
|
41
40
|
#### All Platforms
|
|
@@ -65,7 +64,6 @@ brew install pdfalyzer
|
|
|
65
64
|
sudo apt-get install build-essential libssl-dev libffi-dev rustc
|
|
66
65
|
```
|
|
67
66
|
|
|
68
|
-
-------------
|
|
69
67
|
|
|
70
68
|
# Usage
|
|
71
69
|
|
|
@@ -81,7 +79,7 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
|
|
|
81
79
|
|
|
82
80
|
The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
|
|
83
81
|
|
|
84
|
-
|
|
82
|
+
#### Setting Command Line Options Permanently With A `.pdfalyzer` File
|
|
85
83
|
When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
|
|
86
84
|
|
|
87
85
|
1. the current directory
|
|
@@ -89,12 +87,9 @@ When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfa
|
|
|
89
87
|
|
|
90
88
|
If it finds a `.pdfalyzer` file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
|
|
91
89
|
|
|
92
|
-
|
|
90
|
+
#### Environment Variables
|
|
93
91
|
Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
|
|
94
92
|
|
|
95
|
-
### Colors And Themes
|
|
96
|
-
Run `pdfalyzer_show_color_theme` to see the color theme employed.
|
|
97
|
-
|
|
98
93
|
### Guarantees
|
|
99
94
|
Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
|
|
100
95
|
|
|
@@ -102,7 +97,22 @@ Warnings will be printed if any PDF object ID between 1 and the `/Size` reported
|
|
|
102
97
|
[BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
|
|
103
98
|
|
|
104
99
|
|
|
105
|
-
##
|
|
100
|
+
## Included Command Line Tools
|
|
101
|
+
The Pdfalyzer comes with a few command line tools for doing stuff with PDFs:
|
|
102
|
+
|
|
103
|
+
* `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
|
|
104
|
+
* `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
|
|
105
|
+
* `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
|
|
106
|
+
* `pdfalyzer_show_color_theme` - Run to see the color theme employed in Pdfalyzer's output.
|
|
107
|
+
|
|
108
|
+
Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
pipx install pdfalyzer[extract]
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
## As A Python Library
|
|
106
116
|
For info about setting up a dev environment see [Contributing](#contributing) below.
|
|
107
117
|
|
|
108
118
|
At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class. Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
|
|
@@ -213,20 +223,6 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
|
|
|
213
223
|
|
|
214
224
|
|
|
215
225
|
# PDF Resources
|
|
216
|
-
## Included PDF Tools
|
|
217
|
-
The Pdfalyzer comes with a few command line tools:
|
|
218
|
-
|
|
219
|
-
* `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
|
|
220
|
-
* `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
|
|
221
|
-
* `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
|
|
222
|
-
|
|
223
|
-
Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
|
|
224
|
-
|
|
225
|
-
```bash
|
|
226
|
-
pipx install pdfalyzer[extract]
|
|
227
|
-
```
|
|
228
|
-
|
|
229
|
-
|
|
230
226
|
## 3rd Party PDF Tools
|
|
231
227
|
### Installing Didier Stevens's PDF Analysis Tools
|
|
232
228
|
Stevens's tools provide comprehensive info about the contents of a PDF, are guaranteed not to trigger the rendering of any malicious content (especially `pdfid.py`), and have been battle tested for well over a decade. It would probably be a good idea to analyze your PDF with his tools before you start working with this one.
|
|
@@ -2,7 +2,7 @@ from typing import Optional
|
|
|
2
2
|
|
|
3
3
|
from yaralyzer.output.rich_console import console
|
|
4
4
|
|
|
5
|
-
from pdfalyzer.helpers.rich_text_helper import
|
|
5
|
+
from pdfalyzer.helpers.rich_text_helper import print_warning
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def ocr_text(image: "Image.Image", image_name: str) -> Optional[str]: # noqa F821
|
|
@@ -15,10 +15,10 @@ def ocr_text(image: "Image.Image", image_name: str) -> Optional[str]: # noqa F8
|
|
|
15
15
|
text = pytesseract.image_to_string(image)
|
|
16
16
|
except pytesseract.pytesseract.TesseractError:
|
|
17
17
|
console.print_exception()
|
|
18
|
-
|
|
18
|
+
print_warning(f"Tesseract OCR failure '{image_name}'! No OCR text extracted...")
|
|
19
19
|
except OSError as e:
|
|
20
20
|
if 'truncated' in str(e):
|
|
21
|
-
|
|
21
|
+
print_warning(f"Truncated image file '{image_name}'!")
|
|
22
22
|
else:
|
|
23
23
|
console.print_exception()
|
|
24
24
|
console.print(f"Error while extracting '{image_name}'!", style='bright_red')
|
|
@@ -21,26 +21,9 @@ pdfalyzer_console = Console(color_system='256')
|
|
|
21
21
|
stderr_console = Console(color_system='256', file=stderr)
|
|
22
22
|
|
|
23
23
|
|
|
24
|
-
def
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def quoted_text(
|
|
30
|
-
_string: str,
|
|
31
|
-
style: str = '',
|
|
32
|
-
quote_char_style: str = 'white',
|
|
33
|
-
quote_char: str = "'"
|
|
34
|
-
) -> Text:
|
|
35
|
-
"""Wrap _string in 'quote_char'. Style 'quote_char' with 'quote_char_style'."""
|
|
36
|
-
quote_char_txt = Text(quote_char, style=quote_char_style)
|
|
37
|
-
txt = quote_char_txt + Text(_string, style=style) + quote_char_txt
|
|
38
|
-
txt.justify = 'center'
|
|
39
|
-
return txt
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
def indented_bullet(msg: Union[str, Text], style: Optional[str] = None) -> Text:
|
|
43
|
-
return Text(' ') + bullet_text(msg, style)
|
|
24
|
+
def attention_getting_panel(text: Text, title: str, style: str = 'white on red') -> Padding:
|
|
25
|
+
p = Panel(text, padding=(2), title=title, style=style)
|
|
26
|
+
return Padding(p, pad=(1, 10, 2, 10))
|
|
44
27
|
|
|
45
28
|
|
|
46
29
|
def bullet_text(msg: Union[str, Text], style: Optional[str] = None) -> Text:
|
|
@@ -50,6 +33,23 @@ def bullet_text(msg: Union[str, Text], style: Optional[str] = None) -> Text:
|
|
|
50
33
|
return Text(ARROW_BULLET).append(msg)
|
|
51
34
|
|
|
52
35
|
|
|
36
|
+
def comma_join_txt(text_objs: List[Text]) -> Text:
|
|
37
|
+
return Text(", ").join(text_objs)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def error_text(text: Union[str, Text]) -> Text:
|
|
41
|
+
msg = Text('').append(f"ERROR", style='bright_red').append(": ")
|
|
42
|
+
|
|
43
|
+
if isinstance(text, Text):
|
|
44
|
+
return msg + text
|
|
45
|
+
else:
|
|
46
|
+
return msg.append(text)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def indented_bullet(msg: Union[str, Text], style: Optional[str] = None) -> Text:
|
|
50
|
+
return Text(' ') + bullet_text(msg, style)
|
|
51
|
+
|
|
52
|
+
|
|
53
53
|
def mild_warning(msg: str) -> None:
|
|
54
54
|
console.print(indented_bullet(Text(msg, style='mild_warning')))
|
|
55
55
|
|
|
@@ -67,10 +67,6 @@ def node_label(idnum: int, label: str, pdf_object: PdfObject, underline: bool =
|
|
|
67
67
|
return text
|
|
68
68
|
|
|
69
69
|
|
|
70
|
-
def comma_join_txt(text_objs: List[Text]) -> Text:
|
|
71
|
-
return Text(", ").join(text_objs)
|
|
72
|
-
|
|
73
|
-
|
|
74
70
|
def number_and_pct(_number: int, total: int, digits: int = 1) -> Text:
|
|
75
71
|
"""Return e.g. '8 (80%)'."""
|
|
76
72
|
return Text(str(_number), style='bright_white').append_text(pct_txt(_number, total, digits))
|
|
@@ -82,28 +78,37 @@ def pct_txt(_number: int, total: int, digits: int = 1) -> Text:
|
|
|
82
78
|
return Text(f"({pct}%)", style='blue')
|
|
83
79
|
|
|
84
80
|
|
|
85
|
-
def
|
|
86
|
-
|
|
81
|
+
def print_error(text: Union[str, Text]) -> Text:
|
|
82
|
+
console.line()
|
|
83
|
+
console.print(error_text(text))
|
|
87
84
|
|
|
88
|
-
if isinstance(text, Text):
|
|
89
|
-
return msg + text
|
|
90
|
-
else:
|
|
91
|
-
return msg.append(text)
|
|
92
85
|
|
|
86
|
+
def print_highlighted(msg: Union[str, Text], **kwargs) -> None:
|
|
87
|
+
"""Print 'msg' with Rich highlighting."""
|
|
88
|
+
pdfalyzer_console.print(msg, highlight=True, **kwargs)
|
|
93
89
|
|
|
94
|
-
def error_text(text: Union[str, Text]) -> Text:
|
|
95
|
-
msg = Text('').append(f"ERROR", style='bright_red').append(": ")
|
|
96
90
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
else:
|
|
100
|
-
return msg.append(text)
|
|
91
|
+
def print_warning(text: Union[str, Text]) -> None:
|
|
92
|
+
console.print(_warning_text(text))
|
|
101
93
|
|
|
102
94
|
|
|
103
|
-
def
|
|
104
|
-
|
|
105
|
-
|
|
95
|
+
def quoted_text(
|
|
96
|
+
_string: str,
|
|
97
|
+
style: str = '',
|
|
98
|
+
quote_char_style: str = 'white',
|
|
99
|
+
quote_char: str = "'"
|
|
100
|
+
) -> Text:
|
|
101
|
+
"""Wrap _string in 'quote_char'. Style 'quote_char' with 'quote_char_style'."""
|
|
102
|
+
quote_char_txt = Text(quote_char, style=quote_char_style)
|
|
103
|
+
txt = quote_char_txt + Text(_string, style=style) + quote_char_txt
|
|
104
|
+
txt.justify = 'center'
|
|
105
|
+
return txt
|
|
106
106
|
|
|
107
107
|
|
|
108
|
-
def
|
|
109
|
-
|
|
108
|
+
def _warning_text(text: Union[str, Text]) -> Text:
|
|
109
|
+
msg = Text('').append(f"WARNING", style='bright_yellow').append(": ")
|
|
110
|
+
|
|
111
|
+
if isinstance(text, Text):
|
|
112
|
+
return msg + text
|
|
113
|
+
else:
|
|
114
|
+
return msg.append(text)
|
|
@@ -20,6 +20,7 @@ from pdfalyzer.binary.binary_scanner import BinaryScanner
|
|
|
20
20
|
from pdfalyzer.config import PdfalyzerConfig
|
|
21
21
|
from pdfalyzer.decorators.pdf_tree_node import DECODE_FAILURE_LEN
|
|
22
22
|
from pdfalyzer.detection.yaralyzer_helper import get_bytes_yaralyzer, get_file_yaralyzer
|
|
23
|
+
from pdfalyzer.helpers.rich_text_helper import print_error
|
|
23
24
|
from pdfalyzer.helpers.string_helper import pp
|
|
24
25
|
from pdfalyzer.output.layout import (print_fatal_error_panel, print_section_header, print_section_subheader,
|
|
25
26
|
print_section_sub_subheader)
|
|
@@ -27,12 +28,19 @@ from pdfalyzer.output.tables.decoding_stats_table import build_decoding_stats_ta
|
|
|
27
28
|
from pdfalyzer.output.tables.pdf_node_rich_table import generate_rich_tree, get_symlink_representation
|
|
28
29
|
from pdfalyzer.output.tables.stream_objects_table import stream_objects_table
|
|
29
30
|
from pdfalyzer.pdfalyzer import Pdfalyzer
|
|
30
|
-
# from pdfalyzer.util.adobe_strings import *
|
|
31
31
|
|
|
32
32
|
INTERNAL_YARA_ERROR_MSG = "Internal YARA error! YARA's error codes can be checked here: https://github.com/VirusTotal/yara/blob/master/libyara/include/yara/error.h" # noqa: E501
|
|
33
33
|
|
|
34
34
|
|
|
35
35
|
class PdfalyzerPresenter:
|
|
36
|
+
"""
|
|
37
|
+
Handles formatting of console text output for Pdfalyzer class.
|
|
38
|
+
|
|
39
|
+
Attributes:
|
|
40
|
+
pdfalyzer (Pdfalyzer): Pdfalyzer for a given PDF file
|
|
41
|
+
yaralyzer (Yaralyzer): Yaralyzer for a given PDF file
|
|
42
|
+
"""
|
|
43
|
+
|
|
36
44
|
def __init__(self, pdfalyzer: Pdfalyzer):
|
|
37
45
|
self.pdfalyzer = pdfalyzer
|
|
38
46
|
self.yaralyzer = get_file_yaralyzer(self.pdfalyzer.pdf_path)
|
|
@@ -83,6 +91,9 @@ class PdfalyzerPresenter:
|
|
|
83
91
|
"""Print informatin about all fonts that appear in this PDF."""
|
|
84
92
|
print_section_header(f'{len(self.pdfalyzer.font_infos)} fonts found in {self.pdfalyzer.pdf_basename}')
|
|
85
93
|
|
|
94
|
+
if self.pdfalyzer.font_info_extraction_error:
|
|
95
|
+
print_error(f"Failed to extract font information (error: {self.pdfalyzer.font_info_extraction_error})")
|
|
96
|
+
|
|
86
97
|
for font_info in [fi for fi in self.pdfalyzer.font_infos if font_idnum is None or font_idnum == fi.idnum]:
|
|
87
98
|
font_info.print_summary()
|
|
88
99
|
|
|
@@ -19,6 +19,7 @@ from pdfalyzer.decorators.indeterminate_node import IndeterminateNode
|
|
|
19
19
|
from pdfalyzer.decorators.pdf_tree_node import PdfTreeNode
|
|
20
20
|
from pdfalyzer.decorators.pdf_tree_verifier import PdfTreeVerifier
|
|
21
21
|
from pdfalyzer.font_info import FontInfo
|
|
22
|
+
from pdfalyzer.helpers.rich_text_helper import print_error
|
|
22
23
|
from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
|
|
23
24
|
from pdfalyzer.util.adobe_strings import *
|
|
24
25
|
from pdfalyzer.util.exceptions import PdfWalkError
|
|
@@ -37,6 +38,7 @@ class Pdfalyzer:
|
|
|
37
38
|
|
|
38
39
|
Attributes:
|
|
39
40
|
font_infos (List[FontInfo]): Font summary objects
|
|
41
|
+
font_info_extraction_error (Optional[Exception]): Error encountered extracting FontInfo (if any)
|
|
40
42
|
max_generation (int): Max revision number ("generation") encounted in this PDF.
|
|
41
43
|
nodes_encountered (Dict[int, PdfTreeNode]): Nodes we've traversed already.
|
|
42
44
|
pdf_basename (str): The base name of the PDF file (with extension).
|
|
@@ -70,6 +72,7 @@ class Pdfalyzer:
|
|
|
70
72
|
|
|
71
73
|
# Initialize tracking variables
|
|
72
74
|
self.font_infos: List[FontInfo] = [] # Font summary objects
|
|
75
|
+
self.font_info_extraction_error: Optional[Exception] = None
|
|
73
76
|
self.max_generation = 0 # PDF revisions are "generations"; this is the max generation encountered
|
|
74
77
|
self.nodes_encountered: Dict[int, PdfTreeNode] = {} # Nodes we've seen already
|
|
75
78
|
self._indeterminate_ids = set() # See INDETERMINATE_REF_KEYS comment
|
|
@@ -220,14 +223,22 @@ class Pdfalyzer:
|
|
|
220
223
|
def _extract_font_infos(self) -> None:
|
|
221
224
|
"""Extract information about fonts in the tree and place it in `self.font_infos`."""
|
|
222
225
|
for node in self.node_iterator():
|
|
223
|
-
if isinstance(node.obj, dict) and RESOURCES in node.obj:
|
|
224
|
-
|
|
225
|
-
|
|
226
|
+
if not (isinstance(node.obj, dict) and RESOURCES in node.obj):
|
|
227
|
+
continue
|
|
228
|
+
|
|
229
|
+
log.debug(f"Extracting fonts from node with '{RESOURCES}' key: {node}...")
|
|
230
|
+
known_font_ids = [fi.idnum for fi in self.font_infos]
|
|
226
231
|
|
|
232
|
+
try:
|
|
227
233
|
self.font_infos += [
|
|
228
234
|
fi for fi in FontInfo.extract_font_infos(node.obj)
|
|
229
235
|
if fi.idnum not in known_font_ids
|
|
230
236
|
]
|
|
237
|
+
except Exception as e:
|
|
238
|
+
self.font_info_extraction_error = e
|
|
239
|
+
console.line()
|
|
240
|
+
log.warning(f"Failed to extract font information from node: {node} (error: {e})")
|
|
241
|
+
console.line()
|
|
231
242
|
|
|
232
243
|
def _build_or_find_node(self, relationship: IndirectObject, relationship_key: str) -> PdfTreeNode:
|
|
233
244
|
"""If node in self.nodes_encountered already then return it, otherwise build a node and store it."""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "pdfalyzer"
|
|
3
|
-
version = "1.17.6"
|
|
3
|
+
version = "1.17.9"
|
|
4
4
|
description = "Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more."
|
|
5
5
|
authors = ["Michel de Cryptadamus <michel@cryptadamus.com>"]
|
|
6
6
|
license = "GPL-3.0-or-later"
|
|
@@ -67,8 +67,8 @@ packages = [
|
|
|
67
67
|
[tool.poetry.dependencies]
|
|
68
68
|
python = "^3.10"
|
|
69
69
|
anytree = "~=2.13"
|
|
70
|
-
pypdf = "^6.
|
|
71
|
-
PyMuPDF = {version = "^1.26.
|
|
70
|
+
pypdf = "^6.1.3"
|
|
71
|
+
PyMuPDF = {version = "^1.26.5", optional = true}
|
|
72
72
|
pytesseract = {version = "^0.3.13", optional = true}
|
|
73
73
|
yaralyzer = "^1.0.9"
|
|
74
74
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -173,8 +173,8 @@ class PdfFile:
|
|
|
173
173
|
except EmptyFileError:
|
|
174
174
|
log.warning("Skipping empty file!")
|
|
175
175
|
except PdfStreamError as e:
|
|
176
|
-
print_error(f"Error parsing PDF file '{self.file_path}': {e}")
|
|
177
176
|
stderr_console.print_exception()
|
|
177
|
+
print_error(f"Error parsing PDF file '{self.file_path}': {e}")
|
|
178
178
|
|
|
179
179
|
return "\n\n".join(extracted_pages).strip()
|
|
180
180
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/detection/constants/javascript_reserved_keywords.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|