pdfalyzer 1.17.6__tar.gz → 1.17.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/CHANGELOG.md +9 -0
  2. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/PKG-INFO +22 -26
  3. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/README.md +19 -23
  4. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/helpers/image_helper.py +3 -3
  5. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/helpers/rich_text_helper.py +46 -41
  6. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/pdfalyzer_presenter.py +12 -1
  7. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/pdfalyzer.py +14 -3
  8. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pyproject.toml +3 -3
  9. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/.pdfalyzer.example +0 -0
  10. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/LICENSE +0 -0
  11. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/__init__.py +0 -0
  12. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/__main__.py +0 -0
  13. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/binary/binary_scanner.py +0 -0
  14. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/config.py +0 -0
  15. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/decorators/document_model_printer.py +0 -0
  16. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/decorators/indeterminate_node.py +0 -0
  17. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/decorators/pdf_file.py +1 -1
  18. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/decorators/pdf_object_properties.py +0 -0
  19. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/decorators/pdf_tree_node.py +0 -0
  20. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/decorators/pdf_tree_verifier.py +0 -0
  21. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/detection/constants/binary_regexes.py +0 -0
  22. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/detection/constants/javascript_reserved_keywords.py +0 -0
  23. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/detection/javascript_hunter.py +0 -0
  24. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/detection/yaralyzer_helper.py +0 -0
  25. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/font_info.py +0 -0
  26. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/helpers/dict_helper.py +0 -0
  27. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/helpers/filesystem_helper.py +0 -0
  28. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/helpers/number_helper.py +0 -0
  29. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/helpers/pdf_object_helper.py +0 -0
  30. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/helpers/string_helper.py +0 -0
  31. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/character_mapping.py +0 -0
  32. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/layout.py +0 -0
  33. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/styles/node_colors.py +0 -0
  34. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/styles/rich_theme.py +0 -0
  35. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/tables/decoding_stats_table.py +0 -0
  36. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/tables/font_summary_table.py +0 -0
  37. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/tables/pdf_node_rich_table.py +0 -0
  38. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/output/tables/stream_objects_table.py +0 -0
  39. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/pdf_object_relationship.py +0 -0
  40. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/util/adobe_strings.py +0 -0
  41. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/util/argument_parser.py +0 -0
  42. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/util/cli_tools_argument_parser.py +0 -0
  43. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/util/debugging.py +0 -0
  44. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/util/exceptions.py +0 -0
  45. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/util/page_range.py +0 -0
  46. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/util/pdf_parser_manager.py +0 -0
  47. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/PDF.yara +0 -0
  48. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/PDF_binary_stream.yara +0 -0
  49. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/__init.py__ +0 -0
  50. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/didier_stevens.yara +0 -0
  51. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/lprat.static_file_analysis.yara +0 -0
  52. {pdfalyzer-1.17.6 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/pdf_malware.yara +0 -0
@@ -1,5 +1,14 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ ### 1.17.9
4
+ * Broaden exception handling in `FontInfo` extraction
5
+
6
+ ### 1.17.8
7
+ * Handle `AttributeError` in `FontInfo` extraction
8
+
9
+ ### 1.17.7
10
+ * Bump `pypdf` to 6.1.3 (fixes [#31](https://github.com/michelcrypt4d4mus/pdfalyzer/issues/31)), `PyMuPDF` to 1.26.5
11
+
3
12
  ### 1.17.6
4
13
  * Better handling for errors resulting from bugs in PyPDF
5
14
  * Properly close file handle when pdfalyzing is complete
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pdfalyzer
3
- Version: 1.17.6
3
+ Version: 1.17.9
4
4
  Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
5
5
  Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
6
6
  License: GPL-3.0-or-later
@@ -22,9 +22,9 @@ Classifier: Topic :: Artistic Software
22
22
  Classifier: Topic :: Scientific/Engineering :: Visualization
23
23
  Classifier: Topic :: Security
24
24
  Provides-Extra: extract
25
- Requires-Dist: PyMuPDF (>=1.26.4,<2.0.0) ; extra == "extract"
25
+ Requires-Dist: PyMuPDF (>=1.26.5,<2.0.0) ; extra == "extract"
26
26
  Requires-Dist: anytree (>=2.13,<3.0)
27
- Requires-Dist: pypdf (>=6.0.0,<7.0.0)
27
+ Requires-Dist: pypdf (>=6.1.3,<7.0.0)
28
28
  Requires-Dist: pytesseract (>=0.3.13,<0.4.0) ; extra == "extract"
29
29
  Requires-Dist: yaralyzer (>=1.0.9,<2.0.0)
30
30
  Project-URL: Changelog, https://github.com/michelcrypt4d4mus/pdfalyzer/blob/master/CHANGELOG.md
@@ -67,9 +67,8 @@ If you're looking for one of these things this may be the tool for you.
67
67
  ### What It Don't Do
68
68
  This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.
69
69
 
70
- If you suspect you are dealing with a malcious PDF you can safely run `pdfalyze` on it; embedded javascript etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
70
+ If you suspect you are dealing with a malicious PDF you can safely run `pdfalyze` on it. Embedded javascript and `/OpenAction` nodes etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
71
71
 
72
- -------------
73
72
 
74
73
  # Installation
75
74
  #### All Platforms
@@ -99,7 +98,6 @@ brew install pdfalyzer
99
98
  sudo apt-get install build-essential libssl-dev libffi-dev rustc
100
99
  ```
101
100
 
102
- -------------
103
101
 
104
102
  # Usage
105
103
 
@@ -115,7 +113,7 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
115
113
 
116
114
  The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
117
115
 
118
- ### Setting Command Line Options Permanently With A `.pdfalyzer` File
116
+ #### Setting Command Line Options Permanently With A `.pdfalyzer` File
119
117
  When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
120
118
 
121
119
  1. the current directory
@@ -123,12 +121,9 @@ When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfa
123
121
 
124
122
  If it finds a `.pdfalyzer` file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
125
123
 
126
- ### Environment Variables
124
+ #### Environment Variables
127
125
  Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
128
126
 
129
- ### Colors And Themes
130
- Run `pdfalyzer_show_color_theme` to see the color theme employed.
131
-
132
127
  ### Guarantees
133
128
  Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
134
129
 
@@ -136,7 +131,22 @@ Warnings will be printed if any PDF object ID between 1 and the `/Size` reported
136
131
  [BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
137
132
 
138
133
 
139
- ## Use As A Code Library
134
+ ## Included Command Line Tools
135
+ The Pdfalyzer comes with a few command line tools for doing stuff with PDFs:
136
+
137
+ * `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
138
+ * `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
139
+ * `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
140
+ * `pdfalyzer_show_color_theme` - Run to see the color theme employed in Pdfalyzer's output.
141
+
142
+ Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
143
+
144
+ ```bash
145
+ pipx install pdfalyzer[extract]
146
+ ```
147
+
148
+
149
+ ## As A Python Library
140
150
  For info about setting up a dev environment see [Contributing](#contributing) below.
141
151
 
142
152
  At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class. Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
@@ -247,20 +257,6 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
247
257
 
248
258
 
249
259
  # PDF Resources
250
- ## Included PDF Tools
251
- The Pdfalyzer comes with a few command line tools:
252
-
253
- * `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
254
- * `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
255
- * `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
256
-
257
- Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
258
-
259
- ```bash
260
- pipx install pdfalyzer[extract]
261
- ```
262
-
263
-
264
260
  ## 3rd Party PDF Tools
265
261
  ### Installing Didier Stevens's PDF Analysis Tools
266
262
  Stevens's tools provide comprehensive info about the contents of a PDF, are guaranteed not to trigger the rendering of any malicious content (especially `pdfid.py`), and have been battle tested for well over a decade. It would probably be a good idea to analyze your PDF with his tools before you start working with this one.
@@ -33,9 +33,8 @@ If you're looking for one of these things this may be the tool for you.
33
33
  ### What It Don't Do
34
34
  This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.
35
35
 
36
- If you suspect you are dealing with a malcious PDF you can safely run `pdfalyze` on it; embedded javascript etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
36
+ If you suspect you are dealing with a malicious PDF you can safely run `pdfalyze` on it. Embedded javascript and `/OpenAction` nodes etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
37
37
 
38
- -------------
39
38
 
40
39
  # Installation
41
40
  #### All Platforms
@@ -65,7 +64,6 @@ brew install pdfalyzer
65
64
  sudo apt-get install build-essential libssl-dev libffi-dev rustc
66
65
  ```
67
66
 
68
- -------------
69
67
 
70
68
  # Usage
71
69
 
@@ -81,7 +79,7 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
81
79
 
82
80
  The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
83
81
 
84
- ### Setting Command Line Options Permanently With A `.pdfalyzer` File
82
+ #### Setting Command Line Options Permanently With A `.pdfalyzer` File
85
83
  When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
86
84
 
87
85
  1. the current directory
@@ -89,12 +87,9 @@ When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfa
89
87
 
90
88
  If it finds a `.pdfalyzer` file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
91
89
 
92
- ### Environment Variables
90
+ #### Environment Variables
93
91
  Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
94
92
 
95
- ### Colors And Themes
96
- Run `pdfalyzer_show_color_theme` to see the color theme employed.
97
-
98
93
  ### Guarantees
99
94
  Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
100
95
 
@@ -102,7 +97,22 @@ Warnings will be printed if any PDF object ID between 1 and the `/Size` reported
102
97
  [BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
103
98
 
104
99
 
105
- ## Use As A Code Library
100
+ ## Included Command Line Tools
101
+ The Pdfalyzer comes with a few command line tools for doing stuff with PDFs:
102
+
103
+ * `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
104
+ * `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
105
+ * `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
106
+ * `pdfalyzer_show_color_theme` - Run to see the color theme employed in Pdfalyzer's output.
107
+
108
+ Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
109
+
110
+ ```bash
111
+ pipx install pdfalyzer[extract]
112
+ ```
113
+
114
+
115
+ ## As A Python Library
106
116
  For info about setting up a dev environment see [Contributing](#contributing) below.
107
117
 
108
118
  At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class. Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
@@ -213,20 +223,6 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
213
223
 
214
224
 
215
225
  # PDF Resources
216
- ## Included PDF Tools
217
- The Pdfalyzer comes with a few command line tools:
218
-
219
- * `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
220
- * `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
221
- * `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
222
-
223
- Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
224
-
225
- ```bash
226
- pipx install pdfalyzer[extract]
227
- ```
228
-
229
-
230
226
  ## 3rd Party PDF Tools
231
227
  ### Installing Didier Stevens's PDF Analysis Tools
232
228
  Stevens's tools provide comprehensive info about the contents of a PDF, are guaranteed not to trigger the rendering of any malicious content (especially `pdfid.py`), and have been battle tested for well over a decade. It would probably be a good idea to analyze your PDF with his tools before you start working with this one.
@@ -2,7 +2,7 @@ from typing import Optional
2
2
 
3
3
  from yaralyzer.output.rich_console import console
4
4
 
5
- from pdfalyzer.helpers.rich_text_helper import warning_text
5
+ from pdfalyzer.helpers.rich_text_helper import print_warning
6
6
 
7
7
 
8
8
  def ocr_text(image: "Image.Image", image_name: str) -> Optional[str]: # noqa F821
@@ -15,10 +15,10 @@ def ocr_text(image: "Image.Image", image_name: str) -> Optional[str]: # noqa F8
15
15
  text = pytesseract.image_to_string(image)
16
16
  except pytesseract.pytesseract.TesseractError:
17
17
  console.print_exception()
18
- console.print(warning_text(f"Tesseract OCR failure '{image_name}'! No OCR text extracted..."))
18
+ print_warning(f"Tesseract OCR failure '{image_name}'! No OCR text extracted...")
19
19
  except OSError as e:
20
20
  if 'truncated' in str(e):
21
- console.print(warning_text(f"Truncated image file '{image_name}'!"))
21
+ print_warning(f"Truncated image file '{image_name}'!")
22
22
  else:
23
23
  console.print_exception()
24
24
  console.print(f"Error while extracting '{image_name}'!", style='bright_red')
@@ -21,26 +21,9 @@ pdfalyzer_console = Console(color_system='256')
21
21
  stderr_console = Console(color_system='256', file=stderr)
22
22
 
23
23
 
24
- def print_highlighted(msg: Union[str, Text], **kwargs) -> None:
25
- """Print 'msg' with Rich highlighting."""
26
- pdfalyzer_console.print(msg, highlight=True, **kwargs)
27
-
28
-
29
- def quoted_text(
30
- _string: str,
31
- style: str = '',
32
- quote_char_style: str = 'white',
33
- quote_char: str = "'"
34
- ) -> Text:
35
- """Wrap _string in 'quote_char'. Style 'quote_char' with 'quote_char_style'."""
36
- quote_char_txt = Text(quote_char, style=quote_char_style)
37
- txt = quote_char_txt + Text(_string, style=style) + quote_char_txt
38
- txt.justify = 'center'
39
- return txt
40
-
41
-
42
- def indented_bullet(msg: Union[str, Text], style: Optional[str] = None) -> Text:
43
- return Text(' ') + bullet_text(msg, style)
24
+ def attention_getting_panel(text: Text, title: str, style: str = 'white on red') -> Padding:
25
+ p = Panel(text, padding=(2), title=title, style=style)
26
+ return Padding(p, pad=(1, 10, 2, 10))
44
27
 
45
28
 
46
29
  def bullet_text(msg: Union[str, Text], style: Optional[str] = None) -> Text:
@@ -50,6 +33,23 @@ def bullet_text(msg: Union[str, Text], style: Optional[str] = None) -> Text:
50
33
  return Text(ARROW_BULLET).append(msg)
51
34
 
52
35
 
36
+ def comma_join_txt(text_objs: List[Text]) -> Text:
37
+ return Text(", ").join(text_objs)
38
+
39
+
40
+ def error_text(text: Union[str, Text]) -> Text:
41
+ msg = Text('').append(f"ERROR", style='bright_red').append(": ")
42
+
43
+ if isinstance(text, Text):
44
+ return msg + text
45
+ else:
46
+ return msg.append(text)
47
+
48
+
49
+ def indented_bullet(msg: Union[str, Text], style: Optional[str] = None) -> Text:
50
+ return Text(' ') + bullet_text(msg, style)
51
+
52
+
53
53
  def mild_warning(msg: str) -> None:
54
54
  console.print(indented_bullet(Text(msg, style='mild_warning')))
55
55
 
@@ -67,10 +67,6 @@ def node_label(idnum: int, label: str, pdf_object: PdfObject, underline: bool =
67
67
  return text
68
68
 
69
69
 
70
- def comma_join_txt(text_objs: List[Text]) -> Text:
71
- return Text(", ").join(text_objs)
72
-
73
-
74
70
  def number_and_pct(_number: int, total: int, digits: int = 1) -> Text:
75
71
  """Return e.g. '8 (80%)'."""
76
72
  return Text(str(_number), style='bright_white').append_text(pct_txt(_number, total, digits))
@@ -82,28 +78,37 @@ def pct_txt(_number: int, total: int, digits: int = 1) -> Text:
82
78
  return Text(f"({pct}%)", style='blue')
83
79
 
84
80
 
85
- def warning_text(text: Union[str, Text]) -> Text:
86
- msg = Text('').append(f"WARNING", style='bright_yellow').append(": ")
81
+ def print_error(text: Union[str, Text]) -> Text:
82
+ console.line()
83
+ console.print(error_text(text))
87
84
 
88
- if isinstance(text, Text):
89
- return msg + text
90
- else:
91
- return msg.append(text)
92
85
 
86
+ def print_highlighted(msg: Union[str, Text], **kwargs) -> None:
87
+ """Print 'msg' with Rich highlighting."""
88
+ pdfalyzer_console.print(msg, highlight=True, **kwargs)
93
89
 
94
- def error_text(text: Union[str, Text]) -> Text:
95
- msg = Text('').append(f"ERROR", style='bright_red').append(": ")
96
90
 
97
- if isinstance(text, Text):
98
- return msg + text
99
- else:
100
- return msg.append(text)
91
+ def print_warning(text: Union[str, Text]) -> None:
92
+ console.print(_warning_text(text))
101
93
 
102
94
 
103
- def attention_getting_panel(text: Text, title: str, style: str = 'white on red') -> Padding:
104
- p = Panel(text, padding=(2), title=title, style=style)
105
- return Padding(p, pad=(1, 10, 2, 10))
95
+ def quoted_text(
96
+ _string: str,
97
+ style: str = '',
98
+ quote_char_style: str = 'white',
99
+ quote_char: str = "'"
100
+ ) -> Text:
101
+ """Wrap _string in 'quote_char'. Style 'quote_char' with 'quote_char_style'."""
102
+ quote_char_txt = Text(quote_char, style=quote_char_style)
103
+ txt = quote_char_txt + Text(_string, style=style) + quote_char_txt
104
+ txt.justify = 'center'
105
+ return txt
106
106
 
107
107
 
108
- def print_error(text: Union[str, Text]) -> Text:
109
- console.print(error_text(text))
108
+ def _warning_text(text: Union[str, Text]) -> Text:
109
+ msg = Text('').append(f"WARNING", style='bright_yellow').append(": ")
110
+
111
+ if isinstance(text, Text):
112
+ return msg + text
113
+ else:
114
+ return msg.append(text)
@@ -20,6 +20,7 @@ from pdfalyzer.binary.binary_scanner import BinaryScanner
20
20
  from pdfalyzer.config import PdfalyzerConfig
21
21
  from pdfalyzer.decorators.pdf_tree_node import DECODE_FAILURE_LEN
22
22
  from pdfalyzer.detection.yaralyzer_helper import get_bytes_yaralyzer, get_file_yaralyzer
23
+ from pdfalyzer.helpers.rich_text_helper import print_error
23
24
  from pdfalyzer.helpers.string_helper import pp
24
25
  from pdfalyzer.output.layout import (print_fatal_error_panel, print_section_header, print_section_subheader,
25
26
  print_section_sub_subheader)
@@ -27,12 +28,19 @@ from pdfalyzer.output.tables.decoding_stats_table import build_decoding_stats_ta
27
28
  from pdfalyzer.output.tables.pdf_node_rich_table import generate_rich_tree, get_symlink_representation
28
29
  from pdfalyzer.output.tables.stream_objects_table import stream_objects_table
29
30
  from pdfalyzer.pdfalyzer import Pdfalyzer
30
- # from pdfalyzer.util.adobe_strings import *
31
31
 
32
32
  INTERNAL_YARA_ERROR_MSG = "Internal YARA error! YARA's error codes can be checked here: https://github.com/VirusTotal/yara/blob/master/libyara/include/yara/error.h" # noqa: E501
33
33
 
34
34
 
35
35
  class PdfalyzerPresenter:
36
+ """
37
+ Handles formatting of console text output for Pdfalyzer class.
38
+
39
+ Attributes:
40
+ pdfalyzer (Pdfalyzer): Pdfalyzer for a given PDF file
41
+ yaralyzer (Yaralyzer): Yaralyzer for a given PDF file
42
+ """
43
+
36
44
  def __init__(self, pdfalyzer: Pdfalyzer):
37
45
  self.pdfalyzer = pdfalyzer
38
46
  self.yaralyzer = get_file_yaralyzer(self.pdfalyzer.pdf_path)
@@ -83,6 +91,9 @@ class PdfalyzerPresenter:
83
91
  """Print informatin about all fonts that appear in this PDF."""
84
92
  print_section_header(f'{len(self.pdfalyzer.font_infos)} fonts found in {self.pdfalyzer.pdf_basename}')
85
93
 
94
+ if self.pdfalyzer.font_info_extraction_error:
95
+ print_error(f"Failed to extract font information (error: {self.pdfalyzer.font_info_extraction_error})")
96
+
86
97
  for font_info in [fi for fi in self.pdfalyzer.font_infos if font_idnum is None or font_idnum == fi.idnum]:
87
98
  font_info.print_summary()
88
99
 
@@ -19,6 +19,7 @@ from pdfalyzer.decorators.indeterminate_node import IndeterminateNode
19
19
  from pdfalyzer.decorators.pdf_tree_node import PdfTreeNode
20
20
  from pdfalyzer.decorators.pdf_tree_verifier import PdfTreeVerifier
21
21
  from pdfalyzer.font_info import FontInfo
22
+ from pdfalyzer.helpers.rich_text_helper import print_error
22
23
  from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
23
24
  from pdfalyzer.util.adobe_strings import *
24
25
  from pdfalyzer.util.exceptions import PdfWalkError
@@ -37,6 +38,7 @@ class Pdfalyzer:
37
38
 
38
39
  Attributes:
39
40
  font_infos (List[FontInfo]): Font summary objects
41
+ font_info_extraction_error (Optional[Exception]): Error encountered extracting FontInfo (if any)
40
42
  max_generation (int): Max revision number ("generation") encountered in this PDF.
41
43
  nodes_encountered (Dict[int, PdfTreeNode]): Nodes we've traversed already.
42
44
  pdf_basename (str): The base name of the PDF file (with extension).
@@ -70,6 +72,7 @@ class Pdfalyzer:
70
72
 
71
73
  # Initialize tracking variables
72
74
  self.font_infos: List[FontInfo] = [] # Font summary objects
75
+ self.font_info_extraction_error: Optional[Exception] = None
73
76
  self.max_generation = 0 # PDF revisions are "generations"; this is the max generation encountered
74
77
  self.nodes_encountered: Dict[int, PdfTreeNode] = {} # Nodes we've seen already
75
78
  self._indeterminate_ids = set() # See INDETERMINATE_REF_KEYS comment
@@ -220,14 +223,22 @@ class Pdfalyzer:
220
223
  def _extract_font_infos(self) -> None:
221
224
  """Extract information about fonts in the tree and place it in `self.font_infos`."""
222
225
  for node in self.node_iterator():
223
- if isinstance(node.obj, dict) and RESOURCES in node.obj:
224
- log.debug(f"Extracting fonts from node with '{RESOURCES}' key: {node}...")
225
- known_font_ids = [fi.idnum for fi in self.font_infos]
226
+ if not (isinstance(node.obj, dict) and RESOURCES in node.obj):
227
+ continue
228
+
229
+ log.debug(f"Extracting fonts from node with '{RESOURCES}' key: {node}...")
230
+ known_font_ids = [fi.idnum for fi in self.font_infos]
226
231
 
232
+ try:
227
233
  self.font_infos += [
228
234
  fi for fi in FontInfo.extract_font_infos(node.obj)
229
235
  if fi.idnum not in known_font_ids
230
236
  ]
237
+ except Exception as e:
238
+ self.font_info_extraction_error = e
239
+ console.line()
240
+ log.warning(f"Failed to extract font information from node: {node} (error: {e})")
241
+ console.line()
231
242
 
232
243
  def _build_or_find_node(self, relationship: IndirectObject, relationship_key: str) -> PdfTreeNode:
233
244
  """If node in self.nodes_encountered already then return it, otherwise build a node and store it."""
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "pdfalyzer"
3
- version = "1.17.6"
3
+ version = "1.17.9"
4
4
  description = "Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more."
5
5
  authors = ["Michel de Cryptadamus <michel@cryptadamus.com>"]
6
6
  license = "GPL-3.0-or-later"
@@ -67,8 +67,8 @@ packages = [
67
67
  [tool.poetry.dependencies]
68
68
  python = "^3.10"
69
69
  anytree = "~=2.13"
70
- pypdf = "^6.0.0"
71
- PyMuPDF = {version = "^1.26.4", optional = true}
70
+ pypdf = "^6.1.3"
71
+ PyMuPDF = {version = "^1.26.5", optional = true}
72
72
  pytesseract = {version = "^0.3.13", optional = true}
73
73
  yaralyzer = "^1.0.9"
74
74
 
File without changes
@@ -173,8 +173,8 @@ class PdfFile:
173
173
  except EmptyFileError:
174
174
  log.warning("Skipping empty file!")
175
175
  except PdfStreamError as e:
176
- print_error(f"Error parsing PDF file '{self.file_path}': {e}")
177
176
  stderr_console.print_exception()
177
+ print_error(f"Error parsing PDF file '{self.file_path}': {e}")
178
178
 
179
179
  return "\n\n".join(extracted_pages).strip()
180
180