pdfalyzer 1.16.13__tar.gz → 1.17.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pdfalyzer might be problematic. Click here for more details.

Files changed (51) hide show
  1. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/CHANGELOG.md +8 -0
  2. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/PKG-INFO +35 -11
  3. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/README.md +29 -7
  4. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/__init__.py +19 -5
  5. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/binary/binary_scanner.py +28 -12
  6. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/config.py +2 -1
  7. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/decorators/indeterminate_node.py +11 -11
  8. pdfalyzer-1.17.0/pdfalyzer/decorators/pdf_file.py +212 -0
  9. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/decorators/pdf_object_properties.py +3 -3
  10. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/decorators/pdf_tree_node.py +17 -11
  11. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/decorators/pdf_tree_verifier.py +2 -0
  12. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/detection/yaralyzer_helper.py +9 -10
  13. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/helpers/filesystem_helper.py +27 -3
  14. pdfalyzer-1.17.0/pdfalyzer/helpers/image_helper.py +31 -0
  15. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/helpers/rich_text_helper.py +51 -1
  16. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/helpers/string_helper.py +6 -1
  17. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/output/character_mapping.py +1 -1
  18. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/output/layout.py +13 -3
  19. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/output/styles/rich_theme.py +2 -1
  20. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/output/tables/decoding_stats_table.py +4 -4
  21. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/output/tables/font_summary_table.py +2 -2
  22. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/pdfalyzer.py +20 -13
  23. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/util/argument_parser.py +102 -7
  24. pdfalyzer-1.17.0/pdfalyzer/util/page_range.py +54 -0
  25. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pyproject.toml +15 -4
  26. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/.pdfalyzer.example +0 -0
  27. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/LICENSE +0 -0
  28. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/__main__.py +0 -0
  29. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/decorators/document_model_printer.py +0 -0
  30. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/detection/constants/binary_regexes.py +0 -0
  31. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/detection/constants/javascript_reserved_keywords.py +0 -0
  32. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/detection/javascript_hunter.py +0 -0
  33. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/font_info.py +0 -0
  34. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/helpers/dict_helper.py +0 -0
  35. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/helpers/number_helper.py +0 -0
  36. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/helpers/pdf_object_helper.py +0 -0
  37. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/output/pdfalyzer_presenter.py +0 -0
  38. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/output/styles/node_colors.py +0 -0
  39. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/output/tables/pdf_node_rich_table.py +0 -0
  40. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/output/tables/stream_objects_table.py +0 -0
  41. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/pdf_object_relationship.py +0 -0
  42. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/util/adobe_strings.py +0 -0
  43. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/util/debugging.py +0 -0
  44. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/util/exceptions.py +0 -0
  45. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/util/pdf_parser_manager.py +0 -0
  46. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/yara_rules/PDF.yara +0 -0
  47. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/yara_rules/PDF_binary_stream.yara +0 -0
  48. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/yara_rules/__init.py__ +0 -0
  49. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/yara_rules/didier_stevens.yara +0 -0
  50. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/yara_rules/lprat.static_file_analysis.yara +0 -0
  51. {pdfalyzer-1.16.13 → pdfalyzer-1.17.0}/pdfalyzer/yara_rules/pdf_malware.yara +0 -0
@@ -1,5 +1,13 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ # 1.17.0
4
+ * Add `extract_pdf_pages` command line tool (imported from `clown_sort`)
5
+ * Add `extract_text_from_pdfs` command line tool (imported from `clown_sort`)
6
+
7
+ ### 1.16.14
8
+ * Bump `yaralyzer` to v1.0.9, handle `FileNotFoundError` which is now raised instead of `TypeError`
9
+ * Drop support for python 3.9
10
+
3
11
  ### 1.16.13
4
12
  * Bump `yaralyzer` to v1.0.7 and fix reference to yaralyzer's renamed `prefix_with_style()` method
5
13
 
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pdfalyzer
3
- Version: 1.16.13
3
+ Version: 1.17.0
4
4
  Summary: PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
5
5
  Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
6
6
  License: GPL-3.0-or-later
7
7
  Keywords: ascii art,binary,color,cybersecurity,DFIR,encoding,font,infosec,maldoc,malicious pdf,malware,malware analysis,pdf,pdfs,pdf analysis,pypdf,threat assessment,threat hunting,threat intelligence,threat research,threatintel,visualization,yara
8
8
  Author: Michel de Cryptadamus
9
9
  Author-email: michel@cryptadamus.com
10
- Requires-Python: >=3.9.2,<4.0
10
+ Requires-Python: >=3.10,<4.0
11
11
  Classifier: Development Status :: 5 - Production/Stable
12
12
  Classifier: Environment :: Console
13
13
  Classifier: Intended Audience :: Information Technology
@@ -18,24 +18,26 @@ Classifier: Programming Language :: Python :: 3.10
18
18
  Classifier: Programming Language :: Python :: 3.11
19
19
  Classifier: Programming Language :: Python :: 3.12
20
20
  Classifier: Programming Language :: Python :: 3.13
21
- Classifier: Programming Language :: Python :: 3.9
22
21
  Classifier: Topic :: Artistic Software
23
22
  Classifier: Topic :: Scientific/Engineering :: Visualization
24
23
  Classifier: Topic :: Security
24
+ Provides-Extra: extract
25
+ Requires-Dist: PyMuPDF (>=1.26.4,<2.0.0) ; extra == "extract"
25
26
  Requires-Dist: anytree (>=2.13,<3.0)
26
27
  Requires-Dist: pypdf (>=6.0.0,<7.0.0)
27
- Requires-Dist: yaralyzer (>=1.0.7,<2.0.0)
28
+ Requires-Dist: pytesseract (>=0.3.13,<0.4.0) ; extra == "extract"
29
+ Requires-Dist: yaralyzer (>=1.0.9,<2.0.0)
28
30
  Project-URL: Changelog, https://github.com/michelcrypt4d4mus/pdfalyzer/blob/master/CHANGELOG.md
29
31
  Project-URL: Documentation, https://github.com/michelcrypt4d4mus/pdfalyzer
30
32
  Project-URL: Repository, https://github.com/michelcrypt4d4mus/pdfalyzer
31
33
  Description-Content-Type: text/markdown
32
34
 
33
- <!-- ![Tests](https://img.shields.io/github/workflow/status/michelcrypt4d4mus/pdfalyzer/tests?label=tests) -->
34
- ![Python Version](https://img.shields.io/pypi/pyversions/pdfalyzer)
35
35
  [![GithubRelease](https://img.shields.io/github/v/release/michelcrypt4d4mus/pdfalyzer?sort=semver)](https://pypi.org/project/pdfalyzer/)
36
- [![GitHub last commit](https://img.shields.io/github/last-commit/michelcrypt4d4mus/pdfalyzer)](https://github.com/michelcrypt4d4mus/pdfalyzer)
37
36
  ![PyPiRelease](https://img.shields.io/pypi/v/pdfalyzer)
37
+ [![GitHub last commit](https://img.shields.io/github/last-commit/michelcrypt4d4mus/pdfalyzer)](https://github.com/michelcrypt4d4mus/pdfalyzer)
38
+ ![Python Version](https://img.shields.io/pypi/pyversions/pdfalyzer)
38
39
  ![Downloads](https://img.shields.io/pypi/dm/pdfalyzer)
40
+ [![Tests](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml/badge.svg)](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml)
39
41
 
40
42
 
41
43
  # THE PDFALYZER
@@ -114,7 +116,12 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
114
116
  The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
115
117
 
116
118
  ### Setting Command Line Options Permanently With A `.pdfalyzer` File
117
- When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` first in the current directory and then in the home directory. If it finds a file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
119
+ When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
120
+
121
+ 1. the current directory
122
+ 2. the user's home directory
123
+
124
+ If it finds a `.pdfalyzer` file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
118
125
 
119
126
  ### Environment Variables
120
127
  Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
@@ -125,10 +132,9 @@ Run `pdfalyzer_show_color_theme` to see the color theme employed.
125
132
  ### Guarantees
126
133
  Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
127
134
 
128
- ## Example Usage
135
+ ## Example Malicious PDF Investigation
129
136
  [BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
130
137
 
131
- -------------
132
138
 
133
139
  ## Use As A Code Library
134
140
  For info about setting up a dev environment see [Contributing](#contributing) below.
@@ -239,9 +245,27 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
239
245
 
240
246
  -------------
241
247
 
248
+
242
249
  # PDF Resources
243
250
  ## Included PDF Tools
244
- The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
251
+ The Pdfalyzer comes with a few command line tools:
252
+
253
+ #### `combine_pdfs`
254
+ Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
255
+
256
+ #### `extract_pdf_pages`
257
+ Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` to see the options.
258
+ ![](doc/extract_pages_from_pdf_help.png)
259
+
260
+ #### `extract_text_from_pdfs`
261
+ Extracts text from a PDF, including applying OCR to all embedded images. Requires that you install The Pdfalyzer's optional dependencies:
262
+
263
+ ```bash
264
+ pipx install pdfalyzer[extract]
265
+ ```
266
+
267
+ Run `extract_text_from_pdfs --help` to see the options.
268
+
245
269
 
246
270
  ## 3rd Party PDF Tools
247
271
  ### Installing Didier Stevens's PDF Analysis Tools
@@ -1,9 +1,9 @@
1
- <!-- ![Tests](https://img.shields.io/github/workflow/status/michelcrypt4d4mus/pdfalyzer/tests?label=tests) -->
2
- ![Python Version](https://img.shields.io/pypi/pyversions/pdfalyzer)
3
1
  [![GithubRelease](https://img.shields.io/github/v/release/michelcrypt4d4mus/pdfalyzer?sort=semver)](https://pypi.org/project/pdfalyzer/)
4
- [![GitHub last commit](https://img.shields.io/github/last-commit/michelcrypt4d4mus/pdfalyzer)](https://github.com/michelcrypt4d4mus/pdfalyzer)
5
2
  ![PyPiRelease](https://img.shields.io/pypi/v/pdfalyzer)
3
+ [![GitHub last commit](https://img.shields.io/github/last-commit/michelcrypt4d4mus/pdfalyzer)](https://github.com/michelcrypt4d4mus/pdfalyzer)
4
+ ![Python Version](https://img.shields.io/pypi/pyversions/pdfalyzer)
6
5
  ![Downloads](https://img.shields.io/pypi/dm/pdfalyzer)
6
+ [![Tests](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml/badge.svg)](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml)
7
7
 
8
8
 
9
9
  # THE PDFALYZER
@@ -82,7 +82,12 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
82
82
  The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
83
83
 
84
84
  ### Setting Command Line Options Permanently With A `.pdfalyzer` File
85
- When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` first in the current directory and then in the home directory. If it finds a file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
85
+ When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
86
+
87
+ 1. the current directory
88
+ 2. the user's home directory
89
+
90
+ If it finds a `.pdfalyzer` file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
86
91
 
87
92
  ### Environment Variables
88
93
  Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
@@ -93,10 +98,9 @@ Run `pdfalyzer_show_color_theme` to see the color theme employed.
93
98
  ### Guarantees
94
99
  Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
95
100
 
96
- ## Example Usage
101
+ ## Example Malicious PDF Investigation
97
102
  [BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
98
103
 
99
- -------------
100
104
 
101
105
  ## Use As A Code Library
102
106
  For info about setting up a dev environment see [Contributing](#contributing) below.
@@ -207,9 +211,27 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
207
211
 
208
212
  -------------
209
213
 
214
+
210
215
  # PDF Resources
211
216
  ## Included PDF Tools
212
- The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
217
+ The Pdfalyzer comes with a few command line tools:
218
+
219
+ #### `combine_pdfs`
220
+ Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
221
+
222
+ #### `extract_pdf_pages`
223
+ Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` to see the options.
224
+ ![](doc/extract_pages_from_pdf_help.png)
225
+
226
+ #### `extract_text_from_pdfs`
227
+ Extracts text from a PDF, including applying OCR to all embedded images. Requires that you install The Pdfalyzer's optional dependencies:
228
+
229
+ ```bash
230
+ pipx install pdfalyzer[extract]
231
+ ```
232
+
233
+ Run `extract_text_from_pdfs --help` to see the options.
234
+
213
235
 
214
236
  ## 3rd Party PDF Tools
215
237
  ### Installing Didier Stevens's PDF Analysis Tools
@@ -1,5 +1,6 @@
1
1
  import code
2
2
  import sys
3
+ from argparse import Namespace
3
4
  from os import environ, getcwd, path
4
5
 
5
6
  from dotenv import load_dotenv
@@ -24,13 +25,14 @@ from yaralyzer.output.file_export import invoke_rich_export
24
25
  from yaralyzer.output.rich_console import console
25
26
  from yaralyzer.util.logging import log_and_print
26
27
 
28
+ from pdfalyzer.decorators.pdf_file import PdfFile
27
29
  from pdfalyzer.helpers.filesystem_helper import file_size_in_mb, set_max_open_files
28
30
  from pdfalyzer.helpers.rich_text_helper import print_highlighted
29
31
  from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
30
32
  from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
31
33
  from pdfalyzer.pdfalyzer import Pdfalyzer
32
34
  from pdfalyzer.util.argument_parser import (MAX_QUALITY, ask_to_proceed, output_sections, parse_arguments,
33
- parse_combine_pdfs_args)
35
+ parse_combine_pdfs_args, parse_pdf_page_extraction_args, parse_text_extraction_args)
34
36
  from pdfalyzer.util.pdf_parser_manager import PdfParserManager
35
37
 
36
38
  # For the table shown by running pdfalyzer_show_color_theme
@@ -132,7 +134,19 @@ def combine_pdfs():
132
134
  print_highlighted(txt)
133
135
 
134
136
 
135
- # TODO: migrate this functionality from clown_sort
136
- # def extract_pages_from_pdf() -> None:
137
- # args = parse_pdf_page_extraction_args()
138
- # PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)
137
+ def extract_pdf_pages() -> None:
138
+ args = parse_pdf_page_extraction_args()
139
+ PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)
140
+
141
+
142
+ def extract_text_from_pdfs() -> None:
143
+ """
144
+ Extract text from a single file or from all files in a given directory. Can accept
145
+ multiple paths as arguments on the command line.
146
+ """
147
+ args: Namespace = parse_text_extraction_args()
148
+ console.line()
149
+
150
+ for file_path in args.files_to_process:
151
+ PdfFile(file_path).print_extracted_text(args.page_range, args.print_as_parsed)
152
+ console.line(2)
@@ -1,6 +1,5 @@
1
1
  """
2
- Class for handling binary data - scanning through it for various suspicious patterns as well as forcing
3
- various character encodings upon it to see what comes out.
2
+ `BinaryScanner` class.
4
3
  """
5
4
  from collections import defaultdict
6
5
  from typing import Iterator, Optional, Tuple
@@ -28,8 +27,18 @@ from pdfalyzer.util.adobe_strings import CONTENTS, CURRENTFILE_EEXEC, FONT_FILE_
28
27
 
29
28
 
30
29
  class BinaryScanner:
30
+ """
31
+ Class for handling binary data - scanning through it for various suspicious patterns as well as forcing
32
+ various character encodings upon it to see what comes out.
33
+ """
34
+
31
35
  def __init__(self, _bytes: bytes, owner: PdfTreeNode, label: Optional[Text] = None):
32
- """'owner' arg is an optional link back to the object containing this binary."""
36
+ """
37
+ Args:
38
+ _bytes (bytes): The binary data to be scanned.
39
+ owner (PdfTreeNode): The `PdfTreeNode` that contains this binary data.
40
+ label (Optional[Text]): A rich `Text` label for the binary data (e.g. the PDF object's address).
41
+ """
33
42
  self.bytes = _bytes
34
43
  self.label = label
35
44
  self.owner = owner
@@ -42,7 +51,7 @@ class BinaryScanner:
42
51
  self.regex_extraction_stats = defaultdict(lambda: RegexMatchMetrics())
43
52
 
44
53
  def check_for_dangerous_instructions(self) -> None:
45
- """Scan for all the strings in DANGEROUS_INSTRUCTIONS list and decode bytes around them."""
54
+ """Scan for all the strings in `DANGEROUS_INSTRUCTIONS` list and decode bytes around them."""
46
55
  subheader = "Scanning Binary For Anything That Could Be Described As 'sus'..."
47
56
  print_section_sub_subheader(subheader, style=f"bright_red")
48
57
 
@@ -71,8 +80,8 @@ class BinaryScanner:
71
80
 
72
81
  def force_decode_quoted_bytes(self) -> None:
73
82
  """
74
- Find all strings matching QUOTE_PATTERNS (AKA between quote chars) and decode them with various encodings.
75
- The --quote-type arg will limit this decode to just one kind of quote.
83
+ Find all strings matching `QUOTE_PATTERNS` (AKA between quote chars) and decode them with various
84
+ encodings. The `--quote-type` arg will limit this decode to just one kind of quote.
76
85
  """
77
86
  quote_selections = PdfalyzerConfig._args.extract_quoteds
78
87
 
@@ -100,11 +109,11 @@ class BinaryScanner:
100
109
  # YARA rules are written on the fly and then YARA does the matching.
101
110
  # -------------------------------------------------------------------------------
102
111
  def extract_guillemet_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]:
103
- """Iterate on all strings surrounded by Guillemet quotes, e.g. «string»"""
112
+ """Iterate on all strings surrounded by Guillemet quotes, e.g. «string»."""
104
113
  return self._quote_yaralyzer(QUOTE_PATTERNS[GUILLEMET], GUILLEMET).match_iterator()
105
114
 
106
115
  def extract_backtick_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]:
107
- """Returns an interator over all strings surrounded by backticks"""
116
+ """Returns an iterator over all strings surrounded by backticks."""
108
117
  return self._quote_yaralyzer(QUOTE_PATTERNS[BACKTICK], BACKTICK).match_iterator()
109
118
 
110
119
  def extract_front_slash_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]:
@@ -137,7 +146,14 @@ class BinaryScanner:
137
146
  console.line()
138
147
 
139
148
  def process_yara_matches(self, yaralyzer: Yaralyzer, pattern: str, force: bool = False) -> None:
140
- """Decide whether to attempt to decode the matched bytes, track stats. force param ignores min/max length."""
149
+ """
150
+ Decide whether to attempt to decode the matched bytes and track stats.
151
+
152
+ Args:
153
+ yaralyzer (Yaralyzer): The `Yaralyzer` instance to use for finding matches.
154
+ pattern (str): The pattern being searched for (used for stats tracking).
155
+ force (bool): If `True`, decode all matches even if they are very short or very long.
156
+ """
141
157
  for bytes_match, decoder in yaralyzer.match_iterator():
142
158
  log.debug(f"Trackings match stats for {pattern}, bytes_match: {bytes_match}, is_decodable: {bytes_match.is_decodable()}") # noqa: E501
143
159
 
@@ -162,7 +178,7 @@ class BinaryScanner:
162
178
  return self.bytes.split(CURRENTFILE_EEXEC)[1] if CURRENTFILE_EEXEC in self.bytes else self.bytes
163
179
 
164
180
  def _quote_yaralyzer(self, quote_pattern: str, quote_type: str):
165
- """Helper method to build a Yaralyzer for a quote_pattern"""
181
+ """Helper method to build a Yaralyzer for a `quote_pattern`."""
166
182
  label = f"{quote_type}_Quoted"
167
183
 
168
184
  if quote_type == GUILLEMET:
@@ -177,7 +193,7 @@ class BinaryScanner:
177
193
  rules_label: Optional[str] = None,
178
194
  pattern_label: Optional[str] = None
179
195
  ) -> Yaralyzer:
180
- """Build a yaralyzer to scan self.bytes"""
196
+ """Build a `yaralyzer` to scan `self.bytes`."""
181
197
  return Yaralyzer.for_patterns(
182
198
  patterns=[escape_yara_pattern(pattern)],
183
199
  patterns_type=pattern_type,
@@ -198,5 +214,5 @@ class BinaryScanner:
198
214
  self.suppression_notice_queue = []
199
215
 
200
216
  def _eexec_idx(self) -> int:
201
- """Returns the location of CURRENTFILES_EEXEC within the binary stream data (or 0 if it's not there)."""
217
+ """Returns the location of `CURRENTFILE_EEXEC` within the binary stream data (or 0 if it's not there)."""
202
218
  return self.bytes.find(CURRENTFILE_EEXEC) if CURRENTFILE_EEXEC in self.bytes else 0
@@ -9,9 +9,10 @@ from os import environ, pardir, path
9
9
  from yaralyzer.config import YaralyzerConfig, is_env_var_set_and_not_false, is_invoked_by_pytest
10
10
 
11
11
  PDFALYZE = 'pdfalyze'
12
+ PDFALYZER = f"{PDFALYZE}r"
12
13
  ALL_STREAMS = -1
13
14
  PYTEST_FLAG = 'INVOKED_BY_PYTEST'
14
- PROJECT_ROOT = path.join(str(importlib.resources.files('pdfalyzer')), pardir)
15
+ PROJECT_ROOT = path.join(str(importlib.resources.files(PDFALYZER)), pardir)
15
16
 
16
17
  # 3rd party pdf-parser.py
17
18
  PDF_PARSER_EXECUTABLE_ENV_VAR = 'PDFALYZER_PDF_PARSER_PY_PATH'
@@ -1,11 +1,3 @@
1
- """
2
- Some nodes cannot be placed until we have walked the rest of the tree. For instance
3
- if we encounter a /Page that relationships /Resources we need to know if there's a
4
- /Pages parent of the /Page before committing to a tree structure.
5
-
6
- This class handles choosing among the candidates for a given PDF object's parent node
7
- (AKA "figuring out where to place the node in the PDF object tree").
8
- """
9
1
  from typing import Callable, List, Optional
10
2
 
11
3
  from rich.markup import escape
@@ -18,6 +10,14 @@ from pdfalyzer.util.adobe_strings import *
18
10
 
19
11
 
20
12
  class IndeterminateNode:
13
+ """
14
+ Class to handle choosing among the candidates for a given PDF object's parent node.
15
+
16
+ Some nodes cannot be placed until we have walked the rest of the tree. For instance
17
+ if we encounter a /Page that references /Resources we need to know if there's a
18
+ /Pages parent of the /Page before committing to a tree structure.
19
+ """
20
+
21
21
  def __init__(self, node: PdfTreeNode) -> None:
22
22
  self.node = node
23
23
 
@@ -56,7 +56,7 @@ class IndeterminateNode:
56
56
 
57
57
  self.node.set_parent(parent)
58
58
 
59
- def find_node_with_most_descendants(self, list_of_nodes: List[PdfTreeNode] = None) -> PdfTreeNode:
59
+ def find_node_with_most_descendants(self, list_of_nodes: Optional[List[PdfTreeNode]] = None) -> PdfTreeNode:
60
60
  """Find node with a reference to this one that has the most descendants"""
61
61
  list_of_nodes = list_of_nodes or [r.from_node for r in self.node.non_tree_relationships]
62
62
  max_descendants = max([node.descendants_count() for node in list_of_nodes])
@@ -64,7 +64,7 @@ class IndeterminateNode:
64
64
 
65
65
  def _has_only_similar_relationships(self) -> bool:
66
66
  """
67
- Returns True if all the nodes w/references to this one have the same type or if all the
67
+ Returns `True` if all the nodes w/references to this one have the same type or if all the
68
68
  reference_keys that point to this node are the same.
69
69
  """
70
70
  unique_refferer_labels = self.node.unique_labels_of_referring_nodes()
@@ -125,6 +125,6 @@ class IndeterminateNode:
125
125
 
126
126
 
127
127
  def find_node_with_lowest_id(list_of_nodes: List[PdfTreeNode]) -> PdfTreeNode:
128
- """Find node in list_of_nodes_with_lowest ID."""
128
+ """Return node in `list_of_nodes` with lowest ID."""
129
129
  lowest_idnum = min([n.idnum for n in list_of_nodes])
130
130
  return next(n for n in list_of_nodes if n.idnum == lowest_idnum)
@@ -0,0 +1,212 @@
1
+ import io
2
+ from logging import Logger
3
+ from os import path
4
+ from pathlib import Path
5
+ from typing import List, Optional, Union
6
+
7
+ from pypdf import PdfReader, PdfWriter
8
+ from pypdf.errors import DependencyError, EmptyFileError, PdfStreamError
9
+ from rich.console import Console
10
+ from rich.markup import escape
11
+ from rich.panel import Panel
12
+ from rich.text import Text
13
+ from yaralyzer.output.rich_console import console
14
+ from yaralyzer.util.logging import log as yaralyzer_log
15
+
16
+ from pdfalyzer.helpers.filesystem_helper import create_dir_if_it_does_not_exist, insert_suffix_before_extension
17
+ from pdfalyzer.helpers.image_helper import ocr_text
18
+ from pdfalyzer.helpers.rich_text_helper import (attention_getting_panel, error_text, mild_warning,
19
+ print_error, stderr_console)
20
+ from pdfalyzer.helpers.string_helper import exception_str
21
+ from pdfalyzer.util.page_range import PageRange
22
+
23
# Directory (resolved against the working directory at import time) that receives pages
# ripped out of PDFs that triggered extraction errors.
DEFAULT_PDF_ERRORS_DIR = Path.cwd() / 'pdf_errors'
# Files smaller than this many bytes (20 MiB) do not get progress messages on STDERR.
MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR = 20 * 1024 * 1024
25
+
26
+
27
class PdfFile:
    """
    Wrapper for a PDF file path that provides useful methods and properties.

    Attributes:
        file_path (Path): The path to the PDF file.
        dirname (Path): The directory containing the PDF file.
        basename (str): The base name of the PDF file (with extension).
        basename_without_ext (str): The base name of the PDF file (without extension).
        extname (str): The file extension of the PDF file.
        file_size (int): Size of the file in bytes, read once when the object is created.
    """

    # Upper bound on the length of the error-derived part of an error dump's file name.
    _MAX_ERROR_SUFFIX_LENGTH = 100

    def __init__(self, file_path: Union[str, Path]) -> None:
        """
        Args:
            file_path (Union[str, Path]): Path to the PDF file.

        Raises:
            FileNotFoundError: If nothing exists at `file_path`.
        """
        self.file_path: Path = Path(file_path)

        if not self.file_path.exists():
            raise FileNotFoundError(f"File '{file_path}' does not exist.")

        self.dirname = self.file_path.parent
        self.basename: str = path.basename(file_path)
        self.basename_without_ext: str = str(Path(self.basename).with_suffix(''))
        self.extname: str = self.file_path.suffix
        self.file_size = self.file_path.stat().st_size
        # Page numbers whose embedded images failed to parse; reset by every extract_text() call.
        self._page_numbers_of_errors: List[int] = []

    def extract_page_range(
        self,
        page_range: PageRange,
        destination_dir: Optional[Path] = None,
        extra_file_suffix: Optional[str] = None
    ) -> Path:
        """
        Extract a range of pages to a new PDF file.

        Args:
            page_range (PageRange): Range of pages to extract.
            destination_dir (Optional[Path]): Directory to save the new PDF file. Defaults to the same
                directory as the source PDF. Created if it does not exist.
            extra_file_suffix (Optional[str]): An optional suffix appended to the new PDF's filename
                after the page range suffix. If omitted only the page range suffix is used.

        Returns:
            Path: The path to the newly created PDF file containing the extracted pages.
        """
        # Coerce so that callers handing in a plain string path still get Path semantics below.
        destination_dir = Path(destination_dir) if destination_dir else self.dirname
        create_dir_if_it_does_not_exist(destination_dir)
        page_range_suffix = page_range.file_suffix()

        if extra_file_suffix is None:
            file_suffix = page_range_suffix
        else:
            file_suffix = f"{page_range_suffix}__{extra_file_suffix}"

        extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
        extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)

        # Paths are escaped so that square brackets in file names are not parsed as Rich markup.
        console.print(
            f"Extracting {page_range_suffix} from '{escape(str(self.file_path))}' "
            f"to '{escape(str(extracted_pages_pdf_path))}'..."
        )

        pdf_writer = PdfWriter()

        with open(self.file_path, 'rb') as source_pdf:
            pdf_writer.append(fileobj=source_pdf, pages=page_range.to_tuple())

        with open(extracted_pages_pdf_path, 'wb') as extracted_pages_pdf:
            pdf_writer.write(extracted_pages_pdf)

        console.print(f"Extracted pages to new PDF: '{escape(str(extracted_pages_pdf_path))}'.")
        return extracted_pages_pdf_path

    def extract_text(
        self,
        page_range: Optional[PageRange] = None,
        logger: Optional[Logger] = None,
        print_as_parsed: bool = False
    ) -> Optional[str]:
        """
        Use PyPDF to extract text page by page and use Tesseract to OCR any embedded images.

        Args:
            page_range (Optional[PageRange]): If provided, only extract text from pages in this range.
                Page numbers are 1-indexed. If not provided, extract text from all pages.
            logger (Optional[Logger]): If provided, log progress to this logger. Otherwise use default logger.
            print_as_parsed (bool): If True, print each page's text to STDOUT as it is parsed.

        Returns:
            Optional[str]: The text of all successfully parsed pages joined by blank lines. Errors
                are logged rather than raised, so this is an empty string (not `None`) when
                nothing could be extracted.
        """
        from PIL import Image  # Imported here to avoid hard dependency if not using this method
        log = logger or yaralyzer_log
        log.debug(f"Extracting text from '{self.file_path}'...")
        self._page_numbers_of_errors = []
        extracted_pages = []

        try:
            pdf_reader = PdfReader(self.file_path)
            page_count = len(pdf_reader.pages)
            log.debug(f"PDF Page count: {page_count}")

            for page_number, page in enumerate(pdf_reader.pages, start=1):
                if page_range and not page_range.in_range(page_number):
                    self._log_to_stderr(f"Skipping page {page_number}...")
                    continue

                self._log_to_stderr(f"Parsing page {page_number}...")
                # Each page is rendered into its own in-memory console so it can be captured as a string.
                page_buffer = Console(file=io.StringIO())
                page_buffer.print(Panel(f"PAGE {page_number}", padding=(0, 15), expand=False))
                page_buffer.print(escape(page.extract_text().strip()))
                # Pre-set so the error message below is valid even if iterating page.images itself fails.
                image_number = 1

                # Extracting images is a bit fraught (lots of PIL and pypdf exceptions have come from here)
                try:
                    for image_number, image in enumerate(page.images, start=1):
                        image_name = f"Page {page_number}, Image {image_number}"
                        self._log_to_stderr(f"   Processing {image_name}...", "dim")
                        page_buffer.print(Panel(image_name, expand=False))
                        image_obj = Image.open(io.BytesIO(image.data))
                        image_text = ocr_text(image_obj, f"{self.file_path} ({image_name})")
                        # Escape OCR output just like the page text so brackets aren't read as markup.
                        page_buffer.print(escape((image_text or '').strip()))
                except (OSError, NotImplementedError, TypeError, ValueError) as e:
                    error_str = exception_str(e)
                    msg = f"{error_str} while parsing embedded image {image_number} on page {page_number}..."
                    mild_warning(msg)

                    # Dump an error PDF and encourage user to report to pypdf team.
                    if 'JBIG2Decode' not in str(e):
                        stderr_console.print_exception()

                        if page_number not in self._page_numbers_of_errors:
                            self._handle_extraction_error(page_number, error_str)
                            self._page_numbers_of_errors.append(page_number)

                page_text = page_buffer.file.getvalue()
                extracted_pages.append(page_text)
                log.debug(page_text)

                if print_as_parsed:
                    print(page_text)
        except DependencyError:
            log.error("Pdfalyzer is missing an optional dependency required to extract text. Try 'pip install pdfalyzer[extract]'")
        except EmptyFileError:
            log.warning("Skipping empty file!")
        except PdfStreamError as e:
            print_error(f"Error parsing PDF file '{self.file_path}': {e}")
            stderr_console.print_exception()

        # Whatever was collected before an error is still returned.
        return "\n\n".join(extracted_pages).strip()

    def print_extracted_text(self, page_range: Optional[PageRange] = None, print_as_parsed: bool = False) -> None:
        """
        Fancy wrapper for printing the extracted text to the screen.

        Args:
            page_range (Optional[PageRange]): If provided, only print text from pages in this range.
            print_as_parsed (bool): If True each page is printed as soon as it is parsed (by
                `extract_text()`) instead of everything being printed at the end.
        """
        console.print(Panel(escape(str(self.file_path)), expand=False, style='bright_white reverse'))
        txt = self.extract_text(page_range=page_range, print_as_parsed=print_as_parsed)

        if not print_as_parsed:
            # The text has already been rendered to a plain string, so it must not be parsed as
            # markup a second time (PDF text containing square brackets would be mangled).
            console.print(txt, markup=False)

    def _handle_extraction_error(self, page_number: int, error_msg: str) -> None:
        """
        Rip the offending page to a new file and suggest that user report bug to PyPDF.

        Args:
            page_number (int): 1-indexed number of the page that triggered the error.
            error_msg (str): Description of the error; shown to the user and (in sanitized
                form) used as part of the extracted page's file name.
        """
        import re  # Local import: only needed on this rarely taken error path

        destination_dir = DEFAULT_PDF_ERRORS_DIR
        # The message becomes part of a file name, so replace path separators, colons, whitespace
        # and similar characters, and cap the length.
        safe_suffix = re.sub(r'[^\w.-]+', '_', error_msg).strip('_')[:self._MAX_ERROR_SUFFIX_LENGTH]

        try:
            extracted_file = self.extract_page_range(
                PageRange(str(page_number)),
                destination_dir,
                safe_suffix or None
            )
        except Exception as e:
            # Best effort only: failing to write the error dump must not abort text extraction.
            failure_msg = f"Failed to extract a page for submission to PyPDF team: {exception_str(e)}"
            stderr_console.print(error_text(failure_msg))
            extracted_file = None

        blink_txt = Text('', style='bright_white')
        blink_txt.append("An error (", style='blink color(154)').append(error_msg, style='color(11) blink')
        blink_txt.append(') ', style='blink color(154)')
        blink_txt.append("was encountered while processing a PDF file.\n\n", style='blink color(154)')

        txt = Text("The error was of a type such that it probably came from a bug in ", style='bright_white')
        txt.append('PyPDF', style='underline bright_green').append('. It was encountered processing the file ')
        txt.append(str(self.file_path), style='file').append('. You should see a stack trace above this box.\n\n')

        if extracted_file is None:
            # Don't tell the user to look for a file named "None".
            txt.append('The offending page could not be extracted to a separate file.\n\n', style='bright_white')
        else:
            txt.append('The offending page will be extracted to ', style='bright_white')
            txt.append(str(extracted_file), style='file').append('.\n\n')

        txt.append("Please visit 'https://github.com/py-pdf/pypdf/issues' to report a bug. ", style='bold')
        txt.append("Providing the devs with the extracted page and the stack trace help improve pypdf.")
        stderr_console.print(attention_getting_panel(blink_txt + txt, title='PyPDF Error'))

    def _log_to_stderr(self, msg: str, style: Optional[str] = None) -> None:
        """
        When parsing very large PDFs it can be useful to log progress and other messages to STDERR.

        Does nothing for files smaller than `MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR` bytes.

        Args:
            msg (str): Message to print.
            style (Optional[str]): Rich style to print the message with, if any.
        """
        if self.file_size < MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR:
            return

        stderr_console.print(msg, style=style or "")
@@ -15,7 +15,7 @@ from pdfalyzer.util.adobe_strings import *
15
15
 
16
16
 
17
17
  class PdfObjectProperties:
18
- """Simple class to extract critical features of a PdfObject."""
18
+ """Simple class to extract critical features of a `PdfObject`."""
19
19
 
20
20
  def __init__(
21
21
  self,
@@ -86,7 +86,7 @@ class PdfObjectProperties:
86
86
  obj: PdfObject,
87
87
  is_single_row_table: bool = False
88
88
  ) -> List[Union[Text, str]]:
89
- """PDF object property at reference_key becomes a formatted 3-tuple for use in Rich tables."""
89
+ """PDF object property at `reference_key` becomes a formatted 3-tuple for use in Rich tables."""
90
90
  with_resolved_refs = cls.resolve_references(reference_key, obj)
91
91
 
92
92
  return [
@@ -101,7 +101,7 @@ class PdfObjectProperties:
101
101
  # TODO: this doesn't recurse...
102
102
  @classmethod
103
103
  def _obj_to_rich_text(cls, obj: Any) -> Text:
104
- """Recurse through obj and build a Text object."""
104
+ """Recurse through `obj` and build a `Text` object."""
105
105
  if isinstance(obj, dict):
106
106
  key_value_pairs = [Text(f"{k}: ").append_text(cls._obj_to_rich_text(v)) for k, v in obj.items()]
107
107
  return Text('{').append_text(comma_join_txt(key_value_pairs)).append('}')