pdfalyzer 1.16.3__tar.gz → 1.17.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfalyzer-1.17.9/.pdfalyzer.example +66 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/CHANGELOG.md +74 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/PKG-INFO +57 -37
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/README.md +41 -25
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/__init__.py +37 -13
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/binary/binary_scanner.py +44 -24
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/config.py +6 -1
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/decorators/indeterminate_node.py +11 -11
- pdfalyzer-1.17.9/pdfalyzer/decorators/pdf_file.py +220 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/decorators/pdf_object_properties.py +16 -15
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/decorators/pdf_tree_node.py +25 -16
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/decorators/pdf_tree_verifier.py +9 -4
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/detection/constants/binary_regexes.py +7 -7
- pdfalyzer-1.17.9/pdfalyzer/detection/yaralyzer_helper.py +51 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/font_info.py +11 -12
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/helpers/filesystem_helper.py +32 -9
- pdfalyzer-1.17.9/pdfalyzer/helpers/image_helper.py +31 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/helpers/pdf_object_helper.py +8 -8
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/helpers/rich_text_helper.py +74 -21
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/helpers/string_helper.py +33 -30
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/character_mapping.py +4 -3
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/layout.py +14 -4
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/pdfalyzer_presenter.py +18 -5
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/styles/rich_theme.py +2 -1
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/tables/decoding_stats_table.py +11 -6
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/tables/font_summary_table.py +2 -2
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/tables/stream_objects_table.py +0 -1
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/pdf_object_relationship.py +12 -12
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/pdfalyzer.py +75 -32
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/util/adobe_strings.py +4 -5
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/util/argument_parser.py +23 -67
- pdfalyzer-1.17.9/pdfalyzer/util/cli_tools_argument_parser.py +164 -0
- pdfalyzer-1.17.9/pdfalyzer/util/page_range.py +51 -0
- pdfalyzer-1.17.9/pdfalyzer/yara_rules/PDF.yara +1859 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/PDF_binary_stream.yara +6 -8
- pdfalyzer-1.17.9/pdfalyzer/yara_rules/didier_stevens.yara +248 -0
- pdfalyzer-1.17.9/pdfalyzer/yara_rules/pdf_malware.yara +3072 -0
- pdfalyzer-1.17.9/pyproject.toml +120 -0
- pdfalyzer-1.16.3/pdfalyzer/detection/yaralyzer_helper.py +0 -51
- pdfalyzer-1.16.3/pdfalyzer/yara_rules/PDF.yara +0 -1075
- pdfalyzer-1.16.3/pyproject.toml +0 -77
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/LICENSE +0 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/__main__.py +0 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/decorators/document_model_printer.py +0 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/detection/constants/javascript_reserved_keywords.py +0 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/detection/javascript_hunter.py +0 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/helpers/dict_helper.py +0 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/helpers/number_helper.py +0 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/styles/node_colors.py +0 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/tables/pdf_node_rich_table.py +0 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/util/debugging.py +1 -1
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/util/exceptions.py +0 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/util/pdf_parser_manager.py +0 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/__init.py__ +0 -0
- {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/lprat.static_file_analysis.yara +0 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# If you place a file called '.pdfalyzer' in your home dir or the current dir, environment variables specified
|
|
2
|
+
# in that .pdfalyzer file will be added to the environment each time pdfalyzer is invoked. (See the `dotenv`
|
|
3
|
+
# package for more details.) This file contains environment variables you can place in .pdfalyzer to configure
|
|
4
|
+
# the application above and beyond providing command line options. Useful if you want to permanently
|
|
5
|
+
# configure options you tend to reuse (e.g. '--maximize-width') so you can stop remembering to type them.
|
|
6
|
+
#
|
|
7
|
+
# Almost all of the yaralyzer (yes, you read that right - The Pdfalyzer uses The Yaralyzer for all
|
|
8
|
+
# kinds of backend functionality) command line options can be configured in this file by capitalizing them and
|
|
9
|
+
# prefixing 'YARALYZER'. e.g. to configure the --maximize-width option for every invocation, you would set:
|
|
10
|
+
# YARALYZER_MAXIMIZE_WIDTH=True
|
|
11
|
+
#
|
|
12
|
+
# Note that many of these options are actually configuring the yaralyzer, which is a separate tool leveraged
|
|
13
|
+
# by the Pdfalyzer to actually do the work of finding patterns. More info can be found at
|
|
14
|
+
# https://github.com/michelcrypt4d4mus/yaralyzer
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Expand the width of the output to fit the display window (same as the --maximize-width option)
|
|
19
|
+
# YARALYZER_MAXIMIZE_WIDTH=True
|
|
20
|
+
|
|
21
|
+
# yara-python internal options passed through to yara.set_config() as the stack_size and max_match_data arguments
|
|
22
|
+
# YARALYZER_STACK_SIZE=10485760
|
|
23
|
+
# YARALYZER_MAX_MATCH_LENGTH=10737418240
|
|
24
|
+
|
|
25
|
+
# Suppress all PDF binary regex matching/scanning/etc
|
|
26
|
+
# YARALYZER_SUPPRESS_DECODES_TABLE=False
|
|
27
|
+
|
|
28
|
+
# Suppress the display of the table showing the encoding assessments given by `chardet.detect()`
|
|
29
|
+
# about a particular chunk of binary data. (The most important data in the chardet confidence table is
|
|
30
|
+
# redundant anyways. Only the low likelihood encodings are hidden from the user.)
|
|
31
|
+
# YARALYZER_SUPPRESS_CHARDET_TABLE=False
|
|
32
|
+
# Minimum confidence to display an encoding in the chardet results table
|
|
33
|
+
# YARALYZER_MIN_CHARDET_CONFIDENCE=2.0
|
|
34
|
+
|
|
35
|
+
# Configure how many bytes before and after any binary data should be included in scans and visualizations
|
|
36
|
+
# YARALYZER_SURROUNDING_BYTES=64
|
|
37
|
+
|
|
38
|
+
# Size thresholds (in bytes) under/over which pdfalyzer will NOT make attempts to decode a match.
|
|
39
|
+
# Longer byte sequences are for obvious reasons slower to decode by force.
|
|
40
|
+
# It may feel counterintuitive but larger chunks of random binary are also harder to examine and
|
|
41
|
+
# (in my experience) less likely to be meaningful. Consider it - two frontslash characters 20,000 lines apart
|
|
42
|
+
# are more likely to be random than those same frontslashes when placed nearer to each other and
|
|
43
|
+
# in the vicinity of a lot of computerized sigils of internet power like `.', `+bacd*?`,. and other regexes.*
|
|
44
|
+
# Keeping the max value number low will do more to affect the speed of the app than anything else you
|
|
45
|
+
# can easily configure.
|
|
46
|
+
#
|
|
47
|
+
# YARALYZER_MIN_DECODE_LENGTH=1
|
|
48
|
+
# YARALYZER_MAX_DECODE_LENGTH=256
|
|
49
|
+
|
|
50
|
+
# Directory to write application logs to. Must be an absolute path, not a relative one.
|
|
51
|
+
# These logs are not normally written to a file and the default log level means that the standard behavior
|
|
52
|
+
# is to more or less discard them. Be aware that if you configure this variable a few things will change:
|
|
53
|
+
#
|
|
54
|
+
# 1. Logs WILL NOT be written to STDOUT. They will stream ONLY to files in the configured directory.
|
|
55
|
+
# This is true even with the -D option.
|
|
56
|
+
# 2. The default log_level will be decreased from WARN (extremely spartan) to INFO (fairly verbose).
|
|
57
|
+
# The -D option, which sets the log level to DEBUG, will be respected whether or not
|
|
58
|
+
# YARALYZER_LOG_DIR is configured.
|
|
59
|
+
#
|
|
60
|
+
# YARALYZER_LOG_DIR=/path/to/pdfalyzer/log_dir/
|
|
61
|
+
|
|
62
|
+
# Log level
|
|
63
|
+
# YARALYZER_LOG_LEVEL='INFO'
|
|
64
|
+
|
|
65
|
+
# Path to directory containing Didier Stevens's pdf-parser.py. Only required for extracting binary streams to files.
|
|
66
|
+
# PDFALYZER_PDF_PARSER_PY_PATH=/path/to/pdfparserdotpy/
|
|
@@ -1,5 +1,79 @@
|
|
|
1
1
|
# NEXT RELEASE
|
|
2
2
|
|
|
3
|
+
### 1.17.9
|
|
4
|
+
* Broaden exception handling in `FontInfo` extraction
|
|
5
|
+
|
|
6
|
+
### 1.17.8
|
|
7
|
+
* Handle `AttributeError` in `FontInfo` extraction
|
|
8
|
+
|
|
9
|
+
### 1.17.7
|
|
10
|
+
* Bump `pypdf` to 6.1.3 (fixes [#31](https://github.com/michelcrypt4d4mus/pdfalyzer/issues/31)), `PyMuPDF` to 1.26.5
|
|
11
|
+
|
|
12
|
+
### 1.17.6
|
|
13
|
+
* Better handling for errors resulting from bugs in PyPDF
|
|
14
|
+
* Properly close file handle when pdfalyzing is complete
|
|
15
|
+
|
|
16
|
+
### 1.17.5
|
|
17
|
+
* Fix `PIL` lazy import
|
|
18
|
+
|
|
19
|
+
### 1.17.4
|
|
20
|
+
* Make `PIL` a lazy import so installs without `[extract]` extras don't fail
|
|
21
|
+
|
|
22
|
+
### 1.17.3
|
|
23
|
+
* Put back `--debug` arg for CLI tools
|
|
24
|
+
|
|
25
|
+
### 1.17.2
|
|
26
|
+
* Remove unused `--debug` args for CLI tools
|
|
27
|
+
* Rename `extract_text_from_pdfs` to `extract_pdf_text`
|
|
28
|
+
|
|
29
|
+
### 1.17.1
|
|
30
|
+
* Fix issue where `extract_pdf_pages` page ranges were indexed from 0 instead of 1
|
|
31
|
+
|
|
32
|
+
# 1.17.0
|
|
33
|
+
* Add `extract_pdf_pages` command line tool (imported from `clown_sort`)
|
|
34
|
+
* Add `extract_text_from_pdfs` command line tool (imported from `clown_sort`)
|
|
35
|
+
|
|
36
|
+
### 1.16.14
|
|
37
|
+
* Bump `yaralyzer` to v1.0.9, handle `FileNotFoundError` which is now raised instead of `TypeError`
|
|
38
|
+
* Drop support for python 3.9
|
|
39
|
+
|
|
40
|
+
### 1.16.13
|
|
41
|
+
* Bump `yaralyzer` to v1.0.7 and fix reference to yaralyzer's renamed `prefix_with_style()` method
|
|
42
|
+
|
|
43
|
+
### 1.16.12
|
|
44
|
+
* Bump `PyPDF` to v6.0.0
|
|
45
|
+
|
|
46
|
+
### 1.16.11
|
|
47
|
+
* Fix typo in `combine_pdfs` help
|
|
48
|
+
* Add some more PyPi classifiers
|
|
49
|
+
* Add a `.flake8` config and fix a bunch of style issues
|
|
50
|
+
|
|
51
|
+
### 1.16.10
|
|
52
|
+
* Add `Environment :: Console` and `Programming Language :: Python` to pypi classifiers
|
|
53
|
+
* Add `.pdfalyzer.example` to PyPi package
|
|
54
|
+
|
|
55
|
+
### 1.16.9
|
|
56
|
+
* Add `Development Status :: 5 - Production/Stable` to pypi classifiers
|
|
57
|
+
|
|
58
|
+
### 1.16.8
|
|
59
|
+
* Even more PDF related YARA rules
|
|
60
|
+
* Upgrade `anytree` to 2.13.0
|
|
61
|
+
* Upgrade `yaralyzer` to 1.0.4
|
|
62
|
+
|
|
63
|
+
### 1.16.7
|
|
64
|
+
* Lots of new PDF related YARA rules
|
|
65
|
+
* Upgrade `yaralyzer` to 1.0.3
|
|
66
|
+
* Upgrade `pypdf` to 5.9.0
|
|
67
|
+
|
|
68
|
+
### 1.16.6
|
|
69
|
+
* Add the creator hash to GIFTEDCROOK rule
|
|
70
|
+
|
|
71
|
+
### 1.16.5
|
|
72
|
+
* Add YARA rule for GIFTEDCROOK infostealer PDFs
|
|
73
|
+
|
|
74
|
+
### 1.16.4
|
|
75
|
+
* Bump `PyPDF` to 5.7.0
|
|
76
|
+
|
|
3
77
|
### 1.16.3
|
|
4
78
|
* Fix typo in help
|
|
5
79
|
|
|
@@ -1,39 +1,43 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pdfalyzer
|
|
3
|
-
Version: 1.
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 1.17.9
|
|
4
|
+
Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
|
|
5
5
|
Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
6
6
|
License: GPL-3.0-or-later
|
|
7
|
-
Keywords: ascii art,binary,color,
|
|
7
|
+
Keywords: ascii art,binary,color,cybersecurity,DFIR,encoding,font,infosec,maldoc,malicious pdf,malware,malware analysis,pdf,pdfs,pdf analysis,pypdf,threat assessment,threat hunting,threat intelligence,threat research,threatintel,visualization,yara
|
|
8
8
|
Author: Michel de Cryptadamus
|
|
9
9
|
Author-email: michel@cryptadamus.com
|
|
10
|
-
Requires-Python: >=3.
|
|
10
|
+
Requires-Python: >=3.10,<4.0
|
|
11
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Information Technology
|
|
11
14
|
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
15
|
+
Classifier: Programming Language :: Python
|
|
12
16
|
Classifier: Programming Language :: Python :: 3
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
14
17
|
Classifier: Programming Language :: Python :: 3.10
|
|
15
18
|
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
21
|
Classifier: Topic :: Artistic Software
|
|
17
22
|
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
18
23
|
Classifier: Topic :: Security
|
|
19
|
-
|
|
20
|
-
Requires-Dist:
|
|
21
|
-
Requires-Dist:
|
|
22
|
-
Requires-Dist:
|
|
23
|
-
Requires-Dist:
|
|
24
|
-
Requires-Dist:
|
|
25
|
-
Requires-Dist: yaralyzer (>=0.9.4,<0.10.0)
|
|
24
|
+
Provides-Extra: extract
|
|
25
|
+
Requires-Dist: PyMuPDF (>=1.26.5,<2.0.0) ; extra == "extract"
|
|
26
|
+
Requires-Dist: anytree (>=2.13,<3.0)
|
|
27
|
+
Requires-Dist: pypdf (>=6.1.3,<7.0.0)
|
|
28
|
+
Requires-Dist: pytesseract (>=0.3.13,<0.4.0) ; extra == "extract"
|
|
29
|
+
Requires-Dist: yaralyzer (>=1.0.9,<2.0.0)
|
|
26
30
|
Project-URL: Changelog, https://github.com/michelcrypt4d4mus/pdfalyzer/blob/master/CHANGELOG.md
|
|
27
31
|
Project-URL: Documentation, https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
28
32
|
Project-URL: Repository, https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
29
33
|
Description-Content-Type: text/markdown
|
|
30
34
|
|
|
31
|
-
<!--  -->
|
|
32
|
-

|
|
33
35
|
[](https://pypi.org/project/pdfalyzer/)
|
|
34
|
-
[](https://github.com/michelcrypt4d4mus/pdfalyzer)
|
|
35
36
|

|
|
37
|
+
[](https://github.com/michelcrypt4d4mus/pdfalyzer)
|
|
38
|
+

|
|
36
39
|

|
|
40
|
+
[](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml)
|
|
37
41
|
|
|
38
42
|
|
|
39
43
|
# THE PDFALYZER
|
|
@@ -63,10 +67,11 @@ If you're looking for one of these things this may be the tool for you.
|
|
|
63
67
|
### What It Don't Do
|
|
64
68
|
This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.
|
|
65
69
|
|
|
66
|
-
|
|
70
|
+
If you suspect you are dealing with a malicious PDF you can safely run `pdfalyze` on it. Embedded javascript and `/OpenAction` nodes etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
|
|
67
71
|
|
|
68
|
-
# Installation
|
|
69
72
|
|
|
73
|
+
# Installation
|
|
74
|
+
#### All Platforms
|
|
70
75
|
Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `pip3` / `pip` should also work.
|
|
71
76
|
```sh
|
|
72
77
|
pipx install pdfalyzer
|
|
@@ -74,7 +79,12 @@ pipx install pdfalyzer
|
|
|
74
79
|
|
|
75
80
|
See [PyPDF installation notes](https://github.com/py-pdf/pypdf#installation) about `PyCryptodome` if you plan to `pdfalyze` any files that use AES encryption.
|
|
76
81
|
|
|
77
|
-
|
|
82
|
+
#### macOS Homebrew
|
|
83
|
+
If you are on macOS and use `homebrew` someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so this should work:
|
|
84
|
+
|
|
85
|
+
```sh
|
|
86
|
+
brew install pdfalyzer
|
|
87
|
+
```
|
|
78
88
|
|
|
79
89
|
### Troubleshooting
|
|
80
90
|
1. If you used `pip3` instead of `pipx` and have an issue you should try to install with `pipx`.
|
|
@@ -88,7 +98,6 @@ If you are on macOS someone out there was kind enough to make [The Pdfalyzer ava
|
|
|
88
98
|
sudo apt-get install build-essential libssl-dev libffi-dev rustc
|
|
89
99
|
```
|
|
90
100
|
|
|
91
|
-
-------------
|
|
92
101
|
|
|
93
102
|
# Usage
|
|
94
103
|
|
|
@@ -104,24 +113,40 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
|
|
|
104
113
|
|
|
105
114
|
The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
|
|
106
115
|
|
|
107
|
-
|
|
108
|
-
When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer`
|
|
116
|
+
#### Setting Command Line Options Permanently With A `.pdfalyzer` File
|
|
117
|
+
When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
|
|
109
118
|
|
|
110
|
-
|
|
111
|
-
|
|
119
|
+
1. the current directory
|
|
120
|
+
2. the user's home directory
|
|
112
121
|
|
|
113
|
-
|
|
114
|
-
|
|
122
|
+
If it finds a `.pdfalyzer` file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
|
|
123
|
+
|
|
124
|
+
#### Environment Variables
|
|
125
|
+
Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
|
|
115
126
|
|
|
116
127
|
### Guarantees
|
|
117
128
|
Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
|
|
118
129
|
|
|
119
|
-
## Example
|
|
130
|
+
## Example Malicious PDF Investigation
|
|
120
131
|
[BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
|
|
121
132
|
|
|
122
|
-
-------------
|
|
123
133
|
|
|
124
|
-
##
|
|
134
|
+
## Included Command Line Tools
|
|
135
|
+
The Pdfalyzer comes with a few command line tools for doing stuff with PDFs:
|
|
136
|
+
|
|
137
|
+
* `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
|
|
138
|
+
* `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
|
|
139
|
+
* `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
|
|
140
|
+
* `pdfalyzer_show_color_theme` - Run to see the color theme employed in Pdfalyzer's output.
|
|
141
|
+
|
|
142
|
+
Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
pipx install pdfalyzer[extract]
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
## As A Python Library
|
|
125
150
|
For info about setting up a dev environment see [Contributing](#contributing) below.
|
|
126
151
|
|
|
127
152
|
At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class. Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
|
|
@@ -230,10 +255,8 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
|
|
|
230
255
|
|
|
231
256
|
-------------
|
|
232
257
|
|
|
233
|
-
# PDF Resources
|
|
234
|
-
## Included PDF Tools
|
|
235
|
-
The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
|
|
236
258
|
|
|
259
|
+
# PDF Resources
|
|
237
260
|
## 3rd Party PDF Tools
|
|
238
261
|
### Installing Didier Stevens's PDF Analysis Tools
|
|
239
262
|
Stevens's tools provide comprehensive info about the contents of a PDF, are guaranteed not to trigger the rendering of any malicious content (especially `pdfid.py`), and have been battle tested for well over a decade. It would probably be a good idea to analyze your PDF with his tools before you start working with this one.
|
|
@@ -263,6 +286,7 @@ scripts/install_t1utils.sh
|
|
|
263
286
|
* [Adobe Type 2 Charstring Format](https://adobe-type-tools.github.io/font-tech-notes/pdfs/5177.Type2.pdf) - Describes the newer Type 2 font operators which are also used in some multiple-master Type 1 fonts.
|
|
264
287
|
|
|
265
288
|
### Other Stuff
|
|
289
|
+
* [Didier Stevens's PDF tools](http://blog.didierstevens.com/programs/pdf-tools/)
|
|
266
290
|
* [Didier Stevens's free book about malicious PDFs](https://blog.didierstevens.com/2010/09/26/free-malicious-pdf-analysis-e-book/) - The master of the malicious PDFs wrote a whole book about how to analyze them. It's an old book but the PDF spec was last changed in 2008 so it's still relevant.
|
|
267
291
|
* [Analyzing Malicious PDFs Cheat Sheet](https://zeltser.com/media/docs/analyzing-malicious-document-files.pdf) - Like it says on the tin. If that link fails there's a copy [here in the repo](doc/analyzing-malicious-document-files.pdf).
|
|
268
292
|
* [T1Utils Github Repo](https://github.com/kohler/t1utils) - Suite of tools for manipulating Type1 fonts.
|
|
@@ -271,6 +295,8 @@ scripts/install_t1utils.sh
|
|
|
271
295
|
* [A Curious Exploration of Malicious PDF Documents](https://www.scitepress.org/Papers/2020/89923/89923.pdf) by Julian Lindenhofer, Rene Offenthaler and Martin Pirker, 2020. Overview of all the possible execution paths that can lead to a PDF executing JavaScript, opening local/remote files, or making web requests.
|
|
272
296
|
* [Malicious PDF Generator](https://github.com/jonaslejon/malicious-pdf) is a well maintained GitHub project that does what it says on the tin.
|
|
273
297
|
* [PDF is Broken, and so is this file](https://blog.trailofbits.com/2021/02/02/pdf-is-broken-a-justctf-challenge/) is a 2021 report on what happens when you challenge cybersecurity teams to turn PDFs into weapons. (Among other things they managed to create a PDF that launches a webserver when you open it.)
|
|
298
|
+
* [linuxPDF](https://github.com/ading2210/linuxpdf) is a project that managed to embed an entire linux operating system inside a PDF document. The related [DoomPDF](https://github.com/ading2210/doompdf) managed to embed the classic video game Doom in a PDF.
|
|
299
|
+
* [horrifying-pdf-experiments](https://github.com/osnr/horrifying-pdf-experiments) is a repo of horrifying things you can do with PDFs.
|
|
274
300
|
|
|
275
301
|
|
|
276
302
|
## Did The World Really Need Another PDF Tool?
|
|
@@ -304,12 +330,6 @@ These are the naming conventions at play in The Pdfalyzer code base:
|
|
|
304
330
|
* [`PyPDF` documentation](https://pypdf.readthedocs.io/en/stable/) (latest is 4.x or something so these are the relevant docs for `pdfalyze`)
|
|
305
331
|
|
|
306
332
|
|
|
307
|
-
# TODO
|
|
308
|
-
* Highlight decodes with a lot of Javascript keywords
|
|
309
|
-
* https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
|
|
310
|
-
* https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
|
|
311
|
-
|
|
312
|
-
|
|
313
333
|
[^1]: The official Adobe PDF specification calls this tree the PDF's "logical structure", which is a good example of nomenclature that does not help those who see it understand anything about what is being described. I can forgive them given that they named this thing back in the 80s, though it's a good example of why picking good names for things at the beginning is so important.
|
|
314
334
|
|
|
315
335
|
[^2]: An exception will be raised if there's any issue placing a node while parsing or if there are any nodes not reachable from the root of the tree at the end of parsing. If there are no exceptions then all internal PDF objects are guaranteed to exist in the tree except in these situations when warnings will be printed:
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
<!--  -->
|
|
2
|
-

|
|
3
1
|
[](https://pypi.org/project/pdfalyzer/)
|
|
4
|
-
[](https://github.com/michelcrypt4d4mus/pdfalyzer)
|
|
5
2
|

|
|
3
|
+
[](https://github.com/michelcrypt4d4mus/pdfalyzer)
|
|
4
|
+

|
|
6
5
|

|
|
6
|
+
[](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml)
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
# THE PDFALYZER
|
|
@@ -33,10 +33,11 @@ If you're looking for one of these things this may be the tool for you.
|
|
|
33
33
|
### What It Don't Do
|
|
34
34
|
This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.
|
|
35
35
|
|
|
36
|
-
|
|
36
|
+
If you suspect you are dealing with a malicious PDF you can safely run `pdfalyze` on it. Embedded javascript and `/OpenAction` nodes etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
|
|
37
37
|
|
|
38
|
-
# Installation
|
|
39
38
|
|
|
39
|
+
# Installation
|
|
40
|
+
#### All Platforms
|
|
40
41
|
Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `pip3` / `pip` should also work.
|
|
41
42
|
```sh
|
|
42
43
|
pipx install pdfalyzer
|
|
@@ -44,7 +45,12 @@ pipx install pdfalyzer
|
|
|
44
45
|
|
|
45
46
|
See [PyPDF installation notes](https://github.com/py-pdf/pypdf#installation) about `PyCryptodome` if you plan to `pdfalyze` any files that use AES encryption.
|
|
46
47
|
|
|
47
|
-
|
|
48
|
+
#### macOS Homebrew
|
|
49
|
+
If you are on macOS and use `homebrew` someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so this should work:
|
|
50
|
+
|
|
51
|
+
```sh
|
|
52
|
+
brew install pdfalyzer
|
|
53
|
+
```
|
|
48
54
|
|
|
49
55
|
### Troubleshooting
|
|
50
56
|
1. If you used `pip3` instead of `pipx` and have an issue you should try to install with `pipx`.
|
|
@@ -58,7 +64,6 @@ If you are on macOS someone out there was kind enough to make [The Pdfalyzer ava
|
|
|
58
64
|
sudo apt-get install build-essential libssl-dev libffi-dev rustc
|
|
59
65
|
```
|
|
60
66
|
|
|
61
|
-
-------------
|
|
62
67
|
|
|
63
68
|
# Usage
|
|
64
69
|
|
|
@@ -74,24 +79,40 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
|
|
|
74
79
|
|
|
75
80
|
The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
|
|
76
81
|
|
|
77
|
-
|
|
78
|
-
When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer`
|
|
82
|
+
#### Setting Command Line Options Permanently With A `.pdfalyzer` File
|
|
83
|
+
When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
|
|
79
84
|
|
|
80
|
-
|
|
81
|
-
|
|
85
|
+
1. the current directory
|
|
86
|
+
2. the user's home directory
|
|
82
87
|
|
|
83
|
-
|
|
84
|
-
|
|
88
|
+
If it finds a `.pdfalyzer` file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
|
|
89
|
+
|
|
90
|
+
#### Environment Variables
|
|
91
|
+
Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
|
|
85
92
|
|
|
86
93
|
### Guarantees
|
|
87
94
|
Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
|
|
88
95
|
|
|
89
|
-
## Example
|
|
96
|
+
## Example Malicious PDF Investigation
|
|
90
97
|
[BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
|
|
91
98
|
|
|
92
|
-
-------------
|
|
93
99
|
|
|
94
|
-
##
|
|
100
|
+
## Included Command Line Tools
|
|
101
|
+
The Pdfalyzer comes with a few command line tools for doing stuff with PDFs:
|
|
102
|
+
|
|
103
|
+
* `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
|
|
104
|
+
* `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
|
|
105
|
+
* `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
|
|
106
|
+
* `pdfalyzer_show_color_theme` - Run to see the color theme employed in Pdfalyzer's output.
|
|
107
|
+
|
|
108
|
+
Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
pipx install pdfalyzer[extract]
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
## As A Python Library
|
|
95
116
|
For info about setting up a dev environment see [Contributing](#contributing) below.
|
|
96
117
|
|
|
97
118
|
At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class. Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
|
|
@@ -200,10 +221,8 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
|
|
|
200
221
|
|
|
201
222
|
-------------
|
|
202
223
|
|
|
203
|
-
# PDF Resources
|
|
204
|
-
## Included PDF Tools
|
|
205
|
-
The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
|
|
206
224
|
|
|
225
|
+
# PDF Resources
|
|
207
226
|
## 3rd Party PDF Tools
|
|
208
227
|
### Installing Didier Stevens's PDF Analysis Tools
|
|
209
228
|
Stevens's tools provide comprehensive info about the contents of a PDF, are guaranteed not to trigger the rendering of any malicious content (especially `pdfid.py`), and have been battle tested for well over a decade. It would probably be a good idea to analyze your PDF with his tools before you start working with this one.
|
|
@@ -233,6 +252,7 @@ scripts/install_t1utils.sh
|
|
|
233
252
|
* [Adobe Type 2 Charstring Format](https://adobe-type-tools.github.io/font-tech-notes/pdfs/5177.Type2.pdf) - Describes the newer Type 2 font operators which are also used in some multiple-master Type 1 fonts.
|
|
234
253
|
|
|
235
254
|
### Other Stuff
|
|
255
|
+
* [Didier Stevens's PDF tools](http://blog.didierstevens.com/programs/pdf-tools/)
|
|
236
256
|
* [Didier Stevens's free book about malicious PDFs](https://blog.didierstevens.com/2010/09/26/free-malicious-pdf-analysis-e-book/) - The master of the malicious PDFs wrote a whole book about how to analyze them. It's an old book but the PDF spec was last changed in 2008 so it's still relevant.
|
|
237
257
|
* [Analyzing Malicious PDFs Cheat Sheet](https://zeltser.com/media/docs/analyzing-malicious-document-files.pdf) - Like it says on the tin. If that link fails there's a copy [here in the repo](doc/analyzing-malicious-document-files.pdf).
|
|
238
258
|
* [T1Utils Github Repo](https://github.com/kohler/t1utils) - Suite of tools for manipulating Type1 fonts.
|
|
@@ -241,6 +261,8 @@ scripts/install_t1utils.sh
|
|
|
241
261
|
* [A Curious Exploration of Malicious PDF Documents](https://www.scitepress.org/Papers/2020/89923/89923.pdf) by Julian Lindenhofer, Rene Offenthaler and Martin Pirker, 2020. Overview of all the possible execution paths that can lead to a PDF executing JavaScript, opening local/remote files, or making web requests.
|
|
242
262
|
* [Malicious PDF Generator](https://github.com/jonaslejon/malicious-pdf) is a well maintained GitHub project that does what it says on the tin.
|
|
243
263
|
* [PDF is Broken, and so is this file](https://blog.trailofbits.com/2021/02/02/pdf-is-broken-a-justctf-challenge/) is a 2021 report on what happens when you challenge cybersecurity teams to turn PDFs into weapons. (Among other things they managed to create a PDF that launches a webserver when you open it.)
|
|
264
|
+
* [linuxPDF](https://github.com/ading2210/linuxpdf) is a project that managed to embed an entire Linux operating system inside a PDF document. The related [DoomPDF](https://github.com/ading2210/doompdf) managed to embed the classic video game Doom in a PDF.
|
|
265
|
+
* [horrifying-pdf-experiments](https://github.com/osnr/horrifying-pdf-experiments) is a repo of horrifying things you can do with PDFs.
|
|
244
266
|
|
|
245
267
|
|
|
246
268
|
## Did The World Really Need Another PDF Tool?
|
|
@@ -274,12 +296,6 @@ These are the naming conventions at play in The Pdfalyzer code base:
|
|
|
274
296
|
* [`PyPDF` documentation](https://pypdf.readthedocs.io/en/stable/) (latest is 4.x or something so these are the relevant docs for `pdfalyze`)
|
|
275
297
|
|
|
276
298
|
|
|
277
|
-
# TODO
|
|
278
|
-
* Highlight decodes with a lot of Javascript keywords
|
|
279
|
-
* https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
|
|
280
|
-
* https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
|
|
281
|
-
|
|
282
|
-
|
|
283
299
|
[^1]: The official Adobe PDF specification calls this tree the PDF's "logical structure", which is a good example of nomenclature that does not help those who see it understand anything about what is being described. I can forgive them given that they named this thing back in the 80s, though it's a good example of why picking good names for things at the beginning is so important.
|
|
284
300
|
|
|
285
301
|
[^2]: An exception will be raised if there's any issue placing a node while parsing or if there are any nodes not reachable from the root of the tree at the end of parsing. If there are no exceptions then all internal PDF objects are guaranteed to exist in the tree except in these situations when warnings will be printed:
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import code
|
|
2
2
|
import sys
|
|
3
|
+
from argparse import Namespace
|
|
3
4
|
from os import environ, getcwd, path
|
|
4
|
-
from pathlib import Path
|
|
5
5
|
|
|
6
6
|
from dotenv import load_dotenv
|
|
7
7
|
from pypdf import PdfWriter
|
|
@@ -20,18 +20,20 @@ if not environ.get('INVOKED_BY_PYTEST', False):
|
|
|
20
20
|
from rich.columns import Columns
|
|
21
21
|
from rich.panel import Panel
|
|
22
22
|
from rich.text import Text
|
|
23
|
-
from yaralyzer.helpers.rich_text_helper import
|
|
23
|
+
from yaralyzer.helpers.rich_text_helper import prefix_with_style
|
|
24
24
|
from yaralyzer.output.file_export import invoke_rich_export
|
|
25
25
|
from yaralyzer.output.rich_console import console
|
|
26
|
-
from yaralyzer.util.logging import
|
|
26
|
+
from yaralyzer.util.logging import log_and_print
|
|
27
27
|
|
|
28
|
+
from pdfalyzer.decorators.pdf_file import PdfFile
|
|
28
29
|
from pdfalyzer.helpers.filesystem_helper import file_size_in_mb, set_max_open_files
|
|
29
30
|
from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
30
31
|
from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
|
|
31
32
|
from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
|
|
32
33
|
from pdfalyzer.pdfalyzer import Pdfalyzer
|
|
33
|
-
from pdfalyzer.util.argument_parser import
|
|
34
|
-
|
|
34
|
+
from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments
|
|
35
|
+
from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, parse_combine_pdfs_args,
|
|
36
|
+
parse_pdf_page_extraction_args, parse_text_extraction_args)
|
|
35
37
|
from pdfalyzer.util.pdf_parser_manager import PdfParserManager
|
|
36
38
|
|
|
37
39
|
# For the table shown by running pdfalyzer_show_color_theme
|
|
@@ -41,7 +43,7 @@ MAX_THEME_COL_SIZE = 35
|
|
|
41
43
|
def pdfalyze():
|
|
42
44
|
args = parse_arguments()
|
|
43
45
|
pdfalyzer = Pdfalyzer(args.file_to_scan_path)
|
|
44
|
-
|
|
46
|
+
presenter = PdfalyzerPresenter(pdfalyzer)
|
|
45
47
|
output_basepath = None
|
|
46
48
|
|
|
47
49
|
# Binary stream extraction is a special case
|
|
@@ -51,9 +53,9 @@ def pdfalyze():
|
|
|
51
53
|
log_and_print(f"Binary stream extraction complete, files written to '{args.output_dir}'.\nExiting.\n")
|
|
52
54
|
sys.exit()
|
|
53
55
|
|
|
54
|
-
# The method that gets called is related to the argument name. See 'possible_output_sections' list in
|
|
55
|
-
# Analysis exports wrap themselves around the methods that actually generate the analyses
|
|
56
|
-
for (arg, method) in output_sections(args,
|
|
56
|
+
# The method that gets called is related to the argument name. See 'possible_output_sections' list in
|
|
57
|
+
# argument_parser.py. Analysis exports wrap themselves around the methods that actually generate the analyses.
|
|
58
|
+
for (arg, method) in output_sections(args, presenter):
|
|
57
59
|
if args.output_dir:
|
|
58
60
|
output_basepath = PdfalyzerConfig.get_output_basepath(method)
|
|
59
61
|
print(f'Exporting {arg} data to {output_basepath}...')
|
|
@@ -78,23 +80,25 @@ def pdfalyze():
|
|
|
78
80
|
if args.interact:
|
|
79
81
|
code.interact(local=locals())
|
|
80
82
|
|
|
83
|
+
pdfalyzer.pdf_filehandle.close()
|
|
84
|
+
|
|
81
85
|
|
|
82
86
|
def pdfalyzer_show_color_theme() -> None:
|
|
83
87
|
"""Utility method to show pdfalyzer's color theme. Invocable with 'pdfalyzer_show_color_theme'."""
|
|
84
88
|
console.print(Panel('The Pdfalyzer Color Theme', style='reverse'))
|
|
85
89
|
|
|
86
90
|
colors = [
|
|
87
|
-
|
|
91
|
+
prefix_with_style(name[:MAX_THEME_COL_SIZE], style=str(style)).append(' ')
|
|
88
92
|
for name, style in PDFALYZER_THEME_DICT.items()
|
|
89
93
|
if name not in ['reset', 'repr_url']
|
|
90
94
|
]
|
|
91
95
|
|
|
92
|
-
console.print(Columns(colors, column_first=True, padding=(0,3)))
|
|
96
|
+
console.print(Columns(colors, column_first=True, padding=(0, 3)))
|
|
93
97
|
|
|
94
98
|
|
|
95
99
|
def combine_pdfs():
|
|
96
100
|
"""
|
|
97
|
-
|
|
101
|
+
Script method to combine multiple PDFs into one. Invocable with 'combine_pdfs PDF1 [PDF2...]'.
|
|
98
102
|
Example: https://github.com/py-pdf/pypdf/blob/main/docs/user/merging-pdfs.md
|
|
99
103
|
"""
|
|
100
104
|
args = parse_combine_pdfs_args()
|
|
@@ -114,7 +118,11 @@ def combine_pdfs():
|
|
|
114
118
|
for i, page in enumerate(merger.pages):
|
|
115
119
|
if args.image_quality < MAX_QUALITY:
|
|
116
120
|
for j, img in enumerate(page.images):
|
|
117
|
-
print_highlighted(
|
|
121
|
+
print_highlighted(
|
|
122
|
+
f" -> Reducing image #{j + 1} quality on page {i + 1} to {args.image_quality}...",
|
|
123
|
+
style='dim'
|
|
124
|
+
)
|
|
125
|
+
|
|
118
126
|
img.replace(img.image, quality=args.image_quality)
|
|
119
127
|
|
|
120
128
|
print_highlighted(f" -> Compressing page {i + 1}...", style='dim')
|
|
@@ -127,3 +135,19 @@ def combine_pdfs():
|
|
|
127
135
|
txt = Text('').append(f" -> Wrote ")
|
|
128
136
|
txt.append(str(file_size_in_mb(args.output_file)), style='cyan').append(" megabytes\n")
|
|
129
137
|
print_highlighted(txt)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def extract_pdf_pages() -> None:
|
|
141
|
+
"""Extract a range of pages from a PDF to a new PDF."""
|
|
142
|
+
args = parse_pdf_page_extraction_args()
|
|
143
|
+
PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def extract_pdf_text() -> None:
|
|
147
|
+
"""Extract text from a list of files or from all PDF files in a list of directories."""
|
|
148
|
+
args: Namespace = parse_text_extraction_args()
|
|
149
|
+
console.line()
|
|
150
|
+
|
|
151
|
+
for file_path in args.files_to_process:
|
|
152
|
+
PdfFile(file_path).print_extracted_text(args.page_range, args.print_as_parsed)
|
|
153
|
+
console.line(2)
|