pdfalyzer 1.16.3__tar.gz → 1.17.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. pdfalyzer-1.17.9/.pdfalyzer.example +66 -0
  2. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/CHANGELOG.md +74 -0
  3. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/PKG-INFO +57 -37
  4. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/README.md +41 -25
  5. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/__init__.py +37 -13
  6. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/binary/binary_scanner.py +44 -24
  7. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/config.py +6 -1
  8. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/decorators/indeterminate_node.py +11 -11
  9. pdfalyzer-1.17.9/pdfalyzer/decorators/pdf_file.py +220 -0
  10. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/decorators/pdf_object_properties.py +16 -15
  11. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/decorators/pdf_tree_node.py +25 -16
  12. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/decorators/pdf_tree_verifier.py +9 -4
  13. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/detection/constants/binary_regexes.py +7 -7
  14. pdfalyzer-1.17.9/pdfalyzer/detection/yaralyzer_helper.py +51 -0
  15. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/font_info.py +11 -12
  16. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/helpers/filesystem_helper.py +32 -9
  17. pdfalyzer-1.17.9/pdfalyzer/helpers/image_helper.py +31 -0
  18. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/helpers/pdf_object_helper.py +8 -8
  19. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/helpers/rich_text_helper.py +74 -21
  20. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/helpers/string_helper.py +33 -30
  21. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/character_mapping.py +4 -3
  22. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/layout.py +14 -4
  23. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/pdfalyzer_presenter.py +18 -5
  24. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/styles/rich_theme.py +2 -1
  25. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/tables/decoding_stats_table.py +11 -6
  26. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/tables/font_summary_table.py +2 -2
  27. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/tables/stream_objects_table.py +0 -1
  28. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/pdf_object_relationship.py +12 -12
  29. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/pdfalyzer.py +75 -32
  30. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/util/adobe_strings.py +4 -5
  31. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/util/argument_parser.py +23 -67
  32. pdfalyzer-1.17.9/pdfalyzer/util/cli_tools_argument_parser.py +164 -0
  33. pdfalyzer-1.17.9/pdfalyzer/util/page_range.py +51 -0
  34. pdfalyzer-1.17.9/pdfalyzer/yara_rules/PDF.yara +1859 -0
  35. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/PDF_binary_stream.yara +6 -8
  36. pdfalyzer-1.17.9/pdfalyzer/yara_rules/didier_stevens.yara +248 -0
  37. pdfalyzer-1.17.9/pdfalyzer/yara_rules/pdf_malware.yara +3072 -0
  38. pdfalyzer-1.17.9/pyproject.toml +120 -0
  39. pdfalyzer-1.16.3/pdfalyzer/detection/yaralyzer_helper.py +0 -51
  40. pdfalyzer-1.16.3/pdfalyzer/yara_rules/PDF.yara +0 -1075
  41. pdfalyzer-1.16.3/pyproject.toml +0 -77
  42. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/LICENSE +0 -0
  43. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/__main__.py +0 -0
  44. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/decorators/document_model_printer.py +0 -0
  45. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/detection/constants/javascript_reserved_keywords.py +0 -0
  46. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/detection/javascript_hunter.py +0 -0
  47. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/helpers/dict_helper.py +0 -0
  48. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/helpers/number_helper.py +0 -0
  49. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/styles/node_colors.py +0 -0
  50. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/output/tables/pdf_node_rich_table.py +0 -0
  51. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/util/debugging.py +1 -1
  52. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/util/exceptions.py +0 -0
  53. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/util/pdf_parser_manager.py +0 -0
  54. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/__init.py__ +0 -0
  55. {pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/yara_rules/lprat.static_file_analysis.yara +0 -0
@@ -0,0 +1,66 @@
1
+ # If you place a file called '.pdfalyzer' in your home dir or the current dir, environment variables specified
2
+ # in that .pdfalyzer file will be added to the environment each time pdfalyzer is invoked. (See the `dotenv`
3
+ # package for more details.) This file contains environment variables you can place in .pdfalyzer to configure
4
+ # the application above and beyond providing command line options. Useful if you want to permanently
5
+ # configure options you tend to reuse (e.g. '--maximize-width') so you can stop remembering to type them.
6
+ #
7
+ # Almost all of the yaralyzer (yes, you read that right - The Pdfalyzer uses The Yaralyzer for all
8
+ # kinds of backend functionality) command line options can be configured in this file by capitalizing them and
9
+ # prefixing 'YARALYZER'. e.g. to configure the --maximize-width option for every invocation, you would set:
10
+ # YARALYZER_MAXIMIZE_WIDTH=True
11
+ #
12
+ # Note that many of these options are actually configuring the yaralyzer, which is a separate tool leveraged
13
+ # by the Pdfalyzer to actually do the work of finding patterns. More info can be found at
14
+ # https://github.com/michelcrypt4d4mus/yaralyzer
15
+
16
+
17
+
18
+ # Expand the width of the output to fit the display window (same as the --maximize-width option)
19
+ # YARALYZER_MAXIMIZE_WIDTH=True
20
+
21
+ # yara-python internal options passed through to yara.set_config() as the stack_size and max_match_data arguments
22
+ # YARALYZER_STACK_SIZE=10485760
23
+ # YARALYZER_MAX_MATCH_LENGTH=10737418240
24
+
25
+ # Suppress all PDF binary regex matching/scanning/etc
26
+ # YARALYZER_SUPPRESS_DECODES_TABLE=False
27
+
28
+ # Suppress the display of the table showing the encoding assessments given by `chardet.detect()`
29
+ # about a particular chunk of binary data. (The most important data in the chardet confidence table is
30
+ # redundant anyways. Only the low likelihood encodings are hidden from the user.)
31
+ # YARALYZER_SUPPRESS_CHARDET_TABLE=False
32
+ # Minimum confidence to display an encoding in the chardet results table
33
+ # YARALYZER_MIN_CHARDET_CONFIDENCE=2.0
34
+
35
+ # Configure how many bytes before and after any binary data should be included in scans and visualizations
36
+ # YARALYZER_SURROUNDING_BYTES=64
37
+
38
+ # Size thresholds (in bytes) under/over which pdfalyzer will NOT make attempts to decode a match.
39
+ # Longer byte sequences are for obvious reasons slower to decode by force.
40
+ # It may feel counterintuitive but larger chunks of random binary are also harder to examine and
41
+ # (in my experience) less likely to be meaningful. Consider it - two frontslash characters 20,000 lines apart
42
+ # are more likely to be random than those same frontslashes when placed nearer to each other and
43
+ # in the vicinity of a lot of computerized sigils of internet power like `.', `+bacd*?`, and other regexes.*
44
+ # Keeping the max value number low will do more to affect the speed of the app than anything else you
45
+ # can easily configure.
46
+ #
47
+ # YARALYZER_MIN_DECODE_LENGTH=1
48
+ # YARALYZER_MAX_DECODE_LENGTH=256
49
+
50
+ # Directory to write application logs to. Must be an absolute path, not a relative one.
51
+ # These logs are not normally written to a file and the default log level means that the standard behavior
52
+ # is to more or less discard them. Be aware that if you configure this variable a few things will change:
53
+ #
54
+ # 1. Logs WILL NOT be written to STDOUT. They will stream ONLY to files in the configured directory.
55
+ # This is true even with the -D option.
56
+ # 2. The default log_level will be decreased from WARN (extremely spartan) to INFO (fairly verbose).
57
+ # The -D option, which sets the log level to DEBUG, will be respected whether or not
58
+ # YARALYZER_LOG_DIR is configured.
59
+ #
60
+ # YARALYZER_LOG_DIR=/path/to/pdfalyzer/log_dir/
61
+
62
+ # Log level
63
+ # YARALYZER_LOG_LEVEL='INFO'
64
+
65
+ # Path to directory containing Didier Stevens's pdf-parser.py. Only required for extracting binary streams to files.
66
+ # PDFALYZER_PDF_PARSER_PY_PATH=/path/to/pdfparserdotpy/
@@ -1,5 +1,79 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ ### 1.17.9
4
+ * Broaden exception handling in `FontInfo` extraction
5
+
6
+ ### 1.17.8
7
+ * Handle `AttributeError` in `FontInfo` extraction
8
+
9
+ ### 1.17.7
10
+ * Bump `pypdf` to 6.1.3 (fixes [#31](https://github.com/michelcrypt4d4mus/pdfalyzer/issues/31)), `PyMuPDF` to 1.26.5
11
+
12
+ ### 1.17.6
13
+ * Better handling for errors resulting from bugs in PyPDF
14
+ * Properly close file handle when pdfalyzing is complete
15
+
16
+ ### 1.17.5
17
+ * Fix `PIL` lazy import
18
+
19
+ ### 1.17.4
20
+ * Make `PIL` a lazy import so installs without `[extract]` extras don't fail
21
+
22
+ ### 1.17.3
23
+ * Put back `--debug` arg for CLI tools
24
+
25
+ ### 1.17.2
26
+ * Remove unused `--debug` args for CLI tools
27
+ * Rename `extract_text_from_pdfs` to `extract_pdf_text`
28
+
29
+ ### 1.17.1
30
+ * Fix issue where `extract_pdf_pages` page ranges were indexed from 0 instead of 1
31
+
32
+ # 1.17.0
33
+ * Add `extract_pdf_pages` command line tool (imported from `clown_sort`)
34
+ * Add `extract_text_from_pdfs` command line tool (imported from `clown_sort`)
35
+
36
+ ### 1.16.14
37
+ * Bump `yaralyzer` to v1.0.9, handle `FileNotFoundError` which is now raised instead of `TypeError`
38
+ * Drop support for python 3.9
39
+
40
+ ### 1.16.13
41
+ * Bump `yaralyzer` to v1.0.7 and fix reference to yaralyzer's renamed `prefix_with_style()` method
42
+
43
+ ### 1.16.12
44
+ * Bump `PyPDF` to v6.0.0
45
+
46
+ ### 1.16.11
47
+ * Fix typo in `combine_pdfs` help
48
+ * Add some more PyPi classifiers
49
+ * Add a `.flake8` config and fix a bunch of style issues
50
+
51
+ ### 1.16.10
52
+ * Add `Environment :: Console` and `Programming Language :: Python` to pypi classifiers
53
+ * Add `.pdfalyzer.example` to PyPi package
54
+
55
+ ### 1.16.9
56
+ * Add `Development Status :: 5 - Production/Stable` to pypi classifiers
57
+
58
+ ### 1.16.8
59
+ * Even more PDF related YARA rules
60
+ * Upgrade `anytree` to 2.13.0
61
+ * Upgrade `yaralyzer` to 1.0.4
62
+
63
+ ### 1.16.7
64
+ * Lots of new PDF related YARA rules
65
+ * Upgrade `yaralyzer` to 1.0.3
66
+ * Upgrade `pypdf` to 5.9.0
67
+
68
+ ### 1.16.6
69
+ * Add the creator hash to GIFTEDCROOK rule
70
+
71
+ ### 1.16.5
72
+ * Add YARA rule for GIFTEDCROOK infostealer PDFs
73
+
74
+ ### 1.16.4
75
+ * Bump `PyPDF` to 5.7.0
76
+
3
77
  ### 1.16.3
4
78
  * Fix typo in help
5
79
 
@@ -1,39 +1,43 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pdfalyzer
3
- Version: 1.16.3
4
- Summary: A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
3
+ Version: 1.17.9
4
+ Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
5
5
  Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
6
6
  License: GPL-3.0-or-later
7
- Keywords: ascii art,binary,color,font,encoding,malicious pdf,malware,malware analysis,pdf,threat assessment,visualization,yara
7
+ Keywords: ascii art,binary,color,cybersecurity,DFIR,encoding,font,infosec,maldoc,malicious pdf,malware,malware analysis,pdf,pdfs,pdf analysis,pypdf,threat assessment,threat hunting,threat intelligence,threat research,threatintel,visualization,yara
8
8
  Author: Michel de Cryptadamus
9
9
  Author-email: michel@cryptadamus.com
10
- Requires-Python: >=3.9,<4.0
10
+ Requires-Python: >=3.10,<4.0
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Information Technology
11
14
  Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
15
+ Classifier: Programming Language :: Python
12
16
  Classifier: Programming Language :: Python :: 3
13
- Classifier: Programming Language :: Python :: 3.9
14
17
  Classifier: Programming Language :: Python :: 3.10
15
18
  Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
16
21
  Classifier: Topic :: Artistic Software
17
22
  Classifier: Topic :: Scientific/Engineering :: Visualization
18
23
  Classifier: Topic :: Security
19
- Requires-Dist: anytree (>=2.8,<3.0)
20
- Requires-Dist: chardet (>=5.0.0,<6.0.0)
21
- Requires-Dist: pypdf (>=5.0.1,<6.0.0)
22
- Requires-Dist: python-dotenv (>=0.21.0,<0.22.0)
23
- Requires-Dist: rich (>=12.5.1,<13.0.0)
24
- Requires-Dist: rich-argparse-plus (>=0.3.1,<0.4.0)
25
- Requires-Dist: yaralyzer (>=0.9.4,<0.10.0)
24
+ Provides-Extra: extract
25
+ Requires-Dist: PyMuPDF (>=1.26.5,<2.0.0) ; extra == "extract"
26
+ Requires-Dist: anytree (>=2.13,<3.0)
27
+ Requires-Dist: pypdf (>=6.1.3,<7.0.0)
28
+ Requires-Dist: pytesseract (>=0.3.13,<0.4.0) ; extra == "extract"
29
+ Requires-Dist: yaralyzer (>=1.0.9,<2.0.0)
26
30
  Project-URL: Changelog, https://github.com/michelcrypt4d4mus/pdfalyzer/blob/master/CHANGELOG.md
27
31
  Project-URL: Documentation, https://github.com/michelcrypt4d4mus/pdfalyzer
28
32
  Project-URL: Repository, https://github.com/michelcrypt4d4mus/pdfalyzer
29
33
  Description-Content-Type: text/markdown
30
34
 
31
- <!-- ![Tests](https://img.shields.io/github/workflow/status/michelcrypt4d4mus/pdfalyzer/tests?label=tests) -->
32
- ![Python Version](https://img.shields.io/pypi/pyversions/pdfalyzer)
33
35
  [![GithubRelease](https://img.shields.io/github/v/release/michelcrypt4d4mus/pdfalyzer?sort=semver)](https://pypi.org/project/pdfalyzer/)
34
- [![GitHub last commit](https://img.shields.io/github/last-commit/michelcrypt4d4mus/pdfalyzer)](https://github.com/michelcrypt4d4mus/pdfalyzer)
35
36
  ![PyPiRelease](https://img.shields.io/pypi/v/pdfalyzer)
37
+ [![GitHub last commit](https://img.shields.io/github/last-commit/michelcrypt4d4mus/pdfalyzer)](https://github.com/michelcrypt4d4mus/pdfalyzer)
38
+ ![Python Version](https://img.shields.io/pypi/pyversions/pdfalyzer)
36
39
  ![Downloads](https://img.shields.io/pypi/dm/pdfalyzer)
40
+ [![Tests](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml/badge.svg)](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml)
37
41
 
38
42
 
39
43
  # THE PDFALYZER
@@ -63,10 +67,11 @@ If you're looking for one of these things this may be the tool for you.
63
67
  ### What It Don't Do
64
68
  This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.
65
69
 
66
- -------------
70
+ If you suspect you are dealing with a malcious PDF you can safely run `pdfalyze` on it. Embedded javascript and `/OpenAction` nodes etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
67
71
 
68
- # Installation
69
72
 
73
+ # Installation
74
+ #### All Platforms
70
75
  Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `pip3` / `pip` should also work.
71
76
  ```sh
72
77
  pipx install pdfalyzer
@@ -74,7 +79,12 @@ pipx install pdfalyzer
74
79
 
75
80
  See [PyPDF installation notes](https://github.com/py-pdf/pypdf#installation) about `PyCryptodome` if you plan to `pdfalyze` any files that use AES encryption.
76
81
 
77
- If you are on macOS someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so `brew install pdfalyzer` should work.
82
+ #### macOS Homebrew
83
+ If you are on macOS and use `homebrew` someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so this should work:
84
+
85
+ ```sh
86
+ brew install pdfalyzer
87
+ ```
78
88
 
79
89
  ### Troubleshooting
80
90
  1. If you used `pip3` instead of `pipx` and have an issue you should try to install with `pipx`.
@@ -88,7 +98,6 @@ If you are on macOS someone out there was kind enough to make [The Pdfalyzer ava
88
98
  sudo apt-get install build-essential libssl-dev libffi-dev rustc
89
99
  ```
90
100
 
91
- -------------
92
101
 
93
102
  # Usage
94
103
 
@@ -104,24 +113,40 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
104
113
 
105
114
  The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
106
115
 
107
- ### Setting Command Line Options Permanently With A `.pdfalyzer` File
108
- When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` first in the current directory and then in the home directory. If it finds a file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
116
+ #### Setting Command Line Options Permanently With A `.pdfalyzer` File
117
+ When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
109
118
 
110
- ### Environment Variables
111
- Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
119
+ 1. the current directory
120
+ 2. the user's home directory
112
121
 
113
- ### Colors And Themes
114
- Run `pdfalyzer_show_color_theme` to see the color theme employed.
122
+ If it finds a `.pdfalyzer` file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
123
+
124
+ #### Environment Variables
125
+ Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
115
126
 
116
127
  ### Guarantees
117
128
  Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
118
129
 
119
- ## Example Usage
130
+ ## Example Malicious PDF Investigation
120
131
  [BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
121
132
 
122
- -------------
123
133
 
124
- ## Use As A Code Library
134
+ ## Included Command Line Tools
135
+ The Pdfalyzer comes with a few command line tools for doing stuff with PDFs:
136
+
137
+ * `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
138
+ * `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
139
+ * `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
140
+ * `pdfalyzer_show_color_theme` - Run to see the color theme employed in Pdfalyzer's output.
141
+
142
+ Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
143
+
144
+ ```bash
145
+ pipx install pdfalyzer[extract]
146
+ ```
147
+
148
+
149
+ ## As A Python Library
125
150
  For info about setting up a dev environment see [Contributing](#contributing) below.
126
151
 
127
152
  At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class. Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
@@ -230,10 +255,8 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
230
255
 
231
256
  -------------
232
257
 
233
- # PDF Resources
234
- ## Included PDF Tools
235
- The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
236
258
 
259
+ # PDF Resources
237
260
  ## 3rd Party PDF Tools
238
261
  ### Installing Didier Stevens's PDF Analysis Tools
239
262
  Stevens's tools provide comprehensive info about the contents of a PDF, are guaranteed not to trigger the rendering of any malicious content (especially `pdfid.py`), and have been battle tested for well over a decade. It would probably be a good idea to analyze your PDF with his tools before you start working with this one.
@@ -263,6 +286,7 @@ scripts/install_t1utils.sh
263
286
  * [Adobe Type 2 Charstring Format](https://adobe-type-tools.github.io/font-tech-notes/pdfs/5177.Type2.pdf) - Describes the newer Type 2 font operators which are also used in some multiple-master Type 1 fonts.
264
287
 
265
288
  ### Other Stuff
289
+ * [Didier Stevens's PDF tools](http://blog.didierstevens.com/programs/pdf-tools/)
266
290
  * [Didier Stevens's free book about malicious PDFs](https://blog.didierstevens.com/2010/09/26/free-malicious-pdf-analysis-e-book/) - The master of the malicious PDFs wrote a whole book about how to analyze them. It's an old book but the PDF spec was last changed in 2008 so it's still relevant.
267
291
  * [Analyzing Malicious PDFs Cheat Sheet](https://zeltser.com/media/docs/analyzing-malicious-document-files.pdf) - Like it says on the tin. If that link fails there's a copy [here in the repo](doc/analyzing-malicious-document-files.pdf).
268
292
  * [T1Utils Github Repo](https://github.com/kohler/t1utils) - Suite of tools for manipulating Type1 fonts.
@@ -271,6 +295,8 @@ scripts/install_t1utils.sh
271
295
  * [A Curious Exploration of Malicious PDF Documents](https://www.scitepress.org/Papers/2020/89923/89923.pdf) by Julian Lindenhofer, Rene Offenthaler and Martin Pirker, 2020. Overview of all the possible execution paths that can lead to a PDF executing JavaScript, opening local/remote files, or making web requests.
272
296
  * [Malicious PDF Generator](https://github.com/jonaslejon/malicious-pdf) is a well maintained GitHub project that does what it says on the tin.
273
297
  * [PDF is Broken, and so is this file](https://blog.trailofbits.com/2021/02/02/pdf-is-broken-a-justctf-challenge/) is a 2021 report on what happens when you challenge cybersecurity teams to turn PDFs into weapons. (Among other things they managed to create a PDF that launches a webserver when you open it.)
298
+ * [linuxPDF](https://github.com/ading2210/linuxpdf) is a project that managed to embed an entire linux operating system inside a PDF document. The related [DoomPDF](https://github.com/ading2210/doompdf) managed to embed the classic video game Doom in a PDF.
299
+ * [horrifying-pdf-experiments](https://github.com/osnr/horrifying-pdf-experiments) is a repo of horrifying things you can do with PDFs.
274
300
 
275
301
 
276
302
  ## Did The World Really Need Another PDF Tool?
@@ -304,12 +330,6 @@ These are the naming conventions at play in The Pdfalyzer code base:
304
330
  * [`PyPDF` documentation](https://pypdf.readthedocs.io/en/stable/) (latest is 4.x or something so these are the relevant docs for `pdfalyze`)
305
331
 
306
332
 
307
- # TODO
308
- * Highlight decodes with a lot of Javascript keywords
309
- * https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
310
- * https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
311
-
312
-
313
333
  [^1]: The official Adobe PDF specification calls this tree the PDF's "logical structure", which is a good example of nomenclature that does not help those who see it understand anything about what is being described. I can forgive them given that they named this thing back in the 80s, though it's a good example of why picking good names for things at the beginning is so important.
314
334
 
315
335
  [^2]: An exception will be raised if there's any issue placing a node while parsing or if there are any nodes not reachable from the root of the tree at the end of parsing. If there are no exceptions then all internal PDF objects are guaranteed to exist in the tree except in these situations when warnings will be printed:
@@ -1,9 +1,9 @@
1
- <!-- ![Tests](https://img.shields.io/github/workflow/status/michelcrypt4d4mus/pdfalyzer/tests?label=tests) -->
2
- ![Python Version](https://img.shields.io/pypi/pyversions/pdfalyzer)
3
1
  [![GithubRelease](https://img.shields.io/github/v/release/michelcrypt4d4mus/pdfalyzer?sort=semver)](https://pypi.org/project/pdfalyzer/)
4
- [![GitHub last commit](https://img.shields.io/github/last-commit/michelcrypt4d4mus/pdfalyzer)](https://github.com/michelcrypt4d4mus/pdfalyzer)
5
2
  ![PyPiRelease](https://img.shields.io/pypi/v/pdfalyzer)
3
+ [![GitHub last commit](https://img.shields.io/github/last-commit/michelcrypt4d4mus/pdfalyzer)](https://github.com/michelcrypt4d4mus/pdfalyzer)
4
+ ![Python Version](https://img.shields.io/pypi/pyversions/pdfalyzer)
6
5
  ![Downloads](https://img.shields.io/pypi/dm/pdfalyzer)
6
+ [![Tests](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml/badge.svg)](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml)
7
7
 
8
8
 
9
9
  # THE PDFALYZER
@@ -33,10 +33,11 @@ If you're looking for one of these things this may be the tool for you.
33
33
  ### What It Don't Do
34
34
  This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.
35
35
 
36
- -------------
36
+ If you suspect you are dealing with a malicious PDF you can safely run `pdfalyze` on it. Embedded javascript and `/OpenAction` nodes etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
37
37
 
38
- # Installation
39
38
 
39
+ # Installation
40
+ #### All Platforms
40
41
  Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `pip3` / `pip` should also work.
41
42
  ```sh
42
43
  pipx install pdfalyzer
@@ -44,7 +45,12 @@ pipx install pdfalyzer
44
45
 
45
46
  See [PyPDF installation notes](https://github.com/py-pdf/pypdf#installation) about `PyCryptodome` if you plan to `pdfalyze` any files that use AES encryption.
46
47
 
47
- If you are on macOS someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so `brew install pdfalyzer` should work.
48
+ #### macOS Homebrew
49
+ If you are on macOS and use `homebrew` someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so this should work:
50
+
51
+ ```sh
52
+ brew install pdfalyzer
53
+ ```
48
54
 
49
55
  ### Troubleshooting
50
56
  1. If you used `pip3` instead of `pipx` and have an issue you should try to install with `pipx`.
@@ -58,7 +64,6 @@ If you are on macOS someone out there was kind enough to make [The Pdfalyzer ava
58
64
  sudo apt-get install build-essential libssl-dev libffi-dev rustc
59
65
  ```
60
66
 
61
- -------------
62
67
 
63
68
  # Usage
64
69
 
@@ -74,24 +79,40 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
74
79
 
75
80
  The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
76
81
 
77
- ### Setting Command Line Options Permanently With A `.pdfalyzer` File
78
- When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` first in the current directory and then in the home directory. If it finds a file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
82
+ #### Setting Command Line Options Permanently With A `.pdfalyzer` File
83
+ When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
79
84
 
80
- ### Environment Variables
81
- Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
85
+ 1. the current directory
86
+ 2. the user's home directory
82
87
 
83
- ### Colors And Themes
84
- Run `pdfalyzer_show_color_theme` to see the color theme employed.
88
+ If it finds a `.pdfalyzer` file in either of those places it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to suit your needs. Handy if you find yourself typing the same command line options over and over again.
89
+
90
+ #### Environment Variables
91
+ Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
85
92
 
86
93
  ### Guarantees
87
94
  Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
88
95
 
89
- ## Example Usage
96
+ ## Example Malicious PDF Investigation
90
97
  [BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
91
98
 
92
- -------------
93
99
 
94
- ## Use As A Code Library
100
+ ## Included Command Line Tools
101
+ The Pdfalyzer comes with a few command line tools for doing stuff with PDFs:
102
+
103
+ * `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
104
+ * `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
105
+ * `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
106
+ * `pdfalyzer_show_color_theme` - Run to see the color theme employed in Pdfalyzer's output.
107
+
108
+ Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
109
+
110
+ ```bash
111
+ pipx install pdfalyzer[extract]
112
+ ```
113
+
114
+
115
+ ## As A Python Library
95
116
  For info about setting up a dev environment see [Contributing](#contributing) below.
96
117
 
97
118
  At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class. Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
@@ -200,10 +221,8 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
200
221
 
201
222
  -------------
202
223
 
203
- # PDF Resources
204
- ## Included PDF Tools
205
- The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
206
224
 
225
+ # PDF Resources
207
226
  ## 3rd Party PDF Tools
208
227
  ### Installing Didier Stevens's PDF Analysis Tools
209
228
  Stevens's tools provide comprehensive info about the contents of a PDF, are guaranteed not to trigger the rendering of any malicious content (especially `pdfid.py`), and have been battle tested for well over a decade. It would probably be a good idea to analyze your PDF with his tools before you start working with this one.
@@ -233,6 +252,7 @@ scripts/install_t1utils.sh
233
252
  * [Adobe Type 2 Charstring Format](https://adobe-type-tools.github.io/font-tech-notes/pdfs/5177.Type2.pdf) - Describes the newer Type 2 font operators which are also used in some multiple-master Type 1 fonts.
234
253
 
235
254
  ### Other Stuff
255
+ * [Didier Stevens's PDF tools](http://blog.didierstevens.com/programs/pdf-tools/)
236
256
  * [Didier Stevens's free book about malicious PDFs](https://blog.didierstevens.com/2010/09/26/free-malicious-pdf-analysis-e-book/) - The master of the malicious PDFs wrote a whole book about how to analyze them. It's an old book but the PDF spec was last changed in 2008 so it's still relevant.
237
257
  * [Analyzing Malicious PDFs Cheat Sheet](https://zeltser.com/media/docs/analyzing-malicious-document-files.pdf) - Like it says on the tin. If that link fails there's a copy [here in the repo](doc/analyzing-malicious-document-files.pdf).
238
258
  * [T1Utils Github Repo](https://github.com/kohler/t1utils) - Suite of tools for manipulating Type1 fonts.
@@ -241,6 +261,8 @@ scripts/install_t1utils.sh
241
261
  * [A Curious Exploration of Malicious PDF Documents](https://www.scitepress.org/Papers/2020/89923/89923.pdf) by Julian Lindenhofer, Rene Offenthaler and Martin Pirker, 2020. Overview of all the possible execution paths that can lead to a PDF executing JavaScript, opening local/remote files, or making web requests.
242
262
  * [Malicious PDF Generator](https://github.com/jonaslejon/malicious-pdf) is a well maintained GitHub project that does what it says on the tin.
243
263
  * [PDF is Broken, and so is this file](https://blog.trailofbits.com/2021/02/02/pdf-is-broken-a-justctf-challenge/) is a 2021 report on what happens when you challenge cybersecurity teams to turn PDFs into weapons. (Among other things they managed to create a PDF that launches a webserver when you open it.)
264
+ * [linuxPDF](https://github.com/ading2210/linuxpdf) is a project that managed to embed an entire linux operating system inside a PDF document. The related [DoomPDF](https://github.com/ading2210/doompdf) managed to embed the classic video game Doom in a PDF.
265
+ * [horrifying-pdf-experiments](https://github.com/osnr/horrifying-pdf-experiments) is a repo of horrifying things you can do with PDFs.
244
266
 
245
267
 
246
268
  ## Did The World Really Need Another PDF Tool?
@@ -274,12 +296,6 @@ These are the naming conventions at play in The Pdfalyzer code base:
274
296
  * [`PyPDF` documentation](https://pypdf.readthedocs.io/en/stable/) (latest is 4.x or something so these are the relevant docs for `pdfalyze`)
275
297
 
276
298
 
277
- # TODO
278
- * Highlight decodes with a lot of Javascript keywords
279
- * https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
280
- * https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
281
-
282
-
283
299
  [^1]: The official Adobe PDF specification calls this tree the PDF's "logical structure", which is a good example of nomenclature that does not help those who see it understand anything about what is being described. I can forgive them given that they named this thing back in the 80s, though it's a good example of why picking good names for things at the beginning is so important.
284
300
 
285
301
  [^2]: An exception will be raised if there's any issue placing a node while parsing or if there are any nodes not reachable from the root of the tree at the end of parsing. If there are no exceptions then all internal PDF objects are guaranteed to exist in the tree except in these situations when warnings will be printed:
@@ -1,7 +1,7 @@
1
1
  import code
2
2
  import sys
3
+ from argparse import Namespace
3
4
  from os import environ, getcwd, path
4
- from pathlib import Path
5
5
 
6
6
  from dotenv import load_dotenv
7
7
  from pypdf import PdfWriter
@@ -20,18 +20,20 @@ if not environ.get('INVOKED_BY_PYTEST', False):
20
20
  from rich.columns import Columns
21
21
  from rich.panel import Panel
22
22
  from rich.text import Text
23
- from yaralyzer.helpers.rich_text_helper import prefix_with_plain_text_obj
23
+ from yaralyzer.helpers.rich_text_helper import prefix_with_style
24
24
  from yaralyzer.output.file_export import invoke_rich_export
25
25
  from yaralyzer.output.rich_console import console
26
- from yaralyzer.util.logging import log, log_and_print
26
+ from yaralyzer.util.logging import log_and_print
27
27
 
28
+ from pdfalyzer.decorators.pdf_file import PdfFile
28
29
  from pdfalyzer.helpers.filesystem_helper import file_size_in_mb, set_max_open_files
29
30
  from pdfalyzer.helpers.rich_text_helper import print_highlighted
30
31
  from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
31
32
  from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
32
33
  from pdfalyzer.pdfalyzer import Pdfalyzer
33
- from pdfalyzer.util.argument_parser import (MAX_QUALITY, ask_to_proceed, output_sections, parse_arguments,
34
- parse_combine_pdfs_args)
34
+ from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments
35
+ from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, parse_combine_pdfs_args,
36
+ parse_pdf_page_extraction_args, parse_text_extraction_args)
35
37
  from pdfalyzer.util.pdf_parser_manager import PdfParserManager
36
38
 
37
39
  # For the table shown by running pdfalyzer_show_color_theme
@@ -41,7 +43,7 @@ MAX_THEME_COL_SIZE = 35
41
43
  def pdfalyze():
42
44
  args = parse_arguments()
43
45
  pdfalyzer = Pdfalyzer(args.file_to_scan_path)
44
- pdfalyzer = PdfalyzerPresenter(pdfalyzer)
46
+ presenter = PdfalyzerPresenter(pdfalyzer)
45
47
  output_basepath = None
46
48
 
47
49
  # Binary stream extraction is a special case
@@ -51,9 +53,9 @@ def pdfalyze():
51
53
  log_and_print(f"Binary stream extraction complete, files written to '{args.output_dir}'.\nExiting.\n")
52
54
  sys.exit()
53
55
 
54
- # The method that gets called is related to the argument name. See 'possible_output_sections' list in argument_parser.py
55
- # Analysis exports wrap themselves around the methods that actually generate the analyses
56
- for (arg, method) in output_sections(args, pdfalyzer):
56
+ # The method that gets called is related to the argument name. See 'possible_output_sections' list in
57
+ # argument_parser.py. Analysis exports wrap themselves around the methods that actually generate the analyses.
58
+ for (arg, method) in output_sections(args, presenter):
57
59
  if args.output_dir:
58
60
  output_basepath = PdfalyzerConfig.get_output_basepath(method)
59
61
  print(f'Exporting {arg} data to {output_basepath}...')
@@ -78,23 +80,25 @@ def pdfalyze():
78
80
  if args.interact:
79
81
  code.interact(local=locals())
80
82
 
83
+ pdfalyzer.pdf_filehandle.close()
84
+
81
85
 
82
86
  def pdfalyzer_show_color_theme() -> None:
83
87
  """Utility method to show pdfalyzer's color theme. Invocable with 'pdfalyzer_show_color_theme'."""
84
88
  console.print(Panel('The Pdfalyzer Color Theme', style='reverse'))
85
89
 
86
90
  colors = [
87
- prefix_with_plain_text_obj(name[:MAX_THEME_COL_SIZE], style=str(style)).append(' ')
91
+ prefix_with_style(name[:MAX_THEME_COL_SIZE], style=str(style)).append(' ')
88
92
  for name, style in PDFALYZER_THEME_DICT.items()
89
93
  if name not in ['reset', 'repr_url']
90
94
  ]
91
95
 
92
- console.print(Columns(colors, column_first=True, padding=(0,3)))
96
+ console.print(Columns(colors, column_first=True, padding=(0, 3)))
93
97
 
94
98
 
95
99
  def combine_pdfs():
96
100
  """
97
- Utility method to combine multiple PDFs into one. Invocable with 'combine_pdfs PDF1 [PDF2...]'.
101
+ Script method to combine multiple PDFs into one. Invocable with 'combine_pdfs PDF1 [PDF2...]'.
98
102
  Example: https://github.com/py-pdf/pypdf/blob/main/docs/user/merging-pdfs.md
99
103
  """
100
104
  args = parse_combine_pdfs_args()
@@ -114,7 +118,11 @@ def combine_pdfs():
114
118
  for i, page in enumerate(merger.pages):
115
119
  if args.image_quality < MAX_QUALITY:
116
120
  for j, img in enumerate(page.images):
117
- print_highlighted(f" -> Reducing image #{j + 1} quality on page {i + 1} to {args.image_quality}...", style='dim')
121
+ print_highlighted(
122
+ f" -> Reducing image #{j + 1} quality on page {i + 1} to {args.image_quality}...",
123
+ style='dim'
124
+ )
125
+
118
126
  img.replace(img.image, quality=args.image_quality)
119
127
 
120
128
  print_highlighted(f" -> Compressing page {i + 1}...", style='dim')
@@ -127,3 +135,19 @@ def combine_pdfs():
127
135
  txt = Text('').append(f" -> Wrote ")
128
136
  txt.append(str(file_size_in_mb(args.output_file)), style='cyan').append(" megabytes\n")
129
137
  print_highlighted(txt)
138
+
139
+
140
+ def extract_pdf_pages() -> None:
141
+ """Extract a range of pages from a PDF to a new PDF."""
142
+ args = parse_pdf_page_extraction_args()
143
+ PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)
144
+
145
+
146
+ def extract_pdf_text() -> None:
147
+ """Extract text from a list of files or from all PDF files in a list of directories."""
148
+ args: Namespace = parse_text_extraction_args()
149
+ console.line()
150
+
151
+ for file_path in args.files_to_process:
152
+ PdfFile(file_path).print_extracted_text(args.page_range, args.print_as_parsed)
153
+ console.line(2)