PyPI - pdfalyzer - Versions diffs - 1.16.3__tar.gz → 1.17.9__tar.gz - Mend

pdfalyzer 1.16.3tar.gz → 1.17.9tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

pdfalyzer-1.17.9/.pdfalyzer.example ADDED Viewed

@@ -0,0 +1,66 @@
+# If you place a file called '.pdfalyzer' in your home dir or the current dir, environment variables specified
+# in that .pdfalyzer file will be added to the environment each time pdfalyzer is invoked. (See the `dotenv`
+# package for more details.) This file contains environment variables you can place in .pdfalyzer to configure
+# the application above and beyond providing command line options.  Useful if you want to permanently
+# configure options you tend to reuse (e.g. '--maximize-width') so you can stop remembering to type them.
+#
+# Almost all of the yaralyzer (yes, you read that right - The Pdfalyzer uses The Yaralyzer for all
+# kinds of backend functionality) command line options can be configured in this file by capitalizing them and
+# prefixing 'YARALYZER'. e.g. to configure the --maximize-width option for every invocation, you would set:
+# YARALYZER_MAXIMIZE_WIDTH=True
+#
+# Note that many of these options are actually configuring the yaralyzer, which is a separate tool leveraged
+# by the Pdfalyzer to actually do the work of finding patterns. More info can be found at
+# https://github.com/michelcrypt4d4mus/yaralyzer
+# Expand the width of the output to fit the display window (same as the --maximize-width option)
+#    YARALYZER_MAXIMIZE_WIDTH=True
+# yara-python internal options passed through to yara.set_config() as the stack_size and max_match_data arguments
+#    YARALYZER_STACK_SIZE=10485760
+#    YARALYZER_MAX_MATCH_LENGTH=10737418240
+# Suppress all PDF binary regex matching/scanning/etc
+#    YARALYZER_SUPPRESS_DECODES_TABLE=False
+# Suppress the display of the table showing the encoding assessments given by `chardet.detect()`
+# about a particular chunk of binary data. (The most important data in the chardet confidence table is
+# redundant anyway. Only the low likelihood encodings are hidden from the user.)
+#    YARALYZER_SUPPRESS_CHARDET_TABLE=False
+# Minimum confidence to display an encoding in the chardet results table
+#    YARALYZER_MIN_CHARDET_CONFIDENCE=2.0
+# Configure how many bytes before and after any binary data should be included in scans and visualizations
+#    YARALYZER_SURROUNDING_BYTES=64
+# Size thresholds (in bytes) under/over which pdfalyzer will NOT make attempts to decode a match.
+# Longer byte sequences are for obvious reasons slower to decode by force.
+# It may feel counterintuitive but larger chunks of random binary are also harder to examine and
+# (in my experience) less likely to be meaningful. Consider it - two frontslash characters 20,000 lines apart
+# are more likely to be random than those same frontslashes when placed nearer to each other and
+# in the vicinity of a lot of computerized sigils of internet power like `.`, `+bacd*?`, and other regexes.
+# Keeping the max value number low will do more to affect the speed of the app than anything else you
+# can easily configure.
+#
+#    YARALYZER_MIN_DECODE_LENGTH=1
+#    YARALYZER_MAX_DECODE_LENGTH=256
+# Directory to write application logs to. Must be an absolute path, not a relative one.
+# These logs are not normally written to a file and the default log level means that the standard behavior
+# is to more or less discard them. Be aware that if you configure this variable a few things will change:
+#
+#   1. Logs WILL NOT be written to STDOUT. They will stream ONLY to files in the configured directory.
+#      This is true even with the -D option.
+#   2. The default log_level will be decreased from WARN (extremely spartan) to INFO (fairly verbose).
+#      The -D option, which sets the log level to DEBUG, will be respected whether or not
+#      YARALYZER_LOG_DIR is configured.
+#
+#     YARALYZER_LOG_DIR=/path/to/pdfalyzer/log_dir/
+# Log level
+#     YARALYZER_LOG_LEVEL='INFO'
+# Path to directory containing Didier Stevens's pdf-parser.py. Only required for extracting binary streams to files.
+#     PDFALYZER_PDF_PARSER_PY_PATH=/path/to/pdfparserdotpy/

{pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/CHANGELOG.md RENAMED Viewed

@@ -1,5 +1,79 @@
 # NEXT RELEASE
+### 1.17.9
+* Broaden exception handling in `FontInfo` extraction
+### 1.17.8
+* Handle `AttributeError` in `FontInfo` extraction
+### 1.17.7
+* Bump `pypdf` to 6.1.3 (fixes [#31](https://github.com/michelcrypt4d4mus/pdfalyzer/issues/31)), `PyMuPDF` to 1.26.5
+### 1.17.6
+* Better handling for errors resulting from bugs in PyPDF
+* Properly close file handle when pdfalyzing is complete
+### 1.17.5
+* Fix `PIL` lazy import
+### 1.17.4
+* Make `PIL` a lazy import so installs without `[extract]` extras don't fail
+### 1.17.3
+* Put back `--debug` arg for CLI tools
+### 1.17.2
+* Remove unused `--debug` args for CLI tools
+* Rename `extract_text_from_pdfs` to `extract_pdf_text`
+### 1.17.1
+* Fix issue where `extract_pdf_pages` page ranges were indexed from 0 instead of 1
+### 1.17.0
+* Add `extract_pdf_pages` command line tool (imported from `clown_sort`)
+* Add `extract_text_from_pdfs` command line tool (imported from `clown_sort`)
+### 1.16.14
+* Bump `yaralyzer` to v1.0.9, handle `FileNotFoundError` which is now raised instead of `TypeError`
+* Drop support for python 3.9
+### 1.16.13
+* Bump `yaralyzer` to v1.0.7 and fix reference to yaralyzer's renamed `prefix_with_style()` method
+### 1.16.12
+* Bump `PyPDF` to v6.0.0
+### 1.16.11
+* Fix typo in `combine_pdfs` help
+* Add some more PyPi classifiers
+* Add a `.flake8` config and fix a bunch of style issues
+### 1.16.10
+* Add `Environment :: Console` and `Programming Language :: Python` to pypi classifiers
+* Add `.pdfalyzer.example` to PyPi package
+### 1.16.9
+* Add `Development Status :: 5 - Production/Stable` to pypi classifiers
+### 1.16.8
+* Even more PDF related YARA rules
+* Upgrade `anytree` to 2.13.0
+* Upgrade `yaralyzer` to 1.0.4
+### 1.16.7
+* Lots of new PDF related YARA rules
+* Upgrade `yaralyzer` to 1.0.3
+* Upgrade `pypdf` to 5.9.0
+### 1.16.6
+* Add the creator hash to GIFTEDCROOK rule
+### 1.16.5
+* Add YARA rule for GIFTEDCROOK infostealer PDFs
+### 1.16.4
+* Bump `PyPDF` to 5.7.0
 ### 1.16.3
 * Fix typo in help

{pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/PKG-INFO RENAMED Viewed

@@ -1,39 +1,43 @@
 Metadata-Version: 2.1
 Name: pdfalyzer
-Version: 1.16.3
-Summary: A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
+Version: 1.17.9
+Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
 Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
 License: GPL-3.0-or-later
-Keywords: ascii art,binary,color,font,encoding,malicious pdf,malware,malware analysis,pdf,threat assessment,visualization,yara
+Keywords: ascii art,binary,color,cybersecurity,DFIR,encoding,font,infosec,maldoc,malicious pdf,malware,malware analysis,pdf,pdfs,pdf analysis,pypdf,threat assessment,threat hunting,threat intelligence,threat research,threatintel,visualization,yara
 Author: Michel de Cryptadamus
 Author-email: michel@cryptadamus.com
-Requires-Python: >=3.9,<4.0
+Requires-Python: >=3.10,<4.0
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Information Technology
 Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
+Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Artistic Software
 Classifier: Topic :: Scientific/Engineering :: Visualization
 Classifier: Topic :: Security
-Requires-Dist: anytree (>=2.8,<3.0)
-Requires-Dist: chardet (>=5.0.0,<6.0.0)
-Requires-Dist: pypdf (>=5.0.1,<6.0.0)
-Requires-Dist: python-dotenv (>=0.21.0,<0.22.0)
-Requires-Dist: rich (>=12.5.1,<13.0.0)
-Requires-Dist: rich-argparse-plus (>=0.3.1,<0.4.0)
-Requires-Dist: yaralyzer (>=0.9.4,<0.10.0)
+Provides-Extra: extract
+Requires-Dist: PyMuPDF (>=1.26.5,<2.0.0) ; extra == "extract"
+Requires-Dist: anytree (>=2.13,<3.0)
+Requires-Dist: pypdf (>=6.1.3,<7.0.0)
+Requires-Dist: pytesseract (>=0.3.13,<0.4.0) ; extra == "extract"
+Requires-Dist: yaralyzer (>=1.0.9,<2.0.0)
 Project-URL: Changelog, https://github.com/michelcrypt4d4mus/pdfalyzer/blob/master/CHANGELOG.md
 Project-URL: Documentation, https://github.com/michelcrypt4d4mus/pdfalyzer
 Project-URL: Repository, https://github.com/michelcrypt4d4mus/pdfalyzer
 Description-Content-Type: text/markdown
-<!-- ![Tests](https://img.shields.io/github/workflow/status/michelcrypt4d4mus/pdfalyzer/tests?label=tests)  -->
-![Python Version](https://img.shields.io/pypi/pyversions/pdfalyzer)
 [![GithubRelease](https://img.shields.io/github/v/release/michelcrypt4d4mus/pdfalyzer?sort=semver)](https://pypi.org/project/pdfalyzer/)
-[![GitHub last commit](https://img.shields.io/github/last-commit/michelcrypt4d4mus/pdfalyzer)](https://github.com/michelcrypt4d4mus/pdfalyzer)
 ![PyPiRelease](https://img.shields.io/pypi/v/pdfalyzer)
+[![GitHub last commit](https://img.shields.io/github/last-commit/michelcrypt4d4mus/pdfalyzer)](https://github.com/michelcrypt4d4mus/pdfalyzer)
+![Python Version](https://img.shields.io/pypi/pyversions/pdfalyzer)
 ![Downloads](https://img.shields.io/pypi/dm/pdfalyzer)
+[![Tests](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml/badge.svg)](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml)
 # THE PDFALYZER
@@ -63,10 +67,11 @@ If you're looking for one of these things this may be the tool for you.
 ### What It Don't Do
 This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.
--------------
+If you suspect you are dealing with a malicious PDF you can safely run `pdfalyze` on it. Embedded javascript and `/OpenAction` nodes etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
-# Installation
+# Installation
+#### All Platforms
 Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `pip3` / `pip` should also work.
 ```sh
 pipx install pdfalyzer
@@ -74,7 +79,12 @@ pipx install pdfalyzer
 See [PyPDF installation notes](https://github.com/py-pdf/pypdf#installation) about `PyCryptodome` if you plan to `pdfalyze` any files that use AES encryption.
-If you are on macOS someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so `brew install pdfalyzer` should work.
+#### macOS Homebrew
+If you are on macOS and use `homebrew` someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so this should work:
+```sh
+brew install pdfalyzer
+```
 ### Troubleshooting
 1. If you used `pip3` instead of `pipx` and have an issue you should try to install with `pipx`.
@@ -88,7 +98,6 @@ If you are on macOS someone out there was kind enough to make [The Pdfalyzer ava
    sudo apt-get install build-essential libssl-dev libffi-dev rustc
    ```
--------------
 # Usage
@@ -104,24 +113,40 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
 The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
-### Setting Command Line Options Permanently With A `.pdfalyzer` File
-When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` first in the current directory and then in the home directory. If it finds a file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
+#### Setting Command Line Options Permanently With A `.pdfalyzer` File
+When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
-### Environment Variables
-Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
+1. the current directory
+2. the user's home directory
-### Colors And Themes
-Run `pdfalyzer_show_color_theme` to see the color theme employed.
+If it finds a `.pdfalyzer` file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
+#### Environment Variables
+Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
 ### Guarantees
 Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
-## Example Usage
+## Example Malicious PDF Investigation
 [BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
--------------
-## Use As A Code Library
+## Included Command Line Tools
+The Pdfalyzer comes with a few command line tools for doing stuff with PDFs:
+* `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
+* `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
+* `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
+* `pdfalyzer_show_color_theme` - Run to see the color theme employed in Pdfalyzer's output.
+Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
+```bash
+pipx install pdfalyzer[extract]
+```
+## As A Python Library
 For info about setting up a dev environment see [Contributing](#contributing) below.
 At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class.  Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
@@ -230,10 +255,8 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
 -------------
-# PDF Resources
-## Included PDF Tools
-The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
+# PDF Resources
 ## 3rd Party PDF Tools
 ### Installing Didier Stevens's PDF Analysis Tools
 Stevens's tools provide comprehensive info about the contents of a PDF, are guaranteed not to trigger the rendering of any malicious content (especially `pdfid.py`), and have been battle tested for well over a decade. It would probably be a good idea to analyze your PDF with his tools before you start working with this one.
@@ -263,6 +286,7 @@ scripts/install_t1utils.sh
 * [Adobe Type 2 Charstring Format](https://adobe-type-tools.github.io/font-tech-notes/pdfs/5177.Type2.pdf) - Describes the newer Type 2 font operators which are also used in some multiple-master Type 1 fonts.
 ### Other Stuff
+* [Didier Stevens's PDF tools](http://blog.didierstevens.com/programs/pdf-tools/)
 * [Didier Stevens's free book about malicious PDFs](https://blog.didierstevens.com/2010/09/26/free-malicious-pdf-analysis-e-book/) - The master of the malicious PDFs wrote a whole book about how to analyze them. It's an old book but the PDF spec was last changed in 2008 so it's still relevant.
 * [Analyzing Malicious PDFs Cheat Sheet](https://zeltser.com/media/docs/analyzing-malicious-document-files.pdf) - Like it says on the tin. If that link fails there's a copy [here in the repo](doc/analyzing-malicious-document-files.pdf).
 * [T1Utils Github Repo](https://github.com/kohler/t1utils) - Suite of tools for manipulating Type1 fonts.
@@ -271,6 +295,8 @@ scripts/install_t1utils.sh
 * [A Curious Exploration of Malicious PDF Documents](https://www.scitepress.org/Papers/2020/89923/89923.pdf) by Julian Lindenhofer, Rene Offenthaler and Martin Pirker, 2020. Overview of all the possible execution paths that can lead to a PDF executing JavaScript, opening local/remote files, or making web requests.
 * [Malicious PDF Generator](https://github.com/jonaslejon/malicious-pdf) is a well maintained GitHub project that does what it says on the tin.
 * [PDF is Broken, and so is this file](https://blog.trailofbits.com/2021/02/02/pdf-is-broken-a-justctf-challenge/) is a 2021 report on what happens when you challenge cybersecurity teams to turn PDFs into weapons. (Among other things they managed to create a PDF that launches a webserver when you open it.)
+* [linuxPDF](https://github.com/ading2210/linuxpdf) is a project that managed to embed an entire linux operating system inside a PDF document. The related [DoomPDF](https://github.com/ading2210/doompdf) managed to embed the classic video game Doom in a PDF.
+* [horrifying-pdf-experiments](https://github.com/osnr/horrifying-pdf-experiments) is a repo of horrifying things you can do with PDFs.
 ## Did The World Really Need Another PDF Tool?
@@ -304,12 +330,6 @@ These are the naming conventions at play in The Pdfalyzer code base:
 * [`PyPDF` documentation](https://pypdf.readthedocs.io/en/stable/) (latest is 4.x or something so these are the relevant docs for `pdfalyze`)
-# TODO
-* Highlight decodes with a lot of Javascript keywords
-* https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
-* https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
 [^1]: The official Adobe PDF specification calls this tree the PDF's "logical structure", which is a good example of nomenclature that does not help those who see it understand anything about what is being described. I can forgive them given that they named this thing back in the 80s, though it's a good example of why picking good names for things at the beginning is so important.
 [^2]: An exception will be raised if there's any issue placing a node while parsing or if there are any nodes not reachable from the root of the tree at the end of parsing. If there are no exceptions then all internal PDF objects are guaranteed to exist in the tree except in these situations when warnings will be printed:

{pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/README.md RENAMED Viewed

@@ -1,9 +1,9 @@
-<!-- ![Tests](https://img.shields.io/github/workflow/status/michelcrypt4d4mus/pdfalyzer/tests?label=tests)  -->
-![Python Version](https://img.shields.io/pypi/pyversions/pdfalyzer)
 [![GithubRelease](https://img.shields.io/github/v/release/michelcrypt4d4mus/pdfalyzer?sort=semver)](https://pypi.org/project/pdfalyzer/)
-[![GitHub last commit](https://img.shields.io/github/last-commit/michelcrypt4d4mus/pdfalyzer)](https://github.com/michelcrypt4d4mus/pdfalyzer)
 ![PyPiRelease](https://img.shields.io/pypi/v/pdfalyzer)
+[![GitHub last commit](https://img.shields.io/github/last-commit/michelcrypt4d4mus/pdfalyzer)](https://github.com/michelcrypt4d4mus/pdfalyzer)
+![Python Version](https://img.shields.io/pypi/pyversions/pdfalyzer)
 ![Downloads](https://img.shields.io/pypi/dm/pdfalyzer)
+[![Tests](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml/badge.svg)](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml)
 # THE PDFALYZER
@@ -33,10 +33,11 @@ If you're looking for one of these things this may be the tool for you.
 ### What It Don't Do
 This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.
--------------
+If you suspect you are dealing with a malicious PDF you can safely run `pdfalyze` on it. Embedded javascript and `/OpenAction` nodes etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
-# Installation
+# Installation
+#### All Platforms
 Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `pip3` / `pip` should also work.
 ```sh
 pipx install pdfalyzer
@@ -44,7 +45,12 @@ pipx install pdfalyzer
 See [PyPDF installation notes](https://github.com/py-pdf/pypdf#installation) about `PyCryptodome` if you plan to `pdfalyze` any files that use AES encryption.
-If you are on macOS someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so `brew install pdfalyzer` should work.
+#### macOS Homebrew
+If you are on macOS and use `homebrew` someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so this should work:
+```sh
+brew install pdfalyzer
+```
 ### Troubleshooting
 1. If you used `pip3` instead of `pipx` and have an issue you should try to install with `pipx`.
@@ -58,7 +64,6 @@ If you are on macOS someone out there was kind enough to make [The Pdfalyzer ava
    sudo apt-get install build-essential libssl-dev libffi-dev rustc
    ```
--------------
 # Usage
@@ -74,24 +79,40 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
 The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
-### Setting Command Line Options Permanently With A `.pdfalyzer` File
-When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` first in the current directory and then in the home directory. If it finds a file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
+#### Setting Command Line Options Permanently With A `.pdfalyzer` File
+When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
-### Environment Variables
-Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
+1. the current directory
+2. the user's home directory
-### Colors And Themes
-Run `pdfalyzer_show_color_theme` to see the color theme employed.
+If it finds a `.pdfalyzer` file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
+#### Environment Variables
+Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
 ### Guarantees
 Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
-## Example Usage
+## Example Malicious PDF Investigation
 [BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
--------------
-## Use As A Code Library
+## Included Command Line Tools
+The Pdfalyzer comes with a few command line tools for doing stuff with PDFs:
+* `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
+* `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
+* `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
+* `pdfalyzer_show_color_theme` - Run to see the color theme employed in Pdfalyzer's output.
+Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
+```bash
+pipx install pdfalyzer[extract]
+```
+## As A Python Library
 For info about setting up a dev environment see [Contributing](#contributing) below.
 At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class.  Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
@@ -200,10 +221,8 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
 -------------
-# PDF Resources
-## Included PDF Tools
-The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
+# PDF Resources
 ## 3rd Party PDF Tools
 ### Installing Didier Stevens's PDF Analysis Tools
 Stevens's tools provide comprehensive info about the contents of a PDF, are guaranteed not to trigger the rendering of any malicious content (especially `pdfid.py`), and have been battle tested for well over a decade. It would probably be a good idea to analyze your PDF with his tools before you start working with this one.
@@ -233,6 +252,7 @@ scripts/install_t1utils.sh
 * [Adobe Type 2 Charstring Format](https://adobe-type-tools.github.io/font-tech-notes/pdfs/5177.Type2.pdf) - Describes the newer Type 2 font operators which are also used in some multiple-master Type 1 fonts.
 ### Other Stuff
+* [Didier Stevens's PDF tools](http://blog.didierstevens.com/programs/pdf-tools/)
 * [Didier Stevens's free book about malicious PDFs](https://blog.didierstevens.com/2010/09/26/free-malicious-pdf-analysis-e-book/) - The master of the malicious PDFs wrote a whole book about how to analyze them. It's an old book but the PDF spec was last changed in 2008 so it's still relevant.
 * [Analyzing Malicious PDFs Cheat Sheet](https://zeltser.com/media/docs/analyzing-malicious-document-files.pdf) - Like it says on the tin. If that link fails there's a copy [here in the repo](doc/analyzing-malicious-document-files.pdf).
 * [T1Utils Github Repo](https://github.com/kohler/t1utils) - Suite of tools for manipulating Type1 fonts.
@@ -241,6 +261,8 @@ scripts/install_t1utils.sh
 * [A Curious Exploration of Malicious PDF Documents](https://www.scitepress.org/Papers/2020/89923/89923.pdf) by Julian Lindenhofer, Rene Offenthaler and Martin Pirker, 2020. Overview of all the possible execution paths that can lead to a PDF executing JavaScript, opening local/remote files, or making web requests.
 * [Malicious PDF Generator](https://github.com/jonaslejon/malicious-pdf) is a well maintained GitHub project that does what it says on the tin.
 * [PDF is Broken, and so is this file](https://blog.trailofbits.com/2021/02/02/pdf-is-broken-a-justctf-challenge/) is a 2021 report on what happens when you challenge cybersecurity teams to turn PDFs into weapons. (Among other things they managed to create a PDF that launches a webserver when you open it.)
+* [linuxPDF](https://github.com/ading2210/linuxpdf) is a project that managed to embed an entire linux operating system inside a PDF document. The related [DoomPDF](https://github.com/ading2210/doompdf) managed to embed the classic video game Doom in a PDF.
+* [horrifying-pdf-experiments](https://github.com/osnr/horrifying-pdf-experiments) is a repo of horrifying things you can do with PDFs.
 ## Did The World Really Need Another PDF Tool?
@@ -274,12 +296,6 @@ These are the naming conventions at play in The Pdfalyzer code base:
 * [`PyPDF` documentation](https://pypdf.readthedocs.io/en/stable/) (latest is 4.x or something so these are the relevant docs for `pdfalyze`)
-# TODO
-* Highlight decodes with a lot of Javascript keywords
-* https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
-* https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
 [^1]: The official Adobe PDF specification calls this tree the PDF's "logical structure", which is a good example of nomenclature that does not help those who see it understand anything about what is being described. I can forgive them given that they named this thing back in the 80s, though it's a good example of why picking good names for things at the beginning is so important.
 [^2]: An exception will be raised if there's any issue placing a node while parsing or if there are any nodes not reachable from the root of the tree at the end of parsing. If there are no exceptions then all internal PDF objects are guaranteed to exist in the tree except in these situations when warnings will be printed:

{pdfalyzer-1.16.3 → pdfalyzer-1.17.9}/pdfalyzer/__init__.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import code
 import sys
+from argparse import Namespace
 from os import environ, getcwd, path
-from pathlib import Path
 from dotenv import load_dotenv
 from pypdf import PdfWriter
@@ -20,18 +20,20 @@ if not environ.get('INVOKED_BY_PYTEST', False):
 from rich.columns import Columns
 from rich.panel import Panel
 from rich.text import Text
-from yaralyzer.helpers.rich_text_helper import prefix_with_plain_text_obj
+from yaralyzer.helpers.rich_text_helper import prefix_with_style
 from yaralyzer.output.file_export import invoke_rich_export
 from yaralyzer.output.rich_console import console
-from yaralyzer.util.logging import log, log_and_print
+from yaralyzer.util.logging import log_and_print
+from pdfalyzer.decorators.pdf_file import PdfFile
 from pdfalyzer.helpers.filesystem_helper import file_size_in_mb, set_max_open_files
 from pdfalyzer.helpers.rich_text_helper import print_highlighted
 from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
 from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
 from pdfalyzer.pdfalyzer import Pdfalyzer
-from pdfalyzer.util.argument_parser import (MAX_QUALITY, ask_to_proceed, output_sections, parse_arguments,
-     parse_combine_pdfs_args)
+from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments
+from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, parse_combine_pdfs_args,
+     parse_pdf_page_extraction_args, parse_text_extraction_args)
 from pdfalyzer.util.pdf_parser_manager import PdfParserManager
 # For the table shown by running pdfalyzer_show_color_theme
@@ -41,7 +43,7 @@ MAX_THEME_COL_SIZE = 35
 def pdfalyze():
     args = parse_arguments()
     pdfalyzer = Pdfalyzer(args.file_to_scan_path)
-    pdfalyzer = PdfalyzerPresenter(pdfalyzer)
+    presenter = PdfalyzerPresenter(pdfalyzer)
     output_basepath = None
     # Binary stream extraction is a special case
@@ -51,9 +53,9 @@ def pdfalyze():
         log_and_print(f"Binary stream extraction complete, files written to '{args.output_dir}'.\nExiting.\n")
         sys.exit()
-    # The method that gets called is related to the argument name. See 'possible_output_sections' list in argument_parser.py
-    # Analysis exports wrap themselves around the methods that actually generate the analyses
-    for (arg, method) in output_sections(args, pdfalyzer):
+    # The method that gets called is related to the argument name. See 'possible_output_sections' list in
+    # argument_parser.py. Analysis exports wrap themselves around the methods that actually generate the analyses.
+    for (arg, method) in output_sections(args, presenter):
         if args.output_dir:
             output_basepath = PdfalyzerConfig.get_output_basepath(method)
             print(f'Exporting {arg} data to {output_basepath}...')
@@ -78,23 +80,25 @@ def pdfalyze():
     if args.interact:
         code.interact(local=locals())
+    pdfalyzer.pdf_filehandle.close()
 def pdfalyzer_show_color_theme() -> None:
     """Utility method to show pdfalyzer's color theme. Invocable with 'pdfalyzer_show_color_theme'."""
     console.print(Panel('The Pdfalyzer Color Theme', style='reverse'))
     colors = [
-        prefix_with_plain_text_obj(name[:MAX_THEME_COL_SIZE], style=str(style)).append(' ')
+        prefix_with_style(name[:MAX_THEME_COL_SIZE], style=str(style)).append(' ')
         for name, style in PDFALYZER_THEME_DICT.items()
         if name not in ['reset', 'repr_url']
     ]
-    console.print(Columns(colors, column_first=True, padding=(0,3)))
+    console.print(Columns(colors, column_first=True, padding=(0, 3)))
 def combine_pdfs():
     """
-    Utility method to combine multiple PDFs into one. Invocable with 'combine_pdfs PDF1 [PDF2...]'.
+    Script method to combine multiple PDFs into one. Invocable with 'combine_pdfs PDF1 [PDF2...]'.
     Example: https://github.com/py-pdf/pypdf/blob/main/docs/user/merging-pdfs.md
     """
     args = parse_combine_pdfs_args()
@@ -114,7 +118,11 @@ def combine_pdfs():
     for i, page in enumerate(merger.pages):
         if args.image_quality < MAX_QUALITY:
             for j, img in enumerate(page.images):
-                print_highlighted(f"  -> Reducing image #{j + 1} quality on page {i + 1} to {args.image_quality}...", style='dim')
+                print_highlighted(
+                    f"  -> Reducing image #{j + 1} quality on page {i + 1} to {args.image_quality}...",
+                    style='dim'
+                )
                 img.replace(img.image, quality=args.image_quality)
         print_highlighted(f"  -> Compressing page {i + 1}...", style='dim')
@@ -127,3 +135,19 @@ def combine_pdfs():
     txt = Text('').append(f"  -> Wrote ")
     txt.append(str(file_size_in_mb(args.output_file)), style='cyan').append(" megabytes\n")
     print_highlighted(txt)
+def extract_pdf_pages() -> None:
+    """Extract a range of pages from a PDF to a new PDF."""
+    args = parse_pdf_page_extraction_args()
+    PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)
+def extract_pdf_text() -> None:
+    """Extract text from a list of files or from all PDF files in a list of directories."""
+    args: Namespace = parse_text_extraction_args()
+    console.line()
+    for file_path in args.files_to_process:
+        PdfFile(file_path).print_extracted_text(args.page_range, args.print_as_parsed)
+        console.line(2)

pdfalyzer 1.16.3__tar.gz → 1.17.9__tar.gz

pdfalyzer 1.16.3tar.gz → 1.17.9tar.gz