pdfalyzer 1.16.9__tar.gz → 1.16.11__tar.gz
- pdfalyzer-1.16.11/.pdfalyzer.example +66 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/CHANGELOG.md +9 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/PKG-INFO +8 -9
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/README.md +0 -6
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/__init__.py +9 -6
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/binary/binary_scanner.py +16 -12
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/config.py +4 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/decorators/pdf_object_properties.py +13 -12
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/decorators/pdf_tree_node.py +8 -5
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/decorators/pdf_tree_verifier.py +7 -4
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/detection/constants/binary_regexes.py +7 -7
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/detection/yaralyzer_helper.py +1 -3
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/font_info.py +11 -12
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/helpers/filesystem_helper.py +6 -6
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/helpers/pdf_object_helper.py +1 -1
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/helpers/rich_text_helper.py +6 -8
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/output/character_mapping.py +3 -2
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/output/layout.py +1 -1
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/output/pdfalyzer_presenter.py +7 -5
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/output/tables/decoding_stats_table.py +6 -1
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/output/tables/stream_objects_table.py +0 -1
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/pdf_object_relationship.py +12 -12
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/pdfalyzer.py +4 -5
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/util/adobe_strings.py +4 -5
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/util/argument_parser.py +11 -8
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pyproject.toml +53 -28
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/LICENSE +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/__main__.py +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/decorators/document_model_printer.py +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/decorators/indeterminate_node.py +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/detection/constants/javascript_reserved_keywords.py +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/detection/javascript_hunter.py +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/helpers/dict_helper.py +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/helpers/number_helper.py +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/helpers/string_helper.py +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/output/styles/node_colors.py +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/output/styles/rich_theme.py +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/output/tables/font_summary_table.py +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/output/tables/pdf_node_rich_table.py +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/util/debugging.py +1 -1
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/util/exceptions.py +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/util/pdf_parser_manager.py +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/yara_rules/PDF.yara +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/yara_rules/PDF_binary_stream.yara +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/yara_rules/__init.py__ +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/yara_rules/didier_stevens.yara +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/yara_rules/lprat.static_file_analysis.yara +0 -0
- {pdfalyzer-1.16.9 → pdfalyzer-1.16.11}/pdfalyzer/yara_rules/pdf_malware.yara +0 -0
.pdfalyzer.example (new file)
@@ -0,0 +1,66 @@
+ # If you place a file called '.pdfalyzer' in your home dir or the current dir, environment variables specified
+ # in that .pdfalyzer file will be added to the environment each time pdfalyzer is invoked. (See the `dotenv`
+ # package for more details.) This file contains environment variables you can place in .pdfalyzer to configure
+ # the application above and beyond providing command line options. Useful if you want to permanently
+ # configure options you tend to reuse (e.g. '--maximize-width') so you can stop remembering to type them.
+ #
+ # Almost all of the yaralyzer (yes, you read that right - The Pdfalyzer uses The Yaralyzer for all
+ # kinds of backend functionality) command line options can be configured in this file by capitalizing them and
+ # prefixing 'YARALYZER'. e.g. to configure the --maximize-width option for every invocation, you would set:
+ # YARALYZER_MAXIMIZE_WIDTH=True
+ #
+ # Note that many of these options are actually configuring the yaralyzer, which is a separate tool leveraged
+ # by the Pdfalyzer to actually do the work of finding patterns. More info can be found at
+ # https://github.com/michelcrypt4d4mus/yaralyzer
+
+
+
+ # Expand the width of the output to fit the display window (same as the --maximize-width option)
+ # YARALYZER_MAXIMIZE_WIDTH=True
+
+ # yara-python internal options passed through to yara.set_config() as the stack_size and max_match_data arguments
+ # YARALYZER_STACK_SIZE=10485760
+ # YARALYZER_MAX_MATCH_LENGTH=10737418240
+
+ # Suppress all PDF binary regex matching/scanning/etc.
+ # YARALYZER_SUPPRESS_DECODES_TABLE=False
+
+ # Suppress the display of the table showing the encoding assessments given by `chardet.detect()`
+ # about a particular chunk of binary data. (The most important data in the chardet confidence table is
+ # redundant anyway. Only the low likelihood encodings are hidden from the user.)
+ # YARALYZER_SUPPRESS_CHARDET_TABLE=False
+ # Minimum confidence to display an encoding in the chardet results table
+ # YARALYZER_MIN_CHARDET_CONFIDENCE=2.0
+
+ # Configure how many bytes before and after any binary data should be included in scans and visualizations
+ # YARALYZER_SURROUNDING_BYTES=64
+
+ # Size thresholds (in bytes) under/over which pdfalyzer will NOT attempt to decode a match.
+ # Longer byte sequences are, for obvious reasons, slower to decode by force.
+ # It may feel counterintuitive but larger chunks of random binary are also harder to examine and
+ # (in my experience) less likely to be meaningful. Consider it - two frontslash characters 20,000 lines apart
+ # are more likely to be random than those same frontslashes when placed nearer to each other and
+ # in the vicinity of lots of computerized sigils of internet power like `.', `+bacd*?`, and other regexes.
+ # Keeping the max value low will do more to affect the speed of the app than anything else you
+ # can easily configure.
+ #
+ # YARALYZER_MIN_DECODE_LENGTH=1
+ # YARALYZER_MAX_DECODE_LENGTH=256
+
+ # Directory to write application logs to. Must be an absolute path, not a relative one.
+ # These logs are not normally written to a file and the default log level means that the standard behavior
+ # is to more or less discard them. Be aware that if you configure this variable a few things will change:
+ #
+ # 1. Logs WILL NOT be written to STDOUT. They will stream ONLY to files in the configured directory.
+ #    This is true even with the -D option.
+ # 2. The default log_level will be decreased from WARN (extremely spartan) to INFO (fairly verbose).
+ #    The -D option, which sets the log level to DEBUG, will be respected whether or not
+ #    YARALYZER_LOG_DIR is configured.
+ #
+ # YARALYZER_LOG_DIR=/path/to/pdfalyzer/log_dir/
+
+ # Log level
+ # YARALYZER_LOG_LEVEL='INFO'
+
+ # Path to directory containing Didier Stevens's pdf-parser.py. Only required for extracting binary streams to files.
+ # PDFALYZER_PDF_PARSER_PY_PATH=/path/to/pdfparserdotpy/
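The new `.pdfalyzer.example` file describes a `dotenv`-style workflow: a `.pdfalyzer` file in the home or current directory is read on every invocation, its variables are merged into the environment, and yaralyzer options are spelled as capitalized, `YARALYZER_`-prefixed names. A minimal sketch of that loading pattern (illustrative only, not pdfalyzer's actual startup code; the search order and the variable shown are assumptions):

```python
# Illustrative sketch of dotenv-style loading of a '.pdfalyzer' file.
from os import environ
from pathlib import Path

from dotenv import load_dotenv  # provided by the python-dotenv package

for candidate_dir in (Path.cwd(), Path.home()):
    dotenv_file = candidate_dir / '.pdfalyzer'

    if dotenv_file.exists():
        load_dotenv(dotenv_path=dotenv_file)  # variables are merged into os.environ
        break

# A line like 'YARALYZER_MAXIMIZE_WIDTH=True' in .pdfalyzer now looks to the app
# exactly as if it had been exported in the shell before launching pdfalyze.
print(environ.get('YARALYZER_MAXIMIZE_WIDTH'))
```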
CHANGELOG.md
@@ -1,5 +1,14 @@
  # NEXT RELEASE

+ ### 1.16.11
+ * Fix typo in `combine_pdfs` help
+ * Add some more PyPi classifiers
+ * Add a `.flake8` config and fix a bunch of style issues
+
+ ### 1.16.10
+ * Add `Environment :: Console` and `Programming Language :: Python` to pypi classifiers
+ * Add `.pdfalyzer.example` to PyPi package
+
  ### 1.16.9
  * Add `Development Status :: 5 - Production/Stable` to pypi classifiers

PKG-INFO
@@ -1,19 +1,24 @@
  Metadata-Version: 2.1
  Name: pdfalyzer
- Version: 1.16.9
- Summary:
+ Version: 1.16.11
+ Summary: PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
  Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
  License: GPL-3.0-or-later
- Keywords: ascii art,binary,color,cybersecurity,DFIR,encoding,font,infosec,maldoc,malicious pdf,malware,malware analysis,pdf,pdfs,pdf analysis,threat assessment,visualization,yara
+ Keywords: ascii art,binary,color,cybersecurity,DFIR,encoding,font,infosec,maldoc,malicious pdf,malware,malware analysis,pdf,pdfs,pdf analysis,pypdf,threat assessment,visualization,yara
  Author: Michel de Cryptadamus
  Author-email: michel@cryptadamus.com
  Requires-Python: >=3.9.2,<4.0.0
  Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Environment :: Console
  Classifier: Intended Audience :: Information Technology
  Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
+ Classifier: Programming Language :: Python
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.9
  Classifier: Topic :: Artistic Software
  Classifier: Topic :: Scientific/Engineering :: Visualization
  Classifier: Topic :: Security

@@ -304,12 +309,6 @@ These are the naming conventions at play in The Pdfalyzer code base:
  * [`PyPDF` documentation](https://pypdf.readthedocs.io/en/stable/) (latest is 4.x or something so these are the relevant docs for `pdfalyze`)


- # TODO
- * Highlight decodes with a lot of Javascript keywords
- * https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
- * https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
-
-
  [^1]: The official Adobe PDF specification calls this tree the PDF's "logical structure", which is a good example of nomenclature that does not help those who see it understand anything about what is being described. I can forgive them given that they named this thing back in the 80s, though it's a good example of why picking good names for things at the beginning is so important.

  [^2]: An exception will be raised if there's any issue placing a node while parsing or if there are any nodes not reachable from the root of the tree at the end of parsing. If there are no exceptions then all internal PDF objects are guaranteed to exist in the tree except in these situations when warnings will be printed:
README.md
@@ -277,12 +277,6 @@ These are the naming conventions at play in The Pdfalyzer code base:
  * [`PyPDF` documentation](https://pypdf.readthedocs.io/en/stable/) (latest is 4.x or something so these are the relevant docs for `pdfalyze`)


- # TODO
- * Highlight decodes with a lot of Javascript keywords
- * https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
- * https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
-
-
  [^1]: The official Adobe PDF specification calls this tree the PDF's "logical structure", which is a good example of nomenclature that does not help those who see it understand anything about what is being described. I can forgive them given that they named this thing back in the 80s, though it's a good example of why picking good names for things at the beginning is so important.

  [^2]: An exception will be raised if there's any issue placing a node while parsing or if there are any nodes not reachable from the root of the tree at the end of parsing. If there are no exceptions then all internal PDF objects are guaranteed to exist in the tree except in these situations when warnings will be printed:
pdfalyzer/__init__.py
@@ -1,7 +1,6 @@
  import code
  import sys
  from os import environ, getcwd, path
- from pathlib import Path

  from dotenv import load_dotenv
  from pypdf import PdfWriter

@@ -23,7 +22,7 @@ from rich.text import Text
  from yaralyzer.helpers.rich_text_helper import prefix_with_plain_text_obj
  from yaralyzer.output.file_export import invoke_rich_export
  from yaralyzer.output.rich_console import console
- from yaralyzer.util.logging import
+ from yaralyzer.util.logging import log_and_print

  from pdfalyzer.helpers.filesystem_helper import file_size_in_mb, set_max_open_files
  from pdfalyzer.helpers.rich_text_helper import print_highlighted

@@ -51,8 +50,8 @@ def pdfalyze():
      log_and_print(f"Binary stream extraction complete, files written to '{args.output_dir}'.\nExiting.\n")
      sys.exit()

- # The method that gets called is related to the argument name. See 'possible_output_sections' list in
- # Analysis exports wrap themselves around the methods that actually generate the analyses
+ # The method that gets called is related to the argument name. See 'possible_output_sections' list in
+ # argument_parser.py. Analysis exports wrap themselves around the methods that actually generate the analyses.
  for (arg, method) in output_sections(args, pdfalyzer):
      if args.output_dir:
          output_basepath = PdfalyzerConfig.get_output_basepath(method)

@@ -89,7 +88,7 @@ def pdfalyzer_show_color_theme() -> None:
      if name not in ['reset', 'repr_url']
  ]

- console.print(Columns(colors, column_first=True, padding=(0,3)))
+ console.print(Columns(colors, column_first=True, padding=(0, 3)))


  def combine_pdfs():

@@ -114,7 +113,11 @@ def combine_pdfs():
  for i, page in enumerate(merger.pages):
      if args.image_quality < MAX_QUALITY:
          for j, img in enumerate(page.images):
-             print_highlighted(
+             print_highlighted(
+                 f" -> Reducing image #{j + 1} quality on page {i + 1} to {args.image_quality}...",
+                 style='dim'
+             )
+
              img.replace(img.image, quality=args.image_quality)

      print_highlighted(f" -> Compressing page {i + 1}...", style='dim')
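The reformatted `combine_pdfs()` hunk above is built around pypdf's `PdfWriter`: pages are appended from each input file, and when `--image-quality` is below `MAX_QUALITY` every embedded image is re-encoded via `img.replace(img.image, quality=...)`. A stripped-down sketch of that flow (file names and the quality value are hypothetical, and the progress printing is omitted):

```python
# Stripped-down sketch of the pypdf calls combine_pdfs() wraps with progress output.
from pypdf import PdfWriter

merger = PdfWriter()

for pdf_path in ('part_1.pdf', 'part_2.pdf'):  # hypothetical input files
    merger.append(pdf_path)                    # concatenate the pages of each PDF

image_quality = 50  # analogous to the --image-quality option

for page in merger.pages:
    for img in page.images:
        # Re-encode each embedded image at the reduced quality setting
        img.replace(img.image, quality=image_quality)

merger.write('combined.pdf')
```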
pdfalyzer/binary/binary_scanner.py
@@ -12,8 +12,8 @@ from yaralyzer.decoding.bytes_decoder import BytesDecoder
  from yaralyzer.encoding_detection.character_encodings import BOMS
  from yaralyzer.helpers.bytes_helper import hex_string, print_bytes
  from yaralyzer.helpers.string_helper import escape_yara_pattern
- from yaralyzer.output.rich_console import BYTES_NO_DIM, console, console_width
  from yaralyzer.output.regex_match_metrics import RegexMatchMetrics
+ from yaralyzer.output.rich_console import BYTES_NO_DIM, console, console_width
  from yaralyzer.yara.yara_rule_builder import HEX, REGEX, safe_label
  from yaralyzer.yaralyzer import Yaralyzer
  from yaralyzer.util.logging import log

@@ -21,7 +21,7 @@ from yaralyzer.util.logging import log
  from pdfalyzer.config import PdfalyzerConfig
  from pdfalyzer.decorators.pdf_tree_node import PdfTreeNode
  from pdfalyzer.detection.constants.binary_regexes import (BACKTICK, DANGEROUS_PDF_KEYS_TO_HUNT_ONLY_IN_FONTS,
+                                                           DANGEROUS_STRINGS, FRONTSLASH, GUILLEMET, QUOTE_PATTERNS)
  from pdfalyzer.helpers.string_helper import generate_hyphen_line
  from pdfalyzer.output.layout import print_headline_panel, print_section_sub_subheader
  from pdfalyzer.util.adobe_strings import CONTENTS, CURRENTFILE_EEXEC, FONT_FILE_KEYS

@@ -36,7 +36,7 @@ class BinaryScanner:
      self.stream_length = len(_bytes)

      if label is None and isinstance(owner, PdfTreeNode):
+         self.label = owner.__rich__()

      self.suppression_notice_queue = []
      self.regex_extraction_stats = defaultdict(lambda: RegexMatchMetrics())

@@ -86,8 +86,12 @@ class BinaryScanner:
      print_headline_panel(msg, style='dim')
      continue

+ print_section_sub_subheader(
+     f"Forcing Decode of {quote_type.capitalize()} Quoted Strings",
+     style=BYTES_NO_DIM
+ )
+
  quote_pattern = QUOTE_PATTERNS[quote_type]
- print_section_sub_subheader(f"Forcing Decode of {quote_type.capitalize()} Quoted Strings", style=BYTES_NO_DIM)
  yaralyzer = self._quote_yaralyzer(quote_pattern, quote_type)
  self.process_yara_matches(yaralyzer, f"{quote_type}_quoted")

@@ -135,7 +139,7 @@ class BinaryScanner:
  def process_yara_matches(self, yaralyzer: Yaralyzer, pattern: str, force: bool = False) -> None:
      """Decide whether to attempt to decode the matched bytes, track stats. force param ignores min/max length."""
      for bytes_match, decoder in yaralyzer.match_iterator():
-         log.debug(f"Trackings stats for
+         log.debug(f"Trackings match stats for {pattern}, bytes_match: {bytes_match}, is_decodable: {bytes_match.is_decodable()}")  # noqa: E501

          # Send suppressed decodes to a queue and track the reason for the suppression in the stats
          if not (bytes_match.is_decodable() or force):

@@ -145,7 +149,7 @@ class BinaryScanner:
      # Print out any queued suppressed notices before printing non suppressed matches
      self._print_suppression_notices()
      console.print(decoder)
-     self.regex_extraction_stats[pattern].tally_match(decoder)
+     self.regex_extraction_stats[pattern].tally_match(decoder)  # TODO: This call must come after print(decoder)

  self._print_suppression_notices()

@@ -167,12 +171,12 @@ class BinaryScanner:
      return self._pattern_yaralyzer(quote_pattern, REGEX, label, label)

  def _pattern_yaralyzer(
+     self,
+     pattern: str,
+     pattern_type: str,
+     rules_label: Optional[str] = None,
+     pattern_label: Optional[str] = None
+ ) -> Yaralyzer:
      """Build a yaralyzer to scan self.bytes"""
      return Yaralyzer.for_patterns(
          patterns=[escape_yara_pattern(pattern)],
pdfalyzer/config.py
@@ -1,3 +1,7 @@
+ """
+ PdfalyzerConfig object holds the unification of configuration options parsed from the command line
+ as well as those set by environment variables and/or a .pdfalyzer file.
+ """
  import importlib.resources
  from argparse import Namespace
  from os import environ, pardir, path
pdfalyzer/decorators/pdf_object_properties.py
@@ -16,13 +16,14 @@ from pdfalyzer.util.adobe_strings import *

  class PdfObjectProperties:
      """Simple class to extract critical features of a PdfObject."""
+
      def __init__(
+         self,
+         pdf_object: PdfObject,
+         address: str,
+         idnum: int,
+         indirect_object: Optional[IndirectObject] = None
+     ):
          self.idnum = idnum
          self.obj = pdf_object
          self.indirect_object = indirect_object

@@ -57,7 +58,7 @@ class PdfObjectProperties:
      else:
          self.first_address = address

-     log.debug(f"Node ID: {self.idnum}, type: {self.type}, subtype: {self.sub_type}, " +
+     log.debug(f"Node ID: {self.idnum}, type: {self.type}, subtype: {self.sub_type}, " +
                f"label: {self.label}, first_address: {self.first_address}")

  @classmethod

@@ -80,11 +81,11 @@ class PdfObjectProperties:

  @classmethod
  def to_table_row(
+     cls,
+     reference_key: str,
+     obj: PdfObject,
+     is_single_row_table: bool = False
+ ) -> List[Union[Text, str]]:
      """PDF object property at reference_key becomes a formatted 3-tuple for use in Rich tables."""
      with_resolved_refs = cls.resolve_references(reference_key, obj)

pdfalyzer/decorators/pdf_tree_node.py
@@ -6,7 +6,7 @@ Child/parent relationships should be set using the add_child() and set_parent()
  methods and not set directly. (TODO: this could be done better with anytree
  hooks)
  """
- from typing import Callable, List, Optional
+ from typing import Callable, List, Optional

  from anytree import NodeMixin, SymlinkNode
  from pypdf.errors import PdfReadError

@@ -163,11 +163,14 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
      return None
  else:
      address = refs_to_this_node[0].address
+
  # If other node's label doesn't start with a NON_STANDARD_ADDRESS string
- #
- #
+ # AND any of the relationships pointing at this node use something other than a
+ # NON_STANDARD_ADDRESS_NODES string to refer here,
+ # then print a warning about multiple refs.
+ if not (is_prefixed_by_any(from_node.label, NON_STANDARD_ADDRESS_NODES)
+         or
+         all(ref.address in NON_STANDARD_ADDRESS_NODES for ref in refs_to_this_node)):
      refs_to_this_node_str = "\n  ".join([f"{i + 1}. {r}" for i, r in enumerate(refs_to_this_node)])
      msg = f"Multiple refs from {from_node} to {self}:\n  {refs_to_this_node_str}"
      log.warning(msg + f"\nCommon address of refs: {address}")
pdfalyzer/decorators/pdf_tree_verifier.py
@@ -37,7 +37,10 @@ class PdfTreeVerifier:
      log.warning(f"Methodd doesn't check revisions but this doc is generation {self.pdfalyzer.max_generation}")

      # We expect to see all ordinals up to the number of nodes /Trailer claims exist as obj. IDs.
-     missing_node_ids = [
+     missing_node_ids = [
+         i for i in range(1, self.pdfalyzer.pdf_size)
+         if self.pdfalyzer.find_node_by_idnum(i) is None
+     ]

      for idnum in missing_node_ids:
          ref = IndirectObject(idnum, self.pdfalyzer.max_generation, self.pdfalyzer.pdf_reader)

@@ -57,13 +60,13 @@ class PdfTreeVerifier:
      log.error(f"Cannot find ref {ref} in PDF!")
      continue
  elif isinstance(obj, (NumberObject, NameObject)):
-     log.info(f"Obj {idnum} is a {type(obj)} w/value {obj}; if relationshipd by /Length etc. this is a nonissue but maybe worth doublechecking")
+     log.info(f"Obj {idnum} is a {type(obj)} w/value {obj}; if relationshipd by /Length etc. this is a nonissue but maybe worth doublechecking")  # noqa: E501
      continue
  elif not isinstance(obj, dict):
-     log.error(f"Obj {idnum} ({obj}) of type {type(obj)} isn't dict, cannot determine if it should be in tree")
+     log.error(f"Obj {idnum} ({obj}) of type {type(obj)} isn't dict, cannot determine if it should be in tree")  # noqa: E501
      continue
  elif TYPE not in obj:
-     msg = f"Obj {idnum} has no {TYPE} and is not in tree. Either a loose node w/no data or an error in pdfalyzer."
+     msg = f"Obj {idnum} has no {TYPE} and is not in tree. Either a loose node w/no data or an error in pdfalyzer."  # noqa: E501
      msg += f"\nHere's the contents for you to assess:\n{obj}"
      log.warning(msg)
      continue
pdfalyzer/detection/constants/binary_regexes.py
@@ -36,13 +36,13 @@ PARENTHESES = 'parentheses'

  QUOTE_PATTERNS = {
      BACKTICK: '`.+`',
-     BRACKET: '\\[.+\\]',
-     CURLY_BRACKET: '{.+}',
-     DOUBLE_LESS_THAN: '<<.+>>',
+     BRACKET: '\\[.+\\]',          # { 91 [-] 93 }
+     CURLY_BRACKET: '{.+}',        # { 123 [-] 125 }
+     DOUBLE_LESS_THAN: '<<.+>>',   # Hex { 60 60 [-] 62 62 }
      ESCAPED_SINGLE: "\\'.+\\'",
      ESCAPED_DOUBLE: '\\".+\\"',
-     FRONTSLASH: '/.+/',
-     GUILLEMET: 'AB [-] BB',
-     LESS_THAN: '<.+>',
-     PARENTHESES: '\\(.+\\)',
+     FRONTSLASH: '/.+/',           # { 47 [-] 47 }
+     GUILLEMET: 'AB [-] BB',       # Guillemet quotes are not ANSI so require byte pattern
+     LESS_THAN: '<.+>',            # Hex { 60 [-] 62 }
+     PARENTHESES: '\\(.+\\)',      # Hex { 28 [-] 29 }
  }
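The newly commented `QUOTE_PATTERNS` dict pairs each quote style with a regex (or, for guillemets, a YARA hex byte range) used to hunt for quoted byte runs inside streams. Purely as an illustration of what a few of those patterns capture, here they are applied with Python's `re` module to a made-up excerpt; the real scan compiles them into YARA rules through yaralyzer, and the dict keys below are simplified stand-ins for the module's constants:

```python
# Illustration only: what a few QUOTE_PATTERNS-style regexes match in raw bytes.
import re

patterns = {
    'backtick': '`.+`',
    'bracket': '\\[.+\\]',
    'frontslash': '/.+/',
}

stream_excerpt = b'q 0 0 612 792 re W n /JavaScript /JS (alert) [1 2 3] `cmd`'

for name, pattern in patterns.items():
    for match in re.finditer(pattern.encode(), stream_excerpt):
        print(f"{name:>10}: {match.group(0)!r}")
```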
pdfalyzer/detection/yaralyzer_helper.py
@@ -8,8 +8,6 @@ from typing import Optional, Union
  from yaralyzer.config import YaralyzerConfig
  from yaralyzer.yaralyzer import Yaralyzer

- from pdfalyzer.config import PdfalyzerConfig
-
  YARA_RULES_DIR = files('pdfalyzer').joinpath('yara_rules')

  YARA_RULES_FILES = [

@@ -38,7 +36,7 @@ def _build_yaralyzer(scannable: Union[bytes, str], label: Optional[str] = None)
  with as_file(YARA_RULES_DIR.joinpath(YARA_RULES_FILES[2])) as yara2:
      with as_file(YARA_RULES_DIR.joinpath(YARA_RULES_FILES[3])) as yara3:
          with as_file(YARA_RULES_DIR.joinpath(YARA_RULES_FILES[4])) as yara4:
-             # If there is a custom yara_rules
+             # If there is a custom yara_rules arg, use that instead of the files in the yara_rules/ dir
              rules_paths = YaralyzerConfig.args.yara_rules_files or []

              if not YaralyzerConfig.args.no_default_yara_rules:
pdfalyzer/font_info.py
@@ -2,7 +2,6 @@
  Unify font information spread across a bunch of PdfObjects (Font, FontDescriptor,
  and FontFile) into a single class.
  """
-
  from pypdf._cmap import build_char_map, prepare_cm
  from pypdf.generic import IndirectObject, PdfObject
  from rich.text import Text

@@ -11,9 +10,9 @@ from yaralyzer.util.logging import log

  from pdfalyzer.binary.binary_scanner import BinaryScanner
  from pdfalyzer.output.character_mapping import print_character_mapping, print_prepared_charmap
- from pdfalyzer.output.tables.font_summary_table import font_summary_table
  from pdfalyzer.output.layout import print_section_subheader
  from pdfalyzer.output.styles.node_colors import get_label_style
+ from pdfalyzer.output.tables.font_summary_table import font_summary_table
  from pdfalyzer.util.adobe_strings import (FONT, FONT_DESCRIPTOR, FONT_FILE, FONT_LENGTHS, RESOURCES,
                                            SUBTYPE, TO_UNICODE, TYPE, W, WIDTHS)

@@ -143,19 +142,19 @@ class FontInfo:
      console.line()

      # TODO: currently unused
-     def preview_bytes_at_advertised_lengths(self):
+     # def preview_bytes_at_advertised_lengths(self):
+     #     """Show the bytes at the boundaries provided by /Length1, /Length2, and /Length3, if they exist"""
+     #     lengths = self.lengths or []

+     #     if self.lengths is None or len(lengths) <= 1:
+     #         console.print("No length demarcations to preview.", style='grey.dark')

+     #     for i, demarcation in enumerate(lengths[1:]):
+     #         console.print(f"{self.font_file} at /Length{i} ({demarcation}):")
+     #         print(f"\n  Stream before: {self.stream_data[demarcation - FONT_SECTION_PREVIEW_LEN:demarcation + 1]}")
+     #         print(f"\n  Stream after: {self.stream_data[demarcation:demarcation + FONT_SECTION_PREVIEW_LEN]}")

+     #     print(f"\nfinal bytes back from {self.stream_data.lengths[2]} + 10: {self.stream_data[-10 - -f.lengths[2]:]}")

      def __str__(self) -> str:
          return self.display_title
pdfalyzer/helpers/filesystem_helper.py
@@ -15,7 +15,7 @@ OPEN_FILES_BUFFER = 30  # we might have some files open already so we need
  PDF_EXT = '.pdf'

  # TODO: this kind of type alias is not supported until Python 3.12
- #type StrOrPath = Union[str, Path]
+ # type StrOrPath = Union[str, Path]


  def with_pdf_extension(file_path: Union[str, Path]) -> str:

@@ -92,11 +92,11 @@ def set_max_open_files(num_filehandles: int = DEFAULT_MAX_OPEN_FILES) -> tuple[O
      resource.setrlimit(resource.RLIMIT_NOFILE, (soft, hard))
  except (ValueError, resource.error):
      try:
+         hard = soft
+         print_highlighted(f"Retrying setting max open files (soft, hard)=({soft}, {hard})", style='yellow')
+         resource.setrlimit(resource.RLIMIT_NOFILE, (soft, hard))
      except Exception:
+         print_highlighted('Failed to set max open files / ulimit, giving up!', style='error')
+         soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)

  return (soft, hard)
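The `set_max_open_files()` hunk above reformats a retry around `resource.setrlimit`: try to raise the soft `RLIMIT_NOFILE`, retry with the soft value as the hard cap, and finally give up and report whatever limits are in effect. A self-contained sketch of that general pattern (Unix-only; the function name and default are hypothetical, not pdfalyzer's exact implementation):

```python
# Sketch of the ulimit-raising pattern: raise the soft open-files limit if possible.
import resource


def raise_max_open_files(num_filehandles: int = 1024) -> tuple:
    """Try to raise the soft RLIMIT_NOFILE; return the limits actually in effect."""
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)

    # The soft limit can be raised as far as the hard limit without extra privileges.
    if hard == resource.RLIM_INFINITY:
        target = num_filehandles
    else:
        target = min(num_filehandles, hard)

    try:
        resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))
    except (ValueError, resource.error):
        pass  # couldn't raise it; fall through and report the unchanged limits

    return resource.getrlimit(resource.RLIMIT_NOFILE)


print(raise_max_open_files(4096))
```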
pdfalyzer/helpers/pdf_object_helper.py
@@ -6,7 +6,6 @@ from typing import List, Optional
  from pypdf.generic import IndirectObject, PdfObject

  from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
- from pdfalyzer.util.adobe_strings import *


  def pdf_object_id(pdf_object) -> Optional[int]:

@@ -20,6 +19,7 @@ def does_list_have_any_references(_list) -> bool:


  def _sort_pdf_object_refs(refs: List[PdfObjectRelationship]) -> List[PdfObjectRelationship]:
+     """Sort a list of PdfObjectRelationship objects by their to_obj's idnum. Only used by pytest."""
      return sorted(refs, key=lambda ref: ref.to_obj.idnum)

pdfalyzer/helpers/rich_text_helper.py
@@ -1,13 +1,11 @@
  """
- Functions for miscellaneous Rich text/string operations.
+ Functions for miscellaneous Rich text/string pretty printing operations.
  """
  from typing import List, Union

  from pypdf.generic import PdfObject
  from rich.console import Console
- from rich.highlighter import RegexHighlighter, JSONHighlighter
  from rich.text import Text
- from yaralyzer.output.rich_console import console

  from pdfalyzer.helpers.pdf_object_helper import pypdf_class_name
  from pdfalyzer.output.styles.node_colors import get_label_style, get_class_style_italic

@@ -22,11 +20,11 @@ def print_highlighted(msg: Union[str, Text], **kwargs) -> None:


  def quoted_text(
+     _string: str,
+     style: str = '',
+     quote_char_style: str = 'white',
+     quote_char: str = "'"
+ ) -> Text:
      """Wrap _string in 'quote_char'. Style 'quote_char' with 'quote_char_style'."""
      quote_char_txt = Text(quote_char, style=quote_char_style)
      txt = quote_char_txt + Text(_string, style=style) + quote_char_txt
pdfalyzer/output/character_mapping.py
@@ -8,6 +8,7 @@ from yaralyzer.helpers.bytes_helper import print_bytes
  from yaralyzer.output.rich_console import console
  from yaralyzer.util.logging import log

+ # from pdfalyzer.font_info import FontInfo  # Causes circular import
  from pdfalyzer.helpers.rich_text_helper import quoted_text
  from pdfalyzer.helpers.string_helper import pp
  from pdfalyzer.output.layout import print_headline_panel, subheading_width

@@ -17,7 +18,7 @@ CHARMAP_TITLE_PADDING = (1, 0, 0, 2)
  CHARMAP_PADDING = (0, 2, 0, 10)


- def print_character_mapping(font: 'FontInfo') -> None:
+ def print_character_mapping(font: 'FontInfo') -> None:  # noqa: F821
      """Prints the character mapping extracted by PyPDF._charmap in tidy columns"""
      if font.character_mapping is None or len(font.character_mapping) == 0:
          log.info(f"No character map found in {font}")

@@ -37,7 +38,7 @@ def print_character_mapping(font: 'FontInfo') -> None:
      console.line()


- def print_prepared_charmap(font: 'FontInfo'):
+ def print_prepared_charmap(font: 'FontInfo'):  # noqa: F821
      """Prints the prepared_charmap returned by PyPDF."""
      if font.prepared_char_map is None:
          log.info(f"No prepared_charmap found in {font}")
pdfalyzer/output/pdfalyzer_presenter.py
@@ -23,11 +23,13 @@ from pdfalyzer.detection.yaralyzer_helper import get_bytes_yaralyzer, get_file_y
  from pdfalyzer.helpers.string_helper import pp
  from pdfalyzer.output.layout import (print_fatal_error_panel, print_section_header, print_section_subheader,
                                       print_section_sub_subheader)
+ from pdfalyzer.output.tables.decoding_stats_table import build_decoding_stats_table
  from pdfalyzer.output.tables.pdf_node_rich_table import generate_rich_tree, get_symlink_representation
  from pdfalyzer.output.tables.stream_objects_table import stream_objects_table
- from pdfalyzer.output.tables.decoding_stats_table import build_decoding_stats_table
  from pdfalyzer.pdfalyzer import Pdfalyzer
- from pdfalyzer.util.adobe_strings import *
+ # from pdfalyzer.util.adobe_strings import *
+
+ INTERNAL_YARA_ERROR_MSG = "Internal YARA error! YARA's error codes can be checked here: https://github.com/VirusTotal/yara/blob/master/libyara/include/yara/error.h"  # noqa: E501


  class PdfalyzerPresenter:

@@ -91,7 +93,6 @@ class PdfalyzerPresenter:
      2. Check for (and force decode) dangerous PDF instructions like /JavaScript and /OpenAction
      3. Check for (and force decode) any BOMs (byte order marks)
      4. Check for (and force decode) any sequences of bytes between quotes
-
      """
      print_section_header(f'Binary Stream Analysis / Extraction')
      console.print(self._stream_objects_table())

@@ -109,6 +110,7 @@ class PdfalyzerPresenter:
      log.warning(msg)
      node_stream_bytes = node_stream_bytes.encode()

+ console.line()
  print_section_subheader(f"{escape(str(node))} Summary and Analysis", style=f"{BYTES_HIGHLIGHT} reverse")
  binary_scanner = BinaryScanner(node_stream_bytes, node)
  console.print(bytes_hashes_table(binary_scanner.bytes))

@@ -130,9 +132,9 @@ class PdfalyzerPresenter:

  try:
      self.yaralyzer.yaralyze()
- except yara.Error
+ except yara.Error:
      console.print_exception()
-     print_fatal_error_panel(
+     print_fatal_error_panel(INTERNAL_YARA_ERROR_MSG)
      return

  YaralyzerConfig.args.standalone_mode = False
pdfalyzer/output/tables/decoding_stats_table.py
@@ -1,9 +1,13 @@
+ """
+ Helper functions for building a table that summarizes the decoding attempts made on binary data.
+ """
  from numbers import Number

  from rich.table import Table
  from rich.text import Text
  from yaralyzer.helpers.rich_text_helper import CENTER, na_txt, prefix_with_plain_text_obj

+ from pdfalyzer.binary.binary_scanner import BinaryScanner
  from pdfalyzer.helpers.rich_text_helper import pct_txt
  from pdfalyzer.output.layout import generate_subtable, half_width, pad_header

@@ -13,7 +17,8 @@ NOT_FOUND_MSG = Text('(not found)', style='grey.dark_italic')
  REGEX_SUBTABLE_COLS = ['Metric', 'Value']
  DECODES_SUBTABLE_COLS = ['Encoding', '#', 'Decoded', '#', 'Forced', '#', 'Failed']

+
+ def build_decoding_stats_table(scanner: BinaryScanner) -> Table:
      """Diplay aggregate results on the decoding attempts we made on subsets of scanner.bytes"""
      stats_table = _new_decoding_stats_table(scanner.label.plain if scanner.label else '')
      regexes_not_found_in_stream = []
pdfalyzer/pdf_object_relationship.py
@@ -14,12 +14,12 @@ INCOMPARABLE_PROPS = ['from_obj', 'to_obj']

  class PdfObjectRelationship:
      def __init__(
+         self,
+         from_node: 'PdfTreeNode',
+         to_obj: IndirectObject,
+         reference_key: str,
+         address: str
+     ) -> None:
          """
          In the case of easy key/value pairs the reference_key and the address are the same but
          for more complicated references the address will be the reference_key plus sub references.

@@ -53,12 +53,12 @@ class PdfObjectRelationship:

  @classmethod
  def build_node_references(
+     cls,
+     from_node: 'PdfTreeObject',
+     from_obj: Optional[PdfObject] = None,
+     ref_key: Optional[Union[str, int]] = None,
+     address: Optional[str] = None
+ ) -> List['PdfObjectRelationship']:
      """
      Builds list of relationships 'from_node.obj' contains referencing other PDF objects.
      Initially called with single arg from_node. Other args are employed when recursable
pdfalyzer/pdfalyzer.py
@@ -77,7 +77,7 @@ class Pdfalyzer:
      nodes_to_walk_next = [self._add_relationship_to_pdf_tree(r) for r in node.references_to_other_nodes()]
      node.all_references_processed = True

-     for next_node in [n for n in nodes_to_walk_next if not (n is None or n.all_references_processed)
+     for next_node in [n for n in nodes_to_walk_next if not (n is None or n.all_references_processed)]:
          if not next_node.all_references_processed:
              self.walk_node(next_node)

@@ -105,7 +105,7 @@ class Pdfalyzer:

  def stream_nodes(self) -> List[PdfTreeNode]:
      """List of actual nodes (not SymlinkNodes) containing streams sorted by PDF object ID"""
-     stream_filter = lambda node: node.contains_stream() and not isinstance(node, SymlinkNode)
+     stream_filter = lambda node: node.contains_stream() and not isinstance(node, SymlinkNode)  # noqa: E731
      return sorted(findall(self.pdf_tree, stream_filter), key=lambda r: r.idnum)

  def _add_relationship_to_pdf_tree(self, relationship: PdfObjectRelationship) -> Optional[PdfTreeNode]:

@@ -114,7 +114,7 @@ class Pdfalyzer:
      placed in the PDF node processing queue.
      """
      log.info(f'Assessing relationship {relationship}...')
-     was_seen_before = (relationship.to_obj.idnum in self.nodes_encountered)
+     was_seen_before = (relationship.to_obj.idnum in self.nodes_encountered)  # Must come before _build_or_find()
      from_node = relationship.from_node
      to_node = self._build_or_find_node(relationship.to_obj, relationship.address)
      self.max_generation = max([self.max_generation, relationship.to_obj.generation or 0])

@@ -133,7 +133,7 @@ class Pdfalyzer:
      from_node.set_parent(to_node)
  elif to_node.parent is not None:
      # Some StructElem nodes I have seen use /P or /K despire not being the real parent/child
-     if relationship.from_node.type.startswith(STRUCT_ELEM)
+     if relationship.from_node.type.startswith(STRUCT_ELEM):
          log.info(f"{relationship} fail: {to_node} parent is already {to_node.parent}")
      else:
          log.warning(f"{relationship} fail: {to_node} parent is already {to_node.parent}")

@@ -173,7 +173,6 @@ class Pdfalyzer:

  def _resolve_indeterminate_nodes(self) -> None:
      """Place all indeterminate nodes in the tree."""
-     #set_log_level('INFO')
      indeterminate_nodes = [self.nodes_encountered[idnum] for idnum in self.indeterminate_ids]
      indeterminate_nodes_string = "\n   ".join([f"{node}" for node in indeterminate_nodes])
      log.info(f"Resolving {len(indeterminate_nodes)} indeterminate nodes: {indeterminate_nodes_string}")
pdfalyzer/util/adobe_strings.py
@@ -1,7 +1,6 @@
  """
  String constants specified in the Adobe specs for PDFs, fonts, etc.
  """
-
  from pypdf.constants import (CatalogDictionary, ImageAttributes, PageAttributes,
                               PagesAttributes, Resources)

@@ -117,20 +116,20 @@ NON_TREE_REFERENCES = [

  # Some PdfObjects can't be properly placed in the tree until the entire tree is parsed
  INDETERMINATE_REF_KEYS = [
-     ANNOTS,
+     ANNOTS,       # At least when it appears in a page
      COLOR_SPACE,
      D,
      DEST,
      EXT_G_STATE,
-     FIELDS,
+     FIELDS,       # At least for /AcroForm
      FIRST,
      FONT,
      NAMES,
      OPEN_ACTION,
-     P,
+     P,            # At least for widgets...
      RESOURCES,
      XOBJECT,
-     UNLABELED,
+     UNLABELED,    # TODO: this might be wrong? maybe this is where the /Resources actually live?
  ]

  INDETERMINATE_PREFIXES = [p for p in INDETERMINATE_REF_KEYS if len(p) > 2]
pdfalyzer/util/argument_parser.py
@@ -1,5 +1,8 @@
+ """
+ Parse command line arguments for pdfalyzer and construct the PdfalyzerConfig object.
+ """
  import sys
- from argparse import
+ from argparse import ArgumentParser, Namespace
  from collections import namedtuple
  from functools import partial, update_wrapper
  from importlib.metadata import version

@@ -89,9 +92,9 @@ select.add_argument('-c', '--counts', action='store_true',
      help='show counts of some of the properties of the objects in the PDF')

  select.add_argument('-s', '--streams',
-     help="scan all the PDF's decoded/decrypted streams for sus content as well as any YARA rule matches. " +
-          "brute force is involved; output is verbose. a single OBJ_ID can be optionally provided to " +
-          "limit the output to a single internal object. try '-s -- [OTHERARGS]' if you run into an " +
+     help="scan all the PDF's decoded/decrypted streams for sus content as well as any YARA rule matches. " +
+          "brute force is involved; output is verbose. a single OBJ_ID can be optionally provided to " +
+          "limit the output to a single internal object. try '-s -- [OTHERARGS]' if you run into an " +
           "argument position related piccadilly.",
      nargs='?',
      const=ALL_STREAMS,

@@ -99,7 +102,7 @@ select.add_argument('-s', '--streams',
      type=int)

  select.add_argument('--extract-quoted',
-     help="extract and force decode all bytes found between this kind of quotation marks " +
+     help="extract and force decode all bytes found between this kind of quotation marks " +
           "(requires --streams. can be specified more than once)",
      choices=list(QUOTE_PATTERNS.keys()),
      dest='extract_quoteds',

@@ -144,7 +147,7 @@ def parse_arguments():
      args.output_dir = args.output_dir or getcwd()
      file_prefix = (args.file_prefix + '__') if args.file_prefix else ''
      args.file_suffix = ('_' + args.file_suffix) if args.file_suffix else ''
-     args.output_basename =
+     args.output_basename = f"{file_prefix}{path.basename(args.file_to_scan_path)}"
  elif args.output_dir:
      log.warning('--output-dir provided but no export option was chosen')

@@ -200,8 +203,8 @@ MAX_QUALITY = 10

  combine_pdfs_parser = ArgumentParser(
      description="Combine multiple PDFs into one.",
-     epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were"
-            " page
+     epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" +
+            " page numbers prior to merging.",
      formatter_class=RichHelpFormatterPlus)

  combine_pdfs_parser.add_argument('pdfs',
pyproject.toml
@@ -1,13 +1,35 @@
  [tool.poetry]
  name = "pdfalyzer"
- version = "1.16.9"
- description = "
+ version = "1.16.11"
+ description = "PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more."
  authors = ["Michel de Cryptadamus <michel@cryptadamus.com>"]
  license = "GPL-3.0-or-later"
  readme = "README.md"
+ documentation = "https://github.com/michelcrypt4d4mus/pdfalyzer"
  homepage = "https://github.com/michelcrypt4d4mus/pdfalyzer"
  repository = "https://github.com/michelcrypt4d4mus/pdfalyzer"

+ classifiers = [
+     "Development Status :: 5 - Production/Stable",
+     "Environment :: Console",
+     "Intended Audience :: Information Technology",
+     "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
+     "Programming Language :: Python",
+     "Programming Language :: Python :: 3.9",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Programming Language :: Python :: 3.13",
+     "Topic :: Artistic Software",
+     "Topic :: Security",
+     "Topic :: Scientific/Engineering :: Visualization",
+ ]
+
+ include = [
+     "CHANGELOG.md",
+     "LICENSE",
+     ".pdfalyzer.example"
+ ]

  keywords = [
      "ascii art",

@@ -25,56 +47,59 @@ keywords = [
      "pdf",
      "pdfs",
      "pdf analysis",
+     "pypdf",
      "threat assessment",
      "visualization",
      "yara"
  ]

- classifiers = [
-     "Development Status :: 5 - Production/Stable",
-     "Intended Audience :: Information Technology",
-     "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
-     "Topic :: Artistic Software",
-     "Topic :: Security",
-     "Topic :: Scientific/Engineering :: Visualization",
- ]
-
- include = [
-     "CHANGELOG.md",
-     "LICENSE"
- ]
-
  packages = [
      { include = "pdfalyzer" }
  ]

- [tool.poetry.urls]
- Changelog = "https://github.com/michelcrypt4d4mus/pdfalyzer/blob/master/CHANGELOG.md"
-
-
- [tool.poetry.scripts]
- combine_pdfs = 'pdfalyzer:combine_pdfs'
- pdfalyze = 'pdfalyzer:pdfalyze'
- pdfalyzer_show_color_theme = 'pdfalyzer:pdfalyzer_show_color_theme'
-

+ #####################
+ #   Dependencies    #
+ #####################
  [tool.poetry.dependencies]
  python = "^3.9.2"
  anytree = "~=2.13"
  pypdf = "^5.9.0"
  yaralyzer = "^1.0.4"

-
  [tool.poetry.group.dev.dependencies]
+ flake8 = "^7.3.0"
  pytest = "^7.1.2"
  pytest-skip-slow = "^0.0.3"


+ #############
+ #  Scripts  #
+ #############
+ [tool.poetry.scripts]
+ combine_pdfs = 'pdfalyzer:combine_pdfs'
+ pdfalyze = 'pdfalyzer:pdfalyze'
+ pdfalyzer_show_color_theme = 'pdfalyzer:pdfalyzer_show_color_theme'
+
+
+ #####################
+ #    PyPi URLs      #
+ #####################
+ [tool.poetry.urls]
+ Changelog = "https://github.com/michelcrypt4d4mus/pdfalyzer/blob/master/CHANGELOG.md"
+
+
+ ###############################
+ #     Poetry build system     #
+ ###############################
  [build-system]
- requires = ["poetry-core>=1.0.0"]
  build-backend = "poetry.core.masonry.api"
+ requires = ["poetry-core>=1.0.0"]


+ ##################
+ #     pytest     #
+ ##################
  [tool.pytest.ini_options]
  addopts = [
      "--import-mode=importlib",