pdfalyzer 1.14.10__py3-none-any.whl → 1.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pdfalyzer might be problematic. Click here for more details.
- CHANGELOG.md +7 -0
- pdfalyzer/__init__.py +43 -2
- pdfalyzer/binary/binary_scanner.py +2 -3
- pdfalyzer/decorators/document_model_printer.py +1 -1
- pdfalyzer/decorators/indeterminate_node.py +8 -6
- pdfalyzer/decorators/pdf_tree_node.py +5 -3
- pdfalyzer/detection/constants/binary_regexes.py +1 -7
- pdfalyzer/detection/yaralyzer_helper.py +7 -2
- pdfalyzer/helpers/filesystem_helper.py +102 -0
- pdfalyzer/helpers/rich_text_helper.py +12 -0
- pdfalyzer/output/pdfalyzer_presenter.py +1 -1
- pdfalyzer/util/adobe_strings.py +2 -1
- pdfalyzer/util/argument_parser.py +94 -10
- pdfalyzer/yara_rules/PDF.yara +1 -1
- {pdfalyzer-1.14.10.dist-info → pdfalyzer-1.15.1.dist-info}/METADATA +30 -16
- {pdfalyzer-1.14.10.dist-info → pdfalyzer-1.15.1.dist-info}/RECORD +19 -18
- {pdfalyzer-1.14.10.dist-info → pdfalyzer-1.15.1.dist-info}/entry_points.txt +1 -0
- {pdfalyzer-1.14.10.dist-info → pdfalyzer-1.15.1.dist-info}/LICENSE +0 -0
- {pdfalyzer-1.14.10.dist-info → pdfalyzer-1.15.1.dist-info}/WHEEL +0 -0
CHANGELOG.md
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
# NEXT RELEASE
|
|
2
2
|
|
|
3
|
+
### 1.15.1
|
|
4
|
+
* Add `--no-default-yara-rules` command line option so users can use _only_ their own custom YARA rules files if they want. Previously you could only use custom YARA rules _in addition to_ the default rules; now you can just skip the default rules.
|
|
5
|
+
|
|
6
|
+
# 1.15.0
|
|
7
|
+
* Add `combine_pdfs` command line script to merge a bunch of PDFs into one
|
|
8
|
+
* Remove unused `Deprecated` dependency
|
|
9
|
+
|
|
3
10
|
### 1.14.10
|
|
4
11
|
* Add `malware_MaldocinPDF` YARA rule
|
|
5
12
|
|
pdfalyzer/__init__.py
CHANGED
|
@@ -1,10 +1,14 @@
|
|
|
1
1
|
import code
|
|
2
|
-
import logging
|
|
3
2
|
import sys
|
|
4
3
|
from os import environ, getcwd, path
|
|
4
|
+
from pathlib import Path
|
|
5
5
|
|
|
6
6
|
from dotenv import load_dotenv
|
|
7
|
+
# TODO: PdfMerger is deprecated in favor of PdfWriter at v3.9.1 (see https://pypdf.readthedocs.io/en/latest/user/merging-pdfs.html#basic-example)
|
|
8
|
+
from PyPDF2 import PdfMerger
|
|
9
|
+
from PyPDF2.errors import PdfReadError
|
|
7
10
|
|
|
11
|
+
# Should be first local import before load_dotenv() (or at least I think it needs to come first)
|
|
8
12
|
from pdfalyzer.config import PdfalyzerConfig
|
|
9
13
|
|
|
10
14
|
# load_dotenv() should be called as soon as possible (before parsing local classes) but not for pytest
|
|
@@ -16,16 +20,19 @@ if not environ.get('INVOKED_BY_PYTEST', False):
|
|
|
16
20
|
|
|
17
21
|
from rich.columns import Columns
|
|
18
22
|
from rich.panel import Panel
|
|
23
|
+
from rich.text import Text
|
|
19
24
|
from yaralyzer.helpers.rich_text_helper import prefix_with_plain_text_obj
|
|
20
25
|
from yaralyzer.output.file_export import invoke_rich_export
|
|
21
26
|
from yaralyzer.output.rich_console import console
|
|
22
27
|
from yaralyzer.util.logging import log, log_and_print
|
|
23
28
|
|
|
29
|
+
from pdfalyzer.helpers.filesystem_helper import file_size_in_mb, set_max_open_files
|
|
30
|
+
from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
24
31
|
from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
|
|
25
32
|
from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
|
|
26
33
|
from pdfalyzer.pdfalyzer import Pdfalyzer
|
|
34
|
+
from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments, parse_combine_pdfs_args
|
|
27
35
|
from pdfalyzer.util.pdf_parser_manager import PdfParserManager
|
|
28
|
-
from pdfalyzer.util.argument_parser import output_sections, parse_arguments
|
|
29
36
|
|
|
30
37
|
# For the table shown by running pdfalyzer_show_color_theme
|
|
31
38
|
MAX_THEME_COL_SIZE = 35
|
|
@@ -44,6 +51,7 @@ def pdfalyze():
|
|
|
44
51
|
log_and_print(f"Binary stream extraction complete, files written to '{args.output_dir}'.\nExiting.\n")
|
|
45
52
|
sys.exit()
|
|
46
53
|
|
|
54
|
+
# The method that gets called is related to the argument name. See 'possible_output_sections' list in argument_parser.py
|
|
47
55
|
# Analysis exports wrap themselves around the methods that actually generate the analyses
|
|
48
56
|
for (arg, method) in output_sections(args, pdfalyzer):
|
|
49
57
|
if args.output_dir:
|
|
@@ -82,3 +90,36 @@ def pdfalyzer_show_color_theme() -> None:
|
|
|
82
90
|
]
|
|
83
91
|
|
|
84
92
|
console.print(Columns(colors, column_first=True, padding=(0,3)))
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def combine_pdfs():
    """
    Merge multiple PDFs into one file. Invocable with 'combine_pdfs PDF1 [PDF2...] -o OUTPUT_FILE'.

    Steps:
      1. Parse and validate the command line via parse_combine_pdfs_args().
      2. Raise the max open files limit because every input PDF is held open until the write.
      3. Append each PDF to a PdfMerger; on a PdfReadError the user is asked whether to proceed.
      4. Unless the compression level is 0, compress every page's content streams.
      5. Write the merged PDF and report its size in megabytes.

    The merger is always closed, even if a step fails or the user chooses to abort.
    """
    args = parse_combine_pdfs_args()
    set_max_open_files(args.number_of_pdfs)
    merger = PdfMerger()

    try:
        for pdf in args.pdfs:
            try:
                print_highlighted(f" -> Merging '{pdf}'...", style='dim')
                merger.append(pdf)
            except PdfReadError as e:
                print_highlighted(f" -> Failed to merge '{pdf}'! {e}", style='red')
                ask_to_proceed()  # Exits the process unless the user confirms

        if args.compression_level == 0:
            print_highlighted("\nSkipping content stream compression...")
        else:
            print_highlighted(f"\nCompressing content streams with zlib level {args.compression_level}...")

            for i, page in enumerate(merger.pages):
                # TODO: enable image quality reduction + zlib level once PyPDF is upgraded to 4.x and option is available
                # See https://pypdf.readthedocs.io/en/latest/user/file-size.html#reducing-image-quality
                print_highlighted(f" -> Compressing page {i + 1}...", style='dim')
                page.pagedata.compress_content_streams()  # This is CPU intensive!

        print_highlighted(f"\nWriting '{args.output_file}'...", style='cyan')
        merger.write(args.output_file)
    finally:
        # Release the input file handles whether or not the merge succeeded.
        merger.close()

    txt = Text('').append(" -> Wrote ")
    txt.append(str(file_size_in_mb(args.output_file)), style='cyan').append(" megabytes\n")
    print_highlighted(txt)
|
|
@@ -20,9 +20,8 @@ from yaralyzer.util.logging import log
|
|
|
20
20
|
|
|
21
21
|
from pdfalyzer.config import PdfalyzerConfig
|
|
22
22
|
from pdfalyzer.decorators.pdf_tree_node import PdfTreeNode
|
|
23
|
-
from pdfalyzer.detection.constants.binary_regexes import (BACKTICK,
|
|
24
|
-
DANGEROUS_PDF_KEYS_TO_HUNT_ONLY_IN_FONTS, DANGEROUS_STRINGS, FRONTSLASH, GUILLEMET,
|
|
25
|
-
QUOTE_PATTERNS)
|
|
23
|
+
from pdfalyzer.detection.constants.binary_regexes import (BACKTICK, DANGEROUS_PDF_KEYS_TO_HUNT_ONLY_IN_FONTS,
|
|
24
|
+
DANGEROUS_PDF_KEYS_TO_HUNT_ONLY_IN_FONTS, DANGEROUS_STRINGS, FRONTSLASH, GUILLEMET, QUOTE_PATTERNS)
|
|
26
25
|
from pdfalyzer.helpers.string_helper import generate_hyphen_line
|
|
27
26
|
from pdfalyzer.output.layout import print_headline_panel, print_section_sub_subheader
|
|
28
27
|
from pdfalyzer.util.adobe_strings import CONTENTS, CURRENTFILE_EEXEC, FONT_FILE_KEYS
|
|
@@ -22,6 +22,7 @@ class IndeterminateNode:
|
|
|
22
22
|
self.node = node
|
|
23
23
|
|
|
24
24
|
def place_node(self) -> None:
|
|
25
|
+
"""Attempt to find the appropriate parent/child relationships for this node."""
|
|
25
26
|
log.debug(f"Attempting to resolve indeterminate node: {self.node}")
|
|
26
27
|
|
|
27
28
|
if self._check_for_common_ancestor():
|
|
@@ -34,7 +35,7 @@ class IndeterminateNode:
|
|
|
34
35
|
parent = self.find_node_with_most_descendants()
|
|
35
36
|
parent_str = escape(str(parent))
|
|
36
37
|
|
|
37
|
-
# Any branch that doesn't return or raise will
|
|
38
|
+
# Any if/else branch that doesn't return or raise will decide parent to be the node w/most descendants
|
|
38
39
|
if self._has_only_similar_relationships():
|
|
39
40
|
log.info(f" Fuzzy match addresses or labels; placing under node w/most descendants: {parent_str}")
|
|
40
41
|
elif self._make_parent_if_one_remains(lambda r: r.from_node.type in PAGE_AND_PAGES):
|
|
@@ -43,7 +44,8 @@ class IndeterminateNode:
|
|
|
43
44
|
elif self.node.type == COLOR_SPACE:
|
|
44
45
|
log.info(f" Color space node found; placing under node w/most descendants: {parent_str}")
|
|
45
46
|
elif set(self.node.unique_labels_of_referring_nodes()) == set(PAGE_AND_PAGES):
|
|
46
|
-
#
|
|
47
|
+
# Handle an edge case seen in the wild involving a PDF that doesn't conform to the PDF spec
|
|
48
|
+
# in a particular way.
|
|
47
49
|
log.warning(f" {self.node} seems to be a loose {PAGE}. Linking to first {PAGES}")
|
|
48
50
|
pages_nodes = [n for n in self.node.nodes_with_here_references() if self.node.type == PAGES]
|
|
49
51
|
self.node.set_parent(self.find_node_with_most_descendants(pages_nodes))
|
|
@@ -63,7 +65,7 @@ class IndeterminateNode:
|
|
|
63
65
|
def _has_only_similar_relationships(self) -> bool:
|
|
64
66
|
"""
|
|
65
67
|
Returns True if all the nodes w/references to this one have the same type or if all the
|
|
66
|
-
reference_keys that point to this node are the same
|
|
68
|
+
reference_keys that point to this node are the same.
|
|
67
69
|
"""
|
|
68
70
|
unique_refferer_labels = self.node.unique_labels_of_referring_nodes()
|
|
69
71
|
unique_addresses = self.node.unique_addresses()
|
|
@@ -99,7 +101,7 @@ class IndeterminateNode:
|
|
|
99
101
|
log.info(f"{possible_ancestor} is the common ancestor of {other_nodes_str}")
|
|
100
102
|
return possible_ancestor
|
|
101
103
|
|
|
102
|
-
def _check_single_relation_rules(self):
|
|
104
|
+
def _check_single_relation_rules(self) -> bool:
|
|
103
105
|
"""Check various ways of narrowing down the list of potential parents to one node."""
|
|
104
106
|
if self._make_parent_if_one_remains(lambda r: r.reference_key in [K, KIDS]):
|
|
105
107
|
log.info(" Found single explicit /K or /Kids ref")
|
|
@@ -111,7 +113,7 @@ class IndeterminateNode:
|
|
|
111
113
|
return True
|
|
112
114
|
|
|
113
115
|
def _make_parent_if_one_remains(self, is_possible_parent: Callable) -> bool:
|
|
114
|
-
"""Relationships are filtered w/
|
|
116
|
+
"""Relationships are filtered w/is_possible_parent(); if there's only one possibility it's made the parent."""
|
|
115
117
|
remaining_relationships = [r for r in self.node.non_tree_relationships if is_possible_parent(r)]
|
|
116
118
|
|
|
117
119
|
if len(remaining_relationships) == 1:
|
|
@@ -123,6 +125,6 @@ class IndeterminateNode:
|
|
|
123
125
|
|
|
124
126
|
|
|
125
127
|
def find_node_with_lowest_id(list_of_nodes: List[PdfTreeNode]) -> PdfTreeNode:
|
|
126
|
-
"""Find node in list_of_nodes_with_lowest ID"""
|
|
128
|
+
"""Find node in list_of_nodes_with_lowest ID."""
|
|
127
129
|
lowest_idnum = min([n.idnum for n in list_of_nodes])
|
|
128
130
|
return next(n for n in list_of_nodes if n.idnum == lowest_idnum)
|
|
@@ -104,10 +104,11 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
|
|
|
104
104
|
self.non_tree_relationships.remove(relationship)
|
|
105
105
|
|
|
106
106
|
def nodes_with_here_references(self) -> List['PdfTreeNode']:
|
|
107
|
-
"""Return a list of nodes that contain this
|
|
107
|
+
"""Return a list of nodes that contain this node's PDF object as an IndirectObject reference."""
|
|
108
108
|
return [r.from_node for r in self.non_tree_relationships if r.from_node]
|
|
109
109
|
|
|
110
110
|
def non_tree_relationship_count(self) -> int:
|
|
111
|
+
"""Number of non parent/child relationships containing this node."""
|
|
111
112
|
return len(self.non_tree_relationships)
|
|
112
113
|
|
|
113
114
|
def unique_addresses(self) -> List[str]:
|
|
@@ -128,7 +129,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
|
|
|
128
129
|
return isinstance(self.obj, StreamObject)
|
|
129
130
|
|
|
130
131
|
def tree_address(self, max_length: Optional[int] = DEFAULT_MAX_ADDRESS_LENGTH) -> str:
|
|
131
|
-
"""Creates a string like '/Catalog/Pages/Resources[2]/Font' truncated to max_length (if given)"""
|
|
132
|
+
"""Creates a string like '/Catalog/Pages/Resources[2]/Font' truncated to max_length (if given)."""
|
|
132
133
|
if self.label == TRAILER:
|
|
133
134
|
return '/'
|
|
134
135
|
elif self.parent is None:
|
|
@@ -163,7 +164,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
|
|
|
163
164
|
else:
|
|
164
165
|
address = refs_to_this_node[0].address
|
|
165
166
|
# If other node's label doesn't start with a NON_STANDARD_ADDRESS string
|
|
166
|
-
# and any of the relationships pointing at this
|
|
167
|
+
# and any of the relationships pointing at this node use something other than a
|
|
167
168
|
# NON_STANDARD_ADDRESS_NODES string to refer here, print a warning about multiple refs.
|
|
168
169
|
if not (is_prefixed_by_any(from_node.label, NON_STANDARD_ADDRESS_NODES) or \
|
|
169
170
|
all(ref.address in NON_STANDARD_ADDRESS_NODES for ref in refs_to_this_node)):
|
|
@@ -193,6 +194,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
|
|
|
193
194
|
return len(self.children) + sum([child.descendants_count() for child in self.children])
|
|
194
195
|
|
|
195
196
|
def unique_labels_of_referring_nodes(self) -> List[str]:
|
|
197
|
+
"""Unique label strings of nodes referring here outside the parent/child hierarchy."""
|
|
196
198
|
return list(set([r.from_node.label for r in self.non_tree_relationships]))
|
|
197
199
|
|
|
198
200
|
def print_non_tree_relationships(self) -> None:
|
|
@@ -1,13 +1,7 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Configuration of what to scan for in binary data. Regexes here will be matched against binary streams
|
|
3
|
-
and then force decoded
|
|
3
|
+
and then force decoded.
|
|
4
4
|
"""
|
|
5
|
-
|
|
6
|
-
import re
|
|
7
|
-
from typing import Union
|
|
8
|
-
|
|
9
|
-
from deprecated import deprecated
|
|
10
|
-
|
|
11
5
|
from pdfalyzer.util.adobe_strings import DANGEROUS_PDF_KEYS
|
|
12
6
|
|
|
13
7
|
DANGEROUS_JAVASCRIPT_INSTRUCTIONS = ['eval']
|
|
@@ -8,6 +8,8 @@ from typing import Optional, Union
|
|
|
8
8
|
from yaralyzer.config import YaralyzerConfig
|
|
9
9
|
from yaralyzer.yaralyzer import Yaralyzer
|
|
10
10
|
|
|
11
|
+
from pdfalyzer.config import PdfalyzerConfig
|
|
12
|
+
|
|
11
13
|
YARA_RULES_DIR = files('pdfalyzer').joinpath('yara_rules')
|
|
12
14
|
|
|
13
15
|
YARA_RULES_FILES = [
|
|
@@ -32,8 +34,11 @@ def _build_yaralyzer(scannable: Union[bytes, str], label: Optional[str] = None)
|
|
|
32
34
|
with as_file(YARA_RULES_DIR.joinpath(YARA_RULES_FILES[0])) as yara0:
|
|
33
35
|
with as_file(YARA_RULES_DIR.joinpath(YARA_RULES_FILES[1])) as yara1:
|
|
34
36
|
with as_file(YARA_RULES_DIR.joinpath(YARA_RULES_FILES[2])) as yara2:
|
|
35
|
-
|
|
36
|
-
rules_paths
|
|
37
|
+
# If there is a custom yara_rules argument file use that instead of the files in the yara_rules/ dir
|
|
38
|
+
rules_paths = YaralyzerConfig.args.yara_rules_files or []
|
|
39
|
+
|
|
40
|
+
if not YaralyzerConfig.args.no_default_yara_rules:
|
|
41
|
+
rules_paths += [str(y) for y in [yara0, yara1, yara2]]
|
|
37
42
|
|
|
38
43
|
try:
|
|
39
44
|
return Yaralyzer.for_rules_files(rules_paths, scannable, label)
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Some helpers for stuff with the local filesystem.
|
|
3
|
+
"""
|
|
4
|
+
import re
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Union
|
|
7
|
+
|
|
8
|
+
from yaralyzer.output.rich_console import console
|
|
9
|
+
|
|
10
|
+
from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
11
|
+
|
|
12
|
+
NUMBERED_PAGE_REGEX = re.compile(r'.*_(\d+)\.\w{3,4}$')
|
|
13
|
+
DEFAULT_MAX_OPEN_FILES = 256 # macOS default
|
|
14
|
+
OPEN_FILES_BUFFER = 30 # we might have some files open already so we need to go beyond DEFAULT_MAX_OPEN_FILES
|
|
15
|
+
PDF_EXT = '.pdf'
|
|
16
|
+
|
|
17
|
+
# TODO: this kind of type alias is not supported until Python 3.12
|
|
18
|
+
#type StrOrPath = Union[str, Path]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def with_pdf_extension(file_path: Union[str, Path]) -> str:
    """Return 'file_path' as a string, with '.pdf' appended unless is_pdf() says it already ends that way."""
    path_str = str(file_path)

    if is_pdf(file_path):
        return path_str

    return path_str + PDF_EXT
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def is_pdf(file_path: Union[str, Path]) -> bool:
    """Report whether the string form of 'file_path' ends with PDF_EXT (the comparison is case-sensitive)."""
    path_str = str(file_path)
    return path_str.endswith(PDF_EXT)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def file_exists(file_path: Union[str, Path]) -> bool:
    """Report whether something exists on disk at 'file_path' (uses Path.exists(), so directories count too)."""
    candidate = Path(file_path)
    return candidate.exists()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def do_all_files_exist(file_paths: list[Union[str, Path]]) -> bool:
    """
    Check every element of 'file_paths' with file_exists() and print a "File not found" error
    for each one that is missing. Return True only if none are missing.
    """
    missing_paths = [file_path for file_path in file_paths if not file_exists(file_path)]

    for file_path in missing_paths:
        console.print(f"File not found: '{file_path}'", style='error')

    return not missing_paths
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def extract_page_number(file_path: Union[str, Path]) -> int|None:
    """
    Return the integer captured by NUMBERED_PAGE_REGEX from the string form of 'file_path',
    or None if the pattern doesn't match.

    NOTE: a captured "0" is returned as the int 0, which is falsy; callers should compare against None.
    """
    match = NUMBERED_PAGE_REGEX.match(str(file_path))

    if match is None:
        return None

    return int(match.group(1))
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def file_size_in_mb(file_path: Union[str, Path], decimal_places: int = 2) -> float:
    """Return the size of 'file_path' in MB (bytes / 1024 / 1024) rounded to 'decimal_places' decimal places."""
    size_in_bytes = Path(file_path).stat().st_size
    size_in_mb = size_in_bytes / 1024.0 / 1024.0
    return round(size_in_mb, decimal_places)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def set_max_open_files(num_filehandles: int = DEFAULT_MAX_OPEN_FILES) -> tuple[int | None, int | None]:
    """
    Raise this process's max open files limit (RLIMIT_NOFILE) so 'num_filehandles' files plus
    OPEN_FILES_BUFFER can be open at once. The shell's current value can be seen with 'ulimit -n'.

    Required when more than DEFAULT_MAX_OPEN_FILES file handles might be open simultaneously
    (e.g. when merging a lot of small images or PDFs).

    Returns:
        The (soft, hard) limits believed to be in effect after the call, with two special cases:
          * (None, None) if the 'resource' module can't be imported on this platform.
          * (DEFAULT_MAX_OPEN_FILES, DEFAULT_MAX_OPEN_FILES) if 'num_filehandles' doesn't exceed
            DEFAULT_MAX_OPEN_FILES. The OS is NOT queried in this case; the default is simply
            assumed to be sufficient.

    NOTE: This mostly came from somewhere on stackoverflow but the link was lost.
    """
    try:
        import resource  # Not importable on every platform (the original author notes Windows lacks it)
    except ImportError:
        resource = None

    if resource is None:
        print_highlighted(f"No resource module; cannot set max open files on this platform...", style='yellow')
        return (None, None)
    elif num_filehandles <= DEFAULT_MAX_OPEN_FILES:
        # Then the OS max open files value is assumed to already be sufficient.
        return (DEFAULT_MAX_OPEN_FILES, DEFAULT_MAX_OPEN_FILES)

    # (0) What is the current 'ulimit -n' setting?
    (soft, hard) = resource.getrlimit(resource.RLIMIT_NOFILE)
    num_filehandles = num_filehandles + OPEN_FILES_BUFFER

    # RLIM_INFINITY may be a negative number (e.g. -1) so it must be compared explicitly: a plain
    # '<' or max() would treat "unlimited" as tiny and end up LOWERING an unlimited limit.
    unlimited = resource.RLIM_INFINITY

    # (1) Increase the limit (soft and even hard) if needed
    if soft != unlimited and soft < num_filehandles:
        soft = num_filehandles

        if hard != unlimited:
            hard = max(soft, hard)

        print_highlighted(f"Increasing max open files soft & hard 'ulimit -n {soft} {hard}'...")

        try:
            resource.setrlimit(resource.RLIMIT_NOFILE, (soft, hard))
        except (ValueError, resource.error):
            try:
                # Second attempt: request the same value for both the soft and hard limits.
                hard = soft
                print_highlighted(f"Retrying setting max open files (soft, hard)=({soft}, {hard})", style='yellow')
                resource.setrlimit(resource.RLIMIT_NOFILE, (soft, hard))
            except Exception:
                print_highlighted('Failed to set max open files / ulimit, giving up!', style='error')
                # Report what is actually in effect rather than the values we failed to set.
                soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)

    return (soft, hard)
|
|
@@ -1,14 +1,26 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Functions for miscellaneous Rich text/string operations.
|
|
3
3
|
"""
|
|
4
|
+
from functools import partial
|
|
4
5
|
from typing import List
|
|
5
6
|
|
|
6
7
|
from PyPDF2.generic import PdfObject
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.highlighter import RegexHighlighter, JSONHighlighter
|
|
7
10
|
from rich.text import Text
|
|
11
|
+
from yaralyzer.output.rich_console import console
|
|
8
12
|
|
|
9
13
|
from pdfalyzer.helpers.pdf_object_helper import pypdf_class_name
|
|
10
14
|
from pdfalyzer.output.styles.node_colors import get_label_style, get_class_style_italic
|
|
11
15
|
|
|
16
|
+
# Usually we use the yaralyzer console but that has no highlighter
|
|
17
|
+
pdfalyzer_console = Console(color_system='256')
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def print_highlighted(msg: str|Text, **kwargs) -> None:
    """
    Print 'msg' on 'pdfalyzer_console' with Rich highlighting enabled.

    All keyword arguments are forwarded to the console's print(). Highlighting is only a default:
    a caller may pass 'highlight' explicitly instead of triggering a duplicate keyword TypeError.
    """
    kwargs.setdefault('highlight', True)
    pdfalyzer_console.print(msg, **kwargs)
|
|
23
|
+
|
|
12
24
|
|
|
13
25
|
def quoted_text(
|
|
14
26
|
_string: str,
|
|
@@ -124,7 +124,7 @@ class PdfalyzerPresenter:
|
|
|
124
124
|
console.print(build_decoding_stats_table(binary_scanner), justify='center')
|
|
125
125
|
|
|
126
126
|
def print_yara_results(self) -> None:
|
|
127
|
-
"""Scan the
|
|
127
|
+
"""Scan the main PDF and each individual binary stream in it with yara_rules/*.yara files"""
|
|
128
128
|
print_section_header(f"YARA Scan of PDF rules for '{self.pdfalyzer.pdf_basename}'")
|
|
129
129
|
YaralyzerConfig.args.standalone_mode = True # TODO: using 'standalone mode' like this kind of sucks
|
|
130
130
|
|
pdfalyzer/util/adobe_strings.py
CHANGED
|
@@ -79,7 +79,8 @@ XREF_STREAM = '/XRefStm'
|
|
|
79
79
|
FONT_LENGTHS = [f'/Length{i + 1}' for i in range(3)]
|
|
80
80
|
FONT_FILE_KEYS = [FONT_FILE, FONT_FILE2, FONT_FILE3]
|
|
81
81
|
|
|
82
|
-
# Instructions to flag when scanning stream data for malicious content.
|
|
82
|
+
# Instructions to flag when scanning stream data for malicious content. The leading
|
|
83
|
+
# front slash will be removed when pattern matching.
|
|
83
84
|
DANGEROUS_PDF_KEYS = [
|
|
84
85
|
# AA, # AA is too generic; can't afford to remove the frontslash
|
|
85
86
|
ACRO_FORM,
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import sys
|
|
2
|
-
from argparse import ArgumentError, ArgumentParser
|
|
2
|
+
from argparse import ArgumentError, ArgumentParser, Namespace
|
|
3
3
|
from collections import namedtuple
|
|
4
4
|
from functools import partial, update_wrapper
|
|
5
5
|
from importlib.metadata import version
|
|
@@ -7,11 +7,16 @@ from os import getcwd, path
|
|
|
7
7
|
from typing import List
|
|
8
8
|
|
|
9
9
|
from rich_argparse_plus import RichHelpFormatterPlus
|
|
10
|
-
from
|
|
10
|
+
from rich.prompt import Confirm
|
|
11
|
+
from rich.text import Text
|
|
12
|
+
from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args, source
|
|
11
13
|
from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation
|
|
12
14
|
|
|
13
15
|
from pdfalyzer.config import ALL_STREAMS, PdfalyzerConfig
|
|
14
16
|
from pdfalyzer.detection.constants.binary_regexes import QUOTE_PATTERNS
|
|
17
|
+
from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
|
|
18
|
+
with_pdf_extension)
|
|
19
|
+
from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
15
20
|
|
|
16
21
|
# NamedTuple to keep our argument selection orderly
|
|
17
22
|
OutputSection = namedtuple('OutputSection', ['argument', 'method'])
|
|
@@ -25,7 +30,7 @@ DESCRIPTION = "Explore PDF's inner data structure with absurdly large and in dep
|
|
|
25
30
|
|
|
26
31
|
EPILOG = "Values for various config options can be set permanently by a .pdfalyzer file in your home directory; " + \
|
|
27
32
|
"see the documentation for details. " + \
|
|
28
|
-
f"A registry of previous pdfalyzer invocations will be
|
|
33
|
+
f"A registry of previous pdfalyzer invocations will be inscribed to a file if the " + \
|
|
29
34
|
"{YaralyzerConfig.LOG_DIR_ENV_VAR} environment variable is configured."
|
|
30
35
|
|
|
31
36
|
# Analysis selection sections
|
|
@@ -45,8 +50,13 @@ export.add_argument('-bin', '--extract-binary-streams',
|
|
|
45
50
|
const='bin',
|
|
46
51
|
help='extract all binary streams in the PDF to separate files (requires pdf-parser.py)')
|
|
47
52
|
|
|
53
|
+
# Add one more option to the YARA rules section
|
|
54
|
+
source.add_argument('--no-default-yara-rules',
|
|
55
|
+
action='store_true',
|
|
56
|
+
help='if --yara is selected use only custom rules from --yara-file arg and not the default included YARA rules')
|
|
48
57
|
|
|
49
|
-
|
|
58
|
+
|
|
59
|
+
# Note that we extend the yaralyzer's parser and export
|
|
50
60
|
parser = ArgumentParser(
|
|
51
61
|
formatter_class=RichHelpFormatterPlus,
|
|
52
62
|
description=DESCRIPTION,
|
|
@@ -73,7 +83,7 @@ select.add_argument('-f', '--fonts', action='store_true',
|
|
|
73
83
|
help="show info about fonts included character mappings for embedded font binaries")
|
|
74
84
|
|
|
75
85
|
select.add_argument('-y', '--yara', action='store_true',
|
|
76
|
-
help="scan the PDF with YARA rules")
|
|
86
|
+
help="scan the PDF with the included malicious PDF YARA rules and/or your custom YARA rules")
|
|
77
87
|
|
|
78
88
|
select.add_argument('-c', '--counts', action='store_true',
|
|
79
89
|
help='show counts of some of the properties of the objects in the PDF')
|
|
@@ -107,7 +117,9 @@ select.add_argument('--preview-stream-length',
|
|
|
107
117
|
parser._action_groups = parser._action_groups[:2] + [parser._action_groups[-1]] + parser._action_groups[2:-1]
|
|
108
118
|
|
|
109
119
|
|
|
110
|
-
|
|
120
|
+
################################
|
|
121
|
+
# Main argument parsing begins #
|
|
122
|
+
################################
|
|
111
123
|
def parse_arguments():
|
|
112
124
|
"""Parse command line args. Most settings are communicated to the app by setting env vars"""
|
|
113
125
|
if '--version' in sys.argv:
|
|
@@ -120,10 +132,13 @@ def parse_arguments():
|
|
|
120
132
|
|
|
121
133
|
if not args.streams:
|
|
122
134
|
if args.extract_quoteds:
|
|
123
|
-
|
|
135
|
+
exit_with_error("--extract-quoted does nothing if --streams is not selected")
|
|
124
136
|
if args.suppress_boms:
|
|
125
137
|
log.warning("--suppress-boms has nothing to suppress if --streams is not selected")
|
|
126
138
|
|
|
139
|
+
if args.no_default_yara_rules and not args.yara_rules_files:
|
|
140
|
+
exit_with_error("--no-default-yara-rules requires at least one --yara-file argument")
|
|
141
|
+
|
|
127
142
|
# File export options
|
|
128
143
|
if args.export_svg or args.export_txt or args.export_html or args.extract_binary_streams:
|
|
129
144
|
args.output_dir = args.output_dir or getcwd()
|
|
@@ -142,8 +157,8 @@ def parse_arguments():
|
|
|
142
157
|
|
|
143
158
|
def output_sections(args, pdfalyzer) -> List[OutputSection]:
|
|
144
159
|
"""
|
|
145
|
-
Determine which of the tree visualizations, font scans, etc
|
|
146
|
-
If nothing
|
|
160
|
+
Determine which of the tree visualizations, font scans, etc should be run.
|
|
161
|
+
If nothing is specified output ALL sections other than --streams which is v. slow/verbose.
|
|
147
162
|
"""
|
|
148
163
|
# Create a partial for print_font_info() because it's the only one that can take an argument
|
|
149
164
|
# partials have no __name__ so update_wrapper() propagates the 'print_font_info' as this partial's name
|
|
@@ -151,7 +166,8 @@ def output_sections(args, pdfalyzer) -> List[OutputSection]:
|
|
|
151
166
|
stream_scan = partial(pdfalyzer.print_streams_analysis, idnum=stream_id)
|
|
152
167
|
update_wrapper(stream_scan, pdfalyzer.print_streams_analysis)
|
|
153
168
|
|
|
154
|
-
#
|
|
169
|
+
# 1st element string matches the argument in 'select' group
|
|
170
|
+
# 2nd is fxn to call if selected.
|
|
155
171
|
# Top to bottom is the default order of output.
|
|
156
172
|
possible_output_sections = [
|
|
157
173
|
OutputSection(DOCINFO, pdfalyzer.print_document_info),
|
|
@@ -175,3 +191,71 @@ def output_sections(args, pdfalyzer) -> List[OutputSection]:
|
|
|
175
191
|
def all_sections_chosen(args):
|
|
176
192
|
"""Returns true if all flags are set or no flags are set."""
|
|
177
193
|
return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
###############################################
|
|
197
|
+
# Separate arg parser for combine_pdfs script #
|
|
198
|
+
###############################################
|
|
199
|
+
# Parser for the standalone 'combine_pdfs' script (independent of the main pdfalyzer parser).
combine_pdfs_parser = ArgumentParser(
    description="Combine multiple PDFs into one.",
    epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc.) sort the files as if those were" \
           " page numbers prior to merging.",
    formatter_class=RichHelpFormatterPlus)

# nargs='+' only guarantees one path; the "two or more" rule is enforced in parse_combine_pdfs_args().
combine_pdfs_parser.add_argument('pdfs',
                                 help='two or more PDFs to combine',
                                 metavar='PDF_PATH',
                                 nargs='+')

combine_pdfs_parser.add_argument('-c', '--compression-level',
                                 help='zlib image compression level (0=none, max=1 until PyPDF is upgraded)',
                                 choices=range(0, 2),
                                 default=1,
                                 type=int)

combine_pdfs_parser.add_argument('-o', '--output-file',
                                 help='path to write the combined PDFs to',
                                 required=True)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def parse_combine_pdfs_args() -> Namespace:
    """
    Parse and validate command line args for the combine_pdfs script.

    Side effects on the returned namespace:
      * 'output_file' gets a '.pdf' extension if it lacks one.
      * 'number_of_pdfs' is set to the number of input PDFs.
      * 'pdfs' is sorted by trailing page number if every path ends in '.pdf' and has one.

    Exits with status 1 (via exit_with_error()) if fewer than 2 PDFs are given, if any input is
    missing, or if the user declines to overwrite an existing output file or to proceed after
    the missing '.pdf' extension warning.
    """
    args = combine_pdfs_parser.parse_args()
    args.output_file = with_pdf_extension(args.output_file)
    confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
    args.number_of_pdfs = len(args.pdfs)

    if args.number_of_pdfs < 2:
        exit_with_error("Need at least 2 PDFs to merge.")
    elif not do_all_files_exist(args.pdfs):
        exit_with_error()
    elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
        exit_with_error()

    if all(is_pdf(pdf) for pdf in args.pdfs):
        # Extract each page number once; it is used for both the check and the sort key.
        page_numbers = {pdf: extract_page_number(pdf) for pdf in args.pdfs}

        # Compare against None: page number 0 is falsy but is still a valid page number.
        if all(page_number is not None for page_number in page_numbers.values()):
            print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
            args.pdfs.sort(key=page_numbers.__getitem__)
        else:
            print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
    else:
        print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
        ask_to_proceed()

    print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
    return args
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def ask_to_proceed() -> None:
    """Prompt "Proceed anyway?"; return if the user confirms, otherwise exit via exit_with_error()."""
    wants_to_continue = Confirm.ask(Text("Proceed anyway?"))

    if wants_to_continue:
        return

    exit_with_error()
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def exit_with_error(error_message: str|None = None) -> None:
    """
    Print 'error_message' (if one is given) prefixed with a bold red 'ERROR', then print
    'Exiting...' and terminate the process with status code 1. Never returns normally.
    """
    if error_message:
        error_txt = Text('')
        error_txt.append('ERROR', style='bold red')
        error_txt.append(f': {error_message}')
        print_highlighted(error_txt)

    print_highlighted('Exiting...', style='dim red')
    sys.exit(1)
|
pdfalyzer/yara_rules/PDF.yara
CHANGED
|
@@ -1026,7 +1026,7 @@ rule malware_MaldocinPDF {
|
|
|
1026
1026
|
author = "Yuma Masubuchi and Kota Kino"
|
|
1027
1027
|
description = "Search for embeddings of malicious Word files into a PDF file."
|
|
1028
1028
|
created_date = "2023-08-15"
|
|
1029
|
-
blog_reference = "https://
|
|
1029
|
+
blog_reference = "https://blogs.jpcert.or.jp/en/2023/08/maldocinpdf.html"
|
|
1030
1030
|
labs_reference = "N/A"
|
|
1031
1031
|
labs_pivot = "N/A"
|
|
1032
1032
|
samples = "ef59d7038cfd565fd65bae12588810d5361df938244ebad33b71882dcf683058"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pdfalyzer
|
|
3
|
-
Version: 1.14.10
|
|
3
|
+
Version: 1.15.1
|
|
4
4
|
Summary: A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
|
|
5
5
|
Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
6
6
|
License: GPL-3.0-or-later
|
|
@@ -16,7 +16,6 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
16
16
|
Classifier: Topic :: Artistic Software
|
|
17
17
|
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
18
18
|
Classifier: Topic :: Security
|
|
19
|
-
Requires-Dist: Deprecated (>=1.2.13,<2.0.0)
|
|
20
19
|
Requires-Dist: PyPDF2 (>=2.10,<3.0)
|
|
21
20
|
Requires-Dist: anytree (>=2.8,<3.0)
|
|
22
21
|
Requires-Dist: chardet (>=5.0.0,<6.0.0)
|
|
@@ -63,25 +62,32 @@ If you're looking for one of these things this may be the tool for you.
|
|
|
63
62
|
### What It Don't Do
|
|
64
63
|
This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.
|
|
65
64
|
|
|
65
|
+
-------------
|
|
66
66
|
|
|
67
67
|
# Installation
|
|
68
68
|
|
|
69
|
-
Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `pip3` should also work.
|
|
69
|
+
Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `pip3` / `pip` should also work.
|
|
70
70
|
```sh
|
|
71
71
|
pipx install pdfalyzer
|
|
72
72
|
```
|
|
73
73
|
|
|
74
74
|
See [PyPDF2 installation notes](https://github.com/py-pdf/PyPDF2#installation) about `PyCryptodome` if you plan to `pdfalyze` any files that use AES encryption.
|
|
75
75
|
|
|
76
|
-
|
|
76
|
+
If you are on macOS, someone out there was kind enough to make [The Pdfalyzer available via Homebrew](https://formulae.brew.sh/formula/pdfalyzer), so `brew install pdfalyzer` should work.
|
|
77
|
+
|
|
78
|
+
### Troubleshooting
|
|
77
79
|
1. If you used `pip3` instead of `pipx` and have an issue you should try to install with `pipx`.
|
|
78
80
|
1. If you run into an issue about missing YARA try to install [yara-python](https://pypi.org/project/yara-python/).
|
|
79
81
|
1. If you encounter an error building the python `cryptography` package check your `pip` version (`pip --version`). If it's less than 22.0, upgrade `pip` with `pip install --upgrade pip`.
|
|
82
|
+
1. If you get a YARA internal error number you can look up what it actually means [here](https://github.com/VirusTotal/yara/blob/master/libyara/include/yara/error.h).
|
|
83
|
+
1. If you can't get the `pdfalyze` command to work try `python -m pdfalyzer`. It's an equivalent but more portable version of the same command that does not rely on your python script paths being set up in a sane way.
|
|
84
|
+
1. While The Pdfalyzer has been tested on quite a few large and very complicated PDFs there are no doubt a bunch of edge cases that will trip up the code. Sifting through the various interconnected internal PDF objects and building the correct tree representation is much, much harder than it should be and requires multiple scans and a little bit of educated guessing. If a PDF fails to parse and you hit an error please open [a GitHub issue](https://github.com/michelcrypt4d4mus/pdfalyzer/issues) with the compressed (`.zip`, `.gz`, whatever) PDF that is causing the problem attached (if possible) and I'll take a look when I can. I will _not_ take a look at any uncompressed PDFs due to the security risks so make sure you zip it before you ship it.
|
|
80
85
|
1. On Linux if you encounter an error building `wheel` or `cffi` you may need to install some packages:
|
|
81
86
|
```bash
|
|
82
87
|
sudo apt-get install build-essential libssl-dev libffi-dev rustc
|
|
83
88
|
```
|
|
84
|
-
|
|
89
|
+
|
|
90
|
+
-------------
|
|
85
91
|
|
|
86
92
|
# Usage
|
|
87
93
|
|
|
@@ -92,8 +98,8 @@ Run `pdfalyze --help` to see usage instructions. As of right now these are the o
|
|
|
92
98
|
## Runtime Options
|
|
93
99
|
If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--help` then all of the analyses will be done _except_ the `--streams`. In other words, these two commands are equivalent:
|
|
94
100
|
|
|
95
|
-
1. `
|
|
96
|
-
1. `
|
|
101
|
+
1. `pdfalyze lacan_buys_the_dip.pdf`
|
|
102
|
+
1. `pdfalyze lacan_buys_the_dip.pdf -d -t -r -f -y -c`
|
|
97
103
|
|
|
98
104
|
The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
|
|
99
105
|
|
|
@@ -106,15 +112,11 @@ Even if you don't configure your own `.pdfalyzer` file you may still glean some
|
|
|
106
112
|
### Colors And Themes
|
|
107
113
|
Run `pdfalyzer_show_color_theme` to see the color theme employed.
|
|
108
114
|
|
|
109
|
-
|
|
110
|
-
## Guarantees
|
|
115
|
+
### Guarantees
|
|
111
116
|
Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
|
|
112
117
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
1. If you can't get the `pdfalyze` command to work try `python -m pdfalyzer`. It's an equivalent but more portable version of the same command that does not rely on your python script paths being set up in a sane way.
|
|
116
|
-
1. While The Pdfalyzer has been tested on quite a few large and very complicated PDFs there are no doubt a bunch of edge cases that will trip up the code. If that does happen and you hit an error, please open [a GitHub issue](https://github.com/michelcrypt4d4mus/pdfalyzer/issues) with the compressed (`.zip`, `.gz`, whatever) PDF that is causing the problem attached (if possible) and I'll take a look when I can. I will _not_ take a look at any uncompressed PDFs due to the security risks so make sure you zip it before you ship it.
|
|
117
|
-
|
|
118
|
+
## Example Usage
|
|
119
|
+
[BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
|
|
118
120
|
|
|
119
121
|
-------------
|
|
120
122
|
|
|
@@ -135,6 +137,7 @@ pdfalyzer = Pdfalyzer("/path/to/the/evil_or_non_evil.pdf")
|
|
|
135
137
|
actual_pdf_tree: PdfTreeNode = pdfalyzer.pdf_tree
|
|
136
138
|
|
|
137
139
|
# The PdfalyzerPresenter handles formatting/prettifying output
|
|
140
|
+
from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
|
|
138
141
|
PdfalyzerPresenter(pdfalyzer).print_everything()
|
|
139
142
|
|
|
140
143
|
# Iterate over all nodes in the PDF tree
|
|
@@ -164,6 +167,7 @@ for backtick_quoted_string in font.binary_scanner.extract_backtick_quoted_bytes(
|
|
|
164
167
|
do_stuff(backtick_quoted_string)
|
|
165
168
|
```
|
|
166
169
|
|
|
170
|
+
-------------
|
|
167
171
|
|
|
168
172
|
# Example Output
|
|
169
173
|
The Pdfalyzer can export visualizations to HTML, ANSI colored text, and SVG images using the file export functionality that comes with [Rich](https://github.com/Textualize/rich). SVGs can be turned into `png` format images with a tool like Inkscape or `cairosvg` (Inkscape works a lot better in our experience). See `pdfalyze --help` for the specifics.
|
|
@@ -188,7 +192,7 @@ This image shows a more in-depth view of of the PDF tree for the same document s
|
|
|
188
192
|
|
|
189
193
|
## Fonts
|
|
190
194
|
|
|
191
|
-
#### **Extract character mappings from ancient Adobe font formats
|
|
195
|
+
#### **Extract character mappings from ancient Adobe font formats**. It's actually `PyPDF2` doing the lifting here but we're happy to take the credit.
|
|
192
196
|
|
|
193
197
|

|
|
194
198
|
|
|
@@ -223,8 +227,11 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
|
|
|
223
227
|
|
|
224
228
|

|
|
225
229
|
|
|
230
|
+
-------------
|
|
226
231
|
|
|
227
232
|
# PDF Resources
|
|
233
|
+
## Included PDF Tools
|
|
234
|
+
The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
|
|
228
235
|
|
|
229
236
|
## 3rd Party PDF Tools
|
|
230
237
|
### Installing Didier Stevens's PDF Analysis Tools
|
|
@@ -247,7 +254,7 @@ There's [a script](scripts/install_t1utils.sh) to help you install the suite if
|
|
|
247
254
|
scripts/install_t1utils.sh
|
|
248
255
|
```
|
|
249
256
|
|
|
250
|
-
## Documentation
|
|
257
|
+
## External Documentation
|
|
251
258
|
### Official Adobe Documentation
|
|
252
259
|
* [Official Adobe PDF 1.7 Specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf) - Indispensable map when navigating a PDF forest.
|
|
253
260
|
* [Adobe Type 1 Font Format Specification](https://adobe-type-tools.github.io/font-tech-notes/pdfs/T1_SPEC.pdf) - Official spec for Adobe's original font description language and file format. Useful if you have suspicions about malicious fonts. Type1 seems to be the attack vector of choice recently which isn't so surprising when you consider that it's a 30 year old technology and the code that renders these fonts probably hasn't been extensively tested in decades because almost no one uses them anymore outside of people who want to use them as attack vectors.
|
|
@@ -270,6 +277,8 @@ This tool was built to fill a gap in the PDF assessment landscape following [my
|
|
|
270
277
|
|
|
271
278
|
Thus I felt the world might be slightly improved if I strung together a couple of more stable/well known/actively maintained open source projects ([AnyTree](https://github.com/c0fec0de/anytree), [PyPDF2](https://github.com/py-pdf/PyPDF2), [Rich](https://github.com/Textualize/rich), and [YARA](https://github.com/VirusTotal/yara-python) via [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer)) into this tool.
|
|
272
279
|
|
|
280
|
+
-------------
|
|
281
|
+
|
|
273
282
|
# Contributing
|
|
274
283
|
One easy way of contributing is to run [the script to test against all the PDFs in your `~/Documents` folder](scripts/test_against_all_pdfs_in_Documents_folder.sh) and report any issues.
|
|
275
284
|
|
|
@@ -290,7 +299,12 @@ These are the naming conventions at play in The Pdfalyzer code base:
|
|
|
290
299
|
| **`indeterminate_node`** | any node whose place in the tree cannot be decided until every node has been seen |
|
|
291
300
|
| **`link_node`** | nodes like `/Dest` that just contain a pointer to another node |
|
|
292
301
|
|
|
302
|
+
### Reference
|
|
303
|
+
* [`PyPDF2 2.12.0` documentation](https://pypdf2.readthedocs.io/en/2.12.0/) (latest is 4.x or something so these are the relevant docs for `pdfalyze`)
|
|
304
|
+
|
|
305
|
+
|
|
293
306
|
# TODO
|
|
307
|
+
* Upgrade `PyPDF` to latest and expand `combine_pdfs` compression command line option
|
|
294
308
|
* Highlight decodes with a lot of Javascript keywords
|
|
295
309
|
* https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
|
|
296
310
|
* https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
|
|
@@ -1,27 +1,28 @@
|
|
|
1
|
-
CHANGELOG.md,sha256=
|
|
1
|
+
CHANGELOG.md,sha256=uQ7oQFJC0a7S2mV19yEeJNfpgNJSbEh1qOIWw0s49Wo,11581
|
|
2
2
|
LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
3
|
-
pdfalyzer/__init__.py,sha256=
|
|
3
|
+
pdfalyzer/__init__.py,sha256=BO4KrcTSwabB4nh284jFRVoM9WDNlxIjYnMxxVUlK9Y,5312
|
|
4
4
|
pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
|
|
5
|
-
pdfalyzer/binary/binary_scanner.py,sha256=
|
|
5
|
+
pdfalyzer/binary/binary_scanner.py,sha256=7NrXx8GB2gpb04oR2bcZJKkOXOlzn2hWpcGlcYMqSfs,10217
|
|
6
6
|
pdfalyzer/config.py,sha256=oN-pVR037lt3giRsnsm4c8ku5hCW8ChFqYFi9V7w1qU,1918
|
|
7
|
-
pdfalyzer/decorators/document_model_printer.py,sha256=
|
|
8
|
-
pdfalyzer/decorators/indeterminate_node.py,sha256=
|
|
7
|
+
pdfalyzer/decorators/document_model_printer.py,sha256=VD9N47i7CGuNd7b6OYwYzPtx4-LDsEx9cpQIxFjDzI4,2683
|
|
8
|
+
pdfalyzer/decorators/indeterminate_node.py,sha256=ivB6dX5aN8W9m0ksXhmUcixnjYjnuE7DARalH-nMjxY,6616
|
|
9
9
|
pdfalyzer/decorators/pdf_object_properties.py,sha256=8dqHmi0J2USwnGPSy0Sg_ria_2TsaRWe_HWs-14RKrg,5524
|
|
10
|
-
pdfalyzer/decorators/pdf_tree_node.py,sha256=
|
|
10
|
+
pdfalyzer/decorators/pdf_tree_node.py,sha256=A69k-Wj7g4Y0AgnvFeE-stiNP4ZWNkFaDz3yZitgA4A,10930
|
|
11
11
|
pdfalyzer/decorators/pdf_tree_verifier.py,sha256=IRgm7ikdaqJEq66q3JcMZo49XQoONODM7lySioJfxRc,4543
|
|
12
|
-
pdfalyzer/detection/constants/binary_regexes.py,sha256=
|
|
12
|
+
pdfalyzer/detection/constants/binary_regexes.py,sha256=eFx1VVAOzxKmlacbGgicDCp1fcKgOkQkkzeduGjqLBQ,1594
|
|
13
13
|
pdfalyzer/detection/constants/javascript_reserved_keywords.py,sha256=CXXdWskdQa0Hs5wCci2RBVvipgZg34_cLfmkWG4Xcmg,991
|
|
14
14
|
pdfalyzer/detection/javascript_hunter.py,sha256=_wT2vkKTMlm_RGCjYsmwcmV-ag1qep3EpkHmUw0nWcQ,711
|
|
15
|
-
pdfalyzer/detection/yaralyzer_helper.py,sha256=
|
|
15
|
+
pdfalyzer/detection/yaralyzer_helper.py,sha256=_l9eJQUtMlo9RhY5h8Xq9gBLxzn1VgJsCA1nCsFDGvo,1999
|
|
16
16
|
pdfalyzer/font_info.py,sha256=L5ykKvlifAQv2uw-pKqxbQPqWrvbli0IcO8DgDK0SQo,6665
|
|
17
17
|
pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
|
|
18
|
+
pdfalyzer/helpers/filesystem_helper.py,sha256=wHlFz4DFzPAJt2OzMRrhsjL-O3gLJ02JhuwBRwkE958,4089
|
|
18
19
|
pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
|
|
19
20
|
pdfalyzer/helpers/pdf_object_helper.py,sha256=u0j8B9mY8s5cTGo5LmDcozotvvgZNrwwJ4w_ipQqiXw,1105
|
|
20
|
-
pdfalyzer/helpers/rich_text_helper.py,sha256=
|
|
21
|
+
pdfalyzer/helpers/rich_text_helper.py,sha256=EkuF1GNQ8F8StZnl2flpI4C8RPvpxUV2aqCIDdjUDj8,2255
|
|
21
22
|
pdfalyzer/helpers/string_helper.py,sha256=75EDEFw3UWHvWF32WtvZVBbqYY3ozO4y30dtH2qVMX0,2278
|
|
22
23
|
pdfalyzer/output/character_mapping.py,sha256=lKPf-Xw3K3A3h33EOB_B-YaaxuFie7h7PUXCrphuwmw,2095
|
|
23
24
|
pdfalyzer/output/layout.py,sha256=E58T9Tl6BYZTDsj6ouMr1J5SSUiXa7timUNxnOI2IzI,2149
|
|
24
|
-
pdfalyzer/output/pdfalyzer_presenter.py,sha256
|
|
25
|
+
pdfalyzer/output/pdfalyzer_presenter.py,sha256=RoOuVMqc4MLgkOPMuPaymdNj_4cUS33rplXSj5dZ0Qo,8501
|
|
25
26
|
pdfalyzer/output/styles/node_colors.py,sha256=sw-e97iRwAzqBdg0sP_b__9KCe6MbRcgMzQlPL6sCrA,3987
|
|
26
27
|
pdfalyzer/output/styles/rich_theme.py,sha256=Y8QmuINlyZNIHvf3oD0CV3w2dC49NNKtvOChvudDCT8,1983
|
|
27
28
|
pdfalyzer/output/tables/decoding_stats_table.py,sha256=mhQOiWhmovaC4sop38WcxStv_bIdAlQWUysAz5fW4MU,3461
|
|
@@ -30,17 +31,17 @@ pdfalyzer/output/tables/pdf_node_rich_table.py,sha256=Soz5gkSl9pMFbwmGxyKyil_9X-
|
|
|
30
31
|
pdfalyzer/output/tables/stream_objects_table.py,sha256=nzCTci8Kqs8Pyghad3L5KWHDdIWRSrKCRNW8geA_rMo,707
|
|
31
32
|
pdfalyzer/pdf_object_relationship.py,sha256=EgeIiVDofvZd-il114H8ZlKKwCOci5T5S4e15mHK_Wg,5340
|
|
32
33
|
pdfalyzer/pdfalyzer.py,sha256=sOZqOKiRivd2I0Lek_cbYu0h4jIi8DXYnw5H0f6TfcA,11016
|
|
33
|
-
pdfalyzer/util/adobe_strings.py,sha256=
|
|
34
|
-
pdfalyzer/util/argument_parser.py,sha256=
|
|
34
|
+
pdfalyzer/util/adobe_strings.py,sha256=ea9rY83u1oL3uAx43AjuXY24zSdtyc2H7iJN6epaqkE,5048
|
|
35
|
+
pdfalyzer/util/argument_parser.py,sha256=36FDle0ke_HyxdQIIKHvm88XLQCexdbjMgSLnT3ZK7g,11860
|
|
35
36
|
pdfalyzer/util/debugging.py,sha256=nE64VUQbdu2OQRC8w8-AJkMtBOy8Kf3mjozuFslfWsw,156
|
|
36
37
|
pdfalyzer/util/exceptions.py,sha256=XLFFTdx1n6i_VCmvuzvIOCa-djJvGEitfo9lhy3zq0k,98
|
|
37
38
|
pdfalyzer/util/pdf_parser_manager.py,sha256=FVRYAYsCd0y5MAm--qvXnwCZnDtB3x85FdJtb-gpyw4,3109
|
|
38
|
-
pdfalyzer/yara_rules/PDF.yara,sha256=
|
|
39
|
+
pdfalyzer/yara_rules/PDF.yara,sha256=fBMKYmJgBLiCq-kpVzsTP9zUJEBep6yi_QVKmC-FdY0,38611
|
|
39
40
|
pdfalyzer/yara_rules/PDF_binary_stream.yara,sha256=oWRPLe5yQiRFMvi3BTHNTlB6T7NcAuxKn0C9OSvgJSM,804
|
|
40
41
|
pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
41
42
|
pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
|
|
42
|
-
pdfalyzer-1.
|
|
43
|
-
pdfalyzer-1.
|
|
44
|
-
pdfalyzer-1.
|
|
45
|
-
pdfalyzer-1.
|
|
46
|
-
pdfalyzer-1.
|
|
43
|
+
pdfalyzer-1.15.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
44
|
+
pdfalyzer-1.15.1.dist-info/METADATA,sha256=_xfjC5qZZtsjtGaNfUwYsGrekgGiZdfgenJXfxED9H0,25817
|
|
45
|
+
pdfalyzer-1.15.1.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
46
|
+
pdfalyzer-1.15.1.dist-info/entry_points.txt,sha256=aZurgt-Xg3pojS7oTRI4hNLpK1hO4kTfChf0x2eQoD8,147
|
|
47
|
+
pdfalyzer-1.15.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|