pdfalyzer 1.16.13__py3-none-any.whl → 1.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pdfalyzer might be problematic. Click here for more details.
- CHANGELOG.md +8 -0
- pdfalyzer/__init__.py +19 -5
- pdfalyzer/binary/binary_scanner.py +28 -12
- pdfalyzer/config.py +2 -1
- pdfalyzer/decorators/indeterminate_node.py +11 -11
- pdfalyzer/decorators/pdf_file.py +212 -0
- pdfalyzer/decorators/pdf_object_properties.py +3 -3
- pdfalyzer/decorators/pdf_tree_node.py +17 -11
- pdfalyzer/decorators/pdf_tree_verifier.py +2 -0
- pdfalyzer/detection/yaralyzer_helper.py +9 -10
- pdfalyzer/helpers/filesystem_helper.py +27 -3
- pdfalyzer/helpers/image_helper.py +31 -0
- pdfalyzer/helpers/rich_text_helper.py +51 -1
- pdfalyzer/helpers/string_helper.py +6 -1
- pdfalyzer/output/character_mapping.py +1 -1
- pdfalyzer/output/layout.py +13 -3
- pdfalyzer/output/styles/rich_theme.py +2 -1
- pdfalyzer/output/tables/decoding_stats_table.py +4 -4
- pdfalyzer/output/tables/font_summary_table.py +2 -2
- pdfalyzer/pdfalyzer.py +20 -13
- pdfalyzer/util/argument_parser.py +102 -7
- pdfalyzer/util/page_range.py +54 -0
- {pdfalyzer-1.16.13.dist-info → pdfalyzer-1.17.0.dist-info}/METADATA +35 -11
- pdfalyzer-1.17.0.dist-info/RECORD +53 -0
- {pdfalyzer-1.16.13.dist-info → pdfalyzer-1.17.0.dist-info}/entry_points.txt +2 -0
- pdfalyzer-1.16.13.dist-info/RECORD +0 -50
- {pdfalyzer-1.16.13.dist-info → pdfalyzer-1.17.0.dist-info}/LICENSE +0 -0
- {pdfalyzer-1.16.13.dist-info → pdfalyzer-1.17.0.dist-info}/WHEEL +0 -0
|
@@ -1,17 +1,24 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Functions for miscellaneous Rich text/string pretty printing operations.
|
|
3
3
|
"""
|
|
4
|
-
from
|
|
4
|
+
from sys import stderr
|
|
5
|
+
from typing import List, Optional, Union
|
|
5
6
|
|
|
6
7
|
from pypdf.generic import PdfObject
|
|
7
8
|
from rich.console import Console
|
|
9
|
+
from rich.panel import Panel
|
|
10
|
+
from rich.padding import Padding
|
|
8
11
|
from rich.text import Text
|
|
12
|
+
from yaralyzer.output.rich_console import console
|
|
9
13
|
|
|
10
14
|
from pdfalyzer.helpers.pdf_object_helper import pypdf_class_name
|
|
11
15
|
from pdfalyzer.output.styles.node_colors import get_label_style, get_class_style_italic
|
|
12
16
|
|
|
17
|
+
ARROW_BULLET = '➤ '
|
|
18
|
+
|
|
13
19
|
# Usually we use the yaralyzer console but that has no highlighter
|
|
14
20
|
pdfalyzer_console = Console(color_system='256')
|
|
21
|
+
stderr_console = Console(color_system='256', file=stderr)
|
|
15
22
|
|
|
16
23
|
|
|
17
24
|
def print_highlighted(msg: Union[str, Text], **kwargs) -> None:
|
|
@@ -32,6 +39,21 @@ def quoted_text(
|
|
|
32
39
|
return txt
|
|
33
40
|
|
|
34
41
|
|
|
42
|
+
def indented_bullet(msg: Union[str, Text], style: Optional[str] = None) -> Text:
|
|
43
|
+
return Text(' ') + bullet_text(msg, style)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def bullet_text(msg: Union[str, Text], style: Optional[str] = None) -> Text:
|
|
47
|
+
if isinstance(msg, str):
|
|
48
|
+
msg = Text(msg, style=style)
|
|
49
|
+
|
|
50
|
+
return Text(ARROW_BULLET).append(msg)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def mild_warning(msg: str) -> None:
|
|
54
|
+
console.print(indented_bullet(Text(msg, style='mild_warning')))
|
|
55
|
+
|
|
56
|
+
|
|
35
57
|
def node_label(idnum: int, label: str, pdf_object: PdfObject, underline: bool = True) -> Text:
|
|
36
58
|
"""Colored text representation of a PDF node. Example: <5:FontDescriptor(Dictionary)>."""
|
|
37
59
|
text = Text('<', style='white')
|
|
@@ -55,5 +77,33 @@ def number_and_pct(_number: int, total: int, digits: int = 1) -> Text:
|
|
|
55
77
|
|
|
56
78
|
|
|
57
79
|
def pct_txt(_number: int, total: int, digits: int = 1) -> Text:
|
|
80
|
+
"""Return nicely formatted percentage rounded to `digits` decimal places, e.g. '(80.0%)'."""
|
|
58
81
|
pct = (100 * float(_number) / float(total)).__round__(digits)
|
|
59
82
|
return Text(f"({pct}%)", style='blue')
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def warning_text(text: Union[str, Text]) -> Text:
|
|
86
|
+
msg = Text('').append(f"WARNING", style='bright_yellow').append(": ")
|
|
87
|
+
|
|
88
|
+
if isinstance(text, Text):
|
|
89
|
+
return msg + text
|
|
90
|
+
else:
|
|
91
|
+
return msg.append(text)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def error_text(text: Union[str, Text]) -> Text:
|
|
95
|
+
msg = Text('').append(f"ERROR", style='bright_red').append(": ")
|
|
96
|
+
|
|
97
|
+
if isinstance(text, Text):
|
|
98
|
+
return msg + text
|
|
99
|
+
else:
|
|
100
|
+
return msg.append(text)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def attention_getting_panel(text: Text, title: str, style: str = 'white on red') -> Padding:
|
|
104
|
+
p = Panel(text, padding=(2), title=title, style=style)
|
|
105
|
+
return Padding(p, pad=(1, 10, 2, 10))
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def print_error(text: Union[str, Text]) -> Text:
|
|
109
|
+
console.print(error_text(text))
|
|
@@ -35,10 +35,15 @@ def count_pattern_matches_in_text(pattern: str, text: str) -> int:
|
|
|
35
35
|
|
|
36
36
|
|
|
37
37
|
def count_regex_matches_in_text(regex: Pattern, text: str) -> int:
|
|
38
|
-
"""For use when you precompile the regex"""
|
|
38
|
+
"""For use when you precompile the regex."""
|
|
39
39
|
return sum(1 for _ in regex.finditer(text))
|
|
40
40
|
|
|
41
41
|
|
|
42
|
+
def exception_str(e: Exception) -> str:
|
|
43
|
+
"""A string with the type and message."""
|
|
44
|
+
return f"{type(e).__name__}: {e}"
|
|
45
|
+
|
|
46
|
+
|
|
42
47
|
def root_address(_string: str) -> str:
|
|
43
48
|
"""Strip the bracketed part off an address, e.g. '/Root[1]' => '/Root'."""
|
|
44
49
|
return _string.split('[')[0]
|
|
@@ -19,7 +19,7 @@ CHARMAP_PADDING = (0, 2, 0, 10)
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
def print_character_mapping(font: 'FontInfo') -> None: # noqa: F821
|
|
22
|
-
"""Prints the character mapping extracted by PyPDF._charmap in tidy columns"""
|
|
22
|
+
"""Prints the character mapping extracted by PyPDF._charmap in tidy columns."""
|
|
23
23
|
if font.character_mapping is None or len(font.character_mapping) == 0:
|
|
24
24
|
log.info(f"No character map found in {font}")
|
|
25
25
|
return
|
pdfalyzer/output/layout.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Methods to help with the formatting of the output tables, headers, panels, etc.
|
|
3
3
|
"""
|
|
4
|
+
from typing import List
|
|
5
|
+
|
|
4
6
|
from rich import box
|
|
5
7
|
from rich.padding import Padding
|
|
6
8
|
from rich.panel import Panel
|
|
@@ -11,7 +13,7 @@ DEFAULT_SUBTABLE_COL_STYLES = ['white', 'bright_white']
|
|
|
11
13
|
HEADER_PADDING = (1, 1)
|
|
12
14
|
|
|
13
15
|
|
|
14
|
-
def generate_subtable(cols, header_style='subtable') -> Table:
|
|
16
|
+
def generate_subtable(cols: List[str], header_style: str = 'subtable') -> Table:
|
|
15
17
|
"""Suited for placement in larger tables."""
|
|
16
18
|
table = Table(
|
|
17
19
|
box=box.SIMPLE,
|
|
@@ -33,10 +35,12 @@ def generate_subtable(cols, header_style='subtable') -> Table:
|
|
|
33
35
|
|
|
34
36
|
|
|
35
37
|
def subheading_width() -> int:
|
|
38
|
+
"""Return 75% of the console width."""
|
|
36
39
|
return int(console_width() * 0.75)
|
|
37
40
|
|
|
38
41
|
|
|
39
42
|
def half_width() -> int:
|
|
43
|
+
"""Return 50% of the console width."""
|
|
40
44
|
return int(console_width() * 0.5)
|
|
41
45
|
|
|
42
46
|
|
|
@@ -46,28 +50,34 @@ def pad_header(header: str) -> Padding:
|
|
|
46
50
|
|
|
47
51
|
|
|
48
52
|
def print_section_header(headline: str, style: str = '') -> None:
|
|
53
|
+
"""Prints a full-width section header with padding above and below."""
|
|
49
54
|
console.line(2)
|
|
50
55
|
_print_header_panel(headline, f"{style} reverse", True, console_width(), HEADER_PADDING)
|
|
51
56
|
console.line()
|
|
52
57
|
|
|
53
58
|
|
|
54
59
|
def print_section_subheader(headline: str, style: str = '') -> None:
|
|
60
|
+
"""Prints a section subheader at 75% of the console width, preceded by a blank line."""
|
|
55
61
|
console.line()
|
|
56
62
|
_print_header_panel(headline, style, True, subheading_width(), HEADER_PADDING)
|
|
57
63
|
|
|
58
64
|
|
|
59
65
|
def print_section_sub_subheader(headline: str, style: str = ''):
|
|
66
|
+
"""Prints a half-width section sub-subheader preceded by a blank line, with no padding inside the panel."""
|
|
60
67
|
console.line()
|
|
61
68
|
_print_header_panel(headline, style, True, half_width())
|
|
62
69
|
|
|
63
70
|
|
|
64
|
-
def print_headline_panel(headline, style: str = ''):
|
|
71
|
+
def print_headline_panel(headline: str, style: str = ''):
|
|
72
|
+
"""Prints a full-width headline panel with no padding above or below."""
|
|
65
73
|
_print_header_panel(headline, style, False, console_width())
|
|
66
74
|
|
|
67
75
|
|
|
68
|
-
def print_fatal_error_panel(headline):
|
|
76
|
+
def print_fatal_error_panel(headline: str):
|
|
77
|
+
"""Prints a full-width red blinking panel for fatal errors."""
|
|
69
78
|
print_headline_panel(headline, style='red blink')
|
|
70
79
|
|
|
71
80
|
|
|
72
81
|
def _print_header_panel(headline: str, style: str, expand: bool, width: int, padding: tuple = (0,)) -> None:
|
|
82
|
+
"""Helper to print a rich `Panel` with the given style, width, and padding."""
|
|
73
83
|
console.print(Panel(headline, style=style, expand=expand, width=width or subheading_width(), padding=padding))
|
|
@@ -51,7 +51,8 @@ PDFALYZER_THEME_DICT.update({
|
|
|
51
51
|
'warn.harsh': 'reverse bright_yellow',
|
|
52
52
|
# error log events
|
|
53
53
|
'fail': 'bold reverse red',
|
|
54
|
-
'
|
|
54
|
+
'mild_error': 'red', # TODO: unused?
|
|
55
|
+
'mild_warning': 'color(228) dim',
|
|
55
56
|
'red_alert': 'blink bold red reverse on white',
|
|
56
57
|
})
|
|
57
58
|
|
|
@@ -19,7 +19,7 @@ DECODES_SUBTABLE_COLS = ['Encoding', '#', 'Decoded', '#', 'Forced', '#', 'Failed
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
def build_decoding_stats_table(scanner: BinaryScanner) -> Table:
|
|
22
|
-
"""Diplay aggregate results on the decoding attempts we made on subsets of scanner.bytes"""
|
|
22
|
+
"""Display aggregate results on the decoding attempts we made on subsets of `scanner.bytes`."""
|
|
23
23
|
stats_table = _new_decoding_stats_table(scanner.label.plain if scanner.label else '')
|
|
24
24
|
regexes_not_found_in_stream = []
|
|
25
25
|
|
|
@@ -58,9 +58,9 @@ def build_decoding_stats_table(scanner: BinaryScanner) -> Table:
|
|
|
58
58
|
return stats_table
|
|
59
59
|
|
|
60
60
|
|
|
61
|
-
def _new_decoding_stats_table(
|
|
62
|
-
"""Build an empty table for displaying decoding stats"""
|
|
63
|
-
title = prefix_with_style(
|
|
61
|
+
def _new_decoding_stats_table(title_str: str) -> Table:
|
|
62
|
+
"""Build an empty table for displaying decoding stats."""
|
|
63
|
+
title = prefix_with_style(title_str, style='blue underline')
|
|
64
64
|
title.append(": Decoding Attempts Summary Statistics", style='bright_white bold')
|
|
65
65
|
|
|
66
66
|
table = Table(
|
|
@@ -15,8 +15,8 @@ ATTRIBUTES_TO_SHOW_IN_SUMMARY_TABLE = [
|
|
|
15
15
|
]
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
def font_summary_table(font):
|
|
19
|
-
"""Build a Rich Table with important info about the font"""
|
|
18
|
+
def font_summary_table(font: 'FontInfo') -> Table: # noqa: F821
|
|
19
|
+
"""Build a Rich `Table` with important info about the font"""
|
|
20
20
|
table = Table('', '', show_header=False)
|
|
21
21
|
table.columns[0].style = 'font.property'
|
|
22
22
|
table.columns[0].justify = 'right'
|
pdfalyzer/pdfalyzer.py
CHANGED
|
@@ -1,10 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
3
|
-
wrapping each internal PDF object in a PdfTreeNode. Tree is managed by
|
|
4
|
-
the anytree library. Information about the tree as a whole is stored
|
|
5
|
-
in this class.
|
|
6
|
-
Once the PDF is parsed this class manages access to
|
|
7
|
-
information about or from the underlying PDF tree.
|
|
2
|
+
PDFalyzer: Analyze and explore the structure of PDF files.
|
|
8
3
|
"""
|
|
9
4
|
from os.path import basename
|
|
10
5
|
from typing import Dict, Iterator, List, Optional
|
|
@@ -31,7 +26,19 @@ TRAILER_FALLBACK_ID = 10000000
|
|
|
31
26
|
|
|
32
27
|
|
|
33
28
|
class Pdfalyzer:
|
|
29
|
+
"""
|
|
30
|
+
Walks a PDF's internals and builds the PDF logical structure tree.
|
|
31
|
+
|
|
32
|
+
Each of the PDF's internal objects is wrapped in a `PdfTreeNode` object. The tree is managed
|
|
33
|
+
by the `anytree` library. Information about the tree as a whole is stored in this class.
|
|
34
|
+
Once the PDF is parsed this class provides access to info about or from the underlying PDF tree.
|
|
35
|
+
"""
|
|
36
|
+
|
|
34
37
|
def __init__(self, pdf_path: str):
|
|
38
|
+
"""
|
|
39
|
+
Args:
|
|
40
|
+
pdf_path: Path to the PDF file to analyze
|
|
41
|
+
"""
|
|
35
42
|
self.pdf_path = pdf_path
|
|
36
43
|
self.pdf_basename = basename(pdf_path)
|
|
37
44
|
self.pdf_bytes = load_binary_data(pdf_path)
|
|
@@ -72,7 +79,7 @@ class Pdfalyzer:
|
|
|
72
79
|
log.info(f"Walk complete.")
|
|
73
80
|
|
|
74
81
|
def walk_node(self, node: PdfTreeNode) -> None:
|
|
75
|
-
"""Recursively walk the PDF's tree structure starting at a given node"""
|
|
82
|
+
"""Recursively walk the PDF's tree structure starting at a given node."""
|
|
76
83
|
log.info(f'walk_node() called with {node}. Object dump:\n{print_with_header(node.obj, node.label)}')
|
|
77
84
|
nodes_to_walk_next = [self._add_relationship_to_pdf_tree(r) for r in node.references_to_other_nodes()]
|
|
78
85
|
node.all_references_processed = True
|
|
@@ -82,7 +89,7 @@ class Pdfalyzer:
|
|
|
82
89
|
self.walk_node(next_node)
|
|
83
90
|
|
|
84
91
|
def find_node_by_idnum(self, idnum) -> Optional[PdfTreeNode]:
|
|
85
|
-
"""Find node with idnum in the tree. Return None if that node is not reachable from the root."""
|
|
92
|
+
"""Find node with `idnum` in the tree. Return `None` if that node is not reachable from the root."""
|
|
86
93
|
nodes = [
|
|
87
94
|
node for node in findall_by_attr(self.pdf_tree, name='idnum', value=idnum)
|
|
88
95
|
if not isinstance(node, SymlinkNode)
|
|
@@ -96,7 +103,7 @@ class Pdfalyzer:
|
|
|
96
103
|
raise PdfWalkError(f"Too many nodes had id {idnum}: {nodes}")
|
|
97
104
|
|
|
98
105
|
def is_in_tree(self, search_for_node: PdfTreeNode) -> bool:
|
|
99
|
-
"""Returns true if search_for_node is in the tree already."""
|
|
106
|
+
"""Returns true if `search_for_node` is in the tree already."""
|
|
100
107
|
return any([node == search_for_node for node in self.node_iterator()])
|
|
101
108
|
|
|
102
109
|
def node_iterator(self) -> Iterator[PdfTreeNode]:
|
|
@@ -110,7 +117,7 @@ class Pdfalyzer:
|
|
|
110
117
|
|
|
111
118
|
def _add_relationship_to_pdf_tree(self, relationship: PdfObjectRelationship) -> Optional[PdfTreeNode]:
|
|
112
119
|
"""
|
|
113
|
-
Place the relationship
|
|
120
|
+
Place the `relationship` node in the tree. Returns an optional node that should be
|
|
114
121
|
placed in the PDF node processing queue.
|
|
115
122
|
"""
|
|
116
123
|
log.info(f'Assessing relationship {relationship}...')
|
|
@@ -172,7 +179,7 @@ class Pdfalyzer:
|
|
|
172
179
|
return to_node
|
|
173
180
|
|
|
174
181
|
def _resolve_indeterminate_nodes(self) -> None:
|
|
175
|
-
"""Place all indeterminate nodes in the tree."""
|
|
182
|
+
"""Place all indeterminate nodes in the tree. Called after all nodes have been walked."""
|
|
176
183
|
indeterminate_nodes = [self.nodes_encountered[idnum] for idnum in self.indeterminate_ids]
|
|
177
184
|
indeterminate_nodes_string = "\n ".join([f"{node}" for node in indeterminate_nodes])
|
|
178
185
|
log.info(f"Resolving {len(indeterminate_nodes)} indeterminate nodes: {indeterminate_nodes_string}")
|
|
@@ -185,7 +192,7 @@ class Pdfalyzer:
|
|
|
185
192
|
IndeterminateNode(node).place_node()
|
|
186
193
|
|
|
187
194
|
def _extract_font_infos(self) -> None:
|
|
188
|
-
"""Extract information about fonts in the tree and place it in self.font_infos"""
|
|
195
|
+
"""Extract information about fonts in the tree and place it in `self.font_infos`."""
|
|
189
196
|
for node in self.node_iterator():
|
|
190
197
|
if isinstance(node.obj, dict) and RESOURCES in node.obj:
|
|
191
198
|
log.debug(f"Extracting fonts from node with '{RESOURCES}' key: {node}...")
|
|
@@ -207,6 +214,6 @@ class Pdfalyzer:
|
|
|
207
214
|
return new_node
|
|
208
215
|
|
|
209
216
|
def _print_nodes_encountered(self) -> None:
|
|
210
|
-
"""Debug method that displays which nodes have already been walked"""
|
|
217
|
+
"""Debug method that displays which nodes have already been walked."""
|
|
211
218
|
for i in sorted(self.nodes_encountered.keys()):
|
|
212
219
|
console.print(f'{i}: {self.nodes_encountered[i]}')
|
|
@@ -7,19 +7,23 @@ from collections import namedtuple
|
|
|
7
7
|
from functools import partial, update_wrapper
|
|
8
8
|
from importlib.metadata import version
|
|
9
9
|
from os import getcwd, path
|
|
10
|
+
from pathlib import Path
|
|
10
11
|
from typing import List, Optional
|
|
11
12
|
|
|
12
13
|
from rich_argparse_plus import RichHelpFormatterPlus
|
|
13
14
|
from rich.prompt import Confirm
|
|
14
15
|
from rich.text import Text
|
|
16
|
+
from yaralyzer.helpers.file_helper import files_in_dir
|
|
15
17
|
from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args, source
|
|
16
18
|
from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation
|
|
17
19
|
|
|
18
|
-
|
|
20
|
+
|
|
21
|
+
from pdfalyzer.config import ALL_STREAMS, PDFALYZER, PdfalyzerConfig
|
|
19
22
|
from pdfalyzer.detection.constants.binary_regexes import QUOTE_PATTERNS
|
|
20
23
|
from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
|
|
21
24
|
with_pdf_extension)
|
|
22
25
|
from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
26
|
+
from pdfalyzer.util.page_range import PageRangeArgumentValidator
|
|
23
27
|
|
|
24
28
|
# NamedTuple to keep our argument selection orderly
|
|
25
29
|
OutputSection = namedtuple('OutputSection', ['argument', 'method'])
|
|
@@ -124,9 +128,9 @@ parser._action_groups = parser._action_groups[:2] + [parser._action_groups[-1]]
|
|
|
124
128
|
# Main argument parsing begins #
|
|
125
129
|
################################
|
|
126
130
|
def parse_arguments():
|
|
127
|
-
"""Parse command line args. Most
|
|
131
|
+
"""Parse command line args. Most args can also be communicated to the app by setting env vars."""
|
|
128
132
|
if '--version' in sys.argv:
|
|
129
|
-
print(f"pdfalyzer {version(
|
|
133
|
+
print(f"pdfalyzer {version(PDFALYZER)}")
|
|
130
134
|
sys.exit()
|
|
131
135
|
|
|
132
136
|
args = parser.parse_args()
|
|
@@ -158,10 +162,16 @@ def parse_arguments():
|
|
|
158
162
|
return args
|
|
159
163
|
|
|
160
164
|
|
|
161
|
-
def output_sections(args, pdfalyzer) -> List[OutputSection]:
|
|
165
|
+
def output_sections(args: Namespace, pdfalyzer: 'Pdfalyzer') -> List[OutputSection]: # noqa: F821
|
|
162
166
|
"""
|
|
163
167
|
Determine which of the tree visualizations, font scans, etc should be run.
|
|
164
168
|
If nothing is specified output ALL sections other than --streams which is v. slow/verbose.
|
|
169
|
+
|
|
170
|
+
Args:
|
|
171
|
+
args: parsed command line arguments
|
|
172
|
+
pdfalyzer: the `pdfalyzer` instance whose methods will be called to produce output
|
|
173
|
+
Returns:
|
|
174
|
+
List[OutputSection]: List of `OutputSection` namedtuples with 'argument' and 'method' fields
|
|
165
175
|
"""
|
|
166
176
|
# Create a partial for print_font_info() because it's the only one that can take an argument
|
|
167
177
|
# partials have no __name__ so update_wrapper() propagates the 'print_font_info' as this partial's name
|
|
@@ -196,9 +206,10 @@ def all_sections_chosen(args):
|
|
|
196
206
|
return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)
|
|
197
207
|
|
|
198
208
|
|
|
199
|
-
|
|
200
|
-
#
|
|
201
|
-
|
|
209
|
+
#############################################################
|
|
210
|
+
# Separate arg parsers for combine_pdfs and other scripts #
|
|
211
|
+
#############################################################
|
|
212
|
+
|
|
202
213
|
MAX_QUALITY = 10
|
|
203
214
|
|
|
204
215
|
combine_pdfs_parser = ArgumentParser(
|
|
@@ -251,6 +262,90 @@ def parse_combine_pdfs_args() -> Namespace:
|
|
|
251
262
|
return args
|
|
252
263
|
|
|
253
264
|
|
|
265
|
+
###########################################
|
|
266
|
+
# Parse args for extract_pdf_pages() #
|
|
267
|
+
###########################################
|
|
268
|
+
page_range_validator = PageRangeArgumentValidator()
|
|
269
|
+
|
|
270
|
+
extract_pdf_parser = ArgumentParser(
|
|
271
|
+
formatter_class=RichHelpFormatterPlus,
|
|
272
|
+
description="Extract pages from one PDF into a new PDF.",
|
|
273
|
+
)
|
|
274
|
+
|
|
275
|
+
extract_pdf_parser.add_argument('pdf_file', metavar='PDF_FILE', help='PDF to extract pages from')
|
|
276
|
+
|
|
277
|
+
extract_pdf_parser.add_argument('--page-range', '-r',
|
|
278
|
+
type=page_range_validator,
|
|
279
|
+
help=page_range_validator.HELP_MSG,
|
|
280
|
+
required=True)
|
|
281
|
+
|
|
282
|
+
extract_pdf_parser.add_argument('--destination-dir', '-d',
|
|
283
|
+
help="directory to write the new PDF to",
|
|
284
|
+
default=Path.cwd())
|
|
285
|
+
|
|
286
|
+
extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def parse_pdf_page_extraction_args() -> Namespace:
|
|
290
|
+
args = extract_pdf_parser.parse_args()
|
|
291
|
+
|
|
292
|
+
if not is_pdf(args.pdf_file):
|
|
293
|
+
log.error(f"'{args.pdf_file}' is not a PDF.")
|
|
294
|
+
sys.exit(-1)
|
|
295
|
+
elif not Path(args.destination_dir).exists():
|
|
296
|
+
log.error(f"Destination dir '{args.destination_dir}' does not exist.")
|
|
297
|
+
sys.exit(1)
|
|
298
|
+
|
|
299
|
+
return args
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
############################################
|
|
303
|
+
# Parse args for extract_text_from_pdfs() #
|
|
304
|
+
############################################
|
|
305
|
+
extract_text_parser = ArgumentParser(
|
|
306
|
+
formatter_class=RichHelpFormatterPlus,
|
|
307
|
+
description="Extract the text from one or more files or directories.",
|
|
308
|
+
epilog="If any of the FILE_OR_DIRs is a directory all files in that directory will be extracted."
|
|
309
|
+
)
|
|
310
|
+
|
|
311
|
+
extract_text_parser.add_argument('file_or_dir', nargs='+', metavar='FILE_OR_DIR')
|
|
312
|
+
extract_text_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
|
|
313
|
+
|
|
314
|
+
extract_text_parser.add_argument('--page-range', '-r',
|
|
315
|
+
type=page_range_validator,
|
|
316
|
+
help=f"[PDFs only] {page_range_validator.HELP_MSG}")
|
|
317
|
+
|
|
318
|
+
extract_text_parser.add_argument('--print-as-parsed', '-p',
|
|
319
|
+
action='store_true',
|
|
320
|
+
help='print pages as they are parsed instead of waiting until document is fully parsed')
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def parse_text_extraction_args() -> Namespace:
|
|
324
|
+
args = extract_text_parser.parse_args()
|
|
325
|
+
args.files_to_process = []
|
|
326
|
+
|
|
327
|
+
for file_or_dir in args.file_or_dir:
|
|
328
|
+
file_path = Path(file_or_dir)
|
|
329
|
+
|
|
330
|
+
if not file_path.exists():
|
|
331
|
+
log.error(f"File '{file_path}' doesn't exist!")
|
|
332
|
+
sys.exit(-1)
|
|
333
|
+
elif file_path.is_dir():
|
|
334
|
+
args.files_to_process.extend(files_in_dir(file_path, 'pdf'))
|
|
335
|
+
else:
|
|
336
|
+
args.files_to_process.append(file_path)
|
|
337
|
+
|
|
338
|
+
if args.page_range and (len(args.files_to_process) > 1 or not is_pdf(args.files_to_process[0])):
|
|
339
|
+
log.error(f"--page-range can only be specified for a single PDF")
|
|
340
|
+
sys.exit(-1)
|
|
341
|
+
|
|
342
|
+
return args
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
#############
|
|
346
|
+
# Helpers #
|
|
347
|
+
#############
|
|
348
|
+
|
|
254
349
|
def ask_to_proceed() -> None:
|
|
255
350
|
"""Exit if user doesn't confirm they want to proceed."""
|
|
256
351
|
if not Confirm.ask(Text("Proceed anyway?")):
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""
|
|
2
|
+
A range of page numbers. Copied from clown_sort repo.
|
|
3
|
+
"""
|
|
4
|
+
import re
|
|
5
|
+
from argparse import ArgumentTypeError
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Tuple
|
|
8
|
+
|
|
9
|
+
PAGE_RANGE_REGEX = re.compile('\\d(-\\d)?')
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
|
|
13
|
+
class PageRange:
|
|
14
|
+
page_range: str
|
|
15
|
+
|
|
16
|
+
def __post_init__(self):
|
|
17
|
+
if not PAGE_RANGE_REGEX.match(self.page_range):
|
|
18
|
+
raise ValueError(f"Invalid page range '{self.page_range}'")
|
|
19
|
+
|
|
20
|
+
if '-' in self.page_range:
|
|
21
|
+
(self.first_page, self.last_page) = (int(p) for p in self.page_range.split('-'))
|
|
22
|
+
else:
|
|
23
|
+
self.first_page = int(self.page_range)
|
|
24
|
+
self.last_page = self.first_page + 1
|
|
25
|
+
|
|
26
|
+
if self.last_page <= self.first_page:
|
|
27
|
+
raise ValueError(f"Invalid page range {self.__repr__()}")
|
|
28
|
+
|
|
29
|
+
def in_range(self, page_number) -> bool:
|
|
30
|
+
"""Returns `True` if `page_number` is in this range."""
|
|
31
|
+
return page_number >= self.first_page and page_number < self.last_page
|
|
32
|
+
|
|
33
|
+
def file_suffix(self) -> str:
|
|
34
|
+
"""String that can be used as file suffix."""
|
|
35
|
+
if self.first_page + 1 == self.last_page:
|
|
36
|
+
return f"page_{self.first_page}"
|
|
37
|
+
else:
|
|
38
|
+
return f"pages_{self.first_page}-{self.last_page}"
|
|
39
|
+
|
|
40
|
+
def to_tuple(self) -> Tuple[int, int]:
|
|
41
|
+
return (self.first_page, self.last_page)
|
|
42
|
+
|
|
43
|
+
def __repr__(self) -> str:
|
|
44
|
+
return f"PageRange({self.first_page}, {self.last_page})"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class PageRangeArgumentValidator(object):
|
|
48
|
+
HELP_MSG = "a single digit ('11') or a range ('11-15') (WILL NOT extract the last page)"
|
|
49
|
+
|
|
50
|
+
def __call__(self, value):
|
|
51
|
+
if not PAGE_RANGE_REGEX.match(value):
|
|
52
|
+
raise ArgumentTypeError("Argument has to match '{}'".format(PAGE_RANGE_REGEX.pattern))
|
|
53
|
+
|
|
54
|
+
return PageRange(value)
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pdfalyzer
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.17.0
|
|
4
4
|
Summary: PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
|
|
5
5
|
Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
6
6
|
License: GPL-3.0-or-later
|
|
7
7
|
Keywords: ascii art,binary,color,cybersecurity,DFIR,encoding,font,infosec,maldoc,malicious pdf,malware,malware analysis,pdf,pdfs,pdf analysis,pypdf,threat assessment,threat hunting,threat intelligence,threat research,threatintel,visualization,yara
|
|
8
8
|
Author: Michel de Cryptadamus
|
|
9
9
|
Author-email: michel@cryptadamus.com
|
|
10
|
-
Requires-Python: >=3.
|
|
10
|
+
Requires-Python: >=3.10,<4.0
|
|
11
11
|
Classifier: Development Status :: 5 - Production/Stable
|
|
12
12
|
Classifier: Environment :: Console
|
|
13
13
|
Classifier: Intended Audience :: Information Technology
|
|
@@ -18,24 +18,26 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
18
18
|
Classifier: Programming Language :: Python :: 3.11
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.12
|
|
20
20
|
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
22
21
|
Classifier: Topic :: Artistic Software
|
|
23
22
|
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
24
23
|
Classifier: Topic :: Security
|
|
24
|
+
Provides-Extra: extract
|
|
25
|
+
Requires-Dist: PyMuPDF (>=1.26.4,<2.0.0) ; extra == "extract"
|
|
25
26
|
Requires-Dist: anytree (>=2.13,<3.0)
|
|
26
27
|
Requires-Dist: pypdf (>=6.0.0,<7.0.0)
|
|
27
|
-
Requires-Dist:
|
|
28
|
+
Requires-Dist: pytesseract (>=0.3.13,<0.4.0) ; extra == "extract"
|
|
29
|
+
Requires-Dist: yaralyzer (>=1.0.9,<2.0.0)
|
|
28
30
|
Project-URL: Changelog, https://github.com/michelcrypt4d4mus/pdfalyzer/blob/master/CHANGELOG.md
|
|
29
31
|
Project-URL: Documentation, https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
30
32
|
Project-URL: Repository, https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
31
33
|
Description-Content-Type: text/markdown
|
|
32
34
|
|
|
33
|
-
<!--  -->
|
|
34
|
-

|
|
35
35
|
[](https://pypi.org/project/pdfalyzer/)
|
|
36
|
-
[](https://github.com/michelcrypt4d4mus/pdfalyzer)
|
|
37
36
|

|
|
37
|
+
[](https://github.com/michelcrypt4d4mus/pdfalyzer)
|
|
38
|
+

|
|
38
39
|

|
|
40
|
+
[](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml)
|
|
39
41
|
|
|
40
42
|
|
|
41
43
|
# THE PDFALYZER
|
|
@@ -114,7 +116,12 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
|
|
|
114
116
|
The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
|
|
115
117
|
|
|
116
118
|
### Setting Command Line Options Permanently With A `.pdfalyzer` File
|
|
117
|
-
When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer`
|
|
119
|
+
When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
|
|
120
|
+
|
|
121
|
+
1. the current directory
|
|
122
|
+
2. the user's home directory
|
|
123
|
+
|
|
124
|
+
If it finds a `.pdfalyzer` file in either place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
|
|
118
125
|
|
|
119
126
|
### Environment Variables
|
|
120
127
|
Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
|
|
@@ -125,10 +132,9 @@ Run `pdfalyzer_show_color_theme` to see the color theme employed.
|
|
|
125
132
|
### Guarantees
|
|
126
133
|
Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
|
|
127
134
|
|
|
128
|
-
## Example
|
|
135
|
+
## Example Malicious PDF Investigation
|
|
129
136
|
[BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
|
|
130
137
|
|
|
131
|
-
-------------
|
|
132
138
|
|
|
133
139
|
## Use As A Code Library
|
|
134
140
|
For info about setting up a dev environment see [Contributing](#contributing) below.
|
|
@@ -239,9 +245,27 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
|
|
|
239
245
|
|
|
240
246
|
-------------
|
|
241
247
|
|
|
248
|
+
|
|
242
249
|
# PDF Resources
|
|
243
250
|
## Included PDF Tools
|
|
244
|
-
The Pdfalyzer
|
|
251
|
+
The Pdfalyzer comes with a few command line tools:
|
|
252
|
+
|
|
253
|
+
#### `combine_pdfs`
|
|
254
|
+
Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
|
|
255
|
+
|
|
256
|
+
#### `extract_pdf_pages`
|
|
257
|
+
Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` to see the options.
|
|
258
|
+

|
|
259
|
+
|
|
260
|
+
#### `extract_text_from_pdfs`
|
|
261
|
+
Extracts text from a PDF, including applying OCR to all embedded images. Requires that you install The Pdfalyzer's optional dependencies:
|
|
262
|
+
|
|
263
|
+
```bash
|
|
264
|
+
pipx install pdfalyzer[extract]
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
Run `extract_text_from_pdfs --help` to see the options.
|
|
268
|
+
|
|
245
269
|
|
|
246
270
|
## 3rd Party PDF Tools
|
|
247
271
|
### Installing Didier Stevens's PDF Analysis Tools
|