pdfalyzer 1.16.13__py3-none-any.whl → 1.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pdfalyzer might be problematic. Click here for more details.
- CHANGELOG.md +8 -0
- pdfalyzer/__init__.py +19 -5
- pdfalyzer/binary/binary_scanner.py +28 -12
- pdfalyzer/config.py +2 -1
- pdfalyzer/decorators/indeterminate_node.py +11 -11
- pdfalyzer/decorators/pdf_file.py +212 -0
- pdfalyzer/decorators/pdf_object_properties.py +3 -3
- pdfalyzer/decorators/pdf_tree_node.py +17 -11
- pdfalyzer/decorators/pdf_tree_verifier.py +2 -0
- pdfalyzer/detection/yaralyzer_helper.py +9 -10
- pdfalyzer/helpers/filesystem_helper.py +27 -3
- pdfalyzer/helpers/image_helper.py +31 -0
- pdfalyzer/helpers/rich_text_helper.py +51 -1
- pdfalyzer/helpers/string_helper.py +6 -1
- pdfalyzer/output/character_mapping.py +1 -1
- pdfalyzer/output/layout.py +13 -3
- pdfalyzer/output/styles/rich_theme.py +2 -1
- pdfalyzer/output/tables/decoding_stats_table.py +4 -4
- pdfalyzer/output/tables/font_summary_table.py +2 -2
- pdfalyzer/pdfalyzer.py +20 -13
- pdfalyzer/util/argument_parser.py +102 -7
- pdfalyzer/util/page_range.py +54 -0
- {pdfalyzer-1.16.13.dist-info → pdfalyzer-1.17.0.dist-info}/METADATA +35 -11
- pdfalyzer-1.17.0.dist-info/RECORD +53 -0
- {pdfalyzer-1.16.13.dist-info → pdfalyzer-1.17.0.dist-info}/entry_points.txt +2 -0
- pdfalyzer-1.16.13.dist-info/RECORD +0 -50
- {pdfalyzer-1.16.13.dist-info → pdfalyzer-1.17.0.dist-info}/LICENSE +0 -0
- {pdfalyzer-1.16.13.dist-info → pdfalyzer-1.17.0.dist-info}/WHEEL +0 -0
CHANGELOG.md
CHANGED
|
@@ -1,5 +1,13 @@
|
|
|
1
1
|
# NEXT RELEASE
|
|
2
2
|
|
|
3
|
+
# 1.17.0
|
|
4
|
+
* Add `extract_pdf_pages` command line tool (imported from `clown_sort`)
|
|
5
|
+
* Add `extract_text_from_pdfs` command line tool (imported from `clown_sort`)
|
|
6
|
+
|
|
7
|
+
### 1.16.14
|
|
8
|
+
* Bump `yaralyzer` to v1.0.9, handle `FileNotFoundError` which is now raised instead of `TypeError`
|
|
9
|
+
* Drop support for python 3.9
|
|
10
|
+
|
|
3
11
|
### 1.16.13
|
|
4
12
|
* Bump `yaralyzer` to v1.0.7 and fix reference to yaralyzer's renamed `prefix_with_style()` method
|
|
5
13
|
|
pdfalyzer/__init__.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import code
|
|
2
2
|
import sys
|
|
3
|
+
from argparse import Namespace
|
|
3
4
|
from os import environ, getcwd, path
|
|
4
5
|
|
|
5
6
|
from dotenv import load_dotenv
|
|
@@ -24,13 +25,14 @@ from yaralyzer.output.file_export import invoke_rich_export
|
|
|
24
25
|
from yaralyzer.output.rich_console import console
|
|
25
26
|
from yaralyzer.util.logging import log_and_print
|
|
26
27
|
|
|
28
|
+
from pdfalyzer.decorators.pdf_file import PdfFile
|
|
27
29
|
from pdfalyzer.helpers.filesystem_helper import file_size_in_mb, set_max_open_files
|
|
28
30
|
from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
29
31
|
from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
|
|
30
32
|
from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
|
|
31
33
|
from pdfalyzer.pdfalyzer import Pdfalyzer
|
|
32
34
|
from pdfalyzer.util.argument_parser import (MAX_QUALITY, ask_to_proceed, output_sections, parse_arguments,
|
|
33
|
-
parse_combine_pdfs_args)
|
|
35
|
+
parse_combine_pdfs_args, parse_pdf_page_extraction_args, parse_text_extraction_args)
|
|
34
36
|
from pdfalyzer.util.pdf_parser_manager import PdfParserManager
|
|
35
37
|
|
|
36
38
|
# For the table shown by running pdfalyzer_show_color_theme
|
|
@@ -132,7 +134,19 @@ def combine_pdfs():
|
|
|
132
134
|
print_highlighted(txt)
|
|
133
135
|
|
|
134
136
|
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
137
|
+
def extract_pdf_pages() -> None:
    """Command line entry point: write the requested page range of a PDF to a new PDF file."""
    cli_args = parse_pdf_page_extraction_args()
    source_pdf = PdfFile(cli_args.pdf_file)
    source_pdf.extract_page_range(cli_args.page_range, destination_dir=cli_args.destination_dir)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def extract_text_from_pdfs() -> None:
    """
    Command line entry point: print the text extracted from each requested PDF.

    Text can be extracted from a single file or from all files in a given directory, and
    multiple paths may be supplied as arguments on the command line.
    """
    cli_args: Namespace = parse_text_extraction_args()
    console.line()

    for pdf_path in cli_args.files_to_process:
        pdf_file = PdfFile(pdf_path)
        pdf_file.print_extracted_text(cli_args.page_range, cli_args.print_as_parsed)
        console.line(2)
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
3
|
-
various character encodings upon it to see what comes out.
|
|
2
|
+
`BinaryScanner` class.
|
|
4
3
|
"""
|
|
5
4
|
from collections import defaultdict
|
|
6
5
|
from typing import Iterator, Optional, Tuple
|
|
@@ -28,8 +27,18 @@ from pdfalyzer.util.adobe_strings import CONTENTS, CURRENTFILE_EEXEC, FONT_FILE_
|
|
|
28
27
|
|
|
29
28
|
|
|
30
29
|
class BinaryScanner:
|
|
30
|
+
"""
|
|
31
|
+
Class for handling binary data - scanning through it for various suspicious patterns as well as forcing
|
|
32
|
+
various character encodings upon it to see what comes out.
|
|
33
|
+
"""
|
|
34
|
+
|
|
31
35
|
def __init__(self, _bytes: bytes, owner: PdfTreeNode, label: Optional[Text] = None):
|
|
32
|
-
"""
|
|
36
|
+
"""
|
|
37
|
+
Args:
|
|
38
|
+
_bytes (bytes): The binary data to be scanned.
|
|
39
|
+
owner (PdfTreeNode): The `PdfTreeNode` that contains this binary data.
|
|
40
|
+
label (Optional[Text]): A rich `Text` label for the binary data (e.g. the PDF object's address).
|
|
41
|
+
"""
|
|
33
42
|
self.bytes = _bytes
|
|
34
43
|
self.label = label
|
|
35
44
|
self.owner = owner
|
|
@@ -42,7 +51,7 @@ class BinaryScanner:
|
|
|
42
51
|
self.regex_extraction_stats = defaultdict(lambda: RegexMatchMetrics())
|
|
43
52
|
|
|
44
53
|
def check_for_dangerous_instructions(self) -> None:
|
|
45
|
-
"""Scan for all the strings in DANGEROUS_INSTRUCTIONS list and decode bytes around them."""
|
|
54
|
+
"""Scan for all the strings in `DANGEROUS_INSTRUCTIONS` list and decode bytes around them."""
|
|
46
55
|
subheader = "Scanning Binary For Anything That Could Be Described As 'sus'..."
|
|
47
56
|
print_section_sub_subheader(subheader, style=f"bright_red")
|
|
48
57
|
|
|
@@ -71,8 +80,8 @@ class BinaryScanner:
|
|
|
71
80
|
|
|
72
81
|
def force_decode_quoted_bytes(self) -> None:
|
|
73
82
|
"""
|
|
74
|
-
Find all strings matching QUOTE_PATTERNS (AKA between quote chars) and decode them with various
|
|
75
|
-
The
|
|
83
|
+
Find all strings matching `QUOTE_PATTERNS` (AKA between quote chars) and decode them with various
|
|
84
|
+
encodings. The `--quote-type` arg will limit this decode to just one kind of quote.
|
|
76
85
|
"""
|
|
77
86
|
quote_selections = PdfalyzerConfig._args.extract_quoteds
|
|
78
87
|
|
|
@@ -100,11 +109,11 @@ class BinaryScanner:
|
|
|
100
109
|
# YARA rules are written on the fly and then YARA does the matching.
|
|
101
110
|
# -------------------------------------------------------------------------------
|
|
102
111
|
def extract_guillemet_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]:
|
|
103
|
-
"""Iterate on all strings surrounded by Guillemet quotes, e.g. «string
|
|
112
|
+
"""Iterate on all strings surrounded by Guillemet quotes, e.g. «string»."""
|
|
104
113
|
return self._quote_yaralyzer(QUOTE_PATTERNS[GUILLEMET], GUILLEMET).match_iterator()
|
|
105
114
|
|
|
106
115
|
def extract_backtick_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]:
|
|
107
|
-
"""Returns an interator over all strings surrounded by backticks"""
|
|
116
|
+
"""Returns an iterator over all strings surrounded by backticks."""
|
|
108
117
|
return self._quote_yaralyzer(QUOTE_PATTERNS[BACKTICK], BACKTICK).match_iterator()
|
|
109
118
|
|
|
110
119
|
def extract_front_slash_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]:
|
|
@@ -137,7 +146,14 @@ class BinaryScanner:
|
|
|
137
146
|
console.line()
|
|
138
147
|
|
|
139
148
|
def process_yara_matches(self, yaralyzer: Yaralyzer, pattern: str, force: bool = False) -> None:
|
|
140
|
-
"""
|
|
149
|
+
"""
|
|
150
|
+
Decide whether to attempt to decode the matched bytes and track stats.
|
|
151
|
+
|
|
152
|
+
Args:
|
|
153
|
+
yaralyzer (Yaralyzer): The `Yaralyzer` instance to use for finding matches.
|
|
154
|
+
pattern (str): The pattern being searched for (used for stats tracking).
|
|
155
|
+
force (bool): If `True`, decode all matches even if they are very short or very long.
|
|
156
|
+
"""
|
|
141
157
|
for bytes_match, decoder in yaralyzer.match_iterator():
|
|
142
158
|
log.debug(f"Trackings match stats for {pattern}, bytes_match: {bytes_match}, is_decodable: {bytes_match.is_decodable()}") # noqa: E501
|
|
143
159
|
|
|
@@ -162,7 +178,7 @@ class BinaryScanner:
|
|
|
162
178
|
return self.bytes.split(CURRENTFILE_EEXEC)[1] if CURRENTFILE_EEXEC in self.bytes else self.bytes
|
|
163
179
|
|
|
164
180
|
def _quote_yaralyzer(self, quote_pattern: str, quote_type: str):
|
|
165
|
-
"""Helper method to build a Yaralyzer for a quote_pattern"""
|
|
181
|
+
"""Helper method to build a Yaralyzer for a `quote_pattern`."""
|
|
166
182
|
label = f"{quote_type}_Quoted"
|
|
167
183
|
|
|
168
184
|
if quote_type == GUILLEMET:
|
|
@@ -177,7 +193,7 @@ class BinaryScanner:
|
|
|
177
193
|
rules_label: Optional[str] = None,
|
|
178
194
|
pattern_label: Optional[str] = None
|
|
179
195
|
) -> Yaralyzer:
|
|
180
|
-
"""Build a yaralyzer to scan self.bytes"""
|
|
196
|
+
"""Build a `yaralyzer` to scan `self.bytes`."""
|
|
181
197
|
return Yaralyzer.for_patterns(
|
|
182
198
|
patterns=[escape_yara_pattern(pattern)],
|
|
183
199
|
patterns_type=pattern_type,
|
|
@@ -198,5 +214,5 @@ class BinaryScanner:
|
|
|
198
214
|
self.suppression_notice_queue = []
|
|
199
215
|
|
|
200
216
|
def _eexec_idx(self) -> int:
|
|
201
|
-
"""Returns the location of CURRENTFILES_EEXEC within the binary stream data (or 0 if it's not there)."""
|
|
217
|
+
"""Returns the location of `CURRENTFILE_EEXEC` within the binary stream data (or 0 if it's not there)."""
|
|
202
218
|
return self.bytes.find(CURRENTFILE_EEXEC) if CURRENTFILE_EEXEC in self.bytes else 0
|
pdfalyzer/config.py
CHANGED
|
@@ -9,9 +9,10 @@ from os import environ, pardir, path
|
|
|
9
9
|
from yaralyzer.config import YaralyzerConfig, is_env_var_set_and_not_false, is_invoked_by_pytest
|
|
10
10
|
|
|
11
11
|
PDFALYZE = 'pdfalyze'
|
|
12
|
+
PDFALYZER = f"{PDFALYZE}r"
|
|
12
13
|
ALL_STREAMS = -1
|
|
13
14
|
PYTEST_FLAG = 'INVOKED_BY_PYTEST'
|
|
14
|
-
PROJECT_ROOT = path.join(str(importlib.resources.files(
|
|
15
|
+
PROJECT_ROOT = path.join(str(importlib.resources.files(PDFALYZER)), pardir)
|
|
15
16
|
|
|
16
17
|
# 3rd party pdf-parser.py
|
|
17
18
|
PDF_PARSER_EXECUTABLE_ENV_VAR = 'PDFALYZER_PDF_PARSER_PY_PATH'
|
|
@@ -1,11 +1,3 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Some nodes cannot be placed until we have walked the rest of the tree. For instance
|
|
3
|
-
if we encounter a /Page that relationships /Resources we need to know if there's a
|
|
4
|
-
/Pages parent of the /Page before committing to a tree structure.
|
|
5
|
-
|
|
6
|
-
This class handles choosing among the candidates for a given PDF object's parent node
|
|
7
|
-
(AKA "figuring out where to place the node in the PDF object tree").
|
|
8
|
-
"""
|
|
9
1
|
from typing import Callable, List, Optional
|
|
10
2
|
|
|
11
3
|
from rich.markup import escape
|
|
@@ -18,6 +10,14 @@ from pdfalyzer.util.adobe_strings import *
|
|
|
18
10
|
|
|
19
11
|
|
|
20
12
|
class IndeterminateNode:
|
|
13
|
+
"""
|
|
14
|
+
Class to handle choosing among the candidates for a given PDF object's parent node.
|
|
15
|
+
|
|
16
|
+
Some nodes cannot be placed until we have walked the rest of the tree. For instance
|
|
17
|
+
if we encounter a /Page that references /Resources we need to know if there's a
|
|
18
|
+
/Pages parent of the /Page before committing to a tree structure.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
21
|
def __init__(self, node: PdfTreeNode) -> None:
|
|
22
22
|
self.node = node
|
|
23
23
|
|
|
@@ -56,7 +56,7 @@ class IndeterminateNode:
|
|
|
56
56
|
|
|
57
57
|
self.node.set_parent(parent)
|
|
58
58
|
|
|
59
|
-
def find_node_with_most_descendants(self, list_of_nodes: List[PdfTreeNode] = None) -> PdfTreeNode:
|
|
59
|
+
def find_node_with_most_descendants(self, list_of_nodes: Optional[List[PdfTreeNode]] = None) -> PdfTreeNode:
|
|
60
60
|
"""Find node with a reference to this one that has the most descendants"""
|
|
61
61
|
list_of_nodes = list_of_nodes or [r.from_node for r in self.node.non_tree_relationships]
|
|
62
62
|
max_descendants = max([node.descendants_count() for node in list_of_nodes])
|
|
@@ -64,7 +64,7 @@ class IndeterminateNode:
|
|
|
64
64
|
|
|
65
65
|
def _has_only_similar_relationships(self) -> bool:
|
|
66
66
|
"""
|
|
67
|
-
Returns True if all the nodes w/references to this one have the same type or if all the
|
|
67
|
+
Returns `True` if all the nodes w/references to this one have the same type or if all the
|
|
68
68
|
reference_keys that point to this node are the same.
|
|
69
69
|
"""
|
|
70
70
|
unique_refferer_labels = self.node.unique_labels_of_referring_nodes()
|
|
@@ -125,6 +125,6 @@ class IndeterminateNode:
|
|
|
125
125
|
|
|
126
126
|
|
|
127
127
|
def find_node_with_lowest_id(list_of_nodes: List[PdfTreeNode]) -> PdfTreeNode:
|
|
128
|
-
"""
|
|
128
|
+
"""Return node in `list_of_nodes` with lowest ID."""
|
|
129
129
|
lowest_idnum = min([n.idnum for n in list_of_nodes])
|
|
130
130
|
return next(n for n in list_of_nodes if n.idnum == lowest_idnum)
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
import io
|
|
2
|
+
from logging import Logger
|
|
3
|
+
from os import path
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import List, Optional, Union
|
|
6
|
+
|
|
7
|
+
from pypdf import PdfReader, PdfWriter
|
|
8
|
+
from pypdf.errors import DependencyError, EmptyFileError, PdfStreamError
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.markup import escape
|
|
11
|
+
from rich.panel import Panel
|
|
12
|
+
from rich.text import Text
|
|
13
|
+
from yaralyzer.output.rich_console import console
|
|
14
|
+
from yaralyzer.util.logging import log as yaralyzer_log
|
|
15
|
+
|
|
16
|
+
from pdfalyzer.helpers.filesystem_helper import create_dir_if_it_does_not_exist, insert_suffix_before_extension
|
|
17
|
+
from pdfalyzer.helpers.image_helper import ocr_text
|
|
18
|
+
from pdfalyzer.helpers.rich_text_helper import (attention_getting_panel, error_text, mild_warning,
|
|
19
|
+
print_error, stderr_console)
|
|
20
|
+
from pdfalyzer.helpers.string_helper import exception_str
|
|
21
|
+
from pdfalyzer.util.page_range import PageRange
|
|
22
|
+
|
|
23
|
+
DEFAULT_PDF_ERRORS_DIR = Path.cwd().joinpath('pdf_errors')
|
|
24
|
+
MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR = 1024 * 1024 * 20
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class PdfFile:
    """
    Wrapper for a PDF file path that provides useful methods and properties.

    Attributes:
        file_path (Path): The path to the PDF file.
        dirname (Path): The directory containing the PDF file.
        basename (str): The base name of the PDF file (with extension).
        basename_without_ext (str): The base name of the PDF file (without extension).
        extname (str): The file extension of the PDF file.
        file_size (int): Size of the PDF file in bytes, read when the wrapper is created.
    """

    def __init__(self, file_path: Union[str, Path]) -> None:
        """
        Args:
            file_path (Union[str, Path]): Path to the PDF file.

        Raises:
            FileNotFoundError: If nothing exists at `file_path`.
        """
        self.file_path: Path = Path(file_path)

        if not self.file_path.exists():
            raise FileNotFoundError(f"File '{file_path}' does not exist.")

        self.dirname = self.file_path.parent
        self.basename: str = self.file_path.name
        self.basename_without_ext: str = self.file_path.stem
        self.extname: str = self.file_path.suffix
        self.file_size = self.file_path.stat().st_size
        # Pages that already triggered an error report; reset at the start of every extract_text() call.
        self._page_numbers_of_errors: List[int] = []

    def extract_page_range(
        self,
        page_range: PageRange,
        destination_dir: Optional[Path] = None,
        extra_file_suffix: Optional[str] = None
    ) -> Path:
        """
        Extract a range of pages to a new PDF file.

        Args:
            page_range (PageRange): Range of pages to extract.
            destination_dir (Optional[Path]): Directory to save the new PDF file. Defaults to the same
                directory as the source PDF. It is created if it does not exist yet.
            extra_file_suffix (Optional[str]): An optional suffix appended to the new PDF's filename
                after the page range suffix.

        Returns:
            Path: The path to the newly created PDF file containing the extracted pages.
        """
        # Coerce so that a plain string (e.g. straight from the command line) also works.
        destination_dir = Path(destination_dir) if destination_dir else self.dirname
        create_dir_if_it_does_not_exist(destination_dir)

        if extra_file_suffix is None:
            file_suffix = page_range.file_suffix()
        else:
            file_suffix = f"{page_range.file_suffix()}__{extra_file_suffix}"

        extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
        extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
        console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'...")
        pdf_writer = PdfWriter()

        with open(self.file_path, 'rb') as source_pdf:
            pdf_writer.append(fileobj=source_pdf, pages=page_range.to_tuple())

        with open(extracted_pages_pdf_path, 'wb') as extracted_pages_pdf:
            pdf_writer.write(extracted_pages_pdf)

        console.print(f"Extracted pages to new PDF: '{extracted_pages_pdf_path}'.")
        return extracted_pages_pdf_path

    def extract_text(
        self,
        page_range: Optional[PageRange] = None,
        logger: Optional[Logger] = None,
        print_as_parsed: bool = False
    ) -> Optional[str]:
        """
        Use PyPDF to extract text page by page and use Tesseract to OCR any embedded images.

        Args:
            page_range (Optional[PageRange]): If provided, only extract text from pages in this range.
                Page numbers are 1-indexed. If not provided, extract text from all pages.
            logger (Optional[Logger]): If provided, log progress to this logger. Otherwise use default logger.
            print_as_parsed (bool): If True, print each page's text to STDOUT as it is parsed.

        Returns:
            Optional[str]: The text of all parsed pages joined by blank lines. When the PDF could not
                be read (missing optional dependency, empty file, broken stream) the error is
                logged and whatever was collected so far is returned, possibly an empty string.
        """
        from PIL import Image  # Imported here to avoid hard dependency if not using this method
        log = logger or yaralyzer_log
        log.debug(f"Extracting text from '{self.file_path}'...")
        self._page_numbers_of_errors = []
        extracted_pages = []

        try:
            pdf_reader = PdfReader(self.file_path)
            page_count = len(pdf_reader.pages)
            log.debug(f"PDF Page count: {page_count}")

            for page_number, page in enumerate(pdf_reader.pages, start=1):
                if page_range and not page_range.in_range(page_number):
                    self._log_to_stderr(f"Skipping page {page_number}...")
                    continue

                self._log_to_stderr(f"Parsing page {page_number}...")
                # Each page is rendered into its own in-memory console so it can be captured as a string.
                page_buffer = Console(file=io.StringIO())
                page_buffer.print(Panel(f"PAGE {page_number}", padding=(0, 15), expand=False))
                page_buffer.print(escape(page.extract_text().strip()))
                # Pre-set so the warning below has a value even if the very first image access fails.
                image_number = 1

                # Extracting images is a bit fraught (lots of PIL and pypdf exceptions have come from here)
                try:
                    for image_number, image in enumerate(page.images, start=1):
                        image_name = f"Page {page_number}, Image {image_number}"
                        self._log_to_stderr(f"   Processing {image_name}...", "dim")
                        page_buffer.print(Panel(image_name, expand=False))
                        image_obj = Image.open(io.BytesIO(image.data))
                        image_text = ocr_text(image_obj, f"{self.file_path} ({image_name})")
                        page_buffer.print((image_text or '').strip())
                except (OSError, NotImplementedError, TypeError, ValueError) as e:
                    error_str = exception_str(e)
                    msg = f"{error_str} while parsing embedded image {image_number} on page {page_number}..."
                    mild_warning(msg)

                    # Dump an error PDF and encourage user to report to pypdf team.
                    if 'JBIG2Decode' not in str(e):
                        stderr_console.print_exception()

                        if page_number not in self._page_numbers_of_errors:
                            self._handle_extraction_error(page_number, error_str)
                            self._page_numbers_of_errors.append(page_number)

                page_text = page_buffer.file.getvalue()
                extracted_pages.append(page_text)
                log.debug(page_text)

                if print_as_parsed:
                    print(page_text)
        except DependencyError:
            log.error("Pdfalyzer is missing an optional dependency required to extract text. Try 'pip install pdfalyzer[extract]'")
        except EmptyFileError:
            log.warning("Skipping empty file!")
        except PdfStreamError as e:
            print_error(f"Error parsing PDF file '{self.file_path}': {e}")
            stderr_console.print_exception()

        return "\n\n".join(extracted_pages).strip()

    def print_extracted_text(self, page_range: Optional[PageRange] = None, print_as_parsed: bool = False) -> None:
        """
        Fancy wrapper for printing the extracted text to the screen.

        Args:
            page_range (Optional[PageRange]): Passed through to `extract_text()`.
            print_as_parsed (bool): If True each page is printed by `extract_text()` as it is parsed,
                so the combined text is not printed a second time here.
        """
        console.print(Panel(str(self.file_path), expand=False, style='bright_white reverse'))
        txt = self.extract_text(page_range=page_range, print_as_parsed=print_as_parsed)

        if not print_as_parsed:
            console.print(txt)

    def _handle_extraction_error(self, page_number: int, error_msg: str) -> None:
        """
        Rip the offending page to a new file and suggest that user report bug to PyPDF.

        Args:
            page_number (int): 1-indexed number of the page that could not be processed.
            error_msg (str): Description of the error; also used as part of the extracted file's name.
        """
        destination_dir = DEFAULT_PDF_ERRORS_DIR
        extracted_file: Optional[Path] = None

        # Best effort only: failing to write the single-page PDF must not abort text extraction.
        try:
            extracted_file = self.extract_page_range(PageRange(str(page_number)), destination_dir, error_msg)
        except Exception as e:
            stderr_console.print(error_text(f"Failed to extract a page for submission to PyPDF team. ({e})"))

        blink_txt = Text('', style='bright_white')
        blink_txt.append("An error (", style='blink color(154)').append(error_msg, style='color(11) blink')
        blink_txt.append(') ', style='blink color(154)')
        blink_txt.append("was encountered while processing a PDF file.\n\n", style='blink color(154)')

        txt = Text("The error was of a type such that it probably came from a bug in ", style='bright_white')
        txt.append('PyPDF', style='underline bright_green').append('. It was encountered processing the file ')
        txt.append(str(self.file_path), style='file').append('. You should see a stack trace above this box.\n\n')

        if extracted_file is None:
            # Don't point the user at a file named "None" when the page could not be written.
            txt.append('The offending page could not be extracted to a separate file.\n\n', style='bright_white')
        else:
            txt.append('The offending page will be extracted to ', style='bright_white')
            txt.append(str(extracted_file), style='file').append('.\n\n')

        txt.append("Please visit 'https://github.com/py-pdf/pypdf/issues' to report a bug. ", style='bold')
        txt.append("Providing the devs with the extracted page and the stack trace help improve pypdf.")
        stderr_console.print(attention_getting_panel(blink_txt + txt, title='PyPDF Error'))

    def _log_to_stderr(self, msg: str, style: Optional[str] = None) -> None:
        """
        When parsing very large PDFs it can be useful to log progress and other messages to STDERR.

        Nothing is printed for files smaller than `MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR` bytes.
        """
        if self.file_size < MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR:
            return

        stderr_console.print(msg, style=style or "")
|
|
@@ -15,7 +15,7 @@ from pdfalyzer.util.adobe_strings import *
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class PdfObjectProperties:
|
|
18
|
-
"""Simple class to extract critical features of a PdfObject
|
|
18
|
+
"""Simple class to extract critical features of a `PdfObject`."""
|
|
19
19
|
|
|
20
20
|
def __init__(
|
|
21
21
|
self,
|
|
@@ -86,7 +86,7 @@ class PdfObjectProperties:
|
|
|
86
86
|
obj: PdfObject,
|
|
87
87
|
is_single_row_table: bool = False
|
|
88
88
|
) -> List[Union[Text, str]]:
|
|
89
|
-
"""PDF object property at reference_key becomes a formatted 3-tuple for use in Rich tables."""
|
|
89
|
+
"""PDF object property at `reference_key` becomes a formatted 3-tuple for use in Rich tables."""
|
|
90
90
|
with_resolved_refs = cls.resolve_references(reference_key, obj)
|
|
91
91
|
|
|
92
92
|
return [
|
|
@@ -101,7 +101,7 @@ class PdfObjectProperties:
|
|
|
101
101
|
# TODO: this doesn't recurse...
|
|
102
102
|
@classmethod
|
|
103
103
|
def _obj_to_rich_text(cls, obj: Any) -> Text:
|
|
104
|
-
"""Recurse through obj and build a Text object."""
|
|
104
|
+
"""Recurse through `obj` and build a `Text` object."""
|
|
105
105
|
if isinstance(obj, dict):
|
|
106
106
|
key_value_pairs = [Text(f"{k}: ").append_text(cls._obj_to_rich_text(v)) for k, v in obj.items()]
|
|
107
107
|
return Text('{').append_text(comma_join_txt(key_value_pairs)).append('}')
|
|
@@ -1,10 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
3
|
-
Also adds decorators/generators for Rich text representation.
|
|
4
|
-
|
|
5
|
-
Child/parent relationships should be set using the add_child() and set_parent()
|
|
6
|
-
methods and not set directly. (TODO: this could be done better with anytree
|
|
7
|
-
hooks)
|
|
2
|
+
`PdfTreeNode` decorates a `PdfObject` with tree structure information.
|
|
8
3
|
"""
|
|
9
4
|
from typing import Callable, List, Optional
|
|
10
5
|
|
|
@@ -27,11 +22,22 @@ DECODE_FAILURE_LEN = -1
|
|
|
27
22
|
|
|
28
23
|
|
|
29
24
|
class PdfTreeNode(NodeMixin, PdfObjectProperties):
|
|
25
|
+
"""
|
|
26
|
+
PDF node decorator - wraps actual PDF objects to make them `anytree` nodes.
|
|
27
|
+
Also adds decorators/generators for Rich text representation.
|
|
28
|
+
|
|
29
|
+
Child/parent relationships should be set using the `add_child()` and `set_parent()`
|
|
30
|
+
methods and not set directly.
|
|
31
|
+
|
|
32
|
+
TODO: this could be done better with anytree hooks.
|
|
33
|
+
"""
|
|
34
|
+
|
|
30
35
|
def __init__(self, obj: PdfObject, address: str, idnum: int):
|
|
31
36
|
"""
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
37
|
+
Args:
|
|
38
|
+
obj (PdfObject): The underlying PDF object
|
|
39
|
+
address (str): The first address that points from some node to this one
|
|
40
|
+
idnum (int): ID used in the reference
|
|
35
41
|
"""
|
|
36
42
|
PdfObjectProperties.__init__(self, obj, address, idnum)
|
|
37
43
|
self.non_tree_relationships: List[PdfObjectRelationship] = []
|
|
@@ -54,7 +60,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
|
|
|
54
60
|
|
|
55
61
|
@classmethod
|
|
56
62
|
def from_reference(cls, ref: IndirectObject, address: str) -> 'PdfTreeNode':
|
|
57
|
-
"""
|
|
63
|
+
"""Alternate constructor to build a `PdfTreeNode` from an `IndirectObject`."""
|
|
58
64
|
try:
|
|
59
65
|
return cls(ref.get_object(), address, ref.idnum)
|
|
60
66
|
except PdfReadError as e:
|
|
@@ -82,7 +88,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
|
|
|
82
88
|
child.set_parent(self)
|
|
83
89
|
|
|
84
90
|
def add_non_tree_relationship(self, relationship: PdfObjectRelationship) -> None:
|
|
85
|
-
"""Add a relationship that points at this node's PDF object. TODO: doesn't include parent/child"""
|
|
91
|
+
"""Add a relationship that points at this node's PDF object. TODO: doesn't include parent/child."""
|
|
86
92
|
if relationship in self.non_tree_relationships:
|
|
87
93
|
return
|
|
88
94
|
|
|
@@ -11,6 +11,8 @@ from pdfalyzer.util.adobe_strings import *
|
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class PdfTreeVerifier:
|
|
14
|
+
"""Class to verify that the PDF tree is complete/contains all the nodes in the PDF file."""
|
|
15
|
+
|
|
14
16
|
def __init__(self, pdfalyzer: 'Pdfalyzer') -> None:
|
|
15
17
|
self.pdfalyzer = pdfalyzer
|
|
16
18
|
|
|
@@ -1,14 +1,17 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
Functions to help with the pre-configured YARA rules in the /yara directory.
|
|
3
3
|
"""
|
|
4
4
|
from importlib.resources import as_file, files
|
|
5
5
|
from sys import exit
|
|
6
6
|
from typing import Optional, Union
|
|
7
7
|
|
|
8
8
|
from yaralyzer.config import YaralyzerConfig
|
|
9
|
+
from yaralyzer.output.rich_console import print_fatal_error_and_exit
|
|
9
10
|
from yaralyzer.yaralyzer import Yaralyzer
|
|
10
11
|
|
|
11
|
-
|
|
12
|
+
from pdfalyzer.config import PDFALYZER
|
|
13
|
+
|
|
14
|
+
YARA_RULES_DIR = files(PDFALYZER).joinpath('yara_rules')
|
|
12
15
|
|
|
13
16
|
YARA_RULES_FILES = [
|
|
14
17
|
'didier_stevens.yara',
|
|
@@ -20,11 +23,12 @@ YARA_RULES_FILES = [
|
|
|
20
23
|
|
|
21
24
|
|
|
22
25
|
def get_file_yaralyzer(file_path_to_scan: str) -> Yaralyzer:
|
|
23
|
-
"""Get a yaralyzer for a file path"""
|
|
26
|
+
"""Get a yaralyzer for a file path."""
|
|
24
27
|
return _build_yaralyzer(file_path_to_scan)
|
|
25
28
|
|
|
26
29
|
|
|
27
30
|
def get_bytes_yaralyzer(scannable: bytes, label: str) -> Yaralyzer:
|
|
31
|
+
"""Get a yaralyzer for `scannable` bytes."""
|
|
28
32
|
return _build_yaralyzer(scannable, label)
|
|
29
33
|
|
|
30
34
|
|
|
@@ -44,10 +48,5 @@ def _build_yaralyzer(scannable: Union[bytes, str], label: Optional[str] = None)
|
|
|
44
48
|
|
|
45
49
|
try:
|
|
46
50
|
return Yaralyzer.for_rules_files(rules_paths, scannable, label)
|
|
47
|
-
except
|
|
48
|
-
|
|
49
|
-
if "it doesn't exist" in str(e):
|
|
50
|
-
print(str(e))
|
|
51
|
-
exit(1)
|
|
52
|
-
else:
|
|
53
|
-
raise e
|
|
51
|
+
except FileNotFoundError as e:
|
|
52
|
+
print_fatal_error_and_exit(str(e))
|
|
@@ -6,6 +6,7 @@ from pathlib import Path
|
|
|
6
6
|
from typing import Optional, Union
|
|
7
7
|
|
|
8
8
|
from yaralyzer.output.rich_console import console
|
|
9
|
+
from yaralyzer.util.logging import log
|
|
9
10
|
|
|
10
11
|
from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
11
12
|
|
|
@@ -18,9 +19,20 @@ PDF_EXT = '.pdf'
|
|
|
18
19
|
# type StrOrPath = Union[str, Path]
|
|
19
20
|
|
|
20
21
|
|
|
21
|
-
def
|
|
22
|
-
"""
|
|
23
|
-
|
|
22
|
+
def create_dir_if_it_does_not_exist(dir: Path) -> None:
    """
    Create `dir` (including any missing parent directories) unless it already exists.

    Args:
        dir (Path): The directory that should exist once this function returns.
    """
    if dir.exists():
        return

    # rich's `Console` has no `warning()` method, so the notice has to go through the logger.
    log.warning(f"Need to create '{dir}'")
    dir.mkdir(parents=True, exist_ok=True)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def insert_suffix_before_extension(file_path: Path, suffix: str, separator: str = '__') -> Path:
    """
    Build a new path with `suffix` placed between the file's name and its extension.

    The suffix is cleaned with `strip_bad_chars()` and its spaces are turned into underscores,
    e.g. suffix 'page 1' turns 'path/to/file.jpg' into 'path/to/file__page_1.jpg'.
    """
    cleaned_suffix = strip_bad_chars(suffix).replace(' ', '_')
    extension = file_path.suffix
    path_minus_extension = str(file_path.with_suffix(''))
    return Path(path_minus_extension + separator + cleaned_suffix + extension)
|
|
24
36
|
|
|
25
37
|
|
|
26
38
|
def is_pdf(file_path: Union[str, Path]) -> bool:
|
|
@@ -100,3 +112,15 @@ def set_max_open_files(num_filehandles: int = DEFAULT_MAX_OPEN_FILES) -> tuple[O
|
|
|
100
112
|
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
|
|
101
113
|
|
|
102
114
|
return (soft, hard)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def strip_bad_chars(text: str) -> str:
    """
    Remove chars that don't work well in filenames.

    Line breaks and runs of whitespace collapse to a single space, the typographic apostrophe
    becomes a plain `'`, `|` becomes `I`, and every run of characters outside the allowed set
    is replaced by a single underscore.

    Args:
        text (str): The string to clean up.

    Returns:
        str: The cleaned string.
    """
    text = ' '.join(text.splitlines())
    # `str.replace()` matches literally, so collapsing whitespace runs needs an actual regex.
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('’', "'").replace('|', 'I')
    return re.sub('[^-0-9a-zA-Z@.,?_:=#\'\\$" ()]+', '_', text)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def with_pdf_extension(file_path: Union[str, Path]) -> str:
    """Return `file_path` as a string, adding `PDF_EXT` unless `is_pdf()` already accepts the path."""
    path_str = str(file_path)

    if is_pdf(file_path):
        return path_str

    return path_str + PDF_EXT
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from PIL import Image
|
|
4
|
+
from yaralyzer.output.rich_console import console
|
|
5
|
+
|
|
6
|
+
from pdfalyzer.helpers.rich_text_helper import warning_text
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def ocr_text(image: Image.Image, image_name: str) -> Optional[str]:
    """
    Run pytesseract OCR over `image` and return the recognized text with surrounding whitespace removed.

    Args:
        image (Image.Image): The image to OCR.
        image_name (str): Label for the image, used only in warning and error output.

    Returns:
        Optional[str]: The stripped OCR text, or None when Tesseract failed or the image
            file was truncated.

    Raises:
        OSError: Re-raised (after printing the traceback) unless its message mentions 'truncated'.
        Exception: Any other unexpected error is printed and re-raised.
    """
    # Imported inside the function rather than at module level - presumably so that pytesseract
    # is only required when OCR is actually requested.
    import pytesseract

    try:
        recognized = pytesseract.image_to_string(image)
    except pytesseract.pytesseract.TesseractError:
        console.print_exception()
        console.print(warning_text(f"Tesseract OCR failure '{image_name}'! No OCR text extracted..."))
        return None
    except OSError as os_error:
        if 'truncated' not in str(os_error):
            console.print_exception()
            console.print(f"Error while extracting '{image_name}'!", style='bright_red')
            raise os_error

        # A truncated image is only worth a warning; there is simply no text to return.
        console.print(warning_text(f"Truncated image file '{image_name}'!"))
        return None
    except Exception as unexpected_error:
        console.print_exception()
        console.print(f"Error while extracting '{image_name}'!", style='bright_red')
        raise unexpected_error

    if recognized is None:
        return None

    return recognized.strip()
|