pdfalyzer 1.17.0__py3-none-any.whl → 1.17.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- CHANGELOG.md +23 -0
- pdfalyzer/__init__.py +10 -9
- pdfalyzer/decorators/pdf_file.py +28 -20
- pdfalyzer/detection/yaralyzer_helper.py +0 -1
- pdfalyzer/helpers/filesystem_helper.py +0 -1
- pdfalyzer/helpers/image_helper.py +3 -3
- pdfalyzer/helpers/string_helper.py +28 -30
- pdfalyzer/pdfalyzer.py +37 -11
- pdfalyzer/util/argument_parser.py +2 -144
- pdfalyzer/util/cli_tools_argument_parser.py +164 -0
- pdfalyzer/util/page_range.py +4 -7
- {pdfalyzer-1.17.0.dist-info → pdfalyzer-1.17.7.dist-info}/METADATA +23 -33
- {pdfalyzer-1.17.0.dist-info → pdfalyzer-1.17.7.dist-info}/RECORD +16 -15
- {pdfalyzer-1.17.0.dist-info → pdfalyzer-1.17.7.dist-info}/entry_points.txt +1 -1
- {pdfalyzer-1.17.0.dist-info → pdfalyzer-1.17.7.dist-info}/LICENSE +0 -0
- {pdfalyzer-1.17.0.dist-info → pdfalyzer-1.17.7.dist-info}/WHEEL +0 -0
CHANGELOG.md
CHANGED
|
@@ -1,5 +1,28 @@
|
|
|
1
1
|
# NEXT RELEASE
|
|
2
2
|
|
|
3
|
+
### 1.17.7
|
|
4
|
+
* Bump `pypdf` to 6.1.3 (fixes [#31](https://github.com/michelcrypt4d4mus/pdfalyzer/issues/31)), `PyMuPDF` to 1.26.5
|
|
5
|
+
|
|
6
|
+
### 1.17.6
|
|
7
|
+
* Better handling for errors resulting from bugs in PyPDF
|
|
8
|
+
* Properly close file handle when pdfalyzing is complete
|
|
9
|
+
|
|
10
|
+
### 1.17.5
|
|
11
|
+
* Fix `PIL` lazy import
|
|
12
|
+
|
|
13
|
+
### 1.17.4
|
|
14
|
+
* Make `PIL` a lazy import so installs without `[extract]` extras don't fail
|
|
15
|
+
|
|
16
|
+
### 1.17.3
|
|
17
|
+
* Put back `--debug` arg for CLI tools
|
|
18
|
+
|
|
19
|
+
### 1.17.2
|
|
20
|
+
* Remove unused `--debug` args for CLI tools
|
|
21
|
+
* Rename `extract_text_from_pdfs` to `extract_pdf_text`
|
|
22
|
+
|
|
23
|
+
### 1.17.1
|
|
24
|
+
* Fix issue where `extract_pdf_pages` page ranges were indexed from 0 instead of 1
|
|
25
|
+
|
|
3
26
|
# 1.17.0
|
|
4
27
|
* Add `extract_pdf_pages` command line tool (imported from `clown_sort`)
|
|
5
28
|
* Add `extract_text_from_pdfs` command line tool (imported from `clown_sort`)
|
pdfalyzer/__init__.py
CHANGED
|
@@ -31,8 +31,9 @@ from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
|
31
31
|
from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
|
|
32
32
|
from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
|
|
33
33
|
from pdfalyzer.pdfalyzer import Pdfalyzer
|
|
34
|
-
from pdfalyzer.util.argument_parser import
|
|
35
|
-
|
|
34
|
+
from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments
|
|
35
|
+
from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, parse_combine_pdfs_args,
|
|
36
|
+
parse_pdf_page_extraction_args, parse_text_extraction_args)
|
|
36
37
|
from pdfalyzer.util.pdf_parser_manager import PdfParserManager
|
|
37
38
|
|
|
38
39
|
# For the table shown by running pdfalyzer_show_color_theme
|
|
@@ -42,7 +43,7 @@ MAX_THEME_COL_SIZE = 35
|
|
|
42
43
|
def pdfalyze():
|
|
43
44
|
args = parse_arguments()
|
|
44
45
|
pdfalyzer = Pdfalyzer(args.file_to_scan_path)
|
|
45
|
-
|
|
46
|
+
presenter = PdfalyzerPresenter(pdfalyzer)
|
|
46
47
|
output_basepath = None
|
|
47
48
|
|
|
48
49
|
# Binary stream extraction is a special case
|
|
@@ -54,7 +55,7 @@ def pdfalyze():
|
|
|
54
55
|
|
|
55
56
|
# The method that gets called is related to the argument name. See 'possible_output_sections' list in
|
|
56
57
|
# argument_parser.py. Analysis exports wrap themselves around the methods that actually generate the analyses.
|
|
57
|
-
for (arg, method) in output_sections(args,
|
|
58
|
+
for (arg, method) in output_sections(args, presenter):
|
|
58
59
|
if args.output_dir:
|
|
59
60
|
output_basepath = PdfalyzerConfig.get_output_basepath(method)
|
|
60
61
|
print(f'Exporting {arg} data to {output_basepath}...')
|
|
@@ -79,6 +80,8 @@ def pdfalyze():
|
|
|
79
80
|
if args.interact:
|
|
80
81
|
code.interact(local=locals())
|
|
81
82
|
|
|
83
|
+
pdfalyzer.pdf_filehandle.close()
|
|
84
|
+
|
|
82
85
|
|
|
83
86
|
def pdfalyzer_show_color_theme() -> None:
|
|
84
87
|
"""Utility method to show pdfalyzer's color theme. Invocable with 'pdfalyzer_show_color_theme'."""
|
|
@@ -135,15 +138,13 @@ def combine_pdfs():
|
|
|
135
138
|
|
|
136
139
|
|
|
137
140
|
def extract_pdf_pages() -> None:
|
|
141
|
+
"""Extract a range of pages from a PDF to a new PDF."""
|
|
138
142
|
args = parse_pdf_page_extraction_args()
|
|
139
143
|
PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)
|
|
140
144
|
|
|
141
145
|
|
|
142
|
-
def
|
|
143
|
-
"""
|
|
144
|
-
Extract text from a single file or from all files in a given directory. Can accept
|
|
145
|
-
multiple paths as arguments on the command line.
|
|
146
|
-
"""
|
|
146
|
+
def extract_pdf_text() -> None:
|
|
147
|
+
"""Extract text from a list of file or from all PDF files in a list of directories."""
|
|
147
148
|
args: Namespace = parse_text_extraction_args()
|
|
148
149
|
console.line()
|
|
149
150
|
|
pdfalyzer/decorators/pdf_file.py
CHANGED
|
@@ -20,6 +20,8 @@ from pdfalyzer.helpers.rich_text_helper import (attention_getting_panel, error_t
|
|
|
20
20
|
from pdfalyzer.helpers.string_helper import exception_str
|
|
21
21
|
from pdfalyzer.util.page_range import PageRange
|
|
22
22
|
|
|
23
|
+
DEPENDENCY_ERROR_MSG = "Pdfalyzer is missing an optional dependency required to extract text. " + \
|
|
24
|
+
"Try 'pip install pdfalyzer[extract]'"
|
|
23
25
|
DEFAULT_PDF_ERRORS_DIR = Path.cwd().joinpath('pdf_errors')
|
|
24
26
|
MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR = 1024 * 1024 * 20
|
|
25
27
|
|
|
@@ -30,10 +32,11 @@ class PdfFile:
|
|
|
30
32
|
|
|
31
33
|
Attributes:
|
|
32
34
|
file_path (Path): The path to the PDF file.
|
|
33
|
-
dirname (Path): The directory containing the PDF file.
|
|
34
35
|
basename (str): The base name of the PDF file (with extension).
|
|
35
36
|
basename_without_ext (str): The base name of the PDF file (without extension).
|
|
37
|
+
dirname (Path): The directory containing the PDF file.
|
|
36
38
|
extname (str): The file extension of the PDF file.
|
|
39
|
+
file_size (int): The size of the file in bytes.
|
|
37
40
|
"""
|
|
38
41
|
|
|
39
42
|
def __init__(self, file_path: Union[str, Path]) -> None:
|
|
@@ -44,7 +47,7 @@ class PdfFile:
|
|
|
44
47
|
self.file_path: Path = Path(file_path)
|
|
45
48
|
|
|
46
49
|
if not self.file_path.exists():
|
|
47
|
-
raise FileNotFoundError(f"
|
|
50
|
+
raise FileNotFoundError(f"'{file_path}' is not a valid file or directory.")
|
|
48
51
|
|
|
49
52
|
self.dirname = self.file_path.parent
|
|
50
53
|
self.basename: str = path.basename(file_path)
|
|
@@ -53,11 +56,11 @@ class PdfFile:
|
|
|
53
56
|
self.file_size = self.file_path.stat().st_size
|
|
54
57
|
|
|
55
58
|
def extract_page_range(
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
59
|
+
self,
|
|
60
|
+
page_range: PageRange,
|
|
61
|
+
destination_dir: Optional[Path] = None,
|
|
62
|
+
extra_file_suffix: Optional[str] = None
|
|
63
|
+
) -> Path:
|
|
61
64
|
"""
|
|
62
65
|
Extract a range of pages to a new PDF file.
|
|
63
66
|
|
|
@@ -71,17 +74,21 @@ class PdfFile:
|
|
|
71
74
|
Returns:
|
|
72
75
|
Path: The path to the newly created PDF file containing the extracted pages.
|
|
73
76
|
"""
|
|
74
|
-
destination_dir = destination_dir or self.dirname
|
|
77
|
+
destination_dir = Path(destination_dir or self.dirname)
|
|
75
78
|
create_dir_if_it_does_not_exist(destination_dir)
|
|
79
|
+
pdf_reader = PdfReader(self.file_path)
|
|
80
|
+
page_count = len(pdf_reader.pages)
|
|
81
|
+
file_suffix = page_range.file_suffix()
|
|
76
82
|
|
|
77
|
-
if
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
83
|
+
if page_count < (page_range.last_page - 1):
|
|
84
|
+
raise ValueError(f"PDF only has {page_count} pages but you asked for pages {page_range}!")
|
|
85
|
+
|
|
86
|
+
if extra_file_suffix is not None:
|
|
87
|
+
file_suffix += f"__{extra_file_suffix}"
|
|
81
88
|
|
|
82
89
|
extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
|
|
83
90
|
extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
|
|
84
|
-
console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'
|
|
91
|
+
console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'")
|
|
85
92
|
pdf_writer = PdfWriter()
|
|
86
93
|
|
|
87
94
|
with open(self.file_path, 'rb') as source_pdf:
|
|
@@ -94,11 +101,11 @@ class PdfFile:
|
|
|
94
101
|
return extracted_pages_pdf_path
|
|
95
102
|
|
|
96
103
|
def extract_text(
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
104
|
+
self,
|
|
105
|
+
page_range: Optional[PageRange] = None,
|
|
106
|
+
logger: Optional[Logger] = None,
|
|
107
|
+
print_as_parsed: bool = False
|
|
108
|
+
) -> Optional[str]:
|
|
102
109
|
"""
|
|
103
110
|
Use PyPDF to extract text page by page and use Tesseract to OCR any embedded images.
|
|
104
111
|
|
|
@@ -162,7 +169,7 @@ class PdfFile:
|
|
|
162
169
|
if print_as_parsed:
|
|
163
170
|
print(f"{page_text}")
|
|
164
171
|
except DependencyError:
|
|
165
|
-
log.error(
|
|
172
|
+
log.error(DEPENDENCY_ERROR_MSG)
|
|
166
173
|
except EmptyFileError:
|
|
167
174
|
log.warning("Skipping empty file!")
|
|
168
175
|
except PdfStreamError as e:
|
|
@@ -185,7 +192,8 @@ class PdfFile:
|
|
|
185
192
|
|
|
186
193
|
try:
|
|
187
194
|
extracted_file = self.extract_page_range(PageRange(str(page_number)), destination_dir, error_msg)
|
|
188
|
-
except Exception
|
|
195
|
+
except Exception:
|
|
196
|
+
stderr_console.print_exception()
|
|
189
197
|
stderr_console.print(error_text(f"Failed to extract a page for submission to PyPDF team."))
|
|
190
198
|
extracted_file = None
|
|
191
199
|
|
|
@@ -1,19 +1,19 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
|
|
3
|
-
from PIL import Image
|
|
4
3
|
from yaralyzer.output.rich_console import console
|
|
5
4
|
|
|
6
5
|
from pdfalyzer.helpers.rich_text_helper import warning_text
|
|
7
6
|
|
|
8
7
|
|
|
9
|
-
def ocr_text(image: Image.Image, image_name: str) -> Optional[str]:
|
|
8
|
+
def ocr_text(image: "Image.Image", image_name: str) -> Optional[str]: # noqa F821
|
|
10
9
|
"""Use pytesseract to OCR the text in the image and return it as a string."""
|
|
11
10
|
import pytesseract
|
|
11
|
+
from PIL import Image
|
|
12
12
|
text = None
|
|
13
13
|
|
|
14
14
|
try:
|
|
15
15
|
text = pytesseract.image_to_string(image)
|
|
16
|
-
except pytesseract.pytesseract.TesseractError
|
|
16
|
+
except pytesseract.pytesseract.TesseractError:
|
|
17
17
|
console.print_exception()
|
|
18
18
|
console.print(warning_text(f"Tesseract OCR failure '{image_name}'! No OCR text extracted..."))
|
|
19
19
|
except OSError as e:
|
|
@@ -3,7 +3,7 @@ Various text formatting/styling/manipulating methods.
|
|
|
3
3
|
"""
|
|
4
4
|
import re
|
|
5
5
|
from pprint import PrettyPrinter
|
|
6
|
-
from typing import List, Pattern, Union
|
|
6
|
+
from typing import List, Optional, Pattern, Union
|
|
7
7
|
|
|
8
8
|
from yaralyzer.output.rich_console import console_width
|
|
9
9
|
|
|
@@ -18,16 +18,14 @@ pp = PrettyPrinter(
|
|
|
18
18
|
sort_dicts=True)
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
def
|
|
22
|
-
"""
|
|
23
|
-
|
|
21
|
+
def all_strings_are_same_ignoring_numbers(strings: List[str]) -> bool:
|
|
22
|
+
"""Returns true if string addresses are same except for digits."""
|
|
23
|
+
return len(set([replace_digits(s) for s in strings])) == 1
|
|
24
24
|
|
|
25
|
-
if title is None:
|
|
26
|
-
return '-' * width
|
|
27
25
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
return
|
|
26
|
+
def bracketed(index: Union[int, str]) -> str:
|
|
27
|
+
"""Surround index with [ and ]."""
|
|
28
|
+
return f"[{index}]"
|
|
31
29
|
|
|
32
30
|
|
|
33
31
|
def count_pattern_matches_in_text(pattern: str, text: str) -> int:
|
|
@@ -44,9 +42,20 @@ def exception_str(e: Exception) -> str:
|
|
|
44
42
|
return f"{type(e).__name__}: {e}"
|
|
45
43
|
|
|
46
44
|
|
|
47
|
-
def
|
|
48
|
-
"""
|
|
49
|
-
|
|
45
|
+
def generate_hyphen_line(width: Optional[int] = None, title: Optional[str] = None):
|
|
46
|
+
"""e.g. '-----------------BEGIN-----------------'"""
|
|
47
|
+
width = width or console_width()
|
|
48
|
+
|
|
49
|
+
if title is None:
|
|
50
|
+
return '-' * width
|
|
51
|
+
|
|
52
|
+
side_hyphens = int((width - len(title)) / 2) * '-'
|
|
53
|
+
line = side_hyphens + title + side_hyphens
|
|
54
|
+
return line if len(line) == width else line + '-'
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def has_a_common_substring(strings: List[str]) -> bool:
|
|
58
|
+
return all([is_substring_of_longer_strings_in_list(s, strings) for s in strings])
|
|
50
59
|
|
|
51
60
|
|
|
52
61
|
def is_prefixed_by_any(_string: str, prefixes: List[str]) -> bool:
|
|
@@ -54,9 +63,10 @@ def is_prefixed_by_any(_string: str, prefixes: List[str]) -> bool:
|
|
|
54
63
|
return any([_string.startswith(prefix) for prefix in prefixes])
|
|
55
64
|
|
|
56
65
|
|
|
57
|
-
def
|
|
58
|
-
"""
|
|
59
|
-
|
|
66
|
+
def is_substring_of_longer_strings_in_list(_string: str, strings: List[str]) -> bool:
|
|
67
|
+
"""Return True if '_string' is a substring of all the 'strings' longer than '_string'."""
|
|
68
|
+
longer_strings = [s for s in strings if len(s) > len(_string)]
|
|
69
|
+
return all([_string in longer_string for longer_string in longer_strings])
|
|
60
70
|
|
|
61
71
|
|
|
62
72
|
def replace_digits(string_with_digits: str) -> str:
|
|
@@ -64,18 +74,6 @@ def replace_digits(string_with_digits: str) -> str:
|
|
|
64
74
|
return DIGIT_REGEX.sub('x', string_with_digits)
|
|
65
75
|
|
|
66
76
|
|
|
67
|
-
def
|
|
68
|
-
"""
|
|
69
|
-
return
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def is_substring_of_longer_strings_in_list(_string: str, strings: List[str]) -> bool:
|
|
73
|
-
longer_strings = [s for s in strings if len(s) > len(_string)]
|
|
74
|
-
return all([_string in longer_string for longer_string in longer_strings])
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
def has_a_common_substring(strings: List[str]) -> bool:
|
|
78
|
-
return all([
|
|
79
|
-
is_substring_of_longer_strings_in_list(s, strings)
|
|
80
|
-
for s in strings
|
|
81
|
-
])
|
|
77
|
+
def root_address(_string: str) -> str:
|
|
78
|
+
"""Strip the bracketed part off an address, e.g. '/Root[1]' => '/Root'."""
|
|
79
|
+
return _string.split('[')[0]
|
pdfalyzer/pdfalyzer.py
CHANGED
|
@@ -7,10 +7,11 @@ from typing import Dict, Iterator, List, Optional
|
|
|
7
7
|
from anytree import LevelOrderIter, SymlinkNode
|
|
8
8
|
from anytree.search import findall, findall_by_attr
|
|
9
9
|
from pypdf import PdfReader
|
|
10
|
+
from pypdf.errors import PdfReadError
|
|
10
11
|
from pypdf.generic import IndirectObject
|
|
11
12
|
from yaralyzer.helpers.file_helper import load_binary_data
|
|
12
13
|
from yaralyzer.output.file_hashes_table import compute_file_hashes
|
|
13
|
-
from yaralyzer.output.rich_console import console
|
|
14
|
+
from yaralyzer.output.rich_console import console, print_fatal_error_and_exit
|
|
14
15
|
from yaralyzer.util.logging import log
|
|
15
16
|
|
|
16
17
|
from pdfalyzer.decorators.document_model_printer import print_with_header
|
|
@@ -22,7 +23,8 @@ from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
|
|
|
22
23
|
from pdfalyzer.util.adobe_strings import *
|
|
23
24
|
from pdfalyzer.util.exceptions import PdfWalkError
|
|
24
25
|
|
|
25
|
-
TRAILER_FALLBACK_ID =
|
|
26
|
+
TRAILER_FALLBACK_ID = 10_000_000
|
|
27
|
+
PYPDF_ERROR_MSG = "Failed to open file with PyPDF. Consider filing a PyPDF bug report: https://github.com/py-pdf/pypdf/issues"
|
|
26
28
|
|
|
27
29
|
|
|
28
30
|
class Pdfalyzer:
|
|
@@ -32,6 +34,19 @@ class Pdfalyzer:
|
|
|
32
34
|
Each of the PDF's internal objects is wrapped in a `PdfTreeNode` object. The tree is managed
|
|
33
35
|
by the `anytree` library. Information about the tree as a whole is stored in this class.
|
|
34
36
|
Once the PDF is parsed this class provides access to info about or from the underlying PDF tree.
|
|
37
|
+
|
|
38
|
+
Attributes:
|
|
39
|
+
font_infos (List[FontInfo]): Font summary objects
|
|
40
|
+
max_generation (int): Max revision number ("generation") encountered in this PDF.
|
|
41
|
+
nodes_encountered (Dict[int, PdfTreeNode]): Nodes we've traversed already.
|
|
42
|
+
pdf_basename (str): The base name of the PDF file (with extension).
|
|
43
|
+
pdf_bytes (bytes): PDF binary data.
|
|
44
|
+
pdf_bytes_info (BytesInfo): File size, hashes, and other data points about the PDF's raw bytes.
|
|
45
|
+
pdf_filehandle (BufferedReader): File handle that reads the PDF.
|
|
46
|
+
pdf_path (str): The path to the PDF file.
|
|
47
|
+
pdf_size (int): Number of nodes as extracted from the PDF's Trailer node.
|
|
48
|
+
pdf_tree (PdfTreeNode): The top node of the PDF data structure tree.
|
|
49
|
+
verifier (PdfTreeVerifier): PdfTreeVerifier that can validate the PDF has been walked successfully.
|
|
35
50
|
"""
|
|
36
51
|
|
|
37
52
|
def __init__(self, pdf_path: str):
|
|
@@ -43,14 +58,21 @@ class Pdfalyzer:
|
|
|
43
58
|
self.pdf_basename = basename(pdf_path)
|
|
44
59
|
self.pdf_bytes = load_binary_data(pdf_path)
|
|
45
60
|
self.pdf_bytes_info = compute_file_hashes(self.pdf_bytes)
|
|
46
|
-
|
|
47
|
-
|
|
61
|
+
self.pdf_filehandle = open(pdf_path, 'rb') # Filehandle must be left open for PyPDF to perform seeks
|
|
62
|
+
|
|
63
|
+
try:
|
|
64
|
+
self.pdf_reader = PdfReader(self.pdf_filehandle)
|
|
65
|
+
except PdfReadError:
|
|
66
|
+
self._handle_fatal_error(f'PdfReadError: "{pdf_path}" doesn\'t seem to be a valid PDF file.')
|
|
67
|
+
except Exception as e:
|
|
68
|
+
console.print_exception()
|
|
69
|
+
self._handle_fatal_error(f"{PYPDF_ERROR_MSG}\n{e}")
|
|
48
70
|
|
|
49
71
|
# Initialize tracking variables
|
|
50
|
-
self.indeterminate_ids = set() # See INDETERMINATE_REF_KEYS comment
|
|
51
|
-
self.nodes_encountered: Dict[int, PdfTreeNode] = {} # Nodes we've seen already
|
|
52
72
|
self.font_infos: List[FontInfo] = [] # Font summary objects
|
|
53
73
|
self.max_generation = 0 # PDF revisions are "generations"; this is the max generation encountered
|
|
74
|
+
self.nodes_encountered: Dict[int, PdfTreeNode] = {} # Nodes we've seen already
|
|
75
|
+
self._indeterminate_ids = set() # See INDETERMINATE_REF_KEYS comment
|
|
54
76
|
|
|
55
77
|
# Bootstrap the root of the tree with the trailer. PDFs are always read trailer first.
|
|
56
78
|
# Technically the trailer has no PDF Object ID but we set it to the /Size of the PDF.
|
|
@@ -148,9 +170,9 @@ class Pdfalyzer:
|
|
|
148
170
|
from_node.add_child(to_node)
|
|
149
171
|
|
|
150
172
|
# Remove this to_node from indeterminacy now that it's got a child or parent
|
|
151
|
-
if relationship.to_obj.idnum in self.
|
|
173
|
+
if relationship.to_obj.idnum in self._indeterminate_ids:
|
|
152
174
|
log.info(f" Found {relationship} => {to_node} was marked indeterminate but now placed")
|
|
153
|
-
self.
|
|
175
|
+
self._indeterminate_ids.remove(relationship.to_obj.idnum)
|
|
154
176
|
|
|
155
177
|
# If the relationship is indeterminate or we've seen the PDF object before, add it as
|
|
156
178
|
# a non-tree relationship for now. An attempt to place the node will be made at the end.
|
|
@@ -159,7 +181,7 @@ class Pdfalyzer:
|
|
|
159
181
|
|
|
160
182
|
# If we already encountered 'to_node' then skip adding it to the queue of nodes to walk
|
|
161
183
|
if was_seen_before:
|
|
162
|
-
if relationship.to_obj.idnum not in self.
|
|
184
|
+
if relationship.to_obj.idnum not in self._indeterminate_ids and to_node.parent is None:
|
|
163
185
|
raise PdfWalkError(f"{relationship} - ref has no parent and is not indeterminate")
|
|
164
186
|
else:
|
|
165
187
|
log.debug(f" Already saw {relationship}; not scanning next")
|
|
@@ -167,7 +189,7 @@ class Pdfalyzer:
|
|
|
167
189
|
# Indeterminate relationships need to wait until everything has been scanned to be placed
|
|
168
190
|
elif relationship.is_indeterminate or (relationship.is_link and not self.is_in_tree(to_node)):
|
|
169
191
|
log.info(f' Indeterminate ref {relationship}')
|
|
170
|
-
self.
|
|
192
|
+
self._indeterminate_ids.add(to_node.idnum)
|
|
171
193
|
# Link nodes like /Dest are usually just links between nodes
|
|
172
194
|
elif relationship.is_link:
|
|
173
195
|
log.debug(f" Link ref {relationship}")
|
|
@@ -178,9 +200,13 @@ class Pdfalyzer:
|
|
|
178
200
|
|
|
179
201
|
return to_node
|
|
180
202
|
|
|
203
|
+
def _handle_fatal_error(self, msg: str) -> None:
|
|
204
|
+
self.pdf_filehandle.close()
|
|
205
|
+
print_fatal_error_and_exit(msg)
|
|
206
|
+
|
|
181
207
|
def _resolve_indeterminate_nodes(self) -> None:
|
|
182
208
|
"""Place all indeterminate nodes in the tree. Called after all nodes have been walked."""
|
|
183
|
-
indeterminate_nodes = [self.nodes_encountered[idnum] for idnum in self.
|
|
209
|
+
indeterminate_nodes = [self.nodes_encountered[idnum] for idnum in self._indeterminate_ids]
|
|
184
210
|
indeterminate_nodes_string = "\n ".join([f"{node}" for node in indeterminate_nodes])
|
|
185
211
|
log.info(f"Resolving {len(indeterminate_nodes)} indeterminate nodes: {indeterminate_nodes_string}")
|
|
186
212
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Parse command line arguments for
|
|
2
|
+
Parse command line arguments for `pdfalyze` and construct the `PdfalyzerConfig` object.
|
|
3
3
|
"""
|
|
4
4
|
import sys
|
|
5
5
|
from argparse import ArgumentParser, Namespace
|
|
@@ -7,23 +7,17 @@ from collections import namedtuple
|
|
|
7
7
|
from functools import partial, update_wrapper
|
|
8
8
|
from importlib.metadata import version
|
|
9
9
|
from os import getcwd, path
|
|
10
|
-
from pathlib import Path
|
|
11
10
|
from typing import List, Optional
|
|
12
11
|
|
|
13
12
|
from rich_argparse_plus import RichHelpFormatterPlus
|
|
14
13
|
from rich.prompt import Confirm
|
|
15
14
|
from rich.text import Text
|
|
16
|
-
from yaralyzer.helpers.file_helper import files_in_dir
|
|
17
15
|
from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args, source
|
|
18
16
|
from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation
|
|
19
17
|
|
|
20
|
-
|
|
21
18
|
from pdfalyzer.config import ALL_STREAMS, PDFALYZER, PdfalyzerConfig
|
|
22
19
|
from pdfalyzer.detection.constants.binary_regexes import QUOTE_PATTERNS
|
|
23
|
-
from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
|
|
24
|
-
with_pdf_extension)
|
|
25
20
|
from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
26
|
-
from pdfalyzer.util.page_range import PageRangeArgumentValidator
|
|
27
21
|
|
|
28
22
|
# NamedTuple to keep our argument selection orderly
|
|
29
23
|
OutputSection = namedtuple('OutputSection', ['argument', 'method'])
|
|
@@ -202,146 +196,10 @@ def output_sections(args: Namespace, pdfalyzer: 'Pdfalyzer') -> List[OutputSecti
|
|
|
202
196
|
|
|
203
197
|
|
|
204
198
|
def all_sections_chosen(args):
|
|
205
|
-
"""Returns
|
|
199
|
+
"""Returns True if all flags are set or no flags are set."""
|
|
206
200
|
return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)
|
|
207
201
|
|
|
208
202
|
|
|
209
|
-
#############################################################
|
|
210
|
-
# Separate arg parsers for combine_pdfs and other scripts #
|
|
211
|
-
#############################################################
|
|
212
|
-
|
|
213
|
-
MAX_QUALITY = 10
|
|
214
|
-
|
|
215
|
-
combine_pdfs_parser = ArgumentParser(
|
|
216
|
-
description="Combine multiple PDFs into one.",
|
|
217
|
-
epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" +
|
|
218
|
-
" page numbers prior to merging.",
|
|
219
|
-
formatter_class=RichHelpFormatterPlus)
|
|
220
|
-
|
|
221
|
-
combine_pdfs_parser.add_argument('pdfs',
|
|
222
|
-
help='two or more PDFs to combine',
|
|
223
|
-
metavar='PDF_PATH',
|
|
224
|
-
nargs='+')
|
|
225
|
-
|
|
226
|
-
combine_pdfs_parser.add_argument('-iq', '--image-quality',
|
|
227
|
-
help='image quality for embedded images (can compress PDF at loss of quality)',
|
|
228
|
-
choices=range(1, MAX_QUALITY + 1),
|
|
229
|
-
default=MAX_QUALITY,
|
|
230
|
-
type=int)
|
|
231
|
-
|
|
232
|
-
combine_pdfs_parser.add_argument('-o', '--output-file',
|
|
233
|
-
help='path to write the combined PDFs to',
|
|
234
|
-
required=True)
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
def parse_combine_pdfs_args() -> Namespace:
|
|
238
|
-
"""Parse command line args for combine_pdfs script."""
|
|
239
|
-
args = combine_pdfs_parser.parse_args()
|
|
240
|
-
args.output_file = with_pdf_extension(args.output_file)
|
|
241
|
-
confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
|
|
242
|
-
args.number_of_pdfs = len(args.pdfs)
|
|
243
|
-
|
|
244
|
-
if args.number_of_pdfs < 2:
|
|
245
|
-
exit_with_error(f"Need at least 2 PDFs to merge.")
|
|
246
|
-
elif not do_all_files_exist(args.pdfs):
|
|
247
|
-
exit_with_error()
|
|
248
|
-
elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
|
|
249
|
-
exit_with_error()
|
|
250
|
-
|
|
251
|
-
if all(is_pdf(pdf) for pdf in args.pdfs):
|
|
252
|
-
if all(extract_page_number(pdf) for pdf in args.pdfs):
|
|
253
|
-
print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
|
|
254
|
-
args.pdfs.sort(key=lambda pdf: extract_page_number(pdf))
|
|
255
|
-
else:
|
|
256
|
-
print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
|
|
257
|
-
else:
|
|
258
|
-
print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
|
|
259
|
-
ask_to_proceed()
|
|
260
|
-
|
|
261
|
-
print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
|
|
262
|
-
return args
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
###########################################
|
|
266
|
-
# Parse args for extract_pdf_pages() #
|
|
267
|
-
###########################################
|
|
268
|
-
page_range_validator = PageRangeArgumentValidator()
|
|
269
|
-
|
|
270
|
-
extract_pdf_parser = ArgumentParser(
|
|
271
|
-
formatter_class=RichHelpFormatterPlus,
|
|
272
|
-
description="Extract pages from one PDF into a new PDF.",
|
|
273
|
-
)
|
|
274
|
-
|
|
275
|
-
extract_pdf_parser.add_argument('pdf_file', metavar='PDF_FILE', help='PDF to extract pages from')
|
|
276
|
-
|
|
277
|
-
extract_pdf_parser.add_argument('--page-range', '-r',
|
|
278
|
-
type=page_range_validator,
|
|
279
|
-
help=page_range_validator.HELP_MSG,
|
|
280
|
-
required=True)
|
|
281
|
-
|
|
282
|
-
extract_pdf_parser.add_argument('--destination-dir', '-d',
|
|
283
|
-
help="directory to write the new PDF to",
|
|
284
|
-
default=Path.cwd())
|
|
285
|
-
|
|
286
|
-
extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
def parse_pdf_page_extraction_args() -> Namespace:
|
|
290
|
-
args = extract_pdf_parser.parse_args()
|
|
291
|
-
|
|
292
|
-
if not is_pdf(args.pdf_file):
|
|
293
|
-
log.error(f"'{args.pdf_file}' is not a PDF.")
|
|
294
|
-
sys.exit(-1)
|
|
295
|
-
elif not Path(args.destination_dir).exists():
|
|
296
|
-
log.error(f"Destination dir '{args.destination_dir}' does not exist.")
|
|
297
|
-
sys.exit(1)
|
|
298
|
-
|
|
299
|
-
return args
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
############################################
|
|
303
|
-
# Parse args for extract_text_from_pdfs() #
|
|
304
|
-
############################################
|
|
305
|
-
extract_text_parser = ArgumentParser(
|
|
306
|
-
formatter_class=RichHelpFormatterPlus,
|
|
307
|
-
description="Extract the text from one or more files or directories.",
|
|
308
|
-
epilog="If any of the FILE_OR_DIRs is a directory all files in that directory will be extracted."
|
|
309
|
-
)
|
|
310
|
-
|
|
311
|
-
extract_text_parser.add_argument('file_or_dir', nargs='+', metavar='FILE_OR_DIR')
|
|
312
|
-
extract_text_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
|
|
313
|
-
|
|
314
|
-
extract_text_parser.add_argument('--page-range', '-r',
|
|
315
|
-
type=page_range_validator,
|
|
316
|
-
help=f"[PDFs only] {page_range_validator.HELP_MSG}")
|
|
317
|
-
|
|
318
|
-
extract_text_parser.add_argument('--print-as-parsed', '-p',
|
|
319
|
-
action='store_true',
|
|
320
|
-
help='print pages as they are parsed instead of waiting until document is fully parsed')
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
def parse_text_extraction_args() -> Namespace:
|
|
324
|
-
args = extract_text_parser.parse_args()
|
|
325
|
-
args.files_to_process = []
|
|
326
|
-
|
|
327
|
-
for file_or_dir in args.file_or_dir:
|
|
328
|
-
file_path = Path(file_or_dir)
|
|
329
|
-
|
|
330
|
-
if not file_path.exists():
|
|
331
|
-
log.error(f"File '{file_path}' doesn't exist!")
|
|
332
|
-
sys.exit(-1)
|
|
333
|
-
elif file_path.is_dir():
|
|
334
|
-
args.files_to_process.extend(files_in_dir(file_path, 'pdf'))
|
|
335
|
-
else:
|
|
336
|
-
args.files_to_process.append(file_path)
|
|
337
|
-
|
|
338
|
-
if args.page_range and (len(args.files_to_process) > 1 or not is_pdf(args.files_to_process[0])):
|
|
339
|
-
log.error(f"--page-range can only be specified for a single PDF")
|
|
340
|
-
sys.exit(-1)
|
|
341
|
-
|
|
342
|
-
return args
|
|
343
|
-
|
|
344
|
-
|
|
345
203
|
#############
|
|
346
204
|
# Helpers #
|
|
347
205
|
#############
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Argument parsers for the command line tools other than `pdfalyze` that are included with The Pdfalyzer.
|
|
3
|
+
|
|
4
|
+
1. combine_pdfs
|
|
5
|
+
2. extract_pdf_pages
|
|
6
|
+
3. extract_pdf_text
|
|
7
|
+
"""
|
|
8
|
+
import logging
|
|
9
|
+
import sys
|
|
10
|
+
from argparse import ArgumentParser, Namespace
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from rich_argparse_plus import RichHelpFormatterPlus
|
|
14
|
+
from rich.prompt import Confirm
|
|
15
|
+
from rich.text import Text
|
|
16
|
+
from yaralyzer.helpers.file_helper import files_in_dir
|
|
17
|
+
from yaralyzer.util.logging import log
|
|
18
|
+
|
|
19
|
+
from pdfalyzer.util.argument_parser import ask_to_proceed, exit_with_error
|
|
20
|
+
from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
|
|
21
|
+
with_pdf_extension)
|
|
22
|
+
from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
23
|
+
from pdfalyzer.util.page_range import PageRangeArgumentValidator
|
|
24
|
+
|
|
25
|
+
MAX_QUALITY = 10
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
##################
|
|
29
|
+
# combine_pdfs #
|
|
30
|
+
##################
|
|
31
|
+
combine_pdfs_parser = ArgumentParser(
|
|
32
|
+
description="Combine multiple PDFs into one.",
|
|
33
|
+
epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" +
|
|
34
|
+
" page numbers prior to merging.",
|
|
35
|
+
formatter_class=RichHelpFormatterPlus)
|
|
36
|
+
|
|
37
|
+
combine_pdfs_parser.add_argument('pdfs',
|
|
38
|
+
help='two or more PDFs to combine',
|
|
39
|
+
metavar='PDF_PATH',
|
|
40
|
+
nargs='+')
|
|
41
|
+
|
|
42
|
+
combine_pdfs_parser.add_argument('-iq', '--image-quality',
|
|
43
|
+
help='image quality for embedded images (can compress PDF at loss of quality)',
|
|
44
|
+
choices=range(1, MAX_QUALITY + 1),
|
|
45
|
+
default=MAX_QUALITY,
|
|
46
|
+
type=int)
|
|
47
|
+
|
|
48
|
+
combine_pdfs_parser.add_argument('-o', '--output-file',
|
|
49
|
+
help='path to write the combined PDFs to',
|
|
50
|
+
required=True)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def parse_combine_pdfs_args() -> Namespace:
    """
    Parse and validate the command line args for the combine_pdfs script.

    Besides the parsed options the returned Namespace carries:
      * ``output_file`` passed through ``with_pdf_extension()``
      * ``number_of_pdfs``, the count of PDF paths supplied
      * ``pdfs``, re-sorted in place by ``extract_page_number()`` when every path yields a truthy page number

    ``exit_with_error()`` is called when fewer than 2 PDFs are given, when ``do_all_files_exist()`` fails,
    or when the output file exists and the user declines to overwrite it.
    """
    args = combine_pdfs_parser.parse_args()
    args.output_file = with_pdf_extension(args.output_file)
    args.number_of_pdfs = len(args.pdfs)

    # Prompt shown only if the output file is already there.
    overwrite_prompt = Text("Overwrite '")
    overwrite_prompt.append(args.output_file, style='cyan')
    overwrite_prompt.append("'?")

    if args.number_of_pdfs < 2:
        exit_with_error("Need at least 2 PDFs to merge.")
    elif not do_all_files_exist(args.pdfs):
        exit_with_error()
    elif file_exists(args.output_file) and not Confirm.ask(overwrite_prompt):
        exit_with_error()

    if not all(map(is_pdf, args.pdfs)):
        print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
        ask_to_proceed()
    elif all(map(extract_page_number, args.pdfs)):
        print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
        args.pdfs.sort(key=extract_page_number)
    else:
        print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')

    print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
    return args
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
#####################
|
|
82
|
+
# extract_pdf_pages #
|
|
83
|
+
#####################
|
|
84
|
+
# Validator instance used as the argparse `type` for the --page-range options in this module.
page_range_validator = PageRangeArgumentValidator()

# Argument parser for the extract_pdf_pages command line tool.
extract_pdf_parser = ArgumentParser(
    description="Extract pages from one PDF into a new PDF.",
    formatter_class=RichHelpFormatterPlus,
)

extract_pdf_parser.add_argument('pdf_file', help='PDF to extract pages from', metavar='PDF_FILE')
extract_pdf_parser.add_argument('--debug', help='turn on debug level logging', action='store_true')

extract_pdf_parser.add_argument(
    '--page-range', '-r',
    required=True,
    help=page_range_validator.HELP_MSG,
    type=page_range_validator,
)

# Default is the working directory at the time this module is imported.
extract_pdf_parser.add_argument(
    '--destination-dir', '-d',
    default=Path.cwd(),
    help="directory to write the new PDF to",
)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def parse_pdf_page_extraction_args() -> Namespace:
    """
    Parse and validate the command line args for the extract_pdf_pages script.

    Logs an error and exits the process if the input is not a PDF (per ``is_pdf()``) or if
    ``--destination-dir`` does not exist or is not a directory. Debug logging is enabled
    afterwards when ``--debug`` was passed.
    """
    args = extract_pdf_parser.parse_args()
    destination_dir = Path(args.destination_dir)

    if not is_pdf(args.pdf_file):
        log.error(f"'{args.pdf_file}' is not a PDF.")
        sys.exit(-1)
    elif not destination_dir.exists():
        log.error(f"Destination dir '{args.destination_dir}' does not exist.")
        sys.exit(1)
    elif not destination_dir.is_dir():
        # A path that exists but is a regular file would otherwise pass validation and only
        # fail later when the extracted PDF is written.
        log.error(f"Destination '{args.destination_dir}' is not a directory.")
        sys.exit(1)

    _set_log_level(args)
    return args
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
######################
|
|
119
|
+
# extract_pdf_text #
|
|
120
|
+
######################
|
|
121
|
+
# Argument parser for the extract_pdf_text command line tool.
extract_text_parser = ArgumentParser(
    description="Extract the text from one or more files or directories.",
    epilog="If any of the FILE_OR_DIRs is a directory all PDF files in that directory will be extracted.",
    formatter_class=RichHelpFormatterPlus,
)

extract_text_parser.add_argument('file_or_dir', metavar='FILE_OR_DIR', nargs='+')
extract_text_parser.add_argument('--debug', help='turn on debug level logging', action='store_true')

# Optional here, unlike the required --page-range of the page extraction parser.
extract_text_parser.add_argument(
    '--page-range', '-r',
    help=f"[PDFs only] {page_range_validator.HELP_MSG}",
    type=page_range_validator,
)

extract_text_parser.add_argument(
    '--print-as-parsed', '-p',
    help='print pages as they are parsed instead of waiting until parsing complete',
    action='store_true',
)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def parse_text_extraction_args() -> Namespace:
    """
    Parse and validate the command line args for the extract_pdf_text script.

    Builds ``args.files_to_process`` from the FILE_OR_DIR arguments: plain paths are added as
    ``Path`` objects, and directories are expanded with ``files_in_dir(path, 'pdf')``.
    Logs an error and exits the process if any path does not exist, or if ``--page-range``
    is given for anything other than exactly one PDF.
    """
    args = extract_text_parser.parse_args()
    args.files_to_process = []

    for file_or_dir in args.file_or_dir:
        file_path = Path(file_or_dir)

        if not file_path.exists():
            log.error(f"'{file_path}' is not a valid file or directory.")
            sys.exit(-1)
        elif file_path.is_dir():
            args.files_to_process.extend(files_in_dir(file_path, 'pdf'))
        else:
            args.files_to_process.append(file_path)

    # Require exactly one file. Testing for "!= 1" rather than "> 1" also covers an empty list
    # (e.g. only directories with no PDFs were given), which would otherwise raise an
    # IndexError on files_to_process[0].
    if args.page_range and (len(args.files_to_process) != 1 or not is_pdf(args.files_to_process[0])):
        log.error("--page-range can only be specified for a single PDF")
        sys.exit(-1)

    _set_log_level(args)
    return args
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _set_log_level(args: Namespace):
    """Switch the shared ``log`` logger to DEBUG level when ``args.debug`` is truthy."""
    if not args.debug:
        return

    log.setLevel(logging.DEBUG)
|
pdfalyzer/util/page_range.py
CHANGED
|
@@ -6,7 +6,7 @@ from argparse import ArgumentTypeError
|
|
|
6
6
|
from dataclasses import dataclass
|
|
7
7
|
from typing import Tuple
|
|
8
8
|
|
|
9
|
-
PAGE_RANGE_REGEX = re.compile('
|
|
9
|
+
# A page number ('11') or a range ('11-15'); the first number may not start with 0.
# Anchored with \Z because callers use .match(), which only anchors at the start: without it
# inputs like '5abc', '1-2-3' or '12-' pass validation and fail later during int() parsing.
PAGE_RANGE_REGEX = re.compile(r'[1-9](\d+)?(-\d+)?\Z')
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
@dataclass
|
|
@@ -15,7 +15,7 @@ class PageRange:
|
|
|
15
15
|
|
|
16
16
|
def __post_init__(self):
|
|
17
17
|
if not PAGE_RANGE_REGEX.match(self.page_range):
|
|
18
|
-
raise
|
|
18
|
+
raise ArgumentTypeError(f"Invalid page range '{self.page_range}'")
|
|
19
19
|
|
|
20
20
|
if '-' in self.page_range:
|
|
21
21
|
(self.first_page, self.last_page) = (int(p) for p in self.page_range.split('-'))
|
|
@@ -35,10 +35,10 @@ class PageRange:
|
|
|
35
35
|
if self.first_page + 1 == self.last_page:
|
|
36
36
|
return f"page_{self.first_page}"
|
|
37
37
|
else:
|
|
38
|
-
return f"pages_{self.first_page}-{self.last_page}"
|
|
38
|
+
return f"pages_{self.first_page}-{self.last_page - 1}"
|
|
39
39
|
|
|
40
40
|
def to_tuple(self) -> Tuple[int, int]:
|
|
41
|
-
return (self.first_page, self.last_page)
|
|
41
|
+
return (self.first_page - 1, self.last_page - 1)
|
|
42
42
|
|
|
43
43
|
def __repr__(self) -> str:
|
|
44
44
|
return f"PageRange({self.first_page}, {self.last_page})"
|
|
@@ -48,7 +48,4 @@ class PageRangeArgumentValidator(object):
|
|
|
48
48
|
HELP_MSG = "a single digit ('11') or a range ('11-15') (WILL NOT extract the last page)"
|
|
49
49
|
|
|
50
50
|
def __call__(self, value):
|
|
51
|
-
if not PAGE_RANGE_REGEX.match(value):
|
|
52
|
-
raise ArgumentTypeError("Argument has to match '{}'".format(PAGE_RANGE_REGEX.pattern))
|
|
53
|
-
|
|
54
51
|
return PageRange(value)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pdfalyzer
|
|
3
|
-
Version: 1.17.
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 1.17.7
|
|
4
|
+
Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
|
|
5
5
|
Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
6
6
|
License: GPL-3.0-or-later
|
|
7
7
|
Keywords: ascii art,binary,color,cybersecurity,DFIR,encoding,font,infosec,maldoc,malicious pdf,malware,malware analysis,pdf,pdfs,pdf analysis,pypdf,threat assessment,threat hunting,threat intelligence,threat research,threatintel,visualization,yara
|
|
@@ -22,9 +22,9 @@ Classifier: Topic :: Artistic Software
|
|
|
22
22
|
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
23
23
|
Classifier: Topic :: Security
|
|
24
24
|
Provides-Extra: extract
|
|
25
|
-
Requires-Dist: PyMuPDF (>=1.26.
|
|
25
|
+
Requires-Dist: PyMuPDF (>=1.26.5,<2.0.0) ; extra == "extract"
|
|
26
26
|
Requires-Dist: anytree (>=2.13,<3.0)
|
|
27
|
-
Requires-Dist: pypdf (>=6.
|
|
27
|
+
Requires-Dist: pypdf (>=6.1.3,<7.0.0)
|
|
28
28
|
Requires-Dist: pytesseract (>=0.3.13,<0.4.0) ; extra == "extract"
|
|
29
29
|
Requires-Dist: yaralyzer (>=1.0.9,<2.0.0)
|
|
30
30
|
Project-URL: Changelog, https://github.com/michelcrypt4d4mus/pdfalyzer/blob/master/CHANGELOG.md
|
|
@@ -67,9 +67,8 @@ If you're looking for one of these things this may be the tool for you.
|
|
|
67
67
|
### What It Don't Do
|
|
68
68
|
This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.
|
|
69
69
|
|
|
70
|
-
If you suspect you are dealing with a malcious PDF you can safely run `pdfalyze` on it
|
|
70
|
+
If you suspect you are dealing with a malicious PDF you can safely run `pdfalyze` on it. Embedded javascript and `/OpenAction` nodes etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
|
|
71
71
|
|
|
72
|
-
-------------
|
|
73
72
|
|
|
74
73
|
# Installation
|
|
75
74
|
#### All Platforms
|
|
@@ -99,7 +98,6 @@ brew install pdfalyzer
|
|
|
99
98
|
sudo apt-get install build-essential libssl-dev libffi-dev rustc
|
|
100
99
|
```
|
|
101
100
|
|
|
102
|
-
-------------
|
|
103
101
|
|
|
104
102
|
# Usage
|
|
105
103
|
|
|
@@ -115,7 +113,7 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
|
|
|
115
113
|
|
|
116
114
|
The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
|
|
117
115
|
|
|
118
|
-
|
|
116
|
+
#### Setting Command Line Options Permanently With A `.pdfalyzer` File
|
|
119
117
|
When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
|
|
120
118
|
|
|
121
119
|
1. the current directory
|
|
@@ -123,12 +121,9 @@ When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfa
|
|
|
123
121
|
|
|
124
122
|
If it finds a `.pdfalyzer` file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
|
|
125
123
|
|
|
126
|
-
|
|
124
|
+
#### Environment Variables
|
|
127
125
|
Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
|
|
128
126
|
|
|
129
|
-
### Colors And Themes
|
|
130
|
-
Run `pdfalyzer_show_color_theme` to see the color theme employed.
|
|
131
|
-
|
|
132
127
|
### Guarantees
|
|
133
128
|
Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
|
|
134
129
|
|
|
@@ -136,7 +131,22 @@ Warnings will be printed if any PDF object ID between 1 and the `/Size` reported
|
|
|
136
131
|
[BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
|
|
137
132
|
|
|
138
133
|
|
|
139
|
-
##
|
|
134
|
+
## Included Command Line Tools
|
|
135
|
+
The Pdfalyzer comes with a few command line tools for doing stuff with PDFs:
|
|
136
|
+
|
|
137
|
+
* `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
|
|
138
|
+
* `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
|
|
139
|
+
* `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
|
|
140
|
+
* `pdfalyzer_show_color_theme` - Run to see the color theme employed in Pdfalyzer's output.
|
|
141
|
+
|
|
142
|
+
Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
pipx install pdfalyzer[extract]
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
## As A Python Library
|
|
140
150
|
For info about setting up a dev environment see [Contributing](#contributing) below.
|
|
141
151
|
|
|
142
152
|
At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class. Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
|
|
@@ -247,26 +257,6 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
|
|
|
247
257
|
|
|
248
258
|
|
|
249
259
|
# PDF Resources
|
|
250
|
-
## Included PDF Tools
|
|
251
|
-
The Pdfalyzer comes with a few command line tools:
|
|
252
|
-
|
|
253
|
-
#### `combine_pdfs`
|
|
254
|
-
Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
|
|
255
|
-
|
|
256
|
-
#### `extract_pdf_pages`
|
|
257
|
-
Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` to see the options.
|
|
258
|
-

|
|
259
|
-
|
|
260
|
-
#### `extract_text_from_pdfs`
|
|
261
|
-
Extracts text from a PDF, including applying OCR to all embedded images. Requires that you install The Pdfalyzer's optional dependencies:
|
|
262
|
-
|
|
263
|
-
```bash
|
|
264
|
-
pipx install pdfalyzer[extract]
|
|
265
|
-
```
|
|
266
|
-
|
|
267
|
-
Run `extract_text_from_pdfs --help` to see the options.
|
|
268
|
-
|
|
269
|
-
|
|
270
260
|
## 3rd Party PDF Tools
|
|
271
261
|
### Installing Didier Stevens's PDF Analysis Tools
|
|
272
262
|
Stevens's tools provide comprehensive info about the contents of a PDF, are guaranteed not to trigger the rendering of any malicious content (especially `pdfid.py`), and have been battle tested for well over a decade. It would probably be a good idea to analyze your PDF with his tools before you start working with this one.
|
|
@@ -1,28 +1,28 @@
|
|
|
1
1
|
.pdfalyzer.example,sha256=sh_qkUBw4hfJia_Dx2wB-fsqJInhx2sSgA7WJz3MHYo,3917
|
|
2
|
-
CHANGELOG.md,sha256=
|
|
2
|
+
CHANGELOG.md,sha256=LEAlcDOgi-BH86Pe66RFDGFgOfHVaZD05veJbCPyBB0,13681
|
|
3
3
|
LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
4
|
-
pdfalyzer/__init__.py,sha256=
|
|
4
|
+
pdfalyzer/__init__.py,sha256=3ylD-19PcG1bJ-rMa6ruP06QaM9Q1BitaMOA2ppugM8,6197
|
|
5
5
|
pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
|
|
6
6
|
pdfalyzer/binary/binary_scanner.py,sha256=gEHHdQ3lKe1P1441fk2OLO8GxJF7FxF8Tq04ElB8ilU,10748
|
|
7
7
|
pdfalyzer/config.py,sha256=RBtp3Q6IvOW922rcanB4mVceJEM0BEjNybc6_vC7efY,2122
|
|
8
8
|
pdfalyzer/decorators/document_model_printer.py,sha256=2tjJItZltukmOD2wjGvl2IsBM7Gug59wMHGLpzGDbV0,2682
|
|
9
9
|
pdfalyzer/decorators/indeterminate_node.py,sha256=QLJr-nGKih8gPZcIqxLU028OwWWD5VjNHYMUjniwT_k,6586
|
|
10
|
-
pdfalyzer/decorators/pdf_file.py,sha256=
|
|
10
|
+
pdfalyzer/decorators/pdf_file.py,sha256=ryAYzzsO8Fw5_ZMoomruW0Bal8pTb5C0VlLOTjdVqNI,10552
|
|
11
11
|
pdfalyzer/decorators/pdf_object_properties.py,sha256=Il3RObxQ4XUf0Ei-nd4tjJO0LeaxC6u7yFa3cQs_jVY,5485
|
|
12
12
|
pdfalyzer/decorators/pdf_tree_node.py,sha256=4LReGJUtG8iEcLUQD1jW-yp3xPWsHrC-3Anbkt7XZ3A,11134
|
|
13
13
|
pdfalyzer/decorators/pdf_tree_verifier.py,sha256=2hVe9APsAWQZ7ra8AGndHQnGWmmxmb3ZwfJHZuLvsvc,4714
|
|
14
14
|
pdfalyzer/detection/constants/binary_regexes.py,sha256=s69S7uq1v4vBy3ZkKKKt3ClNuFCuQ0ztootUxzlgfFw,1632
|
|
15
15
|
pdfalyzer/detection/constants/javascript_reserved_keywords.py,sha256=CXXdWskdQa0Hs5wCci2RBVvipgZg34_cLfmkWG4Xcmg,991
|
|
16
16
|
pdfalyzer/detection/javascript_hunter.py,sha256=_wT2vkKTMlm_RGCjYsmwcmV-ag1qep3EpkHmUw0nWcQ,711
|
|
17
|
-
pdfalyzer/detection/yaralyzer_helper.py,sha256=
|
|
17
|
+
pdfalyzer/detection/yaralyzer_helper.py,sha256=bfIa18f0zdUoOAJIQcwVDjF52sJNV47NdsDasg01uiQ,2194
|
|
18
18
|
pdfalyzer/font_info.py,sha256=2R85iETY_1eKCeRrkqeIxfPDqXZyWfCNcHx_-aTyF0s,6682
|
|
19
19
|
pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
|
|
20
|
-
pdfalyzer/helpers/filesystem_helper.py,sha256=
|
|
21
|
-
pdfalyzer/helpers/image_helper.py,sha256=
|
|
20
|
+
pdfalyzer/helpers/filesystem_helper.py,sha256=QCdUcZlufBWaV38LlrQEGUGOGUtZEozMVSRkoTjwKlU,5046
|
|
21
|
+
pdfalyzer/helpers/image_helper.py,sha256=mDiscZZ7yrsFa-bxFqIEz9gH3WGhz8455yhXd4_QfAY,1134
|
|
22
22
|
pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
|
|
23
23
|
pdfalyzer/helpers/pdf_object_helper.py,sha256=65BlUgnDM9brxJFw_WF8QLomWHaNh-pj88NWoxcMkoQ,1160
|
|
24
24
|
pdfalyzer/helpers/rich_text_helper.py,sha256=Q5Zj0I96ymQmDWHkOX4lWEvkizOMMgzYNx4CF35t_7w,3561
|
|
25
|
-
pdfalyzer/helpers/string_helper.py,sha256=
|
|
25
|
+
pdfalyzer/helpers/string_helper.py,sha256=zl7VnxqkaB50Zv1yQoz-ShVcLT2_nOgmxekWTpXHyx4,2521
|
|
26
26
|
pdfalyzer/output/character_mapping.py,sha256=UN66b4BjvJiokBCi2kregiQvi6u2l1BJcHYFGG_G43M,2190
|
|
27
27
|
pdfalyzer/output/layout.py,sha256=U9n5RnwwBg2UXxRBAc4E2gQ9t3dNsmiu62klz-Ig1Zg,2767
|
|
28
28
|
pdfalyzer/output/pdfalyzer_presenter.py,sha256=TUsMc2GTUDjFzIGk7Ep5ZASfXcKX_WNtZzZKbQTHcfY,8580
|
|
@@ -33,12 +33,13 @@ pdfalyzer/output/tables/font_summary_table.py,sha256=TyCwcvqn99LXTWnmtk6MBPdc_33
|
|
|
33
33
|
pdfalyzer/output/tables/pdf_node_rich_table.py,sha256=7G-FLb_EUP50kZmYCTbo8Q6taU4xKp2QIGNOnQtYbNg,5908
|
|
34
34
|
pdfalyzer/output/tables/stream_objects_table.py,sha256=PgQj8oTtW5_X8SMQb3FvCWDS-d4Zl6QiE44Qhiv7lTY,706
|
|
35
35
|
pdfalyzer/pdf_object_relationship.py,sha256=tOJTp73m82oNZJB7NxvTLv167kqthH8PRbVnev_4uEk,5291
|
|
36
|
-
pdfalyzer/pdfalyzer.py,sha256=
|
|
36
|
+
pdfalyzer/pdfalyzer.py,sha256=iu4D3Y9qlKP0D_k883ji4U6LLzelQkHONlzAed0QUx4,12713
|
|
37
37
|
pdfalyzer/util/adobe_strings.py,sha256=eF4K1RhhR_qgMBT58MzCxWkqQj17OWLCNmG3SjZ9BUs,5045
|
|
38
|
-
pdfalyzer/util/argument_parser.py,sha256=
|
|
38
|
+
pdfalyzer/util/argument_parser.py,sha256=9ixQMEZz00IPK8xRxuS7DMrQgXIrqhB2ve5W8XTZ1S8,9732
|
|
39
|
+
pdfalyzer/util/cli_tools_argument_parser.py,sha256=HyZhztyrPtbvOswmG975M0tK5KPon37lV3fxVA0OwYo,6277
|
|
39
40
|
pdfalyzer/util/debugging.py,sha256=hjYGxptJmal9TTaAUkkoo0oNu2tdx6ZYSyC0WjvzHh0,156
|
|
40
41
|
pdfalyzer/util/exceptions.py,sha256=XLFFTdx1n6i_VCmvuzvIOCa-djJvGEitfo9lhy3zq0k,98
|
|
41
|
-
pdfalyzer/util/page_range.py,sha256=
|
|
42
|
+
pdfalyzer/util/page_range.py,sha256=NMNh3_TojxTxBIpvUYK1AmvID_m8qOP6AihZrLWZF2I,1652
|
|
42
43
|
pdfalyzer/util/pdf_parser_manager.py,sha256=FVRYAYsCd0y5MAm--qvXnwCZnDtB3x85FdJtb-gpyw4,3109
|
|
43
44
|
pdfalyzer/yara_rules/PDF.yara,sha256=70JzPq5F6AS8F46Seu6u0j5GS1JHxkS42r7g7PVSpRg,81489
|
|
44
45
|
pdfalyzer/yara_rules/PDF_binary_stream.yara,sha256=Qt0Wd7RFXYiHaT9YxTCrhC68ccmFcEG1XMNC3p5IwcI,821
|
|
@@ -46,8 +47,8 @@ pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
|
|
|
46
47
|
pdfalyzer/yara_rules/didier_stevens.yara,sha256=4XhqafU09xzYUP7LCygHHBXOpAXUblJf6Tkn37MUy0w,7253
|
|
47
48
|
pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
|
|
48
49
|
pdfalyzer/yara_rules/pdf_malware.yara,sha256=jDqSTP5BQSi2I_1xZiFZdy68I4oVWDat2j08-qdfbto,91063
|
|
49
|
-
pdfalyzer-1.17.
|
|
50
|
-
pdfalyzer-1.17.
|
|
51
|
-
pdfalyzer-1.17.
|
|
52
|
-
pdfalyzer-1.17.
|
|
53
|
-
pdfalyzer-1.17.
|
|
50
|
+
pdfalyzer-1.17.7.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
51
|
+
pdfalyzer-1.17.7.dist-info/METADATA,sha256=Cbd6Qu3SS8xGKrC__jEPG-74nnYvY0rJu9pirLiqrFQ,27328
|
|
52
|
+
pdfalyzer-1.17.7.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
53
|
+
pdfalyzer-1.17.7.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
|
|
54
|
+
pdfalyzer-1.17.7.dist-info/RECORD,,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[console_scripts]
|
|
2
2
|
combine_pdfs=pdfalyzer:combine_pdfs
|
|
3
3
|
extract_pdf_pages=pdfalyzer:extract_pdf_pages
|
|
4
|
-
|
|
4
|
+
extract_pdf_text=pdfalyzer:extract_pdf_text
|
|
5
5
|
pdfalyze=pdfalyzer:pdfalyze
|
|
6
6
|
pdfalyzer_show_color_theme=pdfalyzer:pdfalyzer_show_color_theme
|
|
7
7
|
|
|
File without changes
|
|
File without changes
|