pdfalyzer 1.17.0__tar.gz → 1.17.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pdfalyzer might be problematic.
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/CHANGELOG.md +3 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/PKG-INFO +5 -11
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/README.md +4 -10
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/__init__.py +4 -5
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/decorators/pdf_file.py +3 -2
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/detection/yaralyzer_helper.py +0 -1
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/helpers/string_helper.py +28 -30
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/util/argument_parser.py +1 -143
- pdfalyzer-1.17.1/pdfalyzer/util/cli_tools_argument_parser.py +156 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/util/page_range.py +4 -7
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pyproject.toml +5 -6
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/.pdfalyzer.example +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/LICENSE +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/__main__.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/binary/binary_scanner.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/config.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/decorators/document_model_printer.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/decorators/indeterminate_node.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/decorators/pdf_object_properties.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/decorators/pdf_tree_node.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/decorators/pdf_tree_verifier.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/detection/constants/binary_regexes.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/detection/constants/javascript_reserved_keywords.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/detection/javascript_hunter.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/font_info.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/helpers/dict_helper.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/helpers/filesystem_helper.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/helpers/image_helper.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/helpers/number_helper.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/helpers/pdf_object_helper.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/helpers/rich_text_helper.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/character_mapping.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/layout.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/pdfalyzer_presenter.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/styles/node_colors.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/styles/rich_theme.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/tables/decoding_stats_table.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/tables/font_summary_table.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/tables/pdf_node_rich_table.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/tables/stream_objects_table.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/pdf_object_relationship.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/pdfalyzer.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/util/adobe_strings.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/util/debugging.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/util/exceptions.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/util/pdf_parser_manager.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/yara_rules/PDF.yara +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/yara_rules/PDF_binary_stream.yara +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/yara_rules/__init.py__ +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/yara_rules/didier_stevens.yara +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/yara_rules/lprat.static_file_analysis.yara +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/yara_rules/pdf_malware.yara +0 -0
{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/CHANGELOG.md

@@ -1,5 +1,8 @@
 # NEXT RELEASE

+### 1.17.1
+* Fix issue where `combine_pdfs` page ranges were indexed from 0 instead of 1
+
 # 1.17.0
 * Add `extract_pdf_pages` command line tool (imported from `clown_sort`)
 * Add `extract_text_from_pdfs` command line tool (imported from `clown_sort`)
{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pdfalyzer
-Version: 1.17.0
+Version: 1.17.1
 Summary: PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
 Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
 License: GPL-3.0-or-later
@@ -250,22 +250,16 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
 ## Included PDF Tools
 The Pdfalyzer comes with a few command line tools:

-
-
+* `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
+* `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
+* `extract_text_from_pdfs` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_text_from_pdfs --help` for more info.

-
-Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` to see the options.
-
-
-#### `extract_text_from_pdfs`
-Extracts text from a PDF, including applying OCR to all embedded images. Requires that you install The Pdfalyzer's optional dependencies:
+Running `extract_text_from_pdfs` requires that you install The Pdfalyzer's optional dependencies:

 ```bash
 pipx install pdfalyzer[extract]
 ```

-Run `extract_text_from_pdfs --help` to see the options.
-

 ## 3rd Party PDF Tools
 ### Installing Didier Stevens's PDF Analysis Tools
{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/README.md

@@ -216,22 +216,16 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
 ## Included PDF Tools
 The Pdfalyzer comes with a few command line tools:

-
-
+* `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
+* `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
+* `extract_text_from_pdfs` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_text_from_pdfs --help` for more info.

-
-Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` to see the options.
-
-
-#### `extract_text_from_pdfs`
-Extracts text from a PDF, including applying OCR to all embedded images. Requires that you install The Pdfalyzer's optional dependencies:
+Running `extract_text_from_pdfs` requires that you install The Pdfalyzer's optional dependencies:

 ```bash
 pipx install pdfalyzer[extract]
 ```

-Run `extract_text_from_pdfs --help` to see the options.
-

 ## 3rd Party PDF Tools
 ### Installing Didier Stevens's PDF Analysis Tools
{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/__init__.py

@@ -31,7 +31,8 @@ from pdfalyzer.helpers.rich_text_helper import print_highlighted
 from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
 from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
 from pdfalyzer.pdfalyzer import Pdfalyzer
-from pdfalyzer.util.argument_parser import
+from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments
+from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, ask_to_proceed,
     parse_combine_pdfs_args, parse_pdf_page_extraction_args, parse_text_extraction_args)
 from pdfalyzer.util.pdf_parser_manager import PdfParserManager

@@ -135,15 +136,13 @@ def combine_pdfs():


 def extract_pdf_pages() -> None:
+    """Extract a range of pages from a PDF to a new PDF."""
     args = parse_pdf_page_extraction_args()
     PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)


 def extract_text_from_pdfs() -> None:
-    """
-    Extract text from a single file or from all files in a given directory. Can accept
-    multiple paths as arguments on the command line.
-    """
+    """Extract text from a list of file or from all PDF files in a list of directories."""
     args: Namespace = parse_text_extraction_args()
     console.line()

{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/decorators/pdf_file.py

@@ -34,6 +34,7 @@ class PdfFile:
         basename (str): The base name of the PDF file (with extension).
         basename_without_ext (str): The base name of the PDF file (without extension).
         extname (str): The file extension of the PDF file.
+        file_size (int): The size of the file in bytes.
     """

     def __init__(self, file_path: Union[str, Path]) -> None:
@@ -44,7 +45,7 @@ class PdfFile:
         self.file_path: Path = Path(file_path)

         if not self.file_path.exists():
-            raise FileNotFoundError(f"
+            raise FileNotFoundError(f"'{file_path}' is not a valid file or directory.")

         self.dirname = self.file_path.parent
         self.basename: str = path.basename(file_path)
@@ -71,7 +72,7 @@ class PdfFile:
         Returns:
             Path: The path to the newly created PDF file containing the extracted pages.
         """
-        destination_dir = destination_dir or self.dirname
+        destination_dir = Path(destination_dir or self.dirname)
         create_dir_if_it_does_not_exist(destination_dir)

         if extra_file_suffix is None:
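For orientation, here is a minimal sketch of calling the patched `PdfFile.extract_page_range` as a library, mirroring how the `extract_pdf_pages` entry point invokes it in `pdfalyzer/__init__.py` above. The file name `report.pdf` and the `excerpts/` output directory are hypothetical.

```python
from pathlib import Path

from pdfalyzer.decorators.pdf_file import PdfFile
from pdfalyzer.util.page_range import PageRange

# Page ranges are 1-indexed: '10-25' selects pages 10 through 24
# (the last page of the range is not extracted, per PageRangeArgumentValidator.HELP_MSG).
page_range = PageRange('10-25')

# As of 1.17.1 destination_dir may be a str or a Path; it is coerced with Path()
# and created if it does not already exist.
new_pdf: Path = PdfFile('report.pdf').extract_page_range(page_range, destination_dir='excerpts')
print(f"Extracted pages written to {new_pdf}")
```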
{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/helpers/string_helper.py

@@ -3,7 +3,7 @@ Various text formatting/styling/manipulating methods.
 """
 import re
 from pprint import PrettyPrinter
-from typing import List, Pattern, Union
+from typing import List, Optional, Pattern, Union

 from yaralyzer.output.rich_console import console_width

@@ -18,16 +18,14 @@ pp = PrettyPrinter(
     sort_dicts=True)


-def
-    """
-
+def all_strings_are_same_ignoring_numbers(strings: List[str]) -> bool:
+    """Returns true if string addresses are same except for digits."""
+    return len(set([replace_digits(s) for s in strings])) == 1

-    if title is None:
-        return '-' * width

-
-
-    return
+def bracketed(index: Union[int, str]) -> str:
+    """Surround index with [ and ]."""
+    return f"[{index}]"


 def count_pattern_matches_in_text(pattern: str, text: str) -> int:
@@ -44,9 +42,20 @@ def exception_str(e: Exception) -> str:
     return f"{type(e).__name__}: {e}"


-def
-    """
-
+def generate_hyphen_line(width: Optional[int] = None, title: Optional[str] = None):
+    """e.g. '-----------------BEGIN-----------------'"""
+    width = width or console_width()
+
+    if title is None:
+        return '-' * width
+
+    side_hyphens = int((width - len(title)) / 2) * '-'
+    line = side_hyphens + title + side_hyphens
+    return line if len(line) == width else line + '-'
+
+
+def has_a_common_substring(strings: List[str]) -> bool:
+    return all([is_substring_of_longer_strings_in_list(s, strings) for s in strings])


 def is_prefixed_by_any(_string: str, prefixes: List[str]) -> bool:
@@ -54,9 +63,10 @@ def is_prefixed_by_any(_string: str, prefixes: List[str]) -> bool:
     return any([_string.startswith(prefix) for prefix in prefixes])


-def
-    """
-
+def is_substring_of_longer_strings_in_list(_string: str, strings: List[str]) -> bool:
+    """Return True if '_string' is a substring of all the 'strings' longer than '_string'."""
+    longer_strings = [s for s in strings if len(s) > len(_string)]
+    return all([_string in longer_string for longer_string in longer_strings])


 def replace_digits(string_with_digits: str) -> str:
@@ -64,18 +74,6 @@ def replace_digits(string_with_digits: str) -> str:
     return DIGIT_REGEX.sub('x', string_with_digits)


-def
-    """
-    return
-
-
-def is_substring_of_longer_strings_in_list(_string: str, strings: List[str]) -> bool:
-    longer_strings = [s for s in strings if len(s) > len(_string)]
-    return all([_string in longer_string for longer_string in longer_strings])
-
-
-def has_a_common_substring(strings: List[str]) -> bool:
-    return all([
-        is_substring_of_longer_strings_in_list(s, strings)
-        for s in strings
-    ])
+def root_address(_string: str) -> str:
+    """Strip the bracketed part off an address, e.g. '/Root[1]' => '/Root'."""
+    return _string.split('[')[0]
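A few illustrative calls to the reorganized helpers; the inputs are made-up examples, and the values in the comments follow directly from the function bodies shown above:

```python
from pdfalyzer.helpers.string_helper import (all_strings_are_same_ignoring_numbers, generate_hyphen_line,
                                              has_a_common_substring, root_address)

print(generate_hyphen_line(width=20, title='BEGIN'))                    # '-------BEGIN--------'
print(all_strings_are_same_ignoring_numbers(['/Page[1]', '/Page[2]']))  # True (digits ignored)
print(has_a_common_substring(['/Root', '/Root[1]', '/Root[2]']))        # True
print(root_address('/Root[1]'))                                         # '/Root'
```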
{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/util/argument_parser.py

@@ -1,5 +1,5 @@
 """
-Parse command line arguments for
+Parse command line arguments for `pdfalyze` and construct the `PdfalyzerConfig` object.
 """
 import sys
 from argparse import ArgumentParser, Namespace
@@ -7,23 +7,17 @@ from collections import namedtuple
 from functools import partial, update_wrapper
 from importlib.metadata import version
 from os import getcwd, path
-from pathlib import Path
 from typing import List, Optional

 from rich_argparse_plus import RichHelpFormatterPlus
 from rich.prompt import Confirm
 from rich.text import Text
-from yaralyzer.helpers.file_helper import files_in_dir
 from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args, source
 from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation

-
 from pdfalyzer.config import ALL_STREAMS, PDFALYZER, PdfalyzerConfig
 from pdfalyzer.detection.constants.binary_regexes import QUOTE_PATTERNS
-from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
-                                                 with_pdf_extension)
 from pdfalyzer.helpers.rich_text_helper import print_highlighted
-from pdfalyzer.util.page_range import PageRangeArgumentValidator

 # NamedTuple to keep our argument selection orderly
 OutputSection = namedtuple('OutputSection', ['argument', 'method'])
@@ -206,142 +200,6 @@ def all_sections_chosen(args):
     return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)


-#############################################################
-# Separate arg parsers for combine_pdfs and other scripts #
-#############################################################
-
-MAX_QUALITY = 10
-
-combine_pdfs_parser = ArgumentParser(
-    description="Combine multiple PDFs into one.",
-    epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" +
-           " page numbers prior to merging.",
-    formatter_class=RichHelpFormatterPlus)
-
-combine_pdfs_parser.add_argument('pdfs',
-    help='two or more PDFs to combine',
-    metavar='PDF_PATH',
-    nargs='+')
-
-combine_pdfs_parser.add_argument('-iq', '--image-quality',
-    help='image quality for embedded images (can compress PDF at loss of quality)',
-    choices=range(1, MAX_QUALITY + 1),
-    default=MAX_QUALITY,
-    type=int)
-
-combine_pdfs_parser.add_argument('-o', '--output-file',
-    help='path to write the combined PDFs to',
-    required=True)
-
-
-def parse_combine_pdfs_args() -> Namespace:
-    """Parse command line args for combine_pdfs script."""
-    args = combine_pdfs_parser.parse_args()
-    args.output_file = with_pdf_extension(args.output_file)
-    confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
-    args.number_of_pdfs = len(args.pdfs)
-
-    if args.number_of_pdfs < 2:
-        exit_with_error(f"Need at least 2 PDFs to merge.")
-    elif not do_all_files_exist(args.pdfs):
-        exit_with_error()
-    elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
-        exit_with_error()
-
-    if all(is_pdf(pdf) for pdf in args.pdfs):
-        if all(extract_page_number(pdf) for pdf in args.pdfs):
-            print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
-            args.pdfs.sort(key=lambda pdf: extract_page_number(pdf))
-        else:
-            print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
-    else:
-        print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
-        ask_to_proceed()
-
-    print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
-    return args
-
-
-###########################################
-# Parse args for extract_pdf_pages() #
-###########################################
-page_range_validator = PageRangeArgumentValidator()
-
-extract_pdf_parser = ArgumentParser(
-    formatter_class=RichHelpFormatterPlus,
-    description="Extract pages from one PDF into a new PDF.",
-)
-
-extract_pdf_parser.add_argument('pdf_file', metavar='PDF_FILE', help='PDF to extract pages from')
-
-extract_pdf_parser.add_argument('--page-range', '-r',
-    type=page_range_validator,
-    help=page_range_validator.HELP_MSG,
-    required=True)
-
-extract_pdf_parser.add_argument('--destination-dir', '-d',
-    help="directory to write the new PDF to",
-    default=Path.cwd())
-
-extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
-
-
-def parse_pdf_page_extraction_args() -> Namespace:
-    args = extract_pdf_parser.parse_args()
-
-    if not is_pdf(args.pdf_file):
-        log.error(f"'{args.pdf_file}' is not a PDF.")
-        sys.exit(-1)
-    elif not Path(args.destination_dir).exists():
-        log.error(f"Destination dir '{args.destination_dir}' does not exist.")
-        sys.exit(1)
-
-    return args
-
-
-############################################
-# Parse args for extract_text_from_pdfs() #
-############################################
-extract_text_parser = ArgumentParser(
-    formatter_class=RichHelpFormatterPlus,
-    description="Extract the text from one or more files or directories.",
-    epilog="If any of the FILE_OR_DIRs is a directory all files in that directory will be extracted."
-)
-
-extract_text_parser.add_argument('file_or_dir', nargs='+', metavar='FILE_OR_DIR')
-extract_text_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
-
-extract_text_parser.add_argument('--page-range', '-r',
-    type=page_range_validator,
-    help=f"[PDFs only] {page_range_validator.HELP_MSG}")
-
-extract_text_parser.add_argument('--print-as-parsed', '-p',
-    action='store_true',
-    help='print pages as they are parsed instead of waiting until document is fully parsed')
-
-
-def parse_text_extraction_args() -> Namespace:
-    args = extract_text_parser.parse_args()
-    args.files_to_process = []
-
-    for file_or_dir in args.file_or_dir:
-        file_path = Path(file_or_dir)
-
-        if not file_path.exists():
-            log.error(f"File '{file_path}' doesn't exist!")
-            sys.exit(-1)
-        elif file_path.is_dir():
-            args.files_to_process.extend(files_in_dir(file_path, 'pdf'))
-        else:
-            args.files_to_process.append(file_path)
-
-    if args.page_range and (len(args.files_to_process) > 1 or not is_pdf(args.files_to_process[0])):
-        log.error(f"--page-range can only be specified for a single PDF")
-        sys.exit(-1)
-
-    return args
-
-
 #############
 # Helpers #
 #############
pdfalyzer-1.17.1/pdfalyzer/util/cli_tools_argument_parser.py (new file)

@@ -0,0 +1,156 @@
+"""
+Argument parsers for the command line tools other than `pdfalyze` that are included with The Pdfalyzer.
+
+1. combine_pdfs
+2.
+"""
+import sys
+from argparse import ArgumentParser, Namespace
+from pathlib import Path
+
+from rich_argparse_plus import RichHelpFormatterPlus
+from rich.prompt import Confirm
+from rich.text import Text
+from yaralyzer.helpers.file_helper import files_in_dir
+from yaralyzer.util.logging import log
+
+from pdfalyzer.util.argument_parser import ask_to_proceed, exit_with_error
+from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
+                                                 with_pdf_extension)
+from pdfalyzer.helpers.rich_text_helper import print_highlighted
+from pdfalyzer.util.page_range import PageRangeArgumentValidator
+
+MAX_QUALITY = 10
+
+
+##################
+# combine_pdfs #
+##################
+combine_pdfs_parser = ArgumentParser(
+    description="Combine multiple PDFs into one.",
+    epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" +
+           " page numbers prior to merging.",
+    formatter_class=RichHelpFormatterPlus)
+
+combine_pdfs_parser.add_argument('pdfs',
+    help='two or more PDFs to combine',
+    metavar='PDF_PATH',
+    nargs='+')
+
+combine_pdfs_parser.add_argument('-iq', '--image-quality',
+    help='image quality for embedded images (can compress PDF at loss of quality)',
+    choices=range(1, MAX_QUALITY + 1),
+    default=MAX_QUALITY,
+    type=int)
+
+combine_pdfs_parser.add_argument('-o', '--output-file',
+    help='path to write the combined PDFs to',
+    required=True)
+
+
+def parse_combine_pdfs_args() -> Namespace:
+    """Parse command line args for combine_pdfs script."""
+    args = combine_pdfs_parser.parse_args()
+    args.output_file = with_pdf_extension(args.output_file)
+    confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
+    args.number_of_pdfs = len(args.pdfs)
+
+    if args.number_of_pdfs < 2:
+        exit_with_error(f"Need at least 2 PDFs to merge.")
+    elif not do_all_files_exist(args.pdfs):
+        exit_with_error()
+    elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
+        exit_with_error()
+
+    if all(is_pdf(pdf) for pdf in args.pdfs):
+        if all(extract_page_number(pdf) for pdf in args.pdfs):
+            print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
+            args.pdfs.sort(key=lambda pdf: extract_page_number(pdf))
+        else:
+            print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
+    else:
+        print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
+        ask_to_proceed()
+
+    print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
+    return args
+
+
+#####################
+# extract_pdf_pages #
+#####################
+page_range_validator = PageRangeArgumentValidator()
+
+extract_pdf_parser = ArgumentParser(
+    formatter_class=RichHelpFormatterPlus,
+    description="Extract pages from one PDF into a new PDF.",
+)
+
+extract_pdf_parser.add_argument('pdf_file', metavar='PDF_FILE', help='PDF to extract pages from')
+
+extract_pdf_parser.add_argument('--page-range', '-r',
+    type=page_range_validator,
+    help=page_range_validator.HELP_MSG,
+    required=True)
+
+extract_pdf_parser.add_argument('--destination-dir', '-d',
+    help="directory to write the new PDF to",
+    default=Path.cwd())
+
+extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
+
+
+def parse_pdf_page_extraction_args() -> Namespace:
+    args = extract_pdf_parser.parse_args()
+
+    if not is_pdf(args.pdf_file):
+        log.error(f"'{args.pdf_file}' is not a PDF.")
+        sys.exit(-1)
+    elif not Path(args.destination_dir).exists():
+        log.error(f"Destination dir '{args.destination_dir}' does not exist.")
+        sys.exit(1)
+
+    return args
+
+
+############################
+# extract_text_from_pdfs #
+############################
+extract_text_parser = ArgumentParser(
+    formatter_class=RichHelpFormatterPlus,
+    description="Extract the text from one or more files or directories.",
+    epilog="If any of the FILE_OR_DIRs is a directory all PDF files in that directory will be extracted."
+)
+
+extract_text_parser.add_argument('file_or_dir', nargs='+', metavar='FILE_OR_DIR')
+extract_text_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
+
+extract_text_parser.add_argument('--page-range', '-r',
+    type=page_range_validator,
+    help=f"[PDFs only] {page_range_validator.HELP_MSG}")
+
+extract_text_parser.add_argument('--print-as-parsed', '-p',
+    action='store_true',
+    help='print pages as they are parsed instead of waiting until document is fully parsed')
+
+
+def parse_text_extraction_args() -> Namespace:
+    args = extract_text_parser.parse_args()
+    args.files_to_process = []
+
+    for file_or_dir in args.file_or_dir:
+        file_path = Path(file_or_dir)
+
+        if not file_path.exists():
+            log.error(f"'{file_path}' is not a valid file or directory.")
+            sys.exit(-1)
+        elif file_path.is_dir():
+            args.files_to_process.extend(files_in_dir(file_path, 'pdf'))
+        else:
+            args.files_to_process.append(file_path)
+
+    if args.page_range and (len(args.files_to_process) > 1 or not is_pdf(args.files_to_process[0])):
+        log.error(f"--page-range can only be specified for a single PDF")
+        sys.exit(-1)
+
+    return args
{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/util/page_range.py

@@ -6,7 +6,7 @@ from argparse import ArgumentTypeError
 from dataclasses import dataclass
 from typing import Tuple

-PAGE_RANGE_REGEX = re.compile('
+PAGE_RANGE_REGEX = re.compile(r'[1-9](\d+)?(-\d+)?')


 @dataclass
@@ -15,7 +15,7 @@ class PageRange:

     def __post_init__(self):
         if not PAGE_RANGE_REGEX.match(self.page_range):
-            raise
+            raise ArgumentTypeError(f"Invalid page range '{self.page_range}'")

         if '-' in self.page_range:
             (self.first_page, self.last_page) = (int(p) for p in self.page_range.split('-'))
@@ -35,10 +35,10 @@ class PageRange:
         if self.first_page + 1 == self.last_page:
             return f"page_{self.first_page}"
         else:
-            return f"pages_{self.first_page}-{self.last_page}"
+            return f"pages_{self.first_page}-{self.last_page - 1}"

     def to_tuple(self) -> Tuple[int, int]:
-        return (self.first_page, self.last_page)
+        return (self.first_page - 1, self.last_page - 1)

     def __repr__(self) -> str:
         return f"PageRange({self.first_page}, {self.last_page})"
@@ -48,7 +48,4 @@ class PageRangeArgumentValidator(object):
     HELP_MSG = "a single digit ('11') or a range ('11-15') (WILL NOT extract the last page)"

     def __call__(self, value):
-        if not PAGE_RANGE_REGEX.match(value):
-            raise ArgumentTypeError("Argument has to match '{}'".format(PAGE_RANGE_REGEX.pattern))
-
         return PageRange(value)
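The practical effect of the indexing fix, shown with a made-up range (the values in the comments follow directly from the `__post_init__` and `to_tuple` code above):

```python
from pdfalyzer.util.page_range import PageRange

page_range = PageRange('10-25')
print(page_range.first_page, page_range.last_page)  # 10 25 (as typed on the command line, 1-indexed)

# In 1.17.0 to_tuple() returned (10, 25); as of 1.17.1 it returns the 0-indexed
# equivalent, so pages 10 through 24 of the document are selected.
print(page_range.to_tuple())  # (9, 24)
```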
{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdfalyzer"
-version = "1.17.0"
+version = "1.17.1"
 description = "PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more."
 authors = ["Michel de Cryptadamus <michel@cryptadamus.com>"]
 license = "GPL-3.0-or-later"
@@ -74,6 +74,10 @@ yaralyzer = "^1.0.9"

 [tool.poetry.extras]
 extract = ["PyMuPDF", "pytesseract"]
+# Poetry 2.x handles optional dependencies like this:
+# [project.optional-dependencies]
+# extract = ["PyMuPDF", "pytesseract"]
+

 [tool.poetry.group.dev.dependencies]
 flake8 = "^7.3.0"
@@ -114,8 +118,3 @@ requires = ["poetry-core>=1.0.0"]
 addopts = [
     "--import-mode=importlib",
 ]
-
-
-# Poetry 2.x handles optional dependencies like this:
-# [project.optional-dependencies]
-# extract = ["PyMuPDF", "pytesseract"]
All remaining files listed above are unchanged apart from the pdfalyzer-1.17.0 → pdfalyzer-1.17.1 directory rename.