pdfalyzer 1.17.1__py3-none-any.whl → 1.17.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pdfalyzer might be problematic. Click here for more details.
- CHANGELOG.md +8 -1
- pdfalyzer/__init__.py +3 -3
- pdfalyzer/decorators/pdf_file.py +9 -5
- pdfalyzer/helpers/filesystem_helper.py +0 -1
- pdfalyzer/util/argument_parser.py +1 -1
- pdfalyzer/util/cli_tools_argument_parser.py +14 -6
- {pdfalyzer-1.17.1.dist-info → pdfalyzer-1.17.3.dist-info}/METADATA +4 -4
- {pdfalyzer-1.17.1.dist-info → pdfalyzer-1.17.3.dist-info}/RECORD +11 -11
- {pdfalyzer-1.17.1.dist-info → pdfalyzer-1.17.3.dist-info}/entry_points.txt +1 -1
- {pdfalyzer-1.17.1.dist-info → pdfalyzer-1.17.3.dist-info}/LICENSE +0 -0
- {pdfalyzer-1.17.1.dist-info → pdfalyzer-1.17.3.dist-info}/WHEEL +0 -0
CHANGELOG.md
CHANGED
|
@@ -1,7 +1,14 @@
|
|
|
1
1
|
# NEXT RELEASE
|
|
2
2
|
|
|
3
|
+
### 1.17.3
|
|
4
|
+
* Put back `--debug` arg for CLI tools
|
|
5
|
+
|
|
6
|
+
### 1.17.2
|
|
7
|
+
* Remove unused `--debug` args for CLI tools
|
|
8
|
+
* Rename `extract_text_from_pdfs` to `extract_pdf_text`
|
|
9
|
+
|
|
3
10
|
### 1.17.1
|
|
4
|
-
* Fix issue where `
|
|
11
|
+
* Fix issue where `extract_pdf_pages` page ranges were indexed from 0 instead of 1
|
|
5
12
|
|
|
6
13
|
# 1.17.0
|
|
7
14
|
* Add `extract_pdf_pages` command line tool (imported from `clown_sort`)
|
pdfalyzer/__init__.py
CHANGED
|
@@ -32,8 +32,8 @@ from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
|
|
|
32
32
|
from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
|
|
33
33
|
from pdfalyzer.pdfalyzer import Pdfalyzer
|
|
34
34
|
from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments
|
|
35
|
-
from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY,
|
|
36
|
-
|
|
35
|
+
from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, parse_combine_pdfs_args,
|
|
36
|
+
parse_pdf_page_extraction_args, parse_text_extraction_args)
|
|
37
37
|
from pdfalyzer.util.pdf_parser_manager import PdfParserManager
|
|
38
38
|
|
|
39
39
|
# For the table shown by running pdfalyzer_show_color_theme
|
|
@@ -141,7 +141,7 @@ def extract_pdf_pages() -> None:
|
|
|
141
141
|
PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)
|
|
142
142
|
|
|
143
143
|
|
|
144
|
-
def
|
|
144
|
+
def extract_pdf_text() -> None:
|
|
145
145
|
"""Extract text from a list of files or from all PDF files in a list of directories."""
|
|
146
146
|
args: Namespace = parse_text_extraction_args()
|
|
147
147
|
console.line()
|
pdfalyzer/decorators/pdf_file.py
CHANGED
|
@@ -30,9 +30,9 @@ class PdfFile:
|
|
|
30
30
|
|
|
31
31
|
Attributes:
|
|
32
32
|
file_path (Path): The path to the PDF file.
|
|
33
|
-
dirname (Path): The directory containing the PDF file.
|
|
34
33
|
basename (str): The base name of the PDF file (with extension).
|
|
35
34
|
basename_without_ext (str): The base name of the PDF file (without extension).
|
|
35
|
+
dirname (Path): The directory containing the PDF file.
|
|
36
36
|
extname (str): The file extension of the PDF file.
|
|
37
37
|
file_size (int): The size of the file in bytes.
|
|
38
38
|
"""
|
|
@@ -74,11 +74,15 @@ class PdfFile:
|
|
|
74
74
|
"""
|
|
75
75
|
destination_dir = Path(destination_dir or self.dirname)
|
|
76
76
|
create_dir_if_it_does_not_exist(destination_dir)
|
|
77
|
+
pdf_reader = PdfReader(self.file_path)
|
|
78
|
+
page_count = len(pdf_reader.pages)
|
|
79
|
+
file_suffix = page_range.file_suffix()
|
|
80
|
+
|
|
81
|
+
if page_count < (page_range.last_page - 1):
|
|
82
|
+
raise ValueError(f"PDF only has {page_count} pages but you asked for pages {page_range}!")
|
|
77
83
|
|
|
78
|
-
if extra_file_suffix is None:
|
|
79
|
-
file_suffix
|
|
80
|
-
else:
|
|
81
|
-
file_suffix = f"{page_range.file_suffix()}__{extra_file_suffix}"
|
|
84
|
+
if extra_file_suffix is not None:
|
|
85
|
+
file_suffix += f"__{extra_file_suffix}"
|
|
82
86
|
|
|
83
87
|
extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
|
|
84
88
|
extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
|
|
@@ -196,7 +196,7 @@ def output_sections(args: Namespace, pdfalyzer: 'Pdfalyzer') -> List[OutputSecti
|
|
|
196
196
|
|
|
197
197
|
|
|
198
198
|
def all_sections_chosen(args):
|
|
199
|
-
"""Returns
|
|
199
|
+
"""Returns True if all flags are set or no flags are set."""
|
|
200
200
|
return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)
|
|
201
201
|
|
|
202
202
|
|
|
@@ -2,8 +2,10 @@
|
|
|
2
2
|
Argument parsers for the command line tools other than `pdfalyze` that are included with The Pdfalyzer.
|
|
3
3
|
|
|
4
4
|
1. combine_pdfs
|
|
5
|
-
2.
|
|
5
|
+
2. extract_pdf_pages
|
|
6
|
+
3. extract_pdf_text
|
|
6
7
|
"""
|
|
8
|
+
import logging
|
|
7
9
|
import sys
|
|
8
10
|
from argparse import ArgumentParser, Namespace
|
|
9
11
|
from pathlib import Path
|
|
@@ -87,6 +89,7 @@ extract_pdf_parser = ArgumentParser(
|
|
|
87
89
|
)
|
|
88
90
|
|
|
89
91
|
extract_pdf_parser.add_argument('pdf_file', metavar='PDF_FILE', help='PDF to extract pages from')
|
|
92
|
+
extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
|
|
90
93
|
|
|
91
94
|
extract_pdf_parser.add_argument('--page-range', '-r',
|
|
92
95
|
type=page_range_validator,
|
|
@@ -97,8 +100,6 @@ extract_pdf_parser.add_argument('--destination-dir', '-d',
|
|
|
97
100
|
help="directory to write the new PDF to",
|
|
98
101
|
default=Path.cwd())
|
|
99
102
|
|
|
100
|
-
extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
|
|
101
|
-
|
|
102
103
|
|
|
103
104
|
def parse_pdf_page_extraction_args() -> Namespace:
|
|
104
105
|
args = extract_pdf_parser.parse_args()
|
|
@@ -110,12 +111,13 @@ def parse_pdf_page_extraction_args() -> Namespace:
|
|
|
110
111
|
log.error(f"Destination dir '{args.destination_dir}' does not exist.")
|
|
111
112
|
sys.exit(1)
|
|
112
113
|
|
|
114
|
+
_set_log_level(args)
|
|
113
115
|
return args
|
|
114
116
|
|
|
115
117
|
|
|
116
|
-
|
|
117
|
-
#
|
|
118
|
-
|
|
118
|
+
######################
|
|
119
|
+
# extract_pdf_text #
|
|
120
|
+
######################
|
|
119
121
|
extract_text_parser = ArgumentParser(
|
|
120
122
|
formatter_class=RichHelpFormatterPlus,
|
|
121
123
|
description="Extract the text from one or more files or directories.",
|
|
@@ -153,4 +155,10 @@ def parse_text_extraction_args() -> Namespace:
|
|
|
153
155
|
log.error(f"--page-range can only be specified for a single PDF")
|
|
154
156
|
sys.exit(-1)
|
|
155
157
|
|
|
158
|
+
_set_log_level(args)
|
|
156
159
|
return args
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _set_log_level(args: Namespace):
|
|
163
|
+
if args.debug:
|
|
164
|
+
log.setLevel(logging.DEBUG)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pdfalyzer
|
|
3
|
-
Version: 1.17.
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 1.17.3
|
|
4
|
+
Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
|
|
5
5
|
Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
6
6
|
License: GPL-3.0-or-later
|
|
7
7
|
Keywords: ascii art,binary,color,cybersecurity,DFIR,encoding,font,infosec,maldoc,malicious pdf,malware,malware analysis,pdf,pdfs,pdf analysis,pypdf,threat assessment,threat hunting,threat intelligence,threat research,threatintel,visualization,yara
|
|
@@ -252,9 +252,9 @@ The Pdfalyzer comes with a few command line tools:
|
|
|
252
252
|
|
|
253
253
|
* `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
|
|
254
254
|
* `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
|
|
255
|
-
* `
|
|
255
|
+
* `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
|
|
256
256
|
|
|
257
|
-
Running `
|
|
257
|
+
Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
|
|
258
258
|
|
|
259
259
|
```bash
|
|
260
260
|
pipx install pdfalyzer[extract]
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
.pdfalyzer.example,sha256=sh_qkUBw4hfJia_Dx2wB-fsqJInhx2sSgA7WJz3MHYo,3917
|
|
2
|
-
CHANGELOG.md,sha256=
|
|
2
|
+
CHANGELOG.md,sha256=ZFP4uDoiYT-kNa7XJuyNKhIjcvY5DU4CeMSGn0braPU,13301
|
|
3
3
|
LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
4
|
-
pdfalyzer/__init__.py,sha256=
|
|
4
|
+
pdfalyzer/__init__.py,sha256=2Gikt_-OSXZqeQij4wSwb65g7jycVAupjeFmXBf51lo,6159
|
|
5
5
|
pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
|
|
6
6
|
pdfalyzer/binary/binary_scanner.py,sha256=gEHHdQ3lKe1P1441fk2OLO8GxJF7FxF8Tq04ElB8ilU,10748
|
|
7
7
|
pdfalyzer/config.py,sha256=RBtp3Q6IvOW922rcanB4mVceJEM0BEjNybc6_vC7efY,2122
|
|
8
8
|
pdfalyzer/decorators/document_model_printer.py,sha256=2tjJItZltukmOD2wjGvl2IsBM7Gug59wMHGLpzGDbV0,2682
|
|
9
9
|
pdfalyzer/decorators/indeterminate_node.py,sha256=QLJr-nGKih8gPZcIqxLU028OwWWD5VjNHYMUjniwT_k,6586
|
|
10
|
-
pdfalyzer/decorators/pdf_file.py,sha256=
|
|
10
|
+
pdfalyzer/decorators/pdf_file.py,sha256=Az3TL_Ttj_pDOHoHsiwpNlrCckCgKTp0VuGevJIi_5c,10481
|
|
11
11
|
pdfalyzer/decorators/pdf_object_properties.py,sha256=Il3RObxQ4XUf0Ei-nd4tjJO0LeaxC6u7yFa3cQs_jVY,5485
|
|
12
12
|
pdfalyzer/decorators/pdf_tree_node.py,sha256=4LReGJUtG8iEcLUQD1jW-yp3xPWsHrC-3Anbkt7XZ3A,11134
|
|
13
13
|
pdfalyzer/decorators/pdf_tree_verifier.py,sha256=2hVe9APsAWQZ7ra8AGndHQnGWmmxmb3ZwfJHZuLvsvc,4714
|
|
@@ -17,7 +17,7 @@ pdfalyzer/detection/javascript_hunter.py,sha256=_wT2vkKTMlm_RGCjYsmwcmV-ag1qep3E
|
|
|
17
17
|
pdfalyzer/detection/yaralyzer_helper.py,sha256=bfIa18f0zdUoOAJIQcwVDjF52sJNV47NdsDasg01uiQ,2194
|
|
18
18
|
pdfalyzer/font_info.py,sha256=2R85iETY_1eKCeRrkqeIxfPDqXZyWfCNcHx_-aTyF0s,6682
|
|
19
19
|
pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
|
|
20
|
-
pdfalyzer/helpers/filesystem_helper.py,sha256=
|
|
20
|
+
pdfalyzer/helpers/filesystem_helper.py,sha256=QCdUcZlufBWaV38LlrQEGUGOGUtZEozMVSRkoTjwKlU,5046
|
|
21
21
|
pdfalyzer/helpers/image_helper.py,sha256=QjoAUcKKEtpmuEyOmEfmaUN-lzNykQ1SzqgNn9-R4Y0,1120
|
|
22
22
|
pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
|
|
23
23
|
pdfalyzer/helpers/pdf_object_helper.py,sha256=65BlUgnDM9brxJFw_WF8QLomWHaNh-pj88NWoxcMkoQ,1160
|
|
@@ -35,8 +35,8 @@ pdfalyzer/output/tables/stream_objects_table.py,sha256=PgQj8oTtW5_X8SMQb3FvCWDS-
|
|
|
35
35
|
pdfalyzer/pdf_object_relationship.py,sha256=tOJTp73m82oNZJB7NxvTLv167kqthH8PRbVnev_4uEk,5291
|
|
36
36
|
pdfalyzer/pdfalyzer.py,sha256=T5U8MZRlL0Kn-GJVqfIIoL7eBUdQeteu50VhQ-IoYh0,11214
|
|
37
37
|
pdfalyzer/util/adobe_strings.py,sha256=eF4K1RhhR_qgMBT58MzCxWkqQj17OWLCNmG3SjZ9BUs,5045
|
|
38
|
-
pdfalyzer/util/argument_parser.py,sha256=
|
|
39
|
-
pdfalyzer/util/cli_tools_argument_parser.py,sha256=
|
|
38
|
+
pdfalyzer/util/argument_parser.py,sha256=9ixQMEZz00IPK8xRxuS7DMrQgXIrqhB2ve5W8XTZ1S8,9732
|
|
39
|
+
pdfalyzer/util/cli_tools_argument_parser.py,sha256=RqK_5AWC7qm9Zy7pvDb-J1WSEGBkIyxzNDcFJwSmuX4,6285
|
|
40
40
|
pdfalyzer/util/debugging.py,sha256=hjYGxptJmal9TTaAUkkoo0oNu2tdx6ZYSyC0WjvzHh0,156
|
|
41
41
|
pdfalyzer/util/exceptions.py,sha256=XLFFTdx1n6i_VCmvuzvIOCa-djJvGEitfo9lhy3zq0k,98
|
|
42
42
|
pdfalyzer/util/page_range.py,sha256=NMNh3_TojxTxBIpvUYK1AmvID_m8qOP6AihZrLWZF2I,1652
|
|
@@ -47,8 +47,8 @@ pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
|
|
|
47
47
|
pdfalyzer/yara_rules/didier_stevens.yara,sha256=4XhqafU09xzYUP7LCygHHBXOpAXUblJf6Tkn37MUy0w,7253
|
|
48
48
|
pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
|
|
49
49
|
pdfalyzer/yara_rules/pdf_malware.yara,sha256=jDqSTP5BQSi2I_1xZiFZdy68I4oVWDat2j08-qdfbto,91063
|
|
50
|
-
pdfalyzer-1.17.
|
|
51
|
-
pdfalyzer-1.17.
|
|
52
|
-
pdfalyzer-1.17.
|
|
53
|
-
pdfalyzer-1.17.
|
|
54
|
-
pdfalyzer-1.17.
|
|
50
|
+
pdfalyzer-1.17.3.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
51
|
+
pdfalyzer-1.17.3.dist-info/METADATA,sha256=MczhorkJI7ozznrHf72k7a0QELDinDNHhex4ur8kSr8,27294
|
|
52
|
+
pdfalyzer-1.17.3.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
53
|
+
pdfalyzer-1.17.3.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
|
|
54
|
+
pdfalyzer-1.17.3.dist-info/RECORD,,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[console_scripts]
|
|
2
2
|
combine_pdfs=pdfalyzer:combine_pdfs
|
|
3
3
|
extract_pdf_pages=pdfalyzer:extract_pdf_pages
|
|
4
|
-
|
|
4
|
+
extract_pdf_text=pdfalyzer:extract_pdf_text
|
|
5
5
|
pdfalyze=pdfalyzer:pdfalyze
|
|
6
6
|
pdfalyzer_show_color_theme=pdfalyzer:pdfalyzer_show_color_theme
|
|
7
7
|
|
|
File without changes
|
|
File without changes
|