pdfalyzer 1.17.2__py3-none-any.whl → 1.17.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pdfalyzer might be problematic. Click here for more details.
- CHANGELOG.md +6 -0
- pdfalyzer/decorators/pdf_file.py +16 -20
- pdfalyzer/helpers/image_helper.py +2 -2
- pdfalyzer/util/cli_tools_argument_parser.py +11 -1
- {pdfalyzer-1.17.2.dist-info → pdfalyzer-1.17.4.dist-info}/METADATA +1 -1
- {pdfalyzer-1.17.2.dist-info → pdfalyzer-1.17.4.dist-info}/RECORD +9 -9
- {pdfalyzer-1.17.2.dist-info → pdfalyzer-1.17.4.dist-info}/LICENSE +0 -0
- {pdfalyzer-1.17.2.dist-info → pdfalyzer-1.17.4.dist-info}/WHEEL +0 -0
- {pdfalyzer-1.17.2.dist-info → pdfalyzer-1.17.4.dist-info}/entry_points.txt +0 -0
CHANGELOG.md
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
# NEXT RELEASE
|
|
2
2
|
|
|
3
|
+
### 1.17.4
|
|
4
|
+
* Make `PIL` a lazy import so installs without `[extract]` extras don't fail
|
|
5
|
+
|
|
6
|
+
### 1.17.3
|
|
7
|
+
* Put back `--debug` arg for CLI tools
|
|
8
|
+
|
|
3
9
|
### 1.17.2
|
|
4
10
|
* Remove unused `--debug` args for CLI tools
|
|
5
11
|
* Rename `extract_text_from_pdfs` to `extract_pdf_text`
|
pdfalyzer/decorators/pdf_file.py
CHANGED
|
@@ -20,6 +20,8 @@ from pdfalyzer.helpers.rich_text_helper import (attention_getting_panel, error_t
|
|
|
20
20
|
from pdfalyzer.helpers.string_helper import exception_str
|
|
21
21
|
from pdfalyzer.util.page_range import PageRange
|
|
22
22
|
|
|
23
|
+
DEPENDENCY_ERROR_MSG = "Pdfalyzer is missing an optional dependency required to extract text. " + \
|
|
24
|
+
"Try 'pip install pdfalyzer[extract]'"
|
|
23
25
|
DEFAULT_PDF_ERRORS_DIR = Path.cwd().joinpath('pdf_errors')
|
|
24
26
|
MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR = 1024 * 1024 * 20
|
|
25
27
|
|
|
@@ -54,11 +56,11 @@ class PdfFile:
|
|
|
54
56
|
self.file_size = self.file_path.stat().st_size
|
|
55
57
|
|
|
56
58
|
def extract_page_range(
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
59
|
+
self,
|
|
60
|
+
page_range: PageRange,
|
|
61
|
+
destination_dir: Optional[Path] = None,
|
|
62
|
+
extra_file_suffix: Optional[str] = None
|
|
63
|
+
) -> Path:
|
|
62
64
|
"""
|
|
63
65
|
Extract a range of pages to a new PDF file.
|
|
64
66
|
|
|
@@ -86,7 +88,7 @@ class PdfFile:
|
|
|
86
88
|
|
|
87
89
|
extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
|
|
88
90
|
extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
|
|
89
|
-
console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'
|
|
91
|
+
console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'")
|
|
90
92
|
pdf_writer = PdfWriter()
|
|
91
93
|
|
|
92
94
|
with open(self.file_path, 'rb') as source_pdf:
|
|
@@ -99,11 +101,11 @@ class PdfFile:
|
|
|
99
101
|
return extracted_pages_pdf_path
|
|
100
102
|
|
|
101
103
|
def extract_text(
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
104
|
+
self,
|
|
105
|
+
page_range: Optional[PageRange] = None,
|
|
106
|
+
logger: Optional[Logger] = None,
|
|
107
|
+
print_as_parsed: bool = False
|
|
108
|
+
) -> Optional[str]:
|
|
107
109
|
"""
|
|
108
110
|
Use PyPDF to extract text page by page and use Tesseract to OCR any embedded images.
|
|
109
111
|
|
|
@@ -167,7 +169,7 @@ class PdfFile:
|
|
|
167
169
|
if print_as_parsed:
|
|
168
170
|
print(f"{page_text}")
|
|
169
171
|
except DependencyError:
|
|
170
|
-
log.error(
|
|
172
|
+
log.error(DEPENDENCY_ERROR_MSG)
|
|
171
173
|
except EmptyFileError:
|
|
172
174
|
log.warning("Skipping empty file!")
|
|
173
175
|
except PdfStreamError as e:
|
|
@@ -190,7 +192,8 @@ class PdfFile:
|
|
|
190
192
|
|
|
191
193
|
try:
|
|
192
194
|
extracted_file = self.extract_page_range(PageRange(str(page_number)), destination_dir, error_msg)
|
|
193
|
-
except Exception
|
|
195
|
+
except Exception:
|
|
196
|
+
stderr_console.print_exception()
|
|
194
197
|
stderr_console.print(error_text(f"Failed to extract a page for submission to PyPDF team."))
|
|
195
198
|
extracted_file = None
|
|
196
199
|
|
|
@@ -215,10 +218,3 @@ class PdfFile:
|
|
|
215
218
|
return
|
|
216
219
|
|
|
217
220
|
stderr_console.print(msg, style=style or "")
|
|
218
|
-
|
|
219
|
-
# def _num_pages(self) -> int:
|
|
220
|
-
# pdf_reader = PdfReader(self.file_path)
|
|
221
|
-
# page_count = len(pdf_reader.pages)
|
|
222
|
-
# log.debug(f"PDF Page count: {page_count}")
|
|
223
|
-
|
|
224
|
-
# for page_number, page in enumerate(pdf_reader.pages, start=1):
|
|
pdfalyzer/helpers/image_helper.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
|
|
3
|
-
from PIL import Image
|
|
4
3
|
from yaralyzer.output.rich_console import console
|
|
5
4
|
|
|
6
5
|
from pdfalyzer.helpers.rich_text_helper import warning_text
|
|
@@ -9,11 +8,12 @@ from pdfalyzer.helpers.rich_text_helper import warning_text
|
|
|
9
8
|
def ocr_text(image: Image.Image, image_name: str) -> Optional[str]:
|
|
10
9
|
"""Use pytesseract to OCR the text in the image and return it as a string."""
|
|
11
10
|
import pytesseract
|
|
11
|
+
from PIL import Image
|
|
12
12
|
text = None
|
|
13
13
|
|
|
14
14
|
try:
|
|
15
15
|
text = pytesseract.image_to_string(image)
|
|
16
|
-
except pytesseract.pytesseract.TesseractError
|
|
16
|
+
except pytesseract.pytesseract.TesseractError:
|
|
17
17
|
console.print_exception()
|
|
18
18
|
console.print(warning_text(f"Tesseract OCR failure '{image_name}'! No OCR text extracted..."))
|
|
19
19
|
except OSError as e:
|
|
pdfalyzer/util/cli_tools_argument_parser.py
CHANGED
|
@@ -5,6 +5,7 @@ Argument parsers for the command line tools other than `pdfalyze` that are inclu
|
|
|
5
5
|
2. extract_pdf_pages
|
|
6
6
|
3. extract_pdf_text
|
|
7
7
|
"""
|
|
8
|
+
import logging
|
|
8
9
|
import sys
|
|
9
10
|
from argparse import ArgumentParser, Namespace
|
|
10
11
|
from pathlib import Path
|
|
@@ -88,6 +89,7 @@ extract_pdf_parser = ArgumentParser(
|
|
|
88
89
|
)
|
|
89
90
|
|
|
90
91
|
extract_pdf_parser.add_argument('pdf_file', metavar='PDF_FILE', help='PDF to extract pages from')
|
|
92
|
+
extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
|
|
91
93
|
|
|
92
94
|
extract_pdf_parser.add_argument('--page-range', '-r',
|
|
93
95
|
type=page_range_validator,
|
|
@@ -109,6 +111,7 @@ def parse_pdf_page_extraction_args() -> Namespace:
|
|
|
109
111
|
log.error(f"Destination dir '{args.destination_dir}' does not exist.")
|
|
110
112
|
sys.exit(1)
|
|
111
113
|
|
|
114
|
+
_set_log_level(args)
|
|
112
115
|
return args
|
|
113
116
|
|
|
114
117
|
|
|
@@ -122,6 +125,7 @@ extract_text_parser = ArgumentParser(
|
|
|
122
125
|
)
|
|
123
126
|
|
|
124
127
|
extract_text_parser.add_argument('file_or_dir', nargs='+', metavar='FILE_OR_DIR')
|
|
128
|
+
extract_text_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
|
|
125
129
|
|
|
126
130
|
extract_text_parser.add_argument('--page-range', '-r',
|
|
127
131
|
type=page_range_validator,
|
|
@@ -129,7 +133,7 @@ extract_text_parser.add_argument('--page-range', '-r',
|
|
|
129
133
|
|
|
130
134
|
extract_text_parser.add_argument('--print-as-parsed', '-p',
|
|
131
135
|
action='store_true',
|
|
132
|
-
help='print pages as they are parsed instead of waiting until
|
|
136
|
+
help='print pages as they are parsed instead of waiting until parsing complete')
|
|
133
137
|
|
|
134
138
|
|
|
135
139
|
def parse_text_extraction_args() -> Namespace:
|
|
@@ -151,4 +155,10 @@ def parse_text_extraction_args() -> Namespace:
|
|
|
151
155
|
log.error(f"--page-range can only be specified for a single PDF")
|
|
152
156
|
sys.exit(-1)
|
|
153
157
|
|
|
158
|
+
_set_log_level(args)
|
|
154
159
|
return args
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _set_log_level(args: Namespace):
|
|
163
|
+
if args.debug:
|
|
164
|
+
log.setLevel(logging.DEBUG)
|
|
{pdfalyzer-1.17.2.dist-info → pdfalyzer-1.17.4.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pdfalyzer
|
|
3
|
-
Version: 1.17.2
|
|
3
|
+
Version: 1.17.4
|
|
4
4
|
Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
|
|
5
5
|
Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
6
6
|
License: GPL-3.0-or-later
|
|
{pdfalyzer-1.17.2.dist-info → pdfalyzer-1.17.4.dist-info}/RECORD
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
.pdfalyzer.example,sha256=sh_qkUBw4hfJia_Dx2wB-fsqJInhx2sSgA7WJz3MHYo,3917
|
|
2
|
-
CHANGELOG.md,sha256=
|
|
2
|
+
CHANGELOG.md,sha256=dyXJVhpeNYDdeh8Ugfl7co6v86ksu_AtNOYKEm2U5TI,13390
|
|
3
3
|
LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
4
4
|
pdfalyzer/__init__.py,sha256=2Gikt_-OSXZqeQij4wSwb65g7jycVAupjeFmXBf51lo,6159
|
|
5
5
|
pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
|
|
@@ -7,7 +7,7 @@ pdfalyzer/binary/binary_scanner.py,sha256=gEHHdQ3lKe1P1441fk2OLO8GxJF7FxF8Tq04El
|
|
|
7
7
|
pdfalyzer/config.py,sha256=RBtp3Q6IvOW922rcanB4mVceJEM0BEjNybc6_vC7efY,2122
|
|
8
8
|
pdfalyzer/decorators/document_model_printer.py,sha256=2tjJItZltukmOD2wjGvl2IsBM7Gug59wMHGLpzGDbV0,2682
|
|
9
9
|
pdfalyzer/decorators/indeterminate_node.py,sha256=QLJr-nGKih8gPZcIqxLU028OwWWD5VjNHYMUjniwT_k,6586
|
|
10
|
-
pdfalyzer/decorators/pdf_file.py,sha256=
|
|
10
|
+
pdfalyzer/decorators/pdf_file.py,sha256=ryAYzzsO8Fw5_ZMoomruW0Bal8pTb5C0VlLOTjdVqNI,10552
|
|
11
11
|
pdfalyzer/decorators/pdf_object_properties.py,sha256=Il3RObxQ4XUf0Ei-nd4tjJO0LeaxC6u7yFa3cQs_jVY,5485
|
|
12
12
|
pdfalyzer/decorators/pdf_tree_node.py,sha256=4LReGJUtG8iEcLUQD1jW-yp3xPWsHrC-3Anbkt7XZ3A,11134
|
|
13
13
|
pdfalyzer/decorators/pdf_tree_verifier.py,sha256=2hVe9APsAWQZ7ra8AGndHQnGWmmxmb3ZwfJHZuLvsvc,4714
|
|
@@ -18,7 +18,7 @@ pdfalyzer/detection/yaralyzer_helper.py,sha256=bfIa18f0zdUoOAJIQcwVDjF52sJNV47Nd
|
|
|
18
18
|
pdfalyzer/font_info.py,sha256=2R85iETY_1eKCeRrkqeIxfPDqXZyWfCNcHx_-aTyF0s,6682
|
|
19
19
|
pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
|
|
20
20
|
pdfalyzer/helpers/filesystem_helper.py,sha256=QCdUcZlufBWaV38LlrQEGUGOGUtZEozMVSRkoTjwKlU,5046
|
|
21
|
-
pdfalyzer/helpers/image_helper.py,sha256=
|
|
21
|
+
pdfalyzer/helpers/image_helper.py,sha256=E3Mby-KG-1eIYThuYqXEkwG1mnhY0imvrpiO8N8otfQ,1119
|
|
22
22
|
pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
|
|
23
23
|
pdfalyzer/helpers/pdf_object_helper.py,sha256=65BlUgnDM9brxJFw_WF8QLomWHaNh-pj88NWoxcMkoQ,1160
|
|
24
24
|
pdfalyzer/helpers/rich_text_helper.py,sha256=Q5Zj0I96ymQmDWHkOX4lWEvkizOMMgzYNx4CF35t_7w,3561
|
|
@@ -36,7 +36,7 @@ pdfalyzer/pdf_object_relationship.py,sha256=tOJTp73m82oNZJB7NxvTLv167kqthH8PRbVn
|
|
|
36
36
|
pdfalyzer/pdfalyzer.py,sha256=T5U8MZRlL0Kn-GJVqfIIoL7eBUdQeteu50VhQ-IoYh0,11214
|
|
37
37
|
pdfalyzer/util/adobe_strings.py,sha256=eF4K1RhhR_qgMBT58MzCxWkqQj17OWLCNmG3SjZ9BUs,5045
|
|
38
38
|
pdfalyzer/util/argument_parser.py,sha256=9ixQMEZz00IPK8xRxuS7DMrQgXIrqhB2ve5W8XTZ1S8,9732
|
|
39
|
-
pdfalyzer/util/cli_tools_argument_parser.py,sha256=
|
|
39
|
+
pdfalyzer/util/cli_tools_argument_parser.py,sha256=HyZhztyrPtbvOswmG975M0tK5KPon37lV3fxVA0OwYo,6277
|
|
40
40
|
pdfalyzer/util/debugging.py,sha256=hjYGxptJmal9TTaAUkkoo0oNu2tdx6ZYSyC0WjvzHh0,156
|
|
41
41
|
pdfalyzer/util/exceptions.py,sha256=XLFFTdx1n6i_VCmvuzvIOCa-djJvGEitfo9lhy3zq0k,98
|
|
42
42
|
pdfalyzer/util/page_range.py,sha256=NMNh3_TojxTxBIpvUYK1AmvID_m8qOP6AihZrLWZF2I,1652
|
|
@@ -47,8 +47,8 @@ pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
|
|
|
47
47
|
pdfalyzer/yara_rules/didier_stevens.yara,sha256=4XhqafU09xzYUP7LCygHHBXOpAXUblJf6Tkn37MUy0w,7253
|
|
48
48
|
pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
|
|
49
49
|
pdfalyzer/yara_rules/pdf_malware.yara,sha256=jDqSTP5BQSi2I_1xZiFZdy68I4oVWDat2j08-qdfbto,91063
|
|
50
|
-
pdfalyzer-1.17.2.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
51
|
-
pdfalyzer-1.17.
|
|
52
|
-
pdfalyzer-1.17.2.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
53
|
-
pdfalyzer-1.17.2.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
|
|
54
|
-
pdfalyzer-1.17.2.dist-info/RECORD,,
|
|
50
|
+
pdfalyzer-1.17.4.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
51
|
+
pdfalyzer-1.17.4.dist-info/METADATA,sha256=plr6KKGy51GfRWhsqIku4u4nkMoHwM5xMLmV9Lm38ak,27294
|
|
52
|
+
pdfalyzer-1.17.4.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
53
|
+
pdfalyzer-1.17.4.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
|
|
54
|
+
pdfalyzer-1.17.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|