pdfalyzer 1.17.3__py3-none-any.whl → 1.17.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pdfalyzer might be problematic. Click here for more details.
- CHANGELOG.md +6 -0
- pdfalyzer/decorators/pdf_file.py +16 -13
- pdfalyzer/helpers/image_helper.py +3 -3
- pdfalyzer/util/cli_tools_argument_parser.py +1 -1
- {pdfalyzer-1.17.3.dist-info → pdfalyzer-1.17.5.dist-info}/METADATA +1 -1
- {pdfalyzer-1.17.3.dist-info → pdfalyzer-1.17.5.dist-info}/RECORD +9 -9
- {pdfalyzer-1.17.3.dist-info → pdfalyzer-1.17.5.dist-info}/LICENSE +0 -0
- {pdfalyzer-1.17.3.dist-info → pdfalyzer-1.17.5.dist-info}/WHEEL +0 -0
- {pdfalyzer-1.17.3.dist-info → pdfalyzer-1.17.5.dist-info}/entry_points.txt +0 -0
CHANGELOG.md
CHANGED
pdfalyzer/decorators/pdf_file.py
CHANGED
|
@@ -20,6 +20,8 @@ from pdfalyzer.helpers.rich_text_helper import (attention_getting_panel, error_t
|
|
|
20
20
|
from pdfalyzer.helpers.string_helper import exception_str
|
|
21
21
|
from pdfalyzer.util.page_range import PageRange
|
|
22
22
|
|
|
23
|
+
DEPENDENCY_ERROR_MSG = "Pdfalyzer is missing an optional dependency required to extract text. " + \
|
|
24
|
+
"Try 'pip install pdfalyzer[extract]'"
|
|
23
25
|
DEFAULT_PDF_ERRORS_DIR = Path.cwd().joinpath('pdf_errors')
|
|
24
26
|
MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR = 1024 * 1024 * 20
|
|
25
27
|
|
|
@@ -54,11 +56,11 @@ class PdfFile:
|
|
|
54
56
|
self.file_size = self.file_path.stat().st_size
|
|
55
57
|
|
|
56
58
|
def extract_page_range(
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
59
|
+
self,
|
|
60
|
+
page_range: PageRange,
|
|
61
|
+
destination_dir: Optional[Path] = None,
|
|
62
|
+
extra_file_suffix: Optional[str] = None
|
|
63
|
+
) -> Path:
|
|
62
64
|
"""
|
|
63
65
|
Extract a range of pages to a new PDF file.
|
|
64
66
|
|
|
@@ -86,7 +88,7 @@ class PdfFile:
|
|
|
86
88
|
|
|
87
89
|
extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
|
|
88
90
|
extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
|
|
89
|
-
console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'
|
|
91
|
+
console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'")
|
|
90
92
|
pdf_writer = PdfWriter()
|
|
91
93
|
|
|
92
94
|
with open(self.file_path, 'rb') as source_pdf:
|
|
@@ -99,11 +101,11 @@ class PdfFile:
|
|
|
99
101
|
return extracted_pages_pdf_path
|
|
100
102
|
|
|
101
103
|
def extract_text(
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
104
|
+
self,
|
|
105
|
+
page_range: Optional[PageRange] = None,
|
|
106
|
+
logger: Optional[Logger] = None,
|
|
107
|
+
print_as_parsed: bool = False
|
|
108
|
+
) -> Optional[str]:
|
|
107
109
|
"""
|
|
108
110
|
Use PyPDF to extract text page by page and use Tesseract to OCR any embedded images.
|
|
109
111
|
|
|
@@ -167,7 +169,7 @@ class PdfFile:
|
|
|
167
169
|
if print_as_parsed:
|
|
168
170
|
print(f"{page_text}")
|
|
169
171
|
except DependencyError:
|
|
170
|
-
log.error(
|
|
172
|
+
log.error(DEPENDENCY_ERROR_MSG)
|
|
171
173
|
except EmptyFileError:
|
|
172
174
|
log.warning("Skipping empty file!")
|
|
173
175
|
except PdfStreamError as e:
|
|
@@ -190,7 +192,8 @@ class PdfFile:
|
|
|
190
192
|
|
|
191
193
|
try:
|
|
192
194
|
extracted_file = self.extract_page_range(PageRange(str(page_number)), destination_dir, error_msg)
|
|
193
|
-
except Exception
|
|
195
|
+
except Exception:
|
|
196
|
+
stderr_console.print_exception()
|
|
194
197
|
stderr_console.print(error_text(f"Failed to extract a page for submission to PyPDF team."))
|
|
195
198
|
extracted_file = None
|
|
196
199
|
|
|
@@ -1,19 +1,19 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
|
|
3
|
-
from PIL import Image
|
|
4
3
|
from yaralyzer.output.rich_console import console
|
|
5
4
|
|
|
6
5
|
from pdfalyzer.helpers.rich_text_helper import warning_text
|
|
7
6
|
|
|
8
7
|
|
|
9
|
-
def ocr_text(image: Image.Image, image_name: str) -> Optional[str]:
|
|
8
|
+
def ocr_text(image: "Image.Image", image_name: str) -> Optional[str]:
|
|
10
9
|
"""Use pytesseract to OCR the text in the image and return it as a string."""
|
|
11
10
|
import pytesseract
|
|
11
|
+
from PIL import Image
|
|
12
12
|
text = None
|
|
13
13
|
|
|
14
14
|
try:
|
|
15
15
|
text = pytesseract.image_to_string(image)
|
|
16
|
-
except pytesseract.pytesseract.TesseractError
|
|
16
|
+
except pytesseract.pytesseract.TesseractError:
|
|
17
17
|
console.print_exception()
|
|
18
18
|
console.print(warning_text(f"Tesseract OCR failure '{image_name}'! No OCR text extracted..."))
|
|
19
19
|
except OSError as e:
|
|
@@ -133,7 +133,7 @@ extract_text_parser.add_argument('--page-range', '-r',
|
|
|
133
133
|
|
|
134
134
|
extract_text_parser.add_argument('--print-as-parsed', '-p',
|
|
135
135
|
action='store_true',
|
|
136
|
-
help='print pages as they are parsed instead of waiting until
|
|
136
|
+
help='print pages as they are parsed instead of waiting until parsing complete')
|
|
137
137
|
|
|
138
138
|
|
|
139
139
|
def parse_text_extraction_args() -> Namespace:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pdfalyzer
|
|
3
|
-
Version: 1.17.3
|
|
3
|
+
Version: 1.17.5
|
|
4
4
|
Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
|
|
5
5
|
Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
6
6
|
License: GPL-3.0-or-later
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
.pdfalyzer.example,sha256=sh_qkUBw4hfJia_Dx2wB-fsqJInhx2sSgA7WJz3MHYo,3917
|
|
2
|
-
CHANGELOG.md,sha256=
|
|
2
|
+
CHANGELOG.md,sha256=MJr6WBq7vvZqiuZiTqAZNTy296hZNWGIh4RlImutmx8,13426
|
|
3
3
|
LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
4
4
|
pdfalyzer/__init__.py,sha256=2Gikt_-OSXZqeQij4wSwb65g7jycVAupjeFmXBf51lo,6159
|
|
5
5
|
pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
|
|
@@ -7,7 +7,7 @@ pdfalyzer/binary/binary_scanner.py,sha256=gEHHdQ3lKe1P1441fk2OLO8GxJF7FxF8Tq04El
|
|
|
7
7
|
pdfalyzer/config.py,sha256=RBtp3Q6IvOW922rcanB4mVceJEM0BEjNybc6_vC7efY,2122
|
|
8
8
|
pdfalyzer/decorators/document_model_printer.py,sha256=2tjJItZltukmOD2wjGvl2IsBM7Gug59wMHGLpzGDbV0,2682
|
|
9
9
|
pdfalyzer/decorators/indeterminate_node.py,sha256=QLJr-nGKih8gPZcIqxLU028OwWWD5VjNHYMUjniwT_k,6586
|
|
10
|
-
pdfalyzer/decorators/pdf_file.py,sha256=
|
|
10
|
+
pdfalyzer/decorators/pdf_file.py,sha256=ryAYzzsO8Fw5_ZMoomruW0Bal8pTb5C0VlLOTjdVqNI,10552
|
|
11
11
|
pdfalyzer/decorators/pdf_object_properties.py,sha256=Il3RObxQ4XUf0Ei-nd4tjJO0LeaxC6u7yFa3cQs_jVY,5485
|
|
12
12
|
pdfalyzer/decorators/pdf_tree_node.py,sha256=4LReGJUtG8iEcLUQD1jW-yp3xPWsHrC-3Anbkt7XZ3A,11134
|
|
13
13
|
pdfalyzer/decorators/pdf_tree_verifier.py,sha256=2hVe9APsAWQZ7ra8AGndHQnGWmmxmb3ZwfJHZuLvsvc,4714
|
|
@@ -18,7 +18,7 @@ pdfalyzer/detection/yaralyzer_helper.py,sha256=bfIa18f0zdUoOAJIQcwVDjF52sJNV47Nd
|
|
|
18
18
|
pdfalyzer/font_info.py,sha256=2R85iETY_1eKCeRrkqeIxfPDqXZyWfCNcHx_-aTyF0s,6682
|
|
19
19
|
pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
|
|
20
20
|
pdfalyzer/helpers/filesystem_helper.py,sha256=QCdUcZlufBWaV38LlrQEGUGOGUtZEozMVSRkoTjwKlU,5046
|
|
21
|
-
pdfalyzer/helpers/image_helper.py,sha256=
|
|
21
|
+
pdfalyzer/helpers/image_helper.py,sha256=89tJjIDSB_BdHjKE3rLPXWFFAAhKsnpVOckKq6_M4Lc,1121
|
|
22
22
|
pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
|
|
23
23
|
pdfalyzer/helpers/pdf_object_helper.py,sha256=65BlUgnDM9brxJFw_WF8QLomWHaNh-pj88NWoxcMkoQ,1160
|
|
24
24
|
pdfalyzer/helpers/rich_text_helper.py,sha256=Q5Zj0I96ymQmDWHkOX4lWEvkizOMMgzYNx4CF35t_7w,3561
|
|
@@ -36,7 +36,7 @@ pdfalyzer/pdf_object_relationship.py,sha256=tOJTp73m82oNZJB7NxvTLv167kqthH8PRbVn
|
|
|
36
36
|
pdfalyzer/pdfalyzer.py,sha256=T5U8MZRlL0Kn-GJVqfIIoL7eBUdQeteu50VhQ-IoYh0,11214
|
|
37
37
|
pdfalyzer/util/adobe_strings.py,sha256=eF4K1RhhR_qgMBT58MzCxWkqQj17OWLCNmG3SjZ9BUs,5045
|
|
38
38
|
pdfalyzer/util/argument_parser.py,sha256=9ixQMEZz00IPK8xRxuS7DMrQgXIrqhB2ve5W8XTZ1S8,9732
|
|
39
|
-
pdfalyzer/util/cli_tools_argument_parser.py,sha256=
|
|
39
|
+
pdfalyzer/util/cli_tools_argument_parser.py,sha256=HyZhztyrPtbvOswmG975M0tK5KPon37lV3fxVA0OwYo,6277
|
|
40
40
|
pdfalyzer/util/debugging.py,sha256=hjYGxptJmal9TTaAUkkoo0oNu2tdx6ZYSyC0WjvzHh0,156
|
|
41
41
|
pdfalyzer/util/exceptions.py,sha256=XLFFTdx1n6i_VCmvuzvIOCa-djJvGEitfo9lhy3zq0k,98
|
|
42
42
|
pdfalyzer/util/page_range.py,sha256=NMNh3_TojxTxBIpvUYK1AmvID_m8qOP6AihZrLWZF2I,1652
|
|
@@ -47,8 +47,8 @@ pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
|
|
|
47
47
|
pdfalyzer/yara_rules/didier_stevens.yara,sha256=4XhqafU09xzYUP7LCygHHBXOpAXUblJf6Tkn37MUy0w,7253
|
|
48
48
|
pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
|
|
49
49
|
pdfalyzer/yara_rules/pdf_malware.yara,sha256=jDqSTP5BQSi2I_1xZiFZdy68I4oVWDat2j08-qdfbto,91063
|
|
50
|
-
pdfalyzer-1.17.
|
|
51
|
-
pdfalyzer-1.17.
|
|
52
|
-
pdfalyzer-1.17.
|
|
53
|
-
pdfalyzer-1.17.
|
|
54
|
-
pdfalyzer-1.17.
|
|
50
|
+
pdfalyzer-1.17.5.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
51
|
+
pdfalyzer-1.17.5.dist-info/METADATA,sha256=q-I5CodBjeaL9PerSvFuMkGsJFv7MNkIz1JaurbAgMM,27294
|
|
52
|
+
pdfalyzer-1.17.5.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
53
|
+
pdfalyzer-1.17.5.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
|
|
54
|
+
pdfalyzer-1.17.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|