pdfalyzer-1.17.3-py3-none-any.whl → pdfalyzer-1.17.5-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pdfalyzer might be problematic. Click here for more details.

CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ ### 1.17.5
4
+ * Fix `PIL` lazy import
5
+
6
+ ### 1.17.4
7
+ * Make `PIL` a lazy import so installs without `[extract]` extras don't fail
8
+
3
9
  ### 1.17.3
4
10
  * Put back `--debug` arg for CLI tools
5
11
 
@@ -20,6 +20,8 @@ from pdfalyzer.helpers.rich_text_helper import (attention_getting_panel, error_t
20
20
  from pdfalyzer.helpers.string_helper import exception_str
21
21
  from pdfalyzer.util.page_range import PageRange
22
22
 
23
+ DEPENDENCY_ERROR_MSG = "Pdfalyzer is missing an optional dependency required to extract text. " + \
24
+ "Try 'pip install pdfalyzer[extract]'"
23
25
  DEFAULT_PDF_ERRORS_DIR = Path.cwd().joinpath('pdf_errors')
24
26
  MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR = 1024 * 1024 * 20
25
27
 
@@ -54,11 +56,11 @@ class PdfFile:
54
56
  self.file_size = self.file_path.stat().st_size
55
57
 
56
58
  def extract_page_range(
57
- self,
58
- page_range: PageRange,
59
- destination_dir: Optional[Path] = None,
60
- extra_file_suffix: Optional[str] = None
61
- ) -> Path:
59
+ self,
60
+ page_range: PageRange,
61
+ destination_dir: Optional[Path] = None,
62
+ extra_file_suffix: Optional[str] = None
63
+ ) -> Path:
62
64
  """
63
65
  Extract a range of pages to a new PDF file.
64
66
 
@@ -86,7 +88,7 @@ class PdfFile:
86
88
 
87
89
  extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
88
90
  extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
89
- console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'...")
91
+ console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'")
90
92
  pdf_writer = PdfWriter()
91
93
 
92
94
  with open(self.file_path, 'rb') as source_pdf:
@@ -99,11 +101,11 @@ class PdfFile:
99
101
  return extracted_pages_pdf_path
100
102
 
101
103
  def extract_text(
102
- self,
103
- page_range: Optional[PageRange] = None,
104
- logger: Optional[Logger] = None,
105
- print_as_parsed: bool = False
106
- ) -> Optional[str]:
104
+ self,
105
+ page_range: Optional[PageRange] = None,
106
+ logger: Optional[Logger] = None,
107
+ print_as_parsed: bool = False
108
+ ) -> Optional[str]:
107
109
  """
108
110
  Use PyPDF to extract text page by page and use Tesseract to OCR any embedded images.
109
111
 
@@ -167,7 +169,7 @@ class PdfFile:
167
169
  if print_as_parsed:
168
170
  print(f"{page_text}")
169
171
  except DependencyError:
170
- log.error("Pdfalyzer is missing an optional dependency required to extract text. Try 'pip install pdfalyzer[extract]'")
172
+ log.error(DEPENDENCY_ERROR_MSG)
171
173
  except EmptyFileError:
172
174
  log.warning("Skipping empty file!")
173
175
  except PdfStreamError as e:
@@ -190,7 +192,8 @@ class PdfFile:
190
192
 
191
193
  try:
192
194
  extracted_file = self.extract_page_range(PageRange(str(page_number)), destination_dir, error_msg)
193
- except Exception as e:
195
+ except Exception:
196
+ stderr_console.print_exception()
194
197
  stderr_console.print(error_text(f"Failed to extract a page for submission to PyPDF team."))
195
198
  extracted_file = None
196
199
 
@@ -1,19 +1,19 @@
1
1
  from typing import Optional
2
2
 
3
- from PIL import Image
4
3
  from yaralyzer.output.rich_console import console
5
4
 
6
5
  from pdfalyzer.helpers.rich_text_helper import warning_text
7
6
 
8
7
 
9
- def ocr_text(image: Image.Image, image_name: str) -> Optional[str]:
8
+ def ocr_text(image: "Image.Image", image_name: str) -> Optional[str]:
10
9
  """Use pytesseract to OCR the text in the image and return it as a string."""
11
10
  import pytesseract
11
+ from PIL import Image
12
12
  text = None
13
13
 
14
14
  try:
15
15
  text = pytesseract.image_to_string(image)
16
- except pytesseract.pytesseract.TesseractError as e:
16
+ except pytesseract.pytesseract.TesseractError:
17
17
  console.print_exception()
18
18
  console.print(warning_text(f"Tesseract OCR failure '{image_name}'! No OCR text extracted..."))
19
19
  except OSError as e:
@@ -133,7 +133,7 @@ extract_text_parser.add_argument('--page-range', '-r',
133
133
 
134
134
  extract_text_parser.add_argument('--print-as-parsed', '-p',
135
135
  action='store_true',
136
- help='print pages as they are parsed instead of waiting until document is fully parsed')
136
+ help='print pages as they are parsed instead of waiting until parsing complete')
137
137
 
138
138
 
139
139
  def parse_text_extraction_args() -> Namespace:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pdfalyzer
3
- Version: 1.17.3
3
+ Version: 1.17.5
4
4
  Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
5
5
  Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
6
6
  License: GPL-3.0-or-later
@@ -1,5 +1,5 @@
1
1
  .pdfalyzer.example,sha256=sh_qkUBw4hfJia_Dx2wB-fsqJInhx2sSgA7WJz3MHYo,3917
2
- CHANGELOG.md,sha256=ZFP4uDoiYT-kNa7XJuyNKhIjcvY5DU4CeMSGn0braPU,13301
2
+ CHANGELOG.md,sha256=MJr6WBq7vvZqiuZiTqAZNTy296hZNWGIh4RlImutmx8,13426
3
3
  LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
4
4
  pdfalyzer/__init__.py,sha256=2Gikt_-OSXZqeQij4wSwb65g7jycVAupjeFmXBf51lo,6159
5
5
  pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
@@ -7,7 +7,7 @@ pdfalyzer/binary/binary_scanner.py,sha256=gEHHdQ3lKe1P1441fk2OLO8GxJF7FxF8Tq04El
7
7
  pdfalyzer/config.py,sha256=RBtp3Q6IvOW922rcanB4mVceJEM0BEjNybc6_vC7efY,2122
8
8
  pdfalyzer/decorators/document_model_printer.py,sha256=2tjJItZltukmOD2wjGvl2IsBM7Gug59wMHGLpzGDbV0,2682
9
9
  pdfalyzer/decorators/indeterminate_node.py,sha256=QLJr-nGKih8gPZcIqxLU028OwWWD5VjNHYMUjniwT_k,6586
10
- pdfalyzer/decorators/pdf_file.py,sha256=Az3TL_Ttj_pDOHoHsiwpNlrCckCgKTp0VuGevJIi_5c,10481
10
+ pdfalyzer/decorators/pdf_file.py,sha256=ryAYzzsO8Fw5_ZMoomruW0Bal8pTb5C0VlLOTjdVqNI,10552
11
11
  pdfalyzer/decorators/pdf_object_properties.py,sha256=Il3RObxQ4XUf0Ei-nd4tjJO0LeaxC6u7yFa3cQs_jVY,5485
12
12
  pdfalyzer/decorators/pdf_tree_node.py,sha256=4LReGJUtG8iEcLUQD1jW-yp3xPWsHrC-3Anbkt7XZ3A,11134
13
13
  pdfalyzer/decorators/pdf_tree_verifier.py,sha256=2hVe9APsAWQZ7ra8AGndHQnGWmmxmb3ZwfJHZuLvsvc,4714
@@ -18,7 +18,7 @@ pdfalyzer/detection/yaralyzer_helper.py,sha256=bfIa18f0zdUoOAJIQcwVDjF52sJNV47Nd
18
18
  pdfalyzer/font_info.py,sha256=2R85iETY_1eKCeRrkqeIxfPDqXZyWfCNcHx_-aTyF0s,6682
19
19
  pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
20
20
  pdfalyzer/helpers/filesystem_helper.py,sha256=QCdUcZlufBWaV38LlrQEGUGOGUtZEozMVSRkoTjwKlU,5046
21
- pdfalyzer/helpers/image_helper.py,sha256=QjoAUcKKEtpmuEyOmEfmaUN-lzNykQ1SzqgNn9-R4Y0,1120
21
+ pdfalyzer/helpers/image_helper.py,sha256=89tJjIDSB_BdHjKE3rLPXWFFAAhKsnpVOckKq6_M4Lc,1121
22
22
  pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
23
23
  pdfalyzer/helpers/pdf_object_helper.py,sha256=65BlUgnDM9brxJFw_WF8QLomWHaNh-pj88NWoxcMkoQ,1160
24
24
  pdfalyzer/helpers/rich_text_helper.py,sha256=Q5Zj0I96ymQmDWHkOX4lWEvkizOMMgzYNx4CF35t_7w,3561
@@ -36,7 +36,7 @@ pdfalyzer/pdf_object_relationship.py,sha256=tOJTp73m82oNZJB7NxvTLv167kqthH8PRbVn
36
36
  pdfalyzer/pdfalyzer.py,sha256=T5U8MZRlL0Kn-GJVqfIIoL7eBUdQeteu50VhQ-IoYh0,11214
37
37
  pdfalyzer/util/adobe_strings.py,sha256=eF4K1RhhR_qgMBT58MzCxWkqQj17OWLCNmG3SjZ9BUs,5045
38
38
  pdfalyzer/util/argument_parser.py,sha256=9ixQMEZz00IPK8xRxuS7DMrQgXIrqhB2ve5W8XTZ1S8,9732
39
- pdfalyzer/util/cli_tools_argument_parser.py,sha256=RqK_5AWC7qm9Zy7pvDb-J1WSEGBkIyxzNDcFJwSmuX4,6285
39
+ pdfalyzer/util/cli_tools_argument_parser.py,sha256=HyZhztyrPtbvOswmG975M0tK5KPon37lV3fxVA0OwYo,6277
40
40
  pdfalyzer/util/debugging.py,sha256=hjYGxptJmal9TTaAUkkoo0oNu2tdx6ZYSyC0WjvzHh0,156
41
41
  pdfalyzer/util/exceptions.py,sha256=XLFFTdx1n6i_VCmvuzvIOCa-djJvGEitfo9lhy3zq0k,98
42
42
  pdfalyzer/util/page_range.py,sha256=NMNh3_TojxTxBIpvUYK1AmvID_m8qOP6AihZrLWZF2I,1652
@@ -47,8 +47,8 @@ pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
47
47
  pdfalyzer/yara_rules/didier_stevens.yara,sha256=4XhqafU09xzYUP7LCygHHBXOpAXUblJf6Tkn37MUy0w,7253
48
48
  pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
49
49
  pdfalyzer/yara_rules/pdf_malware.yara,sha256=jDqSTP5BQSi2I_1xZiFZdy68I4oVWDat2j08-qdfbto,91063
50
- pdfalyzer-1.17.3.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
51
- pdfalyzer-1.17.3.dist-info/METADATA,sha256=MczhorkJI7ozznrHf72k7a0QELDinDNHhex4ur8kSr8,27294
52
- pdfalyzer-1.17.3.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
53
- pdfalyzer-1.17.3.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
54
- pdfalyzer-1.17.3.dist-info/RECORD,,
50
+ pdfalyzer-1.17.5.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
51
+ pdfalyzer-1.17.5.dist-info/METADATA,sha256=q-I5CodBjeaL9PerSvFuMkGsJFv7MNkIz1JaurbAgMM,27294
52
+ pdfalyzer-1.17.5.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
53
+ pdfalyzer-1.17.5.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
54
+ pdfalyzer-1.17.5.dist-info/RECORD,,