pdfalyzer 1.17.3__tar.gz → 1.17.5__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of pdfalyzer might be problematic. Click here for more details.

Files changed (52)
  1. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/CHANGELOG.md +6 -0
  2. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/PKG-INFO +1 -1
  3. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/decorators/pdf_file.py +16 -13
  4. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/helpers/image_helper.py +3 -3
  5. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/util/cli_tools_argument_parser.py +1 -1
  6. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pyproject.toml +1 -1
  7. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/.pdfalyzer.example +0 -0
  8. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/LICENSE +0 -0
  9. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/README.md +0 -0
  10. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/__init__.py +0 -0
  11. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/__main__.py +0 -0
  12. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/binary/binary_scanner.py +0 -0
  13. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/config.py +0 -0
  14. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/decorators/document_model_printer.py +0 -0
  15. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/decorators/indeterminate_node.py +0 -0
  16. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/decorators/pdf_object_properties.py +0 -0
  17. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/decorators/pdf_tree_node.py +0 -0
  18. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/decorators/pdf_tree_verifier.py +0 -0
  19. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/detection/constants/binary_regexes.py +0 -0
  20. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/detection/constants/javascript_reserved_keywords.py +0 -0
  21. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/detection/javascript_hunter.py +0 -0
  22. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/detection/yaralyzer_helper.py +0 -0
  23. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/font_info.py +0 -0
  24. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/helpers/dict_helper.py +0 -0
  25. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/helpers/filesystem_helper.py +0 -0
  26. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/helpers/number_helper.py +0 -0
  27. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/helpers/pdf_object_helper.py +0 -0
  28. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/helpers/rich_text_helper.py +0 -0
  29. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/helpers/string_helper.py +0 -0
  30. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/output/character_mapping.py +0 -0
  31. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/output/layout.py +0 -0
  32. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/output/pdfalyzer_presenter.py +0 -0
  33. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/output/styles/node_colors.py +0 -0
  34. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/output/styles/rich_theme.py +0 -0
  35. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/output/tables/decoding_stats_table.py +0 -0
  36. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/output/tables/font_summary_table.py +0 -0
  37. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/output/tables/pdf_node_rich_table.py +0 -0
  38. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/output/tables/stream_objects_table.py +0 -0
  39. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/pdf_object_relationship.py +0 -0
  40. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/pdfalyzer.py +0 -0
  41. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/util/adobe_strings.py +0 -0
  42. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/util/argument_parser.py +0 -0
  43. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/util/debugging.py +0 -0
  44. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/util/exceptions.py +0 -0
  45. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/util/page_range.py +0 -0
  46. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/util/pdf_parser_manager.py +0 -0
  47. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/yara_rules/PDF.yara +0 -0
  48. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/yara_rules/PDF_binary_stream.yara +0 -0
  49. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/yara_rules/__init.py__ +0 -0
  50. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/yara_rules/didier_stevens.yara +0 -0
  51. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/yara_rules/lprat.static_file_analysis.yara +0 -0
  52. {pdfalyzer-1.17.3 → pdfalyzer-1.17.5}/pdfalyzer/yara_rules/pdf_malware.yara +0 -0
@@ -1,5 +1,11 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ ### 1.17.5
4
+ * Fix `PIL` lazy import
5
+
6
+ ### 1.17.4
7
+ * Make `PIL` a lazy import so installs without `[extract]` extras don't fail
8
+
3
9
  ### 1.17.3
4
10
  * Put back `--debug` arg for CLI tools
5
11
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pdfalyzer
3
- Version: 1.17.3
3
+ Version: 1.17.5
4
4
  Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
5
5
  Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
6
6
  License: GPL-3.0-or-later
@@ -20,6 +20,8 @@ from pdfalyzer.helpers.rich_text_helper import (attention_getting_panel, error_t
20
20
  from pdfalyzer.helpers.string_helper import exception_str
21
21
  from pdfalyzer.util.page_range import PageRange
22
22
 
23
+ DEPENDENCY_ERROR_MSG = "Pdfalyzer is missing an optional dependency required to extract text. " + \
24
+ "Try 'pip install pdfalyzer[extract]'"
23
25
  DEFAULT_PDF_ERRORS_DIR = Path.cwd().joinpath('pdf_errors')
24
26
  MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR = 1024 * 1024 * 20
25
27
 
@@ -54,11 +56,11 @@ class PdfFile:
54
56
  self.file_size = self.file_path.stat().st_size
55
57
 
56
58
  def extract_page_range(
57
- self,
58
- page_range: PageRange,
59
- destination_dir: Optional[Path] = None,
60
- extra_file_suffix: Optional[str] = None
61
- ) -> Path:
59
+ self,
60
+ page_range: PageRange,
61
+ destination_dir: Optional[Path] = None,
62
+ extra_file_suffix: Optional[str] = None
63
+ ) -> Path:
62
64
  """
63
65
  Extract a range of pages to a new PDF file.
64
66
 
@@ -86,7 +88,7 @@ class PdfFile:
86
88
 
87
89
  extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
88
90
  extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
89
- console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'...")
91
+ console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'")
90
92
  pdf_writer = PdfWriter()
91
93
 
92
94
  with open(self.file_path, 'rb') as source_pdf:
@@ -99,11 +101,11 @@ class PdfFile:
99
101
  return extracted_pages_pdf_path
100
102
 
101
103
  def extract_text(
102
- self,
103
- page_range: Optional[PageRange] = None,
104
- logger: Optional[Logger] = None,
105
- print_as_parsed: bool = False
106
- ) -> Optional[str]:
104
+ self,
105
+ page_range: Optional[PageRange] = None,
106
+ logger: Optional[Logger] = None,
107
+ print_as_parsed: bool = False
108
+ ) -> Optional[str]:
107
109
  """
108
110
  Use PyPDF to extract text page by page and use Tesseract to OCR any embedded images.
109
111
 
@@ -167,7 +169,7 @@ class PdfFile:
167
169
  if print_as_parsed:
168
170
  print(f"{page_text}")
169
171
  except DependencyError:
170
- log.error("Pdfalyzer is missing an optional dependency required to extract text. Try 'pip install pdfalyzer[extract]'")
172
+ log.error(DEPENDENCY_ERROR_MSG)
171
173
  except EmptyFileError:
172
174
  log.warning("Skipping empty file!")
173
175
  except PdfStreamError as e:
@@ -190,7 +192,8 @@ class PdfFile:
190
192
 
191
193
  try:
192
194
  extracted_file = self.extract_page_range(PageRange(str(page_number)), destination_dir, error_msg)
193
- except Exception as e:
195
+ except Exception:
196
+ stderr_console.print_exception()
194
197
  stderr_console.print(error_text(f"Failed to extract a page for submission to PyPDF team."))
195
198
  extracted_file = None
196
199
 
@@ -1,19 +1,19 @@
1
1
  from typing import Optional
2
2
 
3
- from PIL import Image
4
3
  from yaralyzer.output.rich_console import console
5
4
 
6
5
  from pdfalyzer.helpers.rich_text_helper import warning_text
7
6
 
8
7
 
9
- def ocr_text(image: Image.Image, image_name: str) -> Optional[str]:
8
+ def ocr_text(image: "Image.Image", image_name: str) -> Optional[str]:
10
9
  """Use pytesseract to OCR the text in the image and return it as a string."""
11
10
  import pytesseract
11
+ from PIL import Image
12
12
  text = None
13
13
 
14
14
  try:
15
15
  text = pytesseract.image_to_string(image)
16
- except pytesseract.pytesseract.TesseractError as e:
16
+ except pytesseract.pytesseract.TesseractError:
17
17
  console.print_exception()
18
18
  console.print(warning_text(f"Tesseract OCR failure '{image_name}'! No OCR text extracted..."))
19
19
  except OSError as e:
@@ -133,7 +133,7 @@ extract_text_parser.add_argument('--page-range', '-r',
133
133
 
134
134
  extract_text_parser.add_argument('--print-as-parsed', '-p',
135
135
  action='store_true',
136
- help='print pages as they are parsed instead of waiting until document is fully parsed')
136
+ help='print pages as they are parsed instead of waiting until parsing complete')
137
137
 
138
138
 
139
139
  def parse_text_extraction_args() -> Namespace:
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "pdfalyzer"
3
- version = "1.17.3"
3
+ version = "1.17.5"
4
4
  description = "Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more."
5
5
  authors = ["Michel de Cryptadamus <michel@cryptadamus.com>"]
6
6
  license = "GPL-3.0-or-later"
File without changes
File without changes