pdfalyzer 1.16.14__tar.gz → 1.17.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pdfalyzer might be problematic. Click here for more details.

Files changed (52) hide show
  1. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/CHANGELOG.md +5 -1
  2. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/PKG-INFO +26 -5
  3. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/README.md +22 -4
  4. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/__init__.py +19 -5
  5. pdfalyzer-1.17.0/pdfalyzer/decorators/pdf_file.py +212 -0
  6. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/helpers/filesystem_helper.py +27 -3
  7. pdfalyzer-1.17.0/pdfalyzer/helpers/image_helper.py +31 -0
  8. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/helpers/rich_text_helper.py +50 -1
  9. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/helpers/string_helper.py +6 -1
  10. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/output/styles/rich_theme.py +2 -1
  11. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/util/argument_parser.py +88 -0
  12. pdfalyzer-1.17.0/pdfalyzer/util/page_range.py +54 -0
  13. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pyproject.toml +13 -1
  14. pdfalyzer-1.16.14/pdfalyzer/decorators/pdf_file.py +0 -50
  15. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/.pdfalyzer.example +0 -0
  16. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/LICENSE +0 -0
  17. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/__main__.py +0 -0
  18. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/binary/binary_scanner.py +0 -0
  19. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/config.py +0 -0
  20. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/decorators/document_model_printer.py +0 -0
  21. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/decorators/indeterminate_node.py +0 -0
  22. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/decorators/pdf_object_properties.py +0 -0
  23. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/decorators/pdf_tree_node.py +0 -0
  24. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/decorators/pdf_tree_verifier.py +0 -0
  25. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/detection/constants/binary_regexes.py +0 -0
  26. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/detection/constants/javascript_reserved_keywords.py +0 -0
  27. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/detection/javascript_hunter.py +0 -0
  28. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/detection/yaralyzer_helper.py +0 -0
  29. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/font_info.py +0 -0
  30. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/helpers/dict_helper.py +0 -0
  31. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/helpers/number_helper.py +0 -0
  32. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/helpers/pdf_object_helper.py +0 -0
  33. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/output/character_mapping.py +0 -0
  34. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/output/layout.py +0 -0
  35. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/output/pdfalyzer_presenter.py +0 -0
  36. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/output/styles/node_colors.py +0 -0
  37. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/output/tables/decoding_stats_table.py +0 -0
  38. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/output/tables/font_summary_table.py +0 -0
  39. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/output/tables/pdf_node_rich_table.py +0 -0
  40. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/output/tables/stream_objects_table.py +0 -0
  41. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/pdf_object_relationship.py +0 -0
  42. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/pdfalyzer.py +0 -0
  43. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/util/adobe_strings.py +0 -0
  44. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/util/debugging.py +0 -0
  45. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/util/exceptions.py +0 -0
  46. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/util/pdf_parser_manager.py +0 -0
  47. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/yara_rules/PDF.yara +0 -0
  48. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/yara_rules/PDF_binary_stream.yara +0 -0
  49. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/yara_rules/__init.py__ +0 -0
  50. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/yara_rules/didier_stevens.yara +0 -0
  51. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/yara_rules/lprat.static_file_analysis.yara +0 -0
  52. {pdfalyzer-1.16.14 → pdfalyzer-1.17.0}/pdfalyzer/yara_rules/pdf_malware.yara +0 -0
@@ -1,7 +1,11 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ # 1.17.0
4
+ * Add `extract_pdf_pages` command line tool (imported from `clown_sort`)
5
+ * Add `extract_text_from_pdfs` command line tool (imported from `clown_sort`)
6
+
3
7
  ### 1.16.14
4
- * Bump `yaralyzer` to v1.0.9
8
+ * Bump `yaralyzer` to v1.0.9, handle `FileNotFoundError` which is now raised instead of `TypeError`
5
9
  * Drop support for python 3.9
6
10
 
7
11
  ### 1.16.13
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pdfalyzer
3
- Version: 1.16.14
3
+ Version: 1.17.0
4
4
  Summary: PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
5
5
  Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
6
6
  License: GPL-3.0-or-later
@@ -21,20 +21,23 @@ Classifier: Programming Language :: Python :: 3.13
21
21
  Classifier: Topic :: Artistic Software
22
22
  Classifier: Topic :: Scientific/Engineering :: Visualization
23
23
  Classifier: Topic :: Security
24
+ Provides-Extra: extract
25
+ Requires-Dist: PyMuPDF (>=1.26.4,<2.0.0) ; extra == "extract"
24
26
  Requires-Dist: anytree (>=2.13,<3.0)
25
27
  Requires-Dist: pypdf (>=6.0.0,<7.0.0)
28
+ Requires-Dist: pytesseract (>=0.3.13,<0.4.0) ; extra == "extract"
26
29
  Requires-Dist: yaralyzer (>=1.0.9,<2.0.0)
27
30
  Project-URL: Changelog, https://github.com/michelcrypt4d4mus/pdfalyzer/blob/master/CHANGELOG.md
28
31
  Project-URL: Documentation, https://github.com/michelcrypt4d4mus/pdfalyzer
29
32
  Project-URL: Repository, https://github.com/michelcrypt4d4mus/pdfalyzer
30
33
  Description-Content-Type: text/markdown
31
34
 
32
- <!-- ![Tests](https://img.shields.io/github/workflow/status/michelcrypt4d4mus/pdfalyzer/tests?label=tests) -->
33
- ![Python Version](https://img.shields.io/pypi/pyversions/pdfalyzer)
34
35
  [![GithubRelease](https://img.shields.io/github/v/release/michelcrypt4d4mus/pdfalyzer?sort=semver)](https://pypi.org/project/pdfalyzer/)
35
- [![GitHub last commit](https://img.shields.io/github/last-commit/michelcrypt4d4mus/pdfalyzer)](https://github.com/michelcrypt4d4mus/pdfalyzer)
36
36
  ![PyPiRelease](https://img.shields.io/pypi/v/pdfalyzer)
37
+ [![GitHub last commit](https://img.shields.io/github/last-commit/michelcrypt4d4mus/pdfalyzer)](https://github.com/michelcrypt4d4mus/pdfalyzer)
38
+ ![Python Version](https://img.shields.io/pypi/pyversions/pdfalyzer)
37
39
  ![Downloads](https://img.shields.io/pypi/dm/pdfalyzer)
40
+ [![Tests](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml/badge.svg)](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml)
38
41
 
39
42
 
40
43
  # THE PDFALYZER
@@ -242,9 +245,27 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
242
245
 
243
246
  -------------
244
247
 
248
+
245
249
  # PDF Resources
246
250
  ## Included PDF Tools
247
- The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
251
+ The Pdfalyzer comes with a few command line tools:
252
+
253
+ #### `combine_pdfs`
254
+ Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
255
+
256
+ #### `extract_pdf_pages`
257
+ Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` to see the options.
258
+ ![](doc/extract_pages_from_pdf_help.png)
259
+
260
+ #### `extract_text_from_pdfs`
261
+ Extracts text from a PDF, including applying OCR to all embedded images. Requires that you install The Pdfalyzer's optional dependencies:
262
+
263
+ ```bash
264
+ pipx install 'pdfalyzer[extract]'
265
+ ```
266
+
267
+ Run `extract_text_from_pdfs --help` to see the options.
268
+
248
269
 
249
270
  ## 3rd Party PDF Tools
250
271
  ### Installing Didier Stevens's PDF Analysis Tools
@@ -1,9 +1,9 @@
1
- <!-- ![Tests](https://img.shields.io/github/workflow/status/michelcrypt4d4mus/pdfalyzer/tests?label=tests) -->
2
- ![Python Version](https://img.shields.io/pypi/pyversions/pdfalyzer)
3
1
  [![GithubRelease](https://img.shields.io/github/v/release/michelcrypt4d4mus/pdfalyzer?sort=semver)](https://pypi.org/project/pdfalyzer/)
4
- [![GitHub last commit](https://img.shields.io/github/last-commit/michelcrypt4d4mus/pdfalyzer)](https://github.com/michelcrypt4d4mus/pdfalyzer)
5
2
  ![PyPiRelease](https://img.shields.io/pypi/v/pdfalyzer)
3
+ [![GitHub last commit](https://img.shields.io/github/last-commit/michelcrypt4d4mus/pdfalyzer)](https://github.com/michelcrypt4d4mus/pdfalyzer)
4
+ ![Python Version](https://img.shields.io/pypi/pyversions/pdfalyzer)
6
5
  ![Downloads](https://img.shields.io/pypi/dm/pdfalyzer)
6
+ [![Tests](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml/badge.svg)](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml)
7
7
 
8
8
 
9
9
  # THE PDFALYZER
@@ -211,9 +211,27 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
211
211
 
212
212
  -------------
213
213
 
214
+
214
215
  # PDF Resources
215
216
  ## Included PDF Tools
216
- The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
217
+ The Pdfalyzer comes with a few command line tools:
218
+
219
+ #### `combine_pdfs`
220
+ Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
221
+
222
+ #### `extract_pdf_pages`
223
+ Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` to see the options.
224
+ ![](doc/extract_pages_from_pdf_help.png)
225
+
226
+ #### `extract_text_from_pdfs`
227
+ Extracts text from a PDF, including applying OCR to all embedded images. Requires that you install The Pdfalyzer's optional dependencies:
228
+
229
+ ```bash
230
+ pipx install 'pdfalyzer[extract]'
231
+ ```
232
+
233
+ Run `extract_text_from_pdfs --help` to see the options.
234
+
217
235
 
218
236
  ## 3rd Party PDF Tools
219
237
  ### Installing Didier Stevens's PDF Analysis Tools
@@ -1,5 +1,6 @@
1
1
  import code
2
2
  import sys
3
+ from argparse import Namespace
3
4
  from os import environ, getcwd, path
4
5
 
5
6
  from dotenv import load_dotenv
@@ -24,13 +25,14 @@ from yaralyzer.output.file_export import invoke_rich_export
24
25
  from yaralyzer.output.rich_console import console
25
26
  from yaralyzer.util.logging import log_and_print
26
27
 
28
+ from pdfalyzer.decorators.pdf_file import PdfFile
27
29
  from pdfalyzer.helpers.filesystem_helper import file_size_in_mb, set_max_open_files
28
30
  from pdfalyzer.helpers.rich_text_helper import print_highlighted
29
31
  from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
30
32
  from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
31
33
  from pdfalyzer.pdfalyzer import Pdfalyzer
32
34
  from pdfalyzer.util.argument_parser import (MAX_QUALITY, ask_to_proceed, output_sections, parse_arguments,
33
- parse_combine_pdfs_args)
35
+ parse_combine_pdfs_args, parse_pdf_page_extraction_args, parse_text_extraction_args)
34
36
  from pdfalyzer.util.pdf_parser_manager import PdfParserManager
35
37
 
36
38
  # For the table shown by running pdfalyzer_show_color_theme
@@ -132,7 +134,19 @@ def combine_pdfs():
132
134
  print_highlighted(txt)
133
135
 
134
136
 
135
- # TODO: migrate this functionality from clown_sort
136
- # def extract_pages_from_pdf() -> None:
137
- # args = parse_pdf_page_extraction_args()
138
- # PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)
137
def extract_pdf_pages() -> None:
    """Entry point for the `extract_pdf_pages` tool: copy a page range of one PDF into a new PDF."""
    cli_args = parse_pdf_page_extraction_args()
    source_pdf = PdfFile(cli_args.pdf_file)
    source_pdf.extract_page_range(cli_args.page_range, destination_dir=cli_args.destination_dir)
140
+
141
+
142
def extract_text_from_pdfs() -> None:
    """
    Print the text extracted from every file selected on the command line.

    The parsed arguments supply the files to process, an optional page range, and
    whether each page should be printed as soon as it has been parsed.
    """
    cli_args: Namespace = parse_text_extraction_args()
    console.line()

    for pdf_path in cli_args.files_to_process:
        pdf_file = PdfFile(pdf_path)
        pdf_file.print_extracted_text(cli_args.page_range, cli_args.print_as_parsed)
        # Two blank lines of separation after each file's output
        console.line(2)
@@ -0,0 +1,212 @@
1
+ import io
2
+ from logging import Logger
3
+ from os import path
4
+ from pathlib import Path
5
+ from typing import List, Optional, Union
6
+
7
+ from pypdf import PdfReader, PdfWriter
8
+ from pypdf.errors import DependencyError, EmptyFileError, PdfStreamError
9
+ from rich.console import Console
10
+ from rich.markup import escape
11
+ from rich.panel import Panel
12
+ from rich.text import Text
13
+ from yaralyzer.output.rich_console import console
14
+ from yaralyzer.util.logging import log as yaralyzer_log
15
+
16
+ from pdfalyzer.helpers.filesystem_helper import create_dir_if_it_does_not_exist, insert_suffix_before_extension
17
+ from pdfalyzer.helpers.image_helper import ocr_text
18
+ from pdfalyzer.helpers.rich_text_helper import (attention_getting_panel, error_text, mild_warning,
19
+ print_error, stderr_console)
20
+ from pdfalyzer.helpers.string_helper import exception_str
21
+ from pdfalyzer.util.page_range import PageRange
22
+
23
+ DEFAULT_PDF_ERRORS_DIR = Path.cwd().joinpath('pdf_errors')
24
+ MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR = 1024 * 1024 * 20
25
+
26
+
27
class PdfFile:
    """
    Wrapper for a PDF file path that provides useful methods and properties.

    Attributes:
        file_path (Path): The path to the PDF file.
        dirname (Path): The directory containing the PDF file.
        basename (str): The base name of the PDF file (with extension).
        basename_without_ext (str): The base name of the PDF file (without extension).
        extname (str): The file extension of the PDF file.
        file_size (int): Size of the file in bytes as reported by `stat()`.
    """

    def __init__(self, file_path: Union[str, Path]) -> None:
        """
        Args:
            file_path (Union[str, Path]): Path to the PDF file.

        Raises:
            FileNotFoundError: If nothing exists at `file_path`.
        """
        self.file_path: Path = Path(file_path)

        if not self.file_path.exists():
            raise FileNotFoundError(f"File '{file_path}' does not exist.")

        self.dirname = self.file_path.parent
        self.basename: str = path.basename(file_path)
        self.basename_without_ext: str = str(Path(self.basename).with_suffix(''))
        self.extname: str = self.file_path.suffix
        self.file_size = self.file_path.stat().st_size
        # Page numbers that already produced an error report; reset by each extract_text() call.
        self._page_numbers_of_errors: List[int] = []

    def extract_page_range(
        self,
        page_range: PageRange,
        destination_dir: Optional[Path] = None,
        extra_file_suffix: Optional[str] = None
    ) -> Path:
        """
        Extract a range of pages to a new PDF file.

        Args:
            page_range (PageRange): Range of pages to extract.
            destination_dir (Optional[Path]): Directory to save the new PDF file. Defaults to the same
                directory as the source PDF. A plain string is accepted and converted to a `Path`.
            extra_file_suffix (Optional[str]): An optional suffix to append to the new PDF's filename
                after the page range suffix. Defaults to the page range suffix alone.

        Returns:
            Path: The path to the newly created PDF file containing the extracted pages.
        """
        # Command line parsers may hand over a str; everything below needs Path methods.
        destination_dir = Path(destination_dir) if destination_dir else self.dirname
        create_dir_if_it_does_not_exist(destination_dir)
        page_suffix = page_range.file_suffix()

        if extra_file_suffix is None:
            file_suffix = page_suffix
        else:
            file_suffix = f"{page_suffix}__{extra_file_suffix}"

        extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
        extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
        console.print(f"Extracting {page_suffix} from '{self.file_path}' to '{extracted_pages_pdf_path}'...")
        pdf_writer = PdfWriter()

        with open(self.file_path, 'rb') as source_pdf:
            pdf_writer.append(fileobj=source_pdf, pages=page_range.to_tuple())

        with open(extracted_pages_pdf_path, 'wb') as extracted_pages_pdf:
            pdf_writer.write(extracted_pages_pdf)

        console.print(f"Extracted pages to new PDF: '{extracted_pages_pdf_path}'.")
        return extracted_pages_pdf_path

    def extract_text(
        self,
        page_range: Optional[PageRange] = None,
        logger: Optional[Logger] = None,
        print_as_parsed: bool = False
    ) -> Optional[str]:
        """
        Use PyPDF to extract text page by page and use Tesseract to OCR any embedded images.

        Args:
            page_range (Optional[PageRange]): If provided, only extract text from pages in this range.
                Page numbers are 1-indexed. If not provided, extract text from all pages.
            logger (Optional[Logger]): If provided, log progress to this logger. Otherwise use the
                default yaralyzer logger.
            print_as_parsed (bool): If True, print each page's text to STDOUT as it is parsed.

        Returns:
            Optional[str]: The text of all parsed pages joined by blank lines. `DependencyError`,
                `EmptyFileError` and `PdfStreamError` are reported rather than raised, in which case
                whatever was collected before the failure (possibly an empty string) is returned.
        """
        from PIL import Image  # Imported here to avoid hard dependency if not using this method
        log = logger or yaralyzer_log
        log.debug(f"Extracting text from '{self.file_path}'...")
        self._page_numbers_of_errors = []
        extracted_pages = []

        try:
            pdf_reader = PdfReader(self.file_path)
            page_count = len(pdf_reader.pages)
            log.debug(f"PDF Page count: {page_count}")

            for page_number, page in enumerate(pdf_reader.pages, start=1):
                if page_range and not page_range.in_range(page_number):
                    self._log_to_stderr(f"Skipping page {page_number}...")
                    continue

                self._log_to_stderr(f"Parsing page {page_number}...")
                # Each page is rendered into its own in-memory console so it can be captured as a string.
                page_buffer = Console(file=io.StringIO())
                page_buffer.print(Panel(f"PAGE {page_number}", padding=(0, 15), expand=False))
                page_buffer.print(escape(page.extract_text().strip()))
                # Pre-set so the warning below has a number even if no image was reached.
                image_number = 1

                # Extracting images is a bit fraught (lots of PIL and pypdf exceptions have come from here)
                try:
                    for image_number, image in enumerate(page.images, start=1):
                        image_name = f"Page {page_number}, Image {image_number}"
                        self._log_to_stderr(f"   Processing {image_name}...", "dim")
                        page_buffer.print(Panel(image_name, expand=False))
                        image_obj = Image.open(io.BytesIO(image.data))
                        image_text = ocr_text(image_obj, f"{self.file_path} ({image_name})")
                        page_buffer.print((image_text or '').strip())
                except (OSError, NotImplementedError, TypeError, ValueError) as e:
                    error_str = exception_str(e)
                    msg = f"{error_str} while parsing embedded image {image_number} on page {page_number}..."
                    mild_warning(msg)

                    # Dump an error PDF and encourage user to report to pypdf team.
                    if 'JBIG2Decode' not in str(e):
                        stderr_console.print_exception()

                        # Only one error report per page, however many of its images fail.
                        if page_number not in self._page_numbers_of_errors:
                            self._handle_extraction_error(page_number, error_str)
                            self._page_numbers_of_errors.append(page_number)

                page_text = page_buffer.file.getvalue()
                extracted_pages.append(page_text)
                log.debug(page_text)

                if print_as_parsed:
                    print(f"{page_text}")
        except DependencyError:
            log.error("Pdfalyzer is missing an optional dependency required to extract text. Try 'pip install pdfalyzer[extract]'")
        except EmptyFileError:
            log.warning("Skipping empty file!")
        except PdfStreamError as e:
            print_error(f"Error parsing PDF file '{self.file_path}': {e}")
            stderr_console.print_exception()

        return "\n\n".join(extracted_pages).strip()

    def print_extracted_text(self, page_range: Optional[PageRange] = None, print_as_parsed: bool = False) -> None:
        """Fancy wrapper for printing the extracted text to the screen."""
        console.print(Panel(str(self.file_path), expand=False, style='bright_white reverse'))
        txt = self.extract_text(page_range=page_range, print_as_parsed=print_as_parsed)

        # When print_as_parsed is set every page was already printed by extract_text().
        if not print_as_parsed:
            console.print(txt)

    def _handle_extraction_error(self, page_number: int, error_msg: str) -> None:
        """Rip the offending page to a new file and suggest that user report bug to PyPDF."""
        extracted_file: Optional[Path] = None

        try:
            extracted_file = self.extract_page_range(
                PageRange(str(page_number)),
                DEFAULT_PDF_ERRORS_DIR,
                error_msg
            )
        except Exception as e:
            # Best effort only: the report below is still worth showing without the extracted page.
            failure_msg = f"Failed to extract a page for submission to PyPDF team ({exception_str(e)})."
            stderr_console.print(error_text(failure_msg))

        blink_txt = Text('', style='bright_white')
        blink_txt.append("An error (", style='blink color(154)').append(error_msg, style='color(11) blink')
        blink_txt.append(') ', style='blink color(154)')
        blink_txt.append("was encountered while processing a PDF file.\n\n", style='blink color(154)')

        txt = Text("The error was of a type such that it probably came from a bug in ", style='bright_white')
        txt.append('PyPDF', style='underline bright_green').append('. It was encountered processing the file ')
        txt.append(str(self.file_path), style='file').append('. You should see a stack trace above this box.\n\n')

        if extracted_file is None:
            txt.append('The offending page could not be extracted to a separate file.\n\n', style='bright_white')
        else:
            txt.append('The offending page has been extracted to ', style='bright_white')
            txt.append(str(extracted_file), style='file').append('.\n\n')

        txt.append("Please visit 'https://github.com/py-pdf/pypdf/issues' to report a bug. ", style='bold')
        txt.append("Providing the devs with the extracted page and the stack trace helps improve pypdf.")
        stderr_console.print(attention_getting_panel(blink_txt + txt, title='PyPDF Error'))

    def _log_to_stderr(self, msg: str, style: Optional[str] = None) -> None:
        """When parsing very large PDFs it can be useful to log progress and other messages to STDERR."""
        # Files below the size threshold stay quiet.
        if self.file_size < MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR:
            return

        stderr_console.print(msg, style=style or "")
@@ -6,6 +6,7 @@ from pathlib import Path
6
6
  from typing import Optional, Union
7
7
 
8
8
  from yaralyzer.output.rich_console import console
9
+ from yaralyzer.util.logging import log
9
10
 
10
11
  from pdfalyzer.helpers.rich_text_helper import print_highlighted
11
12
 
@@ -18,9 +19,20 @@ PDF_EXT = '.pdf'
18
19
  # type StrOrPath = Union[str, Path]
19
20
 
20
21
 
21
- def with_pdf_extension(file_path: Union[str, Path]) -> str:
22
- """Append '.pdf' to 'file_path' if it doesn't already end with '.pdf'."""
23
- return str(file_path) + ('' if is_pdf(file_path) else PDF_EXT)
22
def create_dir_if_it_does_not_exist(dir: Path) -> None:
    """
    Create `dir` (and any missing parent directories) unless it already exists.

    Args:
        dir (Path): Directory that should exist once this function returns. A plain
            string is accepted and converted to a `Path`.
    """
    dir = Path(dir)

    if dir.exists():
        return

    # NOTE(review): assumes `console` is a rich Console, which offers no warning() method;
    # the warning is therefore sent through the logger.
    log.warning(f"Need to create '{dir}'")
    dir.mkdir(parents=True, exist_ok=True)
29
+
30
+
31
def insert_suffix_before_extension(file_path: Path, suffix: str, separator: str = '__') -> Path:
    """
    Return `file_path` with `separator` plus a cleaned-up `suffix` placed before the extension.

    Example: suffix 'page 1' turns 'path/to/file.jpg' into 'path/to/file__page_1.jpg'.
    """
    cleaned_suffix = strip_bad_chars(suffix).replace(' ', '_')
    extension = file_path.suffix
    path_without_extension = file_path.with_suffix('')
    return Path(f"{path_without_extension}{separator}{cleaned_suffix}{extension}")
24
36
 
25
37
 
26
38
  def is_pdf(file_path: Union[str, Path]) -> bool:
@@ -100,3 +112,15 @@ def set_max_open_files(num_filehandles: int = DEFAULT_MAX_OPEN_FILES) -> tuple[O
100
112
  soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
101
113
 
102
114
  return (soft, hard)
115
+
116
+
117
def strip_bad_chars(text: str) -> str:
    """
    Remove chars that don't work well in filenames.

    Line breaks and runs of whitespace collapse to a single space, a curly apostrophe becomes
    a straight one, '|' becomes 'I', and every run of characters outside the allowed set
    becomes a single underscore.

    Args:
        text (str): Arbitrary text, e.g. an error message destined for a filename.

    Returns:
        str: The sanitized text.
    """
    # str.replace() treats its argument literally, so collapsing whitespace needs a real regex.
    text = re.sub(r'\s+', ' ', ' '.join(text.splitlines()))
    text = text.replace('’', "'").replace('|', 'I')
    return re.sub('[^-0-9a-zA-Z@.,?_:=#\'\\$" ()]+', '_', text)
122
+
123
+
124
def with_pdf_extension(file_path: Union[str, Path]) -> str:
    """Return `file_path` as a string, adding the PDF extension unless `is_pdf()` already accepts it."""
    path_str = str(file_path)

    if is_pdf(file_path):
        return path_str

    return path_str + PDF_EXT
@@ -0,0 +1,31 @@
1
+ from typing import Optional
2
+
3
+ from PIL import Image
4
+ from yaralyzer.output.rich_console import console
5
+
6
+ from pdfalyzer.helpers.rich_text_helper import warning_text
7
+
8
+
9
def ocr_text(image: Image.Image, image_name: str) -> Optional[str]:
    """
    Use pytesseract to OCR the text in the image and return it as a string.

    Args:
        image (Image.Image): The image to run OCR on.
        image_name (str): Human readable label for the image, used only in messages.

    Returns:
        Optional[str]: The stripped OCR text, or None when Tesseract failed or the image
            file was truncated (both are reported as warnings rather than raised).

    Raises:
        Exception: Any other error is printed and then re-raised unchanged.
    """
    import pytesseract  # Imported lazily; packaged as part of the optional 'extract' extra
    text = None

    try:
        text = pytesseract.image_to_string(image)
    except pytesseract.pytesseract.TesseractError:
        console.print_exception()
        console.print(warning_text(f"Tesseract OCR failure '{image_name}'! No OCR text extracted..."))
    except Exception as e:
        # A truncated image is survivable; everything else is reported and propagated.
        if isinstance(e, OSError) and 'truncated' in str(e):
            console.print(warning_text(f"Truncated image file '{image_name}'!"))
        else:
            console.print_exception()
            console.print(f"Error while extracting '{image_name}'!", style='bright_red')
            raise

    return None if text is None else text.strip()
@@ -1,17 +1,24 @@
1
1
  """
2
2
  Functions for miscellaneous Rich text/string pretty printing operations.
3
3
  """
4
- from typing import List, Union
4
+ from sys import stderr
5
+ from typing import List, Optional, Union
5
6
 
6
7
  from pypdf.generic import PdfObject
7
8
  from rich.console import Console
9
+ from rich.panel import Panel
10
+ from rich.padding import Padding
8
11
  from rich.text import Text
12
+ from yaralyzer.output.rich_console import console
9
13
 
10
14
  from pdfalyzer.helpers.pdf_object_helper import pypdf_class_name
11
15
  from pdfalyzer.output.styles.node_colors import get_label_style, get_class_style_italic
12
16
 
17
+ ARROW_BULLET = '➤ '
18
+
13
19
  # Usually we use the yaralyzer console but that has no highlighter
14
20
  pdfalyzer_console = Console(color_system='256')
21
+ stderr_console = Console(color_system='256', file=stderr)
15
22
 
16
23
 
17
24
  def print_highlighted(msg: Union[str, Text], **kwargs) -> None:
@@ -32,6 +39,21 @@ def quoted_text(
32
39
  return txt
33
40
 
34
41
 
42
def indented_bullet(msg: Union[str, Text], style: Optional[str] = None) -> Text:
    """Return `msg` formatted by `bullet_text()` with a leading indent in front of the bullet."""
    indent = Text(' ')
    return indent + bullet_text(msg, style)
44
+
45
+
46
def bullet_text(msg: Union[str, Text], style: Optional[str] = None) -> Text:
    """
    Return `msg` prefixed with the arrow bullet.

    `style` is applied only when `msg` is a plain string; a `Text` is appended as it is.
    """
    body = Text(msg, style=style) if isinstance(msg, str) else msg
    return Text(ARROW_BULLET).append(body)
51
+
52
+
53
def mild_warning(msg: str) -> None:
    """Print `msg` as an indented bullet in the 'mild_warning' style."""
    styled_msg = Text(msg, style='mild_warning')
    console.print(indented_bullet(styled_msg))
55
+
56
+
35
57
  def node_label(idnum: int, label: str, pdf_object: PdfObject, underline: bool = True) -> Text:
36
58
  """Colored text representation of a PDF node. Example: <5:FontDescriptor(Dictionary)>."""
37
59
  text = Text('<', style='white')
@@ -58,3 +80,30 @@ def pct_txt(_number: int, total: int, digits: int = 1) -> Text:
58
80
  """Return nicely formatted percentage, e.g. '(80%)'."""
59
81
  pct = (100 * float(_number) / float(total)).__round__(digits)
60
82
  return Text(f"({pct}%)", style='blue')
83
+
84
+
85
def warning_text(text: Union[str, Text]) -> Text:
    """Return `text` prefixed with a bright yellow 'WARNING' label and a colon."""
    prefix = Text('')
    prefix.append("WARNING", style='bright_yellow')
    prefix.append(": ")

    if not isinstance(text, Text):
        return prefix.append(text)

    return prefix + text
92
+
93
+
94
def error_text(text: Union[str, Text]) -> Text:
    """Return `text` prefixed with a bright red 'ERROR' label and a colon."""
    prefix = Text('')
    prefix.append("ERROR", style='bright_red')
    prefix.append(": ")

    if not isinstance(text, Text):
        return prefix.append(text)

    return prefix + text
101
+
102
+
103
def attention_getting_panel(text: Text, title: str, style: str = 'white on red') -> Padding:
    """Wrap `text` in a titled, styled `Panel` and surround that panel with outer `Padding`."""
    panel = Panel(text, title=title, style=style, padding=2)
    return Padding(panel, pad=(1, 10, 2, 10))
106
+
107
+
108
def print_error(text: Union[str, Text]) -> None:
    """
    Print `text` to the console with the 'ERROR: ' prefix added by `error_text()`.

    Args:
        text (Union[str, Text]): The message to print.
    """
    console.print(error_text(text))
@@ -35,10 +35,15 @@ def count_pattern_matches_in_text(pattern: str, text: str) -> int:
35
35
 
36
36
 
37
37
def count_regex_matches_in_text(regex: Pattern, text: str) -> int:
    """Count the matches `regex.finditer()` yields for `text`; takes an already compiled regex."""
    match_count = 0

    for _match in regex.finditer(text):
        match_count += 1

    return match_count
40
40
 
41
41
 
42
def exception_str(e: Exception) -> str:
    """
    A string with the type and message.

    Args:
        e (Exception): The exception to describe.

    Returns:
        str: 'TypeName: message', or just 'TypeName' when the exception has no message
            (so no dangling ': ' ends up in warnings or filenames).
    """
    type_name = type(e).__name__
    message = str(e)
    return f"{type_name}: {message}" if message else type_name
45
+
46
+
42
47
def root_address(_string: str) -> str:
    """Return everything before the first '[' of an address, e.g. '/Root[1]' => '/Root'."""
    address, _bracket, _rest = _string.partition('[')
    return address
@@ -51,7 +51,8 @@ PDFALYZER_THEME_DICT.update({
51
51
  'warn.harsh': 'reverse bright_yellow',
52
52
  # error log events
53
53
  'fail': 'bold reverse red',
54
- 'milderror': 'red',
54
+ 'mild_error': 'red', # TODO: unused?
55
+ 'mild_warning': 'color(228) dim',
55
56
  'red_alert': 'blink bold red reverse on white',
56
57
  })
57
58
 
@@ -7,19 +7,23 @@ from collections import namedtuple
7
7
  from functools import partial, update_wrapper
8
8
  from importlib.metadata import version
9
9
  from os import getcwd, path
10
+ from pathlib import Path
10
11
  from typing import List, Optional
11
12
 
12
13
  from rich_argparse_plus import RichHelpFormatterPlus
13
14
  from rich.prompt import Confirm
14
15
  from rich.text import Text
16
+ from yaralyzer.helpers.file_helper import files_in_dir
15
17
  from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args, source
16
18
  from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation
17
19
 
20
+
18
21
  from pdfalyzer.config import ALL_STREAMS, PDFALYZER, PdfalyzerConfig
19
22
  from pdfalyzer.detection.constants.binary_regexes import QUOTE_PATTERNS
20
23
  from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
21
24
  with_pdf_extension)
22
25
  from pdfalyzer.helpers.rich_text_helper import print_highlighted
26
+ from pdfalyzer.util.page_range import PageRangeArgumentValidator
23
27
 
24
28
  # NamedTuple to keep our argument selection orderly
25
29
  OutputSection = namedtuple('OutputSection', ['argument', 'method'])
@@ -258,6 +262,90 @@ def parse_combine_pdfs_args() -> Namespace:
258
262
  return args
259
263
 
260
264
 
265
###########################################
#   Parse args for extract_pdf_pages()    #
###########################################

# A single validator instance, used as the argparse `type=` for --page-range.
page_range_validator = PageRangeArgumentValidator()

extract_pdf_parser = ArgumentParser(
    description="Extract pages from one PDF into a new PDF.",
    formatter_class=RichHelpFormatterPlus,
)

# Positional: the source PDF.
extract_pdf_parser.add_argument(
    'pdf_file',
    help='PDF to extract pages from',
    metavar='PDF_FILE',
)

# Mandatory page selection; parsing/validation is delegated to the validator.
extract_pdf_parser.add_argument(
    '--page-range', '-r',
    required=True,
    help=page_range_validator.HELP_MSG,
    type=page_range_validator,
)

# Output location; defaults to the working directory at import time.
extract_pdf_parser.add_argument(
    '--destination-dir', '-d',
    default=Path.cwd(),
    help="directory to write the new PDF to",
)

extract_pdf_parser.add_argument(
    '--debug',
    help='turn on debug level logging',
    action='store_true',
)
287
+
288
+
289
def parse_pdf_page_extraction_args() -> Namespace:
    """
    Parse and validate the command line arguments for extract_pdf_pages().

    Returns:
        The parsed `Namespace` from `extract_pdf_parser`.

    Exits the process (after logging an error) if `pdf_file` is rejected by `is_pdf()`
    or if `--destination-dir` is not an existing directory.
    """
    args = extract_pdf_parser.parse_args()

    if not is_pdf(args.pdf_file):
        log.error(f"'{args.pdf_file}' is not a PDF.")
        sys.exit(-1)

    # is_dir() rather than exists(): an existing regular file is not a usable destination
    # directory, and is_dir() is also False when the path doesn't exist at all.
    if not Path(args.destination_dir).is_dir():
        log.error(f"Destination dir '{args.destination_dir}' does not exist or is not a directory.")
        sys.exit(1)

    return args
300
+
301
+
302
############################################
#  Parse args for extract_text_from_pdfs() #
############################################
extract_text_parser = ArgumentParser(
    description="Extract the text from one or more files or directories.",
    epilog="If any of the FILE_OR_DIRs is a directory all files in that directory will be extracted.",
    formatter_class=RichHelpFormatterPlus,
)

# One or more positional inputs.
extract_text_parser.add_argument(
    'file_or_dir',
    metavar='FILE_OR_DIR',
    nargs='+',
)

extract_text_parser.add_argument(
    '--debug',
    help='turn on debug level logging',
    action='store_true',
)

# Optional here (unlike the page extraction parser, where it is required).
extract_text_parser.add_argument(
    '--page-range', '-r',
    help=f"[PDFs only] {page_range_validator.HELP_MSG}",
    type=page_range_validator,
)

extract_text_parser.add_argument(
    '--print-as-parsed', '-p',
    help='print pages as they are parsed instead of waiting until document is fully parsed',
    action='store_true',
)
321
+
322
+
323
def parse_text_extraction_args() -> Namespace:
    """
    Parse and validate the command line arguments for extract_text_from_pdfs().

    Every FILE_OR_DIR argument is expanded into `args.files_to_process`: a directory
    contributes whatever `files_in_dir(dir, 'pdf')` returns, anything else is added as a `Path`.

    Returns:
        The parsed `Namespace` with an extra `files_to_process` list attribute.

    Exits the process (after logging an error) if any FILE_OR_DIR doesn't exist, or if
    `--page-range` was given but the inputs don't resolve to exactly one PDF.
    """
    args = extract_text_parser.parse_args()
    args.files_to_process = []

    for file_or_dir in args.file_or_dir:
        file_path = Path(file_or_dir)

        if not file_path.exists():
            log.error(f"File '{file_path}' doesn't exist!")
            sys.exit(-1)
        elif file_path.is_dir():
            args.files_to_process.extend(files_in_dir(file_path, 'pdf'))
        else:
            args.files_to_process.append(file_path)

    # `!= 1` (not `> 1`): a directory can contribute zero files, in which case indexing
    # files_to_process[0] would raise IndexError instead of reporting the usage error.
    if args.page_range and (len(args.files_to_process) != 1 or not is_pdf(args.files_to_process[0])):
        log.error("--page-range can only be specified for a single PDF")
        sys.exit(-1)

    return args
343
+
344
+
345
+ #############
346
+ # Helpers #
347
+ #############
348
+
261
349
  def ask_to_proceed() -> None:
262
350
  """Exit if user doesn't confirm they want to proceed."""
263
351
  if not Confirm.ask(Text("Proceed anyway?")):
@@ -0,0 +1,54 @@
1
+ """
2
+ A range of page numbers. Copied from clown_sort repo.
3
+ """
4
+ import re
5
+ from argparse import ArgumentTypeError
6
+ from dataclasses import dataclass
7
+ from typing import Tuple
8
+
9
# One or more digits, optionally followed by '-' and one or more digits (e.g. '11' or '11-15').
PAGE_RANGE_REGEX = re.compile(r'\d+(-\d+)?')


@dataclass
class PageRange:
    """
    A half-open range of page numbers parsed from a string like '11' or '11-15'.

    'N' selects only page N (stored as first_page=N, last_page=N + 1).
    'N-M' selects pages N up to but NOT including M.
    """
    page_range: str

    def __post_init__(self):
        """Parse `page_range` into `first_page` / `last_page`; raise `ValueError` if it's invalid."""
        # fullmatch() rather than match(): strings with trailing junk ('1a', '5-', '1-2-3')
        # must be rejected here with a clear message, not later inside int() or unpacking.
        if not PAGE_RANGE_REGEX.fullmatch(self.page_range):
            raise ValueError(f"Invalid page range '{self.page_range}'")

        first, _dash, last = self.page_range.partition('-')
        self.first_page = int(first)
        # A lone page number N is stored as the one page range [N, N + 1).
        self.last_page = int(last) if last else self.first_page + 1

        if self.last_page <= self.first_page:
            raise ValueError(f"Invalid page range {self.__repr__()}")

    def in_range(self, page_number: int) -> bool:
        """Returns `True` if `page_number` is in this range (`last_page` itself is excluded)."""
        return self.first_page <= page_number < self.last_page

    def file_suffix(self) -> str:
        """String that can be used as file suffix."""
        if self.first_page + 1 == self.last_page:
            return f"page_{self.first_page}"
        else:
            return f"pages_{self.first_page}-{self.last_page}"

    def to_tuple(self) -> Tuple[int, int]:
        """Return `(first_page, last_page)`."""
        return (self.first_page, self.last_page)

    def __repr__(self) -> str:
        return f"PageRange({self.first_page}, {self.last_page})"
45
+
46
+
47
class PageRangeArgumentValidator(object):
    """Callable meant to be passed as argparse's `type=`; turns a string into a `PageRange`."""
    HELP_MSG = "a single page number ('11') or a range ('11-15') (WILL NOT extract the last page)"

    def __call__(self, value):
        """
        Convert `value` to a `PageRange`.

        Raises:
            ArgumentTypeError: if `value` doesn't look like a page range or `PageRange` rejects it.
        """
        if not PAGE_RANGE_REGEX.match(value):
            raise ArgumentTypeError("Argument has to match '{}'".format(PAGE_RANGE_REGEX.pattern))

        # PageRange raises ValueError for things the regex check lets through (e.g. '5-3').
        # Re-raise as ArgumentTypeError so argparse shows that message to the user instead
        # of its generic "invalid <type> value" text.
        try:
            return PageRange(value)
        except ValueError as e:
            raise ArgumentTypeError(str(e)) from e
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "pdfalyzer"
3
- version = "1.16.14"
3
+ version = "1.17.0"
4
4
  description = "PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more."
5
5
  authors = ["Michel de Cryptadamus <michel@cryptadamus.com>"]
6
6
  license = "GPL-3.0-or-later"
@@ -68,8 +68,13 @@ packages = [
68
68
  python = "^3.10"
69
69
  anytree = "~=2.13"
70
70
  pypdf = "^6.0.0"
71
+ PyMuPDF = {version = "^1.26.4", optional = true}
72
+ pytesseract = {version = "^0.3.13", optional = true}
71
73
  yaralyzer = "^1.0.9"
72
74
 
75
+ [tool.poetry.extras]
76
+ extract = ["PyMuPDF", "pytesseract"]
77
+
73
78
  [tool.poetry.group.dev.dependencies]
74
79
  flake8 = "^7.3.0"
75
80
  pytest = "^7.1.2"
@@ -81,6 +86,8 @@ pytest-skip-slow = "^0.0.3"
81
86
  #############
82
87
  [tool.poetry.scripts]
83
88
  combine_pdfs = 'pdfalyzer:combine_pdfs'
89
+ extract_pdf_pages = 'pdfalyzer:extract_pdf_pages'
90
+ extract_text_from_pdfs = 'pdfalyzer:extract_text_from_pdfs'
84
91
  pdfalyze = 'pdfalyzer:pdfalyze'
85
92
  pdfalyzer_show_color_theme = 'pdfalyzer:pdfalyzer_show_color_theme'
86
93
 
@@ -107,3 +114,8 @@ requires = ["poetry-core>=1.0.0"]
107
114
  addopts = [
108
115
  "--import-mode=importlib",
109
116
  ]
117
+
118
+
119
+ # Poetry 2.x handles optional dependencies like this:
120
+ # [project.optional-dependencies]
121
+ # extract = ["PyMuPDF", "pytesseract"]
@@ -1,50 +0,0 @@
1
- from os import path
2
- from pathlib import Path
3
- from typing import List, Optional, Union
4
-
5
-
6
class PdfFile:
    """
    Wrapper for a PDF file path that provides useful methods and properties.
    """
    # NOTE(review): the import lines visible at the top of this file only bring in `path`,
    # `Path`, `List`, `Optional` and `Union`. PageRange, DEFAULT_PDF_ERRORS_DIR,
    # create_dir_if_it_does_not_exist, insert_suffix_before_extension, stderr_console,
    # PdfWriter and SortableFile are used below but not imported there - confirm where
    # they are supposed to come from.

    def __init__(self, file_path: Union[str, Path]) -> None:
        """
        Store `file_path` as a `Path` and derive its directory, basename and extension.

        Raises:
            FileNotFoundError: if `file_path` does not exist.
        """
        self.file_path: Path = Path(file_path)

        if not self.file_path.exists():
            raise FileNotFoundError(f"File '{file_path}' does not exist.")

        self.dirname = self.file_path.parent
        self.basename: str = path.basename(file_path)
        self.basename_without_ext: str = str(Path(self.basename).with_suffix(''))
        self.extname: str = self.file_path.suffix
        self.text_extraction_attempted: bool = False

    def extract_page_range(
        self,
        page_range: PageRange,
        destination_dir: Optional[Path] = None,
        extra_file_suffix: Optional[str] = None
    ) -> Path:
        """
        Extract the pages in `page_range` to a new PDF file.

        Args:
            page_range: the pages to extract; its `to_tuple()` is handed to `PdfWriter.append()`
                and its `file_suffix()` is used to build the new file's name.
            destination_dir: where to write the new PDF (falls back to DEFAULT_PDF_ERRORS_DIR).
            extra_file_suffix: optional text appended (after '__') to the page range suffix.

        Returns:
            Path of the new PDF.
        """
        destination_dir = destination_dir or DEFAULT_PDF_ERRORS_DIR
        create_dir_if_it_does_not_exist(destination_dir)

        if extra_file_suffix is None:
            file_suffix = page_range.file_suffix()
        else:
            file_suffix = f"{page_range.file_suffix()}__{extra_file_suffix}"

        # Only the file name of the suffixed path is kept; it is re-rooted in destination_dir.
        extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
        extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
        stderr_console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'...")
        pdf_writer = PdfWriter()

        with open(self.file_path, 'rb') as source_pdf:
            pdf_writer.append(fileobj=source_pdf, pages=page_range.to_tuple())

        # NOTE(review): presumably confirm_file_overwrite() returns False when the user declines
        # to overwrite an existing file - verify. If so, nothing is written here, yet the
        # "Wrote new PDF" message below is still printed and the path is still returned.
        if SortableFile.confirm_file_overwrite(extracted_pages_pdf_path):
            with open(extracted_pages_pdf_path, 'wb') as extracted_pages_pdf:
                pdf_writer.write(extracted_pages_pdf)

        stderr_console.print(f"Wrote new PDF '{extracted_pages_pdf_path}'.")
        return extracted_pages_pdf_path
File without changes