pdfalyzer 1.17.0__py3-none-any.whl → 1.17.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
CHANGELOG.md CHANGED
@@ -1,5 +1,28 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ ### 1.17.7
4
+ * Bump `pypdf` to 6.1.3 (fixes [#31](https://github.com/michelcrypt4d4mus/pdfalyzer/issues/31)), `PyMuPDF` to 1.26.5
5
+
6
+ ### 1.17.6
7
+ * Better handling for errors resulting from bugs in PyPDF
8
+ * Properly close file handle when pdfalyzing is complete
9
+
10
+ ### 1.17.5
11
+ * Fix `PIL` lazy import
12
+
13
+ ### 1.17.4
14
+ * Make `PIL` a lazy import so installs without `[extract]` extras don't fail
15
+
16
+ ### 1.17.3
17
+ * Put back `--debug` arg for CLI tools
18
+
19
+ ### 1.17.2
20
+ * Remove unused `--debug` args for CLI tools
21
+ * Rename `extract_text_from_pdfs` to `extract_pdf_text`
22
+
23
+ ### 1.17.1
24
+ * Fix issue where `extract_pdf_pages` page ranges were indexed from 0 instead of 1
25
+
3
26
  # 1.17.0
4
27
  * Add `extract_pdf_pages` command line tool (imported from `clown_sort`)
5
28
  * Add `extract_text_from_pdfs` command line tool (imported from `clown_sort`)
pdfalyzer/__init__.py CHANGED
@@ -31,8 +31,9 @@ from pdfalyzer.helpers.rich_text_helper import print_highlighted
31
31
  from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
32
32
  from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
33
33
  from pdfalyzer.pdfalyzer import Pdfalyzer
34
- from pdfalyzer.util.argument_parser import (MAX_QUALITY, ask_to_proceed, output_sections, parse_arguments,
35
- parse_combine_pdfs_args, parse_pdf_page_extraction_args, parse_text_extraction_args)
34
+ from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments
35
+ from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, parse_combine_pdfs_args,
36
+ parse_pdf_page_extraction_args, parse_text_extraction_args)
36
37
  from pdfalyzer.util.pdf_parser_manager import PdfParserManager
37
38
 
38
39
  # For the table shown by running pdfalyzer_show_color_theme
@@ -42,7 +43,7 @@ MAX_THEME_COL_SIZE = 35
42
43
  def pdfalyze():
43
44
  args = parse_arguments()
44
45
  pdfalyzer = Pdfalyzer(args.file_to_scan_path)
45
- pdfalyzer = PdfalyzerPresenter(pdfalyzer)
46
+ presenter = PdfalyzerPresenter(pdfalyzer)
46
47
  output_basepath = None
47
48
 
48
49
  # Binary stream extraction is a special case
@@ -54,7 +55,7 @@ def pdfalyze():
54
55
 
55
56
  # The method that gets called is related to the argument name. See 'possible_output_sections' list in
56
57
  # argument_parser.py. Analysis exports wrap themselves around the methods that actually generate the analyses.
57
- for (arg, method) in output_sections(args, pdfalyzer):
58
+ for (arg, method) in output_sections(args, presenter):
58
59
  if args.output_dir:
59
60
  output_basepath = PdfalyzerConfig.get_output_basepath(method)
60
61
  print(f'Exporting {arg} data to {output_basepath}...')
@@ -79,6 +80,8 @@ def pdfalyze():
79
80
  if args.interact:
80
81
  code.interact(local=locals())
81
82
 
83
+ pdfalyzer.pdf_filehandle.close()
84
+
82
85
 
83
86
  def pdfalyzer_show_color_theme() -> None:
84
87
  """Utility method to show pdfalyzer's color theme. Invocable with 'pdfalyzer_show_color_theme'."""
@@ -135,15 +138,13 @@ def combine_pdfs():
135
138
 
136
139
 
137
140
  def extract_pdf_pages() -> None:
141
+ """Extract a range of pages from a PDF to a new PDF."""
138
142
  args = parse_pdf_page_extraction_args()
139
143
  PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)
140
144
 
141
145
 
142
- def extract_text_from_pdfs() -> None:
143
- """
144
- Extract text from a single file or from all files in a given directory. Can accept
145
- multiple paths as arguments on the command line.
146
- """
146
+ def extract_pdf_text() -> None:
147
+ """Extract text from a list of file or from all PDF files in a list of directories."""
147
148
  args: Namespace = parse_text_extraction_args()
148
149
  console.line()
149
150
 
@@ -20,6 +20,8 @@ from pdfalyzer.helpers.rich_text_helper import (attention_getting_panel, error_t
20
20
  from pdfalyzer.helpers.string_helper import exception_str
21
21
  from pdfalyzer.util.page_range import PageRange
22
22
 
23
+ DEPENDENCY_ERROR_MSG = "Pdfalyzer is missing an optional dependency required to extract text. " + \
24
+ "Try 'pip install pdfalyzer[extract]'"
23
25
  DEFAULT_PDF_ERRORS_DIR = Path.cwd().joinpath('pdf_errors')
24
26
  MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR = 1024 * 1024 * 20
25
27
 
@@ -30,10 +32,11 @@ class PdfFile:
30
32
 
31
33
  Attributes:
32
34
  file_path (Path): The path to the PDF file.
33
- dirname (Path): The directory containing the PDF file.
34
35
  basename (str): The base name of the PDF file (with extension).
35
36
  basename_without_ext (str): The base name of the PDF file (without extension).
37
+ dirname (Path): The directory containing the PDF file.
36
38
  extname (str): The file extension of the PDF file.
39
+ file_size (int): The size of the file in bytes.
37
40
  """
38
41
 
39
42
  def __init__(self, file_path: Union[str, Path]) -> None:
@@ -44,7 +47,7 @@ class PdfFile:
44
47
  self.file_path: Path = Path(file_path)
45
48
 
46
49
  if not self.file_path.exists():
47
- raise FileNotFoundError(f"File '{file_path}' does not exist.")
50
+ raise FileNotFoundError(f"'{file_path}' is not a valid file or directory.")
48
51
 
49
52
  self.dirname = self.file_path.parent
50
53
  self.basename: str = path.basename(file_path)
@@ -53,11 +56,11 @@ class PdfFile:
53
56
  self.file_size = self.file_path.stat().st_size
54
57
 
55
58
  def extract_page_range(
56
- self,
57
- page_range: PageRange,
58
- destination_dir: Optional[Path] = None,
59
- extra_file_suffix: Optional[str] = None
60
- ) -> Path:
59
+ self,
60
+ page_range: PageRange,
61
+ destination_dir: Optional[Path] = None,
62
+ extra_file_suffix: Optional[str] = None
63
+ ) -> Path:
61
64
  """
62
65
  Extract a range of pages to a new PDF file.
63
66
 
@@ -71,17 +74,21 @@ class PdfFile:
71
74
  Returns:
72
75
  Path: The path to the newly created PDF file containing the extracted pages.
73
76
  """
74
- destination_dir = destination_dir or self.dirname
77
+ destination_dir = Path(destination_dir or self.dirname)
75
78
  create_dir_if_it_does_not_exist(destination_dir)
79
+ pdf_reader = PdfReader(self.file_path)
80
+ page_count = len(pdf_reader.pages)
81
+ file_suffix = page_range.file_suffix()
76
82
 
77
- if extra_file_suffix is None:
78
- file_suffix = page_range.file_suffix()
79
- else:
80
- file_suffix = f"{page_range.file_suffix()}__{extra_file_suffix}"
83
+ if page_count < (page_range.last_page - 1):
84
+ raise ValueError(f"PDF only has {page_count} pages but you asked for pages {page_range}!")
85
+
86
+ if extra_file_suffix is not None:
87
+ file_suffix += f"__{extra_file_suffix}"
81
88
 
82
89
  extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
83
90
  extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
84
- console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'...")
91
+ console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'")
85
92
  pdf_writer = PdfWriter()
86
93
 
87
94
  with open(self.file_path, 'rb') as source_pdf:
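
For context, a hedged usage sketch of the `extract_page_range()` signature shown above. The file and directory names are placeholders; the source PDF must exist (and have enough pages) for the call to succeed.

```python
from pathlib import Path

from pdfalyzer.decorators.pdf_file import PdfFile
from pdfalyzer.util.page_range import PageRange

# 'report.pdf' and 'extracted/' are hypothetical paths used only for illustration.
pdf_file = PdfFile("report.pdf")

# Extracts pages 2-4 (the last page of the range is not extracted) and returns
# the Path of the newly written PDF inside 'extracted/'.
new_pdf_path: Path = pdf_file.extract_page_range(PageRange("2-5"), destination_dir=Path("extracted"))
```
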
@@ -94,11 +101,11 @@ class PdfFile:
94
101
  return extracted_pages_pdf_path
95
102
 
96
103
  def extract_text(
97
- self,
98
- page_range: Optional[PageRange] = None,
99
- logger: Optional[Logger] = None,
100
- print_as_parsed: bool = False
101
- ) -> Optional[str]:
104
+ self,
105
+ page_range: Optional[PageRange] = None,
106
+ logger: Optional[Logger] = None,
107
+ print_as_parsed: bool = False
108
+ ) -> Optional[str]:
102
109
  """
103
110
  Use PyPDF to extract text page by page and use Tesseract to OCR any embedded images.
104
111
 
@@ -162,7 +169,7 @@ class PdfFile:
162
169
  if print_as_parsed:
163
170
  print(f"{page_text}")
164
171
  except DependencyError:
165
- log.error("Pdfalyzer is missing an optional dependency required to extract text. Try 'pip install pdfalyzer[extract]'")
172
+ log.error(DEPENDENCY_ERROR_MSG)
166
173
  except EmptyFileError:
167
174
  log.warning("Skipping empty file!")
168
175
  except PdfStreamError as e:
@@ -185,7 +192,8 @@ class PdfFile:
185
192
 
186
193
  try:
187
194
  extracted_file = self.extract_page_range(PageRange(str(page_number)), destination_dir, error_msg)
188
- except Exception as e:
195
+ except Exception:
196
+ stderr_console.print_exception()
189
197
  stderr_console.print(error_text(f"Failed to extract a page for submission to PyPDF team."))
190
198
  extracted_file = None
191
199
 
@@ -2,7 +2,6 @@
2
2
  Functions to help with the pre-configured YARA rules in the /yara directory.
3
3
  """
4
4
  from importlib.resources import as_file, files
5
- from sys import exit
6
5
  from typing import Optional, Union
7
6
 
8
7
  from yaralyzer.config import YaralyzerConfig
@@ -6,7 +6,6 @@ from pathlib import Path
6
6
  from typing import Optional, Union
7
7
 
8
8
  from yaralyzer.output.rich_console import console
9
- from yaralyzer.util.logging import log
10
9
 
11
10
  from pdfalyzer.helpers.rich_text_helper import print_highlighted
12
11
 
@@ -1,19 +1,19 @@
1
1
  from typing import Optional
2
2
 
3
- from PIL import Image
4
3
  from yaralyzer.output.rich_console import console
5
4
 
6
5
  from pdfalyzer.helpers.rich_text_helper import warning_text
7
6
 
8
7
 
9
- def ocr_text(image: Image.Image, image_name: str) -> Optional[str]:
8
+ def ocr_text(image: "Image.Image", image_name: str) -> Optional[str]: # noqa F821
10
9
  """Use pytesseract to OCR the text in the image and return it as a string."""
11
10
  import pytesseract
11
+ from PIL import Image
12
12
  text = None
13
13
 
14
14
  try:
15
15
  text = pytesseract.image_to_string(image)
16
- except pytesseract.pytesseract.TesseractError as e:
16
+ except pytesseract.pytesseract.TesseractError:
17
17
  console.print_exception()
18
18
  console.print(warning_text(f"Tesseract OCR failure '{image_name}'! No OCR text extracted..."))
19
19
  except OSError as e:
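
The hunk above defers the `PIL` import (and quotes the type hint) so that installs without the `[extract]` extras no longer fail at import time. A generic sketch of that lazy-import pattern follows; `ocr_image_file()` is a hypothetical helper, not part of pdfalyzer's API.

```python
from typing import Optional


def ocr_image_file(image_path: str) -> Optional[str]:
    """Hypothetical helper illustrating the lazy-import pattern used above."""
    try:
        # Deferred imports: Pillow and pytesseract are only needed when OCR is requested,
        # so importing this module works even without the optional [extract] extras.
        import pytesseract
        from PIL import Image
    except ImportError:
        return None  # extras not installed; the caller decides how to degrade

    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)
```
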
@@ -3,7 +3,7 @@ Various text formatting/styling/manipulating methods.
3
3
  """
4
4
  import re
5
5
  from pprint import PrettyPrinter
6
- from typing import List, Pattern, Union
6
+ from typing import List, Optional, Pattern, Union
7
7
 
8
8
  from yaralyzer.output.rich_console import console_width
9
9
 
@@ -18,16 +18,14 @@ pp = PrettyPrinter(
18
18
  sort_dicts=True)
19
19
 
20
20
 
21
- def generate_hyphen_line(width=None, title=None):
22
- """e.g. '-----------------BEGIN-----------------'"""
23
- width = width or console_width()
21
+ def all_strings_are_same_ignoring_numbers(strings: List[str]) -> bool:
22
+ """Returns true if string addresses are same except for digits."""
23
+ return len(set([replace_digits(s) for s in strings])) == 1
24
24
 
25
- if title is None:
26
- return '-' * width
27
25
 
28
- side_hyphens = int((width - len(title)) / 2) * '-'
29
- line = side_hyphens + title + side_hyphens
30
- return line if len(line) == width else line + '-'
26
+ def bracketed(index: Union[int, str]) -> str:
27
+ """Surround index with [ and ]."""
28
+ return f"[{index}]"
31
29
 
32
30
 
33
31
  def count_pattern_matches_in_text(pattern: str, text: str) -> int:
@@ -44,9 +42,20 @@ def exception_str(e: Exception) -> str:
44
42
  return f"{type(e).__name__}: {e}"
45
43
 
46
44
 
47
- def root_address(_string: str) -> str:
48
- """Strip the bracketed part off an address, e.g. '/Root[1]' => '/Root'."""
49
- return _string.split('[')[0]
45
+ def generate_hyphen_line(width: Optional[int] = None, title: Optional[str] = None):
46
+ """e.g. '-----------------BEGIN-----------------'"""
47
+ width = width or console_width()
48
+
49
+ if title is None:
50
+ return '-' * width
51
+
52
+ side_hyphens = int((width - len(title)) / 2) * '-'
53
+ line = side_hyphens + title + side_hyphens
54
+ return line if len(line) == width else line + '-'
55
+
56
+
57
+ def has_a_common_substring(strings: List[str]) -> bool:
58
+ return all([is_substring_of_longer_strings_in_list(s, strings) for s in strings])
50
59
 
51
60
 
52
61
  def is_prefixed_by_any(_string: str, prefixes: List[str]) -> bool:
@@ -54,9 +63,10 @@ def is_prefixed_by_any(_string: str, prefixes: List[str]) -> bool:
54
63
  return any([_string.startswith(prefix) for prefix in prefixes])
55
64
 
56
65
 
57
- def bracketed(index: Union[int, str]) -> str:
58
- """Surround index with [ and ]."""
59
- return f"[{index}]"
66
+ def is_substring_of_longer_strings_in_list(_string: str, strings: List[str]) -> bool:
67
+ """Return True if '_string' is a substring of all the 'strings' longer than '_string'."""
68
+ longer_strings = [s for s in strings if len(s) > len(_string)]
69
+ return all([_string in longer_string for longer_string in longer_strings])
60
70
 
61
71
 
62
72
  def replace_digits(string_with_digits: str) -> str:
@@ -64,18 +74,6 @@ def replace_digits(string_with_digits: str) -> str:
64
74
  return DIGIT_REGEX.sub('x', string_with_digits)
65
75
 
66
76
 
67
- def all_strings_are_same_ignoring_numbers(strings: List[str]) -> bool:
68
- """Returns true if string addresses are same except for digits."""
69
- return len(set([replace_digits(s) for s in strings])) == 1
70
-
71
-
72
- def is_substring_of_longer_strings_in_list(_string: str, strings: List[str]) -> bool:
73
- longer_strings = [s for s in strings if len(s) > len(_string)]
74
- return all([_string in longer_string for longer_string in longer_strings])
75
-
76
-
77
- def has_a_common_substring(strings: List[str]) -> bool:
78
- return all([
79
- is_substring_of_longer_strings_in_list(s, strings)
80
- for s in strings
81
- ])
77
+ def root_address(_string: str) -> str:
78
+ """Strip the bracketed part off an address, e.g. '/Root[1]' => '/Root'."""
79
+ return _string.split('[')[0]
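
Since the relocated helpers above have terse docstrings, here are a few illustrative calls (module path taken from the wheel's RECORD; the expected values follow directly from the definitions above):

```python
from pdfalyzer.helpers.string_helper import (all_strings_are_same_ignoring_numbers, bracketed,
     is_substring_of_longer_strings_in_list, root_address)

assert bracketed(2) == "[2]"
assert root_address("/Root[1]") == "/Root"
assert all_strings_are_same_ignoring_numbers(["/Page[1]", "/Page[2]"])          # only the digits differ
assert is_substring_of_longer_strings_in_list("/Font", ["/Font", "/Font/F1"])   # substring of every longer entry
```
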
pdfalyzer/pdfalyzer.py CHANGED
@@ -7,10 +7,11 @@ from typing import Dict, Iterator, List, Optional
7
7
  from anytree import LevelOrderIter, SymlinkNode
8
8
  from anytree.search import findall, findall_by_attr
9
9
  from pypdf import PdfReader
10
+ from pypdf.errors import PdfReadError
10
11
  from pypdf.generic import IndirectObject
11
12
  from yaralyzer.helpers.file_helper import load_binary_data
12
13
  from yaralyzer.output.file_hashes_table import compute_file_hashes
13
- from yaralyzer.output.rich_console import console
14
+ from yaralyzer.output.rich_console import console, print_fatal_error_and_exit
14
15
  from yaralyzer.util.logging import log
15
16
 
16
17
  from pdfalyzer.decorators.document_model_printer import print_with_header
@@ -22,7 +23,8 @@ from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
22
23
  from pdfalyzer.util.adobe_strings import *
23
24
  from pdfalyzer.util.exceptions import PdfWalkError
24
25
 
25
- TRAILER_FALLBACK_ID = 10000000
26
+ TRAILER_FALLBACK_ID = 10_000_000
27
+ PYPDF_ERROR_MSG = "Failed to open file with PyPDF. Consider filing a PyPDF bug report: https://github.com/py-pdf/pypdf/issues"
26
28
 
27
29
 
28
30
  class Pdfalyzer:
@@ -32,6 +34,19 @@ class Pdfalyzer:
32
34
  Each of the PDF's internal objects is wrapped in a `PdfTreeNode` object. The tree is managed
33
35
  by the `anytree` library. Information about the tree as a whole is stored in this class.
34
36
  Once the PDF is parsed this class provides access to info about or from the underlying PDF tree.
37
+
38
+ Attributes:
39
+ font_infos (List[FontInfo]): Font summary objects
40
+ max_generation (int): Max revision number ("generation") encountered in this PDF.
41
+ nodes_encountered (Dict[int, PdfTreeNode]): Nodes we've traversed already.
42
+ pdf_basename (str): The base name of the PDF file (with extension).
43
+ pdf_bytes (bytes): PDF binary data.
44
+ pdf_bytes_info (BytesInfo): File size, hashes, and other data points about the PDF's raw bytes.
45
+ pdf_filehandle (BufferedReader): File handle that reads the PDF.
46
+ pdf_path (str): The path to the PDF file.
47
+ pdf_size (int): Number of nodes as extracted from the PDF's Trailer node.
48
+ pdf_tree (PdfTreeNode): The top node of the PDF data structure tree.
49
+ verifier (PdfTreeVerifier): PdfTreeVerifier that can validate the PDF has been walked successfully.
35
50
  """
36
51
 
37
52
  def __init__(self, pdf_path: str):
@@ -43,14 +58,21 @@ class Pdfalyzer:
43
58
  self.pdf_basename = basename(pdf_path)
44
59
  self.pdf_bytes = load_binary_data(pdf_path)
45
60
  self.pdf_bytes_info = compute_file_hashes(self.pdf_bytes)
46
- pdf_file = open(pdf_path, 'rb') # Filehandle must be left open for PyPDF to perform seeks
47
- self.pdf_reader = PdfReader(pdf_file)
61
+ self.pdf_filehandle = open(pdf_path, 'rb') # Filehandle must be left open for PyPDF to perform seeks
62
+
63
+ try:
64
+ self.pdf_reader = PdfReader(self.pdf_filehandle)
65
+ except PdfReadError:
66
+ self._handle_fatal_error(f'PdfReadError: "{pdf_path}" doesn\'t seem to be a valid PDF file.')
67
+ except Exception as e:
68
+ console.print_exception()
69
+ self._handle_fatal_error(f"{PYPDF_ERROR_MSG}\n{e}")
48
70
 
49
71
  # Initialize tracking variables
50
- self.indeterminate_ids = set() # See INDETERMINATE_REF_KEYS comment
51
- self.nodes_encountered: Dict[int, PdfTreeNode] = {} # Nodes we've seen already
52
72
  self.font_infos: List[FontInfo] = [] # Font summary objects
53
73
  self.max_generation = 0 # PDF revisions are "generations"; this is the max generation encountered
74
+ self.nodes_encountered: Dict[int, PdfTreeNode] = {} # Nodes we've seen already
75
+ self._indeterminate_ids = set() # See INDETERMINATE_REF_KEYS comment
54
76
 
55
77
  # Bootstrap the root of the tree with the trailer. PDFs are always read trailer first.
56
78
  # Technically the trailer has no PDF Object ID but we set it to the /Size of the PDF.
@@ -148,9 +170,9 @@ class Pdfalyzer:
148
170
  from_node.add_child(to_node)
149
171
 
150
172
  # Remove this to_node from indeterminacy now that it's got a child or parent
151
- if relationship.to_obj.idnum in self.indeterminate_ids:
173
+ if relationship.to_obj.idnum in self._indeterminate_ids:
152
174
  log.info(f" Found {relationship} => {to_node} was marked indeterminate but now placed")
153
- self.indeterminate_ids.remove(relationship.to_obj.idnum)
175
+ self._indeterminate_ids.remove(relationship.to_obj.idnum)
154
176
 
155
177
  # If the relationship is indeterminate or we've seen the PDF object before, add it as
156
178
  # a non-tree relationship for now. An attempt to place the node will be made at the end.
@@ -159,7 +181,7 @@ class Pdfalyzer:
159
181
 
160
182
  # If we already encountered 'to_node' then skip adding it to the queue of nodes to walk
161
183
  if was_seen_before:
162
- if relationship.to_obj.idnum not in self.indeterminate_ids and to_node.parent is None:
184
+ if relationship.to_obj.idnum not in self._indeterminate_ids and to_node.parent is None:
163
185
  raise PdfWalkError(f"{relationship} - ref has no parent and is not indeterminate")
164
186
  else:
165
187
  log.debug(f" Already saw {relationship}; not scanning next")
@@ -167,7 +189,7 @@ class Pdfalyzer:
167
189
  # Indeterminate relationships need to wait until everything has been scanned to be placed
168
190
  elif relationship.is_indeterminate or (relationship.is_link and not self.is_in_tree(to_node)):
169
191
  log.info(f' Indeterminate ref {relationship}')
170
- self.indeterminate_ids.add(to_node.idnum)
192
+ self._indeterminate_ids.add(to_node.idnum)
171
193
  # Link nodes like /Dest are usually just links between nodes
172
194
  elif relationship.is_link:
173
195
  log.debug(f" Link ref {relationship}")
@@ -178,9 +200,13 @@ class Pdfalyzer:
178
200
 
179
201
  return to_node
180
202
 
203
+ def _handle_fatal_error(self, msg: str) -> None:
204
+ self.pdf_filehandle.close()
205
+ print_fatal_error_and_exit(msg)
206
+
181
207
  def _resolve_indeterminate_nodes(self) -> None:
182
208
  """Place all indeterminate nodes in the tree. Called after all nodes have been walked."""
183
- indeterminate_nodes = [self.nodes_encountered[idnum] for idnum in self.indeterminate_ids]
209
+ indeterminate_nodes = [self.nodes_encountered[idnum] for idnum in self._indeterminate_ids]
184
210
  indeterminate_nodes_string = "\n ".join([f"{node}" for node in indeterminate_nodes])
185
211
  log.info(f"Resolving {len(indeterminate_nodes)} indeterminate nodes: {indeterminate_nodes_string}")
186
212
 
@@ -1,5 +1,5 @@
1
1
  """
2
- Parse command line arguments for pdfalyzer and construct the PdfalyzerConfig object.
2
+ Parse command line arguments for `pdfalyze` and construct the `PdfalyzerConfig` object.
3
3
  """
4
4
  import sys
5
5
  from argparse import ArgumentParser, Namespace
@@ -7,23 +7,17 @@ from collections import namedtuple
7
7
  from functools import partial, update_wrapper
8
8
  from importlib.metadata import version
9
9
  from os import getcwd, path
10
- from pathlib import Path
11
10
  from typing import List, Optional
12
11
 
13
12
  from rich_argparse_plus import RichHelpFormatterPlus
14
13
  from rich.prompt import Confirm
15
14
  from rich.text import Text
16
- from yaralyzer.helpers.file_helper import files_in_dir
17
15
  from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args, source
18
16
  from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation
19
17
 
20
-
21
18
  from pdfalyzer.config import ALL_STREAMS, PDFALYZER, PdfalyzerConfig
22
19
  from pdfalyzer.detection.constants.binary_regexes import QUOTE_PATTERNS
23
- from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
24
- with_pdf_extension)
25
20
  from pdfalyzer.helpers.rich_text_helper import print_highlighted
26
- from pdfalyzer.util.page_range import PageRangeArgumentValidator
27
21
 
28
22
  # NamedTuple to keep our argument selection orderly
29
23
  OutputSection = namedtuple('OutputSection', ['argument', 'method'])
@@ -202,146 +196,10 @@ def output_sections(args: Namespace, pdfalyzer: 'Pdfalyzer') -> List[OutputSecti
202
196
 
203
197
 
204
198
  def all_sections_chosen(args):
205
- """Returns true if all flags are set or no flags are set."""
199
+ """Returns True if all flags are set or no flags are set."""
206
200
  return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)
207
201
 
208
202
 
209
- #############################################################
210
- # Separate arg parsers for combine_pdfs and other scripts #
211
- #############################################################
212
-
213
- MAX_QUALITY = 10
214
-
215
- combine_pdfs_parser = ArgumentParser(
216
- description="Combine multiple PDFs into one.",
217
- epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" +
218
- " page numbers prior to merging.",
219
- formatter_class=RichHelpFormatterPlus)
220
-
221
- combine_pdfs_parser.add_argument('pdfs',
222
- help='two or more PDFs to combine',
223
- metavar='PDF_PATH',
224
- nargs='+')
225
-
226
- combine_pdfs_parser.add_argument('-iq', '--image-quality',
227
- help='image quality for embedded images (can compress PDF at loss of quality)',
228
- choices=range(1, MAX_QUALITY + 1),
229
- default=MAX_QUALITY,
230
- type=int)
231
-
232
- combine_pdfs_parser.add_argument('-o', '--output-file',
233
- help='path to write the combined PDFs to',
234
- required=True)
235
-
236
-
237
- def parse_combine_pdfs_args() -> Namespace:
238
- """Parse command line args for combine_pdfs script."""
239
- args = combine_pdfs_parser.parse_args()
240
- args.output_file = with_pdf_extension(args.output_file)
241
- confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
242
- args.number_of_pdfs = len(args.pdfs)
243
-
244
- if args.number_of_pdfs < 2:
245
- exit_with_error(f"Need at least 2 PDFs to merge.")
246
- elif not do_all_files_exist(args.pdfs):
247
- exit_with_error()
248
- elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
249
- exit_with_error()
250
-
251
- if all(is_pdf(pdf) for pdf in args.pdfs):
252
- if all(extract_page_number(pdf) for pdf in args.pdfs):
253
- print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
254
- args.pdfs.sort(key=lambda pdf: extract_page_number(pdf))
255
- else:
256
- print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
257
- else:
258
- print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
259
- ask_to_proceed()
260
-
261
- print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
262
- return args
263
-
264
-
265
- ###########################################
266
- # Parse args for extract_pdf_pages() #
267
- ###########################################
268
- page_range_validator = PageRangeArgumentValidator()
269
-
270
- extract_pdf_parser = ArgumentParser(
271
- formatter_class=RichHelpFormatterPlus,
272
- description="Extract pages from one PDF into a new PDF.",
273
- )
274
-
275
- extract_pdf_parser.add_argument('pdf_file', metavar='PDF_FILE', help='PDF to extract pages from')
276
-
277
- extract_pdf_parser.add_argument('--page-range', '-r',
278
- type=page_range_validator,
279
- help=page_range_validator.HELP_MSG,
280
- required=True)
281
-
282
- extract_pdf_parser.add_argument('--destination-dir', '-d',
283
- help="directory to write the new PDF to",
284
- default=Path.cwd())
285
-
286
- extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
287
-
288
-
289
- def parse_pdf_page_extraction_args() -> Namespace:
290
- args = extract_pdf_parser.parse_args()
291
-
292
- if not is_pdf(args.pdf_file):
293
- log.error(f"'{args.pdf_file}' is not a PDF.")
294
- sys.exit(-1)
295
- elif not Path(args.destination_dir).exists():
296
- log.error(f"Destination dir '{args.destination_dir}' does not exist.")
297
- sys.exit(1)
298
-
299
- return args
300
-
301
-
302
- ############################################
303
- # Parse args for extract_text_from_pdfs() #
304
- ############################################
305
- extract_text_parser = ArgumentParser(
306
- formatter_class=RichHelpFormatterPlus,
307
- description="Extract the text from one or more files or directories.",
308
- epilog="If any of the FILE_OR_DIRs is a directory all files in that directory will be extracted."
309
- )
310
-
311
- extract_text_parser.add_argument('file_or_dir', nargs='+', metavar='FILE_OR_DIR')
312
- extract_text_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
313
-
314
- extract_text_parser.add_argument('--page-range', '-r',
315
- type=page_range_validator,
316
- help=f"[PDFs only] {page_range_validator.HELP_MSG}")
317
-
318
- extract_text_parser.add_argument('--print-as-parsed', '-p',
319
- action='store_true',
320
- help='print pages as they are parsed instead of waiting until document is fully parsed')
321
-
322
-
323
- def parse_text_extraction_args() -> Namespace:
324
- args = extract_text_parser.parse_args()
325
- args.files_to_process = []
326
-
327
- for file_or_dir in args.file_or_dir:
328
- file_path = Path(file_or_dir)
329
-
330
- if not file_path.exists():
331
- log.error(f"File '{file_path}' doesn't exist!")
332
- sys.exit(-1)
333
- elif file_path.is_dir():
334
- args.files_to_process.extend(files_in_dir(file_path, 'pdf'))
335
- else:
336
- args.files_to_process.append(file_path)
337
-
338
- if args.page_range and (len(args.files_to_process) > 1 or not is_pdf(args.files_to_process[0])):
339
- log.error(f"--page-range can only be specified for a single PDF")
340
- sys.exit(-1)
341
-
342
- return args
343
-
344
-
345
203
  #############
346
204
  # Helpers #
347
205
  #############
@@ -0,0 +1,164 @@
1
+ """
2
+ Argument parsers for the command line tools other than `pdfalyze` that are included with The Pdfalyzer.
3
+
4
+ 1. combine_pdfs
5
+ 2. extract_pdf_pages
6
+ 3. extract_pdf_text
7
+ """
8
+ import logging
9
+ import sys
10
+ from argparse import ArgumentParser, Namespace
11
+ from pathlib import Path
12
+
13
+ from rich_argparse_plus import RichHelpFormatterPlus
14
+ from rich.prompt import Confirm
15
+ from rich.text import Text
16
+ from yaralyzer.helpers.file_helper import files_in_dir
17
+ from yaralyzer.util.logging import log
18
+
19
+ from pdfalyzer.util.argument_parser import ask_to_proceed, exit_with_error
20
+ from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
21
+ with_pdf_extension)
22
+ from pdfalyzer.helpers.rich_text_helper import print_highlighted
23
+ from pdfalyzer.util.page_range import PageRangeArgumentValidator
24
+
25
+ MAX_QUALITY = 10
26
+
27
+
28
+ ##################
29
+ # combine_pdfs #
30
+ ##################
31
+ combine_pdfs_parser = ArgumentParser(
32
+ description="Combine multiple PDFs into one.",
33
+ epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" +
34
+ " page numbers prior to merging.",
35
+ formatter_class=RichHelpFormatterPlus)
36
+
37
+ combine_pdfs_parser.add_argument('pdfs',
38
+ help='two or more PDFs to combine',
39
+ metavar='PDF_PATH',
40
+ nargs='+')
41
+
42
+ combine_pdfs_parser.add_argument('-iq', '--image-quality',
43
+ help='image quality for embedded images (can compress PDF at loss of quality)',
44
+ choices=range(1, MAX_QUALITY + 1),
45
+ default=MAX_QUALITY,
46
+ type=int)
47
+
48
+ combine_pdfs_parser.add_argument('-o', '--output-file',
49
+ help='path to write the combined PDFs to',
50
+ required=True)
51
+
52
+
53
+ def parse_combine_pdfs_args() -> Namespace:
54
+ """Parse command line args for combine_pdfs script."""
55
+ args = combine_pdfs_parser.parse_args()
56
+ args.output_file = with_pdf_extension(args.output_file)
57
+ confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
58
+ args.number_of_pdfs = len(args.pdfs)
59
+
60
+ if args.number_of_pdfs < 2:
61
+ exit_with_error(f"Need at least 2 PDFs to merge.")
62
+ elif not do_all_files_exist(args.pdfs):
63
+ exit_with_error()
64
+ elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
65
+ exit_with_error()
66
+
67
+ if all(is_pdf(pdf) for pdf in args.pdfs):
68
+ if all(extract_page_number(pdf) for pdf in args.pdfs):
69
+ print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
70
+ args.pdfs.sort(key=lambda pdf: extract_page_number(pdf))
71
+ else:
72
+ print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
73
+ else:
74
+ print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
75
+ ask_to_proceed()
76
+
77
+ print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
78
+ return args
79
+
80
+
81
+ #####################
82
+ # extract_pdf_pages #
83
+ #####################
84
+ page_range_validator = PageRangeArgumentValidator()
85
+
86
+ extract_pdf_parser = ArgumentParser(
87
+ formatter_class=RichHelpFormatterPlus,
88
+ description="Extract pages from one PDF into a new PDF.",
89
+ )
90
+
91
+ extract_pdf_parser.add_argument('pdf_file', metavar='PDF_FILE', help='PDF to extract pages from')
92
+ extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
93
+
94
+ extract_pdf_parser.add_argument('--page-range', '-r',
95
+ type=page_range_validator,
96
+ help=page_range_validator.HELP_MSG,
97
+ required=True)
98
+
99
+ extract_pdf_parser.add_argument('--destination-dir', '-d',
100
+ help="directory to write the new PDF to",
101
+ default=Path.cwd())
102
+
103
+
104
+ def parse_pdf_page_extraction_args() -> Namespace:
105
+ args = extract_pdf_parser.parse_args()
106
+
107
+ if not is_pdf(args.pdf_file):
108
+ log.error(f"'{args.pdf_file}' is not a PDF.")
109
+ sys.exit(-1)
110
+ elif not Path(args.destination_dir).exists():
111
+ log.error(f"Destination dir '{args.destination_dir}' does not exist.")
112
+ sys.exit(1)
113
+
114
+ _set_log_level(args)
115
+ return args
116
+
117
+
118
+ ######################
119
+ # extract_pdf_text #
120
+ ######################
121
+ extract_text_parser = ArgumentParser(
122
+ formatter_class=RichHelpFormatterPlus,
123
+ description="Extract the text from one or more files or directories.",
124
+ epilog="If any of the FILE_OR_DIRs is a directory all PDF files in that directory will be extracted."
125
+ )
126
+
127
+ extract_text_parser.add_argument('file_or_dir', nargs='+', metavar='FILE_OR_DIR')
128
+ extract_text_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
129
+
130
+ extract_text_parser.add_argument('--page-range', '-r',
131
+ type=page_range_validator,
132
+ help=f"[PDFs only] {page_range_validator.HELP_MSG}")
133
+
134
+ extract_text_parser.add_argument('--print-as-parsed', '-p',
135
+ action='store_true',
136
+ help='print pages as they are parsed instead of waiting until parsing complete')
137
+
138
+
139
+ def parse_text_extraction_args() -> Namespace:
140
+ args = extract_text_parser.parse_args()
141
+ args.files_to_process = []
142
+
143
+ for file_or_dir in args.file_or_dir:
144
+ file_path = Path(file_or_dir)
145
+
146
+ if not file_path.exists():
147
+ log.error(f"'{file_path}' is not a valid file or directory.")
148
+ sys.exit(-1)
149
+ elif file_path.is_dir():
150
+ args.files_to_process.extend(files_in_dir(file_path, 'pdf'))
151
+ else:
152
+ args.files_to_process.append(file_path)
153
+
154
+ if args.page_range and (len(args.files_to_process) > 1 or not is_pdf(args.files_to_process[0])):
155
+ log.error(f"--page-range can only be specified for a single PDF")
156
+ sys.exit(-1)
157
+
158
+ _set_log_level(args)
159
+ return args
160
+
161
+
162
+ def _set_log_level(args: Namespace):
163
+ if args.debug:
164
+ log.setLevel(logging.DEBUG)
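
The parsers above read `sys.argv` directly, so a script or test can drive them by substituting the argument vector. A hedged sketch follows; 'report.pdf' is a placeholder that must pass the `is_pdf()` check, and this is a test-style invocation rather than how the console scripts are normally run.

```python
import sys

from pdfalyzer.util.cli_tools_argument_parser import parse_pdf_page_extraction_args

# Simulate: extract_pdf_pages report.pdf --page-range 2-5 --debug
sys.argv = ['extract_pdf_pages', 'report.pdf', '--page-range', '2-5', '--debug']

args = parse_pdf_page_extraction_args()
print(args.pdf_file, args.page_range, args.destination_dir)  # page_range arrives as a parsed PageRange
```
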
@@ -6,7 +6,7 @@ from argparse import ArgumentTypeError
6
6
  from dataclasses import dataclass
7
7
  from typing import Tuple
8
8
 
9
- PAGE_RANGE_REGEX = re.compile('\\d(-\\d)?')
9
+ PAGE_RANGE_REGEX = re.compile(r'[1-9](\d+)?(-\d+)?')
10
10
 
11
11
 
12
12
  @dataclass
@@ -15,7 +15,7 @@ class PageRange:
15
15
 
16
16
  def __post_init__(self):
17
17
  if not PAGE_RANGE_REGEX.match(self.page_range):
18
- raise ValueError(f"Invalid page range '{self.page_range}'")
18
+ raise ArgumentTypeError(f"Invalid page range '{self.page_range}'")
19
19
 
20
20
  if '-' in self.page_range:
21
21
  (self.first_page, self.last_page) = (int(p) for p in self.page_range.split('-'))
@@ -35,10 +35,10 @@ class PageRange:
35
35
  if self.first_page + 1 == self.last_page:
36
36
  return f"page_{self.first_page}"
37
37
  else:
38
- return f"pages_{self.first_page}-{self.last_page}"
38
+ return f"pages_{self.first_page}-{self.last_page - 1}"
39
39
 
40
40
  def to_tuple(self) -> Tuple[int, int]:
41
- return (self.first_page, self.last_page)
41
+ return (self.first_page - 1, self.last_page - 1)
42
42
 
43
43
  def __repr__(self) -> str:
44
44
  return f"PageRange({self.first_page}, {self.last_page})"
@@ -48,7 +48,4 @@ class PageRangeArgumentValidator(object):
48
48
  HELP_MSG = "a single digit ('11') or a range ('11-15') (WILL NOT extract the last page)"
49
49
 
50
50
  def __call__(self, value):
51
- if not PAGE_RANGE_REGEX.match(value):
52
- raise ArgumentTypeError("Argument has to match '{}'".format(PAGE_RANGE_REGEX.pattern))
53
-
54
51
  return PageRange(value)
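
A worked example of the 1-indexed semantics fixed in 1.17.1, following the `PageRange` definitions above (the constructor takes the raw page-range string):

```python
from pdfalyzer.util.page_range import PageRange

page_range = PageRange("11-15")
print(page_range.first_page, page_range.last_page)   # 11 15
print(page_range.file_suffix())                      # 'pages_11-14' -- the last page is not extracted
print(page_range.to_tuple())                         # (10, 14) -- 0-based indices for pypdf
```
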
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pdfalyzer
3
- Version: 1.17.0
4
- Summary: PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
3
+ Version: 1.17.7
4
+ Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
5
5
  Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
6
6
  License: GPL-3.0-or-later
7
7
  Keywords: ascii art,binary,color,cybersecurity,DFIR,encoding,font,infosec,maldoc,malicious pdf,malware,malware analysis,pdf,pdfs,pdf analysis,pypdf,threat assessment,threat hunting,threat intelligence,threat research,threatintel,visualization,yara
@@ -22,9 +22,9 @@ Classifier: Topic :: Artistic Software
22
22
  Classifier: Topic :: Scientific/Engineering :: Visualization
23
23
  Classifier: Topic :: Security
24
24
  Provides-Extra: extract
25
- Requires-Dist: PyMuPDF (>=1.26.4,<2.0.0) ; extra == "extract"
25
+ Requires-Dist: PyMuPDF (>=1.26.5,<2.0.0) ; extra == "extract"
26
26
  Requires-Dist: anytree (>=2.13,<3.0)
27
- Requires-Dist: pypdf (>=6.0.0,<7.0.0)
27
+ Requires-Dist: pypdf (>=6.1.3,<7.0.0)
28
28
  Requires-Dist: pytesseract (>=0.3.13,<0.4.0) ; extra == "extract"
29
29
  Requires-Dist: yaralyzer (>=1.0.9,<2.0.0)
30
30
  Project-URL: Changelog, https://github.com/michelcrypt4d4mus/pdfalyzer/blob/master/CHANGELOG.md
@@ -67,9 +67,8 @@ If you're looking for one of these things this may be the tool for you.
67
67
  ### What It Don't Do
68
68
  This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.
69
69
 
70
- If you suspect you are dealing with a malcious PDF you can safely run `pdfalyze` on it; embedded javascript etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
70
+ If you suspect you are dealing with a malicious PDF you can safely run `pdfalyze` on it. Embedded javascript and `/OpenAction` nodes etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
71
71
 
72
- -------------
73
72
 
74
73
  # Installation
75
74
  #### All Platforms
@@ -99,7 +98,6 @@ brew install pdfalyzer
99
98
  sudo apt-get install build-essential libssl-dev libffi-dev rustc
100
99
  ```
101
100
 
102
- -------------
103
101
 
104
102
  # Usage
105
103
 
@@ -115,7 +113,7 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
115
113
 
116
114
  The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
117
115
 
118
- ### Setting Command Line Options Permanently With A `.pdfalyzer` File
116
+ #### Setting Command Line Options Permanently With A `.pdfalyzer` File
119
117
  When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
120
118
 
121
119
  1. the current directory
@@ -123,12 +121,9 @@ When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfa
123
121
 
124
122
  If it finds a `.pdfalyzer` file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
125
123
 
126
- ### Environment Variables
124
+ #### Environment Variables
127
125
  Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
128
126
 
129
- ### Colors And Themes
130
- Run `pdfalyzer_show_color_theme` to see the color theme employed.
131
-
132
127
  ### Guarantees
133
128
  Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
134
129
 
@@ -136,7 +131,22 @@ Warnings will be printed if any PDF object ID between 1 and the `/Size` reported
136
131
  [BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
137
132
 
138
133
 
139
- ## Use As A Code Library
134
+ ## Included Command Line Tools
135
+ The Pdfalyzer comes with a few command line tools for doing stuff with PDFs:
136
+
137
+ * `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
138
+ * `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
139
+ * `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
140
+ * `pdfalyzer_show_color_theme` - Run to see the color theme employed in Pdfalyzer's output.
141
+
142
+ Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
143
+
144
+ ```bash
145
+ pipx install pdfalyzer[extract]
146
+ ```
147
+
148
+
149
+ ## As A Python Library
140
150
  For info about setting up a dev environment see [Contributing](#contributing) below.
141
151
 
142
152
  At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class. Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
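
A minimal sketch of that library usage, assuming only the `Pdfalyzer` attributes documented earlier in this diff (`pdf_tree`, `pdf_filehandle`); 'suspect.pdf' is a placeholder path.

```python
from anytree import LevelOrderIter

from pdfalyzer.pdfalyzer import Pdfalyzer

pdfalyzer = Pdfalyzer("suspect.pdf")

# pdf_tree is the trailer node at the top of the parsed structure; any of
# anytree's iterators and search functions can be pointed at it directly.
for node in LevelOrderIter(pdfalyzer.pdf_tree):
    print(node)

pdfalyzer.pdf_filehandle.close()  # release the handle PyPDF keeps open for seeks
```
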
@@ -247,26 +257,6 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
247
257
 
248
258
 
249
259
  # PDF Resources
250
- ## Included PDF Tools
251
- The Pdfalyzer comes with a few command line tools:
252
-
253
- #### `combine_pdfs`
254
- Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
255
-
256
- #### `extract_pdf_pages`
257
- Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` to see the options.
258
- ![](doc/extract_pages_from_pdf_help.png)
259
-
260
- #### `extract_text_from_pdfs`
261
- Extracts text from a PDF, including applying OCR to all embedded images. Requires that you install The Pdfalyzer's optional dependencies:
262
-
263
- ```bash
264
- pipx install pdfalyzer[extract]
265
- ```
266
-
267
- Run `extract_text_from_pdfs --help` to see the options.
268
-
269
-
270
260
  ## 3rd Party PDF Tools
271
261
  ### Installing Didier Stevens's PDF Analysis Tools
272
262
  Stevens's tools provide comprehensive info about the contents of a PDF, are guaranteed not to trigger the rendering of any malicious content (especially `pdfid.py`), and have been battle tested for well over a decade. It would probably be a good idea to analyze your PDF with his tools before you start working with this one.
@@ -1,28 +1,28 @@
1
1
  .pdfalyzer.example,sha256=sh_qkUBw4hfJia_Dx2wB-fsqJInhx2sSgA7WJz3MHYo,3917
2
- CHANGELOG.md,sha256=DdmNHFTwo2VoFvmWA9htyUGLWvajyXnalNxB9hLwM9I,13042
2
+ CHANGELOG.md,sha256=LEAlcDOgi-BH86Pe66RFDGFgOfHVaZD05veJbCPyBB0,13681
3
3
  LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
4
- pdfalyzer/__init__.py,sha256=2OMrlYT53jvue3ddhKjF6LMbG2ss377neJBVBELwp3I,6118
4
+ pdfalyzer/__init__.py,sha256=3ylD-19PcG1bJ-rMa6ruP06QaM9Q1BitaMOA2ppugM8,6197
5
5
  pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
6
6
  pdfalyzer/binary/binary_scanner.py,sha256=gEHHdQ3lKe1P1441fk2OLO8GxJF7FxF8Tq04ElB8ilU,10748
7
7
  pdfalyzer/config.py,sha256=RBtp3Q6IvOW922rcanB4mVceJEM0BEjNybc6_vC7efY,2122
8
8
  pdfalyzer/decorators/document_model_printer.py,sha256=2tjJItZltukmOD2wjGvl2IsBM7Gug59wMHGLpzGDbV0,2682
9
9
  pdfalyzer/decorators/indeterminate_node.py,sha256=QLJr-nGKih8gPZcIqxLU028OwWWD5VjNHYMUjniwT_k,6586
10
- pdfalyzer/decorators/pdf_file.py,sha256=_v4mIpQXlPZTLRg2Tvv_OP_an-HECXbfzoGuq-hZ5io,10199
10
+ pdfalyzer/decorators/pdf_file.py,sha256=ryAYzzsO8Fw5_ZMoomruW0Bal8pTb5C0VlLOTjdVqNI,10552
11
11
  pdfalyzer/decorators/pdf_object_properties.py,sha256=Il3RObxQ4XUf0Ei-nd4tjJO0LeaxC6u7yFa3cQs_jVY,5485
12
12
  pdfalyzer/decorators/pdf_tree_node.py,sha256=4LReGJUtG8iEcLUQD1jW-yp3xPWsHrC-3Anbkt7XZ3A,11134
13
13
  pdfalyzer/decorators/pdf_tree_verifier.py,sha256=2hVe9APsAWQZ7ra8AGndHQnGWmmxmb3ZwfJHZuLvsvc,4714
14
14
  pdfalyzer/detection/constants/binary_regexes.py,sha256=s69S7uq1v4vBy3ZkKKKt3ClNuFCuQ0ztootUxzlgfFw,1632
15
15
  pdfalyzer/detection/constants/javascript_reserved_keywords.py,sha256=CXXdWskdQa0Hs5wCci2RBVvipgZg34_cLfmkWG4Xcmg,991
16
16
  pdfalyzer/detection/javascript_hunter.py,sha256=_wT2vkKTMlm_RGCjYsmwcmV-ag1qep3EpkHmUw0nWcQ,711
17
- pdfalyzer/detection/yaralyzer_helper.py,sha256=_Bkw2JTt3MeD86VOK39C06hn9lNDCc_8ZKLVMEvrwvQ,2215
17
+ pdfalyzer/detection/yaralyzer_helper.py,sha256=bfIa18f0zdUoOAJIQcwVDjF52sJNV47NdsDasg01uiQ,2194
18
18
  pdfalyzer/font_info.py,sha256=2R85iETY_1eKCeRrkqeIxfPDqXZyWfCNcHx_-aTyF0s,6682
19
19
  pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
20
- pdfalyzer/helpers/filesystem_helper.py,sha256=onXhSMhxo0YkvdKdosRwUo_RGdW6yNzZF5hfjgZ3GBE,5085
21
- pdfalyzer/helpers/image_helper.py,sha256=QjoAUcKKEtpmuEyOmEfmaUN-lzNykQ1SzqgNn9-R4Y0,1120
20
+ pdfalyzer/helpers/filesystem_helper.py,sha256=QCdUcZlufBWaV38LlrQEGUGOGUtZEozMVSRkoTjwKlU,5046
21
+ pdfalyzer/helpers/image_helper.py,sha256=mDiscZZ7yrsFa-bxFqIEz9gH3WGhz8455yhXd4_QfAY,1134
22
22
  pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
23
23
  pdfalyzer/helpers/pdf_object_helper.py,sha256=65BlUgnDM9brxJFw_WF8QLomWHaNh-pj88NWoxcMkoQ,1160
24
24
  pdfalyzer/helpers/rich_text_helper.py,sha256=Q5Zj0I96ymQmDWHkOX4lWEvkizOMMgzYNx4CF35t_7w,3561
25
- pdfalyzer/helpers/string_helper.py,sha256=YAjZy7KY6Ys_bb_YkUiUGQryONwsOE88LLGNgyWJ62o,2405
25
+ pdfalyzer/helpers/string_helper.py,sha256=zl7VnxqkaB50Zv1yQoz-ShVcLT2_nOgmxekWTpXHyx4,2521
26
26
  pdfalyzer/output/character_mapping.py,sha256=UN66b4BjvJiokBCi2kregiQvi6u2l1BJcHYFGG_G43M,2190
27
27
  pdfalyzer/output/layout.py,sha256=U9n5RnwwBg2UXxRBAc4E2gQ9t3dNsmiu62klz-Ig1Zg,2767
28
28
  pdfalyzer/output/pdfalyzer_presenter.py,sha256=TUsMc2GTUDjFzIGk7Ep5ZASfXcKX_WNtZzZKbQTHcfY,8580
@@ -33,12 +33,13 @@ pdfalyzer/output/tables/font_summary_table.py,sha256=TyCwcvqn99LXTWnmtk6MBPdc_33
33
33
  pdfalyzer/output/tables/pdf_node_rich_table.py,sha256=7G-FLb_EUP50kZmYCTbo8Q6taU4xKp2QIGNOnQtYbNg,5908
34
34
  pdfalyzer/output/tables/stream_objects_table.py,sha256=PgQj8oTtW5_X8SMQb3FvCWDS-d4Zl6QiE44Qhiv7lTY,706
35
35
  pdfalyzer/pdf_object_relationship.py,sha256=tOJTp73m82oNZJB7NxvTLv167kqthH8PRbVnev_4uEk,5291
36
- pdfalyzer/pdfalyzer.py,sha256=T5U8MZRlL0Kn-GJVqfIIoL7eBUdQeteu50VhQ-IoYh0,11214
36
+ pdfalyzer/pdfalyzer.py,sha256=iu4D3Y9qlKP0D_k883ji4U6LLzelQkHONlzAed0QUx4,12713
37
37
  pdfalyzer/util/adobe_strings.py,sha256=eF4K1RhhR_qgMBT58MzCxWkqQj17OWLCNmG3SjZ9BUs,5045
38
- pdfalyzer/util/argument_parser.py,sha256=2aYoW0ZILRSQkEOCaDwrZYmge5QI5tORhNm03rA0my8,15574
38
+ pdfalyzer/util/argument_parser.py,sha256=9ixQMEZz00IPK8xRxuS7DMrQgXIrqhB2ve5W8XTZ1S8,9732
39
+ pdfalyzer/util/cli_tools_argument_parser.py,sha256=HyZhztyrPtbvOswmG975M0tK5KPon37lV3fxVA0OwYo,6277
39
40
  pdfalyzer/util/debugging.py,sha256=hjYGxptJmal9TTaAUkkoo0oNu2tdx6ZYSyC0WjvzHh0,156
40
41
  pdfalyzer/util/exceptions.py,sha256=XLFFTdx1n6i_VCmvuzvIOCa-djJvGEitfo9lhy3zq0k,98
41
- pdfalyzer/util/page_range.py,sha256=zsHPw9p4QGlx5YEdssntY8HLEZIvBoQrS8Y8V87t5sA,1770
42
+ pdfalyzer/util/page_range.py,sha256=NMNh3_TojxTxBIpvUYK1AmvID_m8qOP6AihZrLWZF2I,1652
42
43
  pdfalyzer/util/pdf_parser_manager.py,sha256=FVRYAYsCd0y5MAm--qvXnwCZnDtB3x85FdJtb-gpyw4,3109
43
44
  pdfalyzer/yara_rules/PDF.yara,sha256=70JzPq5F6AS8F46Seu6u0j5GS1JHxkS42r7g7PVSpRg,81489
44
45
  pdfalyzer/yara_rules/PDF_binary_stream.yara,sha256=Qt0Wd7RFXYiHaT9YxTCrhC68ccmFcEG1XMNC3p5IwcI,821
@@ -46,8 +47,8 @@ pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
46
47
  pdfalyzer/yara_rules/didier_stevens.yara,sha256=4XhqafU09xzYUP7LCygHHBXOpAXUblJf6Tkn37MUy0w,7253
47
48
  pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
48
49
  pdfalyzer/yara_rules/pdf_malware.yara,sha256=jDqSTP5BQSi2I_1xZiFZdy68I4oVWDat2j08-qdfbto,91063
49
- pdfalyzer-1.17.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
50
- pdfalyzer-1.17.0.dist-info/METADATA,sha256=MLXdtDxLIbFC4V2RlW9VKhHb7MEWgcF_3_o4cdlN-94,27337
51
- pdfalyzer-1.17.0.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
52
- pdfalyzer-1.17.0.dist-info/entry_points.txt,sha256=goHVADdqEFcniu4O0k7kabc2rLf3wvRrENJK6c9IkUw,249
53
- pdfalyzer-1.17.0.dist-info/RECORD,,
50
+ pdfalyzer-1.17.7.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
51
+ pdfalyzer-1.17.7.dist-info/METADATA,sha256=Cbd6Qu3SS8xGKrC__jEPG-74nnYvY0rJu9pirLiqrFQ,27328
52
+ pdfalyzer-1.17.7.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
53
+ pdfalyzer-1.17.7.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
54
+ pdfalyzer-1.17.7.dist-info/RECORD,,
@@ -1,7 +1,7 @@
1
1
  [console_scripts]
2
2
  combine_pdfs=pdfalyzer:combine_pdfs
3
3
  extract_pdf_pages=pdfalyzer:extract_pdf_pages
4
- extract_text_from_pdfs=pdfalyzer:extract_text_from_pdfs
4
+ extract_pdf_text=pdfalyzer:extract_pdf_text
5
5
  pdfalyze=pdfalyzer:pdfalyze
6
6
  pdfalyzer_show_color_theme=pdfalyzer:pdfalyzer_show_color_theme
7
7