pdfalyzer 1.15.0__py3-none-any.whl → 1.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pdfalyzer might be problematic. Click here for more details.

CHANGELOG.md CHANGED
@@ -1,5 +1,12 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ # 1.16.0
4
+ * Upgrade `PyPDF2` 2.x to `pypdf` 5.0.1 (new name, same package)
5
+ * Add `--image-quality` option to `combine_pdfs` tool
6
+
7
+ ### 1.15.1
8
+ * Add `--no-default-yara-rules` command line option so users can use _only_ their own custom YARA rules files if they want. Previously you could only use custom YARA rules _in addition to_ the default rules; now you can just skip the default rules.
9
+
3
10
  # 1.15.0
4
11
  * Add `combine_pdfs` command line script to merge a bunch of PDFs into one
5
12
  * Remove unused `Deprecated` dependency
pdfalyzer/__init__.py CHANGED
@@ -4,9 +4,8 @@ from os import environ, getcwd, path
4
4
  from pathlib import Path
5
5
 
6
6
  from dotenv import load_dotenv
7
- # TODO: PdfMerger is deprecated in favor of PdfWriter at v3.9.1 (see https://pypdf.readthedocs.io/en/latest/user/merging-pdfs.html#basic-example)
8
- from PyPDF2 import PdfMerger
9
- from PyPDF2.errors import PdfReadError
7
+ from pypdf import PdfWriter
8
+ from pypdf.errors import PdfReadError
10
9
 
11
10
  # Should be first local import before load_dotenv() (or at least I think it needs to come first)
12
11
  from pdfalyzer.config import PdfalyzerConfig
@@ -31,7 +30,8 @@ from pdfalyzer.helpers.rich_text_helper import print_highlighted
31
30
  from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
32
31
  from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
33
32
  from pdfalyzer.pdfalyzer import Pdfalyzer
34
- from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments, parse_combine_pdfs_args
33
+ from pdfalyzer.util.argument_parser import (MAX_QUALITY, ask_to_proceed, output_sections, parse_arguments,
34
+ parse_combine_pdfs_args)
35
35
  from pdfalyzer.util.pdf_parser_manager import PdfParserManager
36
36
 
37
37
  # For the table shown by running pdfalyzer_show_color_theme
@@ -51,6 +51,7 @@ def pdfalyze():
51
51
  log_and_print(f"Binary stream extraction complete, files written to '{args.output_dir}'.\nExiting.\n")
52
52
  sys.exit()
53
53
 
54
+ # Each selected argument name is paired with the method that prints that section. See 'possible_output_sections' list in argument_parser.py
54
55
  # Analysis exports wrap themselves around the methods that actually generate the analyses
55
56
  for (arg, method) in output_sections(args, pdfalyzer):
56
57
  if args.output_dir:
@@ -92,10 +93,13 @@ def pdfalyzer_show_color_theme() -> None:
92
93
 
93
94
 
94
95
  def combine_pdfs():
95
- """Utility method to combine multiple PDFs into one. Invocable with 'combine_pdfs PDF1 [PDF2...]'."""
96
+ """
97
+ Utility method to combine multiple PDFs into one. Invocable with 'combine_pdfs PDF1 [PDF2...]'.
98
+ Example: https://github.com/py-pdf/pypdf/blob/main/docs/user/merging-pdfs.md
99
+ """
96
100
  args = parse_combine_pdfs_args()
97
101
  set_max_open_files(args.number_of_pdfs)
98
- merger = PdfMerger()
102
+ merger = PdfWriter()
99
103
 
100
104
  for pdf in args.pdfs:
101
105
  try:
@@ -105,18 +109,19 @@ def combine_pdfs():
105
109
  print_highlighted(f" -> Failed to merge '{pdf}'! {e}", style='red')
106
110
  ask_to_proceed()
107
111
 
108
- if args.compression_level == 0:
109
- print_highlighted("\nSkipping content stream compression...")
110
- else:
111
- print_highlighted(f"\nCompressing content streams with zlib level {args.compression_level}...")
112
+ # Iterate through pages and compress, lowering image quality if requested
113
+ # See https://pypdf.readthedocs.io/en/latest/user/file-size.html#reducing-image-quality
114
+ for i, page in enumerate(merger.pages):
115
+ if args.image_quality < MAX_QUALITY:
116
+ for j, img in enumerate(page.images):
117
+ print_highlighted(f" -> Reducing image #{j + 1} quality on page {i + 1} to {args.image_quality}...", style='dim')
118
+ img.replace(img.image, quality=args.image_quality)
112
119
 
113
- for i, page in enumerate(merger.pages):
114
- # TODO: enable image quality reduction + zlib level once PyPDF is upgraded to 4.x and option is available
115
- # See https://pypdf.readthedocs.io/en/latest/user/file-size.html#reducing-image-quality
116
- print_highlighted(f" -> Compressing page {i + 1}...", style='dim')
117
- page.pagedata.compress_content_streams() # This is CPU intensive!
120
+ print_highlighted(f" -> Compressing page {i + 1}...", style='dim')
121
+ page.compress_content_streams() # This is CPU intensive!
118
122
 
119
123
  print_highlighted(f"\nWriting '{args.output_file}'...", style='cyan')
124
+ merger.compress_identical_objects(remove_identicals=True, remove_orphans=True)
120
125
  merger.write(args.output_file)
121
126
  merger.close()
122
127
  txt = Text('').append(f" -> Wrote ")
@@ -3,7 +3,7 @@ Deprecated old, pre-tree, more rawformat reader. Only used for debugging these d
3
3
  """
4
4
  from io import StringIO
5
5
 
6
- from PyPDF2.generic import ArrayObject, DictionaryObject, IndirectObject
6
+ from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject
7
7
  from rich.console import Console
8
8
  from rich.markup import escape
9
9
 
@@ -1,9 +1,9 @@
1
1
  """
2
- Decorator for PyPDF2 PdfObject that extracts a couple of properties (type, label, etc).
2
+ Decorator for PyPDF PdfObject that extracts a couple of properties (type, label, etc).
3
3
  """
4
4
  from typing import Any, List, Optional, Union
5
5
 
6
- from PyPDF2.generic import DictionaryObject, IndirectObject, NumberObject, PdfObject
6
+ from pypdf.generic import DictionaryObject, IndirectObject, NumberObject, PdfObject
7
7
  from rich.text import Text
8
8
  from yaralyzer.util.logging import log
9
9
 
@@ -9,8 +9,8 @@ hooks)
9
9
  from typing import Callable, List, Optional, Set
10
10
 
11
11
  from anytree import NodeMixin, SymlinkNode
12
- from PyPDF2.errors import PdfReadError
13
- from PyPDF2.generic import IndirectObject, PdfObject, StreamObject
12
+ from pypdf.errors import PdfReadError
13
+ from pypdf.generic import IndirectObject, PdfObject, StreamObject
14
14
  from rich.markup import escape
15
15
  from rich.text import Text
16
16
  from yaralyzer.output.rich_console import console
@@ -41,7 +41,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
41
41
  self.stream_data = self.obj.get_data()
42
42
  self.stream_length = len(self.stream_data)
43
43
  except (NotImplementedError, PdfReadError) as e:
44
- msg = f"PyPDF2 failed to decode stream in {self}: {e}.\n" + \
44
+ msg = f"PyPDF failed to decode stream in {self}: {e}.\n" + \
45
45
  "Trees will be unaffected but scans/extractions will not be able to check this stream."
46
46
  console.print_exception()
47
47
  log.warning(msg)
@@ -1,8 +1,8 @@
1
1
  """
2
2
  Verify that the PDF tree is complete/contains all the nodes in the PDF file.
3
3
  """
4
- from PyPDF2.errors import PdfReadError
5
- from PyPDF2.generic import IndirectObject, NameObject, NumberObject
4
+ from pypdf.errors import PdfReadError
5
+ from pypdf.generic import IndirectObject, NameObject, NumberObject
6
6
  from rich.markup import escape
7
7
  from yaralyzer.output.rich_console import console
8
8
  from yaralyzer.util.logging import log
@@ -8,6 +8,8 @@ from typing import Optional, Union
8
8
  from yaralyzer.config import YaralyzerConfig
9
9
  from yaralyzer.yaralyzer import Yaralyzer
10
10
 
11
+ from pdfalyzer.config import PdfalyzerConfig
12
+
11
13
  YARA_RULES_DIR = files('pdfalyzer').joinpath('yara_rules')
12
14
 
13
15
  YARA_RULES_FILES = [
@@ -32,8 +34,11 @@ def _build_yaralyzer(scannable: Union[bytes, str], label: Optional[str] = None)
32
34
  with as_file(YARA_RULES_DIR.joinpath(YARA_RULES_FILES[0])) as yara0:
33
35
  with as_file(YARA_RULES_DIR.joinpath(YARA_RULES_FILES[1])) as yara1:
34
36
  with as_file(YARA_RULES_DIR.joinpath(YARA_RULES_FILES[2])) as yara2:
35
- rules_paths = [str(y) for y in [yara0, yara1, yara2]]
36
- rules_paths += YaralyzerConfig.args.yara_rules_files or []
37
+ # Start with any custom rules files from --yara-file; the bundled yara_rules/ files are added below unless --no-default-yara-rules is set
38
+ rules_paths = YaralyzerConfig.args.yara_rules_files or []
39
+
40
+ if not YaralyzerConfig.args.no_default_yara_rules:
41
+ rules_paths += [str(y) for y in [yara0, yara1, yara2]]
37
42
 
38
43
  try:
39
44
  return Yaralyzer.for_rules_files(rules_paths, scannable, label)
pdfalyzer/font_info.py CHANGED
@@ -3,8 +3,8 @@ Unify font information spread across a bunch of PdfObjects (Font, FontDescriptor
3
3
  and FontFile) into a single class.
4
4
  """
5
5
 
6
- from PyPDF2._cmap import build_char_map, prepare_cm
7
- from PyPDF2.generic import IndirectObject, PdfObject
6
+ from pypdf._cmap import build_char_map, prepare_cm
7
+ from pypdf.generic import IndirectObject, PdfObject
8
8
  from rich.text import Text
9
9
  from yaralyzer.output.rich_console import console
10
10
  from yaralyzer.util.logging import log
@@ -1,9 +1,9 @@
1
1
  """
2
- Some methods to help with the direct manipulation/processing of PyPDF2's PdfObjects
2
+ Some methods to help with the direct manipulation/processing of PyPDF's PdfObjects
3
3
  """
4
4
  from typing import List, Optional
5
5
 
6
- from PyPDF2.generic import IndirectObject, PdfObject
6
+ from pypdf.generic import IndirectObject, PdfObject
7
7
 
8
8
  from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
9
9
  from pdfalyzer.util.adobe_strings import *
@@ -24,7 +24,7 @@ def _sort_pdf_object_refs(refs: List[PdfObjectRelationship]) -> List[PdfObjectRe
24
24
 
25
25
 
26
26
  def pypdf_class_name(obj: PdfObject) -> str:
27
- """Shortened name of type(obj), e.g. PyPDF2.generic._data_structures.ArrayObject becomes Array"""
27
+ """Shortened name of type(obj), e.g. pypdf.generic._data_structures.ArrayObject becomes Array"""
28
28
  class_pkgs = type(obj).__name__.split('.')
29
29
  class_pkgs.reverse()
30
30
  return class_pkgs[0].removesuffix('Object')
@@ -4,7 +4,7 @@ Functions for miscellaneous Rich text/string operations.
4
4
  from functools import partial
5
5
  from typing import List
6
6
 
7
- from PyPDF2.generic import PdfObject
7
+ from pypdf.generic import PdfObject
8
8
  from rich.console import Console
9
9
  from rich.highlighter import RegexHighlighter, JSONHighlighter
10
10
  from rich.text import Text
@@ -12,13 +12,13 @@ from pdfalyzer.helpers.rich_text_helper import quoted_text
12
12
  from pdfalyzer.helpers.string_helper import pp
13
13
  from pdfalyzer.output.layout import print_headline_panel, subheading_width
14
14
 
15
- CHARMAP_TITLE = 'Character Mapping (As Extracted By PyPDF2)'
15
+ CHARMAP_TITLE = 'Character Mapping (As Extracted By PyPDF)'
16
16
  CHARMAP_TITLE_PADDING = (1, 0, 0, 2)
17
17
  CHARMAP_PADDING = (0, 2, 0, 10)
18
18
 
19
19
 
20
20
  def print_character_mapping(font: 'FontInfo') -> None:
21
- """Prints the character mapping extracted by PyPDF2._charmap in tidy columns"""
21
+ """Prints the character mapping extracted by pypdf._cmap in tidy columns"""
22
22
  if font.character_mapping is None or len(font.character_mapping) == 0:
23
23
  log.info(f"No character map found in {font}")
24
24
  return
@@ -38,12 +38,12 @@ def print_character_mapping(font: 'FontInfo') -> None:
38
38
 
39
39
 
40
40
  def print_prepared_charmap(font: 'FontInfo'):
41
- """Prints the prepared_charmap returned by PyPDF2"""
41
+ """Prints the prepared_charmap returned by PyPDF."""
42
42
  if font.prepared_char_map is None:
43
43
  log.info(f"No prepared_charmap found in {font}")
44
44
  return
45
45
 
46
- headline = f"{font} Adobe PostScript charmap prepared by PyPDF2"
46
+ headline = f"{font} Adobe PostScript charmap prepared by PyPDF"
47
47
  print_headline_panel(headline, style='charmap.prepared_title')
48
48
  print_bytes(font.prepared_char_map, style='charmap.prepared')
49
49
  console.line()
@@ -47,7 +47,7 @@ class PdfalyzerPresenter:
47
47
  def print_document_info(self) -> None:
48
48
  """Print the embedded document info (author, timestamps, version, etc)."""
49
49
  print_section_header(f'Document Info for {self.pdfalyzer.pdf_basename}')
50
- console.print(pp.pformat(self.pdfalyzer.pdf_reader.getDocumentInfo()))
50
+ console.print(pp.pformat(self.pdfalyzer.pdf_reader.metadata))
51
51
  console.line()
52
52
  console.print(bytes_hashes_table(self.pdfalyzer.pdf_bytes, self.pdfalyzer.pdf_basename))
53
53
  console.line()
@@ -124,7 +124,7 @@ class PdfalyzerPresenter:
124
124
  console.print(build_decoding_stats_table(binary_scanner), justify='center')
125
125
 
126
126
  def print_yara_results(self) -> None:
127
- """Scan the overall PDF and each individual binary stream in it with yara_rules/ files"""
127
+ """Scan the main PDF and each individual binary stream in it with yara_rules/*.yara files"""
128
128
  print_section_header(f"YARA Scan of PDF rules for '{self.pdfalyzer.pdf_basename}'")
129
129
  YaralyzerConfig.args.standalone_mode = True # TODO: using 'standalone mode' like this kind of sucks
130
130
 
@@ -6,7 +6,7 @@ from collections import namedtuple
6
6
  from numbers import Number
7
7
  from typing import Any
8
8
 
9
- from PyPDF2.generic import (ArrayObject, ByteStringObject, EncodedStreamObject, IndirectObject,
9
+ from pypdf.generic import (ArrayObject, ByteStringObject, EncodedStreamObject, IndirectObject,
10
10
  StreamObject, TextStringObject)
11
11
  from yaralyzer.output.rich_console import YARALYZER_THEME_DICT
12
12
 
@@ -5,7 +5,7 @@ from collections import namedtuple
5
5
  from typing import List, Optional
6
6
 
7
7
  from anytree import SymlinkNode
8
- from PyPDF2.generic import StreamObject
8
+ from pypdf.generic import StreamObject
9
9
  from rich.markup import escape
10
10
  from rich.panel import Panel
11
11
  from rich.table import Table
@@ -3,7 +3,7 @@ Simple container class for information about a link between two PDF objects.
3
3
  """
4
4
  from typing import List, Optional, Union
5
5
 
6
- from PyPDF2.generic import IndirectObject, PdfObject
6
+ from pypdf.generic import IndirectObject, PdfObject
7
7
  from yaralyzer.util.logging import log
8
8
 
9
9
  from pdfalyzer.helpers.string_helper import bracketed, is_prefixed_by_any
pdfalyzer/pdfalyzer.py CHANGED
@@ -11,8 +11,8 @@ from typing import Dict, Iterator, List, Optional
11
11
 
12
12
  from anytree import LevelOrderIter, SymlinkNode
13
13
  from anytree.search import findall, findall_by_attr
14
- from PyPDF2 import PdfReader
15
- from PyPDF2.generic import IndirectObject
14
+ from pypdf import PdfReader
15
+ from pypdf.generic import IndirectObject
16
16
  from yaralyzer.helpers.file_helper import load_binary_data
17
17
  from yaralyzer.output.file_hashes_table import compute_file_hashes
18
18
  from yaralyzer.output.rich_console import console
@@ -36,7 +36,7 @@ class Pdfalyzer:
36
36
  self.pdf_basename = basename(pdf_path)
37
37
  self.pdf_bytes = load_binary_data(pdf_path)
38
38
  self.pdf_bytes_info = compute_file_hashes(self.pdf_bytes)
39
- pdf_file = open(pdf_path, 'rb') # Filehandle must be left open for PyPDF2 to perform seeks
39
+ pdf_file = open(pdf_path, 'rb') # Filehandle must be left open for PyPDF to perform seeks
40
40
  self.pdf_reader = PdfReader(pdf_file)
41
41
 
42
42
  # Initialize tracking variables
@@ -2,8 +2,8 @@
2
2
  String constants specified in the Adobe specs for PDFs, fonts, etc.
3
3
  """
4
4
 
5
- from PyPDF2.constants import (CatalogDictionary, ImageAttributes, PageAttributes,
6
- PagesAttributes, Ressources as Resources)
5
+ from pypdf.constants import (CatalogDictionary, ImageAttributes, PageAttributes,
6
+ PagesAttributes, Resources)
7
7
 
8
8
  from pdfalyzer.helpers.string_helper import is_prefixed_by_any
9
9
 
@@ -9,7 +9,7 @@ from typing import List
9
9
  from rich_argparse_plus import RichHelpFormatterPlus
10
10
  from rich.prompt import Confirm
11
11
  from rich.text import Text
12
- from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args
12
+ from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args, source
13
13
  from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation
14
14
 
15
15
  from pdfalyzer.config import ALL_STREAMS, PdfalyzerConfig
@@ -50,8 +50,13 @@ export.add_argument('-bin', '--extract-binary-streams',
50
50
  const='bin',
51
51
  help='extract all binary streams in the PDF to separate files (requires pdf-parser.py)')
52
52
 
53
+ # Add one more option to the YARA rules section
54
+ source.add_argument('--no-default-yara-rules',
55
+ action='store_true',
56
+ help='if --yara is selected use only custom rules from --yara-file arg and not the default included YARA rules')
53
57
 
54
- # Note that we extend the yaralyzer's parser and export
58
+
59
+ # Note that we extend the yaralyzer's parser and export
55
60
  parser = ArgumentParser(
56
61
  formatter_class=RichHelpFormatterPlus,
57
62
  description=DESCRIPTION,
@@ -78,7 +83,7 @@ select.add_argument('-f', '--fonts', action='store_true',
78
83
  help="show info about fonts included character mappings for embedded font binaries")
79
84
 
80
85
  select.add_argument('-y', '--yara', action='store_true',
81
- help="scan the PDF with YARA rules")
86
+ help="scan the PDF with the included malicious PDF YARA rules and/or your custom YARA rules")
82
87
 
83
88
  select.add_argument('-c', '--counts', action='store_true',
84
89
  help='show counts of some of the properties of the objects in the PDF')
@@ -127,10 +132,13 @@ def parse_arguments():
127
132
 
128
133
  if not args.streams:
129
134
  if args.extract_quoteds:
130
- raise ArgumentError(None, "--extract-quoted does nothing if --streams is not selected")
135
+ exit_with_error("--extract-quoted does nothing if --streams is not selected")
131
136
  if args.suppress_boms:
132
137
  log.warning("--suppress-boms has nothing to suppress if --streams is not selected")
133
138
 
139
+ if args.no_default_yara_rules and not args.yara_rules_files:
140
+ exit_with_error("--no-default-yara-rules requires at least one --yara-file argument")
141
+
134
142
  # File export options
135
143
  if args.export_svg or args.export_txt or args.export_html or args.extract_binary_streams:
136
144
  args.output_dir = args.output_dir or getcwd()
@@ -149,8 +157,8 @@ def parse_arguments():
149
157
 
150
158
  def output_sections(args, pdfalyzer) -> List[OutputSection]:
151
159
  """
152
- Determine which of the tree visualizations, font scans, etc were requested.
153
- If nothing was specified the default is to output all sections.
160
+ Determine which of the tree visualizations, font scans, etc should be run.
161
+ If nothing is specified, output ALL sections except --streams, which is very slow and verbose.
154
162
  """
155
163
  # Create a partial for print_font_info() because it's the only one that can take an argument
156
164
  # partials have no __name__ so update_wrapper() propagates the 'print_font_info' as this partial's name
@@ -158,7 +166,8 @@ def output_sections(args, pdfalyzer) -> List[OutputSection]:
158
166
  stream_scan = partial(pdfalyzer.print_streams_analysis, idnum=stream_id)
159
167
  update_wrapper(stream_scan, pdfalyzer.print_streams_analysis)
160
168
 
161
- # The first element string matches the argument in 'select' group.
169
+ # 1st element string matches the argument in 'select' group
170
+ # 2nd is fxn to call if selected.
162
171
  # Top to bottom is the default order of output.
163
172
  possible_output_sections = [
164
173
  OutputSection(DOCINFO, pdfalyzer.print_document_info),
@@ -187,6 +196,8 @@ def all_sections_chosen(args):
187
196
  ###############################################
188
197
  # Separate arg parser for combine_pdfs script #
189
198
  ###############################################
199
+ MAX_QUALITY = 10
200
+
190
201
  combine_pdfs_parser = ArgumentParser(
191
202
  description="Combine multiple PDFs into one.",
192
203
  epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" \
@@ -198,10 +209,10 @@ combine_pdfs_parser.add_argument('pdfs',
198
209
  metavar='PDF_PATH',
199
210
  nargs='+')
200
211
 
201
- combine_pdfs_parser.add_argument('-c', '--compression-level',
202
- help='zlib image compression level (0=none, max=1 until PyPDF is upgraded)',
203
- choices=range(0, 2),
204
- default=1,
212
+ combine_pdfs_parser.add_argument('-iq', '--image-quality',
213
+ help='image quality for embedded images (can compress PDF at loss of quality)',
214
+ choices=range(1, MAX_QUALITY + 1),
215
+ default=MAX_QUALITY,
205
216
  type=int)
206
217
 
207
218
  combine_pdfs_parser.add_argument('-o', '--output-file',
@@ -246,7 +257,7 @@ def ask_to_proceed() -> None:
246
257
  def exit_with_error(error_message: str|None = None) -> None:
247
258
  """Print 'error_message' and exit with status code 1."""
248
259
  if error_message:
249
- print_highlighted(error_message, style='bold red')
260
+ print_highlighted(Text('').append('ERROR', style='bold red').append(f': {error_message}'))
250
261
 
251
- print_highlighted('Exiting...', style='red')
262
+ print_highlighted('Exiting...', style='dim red')
252
263
  sys.exit(1)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pdfalyzer
3
- Version: 1.15.0
3
+ Version: 1.16.0
4
4
  Summary: A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
5
5
  Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
6
6
  License: GPL-3.0-or-later
@@ -16,9 +16,9 @@ Classifier: Programming Language :: Python :: 3.11
16
16
  Classifier: Topic :: Artistic Software
17
17
  Classifier: Topic :: Scientific/Engineering :: Visualization
18
18
  Classifier: Topic :: Security
19
- Requires-Dist: PyPDF2 (>=2.10,<3.0)
20
19
  Requires-Dist: anytree (>=2.8,<3.0)
21
20
  Requires-Dist: chardet (>=5.0.0,<6.0.0)
21
+ Requires-Dist: pypdf (>=5.0.1,<6.0.0)
22
22
  Requires-Dist: python-dotenv (>=0.21.0,<0.22.0)
23
23
  Requires-Dist: rich (>=12.5.1,<13.0.0)
24
24
  Requires-Dist: rich-argparse-plus (>=0.3.1,<0.4.0)
@@ -71,7 +71,7 @@ Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `
71
71
  pipx install pdfalyzer
72
72
  ```
73
73
 
74
- See [PyPDF2 installation notes](https://github.com/py-pdf/PyPDF2#installation) about `PyCryptodome` if you plan to `pdfalyze` any files that use AES encryption.
74
+ See [PyPDF installation notes](https://github.com/py-pdf/pypdf#installation) about `PyCryptodome` if you plan to `pdfalyze` any files that use AES encryption.
75
75
 
76
76
  If you are on macOS someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so `brew install pdfalyzer` should work.
77
77
 
@@ -123,7 +123,7 @@ Warnings will be printed if any PDF object ID between 1 and the `/Size` reported
123
123
  ## Use As A Code Library
124
124
  For info about setting up a dev environment see [Contributing](#contributing) below.
125
125
 
126
- At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF2](https://github.com/py-pdf/PyPDF2) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class. Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
126
+ At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class. Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
127
127
 
128
128
  As far as The Pdfalyzer's unique functionality goes, [`Pdfalyzer`](pdfalyzer/pdfalyzer.py) is the class at the heart of the operation. It holds the PDF's logical tree as well as a few other data structures. Chief among these are the [`FontInfo`](pdfalyzer/font_info.py) class which pulls together various properties of a font strewn across 3 or 4 different PDF objects and the [`BinaryScanner`](pdfalyzer/binary/binary_scanner.py) class which lets you dig through the embedded streams' bytes looking for suspicious patterns.
129
129
 
@@ -192,7 +192,7 @@ This image shows a more in-depth view of of the PDF tree for the same document s
192
192
 
193
193
  ## Fonts
194
194
 
195
- #### **Extract character mappings from ancient Adobe font formats**. It's actually `PyPDF2` doing the lifting here but we're happy to take the credit.
195
+ #### **Extract character mappings from ancient Adobe font formats**. It's actually `PyPDF` doing the lifting here but we're happy to take the credit.
196
196
 
197
197
  ![](https://github.com/michelcrypt4d4mus/pdfalyzer/raw/master/doc/svgs/rendered_images/font_character_mapping.png)
198
198
 
@@ -275,7 +275,7 @@ scripts/install_t1utils.sh
275
275
  ## Did The World Really Need Another PDF Tool?
276
276
  This tool was built to fill a gap in the PDF assessment landscape following [my own recent experience trying to find malicious content in a PDF file](https://twitter.com/Cryptadamist/status/1570167937381826560). Didier Stevens's [pdfid.py](https://github.com/DidierStevens/DidierStevensSuite/blob/master/pdfid.py) and [pdf-parser.py](https://github.com/DidierStevens/DidierStevensSuite/blob/master/pdf-parser.py) are still the best game in town when it comes to PDF analysis tools but they lack in the visualization department and also don't give you much to work with as far as giving you a data model you can write your own code around. [Peepdf](https://github.com/jesparza/peepdf) seemed promising but turned out to be in a buggy, out of date, and more or less unfixable state. And neither of them offered much in the way of tooling for embedded binary analysis.
277
277
 
278
- Thus I felt the world might be slightly improved if I strung together a couple of more stable/well known/actively maintained open source projects ([AnyTree](https://github.com/c0fec0de/anytree), [PyPDF2](https://github.com/py-pdf/PyPDF2), [Rich](https://github.com/Textualize/rich), and [YARA](https://github.com/VirusTotal/yara-python) via [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer)) into this tool.
278
+ Thus I felt the world might be slightly improved if I strung together a couple of more stable/well known/actively maintained open source projects ([AnyTree](https://github.com/c0fec0de/anytree), [PyPDF](https://github.com/py-pdf/pypdf), [Rich](https://github.com/Textualize/rich), and [YARA](https://github.com/VirusTotal/yara-python) via [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer)) into this tool.
279
279
 
280
280
  -------------
281
281
 
@@ -289,7 +289,7 @@ These are the naming conventions at play in The Pdfalyzer code base:
289
289
 
290
290
  | Term | Meaning |
291
291
  | ----------------- | ---------------- |
292
- | **`PDF Object`** | Instance of a `PyPDF2` class that represents the information stored in the PDF binary between open and close guillemet quotes (« and ») |
292
+ | **`PDF Object`** | Instance of a `PyPDF` class that represents the information stored in the PDF binary between open and close guillemet quotes (« and ») |
293
293
  | **`reference_key`** | String found in a PDF object that names a property (e.g. `/BaseFont` or `/Subtype`) |
294
294
  | **`reference`** | Link _from_ a PDF object _to_ another node. Outward facing relationships, basically. |
295
295
  | **`address`** | `reference_key` plus a hash key or numerical array index if that's how the reference works. e.g. if node A has a reference key `/Resources` pointing to a dict `{'/Font2': [IndirectObject(55), IndirectObject(2)]}` the address of `IndirectObject(55)` from node A would be `/Resources[/Font2][0]` |
@@ -300,11 +300,10 @@ These are the naming conventions at play in The Pdfalyzer code base:
300
300
  | **`link_node`** | nodes like `/Dest` that just contain a pointer to another node |
301
301
 
302
302
  ### Reference
303
- * [`PyPDF2 2.12.0` documentation](https://pypdf2.readthedocs.io/en/2.12.0/) (latest is 4.x or something so these are the relevant docs for `pdfalyze`)
303
+ * [`PyPDF` documentation](https://pypdf.readthedocs.io/en/stable/) (`pdfalyze` depends on `pypdf` 5.x, so the stable docs are the relevant ones)
304
304
 
305
305
 
306
306
  # TODO
307
- * Upgrade `PyPDF` to latest and expand `combine_pdfs` compression command line option
308
307
  * Highlight decodes with a lot of Javascript keywords
309
308
  * https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
310
309
  * https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
@@ -1,38 +1,38 @@
1
- CHANGELOG.md,sha256=m8-CmkWYmqYxEavDgg4RthSxEadbEkjA3DaUZK1bOxY,11320
1
+ CHANGELOG.md,sha256=ojqG5GrSc6nAN3pkuRMfS2qWhrk_OcFk3kMafbnjqXI,11710
2
2
  LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
3
- pdfalyzer/__init__.py,sha256=NXfWa386xLWfnGK77OdRardOdlx-h4n2wBXtbLE2hnY,5187
3
+ pdfalyzer/__init__.py,sha256=q8qSdGdyUYmTYGOp_d2bRCCFASnlVt4wa-DlBikD5-M,5362
4
4
  pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
5
5
  pdfalyzer/binary/binary_scanner.py,sha256=7NrXx8GB2gpb04oR2bcZJKkOXOlzn2hWpcGlcYMqSfs,10217
6
6
  pdfalyzer/config.py,sha256=oN-pVR037lt3giRsnsm4c8ku5hCW8ChFqYFi9V7w1qU,1918
7
- pdfalyzer/decorators/document_model_printer.py,sha256=VD9N47i7CGuNd7b6OYwYzPtx4-LDsEx9cpQIxFjDzI4,2683
7
+ pdfalyzer/decorators/document_model_printer.py,sha256=2tjJItZltukmOD2wjGvl2IsBM7Gug59wMHGLpzGDbV0,2682
8
8
  pdfalyzer/decorators/indeterminate_node.py,sha256=ivB6dX5aN8W9m0ksXhmUcixnjYjnuE7DARalH-nMjxY,6616
9
- pdfalyzer/decorators/pdf_object_properties.py,sha256=8dqHmi0J2USwnGPSy0Sg_ria_2TsaRWe_HWs-14RKrg,5524
10
- pdfalyzer/decorators/pdf_tree_node.py,sha256=A69k-Wj7g4Y0AgnvFeE-stiNP4ZWNkFaDz3yZitgA4A,10930
11
- pdfalyzer/decorators/pdf_tree_verifier.py,sha256=IRgm7ikdaqJEq66q3JcMZo49XQoONODM7lySioJfxRc,4543
9
+ pdfalyzer/decorators/pdf_object_properties.py,sha256=I7kix5hXNguAH2VW2uINIZRHJ8xYS4JGfc6Aiakyh4c,5522
10
+ pdfalyzer/decorators/pdf_tree_node.py,sha256=sd3a4uQMu_KQ_wvo0pjwQ8K1HI7xGgsGd47eI2IWybY,10927
11
+ pdfalyzer/decorators/pdf_tree_verifier.py,sha256=YC56SQxp5o2zMYgsBPCzX89pCkUHdZ-MCFNIPD9XKRc,4541
12
12
  pdfalyzer/detection/constants/binary_regexes.py,sha256=eFx1VVAOzxKmlacbGgicDCp1fcKgOkQkkzeduGjqLBQ,1594
13
13
  pdfalyzer/detection/constants/javascript_reserved_keywords.py,sha256=CXXdWskdQa0Hs5wCci2RBVvipgZg34_cLfmkWG4Xcmg,991
14
14
  pdfalyzer/detection/javascript_hunter.py,sha256=_wT2vkKTMlm_RGCjYsmwcmV-ag1qep3EpkHmUw0nWcQ,711
15
- pdfalyzer/detection/yaralyzer_helper.py,sha256=hmrnvTVtaX9l4FbXQrtrdXYHaK_IFSTDIuEWBIDPN74,1764
16
- pdfalyzer/font_info.py,sha256=L5ykKvlifAQv2uw-pKqxbQPqWrvbli0IcO8DgDK0SQo,6665
15
+ pdfalyzer/detection/yaralyzer_helper.py,sha256=_l9eJQUtMlo9RhY5h8Xq9gBLxzn1VgJsCA1nCsFDGvo,1999
16
+ pdfalyzer/font_info.py,sha256=0NQ6g4q3pTdirwGjJhur8HkXQlC732cR7IhilO33g2A,6663
17
17
  pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
18
18
  pdfalyzer/helpers/filesystem_helper.py,sha256=wHlFz4DFzPAJt2OzMRrhsjL-O3gLJ02JhuwBRwkE958,4089
19
19
  pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
20
- pdfalyzer/helpers/pdf_object_helper.py,sha256=u0j8B9mY8s5cTGo5LmDcozotvvgZNrwwJ4w_ipQqiXw,1105
21
- pdfalyzer/helpers/rich_text_helper.py,sha256=EkuF1GNQ8F8StZnl2flpI4C8RPvpxUV2aqCIDdjUDj8,2255
20
+ pdfalyzer/helpers/pdf_object_helper.py,sha256=Ija6cWKfFQRXCfZv2ezU1V2v0KFDn9f4ayeX8eG9GmI,1102
21
+ pdfalyzer/helpers/rich_text_helper.py,sha256=s5ytOme8CZCIWAsiPHFlIi6q0KN5qZPBb0OrtTfRkq4,2254
22
22
  pdfalyzer/helpers/string_helper.py,sha256=75EDEFw3UWHvWF32WtvZVBbqYY3ozO4y30dtH2qVMX0,2278
23
- pdfalyzer/output/character_mapping.py,sha256=lKPf-Xw3K3A3h33EOB_B-YaaxuFie7h7PUXCrphuwmw,2095
23
+ pdfalyzer/output/character_mapping.py,sha256=MtC3jKdtMaugi5038fne0T_SFSo9QU4lZl_s7bW7gzI,2092
24
24
  pdfalyzer/output/layout.py,sha256=E58T9Tl6BYZTDsj6ouMr1J5SSUiXa7timUNxnOI2IzI,2149
25
- pdfalyzer/output/pdfalyzer_presenter.py,sha256=-43-4W-Hrbc2FdMjkuAZT3ajtH6cLbOVA5voMw-WeUY,8498
26
- pdfalyzer/output/styles/node_colors.py,sha256=sw-e97iRwAzqBdg0sP_b__9KCe6MbRcgMzQlPL6sCrA,3987
25
+ pdfalyzer/output/pdfalyzer_presenter.py,sha256=CSboSnYFlkgOfwMf3TcoTTJY6FLXJ9OulI9UieSTJeE,8492
26
+ pdfalyzer/output/styles/node_colors.py,sha256=rfsTAUF43K_buw21SZoP6L5c_cLy7S-xA4GUiWJsDkc,3986
27
27
  pdfalyzer/output/styles/rich_theme.py,sha256=Y8QmuINlyZNIHvf3oD0CV3w2dC49NNKtvOChvudDCT8,1983
28
28
  pdfalyzer/output/tables/decoding_stats_table.py,sha256=mhQOiWhmovaC4sop38WcxStv_bIdAlQWUysAz5fW4MU,3461
29
29
  pdfalyzer/output/tables/font_summary_table.py,sha256=xfTqC7BlQd0agQf6nDDhkcJno7hru6mf9_xY1f5IDcw,2065
30
- pdfalyzer/output/tables/pdf_node_rich_table.py,sha256=Soz5gkSl9pMFbwmGxyKyil_9X-Pl-fI0i8s0cvwLC3Q,5909
30
+ pdfalyzer/output/tables/pdf_node_rich_table.py,sha256=7G-FLb_EUP50kZmYCTbo8Q6taU4xKp2QIGNOnQtYbNg,5908
31
31
  pdfalyzer/output/tables/stream_objects_table.py,sha256=nzCTci8Kqs8Pyghad3L5KWHDdIWRSrKCRNW8geA_rMo,707
32
- pdfalyzer/pdf_object_relationship.py,sha256=EgeIiVDofvZd-il114H8ZlKKwCOci5T5S4e15mHK_Wg,5340
33
- pdfalyzer/pdfalyzer.py,sha256=sOZqOKiRivd2I0Lek_cbYu0h4jIi8DXYnw5H0f6TfcA,11016
34
- pdfalyzer/util/adobe_strings.py,sha256=ea9rY83u1oL3uAx43AjuXY24zSdtyc2H7iJN6epaqkE,5048
35
- pdfalyzer/util/argument_parser.py,sha256=dyCd72k4A3Emksmi68MPo8XnRpZkeWUrmsAUoG1gRJg,11275
32
+ pdfalyzer/pdf_object_relationship.py,sha256=ug-338eoXFdD4YtDWPdzcfxP2fQDQa-GE8I3m3a01TA,5339
33
+ pdfalyzer/pdfalyzer.py,sha256=6JflqQJb2crXXaVA6DHHgWB45w2MBFB3pqE3AlZO5WI,11013
34
+ pdfalyzer/util/adobe_strings.py,sha256=F1MOBtSyIuF5HPmzWDr8MgnLyVodOsZSy4AFFCMHq_Y,5033
35
+ pdfalyzer/util/argument_parser.py,sha256=_8bhYkrw_lH9ce-ZnagcCtn9iqjeUW4dbbyQicB5hqE,11902
36
36
  pdfalyzer/util/debugging.py,sha256=nE64VUQbdu2OQRC8w8-AJkMtBOy8Kf3mjozuFslfWsw,156
37
37
  pdfalyzer/util/exceptions.py,sha256=XLFFTdx1n6i_VCmvuzvIOCa-djJvGEitfo9lhy3zq0k,98
38
38
  pdfalyzer/util/pdf_parser_manager.py,sha256=FVRYAYsCd0y5MAm--qvXnwCZnDtB3x85FdJtb-gpyw4,3109
@@ -40,8 +40,8 @@ pdfalyzer/yara_rules/PDF.yara,sha256=fBMKYmJgBLiCq-kpVzsTP9zUJEBep6yi_QVKmC-FdY0
40
40
  pdfalyzer/yara_rules/PDF_binary_stream.yara,sha256=oWRPLe5yQiRFMvi3BTHNTlB6T7NcAuxKn0C9OSvgJSM,804
41
41
  pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
42
  pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
43
- pdfalyzer-1.15.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
44
- pdfalyzer-1.15.0.dist-info/METADATA,sha256=1oW4vKmmYB7SY9K4qAmix-3mJkiXuODfAkiFTrSvULs,25817
45
- pdfalyzer-1.15.0.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
46
- pdfalyzer-1.15.0.dist-info/entry_points.txt,sha256=aZurgt-Xg3pojS7oTRI4hNLpK1hO4kTfChf0x2eQoD8,147
47
- pdfalyzer-1.15.0.dist-info/RECORD,,
43
+ pdfalyzer-1.16.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
44
+ pdfalyzer-1.16.0.dist-info/METADATA,sha256=IEEZrNEL7fdybwE9t7PasjPu-0XQkglaS_vfIfbLBGU,25716
45
+ pdfalyzer-1.16.0.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
46
+ pdfalyzer-1.16.0.dist-info/entry_points.txt,sha256=aZurgt-Xg3pojS7oTRI4hNLpK1hO4kTfChf0x2eQoD8,147
47
+ pdfalyzer-1.16.0.dist-info/RECORD,,