pdfalyzer 1.15.1__py3-none-any.whl → 1.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pdfalyzer might be problematic. Click here for more details.
- CHANGELOG.md +7 -0
- pdfalyzer/__init__.py +19 -15
- pdfalyzer/decorators/document_model_printer.py +1 -1
- pdfalyzer/decorators/pdf_object_properties.py +2 -2
- pdfalyzer/decorators/pdf_tree_node.py +3 -3
- pdfalyzer/decorators/pdf_tree_verifier.py +2 -2
- pdfalyzer/font_info.py +2 -2
- pdfalyzer/helpers/pdf_object_helper.py +3 -3
- pdfalyzer/helpers/rich_text_helper.py +1 -1
- pdfalyzer/output/character_mapping.py +4 -4
- pdfalyzer/output/pdfalyzer_presenter.py +1 -1
- pdfalyzer/output/styles/node_colors.py +1 -1
- pdfalyzer/output/tables/pdf_node_rich_table.py +1 -1
- pdfalyzer/pdf_object_relationship.py +1 -1
- pdfalyzer/pdfalyzer.py +3 -3
- pdfalyzer/util/adobe_strings.py +2 -2
- pdfalyzer/util/argument_parser.py +6 -4
- {pdfalyzer-1.15.1.dist-info → pdfalyzer-1.16.1.dist-info}/METADATA +9 -9
- {pdfalyzer-1.15.1.dist-info → pdfalyzer-1.16.1.dist-info}/RECORD +22 -22
- {pdfalyzer-1.15.1.dist-info → pdfalyzer-1.16.1.dist-info}/LICENSE +0 -0
- {pdfalyzer-1.15.1.dist-info → pdfalyzer-1.16.1.dist-info}/WHEEL +0 -0
- {pdfalyzer-1.15.1.dist-info → pdfalyzer-1.16.1.dist-info}/entry_points.txt +0 -0
CHANGELOG.md
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
# NEXT RELEASE
|
|
2
2
|
|
|
3
|
+
### 1.16.1
|
|
4
|
+
* Configure a `Changelog` link for `pypi` to display
|
|
5
|
+
|
|
6
|
+
# 1.16.0
|
|
7
|
+
* Upgrade `PyPDF2` 2.x to `pypdf` 5.0.1 (new name, same package)
|
|
8
|
+
* Add `--image-quality` option to `combine_pdfs` tool
|
|
9
|
+
|
|
3
10
|
### 1.15.1
|
|
4
11
|
* Add `--no-default-yara-rules` command line option so users can use _only_ their own custom YARA rules files if they want. Previously you could only use custom YARA rules _in addition to_ the default rules; now you can just skip the default rules.
|
|
5
12
|
|
pdfalyzer/__init__.py
CHANGED
|
@@ -4,9 +4,8 @@ from os import environ, getcwd, path
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
|
|
6
6
|
from dotenv import load_dotenv
|
|
7
|
-
|
|
8
|
-
from
|
|
9
|
-
from PyPDF2.errors import PdfReadError
|
|
7
|
+
from pypdf import PdfWriter
|
|
8
|
+
from pypdf.errors import PdfReadError
|
|
10
9
|
|
|
11
10
|
# Should be first local import before load_dotenv() (or at least I think it needs to come first)
|
|
12
11
|
from pdfalyzer.config import PdfalyzerConfig
|
|
@@ -31,7 +30,8 @@ from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
|
31
30
|
from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
|
|
32
31
|
from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
|
|
33
32
|
from pdfalyzer.pdfalyzer import Pdfalyzer
|
|
34
|
-
from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments,
|
|
33
|
+
from pdfalyzer.util.argument_parser import (MAX_QUALITY, ask_to_proceed, output_sections, parse_arguments,
|
|
34
|
+
parse_combine_pdfs_args)
|
|
35
35
|
from pdfalyzer.util.pdf_parser_manager import PdfParserManager
|
|
36
36
|
|
|
37
37
|
# For the table shown by running pdfalyzer_show_color_theme
|
|
@@ -93,10 +93,13 @@ def pdfalyzer_show_color_theme() -> None:
|
|
|
93
93
|
|
|
94
94
|
|
|
95
95
|
def combine_pdfs():
|
|
96
|
-
"""
|
|
96
|
+
"""
|
|
97
|
+
Utility method to combine multiple PDFs into one. Invocable with 'combine_pdfs PDF1 [PDF2...]'.
|
|
98
|
+
Example: https://github.com/py-pdf/pypdf/blob/main/docs/user/merging-pdfs.md
|
|
99
|
+
"""
|
|
97
100
|
args = parse_combine_pdfs_args()
|
|
98
101
|
set_max_open_files(args.number_of_pdfs)
|
|
99
|
-
merger =
|
|
102
|
+
merger = PdfWriter()
|
|
100
103
|
|
|
101
104
|
for pdf in args.pdfs:
|
|
102
105
|
try:
|
|
@@ -106,18 +109,19 @@ def combine_pdfs():
|
|
|
106
109
|
print_highlighted(f" -> Failed to merge '{pdf}'! {e}", style='red')
|
|
107
110
|
ask_to_proceed()
|
|
108
111
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
112
|
+
# Iterate through pages and compress, lowering image quality if requested
|
|
113
|
+
# See https://pypdf.readthedocs.io/en/latest/user/file-size.html#reducing-image-quality
|
|
114
|
+
for i, page in enumerate(merger.pages):
|
|
115
|
+
if args.image_quality < MAX_QUALITY:
|
|
116
|
+
for j, img in enumerate(page.images):
|
|
117
|
+
print_highlighted(f" -> Reducing image #{j + 1} quality on page {i + 1} to {args.image_quality}...", style='dim')
|
|
118
|
+
img.replace(img.image, quality=args.image_quality)
|
|
113
119
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
# See https://pypdf.readthedocs.io/en/latest/user/file-size.html#reducing-image-quality
|
|
117
|
-
print_highlighted(f" -> Compressing page {i + 1}...", style='dim')
|
|
118
|
-
page.pagedata.compress_content_streams() # This is CPU intensive!
|
|
120
|
+
print_highlighted(f" -> Compressing page {i + 1}...", style='dim')
|
|
121
|
+
page.compress_content_streams() # This is CPU intensive!
|
|
119
122
|
|
|
120
123
|
print_highlighted(f"\nWriting '{args.output_file}'...", style='cyan')
|
|
124
|
+
merger.compress_identical_objects(remove_identicals=True, remove_orphans=True)
|
|
121
125
|
merger.write(args.output_file)
|
|
122
126
|
merger.close()
|
|
123
127
|
txt = Text('').append(f" -> Wrote ")
|
|
@@ -3,7 +3,7 @@ Deprecated old, pre-tree, more rawformat reader. Only used for debugging these d
|
|
|
3
3
|
"""
|
|
4
4
|
from io import StringIO
|
|
5
5
|
|
|
6
|
-
from
|
|
6
|
+
from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject
|
|
7
7
|
from rich.console import Console
|
|
8
8
|
from rich.markup import escape
|
|
9
9
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Decorator for
|
|
2
|
+
Decorator for PyPDF PdfObject that extracts a couple of properties (type, label, etc).
|
|
3
3
|
"""
|
|
4
4
|
from typing import Any, List, Optional, Union
|
|
5
5
|
|
|
6
|
-
from
|
|
6
|
+
from pypdf.generic import DictionaryObject, IndirectObject, NumberObject, PdfObject
|
|
7
7
|
from rich.text import Text
|
|
8
8
|
from yaralyzer.util.logging import log
|
|
9
9
|
|
|
@@ -9,8 +9,8 @@ hooks)
|
|
|
9
9
|
from typing import Callable, List, Optional, Set
|
|
10
10
|
|
|
11
11
|
from anytree import NodeMixin, SymlinkNode
|
|
12
|
-
from
|
|
13
|
-
from
|
|
12
|
+
from pypdf.errors import PdfReadError
|
|
13
|
+
from pypdf.generic import IndirectObject, PdfObject, StreamObject
|
|
14
14
|
from rich.markup import escape
|
|
15
15
|
from rich.text import Text
|
|
16
16
|
from yaralyzer.output.rich_console import console
|
|
@@ -41,7 +41,7 @@ class PdfTreeNode(NodeMixin, PdfObjectProperties):
|
|
|
41
41
|
self.stream_data = self.obj.get_data()
|
|
42
42
|
self.stream_length = len(self.stream_data)
|
|
43
43
|
except (NotImplementedError, PdfReadError) as e:
|
|
44
|
-
msg = f"
|
|
44
|
+
msg = f"PyPDF failed to decode stream in {self}: {e}.\n" + \
|
|
45
45
|
"Trees will be unaffected but scans/extractions will not be able to check this stream."
|
|
46
46
|
console.print_exception()
|
|
47
47
|
log.warning(msg)
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Verify that the PDF tree is complete/contains all the nodes in the PDF file.
|
|
3
3
|
"""
|
|
4
|
-
from
|
|
5
|
-
from
|
|
4
|
+
from pypdf.errors import PdfReadError
|
|
5
|
+
from pypdf.generic import IndirectObject, NameObject, NumberObject
|
|
6
6
|
from rich.markup import escape
|
|
7
7
|
from yaralyzer.output.rich_console import console
|
|
8
8
|
from yaralyzer.util.logging import log
|
pdfalyzer/font_info.py
CHANGED
|
@@ -3,8 +3,8 @@ Unify font information spread across a bunch of PdfObjects (Font, FontDescriptor
|
|
|
3
3
|
and FontFile) into a single class.
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
-
from
|
|
7
|
-
from
|
|
6
|
+
from pypdf._cmap import build_char_map, prepare_cm
|
|
7
|
+
from pypdf.generic import IndirectObject, PdfObject
|
|
8
8
|
from rich.text import Text
|
|
9
9
|
from yaralyzer.output.rich_console import console
|
|
10
10
|
from yaralyzer.util.logging import log
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Some methods to help with the direct manipulation/processing of
|
|
2
|
+
Some methods to help with the direct manipulation/processing of PyPDF's PdfObjects
|
|
3
3
|
"""
|
|
4
4
|
from typing import List, Optional
|
|
5
5
|
|
|
6
|
-
from
|
|
6
|
+
from pypdf.generic import IndirectObject, PdfObject
|
|
7
7
|
|
|
8
8
|
from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
|
|
9
9
|
from pdfalyzer.util.adobe_strings import *
|
|
@@ -24,7 +24,7 @@ def _sort_pdf_object_refs(refs: List[PdfObjectRelationship]) -> List[PdfObjectRe
|
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
def pypdf_class_name(obj: PdfObject) -> str:
|
|
27
|
-
"""Shortened name of type(obj), e.g.
|
|
27
|
+
"""Shortened name of type(obj), e.g. PyPDF.generic._data_structures.ArrayObject becomes Array"""
|
|
28
28
|
class_pkgs = type(obj).__name__.split('.')
|
|
29
29
|
class_pkgs.reverse()
|
|
30
30
|
return class_pkgs[0].removesuffix('Object')
|
|
@@ -4,7 +4,7 @@ Functions for miscellaneous Rich text/string operations.
|
|
|
4
4
|
from functools import partial
|
|
5
5
|
from typing import List
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from pypdf.generic import PdfObject
|
|
8
8
|
from rich.console import Console
|
|
9
9
|
from rich.highlighter import RegexHighlighter, JSONHighlighter
|
|
10
10
|
from rich.text import Text
|
|
@@ -12,13 +12,13 @@ from pdfalyzer.helpers.rich_text_helper import quoted_text
|
|
|
12
12
|
from pdfalyzer.helpers.string_helper import pp
|
|
13
13
|
from pdfalyzer.output.layout import print_headline_panel, subheading_width
|
|
14
14
|
|
|
15
|
-
CHARMAP_TITLE = 'Character Mapping (As Extracted By
|
|
15
|
+
CHARMAP_TITLE = 'Character Mapping (As Extracted By PyPDF)'
|
|
16
16
|
CHARMAP_TITLE_PADDING = (1, 0, 0, 2)
|
|
17
17
|
CHARMAP_PADDING = (0, 2, 0, 10)
|
|
18
18
|
|
|
19
19
|
|
|
20
20
|
def print_character_mapping(font: 'FontInfo') -> None:
|
|
21
|
-
"""Prints the character mapping extracted by
|
|
21
|
+
"""Prints the character mapping extracted by PyPDF._charmap in tidy columns"""
|
|
22
22
|
if font.character_mapping is None or len(font.character_mapping) == 0:
|
|
23
23
|
log.info(f"No character map found in {font}")
|
|
24
24
|
return
|
|
@@ -38,12 +38,12 @@ def print_character_mapping(font: 'FontInfo') -> None:
|
|
|
38
38
|
|
|
39
39
|
|
|
40
40
|
def print_prepared_charmap(font: 'FontInfo'):
|
|
41
|
-
"""Prints the prepared_charmap returned by
|
|
41
|
+
"""Prints the prepared_charmap returned by PyPDF."""
|
|
42
42
|
if font.prepared_char_map is None:
|
|
43
43
|
log.info(f"No prepared_charmap found in {font}")
|
|
44
44
|
return
|
|
45
45
|
|
|
46
|
-
headline = f"{font} Adobe PostScript charmap prepared by
|
|
46
|
+
headline = f"{font} Adobe PostScript charmap prepared by PyPDF"
|
|
47
47
|
print_headline_panel(headline, style='charmap.prepared_title')
|
|
48
48
|
print_bytes(font.prepared_char_map, style='charmap.prepared')
|
|
49
49
|
console.line()
|
|
@@ -47,7 +47,7 @@ class PdfalyzerPresenter:
|
|
|
47
47
|
def print_document_info(self) -> None:
|
|
48
48
|
"""Print the embedded document info (author, timestamps, version, etc)."""
|
|
49
49
|
print_section_header(f'Document Info for {self.pdfalyzer.pdf_basename}')
|
|
50
|
-
console.print(pp.pformat(self.pdfalyzer.pdf_reader.
|
|
50
|
+
console.print(pp.pformat(self.pdfalyzer.pdf_reader.metadata))
|
|
51
51
|
console.line()
|
|
52
52
|
console.print(bytes_hashes_table(self.pdfalyzer.pdf_bytes, self.pdfalyzer.pdf_basename))
|
|
53
53
|
console.line()
|
|
@@ -6,7 +6,7 @@ from collections import namedtuple
|
|
|
6
6
|
from numbers import Number
|
|
7
7
|
from typing import Any
|
|
8
8
|
|
|
9
|
-
from
|
|
9
|
+
from pypdf.generic import (ArrayObject, ByteStringObject, EncodedStreamObject, IndirectObject,
|
|
10
10
|
StreamObject, TextStringObject)
|
|
11
11
|
from yaralyzer.output.rich_console import YARALYZER_THEME_DICT
|
|
12
12
|
|
|
@@ -5,7 +5,7 @@ from collections import namedtuple
|
|
|
5
5
|
from typing import List, Optional
|
|
6
6
|
|
|
7
7
|
from anytree import SymlinkNode
|
|
8
|
-
from
|
|
8
|
+
from pypdf.generic import StreamObject
|
|
9
9
|
from rich.markup import escape
|
|
10
10
|
from rich.panel import Panel
|
|
11
11
|
from rich.table import Table
|
|
@@ -3,7 +3,7 @@ Simple container class for information about a link between two PDF objects.
|
|
|
3
3
|
"""
|
|
4
4
|
from typing import List, Optional, Union
|
|
5
5
|
|
|
6
|
-
from
|
|
6
|
+
from pypdf.generic import IndirectObject, PdfObject
|
|
7
7
|
from yaralyzer.util.logging import log
|
|
8
8
|
|
|
9
9
|
from pdfalyzer.helpers.string_helper import bracketed, is_prefixed_by_any
|
pdfalyzer/pdfalyzer.py
CHANGED
|
@@ -11,8 +11,8 @@ from typing import Dict, Iterator, List, Optional
|
|
|
11
11
|
|
|
12
12
|
from anytree import LevelOrderIter, SymlinkNode
|
|
13
13
|
from anytree.search import findall, findall_by_attr
|
|
14
|
-
from
|
|
15
|
-
from
|
|
14
|
+
from pypdf import PdfReader
|
|
15
|
+
from pypdf.generic import IndirectObject
|
|
16
16
|
from yaralyzer.helpers.file_helper import load_binary_data
|
|
17
17
|
from yaralyzer.output.file_hashes_table import compute_file_hashes
|
|
18
18
|
from yaralyzer.output.rich_console import console
|
|
@@ -36,7 +36,7 @@ class Pdfalyzer:
|
|
|
36
36
|
self.pdf_basename = basename(pdf_path)
|
|
37
37
|
self.pdf_bytes = load_binary_data(pdf_path)
|
|
38
38
|
self.pdf_bytes_info = compute_file_hashes(self.pdf_bytes)
|
|
39
|
-
pdf_file = open(pdf_path, 'rb') # Filehandle must be left open for
|
|
39
|
+
pdf_file = open(pdf_path, 'rb') # Filehandle must be left open for PyPDF to perform seeks
|
|
40
40
|
self.pdf_reader = PdfReader(pdf_file)
|
|
41
41
|
|
|
42
42
|
# Initialize tracking variables
|
pdfalyzer/util/adobe_strings.py
CHANGED
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
String constants specified in the Adobe specs for PDFs, fonts, etc.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
from
|
|
6
|
-
PagesAttributes,
|
|
5
|
+
from pypdf.constants import (CatalogDictionary, ImageAttributes, PageAttributes,
|
|
6
|
+
PagesAttributes, Resources)
|
|
7
7
|
|
|
8
8
|
from pdfalyzer.helpers.string_helper import is_prefixed_by_any
|
|
9
9
|
|
|
@@ -196,6 +196,8 @@ def all_sections_chosen(args):
|
|
|
196
196
|
###############################################
|
|
197
197
|
# Separate arg parser for combine_pdfs script #
|
|
198
198
|
###############################################
|
|
199
|
+
MAX_QUALITY = 10
|
|
200
|
+
|
|
199
201
|
combine_pdfs_parser = ArgumentParser(
|
|
200
202
|
description="Combine multiple PDFs into one.",
|
|
201
203
|
epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" \
|
|
@@ -207,10 +209,10 @@ combine_pdfs_parser.add_argument('pdfs',
|
|
|
207
209
|
metavar='PDF_PATH',
|
|
208
210
|
nargs='+')
|
|
209
211
|
|
|
210
|
-
combine_pdfs_parser.add_argument('-
|
|
211
|
-
help='
|
|
212
|
-
choices=range(
|
|
213
|
-
default=
|
|
212
|
+
combine_pdfs_parser.add_argument('-iq', '--image-quality',
|
|
213
|
+
help='image quality for embedded images (can compress PDF at loss of quality)',
|
|
214
|
+
choices=range(1, MAX_QUALITY + 1),
|
|
215
|
+
default=MAX_QUALITY,
|
|
214
216
|
type=int)
|
|
215
217
|
|
|
216
218
|
combine_pdfs_parser.add_argument('-o', '--output-file',
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pdfalyzer
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.16.1
|
|
4
4
|
Summary: A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
|
|
5
5
|
Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
6
6
|
License: GPL-3.0-or-later
|
|
@@ -16,13 +16,14 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
16
16
|
Classifier: Topic :: Artistic Software
|
|
17
17
|
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
18
18
|
Classifier: Topic :: Security
|
|
19
|
-
Requires-Dist: PyPDF2 (>=2.10,<3.0)
|
|
20
19
|
Requires-Dist: anytree (>=2.8,<3.0)
|
|
21
20
|
Requires-Dist: chardet (>=5.0.0,<6.0.0)
|
|
21
|
+
Requires-Dist: pypdf (>=5.0.1,<6.0.0)
|
|
22
22
|
Requires-Dist: python-dotenv (>=0.21.0,<0.22.0)
|
|
23
23
|
Requires-Dist: rich (>=12.5.1,<13.0.0)
|
|
24
24
|
Requires-Dist: rich-argparse-plus (>=0.3.1,<0.4.0)
|
|
25
25
|
Requires-Dist: yaralyzer (>=0.9.4,<0.10.0)
|
|
26
|
+
Project-URL: Changelog, https://github.com/michelcrypt4d4mus/pdfalyzer/blob/master/CHANGELOG.md
|
|
26
27
|
Project-URL: Documentation, https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
27
28
|
Project-URL: Repository, https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
28
29
|
Description-Content-Type: text/markdown
|
|
@@ -71,7 +72,7 @@ Installation with [pipx](https://pypa.github.io/pipx/)[^4] is preferred though `
|
|
|
71
72
|
pipx install pdfalyzer
|
|
72
73
|
```
|
|
73
74
|
|
|
74
|
-
See [
|
|
75
|
+
See [PyPDF installation notes](https://github.com/py-pdf/pypdf#installation) about `PyCryptodome` if you plan to `pdfalyze` any files that use AES encryption.
|
|
75
76
|
|
|
76
77
|
If you are on macOS someone out there was kind enough to make [The Pdfalyzer available via homebrew](https://formulae.brew.sh/formula/pdfalyzer) so `brew install pdfalyzer` should work.
|
|
77
78
|
|
|
@@ -123,7 +124,7 @@ Warnings will be printed if any PDF object ID between 1 and the `/Size` reported
|
|
|
123
124
|
## Use As A Code Library
|
|
124
125
|
For info about setting up a dev environment see [Contributing](#contributing) below.
|
|
125
126
|
|
|
126
|
-
At its core The Pdfalyzer is taking PDF internal objects gathered by [
|
|
127
|
+
At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class. Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
|
|
127
128
|
|
|
128
129
|
As far as The Pdfalyzer's unique functionality goes, [`Pdfalyzer`](pdfalyzer/pdfalyzer.py) is the class at the heart of the operation. It holds the PDF's logical tree as well as a few other data structures. Chief among these are the [`FontInfo`](pdfalyzer/font_info.py) class which pulls together various properties of a font strewn across 3 or 4 different PDF objects and the [`BinaryScanner`](pdfalyzer/binary/binary_scanner.py) class which lets you dig through the embedded streams' bytes looking for suspicious patterns.
|
|
129
130
|
|
|
@@ -192,7 +193,7 @@ This image shows a more in-depth view of of the PDF tree for the same document s
|
|
|
192
193
|
|
|
193
194
|
## Fonts
|
|
194
195
|
|
|
195
|
-
#### **Extract character mappings from ancient Adobe font formats**. It's actually `
|
|
196
|
+
#### **Extract character mappings from ancient Adobe font formats**. It's actually `PyPDF` doing the lifting here but we're happy to take the credit.
|
|
196
197
|
|
|
197
198
|

|
|
198
199
|
|
|
@@ -275,7 +276,7 @@ scripts/install_t1utils.sh
|
|
|
275
276
|
## Did The World Really Need Another PDF Tool?
|
|
276
277
|
This tool was built to fill a gap in the PDF assessment landscape following [my own recent experience trying to find malicious content in a PDF file](https://twitter.com/Cryptadamist/status/1570167937381826560). Didier Stevens's [pdfid.py](https://github.com/DidierStevens/DidierStevensSuite/blob/master/pdfid.py) and [pdf-parser.py](https://github.com/DidierStevens/DidierStevensSuite/blob/master/pdf-parser.py) are still the best game in town when it comes to PDF analysis tools but they lack in the visualization department and also don't give you much to work with as far as giving you a data model you can write your own code around. [Peepdf](https://github.com/jesparza/peepdf) seemed promising but turned out to be in a buggy, out of date, and more or less unfixable state. And neither of them offered much in the way of tooling for embedded binary analysis.
|
|
277
278
|
|
|
278
|
-
Thus I felt the world might be slightly improved if I strung together a couple of more stable/well known/actively maintained open source projects ([AnyTree](https://github.com/c0fec0de/anytree), [
|
|
279
|
+
Thus I felt the world might be slightly improved if I strung together a couple of more stable/well known/actively maintained open source projects ([AnyTree](https://github.com/c0fec0de/anytree), [PyPDF](https://github.com/py-pdf/pypdf), [Rich](https://github.com/Textualize/rich), and [YARA](https://github.com/VirusTotal/yara-python) via [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer)) into this tool.
|
|
279
280
|
|
|
280
281
|
-------------
|
|
281
282
|
|
|
@@ -289,7 +290,7 @@ These are the naming conventions at play in The Pdfalyzer code base:
|
|
|
289
290
|
|
|
290
291
|
| Term | Meaning |
|
|
291
292
|
| ----------------- | ---------------- |
|
|
292
|
-
| **`PDF Object`** | Instance of a `
|
|
293
|
+
| **`PDF Object`** | Instance of a `PyPDF` class that represents the information stored in the PDF binary between open and close guillemet quotes (« and ») |
|
|
293
294
|
| **`reference_key`** | String found in a PDF object that names a property (e.g. `/BaseFont` or `/Subtype`) |
|
|
294
295
|
| **`reference`** | Link _from_ a PDF object _to_ another node. Outward facing relationships, basically. |
|
|
295
296
|
| **`address`** | `reference_key` plus a hash key or numerical array index if that's how the reference works. e.g. if node A has a reference key `/Resources` pointing to a dict `{'/Font2': [IndirectObject(55), IndirectObject(2)]}` the address of `IndirectObject(55)` from node A would be `/Resources[/Font2][0]` |
|
|
@@ -300,11 +301,10 @@ These are the naming conventions at play in The Pdfalyzer code base:
|
|
|
300
301
|
| **`link_node`** | nodes like `/Dest` that just contain a pointer to another node |
|
|
301
302
|
|
|
302
303
|
### Reference
|
|
303
|
-
* [`
|
|
304
|
+
* [`PyPDF` documentation](https://pypdf.readthedocs.io/en/stable/) (`pdfalyzer` requires `pypdf` >=5.0.1,<6.0.0, so the docs for the 5.x series are the relevant ones for `pdfalyze`)
|
|
304
305
|
|
|
305
306
|
|
|
306
307
|
# TODO
|
|
307
|
-
* Upgrade `PyPDF` to latest and expand `combine_pdfs` compression command line option
|
|
308
308
|
* Highlight decodes with a lot of Javascript keywords
|
|
309
309
|
* https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
|
|
310
310
|
* https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
|
|
@@ -1,38 +1,38 @@
|
|
|
1
|
-
CHANGELOG.md,sha256=
|
|
1
|
+
CHANGELOG.md,sha256=TisXF8v_aWS4qUyZzdZXCF2aY1YDBMot2t5Osjd2DkY,11775
|
|
2
2
|
LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
3
|
-
pdfalyzer/__init__.py,sha256=
|
|
3
|
+
pdfalyzer/__init__.py,sha256=q8qSdGdyUYmTYGOp_d2bRCCFASnlVt4wa-DlBikD5-M,5362
|
|
4
4
|
pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
|
|
5
5
|
pdfalyzer/binary/binary_scanner.py,sha256=7NrXx8GB2gpb04oR2bcZJKkOXOlzn2hWpcGlcYMqSfs,10217
|
|
6
6
|
pdfalyzer/config.py,sha256=oN-pVR037lt3giRsnsm4c8ku5hCW8ChFqYFi9V7w1qU,1918
|
|
7
|
-
pdfalyzer/decorators/document_model_printer.py,sha256=
|
|
7
|
+
pdfalyzer/decorators/document_model_printer.py,sha256=2tjJItZltukmOD2wjGvl2IsBM7Gug59wMHGLpzGDbV0,2682
|
|
8
8
|
pdfalyzer/decorators/indeterminate_node.py,sha256=ivB6dX5aN8W9m0ksXhmUcixnjYjnuE7DARalH-nMjxY,6616
|
|
9
|
-
pdfalyzer/decorators/pdf_object_properties.py,sha256=
|
|
10
|
-
pdfalyzer/decorators/pdf_tree_node.py,sha256=
|
|
11
|
-
pdfalyzer/decorators/pdf_tree_verifier.py,sha256=
|
|
9
|
+
pdfalyzer/decorators/pdf_object_properties.py,sha256=I7kix5hXNguAH2VW2uINIZRHJ8xYS4JGfc6Aiakyh4c,5522
|
|
10
|
+
pdfalyzer/decorators/pdf_tree_node.py,sha256=sd3a4uQMu_KQ_wvo0pjwQ8K1HI7xGgsGd47eI2IWybY,10927
|
|
11
|
+
pdfalyzer/decorators/pdf_tree_verifier.py,sha256=YC56SQxp5o2zMYgsBPCzX89pCkUHdZ-MCFNIPD9XKRc,4541
|
|
12
12
|
pdfalyzer/detection/constants/binary_regexes.py,sha256=eFx1VVAOzxKmlacbGgicDCp1fcKgOkQkkzeduGjqLBQ,1594
|
|
13
13
|
pdfalyzer/detection/constants/javascript_reserved_keywords.py,sha256=CXXdWskdQa0Hs5wCci2RBVvipgZg34_cLfmkWG4Xcmg,991
|
|
14
14
|
pdfalyzer/detection/javascript_hunter.py,sha256=_wT2vkKTMlm_RGCjYsmwcmV-ag1qep3EpkHmUw0nWcQ,711
|
|
15
15
|
pdfalyzer/detection/yaralyzer_helper.py,sha256=_l9eJQUtMlo9RhY5h8Xq9gBLxzn1VgJsCA1nCsFDGvo,1999
|
|
16
|
-
pdfalyzer/font_info.py,sha256=
|
|
16
|
+
pdfalyzer/font_info.py,sha256=0NQ6g4q3pTdirwGjJhur8HkXQlC732cR7IhilO33g2A,6663
|
|
17
17
|
pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
|
|
18
18
|
pdfalyzer/helpers/filesystem_helper.py,sha256=wHlFz4DFzPAJt2OzMRrhsjL-O3gLJ02JhuwBRwkE958,4089
|
|
19
19
|
pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
|
|
20
|
-
pdfalyzer/helpers/pdf_object_helper.py,sha256=
|
|
21
|
-
pdfalyzer/helpers/rich_text_helper.py,sha256=
|
|
20
|
+
pdfalyzer/helpers/pdf_object_helper.py,sha256=Ija6cWKfFQRXCfZv2ezU1V2v0KFDn9f4ayeX8eG9GmI,1102
|
|
21
|
+
pdfalyzer/helpers/rich_text_helper.py,sha256=s5ytOme8CZCIWAsiPHFlIi6q0KN5qZPBb0OrtTfRkq4,2254
|
|
22
22
|
pdfalyzer/helpers/string_helper.py,sha256=75EDEFw3UWHvWF32WtvZVBbqYY3ozO4y30dtH2qVMX0,2278
|
|
23
|
-
pdfalyzer/output/character_mapping.py,sha256=
|
|
23
|
+
pdfalyzer/output/character_mapping.py,sha256=MtC3jKdtMaugi5038fne0T_SFSo9QU4lZl_s7bW7gzI,2092
|
|
24
24
|
pdfalyzer/output/layout.py,sha256=E58T9Tl6BYZTDsj6ouMr1J5SSUiXa7timUNxnOI2IzI,2149
|
|
25
|
-
pdfalyzer/output/pdfalyzer_presenter.py,sha256=
|
|
26
|
-
pdfalyzer/output/styles/node_colors.py,sha256=
|
|
25
|
+
pdfalyzer/output/pdfalyzer_presenter.py,sha256=CSboSnYFlkgOfwMf3TcoTTJY6FLXJ9OulI9UieSTJeE,8492
|
|
26
|
+
pdfalyzer/output/styles/node_colors.py,sha256=rfsTAUF43K_buw21SZoP6L5c_cLy7S-xA4GUiWJsDkc,3986
|
|
27
27
|
pdfalyzer/output/styles/rich_theme.py,sha256=Y8QmuINlyZNIHvf3oD0CV3w2dC49NNKtvOChvudDCT8,1983
|
|
28
28
|
pdfalyzer/output/tables/decoding_stats_table.py,sha256=mhQOiWhmovaC4sop38WcxStv_bIdAlQWUysAz5fW4MU,3461
|
|
29
29
|
pdfalyzer/output/tables/font_summary_table.py,sha256=xfTqC7BlQd0agQf6nDDhkcJno7hru6mf9_xY1f5IDcw,2065
|
|
30
|
-
pdfalyzer/output/tables/pdf_node_rich_table.py,sha256=
|
|
30
|
+
pdfalyzer/output/tables/pdf_node_rich_table.py,sha256=7G-FLb_EUP50kZmYCTbo8Q6taU4xKp2QIGNOnQtYbNg,5908
|
|
31
31
|
pdfalyzer/output/tables/stream_objects_table.py,sha256=nzCTci8Kqs8Pyghad3L5KWHDdIWRSrKCRNW8geA_rMo,707
|
|
32
|
-
pdfalyzer/pdf_object_relationship.py,sha256=
|
|
33
|
-
pdfalyzer/pdfalyzer.py,sha256=
|
|
34
|
-
pdfalyzer/util/adobe_strings.py,sha256=
|
|
35
|
-
pdfalyzer/util/argument_parser.py,sha256=
|
|
32
|
+
pdfalyzer/pdf_object_relationship.py,sha256=ug-338eoXFdD4YtDWPdzcfxP2fQDQa-GE8I3m3a01TA,5339
|
|
33
|
+
pdfalyzer/pdfalyzer.py,sha256=6JflqQJb2crXXaVA6DHHgWB45w2MBFB3pqE3AlZO5WI,11013
|
|
34
|
+
pdfalyzer/util/adobe_strings.py,sha256=F1MOBtSyIuF5HPmzWDr8MgnLyVodOsZSy4AFFCMHq_Y,5033
|
|
35
|
+
pdfalyzer/util/argument_parser.py,sha256=_8bhYkrw_lH9ce-ZnagcCtn9iqjeUW4dbbyQicB5hqE,11902
|
|
36
36
|
pdfalyzer/util/debugging.py,sha256=nE64VUQbdu2OQRC8w8-AJkMtBOy8Kf3mjozuFslfWsw,156
|
|
37
37
|
pdfalyzer/util/exceptions.py,sha256=XLFFTdx1n6i_VCmvuzvIOCa-djJvGEitfo9lhy3zq0k,98
|
|
38
38
|
pdfalyzer/util/pdf_parser_manager.py,sha256=FVRYAYsCd0y5MAm--qvXnwCZnDtB3x85FdJtb-gpyw4,3109
|
|
@@ -40,8 +40,8 @@ pdfalyzer/yara_rules/PDF.yara,sha256=fBMKYmJgBLiCq-kpVzsTP9zUJEBep6yi_QVKmC-FdY0
|
|
|
40
40
|
pdfalyzer/yara_rules/PDF_binary_stream.yara,sha256=oWRPLe5yQiRFMvi3BTHNTlB6T7NcAuxKn0C9OSvgJSM,804
|
|
41
41
|
pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
42
42
|
pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
|
|
43
|
-
pdfalyzer-1.
|
|
44
|
-
pdfalyzer-1.
|
|
45
|
-
pdfalyzer-1.
|
|
46
|
-
pdfalyzer-1.
|
|
47
|
-
pdfalyzer-1.
|
|
43
|
+
pdfalyzer-1.16.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
44
|
+
pdfalyzer-1.16.1.dist-info/METADATA,sha256=XIwrDK_lRwcCb1CrxFghfAhUpL2uCsFRQGMNHdPzsN4,25812
|
|
45
|
+
pdfalyzer-1.16.1.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
46
|
+
pdfalyzer-1.16.1.dist-info/entry_points.txt,sha256=aZurgt-Xg3pojS7oTRI4hNLpK1hO4kTfChf0x2eQoD8,147
|
|
47
|
+
pdfalyzer-1.16.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|