pdfalyzer 1.16.14__py3-none-any.whl → 1.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pdfalyzer might be problematic. Click here for more details.
- CHANGELOG.md +8 -1
- pdfalyzer/__init__.py +19 -6
- pdfalyzer/decorators/pdf_file.py +170 -7
- pdfalyzer/detection/yaralyzer_helper.py +0 -1
- pdfalyzer/helpers/filesystem_helper.py +27 -3
- pdfalyzer/helpers/image_helper.py +31 -0
- pdfalyzer/helpers/rich_text_helper.py +50 -1
- pdfalyzer/helpers/string_helper.py +33 -30
- pdfalyzer/output/styles/rich_theme.py +2 -1
- pdfalyzer/util/argument_parser.py +4 -58
- pdfalyzer/util/cli_tools_argument_parser.py +156 -0
- pdfalyzer/util/page_range.py +51 -0
- {pdfalyzer-1.16.14.dist-info → pdfalyzer-1.17.1.dist-info}/METADATA +20 -5
- {pdfalyzer-1.16.14.dist-info → pdfalyzer-1.17.1.dist-info}/RECORD +17 -14
- {pdfalyzer-1.16.14.dist-info → pdfalyzer-1.17.1.dist-info}/entry_points.txt +2 -0
- {pdfalyzer-1.16.14.dist-info → pdfalyzer-1.17.1.dist-info}/LICENSE +0 -0
- {pdfalyzer-1.16.14.dist-info → pdfalyzer-1.17.1.dist-info}/WHEEL +0 -0
CHANGELOG.md
CHANGED
|
@@ -1,7 +1,14 @@
|
|
|
1
1
|
# NEXT RELEASE
|
|
2
2
|
|
|
3
|
+
### 1.17.1
|
|
4
|
+
* Fix issue where `combine_pdfs` page ranges were indexed from 0 instead of 1
|
|
5
|
+
|
|
6
|
+
# 1.17.0
|
|
7
|
+
* Add `extract_pdf_pages` command line tool (imported from `clown_sort`)
|
|
8
|
+
* Add `extract_text_from_pdfs` command line tool (imported from `clown_sort`)
|
|
9
|
+
|
|
3
10
|
### 1.16.14
|
|
4
|
-
* Bump `yaralyzer` to v1.0.9
|
|
11
|
+
* Bump `yaralyzer` to v1.0.9, handle `FileNotFoundError` which is now raised instead of `TypeError`
|
|
5
12
|
* Drop support for python 3.9
|
|
6
13
|
|
|
7
14
|
### 1.16.13
|
pdfalyzer/__init__.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import code
|
|
2
2
|
import sys
|
|
3
|
+
from argparse import Namespace
|
|
3
4
|
from os import environ, getcwd, path
|
|
4
5
|
|
|
5
6
|
from dotenv import load_dotenv
|
|
@@ -24,13 +25,15 @@ from yaralyzer.output.file_export import invoke_rich_export
|
|
|
24
25
|
from yaralyzer.output.rich_console import console
|
|
25
26
|
from yaralyzer.util.logging import log_and_print
|
|
26
27
|
|
|
28
|
+
from pdfalyzer.decorators.pdf_file import PdfFile
|
|
27
29
|
from pdfalyzer.helpers.filesystem_helper import file_size_in_mb, set_max_open_files
|
|
28
30
|
from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
29
31
|
from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
|
|
30
32
|
from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
|
|
31
33
|
from pdfalyzer.pdfalyzer import Pdfalyzer
|
|
32
|
-
from pdfalyzer.util.argument_parser import
|
|
33
|
-
|
|
34
|
+
from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments
|
|
35
|
+
from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, ask_to_proceed,
|
|
36
|
+
parse_combine_pdfs_args, parse_pdf_page_extraction_args, parse_text_extraction_args)
|
|
34
37
|
from pdfalyzer.util.pdf_parser_manager import PdfParserManager
|
|
35
38
|
|
|
36
39
|
# For the table shown by running pdfalyzer_show_color_theme
|
|
@@ -132,7 +135,17 @@ def combine_pdfs():
|
|
|
132
135
|
print_highlighted(txt)
|
|
133
136
|
|
|
134
137
|
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
138
|
+
def extract_pdf_pages() -> None:
|
|
139
|
+
"""Extract a range of pages from a PDF to a new PDF."""
|
|
140
|
+
args = parse_pdf_page_extraction_args()
|
|
141
|
+
PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def extract_text_from_pdfs() -> None:
|
|
145
|
+
"""Extract text from a list of file or from all PDF files in a list of directories."""
|
|
146
|
+
args: Namespace = parse_text_extraction_args()
|
|
147
|
+
console.line()
|
|
148
|
+
|
|
149
|
+
for file_path in args.files_to_process:
|
|
150
|
+
PdfFile(file_path).print_extracted_text(args.page_range, args.print_as_parsed)
|
|
151
|
+
console.line(2)
|
pdfalyzer/decorators/pdf_file.py
CHANGED
|
@@ -1,23 +1,57 @@
|
|
|
1
|
+
import io
|
|
2
|
+
from logging import Logger
|
|
1
3
|
from os import path
|
|
2
4
|
from pathlib import Path
|
|
3
5
|
from typing import List, Optional, Union
|
|
4
6
|
|
|
7
|
+
from pypdf import PdfReader, PdfWriter
|
|
8
|
+
from pypdf.errors import DependencyError, EmptyFileError, PdfStreamError
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.markup import escape
|
|
11
|
+
from rich.panel import Panel
|
|
12
|
+
from rich.text import Text
|
|
13
|
+
from yaralyzer.output.rich_console import console
|
|
14
|
+
from yaralyzer.util.logging import log as yaralyzer_log
|
|
15
|
+
|
|
16
|
+
from pdfalyzer.helpers.filesystem_helper import create_dir_if_it_does_not_exist, insert_suffix_before_extension
|
|
17
|
+
from pdfalyzer.helpers.image_helper import ocr_text
|
|
18
|
+
from pdfalyzer.helpers.rich_text_helper import (attention_getting_panel, error_text, mild_warning,
|
|
19
|
+
print_error, stderr_console)
|
|
20
|
+
from pdfalyzer.helpers.string_helper import exception_str
|
|
21
|
+
from pdfalyzer.util.page_range import PageRange
|
|
22
|
+
|
|
23
|
+
DEFAULT_PDF_ERRORS_DIR = Path.cwd().joinpath('pdf_errors')
|
|
24
|
+
MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR = 1024 * 1024 * 20
|
|
25
|
+
|
|
5
26
|
|
|
6
27
|
class PdfFile:
|
|
7
28
|
"""
|
|
8
29
|
Wrapper for a PDF file path that provides useful methods and properties.
|
|
30
|
+
|
|
31
|
+
Attributes:
|
|
32
|
+
file_path (Path): The path to the PDF file.
|
|
33
|
+
dirname (Path): The directory containing the PDF file.
|
|
34
|
+
basename (str): The base name of the PDF file (with extension).
|
|
35
|
+
basename_without_ext (str): The base name of the PDF file (without extension).
|
|
36
|
+
extname (str): The file extension of the PDF file.
|
|
37
|
+
file_size (int): The size of the file in bytes.
|
|
9
38
|
"""
|
|
39
|
+
|
|
10
40
|
def __init__(self, file_path: Union[str, Path]) -> None:
|
|
41
|
+
"""
|
|
42
|
+
Args:
|
|
43
|
+
file_path (Union[str, Path]): Path to the PDF file.
|
|
44
|
+
"""
|
|
11
45
|
self.file_path: Path = Path(file_path)
|
|
12
46
|
|
|
13
47
|
if not self.file_path.exists():
|
|
14
|
-
raise FileNotFoundError(f"
|
|
48
|
+
raise FileNotFoundError(f"'{file_path}' is not a valid file or directory.")
|
|
15
49
|
|
|
16
50
|
self.dirname = self.file_path.parent
|
|
17
51
|
self.basename: str = path.basename(file_path)
|
|
18
52
|
self.basename_without_ext: str = str(Path(self.basename).with_suffix(''))
|
|
19
53
|
self.extname: str = self.file_path.suffix
|
|
20
|
-
self.
|
|
54
|
+
self.file_size = self.file_path.stat().st_size
|
|
21
55
|
|
|
22
56
|
def extract_page_range(
|
|
23
57
|
self,
|
|
@@ -25,8 +59,20 @@ class PdfFile:
|
|
|
25
59
|
destination_dir: Optional[Path] = None,
|
|
26
60
|
extra_file_suffix: Optional[str] = None
|
|
27
61
|
) -> Path:
|
|
28
|
-
"""
|
|
29
|
-
|
|
62
|
+
"""
|
|
63
|
+
Extract a range of pages to a new PDF file.
|
|
64
|
+
|
|
65
|
+
Args:
|
|
66
|
+
page_range (PageRange): Range of pages to extract.
|
|
67
|
+
destination_dir (Optional[Path]): Directory to save the new PDF file. Defaults to the same
|
|
68
|
+
directory as the source PDF.
|
|
69
|
+
extra_file_suffix (Optional[str]): An optional suffix to append to the new PDF's filename.
|
|
70
|
+
Defaults to the page range suffix.
|
|
71
|
+
|
|
72
|
+
Returns:
|
|
73
|
+
Path: The path to the newly created PDF file containing the extracted pages.
|
|
74
|
+
"""
|
|
75
|
+
destination_dir = Path(destination_dir or self.dirname)
|
|
30
76
|
create_dir_if_it_does_not_exist(destination_dir)
|
|
31
77
|
|
|
32
78
|
if extra_file_suffix is None:
|
|
@@ -36,15 +82,132 @@ class PdfFile:
|
|
|
36
82
|
|
|
37
83
|
extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
|
|
38
84
|
extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
|
|
39
|
-
|
|
85
|
+
console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'...")
|
|
40
86
|
pdf_writer = PdfWriter()
|
|
41
87
|
|
|
42
88
|
with open(self.file_path, 'rb') as source_pdf:
|
|
43
89
|
pdf_writer.append(fileobj=source_pdf, pages=page_range.to_tuple())
|
|
44
90
|
|
|
45
|
-
if SortableFile.confirm_file_overwrite(extracted_pages_pdf_path):
|
|
46
91
|
with open(extracted_pages_pdf_path, 'wb') as extracted_pages_pdf:
|
|
47
92
|
pdf_writer.write(extracted_pages_pdf)
|
|
48
93
|
|
|
49
|
-
|
|
94
|
+
console.print(f"Extracted pages to new PDF: '{extracted_pages_pdf_path}'.")
|
|
50
95
|
return extracted_pages_pdf_path
|
|
96
|
+
|
|
97
|
+
def extract_text(
|
|
98
|
+
self,
|
|
99
|
+
page_range: Optional[PageRange] = None,
|
|
100
|
+
logger: Optional[Logger] = None,
|
|
101
|
+
print_as_parsed: bool = False
|
|
102
|
+
) -> Optional[str]:
|
|
103
|
+
"""
|
|
104
|
+
Use PyPDF to extract text page by page and use Tesseract to OCR any embedded images.
|
|
105
|
+
|
|
106
|
+
Args:
|
|
107
|
+
page_range (Optional[PageRange]): If provided, only extract text from pages in this range.
|
|
108
|
+
Page numbers are 1-indexed. If not provided, extract text from all pages.
|
|
109
|
+
log (Optional[Logger]): If provided, log progress to this logger. Otherwise use default logger.
|
|
110
|
+
print_as_parsed (bool): If True, print each page's text to STDOUT as it is parsed.
|
|
111
|
+
|
|
112
|
+
Returns:
|
|
113
|
+
Optional[str]: The extracted text, or None if extraction failed.
|
|
114
|
+
"""
|
|
115
|
+
from PIL import Image # Imported here to avoid hard dependency if not using this method
|
|
116
|
+
log = logger or yaralyzer_log
|
|
117
|
+
log.debug(f"Extracting text from '{self.file_path}'...")
|
|
118
|
+
self._page_numbers_of_errors: List[int] = []
|
|
119
|
+
extracted_pages = []
|
|
120
|
+
|
|
121
|
+
try:
|
|
122
|
+
pdf_reader = PdfReader(self.file_path)
|
|
123
|
+
page_count = len(pdf_reader.pages)
|
|
124
|
+
log.debug(f"PDF Page count: {page_count}")
|
|
125
|
+
|
|
126
|
+
for page_number, page in enumerate(pdf_reader.pages, start=1):
|
|
127
|
+
if page_range and not page_range.in_range(page_number):
|
|
128
|
+
self._log_to_stderr(f"Skipping page {page_number}...")
|
|
129
|
+
continue
|
|
130
|
+
|
|
131
|
+
self._log_to_stderr(f"Parsing page {page_number}...")
|
|
132
|
+
page_buffer = Console(file=io.StringIO())
|
|
133
|
+
page_buffer.print(Panel(f"PAGE {page_number}", padding=(0, 15), expand=False))
|
|
134
|
+
page_buffer.print(escape(page.extract_text().strip()))
|
|
135
|
+
image_number = 1
|
|
136
|
+
|
|
137
|
+
# Extracting images is a bit fraught (lots of PIL and pypdf exceptions have come from here)
|
|
138
|
+
try:
|
|
139
|
+
for image_number, image in enumerate(page.images, start=1):
|
|
140
|
+
image_name = f"Page {page_number}, Image {image_number}"
|
|
141
|
+
self._log_to_stderr(f" Processing {image_name}...", "dim")
|
|
142
|
+
page_buffer.print(Panel(image_name, expand=False))
|
|
143
|
+
image_obj = Image.open(io.BytesIO(image.data))
|
|
144
|
+
image_text = ocr_text(image_obj, f"{self.file_path} ({image_name})")
|
|
145
|
+
page_buffer.print((image_text or '').strip())
|
|
146
|
+
except (OSError, NotImplementedError, TypeError, ValueError) as e:
|
|
147
|
+
error_str = exception_str(e)
|
|
148
|
+
msg = f"{error_str} while parsing embedded image {image_number} on page {page_number}..."
|
|
149
|
+
mild_warning(msg)
|
|
150
|
+
|
|
151
|
+
# Dump an error PDF and encourage user to report to pypdf team.
|
|
152
|
+
if 'JBIG2Decode' not in str(e):
|
|
153
|
+
stderr_console.print_exception()
|
|
154
|
+
|
|
155
|
+
if page_number not in self._page_numbers_of_errors:
|
|
156
|
+
self._handle_extraction_error(page_number, error_str)
|
|
157
|
+
self._page_numbers_of_errors.append(page_number)
|
|
158
|
+
|
|
159
|
+
page_text = page_buffer.file.getvalue()
|
|
160
|
+
extracted_pages.append(page_text)
|
|
161
|
+
log.debug(page_text)
|
|
162
|
+
|
|
163
|
+
if print_as_parsed:
|
|
164
|
+
print(f"{page_text}")
|
|
165
|
+
except DependencyError:
|
|
166
|
+
log.error("Pdfalyzer is missing an optional dependency required to extract text. Try 'pip install pdfalyzer[extract]'")
|
|
167
|
+
except EmptyFileError:
|
|
168
|
+
log.warning("Skipping empty file!")
|
|
169
|
+
except PdfStreamError as e:
|
|
170
|
+
print_error(f"Error parsing PDF file '{self.file_path}': {e}")
|
|
171
|
+
stderr_console.print_exception()
|
|
172
|
+
|
|
173
|
+
return "\n\n".join(extracted_pages).strip()
|
|
174
|
+
|
|
175
|
+
def print_extracted_text(self, page_range: Optional[PageRange] = None, print_as_parsed: bool = False) -> None:
|
|
176
|
+
"""Fancy wrapper for printing the extracted text to the screen."""
|
|
177
|
+
console.print(Panel(str(self.file_path), expand=False, style='bright_white reverse'))
|
|
178
|
+
txt = self.extract_text(page_range=page_range, print_as_parsed=print_as_parsed)
|
|
179
|
+
|
|
180
|
+
if not print_as_parsed:
|
|
181
|
+
console.print(txt)
|
|
182
|
+
|
|
183
|
+
def _handle_extraction_error(self, page_number: int, error_msg: str) -> None:
|
|
184
|
+
"""Rip the offending page to a new file and suggest that user report bug to PyPDF."""
|
|
185
|
+
destination_dir = DEFAULT_PDF_ERRORS_DIR
|
|
186
|
+
|
|
187
|
+
try:
|
|
188
|
+
extracted_file = self.extract_page_range(PageRange(str(page_number)), destination_dir, error_msg)
|
|
189
|
+
except Exception as e:
|
|
190
|
+
stderr_console.print(error_text(f"Failed to extract a page for submission to PyPDF team."))
|
|
191
|
+
extracted_file = None
|
|
192
|
+
|
|
193
|
+
blink_txt = Text('', style='bright_white')
|
|
194
|
+
blink_txt.append("An error (", style='blink color(154)').append(error_msg, style='color(11) blink')
|
|
195
|
+
blink_txt.append(') ', style='blink color(154)')
|
|
196
|
+
blink_txt.append("was encountered while processing a PDF file.\n\n", style='blink color(154)')
|
|
197
|
+
|
|
198
|
+
txt = Text(f"The error was of a type such that it probably came from a bug in ", style='bright_white')
|
|
199
|
+
txt.append('PyPDF', style='underline bright_green').append('. It was encountered processing the file ')
|
|
200
|
+
txt.append(str(self.file_path), style='file').append('. You should see a stack trace above this box.\n\n')
|
|
201
|
+
|
|
202
|
+
txt.append('The offending page will be extracted to ', style='bright_white')
|
|
203
|
+
txt.append(str(extracted_file), style='file').append('.\n\n')
|
|
204
|
+
txt.append(f"Please visit 'https://github.com/py-pdf/pypdf/issues' to report a bug. ", style='bold')
|
|
205
|
+
txt.append(f"Providing the devs with the extracted page and the stack trace help improve pypdf.")
|
|
206
|
+
stderr_console.print(attention_getting_panel(blink_txt + txt, title='PyPDF Error'))
|
|
207
|
+
|
|
208
|
+
def _log_to_stderr(self, msg: str, style: Optional[str] = None) -> None:
|
|
209
|
+
"""When parsing very large PDFs it can be useful to log progress and other messages to STDERR."""
|
|
210
|
+
if self.file_size < MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR:
|
|
211
|
+
return
|
|
212
|
+
|
|
213
|
+
stderr_console.print(msg, style=style or "")
|
|
@@ -6,6 +6,7 @@ from pathlib import Path
|
|
|
6
6
|
from typing import Optional, Union
|
|
7
7
|
|
|
8
8
|
from yaralyzer.output.rich_console import console
|
|
9
|
+
from yaralyzer.util.logging import log
|
|
9
10
|
|
|
10
11
|
from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
11
12
|
|
|
@@ -18,9 +19,20 @@ PDF_EXT = '.pdf'
|
|
|
18
19
|
# type StrOrPath = Union[str, Path]
|
|
19
20
|
|
|
20
21
|
|
|
21
|
-
def
|
|
22
|
-
"""
|
|
23
|
-
|
|
22
|
+
def create_dir_if_it_does_not_exist(dir: Path) -> None:
|
|
23
|
+
"""Like it says on the tin."""
|
|
24
|
+
if dir.exists():
|
|
25
|
+
return
|
|
26
|
+
|
|
27
|
+
console.warning(f"Need to create '{dir}'")
|
|
28
|
+
dir.mkdir(parents=True, exist_ok=True)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def insert_suffix_before_extension(file_path: Path, suffix: str, separator: str = '__') -> Path:
|
|
32
|
+
"""Inserting 'page 1' suffix in 'path/to/file.jpg' -> '/path/to/file__page_1.jpg'."""
|
|
33
|
+
suffix = strip_bad_chars(suffix).replace(' ', '_')
|
|
34
|
+
file_path_without_extension = file_path.with_suffix('')
|
|
35
|
+
return Path(f"{file_path_without_extension}{separator}{suffix}{file_path.suffix}")
|
|
24
36
|
|
|
25
37
|
|
|
26
38
|
def is_pdf(file_path: Union[str, Path]) -> bool:
|
|
@@ -100,3 +112,15 @@ def set_max_open_files(num_filehandles: int = DEFAULT_MAX_OPEN_FILES) -> tuple[O
|
|
|
100
112
|
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
|
|
101
113
|
|
|
102
114
|
return (soft, hard)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def strip_bad_chars(text: str) -> str:
|
|
118
|
+
"""Remove chars that don't work well in filenames."""
|
|
119
|
+
text = ' '.join(text.splitlines()).replace('\\s+', ' ')
|
|
120
|
+
text = re.sub('’', "'", text).replace('|', 'I').replace(',', ',')
|
|
121
|
+
return re.sub('[^-0-9a-zA-Z@.,?_:=#\'\\$" ()]+', '_', text).replace(' ', ' ')
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def with_pdf_extension(file_path: Union[str, Path]) -> str:
|
|
125
|
+
"""Append `".pdf"` to `file_path` if it doesn't already end with `".pdf"`."""
|
|
126
|
+
return str(file_path) + ('' if is_pdf(file_path) else PDF_EXT)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from PIL import Image
|
|
4
|
+
from yaralyzer.output.rich_console import console
|
|
5
|
+
|
|
6
|
+
from pdfalyzer.helpers.rich_text_helper import warning_text
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def ocr_text(image: Image.Image, image_name: str) -> Optional[str]:
|
|
10
|
+
"""Use pytesseract to OCR the text in the image and return it as a string."""
|
|
11
|
+
import pytesseract
|
|
12
|
+
text = None
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
text = pytesseract.image_to_string(image)
|
|
16
|
+
except pytesseract.pytesseract.TesseractError as e:
|
|
17
|
+
console.print_exception()
|
|
18
|
+
console.print(warning_text(f"Tesseract OCR failure '{image_name}'! No OCR text extracted..."))
|
|
19
|
+
except OSError as e:
|
|
20
|
+
if 'truncated' in str(e):
|
|
21
|
+
console.print(warning_text(f"Truncated image file '{image_name}'!"))
|
|
22
|
+
else:
|
|
23
|
+
console.print_exception()
|
|
24
|
+
console.print(f"Error while extracting '{image_name}'!", style='bright_red')
|
|
25
|
+
raise e
|
|
26
|
+
except Exception as e:
|
|
27
|
+
console.print_exception()
|
|
28
|
+
console.print(f"Error while extracting '{image_name}'!", style='bright_red')
|
|
29
|
+
raise e
|
|
30
|
+
|
|
31
|
+
return None if text is None else text.strip()
|
|
@@ -1,17 +1,24 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Functions for miscellaneous Rich text/string pretty printing operations.
|
|
3
3
|
"""
|
|
4
|
-
from
|
|
4
|
+
from sys import stderr
|
|
5
|
+
from typing import List, Optional, Union
|
|
5
6
|
|
|
6
7
|
from pypdf.generic import PdfObject
|
|
7
8
|
from rich.console import Console
|
|
9
|
+
from rich.panel import Panel
|
|
10
|
+
from rich.padding import Padding
|
|
8
11
|
from rich.text import Text
|
|
12
|
+
from yaralyzer.output.rich_console import console
|
|
9
13
|
|
|
10
14
|
from pdfalyzer.helpers.pdf_object_helper import pypdf_class_name
|
|
11
15
|
from pdfalyzer.output.styles.node_colors import get_label_style, get_class_style_italic
|
|
12
16
|
|
|
17
|
+
ARROW_BULLET = '➤ '
|
|
18
|
+
|
|
13
19
|
# Usually we use the yaralyzer console but that has no highlighter
|
|
14
20
|
pdfalyzer_console = Console(color_system='256')
|
|
21
|
+
stderr_console = Console(color_system='256', file=stderr)
|
|
15
22
|
|
|
16
23
|
|
|
17
24
|
def print_highlighted(msg: Union[str, Text], **kwargs) -> None:
|
|
@@ -32,6 +39,21 @@ def quoted_text(
|
|
|
32
39
|
return txt
|
|
33
40
|
|
|
34
41
|
|
|
42
|
+
def indented_bullet(msg: Union[str, Text], style: Optional[str] = None) -> Text:
|
|
43
|
+
return Text(' ') + bullet_text(msg, style)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def bullet_text(msg: Union[str, Text], style: Optional[str] = None) -> Text:
|
|
47
|
+
if isinstance(msg, str):
|
|
48
|
+
msg = Text(msg, style=style)
|
|
49
|
+
|
|
50
|
+
return Text(ARROW_BULLET).append(msg)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def mild_warning(msg: str) -> None:
|
|
54
|
+
console.print(indented_bullet(Text(msg, style='mild_warning')))
|
|
55
|
+
|
|
56
|
+
|
|
35
57
|
def node_label(idnum: int, label: str, pdf_object: PdfObject, underline: bool = True) -> Text:
|
|
36
58
|
"""Colored text representation of a PDF node. Example: <5:FontDescriptor(Dictionary)>."""
|
|
37
59
|
text = Text('<', style='white')
|
|
@@ -58,3 +80,30 @@ def pct_txt(_number: int, total: int, digits: int = 1) -> Text:
|
|
|
58
80
|
"""Return nicely formatted percentage, e.g. '(80%)'."""
|
|
59
81
|
pct = (100 * float(_number) / float(total)).__round__(digits)
|
|
60
82
|
return Text(f"({pct}%)", style='blue')
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def warning_text(text: Union[str, Text]) -> Text:
|
|
86
|
+
msg = Text('').append(f"WARNING", style='bright_yellow').append(": ")
|
|
87
|
+
|
|
88
|
+
if isinstance(text, Text):
|
|
89
|
+
return msg + text
|
|
90
|
+
else:
|
|
91
|
+
return msg.append(text)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def error_text(text: Union[str, Text]) -> Text:
|
|
95
|
+
msg = Text('').append(f"ERROR", style='bright_red').append(": ")
|
|
96
|
+
|
|
97
|
+
if isinstance(text, Text):
|
|
98
|
+
return msg + text
|
|
99
|
+
else:
|
|
100
|
+
return msg.append(text)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def attention_getting_panel(text: Text, title: str, style: str = 'white on red') -> Padding:
|
|
104
|
+
p = Panel(text, padding=(2), title=title, style=style)
|
|
105
|
+
return Padding(p, pad=(1, 10, 2, 10))
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def print_error(text: Union[str, Text]) -> Text:
|
|
109
|
+
console.print(error_text(text))
|
|
@@ -3,7 +3,7 @@ Various text formatting/styling/manipulating methods.
|
|
|
3
3
|
"""
|
|
4
4
|
import re
|
|
5
5
|
from pprint import PrettyPrinter
|
|
6
|
-
from typing import List, Pattern, Union
|
|
6
|
+
from typing import List, Optional, Pattern, Union
|
|
7
7
|
|
|
8
8
|
from yaralyzer.output.rich_console import console_width
|
|
9
9
|
|
|
@@ -18,16 +18,14 @@ pp = PrettyPrinter(
|
|
|
18
18
|
sort_dicts=True)
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
def
|
|
22
|
-
"""
|
|
23
|
-
|
|
21
|
+
def all_strings_are_same_ignoring_numbers(strings: List[str]) -> bool:
|
|
22
|
+
"""Returns true if string addresses are same except for digits."""
|
|
23
|
+
return len(set([replace_digits(s) for s in strings])) == 1
|
|
24
24
|
|
|
25
|
-
if title is None:
|
|
26
|
-
return '-' * width
|
|
27
25
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
return
|
|
26
|
+
def bracketed(index: Union[int, str]) -> str:
|
|
27
|
+
"""Surround index with [ and ]."""
|
|
28
|
+
return f"[{index}]"
|
|
31
29
|
|
|
32
30
|
|
|
33
31
|
def count_pattern_matches_in_text(pattern: str, text: str) -> int:
|
|
@@ -35,42 +33,47 @@ def count_pattern_matches_in_text(pattern: str, text: str) -> int:
|
|
|
35
33
|
|
|
36
34
|
|
|
37
35
|
def count_regex_matches_in_text(regex: Pattern, text: str) -> int:
|
|
38
|
-
"""For use when you precompile the regex"""
|
|
36
|
+
"""For use when you precompile the regex."""
|
|
39
37
|
return sum(1 for _ in regex.finditer(text))
|
|
40
38
|
|
|
41
39
|
|
|
42
|
-
def
|
|
43
|
-
"""
|
|
44
|
-
return
|
|
40
|
+
def exception_str(e: Exception) -> str:
|
|
41
|
+
"""A string with the type and message."""
|
|
42
|
+
return f"{type(e).__name__}: {e}"
|
|
45
43
|
|
|
46
44
|
|
|
47
|
-
def
|
|
48
|
-
"""
|
|
49
|
-
|
|
45
|
+
def generate_hyphen_line(width: Optional[int] = None, title: Optional[str] = None):
|
|
46
|
+
"""e.g. '-----------------BEGIN-----------------'"""
|
|
47
|
+
width = width or console_width()
|
|
50
48
|
|
|
49
|
+
if title is None:
|
|
50
|
+
return '-' * width
|
|
51
51
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
return
|
|
52
|
+
side_hyphens = int((width - len(title)) / 2) * '-'
|
|
53
|
+
line = side_hyphens + title + side_hyphens
|
|
54
|
+
return line if len(line) == width else line + '-'
|
|
55
55
|
|
|
56
56
|
|
|
57
|
-
def
|
|
58
|
-
|
|
59
|
-
return DIGIT_REGEX.sub('x', string_with_digits)
|
|
57
|
+
def has_a_common_substring(strings: List[str]) -> bool:
|
|
58
|
+
return all([is_substring_of_longer_strings_in_list(s, strings) for s in strings])
|
|
60
59
|
|
|
61
60
|
|
|
62
|
-
def
|
|
63
|
-
"""Returns
|
|
64
|
-
return
|
|
61
|
+
def is_prefixed_by_any(_string: str, prefixes: List[str]) -> bool:
|
|
62
|
+
"""Returns True if _string starts with anything in 'prefixes'."""
|
|
63
|
+
return any([_string.startswith(prefix) for prefix in prefixes])
|
|
65
64
|
|
|
66
65
|
|
|
67
66
|
def is_substring_of_longer_strings_in_list(_string: str, strings: List[str]) -> bool:
|
|
67
|
+
"""Return True if '_string' is a substring of all the 'strings' longer than '_string'."""
|
|
68
68
|
longer_strings = [s for s in strings if len(s) > len(_string)]
|
|
69
69
|
return all([_string in longer_string for longer_string in longer_strings])
|
|
70
70
|
|
|
71
71
|
|
|
72
|
-
def
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
72
|
+
def replace_digits(string_with_digits: str) -> str:
|
|
73
|
+
"""Turn all digits to X chars in a string."""
|
|
74
|
+
return DIGIT_REGEX.sub('x', string_with_digits)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def root_address(_string: str) -> str:
|
|
78
|
+
"""Strip the bracketed part off an address, e.g. '/Root[1]' => '/Root'."""
|
|
79
|
+
return _string.split('[')[0]
|
|
@@ -51,7 +51,8 @@ PDFALYZER_THEME_DICT.update({
|
|
|
51
51
|
'warn.harsh': 'reverse bright_yellow',
|
|
52
52
|
# error log events
|
|
53
53
|
'fail': 'bold reverse red',
|
|
54
|
-
'
|
|
54
|
+
'mild_error': 'red', # TODO: unused?
|
|
55
|
+
'mild_warning': 'color(228) dim',
|
|
55
56
|
'red_alert': 'blink bold red reverse on white',
|
|
56
57
|
})
|
|
57
58
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Parse command line arguments for
|
|
2
|
+
Parse command line arguments for `pdfalyze` and construct the `PdfalyzerConfig` object.
|
|
3
3
|
"""
|
|
4
4
|
import sys
|
|
5
5
|
from argparse import ArgumentParser, Namespace
|
|
@@ -17,8 +17,6 @@ from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_
|
|
|
17
17
|
|
|
18
18
|
from pdfalyzer.config import ALL_STREAMS, PDFALYZER, PdfalyzerConfig
|
|
19
19
|
from pdfalyzer.detection.constants.binary_regexes import QUOTE_PATTERNS
|
|
20
|
-
from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
|
|
21
|
-
with_pdf_extension)
|
|
22
20
|
from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
23
21
|
|
|
24
22
|
# NamedTuple to keep our argument selection orderly
|
|
@@ -202,61 +200,9 @@ def all_sections_chosen(args):
|
|
|
202
200
|
return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)
|
|
203
201
|
|
|
204
202
|
|
|
205
|
-
|
|
206
|
-
#
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
MAX_QUALITY = 10
|
|
210
|
-
|
|
211
|
-
combine_pdfs_parser = ArgumentParser(
|
|
212
|
-
description="Combine multiple PDFs into one.",
|
|
213
|
-
epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" +
|
|
214
|
-
" page numbers prior to merging.",
|
|
215
|
-
formatter_class=RichHelpFormatterPlus)
|
|
216
|
-
|
|
217
|
-
combine_pdfs_parser.add_argument('pdfs',
|
|
218
|
-
help='two or more PDFs to combine',
|
|
219
|
-
metavar='PDF_PATH',
|
|
220
|
-
nargs='+')
|
|
221
|
-
|
|
222
|
-
combine_pdfs_parser.add_argument('-iq', '--image-quality',
|
|
223
|
-
help='image quality for embedded images (can compress PDF at loss of quality)',
|
|
224
|
-
choices=range(1, MAX_QUALITY + 1),
|
|
225
|
-
default=MAX_QUALITY,
|
|
226
|
-
type=int)
|
|
227
|
-
|
|
228
|
-
combine_pdfs_parser.add_argument('-o', '--output-file',
|
|
229
|
-
help='path to write the combined PDFs to',
|
|
230
|
-
required=True)
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
def parse_combine_pdfs_args() -> Namespace:
|
|
234
|
-
"""Parse command line args for combine_pdfs script."""
|
|
235
|
-
args = combine_pdfs_parser.parse_args()
|
|
236
|
-
args.output_file = with_pdf_extension(args.output_file)
|
|
237
|
-
confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
|
|
238
|
-
args.number_of_pdfs = len(args.pdfs)
|
|
239
|
-
|
|
240
|
-
if args.number_of_pdfs < 2:
|
|
241
|
-
exit_with_error(f"Need at least 2 PDFs to merge.")
|
|
242
|
-
elif not do_all_files_exist(args.pdfs):
|
|
243
|
-
exit_with_error()
|
|
244
|
-
elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
|
|
245
|
-
exit_with_error()
|
|
246
|
-
|
|
247
|
-
if all(is_pdf(pdf) for pdf in args.pdfs):
|
|
248
|
-
if all(extract_page_number(pdf) for pdf in args.pdfs):
|
|
249
|
-
print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
|
|
250
|
-
args.pdfs.sort(key=lambda pdf: extract_page_number(pdf))
|
|
251
|
-
else:
|
|
252
|
-
print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
|
|
253
|
-
else:
|
|
254
|
-
print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
|
|
255
|
-
ask_to_proceed()
|
|
256
|
-
|
|
257
|
-
print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
|
|
258
|
-
return args
|
|
259
|
-
|
|
203
|
+
#############
|
|
204
|
+
# Helpers #
|
|
205
|
+
#############
|
|
260
206
|
|
|
261
207
|
def ask_to_proceed() -> None:
|
|
262
208
|
"""Exit if user doesn't confirm they want to proceed."""
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Argument parsers for the command line tools other than `pdfalyze` that are included with The Pdfalyzer.
|
|
3
|
+
|
|
4
|
+
1. combine_pdfs
|
|
5
|
+
2.
|
|
6
|
+
"""
|
|
7
|
+
import sys
|
|
8
|
+
from argparse import ArgumentParser, Namespace
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from rich_argparse_plus import RichHelpFormatterPlus
|
|
12
|
+
from rich.prompt import Confirm
|
|
13
|
+
from rich.text import Text
|
|
14
|
+
from yaralyzer.helpers.file_helper import files_in_dir
|
|
15
|
+
from yaralyzer.util.logging import log
|
|
16
|
+
|
|
17
|
+
from pdfalyzer.util.argument_parser import ask_to_proceed, exit_with_error
|
|
18
|
+
from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
|
|
19
|
+
with_pdf_extension)
|
|
20
|
+
from pdfalyzer.helpers.rich_text_helper import print_highlighted
|
|
21
|
+
from pdfalyzer.util.page_range import PageRangeArgumentValidator
|
|
22
|
+
|
|
23
|
+
MAX_QUALITY = 10
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
##################
|
|
27
|
+
# combine_pdfs #
|
|
28
|
+
##################
|
|
29
|
+
combine_pdfs_parser = ArgumentParser(
|
|
30
|
+
description="Combine multiple PDFs into one.",
|
|
31
|
+
epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" +
|
|
32
|
+
" page numbers prior to merging.",
|
|
33
|
+
formatter_class=RichHelpFormatterPlus)
|
|
34
|
+
|
|
35
|
+
combine_pdfs_parser.add_argument('pdfs',
|
|
36
|
+
help='two or more PDFs to combine',
|
|
37
|
+
metavar='PDF_PATH',
|
|
38
|
+
nargs='+')
|
|
39
|
+
|
|
40
|
+
combine_pdfs_parser.add_argument('-iq', '--image-quality',
|
|
41
|
+
help='image quality for embedded images (can compress PDF at loss of quality)',
|
|
42
|
+
choices=range(1, MAX_QUALITY + 1),
|
|
43
|
+
default=MAX_QUALITY,
|
|
44
|
+
type=int)
|
|
45
|
+
|
|
46
|
+
combine_pdfs_parser.add_argument('-o', '--output-file',
|
|
47
|
+
help='path to write the combined PDFs to',
|
|
48
|
+
required=True)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def parse_combine_pdfs_args() -> Namespace:
    """
    Parse and validate the command line args for the combine_pdfs script.

    Besides the parsed options the returned namespace carries `number_of_pdfs`,
    and `output_file` has been passed through `with_pdf_extension()`. When every
    input looks like a PDF and every one yields a page number, `args.pdfs` is
    sorted by that number; otherwise the order given on the command line is kept.
    """
    args = combine_pdfs_parser.parse_args()
    args.output_file = with_pdf_extension(args.output_file)
    args.number_of_pdfs = len(args.pdfs)

    overwrite_prompt = (
        Text("Overwrite '")
        .append(args.output_file, style='cyan')
        .append("'?")
    )

    # Validation is a single if/elif chain so at most one check fires and the
    # overwrite prompt is only shown once the earlier checks have passed.
    if args.number_of_pdfs < 2:
        exit_with_error("Need at least 2 PDFs to merge.")
    elif not do_all_files_exist(args.pdfs):
        exit_with_error()
    elif file_exists(args.output_file) and not Confirm.ask(overwrite_prompt):
        exit_with_error()

    if not all(is_pdf(path) for path in args.pdfs):
        print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
        ask_to_proceed()
    elif all(extract_page_number(path) for path in args.pdfs):
        print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
        args.pdfs.sort(key=extract_page_number)
    else:
        print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')

    print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
    return args
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
#####################
|
|
80
|
+
# extract_pdf_pages #
|
|
81
|
+
#####################
|
|
82
|
+
# Shared argparse `type=` callable; also used by the extract_text_from_pdfs parser.
page_range_validator = PageRangeArgumentValidator()

# Argument parser for the `extract_pdf_pages` command line tool.
extract_pdf_parser = ArgumentParser(
    description="Extract pages from one PDF into a new PDF.",
    formatter_class=RichHelpFormatterPlus,
)

extract_pdf_parser.add_argument(
    'pdf_file',
    help='PDF to extract pages from',
    metavar='PDF_FILE',
)

extract_pdf_parser.add_argument(
    '--page-range', '-r',
    help=page_range_validator.HELP_MSG,
    required=True,
    type=page_range_validator,
)

# The default is the working directory at the time this module is imported.
extract_pdf_parser.add_argument(
    '--destination-dir', '-d',
    default=Path.cwd(),
    help="directory to write the new PDF to",
)

extract_pdf_parser.add_argument(
    '--debug',
    help='turn on debug level logging',
    action='store_true',
)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def parse_pdf_page_extraction_args() -> Namespace:
    """
    Parse and validate the command line args for the extract_pdf_pages script.

    Logs an error and exits the process if `pdf_file` fails the `is_pdf()` check
    (exit status -1) or if the destination directory does not exist (exit status 1).
    """
    args = extract_pdf_parser.parse_args()

    # Guard clauses: sys.exit() raises, so each failed check ends the function.
    if not is_pdf(args.pdf_file):
        log.error(f"'{args.pdf_file}' is not a PDF.")
        sys.exit(-1)

    if not Path(args.destination_dir).exists():
        log.error(f"Destination dir '{args.destination_dir}' does not exist.")
        sys.exit(1)

    return args
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
############################
|
|
117
|
+
# extract_text_from_pdfs #
|
|
118
|
+
############################
|
|
119
|
+
# Argument parser for the `extract_text_from_pdfs` command line tool.
extract_text_parser = ArgumentParser(
    description="Extract the text from one or more files or directories.",
    epilog="If any of the FILE_OR_DIRs is a directory all PDF files in that directory will be extracted.",
    formatter_class=RichHelpFormatterPlus,
)

extract_text_parser.add_argument(
    'file_or_dir',
    metavar='FILE_OR_DIR',
    nargs='+',
)

extract_text_parser.add_argument(
    '--debug',
    help='turn on debug level logging',
    action='store_true',
)

# Optional here (unlike extract_pdf_pages); parse_text_extraction_args() rejects
# it unless exactly one PDF is being processed.
extract_text_parser.add_argument(
    '--page-range', '-r',
    help="[PDFs only] " + page_range_validator.HELP_MSG,
    type=page_range_validator,
)

extract_text_parser.add_argument(
    '--print-as-parsed', '-p',
    help='print pages as they are parsed instead of waiting until document is fully parsed',
    action='store_true',
)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def parse_text_extraction_args() -> Namespace:
    """
    Parse and validate the command line args for the extract_text_from_pdfs script.

    Expands every FILE_OR_DIR argument into `args.files_to_process`: directories
    contribute whatever `files_in_dir(path, 'pdf')` returns, anything else is
    appended as a `Path`. Logs an error and exits with status -1 if an argument
    does not exist on disk, or if --page-range was given but the expansion did
    not produce exactly one file that passes `is_pdf()`.
    """
    args = extract_text_parser.parse_args()
    args.files_to_process = []

    for file_or_dir in args.file_or_dir:
        file_path = Path(file_or_dir)

        if not file_path.exists():
            log.error(f"'{file_path}' is not a valid file or directory.")
            sys.exit(-1)
        elif file_path.is_dir():
            args.files_to_process.extend(files_in_dir(file_path, 'pdf'))
        else:
            args.files_to_process.append(file_path)

    # `!= 1` rather than `> 1`: a directory without PDFs leaves the list empty, and
    # indexing [0] on it would raise IndexError instead of reporting the usage error.
    if args.page_range and (len(args.files_to_process) != 1 or not is_pdf(args.files_to_process[0])):
        log.error("--page-range can only be specified for a single PDF")
        sys.exit(-1)

    return args
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""
|
|
2
|
+
A range of page numbers. Copied from clown_sort repo.
|
|
3
|
+
"""
|
|
4
|
+
import re
|
|
5
|
+
from argparse import ArgumentTypeError
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Tuple
|
|
8
|
+
|
|
9
|
+
# A page number with no leading zero ('11'), optionally followed by '-' and an end page ('11-15').
PAGE_RANGE_REGEX = re.compile(r'[1-9](\d+)?(-\d+)?')


@dataclass
class PageRange:
    """
    A half-open range of 1-indexed page numbers parsed from a string.

    `page_range` is either a single page ('11') or 'FIRST-LAST' ('11-15'). After
    construction `first_page` is inclusive and `last_page` is exclusive, so
    '11-15' covers pages 11 through 14 and '11' covers only page 11.

    Raises:
        ArgumentTypeError: if `page_range` is not of the form 'N' or 'N-M'.
        ValueError: if the end of the range is not greater than its start.
    """
    page_range: str

    def __post_init__(self):
        # fullmatch() instead of match(): match() only anchors at the start, so strings
        # such as '5x' or '1-2-3' would pass and then fail below with an unrelated
        # ValueError from int() or from tuple unpacking.
        if not PAGE_RANGE_REGEX.fullmatch(self.page_range):
            raise ArgumentTypeError(f"Invalid page range '{self.page_range}'")

        if '-' in self.page_range:
            (self.first_page, self.last_page) = (int(p) for p in self.page_range.split('-'))
        else:
            # A single page N is stored as the half-open range [N, N + 1).
            self.first_page = int(self.page_range)
            self.last_page = self.first_page + 1

        if self.last_page <= self.first_page:
            raise ValueError(f"Invalid page range {self.__repr__()}")

    def in_range(self, page_number) -> bool:
        """Returns `True` if `page_number` is in this range (`last_page` itself is excluded)."""
        return page_number >= self.first_page and page_number < self.last_page

    def file_suffix(self) -> str:
        """String that can be used as file suffix; names the last page actually covered."""
        if self.first_page + 1 == self.last_page:
            return f"page_{self.first_page}"
        else:
            return f"pages_{self.first_page}-{self.last_page - 1}"

    def to_tuple(self) -> Tuple[int, int]:
        """Return `(first_page, last_page)` with both values shifted down by one (0-indexed)."""
        return (self.first_page - 1, self.last_page - 1)

    def __repr__(self) -> str:
        return f"PageRange({self.first_page}, {self.last_page})"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class PageRangeArgumentValidator:
    """Callable that builds a `PageRange` from the string it is called with."""

    # Description of the accepted syntax, for use as option help text.
    HELP_MSG = "a single digit ('11') or a range ('11-15') (WILL NOT extract the last page)"

    def __call__(self, value: str) -> "PageRange":
        """Return `PageRange(value)`; any exception raised by `PageRange` propagates."""
        return PageRange(value)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pdfalyzer
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.17.1
|
|
4
4
|
Summary: PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
|
|
5
5
|
Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
6
6
|
License: GPL-3.0-or-later
|
|
@@ -21,20 +21,23 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
21
21
|
Classifier: Topic :: Artistic Software
|
|
22
22
|
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
23
23
|
Classifier: Topic :: Security
|
|
24
|
+
Provides-Extra: extract
|
|
25
|
+
Requires-Dist: PyMuPDF (>=1.26.4,<2.0.0) ; extra == "extract"
|
|
24
26
|
Requires-Dist: anytree (>=2.13,<3.0)
|
|
25
27
|
Requires-Dist: pypdf (>=6.0.0,<7.0.0)
|
|
28
|
+
Requires-Dist: pytesseract (>=0.3.13,<0.4.0) ; extra == "extract"
|
|
26
29
|
Requires-Dist: yaralyzer (>=1.0.9,<2.0.0)
|
|
27
30
|
Project-URL: Changelog, https://github.com/michelcrypt4d4mus/pdfalyzer/blob/master/CHANGELOG.md
|
|
28
31
|
Project-URL: Documentation, https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
29
32
|
Project-URL: Repository, https://github.com/michelcrypt4d4mus/pdfalyzer
|
|
30
33
|
Description-Content-Type: text/markdown
|
|
31
34
|
|
|
32
|
-
<!--  -->
|
|
33
|
-

|
|
34
35
|
[](https://pypi.org/project/pdfalyzer/)
|
|
35
|
-
[](https://github.com/michelcrypt4d4mus/pdfalyzer)
|
|
36
36
|

|
|
37
|
+
[](https://github.com/michelcrypt4d4mus/pdfalyzer)
|
|
38
|
+

|
|
37
39
|

|
|
40
|
+
[](https://github.com/michelcrypt4d4mus/pdfalyzer/actions/workflows/python-package.yml)
|
|
38
41
|
|
|
39
42
|
|
|
40
43
|
# THE PDFALYZER
|
|
@@ -242,9 +245,21 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
|
|
|
242
245
|
|
|
243
246
|
-------------
|
|
244
247
|
|
|
248
|
+
|
|
245
249
|
# PDF Resources
|
|
246
250
|
## Included PDF Tools
|
|
247
|
-
The Pdfalyzer
|
|
251
|
+
The Pdfalyzer comes with a few command line tools:
|
|
252
|
+
|
|
253
|
+
* `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
|
|
254
|
+
* `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
|
|
255
|
+
* `extract_text_from_pdfs` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_text_from_pdfs --help` for more info.
|
|
256
|
+
|
|
257
|
+
Running `extract_text_from_pdfs` requires that you install The Pdfalyzer's optional dependencies:
|
|
258
|
+
|
|
259
|
+
```bash
|
|
260
|
+
pipx install pdfalyzer[extract]
|
|
261
|
+
```
|
|
262
|
+
|
|
248
263
|
|
|
249
264
|
## 3rd Party PDF Tools
|
|
250
265
|
### Installing Didier Stevens's PDF Analysis Tools
|
|
@@ -1,32 +1,33 @@
|
|
|
1
1
|
.pdfalyzer.example,sha256=sh_qkUBw4hfJia_Dx2wB-fsqJInhx2sSgA7WJz3MHYo,3917
|
|
2
|
-
CHANGELOG.md,sha256=
|
|
2
|
+
CHANGELOG.md,sha256=KtprK6EZ8FhdPWHs9E-YzGSqHxV_w0GnShvIJ6kMPss,13132
|
|
3
3
|
LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
4
|
-
pdfalyzer/__init__.py,sha256=
|
|
4
|
+
pdfalyzer/__init__.py,sha256=TgCkfaaWuxv3sNMHcMZjh5lAw0oPNYKqJYRXVy9hPKo,6181
|
|
5
5
|
pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
|
|
6
6
|
pdfalyzer/binary/binary_scanner.py,sha256=gEHHdQ3lKe1P1441fk2OLO8GxJF7FxF8Tq04ElB8ilU,10748
|
|
7
7
|
pdfalyzer/config.py,sha256=RBtp3Q6IvOW922rcanB4mVceJEM0BEjNybc6_vC7efY,2122
|
|
8
8
|
pdfalyzer/decorators/document_model_printer.py,sha256=2tjJItZltukmOD2wjGvl2IsBM7Gug59wMHGLpzGDbV0,2682
|
|
9
9
|
pdfalyzer/decorators/indeterminate_node.py,sha256=QLJr-nGKih8gPZcIqxLU028OwWWD5VjNHYMUjniwT_k,6586
|
|
10
|
-
pdfalyzer/decorators/pdf_file.py,sha256=
|
|
10
|
+
pdfalyzer/decorators/pdf_file.py,sha256=CHXyM8RIvnjKnsDOJxUhk-sfRzLLW50MJpKKTax6Eqk,10274
|
|
11
11
|
pdfalyzer/decorators/pdf_object_properties.py,sha256=Il3RObxQ4XUf0Ei-nd4tjJO0LeaxC6u7yFa3cQs_jVY,5485
|
|
12
12
|
pdfalyzer/decorators/pdf_tree_node.py,sha256=4LReGJUtG8iEcLUQD1jW-yp3xPWsHrC-3Anbkt7XZ3A,11134
|
|
13
13
|
pdfalyzer/decorators/pdf_tree_verifier.py,sha256=2hVe9APsAWQZ7ra8AGndHQnGWmmxmb3ZwfJHZuLvsvc,4714
|
|
14
14
|
pdfalyzer/detection/constants/binary_regexes.py,sha256=s69S7uq1v4vBy3ZkKKKt3ClNuFCuQ0ztootUxzlgfFw,1632
|
|
15
15
|
pdfalyzer/detection/constants/javascript_reserved_keywords.py,sha256=CXXdWskdQa0Hs5wCci2RBVvipgZg34_cLfmkWG4Xcmg,991
|
|
16
16
|
pdfalyzer/detection/javascript_hunter.py,sha256=_wT2vkKTMlm_RGCjYsmwcmV-ag1qep3EpkHmUw0nWcQ,711
|
|
17
|
-
pdfalyzer/detection/yaralyzer_helper.py,sha256=
|
|
17
|
+
pdfalyzer/detection/yaralyzer_helper.py,sha256=bfIa18f0zdUoOAJIQcwVDjF52sJNV47NdsDasg01uiQ,2194
|
|
18
18
|
pdfalyzer/font_info.py,sha256=2R85iETY_1eKCeRrkqeIxfPDqXZyWfCNcHx_-aTyF0s,6682
|
|
19
19
|
pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
|
|
20
|
-
pdfalyzer/helpers/filesystem_helper.py,sha256=
|
|
20
|
+
pdfalyzer/helpers/filesystem_helper.py,sha256=onXhSMhxo0YkvdKdosRwUo_RGdW6yNzZF5hfjgZ3GBE,5085
|
|
21
|
+
pdfalyzer/helpers/image_helper.py,sha256=QjoAUcKKEtpmuEyOmEfmaUN-lzNykQ1SzqgNn9-R4Y0,1120
|
|
21
22
|
pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
|
|
22
23
|
pdfalyzer/helpers/pdf_object_helper.py,sha256=65BlUgnDM9brxJFw_WF8QLomWHaNh-pj88NWoxcMkoQ,1160
|
|
23
|
-
pdfalyzer/helpers/rich_text_helper.py,sha256=
|
|
24
|
-
pdfalyzer/helpers/string_helper.py,sha256=
|
|
24
|
+
pdfalyzer/helpers/rich_text_helper.py,sha256=Q5Zj0I96ymQmDWHkOX4lWEvkizOMMgzYNx4CF35t_7w,3561
|
|
25
|
+
pdfalyzer/helpers/string_helper.py,sha256=zl7VnxqkaB50Zv1yQoz-ShVcLT2_nOgmxekWTpXHyx4,2521
|
|
25
26
|
pdfalyzer/output/character_mapping.py,sha256=UN66b4BjvJiokBCi2kregiQvi6u2l1BJcHYFGG_G43M,2190
|
|
26
27
|
pdfalyzer/output/layout.py,sha256=U9n5RnwwBg2UXxRBAc4E2gQ9t3dNsmiu62klz-Ig1Zg,2767
|
|
27
28
|
pdfalyzer/output/pdfalyzer_presenter.py,sha256=TUsMc2GTUDjFzIGk7Ep5ZASfXcKX_WNtZzZKbQTHcfY,8580
|
|
28
29
|
pdfalyzer/output/styles/node_colors.py,sha256=rfsTAUF43K_buw21SZoP6L5c_cLy7S-xA4GUiWJsDkc,3986
|
|
29
|
-
pdfalyzer/output/styles/rich_theme.py,sha256=
|
|
30
|
+
pdfalyzer/output/styles/rich_theme.py,sha256=akOs6eBtYCHMeB3igehDXY6n9Bg0G2v51oxObE__lGo,2039
|
|
30
31
|
pdfalyzer/output/tables/decoding_stats_table.py,sha256=AZ36NUmgRd_GGykPRXHHTsMvfkxyltKB2XWDPKEZPm4,3622
|
|
31
32
|
pdfalyzer/output/tables/font_summary_table.py,sha256=TyCwcvqn99LXTWnmtk6MBPdc_33NSxIImNv-C2aqrYU,2102
|
|
32
33
|
pdfalyzer/output/tables/pdf_node_rich_table.py,sha256=7G-FLb_EUP50kZmYCTbo8Q6taU4xKp2QIGNOnQtYbNg,5908
|
|
@@ -34,9 +35,11 @@ pdfalyzer/output/tables/stream_objects_table.py,sha256=PgQj8oTtW5_X8SMQb3FvCWDS-
|
|
|
34
35
|
pdfalyzer/pdf_object_relationship.py,sha256=tOJTp73m82oNZJB7NxvTLv167kqthH8PRbVnev_4uEk,5291
|
|
35
36
|
pdfalyzer/pdfalyzer.py,sha256=T5U8MZRlL0Kn-GJVqfIIoL7eBUdQeteu50VhQ-IoYh0,11214
|
|
36
37
|
pdfalyzer/util/adobe_strings.py,sha256=eF4K1RhhR_qgMBT58MzCxWkqQj17OWLCNmG3SjZ9BUs,5045
|
|
37
|
-
pdfalyzer/util/argument_parser.py,sha256=
|
|
38
|
+
pdfalyzer/util/argument_parser.py,sha256=OdvGCowGnVNyulqC5968myCxY4gRu6--WmCIdkiXoWA,9732
|
|
39
|
+
pdfalyzer/util/cli_tools_argument_parser.py,sha256=EE-lk1ZMv3JlZlZ9N3rAndIlYl1__C0iYG0Ti6MEHjM,6107
|
|
38
40
|
pdfalyzer/util/debugging.py,sha256=hjYGxptJmal9TTaAUkkoo0oNu2tdx6ZYSyC0WjvzHh0,156
|
|
39
41
|
pdfalyzer/util/exceptions.py,sha256=XLFFTdx1n6i_VCmvuzvIOCa-djJvGEitfo9lhy3zq0k,98
|
|
42
|
+
pdfalyzer/util/page_range.py,sha256=NMNh3_TojxTxBIpvUYK1AmvID_m8qOP6AihZrLWZF2I,1652
|
|
40
43
|
pdfalyzer/util/pdf_parser_manager.py,sha256=FVRYAYsCd0y5MAm--qvXnwCZnDtB3x85FdJtb-gpyw4,3109
|
|
41
44
|
pdfalyzer/yara_rules/PDF.yara,sha256=70JzPq5F6AS8F46Seu6u0j5GS1JHxkS42r7g7PVSpRg,81489
|
|
42
45
|
pdfalyzer/yara_rules/PDF_binary_stream.yara,sha256=Qt0Wd7RFXYiHaT9YxTCrhC68ccmFcEG1XMNC3p5IwcI,821
|
|
@@ -44,8 +47,8 @@ pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
|
|
|
44
47
|
pdfalyzer/yara_rules/didier_stevens.yara,sha256=4XhqafU09xzYUP7LCygHHBXOpAXUblJf6Tkn37MUy0w,7253
|
|
45
48
|
pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
|
|
46
49
|
pdfalyzer/yara_rules/pdf_malware.yara,sha256=jDqSTP5BQSi2I_1xZiFZdy68I4oVWDat2j08-qdfbto,91063
|
|
47
|
-
pdfalyzer-1.
|
|
48
|
-
pdfalyzer-1.
|
|
49
|
-
pdfalyzer-1.
|
|
50
|
-
pdfalyzer-1.
|
|
51
|
-
pdfalyzer-1.
|
|
50
|
+
pdfalyzer-1.17.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
51
|
+
pdfalyzer-1.17.1.dist-info/METADATA,sha256=nla_K-pZ8XoknqbcCqi90EPydVJ7STe6DDBfOOf_Dso,27309
|
|
52
|
+
pdfalyzer-1.17.1.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
53
|
+
pdfalyzer-1.17.1.dist-info/entry_points.txt,sha256=goHVADdqEFcniu4O0k7kabc2rLf3wvRrENJK6c9IkUw,249
|
|
54
|
+
pdfalyzer-1.17.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|