pdfalyzer 1.17.0__py3-none-any.whl → 1.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pdfalyzer might be problematic. Click here for more details.

CHANGELOG.md CHANGED
@@ -1,5 +1,8 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ ### 1.17.1
4
+ * Fix issue where `combine_pdfs` page ranges were indexed from 0 instead of 1
5
+
3
6
  # 1.17.0
4
7
  * Add `extract_pdf_pages` command line tool (imported from `clown_sort`)
5
8
  * Add `extract_text_from_pdfs` command line tool (imported from `clown_sort`)
pdfalyzer/__init__.py CHANGED
@@ -31,7 +31,8 @@ from pdfalyzer.helpers.rich_text_helper import print_highlighted
31
31
  from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
32
32
  from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
33
33
  from pdfalyzer.pdfalyzer import Pdfalyzer
34
- from pdfalyzer.util.argument_parser import (MAX_QUALITY, ask_to_proceed, output_sections, parse_arguments,
34
+ from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments
35
+ from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, ask_to_proceed,
35
36
  parse_combine_pdfs_args, parse_pdf_page_extraction_args, parse_text_extraction_args)
36
37
  from pdfalyzer.util.pdf_parser_manager import PdfParserManager
37
38
 
@@ -135,15 +136,13 @@ def combine_pdfs():
135
136
 
136
137
 
137
138
  def extract_pdf_pages() -> None:
139
+ """Extract a range of pages from a PDF to a new PDF."""
138
140
  args = parse_pdf_page_extraction_args()
139
141
  PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)
140
142
 
141
143
 
142
144
  def extract_text_from_pdfs() -> None:
143
- """
144
- Extract text from a single file or from all files in a given directory. Can accept
145
- multiple paths as arguments on the command line.
146
- """
145
+ """Extract text from a list of files or from all PDF files in a list of directories."""
147
146
  args: Namespace = parse_text_extraction_args()
148
147
  console.line()
149
148
 
@@ -34,6 +34,7 @@ class PdfFile:
34
34
  basename (str): The base name of the PDF file (with extension).
35
35
  basename_without_ext (str): The base name of the PDF file (without extension).
36
36
  extname (str): The file extension of the PDF file.
37
+ file_size (int): The size of the file in bytes.
37
38
  """
38
39
 
39
40
  def __init__(self, file_path: Union[str, Path]) -> None:
@@ -44,7 +45,7 @@ class PdfFile:
44
45
  self.file_path: Path = Path(file_path)
45
46
 
46
47
  if not self.file_path.exists():
47
- raise FileNotFoundError(f"File '{file_path}' does not exist.")
48
+ raise FileNotFoundError(f"'{file_path}' is not a valid file or directory.")
48
49
 
49
50
  self.dirname = self.file_path.parent
50
51
  self.basename: str = path.basename(file_path)
@@ -71,7 +72,7 @@ class PdfFile:
71
72
  Returns:
72
73
  Path: The path to the newly created PDF file containing the extracted pages.
73
74
  """
74
- destination_dir = destination_dir or self.dirname
75
+ destination_dir = Path(destination_dir or self.dirname)
75
76
  create_dir_if_it_does_not_exist(destination_dir)
76
77
 
77
78
  if extra_file_suffix is None:
@@ -2,7 +2,6 @@
2
2
  Functions to help with the pre-configured YARA rules in the /yara directory.
3
3
  """
4
4
  from importlib.resources import as_file, files
5
- from sys import exit
6
5
  from typing import Optional, Union
7
6
 
8
7
  from yaralyzer.config import YaralyzerConfig
@@ -3,7 +3,7 @@ Various text formatting/styling/manipulating methods.
3
3
  """
4
4
  import re
5
5
  from pprint import PrettyPrinter
6
- from typing import List, Pattern, Union
6
+ from typing import List, Optional, Pattern, Union
7
7
 
8
8
  from yaralyzer.output.rich_console import console_width
9
9
 
@@ -18,16 +18,14 @@ pp = PrettyPrinter(
18
18
  sort_dicts=True)
19
19
 
20
20
 
21
- def generate_hyphen_line(width=None, title=None):
22
- """e.g. '-----------------BEGIN-----------------'"""
23
- width = width or console_width()
21
+ def all_strings_are_same_ignoring_numbers(strings: List[str]) -> bool:
22
+ """Returns true if string addresses are same except for digits."""
23
+ return len(set([replace_digits(s) for s in strings])) == 1
24
24
 
25
- if title is None:
26
- return '-' * width
27
25
 
28
- side_hyphens = int((width - len(title)) / 2) * '-'
29
- line = side_hyphens + title + side_hyphens
30
- return line if len(line) == width else line + '-'
26
+ def bracketed(index: Union[int, str]) -> str:
27
+ """Surround index with [ and ]."""
28
+ return f"[{index}]"
31
29
 
32
30
 
33
31
  def count_pattern_matches_in_text(pattern: str, text: str) -> int:
@@ -44,9 +42,20 @@ def exception_str(e: Exception) -> str:
44
42
  return f"{type(e).__name__}: {e}"
45
43
 
46
44
 
47
- def root_address(_string: str) -> str:
48
- """Strip the bracketed part off an address, e.g. '/Root[1]' => '/Root'."""
49
- return _string.split('[')[0]
45
+ def generate_hyphen_line(width: Optional[int] = None, title: Optional[str] = None):
46
+ """e.g. '-----------------BEGIN-----------------'"""
47
+ width = width or console_width()
48
+
49
+ if title is None:
50
+ return '-' * width
51
+
52
+ side_hyphens = int((width - len(title)) / 2) * '-'
53
+ line = side_hyphens + title + side_hyphens
54
+ return line if len(line) == width else line + '-'
55
+
56
+
57
+ def has_a_common_substring(strings: List[str]) -> bool:
58
+ return all([is_substring_of_longer_strings_in_list(s, strings) for s in strings])
50
59
 
51
60
 
52
61
  def is_prefixed_by_any(_string: str, prefixes: List[str]) -> bool:
@@ -54,9 +63,10 @@ def is_prefixed_by_any(_string: str, prefixes: List[str]) -> bool:
54
63
  return any([_string.startswith(prefix) for prefix in prefixes])
55
64
 
56
65
 
57
- def bracketed(index: Union[int, str]) -> str:
58
- """Surround index with [ and ]."""
59
- return f"[{index}]"
66
+ def is_substring_of_longer_strings_in_list(_string: str, strings: List[str]) -> bool:
67
+ """Return True if '_string' is a substring of all the 'strings' longer than '_string'."""
68
+ longer_strings = [s for s in strings if len(s) > len(_string)]
69
+ return all([_string in longer_string for longer_string in longer_strings])
60
70
 
61
71
 
62
72
  def replace_digits(string_with_digits: str) -> str:
@@ -64,18 +74,6 @@ def replace_digits(string_with_digits: str) -> str:
64
74
  return DIGIT_REGEX.sub('x', string_with_digits)
65
75
 
66
76
 
67
- def all_strings_are_same_ignoring_numbers(strings: List[str]) -> bool:
68
- """Returns true if string addresses are same except for digits."""
69
- return len(set([replace_digits(s) for s in strings])) == 1
70
-
71
-
72
- def is_substring_of_longer_strings_in_list(_string: str, strings: List[str]) -> bool:
73
- longer_strings = [s for s in strings if len(s) > len(_string)]
74
- return all([_string in longer_string for longer_string in longer_strings])
75
-
76
-
77
- def has_a_common_substring(strings: List[str]) -> bool:
78
- return all([
79
- is_substring_of_longer_strings_in_list(s, strings)
80
- for s in strings
81
- ])
77
+ def root_address(_string: str) -> str:
78
+ """Strip the bracketed part off an address, e.g. '/Root[1]' => '/Root'."""
79
+ return _string.split('[')[0]
@@ -1,5 +1,5 @@
1
1
  """
2
- Parse command line arguments for pdfalyzer and construct the PdfalyzerConfig object.
2
+ Parse command line arguments for `pdfalyze` and construct the `PdfalyzerConfig` object.
3
3
  """
4
4
  import sys
5
5
  from argparse import ArgumentParser, Namespace
@@ -7,23 +7,17 @@ from collections import namedtuple
7
7
  from functools import partial, update_wrapper
8
8
  from importlib.metadata import version
9
9
  from os import getcwd, path
10
- from pathlib import Path
11
10
  from typing import List, Optional
12
11
 
13
12
  from rich_argparse_plus import RichHelpFormatterPlus
14
13
  from rich.prompt import Confirm
15
14
  from rich.text import Text
16
- from yaralyzer.helpers.file_helper import files_in_dir
17
15
  from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args, source
18
16
  from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation
19
17
 
20
-
21
18
  from pdfalyzer.config import ALL_STREAMS, PDFALYZER, PdfalyzerConfig
22
19
  from pdfalyzer.detection.constants.binary_regexes import QUOTE_PATTERNS
23
- from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
24
- with_pdf_extension)
25
20
  from pdfalyzer.helpers.rich_text_helper import print_highlighted
26
- from pdfalyzer.util.page_range import PageRangeArgumentValidator
27
21
 
28
22
  # NamedTuple to keep our argument selection orderly
29
23
  OutputSection = namedtuple('OutputSection', ['argument', 'method'])
@@ -206,142 +200,6 @@ def all_sections_chosen(args):
206
200
  return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)
207
201
 
208
202
 
209
- #############################################################
210
- # Separate arg parsers for combine_pdfs and other scripts #
211
- #############################################################
212
-
213
- MAX_QUALITY = 10
214
-
215
- combine_pdfs_parser = ArgumentParser(
216
- description="Combine multiple PDFs into one.",
217
- epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" +
218
- " page numbers prior to merging.",
219
- formatter_class=RichHelpFormatterPlus)
220
-
221
- combine_pdfs_parser.add_argument('pdfs',
222
- help='two or more PDFs to combine',
223
- metavar='PDF_PATH',
224
- nargs='+')
225
-
226
- combine_pdfs_parser.add_argument('-iq', '--image-quality',
227
- help='image quality for embedded images (can compress PDF at loss of quality)',
228
- choices=range(1, MAX_QUALITY + 1),
229
- default=MAX_QUALITY,
230
- type=int)
231
-
232
- combine_pdfs_parser.add_argument('-o', '--output-file',
233
- help='path to write the combined PDFs to',
234
- required=True)
235
-
236
-
237
- def parse_combine_pdfs_args() -> Namespace:
238
- """Parse command line args for combine_pdfs script."""
239
- args = combine_pdfs_parser.parse_args()
240
- args.output_file = with_pdf_extension(args.output_file)
241
- confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
242
- args.number_of_pdfs = len(args.pdfs)
243
-
244
- if args.number_of_pdfs < 2:
245
- exit_with_error(f"Need at least 2 PDFs to merge.")
246
- elif not do_all_files_exist(args.pdfs):
247
- exit_with_error()
248
- elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
249
- exit_with_error()
250
-
251
- if all(is_pdf(pdf) for pdf in args.pdfs):
252
- if all(extract_page_number(pdf) for pdf in args.pdfs):
253
- print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
254
- args.pdfs.sort(key=lambda pdf: extract_page_number(pdf))
255
- else:
256
- print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
257
- else:
258
- print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
259
- ask_to_proceed()
260
-
261
- print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
262
- return args
263
-
264
-
265
- ###########################################
266
- # Parse args for extract_pdf_pages() #
267
- ###########################################
268
- page_range_validator = PageRangeArgumentValidator()
269
-
270
- extract_pdf_parser = ArgumentParser(
271
- formatter_class=RichHelpFormatterPlus,
272
- description="Extract pages from one PDF into a new PDF.",
273
- )
274
-
275
- extract_pdf_parser.add_argument('pdf_file', metavar='PDF_FILE', help='PDF to extract pages from')
276
-
277
- extract_pdf_parser.add_argument('--page-range', '-r',
278
- type=page_range_validator,
279
- help=page_range_validator.HELP_MSG,
280
- required=True)
281
-
282
- extract_pdf_parser.add_argument('--destination-dir', '-d',
283
- help="directory to write the new PDF to",
284
- default=Path.cwd())
285
-
286
- extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
287
-
288
-
289
- def parse_pdf_page_extraction_args() -> Namespace:
290
- args = extract_pdf_parser.parse_args()
291
-
292
- if not is_pdf(args.pdf_file):
293
- log.error(f"'{args.pdf_file}' is not a PDF.")
294
- sys.exit(-1)
295
- elif not Path(args.destination_dir).exists():
296
- log.error(f"Destination dir '{args.destination_dir}' does not exist.")
297
- sys.exit(1)
298
-
299
- return args
300
-
301
-
302
- ############################################
303
- # Parse args for extract_text_from_pdfs() #
304
- ############################################
305
- extract_text_parser = ArgumentParser(
306
- formatter_class=RichHelpFormatterPlus,
307
- description="Extract the text from one or more files or directories.",
308
- epilog="If any of the FILE_OR_DIRs is a directory all files in that directory will be extracted."
309
- )
310
-
311
- extract_text_parser.add_argument('file_or_dir', nargs='+', metavar='FILE_OR_DIR')
312
- extract_text_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
313
-
314
- extract_text_parser.add_argument('--page-range', '-r',
315
- type=page_range_validator,
316
- help=f"[PDFs only] {page_range_validator.HELP_MSG}")
317
-
318
- extract_text_parser.add_argument('--print-as-parsed', '-p',
319
- action='store_true',
320
- help='print pages as they are parsed instead of waiting until document is fully parsed')
321
-
322
-
323
- def parse_text_extraction_args() -> Namespace:
324
- args = extract_text_parser.parse_args()
325
- args.files_to_process = []
326
-
327
- for file_or_dir in args.file_or_dir:
328
- file_path = Path(file_or_dir)
329
-
330
- if not file_path.exists():
331
- log.error(f"File '{file_path}' doesn't exist!")
332
- sys.exit(-1)
333
- elif file_path.is_dir():
334
- args.files_to_process.extend(files_in_dir(file_path, 'pdf'))
335
- else:
336
- args.files_to_process.append(file_path)
337
-
338
- if args.page_range and (len(args.files_to_process) > 1 or not is_pdf(args.files_to_process[0])):
339
- log.error(f"--page-range can only be specified for a single PDF")
340
- sys.exit(-1)
341
-
342
- return args
343
-
344
-
345
203
  #############
346
204
  # Helpers #
347
205
  #############
@@ -0,0 +1,156 @@
1
+ """
2
+ Argument parsers for the command line tools other than `pdfalyze` that are included with The Pdfalyzer.
3
+
4
+ 1. combine_pdfs
5
+ 2. extract_pdf_pages, extract_text_from_pdfs
6
+ """
7
+ import sys
8
+ from argparse import ArgumentParser, Namespace
9
+ from pathlib import Path
10
+
11
+ from rich_argparse_plus import RichHelpFormatterPlus
12
+ from rich.prompt import Confirm
13
+ from rich.text import Text
14
+ from yaralyzer.helpers.file_helper import files_in_dir
15
+ from yaralyzer.util.logging import log
16
+
17
+ from pdfalyzer.util.argument_parser import ask_to_proceed, exit_with_error
18
+ from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
19
+ with_pdf_extension)
20
+ from pdfalyzer.helpers.rich_text_helper import print_highlighted
21
+ from pdfalyzer.util.page_range import PageRangeArgumentValidator
22
+
23
+ MAX_QUALITY = 10
24
+
25
+
26
+ ##################
27
+ # combine_pdfs #
28
+ ##################
29
+ combine_pdfs_parser = ArgumentParser(
30
+ description="Combine multiple PDFs into one.",
31
+ epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" +
32
+ " page numbers prior to merging.",
33
+ formatter_class=RichHelpFormatterPlus)
34
+
35
+ combine_pdfs_parser.add_argument('pdfs',
36
+ help='two or more PDFs to combine',
37
+ metavar='PDF_PATH',
38
+ nargs='+')
39
+
40
+ combine_pdfs_parser.add_argument('-iq', '--image-quality',
41
+ help='image quality for embedded images (can compress PDF at loss of quality)',
42
+ choices=range(1, MAX_QUALITY + 1),
43
+ default=MAX_QUALITY,
44
+ type=int)
45
+
46
+ combine_pdfs_parser.add_argument('-o', '--output-file',
47
+ help='path to write the combined PDFs to',
48
+ required=True)
49
+
50
+
51
+ def parse_combine_pdfs_args() -> Namespace:
52
+ """Parse command line args for combine_pdfs script."""
53
+ args = combine_pdfs_parser.parse_args()
54
+ args.output_file = with_pdf_extension(args.output_file)
55
+ confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
56
+ args.number_of_pdfs = len(args.pdfs)
57
+
58
+ if args.number_of_pdfs < 2:
59
+ exit_with_error(f"Need at least 2 PDFs to merge.")
60
+ elif not do_all_files_exist(args.pdfs):
61
+ exit_with_error()
62
+ elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
63
+ exit_with_error()
64
+
65
+ if all(is_pdf(pdf) for pdf in args.pdfs):
66
+ if all(extract_page_number(pdf) for pdf in args.pdfs):
67
+ print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
68
+ args.pdfs.sort(key=lambda pdf: extract_page_number(pdf))
69
+ else:
70
+ print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
71
+ else:
72
+ print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
73
+ ask_to_proceed()
74
+
75
+ print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
76
+ return args
77
+
78
+
79
+ #####################
80
+ # extract_pdf_pages #
81
+ #####################
82
+ page_range_validator = PageRangeArgumentValidator()
83
+
84
+ extract_pdf_parser = ArgumentParser(
85
+ formatter_class=RichHelpFormatterPlus,
86
+ description="Extract pages from one PDF into a new PDF.",
87
+ )
88
+
89
+ extract_pdf_parser.add_argument('pdf_file', metavar='PDF_FILE', help='PDF to extract pages from')
90
+
91
+ extract_pdf_parser.add_argument('--page-range', '-r',
92
+ type=page_range_validator,
93
+ help=page_range_validator.HELP_MSG,
94
+ required=True)
95
+
96
+ extract_pdf_parser.add_argument('--destination-dir', '-d',
97
+ help="directory to write the new PDF to",
98
+ default=Path.cwd())
99
+
100
+ extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
101
+
102
+
103
+ def parse_pdf_page_extraction_args() -> Namespace:
104
+ args = extract_pdf_parser.parse_args()
105
+
106
+ if not is_pdf(args.pdf_file):
107
+ log.error(f"'{args.pdf_file}' is not a PDF.")
108
+ sys.exit(-1)
109
+ elif not Path(args.destination_dir).exists():
110
+ log.error(f"Destination dir '{args.destination_dir}' does not exist.")
111
+ sys.exit(1)
112
+
113
+ return args
114
+
115
+
116
+ ############################
117
+ # extract_text_from_pdfs #
118
+ ############################
119
+ extract_text_parser = ArgumentParser(
120
+ formatter_class=RichHelpFormatterPlus,
121
+ description="Extract the text from one or more files or directories.",
122
+ epilog="If any of the FILE_OR_DIRs is a directory all PDF files in that directory will be extracted."
123
+ )
124
+
125
+ extract_text_parser.add_argument('file_or_dir', nargs='+', metavar='FILE_OR_DIR')
126
+ extract_text_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
127
+
128
+ extract_text_parser.add_argument('--page-range', '-r',
129
+ type=page_range_validator,
130
+ help=f"[PDFs only] {page_range_validator.HELP_MSG}")
131
+
132
+ extract_text_parser.add_argument('--print-as-parsed', '-p',
133
+ action='store_true',
134
+ help='print pages as they are parsed instead of waiting until document is fully parsed')
135
+
136
+
137
+ def parse_text_extraction_args() -> Namespace:
138
+ args = extract_text_parser.parse_args()
139
+ args.files_to_process = []
140
+
141
+ for file_or_dir in args.file_or_dir:
142
+ file_path = Path(file_or_dir)
143
+
144
+ if not file_path.exists():
145
+ log.error(f"'{file_path}' is not a valid file or directory.")
146
+ sys.exit(-1)
147
+ elif file_path.is_dir():
148
+ args.files_to_process.extend(files_in_dir(file_path, 'pdf'))
149
+ else:
150
+ args.files_to_process.append(file_path)
151
+
152
+ if args.page_range and (len(args.files_to_process) > 1 or not is_pdf(args.files_to_process[0])):
153
+ log.error(f"--page-range can only be specified for a single PDF")
154
+ sys.exit(-1)
155
+
156
+ return args
@@ -6,7 +6,7 @@ from argparse import ArgumentTypeError
6
6
  from dataclasses import dataclass
7
7
  from typing import Tuple
8
8
 
9
- PAGE_RANGE_REGEX = re.compile('\\d(-\\d)?')
9
+ PAGE_RANGE_REGEX = re.compile(r'[1-9](\d+)?(-\d+)?')
10
10
 
11
11
 
12
12
  @dataclass
@@ -15,7 +15,7 @@ class PageRange:
15
15
 
16
16
  def __post_init__(self):
17
17
  if not PAGE_RANGE_REGEX.match(self.page_range):
18
- raise ValueError(f"Invalid page range '{self.page_range}'")
18
+ raise ArgumentTypeError(f"Invalid page range '{self.page_range}'")
19
19
 
20
20
  if '-' in self.page_range:
21
21
  (self.first_page, self.last_page) = (int(p) for p in self.page_range.split('-'))
@@ -35,10 +35,10 @@ class PageRange:
35
35
  if self.first_page + 1 == self.last_page:
36
36
  return f"page_{self.first_page}"
37
37
  else:
38
- return f"pages_{self.first_page}-{self.last_page}"
38
+ return f"pages_{self.first_page}-{self.last_page - 1}"
39
39
 
40
40
  def to_tuple(self) -> Tuple[int, int]:
41
- return (self.first_page, self.last_page)
41
+ return (self.first_page - 1, self.last_page - 1)
42
42
 
43
43
  def __repr__(self) -> str:
44
44
  return f"PageRange({self.first_page}, {self.last_page})"
@@ -48,7 +48,4 @@ class PageRangeArgumentValidator(object):
48
48
  HELP_MSG = "a single digit ('11') or a range ('11-15') (WILL NOT extract the last page)"
49
49
 
50
50
  def __call__(self, value):
51
- if not PAGE_RANGE_REGEX.match(value):
52
- raise ArgumentTypeError("Argument has to match '{}'".format(PAGE_RANGE_REGEX.pattern))
53
-
54
51
  return PageRange(value)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pdfalyzer
3
- Version: 1.17.0
3
+ Version: 1.17.1
4
4
  Summary: PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
5
5
  Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
6
6
  License: GPL-3.0-or-later
@@ -250,22 +250,16 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
250
250
  ## Included PDF Tools
251
251
  The Pdfalyzer comes with a few command line tools:
252
252
 
253
- #### `combine_pdfs`
254
- Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
253
+ * `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
254
+ * `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
255
+ * `extract_text_from_pdfs` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_text_from_pdfs --help` for more info.
255
256
 
256
- #### `extract_pdf_pages`
257
- Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` to see the options.
258
- ![](doc/extract_pages_from_pdf_help.png)
259
-
260
- #### `extract_text_from_pdfs`
261
- Extracts text from a PDF, including applying OCR to all embedded images. Requires that you install The Pdfalyzer's optional dependencies:
257
+ Running `extract_text_from_pdfs` requires that you install The Pdfalyzer's optional dependencies:
262
258
 
263
259
  ```bash
264
260
  pipx install pdfalyzer[extract]
265
261
  ```
266
262
 
267
- Run `extract_text_from_pdfs --help` to see the options.
268
-
269
263
 
270
264
  ## 3rd Party PDF Tools
271
265
  ### Installing Didier Stevens's PDF Analysis Tools
@@ -1,20 +1,20 @@
1
1
  .pdfalyzer.example,sha256=sh_qkUBw4hfJia_Dx2wB-fsqJInhx2sSgA7WJz3MHYo,3917
2
- CHANGELOG.md,sha256=DdmNHFTwo2VoFvmWA9htyUGLWvajyXnalNxB9hLwM9I,13042
2
+ CHANGELOG.md,sha256=KtprK6EZ8FhdPWHs9E-YzGSqHxV_w0GnShvIJ6kMPss,13132
3
3
  LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
4
- pdfalyzer/__init__.py,sha256=2OMrlYT53jvue3ddhKjF6LMbG2ss377neJBVBELwp3I,6118
4
+ pdfalyzer/__init__.py,sha256=TgCkfaaWuxv3sNMHcMZjh5lAw0oPNYKqJYRXVy9hPKo,6181
5
5
  pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
6
6
  pdfalyzer/binary/binary_scanner.py,sha256=gEHHdQ3lKe1P1441fk2OLO8GxJF7FxF8Tq04ElB8ilU,10748
7
7
  pdfalyzer/config.py,sha256=RBtp3Q6IvOW922rcanB4mVceJEM0BEjNybc6_vC7efY,2122
8
8
  pdfalyzer/decorators/document_model_printer.py,sha256=2tjJItZltukmOD2wjGvl2IsBM7Gug59wMHGLpzGDbV0,2682
9
9
  pdfalyzer/decorators/indeterminate_node.py,sha256=QLJr-nGKih8gPZcIqxLU028OwWWD5VjNHYMUjniwT_k,6586
10
- pdfalyzer/decorators/pdf_file.py,sha256=_v4mIpQXlPZTLRg2Tvv_OP_an-HECXbfzoGuq-hZ5io,10199
10
+ pdfalyzer/decorators/pdf_file.py,sha256=CHXyM8RIvnjKnsDOJxUhk-sfRzLLW50MJpKKTax6Eqk,10274
11
11
  pdfalyzer/decorators/pdf_object_properties.py,sha256=Il3RObxQ4XUf0Ei-nd4tjJO0LeaxC6u7yFa3cQs_jVY,5485
12
12
  pdfalyzer/decorators/pdf_tree_node.py,sha256=4LReGJUtG8iEcLUQD1jW-yp3xPWsHrC-3Anbkt7XZ3A,11134
13
13
  pdfalyzer/decorators/pdf_tree_verifier.py,sha256=2hVe9APsAWQZ7ra8AGndHQnGWmmxmb3ZwfJHZuLvsvc,4714
14
14
  pdfalyzer/detection/constants/binary_regexes.py,sha256=s69S7uq1v4vBy3ZkKKKt3ClNuFCuQ0ztootUxzlgfFw,1632
15
15
  pdfalyzer/detection/constants/javascript_reserved_keywords.py,sha256=CXXdWskdQa0Hs5wCci2RBVvipgZg34_cLfmkWG4Xcmg,991
16
16
  pdfalyzer/detection/javascript_hunter.py,sha256=_wT2vkKTMlm_RGCjYsmwcmV-ag1qep3EpkHmUw0nWcQ,711
17
- pdfalyzer/detection/yaralyzer_helper.py,sha256=_Bkw2JTt3MeD86VOK39C06hn9lNDCc_8ZKLVMEvrwvQ,2215
17
+ pdfalyzer/detection/yaralyzer_helper.py,sha256=bfIa18f0zdUoOAJIQcwVDjF52sJNV47NdsDasg01uiQ,2194
18
18
  pdfalyzer/font_info.py,sha256=2R85iETY_1eKCeRrkqeIxfPDqXZyWfCNcHx_-aTyF0s,6682
19
19
  pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
20
20
  pdfalyzer/helpers/filesystem_helper.py,sha256=onXhSMhxo0YkvdKdosRwUo_RGdW6yNzZF5hfjgZ3GBE,5085
@@ -22,7 +22,7 @@ pdfalyzer/helpers/image_helper.py,sha256=QjoAUcKKEtpmuEyOmEfmaUN-lzNykQ1SzqgNn9-
22
22
  pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
23
23
  pdfalyzer/helpers/pdf_object_helper.py,sha256=65BlUgnDM9brxJFw_WF8QLomWHaNh-pj88NWoxcMkoQ,1160
24
24
  pdfalyzer/helpers/rich_text_helper.py,sha256=Q5Zj0I96ymQmDWHkOX4lWEvkizOMMgzYNx4CF35t_7w,3561
25
- pdfalyzer/helpers/string_helper.py,sha256=YAjZy7KY6Ys_bb_YkUiUGQryONwsOE88LLGNgyWJ62o,2405
25
+ pdfalyzer/helpers/string_helper.py,sha256=zl7VnxqkaB50Zv1yQoz-ShVcLT2_nOgmxekWTpXHyx4,2521
26
26
  pdfalyzer/output/character_mapping.py,sha256=UN66b4BjvJiokBCi2kregiQvi6u2l1BJcHYFGG_G43M,2190
27
27
  pdfalyzer/output/layout.py,sha256=U9n5RnwwBg2UXxRBAc4E2gQ9t3dNsmiu62klz-Ig1Zg,2767
28
28
  pdfalyzer/output/pdfalyzer_presenter.py,sha256=TUsMc2GTUDjFzIGk7Ep5ZASfXcKX_WNtZzZKbQTHcfY,8580
@@ -35,10 +35,11 @@ pdfalyzer/output/tables/stream_objects_table.py,sha256=PgQj8oTtW5_X8SMQb3FvCWDS-
35
35
  pdfalyzer/pdf_object_relationship.py,sha256=tOJTp73m82oNZJB7NxvTLv167kqthH8PRbVnev_4uEk,5291
36
36
  pdfalyzer/pdfalyzer.py,sha256=T5U8MZRlL0Kn-GJVqfIIoL7eBUdQeteu50VhQ-IoYh0,11214
37
37
  pdfalyzer/util/adobe_strings.py,sha256=eF4K1RhhR_qgMBT58MzCxWkqQj17OWLCNmG3SjZ9BUs,5045
38
- pdfalyzer/util/argument_parser.py,sha256=2aYoW0ZILRSQkEOCaDwrZYmge5QI5tORhNm03rA0my8,15574
38
+ pdfalyzer/util/argument_parser.py,sha256=OdvGCowGnVNyulqC5968myCxY4gRu6--WmCIdkiXoWA,9732
39
+ pdfalyzer/util/cli_tools_argument_parser.py,sha256=EE-lk1ZMv3JlZlZ9N3rAndIlYl1__C0iYG0Ti6MEHjM,6107
39
40
  pdfalyzer/util/debugging.py,sha256=hjYGxptJmal9TTaAUkkoo0oNu2tdx6ZYSyC0WjvzHh0,156
40
41
  pdfalyzer/util/exceptions.py,sha256=XLFFTdx1n6i_VCmvuzvIOCa-djJvGEitfo9lhy3zq0k,98
41
- pdfalyzer/util/page_range.py,sha256=zsHPw9p4QGlx5YEdssntY8HLEZIvBoQrS8Y8V87t5sA,1770
42
+ pdfalyzer/util/page_range.py,sha256=NMNh3_TojxTxBIpvUYK1AmvID_m8qOP6AihZrLWZF2I,1652
42
43
  pdfalyzer/util/pdf_parser_manager.py,sha256=FVRYAYsCd0y5MAm--qvXnwCZnDtB3x85FdJtb-gpyw4,3109
43
44
  pdfalyzer/yara_rules/PDF.yara,sha256=70JzPq5F6AS8F46Seu6u0j5GS1JHxkS42r7g7PVSpRg,81489
44
45
  pdfalyzer/yara_rules/PDF_binary_stream.yara,sha256=Qt0Wd7RFXYiHaT9YxTCrhC68ccmFcEG1XMNC3p5IwcI,821
@@ -46,8 +47,8 @@ pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
46
47
  pdfalyzer/yara_rules/didier_stevens.yara,sha256=4XhqafU09xzYUP7LCygHHBXOpAXUblJf6Tkn37MUy0w,7253
47
48
  pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
48
49
  pdfalyzer/yara_rules/pdf_malware.yara,sha256=jDqSTP5BQSi2I_1xZiFZdy68I4oVWDat2j08-qdfbto,91063
49
- pdfalyzer-1.17.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
50
- pdfalyzer-1.17.0.dist-info/METADATA,sha256=MLXdtDxLIbFC4V2RlW9VKhHb7MEWgcF_3_o4cdlN-94,27337
51
- pdfalyzer-1.17.0.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
52
- pdfalyzer-1.17.0.dist-info/entry_points.txt,sha256=goHVADdqEFcniu4O0k7kabc2rLf3wvRrENJK6c9IkUw,249
53
- pdfalyzer-1.17.0.dist-info/RECORD,,
50
+ pdfalyzer-1.17.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
51
+ pdfalyzer-1.17.1.dist-info/METADATA,sha256=nla_K-pZ8XoknqbcCqi90EPydVJ7STe6DDBfOOf_Dso,27309
52
+ pdfalyzer-1.17.1.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
53
+ pdfalyzer-1.17.1.dist-info/entry_points.txt,sha256=goHVADdqEFcniu4O0k7kabc2rLf3wvRrENJK6c9IkUw,249
54
+ pdfalyzer-1.17.1.dist-info/RECORD,,