pdfalyzer 1.17.0__tar.gz → 1.17.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pdfalyzer might be problematic.
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/CHANGELOG.md +3 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/PKG-INFO +5 -11
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/README.md +4 -10
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/__init__.py +4 -5
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/decorators/pdf_file.py +3 -2
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/detection/yaralyzer_helper.py +0 -1
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/helpers/string_helper.py +28 -30
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/util/argument_parser.py +1 -143
- pdfalyzer-1.17.1/pdfalyzer/util/cli_tools_argument_parser.py +156 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/util/page_range.py +4 -7
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pyproject.toml +5 -6
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/.pdfalyzer.example +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/LICENSE +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/__main__.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/binary/binary_scanner.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/config.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/decorators/document_model_printer.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/decorators/indeterminate_node.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/decorators/pdf_object_properties.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/decorators/pdf_tree_node.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/decorators/pdf_tree_verifier.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/detection/constants/binary_regexes.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/detection/constants/javascript_reserved_keywords.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/detection/javascript_hunter.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/font_info.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/helpers/dict_helper.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/helpers/filesystem_helper.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/helpers/image_helper.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/helpers/number_helper.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/helpers/pdf_object_helper.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/helpers/rich_text_helper.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/character_mapping.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/layout.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/pdfalyzer_presenter.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/styles/node_colors.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/styles/rich_theme.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/tables/decoding_stats_table.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/tables/font_summary_table.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/tables/pdf_node_rich_table.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/output/tables/stream_objects_table.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/pdf_object_relationship.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/pdfalyzer.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/util/adobe_strings.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/util/debugging.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/util/exceptions.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/util/pdf_parser_manager.py +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/yara_rules/PDF.yara +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/yara_rules/PDF_binary_stream.yara +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/yara_rules/__init.py__ +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/yara_rules/didier_stevens.yara +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/yara_rules/lprat.static_file_analysis.yara +0 -0
- {pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/yara_rules/pdf_malware.yara +0 -0
{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/CHANGELOG.md

@@ -1,5 +1,8 @@
 # NEXT RELEASE

+### 1.17.1
+* Fix issue where `combine_pdfs` page ranges were indexed from 0 instead of 1
+
 # 1.17.0
 * Add `extract_pdf_pages` command line tool (imported from `clown_sort`)
 * Add `extract_text_from_pdfs` command line tool (imported from `clown_sort`)
{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pdfalyzer
-Version: 1.17.0
+Version: 1.17.1
 Summary: PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
 Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
 License: GPL-3.0-or-later
@@ -250,22 +250,16 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
 ## Included PDF Tools
 The Pdfalyzer comes with a few command line tools:

-
-
+* `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
+* `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
+* `extract_text_from_pdfs` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_text_from_pdfs --help` for more info.

-
-Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` to see the options.
-
-
-#### `extract_text_from_pdfs`
-Extracts text from a PDF, including applying OCR to all embedded images. Requires that you install The Pdfalyzer's optional dependencies:
+Running `extract_text_from_pdfs` requires that you install The Pdfalyzer's optional dependencies:

 ```bash
 pipx install pdfalyzer[extract]
 ```

-Run `extract_text_from_pdfs --help` to see the options.
-

 ## 3rd Party PDF Tools
 ### Installing Didier Stevens's PDF Analysis Tools
{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/README.md

@@ -216,22 +216,16 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
 ## Included PDF Tools
 The Pdfalyzer comes with a few command line tools:

-
-
+* `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
+* `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
+* `extract_text_from_pdfs` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_text_from_pdfs --help` for more info.

-
-Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` to see the options.
-
-
-#### `extract_text_from_pdfs`
-Extracts text from a PDF, including applying OCR to all embedded images. Requires that you install The Pdfalyzer's optional dependencies:
+Running `extract_text_from_pdfs` requires that you install The Pdfalyzer's optional dependencies:

 ```bash
 pipx install pdfalyzer[extract]
 ```

-Run `extract_text_from_pdfs --help` to see the options.
-

 ## 3rd Party PDF Tools
 ### Installing Didier Stevens's PDF Analysis Tools
{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/__init__.py

@@ -31,7 +31,8 @@ from pdfalyzer.helpers.rich_text_helper import print_highlighted
 from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
 from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
 from pdfalyzer.pdfalyzer import Pdfalyzer
-from pdfalyzer.util.argument_parser import
+from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments
+from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, ask_to_proceed,
     parse_combine_pdfs_args, parse_pdf_page_extraction_args, parse_text_extraction_args)
 from pdfalyzer.util.pdf_parser_manager import PdfParserManager

@@ -135,15 +136,13 @@ def combine_pdfs():


 def extract_pdf_pages() -> None:
+    """Extract a range of pages from a PDF to a new PDF."""
     args = parse_pdf_page_extraction_args()
     PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)


 def extract_text_from_pdfs() -> None:
-    """
-    Extract text from a single file or from all files in a given directory. Can accept
-    multiple paths as arguments on the command line.
-    """
+    """Extract text from a list of file or from all PDF files in a list of directories."""
     args: Namespace = parse_text_extraction_args()
     console.line()

{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/decorators/pdf_file.py

@@ -34,6 +34,7 @@ class PdfFile:
         basename (str): The base name of the PDF file (with extension).
         basename_without_ext (str): The base name of the PDF file (without extension).
         extname (str): The file extension of the PDF file.
+        file_size (int): The size of the file in bytes.
     """

     def __init__(self, file_path: Union[str, Path]) -> None:
@@ -44,7 +45,7 @@ class PdfFile:
         self.file_path: Path = Path(file_path)

         if not self.file_path.exists():
-            raise FileNotFoundError(f"
+            raise FileNotFoundError(f"'{file_path}' is not a valid file or directory.")

         self.dirname = self.file_path.parent
         self.basename: str = path.basename(file_path)
@@ -71,7 +72,7 @@ class PdfFile:
         Returns:
             Path: The path to the newly created PDF file containing the extracted pages.
         """
-        destination_dir = destination_dir or self.dirname
+        destination_dir = Path(destination_dir or self.dirname)
         create_dir_if_it_does_not_exist(destination_dir)

         if extra_file_suffix is None:
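For orientation, here is a minimal sketch of calling the patched `PdfFile.extract_page_range` as a library, mirroring how the `extract_pdf_pages` entry point invokes it in `pdfalyzer/__init__.py` above. The file name `report.pdf` and the `excerpts/` output directory are hypothetical.

```python
from pathlib import Path

from pdfalyzer.decorators.pdf_file import PdfFile
from pdfalyzer.util.page_range import PageRange

# Page ranges are 1-indexed: '10-25' selects pages 10 through 24
# (the last page of the range is not extracted, per PageRangeArgumentValidator.HELP_MSG).
page_range = PageRange('10-25')

# As of 1.17.1 destination_dir may be a str or a Path; it is coerced with Path()
# and created if it does not already exist.
new_pdf: Path = PdfFile('report.pdf').extract_page_range(page_range, destination_dir='excerpts')
print(f"Extracted pages written to {new_pdf}")
```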
{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/helpers/string_helper.py

@@ -3,7 +3,7 @@ Various text formatting/styling/manipulating methods.
 """
 import re
 from pprint import PrettyPrinter
-from typing import List, Pattern, Union
+from typing import List, Optional, Pattern, Union

 from yaralyzer.output.rich_console import console_width

@@ -18,16 +18,14 @@ pp = PrettyPrinter(
     sort_dicts=True)


-def
-    """
-
+def all_strings_are_same_ignoring_numbers(strings: List[str]) -> bool:
+    """Returns true if string addresses are same except for digits."""
+    return len(set([replace_digits(s) for s in strings])) == 1

-    if title is None:
-        return '-' * width

-
-
-    return
+def bracketed(index: Union[int, str]) -> str:
+    """Surround index with [ and ]."""
+    return f"[{index}]"


 def count_pattern_matches_in_text(pattern: str, text: str) -> int:
@@ -44,9 +42,20 @@ def exception_str(e: Exception) -> str:
     return f"{type(e).__name__}: {e}"


-def
-    """
-
+def generate_hyphen_line(width: Optional[int] = None, title: Optional[str] = None):
+    """e.g. '-----------------BEGIN-----------------'"""
+    width = width or console_width()
+
+    if title is None:
+        return '-' * width
+
+    side_hyphens = int((width - len(title)) / 2) * '-'
+    line = side_hyphens + title + side_hyphens
+    return line if len(line) == width else line + '-'
+
+
+def has_a_common_substring(strings: List[str]) -> bool:
+    return all([is_substring_of_longer_strings_in_list(s, strings) for s in strings])


 def is_prefixed_by_any(_string: str, prefixes: List[str]) -> bool:
@@ -54,9 +63,10 @@ def is_prefixed_by_any(_string: str, prefixes: List[str]) -> bool:
     return any([_string.startswith(prefix) for prefix in prefixes])


-def
-    """
-
+def is_substring_of_longer_strings_in_list(_string: str, strings: List[str]) -> bool:
+    """Return True if '_string' is a substring of all the 'strings' longer than '_string'."""
+    longer_strings = [s for s in strings if len(s) > len(_string)]
+    return all([_string in longer_string for longer_string in longer_strings])


 def replace_digits(string_with_digits: str) -> str:
@@ -64,18 +74,6 @@ def replace_digits(string_with_digits: str) -> str:
     return DIGIT_REGEX.sub('x', string_with_digits)


-def
-    """
-    return
-
-
-def is_substring_of_longer_strings_in_list(_string: str, strings: List[str]) -> bool:
-    longer_strings = [s for s in strings if len(s) > len(_string)]
-    return all([_string in longer_string for longer_string in longer_strings])
-
-
-def has_a_common_substring(strings: List[str]) -> bool:
-    return all([
-        is_substring_of_longer_strings_in_list(s, strings)
-        for s in strings
-    ])
+def root_address(_string: str) -> str:
+    """Strip the bracketed part off an address, e.g. '/Root[1]' => '/Root'."""
+    return _string.split('[')[0]
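A few illustrative calls to the reorganized helpers; the inputs are made-up examples, and the values in the comments follow directly from the function bodies shown above:

```python
from pdfalyzer.helpers.string_helper import (all_strings_are_same_ignoring_numbers, generate_hyphen_line,
                                              has_a_common_substring, root_address)

print(generate_hyphen_line(width=20, title='BEGIN'))                    # '-------BEGIN--------'
print(all_strings_are_same_ignoring_numbers(['/Page[1]', '/Page[2]']))  # True (digits ignored)
print(has_a_common_substring(['/Root', '/Root[1]', '/Root[2]']))        # True
print(root_address('/Root[1]'))                                         # '/Root'
```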
{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/util/argument_parser.py

@@ -1,5 +1,5 @@
 """
-Parse command line arguments for
+Parse command line arguments for `pdfalyze` and construct the `PdfalyzerConfig` object.
 """
 import sys
 from argparse import ArgumentParser, Namespace
@@ -7,23 +7,17 @@ from collections import namedtuple
 from functools import partial, update_wrapper
 from importlib.metadata import version
 from os import getcwd, path
-from pathlib import Path
 from typing import List, Optional

 from rich_argparse_plus import RichHelpFormatterPlus
 from rich.prompt import Confirm
 from rich.text import Text
-from yaralyzer.helpers.file_helper import files_in_dir
 from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args, source
 from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation

-
 from pdfalyzer.config import ALL_STREAMS, PDFALYZER, PdfalyzerConfig
 from pdfalyzer.detection.constants.binary_regexes import QUOTE_PATTERNS
-from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
-                                                 with_pdf_extension)
 from pdfalyzer.helpers.rich_text_helper import print_highlighted
-from pdfalyzer.util.page_range import PageRangeArgumentValidator

 # NamedTuple to keep our argument selection orderly
 OutputSection = namedtuple('OutputSection', ['argument', 'method'])
@@ -206,142 +200,6 @@ def all_sections_chosen(args):
     return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)


-#############################################################
-# Separate arg parsers for combine_pdfs and other scripts #
-#############################################################
-
-MAX_QUALITY = 10
-
-combine_pdfs_parser = ArgumentParser(
-    description="Combine multiple PDFs into one.",
-    epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" +
-           " page numbers prior to merging.",
-    formatter_class=RichHelpFormatterPlus)
-
-combine_pdfs_parser.add_argument('pdfs',
-    help='two or more PDFs to combine',
-    metavar='PDF_PATH',
-    nargs='+')
-
-combine_pdfs_parser.add_argument('-iq', '--image-quality',
-    help='image quality for embedded images (can compress PDF at loss of quality)',
-    choices=range(1, MAX_QUALITY + 1),
-    default=MAX_QUALITY,
-    type=int)
-
-combine_pdfs_parser.add_argument('-o', '--output-file',
-    help='path to write the combined PDFs to',
-    required=True)
-
-
-def parse_combine_pdfs_args() -> Namespace:
-    """Parse command line args for combine_pdfs script."""
-    args = combine_pdfs_parser.parse_args()
-    args.output_file = with_pdf_extension(args.output_file)
-    confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
-    args.number_of_pdfs = len(args.pdfs)
-
-    if args.number_of_pdfs < 2:
-        exit_with_error(f"Need at least 2 PDFs to merge.")
-    elif not do_all_files_exist(args.pdfs):
-        exit_with_error()
-    elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
-        exit_with_error()
-
-    if all(is_pdf(pdf) for pdf in args.pdfs):
-        if all(extract_page_number(pdf) for pdf in args.pdfs):
-            print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
-            args.pdfs.sort(key=lambda pdf: extract_page_number(pdf))
-        else:
-            print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
-    else:
-        print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
-        ask_to_proceed()
-
-    print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
-    return args
-
-
-###########################################
-# Parse args for extract_pdf_pages() #
-###########################################
-page_range_validator = PageRangeArgumentValidator()
-
-extract_pdf_parser = ArgumentParser(
-    formatter_class=RichHelpFormatterPlus,
-    description="Extract pages from one PDF into a new PDF.",
-)
-
-extract_pdf_parser.add_argument('pdf_file', metavar='PDF_FILE', help='PDF to extract pages from')
-
-extract_pdf_parser.add_argument('--page-range', '-r',
-    type=page_range_validator,
-    help=page_range_validator.HELP_MSG,
-    required=True)
-
-extract_pdf_parser.add_argument('--destination-dir', '-d',
-    help="directory to write the new PDF to",
-    default=Path.cwd())
-
-extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
-
-
-def parse_pdf_page_extraction_args() -> Namespace:
-    args = extract_pdf_parser.parse_args()
-
-    if not is_pdf(args.pdf_file):
-        log.error(f"'{args.pdf_file}' is not a PDF.")
-        sys.exit(-1)
-    elif not Path(args.destination_dir).exists():
-        log.error(f"Destination dir '{args.destination_dir}' does not exist.")
-        sys.exit(1)
-
-    return args
-
-
-############################################
-# Parse args for extract_text_from_pdfs() #
-############################################
-extract_text_parser = ArgumentParser(
-    formatter_class=RichHelpFormatterPlus,
-    description="Extract the text from one or more files or directories.",
-    epilog="If any of the FILE_OR_DIRs is a directory all files in that directory will be extracted."
-)
-
-extract_text_parser.add_argument('file_or_dir', nargs='+', metavar='FILE_OR_DIR')
-extract_text_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
-
-extract_text_parser.add_argument('--page-range', '-r',
-    type=page_range_validator,
-    help=f"[PDFs only] {page_range_validator.HELP_MSG}")
-
-extract_text_parser.add_argument('--print-as-parsed', '-p',
-    action='store_true',
-    help='print pages as they are parsed instead of waiting until document is fully parsed')
-
-
-def parse_text_extraction_args() -> Namespace:
-    args = extract_text_parser.parse_args()
-    args.files_to_process = []
-
-    for file_or_dir in args.file_or_dir:
-        file_path = Path(file_or_dir)
-
-        if not file_path.exists():
-            log.error(f"File '{file_path}' doesn't exist!")
-            sys.exit(-1)
-        elif file_path.is_dir():
-            args.files_to_process.extend(files_in_dir(file_path, 'pdf'))
-        else:
-            args.files_to_process.append(file_path)
-
-    if args.page_range and (len(args.files_to_process) > 1 or not is_pdf(args.files_to_process[0])):
-        log.error(f"--page-range can only be specified for a single PDF")
-        sys.exit(-1)
-
-    return args
-
-
 #############
 # Helpers #
 #############
pdfalyzer-1.17.1/pdfalyzer/util/cli_tools_argument_parser.py (new file)

@@ -0,0 +1,156 @@
+"""
+Argument parsers for the command line tools other than `pdfalyze` that are included with The Pdfalyzer.
+
+1. combine_pdfs
+2.
+"""
+import sys
+from argparse import ArgumentParser, Namespace
+from pathlib import Path
+
+from rich_argparse_plus import RichHelpFormatterPlus
+from rich.prompt import Confirm
+from rich.text import Text
+from yaralyzer.helpers.file_helper import files_in_dir
+from yaralyzer.util.logging import log
+
+from pdfalyzer.util.argument_parser import ask_to_proceed, exit_with_error
+from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
+                                                 with_pdf_extension)
+from pdfalyzer.helpers.rich_text_helper import print_highlighted
+from pdfalyzer.util.page_range import PageRangeArgumentValidator
+
+MAX_QUALITY = 10
+
+
+##################
+# combine_pdfs #
+##################
+combine_pdfs_parser = ArgumentParser(
+    description="Combine multiple PDFs into one.",
+    epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" +
+           " page numbers prior to merging.",
+    formatter_class=RichHelpFormatterPlus)
+
+combine_pdfs_parser.add_argument('pdfs',
+    help='two or more PDFs to combine',
+    metavar='PDF_PATH',
+    nargs='+')
+
+combine_pdfs_parser.add_argument('-iq', '--image-quality',
+    help='image quality for embedded images (can compress PDF at loss of quality)',
+    choices=range(1, MAX_QUALITY + 1),
+    default=MAX_QUALITY,
+    type=int)
+
+combine_pdfs_parser.add_argument('-o', '--output-file',
+    help='path to write the combined PDFs to',
+    required=True)
+
+
+def parse_combine_pdfs_args() -> Namespace:
+    """Parse command line args for combine_pdfs script."""
+    args = combine_pdfs_parser.parse_args()
+    args.output_file = with_pdf_extension(args.output_file)
+    confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
+    args.number_of_pdfs = len(args.pdfs)
+
+    if args.number_of_pdfs < 2:
+        exit_with_error(f"Need at least 2 PDFs to merge.")
+    elif not do_all_files_exist(args.pdfs):
+        exit_with_error()
+    elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
+        exit_with_error()
+
+    if all(is_pdf(pdf) for pdf in args.pdfs):
+        if all(extract_page_number(pdf) for pdf in args.pdfs):
+            print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
+            args.pdfs.sort(key=lambda pdf: extract_page_number(pdf))
+        else:
+            print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
+    else:
+        print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
+        ask_to_proceed()
+
+    print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
+    return args
+
+
+#####################
+# extract_pdf_pages #
+#####################
+page_range_validator = PageRangeArgumentValidator()
+
+extract_pdf_parser = ArgumentParser(
+    formatter_class=RichHelpFormatterPlus,
+    description="Extract pages from one PDF into a new PDF.",
+)
+
+extract_pdf_parser.add_argument('pdf_file', metavar='PDF_FILE', help='PDF to extract pages from')
+
+extract_pdf_parser.add_argument('--page-range', '-r',
+    type=page_range_validator,
+    help=page_range_validator.HELP_MSG,
+    required=True)
+
+extract_pdf_parser.add_argument('--destination-dir', '-d',
+    help="directory to write the new PDF to",
+    default=Path.cwd())
+
+extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
+
+
+def parse_pdf_page_extraction_args() -> Namespace:
+    args = extract_pdf_parser.parse_args()
+
+    if not is_pdf(args.pdf_file):
+        log.error(f"'{args.pdf_file}' is not a PDF.")
+        sys.exit(-1)
+    elif not Path(args.destination_dir).exists():
+        log.error(f"Destination dir '{args.destination_dir}' does not exist.")
+        sys.exit(1)
+
+    return args
+
+
+############################
+# extract_text_from_pdfs #
+############################
+extract_text_parser = ArgumentParser(
+    formatter_class=RichHelpFormatterPlus,
+    description="Extract the text from one or more files or directories.",
+    epilog="If any of the FILE_OR_DIRs is a directory all PDF files in that directory will be extracted."
+)
+
+extract_text_parser.add_argument('file_or_dir', nargs='+', metavar='FILE_OR_DIR')
+extract_text_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
+
+extract_text_parser.add_argument('--page-range', '-r',
+    type=page_range_validator,
+    help=f"[PDFs only] {page_range_validator.HELP_MSG}")
+
+extract_text_parser.add_argument('--print-as-parsed', '-p',
+    action='store_true',
+    help='print pages as they are parsed instead of waiting until document is fully parsed')
+
+
+def parse_text_extraction_args() -> Namespace:
+    args = extract_text_parser.parse_args()
+    args.files_to_process = []
+
+    for file_or_dir in args.file_or_dir:
+        file_path = Path(file_or_dir)
+
+        if not file_path.exists():
+            log.error(f"'{file_path}' is not a valid file or directory.")
+            sys.exit(-1)
+        elif file_path.is_dir():
+            args.files_to_process.extend(files_in_dir(file_path, 'pdf'))
+        else:
+            args.files_to_process.append(file_path)
+
+    if args.page_range and (len(args.files_to_process) > 1 or not is_pdf(args.files_to_process[0])):
+        log.error(f"--page-range can only be specified for a single PDF")
+        sys.exit(-1)
+
+    return args
{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pdfalyzer/util/page_range.py

@@ -6,7 +6,7 @@ from argparse import ArgumentTypeError
 from dataclasses import dataclass
 from typing import Tuple

-PAGE_RANGE_REGEX = re.compile('
+PAGE_RANGE_REGEX = re.compile(r'[1-9](\d+)?(-\d+)?')


 @dataclass
@@ -15,7 +15,7 @@ class PageRange:

     def __post_init__(self):
         if not PAGE_RANGE_REGEX.match(self.page_range):
-            raise
+            raise ArgumentTypeError(f"Invalid page range '{self.page_range}'")

         if '-' in self.page_range:
             (self.first_page, self.last_page) = (int(p) for p in self.page_range.split('-'))
@@ -35,10 +35,10 @@ class PageRange:
         if self.first_page + 1 == self.last_page:
             return f"page_{self.first_page}"
         else:
-            return f"pages_{self.first_page}-{self.last_page}"
+            return f"pages_{self.first_page}-{self.last_page - 1}"

     def to_tuple(self) -> Tuple[int, int]:
-        return (self.first_page, self.last_page)
+        return (self.first_page - 1, self.last_page - 1)

     def __repr__(self) -> str:
         return f"PageRange({self.first_page}, {self.last_page})"
@@ -48,7 +48,4 @@ class PageRangeArgumentValidator(object):
     HELP_MSG = "a single digit ('11') or a range ('11-15') (WILL NOT extract the last page)"

     def __call__(self, value):
-        if not PAGE_RANGE_REGEX.match(value):
-            raise ArgumentTypeError("Argument has to match '{}'".format(PAGE_RANGE_REGEX.pattern))
-
         return PageRange(value)
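The practical effect of the indexing fix, shown with a made-up range (the values in the comments follow directly from the `__post_init__` and `to_tuple` code above):

```python
from pdfalyzer.util.page_range import PageRange

page_range = PageRange('10-25')
print(page_range.first_page, page_range.last_page)  # 10 25 (as typed on the command line, 1-indexed)

# In 1.17.0 to_tuple() returned (10, 25); as of 1.17.1 it returns the 0-indexed
# equivalent, so pages 10 through 24 of the document are selected.
print(page_range.to_tuple())  # (9, 24)
```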
{pdfalyzer-1.17.0 → pdfalyzer-1.17.1}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdfalyzer"
-version = "1.17.0"
+version = "1.17.1"
 description = "PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more."
 authors = ["Michel de Cryptadamus <michel@cryptadamus.com>"]
 license = "GPL-3.0-or-later"
@@ -74,6 +74,10 @@ yaralyzer = "^1.0.9"

 [tool.poetry.extras]
 extract = ["PyMuPDF", "pytesseract"]
+# Poetry 2.x handles optional dependencies like this:
+# [project.optional-dependencies]
+# extract = ["PyMuPDF", "pytesseract"]
+

 [tool.poetry.group.dev.dependencies]
 flake8 = "^7.3.0"
@@ -114,8 +118,3 @@ requires = ["poetry-core>=1.0.0"]
 addopts = [
     "--import-mode=importlib",
 ]
-
-
-# Poetry 2.x handles optional dependencies like this:
-# [project.optional-dependencies]
-# extract = ["PyMuPDF", "pytesseract"]
All remaining files listed above are unchanged apart from the pdfalyzer-1.17.0 → pdfalyzer-1.17.1 directory rename.