PyPI - pdfalyzer - Versions diffs - 1.17.0__py3-none-any.whl → 1.17.7__py3-none-any.whl - Mend

pdfalyzer 1.17.0py3-none-any.whl → 1.17.7py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

CHANGELOG.md +23 -0
pdfalyzer/__init__.py +10 -9
pdfalyzer/decorators/pdf_file.py +28 -20
pdfalyzer/detection/yaralyzer_helper.py +0 -1
pdfalyzer/helpers/filesystem_helper.py +0 -1
pdfalyzer/helpers/image_helper.py +3 -3
pdfalyzer/helpers/string_helper.py +28 -30
pdfalyzer/pdfalyzer.py +37 -11
pdfalyzer/util/argument_parser.py +2 -144
pdfalyzer/util/cli_tools_argument_parser.py +164 -0
pdfalyzer/util/page_range.py +4 -7
{pdfalyzer-1.17.0.dist-info → pdfalyzer-1.17.7.dist-info}/METADATA +23 -33
{pdfalyzer-1.17.0.dist-info → pdfalyzer-1.17.7.dist-info}/RECORD +16 -15
{pdfalyzer-1.17.0.dist-info → pdfalyzer-1.17.7.dist-info}/entry_points.txt +1 -1
{pdfalyzer-1.17.0.dist-info → pdfalyzer-1.17.7.dist-info}/LICENSE +0 -0
{pdfalyzer-1.17.0.dist-info → pdfalyzer-1.17.7.dist-info}/WHEEL +0 -0

CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,28 @@
 # NEXT RELEASE
+### 1.17.7
+* Bump `pypdf` to 6.1.3 (fixes [#31](https://github.com/michelcrypt4d4mus/pdfalyzer/issues/31)), `PyMuPDF` to 1.26.5
+### 1.17.6
+* Better handling for errors resulting from bugs in PyPDF
+* Properly close file handle when pdfalyzing is complete
+### 1.17.5
+* Fix `PIL` lazy import
+### 1.17.4
+* Make `PIL` a lazy import so installs without `[extract]` extras don't fail
+### 1.17.3
+* Put back `--debug` arg for CLI tools
+### 1.17.2
+* Remove unused `--debug` args for CLI tools
+* Rename `extract_text_from_pdfs` to `extract_pdf_text`
+### 1.17.1
+* Fix issue where `extract_pdf_pages` page ranges were indexed from 0 instead of 1
 # 1.17.0
 * Add `extract_pdf_pages` command line tool (imported from `clown_sort`)
 * Add `extract_text_from_pdfs` command line tool (imported from `clown_sort`)

pdfalyzer/__init__.py CHANGED Viewed

@@ -31,8 +31,9 @@ from pdfalyzer.helpers.rich_text_helper import print_highlighted
 from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
 from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
 from pdfalyzer.pdfalyzer import Pdfalyzer
-from pdfalyzer.util.argument_parser import (MAX_QUALITY, ask_to_proceed, output_sections, parse_arguments,
-     parse_combine_pdfs_args, parse_pdf_page_extraction_args, parse_text_extraction_args)
+from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments
+from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, parse_combine_pdfs_args,
+     parse_pdf_page_extraction_args, parse_text_extraction_args)
 from pdfalyzer.util.pdf_parser_manager import PdfParserManager
 # For the table shown by running pdfalyzer_show_color_theme
@@ -42,7 +43,7 @@ MAX_THEME_COL_SIZE = 35
 def pdfalyze():
     args = parse_arguments()
     pdfalyzer = Pdfalyzer(args.file_to_scan_path)
-    pdfalyzer = PdfalyzerPresenter(pdfalyzer)
+    presenter = PdfalyzerPresenter(pdfalyzer)
     output_basepath = None
     # Binary stream extraction is a special case
@@ -54,7 +55,7 @@ def pdfalyze():
     # The method that gets called is related to the argument name. See 'possible_output_sections' list in
     # argument_parser.py. Analysis exports wrap themselves around the methods that actually generate the analyses.
-    for (arg, method) in output_sections(args, pdfalyzer):
+    for (arg, method) in output_sections(args, presenter):
         if args.output_dir:
             output_basepath = PdfalyzerConfig.get_output_basepath(method)
             print(f'Exporting {arg} data to {output_basepath}...')
@@ -79,6 +80,8 @@ def pdfalyze():
     if args.interact:
         code.interact(local=locals())
+    pdfalyzer.pdf_filehandle.close()
 def pdfalyzer_show_color_theme() -> None:
     """Utility method to show pdfalyzer's color theme. Invocable with 'pdfalyzer_show_color_theme'."""
@@ -135,15 +138,13 @@ def combine_pdfs():
 def extract_pdf_pages() -> None:
+    """Extract a range of pages from a PDF to a new PDF."""
     args = parse_pdf_page_extraction_args()
     PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)
-def extract_text_from_pdfs() -> None:
-    """
-    Extract text from a single file or from all files in a given directory. Can accept
-    multiple paths as arguments on the command line.
-    """
+def extract_pdf_text() -> None:
+    """Extract text from a list of files or from all PDF files in a list of directories."""
     args: Namespace = parse_text_extraction_args()
     console.line()

pdfalyzer/decorators/pdf_file.py CHANGED Viewed

@@ -20,6 +20,8 @@ from pdfalyzer.helpers.rich_text_helper import (attention_getting_panel, error_t
 from pdfalyzer.helpers.string_helper import exception_str
 from pdfalyzer.util.page_range import PageRange
+DEPENDENCY_ERROR_MSG = "Pdfalyzer is missing an optional dependency required to extract text. " + \
+                       "Try 'pip install pdfalyzer[extract]'"
 DEFAULT_PDF_ERRORS_DIR = Path.cwd().joinpath('pdf_errors')
 MIN_PDF_SIZE_TO_LOG_PROGRESS_TO_STDERR = 1024 * 1024 * 20
@@ -30,10 +32,11 @@ class PdfFile:
     Attributes:
         file_path (Path): The path to the PDF file.
-        dirname (Path): The directory containing the PDF file.
         basename (str): The base name of the PDF file (with extension).
         basename_without_ext (str): The base name of the PDF file (without extension).
+        dirname (Path): The directory containing the PDF file.
         extname (str): The file extension of the PDF file.
+        file_size (int): The size of the file in bytes.
     """
     def __init__(self, file_path: Union[str, Path]) -> None:
@@ -44,7 +47,7 @@ class PdfFile:
         self.file_path: Path = Path(file_path)
         if not self.file_path.exists():
-            raise FileNotFoundError(f"File '{file_path}' does not exist.")
+            raise FileNotFoundError(f"'{file_path}' is not a valid file or directory.")
         self.dirname = self.file_path.parent
         self.basename: str = path.basename(file_path)
@@ -53,11 +56,11 @@ class PdfFile:
         self.file_size = self.file_path.stat().st_size
     def extract_page_range(
-            self,
-            page_range: PageRange,
-            destination_dir: Optional[Path] = None,
-            extra_file_suffix: Optional[str] = None
-        ) -> Path:
+        self,
+        page_range: PageRange,
+        destination_dir: Optional[Path] = None,
+        extra_file_suffix: Optional[str] = None
+    ) -> Path:
         """
         Extract a range of pages to a new PDF file.
@@ -71,17 +74,21 @@ class PdfFile:
         Returns:
             Path: The path to the newly created PDF file containing the extracted pages.
         """
-        destination_dir = destination_dir or self.dirname
+        destination_dir = Path(destination_dir or self.dirname)
         create_dir_if_it_does_not_exist(destination_dir)
+        pdf_reader = PdfReader(self.file_path)
+        page_count = len(pdf_reader.pages)
+        file_suffix = page_range.file_suffix()
-        if extra_file_suffix is None:
-            file_suffix = page_range.file_suffix()
-        else:
-            file_suffix = f"{page_range.file_suffix()}__{extra_file_suffix}"
+        if page_count < (page_range.last_page - 1):
+            raise ValueError(f"PDF only has {page_count} pages but you asked for pages {page_range}!")
+        if extra_file_suffix is not None:
+            file_suffix += f"__{extra_file_suffix}"
         extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
         extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
-        console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'...")
+        console.print(f"Extracting {page_range.file_suffix()} from '{self.file_path}' to '{extracted_pages_pdf_path}'")
         pdf_writer = PdfWriter()
         with open(self.file_path, 'rb') as source_pdf:
@@ -94,11 +101,11 @@ class PdfFile:
         return extracted_pages_pdf_path
     def extract_text(
-            self,
-            page_range: Optional[PageRange] = None,
-            logger: Optional[Logger] = None,
-            print_as_parsed: bool = False
-        ) -> Optional[str]:
+        self,
+        page_range: Optional[PageRange] = None,
+        logger: Optional[Logger] = None,
+        print_as_parsed: bool = False
+    ) -> Optional[str]:
         """
         Use PyPDF to extract text page by page and use Tesseract to OCR any embedded images.
@@ -162,7 +169,7 @@ class PdfFile:
                 if print_as_parsed:
                     print(f"{page_text}")
         except DependencyError:
-            log.error("Pdfalyzer is missing an optional dependency required to extract text. Try 'pip install pdfalyzer[extract]'")
+            log.error(DEPENDENCY_ERROR_MSG)
         except EmptyFileError:
             log.warning("Skipping empty file!")
         except PdfStreamError as e:
@@ -185,7 +192,8 @@ class PdfFile:
         try:
             extracted_file = self.extract_page_range(PageRange(str(page_number)), destination_dir, error_msg)
-        except Exception as e:
+        except Exception:
+            stderr_console.print_exception()
             stderr_console.print(error_text(f"Failed to extract a page for submission to PyPDF team."))
             extracted_file = None

pdfalyzer/detection/yaralyzer_helper.py CHANGED Viewed

@@ -2,7 +2,6 @@
 Functions to help with the pre-configured YARA rules in the /yara directory.
 """
 from importlib.resources import as_file, files
-from sys import exit
 from typing import Optional, Union
 from yaralyzer.config import YaralyzerConfig

pdfalyzer/helpers/filesystem_helper.py CHANGED Viewed

@@ -6,7 +6,6 @@ from pathlib import Path
 from typing import Optional, Union
 from yaralyzer.output.rich_console import console
-from yaralyzer.util.logging import log
 from pdfalyzer.helpers.rich_text_helper import print_highlighted

pdfalyzer/helpers/image_helper.py CHANGED Viewed

@@ -1,19 +1,19 @@
 from typing import Optional
-from PIL import Image
 from yaralyzer.output.rich_console import console
 from pdfalyzer.helpers.rich_text_helper import warning_text
-def ocr_text(image: Image.Image, image_name: str) -> Optional[str]:
+def ocr_text(image: "Image.Image", image_name: str) -> Optional[str]:  # noqa F821
     """Use pytesseract to OCR the text in the image and return it as a string."""
     import pytesseract
+    from PIL import Image
     text = None
     try:
         text = pytesseract.image_to_string(image)
-    except pytesseract.pytesseract.TesseractError as e:
+    except pytesseract.pytesseract.TesseractError:
         console.print_exception()
         console.print(warning_text(f"Tesseract OCR failure '{image_name}'! No OCR text extracted..."))
     except OSError as e:

pdfalyzer/helpers/string_helper.py CHANGED Viewed

@@ -3,7 +3,7 @@ Various text formatting/styling/manipulating methods.
 """
 import re
 from pprint import PrettyPrinter
-from typing import List, Pattern, Union
+from typing import List, Optional, Pattern, Union
 from yaralyzer.output.rich_console import console_width
@@ -18,16 +18,14 @@ pp = PrettyPrinter(
     sort_dicts=True)
-def generate_hyphen_line(width=None, title=None):
-    """e.g. '-----------------BEGIN-----------------'"""
-    width = width or console_width()
+def all_strings_are_same_ignoring_numbers(strings: List[str]) -> bool:
+    """Returns true if string addresses are same except for digits."""
+    return len(set([replace_digits(s) for s in strings])) == 1
-    if title is None:
-        return '-' * width
-    side_hyphens = int((width - len(title)) / 2) * '-'
-    line = side_hyphens + title + side_hyphens
-    return line if len(line) == width else line + '-'
+def bracketed(index: Union[int, str]) -> str:
+    """Surround index with [ and ]."""
+    return f"[{index}]"
 def count_pattern_matches_in_text(pattern: str, text: str) -> int:
@@ -44,9 +42,20 @@ def exception_str(e: Exception) -> str:
     return f"{type(e).__name__}: {e}"
-def root_address(_string: str) -> str:
-    """Strip the bracketed part off an address, e.g. '/Root[1]' => '/Root'."""
-    return _string.split('[')[0]
+def generate_hyphen_line(width: Optional[int] = None, title: Optional[str] = None):
+    """e.g. '-----------------BEGIN-----------------'"""
+    width = width or console_width()
+    if title is None:
+        return '-' * width
+    side_hyphens = int((width - len(title)) / 2) * '-'
+    line = side_hyphens + title + side_hyphens
+    return line if len(line) == width else line + '-'
+def has_a_common_substring(strings: List[str]) -> bool:
+    return all([is_substring_of_longer_strings_in_list(s, strings) for s in strings])
 def is_prefixed_by_any(_string: str, prefixes: List[str]) -> bool:
@@ -54,9 +63,10 @@ def is_prefixed_by_any(_string: str, prefixes: List[str]) -> bool:
     return any([_string.startswith(prefix) for prefix in prefixes])
-def bracketed(index: Union[int, str]) -> str:
-    """Surround index with [ and ]."""
-    return f"[{index}]"
+def is_substring_of_longer_strings_in_list(_string: str, strings: List[str]) -> bool:
+    """Return True if '_string' is a substring of all the 'strings' longer than '_string'."""
+    longer_strings = [s for s in strings if len(s) > len(_string)]
+    return all([_string in longer_string for longer_string in longer_strings])
 def replace_digits(string_with_digits: str) -> str:
@@ -64,18 +74,6 @@ def replace_digits(string_with_digits: str) -> str:
     return DIGIT_REGEX.sub('x', string_with_digits)
-def all_strings_are_same_ignoring_numbers(strings: List[str]) -> bool:
-    """Returns true if string addresses are same except for digits."""
-    return len(set([replace_digits(s) for s in strings])) == 1
-def is_substring_of_longer_strings_in_list(_string: str, strings: List[str]) -> bool:
-    longer_strings = [s for s in strings if len(s) > len(_string)]
-    return all([_string in longer_string for longer_string in longer_strings])
-def has_a_common_substring(strings: List[str]) -> bool:
-    return all([
-        is_substring_of_longer_strings_in_list(s, strings)
-        for s in strings
-    ])
+def root_address(_string: str) -> str:
+    """Strip the bracketed part off an address, e.g. '/Root[1]' => '/Root'."""
+    return _string.split('[')[0]

pdfalyzer/pdfalyzer.py CHANGED Viewed

@@ -7,10 +7,11 @@ from typing import Dict, Iterator, List, Optional
 from anytree import LevelOrderIter, SymlinkNode
 from anytree.search import findall, findall_by_attr
 from pypdf import PdfReader
+from pypdf.errors import PdfReadError
 from pypdf.generic import IndirectObject
 from yaralyzer.helpers.file_helper import load_binary_data
 from yaralyzer.output.file_hashes_table import compute_file_hashes
-from yaralyzer.output.rich_console import console
+from yaralyzer.output.rich_console import console, print_fatal_error_and_exit
 from yaralyzer.util.logging import log
 from pdfalyzer.decorators.document_model_printer import print_with_header
@@ -22,7 +23,8 @@ from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
 from pdfalyzer.util.adobe_strings import *
 from pdfalyzer.util.exceptions import PdfWalkError
-TRAILER_FALLBACK_ID = 10000000
+TRAILER_FALLBACK_ID = 10_000_000
+PYPDF_ERROR_MSG = "Failed to open file with PyPDF. Consider filing a PyPDF bug report: https://github.com/py-pdf/pypdf/issues"
 class Pdfalyzer:
@@ -32,6 +34,19 @@ class Pdfalyzer:
     Each of the PDF's internal objects is wrapped in a `PdfTreeNode` object. The tree is managed
     by the `anytree` library. Information about the tree as a whole is stored in this class.
     Once the PDF is parsed this class provides access to info about or from the underlying PDF tree.
+    Attributes:
+        font_infos (List[FontInfo]): Font summary objects
+        max_generation (int): Max revision number ("generation") encountered in this PDF.
+        nodes_encountered (Dict[int, PdfTreeNode]): Nodes we've traversed already.
+        pdf_basename (str): The base name of the PDF file (with extension).
+        pdf_bytes (bytes): PDF binary data.
+        pdf_bytes_info (BytesInfo): File size, hashes, and other data points about the PDF's raw bytes.
+        pdf_filehandle (BufferedReader): File handle that reads the PDF.
+        pdf_path (str): The path to the PDF file.
+        pdf_size (int): Number of nodes as extracted from the PDF's Trailer node.
+        pdf_tree (PdfTreeNode): The top node of the PDF data structure tree.
+        verifier (PdfTreeVerifier): PdfTreeVerifier that can validate the PDF has been walked successfully.
     """
     def __init__(self, pdf_path: str):
@@ -43,14 +58,21 @@ class Pdfalyzer:
         self.pdf_basename = basename(pdf_path)
         self.pdf_bytes = load_binary_data(pdf_path)
         self.pdf_bytes_info = compute_file_hashes(self.pdf_bytes)
-        pdf_file = open(pdf_path, 'rb')  # Filehandle must be left open for PyPDF to perform seeks
-        self.pdf_reader = PdfReader(pdf_file)
+        self.pdf_filehandle = open(pdf_path, 'rb')  # Filehandle must be left open for PyPDF to perform seeks
+        try:
+            self.pdf_reader = PdfReader(self.pdf_filehandle)
+        except PdfReadError:
+            self._handle_fatal_error(f'PdfReadError: "{pdf_path}" doesn\'t seem to be a valid PDF file.')
+        except Exception as e:
+            console.print_exception()
+            self._handle_fatal_error(f"{PYPDF_ERROR_MSG}\n{e}")
         # Initialize tracking variables
-        self.indeterminate_ids = set()  # See INDETERMINATE_REF_KEYS comment
-        self.nodes_encountered: Dict[int, PdfTreeNode] = {}  # Nodes we've seen already
         self.font_infos: List[FontInfo] = []  # Font summary objects
         self.max_generation = 0  # PDF revisions are "generations"; this is the max generation encountered
+        self.nodes_encountered: Dict[int, PdfTreeNode] = {}  # Nodes we've seen already
+        self._indeterminate_ids = set()  # See INDETERMINATE_REF_KEYS comment
         # Bootstrap the root of the tree with the trailer. PDFs are always read trailer first.
         # Technically the trailer has no PDF Object ID but we set it to the /Size of the PDF.
@@ -148,9 +170,9 @@ class Pdfalyzer:
                 from_node.add_child(to_node)
             # Remove this to_node from indeterminacy now that it's got a child or parent
-            if relationship.to_obj.idnum in self.indeterminate_ids:
+            if relationship.to_obj.idnum in self._indeterminate_ids:
                 log.info(f"  Found {relationship} => {to_node} was marked indeterminate but now placed")
-                self.indeterminate_ids.remove(relationship.to_obj.idnum)
+                self._indeterminate_ids.remove(relationship.to_obj.idnum)
         # If the relationship is indeterminate or we've seen the PDF object before, add it as
         # a non-tree relationship for now. An attempt to place the node will be made at the end.
@@ -159,7 +181,7 @@ class Pdfalyzer:
             # If we already encountered 'to_node' then skip adding it to the queue of nodes to walk
             if was_seen_before:
-                if relationship.to_obj.idnum not in self.indeterminate_ids and to_node.parent is None:
+                if relationship.to_obj.idnum not in self._indeterminate_ids and to_node.parent is None:
                     raise PdfWalkError(f"{relationship} - ref has no parent and is not indeterminate")
                 else:
                     log.debug(f"  Already saw {relationship}; not scanning next")
@@ -167,7 +189,7 @@ class Pdfalyzer:
             # Indeterminate relationships need to wait until everything has been scanned to be placed
             elif relationship.is_indeterminate or (relationship.is_link and not self.is_in_tree(to_node)):
                 log.info(f'  Indeterminate ref {relationship}')
-                self.indeterminate_ids.add(to_node.idnum)
+                self._indeterminate_ids.add(to_node.idnum)
             # Link nodes like /Dest are usually just links between nodes
             elif relationship.is_link:
                 log.debug(f"  Link ref {relationship}")
@@ -178,9 +200,13 @@ class Pdfalyzer:
         return to_node
+    def _handle_fatal_error(self, msg: str) -> None:
+        self.pdf_filehandle.close()
+        print_fatal_error_and_exit(msg)
     def _resolve_indeterminate_nodes(self) -> None:
         """Place all indeterminate nodes in the tree. Called after all nodes have been walked."""
-        indeterminate_nodes = [self.nodes_encountered[idnum] for idnum in self.indeterminate_ids]
+        indeterminate_nodes = [self.nodes_encountered[idnum] for idnum in self._indeterminate_ids]
         indeterminate_nodes_string = "\n   ".join([f"{node}" for node in indeterminate_nodes])
         log.info(f"Resolving {len(indeterminate_nodes)} indeterminate nodes: {indeterminate_nodes_string}")

pdfalyzer/util/argument_parser.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Parse command line arguments for pdfalyzer and construct the PdfalyzerConfig object.
+Parse command line arguments for `pdfalyze` and construct the `PdfalyzerConfig` object.
 """
 import sys
 from argparse import ArgumentParser, Namespace
@@ -7,23 +7,17 @@ from collections import namedtuple
 from functools import partial, update_wrapper
 from importlib.metadata import version
 from os import getcwd, path
-from pathlib import Path
 from typing import List, Optional
 from rich_argparse_plus import RichHelpFormatterPlus
 from rich.prompt import Confirm
 from rich.text import Text
-from yaralyzer.helpers.file_helper import files_in_dir
 from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args, source
 from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation
 from pdfalyzer.config import ALL_STREAMS, PDFALYZER, PdfalyzerConfig
 from pdfalyzer.detection.constants.binary_regexes import QUOTE_PATTERNS
-from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
-     with_pdf_extension)
 from pdfalyzer.helpers.rich_text_helper import print_highlighted
-from pdfalyzer.util.page_range import PageRangeArgumentValidator
 # NamedTuple to keep our argument selection orderly
 OutputSection = namedtuple('OutputSection', ['argument', 'method'])
@@ -202,146 +196,10 @@ def output_sections(args: Namespace, pdfalyzer: 'Pdfalyzer') -> List[OutputSecti
 def all_sections_chosen(args):
-    """Returns true if all flags are set or no flags are set."""
+    """Returns True if all flags are set or no flags are set."""
     return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)
-#############################################################
-#  Separate arg parsers for combine_pdfs and other scripts  #
-#############################################################
-MAX_QUALITY = 10
-combine_pdfs_parser = ArgumentParser(
-    description="Combine multiple PDFs into one.",
-    epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" +
-           " page numbers prior to merging.",
-    formatter_class=RichHelpFormatterPlus)
-combine_pdfs_parser.add_argument('pdfs',
-                                 help='two or more PDFs to combine',
-                                 metavar='PDF_PATH',
-                                 nargs='+')
-combine_pdfs_parser.add_argument('-iq', '--image-quality',
-                                 help='image quality for embedded images (can compress PDF at loss of quality)',
-                                 choices=range(1, MAX_QUALITY + 1),
-                                 default=MAX_QUALITY,
-                                 type=int)
-combine_pdfs_parser.add_argument('-o', '--output-file',
-                                 help='path to write the combined PDFs to',
-                                 required=True)
-def parse_combine_pdfs_args() -> Namespace:
-    """Parse command line args for combine_pdfs script."""
-    args = combine_pdfs_parser.parse_args()
-    args.output_file = with_pdf_extension(args.output_file)
-    confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
-    args.number_of_pdfs = len(args.pdfs)
-    if args.number_of_pdfs < 2:
-        exit_with_error(f"Need at least 2 PDFs to merge.")
-    elif not do_all_files_exist(args.pdfs):
-        exit_with_error()
-    elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
-        exit_with_error()
-    if all(is_pdf(pdf) for pdf in args.pdfs):
-        if all(extract_page_number(pdf) for pdf in args.pdfs):
-            print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
-            args.pdfs.sort(key=lambda pdf: extract_page_number(pdf))
-        else:
-            print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
-    else:
-        print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
-        ask_to_proceed()
-    print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
-    return args
-###########################################
-# Parse args for extract_pdf_pages() #
-###########################################
-page_range_validator = PageRangeArgumentValidator()
-extract_pdf_parser = ArgumentParser(
-    formatter_class=RichHelpFormatterPlus,
-    description="Extract pages from one PDF into a new PDF.",
-)
-extract_pdf_parser.add_argument('pdf_file', metavar='PDF_FILE', help='PDF to extract pages from')
-extract_pdf_parser.add_argument('--page-range', '-r',
-                                type=page_range_validator,
-                                help=page_range_validator.HELP_MSG,
-                                required=True)
-extract_pdf_parser.add_argument('--destination-dir', '-d',
-                                help="directory to write the new PDF to",
-                                default=Path.cwd())
-extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
-def parse_pdf_page_extraction_args() -> Namespace:
-    args = extract_pdf_parser.parse_args()
-    if not is_pdf(args.pdf_file):
-        log.error(f"'{args.pdf_file}' is not a PDF.")
-        sys.exit(-1)
-    elif not Path(args.destination_dir).exists():
-        log.error(f"Destination dir '{args.destination_dir}' does not exist.")
-        sys.exit(1)
-    return args
-############################################
-# Parse args for extract_text_from_pdfs() #
-############################################
-extract_text_parser = ArgumentParser(
-    formatter_class=RichHelpFormatterPlus,
-    description="Extract the text from one or more files or directories.",
-    epilog="If any of the FILE_OR_DIRs is a directory all files in that directory will be extracted."
-)
-extract_text_parser.add_argument('file_or_dir', nargs='+', metavar='FILE_OR_DIR')
-extract_text_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
-extract_text_parser.add_argument('--page-range', '-r',
-                                 type=page_range_validator,
-                                 help=f"[PDFs only] {page_range_validator.HELP_MSG}")
-extract_text_parser.add_argument('--print-as-parsed', '-p',
-                                 action='store_true',
-                                 help='print pages as they are parsed instead of waiting until document is fully parsed')
-def parse_text_extraction_args() -> Namespace:
-    args = extract_text_parser.parse_args()
-    args.files_to_process = []
-    for file_or_dir in args.file_or_dir:
-        file_path = Path(file_or_dir)
-        if not file_path.exists():
-            log.error(f"File '{file_path}' doesn't exist!")
-            sys.exit(-1)
-        elif file_path.is_dir():
-            args.files_to_process.extend(files_in_dir(file_path, 'pdf'))
-        else:
-            args.files_to_process.append(file_path)
-    if args.page_range and (len(args.files_to_process) > 1 or not is_pdf(args.files_to_process[0])):
-        log.error(f"--page-range can only be specified for a single PDF")
-        sys.exit(-1)
-    return args
 #############
 #  Helpers  #
 #############

pdfalyzer/util/cli_tools_argument_parser.py ADDED Viewed

@@ -0,0 +1,164 @@
+"""
+Argument parsers for the command line tools other than `pdfalyze` that are included with The Pdfalyzer.
+1. combine_pdfs
+2. extract_pdf_pages
+3. extract_pdf_text
+"""
+import logging
+import sys
+from argparse import ArgumentParser, Namespace
+from pathlib import Path
+from rich_argparse_plus import RichHelpFormatterPlus
+from rich.prompt import Confirm
+from rich.text import Text
+from yaralyzer.helpers.file_helper import files_in_dir
+from yaralyzer.util.logging import log
+from pdfalyzer.util.argument_parser import ask_to_proceed, exit_with_error
+from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
+     with_pdf_extension)
+from pdfalyzer.helpers.rich_text_helper import print_highlighted
+from pdfalyzer.util.page_range import PageRangeArgumentValidator
+MAX_QUALITY = 10
+##################
+#  combine_pdfs  #
+##################
+combine_pdfs_parser = ArgumentParser(
+    description="Combine multiple PDFs into one.",
+    epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc. sort the files as if those were" +
+           " page numbers prior to merging.",
+    formatter_class=RichHelpFormatterPlus)
+combine_pdfs_parser.add_argument('pdfs',
+                                 help='two or more PDFs to combine',
+                                 metavar='PDF_PATH',
+                                 nargs='+')
+combine_pdfs_parser.add_argument('-iq', '--image-quality',
+                                 help='image quality for embedded images (can compress PDF at loss of quality)',
+                                 choices=range(1, MAX_QUALITY + 1),
+                                 default=MAX_QUALITY,
+                                 type=int)
+combine_pdfs_parser.add_argument('-o', '--output-file',
+                                 help='path to write the combined PDFs to',
+                                 required=True)
+def parse_combine_pdfs_args() -> Namespace:
+    """Parse command line args for combine_pdfs script."""
+    args = combine_pdfs_parser.parse_args()
+    args.output_file = with_pdf_extension(args.output_file)
+    confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
+    args.number_of_pdfs = len(args.pdfs)
+    if args.number_of_pdfs < 2:
+        exit_with_error(f"Need at least 2 PDFs to merge.")
+    elif not do_all_files_exist(args.pdfs):
+        exit_with_error()
+    elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
+        exit_with_error()
+    if all(is_pdf(pdf) for pdf in args.pdfs):
+        if all(extract_page_number(pdf) for pdf in args.pdfs):
+            print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
+            args.pdfs.sort(key=lambda pdf: extract_page_number(pdf))
+        else:
+            print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
+    else:
+        print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
+        ask_to_proceed()
+    print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
+    return args
+#####################
+# extract_pdf_pages #
+#####################
+page_range_validator = PageRangeArgumentValidator()
+extract_pdf_parser = ArgumentParser(
+    formatter_class=RichHelpFormatterPlus,
+    description="Extract pages from one PDF into a new PDF.",
+)
+extract_pdf_parser.add_argument('pdf_file', metavar='PDF_FILE', help='PDF to extract pages from')
+extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
+extract_pdf_parser.add_argument('--page-range', '-r',
+                                type=page_range_validator,
+                                help=page_range_validator.HELP_MSG,
+                                required=True)
+extract_pdf_parser.add_argument('--destination-dir', '-d',
+                                help="directory to write the new PDF to",
+                                default=Path.cwd())
+def parse_pdf_page_extraction_args() -> Namespace:
+    args = extract_pdf_parser.parse_args()
+    if not is_pdf(args.pdf_file):
+        log.error(f"'{args.pdf_file}' is not a PDF.")
+        sys.exit(-1)
+    elif not Path(args.destination_dir).exists():
+        log.error(f"Destination dir '{args.destination_dir}' does not exist.")
+        sys.exit(1)
+    _set_log_level(args)
+    return args
+######################
+#  extract_pdf_text  #
+######################
+extract_text_parser = ArgumentParser(
+    formatter_class=RichHelpFormatterPlus,
+    description="Extract the text from one or more files or directories.",
+    epilog="If any of the FILE_OR_DIRs is a directory all PDF files in that directory will be extracted."
+)
+extract_text_parser.add_argument('file_or_dir', nargs='+', metavar='FILE_OR_DIR')
+extract_text_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
+extract_text_parser.add_argument('--page-range', '-r',
+                                 type=page_range_validator,
+                                 help=f"[PDFs only] {page_range_validator.HELP_MSG}")
+extract_text_parser.add_argument('--print-as-parsed', '-p',
+                                 action='store_true',
+                                 help='print pages as they are parsed instead of waiting until parsing complete')
+def parse_text_extraction_args() -> Namespace:
+    args = extract_text_parser.parse_args()
+    args.files_to_process = []
+    for file_or_dir in args.file_or_dir:
+        file_path = Path(file_or_dir)
+        if not file_path.exists():
+            log.error(f"'{file_path}' is not a valid file or directory.")
+            sys.exit(-1)
+        elif file_path.is_dir():
+            args.files_to_process.extend(files_in_dir(file_path, 'pdf'))
+        else:
+            args.files_to_process.append(file_path)
+    if args.page_range and (len(args.files_to_process) > 1 or not is_pdf(args.files_to_process[0])):
+        log.error(f"--page-range can only be specified for a single PDF")
+        sys.exit(-1)
+    _set_log_level(args)
+    return args
+def _set_log_level(args: Namespace):
+    if args.debug:
+        log.setLevel(logging.DEBUG)

pdfalyzer/util/page_range.py CHANGED Viewed

@@ -6,7 +6,7 @@ from argparse import ArgumentTypeError
 from dataclasses import dataclass
 from typing import Tuple
-PAGE_RANGE_REGEX = re.compile('\\d(-\\d)?')
+PAGE_RANGE_REGEX = re.compile(r'[1-9](\d+)?(-\d+)?')
 @dataclass
@@ -15,7 +15,7 @@ class PageRange:
     def __post_init__(self):
         if not PAGE_RANGE_REGEX.match(self.page_range):
-            raise ValueError(f"Invalid page range '{self.page_range}'")
+            raise ArgumentTypeError(f"Invalid page range '{self.page_range}'")
         if '-' in self.page_range:
             (self.first_page, self.last_page) = (int(p) for p in self.page_range.split('-'))
@@ -35,10 +35,10 @@ class PageRange:
         if self.first_page + 1 == self.last_page:
             return f"page_{self.first_page}"
         else:
-            return f"pages_{self.first_page}-{self.last_page}"
+            return f"pages_{self.first_page}-{self.last_page - 1}"
     def to_tuple(self) -> Tuple[int, int]:
-        return (self.first_page, self.last_page)
+        return (self.first_page - 1, self.last_page - 1)
     def __repr__(self) -> str:
         return f"PageRange({self.first_page}, {self.last_page})"
@@ -48,7 +48,4 @@ class PageRangeArgumentValidator(object):
     HELP_MSG = "a single digit ('11') or a range ('11-15') (WILL NOT extract the last page)"
     def __call__(self, value):
-        if not PAGE_RANGE_REGEX.match(value):
-            raise ArgumentTypeError("Argument has to match '{}'".format(PAGE_RANGE_REGEX.pattern))
         return PageRange(value)

{pdfalyzer-1.17.0.dist-info → pdfalyzer-1.17.7.dist-info}/METADATA RENAMED Viewed

@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: pdfalyzer
-Version: 1.17.0
-Summary: PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
+Version: 1.17.7
+Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
 Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
 License: GPL-3.0-or-later
 Keywords: ascii art,binary,color,cybersecurity,DFIR,encoding,font,infosec,maldoc,malicious pdf,malware,malware analysis,pdf,pdfs,pdf analysis,pypdf,threat assessment,threat hunting,threat intelligence,threat research,threatintel,visualization,yara
@@ -22,9 +22,9 @@ Classifier: Topic :: Artistic Software
 Classifier: Topic :: Scientific/Engineering :: Visualization
 Classifier: Topic :: Security
 Provides-Extra: extract
-Requires-Dist: PyMuPDF (>=1.26.4,<2.0.0) ; extra == "extract"
+Requires-Dist: PyMuPDF (>=1.26.5,<2.0.0) ; extra == "extract"
 Requires-Dist: anytree (>=2.13,<3.0)
-Requires-Dist: pypdf (>=6.0.0,<7.0.0)
+Requires-Dist: pypdf (>=6.1.3,<7.0.0)
 Requires-Dist: pytesseract (>=0.3.13,<0.4.0) ; extra == "extract"
 Requires-Dist: yaralyzer (>=1.0.9,<2.0.0)
 Project-URL: Changelog, https://github.com/michelcrypt4d4mus/pdfalyzer/blob/master/CHANGELOG.md
@@ -67,9 +67,8 @@ If you're looking for one of these things this may be the tool for you.
 ### What It Don't Do
 This tool is mostly for examining/working with a PDF's data and logical structure. As such it doesn't have much to offer as far as extracting text, rendering[^3], writing, etc. etc.
-If you suspect you are dealing with a malcious PDF you can safely run `pdfalyze` on it; embedded javascript etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
+If you suspect you are dealing with a malicious PDF you can safely run `pdfalyze` on it. Embedded javascript and `/OpenAction` nodes etc. will not be executed. If you want to actually look at the contents of a suspect PDF you can use [`dangerzone`](https://dangerzone.rocks/) to sanitize the contents with extreme prejudice before opening it.
--------------
 # Installation
 #### All Platforms
@@ -99,7 +98,6 @@ brew install pdfalyzer
    sudo apt-get install build-essential libssl-dev libffi-dev rustc
    ```
--------------
 # Usage
@@ -115,7 +113,7 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
 The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
-### Setting Command Line Options Permanently With A `.pdfalyzer` File
+#### Setting Command Line Options Permanently With A `.pdfalyzer` File
 When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` in these places in this order:
 1. the current directory
@@ -123,12 +121,9 @@ When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfa
 If it finds a `.pdfalyzer` file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
-### Environment Variables
+#### Environment Variables
 Even if you don't configure your own `.pdfalyzer` file you may still glean some insight from reading the descriptions of the various variables in [`.pdfalyzer.example`](.pdfalyzer.example); there's a little more exposition there than in the output of `pdfalyze -h`.
-### Colors And Themes
-Run `pdfalyzer_show_color_theme` to see the color theme employed.
 ### Guarantees
 Warnings will be printed if any PDF object ID between 1 and the `/Size` reported by the PDF itself could not be successfully placed in the tree. If you do not get any warnings then all[^2] of the inner PDF objects should be seen in the output.
@@ -136,7 +131,22 @@ Warnings will be printed if any PDF object ID between 1 and the `/Size` reported
 [BUFFERZONE Team](https://bufferzonesecurity.com) posted [an excellent example](https://bufferzonesecurity.com/the-beginners-guide-to-adobe-pdf-malware-reverse-engineering-part-1/) of how one might use The Pdfalyzer in tandem with [Didier Stevens' PDF tools](#installing-didier-stevenss-pdf-analysis-tools) to investigate a potentially malicious PDF (archived in [the `doc/` dir in this repo](./doc/) if the link rots).
-## Use As A Code Library
+## Included Command Line Tools
+The Pdfalyzer comes with a few command line tools for doing stuff with PDFs:
+* `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
+* `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
+* `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
+* `pdfalyzer_show_color_theme` - Run to see the color theme employed in Pdfalyzer's output.
+Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
+```bash
+pipx install pdfalyzer[extract]
+```
+## As A Python Library
 For info about setting up a dev environment see [Contributing](#contributing) below.
 At its core The Pdfalyzer is taking PDF internal objects gathered by [PyPDF](https://github.com/py-pdf/pypdf) and wrapping them in [AnyTree](https://github.com/c0fec0de/anytree)'s `NodeMixin` class.  Given that things like searching the tree or accessing internal PDF properties will be done through those packages' code it may be helpful to review their documentation.
@@ -247,26 +257,6 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
 # PDF Resources
-## Included PDF Tools
-The Pdfalyzer comes with a few command line tools:
-#### `combine_pdfs`
-Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
-#### `extract_pdf_pages`
-Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` to see the options.
-![](doc/extract_pages_from_pdf_help.png)
-#### `extract_text_from_pdfs`
-Extracts text from a PDF, including applying OCR to all embedded images. Requires that you install The Pdfalyzer's optional dependencies:
-```bash
-pipx install pdfalyzer[extract]
-```
-Run `extract_text_from_pdfs --help` to see the options.
 ## 3rd Party PDF Tools
 ### Installing Didier Stevens's PDF Analysis Tools
 Stevens's tools provide comprehensive info about the contents of a PDF, are guaranteed not to trigger the rendering of any malicious content (especially `pdfid.py`), and have been battle tested for well over a decade. It would probably be a good idea to analyze your PDF with his tools before you start working with this one.

{pdfalyzer-1.17.0.dist-info → pdfalyzer-1.17.7.dist-info}/RECORD RENAMED Viewed

@@ -1,28 +1,28 @@
 .pdfalyzer.example,sha256=sh_qkUBw4hfJia_Dx2wB-fsqJInhx2sSgA7WJz3MHYo,3917
-CHANGELOG.md,sha256=DdmNHFTwo2VoFvmWA9htyUGLWvajyXnalNxB9hLwM9I,13042
+CHANGELOG.md,sha256=LEAlcDOgi-BH86Pe66RFDGFgOfHVaZD05veJbCPyBB0,13681
 LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-pdfalyzer/__init__.py,sha256=2OMrlYT53jvue3ddhKjF6LMbG2ss377neJBVBELwp3I,6118
+pdfalyzer/__init__.py,sha256=3ylD-19PcG1bJ-rMa6ruP06QaM9Q1BitaMOA2ppugM8,6197
 pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
 pdfalyzer/binary/binary_scanner.py,sha256=gEHHdQ3lKe1P1441fk2OLO8GxJF7FxF8Tq04ElB8ilU,10748
 pdfalyzer/config.py,sha256=RBtp3Q6IvOW922rcanB4mVceJEM0BEjNybc6_vC7efY,2122
 pdfalyzer/decorators/document_model_printer.py,sha256=2tjJItZltukmOD2wjGvl2IsBM7Gug59wMHGLpzGDbV0,2682
 pdfalyzer/decorators/indeterminate_node.py,sha256=QLJr-nGKih8gPZcIqxLU028OwWWD5VjNHYMUjniwT_k,6586
-pdfalyzer/decorators/pdf_file.py,sha256=_v4mIpQXlPZTLRg2Tvv_OP_an-HECXbfzoGuq-hZ5io,10199
+pdfalyzer/decorators/pdf_file.py,sha256=ryAYzzsO8Fw5_ZMoomruW0Bal8pTb5C0VlLOTjdVqNI,10552
 pdfalyzer/decorators/pdf_object_properties.py,sha256=Il3RObxQ4XUf0Ei-nd4tjJO0LeaxC6u7yFa3cQs_jVY,5485
 pdfalyzer/decorators/pdf_tree_node.py,sha256=4LReGJUtG8iEcLUQD1jW-yp3xPWsHrC-3Anbkt7XZ3A,11134
 pdfalyzer/decorators/pdf_tree_verifier.py,sha256=2hVe9APsAWQZ7ra8AGndHQnGWmmxmb3ZwfJHZuLvsvc,4714
 pdfalyzer/detection/constants/binary_regexes.py,sha256=s69S7uq1v4vBy3ZkKKKt3ClNuFCuQ0ztootUxzlgfFw,1632
 pdfalyzer/detection/constants/javascript_reserved_keywords.py,sha256=CXXdWskdQa0Hs5wCci2RBVvipgZg34_cLfmkWG4Xcmg,991
 pdfalyzer/detection/javascript_hunter.py,sha256=_wT2vkKTMlm_RGCjYsmwcmV-ag1qep3EpkHmUw0nWcQ,711
-pdfalyzer/detection/yaralyzer_helper.py,sha256=_Bkw2JTt3MeD86VOK39C06hn9lNDCc_8ZKLVMEvrwvQ,2215
+pdfalyzer/detection/yaralyzer_helper.py,sha256=bfIa18f0zdUoOAJIQcwVDjF52sJNV47NdsDasg01uiQ,2194
 pdfalyzer/font_info.py,sha256=2R85iETY_1eKCeRrkqeIxfPDqXZyWfCNcHx_-aTyF0s,6682
 pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
-pdfalyzer/helpers/filesystem_helper.py,sha256=onXhSMhxo0YkvdKdosRwUo_RGdW6yNzZF5hfjgZ3GBE,5085
-pdfalyzer/helpers/image_helper.py,sha256=QjoAUcKKEtpmuEyOmEfmaUN-lzNykQ1SzqgNn9-R4Y0,1120
+pdfalyzer/helpers/filesystem_helper.py,sha256=QCdUcZlufBWaV38LlrQEGUGOGUtZEozMVSRkoTjwKlU,5046
+pdfalyzer/helpers/image_helper.py,sha256=mDiscZZ7yrsFa-bxFqIEz9gH3WGhz8455yhXd4_QfAY,1134
 pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
 pdfalyzer/helpers/pdf_object_helper.py,sha256=65BlUgnDM9brxJFw_WF8QLomWHaNh-pj88NWoxcMkoQ,1160
 pdfalyzer/helpers/rich_text_helper.py,sha256=Q5Zj0I96ymQmDWHkOX4lWEvkizOMMgzYNx4CF35t_7w,3561
-pdfalyzer/helpers/string_helper.py,sha256=YAjZy7KY6Ys_bb_YkUiUGQryONwsOE88LLGNgyWJ62o,2405
+pdfalyzer/helpers/string_helper.py,sha256=zl7VnxqkaB50Zv1yQoz-ShVcLT2_nOgmxekWTpXHyx4,2521
 pdfalyzer/output/character_mapping.py,sha256=UN66b4BjvJiokBCi2kregiQvi6u2l1BJcHYFGG_G43M,2190
 pdfalyzer/output/layout.py,sha256=U9n5RnwwBg2UXxRBAc4E2gQ9t3dNsmiu62klz-Ig1Zg,2767
 pdfalyzer/output/pdfalyzer_presenter.py,sha256=TUsMc2GTUDjFzIGk7Ep5ZASfXcKX_WNtZzZKbQTHcfY,8580
@@ -33,12 +33,13 @@ pdfalyzer/output/tables/font_summary_table.py,sha256=TyCwcvqn99LXTWnmtk6MBPdc_33
 pdfalyzer/output/tables/pdf_node_rich_table.py,sha256=7G-FLb_EUP50kZmYCTbo8Q6taU4xKp2QIGNOnQtYbNg,5908
 pdfalyzer/output/tables/stream_objects_table.py,sha256=PgQj8oTtW5_X8SMQb3FvCWDS-d4Zl6QiE44Qhiv7lTY,706
 pdfalyzer/pdf_object_relationship.py,sha256=tOJTp73m82oNZJB7NxvTLv167kqthH8PRbVnev_4uEk,5291
-pdfalyzer/pdfalyzer.py,sha256=T5U8MZRlL0Kn-GJVqfIIoL7eBUdQeteu50VhQ-IoYh0,11214
+pdfalyzer/pdfalyzer.py,sha256=iu4D3Y9qlKP0D_k883ji4U6LLzelQkHONlzAed0QUx4,12713
 pdfalyzer/util/adobe_strings.py,sha256=eF4K1RhhR_qgMBT58MzCxWkqQj17OWLCNmG3SjZ9BUs,5045
-pdfalyzer/util/argument_parser.py,sha256=2aYoW0ZILRSQkEOCaDwrZYmge5QI5tORhNm03rA0my8,15574
+pdfalyzer/util/argument_parser.py,sha256=9ixQMEZz00IPK8xRxuS7DMrQgXIrqhB2ve5W8XTZ1S8,9732
+pdfalyzer/util/cli_tools_argument_parser.py,sha256=HyZhztyrPtbvOswmG975M0tK5KPon37lV3fxVA0OwYo,6277
 pdfalyzer/util/debugging.py,sha256=hjYGxptJmal9TTaAUkkoo0oNu2tdx6ZYSyC0WjvzHh0,156
 pdfalyzer/util/exceptions.py,sha256=XLFFTdx1n6i_VCmvuzvIOCa-djJvGEitfo9lhy3zq0k,98
-pdfalyzer/util/page_range.py,sha256=zsHPw9p4QGlx5YEdssntY8HLEZIvBoQrS8Y8V87t5sA,1770
+pdfalyzer/util/page_range.py,sha256=NMNh3_TojxTxBIpvUYK1AmvID_m8qOP6AihZrLWZF2I,1652
 pdfalyzer/util/pdf_parser_manager.py,sha256=FVRYAYsCd0y5MAm--qvXnwCZnDtB3x85FdJtb-gpyw4,3109
 pdfalyzer/yara_rules/PDF.yara,sha256=70JzPq5F6AS8F46Seu6u0j5GS1JHxkS42r7g7PVSpRg,81489
 pdfalyzer/yara_rules/PDF_binary_stream.yara,sha256=Qt0Wd7RFXYiHaT9YxTCrhC68ccmFcEG1XMNC3p5IwcI,821
@@ -46,8 +47,8 @@ pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
 pdfalyzer/yara_rules/didier_stevens.yara,sha256=4XhqafU09xzYUP7LCygHHBXOpAXUblJf6Tkn37MUy0w,7253
 pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
 pdfalyzer/yara_rules/pdf_malware.yara,sha256=jDqSTP5BQSi2I_1xZiFZdy68I4oVWDat2j08-qdfbto,91063
-pdfalyzer-1.17.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-pdfalyzer-1.17.0.dist-info/METADATA,sha256=MLXdtDxLIbFC4V2RlW9VKhHb7MEWgcF_3_o4cdlN-94,27337
-pdfalyzer-1.17.0.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
-pdfalyzer-1.17.0.dist-info/entry_points.txt,sha256=goHVADdqEFcniu4O0k7kabc2rLf3wvRrENJK6c9IkUw,249
-pdfalyzer-1.17.0.dist-info/RECORD,,
+pdfalyzer-1.17.7.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+pdfalyzer-1.17.7.dist-info/METADATA,sha256=Cbd6Qu3SS8xGKrC__jEPG-74nnYvY0rJu9pirLiqrFQ,27328
+pdfalyzer-1.17.7.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+pdfalyzer-1.17.7.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
+pdfalyzer-1.17.7.dist-info/RECORD,,

{pdfalyzer-1.17.0.dist-info → pdfalyzer-1.17.7.dist-info}/entry_points.txt RENAMED Viewed

@@ -1,7 +1,7 @@
 [console_scripts]
 combine_pdfs=pdfalyzer:combine_pdfs
 extract_pdf_pages=pdfalyzer:extract_pdf_pages
-extract_text_from_pdfs=pdfalyzer:extract_text_from_pdfs
+extract_pdf_text=pdfalyzer:extract_pdf_text
 pdfalyze=pdfalyzer:pdfalyze
 pdfalyzer_show_color_theme=pdfalyzer:pdfalyzer_show_color_theme

{pdfalyzer-1.17.0.dist-info → pdfalyzer-1.17.7.dist-info}/LICENSE RENAMED Viewed

File without changes

{pdfalyzer-1.17.0.dist-info → pdfalyzer-1.17.7.dist-info}/WHEEL RENAMED Viewed

File without changes

pdfalyzer 1.17.0__py3-none-any.whl → 1.17.7__py3-none-any.whl

pdfalyzer 1.17.0py3-none-any.whl → 1.17.7py3-none-any.whl