PyPI - pdfalyzer - Versions diffs - 1.17.1__tar.gz → 1.17.2__tar.gz - Mend

pdfalyzer 1.17.1.tar.gz → 1.17.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pdfalyzer might be problematic. Click here for more details.

Files changed (52) hide show

{pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/CHANGELOG.md RENAMED Viewed

@@ -1,7 +1,11 @@
 # NEXT RELEASE
+### 1.17.2
+* Remove unused `--debug` args for CLI tools
+* Rename `extract_text_from_pdfs` to `extract_pdf_text`
 ### 1.17.1
-* Fix issue where `combine_pdfs` page ranges were indexed from 0 instead of 1
+* Fix issue where `extract_pdf_pages` page ranges were indexed from 0 instead of 1
 # 1.17.0
 * Add `extract_pdf_pages` command line tool (imported from `clown_sort`)

{pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/PKG-INFO RENAMED Viewed

@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: pdfalyzer
-Version: 1.17.1
-Summary: PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
+Version: 1.17.2
+Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
 Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
 License: GPL-3.0-or-later
 Keywords: ascii art,binary,color,cybersecurity,DFIR,encoding,font,infosec,maldoc,malicious pdf,malware,malware analysis,pdf,pdfs,pdf analysis,pypdf,threat assessment,threat hunting,threat intelligence,threat research,threatintel,visualization,yara
@@ -252,9 +252,9 @@ The Pdfalyzer comes with a few command line tools:
 * `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
 * `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
-* `extract_text_from_pdfs` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_text_from_pdfs --help` for more info.
+* `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
-Running `extract_text_from_pdfs` requires that you install The Pdfalyzer's optional dependencies:
+Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
 ```bash
 pipx install pdfalyzer[extract]

{pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/README.md RENAMED Viewed

@@ -218,9 +218,9 @@ The Pdfalyzer comes with a few command line tools:
 * `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
 * `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
-* `extract_text_from_pdfs` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_text_from_pdfs --help` for more info.
+* `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
-Running `extract_text_from_pdfs` requires that you install The Pdfalyzer's optional dependencies:
+Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
 ```bash
 pipx install pdfalyzer[extract]

{pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/__init__.py RENAMED Viewed

@@ -32,8 +32,8 @@ from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
 from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
 from pdfalyzer.pdfalyzer import Pdfalyzer
 from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments
-from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, ask_to_proceed,
-     parse_combine_pdfs_args, parse_pdf_page_extraction_args, parse_text_extraction_args)
+from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, parse_combine_pdfs_args,
+     parse_pdf_page_extraction_args, parse_text_extraction_args)
 from pdfalyzer.util.pdf_parser_manager import PdfParserManager
 # For the table shown by running pdfalyzer_show_color_theme
@@ -141,7 +141,7 @@ def extract_pdf_pages() -> None:
     PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)
-def extract_text_from_pdfs() -> None:
+def extract_pdf_text() -> None:
     """Extract text from a list of file or from all PDF files in a list of directories."""
     args: Namespace = parse_text_extraction_args()
     console.line()

{pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/decorators/pdf_file.py RENAMED Viewed

@@ -30,9 +30,9 @@ class PdfFile:
     Attributes:
         file_path (Path): The path to the PDF file.
-        dirname (Path): The directory containing the PDF file.
         basename (str): The base name of the PDF file (with extension).
         basename_without_ext (str): The base name of the PDF file (without extension).
+        dirname (Path): The directory containing the PDF file.
         extname (str): The file extension of the PDF file.
         file_size (int): The size of the file in bytes.
     """
@@ -74,11 +74,15 @@ class PdfFile:
         """
         destination_dir = Path(destination_dir or self.dirname)
         create_dir_if_it_does_not_exist(destination_dir)
+        pdf_reader = PdfReader(self.file_path)
+        page_count = len(pdf_reader.pages)
+        file_suffix = page_range.file_suffix()
+        if page_count < (page_range.last_page - 1):
+            raise ValueError(f"PDF only has {page_count} pages but you asked for pages {page_range}!")
-        if extra_file_suffix is None:
-            file_suffix = page_range.file_suffix()
-        else:
-            file_suffix = f"{page_range.file_suffix()}__{extra_file_suffix}"
+        if extra_file_suffix is not None:
+            file_suffix += f"__{extra_file_suffix}"
         extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
         extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
@@ -211,3 +215,10 @@ class PdfFile:
             return
         stderr_console.print(msg, style=style or "")
+    # def _num_pages(self) -> int:
+    #     pdf_reader = PdfReader(self.file_path)
+    #     page_count = len(pdf_reader.pages)
+    #         log.debug(f"PDF Page count: {page_count}")
+    #         for page_number, page in enumerate(pdf_reader.pages, start=1):

{pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/helpers/filesystem_helper.py RENAMED Viewed

@@ -6,7 +6,6 @@ from pathlib import Path
 from typing import Optional, Union
 from yaralyzer.output.rich_console import console
-from yaralyzer.util.logging import log
 from pdfalyzer.helpers.rich_text_helper import print_highlighted

{pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/util/argument_parser.py RENAMED Viewed

@@ -196,7 +196,7 @@ def output_sections(args: Namespace, pdfalyzer: 'Pdfalyzer') -> List[OutputSecti
 def all_sections_chosen(args):
-    """Returns true if all flags are set or no flags are set."""
+    """Returns True if all flags are set or no flags are set."""
     return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)

{pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/util/cli_tools_argument_parser.py RENAMED Viewed

@@ -2,7 +2,8 @@
 Argument parsers for the command line tools other than `pdfalyze` that are included with The Pdfalyzer.
 1. combine_pdfs
-2.
+2. extract_pdf_pages
+3. extract_pdf_text
 """
 import sys
 from argparse import ArgumentParser, Namespace
@@ -97,8 +98,6 @@ extract_pdf_parser.add_argument('--destination-dir', '-d',
                                 help="directory to write the new PDF to",
                                 default=Path.cwd())
-extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
 def parse_pdf_page_extraction_args() -> Namespace:
     args = extract_pdf_parser.parse_args()
@@ -113,9 +112,9 @@ def parse_pdf_page_extraction_args() -> Namespace:
     return args
-############################
-#  extract_text_from_pdfs  #
-############################
+######################
+#  extract_pdf_text  #
+######################
 extract_text_parser = ArgumentParser(
     formatter_class=RichHelpFormatterPlus,
     description="Extract the text from one or more files or directories.",
@@ -123,7 +122,6 @@ extract_text_parser = ArgumentParser(
 )
 extract_text_parser.add_argument('file_or_dir', nargs='+', metavar='FILE_OR_DIR')
-extract_text_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
 extract_text_parser.add_argument('--page-range', '-r',
                                  type=page_range_validator,

{pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "pdfalyzer"
-version = "1.17.1"
-description = "PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more."
+version = "1.17.2"
+description = "Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more."
 authors = ["Michel de Cryptadamus <michel@cryptadamus.com>"]
 license = "GPL-3.0-or-later"
 readme = "README.md"
@@ -91,7 +91,7 @@ pytest-skip-slow = "^0.0.3"
 [tool.poetry.scripts]
 combine_pdfs = 'pdfalyzer:combine_pdfs'
 extract_pdf_pages = 'pdfalyzer:extract_pdf_pages'
-extract_text_from_pdfs = 'pdfalyzer:extract_text_from_pdfs'
+extract_pdf_text = 'pdfalyzer:extract_pdf_text'
 pdfalyze = 'pdfalyzer:pdfalyze'
 pdfalyzer_show_color_theme = 'pdfalyzer:pdfalyzer_show_color_theme'