pdfalyzer 1.17.1__tar.gz → 1.17.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pdfalyzer might be problematic. Click here for more details.

Files changed (52)
  1. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/CHANGELOG.md +5 -1
  2. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/PKG-INFO +4 -4
  3. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/README.md +2 -2
  4. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/__init__.py +3 -3
  5. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/decorators/pdf_file.py +16 -5
  6. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/helpers/filesystem_helper.py +0 -1
  7. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/util/argument_parser.py +1 -1
  8. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/util/cli_tools_argument_parser.py +5 -7
  9. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pyproject.toml +3 -3
  10. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/.pdfalyzer.example +0 -0
  11. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/LICENSE +0 -0
  12. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/__main__.py +0 -0
  13. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/binary/binary_scanner.py +0 -0
  14. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/config.py +0 -0
  15. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/decorators/document_model_printer.py +0 -0
  16. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/decorators/indeterminate_node.py +0 -0
  17. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/decorators/pdf_object_properties.py +0 -0
  18. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/decorators/pdf_tree_node.py +0 -0
  19. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/decorators/pdf_tree_verifier.py +0 -0
  20. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/detection/constants/binary_regexes.py +0 -0
  21. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/detection/constants/javascript_reserved_keywords.py +0 -0
  22. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/detection/javascript_hunter.py +0 -0
  23. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/detection/yaralyzer_helper.py +0 -0
  24. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/font_info.py +0 -0
  25. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/helpers/dict_helper.py +0 -0
  26. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/helpers/image_helper.py +0 -0
  27. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/helpers/number_helper.py +0 -0
  28. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/helpers/pdf_object_helper.py +0 -0
  29. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/helpers/rich_text_helper.py +0 -0
  30. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/helpers/string_helper.py +0 -0
  31. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/output/character_mapping.py +0 -0
  32. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/output/layout.py +0 -0
  33. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/output/pdfalyzer_presenter.py +0 -0
  34. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/output/styles/node_colors.py +0 -0
  35. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/output/styles/rich_theme.py +0 -0
  36. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/output/tables/decoding_stats_table.py +0 -0
  37. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/output/tables/font_summary_table.py +0 -0
  38. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/output/tables/pdf_node_rich_table.py +0 -0
  39. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/output/tables/stream_objects_table.py +0 -0
  40. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/pdf_object_relationship.py +0 -0
  41. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/pdfalyzer.py +0 -0
  42. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/util/adobe_strings.py +0 -0
  43. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/util/debugging.py +0 -0
  44. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/util/exceptions.py +0 -0
  45. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/util/page_range.py +0 -0
  46. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/util/pdf_parser_manager.py +0 -0
  47. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/yara_rules/PDF.yara +0 -0
  48. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/yara_rules/PDF_binary_stream.yara +0 -0
  49. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/yara_rules/__init.py__ +0 -0
  50. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/yara_rules/didier_stevens.yara +0 -0
  51. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/yara_rules/lprat.static_file_analysis.yara +0 -0
  52. {pdfalyzer-1.17.1 → pdfalyzer-1.17.2}/pdfalyzer/yara_rules/pdf_malware.yara +0 -0
@@ -1,7 +1,11 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ ### 1.17.2
4
+ * Remove unused `--debug` args for CLI tools
5
+ * Rename `extract_text_from_pdfs` to `extract_pdf_text`
6
+
3
7
  ### 1.17.1
4
- * Fix issue where `combine_pdfs` page ranges were indexed from 0 instead of 1
8
+ * Fix issue where `extract_pdf_pages` page ranges were indexed from 0 instead of 1
5
9
 
6
10
  # 1.17.0
7
11
  * Add `extract_pdf_pages` command line tool (imported from `clown_sort`)
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pdfalyzer
3
- Version: 1.17.1
4
- Summary: PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
3
+ Version: 1.17.2
4
+ Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
5
5
  Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
6
6
  License: GPL-3.0-or-later
7
7
  Keywords: ascii art,binary,color,cybersecurity,DFIR,encoding,font,infosec,maldoc,malicious pdf,malware,malware analysis,pdf,pdfs,pdf analysis,pypdf,threat assessment,threat hunting,threat intelligence,threat research,threatintel,visualization,yara
@@ -252,9 +252,9 @@ The Pdfalyzer comes with a few command line tools:
252
252
 
253
253
  * `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
254
254
  * `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
255
- * `extract_text_from_pdfs` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_text_from_pdfs --help` for more info.
255
+ * `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
256
256
 
257
- Running `extract_text_from_pdfs` requires that you install The Pdfalyzer's optional dependencies:
257
+ Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
258
258
 
259
259
  ```bash
260
260
  pipx install pdfalyzer[extract]
@@ -218,9 +218,9 @@ The Pdfalyzer comes with a few command line tools:
218
218
 
219
219
  * `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
220
220
  * `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
221
- * `extract_text_from_pdfs` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_text_from_pdfs --help` for more info.
221
+ * `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
222
222
 
223
- Running `extract_text_from_pdfs` requires that you install The Pdfalyzer's optional dependencies:
223
+ Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
224
224
 
225
225
  ```bash
226
226
  pipx install pdfalyzer[extract]
@@ -32,8 +32,8 @@ from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
32
32
  from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
33
33
  from pdfalyzer.pdfalyzer import Pdfalyzer
34
34
  from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments
35
- from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, ask_to_proceed,
36
- parse_combine_pdfs_args, parse_pdf_page_extraction_args, parse_text_extraction_args)
35
+ from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, parse_combine_pdfs_args,
36
+ parse_pdf_page_extraction_args, parse_text_extraction_args)
37
37
  from pdfalyzer.util.pdf_parser_manager import PdfParserManager
38
38
 
39
39
  # For the table shown by running pdfalyzer_show_color_theme
@@ -141,7 +141,7 @@ def extract_pdf_pages() -> None:
141
141
  PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)
142
142
 
143
143
 
144
- def extract_text_from_pdfs() -> None:
144
+ def extract_pdf_text() -> None:
145
145
  """Extract text from a list of file or from all PDF files in a list of directories."""
146
146
  args: Namespace = parse_text_extraction_args()
147
147
  console.line()
@@ -30,9 +30,9 @@ class PdfFile:
30
30
 
31
31
  Attributes:
32
32
  file_path (Path): The path to the PDF file.
33
- dirname (Path): The directory containing the PDF file.
34
33
  basename (str): The base name of the PDF file (with extension).
35
34
  basename_without_ext (str): The base name of the PDF file (without extension).
35
+ dirname (Path): The directory containing the PDF file.
36
36
  extname (str): The file extension of the PDF file.
37
37
  file_size (int): The size of the file in bytes.
38
38
  """
@@ -74,11 +74,15 @@ class PdfFile:
74
74
  """
75
75
  destination_dir = Path(destination_dir or self.dirname)
76
76
  create_dir_if_it_does_not_exist(destination_dir)
77
+ pdf_reader = PdfReader(self.file_path)
78
+ page_count = len(pdf_reader.pages)
79
+ file_suffix = page_range.file_suffix()
80
+
81
+ if page_count < (page_range.last_page - 1):
82
+ raise ValueError(f"PDF only has {page_count} pages but you asked for pages {page_range}!")
77
83
 
78
- if extra_file_suffix is None:
79
- file_suffix = page_range.file_suffix()
80
- else:
81
- file_suffix = f"{page_range.file_suffix()}__{extra_file_suffix}"
84
+ if extra_file_suffix is not None:
85
+ file_suffix += f"__{extra_file_suffix}"
82
86
 
83
87
  extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
84
88
  extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
@@ -211,3 +215,10 @@ class PdfFile:
211
215
  return
212
216
 
213
217
  stderr_console.print(msg, style=style or "")
218
+
219
+ # def _num_pages(self) -> int:
220
+ # pdf_reader = PdfReader(self.file_path)
221
+ # page_count = len(pdf_reader.pages)
222
+ # log.debug(f"PDF Page count: {page_count}")
223
+
224
+ # for page_number, page in enumerate(pdf_reader.pages, start=1):
@@ -6,7 +6,6 @@ from pathlib import Path
6
6
  from typing import Optional, Union
7
7
 
8
8
  from yaralyzer.output.rich_console import console
9
- from yaralyzer.util.logging import log
10
9
 
11
10
  from pdfalyzer.helpers.rich_text_helper import print_highlighted
12
11
 
@@ -196,7 +196,7 @@ def output_sections(args: Namespace, pdfalyzer: 'Pdfalyzer') -> List[OutputSecti
196
196
 
197
197
 
198
198
  def all_sections_chosen(args):
199
- """Returns true if all flags are set or no flags are set."""
199
+ """Returns True if all flags are set or no flags are set."""
200
200
  return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)
201
201
 
202
202
 
@@ -2,7 +2,8 @@
2
2
  Argument parsers for the command line tools other than `pdfalyze` that are included with The Pdfalyzer.
3
3
 
4
4
  1. combine_pdfs
5
- 2.
5
+ 2. extract_pdf_pages
6
+ 3. extract_pdf_text
6
7
  """
7
8
  import sys
8
9
  from argparse import ArgumentParser, Namespace
@@ -97,8 +98,6 @@ extract_pdf_parser.add_argument('--destination-dir', '-d',
97
98
  help="directory to write the new PDF to",
98
99
  default=Path.cwd())
99
100
 
100
- extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
101
-
102
101
 
103
102
  def parse_pdf_page_extraction_args() -> Namespace:
104
103
  args = extract_pdf_parser.parse_args()
@@ -113,9 +112,9 @@ def parse_pdf_page_extraction_args() -> Namespace:
113
112
  return args
114
113
 
115
114
 
116
- ############################
117
- # extract_text_from_pdfs #
118
- ############################
115
+ ######################
116
+ # extract_pdf_text #
117
+ ######################
119
118
  extract_text_parser = ArgumentParser(
120
119
  formatter_class=RichHelpFormatterPlus,
121
120
  description="Extract the text from one or more files or directories.",
@@ -123,7 +122,6 @@ extract_text_parser = ArgumentParser(
123
122
  )
124
123
 
125
124
  extract_text_parser.add_argument('file_or_dir', nargs='+', metavar='FILE_OR_DIR')
126
- extract_text_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
127
125
 
128
126
  extract_text_parser.add_argument('--page-range', '-r',
129
127
  type=page_range_validator,
@@ -1,7 +1,7 @@
1
1
  [tool.poetry]
2
2
  name = "pdfalyzer"
3
- version = "1.17.1"
4
- description = "PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more."
3
+ version = "1.17.2"
4
+ description = "Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more."
5
5
  authors = ["Michel de Cryptadamus <michel@cryptadamus.com>"]
6
6
  license = "GPL-3.0-or-later"
7
7
  readme = "README.md"
@@ -91,7 +91,7 @@ pytest-skip-slow = "^0.0.3"
91
91
  [tool.poetry.scripts]
92
92
  combine_pdfs = 'pdfalyzer:combine_pdfs'
93
93
  extract_pdf_pages = 'pdfalyzer:extract_pdf_pages'
94
- extract_text_from_pdfs = 'pdfalyzer:extract_text_from_pdfs'
94
+ extract_pdf_text = 'pdfalyzer:extract_pdf_text'
95
95
  pdfalyze = 'pdfalyzer:pdfalyze'
96
96
  pdfalyzer_show_color_theme = 'pdfalyzer:pdfalyzer_show_color_theme'
97
97
 
File without changes