pdfalyzer 1.17.1__py3-none-any.whl → 1.17.3__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pdfalyzer might be problematic. Click here for more details.

CHANGELOG.md CHANGED
@@ -1,7 +1,14 @@
1
1
  # NEXT RELEASE
2
2
 
3
+ ### 1.17.3
4
+ * Put back `--debug` arg for CLI tools
5
+
6
+ ### 1.17.2
7
+ * Remove unused `--debug` args for CLI tools
8
+ * Rename `extract_text_from_pdfs` to `extract_pdf_text`
9
+
3
10
  ### 1.17.1
4
- * Fix issue where `combine_pdfs` page ranges were indexed from 0 instead of 1
11
+ * Fix issue where `extract_pdf_pages` page ranges were indexed from 0 instead of 1
5
12
 
6
13
  # 1.17.0
7
14
  * Add `extract_pdf_pages` command line tool (imported from `clown_sort`)
pdfalyzer/__init__.py CHANGED
@@ -32,8 +32,8 @@ from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
32
32
  from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
33
33
  from pdfalyzer.pdfalyzer import Pdfalyzer
34
34
  from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments
35
- from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, ask_to_proceed,
36
- parse_combine_pdfs_args, parse_pdf_page_extraction_args, parse_text_extraction_args)
35
+ from pdfalyzer.util.cli_tools_argument_parser import (MAX_QUALITY, parse_combine_pdfs_args,
36
+ parse_pdf_page_extraction_args, parse_text_extraction_args)
37
37
  from pdfalyzer.util.pdf_parser_manager import PdfParserManager
38
38
 
39
39
  # For the table shown by running pdfalyzer_show_color_theme
@@ -141,7 +141,7 @@ def extract_pdf_pages() -> None:
141
141
  PdfFile(args.pdf_file).extract_page_range(args.page_range, destination_dir=args.destination_dir)
142
142
 
143
143
 
144
- def extract_text_from_pdfs() -> None:
144
+ def extract_pdf_text() -> None:
145
145
  """Extract text from a list of file or from all PDF files in a list of directories."""
146
146
  args: Namespace = parse_text_extraction_args()
147
147
  console.line()
@@ -30,9 +30,9 @@ class PdfFile:
30
30
 
31
31
  Attributes:
32
32
  file_path (Path): The path to the PDF file.
33
- dirname (Path): The directory containing the PDF file.
34
33
  basename (str): The base name of the PDF file (with extension).
35
34
  basename_without_ext (str): The base name of the PDF file (without extension).
35
+ dirname (Path): The directory containing the PDF file.
36
36
  extname (str): The file extension of the PDF file.
37
37
  file_size (int): The size of the file in bytes.
38
38
  """
@@ -74,11 +74,15 @@ class PdfFile:
74
74
  """
75
75
  destination_dir = Path(destination_dir or self.dirname)
76
76
  create_dir_if_it_does_not_exist(destination_dir)
77
+ pdf_reader = PdfReader(self.file_path)
78
+ page_count = len(pdf_reader.pages)
79
+ file_suffix = page_range.file_suffix()
80
+
81
+ if page_count < (page_range.last_page - 1):
82
+ raise ValueError(f"PDF only has {page_count} pages but you asked for pages {page_range}!")
77
83
 
78
- if extra_file_suffix is None:
79
- file_suffix = page_range.file_suffix()
80
- else:
81
- file_suffix = f"{page_range.file_suffix()}__{extra_file_suffix}"
84
+ if extra_file_suffix is not None:
85
+ file_suffix += f"__{extra_file_suffix}"
82
86
 
83
87
  extracted_pages_pdf_basename = insert_suffix_before_extension(self.file_path, file_suffix).name
84
88
  extracted_pages_pdf_path = destination_dir.joinpath(extracted_pages_pdf_basename)
@@ -6,7 +6,6 @@ from pathlib import Path
6
6
  from typing import Optional, Union
7
7
 
8
8
  from yaralyzer.output.rich_console import console
9
- from yaralyzer.util.logging import log
10
9
 
11
10
  from pdfalyzer.helpers.rich_text_helper import print_highlighted
12
11
 
@@ -196,7 +196,7 @@ def output_sections(args: Namespace, pdfalyzer: 'Pdfalyzer') -> List[OutputSecti
196
196
 
197
197
 
198
198
  def all_sections_chosen(args):
199
- """Returns true if all flags are set or no flags are set."""
199
+ """Returns True if all flags are set or no flags are set."""
200
200
  return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)
201
201
 
202
202
 
@@ -2,8 +2,10 @@
2
2
  Argument parsers for the command line tools other than `pdfalyze` that are included with The Pdfalyzer.
3
3
 
4
4
  1. combine_pdfs
5
- 2.
5
+ 2. extract_pdf_pages
6
+ 3. extract_pdf_text
6
7
  """
8
+ import logging
7
9
  import sys
8
10
  from argparse import ArgumentParser, Namespace
9
11
  from pathlib import Path
@@ -87,6 +89,7 @@ extract_pdf_parser = ArgumentParser(
87
89
  )
88
90
 
89
91
  extract_pdf_parser.add_argument('pdf_file', metavar='PDF_FILE', help='PDF to extract pages from')
92
+ extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
90
93
 
91
94
  extract_pdf_parser.add_argument('--page-range', '-r',
92
95
  type=page_range_validator,
@@ -97,8 +100,6 @@ extract_pdf_parser.add_argument('--destination-dir', '-d',
97
100
  help="directory to write the new PDF to",
98
101
  default=Path.cwd())
99
102
 
100
- extract_pdf_parser.add_argument('--debug', action='store_true', help='turn on debug level logging')
101
-
102
103
 
103
104
  def parse_pdf_page_extraction_args() -> Namespace:
104
105
  args = extract_pdf_parser.parse_args()
@@ -110,12 +111,13 @@ def parse_pdf_page_extraction_args() -> Namespace:
110
111
  log.error(f"Destination dir '{args.destination_dir}' does not exist.")
111
112
  sys.exit(1)
112
113
 
114
+ _set_log_level(args)
113
115
  return args
114
116
 
115
117
 
116
- ############################
117
- # extract_text_from_pdfs #
118
- ############################
118
+ ######################
119
+ # extract_pdf_text #
120
+ ######################
119
121
  extract_text_parser = ArgumentParser(
120
122
  formatter_class=RichHelpFormatterPlus,
121
123
  description="Extract the text from one or more files or directories.",
@@ -153,4 +155,10 @@ def parse_text_extraction_args() -> Namespace:
153
155
  log.error(f"--page-range can only be specified for a single PDF")
154
156
  sys.exit(-1)
155
157
 
158
+ _set_log_level(args)
156
159
  return args
160
+
161
+
162
+ def _set_log_level(args: Namespace):
163
+ if args.debug:
164
+ log.setLevel(logging.DEBUG)
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pdfalyzer
3
- Version: 1.17.1
4
- Summary: PDF analysis tool. Scan a PDF with YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more.
3
+ Version: 1.17.3
4
+ Summary: Analyze PDFs with colors (and YARA). Visualize a PDF's inner tree-like data structure, check it against a library of YARA rules, force decodes of suspicious font binaries, and more.
5
5
  Home-page: https://github.com/michelcrypt4d4mus/pdfalyzer
6
6
  License: GPL-3.0-or-later
7
7
  Keywords: ascii art,binary,color,cybersecurity,DFIR,encoding,font,infosec,maldoc,malicious pdf,malware,malware analysis,pdf,pdfs,pdf analysis,pypdf,threat assessment,threat hunting,threat intelligence,threat research,threatintel,visualization,yara
@@ -252,9 +252,9 @@ The Pdfalyzer comes with a few command line tools:
252
252
 
253
253
  * `combine_pdfs` - Combines multiple PDFs into a single PDF. Run `combine_pdfs --help` for more info.
254
254
  * `extract_pdf_pages` - Extracts page ranges (e.g. "10-25") from a PDF and writes them to a new PDF. Run `extract_pdf_pages --help` for more info.
255
- * `extract_text_from_pdfs` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_text_from_pdfs --help` for more info.
255
+ * `extract_pdf_text` - Extracts text from a PDF, including applying OCR to all embedded images. Run `extract_pdf_text --help` for more info.
256
256
 
257
- Running `extract_text_from_pdfs` requires that you install The Pdfalyzer's optional dependencies:
257
+ Running `extract_pdf_text` requires that you install The Pdfalyzer's optional dependencies:
258
258
 
259
259
  ```bash
260
260
  pipx install pdfalyzer[extract]
@@ -1,13 +1,13 @@
1
1
  .pdfalyzer.example,sha256=sh_qkUBw4hfJia_Dx2wB-fsqJInhx2sSgA7WJz3MHYo,3917
2
- CHANGELOG.md,sha256=KtprK6EZ8FhdPWHs9E-YzGSqHxV_w0GnShvIJ6kMPss,13132
2
+ CHANGELOG.md,sha256=ZFP4uDoiYT-kNa7XJuyNKhIjcvY5DU4CeMSGn0braPU,13301
3
3
  LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
4
- pdfalyzer/__init__.py,sha256=TgCkfaaWuxv3sNMHcMZjh5lAw0oPNYKqJYRXVy9hPKo,6181
4
+ pdfalyzer/__init__.py,sha256=2Gikt_-OSXZqeQij4wSwb65g7jycVAupjeFmXBf51lo,6159
5
5
  pdfalyzer/__main__.py,sha256=Ko_AoAyYMLIe_cmhiUSl6twheLZrGyT8aOSJ2CP7EZY,43
6
6
  pdfalyzer/binary/binary_scanner.py,sha256=gEHHdQ3lKe1P1441fk2OLO8GxJF7FxF8Tq04ElB8ilU,10748
7
7
  pdfalyzer/config.py,sha256=RBtp3Q6IvOW922rcanB4mVceJEM0BEjNybc6_vC7efY,2122
8
8
  pdfalyzer/decorators/document_model_printer.py,sha256=2tjJItZltukmOD2wjGvl2IsBM7Gug59wMHGLpzGDbV0,2682
9
9
  pdfalyzer/decorators/indeterminate_node.py,sha256=QLJr-nGKih8gPZcIqxLU028OwWWD5VjNHYMUjniwT_k,6586
10
- pdfalyzer/decorators/pdf_file.py,sha256=CHXyM8RIvnjKnsDOJxUhk-sfRzLLW50MJpKKTax6Eqk,10274
10
+ pdfalyzer/decorators/pdf_file.py,sha256=Az3TL_Ttj_pDOHoHsiwpNlrCckCgKTp0VuGevJIi_5c,10481
11
11
  pdfalyzer/decorators/pdf_object_properties.py,sha256=Il3RObxQ4XUf0Ei-nd4tjJO0LeaxC6u7yFa3cQs_jVY,5485
12
12
  pdfalyzer/decorators/pdf_tree_node.py,sha256=4LReGJUtG8iEcLUQD1jW-yp3xPWsHrC-3Anbkt7XZ3A,11134
13
13
  pdfalyzer/decorators/pdf_tree_verifier.py,sha256=2hVe9APsAWQZ7ra8AGndHQnGWmmxmb3ZwfJHZuLvsvc,4714
@@ -17,7 +17,7 @@ pdfalyzer/detection/javascript_hunter.py,sha256=_wT2vkKTMlm_RGCjYsmwcmV-ag1qep3E
17
17
  pdfalyzer/detection/yaralyzer_helper.py,sha256=bfIa18f0zdUoOAJIQcwVDjF52sJNV47NdsDasg01uiQ,2194
18
18
  pdfalyzer/font_info.py,sha256=2R85iETY_1eKCeRrkqeIxfPDqXZyWfCNcHx_-aTyF0s,6682
19
19
  pdfalyzer/helpers/dict_helper.py,sha256=2TP0_EJBouaWD6jfnAekrEZ4M5eHKL8Tm61FgXZtBAg,303
20
- pdfalyzer/helpers/filesystem_helper.py,sha256=onXhSMhxo0YkvdKdosRwUo_RGdW6yNzZF5hfjgZ3GBE,5085
20
+ pdfalyzer/helpers/filesystem_helper.py,sha256=QCdUcZlufBWaV38LlrQEGUGOGUtZEozMVSRkoTjwKlU,5046
21
21
  pdfalyzer/helpers/image_helper.py,sha256=QjoAUcKKEtpmuEyOmEfmaUN-lzNykQ1SzqgNn9-R4Y0,1120
22
22
  pdfalyzer/helpers/number_helper.py,sha256=8IlRmaOVLJsUV18VLvWRZU8SzRxL0XZjrY3sjmk2Ro4,292
23
23
  pdfalyzer/helpers/pdf_object_helper.py,sha256=65BlUgnDM9brxJFw_WF8QLomWHaNh-pj88NWoxcMkoQ,1160
@@ -35,8 +35,8 @@ pdfalyzer/output/tables/stream_objects_table.py,sha256=PgQj8oTtW5_X8SMQb3FvCWDS-
35
35
  pdfalyzer/pdf_object_relationship.py,sha256=tOJTp73m82oNZJB7NxvTLv167kqthH8PRbVnev_4uEk,5291
36
36
  pdfalyzer/pdfalyzer.py,sha256=T5U8MZRlL0Kn-GJVqfIIoL7eBUdQeteu50VhQ-IoYh0,11214
37
37
  pdfalyzer/util/adobe_strings.py,sha256=eF4K1RhhR_qgMBT58MzCxWkqQj17OWLCNmG3SjZ9BUs,5045
38
- pdfalyzer/util/argument_parser.py,sha256=OdvGCowGnVNyulqC5968myCxY4gRu6--WmCIdkiXoWA,9732
39
- pdfalyzer/util/cli_tools_argument_parser.py,sha256=EE-lk1ZMv3JlZlZ9N3rAndIlYl1__C0iYG0Ti6MEHjM,6107
38
+ pdfalyzer/util/argument_parser.py,sha256=9ixQMEZz00IPK8xRxuS7DMrQgXIrqhB2ve5W8XTZ1S8,9732
39
+ pdfalyzer/util/cli_tools_argument_parser.py,sha256=RqK_5AWC7qm9Zy7pvDb-J1WSEGBkIyxzNDcFJwSmuX4,6285
40
40
  pdfalyzer/util/debugging.py,sha256=hjYGxptJmal9TTaAUkkoo0oNu2tdx6ZYSyC0WjvzHh0,156
41
41
  pdfalyzer/util/exceptions.py,sha256=XLFFTdx1n6i_VCmvuzvIOCa-djJvGEitfo9lhy3zq0k,98
42
42
  pdfalyzer/util/page_range.py,sha256=NMNh3_TojxTxBIpvUYK1AmvID_m8qOP6AihZrLWZF2I,1652
@@ -47,8 +47,8 @@ pdfalyzer/yara_rules/__init.py__,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
47
47
  pdfalyzer/yara_rules/didier_stevens.yara,sha256=4XhqafU09xzYUP7LCygHHBXOpAXUblJf6Tkn37MUy0w,7253
48
48
  pdfalyzer/yara_rules/lprat.static_file_analysis.yara,sha256=i0CwRH8pBx_QshKFTQtr1CP5n378EZelsF2FxMY2y5A,21859
49
49
  pdfalyzer/yara_rules/pdf_malware.yara,sha256=jDqSTP5BQSi2I_1xZiFZdy68I4oVWDat2j08-qdfbto,91063
50
- pdfalyzer-1.17.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
51
- pdfalyzer-1.17.1.dist-info/METADATA,sha256=nla_K-pZ8XoknqbcCqi90EPydVJ7STe6DDBfOOf_Dso,27309
52
- pdfalyzer-1.17.1.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
53
- pdfalyzer-1.17.1.dist-info/entry_points.txt,sha256=goHVADdqEFcniu4O0k7kabc2rLf3wvRrENJK6c9IkUw,249
54
- pdfalyzer-1.17.1.dist-info/RECORD,,
50
+ pdfalyzer-1.17.3.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
51
+ pdfalyzer-1.17.3.dist-info/METADATA,sha256=MczhorkJI7ozznrHf72k7a0QELDinDNHhex4ur8kSr8,27294
52
+ pdfalyzer-1.17.3.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
53
+ pdfalyzer-1.17.3.dist-info/entry_points.txt,sha256=5ve7Ydx0p33ZuQWol6wIFAPPUgQrEQoJkOy06mD5t9Y,237
54
+ pdfalyzer-1.17.3.dist-info/RECORD,,
@@ -1,7 +1,7 @@
1
1
  [console_scripts]
2
2
  combine_pdfs=pdfalyzer:combine_pdfs
3
3
  extract_pdf_pages=pdfalyzer:extract_pdf_pages
4
- extract_text_from_pdfs=pdfalyzer:extract_text_from_pdfs
4
+ extract_pdf_text=pdfalyzer:extract_pdf_text
5
5
  pdfalyze=pdfalyzer:pdfalyze
6
6
  pdfalyzer_show_color_theme=pdfalyzer:pdfalyzer_show_color_theme
7
7