modelaudit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,128 @@
1
+ Metadata-Version: 2.1
2
+ Name: modelaudit
3
+ Version: 0.1.0
4
+ Summary: Model file scanning library for detecting malicious code in ML model files
5
+ Home-page: https://github.com/promptfoo/modelaudit
6
+ License: MIT
7
+ Keywords: ai,ml,security,model-scanning,pickle,tensorflow,pytorch
8
+ Author: Ian Webster
9
+ Author-email: ian@promptfoo.dev
10
+ Requires-Python: >=3.9,<4.0
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Classifier: Topic :: Security
22
+ Provides-Extra: all
23
+ Provides-Extra: h5
24
+ Provides-Extra: pytorch
25
+ Provides-Extra: tensorflow
26
+ Provides-Extra: yaml
27
+ Requires-Dist: click (>=8.1.3,<9.0.0)
28
+ Requires-Dist: h5py (>=3.1) ; extra == "h5" or extra == "all"
29
+ Requires-Dist: pyyaml (>=6.0,<7.0) ; extra == "yaml" or extra == "all"
30
+ Requires-Dist: tensorflow (>=2.6) ; extra == "tensorflow" or extra == "all"
31
+ Requires-Dist: torch (>=1.6) ; extra == "pytorch" or extra == "all"
32
+ Requires-Dist: yaspin (>=2.3.0,<3.0.0)
33
+ Project-URL: Repository, https://github.com/promptfoo/modelaudit
34
+ Description-Content-Type: text/markdown
35
+
36
+ # ModelAudit
37
+
38
+ A security scanner for machine learning models. Quickly check your AI/ML models for potential security risks before deployment.
39
+
40
+ ## 🔍 What It Does
41
+
42
+ ModelAudit scans ML model files for:
43
+
44
+ - Malicious code (e.g., `os.system` calls in pickled models)
45
+ - Suspicious TensorFlow operations
46
+ - Potentially unsafe Keras Lambda layers
47
+ - Models with blacklisted names
48
+ - Dangerous pickle opcodes and serialization patterns
49
+ - Suspicious string patterns that might indicate encoded payloads
50
+ - Risky configurations in model architectures
51
+ - Suspicious patterns in model manifests and configuration files
52
+
53
+ ## 🚀 Quick Start
54
+
55
+ ### Installation
56
+
57
+ ```bash
58
+ # Using pip
59
+ pip install modelaudit
60
+
61
+ # Or with optional dependencies for specific model formats
62
+ pip install modelaudit[tensorflow,h5,pytorch]
63
+
64
+ # For YAML manifest scanning support
65
+ pip install modelaudit[yaml]
66
+
67
+ # For all dependencies
68
+ pip install modelaudit[all]
69
+ ```
70
+
71
+ ### Basic Usage
72
+
73
+ ```bash
74
+ # Scan one or more models or directories
75
+ modelaudit scan model.pkl model2.h5 models_directory
76
+
77
+ # Export results to JSON
78
+ modelaudit scan model.pkl --format json --output results.json
79
+
80
+ # Set maximum file size to scan
81
+ modelaudit scan model.pkl --max-file-size 1073741824 # 1GB limit
82
+
83
+ # Add custom blacklist patterns
84
+ modelaudit scan model.pkl --blacklist "unsafe_model" --blacklist "malicious_net"
85
+ ```
86
+
87
+ ## ✨ Features
88
+
89
+ - **Multiple Format Support**: Scans PyTorch, TensorFlow, Keras, and pickle models
90
+ - **Automatic Format Detection**: Identifies model formats automatically
91
+ - **Comprehensive Scanning**: Checks for various security issues with severity levels
92
+ - **Batch Processing**: Scan multiple files and directories at once
93
+ - **Configurable Timeouts**: Set scan timeouts for large models
94
+ - **Detailed Reporting**: Get information about scan duration, files scanned, and bytes processed
95
+ - **Structured Output**: Export results as JSON for integration with other tools
96
+ - **Name Blacklisting**: Block models with names matching suspicious patterns
97
+ - **Manifest Scanning**: Detect suspicious patterns in model configuration files
98
+
99
+ ## 🛡️ Scanners
100
+
101
+ ModelAudit includes specialized scanners for different model formats:
102
+
103
+ - **Pickle Scanner**: Detects malicious code and encoded payloads in pickle files
104
+ - **TensorFlow Scanner**: Identifies suspicious operations in SavedModel format
105
+ - **Keras Scanner**: Checks for unsafe Lambda layers and risky configurations
106
+ - **PyTorch Scanner**: Examines PyTorch models for security issues
107
+ - **Manifest Scanner**: Analyzes model manifests and configuration files for suspicious patterns and blacklisted names
108
+
109
+ ## 🛠️ Development
110
+
111
+ ### Using Poetry
112
+
113
+ ```bash
114
+ # Clone the repository
115
+ git clone https://github.com/promptfoo/modelaudit.git
116
+ cd modelaudit
117
+
118
+ # Install dependencies
119
+ poetry install
120
+
121
+ # Install with extras
122
+ poetry install --extras "all"
123
+ ```
124
+
125
+ ## 📝 License
126
+
127
+ This project is licensed under the MIT License.
128
+
@@ -0,0 +1,92 @@
1
+ # ModelAudit
2
+
3
+ A security scanner for machine learning models. Quickly check your AI/ML models for potential security risks before deployment.
4
+
5
+ ## 🔍 What It Does
6
+
7
+ ModelAudit scans ML model files for:
8
+
9
+ - Malicious code (e.g., `os.system` calls in pickled models)
10
+ - Suspicious TensorFlow operations
11
+ - Potentially unsafe Keras Lambda layers
12
+ - Models with blacklisted names
13
+ - Dangerous pickle opcodes and serialization patterns
14
+ - Suspicious string patterns that might indicate encoded payloads
15
+ - Risky configurations in model architectures
16
+ - Suspicious patterns in model manifests and configuration files
17
+
18
+ ## 🚀 Quick Start
19
+
20
+ ### Installation
21
+
22
+ ```bash
23
+ # Using pip
24
+ pip install modelaudit
25
+
26
+ # Or with optional dependencies for specific model formats
27
+ pip install modelaudit[tensorflow,h5,pytorch]
28
+
29
+ # For YAML manifest scanning support
30
+ pip install modelaudit[yaml]
31
+
32
+ # For all dependencies
33
+ pip install modelaudit[all]
34
+ ```
35
+
36
+ ### Basic Usage
37
+
38
+ ```bash
39
+ # Scan one or more models or directories
40
+ modelaudit scan model.pkl model2.h5 models_directory
41
+
42
+ # Export results to JSON
43
+ modelaudit scan model.pkl --format json --output results.json
44
+
45
+ # Set maximum file size to scan
46
+ modelaudit scan model.pkl --max-file-size 1073741824 # 1GB limit
47
+
48
+ # Add custom blacklist patterns
49
+ modelaudit scan model.pkl --blacklist "unsafe_model" --blacklist "malicious_net"
50
+ ```
51
+
52
+ ## ✨ Features
53
+
54
+ - **Multiple Format Support**: Scans PyTorch, TensorFlow, Keras, and pickle models
55
+ - **Automatic Format Detection**: Identifies model formats automatically
56
+ - **Comprehensive Scanning**: Checks for various security issues with severity levels
57
+ - **Batch Processing**: Scan multiple files and directories at once
58
+ - **Configurable Timeouts**: Set scan timeouts for large models
59
+ - **Detailed Reporting**: Get information about scan duration, files scanned, and bytes processed
60
+ - **Structured Output**: Export results as JSON for integration with other tools
61
+ - **Name Blacklisting**: Block models with names matching suspicious patterns
62
+ - **Manifest Scanning**: Detect suspicious patterns in model configuration files
63
+
64
+ ## 🛡️ Scanners
65
+
66
+ ModelAudit includes specialized scanners for different model formats:
67
+
68
+ - **Pickle Scanner**: Detects malicious code and encoded payloads in pickle files
69
+ - **TensorFlow Scanner**: Identifies suspicious operations in SavedModel format
70
+ - **Keras Scanner**: Checks for unsafe Lambda layers and risky configurations
71
+ - **PyTorch Scanner**: Examines PyTorch models for security issues
72
+ - **Manifest Scanner**: Analyzes model manifests and configuration files for suspicious patterns and blacklisted names
73
+
74
+ ## 🛠️ Development
75
+
76
+ ### Using Poetry
77
+
78
+ ```bash
79
+ # Clone the repository
80
+ git clone https://github.com/promptfoo/modelaudit.git
81
+ cd modelaudit
82
+
83
+ # Install dependencies
84
+ poetry install
85
+
86
+ # Install with extras
87
+ poetry install --extras "all"
88
+ ```
89
+
90
+ ## 📝 License
91
+
92
+ This project is licensed under the MIT License.
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,284 @@
1
+ import click
2
+ import sys
3
+ import os
4
+ import json
5
+ import logging
6
+ import time
7
+ from yaspin import yaspin
8
+ from yaspin.spinners import Spinners
9
+
10
+ from .core import scan_model_directory_or_file
11
+ from .scanners import ScanResult, IssueSeverity
12
+
13
# Logging is configured once at import time; every record carries a
# timestamp, the logger name and the level in front of the message.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Package-wide logger used by the CLI commands below.
logger = logging.getLogger("modelaudit")
16
+
17
# Root command group: sub-commands (such as "scan") attach themselves to it
# through the @cli.command decorator. The docstring doubles as the text click
# prints for `--help`, so it is user-facing output.
@click.group()
def cli():
    """My Model Scanner CLI."""
21
+
22
@cli.command("scan")
@click.argument("paths", nargs=-1, type=click.Path(exists=True), required=True)
@click.option("--blacklist", "-b", multiple=True, help="Additional blacklist patterns to check against model names")
@click.option("--format", "-f", type=click.Choice(["text", "json"]), default="text", help="Output format [default: text]")
@click.option("--output", "-o", type=click.Path(), help="Output file path (prints to stdout if not specified)")
@click.option("--timeout", "-t", type=int, default=300, help="Scan timeout in seconds [default: 300]")
@click.option("--verbose", "-v", is_flag=True, help="Enable verbose output")
@click.option("--max-file-size", type=int, default=0,
              help="Maximum file size to scan in bytes [default: unlimited]")
def scan_command(paths, blacklist, format, output, timeout, verbose, max_file_size):
    """
    Scan one or more model files or directories for malicious content or suspicious references.

    Usage: modelaudit scan /path/to/model1 /path/to/model2 ...

    You can specify additional blacklist patterns with --blacklist or -b option:
        modelaudit scan /path/to/model1 /path/to/model2 -b llama -b alpaca

    Advanced options:
        --format, -f       Output format (text or json)
        --output, -o       Write results to a file instead of stdout
        --timeout, -t      Set scan timeout in seconds
        --verbose, -v      Show detailed information during scanning
        --max-file-size    Maximum file size to scan in bytes
    """
    # NOTE: the docstring above is rendered by click as the `scan --help`
    # text, so developer notes live in comments instead.
    #
    # Flow: scan every path, merge the per-path result dicts into one
    # aggregate, render it as text or JSON, write it to --output or stdout,
    # then exit with status 1 if any issue or error was recorded (0 otherwise).

    # Decorations (banner, spinner) are only shown for interactive text output.
    interactive = format == "text" and not output

    if interactive:
        rule = "─" * 80
        header = [
            rule,
            click.style("ModelAudit Security Scanner", fg="blue", bold=True),
            click.style("Scanning for potential security issues in ML model files", fg="cyan"),
            rule,
        ]
        click.echo("\n".join(header))
        click.echo(f"Paths to scan: {click.style(', '.join(paths), fg='green')}")
        if blacklist:
            click.echo(f"Additional blacklist patterns: {click.style(', '.join(blacklist), fg='yellow')}")
        click.echo(rule)
        click.echo("")

    # Set logging level based on verbosity
    if verbose:
        logger.setLevel(logging.DEBUG)

    # Aggregated results across all scanned paths
    aggregated_results = {
        "scanner_names": [],  # Track all scanner names used
        "start_time": time.time(),
        "bytes_scanned": 0,
        "issues": [],
        "has_errors": False,
        "files_scanned": 0,
    }

    # Extensions that are never treated as model files when passed directly.
    non_model_extensions = ('.md', '.txt', '.py', '.js', '.html', '.css', '.json', '.yaml', '.yml')

    for path in paths:
        # Early exit for common non-model file extensions
        if os.path.isfile(path):
            _, ext = os.path.splitext(path)
            if ext.lower() in non_model_extensions:
                if verbose:
                    logger.info(f"Skipping non-model file: {path}")
                # Notices go to stderr so that stdout stays parseable when
                # --format json is used (same channel as the error notices).
                click.echo(f"Skipping non-model file: {path}", err=True)
                continue

        # Show progress indicator if in text mode and not writing to a file
        spinner = None
        if interactive:
            spinner = yaspin(Spinners.dots, text=f"Scanning {click.style(path, fg='cyan')}")
            spinner.start()

        try:
            # Progress is reported through the spinner text when one is active
            progress_callback = None
            if spinner:
                def update_progress(message, percentage):
                    spinner.text = f"{message} ({percentage:.1f}%)"
                progress_callback = update_progress

            results = scan_model_directory_or_file(
                path,
                blacklist_patterns=list(blacklist) if blacklist else None,
                timeout=timeout,
                max_file_size=max_file_size,
                progress_callback=progress_callback,
            )

            # Aggregate results
            aggregated_results["bytes_scanned"] += results.get("bytes_scanned", 0)
            aggregated_results["issues"].extend(results.get("issues", []))
            aggregated_results["files_scanned"] += results.get("files_scanned", 1)  # Count each file scanned
            if results.get("has_errors", False):
                aggregated_results["has_errors"] = True

            # Track scanner names (unique, in first-seen order, ignoring "unknown")
            for scanner in results.get("scanners", []):
                if scanner and scanner != "unknown" and scanner not in aggregated_results["scanner_names"]:
                    aggregated_results["scanner_names"].append(scanner)

            # Show completion status on the spinner
            if spinner:
                # DEBUG severity issues only count when verbose is enabled
                visible_count = sum(
                    1 for issue in results.get("issues", [])
                    if verbose or not isinstance(issue, dict) or issue.get("severity") != "debug"
                )
                spinner.text = f"Scanned {click.style(path, fg='cyan')}"
                if visible_count > 0:
                    spinner.ok(click.style(f"✓ Found {visible_count} issues!", fg="yellow", bold=True))
                else:
                    spinner.ok(click.style("✓", fg="green", bold=True))

        except Exception as e:
            # Top-level boundary: one failing path must not abort the other
            # scans, so the failure is reported and recorded instead of raised.
            if spinner:
                spinner.text = f"Error scanning {click.style(path, fg='cyan')}"
                spinner.fail(click.style("✗", fg="red", bold=True))

            logger.error(f"Error during scan of {path}: {str(e)}", exc_info=verbose)
            click.echo(f"Error scanning {path}: {str(e)}", err=True)
            aggregated_results["has_errors"] = True

    # Calculate total duration
    aggregated_results["duration"] = time.time() - aggregated_results["start_time"]

    # Format the output
    if format == "json":
        output_text = json.dumps(aggregated_results, indent=2)
    else:
        output_text = format_text_output(aggregated_results, verbose)

    # Send output to the specified destination
    if output:
        # The text report is styled with ANSI escape codes for terminals;
        # strip them so the file holds plain text.
        file_text = click.unstyle(output_text) if format == "text" else output_text
        # The text report contains non-ASCII glyphs (─ ✓ ✗ ⚠); an explicit
        # encoding avoids UnicodeEncodeError on non-UTF-8 default locales.
        with open(output, "w", encoding="utf-8") as f:
            f.write(file_text)
    else:
        # Add a separator line between debug output and scan results
        if format == "text":
            click.echo("\n" + "─" * 80)
        click.echo(output_text)

    # Exit with appropriate error code: any issue or scan error is a failure
    failed = aggregated_results.get("has_errors", False) or aggregated_results.get("issues", [])
    sys.exit(1 if failed else 0)
178
+
179
# Label and click.style keyword arguments for each known severity.
# "bright_black" is click's name for grey; "gray" is not a valid colour and
# makes click.style raise TypeError.
_SEVERITY_STYLES = {
    "error": ("[ERROR]", {"fg": "red", "bold": True}),
    "warning": ("[WARNING]", {"fg": "yellow"}),
    "info": ("[INFO]", {"fg": "blue"}),
    "debug": ("[DEBUG]", {"fg": "bright_black"}),
}


def _describe_issue(issue):
    """Return ``(severity, message, location)`` for one reported issue.

    Dict issues are read through their "severity", "message" and "location"
    keys; the severity is lower-cased and defaults to "warning". Anything
    else is rendered via ``str()`` as a warning without a location.
    """
    if isinstance(issue, dict):
        severity = str(issue.get("severity") or "warning").lower()
        return severity, issue.get("message", "Unknown issue"), issue.get("location", "")
    return "warning", str(issue), ""


def _format_size(byte_count):
    """Render a byte count using the largest fitting unit (bytes/KB/MB/GB)."""
    for unit, factor in (("GB", 1024 ** 3), ("MB", 1024 ** 2), ("KB", 1024)):
        if byte_count >= factor:
            return f"{byte_count / factor:.2f} {unit}"
    return f"{byte_count} bytes"


def format_text_output(results, verbose=False):
    """Format scan results as human-readable text with colors.

    Args:
        results: Mapping with the aggregated scan data. The keys read here
            are "scanner_names", "duration", "files_scanned", "bytes_scanned"
            and "issues"; all of them are optional.
        verbose: When true, issues with severity "debug" are listed and
            counted; otherwise they are hidden.

    Returns:
        The report as a single newline-joined string (styled with click).
    """
    lines = []

    # Summary information
    scanner_names = results.get("scanner_names")
    if scanner_names:
        label = "Active Scanner" if len(scanner_names) == 1 else "Active Scanners"
        lines.append(click.style(f"{label}: {', '.join(scanner_names)}", fg="blue", bold=True))
    if "duration" in results:
        lines.append(click.style(f"Scan completed in {results['duration']:.2f} seconds", fg="cyan"))
    if "files_scanned" in results:
        lines.append(click.style(f"Files scanned: {results['files_scanned']}", fg="cyan"))
    if "bytes_scanned" in results:
        lines.append(click.style(f"Scanned {_format_size(results['bytes_scanned'])}", fg="cyan"))

    # Issue details with color-coded severity
    issues = results.get("issues", [])
    described = [_describe_issue(issue) for issue in issues]
    if issues:
        # Debug issues are only shown (and counted) in verbose mode
        visible = [entry for entry in described if verbose or entry[0] != "debug"]

        counts = {}
        for severity, _, _ in visible:
            counts[severity] = counts.get(severity, 0) + 1

        summary_specs = (
            ("error", "errors", {"fg": "red", "bold": True}),
            ("warning", "warnings", {"fg": "yellow"}),
            ("info", "info", {"fg": "blue"}),
            ("debug", "debug", {"fg": "cyan"}),
        )
        issue_summary = [
            click.style(f"{counts[severity]} {noun}", **style)
            for severity, noun, style in summary_specs
            if counts.get(severity)
        ]
        if issue_summary:
            lines.append(click.style("Issues found: ", fg="white") + ", ".join(issue_summary))

        for index, (severity, message, location) in enumerate(visible, 1):
            # Unknown severities get a generic tag instead of crashing
            tag_text, tag_style = _SEVERITY_STYLES.get(
                severity, (f"[{severity.upper()}]", {"fg": "magenta"})
            )
            tag = click.style(tag_text, **tag_style)
            number = click.style(f"{index}.", fg="white", bold=True)
            if location:
                where = click.style(f"{location}", fg="cyan", bold=True)
                lines.append(f"{number} {where}: {tag} {message}")
            else:
                lines.append(f"{number} {tag} {message}")

            # Blank separator between issues, but not after the last shown one
            if index < len(visible):
                lines.append("")
    else:
        lines.append("\n" + click.style("✓ No issues found", fg="green", bold=True))

    # Footer: the status reflects every reported issue, hidden ones included
    lines.append("─" * 80)
    if not issues:
        status = click.style("✓ Scan completed successfully", fg="green", bold=True)
    elif any(severity == "error" for severity, _, _ in described):
        status = click.style("✗ Scan completed with errors", fg="red", bold=True)
    else:
        status = click.style("⚠ Scan completed with warnings", fg="yellow", bold=True)
    lines.append(status)

    return "\n".join(lines)
282
+
283
def main():
    """Entry point: hand control to the click command group."""
    cli()