ai-assurance-toolkit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_assurance_toolkit-0.1.0/LICENSE +0 -0
- ai_assurance_toolkit-0.1.0/PKG-INFO +104 -0
- ai_assurance_toolkit-0.1.0/README.md +72 -0
- ai_assurance_toolkit-0.1.0/pyproject.toml +62 -0
- ai_assurance_toolkit-0.1.0/setup.cfg +4 -0
- ai_assurance_toolkit-0.1.0/src/ai_assurance_toolkit/__init__.py +5 -0
- ai_assurance_toolkit-0.1.0/src/ai_assurance_toolkit/cli.py +66 -0
- ai_assurance_toolkit-0.1.0/src/ai_assurance_toolkit/performance_evaluator.py +726 -0
- ai_assurance_toolkit-0.1.0/src/ai_assurance_toolkit.egg-info/PKG-INFO +104 -0
- ai_assurance_toolkit-0.1.0/src/ai_assurance_toolkit.egg-info/SOURCES.txt +13 -0
- ai_assurance_toolkit-0.1.0/src/ai_assurance_toolkit.egg-info/dependency_links.txt +1 -0
- ai_assurance_toolkit-0.1.0/src/ai_assurance_toolkit.egg-info/entry_points.txt +2 -0
- ai_assurance_toolkit-0.1.0/src/ai_assurance_toolkit.egg-info/requires.txt +10 -0
- ai_assurance_toolkit-0.1.0/src/ai_assurance_toolkit.egg-info/top_level.txt +1 -0
- ai_assurance_toolkit-0.1.0/tests/test_import.py +4 -0
|
File without changes
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ai-assurance-toolkit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Python toolkit for evaluating AI model reliability, performance, and deployment readiness.
|
|
5
|
+
Author-email: Happy Iguare <haigu1@morgan.edu>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Repository, https://github.com/harpiking/AI-Assurance-Toolkit
|
|
8
|
+
Keywords: ai assurance,model evaluation,machine learning,responsible ai,model audit,nist ai rmf,public sector ai
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
14
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: numpy>=1.24
|
|
23
|
+
Requires-Dist: pandas>=2.0
|
|
24
|
+
Requires-Dist: scikit-learn>=1.3
|
|
25
|
+
Requires-Dist: joblib>=1.3
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
28
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
29
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
30
|
+
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# AI Assurance Toolkit
|
|
34
|
+
|
|
35
|
+
AI Assurance Toolkit is a lightweight Python package for evaluating machine learning model reliability, performance, and deployment readiness.
|
|
36
|
+
|
|
37
|
+
## What it does
|
|
38
|
+
|
|
39
|
+
The toolkit evaluates a trained machine learning model against a labeled test dataset and generates a structured performance report.
|
|
40
|
+
|
|
41
|
+
## Metrics included
|
|
42
|
+
|
|
43
|
+
- Accuracy
|
|
44
|
+
- Precision
|
|
45
|
+
- Recall
|
|
46
|
+
- F1 score
|
|
47
|
+
- False positive rate
|
|
48
|
+
- False negative rate
|
|
49
|
+
- AUC-ROC
|
|
50
|
+
- Calibration / Brier score
|
|
51
|
+
- Per-class metrics
|
|
52
|
+
- Plain-English deployment signal
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install ai-assurance-toolkit
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Quick start
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
ai-assurance evaluate \
|
|
64
|
+
--model model.pkl \
|
|
65
|
+
--dataset test_data.csv \
|
|
66
|
+
--target credit_risk \
|
|
67
|
+
--model-name "German Credit Risk Classifier"
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Example
|
|
71
|
+
|
|
72
|
+
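Generate a test model and sample dataset (the `examples/` script lives in the source repository; it is not included in the published package):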
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
python examples/setup_test_model.py
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Then run the evaluator:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
ai-assurance evaluate \
|
|
82
|
+
--model model.pkl \
|
|
83
|
+
--dataset test_data.csv \
|
|
84
|
+
--target credit_risk \
|
|
85
|
+
--model-name "German Credit Risk Classifier"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Output
|
|
89
|
+
|
|
90
|
+
The package creates:
|
|
91
|
+
|
|
92
|
+
```text
|
|
93
|
+
module_a_outputs/performance_report.json
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Python usage
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from ai_assurance_toolkit import run_performance_evaluation
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## License
|
|
103
|
+
|
|
104
|
+
Apache License 2.0.
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# AI Assurance Toolkit
|
|
2
|
+
|
|
3
|
+
AI Assurance Toolkit is a lightweight Python package for evaluating machine learning model reliability, performance, and deployment readiness.
|
|
4
|
+
|
|
5
|
+
## What it does
|
|
6
|
+
|
|
7
|
+
The toolkit evaluates a trained machine learning model against a labeled test dataset and generates a structured performance report.
|
|
8
|
+
|
|
9
|
+
## Metrics included
|
|
10
|
+
|
|
11
|
+
- Accuracy
|
|
12
|
+
- Precision
|
|
13
|
+
- Recall
|
|
14
|
+
- F1 score
|
|
15
|
+
- False positive rate
|
|
16
|
+
- False negative rate
|
|
17
|
+
- AUC-ROC
|
|
18
|
+
- Calibration / Brier score
|
|
19
|
+
- Per-class metrics
|
|
20
|
+
- Plain-English deployment signal
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install ai-assurance-toolkit
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Quick start
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
ai-assurance evaluate \
|
|
32
|
+
--model model.pkl \
|
|
33
|
+
--dataset test_data.csv \
|
|
34
|
+
--target credit_risk \
|
|
35
|
+
--model-name "German Credit Risk Classifier"
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Example
|
|
39
|
+
|
|
40
|
+
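Generate a test model and sample dataset (the `examples/` script lives in the source repository; it is not included in the published package):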
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
python examples/setup_test_model.py
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Then run the evaluator:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
ai-assurance evaluate \
|
|
50
|
+
--model model.pkl \
|
|
51
|
+
--dataset test_data.csv \
|
|
52
|
+
--target credit_risk \
|
|
53
|
+
--model-name "German Credit Risk Classifier"
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Output
|
|
57
|
+
|
|
58
|
+
The package creates:
|
|
59
|
+
|
|
60
|
+
```text
|
|
61
|
+
module_a_outputs/performance_report.json
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Python usage
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
from ai_assurance_toolkit import run_performance_evaluation
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## License
|
|
71
|
+
|
|
72
|
+
Apache License 2.0.
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ai-assurance-toolkit"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A Python toolkit for evaluating AI model reliability, performance, and deployment readiness."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "Apache-2.0" }
|
|
12
|
+
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "Happy Iguare", email = "haigu1@morgan.edu" }
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
keywords = [
|
|
18
|
+
"ai assurance",
|
|
19
|
+
"model evaluation",
|
|
20
|
+
"machine learning",
|
|
21
|
+
"responsible ai",
|
|
22
|
+
"model audit",
|
|
23
|
+
"nist ai rmf",
|
|
24
|
+
"public sector ai"
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
classifiers = [
|
|
28
|
+
"Development Status :: 3 - Alpha",
|
|
29
|
+
"Intended Audience :: Developers",
|
|
30
|
+
"Intended Audience :: Science/Research",
|
|
31
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
32
|
+
"Topic :: Software Development :: Quality Assurance",
|
|
33
|
+
"License :: OSI Approved :: Apache Software License",
|
|
34
|
+
"Programming Language :: Python :: 3",
|
|
35
|
+
"Programming Language :: Python :: 3.10",
|
|
36
|
+
"Programming Language :: Python :: 3.11",
|
|
37
|
+
"Programming Language :: Python :: 3.12"
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
dependencies = [
|
|
41
|
+
"numpy>=1.24",
|
|
42
|
+
"pandas>=2.0",
|
|
43
|
+
"scikit-learn>=1.3",
|
|
44
|
+
"joblib>=1.3"
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
[project.optional-dependencies]
|
|
48
|
+
dev = [
|
|
49
|
+
"pytest>=8.0",
|
|
50
|
+
"build>=1.2",
|
|
51
|
+
"twine>=5.0",
|
|
52
|
+
"ruff>=0.5"
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
[project.urls]
|
|
56
|
+
Repository = "https://github.com/harpiking/AI-Assurance-Toolkit"
|
|
57
|
+
|
|
58
|
+
[project.scripts]
|
|
59
|
+
ai-assurance = "ai_assurance_toolkit.cli:main"
|
|
60
|
+
|
|
61
|
+
[tool.setuptools.packages.find]
|
|
62
|
+
where = ["src"]
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
from ai_assurance_toolkit.performance_evaluator import evaluate_from_files
|
|
5
|
+
|
|
6
|
+
def main() -> int:
|
|
7
|
+
parser = argparse.ArgumentParser(
|
|
8
|
+
prog="ai-assurance",
|
|
9
|
+
description="AI Assurance Toolkit command-line interface"
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
subparsers = parser.add_subparsers(dest="command", required=True)
|
|
13
|
+
|
|
14
|
+
evaluate_parser = subparsers.add_parser(
|
|
15
|
+
"evaluate",
|
|
16
|
+
help="Evaluate a trained model against a labeled CSV dataset"
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
evaluate_parser.add_argument(
|
|
20
|
+
"--model",
|
|
21
|
+
required=True,
|
|
22
|
+
help="Path to the trained model file, such as model.pkl or model.joblib"
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
evaluate_parser.add_argument(
|
|
26
|
+
"--dataset",
|
|
27
|
+
required=True,
|
|
28
|
+
help="Path to the test dataset CSV file"
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
evaluate_parser.add_argument(
|
|
32
|
+
"--target",
|
|
33
|
+
required=True,
|
|
34
|
+
help="Name of the target column in the dataset"
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
evaluate_parser.add_argument(
|
|
38
|
+
"--model-name",
|
|
39
|
+
default="Unnamed Model",
|
|
40
|
+
help="Human-readable model name for the report"
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
evaluate_parser.add_argument(
|
|
44
|
+
"--output-dir",
|
|
45
|
+
default="module_a_outputs",
|
|
46
|
+
help="Directory where the output report will be saved"
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
args = parser.parse_args()
|
|
50
|
+
|
|
51
|
+
if args.command == "evaluate":
|
|
52
|
+
evaluate_from_files(
|
|
53
|
+
model_path=args.model,
|
|
54
|
+
dataset_path=args.dataset,
|
|
55
|
+
target=args.target,
|
|
56
|
+
model_name=args.model_name,
|
|
57
|
+
output_dir=args.output_dir,
|
|
58
|
+
)
|
|
59
|
+
return 0
|
|
60
|
+
|
|
61
|
+
parser.print_help()
|
|
62
|
+
return 1
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
if __name__ == "__main__":
|
|
66
|
+
sys.exit(main())
|
|
@@ -0,0 +1,726 @@
|
|
|
1
|
+
"""
|
|
2
|
+
performance_evaluator.py
|
|
3
|
+
------------------------
|
|
4
|
+
Module A, Component 1 — AI Reliability and Performance Test Suite
|
|
5
|
+
AI Assurance Toolkit | U.S. Public-Sector Edition
|
|
6
|
+
|
|
7
|
+
PURPOSE:
|
|
8
|
+
Evaluates a trained machine learning model's performance against a labeled
|
|
9
|
+
test dataset. Produces quantitative metrics used to assess whether a model
|
|
10
|
+
meets the reliability threshold required for operational deployment in a
|
|
11
|
+
government or regulated-sector context.
|
|
12
|
+
|
|
13
|
+
FEDERAL ALIGNMENT:
|
|
14
|
+
Satisfies the NIST AI Risk Management Framework (AI RMF, 2023) — MEASURE
|
|
15
|
+
function, specifically MR-2.5: "AI system performance or assurance criteria
|
|
16
|
+
are established" and MR-2.6: "Evaluations are conducted on AI system
|
|
17
|
+
performance." Also supports OMB Memorandum M-25-21 documentation
|
|
18
|
+
requirements for deployment readiness reviews.
|
|
19
|
+
|
|
20
|
+
INPUTS:
|
|
21
|
+
- A trained, scikit-learn-compatible classification or regression model
|
|
22
|
+
(loaded from a .pkl or .joblib file)
|
|
23
|
+
- A CSV test dataset with features and a labeled target column
|
|
24
|
+
|
|
25
|
+
OUTPUTS:
|
|
26
|
+
- Console-printed metrics summary
|
|
27
|
+
- module_a_outputs/performance_report.json (structured, human-readable)
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
import os
|
|
31
|
+
import sys
|
|
32
|
+
import json
|
|
33
|
+
import argparse
|
|
34
|
+
import warnings
|
|
35
|
+
from datetime import datetime
|
|
36
|
+
from pathlib import Path
|
|
37
|
+
import numpy as np
|
|
38
|
+
import pandas as pd
|
|
39
|
+
import joblib
|
|
40
|
+
|
|
41
|
+
from sklearn.metrics import (
|
|
42
|
+
accuracy_score,
|
|
43
|
+
precision_score,
|
|
44
|
+
recall_score,
|
|
45
|
+
f1_score,
|
|
46
|
+
roc_auc_score,
|
|
47
|
+
confusion_matrix,
|
|
48
|
+
brier_score_loss,
|
|
49
|
+
classification_report,
|
|
50
|
+
)
|
|
51
|
+
from sklearn.calibration import calibration_curve
|
|
52
|
+
from sklearn.preprocessing import label_binarize
|
|
53
|
+
|
|
54
|
+
warnings.filterwarnings("ignore")
|
|
55
|
+
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
# CONSTANTS
|
|
58
|
+
# ---------------------------------------------------------------------------
|
|
59
|
+
|
|
60
|
+
DEFAULT_OUTPUT_DIR = "module_a_outputs"
|
|
61
|
+
OUTPUT_FILENAME = "performance_report.json"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
# METRIC COMPUTATION
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
|
|
68
|
+
def compute_classification_metrics(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    y_prob: np.ndarray | None,
    class_labels: list,
) -> dict:
    """
    Compute the full suite of classification performance metrics.

    Each metric is explained inline so that developers and reviewers unfamiliar
    with a given statistic can understand its operational significance.

    For binary problems the second entry of ``class_labels`` is treated as the
    positive class everywhere (precision/recall/F1, confusion matrix, Brier
    score), so label sets other than {0, 1} — including string labels — are
    handled consistently.

    Args:
        y_true:       Ground-truth labels from the test dataset.
        y_pred:       Predicted class labels produced by the model.
        y_prob:       Predicted class probabilities (required for AUC-ROC and
                      calibration). Pass None if the model does not support
                      probability outputs. Indexed as ``y_prob[:, 1]`` in the
                      binary case, so it is assumed to be 2-D with one column
                      per class in ``class_labels`` order — TODO confirm
                      against the caller.
        class_labels: Ordered list of unique class label values.

    Returns:
        Dictionary of metric names to computed values (floats or dicts).
        AUC-ROC / Brier entries are None, with an accompanying ``*_note`` or
        ``*_error`` string, when they could not be computed.
    """
    metrics = {}

    # ------------------------------------------------------------------
    # ACCURACY
    # What it measures: The percentage of all predictions the model got right.
    # Why it matters:   Provides a single top-line number for overall correctness.
    #                   However, it can be misleading when class sizes are unequal
    #                   (e.g., 95% of records belong to one class).
    # ------------------------------------------------------------------
    metrics["accuracy"] = float(accuracy_score(y_true, y_pred))

    # Determine whether this is binary or multi-class for averaging strategy
    is_binary = len(class_labels) == 2
    if is_binary:
        # scikit-learn defaults to pos_label=1, which raises ValueError when 1
        # is not one of the labels. Use class_labels[1] explicitly so the
        # positive class matches the confusion-matrix layout used below.
        positive_label = class_labels[1]
        avg_kwargs = {"average": "binary", "pos_label": positive_label}
    else:
        positive_label = None
        # pos_label is only meaningful for average="binary"; omit it otherwise.
        avg_kwargs = {"average": "weighted"}

    # ------------------------------------------------------------------
    # PRECISION
    # What it measures: Of all cases the model flagged as positive, what
    #                   fraction were actually positive?
    # Why it matters:   Low precision means many false alarms. In government
    #                   contexts (e.g., benefits eligibility), false alarms
    #                   can impose unnecessary burden on individuals.
    # ------------------------------------------------------------------
    metrics["precision"] = float(
        precision_score(y_true, y_pred, zero_division=0, **avg_kwargs)
    )

    # ------------------------------------------------------------------
    # RECALL (also called Sensitivity or True Positive Rate)
    # What it measures: Of all actual positive cases, what fraction did the
    #                   model correctly identify?
    # Why it matters:   Low recall means the model is missing real cases. In
    #                   high-stakes settings (e.g., fraud detection, safety
    #                   screening), missed detections can be costly or dangerous.
    # ------------------------------------------------------------------
    metrics["recall"] = float(
        recall_score(y_true, y_pred, zero_division=0, **avg_kwargs)
    )

    # ------------------------------------------------------------------
    # F1 SCORE
    # What it measures: The harmonic mean of precision and recall. It balances
    #                   both false alarms and missed detections into a single number.
    # Why it matters:   Useful when both types of error are important. A high F1
    #                   score indicates the model handles both precision and recall
    #                   well simultaneously.
    # ------------------------------------------------------------------
    metrics["f1_score"] = float(
        f1_score(y_true, y_pred, zero_division=0, **avg_kwargs)
    )

    # ------------------------------------------------------------------
    # FALSE POSITIVE RATE (FPR)
    # What it measures: Of all actual negative cases, what fraction did the
    #                   model incorrectly flag as positive?
    # Why it matters:   High FPR leads to resources being spent investigating
    #                   non-issues. Critical in screening or triage contexts.
    #
    # FALSE NEGATIVE RATE (FNR)
    # What it measures: Of all actual positive cases, what fraction did the
    #                   model miss (classify as negative)?
    # Why it matters:   Directly related to recall (FNR = 1 - Recall). Especially
    #                   critical in safety, health, or fraud contexts where missing
    #                   a real event has severe consequences.
    # ------------------------------------------------------------------
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)
    if is_binary:
        # For binary: cm = [[TN, FP], [FN, TP]] with class_labels[1] as positive
        tn, fp, fn, tp = cm.ravel()
        fpr = float(fp / (fp + tn)) if (fp + tn) > 0 else 0.0
        fnr = float(fn / (fn + tp)) if (fn + tp) > 0 else 0.0
    else:
        # For multi-class: treat each class one-vs-rest, then take the
        # unweighted (macro) mean of the per-class FPR and FNR.
        total = cm.sum()
        fpr_list, fnr_list = [], []
        for i in range(len(class_labels)):
            tp_i = cm[i, i]
            fn_i = cm[i, :].sum() - tp_i
            fp_i = cm[:, i].sum() - tp_i
            tn_i = total - tp_i - fn_i - fp_i
            # A zero denominator means the class has no negatives/positives in
            # the test set; report 0.0 rather than dividing by zero.
            fpr_list.append(fp_i / (fp_i + tn_i) if (fp_i + tn_i) > 0 else 0.0)
            fnr_list.append(fn_i / (fn_i + tp_i) if (fn_i + tp_i) > 0 else 0.0)
        fpr = float(np.mean(fpr_list))
        fnr = float(np.mean(fnr_list))

    metrics["false_positive_rate"] = fpr
    metrics["false_negative_rate"] = fnr

    # ------------------------------------------------------------------
    # AUC-ROC (Area Under the Receiver Operating Characteristic Curve)
    # What it measures: The model's ability to distinguish between classes across
    #                   all possible decision thresholds. Ranges from 0.5 (random
    #                   guessing) to 1.0 (perfect discrimination).
    # Why it matters:   Unlike accuracy, AUC-ROC is threshold-independent. A value
    #                   above 0.80 is generally considered good; below 0.70 raises
    #                   concerns about model reliability.
    # ------------------------------------------------------------------
    if y_prob is not None:
        try:
            if is_binary:
                # Use the probability of the positive class (column index 1)
                auc = float(roc_auc_score(y_true, y_prob[:, 1]))
            else:
                # Multi-class: one-vs-rest AUC, weighted by class support
                y_true_bin = label_binarize(y_true, classes=class_labels)
                auc = float(
                    roc_auc_score(y_true_bin, y_prob, multi_class="ovr", average="weighted")
                )
            metrics["auc_roc"] = auc
        except Exception as exc:
            # Best-effort metric: record why it failed instead of aborting the
            # whole evaluation.
            metrics["auc_roc"] = None
            metrics["auc_roc_error"] = str(exc)
    else:
        metrics["auc_roc"] = None
        metrics["auc_roc_note"] = "Model does not support probability outputs; AUC-ROC not computed."

    # ------------------------------------------------------------------
    # CALIBRATION SCORE (Brier Score)
    # What it measures: How closely the model's predicted probabilities match
    #                   actual observed outcomes. A score of 0.0 is perfect;
    #                   0.25 is equivalent to always predicting 50% probability.
    # Why it matters:   A well-calibrated model is important when predicted
    #                   probabilities are used for decision-making thresholds
    #                   (e.g., "flag if probability > 0.7"). Poor calibration
    #                   means confidence scores cannot be trusted at face value.
    # ------------------------------------------------------------------
    if y_prob is not None and is_binary:
        try:
            # pos_label is passed explicitly so string-labelled targets work.
            brier = float(
                brier_score_loss(y_true, y_prob[:, 1], pos_label=positive_label)
            )
            metrics["calibration_brier_score"] = brier
        except Exception as exc:
            # Best-effort metric: keep the failure reason in the report.
            metrics["calibration_brier_score"] = None
            metrics["calibration_brier_error"] = str(exc)
    else:
        metrics["calibration_brier_score"] = None
        if y_prob is None:
            metrics["calibration_note"] = "Calibration requires probability outputs."
        else:
            metrics["calibration_note"] = "Brier score computed for binary classification only."

    # ------------------------------------------------------------------
    # PER-CLASS BREAKDOWN
    # What it measures: Precision, recall, and F1 for each individual class.
    # Why it matters:   Overall metrics can hide poor performance on minority
    #                   classes. Per-class detail is required for fairness review.
    # ------------------------------------------------------------------
    report_dict = classification_report(
        y_true, y_pred, labels=class_labels, output_dict=True, zero_division=0
    )
    per_class = {}
    for label in class_labels:
        key = str(label)
        entry = report_dict.get(key)
        if entry is None:
            continue
        per_class[key] = {
            "precision": round(entry["precision"], 4),
            "recall": round(entry["recall"], 4),
            "f1_score": round(entry["f1-score"], 4),
            "support": int(entry["support"]),
        }
    metrics["per_class_metrics"] = per_class

    return metrics
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
# ---------------------------------------------------------------------------
|
|
256
|
+
# PLAIN-ENGLISH INTERPRETATION
|
|
257
|
+
# ---------------------------------------------------------------------------
|
|
258
|
+
|
|
259
|
+
def generate_plain_english_summary(metrics: dict, model_name: str) -> dict:
    """
    Translate computed metrics into plain-English findings and operational
    implications for a non-technical government program manager audience.

    Metrics that are missing or None are skipped. If none of the expected
    metrics is available, the summary reports "NOT RECOMMENDED FOR DEPLOYMENT"
    instead of treating the absence of concerns as approval.

    Args:
        metrics:    The metrics dictionary from compute_classification_metrics().
        model_name: Human-readable name of the model being evaluated.
                    Accepted for interface compatibility; not currently used
                    in the generated text.

    Returns:
        Dictionary with 'deployment_signal', 'signal_explanation', 'findings',
        'strengths', 'concerns', and 'note' keys.
    """
    findings = []
    concerns = []
    strengths = []

    accuracy = metrics.get("accuracy")
    precision = metrics.get("precision")
    recall = metrics.get("recall")
    f1 = metrics.get("f1_score")
    fpr = metrics.get("false_positive_rate")
    fnr = metrics.get("false_negative_rate")
    auc = metrics.get("auc_roc")
    brier = metrics.get("calibration_brier_score")

    # --- Accuracy ---
    if accuracy is not None:
        pct = round(accuracy * 100, 1)
        if accuracy >= 0.85:
            verdict = "This is generally considered strong baseline performance."
        elif accuracy >= 0.70:
            verdict = "This level of accuracy warrants careful review before operational deployment."
        else:
            verdict = "This accuracy level is low and raises significant concerns about model reliability."
        findings.append(
            f"Overall Accuracy: The model correctly predicted the outcome in {pct}% of "
            f"test cases. " + verdict
        )
        if accuracy >= 0.85:
            strengths.append(f"High overall accuracy ({pct}%).")
        elif accuracy < 0.70:
            concerns.append(f"Overall accuracy of {pct}% is below the recommended 70% threshold for deployment consideration.")

    # --- Precision ---
    if precision is not None:
        pct = round(precision * 100, 1)
        if precision >= 0.80:
            verdict = "This indicates a low rate of false alarms."
        else:
            verdict = "This suggests a meaningful rate of false alarms that may affect operational trust."
        findings.append(
            f"Precision: When the model predicts a positive outcome, it is correct {pct}% of the time. "
            + verdict
        )
        if precision < 0.70:
            concerns.append(f"Precision of {pct}% means more than 30% of positive predictions are incorrect (false alarms).")

    # --- Recall ---
    if recall is not None:
        pct = round(recall * 100, 1)
        if recall >= 0.80:
            verdict = "Few real cases are being missed."
        else:
            verdict = "A notable portion of real cases are being missed, which may be a safety or mission concern."
        findings.append(
            f"Recall (Detection Rate): The model correctly identified {pct}% of actual positive cases. "
            + verdict
        )
        if recall < 0.70:
            concerns.append(f"Recall of {pct}% means more than 30% of actual positive cases are going undetected.")

    # --- F1 Score ---
    if f1 is not None:
        pct = round(f1 * 100, 1)
        if f1 >= 0.80:
            verdict = "The model handles both false alarms and missed detections well."
        else:
            verdict = "The model shows meaningful trade-offs between false alarms and missed detections."
        findings.append(
            f"F1 Score (Balanced Performance): The combined precision-recall balance score is {pct}%. "
            + verdict
        )
        if f1 >= 0.80:
            strengths.append(f"Strong F1 score ({pct}%) indicating balanced performance.")

    # --- False Positive Rate ---
    if fpr is not None:
        pct = round(fpr * 100, 1)
        if fpr > 0.15:
            concerns.append(
                f"False Positive Rate of {pct}%: The model incorrectly flags {pct}% of non-cases as positive. "
                "This may place undue burden on individuals or resources."
            )
        else:
            strengths.append(f"Low false positive rate ({pct}%).")

    # --- False Negative Rate ---
    # Only a high FNR is reported; a low FNR is not listed as a strength.
    if fnr is not None:
        pct = round(fnr * 100, 1)
        if fnr > 0.15:
            concerns.append(
                f"False Negative Rate of {pct}%: The model misses {pct}% of actual positive cases. "
                "This could result in undetected issues with operational or mission impact."
            )

    # --- AUC-ROC ---
    if auc is not None:
        auc_rounded = round(auc, 3)
        if auc >= 0.80:
            verdict = "This indicates strong discriminative power."
        elif auc >= 0.70:
            verdict = "This indicates moderate discriminative power; further review is advised."
        else:
            verdict = "This score is close to random chance, raising serious questions about model validity."
        findings.append(
            f"Discrimination Ability (AUC-ROC): The model's ability to distinguish between outcomes "
            f"scores {auc_rounded} on a scale of 0.5 (random chance) to 1.0 (perfect). "
            + verdict
        )
        if auc >= 0.80:
            strengths.append(f"Strong AUC-ROC score ({auc_rounded}).")
        elif auc < 0.70:
            concerns.append(f"AUC-ROC of {auc_rounded} is near random chance; model may lack meaningful predictive power.")

    # --- Calibration ---
    if brier is not None:
        brier_rounded = round(brier, 4)
        if brier <= 0.10:
            verdict = "Confidence scores appear reliable."
        elif brier <= 0.20:
            verdict = "Confidence scores should be interpreted with caution."
        else:
            verdict = "Confidence scores are poorly calibrated and should not be used for threshold-based decisions."
        findings.append(
            f"Calibration (Brier Score): The model's confidence scores are calibrated with a Brier score "
            f"of {brier_rounded} (lower is better; 0.25 = random guessing). "
            + verdict
        )
        if brier > 0.20:
            concerns.append(f"High Brier score ({brier_rounded}) indicates unreliable probability estimates.")

    # --- Overall deployment signal ---
    no_metrics_available = all(
        value is None
        for value in (accuracy, precision, recall, f1, fpr, fnr, auc, brier)
    )
    if no_metrics_available:
        # Nothing was evaluated, so an empty concerns list is not evidence of
        # good performance; never report this case as approved.
        concerns.append("No performance metrics were available for evaluation.")
        deployment_signal = "NOT RECOMMENDED FOR DEPLOYMENT"
        signal_explanation = (
            "No performance metrics were available, so deployment readiness could not be assessed. "
            "Re-run the evaluation with a valid model and labeled test dataset."
        )
    else:
        n_concerns = len(concerns)
        if n_concerns == 0:
            deployment_signal = "APPROVED FOR DEPLOYMENT"
            signal_explanation = (
                "Performance metrics are strong across all dimensions. No significant concerns were identified. "
                "The model appears suitable for deployment pending subgroup and robustness review."
            )
        elif n_concerns <= 2:
            deployment_signal = "APPROVED WITH CONDITIONS"
            signal_explanation = (
                f"{n_concerns} performance concern(s) were identified. The model may be deployable with "
                "additional monitoring, human oversight, or restricted scope. Review the concerns listed below."
            )
        else:
            deployment_signal = "NOT RECOMMENDED FOR DEPLOYMENT"
            signal_explanation = (
                f"{n_concerns} performance concerns were identified. The model does not appear ready for "
                "operational deployment without significant remediation. A detailed remediation plan should be "
                "developed before re-evaluation."
            )

    return {
        "deployment_signal": deployment_signal,
        "signal_explanation": signal_explanation,
        "findings": findings,
        "strengths": strengths,
        "concerns": concerns,
        "note": (
            "This summary is based solely on overall test-set performance. "
            "Subgroup disparity analysis (Component 2) and robustness testing (Component 3) "
            "are required before a final deployment recommendation can be issued."
        ),
    }
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
# ---------------------------------------------------------------------------
|
|
437
|
+
# REPORT ASSEMBLY AND OUTPUT
|
|
438
|
+
# ---------------------------------------------------------------------------
|
|
439
|
+
|
|
440
|
+
def assemble_report(
    metrics: dict,
    plain_english: dict,
    model_name: str,
    dataset_path: str,
    n_samples: int,
    class_labels: list,
    output_dir: str,
) -> dict:
    """
    Build the structured performance report and persist it as JSON.

    The report has three top-level sections: ``report_metadata``,
    ``performance_metrics`` (``overall`` and ``per_class``) and
    ``plain_english_summary``. The file is written to
    ``output_dir/OUTPUT_FILENAME``; the directory is created if needed.

    Args:
        metrics:        Computed metric values. The six core metrics must be
                        present; ``auc_roc`` and ``calibration_brier_score``
                        may be absent or None.
        plain_english:  Plain-English interpretations, embedded unchanged.
        model_name:     Name of the evaluated model.
        dataset_path:   Path to the test dataset (for traceability).
        n_samples:      Number of test records evaluated.
        class_labels:   List of class label values (stored as strings).
        output_dir:     Directory where the JSON file will be saved.

    Returns:
        The fully assembled report as a Python dictionary.
    """
    def _rounded_or_none(name):
        # Optional metrics are reported as None when missing or unset.
        value = metrics.get(name)
        return None if value is None else round(value, 4)

    metadata = {
        "report_type": "Model Performance Evaluation",
        "toolkit": "AI Assurance Toolkit — Module A",
        "component": "Component 1: Model Performance Evaluator",
        "federal_alignment": [
            "NIST AI RMF (2023) — MEASURE function, MR-2.5, MR-2.6: Quantifying AI system performance",
            "OMB Memorandum M-25-21 — Documentation supporting deployment readiness reviews",
            "America's AI Action Plan (July 2025) — Responsible AI deployment evaluation",
        ],
        "model_name": model_name,
        "evaluation_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "test_dataset": str(dataset_path),
        "test_sample_size": n_samples,
        "class_labels": [str(label) for label in class_labels],
    }

    # Core metrics are mandatory: a missing key raises KeyError.
    core_names = (
        "accuracy",
        "precision",
        "recall",
        "f1_score",
        "false_positive_rate",
        "false_negative_rate",
    )
    overall = {name: round(metrics[name], 4) for name in core_names}
    overall["auc_roc"] = _rounded_or_none("auc_roc")
    overall["calibration_brier_score"] = _rounded_or_none("calibration_brier_score")

    # Carry over any notes/errors recorded for the optional metrics.
    for extra in ("auc_roc_note", "auc_roc_error", "calibration_note", "calibration_brier_error"):
        if extra in metrics:
            overall[extra] = metrics[extra]

    report = {
        "report_metadata": metadata,
        "performance_metrics": {
            "overall": overall,
            "per_class": metrics.get("per_class_metrics", {}),
        },
        "plain_english_summary": plain_english,
    }

    # Persist the report.
    os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, OUTPUT_FILENAME)
    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(report, handle, indent=2)

    return report
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
# ---------------------------------------------------------------------------
|
|
516
|
+
# CONSOLE OUTPUT
|
|
517
|
+
# ---------------------------------------------------------------------------
|
|
518
|
+
|
|
519
|
+
def print_console_summary(report: dict, output_path: str | None = None) -> None:
    """
    Print a formatted summary of key findings to the console.

    Args:
        report:       Assembled report dictionary containing the
                      ``report_metadata``, ``performance_metrics`` and
                      ``plain_english_summary`` sections.
        output_path:  Location of the saved JSON report, shown on the
                      "Report saved to" line. When omitted, the default
                      ``DEFAULT_OUTPUT_DIR/OUTPUT_FILENAME`` location is shown.
    """
    meta = report["report_metadata"]
    overall = report["performance_metrics"]["overall"]
    summary = report["plain_english_summary"]

    if output_path is None:
        # Backward-compatible fallback for callers that do not pass a path.
        output_path = f"{DEFAULT_OUTPUT_DIR}/{OUTPUT_FILENAME}"

    divider = "=" * 70
    print(f"\n{divider}")
    print(" AI ASSURANCE TOOLKIT — Module A, Component 1")
    print(" Model Performance Evaluation Report")
    print(divider)
    print(f" Model: {meta['model_name']}")
    print(f" Evaluation Date: {meta['evaluation_date']}")
    print(f" Test Samples: {meta['test_sample_size']:,}")
    print(f" Classes: {', '.join(meta['class_labels'])}")
    print(divider)
    print(" PERFORMANCE METRICS")
    print(f" Accuracy: {overall['accuracy']:.4f}")
    print(f" Precision: {overall['precision']:.4f}")
    print(f" Recall: {overall['recall']:.4f}")
    print(f" F1 Score: {overall['f1_score']:.4f}")
    print(f" False Positive Rate: {overall['false_positive_rate']:.4f}")
    print(f" False Negative Rate: {overall['false_negative_rate']:.4f}")
    # Optional metrics may be None; show "N/A" in that case.
    auc_val = overall.get("auc_roc")
    print(f" AUC-ROC: {f'{auc_val:.4f}' if auc_val is not None else 'N/A'}")
    brier_val = overall.get("calibration_brier_score")
    print(f" Calibration (Brier): {f'{brier_val:.4f}' if brier_val is not None else 'N/A'}")
    print(divider)
    print(f" DEPLOYMENT SIGNAL: >>> {summary['deployment_signal']} <<<")
    print(f"\n {summary['signal_explanation']}")
    if summary["concerns"]:
        print("\n CONCERNS IDENTIFIED:")
        for i, c in enumerate(summary["concerns"], 1):
            print(f" {i}. {c}")
    if summary["strengths"]:
        print("\n STRENGTHS IDENTIFIED:")
        for s in summary["strengths"]:
            print(f" ✓ {s}")
    print(divider)
    print(f" Report saved to: {output_path}")
    print(f"{divider}\n")


# ---------------------------------------------------------------------------
# PUBLIC API — callable from orchestrator
# ---------------------------------------------------------------------------

def run_performance_evaluation(
    model,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    model_name: str = "Unnamed Model",
    dataset_path: str = "N/A",
    output_dir: str = DEFAULT_OUTPUT_DIR,
) -> dict:
    """
    Primary entry point for programmatic use (called by run_evaluation.py).

    Predicts on the test set, computes metrics, builds the plain-English
    summary, writes the JSON report to ``output_dir`` and prints a console
    summary that names the file actually written.

    Args:
        model:        A fitted scikit-learn-compatible classifier.
        X_test:       Feature matrix for the test set (pandas DataFrame).
        y_test:       True labels for the test set (pandas Series).
        model_name:   Human-readable name for the report.
        dataset_path: Original dataset file path (for audit traceability).
        output_dir:   Folder where output files will be written.

    Returns:
        The assembled report dictionary.
    """
    class_labels = sorted(y_test.unique().tolist())
    y_pred = model.predict(X_test)

    # Probability estimates are best-effort: a model without predict_proba,
    # or one whose predict_proba fails, is evaluated with y_prob=None.
    y_prob = None
    if hasattr(model, "predict_proba"):
        try:
            y_prob = model.predict_proba(X_test)
        except Exception as exc:
            # Report the failure instead of hiding it, then continue.
            print(
                f"[WARN] predict_proba failed ({type(exc).__name__}: {exc}); "
                "continuing without probability estimates."
            )

    metrics = compute_classification_metrics(y_test.values, y_pred, y_prob, class_labels)
    plain_english = generate_plain_english_summary(metrics, model_name)
    report = assemble_report(
        metrics, plain_english, model_name, dataset_path,
        len(y_test), class_labels, output_dir
    )
    # Pass the real destination so the summary does not claim the default
    # directory when a custom output_dir was used.
    print_console_summary(report, output_path=os.path.join(output_dir, OUTPUT_FILENAME))
    return report
|
|
608
|
+
|
|
609
|
+
|
|
610
|
+
def evaluate_from_files(
    model_path: str,
    dataset_path: str,
    target: str,
    model_name: str = "Unnamed Model",
    output_dir: str = DEFAULT_OUTPUT_DIR,
) -> dict:
    """
    Run the performance evaluation starting from files on disk.

    Loads the serialized model and the CSV test set, separates the target
    column from the features, and hands off to run_performance_evaluation.
    This function is used by the command-line interface.

    Args:
        model_path:   Path to the serialized model file.
        dataset_path: Path to the test dataset CSV file.
        target:       Name of the target (label) column in the dataset.
        model_name:   Human-readable name for the report.
        output_dir:   Folder where output files will be written.

    Returns:
        The assembled report dictionary.

    Raises:
        ValueError: If the target column, or a feature column named in the
            model's ``feature_names_in_``, is missing from the dataset.
    """
    validate_inputs(model_path, dataset_path)

    print(f"\n[INFO] Loading model from: {model_path}")
    # NOTE(review): joblib.load unpickles the file, which can execute code
    # embedded in it; model files should come from a trusted source.
    model = joblib.load(model_path)

    print(f"[INFO] Loading dataset from: {dataset_path}")
    frame = pd.read_csv(dataset_path)

    if target not in frame.columns:
        raise ValueError(
            f"Target column '{target}' not found in dataset. "
            f"Available columns: {list(frame.columns)}"
        )

    labels = frame[target]

    if not hasattr(model, "feature_names_in_"):
        # No recorded feature names: use every column except the target.
        features = frame.drop(columns=[target])
    else:
        # Select exactly the columns the model recorded, in that order.
        required = list(model.feature_names_in_)
        absent = [name for name in required if name not in frame.columns]

        if absent:
            raise ValueError(
                f"The dataset is missing required model feature columns: {absent}"
            )

        features = frame[required]

    print(f"[INFO] Dataset loaded: {len(frame):,} rows, {len(features.columns)} features.")
    print(f"[INFO] Beginning performance evaluation for: {model_name}\n")

    return run_performance_evaluation(
        model=model,
        X_test=features,
        y_test=labels,
        model_name=model_name,
        dataset_path=dataset_path,
        output_dir=output_dir,
    )
|
|
661
|
+
|
|
662
|
+
# ---------------------------------------------------------------------------
|
|
663
|
+
# STANDALONE CLI ENTRYPOINT
|
|
664
|
+
# ---------------------------------------------------------------------------
|
|
665
|
+
|
|
666
|
+
def parse_args() -> argparse.Namespace:
    """
    Parse the command-line options for the standalone evaluator.

    ``--model``, ``--dataset`` and ``--target`` are required;
    ``--model-name`` and ``--output-dir`` have defaults.

    Returns:
        The parsed argument namespace.
    """
    cli = argparse.ArgumentParser(
        description="Component 1 — Model Performance Evaluator (AI Assurance Toolkit)"
    )

    # Required options, registered in the order they appear in --help.
    mandatory = (
        ("--model", "Path to the serialized model file (.pkl or .joblib)"),
        ("--dataset", "Path to the test dataset CSV file"),
        ("--target", "Name of the target (label) column in the dataset"),
    )
    for flag, description in mandatory:
        cli.add_argument(flag, required=True, help=description)

    cli.add_argument("--model-name", default="Unnamed Model", help="Human-readable model name for the report")
    cli.add_argument("--output-dir", default=DEFAULT_OUTPUT_DIR, help="Directory for output files")
    return cli.parse_args()
|
|
676
|
+
|
|
677
|
+
|
|
678
|
+
def validate_inputs(model_path: str, dataset_path: str) -> None:
    """
    Validate that required input files exist before any processing begins.

    Each path must refer to an existing regular file. The model path is
    checked first. On the first failure a descriptive error message is
    printed and the process exits with status 1.

    Args:
        model_path:   Path to the serialized model file.
        dataset_path: Path to the test dataset CSV file.

    Raises:
        SystemExit: With exit code 1 if either path is not an existing file.
    """
    for label, candidate in (("Model", model_path), ("Dataset", dataset_path)):
        # is_file() rather than exists(): a directory path would pass
        # exists() and only fail later, inside the loader.
        if not Path(candidate).is_file():
            print(f"\n[ERROR] {label} file not found: '{candidate}'")
            print(" Please verify the file path and try again.")
            sys.exit(1)
|
|
691
|
+
|
|
692
|
+
|
|
693
|
+
def main() -> None:
    """
    Standalone CLI entry point.

    Parses the command-line options and delegates to evaluate_from_files,
    so the CLI and the file-based API share the same loading, validation
    and feature-selection logic (including selecting the model's recorded
    ``feature_names_in_`` columns when the CSV has extra columns).

    A ValueError from the evaluation (for example a missing target or
    feature column) is printed as an ``[ERROR]`` message and the process
    exits with status 1.
    """
    args = parse_args()

    try:
        evaluate_from_files(
            model_path=args.model,
            dataset_path=args.dataset,
            target=args.target,
            model_name=args.model_name,
            output_dir=args.output_dir,
        )
    except ValueError as exc:
        # Show a readable CLI error instead of a traceback.
        print(f"\n[ERROR] {exc}")
        sys.exit(1)
|
|
723
|
+
|
|
724
|
+
|
|
725
|
+
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ai-assurance-toolkit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Python toolkit for evaluating AI model reliability, performance, and deployment readiness.
|
|
5
|
+
Author-email: Happy Iguare <haigu1@morgan.edu>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Repository, https://github.com/harpiking/AI-Assurance-Toolkit
|
|
8
|
+
Keywords: ai assurance,model evaluation,machine learning,responsible ai,model audit,nist ai rmf,public sector ai
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
14
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: numpy>=1.24
|
|
23
|
+
Requires-Dist: pandas>=2.0
|
|
24
|
+
Requires-Dist: scikit-learn>=1.3
|
|
25
|
+
Requires-Dist: joblib>=1.3
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
28
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
29
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
30
|
+
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# AI Assurance Toolkit
|
|
34
|
+
|
|
35
|
+
AI Assurance Toolkit is a lightweight Python package for evaluating machine learning model reliability, performance, and deployment readiness.
|
|
36
|
+
|
|
37
|
+
## What it does
|
|
38
|
+
|
|
39
|
+
The toolkit evaluates a trained machine learning model against a labeled test dataset and generates a structured performance report.
|
|
40
|
+
|
|
41
|
+
## Metrics included
|
|
42
|
+
|
|
43
|
+
- Accuracy
|
|
44
|
+
- Precision
|
|
45
|
+
- Recall
|
|
46
|
+
- F1 score
|
|
47
|
+
- False positive rate
|
|
48
|
+
- False negative rate
|
|
49
|
+
- AUC-ROC
|
|
50
|
+
- Calibration / Brier score
|
|
51
|
+
- Per-class metrics
|
|
52
|
+
- Plain-English deployment signal
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install ai-assurance-toolkit
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Quick start
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
ai-assurance evaluate \
|
|
64
|
+
--model model.pkl \
|
|
65
|
+
--dataset test_data.csv \
|
|
66
|
+
--target credit_risk \
|
|
67
|
+
--model-name "German Credit Risk Classifier"
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Example
|
|
71
|
+
|
|
72
|
+
Generate a test model and sample dataset:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
python examples/setup_test_model.py
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Then run the evaluator:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
ai-assurance evaluate \
|
|
82
|
+
--model model.pkl \
|
|
83
|
+
--dataset test_data.csv \
|
|
84
|
+
--target credit_risk \
|
|
85
|
+
--model-name "German Credit Risk Classifier"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Output
|
|
89
|
+
|
|
90
|
+
The package creates:
|
|
91
|
+
|
|
92
|
+
```text
|
|
93
|
+
module_a_outputs/performance_report.json
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Python usage
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from ai_assurance_toolkit import run_performance_evaluation
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## License
|
|
103
|
+
|
|
104
|
+
Apache License 2.0.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/ai_assurance_toolkit/__init__.py
|
|
5
|
+
src/ai_assurance_toolkit/cli.py
|
|
6
|
+
src/ai_assurance_toolkit/performance_evaluator.py
|
|
7
|
+
src/ai_assurance_toolkit.egg-info/PKG-INFO
|
|
8
|
+
src/ai_assurance_toolkit.egg-info/SOURCES.txt
|
|
9
|
+
src/ai_assurance_toolkit.egg-info/dependency_links.txt
|
|
10
|
+
src/ai_assurance_toolkit.egg-info/entry_points.txt
|
|
11
|
+
src/ai_assurance_toolkit.egg-info/requires.txt
|
|
12
|
+
src/ai_assurance_toolkit.egg-info/top_level.txt
|
|
13
|
+
tests/test_import.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ai_assurance_toolkit
|