ai-assurance-toolkit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,104 @@
1
+ Metadata-Version: 2.4
2
+ Name: ai-assurance-toolkit
3
+ Version: 0.1.0
4
+ Summary: A Python toolkit for evaluating AI model reliability, performance, and deployment readiness.
5
+ Author-email: Happy Iguare <haigu1@morgan.edu>
6
+ License: Apache-2.0
7
+ Project-URL: Repository, https://github.com/harpiking/AI-Assurance-Toolkit
8
+ Keywords: ai assurance,model evaluation,machine learning,responsible ai,model audit,nist ai rmf,public sector ai
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
13
+ Classifier: Topic :: Software Development :: Quality Assurance
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: numpy>=1.24
23
+ Requires-Dist: pandas>=2.0
24
+ Requires-Dist: scikit-learn>=1.3
25
+ Requires-Dist: joblib>=1.3
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=8.0; extra == "dev"
28
+ Requires-Dist: build>=1.2; extra == "dev"
29
+ Requires-Dist: twine>=5.0; extra == "dev"
30
+ Requires-Dist: ruff>=0.5; extra == "dev"
31
+ Dynamic: license-file
32
+
33
+ # AI Assurance Toolkit
34
+
35
+ AI Assurance Toolkit is a lightweight Python package for evaluating machine learning model reliability, performance, and deployment readiness.
36
+
37
+ ## What it does
38
+
39
+ The toolkit evaluates a trained machine learning model against a labeled test dataset and generates a structured performance report.
40
+
41
+ ## Metrics included
42
+
43
+ - Accuracy
44
+ - Precision
45
+ - Recall
46
+ - F1 score
47
+ - False positive rate
48
+ - False negative rate
49
+ - AUC-ROC
50
+ - Calibration / Brier score
51
+ - Per-class metrics
52
+ - Plain-English deployment signal
53
+
54
+ ## Installation
55
+
56
+ ```bash
57
+ pip install ai-assurance-toolkit
58
+ ```
59
+
60
+ ## Quick start
61
+
62
+ ```bash
63
+ ai-assurance evaluate \
64
+ --model model.pkl \
65
+ --dataset test_data.csv \
66
+ --target credit_risk \
67
+ --model-name "German Credit Risk Classifier"
68
+ ```
69
+
70
+ ## Example
71
+
72
+ Generate a test model and sample dataset:
73
+
74
+ ```bash
75
+ python examples/setup_test_model.py
76
+ ```
77
+
78
+ Then run the evaluator:
79
+
80
+ ```bash
81
+ ai-assurance evaluate \
82
+ --model model.pkl \
83
+ --dataset test_data.csv \
84
+ --target credit_risk \
85
+ --model-name "German Credit Risk Classifier"
86
+ ```
87
+
88
+ ## Output
89
+
90
+ The package creates:
91
+
92
+ ```text
93
+ module_a_outputs/performance_report.json
94
+ ```
95
+
96
+ ## Python usage
97
+
98
+ ```python
99
+ from ai_assurance_toolkit import run_performance_evaluation
100
+ ```
101
+
102
+ ## License
103
+
104
+ Apache License 2.0.
@@ -0,0 +1,72 @@
1
+ # AI Assurance Toolkit
2
+
3
+ AI Assurance Toolkit is a lightweight Python package for evaluating machine learning model reliability, performance, and deployment readiness.
4
+
5
+ ## What it does
6
+
7
+ The toolkit evaluates a trained machine learning model against a labeled test dataset and generates a structured performance report.
8
+
9
+ ## Metrics included
10
+
11
+ - Accuracy
12
+ - Precision
13
+ - Recall
14
+ - F1 score
15
+ - False positive rate
16
+ - False negative rate
17
+ - AUC-ROC
18
+ - Calibration / Brier score
19
+ - Per-class metrics
20
+ - Plain-English deployment signal
21
+
22
+ ## Installation
23
+
24
+ ```bash
25
+ pip install ai-assurance-toolkit
26
+ ```
27
+
28
+ ## Quick start
29
+
30
+ ```bash
31
+ ai-assurance evaluate \
32
+ --model model.pkl \
33
+ --dataset test_data.csv \
34
+ --target credit_risk \
35
+ --model-name "German Credit Risk Classifier"
36
+ ```
37
+
38
+ ## Example
39
+
40
+ Generate a test model and sample dataset:
41
+
42
+ ```bash
43
+ python examples/setup_test_model.py
44
+ ```
45
+
46
+ Then run the evaluator:
47
+
48
+ ```bash
49
+ ai-assurance evaluate \
50
+ --model model.pkl \
51
+ --dataset test_data.csv \
52
+ --target credit_risk \
53
+ --model-name "German Credit Risk Classifier"
54
+ ```
55
+
56
+ ## Output
57
+
58
+ The package creates:
59
+
60
+ ```text
61
+ module_a_outputs/performance_report.json
62
+ ```
63
+
64
+ ## Python usage
65
+
66
+ ```python
67
+ from ai_assurance_toolkit import run_performance_evaluation
68
+ ```
69
+
70
+ ## License
71
+
72
+ Apache License 2.0.
@@ -0,0 +1,62 @@
1
+ [build-system]
2
+ requires = ["setuptools>=69", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "ai-assurance-toolkit"
7
+ version = "0.1.0"
8
+ description = "A Python toolkit for evaluating AI model reliability, performance, and deployment readiness."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "Apache-2.0" }
12
+
13
+ authors = [
14
+ { name = "Happy Iguare", email = "haigu1@morgan.edu" }
15
+ ]
16
+
17
+ keywords = [
18
+ "ai assurance",
19
+ "model evaluation",
20
+ "machine learning",
21
+ "responsible ai",
22
+ "model audit",
23
+ "nist ai rmf",
24
+ "public sector ai"
25
+ ]
26
+
27
+ classifiers = [
28
+ "Development Status :: 3 - Alpha",
29
+ "Intended Audience :: Developers",
30
+ "Intended Audience :: Science/Research",
31
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
32
+ "Topic :: Software Development :: Quality Assurance",
33
+ "License :: OSI Approved :: Apache Software License",
34
+ "Programming Language :: Python :: 3",
35
+ "Programming Language :: Python :: 3.10",
36
+ "Programming Language :: Python :: 3.11",
37
+ "Programming Language :: Python :: 3.12"
38
+ ]
39
+
40
+ dependencies = [
41
+ "numpy>=1.24",
42
+ "pandas>=2.0",
43
+ "scikit-learn>=1.3",
44
+ "joblib>=1.3"
45
+ ]
46
+
47
+ [project.optional-dependencies]
48
+ dev = [
49
+ "pytest>=8.0",
50
+ "build>=1.2",
51
+ "twine>=5.0",
52
+ "ruff>=0.5"
53
+ ]
54
+
55
+ [project.urls]
56
+ Repository = "https://github.com/harpiking/AI-Assurance-Toolkit"
57
+
58
+ [project.scripts]
59
+ ai-assurance = "ai_assurance_toolkit.cli:main"
60
+
61
+ [tool.setuptools.packages.find]
62
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,5 @@
1
+ from ai_assurance_toolkit.performance_evaluator import run_performance_evaluation
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ __all__ = ["run_performance_evaluation"]
@@ -0,0 +1,66 @@
1
+ import argparse
2
+ import sys
3
+
4
+ from ai_assurance_toolkit.performance_evaluator import evaluate_from_files
5
+
6
+ def main() -> int:
7
+ parser = argparse.ArgumentParser(
8
+ prog="ai-assurance",
9
+ description="AI Assurance Toolkit command-line interface"
10
+ )
11
+
12
+ subparsers = parser.add_subparsers(dest="command", required=True)
13
+
14
+ evaluate_parser = subparsers.add_parser(
15
+ "evaluate",
16
+ help="Evaluate a trained model against a labeled CSV dataset"
17
+ )
18
+
19
+ evaluate_parser.add_argument(
20
+ "--model",
21
+ required=True,
22
+ help="Path to the trained model file, such as model.pkl or model.joblib"
23
+ )
24
+
25
+ evaluate_parser.add_argument(
26
+ "--dataset",
27
+ required=True,
28
+ help="Path to the test dataset CSV file"
29
+ )
30
+
31
+ evaluate_parser.add_argument(
32
+ "--target",
33
+ required=True,
34
+ help="Name of the target column in the dataset"
35
+ )
36
+
37
+ evaluate_parser.add_argument(
38
+ "--model-name",
39
+ default="Unnamed Model",
40
+ help="Human-readable model name for the report"
41
+ )
42
+
43
+ evaluate_parser.add_argument(
44
+ "--output-dir",
45
+ default="module_a_outputs",
46
+ help="Directory where the output report will be saved"
47
+ )
48
+
49
+ args = parser.parse_args()
50
+
51
+ if args.command == "evaluate":
52
+ evaluate_from_files(
53
+ model_path=args.model,
54
+ dataset_path=args.dataset,
55
+ target=args.target,
56
+ model_name=args.model_name,
57
+ output_dir=args.output_dir,
58
+ )
59
+ return 0
60
+
61
+ parser.print_help()
62
+ return 1
63
+
64
+
65
+ if __name__ == "__main__":
66
+ sys.exit(main())
@@ -0,0 +1,726 @@
1
+ """
2
+ performance_evaluator.py
3
+ ------------------------
4
+ Module A, Component 1 — AI Reliability and Performance Test Suite
5
+ AI Assurance Toolkit | U.S. Public-Sector Edition
6
+
7
+ PURPOSE:
8
+ Evaluates a trained machine learning model's performance against a labeled
9
+ test dataset. Produces quantitative metrics used to assess whether a model
10
+ meets the reliability threshold required for operational deployment in a
11
+ government or regulated-sector context.
12
+
13
+ FEDERAL ALIGNMENT:
14
+ Satisfies the NIST AI Risk Management Framework (AI RMF, 2023) — MEASURE
15
+ function, specifically MR-2.5: "AI system performance or assurance criteria
16
+ are established" and MR-2.6: "Evaluations are conducted on AI system
17
+ performance." Also supports OMB Memorandum M-25-21 documentation
18
+ requirements for deployment readiness reviews.
19
+
20
+ INPUTS:
21
+ - A trained, scikit-learn-compatible classification or regression model
22
+ (loaded from a .pkl or .joblib file)
23
+ - A CSV test dataset with features and a labeled target column
24
+
25
+ OUTPUTS:
26
+ - Console-printed metrics summary
27
+ - module_a_outputs/performance_report.json (structured, human-readable)
28
+ """
29
+
30
+ import os
31
+ import sys
32
+ import json
33
+ import argparse
34
+ import warnings
35
+ from datetime import datetime
36
+ from pathlib import Path
37
+ import numpy as np
38
+ import pandas as pd
39
+ import joblib
40
+
41
+ from sklearn.metrics import (
42
+ accuracy_score,
43
+ precision_score,
44
+ recall_score,
45
+ f1_score,
46
+ roc_auc_score,
47
+ confusion_matrix,
48
+ brier_score_loss,
49
+ classification_report,
50
+ )
51
+ from sklearn.calibration import calibration_curve
52
+ from sklearn.preprocessing import label_binarize
53
+
54
+ warnings.filterwarnings("ignore")
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # CONSTANTS
58
+ # ---------------------------------------------------------------------------
59
+
60
+ DEFAULT_OUTPUT_DIR = "module_a_outputs"
61
+ OUTPUT_FILENAME = "performance_report.json"
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # METRIC COMPUTATION
66
+ # ---------------------------------------------------------------------------
67
+
68
def compute_classification_metrics(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    y_prob: np.ndarray | None,
    class_labels: list,
) -> dict:
    """
    Compute the full suite of classification performance metrics.

    Each metric is explained inline so that developers and reviewers unfamiliar
    with a given statistic can understand its operational significance.

    For two-class problems, ``class_labels[1]`` is treated as the positive
    class by every metric (precision, recall, F1, FPR/FNR and Brier score), so
    that the figures describe the same class regardless of how the labels are
    encoded (0/1, 1/2, strings, ...).

    Args:
        y_true:       Ground-truth labels from the test dataset.
        y_pred:       Predicted class labels produced by the model.
        y_prob:       Predicted class probabilities (required for AUC-ROC and
                      calibration). Pass None if the model does not support
                      probability outputs.
        class_labels: Ordered list of unique class label values.

    Returns:
        Dictionary of metric names to computed values (floats or dicts).
        ``auc_roc`` and ``calibration_brier_score`` are None when they cannot
        be computed; the reason is stored under a ``*_note`` or ``*_error`` key.
    """
    metrics = {}

    # ------------------------------------------------------------------
    # ACCURACY
    # What it measures: The percentage of all predictions the model got right.
    # Why it matters:   Provides a single top-line number for overall correctness.
    #                   However, it can be misleading when class sizes are unequal
    #                   (e.g., 95% of records belong to one class).
    # ------------------------------------------------------------------
    metrics["accuracy"] = float(accuracy_score(y_true, y_pred))

    # Binary problems score the positive class only; multi-class problems use
    # a support-weighted average over all classes.
    is_binary = len(class_labels) == 2
    if is_binary:
        # Pin the positive class explicitly. scikit-learn's default is
        # pos_label=1, which is wrong for labels such as [1, 2] and raises
        # for string labels.
        positive_label = class_labels[1]
        averaging = {"average": "binary", "pos_label": positive_label}
    else:
        positive_label = None
        averaging = {"average": "weighted"}

    # ------------------------------------------------------------------
    # PRECISION
    # What it measures: Of all cases the model flagged as positive, what
    #                   fraction were actually positive?
    # Why it matters:   Low precision means many false alarms. In government
    #                   contexts (e.g., benefits eligibility), false alarms
    #                   can impose unnecessary burden on individuals.
    # ------------------------------------------------------------------
    metrics["precision"] = float(
        precision_score(y_true, y_pred, zero_division=0, **averaging)
    )

    # ------------------------------------------------------------------
    # RECALL (also called Sensitivity or True Positive Rate)
    # What it measures: Of all actual positive cases, what fraction did the
    #                   model correctly identify?
    # Why it matters:   Low recall means the model is missing real cases. In
    #                   high-stakes settings (e.g., fraud detection, safety
    #                   screening), missed detections can be costly or dangerous.
    # ------------------------------------------------------------------
    metrics["recall"] = float(
        recall_score(y_true, y_pred, zero_division=0, **averaging)
    )

    # ------------------------------------------------------------------
    # F1 SCORE
    # What it measures: The harmonic mean of precision and recall. It balances
    #                   both false alarms and missed detections into a single number.
    # Why it matters:   Useful when both types of error are important. A high F1
    #                   score indicates the model handles both precision and recall
    #                   well simultaneously.
    # ------------------------------------------------------------------
    metrics["f1_score"] = float(
        f1_score(y_true, y_pred, zero_division=0, **averaging)
    )

    # ------------------------------------------------------------------
    # FALSE POSITIVE RATE (FPR)
    # What it measures: Of all actual negative cases, what fraction did the
    #                   model incorrectly flag as positive?
    # Why it matters:   High FPR leads to resources being spent investigating
    #                   non-issues. Critical in screening or triage contexts.
    #
    # FALSE NEGATIVE RATE (FNR)
    # What it measures: Of all actual positive cases, what fraction did the
    #                   model miss (classify as negative)?
    # Why it matters:   In the binary case FNR = 1 - Recall. Especially critical
    #                   in safety, health, or fraud contexts where missing a
    #                   real event has severe consequences.
    # ------------------------------------------------------------------
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)
    if is_binary:
        # With labels=[negative, positive]: cm = [[TN, FP], [FN, TP]]
        tn, fp, fn, tp = cm.ravel()
        fpr = float(fp / (fp + tn)) if (fp + tn) > 0 else 0.0
        fnr = float(fn / (fn + tp)) if (fn + tp) > 0 else 0.0
    else:
        # Multi-class: one-vs-rest rate per class, then an unweighted (macro) mean.
        fpr_list, fnr_list = [], []
        total = cm.sum()
        for i in range(len(class_labels)):
            tp_i = cm[i, i]
            fn_i = cm[i, :].sum() - tp_i
            fp_i = cm[:, i].sum() - tp_i
            tn_i = total - tp_i - fn_i - fp_i
            fpr_list.append(fp_i / (fp_i + tn_i) if (fp_i + tn_i) > 0 else 0.0)
            fnr_list.append(fn_i / (fn_i + tp_i) if (fn_i + tp_i) > 0 else 0.0)
        fpr = float(np.mean(fpr_list))
        fnr = float(np.mean(fnr_list))

    metrics["false_positive_rate"] = fpr
    metrics["false_negative_rate"] = fnr

    # ------------------------------------------------------------------
    # AUC-ROC (Area Under the Receiver Operating Characteristic Curve)
    # What it measures: The model's ability to distinguish between classes across
    #                   all possible decision thresholds. Ranges from 0.0 to 1.0;
    #                   0.5 corresponds to random guessing and 1.0 to perfect
    #                   discrimination.
    # Why it matters:   Unlike accuracy, AUC-ROC is threshold-independent. A value
    #                   above 0.80 is generally considered good; below 0.70 raises
    #                   concerns about model reliability.
    # ------------------------------------------------------------------
    if y_prob is not None:
        try:
            if is_binary:
                # Column index 1 is used as the positive-class probability.
                # NOTE(review): assumes y_prob columns follow the same order as
                # class_labels (i.e. the model's classes_ order) - confirm for
                # non-scikit-learn models.
                auc = float(roc_auc_score(y_true, y_prob[:, 1]))
            else:
                # Multi-class: one-vs-rest AUC, weighted by class support.
                y_true_bin = label_binarize(y_true, classes=class_labels)
                auc = float(
                    roc_auc_score(y_true_bin, y_prob, multi_class="ovr", average="weighted")
                )
            metrics["auc_roc"] = auc
        except Exception as exc:
            # Best-effort metric: record why it failed instead of aborting the report.
            metrics["auc_roc"] = None
            metrics["auc_roc_error"] = str(exc)
    else:
        metrics["auc_roc"] = None
        metrics["auc_roc_note"] = "Model does not support probability outputs; AUC-ROC not computed."

    # ------------------------------------------------------------------
    # CALIBRATION SCORE (Brier Score)
    # What it measures: How closely the model's predicted probabilities match
    #                   actual observed outcomes. A score of 0.0 is perfect;
    #                   0.25 is equivalent to always predicting 50% probability.
    # Why it matters:   A well-calibrated model is important when predicted
    #                   probabilities are used for decision-making thresholds
    #                   (e.g., "flag if probability > 0.7"). Poor calibration
    #                   means confidence scores cannot be trusted at face value.
    # ------------------------------------------------------------------
    if y_prob is not None and is_binary:
        try:
            # pos_label keeps the Brier score aligned with the column-1
            # probabilities for any label encoding.
            brier = float(
                brier_score_loss(y_true, y_prob[:, 1], pos_label=positive_label)
            )
            metrics["calibration_brier_score"] = brier
        except Exception as exc:
            metrics["calibration_brier_score"] = None
            metrics["calibration_brier_error"] = str(exc)
    else:
        metrics["calibration_brier_score"] = None
        if y_prob is None:
            metrics["calibration_note"] = "Calibration requires probability outputs."
        else:
            metrics["calibration_note"] = "Brier score computed for binary classification only."

    # ------------------------------------------------------------------
    # PER-CLASS BREAKDOWN
    # What it measures: Precision, recall, and F1 for each individual class.
    # Why it matters:   Overall metrics can hide poor performance on minority
    #                   classes. Per-class detail is required for fairness review.
    # ------------------------------------------------------------------
    report_dict = classification_report(
        y_true, y_pred, labels=class_labels, output_dict=True, zero_division=0
    )
    per_class = {}
    for label in class_labels:
        key = str(label)
        if key not in report_dict:
            continue
        row = report_dict[key]
        per_class[key] = {
            "precision": round(row["precision"], 4),
            "recall": round(row["recall"], 4),
            "f1_score": round(row["f1-score"], 4),
            "support": int(row["support"]),
        }
    metrics["per_class_metrics"] = per_class

    return metrics
253
+
254
+
255
+ # ---------------------------------------------------------------------------
256
+ # PLAIN-ENGLISH INTERPRETATION
257
+ # ---------------------------------------------------------------------------
258
+
259
def generate_plain_english_summary(metrics: dict, model_name: str) -> dict:
    """
    Translate computed metrics into plain-English findings and operational
    implications for a non-technical government program manager audience.

    The deployment signal is derived from the number of concerns raised:
    none -> approved, one or two -> approved with conditions, three or more ->
    not recommended. If no metric value is available at all, the function
    fails closed and reports "NOT RECOMMENDED FOR DEPLOYMENT" rather than
    approving a model for which there is no evidence.

    Args:
        metrics:    The metrics dictionary from compute_classification_metrics().
                    Missing keys or None values are skipped.
        model_name: Human-readable name of the model being evaluated. It is
                    not used in the generated text.

    Returns:
        Dictionary with 'deployment_signal', 'signal_explanation', 'findings',
        'strengths', 'concerns', and 'note' keys.
    """
    findings: list[str] = []
    concerns: list[str] = []
    strengths: list[str] = []

    note = (
        "This summary is based solely on overall test-set performance. "
        "Subgroup disparity analysis (Component 2) and robustness testing (Component 3) "
        "are required before a final deployment recommendation can be issued."
    )

    accuracy = metrics.get("accuracy")
    precision = metrics.get("precision")
    recall = metrics.get("recall")
    f1 = metrics.get("f1_score")
    fpr = metrics.get("false_positive_rate")
    fnr = metrics.get("false_negative_rate")
    auc = metrics.get("auc_roc")
    brier = metrics.get("calibration_brier_score")

    # Fail closed: zero concerns must not mean "approved" when nothing was measured.
    if all(
        value is None
        for value in (accuracy, precision, recall, f1, fpr, fnr, auc, brier)
    ):
        return {
            "deployment_signal": "NOT RECOMMENDED FOR DEPLOYMENT",
            "signal_explanation": (
                "No performance metrics were available, so model performance could not be assessed. "
                "Evaluate the model against a labeled test dataset before considering deployment."
            ),
            "findings": findings,
            "strengths": strengths,
            "concerns": ["No performance metrics were available for evaluation."],
            "note": note,
        }

    # --- Accuracy ---
    if accuracy is not None:
        pct = round(accuracy * 100, 1)
        if accuracy >= 0.85:
            verdict = "This is generally considered strong baseline performance."
        elif accuracy >= 0.70:
            verdict = "This level of accuracy warrants careful review before operational deployment."
        else:
            verdict = "This accuracy level is low and raises significant concerns about model reliability."
        findings.append(
            f"Overall Accuracy: The model correctly predicted the outcome in {pct}% of "
            f"test cases. " + verdict
        )
        if accuracy >= 0.85:
            strengths.append(f"High overall accuracy ({pct}%).")
        elif accuracy < 0.70:
            concerns.append(
                f"Overall accuracy of {pct}% is below the recommended 70% threshold for deployment consideration."
            )

    # --- Precision ---
    if precision is not None:
        pct = round(precision * 100, 1)
        if precision >= 0.80:
            verdict = "This indicates a low rate of false alarms."
        else:
            verdict = "This suggests a meaningful rate of false alarms that may affect operational trust."
        findings.append(
            f"Precision: When the model predicts a positive outcome, it is correct {pct}% of the time. "
            + verdict
        )
        if precision < 0.70:
            concerns.append(
                f"Precision of {pct}% means more than 30% of positive predictions are incorrect (false alarms)."
            )

    # --- Recall ---
    if recall is not None:
        pct = round(recall * 100, 1)
        if recall >= 0.80:
            verdict = "Few real cases are being missed."
        else:
            verdict = "A notable portion of real cases are being missed, which may be a safety or mission concern."
        findings.append(
            f"Recall (Detection Rate): The model correctly identified {pct}% of actual positive cases. "
            + verdict
        )
        if recall < 0.70:
            concerns.append(
                f"Recall of {pct}% means more than 30% of actual positive cases are going undetected."
            )

    # --- F1 Score ---
    if f1 is not None:
        pct = round(f1 * 100, 1)
        if f1 >= 0.80:
            verdict = "The model handles both false alarms and missed detections well."
        else:
            verdict = "The model shows meaningful trade-offs between false alarms and missed detections."
        findings.append(
            f"F1 Score (Balanced Performance): The combined precision-recall balance score is {pct}%. "
            + verdict
        )
        if f1 >= 0.80:
            strengths.append(f"Strong F1 score ({pct}%) indicating balanced performance.")

    # --- False Positive Rate (contributes a concern or a strength, no finding) ---
    if fpr is not None:
        pct = round(fpr * 100, 1)
        if fpr > 0.15:
            concerns.append(
                f"False Positive Rate of {pct}%: The model incorrectly flags {pct}% of non-cases as positive. "
                "This may place undue burden on individuals or resources."
            )
        else:
            strengths.append(f"Low false positive rate ({pct}%).")

    # --- False Negative Rate (only ever contributes a concern) ---
    if fnr is not None:
        pct = round(fnr * 100, 1)
        if fnr > 0.15:
            concerns.append(
                f"False Negative Rate of {pct}%: The model misses {pct}% of actual positive cases. "
                "This could result in undetected issues with operational or mission impact."
            )

    # --- AUC-ROC ---
    if auc is not None:
        auc_rounded = round(auc, 3)
        if auc >= 0.80:
            verdict = "This indicates strong discriminative power."
        elif auc >= 0.70:
            verdict = "This indicates moderate discriminative power; further review is advised."
        else:
            verdict = "This score is close to random chance, raising serious questions about model validity."
        findings.append(
            f"Discrimination Ability (AUC-ROC): The model's ability to distinguish between outcomes "
            f"scores {auc_rounded} on a scale of 0.5 (random chance) to 1.0 (perfect). "
            + verdict
        )
        if auc >= 0.80:
            strengths.append(f"Strong AUC-ROC score ({auc_rounded}).")
        elif auc < 0.70:
            concerns.append(
                f"AUC-ROC of {auc_rounded} is near random chance; model may lack meaningful predictive power."
            )

    # --- Calibration ---
    if brier is not None:
        brier_rounded = round(brier, 4)
        if brier <= 0.10:
            verdict = "Confidence scores appear reliable."
        elif brier <= 0.20:
            verdict = "Confidence scores should be interpreted with caution."
        else:
            verdict = "Confidence scores are poorly calibrated and should not be used for threshold-based decisions."
        findings.append(
            f"Calibration (Brier Score): The model's confidence scores are calibrated with a Brier score "
            f"of {brier_rounded} (lower is better; 0.25 = random guessing). "
            + verdict
        )
        if brier > 0.20:
            concerns.append(
                f"High Brier score ({brier_rounded}) indicates unreliable probability estimates."
            )

    # --- Overall deployment signal ---
    n_concerns = len(concerns)
    if n_concerns == 0:
        deployment_signal = "APPROVED FOR DEPLOYMENT"
        signal_explanation = (
            "Performance metrics are strong across all dimensions. No significant concerns were identified. "
            "The model appears suitable for deployment pending subgroup and robustness review."
        )
    elif n_concerns <= 2:
        deployment_signal = "APPROVED WITH CONDITIONS"
        signal_explanation = (
            f"{n_concerns} performance concern(s) were identified. The model may be deployable with "
            "additional monitoring, human oversight, or restricted scope. Review the concerns listed below."
        )
    else:
        deployment_signal = "NOT RECOMMENDED FOR DEPLOYMENT"
        signal_explanation = (
            f"{n_concerns} performance concerns were identified. The model does not appear ready for "
            "operational deployment without significant remediation. A detailed remediation plan should be "
            "developed before re-evaluation."
        )

    return {
        "deployment_signal": deployment_signal,
        "signal_explanation": signal_explanation,
        "findings": findings,
        "strengths": strengths,
        "concerns": concerns,
        "note": note,
    }
434
+
435
+
436
+ # ---------------------------------------------------------------------------
437
+ # REPORT ASSEMBLY AND OUTPUT
438
+ # ---------------------------------------------------------------------------
439
+
440
def assemble_report(
    metrics: dict,
    plain_english: dict,
    model_name: str,
    dataset_path: str,
    n_samples: int,
    class_labels: list,
    output_dir: str,
) -> dict:
    """
    Assemble all computed data into the structured JSON report dictionary
    and write it to disk.

    The location the report is written to is recorded in
    ``report_metadata["output_path"]`` so that the report itself documents
    where it was saved, including when a non-default output directory is used.

    Args:
        metrics:       Computed metric values. Must contain the six core
                       metrics (accuracy, precision, recall, f1_score,
                       false_positive_rate, false_negative_rate); AUC-ROC and
                       Brier score may be missing or None.
        plain_english: Plain-English interpretations.
        model_name:    Name of the evaluated model.
        dataset_path:  Path to the test dataset (for traceability).
        n_samples:     Number of test records evaluated.
        class_labels:  List of class label values.
        output_dir:    Directory where the JSON file will be saved. It is
                       created if it does not exist; an empty string means
                       the current working directory.

    Returns:
        The fully assembled report as a Python dictionary.

    Raises:
        KeyError: If one of the six core metrics is missing from ``metrics``.
        OSError:  If the output directory or file cannot be created/written.
    """
    output_path = os.path.join(output_dir, OUTPUT_FILENAME)

    auc_value = metrics.get("auc_roc")
    brier_value = metrics.get("calibration_brier_score")

    overall = {
        "accuracy": round(metrics["accuracy"], 4),
        "precision": round(metrics["precision"], 4),
        "recall": round(metrics["recall"], 4),
        "f1_score": round(metrics["f1_score"], 4),
        "false_positive_rate": round(metrics["false_positive_rate"], 4),
        "false_negative_rate": round(metrics["false_negative_rate"], 4),
        "auc_roc": round(auc_value, 4) if auc_value is not None else None,
        "calibration_brier_score": (
            round(brier_value, 4) if brier_value is not None else None
        ),
    }

    # Propagate any notes/errors explaining why an optional metric is absent.
    for key in ("auc_roc_note", "auc_roc_error", "calibration_note", "calibration_brier_error"):
        if key in metrics:
            overall[key] = metrics[key]

    report = {
        "report_metadata": {
            "report_type": "Model Performance Evaluation",
            "toolkit": "AI Assurance Toolkit — Module A",
            "component": "Component 1: Model Performance Evaluator",
            "federal_alignment": [
                "NIST AI RMF (2023) — MEASURE function, MR-2.5, MR-2.6: Quantifying AI system performance",
                "OMB Memorandum M-25-21 — Documentation supporting deployment readiness reviews",
                "America's AI Action Plan (July 2025) — Responsible AI deployment evaluation",
            ],
            "model_name": model_name,
            # Local wall-clock time, without timezone information.
            "evaluation_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "test_dataset": str(dataset_path),
            "test_sample_size": n_samples,
            "class_labels": [str(c) for c in class_labels],
            "output_path": output_path,
        },
        "performance_metrics": {
            "overall": overall,
            "per_class": metrics.get("per_class_metrics", {}),
        },
        "plain_english_summary": plain_english,
    }

    # os.makedirs("") raises FileNotFoundError, so skip it for the
    # "current directory" case.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as fh:
        json.dump(report, fh, indent=2)

    return report
513
+
514
+
515
+ # ---------------------------------------------------------------------------
516
+ # CONSOLE OUTPUT
517
+ # ---------------------------------------------------------------------------
518
+
519
def print_console_summary(report: dict) -> None:
    """
    Print a formatted summary of key findings to the console.

    Args:
        report: Report dictionary with 'report_metadata',
                'performance_metrics' (containing 'overall') and
                'plain_english_summary' sections. If
                ``report_metadata["output_path"]`` is present it is shown as
                the saved-report location; otherwise the default
                output directory and filename are shown.
    """
    meta = report["report_metadata"]
    overall = report["performance_metrics"]["overall"]
    summary = report["plain_english_summary"]

    # Optional metrics are None when they could not be computed.
    auc_val = overall.get("auc_roc")
    auc_text = f"{auc_val:.4f}" if auc_val is not None else "N/A"
    brier_val = overall.get("calibration_brier_score")
    brier_text = f"{brier_val:.4f}" if brier_val is not None else "N/A"

    # Prefer the recorded location so a custom output directory is reported
    # correctly; fall back to the default location when none is recorded.
    saved_to = meta.get("output_path") or f"{DEFAULT_OUTPUT_DIR}/{OUTPUT_FILENAME}"

    divider = "=" * 70
    print(f"\n{divider}")
    print(" AI ASSURANCE TOOLKIT — Module A, Component 1")
    print(" Model Performance Evaluation Report")
    print(divider)
    print(f" Model: {meta['model_name']}")
    print(f" Evaluation Date: {meta['evaluation_date']}")
    print(f" Test Samples: {meta['test_sample_size']:,}")
    print(f" Classes: {', '.join(meta['class_labels'])}")
    print(divider)
    print(" PERFORMANCE METRICS")
    print(f" Accuracy: {overall['accuracy']:.4f}")
    print(f" Precision: {overall['precision']:.4f}")
    print(f" Recall: {overall['recall']:.4f}")
    print(f" F1 Score: {overall['f1_score']:.4f}")
    print(f" False Positive Rate: {overall['false_positive_rate']:.4f}")
    print(f" False Negative Rate: {overall['false_negative_rate']:.4f}")
    print(f" AUC-ROC: {auc_text}")
    print(f" Calibration (Brier): {brier_text}")
    print(divider)
    print(f" DEPLOYMENT SIGNAL: >>> {summary['deployment_signal']} <<<")
    print(f"\n {summary['signal_explanation']}")
    if summary["concerns"]:
        print("\n CONCERNS IDENTIFIED:")
        for i, c in enumerate(summary["concerns"], 1):
            print(f" {i}. {c}")
    if summary["strengths"]:
        print("\n STRENGTHS IDENTIFIED:")
        for s in summary["strengths"]:
            print(f" ✓ {s}")
    print(divider)
    print(f" Report saved to: {saved_to}")
    print(f"{divider}\n")
560
+
561
+
562
+ # ---------------------------------------------------------------------------
563
+ # PUBLIC API — callable from orchestrator
564
+ # ---------------------------------------------------------------------------
565
+
566
+ def run_performance_evaluation(
567
+ model,
568
+ X_test: pd.DataFrame,
569
+ y_test: pd.Series,
570
+ model_name: str = "Unnamed Model",
571
+ dataset_path: str = "N/A",
572
+ output_dir: str = DEFAULT_OUTPUT_DIR,
573
+ ) -> dict:
574
+ """
575
+ Primary entry point for programmatic use (called by run_evaluation.py).
576
+
577
+ Args:
578
+ model: A fitted scikit-learn-compatible classifier.
579
+ X_test: Feature matrix for the test set (pandas DataFrame).
580
+ y_test: True labels for the test set (pandas Series).
581
+ model_name: Human-readable name for the report.
582
+ dataset_path: Original dataset file path (for audit traceability).
583
+ output_dir: Folder where output files will be written.
584
+
585
+ Returns:
586
+ The assembled report dictionary.
587
+ """
588
+ class_labels = sorted(y_test.unique().tolist())
589
+ y_pred = model.predict(X_test)
590
+
591
+ # Attempt to retrieve probability estimates
592
+ if hasattr(model, "predict_proba"):
593
+ try:
594
+ y_prob = model.predict_proba(X_test)
595
+ except Exception:
596
+ y_prob = None
597
+ else:
598
+ y_prob = None
599
+
600
+ metrics = compute_classification_metrics(y_test.values, y_pred, y_prob, class_labels)
601
+ plain_english = generate_plain_english_summary(metrics, model_name)
602
+ report = assemble_report(
603
+ metrics, plain_english, model_name, dataset_path,
604
+ len(y_test), class_labels, output_dir
605
+ )
606
+ print_console_summary(report)
607
+ return report
608
+
609
+
610
def evaluate_from_files(
    model_path: str,
    dataset_path: str,
    target: str,
    model_name: str = "Unnamed Model",
    output_dir: str = DEFAULT_OUTPUT_DIR,
) -> dict:
    """
    Load a serialized model and a CSV test set from disk and evaluate the model.

    This function is used by the command-line interface. Note that
    validate_inputs() terminates the process via sys.exit(1) when either
    file is missing.

    Args:
        model_path: Path to the serialized model, loaded with joblib.load.
        dataset_path: Path to the test dataset CSV file.
        target: Name of the label column in the dataset.
        model_name: Human-readable name for the report.
        output_dir: Folder where output files will be written.

    Returns:
        The report dictionary produced by run_performance_evaluation().

    Raises:
        ValueError: If the target column is absent, or if the model exposes
            feature_names_in_ and the dataset lacks any of those columns.
    """
    validate_inputs(model_path, dataset_path)

    print(f"\n[INFO] Loading model from: {model_path}")
    # NOTE: joblib.load unpickles the file; only load trusted model files.
    model = joblib.load(model_path)

    print(f"[INFO] Loading dataset from: {dataset_path}")
    frame = pd.read_csv(dataset_path)

    if target not in frame.columns:
        available = list(frame.columns)
        raise ValueError(
            f"Target column '{target}' not found in dataset. "
            f"Available columns: {available}"
        )

    labels = frame[target]

    if not hasattr(model, "feature_names_in_"):
        # No recorded feature names: use every column except the label.
        features = frame.drop(columns=[target])
    else:
        # Select exactly the columns the model reports, in its order.
        required = list(model.feature_names_in_)
        absent = [name for name in required if name not in frame.columns]
        if absent:
            raise ValueError(
                f"The dataset is missing required model feature columns: {absent}"
            )
        features = frame[required]

    print(f"[INFO] Dataset loaded: {len(frame):,} rows, {len(features.columns)} features.")
    print(f"[INFO] Beginning performance evaluation for: {model_name}\n")

    return run_performance_evaluation(
        model=model,
        X_test=features,
        y_test=labels,
        model_name=model_name,
        dataset_path=dataset_path,
        output_dir=output_dir,
    )
661
+
662
+ # ---------------------------------------------------------------------------
663
+ # STANDALONE CLI ENTRYPOINT
664
+ # ---------------------------------------------------------------------------
665
+
666
def parse_args() -> argparse.Namespace:
    """Build the standalone CLI parser and parse the process arguments.

    Returns:
        Namespace with attributes model, dataset, target, model_name and
        output_dir.
    """
    cli = argparse.ArgumentParser(
        description="Component 1 — Model Performance Evaluator (AI Assurance Toolkit)"
    )

    # Mandatory options, registered in the order they appear in --help.
    mandatory = (
        ("--model", "Path to the serialized model file (.pkl or .joblib)"),
        ("--dataset", "Path to the test dataset CSV file"),
        ("--target", "Name of the target (label) column in the dataset"),
    )
    for flag, help_text in mandatory:
        cli.add_argument(flag, required=True, help=help_text)

    # Optional options with defaults.
    cli.add_argument(
        "--model-name",
        default="Unnamed Model",
        help="Human-readable model name for the report",
    )
    cli.add_argument(
        "--output-dir",
        default=DEFAULT_OUTPUT_DIR,
        help="Directory for output files",
    )
    return cli.parse_args()
676
+
677
+
678
def validate_inputs(model_path: str, dataset_path: str) -> None:
    """
    Validate that required input files exist before any processing begins.
    Exits with a descriptive error message if validation fails.

    Args:
        model_path: Path to the serialized model file.
        dataset_path: Path to the test dataset CSV file.

    Raises:
        SystemExit: With exit code 1 if either path is not an existing
            regular file. The model path is checked first.
    """
    for label, candidate in (("Model", model_path), ("Dataset", dataset_path)):
        # is_file() rather than exists(): a directory at this path is not a
        # usable input and would only fail later inside the loader.
        if not Path(candidate).is_file():
            print(f"\n[ERROR] {label} file not found: '{candidate}'")
            print(" Please verify the file path and try again.")
            sys.exit(1)
691
+
692
+
693
def main() -> None:
    """Standalone CLI entry point.

    Parses the command-line arguments and delegates to
    evaluate_from_files(), so the standalone CLI uses the same input
    validation and feature-column selection as the programmatic file-based
    entry point instead of a separate copy of that logic.

    Raises:
        SystemExit: With exit code 1 if an input file is missing (raised
            inside validate_inputs) or if evaluation raises ValueError,
            e.g. for a missing target or feature column.
    """
    args = parse_args()

    try:
        evaluate_from_files(
            model_path=args.model,
            dataset_path=args.dataset,
            target=args.target,
            model_name=args.model_name,
            output_dir=args.output_dir,
        )
    except ValueError as exc:
        # Report bad-input errors in the CLI's [ERROR] style rather than
        # as a traceback.
        print(f"\n[ERROR] {exc}")
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,104 @@
1
+ Metadata-Version: 2.4
2
+ Name: ai-assurance-toolkit
3
+ Version: 0.1.0
4
+ Summary: A Python toolkit for evaluating AI model reliability, performance, and deployment readiness.
5
+ Author-email: Happy Iguare <haigu1@morgan.edu>
6
+ License: Apache-2.0
7
+ Project-URL: Repository, https://github.com/harpiking/AI-Assurance-Toolkit
8
+ Keywords: ai assurance,model evaluation,machine learning,responsible ai,model audit,nist ai rmf,public sector ai
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
13
+ Classifier: Topic :: Software Development :: Quality Assurance
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: numpy>=1.24
23
+ Requires-Dist: pandas>=2.0
24
+ Requires-Dist: scikit-learn>=1.3
25
+ Requires-Dist: joblib>=1.3
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=8.0; extra == "dev"
28
+ Requires-Dist: build>=1.2; extra == "dev"
29
+ Requires-Dist: twine>=5.0; extra == "dev"
30
+ Requires-Dist: ruff>=0.5; extra == "dev"
31
+ Dynamic: license-file
32
+
33
+ # AI Assurance Toolkit
34
+
35
+ AI Assurance Toolkit is a lightweight Python package for evaluating machine learning model reliability, performance, and deployment readiness.
36
+
37
+ ## What it does
38
+
39
+ The toolkit evaluates a trained machine learning model against a labeled test dataset and generates a structured performance report.
40
+
41
+ ## Metrics included
42
+
43
+ - Accuracy
44
+ - Precision
45
+ - Recall
46
+ - F1 score
47
+ - False positive rate
48
+ - False negative rate
49
+ - AUC-ROC
50
+ - Calibration / Brier score
51
+ - Per-class metrics
52
+ - Plain-English deployment signal
53
+
54
+ ## Installation
55
+
56
+ ```bash
57
+ pip install ai-assurance-toolkit
58
+ ```
59
+
60
+ ## Quick start
61
+
62
+ ```bash
63
+ ai-assurance evaluate \
64
+ --model model.pkl \
65
+ --dataset test_data.csv \
66
+ --target credit_risk \
67
+ --model-name "German Credit Risk Classifier"
68
+ ```
69
+
70
+ ## Example
71
+
72
+ Generate a test model and sample dataset:
73
+
74
+ ```bash
75
+ python examples/setup_test_model.py
76
+ ```
77
+
78
+ Then run the evaluator:
79
+
80
+ ```bash
81
+ ai-assurance evaluate \
82
+ --model model.pkl \
83
+ --dataset test_data.csv \
84
+ --target credit_risk \
85
+ --model-name "German Credit Risk Classifier"
86
+ ```
87
+
88
+ ## Output
89
+
90
+ The package creates:
91
+
92
+ ```text
93
+ module_a_outputs/performance_report.json
94
+ ```
95
+
96
+ ## Python usage
97
+
98
+ ```python
99
+ from ai_assurance_toolkit import run_performance_evaluation
100
+ ```
101
+
102
+ ## License
103
+
104
+ Apache License 2.0.
@@ -0,0 +1,13 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/ai_assurance_toolkit/__init__.py
5
+ src/ai_assurance_toolkit/cli.py
6
+ src/ai_assurance_toolkit/performance_evaluator.py
7
+ src/ai_assurance_toolkit.egg-info/PKG-INFO
8
+ src/ai_assurance_toolkit.egg-info/SOURCES.txt
9
+ src/ai_assurance_toolkit.egg-info/dependency_links.txt
10
+ src/ai_assurance_toolkit.egg-info/entry_points.txt
11
+ src/ai_assurance_toolkit.egg-info/requires.txt
12
+ src/ai_assurance_toolkit.egg-info/top_level.txt
13
+ tests/test_import.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ ai-assurance = ai_assurance_toolkit.cli:main
@@ -0,0 +1,10 @@
1
+ numpy>=1.24
2
+ pandas>=2.0
3
+ scikit-learn>=1.3
4
+ joblib>=1.3
5
+
6
+ [dev]
7
+ pytest>=8.0
8
+ build>=1.2
9
+ twine>=5.0
10
+ ruff>=0.5
@@ -0,0 +1,4 @@
1
def test_package_imports():
    """Smoke test: the package imports and reports the released version."""
    import ai_assurance_toolkit as toolkit

    expected_version = "0.1.0"
    assert toolkit.__version__ == expected_version