model-auditor 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
+ name: Upload Python Package to PyPI when a Release is Created
2
+
3
+ on:
4
+ release:
5
+ types: [created]
6
+
7
+ jobs:
8
+ pypi-publish:
9
+ name: Publish release to PyPI
10
+ runs-on: ubuntu-latest
11
+ environment:
12
+ name: release
13
+ url: https://pypi.org/p/model-auditor
14
+ permissions:
15
+ id-token: write
16
+ contents: read
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+ - name: Set up Python
20
+ uses: actions/setup-python@v4
21
+ with:
22
+ python-version: "3.x"
23
+ - name: Install dependencies
24
+ run: |
25
+ python -m pip install --upgrade pip
26
+ python -m pip install --user --upgrade build
27
+ pip install setuptools wheel
28
+ - name: Build package
29
+ run: |
30
+ python -m build
31
+ - name: Publish package distributions to PyPI
32
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,3 @@
1
+ __pycache__/
2
+ *.egg-info
3
+ .ipynb_checkpoints
@@ -0,0 +1,6 @@
1
+ Metadata-Version: 2.4
2
+ Name: model-auditor
3
+ Version: 0.1.0
4
+ Requires-Dist: pandas>=2.2
5
+ Requires-Dist: numpy>=2.1
6
+ Requires-Dist: scikit-learn>=1.5
@@ -0,0 +1,3 @@
1
+ # Model Auditor [Pre-Alpha]
2
+
3
+ ### Beatrice BM
@@ -0,0 +1 @@
1
+ from model_auditor.core import Auditor
@@ -0,0 +1,336 @@
1
+ from typing import Optional, Type, Union, Callable
2
+ import pandas as pd
3
+ import numpy as np
4
+ from numpy.typing import NDArray
5
+ from sklearn.metrics import roc_curve
6
+ from tqdm.auto import tqdm
7
+
8
+ from model_auditor.metric_inputs import AuditorMetricInput
9
+ from model_auditor.metrics import AuditorMetric
10
+ from model_auditor.schemas import (
11
+ AuditorFeature,
12
+ AuditorScore,
13
+ AuditorOutcome,
14
+ FeatureEvaluation,
15
+ ScoreEvaluation,
16
+ )
17
+ from model_auditor.utils import collect_metric_inputs
18
+
19
+
20
class Auditor:
    """
    Coordinates subgroup evaluation of model scores: stores data, features,
    scores, an outcome and metrics, then computes per-feature-level metrics
    with optional bootstrapped confidence intervals.
    """

    def __init__(
        self,
        data: Optional[pd.DataFrame] = None,
        features: Optional[list[AuditorFeature]] = None,
        scores: Optional[list[AuditorScore]] = None,
        outcome: Optional[AuditorOutcome] = None,
        metrics: Optional[list[AuditorMetric]] = None,
    ) -> None:
        # initialize data (copied so the auditor never mutates the caller's frame)
        self.data: Optional[pd.DataFrame] = None if data is None else data.copy()

        # initialize features
        self.features: dict[str, AuditorFeature] = dict()
        if features is not None:
            for feature in features:
                self.add_feature(**vars(feature))

        # initialize scores
        self.scores: dict[str, AuditorScore] = dict()
        if scores is not None:
            self.add_score(**vars(scores[0])) if len(scores) == 1 else None
            for score in scores[1:] if len(scores) == 1 else scores:
                self.add_score(**vars(score))

        # initialize outcome
        if outcome is not None:
            self.add_outcome(**vars(outcome))

        # initialize metrics (annotation fixed: this holds metrics, not scores)
        self.metrics: list[AuditorMetric] = list()
        if metrics is not None:
            self.metrics = metrics

        # initialize attrs for later
        self._inputs: list[Type[AuditorMetricInput]] = list()
        self._evaluations: list = list()
        self.n_bootstraps: int = 1000

    def add_data(self, data: pd.DataFrame) -> None:
        """
        Method to add a dataframe to the auditor

        Args:
            data (pd.DataFrame): Full dataframe which will be subset for subgroup evaluation
        """
        self.data = data.copy()

    def add_feature(
        self, name: str, label: Optional[str] = None, levels: Optional[list] = None
    ) -> None:
        """
        Method to add a feature to the auditor. Equivalent to a grouping variable in
        packages like tableone, the score variable will be stratified by this feature

        Args:
            name (str): Column name for the feature.
            label (Optional[str], optional): Optional label for the feature. Defaults to None.
            levels (Optional[list], optional): Valid levels to consider for the feature,
                by default (when set to None) all levels will be considered. Defaults to None.
                [Not currently implemented]
        """
        feature = AuditorFeature(
            name=name,
            label=label,
            levels=levels,
        )
        self.features[feature.name] = feature

    def add_score(
        self, name: str, label: Optional[str] = None, threshold: Optional[float] = None
    ) -> None:
        """
        Method to add a score to the auditor. Expects a continuous feature which will
        be used to calculate metrics and confidence intervals

        Args:
            name (str): Column name for the score.
            label (Optional[str], optional): Optional label for the score. Defaults to None.
            threshold (Optional[float], optional): Threshold used to binarize the score column.
                Defaults to None and can be optimized using the Youden index or updated separately later.
        """
        score = AuditorScore(
            name=name,
            label=label,
            threshold=threshold,
        )
        self.scores[score.name] = score

    def add_outcome(self, name: str, mapping: Optional[dict] = None) -> None:
        """
        Method to register the ground-truth outcome column (stored as '_truth').

        Args:
            name (str): Column name holding the outcome.
            mapping (Optional[dict], optional): Optional value mapping applied to the
                outcome column (e.g. {"yes": 1, "no": 0}). Defaults to None.

        Raises:
            ValueError: If no data has been added with .add_data() first
        """
        if self.data is None:
            raise ValueError("Please add data with .add_data() first")

        if mapping is not None:
            self.data["_truth"] = self.data[name].map(mapping)
        else:
            self.data["_truth"] = self.data[name]

    def optimize_score_threshold(self, score_name: str) -> float:
        """
        Method to optimize the decision threshold for a score based on the Youden index.

        Args:
            score_name (str): Name of the target score

        Raises:
            ValueError: If no scores have been defined with .add_score() first
            ValueError: If no data has been added with .add_data() first
            ValueError: If no outcome variable has been defined with .add_outcome() first

        Returns:
            float: Optimal threshold identified
        """
        if len(self.scores) == 0:
            raise ValueError("Please define at least one score first")
        if self.data is None:
            raise ValueError("Please add data with .add_data() first")
        elif "_truth" not in self.data.columns.tolist():
            raise ValueError(
                "Please define an outcome variable data with .add_outcome() first"
            )

        # raises KeyError if the score has not been defined
        score: AuditorScore = self.scores[score_name]
        score_list: list[float] = self.data[score.name].astype(float).tolist()
        truth_list: list[float] = self.data["_truth"].astype(float).tolist()

        # Youden index: the threshold maximizing tpr - fpr on the ROC curve
        fpr, tpr, thresholds = roc_curve(truth_list, score_list)
        idx: int = int(np.argmax(tpr - fpr))
        optimal_threshold: float = thresholds[idx]

        print(f"Optimal threshold for '{score.name}' found at: {optimal_threshold}")
        return optimal_threshold

    def set_metrics(self, metrics: list[AuditorMetric]) -> None:
        """
        Method to define the metrics the auditor will use during evaluation of score variables.

        Args:
            metrics (list[AuditorMetric]): A list of metrics classes following the AuditorMetric
                protocol (pre-made metrics listed in model_auditor.metrics)
        """
        self.metrics: list[AuditorMetric] = metrics

    def _collect_inputs(self) -> None:
        """
        Collects the minimum set of metric inputs necessary for evaluation
        (based on the metrics defined in self.metrics with the .set_metrics() method)
        """
        inputs_set: set[str] = set()
        for metric in self.metrics:
            inputs_set.update(metric.inputs)

        inputs_dict: dict[str, Type[AuditorMetricInput]] = collect_metric_inputs()

        # reinit self._inputs and add all necessary inputs to it; columns created
        # directly by .evaluate() are not registered inputs and must be skipped
        self._inputs: list[Type[AuditorMetricInput]] = list()
        for input_name in inputs_set:
            if input_name not in ("_truth", "_pred", "_binary_pred"):
                self._inputs.append(inputs_dict[input_name])

    def _apply_inputs(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Method to apply the metric input functions (collected with ._collect_inputs())
        to the target data to prepare it for metric calculation

        Args:
            data (pd.DataFrame): Dataframe to add input columns to

        Returns:
            pd.DataFrame: Transformed dataframe with metric input columns
        """
        for input_type in self._inputs:
            metric_input = input_type()
            data = metric_input.data_transform(data)

        return data

    def _binarize(self, score_data: pd.Series, threshold: float) -> pd.Series:
        # scores at or above the threshold count as positive predictions
        return (score_data >= threshold).astype(int)

    def evaluate(
        self, score_name: str, threshold: Optional[float] = None
    ) -> ScoreEvaluation:
        """
        Evaluates one score across all registered features (plus 'Overall').

        Args:
            score_name (str): Name of the target score
            threshold (Optional[float], optional): Binarization threshold; falls back
                to the threshold stored on the score object. Defaults to None.

        Raises:
            ValueError: If data/outcome/metrics have not been configured first
            ValueError: If no threshold is available from either source

        Returns:
            ScoreEvaluation: Per-feature, per-level metric results
        """
        if self.data is None:
            raise ValueError("Please add data with .add_data() first")

        if "_truth" not in self.data.columns:
            raise ValueError(
                "Please define an outcome variable data with .add_outcome() first"
            )

        if len(self.metrics) == 0:
            raise ValueError(
                "Please define at least one metric with .set_metrics() first"
            )

        # get score (raises KeyError if undefined)
        score: AuditorScore = self.scores[score_name]

        # `and`, not bitwise `&`: both operands are plain bools
        if (threshold is None) and (score.threshold is None):
            raise ValueError(
                "Threshold must be defined in score object or passed to .evaluate()"
            )
        elif threshold is None:
            threshold = score.threshold

        # collect metric inputs to prep for evaluation
        self._collect_inputs()

        # get the list of columns to retain in the data
        column_list: list[str] = [*self.features.keys(), "_truth"]

        # copy a slice of the dataframe (.copy() avoids chained-assignment warnings)
        data_slice: pd.DataFrame = self.data.loc[:, column_list].copy()
        data_slice["_pred"] = self.data[score.name]
        data_slice["_binary_pred"] = self._binarize(score_data=data_slice["_pred"], threshold=threshold)  # type: ignore
        data_slice = self._apply_inputs(data=data_slice)

        # create an 'Overall' feature which will be used to calculate metrics on the full data
        data_slice["overall"] = "Overall"
        eval_features: dict[str, AuditorFeature] = {
            "overall": AuditorFeature(
                name="overall",
                label="Overall",
            )
        }
        eval_features.update(**self.features)

        score_eval: ScoreEvaluation = ScoreEvaluation(
            name=score.name,
            label=score.label if score.label is not None else score.name,
        )
        with tqdm(
            eval_features.values(), position=0, leave=True, desc="Features"
        ) as pbar:
            for feature in pbar:
                pbar.set_postfix({"name": feature.name})

                # e.g. {"f1": {'levelA': 0.2, 'levelB': 0.4}, ... }
                feature_eval: FeatureEvaluation = self._evaluate_feature(
                    data=data_slice, feature=feature
                )
                score_eval.features[feature.name] = feature_eval

        return score_eval

    def _evaluate_feature(
        self, data: pd.DataFrame, feature: AuditorFeature
    ) -> FeatureEvaluation:
        """
        Evaluates all metrics (and optionally confidence intervals) for one feature.
        """
        with tqdm(range(2), position=1, desc="Stages", leave=False) as feature_pbar:
            feature_pbar.set_postfix({"stage": "Evaluating metrics"})

            # cast feature levels to string so grouping keys are uniform
            data[feature.name] = data[feature.name].astype(str)

            # then group the df by this feature (so each group contains one
            # unique level of the data) and get all metrics for each
            feature_groups = data.groupby(feature.name)

            # e.g. {"f1": {'levelA': 0.2, 'levelB': 0.4}, ... }
            feature_eval: FeatureEvaluation = FeatureEvaluation(
                name=feature.name,
                label=feature.label if feature.label is not None else feature.name,
            )
            for metric in tqdm(self.metrics, position=2, desc="Metrics", leave=False):
                # gets a dict with the current metric calculated for levels of the feature
                # e.g. {levelA: 0.5, levelB: 0.5}
                level_eval_dict = feature_groups.apply(metric.data_call).to_dict()

                feature_eval.update(
                    metric_name=metric.name,
                    metric_label=metric.label,
                    data=level_eval_dict,
                )

            feature_pbar.update(1)
            feature_pbar.set_postfix({"stage": "Evaluating intervals"})
            # if calculating confidence intervals, do that here
            if self.n_bootstraps is not None:
                for level_name, level_data in tqdm(
                    feature_groups, position=2, desc="Bootstrap Levels", leave=False
                ):
                    # calculate confidence intervals for eligible metrics for the current feature level
                    level_metric_intervals: dict[str, tuple[float, float]] = (
                        self._evaluate_confidence_interval(data=level_data)
                    )
                    # register the calculated intervals
                    feature_eval.update_intervals(
                        level_name=str(level_name),
                        metric_intervals=level_metric_intervals,
                    )
            feature_pbar.update(1)

        return feature_eval

    def _evaluate_confidence_interval(
        self, data: pd.DataFrame
    ) -> dict[str, tuple[float, float]]:
        """
        Bootstraps 95% confidence intervals for every CI-eligible metric on `data`.
        """
        n: int = len(data)

        bootstrap_results: dict[str, NDArray[np.float64]] = dict()
        for metric in self.metrics:
            if metric.ci_eligible:
                bootstrap_results[metric.name] = np.empty(
                    shape=(self.n_bootstraps,), dtype=np.float64
                )

        # sample n_bootstrap times with replacement
        for i in range(self.n_bootstraps):
            boot_data: pd.DataFrame = data.sample(n, replace=True)

            # calculate metrics on current bootstrap data
            for metric in self.metrics:
                if metric.ci_eligible:
                    bootstrap_results[metric.name][i] = metric.data_call(boot_data)

        metric_intervals: dict[str, tuple[float, float]] = dict()
        for metric_name, bootstrap_array in bootstrap_results.items():
            # empirical 95% bounds for the metric
            lower, upper = np.percentile(bootstrap_array, [2.5, 97.5])
            metric_intervals[metric_name] = (float(lower), float(upper))

        return metric_intervals
@@ -0,0 +1,58 @@
1
+ import pandas as pd
2
+ from typing import Protocol, Union, runtime_checkable
3
+
4
+
5
@runtime_checkable
class AuditorMetricInput(Protocol):
    """
    Protocol for derived per-row metric inputs (e.g. TP/FP indicator columns).

    Implementations declare a column ``name``, a display ``label`` and the raw
    ``inputs`` columns they read, and provide a per-row computation.
    """

    name: str
    label: str
    inputs: list[str]

    def row_call(self, row: pd.Series) -> Union[int, float]:
        """Compute this input's value for a single dataframe row."""
        raise NotImplementedError

    def data_transform(self, data: pd.DataFrame) -> pd.DataFrame:
        """Add a column named after this input by applying row_call row-wise."""
        transformed = data
        transformed[self.name] = transformed.apply(self.row_call, axis=1)
        return transformed
23
+
24
+
25
class TruePositives(AuditorMetricInput):
    """Indicator input: 1 where truth and binarized prediction are both positive."""

    name: str = "tp"
    label: str = "TP"
    inputs: list[str] = ["_truth", "_binary_pred"]

    def row_call(self, row: pd.Series) -> int:
        truth_positive = row["_truth"] == 1.0
        pred_positive = row["_binary_pred"] == 1.0
        return int(truth_positive and pred_positive)
32
+
33
+
34
class FalsePositives(AuditorMetricInput):
    """Indicator input: 1 where the prediction is positive but truth is negative."""

    name: str = "fp"
    label: str = "FP"
    inputs: list[str] = ["_truth", "_binary_pred"]

    def row_call(self, row: pd.Series) -> int:
        truth_negative = row["_truth"] == 0.0
        pred_positive = row["_binary_pred"] == 1.0
        return int(truth_negative and pred_positive)
41
+
42
+
43
class TrueNegatives(AuditorMetricInput):
    """Indicator input: 1 where truth and binarized prediction are both negative."""

    name: str = "tn"
    label: str = "TN"
    inputs: list[str] = ["_truth", "_binary_pred"]

    def row_call(self, row: pd.Series) -> int:
        truth_negative = row["_truth"] == 0.0
        pred_negative = row["_binary_pred"] == 0.0
        return int(truth_negative and pred_negative)
50
+
51
+
52
class FalseNegatives(AuditorMetricInput):
    """Indicator input: 1 where the prediction is negative but truth is positive."""

    name: str = "fn"
    label: str = "FN"
    inputs: list[str] = ["_truth", "_binary_pred"]

    def row_call(self, row: pd.Series) -> int:
        truth_positive = row["_truth"] == 1.0
        pred_negative = row["_binary_pred"] == 0.0
        return int(truth_positive and pred_negative)
@@ -0,0 +1,250 @@
1
+ from typing import Protocol, Union, Optional
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.metrics import average_precision_score, roc_auc_score
5
+
6
+
7
class AuditorMetric(Protocol):
    """
    Protocol for dataframe-level evaluation metrics.

    Implementations declare a ``name``, a display ``label``, the derived input
    columns they consume (``inputs``) and whether bootstrapped confidence
    intervals apply (``ci_eligible``).
    """

    name: str
    label: str
    inputs: list[str]
    ci_eligible: bool

    def data_call(self, data: pd.DataFrame) -> Union[float, int]:
        """Compute the metric over an entire (sub)dataframe."""
        raise NotImplementedError
18
+
19
+
20
class Sensitivity(AuditorMetric):
    """Sensitivity (true positive rate): TP / (TP + FN)."""

    name: str = "sensitivity"
    label: str = "Sensitivity"
    inputs: list[str] = ["tp", "fn"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame, eps: float = 1e-8) -> float:
        tp_total = data["tp"].sum()
        fn_total = data["fn"].sum()
        # eps guards the denominator when the group contains no positives
        return tp_total / (tp_total + fn_total + eps)
30
+
31
+
32
class Specificity(AuditorMetric):
    """Specificity (true negative rate): TN / (TN + FP)."""

    name: str = "specificity"
    label: str = "Specificity"
    inputs: list[str] = ["tn", "fp"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame, eps: float = 1e-8) -> float:
        tn_total = data["tn"].sum()
        fp_total = data["fp"].sum()
        # eps guards the denominator when the group contains no negatives
        return tn_total / (tn_total + fp_total + eps)
42
+
43
+
44
class Precision(AuditorMetric):
    """Precision (positive predictive value): TP / (TP + FP)."""

    name: str = "precision"
    label: str = "Precision"
    inputs: list[str] = ["tp", "fp"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame, eps: float = 1e-8) -> float:
        tp_total = data["tp"].sum()
        fp_total = data["fp"].sum()
        # eps guards the denominator when no positive predictions were made
        return tp_total / (tp_total + fp_total + eps)
54
+
55
+
56
class Recall(AuditorMetric):
    """Recall: TP / (TP + FN). Identical computation to Sensitivity."""

    name: str = "recall"
    label: str = "Recall"
    inputs: list[str] = ["tp", "fn"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame, eps: float = 1e-8) -> float:
        tp_total = data["tp"].sum()
        fn_total = data["fn"].sum()
        # eps guards the denominator when the group contains no positives
        return tp_total / (tp_total + fn_total + eps)
66
+
67
+
68
class F1Score(AuditorMetric):
    """F1 score: harmonic mean of precision and recall."""

    name: str = "f1"
    label: str = "F1 Score"
    inputs: list[str] = ["tp", "fp", "fn"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame, eps: float = 1e-8) -> float:
        # recomputed locally so the result never depends on metric ordering
        p = Precision().data_call(data)
        r = Recall().data_call(data)
        return 2 * (p * r) / (p + r + eps)
79
+
80
+
81
class AUROC(AuditorMetric):
    """Area under the ROC curve over the continuous predictions."""

    name: str = "auroc"
    label: str = "AUROC"
    inputs: list[str] = ["_truth", "_pred"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame) -> float:
        truth, pred = data["_truth"], data["_pred"]
        try:
            auc = roc_auc_score(truth, pred)
        except ValueError:
            # sklearn raises when only a single class is present; report 0.0
            return 0.0
        return float(auc)
92
+
93
+
94
class AUPRC(AuditorMetric):
    """Area under the precision-recall curve (average precision)."""

    name: str = "auprc"
    label: str = "AUPRC"
    inputs: list[str] = ["_truth", "_pred"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame) -> float:
        truth, pred = data["_truth"], data["_pred"]
        try:
            ap = average_precision_score(truth, pred)
        except ValueError:
            # sklearn raises when only a single class is present; report 0.0
            return 0.0
        return float(ap)
105
+
106
+
107
class MatthewsCorrelationCoefficient(AuditorMetric):
    """Matthews correlation coefficient over summed confusion-matrix cells."""

    name: str = "mcc"
    label: str = "Matthews Correlation Coefficient"
    inputs: list[str] = ["tp", "tn", "fp", "fn"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame, eps: float = 1e-8) -> float:
        tp, tn = data["tp"].sum(), data["tn"].sum()
        fp, fn = data["fp"].sum(), data["fn"].sum()

        numerator = (tp * tn) - (fp * fn)
        denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

        # degenerate confusion matrix: MCC is defined as 0 by convention
        if denominator == 0:
            return 0.0
        return numerator / (denominator + eps)
125
+
126
+
127
class FBetaScore(AuditorMetric):
    """
    F-beta score: weighted harmonic mean of precision and recall.
    beta < 1 weights precision more heavily; beta > 1 weights recall.
    """

    name: str = "fbeta"
    label: str = "F-beta Score"
    # FIX: declare the raw confusion-matrix inputs that Precision/Recall
    # actually consume. The previous value ["precision", "recall"] listed
    # metric *names*, which are not registered metric inputs and caused a
    # KeyError inside Auditor._collect_inputs().
    inputs: list[str] = ["tp", "fp", "fn"]
    ci_eligible: bool = True

    def __init__(self, beta: float = 1.0):
        self.beta = beta
        self.name = f"f{beta:.1f}".replace(".", "_")  # e.g., "f0_5" or "f2_0"
        self.label = f"F{beta:.1f} Score"

    def data_call(self, data: pd.DataFrame) -> float:
        """Compute F-beta from precision and recall recomputed on `data`."""
        precision = Precision().data_call(data)
        recall = Recall().data_call(data)
        beta_sq = self.beta**2

        # both components zero => score is 0 (avoids 0/0)
        if precision + recall == 0:
            return 0.0

        return (1 + beta_sq) * (precision * recall) / ((beta_sq * precision) + recall)
147
+
148
+
149
class TPR(Sensitivity):
    """True positive rate — Sensitivity published under the TPR name/label."""

    name: str = "tpr"
    label: str = "TPR"
152
+
153
+
154
class TNR(Specificity):
    """True negative rate — Specificity published under the TNR name/label."""

    name: str = "tnr"
    label: str = "TNR"
157
+
158
+
159
class FPR(AuditorMetric):
    """False positive rate: FP / (FP + TN)."""

    name: str = "fpr"
    label: str = "FPR"
    inputs: list[str] = ["fp", "tn"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame, eps: float = 1e-8) -> float:
        fp_total = data["fp"].sum()
        tn_total = data["tn"].sum()
        # eps guards the denominator when the group contains no negatives
        return fp_total / (fp_total + tn_total + eps)
169
+
170
+
171
class FNR(AuditorMetric):
    """False negative rate: FN / (FN + TP)."""

    name: str = "fnr"
    label: str = "FNR"
    inputs: list[str] = ["fn", "tp"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame, eps: float = 1e-8) -> float:
        fn_total = data["fn"].sum()
        tp_total = data["tp"].sum()
        # eps guards the denominator when the group contains no positives
        return fn_total / (fn_total + tp_total + eps)
181
+
182
+
183
class nData(AuditorMetric):
    """Row count of the evaluated group (no confidence interval)."""

    name: str = "n"
    label: str = "N"
    inputs: list[str] = []
    ci_eligible: bool = False

    def data_call(self, data: pd.DataFrame) -> int:
        return len(data)
191
+
192
+
193
class nTP(AuditorMetric):
    """Raw count of true positives in the group."""

    name: str = "n_tp"
    label: str = "TP"
    inputs: list[str] = ["tp"]
    ci_eligible: bool = False

    def data_call(self, data: pd.DataFrame) -> int:
        return data["tp"].sum()
201
+
202
+
203
class nTN(AuditorMetric):
    """Raw count of true negatives in the group."""

    name: str = "n_tn"
    label: str = "TN"
    inputs: list[str] = ["tn"]
    ci_eligible: bool = False

    def data_call(self, data: pd.DataFrame) -> int:
        return data["tn"].sum()
211
+
212
+
213
class nFP(AuditorMetric):
    """Raw count of false positives in the group."""

    name: str = "n_fp"
    label: str = "FP"
    inputs: list[str] = ["fp"]
    ci_eligible: bool = False

    def data_call(self, data: pd.DataFrame) -> int:
        return data["fp"].sum()
221
+
222
+
223
class nFN(AuditorMetric):
    """Raw count of false negatives in the group."""

    name: str = "n_fn"
    label: str = "FN"
    inputs: list[str] = ["fn"]
    ci_eligible: bool = False

    def data_call(self, data: pd.DataFrame) -> int:
        return data["fn"].sum()
231
+
232
+
233
class nPositive(AuditorMetric):
    """Count of ground-truth positive rows in the group."""

    name: str = "n_pos"
    label: str = "Pos."
    inputs: list[str] = ["_truth"]
    ci_eligible: bool = False

    def data_call(self, data: pd.DataFrame) -> int:
        return data["_truth"].eq(1).astype(int).sum()
241
+
242
+
243
class nNegative(AuditorMetric):
    """Count of ground-truth negative rows in the group."""

    name: str = "n_neg"
    label: str = "Neg."
    inputs: list[str] = ["_truth"]
    ci_eligible: bool = False

    def data_call(self, data: pd.DataFrame) -> int:
        return data["_truth"].eq(0).astype(int).sum()
@@ -0,0 +1 @@
1
+ from model_auditor.plotting.plotters import HierarchyPlotter
@@ -0,0 +1,228 @@
1
+ from typing import Optional, Union, Callable
2
+ import pandas as pd
3
+
4
+ from model_auditor.schemas import AuditorScore, AuditorOutcome
5
+ from model_auditor.plotting.schemas import Hierarchy, HLevel, HItem, PlotterData
6
+
7
+
8
class HierarchyPlotter:
    """
    Builds nested (sunburst/treemap-style) trace data: a hierarchy of feature
    levels with per-level counts and an aggregated score value used for color.
    """

    def __init__(self) -> None:
        self.features: Optional[Hierarchy] = None  # type: ignore
        self.data: Optional[pd.DataFrame] = None
        self.aggregator: Union[str, Callable] = "median"

        self.score: Optional[AuditorScore] = None
        self.outcome: Optional[AuditorOutcome] = None

    def set_data(self, data: pd.DataFrame) -> None:
        """Set data for the plotter

        Args:
            data (pd.DataFrame): Data used to build the plot
        """
        self.data = data

    def set_features(self, features: Union[Hierarchy, list[str]]) -> None:
        """Set the feature hierarchy for the plotter

        Args:
            features (Union[Hierarchy, list[str]]): Expects a list of strings
                (column names corresponding to the data provided) or a predefined
                custom Hierarchy object

        Raises:
            ValueError: Raised if something other than a list or Hierarchy
                object was passed
        """
        # flat hierarchy: one single-item level per column name
        if isinstance(features, list):
            self.features = Hierarchy()
            for feature in features:
                self.features.levels.append(HLevel([HItem(name=feature)]))
        # complex/custom hierarchy
        elif isinstance(features, Hierarchy):
            self.features = features
        else:
            raise ValueError(
                "unrecognized type for features, please pass a list of strings or a predefined Hierarchy() object"
            )

    def set_aggregator(self, method: Union[str, Callable]) -> None:
        """Sets the aggregator used to color the plot cells

        Args:
            method (Union[str, Callable]): Expects a string corresponding to a
                predefined aggregator for the .agg() pandas method, or a function
                that takes the score column as a series and outputs some float
        """
        self.aggregator = method

    def set_score(
        self, name: str, label: Optional[str] = None, threshold: Optional[float] = None
    ) -> None:
        """Sets the score column used by the plotter

        Args:
            name (str): Name of the score column
            label (Optional[str], optional): Label of the score column. Defaults to None
                (plot will just use the column name).
            threshold (Optional[float], optional): Threshold to binarize the score column.
                Defaults to None (currently unused).
        """
        self.score = AuditorScore(
            name=name, label=label if label is not None else name, threshold=threshold
        )

    def compile(self, container: str) -> PlotterData:
        """Compiles the data for the plotter based on the defined HierarchyPlotter parameters

        Args:
            container (str): Name of the plot container trace

        Raises:
            ValueError: If features have not been set with .set_features() first
            ValueError: If a score has not been set with .set_score() first

        Returns:
            PlotterData: Returns the formatted plotter data TODO: wrap this internally
        """
        if self.features is None:
            raise ValueError("Please set features with .set_features() first")

        if self.score is None:
            raise ValueError("Please set a score variable with .set_score() first!")

        datasource = self._prepare_datasource()
        data = PlotterData()

        # root cell: aggregate over the full datasource
        if isinstance(self.aggregator, str):
            container_agg: float = (
                datasource[self.score.name].agg(self.aggregator).item()
            )
        else:
            container_agg: float = self.aggregator(datasource)

        data.add(
            label=container,
            id=container,
            parent="",
            value=len(datasource),
            color=container_agg,
        )

        return self._recursive_record(
            data=data, datasource=datasource, parent_id=container, idx=0
        )

    def _recursive_record(
        self, data: PlotterData, datasource: pd.DataFrame, parent_id: str, idx: int
    ):
        """Recursive internal function used to compile data for the plotter"""
        level: HLevel = self.features.levels[idx]
        # init a list to track valid features for this level
        level_features: list[HItem] = []
        for item in level.items:
            # if the item has no query then include it
            if item.query is None:
                level_features.append(item)

            # otherwise, include it if the feature query evaluates to true for the *entire* datasource
            elif all(datasource.eval(item.query).tolist()):  # type: ignore
                level_features.append(item)

        # if this level has only 1 valid item, consider it the feature
        if len(level_features) == 1:
            feature = level_features[0]

        # otherwise, if this level has >1 valid item, concatenate them into a temp derived feature
        elif len(level_features) > 1:
            datasource.loc[:, "_temp_feature"] = (
                datasource[[i.name for i in level_features]]
                .apply(lambda row: " & ".join(row.values.astype(str)), axis=1)
            )

            feature = HItem(name="_temp_feature")

        # if this level has 0 valid items, return
        else:
            return data

        # group the df by the current feature and get its frequency and agg metric
        assert isinstance(
            self.score, AuditorScore
        )  # handled by the wrapper but here for type hinting

        count_dict: dict[str, int] = (
            datasource.groupby(feature.name, as_index=True, observed=False)[self.score.name]
            .agg("count")
            .to_dict()
        )

        if isinstance(self.aggregator, str):
            # built-in aggregators
            agg_dict: dict[str, float] = (
                datasource.groupby(feature.name, as_index=True, observed=False)[self.score.name]
                .agg(self.aggregator)
                .to_dict()
            )
        else:
            # custom aggregators (pass entire df here instead of just the score series)
            agg_dict: dict = (
                datasource.groupby(feature.name, as_index=True, observed=False)
                .apply(self.aggregator)
                .to_dict()
            )

        # extract the count dict keys to get the levels for the current feature
        feature_levels: list[str] = list(count_dict.keys())
        # format the parent_id + feature levels into trace identifiers
        id_dict: dict[str, str] = {
            feature_level: f"{parent_id}${feature_level}"
            for feature_level in count_dict.keys()
        }

        for feature_level in feature_levels:
            # add the current feature data
            data.add(
                label=feature_level,
                id=id_dict[feature_level],
                parent=parent_id,
                value=count_dict[feature_level],
                color=agg_dict[feature_level],
            )

            # if this isn't the last feature in the stack, get the subset of data for this feature and recurse
            if idx < (len(self.features.levels) - 1):
                data = self._recursive_record(
                    data=data,
                    datasource=datasource.loc[datasource[feature.name] == feature_level, :].copy(),
                    parent_id=id_dict[feature_level],
                    idx=idx + 1,
                )

        return data

    def _prepare_datasource(self) -> pd.DataFrame:
        """Internal function used to prepare the datasource for plotting

        Raises:
            ValueError: If data has not been added with .set_data() first

        Returns:
            pd.DataFrame: Prepared data source
        """
        if self.data is None:
            raise ValueError("Please set data with .set_data() first")

        data = self.data.copy()
        if self.score is not None:
            # FIX: copy the score column's *values*; previously this assigned
            # the column name string as a constant, making '_pred' useless
            data["_pred"] = data[self.score.name]
        else:
            print("no score set")

        if self.outcome is not None:
            # FIX: same as above for the outcome column
            data["_outcome"] = data[self.outcome.name]
        else:
            print("no outcome set")

        return data
@@ -0,0 +1,40 @@
1
+ from typing import Optional
2
+ from dataclasses import dataclass, field
3
+
4
+
5
@dataclass
class PlotterData:
    """Columnar accumulator for hierarchy plot traces (parallel lists)."""

    labels: list = field(default_factory=list)
    ids: list = field(default_factory=list)
    parents: list = field(default_factory=list)
    values: list = field(default_factory=list)
    colors: list = field(default_factory=list)

    def add(self, label: str, id: str, parent: str, value: int, color: float) -> None:
        """Append one trace entry across every parallel column."""
        for column, entry in (
            (self.labels, label),
            (self.ids, id),
            (self.parents, parent),
            (self.values, value),
            (self.colors, color),
        ):
            column.append(entry)
19
+
20
+
21
@dataclass
class HItem:
    """Hierarchy item: a column name with an optional pandas-eval inclusion query."""

    name: str
    query: Optional[str] = None
27
+
28
+
29
@dataclass
class HLevel:
    """Hierarchy level: the set of candidate items at one depth of the tree."""

    items: list[HItem] = field(default_factory=list)
34
+
35
+
36
@dataclass
class Hierarchy:
    """Hierarchy container: an ordered list of levels, outermost first."""

    levels: list[HLevel] = field(default_factory=list)
@@ -0,0 +1,160 @@
1
from typing import Any, Optional, Union
from dataclasses import dataclass, field

import pandas as pd
5
+
6
+
7
@dataclass
class LevelMetric:
    """
    Object to store the evaluation results for one metric of one level of a feature.
    (for example, AUC for one category of finding)

    Args:
        name (str): Name of the current feature level metric
        label (str): Display label for the current feature level metric
        score (Union[float, int]): Score for the current feature level metric
        interval (tuple[float, float], optional): Optional lower and upper confidence
            bounds for the current feature level metric (defaults to None)
    """

    name: str
    label: str
    score: Union[float, int]
    interval: Optional[tuple[float, float]] = None
24
+
25
+
26
@dataclass
class LevelEvaluation:
    """
    Evaluation results for a single level of a feature (for example,
    every metric computed for one category of finding).

    Args:
        name (str): Name of the current feature level
        metrics (dict[str, LevelMetric]): Metrics for the current feature level
            (defaults to an empty dict)
    """

    name: str
    metrics: dict[str, LevelMetric] = field(default_factory=dict)

    def update(self, metric_name: str, metric_label: str, metric_score: float) -> None:
        """Record (or overwrite) one metric result for this level."""
        entry = LevelMetric(
            name=metric_name, label=metric_label, score=metric_score
        )
        self.metrics[metric_name] = entry

    def update_intervals(self, metric_intervals: dict[str, tuple[float, float]]):
        """Attach confidence intervals to already-recorded metrics, keyed by name."""
        for metric_name, bounds in metric_intervals.items():
            self.metrics[metric_name].interval = bounds

    def to_dataframe(self, n_decimals: int = 3, add_index: bool = False, metric_labels: bool = False):
        """Render this level's metrics as a one-row DataFrame of formatted strings.

        Float scores are rounded to ``n_decimals``; scores with a confidence
        interval append "(lo, hi)"; integer scores are comma-grouped.
        (``add_index`` is currently unused here — kept for signature parity
        with the other ``to_dataframe`` methods.)
        """
        formatted: dict[str, str] = {}
        for metric in self.metrics.values():
            # column key: human-readable label when requested, else the name
            key = metric.label if metric_labels else metric.name

            if metric.interval is not None:
                lo, hi = metric.interval
                formatted[key] = (
                    f"{metric.score:.{n_decimals}f} "
                    f"({lo:.{n_decimals}f}, {hi:.{n_decimals}f})"
                )
            elif isinstance(metric.score, float):
                formatted[key] = f"{metric.score:.{n_decimals}f}"
            else:
                # integer scores (default to comma delimited for now)
                formatted[key] = f"{metric.score:,}"

        return pd.DataFrame(formatted, index=[self.name])
67
+
68
+
69
@dataclass
class FeatureEvaluation:
    """
    Evaluation results for one feature type (for example, the metrics
    associated with each category of finding).

    Args:
        name (str): Name of the current feature
        label (str): Label for the current feature
        levels (dict[str, LevelEvaluation]): Levels of the current feature
            (defaults to an empty dict)
    """

    name: str
    label: str
    levels: dict[str, LevelEvaluation] = field(default_factory=dict)

    def update(
        self, metric_name: str, metric_label: str, data: dict[str, float]
    ) -> None:
        """Fan one metric's per-level scores out to the child levels.

        ``data`` maps level name to score for a single metric type, e.g.
        ``{'levelA': 0.5, 'levelB': 0.5}``; missing levels are created.
        """
        for level_name, score in data.items():
            # get-or-create the evaluation object for this level
            level_eval = self.levels.setdefault(
                level_name, LevelEvaluation(name=level_name)
            )
            level_eval.update(
                metric_name=metric_name,
                metric_label=metric_label,
                metric_score=score,
            )

    def update_intervals(
        self, level_name: str, metric_intervals: dict[str, tuple[float, float]]
    ):
        """Delegate confidence-interval attachment to the named level."""
        self.levels[level_name].update_intervals(metric_intervals=metric_intervals)

    def to_dataframe(
        self, n_decimals: int = 3, add_index: bool = False, metric_labels: bool = False
    ) -> pd.DataFrame:
        """Stack every level's one-row frame into a single DataFrame.

        When ``add_index`` is True the result is nested under this feature's
        label as an outer index level.
        """
        frames = [
            level.to_dataframe(n_decimals=n_decimals, metric_labels=metric_labels)
            for level in self.levels.values()
        ]
        stacked = pd.concat(frames, axis=0)
        if add_index:
            return pd.concat({self.label: stacked})
        return stacked
120
+
121
+
122
@dataclass
class ScoreEvaluation:
    """Per-feature evaluation results for one model score.

    Args:
        name (str): Name of the score
        label (str): Display label for the score
        features (dict[str, FeatureEvaluation]): Evaluated features
            (defaults to an empty dict)
    """

    name: str
    label: str
    features: dict[str, FeatureEvaluation] = field(default_factory=dict)

    def to_dataframe(
        self, n_decimals: int = 3, add_index: bool = False, metric_labels: bool = False
    ) -> pd.DataFrame:
        """Concatenate every feature's (feature-indexed) frame into one table.

        When ``add_index`` is True the result is additionally nested under
        this score's label as the outermost index level.
        """
        frames = [
            feature.to_dataframe(
                n_decimals=n_decimals, add_index=True, metric_labels=metric_labels
            )
            for feature in self.features.values()
        ]
        stacked = pd.concat(frames, axis=0)
        if add_index:
            return pd.concat({self.label: stacked})
        return stacked
141
+
142
+
143
@dataclass
class AuditorFeature:
    """A feature to stratify the audit by.

    Args:
        name (str): Column name of the feature in the data
        label (str, optional): Display label for the feature (defaults to None)
        levels (list, optional): Feature levels to evaluate — presumably an
            explicit subset/ordering; TODO confirm against core.Auditor
            (defaults to None)
    """

    name: str
    label: Optional[str] = None
    # `Any` is typing.Any; the original annotation used the builtin
    # function `any`, which is not a type
    levels: Optional[list[Any]] = None
148
+
149
+
150
@dataclass
class AuditorScore:
    """A model score (prediction) column used in the audit."""

    # column name of the score in the data
    name: str
    # optional display label for reports/plots
    label: Optional[str] = None
    # optional operating threshold — presumably used to binarise the score
    # for thresholded metrics; TODO confirm against core.Auditor
    threshold: Optional[float] = None
155
+
156
+
157
@dataclass
class AuditorOutcome:
    """The ground-truth outcome column used in the audit.

    Args:
        name (str): Column name of the outcome in the data
        mapping (dict, optional): Presumably maps raw outcome values to
            integer classes — TODO confirm against core.Auditor
            (defaults to None)
    """

    name: str
    # `Any` is typing.Any; the original annotation used the builtin
    # function `any`, which is not a type
    mapping: Optional[dict[Any, int]] = None
@@ -0,0 +1,27 @@
1
+ import importlib
2
+ import inspect
3
+ from typing import Type
4
+ from model_auditor.metric_inputs import AuditorMetricInput
5
+
6
+
7
def is_metric_input_valid(cls: type) -> bool:
    """Return True when *cls* satisfies the metric-input duck-type contract.

    A valid metric-input class exposes `name`, `label` and `inputs`
    attributes plus callable `row_call` and `data_transform` members.
    """
    if not inspect.isclass(cls):
        return False
    has_attrs = all(hasattr(cls, attr) for attr in ("name", "label", "inputs"))
    has_callables = all(
        callable(getattr(cls, member, None))
        for member in ("row_call", "data_transform")
    )
    return has_attrs and has_callables
16
+
17
+
18
def collect_metric_inputs() -> dict[str, Type[AuditorMetricInput]]:
    """Discover the metric-input classes exposed by `model_auditor.metric_inputs`.

    Scans the module for classes passing `is_metric_input_valid`, skipping
    the `AuditorMetricInput` base itself, and keys them by their `name`
    attribute.
    """
    module = importlib.import_module("model_auditor.metric_inputs")

    discovered: dict[str, Type[AuditorMetricInput]] = {}
    for _, candidate in inspect.getmembers(module, inspect.isclass):
        if candidate is AuditorMetricInput:
            continue
        if is_metric_input_valid(candidate):
            discovered[candidate.name] = candidate

    return discovered
@@ -0,0 +1,6 @@
1
+ Metadata-Version: 2.4
2
+ Name: model-auditor
3
+ Version: 0.1.0
4
+ Requires-Dist: pandas>=2.2
5
+ Requires-Dist: numpy>=2.1
6
+ Requires-Dist: scikit-learn>=1.5
@@ -0,0 +1,18 @@
1
+ .gitignore
2
+ README.md
3
+ pyproject.toml
4
+ .github/workflows/publish.yml
5
+ model_auditor/__init__.py
6
+ model_auditor/core.py
7
+ model_auditor/metric_inputs.py
8
+ model_auditor/metrics.py
9
+ model_auditor/schemas.py
10
+ model_auditor/utils.py
11
+ model_auditor.egg-info/PKG-INFO
12
+ model_auditor.egg-info/SOURCES.txt
13
+ model_auditor.egg-info/dependency_links.txt
14
+ model_auditor.egg-info/requires.txt
15
+ model_auditor.egg-info/top_level.txt
16
+ model_auditor/plotting/__init__.py
17
+ model_auditor/plotting/plotters.py
18
+ model_auditor/plotting/schemas.py
@@ -0,0 +1,3 @@
1
+ pandas>=2.2
2
+ numpy>=2.1
3
+ scikit-learn>=1.5
@@ -0,0 +1 @@
1
+ model_auditor
@@ -0,0 +1,16 @@
1
+ # allows the package to be installed in editable mode
2
+ [build-system]
3
+ requires = ["setuptools>=64", "setuptools_scm>=8", "wheel"]
4
+ build-backend = "setuptools.build_meta"
5
+
6
+ [project]
7
+ name = "model-auditor"
8
+ dynamic = ["version"]
9
+
10
+ dependencies = [
11
+ "pandas >= 2.2",
12
+ "numpy >= 2.1",
13
+ "scikit-learn >= 1.5",
14
+ ]
15
+
16
+ [tool.setuptools_scm]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+