model-auditor 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_auditor-0.1.0/.github/workflows/publish.yml +32 -0
- model_auditor-0.1.0/.gitignore +3 -0
- model_auditor-0.1.0/PKG-INFO +6 -0
- model_auditor-0.1.0/README.md +3 -0
- model_auditor-0.1.0/model_auditor/__init__.py +1 -0
- model_auditor-0.1.0/model_auditor/core.py +336 -0
- model_auditor-0.1.0/model_auditor/metric_inputs.py +58 -0
- model_auditor-0.1.0/model_auditor/metrics.py +250 -0
- model_auditor-0.1.0/model_auditor/plotting/__init__.py +1 -0
- model_auditor-0.1.0/model_auditor/plotting/plotters.py +228 -0
- model_auditor-0.1.0/model_auditor/plotting/schemas.py +40 -0
- model_auditor-0.1.0/model_auditor/schemas.py +160 -0
- model_auditor-0.1.0/model_auditor/utils.py +27 -0
- model_auditor-0.1.0/model_auditor.egg-info/PKG-INFO +6 -0
- model_auditor-0.1.0/model_auditor.egg-info/SOURCES.txt +18 -0
- model_auditor-0.1.0/model_auditor.egg-info/dependency_links.txt +1 -0
- model_auditor-0.1.0/model_auditor.egg-info/requires.txt +3 -0
- model_auditor-0.1.0/model_auditor.egg-info/top_level.txt +1 -0
- model_auditor-0.1.0/pyproject.toml +16 -0
- model_auditor-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
name: Upload Python Package to PyPI when a Release is Created

on:
  release:
    types: [created]

jobs:
  pypi-publish:
    name: Publish release to PyPI
    runs-on: ubuntu-latest
    environment:
      name: release
      url: https://pypi.org/p/model-auditor
    permissions:
      # Required for PyPI trusted publishing (OIDC token exchange)
      id-token: write
      contents: read
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.x"
      - name: Install dependencies
        # NOTE: dropped the redundant separate `pip install setuptools wheel`
        # and the `--user` flag (unnecessary on hosted runners and can shadow
        # the interpreter's site-packages); `build` resolves its own backend
        # requirements in an isolated environment.
        run: |
          python -m pip install --upgrade pip
          python -m pip install --upgrade build setuptools wheel
      - name: Build package
        run: |
          python -m build
      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from model_auditor.core import Auditor
|
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
from typing import Optional, Type, Union, Callable
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import numpy as np
|
|
4
|
+
from numpy.typing import NDArray
|
|
5
|
+
from sklearn.metrics import roc_curve
|
|
6
|
+
from tqdm.auto import tqdm
|
|
7
|
+
|
|
8
|
+
from model_auditor.metric_inputs import AuditorMetricInput
|
|
9
|
+
from model_auditor.metrics import AuditorMetric
|
|
10
|
+
from model_auditor.schemas import (
|
|
11
|
+
AuditorFeature,
|
|
12
|
+
AuditorScore,
|
|
13
|
+
AuditorOutcome,
|
|
14
|
+
FeatureEvaluation,
|
|
15
|
+
ScoreEvaluation,
|
|
16
|
+
)
|
|
17
|
+
from model_auditor.utils import collect_metric_inputs
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Auditor:
    """
    Audits a model's score columns against a binary outcome, stratified by
    subgroup features.

    Typical workflow: add data with .add_data(), register features/scores with
    .add_feature()/.add_score(), define the outcome with .add_outcome(), choose
    metrics with .set_metrics(), then call .evaluate() for a score column.
    """

    def __init__(
        self,
        data: Optional[pd.DataFrame] = None,
        features: Optional[list[AuditorFeature]] = None,
        scores: Optional[list[AuditorScore]] = None,
        outcome: Optional[AuditorOutcome] = None,
        metrics: Optional[list[AuditorMetric]] = None,
    ) -> None:
        # initialize data (copied so the auditor never mutates the caller's frame)
        self.data: Optional[pd.DataFrame] = None if data is None else data.copy()

        # initialize features
        self.features: dict[str, AuditorFeature] = dict()
        if features is not None:
            for feature in features:
                self.add_feature(**vars(feature))

        # initialize scores
        self.scores: dict[str, AuditorScore] = dict()
        if scores is not None:
            for score in scores:
                self.add_score(**vars(score))

        # initialize outcome (requires data to already be present)
        if outcome is not None:
            self.add_outcome(**vars(outcome))

        # initialize metrics
        # (fixed annotation: these are AuditorMetric objects, not AuditorScore)
        self.metrics: list[AuditorMetric] = list()
        if metrics is not None:
            self.metrics = metrics

        # initialize attrs for later
        self._inputs: list[Type[AuditorMetricInput]] = list()
        self._evaluations: list = list()
        # number of bootstrap resamples used for confidence intervals
        self.n_bootstraps: int = 1000

    def add_data(self, data: pd.DataFrame) -> None:
        """
        Method to add a dataframe to the auditor

        Args:
            data (pd.DataFrame): Full dataframe which will be subset for subgroup evaluation
        """
        self.data = data.copy()

    def add_feature(
        self, name: str, label: Optional[str] = None, levels: Optional[list] = None
    ) -> None:
        """
        Method to add a feature to the auditor. Equivalent to a grouping variable in
        packages like tableone, the score variable will be stratified by this feature

        Args:
            name (str): Column name for the feature.
            label (Optional[str], optional): Optional label for the feature. Defaults to None.
            levels (Optional[list], optional): Valid levels to consider for the feature,
                by default (when set to None) all levels will be considered. Defaults to None.
                [Not currently implemented]
        """
        feature = AuditorFeature(
            name=name,
            label=label,
            levels=levels,
        )
        self.features[feature.name] = feature

    def add_score(
        self, name: str, label: Optional[str] = None, threshold: Optional[float] = None
    ) -> None:
        """
        Method to add a score to the auditor. Expects a continuous feature which will
        be used to calculate metrics and confidence intervals

        Args:
            name (str): Column name for the score.
            label (Optional[str], optional): Optional label for the score. Defaults to None.
            threshold (Optional[float], optional): Threshold used to binarize the score column.
                Defaults to None and can be optimized using the Youden index or updated separately later.
        """
        score = AuditorScore(
            name=name,
            label=label,
            threshold=threshold,
        )
        self.scores[score.name] = score

    def add_outcome(self, name: str, mapping: Optional[dict] = None) -> None:
        """
        Method to define the ground-truth outcome column for evaluation.

        Stores the (optionally mapped) outcome values in the internal
        "_truth" column of the auditor's data.

        Args:
            name (str): Column name holding the outcome.
            mapping (Optional[dict], optional): Optional mapping applied to the
                outcome column (e.g. {"yes": 1, "no": 0}). Defaults to None.

        Raises:
            ValueError: If no data has been added with .add_data() first.
        """
        if self.data is None:
            raise ValueError("Please add data with .add_data() first")

        if mapping is not None:
            self.data["_truth"] = self.data[name].map(mapping)
        else:
            self.data["_truth"] = self.data[name]

    def optimize_score_threshold(self, score_name: str) -> float:
        """
        Method to optimize the decision threshold for a score based on the Youden index.

        Args:
            score_name (str): Name of the target score

        Raises:
            ValueError: If no scores have been defined with .add_score() first
            ValueError: If no data has been added with .add_data() first
            ValueError: If no outcome variable has been defined with .add_outcome() first

        Returns:
            float: Optimal threshold identified
        """
        if len(self.scores) == 0:
            raise ValueError("Please define at least one score first")
        if self.data is None:
            raise ValueError("Please add data with .add_data() first")
        elif "_truth" not in self.data.columns.tolist():
            raise ValueError(
                "Please define an outcome variable data with .add_outcome() first"
            )

        # throws an error if the score has not been defined
        score: AuditorScore = self.scores[score_name]
        score_list: list[float] = self.data[score.name].astype(float).tolist()

        # otherwise the target score will be the single item in the list
        truth_list: list[float] = self.data["_truth"].astype(float).tolist()

        # calculate optimal threshold: the ROC point maximizing (tpr - fpr)
        fpr, tpr, thresholds = roc_curve(truth_list, score_list)
        idx: int = int(np.argmax(tpr - fpr))
        optimal_threshold: float = thresholds[idx]

        print(f"Optimal threshold for '{score.name}' found at: {optimal_threshold}")
        return optimal_threshold

    def set_metrics(self, metrics: list[AuditorMetric]) -> None:
        """
        Method to define the metrics the auditor will use during evaluation of score variables.

        Args:
            metrics (list[AuditorMetric]): A list of metrics classes following the AuditorMetric
                protocol (pre-made metrics listed in model_auditor.metrics)
        """
        self.metrics: list[AuditorMetric] = metrics

    def _collect_inputs(self) -> None:
        """
        Collects the minimum set of metric inputs necessary for evaluation
        (based on the metrics defined in self.metrics with the .set_metrics() method)
        """
        inputs_set: set[str] = set()
        for metric in self.metrics:
            inputs_set.update(metric.inputs)

        inputs_dict: dict[str, Type[AuditorMetricInput]] = collect_metric_inputs()

        # reinit self._inputs and add all necessary inputs to it
        # ("_truth" and "_pred" are base columns, not derived inputs)
        self._inputs: list[Type[AuditorMetricInput]] = list()
        for input_name in list(inputs_set):
            if input_name not in ["_truth", "_pred"]:
                self._inputs.append(inputs_dict[input_name])

    def _apply_inputs(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Method to apply the metric input functions (collected with ._collect_inputs())
        to the target data to prepare it for metric calculation

        Args:
            data (pd.DataFrame): Dataframe to add input columns to

        Returns:
            pd.DataFrame: Transformed dataframe with metric input columns
        """
        for input_type in self._inputs:
            metric_input = input_type()
            data: pd.DataFrame = metric_input.data_transform(data)

        return data

    def _binarize(self, score_data: pd.Series, threshold: float) -> pd.Series:
        # scores at or above the threshold are treated as positive predictions
        return (score_data >= threshold).astype(int)

    def evaluate(self, score_name: str, threshold: Optional[float] = None):
        """
        Evaluates the given score over all registered features (plus an
        implicit 'Overall' feature spanning the full dataset).

        Args:
            score_name (str): Name of the score to evaluate.
            threshold (Optional[float], optional): Binarization threshold; falls
                back to the threshold stored on the score object. Defaults to None.

        Raises:
            ValueError: If no data has been added with .add_data() first.
            ValueError: If no metrics have been set with .set_metrics() first.
            ValueError: If no threshold is available from either source.

        Returns:
            ScoreEvaluation: Per-feature, per-level metric results.
        """
        if self.data is None:
            raise ValueError("Please add data with .add_data() first")

        if len(self.metrics) == 0:
            raise ValueError(
                "Please define at least one metric with .set_metrics() first"
            )

        # get score
        score: AuditorScore = self.scores[score_name]

        # fixed: use logical `and` (short-circuit) rather than bitwise `&`
        if threshold is None and score.threshold is None:
            raise ValueError(
                "Threshold must be defined in score object or passed to .evaluate_score()"
            )
        elif threshold is None:
            threshold = score.threshold

        # collect metric inputs to prep for evaluation
        self._collect_inputs()

        # get the list of columns to retain in the data
        column_list: list[str] = [*self.features.keys(), "_truth"]

        # copy a slice of the dataframe
        # (.copy() avoids mutating a view of self.data / SettingWithCopy issues)
        data_slice: pd.DataFrame = self.data.loc[:, column_list].copy()
        data_slice["_pred"] = self.data[score.name]
        data_slice["_binary_pred"] = self._binarize(
            score_data=data_slice["_pred"], threshold=threshold
        )
        data_slice = self._apply_inputs(data=data_slice)

        # create an 'Overall' feature which will be used to calculate metrics on the full data
        data_slice["overall"] = "Overall"
        eval_features: dict[str, AuditorFeature] = {
            "overall": AuditorFeature(
                name="overall",
                label="Overall",
            )
        }
        eval_features.update(**self.features)

        score_eval: ScoreEvaluation = ScoreEvaluation(
            name=score.name,
            label=score.label if score.label is not None else score.name,
        )
        with tqdm(
            eval_features.values(), position=0, leave=True, desc="Features"
        ) as pbar:
            for feature in pbar:
                pbar.set_postfix({"name": feature.name})

                # e.g. {"f1": {'levelA': 0.2, 'levelB': 0.4}, ... }
                feature_eval: FeatureEvaluation = self._evaluate_feature(
                    data=data_slice, feature=feature
                )
                score_eval.features[feature.name] = feature_eval

        return score_eval

    def _evaluate_feature(
        self, data: pd.DataFrame, feature: AuditorFeature
    ) -> FeatureEvaluation:
        """
        Calculates all configured metrics (and, when enabled, bootstrap
        confidence intervals) for each level of a single feature.
        """
        with tqdm(range(2), position=1, desc="Stages", leave=False) as feature_pbar:
            feature_pbar.set_postfix({"stage": "Evaluating metrics"})

            # cast feature levels to string
            data[feature.name] = data[feature.name].astype(str)

            # then group the df by this feature (so each group contains one
            # unique level of the data) and get all metrics for each
            feature_groups = data.groupby(feature.name)

            # e.g. {"f1": {'levelA': 0.2, 'levelB': 0.4}, ... }
            feature_eval: FeatureEvaluation = FeatureEvaluation(
                name=feature.name,
                label=feature.label if feature.label is not None else feature.name,
            )
            for metric in tqdm(self.metrics, position=2, desc="Metrics", leave=False):
                # gets a dict with the current metric calculated for levels of the feature
                # e.g. {levelA: 0.5, levelB: 0.5}
                level_eval_dict = feature_groups.apply(metric.data_call).to_dict()

                feature_eval.update(
                    metric_name=metric.name,
                    metric_label=metric.label,
                    data=level_eval_dict,
                )

            feature_pbar.update(1)
            feature_pbar.set_postfix({"stage": "Evaluating intervals"})
            # if calculating confidence intervals, do that here
            if self.n_bootstraps is not None:
                for level_name, level_data in tqdm(
                    feature_groups, position=2, desc="Bootstrap Levels", leave=False
                ):
                    # calculate confidence intervals for eligible metrics for the current feature level
                    level_metric_intervals: dict[str, tuple[float, float]] = (
                        self._evaluate_confidence_interval(data=level_data)
                    )
                    # register the calculated intervals
                    feature_eval.update_intervals(
                        level_name=str(level_name),
                        metric_intervals=level_metric_intervals,
                    )
            feature_pbar.update(1)

        return feature_eval

    def _evaluate_confidence_interval(
        self, data: pd.DataFrame
    ) -> dict[str, tuple[float, float]]:
        """
        Bootstrap 95% confidence intervals for every CI-eligible metric over
        the given (sub)group dataframe.
        """
        n: int = len(data)

        bootstrap_results: dict[str, NDArray[np.float64]] = dict()
        for metric in self.metrics:
            if metric.ci_eligible:
                bootstrap_results[metric.name] = np.empty(
                    shape=(self.n_bootstraps), dtype=np.float64
                )

        # sample n_bootstrap times with replacement
        for i in range(self.n_bootstraps):
            boot_data: pd.DataFrame = data.sample(n, replace=True)

            # calculate metrics on current bootstrap data
            for metric in self.metrics:
                if metric.ci_eligible:
                    bootstrap_results[metric.name][i] = metric.data_call(boot_data)

        metric_intervals: dict[str, tuple[float, float]] = dict()
        for metric_name, bootstrap_array in bootstrap_results.items():
            # get 95% confidence bounds for metric
            lower, upper = np.percentile(bootstrap_array, [2.5, 97.5])
            metric_intervals[metric_name] = (lower, upper)

        return metric_intervals
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import pandas as pd
from typing import Protocol, Union, runtime_checkable


@runtime_checkable
class AuditorMetricInput(Protocol):
    """Protocol for derived per-row inputs (e.g. tp/fp flags) consumed by metrics."""

    name: str
    label: str
    inputs: list[str]

    def row_call(self, row: pd.Series) -> Union[int, float]:
        """Compute this input's value for a single dataframe row."""
        raise NotImplementedError

    def data_transform(self, data: pd.DataFrame) -> pd.DataFrame:
        """Add this input as a new column (named after the input) in place."""
        data[self.name] = data.apply(self.row_call, axis=1)
        return data


class TruePositives(AuditorMetricInput):
    """Row-level flag: positive prediction on a positive case."""

    name: str = "tp"
    label: str = "TP"
    inputs: list[str] = ["_truth", "_binary_pred"]

    def row_call(self, row: pd.Series) -> int:
        truth_is_pos = row["_truth"] == 1.0
        pred_is_pos = row["_binary_pred"] == 1.0
        return int(truth_is_pos and pred_is_pos)


class FalsePositives(AuditorMetricInput):
    """Row-level flag: positive prediction on a negative case."""

    name: str = "fp"
    label: str = "FP"
    inputs: list[str] = ["_truth", "_binary_pred"]

    def row_call(self, row: pd.Series) -> int:
        truth_is_neg = row["_truth"] == 0.0
        pred_is_pos = row["_binary_pred"] == 1.0
        return int(truth_is_neg and pred_is_pos)


class TrueNegatives(AuditorMetricInput):
    """Row-level flag: negative prediction on a negative case."""

    name: str = "tn"
    label: str = "TN"
    inputs: list[str] = ["_truth", "_binary_pred"]

    def row_call(self, row: pd.Series) -> int:
        truth_is_neg = row["_truth"] == 0.0
        pred_is_neg = row["_binary_pred"] == 0.0
        return int(truth_is_neg and pred_is_neg)


class FalseNegatives(AuditorMetricInput):
    """Row-level flag: negative prediction on a positive case."""

    name: str = "fn"
    label: str = "FN"
    inputs: list[str] = ["_truth", "_binary_pred"]

    def row_call(self, row: pd.Series) -> int:
        truth_is_pos = row["_truth"] == 1.0
        pred_is_neg = row["_binary_pred"] == 0.0
        return int(truth_is_pos and pred_is_neg)
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
from typing import Protocol, Union, Optional
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import numpy as np
|
|
4
|
+
from sklearn.metrics import average_precision_score, roc_auc_score
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class AuditorMetric(Protocol):
    """Protocol for metrics computed over a (sub)group dataframe."""

    name: str
    label: str
    # names of metric-input columns (or base "_truth"/"_pred") this metric reads
    inputs: list[str]
    # whether bootstrap confidence intervals are meaningful for this metric
    ci_eligible: bool

    def data_call(self, data: pd.DataFrame) -> Union[float, int]:
        """
        method called on a dataframe to calculate a metric
        """
        raise NotImplementedError


class Sensitivity(AuditorMetric):
    """TP / (TP + FN)."""

    name: str = "sensitivity"
    label: str = "Sensitivity"
    inputs: list[str] = ["tp", "fn"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame, eps: float = 1e-8) -> float:
        n_tp: int = data["tp"].sum()
        n_fn: int = data["fn"].sum()
        # eps guards against division by zero when there are no positives
        return n_tp / (n_tp + n_fn + eps)


class Specificity(AuditorMetric):
    """TN / (TN + FP)."""

    name: str = "specificity"
    label: str = "Specificity"
    inputs: list[str] = ["tn", "fp"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame, eps: float = 1e-8) -> float:
        n_tn: int = data["tn"].sum()
        n_fp: int = data["fp"].sum()
        return n_tn / (n_tn + n_fp + eps)


class Precision(AuditorMetric):
    """TP / (TP + FP)."""

    name: str = "precision"
    label: str = "Precision"
    inputs: list[str] = ["tp", "fp"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame, eps: float = 1e-8) -> float:
        n_tp: int = data["tp"].sum()
        n_fp: int = data["fp"].sum()
        return n_tp / (n_tp + n_fp + eps)


class Recall(AuditorMetric):
    """TP / (TP + FN) — identical to sensitivity, kept for naming convenience."""

    name: str = "recall"
    label: str = "Recall"
    inputs: list[str] = ["tp", "fn"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame, eps: float = 1e-8) -> float:
        n_tp: int = data["tp"].sum()
        n_fn: int = data["fn"].sum()
        return n_tp / (n_tp + n_fn + eps)


class F1Score(AuditorMetric):
    """Harmonic mean of precision and recall."""

    name: str = "f1"
    label: str = "F1 Score"
    inputs: list[str] = ["tp", "fp", "fn"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame, eps: float = 1e-8) -> float:
        # Recalculate to avoid dependency on ordering of metrics
        precision = Precision().data_call(data)
        recall = Recall().data_call(data)
        return 2 * (precision * recall) / (precision + recall + eps)


class AUROC(AuditorMetric):
    """Area under the ROC curve (0.0 when only one class is present)."""

    name: str = "auroc"
    label: str = "AUROC"
    inputs: list[str] = ["_truth", "_pred"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame) -> float:
        try:
            return float(roc_auc_score(data["_truth"], data["_pred"]))
        except ValueError:
            # sklearn raises when a group contains a single class
            return 0.0


class AUPRC(AuditorMetric):
    """Area under the precision-recall curve (0.0 when undefined)."""

    name: str = "auprc"
    label: str = "AUPRC"
    inputs: list[str] = ["_truth", "_pred"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame) -> float:
        try:
            return float(average_precision_score(data["_truth"], data["_pred"]))
        except ValueError:
            return 0.0


class MatthewsCorrelationCoefficient(AuditorMetric):
    """MCC over the confusion-matrix counts; 0.0 when undefined."""

    name: str = "mcc"
    label: str = "Matthews Correlation Coefficient"
    inputs: list[str] = ["tp", "tn", "fp", "fn"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame, eps: float = 1e-8) -> float:
        tp = data["tp"].sum()
        tn = data["tn"].sum()
        fp = data["fp"].sum()
        fn = data["fn"].sum()

        numerator = (tp * tn) - (fp * fn)
        denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

        if denominator == 0:
            return 0.0
        return numerator / (denominator + eps)


class FBetaScore(AuditorMetric):
    """Weighted harmonic mean of precision and recall, parameterized by beta."""

    name: str = "fbeta"
    label: str = "F-beta Score"
    # FIXED: inputs must list metric-input column names resolvable by
    # Auditor._collect_inputs() ("tp"/"fp"/"fn"), not metric names —
    # ["precision", "recall"] caused a KeyError during input collection.
    inputs: list[str] = ["tp", "fp", "fn"]
    ci_eligible: bool = True

    def __init__(self, beta: float = 1.0):
        self.beta = beta
        self.name = f"f{beta:.1f}".replace(".", "_")  # e.g., "f0_5" or "f2_0"
        self.label = f"F{beta:.1f} Score"

    def data_call(self, data: pd.DataFrame) -> float:
        precision = Precision().data_call(data)
        recall = Recall().data_call(data)
        beta_sq = self.beta**2

        if precision + recall == 0:
            return 0.0

        return (1 + beta_sq) * (precision * recall) / ((beta_sq * precision) + recall)


class TPR(Sensitivity):
    """Alias of sensitivity under its ROC-style name."""

    name: str = "tpr"
    label: str = "TPR"


class TNR(Specificity):
    """Alias of specificity under its ROC-style name."""

    name: str = "tnr"
    label: str = "TNR"


class FPR(AuditorMetric):
    """FP / (FP + TN)."""

    name: str = "fpr"
    label: str = "FPR"
    inputs: list[str] = ["fp", "tn"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame, eps: float = 1e-8) -> float:
        n_fp: int = data["fp"].sum()
        n_tn: int = data["tn"].sum()
        return n_fp / (n_fp + n_tn + eps)


class FNR(AuditorMetric):
    """FN / (FN + TP)."""

    name: str = "fnr"
    label: str = "FNR"
    inputs: list[str] = ["fn", "tp"]
    ci_eligible: bool = True

    def data_call(self, data: pd.DataFrame, eps: float = 1e-8) -> float:
        n_fn: int = data["fn"].sum()
        n_tp: int = data["tp"].sum()
        return n_fn / (n_fn + n_tp + eps)


class nData(AuditorMetric):
    """Row count of the group."""

    name: str = "n"
    label: str = "N"
    inputs: list[str] = []
    ci_eligible: bool = False

    def data_call(self, data: pd.DataFrame) -> int:
        return len(data)


class nTP(AuditorMetric):
    """Count of true positives (int() matches the declared return type)."""

    name: str = "n_tp"
    label: str = "TP"
    inputs: list[str] = ["tp"]
    ci_eligible: bool = False

    def data_call(self, data: pd.DataFrame) -> int:
        return int(data["tp"].sum())


class nTN(AuditorMetric):
    """Count of true negatives."""

    name: str = "n_tn"
    label: str = "TN"
    inputs: list[str] = ["tn"]
    ci_eligible: bool = False

    def data_call(self, data: pd.DataFrame) -> int:
        return int(data["tn"].sum())


class nFP(AuditorMetric):
    """Count of false positives."""

    name: str = "n_fp"
    label: str = "FP"
    inputs: list[str] = ["fp"]
    ci_eligible: bool = False

    def data_call(self, data: pd.DataFrame) -> int:
        return int(data["fp"].sum())


class nFN(AuditorMetric):
    """Count of false negatives."""

    name: str = "n_fn"
    label: str = "FN"
    inputs: list[str] = ["fn"]
    ci_eligible: bool = False

    def data_call(self, data: pd.DataFrame) -> int:
        return int(data["fn"].sum())


class nPositive(AuditorMetric):
    """Count of positive ground-truth cases."""

    name: str = "n_pos"
    label: str = "Pos."
    inputs: list[str] = ["_truth"]
    ci_eligible: bool = False

    def data_call(self, data: pd.DataFrame) -> int:
        return int((data["_truth"] == 1).astype(int).sum())


class nNegative(AuditorMetric):
    """Count of negative ground-truth cases."""

    name: str = "n_neg"
    label: str = "Neg."
    inputs: list[str] = ["_truth"]
    ci_eligible: bool = False

    def data_call(self, data: pd.DataFrame) -> int:
        return int((data["_truth"] == 0).astype(int).sum())
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from model_auditor.plotting.plotters import HierarchyPlotter
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
from typing import Optional, Union, Callable
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
from model_auditor.schemas import AuditorScore, AuditorOutcome
|
|
5
|
+
from model_auditor.plotting.schemas import Hierarchy, HLevel, HItem, PlotterData
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class HierarchyPlotter:
    """Builds hierarchical plot data from a dataframe, a feature hierarchy and a score."""

    def __init__(self) -> None:
        # feature hierarchy to plot; set via .set_features()
        self.features: Optional[Hierarchy] = None  # type: ignore
        # source dataframe; set via .set_data()
        self.data: Optional[pd.DataFrame] = None
        # aggregator used to colour cells; a pandas .agg() name or a callable
        self.aggregator: Union[str, Callable] = "median"

        # score column definition; set via .set_score()
        self.score: Optional[AuditorScore] = None
        # outcome definition (not yet used by the plotter)
        self.outcome: Optional[AuditorOutcome] = None
|
|
16
|
+
|
|
17
|
+
def set_data(self, data: pd.DataFrame) -> None:
|
|
18
|
+
"""Set data for the plotter
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
data (pd.DataFrame): Data used to build the plot
|
|
22
|
+
"""
|
|
23
|
+
self.data = data
|
|
24
|
+
|
|
25
|
+
def set_features(self, features: Union[Hierarchy, list[str]]) -> None:
|
|
26
|
+
"""Set the feature hierarchy for the plotter
|
|
27
|
+
|
|
28
|
+
Args:
|
|
29
|
+
features (Union[Hierarchy, list[str]]): Expects a list of strings
|
|
30
|
+
(column names corresponding to the data provided) or a predefined
|
|
31
|
+
custom Hierarchy object
|
|
32
|
+
|
|
33
|
+
Raises:
|
|
34
|
+
ValueError: Raised if something other than a list of Hierarchy
|
|
35
|
+
object was passed
|
|
36
|
+
"""
|
|
37
|
+
# flat hierarchy
|
|
38
|
+
if isinstance(features, list):
|
|
39
|
+
self.features: Hierarchy = Hierarchy()
|
|
40
|
+
for feature in features:
|
|
41
|
+
self.features.levels.append(HLevel([HItem(name=feature)]))
|
|
42
|
+
# complex/custom hierarchy
|
|
43
|
+
elif isinstance(features, Hierarchy):
|
|
44
|
+
self.features = features
|
|
45
|
+
else:
|
|
46
|
+
raise ValueError(
|
|
47
|
+
"unrecognized type for features, please pass a list of strings or a predefined Hierarchy() object"
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
def set_aggregator(self, method: Union[str, Callable]) -> None:
|
|
51
|
+
"""Sets the aggregator used to color the plot cells
|
|
52
|
+
|
|
53
|
+
Args:
|
|
54
|
+
method (Union[str, Callable]): Expects a string corresponding to a
|
|
55
|
+
predefined aggregator for the .agg() pandas method, or a function
|
|
56
|
+
that takes the score column as a series and outputs some float
|
|
57
|
+
"""
|
|
58
|
+
self.aggregator = method
|
|
59
|
+
|
|
60
|
+
def set_score(
|
|
61
|
+
self, name: str, label: Optional[str] = None, threshold: Optional[float] = None
|
|
62
|
+
) -> None:
|
|
63
|
+
"""Sets the score column used by the plotter
|
|
64
|
+
|
|
65
|
+
Args:
|
|
66
|
+
name (str): Name of the score column
|
|
67
|
+
label (Optional[str], optional): Label of the score column. Defaults to None
|
|
68
|
+
(plot will just use the column name).
|
|
69
|
+
threshold (Optional[float], optional): Threshold to binarize the score column.
|
|
70
|
+
Defaults to None (currently unused).
|
|
71
|
+
"""
|
|
72
|
+
self.score = AuditorScore(
|
|
73
|
+
name=name, label=label if label is not None else name, threshold=threshold
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
def compile(self, container: str) -> PlotterData:
|
|
77
|
+
"""Compiles the data for the plotter based on the defined HierarchyPlotter parameters
|
|
78
|
+
|
|
79
|
+
Args:
|
|
80
|
+
container (str): Name of the plot container trace
|
|
81
|
+
|
|
82
|
+
Raises:
|
|
83
|
+
ValueError: If features have not been set with .set_features() first
|
|
84
|
+
ValueError: If a score has not been set with .set_score() first
|
|
85
|
+
|
|
86
|
+
Returns:
|
|
87
|
+
PlotterData: Returns the formatted plotter data TODO: wrap this internally
|
|
88
|
+
"""
|
|
89
|
+
if self.features is None:
|
|
90
|
+
raise ValueError("Please set features with .set_features() first")
|
|
91
|
+
|
|
92
|
+
if self.score is None:
|
|
93
|
+
raise ValueError("Please set a score variable with .set_score() first!")
|
|
94
|
+
|
|
95
|
+
datasource = self._prepare_datasource()
|
|
96
|
+
data = PlotterData()
|
|
97
|
+
|
|
98
|
+
if isinstance(self.aggregator, str):
|
|
99
|
+
container_agg: float = (
|
|
100
|
+
datasource[self.score.name].agg(self.aggregator).item()
|
|
101
|
+
)
|
|
102
|
+
else:
|
|
103
|
+
container_agg: float = self.aggregator(datasource)
|
|
104
|
+
|
|
105
|
+
data.add(
|
|
106
|
+
label=container,
|
|
107
|
+
id=container,
|
|
108
|
+
parent="",
|
|
109
|
+
value=len(datasource),
|
|
110
|
+
color=container_agg,
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
return self._recursive_record(
|
|
114
|
+
data=data, datasource=datasource, parent_id=container, idx=0
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
def _recursive_record(
    self, data: PlotterData, datasource: pd.DataFrame, parent_id: str, idx: int
):
    """Recursive internal function used to compile data for the plotter.

    Processes hierarchy level ``idx`` for the given ``datasource`` subset:
    selects the valid feature item(s) for this level, groups the data by
    that feature, appends one trace per group to ``data``, and recurses
    into each group's row subset for the next level.

    Args:
        data (PlotterData): Accumulator of plot traces, mutated in place
        datasource (pd.DataFrame): Row subset for the current branch
        parent_id (str): Trace id of the parent node ("$"-delimited path)
        idx (int): Index of the current level in self.features.levels

    Returns:
        PlotterData: The same accumulator, with this branch's traces added
    """
    level: HLevel = self.features.levels[idx]
    # init a list to track valid features for this level
    level_features: list[HItem] = []
    for item in level.items:
        # if the item has no query then include it
        if item.query is None:
            level_features.append(item)

        # otherwise, include it if the feature query evaluates to true for the *entire* datasource
        elif all(datasource.eval(item.query).tolist()):  # type: ignore
            level_features.append(item)

    # if this level has only 1 valid item, consider it the feature
    if len(level_features) == 1:
        feature = level_features[0]

    # otherwise, if this level has >1 valid item, concatenate them into a temp derived feature
    # NOTE: this writes a '_temp_feature' column into the datasource subset;
    # subsets are copies (see the .copy() below), so the caller's frame is untouched
    elif len(level_features) > 1:
        datasource.loc[:, '_temp_feature'] = (
            datasource[[i.name for i in level_features]]
            .apply(lambda row: " & ".join(row.values.astype(str)), axis=1)
        )

        feature = HItem(name="_temp_feature")

    # if this level has 0 valid items, return
    else:
        return data

    # group the df by the current feature and get its frequency and agg metric
    assert isinstance(
        self.score, AuditorScore
    )  # handled by the wrapper but here for type hinting

    # per-group row counts (sizes of each feature level)
    count_dict: dict[str, int] = (
        datasource.groupby(feature.name, as_index=True, observed=False)[self.score.name]
        .agg("count")
        .to_dict()
    )

    if isinstance(self.aggregator, str):
        # built-in aggregators
        agg_dict: dict[str, float] = (
            datasource.groupby(feature.name, as_index=True, observed=False)[self.score.name]
            .agg(self.aggregator)
            .to_dict()
        )
    else:
        # custom aggregators (pass entire df here instead of just the score series)
        agg_dict: dict = (
            datasource.groupby(feature.name, as_index=True, observed=False)
            .apply(self.aggregator)
            .to_dict()
        )

    # extract the count dict keys to get the levels for the current feature
    feature_levels: list[str] = list(count_dict.keys())
    # format the parent_id + feature levels into trace identifiers
    # (ids are "$"-delimited paths so labels can repeat across branches)
    id_dict: dict[str, str] = {
        feature_level: f"{parent_id}${feature_level}"
        for feature_level in count_dict.keys()
    }

    for feature_level in feature_levels:
        # add the current feature data
        data.add(
            label=feature_level,
            id=id_dict[feature_level],
            parent=parent_id,
            value=count_dict[feature_level],
            color=agg_dict[feature_level],
        )

        # if this isn't the last feature in the stack, get the subset of data for this feature and recurse
        if idx < (len(self.features.levels) - 1):
            data = self._recursive_record(
                data=data,
                datasource=datasource.loc[datasource[feature.name] == feature_level, :].copy(),
                parent_id=id_dict[feature_level],
                idx=idx + 1,
            )

    return data
|
|
204
|
+
|
|
205
|
+
def _prepare_datasource(self) -> pd.DataFrame:
|
|
206
|
+
"""Internal function used to prepare the datasource for plotting
|
|
207
|
+
|
|
208
|
+
Raises:
|
|
209
|
+
ValueError: If data has not been added with .set_data() first
|
|
210
|
+
|
|
211
|
+
Returns:
|
|
212
|
+
pd.DataFrame: Prepared data source
|
|
213
|
+
"""
|
|
214
|
+
if self.data is None:
|
|
215
|
+
raise ValueError("Please set data with .set_data() first")
|
|
216
|
+
|
|
217
|
+
data = self.data.copy()
|
|
218
|
+
if self.score is not None:
|
|
219
|
+
data["_pred"] = self.score.name
|
|
220
|
+
else:
|
|
221
|
+
print("no score set")
|
|
222
|
+
|
|
223
|
+
if self.outcome is not None:
|
|
224
|
+
data["_outcome"] = self.outcome.name
|
|
225
|
+
else:
|
|
226
|
+
print("no outcome set")
|
|
227
|
+
|
|
228
|
+
return data
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class PlotterData:
    """Columnar container of parallel trace arrays for hierarchy plots."""

    labels: list = field(default_factory=list)
    ids: list = field(default_factory=list)
    parents: list = field(default_factory=list)
    values: list = field(default_factory=list)
    colors: list = field(default_factory=list)

    def add(self, label: str, id: str, parent: str, value: int, color: float) -> None:
        """Append a single trace entry across all five parallel columns."""
        for column, entry in (
            (self.labels, label),
            (self.ids, id),
            (self.parents, parent),
            (self.values, value),
            (self.colors, color),
        ):
            column.append(entry)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class HItem:
    """Hierarchy Item: one candidate feature column within a hierarchy level."""

    # Name of the dataframe column this item refers to
    name: str
    # Optional pandas-eval expression; when set, the item is only selected
    # for a branch if the query holds for that branch's entire data subset
    query: Optional[str] = None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class HLevel:
    """Hierarchy level: the set of candidate HItems for one depth of the plot."""

    # Candidate items for this depth; valid ones are selected per data subset
    items: list[HItem] = field(default_factory=list)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class Hierarchy:
    """Hierarchy container: the ordered list of levels to plot."""

    # Levels ordered from the plot root (idx 0) inward
    levels: list[HLevel] = field(default_factory=list)
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
from typing import Any, Optional, Union

import pandas as pd
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class LevelMetric:
    """
    Object to store the evaluation results for one metric of one level of a feature.
    (for example, AUC for one category of finding)

    Args:
        name (str): Name of the current feature level metric
        label (str): Display label for the current feature level metric
        score (Union[float, int]): Score for the current feature level metric
        interval (tuple[float, float], optional): Optional lower and upper confidence
            bounds for the current feature level metric (defaults to None)
    """

    name: str
    label: str
    score: Union[float, int]
    interval: Optional[tuple[float, float]] = None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class LevelEvaluation:
    """
    Object to store the evaluation results for one level of a feature
    (for example, all metrics for one category of finding).

    Args:
        name (str): Name of the current feature level
        metrics (dict[str, LevelMetric]): Metrics for the current feature level
            (defaults to an empty dict)
    """

    name: str
    metrics: dict[str, LevelMetric] = field(default_factory=dict)

    def update(self, metric_name: str, metric_label: str, metric_score: float) -> None:
        """Record (or overwrite) one metric's score for this level."""
        self.metrics[metric_name] = LevelMetric(
            name=metric_name, label=metric_label, score=metric_score
        )

    def update_intervals(self, metric_intervals: dict[str, tuple[float, float]]):
        """Attach (lower, upper) confidence bounds to already-recorded metrics."""
        for name, bounds in metric_intervals.items():
            self.metrics[name].interval = bounds

    def to_dataframe(self, n_decimals: int = 3, add_index: bool = False, metric_labels: bool = False):
        """Render this level's metrics as a single-row DataFrame indexed by the level name.

        Scores with an interval render as "score (lo, hi)"; bare floats are
        formatted to ``n_decimals``; integer scores are comma-delimited.
        ``add_index`` is accepted for signature parity with the parent
        containers but is not used at this level.
        """
        formatted: dict[str, str] = {}
        for metric in self.metrics.values():
            # column header for this metric (label if metric_labels is True)
            key = metric.label if metric_labels else metric.name
            if metric.interval is not None:
                lo, hi = metric.interval
                formatted[key] = (
                    f"{metric.score:.{n_decimals}f} ({lo:.{n_decimals}f}, {hi:.{n_decimals}f})"
                )
            elif isinstance(metric.score, float):
                formatted[key] = f"{metric.score:.{n_decimals}f}"
            else:
                # integer scores (default to comma delimited for now)
                formatted[key] = f"{metric.score:,}"
        return pd.DataFrame(formatted, index=[self.name])
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass
class FeatureEvaluation:
    """
    Object to store the evaluation results for one feature type
    (for example, metrics associated with different types of findings)

    Args:
        name (str): Name of the current feature
        label (str): Label for the current feature
        levels (dict[str, LevelEvaluation]): Levels of the current feature
            (defaults to an empty dict)
    """

    name: str
    label: str
    levels: dict[str, LevelEvaluation] = field(default_factory=dict)

    def update(
        self, metric_name: str, metric_label: str, data: dict[str, float]
    ) -> None:
        """Map one metric's per-level scores, e.g. {'levelA': 0.5, 'levelB': 0.5},
        onto the child LevelEvaluation objects, creating levels as needed."""
        for level_name, score in data.items():
            # get-or-create the evaluation object for this level
            level_eval = self.levels.setdefault(
                level_name, LevelEvaluation(name=level_name)
            )
            # record this metric's score on the level
            level_eval.update(
                metric_name=metric_name,
                metric_label=metric_label,
                metric_score=score,
            )

    def update_intervals(
        self, level_name: str, metric_intervals: dict[str, tuple[float, float]]
    ):
        """Forward confidence intervals to the named level's metrics."""
        self.levels[level_name].update_intervals(metric_intervals=metric_intervals)

    def to_dataframe(
        self, n_decimals: int = 3, add_index: bool = False, metric_labels: bool = False
    ) -> pd.DataFrame:
        """Stack each level's one-row frame; optionally nest the result under
        this feature's label as an outer index level."""
        frames = [
            level.to_dataframe(n_decimals=n_decimals, metric_labels=metric_labels)
            for level in self.levels.values()
        ]
        stacked = pd.concat(frames, axis=0)
        if add_index:
            return pd.concat({self.label: stacked})
        return stacked
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@dataclass
class ScoreEvaluation:
    """Evaluation results for one model score: FeatureEvaluation objects
    keyed by feature name, renderable as a single stacked DataFrame."""

    name: str
    label: str
    features: dict[str, FeatureEvaluation] = field(default_factory=dict)

    def to_dataframe(
        self, n_decimals: int = 3, add_index: bool = False, metric_labels: bool = False
    ) -> pd.DataFrame:
        """Concatenate every feature's table (each nested under its own label);
        optionally nest the whole result under this score's label too."""
        frames = [
            feature.to_dataframe(
                n_decimals=n_decimals, add_index=True, metric_labels=metric_labels
            )
            for feature in self.features.values()
        ]
        combined = pd.concat(frames, axis=0)
        if add_index:
            return pd.concat({self.label: combined})
        return combined
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@dataclass
class AuditorFeature:
    """A feature column to audit the model score over."""

    # Dataframe column name for the feature
    name: str
    # Display label; consumers fall back to ``name`` when None
    label: Optional[str] = None
    # Optional explicit list of feature levels to report.
    # BUGFIX: was ``Optional[list[any]]`` — ``any`` is the builtin function,
    # not a type; the intended annotation is ``typing.Any``.
    levels: Optional[list[Any]] = None
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
@dataclass
class AuditorScore:
    """A model score (prediction) column to audit."""

    # Dataframe column name holding the score values
    name: str
    # Display label; callers fall back to ``name`` when None
    label: Optional[str] = None
    # Optional cutoff to binarize the score (currently unused per set_score docs)
    threshold: Optional[float] = None
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
@dataclass
class AuditorOutcome:
    """A ground-truth outcome column to audit the model score against."""

    # Dataframe column name holding the outcome values
    name: str
    # Optional mapping of raw outcome values to integer labels.
    # BUGFIX: was ``Optional[dict[any, int]]`` — ``any`` is the builtin
    # function, not a type; the intended annotation is ``typing.Any``.
    mapping: Optional[dict[Any, int]] = None
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
import inspect
|
|
3
|
+
from typing import Type
|
|
4
|
+
from model_auditor.metric_inputs import AuditorMetricInput
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def is_metric_input_valid(cls: type) -> bool:
    """Return True when *cls* satisfies the metric-input duck-type contract.

    A valid metric-input class exposes ``name``, ``label`` and ``inputs``
    attributes and provides callable ``row_call`` and ``data_transform``
    members.
    """
    if not inspect.isclass(cls):
        return False
    if not all(hasattr(cls, attr) for attr in ("name", "label", "inputs")):
        return False
    return all(
        callable(getattr(cls, method, None))
        for method in ("row_call", "data_transform")
    )
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def collect_metric_inputs() -> dict[str, Type[AuditorMetricInput]]:
    """Discover all concrete metric-input classes in model_auditor.metric_inputs.

    Returns:
        dict[str, Type[AuditorMetricInput]]: mapping from each class's
        ``name`` attribute to the class itself. The AuditorMetricInput
        base class itself is excluded.
    """
    metric_inputs_module = importlib.import_module("model_auditor.metric_inputs")

    discovered: dict[str, Type[AuditorMetricInput]] = {}
    for _, candidate in inspect.getmembers(metric_inputs_module, inspect.isclass):
        # skip the base class and anything not matching the contract
        if candidate is AuditorMetricInput or not is_metric_input_valid(candidate):
            continue
        discovered[candidate.name] = candidate

    return discovered
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
.gitignore
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
.github/workflows/publish.yml
|
|
5
|
+
model_auditor/__init__.py
|
|
6
|
+
model_auditor/core.py
|
|
7
|
+
model_auditor/metric_inputs.py
|
|
8
|
+
model_auditor/metrics.py
|
|
9
|
+
model_auditor/schemas.py
|
|
10
|
+
model_auditor/utils.py
|
|
11
|
+
model_auditor.egg-info/PKG-INFO
|
|
12
|
+
model_auditor.egg-info/SOURCES.txt
|
|
13
|
+
model_auditor.egg-info/dependency_links.txt
|
|
14
|
+
model_auditor.egg-info/requires.txt
|
|
15
|
+
model_auditor.egg-info/top_level.txt
|
|
16
|
+
model_auditor/plotting/__init__.py
|
|
17
|
+
model_auditor/plotting/plotters.py
|
|
18
|
+
model_auditor/plotting/schemas.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
model_auditor
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# allows the package to be installed in editable mode
|
|
2
|
+
[build-system]
|
|
3
|
+
requires = ["setuptools>=64", "setuptools_scm>=8", "wheel"]
|
|
4
|
+
build-backend = "setuptools.build_meta"
|
|
5
|
+
|
|
6
|
+
[project]
|
|
7
|
+
name = "model-auditor"
|
|
8
|
+
dynamic = ["version"]
|
|
9
|
+
|
|
10
|
+
dependencies = [
|
|
11
|
+
"pandas >= 2.2",
|
|
12
|
+
"numpy >= 2.1",
|
|
13
|
+
"scikit-learn >= 1.5",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[tool.setuptools_scm]
|