verosynthea-validator 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- verosynthea_validator-0.2.0/PKG-INFO +178 -0
- verosynthea_validator-0.2.0/README.md +147 -0
- verosynthea_validator-0.2.0/pyproject.toml +40 -0
- verosynthea_validator-0.2.0/setup.cfg +4 -0
- verosynthea_validator-0.2.0/tests/test_fairness.py +187 -0
- verosynthea_validator-0.2.0/verosynthea_validator/__init__.py +22 -0
- verosynthea_validator-0.2.0/verosynthea_validator/assertions.py +100 -0
- verosynthea_validator-0.2.0/verosynthea_validator/data.py +96 -0
- verosynthea_validator-0.2.0/verosynthea_validator/demos.py +299 -0
- verosynthea_validator-0.2.0/verosynthea_validator/fairness.py +244 -0
- verosynthea_validator-0.2.0/verosynthea_validator.egg-info/PKG-INFO +178 -0
- verosynthea_validator-0.2.0/verosynthea_validator.egg-info/SOURCES.txt +13 -0
- verosynthea_validator-0.2.0/verosynthea_validator.egg-info/dependency_links.txt +1 -0
- verosynthea_validator-0.2.0/verosynthea_validator.egg-info/requires.txt +14 -0
- verosynthea_validator-0.2.0/verosynthea_validator.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: verosynthea-validator
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Fairness testing for ML models using Australian demographic data
|
|
5
|
+
Author-email: Verosynthea <hello@verosynthea.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://verosynthea.com/for-ai-labs
|
|
8
|
+
Project-URL: Repository, https://github.com/verosynthea/verosynthea-validator
|
|
9
|
+
Project-URL: Dataset, https://huggingface.co/datasets/vero-synthea/ausynth-sample
|
|
10
|
+
Project-URL: Documentation, https://verosynthea.com/about
|
|
11
|
+
Keywords: fairness,ml,demographics,australia,synthetic-data,census
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: pandas>=1.5
|
|
21
|
+
Requires-Dist: numpy>=1.23
|
|
22
|
+
Requires-Dist: huggingface_hub>=0.20.0
|
|
23
|
+
Requires-Dist: datasets>=2.14.0
|
|
24
|
+
Provides-Extra: hf
|
|
25
|
+
Requires-Dist: datasets>=2.0; extra == "hf"
|
|
26
|
+
Provides-Extra: paid
|
|
27
|
+
Requires-Dist: httpx>=0.24; extra == "paid"
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
30
|
+
Requires-Dist: scikit-learn>=1.2; extra == "dev"
|
|
31
|
+
|
|
32
|
+
# verosynthea-validator
|
|
33
|
+
|
|
34
|
+
## Quick Start: Test a US-trained Model on Australian Data
|
|
35
|
+
|
|
36
|
+
This is a real-world fairness scenario. We take a standard classifier trained on US Census data and test how it performs on Australian demographics.
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install verosynthea-validator
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from verosynthea_validator import FairnessReport
|
|
44
|
+
from verosynthea_validator.demos import (
|
|
45
|
+
load_us_adult_baseline,
|
|
46
|
+
load_ausynth_test_set,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
# Load a US-trained income classifier and Australian test data
|
|
50
|
+
model = load_us_adult_baseline()
|
|
51
|
+
au_data = load_ausynth_test_set()
|
|
52
|
+
|
|
53
|
+
# Run fairness audit
|
|
54
|
+
report = FairnessReport(
|
|
55
|
+
model=model,
|
|
56
|
+
target_column="income_above_threshold",
|
|
57
|
+
protected_attributes=["SEXP", "BPLP", "AGE5P"],
|
|
58
|
+
)
|
|
59
|
+
report.run(test_data=au_data)
|
|
60
|
+
report.show()
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### What you'll see
|
|
64
|
+
|
|
65
|
+
- **Country-of-birth bias gap (~30%):** The US-trained model handles US-typical birth countries well, others poorly
|
|
66
|
+
- **Income threshold miscalibration:** $50K USD doesn't map cleanly to Australian income distributions
|
|
67
|
+
- **Occupation bias (~18%):** Australian occupation categories don't align with UCI Adult codes
|
|
68
|
+
|
|
69
|
+
This is the standard fairness-testing scenario for Australian deployments: models trained on US data need to be validated against Australian populations before production use.
|
|
70
|
+
|
|
71
|
+
### What just happened?
|
|
72
|
+
|
|
73
|
+
The [UCI Adult Income dataset](https://archive.ics.uci.edu/dataset/2/adult) is the canonical fairness benchmark in ML, but it's US Census data from 1994. When you run a model trained on it against Australian population data, the validator surfaces the bias gaps that come from the distribution mismatch.
|
|
74
|
+
|
|
75
|
+
For your own models, replace `load_us_adult_baseline()` with your model and `load_ausynth_test_set()` with your test data or an [AUSynth subset](https://huggingface.co/datasets/vero-synthea/ausynth-sample).
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
## CI/CD gate
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from verosynthea_validator import assert_fair
|
|
84
|
+
|
|
85
|
+
# Fails the build if any group accuracy gap > 5% (the default 0.10 limits on the demographic parity and equalised odds gaps also apply)
|
|
86
|
+
assert_fair(test_data, "label", "prediction", max_accuracy_gap=0.05)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
In pytest:
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
def test_model_fairness():
|
|
93
|
+
predictions = model.predict(test_data)
|
|
94
|
+
test_data["y_pred"] = predictions
|
|
95
|
+
assert_fair(
|
|
96
|
+
test_data, "y_true", "y_pred",
|
|
97
|
+
protected_columns=["SEXP", "BPLP", "profile_name"],
|
|
98
|
+
max_accuracy_gap=0.05,
|
|
99
|
+
max_demographic_parity_gap=0.10,
|
|
100
|
+
)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## What it measures
|
|
104
|
+
|
|
105
|
+
For each protected column (e.g. sex, birthplace, demographic profile), the validator computes:
|
|
106
|
+
|
|
107
|
+
| Metric | What it checks |
|
|
108
|
+
|--------|---------------|
|
|
109
|
+
| **Accuracy gap** | Max accuracy difference between any two groups |
|
|
110
|
+
| **Demographic parity gap** | Max difference in selection rate (P(y_pred=1)) |
|
|
111
|
+
| **Equalised odds gap** | Max difference in true positive rate or false positive rate |
|
|
112
|
+
|
|
113
|
+
Groups smaller than 30 observations are excluded (configurable via `min_group_size`).
|
|
114
|
+
|
|
115
|
+
## Why this instead of fairlearn or aif360?
|
|
116
|
+
|
|
117
|
+
Those are general-purpose fairness frameworks. This package is purpose-built for Australian demographics:
|
|
118
|
+
|
|
119
|
+
- **Pre-loaded demographic data.** The free tier includes 5,000 synthetic individuals from [AUSynth](https://huggingface.co/datasets/vero-synthea/ausynth-sample) with 25 Census-calibrated variables. No need to source your own protected attributes.
|
|
120
|
+
- **8 demographic profiles.** AUSynth clusters every person into one of 8 profiles (High-earning professionals, Young singles, Retired, etc.) — a richer protected attribute than just age or sex.
|
|
121
|
+
- **Australia-specific calibration.** Variables match ABS Census 2021 categories exactly. Income brackets, occupation codes, education levels, birthplace regions — all in Australian standard classifications.
|
|
122
|
+
- **One-line CI gate.** `assert_fair()` drops into pytest with zero configuration.
|
|
123
|
+
|
|
124
|
+
## Data tiers
|
|
125
|
+
|
|
126
|
+
| Tier | Data | Cost |
|
|
127
|
+
|------|------|------|
|
|
128
|
+
| **Free** | 5,000-row Paddington 4064 sample from [Hugging Face](https://huggingface.co/datasets/vero-synthea/ausynth-sample) | $0 |
|
|
129
|
+
| **Paid** | Full national dataset (32M individuals, 15,352 suburbs) via API | [verosynthea.com](https://verosynthea.com) |
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
from verosynthea_validator import load_ausynth_sample
|
|
133
|
+
|
|
134
|
+
# Free tier (downloads from HF on first call)
|
|
135
|
+
df = load_ausynth_sample()
|
|
136
|
+
|
|
137
|
+
# Paid tier
|
|
138
|
+
df = load_ausynth_sample(api_key="vero_...", geography="bondi-2026-nsw")
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## The 8 demographic profiles
|
|
142
|
+
|
|
143
|
+
| ID | Name | Typical characteristics |
|
|
144
|
+
|----|------|------------------------|
|
|
145
|
+
| 0 | Labourers and operators | Blue-collar, lower income |
|
|
146
|
+
| 1 | Young singles and non-workers | Under 25, students, NILF |
|
|
147
|
+
| 2 | Children | Under 15 |
|
|
148
|
+
| 3 | Non-earning dependants | Adults not in workforce |
|
|
149
|
+
| 4 | Trades and technical workers | Certificate-qualified, mid income |
|
|
150
|
+
| 5 | Established partnered households | Married, mid-career |
|
|
151
|
+
| 6 | Retired and semi-retired | Over 60, pension income |
|
|
152
|
+
| 7 | High-earning professionals | Degree-qualified, professional occupations |
|
|
153
|
+
|
|
154
|
+
## Installation
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
pip install verosynthea-validator # core (pandas, numpy, huggingface_hub, datasets)
|
|
158
|
+
pip install verosynthea-validator[hf] # + Hugging Face datasets loader
|
|
159
|
+
pip install verosynthea-validator[paid] # + httpx for API access
|
|
160
|
+
pip install verosynthea-validator[dev] # + pytest + sklearn for development
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## Links
|
|
164
|
+
|
|
165
|
+
- **Dataset:** [vero-synthea/ausynth-sample](https://huggingface.co/datasets/vero-synthea/ausynth-sample) on Hugging Face
|
|
166
|
+
- **Full product:** [verosynthea.com](https://verosynthea.com)
|
|
167
|
+
- **Methodology:** [verosynthea.com/about](https://verosynthea.com/about)
|
|
168
|
+
|
|
169
|
+
## Citation
|
|
170
|
+
|
|
171
|
+
```
|
|
172
|
+
Verosynthea AUSynth (2026). Synthetic Australian Census Data.
|
|
173
|
+
https://verosynthea.com
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## License
|
|
177
|
+
|
|
178
|
+
MIT
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# verosynthea-validator
|
|
2
|
+
|
|
3
|
+
## Quick Start: Test a US-trained Model on Australian Data
|
|
4
|
+
|
|
5
|
+
This is a real-world fairness scenario. We take a standard classifier trained on US Census data and test how it performs on Australian demographics.
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install verosynthea-validator
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
```python
|
|
12
|
+
from verosynthea_validator import FairnessReport
|
|
13
|
+
from verosynthea_validator.demos import (
|
|
14
|
+
load_us_adult_baseline,
|
|
15
|
+
load_ausynth_test_set,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
# Load a US-trained income classifier and Australian test data
|
|
19
|
+
model = load_us_adult_baseline()
|
|
20
|
+
au_data = load_ausynth_test_set()
|
|
21
|
+
|
|
22
|
+
# Run fairness audit
|
|
23
|
+
report = FairnessReport(
|
|
24
|
+
model=model,
|
|
25
|
+
target_column="income_above_threshold",
|
|
26
|
+
protected_attributes=["SEXP", "BPLP", "AGE5P"],
|
|
27
|
+
)
|
|
28
|
+
report.run(test_data=au_data)
|
|
29
|
+
report.show()
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### What you'll see
|
|
33
|
+
|
|
34
|
+
- **Country-of-birth bias gap (~30%):** The US-trained model handles US-typical birth countries well, others poorly
|
|
35
|
+
- **Income threshold miscalibration:** $50K USD doesn't map cleanly to Australian income distributions
|
|
36
|
+
- **Occupation bias (~18%):** Australian occupation categories don't align with UCI Adult codes
|
|
37
|
+
|
|
38
|
+
This is the standard fairness-testing scenario for Australian deployments: models trained on US data need to be validated against Australian populations before production use.
|
|
39
|
+
|
|
40
|
+
### What just happened?
|
|
41
|
+
|
|
42
|
+
The [UCI Adult Income dataset](https://archive.ics.uci.edu/dataset/2/adult) is the canonical fairness benchmark in ML, but it's US Census data from 1994. When you run a model trained on it against Australian population data, the validator surfaces the bias gaps that come from the distribution mismatch.
|
|
43
|
+
|
|
44
|
+
For your own models, replace `load_us_adult_baseline()` with your model and `load_ausynth_test_set()` with your test data or an [AUSynth subset](https://huggingface.co/datasets/vero-synthea/ausynth-sample).
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
## CI/CD gate
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from verosynthea_validator import assert_fair
|
|
53
|
+
|
|
54
|
+
# Fails the build if any group accuracy gap > 5% (the default 0.10 limits on the demographic parity and equalised odds gaps also apply)
|
|
55
|
+
assert_fair(test_data, "label", "prediction", max_accuracy_gap=0.05)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
In pytest:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
def test_model_fairness():
|
|
62
|
+
predictions = model.predict(test_data)
|
|
63
|
+
test_data["y_pred"] = predictions
|
|
64
|
+
assert_fair(
|
|
65
|
+
test_data, "y_true", "y_pred",
|
|
66
|
+
protected_columns=["SEXP", "BPLP", "profile_name"],
|
|
67
|
+
max_accuracy_gap=0.05,
|
|
68
|
+
max_demographic_parity_gap=0.10,
|
|
69
|
+
)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## What it measures
|
|
73
|
+
|
|
74
|
+
For each protected column (e.g. sex, birthplace, demographic profile), the validator computes:
|
|
75
|
+
|
|
76
|
+
| Metric | What it checks |
|
|
77
|
+
|--------|---------------|
|
|
78
|
+
| **Accuracy gap** | Max accuracy difference between any two groups |
|
|
79
|
+
| **Demographic parity gap** | Max difference in selection rate (P(y_pred=1)) |
|
|
80
|
+
| **Equalised odds gap** | Max difference in true positive rate or false positive rate |
|
|
81
|
+
|
|
82
|
+
Groups smaller than 30 observations are excluded (configurable via `min_group_size`).
|
|
83
|
+
|
|
84
|
+
## Why this instead of fairlearn or aif360?
|
|
85
|
+
|
|
86
|
+
Those are general-purpose fairness frameworks. This package is purpose-built for Australian demographics:
|
|
87
|
+
|
|
88
|
+
- **Pre-loaded demographic data.** The free tier includes 5,000 synthetic individuals from [AUSynth](https://huggingface.co/datasets/vero-synthea/ausynth-sample) with 25 Census-calibrated variables. No need to source your own protected attributes.
|
|
89
|
+
- **8 demographic profiles.** AUSynth clusters every person into one of 8 profiles (High-earning professionals, Young singles, Retired, etc.) — a richer protected attribute than just age or sex.
|
|
90
|
+
- **Australia-specific calibration.** Variables match ABS Census 2021 categories exactly. Income brackets, occupation codes, education levels, birthplace regions — all in Australian standard classifications.
|
|
91
|
+
- **One-line CI gate.** `assert_fair()` drops into pytest with zero configuration.
|
|
92
|
+
|
|
93
|
+
## Data tiers
|
|
94
|
+
|
|
95
|
+
| Tier | Data | Cost |
|
|
96
|
+
|------|------|------|
|
|
97
|
+
| **Free** | 5,000-row Paddington 4064 sample from [Hugging Face](https://huggingface.co/datasets/vero-synthea/ausynth-sample) | $0 |
|
|
98
|
+
| **Paid** | Full national dataset (32M individuals, 15,352 suburbs) via API | [verosynthea.com](https://verosynthea.com) |
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from verosynthea_validator import load_ausynth_sample
|
|
102
|
+
|
|
103
|
+
# Free tier (downloads from HF on first call)
|
|
104
|
+
df = load_ausynth_sample()
|
|
105
|
+
|
|
106
|
+
# Paid tier
|
|
107
|
+
df = load_ausynth_sample(api_key="vero_...", geography="bondi-2026-nsw")
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## The 8 demographic profiles
|
|
111
|
+
|
|
112
|
+
| ID | Name | Typical characteristics |
|
|
113
|
+
|----|------|------------------------|
|
|
114
|
+
| 0 | Labourers and operators | Blue-collar, lower income |
|
|
115
|
+
| 1 | Young singles and non-workers | Under 25, students, NILF |
|
|
116
|
+
| 2 | Children | Under 15 |
|
|
117
|
+
| 3 | Non-earning dependants | Adults not in workforce |
|
|
118
|
+
| 4 | Trades and technical workers | Certificate-qualified, mid income |
|
|
119
|
+
| 5 | Established partnered households | Married, mid-career |
|
|
120
|
+
| 6 | Retired and semi-retired | Over 60, pension income |
|
|
121
|
+
| 7 | High-earning professionals | Degree-qualified, professional occupations |
|
|
122
|
+
|
|
123
|
+
## Installation
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
pip install verosynthea-validator # core (pandas, numpy, huggingface_hub, datasets)
|
|
127
|
+
pip install verosynthea-validator[hf] # + Hugging Face datasets loader
|
|
128
|
+
pip install verosynthea-validator[paid] # + httpx for API access
|
|
129
|
+
pip install verosynthea-validator[dev] # + pytest + sklearn for development
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Links
|
|
133
|
+
|
|
134
|
+
- **Dataset:** [vero-synthea/ausynth-sample](https://huggingface.co/datasets/vero-synthea/ausynth-sample) on Hugging Face
|
|
135
|
+
- **Full product:** [verosynthea.com](https://verosynthea.com)
|
|
136
|
+
- **Methodology:** [verosynthea.com/about](https://verosynthea.com/about)
|
|
137
|
+
|
|
138
|
+
## Citation
|
|
139
|
+
|
|
140
|
+
```
|
|
141
|
+
Verosynthea AUSynth (2026). Synthetic Australian Census Data.
|
|
142
|
+
https://verosynthea.com
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## License
|
|
146
|
+
|
|
147
|
+
MIT
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "verosynthea-validator"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Fairness testing for ML models using Australian demographic data"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Verosynthea", email = "hello@verosynthea.com"},
|
|
14
|
+
]
|
|
15
|
+
keywords = ["fairness", "ml", "demographics", "australia", "synthetic-data", "census"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
23
|
+
]
|
|
24
|
+
dependencies = [
|
|
25
|
+
"pandas>=1.5",
|
|
26
|
+
"numpy>=1.23",
|
|
27
|
+
"huggingface_hub>=0.20.0",
|
|
28
|
+
"datasets>=2.14.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
hf = ["datasets>=2.0"]
|
|
33
|
+
paid = ["httpx>=0.24"]
|
|
34
|
+
dev = ["pytest>=7.0", "scikit-learn>=1.2"]
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://verosynthea.com/for-ai-labs"
|
|
38
|
+
Repository = "https://github.com/verosynthea/verosynthea-validator"
|
|
39
|
+
Dataset = "https://huggingface.co/datasets/vero-synthea/ausynth-sample"
|
|
40
|
+
Documentation = "https://verosynthea.com/about"
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tests for verosynthea-validator fairness metrics.
|
|
3
|
+
|
|
4
|
+
Covers:
|
|
5
|
+
- FairnessReport with a fair model (should pass)
|
|
6
|
+
- FairnessReport with a biased model (should fail)
|
|
7
|
+
- assert_fair CI helper
|
|
8
|
+
- Edge cases: small groups, single group, missing columns
|
|
9
|
+
- All 8 demographic profiles as protected attribute
|
|
10
|
+
"""
|
|
11
|
+
import numpy as np
|
|
12
|
+
import pandas as pd
|
|
13
|
+
import pytest
|
|
14
|
+
|
|
15
|
+
from verosynthea_validator import FairnessReport, assert_fair
|
|
16
|
+
from verosynthea_validator.assertions import FairnessAssertionError
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@pytest.fixture
def sample_data():
    """Build a seeded 2,000-row frame with AUSynth-style demographic columns.

    The frame carries ``SEXP``, ``BPLP``, ``profile_name``, ``profile_id``
    and a binary ``y_true`` column.
    """
    np.random.seed(42)
    n_rows = 2000

    # Every draw below consumes the same seeded global RNG stream, so the
    # order of these calls determines the generated data.
    profile_labels = [
        "High-earning professionals", "Young singles and non-workers",
        "Established partnered households", "Trades and technical workers",
        "Retired and semi-retired", "Labourers and operators",
        "Non-earning dependants", "Children",
    ]
    profile_weights = [0.35, 0.20, 0.15, 0.10, 0.08, 0.07, 0.03, 0.02]
    profile_draw = np.random.choice(
        profile_labels, size=n_rows, p=profile_weights,
    )
    sex_draw = np.random.choice(["Male", "Female"], n_rows)
    birthplace_draw = np.random.choice(
        ["Oceania and Antarctica", "North-West Europe", "South-East Asia"],
        n_rows, p=[0.6, 0.2, 0.2],
    )
    # Integer codes derived from the profile names (no randomness involved).
    profile_codes = pd.Categorical(profile_draw).codes
    label_draw = np.random.binomial(1, 0.4, n_rows)

    columns = {
        "SEXP": sex_draw,
        "BPLP": birthplace_draw,
        "profile_name": profile_draw,
        "profile_id": profile_codes,
        "y_true": label_draw,
    }
    return pd.DataFrame(columns)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@pytest.fixture
def fair_predictions(sample_data):
    """Add a ``y_pred`` column whose errors are independent of any group.

    Predictions are ``y_true`` plus seeded Gaussian noise (sd 0.2),
    thresholded at 0.5. The ``sample_data`` frame is modified in place and
    returned.
    """
    np.random.seed(42)
    jitter = np.random.normal(0, 0.2, len(sample_data))
    noisy_score = sample_data["y_true"] + jitter
    sample_data["y_pred"] = (noisy_score > 0.5).astype(int)
    return sample_data
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@pytest.fixture
def biased_predictions(sample_data):
    """Add a ``y_pred`` column that is skewed for one birthplace group.

    Same noisy-threshold scheme as an unbiased model, except that rows with
    ``BPLP == "South-East Asia"`` get an extra +0.4 added to the score before
    the 0.5 cut. The ``sample_data`` frame is modified in place and returned.
    """
    np.random.seed(42)
    jitter = np.random.normal(0, 0.2, len(sample_data))
    from_south_east_asia = sample_data["BPLP"] == "South-East Asia"
    offset = np.where(from_south_east_asia, 0.4, 0)
    shifted_score = sample_data["y_true"] + jitter + offset
    sample_data["y_pred"] = (shifted_score > 0.5).astype(int)
    return sample_data
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class TestFairnessReport:
    """Checks on ``FairnessReport`` and the results object its ``run()`` returns."""

    def test_fair_model_passes(self, fair_predictions):
        """Unbiased predictions give one SEXP result with an accuracy gap below 0.10."""
        outcome = FairnessReport(
            fair_predictions, "y_true", "y_pred",
            protected_columns=["SEXP"],
        ).run()
        assert outcome.n_total == 2000
        assert outcome.overall_accuracy > 0.5
        assert len(outcome.results) == 1
        assert outcome.results[0].accuracy_gap < 0.10

    def test_biased_model_detected(self, biased_predictions):
        """The birthplace-skewed predictions show an accuracy gap above 0.05 on BPLP."""
        outcome = FairnessReport(
            biased_predictions, "y_true", "y_pred",
            protected_columns=["BPLP"],
        ).run()
        birthplace = outcome.results[0]
        assert birthplace.accuracy_gap > 0.05
        assert birthplace.column == "BPLP"

    def test_multiple_protected_columns(self, fair_predictions):
        """Each requested protected column produces exactly one result entry."""
        outcome = FairnessReport(
            fair_predictions, "y_true", "y_pred",
            protected_columns=["SEXP", "BPLP", "profile_name"],
        ).run()
        assert len(outcome.results) == 3
        seen_columns = {entry.column for entry in outcome.results}
        assert seen_columns == {"SEXP", "BPLP", "profile_name"}

    def test_profiles_as_protected(self, fair_predictions):
        """With ``min_group_size=20`` at least five profile groups are reported."""
        outcome = FairnessReport(
            fair_predictions, "y_true", "y_pred",
            protected_columns=["profile_name"],
            min_group_size=20,
        ).run()
        by_profile = outcome.results[0]
        assert by_profile.column == "profile_name"
        assert len(by_profile.groups) >= 5

    def test_summary_output(self, fair_predictions):
        """The text summary names the report and mentions the accuracy gap."""
        text = FairnessReport(
            fair_predictions, "y_true", "y_pred",
            protected_columns=["SEXP"],
        ).run().summary()
        assert "Fairness Report" in text
        assert "accuracy gap" in text.lower()

    def test_to_dataframe(self, fair_predictions):
        """``to_dataframe()`` returns a frame with the expected columns and >= 4 rows."""
        outcome = FairnessReport(
            fair_predictions, "y_true", "y_pred",
            protected_columns=["SEXP", "BPLP"],
        ).run()
        table = outcome.to_dataframe()
        assert isinstance(table, pd.DataFrame)
        assert "protected_column" in table.columns
        assert "accuracy" in table.columns
        assert len(table) >= 4

    def test_min_group_size_filter(self, sample_data):
        """A column whose only other group has five rows yields no results."""
        sample_data["y_pred"] = sample_data["y_true"]
        sample_data["RARE"] = "common"
        # ``.loc`` slicing is label-inclusive, so labels 0..4 (five rows) become "rare".
        sample_data.loc[:4, "RARE"] = "rare"
        outcome = FairnessReport(
            sample_data, "y_true", "y_pred",
            protected_columns=["RARE"],
            min_group_size=30,
        ).run()
        # NOTE(review): presumably empty because only one group meets the size
        # threshold - confirm against FairnessReport.
        assert len(outcome.results) == 0

    def test_missing_column_raises(self, sample_data):
        """Constructing a report for an unknown column raises ``ValueError``."""
        sample_data["y_pred"] = 0
        with pytest.raises(ValueError, match="not found"):
            FairnessReport(
                sample_data, "y_true", "y_pred",
                protected_columns=["NONEXISTENT"],
            )
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class TestAssertFair:
    """Checks on the ``assert_fair`` CI helper and the error it raises."""

    def test_fair_model_passes(self, fair_predictions):
        """Unbiased predictions pass a 0.10 accuracy-gap limit on SEXP without raising."""
        assert_fair(
            fair_predictions, "y_true", "y_pred",
            protected_columns=["SEXP"],
            max_accuracy_gap=0.10,
        )

    def test_biased_model_fails(self, biased_predictions):
        """Skewed predictions raise an error that carries a non-empty failure list."""
        with pytest.raises(FairnessAssertionError) as caught:
            assert_fair(
                biased_predictions, "y_true", "y_pred",
                protected_columns=["BPLP"],
                max_accuracy_gap=0.05,
            )
        error = caught.value
        assert "failed fairness check" in str(error)
        assert hasattr(error, "details")
        assert len(error.details["failures"]) > 0

    def test_custom_thresholds(self, biased_predictions):
        """Very tight (0.01) limits make the skewed predictions fail."""
        tight_limits = {
            "max_accuracy_gap": 0.01,
            "max_demographic_parity_gap": 0.01,
        }
        with pytest.raises(FairnessAssertionError):
            assert_fair(
                biased_predictions, "y_true", "y_pred",
                protected_columns=["BPLP"],
                **tight_limits,
            )

    def test_relaxed_thresholds_pass(self, biased_predictions):
        """Limits of 0.50 on all three gaps let the skewed predictions pass."""
        loose_limits = {
            "max_accuracy_gap": 0.50,
            "max_demographic_parity_gap": 0.50,
            "max_equalised_odds_gap": 0.50,
        }
        assert_fair(
            biased_predictions, "y_true", "y_pred",
            protected_columns=["BPLP"],
            **loose_limits,
        )
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""
|
|
2
|
+
verosynthea-validator — Fairness testing for ML models using Australian demographic data.
|
|
3
|
+
|
|
4
|
+
Quick start:
|
|
5
|
+
from verosynthea_validator import FairnessReport, assert_fair
|
|
6
|
+
|
|
7
|
+
report = FairnessReport(data, y_true="label", y_pred="prediction",
|
|
8
|
+
protected_columns=["SEXP", "BPLP"])
|
|
9
|
+
results = report.run()
|
|
10
|
+
print(results.summary())
|
|
11
|
+
|
|
12
|
+
# CI/CD gate:
|
|
13
|
+
assert_fair(data, "label", "prediction", max_accuracy_gap=0.05)
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from verosynthea_validator.fairness import FairnessReport, FairnessResults
|
|
17
|
+
from verosynthea_validator.assertions import assert_fair
|
|
18
|
+
from verosynthea_validator.data import load_ausynth_sample
|
|
19
|
+
from .demos import load_us_adult_baseline, load_ausynth_test_set
|
|
20
|
+
|
|
21
|
+
# Must match `version` in pyproject.toml; this release is published as 0.2.0.
__version__ = "0.2.0"
# Public API: every name imported into the package namespace above.
__all__ = [
    "FairnessReport",
    "FairnessResults",
    "assert_fair",
    "load_ausynth_sample",
    "load_ausynth_test_set",
    "load_us_adult_baseline",
]
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CI/CD assertion helpers for fairness gating.
|
|
3
|
+
|
|
4
|
+
Usage in pytest or CI:
|
|
5
|
+
from verosynthea_validator import assert_fair
|
|
6
|
+
assert_fair(test_data, "y_true", "y_pred", max_accuracy_gap=0.05)
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Optional, Sequence
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
from verosynthea_validator.fairness import FairnessReport
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class FairnessAssertionError(AssertionError):
    """Raised when a model fails a fairness check.

    Parameters
    ----------
    message : str
        Human-readable description; this is what ``str(error)`` returns.
    details : dict, optional
        Structured payload exposed as ``error.details``. Defaults to an
        empty dict when omitted.
    """

    def __init__(self, message: str, details: Optional[dict] = None):
        # Only ``message`` goes into ``args`` so ``str(error)`` stays the
        # plain message rather than a tuple repr.
        super().__init__(message)
        # ``details`` must be optional: pickle/copy rebuild an exception by
        # calling the class with ``args`` alone and then restoring
        # ``__dict__``, which fails if a second argument is required.
        self.details = {} if details is None else details
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def assert_fair(
    data: pd.DataFrame,
    y_true: str,
    y_pred: str,
    protected_columns: Optional[Sequence[str]] = None,
    max_accuracy_gap: float = 0.05,
    max_demographic_parity_gap: float = 0.10,
    max_equalised_odds_gap: float = 0.10,
    min_group_size: int = 30,
) -> None:
    """Fail loudly when any protected column breaches a fairness limit.

    Builds a ``FairnessReport`` from the arguments, runs it, and compares
    each per-column result against the three limits. Returns ``None`` when
    nothing is breached. Intended for pytest tests and CI/CD pipelines.

    Parameters
    ----------
    data : pd.DataFrame
        Dataset holding the label, prediction and protected columns.
    y_true : str
        Name of the ground-truth label column.
    y_pred : str
        Name of the predicted label column.
    protected_columns : list[str], optional
        Columns to audit. ``None`` is forwarded unchanged to
        ``FairnessReport``, which picks its own default
        (NOTE(review): previously documented as
        ["SEXP", "BPLP", "profile_name"] - confirm in fairness.py).
    max_accuracy_gap : float
        Largest tolerated ``accuracy_gap`` per column (default 0.05).
    max_demographic_parity_gap : float
        Largest tolerated ``demographic_parity_gap`` per column (default 0.10).
    max_equalised_odds_gap : float
        Largest tolerated ``equalised_odds_gap`` per column (default 0.10).
    min_group_size : int
        Forwarded to ``FairnessReport`` as ``min_group_size`` (default 30).

    Raises
    ------
    FairnessAssertionError
        If at least one gap is strictly greater than its limit. The error's
        ``.details`` dict has ``"failures"`` (the breach messages) and
        ``"results"`` (``to_dict()`` of every per-column result).
    """
    results = FairnessReport(
        data=data,
        y_true=y_true,
        y_pred=y_pred,
        protected_columns=protected_columns,
        min_group_size=min_group_size,
    ).run()

    failures = []
    for r in results.results:
        # (label used in the message, measured gap, allowed limit)
        checks = (
            ("accuracy gap", r.accuracy_gap, max_accuracy_gap),
            ("demographic parity gap", r.demographic_parity_gap,
             max_demographic_parity_gap),
            ("equalised odds gap", r.equalised_odds_gap,
             max_equalised_odds_gap),
        )
        for label, gap, limit in checks:
            if gap > limit:
                failures.append(f"{r.column}: {label} {gap:.3f} > {limit}")

    if not failures:
        return

    header = f"Model failed fairness check on {len(failures)} metric(s):\n"
    bullets = "\n".join(f" - {f}" for f in failures)
    msg = header + bullets + "\n\nFull report:\n" + results.summary()
    details = {
        "failures": failures,
        "results": [r.to_dict() for r in results.results],
    }
    raise FairnessAssertionError(msg, details)
|