verosynthea-validator 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- verosynthea_validator-0.1.0/PKG-INFO +169 -0
- verosynthea_validator-0.1.0/README.md +140 -0
- verosynthea_validator-0.1.0/pyproject.toml +38 -0
- verosynthea_validator-0.1.0/setup.cfg +4 -0
- verosynthea_validator-0.1.0/tests/test_fairness.py +187 -0
- verosynthea_validator-0.1.0/verosynthea_validator/__init__.py +21 -0
- verosynthea_validator-0.1.0/verosynthea_validator/assertions.py +100 -0
- verosynthea_validator-0.1.0/verosynthea_validator/data.py +96 -0
- verosynthea_validator-0.1.0/verosynthea_validator/fairness.py +244 -0
- verosynthea_validator-0.1.0/verosynthea_validator.egg-info/PKG-INFO +169 -0
- verosynthea_validator-0.1.0/verosynthea_validator.egg-info/SOURCES.txt +12 -0
- verosynthea_validator-0.1.0/verosynthea_validator.egg-info/dependency_links.txt +1 -0
- verosynthea_validator-0.1.0/verosynthea_validator.egg-info/requires.txt +12 -0
- verosynthea_validator-0.1.0/verosynthea_validator.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: verosynthea-validator
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Fairness testing for ML models using Australian demographic data
|
|
5
|
+
Author-email: Verosynthea <hello@verosynthea.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://verosynthea.com/for-ai-labs
|
|
8
|
+
Project-URL: Repository, https://github.com/verosynthea/verosynthea-validator
|
|
9
|
+
Project-URL: Dataset, https://huggingface.co/datasets/vero-synthea/ausynth-sample
|
|
10
|
+
Project-URL: Documentation, https://verosynthea.com/about
|
|
11
|
+
Keywords: fairness,ml,demographics,australia,synthetic-data,census
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: pandas>=1.5
|
|
21
|
+
Requires-Dist: numpy>=1.23
|
|
22
|
+
Provides-Extra: hf
|
|
23
|
+
Requires-Dist: datasets>=2.0; extra == "hf"
|
|
24
|
+
Provides-Extra: paid
|
|
25
|
+
Requires-Dist: httpx>=0.24; extra == "paid"
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
28
|
+
Requires-Dist: scikit-learn>=1.2; extra == "dev"
|
|
29
|
+
|
|
30
|
+
# verosynthea-validator
|
|
31
|
+
|
|
32
|
+
Fairness testing for ML models using synthetic, Census-calibrated Australian demographic data. One line to check whether your model treats demographic groups equally.
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install verosynthea-validator
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
from verosynthea_validator import FairnessReport
|
|
40
|
+
|
|
41
|
+
report = FairnessReport(
|
|
42
|
+
data=test_data,
|
|
43
|
+
y_true="label",
|
|
44
|
+
y_pred="prediction",
|
|
45
|
+
protected_columns=["SEXP", "BPLP", "profile_name"],
|
|
46
|
+
)
|
|
47
|
+
results = report.run()
|
|
48
|
+
print(results.summary())
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Output:
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
Fairness Report (n=5,000, overall accuracy=0.847)
|
|
55
|
+
============================================================
|
|
56
|
+
|
|
57
|
+
[PASS] SEXP (2 groups, smallest n=2,451)
|
|
58
|
+
Accuracy gap: 0.012
|
|
59
|
+
Demographic parity gap: 0.008
|
|
60
|
+
Equalised odds gap: 0.015
|
|
61
|
+
|
|
62
|
+
[FAIL] BPLP (3 groups, smallest n=312)
|
|
63
|
+
Accuracy gap: 0.073
|
|
64
|
+
Demographic parity gap: 0.091
|
|
65
|
+
Equalised odds gap: 0.064
|
|
66
|
+
|
|
67
|
+
============================================================
|
|
68
|
+
Overall: FAIL (worst gap: 0.073 on BPLP)
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## CI/CD gate
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from verosynthea_validator import assert_fair
|
|
75
|
+
|
|
76
|
+
# Fails the build if any group accuracy gap > 5% (the default 10% limits on the demographic parity and equalised odds gaps still apply)
|
|
77
|
+
assert_fair(test_data, "label", "prediction", max_accuracy_gap=0.05)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
In pytest:
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
def test_model_fairness():
|
|
84
|
+
predictions = model.predict(test_data)
|
|
85
|
+
test_data["y_pred"] = predictions
|
|
86
|
+
assert_fair(
|
|
87
|
+
test_data, "y_true", "y_pred",
|
|
88
|
+
protected_columns=["SEXP", "BPLP", "profile_name"],
|
|
89
|
+
max_accuracy_gap=0.05,
|
|
90
|
+
max_demographic_parity_gap=0.10,
|
|
91
|
+
)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## What it measures
|
|
95
|
+
|
|
96
|
+
For each protected column (e.g. sex, birthplace, demographic profile), the validator computes:
|
|
97
|
+
|
|
98
|
+
| Metric | What it checks |
|
|
99
|
+
|--------|---------------|
|
|
100
|
+
| **Accuracy gap** | Max accuracy difference between any two groups |
|
|
101
|
+
| **Demographic parity gap** | Max difference in selection rate (P(y_pred=1)) |
|
|
102
|
+
| **Equalised odds gap** | Max difference in true positive rate or false positive rate |
|
|
103
|
+
|
|
104
|
+
Groups smaller than 30 observations are excluded (configurable via `min_group_size`).
|
|
105
|
+
|
|
106
|
+
## Why this instead of fairlearn or aif360?
|
|
107
|
+
|
|
108
|
+
Those are general-purpose fairness frameworks. This package is purpose-built for Australian demographics:
|
|
109
|
+
|
|
110
|
+
- **Pre-loaded demographic data.** The free tier includes 5,000 synthetic individuals from [AUSynth](https://huggingface.co/datasets/vero-synthea/ausynth-sample) with 25 Census-calibrated variables. No need to source your own protected attributes.
|
|
111
|
+
- **8 demographic profiles.** AUSynth clusters every person into one of 8 profiles (High-earning professionals, Young singles, Retired, etc.) — a richer protected attribute than just age or sex.
|
|
112
|
+
- **Australia-specific calibration.** Variables match ABS Census 2021 categories exactly. Income brackets, occupation codes, education levels, birthplace regions — all in Australian standard classifications.
|
|
113
|
+
- **One-line CI gate.** `assert_fair()` drops into pytest with zero configuration.
|
|
114
|
+
|
|
115
|
+
## Data tiers
|
|
116
|
+
|
|
117
|
+
| Tier | Data | Cost |
|
|
118
|
+
|------|------|------|
|
|
119
|
+
| **Free** | 5,000-row Paddington 4064 sample from [Hugging Face](https://huggingface.co/datasets/vero-synthea/ausynth-sample) | $0 |
|
|
120
|
+
| **Paid** | Full national dataset (32M individuals, 15,352 suburbs) via API | [verosynthea.com](https://verosynthea.com) |
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from verosynthea_validator import load_ausynth_sample
|
|
124
|
+
|
|
125
|
+
# Free tier (downloads from HF on first call)
|
|
126
|
+
df = load_ausynth_sample()
|
|
127
|
+
|
|
128
|
+
# Paid tier
|
|
129
|
+
df = load_ausynth_sample(api_key="vero_...", geography="bondi-2026-nsw")
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## The 8 demographic profiles
|
|
133
|
+
|
|
134
|
+
| ID | Name | Typical characteristics |
|
|
135
|
+
|----|------|------------------------|
|
|
136
|
+
| 0 | Labourers and operators | Blue-collar, lower income |
|
|
137
|
+
| 1 | Young singles and non-workers | Under 25, students, NILF |
|
|
138
|
+
| 2 | Children | Under 15 |
|
|
139
|
+
| 3 | Non-earning dependants | Adults not in workforce |
|
|
140
|
+
| 4 | Trades and technical workers | Certificate-qualified, mid income |
|
|
141
|
+
| 5 | Established partnered households | Married, mid-career |
|
|
142
|
+
| 6 | Retired and semi-retired | Over 60, pension income |
|
|
143
|
+
| 7 | High-earning professionals | Degree-qualified, professional occupations |
|
|
144
|
+
|
|
145
|
+
## Installation
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
pip install verosynthea-validator # core (pandas + numpy)
|
|
149
|
+
pip install verosynthea-validator[hf] # + Hugging Face datasets loader
|
|
150
|
+
pip install verosynthea-validator[paid] # + httpx for API access
|
|
151
|
+
pip install verosynthea-validator[dev] # + pytest + sklearn for development
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## Links
|
|
155
|
+
|
|
156
|
+
- **Dataset:** [vero-synthea/ausynth-sample](https://huggingface.co/datasets/vero-synthea/ausynth-sample) on Hugging Face
|
|
157
|
+
- **Full product:** [verosynthea.com](https://verosynthea.com)
|
|
158
|
+
- **Methodology:** [verosynthea.com/about](https://verosynthea.com/about)
|
|
159
|
+
|
|
160
|
+
## Citation
|
|
161
|
+
|
|
162
|
+
```
|
|
163
|
+
Verosynthea AUSynth (2026). Synthetic Australian Census Data.
|
|
164
|
+
https://verosynthea.com
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## License
|
|
168
|
+
|
|
169
|
+
MIT
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# verosynthea-validator
|
|
2
|
+
|
|
3
|
+
Fairness testing for ML models using synthetic, Census-calibrated Australian demographic data. One line to check whether your model treats demographic groups equally.
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install verosynthea-validator
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from verosynthea_validator import FairnessReport
|
|
11
|
+
|
|
12
|
+
report = FairnessReport(
|
|
13
|
+
data=test_data,
|
|
14
|
+
y_true="label",
|
|
15
|
+
y_pred="prediction",
|
|
16
|
+
protected_columns=["SEXP", "BPLP", "profile_name"],
|
|
17
|
+
)
|
|
18
|
+
results = report.run()
|
|
19
|
+
print(results.summary())
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Output:
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
Fairness Report (n=5,000, overall accuracy=0.847)
|
|
26
|
+
============================================================
|
|
27
|
+
|
|
28
|
+
[PASS] SEXP (2 groups, smallest n=2,451)
|
|
29
|
+
Accuracy gap: 0.012
|
|
30
|
+
Demographic parity gap: 0.008
|
|
31
|
+
Equalised odds gap: 0.015
|
|
32
|
+
|
|
33
|
+
[FAIL] BPLP (3 groups, smallest n=312)
|
|
34
|
+
Accuracy gap: 0.073
|
|
35
|
+
Demographic parity gap: 0.091
|
|
36
|
+
Equalised odds gap: 0.064
|
|
37
|
+
|
|
38
|
+
============================================================
|
|
39
|
+
Overall: FAIL (worst gap: 0.073 on BPLP)
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## CI/CD gate
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from verosynthea_validator import assert_fair
|
|
46
|
+
|
|
47
|
+
# Fails the build if any group accuracy gap > 5% (the default 10% limits on the demographic parity and equalised odds gaps still apply)
|
|
48
|
+
assert_fair(test_data, "label", "prediction", max_accuracy_gap=0.05)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
In pytest:
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
def test_model_fairness():
|
|
55
|
+
predictions = model.predict(test_data)
|
|
56
|
+
test_data["y_pred"] = predictions
|
|
57
|
+
assert_fair(
|
|
58
|
+
test_data, "y_true", "y_pred",
|
|
59
|
+
protected_columns=["SEXP", "BPLP", "profile_name"],
|
|
60
|
+
max_accuracy_gap=0.05,
|
|
61
|
+
max_demographic_parity_gap=0.10,
|
|
62
|
+
)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## What it measures
|
|
66
|
+
|
|
67
|
+
For each protected column (e.g. sex, birthplace, demographic profile), the validator computes:
|
|
68
|
+
|
|
69
|
+
| Metric | What it checks |
|
|
70
|
+
|--------|---------------|
|
|
71
|
+
| **Accuracy gap** | Max accuracy difference between any two groups |
|
|
72
|
+
| **Demographic parity gap** | Max difference in selection rate (P(y_pred=1)) |
|
|
73
|
+
| **Equalised odds gap** | Max difference in true positive rate or false positive rate |
|
|
74
|
+
|
|
75
|
+
Groups smaller than 30 observations are excluded (configurable via `min_group_size`).
|
|
76
|
+
|
|
77
|
+
## Why this instead of fairlearn or aif360?
|
|
78
|
+
|
|
79
|
+
Those are general-purpose fairness frameworks. This package is purpose-built for Australian demographics:
|
|
80
|
+
|
|
81
|
+
- **Pre-loaded demographic data.** The free tier includes 5,000 synthetic individuals from [AUSynth](https://huggingface.co/datasets/vero-synthea/ausynth-sample) with 25 Census-calibrated variables. No need to source your own protected attributes.
|
|
82
|
+
- **8 demographic profiles.** AUSynth clusters every person into one of 8 profiles (High-earning professionals, Young singles, Retired, etc.) — a richer protected attribute than just age or sex.
|
|
83
|
+
- **Australia-specific calibration.** Variables match ABS Census 2021 categories exactly. Income brackets, occupation codes, education levels, birthplace regions — all in Australian standard classifications.
|
|
84
|
+
- **One-line CI gate.** `assert_fair()` drops into pytest with zero configuration.
|
|
85
|
+
|
|
86
|
+
## Data tiers
|
|
87
|
+
|
|
88
|
+
| Tier | Data | Cost |
|
|
89
|
+
|------|------|------|
|
|
90
|
+
| **Free** | 5,000-row Paddington 4064 sample from [Hugging Face](https://huggingface.co/datasets/vero-synthea/ausynth-sample) | $0 |
|
|
91
|
+
| **Paid** | Full national dataset (32M individuals, 15,352 suburbs) via API | [verosynthea.com](https://verosynthea.com) |
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from verosynthea_validator import load_ausynth_sample
|
|
95
|
+
|
|
96
|
+
# Free tier (downloads from HF on first call)
|
|
97
|
+
df = load_ausynth_sample()
|
|
98
|
+
|
|
99
|
+
# Paid tier
|
|
100
|
+
df = load_ausynth_sample(api_key="vero_...", geography="bondi-2026-nsw")
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## The 8 demographic profiles
|
|
104
|
+
|
|
105
|
+
| ID | Name | Typical characteristics |
|
|
106
|
+
|----|------|------------------------|
|
|
107
|
+
| 0 | Labourers and operators | Blue-collar, lower income |
|
|
108
|
+
| 1 | Young singles and non-workers | Under 25, students, NILF |
|
|
109
|
+
| 2 | Children | Under 15 |
|
|
110
|
+
| 3 | Non-earning dependants | Adults not in workforce |
|
|
111
|
+
| 4 | Trades and technical workers | Certificate-qualified, mid income |
|
|
112
|
+
| 5 | Established partnered households | Married, mid-career |
|
|
113
|
+
| 6 | Retired and semi-retired | Over 60, pension income |
|
|
114
|
+
| 7 | High-earning professionals | Degree-qualified, professional occupations |
|
|
115
|
+
|
|
116
|
+
## Installation
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
pip install verosynthea-validator # core (pandas + numpy)
|
|
120
|
+
pip install verosynthea-validator[hf] # + Hugging Face datasets loader
|
|
121
|
+
pip install verosynthea-validator[paid] # + httpx for API access
|
|
122
|
+
pip install verosynthea-validator[dev] # + pytest + sklearn for development
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## Links
|
|
126
|
+
|
|
127
|
+
- **Dataset:** [vero-synthea/ausynth-sample](https://huggingface.co/datasets/vero-synthea/ausynth-sample) on Hugging Face
|
|
128
|
+
- **Full product:** [verosynthea.com](https://verosynthea.com)
|
|
129
|
+
- **Methodology:** [verosynthea.com/about](https://verosynthea.com/about)
|
|
130
|
+
|
|
131
|
+
## Citation
|
|
132
|
+
|
|
133
|
+
```
|
|
134
|
+
Verosynthea AUSynth (2026). Synthetic Australian Census Data.
|
|
135
|
+
https://verosynthea.com
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## License
|
|
139
|
+
|
|
140
|
+
MIT
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "verosynthea-validator"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Fairness testing for ML models using Australian demographic data"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Verosynthea", email = "hello@verosynthea.com"},
|
|
14
|
+
]
|
|
15
|
+
keywords = ["fairness", "ml", "demographics", "australia", "synthetic-data", "census"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
23
|
+
]
|
|
24
|
+
dependencies = [
|
|
25
|
+
"pandas>=1.5",
|
|
26
|
+
"numpy>=1.23",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[project.optional-dependencies]
|
|
30
|
+
hf = ["datasets>=2.0"]
|
|
31
|
+
paid = ["httpx>=0.24"]
|
|
32
|
+
dev = ["pytest>=7.0", "scikit-learn>=1.2"]
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://verosynthea.com/for-ai-labs"
|
|
36
|
+
Repository = "https://github.com/verosynthea/verosynthea-validator"
|
|
37
|
+
Dataset = "https://huggingface.co/datasets/vero-synthea/ausynth-sample"
|
|
38
|
+
Documentation = "https://verosynthea.com/about"
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tests for verosynthea-validator fairness metrics.
|
|
3
|
+
|
|
4
|
+
Covers:
|
|
5
|
+
- FairnessReport with a fair model (should pass)
|
|
6
|
+
- FairnessReport with a biased model (should fail)
|
|
7
|
+
- assert_fair CI helper
|
|
8
|
+
- Edge cases: small groups, single group, missing columns
|
|
9
|
+
- All 8 demographic profiles as protected attribute
|
|
10
|
+
"""
|
|
11
|
+
import numpy as np
|
|
12
|
+
import pandas as pd
|
|
13
|
+
import pytest
|
|
14
|
+
|
|
15
|
+
from verosynthea_validator import FairnessReport, assert_fair
|
|
16
|
+
from verosynthea_validator.assertions import FairnessAssertionError
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@pytest.fixture
def sample_data():
    """Build a seeded 2,000-row frame shaped like an AUSynth extract.

    Columns: ``SEXP``, ``BPLP``, ``profile_name``, ``profile_id`` and a
    binary ``y_true`` label. The random draws happen in a fixed order
    (profiles, sex, birthplace, labels) so the data is reproducible.
    """
    np.random.seed(42)
    row_count = 2000

    # Profile name -> sampling probability (insertion order is the draw order).
    profile_weights = {
        "High-earning professionals": 0.35,
        "Young singles and non-workers": 0.20,
        "Established partnered households": 0.15,
        "Trades and technical workers": 0.10,
        "Retired and semi-retired": 0.08,
        "Labourers and operators": 0.07,
        "Non-earning dependants": 0.03,
        "Children": 0.02,
    }
    profile_column = np.random.choice(
        list(profile_weights),
        size=row_count,
        p=list(profile_weights.values()),
    )
    sex_column = np.random.choice(["Male", "Female"], row_count)
    birthplace_column = np.random.choice(
        ["Oceania and Antarctica", "North-West Europe", "South-East Asia"],
        row_count,
        p=[0.6, 0.2, 0.2],
    )
    label_column = np.random.binomial(1, 0.4, row_count)

    frame = pd.DataFrame({
        "SEXP": sex_column,
        "BPLP": birthplace_column,
        "profile_name": profile_column,
        "profile_id": pd.Categorical(profile_column).codes,
        "y_true": label_column,
    })
    return frame
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@pytest.fixture
def fair_predictions(sample_data):
    """Add a ``y_pred`` column whose errors do not depend on any group.

    Predictions are the true label plus seeded Gaussian noise (sd 0.2),
    thresholded at 0.5. The incoming frame is modified in place and returned.
    """
    np.random.seed(42)
    jitter = np.random.normal(0, 0.2, len(sample_data))
    score = sample_data["y_true"] + jitter
    sample_data["y_pred"] = (score > 0.5).astype(int)
    return sample_data
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@pytest.fixture
def biased_predictions(sample_data):
    """Add a ``y_pred`` column that is skewed for one birthplace group.

    Same noisy-threshold model as the fair fixture, except rows whose
    ``BPLP`` is "South-East Asia" get an extra +0.4 pushed into the score
    before thresholding. The incoming frame is modified in place and returned.
    """
    np.random.seed(42)
    jitter = np.random.normal(0, 0.2, len(sample_data))
    penalty = np.where(sample_data["BPLP"] == "South-East Asia", 0.4, 0)
    score = sample_data["y_true"] + jitter + penalty
    sample_data["y_pred"] = (score > 0.5).astype(int)
    return sample_data
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class TestFairnessReport:
    """Checks on FairnessReport and the results object returned by run()."""

    @staticmethod
    def _evaluate(frame, columns, **options):
        """Build a report on the ``y_true``/``y_pred`` columns and run it."""
        return FairnessReport(
            frame, "y_true", "y_pred",
            protected_columns=columns,
            **options,
        ).run()

    def test_fair_model_passes(self, fair_predictions):
        """An unbiased model yields one SEXP result with a small accuracy gap."""
        outcome = self._evaluate(fair_predictions, ["SEXP"])
        assert outcome.n_total == 2000
        assert outcome.overall_accuracy > 0.5
        assert len(outcome.results) == 1
        assert outcome.results[0].accuracy_gap < 0.10

    def test_biased_model_detected(self, biased_predictions):
        """The injected birthplace bias shows up as a BPLP accuracy gap."""
        outcome = self._evaluate(biased_predictions, ["BPLP"])
        birthplace = outcome.results[0]
        assert birthplace.accuracy_gap > 0.05
        assert birthplace.column == "BPLP"

    def test_multiple_protected_columns(self, fair_predictions):
        """Each requested protected column produces exactly one result."""
        requested = ["SEXP", "BPLP", "profile_name"]
        outcome = self._evaluate(fair_predictions, requested)
        assert len(outcome.results) == 3
        seen = {entry.column for entry in outcome.results}
        assert seen == {"SEXP", "BPLP", "profile_name"}

    def test_profiles_as_protected(self, fair_predictions):
        """Profile names work as a protected attribute with a lower size floor."""
        outcome = self._evaluate(
            fair_predictions, ["profile_name"], min_group_size=20
        )
        by_profile = outcome.results[0]
        assert by_profile.column == "profile_name"
        assert len(by_profile.groups) >= 5

    def test_summary_output(self, fair_predictions):
        """The text summary carries its title and the accuracy-gap line."""
        text = self._evaluate(fair_predictions, ["SEXP"]).summary()
        assert "Fairness Report" in text
        assert "accuracy gap" in text.lower()

    def test_to_dataframe(self, fair_predictions):
        """to_dataframe() returns a frame with the expected columns and rows."""
        table = self._evaluate(fair_predictions, ["SEXP", "BPLP"]).to_dataframe()
        assert isinstance(table, pd.DataFrame)
        assert "protected_column" in table.columns
        assert "accuracy" in table.columns
        assert len(table) >= 4

    def test_min_group_size_filter(self, sample_data):
        """A column with one undersized group yields no result at all."""
        sample_data["y_pred"] = sample_data["y_true"]
        sample_data["RARE"] = "common"
        # Label-based slice: rows 0..4 inclusive become the tiny group.
        sample_data.loc[:4, "RARE"] = "rare"
        outcome = self._evaluate(sample_data, ["RARE"], min_group_size=30)
        assert len(outcome.results) == 0

    def test_missing_column_raises(self, sample_data):
        """Constructing a report with an unknown column raises ValueError."""
        sample_data["y_pred"] = 0
        with pytest.raises(ValueError, match="not found"):
            FairnessReport(
                sample_data, "y_true", "y_pred",
                protected_columns=["NONEXISTENT"],
            )
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class TestAssertFair:
    """Checks on the assert_fair() CI helper."""

    def test_fair_model_passes(self, fair_predictions):
        """No exception for an unbiased model under a 10% accuracy limit."""
        limits = {"max_accuracy_gap": 0.10}
        assert_fair(
            fair_predictions, "y_true", "y_pred",
            protected_columns=["SEXP"],
            **limits,
        )

    def test_biased_model_fails(self, biased_predictions):
        """A biased model raises, and the error carries structured details."""
        with pytest.raises(FairnessAssertionError) as caught:
            assert_fair(
                biased_predictions, "y_true", "y_pred",
                protected_columns=["BPLP"],
                max_accuracy_gap=0.05,
            )
        error = caught.value
        assert "failed fairness check" in str(error)
        assert hasattr(error, "details")
        assert len(error.details["failures"]) > 0

    def test_custom_thresholds(self, biased_predictions):
        """Very tight custom limits make the biased model fail."""
        tight = {
            "max_accuracy_gap": 0.01,
            "max_demographic_parity_gap": 0.01,
        }
        with pytest.raises(FairnessAssertionError):
            assert_fair(
                biased_predictions, "y_true", "y_pred",
                protected_columns=["BPLP"],
                **tight,
            )

    def test_relaxed_thresholds_pass(self, biased_predictions):
        """With every limit at 0.50 even the biased model passes."""
        loose = {
            "max_accuracy_gap": 0.50,
            "max_demographic_parity_gap": 0.50,
            "max_equalised_odds_gap": 0.50,
        }
        assert_fair(
            biased_predictions, "y_true", "y_pred",
            protected_columns=["BPLP"],
            **loose,
        )
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""
|
|
2
|
+
verosynthea-validator — Fairness testing for ML models using Australian demographic data.
|
|
3
|
+
|
|
4
|
+
Quick start:
|
|
5
|
+
from verosynthea_validator import FairnessReport, assert_fair
|
|
6
|
+
|
|
7
|
+
report = FairnessReport(data, y_true="label", y_pred="prediction",
|
|
8
|
+
protected_columns=["SEXP", "BPLP"])
|
|
9
|
+
results = report.run()
|
|
10
|
+
print(results.summary())
|
|
11
|
+
|
|
12
|
+
# CI/CD gate:
|
|
13
|
+
assert_fair(data, "label", "prediction", max_accuracy_gap=0.05)
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from verosynthea_validator.fairness import FairnessReport, FairnessResults
|
|
17
|
+
from verosynthea_validator.assertions import assert_fair
|
|
18
|
+
from verosynthea_validator.data import load_ausynth_sample
|
|
19
|
+
|
|
20
|
+
__version__ = "0.1.0"
|
|
21
|
+
__all__ = ["FairnessReport", "FairnessResults", "assert_fair", "load_ausynth_sample"]
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CI/CD assertion helpers for fairness gating.
|
|
3
|
+
|
|
4
|
+
Usage in pytest or CI:
|
|
5
|
+
from verosynthea_validator import assert_fair
|
|
6
|
+
assert_fair(test_data, "y_true", "y_pred", max_accuracy_gap=0.05)
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Optional, Sequence
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
from verosynthea_validator.fairness import FairnessReport
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class FairnessAssertionError(AssertionError):
    """Raised when a model fails a fairness check.

    Parameters
    ----------
    message : str
        Human-readable failure text; this is what ``str(exc)`` returns.
    details : dict
        Structured payload supplied by the raiser, exposed as ``.details``.
    """

    def __init__(self, message: str, details: dict):
        # Only the message goes into ``args`` so ``str(exc)`` stays readable.
        super().__init__(message)
        self.details = details

    def __reduce__(self):
        """Support pickle/copy despite the two-argument constructor.

        The inherited implementation rebuilds the exception as
        ``type(self)(*self.args)``; ``args`` holds only the message, so that
        call would fail with ``TypeError`` for the missing ``details``.
        """
        message = self.args[0] if self.args else ""
        # The third element restores any extra instance attributes.
        return (type(self), (message, self.details), self.__dict__.copy())
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def assert_fair(
    data: pd.DataFrame,
    y_true: str,
    y_pred: str,
    protected_columns: Optional[Sequence[str]] = None,
    max_accuracy_gap: float = 0.05,
    max_demographic_parity_gap: float = 0.10,
    max_equalised_odds_gap: float = 0.10,
    min_group_size: int = 30,
) -> None:
    """Fail loudly when a model's fairness gaps exceed the given limits.

    Runs a ``FairnessReport`` over ``data`` and compares each per-column
    result against three thresholds. Intended for pytest tests and CI/CD
    pipelines.

    Parameters
    ----------
    data : pd.DataFrame
        Dataset holding the label, prediction and protected columns.
    y_true : str
        Name of the ground-truth binary label column.
    y_pred : str
        Name of the predicted binary label column.
    protected_columns : list[str], optional
        Columns to check. Passed through unchanged to ``FairnessReport``,
        which decides what ``None`` means.
    max_accuracy_gap : float
        Largest tolerated accuracy gap (default 0.05).
    max_demographic_parity_gap : float
        Largest tolerated demographic parity gap (default 0.10).
    max_equalised_odds_gap : float
        Largest tolerated equalised odds gap (default 0.10).
    min_group_size : int
        Forwarded to ``FairnessReport`` (default 30).

    Raises
    ------
    FairnessAssertionError
        If any gap is strictly greater than its limit. ``.details`` holds
        ``"failures"`` (the breach messages) and ``"results"`` (each
        result's ``to_dict()``).
    """
    outcome = FairnessReport(
        data=data,
        y_true=y_true,
        y_pred=y_pred,
        protected_columns=protected_columns,
        min_group_size=min_group_size,
    ).run()

    breaches = []
    for column_result in outcome.results:
        # (label, observed gap, allowed limit), in reporting order.
        checks = (
            ("accuracy gap", column_result.accuracy_gap, max_accuracy_gap),
            (
                "demographic parity gap",
                column_result.demographic_parity_gap,
                max_demographic_parity_gap,
            ),
            (
                "equalised odds gap",
                column_result.equalised_odds_gap,
                max_equalised_odds_gap,
            ),
        )
        breaches.extend(
            f"{column_result.column}: {label} {observed:.3f} > {limit}"
            for label, observed, limit in checks
            if observed > limit
        )

    if not breaches:
        return

    bullet_lines = "\n".join(f" - {item}" for item in breaches)
    message = (
        f"Model failed fairness check on {len(breaches)} metric(s):\n"
        + bullet_lines
        + "\n\nFull report:\n"
        + outcome.summary()
    )
    payload = {
        "failures": breaches,
        "results": [entry.to_dict() for entry in outcome.results],
    }
    raise FairnessAssertionError(message, payload)
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data loading utilities for verosynthea-validator.
|
|
3
|
+
|
|
4
|
+
Free tier: loads the 5,000-row AUSynth sample from Hugging Face.
|
|
5
|
+
Paid tier: connects to verosynthea.com API for full national data.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
import pandas as pd
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Hugging Face dataset repository that hosts the free sample.
_HF_DATASET = "vero-synthea/ausynth-sample"
# Parquet file name, used both in the download URL and in the local cache.
_HF_FILE = "ausynth_sample_paddington_4064.parquet"
# Per-user cache directory: ~/.cache/verosynthea
_CACHE_DIR = Path.home().joinpath(".cache", "verosynthea")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def load_ausynth_sample(
    api_key: Optional[str] = None,
    geography: Optional[str] = None,
) -> pd.DataFrame:
    """Return AUSynth demographic data from the free or the paid source.

    Parameters
    ----------
    api_key : str, optional
        Verosynthea API key. Any truthy value selects the paid loader;
        ``None`` or an empty string selects the free sample.
    geography : str, optional
        Suburb slug (e.g. "paddington-4064-qld") forwarded to the paid
        loader. Not used on the free path.

    Returns
    -------
    pd.DataFrame
        Whatever the selected loader returns.
    """
    return _load_paid(api_key, geography) if api_key else _load_free_sample()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _load_free_sample() -> pd.DataFrame:
    """Load the free Hugging Face sample, using a local cache when possible.

    Lookup order:

    1. the cached parquet file under ``_CACHE_DIR``;
    2. the optional ``datasets`` library (``split="train"``);
    3. a direct parquet download from the dataset's ``resolve/main`` URL.

    A successful load from step 2 or 3 is written to the cache on a
    best-effort basis: a cache failure never discards data already loaded.

    Returns
    -------
    pd.DataFrame
        The sample data.

    Raises
    ------
    RuntimeError
        If the direct-download fallback fails.
    """
    cache_path = _CACHE_DIR / _HF_FILE
    if cache_path.exists():
        try:
            return pd.read_parquet(cache_path)
        except (OSError, ValueError):
            # An unreadable cache (e.g. truncated by an interrupted write) is
            # treated as a miss so it gets re-downloaded and overwritten.
            # NOTE(review): assumes the parquet engine reports corrupt files
            # as OSError/ValueError subclasses; confirm for the engine in use.
            pass

    # Preferred path: the optional `datasets` library.
    try:
        from datasets import load_dataset
        df = load_dataset(_HF_DATASET, split="train").to_pandas()
    except ImportError:
        # `datasets` (or something it imports) is unavailable: use the URL.
        pass
    else:
        _store_in_cache(df, cache_path)
        return df

    # Fallback: direct parquet download.
    url = f"https://huggingface.co/datasets/{_HF_DATASET}/resolve/main/{_HF_FILE}"
    try:
        df = pd.read_parquet(url)
    except Exception as e:
        raise RuntimeError(
            f"Could not load AUSynth sample. Install the `datasets` library "
            f"(`pip install datasets`) or check your internet connection. "
            f"Original error: {e}"
        ) from e
    _store_in_cache(df, cache_path)
    return df


def _store_in_cache(df: pd.DataFrame, cache_path: Path) -> None:
    """Write *df* to *cache_path* atomically; warn instead of raising on failure."""
    import warnings

    # Write to a per-process scratch file, then rename it into place so that
    # readers never observe a partially written cache file.
    scratch = cache_path.with_name(f"{cache_path.name}.{os.getpid()}.tmp")
    try:
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        df.to_parquet(scratch, index=False)
        os.replace(scratch, cache_path)
    except OSError as exc:
        try:
            scratch.unlink(missing_ok=True)
        except OSError:
            pass
        warnings.warn(
            f"Could not cache the AUSynth sample at {cache_path}: {exc}",
            stacklevel=3,
        )
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _load_paid(api_key: str, geography: Optional[str] = None) -> pd.DataFrame:
    """Load data from the Verosynthea API (paid tier).

    Parameters
    ----------
    api_key : str
        Sent as a ``Bearer`` token in the ``Authorization`` header.
    geography : str, optional
        Forwarded as the ``geography`` query parameter when truthy; omitted
        otherwise.

    Returns
    -------
    pd.DataFrame
        The response body parsed as parquet.

    Raises
    ------
    ImportError
        If the optional ``httpx`` dependency is not installed.
    httpx.HTTPStatusError
        If the API answers with an error status (via ``raise_for_status``).
    """
    import io

    try:
        import httpx
    except ImportError as exc:
        # httpx ships with the optional "paid" extra, not the core install.
        raise ImportError(
            "The paid tier requires the optional `httpx` dependency. "
            "Install it with `pip install verosynthea-validator[paid]`."
        ) from exc

    # The base URL can be overridden through the environment; strip a trailing
    # slash so the joined path never contains "//".
    base = os.environ.get(
        "VEROSYNTHEA_API_URL", "https://api.verosynthea.com/v1"
    ).rstrip("/")
    params = {"geography": geography} if geography else {}

    resp = httpx.get(
        f"{base}/data/persons",
        headers={"Authorization": f"Bearer {api_key}"},
        params=params,
        timeout=60.0,
    )
    resp.raise_for_status()
    return pd.read_parquet(io.BytesIO(resp.content))
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core fairness metrics and reporting for verosynthea-validator.
|
|
3
|
+
|
|
4
|
+
Computes demographic parity, equalised odds, accuracy gap, and
|
|
5
|
+
calibration gap across protected groups defined by AUSynth variables.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from typing import Optional, Sequence
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pandas as pd
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Readable labels for the AUSynth profile ids (0-7), used for report output.
PROFILE_NAMES = dict(enumerate((
    "Labourers and operators",
    "Young singles and non-workers",
    "Children",
    "Non-earning dependants",
    "Trades and technical workers",
    "Established partnered households",
    "Retired and semi-retired",
    "High-earning professionals",
)))

# Columns commonly used as protected attributes in AUSynth data.
SUGGESTED_PROTECTED = [
    "SEXP",
    "BPLP",
    "GNGP",
    "AGE5P",
    "profile_name",
]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class GroupMetrics:
    """Fairness statistics for one group of a protected column.

    Attributes
    ----------
    group_name : str
        Name of the protected column the group belongs to.
    group_value : str
        The group's value within that column, as a string.
    n : int
        Number of observations in the group.
    accuracy : float
        Share of observations where the prediction equals the label.
    positive_rate : float
        Selection rate, P(y_pred=1).
    true_positive_rate : float
        Recall, P(y_pred=1 | y_true=1).
    false_positive_rate : float
        False-alarm rate, P(y_pred=1 | y_true=0).
    positive_predictive_value : float
        Precision, P(y_true=1 | y_pred=1).
    """

    group_name: str
    group_value: str
    n: int
    accuracy: float
    positive_rate: float
    true_positive_rate: float
    false_positive_rate: float
    positive_predictive_value: float
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class ProtectedColumnResult:
    """Fairness analysis for a single protected attribute.

    Attributes
    ----------
    column : str
        Name of the protected column.
    groups : list[GroupMetrics]
        Per-group metrics for the groups that were analysed.
    accuracy_gap : float
        Max minus min accuracy across groups.
    demographic_parity_gap : float
        Max minus min positive (selection) rate across groups.
    equalised_odds_gap : float
        Largest gap in TPR or FPR across groups.
    min_group_size : int
        Size of the smallest analysed group.
    """

    column: str
    groups: "list[GroupMetrics]"
    accuracy_gap: float
    demographic_parity_gap: float
    equalised_odds_gap: float
    min_group_size: int

    def to_dict(self) -> dict:
        """Return a plain-dict view of the result with floats rounded to 4 places."""
        per_group = []
        for grp in self.groups:
            per_group.append(
                {
                    "value": grp.group_value,
                    "n": grp.n,
                    "accuracy": round(grp.accuracy, 4),
                    "positive_rate": round(grp.positive_rate, 4),
                    "tpr": round(grp.true_positive_rate, 4),
                    "fpr": round(grp.false_positive_rate, 4),
                }
            )

        payload = {"column": self.column}
        payload["accuracy_gap"] = round(self.accuracy_gap, 4)
        payload["demographic_parity_gap"] = round(self.demographic_parity_gap, 4)
        payload["equalised_odds_gap"] = round(self.equalised_odds_gap, 4)
        payload["n_groups"] = len(self.groups)
        payload["min_group_size"] = self.min_group_size
        payload["groups"] = per_group
        return payload
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@dataclass
class FairnessResults:
    """Complete fairness report across all protected columns.

    Attributes
    ----------
    results : list[ProtectedColumnResult]
        One entry per analysed protected column.
    overall_accuracy : float
        Accuracy over the whole dataset.
    n_total : int
        Number of rows in the dataset.
    accuracy_gap_threshold : float
        Largest accuracy gap a column may have and still pass (default 0.05).
        Used by both ``summary()`` and ``passed`` so they always agree.
    """

    results: "list[ProtectedColumnResult]"
    overall_accuracy: float
    n_total: int
    accuracy_gap_threshold: float = 0.05

    def summary(self) -> str:
        """Return a human-readable, multi-line summary of the results.

        Each column gets a PASS/FAIL header, its three gap metrics, and one
        line per group sorted by descending accuracy. The final line gives the
        overall verdict, which is the same decision as ``passed``.
        """
        limit = self.accuracy_gap_threshold
        lines = [
            f"Fairness Report (n={self.n_total:,}, overall accuracy={self.overall_accuracy:.3f})",
            "=" * 60,
        ]
        for r in self.results:
            status = "PASS" if r.accuracy_gap <= limit else "FAIL"
            lines.append(
                f"\n[{status}] {r.column} ({len(r.groups)} groups, "
                f"smallest n={r.min_group_size})"
            )
            lines.append(f" Accuracy gap: {r.accuracy_gap:.3f}")
            lines.append(f" Demographic parity gap: {r.demographic_parity_gap:.3f}")
            lines.append(f" Equalised odds gap: {r.equalised_odds_gap:.3f}")
            lines.extend(
                f" {g.group_value:<40s} acc={g.accuracy:.3f} "
                f"sel={g.positive_rate:.3f} n={g.n}"
                for g in sorted(r.groups, key=lambda x: -x.accuracy)
            )

        lines.append("\n" + "=" * 60)
        # Reuse `passed` so the overall line can never contradict the per-column
        # statuses (max() over gaps is order-dependent when a gap is NaN).
        if self.passed:
            lines.append(f"Overall: PASS (all accuracy gaps <= {limit:g})")
        else:
            worst = max(self.results, key=lambda r: r.accuracy_gap)
            lines.append(
                f"Overall: FAIL (worst gap: {worst.accuracy_gap:.3f} "
                f"on {worst.column})"
            )
        return "\n".join(lines)

    def to_dataframe(self) -> pd.DataFrame:
        """Flatten the results to one row per (protected column, group)."""
        rows = [
            {
                "protected_column": r.column,
                "group": g.group_value,
                "n": g.n,
                "accuracy": g.accuracy,
                "positive_rate": g.positive_rate,
                "tpr": g.true_positive_rate,
                "fpr": g.false_positive_rate,
                "ppv": g.positive_predictive_value,
            }
            for r in self.results
            for g in r.groups
        ]
        return pd.DataFrame(rows)

    @property
    def passed(self) -> bool:
        """True if every accuracy gap is <= ``accuracy_gap_threshold``.

        Also True when ``results`` is empty, because there is nothing to fail.
        """
        return all(r.accuracy_gap <= self.accuracy_gap_threshold for r in self.results)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class FairnessReport:
    """Run a fairness analysis across demographic groups.

    Parameters
    ----------
    data : pd.DataFrame
        Dataset with predictions and demographic columns. Any index is
        accepted; rows are matched to groups by position, not by label.
    y_true : str
        Column name for ground-truth binary labels (0/1).
    y_pred : str
        Column name for predicted binary labels (0/1).
    protected_columns : list[str] or str, optional
        Demographic columns to test. A single column name may be given as a
        plain string. Defaults to ["SEXP", "BPLP", "profile_name"].
    min_group_size : int
        Minimum observations per group to include (default 30).

    Raises
    ------
    ValueError
        If any of the named columns is missing from ``data``.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        y_true: str,
        y_pred: str,
        protected_columns: Optional[Sequence[str]] = None,
        min_group_size: int = 30,
    ):
        self.data = data
        self.y_true = y_true
        self.y_pred = y_pred
        # A bare string is itself a sequence of characters; treat it as one
        # column name rather than letting list() split it apart.
        if isinstance(protected_columns, str):
            protected_columns = [protected_columns]
        self.protected_columns = list(
            protected_columns or ["SEXP", "BPLP", "profile_name"]
        )
        self.min_group_size = min_group_size

        # Validate inputs
        for col in [y_true, y_pred] + self.protected_columns:
            if col not in data.columns:
                raise ValueError(f"Column '{col}' not found in data")

    def run(self) -> "FairnessResults":
        """Compute fairness metrics for all protected columns.

        Groups with fewer than ``min_group_size`` rows are dropped, and a
        column left with fewer than two groups is omitted from the results
        because no gap can be computed for it.

        Returns
        -------
        FairnessResults
            Per-column results plus overall accuracy and row count.
        """
        yt = self.data[self.y_true].values.astype(int)
        yp = self.data[self.y_pred].values.astype(int)
        overall_acc = (yt == yp).mean()

        results = []
        for col in self.protected_columns:
            # yt / yp are plain NumPy arrays and must be indexed by position.
            # Grouping a positionally re-indexed copy of the column yields
            # positions directly, which stays correct when `data` carries a
            # non-default index (e.g. a filtered or shuffled subset).
            keys = self.data[col].reset_index(drop=True)

            groups = []
            for val, members in keys.groupby(keys):
                if len(members) < self.min_group_size:
                    continue
                pos = members.index.to_numpy()
                gt = yt[pos]
                pr = yp[pos]
                n = len(gt)
                acc = (gt == pr).mean()
                pos_rate = pr.mean()

                # TPR / FPR / PPV. Each falls back to 0.0 when its
                # denominator is empty (no positives, no negatives, or no
                # positive predictions in the group).
                pos_mask = gt == 1
                neg_mask = gt == 0
                tpr = pr[pos_mask].mean() if pos_mask.sum() > 0 else 0.0
                fpr = pr[neg_mask].mean() if neg_mask.sum() > 0 else 0.0
                ppv = gt[pr == 1].mean() if (pr == 1).sum() > 0 else 0.0

                groups.append(GroupMetrics(
                    group_name=col,
                    group_value=str(val),
                    n=n,
                    accuracy=float(acc),
                    positive_rate=float(pos_rate),
                    true_positive_rate=float(tpr),
                    false_positive_rate=float(fpr),
                    positive_predictive_value=float(ppv),
                ))

            if len(groups) < 2:
                continue

            accs = [g.accuracy for g in groups]
            prs = [g.positive_rate for g in groups]
            tprs = [g.true_positive_rate for g in groups]
            fprs = [g.false_positive_rate for g in groups]

            results.append(ProtectedColumnResult(
                column=col,
                groups=groups,
                accuracy_gap=max(accs) - min(accs),
                demographic_parity_gap=max(prs) - min(prs),
                equalised_odds_gap=max(
                    max(tprs) - min(tprs),
                    max(fprs) - min(fprs),
                ),
                min_group_size=min(g.n for g in groups),
            ))

        return FairnessResults(
            results=results,
            overall_accuracy=float(overall_acc),
            n_total=len(self.data),
        )
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: verosynthea-validator
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Fairness testing for ML models using Australian demographic data
|
|
5
|
+
Author-email: Verosynthea <hello@verosynthea.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://verosynthea.com/for-ai-labs
|
|
8
|
+
Project-URL: Repository, https://github.com/verosynthea/verosynthea-validator
|
|
9
|
+
Project-URL: Dataset, https://huggingface.co/datasets/vero-synthea/ausynth-sample
|
|
10
|
+
Project-URL: Documentation, https://verosynthea.com/about
|
|
11
|
+
Keywords: fairness,ml,demographics,australia,synthetic-data,census
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: pandas>=1.5
|
|
21
|
+
Requires-Dist: numpy>=1.23
|
|
22
|
+
Provides-Extra: hf
|
|
23
|
+
Requires-Dist: datasets>=2.0; extra == "hf"
|
|
24
|
+
Provides-Extra: paid
|
|
25
|
+
Requires-Dist: httpx>=0.24; extra == "paid"
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
28
|
+
Requires-Dist: scikit-learn>=1.2; extra == "dev"
|
|
29
|
+
|
|
30
|
+
# verosynthea-validator
|
|
31
|
+
|
|
32
|
+
Fairness testing for ML models using Census-calibrated synthetic Australian demographic data. One line to check whether your model treats demographic groups equally.
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install verosynthea-validator
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
from verosynthea_validator import FairnessReport
|
|
40
|
+
|
|
41
|
+
report = FairnessReport(
|
|
42
|
+
data=test_data,
|
|
43
|
+
y_true="label",
|
|
44
|
+
y_pred="prediction",
|
|
45
|
+
protected_columns=["SEXP", "BPLP", "profile_name"],
|
|
46
|
+
)
|
|
47
|
+
results = report.run()
|
|
48
|
+
print(results.summary())
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Output:
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
Fairness Report (n=5,000, overall accuracy=0.847)
|
|
55
|
+
============================================================
|
|
56
|
+
|
|
57
|
+
[PASS] SEXP (2 groups, smallest n=2,451)
|
|
58
|
+
Accuracy gap: 0.012
|
|
59
|
+
Demographic parity gap: 0.008
|
|
60
|
+
Equalised odds gap: 0.015
|
|
61
|
+
|
|
62
|
+
[FAIL] BPLP (3 groups, smallest n=312)
|
|
63
|
+
Accuracy gap: 0.073
|
|
64
|
+
Demographic parity gap: 0.091
|
|
65
|
+
Equalised odds gap: 0.064
|
|
66
|
+
|
|
67
|
+
============================================================
|
|
68
|
+
Overall: FAIL (worst gap: 0.073 on BPLP)
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## CI/CD gate
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from verosynthea_validator import assert_fair
|
|
75
|
+
|
|
76
|
+
# Fails the build if any group accuracy gap > 5%
|
|
77
|
+
assert_fair(test_data, "label", "prediction", max_accuracy_gap=0.05)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
In pytest:
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
def test_model_fairness():
|
|
84
|
+
predictions = model.predict(test_data)
|
|
85
|
+
test_data["y_pred"] = predictions
|
|
86
|
+
assert_fair(
|
|
87
|
+
test_data, "y_true", "y_pred",
|
|
88
|
+
protected_columns=["SEXP", "BPLP", "profile_name"],
|
|
89
|
+
max_accuracy_gap=0.05,
|
|
90
|
+
max_demographic_parity_gap=0.10,
|
|
91
|
+
)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## What it measures
|
|
95
|
+
|
|
96
|
+
For each protected column (e.g. sex, birthplace, demographic profile), the validator computes:
|
|
97
|
+
|
|
98
|
+
| Metric | What it checks |
|
|
99
|
+
|--------|---------------|
|
|
100
|
+
| **Accuracy gap** | Max accuracy difference between any two groups |
|
|
101
|
+
| **Demographic parity gap** | Max difference in selection rate (P(y_pred=1)) |
|
|
102
|
+
| **Equalised odds gap** | Max difference in true positive rate or false positive rate |
|
|
103
|
+
|
|
104
|
+
Groups with fewer than 30 observations are excluded (configurable via `min_group_size`).
|
|
105
|
+
|
|
106
|
+
## Why this instead of fairlearn or aif360?
|
|
107
|
+
|
|
108
|
+
Those are general-purpose fairness frameworks. This package is purpose-built for Australian demographics:
|
|
109
|
+
|
|
110
|
+
- **Pre-loaded demographic data.** The free tier includes 5,000 synthetic individuals from [AUSynth](https://huggingface.co/datasets/vero-synthea/ausynth-sample) with 25 Census-calibrated variables. No need to source your own protected attributes.
|
|
111
|
+
- **8 demographic profiles.** AUSynth clusters every person into one of 8 profiles (High-earning professionals, Young singles, Retired, etc.) — a richer protected attribute than just age or sex.
|
|
112
|
+
- **Australia-specific calibration.** Variables match ABS Census 2021 categories exactly. Income brackets, occupation codes, education levels, birthplace regions — all in Australian standard classifications.
|
|
113
|
+
- **One-line CI gate.** `assert_fair()` drops into pytest with zero configuration.
|
|
114
|
+
|
|
115
|
+
## Data tiers
|
|
116
|
+
|
|
117
|
+
| Tier | Data | Cost |
|
|
118
|
+
|------|------|------|
|
|
119
|
+
| **Free** | 5,000-row Paddington 4064 sample from [Hugging Face](https://huggingface.co/datasets/vero-synthea/ausynth-sample) | $0 |
|
|
120
|
+
| **Paid** | Full national dataset (32M individuals, 15,352 suburbs) via API | [verosynthea.com](https://verosynthea.com) |
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from verosynthea_validator import load_ausynth_sample
|
|
124
|
+
|
|
125
|
+
# Free tier (downloads from HF on first call)
|
|
126
|
+
df = load_ausynth_sample()
|
|
127
|
+
|
|
128
|
+
# Paid tier
|
|
129
|
+
df = load_ausynth_sample(api_key="vero_...", geography="bondi-2026-nsw")
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## The 8 demographic profiles
|
|
133
|
+
|
|
134
|
+
| ID | Name | Typical characteristics |
|
|
135
|
+
|----|------|------------------------|
|
|
136
|
+
| 0 | Labourers and operators | Blue-collar, lower income |
|
|
137
|
+
| 1 | Young singles and non-workers | Under 25, students, NILF |
|
|
138
|
+
| 2 | Children | Under 15 |
|
|
139
|
+
| 3 | Non-earning dependants | Adults not in workforce |
|
|
140
|
+
| 4 | Trades and technical workers | Certificate-qualified, mid income |
|
|
141
|
+
| 5 | Established partnered households | Married, mid-career |
|
|
142
|
+
| 6 | Retired and semi-retired | Over 60, pension income |
|
|
143
|
+
| 7 | High-earning professionals | Degree-qualified, professional occupations |
|
|
144
|
+
|
|
145
|
+
## Installation
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
pip install verosynthea-validator # core (pandas + numpy)
|
|
149
|
+
pip install verosynthea-validator[hf] # + Hugging Face datasets loader
|
|
150
|
+
pip install verosynthea-validator[paid] # + httpx for API access
|
|
151
|
+
pip install verosynthea-validator[dev] # + pytest + sklearn for development
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## Links
|
|
155
|
+
|
|
156
|
+
- **Dataset:** [vero-synthea/ausynth-sample](https://huggingface.co/datasets/vero-synthea/ausynth-sample) on Hugging Face
|
|
157
|
+
- **Full product:** [verosynthea.com](https://verosynthea.com)
|
|
158
|
+
- **Methodology:** [verosynthea.com/about](https://verosynthea.com/about)
|
|
159
|
+
|
|
160
|
+
## Citation
|
|
161
|
+
|
|
162
|
+
```
|
|
163
|
+
Verosynthea AUSynth (2026). Synthetic Australian Census Data.
|
|
164
|
+
https://verosynthea.com
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## License
|
|
168
|
+
|
|
169
|
+
MIT
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
tests/test_fairness.py
|
|
4
|
+
verosynthea_validator/__init__.py
|
|
5
|
+
verosynthea_validator/assertions.py
|
|
6
|
+
verosynthea_validator/data.py
|
|
7
|
+
verosynthea_validator/fairness.py
|
|
8
|
+
verosynthea_validator.egg-info/PKG-INFO
|
|
9
|
+
verosynthea_validator.egg-info/SOURCES.txt
|
|
10
|
+
verosynthea_validator.egg-info/dependency_links.txt
|
|
11
|
+
verosynthea_validator.egg-info/requires.txt
|
|
12
|
+
verosynthea_validator.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
verosynthea_validator
|