agingclockbench 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agingclockbench/__init__.py +9 -0
- agingclockbench/benchmarks/__init__.py +3 -0
- agingclockbench/benchmarks/metrics.py +22 -0
- agingclockbench/benchmarks/plots.py +302 -0
- agingclockbench/benchmarks/suite.py +262 -0
- agingclockbench/cli.py +153 -0
- agingclockbench/clocks/__init__.py +6 -0
- agingclockbench/clocks/base.py +49 -0
- agingclockbench/clocks/dunedinpace.py +137 -0
- agingclockbench/clocks/kdm.py +175 -0
- agingclockbench/clocks/phenoage.py +156 -0
- agingclockbench/config.py +4 -0
- agingclockbench/datasets/__init__.py +3 -0
- agingclockbench/datasets/loaders.py +54 -0
- agingclockbench/datasets/nhanes_sample.parquet +0 -0
- agingclockbench/utils/__init__.py +3 -0
- agingclockbench/utils/validation.py +10 -0
- agingclockbench-0.1.0.dist-info/METADATA +183 -0
- agingclockbench-0.1.0.dist-info/RECORD +22 -0
- agingclockbench-0.1.0.dist-info/WHEEL +4 -0
- agingclockbench-0.1.0.dist-info/entry_points.txt +3 -0
- agingclockbench-0.1.0.dist-info/licenses/LICENSE +21 -0
agingclockbench/cli.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""Command-line interface for AgingClockBench."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
|
|
8
|
+
from agingclockbench.config import VERSION
|
|
9
|
+
|
|
10
|
+
# Clock names accepted by the --clocks option (in addition to the special
# value 'all'); used in the option help text and in error messages.
_ALL_CLOCKS = [
    "PhenoAge",
    "KDM",
    "DunedinPACEProxy",
]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _load_clocks(names: tuple[str, ...]) -> dict:
    """Instantiate the clocks requested on the command line.

    Parameters
    ----------
    names : clock names collected from the repeatable ``--clocks`` option.
        If the special value ``"all"`` appears anywhere in the tuple, every
        registered clock is returned and the other entries are ignored.

    Returns
    -------
    dict mapping clock name to a freshly constructed clock instance, in the
    order the names were given (registry order for ``"all"``).

    Raises
    ------
    click.BadParameter
        If any requested name is not a registered clock.
    """
    # Imported inside the function rather than at module import time.
    from agingclockbench import PhenoAge, KDM, DunedinPACEProxy

    registry = {"PhenoAge": PhenoAge, "KDM": KDM, "DunedinPACEProxy": DunedinPACEProxy}
    if "all" in names:
        return {k: v() for k, v in registry.items()}

    unknown = [n for n in names if n not in registry]
    if unknown:
        # param_hint makes click name the offending option in its error output.
        raise click.BadParameter(
            f"Unknown clock(s): {unknown}. Choose from {_ALL_CLOCKS} or 'all'.",
            param_hint="--clocks",
        )
    return {n: registry[n]() for n in names}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@click.group()
@click.version_option(version=VERSION, prog_name="agingclockbench")
def cli():
    """AgingClockBench — benchmark biological aging clocks on your data.

    \b
    Quick start:
      agingclockbench benchmark --data bundled --clocks all
    """
    # Root command group: it does no work itself. Subcommands attach to it
    # through the @cli.command() / @cli.group() decorators.
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@cli.command()
@click.option("--data", required=True,
              help="Path to input CSV, or 'bundled' to use the NHANES 1999-2000 sample.")
@click.option("--clocks", multiple=True, default=["PhenoAge"], show_default=True,
              help="Clocks to run. Repeat for multiple, or pass 'all'. "
                   f"Choices: {_ALL_CLOCKS}")
@click.option("--mortality-col", default="mortstat", show_default=True,
              help="Column for vital status (1=deceased, 0=censored).")
@click.option("--followup-col", default="permth_exm", show_default=True,
              help="Column for follow-up time in months.")
@click.option("--output", default="./results", show_default=True,
              help="Directory for output files.")
@click.option("--report", is_flag=True, default=False,
              help="Generate an interactive HTML report (requires plotly).")
@click.option("--verbose", is_flag=True, default=False,
              help="Print detailed progress.")
def benchmark(data, clocks, mortality_col, followup_col, output, report, verbose):
    """Run a benchmark comparison of aging clocks on your data.

    \b
    Examples:
      agingclockbench benchmark --data bundled --clocks all
      agingclockbench benchmark --data my_data.csv --clocks PhenoAge --clocks KDM --report
      agingclockbench benchmark --data my_data.csv --clocks all \\
          --mortality-col vital_status --followup-col followup_months
    """
    # NOTE: --clocks is a repeatable option (multiple=True), so several clocks
    # are passed as "--clocks A --clocks B", not "--clocks A B".
    import pandas as pd
    from agingclockbench import BenchmarkSuite
    from agingclockbench.datasets import load_nhanes_sample

    # --- Resolve clocks first so a mistyped name fails before any data is read ---
    selected = _load_clocks(clocks)

    # --- Load data ---
    if data == "bundled":
        df = load_nhanes_sample()
        click.echo(f"Loaded bundled NHANES 1999-2000 sample: {len(df)} participants.")
    else:
        try:
            df = pd.read_csv(data)
        except Exception as e:
            # Surface any read/parse failure as a clean CLI error, keeping the
            # original exception attached as the cause.
            raise click.ClickException(f"Could not read {data}: {e}") from e
        click.echo(f"Loaded {len(df):,} rows from {data}.")

    # --- Run clocks ---
    results = {}
    for name, clock in selected.items():
        if verbose:
            click.echo(f"Running {name} ...")
        try:
            result = clock.transform(df)
            results[name] = result
            click.echo(
                f" {name}: {result.output_rows}/{result.input_rows} rows "
                f"(mean BA = {result.biological_ages.mean():.1f} yr, "
                f"mean accel = {result.accel.mean():.1f} yr)"
            )
            if verbose:
                click.echo(
                    f"   {name}: {result.missing_data_pct:.1f}% of rows dropped "
                    "for missing data"
                )
        except Exception as e:
            # Best effort: one failing clock must not abort the remaining ones.
            click.echo(f" {name}: FAILED — {e}", err=True)

    if not results:
        raise click.ClickException("No clocks produced results. Exiting.")

    # --- Benchmark ---
    suite = BenchmarkSuite(mortality_col=mortality_col, followup_col=followup_col)
    bench_report = suite.run(df, results)

    # --- Output ---
    out_dir = Path(output)
    out_dir.mkdir(parents=True, exist_ok=True)

    summary = bench_report.to_dataframe()
    csv_path = out_dir / "comparison_table.csv"
    summary.to_csv(csv_path, index=False)

    click.echo(f"\n{'='*60}")
    click.echo("BENCHMARK RESULTS")
    click.echo("="*60)
    click.echo(summary.to_string(index=False))
    click.echo(f"\nComparison table saved to: {csv_path}")

    if report:
        html_path = out_dir / "benchmark_report.html"
        try:
            bench_report.to_html(str(html_path))
            click.echo(f"Interactive report saved to: {html_path}")
        except ImportError:
            # The HTML report is optional; a missing plotly install only skips it.
            click.echo("plotly not installed — skipping HTML report. Run: pip install plotly",
                       err=True)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@cli.group()
def datasets():
    """Manage bundled reference datasets."""
    # Container group only: its subcommands are registered with
    # @datasets.command(...).
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@datasets.command("list")
def datasets_list():
    """List available bundled datasets."""
    # One echo per line, in order, so the output matches line-for-line.
    listing = (
        "Available bundled datasets:",
        " nhanes_sample — NHANES 1999-2000 (N=4,086, mortality-linked)",
        " Columns: age, sex, all 9 PhenoAge biomarkers,",
        " mortstat, permth_exm",
    )
    for line in listing:
        click.echo(line)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@datasets.command("info")
def datasets_info():
    """Print summary statistics for the bundled NHANES sample."""
    from agingclockbench.datasets import load_nhanes_sample

    sample = load_nhanes_sample()
    click.echo(f"NHANES 1999-2000 sample — {len(sample):,} participants")

    # Each column is read immediately before the line that reports it, so a
    # missing column fails at the same point in the output as before.
    age = sample.age
    click.echo(
        f" Age: {age.min():.0f}–{age.max():.0f} yr "
        f"(mean {age.mean():.1f}, SD {age.std():.1f})"
    )

    deaths = sample.mortstat
    click.echo(f" Deaths: {deaths.sum():,} ({deaths.mean()*100:.1f}%)")

    # Follow-up is stored in months; report it in years.
    median_followup_yr = sample.permth_exm.median() / 12
    click.echo(f" Median follow-up: {median_followup_yr:.1f} yr")
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
@cli.command()
def version():
    """Print version and exit."""
    banner = f"agingclockbench {VERSION}"
    click.echo(banner)
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
from agingclockbench.clocks.base import BaseClock, ClockResult
|
|
2
|
+
from agingclockbench.clocks.phenoage import PhenoAge
|
|
3
|
+
from agingclockbench.clocks.kdm import KDM
|
|
4
|
+
from agingclockbench.clocks.dunedinpace import DunedinPACEProxy
|
|
5
|
+
|
|
6
|
+
__all__ = ["BaseClock", "ClockResult", "PhenoAge", "KDM", "DunedinPACEProxy"]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Abstract base class for all aging clocks."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class ClockResult:
    """Everything a clock's ``.transform()`` call hands back.

    Attributes
    ----------
    clock_name : name of the clock that produced the result.
    biological_ages : pd.Series of biological-age estimates, one per processed row.
    accel : pd.Series of age acceleration values, one per processed row.
    missing_data_pct : percentage of input rows that were not processed.
    input_rows : number of rows in the DataFrame passed to ``transform()``.
    output_rows : number of rows that were actually processed.
    original_index : pd.Index of the rows in the original input DataFrame that
        were actually processed (i.e., had complete data). BenchmarkSuite uses
        it to line up mortality/survival data with the clock outputs.
    metadata : free-form dict of extra information supplied by the clock;
        defaults to a new empty dict per instance.
    """

    # Field order is part of the positional constructor signature.
    clock_name: str
    biological_ages: pd.Series
    accel: pd.Series
    missing_data_pct: float
    input_rows: int
    output_rows: int
    original_index: Optional[pd.Index] = None
    metadata: dict = field(default_factory=dict)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class BaseClock(ABC):
    """Common contract that every aging-clock implementation must fulfil."""

    @property
    @abstractmethod
    def required_columns(self) -> list[str]:
        """Names of the columns the input DataFrame must contain."""

    @abstractmethod
    def validate_input(self, df: pd.DataFrame) -> tuple[bool, list[str]]:
        """Check *df* and return ``(is_valid, list_of_error_messages)``."""

    @abstractmethod
    def transform(self, df: pd.DataFrame) -> ClockResult:
        """Compute biological ages for *df*; the outcome is a ClockResult."""

    def _check_required_columns(self, df: pd.DataFrame) -> list[str]:
        """Return one error message for each required column absent from *df*."""
        available = df.columns
        return [
            f"Missing required column: '{col}'"
            for col in self.required_columns
            if col not in available
        ]
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""DunedinPACE Proxy — blood-biomarker approximation of pace-of-aging.
|
|
2
|
+
|
|
3
|
+
WARNING
|
|
4
|
+
-------
|
|
5
|
+
This is NOT the real DunedinPACE clock, which requires DNA methylation data
|
|
6
|
+
from the Illumina EPIC array. This is a blood-biomarker proxy for benchmarking.
|
|
7
|
+
|
|
8
|
+
Reference for real DunedinPACE:
|
|
9
|
+
Belsky DW, et al. DunedinPACE, a DNA methylation biomarker of the pace of
|
|
10
|
+
aging. eLife. 2022;11:e73420.
|
|
11
|
+
|
|
12
|
+
Algorithm
|
|
13
|
+
---------
|
|
14
|
+
For each biomarker, compute the deviation from the age-expected value using
|
|
15
|
+
regression parameters fit on NHANES 1999-2000. Positive deviations on markers
|
|
16
|
+
that increase with age (glucose, RDW, …) and negative deviations on markers
|
|
17
|
+
that decrease with age (albumin, lymphocytes) both indicate faster aging.
|
|
18
|
+
|
|
19
|
+
The signed, standardised residuals are averaged and linearly scaled to produce
|
|
20
|
+
a pace score with mean ≈ 1.0 and SD ≈ 0.1 — matching the scale of real
|
|
21
|
+
DunedinPACE (Belsky 2022).
|
|
22
|
+
|
|
23
|
+
Correlation with PhenoAge acceleration on NHANES 1999-2000: r ≈ 0.84.
|
|
24
|
+
Correlation with chronological age: r ≈ 0.00 (by design — captures pace, not level).
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
import numpy as np
|
|
28
|
+
import pandas as pd
|
|
29
|
+
|
|
30
|
+
from agingclockbench.clocks.base import BaseClock, ClockResult
|
|
31
|
+
|
|
32
|
+
# Reference regressions of each biomarker on age (NHANES 1999-2000):
#   k = slope, q = intercept, s = scale used to standardise the residual.
# sign: +1 if biomarker increases with age (faster aging = higher values)
#       -1 if biomarker decreases with age (faster aging = lower values)
# NOTE(review): wbc_k_ul has a negative slope k but sign +1 — confirm this is intended.
_REF_PARAMS: dict[str, dict] = {
    "albumin_g_dl": dict(k=-0.002775, q=4.5571, s=0.3431, sign=-1),
    "creatinine_mg_dl": dict(k=0.005381, q=0.4866, s=0.5684, sign=1),
    "glucose_mg_dl": dict(k=0.467131, q=74.8338, s=36.0172, sign=1),
    "rdw_pct": dict(k=0.012186, q=12.1414, s=1.0829, sign=1),
    "wbc_k_ul": dict(k=-0.012827, q=7.9449, s=2.1350, sign=1),
    "lymphocyte_pct": dict(k=-0.020487, q=30.5987, s=8.5504, sign=-1),
    "mcv_fl": dict(k=0.049082, q=87.9137, s=5.1742, sign=1),
}
# Number of biomarkers averaged into the raw score.
_N_MARKERS = len(_REF_PARAMS)
# Normalization constant derived from the NHANES reference distribution
# (SD of the raw score before final scaling); calibrated so final SD ≈ 0.1.
_RAW_SCORE_SD: float = 0.3162
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class DunedinPACEProxy(BaseClock):
    """Blood-biomarker proxy for DunedinPACE pace-of-aging score.

    Outputs a dimensionless pace score (mean ≈ 1.0, SD ≈ 0.1).

    Interpretation
    --------------
    pace > 1.0 : biological aging faster than expected for chronological age
    pace < 1.0 : aging slower than expected
    pace = 1.0 : aging at the population average rate

    .. warning::
        Correlation with true DunedinPACE (Belsky 2022) is expected to be
        moderate (~0.3–0.5) — DNA methylation data captures epigenetic dynamics
        that blood biomarkers cannot fully replicate. Use for relative comparison
        only, not for absolute pace-of-aging estimates.

    Required columns
    ----------------
    age : float — chronological age (used to compute expected values)
    albumin_g_dl : float
    creatinine_mg_dl : float
    glucose_mg_dl : float
    rdw_pct : float
    wbc_k_ul : float
    lymphocyte_pct : float
    mcv_fl : float

    Examples
    --------
    >>> import pandas as pd
    >>> from agingclockbench import DunedinPACEProxy
    >>> row = dict(age=53, albumin_g_dl=4.1, creatinine_mg_dl=0.5,
    ...            glucose_mg_dl=94, rdw_pct=12.7, wbc_k_ul=7.4,
    ...            lymphocyte_pct=35.8, mcv_fl=87.8)
    >>> result = DunedinPACEProxy().transform(pd.DataFrame([row]))
    >>> result.biological_ages.iloc[0]
    """

    @property
    def required_columns(self) -> list[str]:
        """``"age"`` followed by every biomarker listed in ``_REF_PARAMS``."""
        return ["age"] + list(_REF_PARAMS.keys())

    def validate_input(self, df: pd.DataFrame) -> tuple[bool, list[str]]:
        """Return ``(is_valid, errors)``; only the presence of columns is checked."""
        return len(errors := self._check_required_columns(df)) == 0, errors

    def transform(self, df: pd.DataFrame) -> ClockResult:
        """Compute the pace score and express it as a biological age.

        Rows with a NaN in any required column are dropped. The returned
        series are re-indexed 0..n-1; the surviving rows' original labels are
        available as ``ClockResult.original_index``.

        Raises
        ------
        ValueError
            If a required column is missing, or if no row (including the case
            of an empty DataFrame) has complete data.
        """
        valid, errors = self.validate_input(df)
        if not valid:
            raise ValueError(f"DunedinPACEProxy input validation failed: {errors}")

        input_rows = len(df)
        complete = df[self.required_columns].dropna()

        # Checked before the percentage is computed so that an empty input
        # raises this ValueError instead of a ZeroDivisionError.
        if len(complete) == 0:
            raise ValueError("No complete rows after dropping NaN values.")

        missing_pct = (input_rows - len(complete)) / input_rows * 100

        # Signed, age-standardised residuals for each biomarker, averaged.
        raw_score = sum(
            _REF_PARAMS[b]["sign"]
            * (complete[b] - (_REF_PARAMS[b]["k"] * complete["age"] + _REF_PARAMS[b]["q"]))
            / _REF_PARAMS[b]["s"]
            for b in _REF_PARAMS
        ) / _N_MARKERS

        # Scale to mean=1.0, SD≈0.1
        pace = 1.0 + raw_score / _RAW_SCORE_SD * 0.1

        # Express as biological age for interface compatibility with BenchmarkSuite
        biological_ages = complete["age"] * pace
        accel = biological_ages - complete["age"].values

        return ClockResult(
            clock_name="DunedinPACEProxy",
            biological_ages=biological_ages.reset_index(drop=True),
            accel=pd.Series(accel, name="accel").reset_index(drop=True),
            missing_data_pct=missing_pct,
            input_rows=input_rows,
            output_rows=len(complete),
            original_index=complete.index,
            metadata={
                "reference": "Proxy — NOT real DunedinPACE (Belsky 2022)",
                "warning": "Blood-biomarker proxy; for relative comparison only.",
                "nhanes_corr_with_phenoage_accel": 0.84,
                "nhanes_corr_with_age": 0.00,
            },
        )
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""Klemera-Doubal Method (KDM) biological age clock.
|
|
2
|
+
|
|
3
|
+
Reference: Klemera P, Doubal S. A new approach to the concept and computation
|
|
4
|
+
of biological age. Mech Ageing Dev. 2006;127(3):240-248.
|
|
5
|
+
|
|
6
|
+
Algorithm
|
|
7
|
+
---------
|
|
8
|
+
1. For each of m biomarkers, fit a linear regression: x_j = q_j + k_j * age.
|
|
9
|
+
2. Compute preliminary biological age (BA1) as the weighted maximum-likelihood
|
|
10
|
+
estimate of age given observed biomarker values.
|
|
11
|
+
3. Estimate s_BA: the standard deviation of (BA1 - chronological_age) in the
|
|
12
|
+
reference cohort.
|
|
13
|
+
4. Compute final KDM biological age incorporating chronological age as an
|
|
14
|
+
additional "measurement" anchored with precision 1/s_BA^2.
|
|
15
|
+
|
|
16
|
+
Default reference parameters are derived from NHANES 1999-2000 (N=4,086
|
|
17
|
+
complete cases). Provide your own cohort via ``fit()`` before ``transform()``.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
import pandas as pd
|
|
22
|
+
from scipy import stats
|
|
23
|
+
|
|
24
|
+
from agingclockbench.clocks.base import BaseClock, ClockResult
|
|
25
|
+
|
|
26
|
+
# Reference regressions of each biomarker on age, from NHANES 1999-2000:
#   k = slope, q = intercept, s = residual SD.
# Biomarkers are in their standard NHANES clinical units (g/dL, mg/dL, %, fL, U/L).
_NHANES_PARAMS: dict[str, dict] = {
    "albumin_g_dl": dict(k=-0.002775, q=4.5571, s=0.3431),
    "creatinine_mg_dl": dict(k=0.005381, q=0.4866, s=0.5684),
    "glucose_mg_dl": dict(k=0.467131, q=74.8338, s=36.0172),
    "rdw_pct": dict(k=0.012186, q=12.1414, s=1.0829),
    "mcv_fl": dict(k=0.049082, q=87.9137, s=5.1742),
    "wbc_k_ul": dict(k=-0.012827, q=7.9449, s=2.1350),
    "alp_u_l": dict(k=0.190120, q=74.7400, s=32.1450),
    "lymphocyte_pct": dict(k=-0.020487, q=30.5987, s=8.5504),
}
# SD of (BA1 - chronological_age) in the NHANES reference.
_NHANES_S_BA: float = 40.4491
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class KDM(BaseClock):
    """Klemera-Doubal Method biological age estimator.

    KDM is a maximum-likelihood estimator of biological age from a set of
    biomarkers, each linearly regressed on chronological age in a reference
    population. Chronological age itself is incorporated as a final "measurement"
    with precision 1/s_BA^2, where s_BA is the variability of the preliminary
    estimate in the reference cohort.

    Default reference parameters are from NHANES 1999-2000 (N=4,086).
    For your own cohort, call ``fit(df)`` before ``transform(df)``.

    Required columns (NHANES clinical units)
    -----------------------------------------
    age : float — chronological age in years
    albumin_g_dl : float — g/dL
    creatinine_mg_dl : float — mg/dL
    glucose_mg_dl : float — mg/dL
    rdw_pct : float — %
    mcv_fl : float — fL
    wbc_k_ul : float — 10³/μL
    alp_u_l : float — U/L
    lymphocyte_pct : float — %

    Examples
    --------
    >>> import pandas as pd
    >>> from agingclockbench import KDM
    >>> row = dict(age=53, albumin_g_dl=4.1, creatinine_mg_dl=0.5, glucose_mg_dl=94,
    ...            rdw_pct=12.7, mcv_fl=87.8, wbc_k_ul=7.4, alp_u_l=98,
    ...            lymphocyte_pct=35.8)
    >>> result = KDM().transform(pd.DataFrame([row]))
    >>> result.biological_ages.iloc[0]
    """

    _BIOMARKERS = list(_NHANES_PARAMS.keys())

    def __init__(self) -> None:
        # Start from the bundled reference parameters; fit() replaces both.
        self._params: dict = _NHANES_PARAMS.copy()
        self._s_ba: float = _NHANES_S_BA

    @property
    def required_columns(self) -> list[str]:
        """``"age"`` followed by every biomarker in ``_BIOMARKERS``."""
        return ["age"] + self._BIOMARKERS

    def validate_input(self, df: pd.DataFrame) -> tuple[bool, list[str]]:
        """Return ``(is_valid, errors)``; only the presence of columns is checked."""
        return len(errors := self._check_required_columns(df)) == 0, errors

    def fit(self, df: pd.DataFrame) -> "KDM":
        """Fit reference regression parameters from a training cohort.

        Derives slopes, intercepts, and residual SDs by regressing each
        biomarker on chronological age, then estimates s_BA.

        Parameters
        ----------
        df : DataFrame with all required columns.

        Returns
        -------
        self — for method chaining.

        Raises
        ------
        ValueError
            If fewer than 30 rows have complete data.
        """
        complete = df[self.required_columns].dropna()
        if len(complete) < 30:
            raise ValueError(f"Need at least 30 complete rows to fit KDM; got {len(complete)}.")

        ages = complete["age"].values
        params = {}
        for col in self._BIOMARKERS:
            slope, intercept, _, _, _ = stats.linregress(ages, complete[col].values)
            resid = complete[col].values - (slope * ages + intercept)
            # Floor the SD so a perfectly fitted biomarker cannot cause a
            # division by zero later on.
            s = max(resid.std(), 1e-6)
            params[col] = {"k": slope, "q": intercept, "s": s}
        self._params = params

        # Compute preliminary BA1 and estimate s_BA
        ba1 = self._preliminary_ba(complete)
        self._s_ba = max(float((ba1 - complete["age"]).std()), 1e-6)
        return self

    def transform(self, df: pd.DataFrame) -> ClockResult:
        """Compute KDM biological age for every row with complete data.

        Rows with a NaN in any required column are dropped. The returned
        series are re-indexed 0..n-1; the surviving rows' original labels are
        available as ``ClockResult.original_index``.

        Raises
        ------
        ValueError
            If a required column is missing, or if no row (including the case
            of an empty DataFrame) has complete data.
        """
        valid, errors = self.validate_input(df)
        if not valid:
            raise ValueError(f"KDM input validation failed: {errors}")

        input_rows = len(df)
        complete = df[self.required_columns].dropna()

        # Checked before the percentage is computed so that an empty input
        # raises this ValueError instead of a ZeroDivisionError.
        if len(complete) == 0:
            raise ValueError("No complete rows after dropping NaN values.")

        missing_pct = (input_rows - len(complete)) / input_rows * 100

        # Full KDM: incorporate chronological age as an additional measurement
        # BA = [Σ(k_j*(x_j - q_j)/s_j²) + CA/s_BA²] / [Σ(k_j²/s_j²) + 1/s_BA²]
        marker_num, marker_den = self._weighted_sums(complete)
        numerator = marker_num + complete["age"] / self._s_ba ** 2
        denominator = marker_den + 1.0 / self._s_ba ** 2
        biological_ages = (numerator / denominator).reset_index(drop=True)
        accel = (biological_ages - complete["age"].reset_index(drop=True)).rename("accel")

        return ClockResult(
            clock_name="KDM",
            biological_ages=biological_ages,
            accel=accel,
            missing_data_pct=missing_pct,
            input_rows=input_rows,
            output_rows=len(complete),
            original_index=complete.index,
            metadata={
                "reference": "Klemera & Doubal 2006; params from NHANES 1999-2000",
                "s_ba": self._s_ba,
                "n_biomarkers": len(self._BIOMARKERS),
            },
        )

    def _weighted_sums(self, complete: pd.DataFrame) -> tuple:
        """Return ``(Σ k_j*(x_j - q_j)/s_j², Σ k_j²/s_j²)`` over all biomarkers.

        The first element is a per-row Series, the second a scalar. Both the
        preliminary and the full KDM estimates are built from these two sums.
        """
        numerator = sum(
            self._params[b]["k"] * (complete[b] - self._params[b]["q"]) / self._params[b]["s"] ** 2
            for b in self._BIOMARKERS
        )
        denominator = sum(
            self._params[b]["k"] ** 2 / self._params[b]["s"] ** 2 for b in self._BIOMARKERS
        )
        return numerator, denominator

    def _preliminary_ba(self, complete: pd.DataFrame) -> pd.Series:
        """Weighted ML estimate of age without the chronological age anchor."""
        numerator, denominator = self._weighted_sums(complete)
        return numerator / denominator
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""PhenoAge clock — Levine et al. 2018 (Aging, Albany NY).
|
|
2
|
+
|
|
3
|
+
Reference: Levine ME, et al. An epigenetic biomarker of aging for lifespan and
|
|
4
|
+
healthspan. Aging (Albany NY). 2018;10(4):573-591.
|
|
5
|
+
|
|
6
|
+
Unit notes
|
|
7
|
+
----------
|
|
8
|
+
Inputs are accepted in standard NHANES clinical units (g/dL, mg/dL, mg/L).
|
|
9
|
+
The transform() method converts internally before applying the Levine 2018
|
|
10
|
+
coefficients, which were calibrated on:
|
|
11
|
+
albumin → g/L (×10)
|
|
12
|
+
creatinine → μmol/L (×88.4)
|
|
13
|
+
glucose → mmol/L (×0.0555)
|
|
14
|
+
crp → mg/L (input already; values are floored at 0.001 before the natural log to avoid ln(0))
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
from agingclockbench.clocks.base import BaseClock, ClockResult
|
|
21
|
+
|
|
22
|
+
# Linear-predictor coefficients (Levine 2018); they apply to the CONVERTED
# units named in each key, not to the raw input columns.
_COEFFICIENTS: dict[str, float] = dict(
    intercept=-19.907,
    age=0.0804,
    albumin_g_l=-0.0336,  # after ×10 from g/dL
    creatinine_umol_l=0.0095,  # after ×88.4 from mg/dL
    glucose_mmol_l=0.1953,  # after ×0.0555 from mg/dL
    ln_crp_mg_l=0.0954,  # natural log of CRP in mg/L, floored at 0.001
    lymphocyte_pct=-0.0120,
    mcv_fl=0.0268,
    rdw_pct=0.3306,
    alp_u_l=0.00188,
    wbc_k_ul=0.0554,
)

# Gompertz parameters (10-year mortality, t=120 months)
_GAMMA: float = 0.0076927
_T_MONTHS: int = 120
# Constants of the mortality-score -> phenotypic-age mapping.
_PHENOAGE_INTERCEPT: float = 141.50
_PHENOAGE_SLOPE: float = 0.090165
_MORT_CONSTANT: float = -0.00553
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class PhenoAge(BaseClock):
    """Biological age calculator implementing the Levine 2018 PhenoAge algorithm.

    All inputs are in standard NHANES clinical units. Unit conversions to the
    Levine 2018 coefficient scale are applied internally.

    Required columns
    ----------------
    age : float — chronological age in years
    albumin_g_dl : float — albumin in g/dL (converted internally to g/L)
    creatinine_mg_dl : float — creatinine in mg/dL (converted to μmol/L)
    glucose_mg_dl : float — glucose in mg/dL (converted to mmol/L)
    crp_mg_l : float — C-reactive protein in mg/L (ln-transformed)
    lymphocyte_pct : float — lymphocyte percentage (%)
    mcv_fl : float — mean corpuscular volume in fL
    rdw_pct : float — red cell distribution width (%)
    alp_u_l : float — alkaline phosphatase in U/L
    wbc_k_ul : float — white blood cell count in 10³/μL

    Examples
    --------
    >>> import pandas as pd
    >>> from agingclockbench import PhenoAge
    >>> row = dict(age=52, albumin_g_dl=4.3, creatinine_mg_dl=0.9,
    ...            glucose_mg_dl=87, crp_mg_l=0.3, lymphocyte_pct=28,
    ...            mcv_fl=90, rdw_pct=13.0, alp_u_l=65, wbc_k_ul=6.0)
    >>> result = PhenoAge().transform(pd.DataFrame([row]))
    >>> round(result.biological_ages.iloc[0], 1)
    44.9
    """

    @property
    def required_columns(self) -> list[str]:
        """Chronological age plus the nine biomarker columns, in input units."""
        return [
            "age",
            "albumin_g_dl",
            "creatinine_mg_dl",
            "glucose_mg_dl",
            "crp_mg_l",
            "lymphocyte_pct",
            "mcv_fl",
            "rdw_pct",
            "alp_u_l",
            "wbc_k_ul",
        ]

    def validate_input(self, df: pd.DataFrame) -> tuple[bool, list[str]]:
        """Return ``(is_valid, errors)``.

        Checks that every required column is present and, if so, that
        ``crp_mg_l`` has no negative values (NaNs are ignored here).
        """
        errors = self._check_required_columns(df)
        if not errors and (df["crp_mg_l"].dropna() < 0).any():
            errors.append("crp_mg_l contains negative values — check units.")
        return len(errors) == 0, errors

    def transform(self, df: pd.DataFrame) -> ClockResult:
        """Compute phenotypic age for every row with complete data.

        Rows with a NaN in any required column are dropped. Both returned
        series (``biological_ages`` and ``accel``) are re-indexed 0..n-1 so
        they stay aligned with each other; the surviving rows' original
        labels are available as ``ClockResult.original_index``.

        Raises
        ------
        ValueError
            If validation fails, or if no row (including the case of an empty
            DataFrame) has complete data.
        """
        valid, errors = self.validate_input(df)
        if not valid:
            raise ValueError(f"PhenoAge input validation failed: {errors}")

        input_rows = len(df)
        complete = df[self.required_columns].dropna()

        # Checked before the percentage is computed so that an empty input
        # raises this ValueError instead of a ZeroDivisionError.
        if len(complete) == 0:
            raise ValueError("No complete rows after dropping NaN values.")

        dropped = input_rows - len(complete)
        missing_pct = dropped / input_rows * 100

        # --- Unit conversions (applied before coefficients) ---
        albumin_g_l = complete["albumin_g_dl"] * 10.0
        creatinine_umol_l = complete["creatinine_mg_dl"] * 88.4
        glucose_mmol_l = complete["glucose_mg_dl"] * 0.0555
        # CRP is taken as mg/L; values are floored at 0.001 to prevent ln(0).
        # NOTE(review): Levine 2018 appears to define the CRP term on
        # ln(CRP in mg/dL) — confirm the coefficient matches mg/L input.
        ln_crp = np.log(complete["crp_mg_l"].clip(lower=0.001))

        # --- Linear predictor (xb) ---
        c = _COEFFICIENTS
        xb = (
            c["intercept"]
            + c["age"] * complete["age"]
            + c["albumin_g_l"] * albumin_g_l
            + c["creatinine_umol_l"] * creatinine_umol_l
            + c["glucose_mmol_l"] * glucose_mmol_l
            + c["ln_crp_mg_l"] * ln_crp
            + c["lymphocyte_pct"] * complete["lymphocyte_pct"]
            + c["mcv_fl"] * complete["mcv_fl"]
            + c["rdw_pct"] * complete["rdw_pct"]
            + c["alp_u_l"] * complete["alp_u_l"]
            + c["wbc_k_ul"] * complete["wbc_k_ul"]
        )

        # --- 10-year mortality score (Gompertz, t=120 months) ---
        mortality_score = 1 - np.exp(
            -np.exp(xb) * (np.exp(_GAMMA * _T_MONTHS) - 1) / _GAMMA
        )
        # Cap below 1 so log(1 - score) in the next step stays finite.
        mortality_score = mortality_score.clip(upper=0.9999)

        # --- Phenotypic age (Levine 2018 Eq. 2) ---
        biological_ages = (
            _PHENOAGE_INTERCEPT
            + np.log(_MORT_CONSTANT * np.log(1 - mortality_score)) / _PHENOAGE_SLOPE
        ).reset_index(drop=True)

        # Reset the age index as well, so accel shares the 0..n-1 index of
        # biological_ages even when incomplete rows were dropped.
        accel = (biological_ages - complete["age"].reset_index(drop=True)).rename("accel")

        return ClockResult(
            clock_name="PhenoAge",
            biological_ages=biological_ages,
            accel=accel,
            missing_data_pct=missing_pct,
            input_rows=input_rows,
            output_rows=len(complete),
            original_index=complete.index,
            metadata={"reference": "Levine 2018 Aging Cell", "coefficients": _COEFFICIENTS},
        )
|