agingclockbench 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
agingclockbench/cli.py ADDED
@@ -0,0 +1,153 @@
1
+ """Command-line interface for AgingClockBench."""
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ import click
7
+
8
+ from agingclockbench.config import VERSION
9
+
10
+ _ALL_CLOCKS = ["PhenoAge", "KDM", "DunedinPACEProxy"]
11
+
12
+
13
def _load_clocks(names: tuple[str, ...]) -> dict:
    """Instantiate the clocks requested on the command line.

    Parameters
    ----------
    names : clock names as collected from ``--clocks``. The special value
        ``"all"`` selects every registered clock and takes precedence over
        any other name passed alongside it.

    Returns
    -------
    dict mapping each selected clock name to a newly constructed instance.

    Raises
    ------
    click.BadParameter
        If ``"all"`` was not requested and at least one name is not a
        registered clock.
    """
    # Imported at call time rather than at module import time.
    from agingclockbench import PhenoAge, KDM, DunedinPACEProxy

    registry = {"PhenoAge": PhenoAge, "KDM": KDM, "DunedinPACEProxy": DunedinPACEProxy}

    if "all" in names:
        return {name: factory() for name, factory in registry.items()}

    unknown = [name for name in names if name not in registry]
    if unknown:
        # The list of valid choices is taken from the registry itself so the
        # message cannot drift from what is actually accepted.
        raise click.BadParameter(
            f"Unknown clock(s): {unknown}. Choose from {list(registry)} or 'all'."
        )
    return {name: registry[name]() for name in names}
24
+
25
+
26
# Root command group. The docstring below is what ``--help`` prints; the
# ``\b`` marker on its own line tells click to leave the following paragraph
# unwrapped so the example command keeps its layout.
@click.group()
@click.version_option(version=VERSION, prog_name="agingclockbench")
def cli():
    """AgingClockBench — benchmark biological aging clocks on your data.

    \b
    Quick start:
      agingclockbench benchmark --data bundled --clocks all
    """
35
+
36
+
37
# ``benchmark`` sub-command.
#
# Flow: load the input table, run every selected clock on it, hand the
# per-clock results to BenchmarkSuite, then write a CSV summary (and
# optionally an HTML report) into the output directory.
#
# The docstring is the user-facing ``--help`` text. ``--clocks`` is declared
# with ``multiple=True``, so several clocks are selected by repeating the
# option; the help examples reflect that.
@cli.command()
@click.option("--data", required=True,
              help="Path to input CSV, or 'bundled' to use the NHANES 1999-2000 sample.")
@click.option("--clocks", multiple=True, default=["PhenoAge"], show_default=True,
              help="Clocks to run. Repeat for multiple, or pass 'all'. "
                   f"Choices: {_ALL_CLOCKS}")
@click.option("--mortality-col", default="mortstat", show_default=True,
              help="Column for vital status (1=deceased, 0=censored).")
@click.option("--followup-col", default="permth_exm", show_default=True,
              help="Column for follow-up time in months.")
@click.option("--output", default="./results", show_default=True,
              help="Directory for output files.")
@click.option("--report", is_flag=True, default=False,
              help="Generate an interactive HTML report (requires plotly).")
@click.option("--verbose", is_flag=True, default=False,
              help="Print detailed progress.")
def benchmark(data, clocks, mortality_col, followup_col, output, report, verbose):
    """Run a benchmark comparison of aging clocks on your data.

    \b
    Examples:
      agingclockbench benchmark --data bundled --clocks all
      agingclockbench benchmark --data my_data.csv --clocks PhenoAge --clocks KDM --report
      agingclockbench benchmark --data my_data.csv --clocks all \\
          --mortality-col vital_status --followup-col followup_months
    """
    import pandas as pd
    from agingclockbench import BenchmarkSuite
    from agingclockbench.datasets import load_nhanes_sample

    # --- Load data ---
    if data == "bundled":
        df = load_nhanes_sample()
        click.echo(f"Loaded bundled NHANES 1999-2000 sample: {len(df)} participants.")
    else:
        try:
            df = pd.read_csv(data)
        except Exception as e:
            # Any read/parse failure becomes a clean CLI error; chaining keeps
            # the underlying exception available for debugging.
            raise click.ClickException(f"Could not read {data}: {e}") from e
        click.echo(f"Loaded {len(df):,} rows from {data}.")

    # --- Run clocks ---
    selected = _load_clocks(clocks)
    results = {}
    for name, clock in selected.items():
        if verbose:
            click.echo(f"Running {name} ...")
        # Best effort: one failing clock is reported on stderr and the
        # remaining clocks still run.
        try:
            result = clock.transform(df)
            results[name] = result
            click.echo(
                f" {name}: {result.output_rows}/{result.input_rows} rows "
                f"(mean BA = {result.biological_ages.mean():.1f} yr, "
                f"mean accel = {result.accel.mean():.1f} yr)"
            )
            if verbose:
                click.echo(
                    f" {name}: {result.missing_data_pct:.1f}% of input rows "
                    "dropped for missing data"
                )
        except Exception as e:
            click.echo(f" {name}: FAILED — {e}", err=True)

    if not results:
        raise click.ClickException("No clocks produced results. Exiting.")

    # --- Benchmark ---
    suite = BenchmarkSuite(mortality_col=mortality_col, followup_col=followup_col)
    bench_report = suite.run(df, results)

    # --- Output ---
    out_dir = Path(output)
    out_dir.mkdir(parents=True, exist_ok=True)

    summary = bench_report.to_dataframe()
    csv_path = out_dir / "comparison_table.csv"
    summary.to_csv(csv_path, index=False)

    click.echo(f"\n{'='*60}")
    click.echo("BENCHMARK RESULTS")
    click.echo("="*60)
    click.echo(summary.to_string(index=False))
    click.echo(f"\nComparison table saved to: {csv_path}")

    if report:
        html_path = out_dir / "benchmark_report.html"
        try:
            bench_report.to_html(str(html_path))
            click.echo(f"Interactive report saved to: {html_path}")
        except ImportError:
            # The HTML report is optional; a missing plotting dependency
            # downgrades to a warning instead of failing the whole run.
            click.echo("plotly not installed — skipping HTML report. Run: pip install plotly",
                       err=True)
122
+
123
+
124
# Sub-group that holds the ``datasets list`` and ``datasets info`` commands.
# It has no behaviour of its own; the docstring is its ``--help`` text.
@cli.group()
def datasets():
    """Manage bundled reference datasets."""
127
+
128
+
129
# ``datasets list``: prints a fixed, hard-coded description of the bundled
# data; nothing is loaded from disk.
@datasets.command("list")
def datasets_list():
    """List available bundled datasets."""
    listing = (
        "Available bundled datasets:",
        " nhanes_sample — NHANES 1999-2000 (N=4,086, mortality-linked)",
        " Columns: age, sex, all 9 PhenoAge biomarkers,",
        " mortstat, permth_exm",
    )
    for line in listing:
        click.echo(line)
136
+
137
+
138
# ``datasets info``: loads the bundled sample and echoes a few descriptive
# statistics computed from its ``age``, ``mortstat`` and ``permth_exm``
# columns.
@datasets.command("info")
def datasets_info():
    """Print summary statistics for the bundled NHANES sample."""
    from agingclockbench.datasets import load_nhanes_sample

    sample = load_nhanes_sample()
    age = sample.age
    deaths = sample.mortstat
    # Follow-up is divided by 12 for display, i.e. reported in years.
    median_followup_years = sample.permth_exm.median() / 12

    click.echo(f"NHANES 1999-2000 sample — {len(sample):,} participants")
    click.echo(
        f" Age: {age.min():.0f}–{age.max():.0f} yr "
        f"(mean {age.mean():.1f}, SD {age.std():.1f})"
    )
    click.echo(f" Deaths: {deaths.sum():,} ({deaths.mean()*100:.1f}%)")
    click.echo(f" Median follow-up: {median_followup_years:.1f} yr")
148
+
149
+
150
# ``version`` sub-command: echoes the package name followed by VERSION.
@cli.command()
def version():
    """Print version and exit."""
    banner = f"agingclockbench {VERSION}"
    click.echo(banner)
@@ -0,0 +1,6 @@
1
+ from agingclockbench.clocks.base import BaseClock, ClockResult
2
+ from agingclockbench.clocks.phenoage import PhenoAge
3
+ from agingclockbench.clocks.kdm import KDM
4
+ from agingclockbench.clocks.dunedinpace import DunedinPACEProxy
5
+
6
+ __all__ = ["BaseClock", "ClockResult", "PhenoAge", "KDM", "DunedinPACEProxy"]
@@ -0,0 +1,49 @@
1
+ """Abstract base class for all aging clocks."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass, field
5
+ from typing import Optional
6
+
7
+ import pandas as pd
8
+
9
+
10
@dataclass
class ClockResult:
    """Container returned by a clock's ``transform()`` call.

    Attributes
    ----------
    clock_name : label of the clock that produced this result.
    biological_ages : one biological-age estimate per processed row.
    accel : age acceleration per processed row (biological age minus
        chronological age in the clocks of this package).
    missing_data_pct : percentage of input rows that were not processed.
    input_rows : number of rows in the DataFrame passed to ``transform()``.
    output_rows : number of rows that were actually processed.
    original_index : index labels, in the original input DataFrame, of the
        rows that were processed (those with complete data). BenchmarkSuite
        relies on it to line up mortality/survival data with clock outputs.
        ``None`` when a clock does not supply it.
    metadata : free-form extra information; defaults to a fresh empty dict
        for every instance.
    """

    # Identification and per-row outputs.
    clock_name: str
    biological_ages: pd.Series
    accel: pd.Series

    # Row accounting for the transform call.
    missing_data_pct: float
    input_rows: int
    output_rows: int

    # Optional extras; both have defaults, so they must stay last.
    original_index: Optional[pd.Index] = None
    metadata: dict = field(default_factory=dict)
29
+
30
+
31
class BaseClock(ABC):
    """Common interface that every aging clock has to implement.

    Subclasses provide ``required_columns``, ``validate_input`` and
    ``transform``; the column-presence check is shared here.
    """

    @property
    @abstractmethod
    def required_columns(self) -> list[str]:
        """Names of the columns the input DataFrame must contain."""

    @abstractmethod
    def validate_input(self, df: pd.DataFrame) -> tuple[bool, list[str]]:
        """Check *df* and return ``(is_valid, error_messages)``."""

    @abstractmethod
    def transform(self, df: pd.DataFrame) -> ClockResult:
        """Compute biological ages for *df* and return a ClockResult."""

    def _check_required_columns(self, df: pd.DataFrame) -> list[str]:
        """Return one error message per required column absent from *df*.

        Messages follow the order of ``required_columns``; the list is empty
        when every required column is present.
        """
        available = df.columns
        messages = []
        for column in self.required_columns:
            if column in available:
                continue
            messages.append(f"Missing required column: '{column}'")
        return messages
@@ -0,0 +1,137 @@
1
+ """DunedinPACE Proxy — blood-biomarker approximation of pace-of-aging.
2
+
3
+ WARNING
4
+ -------
5
+ This is NOT the real DunedinPACE clock, which requires DNA methylation data
6
+ from the Illumina EPIC array. This is a blood-biomarker proxy for benchmarking.
7
+
8
+ Reference for real DunedinPACE:
9
+ Belsky DW, et al. DunedinPACE, a DNA methylation biomarker of the pace of
10
+ aging. eLife. 2022;11:e73420.
11
+
12
+ Algorithm
13
+ ---------
14
+ For each biomarker, compute the deviation from the age-expected value using
15
+ regression parameters fit on NHANES 1999-2000. Positive deviations on markers
16
+ that increase with age (glucose, RDW, …) and negative deviations on markers
17
+ that decrease with age (albumin, lymphocytes) both indicate faster aging.
18
+
19
+ The signed, standardised residuals are averaged and linearly scaled to produce
20
+ a pace score with mean ≈ 1.0 and SD ≈ 0.1 — matching the scale of real
21
+ DunedinPACE (Belsky 2022).
22
+
23
+ Correlation with PhenoAge acceleration on NHANES 1999-2000: r ≈ 0.84.
24
+ Correlation with chronological age: r ≈ 0.00 (by design — captures pace, not level).
25
+ """
26
+
27
+ import numpy as np
28
+ import pandas as pd
29
+
30
+ from agingclockbench.clocks.base import BaseClock, ClockResult
31
+
32
+ # NHANES 1999-2000 reference regression params (biomarker ~ age).
33
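+ # sign: +1 if higher-than-expected values indicate faster aging
34
+ # -1 if lower-than-expected values indicate faster aging. NOTE(review): wbc_k_ul has a negative age slope (k < 0) but sign +1, so sign is not simply the direction of the age trend — confirm this is intended.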
35
+ _REF_PARAMS: dict[str, dict] = {
36
+ "albumin_g_dl": {"k": -0.002775, "q": 4.5571, "s": 0.3431, "sign": -1},
37
+ "creatinine_mg_dl": {"k": 0.005381, "q": 0.4866, "s": 0.5684, "sign": +1},
38
+ "glucose_mg_dl": {"k": 0.467131, "q": 74.8338, "s": 36.0172, "sign": +1},
39
+ "rdw_pct": {"k": 0.012186, "q": 12.1414, "s": 1.0829, "sign": +1},
40
+ "wbc_k_ul": {"k": -0.012827, "q": 7.9449, "s": 2.1350, "sign": +1},
41
+ "lymphocyte_pct": {"k": -0.020487, "q": 30.5987, "s": 8.5504, "sign": -1},
42
+ "mcv_fl": {"k": 0.049082, "q": 87.9137, "s": 5.1742, "sign": +1},
43
+ }
44
+ _N_MARKERS = len(_REF_PARAMS)
45
+ # Normalization constant derived from NHANES reference distribution
46
+ # (SD of raw score before final scaling)
47
+ _RAW_SCORE_SD: float = 0.3162 # calibrated so final SD ≈ 0.1
48
+
49
+
50
class DunedinPACEProxy(BaseClock):
    """Blood-biomarker proxy for DunedinPACE pace-of-aging score.

    Outputs a dimensionless pace score (mean ≈ 1.0, SD ≈ 0.1).

    Interpretation
    --------------
    pace > 1.0 : biological aging faster than expected for chronological age
    pace < 1.0 : aging slower than expected
    pace = 1.0 : aging at the population average rate

    .. warning::
        Correlation with true DunedinPACE (Belsky 2022) is expected to be
        moderate (~0.3–0.5) — DNA methylation data captures epigenetic dynamics
        that blood biomarkers cannot fully replicate. Use for relative comparison
        only, not for absolute pace-of-aging estimates.

    Required columns
    ----------------
    age : float — chronological age (used to compute expected values)
    albumin_g_dl : float
    creatinine_mg_dl : float
    glucose_mg_dl : float
    rdw_pct : float
    wbc_k_ul : float
    lymphocyte_pct : float
    mcv_fl : float

    Examples
    --------
    >>> import pandas as pd
    >>> from agingclockbench import DunedinPACEProxy
    >>> row = dict(age=53, albumin_g_dl=4.1, creatinine_mg_dl=0.5,
    ...            glucose_mg_dl=94, rdw_pct=12.7, wbc_k_ul=7.4,
    ...            lymphocyte_pct=35.8, mcv_fl=87.8)
    >>> result = DunedinPACEProxy().transform(pd.DataFrame([row]))
    >>> result.biological_ages.iloc[0]  # doctest: +SKIP
    """

    @property
    def required_columns(self) -> list[str]:
        """``age`` followed by every biomarker listed in ``_REF_PARAMS``."""
        return ["age"] + list(_REF_PARAMS.keys())

    def validate_input(self, df: pd.DataFrame) -> tuple[bool, list[str]]:
        """Return ``(is_valid, errors)``; only column presence is checked."""
        errors = self._check_required_columns(df)
        return len(errors) == 0, errors

    def transform(self, df: pd.DataFrame) -> ClockResult:
        """Compute the proxy pace score and express it as a biological age.

        Rows with a missing value in any required column are dropped before
        scoring. The returned series are re-indexed from 0; the labels of the
        surviving rows are kept in ``ClockResult.original_index``.

        Raises
        ------
        ValueError
            If a required column is absent, or if no row (including the case
            of an empty input) has complete data.
        """
        valid, errors = self.validate_input(df)
        if not valid:
            raise ValueError(f"DunedinPACEProxy input validation failed: {errors}")

        input_rows = len(df)
        complete = df[self.required_columns].dropna()

        # Checked before the percentage below so that an empty input raises
        # this ValueError instead of a ZeroDivisionError.
        if len(complete) == 0:
            raise ValueError("No complete rows after dropping NaN values.")

        missing_pct = (input_rows - len(complete)) / input_rows * 100

        age = complete["age"]

        # Signed, age-standardised residual for each biomarker, averaged over
        # all markers: sign * (observed - (k * age + q)) / s.
        raw_score = sum(
            params["sign"] * (complete[marker] - (params["k"] * age + params["q"])) / params["s"]
            for marker, params in _REF_PARAMS.items()
        ) / _N_MARKERS

        # Scale to mean=1.0, SD≈0.1
        pace = 1.0 + raw_score / _RAW_SCORE_SD * 0.1

        # Express as biological age for interface compatibility with BenchmarkSuite
        biological_ages = age * pace
        accel = (biological_ages - age).rename("accel")

        return ClockResult(
            clock_name="DunedinPACEProxy",
            biological_ages=biological_ages.reset_index(drop=True),
            accel=accel.reset_index(drop=True),
            missing_data_pct=missing_pct,
            input_rows=input_rows,
            output_rows=len(complete),
            original_index=complete.index,
            metadata={
                "reference": "Proxy — NOT real DunedinPACE (Belsky 2022)",
                "warning": "Blood-biomarker proxy; for relative comparison only.",
                "nhanes_corr_with_phenoage_accel": 0.84,
                "nhanes_corr_with_age": 0.00,
            },
        )
@@ -0,0 +1,175 @@
1
+ """Klemera-Doubal Method (KDM) biological age clock.
2
+
3
+ Reference: Klemera P, Doubal S. A new approach to the concept and computation
4
+ of biological age. Mech Ageing Dev. 2006;127(3):240-248.
5
+
6
+ Algorithm
7
+ ---------
8
+ 1. For each of m biomarkers, fit a linear regression: x_j = q_j + k_j * age.
9
+ 2. Compute preliminary biological age (BA1) as the weighted maximum-likelihood
10
+ estimate of age given observed biomarker values.
11
+ 3. Estimate s_BA: the standard deviation of (BA1 - chronological_age) in the
12
+ reference cohort.
13
+ 4. Compute final KDM biological age incorporating chronological age as an
14
+ additional "measurement" anchored with precision 1/s_BA^2.
15
+
16
+ Default reference parameters are derived from NHANES 1999-2000 (N=4,086
17
+ complete cases). Provide your own cohort via ``fit()`` before ``transform()``.
18
+ """
19
+
20
+ import numpy as np
21
+ import pandas as pd
22
+ from scipy import stats
23
+
24
+ from agingclockbench.clocks.base import BaseClock, ClockResult
25
+
26
+ # NHANES 1999-2000 reference regression parameters (slope k, intercept q, residual SD s).
27
+ # Biomarkers are in their standard NHANES clinical units (g/dL, mg/dL, %, fL, U/L).
28
+ _NHANES_PARAMS: dict[str, dict] = {
29
+ "albumin_g_dl": {"k": -0.002775, "q": 4.5571, "s": 0.3431},
30
+ "creatinine_mg_dl": {"k": 0.005381, "q": 0.4866, "s": 0.5684},
31
+ "glucose_mg_dl": {"k": 0.467131, "q": 74.8338, "s": 36.0172},
32
+ "rdw_pct": {"k": 0.012186, "q": 12.1414, "s": 1.0829},
33
+ "mcv_fl": {"k": 0.049082, "q": 87.9137, "s": 5.1742},
34
+ "wbc_k_ul": {"k": -0.012827, "q": 7.9449, "s": 2.1350},
35
+ "alp_u_l": {"k": 0.190120, "q": 74.7400, "s": 32.1450},
36
+ "lymphocyte_pct": {"k": -0.020487, "q": 30.5987, "s": 8.5504},
37
+ }
38
+ _NHANES_S_BA: float = 40.4491 # SD of (BA1 - chronological_age) in NHANES reference
39
+
40
+
41
class KDM(BaseClock):
    """Klemera-Doubal Method biological age estimator.

    KDM is a maximum-likelihood estimator of biological age from a set of
    biomarkers, each linearly regressed on chronological age in a reference
    population. Chronological age itself is incorporated as a final "measurement"
    with precision 1/s_BA^2, where s_BA is the variability of the preliminary
    estimate in the reference cohort.

    Default reference parameters are from NHANES 1999-2000 (N=4,086).
    For your own cohort, call ``fit(df)`` before ``transform(df)``.

    Required columns (NHANES clinical units)
    -----------------------------------------
    age : float — chronological age in years
    albumin_g_dl : float — g/dL
    creatinine_mg_dl : float — mg/dL
    glucose_mg_dl : float — mg/dL
    rdw_pct : float — %
    mcv_fl : float — fL
    wbc_k_ul : float — 10³/μL
    alp_u_l : float — U/L
    lymphocyte_pct : float — %

    Examples
    --------
    >>> import pandas as pd
    >>> from agingclockbench import KDM
    >>> row = dict(age=53, albumin_g_dl=4.1, creatinine_mg_dl=0.5, glucose_mg_dl=94,
    ...            rdw_pct=12.7, mcv_fl=87.8, wbc_k_ul=7.4, alp_u_l=98,
    ...            lymphocyte_pct=35.8)
    >>> result = KDM().transform(pd.DataFrame([row]))
    >>> result.biological_ages.iloc[0]  # doctest: +SKIP
    """

    _BIOMARKERS = list(_NHANES_PARAMS.keys())

    def __init__(self) -> None:
        # Copy each per-biomarker dict as well as the outer mapping, so an
        # instance never aliases the module-level reference defaults.
        self._params: dict = {name: dict(p) for name, p in _NHANES_PARAMS.items()}
        self._s_ba: float = _NHANES_S_BA

    @property
    def required_columns(self) -> list[str]:
        """``age`` followed by the biomarker columns."""
        return ["age"] + self._BIOMARKERS

    def validate_input(self, df: pd.DataFrame) -> tuple[bool, list[str]]:
        """Return ``(is_valid, errors)``; only column presence is checked."""
        errors = self._check_required_columns(df)
        return len(errors) == 0, errors

    def fit(self, df: pd.DataFrame) -> "KDM":
        """Fit reference regression parameters from a training cohort.

        Derives slopes, intercepts, and residual SDs by regressing each
        biomarker on chronological age, then estimates s_BA.

        Parameters
        ----------
        df : DataFrame with all required columns.

        Returns
        -------
        self — for method chaining.

        Raises
        ------
        ValueError
            If fewer than 30 rows have complete data.
        """
        complete = df[self.required_columns].dropna()
        if len(complete) < 30:
            raise ValueError(f"Need at least 30 complete rows to fit KDM; got {len(complete)}.")

        ages = complete["age"].values
        params = {}
        for col in self._BIOMARKERS:
            values = complete[col].values
            slope, intercept, _, _, _ = stats.linregress(ages, values)
            resid = values - (slope * ages + intercept)
            # Floor the residual SD so later divisions by s**2 stay finite.
            s = max(resid.std(), 1e-6)
            params[col] = {"k": slope, "q": intercept, "s": s}
        self._params = params

        # Compute preliminary BA1 and estimate s_BA (floored for the same reason).
        ba1 = self._preliminary_ba(complete)
        self._s_ba = max(float((ba1 - complete["age"]).std()), 1e-6)
        return self

    def transform(self, df: pd.DataFrame) -> ClockResult:
        """Compute KDM biological age for every complete row of *df*.

        Rows with a missing value in any required column are dropped. The
        returned series are re-indexed from 0; the labels of the surviving
        rows are kept in ``ClockResult.original_index``.

        Raises
        ------
        ValueError
            If a required column is absent, or if no row (including the case
            of an empty input) has complete data.
        """
        valid, errors = self.validate_input(df)
        if not valid:
            raise ValueError(f"KDM input validation failed: {errors}")

        input_rows = len(df)
        complete = df[self.required_columns].dropna()

        # Checked before the percentage below so that an empty input raises
        # this ValueError instead of a ZeroDivisionError.
        if len(complete) == 0:
            raise ValueError("No complete rows after dropping NaN values.")

        missing_pct = (input_rows - len(complete)) / input_rows * 100

        # Full KDM: incorporate chronological age as an additional measurement
        # BA = [Σ(k_j*(x_j - q_j)/s_j²) + CA/s_BA²] / [Σ(k_j²/s_j²) + 1/s_BA²]
        marker_numerator, marker_denominator = self._weighted_sums(complete)
        numerator = marker_numerator + complete["age"] / self._s_ba ** 2
        denominator = marker_denominator + 1.0 / self._s_ba ** 2

        biological_ages = (numerator / denominator).reset_index(drop=True)
        accel = (biological_ages - complete["age"].reset_index(drop=True)).rename("accel")

        return ClockResult(
            clock_name="KDM",
            biological_ages=biological_ages,
            accel=accel,
            missing_data_pct=missing_pct,
            input_rows=input_rows,
            output_rows=len(complete),
            original_index=complete.index,
            metadata={
                "reference": "Klemera & Doubal 2006; params from NHANES 1999-2000",
                "s_ba": self._s_ba,
                "n_biomarkers": len(self._BIOMARKERS),
            },
        )

    def _weighted_sums(self, complete: pd.DataFrame) -> tuple:
        """Return ``(Σ k_j*(x_j - q_j)/s_j², Σ k_j²/s_j²)`` over the biomarkers.

        The first element is a per-row Series, the second a scalar.
        """
        numerator = sum(
            self._params[b]["k"] * (complete[b] - self._params[b]["q"]) / self._params[b]["s"] ** 2
            for b in self._BIOMARKERS
        )
        denominator = sum(
            self._params[b]["k"] ** 2 / self._params[b]["s"] ** 2 for b in self._BIOMARKERS
        )
        return numerator, denominator

    def _preliminary_ba(self, complete: pd.DataFrame) -> pd.Series:
        """Weighted ML estimate of age without the chronological age anchor."""
        numerator, denominator = self._weighted_sums(complete)
        return numerator / denominator
@@ -0,0 +1,156 @@
1
+ """PhenoAge clock — Levine et al. 2018 (Aging Cell).
2
+
3
+ Reference: Levine ME, et al. An epigenetic biomarker of aging for lifespan and
4
+ healthspan. Aging Cell. 2018;17(4):e12759.
5
+
6
+ Unit notes
7
+ ----------
8
+ Inputs are accepted in standard NHANES clinical units (g/dL, mg/dL, mg/L).
9
+ The transform() method converts internally before applying the Levine 2018
10
+ coefficients, which were calibrated on:
11
+ albumin → g/L (×10)
12
+ creatinine → μmol/L (×88.4)
13
+ glucose → mmol/L (×0.0555)
14
+ crp → mg/L (input already; values are floored at 0.001 before the natural log to avoid ln(0))
15
+ """
16
+
17
+ import numpy as np
18
+ import pandas as pd
19
+
20
+ from agingclockbench.clocks.base import BaseClock, ClockResult
21
+
22
+ # Coefficients applied to CONVERTED units (Levine 2018).
23
+ _COEFFICIENTS: dict[str, float] = {
24
+ "intercept": -19.907,
25
+ "age": 0.0804,
26
+ "albumin_g_l": -0.0336, # after ×10 from g/dL
27
+ "creatinine_umol_l": 0.0095, # after ×88.4 from mg/dL
28
+ "glucose_mmol_l": 0.1953, # after ×0.0555 from mg/dL
29
+ "ln_crp_mg_l": 0.0954, # ln of the mg/L value after flooring it at 0.001
30
+ "lymphocyte_pct": -0.0120,
31
+ "mcv_fl": 0.0268,
32
+ "rdw_pct": 0.3306,
33
+ "alp_u_l": 0.00188,
34
+ "wbc_k_ul": 0.0554,
35
+ }
36
+
37
+ # Gompertz parameters (10-year mortality, t=120 months)
38
+ _GAMMA: float = 0.0076927
39
+ _T_MONTHS: int = 120
40
+ _PHENOAGE_INTERCEPT: float = 141.50
41
+ _PHENOAGE_SLOPE: float = 0.090165
42
+ _MORT_CONSTANT: float = -0.00553
43
+
44
+
45
class PhenoAge(BaseClock):
    """Biological age calculator implementing the Levine 2018 PhenoAge algorithm.

    All inputs are in standard NHANES clinical units. Unit conversions to the
    Levine 2018 coefficient scale are applied internally.

    Required columns
    ----------------
    age : float — chronological age in years
    albumin_g_dl : float — albumin in g/dL (converted internally to g/L)
    creatinine_mg_dl : float — creatinine in mg/dL (converted to μmol/L)
    glucose_mg_dl : float — glucose in mg/dL (converted to mmol/L)
    crp_mg_l : float — C-reactive protein in mg/L (ln-transformed)
    lymphocyte_pct : float — lymphocyte percentage (%)
    mcv_fl : float — mean corpuscular volume in fL
    rdw_pct : float — red cell distribution width (%)
    alp_u_l : float — alkaline phosphatase in U/L
    wbc_k_ul : float — white blood cell count in 10³/μL

    Examples
    --------
    >>> import pandas as pd
    >>> from agingclockbench import PhenoAge
    >>> row = dict(age=52, albumin_g_dl=4.3, creatinine_mg_dl=0.9,
    ...            glucose_mg_dl=87, crp_mg_l=0.3, lymphocyte_pct=28,
    ...            mcv_fl=90, rdw_pct=13.0, alp_u_l=65, wbc_k_ul=6.0)
    >>> result = PhenoAge().transform(pd.DataFrame([row]))
    >>> round(result.biological_ages.iloc[0], 1)
    44.9
    """

    @property
    def required_columns(self) -> list[str]:
        """``age`` plus the nine biomarker columns, in a fixed order."""
        return [
            "age",
            "albumin_g_dl",
            "creatinine_mg_dl",
            "glucose_mg_dl",
            "crp_mg_l",
            "lymphocyte_pct",
            "mcv_fl",
            "rdw_pct",
            "alp_u_l",
            "wbc_k_ul",
        ]

    def validate_input(self, df: pd.DataFrame) -> tuple[bool, list[str]]:
        """Return ``(is_valid, errors)``.

        Checks that every required column is present and, when they all are,
        that ``crp_mg_l`` holds no negative values (NaNs are ignored).
        """
        errors = self._check_required_columns(df)
        if not errors and (df["crp_mg_l"].dropna() < 0).any():
            errors.append("crp_mg_l contains negative values — check units.")
        return len(errors) == 0, errors

    def transform(self, df: pd.DataFrame) -> ClockResult:
        """Compute phenotypic age for every complete row of *df*.

        Rows with a missing value in any required column are dropped. Both
        returned series are re-indexed from 0 so they line up with each
        other; the labels of the surviving rows are kept in
        ``ClockResult.original_index``.

        Raises
        ------
        ValueError
            If validation fails, or if no row (including the case of an empty
            input) has complete data.
        """
        valid, errors = self.validate_input(df)
        if not valid:
            raise ValueError(f"PhenoAge input validation failed: {errors}")

        input_rows = len(df)
        complete = df[self.required_columns].dropna()

        # Checked before the percentage below so that an empty input raises
        # this ValueError instead of a ZeroDivisionError.
        if len(complete) == 0:
            raise ValueError("No complete rows after dropping NaN values.")

        dropped = input_rows - len(complete)
        missing_pct = dropped / input_rows * 100

        # --- Unit conversions (applied before coefficients) ---
        albumin_g_l = complete["albumin_g_dl"] * 10.0
        creatinine_umol_l = complete["creatinine_mg_dl"] * 88.4
        glucose_mmol_l = complete["glucose_mg_dl"] * 0.0555
        # CRP is already in mg/L; values are floored at 0.001 so ln(0) cannot occur.
        ln_crp = np.log(complete["crp_mg_l"].clip(lower=0.001))

        # --- Linear predictor (xb) ---
        c = _COEFFICIENTS
        xb = (
            c["intercept"]
            + c["age"] * complete["age"]
            + c["albumin_g_l"] * albumin_g_l
            + c["creatinine_umol_l"] * creatinine_umol_l
            + c["glucose_mmol_l"] * glucose_mmol_l
            + c["ln_crp_mg_l"] * ln_crp
            + c["lymphocyte_pct"] * complete["lymphocyte_pct"]
            + c["mcv_fl"] * complete["mcv_fl"]
            + c["rdw_pct"] * complete["rdw_pct"]
            + c["alp_u_l"] * complete["alp_u_l"]
            + c["wbc_k_ul"] * complete["wbc_k_ul"]
        )

        # --- 10-year mortality score (Gompertz, t=120 months) ---
        mortality_score = 1 - np.exp(
            -np.exp(xb) * (np.exp(_GAMMA * _T_MONTHS) - 1) / _GAMMA
        )
        # Cap below 1 so that log(1 - score) in the next step stays finite.
        mortality_score = mortality_score.clip(upper=0.9999)

        # --- Phenotypic age (Levine 2018 Eq. 2) ---
        biological_ages = (
            _PHENOAGE_INTERCEPT
            + np.log(_MORT_CONSTANT * np.log(1 - mortality_score)) / _PHENOAGE_SLOPE
        )

        # Both series still carry the index of `complete` here, so the
        # subtraction is row-aligned; both are reset together below so that
        # accel and biological_ages share the same 0..n-1 index.
        accel = (biological_ages - complete["age"]).rename("accel")

        return ClockResult(
            clock_name="PhenoAge",
            biological_ages=biological_ages.reset_index(drop=True),
            accel=accel.reset_index(drop=True),
            missing_data_pct=missing_pct,
            input_rows=input_rows,
            output_rows=len(complete),
            original_index=complete.index,
            # A copy is handed out so callers cannot mutate the module-level table.
            metadata={"reference": "Levine 2018 Aging Cell", "coefficients": dict(_COEFFICIENTS)},
        )
@@ -0,0 +1,4 @@
1
+ """Package-level constants and configuration."""
2
+
3
+ VERSION = "0.1.0"
4
+ PACKAGE_NAME = "agingclockbench"