PyPI - agingclockbench - Versions diffs - 0.1.0__py3-none-any.whl - Mend

agingclockbench 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

agingclockbench/__init__.py +9 -0
agingclockbench/benchmarks/__init__.py +3 -0
agingclockbench/benchmarks/metrics.py +22 -0
agingclockbench/benchmarks/plots.py +302 -0
agingclockbench/benchmarks/suite.py +262 -0
agingclockbench/cli.py +153 -0
agingclockbench/clocks/__init__.py +6 -0
agingclockbench/clocks/base.py +49 -0
agingclockbench/clocks/dunedinpace.py +137 -0
agingclockbench/clocks/kdm.py +175 -0
agingclockbench/clocks/phenoage.py +156 -0
agingclockbench/config.py +4 -0
agingclockbench/datasets/__init__.py +3 -0
agingclockbench/datasets/loaders.py +54 -0
agingclockbench/datasets/nhanes_sample.parquet +0 -0
agingclockbench/utils/__init__.py +3 -0
agingclockbench/utils/validation.py +10 -0
agingclockbench-0.1.0.dist-info/METADATA +183 -0
agingclockbench-0.1.0.dist-info/RECORD +22 -0
agingclockbench-0.1.0.dist-info/WHEEL +4 -0
agingclockbench-0.1.0.dist-info/entry_points.txt +3 -0
agingclockbench-0.1.0.dist-info/licenses/LICENSE +21 -0

agingclockbench/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""AgingClockBench: Benchmark biological aging clocks on your data."""
+from agingclockbench.clocks.phenoage import PhenoAge
+from agingclockbench.clocks.kdm import KDM
+from agingclockbench.clocks.dunedinpace import DunedinPACEProxy
+from agingclockbench.benchmarks.suite import BenchmarkSuite
+__version__ = "0.1.0"
+__all__ = ["PhenoAge", "KDM", "DunedinPACEProxy", "BenchmarkSuite"]

agingclockbench/benchmarks/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from agingclockbench.benchmarks.suite import BenchmarkSuite, BenchmarkResult, BenchmarkReport
+__all__ = ["BenchmarkSuite", "BenchmarkResult", "BenchmarkReport"]

agingclockbench/benchmarks/metrics.py ADDED Viewed

@@ -0,0 +1,22 @@
+"""Individual metric functions used by BenchmarkSuite."""
+import numpy as np
+import pandas as pd
+from scipy import stats
+def pearson_correlation(x: pd.Series, y: pd.Series) -> tuple[float, float]:
+    """Compute the Pearson correlation between ``x`` and ``y``.
+    Returns
+    -------
+    tuple[float, float]
+        ``(r, p_value)`` as plain Python floats.
+    """
+    coefficient, p_value = stats.pearsonr(x, y)
+    as_floats = (float(coefficient), float(p_value))
+    return as_floats
+def spearman_correlation(x: pd.Series, y: pd.Series) -> float:
+    """Compute Spearman's rank correlation (rho) between ``x`` and ``y`` as a float."""
+    rank_result = stats.spearmanr(x, y)
+    rho = rank_result.statistic
+    return float(rho)
+def coefficient_of_variation(series: pd.Series) -> float:
+    """Return the standard deviation of ``series`` divided by its mean.
+    A mean of exactly zero yields NaN rather than a division by zero.
+    """
+    center = series.mean()
+    if center == 0:
+        return float("nan")
+    spread = series.std()
+    return float(spread / center)

agingclockbench/benchmarks/plots.py ADDED Viewed

@@ -0,0 +1,302 @@
+"""Visualization functions for BenchmarkReport.
+All functions return matplotlib/plotly Figure objects so callers can
+save, display, or embed them as needed.
+"""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+import numpy as np
+import pandas as pd
+if TYPE_CHECKING:
+    from agingclockbench.benchmarks.suite import BenchmarkReport
+    from agingclockbench.clocks.base import ClockResult
+def plot_comparison(
+    report: "BenchmarkReport",
+    df: pd.DataFrame,
+    results: dict[str, "ClockResult"],
+):
+    """Scatter plot of biological age vs chronological age for each clock.
+    One panel is drawn per clock, with a dashed identity line (y = x) and,
+    when available, the Pearson r stored on the matching entry of
+    ``report.results``.
+    Parameters
+    ----------
+    report : BenchmarkReport from BenchmarkSuite.run()
+    df : original input DataFrame (must contain 'age')
+    results : dict mapping clock name -> ClockResult
+    Returns
+    -------
+    matplotlib.figure.Figure
+    Raises
+    ------
+    ValueError
+        If ``results`` is empty or ``df`` is None.
+    """
+    # Validate before the plotting imports so bad input fails fast with a
+    # readable message instead of an error from inside matplotlib/pandas.
+    if not results:
+        raise ValueError("plot_comparison requires at least one clock result")
+    if df is None:
+        raise ValueError("plot_comparison requires the input DataFrame (df is None)")
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    n = len(results)
+    fig, axes = plt.subplots(1, n, figsize=(5 * n, 5), squeeze=False)
+    palette = sns.color_palette("husl", n)
+    pearson_by_clock = {r.clock_name: r.pearson_r for r in report.results}
+    for ax, (name, result), color in zip(axes[0], results.items(), palette):
+        # Pick the chronological ages of the rows this clock actually scored.
+        if result.original_index is not None:
+            age = df.loc[result.original_index, "age"].values
+        else:
+            age = df["age"].iloc[: result.output_rows].values
+        bio_age = result.biological_ages.values
+        # Scatter with alpha for density
+        ax.scatter(age, bio_age, alpha=0.3, s=8, color=color)
+        # Identity line (biological age = chronological age)
+        lo, hi = min(age.min(), bio_age.min()), max(age.max(), bio_age.max())
+        ax.plot([lo, hi], [lo, hi], "k--", lw=1, label="y = x")
+        # Pearson r from benchmark results (omitted when missing or NaN)
+        pearson_r = pearson_by_clock.get(name)
+        has_r = pearson_r is not None and not np.isnan(pearson_r)
+        r_str = f"r = {pearson_r:.3f}" if has_r else ""
+        ax.set_title(f"{name}\n{r_str}", fontsize=12)
+        ax.set_xlabel("Chronological Age (years)")
+        ax.set_ylabel("Biological Age (years)")
+    fig.suptitle("Biological Age vs Chronological Age", fontsize=14, y=1.02)
+    # Lay out this figure explicitly rather than whatever pyplot considers current.
+    fig.tight_layout()
+    return fig
+def plot_km_survival(
+    df: pd.DataFrame,
+    results: dict[str, "ClockResult"],
+    mortality_col: str = "mortstat",
+    followup_col: str = "permth_exm",
+    n_quartiles: int = 4,
+):
+    """Kaplan-Meier survival curves stratified by age-acceleration quartile.
+    One panel is drawn per clock. A panel shows a text notice instead of
+    curves when the mortality columns are absent or when too few complete
+    rows remain to form the requested strata.
+    Parameters
+    ----------
+    df : DataFrame with mortality columns.
+    results : dict mapping clock name -> ClockResult.
+    mortality_col : event indicator column (1=event, 0=censored).
+    followup_col : time-to-event/censoring column (months).
+    n_quartiles : number of strata (default 4).
+    Returns
+    -------
+    matplotlib.figure.Figure
+    Raises
+    ------
+    ValueError
+        If ``results`` is empty, ``df`` is None, or ``n_quartiles`` < 1.
+    """
+    # Validate before the plotting imports so bad input fails fast.
+    if not results:
+        raise ValueError("plot_km_survival requires at least one clock result")
+    if df is None:
+        raise ValueError("plot_km_survival requires the input DataFrame (df is None)")
+    if n_quartiles < 1:
+        raise ValueError("n_quartiles must be a positive integer")
+    import matplotlib.pyplot as plt
+    from lifelines import KaplanMeierFitter
+    import seaborn as sns
+    n = len(results)
+    fig, axes = plt.subplots(1, n, figsize=(6 * n, 5), squeeze=False)
+    palette = sns.color_palette("RdYlGn_r", n_quartiles)
+    quartile_labels = [f"Q{i+1}" for i in range(n_quartiles)]
+    has_mortality = mortality_col in df.columns and followup_col in df.columns
+    for ax, (name, result) in zip(axes[0], results.items()):
+        if not has_mortality:
+            ax.text(0.5, 0.5, "No mortality data", ha="center", va="center",
+                    transform=ax.transAxes)
+            ax.set_title(name)
+            continue
+        # Restrict df to the rows this clock actually scored.
+        if result.original_index is not None:
+            aligned = df.loc[result.original_index].reset_index(drop=True)
+        else:
+            aligned = df.iloc[: result.output_rows].reset_index(drop=True)
+        analysis = pd.DataFrame({
+            "accel": result.accel.values,
+            "event": aligned[mortality_col].values,
+            "time": aligned[followup_col].values,
+        }).dropna()
+        # Quantile binning needs at least one row per stratum (and two rows overall).
+        if len(analysis) < max(n_quartiles, 2):
+            ax.text(0.5, 0.5, "Insufficient data", ha="center", va="center",
+                    transform=ax.transAxes)
+            ax.set_title(name)
+            continue
+        try:
+            analysis["quartile"] = pd.qcut(analysis["accel"], n_quartiles,
+                                           labels=quartile_labels)
+        except ValueError:
+            # Tied accelerations collapse the quantile edges ("Bin edges must
+            # be unique"). Binning the first-occurrence ranks instead breaks
+            # ties by row order and keeps every stratum populated.
+            analysis["quartile"] = pd.qcut(analysis["accel"].rank(method="first"),
+                                           n_quartiles, labels=quartile_labels)
+        kmf = KaplanMeierFitter()
+        for label, color in zip(quartile_labels, palette):
+            mask = analysis["quartile"] == label
+            if not mask.any():
+                # Nothing to fit for this stratum; skip it rather than abort the figure.
+                continue
+            kmf.fit(
+                analysis.loc[mask, "time"] / 12,  # months → years
+                analysis.loc[mask, "event"],
+                label=label,
+            )
+            kmf.plot_survival_function(ax=ax, color=color, ci_show=False)
+        ax.set_title(f"{name}\nKaplan-Meier by Accel Quartile", fontsize=11)
+        ax.set_xlabel("Follow-up (years)")
+        ax.set_ylabel("Survival Probability")
+        ax.legend(title="Accel\nQuartile", fontsize=8)
+        ax.set_ylim(0, 1)
+    fig.suptitle("Survival by Biological Age Acceleration Quartile", fontsize=14, y=1.02)
+    fig.tight_layout()
+    return fig
+def plot_correlation_heatmap(results: dict[str, "ClockResult"]):
+    """Heatmap of Pearson correlations between clock accelerations.
+    Accelerations of two clocks are paired per subject through each result's
+    ``original_index`` when both provide one, so clocks that dropped
+    different rows are still compared on the same subjects. Otherwise the
+    pairing falls back to row position over the common length. A pair with
+    fewer than three paired rows is shown as NaN (undefined), not as 0.
+    Parameters
+    ----------
+    results : dict mapping clock name -> ClockResult.
+    Returns
+    -------
+    matplotlib.figure.Figure
+    Raises
+    ------
+    ValueError
+        If ``results`` is empty.
+    """
+    # Validate before the plotting imports so bad input fails fast.
+    if not results:
+        raise ValueError("plot_correlation_heatmap requires at least one clock result")
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    names = list(results.keys())
+    n = len(names)
+    corr = np.full((n, n), np.nan)
+    np.fill_diagonal(corr, 1.0)
+    # Pearson r is symmetric: compute each unordered pair once and mirror it.
+    for i in range(n):
+        for j in range(i + 1, n):
+            paired = _paired_accel(results[names[i]], results[names[j]])
+            if len(paired) > 2:
+                corr[i, j] = corr[j, i] = paired["a"].corr(paired["b"])
+    corr_df = pd.DataFrame(corr, index=names, columns=names)
+    fig, ax = plt.subplots(figsize=(max(4, n * 1.5), max(3, n * 1.5)))
+    sns.heatmap(
+        corr_df,
+        annot=True,
+        fmt=".3f",
+        cmap="coolwarm",
+        vmin=-1,
+        vmax=1,
+        ax=ax,
+        square=True,
+        cbar_kws={"shrink": 0.8},
+    )
+    ax.set_title("Inter-Clock Acceleration Correlations (Pearson r)", fontsize=12)
+    fig.tight_layout()
+    return fig
+def _paired_accel(first: "ClockResult", second: "ClockResult") -> pd.DataFrame:
+    """Return a two-column frame (``a``, ``b``) of accelerations paired per subject."""
+    a = np.asarray(first.accel, dtype=float)
+    b = np.asarray(second.accel, dtype=float)
+    idx_a, idx_b = first.original_index, second.original_index
+    if idx_a is not None and idx_b is not None:
+        labels_a, labels_b = pd.Index(idx_a), pd.Index(idx_b)
+        usable = (
+            len(labels_a) == len(a)
+            and len(labels_b) == len(b)
+            and labels_a.is_unique
+            and labels_b.is_unique
+        )
+        if usable:
+            # Label-aligned union; dropna keeps only subjects scored by both clocks.
+            return pd.DataFrame({
+                "a": pd.Series(a, index=labels_a),
+                "b": pd.Series(b, index=labels_b),
+            }).dropna()
+    # No trustworthy row labels: pair by position over the common length.
+    common = min(len(a), len(b))
+    return pd.DataFrame({"a": a[:common], "b": b[:common]}).dropna()
+def to_html(
+    report: "BenchmarkReport",
+    df: pd.DataFrame,
+    results: dict[str, "ClockResult"],
+    filename: str,
+    mortality_col: str = "mortstat",
+    followup_col: str = "permth_exm",
+) -> None:
+    """Export an interactive Plotly HTML benchmark report.
+    The page contains the benchmark summary table followed by one
+    biological-vs-chronological age scatter panel per clock. The plotly.js
+    script tag is generated by Plotly itself so the JavaScript version
+    matches the installed Python package.
+    Parameters
+    ----------
+    report : BenchmarkReport from BenchmarkSuite.run()
+    df : original input DataFrame
+    results : dict mapping clock name -> ClockResult
+    filename : output .html path
+    mortality_col, followup_col : accepted for interface compatibility;
+        not used by the current report layout.
+    Raises
+    ------
+    ValueError
+        If ``results`` is empty or ``df`` is None.
+    """
+    # Validate before the plotting imports so bad input fails fast.
+    if not results:
+        raise ValueError("to_html requires at least one clock result")
+    if df is None:
+        raise ValueError("to_html requires the input DataFrame (df is None)")
+    import plotly.graph_objects as go
+    from plotly.subplots import make_subplots
+    import plotly.express as px
+    n = len(results)
+    # --- Scatter subplots ---
+    fig_scatter = make_subplots(
+        rows=1, cols=n,
+        subplot_titles=[f"{name}" for name in results],
+        shared_yaxes=False,
+    )
+    colors = px.colors.qualitative.Plotly
+    pearson_by_clock = {r.clock_name: r.pearson_r for r in report.results}
+    for col, (name, result) in enumerate(results.items(), start=1):
+        if result.original_index is not None:
+            age = df.loc[result.original_index, "age"].values
+        else:
+            age = df["age"].iloc[: result.output_rows].values
+        bio_age = result.biological_ages.values
+        r_val = pearson_by_clock.get(name, float("nan"))
+        fig_scatter.add_trace(
+            go.Scatter(
+                x=age, y=bio_age,
+                mode="markers",
+                # Cycle the palette so more clocks than colours cannot raise IndexError.
+                marker=dict(size=4, color=colors[(col - 1) % len(colors)], opacity=0.4),
+                name=f"{name} (r={r_val:.3f})",
+            ),
+            row=1, col=col,
+        )
+        lo = min(float(age.min()), float(bio_age.min()))
+        hi = max(float(age.max()), float(bio_age.max()))
+        fig_scatter.add_trace(
+            go.Scatter(x=[lo, hi], y=[lo, hi], mode="lines",
+                       line=dict(color="black", dash="dash", width=1),
+                       showlegend=False),
+            row=1, col=col,
+        )
+    fig_scatter.update_layout(
+        title="Biological Age vs Chronological Age",
+        height=450,
+        template="plotly_white",
+    )
+    # --- Benchmark table ---
+    summary_df = report.to_dataframe()
+    def _fmt(col):
+        s = summary_df[col]
+        if pd.api.types.is_numeric_dtype(s):
+            return s.round(4).astype(str).tolist()
+        return s.astype(str).tolist()
+    fig_table = go.Figure(
+        data=[go.Table(
+            header=dict(
+                values=list(summary_df.columns),
+                fill_color="#2c3e50",
+                font=dict(color="white", size=12),
+                align="left",
+            ),
+            cells=dict(
+                values=[_fmt(c) for c in summary_df.columns],
+                fill_color="lavender",
+                align="left",
+            ),
+        )]
+    )
+    fig_table.update_layout(title="Benchmark Summary", height=200)
+    # Combine into single HTML. The first fragment on the page carries the
+    # plotly.js <script> tag (include_plotlyjs="cdn"), pinned to the version
+    # bundled with the installed plotly package. The "plotly-latest" CDN alias
+    # is no longer updated, so it would load an outdated plotly.js.
+    html_table = fig_table.to_html(full_html=False, include_plotlyjs="cdn")
+    html_scatter = fig_scatter.to_html(full_html=False, include_plotlyjs=False)
+    html = f"""<!DOCTYPE html>
+<html>
+<head>
+  <title>AgingClockBench Report</title>
+  <style>
+    body {{ font-family: Arial, sans-serif; max-width: 1200px; margin: auto; padding: 20px; }}
+    h1 {{ color: #2c3e50; }}
+    h2 {{ color: #34495e; border-bottom: 1px solid #bdc3c7; padding-bottom: 6px; }}
+  </style>
+</head>
+<body>
+  <h1>AgingClockBench Report</h1>
+  <h2>Benchmark Summary</h2>
+  {html_table}
+  <h2>Biological Age vs Chronological Age</h2>
+  {html_scatter}
+</body>
+</html>"""
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write(html)
+    print(f"Report saved to {filename}")

agingclockbench/benchmarks/suite.py ADDED Viewed

@@ -0,0 +1,262 @@
+"""BenchmarkSuite — runs validation metrics across multiple aging clocks."""
+from dataclasses import dataclass, field
+import numpy as np
+import pandas as pd
+from scipy import stats
+from agingclockbench.clocks.base import ClockResult
+@dataclass
+class BenchmarkResult:
+    """Validation metrics for a single clock.
+    Instances are created with only ``clock_name`` set; the remaining fields
+    keep their defaults (NaN, 0 or an empty dict) until BenchmarkSuite fills
+    them in.
+    """
+    # Name of the clock these metrics belong to.
+    clock_name: str
+    # Pearson r between chronological age and the clock's biological age.
+    pearson_r: float = float("nan")
+    # Spearman rank correlation between chronological age and biological age.
+    spearman_r: float = float("nan")
+    # p-value reported alongside pearson_r.
+    pearson_pvalue: float = float("nan")
+    # Hazard ratio for the standardised acceleration term of the Cox model
+    # (exp of its coefficient), i.e. per SD of acceleration.
+    mortality_hr: float = float("nan")
+    # Lower / upper bound of the 95% confidence interval of mortality_hr.
+    mortality_hr_ci_lower: float = float("nan")
+    mortality_hr_ci_upper: float = float("nan")
+    # p-value of the acceleration term in the Cox model.
+    mortality_pvalue: float = float("nan")
+    # Number of events (sum of the mortality column) among the rows used to
+    # fit the Cox model — despite the name, not the number of observations.
+    cox_nobs: int = 0
+    # Coefficient of variation (SD / mean) of the biological ages.
+    cv: float = float("nan")
+    # Maps each other clock's name to the Pearson r between the two clocks'
+    # accelerations; default_factory gives every instance its own dict.
+    clock_agreement_with_others: dict = field(default_factory=dict)
+class BenchmarkSuite:
+    """Run a standardised validation benchmark on one or more aging clocks.
+    Parameters
+    ----------
+    mortality_col : str
+        Column name for vital status (1 = dead, 0 = censored).
+    followup_col : str
+        Column name for follow-up time. Units must be consistent — the HR
+        interpretation assumes months if using NHANES permth_exm.
+    Examples
+    --------
+    >>> suite = BenchmarkSuite(mortality_col="mortstat", followup_col="permth_exm")
+    >>> report = suite.run(df, results={"PhenoAge": phenoage_result})
+    >>> print(report.to_dataframe())
+    """
+    def __init__(self, mortality_col: str = "mortstat", followup_col: str = "permth_exm") -> None:
+        self.mortality_col = mortality_col
+        self.followup_col = followup_col
+    def run(
+        self,
+        df: pd.DataFrame,
+        results: dict[str, ClockResult],
+    ) -> "BenchmarkReport":
+        """Compute benchmark metrics for each clock result.
+        Uses ``ClockResult.original_index`` to align clock outputs with
+        the correct rows in ``df`` (handles missing-data row drops). The
+        same row labels are used to pair subjects when computing
+        inter-clock agreement, so clocks that dropped different rows are
+        compared on the subjects they have in common.
+        Parameters
+        ----------
+        df : Original input DataFrame.
+        results : Mapping of clock name → ClockResult.
+        Returns
+        -------
+        BenchmarkReport
+        """
+        benchmark_results: list[BenchmarkResult] = []
+        for name, result in results.items():
+            br = BenchmarkResult(clock_name=name)
+            # Align df rows to the rows the clock actually processed
+            if result.original_index is not None:
+                aligned_df = df.loc[result.original_index].reset_index(drop=True)
+            else:
+                aligned_df = df.iloc[: result.output_rows].reset_index(drop=True)
+            age = aligned_df["age"]
+            # Pearson / Spearman correlation with chronological age
+            if age.nunique() > 1:
+                r, p = stats.pearsonr(age, result.biological_ages)
+                br.pearson_r = round(float(r), 4)
+                br.pearson_pvalue = round(float(p), 6)
+                spr = stats.spearmanr(age, result.biological_ages).statistic
+                br.spearman_r = round(float(spr), 4)
+            # Coefficient of variation
+            mean_ba = result.biological_ages.mean()
+            std_ba = result.biological_ages.std()
+            br.cv = round(float(std_ba / mean_ba), 4) if mean_ba != 0 else float("nan")
+            # Cox PH mortality prediction
+            if self.mortality_col in aligned_df.columns and self.followup_col in aligned_df.columns:
+                br = self._run_cox(aligned_df, result, br)
+            benchmark_results.append(br)
+        # Inter-clock agreement (Pearson r of accelerations)
+        for br in benchmark_results:
+            own = results[br.clock_name]
+            for other_name, other in results.items():
+                if other_name == br.clock_name:
+                    continue
+                paired = self._paired_accel(df, own, other)
+                # Pearson r is undefined for constant input; require variation
+                # on both sides so A-vs-B and B-vs-A are treated the same way.
+                if (
+                    len(paired) > 2
+                    and paired["a"].nunique() > 1
+                    and paired["b"].nunique() > 1
+                ):
+                    r, _ = stats.pearsonr(paired["a"], paired["b"])
+                    br.clock_agreement_with_others[other_name] = round(float(r), 4)
+        return BenchmarkReport(
+            results=benchmark_results,
+            df=df,
+            clock_results=results,
+            mortality_col=self.mortality_col,
+            followup_col=self.followup_col,
+        )
+    @staticmethod
+    def _row_labels(df: pd.DataFrame, result: ClockResult, n_values: int) -> "pd.Index | None":
+        """Return the df row labels scored by ``result``, or None if they cannot be trusted."""
+        if result.original_index is not None:
+            labels = pd.Index(result.original_index)
+        else:
+            labels = df.index[: result.output_rows]
+        if len(labels) != n_values or not labels.is_unique:
+            return None
+        return labels
+    @staticmethod
+    def _paired_accel(df: pd.DataFrame, first: ClockResult, second: ClockResult) -> pd.DataFrame:
+        """Return a two-column frame (``a``, ``b``) of accelerations paired per subject."""
+        a = np.asarray(first.accel, dtype=float)
+        b = np.asarray(second.accel, dtype=float)
+        labels_a = BenchmarkSuite._row_labels(df, first, len(a))
+        labels_b = BenchmarkSuite._row_labels(df, second, len(b))
+        if labels_a is not None and labels_b is not None:
+            # Label-aligned union; dropna keeps only subjects scored by both clocks.
+            return pd.DataFrame({
+                "a": pd.Series(a, index=labels_a),
+                "b": pd.Series(b, index=labels_b),
+            }).dropna()
+        # No trustworthy row labels: pair by position over the common length.
+        common = min(len(a), len(b))
+        return pd.DataFrame({"a": a[:common], "b": b[:common]}).dropna()
+    def _run_cox(
+        self,
+        aligned_df: pd.DataFrame,
+        result: ClockResult,
+        br: BenchmarkResult,
+    ) -> BenchmarkResult:
+        """Fit a Cox PH model: mortality ~ clock_acceleration_sd + age.
+        The fit is best-effort: when lifelines is missing or the model cannot
+        be fitted, a RuntimeWarning is emitted and ``br`` is returned with its
+        mortality fields left at whatever they held before the failure.
+        Fewer than 10 complete rows or zero events skips the fit silently.
+        """
+        import warnings
+        try:
+            from lifelines import CoxPHFitter
+            analysis_df = aligned_df[[self.mortality_col, self.followup_col, "age"]].copy()
+            analysis_df["clock_acceleration"] = result.accel.values
+            # Standardise acceleration to per-SD hazard ratio
+            sd = analysis_df["clock_acceleration"].std()
+            if sd > 0:
+                analysis_df["clock_acceleration"] /= sd
+            analysis_df = analysis_df.dropna()
+            if len(analysis_df) < 10 or analysis_df[self.mortality_col].sum() == 0:
+                return br
+            cph = CoxPHFitter()
+            cph.fit(
+                analysis_df,
+                duration_col=self.followup_col,
+                event_col=self.mortality_col,
+                formula="clock_acceleration + age",
+            )
+            summary = cph.summary
+            row = summary.loc["clock_acceleration"]
+            br.mortality_hr = round(float(np.exp(row["coef"])), 4)
+            br.mortality_hr_ci_lower = round(float(np.exp(row["coef lower 95%"])), 4)
+            br.mortality_hr_ci_upper = round(float(np.exp(row["coef upper 95%"])), 4)
+            br.mortality_pvalue = round(float(row["p"]), 6)
+            br.cox_nobs = int(analysis_df[self.mortality_col].sum())
+        except ImportError:
+            warnings.warn(
+                "lifelines is not installed; Cox mortality metrics were skipped.",
+                RuntimeWarning,
+                stacklevel=3,
+            )
+        except Exception as exc:
+            # Keep the benchmark running, but surface the reason instead of
+            # silently reporting NaN mortality metrics.
+            warnings.warn(
+                f"Cox model failed for clock {br.clock_name!r}: {exc}",
+                RuntimeWarning,
+                stacklevel=3,
+            )
+        return br
+class BenchmarkReport:
+    """Bundle of per-clock benchmark results with display and export helpers.
+    Attributes
+    ----------
+    results : list[BenchmarkResult]
+    _df : the input DataFrame given at construction (None when omitted)
+    _clock_results : dict mapping clock name -> ClockResult (empty when omitted)
+    """
+    def __init__(
+        self,
+        results: list[BenchmarkResult],
+        df: pd.DataFrame | None = None,
+        clock_results: dict | None = None,
+        mortality_col: str = "mortstat",
+        followup_col: str = "permth_exm",
+    ) -> None:
+        self.results = results
+        self._df = df
+        # A missing (or empty) mapping is stored as a fresh empty dict.
+        self._clock_results = clock_results if clock_results else {}
+        self._mortality_col = mortality_col
+        self._followup_col = followup_col
+    def to_dataframe(self) -> pd.DataFrame:
+        """Build the summary table: one row per clock, one column per metric."""
+        records = [
+            {
+                "Clock": entry.clock_name,
+                "Pearson r": entry.pearson_r,
+                "Spearman r": entry.spearman_r,
+                "Mort HR (per SD accel)": entry.mortality_hr,
+                "HR 95% CI lower": entry.mortality_hr_ci_lower,
+                "HR 95% CI upper": entry.mortality_hr_ci_upper,
+                "Mort p-value": entry.mortality_pvalue,
+                "Cox N (events)": entry.cox_nobs,
+                "CV": entry.cv,
+            }
+            for entry in self.results
+        ]
+        return pd.DataFrame(records)
+    def plot_comparison(self, df: pd.DataFrame | None = None,
+                        results: dict | None = None):
+        """Draw biological vs chronological age scatter panels, one per clock.
+        Arguments left as None fall back to the DataFrame and clock results
+        stored on this report. Returns a matplotlib Figure.
+        """
+        from agingclockbench.benchmarks import plots
+        frame = self._df if df is None else df
+        clocks = self._clock_results if results is None else results
+        return plots.plot_comparison(self, frame, clocks)
+    def plot_km_survival(self, df: pd.DataFrame | None = None,
+                         results: dict | None = None,
+                         n_quartiles: int = 4):
+        """Draw Kaplan-Meier curves per age-acceleration quartile.
+        Arguments left as None fall back to the values stored on this report;
+        the mortality and follow-up column names always come from the report.
+        Returns a matplotlib Figure.
+        """
+        from agingclockbench.benchmarks import plots
+        frame = self._df if df is None else df
+        clocks = self._clock_results if results is None else results
+        return plots.plot_km_survival(
+            frame,
+            clocks,
+            mortality_col=self._mortality_col,
+            followup_col=self._followup_col,
+            n_quartiles=n_quartiles,
+        )
+    def plot_correlation_heatmap(self, results: dict | None = None):
+        """Draw the heatmap of Pearson correlations between clock accelerations.
+        ``results`` falls back to the clock results stored on this report.
+        Returns a matplotlib Figure.
+        """
+        from agingclockbench.benchmarks import plots
+        clocks = self._clock_results if results is None else results
+        return plots.plot_correlation_heatmap(clocks)
+    def to_html(self, filename: str, df: pd.DataFrame | None = None,
+                results: dict | None = None) -> None:
+        """Write the interactive Plotly HTML benchmark report.
+        Parameters
+        ----------
+        filename : output path (e.g. 'report.html')
+        df, results : optional overrides; None falls back to the values
+            stored on this report.
+        """
+        from agingclockbench.benchmarks import plots
+        frame = self._df if df is None else df
+        clocks = self._clock_results if results is None else results
+        plots.to_html(
+            self,
+            frame,
+            clocks,
+            filename,
+            mortality_col=self._mortality_col,
+            followup_col=self._followup_col,
+        )