PyPI - pcntoolkit - Versions diffs - 1.2.0.post1__tar.gz → 1.3.0__tar.gz - Mend

pcntoolkit 1.2.0.post1__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

{pcntoolkit-1.2.0.post1 → pcntoolkit-1.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pcntoolkit
-Version: 1.2.0.post1
+Version: 1.3.0
 Summary: Predictive Clinical Neuroscience Toolkit
 Author: Andre Marquand, Stijn de Boer, Seyed Mostafa Kia, Saige Rutherford, Charlotte Fraza, Barbora Rehák Bučková, Pieter Barkema, Thomas Wolfers, Mariam Zabihi, Richard Dinga, Johanna Bayer, Maarten Mennes, Hester Huijsdens, Linden Parkes, Pierre Berthet
 License-Expression: GPL-3.0-only
@@ -8,20 +8,30 @@ Requires-Python: <3.13,>=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: nibabel>=5.3.1
-Requires-Dist: pymc>=5.19.1
+Requires-Dist: h5py>=3.11.0
+Requires-Dist: h5netcdf>=1.3.0
+Requires-Dist: pymc<6.0.0,>=5.19.1
+Requires-Dist: pytensor<3.0.0,>=2.22.0
 Requires-Dist: scikit-learn>=1.5.2
 Requires-Dist: six>=1.16.0
 Requires-Dist: scipy>=1.12
 Requires-Dist: matplotlib>=3.9.2
 Requires-Dist: seaborn>=0.13.2
 Requires-Dist: numba>=0.60.0
-Requires-Dist: nutpie>=0.16.5
+Requires-Dist: nutpie<0.16.9,>=0.16.5
 Requires-Dist: joblib>=1.4.2
 Requires-Dist: dill>=0.3.9
 Requires-Dist: ipywidgets>=8.1.5
 Requires-Dist: ipykernel>=6.29.5
+Requires-Dist: ipython>=8.0.0
 Requires-Dist: dask>=2025.11.0
 Requires-Dist: filelock>=3.13.0
+Requires-Dist: packaging>=21.3
+Requires-Dist: arviz<1.0.0,>=0.21.0
+Requires-Dist: numpy>=2.0.0
+Requires-Dist: pandas>=2.2.0
+Requires-Dist: xarray>=2024.1.0
+Requires-Dist: cloudpickle>=3.0.0
 Provides-Extra: dev
 Requires-Dist: toml; extra == "dev"
 Requires-Dist: sphinx-tabs>=3.4.7; extra == "dev"
@@ -30,6 +40,7 @@ Requires-Dist: black>=24.10.0; extra == "dev"
 Requires-Dist: sphinx-rtd-theme>=3.0.2; extra == "dev"
 Requires-Dist: ruff>=0.8.6; extra == "dev"
 Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
+Requires-Dist: nbconvert>=7.15.0; extra == "dev"
 Dynamic: license-file
 # Predictive Clinical Neuroscience Toolkit

{pcntoolkit-1.2.0.post1 → pcntoolkit-1.3.0}/pcntoolkit/__init__.py RENAMED Viewed

@@ -1,6 +1,6 @@
 from .dataio.data_factory import load_fcon1000
 from .dataio.norm_data import NormData
-from .math_functions.basis_function import BsplineBasisFunction, LinearBasisFunction, PolynomialBasisFunction, CompositeBasisFunction
+from .math_functions.basis_function import BsplineBasisFunction, LinearBasisFunction, PolynomialBasisFunction, CompositeBasisFunction, FractionalPolynomialBasisFunction
 from .math_functions.likelihood import BetaLikelihood, NormalLikelihood, SHASHbLikelihood
 from .math_functions.prior import make_prior
 from .normative_model import NormativeModel
@@ -14,6 +14,7 @@ __version__ = version("pcntoolkit")
 __all__ = [
     "NormData",
     "BsplineBasisFunction",
+    "FractionalPolynomialBasisFunction",
     "LinearBasisFunction",
     "PolynomialBasisFunction",
     "CompositeBasisFunction",

{pcntoolkit-1.2.0.post1 → pcntoolkit-1.3.0}/pcntoolkit/dataio/data_factory.py RENAMED Viewed

@@ -8,11 +8,26 @@ from pcntoolkit.dataio.norm_data import NormData
 def load_fcon1000(save_path: str | None = None):
-    """Download and save fcon dataset to specified path, or load it from there if it is already downloaded"""
+    """Download and save fcon dataset to specified path, or load it from there
+    if it is already downloaded
+    Parameters
+    ----------
+    save_path : str | None
+        The path to save the dataset to, or load it from if it is already
+        downloaded
+    Returns
+    -------
+    NormData
+        The loaded dataset as a NormData object"""
     if not save_path:
         save_path = os.path.join("pcntoolkit_resources", "data")
     os.makedirs(save_path, exist_ok=True)
     data_path = os.path.join(save_path, "fcon1000.csv")
+    # If the dataset is not already downloaded, download it and save it to
+    # the specified path
     if not os.path.exists(data_path):
         data = pd.read_csv(
             "https://raw.githubusercontent.com/predictive-clinical-neuroscience/PCNtoolkit-demo/refs/heads/main/data/fcon1000.csv"
@@ -256,3 +271,98 @@ def load_fcon1000(save_path: str | None = None):
         remove_Nan=True,
     )
     return norm_data
+# NOTE: This dataset is not public
+def load_lifespan_big(
+        n_response_vars: int | None = None,
+        n_largest_sites: int | None = None,
+        n_subjects: int | None = None
+) -> NormData:
+    """
+    Load the lifespan_big dataset, which is a large lifespan dataset with many sites.
+    Parameters
+    ----------
+    n_response_vars : int | None
+        If specified, only use the first n_response_vars response
+        variables.
+    n_largest_sites : int | None
+        If specified, only keep data from the n_largest_sites largest
+        sites.
+    n_subjects : int | None
+        If specified, randomly sample n_subjects subjects.
+    Returns
+    -------
+    NormData
+        The loaded dataset as a NormData object.
+    """
+    # Define the variables in the dataset
+    subject_ids = ["participant_id"]
+    covariates = ["age"]
+    batch_effects = ["sex", "site"]
+    # Define the dtypes for loading the dataset, to ensure that categorical
+    # variables are loaded as strings and numerical variables as floats
+    dtypes = {"participant_id": str, "group": str, "group2": str}
+    for col in batch_effects:
+        dtypes[col] = str
+    for col in covariates:
+        dtypes[col] = float
+    # Load the lifespan dataset with 57116 subjects from the Braincharts paper:
+    # https://doi.org/10.7554/eLife.72904
+    data = pd.read_csv(
+        "/project_cephfs/3022017.06/projects/stijdboe/Data/sairut_data/"
+        "lifespan_big.csv", dtype=dtypes)
+    # Drop rows where all values are NaN
+    data = data.dropna(axis=0, how="all", inplace=False)
+    # Drop columns that contain at least one NaN value
+    data = data.dropna(axis=1, how="any", inplace=False)
+    data["sex"] = data["sex"].map(
+        {"0.0": "Female", "1.0": "Male", "2.0": "Female"})
+    data["site"] = data["site_ID"]
+    # If requested, take only the n largest sites
+    if n_largest_sites is not None:
+        data = data[data["site_ID"].isin(
+            data["site_ID"].value_counts().head(n_largest_sites).index)]
+    # If requested, take only n subjects
+    if n_subjects is not None:
+        data = data.sample(n=n_subjects, replace=False)
+    # Define response variables as all variables that
+    # are not in subject_ids, covariates, batch_effects,
+    # and that have variance > 0
+    def is_response_var(col_name: str) -> bool:
+        return (
+            col_name not in subject_ids
+            and col_name not in covariates
+            and col_name not in batch_effects
+            and not col_name.startswith("site_")
+            and not col_name.startswith("group")
+            and not col_name.startswith("race")
+            and data[col_name].var() > 0
+        )
+    response_vars = [col for col in data.columns if is_response_var(col)]
+    # If requested, take only n response variables
+    if n_response_vars is not None:
+        response_vars = response_vars[:n_response_vars]
+    # Create NormData object
+    norm_data = NormData.from_dataframe(
+        name="lifespan_big",
+        dataframe=data,
+        covariates=covariates,
+        batch_effects=batch_effects,
+        response_vars=response_vars,
+        subject_ids=subject_ids,
+    )
+    return norm_data

{pcntoolkit-1.2.0.post1 → pcntoolkit-1.3.0}/pcntoolkit/dataio/norm_data.py RENAMED Viewed

@@ -510,7 +510,7 @@ class NormData(xr.Dataset):
             new_data_vars["Z"] = (["observations", "response_vars"], new_Z.data)
         if hasattr(self, "centiles") and hasattr(other, "centiles"):
-            if self.centile.to_numpy() == other.centile.to_numpy():
+            if np.array_equal(self.centile.to_numpy(), other.centile.to_numpy()):
                 new_centiles = xr.DataArray(
                     np.zeros((new_X.shape[0], len(respvar_intersection), len(self.centile.to_numpy()))),
                     dims=["observations", "response_vars", "centile"],
@@ -682,7 +682,23 @@ class NormData(xr.Dataset):
         names: Optional[Tuple[str, str]],
     ) -> Tuple[NormData, NormData]:
         """
-        Split the data into two datasets, one with the specified batch effects and one without.
+        Split the data into two datasets, one with the specified batch effects
+        and one without.
+        This is useful when you want to split a dataset into two smaller ones.
+        Parameters
+        ----------
+        batch_effects : Dict[str, List[str]]
+            A dictionary mapping batch effect dimensions to lists of values to
+            split on.
+        names : Optional[Tuple[str, str]]
+            The names for the two splits.
+        Returns
+        -------
+        Tuple[NormData, NormData]
+            A tuple containing the two split NormData instances.
         """
         if names is None:
             names = ["selected", "not_selected"]  # type:ignore
@@ -1033,19 +1049,31 @@ class NormData(xr.Dataset):
             self.attrs["is_scaled"] = False
-    def select_batch_effects(self, name, batch_effects: Dict[str, List[str]], invert: bool = False) -> NormData:
+    def select_batch_effects(
+        self,
+        name: str,
+        batch_effects: Dict[str, List[str]],
+        invert: bool = False,
+    ) -> NormData:
         """
-        Select only the specified batch effects.
+        Select observations matching (or not matching) batch effects.
         Parameters
         ----------
+        name : str
+            Name to assign to the returned ``NormData`` instance.
         batch_effects : Dict[str, List[str]]
-            A dictionary specifying which batch effects to select.
+            A dictionary mapping each batch effect dimension to the list of
+            values to select within that dimension.
+        invert : bool, optional
+            If ``True``, return observations that do *not* match
+            any of the specified batch effect values. Default is ``False``.
         Returns
         -------
         NormData
-            A NormData instance with the selected batch effects.
+            A NormData instance containing observations matching
+            (or not matching) the specified batch effects.
         """
         mask = np.zeros(self.batch_effects.shape[0], dtype=bool)
         for key, values in batch_effects.items():
@@ -1077,21 +1105,28 @@ class NormData(xr.Dataset):
         """
         acc = []
         x_columns = [col for col in ["X"] if hasattr(self, col)]
-        y_columns = [col for col in ["Y", "Y_harmonized", "Z"] if hasattr(self, col)]
+        y_columns = [col for col in ["Y", "Y_harmonized", "Z", "logp", "Yhat"]
+                     if hasattr(self, col)]
         acc.append(
             xr.Dataset.to_dataframe(self[x_columns], dim_order)
             .reset_index(drop=False)
-            .pivot(index="observations", columns="covariates", values=x_columns)
+            .pivot(index="observations",
+                   columns="covariates",
+                   values=x_columns)
         )
         acc.append(
             xr.Dataset.to_dataframe(self[y_columns], dim_order)
             .reset_index(drop=False)
-            .pivot(index="observations", columns="response_vars", values=y_columns)
+            .pivot(index="observations",
+                   columns="response_vars",
+                   values=y_columns)
         )
         be = (
             xr.DataArray.to_dataframe(self.batch_effects, dim_order)
             .reset_index(drop=False)
-            .pivot(index="observations", columns="batch_effect_dims", values="batch_effects")
+            .pivot(index="observations",
+                   columns="batch_effect_dims",
+                   values="batch_effects")
         )
         be.columns = [("batch_effects", col) for col in be.columns]
@@ -1127,7 +1162,7 @@ class NormData(xr.Dataset):
         res_path = os.path.join(save_dir, f"Z_{self.name}.csv")
         lock_path = res_path + ".lock"
         with FileLock(lock_path):
-            with open(res_path, mode="a+" if os.path.exists(res_path) else "w", encoding="utf-8") as f:
+            with open(res_path, mode="r+" if os.path.exists(res_path) else "w", encoding="utf-8") as f:
                 f.seek(0)
                 old_results = pd.read_csv(f) if os.path.getsize(res_path) > 0 else None
                 if old_results is not None:
@@ -1181,7 +1216,7 @@ class NormData(xr.Dataset):
         res_path = os.path.join(save_dir, f"centiles_{self.name}.csv")
         lock_path = res_path + ".lock"
         with FileLock(lock_path):
-            with open(res_path, mode="a+" if os.path.exists(res_path) else "w", encoding="utf-8") as f:
+            with open(res_path, mode="r+" if os.path.exists(res_path) else "w", encoding="utf-8") as f:
                 f.seek(0)
                 old_results = pd.read_csv(f) if os.path.getsize(res_path) > 0 else None
                 if old_results is not None:
@@ -1238,7 +1273,7 @@ class NormData(xr.Dataset):
         res_path = os.path.join(save_dir, f"logp_{self.name}.csv")
         lock_path = res_path + ".lock"
         with FileLock(lock_path):
-            with open(res_path, mode="a+" if os.path.exists(res_path) else "w", encoding="utf-8") as f:
+            with open(res_path, mode="r+" if os.path.exists(res_path) else "w", encoding="utf-8") as f:
                 f.seek(0)
                 old_results = pd.read_csv(f) if os.path.getsize(res_path) > 0 else None
                 if old_results is not None:
@@ -1282,7 +1317,7 @@ class NormData(xr.Dataset):
         res_path = os.path.join(save_dir, f"statistics_{self.name}.csv")
         lock_path = res_path + ".lock"
         with FileLock(lock_path):
-            with open(res_path, mode="a+" if os.path.exists(res_path) else "w", encoding="utf-8") as f:
+            with open(res_path, mode="r+" if os.path.exists(res_path) else "w", encoding="utf-8") as f:
                 f.seek(0)
                 old_results = pd.read_csv(f, index_col=0) if os.path.getsize(res_path) > 0 else None
                 if old_results is not None:
@@ -1336,7 +1371,7 @@ class NormData(xr.Dataset):
         This method creates a DataArray with dimensions 'response_vars' and 'statistics',
         where 'response_vars' corresponds to the response variables in the dataset,
-        and 'statistics' includes statistics such as Rho, RMSE, SMSE, EXPV, NLL, and ShapiroW.
+        and 'statistics' includes statistics such as Rho, RMSE, SMSE, EXPV, MLL, and ShapiroW.
         The DataArray is filled with NaN values initially.
         """
         rv = self.response_vars.to_numpy().copy().tolist()
@@ -1346,7 +1381,7 @@ class NormData(xr.Dataset):
             dims=("response_vars", "statistics"),
             coords={
                 "response_vars": np.arange(len(rv)),
-                "statistics": ["Rho", "Rho_p", "R2", "RMSE", "SMSE", "MSLL", "NLL", "ShapiroW", "MACE", "MAPE", "EXPV"],
+                "statistics": ["Rho", "Rho_p", "R2", "RMSE", "SMSE", "MSLL", "MLL", "ShapiroW", "MACE", "MAPE", "EXPV"],
             },
         )

{pcntoolkit-1.2.0.post1 → pcntoolkit-1.3.0}/pcntoolkit/math_functions/basis_function.py RENAMED Viewed

@@ -7,12 +7,14 @@ import numpy as np
 from scipy.interpolate import BSpline
 from pcntoolkit.util.output import Errors, Output
+from pcntoolkit.util.migration import registry
 def create_basis_function(
-    basis_type: str | dict | None,
-    basis_column: int = 0,
-    **kwargs,
-) -> BasisFunction:
+        basis_type: str | dict | None,
+        basis_column: int = 0,
+        **kwargs,
+        ) -> BasisFunction:
     if isinstance(basis_type, dict):
         return BasisFunction.from_dict(basis_type)
     elif basis_type in ["polynomial", "PolynomialBasisFunction"]:
@@ -25,6 +27,12 @@ def create_basis_function(
     elif basis_type in ["Composite", "CompositeBasis"]:
         parts = [BasisFunction.from_dict(p) for p in kwargs['parts']]
         return CompositeBasisFunction(parts)
+    elif basis_type in [
+            "fractional_polynomial",
+            "FractionalPolynomialBasisFunction"]:
+        return FractionalPolynomialBasisFunction(
+            basis_column, **kwargs
+        )
     else:
         return LinearBasisFunction(basis_column)
@@ -37,13 +45,19 @@ class BasisFunction(ABC):
         self.basis_column = basis_column
         self.is_fitted: bool = kwargs.get("is_fitted", False)
         self.basis_name: str = kwargs.get("basis_name", "basis")
-        self.min: float = kwargs.get("min", 0)
-        self.max: float = kwargs.get("max", 1)
-        self.compute_min: bool = self.min == 0
-        self.compute_max: bool = self.max == 1
+        self.min: float | None = kwargs.get("min", None)
+        self.max: float | None = kwargs.get("max", None)
+        self.compute_min: bool = self.min is None
+        self.compute_max: bool = self.max is None
     @classmethod
-    def from_dict(cls, my_dict: dict) -> BasisFunction:
+    def from_dict(
+        cls, my_dict: dict, version: str | None = None
+    ) -> "BasisFunction":
+        # Apply any registered BasisFunction migrations for this version.
+        my_dict = registry.migrate(
+            "BasisFunction", my_dict, version=version
+        )
         basis_function_type = my_dict["basis_function"]
         basis_function = create_basis_function(basis_function_type, **my_dict)
         return basis_function
@@ -264,4 +278,188 @@ class CompositeBasisFunction(BasisFunction):
     @property
     def dimension(self):
-        return sum([p.dimension for p in self.parts])
+        return sum([p.dimension for p in self.parts])
+class FractionalPolynomialBasisFunction(BasisFunction):
+    """
+    Fractional polynomial basis function for modelling smooth nonlinear
+    effects.
+    The input must be strictly positive (do not standardize the covariates).
+    Power convention:
+        p = 0      -> log(x)
+        p != 0     -> x**p
+    Repeated powers:
+        [p, p, p]  -> x**p, x**p * log(x), x**p * log(x)**2
+    """
+    DEFAULT_POWER_SET = [-2.0, -1.0, -0.5, 0.0, 0.5, 1.0, 2.0, 3.0]
+    AGE_FP_POWER_PRESETS = {
+        1: {
+            "default": [0.5],
+        },
+        2: {
+            "default": [0.5, 1.0],
+        },
+        3: {
+            "default": [0.5, 1.0, 2.0],
+        },
+    }
+    def __init__(
+        self,
+        basis_column: int = 0,
+        order: int = 3,
+        powers: list | tuple | str | None = "default",
+        power_set: list | tuple | None = None,
+        eps: float = 1e-8,
+        **kwargs,
+    ):
+        """
+        Initialise the fractional polynomial basis function.
+        Parameters
+        ----------
+        basis_column : int, default=0
+            Column index to transform.
+        order : int, default=3
+            Fractional polynomial order. Must be 1, 2, or 3.
+        powers : list, tuple, str, or None, default="default"
+        power_set : list, tuple, or None, default=None
+            Allowed fractional polynomial powers.
+        eps : float, default=1e-8
+            Numerical stability constant.
+        """
+        super().__init__(basis_column, **kwargs)
+        if order not in [1, 2, 3]:
+            raise ValueError("Fractional polynomial order must be 1, 2, or 3.")
+        self.basis_name = "fractional_polynomial"
+        self.order = int(order)
+        self.eps = float(eps)
+        self.power_set = (
+            list(self.DEFAULT_POWER_SET)
+            if power_set is None
+            else [float(p) for p in power_set]
+        )
+        if powers is None:
+            powers = "default"
+        if isinstance(powers, str):
+            presets = self.AGE_FP_POWER_PRESETS[self.order]
+            if powers not in presets:
+                raise ValueError(
+                    f"Unknown preset '{powers}' for FP order {self.order}. "
+                    f"Available presets are: {list(presets.keys())}"
+                )
+            self.powers = list(presets[powers])
+        else:
+            self.powers = [float(p) for p in powers]
+        if len(self.powers) != self.order:
+            raise ValueError(
+                f"FP order {self.order} requires exactly {self.order} powers, "
+                f"but received {len(self.powers)}: {self.powers}"
+            )
+        for power in self.powers:
+            if power not in self.power_set:
+                raise ValueError(
+                    f"Power {power} is not in the allowed FP power set: "
+                    f"{self.power_set}"
+                )
+    def _validate_positive_finite_input(self, data: np.ndarray) -> np.ndarray:
+        """
+        Validate that input values are finite and strictly positive.
+        Returns
+        -------
+        np.ndarray
+            One-dimensional validated input array.
+        """
+        x = np.asarray(data, dtype=float).reshape(-1)
+        if not np.all(np.isfinite(x)):
+            raise ValueError(
+                "FractionalPolynomialBasisFunction received non-finite values."
+            )
+        if np.any(x <= 0):
+            raise ValueError(
+                "FractionalPolynomialBasisFunction requires strictly positive "
+                "input values. Please shift or rescale the covariate before "
+                "applying this basis function."
+            )
+        return np.maximum(x, self.eps)
+    def _fit(self, data: np.ndarray) -> None:
+        """
+        This method exists only for compatibility with the parent class.
+        It only validates training data without computing or storing any
+        parameters.
+        """
+        self._validate_positive_finite_input(data)
+    def _transform(self, data: np.ndarray) -> np.ndarray:
+        """
+        Transform data into the fractional polynomial basis matrix.
+        Returns
+        -------
+        np.ndarray
+            Basis matrix of shape `(n_samples, order)`.
+        """
+        x = self._validate_positive_finite_input(data)
+        log_x = np.log(x)
+        columns = []
+        power_counts = {}
+        for power in self.powers:
+            repeat_index = power_counts.get(power, 0)
+            if power == 0.0:
+                column = log_x.copy()
+            else:
+                column = np.power(x, power)
+            if repeat_index > 0:
+                column = column * np.power(log_x, repeat_index)
+            columns.append(column)
+            power_counts[power] = repeat_index + 1
+        return np.column_stack(columns)
+    @property
+    def dimension(self) -> int:
+        """
+        Number of generated basis columns.
+        """
+        return self.order
+    def to_dict(self) -> dict:
+        """
+        Serialize the basis function configuration.
+        """
+        mydict = super().to_dict()
+        mydict["order"] = self.order
+        mydict["powers"] = list(self.powers)
+        mydict["power_set"] = list(self.power_set)
+        mydict["eps"] = self.eps
+        return mydict

pcntoolkit 1.2.0.post1__tar.gz → 1.3.0__tar.gz

pcntoolkit 1.2.0.post1__tar.gz → 1.3.0__tar.gz