PyPI - pcntoolkit - Versions diffs - 1.1.2__tar.gz → 1.2.0__tar.gz - Mend

pcntoolkit 1.1.2.tar.gz → 1.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42)

{pcntoolkit-1.1.2 → pcntoolkit-1.2.0}/PKG-INFO RENAMED Viewed

@@ -1,10 +1,10 @@
 Metadata-Version: 2.4
 Name: pcntoolkit
-Version: 1.1.2
+Version: 1.2.0
 Summary: Predictive Clinical Neuroscience Toolkit
 Author: Andre Marquand, Stijn de Boer, Seyed Mostafa Kia, Saige Rutherford, Charlotte Fraza, Barbora Rehák Bučková, Pieter Barkema, Thomas Wolfers, Mariam Zabihi, Richard Dinga, Johanna Bayer, Maarten Mennes, Hester Huijsdens, Linden Parkes, Pierre Berthet
 License-Expression: GPL-3.0-only
-Requires-Python: <3.13,>=3.10
+Requires-Python: <3.13,>=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: nibabel>=5.3.1
@@ -15,11 +15,13 @@ Requires-Dist: scipy>=1.12
 Requires-Dist: matplotlib>=3.9.2
 Requires-Dist: seaborn>=0.13.2
 Requires-Dist: numba>=0.60.0
-Requires-Dist: nutpie>=0.13.2
+Requires-Dist: nutpie>=0.16.5
 Requires-Dist: joblib>=1.4.2
 Requires-Dist: dill>=0.3.9
 Requires-Dist: ipywidgets>=8.1.5
 Requires-Dist: ipykernel>=6.29.5
+Requires-Dist: dask>=2025.11.0
+Requires-Dist: filelock>=3.13.0
 Provides-Extra: dev
 Requires-Dist: toml; extra == "dev"
 Requires-Dist: sphinx-tabs>=3.4.7; extra == "dev"

{pcntoolkit-1.1.2 → pcntoolkit-1.2.0}/pcntoolkit/__init__.py RENAMED Viewed

@@ -1,6 +1,6 @@
 from .dataio.data_factory import load_fcon1000
 from .dataio.norm_data import NormData
-from .math_functions.basis_function import BsplineBasisFunction, LinearBasisFunction, PolynomialBasisFunction
+from .math_functions.basis_function import BsplineBasisFunction, LinearBasisFunction, PolynomialBasisFunction, CompositeBasisFunction
 from .math_functions.likelihood import BetaLikelihood, NormalLikelihood, SHASHbLikelihood
 from .math_functions.prior import make_prior
 from .normative_model import NormativeModel
@@ -16,6 +16,7 @@ __all__ = [
     "BsplineBasisFunction",
     "LinearBasisFunction",
     "PolynomialBasisFunction",
+    "ComnpositeBasisFunction",
     "NormativeModel",
     "BLR",
     "HBR",

{pcntoolkit-1.1.2 → pcntoolkit-1.2.0}/pcntoolkit/dataio/fileio.py RENAMED Viewed

@@ -3,6 +3,7 @@ from __future__ import print_function
 import os
 import re
 import shutil
+import subprocess
 import sys
 import tempfile
@@ -259,8 +260,8 @@ def load_cifti(filename, vol=False, mask=None, rmtmp=True):
     Output.print(Messages.EXTRACTING_CIFTI_SURFACE_DATA, outstem=outstem)
     giinamel = outstem + "-left.func.gii"
     giinamer = outstem + "-right.func.gii"
-    os.system("wb_command -cifti-separate " + filename + " COLUMN -metric CORTEX_LEFT " + giinamel)
-    os.system("wb_command -cifti-separate " + filename + " COLUMN -metric CORTEX_RIGHT " + giinamer)
+    subprocess.run(["wb_command", "-cifti-separate", filename, "COLUMN", "-metric", "CORTEX_LEFT", giinamel], check=True)
+    subprocess.run(["wb_command", "-cifti-separate", filename, "COLUMN", "-metric", "CORTEX_RIGHT", giinamer], check=True)
     # load the surface data
     giil = nib.load(giinamel)
@@ -284,7 +285,7 @@ def load_cifti(filename, vol=False, mask=None, rmtmp=True):
     if vol:
         niiname = outstem + "-vol.nii"
         Output.print(Messages.EXTRACTING_CIFTI_VOLUME_DATA, niiname=niiname)
-        os.system("wb_command -cifti-separate " + filename + " COLUMN -volume-all " + niiname)
+        subprocess.run(["wb_command", "-cifti-separate", filename, "COLUMN", "-volume-all", niiname], check=True)
         vol = load_nifti(niiname, vol=True)
         volmask = create_mask(vol)
         out = np.concatenate((out, vol2vec(vol, volmask)), axis=0)
@@ -331,8 +332,8 @@ def save_cifti(data, filename, example, mask=None, vol=True, volatlas=None):
     estem = os.path.join(tempfile.gettempdir(), str(os.getpid()) + "-" + fstem)
     giiexnamel = estem + "-left.func.gii"
     giiexnamer = estem + "-right.func.gii"
-    os.system("wb_command -cifti-separate " + example + " COLUMN -metric CORTEX_LEFT " + giiexnamel)
-    os.system("wb_command -cifti-separate " + example + " COLUMN -metric CORTEX_RIGHT " + giiexnamer)
+    subprocess.run(["wb_command", "-cifti-separate", example, "COLUMN", "-metric", "CORTEX_LEFT", giiexnamel], check=True)
+    subprocess.run(["wb_command", "-cifti-separate", example, "COLUMN", "-metric", "CORTEX_RIGHT", giiexnamer], check=True)
     # write left hemisphere
     giiexl = nib.load(giiexnamel)
@@ -359,7 +360,7 @@ def save_cifti(data, filename, example, mask=None, vol=True, volatlas=None):
     # process volumetric data
     if vol:
         niiexname = estem + "-vol.nii"
-        os.system("wb_command -cifti-separate " + example + " COLUMN -volume-all " + niiexname)
+        subprocess.run(["wb_command", "-cifti-separate", example, "COLUMN", "-volume-all", niiexname], check=True)
         niivol = load_nifti(niiexname, vol=True)
         if mask is None:
             mask = create_mask(niivol)
@@ -373,17 +374,13 @@ def save_cifti(data, filename, example, mask=None, vol=True, volatlas=None):
     # write cifti
     fname = fstem + ".dtseries.nii"
-    os.system(
-        "wb_command -cifti-create-dense-timeseries "
-        + fname
-        + " -volume "
-        + fnamev
-        + " "
-        + volatlas
-        + " -left-metric "
-        + fnamel
-        + " -right-metric "
-        + fnamer
+    subprocess.run(
+        [
+            "wb_command", "-cifti-create-dense-timeseries",
+            fname, "-volume", fnamev, volatlas,
+            "-left-metric", fnamel, "-right-metric", fnamer,
+        ],
+        check=True,
     )
     # clean up

{pcntoolkit-1.1.2 → pcntoolkit-1.2.0}/pcntoolkit/dataio/norm_data.py RENAMED Viewed

@@ -11,7 +11,7 @@ is used by all the models in the toolkit.
 from __future__ import annotations
 import copy
-import fcntl
+import json
 import os
 from collections import defaultdict
 from functools import reduce
@@ -32,20 +32,21 @@ from typing import (
 # pylint: enable=deprecated-class
 import numpy as np
-from numpy.typing import ArrayLike
 import pandas as pd  # type: ignore
 import xarray as xr
 from nibabel.loadsave import load
+from numpy.typing import ArrayLike
+from scipy import stats
 from sklearn.model_selection import StratifiedKFold, train_test_split  # type: ignore
 # import datavars from xarray
 from xarray.core.types import DataVars
+from filelock import FileLock
 from pcntoolkit.dataio.fileio import load
 from pcntoolkit.util.output import Messages, Output, Warnings
-from scipy import stats
 class NormData(xr.Dataset):
     """A class for handling normative modeling data, extending xarray.Dataset.
@@ -212,8 +213,7 @@ class NormData(xr.Dataset):
         NormData
             An instance of NormData.
         """
-        img = load(fsl_folder)
-        dat = img.get_fdata()
+        raise NotImplementedError("from_fsl is not yet implemented.")
     @classmethod
     def from_bids(cls, bids_folder, config_params) -> "NormData":  # type: ignore
@@ -232,6 +232,7 @@ class NormData(xr.Dataset):
         NormData
             An instance of NormData.
         """
+        raise NotImplementedError("from_bids is not yet implemented.")
     @classmethod
     def from_xarray(cls, name: str, xarray_dataset: xr.Dataset) -> NormData:
@@ -257,6 +258,35 @@ class NormData(xr.Dataset):
             xarray_dataset.attrs,
         )
+    @classmethod
+    def from_netcdf(cls, name: str, netcdf_path: str) -> NormData:
+        """
+        Load a normative dataset from a netcdf file.
+        Parameters
+        ----------
+        name: str
+            The name of the dataset.
+        netcdf_path: str
+            The path to the netcdf file.
+        Returns
+        -------
+        NormData
+            An instance of NormData.
+        """
+        xr_dset = xr.open_dataset(netcdf_path)
+        # Deserialize the attributes.
+        for attr in xr_dset.attrs:
+            if attr in xr_dset.attrs:
+                xr_dset.attrs[attr] = json.loads(xr_dset.attrs[attr])
+        if "batch_effect_counts" in xr_dset.attrs and xr_dset.attrs["batch_effect_counts"]:
+            # Convert the batch_effect_counts to a defaultdict
+            xr_dset.attrs["batch_effect_counts"] = defaultdict(lambda: 0, xr_dset.attrs["batch_effect_counts"])
+        return cls.from_xarray(name=name, xarray_dataset=xr_dset)
     # pylint: disable=arguments-differ
     @classmethod
     def from_dataframe(  # type:ignore
@@ -292,7 +322,7 @@ class NormData(xr.Dataset):
         attrs : Mapping[str, Any] | None, optional
             Additional attributes for the dataset, by default None.
         remove_Nan: bool
-            Wheter or not to remove NAN values from the dataframe before creationg of the class object. By default False
+            Whether or not to remove NAN values from the dataframe before creating of the class object. By default False
         Returns
         -------
@@ -358,6 +388,26 @@ class NormData(xr.Dataset):
             attrs,
         )
+    def to_netcdf(self, netcdf_path: str) -> None:
+        """
+        Save the NormData object to a netcdf file.
+        Parameters
+        ----------
+        netcdf_path: str
+            The path to the netcdf file.
+        Returns
+        -------
+        None
+        """
+        ds = self.copy(deep=False)
+        # Serialize the attributes using json so that they can be saved to netcdf
+        for attr in ds.attrs:
+            if attr in ds.attrs:
+                ds.attrs[attr] = json.dumps(ds.attrs[attr])
+        xr.Dataset.to_netcdf(ds, netcdf_path, invalid_netcdf=False, format="NETCDF4")
     @classmethod
     def remove_nan(cls, dataframe: pd.DataFrame) -> pd.DataFrame:
         """
@@ -367,7 +417,6 @@ class NormData(xr.Dataset):
         Output.print(f"Removed {len(dataframe) - len(cleaned)} NANs")
         return cleaned
     @classmethod
     def remove_outliers(cls, dataframe: pd.DataFrame, continuous_vars: List[str], z_threshold: float = 3.0) -> pd.DataFrame:
         """
@@ -385,7 +434,6 @@ class NormData(xr.Dataset):
         Output.print(f"Removed {np.sum(~idx)} outliers")
         return dataframe.loc[idx]
     def merge(self, other: NormData, name: str | None = None) -> NormData:
         """
         Merge two NormData objects.
@@ -643,41 +691,71 @@ class NormData(xr.Dataset):
         B = self.select_batch_effects(names[1], batch_effects, invert=True)
         return A, B
+    def has_registered_metadata(self) -> bool:
+        """
+        Check if the batch effect and covariate metadata have been registered and are non-empty.
+        Returns
+        -------
+        bool
+            True if all required metadata attributes exist and are not empty, False otherwise.
+        """
+        required_attrs = [
+            "unique_batch_effects",
+            "batch_effect_counts",
+            "covariate_ranges",
+            "batch_effect_covariate_ranges",
+        ]
+        for attr in required_attrs:
+            # Check if attribute exists and is not an empty dict/defaultdict
+            if attr not in self.attrs or not self.attrs[attr]:
+                return False
+        return True
     def register_batch_effects(self) -> None:
         """
         Create a mapping of batch effects to unique values.
         """
+        if self.has_registered_metadata():
+            return
         my_be: xr.DataArray = self.batch_effects
         # create a dictionary with for each column in the batch effects, a dict from value to int
         self.attrs["unique_batch_effects"] = {}
         self.attrs["batch_effect_counts"] = defaultdict(lambda: 0)
         self.attrs["covariate_ranges"] = {}
-        # TODO: the following can be done much easier using df.groupby.min and xarray.unstack, but that is a TODO for another day. This works for now.
         self.attrs["batch_effect_covariate_ranges"] = {}
-        for dim in self.batch_effect_dims.to_numpy():
-            dim_subset = my_be.sel(batch_effect_dims=dim)
-            uniques, counts = np.unique(dim_subset, return_counts=True)
-            self.attrs["unique_batch_effects"][dim] = list(uniques)
-            self.attrs["batch_effect_counts"][dim] = {k: int(v) for k, v in zip(uniques, counts)}
+        # Vectorized implementation using pandas groupby/agg
+        be_cols = self.batch_effect_dims.to_numpy()
+        be_df = pd.DataFrame(my_be.values, columns=be_cols)
+        x_available = "X" in self.data_vars
+        if x_available:
+            covs = self.covariates.to_numpy()
+            X_df = pd.DataFrame(self.X.values, columns=covs)
+        for dim in be_cols:
+            vc = be_df[dim].value_counts(sort=False)
+            self.attrs["unique_batch_effects"][dim] = vc.index.astype(str).tolist()
+            self.attrs["batch_effect_counts"][dim] = {str(k): int(v) for k, v in vc.to_dict().items()}
             self.attrs["batch_effect_covariate_ranges"][dim] = {}
-            if self.X is not None:
-                for u in uniques:
-                    self.attrs["batch_effect_covariate_ranges"][dim][u] = {}
-                    for c in self.covariates.to_numpy():
-                        u_mask = dim_subset.values == u
-                        my_c = self.X.sel(covariates=c).values[u_mask]
-                        my_min = my_c.min()
-                        my_max = my_c.max()
-                        my_mean = my_c.mean()
-                        self.attrs["batch_effect_covariate_ranges"][dim][u][c] = {"mean": my_mean, "min": my_min, "max": my_max}
-        for c in self.covariates.to_numpy():
-            my_c = self.X.sel(covariates=c).values
-            my_mean = my_c.mean()
-            my_min = my_c.min()
-            my_max = my_c.max()
-            self.attrs["covariate_ranges"][c] = {"mean": my_mean, "min": my_min, "max": my_max}
+            if x_available:
+                grouped = X_df.groupby(be_df[dim], sort=False).agg(["min", "max"])
+                for u, row in grouped.iterrows():
+                    self.attrs["batch_effect_covariate_ranges"][dim][u] = {
+                        c: {"min": float(row[(c, "min")]), "max": float(row[(c, "max")])} for c in covs
+                    }
+        if x_available:
+            overall = X_df.agg(["min", "max"])
+            for c in covs:
+                self.attrs["covariate_ranges"][c] = {
+                    "min": float(overall.loc["min", c]),
+                    "max": float(overall.loc["max", c]),
+                }
     def check_compatibility(self, other: NormData) -> bool:
         """
@@ -735,7 +813,6 @@ class NormData(xr.Dataset):
             for cov in self.covariates.to_numpy()
         }
         mybecr = self.batch_effect_covariate_ranges
         otbecr = other.batch_effect_covariate_ranges
         nbecr = {}
@@ -757,6 +834,7 @@ class NormData(xr.Dataset):
                         case False, False:
                             raise ValueError("This should never happen")
+        # Update instance attributes
         self.unique_batch_effects = copy.deepcopy(all_unique_batch_effects)
         other.unique_batch_effects = copy.deepcopy(all_unique_batch_effects)
         self.covariate_ranges = copy.deepcopy(ncr)
@@ -764,6 +842,14 @@ class NormData(xr.Dataset):
         self.batch_effect_covariate_ranges = copy.deepcopy(nbecr)
         other.batch_effect_covariate_ranges = copy.deepcopy(nbecr)
+        # Update xarray attrs dicts to make them in sync.
+        self.attrs["unique_batch_effects"] = copy.copy(self.unique_batch_effects)
+        other.attrs["unique_batch_effects"] = copy.copy(other.unique_batch_effects)
+        self.attrs["covariate_ranges"] = copy.copy(self.covariate_ranges)
+        other.attrs["covariate_ranges"] = copy.copy(other.covariate_ranges)
+        self.attrs["batch_effect_covariate_ranges"] = copy.copy(self.batch_effect_covariate_ranges)
+        other.attrs["batch_effect_covariate_ranges"] = copy.copy(other.batch_effect_covariate_ranges)
     def scale_forward(self, inscalers: Dict[str, Any], outscalers: Dict[str, Any]) -> None:
         """
         Scale the data forward in-place using provided scalers.
@@ -1036,12 +1122,12 @@ class NormData(xr.Dataset):
         zdf = self.Z.to_dataframe().unstack(level="response_vars")
         zdf.columns = zdf.columns.droplevel(0)
         zdf = zdf.merge(self.subject_ids.to_dataframe(), on="observations", how="left")
-        zdf = zdf[[ "subject_ids", *[z for z in sorted(zdf.columns.tolist()) if z not in ["subject_ids"]]]]
+        zdf = zdf[["subject_ids", *[z for z in sorted(zdf.columns.tolist()) if z not in ["subject_ids"]]]]
         zdf.index = zdf.index.astype(str)
         res_path = os.path.join(save_dir, f"Z_{self.name}.csv")
-        with open(res_path, mode="a+" if os.path.exists(res_path) else "w", encoding="utf-8") as f:
-            try:
-                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+        lock_path = res_path + ".lock"
+        with FileLock(lock_path):
+            with open(res_path, mode="a+" if os.path.exists(res_path) else "w", encoding="utf-8") as f:
                 f.seek(0)
                 old_results = pd.read_csv(f) if os.path.getsize(res_path) > 0 else None
                 if old_results is not None:
@@ -1067,8 +1153,6 @@ class NormData(xr.Dataset):
                     )
                     new_results.index = new_results.index.astype(str)
                 new_results.to_csv(f)
-            finally:
-                fcntl.flock(f.fileno(), fcntl.LOCK_UN)
     def load_zscores(self, save_dir) -> None:
         Z_path = os.path.join(save_dir, f"Z_{self.name}.csv")
@@ -1089,15 +1173,15 @@ class NormData(xr.Dataset):
         subject_ids.index = subject_ids.index.astype(str)
         subject_ids.columns = pd.MultiIndex.from_tuples([("subject_ids", "X")], names=["subject_ids", "centile"])
         for c in self.centile.to_numpy():
-            subject_ids[("subject_ids", c)] = subject_ids[("subject_ids","X")]
+            subject_ids[("subject_ids", c)] = subject_ids[("subject_ids", "X")]
         subject_ids = subject_ids.drop(columns=[("subject_ids", "X")])
         subject_ids = subject_ids.stack(level="centile")
-        centiles = centiles.merge(subject_ids, on=["observations","centile"], how="left")
-        centiles = centiles[[ "subject_ids", *[z for z in sorted(centiles.columns.tolist()) if z not in ["subject_ids"]]]]
+        centiles = centiles.merge(subject_ids, on=["observations", "centile"], how="left")
+        centiles = centiles[["subject_ids", *[z for z in sorted(centiles.columns.tolist()) if z not in ["subject_ids"]]]]
         res_path = os.path.join(save_dir, f"centiles_{self.name}.csv")
-        with open(res_path, mode="a+" if os.path.exists(res_path) else "w", encoding="utf-8") as f:
-            try:
-                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+        lock_path = res_path + ".lock"
+        with FileLock(lock_path):
+            with open(res_path, mode="a+" if os.path.exists(res_path) else "w", encoding="utf-8") as f:
                 f.seek(0)
                 old_results = pd.read_csv(f) if os.path.getsize(res_path) > 0 else None
                 if old_results is not None:
@@ -1123,8 +1207,6 @@ class NormData(xr.Dataset):
                     )
                     # new_results.index = new_results.index.astype(str)
                 new_results.to_csv(f)
-            finally:
-                fcntl.flock(f.fileno(), fcntl.LOCK_UN)
     def load_centiles(self, save_dir) -> None:
         C_path = os.path.join(save_dir, f"centiles_{self.name}.csv")
@@ -1137,7 +1219,7 @@ class NormData(xr.Dataset):
             A = np.zeros((len(centiles), len(obs), len(response_vars)))
             for i, c in enumerate(centiles):
                 sub = df[df["centile"] == c]
-                sub.sort_values(by="observations")
+                sub = sub.sort_values(by="observations")
                 for j, rv in enumerate(response_vars):
                     A[i, :, j] = sub[rv]
@@ -1151,12 +1233,12 @@ class NormData(xr.Dataset):
         logp = self.logp.to_dataframe().unstack(level="response_vars")
         logp.columns = logp.columns.droplevel(0)
         logp = logp.merge(self.subject_ids.to_dataframe(), on="observations", how="left")
-        logp = logp[[ "subject_ids", *[z for z in sorted(logp.columns.tolist()) if z not in ["subject_ids"]]]]
+        logp = logp[["subject_ids", *[z for z in sorted(logp.columns.tolist()) if z not in ["subject_ids"]]]]
         logp.index = logp.index.astype(str)
         res_path = os.path.join(save_dir, f"logp_{self.name}.csv")
-        with open(res_path, mode="a+" if os.path.exists(res_path) else "w", encoding="utf-8") as f:
-            try:
-                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+        lock_path = res_path + ".lock"
+        with FileLock(lock_path):
+            with open(res_path, mode="a+" if os.path.exists(res_path) else "w", encoding="utf-8") as f:
                 f.seek(0)
                 old_results = pd.read_csv(f) if os.path.getsize(res_path) > 0 else None
                 if old_results is not None:
@@ -1182,8 +1264,6 @@ class NormData(xr.Dataset):
                     )
                     new_results.index = new_results.index.astype(str)
                 new_results.to_csv(f)
-            finally:
-                fcntl.flock(f.fileno(), fcntl.LOCK_UN)
     def load_logp(self, save_dir) -> None:
         logp_path = os.path.join(save_dir, f"logp_{self.name}.csv")
@@ -1200,9 +1280,9 @@ class NormData(xr.Dataset):
         mdf = self.statistics.to_dataframe().unstack(level="response_vars")
         mdf.columns = mdf.columns.droplevel(0)
         res_path = os.path.join(save_dir, f"statistics_{self.name}.csv")
-        with open(res_path, mode="a+" if os.path.exists(res_path) else "w", encoding="utf-8") as f:
-            try:
-                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+        lock_path = res_path + ".lock"
+        with FileLock(lock_path):
+            with open(res_path, mode="a+" if os.path.exists(res_path) else "w", encoding="utf-8") as f:
                 f.seek(0)
                 old_results = pd.read_csv(f, index_col=0) if os.path.getsize(res_path) > 0 else None
                 if old_results is not None:
@@ -1215,8 +1295,6 @@ class NormData(xr.Dataset):
                 f.seek(0)
                 f.truncate()
                 new_results.to_csv(f)
-            finally:
-                fcntl.flock(f.fileno(), fcntl.LOCK_UN)
     def load_statistics(self, save_dir) -> None:
         logp_path = os.path.join(save_dir, f"statistics_{self.name}.csv")

pcntoolkit 1.1.2__tar.gz → 1.2.0__tar.gz

pcntoolkit 1.1.2.tar.gz → 1.2.0.tar.gz