PyPI - cpgtools - Versions diffs - 2.0.5__py3-none-any.whl - Mend

cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (104) hide show

cpgmodule/BED.py +441 -0
cpgmodule/MI.py +193 -0
cpgmodule/__init__.py +0 -0
cpgmodule/_version.py +1 -0
cpgmodule/cgID.py +866897 -0
cpgmodule/data/AltumAge_cpg.pkl +0 -0
cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
cpgmodule/data/AltumAge_scaler.pkl +0 -0
cpgmodule/data/GA_Bohlin.pkl +0 -0
cpgmodule/data/GA_Haftorn.pkl +0 -0
cpgmodule/data/GA_Knight.pkl +0 -0
cpgmodule/data/GA_Lee_CPC.pkl +0 -0
cpgmodule/data/GA_Lee_RPC.pkl +0 -0
cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
cpgmodule/data/GA_Mayne.pkl +0 -0
cpgmodule/data/Hannum.pkl +0 -0
cpgmodule/data/Horvath_2013.pkl +0 -0
cpgmodule/data/Horvath_2018.pkl +0 -0
cpgmodule/data/Levine.pkl +0 -0
cpgmodule/data/Lu_DNAmTL.pkl +0 -0
cpgmodule/data/Ped_McEwen.pkl +0 -0
cpgmodule/data/Ped_Wu.pkl +0 -0
cpgmodule/data/Zhang_BLUP.pkl +0 -0
cpgmodule/data/Zhang_EN.pkl +0 -0
cpgmodule/data/__init__.py +0 -0
cpgmodule/extend_bed.py +147 -0
cpgmodule/imotif.py +348 -0
cpgmodule/ireader.py +28 -0
cpgmodule/methylClock.py +53 -0
cpgmodule/padjust.py +58 -0
cpgmodule/region2gene.py +170 -0
cpgmodule/utils.py +642 -0
cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
cpgtools-2.0.5.dist-info/METADATA +59 -0
cpgtools-2.0.5.dist-info/RECORD +104 -0
cpgtools-2.0.5.dist-info/WHEEL +5 -0
cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
cpgtools-2.0.5.dist-info/top_level.txt +5 -0
impyute/__init__.py +3 -0
impyute/contrib/__init__.py +7 -0
impyute/contrib/compare.py +69 -0
impyute/contrib/count_missing.py +30 -0
impyute/contrib/describe.py +63 -0
impyute/cs/__init__.py +11 -0
impyute/cs/buck_iterative.py +82 -0
impyute/cs/central_tendency.py +84 -0
impyute/cs/em.py +52 -0
impyute/cs/fast_knn.py +130 -0
impyute/cs/random.py +27 -0
impyute/dataset/__init__.py +6 -0
impyute/dataset/base.py +137 -0
impyute/dataset/corrupt.py +55 -0
impyute/deletion/__init__.py +5 -0
impyute/deletion/complete_case.py +21 -0
impyute/ops/__init__.py +12 -0
impyute/ops/error.py +9 -0
impyute/ops/inverse_distance_weighting.py +31 -0
impyute/ops/matrix.py +47 -0
impyute/ops/testing.py +20 -0
impyute/ops/util.py +96 -0
impyute/ops/wrapper.py +179 -0
impyute/ts/__init__.py +6 -0
impyute/ts/locf.py +57 -0
impyute/ts/moving_window.py +128 -0
impyutelib.py +890 -0
missingpy/__init__.py +4 -0
missingpy/knnimpute.py +328 -0
missingpy/missforest.py +556 -0
missingpy/pairwise_external.py +315 -0
missingpy/tests/__init__.py +0 -0
missingpy/tests/test_knnimpute.py +605 -0
missingpy/tests/test_missforest.py +409 -0
missingpy/utils.py +124 -0
misspylib.py +565 -0

missingpy/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+#from .knnimpute import KNNImputer
+from .missforest import MissForest
+__all__ = ['MissForest']

missingpy/knnimpute.py ADDED Viewed

@@ -0,0 +1,328 @@
+"""KNN Imputer for Missing Data"""
+# Author: Ashim Bhattarai
+# License: GNU General Public License v3 (GPLv3)
+import warnings
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils import check_array
+from sklearn.utils.validation import check_is_fitted
+from sklearn.utils.validation import FLOAT_DTYPES
+from sklearn.neighbors.base import _check_weights
+from sklearn.neighbors.base import _get_weights
+from .pairwise_external import pairwise_distances
+from .pairwise_external import _get_mask
+from .pairwise_external import _MASKED_METRICS
+__all__ = [
+    'KNNImputer',
+]
+class KNNImputer(BaseEstimator, TransformerMixin):
+    """Imputation for completing missing values using k-Nearest Neighbors.
+    Each sample's missing values are imputed using values from ``n_neighbors``
+    nearest neighbors found in the training set. Each missing feature is then
+    imputed as the average, either weighted or unweighted, of these neighbors.
+    Note that if a sample has more than one feature missing, then the
+    neighbors for that sample can be different depending on the particular
+    feature being imputed. Finally, where the number of donor neighbors is
+    less than ``n_neighbors``, the training set average for that feature is
+    used during imputation.
+    Parameters
+    ----------
+    missing_values : integer or "NaN", optional (default = "NaN")
+        The placeholder for the missing values. All occurrences of
+        `missing_values` will be imputed. For missing values encoded as
+        ``np.nan``, use the string value "NaN".
+    n_neighbors : int, optional (default = 5)
+        Number of neighboring samples to use for imputation.
+    weights : str or callable, optional (default = "uniform")
+        Weight function used in prediction.  Possible values:
+        - 'uniform' : uniform weights.  All points in each neighborhood
+          are weighted equally.
+        - 'distance' : weight points by the inverse of their distance.
+          in this case, closer neighbors of a query point will have a
+          greater influence than neighbors which are further away.
+        - [callable] : a user-defined function which accepts an
+          array of distances, and returns an array of the same shape
+          containing the weights.
+    metric : str or callable, optional (default = "masked_euclidean")
+        Distance metric for searching neighbors. Possible values:
+        - 'masked_euclidean'
+        - [callable] : a user-defined function which conforms to the
+        definition of _pairwise_callable(X, Y, metric, **kwds). In other
+        words, the function accepts two arrays, X and Y, and a
+        ``missing_values`` keyword in **kwds and returns a scalar distance
+        value.
+    row_max_missing : float, optional (default = 0.5)
+        The maximum fraction of columns (i.e. features) that can be missing
+        before the sample is excluded from nearest neighbor imputation. It
+        means that such rows will not be considered a potential donor in
+        ``fit()``, and in ``transform()`` their missing feature values will be
+        imputed to be the column mean for the entire dataset.
+    col_max_missing : float, optional (default = 0.8)
+        The maximum fraction of rows (or samples) that can be missing
+        for any feature beyond which an error is raised.
+    copy : boolean, optional (default = True)
+        If True, a copy of X will be created. If False, imputation will
+        be done in-place whenever possible. Note that, if metric is
+        "masked_euclidean" and copy=False then missing_values in the
+        input matrix X will be overwritten with zeros.
+    Attributes
+    ----------
+    statistics_ : 1-D array of length {n_features}
+        The 1-D array contains the mean of each feature calculated using
+        observed (i.e. non-missing) values. This is used for imputing
+        missing values in samples that are either excluded from nearest
+        neighbors search because they have too many ( > row_max_missing)
+        missing features or because all of the sample's k-nearest neighbors
+        (i.e., the potential donors) also have the relevant feature value
+        missing.
+    References
+    ----------
+    * Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor
+      Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing
+      value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17
+      no. 6, 2001 Pages 520-525.
+    Examples
+    --------
+    >>> from missingpy import KNNImputer
+    >>> nan = float("NaN")
+    >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]
+    >>> imputer = KNNImputer(n_neighbors=2, weights="uniform")
+    >>> imputer.fit_transform(X)
+    array([[1. , 2. , 4. ],
+           [3. , 4. , 3. ],
+           [5.5, 6. , 5. ],
+           [8. , 8. , 7. ]])
+    """
+    def __init__(self, missing_values="NaN", n_neighbors=5,
+                 weights="uniform", metric="masked_euclidean",
+                 row_max_missing=0.5, col_max_missing=0.8, copy=True):
+        self.missing_values = missing_values
+        self.n_neighbors = n_neighbors
+        self.weights = weights
+        self.metric = metric
+        self.row_max_missing = row_max_missing
+        self.col_max_missing = col_max_missing
+        self.copy = copy
+    def _impute(self, dist, X, fitted_X, mask, mask_fx):
+        """Helper function to find and impute missing values"""
+        # For each column, find and impute
+        n_rows_X, n_cols_X = X.shape
+        for c in range(n_cols_X):
+            if not np.any(mask[:, c], axis=0):
+                continue
+            # Row index for receivers and potential donors (pdonors)
+            receivers_row_idx = np.where(mask[:, c])[0]
+            pdonors_row_idx = np.where(~mask_fx[:, c])[0]
+            # Impute using column mean if n_neighbors are not available
+            if len(pdonors_row_idx) < self.n_neighbors:
+                warnings.warn("Insufficient number of neighbors! "
+                              "Filling in column mean.")
+                X[receivers_row_idx, c] = self.statistics_[c]
+                continue
+            # Get distance from potential donors
+            dist_pdonors = dist[receivers_row_idx][:, pdonors_row_idx]
+            dist_pdonors = dist_pdonors.reshape(-1,
+                                                len(pdonors_row_idx))
+            # Argpartition to separate actual donors from the rest
+            pdonors_idx = np.argpartition(
+                dist_pdonors, self.n_neighbors - 1, axis=1)
+            # Get final donors row index from pdonors
+            donors_idx = pdonors_idx[:, :self.n_neighbors]
+            # Get weights or None
+            dist_pdonors_rows = np.arange(len(donors_idx))[:, None]
+            weight_matrix = _get_weights(
+                dist_pdonors[
+                    dist_pdonors_rows, donors_idx], self.weights)
+            donor_row_idx_ravel = donors_idx.ravel()
+            # Retrieve donor values and calculate kNN score
+            fitted_X_temp = fitted_X[pdonors_row_idx]
+            donors = fitted_X_temp[donor_row_idx_ravel, c].reshape(
+                (-1, self.n_neighbors))
+            donors_mask = _get_mask(donors, self.missing_values)
+            donors = np.ma.array(donors, mask=donors_mask)
+            # Final imputation
+            imputed = np.ma.average(donors, axis=1,
+                                    weights=weight_matrix)
+            X[receivers_row_idx, c] = imputed.data
+        return X
+    def fit(self, X, y=None):
+        """Fit the imputer on X.
+        Parameters
+        ----------
+        X : {array-like}, shape (n_samples, n_features)
+            Input data, where ``n_samples`` is the number of samples and
+            ``n_features`` is the number of features.
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+        # Check data integrity and calling arguments
+        force_all_finite = False if self.missing_values in ["NaN",
+                                                            np.nan] else True
+        if not force_all_finite:
+            if self.metric not in _MASKED_METRICS and not callable(
+                    self.metric):
+                raise ValueError(
+                    "The selected metric does not support NaN values.")
+        X = check_array(X, accept_sparse=False, dtype=np.float64,
+                        force_all_finite=force_all_finite, copy=self.copy)
+        self.weights = _check_weights(self.weights)
+        # Check for +/- inf
+        if np.any(np.isinf(X)):
+            raise ValueError("+/- inf values are not allowed.")
+        # Check if % missing in any column > col_max_missing
+        mask = _get_mask(X, self.missing_values)
+        if np.any(mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)):
+            raise ValueError("Some column(s) have more than {}% missing values"
+                             .format(self.col_max_missing * 100))
+        X_col_means = np.ma.array(X, mask=mask).mean(axis=0).data
+        # Check if % missing in any row > row_max_missing
+        bad_rows = mask.sum(axis=1) > (mask.shape[1] * self.row_max_missing)
+        if np.any(bad_rows):
+            warnings.warn(
+                "There are rows with more than {0}% missing values. These "
+                "rows are not included as donor neighbors."
+                    .format(self.row_max_missing * 100))
+            # Remove rows that have more than row_max_missing % missing
+            X = X[~bad_rows, :]
+        # Check if sufficient neighboring samples available
+        if X.shape[0] < self.n_neighbors:
+            raise ValueError("There are only %d samples, but n_neighbors=%d."
+                             % (X.shape[0], self.n_neighbors))
+        self.fitted_X_ = X
+        self.statistics_ = X_col_means
+        return self
+    def transform(self, X):
+        """Impute all missing values in X.
+        Parameters
+        ----------
+        X : {array-like}, shape = [n_samples, n_features]
+            The input data to complete.
+        Returns
+        -------
+        X : {array-like}, shape = [n_samples, n_features]
+            The imputed dataset.
+        """
+        check_is_fitted(self, ["fitted_X_", "statistics_"])
+        force_all_finite = False if self.missing_values in ["NaN",
+                                                            np.nan] else True
+        X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES,
+                        force_all_finite=force_all_finite, copy=self.copy)
+        # Check for +/- inf
+        if np.any(np.isinf(X)):
+            raise ValueError("+/- inf values are not allowed in data to be "
+                             "transformed.")
+        # Get fitted data and ensure correct dimension
+        n_rows_fit_X, n_cols_fit_X = self.fitted_X_.shape
+        n_rows_X, n_cols_X = X.shape
+        if n_cols_X != n_cols_fit_X:
+            raise ValueError("Incompatible dimension between the fitted "
+                             "dataset and the one to be transformed.")
+        mask = _get_mask(X, self.missing_values)
+        row_total_missing = mask.sum(axis=1)
+        if not np.any(row_total_missing):
+            return X
+        # Check for excessive missingness in rows
+        bad_rows = row_total_missing > (mask.shape[1] * self.row_max_missing)
+        if np.any(bad_rows):
+            warnings.warn(
+                "There are rows with more than {0}% missing values. The "
+                "missing features in these rows are imputed with column means."
+                    .format(self.row_max_missing * 100))
+            X_bad = X[bad_rows, :]
+            X = X[~bad_rows, :]
+            mask = mask[~bad_rows]
+            row_total_missing = mask.sum(axis=1)
+        row_has_missing = row_total_missing.astype(np.bool)
+        if np.any(row_has_missing):
+            # Mask for fitted_X
+            mask_fx = _get_mask(self.fitted_X_, self.missing_values)
+            # Pairwise distances between receivers and fitted samples
+            dist = np.empty((len(X), len(self.fitted_X_)))
+            dist[row_has_missing] = pairwise_distances(
+                X[row_has_missing], self.fitted_X_, metric=self.metric,
+                squared=False, missing_values=self.missing_values)
+            # Find and impute missing
+            X = self._impute(dist, X, self.fitted_X_, mask, mask_fx)
+        # Merge bad rows to X and mean impute their missing values
+        if np.any(bad_rows):
+            bad_missing_index = np.where(_get_mask(X_bad, self.missing_values))
+            X_bad[bad_missing_index] = np.take(self.statistics_,
+                                               bad_missing_index[1])
+            X_merged = np.empty((n_rows_X, n_cols_X))
+            X_merged[bad_rows, :] = X_bad
+            X_merged[~bad_rows, :] = X
+            X = X_merged
+        return X
+    def fit_transform(self, X, y=None, **fit_params):
+        """Fit KNNImputer and impute all missing values in X.
+        Parameters
+        ----------
+        X : {array-like}, shape (n_samples, n_features)
+            Input data, where ``n_samples`` is the number of samples and
+            ``n_features`` is the number of features.
+        Returns
+        -------
+        X : {array-like}, shape (n_samples, n_features)
+            Returns imputed dataset.
+        """
+        return self.fit(X).transform(X)