PyPI - pg-sui - Versions diffs - 0.2.3__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl - Mend

pg-sui 0.2.3__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

{pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info}/METADATA +99 -77
pg_sui-1.6.14.dev9.dist-info/RECORD +81 -0
{pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info}/WHEEL +1 -1
pg_sui-1.6.14.dev9.dist-info/entry_points.txt +4 -0
{pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info/licenses}/LICENSE +0 -0
pg_sui-1.6.14.dev9.dist-info/top_level.txt +1 -0
pgsui/__init__.py +35 -54
pgsui/_version.py +34 -0
pgsui/cli.py +909 -0
pgsui/data_processing/__init__.py +0 -0
pgsui/data_processing/config.py +565 -0
pgsui/data_processing/containers.py +1424 -0
pgsui/data_processing/transformers.py +557 -907
pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
pgsui/electron/app/__main__.py +5 -0
pgsui/electron/app/extra-resources/.gitkeep +1 -0
pgsui/electron/app/icons/icons/1024x1024.png +0 -0
pgsui/electron/app/icons/icons/128x128.png +0 -0
pgsui/electron/app/icons/icons/16x16.png +0 -0
pgsui/electron/app/icons/icons/24x24.png +0 -0
pgsui/electron/app/icons/icons/256x256.png +0 -0
pgsui/electron/app/icons/icons/32x32.png +0 -0
pgsui/electron/app/icons/icons/48x48.png +0 -0
pgsui/electron/app/icons/icons/512x512.png +0 -0
pgsui/electron/app/icons/icons/64x64.png +0 -0
pgsui/electron/app/icons/icons/icon.icns +0 -0
pgsui/electron/app/icons/icons/icon.ico +0 -0
pgsui/electron/app/main.js +227 -0
pgsui/electron/app/package-lock.json +6894 -0
pgsui/electron/app/package.json +51 -0
pgsui/electron/app/preload.js +15 -0
pgsui/electron/app/server.py +157 -0
pgsui/electron/app/ui/logo.png +0 -0
pgsui/electron/app/ui/renderer.js +131 -0
pgsui/electron/app/ui/styles.css +59 -0
pgsui/electron/app/ui/ui_shim.js +72 -0
pgsui/electron/bootstrap.py +43 -0
pgsui/electron/launch.py +57 -0
pgsui/electron/package.json +14 -0
pgsui/example_data/__init__.py +0 -0
pgsui/example_data/phylip_files/__init__.py +0 -0
pgsui/example_data/phylip_files/test.phy +0 -0
pgsui/example_data/popmaps/__init__.py +0 -0
pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
pgsui/example_data/structure_files/__init__.py +0 -0
pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
pgsui/impute/__init__.py +0 -0
pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
pgsui/impute/deterministic/imputers/mode.py +844 -0
pgsui/impute/deterministic/imputers/nmf.py +221 -0
pgsui/impute/deterministic/imputers/phylo.py +973 -0
pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
pgsui/impute/supervised/__init__.py +0 -0
pgsui/impute/supervised/base.py +343 -0
pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
pgsui/impute/supervised/imputers/random_forest.py +291 -0
pgsui/impute/unsupervised/__init__.py +0 -0
pgsui/impute/unsupervised/base.py +1118 -0
pgsui/impute/unsupervised/callbacks.py +92 -262
{simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
pgsui/impute/unsupervised/imputers/autoencoder.py +1285 -0
pgsui/impute/unsupervised/imputers/nlpca.py +1554 -0
pgsui/impute/unsupervised/imputers/ubp.py +1575 -0
pgsui/impute/unsupervised/imputers/vae.py +1228 -0
pgsui/impute/unsupervised/loss_functions.py +261 -0
pgsui/impute/unsupervised/models/__init__.py +0 -0
pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
pgsui/impute/unsupervised/models/vae_model.py +269 -630
pgsui/impute/unsupervised/nn_scorers.py +255 -0
pgsui/utils/__init__.py +0 -0
pgsui/utils/classification_viz.py +608 -0
pgsui/utils/logging_utils.py +22 -0
pgsui/utils/misc.py +35 -480
pgsui/utils/plotting.py +996 -829
pgsui/utils/pretty_metrics.py +290 -0
pgsui/utils/scorers.py +213 -666
pg_sui-0.2.3.dist-info/RECORD +0 -75
pg_sui-0.2.3.dist-info/top_level.txt +0 -3
pgsui/example_data/phylip_files/test_n10.phy +0 -118
pgsui/example_data/phylip_files/test_n100.phy +0 -118
pgsui/example_data/phylip_files/test_n2.phy +0 -118
pgsui/example_data/phylip_files/test_n500.phy +0 -118
pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
pgsui/example_data/trees/test.iqtree +0 -376
pgsui/example_data/trees/test.qmat +0 -5
pgsui/example_data/trees/test.rate +0 -2033
pgsui/example_data/trees/test.tre +0 -1
pgsui/example_data/trees/test_n10.rate +0 -19
pgsui/example_data/trees/test_n100.rate +0 -109
pgsui/example_data/trees/test_n500.rate +0 -509
pgsui/example_data/trees/test_siterates.txt +0 -2024
pgsui/example_data/trees/test_siterates_n10.txt +0 -10
pgsui/example_data/trees/test_siterates_n100.txt +0 -100
pgsui/example_data/trees/test_siterates_n500.txt +0 -500
pgsui/example_data/vcf_files/test.vcf +0 -244
pgsui/example_data/vcf_files/test.vcf.gz +0 -0
pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
pgsui/impute/estimators.py +0 -1268
pgsui/impute/impute.py +0 -1463
pgsui/impute/simple_imputers.py +0 -1431
pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
pgsui/impute/unsupervised/keras_classifiers.py +0 -697
pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
pgsui/pg_sui.py +0 -261
pgsui/utils/sequence_tools.py +0 -407
simulation/sim_benchmarks.py +0 -333
simulation/sim_treeparams.py +0 -475
test/__init__.py +0 -0
test/pg_sui_simtest.py +0 -215
test/pg_sui_testing.py +0 -523
test/test.py +0 -151
test/test_pgsui.py +0 -374
test/test_tkc.py +0 -185

pgsui/impute/deterministic/imputers/nmf.py ADDED Viewed

@@ -0,0 +1,221 @@
+from pathlib import Path
+from typing import Dict, List
+# Third-party imports
+import numpy as np
+import pandas as pd
+class ImputeNMF:
+    """Impute missing data using matrix factorization. If ``by_populations=False`` then imputation is by global allele frequency. If ``by_populations=True`` then imputation is by population-wise allele frequency.
+    Args:
+        genotype_data (GenotypeData object or None, optional): GenotypeData instance.
+        latent_features (float, optional): The number of latent variables used to reduce dimensionality of the data. Defaults to 2.
+        learning_rate (float, optional): The learning rate for the optimizers. Adjust if the loss is learning too slowly. Defaults to 0.1.
+        tol (float, optional): Tolerance of the stopping condition. Defaults to 1e-3.
+        missing (int, optional): Missing data value. Defaults to -9.
+        prefix (str, optional): Prefix for writing output files. Defaults to "output".
+        verbose (bool, optional): Whether to print status updates. Set to False for no status updates. Defaults to True.
+        **kwargs (Dict[str, bool | List[List[int]] | None | float | int | str]): Additional keyword arguments to supply. Primarily for internal purposes. Options include: {"iterative_mode": bool, "validation_mode": bool, "gt": List[List[int]]}. "iterative_mode" determines whether ``ImputeAlleleFreq`` is being used as the initial imputer in ``IterativeImputer``. "gt" is used internally for the simple imputers during grid searches and validation. If ``genotype_data is None`` then ``gt`` cannot also be None, and vice versa. Only one of ``gt`` or ``genotype_data`` can be set.
+    Attributes:
+        imputed (GenotypeData): New GenotypeData instance with imputed data.
+    Example:
+        >>>data = GenotypeData(
+        >>>    filename="test.str",
+        >>>    filetype="structure",
+        >>>    popmapfile="test.popmap",
+        >>>)
+        >>>
+        >>>nmf = ImputeMF(
+        >>>    genotype_data=data,
+        >>>    by_populations=True,
+        >>>)
+        >>>
+        >>> # Get GenotypeData instance.
+        >>>gd_nmf = nmf.imputed
+    Raises:
+        TypeError: genotype_data cannot be NoneType.
+    """
+    def __init__(
+        self,
+        genotype_data,
+        *,
+        latent_features: int = 2,
+        max_iter: int = 100,
+        learning_rate: float = 0.0002,
+        regularization_param: float = 0.02,
+        tol: float = 0.1,
+        n_fail: int = 20,
+        missing: int = -9,
+        prefix: str = "imputer",
+        verbose: bool = True,
+        **kwargs: Dict[str, bool | List[List[int]] | None | float | int | str],
+    ) -> None:
+        self.max_iter = max_iter
+        self.latent_features = latent_features
+        self.n_fail = n_fail
+        self.learning_rate = learning_rate
+        self.tol = tol
+        self.regularization_param = regularization_param
+        self.missing = missing
+        self.prefix = prefix
+        self.verbose = verbose
+        self.iterative_mode = kwargs.get("iterative_mode", False)
+        self.validation_mode = kwargs.get("validation_mode", False)
+        gt = kwargs.get("gt", None)
+        if genotype_data is None and gt is None:
+            raise TypeError("GenotypeData and gt cannot both be NoneType.")
+        if gt is None:
+            X = genotype_data.genotypes_012(fmt="numpy")
+        else:
+            X = gt.copy()
+        imputed012 = pd.DataFrame(self.fit_predict(X))
+        genotype_data = genotype_data.copy()
+        genotype_data.snp_data = genotype_data.decode_012(
+            imputed012, prefix=prefix, write_output=False
+        )
+        if self.validation_mode:
+            self.imputed = imputed012.to_numpy()
+        else:
+            self.imputed = genotype_data
+    @property
+    def genotypes_012(self):
+        return self.imputed.genotypes012
+    @property
+    def snp_data(self):
+        return self.imputed.snp_data
+    @property
+    def alignment(self):
+        return self.imputed.alignment
+    def fit_predict(self, X):
+        # imputation
+        if self.verbose:
+            print(f"Doing MF imputation...")
+        R = X
+        R = R.astype(int)
+        R[R == self.missing] = -9
+        R = R + 1
+        R[R < 0] = 0
+        n_row = len(R)
+        n_col = len(R[0])
+        p = np.random.rand(n_row, self.latent_features)
+        q = np.random.rand(n_col, self.latent_features)
+        q_t = q.T
+        fails = 0
+        e_current = None
+        for step in range(self.max_iter):
+            for i in range(n_row):
+                for j in range(n_col):
+                    if R[i][j] > 0:
+                        eij = R[i][j] - np.dot(p[i, :], q_t[:, j])
+                        for k in range(self.latent_features):
+                            p[i][k] = p[i][k] + self.learning_rate * (
+                                2 * eij * q_t[k][j]
+                                - self.regularization_param * p[i][k]
+                            )
+                            q_t[k][j] = q_t[k][j] + self.learning_rate * (
+                                2 * eij * p[i][k]
+                                - self.regularization_param * q_t[k][j]
+                            )
+            e = 0
+            for i in range(n_row):
+                for j in range(len(R[i])):
+                    if R[i][j] > 0:
+                        e = e + pow(R[i][j] - np.dot(p[i, :], q_t[:, j]), 2)
+                        for k in range(self.latent_features):
+                            e = e + (self.regularization_param / 2) * (
+                                pow(p[i][k], 2) + pow(q_t[k][j], 2)
+                            )
+            if e_current is None:
+                e_current = e
+            else:
+                if abs(e_current - e) < self.tol:
+                    fails += 1
+                else:
+                    fails = 0
+                e_current = e
+            if fails >= self.n_fail:
+                break
+        nR = np.dot(p, q_t)
+        # transform values per-column (i.e., only allowing values found in original)
+        tR = self.transform(R, nR)
+        # get accuracy of re-constructing non-missing genotypes
+        accuracy = self.accuracy(X, tR)
+        # insert imputed values for missing genotypes
+        fR = X
+        fR[X < 0] = tR[X < 0]
+        if self.verbose:
+            print("Done!")
+        return fR
+    def transform(self, original, predicted):
+        n_row = len(original)
+        n_col = len(original[0])
+        tR = predicted
+        for j in range(n_col):
+            observed = predicted[:, j]
+            expected = original[:, j]
+            options = np.unique(expected[expected != 0])
+            for i in range(n_row):
+                transform = min(options, key=lambda x: abs(x - predicted[i, j]))
+                tR[i, j] = transform
+        tR = tR - 1
+        tR[tR < 0] = -9
+        return tR
+    def accuracy(self, expected, predicted):
+        prop_same = np.sum(expected[expected >= 0] == predicted[expected >= 0])
+        tot = expected[expected >= 0].size
+        accuracy = prop_same / tot
+        return accuracy
+    def write2file(
+        self, X: pd.DataFrame | np.ndarray | List[List[int | float]]
+    ) -> None:
+        """Write imputed data to file on disk.
+        Args:
+            X (pandas.DataFrame | numpy.ndarray | List[List[int | float]]): Imputed data to write to file.
+        Raises:
+            TypeError: If X is of unsupported type.
+        """
+        outfile = Path(
+            f"{self.prefix}_output",
+            "alignments",
+            "Deterministic",
+            "ImputeMF",
+        )
+        Path(outfile).mkdir(parents=True, exist_ok=True)
+        outfile = Path(outfile) / "imputed_012.csv"
+        if isinstance(X, pd.DataFrame):
+            df = X
+        elif isinstance(X, (np.ndarray, list)):
+            df = pd.DataFrame(X)
+        else:
+            raise TypeError(
+                f"Could not write imputed data because it is of incorrect "
+                f"type. Got {type(X)}"
+            )
+        df.to_csv(outfile, header=False, index=False)

pg-sui 0.2.3__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl

pg-sui 0.2.3__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl