pg-sui 0.2.3__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info}/METADATA +99 -77
- pg_sui-1.6.14.dev9.dist-info/RECORD +81 -0
- {pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info}/WHEEL +1 -1
- pg_sui-1.6.14.dev9.dist-info/entry_points.txt +4 -0
- {pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info/licenses}/LICENSE +0 -0
- pg_sui-1.6.14.dev9.dist-info/top_level.txt +1 -0
- pgsui/__init__.py +35 -54
- pgsui/_version.py +34 -0
- pgsui/cli.py +909 -0
- pgsui/data_processing/__init__.py +0 -0
- pgsui/data_processing/config.py +565 -0
- pgsui/data_processing/containers.py +1424 -0
- pgsui/data_processing/transformers.py +557 -907
- pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
- pgsui/electron/app/__main__.py +5 -0
- pgsui/electron/app/extra-resources/.gitkeep +1 -0
- pgsui/electron/app/icons/icons/1024x1024.png +0 -0
- pgsui/electron/app/icons/icons/128x128.png +0 -0
- pgsui/electron/app/icons/icons/16x16.png +0 -0
- pgsui/electron/app/icons/icons/24x24.png +0 -0
- pgsui/electron/app/icons/icons/256x256.png +0 -0
- pgsui/electron/app/icons/icons/32x32.png +0 -0
- pgsui/electron/app/icons/icons/48x48.png +0 -0
- pgsui/electron/app/icons/icons/512x512.png +0 -0
- pgsui/electron/app/icons/icons/64x64.png +0 -0
- pgsui/electron/app/icons/icons/icon.icns +0 -0
- pgsui/electron/app/icons/icons/icon.ico +0 -0
- pgsui/electron/app/main.js +227 -0
- pgsui/electron/app/package-lock.json +6894 -0
- pgsui/electron/app/package.json +51 -0
- pgsui/electron/app/preload.js +15 -0
- pgsui/electron/app/server.py +157 -0
- pgsui/electron/app/ui/logo.png +0 -0
- pgsui/electron/app/ui/renderer.js +131 -0
- pgsui/electron/app/ui/styles.css +59 -0
- pgsui/electron/app/ui/ui_shim.js +72 -0
- pgsui/electron/bootstrap.py +43 -0
- pgsui/electron/launch.py +57 -0
- pgsui/electron/package.json +14 -0
- pgsui/example_data/__init__.py +0 -0
- pgsui/example_data/phylip_files/__init__.py +0 -0
- pgsui/example_data/phylip_files/test.phy +0 -0
- pgsui/example_data/popmaps/__init__.py +0 -0
- pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
- pgsui/example_data/structure_files/__init__.py +0 -0
- pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
- pgsui/impute/__init__.py +0 -0
- pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
- pgsui/impute/deterministic/imputers/mode.py +844 -0
- pgsui/impute/deterministic/imputers/nmf.py +221 -0
- pgsui/impute/deterministic/imputers/phylo.py +973 -0
- pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
- pgsui/impute/supervised/__init__.py +0 -0
- pgsui/impute/supervised/base.py +343 -0
- pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
- pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
- pgsui/impute/supervised/imputers/random_forest.py +291 -0
- pgsui/impute/unsupervised/__init__.py +0 -0
- pgsui/impute/unsupervised/base.py +1118 -0
- pgsui/impute/unsupervised/callbacks.py +92 -262
- {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
- pgsui/impute/unsupervised/imputers/autoencoder.py +1285 -0
- pgsui/impute/unsupervised/imputers/nlpca.py +1554 -0
- pgsui/impute/unsupervised/imputers/ubp.py +1575 -0
- pgsui/impute/unsupervised/imputers/vae.py +1228 -0
- pgsui/impute/unsupervised/loss_functions.py +261 -0
- pgsui/impute/unsupervised/models/__init__.py +0 -0
- pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
- pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
- pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
- pgsui/impute/unsupervised/models/vae_model.py +269 -630
- pgsui/impute/unsupervised/nn_scorers.py +255 -0
- pgsui/utils/__init__.py +0 -0
- pgsui/utils/classification_viz.py +608 -0
- pgsui/utils/logging_utils.py +22 -0
- pgsui/utils/misc.py +35 -480
- pgsui/utils/plotting.py +996 -829
- pgsui/utils/pretty_metrics.py +290 -0
- pgsui/utils/scorers.py +213 -666
- pg_sui-0.2.3.dist-info/RECORD +0 -75
- pg_sui-0.2.3.dist-info/top_level.txt +0 -3
- pgsui/example_data/phylip_files/test_n10.phy +0 -118
- pgsui/example_data/phylip_files/test_n100.phy +0 -118
- pgsui/example_data/phylip_files/test_n2.phy +0 -118
- pgsui/example_data/phylip_files/test_n500.phy +0 -118
- pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
- pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
- pgsui/example_data/trees/test.iqtree +0 -376
- pgsui/example_data/trees/test.qmat +0 -5
- pgsui/example_data/trees/test.rate +0 -2033
- pgsui/example_data/trees/test.tre +0 -1
- pgsui/example_data/trees/test_n10.rate +0 -19
- pgsui/example_data/trees/test_n100.rate +0 -109
- pgsui/example_data/trees/test_n500.rate +0 -509
- pgsui/example_data/trees/test_siterates.txt +0 -2024
- pgsui/example_data/trees/test_siterates_n10.txt +0 -10
- pgsui/example_data/trees/test_siterates_n100.txt +0 -100
- pgsui/example_data/trees/test_siterates_n500.txt +0 -500
- pgsui/example_data/vcf_files/test.vcf +0 -244
- pgsui/example_data/vcf_files/test.vcf.gz +0 -0
- pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
- pgsui/impute/estimators.py +0 -1268
- pgsui/impute/impute.py +0 -1463
- pgsui/impute/simple_imputers.py +0 -1431
- pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
- pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
- pgsui/impute/unsupervised/keras_classifiers.py +0 -697
- pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
- pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
- pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
- pgsui/pg_sui.py +0 -261
- pgsui/utils/sequence_tools.py +0 -407
- simulation/sim_benchmarks.py +0 -333
- simulation/sim_treeparams.py +0 -475
- test/__init__.py +0 -0
- test/pg_sui_simtest.py +0 -215
- test/pg_sui_testing.py +0 -523
- test/test.py +0 -151
- test/test_pgsui.py +0 -374
- test/test_tkc.py +0 -185
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Dict, List
|
|
3
|
+
|
|
4
|
+
# Third-party imports
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ImputeNMF:
|
|
10
|
+
"""Impute missing data using matrix factorization. If ``by_populations=False`` then imputation is by global allele frequency. If ``by_populations=True`` then imputation is by population-wise allele frequency.
|
|
11
|
+
|
|
12
|
+
Args:
|
|
13
|
+
genotype_data (GenotypeData object or None, optional): GenotypeData instance.
|
|
14
|
+
latent_features (float, optional): The number of latent variables used to reduce dimensionality of the data. Defaults to 2.
|
|
15
|
+
learning_rate (float, optional): The learning rate for the optimizers. Adjust if the loss is learning too slowly. Defaults to 0.1.
|
|
16
|
+
tol (float, optional): Tolerance of the stopping condition. Defaults to 1e-3.
|
|
17
|
+
missing (int, optional): Missing data value. Defaults to -9.
|
|
18
|
+
prefix (str, optional): Prefix for writing output files. Defaults to "output".
|
|
19
|
+
verbose (bool, optional): Whether to print status updates. Set to False for no status updates. Defaults to True.
|
|
20
|
+
**kwargs (Dict[str, bool | List[List[int]] | None | float | int | str]): Additional keyword arguments to supply. Primarily for internal purposes. Options include: {"iterative_mode": bool, "validation_mode": bool, "gt": List[List[int]]}. "iterative_mode" determines whether ``ImputeAlleleFreq`` is being used as the initial imputer in ``IterativeImputer``. "gt" is used internally for the simple imputers during grid searches and validation. If ``genotype_data is None`` then ``gt`` cannot also be None, and vice versa. Only one of ``gt`` or ``genotype_data`` can be set.
|
|
21
|
+
|
|
22
|
+
Attributes:
|
|
23
|
+
imputed (GenotypeData): New GenotypeData instance with imputed data.
|
|
24
|
+
|
|
25
|
+
Example:
|
|
26
|
+
>>>data = GenotypeData(
|
|
27
|
+
>>> filename="test.str",
|
|
28
|
+
>>> filetype="structure",
|
|
29
|
+
>>> popmapfile="test.popmap",
|
|
30
|
+
>>>)
|
|
31
|
+
>>>
|
|
32
|
+
>>>nmf = ImputeMF(
|
|
33
|
+
>>> genotype_data=data,
|
|
34
|
+
>>> by_populations=True,
|
|
35
|
+
>>>)
|
|
36
|
+
>>>
|
|
37
|
+
>>> # Get GenotypeData instance.
|
|
38
|
+
>>>gd_nmf = nmf.imputed
|
|
39
|
+
|
|
40
|
+
Raises:
|
|
41
|
+
TypeError: genotype_data cannot be NoneType.
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
def __init__(
|
|
45
|
+
self,
|
|
46
|
+
genotype_data,
|
|
47
|
+
*,
|
|
48
|
+
latent_features: int = 2,
|
|
49
|
+
max_iter: int = 100,
|
|
50
|
+
learning_rate: float = 0.0002,
|
|
51
|
+
regularization_param: float = 0.02,
|
|
52
|
+
tol: float = 0.1,
|
|
53
|
+
n_fail: int = 20,
|
|
54
|
+
missing: int = -9,
|
|
55
|
+
prefix: str = "imputer",
|
|
56
|
+
verbose: bool = True,
|
|
57
|
+
**kwargs: Dict[str, bool | List[List[int]] | None | float | int | str],
|
|
58
|
+
) -> None:
|
|
59
|
+
self.max_iter = max_iter
|
|
60
|
+
self.latent_features = latent_features
|
|
61
|
+
self.n_fail = n_fail
|
|
62
|
+
self.learning_rate = learning_rate
|
|
63
|
+
self.tol = tol
|
|
64
|
+
self.regularization_param = regularization_param
|
|
65
|
+
self.missing = missing
|
|
66
|
+
self.prefix = prefix
|
|
67
|
+
self.verbose = verbose
|
|
68
|
+
self.iterative_mode = kwargs.get("iterative_mode", False)
|
|
69
|
+
self.validation_mode = kwargs.get("validation_mode", False)
|
|
70
|
+
|
|
71
|
+
gt = kwargs.get("gt", None)
|
|
72
|
+
|
|
73
|
+
if genotype_data is None and gt is None:
|
|
74
|
+
raise TypeError("GenotypeData and gt cannot both be NoneType.")
|
|
75
|
+
|
|
76
|
+
if gt is None:
|
|
77
|
+
X = genotype_data.genotypes_012(fmt="numpy")
|
|
78
|
+
else:
|
|
79
|
+
X = gt.copy()
|
|
80
|
+
imputed012 = pd.DataFrame(self.fit_predict(X))
|
|
81
|
+
genotype_data = genotype_data.copy()
|
|
82
|
+
genotype_data.snp_data = genotype_data.decode_012(
|
|
83
|
+
imputed012, prefix=prefix, write_output=False
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
if self.validation_mode:
|
|
87
|
+
self.imputed = imputed012.to_numpy()
|
|
88
|
+
else:
|
|
89
|
+
self.imputed = genotype_data
|
|
90
|
+
|
|
91
|
+
@property
|
|
92
|
+
def genotypes_012(self):
|
|
93
|
+
return self.imputed.genotypes012
|
|
94
|
+
|
|
95
|
+
@property
|
|
96
|
+
def snp_data(self):
|
|
97
|
+
return self.imputed.snp_data
|
|
98
|
+
|
|
99
|
+
@property
|
|
100
|
+
def alignment(self):
|
|
101
|
+
return self.imputed.alignment
|
|
102
|
+
|
|
103
|
+
def fit_predict(self, X):
|
|
104
|
+
# imputation
|
|
105
|
+
if self.verbose:
|
|
106
|
+
print(f"Doing MF imputation...")
|
|
107
|
+
R = X
|
|
108
|
+
R = R.astype(int)
|
|
109
|
+
R[R == self.missing] = -9
|
|
110
|
+
R = R + 1
|
|
111
|
+
R[R < 0] = 0
|
|
112
|
+
n_row = len(R)
|
|
113
|
+
n_col = len(R[0])
|
|
114
|
+
p = np.random.rand(n_row, self.latent_features)
|
|
115
|
+
q = np.random.rand(n_col, self.latent_features)
|
|
116
|
+
q_t = q.T
|
|
117
|
+
fails = 0
|
|
118
|
+
e_current = None
|
|
119
|
+
for step in range(self.max_iter):
|
|
120
|
+
for i in range(n_row):
|
|
121
|
+
for j in range(n_col):
|
|
122
|
+
if R[i][j] > 0:
|
|
123
|
+
eij = R[i][j] - np.dot(p[i, :], q_t[:, j])
|
|
124
|
+
for k in range(self.latent_features):
|
|
125
|
+
p[i][k] = p[i][k] + self.learning_rate * (
|
|
126
|
+
2 * eij * q_t[k][j]
|
|
127
|
+
- self.regularization_param * p[i][k]
|
|
128
|
+
)
|
|
129
|
+
q_t[k][j] = q_t[k][j] + self.learning_rate * (
|
|
130
|
+
2 * eij * p[i][k]
|
|
131
|
+
- self.regularization_param * q_t[k][j]
|
|
132
|
+
)
|
|
133
|
+
e = 0
|
|
134
|
+
for i in range(n_row):
|
|
135
|
+
for j in range(len(R[i])):
|
|
136
|
+
if R[i][j] > 0:
|
|
137
|
+
e = e + pow(R[i][j] - np.dot(p[i, :], q_t[:, j]), 2)
|
|
138
|
+
for k in range(self.latent_features):
|
|
139
|
+
e = e + (self.regularization_param / 2) * (
|
|
140
|
+
pow(p[i][k], 2) + pow(q_t[k][j], 2)
|
|
141
|
+
)
|
|
142
|
+
if e_current is None:
|
|
143
|
+
e_current = e
|
|
144
|
+
else:
|
|
145
|
+
if abs(e_current - e) < self.tol:
|
|
146
|
+
fails += 1
|
|
147
|
+
else:
|
|
148
|
+
fails = 0
|
|
149
|
+
e_current = e
|
|
150
|
+
if fails >= self.n_fail:
|
|
151
|
+
break
|
|
152
|
+
nR = np.dot(p, q_t)
|
|
153
|
+
|
|
154
|
+
# transform values per-column (i.e., only allowing values found in original)
|
|
155
|
+
tR = self.transform(R, nR)
|
|
156
|
+
|
|
157
|
+
# get accuracy of re-constructing non-missing genotypes
|
|
158
|
+
accuracy = self.accuracy(X, tR)
|
|
159
|
+
|
|
160
|
+
# insert imputed values for missing genotypes
|
|
161
|
+
fR = X
|
|
162
|
+
fR[X < 0] = tR[X < 0]
|
|
163
|
+
|
|
164
|
+
if self.verbose:
|
|
165
|
+
print("Done!")
|
|
166
|
+
|
|
167
|
+
return fR
|
|
168
|
+
|
|
169
|
+
def transform(self, original, predicted):
|
|
170
|
+
n_row = len(original)
|
|
171
|
+
n_col = len(original[0])
|
|
172
|
+
tR = predicted
|
|
173
|
+
for j in range(n_col):
|
|
174
|
+
observed = predicted[:, j]
|
|
175
|
+
expected = original[:, j]
|
|
176
|
+
options = np.unique(expected[expected != 0])
|
|
177
|
+
for i in range(n_row):
|
|
178
|
+
transform = min(options, key=lambda x: abs(x - predicted[i, j]))
|
|
179
|
+
tR[i, j] = transform
|
|
180
|
+
tR = tR - 1
|
|
181
|
+
tR[tR < 0] = -9
|
|
182
|
+
return tR
|
|
183
|
+
|
|
184
|
+
def accuracy(self, expected, predicted):
|
|
185
|
+
prop_same = np.sum(expected[expected >= 0] == predicted[expected >= 0])
|
|
186
|
+
tot = expected[expected >= 0].size
|
|
187
|
+
accuracy = prop_same / tot
|
|
188
|
+
return accuracy
|
|
189
|
+
|
|
190
|
+
def write2file(
|
|
191
|
+
self, X: pd.DataFrame | np.ndarray | List[List[int | float]]
|
|
192
|
+
) -> None:
|
|
193
|
+
"""Write imputed data to file on disk.
|
|
194
|
+
|
|
195
|
+
Args:
|
|
196
|
+
X (pandas.DataFrame | numpy.ndarray | List[List[int | float]]): Imputed data to write to file.
|
|
197
|
+
|
|
198
|
+
Raises:
|
|
199
|
+
TypeError: If X is of unsupported type.
|
|
200
|
+
"""
|
|
201
|
+
outfile = Path(
|
|
202
|
+
f"{self.prefix}_output",
|
|
203
|
+
"alignments",
|
|
204
|
+
"Deterministic",
|
|
205
|
+
"ImputeMF",
|
|
206
|
+
)
|
|
207
|
+
|
|
208
|
+
Path(outfile).mkdir(parents=True, exist_ok=True)
|
|
209
|
+
outfile = Path(outfile) / "imputed_012.csv"
|
|
210
|
+
|
|
211
|
+
if isinstance(X, pd.DataFrame):
|
|
212
|
+
df = X
|
|
213
|
+
elif isinstance(X, (np.ndarray, list)):
|
|
214
|
+
df = pd.DataFrame(X)
|
|
215
|
+
else:
|
|
216
|
+
raise TypeError(
|
|
217
|
+
f"Could not write imputed data because it is of incorrect "
|
|
218
|
+
f"type. Got {type(X)}"
|
|
219
|
+
)
|
|
220
|
+
|
|
221
|
+
df.to_csv(outfile, header=False, index=False)
|