pg-sui 1.0.2.1__py3-none-any.whl → 1.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pg-sui might be problematic. Click here for more details.
- {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/METADATA +51 -70
- pg_sui-1.6.8.dist-info/RECORD +78 -0
- {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/WHEEL +1 -1
- pg_sui-1.6.8.dist-info/entry_points.txt +4 -0
- pg_sui-1.6.8.dist-info/top_level.txt +1 -0
- pgsui/__init__.py +35 -54
- pgsui/_version.py +34 -0
- pgsui/cli.py +635 -0
- pgsui/data_processing/config.py +576 -0
- pgsui/data_processing/containers.py +1782 -0
- pgsui/data_processing/transformers.py +121 -1103
- pgsui/electron/app/__main__.py +5 -0
- pgsui/electron/app/icons/icons/1024x1024.png +0 -0
- pgsui/electron/app/icons/icons/128x128.png +0 -0
- pgsui/electron/app/icons/icons/16x16.png +0 -0
- pgsui/electron/app/icons/icons/24x24.png +0 -0
- pgsui/electron/app/icons/icons/256x256.png +0 -0
- pgsui/electron/app/icons/icons/32x32.png +0 -0
- pgsui/electron/app/icons/icons/48x48.png +0 -0
- pgsui/electron/app/icons/icons/512x512.png +0 -0
- pgsui/electron/app/icons/icons/64x64.png +0 -0
- pgsui/electron/app/icons/icons/icon.icns +0 -0
- pgsui/electron/app/icons/icons/icon.ico +0 -0
- pgsui/electron/app/main.js +189 -0
- pgsui/electron/app/package-lock.json +6893 -0
- pgsui/electron/app/package.json +50 -0
- pgsui/electron/app/preload.js +15 -0
- pgsui/electron/app/server.py +146 -0
- pgsui/electron/app/ui/logo.png +0 -0
- pgsui/electron/app/ui/renderer.js +130 -0
- pgsui/electron/app/ui/styles.css +59 -0
- pgsui/electron/app/ui/ui_shim.js +72 -0
- pgsui/electron/bootstrap.py +43 -0
- pgsui/electron/launch.py +59 -0
- pgsui/electron/package.json +14 -0
- pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
- pgsui/impute/deterministic/imputers/allele_freq.py +691 -0
- pgsui/impute/deterministic/imputers/mode.py +679 -0
- pgsui/impute/deterministic/imputers/nmf.py +221 -0
- pgsui/impute/deterministic/imputers/phylo.py +971 -0
- pgsui/impute/deterministic/imputers/ref_allele.py +530 -0
- pgsui/impute/supervised/base.py +339 -0
- pgsui/impute/supervised/imputers/hist_gradient_boosting.py +293 -0
- pgsui/impute/supervised/imputers/random_forest.py +287 -0
- pgsui/impute/unsupervised/base.py +924 -0
- pgsui/impute/unsupervised/callbacks.py +89 -263
- pgsui/impute/unsupervised/imputers/autoencoder.py +972 -0
- pgsui/impute/unsupervised/imputers/nlpca.py +1264 -0
- pgsui/impute/unsupervised/imputers/ubp.py +1288 -0
- pgsui/impute/unsupervised/imputers/vae.py +957 -0
- pgsui/impute/unsupervised/loss_functions.py +158 -0
- pgsui/impute/unsupervised/models/autoencoder_model.py +208 -558
- pgsui/impute/unsupervised/models/nlpca_model.py +149 -468
- pgsui/impute/unsupervised/models/ubp_model.py +198 -1317
- pgsui/impute/unsupervised/models/vae_model.py +259 -618
- pgsui/impute/unsupervised/nn_scorers.py +215 -0
- pgsui/utils/classification_viz.py +591 -0
- pgsui/utils/misc.py +35 -480
- pgsui/utils/plotting.py +514 -824
- pgsui/utils/scorers.py +212 -438
- pg_sui-1.0.2.1.dist-info/RECORD +0 -75
- pg_sui-1.0.2.1.dist-info/top_level.txt +0 -3
- pgsui/example_data/phylip_files/test_n10.phy +0 -118
- pgsui/example_data/phylip_files/test_n100.phy +0 -118
- pgsui/example_data/phylip_files/test_n2.phy +0 -118
- pgsui/example_data/phylip_files/test_n500.phy +0 -118
- pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
- pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
- pgsui/example_data/trees/test.iqtree +0 -376
- pgsui/example_data/trees/test.qmat +0 -5
- pgsui/example_data/trees/test.rate +0 -2033
- pgsui/example_data/trees/test.tre +0 -1
- pgsui/example_data/trees/test_n10.rate +0 -19
- pgsui/example_data/trees/test_n100.rate +0 -109
- pgsui/example_data/trees/test_n500.rate +0 -509
- pgsui/example_data/trees/test_siterates.txt +0 -2024
- pgsui/example_data/trees/test_siterates_n10.txt +0 -10
- pgsui/example_data/trees/test_siterates_n100.txt +0 -100
- pgsui/example_data/trees/test_siterates_n500.txt +0 -500
- pgsui/example_data/vcf_files/test.vcf +0 -244
- pgsui/example_data/vcf_files/test.vcf.gz +0 -0
- pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
- pgsui/impute/estimators.py +0 -735
- pgsui/impute/impute.py +0 -1486
- pgsui/impute/simple_imputers.py +0 -1439
- pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -785
- pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1027
- pgsui/impute/unsupervised/keras_classifiers.py +0 -702
- pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
- pgsui/impute/unsupervised/neural_network_imputers.py +0 -1424
- pgsui/impute/unsupervised/neural_network_methods.py +0 -1549
- pgsui/pg_sui.py +0 -261
- pgsui/utils/sequence_tools.py +0 -407
- simulation/sim_benchmarks.py +0 -333
- simulation/sim_treeparams.py +0 -475
- test/__init__.py +0 -0
- test/pg_sui_simtest.py +0 -215
- test/pg_sui_testing.py +0 -523
- test/test.py +0 -297
- test/test_pgsui.py +0 -374
- test/test_tkc.py +0 -214
- {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info/licenses}/LICENSE +0 -0
- /pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
- /pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
- {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
pgsui/impute/simple_imputers.py
DELETED
|
@@ -1,1439 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import sys
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
import warnings
|
|
5
|
-
from typing import Optional, Union, List, Dict, Tuple, Any, Callable
|
|
6
|
-
|
|
7
|
-
# Third-party imports
|
|
8
|
-
import numpy as np
|
|
9
|
-
import pandas as pd
|
|
10
|
-
import scipy.linalg
|
|
11
|
-
import toyplot.pdf
|
|
12
|
-
import toyplot as tp
|
|
13
|
-
import toytree as tt
|
|
14
|
-
from decimal import Decimal
|
|
15
|
-
|
|
16
|
-
from sklearn.impute import SimpleImputer
|
|
17
|
-
|
|
18
|
-
# Custom imports
|
|
19
|
-
try:
|
|
20
|
-
from snpio import GenotypeData
|
|
21
|
-
from ..utils.misc import isnotebook
|
|
22
|
-
except (ModuleNotFoundError, ValueError, ImportError):
|
|
23
|
-
from snpio import GenotypeData
|
|
24
|
-
from utils.misc import isnotebook
|
|
25
|
-
|
|
26
|
-
is_notebook = isnotebook()
|
|
27
|
-
|
|
28
|
-
if is_notebook:
|
|
29
|
-
from tqdm.notebook import tqdm as progressbar
|
|
30
|
-
else:
|
|
31
|
-
from tqdm import tqdm as progressbar
|
|
32
|
-
|
|
33
|
-
# Pandas on pip gives a performance warning when doing the below code.
|
|
34
|
-
# Apparently it's a bug that exists in the pandas version I used here.
|
|
35
|
-
# It can be safely ignored.
|
|
36
|
-
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
class ImputePhylo:
|
|
40
|
-
"""Impute missing data using a phylogenetic tree to inform the imputation.
|
|
41
|
-
|
|
42
|
-
Args:
|
|
43
|
-
genotype_data (GenotypeData instance): GenotypeData instance. Must have the q, tree, and optionally site_rates attributes defined.
|
|
44
|
-
|
|
45
|
-
minbr (float or None, optional): Minimum branch length. Defaults to 0.0000000001
|
|
46
|
-
|
|
47
|
-
str_encodings (Dict[str, int], optional): Integer encodings used in STRUCTURE-formatted file. Should be a dictionary with keys=nucleotides and values=integer encodings. The missing data encoding should also be included. Argument is ignored if using a PHYLIP-formatted file. Defaults to {"A": 1, "C": 2, "G": 3, "T": 4, "N": -9}
|
|
48
|
-
|
|
49
|
-
prefix (str, optional): Prefix to use with output files. Defaults to "imputer".
|
|
50
|
-
|
|
51
|
-
save_plots (bool, optional): Whether to save PDF files with genotype imputations for each site to disk. It makes one PDF file per locus, so if you have a lot of loci it will make a lot of PDF files. Defaults to False.
|
|
52
|
-
|
|
53
|
-
disable_progressbar (bool, optional): Whether to disable the progress bar during the imputation. Defaults to False.
|
|
54
|
-
|
|
55
|
-
kwargs (Dict[str, Any] or None, optional): Additional keyword arguments intended for internal purposes only. Possible arguments: {"column_subset": List[int] or numpy.ndarray[int]}; Subset SNPs by a list of indices for IterativeImputer. Defaults to None.
|
|
56
|
-
|
|
57
|
-
Attributes:
|
|
58
|
-
imputed (GenotypeData): New GenotypeData instance with imputed data.
|
|
59
|
-
|
|
60
|
-
Example:
|
|
61
|
-
>>>data = GenotypeData(
|
|
62
|
-
>>> filename="test.str",
|
|
63
|
-
>>> filetype="structure",
|
|
64
|
-
>>> popmapfile="test.popmap",
|
|
65
|
-
>>> guidetree="test.tre",
|
|
66
|
-
>>> qmatrix_iqtree="test.iqtree",
|
|
67
|
-
>>> siterates_iqtree="test.rates",
|
|
68
|
-
>>>)
|
|
69
|
-
>>>
|
|
70
|
-
>>>phylo = ImputePhylo(
|
|
71
|
-
>>> genotype_data=data,
|
|
72
|
-
>>> save_plots=True,
|
|
73
|
-
>>>)
|
|
74
|
-
>>> # Get GenotypeData object.
|
|
75
|
-
>>>gd_phylo = phylo.imputed
|
|
76
|
-
"""
|
|
77
|
-
|
|
78
|
-
def __init__(
    self,
    genotype_data: Optional[Any],
    minbr: Optional[float] = 0.0000000001,
    *,
    str_encodings: Dict[str, int] = {
        "A": 1,
        "C": 2,
        "G": 3,
        "T": 4,
        "N": -9,
    },
    prefix: str = "imputer",
    save_plots: bool = False,
    disable_progressbar: bool = False,
    **kwargs: Optional[Dict[str, Any]],
) -> None:
    """Store the settings and immediately run the phylogenetic imputation.

    Args:
        genotype_data (GenotypeData instance): Input data. Its ``filename``, ``filetype``, ``popmap``, ``tree``, ``q``, ``site_rates`` and ``snpsdict`` attributes are read here or by the helpers called from here.

        minbr (float or None, optional): Minimum branch length forwarded to ``impute_phylo``. If None, ``impute_phylo`` falls back to its own default. Defaults to 0.0000000001.

        str_encodings (Dict[str, int], optional): Nucleotide-to-integer encodings. A copy is stored, so the shared default dictionary is never exposed through the instance.

        prefix (str, optional): Prefix to use with output files. Defaults to "imputer".

        save_plots (bool, optional): Whether ``impute_phylo`` writes one PDF per site. Defaults to False.

        disable_progressbar (bool, optional): Whether to disable the progress bar. Defaults to False.

        kwargs: Internal options. ``column_subset`` (defaults to None) and ``validation_mode`` (defaults to False) are the only keys read.

    Raises:
        TypeError: Propagated from ``_validate_arguments`` / ``_parse_arguments`` when required attributes of ``genotype_data`` are None.
    """
    self.genotype_data = genotype_data
    self.alnfile = genotype_data.filename
    self.filetype = genotype_data.filetype
    self.popmap = genotype_data.popmap
    # Copy so that mutating ``self.str_encodings`` cannot alter the default
    # dictionary that is shared by every call of this constructor.
    self.str_encodings = dict(str_encodings)
    self.prefix = prefix
    self.minbr = minbr
    self.save_plots = save_plots
    self.disable_progressbar = disable_progressbar
    self.column_subset = kwargs.get("column_subset", None)
    self.validation_mode = kwargs.get("validation_mode", False)

    # Populated by impute_phylo() from the convert_012() return value.
    self.valid_sites = None
    self.valid_sites_count = None

    self._validate_arguments(genotype_data)
    data, tree, q, site_rates = self._parse_arguments(genotype_data)

    # Forward the user's minimum branch length; previously it was stored
    # on the instance but never reached impute_phylo(). None means "use
    # impute_phylo's own default".
    phylo_kwargs = {} if minbr is None else {"minbr": minbr}
    imputed012 = self.impute_phylo(tree, data, q, site_rates, **phylo_kwargs)

    if self.validation_mode:
        # Validation mode exposes the raw impute_phylo() result.
        self.imputed = imputed012
    else:
        # Work on a copy so the caller's GenotypeData is left untouched.
        genotype_data = genotype_data.copy()
        genotype_data.snp_data = genotype_data.decode_012(
            imputed012, prefix=prefix, write_output=False
        )
        self.imputed = genotype_data
|
|
122
|
-
|
|
123
|
-
@property
|
|
124
|
-
def genotypes_012(self):
|
|
125
|
-
return self.imputed.genotypes012
|
|
126
|
-
|
|
127
|
-
@property
|
|
128
|
-
def snp_data(self):
|
|
129
|
-
return self.imputed.snp_data
|
|
130
|
-
|
|
131
|
-
@property
|
|
132
|
-
def alignment(self):
|
|
133
|
-
return self.imputed.alignment
|
|
134
|
-
|
|
135
|
-
def impute_phylo(
    self,
    tree: tt.tree,
    genotypes: Dict[str, List[Union[str, int]]],
    Q: pd.DataFrame,
    site_rates=None,
    minbr=0.0000000001,
) -> pd.DataFrame:
    """Impute missing genotype values with a guide tree.

    Process outline, for each SNP:
        1) Scale the rate matrix Q by the site rate (1.0 when
           ``site_rates`` is None).

        2) Postorder traversal of the tree to compute state likelihoods
           for internal nodes (tips -> root). Tips whose genotype is "N"
           contribute a likelihood of 1 for every state.

        3) Preorder traversal over internal nodes, then each missing tip
           is assigned the nucleotide with the highest likelihood given
           its parent node.

    The input ``genotypes`` lists are copied before imputation, so the
    caller's dictionary is not modified.

    Args:
        tree (toytree.tree object): Input tree.

        genotypes (Dict[str, List[Union[str, int]]]): Dictionary with key=sampleids, value=sequences.

        Q (pandas.DataFrame): Rate Matrix Q from .iqtree or separate file.

        site_rates (List): Site-specific substitution rates (used to weight per-site Q). If None, every site uses a rate of 1.0.

        minbr (float): Minimum branch length (those below this value will be treated as == minbr). If None, no minimum is applied.

    Returns:
        pandas.DataFrame: Imputed genotypes, built from the first element returned by ``self.genotype_data.convert_012``.

    Raises:
        IndexError: If index does not exist when trying to read genotypes.
        AssertionError: Sites must have same lengths.
        AssertionError: Missing data still found after imputation.
        SystemExit: If a tip of the tree is not a key of ``genotypes``.
    """
    # Genotypes whose second character is "/" are converted with
    # _str2iupac(); a too-short genotype is only an error when it is
    # integer-like.
    try:
        if list(genotypes.values())[0][0][1] == "/":
            genotypes = self._str2iupac(genotypes, self.str_encodings)
    except IndexError:
        if self._is_int(list(genotypes.values())[0][0][0]):
            raise

    if self.column_subset is not None:
        if isinstance(self.column_subset, np.ndarray):
            self.column_subset = self.column_subset.tolist()

        genotypes = {
            k: [v[i] for i in self.column_subset]
            for k, v in genotypes.items()
        }
    else:
        # Imputed calls are written into these lists below; copy them so
        # the caller's data is never mutated.
        genotypes = {k: list(v) for k, v in genotypes.items()}

    site_counts = {len(v) for v in genotypes.values()}
    # Raised explicitly (instead of ``assert``) so the check also runs
    # under ``python -O``; the exception type is unchanged.
    if len(site_counts) != 1:
        raise AssertionError("Some sites have different lengths!")
    nsites = next(iter(site_counts))

    outdir = f"{self.prefix}_imputation_plots"

    if self.save_plots:
        Path(outdir).mkdir(parents=True, exist_ok=True)

    for snp_index in progressbar(
        range(nsites),
        desc="Feature Progress: ",
        leave=True,
        disable=self.disable_progressbar,
    ):
        rate = 1.0 if site_rates is None else site_rates[snp_index]
        site_Q = Q.copy(deep=True) * rate

        # Samples with missing data at this site. The list keeps input
        # order (it is iterated and passed to the plotting helper); the
        # set gives constant-time membership tests inside the tree loop.
        bads = [
            samp
            for samp, calls in genotypes.items()
            if calls[snp_index].upper() == "N"
        ]
        missing = set(bads)

        # Postorder traversal to compute likelihoods up to the root.
        node_lik = dict()
        for node in tree.treenode.traverse("postorder"):
            if node.is_leaf():
                continue

            acc = node_lik.setdefault(node.idx, [Decimal(1)] * 4)

            for child in node.get_children():
                d = child.dist
                if minbr is not None and d < minbr:
                    d = minbr
                pt = self._transition_probs(site_Q, d)

                if not child.is_leaf():
                    contribution = self._get_internal_lik(
                        pt, node_lik[child.idx]
                    )
                elif child.name not in genotypes:
                    sys.exit(
                        f"Error: Taxon {child.name} not found in "
                        f"genotypes"
                    )
                else:
                    if child.name in missing:
                        # Missing tip: uninformative, 1 for every state.
                        contribution = [Decimal(1)] * 4
                    else:
                        # Sum the columns of pt for every allele returned
                        # by _get_iupac_full() for this genotype.
                        contribution = None
                        for allele in self._get_iupac_full(
                            genotypes[child.name][snp_index]
                        ):
                            column = [Decimal(x) for x in pt[allele]]
                            if contribution is None:
                                contribution = column
                            else:
                                contribution = [
                                    a + b
                                    for a, b in zip(contribution, column)
                                ]
                    node_lik[child.idx] = contribution

                # Multiply the child's contribution into the parent.
                acc = [
                    Decimal(contribution[i]) * Decimal(val)
                    for i, val in enumerate(acc)
                ]
                node_lik[node.idx] = acc

        # Preorder traversal over internal nodes.
        # NOTE(review): lik * (parent / lik) reduces to the parent's value
        # (up to Decimal rounding) and raises when lik == 0; kept exactly
        # as-is because the intended formula cannot be confirmed here.
        marg = node_lik.copy()
        for node in tree.treenode.traverse("preorder"):
            if node.is_root() or node.is_leaf():
                continue
            lik_arr = marg[node.idx]
            parent_arr = marg[node.up.idx]
            marg[node.idx] = [
                Decimal(lik) * (Decimal(parent_arr[i]) / Decimal(lik))
                for i, lik in enumerate(lik_arr)
            ]

        # Assign the most likely state to every tip with missing data.
        # (An earlier re-rooting approach was deprecated in favour of
        # this linear two-pass method.)
        for samp in bads:
            tip = tree.idx_dict[
                tree.get_mrca_idx_from_tip_labels(names=samp)
            ]
            dist = tip.dist
            # Apply the same minimum branch length as the postorder pass.
            if minbr is not None and dist < minbr:
                dist = minbr
            pt = self._transition_probs(site_Q, dist)
            lik = self._get_internal_lik(pt, marg[tip.up.idx])
            genotypes[samp][snp_index] = self._get_imputed_nuc(lik)

        if self.save_plots:
            self._draw_imputed_position(
                tree,
                bads,
                genotypes,
                snp_index,
                f"{outdir}/{self.prefix}_pos{snp_index}.pdf",
            )

    df = pd.DataFrame.from_dict(genotypes, orient="index")

    # Make sure no missing data remains in the dataset.
    if df.isin([-9]).any().any():
        raise AssertionError(
            "Imputation failed...Missing values found in the imputed dataset"
        )

    (
        imp_snps,
        self.valid_sites,
        self.valid_sites_count,
    ) = self.genotype_data.convert_012(
        df.to_numpy().tolist(), impute_mode=True
    )

    return pd.DataFrame.from_records(imp_snps)
|
|
457
|
-
|
|
458
|
-
def nbiallelic(self) -> int:
|
|
459
|
-
"""Get the number of remaining bi-allelic sites after imputation.
|
|
460
|
-
|
|
461
|
-
Returns:
|
|
462
|
-
int: Number of bi-allelic sites remaining after imputation.
|
|
463
|
-
"""
|
|
464
|
-
return len(self.imputed.columns)
|
|
465
|
-
|
|
466
|
-
def _get_imputed_nuc(self, lik_arr):
|
|
467
|
-
nucmap = {0: "A", 1: "C", 2: "G", 3: "T"}
|
|
468
|
-
maxpos = lik_arr.index(max(lik_arr))
|
|
469
|
-
picks = set([maxpos])
|
|
470
|
-
# NOT USED:
|
|
471
|
-
# Experimenting with ways to impute heterozygotes.
|
|
472
|
-
# Note that LRT isn't appropriate (as I used here) because
|
|
473
|
-
# the models are not nested & LRTS isn't necessarily expected
|
|
474
|
-
# to be chisq distributed.
|
|
475
|
-
# Check out Vuong test and read Lewis et al 2011 (doi: 10.1111/j.2041-210X.2010.00063.x)
|
|
476
|
-
#
|
|
477
|
-
# for index, alt in enumerate(lik_arr):
|
|
478
|
-
# if index == maxpos:
|
|
479
|
-
# continue
|
|
480
|
-
# else:
|
|
481
|
-
# lr = lrt(lik_arr[maxpos], alt, loglik=False)
|
|
482
|
-
# p = chi2.sf(lr)
|
|
483
|
-
# print(nucmap[maxpos], ":", str(lrt(lik_arr[maxpos], alt, loglik=False)), p)
|
|
484
|
-
return nucmap[maxpos]
|
|
485
|
-
|
|
486
|
-
def _parse_arguments(
    self, genotype_data: Any
) -> Tuple[Dict[str, List[Union[int, str]]], tt.tree, pd.DataFrame, Any]:
    """Collect the inputs needed for imputation from a GenotypeData object.

    Args:
        genotype_data (GenotypeData object): Initialized GenotypeData object.

    Returns:
        Dict[str, List[Union[int, str]]]: ``genotype_data.snpsdict``.

        toytree.tree: ``genotype_data.tree``.

        pandas.DataFrame: ``genotype_data.q``, the Q rate matrix.

        Any: ``genotype_data.site_rates``. May be None, in which case ``impute_phylo`` uses a rate of 1.0 for every site.

    Raises:
        TypeError: If ``genotype_data.q`` is None.
    """
    data = genotype_data.snpsdict
    tree = genotype_data.tree

    # The Q matrix is mandatory.
    q = genotype_data.q
    if q is None:
        raise TypeError("q must be defined in GenotypeData instance.")

    # Site-specific substitution rates are optional: impute_phylo()
    # accepts None and then leaves Q unscaled.
    site_rates = genotype_data.site_rates

    return data, tree, q, site_rates
|
|
519
|
-
|
|
520
|
-
def _validate_arguments(self, genotype_data: Any) -> None:
|
|
521
|
-
"""Validate that the correct arguments were supplied.
|
|
522
|
-
|
|
523
|
-
Args:
|
|
524
|
-
genotype_data (GenotypeData object): Input GenotypeData instance.
|
|
525
|
-
|
|
526
|
-
Raises:
|
|
527
|
-
TypeError: Must define genotype_data.tree in GenotypeData instance.
|
|
528
|
-
TypeError: Q rate matrix must be defined in GenotypeData instance.
|
|
529
|
-
"""
|
|
530
|
-
|
|
531
|
-
if genotype_data.tree is None:
|
|
532
|
-
raise TypeError("genotype_data.tree must be defined")
|
|
533
|
-
|
|
534
|
-
if genotype_data.q is None:
|
|
535
|
-
raise TypeError("q must be defined in GenotypeData instance.")
|
|
536
|
-
|
|
537
|
-
def _print_q(self, q: pd.DataFrame) -> None:
|
|
538
|
-
"""Print Rate Matrix Q.
|
|
539
|
-
|
|
540
|
-
Args:
|
|
541
|
-
q (pandas.DataFrame): Rate Matrix Q.
|
|
542
|
-
"""
|
|
543
|
-
print("Rate matrix Q:")
|
|
544
|
-
print("\tA\tC\tG\tT\t")
|
|
545
|
-
for nuc1 in ["A", "C", "G", "T"]:
|
|
546
|
-
print(nuc1, end="\t")
|
|
547
|
-
for nuc2 in ["A", "C", "G", "T"]:
|
|
548
|
-
print(q[nuc1][nuc2], end="\t")
|
|
549
|
-
print("")
|
|
550
|
-
|
|
551
|
-
def _is_int(self, val: Union[str, int]) -> bool:
|
|
552
|
-
"""Check if value is integer.
|
|
553
|
-
|
|
554
|
-
Args:
|
|
555
|
-
val (int or str): Value to check.
|
|
556
|
-
|
|
557
|
-
Returns:
|
|
558
|
-
bool: True if integer, False if string.
|
|
559
|
-
"""
|
|
560
|
-
try:
|
|
561
|
-
num = int(val)
|
|
562
|
-
except ValueError:
|
|
563
|
-
return False
|
|
564
|
-
return True
|
|
565
|
-
|
|
566
|
-
def _get_nuc_colors(self, nucs: List[str]) -> List[str]:
|
|
567
|
-
"""Get colors for each nucleotide when plotting.
|
|
568
|
-
|
|
569
|
-
Args:
|
|
570
|
-
nucs (List[str]): Nucleotides at current site.
|
|
571
|
-
|
|
572
|
-
Returns:
|
|
573
|
-
List[str]: Hex-code color values for each IUPAC nucleotide.
|
|
574
|
-
"""
|
|
575
|
-
ret = list()
|
|
576
|
-
for nuc in nucs:
|
|
577
|
-
nuc = nuc.upper()
|
|
578
|
-
if nuc == "A":
|
|
579
|
-
ret.append("#0000FF") # blue
|
|
580
|
-
elif nuc == "C":
|
|
581
|
-
ret.append("#FF0000") # red
|
|
582
|
-
elif nuc == "G":
|
|
583
|
-
ret.append("#00FF00") # green
|
|
584
|
-
elif nuc == "T":
|
|
585
|
-
ret.append("#FFFF00") # yellow
|
|
586
|
-
elif nuc == "R":
|
|
587
|
-
ret.append("#0dbaa9") # blue-green
|
|
588
|
-
elif nuc == "Y":
|
|
589
|
-
ret.append("#FFA500") # orange
|
|
590
|
-
elif nuc == "K":
|
|
591
|
-
ret.append("#9acd32") # yellow-green
|
|
592
|
-
elif nuc == "M":
|
|
593
|
-
ret.append("#800080") # purple
|
|
594
|
-
elif nuc == "S":
|
|
595
|
-
ret.append("#964B00")
|
|
596
|
-
elif nuc == "W":
|
|
597
|
-
ret.append("#C0C0C0")
|
|
598
|
-
else:
|
|
599
|
-
ret.append("#000000")
|
|
600
|
-
return ret
|
|
601
|
-
|
|
602
|
-
def _label_bads(
|
|
603
|
-
self, tips: List[str], labels: List[str], bads: List[str]
|
|
604
|
-
) -> List[str]:
|
|
605
|
-
"""Insert asterisks around bad nucleotides.
|
|
606
|
-
|
|
607
|
-
Args:
|
|
608
|
-
tips (List[str]): Tip labels (sample IDs).
|
|
609
|
-
labels (List[str]): List of nucleotides at current site.
|
|
610
|
-
bads (List[str]): List of tips that have missing data at current site.
|
|
611
|
-
|
|
612
|
-
Returns:
|
|
613
|
-
List[str]: IUPAC Nucleotides with "*" inserted around tips that had missing data.
|
|
614
|
-
"""
|
|
615
|
-
for i, t in enumerate(tips):
|
|
616
|
-
if t in bads:
|
|
617
|
-
labels[i] = "*" + str(labels[i]) + "*"
|
|
618
|
-
return labels
|
|
619
|
-
|
|
620
|
-
def _draw_imputed_position(
    self,
    tree: tt.tree,
    bads: List[str],
    genotypes: Dict[str, List[str]],
    pos: int,
    out: str = "tree.pdf",
) -> None:
    """Draw the nucleotides of one site as tip labels and save the plot.

    Tips listed in ``bads`` get their label wrapped in asterisks. Each tip label is colored according to its nucleotide, and the drawing is rendered to ``out`` as a PDF.

    Args:
        tree (toytree.tree): Input tree object.
        bads (List[str]): List of sampleIDs that have missing data at the current SNP site.
        genotypes (Dict[str, List[str]]): Genotypes at all SNP sites.
        pos (int): Current SNP index.
        out (str, optional): Output filename for toyplot object.
    """
    tips = tree.get_tip_labels()
    nucleotides = [genotypes[tip][pos] for tip in tips]

    # Colors are derived from the raw nucleotides. _label_bads() mutates
    # the list it is given, so it receives a copy; previously the same
    # list was used for both, and every starred label was colored black.
    colors = self._get_nuc_colors(nucleotides)
    labels = self._label_bads(tips, list(nucleotides), bads)

    mystyle = {
        "edge_type": "p",
        "edge_style": {
            "stroke": tt.colors[0],
            "stroke-width": 1,
        },
        "tip_labels_align": True,
        "tip_labels_style": {"font-size": "5px"},
        "node_labels": False,
    }

    canvas, _axes, _mark = tree.draw(
        tip_labels_colors=colors,
        tip_labels=labels,
        width=400,
        height=600,
        **mystyle,
    )

    toyplot.pdf.render(canvas, out)
|
|
669
|
-
|
|
670
|
-
def _all_missing(
    self,
    tree: tt.tree,
    node_index: int,
    snp_index: int,
    genotypes: Dict[str, List[str]],
) -> bool:
    """Report whether every tip below a node is missing at one SNP site.

    Args:
        tree (toytree.tree): Input guide tree object.

        node_index (int): Node index passed to ``tree.get_tip_labels(idx=...)`` to list the tips to inspect.

        snp_index (int): Index of current SNP site.

        genotypes (Dict[str, List[str]]): Genotypes at all SNP sites.

    Returns:
        bool: True if every listed tip has "N" or "-" (case-insensitive) at ``snp_index``, otherwise False.
    """
    missing_codes = ["N", "-"]
    return all(
        genotypes[tip][snp_index].upper() in missing_codes
        for tip in tree.get_tip_labels(idx=node_index)
    )
|
|
695
|
-
|
|
696
|
-
def _get_internal_lik(
|
|
697
|
-
self, pt: pd.DataFrame, lik_arr: List[float]
|
|
698
|
-
) -> List[float]:
|
|
699
|
-
"""Get ancestral state likelihoods for internal nodes of the tree.
|
|
700
|
-
|
|
701
|
-
Postorder traversal to calculate internal ancestral state likelihoods (tips -> root).
|
|
702
|
-
|
|
703
|
-
Args:
|
|
704
|
-
pt (pandas.DataFrame): Transition probabilities calculated from Rate Matrix Q.
|
|
705
|
-
lik_arr (List[float]): Likelihoods for nodes or leaves.
|
|
706
|
-
|
|
707
|
-
Returns:
|
|
708
|
-
List[float]: Internal likelihoods.
|
|
709
|
-
"""
|
|
710
|
-
ret = list()
|
|
711
|
-
for i, val in enumerate(lik_arr):
|
|
712
|
-
col = list(pt.iloc[:, i])
|
|
713
|
-
sum = Decimal(0.0)
|
|
714
|
-
for v in col:
|
|
715
|
-
sum += Decimal(v) * Decimal(val)
|
|
716
|
-
ret.append(sum)
|
|
717
|
-
return ret
|
|
718
|
-
|
|
719
|
-
def _transition_probs(self, Q: pd.DataFrame, t: float) -> pd.DataFrame:
|
|
720
|
-
"""Get transition probabilities for tree.
|
|
721
|
-
|
|
722
|
-
Args:
|
|
723
|
-
Q (pd.DataFrame): Rate Matrix Q.
|
|
724
|
-
t (float): Tree distance of child.
|
|
725
|
-
|
|
726
|
-
Returns:
|
|
727
|
-
pd.DataFrame: Transition probabilities.
|
|
728
|
-
"""
|
|
729
|
-
ret = Q.copy(deep=True)
|
|
730
|
-
m = Q.to_numpy()
|
|
731
|
-
pt = scipy.linalg.expm(m * t)
|
|
732
|
-
ret[:] = pt
|
|
733
|
-
return ret
|
|
734
|
-
|
|
735
|
-
def _str2iupac(
|
|
736
|
-
self, genotypes: Dict[str, List[str]], str_encodings: Dict[str, int]
|
|
737
|
-
) -> Dict[str, List[str]]:
|
|
738
|
-
"""Convert STRUCTURE-format encodings to IUPAC bases.
|
|
739
|
-
|
|
740
|
-
Args:
|
|
741
|
-
genotypes (Dict[str, List[str]]): Genotypes at all sites.
|
|
742
|
-
str_encodings (Dict[str, int]): Dictionary that maps IUPAC bases (keys) to integer encodings (values).
|
|
743
|
-
|
|
744
|
-
Returns:
|
|
745
|
-
Dict[str, List[str]]: Genotypes converted to IUPAC format.
|
|
746
|
-
"""
|
|
747
|
-
a = str_encodings["A"]
|
|
748
|
-
c = str_encodings["C"]
|
|
749
|
-
g = str_encodings["G"]
|
|
750
|
-
t = str_encodings["T"]
|
|
751
|
-
n = str_encodings["N"]
|
|
752
|
-
nuc = {
|
|
753
|
-
f"{a}/{a}": "A",
|
|
754
|
-
f"{c}/{c}": "C",
|
|
755
|
-
f"{g}/{g}": "G",
|
|
756
|
-
f"{t}/{t}": "T",
|
|
757
|
-
f"{n}/{n}": "N",
|
|
758
|
-
f"{a}/{c}": "M",
|
|
759
|
-
f"{c}/{a}": "M",
|
|
760
|
-
f"{a}/{g}": "R",
|
|
761
|
-
f"{g}/{a}": "R",
|
|
762
|
-
f"{a}/{t}": "W",
|
|
763
|
-
f"{t}/{a}": "W",
|
|
764
|
-
f"{c}/{g}": "S",
|
|
765
|
-
f"{g}/{c}": "S",
|
|
766
|
-
f"{c}/{t}": "Y",
|
|
767
|
-
f"{t}/{c}": "Y",
|
|
768
|
-
f"{g}/{t}": "K",
|
|
769
|
-
f"{t}/{g}": "K",
|
|
770
|
-
}
|
|
771
|
-
|
|
772
|
-
for k, v in genotypes.items():
|
|
773
|
-
for i, gt in enumerate(v):
|
|
774
|
-
v[i] = nuc[gt]
|
|
775
|
-
|
|
776
|
-
return genotypes
|
|
777
|
-
|
|
778
|
-
def _get_iupac_full(self, char: str) -> List[str]:
|
|
779
|
-
"""Map nucleotide to list of expanded IUPAC encodings.
|
|
780
|
-
|
|
781
|
-
Args:
|
|
782
|
-
char (str): Current nucleotide.
|
|
783
|
-
|
|
784
|
-
Returns:
|
|
785
|
-
List[str]: List of nucleotides in ``char`` expanded IUPAC.
|
|
786
|
-
"""
|
|
787
|
-
char = char.upper()
|
|
788
|
-
iupac = {
|
|
789
|
-
"A": ["A"],
|
|
790
|
-
"G": ["G"],
|
|
791
|
-
"C": ["C"],
|
|
792
|
-
"T": ["T"],
|
|
793
|
-
"N": ["A", "C", "T", "G"],
|
|
794
|
-
"-": ["A", "C", "T", "G"],
|
|
795
|
-
"R": ["A", "G"],
|
|
796
|
-
"Y": ["C", "T"],
|
|
797
|
-
"S": ["G", "C"],
|
|
798
|
-
"W": ["A", "T"],
|
|
799
|
-
"K": ["G", "T"],
|
|
800
|
-
"M": ["A", "C"],
|
|
801
|
-
"B": ["C", "G", "T"],
|
|
802
|
-
"D": ["A", "G", "T"],
|
|
803
|
-
"H": ["A", "C", "T"],
|
|
804
|
-
"V": ["A", "C", "G"],
|
|
805
|
-
}
|
|
806
|
-
|
|
807
|
-
ret = iupac[char]
|
|
808
|
-
return ret
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
class ImputeAlleleFreq:
    """Impute missing 012-encoded genotypes with the most frequent genotype.

    With ``by_populations=False`` each missing value is replaced by the most frequent genotype of its column across all samples. With ``by_populations=True`` the most frequent genotype within the sample's population is used; population IDs are taken from ``GenotypeData.populations``.

    Args:
        genotype_data (GenotypeData object): GenotypeData instance. May be None only when ``gt`` is supplied together with ``validation_mode=True``.

        by_populations (bool, optional): Whether or not to impute by-population or globally. Defaults to False (global).

        diploid (bool, optional): Stored on the instance as ``self.diploid``; ``fit_predict`` in this class does not consult it. Defaults to True.

        default (int, optional): Stored on the instance as ``self.default``; ``fit_predict`` in this class does not consult it. Defaults to 0.

        missing (int, optional): Missing data value. Defaults to -9.

        verbose (bool, optional): Whether to print status updates. Set to False for no status updates. Defaults to True.

        prefix (str, optional): Prefix of the output directory used by ``write2file`` and passed to ``decode_012``. Defaults to "imputer".

        kwargs (Dict[str, Any]): Additional keyword arguments, primarily for internal purposes. Options include: {"iterative_mode": bool, "validation_mode": bool, "gt": List[List[int]]}. ``iterative_mode`` makes ``fit_predict`` return float32 values instead of Int8. ``validation_mode`` makes ``imputed`` hold the raw imputed array instead of a GenotypeData copy. ``gt`` supplies the 012 genotypes directly instead of reading them from ``genotype_data``.

    Raises:
        TypeError: If both ``genotype_data`` and ``gt`` are None, if ``genotype_data`` is None outside validation mode, or if ``by_populations`` is True without population IDs.

    Attributes:
        imputed (GenotypeData): New GenotypeData instance with imputed data (raw imputed array in validation mode).

        valid_cols (List[int]): Columns returned by ``fit_predict`` as successfully imputed per population.

    Example:
        >>> data = GenotypeData(
        ...     filename="test.str",
        ...     filetype="structure2rowPopID",
        ...     popmapfile="test.popmap",
        ... )
        >>> afpop = ImputeAlleleFreq(
        ...     genotype_data=data,
        ...     by_populations=True,
        ... )
        >>> gd_afpop = afpop.imputed
    """

    def __init__(
        self,
        genotype_data: "GenotypeData",
        *,
        by_populations: bool = False,
        diploid: bool = True,
        default: int = 0,
        missing: int = -9,
        verbose: bool = True,
        prefix="imputer",
        **kwargs: Dict[str, Any],
    ) -> None:
        # ``gt`` has to be read before the validation below refers to it.
        gt = kwargs.get("gt", None)

        if genotype_data is None and gt is None:
            raise TypeError("GenotypeData instance or gt must be provided.")

        self.iterative_mode = kwargs.get("iterative_mode", False)
        self.validation_mode = kwargs.get("validation_mode", False)

        if genotype_data is None and not self.validation_mode:
            # Outside validation mode the result is written onto a copy of
            # genotype_data, so it cannot be omitted.
            raise TypeError(
                "GenotypeData instance must be provided unless validation_mode is True."
            )

        if gt is None:
            gt_list = genotype_data.genotypes_012(fmt="list")
        else:
            gt_list = gt

        if by_populations:
            if genotype_data is None or genotype_data.populations is None:
                raise TypeError(
                    "When by_populations is True, GenotypeData instance must have a defined populations attribute"
                )
            self.pops = genotype_data.populations
        else:
            self.pops = None

        self.diploid = diploid
        self.default = default
        self.missing = missing
        self.verbose = verbose
        # write2file builds its output path from this.
        self.prefix = prefix

        if not self.validation_mode:
            imputed012, self.valid_cols = self.fit_predict(gt_list)
            genotype_data = genotype_data.copy()
            genotype_data.snp_data = genotype_data.decode_012(
                imputed012, prefix=prefix, write_output=False
            )
            genotype_data.genotypes_012 = imputed012
            self.imputed = genotype_data
        else:
            self.imputed, self.valid_cols = self.fit_predict(gt_list)

    @property
    def genotypes_012(self):
        """Return ``genotypes_012`` of the imputed object."""
        return self.imputed.genotypes_012

    @property
    def snp_data(self):
        """Return ``snp_data`` of the imputed object."""
        return self.imputed.snp_data

    @property
    def alignment(self):
        """Return ``alignment`` of the imputed object."""
        return self.imputed.alignment

    def fit_predict(
        self, X: List[List[int]]
    ) -> Tuple[
        Union[pd.DataFrame, np.ndarray, List[List[Union[int, float]]]],
        List[int],
    ]:
        """Impute missing genotypes with the per-column most frequent value.

        Values equal to ``self.missing`` are treated as missing. When population IDs are set, the mode is taken within each population; a column for which that fails because a population has no observed value is filled with the global mode instead (or 0.0 if the whole column is missing).

        Args:
            X (List[List[int]], numpy.ndarray, or pandas.DataFrame): 012-encoded genotypes obtained from the GenotypeData object.

        Returns:
            pandas.DataFrame, numpy.ndarray, or List[List[Union[int, float]]]: Imputed genotypes of same shape as data. A nested list unless ``validation_mode`` is set, in which case the underlying array is returned.

            List[int]: Columns imputed from their per-population mode. Columns that fell back to the global mode are not listed, and the list is empty for global imputation.

        Raises:
            TypeError: X must be either list, np.ndarray, or pd.DataFrame.
        """
        if self.verbose:
            if self.pops is not None:
                print("\nImputing by population allele frequencies...")
            else:
                print("\nImputing by global allele frequency...")

        if isinstance(X, (list, np.ndarray)):
            df = pd.DataFrame(X)
        elif isinstance(X, pd.DataFrame):
            df = X.copy()
        else:
            raise TypeError(
                f"X must be of type list(list(int)), numpy.ndarray, "
                f"or pandas.DataFrame, but got {type(X)}"
            )

        df = df.astype(int)
        df = df.replace(self.missing, np.nan)

        valid_cols = []

        if self.pops is not None:
            import warnings

            # Impute per-population mode.
            df["pops"] = self.pops
            groups = df.groupby(["pops"], sort=False)

            # Collect imputed columns in a list and concatenate once.
            columns = []
            bad_cnt = 0

            for col in df.columns:
                if col == "pops":
                    continue
                try:
                    columns.append(
                        groups[col].transform(
                            lambda x: x.fillna(x.mode().iloc[0])
                        )
                    )
                    valid_cols.append(col)
                except IndexError as e:
                    # mode() is empty when a population has no observed
                    # value, so .iloc[0] raises; anything else is re-raised.
                    if not str(e).lower().startswith(
                        "single positional indexer"
                    ):
                        raise
                    bad_cnt += 1
                    # Impute with global mode, unless globally missing in
                    # which case call as 0.0
                    if df[col].isna().all():
                        columns.append(df[col].fillna(0.0))
                    else:
                        columns.append(
                            df[col].fillna(df[col].mode().iloc[0])
                        )

            if columns:
                data = pd.concat(columns, axis=1)
            else:
                data = pd.DataFrame(index=df.index)

            if bad_cnt > 0 and not self.validation_mode:
                warnings.warn(
                    f"\n{bad_cnt} columns were imputed with the "
                    f"global mode because some of the populations "
                    f"contained only missing data",
                    UserWarning,
                )
        else:
            # Impute global mode.
            imp = SimpleImputer(strategy="most_frequent")

            # replace any columns that are fully missing
            fully_missing = df.isna().all()
            df.loc[:, fully_missing] = df.loc[:, fully_missing].fillna(0.0)

            data = pd.DataFrame(imp.fit_transform(df))

        if self.iterative_mode:
            data = data.astype(dtype="float32")
        else:
            data = data.astype(dtype="Int8")

        if self.verbose:
            print("Done!")

        if not self.validation_mode:
            return data.values.tolist(), valid_cols
        return data.values, valid_cols

    def write2file(
        self, X: Union[pd.DataFrame, np.ndarray, List[List[Union[int, float]]]]
    ) -> None:
        """Write imputed data to file on disk.

        The file is written to ``<prefix>_output/alignments/Unsupervised/ImputeAlleleFreq/imputed_012.csv`` without header or index; the directory is created if needed.

        Args:
            X (pandas.DataFrame, numpy.ndarray, List[List[Union[int, float]]]): Imputed data to write to file.

        Raises:
            TypeError: If X is of unsupported type.
        """
        outdir = os.path.join(
            f"{self.prefix}_output",
            "alignments",
            "Unsupervised",
            "ImputeAlleleFreq",
        )
        Path(outdir).mkdir(parents=True, exist_ok=True)
        outfile = os.path.join(outdir, "imputed_012.csv")

        if isinstance(X, pd.DataFrame):
            df = X
        elif isinstance(X, (np.ndarray, list)):
            df = pd.DataFrame(X)
        else:
            raise TypeError(
                f"Could not write imputed data because it is of incorrect "
                f"type. Got {type(X)}"
            )

        df.to_csv(outfile, header=False, index=False)
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
class ImputeMF:
    """Impute missing data using matrix factorization.

    The 012 genotype matrix is approximated by the product of two low-rank matrices fitted by gradient descent on the observed entries; each missing entry is then replaced by the observed genotype of its column that is closest to the reconstructed value.

    Args:
        genotype_data (GenotypeData object or None): GenotypeData instance. May be None only when ``gt`` is supplied together with ``validation_mode=True``.

        latent_features (int, optional): The number of latent variables used to reduce dimensionality of the data. Defaults to 2.

        max_iter (int, optional): Maximum number of passes over the matrix. Defaults to 100.

        learning_rate (float, optional): Step size of the gradient updates. Defaults to 0.0002.

        regularization_param (float, optional): Weight of the regularization term in the updates and in the error. Defaults to 0.02.

        tol (float, optional): Tolerance of the stopping condition; a pass whose error changes by less than this counts as a failure to improve. Defaults to 0.1.

        n_fail (int, optional): Number of consecutive non-improving passes after which fitting stops. Defaults to 20.

        missing (int, optional): Missing data value. Defaults to -9.

        prefix (str, optional): Prefix for writing output files. Defaults to "imputer".

        verbose (bool, optional): Whether to print status updates. Set to False for no status updates. Defaults to True.

        **kwargs (Dict[str, Any]): Additional keyword arguments, primarily for internal purposes. Options include: {"iterative_mode": bool, "validation_mode": bool, "gt": array of 012 genotypes}. ``validation_mode`` makes ``imputed`` hold the raw imputed array; ``gt`` supplies the genotypes directly.

    Attributes:
        imputed (GenotypeData): New GenotypeData instance with imputed data (raw imputed array in validation mode).

    Example:
        >>> data = GenotypeData(
        ...     filename="test.str",
        ...     filetype="structure",
        ...     popmapfile="test.popmap",
        ... )
        >>> nmf = ImputeMF(genotype_data=data)
        >>> # Get GenotypeData instance.
        >>> gd_nmf = nmf.imputed

    Raises:
        TypeError: If both ``genotype_data`` and ``gt`` are None, or ``genotype_data`` is None outside validation mode.
    """

    def __init__(
        self,
        genotype_data,
        *,
        latent_features: int = 2,
        max_iter: int = 100,
        learning_rate: float = 0.0002,
        regularization_param: float = 0.02,
        tol: float = 0.1,
        n_fail: int = 20,
        missing: int = -9,
        prefix: str = "imputer",
        verbose: bool = True,
        **kwargs: Dict[str, Any],
    ) -> None:
        self.max_iter = max_iter
        self.latent_features = latent_features
        self.n_fail = n_fail
        self.learning_rate = learning_rate
        self.tol = tol
        self.regularization_param = regularization_param
        self.missing = missing
        self.prefix = prefix
        self.verbose = verbose
        self.iterative_mode = kwargs.get("iterative_mode", False)
        self.validation_mode = kwargs.get("validation_mode", False)

        gt = kwargs.get("gt", None)

        if genotype_data is None and gt is None:
            raise TypeError("GenotypeData and gt cannot both be NoneType.")

        if genotype_data is None and not self.validation_mode:
            # Outside validation mode the result is written onto a copy of
            # genotype_data, so it cannot be omitted.
            raise TypeError(
                "GenotypeData instance must be provided unless validation_mode is True."
            )

        if gt is None:
            X = genotype_data.genotypes_012(fmt="numpy")
        else:
            X = gt

        # fit_predict works on its own copy, so X is left untouched.
        imputed012 = pd.DataFrame(self.fit_predict(X))

        if self.validation_mode:
            self.imputed = imputed012.to_numpy()
        else:
            genotype_data = genotype_data.copy()
            genotype_data.snp_data = genotype_data.decode_012(
                imputed012, prefix=prefix, write_output=False
            )
            self.imputed = genotype_data

    @property
    def genotypes_012(self):
        """Return ``genotypes_012`` of the imputed object."""
        return self.imputed.genotypes_012

    @property
    def snp_data(self):
        """Return ``snp_data`` of the imputed object."""
        return self.imputed.snp_data

    @property
    def alignment(self):
        """Return ``alignment`` of the imputed object."""
        return self.imputed.alignment

    def fit_predict(self, X):
        """Fit the factorization and fill in the missing genotypes.

        Args:
            X (numpy.ndarray or array-like): 2-D matrix of 012 genotypes. Entries equal to ``self.missing`` or negative are treated as missing. The input is not modified.

        Returns:
            numpy.ndarray: Copy of ``X`` in which the missing entries are replaced by imputed genotypes; observed entries are unchanged.
        """
        if self.verbose:
            print("Doing MF imputation...")

        # Work on a private copy so the caller's array is never altered.
        filled = np.array(X)

        # Shift genotypes to 1..3 so that 0 can mark a missing entry.
        R = filled.astype(int)
        R[R == self.missing] = -9
        R = R + 1
        R[R < 0] = 0
        missing_mask = R == 0

        if R.size == 0:
            return filled

        n_row = len(R)
        n_col = len(R[0])
        p = np.random.rand(n_row, self.latent_features)
        q = np.random.rand(n_col, self.latent_features)
        q_t = q.T
        fails = 0
        e_current = None
        for _ in range(self.max_iter):
            # Gradient step on every observed entry. The update of q_t
            # deliberately uses the p value updated on the line before.
            for i in range(n_row):
                for j in range(n_col):
                    if R[i][j] > 0:
                        eij = R[i][j] - np.dot(p[i, :], q_t[:, j])
                        for k in range(self.latent_features):
                            p[i][k] = p[i][k] + self.learning_rate * (
                                2 * eij * q_t[k][j]
                                - self.regularization_param * p[i][k]
                            )
                            q_t[k][j] = q_t[k][j] + self.learning_rate * (
                                2 * eij * p[i][k]
                                - self.regularization_param * q_t[k][j]
                            )

            # Regularized squared error over the observed entries.
            e = 0
            for i in range(n_row):
                for j in range(len(R[i])):
                    if R[i][j] > 0:
                        e = e + pow(R[i][j] - np.dot(p[i, :], q_t[:, j]), 2)
                        for k in range(self.latent_features):
                            e = e + (self.regularization_param / 2) * (
                                pow(p[i][k], 2) + pow(q_t[k][j], 2)
                            )

            # Stop after n_fail consecutive passes with a change below tol.
            if e_current is None:
                e_current = e
            else:
                if abs(e_current - e) < self.tol:
                    fails += 1
                else:
                    fails = 0
                e_current = e
            if fails >= self.n_fail:
                break
        nR = np.dot(p, q_t)

        # transform values per-column (i.e., only allowing values found in original)
        tR = self.transform(R, nR)

        # insert imputed values for missing genotypes
        filled[missing_mask] = tR[missing_mask]

        if self.verbose:
            print("Done!")

        return filled

    def transform(self, original, predicted):
        """Snap reconstructed values to genotypes observed in each column.

        Args:
            original (numpy.ndarray): Shifted genotype matrix in which 0 marks a missing entry and observed genotypes are positive.

            predicted (numpy.ndarray): Reconstructed matrix of the same shape. It is not modified.

        Returns:
            numpy.ndarray: Matrix on the unshifted scale (observed codes minus one). Every value is the column's observed genotype nearest to the prediction. A column with no observed genotype is set to 0, the same choice ``ImputeAlleleFreq`` makes for fully missing columns.
        """
        original = np.asarray(original)
        snapped = np.array(predicted)
        n_col = len(original[0])
        for j in range(n_col):
            expected = original[:, j]
            options = np.unique(expected[expected != 0])
            if options.size == 0:
                # Nothing observed in this column: use shifted code 1,
                # which becomes genotype 0 after the shift back below.
                snapped[:, j] = 1
                continue
            # Index of the closest observed code for every row; ties go to
            # the smaller code because options is sorted ascending.
            nearest = np.abs(
                snapped[:, j][:, None] - options[None, :]
            ).argmin(axis=1)
            snapped[:, j] = options[nearest]
        snapped = snapped - 1
        snapped[snapped < 0] = -9
        return snapped

    def accuracy(self, expected, predicted):
        """Return the share of observed genotypes that were reproduced.

        Args:
            expected (numpy.ndarray): Genotypes with negative values marking missing entries.

            predicted (numpy.ndarray): Genotypes to compare, same shape.

        Returns:
            float: Fraction of non-negative entries of ``expected`` equal to ``predicted``; NaN when ``expected`` has no non-negative entry.
        """
        expected = np.asarray(expected)
        predicted = np.asarray(predicted)
        observed = expected >= 0
        tot = expected[observed].size
        if tot == 0:
            return float("nan")
        n_same = np.sum(expected[observed] == predicted[observed])
        return n_same / tot

    def write2file(
        self, X: Union[pd.DataFrame, np.ndarray, List[List[Union[int, float]]]]
    ) -> None:
        """Write imputed data to file on disk.

        The file is written to ``<prefix>_output/alignments/Unsupervised/ImputeMF/imputed_012.csv`` without header or index; the directory is created if needed.

        Args:
            X (pandas.DataFrame, numpy.ndarray, List[List[Union[int, float]]]): Imputed data to write to file.

        Raises:
            TypeError: If X is of unsupported type.
        """
        outdir = os.path.join(
            f"{self.prefix}_output",
            "alignments",
            "Unsupervised",
            "ImputeMF",
        )
        Path(outdir).mkdir(parents=True, exist_ok=True)
        outfile = os.path.join(outdir, "imputed_012.csv")

        if isinstance(X, pd.DataFrame):
            df = X
        elif isinstance(X, (np.ndarray, list)):
            df = pd.DataFrame(X)
        else:
            raise TypeError(
                f"Could not write imputed data because it is of incorrect "
                f"type. Got {type(X)}"
            )

        df.to_csv(outfile, header=False, index=False)
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
class ImputeRefAllele:
    """Impute missing data by reference allele.

    Numeric (012-encoded) genotypes have their missing values replaced by 0; non-numeric genotypes have them replaced, column by column, by the reference alleles read from ``genotype_data.ref``.

    Args:
        genotype_data (GenotypeData object): GenotypeData instance. Always required, because the reference alleles are read from it.

        missing (int, optional): Missing data value. Defaults to -9.

        prefix (str, optional): Prefix of the output directory used by ``write2file`` and passed to ``decode_012``. Defaults to "imputer".

        verbose (bool, optional): Whether to print status updates. Set to False for no status updates. Defaults to True.

        kwargs (Dict[str, Any]): Additional keyword arguments, primarily for internal purposes. Options include: {"iterative_mode": bool, "validation_mode": bool, "gt": List[List[int]]}. ``validation_mode`` makes ``imputed`` hold the raw imputed array instead of a GenotypeData copy. ``gt`` supplies the genotypes directly instead of reading them from ``genotype_data``.

    Raises:
        TypeError: genotype_data cannot be NoneType.

    Attributes:
        imputed (GenotypeData): New GenotypeData instance with imputed data (raw imputed array in validation mode).

    Example:
        >>> data = GenotypeData(
        ...     filename="test.str",
        ...     filetype="structure2rowPopID",
        ...     popmapfile="test.popmap",
        ... )
        >>> refallele = ImputeRefAllele(
        ...     genotype_data=data
        ... )
        >>> gd_refallele = refallele.imputed
    """

    def __init__(
        self,
        genotype_data: "GenotypeData",
        *,
        missing: int = -9,
        prefix="imputer",
        verbose: bool = True,
        **kwargs: Dict[str, Any],
    ) -> None:
        if genotype_data is None:
            raise TypeError("GenotypeData instance must be provided.")

        gt = kwargs.get("gt", None)

        if gt is None:
            gt_list = genotype_data.genotypes_012(fmt="list")
        else:
            gt_list = gt

        self.missing = missing
        # write2file builds its output path from this.
        self.prefix = prefix
        self.verbose = verbose
        self.iterative_mode = kwargs.get("iterative_mode", False)
        self.validation_mode = kwargs.get("validation_mode", False)

        # Get reference alleles from GenotypeData object
        self.ref_alleles = genotype_data.ref

        if not self.validation_mode:
            imputed012 = self.fit_predict(gt_list)
            genotype_data = genotype_data.copy()
            genotype_data.snp_data = genotype_data.decode_012(
                imputed012, prefix=prefix, write_output=False
            )
            self.imputed = genotype_data
        else:
            self.imputed = self.fit_predict(gt_list)

    @property
    def genotypes_012(self):
        """Return ``genotypes_012`` of the imputed object."""
        return self.imputed.genotypes_012

    @property
    def snp_data(self):
        """Return ``snp_data`` of the imputed object."""
        return self.imputed.snp_data

    @property
    def alignment(self):
        """Return ``alignment`` of the imputed object."""
        return self.imputed.alignment

    def fit_predict(
        self, X: List[List[Union[int, str]]]
    ) -> Union[pd.DataFrame, np.ndarray, List[List[Union[int, str]]]]:
        """Impute missing genotypes using reference alleles.

        Values equal to ``self.missing`` (and existing NaN values) are treated as missing. When every column is numeric the data is taken to be 012-encoded and missing values become 0. Otherwise each column is filled with the matching entry of ``self.ref_alleles``, paired by column position.

        Args:
            X (List[List[Union[int, str]]], numpy.ndarray, or pandas.DataFrame): Genotypes obtained from the GenotypeData object.

        Returns:
            pandas.DataFrame, numpy.ndarray, or List[List[Union[int, str]]]: Imputed genotypes of same shape as data. A nested list unless ``validation_mode`` is set, in which case the underlying array is returned.

        Raises:
            TypeError: X must be of type list(list(int or str)), numpy.ndarray, or pandas.DataFrame.
        """
        if self.verbose:
            print("\nImputing missing data with reference alleles...")

        if isinstance(X, (list, np.ndarray)):
            df = pd.DataFrame(X)
        elif isinstance(X, pd.DataFrame):
            df = X.copy()
        else:
            raise TypeError(
                f"X must be of type list(list(int or str)), numpy.ndarray, "
                f"or pandas.DataFrame, but got {type(X)}"
            )

        # Decide the encoding before any missing value is turned into NaN:
        # doing so afterwards would see integer columns as float and send
        # 012 data down the allele branch.
        is_numeric = all(
            pd.api.types.is_numeric_dtype(dtype) for dtype in df.dtypes
        )

        if is_numeric:
            # 0 is the homozygous-reference genotype in 012 encoding.
            df = df.replace(self.missing, 0).fillna(0)
        else:
            df = df.replace(self.missing, np.nan)
            # Assign the filled frame back instead of filling a column
            # in place, which may act on a temporary copy.
            df = df.fillna(dict(zip(df.columns, self.ref_alleles)))

        if self.verbose:
            print("Done!")

        if not self.validation_mode:
            return df.values.tolist()
        return df.values

    def write2file(
        self, X: Union[pd.DataFrame, np.ndarray, List[List[Union[int, float]]]]
    ) -> None:
        """Write imputed data to file on disk.

        The file is written to ``<prefix>_output/alignments/Unsupervised/ImputeRefAllele/imputed_012.csv`` without header or index; the directory is created if needed.

        Args:
            X (pandas.DataFrame, numpy.ndarray, List[List[Union[int, float]]]): Imputed data to write to file.

        Raises:
            TypeError: If X is of unsupported type.
        """
        outdir = os.path.join(
            f"{self.prefix}_output",
            "alignments",
            "Unsupervised",
            "ImputeRefAllele",
        )
        Path(outdir).mkdir(parents=True, exist_ok=True)
        outfile = os.path.join(outdir, "imputed_012.csv")

        if isinstance(X, pd.DataFrame):
            df = X
        elif isinstance(X, (np.ndarray, list)):
            df = pd.DataFrame(X)
        else:
            raise TypeError(
                f"Could not write imputed data because it is of incorrect "
                f"type. Got {type(X)}"
            )

        df.to_csv(outfile, header=False, index=False)
|