pg-sui: pg_sui-1.0.2.1-py3-none-any.whl → pg_sui-1.6.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pg-sui might be problematic. Click here for more details.

Files changed (112)
  1. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/METADATA +51 -70
  2. pg_sui-1.6.8.dist-info/RECORD +78 -0
  3. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.8.dist-info/entry_points.txt +4 -0
  5. pg_sui-1.6.8.dist-info/top_level.txt +1 -0
  6. pgsui/__init__.py +35 -54
  7. pgsui/_version.py +34 -0
  8. pgsui/cli.py +635 -0
  9. pgsui/data_processing/config.py +576 -0
  10. pgsui/data_processing/containers.py +1782 -0
  11. pgsui/data_processing/transformers.py +121 -1103
  12. pgsui/electron/app/__main__.py +5 -0
  13. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  14. pgsui/electron/app/icons/icons/128x128.png +0 -0
  15. pgsui/electron/app/icons/icons/16x16.png +0 -0
  16. pgsui/electron/app/icons/icons/24x24.png +0 -0
  17. pgsui/electron/app/icons/icons/256x256.png +0 -0
  18. pgsui/electron/app/icons/icons/32x32.png +0 -0
  19. pgsui/electron/app/icons/icons/48x48.png +0 -0
  20. pgsui/electron/app/icons/icons/512x512.png +0 -0
  21. pgsui/electron/app/icons/icons/64x64.png +0 -0
  22. pgsui/electron/app/icons/icons/icon.icns +0 -0
  23. pgsui/electron/app/icons/icons/icon.ico +0 -0
  24. pgsui/electron/app/main.js +189 -0
  25. pgsui/electron/app/package-lock.json +6893 -0
  26. pgsui/electron/app/package.json +50 -0
  27. pgsui/electron/app/preload.js +15 -0
  28. pgsui/electron/app/server.py +146 -0
  29. pgsui/electron/app/ui/logo.png +0 -0
  30. pgsui/electron/app/ui/renderer.js +130 -0
  31. pgsui/electron/app/ui/styles.css +59 -0
  32. pgsui/electron/app/ui/ui_shim.js +72 -0
  33. pgsui/electron/bootstrap.py +43 -0
  34. pgsui/electron/launch.py +59 -0
  35. pgsui/electron/package.json +14 -0
  36. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  37. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  38. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  39. pgsui/impute/deterministic/imputers/allele_freq.py +691 -0
  40. pgsui/impute/deterministic/imputers/mode.py +679 -0
  41. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  42. pgsui/impute/deterministic/imputers/phylo.py +971 -0
  43. pgsui/impute/deterministic/imputers/ref_allele.py +530 -0
  44. pgsui/impute/supervised/base.py +339 -0
  45. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +293 -0
  46. pgsui/impute/supervised/imputers/random_forest.py +287 -0
  47. pgsui/impute/unsupervised/base.py +924 -0
  48. pgsui/impute/unsupervised/callbacks.py +89 -263
  49. pgsui/impute/unsupervised/imputers/autoencoder.py +972 -0
  50. pgsui/impute/unsupervised/imputers/nlpca.py +1264 -0
  51. pgsui/impute/unsupervised/imputers/ubp.py +1288 -0
  52. pgsui/impute/unsupervised/imputers/vae.py +957 -0
  53. pgsui/impute/unsupervised/loss_functions.py +158 -0
  54. pgsui/impute/unsupervised/models/autoencoder_model.py +208 -558
  55. pgsui/impute/unsupervised/models/nlpca_model.py +149 -468
  56. pgsui/impute/unsupervised/models/ubp_model.py +198 -1317
  57. pgsui/impute/unsupervised/models/vae_model.py +259 -618
  58. pgsui/impute/unsupervised/nn_scorers.py +215 -0
  59. pgsui/utils/classification_viz.py +591 -0
  60. pgsui/utils/misc.py +35 -480
  61. pgsui/utils/plotting.py +514 -824
  62. pgsui/utils/scorers.py +212 -438
  63. pg_sui-1.0.2.1.dist-info/RECORD +0 -75
  64. pg_sui-1.0.2.1.dist-info/top_level.txt +0 -3
  65. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  66. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  67. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  68. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  69. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  70. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  71. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  72. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  73. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  74. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  75. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  76. pgsui/example_data/trees/test.iqtree +0 -376
  77. pgsui/example_data/trees/test.qmat +0 -5
  78. pgsui/example_data/trees/test.rate +0 -2033
  79. pgsui/example_data/trees/test.tre +0 -1
  80. pgsui/example_data/trees/test_n10.rate +0 -19
  81. pgsui/example_data/trees/test_n100.rate +0 -109
  82. pgsui/example_data/trees/test_n500.rate +0 -509
  83. pgsui/example_data/trees/test_siterates.txt +0 -2024
  84. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  85. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  86. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  87. pgsui/example_data/vcf_files/test.vcf +0 -244
  88. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  89. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  90. pgsui/impute/estimators.py +0 -735
  91. pgsui/impute/impute.py +0 -1486
  92. pgsui/impute/simple_imputers.py +0 -1439
  93. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -785
  94. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1027
  95. pgsui/impute/unsupervised/keras_classifiers.py +0 -702
  96. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  97. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1424
  98. pgsui/impute/unsupervised/neural_network_methods.py +0 -1549
  99. pgsui/pg_sui.py +0 -261
  100. pgsui/utils/sequence_tools.py +0 -407
  101. simulation/sim_benchmarks.py +0 -333
  102. simulation/sim_treeparams.py +0 -475
  103. test/__init__.py +0 -0
  104. test/pg_sui_simtest.py +0 -215
  105. test/pg_sui_testing.py +0 -523
  106. test/test.py +0 -297
  107. test/test_pgsui.py +0 -374
  108. test/test_tkc.py +0 -214
  109. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info/licenses}/LICENSE +0 -0
  110. /pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  111. /pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  112. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
@@ -1,1439 +0,0 @@
1
- import os
2
- import sys
3
- from pathlib import Path
4
- import warnings
5
- from typing import Optional, Union, List, Dict, Tuple, Any, Callable
6
-
7
- # Third-party imports
8
- import numpy as np
9
- import pandas as pd
10
- import scipy.linalg
11
- import toyplot.pdf
12
- import toyplot as tp
13
- import toytree as tt
14
- from decimal import Decimal
15
-
16
- from sklearn.impute import SimpleImputer
17
-
18
- # Custom imports
19
- try:
20
- from snpio import GenotypeData
21
- from ..utils.misc import isnotebook
22
- except (ModuleNotFoundError, ValueError, ImportError):
23
- from snpio import GenotypeData
24
- from utils.misc import isnotebook
25
-
26
- is_notebook = isnotebook()
27
-
28
- if is_notebook:
29
- from tqdm.notebook import tqdm as progressbar
30
- else:
31
- from tqdm import tqdm as progressbar
32
-
33
- # Pandas on pip gives a performance warning when doing the below code.
34
- # Apparently it's a bug that exists in the pandas version I used here.
35
- # It can be safely ignored.
36
- warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
37
-
38
-
39
- class ImputePhylo:
40
- """Impute missing data using a phylogenetic tree to inform the imputation.
41
-
42
- Args:
43
- genotype_data (GenotypeData instance): GenotypeData instance. Must have the q, tree, and optionally site_rates attributes defined.
44
-
45
- minbr (float or None, optional): Minimum branch length. Defaults to 0.0000000001
46
-
47
- str_encodings (Dict[str, int], optional): Integer encodings used in STRUCTURE-formatted file. Should be a dictionary with keys=nucleotides and values=integer encodings. The missing data encoding should also be included. Argument is ignored if using a PHYLIP-formatted file. Defaults to {"A": 1, "C": 2, "G": 3, "T": 4, "N": -9}
48
-
49
- prefix (str, optional): Prefix to use with output files. Defaults to "imputer".
50
-
51
- save_plots (bool, optional): Whether to save PDF files with genotype imputations for each site to disk. It makes one PDF file per locus, so if you have a lot of loci it will make a lot of PDF files. Defaults to False.
52
-
53
- disable_progressbar (bool, optional): Whether to disable the progress bar during the imputation. Defaults to False.
54
-
55
- kwargs (Dict[str, Any] or None, optional): Additional keyword arguments intended for internal purposes only. Possible arguments: {"column_subset": List[int] or numpy.ndarray[int]}; Subset SNPs by a list of indices for IterativeImputer. Defauls to None.
56
-
57
- Attributes:
58
- imputed (GenotypeData): New GenotypeData instance with imputed data.
59
-
60
- Example:
61
- >>>data = GenotypeData(
62
- >>> filename="test.str",
63
- >>> filetype="structure",
64
- >>> popmapfile="test.popmap",
65
- >>> guidetree="test.tre",
66
- >>> qmatrix_iqtree="test.iqtree",
67
- >>> siterates_iqtree="test.rates",
68
- >>>)
69
- >>>
70
- >>>phylo = ImputePhylo(
71
- >>> genotype_data=data,
72
- >>> save_plots=True,
73
- >>>)
74
- >>> # Get GenotypeData object.
75
- >>>gd_phylo = phylo.imputed
76
- """
77
-
78
- def __init__(
79
- self,
80
- genotype_data: Optional[Any],
81
- minbr: Optional[float] = 0.0000000001,
82
- *,
83
- str_encodings: Dict[str, int] = {
84
- "A": 1,
85
- "C": 2,
86
- "G": 3,
87
- "T": 4,
88
- "N": -9,
89
- },
90
- prefix: str = "imputer",
91
- save_plots: bool = False,
92
- disable_progressbar: bool = False,
93
- **kwargs: Optional[Dict[str, Any]],
94
- ) -> None:
95
- self.genotype_data = genotype_data
96
- self.alnfile = genotype_data.filename
97
- self.filetype = genotype_data.filetype
98
- self.popmap = genotype_data.popmap
99
- self.str_encodings = str_encodings
100
- self.prefix = prefix
101
- self.minbr = minbr
102
- self.save_plots = save_plots
103
- self.disable_progressbar = disable_progressbar
104
- self.column_subset = kwargs.get("column_subset", None)
105
- self.validation_mode = kwargs.get("validation_mode", False)
106
-
107
- self.valid_sites = None
108
- self.valid_sites_count = None
109
-
110
- self._validate_arguments(genotype_data)
111
- data, tree, q, site_rates = self._parse_arguments(genotype_data)
112
-
113
- if not self.validation_mode:
114
- imputed012 = self.impute_phylo(tree, data, q, site_rates)
115
- genotype_data = genotype_data.copy()
116
- genotype_data.snp_data = genotype_data.decode_012(
117
- imputed012, prefix=prefix, write_output=False
118
- )
119
- self.imputed = genotype_data
120
- else:
121
- self.imputed = self.impute_phylo(tree, data, q, site_rates)
122
-
123
- @property
124
- def genotypes_012(self):
125
- return self.imputed.genotypes012
126
-
127
- @property
128
- def snp_data(self):
129
- return self.imputed.snp_data
130
-
131
- @property
132
- def alignment(self):
133
- return self.imputed.alignment
134
-
135
- def impute_phylo(
136
- self,
137
- tree: tt.tree,
138
- genotypes: Dict[str, List[Union[str, int]]],
139
- Q: pd.DataFrame,
140
- site_rates=None,
141
- minbr=0.0000000001,
142
- ) -> pd.DataFrame:
143
- """Imputes genotype values with a guide tree.
144
-
145
- Imputes genotype values by using a provided guide
146
- tree to inform the imputation, assuming maximum parsimony.
147
-
148
- Process Outline:
149
- For each SNP:
150
- 1) if site_rates, get site-transformated Q matrix.
151
-
152
- 2) Postorder traversal of tree to compute ancestral
153
- state likelihoods for internal nodes (tips -> root).
154
- If exclude_N==True, then ignore N tips for this step.
155
-
156
- 3) Preorder traversal of tree to populate missing genotypes
157
- with the maximum likelihood state (root -> tips).
158
-
159
- Args:
160
- tree (toytree.tree object): Input tree.
161
-
162
- genotypes (Dict[str, List[Union[str, int]]]): Dictionary with key=sampleids, value=sequences.
163
-
164
- Q (pandas.DataFrame): Rate Matrix Q from .iqtree or separate file.
165
-
166
- site_rates (List): Site-specific substitution rates (used to weight per-site Q)
167
-
168
- minbr (float) : Minimum branch length (those below this value will be treated as == minbr)
169
-
170
- Returns:
171
- pandas.DataFrame: Imputed genotypes.
172
-
173
- Raises:
174
- IndexError: If index does not exist when trying to read genotypes.
175
- AssertionError: Sites must have same lengths.
176
- AssertionError: Missing data still found after imputation.
177
- """
178
- try:
179
- if list(genotypes.values())[0][0][1] == "/":
180
- genotypes = self._str2iupac(genotypes, self.str_encodings)
181
- except IndexError:
182
- if self._is_int(list(genotypes.values())[0][0][0]):
183
- raise
184
-
185
- if self.column_subset is not None:
186
- if isinstance(self.column_subset, np.ndarray):
187
- self.column_subset = self.column_subset.tolist()
188
-
189
- genotypes = {
190
- k: [v[i] for i in self.column_subset]
191
- for k, v in genotypes.items()
192
- }
193
-
194
- # For each SNP:
195
- nsites = list(set([len(v) for v in genotypes.values()]))
196
- assert len(nsites) == 1, "Some sites have different lengths!"
197
-
198
- outdir = f"{self.prefix}_imputation_plots"
199
-
200
- if self.save_plots:
201
- Path(outdir).mkdir(parents=True, exist_ok=True)
202
-
203
- for snp_index in progressbar(
204
- range(nsites[0]),
205
- desc="Feature Progress: ",
206
- leave=True,
207
- disable=self.disable_progressbar,
208
- ):
209
- rate = 1.0
210
- if site_rates is not None:
211
- rate = site_rates[snp_index]
212
-
213
- site_Q = Q.copy(deep=True) * rate
214
-
215
- bads = list()
216
- for samp in genotypes.keys():
217
- if genotypes[samp][snp_index].upper() == "N":
218
- bads.append(samp)
219
-
220
- # postorder traversal to compute likelihood at root
221
- node_lik = dict()
222
- for node in tree.treenode.traverse("postorder"):
223
- if node.is_leaf():
224
- continue
225
-
226
- if node.idx not in node_lik:
227
- node_lik[node.idx] = [1.0, 1.0, 1.0, 1.0]
228
-
229
- for child in node.get_children():
230
- # get branch length to child
231
- # bl = child.edge.length
232
- # get transition probs
233
- d = child.dist
234
- if d < minbr:
235
- d = minbr
236
- pt = self._transition_probs(site_Q, d)
237
- if child.is_leaf():
238
- if child.name in genotypes:
239
- if child.name in bads:
240
- sum = [1.0, 1.0, 1.0, 1.0]
241
- else:
242
- # get genotype data
243
- sum = None
244
- for allele in self._get_iupac_full(
245
- genotypes[child.name][snp_index]
246
- ):
247
- if sum is None:
248
- sum = [
249
- Decimal(x)
250
- for x in list(pt[allele])
251
- ]
252
- else:
253
- sum = [
254
- Decimal(sum[i]) + Decimal(val)
255
- for i, val in enumerate(
256
- list(pt[allele])
257
- )
258
- ]
259
- node_lik[child.idx] = [Decimal(x) for x in sum]
260
-
261
- # add to likelihood for parent node
262
- if node_lik[node.idx] is None:
263
- node_lik[node.idx] = node_lik[child.idx]
264
- else:
265
- node_lik[node.idx] = [
266
- Decimal(node_lik[child.idx][i])
267
- * Decimal(val)
268
- for i, val in enumerate(node_lik[node.idx])
269
- ]
270
- else:
271
- # raise error
272
- sys.exit(
273
- f"Error: Taxon {child.name} not found in "
274
- f"genotypes"
275
- )
276
- else:
277
- l = self._get_internal_lik(pt, node_lik[child.idx])
278
- if node_lik[node.idx] is None:
279
- node_lik[node.idx] = [Decimal(x) for x in l]
280
-
281
- else:
282
- node_lik[node.idx] = [
283
- Decimal(l[i]) * Decimal(val)
284
- for i, val in enumerate(node_lik[node.idx])
285
- ]
286
-
287
- # preorder traversal to get marginal reconstructions at internal
288
- # nodes
289
- marg = node_lik.copy()
290
- for node in tree.treenode.traverse("preorder"):
291
- if node.is_root():
292
- continue
293
- elif node.is_leaf():
294
- continue
295
- lik_arr = marg[node.idx]
296
- parent_arr = marg[node.up.idx]
297
- marg[node.idx] = [
298
- Decimal(lik) * (Decimal(parent_arr[i]) / Decimal(lik))
299
- for i, lik in enumerate(lik_arr)
300
- ]
301
-
302
- # get marginal reconstructions for bad bois
303
- two_pass = dict()
304
- for samp in bads:
305
- # get most likely state for focal tip
306
- node = tree.idx_dict[
307
- tree.get_mrca_idx_from_tip_labels(names=samp)
308
- ]
309
- dist = node.dist
310
- parent = node.up
311
- imputed = None
312
- pt = self._transition_probs(site_Q, dist)
313
- lik = self._get_internal_lik(pt, marg[parent.idx])
314
-
315
- tol = 0.001
316
- imputed = self._get_imputed_nuc(lik)
317
-
318
- # two_pass[samp] = [imputed, lik]
319
- genotypes[samp][snp_index] = imputed
320
-
321
- # DEPRECATED: RE-ROOTING METHOD OF YANG ET AL
322
- # NEW METHOD (ABOVE) IS LINEAR
323
- # reroot=dict()
324
- # for samp in bads:
325
- # #focaltree = tree.drop_tips(names=[x for x in bads if x != samp])
326
- # focaltree = tree.root(names=[samp])
327
- #
328
- # mystyle = {
329
- # "edge_type": "p",
330
- # "edge_style": {
331
- # "stroke-width": 1,
332
- # },
333
- # "tip_labels_align": True,
334
- # "tip_labels_style": {"font-size": "5px"},
335
- # "node_labels": False,
336
- # }
337
- #
338
- # canvas, axes, mark = focaltree.draw()
339
- # toyplot.pdf.render(canvas, "test.pdf")
340
- #
341
- # #postorder traversal to compute likelihood
342
- # node_lik = dict()
343
- # for node in focaltree.treenode.traverse("postorder"):
344
- # if node.is_leaf():
345
- # continue
346
- #
347
- # if node.idx not in node_lik:
348
- # node_lik[node.idx] = None
349
- #
350
- # for child in node.get_children():
351
- # # get branch length to child
352
- # # bl = child.edge.length
353
- # # get transition probs
354
- # pt = self._transition_probs(site_Q, child.dist)
355
- # if child.is_leaf():
356
- # if child.name in genotypes:
357
- # if child.name in bads:
358
- # sum = [1.0, 1.0, 1.0, 1.0]
359
- # else:
360
- # # get genotype data
361
- # sum = None
362
- # for allele in self._get_iupac_full(
363
- # genotypes[child.name][snp_index]
364
- # ):
365
- # if sum is None:
366
- # sum = [Decimal(x) for x in list(pt[allele])]
367
- # else:
368
- # sum = [
369
- # Decimal(sum[i]) + Decimal(val)
370
- # for i, val in enumerate(
371
- # list(pt[allele])
372
- # )
373
- # ]
374
- #
375
- # node_lik[child.idx] = [Decimal(x) for x in sum]
376
- #
377
- # #add to likelihood for parent node
378
- # if node_lik[node.idx] is None:
379
- # node_lik[node.idx] = node_lik[child.idx]
380
- # else:
381
- # node_lik[node.idx] = [
382
- # Decimal(node_lik[child.idx][i]) * Decimal(val)
383
- # for i, val in enumerate(node_lik[node.idx])
384
- # ]
385
- # else:
386
- # # raise error
387
- # sys.exit(
388
- # f"Error: Taxon {child.name} not found in "
389
- # f"genotypes"
390
- # )
391
- # else:
392
- # l = self._get_internal_lik(pt, node_lik[child.idx])
393
- # if node_lik[node.idx] is None:
394
- # node_lik[node.idx] = [Decimal(x) for x in l]
395
- #
396
- # else:
397
- # node_lik[node.idx] = [
398
- # Decimal(l[i]) * Decimal(val)
399
- # for i, val in enumerate(node_lik[node.idx])
400
- # ]
401
- #
402
- # # get most likely state for focal tip
403
- # node = focaltree.idx_dict[
404
- # focaltree.get_mrca_idx_from_tip_labels(names=samp)
405
- # ]
406
- # dist = node.dist
407
- # parent = node.up
408
- # imputed = None
409
- # pt = self._transition_probs(site_Q, dist)
410
- # lik = self._get_internal_lik(pt, node_lik[parent.idx])
411
- # maxpos = lik.index(max(lik))
412
- # if maxpos == 0:
413
- # imputed = "A"
414
- #
415
- # elif maxpos == 1:
416
- # imputed = "C"
417
- #
418
- # elif maxpos == 2:
419
- # imputed = "G"
420
- #
421
- # else:
422
- # imputed = "T"
423
- # reroot[samp] = [imputed, lik]
424
- # check if two methods give same results
425
- # for key in two_pass:
426
- # if two_pass[key][0] != reroot[key][0]:
427
- # print("Two-pass:", two_pass[key][0], "-", two_pass[key][1])
428
- # print("Reroot:", reroot[key][0], "-", reroot[key][1])
429
-
430
- if self.save_plots:
431
- self._draw_imputed_position(
432
- tree,
433
- bads,
434
- genotypes,
435
- snp_index,
436
- f"{outdir}/{self.prefix}_pos{snp_index}.pdf",
437
- )
438
-
439
- df = pd.DataFrame.from_dict(genotypes, orient="index")
440
-
441
- # Make sure no missing data remains in the dataset
442
- assert (
443
- not df.isin([-9]).any().any()
444
- ), "Imputation failed...Missing values found in the imputed dataset"
445
-
446
- (
447
- imp_snps,
448
- self.valid_sites,
449
- self.valid_sites_count,
450
- ) = self.genotype_data.convert_012(
451
- df.to_numpy().tolist(), impute_mode=True
452
- )
453
-
454
- df_imp = pd.DataFrame.from_records(imp_snps)
455
-
456
- return df_imp
457
-
458
- def nbiallelic(self) -> int:
459
- """Get the number of remaining bi-allelic sites after imputation.
460
-
461
- Returns:
462
- int: Number of bi-allelic sites remaining after imputation.
463
- """
464
- return len(self.imputed.columns)
465
-
466
- def _get_imputed_nuc(self, lik_arr):
467
- nucmap = {0: "A", 1: "C", 2: "G", 3: "T"}
468
- maxpos = lik_arr.index(max(lik_arr))
469
- picks = set([maxpos])
470
- # NOT USED:
471
- # Experimenting with ways to impute heterozygotes.
472
- # Note that LRT isn't appropriate (as I used here) because
473
- # the models are not nested & LRTS isn't necessarily expected
474
- # to be chisq distributed.
475
- # Check out Vuong test and read Lewis et al 2011 (doi: 10.1111/j.2041-210X.2010.00063.x)
476
- #
477
- # for index, alt in enumerate(lik_arr):
478
- # if index == maxpos:
479
- # continue
480
- # else:
481
- # lr = lrt(lik_arr[maxpos], alt, loglik=False)
482
- # p = chi2.sf(lr)
483
- # print(nucmap[maxpos], ":", str(lrt(lik_arr[maxpos], alt, loglik=False)), p)
484
- return nucmap[maxpos]
485
-
486
- def _parse_arguments(
487
- self, genotype_data: Any
488
- ) -> Tuple[Dict[str, List[Union[int, str]]], tt.tree, pd.DataFrame]:
489
- """Determine which arguments were specified and set appropriate values.
490
-
491
- Args:
492
- genotype_data (GenotypeData object): Initialized GenotypeData object.
493
-
494
- Returns:
495
- Dict[str, List[Union[int, str]]]: GenotypeData.snpsdict object. If genotype_data is not None, then this value gets set from the GenotypeData.snpsdict object. If alnfile is not None, then the alignment file gets read and the snpsdict object gets set from the alnfile.
496
-
497
- toytree.tree: Input phylogeny, either read from GenotypeData object or supplied with treefile.
498
-
499
- pandas.DataFrame: Q Rate Matrix, either from IQ-TREE file or from its own supplied file.
500
- """
501
- data = genotype_data.snpsdict
502
- tree = genotype_data.tree
503
-
504
- # read (optional) Q-matrix
505
- if genotype_data.q is not None:
506
- q = genotype_data.q
507
- else:
508
- raise TypeError("q must be defined in GenotypeData instance.")
509
-
510
- # read (optional) site-specific substitution rates
511
- if genotype_data.site_rates is not None:
512
- site_rates = genotype_data.site_rates
513
- else:
514
- raise TypeError(
515
- "site rates must be defined in GenotypeData instance."
516
- )
517
-
518
- return data, tree, q, site_rates
519
-
520
- def _validate_arguments(self, genotype_data: Any) -> None:
521
- """Validate that the correct arguments were supplied.
522
-
523
- Args:
524
- genotype_data (GenotypeData object): Input GenotypeData instance.
525
-
526
- Raises:
527
- TypeError: Must define genotype_data.tree in GenotypeData instance.
528
- TypeError: Q rate matrix must be defined in GenotypeData instance.
529
- """
530
-
531
- if genotype_data.tree is None:
532
- raise TypeError("genotype_data.tree must be defined")
533
-
534
- if genotype_data.q is None:
535
- raise TypeError("q must be defined in GenotypeData instance.")
536
-
537
- def _print_q(self, q: pd.DataFrame) -> None:
538
- """Print Rate Matrix Q.
539
-
540
- Args:
541
- q (pandas.DataFrame): Rate Matrix Q.
542
- """
543
- print("Rate matrix Q:")
544
- print("\tA\tC\tG\tT\t")
545
- for nuc1 in ["A", "C", "G", "T"]:
546
- print(nuc1, end="\t")
547
- for nuc2 in ["A", "C", "G", "T"]:
548
- print(q[nuc1][nuc2], end="\t")
549
- print("")
550
-
551
- def _is_int(self, val: Union[str, int]) -> bool:
552
- """Check if value is integer.
553
-
554
- Args:
555
- val (int or str): Value to check.
556
-
557
- Returns:
558
- bool: True if integer, False if string.
559
- """
560
- try:
561
- num = int(val)
562
- except ValueError:
563
- return False
564
- return True
565
-
566
- def _get_nuc_colors(self, nucs: List[str]) -> List[str]:
567
- """Get colors for each nucleotide when plotting.
568
-
569
- Args:
570
- nucs (List[str]): Nucleotides at current site.
571
-
572
- Returns:
573
- List[str]: Hex-code color values for each IUPAC nucleotide.
574
- """
575
- ret = list()
576
- for nuc in nucs:
577
- nuc = nuc.upper()
578
- if nuc == "A":
579
- ret.append("#0000FF") # blue
580
- elif nuc == "C":
581
- ret.append("#FF0000") # red
582
- elif nuc == "G":
583
- ret.append("#00FF00") # green
584
- elif nuc == "T":
585
- ret.append("#FFFF00") # yellow
586
- elif nuc == "R":
587
- ret.append("#0dbaa9") # blue-green
588
- elif nuc == "Y":
589
- ret.append("#FFA500") # orange
590
- elif nuc == "K":
591
- ret.append("#9acd32") # yellow-green
592
- elif nuc == "M":
593
- ret.append("#800080") # purple
594
- elif nuc == "S":
595
- ret.append("#964B00")
596
- elif nuc == "W":
597
- ret.append("#C0C0C0")
598
- else:
599
- ret.append("#000000")
600
- return ret
601
-
602
- def _label_bads(
603
- self, tips: List[str], labels: List[str], bads: List[str]
604
- ) -> List[str]:
605
- """Insert asterisks around bad nucleotides.
606
-
607
- Args:
608
- tips (List[str]): Tip labels (sample IDs).
609
- labels (List[str]): List of nucleotides at current site.
610
- bads (List[str]): List of tips that have missing data at current site.
611
-
612
- Returns:
613
- List[str]: IUPAC Nucleotides with "*" inserted around tips that had missing data.
614
- """
615
- for i, t in enumerate(tips):
616
- if t in bads:
617
- labels[i] = "*" + str(labels[i]) + "*"
618
- return labels
619
-
620
- def _draw_imputed_position(
621
- self,
622
- tree: tt.tree,
623
- bads: List[str],
624
- genotypes: Dict[str, List[str]],
625
- pos: int,
626
- out: str = "tree.pdf",
627
- ) -> None:
628
- """Draw nucleotides at phylogeny tip and saves to file on disk.
629
-
630
- Draws nucleotides as tip labels for the current SNP site. Imputed values have asterisk surrounding the nucleotide label. The tree is converted to a toyplot object and saved to file.
631
-
632
- Args:
633
- tree (toytree.tree): Input tree object.
634
- bads (List[str]): List of sampleIDs that have missing data at the current SNP site.
635
- genotypes (Dict[str, List[str]]): Genotypes at all SNP sites.
636
- pos (int): Current SNP index.
637
- out (str, optional): Output filename for toyplot object.
638
- """
639
-
640
- # print(tree.get_tip_labels())
641
- sizes = [8 if i in bads else 0 for i in tree.get_tip_labels()]
642
- colors = [genotypes[i][pos] for i in tree.get_tip_labels()]
643
- labels = colors
644
-
645
- labels = self._label_bads(tree.get_tip_labels(), labels, bads)
646
-
647
- colors = self._get_nuc_colors(colors)
648
-
649
- mystyle = {
650
- "edge_type": "p",
651
- "edge_style": {
652
- "stroke": tt.colors[0],
653
- "stroke-width": 1,
654
- },
655
- "tip_labels_align": True,
656
- "tip_labels_style": {"font-size": "5px"},
657
- "node_labels": False,
658
- }
659
-
660
- canvas, axes, mark = tree.draw(
661
- tip_labels_colors=colors,
662
- tip_labels=labels,
663
- width=400,
664
- height=600,
665
- **mystyle,
666
- )
667
-
668
- toyplot.pdf.render(canvas, out)
669
-
670
- def _all_missing(
671
- self,
672
- tree: tt.tree,
673
- node_index: int,
674
- snp_index: int,
675
- genotypes: Dict[str, List[str]],
676
- ) -> bool:
677
- """Check if all descendants of a clade have missing data at SNP site.
678
-
679
- Args:
680
- tree (toytree.tree): Input guide tree object.
681
-
682
- node_index (int): Parent node to determine if all descendants have missing data.
683
-
684
- snp_index (int): Index of current SNP site.
685
-
686
- genotypes (Dict[str, List[str]]): Genotypes at all SNP sites.
687
-
688
- Returns:
689
- bool: True if all descendants have missing data, otherwise False.
690
- """
691
- for des in tree.get_tip_labels(idx=node_index):
692
- if genotypes[des][snp_index].upper() not in ["N", "-"]:
693
- return False
694
- return True
695
-
696
- def _get_internal_lik(
697
- self, pt: pd.DataFrame, lik_arr: List[float]
698
- ) -> List[float]:
699
- """Get ancestral state likelihoods for internal nodes of the tree.
700
-
701
- Postorder traversal to calculate internal ancestral state likelihoods (tips -> root).
702
-
703
- Args:
704
- pt (pandas.DataFrame): Transition probabilities calculated from Rate Matrix Q.
705
- lik_arr (List[float]): Likelihoods for nodes or leaves.
706
-
707
- Returns:
708
- List[float]: Internal likelihoods.
709
- """
710
- ret = list()
711
- for i, val in enumerate(lik_arr):
712
- col = list(pt.iloc[:, i])
713
- sum = Decimal(0.0)
714
- for v in col:
715
- sum += Decimal(v) * Decimal(val)
716
- ret.append(sum)
717
- return ret
718
-
719
- def _transition_probs(self, Q: pd.DataFrame, t: float) -> pd.DataFrame:
720
- """Get transition probabilities for tree.
721
-
722
- Args:
723
- Q (pd.DataFrame): Rate Matrix Q.
724
- t (float): Tree distance of child.
725
-
726
- Returns:
727
- pd.DataFrame: Transition probabilities.
728
- """
729
- ret = Q.copy(deep=True)
730
- m = Q.to_numpy()
731
- pt = scipy.linalg.expm(m * t)
732
- ret[:] = pt
733
- return ret
734
-
735
- def _str2iupac(
736
- self, genotypes: Dict[str, List[str]], str_encodings: Dict[str, int]
737
- ) -> Dict[str, List[str]]:
738
- """Convert STRUCTURE-format encodings to IUPAC bases.
739
-
740
- Args:
741
- genotypes (Dict[str, List[str]]): Genotypes at all sites.
742
- str_encodings (Dict[str, int]): Dictionary that maps IUPAC bases (keys) to integer encodings (values).
743
-
744
- Returns:
745
- Dict[str, List[str]]: Genotypes converted to IUPAC format.
746
- """
747
- a = str_encodings["A"]
748
- c = str_encodings["C"]
749
- g = str_encodings["G"]
750
- t = str_encodings["T"]
751
- n = str_encodings["N"]
752
- nuc = {
753
- f"{a}/{a}": "A",
754
- f"{c}/{c}": "C",
755
- f"{g}/{g}": "G",
756
- f"{t}/{t}": "T",
757
- f"{n}/{n}": "N",
758
- f"{a}/{c}": "M",
759
- f"{c}/{a}": "M",
760
- f"{a}/{g}": "R",
761
- f"{g}/{a}": "R",
762
- f"{a}/{t}": "W",
763
- f"{t}/{a}": "W",
764
- f"{c}/{g}": "S",
765
- f"{g}/{c}": "S",
766
- f"{c}/{t}": "Y",
767
- f"{t}/{c}": "Y",
768
- f"{g}/{t}": "K",
769
- f"{t}/{g}": "K",
770
- }
771
-
772
- for k, v in genotypes.items():
773
- for i, gt in enumerate(v):
774
- v[i] = nuc[gt]
775
-
776
- return genotypes
777
-
778
- def _get_iupac_full(self, char: str) -> List[str]:
779
- """Map nucleotide to list of expanded IUPAC encodings.
780
-
781
- Args:
782
- char (str): Current nucleotide.
783
-
784
- Returns:
785
- List[str]: List of nucleotides in ``char`` expanded IUPAC.
786
- """
787
- char = char.upper()
788
- iupac = {
789
- "A": ["A"],
790
- "G": ["G"],
791
- "C": ["C"],
792
- "T": ["T"],
793
- "N": ["A", "C", "T", "G"],
794
- "-": ["A", "C", "T", "G"],
795
- "R": ["A", "G"],
796
- "Y": ["C", "T"],
797
- "S": ["G", "C"],
798
- "W": ["A", "T"],
799
- "K": ["G", "T"],
800
- "M": ["A", "C"],
801
- "B": ["C", "G", "T"],
802
- "D": ["A", "G", "T"],
803
- "H": ["A", "C", "T"],
804
- "V": ["A", "C", "G"],
805
- }
806
-
807
- ret = iupac[char]
808
- return ret
809
-
810
-
811
class ImputeAlleleFreq:
    """Impute missing 012 genotypes with the most frequent genotype per locus.

    With ``by_populations=False`` each locus (column) is filled with its global mode. With ``by_populations=True`` each locus is filled with the mode within each population, falling back to the global mode for loci where some population has no observed genotypes. A list of population IDs in the appropriate format can be obtained from the GenotypeData object as ``GenotypeData.populations``.

    Args:
        genotype_data (GenotypeData object): GenotypeData instance. May only be None when ``gt`` is supplied via ``kwargs``, ``validation_mode=True`` and ``by_populations=False``.

        by_populations (bool, optional): Whether to impute per population (True) or globally (False). Defaults to False.

        diploid (bool, optional): Stored on the instance as ``self.diploid``; not referenced by the imputation code in this class. Defaults to True.

        default (int, optional): Stored on the instance as ``self.default``; not referenced by the imputation code in this class (fully missing loci are called 0). Defaults to 0.

        missing (int, optional): Missing data value. Defaults to -9.

        verbose (bool, optional): Whether to print status updates. Set to False for no status updates. Defaults to True.

        prefix (str, optional): Prefix of the output directory used by ``write2file`` and passed to ``decode_012``. Defaults to "imputer".

        kwargs (Dict[str, Any]): Additional keyword arguments, primarily for internal purposes. Options include: {"iterative_mode": bool, "validation_mode": bool, "gt": List[List[int]]}. ``iterative_mode`` makes ``fit_predict`` return float32 data instead of Int8. ``validation_mode`` makes ``self.imputed`` the raw imputed array instead of a GenotypeData copy. ``gt`` supplies the 012 genotypes directly instead of reading them from ``genotype_data``.

    Raises:
        TypeError: If both ``genotype_data`` and ``gt`` are None, if ``genotype_data`` is None where it is required, or if ``by_populations`` is True and ``genotype_data.populations`` is None.

    Attributes:
        imputed (GenotypeData or numpy.ndarray): GenotypeData copy with imputed data, or the imputed array when ``validation_mode`` is True.

        valid_cols (List[int]): Column labels imputed directly from per-population modes (empty in global mode).

    Example:
        >>>data = GenotypeData(
        >>>    filename="test.str",
        >>>    filetype="structure2rowPopID",
        >>>    popmapfile="test.popmap",
        >>>)
        >>>
        >>>afpop = ImputeAlleleFreq(
        >>>    genotype_data=data,
        >>>    by_populations=True,
        >>>)
        >>>
        >>>gd_afpop = afpop.imputed
    """

    def __init__(
        self,
        genotype_data: "GenotypeData",
        *,
        by_populations: bool = False,
        diploid: bool = True,
        default: int = 0,
        missing: int = -9,
        verbose: bool = True,
        prefix="imputer",
        **kwargs: Dict[str, Any],
    ) -> None:
        # Read ``gt`` before validating: the check below needs it.
        gt = kwargs.get("gt", None)

        if genotype_data is None and gt is None:
            raise TypeError("GenotypeData instance or gt must be provided.")

        self.diploid = diploid
        self.default = default
        self.missing = missing
        self.verbose = verbose
        # write2file() builds its output directory from this prefix.
        self.prefix = prefix
        self.iterative_mode = kwargs.get("iterative_mode", False)
        self.validation_mode = kwargs.get("validation_mode", False)

        # Populations and the decoded output both come from genotype_data,
        # so ``gt`` alone only suffices for global-mode validation runs.
        if genotype_data is None and (
            by_populations or not self.validation_mode
        ):
            raise TypeError(
                "GenotypeData instance must be provided unless gt is "
                "supplied with validation_mode=True and by_populations=False."
            )

        if gt is None:
            gt_list = genotype_data.genotypes_012(fmt="list")
        else:
            gt_list = gt

        if by_populations:
            if genotype_data.populations is None:
                raise TypeError(
                    "When by_populations is True, GenotypeData instance must have a defined populations attribute"
                )
            self.pops = genotype_data.populations
        else:
            self.pops = None

        if not self.validation_mode:
            imputed012, self.valid_cols = self.fit_predict(gt_list)
            # Work on a copy so the caller's GenotypeData is left untouched.
            genotype_data = genotype_data.copy()
            genotype_data.snp_data = genotype_data.decode_012(
                imputed012, prefix=prefix, write_output=False
            )
            genotype_data.genotypes_012 = imputed012
            self.imputed = genotype_data
        else:
            self.imputed, self.valid_cols = self.fit_predict(gt_list)

    @property
    def genotypes_012(self):
        """012-encoded genotypes of the imputed GenotypeData object."""
        return self.imputed.genotypes_012

    @property
    def snp_data(self):
        """Decoded SNP data of the imputed GenotypeData object."""
        return self.imputed.snp_data

    @property
    def alignment(self):
        """Alignment of the imputed GenotypeData object."""
        return self.imputed.alignment

    def fit_predict(
        self, X: List[List[int]]
    ) -> Tuple[
        Union[pd.DataFrame, np.ndarray, List[List[Union[int, float]]]],
        List[int],
    ]:
        """Impute missing genotypes with the per-locus mode.

        Uses per-population modes when ``self.pops`` is set, otherwise the global mode. Entries equal to ``self.missing`` are treated as missing.

        Args:
            X (List[List[int]], numpy.ndarray, or pandas.DataFrame): 012-encoded genotypes obtained from the GenotypeData object.

        Returns:
            List[List[Union[int, float]]] or numpy.ndarray: Imputed genotypes of same shape as data. A nested list unless ``validation_mode`` is True, in which case the underlying array is returned.

            List[int]: Column labels imputed directly from per-population modes (empty in global mode).

        Raises:
            TypeError: X must be either list, np.ndarray, or pd.DataFrame.
        """
        import warnings

        if self.verbose:
            if self.pops is not None:
                print("\nImputing by population allele frequencies...")
            else:
                print("\nImputing by global allele frequency...")

        if isinstance(X, (list, np.ndarray)):
            df = pd.DataFrame(X)
        elif isinstance(X, pd.DataFrame):
            df = X.copy()
        else:
            raise TypeError(
                f"X must be of type list(list(int)), numpy.ndarray, "
                f"or pandas.DataFrame, but got {type(X)}"
            )

        df = df.astype(int)
        df = df.replace(self.missing, np.nan)

        # Columns are collected in a list and concatenated once at the end.
        columns = []
        valid_cols = []
        bad_cnt = 0

        if self.pops is not None:
            df = df.copy()

            # Impute per-population mode.
            df["pops"] = self.pops
            groups = df.groupby(["pops"], sort=False)

            for col in df.columns:
                try:
                    columns.append(
                        groups[col].transform(
                            lambda x: x.fillna(x.mode().iloc[0])
                        )
                    )

                    if col != "pops":
                        valid_cols.append(col)

                except IndexError as e:
                    # mode() is empty when a population has no observed
                    # genotype at this locus, so .iloc[0] is out of bounds.
                    if not str(e).lower().startswith(
                        "single positional indexer"
                    ):
                        raise

                    bad_cnt += 1
                    # Impute with global mode, unless globally missing in
                    # which case call as 0.0
                    if df[col].isna().all():
                        columns.append(df[col].fillna(0.0))
                    else:
                        columns.append(
                            df[col].fillna(df[col].mode().iloc[0])
                        )

            data = pd.concat(columns, axis=1)

            if bad_cnt > 0 and not self.validation_mode:
                # Actually emit the warning; merely constructing a
                # UserWarning instance is a silent no-op.
                warnings.warn(
                    f"\n{bad_cnt} columns were imputed with the "
                    f"global mode because some of the populations "
                    f"contained only missing data",
                    UserWarning,
                )

            data.drop("pops", axis=1, inplace=True)
        else:
            # Impute global mode.
            imp = SimpleImputer(strategy="most_frequent")

            # Replace any columns that are fully missing.
            all_missing = df.isna().all()
            df.loc[:, all_missing] = df.loc[:, all_missing].fillna(0.0)

            data = pd.DataFrame(imp.fit_transform(df))

        if self.iterative_mode:
            data = data.astype(dtype="float32")
        else:
            data = data.astype(dtype="Int8")

        if self.verbose:
            print("Done!")

        if not self.validation_mode:
            return data.values.tolist(), valid_cols
        return data.values, valid_cols

    def write2file(
        self, X: Union[pd.DataFrame, np.ndarray, List[List[Union[int, float]]]]
    ) -> None:
        """Write imputed data to file on disk.

        The file is written to ``<prefix>_output/alignments/Unsupervised/ImputeAlleleFreq/imputed_012.csv`` without header or index; missing directories are created.

        Args:
            X (pandas.DataFrame, numpy.ndarray, List[List[Union[int, float]]]): Imputed data to write to file.

        Raises:
            TypeError: If X is of unsupported type.
        """
        outfile = os.path.join(
            f"{self.prefix}_output",
            "alignments",
            "Unsupervised",
            "ImputeAlleleFreq",
        )

        Path(outfile).mkdir(parents=True, exist_ok=True)

        outfile = os.path.join(outfile, "imputed_012.csv")

        if isinstance(X, pd.DataFrame):
            df = X
        elif isinstance(X, (np.ndarray, list)):
            df = pd.DataFrame(X)
        else:
            raise TypeError(
                f"Could not write imputed data because it is of incorrect "
                f"type. Got {type(X)}"
            )

        df.to_csv(outfile, header=False, index=False)
1053
-
1054
-
1055
class ImputeMF:
    """Impute missing data using matrix factorization.

    The 012 genotype matrix is approximated as the product of two low-rank factor matrices fitted by stochastic gradient descent on the observed genotypes. Each reconstructed value is then snapped to the nearest genotype observed at the same locus, and only the missing entries are filled in.

    Args:
        genotype_data (GenotypeData object or None): GenotypeData instance. May only be None when ``gt`` is supplied via ``kwargs`` and ``validation_mode=True``.

        latent_features (int, optional): The number of latent variables used to reduce dimensionality of the data. Defaults to 2.

        max_iter (int, optional): Maximum number of passes over the matrix. Defaults to 100.

        learning_rate (float, optional): The learning rate for the gradient updates. Defaults to 0.0002.

        regularization_param (float, optional): Regularization weight applied to both factor matrices. Defaults to 0.02.

        tol (float, optional): Tolerance of the stopping condition; a pass whose loss changes by less than ``tol`` counts as a failure to improve. Defaults to 0.1.

        n_fail (int, optional): Number of consecutive non-improving passes after which fitting stops. Defaults to 20.

        missing (int, optional): Missing data value. Defaults to -9.

        prefix (str, optional): Prefix for writing output files. Defaults to "imputer".

        verbose (bool, optional): Whether to print status updates. Set to False for no status updates. Defaults to True.

        **kwargs (Dict[str, Any]): Additional keyword arguments, primarily for internal purposes. Options include: {"iterative_mode": bool, "validation_mode": bool, "gt": numpy.ndarray}. ``validation_mode`` makes ``self.imputed`` the raw imputed array instead of a GenotypeData copy. ``gt`` supplies the 012 genotypes directly. Other keys are accepted and ignored.

    Attributes:
        imputed (GenotypeData or numpy.ndarray): GenotypeData copy with imputed data, or the imputed array when ``validation_mode`` is True.

    Example:
        >>>data = GenotypeData(
        >>>    filename="test.str",
        >>>    filetype="structure",
        >>>    popmapfile="test.popmap",
        >>>)
        >>>
        >>>nmf = ImputeMF(
        >>>    genotype_data=data,
        >>>)
        >>>
        >>> # Get GenotypeData instance.
        >>>gd_nmf = nmf.imputed

    Raises:
        TypeError: If both ``genotype_data`` and ``gt`` are None, or if ``genotype_data`` is None outside validation mode.
    """

    def __init__(
        self,
        genotype_data,
        *,
        latent_features: int = 2,
        max_iter: int = 100,
        learning_rate: float = 0.0002,
        regularization_param: float = 0.02,
        tol: float = 0.1,
        n_fail: int = 20,
        missing: int = -9,
        prefix: str = "imputer",
        verbose: bool = True,
        **kwargs: Dict[str, Any],
    ) -> None:
        self.max_iter = max_iter
        self.latent_features = latent_features
        self.n_fail = n_fail
        self.learning_rate = learning_rate
        self.tol = tol
        self.regularization_param = regularization_param
        self.missing = missing
        self.prefix = prefix
        self.verbose = verbose
        self.iterative_mode = kwargs.get("iterative_mode", False)
        self.validation_mode = kwargs.get("validation_mode", False)

        gt = kwargs.get("gt", None)

        if genotype_data is None and gt is None:
            raise TypeError("GenotypeData and gt cannot both be NoneType.")

        # The decoded output is built from a GenotypeData copy, so ``gt``
        # alone is only enough when the raw array is all that is wanted.
        if genotype_data is None and not self.validation_mode:
            raise TypeError(
                "GenotypeData instance must be provided unless gt is "
                "supplied with validation_mode=True."
            )

        if gt is None:
            X = genotype_data.genotypes_012(fmt="numpy")
        else:
            X = gt.copy()

        imputed012 = pd.DataFrame(self.fit_predict(X))

        if self.validation_mode:
            self.imputed = imputed012.to_numpy()
        else:
            # Work on a copy so the caller's GenotypeData is left untouched.
            genotype_data = genotype_data.copy()
            genotype_data.snp_data = genotype_data.decode_012(
                imputed012, prefix=prefix, write_output=False
            )
            self.imputed = genotype_data

    @property
    def genotypes_012(self):
        """012-encoded genotypes of the imputed GenotypeData object."""
        return self.imputed.genotypes_012

    @property
    def snp_data(self):
        """Decoded SNP data of the imputed GenotypeData object."""
        return self.imputed.snp_data

    @property
    def alignment(self):
        """Alignment of the imputed GenotypeData object."""
        return self.imputed.alignment

    def fit_predict(self, X):
        """Fit the factorization and fill in the missing genotypes.

        Args:
            X (numpy.ndarray or array-like): 2D matrix of 012-encoded genotypes. Entries equal to ``self.missing`` or negative are treated as missing. The input is not modified.

        Returns:
            numpy.ndarray: Copy of ``X`` in which only the missing entries are replaced by imputed genotypes.
        """
        if self.verbose:
            print("Doing MF imputation...")

        X = np.asarray(X)

        # Shift genotypes by +1 so that 0 can mark "missing" in R and the
        # observed genotypes 0/1/2 become 1/2/3.
        R = X.astype(int)
        R[R == self.missing] = -9
        R = R + 1
        R[R < 0] = 0
        missing_mask = R == 0

        n_row, n_col = R.shape
        p = np.random.rand(n_row, self.latent_features)
        q = np.random.rand(n_col, self.latent_features)
        q_t = q.T
        fails = 0
        e_current = None

        for step in range(self.max_iter):
            # Gradient step on every observed entry; the update order
            # matters, so this stays an explicit loop.
            for i in range(n_row):
                for j in range(n_col):
                    if R[i][j] > 0:
                        eij = R[i][j] - np.dot(p[i, :], q_t[:, j])
                        for k in range(self.latent_features):
                            p[i][k] = p[i][k] + self.learning_rate * (
                                2 * eij * q_t[k][j]
                                - self.regularization_param * p[i][k]
                            )
                            q_t[k][j] = q_t[k][j] + self.learning_rate * (
                                2 * eij * p[i][k]
                                - self.regularization_param * q_t[k][j]
                            )

            # Regularized squared error over the observed entries.
            e = 0
            for i in range(n_row):
                for j in range(n_col):
                    if R[i][j] > 0:
                        e = e + pow(R[i][j] - np.dot(p[i, :], q_t[:, j]), 2)
                        for k in range(self.latent_features):
                            e = e + (self.regularization_param / 2) * (
                                pow(p[i][k], 2) + pow(q_t[k][j], 2)
                            )

            if e_current is None:
                e_current = e
            else:
                # Count consecutive passes whose loss barely changed.
                if abs(e_current - e) < self.tol:
                    fails += 1
                else:
                    fails = 0
                e_current = e

            if fails >= self.n_fail:
                break

        nR = np.dot(p, q_t)

        # Transform values per-column (i.e., only allowing values found in
        # the original column).
        tR = self.transform(R, nR)

        # Insert imputed values for missing genotypes only, on a copy so
        # the caller's array is not modified.
        fR = X.copy()
        fR[missing_mask] = tR[missing_mask]

        if self.verbose:
            print("Done!")

        return fR

    def transform(self, original, predicted):
        """Snap reconstructed values to genotypes observed at each locus.

        Args:
            original (numpy.ndarray): Shifted genotype matrix in which 0 marks a missing entry and observed genotypes are >= 1.

            predicted (numpy.ndarray): Reconstructed matrix of the same shape. It is not modified.

        Returns:
            numpy.ndarray: Matrix shifted back by -1, with each value replaced by the nearest genotype observed in the same column.
        """
        original = np.asarray(original)
        snapped = np.array(predicted)

        for j in range(original.shape[1]):
            observed = original[:, j]
            options = np.unique(observed[observed != 0])
            if options.size == 0:
                # Fully missing locus: nothing to snap to. Use the shifted
                # code for genotype 0 rather than failing on an empty set.
                options = np.array([1])

            # argmin picks the first (smallest) option on ties, matching
            # min(options, key=...) over the sorted unique values.
            distances = np.abs(snapped[:, j][:, None] - options[None, :])
            snapped[:, j] = options[distances.argmin(axis=1)]

        snapped = snapped - 1
        snapped[snapped < 0] = -9
        return snapped

    def accuracy(self, expected, predicted):
        """Return the proportion of observed genotypes that were reproduced.

        Args:
            expected (numpy.ndarray): Genotypes in which negative values mark missing entries.

            predicted (numpy.ndarray): Genotypes of the same shape to compare.

        Returns:
            float: Fraction of the non-negative entries of ``expected`` that are equal in ``predicted``.
        """
        observed = expected >= 0
        prop_same = np.sum(expected[observed] == predicted[observed])
        tot = expected[observed].size
        return prop_same / tot

    def write2file(
        self, X: Union[pd.DataFrame, np.ndarray, List[List[Union[int, float]]]]
    ) -> None:
        """Write imputed data to file on disk.

        The file is written to ``<prefix>_output/alignments/Unsupervised/ImputeMF/imputed_012.csv`` without header or index; missing directories are created.

        Args:
            X (pandas.DataFrame, numpy.ndarray, List[List[Union[int, float]]]): Imputed data to write to file.

        Raises:
            TypeError: If X is of unsupported type.
        """
        outfile = os.path.join(
            f"{self.prefix}_output",
            "alignments",
            "Unsupervised",
            "ImputeMF",
        )

        Path(outfile).mkdir(parents=True, exist_ok=True)

        outfile = os.path.join(outfile, "imputed_012.csv")

        if isinstance(X, pd.DataFrame):
            df = X
        elif isinstance(X, (np.ndarray, list)):
            df = pd.DataFrame(X)
        else:
            raise TypeError(
                f"Could not write imputed data because it is of incorrect "
                f"type. Got {type(X)}"
            )

        df.to_csv(outfile, header=False, index=False)
1278
-
1279
-
1280
class ImputeRefAllele:
    """Impute missing data by reference allele.

    Numeric (012-encoded) genotypes have their missing entries set to 0. Non-numeric genotypes have their missing entries set, column by column, to the reference alleles taken from ``genotype_data.ref``.

    Args:
        genotype_data (GenotypeData object): GenotypeData instance. Always required because the reference alleles are read from it.

        missing (int, optional): Missing data value. Defaults to -9.

        prefix (str, optional): Prefix of the output directory used by ``write2file`` and passed to ``decode_012``. Defaults to "imputer".

        verbose (bool, optional): Whether to print status updates. Set to False for no status updates. Defaults to True.

        kwargs (Dict[str, Any]): Additional keyword arguments, primarily for internal purposes. Options include: {"iterative_mode": bool, "validation_mode": bool, "gt": List[List[int]]}. ``validation_mode`` makes ``self.imputed`` the raw imputed array instead of a GenotypeData copy. ``gt`` supplies the genotypes directly instead of reading them from ``genotype_data``.

    Raises:
        TypeError: genotype_data cannot be NoneType.

    Attributes:
        imputed (GenotypeData or numpy.ndarray): GenotypeData copy with imputed data, or the imputed array when ``validation_mode`` is True.

    Example:
        >>>data = GenotypeData(
        >>>    filename="test.str",
        >>>    filetype="structure2rowPopID",
        >>>    popmapfile="test.popmap",
        >>>)
        >>>
        >>>refallele = ImputeRefAllele(
        >>>    genotype_data=data
        >>>)
        >>>
        >>>gd_refallele = refallele.imputed
    """

    def __init__(
        self,
        genotype_data: "GenotypeData",
        *,
        missing: int = -9,
        prefix="imputer",
        verbose: bool = True,
        **kwargs: Dict[str, Any],
    ) -> None:
        if genotype_data is None:
            raise TypeError("GenotypeData instance must be provided.")

        gt = kwargs.get("gt", None)

        if gt is None:
            gt_list = genotype_data.genotypes_012(fmt="list")
        else:
            gt_list = gt

        self.missing = missing
        self.verbose = verbose
        # write2file() builds its output directory from this prefix.
        self.prefix = prefix
        self.iterative_mode = kwargs.get("iterative_mode", False)
        self.validation_mode = kwargs.get("validation_mode", False)

        # Get reference alleles from GenotypeData object
        self.ref_alleles = genotype_data.ref

        if not self.validation_mode:
            imputed012 = self.fit_predict(gt_list)
            # Work on a copy so the caller's GenotypeData is left untouched.
            genotype_data = genotype_data.copy()
            genotype_data.snp_data = genotype_data.decode_012(
                imputed012, prefix=prefix, write_output=False
            )
            self.imputed = genotype_data
        else:
            self.imputed = self.fit_predict(gt_list)

    @property
    def genotypes_012(self):
        """012-encoded genotypes of the imputed GenotypeData object."""
        return self.imputed.genotypes_012

    @property
    def snp_data(self):
        """Decoded SNP data of the imputed GenotypeData object."""
        return self.imputed.snp_data

    @property
    def alignment(self):
        """Alignment of the imputed GenotypeData object."""
        return self.imputed.alignment

    def fit_predict(
        self, X: List[List[Union[int, str]]]
    ) -> Union[pd.DataFrame, np.ndarray, List[List[Union[int, str]]]]:
        """Impute missing genotypes using reference alleles.

        Entries equal to ``self.missing`` are treated as missing. Numeric input is filled with 0 and keeps its original column dtypes; any other input is filled column by column from ``self.ref_alleles``.

        Args:
            X (List[List[Union[int, str]]], numpy.ndarray, or pandas.DataFrame): Genotypes obtained from the GenotypeData object.

        Returns:
            List[List[Union[int, str]]] or numpy.ndarray: Imputed genotypes of same shape as data. A nested list unless ``validation_mode`` is True, in which case the underlying array is returned.

        Raises:
            TypeError: X must be of type list(list(int or str)), numpy.ndarray, or pandas.DataFrame.
        """
        if self.verbose:
            print("\nImputing missing data with reference alleles...")

        if isinstance(X, (list, np.ndarray)):
            df = pd.DataFrame(X)
        elif isinstance(X, pd.DataFrame):
            df = X.copy()
        else:
            raise TypeError(
                f"X must be of type list(list(int or str)), numpy.ndarray, "
                f"or pandas.DataFrame, but got {type(X)}"
            )

        # Decide numeric vs. string handling BEFORE inserting NaN: replacing
        # the missing code upcasts integer columns to float, which would
        # otherwise send 012 data down the string branch.
        original_dtypes = df.dtypes.to_dict()
        is_numeric = all(
            pd.api.types.is_numeric_dtype(dtype)
            for dtype in original_dtypes.values()
        )

        df = df.replace(self.missing, np.nan)

        if is_numeric:
            # Fill with 0 and restore the dtypes that NaN had upcast.
            df = df.fillna(0).astype(original_dtypes)
        else:
            # Assign the filled column back instead of relying on an
            # in-place fillna through chained indexing.
            for col, ref in zip(df.columns, self.ref_alleles):
                df[col] = df[col].fillna(ref)

        if self.verbose:
            print("Done!")

        if not self.validation_mode:
            return df.values.tolist()
        return df.values

    def write2file(
        self, X: Union[pd.DataFrame, np.ndarray, List[List[Union[int, float]]]]
    ) -> None:
        """Write imputed data to file on disk.

        The file is written to ``<prefix>_output/alignments/Unsupervised/ImputeRefAllele/imputed_012.csv`` without header or index; missing directories are created.

        Args:
            X (pandas.DataFrame, numpy.ndarray, List[List[Union[int, float]]]): Imputed data to write to file.

        Raises:
            TypeError: If X is of unsupported type.
        """
        outfile = os.path.join(
            f"{self.prefix}_output",
            "alignments",
            "Unsupervised",
            "ImputeRefAllele",
        )

        Path(outfile).mkdir(parents=True, exist_ok=True)

        outfile = os.path.join(outfile, "imputed_012.csv")

        if isinstance(X, pd.DataFrame):
            df = X
        elif isinstance(X, (np.ndarray, list)):
            df = pd.DataFrame(X)
        else:
            raise TypeError(
                f"Could not write imputed data because it is of incorrect "
                f"type. Got {type(X)}"
            )

        df.to_csv(outfile, header=False, index=False)