pg-sui 1.0.2.1__py3-none-any.whl → 1.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pg-sui might be problematic. Click here for more details.

Files changed (112)
  1. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/METADATA +51 -70
  2. pg_sui-1.6.8.dist-info/RECORD +78 -0
  3. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.8.dist-info/entry_points.txt +4 -0
  5. pg_sui-1.6.8.dist-info/top_level.txt +1 -0
  6. pgsui/__init__.py +35 -54
  7. pgsui/_version.py +34 -0
  8. pgsui/cli.py +635 -0
  9. pgsui/data_processing/config.py +576 -0
  10. pgsui/data_processing/containers.py +1782 -0
  11. pgsui/data_processing/transformers.py +121 -1103
  12. pgsui/electron/app/__main__.py +5 -0
  13. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  14. pgsui/electron/app/icons/icons/128x128.png +0 -0
  15. pgsui/electron/app/icons/icons/16x16.png +0 -0
  16. pgsui/electron/app/icons/icons/24x24.png +0 -0
  17. pgsui/electron/app/icons/icons/256x256.png +0 -0
  18. pgsui/electron/app/icons/icons/32x32.png +0 -0
  19. pgsui/electron/app/icons/icons/48x48.png +0 -0
  20. pgsui/electron/app/icons/icons/512x512.png +0 -0
  21. pgsui/electron/app/icons/icons/64x64.png +0 -0
  22. pgsui/electron/app/icons/icons/icon.icns +0 -0
  23. pgsui/electron/app/icons/icons/icon.ico +0 -0
  24. pgsui/electron/app/main.js +189 -0
  25. pgsui/electron/app/package-lock.json +6893 -0
  26. pgsui/electron/app/package.json +50 -0
  27. pgsui/electron/app/preload.js +15 -0
  28. pgsui/electron/app/server.py +146 -0
  29. pgsui/electron/app/ui/logo.png +0 -0
  30. pgsui/electron/app/ui/renderer.js +130 -0
  31. pgsui/electron/app/ui/styles.css +59 -0
  32. pgsui/electron/app/ui/ui_shim.js +72 -0
  33. pgsui/electron/bootstrap.py +43 -0
  34. pgsui/electron/launch.py +59 -0
  35. pgsui/electron/package.json +14 -0
  36. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  37. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  38. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  39. pgsui/impute/deterministic/imputers/allele_freq.py +691 -0
  40. pgsui/impute/deterministic/imputers/mode.py +679 -0
  41. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  42. pgsui/impute/deterministic/imputers/phylo.py +971 -0
  43. pgsui/impute/deterministic/imputers/ref_allele.py +530 -0
  44. pgsui/impute/supervised/base.py +339 -0
  45. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +293 -0
  46. pgsui/impute/supervised/imputers/random_forest.py +287 -0
  47. pgsui/impute/unsupervised/base.py +924 -0
  48. pgsui/impute/unsupervised/callbacks.py +89 -263
  49. pgsui/impute/unsupervised/imputers/autoencoder.py +972 -0
  50. pgsui/impute/unsupervised/imputers/nlpca.py +1264 -0
  51. pgsui/impute/unsupervised/imputers/ubp.py +1288 -0
  52. pgsui/impute/unsupervised/imputers/vae.py +957 -0
  53. pgsui/impute/unsupervised/loss_functions.py +158 -0
  54. pgsui/impute/unsupervised/models/autoencoder_model.py +208 -558
  55. pgsui/impute/unsupervised/models/nlpca_model.py +149 -468
  56. pgsui/impute/unsupervised/models/ubp_model.py +198 -1317
  57. pgsui/impute/unsupervised/models/vae_model.py +259 -618
  58. pgsui/impute/unsupervised/nn_scorers.py +215 -0
  59. pgsui/utils/classification_viz.py +591 -0
  60. pgsui/utils/misc.py +35 -480
  61. pgsui/utils/plotting.py +514 -824
  62. pgsui/utils/scorers.py +212 -438
  63. pg_sui-1.0.2.1.dist-info/RECORD +0 -75
  64. pg_sui-1.0.2.1.dist-info/top_level.txt +0 -3
  65. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  66. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  67. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  68. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  69. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  70. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  71. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  72. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  73. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  74. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  75. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  76. pgsui/example_data/trees/test.iqtree +0 -376
  77. pgsui/example_data/trees/test.qmat +0 -5
  78. pgsui/example_data/trees/test.rate +0 -2033
  79. pgsui/example_data/trees/test.tre +0 -1
  80. pgsui/example_data/trees/test_n10.rate +0 -19
  81. pgsui/example_data/trees/test_n100.rate +0 -109
  82. pgsui/example_data/trees/test_n500.rate +0 -509
  83. pgsui/example_data/trees/test_siterates.txt +0 -2024
  84. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  85. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  86. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  87. pgsui/example_data/vcf_files/test.vcf +0 -244
  88. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  89. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  90. pgsui/impute/estimators.py +0 -735
  91. pgsui/impute/impute.py +0 -1486
  92. pgsui/impute/simple_imputers.py +0 -1439
  93. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -785
  94. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1027
  95. pgsui/impute/unsupervised/keras_classifiers.py +0 -702
  96. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  97. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1424
  98. pgsui/impute/unsupervised/neural_network_methods.py +0 -1549
  99. pgsui/pg_sui.py +0 -261
  100. pgsui/utils/sequence_tools.py +0 -407
  101. simulation/sim_benchmarks.py +0 -333
  102. simulation/sim_treeparams.py +0 -475
  103. test/__init__.py +0 -0
  104. test/pg_sui_simtest.py +0 -215
  105. test/pg_sui_testing.py +0 -523
  106. test/test.py +0 -297
  107. test/test_pgsui.py +0 -374
  108. test/test_tkc.py +0 -214
  109. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info/licenses}/LICENSE +0 -0
  110. /pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  111. /pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  112. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
@@ -0,0 +1,221 @@
1
+ from pathlib import Path
2
+ from typing import Dict, List
3
+
4
+ # Third-party imports
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
+ class ImputeNMF:
10
+ """Impute missing data using matrix factorization. If ``by_populations=False`` then imputation is by global allele frequency. If ``by_populations=True`` then imputation is by population-wise allele frequency.
11
+
12
+ Args:
13
+ genotype_data (GenotypeData object or None, optional): GenotypeData instance.
14
+ latent_features (float, optional): The number of latent variables used to reduce dimensionality of the data. Defaults to 2.
15
+ learning_rate (float, optional): The learning rate for the optimizers. Adjust if the loss is learning too slowly. Defaults to 0.1.
16
+ tol (float, optional): Tolerance of the stopping condition. Defaults to 1e-3.
17
+ missing (int, optional): Missing data value. Defaults to -9.
18
+ prefix (str, optional): Prefix for writing output files. Defaults to "output".
19
+ verbose (bool, optional): Whether to print status updates. Set to False for no status updates. Defaults to True.
20
+ **kwargs (Dict[str, bool | List[List[int]] | None | float | int | str]): Additional keyword arguments to supply. Primarily for internal purposes. Options include: {"iterative_mode": bool, "validation_mode": bool, "gt": List[List[int]]}. "iterative_mode" determines whether ``ImputeAlleleFreq`` is being used as the initial imputer in ``IterativeImputer``. "gt" is used internally for the simple imputers during grid searches and validation. If ``genotype_data is None`` then ``gt`` cannot also be None, and vice versa. Only one of ``gt`` or ``genotype_data`` can be set.
21
+
22
+ Attributes:
23
+ imputed (GenotypeData): New GenotypeData instance with imputed data.
24
+
25
+ Example:
26
+ >>>data = GenotypeData(
27
+ >>> filename="test.str",
28
+ >>> filetype="structure",
29
+ >>> popmapfile="test.popmap",
30
+ >>>)
31
+ >>>
32
+ >>>nmf = ImputeMF(
33
+ >>> genotype_data=data,
34
+ >>> by_populations=True,
35
+ >>>)
36
+ >>>
37
+ >>> # Get GenotypeData instance.
38
+ >>>gd_nmf = nmf.imputed
39
+
40
+ Raises:
41
+ TypeError: genotype_data cannot be NoneType.
42
+ """
43
+
44
+ def __init__(
45
+ self,
46
+ genotype_data,
47
+ *,
48
+ latent_features: int = 2,
49
+ max_iter: int = 100,
50
+ learning_rate: float = 0.0002,
51
+ regularization_param: float = 0.02,
52
+ tol: float = 0.1,
53
+ n_fail: int = 20,
54
+ missing: int = -9,
55
+ prefix: str = "imputer",
56
+ verbose: bool = True,
57
+ **kwargs: Dict[str, bool | List[List[int]] | None | float | int | str],
58
+ ) -> None:
59
+ self.max_iter = max_iter
60
+ self.latent_features = latent_features
61
+ self.n_fail = n_fail
62
+ self.learning_rate = learning_rate
63
+ self.tol = tol
64
+ self.regularization_param = regularization_param
65
+ self.missing = missing
66
+ self.prefix = prefix
67
+ self.verbose = verbose
68
+ self.iterative_mode = kwargs.get("iterative_mode", False)
69
+ self.validation_mode = kwargs.get("validation_mode", False)
70
+
71
+ gt = kwargs.get("gt", None)
72
+
73
+ if genotype_data is None and gt is None:
74
+ raise TypeError("GenotypeData and gt cannot both be NoneType.")
75
+
76
+ if gt is None:
77
+ X = genotype_data.genotypes_012(fmt="numpy")
78
+ else:
79
+ X = gt.copy()
80
+ imputed012 = pd.DataFrame(self.fit_predict(X))
81
+ genotype_data = genotype_data.copy()
82
+ genotype_data.snp_data = genotype_data.decode_012(
83
+ imputed012, prefix=prefix, write_output=False
84
+ )
85
+
86
+ if self.validation_mode:
87
+ self.imputed = imputed012.to_numpy()
88
+ else:
89
+ self.imputed = genotype_data
90
+
91
+ @property
92
+ def genotypes_012(self):
93
+ return self.imputed.genotypes012
94
+
95
+ @property
96
+ def snp_data(self):
97
+ return self.imputed.snp_data
98
+
99
+ @property
100
+ def alignment(self):
101
+ return self.imputed.alignment
102
+
103
+ def fit_predict(self, X):
104
+ # imputation
105
+ if self.verbose:
106
+ print(f"Doing MF imputation...")
107
+ R = X
108
+ R = R.astype(int)
109
+ R[R == self.missing] = -9
110
+ R = R + 1
111
+ R[R < 0] = 0
112
+ n_row = len(R)
113
+ n_col = len(R[0])
114
+ p = np.random.rand(n_row, self.latent_features)
115
+ q = np.random.rand(n_col, self.latent_features)
116
+ q_t = q.T
117
+ fails = 0
118
+ e_current = None
119
+ for step in range(self.max_iter):
120
+ for i in range(n_row):
121
+ for j in range(n_col):
122
+ if R[i][j] > 0:
123
+ eij = R[i][j] - np.dot(p[i, :], q_t[:, j])
124
+ for k in range(self.latent_features):
125
+ p[i][k] = p[i][k] + self.learning_rate * (
126
+ 2 * eij * q_t[k][j]
127
+ - self.regularization_param * p[i][k]
128
+ )
129
+ q_t[k][j] = q_t[k][j] + self.learning_rate * (
130
+ 2 * eij * p[i][k]
131
+ - self.regularization_param * q_t[k][j]
132
+ )
133
+ e = 0
134
+ for i in range(n_row):
135
+ for j in range(len(R[i])):
136
+ if R[i][j] > 0:
137
+ e = e + pow(R[i][j] - np.dot(p[i, :], q_t[:, j]), 2)
138
+ for k in range(self.latent_features):
139
+ e = e + (self.regularization_param / 2) * (
140
+ pow(p[i][k], 2) + pow(q_t[k][j], 2)
141
+ )
142
+ if e_current is None:
143
+ e_current = e
144
+ else:
145
+ if abs(e_current - e) < self.tol:
146
+ fails += 1
147
+ else:
148
+ fails = 0
149
+ e_current = e
150
+ if fails >= self.n_fail:
151
+ break
152
+ nR = np.dot(p, q_t)
153
+
154
+ # transform values per-column (i.e., only allowing values found in original)
155
+ tR = self.transform(R, nR)
156
+
157
+ # get accuracy of re-constructing non-missing genotypes
158
+ accuracy = self.accuracy(X, tR)
159
+
160
+ # insert imputed values for missing genotypes
161
+ fR = X
162
+ fR[X < 0] = tR[X < 0]
163
+
164
+ if self.verbose:
165
+ print("Done!")
166
+
167
+ return fR
168
+
169
+ def transform(self, original, predicted):
170
+ n_row = len(original)
171
+ n_col = len(original[0])
172
+ tR = predicted
173
+ for j in range(n_col):
174
+ observed = predicted[:, j]
175
+ expected = original[:, j]
176
+ options = np.unique(expected[expected != 0])
177
+ for i in range(n_row):
178
+ transform = min(options, key=lambda x: abs(x - predicted[i, j]))
179
+ tR[i, j] = transform
180
+ tR = tR - 1
181
+ tR[tR < 0] = -9
182
+ return tR
183
+
184
+ def accuracy(self, expected, predicted):
185
+ prop_same = np.sum(expected[expected >= 0] == predicted[expected >= 0])
186
+ tot = expected[expected >= 0].size
187
+ accuracy = prop_same / tot
188
+ return accuracy
189
+
190
+ def write2file(
191
+ self, X: pd.DataFrame | np.ndarray | List[List[int | float]]
192
+ ) -> None:
193
+ """Write imputed data to file on disk.
194
+
195
+ Args:
196
+ X (pandas.DataFrame | numpy.ndarray | List[List[int | float]]): Imputed data to write to file.
197
+
198
+ Raises:
199
+ TypeError: If X is of unsupported type.
200
+ """
201
+ outfile = Path(
202
+ f"{self.prefix}_output",
203
+ "alignments",
204
+ "Deterministic",
205
+ "ImputeMF",
206
+ )
207
+
208
+ Path(outfile).mkdir(parents=True, exist_ok=True)
209
+ outfile = Path(outfile) / "imputed_012.csv"
210
+
211
+ if isinstance(X, pd.DataFrame):
212
+ df = X
213
+ elif isinstance(X, (np.ndarray, list)):
214
+ df = pd.DataFrame(X)
215
+ else:
216
+ raise TypeError(
217
+ f"Could not write imputed data because it is of incorrect "
218
+ f"type. Got {type(X)}"
219
+ )
220
+
221
+ df.to_csv(outfile, header=False, index=False)