pg-sui 0.2.0__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/METADATA +101 -79
  2. pg_sui-1.6.14.dev9.dist-info/RECORD +81 -0
  3. {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.14.dev9.dist-info/entry_points.txt +4 -0
  5. {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info/licenses}/LICENSE +0 -0
  6. pg_sui-1.6.14.dev9.dist-info/top_level.txt +1 -0
  7. pgsui/__init__.py +35 -54
  8. pgsui/_version.py +34 -0
  9. pgsui/cli.py +909 -0
  10. pgsui/data_processing/__init__.py +0 -0
  11. pgsui/data_processing/config.py +565 -0
  12. pgsui/data_processing/containers.py +1424 -0
  13. pgsui/data_processing/transformers.py +557 -907
  14. pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  15. pgsui/electron/app/__main__.py +5 -0
  16. pgsui/electron/app/extra-resources/.gitkeep +1 -0
  17. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  18. pgsui/electron/app/icons/icons/128x128.png +0 -0
  19. pgsui/electron/app/icons/icons/16x16.png +0 -0
  20. pgsui/electron/app/icons/icons/24x24.png +0 -0
  21. pgsui/electron/app/icons/icons/256x256.png +0 -0
  22. pgsui/electron/app/icons/icons/32x32.png +0 -0
  23. pgsui/electron/app/icons/icons/48x48.png +0 -0
  24. pgsui/electron/app/icons/icons/512x512.png +0 -0
  25. pgsui/electron/app/icons/icons/64x64.png +0 -0
  26. pgsui/electron/app/icons/icons/icon.icns +0 -0
  27. pgsui/electron/app/icons/icons/icon.ico +0 -0
  28. pgsui/electron/app/main.js +227 -0
  29. pgsui/electron/app/package-lock.json +6894 -0
  30. pgsui/electron/app/package.json +51 -0
  31. pgsui/electron/app/preload.js +15 -0
  32. pgsui/electron/app/server.py +157 -0
  33. pgsui/electron/app/ui/logo.png +0 -0
  34. pgsui/electron/app/ui/renderer.js +131 -0
  35. pgsui/electron/app/ui/styles.css +59 -0
  36. pgsui/electron/app/ui/ui_shim.js +72 -0
  37. pgsui/electron/bootstrap.py +43 -0
  38. pgsui/electron/launch.py +57 -0
  39. pgsui/electron/package.json +14 -0
  40. pgsui/example_data/__init__.py +0 -0
  41. pgsui/example_data/phylip_files/__init__.py +0 -0
  42. pgsui/example_data/phylip_files/test.phy +0 -0
  43. pgsui/example_data/popmaps/__init__.py +0 -0
  44. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  45. pgsui/example_data/structure_files/__init__.py +0 -0
  46. pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
  47. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  48. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  49. pgsui/impute/__init__.py +0 -0
  50. pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
  51. pgsui/impute/deterministic/imputers/mode.py +844 -0
  52. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  53. pgsui/impute/deterministic/imputers/phylo.py +973 -0
  54. pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
  55. pgsui/impute/supervised/__init__.py +0 -0
  56. pgsui/impute/supervised/base.py +343 -0
  57. pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  58. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
  59. pgsui/impute/supervised/imputers/random_forest.py +291 -0
  60. pgsui/impute/unsupervised/__init__.py +0 -0
  61. pgsui/impute/unsupervised/base.py +1118 -0
  62. pgsui/impute/unsupervised/callbacks.py +92 -262
  63. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
  64. pgsui/impute/unsupervised/imputers/autoencoder.py +1285 -0
  65. pgsui/impute/unsupervised/imputers/nlpca.py +1554 -0
  66. pgsui/impute/unsupervised/imputers/ubp.py +1575 -0
  67. pgsui/impute/unsupervised/imputers/vae.py +1228 -0
  68. pgsui/impute/unsupervised/loss_functions.py +261 -0
  69. pgsui/impute/unsupervised/models/__init__.py +0 -0
  70. pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
  71. pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
  72. pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
  73. pgsui/impute/unsupervised/models/vae_model.py +269 -630
  74. pgsui/impute/unsupervised/nn_scorers.py +255 -0
  75. pgsui/utils/__init__.py +0 -0
  76. pgsui/utils/classification_viz.py +608 -0
  77. pgsui/utils/logging_utils.py +22 -0
  78. pgsui/utils/misc.py +35 -480
  79. pgsui/utils/plotting.py +996 -829
  80. pgsui/utils/pretty_metrics.py +290 -0
  81. pgsui/utils/scorers.py +213 -666
  82. pg_sui-0.2.0.dist-info/RECORD +0 -75
  83. pg_sui-0.2.0.dist-info/top_level.txt +0 -3
  84. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  85. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  86. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  87. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  88. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  89. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  90. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  91. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  92. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  93. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  94. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  95. pgsui/example_data/trees/test.iqtree +0 -376
  96. pgsui/example_data/trees/test.qmat +0 -5
  97. pgsui/example_data/trees/test.rate +0 -2033
  98. pgsui/example_data/trees/test.tre +0 -1
  99. pgsui/example_data/trees/test_n10.rate +0 -19
  100. pgsui/example_data/trees/test_n100.rate +0 -109
  101. pgsui/example_data/trees/test_n500.rate +0 -509
  102. pgsui/example_data/trees/test_siterates.txt +0 -2024
  103. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  104. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  105. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  106. pgsui/example_data/vcf_files/test.vcf +0 -244
  107. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  108. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  109. pgsui/impute/estimators.py +0 -1268
  110. pgsui/impute/impute.py +0 -1463
  111. pgsui/impute/simple_imputers.py +0 -1431
  112. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
  113. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
  114. pgsui/impute/unsupervised/keras_classifiers.py +0 -697
  115. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  116. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
  117. pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
  118. pgsui/pg_sui.py +0 -261
  119. pgsui/utils/sequence_tools.py +0 -407
  120. simulation/sim_benchmarks.py +0 -333
  121. simulation/sim_treeparams.py +0 -475
  122. test/__init__.py +0 -0
  123. test/pg_sui_simtest.py +0 -215
  124. test/pg_sui_testing.py +0 -523
  125. test/test.py +0 -151
  126. test/test_pgsui.py +0 -374
  127. test/test_tkc.py +0 -185
@@ -0,0 +1,221 @@
1
+ from pathlib import Path
2
+ from typing import Dict, List
3
+
4
+ # Third-party imports
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
+ class ImputeNMF:
10
+ """Impute missing data using matrix factorization. If ``by_populations=False`` then imputation is by global allele frequency. If ``by_populations=True`` then imputation is by population-wise allele frequency.
11
+
12
+ Args:
13
+ genotype_data (GenotypeData object or None, optional): GenotypeData instance.
14
+ latent_features (float, optional): The number of latent variables used to reduce dimensionality of the data. Defaults to 2.
15
+ learning_rate (float, optional): The learning rate for the optimizers. Adjust if the loss is learning too slowly. Defaults to 0.1.
16
+ tol (float, optional): Tolerance of the stopping condition. Defaults to 1e-3.
17
+ missing (int, optional): Missing data value. Defaults to -9.
18
+ prefix (str, optional): Prefix for writing output files. Defaults to "output".
19
+ verbose (bool, optional): Whether to print status updates. Set to False for no status updates. Defaults to True.
20
+ **kwargs (Dict[str, bool | List[List[int]] | None | float | int | str]): Additional keyword arguments to supply. Primarily for internal purposes. Options include: {"iterative_mode": bool, "validation_mode": bool, "gt": List[List[int]]}. "iterative_mode" determines whether ``ImputeAlleleFreq`` is being used as the initial imputer in ``IterativeImputer``. "gt" is used internally for the simple imputers during grid searches and validation. If ``genotype_data is None`` then ``gt`` cannot also be None, and vice versa. Only one of ``gt`` or ``genotype_data`` can be set.
21
+
22
+ Attributes:
23
+ imputed (GenotypeData): New GenotypeData instance with imputed data.
24
+
25
+ Example:
26
+ >>>data = GenotypeData(
27
+ >>> filename="test.str",
28
+ >>> filetype="structure",
29
+ >>> popmapfile="test.popmap",
30
+ >>>)
31
+ >>>
32
+ >>>nmf = ImputeMF(
33
+ >>> genotype_data=data,
34
+ >>> by_populations=True,
35
+ >>>)
36
+ >>>
37
+ >>> # Get GenotypeData instance.
38
+ >>>gd_nmf = nmf.imputed
39
+
40
+ Raises:
41
+ TypeError: genotype_data cannot be NoneType.
42
+ """
43
+
44
+ def __init__(
45
+ self,
46
+ genotype_data,
47
+ *,
48
+ latent_features: int = 2,
49
+ max_iter: int = 100,
50
+ learning_rate: float = 0.0002,
51
+ regularization_param: float = 0.02,
52
+ tol: float = 0.1,
53
+ n_fail: int = 20,
54
+ missing: int = -9,
55
+ prefix: str = "imputer",
56
+ verbose: bool = True,
57
+ **kwargs: Dict[str, bool | List[List[int]] | None | float | int | str],
58
+ ) -> None:
59
+ self.max_iter = max_iter
60
+ self.latent_features = latent_features
61
+ self.n_fail = n_fail
62
+ self.learning_rate = learning_rate
63
+ self.tol = tol
64
+ self.regularization_param = regularization_param
65
+ self.missing = missing
66
+ self.prefix = prefix
67
+ self.verbose = verbose
68
+ self.iterative_mode = kwargs.get("iterative_mode", False)
69
+ self.validation_mode = kwargs.get("validation_mode", False)
70
+
71
+ gt = kwargs.get("gt", None)
72
+
73
+ if genotype_data is None and gt is None:
74
+ raise TypeError("GenotypeData and gt cannot both be NoneType.")
75
+
76
+ if gt is None:
77
+ X = genotype_data.genotypes_012(fmt="numpy")
78
+ else:
79
+ X = gt.copy()
80
+ imputed012 = pd.DataFrame(self.fit_predict(X))
81
+ genotype_data = genotype_data.copy()
82
+ genotype_data.snp_data = genotype_data.decode_012(
83
+ imputed012, prefix=prefix, write_output=False
84
+ )
85
+
86
+ if self.validation_mode:
87
+ self.imputed = imputed012.to_numpy()
88
+ else:
89
+ self.imputed = genotype_data
90
+
91
+ @property
92
+ def genotypes_012(self):
93
+ return self.imputed.genotypes012
94
+
95
+ @property
96
+ def snp_data(self):
97
+ return self.imputed.snp_data
98
+
99
+ @property
100
+ def alignment(self):
101
+ return self.imputed.alignment
102
+
103
+ def fit_predict(self, X):
104
+ # imputation
105
+ if self.verbose:
106
+ print(f"Doing MF imputation...")
107
+ R = X
108
+ R = R.astype(int)
109
+ R[R == self.missing] = -9
110
+ R = R + 1
111
+ R[R < 0] = 0
112
+ n_row = len(R)
113
+ n_col = len(R[0])
114
+ p = np.random.rand(n_row, self.latent_features)
115
+ q = np.random.rand(n_col, self.latent_features)
116
+ q_t = q.T
117
+ fails = 0
118
+ e_current = None
119
+ for step in range(self.max_iter):
120
+ for i in range(n_row):
121
+ for j in range(n_col):
122
+ if R[i][j] > 0:
123
+ eij = R[i][j] - np.dot(p[i, :], q_t[:, j])
124
+ for k in range(self.latent_features):
125
+ p[i][k] = p[i][k] + self.learning_rate * (
126
+ 2 * eij * q_t[k][j]
127
+ - self.regularization_param * p[i][k]
128
+ )
129
+ q_t[k][j] = q_t[k][j] + self.learning_rate * (
130
+ 2 * eij * p[i][k]
131
+ - self.regularization_param * q_t[k][j]
132
+ )
133
+ e = 0
134
+ for i in range(n_row):
135
+ for j in range(len(R[i])):
136
+ if R[i][j] > 0:
137
+ e = e + pow(R[i][j] - np.dot(p[i, :], q_t[:, j]), 2)
138
+ for k in range(self.latent_features):
139
+ e = e + (self.regularization_param / 2) * (
140
+ pow(p[i][k], 2) + pow(q_t[k][j], 2)
141
+ )
142
+ if e_current is None:
143
+ e_current = e
144
+ else:
145
+ if abs(e_current - e) < self.tol:
146
+ fails += 1
147
+ else:
148
+ fails = 0
149
+ e_current = e
150
+ if fails >= self.n_fail:
151
+ break
152
+ nR = np.dot(p, q_t)
153
+
154
+ # transform values per-column (i.e., only allowing values found in original)
155
+ tR = self.transform(R, nR)
156
+
157
+ # get accuracy of re-constructing non-missing genotypes
158
+ accuracy = self.accuracy(X, tR)
159
+
160
+ # insert imputed values for missing genotypes
161
+ fR = X
162
+ fR[X < 0] = tR[X < 0]
163
+
164
+ if self.verbose:
165
+ print("Done!")
166
+
167
+ return fR
168
+
169
+ def transform(self, original, predicted):
170
+ n_row = len(original)
171
+ n_col = len(original[0])
172
+ tR = predicted
173
+ for j in range(n_col):
174
+ observed = predicted[:, j]
175
+ expected = original[:, j]
176
+ options = np.unique(expected[expected != 0])
177
+ for i in range(n_row):
178
+ transform = min(options, key=lambda x: abs(x - predicted[i, j]))
179
+ tR[i, j] = transform
180
+ tR = tR - 1
181
+ tR[tR < 0] = -9
182
+ return tR
183
+
184
+ def accuracy(self, expected, predicted):
185
+ prop_same = np.sum(expected[expected >= 0] == predicted[expected >= 0])
186
+ tot = expected[expected >= 0].size
187
+ accuracy = prop_same / tot
188
+ return accuracy
189
+
190
+ def write2file(
191
+ self, X: pd.DataFrame | np.ndarray | List[List[int | float]]
192
+ ) -> None:
193
+ """Write imputed data to file on disk.
194
+
195
+ Args:
196
+ X (pandas.DataFrame | numpy.ndarray | List[List[int | float]]): Imputed data to write to file.
197
+
198
+ Raises:
199
+ TypeError: If X is of unsupported type.
200
+ """
201
+ outfile = Path(
202
+ f"{self.prefix}_output",
203
+ "alignments",
204
+ "Deterministic",
205
+ "ImputeMF",
206
+ )
207
+
208
+ Path(outfile).mkdir(parents=True, exist_ok=True)
209
+ outfile = Path(outfile) / "imputed_012.csv"
210
+
211
+ if isinstance(X, pd.DataFrame):
212
+ df = X
213
+ elif isinstance(X, (np.ndarray, list)):
214
+ df = pd.DataFrame(X)
215
+ else:
216
+ raise TypeError(
217
+ f"Could not write imputed data because it is of incorrect "
218
+ f"type. Got {type(X)}"
219
+ )
220
+
221
+ df.to_csv(outfile, header=False, index=False)