pg-sui 1.0.2.1__py3-none-any.whl → 1.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pg-sui might be problematic. Click here for more details.

Files changed (112) hide show
  1. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/METADATA +51 -70
  2. pg_sui-1.6.8.dist-info/RECORD +78 -0
  3. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.8.dist-info/entry_points.txt +4 -0
  5. pg_sui-1.6.8.dist-info/top_level.txt +1 -0
  6. pgsui/__init__.py +35 -54
  7. pgsui/_version.py +34 -0
  8. pgsui/cli.py +635 -0
  9. pgsui/data_processing/config.py +576 -0
  10. pgsui/data_processing/containers.py +1782 -0
  11. pgsui/data_processing/transformers.py +121 -1103
  12. pgsui/electron/app/__main__.py +5 -0
  13. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  14. pgsui/electron/app/icons/icons/128x128.png +0 -0
  15. pgsui/electron/app/icons/icons/16x16.png +0 -0
  16. pgsui/electron/app/icons/icons/24x24.png +0 -0
  17. pgsui/electron/app/icons/icons/256x256.png +0 -0
  18. pgsui/electron/app/icons/icons/32x32.png +0 -0
  19. pgsui/electron/app/icons/icons/48x48.png +0 -0
  20. pgsui/electron/app/icons/icons/512x512.png +0 -0
  21. pgsui/electron/app/icons/icons/64x64.png +0 -0
  22. pgsui/electron/app/icons/icons/icon.icns +0 -0
  23. pgsui/electron/app/icons/icons/icon.ico +0 -0
  24. pgsui/electron/app/main.js +189 -0
  25. pgsui/electron/app/package-lock.json +6893 -0
  26. pgsui/electron/app/package.json +50 -0
  27. pgsui/electron/app/preload.js +15 -0
  28. pgsui/electron/app/server.py +146 -0
  29. pgsui/electron/app/ui/logo.png +0 -0
  30. pgsui/electron/app/ui/renderer.js +130 -0
  31. pgsui/electron/app/ui/styles.css +59 -0
  32. pgsui/electron/app/ui/ui_shim.js +72 -0
  33. pgsui/electron/bootstrap.py +43 -0
  34. pgsui/electron/launch.py +59 -0
  35. pgsui/electron/package.json +14 -0
  36. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  37. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  38. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  39. pgsui/impute/deterministic/imputers/allele_freq.py +691 -0
  40. pgsui/impute/deterministic/imputers/mode.py +679 -0
  41. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  42. pgsui/impute/deterministic/imputers/phylo.py +971 -0
  43. pgsui/impute/deterministic/imputers/ref_allele.py +530 -0
  44. pgsui/impute/supervised/base.py +339 -0
  45. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +293 -0
  46. pgsui/impute/supervised/imputers/random_forest.py +287 -0
  47. pgsui/impute/unsupervised/base.py +924 -0
  48. pgsui/impute/unsupervised/callbacks.py +89 -263
  49. pgsui/impute/unsupervised/imputers/autoencoder.py +972 -0
  50. pgsui/impute/unsupervised/imputers/nlpca.py +1264 -0
  51. pgsui/impute/unsupervised/imputers/ubp.py +1288 -0
  52. pgsui/impute/unsupervised/imputers/vae.py +957 -0
  53. pgsui/impute/unsupervised/loss_functions.py +158 -0
  54. pgsui/impute/unsupervised/models/autoencoder_model.py +208 -558
  55. pgsui/impute/unsupervised/models/nlpca_model.py +149 -468
  56. pgsui/impute/unsupervised/models/ubp_model.py +198 -1317
  57. pgsui/impute/unsupervised/models/vae_model.py +259 -618
  58. pgsui/impute/unsupervised/nn_scorers.py +215 -0
  59. pgsui/utils/classification_viz.py +591 -0
  60. pgsui/utils/misc.py +35 -480
  61. pgsui/utils/plotting.py +514 -824
  62. pgsui/utils/scorers.py +212 -438
  63. pg_sui-1.0.2.1.dist-info/RECORD +0 -75
  64. pg_sui-1.0.2.1.dist-info/top_level.txt +0 -3
  65. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  66. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  67. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  68. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  69. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  70. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  71. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  72. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  73. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  74. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  75. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  76. pgsui/example_data/trees/test.iqtree +0 -376
  77. pgsui/example_data/trees/test.qmat +0 -5
  78. pgsui/example_data/trees/test.rate +0 -2033
  79. pgsui/example_data/trees/test.tre +0 -1
  80. pgsui/example_data/trees/test_n10.rate +0 -19
  81. pgsui/example_data/trees/test_n100.rate +0 -109
  82. pgsui/example_data/trees/test_n500.rate +0 -509
  83. pgsui/example_data/trees/test_siterates.txt +0 -2024
  84. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  85. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  86. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  87. pgsui/example_data/vcf_files/test.vcf +0 -244
  88. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  89. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  90. pgsui/impute/estimators.py +0 -735
  91. pgsui/impute/impute.py +0 -1486
  92. pgsui/impute/simple_imputers.py +0 -1439
  93. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -785
  94. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1027
  95. pgsui/impute/unsupervised/keras_classifiers.py +0 -702
  96. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  97. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1424
  98. pgsui/impute/unsupervised/neural_network_methods.py +0 -1549
  99. pgsui/pg_sui.py +0 -261
  100. pgsui/utils/sequence_tools.py +0 -407
  101. simulation/sim_benchmarks.py +0 -333
  102. simulation/sim_treeparams.py +0 -475
  103. test/__init__.py +0 -0
  104. test/pg_sui_simtest.py +0 -215
  105. test/pg_sui_testing.py +0 -523
  106. test/test.py +0 -297
  107. test/test_pgsui.py +0 -374
  108. test/test_tkc.py +0 -214
  109. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info/licenses}/LICENSE +0 -0
  110. /pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  111. /pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  112. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
@@ -0,0 +1,287 @@
1
+ # Standard library
2
+ from __future__ import annotations
3
+
4
+ from typing import TYPE_CHECKING, Any, Dict, List
5
+
6
+ # Third-party
7
+ import numpy as np
8
+ from sklearn.ensemble import RandomForestClassifier
9
+ from sklearn.exceptions import NotFittedError
10
+ from sklearn.experimental import enable_iterative_imputer # noqa
11
+ from sklearn.impute import IterativeImputer
12
+ from sklearn.model_selection import train_test_split
13
+
14
+ # Project
15
+ from snpio.analysis.genotype_encoder import GenotypeEncoder
16
+ from snpio.utils.logging import LoggerManager
17
+
18
+ from pgsui.data_processing.config import apply_dot_overrides, load_yaml_to_dataclass
19
+ from pgsui.data_processing.containers import (
20
+ RFConfig,
21
+ _ImputerParams,
22
+ _RFParams,
23
+ _SimParams,
24
+ )
25
+ from pgsui.data_processing.transformers import SimGenotypeDataTransformer
26
+ from pgsui.impute.supervised.base import BaseImputer
27
+ from pgsui.utils.plotting import Plotting
28
+ from pgsui.utils.scorers import Scorer
29
+
30
+ if TYPE_CHECKING:
31
+ from snpio.read_input.genotype_data import GenotypeData
32
+
33
+
34
def ensure_rf_config(config: RFConfig | Dict | str | None) -> RFConfig:
    """Normalize any supported configuration input into an ``RFConfig``.

    Args:
        config: One of:
            * ``None`` - a default-constructed ``RFConfig`` is returned.
            * ``RFConfig`` - returned unchanged (same object).
            * ``str`` - handed to ``load_yaml_to_dataclass`` together with
              ``RFConfig`` and ``RFConfig.from_preset`` as the preset builder.
            * ``dict`` - an optional ``"preset"`` entry selects the base
              config; every remaining entry (nested dicts are flattened to
              dotted keys) is applied through ``apply_dot_overrides``.

    Returns:
        RFConfig: The resolved configuration.

    Raises:
        TypeError: If ``config`` is none of the supported types.
    """

    def _dotted_items(mapping, parent=""):
        # Depth-first walk that yields (dotted_key, leaf_value) pairs in the
        # mapping's own iteration order.
        for name, item in mapping.items():
            path = f"{parent}.{name}" if parent else name
            if isinstance(item, dict):
                yield from _dotted_items(item, path)
            else:
                yield path, item

    if config is None:
        return RFConfig()

    if isinstance(config, RFConfig):
        return config

    if isinstance(config, str):
        return load_yaml_to_dataclass(
            config, RFConfig, preset_builder=RFConfig.from_preset
        )

    if not isinstance(config, dict):
        raise TypeError("config must be an RFConfig, dict, YAML path, or None.")

    # Work on a shallow copy so the caller's mapping keeps its "preset" key.
    remaining = dict(config)
    preset_name = remaining.pop("preset", None)
    if preset_name:
        starting_cfg = RFConfig.from_preset(preset_name)
    else:
        starting_cfg = RFConfig()

    dotted_overrides: Dict[str, Any] = dict(_dotted_items(remaining))
    return apply_dot_overrides(starting_cfg, dotted_overrides)
63
+
64
+
65
class ImputeRandomForest(BaseImputer):
    """Supervised RF imputer driven by :class:`RFConfig`.

    A ``RandomForestClassifier`` is used as the per-feature estimator of a
    scikit-learn ``IterativeImputer``. Genotypes are read from a
    ``GenotypeEncoder`` built on the supplied genotype data
    (``genotypes_012``), with negative codes treated as missing.

    Typical use is ``fit()`` followed by ``transform()``; both operate on the
    genotype data passed to the constructor and take no arguments.
    """

    def __init__(
        self,
        genotype_data: "GenotypeData",
        *,
        config: RFConfig | Dict | str | None = None,
        overrides: Dict | None = None,
    ) -> None:
        """Resolve the configuration and cache the settings used by ``fit``.

        Args:
            genotype_data: Genotype container wrapped in a ``GenotypeEncoder``.
            config: ``RFConfig``, mapping, YAML path, or ``None``; resolved
                through ``ensure_rf_config``.
            overrides: Optional overrides applied on top of the resolved
                config via ``cfg.apply_overrides``. Ignored when empty.
        """
        self.model_name = "ImputeRandomForest"
        self.Model = RandomForestClassifier

        cfg = ensure_rf_config(config)
        if overrides:
            cfg = cfg.apply_overrides(overrides)
        self.cfg = cfg

        self.genotype_data = genotype_data
        self.pgenc = GenotypeEncoder(genotype_data)

        # I/O and runtime settings.
        self.prefix = cfg.io.prefix
        self.seed = cfg.io.seed
        self.n_jobs = cfg.io.n_jobs
        self.verbose = cfg.io.verbose
        self.debug = cfg.io.debug

        super().__init__(verbose=self.verbose, debug=self.debug)

        logman = LoggerManager(
            __name__, prefix=self.prefix, verbose=self.verbose, debug=self.debug
        )
        self.logger = logman.get_logger()

        self._create_model_directories(
            self.prefix, ["models", "plots", "metrics", "optimize", "parameters"]
        )

        # Plot settings. The format is stored without leading dots
        # (".png" -> "png"); lstrip is a no-op when there is none.
        self.plot_format = cfg.plot.fmt.lstrip(".")
        self.plot_fontsize = cfg.plot.fontsize
        self.title_fontsize = cfg.plot.fontsize
        self.plot_dpi = cfg.plot.dpi
        self.despine = cfg.plot.despine
        self.show_plots = cfg.plot.show

        # Passed to train_test_split as ``test_size`` in fit().
        self.validation_split = cfg.train.validation_split

        # Keyword arguments for the RandomForestClassifier.
        self.params = _RFParams(
            n_estimators=cfg.model.n_estimators,
            max_depth=cfg.model.max_depth,
            min_samples_split=cfg.model.min_samples_split,
            min_samples_leaf=cfg.model.min_samples_leaf,
            max_features=cfg.model.max_features,
            criterion=cfg.model.criterion,
            class_weight=cfg.model.class_weight,
        )

        # Keyword arguments for the IterativeImputer.
        self.imputer_params = _ImputerParams(
            n_nearest_features=cfg.imputer.n_nearest_features,
            max_iter=cfg.imputer.max_iter,
            random_state=self.seed,
            verbose=self.verbose,
        )

        # Keyword arguments for the SimGenotypeDataTransformer.
        self.sim_params = _SimParams(
            prop_missing=cfg.sim.prop_missing,
            strategy=cfg.sim.strategy,
            missing_val=cfg.sim.missing_val,
            het_boost=cfg.sim.het_boost,
            seed=self.seed,
        )

        self.max_iter = cfg.imputer.max_iter
        self.n_nearest_features = cfg.imputer.n_nearest_features

        # Will be set in fit()
        self.is_haploid_: bool | None = None
        self.num_classes_: int | None = None
        self.num_features_: int | None = None
        self.rf_models_: List[RandomForestClassifier | None] | None = None
        self.is_fit_: bool = False

    def fit(self) -> "BaseImputer":
        """Fit the imputer on ``self.genotype_data`` and evaluate it.

        Steps:
            1) Read the 0/1/2 encoding and convert negative codes to NaN.
            2) Split samples into train/test (``validation_split`` is the
               test fraction).
            3) Mask additional test-set genotypes with
               ``SimGenotypeDataTransformer``.
            4) Fit ``IterativeImputer`` (RandomForestClassifier estimator) on
               the train split.
            5) Impute the masked test split and score predictions at the
               positions flagged in the transformer's ``"simulated"`` mask,
               both as 0/1/2 classes and as IUPAC classes.
            6) Persist the combined model/imputer/simulation parameters via
               ``_save_best_params``.

        Returns:
            BaseImputer: self.
        """
        # Prepare utilities & metadata
        self.scorers_ = Scorer(
            prefix=self.prefix, average="macro", verbose=self.verbose, debug=self.debug
        )

        self.plotter_ = Plotting(
            self.model_name,
            prefix=self.prefix,
            plot_format=self.plot_format,
            plot_dpi=self.plot_dpi,
            plot_fontsize=self.plot_fontsize,
            title_fontsize=self.title_fontsize,
            despine=self.despine,
            show_plots=self.show_plots,
            verbose=self.verbose,
            debug=self.debug,
        )

        X_int = self.pgenc.genotypes_012
        self.X012_ = X_int.astype(float)
        self.X012_[self.X012_ < 0] = np.nan  # Ensure missing are NaN
        # Data without any heterozygote code (1) is treated as haploid.
        self.is_haploid_ = np.count_nonzero(self.X012_ == 1) == 0
        self.num_classes_ = 2 if self.is_haploid_ else 3
        self.n_samples_, self.n_features_ = X_int.shape
        # __init__ declares num_features_ as "set in fit()"; keep it in sync
        # with n_features_ instead of leaving it as None.
        self.num_features_ = self.n_features_

        # Split
        X_train, X_test = train_test_split(
            self.X012_,
            test_size=self.validation_split,
            random_state=self.seed,
            shuffle=True,
        )

        # Simulate missing values on test set.
        sim_transformer = SimGenotypeDataTransformer(**self.sim_params.to_dict())

        X_test = np.nan_to_num(X_test, nan=-1)  # ensure missing are -1
        sim_transformer.fit(X_test)
        X_test_sim, missing_masks = sim_transformer.transform(X_test)
        sim_mask = missing_masks["simulated"]
        X_test_sim[X_test_sim < 0] = np.nan  # ensure missing are NaN

        self.model_params_ = self.params.to_dict()
        self.model_params_["n_jobs"] = self.n_jobs
        self.model_params_["random_state"] = self.seed

        # Train IterativeImputer
        est = self.Model(**self.model_params_)

        self.imputer_ = IterativeImputer(estimator=est, **self.imputer_params.to_dict())

        self.imputer_.fit(X_train)
        self.is_fit_ = True

        X_test_imputed = self.imputer_.transform(X_test_sim)

        # Predict on simulated test set
        y_true_flat = X_test[sim_mask].copy()
        y_pred_flat = X_test_imputed[sim_mask].copy()

        # Round and clip predictions to valid {0,1,2} or {0,1} if haploid.
        # NOTE(review): if haploid data is encoded as {0, 2}, the [0, 1] clip
        # maps 2 -> 1 before decode_012 below - confirm this is intended.
        upper = 1 if self.is_haploid_ else 2
        y_pred_flat = np.clip(np.rint(y_pred_flat), 0, upper).astype(int, copy=False)
        y_true_flat = np.clip(np.rint(y_true_flat), 0, upper).astype(int, copy=False)

        # Evaluate (012 / zygosity)
        self._evaluate_012_and_plot(y_true_flat.copy(), y_pred_flat.copy())

        # Evaluate (IUPAC)
        encodings_dict = {
            "A": 0,
            "C": 1,
            "G": 2,
            "T": 3,
            "W": 4,
            "R": 5,
            "M": 6,
            "K": 7,
            "Y": 8,
            "S": 9,
            "N": -1,
        }

        y_true_iupac_tmp = self.pgenc.decode_012(y_true_flat)
        y_pred_iupac_tmp = self.pgenc.decode_012(y_pred_flat)
        y_true_iupac = self.pgenc.convert_int_iupac(
            y_true_iupac_tmp, encodings_dict=encodings_dict
        )
        y_pred_iupac = self.pgenc.convert_int_iupac(
            y_pred_iupac_tmp, encodings_dict=encodings_dict
        )
        self._evaluate_iupac10_and_plot(y_true_iupac, y_pred_iupac)

        # Build a *new* dict: updating an alias of model_params_ would leak
        # imputer/simulation keys into the RandomForestClassifier kwargs.
        self.best_params_ = {
            **self.model_params_,
            **self.imputer_params.to_dict(),
            **self.sim_params.to_dict(),
        }
        self._save_best_params(self.best_params_)

        return self

    def transform(self) -> np.ndarray:
        """Impute all samples and return the decoded genotypes.

        The fitted ``IterativeImputer`` is applied to the full 0/1/2 matrix
        cached by ``fit()``. Imputed values are rounded and clipped to the
        valid 0..2 range and cast to integers before decoding, mirroring the
        rounding done for evaluation in ``fit()``. Entries that are still
        NaN or negative after imputation are set to -9 (with a warning).

        Returns:
            np.ndarray: Result of ``self.pgenc.decode_012`` on the
            (n_samples, n_loci) integer matrix. The exact decoded
            representation is defined by ``GenotypeEncoder``.

        Raises:
            NotFittedError: If ``fit()`` has not been called.
        """
        if not self.is_fit_:
            msg = "Imputer has not been fit; call fit() before transform()."
            self.logger.error(msg)
            raise NotFittedError(msg)

        X = self.X012_.copy()
        X_imp = self.imputer_.transform(X)

        still_missing = np.isnan(X_imp) | (X_imp < 0)
        if still_missing.any():
            self.logger.warning("Some imputed values are still missing; setting to -9.")

        # Features that had no missing values in the training split are only
        # filled by the initial (mean) imputation at transform time, which
        # can be fractional, so snap everything onto integer genotype codes.
        # The clip is always 0..2 here (not 0..1 for haploids as in fit())
        # so that a valid genotype 2 is never rewritten as 1.
        X_imp[still_missing] = 0.0  # placeholder so the int cast is defined
        X_out = np.clip(np.rint(X_imp), 0, 2).astype(int, copy=False)
        X_out[still_missing] = -9

        return self.pgenc.decode_012(X_out)