pg-sui 0.2.0__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/METADATA +101 -79
  2. pg_sui-1.6.14.dev9.dist-info/RECORD +81 -0
  3. {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.14.dev9.dist-info/entry_points.txt +4 -0
  5. {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info/licenses}/LICENSE +0 -0
  6. pg_sui-1.6.14.dev9.dist-info/top_level.txt +1 -0
  7. pgsui/__init__.py +35 -54
  8. pgsui/_version.py +34 -0
  9. pgsui/cli.py +909 -0
  10. pgsui/data_processing/__init__.py +0 -0
  11. pgsui/data_processing/config.py +565 -0
  12. pgsui/data_processing/containers.py +1424 -0
  13. pgsui/data_processing/transformers.py +557 -907
  14. pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  15. pgsui/electron/app/__main__.py +5 -0
  16. pgsui/electron/app/extra-resources/.gitkeep +1 -0
  17. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  18. pgsui/electron/app/icons/icons/128x128.png +0 -0
  19. pgsui/electron/app/icons/icons/16x16.png +0 -0
  20. pgsui/electron/app/icons/icons/24x24.png +0 -0
  21. pgsui/electron/app/icons/icons/256x256.png +0 -0
  22. pgsui/electron/app/icons/icons/32x32.png +0 -0
  23. pgsui/electron/app/icons/icons/48x48.png +0 -0
  24. pgsui/electron/app/icons/icons/512x512.png +0 -0
  25. pgsui/electron/app/icons/icons/64x64.png +0 -0
  26. pgsui/electron/app/icons/icons/icon.icns +0 -0
  27. pgsui/electron/app/icons/icons/icon.ico +0 -0
  28. pgsui/electron/app/main.js +227 -0
  29. pgsui/electron/app/package-lock.json +6894 -0
  30. pgsui/electron/app/package.json +51 -0
  31. pgsui/electron/app/preload.js +15 -0
  32. pgsui/electron/app/server.py +157 -0
  33. pgsui/electron/app/ui/logo.png +0 -0
  34. pgsui/electron/app/ui/renderer.js +131 -0
  35. pgsui/electron/app/ui/styles.css +59 -0
  36. pgsui/electron/app/ui/ui_shim.js +72 -0
  37. pgsui/electron/bootstrap.py +43 -0
  38. pgsui/electron/launch.py +57 -0
  39. pgsui/electron/package.json +14 -0
  40. pgsui/example_data/__init__.py +0 -0
  41. pgsui/example_data/phylip_files/__init__.py +0 -0
  42. pgsui/example_data/phylip_files/test.phy +0 -0
  43. pgsui/example_data/popmaps/__init__.py +0 -0
  44. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  45. pgsui/example_data/structure_files/__init__.py +0 -0
  46. pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
  47. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  48. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  49. pgsui/impute/__init__.py +0 -0
  50. pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
  51. pgsui/impute/deterministic/imputers/mode.py +844 -0
  52. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  53. pgsui/impute/deterministic/imputers/phylo.py +973 -0
  54. pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
  55. pgsui/impute/supervised/__init__.py +0 -0
  56. pgsui/impute/supervised/base.py +343 -0
  57. pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  58. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
  59. pgsui/impute/supervised/imputers/random_forest.py +291 -0
  60. pgsui/impute/unsupervised/__init__.py +0 -0
  61. pgsui/impute/unsupervised/base.py +1118 -0
  62. pgsui/impute/unsupervised/callbacks.py +92 -262
  63. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
  64. pgsui/impute/unsupervised/imputers/autoencoder.py +1285 -0
  65. pgsui/impute/unsupervised/imputers/nlpca.py +1554 -0
  66. pgsui/impute/unsupervised/imputers/ubp.py +1575 -0
  67. pgsui/impute/unsupervised/imputers/vae.py +1228 -0
  68. pgsui/impute/unsupervised/loss_functions.py +261 -0
  69. pgsui/impute/unsupervised/models/__init__.py +0 -0
  70. pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
  71. pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
  72. pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
  73. pgsui/impute/unsupervised/models/vae_model.py +269 -630
  74. pgsui/impute/unsupervised/nn_scorers.py +255 -0
  75. pgsui/utils/__init__.py +0 -0
  76. pgsui/utils/classification_viz.py +608 -0
  77. pgsui/utils/logging_utils.py +22 -0
  78. pgsui/utils/misc.py +35 -480
  79. pgsui/utils/plotting.py +996 -829
  80. pgsui/utils/pretty_metrics.py +290 -0
  81. pgsui/utils/scorers.py +213 -666
  82. pg_sui-0.2.0.dist-info/RECORD +0 -75
  83. pg_sui-0.2.0.dist-info/top_level.txt +0 -3
  84. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  85. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  86. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  87. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  88. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  89. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  90. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  91. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  92. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  93. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  94. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  95. pgsui/example_data/trees/test.iqtree +0 -376
  96. pgsui/example_data/trees/test.qmat +0 -5
  97. pgsui/example_data/trees/test.rate +0 -2033
  98. pgsui/example_data/trees/test.tre +0 -1
  99. pgsui/example_data/trees/test_n10.rate +0 -19
  100. pgsui/example_data/trees/test_n100.rate +0 -109
  101. pgsui/example_data/trees/test_n500.rate +0 -509
  102. pgsui/example_data/trees/test_siterates.txt +0 -2024
  103. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  104. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  105. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  106. pgsui/example_data/vcf_files/test.vcf +0 -244
  107. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  108. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  109. pgsui/impute/estimators.py +0 -1268
  110. pgsui/impute/impute.py +0 -1463
  111. pgsui/impute/simple_imputers.py +0 -1431
  112. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
  113. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
  114. pgsui/impute/unsupervised/keras_classifiers.py +0 -697
  115. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  116. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
  117. pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
  118. pgsui/pg_sui.py +0 -261
  119. pgsui/utils/sequence_tools.py +0 -407
  120. simulation/sim_benchmarks.py +0 -333
  121. simulation/sim_treeparams.py +0 -475
  122. test/__init__.py +0 -0
  123. test/pg_sui_simtest.py +0 -215
  124. test/pg_sui_testing.py +0 -523
  125. test/test.py +0 -151
  126. test/test_pgsui.py +0 -374
  127. test/test_tkc.py +0 -185
@@ -0,0 +1,291 @@
1
+ # Standard library
2
+ from __future__ import annotations
3
+
4
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal
5
+
6
+ # Third-party
7
+ import numpy as np
8
+ from sklearn.ensemble import RandomForestClassifier
9
+ from sklearn.exceptions import NotFittedError
10
+ from sklearn.experimental import enable_iterative_imputer # noqa
11
+ from sklearn.impute import IterativeImputer
12
+ from sklearn.model_selection import train_test_split
13
+
14
+ # Project
15
+ from snpio.analysis.genotype_encoder import GenotypeEncoder
16
+ from snpio.utils.logging import LoggerManager
17
+
18
+ from pgsui.data_processing.config import apply_dot_overrides, load_yaml_to_dataclass
19
+ from pgsui.data_processing.containers import (
20
+ RFConfig,
21
+ _ImputerParams,
22
+ _RFParams,
23
+ _SimParams,
24
+ )
25
+ from pgsui.data_processing.transformers import SimGenotypeDataTransformer
26
+ from pgsui.impute.supervised.base import BaseImputer
27
+ from pgsui.utils.logging_utils import configure_logger
28
+ from pgsui.utils.plotting import Plotting
29
+ from pgsui.utils.scorers import Scorer
30
+
31
+ if TYPE_CHECKING:
32
+ from snpio.read_input.genotype_data import GenotypeData
33
+
34
+
35
def ensure_rf_config(config: RFConfig | Dict | str | None) -> RFConfig:
    """Normalize the accepted configuration inputs into an ``RFConfig``.

    Args:
        config: One of the following:
            * ``None`` -- a default-constructed ``RFConfig`` is returned.
            * ``RFConfig`` -- returned unchanged (same object).
            * ``str`` -- handed to ``load_yaml_to_dataclass`` as a YAML path.
            * ``dict`` -- an optional ``"preset"`` entry selects the starting
              configuration via ``RFConfig.from_preset``; every other entry
              is flattened to dotted keys (``{"a": {"b": 1}}`` becomes
              ``{"a.b": 1}``) and applied with ``apply_dot_overrides``.

    Returns:
        RFConfig: The resolved configuration.

    Raises:
        TypeError: If ``config`` is none of the supported types.
    """
    if config is None:
        return RFConfig()
    if isinstance(config, RFConfig):
        return config
    if isinstance(config, str):
        return load_yaml_to_dataclass(config, RFConfig)
    if not isinstance(config, dict):
        raise TypeError("config must be an RFConfig, dict, YAML path, or None.")

    def _iter_dotted(mapping: Dict[str, Any], parent: str = ""):
        """Yield ``(dotted_key, leaf_value)`` pairs in depth-first order."""
        for name, item in mapping.items():
            path = f"{parent}.{name}" if parent else name
            if isinstance(item, dict):
                yield from _iter_dotted(item, path)
            else:
                yield path, item

    # Work on a shallow copy so popping "preset" never mutates the caller's dict.
    remaining = dict(config)
    preset_name = remaining.pop("preset", None)
    starting = RFConfig.from_preset(preset_name) if preset_name else RFConfig()

    dotted_overrides: Dict[str, Any] = dict(_iter_dotted(remaining))
    return apply_dot_overrides(starting, dotted_overrides)
62
+
63
+
64
class ImputeRandomForest(BaseImputer):
    """Supervised random-forest genotype imputer driven by :class:`RFConfig`.

    A ``RandomForestClassifier`` is used as the per-feature estimator inside
    scikit-learn's ``IterativeImputer``. ``fit()`` trains on a subset of the
    samples and scores the model on artificially masked genotypes from the
    held-out samples; ``transform()`` imputes the full data set.
    """

    def __init__(
        self,
        genotype_data: "GenotypeData",
        *,
        config: RFConfig | Dict | str | None = None,
        overrides: Dict | None = None,
    ) -> None:
        """Resolve the configuration and set up logging, output dirs and params.

        Args:
            genotype_data: Genotype container handed to ``GenotypeEncoder``.
            config: ``RFConfig`` instance, nested dict, YAML path, or ``None``
                for defaults (resolved by ``ensure_rf_config``).
            overrides: Optional overrides applied on top of ``config`` through
                ``RFConfig.apply_overrides``. Ignored when empty or ``None``.
        """
        self.model_name = "ImputeRandomForest"
        self.Model = RandomForestClassifier

        cfg = ensure_rf_config(config)
        if overrides:
            cfg = cfg.apply_overrides(overrides)
        self.cfg = cfg

        self.genotype_data = genotype_data
        self.pgenc = GenotypeEncoder(genotype_data)

        # I/O and runtime settings.
        self.prefix = cfg.io.prefix
        self.seed = cfg.io.seed
        self.n_jobs = cfg.io.n_jobs
        self.verbose = cfg.io.verbose
        self.debug = cfg.io.debug

        super().__init__(verbose=self.verbose, debug=self.debug)

        logman = LoggerManager(
            __name__, prefix=self.prefix, verbose=self.verbose, debug=self.debug
        )
        self.logger = configure_logger(
            logman.get_logger(), verbose=self.verbose, debug=self.debug
        )

        self._create_model_directories(
            self.prefix, ["models", "plots", "metrics", "optimize", "parameters"]
        )

        # Plot settings.
        self.plot_format: Literal["png", "pdf", "svg", "jpg", "jpeg"] = cfg.plot.fmt
        self.plot_fontsize = cfg.plot.fontsize
        # NOTE(review): the title font size reuses cfg.plot.fontsize; confirm
        # that no dedicated title-size option is expected here.
        self.title_fontsize = cfg.plot.fontsize
        self.plot_dpi = cfg.plot.dpi
        self.despine = cfg.plot.despine
        self.show_plots = cfg.plot.show

        # Passed to train_test_split as ``test_size`` in fit().
        self.validation_split = cfg.train.validation_split

        # Keyword arguments for the RandomForestClassifier estimator.
        self.params = _RFParams(
            n_estimators=cfg.model.n_estimators,
            max_depth=cfg.model.max_depth,
            min_samples_split=cfg.model.min_samples_split,
            min_samples_leaf=cfg.model.min_samples_leaf,
            max_features=cfg.model.max_features,
            criterion=cfg.model.criterion,
            class_weight=cfg.model.class_weight,
        )

        # Keyword arguments for the IterativeImputer wrapper.
        self.imputer_params = _ImputerParams(
            n_nearest_features=cfg.imputer.n_nearest_features,
            max_iter=cfg.imputer.max_iter,
            random_state=self.seed,
            verbose=self.verbose,
        )

        # Keyword arguments for SimGenotypeDataTransformer (used in fit()).
        self.sim_params = _SimParams(
            prop_missing=cfg.sim.prop_missing,
            strategy=cfg.sim.strategy,
            missing_val=cfg.sim.missing_val,
            het_boost=cfg.sim.het_boost,
            seed=self.seed,
        )

        self.max_iter = cfg.imputer.max_iter
        self.n_nearest_features = cfg.imputer.n_nearest_features

        # Will be set in fit()
        self.is_haploid_: bool | None = None
        self.num_classes_: int | None = None
        self.num_features_: int | None = None
        self.rf_models_: List[RandomForestClassifier | None] | None = None
        self.is_fit_: bool = False

    def fit(self) -> "BaseImputer":
        """Fit the imputer on ``self.genotype_data`` and evaluate it.

        Steps:
            1) Take the 0/1/2 encoding from the genotype encoder and turn
               every negative (missing) code into NaN.
            2) Split samples into train/test using ``validation_split`` as
               the test fraction.
            3) Simulate additional missing genotypes on the test samples.
            4) Fit ``IterativeImputer`` (random-forest estimator) on the
               training samples.
            5) Impute the masked test samples and score predictions at the
               **simulated-missing positions** only, both as 0/1/2 classes
               and as IUPAC classes, producing the associated plots.
            6) Save the combined model/imputer/simulation parameters.

        Returns:
            BaseImputer: self.
        """
        # Prepare utilities & metadata
        self.scorers_ = Scorer(
            prefix=self.prefix, average="macro", verbose=self.verbose, debug=self.debug
        )

        pf: Literal["png", "pdf", "svg", "jpg", "jpeg"] = self.plot_format

        self.plotter_ = Plotting(
            self.model_name,
            prefix=self.prefix,
            plot_format=pf,
            plot_dpi=self.plot_dpi,
            plot_fontsize=self.plot_fontsize,
            title_fontsize=self.title_fontsize,
            despine=self.despine,
            show_plots=self.show_plots,
            verbose=self.verbose,
            debug=self.debug,
            multiqc=True,
            multiqc_section=f"PG-SUI: {self.model_name} Model Imputation",
        )

        X_int = self.pgenc.genotypes_012
        self.X012_ = X_int.astype(float)
        self.X012_[self.X012_ < 0] = np.nan  # Ensure missing are NaN
        # Data with no heterozygote (1) calls at all are treated as haploid.
        self.is_haploid_ = np.count_nonzero(self.X012_ == 1) == 0
        self.num_classes_ = 2 if self.is_haploid_ else 3
        self.n_samples_, self.n_features_ = X_int.shape
        # Populate the attribute declared in __init__ as well; ``n_features_``
        # is kept for any caller that already reads it.
        self.num_features_ = self.n_features_

        # Split
        X_train, X_test = train_test_split(
            self.X012_,
            test_size=self.validation_split,
            random_state=self.seed,
            shuffle=True,
        )

        # Simulate missing values on test set.
        sim_transformer = SimGenotypeDataTransformer(**self.sim_params.to_dict())

        X_test = np.nan_to_num(X_test, nan=-1)  # ensure missing are -1
        sim_transformer.fit(X_test)
        X_test_sim, missing_masks = sim_transformer.transform(X_test)
        # Presumably True where the transformer masked an observed genotype;
        # confirm against SimGenotypeDataTransformer.
        sim_mask = missing_masks["simulated"]
        X_test_sim[X_test_sim < 0] = np.nan  # ensure missing are NaN

        self.model_params_ = self.params.to_dict()
        self.model_params_["n_jobs"] = self.n_jobs
        self.model_params_["random_state"] = self.seed

        # Train IterativeImputer
        est = self.Model(**self.model_params_)

        self.imputer_ = IterativeImputer(estimator=est, **self.imputer_params.to_dict())

        self.imputer_.fit(X_train)
        self.is_fit_ = True

        X_test_imputed = self.imputer_.transform(X_test_sim)

        # Predict on simulated test set
        y_true_flat = X_test[sim_mask].copy()
        y_pred_flat = X_test_imputed[sim_mask].copy()

        # Round and clip predictions to valid {0,1,2} or {0,1} if haploid.
        max_call = 1 if self.is_haploid_ else 2
        y_pred_flat = np.clip(np.rint(y_pred_flat), 0, max_call).astype(
            int, copy=False
        )
        y_true_flat = np.clip(np.rint(y_true_flat), 0, max_call).astype(
            int, copy=False
        )

        # Evaluate (012 / zygosity)
        self._evaluate_012_and_plot(y_true_flat.copy(), y_pred_flat.copy())

        # Evaluate (IUPAC)
        encodings_dict = {
            "A": 0,
            "C": 1,
            "G": 2,
            "T": 3,
            "W": 4,
            "R": 5,
            "M": 6,
            "K": 7,
            "Y": 8,
            "S": 9,
            "N": -1,
        }

        y_true_iupac_tmp = self.pgenc.decode_012(y_true_flat)
        y_pred_iupac_tmp = self.pgenc.decode_012(y_pred_flat)
        y_true_iupac = self.pgenc.convert_int_iupac(
            y_true_iupac_tmp, encodings_dict=encodings_dict
        )
        y_pred_iupac = self.pgenc.convert_int_iupac(
            y_pred_iupac_tmp, encodings_dict=encodings_dict
        )
        self._evaluate_iupac10_and_plot(y_true_iupac, y_pred_iupac)

        # Copy before merging: binding best_params_ to the same dict would add
        # the imputer and simulation keys to model_params_ too, so it would no
        # longer hold only the estimator's keyword arguments.
        self.best_params_ = dict(self.model_params_)
        self.best_params_.update(self.imputer_params.to_dict())
        self.best_params_.update(self.sim_params.to_dict())
        self._save_best_params(self.best_params_)

        return self

    def transform(self) -> np.ndarray:
        """Impute all samples and return the decoded genotypes.

        The fitted ``IterativeImputer`` is applied to the full 0/1/2 matrix
        stored by ``fit()``. Imputed values are rounded and clipped to the
        valid genotype range (0..1 if haploid, else 0..2), matching what
        ``fit()`` does before scoring. Any entry that is still NaN or
        negative after imputation is set to -9 and a warning is logged. The
        matrix is then passed through ``GenotypeEncoder.decode_012``.

        Returns:
            np.ndarray: Output of ``decode_012`` for the imputed matrix
            (annotated as an array by this method; the exact encoding is
            determined by ``decode_012``).

        Raises:
            NotFittedError: If ``fit()`` has not been called.
        """
        if not self.is_fit_:
            msg = "Imputer has not been fit; call fit() before transform()."
            self.logger.error(msg)
            raise NotFittedError(msg)

        X = self.X012_.copy()
        X_imp = self.imputer_.transform(X)

        # Record unresolved cells before rounding so the sentinel survives it.
        still_missing = np.isnan(X_imp) | (X_imp < 0)

        # IterativeImputer falls back to its initial (mean) imputation for
        # features that had no missing values during fit, which can yield
        # non-integer values; snap everything to a valid genotype class.
        max_call = 1 if self.is_haploid_ else 2
        X_imp = np.clip(np.rint(X_imp), 0, max_call)

        if still_missing.any():
            self.logger.warning("Some imputed values are still missing; setting to -9.")
            X_imp[still_missing] = -9

        return self.pgenc.decode_012(X_imp)
File without changes