pg-sui 1.0.2.1__py3-none-any.whl → 1.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pg-sui might be problematic. Click here for more details.

Files changed (112) hide show
  1. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/METADATA +51 -70
  2. pg_sui-1.6.8.dist-info/RECORD +78 -0
  3. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.8.dist-info/entry_points.txt +4 -0
  5. pg_sui-1.6.8.dist-info/top_level.txt +1 -0
  6. pgsui/__init__.py +35 -54
  7. pgsui/_version.py +34 -0
  8. pgsui/cli.py +635 -0
  9. pgsui/data_processing/config.py +576 -0
  10. pgsui/data_processing/containers.py +1782 -0
  11. pgsui/data_processing/transformers.py +121 -1103
  12. pgsui/electron/app/__main__.py +5 -0
  13. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  14. pgsui/electron/app/icons/icons/128x128.png +0 -0
  15. pgsui/electron/app/icons/icons/16x16.png +0 -0
  16. pgsui/electron/app/icons/icons/24x24.png +0 -0
  17. pgsui/electron/app/icons/icons/256x256.png +0 -0
  18. pgsui/electron/app/icons/icons/32x32.png +0 -0
  19. pgsui/electron/app/icons/icons/48x48.png +0 -0
  20. pgsui/electron/app/icons/icons/512x512.png +0 -0
  21. pgsui/electron/app/icons/icons/64x64.png +0 -0
  22. pgsui/electron/app/icons/icons/icon.icns +0 -0
  23. pgsui/electron/app/icons/icons/icon.ico +0 -0
  24. pgsui/electron/app/main.js +189 -0
  25. pgsui/electron/app/package-lock.json +6893 -0
  26. pgsui/electron/app/package.json +50 -0
  27. pgsui/electron/app/preload.js +15 -0
  28. pgsui/electron/app/server.py +146 -0
  29. pgsui/electron/app/ui/logo.png +0 -0
  30. pgsui/electron/app/ui/renderer.js +130 -0
  31. pgsui/electron/app/ui/styles.css +59 -0
  32. pgsui/electron/app/ui/ui_shim.js +72 -0
  33. pgsui/electron/bootstrap.py +43 -0
  34. pgsui/electron/launch.py +59 -0
  35. pgsui/electron/package.json +14 -0
  36. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  37. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  38. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  39. pgsui/impute/deterministic/imputers/allele_freq.py +691 -0
  40. pgsui/impute/deterministic/imputers/mode.py +679 -0
  41. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  42. pgsui/impute/deterministic/imputers/phylo.py +971 -0
  43. pgsui/impute/deterministic/imputers/ref_allele.py +530 -0
  44. pgsui/impute/supervised/base.py +339 -0
  45. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +293 -0
  46. pgsui/impute/supervised/imputers/random_forest.py +287 -0
  47. pgsui/impute/unsupervised/base.py +924 -0
  48. pgsui/impute/unsupervised/callbacks.py +89 -263
  49. pgsui/impute/unsupervised/imputers/autoencoder.py +972 -0
  50. pgsui/impute/unsupervised/imputers/nlpca.py +1264 -0
  51. pgsui/impute/unsupervised/imputers/ubp.py +1288 -0
  52. pgsui/impute/unsupervised/imputers/vae.py +957 -0
  53. pgsui/impute/unsupervised/loss_functions.py +158 -0
  54. pgsui/impute/unsupervised/models/autoencoder_model.py +208 -558
  55. pgsui/impute/unsupervised/models/nlpca_model.py +149 -468
  56. pgsui/impute/unsupervised/models/ubp_model.py +198 -1317
  57. pgsui/impute/unsupervised/models/vae_model.py +259 -618
  58. pgsui/impute/unsupervised/nn_scorers.py +215 -0
  59. pgsui/utils/classification_viz.py +591 -0
  60. pgsui/utils/misc.py +35 -480
  61. pgsui/utils/plotting.py +514 -824
  62. pgsui/utils/scorers.py +212 -438
  63. pg_sui-1.0.2.1.dist-info/RECORD +0 -75
  64. pg_sui-1.0.2.1.dist-info/top_level.txt +0 -3
  65. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  66. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  67. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  68. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  69. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  70. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  71. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  72. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  73. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  74. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  75. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  76. pgsui/example_data/trees/test.iqtree +0 -376
  77. pgsui/example_data/trees/test.qmat +0 -5
  78. pgsui/example_data/trees/test.rate +0 -2033
  79. pgsui/example_data/trees/test.tre +0 -1
  80. pgsui/example_data/trees/test_n10.rate +0 -19
  81. pgsui/example_data/trees/test_n100.rate +0 -109
  82. pgsui/example_data/trees/test_n500.rate +0 -509
  83. pgsui/example_data/trees/test_siterates.txt +0 -2024
  84. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  85. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  86. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  87. pgsui/example_data/vcf_files/test.vcf +0 -244
  88. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  89. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  90. pgsui/impute/estimators.py +0 -735
  91. pgsui/impute/impute.py +0 -1486
  92. pgsui/impute/simple_imputers.py +0 -1439
  93. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -785
  94. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1027
  95. pgsui/impute/unsupervised/keras_classifiers.py +0 -702
  96. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  97. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1424
  98. pgsui/impute/unsupervised/neural_network_methods.py +0 -1549
  99. pgsui/pg_sui.py +0 -261
  100. pgsui/utils/sequence_tools.py +0 -407
  101. simulation/sim_benchmarks.py +0 -333
  102. simulation/sim_treeparams.py +0 -475
  103. test/__init__.py +0 -0
  104. test/pg_sui_simtest.py +0 -215
  105. test/pg_sui_testing.py +0 -523
  106. test/test.py +0 -297
  107. test/test_pgsui.py +0 -374
  108. test/test_tkc.py +0 -214
  109. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info/licenses}/LICENSE +0 -0
  110. /pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  111. /pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  112. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
@@ -1,735 +0,0 @@
1
- # Standard library imports
2
- import os
3
- import warnings
4
- from typing import Optional, Union, Dict, Any
5
-
6
- warnings.simplefilter(action="ignore", category=FutureWarning)
7
-
8
-
9
- # Third-party imports
10
- import numpy as np
11
- import pandas as pd
12
-
13
- # Scikit-learn imports
14
- import xgboost as xgb
15
- from sklearn.ensemble import ExtraTreesClassifier
16
- from sklearn.ensemble import RandomForestClassifier
17
- from sklearn.neighbors import KNeighborsClassifier
18
-
19
- # Custom imports
20
- try:
21
- from .impute import Impute
22
- from .unsupervised.neural_network_imputers import VAE, UBP, SAE
23
- except (ModuleNotFoundError, ValueError, ImportError):
24
- from pgsui.impute.impute import Impute
25
- from pgsui.impute.unsupervised.neural_network_imputers import VAE, UBP, SAE
26
-
27
-
28
- class UnsupervisedImputer(Impute):
29
- """Parent class for unsupervised imputers. Contains all common arguments and code between unsupervised imputers.
30
-
31
- Args:
32
-
33
- genotype_data (GenotypeData object): GenotypeData instance that was used to read in the sequence data.
34
-
35
- prefix (str): Prefix for output directory. Defaults to "imputer".
36
-
37
- gridparams (Dict[str, Any] or None, optional): Dictionary with keys=keyword arguments for the specified estimator and values=lists of parameter values or distributions. If ``gridparams=None``\, a grid search is not performed, otherwise ``gridparams`` will be used to specify parameter ranges or distributions for the grid search. If using ``gridsearch_method="gridsearch"``\, then the ``gridparams`` values can be lists or numpy arrays. If using ``gridsearch_method="randomized_gridsearch"``\, distributions can be specified by using scipy.stats.uniform(low, high) (for a uniform distribution) or scipy.stats.loguniform(low, high) (useful if range of values spans orders of magnitude). If using the genetic algorithm grid search by setting ``gridsearch_method="genetic_algorithm"``\, the parameters can be specified as ``sklearn_genetic.space`` objects. The grid search will determine the optimal parameters as those that maximize the scoring metrics. If it takes a long time, run it with a small subset of the data just to find the optimal parameters for the classifier, then run a full imputation using the optimal parameters. Defaults to None (no gridsearch).
38
-
39
- cv (int, optional): Number of cross-validation folds to use with grid search. Defaults to 5.
40
-
41
- validation_split (float, optional): Proportion of training dataset to set aside for loss validation during model training. Defaults to 0.2.
42
-
43
- column_subset (int or float, optional): If float is provided, gets the proportion of the dataset to randomly subset for the grid search or validation. Subsets ``int(n_features * column_subset)`` columns and should be in the range [0, 1]. It can be small if the grid search or validation takes a long time. If int is provided, subset ``column_subset`` columns. Defaults to 1.0.
44
-
45
- epochs (int, optional): Number of epochs (cycles through the data) to run during training. Defaults to 100.
46
-
47
- batch_size (int, optional): Batch size to train the model with. Model training per epoch is performed over multiple subsets of samples (rows) of size ``batch_size``\. Defaults to 32.
48
-
49
- n_components (int, optional): Number of components (latent dimensions) to compress the input features to. Defaults to 3.
50
-
51
- early_stop_gen (int, optional): Only used with the genetic algorithm grid search option. Stop training early if the model sees ``early_stop_gen`` consecutive generations without improvement to the scoring metric. This can save training time by reducing the number of epochs and generations that are performed. Defaults to 25.
52
-
53
- num_hidden_layers (int, optional): Number of hidden layers to use in the model. Adjust if overfitting or underfitting occurs. Defaults to 1.
54
-
55
- hidden_layer_sizes (str, List[int], List[str], or int, optional): Number of neurons to use in the hidden layers. If string or a list of strings is passed, the strings must be either "midpoint", "sqrt", or "log2". "midpoint" will calculate the midpoint as ``(n_features + n_components) / 2``\. If "sqrt" is supplied, the square root of the number of features will be used to calculate the output units. If "log2" is supplied, the units will be calculated as ``log2(n_features)``\. hidden_layer_sizes will calculate and set the number of output units for each hidden layer. If multiple hidden layers are supplied, each subsequent layer's dimensions are further reduced by the "midpoint", "sqrt", or "log2". E.g., if using ``num_hidden_layers=3`` and ``n_components=2``\, and there are 100 features (columns), the hidden layer sizes for ``midpoint`` will be: [51, 27, 14]. If a single string or integer is supplied, the model will use the same number of output units for each hidden layer. If a list of integers or strings is supplied, the model will use the values supplied in the list. The list length must be equal to the ``num_hidden_layers`` and all hidden layer sizes must be > n_components. Defaults to "midpoint".
56
-
57
- hidden_activation (str, optional): The activation function to use for the hidden layers. See tf.keras.activations for more info. Supported activation functions include: ["elu", "selu", "leaky_relu", "prelu", "relu"]. Each activation function has some advantages and disadvantages and determines the curve and non-linearity of gradient descent. Some are also faster than others. See https://towardsdatascience.com/7-popular-activation-functions-you-should-know-in-deep-learning-and-how-to-use-them-with-keras-and-27b4d838dfe6 for more information. Note that using ``hidden_activation="selu"`` will force ``weights_initializer`` to be "lecun_normal". Defaults to "elu".
58
-
59
- optimizer (str, optional): The optimizer to use with gradient descent. Supported options are: "adam", "sgd", and "adagrad". See tf.keras.optimizers for more info. Defaults to "adam".
60
-
61
- learning_rate (float, optional): The learning rate for the optimizer. Adjust if the loss is learning too slowly or quickly. If you are getting overfitting, it is likely too high, and likewise underfitting can occur when the learning rate is too low. Defaults to 0.01.
62
-
63
- lr_patience (int, optional): Number of epochs without loss improvement to wait before reducing the learning rate. Defaults to 1.0.
64
-
65
- weights_initializer (str, optional): Initializer to use for the model weights. See tf.keras.initializers for more info. Defaults to "glorot_normal".
66
-
67
- l1_penalty (float, optional): L1 regularization penalty to apply. Adjust if the model is over or underfitting. If this value is too high, underfitting can occur, and vice versa. Defaults to 1e-6.
68
-
69
- l2_penalty (float, optional): L2 regularization penalty to apply. If this value is too high, underfitting can occur, and vice versa. Defaults to 1e-6.
70
-
71
- dropout_rate (float, optional): Neuron dropout rate during training. Dropout randomly disables ``dropout_rate`` proportion of neurons during training, which can reduce overfitting. E.g., if dropout_rate is set to 0.2, then 20% of the neurons are randomly dropped out per epoch. Adjust if the model is over or underfitting. Must be a float in the range [0, 1]. Defaults to 0.2.
72
-
73
- sample_weights (str, Dict[int, float], or None, optional): Weights for the ACTG-encoded classes during training. If None, then does not weight classes. If set to "auto", then class weights are automatically calculated for each column to balance classes. If a dictionary is passed, it must contain "A", "C", "G", and "T" as the keys and the class weights as the values. E.g., {"A": 1.0, "C": 1.0, "G": 1.0, "T": 1.0}. The dictionary is then used as the overall class weights. Defaults to None (no weighting).
74
-
75
- gridsearch_method (str, optional): Grid search method to use. Supported options include: {"gridsearch", "randomized_gridsearch", "genetic_algorithm"}. "gridsearch" uses GridSearchCV to test every possible parameter combination. "randomized_gridsearch" picks ``grid_iter`` random combinations of parameters to test. "genetic_algorithm" uses a genetic algorithm via the sklearn-genetic-opt GASearchCV module to do the grid search. If doing a grid search, "randomized_gridsearch" takes the least amount of time because it does not have to test all parameters. "genetic_algorithm" takes the longest. See the scikit-learn GridSearchCV and RandomizedSearchCV documentation for the "gridsearch" and "randomized_gridsearch" options, and the sklearn-genetic-opt GASearchCV documentation for the "genetic_algorithm" option. Defaults to "gridsearch".
76
-
77
- grid_iter (int, optional): Number of iterations to use for randomized and genetic algorithm grid searches. For randomized grid search, ``grid_iter`` parameter combinations will be randomly sampled. For the genetic algorithm, this determines how many generations the genetic algorithm will run. Defaults to 80.
78
-
79
- scoring_metric (str, optional): Scoring metric to use for grid searches. The neural network imputers use a multimetric scorer and use different string values for the grid searches. Supported options include: {"accuracy", "hamming", "roc_auc_micro", "roc_auc_macro", "roc_auc_weighted", "average_precision_micro", "average_precision_macro", "average_precision_weighted", "f1_micro", "f1_macro", and "f1_weighted"}. All of the above metrics are calculated during the grid search, but the provided string just sets the metric that the grid search refits to (i.e., which one is used in the best estimator). See the scikit-learn documentation (https://scikit-learn.org/stable/modules/model_evaluation.html) for more information. Defaults to "f1_weighted".
80
-
81
- population_size (int or str, optional): Only used for the genetic algorithm grid search. Size of the initial population to sample randomly generated individuals. If set to "auto", then ``population_size`` is calculated as ``15 * n_parameters``\. If set to an integer, then uses the integer value as ``population_size``\. If you need to speed up the genetic algorithm grid search, try decreasing this parameter. See GASearchCV in the sklearn-genetic-opt documentation (https://sklearn-genetic-opt.readthedocs.io) for more info. Defaults to "auto".
82
-
83
- tournament_size (int, optional): For genetic algorithm grid search only. Number of individuals to perform tournament selection. See GASearchCV in the sklearn-genetic-opt documentation (https://sklearn-genetic-opt.readthedocs.io) for more info. Defaults to 3.
84
-
85
- elitism (bool, optional): For genetic algorithm grid search only. If set to True, takes the ``tournament_size`` best solution to the next generation. See GASearchCV in the sklearn-genetic-opt documentation (https://sklearn-genetic-opt.readthedocs.io) for more info. Defaults to True.
86
-
87
- crossover_probability (float, optional): For genetic algorithm grid search only. Probability of crossover operation between two individuals. See GASearchCV in the sklearn-genetic-opt documentation (https://sklearn-genetic-opt.readthedocs.io) for more info. Defaults to 0.2.
88
-
89
- mutation_probability (float, optional): For genetic algorithm grid search only. Probability of child mutation. See GASearchCV in the sklearn-genetic-opt documentation (https://sklearn-genetic-opt.readthedocs.io) for more info. Defaults to 0.8.
90
-
91
- ga_algorithm (str, optional): For genetic algorithm grid search only. Evolutionary algorithm to use. Supported options include: {"eaMuPlusLambda", "eaMuCommaLambda", "eaSimple"}. If you need to speed up the genetic algorithm grid search, try setting ``ga_algorithm`` to "eaSimple", at the expense of evolutionary model robustness. See more details in the DEAP algorithms documentation (https://deap.readthedocs.io). Defaults to "eaMuPlusLambda".
92
-
93
- sim_strategy (str, optional): Strategy to use for simulating missing data. Only used to validate the accuracy of the imputation. The final model will be trained with the non-simulated dataset. Supported options include: {"random", "random_weighted", "nonrandom", "nonrandom_weighted"}. "random" randomly simulates missing data, while "random_weighted" also does this but balances selection of reference, heterozygous, and alternate alleles. When set to "nonrandom", branches from ``GenotypeData.guidetree`` will be randomly sampled to generate missing data on descendant nodes. For "nonrandom_weighted", missing data will be placed on nodes proportionally to their branch lengths (e.g., to generate data distributed as might be the case with mutation-disruption of RAD sites). If using the "nonrandom" or "nonrandom_weighted" options, a guide tree is required to have been initialized in the passed ``genotype_data`` object. Defaults to "random_weighted".
94
-
95
- sim_prop_missing (float, optional): Proportion of missing data to use with missing data simulation. Defaults to 0.2.
96
-
97
- disable_progressbar (bool, optional): Whether to disable the tqdm progress bar. Useful if you are doing the imputation on e.g. a high-performance computing cluster, where sometimes tqdm does not work correctly when being written to a file. If False, uses tqdm progress bar. If True, does not use tqdm. Defaults to False.
98
-
99
- n_jobs (int, optional): Number of parallel jobs to use in the grid search if ``gridparams`` is not None. -1 means use all available processors. Defaults to -1 (all CPUs).
100
-
101
- verbose (int, optional): Verbosity flag. The higher, the more verbose. Possible values are 0, 1, or 2. 0 = silent, 1 = progress bar, 2 = one line per epoch. Note that the progress bar is not particularly useful when logged to a file, so verbose=0 or verbose=2 is recommended when not running interactively. Setting verbose higher than 0 is useful for initial runs and debugging, but can slow down training. Defaults to 0.
102
-
103
- kwargs (Dict[str, Any], optional): Possible options include: {"testing": True/False}. If testing is True, a confusion matrix plot will be created showing model performance. Arrays of the true and predicted values will also be printed to STDOUT. testing defaults to False.
104
-
105
- Attributes:
106
- imputed (GenotypeData): New GenotypeData instance with imputed data.
107
-
108
- best_params (Dict[str, Any]): Best found parameters from grid search.
109
- """
110
-
111
- def __init__(
112
- self,
113
- genotype_data,
114
- clf,
115
- clf_type,
116
- *,
117
- prefix="imputer",
118
- gridparams=None,
119
- cv: int = 5,
120
- validation_split=0.2,
121
- column_subset=1.0,
122
- epochs=100,
123
- batch_size=32,
124
- n_components=3,
125
- early_stop_gen=25,
126
- num_hidden_layers=1,
127
- hidden_layer_sizes="midpoint",
128
- optimizer="adam",
129
- hidden_activation="elu",
130
- learning_rate=0.01,
131
- weights_initializer="glorot_normal",
132
- l1_penalty=1e-6,
133
- l2_penalty=1e-6,
134
- dropout_rate=0.2,
135
- kl_beta=1.0,
136
- sample_weights=None,
137
- gridsearch_method="gridsearch",
138
- grid_iter=80,
139
- scoring_metric="f1_weighted",
140
- population_size="auto",
141
- tournament_size=3,
142
- elitism=True,
143
- crossover_probability=0.2,
144
- mutation_probability=0.8,
145
- ga_algorithm="eaMuPlusLambda",
146
- sim_strategy="random_weighted",
147
- sim_prop_missing=0.2,
148
- disable_progressbar=False,
149
- n_jobs=-1,
150
- verbose=0,
151
- **kwargs,
152
- ):
153
- all_kwargs = locals()
154
- all_kwargs.pop("clf")
155
- all_kwargs.pop("clf_type")
156
-
157
- self.clf = clf
158
- self.clf_type = clf_type
159
-
160
- imp_kwargs = {
161
- "str_encodings": {"A": 1, "C": 2, "G": 3, "T": 4, "N": -9},
162
- }
163
- all_kwargs.update(imp_kwargs)
164
- all_kwargs.pop("kwargs")
165
-
166
- super().__init__(self.clf, self.clf_type, all_kwargs)
167
-
168
- if genotype_data is None:
169
- raise TypeError("genotype_data cannot be NoneType")
170
-
171
- X = genotype_data.genotypes_int
172
-
173
- if not isinstance(X, pd.DataFrame):
174
- df = pd.DataFrame(X)
175
- else:
176
- df = X.copy()
177
-
178
- self.imputed, self.best_params = self.fit_predict(df)
179
-
180
-
181
- class SupervisedImputer(Impute):
182
- """Parent class for the supervised imputers. Contains all common arguments and code between supervised imputers.
183
-
184
-
185
- Args:
186
-
187
- genotype_data (GenotypeData object): GenotypeData instance that was used to read in the sequence data.
188
-
189
- prefix (str): Prefix for imputed data's output directory.
190
-
191
- gridparams (Dict[str, Any] or None, optional): Dictionary with keys=keyword arguments for the specified estimator and values=lists of parameter values or distributions. If ``gridparams=None``\, a grid search is not performed, otherwise ``gridparams`` will be used to specify parameter ranges or distributions for the grid search. If using ``gridsearch_method="gridsearch"``\, then the ``gridparams`` values can be lists or numpy arrays. If using ``gridsearch_method="randomized_gridsearch"``\, distributions can be specified by using scipy.stats.uniform(low, high) (for a uniform distribution) or scipy.stats.loguniform(low, high) (useful if range of values spans orders of magnitude). If using the genetic algorithm grid search by setting ``gridsearch_method="genetic_algorithm"``\, the parameters can be specified as ``sklearn_genetic.space`` objects. The grid search will determine the optimal parameters as those that maximize the scoring_methods. NOTE: Takes a long time, so you can run it with a small subset of the data using the ``column_subset`` argument just to find the optimal parameters for the classifier, then it will automatically run a full imputation using the optimal parameters. Defaults to None (no gridsearch).
192
-
193
- do_validation (bool, optional): Whether to validate the imputation if not doing a grid search. This validation method randomly replaces between 15% and 50% of the known, non-missing genotypes in ``n_features * column_subset`` of the features. It then imputes the newly missing genotypes for which we know the true values and calculates validation scores. This procedure is replicated ``cv`` times and a mean, median, minimum, maximum, lower 95% confidence interval (CI) of the mean, and the upper 95% CI are calculated and saved to a CSV file. ``gridparams`` must be set to None for ``do_validation`` to work. Calculating a validation score can be turned off altogether by setting ``do_validation`` to False. Defaults to False.
194
-
195
- column_subset (int or float, optional): If float, proportion of the dataset to randomly subset for the grid search or validation. Should be between 0 and 1, and should also be small, because the grid search or validation takes a long time. If int, subset ``column_subset`` columns. If float, subset ``int(n_features * column_subset)`` columns. Defaults to 0.1.
196
-
197
- cv (int, optional): Number of folds for cross-validation during grid search. Defaults to 5.
198
-
199
- max_iter (int, optional): Maximum number of imputation rounds to perform before returning the imputations computed during the final round. A round is a single imputation of each feature with missing values. Defaults to 10.
200
-
201
- tol (float, optional): Tolerance of the stopping condition for the iterations. Defaults to 1e-3.
202
-
203
- n_nearest_features (int, optional): Number of other features to use to estimate the missing values of each feature column. If None, then all features will be used, but this can consume an intractable amount of computing resources. Nearness between features is measured using the absolute correlation coefficient between each feature pair (after initial imputation). To ensure coverage of features throughout the imputation process, the neighbor features are not necessarily nearest, but are drawn with probability proportional to correlation for each imputed target feature. Reducing this can provide significant speed-up when the number of features is large. Defaults to 10.
204
-
205
- initial_strategy (str, optional): Which strategy to use for initializing the missing values in the training data (neighbor columns). IterativeImputer must initially impute the training data (neighbor columns) using a simple, quick imputation in order to predict the missing values for each target column. The ``initial_strategy`` argument specifies which method to use for this initial imputation. Valid options include: "most_frequent", "populations", "phylogeny", or "mf". "most_frequent" uses the overall mode of each column. "populations" uses the mode per population/per column via a population map file and the ``ImputeAlleleFreq`` class. "phylogeny" uses an input phylogenetic tree and a rate matrix with the ``ImputePhylo`` class. "mf" performs the imputation via matrix factorization with the ``ImputeMF`` class. Note that the "mean" and "median" options from the original IterativeImputer are not supported because they are not sensible settings for the type of input data used here. Defaults to "populations".
206
-
207
- str_encodings (dict(str: int), optional): Integer encodings for nucleotides if input file was in STRUCTURE format. Only used if ``initial_strategy="phylogeny"``\. Defaults to {"A": 1, "C": 2, "G": 3, "T": 4, "N": -9}.
208
-
209
- imputation_order (str, optional): The order in which the features will be imputed. Possible values: "ascending" (from features with fewest missing values to most), "descending" (from features with most missing values to fewest), "roman" (left to right), "arabic" (right to left), "random" (a random order for each round). Defaults to "ascending".
210
-
211
- skip_complete (bool, optional): If True, then features with missing values during transform that did not have any missing values during fit will be imputed with the initial imputation method only. Set to True if you have many features with no missing values at both fit and transform time to save compute time. Defaults to False.
212
-
213
- random_state (int or None, optional): The seed of the pseudo random number generator to use for the iterative imputer. Randomizes selection of estimator features if n_nearest_features is not None or the imputation_order is "random". Use an integer for determinism. If None, then uses a different random seed each time. Defaults to None.
214
-
215
- gridsearch_method (str, optional): Grid search method to use. Supported options include: {"gridsearch", "randomized_gridsearch", and "genetic_algorithm"}. "gridsearch" uses GridSearchCV to test every possible parameter combination. "randomized_gridsearch" picks ``grid_iter`` random combinations of parameters to test. "genetic_algorithm" uses a genetic algorithm via sklearn-genetic-opt GASearchCV to do the grid search. If doing a grid search, "randomized_gridsearch" takes the least amount of time because it does not have to test all parameters. "genetic_algorithm" takes the longest. See the scikit-learn GridSearchCV and RandomizedSearchCV documentation for the "gridsearch" and "randomized_gridsearch" options, and the sklearn-genetic-opt GASearchCV documentation (https://sklearn-genetic-opt.readthedocs.io) for the "genetic_algorithm" option. Defaults to "gridsearch".
216
-
217
- grid_iter (int, optional): Number of iterations for randomized and genetic algorithm grid searches. Defaults to 80.
218
-
219
- population_size (int or str, optional): For genetic algorithm grid search: Size of the initial population to sample randomly generated individuals. If set to "auto", then ``population_size`` is calculated as ``15 * n_parameters``\. If set to an integer, then uses the integer value as ``population_size``\. If you need to speed up the genetic algorithm grid search, try decreasing this parameter. See GASearchCV in the sklearn-genetic-opt documentation (https://sklearn-genetic-opt.readthedocs.io). Defaults to "auto".
220
-
221
- tournament_size (int, optional): For genetic algorithm grid search: Number of individuals to perform tournament selection. See GASearchCV documentation. Defaults to 3.
222
-
223
- elitism (bool, optional): For genetic algorithm grid search: If True takes the tournament_size best solution to the next generation. See GASearchCV documentation. Defaults to True.
224
-
225
- crossover_probability (float, optional): For genetic algorithm grid search: Probability of crossover operation between two individuals. See GASearchCV documentation. Defaults to 0.2.
226
-
227
- mutation_probability (float, optional): For genetic algorithm grid search: Probability of child mutation. See GASearchCV documentation. Defaults to 0.8.
228
-
229
- ga_algorithm (str, optional): For genetic algorithm grid search: Evolutionary algorithm to use. Supported options include: {"eaMuPlusLambda", "eaMuCommaLambda", "eaSimple"}. If you need to speed up the genetic algorithm grid search, try setting ``ga_algorithm`` to "eaSimple", at the expense of evolutionary model robustness. See more details in the DEAP algorithms documentation (https://deap.readthedocs.io). Defaults to "eaMuPlusLambda".
230
-
231
- early_stop_gen (int, optional): If the genetic algorithm sees ``early_stop_gen`` consecutive generations without improvement in the scoring metric, an early stopping callback is implemented. This saves time by reducing the number of generations the genetic algorithm has to perform. Defaults to 5.
232
-
233
- scoring_metric (str, optional): Scoring metric to use for grid searches. See the classification metrics in the scikit-learn documentation (https://scikit-learn.org/stable/modules/model_evaluation.html) for supported options. Defaults to "f1_weighted".
234
-
235
- chunk_size (int or float, optional): Number of loci for which to perform IterativeImputer at one time. Useful for reducing the memory usage if you are running out of RAM. If an integer is specified, selects ``chunk_size`` loci at a time. If a float is specified, selects ``math.ceil(total_loci * chunk_size)`` loci at a time. Defaults to 1.0 (all features).
236
-
237
- disable_progressbar (bool, optional): Whether or not to disable the tqdm progress bar when doing the imputation. If True, progress bar is disabled, which is useful when running the imputation on e.g. an HPC cluster. If the bar is disabled, a status update will be printed to standard output for each iteration and feature instead. If False, the tqdm progress bar will be used. Defaults to False.
238
-
239
- progress_update_percent (int or None, optional): Print status updates for features every ``progress_update_percent``\%. IterativeImputer iterations will always be printed, but ``progress_update_percent`` involves iteration progress through the features of each IterativeImputer iteration. If None, then does not print progress through features. Defaults to None.
240
-
241
- n_jobs (int, optional): Number of parallel jobs to use. If ``gridparams`` is not None, n_jobs is used for the grid search. Otherwise it is used for the classifier. -1 means using all available processors. Defaults to -1 (all CPUs).
242
-
243
- verbose (int, optional): Verbosity flag, controls the debug messages that are issued as functions are evaluated. The higher, the more verbose. Possible values are 0, 1, or 2. Defaults to 0.
244
-
245
- Attributes:
246
- imputed (GenotypeData): New GenotypeData instance with imputed data.
247
-
248
- best_params (Dict[str, Any]): Best found parameters from grid search.
249
-
250
- """
251
-
252
- def __init__(
253
- self,
254
- genotype_data,
255
- clf,
256
- clf_type,
257
- *,
258
- prefix: str = "imputer",
259
- gridparams: Optional[Dict[str, Any]] = None,
260
- do_validation: bool = False,
261
- column_subset: Union[int, float] = 0.1,
262
- cv: int = 5,
263
- max_iter: int = 10,
264
- tol: float = 1e-3,
265
- n_nearest_features: Optional[int] = 10,
266
- initial_strategy: str = "most_frequent",
267
- str_encodings: Dict[str, int] = {
268
- "A": 1,
269
- "C": 2,
270
- "G": 3,
271
- "T": 4,
272
- "N": -9,
273
- },
274
- imputation_order: str = "ascending",
275
- skip_complete: bool = False,
276
- random_state: Optional[int] = None,
277
- gridsearch_method: str = "gridsearch",
278
- grid_iter: int = 80,
279
- population_size: Union[int, str] = "auto",
280
- tournament_size: int = 3,
281
- elitism: bool = True,
282
- crossover_probability: float = 0.2,
283
- mutation_probability: float = 0.8,
284
- ga_algorithm: str = "eaMuPlusLambda",
285
- early_stop_gen: int = 5,
286
- scoring_metric: str = "f1_weighted",
287
- chunk_size: Union[int, float] = 1.0,
288
- disable_progressbar: bool = False,
289
- progress_update_percent: Optional[int] = None,
290
- n_jobs: int = -1,
291
- verbose: int = 0,
292
- **kwargs,
293
- ):
294
- # Get local variables into dictionary object
295
- all_kwargs = locals()
296
- model_kwargs = all_kwargs.pop("kwargs")
297
- all_kwargs.update(model_kwargs)
298
- all_kwargs.pop("self")
299
- all_kwargs.pop("clf")
300
- all_kwargs.pop("clf_type")
301
- all_kwargs.pop("sample_weights")
302
- all_kwargs.pop("kwargs")
303
-
304
- super().__init__(self.clf, self.clf_type, all_kwargs)
305
-
306
- self.imputed, self.best_params = self.fit_predict(
307
- genotype_data.genotypes_012(fmt="pandas")
308
- )
309
-
310
-
311
- class ImputeKNN(SupervisedImputer):
312
- """Does K-Nearest Neighbors Iterative Imputation of missing data. Iterative imputation uses the n_nearest_features to inform the imputation at each feature (i.e., SNP site), using the N most correlated features per site. The N most correlated features are drawn with probability proportional to correlation for each imputed target feature to ensure coverage of features throughout the imputation process.
313
-
314
- Args:
315
- genotype_data (GenotypeData object): GenotypeData instance that was used to read in the sequence data.
316
-
317
- n_neighbors (int, optional): Number of neighbors to use for K-Nearest Neighbors queries. Defaults to 5.
318
-
319
- weights (str, optional): Weight function used in prediction. Possible values: 'uniform': Uniform weights, with all points in each neighborhood weighted equally; 'distance': Weight points by the inverse of their distance, so that closer neighbors of a query point have a greater influence than neighbors that are further away; a callable: A user-defined function that accepts an array of distances and returns an array of the same shape containing the weights. Defaults to "distance".
320
-
321
- algorithm (str, optional): Algorithm used to compute the nearest neighbors. Possible values: 'ball_tree', 'kd_tree', 'brute', 'auto'. Defaults to "auto".
322
-
323
- leaf_size (int, optional): Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. Defaults to 30.
324
-
325
- p (int, optional): Power parameter for the Minkowski metric. When p=1, this is equivalent to using manhattan_distance (l1), and if p=2 it is equivalent to using euclidean distance (l2). For arbitrary p, minkowski_distance (l_p) is used. Defaults to 2.
326
-
327
- metric (str, optional): The distance metric to use for the tree. The default metric is minkowski, and with p=2 this is equivalent to the standard Euclidean metric. See the documentation of sklearn.DistanceMetric for a list of available metrics. If metric is 'precomputed', X is assumed to be a distance matrix and must be square during fit. Defaults to "minkowski".
328
-
329
- Example:
330
- >>> data = GenotypeData(
331
- >>> filename="test.str",
332
- >>> filetype="auto",
333
- >>> guidetree="test.tre",
334
- >>> qmatrix_iqtree="test.iqtree"
335
- >>> )
336
- >>>
337
- >>> # Genetic Algorithm grid_params
338
- >>> grid_params = {
339
- >>> "n_neighbors": Integer(3, 10),
340
- >>> "leaf_size": Integer(10, 50),
341
- >>> }
342
- >>>
343
- >>> knn = ImputeKNN(
344
- >>> genotype_data=data,
345
- >>> gridparams=grid_params,
346
- >>> cv=5,
347
- >>> gridsearch_method="genetic_algorithm",
348
- >>> n_nearest_features=10,
349
- >>> n_neighbors=5,
350
- >>> initial_strategy="phylogeny",
351
- >>> )
352
- >>>
353
- >>> knn_gtdata = knn.imputed
354
- """
355
-
356
- def __init__(
357
- self,
358
- genotype_data: Any,
359
- *,
360
- n_neighbors: int = 5,
361
- weights: str = "distance",
362
- algorithm: str = "auto",
363
- leaf_size: int = 30,
364
- p: int = 2,
365
- metric: str = "minkowski",
366
- **kwargs,
367
- ) -> None:
368
- all_kwargs = locals()
369
- kwargs.update(all_kwargs)
370
-
371
- self.clf_type = "classifier"
372
- self.clf = KNeighborsClassifier
373
-
374
- if "self" in kwargs:
375
- kwargs.pop("self")
376
- if "genotype_data" in kwargs:
377
- kwargs.pop("genotype_data")
378
- if "early_stop_gen" in kwargs:
379
- kwargs.pop("early_stop_gen")
380
-
381
- super().__init__(genotype_data, self.clf, self.clf_type, **kwargs)
382
-
383
-
384
- class ImputeRandomForest(SupervisedImputer):
385
- """Does Random Forest or Extra Trees Iterative imputation of missing data. Iterative imputation uses the n_nearest_features to inform the imputation at each feature (i.e., SNP site), using the N most correlated features per site. The N most correlated features are drawn with probability proportional to correlation for each imputed target feature to ensure coverage of features throughout the imputation process.
386
-
387
- Args:
388
- genotype_data (GenotypeData object): GenotypeData instance that was used to read in the sequence data.
389
-
390
- extratrees (bool, optional): Whether to use ExtraTreesClassifier (If True) instead of RandomForestClassifier (If False). ExtraTreesClassifier is faster, but is not supported by the scikit-learn-intelex patch, whereas RandomForestClassifier is. If using an Intel CPU, the optimizations provided by the scikit-learn-intelex patch might make setting ``extratrees=False`` worthwhile. If you are not using an Intel CPU, the scikit-learn-intelex library is not supported and ExtraTreesClassifier will be faster with similar performance. NOTE: If using scikit-learn-intelex, ``criterion`` must be set to "gini" and ``oob_score`` to False, as those parameters are not currently supported herein. Defaults to True.
391
-
392
- n_estimators (int, optional): The number of trees in the forest. Increasing this value can improve the fit, but at the cost of compute time and resources. Defaults to 100.
393
-
394
- criterion (str, optional): The function to measure the quality of a split. Supported values are "gini" for the Gini impurity and "entropy" for the information gain. Defaults to "gini".
395
-
396
- max_depth (int, optional): The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. Defaults to None.
397
-
398
- min_samples_split (int or float, optional): The minimum number of samples required to split an internal node. If value is an integer, then considers min_samples_split as the minimum number. If value is a floating point, then min_samples_split is a fraction and (min_samples_split * n_samples), rounded up to the nearest integer, are the minimum number of samples for each split. Defaults to 2.
399
-
400
- min_samples_leaf (int or float, optional): The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. If value is an integer, then ``min_samples_leaf`` is the minimum number. If value is floating point, then ``min_samples_leaf`` is a fraction and ``int(min_samples_leaf * n_samples)`` is the minimum number of samples for each node. Defaults to 1.
401
-
402
- min_weight_fraction_leaf (float, optional): The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. Defaults to 0.0.
403
-
404
- max_features (str, int, float, or None, optional): The number of features to consider when looking for the best split. If int, then consider "max_features" features at each split. If float, then "max_features" is a fraction and ``int(max_features * n_features)`` features are considered at each split. If "sqrt", then ``max_features=sqrt(n_features)``\. If "log2", then ``max_features=log2(n_features)``\. If None, then ``max_features=n_features``\. Defaults to "sqrt".
405
-
406
- max_leaf_nodes (int or None, optional): Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. Defaults to None.
407
-
408
- min_impurity_decrease (float, optional): A node will be split if this split induces a decrease of the impurity greater than or equal to this value. See ``sklearn.ensemble.ExtraTreesClassifier`` documentation for more information. Defaults to 0.0.
409
-
410
- bootstrap (bool, optional): Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree. Defaults to False.
411
-
412
- oob_score (bool, optional): Whether to use out-of-bag samples to estimate the generalization score. Only available if ``bootstrap=True``\. Defaults to False.
413
-
414
- max_samples (int, float, or None, optional): If bootstrap is True, the number of samples to draw from X to train each base estimator. If None (default), then draws ``X.shape[0]`` samples. If int, then draws ``max_samples`` samples. If float, then draws ``int(max_samples * X.shape[0])`` samples, with ``max_samples`` in the interval (0, 1). Defaults to None.
415
-
416
-
417
- Example:
418
- >>> data = GenotypeData(
419
- >>> filename="test.str",
420
- >>> filetype="auto",
421
- >>> guidetree="test.tre",
422
- >>> qmatrix_iqtree="test.iqtree"
423
- >>> )
424
- >>>
425
- >>> # Genetic Algorithm grid_params
426
- >>> grid_params = {
427
- >>> "min_samples_leaf": Integer(1, 10),
428
- >>> "max_depth": Integer(2, 110),
429
- >>> }
430
- >>>
431
- >>> rf = ImputeRandomForest(
432
- >>> genotype_data=data,
433
- >>> gridparams=grid_params,
434
- >>> cv=5,
435
- >>> gridsearch_method="genetic_algorithm",
436
- >>> n_nearest_features=10,
437
- >>> n_estimators=100,
438
- >>> initial_strategy="phylogeny",
439
- >>> )
440
- >>>
441
- >>> rf_gtdata = rf.imputed
442
- """
443
-
444
- def __init__(
445
- self,
446
- genotype_data: Any,
447
- *,
448
- extratrees: bool = True,
449
- n_estimators: int = 100,
450
- criterion: str = "gini",
451
- max_depth: Optional[int] = None,
452
- min_samples_split: Union[int, float] = 2,
453
- min_samples_leaf: Union[int, float] = 1,
454
- min_weight_fraction_leaf: float = 0.0,
455
- max_features: Optional[Union[str, int, float]] = "sqrt",
456
- max_leaf_nodes: Optional[int] = None,
457
- min_impurity_decrease: float = 0.0,
458
- bootstrap: bool = False,
459
- oob_score: bool = False,
460
- max_samples: Optional[Union[int, float]] = None,
461
- **kwargs,
462
- ) -> None:
463
- all_kwargs = locals()
464
- kwargs.update(all_kwargs)
465
- kwargs.pop("self")
466
- kwargs.pop("genotype_data")
467
-
468
- self.extratrees = kwargs.pop("extratrees")
469
-
470
- if self.extratrees:
471
- self.clf = ExtraTreesClassifier
472
-
473
- elif os.environ["intelex"] == "True" and not self.extratrees:
474
- self.clf = RandomForestClassifier
475
-
476
- if kwargs["criterion"] != "gini":
477
- raise ValueError(
478
- "criterion must be set to 'gini' if using the RandomForestClassifier with scikit-learn-intelex"
479
- )
480
- if kwargs["oob_score"]:
481
- raise ValueError(
482
- "oob_score must be set to False if using the "
483
- "RandomForestClassifier with scikit-learn-intelex"
484
- )
485
- else:
486
- self.clf = RandomForestClassifier
487
-
488
- self.clf_type = "classifier"
489
-
490
- if "self" in kwargs:
491
- kwargs.pop("self")
492
- if "genotype_data" in kwargs:
493
- kwargs.pop("genotype_data")
494
- if "early_stop_gen" in kwargs:
495
- kwargs.pop("early_stop_gen")
496
-
497
- super().__init__(genotype_data, self.clf, self.clf_type, **kwargs)
498
-
499
-
500
- class ImputeXGBoost(SupervisedImputer):
501
- """Does XGBoost (Extreme Gradient Boosting) Iterative imputation of missing data. Iterative imputation uses the n_nearest_features to inform the imputation at each feature (i.e., SNP site), using the N most correlated features per site. The N most correlated features are drawn with probability proportional to correlation for each imputed target feature to ensure coverage of features throughout the imputation process.
502
-
503
- Args:
504
- genotype_data (GenotypeData object): GenotypeData instance that was used to read in the sequence data.
505
-
506
- n_estimators (int, optional): The number of boosting rounds. Increasing this value can improve the fit, but at the cost of compute time and RAM usage. Defaults to 100.
507
-
508
- max_depth (int, optional): Maximum tree depth for base learners. Defaults to 3.
509
-
510
- learning_rate (float, optional): Boosting learning rate (eta). Basically, it serves as a weighting factor for correcting new trees when they are added to the model. Typical values are between 0.1 and 0.3. Lower learning rates generally find the best optimum at the cost of requiring far more compute time and resources. Defaults to 0.1.
511
-
512
- booster (str, optional): Specify which booster to use. Possible values include "gbtree", "gblinear", and "dart". Defaults to "gbtree".
513
-
514
- gamma (float, optional): Minimum loss reduction required to make a further partition on a leaf node of the tree. Defaults to 0.0.
515
-
516
- min_child_weight (float, optional): Minimum sum of instance weight(hessian) needed in a child. Defaults to 1.0.
517
-
518
- max_delta_step (float, optional): Maximum delta step we allow each tree's weight estimation to be. Defaults to 0.0.
519
-
520
- subsample (float, optional): Subsample ratio of the training instance. Defaults to 1.0.
521
-
522
- colsample_bytree (float, optional): Subsample ratio of columns when constructing each tree. Defaults to 1.0.
523
-
524
- reg_lambda (float, optional): L2 regularization term on weights (xgb's lambda parameter). Defaults to 1.0.
525
-
526
- reg_alpha (float, optional): L1 regularization term on weights (xgb's alpha parameter). Defaults to 0.0.
527
-
528
- Example:
529
- >>> data = GenotypeData(
530
- >>> filename="test.str",
531
- >>> filetype="auto",
532
- >>> guidetree="test.tre",
533
- >>> qmatrix_iqtree="test.iqtree"
534
- >>> )
535
- >>>
536
- >>> # Genetic Algorithm grid_params
537
- >>> grid_params = {
538
- >>> "learning_rate": Continuous(lower=0.01, upper=0.1),
539
- >>> "max_depth": Integer(2, 110),
540
- >>> }
541
- >>>
542
- >>> xgb = ImputeXGBoost(
543
- >>> genotype_data=data,
544
- >>> gridparams=grid_params,
545
- >>> cv=5,
546
- >>> gridsearch_method="genetic_algorithm",
547
- >>> n_nearest_features=10,
548
- >>> n_estimators=100,
549
- >>> initial_strategy="phylogeny",
550
- >>> )
551
- >>>
552
- >>> xgb_gtdata = xgb.imputed
553
- """
554
-
555
- def __init__(
556
- self,
557
- genotype_data: Any,
558
- *,
559
- n_estimators: int = 100,
560
- max_depth: int = 3,
561
- learning_rate: float = 0.1,
562
- booster: str = "gbtree",
563
- gamma: float = 0.0,
564
- min_child_weight: float = 1.0,
565
- max_delta_step: float = 0.0,
566
- subsample: float = 1.0,
567
- colsample_bytree: float = 1.0,
568
- reg_lambda: float = 1.0,
569
- reg_alpha: float = 0.0,
570
- **kwargs,
571
- ) -> None:
572
- # Get local variables into dictionary object
573
- all_kwargs = locals()
574
- kwargs.update(all_kwargs)
575
-
576
- self.clf_type = "classifier"
577
- self.clf = xgb.XGBClassifier
578
- kwargs["verbosity"] = int(kwargs.get("verbose", False))
579
-
580
- if "self" in kwargs:
581
- kwargs.pop("self")
582
- if "genotype_data" in kwargs:
583
- kwargs.pop("genotype_data")
584
-
585
- super().__init__(genotype_data, self.clf, self.clf_type, **kwargs)
586
-
587
-
588
- class ImputeVAE(UnsupervisedImputer):
589
- """Class to impute missing data using a Variational Autoencoder neural network model. For training, missing values are simulated and the model is trained on the simulated missing values. The real missing values are then predicted by the trained model. The strategy for simulating missing values can be set with the ``sim_strategy`` argument.
590
-
591
- Args:
592
- genotype_data (GenotypeData object): Input data initialized as GenotypeData object. Required positional argument.
593
-
594
- kl_beta (float, optional): Weight to apply to the Kullback-Leibler (KL) divergence loss. If the latent distribution is not learned well, this weight can be changed to control how much the KL divergence affects the total loss. Should be in the range [0, 1]. If set to 1.0, the KL loss is unweighted. If set to 0.0, the KL loss is disabled entirely and does not affect the total loss. Defaults to 1.0.
595
-
596
- Example:
597
- >>> data = GenotypeData(
598
- >>> filename="test.str",
599
- >>> filetype="auto",
600
- >>> guidetree="test.tre",
601
- >>> qmatrix_iqtree="test.iqtree"
602
- >>> )
603
- >>>
604
- >>> vae = ImputeVAE(
605
- >>> genotype_data=data,
606
- >>> learning_rate=0.001,
607
- >>> epochs=200,
608
- >>> )
609
- >>>
610
- >>> vae_gtdata = vae.imputed
611
-
612
- """
613
-
614
- def __init__(
615
- self,
616
- genotype_data,
617
- kl_beta=1.0,
618
- **kwargs,
619
- ):
620
- kwargs["kl_beta"] = kl_beta
621
- super().__init__(genotype_data, VAE, "classifier", **kwargs)
622
-
623
-
624
- class ImputeStandardAutoEncoder(UnsupervisedImputer):
625
- """Class to impute missing data using a standard Autoencoder (SAE) neural network model. For training, missing values are simulated and the model is trained on the simulated missing values. The real missing values are then predicted by the trained model. The strategy for simulating missing values can be set with the ``sim_strategy`` argument.
626
-
627
- Args:
628
- genotype_data (GenotypeData object): Input data initialized as GenotypeData object. Required positional argument.
629
-
630
- Example:
631
- >>> data = GenotypeData(
632
- >>> filename="test.str",
633
- >>> filetype="auto",
634
- >>> guidetree="test.tre",
635
- >>> qmatrix_iqtree="test.iqtree"
636
- >>> )
637
- >>>
638
- >>> sae = ImputeStandardAutoEncoder(
639
- >>> genotype_data=data,
640
- >>> learning_rate=0.001,
641
- >>> n_components=5,
642
- >>> epochs=200,
643
- >>> )
644
- >>>
645
- >>> # Get the imputed data.
646
- >>> sae_gtdata = sae.imputed
647
- """
648
-
649
- def __init__(
650
- self,
651
- genotype_data,
652
- **kwargs,
653
- ):
654
- # Get local variables into dictionary object
655
- self.clf = SAE
656
- self.clf_type = "classifier"
657
-
658
- super().__init__(genotype_data, self.clf, self.clf_type, **kwargs)
659
-
660
-
661
- class ImputeUBP(UnsupervisedImputer):
662
- """Class to impute missing data using an unsupervised backpropagation (UBP) neural network model. For training, missing values are simulated and the model is trained on the simulated missing values. The real missing values are then predicted by the trained model. The strategy for simulating missing values can be set with the ``sim_strategy`` argument.
663
-
664
- UBP [1]_ is an extension of NLPCA with the input being randomly generated and of reduced dimensionality that gets trained to predict the supplied output based on only known values. It then uses the trained model to predict missing values. However, in contrast to NLPCA, UBP trains the model over three phases. The first is a single layer perceptron used to refine the randomly generated input. The second phase is a multi-layer perceptron that uses the refined reduced-dimension data from the first phase as input. In the second phase, the model weights are refined but not the input. In the third phase, the model weights and the inputs are then refined.
665
-
666
- Args:
667
- genotype_data (GenotypeData object): Input data initialized as GenotypeData object. Required positional argument.
668
-
669
- Example:
670
- >>> data = GenotypeData(
671
- >>> filename="test.str",
672
- >>> filetype="auto",
673
- >>> guidetree="test.tre",
674
- >>> qmatrix_iqtree="test.iqtree"
675
- >>> )
676
- >>>
677
- >>> ubp = ImputeUBP(
678
- >>> genotype_data=data,
679
- >>> learning_rate=0.001,
680
- >>> n_components=5
681
- >>> )
682
- >>>
683
- >>> # Get the imputed data.
684
- >>> ubp_gtdata = ubp.imputed
685
-
686
- References:
687
- .. [1] Gashler, M. S., Smith, M. R., Morris, R., & Martinez, T. (2016). Missing value imputation with unsupervised backpropagation. Computational Intelligence, 32(2), 196-215.
688
- """
689
-
690
- def __init__(
691
- self,
692
- genotype_data,
693
- **kwargs,
694
- ):
695
- self.nlpca = kwargs.get("nlpca", False)
696
- self.clf = UBP
697
- self.clf.__name__ = "NLPCA" if self.nlpca else "UBP"
698
- kwargs["nlpca"] = self.nlpca
699
- self.clf_type = "classifier"
700
-
701
- super().__init__(genotype_data, self.clf, self.clf_type, **kwargs)
702
-
703
-
704
- class ImputeNLPCA(ImputeUBP):
705
- """Class to impute missing data using inverse non-linear principal component analysis (NLPCA) neural network models. For training, missing values are simulated and the model is trained on the simulated missing values. The real missing values are then predicted by the trained model. The strategy for simulating missing values can be set with the ``sim_strategy`` argument.
706
-
707
- NLPCA [2]_ trains randomly generated, reduced-dimensionality input to predict the correct output. In the case of imputation, the model is trained only on known values, and the trained model is then used to predict the missing values.
708
-
709
- Args:
710
- genotype_data (GenotypeData object): Input data initialized as GenotypeData object. Required positional argument.
711
-
712
- Example:
713
- >>> data = GenotypeData(
714
- >>> filename="test.str",
715
- >>> filetype="auto",
716
- >>> guidetree="test.tre",
717
- >>> qmatrix_iqtree="test.iqtree"
718
- >>> )
719
- >>>
720
- >>> nlpca = ImputeNLPCA(
721
- >>> genotype_data=data,
722
- >>> learning_rate=0.001,
723
- >>> epochs=200
724
- >>> )
725
- >>>
726
- >>> nlpca_gtdata = nlpca.imputed
727
-
728
- References:
729
-
730
- .. [2] Scholz, M., Kaplan, F., Guy, C. L., Kopka, J., & Selbig, J. (2005). Non-linear PCA: a missing data approach. Bioinformatics, 21(20), 3887-3895.
731
- """
732
-
733
- def __init__(self, *args, **kwargs):
734
- kwargs["nlpca"] = True
735
- super().__init__(*args, **kwargs)