pg-sui 0.2.0__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/METADATA +101 -79
- pg_sui-1.6.14.dev9.dist-info/RECORD +81 -0
- {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/WHEEL +1 -1
- pg_sui-1.6.14.dev9.dist-info/entry_points.txt +4 -0
- {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info/licenses}/LICENSE +0 -0
- pg_sui-1.6.14.dev9.dist-info/top_level.txt +1 -0
- pgsui/__init__.py +35 -54
- pgsui/_version.py +34 -0
- pgsui/cli.py +909 -0
- pgsui/data_processing/__init__.py +0 -0
- pgsui/data_processing/config.py +565 -0
- pgsui/data_processing/containers.py +1424 -0
- pgsui/data_processing/transformers.py +557 -907
- pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
- pgsui/electron/app/__main__.py +5 -0
- pgsui/electron/app/extra-resources/.gitkeep +1 -0
- pgsui/electron/app/icons/icons/1024x1024.png +0 -0
- pgsui/electron/app/icons/icons/128x128.png +0 -0
- pgsui/electron/app/icons/icons/16x16.png +0 -0
- pgsui/electron/app/icons/icons/24x24.png +0 -0
- pgsui/electron/app/icons/icons/256x256.png +0 -0
- pgsui/electron/app/icons/icons/32x32.png +0 -0
- pgsui/electron/app/icons/icons/48x48.png +0 -0
- pgsui/electron/app/icons/icons/512x512.png +0 -0
- pgsui/electron/app/icons/icons/64x64.png +0 -0
- pgsui/electron/app/icons/icons/icon.icns +0 -0
- pgsui/electron/app/icons/icons/icon.ico +0 -0
- pgsui/electron/app/main.js +227 -0
- pgsui/electron/app/package-lock.json +6894 -0
- pgsui/electron/app/package.json +51 -0
- pgsui/electron/app/preload.js +15 -0
- pgsui/electron/app/server.py +157 -0
- pgsui/electron/app/ui/logo.png +0 -0
- pgsui/electron/app/ui/renderer.js +131 -0
- pgsui/electron/app/ui/styles.css +59 -0
- pgsui/electron/app/ui/ui_shim.js +72 -0
- pgsui/electron/bootstrap.py +43 -0
- pgsui/electron/launch.py +57 -0
- pgsui/electron/package.json +14 -0
- pgsui/example_data/__init__.py +0 -0
- pgsui/example_data/phylip_files/__init__.py +0 -0
- pgsui/example_data/phylip_files/test.phy +0 -0
- pgsui/example_data/popmaps/__init__.py +0 -0
- pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
- pgsui/example_data/structure_files/__init__.py +0 -0
- pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
- pgsui/impute/__init__.py +0 -0
- pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
- pgsui/impute/deterministic/imputers/mode.py +844 -0
- pgsui/impute/deterministic/imputers/nmf.py +221 -0
- pgsui/impute/deterministic/imputers/phylo.py +973 -0
- pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
- pgsui/impute/supervised/__init__.py +0 -0
- pgsui/impute/supervised/base.py +343 -0
- pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
- pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
- pgsui/impute/supervised/imputers/random_forest.py +291 -0
- pgsui/impute/unsupervised/__init__.py +0 -0
- pgsui/impute/unsupervised/base.py +1118 -0
- pgsui/impute/unsupervised/callbacks.py +92 -262
- {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
- pgsui/impute/unsupervised/imputers/autoencoder.py +1285 -0
- pgsui/impute/unsupervised/imputers/nlpca.py +1554 -0
- pgsui/impute/unsupervised/imputers/ubp.py +1575 -0
- pgsui/impute/unsupervised/imputers/vae.py +1228 -0
- pgsui/impute/unsupervised/loss_functions.py +261 -0
- pgsui/impute/unsupervised/models/__init__.py +0 -0
- pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
- pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
- pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
- pgsui/impute/unsupervised/models/vae_model.py +269 -630
- pgsui/impute/unsupervised/nn_scorers.py +255 -0
- pgsui/utils/__init__.py +0 -0
- pgsui/utils/classification_viz.py +608 -0
- pgsui/utils/logging_utils.py +22 -0
- pgsui/utils/misc.py +35 -480
- pgsui/utils/plotting.py +996 -829
- pgsui/utils/pretty_metrics.py +290 -0
- pgsui/utils/scorers.py +213 -666
- pg_sui-0.2.0.dist-info/RECORD +0 -75
- pg_sui-0.2.0.dist-info/top_level.txt +0 -3
- pgsui/example_data/phylip_files/test_n10.phy +0 -118
- pgsui/example_data/phylip_files/test_n100.phy +0 -118
- pgsui/example_data/phylip_files/test_n2.phy +0 -118
- pgsui/example_data/phylip_files/test_n500.phy +0 -118
- pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
- pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
- pgsui/example_data/trees/test.iqtree +0 -376
- pgsui/example_data/trees/test.qmat +0 -5
- pgsui/example_data/trees/test.rate +0 -2033
- pgsui/example_data/trees/test.tre +0 -1
- pgsui/example_data/trees/test_n10.rate +0 -19
- pgsui/example_data/trees/test_n100.rate +0 -109
- pgsui/example_data/trees/test_n500.rate +0 -509
- pgsui/example_data/trees/test_siterates.txt +0 -2024
- pgsui/example_data/trees/test_siterates_n10.txt +0 -10
- pgsui/example_data/trees/test_siterates_n100.txt +0 -100
- pgsui/example_data/trees/test_siterates_n500.txt +0 -500
- pgsui/example_data/vcf_files/test.vcf +0 -244
- pgsui/example_data/vcf_files/test.vcf.gz +0 -0
- pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
- pgsui/impute/estimators.py +0 -1268
- pgsui/impute/impute.py +0 -1463
- pgsui/impute/simple_imputers.py +0 -1431
- pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
- pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
- pgsui/impute/unsupervised/keras_classifiers.py +0 -697
- pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
- pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
- pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
- pgsui/pg_sui.py +0 -261
- pgsui/utils/sequence_tools.py +0 -407
- simulation/sim_benchmarks.py +0 -333
- simulation/sim_treeparams.py +0 -475
- test/__init__.py +0 -0
- test/pg_sui_simtest.py +0 -215
- test/pg_sui_testing.py +0 -523
- test/test.py +0 -151
- test/test_pgsui.py +0 -374
- test/test_tkc.py +0 -185
pgsui/pg_sui.py
DELETED
|
@@ -1,261 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
|
|
3
|
-
# Standard library imports
|
|
4
|
-
import argparse
|
|
5
|
-
import sys
|
|
6
|
-
|
|
7
|
-
import numpy as np
|
|
8
|
-
import pandas as pd
|
|
9
|
-
|
|
10
|
-
from sklearn_genetic.space import Continuous, Categorical, Integer
|
|
11
|
-
|
|
12
|
-
from snpio import GenotypeData
|
|
13
|
-
from snpio import Plotting
|
|
14
|
-
from pgsui import *
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def main():
    """Load genotype data, run the XGBoost imputer, and plot PCAs.

    Steps, in order:
      1. Parse the command line with ``get_arguments()``.
      2. Build a ``GenotypeData`` object from either a STRUCTURE file
         (``--str``) or a PHYLIP file (``--phylip``).
      3. Call ``missingness_reports`` on the loaded data.
      4. Run ``ImputeXGBoost`` on the data.
      5. Call ``Plotting.run_pca`` on the original and on the imputed data.

    Raises:
        SystemExit: If both or neither of ``--str`` / ``--phylip`` are given.
        TypeError: If the population information required for the chosen
            file type is missing.
    """
    args = get_arguments()

    if args.str and args.phylip:
        sys.exit("Error: Only one file type can be specified")

    # Without an input file ``data`` would never be bound and the first use
    # below would fail with a NameError, so stop with a clear message instead.
    if not args.str and not args.phylip:
        sys.exit("Error: An input file must be specified with --str or --phylip")

    if args.str:
        # STRUCTURE input.
        if not args.pop_ids and args.popmap is None:
            raise TypeError("Either --pop_ids or --popmap must be specified\n")

        if args.pop_ids:
            print("\n--pop_ids was specified as column 2\n")
        else:
            print(
                "\n--pop_ids was not specified; "
                "using popmap file to get population IDs\n"
            )

        if args.onerow_perind:
            print("\nUsing one row per individual...\n")
            filetype = "structure1row"
        else:
            print("\nUsing two rows per individual...\n")
            filetype = "structure2row"

        data = GenotypeData(
            filename=args.str,
            filetype=filetype,
            popmapfile=args.popmap,
            guidetree=args.treefile,
            qmatrix_iqtree=args.iqtree,
            prefix=args.prefix,
        )
    else:
        # PHYLIP input (the only remaining possibility after the checks above).
        if args.pop_ids or args.onerow_perind:
            print(
                "\nPhylip file was used with structure arguments; ignoring "
                "structure file arguments\n"
            )

        if args.popmap is None:
            raise TypeError("No popmap file supplied with PHYLIP file\n")

        data = GenotypeData(
            filename=args.phylip,
            filetype="phylip",
            popmapfile=args.popmap,
            guidetree=args.treefile,
            qmatrix_iqtree=args.iqtree,
            siterates_iqtree=args.site_rate,
            prefix=args.prefix,
        )

    data.missingness_reports(prefix=args.prefix, plot_format="png")

    imp = ImputeXGBoost(
        data,
        max_iter=3,
        gridparams={"n_estimators": [100, 200]},
        n_nearest_features=5,
    )

    gd_imp = imp.imputed

    # PCA of the data as loaded, then of the imputed data; the second call
    # uses a distinct prefix for its output.
    Plotting.run_pca(
        data,
        plot_format="png",
        center=True,
        scale=False,
        prefix=args.prefix,
    )

    Plotting.run_pca(
        gd_imp,
        plot_format="png",
        center=True,
        scale=False,
        prefix=args.prefix + "_imputed",
    )
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
def get_arguments():
    """Build the command-line parser and parse ``sys.argv``.

    If the program was started without any options, a notice is printed, the
    help menu is written to stderr, and the process exits with status 1.

    Returns:
        argparse.Namespace: Parsed options, read as attributes (``str``,
            ``phylip``, ``treefile``, ``iqtree``, ``site_rate``,
            ``onerow_perind``, ``pop_ids``, ``popmap``, ``prefix``).
    """
    cli = argparse.ArgumentParser(
        description="Machine learning missing data imputation and species delimitation",
        add_help=False,
    )

    # Group creation order is kept exactly as before; the first group is
    # created but receives no options.
    cli.add_argument_group("Required arguments")
    input_group = cli.add_argument_group("File type arguments (choose only one)")
    structure_group = cli.add_argument_group("Structure file arguments")
    extras_group = cli.add_argument_group("Optional arguments")

    # (group, flags, add_argument keyword arguments), registered in order.
    option_table = (
        (
            input_group,
            ("-s", "--str"),
            {"type": str, "required": False, "help": "Input structure file"},
        ),
        (
            input_group,
            ("-p", "--phylip"),
            {"type": str, "required": False, "help": "Input phylip file"},
        ),
        (
            input_group,
            ("-t", "--treefile"),
            {
                "type": str,
                "required": False,
                "default": None,
                "help": "Newick-formatted treefile",
            },
        ),
        (
            input_group,
            ("-i", "--iqtree"),
            {
                "type": str,
                "required": False,
                "help": ".iqtree output file containing Rate Matrix Q",
            },
        ),
        (
            input_group,
            ("--site_rate",),
            {
                "type": str,
                "required": False,
                "help": "Specify site rate input file.",
            },
        ),
        (
            structure_group,
            ("--onerow_perind",),
            {
                "default": False,
                "action": "store_true",
                "help": "Toggles on one row per individual option in structure file",
            },
        ),
        (
            structure_group,
            ("--pop_ids",),
            {
                "default": False,
                "required": False,
                "action": "store_true",
                "help": "Toggles on population ID column (2nd col) in structure file",
            },
        ),
        (
            extras_group,
            ("-m", "--popmap"),
            {
                "type": str,
                "required": False,
                "default": None,
                "help": "Two-column tab-separated population map file: inds\tpops. No header line",
            },
        ),
        (
            extras_group,
            ("--prefix",),
            {
                "type": str,
                "required": False,
                "default": "imputer",
                "help": "Prefix for output directory. Output directory will be '<prefix>_output'",
            },
        ),
        # The automatic help option is disabled above, so add it by hand.
        (
            extras_group,
            ("-h", "--help"),
            {"action": "help", "help": "Displays this help menu"},
        ),
    )
    for group, flags, settings in option_table:
        group.add_argument(*flags, **settings)

    # A bare invocation (program name only) shows the help menu and stops.
    if len(sys.argv) == 1:
        print("\nExiting because no command-line options were called.\n")
        cli.print_help(sys.stderr)
        sys.exit(1)

    return cli.parse_args()
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
if __name__ == "__main__":
|
|
261
|
-
main()
|
pgsui/utils/sequence_tools.py
DELETED
|
@@ -1,407 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
import sys
|
|
3
|
-
from itertools import product
|
|
4
|
-
from collections import Counter
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
# Allele codes that stand for missing data once a genotype has been expanded:
# -9 is produced by expand012(); "N"/"n" are produced by get_iupac_caseless().
_MISSING_ALLELES = (-9, "N", "n")


def blacklist_missing(loci, threshold, iupac=False):
    """Return indices of loci whose proportion of missing alleles is too high.

    Args:
        loci (Iterable): One iterable of genotypes per locus.
        threshold (float): A locus is blacklisted when its proportion of
            missing alleles is strictly greater than this value.
        iupac (bool, optional): If True, genotypes are expanded as IUPAC
            characters; otherwise as 0/1/2 codes. Defaults to False.

    Returns:
        List[int]: Indices (positions in ``loci``) of blacklisted loci.
    """
    blacklist = []
    for idx, locus in enumerate(loci):
        counts = Counter(expandLoci(locus, iupac=iupac))
        total = sum(counts.values())
        n_missing = sum(counts[code] for code in _MISSING_ALLELES)
        # A locus without any genotypes has no data at all, so it is treated
        # as entirely missing instead of dividing by zero.
        prop_missing = n_missing / total if total else 1.0
        if prop_missing > threshold:
            blacklist.append(idx)
    return blacklist


def blacklist_maf(loci, threshold, iupac=False):
    """Return indices of loci that fail a minor-allele-frequency filter.

    Missing alleles are excluded before counting, so they can neither be
    mistaken for the minor allele nor dilute the frequency. A locus with at
    most one distinct allele is always blacklisted. Otherwise the count of the
    second most common allele, divided by the number of non-missing alleles,
    is compared with ``threshold``.

    Args:
        loci (Iterable): One iterable of genotypes per locus.
        threshold (float): A locus is blacklisted when its minor-allele
            frequency is strictly less than this value.
        iupac (bool, optional): If True, genotypes are expanded as IUPAC
            characters; otherwise as 0/1/2 codes. Defaults to False.

    Returns:
        List[int]: Indices (positions in ``loci``) of blacklisted loci.
    """
    blacklist = []
    for idx, locus in enumerate(loci):
        counts = Counter(
            allele
            for allele in expandLoci(locus, iupac=iupac)
            if allele not in _MISSING_ALLELES
        )
        if len(counts) <= 1:
            blacklist.append(idx)
            continue
        minor_count = counts.most_common(2)[1][1]
        if minor_count / sum(counts.values()) < threshold:
            blacklist.append(idx)
    return blacklist


def expandLoci(loc, iupac=False):
    """Expand the genotypes of one locus into a flat list of alleles.

    Args:
        loc (Iterable): Genotypes for a single locus.
        iupac (bool, optional): If True, each genotype is expanded with
            ``get_iupac_caseless``; otherwise with ``expand012``.
            Defaults to False.

    Returns:
        list: Two alleles per genotype, in input order.
    """
    expand = get_iupac_caseless if iupac else expand012
    alleles = []
    for genotype in loc:
        alleles.extend(expand(genotype))
    return alleles


def expand012(geno):
    """Expand one 0/1/2-encoded genotype into its two alleles.

    The genotype is compared by its string form, so ``0`` and ``"0"`` are
    equivalent. Anything other than 0, 1 or 2 is returned as ``[-9, -9]``.

    Args:
        geno: Genotype code.

    Returns:
        List[int]: ``[0, 0]``, ``[0, 1]``, ``[1, 1]`` or ``[-9, -9]``.
    """
    g = str(geno)
    if g == "0":
        return [0, 0]
    elif g == "1":
        return [0, 1]
    elif g == "2":
        return [1, 1]
    else:
        return [-9, -9]
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
def remove_items(all_list, bad_list):
    """Return the elements of ``all_list`` that do not occur in ``bad_list``.

    Order and duplicates of the kept elements are preserved.
    """
    kept = []
    for item in all_list:
        if item in bad_list:
            continue
        kept.append(item)
    return kept
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
def count_alleles(l, vcf=False):
    """Count the distinct non-missing alleles in a list of genotypes.

    Args:
        l (List[str]): Genotypes, either "/"-separated allele pairs
            (e.g. ``0/1``) or single IUPAC characters.
        vcf (bool, optional): If True, split each genotype on "/";
            otherwise expand it with ``get_iupac_caseless``.
            Defaults to False.

    Returns:
        int: Number of unique alleles left after dropping the missing codes
            ``"-9"``, ``"-"``, ``"N"`` and ``-9``.
    """
    missing_codes = ["-9", "-", "N", -9]
    observed = set()
    for genotype in l:
        pair = genotype.split("/") if vcf else get_iupac_caseless(genotype)
        observed.update(allele for allele in pair if allele not in missing_codes)
    return len(observed)
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
def get_major_allele(l, num=None, vcf=False):
    """Return the most common alleles in a list of genotypes.

    Missing or ambiguous alleles are filtered out *before* the result is
    truncated to ``num``, so a frequent missing code cannot crowd a real
    allele out of the result.

    Args:
        l (List[str]): Genotypes, either "/"-separated allele pairs
            (e.g. ``0/1``) or single IUPAC characters.
        num (int, optional): Maximum number of alleles to return. ``None``
            returns every retained allele. Defaults to None.
        vcf (bool, optional): If True, split each genotype on "/" and drop
            the allele ``"-9"``; otherwise expand with
            ``get_iupac_caseless`` and keep only ``A``, ``T``, ``G``, ``C``.
            Defaults to False.

    Returns:
        list: Retained alleles ordered from most to least common.
    """
    alleles = []
    for genotype in l:
        alleles.extend(genotype.split("/") if vcf else get_iupac_caseless(genotype))

    valid_bases = ("A", "T", "G", "C")
    ranked = [
        allele
        for allele, _count in Counter(alleles).most_common()
        if (allele != "-9" if vcf else allele in valid_bases)
    ]

    if num is None:
        return ranked
    # A negative ``num`` yields an empty list, as Counter.most_common does.
    return ranked[: max(num, 0)]
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
def get_iupac_caseless(char):
    """Expand an IUPAC character into its two alleles, keeping its case.

    ``N``, ``-`` and the three-base codes ``B``, ``D``, ``H``, ``V`` all
    expand to ``N``/``N``.

    Args:
        char (str): Character to expand.

    Returns:
        List[str]: The two alleles, lower-cased when ``char`` was lower case.

    Raises:
        KeyError: If ``char`` is not one of the codes in the table.
    """
    was_lower = char.islower()
    key = char.upper() if was_lower else char
    # Each value spells out the two alleles of the diploid genotype.
    diploid = {
        "A": "AA",
        "G": "GG",
        "C": "CC",
        "T": "TT",
        "N": "NN",
        "-": "NN",
        "R": "AG",
        "Y": "CT",
        "S": "GC",
        "W": "AT",
        "K": "GT",
        "M": "AC",
        "B": "NN",
        "D": "NN",
        "H": "NN",
        "V": "NN",
    }
    pair = list(diploid[key])
    if was_lower:
        return [base.lower() for base in pair]
    return pair
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
def get_iupac_full(char):
    """Expand an IUPAC character into every base it can represent.

    The lookup is case-insensitive and the result is always upper case.
    ``N`` and ``-`` expand to all four bases.

    Args:
        char (str): Character to expand.

    Returns:
        List[str]: The bases represented by ``char``.

    Raises:
        KeyError: If ``char`` is not one of the codes in the table.
    """
    expansions = {
        "A": "A",
        "G": "G",
        "C": "C",
        "T": "T",
        "N": "ACTG",
        "-": "ACTG",
        "R": "AG",
        "Y": "CT",
        "S": "GC",
        "W": "AT",
        "K": "GT",
        "M": "AC",
        "B": "CGT",
        "D": "AGT",
        "H": "ACT",
        "V": "ACG",
    }
    return list(expansions[char.upper()])
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
def expandAmbiquousDNA(sequence):
    """Yield every sequence obtainable by resolving ambiguity codes.

    Each position is expanded with ``get_iupac_caseless``. That helper returns
    a diploid pair, which repeats the same base for unambiguous characters
    (e.g. ``["A", "A"]``); the candidates at each position are therefore
    de-duplicated, in order, so that every resolved sequence is produced once
    instead of once per redundant combination.

    Args:
        sequence (Iterable[str]): Characters of the sequence to expand.

    Yields:
        str: One resolved sequence per combination of candidate bases.
    """
    per_site = [
        list(dict.fromkeys(get_iupac_caseless(base))) for base in sequence
    ]
    for combo in product(*per_site):
        yield "".join(combo)
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
def get_revComp_caseless(char):
    """Return the complement of one IUPAC character, keeping its case.

    Args:
        char (str): Character to complement.

    Returns:
        str: Complementary character, lower-cased when ``char`` was lower case.

    Raises:
        KeyError: If ``char`` is not one of the codes in the table.
    """
    was_lower = char.islower()
    key = char.upper() if was_lower else char
    # Position i of the first string is complemented by position i of the second.
    complement = dict(zip("AGCTN-RYSWKMBDHV", "TCGAN-YRSWMKVHDB"))
    partner = complement[key]
    return partner.lower() if was_lower else partner
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
def reverseComplement(seq):
    """Return the reverse complement of ``seq`` with case preserved.

    Every character is complemented with ``get_revComp_caseless`` in input
    order, then the complemented characters are joined in reverse.
    """
    complemented = list(map(get_revComp_caseless, seq))
    complemented.reverse()
    return "".join(complemented)
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
def simplifySeq(seq):
    """Reduce a sequence to its non-ACGT characters.

    The sequence is upper-cased, ``A``/``C``/``G``/``T`` are removed, and each
    of the ambiguity codes ``RYSWKMBDHV`` is replaced by ``*``. All other
    characters are kept as they are.
    """
    # The third maketrans argument lists the characters to delete.
    table = str.maketrans("RYSWKMBDHV", "*" * 10, "ACGT")
    return seq.upper().translate(table)
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
def seqCounter(seq):
    """Count the occurrences of each upper-case IUPAC character in ``seq``.

    Characters that are not keys of the table (including lower-case letters)
    are ignored. An extra ``"VAR"`` entry holds the total of the ambiguity
    codes ``R Y S W K M B D H V``.

    Returns:
        dict: Character -> count, plus the ``"VAR"`` total.
    """
    ambiguity_codes = "RYSWKMBDHV"
    tally = dict.fromkeys("AN-CGT" + ambiguity_codes, 0)
    for symbol in seq:
        if symbol in tally:
            tally[symbol] += 1
    tally["VAR"] = sum(tally[code] for code in ambiguity_codes)
    return tally
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
def getFlankCounts(ref, x, y, dist):
    """Count variable sites, gaps and Ns in the flanks of ``ref[x:y]``.

    Up to ``dist`` characters on each side of the substring are taken,
    clipped to the bounds of ``ref``, concatenated (left flank first), reduced
    with ``simplifySeq`` and tallied with ``seqCounterSimple``.

    Returns:
        dict: The counts returned by ``seqCounterSimple``.
    """
    left_start = max(x - dist, 0)
    right_end = min(y + dist, len(ref))
    left_flank = ref[left_start:x]
    right_flank = ref[y:right_end]
    return seqCounterSimple(simplifySeq(left_flank + right_flank))
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
def seqCounterSimple(seq):
    """Count ``N``, ``-`` and ``*`` characters in a simplified sequence.

    Any other character is ignored.

    Returns:
        dict: Counts under the keys ``"N"``, ``"-"`` and ``"*"``.
    """
    tally = dict.fromkeys(("N", "-", "*"), 0)
    for symbol in seq:
        if symbol in tally:
            tally[symbol] += 1
    return tally
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
def gc_counts(string):
    """Return the number of G/C characters (either case) in ``string``."""
    # Every G/C becomes "#", so counting "#" gives the total.
    marked = re.sub("[GCgc]", "#", string)
    return marked.count("#")
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
def mask_counts(string):
    """Return the number of lower-case (masked) characters in ``string``."""
    masked = [ch for ch in string if ch.islower()]
    return len(masked)
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
def gc_content(string):
    """Return the proportion of G/C characters (either case) in ``string``.

    Args:
        string (str): Sequence to inspect.

    Returns:
        float: Number of G/C characters divided by the length of ``string``;
            ``0.0`` for an empty string (avoids a division by zero).
    """
    if not string:
        return 0.0
    # Every G/C becomes "#", so counting "#" gives the G/C total.
    marked = re.sub("[GCgc]", "#", string)
    return marked.count("#") / len(string)
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
def mask_content(string):
    """Return the proportion of lower-case (masked) characters in ``string``.

    Args:
        string (str): Sequence to inspect.

    Returns:
        float: Number of lower-case characters divided by the length of
            ``string``; ``0.0`` for an empty string (avoids a division by
            zero).
    """
    if not string:
        return 0.0
    n_masked = sum(1 for ch in string if ch.islower())
    return n_masked / len(string)
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
def seqSlidingWindowString(seq, shift, width):
    """Yield successive windows of ``seq`` as slices.

    Windows start every ``shift`` positions and are ``width`` long; a window
    that would run past the end is clipped, and iteration stops after the
    first window that reaches the end of ``seq``.

    Yields:
        The slice ``seq[start:end]`` for each window.
    """
    total = len(seq)
    for start in range(0, total, shift):
        end = min(start + width, total)
        yield seq[start:end]
        if end == total:
            return
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
def seqSlidingWindow(seq, shift, width):
    """Yield successive windows of ``seq`` together with their coordinates.

    Windows start every ``shift`` positions and are ``width`` long; a window
    that would run past the end is clipped, and iteration stops after the
    first window that reaches the end of ``seq``.

    Yields:
        list: ``[seq[start:end], start, end]`` for each window.
    """
    total = len(seq)
    for start in range(0, total, shift):
        end = min(start + width, total)
        yield [seq[start:end], start, end]
        if end == total:
            return
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
def stringSubstitute(s, pos, c):
    """Return ``s`` with the element at index ``pos`` replaced by ``c``.

    Slicing and concatenation are used instead of converting to a list.
    Negative positions within range are interpreted like ordinary negative
    indices (``-1`` is the last element); without that normalisation
    ``s[pos + 1:]`` for ``pos == -1`` would be the whole of ``s``.

    Args:
        s (str): Original string.
        pos (int): Index of the character to replace.
        c (str): Replacement text.

    Returns:
        str: The string with the substitution applied.
    """
    if -len(s) <= pos < 0:
        pos += len(s)
    return s[:pos] + c + s[pos + 1 :]
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
def listToSortUniqueString(l):
    """Join the unique items of ``l`` into one sorted string.

    Args:
        l (List[str]): List of characters.

    Returns:
        str: The distinct items of ``l``, sorted and concatenated.
    """
    unique_items = list(set(l))
    unique_items.sort()
    return "".join(unique_items)
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
def n_lower_chars(string):
    """Return how many characters of ``string`` are lower case."""
    total = 0
    for ch in string:
        if ch.islower():
            total += 1
    return total
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
def countSlidingWindow(seq, shift, width):
    """Count variable sites, gaps and Ns in sliding windows of a sequence.

    The sequence is first reduced with ``simplifySeq`` (A/C/G/T removed,
    ambiguity codes turned into ``*``). The windows are then taken over that
    reduced sequence, not over the original coordinates, and each one is
    tallied with ``seqCounterSimple``.

    Args:
        seq (str): Sequence to scan.
        shift (int): Distance between consecutive window starts.
        width (int): Window length.

    Returns:
        List[dict]: One ``seqCounterSimple`` result per window, in order.
    """
    simplified = simplifySeq(seq)
    return [
        seqCounterSimple(window)
        for window in seqSlidingWindowString(simplified, shift, width)
    ]
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
class slidingWindowGenerator:
    """Callable that yields successive windows over a sequence.

    Calling an instance returns a generator of ``[window, start, end]``
    lists. The current position is stored on the instance and is not reset
    between calls.
    """

    def __init__(self, seq, shift, width):
        """Store the sequence and the window geometry.

        Args:
            seq: Sequence to slice.
            shift (int): Distance between consecutive window starts.
            width (int): Window length.
        """
        self.__seq = seq
        self.__seqlen = len(self.__seq)
        self.__shift = shift
        self.__width = width
        self.__i = 0

    def __call__(self):
        """Yield ``[seq[start:end], start, end]`` for each window.

        A window that would run past the end is clipped, and iteration stops
        after the first window that reaches the end of the sequence.

        Raises:
            ValueError: If another window is needed but ``shift`` is less
                than 1, which could never reach the end of the sequence.
        """
        while self.__i < self.__seqlen:
            j = min(self.__i + self.__width, self.__seqlen)
            yield [self.__seq[self.__i : j], self.__i, j]
            if j == self.__seqlen:
                break
            if self.__shift < 1:
                raise ValueError("shift must be a positive integer")
            # Advance to the next window start; without this step the same
            # window would be yielded forever.
            self.__i += self.__shift
|