pg-sui 0.2.0__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/METADATA +101 -79
  2. pg_sui-1.6.14.dev9.dist-info/RECORD +81 -0
  3. {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.14.dev9.dist-info/entry_points.txt +4 -0
  5. {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info/licenses}/LICENSE +0 -0
  6. pg_sui-1.6.14.dev9.dist-info/top_level.txt +1 -0
  7. pgsui/__init__.py +35 -54
  8. pgsui/_version.py +34 -0
  9. pgsui/cli.py +909 -0
  10. pgsui/data_processing/__init__.py +0 -0
  11. pgsui/data_processing/config.py +565 -0
  12. pgsui/data_processing/containers.py +1424 -0
  13. pgsui/data_processing/transformers.py +557 -907
  14. pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  15. pgsui/electron/app/__main__.py +5 -0
  16. pgsui/electron/app/extra-resources/.gitkeep +1 -0
  17. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  18. pgsui/electron/app/icons/icons/128x128.png +0 -0
  19. pgsui/electron/app/icons/icons/16x16.png +0 -0
  20. pgsui/electron/app/icons/icons/24x24.png +0 -0
  21. pgsui/electron/app/icons/icons/256x256.png +0 -0
  22. pgsui/electron/app/icons/icons/32x32.png +0 -0
  23. pgsui/electron/app/icons/icons/48x48.png +0 -0
  24. pgsui/electron/app/icons/icons/512x512.png +0 -0
  25. pgsui/electron/app/icons/icons/64x64.png +0 -0
  26. pgsui/electron/app/icons/icons/icon.icns +0 -0
  27. pgsui/electron/app/icons/icons/icon.ico +0 -0
  28. pgsui/electron/app/main.js +227 -0
  29. pgsui/electron/app/package-lock.json +6894 -0
  30. pgsui/electron/app/package.json +51 -0
  31. pgsui/electron/app/preload.js +15 -0
  32. pgsui/electron/app/server.py +157 -0
  33. pgsui/electron/app/ui/logo.png +0 -0
  34. pgsui/electron/app/ui/renderer.js +131 -0
  35. pgsui/electron/app/ui/styles.css +59 -0
  36. pgsui/electron/app/ui/ui_shim.js +72 -0
  37. pgsui/electron/bootstrap.py +43 -0
  38. pgsui/electron/launch.py +57 -0
  39. pgsui/electron/package.json +14 -0
  40. pgsui/example_data/__init__.py +0 -0
  41. pgsui/example_data/phylip_files/__init__.py +0 -0
  42. pgsui/example_data/phylip_files/test.phy +0 -0
  43. pgsui/example_data/popmaps/__init__.py +0 -0
  44. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  45. pgsui/example_data/structure_files/__init__.py +0 -0
  46. pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
  47. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  48. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  49. pgsui/impute/__init__.py +0 -0
  50. pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
  51. pgsui/impute/deterministic/imputers/mode.py +844 -0
  52. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  53. pgsui/impute/deterministic/imputers/phylo.py +973 -0
  54. pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
  55. pgsui/impute/supervised/__init__.py +0 -0
  56. pgsui/impute/supervised/base.py +343 -0
  57. pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  58. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
  59. pgsui/impute/supervised/imputers/random_forest.py +291 -0
  60. pgsui/impute/unsupervised/__init__.py +0 -0
  61. pgsui/impute/unsupervised/base.py +1118 -0
  62. pgsui/impute/unsupervised/callbacks.py +92 -262
  63. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
  64. pgsui/impute/unsupervised/imputers/autoencoder.py +1285 -0
  65. pgsui/impute/unsupervised/imputers/nlpca.py +1554 -0
  66. pgsui/impute/unsupervised/imputers/ubp.py +1575 -0
  67. pgsui/impute/unsupervised/imputers/vae.py +1228 -0
  68. pgsui/impute/unsupervised/loss_functions.py +261 -0
  69. pgsui/impute/unsupervised/models/__init__.py +0 -0
  70. pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
  71. pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
  72. pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
  73. pgsui/impute/unsupervised/models/vae_model.py +269 -630
  74. pgsui/impute/unsupervised/nn_scorers.py +255 -0
  75. pgsui/utils/__init__.py +0 -0
  76. pgsui/utils/classification_viz.py +608 -0
  77. pgsui/utils/logging_utils.py +22 -0
  78. pgsui/utils/misc.py +35 -480
  79. pgsui/utils/plotting.py +996 -829
  80. pgsui/utils/pretty_metrics.py +290 -0
  81. pgsui/utils/scorers.py +213 -666
  82. pg_sui-0.2.0.dist-info/RECORD +0 -75
  83. pg_sui-0.2.0.dist-info/top_level.txt +0 -3
  84. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  85. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  86. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  87. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  88. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  89. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  90. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  91. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  92. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  93. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  94. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  95. pgsui/example_data/trees/test.iqtree +0 -376
  96. pgsui/example_data/trees/test.qmat +0 -5
  97. pgsui/example_data/trees/test.rate +0 -2033
  98. pgsui/example_data/trees/test.tre +0 -1
  99. pgsui/example_data/trees/test_n10.rate +0 -19
  100. pgsui/example_data/trees/test_n100.rate +0 -109
  101. pgsui/example_data/trees/test_n500.rate +0 -509
  102. pgsui/example_data/trees/test_siterates.txt +0 -2024
  103. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  104. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  105. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  106. pgsui/example_data/vcf_files/test.vcf +0 -244
  107. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  108. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  109. pgsui/impute/estimators.py +0 -1268
  110. pgsui/impute/impute.py +0 -1463
  111. pgsui/impute/simple_imputers.py +0 -1431
  112. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
  113. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
  114. pgsui/impute/unsupervised/keras_classifiers.py +0 -697
  115. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  116. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
  117. pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
  118. pgsui/pg_sui.py +0 -261
  119. pgsui/utils/sequence_tools.py +0 -407
  120. simulation/sim_benchmarks.py +0 -333
  121. simulation/sim_treeparams.py +0 -475
  122. test/__init__.py +0 -0
  123. test/pg_sui_simtest.py +0 -215
  124. test/pg_sui_testing.py +0 -523
  125. test/test.py +0 -151
  126. test/test_pgsui.py +0 -374
  127. test/test_tkc.py +0 -185
pgsui/pg_sui.py DELETED
@@ -1,261 +0,0 @@
1
- #!/usr/bin/env python
2
-
3
- # Standard library imports
4
- import argparse
5
- import sys
6
-
7
- import numpy as np
8
- import pandas as pd
9
-
10
- from sklearn_genetic.space import Continuous, Categorical, Integer
11
-
12
- from snpio import GenotypeData
13
- from snpio import Plotting
14
- from pgsui import *
15
-
16
-
17
- def main():
18
- """Class instantiations and main package body"""
19
-
20
- args = get_arguments()
21
-
22
- if args.str and args.phylip:
23
- sys.exit("Error: Only one file type can be specified")
24
-
25
- # If VCF file is specified.
26
- if args.str:
27
- if not args.pop_ids and args.popmap is None:
28
- raise TypeError("Either --pop_ids or --popmap must be specified\n")
29
-
30
- if args.pop_ids:
31
- print("\n--pop_ids was specified as column 2\n")
32
- else:
33
- print(
34
- "\n--pop_ids was not specified; "
35
- "using popmap file to get population IDs\n"
36
- )
37
-
38
- if args.onerow_perind:
39
- print("\nUsing one row per individual...\n")
40
- else:
41
- print("\nUsing two rows per individual...\n")
42
-
43
- if args.onerow_perind:
44
- data = GenotypeData(
45
- filename=args.str,
46
- filetype="structure1row",
47
- popmapfile=args.popmap,
48
- guidetree=args.treefile,
49
- qmatrix_iqtree=args.iqtree,
50
- prefix=args.prefix,
51
- )
52
- else:
53
- data = GenotypeData(
54
- filename=args.str,
55
- filetype="structure2row",
56
- popmapfile=args.popmap,
57
- guidetree=args.treefile,
58
- qmatrix_iqtree=args.iqtree,
59
- prefix=args.prefix,
60
- )
61
-
62
- if args.phylip:
63
- if args.pop_ids or args.onerow_perind:
64
- print(
65
- "\nPhylip file was used with structure arguments; ignoring "
66
- "structure file arguments\n"
67
- )
68
-
69
- if args.popmap is None:
70
- raise TypeError("No popmap file supplied with PHYLIP file\n")
71
-
72
- data = GenotypeData(
73
- filename=args.phylip,
74
- filetype="phylip",
75
- popmapfile=args.popmap,
76
- guidetree=args.treefile,
77
- qmatrix_iqtree=args.iqtree,
78
- siterates_iqtree=args.site_rate,
79
- prefix=args.prefix,
80
- )
81
-
82
- data.missingness_reports(prefix=args.prefix, plot_format="png")
83
-
84
- # For GridSearchCV. Generate parameters to sample from.
85
- learning_rate = [float(10) ** x for x in np.arange(-4, -1)]
86
- l1_penalty = [float(10) ** x for x in np.arange(-5, -1)]
87
- l1_penalty.append(0.0)
88
- l2_penalty = [float(10) ** x for x in np.arange(-5, -1)]
89
- l2_penalty.append(0.0)
90
- hidden_activation = ["elu", "relu"]
91
- num_hidden_layers = [1, 2, 3]
92
- hidden_layer_sizes = ["sqrt", "midpoint"]
93
- n_components = [2, 3, 5, 10]
94
- dropout_rate = [0.0, 0.2, 0.4]
95
- # batch_size = [16, 32, 48, 64]
96
- optimizer = ["adam", "sgd", "adagrad"]
97
-
98
- # Some are commented out for testing purposes.
99
- # grid_params = {
100
- # "learning_rate": learning_rate,
101
- # # "l1_penalty": l1_penalty,
102
- # # "l2_penalty": l2_penalty,
103
- # # "hidden_layer_sizes": hidden_layer_sizes,
104
- # "n_components": n_components,
105
- # # "dropout_rate": dropout_rate,
106
- # # # "optimizer": optimizer,
107
- # # "num_hidden_layers": num_hidden_layers,
108
- # # "hidden_activation": hidden_activation,
109
- # }
110
-
111
- imp = ImputeXGBoost(
112
- data,
113
- max_iter=3,
114
- gridparams={"n_estimators": [100, 200]},
115
- n_nearest_features=5,
116
- # disable_progressbar=False,
117
- # epochs=100,
118
- # cv=3,
119
- # column_subset=1.0,
120
- # learning_rate=0.01,
121
- # num_hidden_layers=1,
122
- # hidden_layer_sizes="midpoint",
123
- # verbose=10,
124
- # dropout_rate=0.2,
125
- # hidden_activation="relu",
126
- # batch_size=32,
127
- # l1_penalty=1e-6,
128
- # l2_penalty=1e-6,
129
- # # gridparams=grid_params,
130
- # n_jobs=4,
131
- # grid_iter=5,
132
- # sim_strategy="nonrandom_weighted",
133
- # sim_prop_missing=0.5,
134
- # scoring_metric="precision_recall_macro",
135
- # gridsearch_method="gridsearch",
136
- # early_stop_gen=5,
137
- # n_components=3,
138
- # sample_weights={0: 1.0, 1: 0.0, 2: 1.0},
139
- # sample_weights="auto",
140
- )
141
-
142
- gd_imp = imp.imputed
143
-
144
- components, model = Plotting.run_pca(
145
- data,
146
- plot_format="png",
147
- center=True,
148
- scale=False,
149
- prefix=args.prefix,
150
- # n_axes=3,
151
- )
152
-
153
- components_imp, model_imp = Plotting.run_pca(
154
- gd_imp,
155
- plot_format="png",
156
- center=True,
157
- scale=False,
158
- prefix=args.prefix + "_imputed",
159
- )
160
-
161
-
162
- def get_arguments():
163
- """[Parse command-line arguments. Imported with argparse]
164
-
165
- Returns:
166
- [argparse object]: [contains command-line arguments; accessed as method]
167
- """
168
-
169
- parser = argparse.ArgumentParser(
170
- description="Machine learning missing data imputation and species delimitation",
171
- add_help=False,
172
- )
173
-
174
- required_args = parser.add_argument_group("Required arguments")
175
- filetype_args = parser.add_argument_group(
176
- "File type arguments (choose only one)"
177
- )
178
- structure_args = parser.add_argument_group("Structure file arguments")
179
- optional_args = parser.add_argument_group("Optional arguments")
180
-
181
- # File Type arguments
182
- filetype_args.add_argument(
183
- "-s", "--str", type=str, required=False, help="Input structure file"
184
- )
185
- filetype_args.add_argument(
186
- "-p", "--phylip", type=str, required=False, help="Input phylip file"
187
- )
188
-
189
- filetype_args.add_argument(
190
- "-t",
191
- "--treefile",
192
- type=str,
193
- required=False,
194
- default=None,
195
- help="Newick-formatted treefile",
196
- )
197
-
198
- filetype_args.add_argument(
199
- "-i",
200
- "--iqtree",
201
- type=str,
202
- required=False,
203
- help=".iqtree output file containing Rate Matrix Q",
204
- )
205
-
206
- filetype_args.add_argument(
207
- "--site_rate",
208
- type=str,
209
- required=False,
210
- help="Specify site rate input file.",
211
- )
212
-
213
- # Structure Arguments
214
- structure_args.add_argument(
215
- "--onerow_perind",
216
- default=False,
217
- action="store_true",
218
- help="Toggles on one row per individual option in structure file",
219
- )
220
- structure_args.add_argument(
221
- "--pop_ids",
222
- default=False,
223
- required=False,
224
- action="store_true",
225
- help="Toggles on population ID column (2nd col) in structure file",
226
- )
227
-
228
- ## Optional Arguments
229
- optional_args.add_argument(
230
- "-m",
231
- "--popmap",
232
- type=str,
233
- required=False,
234
- default=None,
235
- help="Two-column tab-separated population map file: inds\tpops. No header line",
236
- )
237
- optional_args.add_argument(
238
- "--prefix",
239
- type=str,
240
- required=False,
241
- default="imputer",
242
- help="Prefix for output directory. Output directory will be '<prefix>_output'",
243
- )
244
-
245
- # Add help menu
246
- optional_args.add_argument(
247
- "-h", "--help", action="help", help="Displays this help menu"
248
- )
249
-
250
- # If no command-line arguments are called then exit and call help menu.
251
- if len(sys.argv) == 1:
252
- print("\nExiting because no command-line options were called.\n")
253
- parser.print_help(sys.stderr)
254
- sys.exit(1)
255
-
256
- args = parser.parse_args()
257
- return args
258
-
259
-
260
- if __name__ == "__main__":
261
- main()
@@ -1,407 +0,0 @@
1
- import re
2
- import sys
3
- from itertools import product
4
- from collections import Counter
5
-
6
-
7
- def blacklist_missing(loci, threshold, iupac=False):
8
- blacklist = list()
9
- for i in range(0, len(loci)):
10
- alleles = expandLoci(loci[i], iupac=False)
11
- c = Counter(alleles)
12
- if float(c[-9] / sum(c.values())) > threshold:
13
- blacklist.append(i)
14
- return blacklist
15
-
16
-
17
- def blacklist_maf(loci, threshold, iupac=False):
18
- blacklist = list()
19
- for i in range(0, len(loci)):
20
- alleles = expandLoci(loci[i], iupac=False)
21
- c = Counter(alleles)
22
- if len(c.keys()) <= 1:
23
- blacklist.append(i)
24
- continue
25
- else:
26
- minor_count = c.most_common(2)[1][1]
27
- if float(minor_count / sum(c.values())) < threshold:
28
- blacklist.append(i)
29
- return blacklist
30
-
31
-
32
- def expandLoci(loc, iupac=False):
33
- """List of genotypes in 0-1-2 format."""
34
- ret = list()
35
- for i in loc:
36
- if not iupac:
37
- ret.extend(expand012(i))
38
- else:
39
- ret.extent(get_iupac_caseless(i))
40
- return ret
41
-
42
-
43
- def expand012(geno):
44
- g = str(geno)
45
- if g == "0":
46
- return [0, 0]
47
- elif g == "1":
48
- return [0, 1]
49
- elif g == "2":
50
- return [1, 1]
51
- else:
52
- return [-9, -9]
53
-
54
-
55
- def remove_items(all_list, bad_list):
56
- """Remove items from list using another list."""
57
- # using list comprehension to perform the task
58
- return [i for i in all_list if i not in bad_list]
59
-
60
-
61
- def count_alleles(l, vcf=False):
62
- """Count how many total alleles there are.
63
-
64
- Args:
65
- l (List[str]): List of IUPAC or VCF-style (e.g. 0/1) genotypes.
66
- vcf (bool, optional): Whether genotypes are VCF or STRUCTURE-style. Defaults to False.
67
-
68
- Returns:
69
- int: Total number of alleles in l.
70
- """
71
- all_items = list()
72
- for i in l:
73
- if vcf:
74
- all_items.extend(i.split("/"))
75
- else:
76
- all_items.extend(get_iupac_caseless(i))
77
- all_items = remove_items(all_items, ["-9", "-", "N", -9])
78
- return len(set(all_items))
79
-
80
-
81
- def get_major_allele(l, num=None, vcf=False):
82
- """Get most common alleles in list.
83
-
84
- Args:
85
- l (List[str]): List of genotypes for one sample.
86
-
87
- num (int, optional): Number of elements to return. Defaults to None.
88
-
89
- vcf (bool, optional): Alleles in VCF or STRUCTURE-style format. Defaults to False.
90
-
91
- Returns:
92
- list: Most common alleles in descending order.
93
- """
94
- all_items = list()
95
- for i in l:
96
- if vcf:
97
- all_items.extend(i.split("/"))
98
- else:
99
- all_items.extend(get_iupac_caseless(i))
100
-
101
- c = Counter(all_items) # requires collections import
102
-
103
- # List of tuples with [(allele, count), ...] in order of
104
- # most to least common
105
- rets = c.most_common(num)
106
-
107
- # Returns two most common non-ambiguous bases
108
- # Makes sure the least common base isn't N or -9
109
- if vcf:
110
- return [x[0] for x in rets if x[0] != "-9"]
111
- else:
112
- return [x[0] for x in rets if x[0] in ["A", "T", "G", "C"]]
113
-
114
-
115
- def get_iupac_caseless(char):
116
- """Split IUPAC code to two primary characters, assuming diploidy.
117
-
118
- Gives all non-valid ambiguities as N.
119
-
120
- Args:
121
- char (str): Base to expand into diploid list.
122
-
123
- Returns:
124
- List[str]: List of the two expanded alleles.
125
- """
126
- lower = False
127
- if char.islower():
128
- lower = True
129
- char = char.upper()
130
- iupac = {
131
- "A": ["A", "A"],
132
- "G": ["G", "G"],
133
- "C": ["C", "C"],
134
- "T": ["T", "T"],
135
- "N": ["N", "N"],
136
- "-": ["N", "N"],
137
- "R": ["A", "G"],
138
- "Y": ["C", "T"],
139
- "S": ["G", "C"],
140
- "W": ["A", "T"],
141
- "K": ["G", "T"],
142
- "M": ["A", "C"],
143
- "B": ["N", "N"],
144
- "D": ["N", "N"],
145
- "H": ["N", "N"],
146
- "V": ["N", "N"],
147
- }
148
- ret = iupac[char]
149
- if lower:
150
- ret = [c.lower() for c in ret]
151
- return ret
152
-
153
-
154
- def get_iupac_full(char):
155
- """Split IUPAC code to all possible primary characters.
156
-
157
- Gives all ambiguities as "N".
158
-
159
- Args:
160
- char (str): Base to expaned into list.
161
-
162
- Returns:
163
- List[str]: List of the expanded alleles.
164
- """
165
- char = char.upper()
166
- iupac = {
167
- "A": ["A"],
168
- "G": ["G"],
169
- "C": ["C"],
170
- "T": ["T"],
171
- "N": ["A", "C", "T", "G"],
172
- "-": ["A", "C", "T", "G"],
173
- "R": ["A", "G"],
174
- "Y": ["C", "T"],
175
- "S": ["G", "C"],
176
- "W": ["A", "T"],
177
- "K": ["G", "T"],
178
- "M": ["A", "C"],
179
- "B": ["C", "G", "T"],
180
- "D": ["A", "G", "T"],
181
- "H": ["A", "C", "T"],
182
- "V": ["A", "C", "G"],
183
- }
184
- ret = iupac[char]
185
- return ret
186
-
187
-
188
- def expandAmbiquousDNA(sequence):
189
- """Generator function to expand ambiguous sequences"""
190
- for i in product(*[get_iupac_caseless(j) for j in sequence]):
191
- yield ("".join(i))
192
-
193
-
194
- def get_revComp_caseless(char):
195
- """Function to return reverse complement of a nucleotide, while preserving case."""
196
- lower = False
197
- if char.islower():
198
- lower = True
199
- char = char.upper()
200
- d = {
201
- "A": "T",
202
- "G": "C",
203
- "C": "G",
204
- "T": "A",
205
- "N": "N",
206
- "-": "-",
207
- "R": "Y",
208
- "Y": "R",
209
- "S": "S",
210
- "W": "W",
211
- "K": "M",
212
- "M": "K",
213
- "B": "V",
214
- "D": "H",
215
- "H": "D",
216
- "V": "B",
217
- }
218
- ret = d[char]
219
- if lower:
220
- ret = ret.lower()
221
- return ret
222
-
223
-
224
- def reverseComplement(seq):
225
- """Function to reverse complement a sequence, with case preserved."""
226
- comp = []
227
- for i in (get_revComp_caseless(j) for j in seq):
228
- comp.append(i)
229
- return "".join(comp[::-1])
230
-
231
-
232
- def simplifySeq(seq):
233
- """Function to simplify a sequence."""
234
- temp = re.sub("[ACGT]", "", (seq).upper())
235
- return temp.translate(str.maketrans("RYSWKMBDHV", "**********"))
236
-
237
-
238
- def seqCounter(seq):
239
- """Returns dict of character counts"""
240
- d = {}
241
- d = {
242
- "A": 0,
243
- "N": 0,
244
- "-": 0,
245
- "C": 0,
246
- "G": 0,
247
- "T": 0,
248
- "R": 0,
249
- "Y": 0,
250
- "S": 0,
251
- "W": 0,
252
- "K": 0,
253
- "M": 0,
254
- "B": 0,
255
- "D": 0,
256
- "H": 0,
257
- "V": 0,
258
- }
259
- for c in seq:
260
- if c in d:
261
- d[c] += 1
262
- d["VAR"] = (
263
- d["R"]
264
- + d["Y"]
265
- + d["S"]
266
- + d["W"]
267
- + d["K"]
268
- + d["M"]
269
- + d["B"]
270
- + d["D"]
271
- + d["H"]
272
- + d["V"]
273
- )
274
- return d
275
-
276
-
277
- def getFlankCounts(ref, x, y, dist):
278
- """Get vars, gaps, and N counts for flanking regions of a substring."""
279
- x2 = x - dist
280
- if x2 < 0:
281
- x2 = 0
282
- y2 = y + dist
283
- if y2 > len(ref):
284
- y2 = len(ref)
285
- flanks = ref[x2:x] + ref[y:y2] # flanks = right + left flank
286
- counts = seqCounterSimple(simplifySeq(flanks))
287
- return counts
288
-
289
-
290
- def seqCounterSimple(seq):
291
- """Get dict of character counts from a simplified consensus sequence."""
292
- d = {}
293
- d = {"N": 0, "-": 0, "*": 0}
294
- for c in seq:
295
- if c in d:
296
- d[c] += 1
297
- return d
298
-
299
-
300
- def gc_counts(string):
301
- """Get GC content of a provided sequence."""
302
- new = re.sub("[GCgc]", "#", string)
303
- return sum(1 for c in new if c == "#")
304
-
305
-
306
- def mask_counts(string):
307
- """Get counts of masked bases."""
308
- return sum(1 for c in string if c.islower())
309
-
310
-
311
- def gc_content(string):
312
- """Get GC content as proportion."""
313
- new = re.sub("[GCgc]", "#", string)
314
- count = sum(1 for c in new if c == "#")
315
- return count / (len(string))
316
-
317
-
318
- def mask_content(string):
319
- """Count number of lower case in a string."""
320
- count = sum(1 for c in string if c.islower())
321
- return count / (len(string))
322
-
323
-
324
- def seqSlidingWindowString(seq, shift, width):
325
- """Generator to create sliding windows by slicing out substrings."""
326
- seqlen = len(seq)
327
- for i in range(0, seqlen, shift):
328
- if i + width > seqlen:
329
- j = seqlen
330
- else:
331
- j = i + width
332
- yield seq[i:j]
333
- if j == seqlen:
334
- break
335
-
336
-
337
- def seqSlidingWindow(seq, shift, width):
338
- """Generator to create sliding windows by slicing out substrings."""
339
- seqlen = len(seq)
340
- for i in range(0, seqlen, shift):
341
- if i + width > seqlen:
342
- j = seqlen
343
- else:
344
- j = i + width
345
- yield [seq[i:j], i, j]
346
- if j == seqlen:
347
- break
348
-
349
-
350
- def stringSubstitute(s, pos, c):
351
- """Fast way to replace single char in string.
352
-
353
- This way is a lot faster than doing it by making a list and subst in list.
354
- """
355
- return s[:pos] + c + s[pos + 1 :]
356
-
357
-
358
- def listToSortUniqueString(l):
359
- """Get sorted unique string from list of chars.
360
-
361
- Args:
362
- l (List[str]): List of characters.
363
-
364
- Returns:
365
- List[str]: Sorted unique strings from list.
366
- """
367
- sl = sorted(set(l))
368
- return str("".join(sl))
369
-
370
-
371
- def n_lower_chars(string):
372
- """Count number of lower case in a string."""
373
- return sum(1 for c in string if c.islower())
374
-
375
-
376
- def countSlidingWindow(seq, shift, width):
377
- """Simplify a sequence to SNP, gaps, and Ns; get counts of sliding windows."""
378
- seq_temp = re.sub("[ACGT]", "", seq.upper())
379
- seq_norm = seq_temp.translate(str.maketrans("RYSWKMBDHV", "**********"))
380
- for i in windowSub(seq_norm, shift, width):
381
- # print(i)
382
- window_seq = "".join(i)
383
- seqCounterSimple(window_seq)
384
-
385
-
386
- class slidingWindowGenerator:
387
- """Object for creating an iterable sliding window sampling."""
388
-
389
- # Need to come back and comment better...
390
- def __init__(self, seq, shift, width):
391
- self.__seq = seq
392
- self.__seqlen = len(self.__seq)
393
- self.__shift = shift
394
- self.__width = width
395
- self.__i = 0
396
-
397
- def __call__(self):
398
- self.__seqlen
399
- while self.__i < self.__seqlen:
400
- # print("i is ", self.__i, " : Base is ", self.__seq[self.__i]) #debug print
401
- if self.__i + self.__width > self.__seqlen:
402
- j = self.__seqlen
403
- else:
404
- j = self.__i + self.__width
405
- yield [self.__seq[self.__i : j], self.__i, j]
406
- if j == self.__seqlen:
407
- break