NGSpeciesID 0.3.0.tar.gz → 0.3.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/NGSpeciesID +52 -59
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1/NGSpeciesID.egg-info}/PKG-INFO +41 -10
- {NGSpeciesID-0.3.0/NGSpeciesID.egg-info → ngspeciesid-0.3.1}/PKG-INFO +41 -10
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/README.md +27 -8
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/barcode_trimmer.py +20 -42
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/cluster.py +33 -57
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/consensus.py +38 -47
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/get_sorted_fastq_for_cluster.py +25 -31
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/help_functions.py +5 -3
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/parallelize.py +26 -23
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/setup.py +1 -1
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/LICENSE.txt +0 -0
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/MANIFEST.in +0 -0
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/NGSpeciesID.egg-info/SOURCES.txt +0 -0
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/NGSpeciesID.egg-info/dependency_links.txt +0 -0
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/NGSpeciesID.egg-info/requires.txt +0 -0
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/NGSpeciesID.egg-info/top_level.txt +0 -0
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/__init__.py +0 -0
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/p_minimizers_shared.py +0 -0
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/setup.cfg +0 -0
{NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/NGSpeciesID

@@ -4,13 +4,12 @@ from __future__ import print_function
 import os,sys
 import argparse
 import tempfile
-import errno
 from time import time
 import shutil
 import random
+import logging
 
 from modules import get_sorted_fastq_for_cluster
-from modules import cluster
 from modules import p_minimizers_shared
 from modules import help_functions
 from modules import parallelize
@@ -30,7 +29,7 @@ def single_clustering(read_array, p_emp_probs, args):
     result_dict = cluster.reads_to_clusters(clusters, representatives, read_array, p_emp_probs, minimizer_database, new_batch_index, args)
     # Unpack result. The result dictionary structure is convenient for multiprocessing return but clumsy in single core mode.
     clusters, representatives, _, _ = list(result_dict.values())[0]
-
+    logging.debug(f"Time elapesd clustering: {time() - start_cluster}")
     return clusters, representatives
 
 
@@ -45,28 +44,30 @@ def main(args):
     """
     ##### Sort all reads according to expected errorfree kmers #####
    args.outfile = os.path.join(args.outfolder, "sorted.fastq")
-
+    logging.debug("started sorting seqs")
     start = time()
     sorted_reads_fastq_file = get_sorted_fastq_for_cluster.main(args)
-
+    logging.debug(f"elapsed time sorting: {time() - start}")
     #################################################################
 
     ##### Filter and subsample #####
     if args.target_length > 0 and args.target_deviation > 0:
         read_array = [ (i, 0, acc, seq, qual, float(acc.split("_")[-1])) for i, (acc, (seq, qual)) in enumerate(help_functions.readfq(open(sorted_reads_fastq_file, 'r'))) if args.target_length - args.target_deviation <= len(seq) <= args.target_length + args.target_deviation]
-
+        logging.debug("Number of reads with read length in interval [{0},{1}]: {2}".format(args.target_length - args.target_deviation, args.target_length + args.target_deviation, len(read_array)))
     else:
         read_array = [ (i, 0, acc, seq, qual, float(acc.split("_")[-1])) for i, (acc, (seq, qual)) in enumerate(help_functions.readfq(open(sorted_reads_fastq_file, 'r')))]
 
-    if
-    read_array =
+    if args.top_reads:
+        read_array = read_array[:args.sample_size]
+    elif 0 < args.sample_size < len(read_array):
+        read_array = [read_array[i] for i in sorted(random.sample(range(len(read_array)), args.sample_size))]
 
     abundance_cutoff = int( args.abundance_ratio * len(read_array))
     #################################################################
 
 
     ##### Import precalculated probabilities of minimizer matching given the error rates of reads, kmer length, and window length #####
-
+    logging.debug("Started imported empirical error probabilities of minimizers shared:")
     start = time()
     p_min_shared = p_minimizers_shared.read_empirical_p()
     p_emp_probs = {}
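The subsampling hunk above replaces a truncated single branch with an explicit choice: take the first `--sample_size` reads (`--top_reads`), or draw an order-preserving random sample. A standalone sketch of that selection logic follows; `subsample` and the toy read list are illustrative helpers, not part of the package (the real reads arrive pre-sorted by expected error-free kmers, so the head of the array is the best-scoring slice):

```
import random

def subsample(read_array, sample_size, top_reads):
    # Mirrors the 0.3.1 branch: head of the (quality-sorted) array,
    # or a random sample whose original order is preserved by sorted().
    if top_reads:
        return read_array[:sample_size]
    elif 0 < sample_size < len(read_array):
        return [read_array[i] for i in sorted(random.sample(range(len(read_array)), sample_size))]
    return read_array

reads = [f"read_{i}" for i in range(10)]
print(subsample(reads, 3, top_reads=True))   # ['read_0', 'read_1', 'read_2']
print(subsample(reads, 3, top_reads=False))  # e.g. ['read_2', 'read_5', 'read_8']
```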
@@ -75,21 +76,22 @@ def main(args):
         p_emp_probs[(float(e1),float(e2))] = float(p)
         p_emp_probs[(float(e2),float(e1))] = float(p)
 
-
-
-
+    logging.debug(f"{p_emp_probs}")
+    logging.debug(f"{len(p_emp_probs)}")
+    logging.debug(f"elapsed time imported empirical error probabilities of minimizers shared: {time() - start}")
     ##################################################################################################################################
 
+    logging.info(f"Starting Clustering: {len(read_array)} reads")
     ##### Cluster reads, bulk of code base is here #####
-
+    logging.debug("started clustring")
     start = time()
     if args.nr_cores > 1:
         clusters, representatives = parallelize.parallel_clustering(read_array, p_emp_probs, args)
     else:
-
+        logging.debug("Using 1 core.")
         clusters, representatives = single_clustering(read_array, p_emp_probs, args)
         # clusters, representatives = cluster.cluster_seqs(read_array, p_emp_probs, args)
-
+    logging.debug(f"Time elapsed clustering: {time() - start}")
     ####################################################
 
 
@@ -101,7 +103,7 @@ def main(args):
     output_cl_id = 0
     for c_id, all_read_acc in sorted(clusters.items(), key = lambda x: (len(x[1]),representatives[x[0]][5]), reverse=True):
     # for c_id, all_read_acc in sorted(clusters.items(), key = lambda x: (len(x[1]),x[0]), reverse=True):
-        read_cl_id, b_i, acc, c_seq, c_qual, score, error_rate = representatives[c_id]
+        read_cl_id, b_i, acc, c_seq, c_qual, score, error_rate, _ = representatives[c_id]
         origins_outfile.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(output_cl_id, "_".join([item for item in acc.split("_")[:-1]]), c_seq, c_qual, score, error_rate))
 
         for r_acc in sorted(all_read_acc, key = lambda x: float(x.split("_")[-1]) , reverse=True):
@@ -111,20 +113,19 @@ def main(args):
 
         output_cl_id +=1
 
-
-
+    logging.debug(f"Nr clusters larger than 1: {nontrivial_cluster_index}") #, "Non-clustered reads:", len(archived_reads))
+    logging.debug(f"Nr clusters (all): {len(clusters)}") #, "Non-clustered reads:", len(archived_reads))
     outfile.close()
     origins_outfile.close()
     ############################
 
+    logging.info(f"Finished Clustering: {nontrivial_cluster_index} clusters formed")
 
     if args.consensus:
-
-        print("STARTING TO CREATE CLUSTER CONSENSUS")
-        print()
+        logging.info(f"Starting Consensus creation and polishing")
         work_dir = tempfile.mkdtemp()
-
-
+        logging.debug(f"Temporary workdirectory for consensus and polishing: {work_dir}")
+        logging.debug(
             f"Forming draft consensus with abundance_cutoff >= {abundance_cutoff} "
             f"({args.abundance_ratio * 100}% of {len(read_array)} reads)"
         )
@@ -132,27 +133,29 @@ def main(args):
 
         if args.primer_file or args.remove_universal_tails:
             if args.remove_universal_tails:
-
+                logging.debug("Detecting and removing universal tails")
                 barcodes = barcode_trimmer.get_universal_tails()
             else:
-
+                logging.debug("Detecting and removing primers")
                 barcodes = barcode_trimmer.read_barcodes(args.primer_file)
 
             barcode_trimmer.remove_barcodes(centers, barcodes, args)
 
-
+        logging.debug("{0} centers formed".format(len(centers)))
         centers_filtered = consensus.detect_reverse_complements(centers, args.rc_identity_threshold)
         centers_polished = consensus.polish_sequences(centers_filtered, args)
 
         if args.primer_file or args.remove_universal_tails: # check if barcode is found after polishing with medaka
-            barcode_trimmer.remove_barcodes(centers_polished, barcodes, args)
-
-
+            centers_updated = barcode_trimmer.remove_barcodes(centers_polished, barcodes, args)
+            if centers_updated:
+                centers_filtered = consensus.detect_reverse_complements(centers_polished, args.rc_identity_threshold)
+                centers_polished = consensus.polish_sequences(centers_filtered, args)
 
 
-
+        logging.debug("removing temporary workdir")
         shutil.rmtree(work_dir)
 
+        logging.info(f"Finished Consensus creation: {len(centers_filtered)} created")
 
 
 def write_fastq(args):
@@ -183,19 +186,17 @@ def write_fastq(args):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Reference-free clustering and consensus forming of targeted ONT or PacBio reads", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--version', action='version', version='%(prog)s 0.3.0')
-
-    parser.
-
-
-    # parser.add_argument('--mapping', action="store_true", help='Only infer clusters by mapping, no alignment is performed.')
+    parser.add_argument('--version', action='version', version='%(prog)s 0.3.1')
+    parser.add_argument('--debug', action='store_true', help='Enable debug logging')
+    reads_file = parser.add_mutually_exclusive_group(required=True)
+    reads_file.add_argument('--fastq', type=str, help='Path to consensus fastq file(s)')
+    reads_file.add_argument('--use_old_sorted_file', action='store_true', help='Using already existing sorted file if present in specified output directory.')
     parser.add_argument('--t', dest="nr_cores", type=int, default=8, help='Number of cores allocated for clustering')
     parser.add_argument('--d', dest="print_output", type=int, default=10000, help='For debugging, prints status of clustering and minimizer database every p reads processed.')
     parser.add_argument('--q', dest="quality_threshold", type=float, default=7.0, help='Filters reads with average phred quality value under this number (default = 7.0).')
 
     parser.add_argument('--ont', action="store_true", help='Clustering of ONT transcript reads.')
     parser.add_argument('--isoseq', action="store_true", help='Clustering of PacBio Iso-Seq reads.')
-    parser.add_argument('--use_old_sorted_file', action="store_true", help='Using already existing sorted file if present in specified output directory.')
 
     parser.add_argument('--consensus', action="store_true", help='After clustering, (1) run spoa on all clusters, (2) detect reverse complements, (3) run medaka.')
     parser.add_argument('--abundance_ratio', type=float, default=0.1, help='Threshold for --consensus algorithm. Consider only clusters larger than a fraction of number of total reads (default 0.1)')
@@ -207,6 +208,7 @@ if __name__ == '__main__':
     group.add_argument('--racon', action="store_true", help='Run final racon polishing algorithm.')
 
     parser.add_argument('--medaka_model', type=str, default="", help='Set specific medaka model.')
+    parser.add_argument('--medaka_fastq', action="store_true", help='Request Medaka to output a FASTQ file, instead of FASTA')
     parser.add_argument('--racon_iter', type=int, default=2, help='Number of times to run racon iteratively')
 
     group2 = parser.add_mutually_exclusive_group()
@@ -217,8 +219,7 @@ if __name__ == '__main__':
     parser.add_argument('--m', dest="target_length", type=int, default=0, help='Intended amplicon length. Invoked to filter out reads with length greater than m + s or smaller than m - s (default = 0 which means no filtering)')
     parser.add_argument('--s', dest="target_deviation", type=int, default=0, help='Maximum allowed amplicon-length deviation. Invoked to filter out reads with length greater than m + s or smaller than m - s (default = 0 which means no filtering)')
     parser.add_argument('--sample_size', type=int, default=0, help='Use sample_size reads in the NGSpecies pipeline (default = 0 which means all reads considered). If sample size is larger than actual number of reads, all reads will be used.')
-
-
+    parser.add_argument('--top_reads', action='store_true', help='Use the top --sample_size reads instead of a random selection (default = false, which means random reads considered). ')
 
 
     parser.add_argument('--k', type=int, default=13, help='Kmer size')
@@ -226,6 +227,7 @@ if __name__ == '__main__':
     parser.add_argument('--min_shared', type=int, default=5, help='Minmum number of minimizers shared between read and cluster')
     parser.add_argument('--mapped_threshold', type=float, default=0.7, help='Minmum mapped fraction of read to be included in cluster. The density of minimizers to classify a region as mapped depends on quality of the read.')
     parser.add_argument('--aligned_threshold', type=float, default=0.4, help='Minmum aligned fraction of read to be included in cluster. Aligned identity depends on the quality of the read.')
+    parser.add_argument('--symmetric_map_align_thresholds', action='store_true', help='Apply mapped threshold and aligned threshold to fraction of cluster representative which maps onto the read')
     parser.add_argument('--batch_type', type=str, default='total_nt', help='In parrallel mode, how to split the reads into chunks "total_nt", "nr_reads", or "weighted" (default: total_nt) ')
     parser.add_argument('--min_fraction', type=float, default=0.8, help='Minmum fraction of minimizers shared compared to best hit, in order to continue mapping.')
     parser.add_argument('--min_prob_no_hits', type=float, default=0.1, help='Minimum probability for i consecutive minimizers to be different between read and representative and still considered as mapped region, under assumption that they come from the same transcript (depends on read quality).')
@@ -244,21 +246,20 @@ if __name__ == '__main__':
 
     args = parser.parse_args()
 
+    loglevel = logging.DEBUG if args.debug else logging.INFO
+
+    logging.basicConfig(
+        level=loglevel,
+        format='%(message)s'
+    )
+
     if args.which == 'write_fastq':
         write_fastq(args)
-
+        logging.info("Wrote clusters to separate fastq files.")
         sys.exit(0)
 
-    if (args.fastq and (args.flnc or args.ccs)):
-        print("Either (1) only a fastq file, or (2) a ccs and a flnc file should be specified. ")
-        sys.exit()
-
-    if (args.flnc != False and args.ccs == False ) or (args.flnc == False and args.ccs != False ):
-        print("isONclust needs both the ccs.bam file produced by ccs and the flnc file produced by isoseq3 cluster. ")
-        sys.exit()
-
     if args.ont and args.isoseq :
-
+        logging.error("Arguments mutually exclusive, specify either --isoseq or --ont. ")
         sys.exit()
     elif args.isoseq:
         args.k = 15
@@ -271,24 +272,16 @@ if __name__ == '__main__':
     if len(sys.argv)==1:
         parser.print_help()
         sys.exit()
-    if not args.fastq and not args.flnc and not args.ccs:
-        parser.print_help()
-        sys.exit()
-
 
     if args.outfolder and not os.path.exists(args.outfolder):
         os.makedirs(args.outfolder)
 
-
-    # edlib_module = 'edlib'
     parasail_module = 'parasail'
-    # if edlib_module not in sys.modules:
-    #     print('You have not imported the {0} module. Only performing clustering with mapping, i.e., no alignment.'.format(edlib_module))
     if parasail_module not in sys.modules:
-
+        logging.error('You have not imported the {0} module. Only performing clustering with mapping, i.e., no alignment!'.format(parasail_module))
         sys.exit(1)
     if 100 < args.w or args.w < args.k:
-
+        logging.error('Please specify a window of size larger or equal to k, and smaller than 100.')
        sys.exit(1)
 
     main(args)
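Across this file, the release replaces ad-hoc `print()` calls with the `logging` module, with verbosity controlled by the new `--debug` flag and configured once via `logging.basicConfig`. A minimal, runnable sketch of that same pattern (the script and the messages are illustrative, not from the package):

```
import argparse
import logging

parser = argparse.ArgumentParser()
parser.add_argument('--debug', action='store_true', help='Enable debug logging')
args = parser.parse_args()

# Same pattern as the diff: --debug lowers the root level; format stays bare.
logging.basicConfig(
    level=logging.DEBUG if args.debug else logging.INFO,
    format='%(message)s'
)

logging.info("always printed")            # shown at the default INFO level
logging.debug("printed only with --debug")
```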
{NGSpeciesID-0.3.0 → ngspeciesid-0.3.1/NGSpeciesID.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: NGSpeciesID
-Version: 0.3.0
+Version: 0.3.1
 Summary: Reconstructs viral consensus sequences from a set of ONT reads.
 Home-page: https://github.com/ksahlin/NGSpeciesID
 Author: Kristoffer Sahlin
@@ -14,6 +14,18 @@ Classifier: Programming Language :: Python :: 3.6
 Classifier: Programming Language :: Python :: 3.7
 Requires-Python: !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4
 License-File: LICENSE.txt
+Requires-Dist: parasail==1.2.4
+Requires-Dist: edlib>=1.1.2
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 NGSpeciesID
 ===========
@@ -25,25 +37,44 @@ NGSpeciesID is distributed as a python package supported on Linux / OSX with pyt
 Table of Contents
 =================
 
-* [INSTALLATION](#
-  * [Using conda](#
+* [INSTALLATION](#installation)
+  * [Using conda](#using-conda)
   * [Testing installation](#testing-installation)
-* [USAGE](#
+* [USAGE](#usage)
   * [Filtering and subsampling](#filtering-and-subsampling)
   * [Removing primers](#removing-primers)
-  * [Output](#
-* [EXAMPLE WORKFLOW](#
-* [CREDITS](#
-* [LICENCE](#
+  * [Output](#output)
+* [EXAMPLE WORKFLOW](#example-workflow)
+* [CREDITS](#credits)
+* [LICENCE](#licence)
 
 
 
 INSTALLATION
 ----------------
 
-
+<!---
+**NOTE**: If you are experiencing issues (e.g. [this one](https://github.com/rvaser/spoa/issues/26)) with the third party tools [spoa](https://github.com/rvaser/spoa) or [medaka](https://github.com/nanoporetech/medaka) in the installation instructions below, please install the tools manually with their respective installation instructions [here](https://github.com/rvaser/spoa#installation) and [here](https://github.com/nanoporetech/medaka#installation).
+-->
 
 ### Using conda
+
+**Recent update (2025-04-19)**
+
+There have been many version updates of medaka and spoa since NGSpeciesID was first published. Below are instructions to install
+NGSpeciesID with newer versions of spoa ([v4.1.4](https://bioconda.github.io/recipes/spoa/README.html)) and medaka (v2.0.1).
+
+```
+conda create -n NGSpeciesID python=3.11 pip
+conda activate NGSpeciesID
+conda install --yes -c conda-forge -c bioconda medaka==2.0.1 openblas==0.3.3 spoa racon minimap2 samtools
+pip install NGSpeciesID
+```
+
+Make sure you [test the installation](#testing-installation).
+
+**Published installation instructions (2021-01-11)**
+
 Conda is the preferred way to install NGSpeciesID.
 
 1. Create and activate a new environment called NGSpeciesID
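The new `Requires-Dist` lines record the dependency pins that setuptools now writes into the sdist metadata. A sketch of how such pins are typically declared in a `setup.py` (the package's actual `setup.py` is not shown here; the file list above records only a one-line change to it, presumably the version bump):

```
from setuptools import setup

# Illustrative only: fields mirror the PKG-INFO metadata in this diff.
setup(
    name="NGSpeciesID",
    version="0.3.1",
    install_requires=[
        "parasail==1.2.4",  # becomes "Requires-Dist: parasail==1.2.4"
        "edlib>=1.1.2",     # becomes "Requires-Dist: edlib>=1.1.2"
    ],
)
```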
{NGSpeciesID-0.3.0/NGSpeciesID.egg-info → ngspeciesid-0.3.1}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: NGSpeciesID
-Version: 0.3.0
+Version: 0.3.1
 Summary: Reconstructs viral consensus sequences from a set of ONT reads.
 Home-page: https://github.com/ksahlin/NGSpeciesID
 Author: Kristoffer Sahlin
@@ -14,6 +14,18 @@ Classifier: Programming Language :: Python :: 3.6
 Classifier: Programming Language :: Python :: 3.7
 Requires-Python: !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4
 License-File: LICENSE.txt
+Requires-Dist: parasail==1.2.4
+Requires-Dist: edlib>=1.1.2
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 NGSpeciesID
 ===========
@@ -25,25 +37,44 @@ NGSpeciesID is distributed as a python package supported on Linux / OSX with pyt
 Table of Contents
 =================
 
-* [INSTALLATION](#
-  * [Using conda](#
+* [INSTALLATION](#installation)
+  * [Using conda](#using-conda)
   * [Testing installation](#testing-installation)
-* [USAGE](#
+* [USAGE](#usage)
   * [Filtering and subsampling](#filtering-and-subsampling)
   * [Removing primers](#removing-primers)
-  * [Output](#
-* [EXAMPLE WORKFLOW](#
-* [CREDITS](#
-* [LICENCE](#
+  * [Output](#output)
+* [EXAMPLE WORKFLOW](#example-workflow)
+* [CREDITS](#credits)
+* [LICENCE](#licence)
 
 
 
 INSTALLATION
 ----------------
 
-
+<!---
+**NOTE**: If you are experiencing issues (e.g. [this one](https://github.com/rvaser/spoa/issues/26)) with the third party tools [spoa](https://github.com/rvaser/spoa) or [medaka](https://github.com/nanoporetech/medaka) in the installation instructions below, please install the tools manually with their respective installation instructions [here](https://github.com/rvaser/spoa#installation) and [here](https://github.com/nanoporetech/medaka#installation).
+-->
 
 ### Using conda
+
+**Recent update (2025-04-19)**
+
+There have been many version updates of medaka and spoa since NGSpeciesID was first published. Below are instructions to install
+NGSpeciesID with newer versions of spoa ([v4.1.4](https://bioconda.github.io/recipes/spoa/README.html)) and medaka (v2.0.1).
+
+```
+conda create -n NGSpeciesID python=3.11 pip
+conda activate NGSpeciesID
+conda install --yes -c conda-forge -c bioconda medaka==2.0.1 openblas==0.3.3 spoa racon minimap2 samtools
+pip install NGSpeciesID
+```
+
+Make sure you [test the installation](#testing-installation).
+
+**Published installation instructions (2021-01-11)**
+
 Conda is the preferred way to install NGSpeciesID.
 
 1. Create and activate a new environment called NGSpeciesID
{NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/README.md

@@ -8,25 +8,44 @@ NGSpeciesID is distributed as a python package supported on Linux / OSX with pyt
 Table of Contents
 =================
 
-* [INSTALLATION](#
-  * [Using conda](#
+* [INSTALLATION](#installation)
+  * [Using conda](#using-conda)
   * [Testing installation](#testing-installation)
-* [USAGE](#
+* [USAGE](#usage)
   * [Filtering and subsampling](#filtering-and-subsampling)
   * [Removing primers](#removing-primers)
-  * [Output](#
-* [EXAMPLE WORKFLOW](#
-* [CREDITS](#
-* [LICENCE](#
+  * [Output](#output)
+* [EXAMPLE WORKFLOW](#example-workflow)
+* [CREDITS](#credits)
+* [LICENCE](#licence)
 
 
 
 INSTALLATION
 ----------------
 
-
+<!---
+**NOTE**: If you are experiencing issues (e.g. [this one](https://github.com/rvaser/spoa/issues/26)) with the third party tools [spoa](https://github.com/rvaser/spoa) or [medaka](https://github.com/nanoporetech/medaka) in the installation instructions below, please install the tools manually with their respective installation instructions [here](https://github.com/rvaser/spoa#installation) and [here](https://github.com/nanoporetech/medaka#installation).
+-->
 
 ### Using conda
+
+**Recent update (2025-04-19)**
+
+There have been many version updates of medaka and spoa since NGSpeciesID was first published. Below are instructions to install
+NGSpeciesID with newer versions of spoa ([v4.1.4](https://bioconda.github.io/recipes/spoa/README.html)) and medaka (v2.0.1).
+
+```
+conda create -n NGSpeciesID python=3.11 pip
+conda activate NGSpeciesID
+conda install --yes -c conda-forge -c bioconda medaka==2.0.1 openblas==0.3.3 spoa racon minimap2 samtools
+pip install NGSpeciesID
+```
+
+Make sure you [test the installation](#testing-installation).
+
+**Published installation instructions (2021-01-11)**
+
 Conda is the preferred way to install NGSpeciesID.
 
 1. Create and activate a new environment called NGSpeciesID
{NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/barcode_trimmer.py

@@ -1,5 +1,6 @@
 
 import edlib
+import logging
 
 from modules import help_functions
 
@@ -15,10 +16,10 @@ def read_barcodes(primer_file):
     barcodes = { acc + '_fw' : seq.strip() for acc, (seq, _) in help_functions.readfq(open(primer_file, 'r'))}
 
     for acc, seq in list(barcodes.items()):
-
+        logging.debug(f"{acc} {seq} {acc[:-3]}")
         barcodes[acc[:-3] + '_rc'] = reverse_complement(seq.upper())
 
-
+    logging.debug(f"{barcodes}")
     return barcodes
 
 def get_universal_tails():
@@ -26,7 +27,7 @@ def get_universal_tails():
                 '2_R_rc' : 'ACTTGCCTGTCGCTCTATCTTC'}
     barcodes['1_F_rc'] = reverse_complement(barcodes['1_F_fw'])
     barcodes['2_R_fw'] = reverse_complement(barcodes['2_R_rc'])
-
+    logging.debug(f"{barcodes}")
     return barcodes
 
 
@@ -45,14 +46,13 @@ def find_barcode_locations(center, barcodes, primer_max_ed):
                  ('X', 'T'), ('X', 'C'), ('N', 'G'), ('N', 'A'), ('N', 'T'), ('N', 'C')]
     all_locations = []
     for primer_acc, primer_seq in barcodes.items():
-        # print(primer_acc, primer_seq,center)
         # Add additionalEqualities=IUPAC_map allow edlib to understand IUPAC code
         result = edlib.align(primer_seq, center,
                              mode="HW", task="locations", k=primer_max_ed,
                              additionalEqualities=IUPAC_map)
         ed = result["editDistance"]
         locations = result["locations"]
-
+        logging.debug(f"{locations} {ed}")
         if locations:
             all_locations.append((primer_acc, locations[0][0], locations[0][1], ed))
     return all_locations
@@ -63,7 +63,8 @@ def remove_barcodes(centers, barcodes, args):
     Modifies consensus sequences by copping of at barcode sites.
     This implies changing the datastructure centers with the modified consensus sequeces
     """
-
+
+    centers_updated = False
     for i, (nr_reads_in_cluster, c_id, center, reads_path_name) in enumerate(centers):
 
         # if consensus is smaller than 2*trim_window we set trim window to half the sequence
@@ -74,53 +75,30 @@ def remove_barcodes(centers, barcodes, args):
 
         barcode_locations_beginning = find_barcode_locations(center[:trim_window], barcodes, args.primer_max_ed)
         barcode_locations_end = find_barcode_locations(center[-trim_window:], barcodes, args.primer_max_ed)
-
+        logging.debug(f"{center}")
 
         cut_start = 0
         if barcode_locations_beginning:
-
+            logging.debug(f"FOUND BARCODE BEGINNING {barcode_locations_beginning}")
            for bc, start, stop, ed in barcode_locations_beginning:
                 if stop > cut_start:
                     cut_start = stop
 
         cut_end = len(center)
         if barcode_locations_end:
-
+            logging.debug(f"FOUND BARCODE END {barcode_locations_end}")
             earliest_hit = len(center)
             for bc, start, stop, ed in barcode_locations_end:
                 if start < earliest_hit:
                     earliest_hit = start
             cut_end = len(center) - (trim_window - earliest_hit)
-
-
-
-
-
-
-
-
-
-
-        # print("FOUND BARCODE", barcode_locations)
-        # cut_start = 0
-        # cut_end = len(center)
-        # print(center)
-        # for bc, start, stop, ed in barcode_locations:
-        #     # print(ed,bc, bc[-4], bc[-2:])
-        #     if bc[-4] == 'F' and bc[-2:] == 'fw':
-        #         cut_start = stop
-        #     elif bc[-4] == 'R' and bc[-2:] == 'fw':
-        #         cut_end = start
-        #     elif bc[-4] == 'R' and bc[-2:] == 'rc':
-        #         cut_start = stop
-        #     elif bc[-4] == 'F' and bc[-2:] == 'rc':
-        #         cut_end = start
-        #     else:
-        #         print()
-        #         print("Primer file not in correct format!")
-        #         print()
-        # # print(center)
-        # center = center[cut_start: cut_end]
-        # print(center, "NEW")
-        # print("cut start", cut_start, "cut end", cut_end)
-        # centers[i][2] = center
+
+        if cut_start > 0 or cut_end < len(center):
+            center = center[cut_start: cut_end]
+
+            logging.debug(f"{center} NEW")
+            logging.debug(f"cut start {cut_start} cut end {cut_end}")
+            centers[i][2] = center
+            centers_updated = True
+
+    return centers_updated
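The rewritten `remove_barcodes()` now returns a flag indicating whether any consensus was trimmed, which the main script uses to trigger a second round of reverse-complement detection and polishing. Below is a self-contained sketch of the end-trimming idea using edlib's documented `align()` API; the `trim_ends` helper, toy sequences, and thresholds are illustrative only (the package version also handles IUPAC codes, reverse complements, and adaptive trim windows):

```
import edlib

def trim_ends(center, primers, trim_window=25, max_ed=2):
    """Cut a consensus at primer hits found within trim_window of each end (sketch)."""
    cut_start, cut_end = 0, len(center)
    head, tail = center[:trim_window], center[-trim_window:]
    for primer in primers:
        hit = edlib.align(primer, head, mode="HW", task="locations", k=max_ed)
        if hit["locations"]:
            # cut at the hit's end index, mirroring the diff's cut_start = stop convention
            cut_start = max(cut_start, hit["locations"][0][1])
        hit = edlib.align(primer, tail, mode="HW", task="locations", k=max_ed)
        if hit["locations"]:
            # map the hit position within the tail window back onto the full sequence
            earliest = hit["locations"][0][0]
            cut_end = min(cut_end, len(center) - (trim_window - earliest))
    updated = cut_start > 0 or cut_end < len(center)
    return (center[cut_start:cut_end] if updated else center), updated

consensus = "ACGTACGTAC" + "T" * 60 + "GTCAGTCAGT"
trimmed, updated = trim_ends(consensus, ["ACGTACGTAC", "GTCAGTCAGT"])
print(updated, len(consensus), "->", len(trimmed))  # e.g. True 80 -> 61
```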