NGSpeciesID 0.3.0.tar.gz → 0.3.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,13 +4,12 @@ from __future__ import print_function
  import os,sys
  import argparse
  import tempfile
- import errno
  from time import time
  import shutil
  import random
+ import logging
 
  from modules import get_sorted_fastq_for_cluster
- from modules import cluster
  from modules import p_minimizers_shared
  from modules import help_functions
  from modules import parallelize
@@ -30,7 +29,7 @@ def single_clustering(read_array, p_emp_probs, args):
  result_dict = cluster.reads_to_clusters(clusters, representatives, read_array, p_emp_probs, minimizer_database, new_batch_index, args)
  # Unpack result. The result dictionary structure is convenient for multiprocessing return but clumsy in single core mode.
  clusters, representatives, _, _ = list(result_dict.values())[0]
- print("Time elapesd clustering:", time() - start_cluster)
+ logging.debug(f"Time elapesd clustering: {time() - start_cluster}")
  return clusters, representatives
 
 
@@ -45,28 +44,30 @@ def main(args):
  """
  ##### Sort all reads according to expected errorfree kmers #####
  args.outfile = os.path.join(args.outfolder, "sorted.fastq")
- print("started sorting seqs")
+ logging.debug("started sorting seqs")
  start = time()
  sorted_reads_fastq_file = get_sorted_fastq_for_cluster.main(args)
- print("elapsed time sorting:", time() - start)
+ logging.debug(f"elapsed time sorting: {time() - start}")
  #################################################################
 
  ##### Filter and subsample #####
  if args.target_length > 0 and args.target_deviation > 0:
  read_array = [ (i, 0, acc, seq, qual, float(acc.split("_")[-1])) for i, (acc, (seq, qual)) in enumerate(help_functions.readfq(open(sorted_reads_fastq_file, 'r'))) if args.target_length - args.target_deviation <= len(seq) <= args.target_length + args.target_deviation]
- print("Number of reads with read length in interval [{0},{1}]:".format(args.target_length - args.target_deviation, args.target_length + args.target_deviation), len(read_array))
+ logging.debug("Number of reads with read length in interval [{0},{1}]: {2}".format(args.target_length - args.target_deviation, args.target_length + args.target_deviation, len(read_array)))
  else:
  read_array = [ (i, 0, acc, seq, qual, float(acc.split("_")[-1])) for i, (acc, (seq, qual)) in enumerate(help_functions.readfq(open(sorted_reads_fastq_file, 'r')))]
 
- if 0 < args.sample_size < len(read_array):
- read_array = [ read_array[i] for i in sorted(random.sample(range(len(read_array)), args.sample_size))]
+ if args.top_reads:
+ read_array = read_array[:args.sample_size]
+ elif 0 < args.sample_size < len(read_array):
+ read_array = [read_array[i] for i in sorted(random.sample(range(len(read_array)), args.sample_size))]
 
  abundance_cutoff = int( args.abundance_ratio * len(read_array))
  #################################################################
 
 
  ##### Import precalculated probabilities of minimizer matching given the error rates of reads, kmer length, and window length #####
- print("Started imported empirical error probabilities of minimizers shared:")
+ logging.debug("Started imported empirical error probabilities of minimizers shared:")
  start = time()
  p_min_shared = p_minimizers_shared.read_empirical_p()
  p_emp_probs = {}
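
The subsampling change in the hunk above is the user-visible part of the new `--top_reads` flag: because the read array is already sorted by expected number of error-free k-mers, taking its head keeps the highest-quality reads, whereas the old behaviour drew a random sample. A minimal sketch of the selection logic (illustrative only, using a hypothetical `subsample` helper rather than the package's inline code):

```python
import random

def subsample(read_array, sample_size, top_reads=False):
    """Sketch of the 0.3.1 subsampling behaviour (illustrative only).

    read_array is assumed to be sorted by expected number of error-free
    k-mers, as produced by get_sorted_fastq_for_cluster, so taking the
    head of the list keeps the highest-scoring reads.
    """
    if top_reads:
        # Keep the best-scoring reads according to the existing sort order.
        return read_array[:sample_size]
    if 0 < sample_size < len(read_array):
        # Random sample, preserving the original (sorted) order of the picks.
        return [read_array[i]
                for i in sorted(random.sample(range(len(read_array)), sample_size))]
    return read_array
```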
@@ -75,21 +76,22 @@ def main(args):
  p_emp_probs[(float(e1),float(e2))] = float(p)
  p_emp_probs[(float(e2),float(e1))] = float(p)
 
- print(p_emp_probs)
- print(len(p_emp_probs))
- print("elapsed time imported empirical error probabilities of minimizers shared:", time() - start)
+ logging.debug(f"{p_emp_probs}")
+ logging.debug(f"{len(p_emp_probs)}")
+ logging.debug(f"elapsed time imported empirical error probabilities of minimizers shared: {time() - start}")
  ##################################################################################################################################
 
+ logging.info(f"Starting Clustering: {len(read_array)} reads")
  ##### Cluster reads, bulk of code base is here #####
- print("started clustring")
+ logging.debug("started clustring")
  start = time()
  if args.nr_cores > 1:
  clusters, representatives = parallelize.parallel_clustering(read_array, p_emp_probs, args)
  else:
- print("Using 1 core.")
+ logging.debug("Using 1 core.")
  clusters, representatives = single_clustering(read_array, p_emp_probs, args)
  # clusters, representatives = cluster.cluster_seqs(read_array, p_emp_probs, args)
- print("Time elapsed clustering:", time() - start)
+ logging.debug(f"Time elapsed clustering: {time() - start}")
  ####################################################
 
 
@@ -101,7 +103,7 @@ def main(args):
  output_cl_id = 0
  for c_id, all_read_acc in sorted(clusters.items(), key = lambda x: (len(x[1]),representatives[x[0]][5]), reverse=True):
  # for c_id, all_read_acc in sorted(clusters.items(), key = lambda x: (len(x[1]),x[0]), reverse=True):
- read_cl_id, b_i, acc, c_seq, c_qual, score, error_rate = representatives[c_id]
+ read_cl_id, b_i, acc, c_seq, c_qual, score, error_rate, _ = representatives[c_id]
  origins_outfile.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(output_cl_id, "_".join([item for item in acc.split("_")[:-1]]), c_seq, c_qual, score, error_rate))
 
  for r_acc in sorted(all_read_acc, key = lambda x: float(x.split("_")[-1]) , reverse=True):
@@ -111,20 +113,19 @@ def main(args):
 
  output_cl_id +=1
 
- print("Nr clusters larger than 1:", nontrivial_cluster_index) #, "Non-clustered reads:", len(archived_reads))
- print("Nr clusters (all):", len(clusters)) #, "Non-clustered reads:", len(archived_reads))
+ logging.debug(f"Nr clusters larger than 1: {nontrivial_cluster_index}") #, "Non-clustered reads:", len(archived_reads))
+ logging.debug(f"Nr clusters (all): {len(clusters)}") #, "Non-clustered reads:", len(archived_reads))
  outfile.close()
  origins_outfile.close()
  ############################
 
+ logging.info(f"Finished Clustering: {nontrivial_cluster_index} clusters formed")
 
  if args.consensus:
- print()
- print("STARTING TO CREATE CLUSTER CONSENSUS")
- print()
+ logging.info(f"Starting Consensus creation and polishing")
  work_dir = tempfile.mkdtemp()
- print("Temporary workdirektory for consensus and polishing:", work_dir)
- print(
+ logging.debug(f"Temporary workdirectory for consensus and polishing: {work_dir}")
+ logging.debug(
  f"Forming draft consensus with abundance_cutoff >= {abundance_cutoff} "
  f"({args.abundance_ratio * 100}% of {len(read_array)} reads)"
  )
@@ -132,27 +133,29 @@ def main(args):
 
  if args.primer_file or args.remove_universal_tails:
  if args.remove_universal_tails:
- print("Detecting and removing universal tails")
+ logging.debug("Detecting and removing universal tails")
  barcodes = barcode_trimmer.get_universal_tails()
  else:
- print("Detecting and removing primers")
+ logging.debug("Detecting and removing primers")
  barcodes = barcode_trimmer.read_barcodes(args.primer_file)
 
  barcode_trimmer.remove_barcodes(centers, barcodes, args)
 
- print("{0} centers formed".format(len(centers)))
+ logging.debug("{0} centers formed".format(len(centers)))
  centers_filtered = consensus.detect_reverse_complements(centers, args.rc_identity_threshold)
  centers_polished = consensus.polish_sequences(centers_filtered, args)
 
  if args.primer_file or args.remove_universal_tails: # check if barcode is found after polishing with medaka
- barcode_trimmer.remove_barcodes(centers_polished, barcodes, args)
- centers_filtered = consensus.detect_reverse_complements(centers_polished, args.rc_identity_threshold)
- centers_polished = consensus.polish_sequences(centers_filtered, args)
+ centers_updated = barcode_trimmer.remove_barcodes(centers_polished, barcodes, args)
+ if centers_updated:
+ centers_filtered = consensus.detect_reverse_complements(centers_polished, args.rc_identity_threshold)
+ centers_polished = consensus.polish_sequences(centers_filtered, args)
 
 
- print("removing temporary workdir")
+ logging.debug("removing temporary workdir")
  shutil.rmtree(work_dir)
 
+ logging.info(f"Finished Consensus creation: {len(centers_filtered)} created")
 
 
  def write_fastq(args):
@@ -183,19 +186,17 @@ def write_fastq(args):
 
  if __name__ == '__main__':
  parser = argparse.ArgumentParser(description="Reference-free clustering and consensus forming of targeted ONT or PacBio reads", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--version', action='version', version='%(prog)s 0.3.0')
-
- parser.add_argument('--fastq', type=str, default=False, help='Path to consensus fastq file(s)')
- parser.add_argument('--flnc', type=str, default=False, help='The flnc reads generated by the isoseq3 algorithm (BAM file)')
- parser.add_argument('--ccs', type=str, default=False, help='Path to consensus BAM file(s)')
- # parser.add_argument('--mapping', action="store_true", help='Only infer clusters by mapping, no alignment is performed.')
+ parser.add_argument('--version', action='version', version='%(prog)s 0.3.1')
+ parser.add_argument('--debug', action='store_true', help='Enable debug logging')
+ reads_file = parser.add_mutually_exclusive_group(required=True)
+ reads_file.add_argument('--fastq', type=str, help='Path to consensus fastq file(s)')
+ reads_file.add_argument('--use_old_sorted_file', action='store_true', help='Using already existing sorted file if present in specified output directory.')
  parser.add_argument('--t', dest="nr_cores", type=int, default=8, help='Number of cores allocated for clustering')
  parser.add_argument('--d', dest="print_output", type=int, default=10000, help='For debugging, prints status of clustering and minimizer database every p reads processed.')
  parser.add_argument('--q', dest="quality_threshold", type=float, default=7.0, help='Filters reads with average phred quality value under this number (default = 7.0).')
 
  parser.add_argument('--ont', action="store_true", help='Clustering of ONT transcript reads.')
  parser.add_argument('--isoseq', action="store_true", help='Clustering of PacBio Iso-Seq reads.')
- parser.add_argument('--use_old_sorted_file', action="store_true", help='Using already existing sorted file if present in specified output directory.')
 
  parser.add_argument('--consensus', action="store_true", help='After clustering, (1) run spoa on all clusters, (2) detect reverse complements, (3) run medaka.')
  parser.add_argument('--abundance_ratio', type=float, default=0.1, help='Threshold for --consensus algorithm. Consider only clusters larger than a fraction of number of total reads (default 0.1)')
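
Note on the option group introduced above: `add_mutually_exclusive_group(required=True)` makes argparse itself enforce that exactly one of `--fastq` and `--use_old_sorted_file` is supplied; passing both, or neither, exits with a usage error. This is what allows the hand-rolled `--fastq`/`--flnc`/`--ccs` checks to be deleted further down in this diff.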
@@ -207,6 +208,7 @@ if __name__ == '__main__':
  group.add_argument('--racon', action="store_true", help='Run final racon polishing algorithm.')
 
  parser.add_argument('--medaka_model', type=str, default="", help='Set specific medaka model.')
+ parser.add_argument('--medaka_fastq', action="store_true", help='Request Medaka to output a FASTQ file, instead of FASTA')
  parser.add_argument('--racon_iter', type=int, default=2, help='Number of times to run racon iteratively')
 
  group2 = parser.add_mutually_exclusive_group()
@@ -217,8 +219,7 @@ if __name__ == '__main__':
  parser.add_argument('--m', dest="target_length", type=int, default=0, help='Intended amplicon length. Invoked to filter out reads with length greater than m + s or smaller than m - s (default = 0 which means no filtering)')
  parser.add_argument('--s', dest="target_deviation", type=int, default=0, help='Maximum allowed amplicon-length deviation. Invoked to filter out reads with length greater than m + s or smaller than m - s (default = 0 which means no filtering)')
  parser.add_argument('--sample_size', type=int, default=0, help='Use sample_size reads in the NGSpecies pipeline (default = 0 which means all reads considered). If sample size is larger than actual number of reads, all reads will be used.')
-
-
+ parser.add_argument('--top_reads', action='store_true', help='Use the top --sample_size reads instead of a random selection (default = false, which means random reads considered). ')
 
 
  parser.add_argument('--k', type=int, default=13, help='Kmer size')
@@ -226,6 +227,7 @@ if __name__ == '__main__':
  parser.add_argument('--min_shared', type=int, default=5, help='Minmum number of minimizers shared between read and cluster')
  parser.add_argument('--mapped_threshold', type=float, default=0.7, help='Minmum mapped fraction of read to be included in cluster. The density of minimizers to classify a region as mapped depends on quality of the read.')
  parser.add_argument('--aligned_threshold', type=float, default=0.4, help='Minmum aligned fraction of read to be included in cluster. Aligned identity depends on the quality of the read.')
+ parser.add_argument('--symmetric_map_align_thresholds', action='store_true', help='Apply mapped threshold and aligned threshold to fraction of cluster representative which maps onto the read')
  parser.add_argument('--batch_type', type=str, default='total_nt', help='In parrallel mode, how to split the reads into chunks "total_nt", "nr_reads", or "weighted" (default: total_nt) ')
  parser.add_argument('--min_fraction', type=float, default=0.8, help='Minmum fraction of minimizers shared compared to best hit, in order to continue mapping.')
  parser.add_argument('--min_prob_no_hits', type=float, default=0.1, help='Minimum probability for i consecutive minimizers to be different between read and representative and still considered as mapped region, under assumption that they come from the same transcript (depends on read quality).')
@@ -244,21 +246,20 @@ if __name__ == '__main__':
 
  args = parser.parse_args()
 
+ loglevel = logging.DEBUG if args.debug else logging.INFO
+
+ logging.basicConfig(
+ level=loglevel,
+ format='%(message)s'
+ )
+
  if args.which == 'write_fastq':
  write_fastq(args)
- print("Wrote clusters to separate fastq files.")
+ logging.info("Wrote clusters to separate fastq files.")
  sys.exit(0)
 
- if (args.fastq and (args.flnc or args.ccs)):
- print("Either (1) only a fastq file, or (2) a ccs and a flnc file should be specified. ")
- sys.exit()
-
- if (args.flnc != False and args.ccs == False ) or (args.flnc == False and args.ccs != False ):
- print("isONclust needs both the ccs.bam file produced by ccs and the flnc file produced by isoseq3 cluster. ")
- sys.exit()
-
  if args.ont and args.isoseq :
- print("Arguments mutually exclusive, specify either --isoseq or --ont. ")
+ logging.error("Arguments mutually exclusive, specify either --isoseq or --ont. ")
  sys.exit()
  elif args.isoseq:
  args.k = 15
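
For readers unfamiliar with the `logging` module, the configuration added in the hunk above keeps the console output looking like the old `print()` calls: `format='%(message)s'` prints the bare message with no level or timestamp prefix, and the level derived from `--debug` decides whether the `logging.debug(...)` timing and diagnostic lines are emitted at all. A minimal standalone sketch (not part of the package):

```python
import logging

# Same configuration as in the diff above, with the INFO default (no --debug).
logging.basicConfig(level=logging.INFO, format='%(message)s')

logging.info("Wrote clusters to separate fastq files.")  # printed, looks like the old print()
logging.debug("Time elapsed clustering: 12.3")           # suppressed at INFO level

# With --debug the script passes level=logging.DEBUG instead,
# and the debug line above would be printed as well.
```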
@@ -271,24 +272,16 @@ if __name__ == '__main__':
  if len(sys.argv)==1:
  parser.print_help()
  sys.exit()
- if not args.fastq and not args.flnc and not args.ccs:
- parser.print_help()
- sys.exit()
-
 
  if args.outfolder and not os.path.exists(args.outfolder):
  os.makedirs(args.outfolder)
 
-
- # edlib_module = 'edlib'
  parasail_module = 'parasail'
- # if edlib_module not in sys.modules:
- # print('You have not imported the {0} module. Only performing clustering with mapping, i.e., no alignment.'.format(edlib_module))
  if parasail_module not in sys.modules:
- print('You have not imported the {0} module. Only performing clustering with mapping, i.e., no alignment!'.format(parasail_module))
+ logging.error('You have not imported the {0} module. Only performing clustering with mapping, i.e., no alignment!'.format(parasail_module))
  sys.exit(1)
  if 100 < args.w or args.w < args.k:
- print('Please specify a window of size larger or equal to k, and smaller than 100.')
+ logging.error('Please specify a window of size larger or equal to k, and smaller than 100.')
  sys.exit(1)
 
  main(args)
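
Taken together, a 0.3.1 run that exercises the new options might look like the following. This is an illustrative invocation only: `reads.fastq` and `sample_out` are placeholder names, and `--outfolder` is assumed from the `args.outfolder` references in the code above rather than shown in this diff.

```
NGSpeciesID --ont --consensus --racon \
            --fastq reads.fastq --outfolder sample_out \
            --sample_size 500 --top_reads --debug
```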
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: NGSpeciesID
- Version: 0.3.0
+ Version: 0.3.1
  Summary: Reconstructs viral consensus sequences from a set of ONT reads.
  Home-page: https://github.com/ksahlin/NGSpeciesID
  Author: Kristoffer Sahlin
@@ -14,6 +14,18 @@ Classifier: Programming Language :: Python :: 3.6
  Classifier: Programming Language :: Python :: 3.7
  Requires-Python: !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4
  License-File: LICENSE.txt
+ Requires-Dist: parasail==1.2.4
+ Requires-Dist: edlib>=1.1.2
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: license-file
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
 
  NGSpeciesID
  ===========
@@ -25,25 +37,44 @@ NGSpeciesID is distributed as a python package supported on Linux / OSX with pyt
  Table of Contents
  =================
 
- * [INSTALLATION](#INSTALLATION)
- * [Using conda](#Using-conda)
+ * [INSTALLATION](#installation)
+ * [Using conda](#using-conda)
  * [Testing installation](#testing-installation)
- * [USAGE](#USAGE)
+ * [USAGE](#usage)
  * [Filtering and subsampling](#filtering-and-subsampling)
  * [Removing primers](#removing-primers)
- * [Output](#Output)
- * [EXAMPLE WORKFLOW](#EXAMPLE-WORKFLOW)
- * [CREDITS](#CREDITS)
- * [LICENCE](#LICENCE)
+ * [Output](#output)
+ * [EXAMPLE WORKFLOW](#example-workflow)
+ * [CREDITS](#credits)
+ * [LICENCE](#licence)
 
 
 
  INSTALLATION
  ----------------
 
- **NOTE**: If you are experiencing issues (e.g. [this one](https://github.com/rvaser/spoa/issues/26)) with the third party tools [spoa](https://github.com/rvaser/spoa) or [medaka](https://github.com/nanoporetech/medaka) in the all-in-one installation instructions below, please install the tools manually with their respective installation instructions [here](https://github.com/rvaser/spoa#installation) and [here](https://github.com/nanoporetech/medaka#installation).
+ <!---
+ **NOTE**: If you are experiencing issues (e.g. [this one](https://github.com/rvaser/spoa/issues/26)) with the third party tools [spoa](https://github.com/rvaser/spoa) or [medaka](https://github.com/nanoporetech/medaka) in the installation instructions below, please install the tools manually with their respective installation instructions [here](https://github.com/rvaser/spoa#installation) and [here](https://github.com/nanoporetech/medaka#installation).
+ -->
 
  ### Using conda
+
+ **Recent update (2025-04-19)**
+
+ There have been many version updates of medaka and spoa since NGSpeciesID was first published. Below are instructions to install
+ NGSpeciesID with newer versions of spoa ([v4.1.4](https://bioconda.github.io/recipes/spoa/README.html)) and medaka (v2.0.1).
+
+ ```
+ conda create -n NGSpeciesID python=3.11 pip
+ conda activate NGSpeciesID
+ conda install --yes -c conda-forge -c bioconda medaka==2.0.1 openblas==0.3.3 spoa racon minimap2 samtools
+ pip install NGSpeciesID
+ ```
+
+ Make sure you [test the installation](#testing-installation).
+
+ **Published installation instructions (2021-01-11)**
+
  Conda is the preferred way to install NGSpeciesID.
 
  1. Create and activate a new environment called NGSpeciesID
@@ -8,25 +8,44 @@ NGSpeciesID is distributed as a python package supported on Linux / OSX with pyt
  Table of Contents
  =================
 
- * [INSTALLATION](#INSTALLATION)
- * [Using conda](#Using-conda)
+ * [INSTALLATION](#installation)
+ * [Using conda](#using-conda)
  * [Testing installation](#testing-installation)
- * [USAGE](#USAGE)
+ * [USAGE](#usage)
  * [Filtering and subsampling](#filtering-and-subsampling)
  * [Removing primers](#removing-primers)
- * [Output](#Output)
- * [EXAMPLE WORKFLOW](#EXAMPLE-WORKFLOW)
- * [CREDITS](#CREDITS)
- * [LICENCE](#LICENCE)
+ * [Output](#output)
+ * [EXAMPLE WORKFLOW](#example-workflow)
+ * [CREDITS](#credits)
+ * [LICENCE](#licence)
 
 
 
  INSTALLATION
  ----------------
 
- **NOTE**: If you are experiencing issues (e.g. [this one](https://github.com/rvaser/spoa/issues/26)) with the third party tools [spoa](https://github.com/rvaser/spoa) or [medaka](https://github.com/nanoporetech/medaka) in the all-in-one installation instructions below, please install the tools manually with their respective installation instructions [here](https://github.com/rvaser/spoa#installation) and [here](https://github.com/nanoporetech/medaka#installation).
+ <!---
+ **NOTE**: If you are experiencing issues (e.g. [this one](https://github.com/rvaser/spoa/issues/26)) with the third party tools [spoa](https://github.com/rvaser/spoa) or [medaka](https://github.com/nanoporetech/medaka) in the installation instructions below, please install the tools manually with their respective installation instructions [here](https://github.com/rvaser/spoa#installation) and [here](https://github.com/nanoporetech/medaka#installation).
+ -->
 
  ### Using conda
+
+ **Recent update (2025-04-19)**
+
+ There have been many version updates of medaka and spoa since NGSpeciesID was first published. Below are instructions to install
+ NGSpeciesID with newer versions of spoa ([v4.1.4](https://bioconda.github.io/recipes/spoa/README.html)) and medaka (v2.0.1).
+
+ ```
+ conda create -n NGSpeciesID python=3.11 pip
+ conda activate NGSpeciesID
+ conda install --yes -c conda-forge -c bioconda medaka==2.0.1 openblas==0.3.3 spoa racon minimap2 samtools
+ pip install NGSpeciesID
+ ```
+
+ Make sure you [test the installation](#testing-installation).
+
+ **Published installation instructions (2021-01-11)**
+
  Conda is the preferred way to install NGSpeciesID.
 
  1. Create and activate a new environment called NGSpeciesID
@@ -1,5 +1,6 @@
 
  import edlib
+ import logging
 
  from modules import help_functions
 
@@ -15,10 +16,10 @@ def read_barcodes(primer_file):
  barcodes = { acc + '_fw' : seq.strip() for acc, (seq, _) in help_functions.readfq(open(primer_file, 'r'))}
 
  for acc, seq in list(barcodes.items()):
- print(acc, seq,acc[:-3])
+ logging.debug(f"{acc} {seq} {acc[:-3]}")
  barcodes[acc[:-3] + '_rc'] = reverse_complement(seq.upper())
 
- print(barcodes)
+ logging.debug(f"{barcodes}")
  return barcodes
 
  def get_universal_tails():
@@ -26,7 +27,7 @@ def get_universal_tails():
  '2_R_rc' : 'ACTTGCCTGTCGCTCTATCTTC'}
  barcodes['1_F_rc'] = reverse_complement(barcodes['1_F_fw'])
  barcodes['2_R_fw'] = reverse_complement(barcodes['2_R_rc'])
- print(barcodes)
+ logging.debug(f"{barcodes}")
  return barcodes
 
 
@@ -45,14 +46,13 @@ def find_barcode_locations(center, barcodes, primer_max_ed):
  ('X', 'T'), ('X', 'C'), ('N', 'G'), ('N', 'A'), ('N', 'T'), ('N', 'C')]
  all_locations = []
  for primer_acc, primer_seq in barcodes.items():
- # print(primer_acc, primer_seq,center)
  # Add additionalEqualities=IUPAC_map allow edlib to understand IUPAC code
  result = edlib.align(primer_seq, center,
  mode="HW", task="locations", k=primer_max_ed,
  additionalEqualities=IUPAC_map)
  ed = result["editDistance"]
  locations = result["locations"]
- print(locations, ed)
+ logging.debug(f"{locations} {ed}")
  if locations:
  all_locations.append((primer_acc, locations[0][0], locations[0][1], ed))
  return all_locations
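
For context on the `additionalEqualities` call above: edlib treats each listed character pair as a match, which is how degenerate IUPAC bases in a primer can align against plain A/C/G/T in the consensus. A small standalone sketch, assuming a made-up primer and consensus string (not the package's own data):

```python
import edlib

# Made-up primer containing an N wildcard, and a made-up consensus fragment.
primer = "ACGTNCCA"
consensus = "TTTACGTGCCATTT"

# Pairs that should count as matches; a subset of the IUPAC_map used above.
iupac_pairs = [("N", "A"), ("N", "C"), ("N", "G"), ("N", "T")]

result = edlib.align(primer, consensus,
                     mode="HW",             # infix alignment: primer anywhere in consensus
                     task="locations",      # report start/end positions of the best hits
                     k=2,                   # allow at most 2 edits, like primer_max_ed
                     additionalEqualities=iupac_pairs)

print(result["editDistance"])  # expected 0, since N is allowed to match the G
print(result["locations"])     # expected [(3, 10)]: 0-based, end position inclusive
```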
@@ -63,7 +63,8 @@ def remove_barcodes(centers, barcodes, args):
  Modifies consensus sequences by copping of at barcode sites.
  This implies changing the datastructure centers with the modified consensus sequeces
  """
-
+
+ centers_updated = False
  for i, (nr_reads_in_cluster, c_id, center, reads_path_name) in enumerate(centers):
 
  # if consensus is smaller than 2*trim_window we set trim window to half the sequence
@@ -74,53 +75,30 @@ def remove_barcodes(centers, barcodes, args):
 
  barcode_locations_beginning = find_barcode_locations(center[:trim_window], barcodes, args.primer_max_ed)
  barcode_locations_end = find_barcode_locations(center[-trim_window:], barcodes, args.primer_max_ed)
- print(center)
+ logging.debug(f"{center}")
 
  cut_start = 0
  if barcode_locations_beginning:
- print("FOUND BARCODE BEGINNING", barcode_locations_beginning)
+ logging.debug(f"FOUND BARCODE BEGINNING {barcode_locations_beginning}")
  for bc, start, stop, ed in barcode_locations_beginning:
  if stop > cut_start:
  cut_start = stop
 
  cut_end = len(center)
  if barcode_locations_end:
- print("FOUND BARCODE END", barcode_locations_end)
+ logging.debug(f"FOUND BARCODE END {barcode_locations_end}")
  earliest_hit = len(center)
  for bc, start, stop, ed in barcode_locations_end:
  if start < earliest_hit:
  earliest_hit = start
  cut_end = len(center) - (trim_window - earliest_hit)
- center = center[cut_start: cut_end]
-
- print(center, "NEW")
- print("cut start", cut_start, "cut end", cut_end)
- centers[i][2] = center
-
- ## Old code scanned all consensus and were prone to errors due to cutting directionality (befause of fwd or rev comp hits of the adapter)
- # for i, (nr_reads_in_cluster, c_id, center, reads_path_name) in enumerate(centers):
- # barcode_locations = find_barcode_locations(center, barcodes, args.primer_max_ed)
- # if barcode_locations:
- # print("FOUND BARCODE", barcode_locations)
- # cut_start = 0
- # cut_end = len(center)
- # print(center)
- # for bc, start, stop, ed in barcode_locations:
- # # print(ed,bc, bc[-4], bc[-2:])
- # if bc[-4] == 'F' and bc[-2:] == 'fw':
- # cut_start = stop
- # elif bc[-4] == 'R' and bc[-2:] == 'fw':
- # cut_end = start
- # elif bc[-4] == 'R' and bc[-2:] == 'rc':
- # cut_start = stop
- # elif bc[-4] == 'F' and bc[-2:] == 'rc':
- # cut_end = start
- # else:
- # print()
- # print("Primer file not in correct format!")
- # print()
- # # print(center)
- # center = center[cut_start: cut_end]
- # print(center, "NEW")
- # print("cut start", cut_start, "cut end", cut_end)
- # centers[i][2] = center
+
+ if cut_start > 0 or cut_end < len(center):
+ center = center[cut_start: cut_end]
+
+ logging.debug(f"{center} NEW")
+ logging.debug(f"cut start {cut_start} cut end {cut_end}")
+ centers[i][2] = center
+ centers_updated = True
+
+ return centers_updated
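
The new boolean return value closes the loop with the caller shown earlier in this diff: after medaka/racon polishing, `remove_barcodes` is run a second time, and the extra round of `detect_reverse_complements` and `polish_sequences` is triggered only when at least one consensus was actually trimmed, i.e. only when `remove_barcodes` returns `True`. A condensed sketch of that call site, using the names from the earlier hunk:

```python
# Sketch of the caller's post-polishing check (condensed from the diff above).
if args.primer_file or args.remove_universal_tails:
    centers_updated = barcode_trimmer.remove_barcodes(centers_polished, barcodes, args)
    if centers_updated:
        # Only re-filter and re-polish when a primer/tail was found and cut.
        centers_filtered = consensus.detect_reverse_complements(centers_polished, args.rc_identity_threshold)
        centers_polished = consensus.polish_sequences(centers_filtered, args)
```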