NGSpeciesID 0.3.0.tar.gz → 0.3.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/NGSpeciesID +52 -59
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1/NGSpeciesID.egg-info}/PKG-INFO +41 -10
- {NGSpeciesID-0.3.0/NGSpeciesID.egg-info → ngspeciesid-0.3.1}/PKG-INFO +41 -10
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/README.md +27 -8
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/barcode_trimmer.py +20 -42
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/cluster.py +33 -57
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/consensus.py +38 -47
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/get_sorted_fastq_for_cluster.py +25 -31
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/help_functions.py +5 -3
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/parallelize.py +26 -23
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/setup.py +1 -1
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/LICENSE.txt +0 -0
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/MANIFEST.in +0 -0
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/NGSpeciesID.egg-info/SOURCES.txt +0 -0
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/NGSpeciesID.egg-info/dependency_links.txt +0 -0
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/NGSpeciesID.egg-info/requires.txt +0 -0
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/NGSpeciesID.egg-info/top_level.txt +0 -0
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/__init__.py +0 -0
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/p_minimizers_shared.py +0 -0
- {NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/setup.cfg +0 -0
{NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/NGSpeciesID

@@ -4,13 +4,12 @@ from __future__ import print_function
 import os,sys
 import argparse
 import tempfile
-import errno
 from time import time
 import shutil
 import random
+import logging
 
 from modules import get_sorted_fastq_for_cluster
-from modules import cluster
 from modules import p_minimizers_shared
 from modules import help_functions
 from modules import parallelize
@@ -30,7 +29,7 @@ def single_clustering(read_array, p_emp_probs, args):
     result_dict = cluster.reads_to_clusters(clusters, representatives, read_array, p_emp_probs, minimizer_database, new_batch_index, args)
     # Unpack result. The result dictionary structure is convenient for multiprocessing return but clumsy in single core mode.
     clusters, representatives, _, _ = list(result_dict.values())[0]
-
+    logging.debug(f"Time elapesd clustering: {time() - start_cluster}")
     return clusters, representatives
 
 
@@ -45,28 +44,30 @@ def main(args):
     """
     ##### Sort all reads according to expected errorfree kmers #####
    args.outfile = os.path.join(args.outfolder, "sorted.fastq")
-
+    logging.debug("started sorting seqs")
     start = time()
     sorted_reads_fastq_file = get_sorted_fastq_for_cluster.main(args)
-
+    logging.debug(f"elapsed time sorting: {time() - start}")
     #################################################################
 
     ##### Filter and subsample #####
     if args.target_length > 0 and args.target_deviation > 0:
         read_array = [ (i, 0, acc, seq, qual, float(acc.split("_")[-1])) for i, (acc, (seq, qual)) in enumerate(help_functions.readfq(open(sorted_reads_fastq_file, 'r'))) if args.target_length - args.target_deviation <= len(seq) <= args.target_length + args.target_deviation]
-
+        logging.debug("Number of reads with read length in interval [{0},{1}]: {2}".format(args.target_length - args.target_deviation, args.target_length + args.target_deviation, len(read_array)))
     else:
         read_array = [ (i, 0, acc, seq, qual, float(acc.split("_")[-1])) for i, (acc, (seq, qual)) in enumerate(help_functions.readfq(open(sorted_reads_fastq_file, 'r')))]
 
-    if
-    read_array =
+    if args.top_reads:
+        read_array = read_array[:args.sample_size]
+    elif 0 < args.sample_size < len(read_array):
+        read_array = [read_array[i] for i in sorted(random.sample(range(len(read_array)), args.sample_size))]
 
     abundance_cutoff = int( args.abundance_ratio * len(read_array))
     #################################################################
 
 
     ##### Import precalculated probabilities of minimizer matching given the error rates of reads, kmer length, and window length #####
-
+    logging.debug("Started imported empirical error probabilities of minimizers shared:")
     start = time()
     p_min_shared = p_minimizers_shared.read_empirical_p()
     p_emp_probs = {}
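The subsampling hunk above replaces a truncated single branch with an explicit choice: take the first `--sample_size` reads (`--top_reads`), or draw an order-preserving random sample. A standalone sketch of that selection logic follows; `subsample` and the toy read list are illustrative helpers, not part of the package (the real reads arrive pre-sorted by expected error-free kmers, so the head of the array is the best-scoring slice):

```
import random

def subsample(read_array, sample_size, top_reads):
    # Mirrors the 0.3.1 branch: head of the (quality-sorted) array,
    # or a random sample whose original order is preserved by sorted().
    if top_reads:
        return read_array[:sample_size]
    elif 0 < sample_size < len(read_array):
        return [read_array[i] for i in sorted(random.sample(range(len(read_array)), sample_size))]
    return read_array

reads = [f"read_{i}" for i in range(10)]
print(subsample(reads, 3, top_reads=True))   # ['read_0', 'read_1', 'read_2']
print(subsample(reads, 3, top_reads=False))  # e.g. ['read_2', 'read_5', 'read_8']
```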
@@ -75,21 +76,22 @@ def main(args):
         p_emp_probs[(float(e1),float(e2))] = float(p)
         p_emp_probs[(float(e2),float(e1))] = float(p)
 
-
-
-
+    logging.debug(f"{p_emp_probs}")
+    logging.debug(f"{len(p_emp_probs)}")
+    logging.debug(f"elapsed time imported empirical error probabilities of minimizers shared: {time() - start}")
     ##################################################################################################################################
 
+    logging.info(f"Starting Clustering: {len(read_array)} reads")
     ##### Cluster reads, bulk of code base is here #####
-
+    logging.debug("started clustring")
     start = time()
     if args.nr_cores > 1:
         clusters, representatives = parallelize.parallel_clustering(read_array, p_emp_probs, args)
     else:
-
+        logging.debug("Using 1 core.")
         clusters, representatives = single_clustering(read_array, p_emp_probs, args)
         # clusters, representatives = cluster.cluster_seqs(read_array, p_emp_probs, args)
-
+    logging.debug(f"Time elapsed clustering: {time() - start}")
     ####################################################
 
 
@@ -101,7 +103,7 @@ def main(args):
     output_cl_id = 0
     for c_id, all_read_acc in sorted(clusters.items(), key = lambda x: (len(x[1]),representatives[x[0]][5]), reverse=True):
     # for c_id, all_read_acc in sorted(clusters.items(), key = lambda x: (len(x[1]),x[0]), reverse=True):
-        read_cl_id, b_i, acc, c_seq, c_qual, score, error_rate = representatives[c_id]
+        read_cl_id, b_i, acc, c_seq, c_qual, score, error_rate, _ = representatives[c_id]
         origins_outfile.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(output_cl_id, "_".join([item for item in acc.split("_")[:-1]]), c_seq, c_qual, score, error_rate))
 
         for r_acc in sorted(all_read_acc, key = lambda x: float(x.split("_")[-1]) , reverse=True):
@@ -111,20 +113,19 @@ def main(args):
 
         output_cl_id +=1
 
-
-
+    logging.debug(f"Nr clusters larger than 1: {nontrivial_cluster_index}") #, "Non-clustered reads:", len(archived_reads))
+    logging.debug(f"Nr clusters (all): {len(clusters)}") #, "Non-clustered reads:", len(archived_reads))
     outfile.close()
     origins_outfile.close()
     ############################
 
+    logging.info(f"Finished Clustering: {nontrivial_cluster_index} clusters formed")
 
     if args.consensus:
-
-        print("STARTING TO CREATE CLUSTER CONSENSUS")
-        print()
+        logging.info(f"Starting Consensus creation and polishing")
         work_dir = tempfile.mkdtemp()
-
-
+        logging.debug(f"Temporary workdirectory for consensus and polishing: {work_dir}")
+        logging.debug(
             f"Forming draft consensus with abundance_cutoff >= {abundance_cutoff} "
             f"({args.abundance_ratio * 100}% of {len(read_array)} reads)"
         )
@@ -132,27 +133,29 @@ def main(args):
 
         if args.primer_file or args.remove_universal_tails:
             if args.remove_universal_tails:
-
+                logging.debug("Detecting and removing universal tails")
                 barcodes = barcode_trimmer.get_universal_tails()
             else:
-
+                logging.debug("Detecting and removing primers")
                 barcodes = barcode_trimmer.read_barcodes(args.primer_file)
 
             barcode_trimmer.remove_barcodes(centers, barcodes, args)
 
-
+        logging.debug("{0} centers formed".format(len(centers)))
         centers_filtered = consensus.detect_reverse_complements(centers, args.rc_identity_threshold)
         centers_polished = consensus.polish_sequences(centers_filtered, args)
 
         if args.primer_file or args.remove_universal_tails: # check if barcode is found after polishing with medaka
-            barcode_trimmer.remove_barcodes(centers_polished, barcodes, args)
-
-
+            centers_updated = barcode_trimmer.remove_barcodes(centers_polished, barcodes, args)
+            if centers_updated:
+                centers_filtered = consensus.detect_reverse_complements(centers_polished, args.rc_identity_threshold)
+                centers_polished = consensus.polish_sequences(centers_filtered, args)
 
 
-
+        logging.debug("removing temporary workdir")
         shutil.rmtree(work_dir)
 
+        logging.info(f"Finished Consensus creation: {len(centers_filtered)} created")
 
 
 def write_fastq(args):
@@ -183,19 +186,17 @@ def write_fastq(args):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Reference-free clustering and consensus forming of targeted ONT or PacBio reads", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--version', action='version', version='%(prog)s 0.3.0')
-
-    parser.
-
-
-    # parser.add_argument('--mapping', action="store_true", help='Only infer clusters by mapping, no alignment is performed.')
+    parser.add_argument('--version', action='version', version='%(prog)s 0.3.1')
+    parser.add_argument('--debug', action='store_true', help='Enable debug logging')
+    reads_file = parser.add_mutually_exclusive_group(required=True)
+    reads_file.add_argument('--fastq', type=str, help='Path to consensus fastq file(s)')
+    reads_file.add_argument('--use_old_sorted_file', action='store_true', help='Using already existing sorted file if present in specified output directory.')
     parser.add_argument('--t', dest="nr_cores", type=int, default=8, help='Number of cores allocated for clustering')
     parser.add_argument('--d', dest="print_output", type=int, default=10000, help='For debugging, prints status of clustering and minimizer database every p reads processed.')
     parser.add_argument('--q', dest="quality_threshold", type=float, default=7.0, help='Filters reads with average phred quality value under this number (default = 7.0).')
 
     parser.add_argument('--ont', action="store_true", help='Clustering of ONT transcript reads.')
     parser.add_argument('--isoseq', action="store_true", help='Clustering of PacBio Iso-Seq reads.')
-    parser.add_argument('--use_old_sorted_file', action="store_true", help='Using already existing sorted file if present in specified output directory.')
 
     parser.add_argument('--consensus', action="store_true", help='After clustering, (1) run spoa on all clusters, (2) detect reverse complements, (3) run medaka.')
     parser.add_argument('--abundance_ratio', type=float, default=0.1, help='Threshold for --consensus algorithm. Consider only clusters larger than a fraction of number of total reads (default 0.1)')
@@ -207,6 +208,7 @@ if __name__ == '__main__':
     group.add_argument('--racon', action="store_true", help='Run final racon polishing algorithm.')
 
     parser.add_argument('--medaka_model', type=str, default="", help='Set specific medaka model.')
+    parser.add_argument('--medaka_fastq', action="store_true", help='Request Medaka to output a FASTQ file, instead of FASTA')
     parser.add_argument('--racon_iter', type=int, default=2, help='Number of times to run racon iteratively')
 
     group2 = parser.add_mutually_exclusive_group()
@@ -217,8 +219,7 @@ if __name__ == '__main__':
     parser.add_argument('--m', dest="target_length", type=int, default=0, help='Intended amplicon length. Invoked to filter out reads with length greater than m + s or smaller than m - s (default = 0 which means no filtering)')
     parser.add_argument('--s', dest="target_deviation", type=int, default=0, help='Maximum allowed amplicon-length deviation. Invoked to filter out reads with length greater than m + s or smaller than m - s (default = 0 which means no filtering)')
     parser.add_argument('--sample_size', type=int, default=0, help='Use sample_size reads in the NGSpecies pipeline (default = 0 which means all reads considered). If sample size is larger than actual number of reads, all reads will be used.')
-
-
+    parser.add_argument('--top_reads', action='store_true', help='Use the top --sample_size reads instead of a random selection (default = false, which means random reads considered). ')
 
 
     parser.add_argument('--k', type=int, default=13, help='Kmer size')
@@ -226,6 +227,7 @@ if __name__ == '__main__':
     parser.add_argument('--min_shared', type=int, default=5, help='Minmum number of minimizers shared between read and cluster')
     parser.add_argument('--mapped_threshold', type=float, default=0.7, help='Minmum mapped fraction of read to be included in cluster. The density of minimizers to classify a region as mapped depends on quality of the read.')
     parser.add_argument('--aligned_threshold', type=float, default=0.4, help='Minmum aligned fraction of read to be included in cluster. Aligned identity depends on the quality of the read.')
+    parser.add_argument('--symmetric_map_align_thresholds', action='store_true', help='Apply mapped threshold and aligned threshold to fraction of cluster representative which maps onto the read')
     parser.add_argument('--batch_type', type=str, default='total_nt', help='In parrallel mode, how to split the reads into chunks "total_nt", "nr_reads", or "weighted" (default: total_nt) ')
     parser.add_argument('--min_fraction', type=float, default=0.8, help='Minmum fraction of minimizers shared compared to best hit, in order to continue mapping.')
     parser.add_argument('--min_prob_no_hits', type=float, default=0.1, help='Minimum probability for i consecutive minimizers to be different between read and representative and still considered as mapped region, under assumption that they come from the same transcript (depends on read quality).')
@@ -244,21 +246,20 @@ if __name__ == '__main__':
 
     args = parser.parse_args()
 
+    loglevel = logging.DEBUG if args.debug else logging.INFO
+
+    logging.basicConfig(
+        level=loglevel,
+        format='%(message)s'
+    )
+
     if args.which == 'write_fastq':
         write_fastq(args)
-
+        logging.info("Wrote clusters to separate fastq files.")
         sys.exit(0)
 
-    if (args.fastq and (args.flnc or args.ccs)):
-        print("Either (1) only a fastq file, or (2) a ccs and a flnc file should be specified. ")
-        sys.exit()
-
-    if (args.flnc != False and args.ccs == False ) or (args.flnc == False and args.ccs != False ):
-        print("isONclust needs both the ccs.bam file produced by ccs and the flnc file produced by isoseq3 cluster. ")
-        sys.exit()
-
     if args.ont and args.isoseq :
-
+        logging.error("Arguments mutually exclusive, specify either --isoseq or --ont. ")
         sys.exit()
     elif args.isoseq:
         args.k = 15
@@ -271,24 +272,16 @@ if __name__ == '__main__':
     if len(sys.argv)==1:
         parser.print_help()
         sys.exit()
-    if not args.fastq and not args.flnc and not args.ccs:
-        parser.print_help()
-        sys.exit()
-
 
     if args.outfolder and not os.path.exists(args.outfolder):
         os.makedirs(args.outfolder)
 
-
-    # edlib_module = 'edlib'
     parasail_module = 'parasail'
-    # if edlib_module not in sys.modules:
-    #     print('You have not imported the {0} module. Only performing clustering with mapping, i.e., no alignment.'.format(edlib_module))
     if parasail_module not in sys.modules:
-
+        logging.error('You have not imported the {0} module. Only performing clustering with mapping, i.e., no alignment!'.format(parasail_module))
         sys.exit(1)
     if 100 < args.w or args.w < args.k:
-
+        logging.error('Please specify a window of size larger or equal to k, and smaller than 100.')
        sys.exit(1)
 
     main(args)
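Across this file, the release replaces ad-hoc `print()` calls with the `logging` module, with verbosity controlled by the new `--debug` flag and configured once via `logging.basicConfig`. A minimal, runnable sketch of that same pattern (the script and the messages are illustrative, not from the package):

```
import argparse
import logging

parser = argparse.ArgumentParser()
parser.add_argument('--debug', action='store_true', help='Enable debug logging')
args = parser.parse_args()

# Same pattern as the diff: --debug lowers the root level; format stays bare.
logging.basicConfig(
    level=logging.DEBUG if args.debug else logging.INFO,
    format='%(message)s'
)

logging.info("always printed")            # shown at the default INFO level
logging.debug("printed only with --debug")
```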
{NGSpeciesID-0.3.0 → ngspeciesid-0.3.1/NGSpeciesID.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: NGSpeciesID
-Version: 0.3.0
+Version: 0.3.1
 Summary: Reconstructs viral consensus sequences from a set of ONT reads.
 Home-page: https://github.com/ksahlin/NGSpeciesID
 Author: Kristoffer Sahlin
@@ -14,6 +14,18 @@ Classifier: Programming Language :: Python :: 3.6
 Classifier: Programming Language :: Python :: 3.7
 Requires-Python: !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4
 License-File: LICENSE.txt
+Requires-Dist: parasail==1.2.4
+Requires-Dist: edlib>=1.1.2
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 NGSpeciesID
 ===========
@@ -25,25 +37,44 @@ NGSpeciesID is distributed as a python package supported on Linux / OSX with pyt
 Table of Contents
 =================
 
-* [INSTALLATION](#
-  * [Using conda](#
+* [INSTALLATION](#installation)
+  * [Using conda](#using-conda)
   * [Testing installation](#testing-installation)
-* [USAGE](#
+* [USAGE](#usage)
   * [Filtering and subsampling](#filtering-and-subsampling)
   * [Removing primers](#removing-primers)
-  * [Output](#
-* [EXAMPLE WORKFLOW](#
-* [CREDITS](#
-* [LICENCE](#
+  * [Output](#output)
+* [EXAMPLE WORKFLOW](#example-workflow)
+* [CREDITS](#credits)
+* [LICENCE](#licence)
 
 
 
 INSTALLATION
 ----------------
 
-
+<!---
+**NOTE**: If you are experiencing issues (e.g. [this one](https://github.com/rvaser/spoa/issues/26)) with the third party tools [spoa](https://github.com/rvaser/spoa) or [medaka](https://github.com/nanoporetech/medaka) in the installation instructions below, please install the tools manually with their respective installation instructions [here](https://github.com/rvaser/spoa#installation) and [here](https://github.com/nanoporetech/medaka#installation).
+-->
 
 ### Using conda
+
+**Recent update (2025-04-19)**
+
+There have been many version updates of medaka and spoa since NGSpeciesID was first published. Below are instructions to install
+NGSpeciesID with newer versions of spoa ([v4.1.4](https://bioconda.github.io/recipes/spoa/README.html)) and medaka (v2.0.1).
+
+```
+conda create -n NGSpeciesID python=3.11 pip
+conda activate NGSpeciesID
+conda install --yes -c conda-forge -c bioconda medaka==2.0.1 openblas==0.3.3 spoa racon minimap2 samtools
+pip install NGSpeciesID
+```
+
+Make sure you [test the installation](#testing-installation).
+
+**Published installation instructions (2021-01-11)**
+
 Conda is the preferred way to install NGSpeciesID.
 
 1. Create and activate a new environment called NGSpeciesID
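The new `Requires-Dist` lines record the dependency pins that setuptools now writes into the sdist metadata. A sketch of how such pins are typically declared in a `setup.py` (the package's actual `setup.py` is not shown here; the file list above records only a one-line change to it, presumably the version bump):

```
from setuptools import setup

# Illustrative only: fields mirror the PKG-INFO metadata in this diff.
setup(
    name="NGSpeciesID",
    version="0.3.1",
    install_requires=[
        "parasail==1.2.4",  # becomes "Requires-Dist: parasail==1.2.4"
        "edlib>=1.1.2",     # becomes "Requires-Dist: edlib>=1.1.2"
    ],
)
```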
{NGSpeciesID-0.3.0/NGSpeciesID.egg-info → ngspeciesid-0.3.1}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: NGSpeciesID
-Version: 0.3.0
+Version: 0.3.1
 Summary: Reconstructs viral consensus sequences from a set of ONT reads.
 Home-page: https://github.com/ksahlin/NGSpeciesID
 Author: Kristoffer Sahlin
@@ -14,6 +14,18 @@ Classifier: Programming Language :: Python :: 3.6
 Classifier: Programming Language :: Python :: 3.7
 Requires-Python: !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4
 License-File: LICENSE.txt
+Requires-Dist: parasail==1.2.4
+Requires-Dist: edlib>=1.1.2
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 NGSpeciesID
 ===========
@@ -25,25 +37,44 @@ NGSpeciesID is distributed as a python package supported on Linux / OSX with pyt
 Table of Contents
 =================
 
-* [INSTALLATION](#
-  * [Using conda](#
+* [INSTALLATION](#installation)
+  * [Using conda](#using-conda)
   * [Testing installation](#testing-installation)
-* [USAGE](#
+* [USAGE](#usage)
   * [Filtering and subsampling](#filtering-and-subsampling)
   * [Removing primers](#removing-primers)
-  * [Output](#
-* [EXAMPLE WORKFLOW](#
-* [CREDITS](#
-* [LICENCE](#
+  * [Output](#output)
+* [EXAMPLE WORKFLOW](#example-workflow)
+* [CREDITS](#credits)
+* [LICENCE](#licence)
 
 
 
 INSTALLATION
 ----------------
 
-
+<!---
+**NOTE**: If you are experiencing issues (e.g. [this one](https://github.com/rvaser/spoa/issues/26)) with the third party tools [spoa](https://github.com/rvaser/spoa) or [medaka](https://github.com/nanoporetech/medaka) in the installation instructions below, please install the tools manually with their respective installation instructions [here](https://github.com/rvaser/spoa#installation) and [here](https://github.com/nanoporetech/medaka#installation).
+-->
 
 ### Using conda
+
+**Recent update (2025-04-19)**
+
+There have been many version updates of medaka and spoa since NGSpeciesID was first published. Below are instructions to install
+NGSpeciesID with newer versions of spoa ([v4.1.4](https://bioconda.github.io/recipes/spoa/README.html)) and medaka (v2.0.1).
+
+```
+conda create -n NGSpeciesID python=3.11 pip
+conda activate NGSpeciesID
+conda install --yes -c conda-forge -c bioconda medaka==2.0.1 openblas==0.3.3 spoa racon minimap2 samtools
+pip install NGSpeciesID
+```
+
+Make sure you [test the installation](#testing-installation).
+
+**Published installation instructions (2021-01-11)**
+
 Conda is the preferred way to install NGSpeciesID.
 
 1. Create and activate a new environment called NGSpeciesID
{NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/README.md

@@ -8,25 +8,44 @@ NGSpeciesID is distributed as a python package supported on Linux / OSX with pyt
 Table of Contents
 =================
 
-* [INSTALLATION](#
-  * [Using conda](#
+* [INSTALLATION](#installation)
+  * [Using conda](#using-conda)
   * [Testing installation](#testing-installation)
-* [USAGE](#
+* [USAGE](#usage)
   * [Filtering and subsampling](#filtering-and-subsampling)
   * [Removing primers](#removing-primers)
-  * [Output](#
-* [EXAMPLE WORKFLOW](#
-* [CREDITS](#
-* [LICENCE](#
+  * [Output](#output)
+* [EXAMPLE WORKFLOW](#example-workflow)
+* [CREDITS](#credits)
+* [LICENCE](#licence)
 
 
 
 INSTALLATION
 ----------------
 
-
+<!---
+**NOTE**: If you are experiencing issues (e.g. [this one](https://github.com/rvaser/spoa/issues/26)) with the third party tools [spoa](https://github.com/rvaser/spoa) or [medaka](https://github.com/nanoporetech/medaka) in the installation instructions below, please install the tools manually with their respective installation instructions [here](https://github.com/rvaser/spoa#installation) and [here](https://github.com/nanoporetech/medaka#installation).
+-->
 
 ### Using conda
+
+**Recent update (2025-04-19)**
+
+There have been many version updates of medaka and spoa since NGSpeciesID was first published. Below are instructions to install
+NGSpeciesID with newer versions of spoa ([v4.1.4](https://bioconda.github.io/recipes/spoa/README.html)) and medaka (v2.0.1).
+
+```
+conda create -n NGSpeciesID python=3.11 pip
+conda activate NGSpeciesID
+conda install --yes -c conda-forge -c bioconda medaka==2.0.1 openblas==0.3.3 spoa racon minimap2 samtools
+pip install NGSpeciesID
+```
+
+Make sure you [test the installation](#testing-installation).
+
+**Published installation instructions (2021-01-11)**
+
 Conda is the preferred way to install NGSpeciesID.
 
 1. Create and activate a new environment called NGSpeciesID
{NGSpeciesID-0.3.0 → ngspeciesid-0.3.1}/modules/barcode_trimmer.py

@@ -1,5 +1,6 @@
 
 import edlib
+import logging
 
 from modules import help_functions
 
@@ -15,10 +16,10 @@ def read_barcodes(primer_file):
     barcodes = { acc + '_fw' : seq.strip() for acc, (seq, _) in help_functions.readfq(open(primer_file, 'r'))}
 
     for acc, seq in list(barcodes.items()):
-
+        logging.debug(f"{acc} {seq} {acc[:-3]}")
         barcodes[acc[:-3] + '_rc'] = reverse_complement(seq.upper())
 
-
+    logging.debug(f"{barcodes}")
     return barcodes
 
 def get_universal_tails():
@@ -26,7 +27,7 @@ def get_universal_tails():
                 '2_R_rc' : 'ACTTGCCTGTCGCTCTATCTTC'}
     barcodes['1_F_rc'] = reverse_complement(barcodes['1_F_fw'])
     barcodes['2_R_fw'] = reverse_complement(barcodes['2_R_rc'])
-
+    logging.debug(f"{barcodes}")
     return barcodes
 
 
@@ -45,14 +46,13 @@ def find_barcode_locations(center, barcodes, primer_max_ed):
                  ('X', 'T'), ('X', 'C'), ('N', 'G'), ('N', 'A'), ('N', 'T'), ('N', 'C')]
     all_locations = []
     for primer_acc, primer_seq in barcodes.items():
-        # print(primer_acc, primer_seq,center)
         # Add additionalEqualities=IUPAC_map allow edlib to understand IUPAC code
         result = edlib.align(primer_seq, center,
                              mode="HW", task="locations", k=primer_max_ed,
                              additionalEqualities=IUPAC_map)
         ed = result["editDistance"]
         locations = result["locations"]
-
+        logging.debug(f"{locations} {ed}")
         if locations:
             all_locations.append((primer_acc, locations[0][0], locations[0][1], ed))
     return all_locations
@@ -63,7 +63,8 @@ def remove_barcodes(centers, barcodes, args):
     Modifies consensus sequences by copping of at barcode sites.
     This implies changing the datastructure centers with the modified consensus sequeces
     """
-
+
+    centers_updated = False
     for i, (nr_reads_in_cluster, c_id, center, reads_path_name) in enumerate(centers):
 
         # if consensus is smaller than 2*trim_window we set trim window to half the sequence
@@ -74,53 +75,30 @@ def remove_barcodes(centers, barcodes, args):
 
         barcode_locations_beginning = find_barcode_locations(center[:trim_window], barcodes, args.primer_max_ed)
         barcode_locations_end = find_barcode_locations(center[-trim_window:], barcodes, args.primer_max_ed)
-
+        logging.debug(f"{center}")
 
         cut_start = 0
         if barcode_locations_beginning:
-
+            logging.debug(f"FOUND BARCODE BEGINNING {barcode_locations_beginning}")
            for bc, start, stop, ed in barcode_locations_beginning:
                 if stop > cut_start:
                     cut_start = stop
 
         cut_end = len(center)
         if barcode_locations_end:
-
+            logging.debug(f"FOUND BARCODE END {barcode_locations_end}")
             earliest_hit = len(center)
             for bc, start, stop, ed in barcode_locations_end:
                 if start < earliest_hit:
                     earliest_hit = start
             cut_end = len(center) - (trim_window - earliest_hit)
-
-
-
-
-
-
-
-
-
-
-        # print("FOUND BARCODE", barcode_locations)
-        # cut_start = 0
-        # cut_end = len(center)
-        # print(center)
-        # for bc, start, stop, ed in barcode_locations:
-        #     # print(ed,bc, bc[-4], bc[-2:])
-        #     if bc[-4] == 'F' and bc[-2:] == 'fw':
-        #         cut_start = stop
-        #     elif bc[-4] == 'R' and bc[-2:] == 'fw':
-        #         cut_end = start
-        #     elif bc[-4] == 'R' and bc[-2:] == 'rc':
-        #         cut_start = stop
-        #     elif bc[-4] == 'F' and bc[-2:] == 'rc':
-        #         cut_end = start
-        #     else:
-        #         print()
-        #         print("Primer file not in correct format!")
-        #         print()
-        # # print(center)
-        # center = center[cut_start: cut_end]
-        # print(center, "NEW")
-        # print("cut start", cut_start, "cut end", cut_end)
-        # centers[i][2] = center
+
+        if cut_start > 0 or cut_end < len(center):
+            center = center[cut_start: cut_end]
+
+            logging.debug(f"{center} NEW")
+            logging.debug(f"cut start {cut_start} cut end {cut_end}")
+            centers[i][2] = center
+            centers_updated = True
+
+    return centers_updated
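The rewritten `remove_barcodes()` now returns a flag indicating whether any consensus was trimmed, which the main script uses to trigger a second round of reverse-complement detection and polishing. Below is a self-contained sketch of the end-trimming idea using edlib's documented `align()` API; the `trim_ends` helper, toy sequences, and thresholds are illustrative only (the package version also handles IUPAC codes, reverse complements, and adaptive trim windows):

```
import edlib

def trim_ends(center, primers, trim_window=25, max_ed=2):
    """Cut a consensus at primer hits found within trim_window of each end (sketch)."""
    cut_start, cut_end = 0, len(center)
    head, tail = center[:trim_window], center[-trim_window:]
    for primer in primers:
        hit = edlib.align(primer, head, mode="HW", task="locations", k=max_ed)
        if hit["locations"]:
            # cut at the hit's end index, mirroring the diff's cut_start = stop convention
            cut_start = max(cut_start, hit["locations"][0][1])
        hit = edlib.align(primer, tail, mode="HW", task="locations", k=max_ed)
        if hit["locations"]:
            # map the hit position within the tail window back onto the full sequence
            earliest = hit["locations"][0][0]
            cut_end = min(cut_end, len(center) - (trim_window - earliest))
    updated = cut_start > 0 or cut_end < len(center)
    return (center[cut_start:cut_end] if updated else center), updated

consensus = "ACGTACGTAC" + "T" * 60 + "GTCAGTCAGT"
trimmed, updated = trim_ends(consensus, ["ACGTACGTAC", "GTCAGTCAGT"])
print(updated, len(consensus), "->", len(trimmed))  # e.g. True 80 -> 61
```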