PyPI - bscampp - Versions diffs - 1.0.3__py3-none-any.whl → 1.0.6__py3-none-any.whl - Mend

bscampp 1.0.3py3-none-any.whl → 1.0.6py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

bscampp/__init__.py +1 -1
bscampp/configs.py +6 -1
bscampp/functions.py +169 -34
bscampp/jobs.py +18 -3
bscampp/pipeline.py +40 -24
bscampp/utils.py +38 -2
{bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/METADATA +8 -7
{bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/RECORD +12 -12
{bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/WHEEL +1 -1
{bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/LICENSE +0 -0
{bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/entry_points.txt +0 -0
{bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/top_level.txt +0 -0

bscampp/__init__.py CHANGED Viewed

@@ -12,7 +12,7 @@ import logging, os
 # not really needed for BSCAMPP but safe to update here
 os.sys.setrecursionlimit(1000000)
-__version__ = "1.0.3"
+__version__ = "1.0.6"
 _INSTALL_PATH = __path__[0]
 # global variables to store all loggers

bscampp/configs.py CHANGED Viewed

@@ -6,6 +6,7 @@ except ImportError:
 from argparse import ArgumentParser, Namespace
 from bscampp.init_configs import init_config_file
 from bscampp import get_logger, log_exception
+#from bscampp.utils import inferDataType
 # detect home.path or create if missing
 homepath = os.path.dirname(__file__) + '/home.path'
@@ -33,6 +34,9 @@ class Configs:
     keeptemp = False        # whether to keep all temporary files
     verbose = 'INFO'        # default verbose level to print
     num_cpus = 1            # number of cores to use for parallelization
+    cpus_per_job = 2        # number of cores to use per job
+    max_workers = 1         # max_workers for ProcessPoolExecutor
+                            # ... = max(1, num_cpus // cpus_per_job)
     # binaries
     pplacer_path = None
@@ -42,7 +46,6 @@ class Configs:
     # placement settings
     placement_method = 'epa-ng'
-    model = 'GTR'
     subtreesize = 2000
     votes = 5
     similarityflag = True
@@ -162,6 +165,8 @@ def buildConfigs(parser, cmdline_args, child_process=False, rerun=False):
         Configs.num_cpus = min(os.cpu_count(), Configs.num_cpus)
     else:
         Configs.num_cpus = os.cpu_count()
+    # compute max_workers based on num_cpus and cpus_per_job
+    Configs.max_workers = max(1, Configs.num_cpus // Configs.cpus_per_job)
     # sanity check for existence of base placement binary path
     if Configs.placement_method == 'epa-ng':

bscampp/functions.py CHANGED Viewed

@@ -1,15 +1,80 @@
 import json, time, os, sys
 import treeswift
 from collections import defaultdict, Counter
+import subprocess
 from bscampp import get_logger, log_exception
 from bscampp.configs import Configs
-from bscampp.jobs import EPAngJob, TaxtasticJob, PplacerTaxtasticJob
+from bscampp.jobs import GenericJob, EPAngJob, TaxtasticJob, PplacerTaxtasticJob
 from bscampp.utils import write_fasta
 import bscampp.utils as utils
+import concurrent.futures
 _LOG = get_logger(__name__)
+############################# helper functions ################################
+'''
+Function to recompile binaries from the given directory.
+Assumption: the directory contains a CMakeLists.txt file.
+Runs `cmake <dir>` followed by `make`, both with <dir> as the working
+directory. If a tool cannot be launched, the error is logged and the
+process exits with code 1. If a tool returns a non-zero code, its captured
+stdout/stderr are logged and the process exits with that return code.
+'''
+def recompileBinariesFromDir(dir):
+    _LOG.warning(f"Recompiling binaries with cmake/make at {dir}")
+    # configure first, then build; stop at the first failing step
+    for name, cmd in (('cmake', ['cmake', dir]), ('make', ['make'])):
+        try:
+            proc = subprocess.run(cmd, cwd=dir, stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE, text=True)
+        except OSError as e:
+            # the tool is missing or cannot be started at all
+            _LOG.error(f"{name} failed! Unable to run {name}: {e}")
+            sys.exit(1)
+        if proc.returncode != 0:
+            # log the captured output, otherwise the reason of the failure
+            # is lost
+            _LOG.error(f"{name} failed!" + '\nSTDOUT: ' + proc.stdout
+                    + '\nSTDERR: ' + proc.stderr)
+            sys.exit(proc.returncode)
+        _LOG.warning(f"{name} succeeded!")
+    _LOG.warning(f"Successfully recompiled binaries at {dir}!")
+'''
+Function to check hamming/fragment_hamming/homology binaries are executable,
+since they were compiled using dynamic library.
+The binary is invoked without arguments; a return code of 255 (or -1) is
+treated as "the binary works". If the binary is missing, cannot be launched,
+or returns any other code, the binaries are recompiled from the directory
+that contains binpath.
+'''
+def ensureBinaryExecutable(binpath):
+    bindir = os.path.dirname(binpath)
+    b_recompile = False
+    if not os.path.exists(binpath):
+        _LOG.warning(f"{binpath} does not exist!")
+        b_recompile = True
+    else:
+        try:
+            p = subprocess.Popen([binpath], stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE)
+            p.communicate()
+        except OSError as e:
+            # the file exists but cannot be executed (e.g., no execute
+            # permission or a binary format the system cannot run)
+            _LOG.warning(f"{binpath} cannot be executed: {e}")
+            b_recompile = True
+        else:
+            # 255 or -1 indicates that the binaries work
+            if p.returncode not in (255, -1):
+                _LOG.warning(f"{binpath} return code is {p.returncode}!")
+                b_recompile = True
+    if b_recompile:
+        recompileBinariesFromDir(bindir)
+########################## end of helper functions ############################
 '''
 Function to read in the placement tree and alignment.
 If query alignment is provided, will use the provided query instead of
@@ -20,7 +85,7 @@ def readData(workdir, dry_run=False):
     _LOG.info('Reading in input data...')
     if dry_run:
-        return None, dict(), '', dict(), '', dict()
+        return None, dict(), '', dict(), '', dict(), dict(), dict()
     # (1) load reference tree
     tree = treeswift.read_tree_newick(Configs.tree_path)
@@ -40,22 +105,44 @@ def readData(workdir, dry_run=False):
     if Configs.qaln_path is not None:
         ref_dict = utils.read_data(Configs.aln_path)
         q_dict = utils.read_data(Configs.qaln_path)
-        aln_path, qaln_path = Configs.aln_path, Configs.qaln_path
+        #aln_path, qaln_path = Configs.aln_path, Configs.qaln_path
     else:
         aln_dict = utils.read_data(Configs.aln_path)
         ref_dict, q_dict = utils.seperate(aln_dict, leaf_dict)
-        # after separating queries from the reference alignment, write
-        # them to to TEMP/
-        qaln_path = os.path.join(workdir, 'qaln.fa')
-        write_fasta(qaln_path, q_dict)
-        aln_path = os.path.join(workdir, 'aln.fa')
-        write_fasta(aln_path, ref_dict)
+    # Added on 3.8.2025 by Chengze Shen
+    #   - to ensure that any characters from the query has correct names
+    #     (e.g., having ":" can cause trouble), have a qname_map that maps
+    #     each taxon name to an idx
+    qidx = 1
+    qname_map = dict()
+    qname_map_rev = dict()
+    for name in q_dict.keys():
+        cvt = str(qidx).zfill(16)   # 16 digits
+        qname_map[name] = cvt
+        qname_map_rev[cvt] = name
+        qidx += 1
+    # modify q_dict as well
+    for name, cvt in qname_map.items():
+        q_dict[cvt] = q_dict[name]
+        q_dict.pop(name)
+    # after separating queries from the reference alignment, write
+    # them to TEMP/
+    # Updated on 3.5.2025 by Chengze Shen
+    #   - regardless of the input choices, write a copy of both reference
+    #     and query alignment to the workdir
+    qaln_path = os.path.join(workdir, 'qaln.fa')
+    write_fasta(qaln_path, q_dict)
+    aln_path = os.path.join(workdir, 'aln.fa')
+    write_fasta(aln_path, ref_dict)
     t1 = time.perf_counter()
     _LOG.info('Time to read in input data: {} seconds'.format(t1 - t0))
-    return tree, leaf_dict, aln_path, ref_dict, qaln_path, q_dict
+    return tree, leaf_dict, aln_path, ref_dict, qaln_path, q_dict, \
+            qname_map, qname_map_rev
 '''
 Function to get the closest leaf for each query sequence based on Hamming
@@ -75,18 +162,29 @@ def getClosestLeaves(aln_path, qaln_path, aln, qaln, workdir, dry_run=False):
     if Configs.subtreetype == "h":
         Configs.votes = Configs.subtreesize
-    cmd = []
     if Configs.similarityflag:
-        cmd.append(os.path.join(Configs.hamming_distance_dir, 'homology'))
+        job_type = 'homology'
     else:
-        if Configs.fragmentflag == False:
-            cmd.append(os.path.join(Configs.hamming_distance_dir, 'hamming'))
+        if Configs.fragmentflag:
+            job_type = 'fragment_hamming'
         else:
-            cmd.append(os.path.join(
-                Configs.hamming_distance_dir, 'fragment_hamming'))
+            job_type = 'hamming'
+    binpath = os.path.join(Configs.hamming_distance_dir, job_type)
+    cmd = [binpath]
+    # Added @ 3.9.2025 by Chengze Shen
+    #   - check if binpath is executable, since the compiled files use dynamic
+    #     libraries.
+    #     If works: should have return code 255
+    #     If not: should have return code 1,
+    #             recompile the binaries using cmake and make
+    ensureBinaryExecutable(binpath)
     cmd.extend([aln_path, str(len(aln)), qaln_path, str(len(qaln)),
         tmp_output, str(Configs.votes)])
-    os.system(' '.join(cmd))
+    job = GenericJob(cmd=cmd, job_type=job_type)
+    _ = job.run()
+    #os.system(' '.join(cmd))
     # process closest leaves
     unusable_queries = set()
@@ -282,16 +380,30 @@ def buildQuerySubtrees(query_votes_dict, query_top_vote_dict,
 '''
 Helper function to run a single placement task. Designed to use with
 multiprocessing
+Input: job object
+Return: outpath from job.run()
 '''
-def placeOneSubtree():
-    # TODO
-    pass
+def placeOneSubtree(*jobs,
+        subtree_id=0, num_assigned_queries=-1, outpath=None, logging=None):
+    # function-scope import, this helper is the only user of shutil
+    import shutil
+    job_type, _outpath = None, None
+    # jobs are run in the given order; job_type and _outpath of the last
+    # job are kept, which will be for the placement job
+    for job in jobs:
+        job_type = job.job_type
+        _outpath = job.run(logging=logging)
+    # for EPA-ng, move the output returned by job.run() to the outpath we
+    # want. shutil.move does not go through a shell, so paths containing
+    # spaces or shell metacharacters are handled correctly and a failed
+    # move raises instead of being silently ignored
+    if job_type == 'epa-ng':
+        shutil.move(_outpath, outpath)
+    # subtree_id and num_assigned_queries are passed through unchanged so
+    # the caller can match a finished future to its subtree
+    return subtree_id, num_assigned_queries, outpath
 '''
 Function to perform placement of queries for each subtree
 '''
 def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
-        aln, qaln, cmdline_args, workdir, pool, lock, dry_run=False):
+        aln, qaln, cmdline_args, workdir, qname_map, qname_map_rev,
+        pool, lock, dry_run=False):
     t0 = time.perf_counter()
     _LOG.info('Performing placement on each subtree...')
@@ -307,22 +419,21 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
     # go over the dictionary of subtrees and their assigned queries
     # perform placement using either EPA-ng or pplacer
     final_subtree_count, total_subtrees_examined = 0, 0
+    futures = []
+    _LOG.info("Submitting jobs for subtree placement...")
     for subtree, query_list in new_subtree_dict.items():
         total_subtrees_examined += 1
-        _LOG.info('- Subtree {}/{} with {} queries'.format(
-            total_subtrees_examined, len(new_subtree_dict), len(query_list)))
         # empty subtree, continue
         if len(query_list) == 0:
             continue
-        final_subtree_count += 1
         subtree_dir = os.path.join(workdir, f'subtree_{final_subtree_count}')
         if not os.path.isdir(subtree_dir):
             os.makedirs(subtree_dir)
         # name all temporary output files
-        tmp_tree = os.path.join(subtree_dir, 'tree')
+        tmp_tree = os.path.join(subtree_dir, f'subtree_{final_subtree_count}.tre')
         tmp_aln = os.path.join(subtree_dir, f'subtree_{final_subtree_count}_aln.fa')
         tmp_qaln = os.path.join(subtree_dir, f'subtree_{final_subtree_count}_qaln.fa')
         tmp_output = os.path.join(subtree_dir,
@@ -345,14 +456,16 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
         # 1.27.2025 - Chengze Shen
         # choose the placement method to run
+        jobs = []
         if Configs.placement_method == 'epa-ng':
             job = EPAngJob(path=Configs.epang_path,
                     info_path=Configs.info_path, tree_path=tmp_tree,
                     aln_path=tmp_aln, qaln_path=tmp_qaln,
                     outdir=subtree_dir, num_cpus=Configs.num_cpus)
-            # for EPA-ng, ensure that outpath name is changed to the one we want
-            _outpath = job.run(logging=f'subtree_{final_subtree_count}')
-            os.system('mv {} {}'.format(_outpath, tmp_output))
+            jobs.append(job)
+            ## for EPA-ng, ensure that outpath name is changed to the one we want
+            #_outpath = job.run(logging=f'subtree_{final_subtree_count}')
+            #os.system('mv {} {}'.format(_outpath, tmp_output))
         elif Configs.placement_method == 'pplacer':
             # build ref_pkg with info and tmp_tree and tmp_aln
             refpkg_dir = os.path.join(subtree_dir,
@@ -361,17 +474,33 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
                     outdir=refpkg_dir, name=f'subtree_{final_subtree_count}',
                     aln_path=tmp_aln, tree_path=tmp_tree,
                     info_path=Configs.info_path)
-            _ = taxit_job.run()
+            jobs.append(taxit_job)
+            #_ = taxit_job.run()
             # run pplacer-taxtastic
             job = PplacerTaxtasticJob(path=Configs.pplacer_path,
-                    refpkg_dir=refpkg_dir, model=Configs.model,
+                    refpkg_dir=refpkg_dir,
+                    #molecule=Configs.molecule, model=Configs.model,
                     outpath=tmp_output, num_cpus=Configs.num_cpus,
                     qaln_path=tmp_qaln)
-            tmp_output = job.run(logging=f'subtree_{final_subtree_count}')
+            #tmp_output = job.run(logging=f'subtree_{final_subtree_count}')
+            jobs.append(job)
         else:
             raise ValueError(
                     f"Placement method {Configs.placement_method} not recognized")
+        logging = f'subtree_{final_subtree_count}'
+        futures.append(pool.submit(placeOneSubtree, *jobs,
+            subtree_id=final_subtree_count,
+            num_assigned_queries=len(query_list),
+            outpath=tmp_output, logging=logging))
+        # increment final_subtree_count
+        final_subtree_count += 1
+    # deal with outputs
+    for future in concurrent.futures.as_completed(futures):
+        subtree_id, num_assigned_queries, tmp_output = future.result()
+        _LOG.info('- Subtree {}/{} with {} queries'.format(
+            subtree_id + 1, final_subtree_count, num_assigned_queries))
         # read in each placement result
         place_file = open(tmp_output, 'r')
@@ -391,8 +520,12 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
             field_to_idx = {field: i for i, field in enumerate(fields)}
             for tmp_place in place_json["placements"]:
-                #print(tmp_place)
-                placed_query_list.append(tmp_place[tgt][0])
+                # convert qname back using qname_map_rev
+                qname = qname_map_rev[tmp_place[tgt][0]]
+                tmp_place[tgt][0] = qname
+                placed_query_list.append(qname)
+                #placed_query_list.append(tmp_place[tgt][0])
                 for i in range(len(tmp_place["p"])):
                     edge_num = tmp_place["p"][i][
                             field_to_idx['edge_num']]
@@ -434,6 +567,7 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
                 placements.append(tmp_place.copy())
         place_file.close()
     _LOG.info(f'Final number of subtrees used: {final_subtree_count}')
     # prepare the output jplace to write
@@ -447,6 +581,7 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
     _LOG.info('Time to place queries to subtrees: {} seconds'.format(t1 - t0))
     return jplace
 '''
 Function to write a given jplace object to local output
 '''

bscampp/jobs.py CHANGED Viewed

@@ -112,7 +112,7 @@ class Job(object):
                 else:
                     _LOG.error(error_msg + '\nSTDOUT: ' + stdout +
                             '\nSTDERR: ' + stderr + logpath)
-                exit(1)
+                exit(self.returncode)
         except Exception:
             log_exception(_LOG)
@@ -123,6 +123,18 @@ class Job(object):
         raise NotImplementedError(
             'get_invocation() should be implemented by subclasses.')
+'''
+Generic job that runs the given command, represented as a list of strings
+    cmd      - command to run as a list of strings, defaults to an empty list
+    job_type - label stored on the job, defaults to 'external'
+'''
+class GenericJob(Job):
+    def __init__(self, cmd=None, job_type='external'):
+        Job.__init__(self)
+        self.job_type = job_type
+        # use None as the default and create a new list per instance, so
+        # that jobs created without cmd do not share one list object
+        self.cmd = [] if cmd is None else cmd
+    # returns (command, output path); a generic job has no output path
+    def get_invocation(self):
+        return self.cmd, None
 '''
 A EPA-ng job that runs EPA-ng with given parameters
 '''
@@ -137,6 +149,8 @@ class EPAngJob(Job):
         self.aln_path = ''
         self.qaln_path = ''
         self.outdir = ''
+        #self.molecule = ''
+        #self.model = ''
         self.num_cpus = 1
         for k, v in kwargs.items():
@@ -194,7 +208,7 @@ class PplacerTaxtasticJob(Job):
         self.qaln_path = ''
         self.outdir = ''
         self.outpath = ''
-        self.model = 'GTR'
+        #self.model = 'GTR'
         self.num_cpus = 1
         for k, v in kwargs.items():
@@ -202,7 +216,8 @@ class PplacerTaxtasticJob(Job):
     def get_invocation(self):
         # outpath defined
-        cmd = [self.path, '-m', self.model,
+        cmd = [self.path,
+                #'-m', self.model,
                 '-c', self.refpkg_dir, '-o', self.outpath,
                 '-j', str(self.num_cpus), self.qaln_path]
         return cmd, self.outpath

bscampp/pipeline.py CHANGED Viewed

@@ -32,7 +32,9 @@ def bscampp_pipeline(*args, **kwargs):
     # initialize multiprocessing (if needed)
     _LOG.warning('Initializing ProcessPoolExecutor...')
-    pool = ProcessPoolExecutor(Configs.num_cpus, initializer=initial_pool,
+    # run at most Configs.max_workers jobs concurrently, where
+    # max_workers = max(1, num_cpus // cpus_per_job), cpus_per_job default 2
+    pool = ProcessPoolExecutor(Configs.max_workers, initializer=initial_pool,
             initargs=(parser, cmdline_args,))
     # (0) temporary files wrote to here
@@ -48,8 +50,8 @@ def bscampp_pipeline(*args, **kwargs):
     # (1) read in tree, alignment, and separate reference sequences from
     # query sequences
-    tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir,
-            dry_run=dry_run)
+    tree, leaf_dict, aln_path, aln, qaln_path, qaln, qname_map, qname_map_rev \
+            = readData(workdir, dry_run=dry_run)
     # (2) compute closest leaves for all query sequences
     query_votes_dict, query_top_vote_dict = getClosestLeaves(
@@ -64,8 +66,9 @@ def bscampp_pipeline(*args, **kwargs):
     # (4) perform placement for each subtree
     output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
-            placed_query_list, aln, qaln, cmdline_args, workdir, pool, lock,
-            dry_run=dry_run)
+            placed_query_list, aln, qaln, cmdline_args, workdir,
+            qname_map, qname_map_rev,
+            pool, lock, dry_run=dry_run)
     # (5) write the output jplace to local
     writeOutputJplace(output_jplace, dry_run=dry_run)
@@ -121,8 +124,8 @@ def scampp_pipeline(*args, **kwargs):
     # (1) read in tree, alignment, and separate reference sequences from
     # query sequences
-    tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir,
-            dry_run=dry_run)
+    tree, leaf_dict, aln_path, aln, qaln_path, qaln, qname_map, qname_map_rev \
+            = readData(workdir, dry_run=dry_run)
     # (2) compute closest leaves for all query sequences
     query_votes_dict, query_top_vote_dict = getClosestLeaves(
@@ -136,8 +139,9 @@ def scampp_pipeline(*args, **kwargs):
     # (4) perform placement for each subtree
     output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
-            placed_query_list, aln, qaln, cmdline_args, workdir, pool, lock,
-            dry_run=dry_run)
+            placed_query_list, aln, qaln, cmdline_args, workdir,
+            qname_map, qname_map_rev,
+            pool, lock, dry_run=dry_run)
     # (5) write the output jplace to local
     writeOutputJplace(output_jplace, dry_run=dry_run)
@@ -257,14 +261,20 @@ def _init_parser(default_outdir="bscampp_output",
     basic_group.add_argument("-a", "--alignment", "--aln-path", type=str,
                   dest="aln_path",
                   help=("Path for reference sequence alignment in "
-                  "FASTA format. Optionally with query sequences. "
+                  "FASTA format (can be a .gz file). "
+                  "Optionally with query sequences. "
                   "Query alignment can be specified with --qaln-path"),
                   required=required, default=None)
     basic_group.add_argument("-q", "--qalignment", "--qaln-path", type=str,
                   dest="qaln_path",
                   help=("Optionally provide path to query sequence alignment "
-                  "in FASTA format. Default: None"),
+                  "in FASTA format (can be a .gz file). Default: None"),
                   required=False, default=None)
+    #basic_group.add_argument("--molecule", type=str,
+    #              choices=['nucl', 'nucleotide', 'prot', 'protein'],
+    #              help=("Specify nucleotide or protein sequences. "
+    #              "Default: infer datatype"),
+    #              required=False, default=None)
     basic_group.add_argument("-d", "--outdir", type=str,
                   help="Directory path for output. Default: bscampp_output/",
                   required=False, default=default_outdir)
@@ -275,6 +285,9 @@ def _init_parser(default_outdir="bscampp_output",
                   dest="num_cpus",
                   help="Number of cores for parallelization, default: -1 (all)",
                   required=False, default=-1)
+    basic_group.add_argument("--cpus-per-job", type=int,
+                  help="Number of cores to use for each job, default: 2",
+                  required=False, default=2)
     # advanced parameter settings
     advance_group = parser.add_argument_group(
@@ -284,16 +297,24 @@ def _init_parser(default_outdir="bscampp_output",
              ))
     parser.groups['advance_group'] = advance_group
-    advance_group.add_argument("-m", "--model", type=str,
-                  help="Model used for edge distances. Default: GTR",
-                  required=False, default="GTR")
+    #advance_group.add_argument("-m", "--model", type=str,
+    #              help=("Model used for edge distances. EPA-ng will use the "
+    #              "provided info_path (*.bestModel) for model. "
+    #              "Default: GTR for nucleotide, LG for protein"),
+    #              required=False, default=None)
     advance_group.add_argument("-b", "--subtreesize", type=int,
                   help="Integer size of the subtree. Default: 2000",
                   required=False, default=2000)
     advance_group.add_argument("-V", "--votes", type=int,
-                  help="This is only used for BSCAMPP! Number of votes per "
-                  "query sequence. Default: 5",
+                  help="(BSCAMPP only) Number of votes per query sequence. "
+                  "Default: 5",
                   required=False, default=5)
+    advance_group.add_argument("--subtreetype", type=str,
+                  help="(SCAMPP only) Options for collecting "
+                  "nodes for the subtree - d for edge weighted "
+                  "distances, n for node distances, h for Hamming "
+                  "distances. Default: d",
+                  required=False, default='d')
     advance_group.add_argument("--similarityflag", type=str2bool,
                   help="Boolean, True if maximizing sequence similarity "
                   "instead of simple Hamming distance (ignoring gap "
@@ -306,17 +327,12 @@ def _init_parser(default_outdir="bscampp_output",
     parser.groups['misc_group'] = misc_group
     misc_group.add_argument("-n","--tmpfilenbr", type=int,
-                  help="Temporary file indexing. Default: 0",
+                  help="Temporary file indexing (e.g., tmp0/). Default: 0",
                   required=False, default=0)
     misc_group.add_argument("--fragmentflag", type=str2bool,
-                  help="If queries contains fragments. Default: True",
+                  help="If queries contains fragments. Does not do anything "
+                  "if similarity flag is set to True. Default: True",
                   required=False, default=True)
-    misc_group.add_argument("--subtreetype", type=str,
-                  help="(SCAMPP only) Options for collecting "
-                  "nodes for the subtree - d for edge weighted "
-                  "distances, n for node distances, h for Hamming "
-                  "distances. Default: d",
-                  required=False, default='d')
     misc_group.add_argument("--keeptemp", type=str2bool,
                   help="Boolean, True to keep all temporary files. "
                   "Default: False",

bscampp/utils.py CHANGED Viewed

@@ -9,8 +9,11 @@ import random
 import statistics
 import copy
 import gzip
 import argparse
+from bscampp import get_logger, log_exception
+_LOG = get_logger(__name__)
 # reformat argparse help text formatting
 class SmartHelpFormatter(argparse.RawDescriptionHelpFormatter):
     def add_text(self, text):
@@ -36,6 +39,34 @@ BRACKET = {
 }
+# infer datatype ('nucleotide' or 'protein') from input file
+#   - counts A/C/G/N, T and U (case-insensitive) over all sequences read
+#     from path
+#   - no U and more than 90% A/C/G/N/T (DNA), or no T and more than 90%
+#     A/C/G/N/U (RNA), gives 'nucleotide'; anything else gives 'protein'
+#   - raises ValueError if the input contains no sequence characters
+def inferDataType(path):
+    sequences = read_data(path)
+    acg, t, u, total = 0, 0, 0, 0
+    for seq in sequences.values():
+        letters = seq.upper()
+        # NOTE(review): every character counts toward total, presumably
+        # including alignment gaps - confirm that this is intended
+        total += len(letters)
+        acg += sum(letters.count(c) for c in 'ACGN')
+        t += letters.count('T')
+        u += letters.count('U')
+    if total == 0:
+        # nothing to inspect, the ratios below would divide by zero
+        raise ValueError(
+                f"Cannot infer data type: no sequence characters in {path}")
+    # dna -> nucleotide
+    if u == 0 and (acg + t) / total > 0.9:
+        datatype = 'nucleotide'
+    # rna -> nucleotide
+    elif t == 0 and (acg + u) / total > 0.9:
+        datatype = 'nucleotide'
+    # amino acid -> protein
+    else:
+        datatype = 'protein'
+    _LOG.info(f"Inferred input data type: {datatype}")
+    return datatype
 def write_fasta(aln, aln_dict, aligned=True):
     """ Write given dictionary as FASTA file out
@@ -76,7 +107,12 @@ def read_data(aln):
     """
-    f = open(aln)
+    # determine the file type, whether we have a .gz/.gzip file
+    suffix = aln.split('.')[-1]
+    if suffix in ['gz', 'gzip']:
+        f = gzip.open(aln, 'rt')
+    else:
+        f = open(aln)
     result = dict()
     taxa = ""

{bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/METADATA RENAMED Viewed

@@ -1,7 +1,7 @@
 Metadata-Version: 2.2
 Name: bscampp
-Version: 1.0.3
-Summary: BSCAMPP - A Scalable Phylogenetic Placement Tool
+Version: 1.0.6
+Summary: BSCAMPP and SCAMPP - Scalable Phylogenetic Placement Tools
 Author-email: Eleanor Wedell <ewedell2@illinois.edu>, Chengze Shen <chengze5@illinois.edu>
 License: MIT License
@@ -66,7 +66,7 @@ Requires-Dist: taxtastic>=0.9.3
 # Overview
 * **Inputs**
   1. Reference tree to place sequences into.
-  2. Alignment of reference sequences.
+  2. Alignment of reference sequences (protein or nucleotide).
   3. Alignment of query sequences (can be combined with ii.).
   4. Tree info file.
      - (EPA-ng as base method), RAxML-ng info file, typically with suffix `.bestModel`.
@@ -230,16 +230,17 @@ run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment
 >                         Output file name. Default: bscampp_result.jplace
 >   --threads NUM_CPUS, --num-cpus NUM_CPUS
 >                         Number of cores for parallelization, default: -1 (all)
+>   --cpus-per-job CPUS_PER_JOB
+>                         Number of cores to use for each job, default: 2
 >
 > ADVANCE PARAMETERS:
->   These parameters control how BSCAMPP is run. The default values are set based on experiments.
+>   These parameters control how BSCAMPP and SCAMPP are run. The default values are set based on experiments.
 >
->   -m MODEL, --model MODEL
->                         Model used for edge distances. Default: GTR
 >   -b SUBTREESIZE, --subtreesize SUBTREESIZE
 >                         Integer size of the subtree. Default: 2000
 >   -V VOTES, --votes VOTES
->                         Number of votes per query sequence. Default: 5
+>                         (BSCAMPP only) Number of votes per query sequence.
+>                         Default: 5
 >   --similarityflag SIMILARITYFLAG
 >                         Boolean, True if maximizing sequence similarity
 >                         instead of simple Hamming distance (ignoring gap sites

{bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/RECORD RENAMED Viewed

@@ -1,11 +1,11 @@
-bscampp/__init__.py,sha256=toGV8EzvMKviV7xHahhXs0K6fAmHw2cnWb6EDscpIOY,2289
-bscampp/configs.py,sha256=3HJHLN2fLV5Tv3TJL95NpOuSXUV6CvqxRqCOM6TpbJQ,5767
+bscampp/__init__.py,sha256=eDIMYifzKrFdtA3Ac7OvPTyIHUO1ZLgVaM0pKFxxEHE,2289
+bscampp/configs.py,sha256=perl6u5hto6J3JV1JMbsTQ6tqr2uGOk-Z9jfzflid0s,6122
 bscampp/default.config,sha256=CEfsUHBy--vwJhEcUuJ0btfuGQWb_lKMVWUIP9f5YGw,112
-bscampp/functions.py,sha256=QYI5RsUEMGc6jLPzFdInpmxA8wiYyN7785P3WxWYiTo,17839
+bscampp/functions.py,sha256=DGHQJLLzXSghDKbha0LW0YPip_45M6MI4t3zdDpzULI,22448
 bscampp/init_configs.py,sha256=EA9sMN5jWj6zj2b-7tN19LhX2Ef61ByQLxQRLHAqLDM,3600
-bscampp/jobs.py,sha256=PrVMJBabi4cYlrxVLo37XPOY82fY0zZ8Iyp9CWCNWhU,7181
-bscampp/pipeline.py,sha256=C6I1vWeA6Rq_spPHy_il1FJA_DomWHUHYHLUUk9SnLk,13024
-bscampp/utils.py,sha256=ragaI14Lqb2fVp_uYDkFQnV7a50G9-sUOWdVM-sNhUE,29005
+bscampp/jobs.py,sha256=v7buZJs1AnNoXiILwu-W8fo3QjxAh3i9Mp7xfmlJvAY,7569
+bscampp/pipeline.py,sha256=IPZnXZmVxGGfbVUuGCQh5X9oBq48-6pA9QkuvMGPTag,14000
+bscampp/utils.py,sha256=-wns6FaWMKD2wVqjxdBQvjTdagTjywBIaGfqb2mupe4,30039
 bscampp/tools/epa-ng,sha256=f3EVoZAAOXLN6l521qp-TrWDl5J2nqL3tGgjPaQE9WQ,3772096
 bscampp/tools/pplacer,sha256=p0H4eo9uuiYoWS_kJbPfauOV99i7BXJdZSiwXIuLxTw,7834576
 bscampp/tools/hamming_distance/CMakeLists.txt,sha256=yf9iq7Y61t3WObJHoR4RoGDEvUw_Q8JW3UnI4uh0cfU,389
@@ -17,9 +17,9 @@ bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp,sha256=xCmyAT-OZJOD
 bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp,sha256=eKxgODRlpf0hU84QjNhigvRhWCT9tiJZjA5oQFQ1bUk,7404
 bscampp/tools/hamming_distance/src/homology.cpp,sha256=ZE0uXZWQ-cN4U1Wk5kUr_KKHgzsgA6Sno-IViRa4tmI,6053
 bscampp/tools/hamming_distance/src/new_hamming.cpp,sha256=fBRm99RquBZgZjaLOn9xDI3cH9NchhrxKbL-11j8fmk,5342
-bscampp-1.0.3.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
-bscampp-1.0.3.dist-info/METADATA,sha256=01Vl-oCadCIiWFBLA564CLNErXILqEzdRrQNPpGy_mc,12507
-bscampp-1.0.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-bscampp-1.0.3.dist-info/entry_points.txt,sha256=4Ft83qHc39tNNpMLgSgFXDHM-vuAB99JtmczCQj5pq8,204
-bscampp-1.0.3.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
-bscampp-1.0.3.dist-info/RECORD,,
+bscampp-1.0.6.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
+bscampp-1.0.6.dist-info/METADATA,sha256=0sWAKK30wlps8i0d1BdFqyv5MZVgefRnTn_-yMmO8lQ,12602
+bscampp-1.0.6.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+bscampp-1.0.6.dist-info/entry_points.txt,sha256=4Ft83qHc39tNNpMLgSgFXDHM-vuAB99JtmczCQj5pq8,204
+bscampp-1.0.6.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
+bscampp-1.0.6.dist-info/RECORD,,

{bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.0)
+Generator: setuptools (76.0.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/LICENSE RENAMED Viewed

File without changes

{bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/top_level.txt RENAMED Viewed

File without changes

bscampp 1.0.3__py3-none-any.whl → 1.0.6__py3-none-any.whl

bscampp 1.0.3py3-none-any.whl → 1.0.6py3-none-any.whl