bscampp 1.0.1a0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bscampp/__init__.py CHANGED
@@ -12,7 +12,7 @@ import logging, os
12
12
  # not really needed for BSCAMPP but safe to update here
13
13
  os.sys.setrecursionlimit(1000000)
14
14
 
15
- __version__ = "1.0.1a"
15
+ __version__ = "1.0.2"
16
16
  _INSTALL_PATH = __path__[0]
17
17
 
18
18
  # global variables to store all loggers
bscampp/configs.py CHANGED
@@ -50,6 +50,7 @@ class Configs:
50
50
  # miscellaneous
51
51
  tmpfilenbr = 0
52
52
  fragmentflag = True
53
+ subtreetype = 'd'
53
54
 
54
55
  # check if the given configuration is valid to add
55
56
  def set_valid_configuration(name, conf):
bscampp/functions.py CHANGED
@@ -15,10 +15,13 @@ Function to read in the placement tree and alignment.
15
15
  If query alignment is provided, will use the provided query instead of
16
16
  the ones (potentially) included in the reference alignment
17
17
  '''
18
- def readData(workdir):
18
+ def readData(workdir, dry_run=False):
19
19
  t0 = time.perf_counter()
20
20
  _LOG.info('Reading in input data...')
21
21
 
22
+ if dry_run:
23
+ return None, dict(), '', dict(), '', dict()
24
+
22
25
  # (1) load reference tree
23
26
  tree = treeswift.read_tree_newick(Configs.tree_path)
24
27
  tree.resolve_polytomies()
@@ -45,10 +48,10 @@ def readData(workdir):
45
48
  # after separating queries from the reference alignment, write
46
49
  # them to TEMP/
47
50
  qaln_path = os.path.join(workdir, 'qaln.fa')
48
- write_fasta(temp_qaln_path, q_dict)
51
+ write_fasta(qaln_path, q_dict)
49
52
 
50
53
  aln_path = os.path.join(workdir, 'aln.fa')
51
- write_fasta(temp_aln_path, ref_dict)
54
+ write_fasta(aln_path, ref_dict)
52
55
 
53
56
  t1 = time.perf_counter()
54
57
  _LOG.info('Time to read in input data: {} seconds'.format(t1 - t0))
@@ -58,19 +61,25 @@ def readData(workdir):
58
61
  Function to get the closest leaf for each query sequence based on Hamming
59
62
  distance
60
63
  '''
61
- def getClosestLeaves(aln_path, qaln_path, aln, qaln, workdir):
64
+ def getClosestLeaves(aln_path, qaln_path, aln, qaln, workdir, dry_run=False):
62
65
  t0 = time.perf_counter()
63
66
  _LOG.info('Computing closest leaves for query sequences...')
67
+
68
+ if dry_run:
69
+ return dict(), dict()
70
+
64
71
  query_votes_dict = dict()
65
72
  query_top_vote_dict = dict()
66
-
67
73
  tmp_output = os.path.join(workdir, 'closest.txt')
74
+
75
+ if Configs.subtreetype == "h":
76
+ Configs.votes = Configs.subtreesize
68
77
 
69
78
  cmd = []
70
79
  if Configs.similarityflag:
71
80
  cmd.append(os.path.join(Configs.hamming_distance_dir, 'homology'))
72
81
  else:
73
- if fragment_flag == False:
82
+ if Configs.fragmentflag == False:
74
83
  cmd.append(os.path.join(Configs.hamming_distance_dir, 'hamming'))
75
84
  else:
76
85
  cmd.append(os.path.join(
@@ -115,10 +124,13 @@ def getClosestLeaves(aln_path, qaln_path, aln, qaln, workdir):
115
124
  Function to assign queries to subtrees based on their votes
116
125
  '''
117
126
  def assignQueriesToSubtrees(query_votes_dict, query_top_vote_dict,
118
- tree, leaf_dict):
127
+ tree, leaf_dict, dry_run=False):
119
128
  t0 = time.perf_counter()
120
129
  _LOG.info('Adding query votes to the placement tree...')
121
130
 
131
+ if dry_run:
132
+ return dict(), []
133
+
122
134
  # (1) go over the query votes and add them to corresponding leaves
123
135
  lf_votes = Counter()
124
136
  leaf_queries = dict()
@@ -217,6 +229,51 @@ def assignQueriesToSubtrees(query_votes_dict, query_top_vote_dict,
217
229
  _LOG.info('Time to assign queries to subtrees: {} seconds'.format(t1 - t0))
218
230
  return new_subtree_dict, placed_query_list
219
231
 
232
+
233
+ '''
234
+ Function to assign queries to subtrees as used in SCAMPP
235
+ (subtrees are built using the nearest leaf as the seed sequence)
236
+ '''
237
+ def buildQuerySubtrees(query_votes_dict, query_top_vote_dict,
238
+ tree, leaf_dict, dry_run=False):
239
+ t0 = time.perf_counter()
240
+
241
+ if dry_run:
242
+ return dict(), []
243
+
244
+ # (1) go over the query seed sequences to see if any queries use
245
+ # the same seed sequence (i.e. subtree)
246
+ seed_queries = dict()
247
+ for query, closest_leaf in query_top_vote_dict.items():
248
+ if closest_leaf not in seed_queries:
249
+ seed_queries[closest_leaf] = [query]
250
+ else:
251
+ seed_queries[closest_leaf].append(query)
252
+
253
+ new_subtree_dict = dict()
254
+ # assign queries to subtrees, and remove them from the pool
255
+ # repeat until all queries are assigned
256
+ for seed_label, queries in seed_queries.items():
257
+ node_y = leaf_dict[seed_label]
258
+ # extract [subtreesize] leaves
259
+ if Configs.subtreetype == "h":
260
+ labels = query_votes_dict[queries[0]]
261
+ elif Configs.subtreetype == "n":
262
+ labels = utils.subtree_nodes(tree, node_y, Configs.subtreesize)
263
+ else:
264
+ labels = utils.subtree_nodes_with_edge_length(tree, node_y,
265
+ Configs.subtreesize)
266
+ subtree = tree.extract_tree_with(labels)
267
+ new_subtree_dict[subtree] = queries
268
+
269
+
270
+ placed_query_list = []
271
+
272
+ t1 = time.perf_counter()
273
+ _LOG.info('Time to assign queries to subtrees: {} seconds'.format(t1 - t0))
274
+ return new_subtree_dict, placed_query_list
275
+
276
+
220
277
  '''
221
278
  Helper function to run a single placement task. Designed to use with
222
279
  multiprocessing
@@ -229,10 +286,13 @@ def placeOneSubtree():
229
286
  Function to perform placement of queries for each subtree
230
287
  '''
231
288
  def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
232
- aln, qaln, cmdline_args, workdir, pool, lock):
289
+ aln, qaln, cmdline_args, workdir, pool, lock, dry_run=False):
233
290
  t0 = time.perf_counter()
234
291
  _LOG.info('Performing placement on each subtree...')
235
292
 
293
+ if dry_run:
294
+ return dict()
295
+
236
296
  # prepare to write an aggregated results to local
237
297
  jplace = dict()
238
298
  utils.add_edge_nbrs(tree)
@@ -381,9 +441,12 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
381
441
  '''
382
442
  Function to write a given jplace object to local output
383
443
  '''
384
- def writeOutputJplace(output_jplace):
444
+ def writeOutputJplace(output_jplace, dry_run=False):
385
445
  t0 = time.perf_counter()
386
446
  _LOG.info('Writing aggregated placements to local...')
447
+
448
+ if dry_run:
449
+ return
387
450
 
388
451
  outpath = os.path.join(Configs.outdir, Configs.outname)
389
452
  outf = open(outpath, 'w')
bscampp/pipeline.py CHANGED
@@ -22,8 +22,13 @@ def bscampp_pipeline(*args, **kwargs):
22
22
  t0 = time.perf_counter()
23
23
  m = Manager(); lock = m.Lock()
24
24
 
25
+ # set up a dry run if specified
26
+ dry_run = False
27
+ if 'dry_run' in kwargs and isinstance(kwargs['dry_run'], bool):
28
+ dry_run = kwargs['dry_run']
29
+
25
30
  # parse command line arguments and build configurations
26
- parser, cmdline_args = parseArguments()
31
+ parser, cmdline_args = parseArguments(dry_run=dry_run)
27
32
 
28
33
  # initialize multiprocessing (if needed)
29
34
  _LOG.warning('Initializing ProcessPoolExecutor...')
@@ -31,33 +36,39 @@ def bscampp_pipeline(*args, **kwargs):
31
36
  initargs=(parser, cmdline_args,))
32
37
 
33
38
  # (0) temporary files are written here
34
- workdir = os.path.join(Configs.outdir, f'tmp{Configs.tmpfilenbr}')
35
- try:
36
- if not os.path.isdir(workdir):
37
- os.makedirs(workdir)
38
- except OSError:
39
- log_exception(_LOG)
39
+ if not dry_run:
40
+ workdir = os.path.join(Configs.outdir, f'tmp{Configs.tmpfilenbr}')
41
+ try:
42
+ if not os.path.isdir(workdir):
43
+ os.makedirs(workdir)
44
+ except OSError:
45
+ log_exception(_LOG)
46
+ else:
47
+ workdir = os.getcwd()
40
48
 
41
49
  # (1) read in tree, alignment, and separate reference sequences from
42
50
  # query sequences
43
- tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir)
51
+ tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir,
52
+ dry_run=dry_run)
44
53
 
45
54
  # (2) compute closest leaves for all query sequences
46
55
  query_votes_dict, query_top_vote_dict = getClosestLeaves(
47
- aln_path, qaln_path, aln, qaln, workdir)
56
+ aln_path, qaln_path, aln, qaln, workdir, dry_run=dry_run)
48
57
 
49
58
  # (3) first assign all queries to their closest-leaf subtrees,
50
59
  # then do reassignment to minimize distance between each's top vote
51
60
  # and the subtree's seed leaf
52
61
  new_subtree_dict, placed_query_list = assignQueriesToSubtrees(
53
- query_votes_dict, query_top_vote_dict, tree, leaf_dict)
62
+ query_votes_dict, query_top_vote_dict, tree, leaf_dict,
63
+ dry_run=dry_run)
54
64
 
55
65
  # (4) perform placement for each subtree
56
66
  output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
57
- placed_query_list, aln, qaln, cmdline_args, workdir, pool, lock)
67
+ placed_query_list, aln, qaln, cmdline_args, workdir, pool, lock,
68
+ dry_run=dry_run)
58
69
 
59
70
  # (5) write the output jplace to local
60
- writeOutputJplace(output_jplace)
71
+ writeOutputJplace(output_jplace, dry_run=dry_run)
61
72
 
62
73
  # shutdown pool
63
74
  _LOG.warning('Shutting down ProcessPoolExecutor...')
@@ -73,6 +84,84 @@ def bscampp_pipeline(*args, **kwargs):
73
84
  send = time.perf_counter()
74
85
  _LOG.info('BSCAMPP completed in {} seconds...'.format(send - t0))
75
86
 
87
+ if dry_run:
88
+ return True
89
+ else:
90
+ return False
91
+
92
+
93
+ # main pipeline for SCAMPP
94
+ def scampp_pipeline(*args, **kwargs):
95
+ t0 = time.perf_counter()
96
+ m = Manager(); lock = m.Lock()
97
+
98
+ # set up a dry run if specified
99
+ dry_run = False
100
+ if 'dry_run' in kwargs and isinstance(kwargs['dry_run'], bool):
101
+ dry_run = kwargs['dry_run']
102
+
103
+ # parse command line arguments and build configurations
104
+ parser, cmdline_args = parseArguments(dry_run=dry_run, method="SCAMPP")
105
+
106
+ # initialize multiprocessing (if needed)
107
+ _LOG.warning('Initializing ProcessPoolExecutor...')
108
+ pool = ProcessPoolExecutor(Configs.num_cpus, initializer=initial_pool,
109
+ initargs=(parser, cmdline_args,))
110
+
111
+ # (0) temporary files are written here
112
+ if not dry_run:
113
+ workdir = os.path.join(Configs.outdir, f'tmp{Configs.tmpfilenbr}')
114
+ try:
115
+ if not os.path.isdir(workdir):
116
+ os.makedirs(workdir)
117
+ except OSError:
118
+ log_exception(_LOG)
119
+ else:
120
+ workdir = os.getcwd()
121
+
122
+ # (1) read in tree, alignment, and separate reference sequences from
123
+ # query sequences
124
+ tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir,
125
+ dry_run=dry_run)
126
+
127
+ # (2) compute closest leaves for all query sequences
128
+ query_votes_dict, query_top_vote_dict = getClosestLeaves(
129
+ aln_path, qaln_path, aln, qaln, workdir, dry_run=dry_run)
130
+
131
+ # (3) first assign each query to the subtree built using the closest
132
+ # leaf as the seed sequence
133
+ new_subtree_dict, placed_query_list = buildQuerySubtrees(
134
+ query_votes_dict, query_top_vote_dict, tree, leaf_dict,
135
+ dry_run=dry_run)
136
+
137
+ # (4) perform placement for each subtree
138
+ output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
139
+ placed_query_list, aln, qaln, cmdline_args, workdir, pool, lock,
140
+ dry_run=dry_run)
141
+
142
+ # (5) write the output jplace to local
143
+ writeOutputJplace(output_jplace, dry_run=dry_run)
144
+
145
+ # shutdown pool
146
+ _LOG.warning('Shutting down ProcessPoolExecutor...')
147
+ pool.shutdown()
148
+ _LOG.warning('ProcessPoolExecutor shut down.')
149
+
150
+ # clean up temp files if not keeping
151
+ if not Configs.keeptemp:
152
+ _LOG.info('Removing temporary files...')
153
+ clean_temp_files()
154
+
155
+ # stop SCAMPP
156
+ send = time.perf_counter()
157
+ _LOG.info('SCAMPP completed in {} seconds...'.format(send - t0))
158
+
159
+ if dry_run:
160
+ return True
161
+ else:
162
+ return False
163
+
164
+
76
165
  def clean_temp_files():
77
166
  # all temporary files/directories to remove
78
167
  temp_items = [f'tmp{Configs.tmpfilenbr}']
@@ -86,29 +175,43 @@ def clean_temp_files():
86
175
  continue
87
176
  _LOG.info(f'- Removed {temp}')
88
177
 
89
- def parseArguments():
178
+ def parseArguments(dry_run=False, method="BSCAMPP"):
90
179
  global _root_dir, main_config_path
91
- parser = _init_parser()
180
+
181
+ default_outdir = f"{method.lower()}_output"
182
+ default_outname = f"{method.lower()}_result"
183
+
184
+ parser = _init_parser(default_outdir=default_outdir,
185
+ default_outname=default_outname)
92
186
  cmdline_args = sys.argv[1:]
187
+
188
+ if dry_run:
189
+ cmdline_args = ['-i', 'dummy.info', '-t', 'dummy.tre',
190
+ '-a', 'dummy.fa']
93
191
 
94
192
  # build config
95
193
  buildConfigs(parser, cmdline_args)
96
- _LOG.info('BSCAMPP is running with: {}'.format(
194
+ _LOG.info('{} is running with: {}'.format(method,
97
195
  ' '.join(cmdline_args)))
98
196
  getConfigs()
99
197
 
100
198
  return parser, cmdline_args
101
199
 
102
- def _init_parser():
200
+ def _init_parser(default_outdir="bscampp_output",
201
+ default_outname="bscampp_result"):
103
202
  # example usage
104
203
  example_usages = '''Example usages:
105
- > default
106
- %(prog)s -i raxml.info
204
+ > (1) Default
205
+ %(prog)s -i raxml.bestModel -t reference.tre -a alignment.fa
206
+ > (2) Separate alignment file for query sequences
207
+ %(prog)s -i raxml.bestModel -t reference.tre -a reference.fa -q query.fa
208
+ > (3) Use pplacer instead of EPA-ng as base method (need RAxML-ng info or FastTree log file)
209
+ %(prog)s -i fasttree.log -t reference.tre -a alignment.fa --placement-method pplacer
107
210
  '''
108
211
 
109
212
  parser = ArgumentParser(
110
213
  description=(
111
- "This program runs BSCAMPP, a scalable phylogenetic "
214
+ "This program runs BSCAMPP/SCAMPP, a scalable phylogenetic "
112
215
  "placement framework that scales EPA-ng/pplacer "
113
216
  "to very large tree placement."
114
217
  ),
@@ -135,7 +238,7 @@ def _init_parser():
135
238
  # basic group
136
239
  basic_group = parser.add_argument_group(
137
240
  "Basic parameters".upper(),
138
- "These are the basic parameters for BSCAMPP.")
241
+ "These are the basic parameters for BSCAMPP/SCAMPP.")
139
242
  parser.groups['basic_group'] = basic_group
140
243
 
141
244
  basic_group.add_argument('--placement-method', type=str,
@@ -164,10 +267,10 @@ def _init_parser():
164
267
  required=False, default=None)
165
268
  basic_group.add_argument("-d", "--outdir", type=str,
166
269
  help="Directory path for output. Default: bscampp_output/",
167
- required=False, default="bscampp_output")
270
+ required=False, default=default_outdir)
168
271
  basic_group.add_argument("-o", "--output", type=str, dest="outname",
169
272
  help="Output file name. Default: bscampp_result.jplace",
170
- required=False, default="bscampp_result.jplace")
273
+ required=False, default=f"{default_outname}.jplace")
171
274
  basic_group.add_argument("--threads", "--num-cpus", type=int,
172
275
  dest="num_cpus",
173
276
  help="Number of cores for parallelization, default: -1 (all)",
@@ -188,7 +291,8 @@ def _init_parser():
188
291
  help="Integer size of the subtree. Default: 2000",
189
292
  required=False, default=2000)
190
293
  advance_group.add_argument("-V", "--votes", type=int,
191
- help="Number of votes per query sequence. Default: 5",
294
+ help="This is only used for BSCAMPP! Number of votes per "
295
+ "query sequence. Default: 5",
192
296
  required=False, default=5)
193
297
  advance_group.add_argument("--similarityflag", type=str2bool,
194
298
  help="Boolean, True if maximizing sequence similarity "
@@ -207,6 +311,12 @@ def _init_parser():
207
311
  misc_group.add_argument("--fragmentflag", type=str2bool,
208
312
  help="If queries contains fragments. Default: True",
209
313
  required=False, default=True)
314
+ misc_group.add_argument("--subtreetype", type=str,
315
+ help="(SCAMPP only) Options for collecting "
316
+ "nodes for the subtree - d for edge weighted "
317
+ "distances, n for node distances, h for Hamming "
318
+ "distances. Default: d",
319
+ required=False, default='d')
210
320
  misc_group.add_argument("--keeptemp", type=str2bool,
211
321
  help="Boolean, True to keep all temporary files. "
212
322
  "Default: False",
bscampp/utils.py CHANGED
@@ -8,6 +8,7 @@ from os.path import expanduser,isfile
8
8
  import random
9
9
  import statistics
10
10
  import copy
11
+ import gzip
11
12
 
12
13
  import argparse
13
14
  # reformat argparse help text formatting
@@ -813,25 +814,25 @@ def newick_edge_tokens_node(node):
813
814
  node_to_str[node] = ''.join(out)
814
815
  return node_to_str[node]
815
816
 
816
- def write_tree_newick_edge_tokens(tree, filename, hide_rooted_prefix=False):
817
- '''
818
- Modified from treeswift tree.write_tree_newick()
819
- Write this ``Tree`` to a Newick file
820
- Args:
821
- ``filename`` (``str``): Path to desired output file (plain-text or gzipped)
822
- '''
823
- if not isinstance(filename, str):
824
- raise TypeError("filename must be a str")
825
- treestr = newick_edge_nbr_string(tree)
826
- if hide_rooted_prefix:
827
- if treestr.startswith('[&R]'):
828
- treestr = treestr[4:].strip()
829
- else:
830
- warn("Specified hide_rooted_prefix, but tree was not rooted")
831
- if filename.lower().endswith('.gz'): # gzipped file
832
- f = gopen(expanduser(filename),'wb',9); f.write(treestr.encode()); f.close()
833
- else: # plain-text file
834
- f = open(expanduser(filename),'w'); f.write(treestr); f.close()
817
+ #def write_tree_newick_edge_tokens(tree, filename, hide_rooted_prefix=False):
818
+ # '''
819
+ # Modified from treeswift tree.write_tree_newick()
820
+ # Write this ``Tree`` to a Newick file
821
+ # Args:
822
+ # ``filename`` (``str``): Path to desired output file (plain-text or gzipped)
823
+ # '''
824
+ # if not isinstance(filename, str):
825
+ # raise TypeError("filename must be a str")
826
+ # treestr = newick_edge_nbr_string(tree)
827
+ # if hide_rooted_prefix:
828
+ # if treestr.startswith('[&R]'):
829
+ # treestr = treestr[4:].strip()
830
+ # else:
831
+ # warn("Specified hide_rooted_prefix, but tree was not rooted")
832
+ # if filename.lower().endswith('.gz'): # gzipped file
833
+ # f = gopen(expanduser(filename),'wb',9); f.write(treestr.encode()); f.close()
834
+ # else: # plain-text file
835
+ # f = open(expanduser(filename),'w'); f.write(treestr); f.close()
835
836
 
836
837
  def read_tree_newick_edge_tokens(newick):
837
838
  '''
@@ -850,7 +851,7 @@ def read_tree_newick_edge_tokens(newick):
850
851
  except:
851
852
  raise TypeError("newick must be a str")
852
853
  if newick.lower().endswith('.gz'): # gzipped file
853
- f = gopen(expanduser(newick)); ts = f.read().decode().strip(); f.close()
854
+ f = gzip.open(expanduser(newick)); ts = f.read().decode().strip(); f.close()
854
855
  elif isfile(expanduser(newick)): # plain-text file
855
856
  f = open(expanduser(newick)); ts = f.read().strip(); f.close()
856
857
  else:
@@ -867,7 +868,7 @@ def read_tree_newick_edge_tokens(newick):
867
868
  # end of Newick string
868
869
  if ts[i] == ';':
869
870
  if i != len(ts)-1 or n != t.root:
870
- raise RuntimeError(INVALID_NEWICK)
871
+ raise RuntimeError("INVALID NEWICK")
871
872
 
872
873
  # go to new child
873
874
  elif ts[i] == '(':
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: bscampp
3
- Version: 1.0.1a0
3
+ Version: 1.0.2
4
4
  Summary: BSCAMPP - A Scalable Phylogenetic Placement Tool
5
5
  Author-email: Eleanor Wedell <ewedell2@illinois.edu>, Chengze Shen <chengze5@illinois.edu>
6
6
  License: MIT License
@@ -51,6 +51,11 @@ Requires-Dist: treeswift>=1.1.45
51
51
  Requires-Dist: taxtastic>=0.9.3
52
52
 
53
53
  # BSCAMPP - A Scalable Phylogenetic Placement Method and Framework
54
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/bscampp)](https://pypi.org/project/bscampp/)
55
+ [![PyPI - Version](https://img.shields.io/pypi/v/bscampp?color=blue)](https://pypi.org/project/bscampp/#history)
56
+ [![Build Status](https://img.shields.io/github/actions/workflow/status/ewedell/BSCAMPP/python-package.yml?branch=main&label=build)](https://github.com/ewedell/BSCAMPP/)
57
+ [![PyPI - License](https://img.shields.io/pypi/l/bscampp?color=blue)](https://github.com/ewedell/BSCAMPP/blob/main/LICENSE)
58
+ [![Changelog](https://img.shields.io/badge/CHANGELOG-grey)](https://github.com/ewedell/BSCAMPP/blob/main/CHANGELOG.md)
54
59
 
55
60
  **Table of Contents**
56
61
  1. [Overview](#overview)
@@ -83,8 +88,12 @@ It is recommended that BSCAMPP be used with subtrees of size 2000 and with 5 vot
83
88
  are fragmentary. Defaults for the subtree size and number of votes are set to 2,000 and 5 respectively (see [Usage](#usage) for more details
84
89
  on customizing BSCAMPP).
85
90
 
91
+ #### SCAMPP
92
+ SCAMPP is also implemented in BSCAMPP. The user can invoke SCAMPP by running
93
+ `run_scampp.py` or `scampp` (if installed with PyPI) after installation.
94
+
86
95
  # Installation
87
- BSCAMPP was tested on **Python 3.7 to 3.12**. There are two ways to install and use BSCAMPP: (1) with PyPI, or
96
+ BSCAMPP was tested on **Python 3.8 to 3.12**. There are two ways to install and use BSCAMPP: (1) with PyPI, or
88
97
  (2) from this GitHub repository. If you have any difficulties installing or running BSCAMPP, please contact Eleanor Wedell
89
98
  (ewedell@illinois.edu).
90
99
 
@@ -93,19 +102,25 @@ EPA-ng and/or pplacer are requirements to run BSCAMPP since BSCAMPP will use the
93
102
  By default, BSCAMPP will search for binary executables of `pplacer` and `epa-ng` in the user's environment when running for the first time.
94
103
  We also included a compiled version of `pplacer` for the Linux system under `bscampp/tools`.
95
104
 
96
- ### (1) Install with `pip` (Coming soon)
105
+ ### (1) Install with `pip`
97
106
  The easiest way to install BSCAMPP is to use `pip install`. This will also install all required Python packages.
98
107
 
99
108
  ```bash
100
109
  # 1. install with pip (--user if no root access)
101
110
  pip install bscampp [--user]
102
111
 
103
- # 2. Two binary executables will be installed. The first time
112
+ # 2. Four binary executables will be installed. The first time
104
113
  # running any will create a config file at
105
114
  # ~/.bscampp/main.config that resolves the links to all
106
115
  # external software (e.g., epa-ng, pplacer)
116
+
117
+ # ---- BSCAMPP functions
107
118
  bscampp [-h] # or
108
119
  run_bscampp.py [-h]
120
+
121
+ # ---- SCAMPP functions
122
+ scampp [-h] # or
123
+ run_scampp.py [-h]
109
124
  ```
110
125
 
111
126
  ### (2) Install from GitHub
@@ -155,7 +170,8 @@ run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment
155
170
  ### (3) Using `pplacer` as the base placement method
156
171
  ```bash
157
172
  run_bscampp.py -i [logfile from either RAxML/FastTree] -t [reference tree] \
158
- -a [reference alignment] -q [query sequence alignment]
173
+ -a [reference alignment] -q [query sequence alignment] \
174
+ --placement-method pplacer
159
175
  ```
160
176
 
161
177
  ### More comprehensive usage
@@ -216,14 +232,23 @@ run_bscampp.py -i [logfile from either RAxML/FastTree] -t [reference tree] \
216
232
  > Temporary file indexing. Default: 0
217
233
  > --fragmentflag FRAGMENTFLAG
218
234
  > If queries contains fragments. Default: True
235
+ > --subtreetype SUBTREETYPE
236
+ > (SCAMPP only) Options for collecting nodes for the
237
+ > subtree - d for edge weighted distances, n for node
238
+ > distances, h for Hamming distances. Default: d
219
239
  > --keeptemp KEEPTEMP Boolean, True to keep all temporary files. Default:
220
240
  False
221
241
  ```
222
242
 
223
243
 
224
244
  # Example Code and Data
225
- Example script and data are provided in this GitHub repository in `examples/`. The data is originally from the [RNAsim-VS datasets](https://doi.org/10.1093/sysbio/syz063).
226
- * `examples/run.sh`: contains a simple script to test BSCAMPP with `epa-ng` or `pplacer`, placing 200 query sequences to a 10000-leaf placement tree.
227
- The info file is from RAxML-ng when running `epa-ng`, and from FastTree-2 when running `pplacer`.
228
- - `run.sh` will invoke BSCAMPP with `epa-ng`.
229
- - `run.sh pplacer` will invoke BSCAMPP with `pplacer`.
245
+ Example script and data are provided in this GitHub repository in `examples/`.
246
+ The data is originally from the
247
+ [RNAsim-VS datasets](https://doi.org/10.1093/sysbio/syz063).
248
+ * `examples/run_bscampp.sh`: contains a simple script to test BSCAMPP with
249
+ `epa-ng` or `pplacer`, placing 200 query sequences to a 10000-leaf placement
250
+ tree. The info file is from RAxML-ng when running `epa-ng`, and from
251
+ FastTree-2 when running `pplacer`.
252
+ - `run_bscampp.sh` will invoke BSCAMPP with `epa-ng`.
253
+ - `run_bscampp.sh pplacer` will invoke BSCAMPP with `pplacer`.
254
+ * `examples/run_scampp.sh`: the same test script but running SCAMPP.
@@ -1,11 +1,11 @@
1
- bscampp/__init__.py,sha256=WQ01lWQA9s702tTmbq99PAg_HE_-yyidAQTESo0ny-Y,2290
2
- bscampp/configs.py,sha256=XuzRbtcUE5bExe-vEZGZ1CeXBmp4oP7LWFveQySx2xs,5745
1
+ bscampp/__init__.py,sha256=2QetcqvH27YCbxcb-pncQRiLppyt80cKZE6qBtoNTNI,2289
2
+ bscampp/configs.py,sha256=3HJHLN2fLV5Tv3TJL95NpOuSXUV6CvqxRqCOM6TpbJQ,5767
3
3
  bscampp/default.config,sha256=CEfsUHBy--vwJhEcUuJ0btfuGQWb_lKMVWUIP9f5YGw,112
4
- bscampp/functions.py,sha256=xcx1phGZF1yCMKk41nP2_V-4F1FziwzX1pZEgmmMaCY,15338
4
+ bscampp/functions.py,sha256=Ou-etis4Dw-vW8ZrHESm8zW_ll6CDkCQNQQSZPlAddU,17300
5
5
  bscampp/init_configs.py,sha256=EA9sMN5jWj6zj2b-7tN19LhX2Ef61ByQLxQRLHAqLDM,3600
6
6
  bscampp/jobs.py,sha256=de0Dr3ynORwACJqVbeWDfqTwJhWvMYG-7yfRYirGx8M,6703
7
- bscampp/pipeline.py,sha256=5B8cZ3iPYUnzQMYRxAPvFdkIKDk136o5iPDtvPGPPQw,8972
8
- bscampp/utils.py,sha256=va8S6tHPezMaKcONRbcb8WlRdhNrUkV_5DTx57HoUSM,28968
7
+ bscampp/pipeline.py,sha256=C6I1vWeA6Rq_spPHy_il1FJA_DomWHUHYHLUUk9SnLk,13024
8
+ bscampp/utils.py,sha256=ragaI14Lqb2fVp_uYDkFQnV7a50G9-sUOWdVM-sNhUE,29005
9
9
  bscampp/tools/epa-ng,sha256=f3EVoZAAOXLN6l521qp-TrWDl5J2nqL3tGgjPaQE9WQ,3772096
10
10
  bscampp/tools/pplacer,sha256=p0H4eo9uuiYoWS_kJbPfauOV99i7BXJdZSiwXIuLxTw,7834576
11
11
  bscampp/tools/hamming_distance/CMakeLists.txt,sha256=yf9iq7Y61t3WObJHoR4RoGDEvUw_Q8JW3UnI4uh0cfU,389
@@ -17,9 +17,9 @@ bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp,sha256=xCmyAT-OZJOD
17
17
  bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp,sha256=eKxgODRlpf0hU84QjNhigvRhWCT9tiJZjA5oQFQ1bUk,7404
18
18
  bscampp/tools/hamming_distance/src/homology.cpp,sha256=ZE0uXZWQ-cN4U1Wk5kUr_KKHgzsgA6Sno-IViRa4tmI,6053
19
19
  bscampp/tools/hamming_distance/src/new_hamming.cpp,sha256=fBRm99RquBZgZjaLOn9xDI3cH9NchhrxKbL-11j8fmk,5342
20
- bscampp-1.0.1a0.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
21
- bscampp-1.0.1a0.dist-info/METADATA,sha256=spokmK4AGqKPzgZ9SdCQXxjMP3-UMQpJQNc8pG0wyFw,10509
22
- bscampp-1.0.1a0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
23
- bscampp-1.0.1a0.dist-info/entry_points.txt,sha256=dZygBmg2OncVyeLeIjh_9e-GBIOesFvMemyW9BRRcXY,113
24
- bscampp-1.0.1a0.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
25
- bscampp-1.0.1a0.dist-info/RECORD,,
20
+ bscampp-1.0.2.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
21
+ bscampp-1.0.2.dist-info/METADATA,sha256=hVJek64HM-2Bcsou5A4Kl8b_g3-Zu1IaTyNdSMh-1hI,11765
22
+ bscampp-1.0.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
23
+ bscampp-1.0.2.dist-info/entry_points.txt,sha256=4Ft83qHc39tNNpMLgSgFXDHM-vuAB99JtmczCQj5pq8,204
24
+ bscampp-1.0.2.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
25
+ bscampp-1.0.2.dist-info/RECORD,,
@@ -1,3 +1,5 @@
1
1
  [console_scripts]
2
2
  bscampp = bscampp.pipeline:bscampp_pipeline
3
3
  run_bscampp.py = bscampp.pipeline:bscampp_pipeline
4
+ run_scampp.py = bscampp.pipeline:scampp_pipeline
5
+ scampp = bscampp.pipeline:scampp_pipeline