bscampp 1.0.1a0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bscampp/__init__.py +1 -1
- bscampp/configs.py +1 -0
- bscampp/functions.py +72 -9
- bscampp/pipeline.py +133 -23
- bscampp/utils.py +22 -21
- {bscampp-1.0.1a0.dist-info → bscampp-1.0.2.dist-info}/METADATA +35 -10
- {bscampp-1.0.1a0.dist-info → bscampp-1.0.2.dist-info}/RECORD +11 -11
- {bscampp-1.0.1a0.dist-info → bscampp-1.0.2.dist-info}/entry_points.txt +2 -0
- {bscampp-1.0.1a0.dist-info → bscampp-1.0.2.dist-info}/LICENSE +0 -0
- {bscampp-1.0.1a0.dist-info → bscampp-1.0.2.dist-info}/WHEEL +0 -0
- {bscampp-1.0.1a0.dist-info → bscampp-1.0.2.dist-info}/top_level.txt +0 -0
bscampp/__init__.py
CHANGED
bscampp/configs.py
CHANGED
bscampp/functions.py
CHANGED
@@ -15,10 +15,13 @@ Function to read in the placement tree and alignment.
|
|
15
15
|
If query alignment is provided, will use the provided query instead of
|
16
16
|
the ones (potentially) included in the reference alignment
|
17
17
|
'''
|
18
|
-
def readData(workdir):
|
18
|
+
def readData(workdir, dry_run=False):
|
19
19
|
t0 = time.perf_counter()
|
20
20
|
_LOG.info('Reading in input data...')
|
21
21
|
|
22
|
+
if dry_run:
|
23
|
+
return None, dict(), '', dict(), '', dict()
|
24
|
+
|
22
25
|
# (1) load reference tree
|
23
26
|
tree = treeswift.read_tree_newick(Configs.tree_path)
|
24
27
|
tree.resolve_polytomies()
|
@@ -45,10 +48,10 @@ def readData(workdir):
|
|
45
48
|
# after separating queries from the reference alignment, write
|
46
49
|
# them to TEMP/
|
47
50
|
qaln_path = os.path.join(workdir, 'qaln.fa')
|
48
|
-
write_fasta(
|
51
|
+
write_fasta(qaln_path, q_dict)
|
49
52
|
|
50
53
|
aln_path = os.path.join(workdir, 'aln.fa')
|
51
|
-
write_fasta(
|
54
|
+
write_fasta(aln_path, ref_dict)
|
52
55
|
|
53
56
|
t1 = time.perf_counter()
|
54
57
|
_LOG.info('Time to read in input data: {} seconds'.format(t1 - t0))
|
@@ -58,19 +61,25 @@ def readData(workdir):
|
|
58
61
|
Function to get the closest leaf for each query sequence based on Hamming
|
59
62
|
distance
|
60
63
|
'''
|
61
|
-
def getClosestLeaves(aln_path, qaln_path, aln, qaln, workdir):
|
64
|
+
def getClosestLeaves(aln_path, qaln_path, aln, qaln, workdir, dry_run=False):
|
62
65
|
t0 = time.perf_counter()
|
63
66
|
_LOG.info('Computing closest leaves for query sequences...')
|
67
|
+
|
68
|
+
if dry_run:
|
69
|
+
return dict(), dict()
|
70
|
+
|
64
71
|
query_votes_dict = dict()
|
65
72
|
query_top_vote_dict = dict()
|
66
|
-
|
67
73
|
tmp_output = os.path.join(workdir, 'closest.txt')
|
74
|
+
|
75
|
+
if Configs.subtreetype == "h":
|
76
|
+
Configs.votes = Configs.subtreesize
|
68
77
|
|
69
78
|
cmd = []
|
70
79
|
if Configs.similarityflag:
|
71
80
|
cmd.append(os.path.join(Configs.hamming_distance_dir, 'homology'))
|
72
81
|
else:
|
73
|
-
if
|
82
|
+
if Configs.fragmentflag == False:
|
74
83
|
cmd.append(os.path.join(Configs.hamming_distance_dir, 'hamming'))
|
75
84
|
else:
|
76
85
|
cmd.append(os.path.join(
|
@@ -115,10 +124,13 @@ def getClosestLeaves(aln_path, qaln_path, aln, qaln, workdir):
|
|
115
124
|
Function to assign queries to subtrees based on their votes
|
116
125
|
'''
|
117
126
|
def assignQueriesToSubtrees(query_votes_dict, query_top_vote_dict,
|
118
|
-
tree, leaf_dict):
|
127
|
+
tree, leaf_dict, dry_run=False):
|
119
128
|
t0 = time.perf_counter()
|
120
129
|
_LOG.info('Adding query votes to the placement tree...')
|
121
130
|
|
131
|
+
if dry_run:
|
132
|
+
return dict(), []
|
133
|
+
|
122
134
|
# (1) go over the query votes and add them to corresponding leaves
|
123
135
|
lf_votes = Counter()
|
124
136
|
leaf_queries = dict()
|
@@ -217,6 +229,51 @@ def assignQueriesToSubtrees(query_votes_dict, query_top_vote_dict,
|
|
217
229
|
_LOG.info('Time to assign queries to subtrees: {} seconds'.format(t1 - t0))
|
218
230
|
return new_subtree_dict, placed_query_list
|
219
231
|
|
232
|
+
|
233
|
+
'''
|
234
|
+
Function to assign queries to subtrees as used in SCAMPP
|
235
|
+
(subtrees are built using the nearest leaf as the seed sequence)
|
236
|
+
'''
|
237
|
+
def buildQuerySubtrees(query_votes_dict, query_top_vote_dict,
|
238
|
+
tree, leaf_dict, dry_run=False):
|
239
|
+
t0 = time.perf_counter()
|
240
|
+
|
241
|
+
if dry_run:
|
242
|
+
return dict(), []
|
243
|
+
|
244
|
+
# (1) go over the query seed sequences to see if any queries use
|
245
|
+
# the same seed sequence (i.e. subtree)
|
246
|
+
seed_queries = dict()
|
247
|
+
for query, closest_leaf in query_top_vote_dict.items():
|
248
|
+
if closest_leaf not in seed_queries:
|
249
|
+
seed_queries[closest_leaf] = [query]
|
250
|
+
else:
|
251
|
+
seed_queries[closest_leaf].append(query)
|
252
|
+
|
253
|
+
new_subtree_dict = dict()
|
254
|
+
# build one subtree per seed leaf; all queries that share the same
|
255
|
+
# seed leaf are assigned to that subtree
|
256
|
+
for seed_label, queries in seed_queries.items():
|
257
|
+
node_y = leaf_dict[seed_label]
|
258
|
+
# extract [subtreesize] leaves
|
259
|
+
if Configs.subtreetype == "h":
|
260
|
+
labels = query_votes_dict[queries[0]]
|
261
|
+
elif Configs.subtreetype == "n":
|
262
|
+
labels = utils.subtree_nodes(tree, node_y, Configs.subtreesize)
|
263
|
+
else:
|
264
|
+
labels = utils.subtree_nodes_with_edge_length(tree, node_y,
|
265
|
+
Configs.subtreesize)
|
266
|
+
subtree = tree.extract_tree_with(labels)
|
267
|
+
new_subtree_dict[subtree] = queries
|
268
|
+
|
269
|
+
|
270
|
+
placed_query_list = []
|
271
|
+
|
272
|
+
t1 = time.perf_counter()
|
273
|
+
_LOG.info('Time to assign queries to subtrees: {} seconds'.format(t1 - t0))
|
274
|
+
return new_subtree_dict, placed_query_list
|
275
|
+
|
276
|
+
|
220
277
|
'''
|
221
278
|
Helper function to run a single placement task. Designed to use with
|
222
279
|
multiprocessing
|
@@ -229,10 +286,13 @@ def placeOneSubtree():
|
|
229
286
|
Function to perform placement of queries for each subtree
|
230
287
|
'''
|
231
288
|
def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
232
|
-
aln, qaln, cmdline_args, workdir, pool, lock):
|
289
|
+
aln, qaln, cmdline_args, workdir, pool, lock, dry_run=False):
|
233
290
|
t0 = time.perf_counter()
|
234
291
|
_LOG.info('Performing placement on each subtree...')
|
235
292
|
|
293
|
+
if dry_run:
|
294
|
+
return dict()
|
295
|
+
|
236
296
|
# prepare to write an aggregated results to local
|
237
297
|
jplace = dict()
|
238
298
|
utils.add_edge_nbrs(tree)
|
@@ -381,9 +441,12 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
381
441
|
'''
|
382
442
|
Function to write a given jplace object to local output
|
383
443
|
'''
|
384
|
-
def writeOutputJplace(output_jplace):
|
444
|
+
def writeOutputJplace(output_jplace, dry_run=False):
|
385
445
|
t0 = time.perf_counter()
|
386
446
|
_LOG.info('Writing aggregated placements to local...')
|
447
|
+
|
448
|
+
if dry_run:
|
449
|
+
return
|
387
450
|
|
388
451
|
outpath = os.path.join(Configs.outdir, Configs.outname)
|
389
452
|
outf = open(outpath, 'w')
|
bscampp/pipeline.py
CHANGED
@@ -22,8 +22,13 @@ def bscampp_pipeline(*args, **kwargs):
|
|
22
22
|
t0 = time.perf_counter()
|
23
23
|
m = Manager(); lock = m.Lock()
|
24
24
|
|
25
|
+
# set up a dry run if specified
|
26
|
+
dry_run = False
|
27
|
+
if 'dry_run' in kwargs and isinstance(kwargs['dry_run'], bool):
|
28
|
+
dry_run = kwargs['dry_run']
|
29
|
+
|
25
30
|
# parse command line arguments and build configurations
|
26
|
-
parser, cmdline_args = parseArguments()
|
31
|
+
parser, cmdline_args = parseArguments(dry_run=dry_run)
|
27
32
|
|
28
33
|
# initialize multiprocessing (if needed)
|
29
34
|
_LOG.warning('Initializing ProcessPoolExecutor...')
|
@@ -31,33 +36,39 @@ def bscampp_pipeline(*args, **kwargs):
|
|
31
36
|
initargs=(parser, cmdline_args,))
|
32
37
|
|
33
38
|
# (0) temporary files are written here
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
os.
|
38
|
-
|
39
|
-
|
39
|
+
if not dry_run:
|
40
|
+
workdir = os.path.join(Configs.outdir, f'tmp{Configs.tmpfilenbr}')
|
41
|
+
try:
|
42
|
+
if not os.path.isdir(workdir):
|
43
|
+
os.makedirs(workdir)
|
44
|
+
except OSError:
|
45
|
+
log_exception(_LOG)
|
46
|
+
else:
|
47
|
+
workdir = os.getcwd()
|
40
48
|
|
41
49
|
# (1) read in tree, alignment, and separate reference sequences from
|
42
50
|
# query sequences
|
43
|
-
tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir
|
51
|
+
tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir,
|
52
|
+
dry_run=dry_run)
|
44
53
|
|
45
54
|
# (2) compute closest leaves for all query sequences
|
46
55
|
query_votes_dict, query_top_vote_dict = getClosestLeaves(
|
47
|
-
aln_path, qaln_path, aln, qaln, workdir)
|
56
|
+
aln_path, qaln_path, aln, qaln, workdir, dry_run=dry_run)
|
48
57
|
|
49
58
|
# (3) first assign all queries to their closest-leaf subtrees,
|
50
59
|
# then do reassignment to minimize distance between each's top vote
|
51
60
|
# and the subtree's seed leaf
|
52
61
|
new_subtree_dict, placed_query_list = assignQueriesToSubtrees(
|
53
|
-
query_votes_dict, query_top_vote_dict, tree, leaf_dict
|
62
|
+
query_votes_dict, query_top_vote_dict, tree, leaf_dict,
|
63
|
+
dry_run=dry_run)
|
54
64
|
|
55
65
|
# (4) perform placement for each subtree
|
56
66
|
output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
|
57
|
-
placed_query_list, aln, qaln, cmdline_args, workdir, pool, lock
|
67
|
+
placed_query_list, aln, qaln, cmdline_args, workdir, pool, lock,
|
68
|
+
dry_run=dry_run)
|
58
69
|
|
59
70
|
# (5) write the output jplace to local
|
60
|
-
writeOutputJplace(output_jplace)
|
71
|
+
writeOutputJplace(output_jplace, dry_run=dry_run)
|
61
72
|
|
62
73
|
# shutdown pool
|
63
74
|
_LOG.warning('Shutting down ProcessPoolExecutor...')
|
@@ -73,6 +84,84 @@ def bscampp_pipeline(*args, **kwargs):
|
|
73
84
|
send = time.perf_counter()
|
74
85
|
_LOG.info('BSCAMPP completed in {} seconds...'.format(send - t0))
|
75
86
|
|
87
|
+
if dry_run:
|
88
|
+
return True
|
89
|
+
else:
|
90
|
+
return False
|
91
|
+
|
92
|
+
|
93
|
+
# main pipeline for SCAMPP
|
94
|
+
def scampp_pipeline(*args, **kwargs):
|
95
|
+
t0 = time.perf_counter()
|
96
|
+
m = Manager(); lock = m.Lock()
|
97
|
+
|
98
|
+
# set up a dry run if specified
|
99
|
+
dry_run = False
|
100
|
+
if 'dry_run' in kwargs and isinstance(kwargs['dry_run'], bool):
|
101
|
+
dry_run = kwargs['dry_run']
|
102
|
+
|
103
|
+
# parse command line arguments and build configurations
|
104
|
+
parser, cmdline_args = parseArguments(dry_run=dry_run, method="SCAMPP")
|
105
|
+
|
106
|
+
# initialize multiprocessing (if needed)
|
107
|
+
_LOG.warning('Initializing ProcessPoolExecutor...')
|
108
|
+
pool = ProcessPoolExecutor(Configs.num_cpus, initializer=initial_pool,
|
109
|
+
initargs=(parser, cmdline_args,))
|
110
|
+
|
111
|
+
# (0) temporary files are written here
|
112
|
+
if not dry_run:
|
113
|
+
workdir = os.path.join(Configs.outdir, f'tmp{Configs.tmpfilenbr}')
|
114
|
+
try:
|
115
|
+
if not os.path.isdir(workdir):
|
116
|
+
os.makedirs(workdir)
|
117
|
+
except OSError:
|
118
|
+
log_exception(_LOG)
|
119
|
+
else:
|
120
|
+
workdir = os.getcwd()
|
121
|
+
|
122
|
+
# (1) read in tree, alignment, and separate reference sequences from
|
123
|
+
# query sequences
|
124
|
+
tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir,
|
125
|
+
dry_run=dry_run)
|
126
|
+
|
127
|
+
# (2) compute closest leaves for all query sequences
|
128
|
+
query_votes_dict, query_top_vote_dict = getClosestLeaves(
|
129
|
+
aln_path, qaln_path, aln, qaln, workdir, dry_run=dry_run)
|
130
|
+
|
131
|
+
# (3) first assign each query to the subtree built using the closest
|
132
|
+
# leaf as the seed sequence
|
133
|
+
new_subtree_dict, placed_query_list = buildQuerySubtrees(
|
134
|
+
query_votes_dict, query_top_vote_dict, tree, leaf_dict,
|
135
|
+
dry_run=dry_run)
|
136
|
+
|
137
|
+
# (4) perform placement for each subtree
|
138
|
+
output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
|
139
|
+
placed_query_list, aln, qaln, cmdline_args, workdir, pool, lock,
|
140
|
+
dry_run=dry_run)
|
141
|
+
|
142
|
+
# (5) write the output jplace to local
|
143
|
+
writeOutputJplace(output_jplace, dry_run=dry_run)
|
144
|
+
|
145
|
+
# shutdown pool
|
146
|
+
_LOG.warning('Shutting down ProcessPoolExecutor...')
|
147
|
+
pool.shutdown()
|
148
|
+
_LOG.warning('ProcessPoolExecutor shut down.')
|
149
|
+
|
150
|
+
# clean up temp files if not keeping
|
151
|
+
if not Configs.keeptemp:
|
152
|
+
_LOG.info('Removing temporary files...')
|
153
|
+
clean_temp_files()
|
154
|
+
|
155
|
+
# stop SCAMPP
|
156
|
+
send = time.perf_counter()
|
157
|
+
_LOG.info('SCAMPP completed in {} seconds...'.format(send - t0))
|
158
|
+
|
159
|
+
if dry_run:
|
160
|
+
return True
|
161
|
+
else:
|
162
|
+
return False
|
163
|
+
|
164
|
+
|
76
165
|
def clean_temp_files():
|
77
166
|
# all temporary files/directories to remove
|
78
167
|
temp_items = [f'tmp{Configs.tmpfilenbr}']
|
@@ -86,29 +175,43 @@ def clean_temp_files():
|
|
86
175
|
continue
|
87
176
|
_LOG.info(f'- Removed {temp}')
|
88
177
|
|
89
|
-
def parseArguments():
|
178
|
+
def parseArguments(dry_run=False, method="BSCAMPP"):
|
90
179
|
global _root_dir, main_config_path
|
91
|
-
|
180
|
+
|
181
|
+
default_outdir = f"{method.lower()}_output"
|
182
|
+
default_outname = f"{method.lower()}_result"
|
183
|
+
|
184
|
+
parser = _init_parser(default_outdir=default_outdir,
|
185
|
+
default_outname=default_outname)
|
92
186
|
cmdline_args = sys.argv[1:]
|
187
|
+
|
188
|
+
if dry_run:
|
189
|
+
cmdline_args = ['-i', 'dummy.info', '-t', 'dummy.tre',
|
190
|
+
'-a', 'dummy.fa']
|
93
191
|
|
94
192
|
# build config
|
95
193
|
buildConfigs(parser, cmdline_args)
|
96
|
-
_LOG.info('
|
194
|
+
_LOG.info('{} is running with: {}'.format(method,
|
97
195
|
' '.join(cmdline_args)))
|
98
196
|
getConfigs()
|
99
197
|
|
100
198
|
return parser, cmdline_args
|
101
199
|
|
102
|
-
def _init_parser(
|
200
|
+
def _init_parser(default_outdir="bscampp_output",
|
201
|
+
default_outname="bscampp_result"):
|
103
202
|
# example usage
|
104
203
|
example_usages = '''Example usages:
|
105
|
-
>
|
106
|
-
%(prog)s -i raxml.
|
204
|
+
> (1) Default
|
205
|
+
%(prog)s -i raxml.bestModel -t reference.tre -a alignment.fa
|
206
|
+
> (2) Separate alignment file for query sequences
|
207
|
+
%(prog)s -i raxml.bestModel -t reference.tre -a reference.fa -q query.fa
|
208
|
+
> (3) Use pplacer instead of EPA-ng as base method (need RAxML-ng info or FastTree log file)
|
209
|
+
%(prog)s -i fasttree.log -t reference.tre -a alignment.fa --placement-method pplacer
|
107
210
|
'''
|
108
211
|
|
109
212
|
parser = ArgumentParser(
|
110
213
|
description=(
|
111
|
-
"This program runs BSCAMPP, a scalable phylogenetic "
|
214
|
+
"This program runs BSCAMPP/SCAMPP, a scalable phylogenetic "
|
112
215
|
"placement framework that scales EPA-ng/pplacer "
|
113
216
|
"to very large tree placement."
|
114
217
|
),
|
@@ -135,7 +238,7 @@ def _init_parser():
|
|
135
238
|
# basic group
|
136
239
|
basic_group = parser.add_argument_group(
|
137
240
|
"Basic parameters".upper(),
|
138
|
-
"These are the basic parameters for BSCAMPP.")
|
241
|
+
"These are the basic parameters for BSCAMPP/SCAMPP.")
|
139
242
|
parser.groups['basic_group'] = basic_group
|
140
243
|
|
141
244
|
basic_group.add_argument('--placement-method', type=str,
|
@@ -164,10 +267,10 @@ def _init_parser():
|
|
164
267
|
required=False, default=None)
|
165
268
|
basic_group.add_argument("-d", "--outdir", type=str,
|
166
269
|
help="Directory path for output. Default: bscampp_output/",
|
167
|
-
required=False, default=
|
270
|
+
required=False, default=default_outdir)
|
168
271
|
basic_group.add_argument("-o", "--output", type=str, dest="outname",
|
169
272
|
help="Output file name. Default: bscampp_result.jplace",
|
170
|
-
required=False, default="
|
273
|
+
required=False, default=f"{default_outname}.jplace")
|
171
274
|
basic_group.add_argument("--threads", "--num-cpus", type=int,
|
172
275
|
dest="num_cpus",
|
173
276
|
help="Number of cores for parallelization, default: -1 (all)",
|
@@ -188,7 +291,8 @@ def _init_parser():
|
|
188
291
|
help="Integer size of the subtree. Default: 2000",
|
189
292
|
required=False, default=2000)
|
190
293
|
advance_group.add_argument("-V", "--votes", type=int,
|
191
|
-
help="Number of votes per
|
294
|
+
help="This is only used for BSCAMPP! Number of votes per "
|
295
|
+
"query sequence. Default: 5",
|
192
296
|
required=False, default=5)
|
193
297
|
advance_group.add_argument("--similarityflag", type=str2bool,
|
194
298
|
help="Boolean, True if maximizing sequence similarity "
|
@@ -207,6 +311,12 @@ def _init_parser():
|
|
207
311
|
misc_group.add_argument("--fragmentflag", type=str2bool,
|
208
312
|
help="If queries contains fragments. Default: True",
|
209
313
|
required=False, default=True)
|
314
|
+
misc_group.add_argument("--subtreetype", type=str,
|
315
|
+
help="(SCAMPP only) Options for collecting "
|
316
|
+
"nodes for the subtree - d for edge weighted "
|
317
|
+
"distances, n for node distances, h for Hamming "
|
318
|
+
"distances. Default: d",
|
319
|
+
required=False, default='d')
|
210
320
|
misc_group.add_argument("--keeptemp", type=str2bool,
|
211
321
|
help="Boolean, True to keep all temporary files. "
|
212
322
|
"Default: False",
|
bscampp/utils.py
CHANGED
@@ -8,6 +8,7 @@ from os.path import expanduser,isfile
|
|
8
8
|
import random
|
9
9
|
import statistics
|
10
10
|
import copy
|
11
|
+
import gzip
|
11
12
|
|
12
13
|
import argparse
|
13
14
|
# reformat argparse help text formatting
|
@@ -813,25 +814,25 @@ def newick_edge_tokens_node(node):
|
|
813
814
|
node_to_str[node] = ''.join(out)
|
814
815
|
return node_to_str[node]
|
815
816
|
|
816
|
-
def write_tree_newick_edge_tokens(tree, filename, hide_rooted_prefix=False):
|
817
|
-
'''
|
818
|
-
Modified from treeswift tree.write_tree_newick()
|
819
|
-
Write this ``Tree`` to a Newick file
|
820
|
-
Args:
|
821
|
-
``filename`` (``str``): Path to desired output file (plain-text or gzipped)
|
822
|
-
'''
|
823
|
-
if not isinstance(filename, str):
|
824
|
-
raise TypeError("filename must be a str")
|
825
|
-
treestr = newick_edge_nbr_string(tree)
|
826
|
-
if hide_rooted_prefix:
|
827
|
-
if treestr.startswith('[&R]'):
|
828
|
-
treestr = treestr[4:].strip()
|
829
|
-
else:
|
830
|
-
warn("Specified hide_rooted_prefix, but tree was not rooted")
|
831
|
-
if filename.lower().endswith('.gz'): # gzipped file
|
832
|
-
f = gopen(expanduser(filename),'wb',9); f.write(treestr.encode()); f.close()
|
833
|
-
else: # plain-text file
|
834
|
-
f = open(expanduser(filename),'w'); f.write(treestr); f.close()
|
817
|
+
#def write_tree_newick_edge_tokens(tree, filename, hide_rooted_prefix=False):
|
818
|
+
# '''
|
819
|
+
# Modified from treeswift tree.write_tree_newick()
|
820
|
+
# Write this ``Tree`` to a Newick file
|
821
|
+
# Args:
|
822
|
+
# ``filename`` (``str``): Path to desired output file (plain-text or gzipped)
|
823
|
+
# '''
|
824
|
+
# if not isinstance(filename, str):
|
825
|
+
# raise TypeError("filename must be a str")
|
826
|
+
# treestr = newick_edge_nbr_string(tree)
|
827
|
+
# if hide_rooted_prefix:
|
828
|
+
# if treestr.startswith('[&R]'):
|
829
|
+
# treestr = treestr[4:].strip()
|
830
|
+
# else:
|
831
|
+
# warn("Specified hide_rooted_prefix, but tree was not rooted")
|
832
|
+
# if filename.lower().endswith('.gz'): # gzipped file
|
833
|
+
# f = gopen(expanduser(filename),'wb',9); f.write(treestr.encode()); f.close()
|
834
|
+
# else: # plain-text file
|
835
|
+
# f = open(expanduser(filename),'w'); f.write(treestr); f.close()
|
835
836
|
|
836
837
|
def read_tree_newick_edge_tokens(newick):
|
837
838
|
'''
|
@@ -850,7 +851,7 @@ def read_tree_newick_edge_tokens(newick):
|
|
850
851
|
except:
|
851
852
|
raise TypeError("newick must be a str")
|
852
853
|
if newick.lower().endswith('.gz'): # gzipped file
|
853
|
-
f =
|
854
|
+
f = gzip.open(expanduser(newick)); ts = f.read().decode().strip(); f.close()
|
854
855
|
elif isfile(expanduser(newick)): # plain-text file
|
855
856
|
f = open(expanduser(newick)); ts = f.read().strip(); f.close()
|
856
857
|
else:
|
@@ -867,7 +868,7 @@ def read_tree_newick_edge_tokens(newick):
|
|
867
868
|
# end of Newick string
|
868
869
|
if ts[i] == ';':
|
869
870
|
if i != len(ts)-1 or n != t.root:
|
870
|
-
raise RuntimeError(
|
871
|
+
raise RuntimeError("INVALID NEWICK")
|
871
872
|
|
872
873
|
# go to new child
|
873
874
|
elif ts[i] == '(':
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: bscampp
|
3
|
-
Version: 1.0.
|
3
|
+
Version: 1.0.2
|
4
4
|
Summary: BSCAMPP - A Scalable Phylogenetic Placement Tool
|
5
5
|
Author-email: Eleanor Wedell <ewedell2@illinois.edu>, Chengze Shen <chengze5@illinois.edu>
|
6
6
|
License: MIT License
|
@@ -51,6 +51,11 @@ Requires-Dist: treeswift>=1.1.45
|
|
51
51
|
Requires-Dist: taxtastic>=0.9.3
|
52
52
|
|
53
53
|
# BSCAMPP - A Scalable Phylogenetic Placement Method and Framework
|
54
|
+
[PyPI package](https://pypi.org/project/bscampp/)
|
55
|
+
[PyPI release history](https://pypi.org/project/bscampp/#history)
|
56
|
+
[GitHub repository](https://github.com/ewedell/BSCAMPP/)
|
57
|
+
[License](https://github.com/ewedell/BSCAMPP/blob/main/LICENSE)
|
58
|
+
[Changelog](https://github.com/ewedell/BSCAMPP/blob/main/CHANGELOG.md)
|
54
59
|
|
55
60
|
**Table of Contents**
|
56
61
|
1. [Overview](#overview)
|
@@ -83,8 +88,12 @@ It is recommended that BSCAMPP be used with subtrees of size 2000 and with 5 vot
|
|
83
88
|
are fragmentary. Defaults for the subtree size and number of votes are set to 2,000 and 5 respectively (see [Usage](#usage) for more details
|
84
89
|
on customizing BSCAMPP).
|
85
90
|
|
91
|
+
#### SCAMPP
|
92
|
+
SCAMPP is also implemented in BSCAMPP. The user can invoke SCAMPP by running
|
93
|
+
`run_scampp.py` or `scampp` (if installed with PyPI) after installation.
|
94
|
+
|
86
95
|
# Installation
|
87
|
-
BSCAMPP was tested on **Python 3.
|
96
|
+
BSCAMPP was tested on **Python 3.8 to 3.12**. There are two ways to install and use BSCAMPP: (1) with PyPI, or
|
88
97
|
(2) from this GitHub repository. If you have any difficulties installing or running BSCAMPP, please contact Eleanor Wedell
|
89
98
|
(ewedell@illinois.edu).
|
90
99
|
|
@@ -93,19 +102,25 @@ EPA-ng and/or pplacer are requirements to run BSCAMPP since BSCAMPP will use the
|
|
93
102
|
By default, BSCAMPP will search for binary executables of `pplacer` and `epa-ng` in the user's environment when running for the first time.
|
94
103
|
We also included a compiled version of `pplacer` for the Linux system under `bscampp/tools`.
|
95
104
|
|
96
|
-
### (1) Install with `pip`
|
105
|
+
### (1) Install with `pip`
|
97
106
|
The easiest way to install BSCAMPP is to use `pip install`. This will also install all required Python packages.
|
98
107
|
|
99
108
|
```bash
|
100
109
|
# 1. install with pip (--user if no root access)
|
101
110
|
pip install bscampp [--user]
|
102
111
|
|
103
|
-
# 2.
|
112
|
+
# 2. Four binary executables will be installed. The first time
|
104
113
|
# running any will create a config file at
|
105
114
|
# ~/.bscampp/main.config that resolves the links to all
|
106
115
|
# external software (e.g., epa-ng, pplacer)
|
116
|
+
|
117
|
+
# ---- BSCAMPP functions
|
107
118
|
bscampp [-h] # or
|
108
119
|
run_bscampp.py [-h]
|
120
|
+
|
121
|
+
# ---- SCAMPP functions
|
122
|
+
scampp [-h] # or
|
123
|
+
run_scampp.py [-h]
|
109
124
|
```
|
110
125
|
|
111
126
|
### (2) Install from GitHub
|
@@ -155,7 +170,8 @@ run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment
|
|
155
170
|
### (3) Using `pplacer` as the base placement method
|
156
171
|
```bash
|
157
172
|
run_bscampp.py -i [logfile from either RAxML/FastTree] -t [reference tree] \
|
158
|
-
-a [reference alignment] -q [query sequence alignment]
|
173
|
+
-a [reference alignment] -q [query sequence alignment] \
|
174
|
+
--placement-method pplacer
|
159
175
|
```
|
160
176
|
|
161
177
|
### More comprehensive usage
|
@@ -216,14 +232,23 @@ run_bscampp.py -i [logfile from either RAxML/FastTree] -t [reference tree] \
|
|
216
232
|
> Temporary file indexing. Default: 0
|
217
233
|
> --fragmentflag FRAGMENTFLAG
|
218
234
|
> If queries contains fragments. Default: True
|
235
|
+
> --subtreetype SUBTREETYPE
|
236
|
+
> (SCAMPP only) Options for collecting nodes for the
|
237
|
+
> subtree - d for edge weighted distances, n for node
|
238
|
+
> distances, h for Hamming distances. Default: d
|
219
239
|
> --keeptemp KEEPTEMP Boolean, True to keep all temporary files. Default:
|
220
240
|
False
|
221
241
|
```
|
222
242
|
|
223
243
|
|
224
244
|
# Example Code and Data
|
225
|
-
Example script and data are provided in this GitHub repository in `examples/`.
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
- `
|
245
|
+
Example script and data are provided in this GitHub repository in `examples/`.
|
246
|
+
The data is originally from the
|
247
|
+
[RNAsim-VS datasets](https://doi.org/10.1093/sysbio/syz063).
|
248
|
+
* `examples/run_bscampp.sh`: contains a simple script to test BSCAMPP with
|
249
|
+
`epa-ng` or `pplacer`, placing 200 query sequences into a 10000-leaf placement
|
250
|
+
tree. The info file is from RAxML-ng when running `epa-ng`, and from
|
251
|
+
FastTree-2 when running `pplacer`.
|
252
|
+
- `run_bscampp.sh` will invoke BSCAMPP with `epa-ng`.
|
253
|
+
- `run_bscampp.sh pplacer` will invoke BSCAMPP with `pplacer`.
|
254
|
+
* `examples/run_scampp.sh`: the same test script but running SCAMPP.
|
@@ -1,11 +1,11 @@
|
|
1
|
-
bscampp/__init__.py,sha256=
|
2
|
-
bscampp/configs.py,sha256=
|
1
|
+
bscampp/__init__.py,sha256=2QetcqvH27YCbxcb-pncQRiLppyt80cKZE6qBtoNTNI,2289
|
2
|
+
bscampp/configs.py,sha256=3HJHLN2fLV5Tv3TJL95NpOuSXUV6CvqxRqCOM6TpbJQ,5767
|
3
3
|
bscampp/default.config,sha256=CEfsUHBy--vwJhEcUuJ0btfuGQWb_lKMVWUIP9f5YGw,112
|
4
|
-
bscampp/functions.py,sha256=
|
4
|
+
bscampp/functions.py,sha256=Ou-etis4Dw-vW8ZrHESm8zW_ll6CDkCQNQQSZPlAddU,17300
|
5
5
|
bscampp/init_configs.py,sha256=EA9sMN5jWj6zj2b-7tN19LhX2Ef61ByQLxQRLHAqLDM,3600
|
6
6
|
bscampp/jobs.py,sha256=de0Dr3ynORwACJqVbeWDfqTwJhWvMYG-7yfRYirGx8M,6703
|
7
|
-
bscampp/pipeline.py,sha256=
|
8
|
-
bscampp/utils.py,sha256=
|
7
|
+
bscampp/pipeline.py,sha256=C6I1vWeA6Rq_spPHy_il1FJA_DomWHUHYHLUUk9SnLk,13024
|
8
|
+
bscampp/utils.py,sha256=ragaI14Lqb2fVp_uYDkFQnV7a50G9-sUOWdVM-sNhUE,29005
|
9
9
|
bscampp/tools/epa-ng,sha256=f3EVoZAAOXLN6l521qp-TrWDl5J2nqL3tGgjPaQE9WQ,3772096
|
10
10
|
bscampp/tools/pplacer,sha256=p0H4eo9uuiYoWS_kJbPfauOV99i7BXJdZSiwXIuLxTw,7834576
|
11
11
|
bscampp/tools/hamming_distance/CMakeLists.txt,sha256=yf9iq7Y61t3WObJHoR4RoGDEvUw_Q8JW3UnI4uh0cfU,389
|
@@ -17,9 +17,9 @@ bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp,sha256=xCmyAT-OZJOD
|
|
17
17
|
bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp,sha256=eKxgODRlpf0hU84QjNhigvRhWCT9tiJZjA5oQFQ1bUk,7404
|
18
18
|
bscampp/tools/hamming_distance/src/homology.cpp,sha256=ZE0uXZWQ-cN4U1Wk5kUr_KKHgzsgA6Sno-IViRa4tmI,6053
|
19
19
|
bscampp/tools/hamming_distance/src/new_hamming.cpp,sha256=fBRm99RquBZgZjaLOn9xDI3cH9NchhrxKbL-11j8fmk,5342
|
20
|
-
bscampp-1.0.
|
21
|
-
bscampp-1.0.
|
22
|
-
bscampp-1.0.
|
23
|
-
bscampp-1.0.
|
24
|
-
bscampp-1.0.
|
25
|
-
bscampp-1.0.
|
20
|
+
bscampp-1.0.2.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
|
21
|
+
bscampp-1.0.2.dist-info/METADATA,sha256=hVJek64HM-2Bcsou5A4Kl8b_g3-Zu1IaTyNdSMh-1hI,11765
|
22
|
+
bscampp-1.0.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
23
|
+
bscampp-1.0.2.dist-info/entry_points.txt,sha256=4Ft83qHc39tNNpMLgSgFXDHM-vuAB99JtmczCQj5pq8,204
|
24
|
+
bscampp-1.0.2.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
|
25
|
+
bscampp-1.0.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|