bscampp 1.0.3__py3-none-any.whl → 1.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bscampp/__init__.py +1 -1
- bscampp/configs.py +6 -1
- bscampp/functions.py +85 -27
- bscampp/jobs.py +5 -2
- bscampp/pipeline.py +40 -24
- bscampp/utils.py +38 -2
- {bscampp-1.0.3.dist-info → bscampp-1.0.5.dist-info}/METADATA +8 -7
- {bscampp-1.0.3.dist-info → bscampp-1.0.5.dist-info}/RECORD +12 -12
- {bscampp-1.0.3.dist-info → bscampp-1.0.5.dist-info}/WHEEL +1 -1
- {bscampp-1.0.3.dist-info → bscampp-1.0.5.dist-info}/LICENSE +0 -0
- {bscampp-1.0.3.dist-info → bscampp-1.0.5.dist-info}/entry_points.txt +0 -0
- {bscampp-1.0.3.dist-info → bscampp-1.0.5.dist-info}/top_level.txt +0 -0
bscampp/__init__.py
CHANGED
bscampp/configs.py
CHANGED
@@ -6,6 +6,7 @@ except ImportError:
|
|
6
6
|
from argparse import ArgumentParser, Namespace
|
7
7
|
from bscampp.init_configs import init_config_file
|
8
8
|
from bscampp import get_logger, log_exception
|
9
|
+
#from bscampp.utils import inferDataType
|
9
10
|
|
10
11
|
# detect home.path or create if missing
|
11
12
|
homepath = os.path.dirname(__file__) + '/home.path'
|
@@ -33,6 +34,9 @@ class Configs:
|
|
33
34
|
keeptemp = False # whether to keep all temporary files
|
34
35
|
verbose = 'INFO' # default verbose level to print
|
35
36
|
num_cpus = 1 # number of cores to use for parallelization
|
37
|
+
cpus_per_job = 2 # number of cores to use per job
|
38
|
+
max_workers = 1 # max_workers for ProcessPoolExecutor
|
39
|
+
# ... = max(1, num_cpus // cpus_per_job)
|
36
40
|
|
37
41
|
# binaries
|
38
42
|
pplacer_path = None
|
@@ -42,7 +46,6 @@ class Configs:
|
|
42
46
|
|
43
47
|
# placement settings
|
44
48
|
placement_method = 'epa-ng'
|
45
|
-
model = 'GTR'
|
46
49
|
subtreesize = 2000
|
47
50
|
votes = 5
|
48
51
|
similarityflag = True
|
@@ -162,6 +165,8 @@ def buildConfigs(parser, cmdline_args, child_process=False, rerun=False):
|
|
162
165
|
Configs.num_cpus = min(os.cpu_count(), Configs.num_cpus)
|
163
166
|
else:
|
164
167
|
Configs.num_cpus = os.cpu_count()
|
168
|
+
# compute max_workers based on num_cpus and cpus_per_job
|
169
|
+
Configs.max_workers = max(1, Configs.num_cpus // Configs.cpus_per_job)
|
165
170
|
|
166
171
|
# sanity check for existence of base placement binary path
|
167
172
|
if Configs.placement_method == 'epa-ng':
|
bscampp/functions.py
CHANGED
@@ -8,6 +8,8 @@ from bscampp.jobs import EPAngJob, TaxtasticJob, PplacerTaxtasticJob
|
|
8
8
|
from bscampp.utils import write_fasta
|
9
9
|
import bscampp.utils as utils
|
10
10
|
|
11
|
+
import concurrent.futures
|
12
|
+
|
11
13
|
_LOG = get_logger(__name__)
|
12
14
|
|
13
15
|
'''
|
@@ -20,7 +22,7 @@ def readData(workdir, dry_run=False):
|
|
20
22
|
_LOG.info('Reading in input data...')
|
21
23
|
|
22
24
|
if dry_run:
|
23
|
-
return None, dict(), '', dict(), '', dict()
|
25
|
+
return None, dict(), '', dict(), '', dict(), dict(), dict()
|
24
26
|
|
25
27
|
# (1) load reference tree
|
26
28
|
tree = treeswift.read_tree_newick(Configs.tree_path)
|
@@ -40,22 +42,39 @@ def readData(workdir, dry_run=False):
|
|
40
42
|
if Configs.qaln_path is not None:
|
41
43
|
ref_dict = utils.read_data(Configs.aln_path)
|
42
44
|
q_dict = utils.read_data(Configs.qaln_path)
|
43
|
-
aln_path, qaln_path = Configs.aln_path, Configs.qaln_path
|
45
|
+
#aln_path, qaln_path = Configs.aln_path, Configs.qaln_path
|
44
46
|
else:
|
45
47
|
aln_dict = utils.read_data(Configs.aln_path)
|
46
48
|
ref_dict, q_dict = utils.seperate(aln_dict, leaf_dict)
|
47
49
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
50
|
+
# after separating queries from the reference alignment, write
|
51
|
+
# them to TEMP/
|
52
|
+
# Updated on 3.5.2025 by Chengze Shen
|
53
|
+
# - regardless of the input choices, write a copy of both reference
|
54
|
+
# and query alignment to the workdir
|
55
|
+
qaln_path = os.path.join(workdir, 'qaln.fa')
|
56
|
+
write_fasta(qaln_path, q_dict)
|
57
|
+
|
58
|
+
aln_path = os.path.join(workdir, 'aln.fa')
|
59
|
+
write_fasta(aln_path, ref_dict)
|
60
|
+
|
61
|
+
# Added on 3.8.2025 by Chengze Shen
|
62
|
+
# - to ensure that query names containing any characters are handled correctly
|
63
|
+
# (e.g., having ":" can cause trouble), have a qname_map that maps
|
64
|
+
# each taxon name to an idx
|
65
|
+
qidx = 1
|
66
|
+
qname_map = dict()
|
67
|
+
qname_map_rev = dict()
|
68
|
+
for name in q_dict.keys():
|
69
|
+
cvt = str(qidx).zfill(16) # 16 digits
|
70
|
+
qname_map[name] = cvt
|
71
|
+
qname_map_rev[cvt] = name
|
72
|
+
qidx += 1
|
55
73
|
|
56
74
|
t1 = time.perf_counter()
|
57
75
|
_LOG.info('Time to read in input data: {} seconds'.format(t1 - t0))
|
58
|
-
return tree, leaf_dict, aln_path, ref_dict, qaln_path, q_dict
|
76
|
+
return tree, leaf_dict, aln_path, ref_dict, qaln_path, q_dict, \
|
77
|
+
qname_map, qname_map_rev
|
59
78
|
|
60
79
|
'''
|
61
80
|
Function to get the closest leaf for each query sequence based on Hamming
|
@@ -282,16 +301,30 @@ def buildQuerySubtrees(query_votes_dict, query_top_vote_dict,
|
|
282
301
|
'''
|
283
302
|
Helper function to run a single placement task. Designed to use with
|
284
303
|
multiprocessing
|
304
|
+
Input: job object
|
305
|
+
Return: outpath from job.run()
|
285
306
|
'''
|
286
|
-
def placeOneSubtree(
|
287
|
-
|
288
|
-
|
307
|
+
def placeOneSubtree(*jobs,
|
308
|
+
subtree_id=0, num_assigned_queries=-1, outpath=None, logging=None):
|
309
|
+
job_type = None
|
310
|
+
# record the last job_type and _outpath, which will be for the placement
|
311
|
+
# job
|
312
|
+
for job in jobs:
|
313
|
+
job_type = job.job_type
|
314
|
+
# run the job
|
315
|
+
_outpath = job.run(logging=logging)
|
316
|
+
|
317
|
+
# move output file for EPA-ng output
|
318
|
+
if job_type == 'epa-ng':
|
319
|
+
os.system('mv {} {}'.format(_outpath, outpath))
|
320
|
+
return subtree_id, num_assigned_queries, outpath
|
289
321
|
|
290
322
|
'''
|
291
323
|
Function to perform placement of queries for each subtree
|
292
324
|
'''
|
293
325
|
def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
294
|
-
aln, qaln, cmdline_args, workdir,
|
326
|
+
aln, qaln, cmdline_args, workdir, qname_map, qname_map_rev,
|
327
|
+
pool, lock, dry_run=False):
|
295
328
|
t0 = time.perf_counter()
|
296
329
|
_LOG.info('Performing placement on each subtree...')
|
297
330
|
|
@@ -307,22 +340,21 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
307
340
|
# go over the dictionary of subtrees and their assigned queries
|
308
341
|
# perform placement using either EPA-ng or pplacer
|
309
342
|
final_subtree_count, total_subtrees_examined = 0, 0
|
343
|
+
futures = []
|
344
|
+
_LOG.info("Submitting jobs for subtree placement...")
|
310
345
|
for subtree, query_list in new_subtree_dict.items():
|
311
346
|
total_subtrees_examined += 1
|
312
|
-
_LOG.info('- Subtree {}/{} with {} queries'.format(
|
313
|
-
total_subtrees_examined, len(new_subtree_dict), len(query_list)))
|
314
347
|
|
315
348
|
# empty subtree, continue
|
316
349
|
if len(query_list) == 0:
|
317
350
|
continue
|
318
|
-
final_subtree_count += 1
|
319
351
|
|
320
352
|
subtree_dir = os.path.join(workdir, f'subtree_{final_subtree_count}')
|
321
353
|
if not os.path.isdir(subtree_dir):
|
322
354
|
os.makedirs(subtree_dir)
|
323
355
|
|
324
356
|
# name all temporary output files
|
325
|
-
tmp_tree = os.path.join(subtree_dir, '
|
357
|
+
tmp_tree = os.path.join(subtree_dir, f'subtree_{final_subtree_count}.tre')
|
326
358
|
tmp_aln = os.path.join(subtree_dir, f'subtree_{final_subtree_count}_aln.fa')
|
327
359
|
tmp_qaln = os.path.join(subtree_dir, f'subtree_{final_subtree_count}_qaln.fa')
|
328
360
|
tmp_output = os.path.join(subtree_dir,
|
@@ -334,7 +366,10 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
334
366
|
if '' in tmp_leaf_dict:
|
335
367
|
del tmp_leaf_dict['']
|
336
368
|
tmp_ref_dict = {label : aln[label] for label in tmp_leaf_dict.keys()}
|
337
|
-
|
369
|
+
# Changed @ 3.8.2025 by Chengze Shen
|
370
|
+
# - write converted names for query sequences and convert them
|
371
|
+
# - back when placements are done
|
372
|
+
tmp_q_dict = {qname_map[name] : qaln[name] for name in query_list}
|
338
373
|
write_fasta(tmp_aln, tmp_ref_dict)
|
339
374
|
write_fasta(tmp_qaln, tmp_q_dict)
|
340
375
|
|
@@ -345,14 +380,16 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
345
380
|
|
346
381
|
# 1.27.2025 - Chengze Shen
|
347
382
|
# choose the placement method to run
|
383
|
+
jobs = []
|
348
384
|
if Configs.placement_method == 'epa-ng':
|
349
385
|
job = EPAngJob(path=Configs.epang_path,
|
350
386
|
info_path=Configs.info_path, tree_path=tmp_tree,
|
351
387
|
aln_path=tmp_aln, qaln_path=tmp_qaln,
|
352
388
|
outdir=subtree_dir, num_cpus=Configs.num_cpus)
|
353
|
-
|
354
|
-
|
355
|
-
|
389
|
+
jobs.append(job)
|
390
|
+
## for EPA-ng, ensure that outpath name is changed to the one we want
|
391
|
+
#_outpath = job.run(logging=f'subtree_{final_subtree_count}')
|
392
|
+
#os.system('mv {} {}'.format(_outpath, tmp_output))
|
356
393
|
elif Configs.placement_method == 'pplacer':
|
357
394
|
# build ref_pkg with info and tmp_tree and tmp_aln
|
358
395
|
refpkg_dir = os.path.join(subtree_dir,
|
@@ -361,17 +398,33 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
361
398
|
outdir=refpkg_dir, name=f'subtree_{final_subtree_count}',
|
362
399
|
aln_path=tmp_aln, tree_path=tmp_tree,
|
363
400
|
info_path=Configs.info_path)
|
364
|
-
|
401
|
+
jobs.append(taxit_job)
|
402
|
+
#_ = taxit_job.run()
|
365
403
|
|
366
404
|
# run pplacer-taxtastic
|
367
405
|
job = PplacerTaxtasticJob(path=Configs.pplacer_path,
|
368
|
-
refpkg_dir=refpkg_dir,
|
406
|
+
refpkg_dir=refpkg_dir,
|
407
|
+
#molecule=Configs.molecule, model=Configs.model,
|
369
408
|
outpath=tmp_output, num_cpus=Configs.num_cpus,
|
370
409
|
qaln_path=tmp_qaln)
|
371
|
-
tmp_output = job.run(logging=f'subtree_{final_subtree_count}')
|
410
|
+
#tmp_output = job.run(logging=f'subtree_{final_subtree_count}')
|
411
|
+
jobs.append(job)
|
372
412
|
else:
|
373
413
|
raise ValueError(
|
374
414
|
f"Placement method {Configs.placement_method} not recognized")
|
415
|
+
logging = f'subtree_{final_subtree_count}'
|
416
|
+
futures.append(pool.submit(placeOneSubtree, *jobs,
|
417
|
+
subtree_id=final_subtree_count,
|
418
|
+
num_assigned_queries=len(query_list),
|
419
|
+
outpath=tmp_output, logging=logging))
|
420
|
+
# increment final_subtree_count
|
421
|
+
final_subtree_count += 1
|
422
|
+
|
423
|
+
# deal with outputs
|
424
|
+
for future in concurrent.futures.as_completed(futures):
|
425
|
+
subtree_id, num_assigned_queries, tmp_output = future.result()
|
426
|
+
_LOG.info('- Subtree {}/{} with {} queries'.format(
|
427
|
+
subtree_id + 1, final_subtree_count, num_assigned_queries))
|
375
428
|
|
376
429
|
# read in each placement result
|
377
430
|
place_file = open(tmp_output, 'r')
|
@@ -391,8 +444,12 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
391
444
|
field_to_idx = {field: i for i, field in enumerate(fields)}
|
392
445
|
|
393
446
|
for tmp_place in place_json["placements"]:
|
394
|
-
#
|
395
|
-
|
447
|
+
# convert qname back using qname_map_rev
|
448
|
+
qname = qname_map_rev[tmp_place[tgt][0]]
|
449
|
+
tmp_place[tgt][0] = qname
|
450
|
+
placed_query_list.append(qname)
|
451
|
+
|
452
|
+
#placed_query_list.append(tmp_place[tgt][0])
|
396
453
|
for i in range(len(tmp_place["p"])):
|
397
454
|
edge_num = tmp_place["p"][i][
|
398
455
|
field_to_idx['edge_num']]
|
@@ -434,6 +491,7 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
434
491
|
|
435
492
|
placements.append(tmp_place.copy())
|
436
493
|
place_file.close()
|
494
|
+
|
437
495
|
_LOG.info(f'Final number of subtrees used: {final_subtree_count}')
|
438
496
|
|
439
497
|
# prepare the output jplace to write
|
bscampp/jobs.py
CHANGED
@@ -137,6 +137,8 @@ class EPAngJob(Job):
|
|
137
137
|
self.aln_path = ''
|
138
138
|
self.qaln_path = ''
|
139
139
|
self.outdir = ''
|
140
|
+
#self.molecule = ''
|
141
|
+
#self.model = ''
|
140
142
|
self.num_cpus = 1
|
141
143
|
|
142
144
|
for k, v in kwargs.items():
|
@@ -194,7 +196,7 @@ class PplacerTaxtasticJob(Job):
|
|
194
196
|
self.qaln_path = ''
|
195
197
|
self.outdir = ''
|
196
198
|
self.outpath = ''
|
197
|
-
self.model = 'GTR'
|
199
|
+
#self.model = 'GTR'
|
198
200
|
self.num_cpus = 1
|
199
201
|
|
200
202
|
for k, v in kwargs.items():
|
@@ -202,7 +204,8 @@ class PplacerTaxtasticJob(Job):
|
|
202
204
|
|
203
205
|
def get_invocation(self):
|
204
206
|
# outpath defined
|
205
|
-
cmd = [self.path,
|
207
|
+
cmd = [self.path,
|
208
|
+
#'-m', self.model,
|
206
209
|
'-c', self.refpkg_dir, '-o', self.outpath,
|
207
210
|
'-j', str(self.num_cpus), self.qaln_path]
|
208
211
|
return cmd, self.outpath
|
bscampp/pipeline.py
CHANGED
@@ -32,7 +32,9 @@ def bscampp_pipeline(*args, **kwargs):
|
|
32
32
|
|
33
33
|
# initialize multiprocessing (if needed)
|
34
34
|
_LOG.warning('Initializing ProcessPoolExecutor...')
|
35
|
-
|
35
|
+
# maximally concurrently run Configs.num_cpus // 2 jobs, each job
|
36
|
+
# can use 2 threads
|
37
|
+
pool = ProcessPoolExecutor(Configs.max_workers, initializer=initial_pool,
|
36
38
|
initargs=(parser, cmdline_args,))
|
37
39
|
|
38
40
|
# (0) temporary files wrote to here
|
@@ -48,8 +50,8 @@ def bscampp_pipeline(*args, **kwargs):
|
|
48
50
|
|
49
51
|
# (1) read in tree, alignment, and separate reference sequences from
|
50
52
|
# query sequences
|
51
|
-
tree, leaf_dict, aln_path, aln, qaln_path, qaln
|
52
|
-
dry_run=dry_run)
|
53
|
+
tree, leaf_dict, aln_path, aln, qaln_path, qaln, qname_map, qname_map_rev \
|
54
|
+
= readData(workdir, dry_run=dry_run)
|
53
55
|
|
54
56
|
# (2) compute closest leaves for all query sequences
|
55
57
|
query_votes_dict, query_top_vote_dict = getClosestLeaves(
|
@@ -64,8 +66,9 @@ def bscampp_pipeline(*args, **kwargs):
|
|
64
66
|
|
65
67
|
# (4) perform placement for each subtree
|
66
68
|
output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
|
67
|
-
placed_query_list, aln, qaln, cmdline_args, workdir,
|
68
|
-
|
69
|
+
placed_query_list, aln, qaln, cmdline_args, workdir,
|
70
|
+
qname_map, qname_map_rev,
|
71
|
+
pool, lock, dry_run=dry_run)
|
69
72
|
|
70
73
|
# (5) write the output jplace to local
|
71
74
|
writeOutputJplace(output_jplace, dry_run=dry_run)
|
@@ -121,8 +124,8 @@ def scampp_pipeline(*args, **kwargs):
|
|
121
124
|
|
122
125
|
# (1) read in tree, alignment, and separate reference sequences from
|
123
126
|
# query sequences
|
124
|
-
tree, leaf_dict, aln_path, aln, qaln_path, qaln
|
125
|
-
dry_run=dry_run)
|
127
|
+
tree, leaf_dict, aln_path, aln, qaln_path, qaln, qname_map, qname_map_rev \
|
128
|
+
= readData(workdir, dry_run=dry_run)
|
126
129
|
|
127
130
|
# (2) compute closest leaves for all query sequences
|
128
131
|
query_votes_dict, query_top_vote_dict = getClosestLeaves(
|
@@ -136,8 +139,9 @@ def scampp_pipeline(*args, **kwargs):
|
|
136
139
|
|
137
140
|
# (4) perform placement for each subtree
|
138
141
|
output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
|
139
|
-
placed_query_list, aln, qaln, cmdline_args, workdir,
|
140
|
-
|
142
|
+
placed_query_list, aln, qaln, cmdline_args, workdir,
|
143
|
+
qname_map, qname_map_rev,
|
144
|
+
pool, lock, dry_run=dry_run)
|
141
145
|
|
142
146
|
# (5) write the output jplace to local
|
143
147
|
writeOutputJplace(output_jplace, dry_run=dry_run)
|
@@ -257,14 +261,20 @@ def _init_parser(default_outdir="bscampp_output",
|
|
257
261
|
basic_group.add_argument("-a", "--alignment", "--aln-path", type=str,
|
258
262
|
dest="aln_path",
|
259
263
|
help=("Path for reference sequence alignment in "
|
260
|
-
"FASTA format
|
264
|
+
"FASTA format (can be a .gz file). "
|
265
|
+
"Optionally with query sequences. "
|
261
266
|
"Query alignment can be specified with --qaln-path"),
|
262
267
|
required=required, default=None)
|
263
268
|
basic_group.add_argument("-q", "--qalignment", "--qaln-path", type=str,
|
264
269
|
dest="qaln_path",
|
265
270
|
help=("Optionally provide path to query sequence alignment "
|
266
|
-
"in FASTA format. Default: None"),
|
271
|
+
"in FASTA format (can be a .gz file). Default: None"),
|
267
272
|
required=False, default=None)
|
273
|
+
#basic_group.add_argument("--molecule", type=str,
|
274
|
+
# choices=['nucl', 'nucleotide', 'prot', 'protein'],
|
275
|
+
# help=("Specify nucleotide or protein sequences. "
|
276
|
+
# "Default: infer datatype"),
|
277
|
+
# required=False, default=None)
|
268
278
|
basic_group.add_argument("-d", "--outdir", type=str,
|
269
279
|
help="Directory path for output. Default: bscampp_output/",
|
270
280
|
required=False, default=default_outdir)
|
@@ -275,6 +285,9 @@ def _init_parser(default_outdir="bscampp_output",
|
|
275
285
|
dest="num_cpus",
|
276
286
|
help="Number of cores for parallelization, default: -1 (all)",
|
277
287
|
required=False, default=-1)
|
288
|
+
basic_group.add_argument("--cpus-per-job", type=int,
|
289
|
+
help="Number of cores to use for each job, default: 2",
|
290
|
+
required=False, default=2)
|
278
291
|
|
279
292
|
# advanced parameter settings
|
280
293
|
advance_group = parser.add_argument_group(
|
@@ -284,16 +297,24 @@ def _init_parser(default_outdir="bscampp_output",
|
|
284
297
|
))
|
285
298
|
parser.groups['advance_group'] = advance_group
|
286
299
|
|
287
|
-
advance_group.add_argument("-m", "--model", type=str,
|
288
|
-
|
289
|
-
|
300
|
+
#advance_group.add_argument("-m", "--model", type=str,
|
301
|
+
# help=("Model used for edge distances. EPA-ng will use the "
|
302
|
+
# "provided info_path (*.bestModel) for model. "
|
303
|
+
# "Default: GTR for nucleotide, LG for protein"),
|
304
|
+
# required=False, default=None)
|
290
305
|
advance_group.add_argument("-b", "--subtreesize", type=int,
|
291
306
|
help="Integer size of the subtree. Default: 2000",
|
292
307
|
required=False, default=2000)
|
293
308
|
advance_group.add_argument("-V", "--votes", type=int,
|
294
|
-
help="
|
295
|
-
"
|
309
|
+
help="(BSCAMPP only) Number of votes per query sequence. "
|
310
|
+
"Default: 5",
|
296
311
|
required=False, default=5)
|
312
|
+
advance_group.add_argument("--subtreetype", type=str,
|
313
|
+
help="(SCAMPP only) Options for collecting "
|
314
|
+
"nodes for the subtree - d for edge weighted "
|
315
|
+
"distances, n for node distances, h for Hamming "
|
316
|
+
"distances. Default: d",
|
317
|
+
required=False, default='d')
|
297
318
|
advance_group.add_argument("--similarityflag", type=str2bool,
|
298
319
|
help="Boolean, True if maximizing sequence similarity "
|
299
320
|
"instead of simple Hamming distance (ignoring gap "
|
@@ -306,17 +327,12 @@ def _init_parser(default_outdir="bscampp_output",
|
|
306
327
|
parser.groups['misc_group'] = misc_group
|
307
328
|
|
308
329
|
misc_group.add_argument("-n","--tmpfilenbr", type=int,
|
309
|
-
help="Temporary file indexing. Default: 0",
|
330
|
+
help="Temporary file indexing (e.g., tmp0/). Default: 0",
|
310
331
|
required=False, default=0)
|
311
332
|
misc_group.add_argument("--fragmentflag", type=str2bool,
|
312
|
-
help="If queries contains fragments.
|
333
|
+
help="If queries contains fragments. Does not do anything "
|
334
|
+
"if similarity flag is set to True. Default: True",
|
313
335
|
required=False, default=True)
|
314
|
-
misc_group.add_argument("--subtreetype", type=str,
|
315
|
-
help="(SCAMPP only) Options for collecting "
|
316
|
-
"nodes for the subtree - d for edge weighted "
|
317
|
-
"distances, n for node distances, h for Hamming "
|
318
|
-
"distances. Default: d",
|
319
|
-
required=False, default='d')
|
320
336
|
misc_group.add_argument("--keeptemp", type=str2bool,
|
321
337
|
help="Boolean, True to keep all temporary files. "
|
322
338
|
"Default: False",
|
bscampp/utils.py
CHANGED
@@ -9,8 +9,11 @@ import random
|
|
9
9
|
import statistics
|
10
10
|
import copy
|
11
11
|
import gzip
|
12
|
-
|
13
12
|
import argparse
|
13
|
+
|
14
|
+
from bscampp import get_logger, log_exception
|
15
|
+
_LOG = get_logger(__name__)
|
16
|
+
|
14
17
|
# reformat argparse help text formatting
|
15
18
|
class SmartHelpFormatter(argparse.RawDescriptionHelpFormatter):
|
16
19
|
def add_text(self, text):
|
@@ -36,6 +39,34 @@ BRACKET = {
|
|
36
39
|
}
|
37
40
|
|
38
41
|
|
42
|
+
# infer datatype from input file
|
43
|
+
def inferDataType(path):
|
44
|
+
sequences = read_data(path)
|
45
|
+
acg, t, u, total = 0, 0, 0, 0
|
46
|
+
for taxon, seq in sequences.items():
|
47
|
+
letters = seq.upper()
|
48
|
+
for letter in letters:
|
49
|
+
total = total + 1
|
50
|
+
|
51
|
+
if letter in ('A', 'C', 'G', 'N'):
|
52
|
+
acg += 1
|
53
|
+
elif letter == 'T':
|
54
|
+
t += 1
|
55
|
+
elif letter == 'U':
|
56
|
+
u += 1
|
57
|
+
# dna -> nucleotide
|
58
|
+
if u == 0 and (acg + t) / total > 0.9:
|
59
|
+
datatype = 'nucleotide'
|
60
|
+
# rna -> nucleotide
|
61
|
+
elif t == 0 and (acg + u) / total > 0.9:
|
62
|
+
datatype = 'nucleotide'
|
63
|
+
# amino acid -> protein
|
64
|
+
else:
|
65
|
+
datatype = 'protein'
|
66
|
+
|
67
|
+
_LOG.info(f"Inferred input data type: {datatype}")
|
68
|
+
return datatype
|
69
|
+
|
39
70
|
def write_fasta(aln, aln_dict, aligned=True):
|
40
71
|
""" Write given dictionary as FASTA file out
|
41
72
|
|
@@ -76,7 +107,12 @@ def read_data(aln):
|
|
76
107
|
|
77
108
|
"""
|
78
109
|
|
79
|
-
|
110
|
+
# determine the file type, whether we have a .gz/.gzip file
|
111
|
+
suffix = aln.split('.')[-1]
|
112
|
+
if suffix in ['gz', 'gzip']:
|
113
|
+
f = gzip.open(aln, 'rt')
|
114
|
+
else:
|
115
|
+
f = open(aln)
|
80
116
|
result = dict()
|
81
117
|
|
82
118
|
taxa = ""
|
@@ -1,7 +1,7 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: bscampp
|
3
|
-
Version: 1.0.3
|
4
|
-
Summary: BSCAMPP -
|
3
|
+
Version: 1.0.5
|
4
|
+
Summary: BSCAMPP and SCAMPP - Scalable Phylogenetic Placement Tools
|
5
5
|
Author-email: Eleanor Wedell <ewedell2@illinois.edu>, Chengze Shen <chengze5@illinois.edu>
|
6
6
|
License: MIT License
|
7
7
|
|
@@ -66,7 +66,7 @@ Requires-Dist: taxtastic>=0.9.3
|
|
66
66
|
# Overview
|
67
67
|
* **Inputs**
|
68
68
|
1. Reference tree to place sequences into.
|
69
|
-
2. Alignment of reference sequences.
|
69
|
+
2. Alignment of reference sequences (protein or nucleotide).
|
70
70
|
3. Alignment of query sequences (can be combined with ii.).
|
71
71
|
4. Tree info file.
|
72
72
|
- (EPA-ng as base method), RAxML-ng info file, typically with suffix `.bestModel`.
|
@@ -230,16 +230,17 @@ run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment
|
|
230
230
|
> Output file name. Default: bscampp_result.jplace
|
231
231
|
> --threads NUM_CPUS, --num-cpus NUM_CPUS
|
232
232
|
> Number of cores for parallelization, default: -1 (all)
|
233
|
+
> --cpus-per-job CPUS_PER_JOB
|
234
|
+
> Number of cores to use for each job, default: 2
|
233
235
|
>
|
234
236
|
> ADVANCE PARAMETERS:
|
235
|
-
> These parameters control how BSCAMPP
|
237
|
+
> These parameters control how BSCAMPP and SCAMPP are run. The default values are set based on experiments.
|
236
238
|
>
|
237
|
-
> -m MODEL, --model MODEL
|
238
|
-
> Model used for edge distances. Default: GTR
|
239
239
|
> -b SUBTREESIZE, --subtreesize SUBTREESIZE
|
240
240
|
> Integer size of the subtree. Default: 2000
|
241
241
|
> -V VOTES, --votes VOTES
|
242
|
-
> Number of votes per query sequence.
|
242
|
+
> (BSCAMPP only) Number of votes per query sequence.
|
243
|
+
> Default: 5
|
243
244
|
> --similarityflag SIMILARITYFLAG
|
244
245
|
> Boolean, True if maximizing sequence similarity
|
245
246
|
> instead of simple Hamming distance (ignoring gap sites
|
@@ -1,11 +1,11 @@
|
|
1
|
-
bscampp/__init__.py,sha256=
|
2
|
-
bscampp/configs.py,sha256=
|
1
|
+
bscampp/__init__.py,sha256=ER9JtHb4EYnE1qyPBUmkpbsMeRC_4JDHUla46QUoInw,2289
|
2
|
+
bscampp/configs.py,sha256=perl6u5hto6J3JV1JMbsTQ6tqr2uGOk-Z9jfzflid0s,6122
|
3
3
|
bscampp/default.config,sha256=CEfsUHBy--vwJhEcUuJ0btfuGQWb_lKMVWUIP9f5YGw,112
|
4
|
-
bscampp/functions.py,sha256=
|
4
|
+
bscampp/functions.py,sha256=qzlxW-bIJi0woStCzraALPb6VEPlO3CdPfCdfQqT2fQ,20119
|
5
5
|
bscampp/init_configs.py,sha256=EA9sMN5jWj6zj2b-7tN19LhX2Ef61ByQLxQRLHAqLDM,3600
|
6
|
-
bscampp/jobs.py,sha256=
|
7
|
-
bscampp/pipeline.py,sha256=
|
8
|
-
bscampp/utils.py,sha256
|
6
|
+
bscampp/jobs.py,sha256=1FdvpSX_5VxNmJCCYAMdBKy8n68O1TjSET4XU1QULq0,7252
|
7
|
+
bscampp/pipeline.py,sha256=IPZnXZmVxGGfbVUuGCQh5X9oBq48-6pA9QkuvMGPTag,14000
|
8
|
+
bscampp/utils.py,sha256=-wns6FaWMKD2wVqjxdBQvjTdagTjywBIaGfqb2mupe4,30039
|
9
9
|
bscampp/tools/epa-ng,sha256=f3EVoZAAOXLN6l521qp-TrWDl5J2nqL3tGgjPaQE9WQ,3772096
|
10
10
|
bscampp/tools/pplacer,sha256=p0H4eo9uuiYoWS_kJbPfauOV99i7BXJdZSiwXIuLxTw,7834576
|
11
11
|
bscampp/tools/hamming_distance/CMakeLists.txt,sha256=yf9iq7Y61t3WObJHoR4RoGDEvUw_Q8JW3UnI4uh0cfU,389
|
@@ -17,9 +17,9 @@ bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp,sha256=xCmyAT-OZJOD
|
|
17
17
|
bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp,sha256=eKxgODRlpf0hU84QjNhigvRhWCT9tiJZjA5oQFQ1bUk,7404
|
18
18
|
bscampp/tools/hamming_distance/src/homology.cpp,sha256=ZE0uXZWQ-cN4U1Wk5kUr_KKHgzsgA6Sno-IViRa4tmI,6053
|
19
19
|
bscampp/tools/hamming_distance/src/new_hamming.cpp,sha256=fBRm99RquBZgZjaLOn9xDI3cH9NchhrxKbL-11j8fmk,5342
|
20
|
-
bscampp-1.0.
|
21
|
-
bscampp-1.0.
|
22
|
-
bscampp-1.0.
|
23
|
-
bscampp-1.0.
|
24
|
-
bscampp-1.0.
|
25
|
-
bscampp-1.0.
|
20
|
+
bscampp-1.0.5.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
|
21
|
+
bscampp-1.0.5.dist-info/METADATA,sha256=Nz0xmODp6N_e-u2VAgvegoVKW7134rM7UroUYqO7B0Q,12602
|
22
|
+
bscampp-1.0.5.dist-info/WHEEL,sha256=5U-5D1CS1IlCq2UZGreCPlzbpvhviDLR_iCQyI6CTvY,91
|
23
|
+
bscampp-1.0.5.dist-info/entry_points.txt,sha256=4Ft83qHc39tNNpMLgSgFXDHM-vuAB99JtmczCQj5pq8,204
|
24
|
+
bscampp-1.0.5.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
|
25
|
+
bscampp-1.0.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|