bscampp 1.0.3__py3-none-any.whl → 1.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bscampp/__init__.py +1 -1
- bscampp/configs.py +6 -1
- bscampp/functions.py +169 -34
- bscampp/jobs.py +18 -3
- bscampp/pipeline.py +40 -24
- bscampp/utils.py +38 -2
- {bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/METADATA +8 -7
- {bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/RECORD +12 -12
- {bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/WHEEL +1 -1
- {bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/LICENSE +0 -0
- {bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/entry_points.txt +0 -0
- {bscampp-1.0.3.dist-info → bscampp-1.0.6.dist-info}/top_level.txt +0 -0
bscampp/__init__.py
CHANGED
bscampp/configs.py
CHANGED
@@ -6,6 +6,7 @@ except ImportError:
|
|
6
6
|
from argparse import ArgumentParser, Namespace
|
7
7
|
from bscampp.init_configs import init_config_file
|
8
8
|
from bscampp import get_logger, log_exception
|
9
|
+
#from bscampp.utils import inferDataType
|
9
10
|
|
10
11
|
# detect home.path or create if missing
|
11
12
|
homepath = os.path.dirname(__file__) + '/home.path'
|
@@ -33,6 +34,9 @@ class Configs:
|
|
33
34
|
keeptemp = False # whether to keep all temporary files
|
34
35
|
verbose = 'INFO' # default verbose level to print
|
35
36
|
num_cpus = 1 # number of cores to use for parallelization
|
37
|
+
cpus_per_job = 2 # number of cores to use per job
|
38
|
+
max_workers = 1 # max_workers for ProcessPoolExecutor
|
39
|
+
# ... = max(1, num_cpus // cpus_per_job)
|
36
40
|
|
37
41
|
# binaries
|
38
42
|
pplacer_path = None
|
@@ -42,7 +46,6 @@ class Configs:
|
|
42
46
|
|
43
47
|
# placement settings
|
44
48
|
placement_method = 'epa-ng'
|
45
|
-
model = 'GTR'
|
46
49
|
subtreesize = 2000
|
47
50
|
votes = 5
|
48
51
|
similarityflag = True
|
@@ -162,6 +165,8 @@ def buildConfigs(parser, cmdline_args, child_process=False, rerun=False):
|
|
162
165
|
Configs.num_cpus = min(os.cpu_count(), Configs.num_cpus)
|
163
166
|
else:
|
164
167
|
Configs.num_cpus = os.cpu_count()
|
168
|
+
# compute max_workers based on num_cpus and cpus_per_job
|
169
|
+
Configs.max_workers = max(1, Configs.num_cpus // Configs.cpus_per_job)
|
165
170
|
|
166
171
|
# sanity check for existence of base placement binary path
|
167
172
|
if Configs.placement_method == 'epa-ng':
|
bscampp/functions.py
CHANGED
@@ -1,15 +1,80 @@
|
|
1
1
|
import json, time, os, sys
|
2
2
|
import treeswift
|
3
3
|
from collections import defaultdict, Counter
|
4
|
+
import subprocess
|
4
5
|
|
5
6
|
from bscampp import get_logger, log_exception
|
6
7
|
from bscampp.configs import Configs
|
7
|
-
from bscampp.jobs import EPAngJob, TaxtasticJob, PplacerTaxtasticJob
|
8
|
+
from bscampp.jobs import GenericJob, EPAngJob, TaxtasticJob, PplacerTaxtasticJob
|
8
9
|
from bscampp.utils import write_fasta
|
9
10
|
import bscampp.utils as utils
|
10
11
|
|
12
|
+
import concurrent.futures
|
13
|
+
|
11
14
|
_LOG = get_logger(__name__)
|
12
15
|
|
16
|
+
############################# helper functions ################################
|
17
|
+
'''
|
18
|
+
Function to recompile binaries from the given directory.
|
19
|
+
Assumption, the directory contains a CMakeLists.txt file
|
20
|
+
'''
|
21
|
+
def recompileBinariesFromDir(dir):
    """Rebuild the binaries in ``dir`` by running ``cmake`` and then ``make``.

    Both commands are executed with ``dir`` as the working directory. If a
    step cannot be launched or returns a non-zero code, the captured output
    is logged and the process exits.

    Args:
        dir: directory to build in (expected to contain a CMakeLists.txt).

    Raises:
        SystemExit: with the failing step's return code, or 1 if the tool
            could not be started at all.
    """
    _LOG.warning(f"Recompiling binaries with cmake/make at {dir}")

    # configure first, then build; the two steps share the same handling
    for tool, cmd in (('cmake', ['cmake', dir]), ('make', ['make'])):
        try:
            proc = subprocess.run(cmd, cwd=dir, stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE, text=True)
        except OSError as e:
            # e.g. the tool is not installed or dir is not accessible
            _LOG.error(f"{tool} failed! Unable to run {tool}: {e}")
            sys.exit(1)

        if proc.returncode != 0:
            # include the captured output so the failure can be diagnosed
            _LOG.error(f"{tool} failed!"
                    + '\nSTDOUT: ' + proc.stdout
                    + '\nSTDERR: ' + proc.stderr)
            sys.exit(proc.returncode)
        _LOG.warning(f"{tool} succeeded!")

    _LOG.warning(f"Successfully recompiled binaries at {dir}!")
|
48
|
+
|
49
|
+
'''
|
50
|
+
Function to check hamming/fragment_hamming/homology binaries are executable,
|
51
|
+
since they were compiled using dynamic library
|
52
|
+
'''
|
53
|
+
def ensureBinaryExecutable(binpath):
    """Check that ``binpath`` can be run; recompile its directory if not.

    The binary is launched without arguments. A return code of 255 or -1 is
    treated as "works"; a missing file, a launch failure, or any other return
    code triggers ``recompileBinariesFromDir`` on the binary's directory.

    Args:
        binpath: path to the binary to check.
    """
    bin_dir = os.path.dirname(binpath)

    needs_recompile = False
    if not os.path.exists(binpath):
        _LOG.warning(f"{binpath} does not exist!")
        needs_recompile = True
    else:
        try:
            p = subprocess.run([binpath], stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE)
        except OSError as e:
            # the file exists but cannot be launched (e.g. not executable,
            # or built for a different platform): rebuild it
            _LOG.warning(f"{binpath} cannot be executed: {e}")
            needs_recompile = True
        else:
            # 255 or -1 indicates that the binaries work
            if p.returncode not in (255, -1):
                _LOG.warning(f"{binpath} return code is {p.returncode}!")
                needs_recompile = True

    if needs_recompile:
        recompileBinariesFromDir(bin_dir)
    return
|
75
|
+
|
76
|
+
########################## end of helper functions ############################
|
77
|
+
|
13
78
|
'''
|
14
79
|
Function to read in the placement tree and alignment.
|
15
80
|
If query alignment is provided, will use the provided query instead of
|
@@ -20,7 +85,7 @@ def readData(workdir, dry_run=False):
|
|
20
85
|
_LOG.info('Reading in input data...')
|
21
86
|
|
22
87
|
if dry_run:
|
23
|
-
return None, dict(), '', dict(), '', dict()
|
88
|
+
return None, dict(), '', dict(), '', dict(), dict(), dict()
|
24
89
|
|
25
90
|
# (1) load reference tree
|
26
91
|
tree = treeswift.read_tree_newick(Configs.tree_path)
|
@@ -40,22 +105,44 @@ def readData(workdir, dry_run=False):
|
|
40
105
|
if Configs.qaln_path is not None:
|
41
106
|
ref_dict = utils.read_data(Configs.aln_path)
|
42
107
|
q_dict = utils.read_data(Configs.qaln_path)
|
43
|
-
aln_path, qaln_path = Configs.aln_path, Configs.qaln_path
|
108
|
+
#aln_path, qaln_path = Configs.aln_path, Configs.qaln_path
|
44
109
|
else:
|
45
110
|
aln_dict = utils.read_data(Configs.aln_path)
|
46
111
|
ref_dict, q_dict = utils.seperate(aln_dict, leaf_dict)
|
47
112
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
113
|
+
# Added on 3.8.2025 by Chengze Shen
|
114
|
+
# - to ensure that any characters from the query has correct names
|
115
|
+
# (e.g., having ":" can cause trouble), have a qname_map that maps
|
116
|
+
# each taxon name to an idx
|
117
|
+
qidx = 1
|
118
|
+
qname_map = dict()
|
119
|
+
qname_map_rev = dict()
|
120
|
+
for name in q_dict.keys():
|
121
|
+
cvt = str(qidx).zfill(16) # 16 digits
|
122
|
+
qname_map[name] = cvt
|
123
|
+
qname_map_rev[cvt] = name
|
124
|
+
qidx += 1
|
125
|
+
# modify q_dict as well
|
126
|
+
for name, cvt in qname_map.items():
|
127
|
+
q_dict[cvt] = q_dict[name]
|
128
|
+
q_dict.pop(name)
|
129
|
+
|
130
|
+
# after separating queries from the reference alignment, write
|
131
|
+
# them to TEMP/
|
132
|
+
# Updated on 3.5.2025 by Chengze Shen
|
133
|
+
# - regardless of the input choices, write a copy of both reference
|
134
|
+
# and query alignment to the workdir
|
135
|
+
qaln_path = os.path.join(workdir, 'qaln.fa')
|
136
|
+
write_fasta(qaln_path, q_dict)
|
137
|
+
|
138
|
+
aln_path = os.path.join(workdir, 'aln.fa')
|
139
|
+
write_fasta(aln_path, ref_dict)
|
140
|
+
|
55
141
|
|
56
142
|
t1 = time.perf_counter()
|
57
143
|
_LOG.info('Time to read in input data: {} seconds'.format(t1 - t0))
|
58
|
-
return tree, leaf_dict, aln_path, ref_dict, qaln_path, q_dict
|
144
|
+
return tree, leaf_dict, aln_path, ref_dict, qaln_path, q_dict, \
|
145
|
+
qname_map, qname_map_rev
|
59
146
|
|
60
147
|
'''
|
61
148
|
Function to get the closest leaf for each query sequence based on Hamming
|
@@ -75,18 +162,29 @@ def getClosestLeaves(aln_path, qaln_path, aln, qaln, workdir, dry_run=False):
|
|
75
162
|
if Configs.subtreetype == "h":
|
76
163
|
Configs.votes = Configs.subtreesize
|
77
164
|
|
78
|
-
cmd = []
|
79
165
|
if Configs.similarityflag:
|
80
|
-
|
166
|
+
job_type = 'homology'
|
81
167
|
else:
|
82
|
-
if Configs.fragmentflag
|
83
|
-
|
168
|
+
if Configs.fragmentflag:
|
169
|
+
job_type = 'fragment_hamming'
|
84
170
|
else:
|
85
|
-
|
86
|
-
|
171
|
+
job_type = 'hamming'
|
172
|
+
binpath = os.path.join(Configs.hamming_distance_dir, job_type)
|
173
|
+
cmd = [binpath]
|
174
|
+
|
175
|
+
# Added @ 3.9.2025 by Chengze Shen
|
176
|
+
# - check if binpath is executable, since the compiled files use dynamic
|
177
|
+
# libraries.
|
178
|
+
# If works: should have return code 255
|
179
|
+
# If not: should have return code 1,
|
180
|
+
# recompile the binaries using cmake and make
|
181
|
+
ensureBinaryExecutable(binpath)
|
182
|
+
|
87
183
|
cmd.extend([aln_path, str(len(aln)), qaln_path, str(len(qaln)),
|
88
184
|
tmp_output, str(Configs.votes)])
|
89
|
-
|
185
|
+
job = GenericJob(cmd=cmd, job_type=job_type)
|
186
|
+
_ = job.run()
|
187
|
+
#os.system(' '.join(cmd))
|
90
188
|
|
91
189
|
# process closest leaves
|
92
190
|
unusable_queries = set()
|
@@ -282,16 +380,30 @@ def buildQuerySubtrees(query_votes_dict, query_top_vote_dict,
|
|
282
380
|
'''
|
283
381
|
Helper function to run a single placement task. Designed to use with
|
284
382
|
multiprocessing
|
383
|
+
Input: job object
|
384
|
+
Return: outpath from job.run()
|
285
385
|
'''
|
286
|
-
def placeOneSubtree(
|
287
|
-
|
288
|
-
|
386
|
+
def placeOneSubtree(*jobs,
        subtree_id=0, num_assigned_queries=-1, outpath=None, logging=None):
    """Run the given jobs in order and report where the placement output is.

    Args:
        *jobs: job objects exposing ``job_type`` and ``run(logging=...)``;
            they are run sequentially in the order given.
        subtree_id: returned to the caller unchanged.
        num_assigned_queries: returned to the caller unchanged.
        outpath: final path for the placement output.
        logging: forwarded to every ``job.run`` call.

    Returns:
        Tuple ``(subtree_id, num_assigned_queries, outpath)``.
    """
    # imported locally so the block does not rely on a file-level import
    import shutil

    # keep the job_type and output path of the last job, which is the
    # placement job
    job_type, _outpath = None, None
    for job in jobs:
        job_type = job.job_type
        _outpath = job.run(logging=logging)

    # EPA-ng writes to its own output path; move the file to the requested
    # one. shutil.move avoids building a shell command from the paths, so
    # names with spaces or shell metacharacters work and failures raise.
    if job_type == 'epa-ng':
        shutil.move(_outpath, outpath)
    return subtree_id, num_assigned_queries, outpath
|
289
400
|
|
290
401
|
'''
|
291
402
|
Function to perform placement of queries for each subtree
|
292
403
|
'''
|
293
404
|
def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
294
|
-
aln, qaln, cmdline_args, workdir,
|
405
|
+
aln, qaln, cmdline_args, workdir, qname_map, qname_map_rev,
|
406
|
+
pool, lock, dry_run=False):
|
295
407
|
t0 = time.perf_counter()
|
296
408
|
_LOG.info('Performing placement on each subtree...')
|
297
409
|
|
@@ -307,22 +419,21 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
307
419
|
# go over the dictionary of subtrees and their assigned queries
|
308
420
|
# perform placement using either EPA-ng or pplacer
|
309
421
|
final_subtree_count, total_subtrees_examined = 0, 0
|
422
|
+
futures = []
|
423
|
+
_LOG.info("Submitting jobs for subtree placement...")
|
310
424
|
for subtree, query_list in new_subtree_dict.items():
|
311
425
|
total_subtrees_examined += 1
|
312
|
-
_LOG.info('- Subtree {}/{} with {} queries'.format(
|
313
|
-
total_subtrees_examined, len(new_subtree_dict), len(query_list)))
|
314
426
|
|
315
427
|
# empty subtree, continue
|
316
428
|
if len(query_list) == 0:
|
317
429
|
continue
|
318
|
-
final_subtree_count += 1
|
319
430
|
|
320
431
|
subtree_dir = os.path.join(workdir, f'subtree_{final_subtree_count}')
|
321
432
|
if not os.path.isdir(subtree_dir):
|
322
433
|
os.makedirs(subtree_dir)
|
323
434
|
|
324
435
|
# name all temporary output files
|
325
|
-
tmp_tree = os.path.join(subtree_dir, '
|
436
|
+
tmp_tree = os.path.join(subtree_dir, f'subtree_{final_subtree_count}.tre')
|
326
437
|
tmp_aln = os.path.join(subtree_dir, f'subtree_{final_subtree_count}_aln.fa')
|
327
438
|
tmp_qaln = os.path.join(subtree_dir, f'subtree_{final_subtree_count}_qaln.fa')
|
328
439
|
tmp_output = os.path.join(subtree_dir,
|
@@ -345,14 +456,16 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
345
456
|
|
346
457
|
# 1.27.2025 - Chengze Shen
|
347
458
|
# choose the placement method to run
|
459
|
+
jobs = []
|
348
460
|
if Configs.placement_method == 'epa-ng':
|
349
461
|
job = EPAngJob(path=Configs.epang_path,
|
350
462
|
info_path=Configs.info_path, tree_path=tmp_tree,
|
351
463
|
aln_path=tmp_aln, qaln_path=tmp_qaln,
|
352
464
|
outdir=subtree_dir, num_cpus=Configs.num_cpus)
|
353
|
-
|
354
|
-
|
355
|
-
|
465
|
+
jobs.append(job)
|
466
|
+
## for EPA-ng, ensure that outpath name is changed to the one we want
|
467
|
+
#_outpath = job.run(logging=f'subtree_{final_subtree_count}')
|
468
|
+
#os.system('mv {} {}'.format(_outpath, tmp_output))
|
356
469
|
elif Configs.placement_method == 'pplacer':
|
357
470
|
# build ref_pkg with info and tmp_tree and tmp_aln
|
358
471
|
refpkg_dir = os.path.join(subtree_dir,
|
@@ -361,17 +474,33 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
361
474
|
outdir=refpkg_dir, name=f'subtree_{final_subtree_count}',
|
362
475
|
aln_path=tmp_aln, tree_path=tmp_tree,
|
363
476
|
info_path=Configs.info_path)
|
364
|
-
|
477
|
+
jobs.append(taxit_job)
|
478
|
+
#_ = taxit_job.run()
|
365
479
|
|
366
480
|
# run pplacer-taxtastic
|
367
481
|
job = PplacerTaxtasticJob(path=Configs.pplacer_path,
|
368
|
-
refpkg_dir=refpkg_dir,
|
482
|
+
refpkg_dir=refpkg_dir,
|
483
|
+
#molecule=Configs.molecule, model=Configs.model,
|
369
484
|
outpath=tmp_output, num_cpus=Configs.num_cpus,
|
370
485
|
qaln_path=tmp_qaln)
|
371
|
-
tmp_output = job.run(logging=f'subtree_{final_subtree_count}')
|
486
|
+
#tmp_output = job.run(logging=f'subtree_{final_subtree_count}')
|
487
|
+
jobs.append(job)
|
372
488
|
else:
|
373
489
|
raise ValueError(
|
374
490
|
f"Placement method {Configs.placement_method} not recognized")
|
491
|
+
logging = f'subtree_{final_subtree_count}'
|
492
|
+
futures.append(pool.submit(placeOneSubtree, *jobs,
|
493
|
+
subtree_id=final_subtree_count,
|
494
|
+
num_assigned_queries=len(query_list),
|
495
|
+
outpath=tmp_output, logging=logging))
|
496
|
+
# increment final_subtree_count
|
497
|
+
final_subtree_count += 1
|
498
|
+
|
499
|
+
# deal with outputs
|
500
|
+
for future in concurrent.futures.as_completed(futures):
|
501
|
+
subtree_id, num_assigned_queries, tmp_output = future.result()
|
502
|
+
_LOG.info('- Subtree {}/{} with {} queries'.format(
|
503
|
+
subtree_id + 1, final_subtree_count, num_assigned_queries))
|
375
504
|
|
376
505
|
# read in each placement result
|
377
506
|
place_file = open(tmp_output, 'r')
|
@@ -391,8 +520,12 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
391
520
|
field_to_idx = {field: i for i, field in enumerate(fields)}
|
392
521
|
|
393
522
|
for tmp_place in place_json["placements"]:
|
394
|
-
#
|
395
|
-
|
523
|
+
# convert qname back using qname_map_rev
|
524
|
+
qname = qname_map_rev[tmp_place[tgt][0]]
|
525
|
+
tmp_place[tgt][0] = qname
|
526
|
+
placed_query_list.append(qname)
|
527
|
+
|
528
|
+
#placed_query_list.append(tmp_place[tgt][0])
|
396
529
|
for i in range(len(tmp_place["p"])):
|
397
530
|
edge_num = tmp_place["p"][i][
|
398
531
|
field_to_idx['edge_num']]
|
@@ -434,6 +567,7 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
434
567
|
|
435
568
|
placements.append(tmp_place.copy())
|
436
569
|
place_file.close()
|
570
|
+
|
437
571
|
_LOG.info(f'Final number of subtrees used: {final_subtree_count}')
|
438
572
|
|
439
573
|
# prepare the output jplace to write
|
@@ -447,6 +581,7 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
447
581
|
_LOG.info('Time to place queries to subtrees: {} seconds'.format(t1 - t0))
|
448
582
|
return jplace
|
449
583
|
|
584
|
+
|
450
585
|
'''
|
451
586
|
Function to write a given jplace object to local output
|
452
587
|
'''
|
bscampp/jobs.py
CHANGED
@@ -112,7 +112,7 @@ class Job(object):
|
|
112
112
|
else:
|
113
113
|
_LOG.error(error_msg + '\nSTDOUT: ' + stdout +
|
114
114
|
'\nSTDERR: ' + stderr + logpath)
|
115
|
-
exit(
|
115
|
+
exit(self.returncode)
|
116
116
|
except Exception:
|
117
117
|
log_exception(_LOG)
|
118
118
|
|
@@ -123,6 +123,18 @@ class Job(object):
|
|
123
123
|
raise NotImplementedError(
|
124
124
|
'get_invocation() should be implemented by subclasses.')
|
125
125
|
|
126
|
+
'''
|
127
|
+
Generic job that runs the given command, represented as a list of strings
|
128
|
+
'''
|
129
|
+
class GenericJob(Job):
    """Job that runs an arbitrary command given as a list of strings."""

    def __init__(self, cmd=None, job_type='external'):
        """Store the command and a label for the job.

        Args:
            cmd: command and arguments as a list of strings; defaults to an
                empty command.
            job_type: label stored on the instance as ``job_type``.
        """
        Job.__init__(self)
        self.job_type = job_type
        # copy so that instances never share (or alias) a mutable list
        self.cmd = list(cmd) if cmd is not None else []

    def get_invocation(self):
        """Return ``(cmd, None)``: the command, and no output path."""
        return self.cmd, None
|
137
|
+
|
126
138
|
'''
|
127
139
|
A EPA-ng job that runs EPA-ng with given parameters
|
128
140
|
'''
|
@@ -137,6 +149,8 @@ class EPAngJob(Job):
|
|
137
149
|
self.aln_path = ''
|
138
150
|
self.qaln_path = ''
|
139
151
|
self.outdir = ''
|
152
|
+
#self.molecule = ''
|
153
|
+
#self.model = ''
|
140
154
|
self.num_cpus = 1
|
141
155
|
|
142
156
|
for k, v in kwargs.items():
|
@@ -194,7 +208,7 @@ class PplacerTaxtasticJob(Job):
|
|
194
208
|
self.qaln_path = ''
|
195
209
|
self.outdir = ''
|
196
210
|
self.outpath = ''
|
197
|
-
self.model = 'GTR'
|
211
|
+
#self.model = 'GTR'
|
198
212
|
self.num_cpus = 1
|
199
213
|
|
200
214
|
for k, v in kwargs.items():
|
@@ -202,7 +216,8 @@ class PplacerTaxtasticJob(Job):
|
|
202
216
|
|
203
217
|
def get_invocation(self):
|
204
218
|
# outpath defined
|
205
|
-
cmd = [self.path,
|
219
|
+
cmd = [self.path,
|
220
|
+
#'-m', self.model,
|
206
221
|
'-c', self.refpkg_dir, '-o', self.outpath,
|
207
222
|
'-j', str(self.num_cpus), self.qaln_path]
|
208
223
|
return cmd, self.outpath
|
bscampp/pipeline.py
CHANGED
@@ -32,7 +32,9 @@ def bscampp_pipeline(*args, **kwargs):
|
|
32
32
|
|
33
33
|
# initialize multiprocessing (if needed)
|
34
34
|
_LOG.warning('Initializing ProcessPoolExecutor...')
|
35
|
-
|
35
|
+
# maximally concurrently run Configs.num_cpus // 2 jobs, each job
|
36
|
+
# can use 2 threads
|
37
|
+
pool = ProcessPoolExecutor(Configs.max_workers, initializer=initial_pool,
|
36
38
|
initargs=(parser, cmdline_args,))
|
37
39
|
|
38
40
|
# (0) temporary files wrote to here
|
@@ -48,8 +50,8 @@ def bscampp_pipeline(*args, **kwargs):
|
|
48
50
|
|
49
51
|
# (1) read in tree, alignment, and separate reference sequences from
|
50
52
|
# query sequences
|
51
|
-
tree, leaf_dict, aln_path, aln, qaln_path, qaln
|
52
|
-
dry_run=dry_run)
|
53
|
+
tree, leaf_dict, aln_path, aln, qaln_path, qaln, qname_map, qname_map_rev \
|
54
|
+
= readData(workdir, dry_run=dry_run)
|
53
55
|
|
54
56
|
# (2) compute closest leaves for all query sequences
|
55
57
|
query_votes_dict, query_top_vote_dict = getClosestLeaves(
|
@@ -64,8 +66,9 @@ def bscampp_pipeline(*args, **kwargs):
|
|
64
66
|
|
65
67
|
# (4) perform placement for each subtree
|
66
68
|
output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
|
67
|
-
placed_query_list, aln, qaln, cmdline_args, workdir,
|
68
|
-
|
69
|
+
placed_query_list, aln, qaln, cmdline_args, workdir,
|
70
|
+
qname_map, qname_map_rev,
|
71
|
+
pool, lock, dry_run=dry_run)
|
69
72
|
|
70
73
|
# (5) write the output jplace to local
|
71
74
|
writeOutputJplace(output_jplace, dry_run=dry_run)
|
@@ -121,8 +124,8 @@ def scampp_pipeline(*args, **kwargs):
|
|
121
124
|
|
122
125
|
# (1) read in tree, alignment, and separate reference sequences from
|
123
126
|
# query sequences
|
124
|
-
tree, leaf_dict, aln_path, aln, qaln_path, qaln
|
125
|
-
dry_run=dry_run)
|
127
|
+
tree, leaf_dict, aln_path, aln, qaln_path, qaln, qname_map, qname_map_rev \
|
128
|
+
= readData(workdir, dry_run=dry_run)
|
126
129
|
|
127
130
|
# (2) compute closest leaves for all query sequences
|
128
131
|
query_votes_dict, query_top_vote_dict = getClosestLeaves(
|
@@ -136,8 +139,9 @@ def scampp_pipeline(*args, **kwargs):
|
|
136
139
|
|
137
140
|
# (4) perform placement for each subtree
|
138
141
|
output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
|
139
|
-
placed_query_list, aln, qaln, cmdline_args, workdir,
|
140
|
-
|
142
|
+
placed_query_list, aln, qaln, cmdline_args, workdir,
|
143
|
+
qname_map, qname_map_rev,
|
144
|
+
pool, lock, dry_run=dry_run)
|
141
145
|
|
142
146
|
# (5) write the output jplace to local
|
143
147
|
writeOutputJplace(output_jplace, dry_run=dry_run)
|
@@ -257,14 +261,20 @@ def _init_parser(default_outdir="bscampp_output",
|
|
257
261
|
basic_group.add_argument("-a", "--alignment", "--aln-path", type=str,
|
258
262
|
dest="aln_path",
|
259
263
|
help=("Path for reference sequence alignment in "
|
260
|
-
"FASTA format
|
264
|
+
"FASTA format (can be a .gz file). "
|
265
|
+
"Optionally with query sequences. "
|
261
266
|
"Query alignment can be specified with --qaln-path"),
|
262
267
|
required=required, default=None)
|
263
268
|
basic_group.add_argument("-q", "--qalignment", "--qaln-path", type=str,
|
264
269
|
dest="qaln_path",
|
265
270
|
help=("Optionally provide path to query sequence alignment "
|
266
|
-
"in FASTA format. Default: None"),
|
271
|
+
"in FASTA format (can be a .gz file). Default: None"),
|
267
272
|
required=False, default=None)
|
273
|
+
#basic_group.add_argument("--molecule", type=str,
|
274
|
+
# choices=['nucl', 'nucleotide', 'prot', 'protein'],
|
275
|
+
# help=("Specify nucleotide or protein sequences. "
|
276
|
+
# "Default: infer datatype"),
|
277
|
+
# required=False, default=None)
|
268
278
|
basic_group.add_argument("-d", "--outdir", type=str,
|
269
279
|
help="Directory path for output. Default: bscampp_output/",
|
270
280
|
required=False, default=default_outdir)
|
@@ -275,6 +285,9 @@ def _init_parser(default_outdir="bscampp_output",
|
|
275
285
|
dest="num_cpus",
|
276
286
|
help="Number of cores for parallelization, default: -1 (all)",
|
277
287
|
required=False, default=-1)
|
288
|
+
basic_group.add_argument("--cpus-per-job", type=int,
|
289
|
+
help="Number of cores to use for each job, default: 2",
|
290
|
+
required=False, default=2)
|
278
291
|
|
279
292
|
# advanced parameter settings
|
280
293
|
advance_group = parser.add_argument_group(
|
@@ -284,16 +297,24 @@ def _init_parser(default_outdir="bscampp_output",
|
|
284
297
|
))
|
285
298
|
parser.groups['advance_group'] = advance_group
|
286
299
|
|
287
|
-
advance_group.add_argument("-m", "--model", type=str,
|
288
|
-
|
289
|
-
|
300
|
+
#advance_group.add_argument("-m", "--model", type=str,
|
301
|
+
# help=("Model used for edge distances. EPA-ng will use the "
|
302
|
+
# "provided info_path (*.bestModel) for model. "
|
303
|
+
# "Default: GTR for nucleotide, LG for protein"),
|
304
|
+
# required=False, default=None)
|
290
305
|
advance_group.add_argument("-b", "--subtreesize", type=int,
|
291
306
|
help="Integer size of the subtree. Default: 2000",
|
292
307
|
required=False, default=2000)
|
293
308
|
advance_group.add_argument("-V", "--votes", type=int,
|
294
|
-
help="
|
295
|
-
"
|
309
|
+
help="(BSCAMPP only) Number of votes per query sequence. "
|
310
|
+
"Default: 5",
|
296
311
|
required=False, default=5)
|
312
|
+
advance_group.add_argument("--subtreetype", type=str,
|
313
|
+
help="(SCAMPP only) Options for collecting "
|
314
|
+
"nodes for the subtree - d for edge weighted "
|
315
|
+
"distances, n for node distances, h for Hamming "
|
316
|
+
"distances. Default: d",
|
317
|
+
required=False, default='d')
|
297
318
|
advance_group.add_argument("--similarityflag", type=str2bool,
|
298
319
|
help="Boolean, True if maximizing sequence similarity "
|
299
320
|
"instead of simple Hamming distance (ignoring gap "
|
@@ -306,17 +327,12 @@ def _init_parser(default_outdir="bscampp_output",
|
|
306
327
|
parser.groups['misc_group'] = misc_group
|
307
328
|
|
308
329
|
misc_group.add_argument("-n","--tmpfilenbr", type=int,
|
309
|
-
help="Temporary file indexing. Default: 0",
|
330
|
+
help="Temporary file indexing (e.g., tmp0/). Default: 0",
|
310
331
|
required=False, default=0)
|
311
332
|
misc_group.add_argument("--fragmentflag", type=str2bool,
|
312
|
-
help="If queries contains fragments.
|
333
|
+
help="If queries contains fragments. Does not do anything "
|
334
|
+
"if similarity flag is set to True. Default: True",
|
313
335
|
required=False, default=True)
|
314
|
-
misc_group.add_argument("--subtreetype", type=str,
|
315
|
-
help="(SCAMPP only) Options for collecting "
|
316
|
-
"nodes for the subtree - d for edge weighted "
|
317
|
-
"distances, n for node distances, h for Hamming "
|
318
|
-
"distances. Default: d",
|
319
|
-
required=False, default='d')
|
320
336
|
misc_group.add_argument("--keeptemp", type=str2bool,
|
321
337
|
help="Boolean, True to keep all temporary files. "
|
322
338
|
"Default: False",
|
bscampp/utils.py
CHANGED
@@ -9,8 +9,11 @@ import random
|
|
9
9
|
import statistics
|
10
10
|
import copy
|
11
11
|
import gzip
|
12
|
-
|
13
12
|
import argparse
|
13
|
+
|
14
|
+
from bscampp import get_logger, log_exception
|
15
|
+
_LOG = get_logger(__name__)
|
16
|
+
|
14
17
|
# reformat argparse help text formatting
|
15
18
|
class SmartHelpFormatter(argparse.RawDescriptionHelpFormatter):
|
16
19
|
def add_text(self, text):
|
@@ -36,6 +39,34 @@ BRACKET = {
|
|
36
39
|
}
|
37
40
|
|
38
41
|
|
42
|
+
# infer datatype from input file
|
43
|
+
def inferDataType(path):
    """Infer whether the sequences at ``path`` are nucleotide or protein.

    Sequences are read with ``read_data`` and their characters are counted
    case-insensitively. The data is called 'nucleotide' when more than 90%
    of the non-gap characters are A/C/G/N plus T (with no U present) or
    A/C/G/N plus U (with no T present); otherwise it is called 'protein'.

    Args:
        path: path to the sequence file, passed to ``read_data``.

    Returns:
        'nucleotide' or 'protein'.

    Raises:
        ValueError: if the input contains no non-gap characters.
    """
    from collections import Counter

    sequences = read_data(path)

    counts = Counter()
    for seq in sequences.values():
        counts.update(seq.upper())

    # gap/missing symbols are not residues; leaving them in the denominator
    # would make a gappy nucleotide alignment look like protein
    # NOTE(review): '.' and '?' are assumed to be gap/missing markers here
    for gap in ('-', '.', '?'):
        counts.pop(gap, None)

    total = sum(counts.values())
    if total == 0:
        raise ValueError(
                f"Cannot infer data type: no sequence characters in {path}")

    acg = sum(counts[letter] for letter in ('A', 'C', 'G', 'N'))
    t, u = counts['T'], counts['U']

    # dna -> nucleotide
    if u == 0 and (acg + t) / total > 0.9:
        datatype = 'nucleotide'
    # rna -> nucleotide
    elif t == 0 and (acg + u) / total > 0.9:
        datatype = 'nucleotide'
    # amino acid -> protein
    else:
        datatype = 'protein'

    _LOG.info(f"Inferred input data type: {datatype}")
    return datatype
|
69
|
+
|
39
70
|
def write_fasta(aln, aln_dict, aligned=True):
|
40
71
|
""" Write given dictionary as FASTA file out
|
41
72
|
|
@@ -76,7 +107,12 @@ def read_data(aln):
|
|
76
107
|
|
77
108
|
"""
|
78
109
|
|
79
|
-
|
110
|
+
# determine the file type, whether we have a .gz/.gzip file
|
111
|
+
suffix = aln.split('.')[-1]
|
112
|
+
if suffix in ['gz', 'gzip']:
|
113
|
+
f = gzip.open(aln, 'rt')
|
114
|
+
else:
|
115
|
+
f = open(aln)
|
80
116
|
result = dict()
|
81
117
|
|
82
118
|
taxa = ""
|
@@ -1,7 +1,7 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: bscampp
|
3
|
-
Version: 1.0.
|
4
|
-
Summary: BSCAMPP -
|
3
|
+
Version: 1.0.6
|
4
|
+
Summary: BSCAMPP and SCAMPP - Scalable Phylogenetic Placement Tools
|
5
5
|
Author-email: Eleanor Wedell <ewedell2@illinois.edu>, Chengze Shen <chengze5@illinois.edu>
|
6
6
|
License: MIT License
|
7
7
|
|
@@ -66,7 +66,7 @@ Requires-Dist: taxtastic>=0.9.3
|
|
66
66
|
# Overview
|
67
67
|
* **Inputs**
|
68
68
|
1. Reference tree to place sequences into.
|
69
|
-
2. Alignment of reference sequences.
|
69
|
+
2. Alignment of reference sequences (protein or nucleotide).
|
70
70
|
3. Alignment of query sequences (can be combined with ii.).
|
71
71
|
4. Tree info file.
|
72
72
|
- (EPA-ng as base method), RAxML-ng info file, typically with suffix `.bestModel`.
|
@@ -230,16 +230,17 @@ run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment
|
|
230
230
|
> Output file name. Default: bscampp_result.jplace
|
231
231
|
> --threads NUM_CPUS, --num-cpus NUM_CPUS
|
232
232
|
> Number of cores for parallelization, default: -1 (all)
|
233
|
+
> --cpus-per-job CPUS_PER_JOB
|
234
|
+
> Number of cores to use for each job, default: 2
|
233
235
|
>
|
234
236
|
> ADVANCE PARAMETERS:
|
235
|
-
> These parameters control how BSCAMPP
|
237
|
+
> These parameters control how BSCAMPP and SCAMPP are run. The default values are set based on experiments.
|
236
238
|
>
|
237
|
-
> -m MODEL, --model MODEL
|
238
|
-
> Model used for edge distances. Default: GTR
|
239
239
|
> -b SUBTREESIZE, --subtreesize SUBTREESIZE
|
240
240
|
> Integer size of the subtree. Default: 2000
|
241
241
|
> -V VOTES, --votes VOTES
|
242
|
-
> Number of votes per query sequence.
|
242
|
+
> (BSCAMPP only) Number of votes per query sequence.
|
243
|
+
> Default: 5
|
243
244
|
> --similarityflag SIMILARITYFLAG
|
244
245
|
> Boolean, True if maximizing sequence similarity
|
245
246
|
> instead of simple Hamming distance (ignoring gap sites
|
@@ -1,11 +1,11 @@
|
|
1
|
-
bscampp/__init__.py,sha256=
|
2
|
-
bscampp/configs.py,sha256=
|
1
|
+
bscampp/__init__.py,sha256=eDIMYifzKrFdtA3Ac7OvPTyIHUO1ZLgVaM0pKFxxEHE,2289
|
2
|
+
bscampp/configs.py,sha256=perl6u5hto6J3JV1JMbsTQ6tqr2uGOk-Z9jfzflid0s,6122
|
3
3
|
bscampp/default.config,sha256=CEfsUHBy--vwJhEcUuJ0btfuGQWb_lKMVWUIP9f5YGw,112
|
4
|
-
bscampp/functions.py,sha256=
|
4
|
+
bscampp/functions.py,sha256=DGHQJLLzXSghDKbha0LW0YPip_45M6MI4t3zdDpzULI,22448
|
5
5
|
bscampp/init_configs.py,sha256=EA9sMN5jWj6zj2b-7tN19LhX2Ef61ByQLxQRLHAqLDM,3600
|
6
|
-
bscampp/jobs.py,sha256=
|
7
|
-
bscampp/pipeline.py,sha256=
|
8
|
-
bscampp/utils.py,sha256
|
6
|
+
bscampp/jobs.py,sha256=v7buZJs1AnNoXiILwu-W8fo3QjxAh3i9Mp7xfmlJvAY,7569
|
7
|
+
bscampp/pipeline.py,sha256=IPZnXZmVxGGfbVUuGCQh5X9oBq48-6pA9QkuvMGPTag,14000
|
8
|
+
bscampp/utils.py,sha256=-wns6FaWMKD2wVqjxdBQvjTdagTjywBIaGfqb2mupe4,30039
|
9
9
|
bscampp/tools/epa-ng,sha256=f3EVoZAAOXLN6l521qp-TrWDl5J2nqL3tGgjPaQE9WQ,3772096
|
10
10
|
bscampp/tools/pplacer,sha256=p0H4eo9uuiYoWS_kJbPfauOV99i7BXJdZSiwXIuLxTw,7834576
|
11
11
|
bscampp/tools/hamming_distance/CMakeLists.txt,sha256=yf9iq7Y61t3WObJHoR4RoGDEvUw_Q8JW3UnI4uh0cfU,389
|
@@ -17,9 +17,9 @@ bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp,sha256=xCmyAT-OZJOD
|
|
17
17
|
bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp,sha256=eKxgODRlpf0hU84QjNhigvRhWCT9tiJZjA5oQFQ1bUk,7404
|
18
18
|
bscampp/tools/hamming_distance/src/homology.cpp,sha256=ZE0uXZWQ-cN4U1Wk5kUr_KKHgzsgA6Sno-IViRa4tmI,6053
|
19
19
|
bscampp/tools/hamming_distance/src/new_hamming.cpp,sha256=fBRm99RquBZgZjaLOn9xDI3cH9NchhrxKbL-11j8fmk,5342
|
20
|
-
bscampp-1.0.
|
21
|
-
bscampp-1.0.
|
22
|
-
bscampp-1.0.
|
23
|
-
bscampp-1.0.
|
24
|
-
bscampp-1.0.
|
25
|
-
bscampp-1.0.
|
20
|
+
bscampp-1.0.6.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
|
21
|
+
bscampp-1.0.6.dist-info/METADATA,sha256=0sWAKK30wlps8i0d1BdFqyv5MZVgefRnTn_-yMmO8lQ,12602
|
22
|
+
bscampp-1.0.6.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
|
23
|
+
bscampp-1.0.6.dist-info/entry_points.txt,sha256=4Ft83qHc39tNNpMLgSgFXDHM-vuAB99JtmczCQj5pq8,204
|
24
|
+
bscampp-1.0.6.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
|
25
|
+
bscampp-1.0.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|