bscampp 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bscampp/__init__.py +1 -1
- bscampp/functions.py +20 -11
- bscampp/jobs.py +18 -8
- {bscampp-1.0.2.dist-info → bscampp-1.0.3.dist-info}/METADATA +43 -25
- {bscampp-1.0.2.dist-info → bscampp-1.0.3.dist-info}/RECORD +9 -9
- {bscampp-1.0.2.dist-info → bscampp-1.0.3.dist-info}/LICENSE +0 -0
- {bscampp-1.0.2.dist-info → bscampp-1.0.3.dist-info}/WHEEL +0 -0
- {bscampp-1.0.2.dist-info → bscampp-1.0.3.dist-info}/entry_points.txt +0 -0
- {bscampp-1.0.2.dist-info → bscampp-1.0.3.dist-info}/top_level.txt +0 -0
bscampp/__init__.py
CHANGED
bscampp/functions.py
CHANGED
@@ -237,6 +237,7 @@ Function to assign queries to subtrees as used in SCAMPP
|
|
237
237
|
def buildQuerySubtrees(query_votes_dict, query_top_vote_dict,
|
238
238
|
tree, leaf_dict, dry_run=False):
|
239
239
|
t0 = time.perf_counter()
|
240
|
+
_LOG.info('(SCAMPP) Building query subtree for placement...')
|
240
241
|
|
241
242
|
if dry_run:
|
242
243
|
return dict(), []
|
@@ -253,7 +254,13 @@ def buildQuerySubtrees(query_votes_dict, query_top_vote_dict,
|
|
253
254
|
new_subtree_dict = dict()
|
254
255
|
# assign queries to subtrees, and remove them from the pool
|
255
256
|
# repeat until all queries are assigned
|
257
|
+
_total = 0
|
256
258
|
for seed_label, queries in seed_queries.items():
|
259
|
+
####### additional logging for tracking progress
|
260
|
+
_total += 1
|
261
|
+
if _total % 1000 == 0 or _total == len(seed_queries):
|
262
|
+
_LOG.info(f"- Built {_total}/{len(seed_queries)} subtrees")
|
263
|
+
|
257
264
|
node_y = leaf_dict[seed_label]
|
258
265
|
# extract [subtreesize] leaves
|
259
266
|
if Configs.subtreetype == "h":
|
@@ -266,14 +273,12 @@ def buildQuerySubtrees(query_votes_dict, query_top_vote_dict,
|
|
266
273
|
subtree = tree.extract_tree_with(labels)
|
267
274
|
new_subtree_dict[subtree] = queries
|
268
275
|
|
269
|
-
|
270
276
|
placed_query_list = []
|
271
|
-
|
277
|
+
|
272
278
|
t1 = time.perf_counter()
|
273
279
|
_LOG.info('Time to assign queries to subtrees: {} seconds'.format(t1 - t0))
|
274
280
|
return new_subtree_dict, placed_query_list
|
275
281
|
|
276
|
-
|
277
282
|
'''
|
278
283
|
Helper function to run a single placement task. Designed to use with
|
279
284
|
multiprocessing
|
@@ -311,12 +316,16 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
311
316
|
if len(query_list) == 0:
|
312
317
|
continue
|
313
318
|
final_subtree_count += 1
|
319
|
+
|
320
|
+
subtree_dir = os.path.join(workdir, f'subtree_{final_subtree_count}')
|
321
|
+
if not os.path.isdir(subtree_dir):
|
322
|
+
os.makedirs(subtree_dir)
|
314
323
|
|
315
324
|
# name all temporary output files
|
316
|
-
tmp_tree = os.path.join(
|
317
|
-
tmp_aln = os.path.join(
|
318
|
-
tmp_qaln = os.path.join(
|
319
|
-
tmp_output = os.path.join(
|
325
|
+
tmp_tree = os.path.join(subtree_dir, 'tree')
|
326
|
+
tmp_aln = os.path.join(subtree_dir, f'subtree_{final_subtree_count}_aln.fa')
|
327
|
+
tmp_qaln = os.path.join(subtree_dir, f'subtree_{final_subtree_count}_qaln.fa')
|
328
|
+
tmp_output = os.path.join(subtree_dir,
|
320
329
|
'subtree_{}_{}.jplace'.format(
|
321
330
|
final_subtree_count, Configs.placement_method))
|
322
331
|
|
@@ -340,13 +349,13 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
340
349
|
job = EPAngJob(path=Configs.epang_path,
|
341
350
|
info_path=Configs.info_path, tree_path=tmp_tree,
|
342
351
|
aln_path=tmp_aln, qaln_path=tmp_qaln,
|
343
|
-
outdir=
|
352
|
+
outdir=subtree_dir, num_cpus=Configs.num_cpus)
|
344
353
|
# for EPA-ng, ensure that outpath name is changed to the one we want
|
345
|
-
_outpath = job.run()
|
354
|
+
_outpath = job.run(logging=f'subtree_{final_subtree_count}')
|
346
355
|
os.system('mv {} {}'.format(_outpath, tmp_output))
|
347
356
|
elif Configs.placement_method == 'pplacer':
|
348
357
|
# build ref_pkg with info and tmp_tree and tmp_aln
|
349
|
-
refpkg_dir = os.path.join(
|
358
|
+
refpkg_dir = os.path.join(subtree_dir,
|
350
359
|
f'subtree_{final_subtree_count}.refpkg')
|
351
360
|
taxit_job = TaxtasticJob(path=Configs.taxit_path,
|
352
361
|
outdir=refpkg_dir, name=f'subtree_{final_subtree_count}',
|
@@ -359,7 +368,7 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
359
368
|
refpkg_dir=refpkg_dir, model=Configs.model,
|
360
369
|
outpath=tmp_output, num_cpus=Configs.num_cpus,
|
361
370
|
qaln_path=tmp_qaln)
|
362
|
-
tmp_output = job.run()
|
371
|
+
tmp_output = job.run(logging=f'subtree_{final_subtree_count}')
|
363
372
|
else:
|
364
373
|
raise ValueError(
|
365
374
|
f"Placement method {Configs.placement_method} not recognized")
|
bscampp/jobs.py
CHANGED
@@ -3,7 +3,7 @@ from subprocess import Popen
|
|
3
3
|
from abc import abstractmethod
|
4
4
|
|
5
5
|
from bscampp import get_logger, log_exception
|
6
|
-
from bscampp.configs import Configs
|
6
|
+
#from bscampp.configs import Configs
|
7
7
|
|
8
8
|
_LOG = get_logger(__name__)
|
9
9
|
|
@@ -25,7 +25,7 @@ class Job(object):
|
|
25
25
|
return self.pid
|
26
26
|
|
27
27
|
# run the job with given invocation and raise errors when encountered
|
28
|
-
def run(self, stdin="", lock=None, logging=
|
28
|
+
def run(self, stdin="", lock=None, logging=None, shell=False):
|
29
29
|
try:
|
30
30
|
cmd, outpath = self.get_invocation()
|
31
31
|
_LOG.debug(f'Running job_type: {self.job_type}, output: {outpath}')
|
@@ -57,18 +57,22 @@ class Job(object):
|
|
57
57
|
# logging to local or to PIPE
|
58
58
|
stderr, stdout = '', ''
|
59
59
|
scmd = ' '.join(cmd)
|
60
|
-
if logging:
|
60
|
+
if logging != None:
|
61
61
|
logpath = os.path.join(
|
62
|
-
os.path.dirname(outpath),
|
62
|
+
os.path.dirname(outpath),
|
63
|
+
f'{logging}_{self.job_type}.txt')
|
63
64
|
outlogging = open(logpath, 'w', 1)
|
64
65
|
|
65
66
|
# TODO: may need to deal with piping in the future, for now
|
66
67
|
# it is not needed
|
67
68
|
p = Popen(cmd, text=True, bufsize=1,
|
68
69
|
stdin=subprocess.PIPE,
|
69
|
-
stdout=outlogging, stderr=
|
70
|
+
stdout=outlogging, stderr=outlogging)
|
70
71
|
self.pid = p.pid
|
71
72
|
stdout, stderr = p.communicate(input=stdin)
|
73
|
+
# stdout and stderr are both written to outlogging
|
74
|
+
# hence, assign them to be empty strings
|
75
|
+
stdout, stderr = '', ''
|
72
76
|
outlogging.close()
|
73
77
|
else:
|
74
78
|
p = Popen(cmd, text=True, bufsize=1,
|
@@ -92,16 +96,22 @@ class Job(object):
|
|
92
96
|
else:
|
93
97
|
error_msg = ' '.join([f'Error occurred running {self.job_type}.',
|
94
98
|
f'returncode: {self.returncode}'])
|
99
|
+
if logging != None:
|
100
|
+
logpath = '\nLOGPATH: ' + os.path.join(
|
101
|
+
os.path.dirname(outpath),
|
102
|
+
f'{logging}_{self.job_type}.txt')
|
103
|
+
else:
|
104
|
+
logpath = ''
|
95
105
|
if lock:
|
96
106
|
try:
|
97
107
|
lock.acquire()
|
98
108
|
_LOG.error(error_msg + '\nSTDOUT: ' + stdout +
|
99
|
-
'\nSTDERR: ' + stderr)
|
109
|
+
'\nSTDERR: ' + stderr + logpath)
|
100
110
|
finally:
|
101
111
|
lock.release()
|
102
112
|
else:
|
103
113
|
_LOG.error(error_msg + '\nSTDOUT: ' + stdout +
|
104
|
-
'\nSTDERR: ' + stderr)
|
114
|
+
'\nSTDERR: ' + stderr + logpath)
|
105
115
|
exit(1)
|
106
116
|
except Exception:
|
107
117
|
log_exception(_LOG)
|
@@ -177,7 +187,7 @@ A pplacer job that uses taxtastic refpkg to place sequences
|
|
177
187
|
class PplacerTaxtasticJob(Job):
|
178
188
|
def __init__(self, **kwargs):
|
179
189
|
Job.__init__(self)
|
180
|
-
self.job_type = 'pplacer
|
190
|
+
self.job_type = 'pplacer'
|
181
191
|
|
182
192
|
self.path = ''
|
183
193
|
self.refpkg_dir = ''
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: bscampp
|
3
|
-
Version: 1.0.2
|
3
|
+
Version: 1.0.3
|
4
4
|
Summary: BSCAMPP - A Scalable Phylogenetic Placement Tool
|
5
5
|
Author-email: Eleanor Wedell <ewedell2@illinois.edu>, Chengze Shen <chengze5@illinois.edu>
|
6
6
|
License: MIT License
|
@@ -50,7 +50,7 @@ Requires-Dist: numpy>=1.21.6
|
|
50
50
|
Requires-Dist: treeswift>=1.1.45
|
51
51
|
Requires-Dist: taxtastic>=0.9.3
|
52
52
|
|
53
|
-
# BSCAMPP -
|
53
|
+
# BSCAMPP and SCAMPP - Two Scalable Phylogenetic Placement Methods and Frameworks
|
54
54
|
[](https://pypi.org/project/bscampp/)
|
55
55
|
[](https://pypi.org/project/bscampp/#history)
|
56
56
|
[](https://github.com/ewedell/BSCAMPP/)
|
@@ -70,40 +70,46 @@ Requires-Dist: taxtastic>=0.9.3
|
|
70
70
|
3. Alignment of query sequences (can be combined with ii.).
|
71
71
|
4. Tree info file.
|
72
72
|
- (EPA-ng as base method), RAxML-ng info file, typically with suffix `.bestModel`.
|
73
|
-
- (pplacer as base method), RAxML-ng or FastTree log file.
|
73
|
+
- (pplacer as base method), RAxML-ng or FastTree log file containing model parameters.
|
74
74
|
* **Output**
|
75
75
|
1. Placement results of query sequences in the reference tree in `.jplace` format.
|
76
76
|
|
77
77
|
|
78
|
-
|
79
|
-
BSCAMPP achieves some magnitudes of speedup compared to
|
78
|
+
SCAMPP and BSCAMPP are two scalable solutions for phylogenetic placement. SCAMPP is designed more for accuracy
|
79
|
+
and BSCAMPP is designed more for speed. BSCAMPP achieves a speedup of some orders of magnitude compared to SCAMPP.
|
80
80
|
The core algorithm is described in detail at <https://doi.org/10.1101/2022.10.26.513936>.
|
81
|
-
In short,
|
82
|
-
|
81
|
+
In short, both frameworks use EPA-ng as the base placement method by default, allowing it to scale to placement trees
|
82
|
+
of at least ~200,000 leaves. Our two methods achieve this by extracting appropriate subtrees and assigning each query
|
83
|
+
to its most fitting subtree.
|
83
84
|
|
84
|
-
|
85
|
-
Currently, BSCAMPP
|
85
|
+
They are divide-and-conquer frameworks and can be used with any base placement method (e.g., `pplacer` as well).
|
86
|
+
Currently, BSCAMPP and SCAMPP are implemented with `epa-ng` and `pplacer`.
|
86
87
|
|
87
|
-
|
88
|
-
|
89
|
-
|
88
|
+
#### BSCAMPP
|
89
|
+
It is recommended that BSCAMPP be used with subtrees of size 2000 and with 5 votes based on current best results,
|
90
|
+
especially if sequences are fragmentary. Defaults for the subtree size and number of votes are set to 2,000 and
|
91
|
+
5 respectively (see [Usage](#usage) for more details on customizing BSCAMPP).
|
90
92
|
|
91
93
|
#### SCAMPP
|
92
|
-
SCAMPP is also implemented in BSCAMPP
|
93
|
-
|
94
|
+
SCAMPP is also implemented in BSCAMPP, originally from <https://github.com/chry04/PLUSplacer>.
|
95
|
+
Its default also uses EPA-ng and a subtree size of 2,000.
|
96
|
+
The user can invoke SCAMPP by running `run_scampp.py` or `scampp` (if installed with PyPI) after installation.
|
94
97
|
|
95
98
|
# Installation
|
96
|
-
BSCAMPP
|
97
|
-
(2) from this GitHub repository. If you have any difficulties installing or running BSCAMPP
|
98
|
-
(
|
99
|
+
BSCAMPP and SCAMPP were tested on **Python 3.8 to 3.12**. There are two ways to install:
|
100
|
+
(1) with PyPI, or (2) from this GitHub repository. If you have any difficulties installing or running BSCAMPP or SCAMPP,
|
101
|
+
please contact Eleanor Wedell (ewedell2@illinois.edu).
|
99
102
|
|
100
103
|
### External requirements
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
+
* **Base placement method**:
|
105
|
+
EPA-ng and/or pplacer are requirements since BSCAMPP and SCAMPP will use them as the base phylogenetic placement methods.
|
106
|
+
By default, the software will search for binary executables of `pplacer` and `epa-ng` in the user's environment when running for the first time.
|
107
|
+
We also included a compiled version of `pplacer` for the Linux system under `bscampp/tools`.
|
108
|
+
* **C++ OpenMP**:
|
109
|
+
We also use OpenMP to speed up the similarity comparison between sequences using C++, which is required to run the pre-compiled binaries.
|
104
110
|
|
105
111
|
### (1) Install with `pip`
|
106
|
-
The easiest way to install BSCAMPP is to use `pip install`. This will also install all required Python packages.
|
112
|
+
The easiest way to install BSCAMPP and SCAMPP is to use `pip install`. This will also install all required Python packages.
|
107
113
|
|
108
114
|
```bash
|
109
115
|
# 1. install with pip (--user if no root access)
|
@@ -142,22 +148,29 @@ git clone https://github.com/ewedell/BSCAMPP.git
|
|
142
148
|
# 2. Install all requirements
|
143
149
|
pip install -r requirements.txt
|
144
150
|
|
145
|
-
# 3. Execute BSCAMPP
|
151
|
+
# 3. Execute BSCAMPP/SCAMPP executables
|
146
152
|
python run_bscampp.py [-h]
|
153
|
+
python run_scampp.py [-h]
|
147
154
|
```
|
148
155
|
|
149
156
|
# Usage
|
150
157
|
All parameter settings can be found by running
|
151
158
|
```bash
|
152
|
-
run_bscampp.py -h
|
159
|
+
run_bscampp.py -h #OR
|
160
|
+
run_scampp.py -h
|
153
161
|
```
|
154
162
|
|
155
163
|
### (1) Default case (`epa-ng`)
|
156
164
|
```bash
|
165
|
+
# for BSCAMPP
|
157
166
|
run_bscampp.py -i [raxml best model] -t [reference tree] -a [alignment file]
|
167
|
+
|
168
|
+
# for SCAMPP
|
169
|
+
run_scampp.py -i [raxml best model] -t [reference tree] -a [alignment file]
|
158
170
|
```
|
159
|
-
|
160
|
-
|
171
|
+
BSCAMPP and SCAMPP in default mode run EPA-ng as the base method. `[alignment file]` should
|
172
|
+
contain both sequences from the placement tree and the query sequences to be placed.
|
173
|
+
This will create an output directory `bscampp_output` and write the placement results to
|
161
174
|
`bscampp_output/bscampp_result.jplace`.
|
162
175
|
|
163
176
|
### (2) Separately giving query alignment and finer control of outputs
|
@@ -173,6 +186,11 @@ run_bscampp.py -i [logfile from either RAxML/FastTree] -t [reference tree] \
|
|
173
186
|
-a [reference alignment] -q [query sequence alignment] \
|
174
187
|
--placement-method pplacer
|
175
188
|
```
|
189
|
+
### (4) Changing the number of votes to 15 for BSCAMPP
|
190
|
+
```bash
|
191
|
+
run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment] \
|
192
|
+
-q [query sequence alignment] -V 15
|
193
|
+
```
|
176
194
|
|
177
195
|
### More comprehensive usage
|
178
196
|
```bash
|
@@ -1,9 +1,9 @@
|
|
1
|
-
bscampp/__init__.py,sha256=
|
1
|
+
bscampp/__init__.py,sha256=toGV8EzvMKviV7xHahhXs0K6fAmHw2cnWb6EDscpIOY,2289
|
2
2
|
bscampp/configs.py,sha256=3HJHLN2fLV5Tv3TJL95NpOuSXUV6CvqxRqCOM6TpbJQ,5767
|
3
3
|
bscampp/default.config,sha256=CEfsUHBy--vwJhEcUuJ0btfuGQWb_lKMVWUIP9f5YGw,112
|
4
|
-
bscampp/functions.py,sha256=
|
4
|
+
bscampp/functions.py,sha256=QYI5RsUEMGc6jLPzFdInpmxA8wiYyN7785P3WxWYiTo,17839
|
5
5
|
bscampp/init_configs.py,sha256=EA9sMN5jWj6zj2b-7tN19LhX2Ef61ByQLxQRLHAqLDM,3600
|
6
|
-
bscampp/jobs.py,sha256=
|
6
|
+
bscampp/jobs.py,sha256=PrVMJBabi4cYlrxVLo37XPOY82fY0zZ8Iyp9CWCNWhU,7181
|
7
7
|
bscampp/pipeline.py,sha256=C6I1vWeA6Rq_spPHy_il1FJA_DomWHUHYHLUUk9SnLk,13024
|
8
8
|
bscampp/utils.py,sha256=ragaI14Lqb2fVp_uYDkFQnV7a50G9-sUOWdVM-sNhUE,29005
|
9
9
|
bscampp/tools/epa-ng,sha256=f3EVoZAAOXLN6l521qp-TrWDl5J2nqL3tGgjPaQE9WQ,3772096
|
@@ -17,9 +17,9 @@ bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp,sha256=xCmyAT-OZJOD
|
|
17
17
|
bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp,sha256=eKxgODRlpf0hU84QjNhigvRhWCT9tiJZjA5oQFQ1bUk,7404
|
18
18
|
bscampp/tools/hamming_distance/src/homology.cpp,sha256=ZE0uXZWQ-cN4U1Wk5kUr_KKHgzsgA6Sno-IViRa4tmI,6053
|
19
19
|
bscampp/tools/hamming_distance/src/new_hamming.cpp,sha256=fBRm99RquBZgZjaLOn9xDI3cH9NchhrxKbL-11j8fmk,5342
|
20
|
-
bscampp-1.0.
|
21
|
-
bscampp-1.0.
|
22
|
-
bscampp-1.0.
|
23
|
-
bscampp-1.0.
|
24
|
-
bscampp-1.0.
|
25
|
-
bscampp-1.0.
|
20
|
+
bscampp-1.0.3.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
|
21
|
+
bscampp-1.0.3.dist-info/METADATA,sha256=01Vl-oCadCIiWFBLA564CLNErXILqEzdRrQNPpGy_mc,12507
|
22
|
+
bscampp-1.0.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
23
|
+
bscampp-1.0.3.dist-info/entry_points.txt,sha256=4Ft83qHc39tNNpMLgSgFXDHM-vuAB99JtmczCQj5pq8,204
|
24
|
+
bscampp-1.0.3.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
|
25
|
+
bscampp-1.0.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|