bscampp 1.0.1b0__tar.gz → 1.0.2b0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/CHANGELOG.md +13 -0
- {bscampp-1.0.1b0/bscampp.egg-info → bscampp-1.0.2b0}/PKG-INFO +69 -31
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/README.md +68 -30
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/__init__.py +1 -1
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/configs.py +1 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/functions.py +65 -8
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/jobs.py +18 -8
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/pipeline.py +100 -11
- {bscampp-1.0.1b0 → bscampp-1.0.2b0/bscampp.egg-info}/PKG-INFO +69 -31
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp.egg-info/entry_points.txt +2 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/pyproject.toml +2 -0
- bscampp-1.0.2b0/tests/test_dry_run.py +21 -0
- bscampp-1.0.1b0/tests/test_dry_run.py +0 -11
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/LICENSE +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/MANIFEST.in +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/default.config +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/init_configs.py +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/tools/epa-ng +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/tools/hamming_distance/CMakeLists.txt +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/tools/hamming_distance/fragment_hamming +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/tools/hamming_distance/hamming +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/tools/hamming_distance/homology +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/tools/hamming_distance/src/fragment_hamming.cpp +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/tools/hamming_distance/src/homology.cpp +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/tools/hamming_distance/src/new_hamming.cpp +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/tools/pplacer +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/utils.py +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp.egg-info/SOURCES.txt +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp.egg-info/dependency_links.txt +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp.egg-info/requires.txt +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp.egg-info/top_level.txt +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/requirements.txt +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/run_bscampp.py +0 -0
- {bscampp-1.0.1b0 → bscampp-1.0.2b0}/setup.cfg +0 -0
@@ -1,3 +1,16 @@
|
|
1
|
+
# BSCAMPP v1.0.2b
|
2
|
+
1. Removed redundant dependency in `bscampp/jobs.py`.
|
3
|
+
2. Added logging to each placement subtask with the base method.
|
4
|
+
3. Changed the temporary file writing directory from a single directory, to
|
5
|
+
their corresponding directories.
|
6
|
+
|
7
|
+
# BSCAMPP v1.0.2
|
8
|
+
1. Added SCAMPP functionality and its binary executables.
|
9
|
+
|
10
|
+
# BSCAMPP v1.0.1
|
11
|
+
1. Bumped version to full release.
|
12
|
+
2. Completed examples for display in `bscampp --help`.
|
13
|
+
|
1
14
|
# BSCAMPP v1.0.1b
|
2
15
|
1. Removed redundant codes and fixed missing variables.
|
3
16
|
2. Added badges for PyPI installation and current Python Build, etc.
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: bscampp
|
3
|
-
Version: 1.0.
|
3
|
+
Version: 1.0.2b0
|
4
4
|
Summary: BSCAMPP - A Scalable Phylogenetic Placement Tool
|
5
5
|
Author-email: Eleanor Wedell <ewedell2@illinois.edu>, Chengze Shen <chengze5@illinois.edu>
|
6
6
|
License: MIT License
|
@@ -50,7 +50,7 @@ Requires-Dist: numpy>=1.21.6
|
|
50
50
|
Requires-Dist: treeswift>=1.1.45
|
51
51
|
Requires-Dist: taxtastic>=0.9.3
|
52
52
|
|
53
|
-
# BSCAMPP -
|
53
|
+
# BSCAMPP and SCAMPP - Two Scalable Phylogenetic Placement Methods and Frameworks
|
54
54
|
[](https://pypi.org/project/bscampp/)
|
55
55
|
[](https://pypi.org/project/bscampp/#history)
|
56
56
|
[](https://github.com/ewedell/BSCAMPP/)
|
@@ -70,47 +70,63 @@ Requires-Dist: taxtastic>=0.9.3
|
|
70
70
|
3. Alignment of query sequences (can be combined with ii.).
|
71
71
|
4. Tree info file.
|
72
72
|
- (EPA-ng as base method), RAxML-ng info file, typically with suffix `.bestModel`.
|
73
|
-
- (pplacer as base method), RAxML-ng or FastTree log file.
|
73
|
+
- (pplacer as base method), RAxML-ng or FastTree log file containing model parameters.
|
74
74
|
* **Output**
|
75
75
|
1. Placement results of query sequences in the reference tree in `.jplace` format.
|
76
76
|
|
77
77
|
|
78
|
-
|
79
|
-
BSCAMPP achieves some magnitudes of speedup compared to
|
78
|
+
SCAMPP and BSCAMPP are two scalable solutions for phylogenetic placement. SCAMPP is designed more for accuracy
|
79
|
+
and BSCAMPP is designed more for speed. BSCAMPP achieves some magnitudes of speedup compared to SCAMPP.
|
80
80
|
The core algorithm is described in detail at <https://doi.org/10.1101/2022.10.26.513936>.
|
81
|
-
In short,
|
82
|
-
|
81
|
+
In short, both frameworks use EPA-ng as the base placement method by default, allowing them to scale to placement trees
|
82
|
+
of at least ~200,000 leaves. Our two methods achieve this by extracting appropriate subtrees and assigning each query
|
83
|
+
to its most fitting subtree.
|
83
84
|
|
84
|
-
|
85
|
-
Currently, BSCAMPP
|
85
|
+
They are divide-and-conquer frameworks and can be used with any base placement method (e.g., `pplacer` as well).
|
86
|
+
Currently, BSCAMPP and SCAMPP are implemented with `epa-ng` and `pplacer`.
|
86
87
|
|
87
|
-
|
88
|
-
|
89
|
-
|
88
|
+
#### BSCAMPP
|
89
|
+
It is recommended that BSCAMPP be used with subtrees of size 2000 and with 5 votes based on current best results,
|
90
|
+
especially if sequences are fragmentary. Defaults for the subtree size and number of votes are set to 2,000 and
|
91
|
+
5 respectively (see [Usage](#usage) for more details on customizing BSCAMPP).
|
92
|
+
|
93
|
+
#### SCAMPP
|
94
|
+
SCAMPP is also implemented in BSCAMPP, originally from <https://github.com/chry04/PLUSplacer>.
|
95
|
+
Its default also uses EPA-ng and a subtree size of 2,000.
|
96
|
+
The user can invoke SCAMPP by running `run_scampp.py` or `scampp` (if installed with PyPI) after installation.
|
90
97
|
|
91
98
|
# Installation
|
92
|
-
BSCAMPP
|
93
|
-
(2) from this GitHub repository. If you have any difficulties installing or running BSCAMPP
|
94
|
-
(
|
99
|
+
BSCAMPP and SCAMPP were tested on **Python 3.8 to 3.12**. There are two ways to install:
|
100
|
+
(1) with PyPI, or (2) from this GitHub repository. If you have any difficulties installing or running BSCAMPP or SCAMPP,
|
101
|
+
please contact Eleanor Wedell (ewedell2@illinois.edu).
|
95
102
|
|
96
103
|
### External requirements
|
97
|
-
|
98
|
-
|
99
|
-
|
104
|
+
* **Base placement method**:
|
105
|
+
EPA-ng and/or pplacer are requirements since BSCAMPP and SCAMPP will use them as the base phylogenetic placement methods.
|
106
|
+
By default, the software will search for binary executables of `pplacer` and `epa-ng` in the user's environment when running for the first time.
|
107
|
+
We also included a compiled version of `pplacer` for the Linux system under `bscampp/tools`.
|
108
|
+
* **C++ OpenMP**:
|
109
|
+
We also use OpenMP to speed up the similarity comparison between sequences using C++, which is required to run the pre-compiled binaries.
|
100
110
|
|
101
|
-
### (1) Install with `pip`
|
102
|
-
The easiest way to install BSCAMPP is to use `pip install`. This will also install all required Python packages.
|
111
|
+
### (1) Install with `pip`
|
112
|
+
The easiest way to install BSCAMPP and SCAMPP is to use `pip install`. This will also install all required Python packages.
|
103
113
|
|
104
114
|
```bash
|
105
115
|
# 1. install with pip (--user if no root access)
|
106
116
|
pip install bscampp [--user]
|
107
117
|
|
108
|
-
# 2.
|
118
|
+
# 2. Four binary executables will be installed. The first time
|
109
119
|
# running any will create a config file at
|
110
120
|
# ~/.bscampp/main.config that resolves the links to all
|
111
121
|
# external software (e.g., epa-ng, pplacer)
|
122
|
+
|
123
|
+
# ---- BSCAMPP functions
|
112
124
|
bscampp [-h] # or
|
113
125
|
run_bscampp.py [-h]
|
126
|
+
|
127
|
+
# ---- SCAMPP functions
|
128
|
+
scampp [-h] # or
|
129
|
+
run_scampp.py
|
114
130
|
```
|
115
131
|
|
116
132
|
### (2) Install from GitHub
|
@@ -132,22 +148,29 @@ git clone https://github.com/ewedell/BSCAMPP.git
|
|
132
148
|
# 2. Install all requirements
|
133
149
|
pip install -r requirements.txt
|
134
150
|
|
135
|
-
# 3. Execute BSCAMPP
|
151
|
+
# 3. Execute BSCAMPP/SCAMPP executables
|
136
152
|
python run_bscampp.py [-h]
|
153
|
+
python run_scampp.py [-h]
|
137
154
|
```
|
138
155
|
|
139
156
|
# Usage
|
140
157
|
All parameter settings can be found by running
|
141
158
|
```bash
|
142
|
-
run_bscampp.py -h
|
159
|
+
run_bscampp.py -h #OR
|
160
|
+
run_scampp.py -h
|
143
161
|
```
|
144
162
|
|
145
163
|
### (1) Default case (`epa-ng`)
|
146
164
|
```bash
|
165
|
+
# for BSCAMPP
|
147
166
|
run_bscampp.py -i [raxml best model] -t [reference tree] -a [alignment file]
|
167
|
+
|
168
|
+
# for SCAMPP
|
169
|
+
run_scampp.py -i [raxml best model] -t [reference tree] -a [alignment file]
|
148
170
|
```
|
149
|
-
|
150
|
-
|
171
|
+
BSCAMPP and SCAMPP in default mode run EPA-ng as the base method. `[alignment file]` should
|
172
|
+
contain both sequences from the placement tree and the query sequences to be placed.
|
173
|
+
This will create an output directory `bscampp_output` and write the placement results to
|
151
174
|
`bscampp_output/bscampp_result.jplace`.
|
152
175
|
|
153
176
|
### (2) Separately giving query alignment and finer control of outputs
|
@@ -160,7 +183,13 @@ run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment
|
|
160
183
|
### (3) Using `pplacer` as the base placement method
|
161
184
|
```bash
|
162
185
|
run_bscampp.py -i [logfile from either RAxML/FastTree] -t [reference tree] \
|
163
|
-
-a [reference alignment] -q [query sequence alignment]
|
186
|
+
-a [reference alignment] -q [query sequence alignment] \
|
187
|
+
--placement-method pplacer
|
188
|
+
```
|
189
|
+
### (4) Changing the number of votes to 15 for BSCAMPP
|
190
|
+
```bash
|
191
|
+
run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment] \
|
192
|
+
-q [query sequence alignment] -V 15
|
164
193
|
```
|
165
194
|
|
166
195
|
### More comprehensive usage
|
@@ -221,14 +250,23 @@ run_bscampp.py -i [logfile from either RAxML/FastTree] -t [reference tree] \
|
|
221
250
|
> Temporary file indexing. Default: 0
|
222
251
|
> --fragmentflag FRAGMENTFLAG
|
223
252
|
> If queries contains fragments. Default: True
|
253
|
+
> --subtreetype SUBTREETYPE
|
254
|
+
> (SCAMPP only) Options for collecting nodes for the
|
255
|
+
> subtree - d for edge weighted distances, n for node
|
256
|
+
> distances, h for Hamming distances. Default: d
|
224
257
|
> --keeptemp KEEPTEMP Boolean, True to keep all temporary files. Default:
|
225
258
|
False
|
226
259
|
```
|
227
260
|
|
228
261
|
|
229
262
|
# Example Code and Data
|
230
|
-
Example script and data are provided in this GitHub repository in `examples/`.
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
- `
|
263
|
+
Example script and data are provided in this GitHub repository in `examples/`.
|
264
|
+
The data is originally from the
|
265
|
+
[RNAsim-VS datasets](https://doi.org/10.1093/sysbio/syz063).
|
266
|
+
* `examples/run_bscampp.sh`: contains a simple script to test BSCAMPP with
|
267
|
+
`epa-ng` or `pplacer`, placing 200 query sequences to a 10000-leaf placement
|
268
|
+
tree. The info file is from RAxML-ng when running `epa-ng`, and from
|
269
|
+
FastTree-2 when running `pplacer`.
|
270
|
+
- `run_bscampp.sh` will invoke BSCAMPP with `epa-ng`.
|
271
|
+
- `run_bscampp.sh pplacer` will invoke BSCAMPP with `pplacer`.
|
272
|
+
* `examples/run_scampp.sh`: the same test script but running SCAMPP.
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# BSCAMPP -
|
1
|
+
# BSCAMPP and SCAMPP - Two Scalable Phylogenetic Placement Methods and Frameworks
|
2
2
|
[](https://pypi.org/project/bscampp/)
|
3
3
|
[](https://pypi.org/project/bscampp/#history)
|
4
4
|
[](https://github.com/ewedell/BSCAMPP/)
|
@@ -18,47 +18,63 @@
|
|
18
18
|
3. Alignment of query sequences (can be combined with ii.).
|
19
19
|
4. Tree info file.
|
20
20
|
- (EPA-ng as base method), RAxML-ng info file, typically with suffix `.bestModel`.
|
21
|
-
- (pplacer as base method), RAxML-ng or FastTree log file.
|
21
|
+
- (pplacer as base method), RAxML-ng or FastTree log file containing model parameters.
|
22
22
|
* **Output**
|
23
23
|
1. Placement results of query sequences in the reference tree in `.jplace` format.
|
24
24
|
|
25
25
|
|
26
|
-
|
27
|
-
BSCAMPP achieves some magnitudes of speedup compared to
|
26
|
+
SCAMPP and BSCAMPP are two scalable solutions for phylogenetic placement. SCAMPP is designed more for accuracy
|
27
|
+
and BSCAMPP is designed more for speed. BSCAMPP achieves some magnitudes of speedup compared to SCAMPP.
|
28
28
|
The core algorithm is described in detail at <https://doi.org/10.1101/2022.10.26.513936>.
|
29
|
-
In short,
|
30
|
-
|
29
|
+
In short, both frameworks use EPA-ng as the base placement method by default, allowing them to scale to placement trees
|
30
|
+
of at least ~200,000 leaves. Our two methods achieve this by extracting appropriate subtrees and assigning each query
|
31
|
+
to its most fitting subtree.
|
31
32
|
|
32
|
-
|
33
|
-
Currently, BSCAMPP
|
33
|
+
They are divide-and-conquer frameworks and can be used with any base placement method (e.g., `pplacer` as well).
|
34
|
+
Currently, BSCAMPP and SCAMPP are implemented with `epa-ng` and `pplacer`.
|
34
35
|
|
35
|
-
|
36
|
-
|
37
|
-
|
36
|
+
#### BSCAMPP
|
37
|
+
It is recommended that BSCAMPP be used with subtrees of size 2000 and with 5 votes based on current best results,
|
38
|
+
especially if sequences are fragmentary. Defaults for the subtree size and number of votes are set to 2,000 and
|
39
|
+
5 respectively (see [Usage](#usage) for more details on customizing BSCAMPP).
|
40
|
+
|
41
|
+
#### SCAMPP
|
42
|
+
SCAMPP is also implemented in BSCAMPP, originally from <https://github.com/chry04/PLUSplacer>.
|
43
|
+
Its default also uses EPA-ng and a subtree size of 2,000.
|
44
|
+
The user can invoke SCAMPP by running `run_scampp.py` or `scampp` (if installed with PyPI) after installation.
|
38
45
|
|
39
46
|
# Installation
|
40
|
-
BSCAMPP
|
41
|
-
(2) from this GitHub repository. If you have any difficulties installing or running BSCAMPP
|
42
|
-
(
|
47
|
+
BSCAMPP and SCAMPP were tested on **Python 3.8 to 3.12**. There are two ways to install:
|
48
|
+
(1) with PyPI, or (2) from this GitHub repository. If you have any difficulties installing or running BSCAMPP or SCAMPP,
|
49
|
+
please contact Eleanor Wedell (ewedell2@illinois.edu).
|
43
50
|
|
44
51
|
### External requirements
|
45
|
-
|
46
|
-
|
47
|
-
|
52
|
+
* **Base placement method**:
|
53
|
+
EPA-ng and/or pplacer are requirements since BSCAMPP and SCAMPP will use them as the base phylogenetic placement methods.
|
54
|
+
By default, the software will search for binary executables of `pplacer` and `epa-ng` in the user's environment when running for the first time.
|
55
|
+
We also included a compiled version of `pplacer` for the Linux system under `bscampp/tools`.
|
56
|
+
* **C++ OpenMP**:
|
57
|
+
We also use OpenMP to speed up the similarity comparison between sequences using C++, which is required to run the pre-compiled binaries.
|
48
58
|
|
49
|
-
### (1) Install with `pip`
|
50
|
-
The easiest way to install BSCAMPP is to use `pip install`. This will also install all required Python packages.
|
59
|
+
### (1) Install with `pip`
|
60
|
+
The easiest way to install BSCAMPP and SCAMPP is to use `pip install`. This will also install all required Python packages.
|
51
61
|
|
52
62
|
```bash
|
53
63
|
# 1. install with pip (--user if no root access)
|
54
64
|
pip install bscampp [--user]
|
55
65
|
|
56
|
-
# 2.
|
66
|
+
# 2. Four binary executables will be installed. The first time
|
57
67
|
# running any will create a config file at
|
58
68
|
# ~/.bscampp/main.config that resolves the links to all
|
59
69
|
# external software (e.g., epa-ng, pplacer)
|
70
|
+
|
71
|
+
# ---- BSCAMPP functions
|
60
72
|
bscampp [-h] # or
|
61
73
|
run_bscampp.py [-h]
|
74
|
+
|
75
|
+
# ---- SCAMPP functions
|
76
|
+
scampp [-h] # or
|
77
|
+
run_scampp.py
|
62
78
|
```
|
63
79
|
|
64
80
|
### (2) Install from GitHub
|
@@ -80,22 +96,29 @@ git clone https://github.com/ewedell/BSCAMPP.git
|
|
80
96
|
# 2. Install all requirements
|
81
97
|
pip install -r requirements.txt
|
82
98
|
|
83
|
-
# 3. Execute BSCAMPP
|
99
|
+
# 3. Execute BSCAMPP/SCAMPP executables
|
84
100
|
python run_bscampp.py [-h]
|
101
|
+
python run_scampp.py [-h]
|
85
102
|
```
|
86
103
|
|
87
104
|
# Usage
|
88
105
|
All parameter settings can be found by running
|
89
106
|
```bash
|
90
|
-
run_bscampp.py -h
|
107
|
+
run_bscampp.py -h #OR
|
108
|
+
run_scampp.py -h
|
91
109
|
```
|
92
110
|
|
93
111
|
### (1) Default case (`epa-ng`)
|
94
112
|
```bash
|
113
|
+
# for BSCAMPP
|
95
114
|
run_bscampp.py -i [raxml best model] -t [reference tree] -a [alignment file]
|
115
|
+
|
116
|
+
# for SCAMPP
|
117
|
+
run_scampp.py -i [raxml best model] -t [reference tree] -a [alignment file]
|
96
118
|
```
|
97
|
-
|
98
|
-
|
119
|
+
BSCAMPP and SCAMPP in default mode run EPA-ng as the base method. `[alignment file]` should
|
120
|
+
contain both sequences from the placement tree and the query sequences to be placed.
|
121
|
+
This will create an output directory `bscampp_output` and write the placement results to
|
99
122
|
`bscampp_output/bscampp_result.jplace`.
|
100
123
|
|
101
124
|
### (2) Separately giving query alignment and finer control of outputs
|
@@ -108,7 +131,13 @@ run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment
|
|
108
131
|
### (3) Using `pplacer` as the base placement method
|
109
132
|
```bash
|
110
133
|
run_bscampp.py -i [logfile from either RAxML/FastTree] -t [reference tree] \
|
111
|
-
-a [reference alignment] -q [query sequence alignment]
|
134
|
+
-a [reference alignment] -q [query sequence alignment] \
|
135
|
+
--placement-method pplacer
|
136
|
+
```
|
137
|
+
### (4) Changing the number of votes to 15 for BSCAMPP
|
138
|
+
```bash
|
139
|
+
run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment] \
|
140
|
+
-q [query sequence alignment] -V 15
|
112
141
|
```
|
113
142
|
|
114
143
|
### More comprehensive usage
|
@@ -169,14 +198,23 @@ run_bscampp.py -i [logfile from either RAxML/FastTree] -t [reference tree] \
|
|
169
198
|
> Temporary file indexing. Default: 0
|
170
199
|
> --fragmentflag FRAGMENTFLAG
|
171
200
|
> If queries contains fragments. Default: True
|
201
|
+
> --subtreetype SUBTREETYPE
|
202
|
+
> (SCAMPP only) Options for collecting nodes for the
|
203
|
+
> subtree - d for edge weighted distances, n for node
|
204
|
+
> distances, h for Hamming distances. Default: d
|
172
205
|
> --keeptemp KEEPTEMP Boolean, True to keep all temporary files. Default:
|
173
206
|
False
|
174
207
|
```
|
175
208
|
|
176
209
|
|
177
210
|
# Example Code and Data
|
178
|
-
Example script and data are provided in this GitHub repository in `examples/`.
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
- `
|
211
|
+
Example script and data are provided in this GitHub repository in `examples/`.
|
212
|
+
The data is originally from the
|
213
|
+
[RNAsim-VS datasets](https://doi.org/10.1093/sysbio/syz063).
|
214
|
+
* `examples/run_bscampp.sh`: contains a simple script to test BSCAMPP with
|
215
|
+
`epa-ng` or `pplacer`, placing 200 query sequences to a 10000-leaf placement
|
216
|
+
tree. The info file is from RAxML-ng when running `epa-ng`, and from
|
217
|
+
FastTree-2 when running `pplacer`.
|
218
|
+
- `run_bscampp.sh` will invoke BSCAMPP with `epa-ng`.
|
219
|
+
- `run_bscampp.sh pplacer` will invoke BSCAMPP with `pplacer`.
|
220
|
+
* `examples/run_scampp.sh`: the same test script but running SCAMPP.
|
@@ -71,6 +71,9 @@ def getClosestLeaves(aln_path, qaln_path, aln, qaln, workdir, dry_run=False):
|
|
71
71
|
query_votes_dict = dict()
|
72
72
|
query_top_vote_dict = dict()
|
73
73
|
tmp_output = os.path.join(workdir, 'closest.txt')
|
74
|
+
|
75
|
+
if Configs.subtreetype == "h":
|
76
|
+
Configs.votes = Configs.subtreesize
|
74
77
|
|
75
78
|
cmd = []
|
76
79
|
if Configs.similarityflag:
|
@@ -226,6 +229,56 @@ def assignQueriesToSubtrees(query_votes_dict, query_top_vote_dict,
|
|
226
229
|
_LOG.info('Time to assign queries to subtrees: {} seconds'.format(t1 - t0))
|
227
230
|
return new_subtree_dict, placed_query_list
|
228
231
|
|
232
|
+
|
233
|
+
'''
|
234
|
+
Function to assign queries to subtrees as used in SCAMPP
|
235
|
+
(subtrees are built using the nearest leaf as the seed sequence)
|
236
|
+
'''
|
237
|
+
def buildQuerySubtrees(query_votes_dict, query_top_vote_dict,
|
238
|
+
tree, leaf_dict, dry_run=False):
|
239
|
+
t0 = time.perf_counter()
|
240
|
+
_LOG.info('(SCAMPP) Building query subtree for placement...')
|
241
|
+
|
242
|
+
if dry_run:
|
243
|
+
return dict(), []
|
244
|
+
|
245
|
+
# (1) go over the query seed sequences to see if any queries use
|
246
|
+
# the same seed sequence (i.e. subtree)
|
247
|
+
seed_queries = dict()
|
248
|
+
for query, closest_leaf in query_top_vote_dict.items():
|
249
|
+
if closest_leaf not in seed_queries:
|
250
|
+
seed_queries[closest_leaf] = [query]
|
251
|
+
else:
|
252
|
+
seed_queries[closest_leaf].append(query)
|
253
|
+
|
254
|
+
new_subtree_dict = dict()
|
255
|
+
# assign queries to subtrees, and remove them from the pool
|
256
|
+
# repeat until all queries are assigned
|
257
|
+
_total = 0
|
258
|
+
for seed_label, queries in seed_queries.items():
|
259
|
+
####### additional logging for tracking progress
|
260
|
+
_total += 1
|
261
|
+
if _total % 1000 == 0 or _total == len(seed_queries):
|
262
|
+
_LOG.info(f"- Built {_total}/{len(seed_queries)} subtrees")
|
263
|
+
|
264
|
+
node_y = leaf_dict[seed_label]
|
265
|
+
# extract [subtreesize] leaves
|
266
|
+
if Configs.subtreetype == "h":
|
267
|
+
labels = query_votes_dict[queries[0]]
|
268
|
+
elif Configs.subtreetype == "n":
|
269
|
+
labels = utils.subtree_nodes(tree, node_y, Configs.subtreesize)
|
270
|
+
else:
|
271
|
+
labels = utils.subtree_nodes_with_edge_length(tree, node_y,
|
272
|
+
Configs.subtreesize)
|
273
|
+
subtree = tree.extract_tree_with(labels)
|
274
|
+
new_subtree_dict[subtree] = queries
|
275
|
+
|
276
|
+
placed_query_list = []
|
277
|
+
|
278
|
+
t1 = time.perf_counter()
|
279
|
+
_LOG.info('Time to assign queries to subtrees: {} seconds'.format(t1 - t0))
|
280
|
+
return new_subtree_dict, placed_query_list
|
281
|
+
|
229
282
|
'''
|
230
283
|
Helper function to run a single placement task. Designed to use with
|
231
284
|
multiprocessing
|
@@ -263,12 +316,16 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
263
316
|
if len(query_list) == 0:
|
264
317
|
continue
|
265
318
|
final_subtree_count += 1
|
319
|
+
|
320
|
+
subtree_dir = os.path.join(workdir, f'subtree_{final_subtree_count}')
|
321
|
+
if not os.path.isdir(subtree_dir):
|
322
|
+
os.makedirs(subtree_dir)
|
266
323
|
|
267
324
|
# name all temporary output files
|
268
|
-
tmp_tree = os.path.join(
|
269
|
-
tmp_aln = os.path.join(
|
270
|
-
tmp_qaln = os.path.join(
|
271
|
-
tmp_output = os.path.join(
|
325
|
+
tmp_tree = os.path.join(subtree_dir, 'tree')
|
326
|
+
tmp_aln = os.path.join(subtree_dir, f'subtree_{final_subtree_count}_aln.fa')
|
327
|
+
tmp_qaln = os.path.join(subtree_dir, f'subtree_{final_subtree_count}_qaln.fa')
|
328
|
+
tmp_output = os.path.join(subtree_dir,
|
272
329
|
'subtree_{}_{}.jplace'.format(
|
273
330
|
final_subtree_count, Configs.placement_method))
|
274
331
|
|
@@ -292,13 +349,13 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
292
349
|
job = EPAngJob(path=Configs.epang_path,
|
293
350
|
info_path=Configs.info_path, tree_path=tmp_tree,
|
294
351
|
aln_path=tmp_aln, qaln_path=tmp_qaln,
|
295
|
-
outdir=
|
352
|
+
outdir=subtree_dir, num_cpus=Configs.num_cpus)
|
296
353
|
# for EPA-ng, ensure that outpath name is changed to the one we want
|
297
|
-
_outpath = job.run()
|
354
|
+
_outpath = job.run(logging=f'subtree_{final_subtree_count}')
|
298
355
|
os.system('mv {} {}'.format(_outpath, tmp_output))
|
299
356
|
elif Configs.placement_method == 'pplacer':
|
300
357
|
# build ref_pkg with info and tmp_tree and tmp_aln
|
301
|
-
refpkg_dir = os.path.join(
|
358
|
+
refpkg_dir = os.path.join(subtree_dir,
|
302
359
|
f'subtree_{final_subtree_count}.refpkg')
|
303
360
|
taxit_job = TaxtasticJob(path=Configs.taxit_path,
|
304
361
|
outdir=refpkg_dir, name=f'subtree_{final_subtree_count}',
|
@@ -311,7 +368,7 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
311
368
|
refpkg_dir=refpkg_dir, model=Configs.model,
|
312
369
|
outpath=tmp_output, num_cpus=Configs.num_cpus,
|
313
370
|
qaln_path=tmp_qaln)
|
314
|
-
tmp_output = job.run()
|
371
|
+
tmp_output = job.run(logging=f'subtree_{final_subtree_count}')
|
315
372
|
else:
|
316
373
|
raise ValueError(
|
317
374
|
f"Placement method {Configs.placement_method} not recognized")
|
@@ -3,7 +3,7 @@ from subprocess import Popen
|
|
3
3
|
from abc import abstractmethod
|
4
4
|
|
5
5
|
from bscampp import get_logger, log_exception
|
6
|
-
from bscampp.configs import Configs
|
6
|
+
#from bscampp.configs import Configs
|
7
7
|
|
8
8
|
_LOG = get_logger(__name__)
|
9
9
|
|
@@ -25,7 +25,7 @@ class Job(object):
|
|
25
25
|
return self.pid
|
26
26
|
|
27
27
|
# run the job with given invocation and raise errors when encountered
|
28
|
-
def run(self, stdin="", lock=None, logging=
|
28
|
+
def run(self, stdin="", lock=None, logging=None, shell=False):
|
29
29
|
try:
|
30
30
|
cmd, outpath = self.get_invocation()
|
31
31
|
_LOG.debug(f'Running job_type: {self.job_type}, output: {outpath}')
|
@@ -57,18 +57,22 @@ class Job(object):
|
|
57
57
|
# logging to local or to PIPE
|
58
58
|
stderr, stdout = '', ''
|
59
59
|
scmd = ' '.join(cmd)
|
60
|
-
if logging:
|
60
|
+
if logging != None:
|
61
61
|
logpath = os.path.join(
|
62
|
-
os.path.dirname(outpath),
|
62
|
+
os.path.dirname(outpath),
|
63
|
+
f'{logging}_{self.job_type}.txt')
|
63
64
|
outlogging = open(logpath, 'w', 1)
|
64
65
|
|
65
66
|
# TODO: may need to deal with piping in the future, for now
|
66
67
|
# it is not needed
|
67
68
|
p = Popen(cmd, text=True, bufsize=1,
|
68
69
|
stdin=subprocess.PIPE,
|
69
|
-
stdout=outlogging, stderr=
|
70
|
+
stdout=outlogging, stderr=outlogging)
|
70
71
|
self.pid = p.pid
|
71
72
|
stdout, stderr = p.communicate(input=stdin)
|
73
|
+
# stdout and stderr are both written to outlogging
|
74
|
+
# hence, assign them to be empty strings
|
75
|
+
stdout, stderr = '', ''
|
72
76
|
outlogging.close()
|
73
77
|
else:
|
74
78
|
p = Popen(cmd, text=True, bufsize=1,
|
@@ -92,16 +96,22 @@ class Job(object):
|
|
92
96
|
else:
|
93
97
|
error_msg = ' '.join([f'Error occurred running {self.job_type}.',
|
94
98
|
f'returncode: {self.returncode}'])
|
99
|
+
if logging != None:
|
100
|
+
logpath = '\nLOGPATH: ' + os.path.join(
|
101
|
+
os.path.dirname(outpath),
|
102
|
+
f'{logging}_{self.job_type}.txt')
|
103
|
+
else:
|
104
|
+
logpath = ''
|
95
105
|
if lock:
|
96
106
|
try:
|
97
107
|
lock.acquire()
|
98
108
|
_LOG.error(error_msg + '\nSTDOUT: ' + stdout +
|
99
|
-
'\nSTDERR: ' + stderr)
|
109
|
+
'\nSTDERR: ' + stderr + logpath)
|
100
110
|
finally:
|
101
111
|
lock.release()
|
102
112
|
else:
|
103
113
|
_LOG.error(error_msg + '\nSTDOUT: ' + stdout +
|
104
|
-
'\nSTDERR: ' + stderr)
|
114
|
+
'\nSTDERR: ' + stderr + logpath)
|
105
115
|
exit(1)
|
106
116
|
except Exception:
|
107
117
|
log_exception(_LOG)
|
@@ -177,7 +187,7 @@ A pplacer job that uses taxtastic refpkg to place sequences
|
|
177
187
|
class PplacerTaxtasticJob(Job):
|
178
188
|
def __init__(self, **kwargs):
|
179
189
|
Job.__init__(self)
|
180
|
-
self.job_type = 'pplacer
|
190
|
+
self.job_type = 'pplacer'
|
181
191
|
|
182
192
|
self.path = ''
|
183
193
|
self.refpkg_dir = ''
|
@@ -89,6 +89,79 @@ def bscampp_pipeline(*args, **kwargs):
|
|
89
89
|
else:
|
90
90
|
return False
|
91
91
|
|
92
|
+
|
93
|
+
# main pipeline for SCAMPP
|
94
|
+
def scampp_pipeline(*args, **kwargs):
|
95
|
+
t0 = time.perf_counter()
|
96
|
+
m = Manager(); lock = m.Lock()
|
97
|
+
|
98
|
+
# set up a dry run if specified
|
99
|
+
dry_run = False
|
100
|
+
if 'dry_run' in kwargs and isinstance(kwargs['dry_run'], bool):
|
101
|
+
dry_run = kwargs['dry_run']
|
102
|
+
|
103
|
+
# parse command line arguments and build configurations
|
104
|
+
parser, cmdline_args = parseArguments(dry_run=dry_run, method="SCAMPP")
|
105
|
+
|
106
|
+
# initialize multiprocessing (if needed)
|
107
|
+
_LOG.warning('Initializing ProcessPoolExecutor...')
|
108
|
+
pool = ProcessPoolExecutor(Configs.num_cpus, initializer=initial_pool,
|
109
|
+
initargs=(parser, cmdline_args,))
|
110
|
+
|
111
|
+
# (0) temporary files wrote to here
|
112
|
+
if not dry_run:
|
113
|
+
workdir = os.path.join(Configs.outdir, f'tmp{Configs.tmpfilenbr}')
|
114
|
+
try:
|
115
|
+
if not os.path.isdir(workdir):
|
116
|
+
os.makedirs(workdir)
|
117
|
+
except OSError:
|
118
|
+
log_exception(_LOG)
|
119
|
+
else:
|
120
|
+
workdir = os.getcwd()
|
121
|
+
|
122
|
+
# (1) read in tree, alignment, and separate reference sequences from
|
123
|
+
# query sequences
|
124
|
+
tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir,
|
125
|
+
dry_run=dry_run)
|
126
|
+
|
127
|
+
# (2) compute closest leaves for all query sequences
|
128
|
+
query_votes_dict, query_top_vote_dict = getClosestLeaves(
|
129
|
+
aln_path, qaln_path, aln, qaln, workdir, dry_run=dry_run)
|
130
|
+
|
131
|
+
# (3) first assign each query to the subtree built using the closest
|
132
|
+
# leaf as the seed sequence
|
133
|
+
new_subtree_dict, placed_query_list = buildQuerySubtrees(
|
134
|
+
query_votes_dict, query_top_vote_dict, tree, leaf_dict,
|
135
|
+
dry_run=dry_run)
|
136
|
+
|
137
|
+
# (4) perform placement for each subtree
|
138
|
+
output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
|
139
|
+
placed_query_list, aln, qaln, cmdline_args, workdir, pool, lock,
|
140
|
+
dry_run=dry_run)
|
141
|
+
|
142
|
+
# (5) write the output jplace to local
|
143
|
+
writeOutputJplace(output_jplace, dry_run=dry_run)
|
144
|
+
|
145
|
+
# shutdown pool
|
146
|
+
_LOG.warning('Shutting down ProcessPoolExecutor...')
|
147
|
+
pool.shutdown()
|
148
|
+
_LOG.warning('ProcessPoolExecutor shut down.')
|
149
|
+
|
150
|
+
# clean up temp files if not keeping
|
151
|
+
if not Configs.keeptemp:
|
152
|
+
_LOG.info('Removing temporary files...')
|
153
|
+
clean_temp_files()
|
154
|
+
|
155
|
+
# stop SCAMPP
|
156
|
+
send = time.perf_counter()
|
157
|
+
_LOG.info('SCAMPP completed in {} seconds...'.format(send - t0))
|
158
|
+
|
159
|
+
if dry_run:
|
160
|
+
return True
|
161
|
+
else:
|
162
|
+
return False
|
163
|
+
|
164
|
+
|
92
165
|
def clean_temp_files():
|
93
166
|
# all temporary files/directories to remove
|
94
167
|
temp_items = [f'tmp{Configs.tmpfilenbr}']
|
@@ -102,10 +175,14 @@ def clean_temp_files():
|
|
102
175
|
continue
|
103
176
|
_LOG.info(f'- Removed {temp}')
|
104
177
|
|
105
|
-
def parseArguments(dry_run=False):
|
178
|
+
def parseArguments(dry_run=False, method="BSCAMPP"):
|
106
179
|
global _root_dir, main_config_path
|
107
180
|
|
108
|
-
|
181
|
+
default_outdir = f"{method.lower()}_output"
|
182
|
+
default_outname = f"{method.lower()}_result"
|
183
|
+
|
184
|
+
parser = _init_parser(default_outdir=default_outdir,
|
185
|
+
default_outname=default_outname)
|
109
186
|
cmdline_args = sys.argv[1:]
|
110
187
|
|
111
188
|
if dry_run:
|
@@ -114,22 +191,27 @@ def parseArguments(dry_run=False):
|
|
114
191
|
|
115
192
|
# build config
|
116
193
|
buildConfigs(parser, cmdline_args)
|
117
|
-
_LOG.info('
|
194
|
+
_LOG.info('{} is running with: {}'.format(method,
|
118
195
|
' '.join(cmdline_args)))
|
119
196
|
getConfigs()
|
120
197
|
|
121
198
|
return parser, cmdline_args
|
122
199
|
|
123
|
-
def _init_parser(
|
200
|
+
def _init_parser(default_outdir="bscampp_output",
|
201
|
+
default_outname="bscampp_result"):
|
124
202
|
# example usage
|
125
203
|
example_usages = '''Example usages:
|
126
|
-
>
|
127
|
-
%(prog)s -i raxml.
|
204
|
+
> (1) Default
|
205
|
+
%(prog)s -i raxml.bestModel -t reference.tre -a alignment.fa
|
206
|
+
> (2) Separate alignment file for query sequences
|
207
|
+
%(prog)s -i raxml.bestModel -t reference.tre -a reference.fa -q query.fa
|
208
|
+
> (3) Use pplacer instead of EPA-ng as base method (need RAxML-ng info or FastTree log file)
|
209
|
+
%(prog)s -i fasttree.log -t reference.tre -a alignment.fa --placement-method pplacer
|
128
210
|
'''
|
129
211
|
|
130
212
|
parser = ArgumentParser(
|
131
213
|
description=(
|
132
|
-
"This program runs BSCAMPP, a scalable phylogenetic "
|
214
|
+
"This program runs BSCAMPP/SCAMPP, a scalable phylogenetic "
|
133
215
|
"placement framework that scales EPA-ng/pplacer "
|
134
216
|
"to very large tree placement."
|
135
217
|
),
|
@@ -156,7 +238,7 @@ def _init_parser():
|
|
156
238
|
# basic group
|
157
239
|
basic_group = parser.add_argument_group(
|
158
240
|
"Basic parameters".upper(),
|
159
|
-
"These are the basic parameters for BSCAMPP.")
|
241
|
+
"These are the basic parameters for BSCAMPP/SCAMPP.")
|
160
242
|
parser.groups['basic_group'] = basic_group
|
161
243
|
|
162
244
|
basic_group.add_argument('--placement-method', type=str,
|
@@ -185,10 +267,10 @@ def _init_parser():
|
|
185
267
|
required=False, default=None)
|
186
268
|
basic_group.add_argument("-d", "--outdir", type=str,
|
187
269
|
help="Directory path for output. Default: bscampp_output/",
|
188
|
-
required=False, default=
|
270
|
+
required=False, default=default_outdir)
|
189
271
|
basic_group.add_argument("-o", "--output", type=str, dest="outname",
|
190
272
|
help="Output file name. Default: bscampp_result.jplace",
|
191
|
-
required=False, default="
|
273
|
+
required=False, default=f"{default_outname}.jplace")
|
192
274
|
basic_group.add_argument("--threads", "--num-cpus", type=int,
|
193
275
|
dest="num_cpus",
|
194
276
|
help="Number of cores for parallelization, default: -1 (all)",
|
@@ -209,7 +291,8 @@ def _init_parser():
|
|
209
291
|
help="Integer size of the subtree. Default: 2000",
|
210
292
|
required=False, default=2000)
|
211
293
|
advance_group.add_argument("-V", "--votes", type=int,
|
212
|
-
help="Number of votes per
|
294
|
+
help="This is only used for BSCAMPP! Number of votes per "
|
295
|
+
"query sequence. Default: 5",
|
213
296
|
required=False, default=5)
|
214
297
|
advance_group.add_argument("--similarityflag", type=str2bool,
|
215
298
|
help="Boolean, True if maximizing sequence similarity "
|
@@ -228,6 +311,12 @@ def _init_parser():
|
|
228
311
|
misc_group.add_argument("--fragmentflag", type=str2bool,
|
229
312
|
help="If queries contains fragments. Default: True",
|
230
313
|
required=False, default=True)
|
314
|
+
misc_group.add_argument("--subtreetype", type=str,
|
315
|
+
help="(SCAMPP only) Options for collecting "
|
316
|
+
"nodes for the subtree - d for edge weighted "
|
317
|
+
"distances, n for node distances, h for Hamming "
|
318
|
+
"distances. Default: d",
|
319
|
+
required=False, default='d')
|
231
320
|
misc_group.add_argument("--keeptemp", type=str2bool,
|
232
321
|
help="Boolean, True to keep all temporary files. "
|
233
322
|
"Default: False",
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: bscampp
|
3
|
-
Version: 1.0.
|
3
|
+
Version: 1.0.2b0
|
4
4
|
Summary: BSCAMPP - A Scalable Phylogenetic Placement Tool
|
5
5
|
Author-email: Eleanor Wedell <ewedell2@illinois.edu>, Chengze Shen <chengze5@illinois.edu>
|
6
6
|
License: MIT License
|
@@ -50,7 +50,7 @@ Requires-Dist: numpy>=1.21.6
|
|
50
50
|
Requires-Dist: treeswift>=1.1.45
|
51
51
|
Requires-Dist: taxtastic>=0.9.3
|
52
52
|
|
53
|
-
# BSCAMPP -
|
53
|
+
# BSCAMPP and SCAMPP - Two Scalable Phylogenetic Placement Methods and Frameworks
|
54
54
|
[](https://pypi.org/project/bscampp/)
|
55
55
|
[](https://pypi.org/project/bscampp/#history)
|
56
56
|
[](https://github.com/ewedell/BSCAMPP/)
|
@@ -70,47 +70,63 @@ Requires-Dist: taxtastic>=0.9.3
|
|
70
70
|
3. Alignment of query sequences (can be combined with ii.).
|
71
71
|
4. Tree info file.
|
72
72
|
- (EPA-ng as base method), RAxML-ng info file, typically with suffix `.bestModel`.
|
73
|
-
- (pplacer as base method), RAxML-ng or FastTree log file.
|
73
|
+
- (pplacer as base method), RAxML-ng or FastTree log file containing model parameters.
|
74
74
|
* **Output**
|
75
75
|
1. Placement results of query sequences in the reference tree in `.jplace` format.
|
76
76
|
|
77
77
|
|
78
|
-
|
79
|
-
BSCAMPP achieves some magnitudes of speedup compared to
|
78
|
+
SCAMPP and BSCAMPP are two scalable solutions for phylogenetic placement. SCAMPP is designed more for accuracy
|
79
|
+
and BSCAMPP is designed more for speed. BSCAMPP achieves a speedup of some orders of magnitude compared to SCAMPP.
|
80
80
|
The core algorithm is described in detail at <https://doi.org/10.1101/2022.10.26.513936>.
|
81
|
-
In short,
|
82
|
-
|
81
|
+
In short, both frameworks use EPA-ng as the base placement method by default, allowing it to scale to placement trees
|
82
|
+
of at least ~200,000 leaves. Our two methods achieve this by extracting appropriate subtrees and assigning each query
|
83
|
+
to its most fitting subtree.
|
83
84
|
|
84
|
-
|
85
|
-
Currently, BSCAMPP
|
85
|
+
They are divide-and-conquer frameworks and can be used with any base placement method (e.g., `pplacer`).
|
86
|
+
Currently, BSCAMPP and SCAMPP are implemented with `epa-ng` and `pplacer`.
|
86
87
|
|
87
|
-
|
88
|
-
|
89
|
-
|
88
|
+
#### BSCAMPP
|
89
|
+
It is recommended that BSCAMPP be used with subtrees of size 2000 and with 5 votes based on current best results,
|
90
|
+
especially if sequences are fragmentary. Defaults for the subtree size and number of votes are set to 2,000 and
|
91
|
+
5 respectively (see [Usage](#usage) for more details on customizing BSCAMPP).
|
92
|
+
|
93
|
+
#### SCAMPP
|
94
|
+
SCAMPP is also implemented in BSCAMPP, originally from <https://github.com/chry04/PLUSplacer>.
|
95
|
+
Its default also uses EPA-ng and a subtree size of 2,000.
|
96
|
+
The user can invoke SCAMPP by running `run_scampp.py` or `scampp` (if installed with PyPI) after installation.
|
90
97
|
|
91
98
|
# Installation
|
92
|
-
BSCAMPP
|
93
|
-
(2) from this GitHub repository. If you have any difficulties installing or running BSCAMPP
|
94
|
-
(
|
99
|
+
BSCAMPP and SCAMPP were tested on **Python 3.8 to 3.12**. There are two ways to install:
|
100
|
+
(1) with PyPI, or (2) from this GitHub repository. If you have any difficulties installing or running BSCAMPP or SCAMPP,
|
101
|
+
please contact Eleanor Wedell (ewedell2@illinois.edu).
|
95
102
|
|
96
103
|
### External requirements
|
97
|
-
|
98
|
-
|
99
|
-
|
104
|
+
* **Base placement method**:
|
105
|
+
EPA-ng and/or pplacer are requirements since BSCAMPP and SCAMPP will use them as the base phylogenetic placement methods.
|
106
|
+
By default, the software will search for binary executables of `pplacer` and `epa-ng` in the user's environment when running for the first time.
|
107
|
+
We also included a compiled version of `pplacer` for the Linux system under `bscampp/tools`.
|
108
|
+
* **C++ OpenMP**:
|
109
|
+
We also use OpenMP to speed up the similarity comparison between sequences using C++, which is required to run the pre-compiled binaries.
|
100
110
|
|
101
|
-
### (1) Install with `pip`
|
102
|
-
The easiest way to install BSCAMPP is to use `pip install`. This will also install all required Python packages.
|
111
|
+
### (1) Install with `pip`
|
112
|
+
The easiest way to install BSCAMPP and SCAMPP is to use `pip install`. This will also install all required Python packages.
|
103
113
|
|
104
114
|
```bash
|
105
115
|
# 1. install with pip (--user if no root access)
|
106
116
|
pip install bscampp [--user]
|
107
117
|
|
108
|
-
# 2.
|
118
|
+
# 2. Four binary executables will be installed. The first time
|
109
119
|
# running any will create a config file at
|
110
120
|
# ~/.bscampp/main.config that resolves the links to all
|
111
121
|
# external software (e.g., epa-ng, pplacer)
|
122
|
+
|
123
|
+
# ---- BSCAMPP functions
|
112
124
|
bscampp [-h] # or
|
113
125
|
run_bscampp.py [-h]
|
126
|
+
|
127
|
+
# ---- SCAMPP functions
|
128
|
+
scampp [-h] # or
|
129
|
+
run_scampp.py [-h]
|
114
130
|
```
|
115
131
|
|
116
132
|
### (2) Install from GitHub
|
@@ -132,22 +148,29 @@ git clone https://github.com/ewedell/BSCAMPP.git
|
|
132
148
|
# 2. Install all requirements
|
133
149
|
pip install -r requirements.txt
|
134
150
|
|
135
|
-
# 3. Execute BSCAMPP
|
151
|
+
# 3. Execute BSCAMPP/SCAMPP executables
|
136
152
|
python run_bscampp.py [-h]
|
153
|
+
python run_scampp.py [-h]
|
137
154
|
```
|
138
155
|
|
139
156
|
# Usage
|
140
157
|
All parameter settings can be found by running
|
141
158
|
```bash
|
142
|
-
run_bscampp.py -h
|
159
|
+
run_bscampp.py -h #OR
|
160
|
+
run_scampp.py -h
|
143
161
|
```
|
144
162
|
|
145
163
|
### (1) Default case (`epa-ng`)
|
146
164
|
```bash
|
165
|
+
# for BSCAMPP
|
147
166
|
run_bscampp.py -i [raxml best model] -t [reference tree] -a [alignment file]
|
167
|
+
|
168
|
+
# for SCAMPP
|
169
|
+
run_scampp.py -i [raxml best model] -t [reference tree] -a [alignment file]
|
148
170
|
```
|
149
|
-
|
150
|
-
|
171
|
+
BSCAMPP and SCAMPP in default mode run EPA-ng as the base method. `[alignment file]` should
|
172
|
+
contain both sequences from the placement tree and the query sequences to be placed.
|
173
|
+
This will create an output directory `bscampp_output` and write the placement results to
|
151
174
|
`bscampp_output/bscampp_result.jplace` (for SCAMPP, the defaults are `scampp_output` and `scampp_output/scampp_result.jplace`).
|
152
175
|
|
153
176
|
### (2) Separately giving query alignment and finer control of outputs
|
@@ -160,7 +183,13 @@ run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment
|
|
160
183
|
### (3) Using `pplacer` as the base placement method
|
161
184
|
```bash
|
162
185
|
run_bscampp.py -i [logfile from either RAxML/FastTree] -t [reference tree] \
|
163
|
-
-a [reference alignment] -q [query sequence alignment]
|
186
|
+
-a [reference alignment] -q [query sequence alignment] \
|
187
|
+
--placement-method pplacer
|
188
|
+
```
|
189
|
+
### (4) Changing the number of votes to 15 for BSCAMPP
|
190
|
+
```bash
|
191
|
+
run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment] \
|
192
|
+
-q [query sequence alignment] -V 15
|
164
193
|
```
|
165
194
|
|
166
195
|
### More comprehensive usage
|
@@ -221,14 +250,23 @@ run_bscampp.py -i [logfile from either RAxML/FastTree] -t [reference tree] \
|
|
221
250
|
> Temporary file indexing. Default: 0
|
222
251
|
> --fragmentflag FRAGMENTFLAG
|
223
252
|
> If queries contains fragments. Default: True
|
253
|
+
> --subtreetype SUBTREETYPE
|
254
|
+
> (SCAMPP only) Options for collecting nodes for the
|
255
|
+
> subtree - d for edge weighted distances, n for node
|
256
|
+
> distances, h for Hamming distances. Default: d
|
224
257
|
> --keeptemp KEEPTEMP Boolean, True to keep all temporary files. Default:
|
225
258
|
False
|
226
259
|
```
|
227
260
|
|
228
261
|
|
229
262
|
# Example Code and Data
|
230
|
-
Example script and data are provided in this GitHub repository in `examples/`.
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
- `
|
263
|
+
Example script and data are provided in this GitHub repository in `examples/`.
|
264
|
+
The data is originally from the
|
265
|
+
[RNAsim-VS datasets](https://doi.org/10.1093/sysbio/syz063).
|
266
|
+
* `examples/run_bscampp.sh`: contains a simple script to test BSCAMPP with
|
267
|
+
`epa-ng` or `pplacer`, placing 200 query sequences to a 10000-leaf placement
|
268
|
+
tree. The info file is from RAxML-ng when running `epa-ng`, and from
|
269
|
+
FastTree-2 when running `pplacer`.
|
270
|
+
- `run_bscampp.sh` will invoke BSCAMPP with `epa-ng`.
|
271
|
+
- `run_bscampp.sh pplacer` will invoke BSCAMPP with `pplacer`.
|
272
|
+
* `examples/run_scampp.sh`: the same test script but running SCAMPP.
|
@@ -37,6 +37,8 @@ classifiers = [
|
|
37
37
|
[project.scripts]
|
38
38
|
bscampp = "bscampp.pipeline:bscampp_pipeline"
|
39
39
|
"run_bscampp.py" = "bscampp.pipeline:bscampp_pipeline"
|
40
|
+
scampp = "bscampp.pipeline:scampp_pipeline"
|
41
|
+
"run_scampp.py" = "bscampp.pipeline:scampp_pipeline"
|
40
42
|
|
41
43
|
[project.urls]
|
42
44
|
Homepage = "https://github.com/ewedell/BSCAMPP"
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# tests/test_dry_run.py
|
2
|
+
import pytest, os
|
3
|
+
from bscampp.pipeline import bscampp_pipeline, scampp_pipeline
|
4
|
+
|
5
|
+
# test BSCAMPP
|
6
|
+
def test_bscampp_pipeline():
|
7
|
+
res = bscampp_pipeline(dry_run=True)
|
8
|
+
assert res == True
|
9
|
+
|
10
|
+
# remove bscampp_output that's created
|
11
|
+
if os.path.isdir('bscampp_output'):
|
12
|
+
os.rmdir('bscampp_output')
|
13
|
+
|
14
|
+
# test SCAMPP (almost the same as BSCAMPP)
|
15
|
+
def test_scampp_pipeline():
|
16
|
+
res = scampp_pipeline(dry_run=True)
|
17
|
+
assert res == True
|
18
|
+
|
19
|
+
# remove scampp_output that's created
|
20
|
+
if os.path.isdir('scampp_output'):
|
21
|
+
os.rmdir('scampp_output')
|
@@ -1,11 +0,0 @@
|
|
1
|
-
# tests/test_dry_run.py
|
2
|
-
import pytest, os
|
3
|
-
from bscampp.pipeline import bscampp_pipeline
|
4
|
-
|
5
|
-
def test_bscampp_pipeline():
|
6
|
-
res = bscampp_pipeline(dry_run=True)
|
7
|
-
assert res == True
|
8
|
-
|
9
|
-
# remove bscampp_output that's created
|
10
|
-
if os.path.isdir('bscampp_output'):
|
11
|
-
os.rmdir('bscampp_output')
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp
RENAMED
File without changes
|
{bscampp-1.0.1b0 → bscampp-1.0.2b0}/bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|