bscampp 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bscampp/__init__.py +68 -0
- bscampp/configs.py +169 -0
- bscampp/default.config +5 -0
- bscampp/functions.py +409 -0
- bscampp/init_configs.py +93 -0
- bscampp/jobs.py +198 -0
- bscampp/pipeline.py +249 -0
- bscampp/tools/epa-ng +0 -0
- bscampp/tools/hamming_distance/CMakeLists.txt +13 -0
- bscampp/tools/hamming_distance/fragment_hamming +0 -0
- bscampp/tools/hamming_distance/hamming +0 -0
- bscampp/tools/hamming_distance/homology +0 -0
- bscampp/tools/hamming_distance/src/fragment_hamming.cpp +180 -0
- bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp +183 -0
- bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp +214 -0
- bscampp/tools/hamming_distance/src/homology.cpp +179 -0
- bscampp/tools/hamming_distance/src/new_hamming.cpp +161 -0
- bscampp/tools/pplacer +0 -0
- bscampp/utils.py +914 -0
- bscampp-1.0.1.dist-info/LICENSE +21 -0
- bscampp-1.0.1.dist-info/METADATA +234 -0
- bscampp-1.0.1.dist-info/RECORD +25 -0
- bscampp-1.0.1.dist-info/WHEEL +5 -0
- bscampp-1.0.1.dist-info/entry_points.txt +3 -0
- bscampp-1.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2022 ewedell
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -0,0 +1,234 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: bscampp
|
3
|
+
Version: 1.0.1
|
4
|
+
Summary: BSCAMPP - A Scalable Phylogenetic Placement Tool
|
5
|
+
Author-email: Eleanor Wedell <ewedell2@illinois.edu>, Chengze Shen <chengze5@illinois.edu>
|
6
|
+
License: MIT License
|
7
|
+
|
8
|
+
Copyright (c) 2022 ewedell
|
9
|
+
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
12
|
+
in the Software without restriction, including without limitation the rights
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
15
|
+
furnished to do so, subject to the following conditions:
|
16
|
+
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
18
|
+
copies or substantial portions of the Software.
|
19
|
+
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
26
|
+
SOFTWARE.
|
27
|
+
|
28
|
+
Project-URL: Homepage, https://github.com/ewedell/BSCAMPP
|
29
|
+
Project-URL: Changelog, https://github.com/ewedell/BSCAMPP/blob/main/CHANGELOG.md
|
30
|
+
Classifier: Development Status :: 4 - Beta
|
31
|
+
Classifier: Operating System :: OS Independent
|
32
|
+
Classifier: Intended Audience :: Developers
|
33
|
+
Classifier: Intended Audience :: Science/Research
|
34
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
35
|
+
Classifier: Topic :: Software Development
|
36
|
+
Classifier: License :: OSI Approved :: MIT License
|
37
|
+
Classifier: Programming Language :: Python
|
38
|
+
Classifier: Programming Language :: Python :: 3
|
39
|
+
Classifier: Programming Language :: Python :: 3.7
|
40
|
+
Classifier: Programming Language :: Python :: 3.8
|
41
|
+
Classifier: Programming Language :: Python :: 3.9
|
42
|
+
Classifier: Programming Language :: Python :: 3.10
|
43
|
+
Classifier: Programming Language :: Python :: 3.11
|
44
|
+
Classifier: Programming Language :: Python :: 3.12
|
45
|
+
Requires-Python: >=3.7
|
46
|
+
Description-Content-Type: text/markdown
|
47
|
+
License-File: LICENSE
|
48
|
+
Requires-Dist: ConfigParser>=5.0.0
|
49
|
+
Requires-Dist: numpy>=1.21.6
|
50
|
+
Requires-Dist: treeswift>=1.1.45
|
51
|
+
Requires-Dist: taxtastic>=0.9.3
|
52
|
+
|
53
|
+
# BSCAMPP - A Scalable Phylogenetic Placement Method and Framework
|
54
|
+
[PyPI project](https://pypi.org/project/bscampp/)
|
55
|
+
[Release history](https://pypi.org/project/bscampp/#history)
|
56
|
+
[GitHub repository](https://github.com/ewedell/BSCAMPP/)
|
57
|
+
[License](https://github.com/ewedell/BSCAMPP/blob/main/LICENSE)
|
58
|
+
[Changelog](https://github.com/ewedell/BSCAMPP/blob/main/CHANGELOG.md)
|
59
|
+
|
60
|
+
**Table of Contents**
|
61
|
+
1. [Overview](#overview)
|
62
|
+
2. [Installation](#installation)
|
63
|
+
3. [Usage](#usage)
|
64
|
+
4. [Example Code and Data](#example-code-and-data)
|
65
|
+
|
66
|
+
# Overview
|
67
|
+
* **Inputs**
|
68
|
+
1. Reference tree to place sequences into.
|
69
|
+
2. Alignment of reference sequences.
|
70
|
+
3. Alignment of query sequences (can be combined with item 2).
|
71
|
+
4. Tree info file.
|
72
|
+
- (EPA-ng as base method), RAxML-ng info file, typically with suffix `.bestModel`.
|
73
|
+
- (pplacer as base method), RAxML-ng or FastTree log file.
|
74
|
+
* **Output**
|
75
|
+
1. Placement results of query sequences in the reference tree in `.jplace` format.
|
76
|
+
|
77
|
+
|
78
|
+
BSCAMPP is a scalable extension of its predecessor, [SCAMPP](https://github.com/chry04/PLUSplacer), for phylogenetic placement.
|
79
|
+
BSCAMPP achieves a speedup of several orders of magnitude compared to the SCAMPP framework.
|
80
|
+
The core algorithm is described in detail at <https://doi.org/10.1101/2022.10.26.513936>.
|
81
|
+
In short, BSCAMPP by default uses EPA-ng as the base placement method, allowing it to scale to placement trees of up to ~200,000 leaves.
|
82
|
+
BSCAMPP achieves this by extracting appropriate subtrees and assigning each query to its most fitting subtree.
|
83
|
+
|
84
|
+
BSCAMPP is essentially a divide-and-conquer framework and can be used with any base placement method (e.g., `pplacer` as well).
|
85
|
+
Currently, BSCAMPP is implemented with `epa-ng` and `pplacer`.
|
86
|
+
|
87
|
+
It is recommended that BSCAMPP be used with subtrees of size 2000 and with 5 votes based on current best results, especially if sequences
|
88
|
+
are fragmentary. Defaults for the subtree size and number of votes are set to 2,000 and 5 respectively (see [Usage](#usage) for more details
|
89
|
+
on customizing BSCAMPP).
|
90
|
+
|
91
|
+
# Installation
|
92
|
+
BSCAMPP was tested on **Python 3.7 to 3.12**. There are two ways to install and use BSCAMPP: (1) with PyPI, or
|
93
|
+
(2) from this GitHub repository. If you have any difficulties installing or running BSCAMPP, please contact Eleanor Wedell
|
94
|
+
(ewedell@illinois.edu).
|
95
|
+
|
96
|
+
### External requirements
|
97
|
+
EPA-ng and/or pplacer are requirements to run BSCAMPP since BSCAMPP will use them as the base phylogenetic placement methods.
|
98
|
+
By default, BSCAMPP will search for binary executables of `pplacer` and `epa-ng` in the user's environment when running for the first time.
|
99
|
+
We also included a compiled version of `pplacer` for the Linux system under `bscampp/tools`.
|
100
|
+
|
101
|
+
### (1) Install with `pip` (Coming soon)
|
102
|
+
The easiest way to install BSCAMPP is to use `pip install`. This will also install all required Python packages.
|
103
|
+
|
104
|
+
```bash
|
105
|
+
# 1. install with pip (--user if no root access)
|
106
|
+
pip install bscampp [--user]
|
107
|
+
|
108
|
+
# 2. Two binary executables will be installed. The first time
|
109
|
+
# either one is run, a config file will be created at
|
110
|
+
# ~/.bscampp/main.config that resolves the links to all
|
111
|
+
# external software (e.g., epa-ng, pplacer)
|
112
|
+
bscampp [-h] # or
|
113
|
+
run_bscampp.py [-h]
|
114
|
+
```
|
115
|
+
|
116
|
+
### (2) Install from GitHub
|
117
|
+
Alternatively, the user can clone this GitHub repository and install the required packages manually.
|
118
|
+
|
119
|
+
#### Requirements
|
120
|
+
```bash
|
121
|
+
python>=3.7
|
122
|
+
ConfigParser>=5.0.0
|
123
|
+
numpy>=1.21.6
|
124
|
+
treeswift>=1.1.45
|
125
|
+
taxtastic>=0.9.3
|
126
|
+
```
|
127
|
+
|
128
|
+
```bash
|
129
|
+
# 1. Clone the GitHub repo
|
130
|
+
git clone https://github.com/ewedell/BSCAMPP.git
|
131
|
+
|
132
|
+
# 2. Install all requirements
|
133
|
+
pip install -r requirements.txt
|
134
|
+
|
135
|
+
# 3. Execute BSCAMPP executable `run_bscampp.py`
|
136
|
+
python run_bscampp.py [-h]
|
137
|
+
```
|
138
|
+
|
139
|
+
# Usage
|
140
|
+
All parameter settings can be found by running
|
141
|
+
```bash
|
142
|
+
run_bscampp.py -h
|
143
|
+
```
|
144
|
+
|
145
|
+
### (1) Default case (`epa-ng`)
|
146
|
+
```bash
|
147
|
+
run_bscampp.py -i [raxml best model] -t [reference tree] -a [alignment file]
|
148
|
+
```
|
149
|
+
This runs BSCAMPP in its default mode with EPA-ng. `[alignment file]` should contain both sequences from the placement tree and
|
150
|
+
the query sequences to be placed. This will create an output directory `bscampp_output` and write the placement results to
|
151
|
+
`bscampp_output/bscampp_result.jplace`.
|
152
|
+
|
153
|
+
### (2) Separately giving query alignment and finer control of outputs
|
154
|
+
```bash
|
155
|
+
run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment] \
|
156
|
+
-q [query sequence alignment] -d [output directory] -o [output name] \
|
157
|
+
--threads [num cpus]
|
158
|
+
```
|
159
|
+
|
160
|
+
### (3) Using `pplacer` as the base placement method
|
161
|
+
```bash
|
162
|
+
run_bscampp.py --placement-method pplacer -i [logfile from either RAxML/FastTree] -t [reference tree] \
|
163
|
+
-a [reference alignment] -q [query sequence alignment]
|
164
|
+
```
|
165
|
+
|
166
|
+
### More comprehensive usage
|
167
|
+
```bash
|
168
|
+
> usage: run_bscampp.py [-h] [-v] [--placement-method {epa-ng,pplacer}] -i
|
169
|
+
> INFO_PATH -t TREE_PATH -a ALN_PATH [-q QALN_PATH]
|
170
|
+
> [-d OUTDIR] [-o OUTNAME] [--threads NUM_CPUS] [-m MODEL]
|
171
|
+
> [-b SUBTREESIZE] [-V VOTES]
|
172
|
+
> [--similarityflag SIMILARITYFLAG] [-n TMPFILENBR]
|
173
|
+
> [--fragmentflag FRAGMENTFLAG] [--keeptemp KEEPTEMP]
|
174
|
+
>
|
175
|
+
> This program runs BSCAMPP, a scalable phylogenetic placement framework that scales EPA-ng/pplacer to very large tree placement.
|
176
|
+
>
|
177
|
+
> options:
|
178
|
+
> -h, --help show this help message and exit
|
179
|
+
> -v, --version show program's version number and exit
|
180
|
+
>
|
181
|
+
> BASIC PARAMETERS:
|
182
|
+
> These are the basic parameters for BSCAMPP.
|
183
|
+
>
|
184
|
+
> --placement-method {epa-ng,pplacer}
|
185
|
+
> The base placement method to use. Default: epa-ng
|
186
|
+
> -i INFO_PATH, --info INFO_PATH, --info-path INFO_PATH
|
187
|
+
> Path to model parameters. E.g., .bestModel from
|
188
|
+
> RAxML/RAxML-ng
|
189
|
+
> -t TREE_PATH, --tree TREE_PATH, --tree-path TREE_PATH
|
190
|
+
> Path to reference tree with estimated branch lengths
|
191
|
+
> -a ALN_PATH, --alignment ALN_PATH, --aln-path ALN_PATH
|
192
|
+
> Path for reference sequence alignment in FASTA format.
|
193
|
+
> Optionally with query sequences. Query alignment can
|
194
|
+
> be specified with --qaln-path
|
195
|
+
> -q QALN_PATH, --qalignment QALN_PATH, --qaln-path QALN_PATH
|
196
|
+
> Optionally provide path to query sequence alignment in
|
197
|
+
> FASTA format. Default: None
|
198
|
+
> -d OUTDIR, --outdir OUTDIR
|
199
|
+
> Directory path for output. Default: bscampp_output/
|
200
|
+
> -o OUTNAME, --output OUTNAME
|
201
|
+
> Output file name. Default: bscampp_result.jplace
|
202
|
+
> --threads NUM_CPUS, --num-cpus NUM_CPUS
|
203
|
+
> Number of cores for parallelization, default: -1 (all)
|
204
|
+
>
|
205
|
+
> ADVANCE PARAMETERS:
|
206
|
+
> These parameters control how BSCAMPP is run. The default values are set based on experiments.
|
207
|
+
>
|
208
|
+
> -m MODEL, --model MODEL
|
209
|
+
> Model used for edge distances. Default: GTR
|
210
|
+
> -b SUBTREESIZE, --subtreesize SUBTREESIZE
|
211
|
+
> Integer size of the subtree. Default: 2000
|
212
|
+
> -V VOTES, --votes VOTES
|
213
|
+
> Number of votes per query sequence. Default: 5
|
214
|
+
> --similarityflag SIMILARITYFLAG
|
215
|
+
> Boolean, True if maximizing sequence similarity
|
216
|
+
> instead of simple Hamming distance (ignoring gap sites
|
217
|
+
> in the query). Default: True
|
218
|
+
>
|
219
|
+
> MISCELLANEOUS PARAMETERS:
|
220
|
+
> -n TMPFILENBR, --tmpfilenbr TMPFILENBR
|
221
|
+
> Temporary file indexing. Default: 0
|
222
|
+
> --fragmentflag FRAGMENTFLAG
|
223
|
+
> If queries contains fragments. Default: True
|
224
|
+
> --keeptemp KEEPTEMP Boolean, True to keep all temporary files. Default:
|
225
|
+
> False
|
226
|
+
```
|
227
|
+
|
228
|
+
|
229
|
+
# Example Code and Data
|
230
|
+
Example script and data are provided in this GitHub repository in `examples/`. The data is originally from the [RNAsim-VS datasets](https://doi.org/10.1093/sysbio/syz063).
|
231
|
+
* `examples/run.sh`: contains a simple script to test BSCAMPP with `epa-ng` or `pplacer`, placing 200 query sequences into a 10000-leaf placement tree.
|
232
|
+
The info file is from RAxML-ng when running `epa-ng`, and from FastTree-2 when running `pplacer`.
|
233
|
+
- `run.sh` will invoke BSCAMPP with `epa-ng`.
|
234
|
+
- `run.sh pplacer` will invoke BSCAMPP with `pplacer`.
|
@@ -0,0 +1,25 @@
|
|
1
|
+
bscampp/__init__.py,sha256=mfEO3icT2L6hUzGTGXngIfvKxymEspmxBYFvbfSqRx0,2289
|
2
|
+
bscampp/configs.py,sha256=XuzRbtcUE5bExe-vEZGZ1CeXBmp4oP7LWFveQySx2xs,5745
|
3
|
+
bscampp/default.config,sha256=CEfsUHBy--vwJhEcUuJ0btfuGQWb_lKMVWUIP9f5YGw,112
|
4
|
+
bscampp/functions.py,sha256=cPT5eSy_8CSNzDx-5ma43Hp9_AMmaWSTXM89bjdrkRs,15640
|
5
|
+
bscampp/init_configs.py,sha256=EA9sMN5jWj6zj2b-7tN19LhX2Ef61ByQLxQRLHAqLDM,3600
|
6
|
+
bscampp/jobs.py,sha256=de0Dr3ynORwACJqVbeWDfqTwJhWvMYG-7yfRYirGx8M,6703
|
7
|
+
bscampp/pipeline.py,sha256=2pQTmBj9LLyuLyCTJgoKvBj19RAzbkC5FuDman5zpD0,9902
|
8
|
+
bscampp/utils.py,sha256=ragaI14Lqb2fVp_uYDkFQnV7a50G9-sUOWdVM-sNhUE,29005
|
9
|
+
bscampp/tools/epa-ng,sha256=f3EVoZAAOXLN6l521qp-TrWDl5J2nqL3tGgjPaQE9WQ,3772096
|
10
|
+
bscampp/tools/pplacer,sha256=p0H4eo9uuiYoWS_kJbPfauOV99i7BXJdZSiwXIuLxTw,7834576
|
11
|
+
bscampp/tools/hamming_distance/CMakeLists.txt,sha256=yf9iq7Y61t3WObJHoR4RoGDEvUw_Q8JW3UnI4uh0cfU,389
|
12
|
+
bscampp/tools/hamming_distance/fragment_hamming,sha256=AYVNIkrFZVG222ePKwsAXpAVD0-GOMJJq3uTpghvtCM,43160
|
13
|
+
bscampp/tools/hamming_distance/hamming,sha256=KF-UP37qXGy2Qy0dmMGjUrfSLpe0qlKDTt0Pt5g0vsc,43160
|
14
|
+
bscampp/tools/hamming_distance/homology,sha256=4-F92hdpBc98FxXBrhq4qdwtEBzOcXmk6EFIDpuk-e0,43152
|
15
|
+
bscampp/tools/hamming_distance/src/fragment_hamming.cpp,sha256=Sh76zMA5L-MOUVOcTRQ7N6Yq_iOhq6rqgAihxQsEnQ8,6197
|
16
|
+
bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp,sha256=xCmyAT-OZJOD-Bk6oKw0seM_4Zn177JvZBlou9Mw6dQ,6286
|
17
|
+
bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp,sha256=eKxgODRlpf0hU84QjNhigvRhWCT9tiJZjA5oQFQ1bUk,7404
|
18
|
+
bscampp/tools/hamming_distance/src/homology.cpp,sha256=ZE0uXZWQ-cN4U1Wk5kUr_KKHgzsgA6Sno-IViRa4tmI,6053
|
19
|
+
bscampp/tools/hamming_distance/src/new_hamming.cpp,sha256=fBRm99RquBZgZjaLOn9xDI3cH9NchhrxKbL-11j8fmk,5342
|
20
|
+
bscampp-1.0.1.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
|
21
|
+
bscampp-1.0.1.dist-info/METADATA,sha256=06T5y13XSerQS_nb4hTk-i6aaIJ8gdCuCvk-hK75nPg,11142
|
22
|
+
bscampp-1.0.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
23
|
+
bscampp-1.0.1.dist-info/entry_points.txt,sha256=dZygBmg2OncVyeLeIjh_9e-GBIOesFvMemyW9BRRcXY,113
|
24
|
+
bscampp-1.0.1.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
|
25
|
+
bscampp-1.0.1.dist-info/RECORD,,
|
@@ -0,0 +1 @@
|
|
1
|
+
bscampp
|