bscampp 1.0.1a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. bscampp-1.0.1a0/CHANGELOG.md +3 -0
  2. bscampp-1.0.1a0/LICENSE +21 -0
  3. bscampp-1.0.1a0/MANIFEST.in +8 -0
  4. bscampp-1.0.1a0/PKG-INFO +229 -0
  5. bscampp-1.0.1a0/README.md +177 -0
  6. bscampp-1.0.1a0/bscampp/__init__.py +68 -0
  7. bscampp-1.0.1a0/bscampp/configs.py +169 -0
  8. bscampp-1.0.1a0/bscampp/default.config +5 -0
  9. bscampp-1.0.1a0/bscampp/functions.py +394 -0
  10. bscampp-1.0.1a0/bscampp/init_configs.py +93 -0
  11. bscampp-1.0.1a0/bscampp/jobs.py +198 -0
  12. bscampp-1.0.1a0/bscampp/pipeline.py +224 -0
  13. bscampp-1.0.1a0/bscampp/tools/epa-ng +0 -0
  14. bscampp-1.0.1a0/bscampp/tools/hamming_distance/CMakeLists.txt +13 -0
  15. bscampp-1.0.1a0/bscampp/tools/hamming_distance/fragment_hamming +0 -0
  16. bscampp-1.0.1a0/bscampp/tools/hamming_distance/hamming +0 -0
  17. bscampp-1.0.1a0/bscampp/tools/hamming_distance/homology +0 -0
  18. bscampp-1.0.1a0/bscampp/tools/hamming_distance/src/fragment_hamming.cpp +180 -0
  19. bscampp-1.0.1a0/bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp +183 -0
  20. bscampp-1.0.1a0/bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp +214 -0
  21. bscampp-1.0.1a0/bscampp/tools/hamming_distance/src/homology.cpp +179 -0
  22. bscampp-1.0.1a0/bscampp/tools/hamming_distance/src/new_hamming.cpp +161 -0
  23. bscampp-1.0.1a0/bscampp/tools/pplacer +0 -0
  24. bscampp-1.0.1a0/bscampp/utils.py +913 -0
  25. bscampp-1.0.1a0/bscampp.egg-info/PKG-INFO +229 -0
  26. bscampp-1.0.1a0/bscampp.egg-info/SOURCES.txt +32 -0
  27. bscampp-1.0.1a0/bscampp.egg-info/dependency_links.txt +1 -0
  28. bscampp-1.0.1a0/bscampp.egg-info/entry_points.txt +3 -0
  29. bscampp-1.0.1a0/bscampp.egg-info/requires.txt +4 -0
  30. bscampp-1.0.1a0/bscampp.egg-info/top_level.txt +1 -0
  31. bscampp-1.0.1a0/pyproject.toml +48 -0
  32. bscampp-1.0.1a0/requirements.txt +4 -0
  33. bscampp-1.0.1a0/run_bscampp.py +5 -0
  34. bscampp-1.0.1a0/setup.cfg +4 -0
@@ -0,0 +1,3 @@
1
+ # BSCAMPP v1.0.1a
2
+ 1. Completed features with both `epa-ng` and `pplacer` support.
3
+ 2. Refactored all code and worked out the PyPI installation for release.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 ewedell
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,8 @@
1
+ include README.md
2
+ include CHANGELOG.md
3
+ include run_bscampp.py
4
+ include bscampp/default.config
5
+ include requirements.txt
6
+ graft bscampp/tools
7
+ prune */__pycache__
8
+ global-exclude *.py[cod]
@@ -0,0 +1,229 @@
1
+ Metadata-Version: 2.2
2
+ Name: bscampp
3
+ Version: 1.0.1a0
4
+ Summary: BSCAMPP - A Scalable Phylogenetic Placement Tool
5
+ Author-email: Eleanor Wedell <ewedell2@illinois.edu>, Chengze Shen <chengze5@illinois.edu>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2022 ewedell
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/ewedell/BSCAMPP
29
+ Project-URL: Changelog, https://github.com/ewedell/BSCAMPP/CHANGELOG.md
30
+ Classifier: Development Status :: 4 - Beta
31
+ Classifier: Operating System :: OS Independent
32
+ Classifier: Intended Audience :: Developers
33
+ Classifier: Intended Audience :: Science/Research
34
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
35
+ Classifier: Topic :: Software Development
36
+ Classifier: License :: OSI Approved :: MIT License
37
+ Classifier: Programming Language :: Python
38
+ Classifier: Programming Language :: Python :: 3
39
+ Classifier: Programming Language :: Python :: 3.7
40
+ Classifier: Programming Language :: Python :: 3.8
41
+ Classifier: Programming Language :: Python :: 3.9
42
+ Classifier: Programming Language :: Python :: 3.10
43
+ Classifier: Programming Language :: Python :: 3.11
44
+ Classifier: Programming Language :: Python :: 3.12
45
+ Requires-Python: >=3.7
46
+ Description-Content-Type: text/markdown
47
+ License-File: LICENSE
48
+ Requires-Dist: ConfigParser>=5.0.0
49
+ Requires-Dist: numpy>=1.21.6
50
+ Requires-Dist: treeswift>=1.1.45
51
+ Requires-Dist: taxtastic>=0.9.3
52
+
53
+ # BSCAMPP - A Scalable Phylogenetic Placement Method and Framework
54
+
55
+ **Table of Contents**
56
+ 1. [Overview](#overview)
57
+ 2. [Installation](#installation)
58
+ 3. [Usage](#usage)
59
+ 4. [Example Code and Data](#example-code-and-data)
60
+
61
+ # Overview
62
+ * **Inputs**
63
+ 1. Reference tree to place sequences into.
64
+ 2. Alignment of reference sequences.
65
+ 3. Alignment of query sequences (can be combined with 2.).
66
+ 4. Tree info file.
67
+ - (EPA-ng as base method), RAxML-ng info file, typically with suffix `.bestModel`.
68
+ - (pplacer as base method), RAxML-ng or FastTree log file.
69
+ * **Output**
70
+ 1. Placement results of query sequences in the reference tree in `.jplace` format.
71
+
72
+
73
+ BSCAMPP is an extension and scalable solution to its previous method [SCAMPP](https://github.com/chry04/PLUSplacer) for phylogenetic placement.
74
+ BSCAMPP achieves some magnitudes of speedup compared to the SCAMPP framework.
75
+ The core algorithm is described in detail at <https://doi.org/10.1101/2022.10.26.513936>.
76
+ In short, BSCAMPP in default uses EPA-ng as the base placement method, allowing it to scale to placement trees of up to ~200,000 leaves.
77
+ BSCAMPP achieves this by extracting appropriate subtrees and assigning each query to its most fitting subtree.
78
+
79
+ BSCAMPP essentially is a divide-and-conquer framework and can be used with any base placement methods (e.g., `pplacer` as well).
80
+ Currently, BSCAMPP is implemented with `epa-ng` and `pplacer`.
81
+
82
+ It is recommended that BSCAMPP be used with subtrees of size 2000 and with 5 votes based on current best results, especially if sequences
83
+ are fragmentary. Defaults for the subtree size and number of votes are set to 2,000 and 5 respectively (see [Usage](#usage) for more details
84
+ on customizing BSCAMPP).
85
+
86
+ # Installation
87
+ BSCAMPP was tested on **Python 3.7 to 3.12**. There are two ways to install and use BSCAMPP: (1) with PyPI, or
88
+ (2) from this GitHub repository. If you have any difficulties installing or running BSCAMPP, please contact Eleanor Wedell
89
+ (ewedell@illinois.edu).
90
+
91
+ ### External requirements
92
+ EPA-ng and/or pplacer are requirements to run BSCAMPP since BSCAMPP will use them as the base phylogenetic placement methods.
93
+ By default, BSCAMPP will search for binary executables of `pplacer` and `epa-ng` in the user's environment when running for the first time.
94
+ We also included a compiled version of `pplacer` for the Linux system under `bscampp/tools`.
95
+
96
+ ### (1) Install with `pip` (Coming soon)
97
+ The easiest way to install BSCAMPP is to use `pip install`. This will also install all required Python packages.
98
+
99
+ ```bash
100
+ # 1. install with pip (--user if no root access)
101
+ pip install bscampp [--user]
102
+
103
+ # 2. Two binary executables will be installed. The first time
104
+ # running any will create a config file at
105
+ # ~/.bscampp/main.config that resolves the links to all
106
+ # external software (e.g., epa-ng, pplacer)
107
+ bscampp [-h] # or
108
+ run_bscampp.py [-h]
109
+ ```
110
+
111
+ ### (2) Install from GitHub
112
+ Alternatively, the user can clone this GitHub repository and install the required packages manually.
113
+
114
+ #### Requirements
115
+ ```bash
116
+ python>=3.7
117
+ ConfigParser>=5.0.0
118
+ numpy>=1.21.6
119
+ treeswift>=1.1.45
120
+ taxtastic>=0.9.3
121
+ ```
122
+
123
+ ```bash
124
+ # 1. Close the GitHub repo
125
+ git clone https://github.com/ewedell/BSCAMPP.git
126
+
127
+ # 2. Install all requirements
128
+ pip install -r requirements.txt
129
+
130
+ # 3. Execute BSCAMPP executable `run_bscampp.py`
131
+ python run_bscampp.py [-h]
132
+ ```
133
+
134
+ # Usage
135
+ All parameter settings can be found by running
136
+ ```bash
137
+ run_bscampp.py -h
138
+ ```
139
+
140
+ ### (1) Default case (`epa-ng`)
141
+ ```bash
142
+ run_bscampp.py -i [raxml best model] -t [reference tree] -a [alignment file]
143
+ ```
144
+ This runs BSCAMPP in its default mode with EPA-ng. `[alignment file]` should contain both sequences from the placement tree and
145
+ the query sequences to be placed. This will create an output directory `bscampp_output` and write the placement results to
146
+ `bscampp_output/bscampp_result.jplace`.
147
+
148
+ ### (2) Separately giving query alignment and finer control of outputs
149
+ ```bash
150
+ run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment] \
151
+ -q [query sequence alignment] -d [output directory] -o [output name] \
152
+ --threads [num cpus]
153
+ ```
154
+
155
+ ### (3) Using `pplacer` as the base placement method
156
+ ```bash
157
+ run_bscampp.py -i [logfile from either RAxML/FastTree] -t [reference tree] \
158
+ -a [reference alignment] -q [query sequence alignment]
159
+ ```
160
+
161
+ ### More comprehensive usage
162
+ ```bash
163
+ > usage: run_bscampp.py [-h] [-v] [--placement-method {epa-ng,pplacer}] -i
164
+ > INFO_PATH -t TREE_PATH -a ALN_PATH [-q QALN_PATH]
165
+ > [-d OUTDIR] [-o OUTNAME] [--threads NUM_CPUS] [-m MODEL]
166
+ > [-b SUBTREESIZE] [-V VOTES]
167
+ > [--similarityflag SIMILARITYFLAG] [-n TMPFILENBR]
168
+ > [--fragmentflag FRAGMENTFLAG] [--keeptemp KEEPTEMP]
169
+ >
170
+ > This program runs BSCAMPP, a scalable phylogenetic placement framework that scales EPA-ng/pplacer to very large tree placement.
171
+ >
172
+ > options:
173
+ > -h, --help show this help message and exit
174
+ > -v, --version show program's version number and exit
175
+ >
176
+ > BASIC PARAMETERS:
177
+ > These are the basic parameters for BSCAMPP.
178
+ >
179
+ > --placement-method {epa-ng,pplacer}
180
+ > The base placement method to use. Default: epa-ng
181
+ > -i INFO_PATH, --info INFO_PATH, --info-path INFO_PATH
182
+ > Path to model parameters. E.g., .bestModel from
183
+ > RAxML/RAxML-ng
184
+ > -t TREE_PATH, --tree TREE_PATH, --tree-path TREE_PATH
185
+ > Path to reference tree with estimated branch lengths
186
+ > -a ALN_PATH, --alignment ALN_PATH, --aln-path ALN_PATH
187
+ > Path for reference sequence alignment in FASTA format.
188
+ > Optionally with query sequences. Query alignment can
189
+ > be specified with --qaln-path
190
+ > -q QALN_PATH, --qalignment QALN_PATH, --qaln-path QALN_PATH
191
+ > Optionally provide path to query sequence alignment in
192
+ > FASTA format. Default: None
193
+ > -d OUTDIR, --outdir OUTDIR
194
+ > Directory path for output. Default: bscampp_output/
195
+ > -o OUTNAME, --output OUTNAME
196
+ > Output file name. Default: bscampp_result.jplace
197
+ > --threads NUM_CPUS, --num-cpus NUM_CPUS
198
+ > Number of cores for parallelization, default: -1 (all)
199
+ >
200
+ > ADVANCE PARAMETERS:
201
+ > These parameters control how BSCAMPP is run. The default values are set based on experiments.
202
+ >
203
+ > -m MODEL, --model MODEL
204
+ > Model used for edge distances. Default: GTR
205
+ > -b SUBTREESIZE, --subtreesize SUBTREESIZE
206
+ > Integer size of the subtree. Default: 2000
207
+ > -V VOTES, --votes VOTES
208
+ > Number of votes per query sequence. Default: 5
209
+ > --similarityflag SIMILARITYFLAG
210
+ > Boolean, True if maximizing sequence similarity
211
+ > instead of simple Hamming distance (ignoring gap sites
212
+ > in the query). Default: True
213
+ >
214
+ > MISCELLANEOUS PARAMETERS:
215
+ > -n TMPFILENBR, --tmpfilenbr TMPFILENBR
216
+ > Temporary file indexing. Default: 0
217
+ > --fragmentflag FRAGMENTFLAG
218
+ > If queries contains fragments. Default: True
219
+ > --keeptemp KEEPTEMP Boolean, True to keep all temporary files. Default:
220
+ False
221
+ ```
222
+
223
+
224
+ # Example Code and Data
225
+ Example script and data are provided in this GitHub repository in `examples/`. The data is originally from the [RNAsim-VS datasets](https://doi.org/10.1093/sysbio/syz063).
226
+ * `examples/run.sh`: contains a simple script to test BSCAMPP with `epa-ng` or `pplacer`, placing 200 query sequences to a 10000-leaf placement tree.
227
+ The info file is from RAxML-ng when running `epa-ng`, and from FastTree-2 when running `pplacer`.
228
+ - `run.sh` will invoke BSCAMPP with `epa-ng`.
229
+ - `run.sh pplacer` will invoke BSCAMPP with `pplacer`.
@@ -0,0 +1,177 @@
1
+ # BSCAMPP - A Scalable Phylogenetic Placement Method and Framework
2
+
3
+ **Table of Contents**
4
+ 1. [Overview](#overview)
5
+ 2. [Installation](#installation)
6
+ 3. [Usage](#usage)
7
+ 4. [Example Code and Data](#example-code-and-data)
8
+
9
+ # Overview
10
+ * **Inputs**
11
+ 1. Reference tree to place sequences into.
12
+ 2. Alignment of reference sequences.
13
+ 3. Alignment of query sequences (can be combined with 2.).
14
+ 4. Tree info file.
15
+ - (EPA-ng as base method), RAxML-ng info file, typically with suffix `.bestModel`.
16
+ - (pplacer as base method), RAxML-ng or FastTree log file.
17
+ * **Output**
18
+ 1. Placement results of query sequences in the reference tree in `.jplace` format.
19
+
20
+
21
+ BSCAMPP is an extension and scalable solution to its previous method [SCAMPP](https://github.com/chry04/PLUSplacer) for phylogenetic placement.
22
+ BSCAMPP achieves some magnitudes of speedup compared to the SCAMPP framework.
23
+ The core algorithm is described in detail at <https://doi.org/10.1101/2022.10.26.513936>.
24
+ In short, BSCAMPP in default uses EPA-ng as the base placement method, allowing it to scale to placement trees of up to ~200,000 leaves.
25
+ BSCAMPP achieves this by extracting appropriate subtrees and assigning each query to its most fitting subtree.
26
+
27
+ BSCAMPP essentially is a divide-and-conquer framework and can be used with any base placement methods (e.g., `pplacer` as well).
28
+ Currently, BSCAMPP is implemented with `epa-ng` and `pplacer`.
29
+
30
+ It is recommended that BSCAMPP be used with subtrees of size 2000 and with 5 votes based on current best results, especially if sequences
31
+ are fragmentary. Defaults for the subtree size and number of votes are set to 2,000 and 5 respectively (see [Usage](#usage) for more details
32
+ on customizing BSCAMPP).
33
+
34
+ # Installation
35
+ BSCAMPP was tested on **Python 3.7 to 3.12**. There are two ways to install and use BSCAMPP: (1) with PyPI, or
36
+ (2) from this GitHub repository. If you have any difficulties installing or running BSCAMPP, please contact Eleanor Wedell
37
+ (ewedell@illinois.edu).
38
+
39
+ ### External requirements
40
+ EPA-ng and/or pplacer are requirements to run BSCAMPP since BSCAMPP will use them as the base phylogenetic placement methods.
41
+ By default, BSCAMPP will search for binary executables of `pplacer` and `epa-ng` in the user's environment when running for the first time.
42
+ We also included a compiled version of `pplacer` for the Linux system under `bscampp/tools`.
43
+
44
+ ### (1) Install with `pip` (Coming soon)
45
+ The easiest way to install BSCAMPP is to use `pip install`. This will also install all required Python packages.
46
+
47
+ ```bash
48
+ # 1. install with pip (--user if no root access)
49
+ pip install bscampp [--user]
50
+
51
+ # 2. Two binary executables will be installed. The first time
52
+ # running any will create a config file at
53
+ # ~/.bscampp/main.config that resolves the links to all
54
+ # external software (e.g., epa-ng, pplacer)
55
+ bscampp [-h] # or
56
+ run_bscampp.py [-h]
57
+ ```
58
+
59
+ ### (2) Install from GitHub
60
+ Alternatively, the user can clone this GitHub repository and install the required packages manually.
61
+
62
+ #### Requirements
63
+ ```bash
64
+ python>=3.7
65
+ ConfigParser>=5.0.0
66
+ numpy>=1.21.6
67
+ treeswift>=1.1.45
68
+ taxtastic>=0.9.3
69
+ ```
70
+
71
+ ```bash
72
+ # 1. Close the GitHub repo
73
+ git clone https://github.com/ewedell/BSCAMPP.git
74
+
75
+ # 2. Install all requirements
76
+ pip install -r requirements.txt
77
+
78
+ # 3. Execute BSCAMPP executable `run_bscampp.py`
79
+ python run_bscampp.py [-h]
80
+ ```
81
+
82
+ # Usage
83
+ All parameter settings can be found by running
84
+ ```bash
85
+ run_bscampp.py -h
86
+ ```
87
+
88
+ ### (1) Default case (`epa-ng`)
89
+ ```bash
90
+ run_bscampp.py -i [raxml best model] -t [reference tree] -a [alignment file]
91
+ ```
92
+ This runs BSCAMPP in its default mode with EPA-ng. `[alignment file]` should contain both sequences from the placement tree and
93
+ the query sequences to be placed. This will create an output directory `bscampp_output` and write the placement results to
94
+ `bscampp_output/bscampp_result.jplace`.
95
+
96
+ ### (2) Separately giving query alignment and finer control of outputs
97
+ ```bash
98
+ run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment] \
99
+ -q [query sequence alignment] -d [output directory] -o [output name] \
100
+ --threads [num cpus]
101
+ ```
102
+
103
+ ### (3) Using `pplacer` as the base placement method
104
+ ```bash
105
+ run_bscampp.py -i [logfile from either RAxML/FastTree] -t [reference tree] \
106
+ -a [reference alignment] -q [query sequence alignment]
107
+ ```
108
+
109
+ ### More comprehensive usage
110
+ ```bash
111
+ > usage: run_bscampp.py [-h] [-v] [--placement-method {epa-ng,pplacer}] -i
112
+ > INFO_PATH -t TREE_PATH -a ALN_PATH [-q QALN_PATH]
113
+ > [-d OUTDIR] [-o OUTNAME] [--threads NUM_CPUS] [-m MODEL]
114
+ > [-b SUBTREESIZE] [-V VOTES]
115
+ > [--similarityflag SIMILARITYFLAG] [-n TMPFILENBR]
116
+ > [--fragmentflag FRAGMENTFLAG] [--keeptemp KEEPTEMP]
117
+ >
118
+ > This program runs BSCAMPP, a scalable phylogenetic placement framework that scales EPA-ng/pplacer to very large tree placement.
119
+ >
120
+ > options:
121
+ > -h, --help show this help message and exit
122
+ > -v, --version show program's version number and exit
123
+ >
124
+ > BASIC PARAMETERS:
125
+ > These are the basic parameters for BSCAMPP.
126
+ >
127
+ > --placement-method {epa-ng,pplacer}
128
+ > The base placement method to use. Default: epa-ng
129
+ > -i INFO_PATH, --info INFO_PATH, --info-path INFO_PATH
130
+ > Path to model parameters. E.g., .bestModel from
131
+ > RAxML/RAxML-ng
132
+ > -t TREE_PATH, --tree TREE_PATH, --tree-path TREE_PATH
133
+ > Path to reference tree with estimated branch lengths
134
+ > -a ALN_PATH, --alignment ALN_PATH, --aln-path ALN_PATH
135
+ > Path for reference sequence alignment in FASTA format.
136
+ > Optionally with query sequences. Query alignment can
137
+ > be specified with --qaln-path
138
+ > -q QALN_PATH, --qalignment QALN_PATH, --qaln-path QALN_PATH
139
+ > Optionally provide path to query sequence alignment in
140
+ > FASTA format. Default: None
141
+ > -d OUTDIR, --outdir OUTDIR
142
+ > Directory path for output. Default: bscampp_output/
143
+ > -o OUTNAME, --output OUTNAME
144
+ > Output file name. Default: bscampp_result.jplace
145
+ > --threads NUM_CPUS, --num-cpus NUM_CPUS
146
+ > Number of cores for parallelization, default: -1 (all)
147
+ >
148
+ > ADVANCE PARAMETERS:
149
+ > These parameters control how BSCAMPP is run. The default values are set based on experiments.
150
+ >
151
+ > -m MODEL, --model MODEL
152
+ > Model used for edge distances. Default: GTR
153
+ > -b SUBTREESIZE, --subtreesize SUBTREESIZE
154
+ > Integer size of the subtree. Default: 2000
155
+ > -V VOTES, --votes VOTES
156
+ > Number of votes per query sequence. Default: 5
157
+ > --similarityflag SIMILARITYFLAG
158
+ > Boolean, True if maximizing sequence similarity
159
+ > instead of simple Hamming distance (ignoring gap sites
160
+ > in the query). Default: True
161
+ >
162
+ > MISCELLANEOUS PARAMETERS:
163
+ > -n TMPFILENBR, --tmpfilenbr TMPFILENBR
164
+ > Temporary file indexing. Default: 0
165
+ > --fragmentflag FRAGMENTFLAG
166
+ > If queries contains fragments. Default: True
167
+ > --keeptemp KEEPTEMP Boolean, True to keep all temporary files. Default:
168
+ False
169
+ ```
170
+
171
+
172
+ # Example Code and Data
173
+ Example script and data are provided in this GitHub repository in `examples/`. The data is originally from the [RNAsim-VS datasets](https://doi.org/10.1093/sysbio/syz063).
174
+ * `examples/run.sh`: contains a simple script to test BSCAMPP with `epa-ng` or `pplacer`, placing 200 query sequences to a 10000-leaf placement tree.
175
+ The info file is from RAxML-ng when running `epa-ng`, and from FastTree-2 when running `pplacer`.
176
+ - `run.sh` will invoke BSCAMPP with `epa-ng`.
177
+ - `run.sh pplacer` will invoke BSCAMPP with `pplacer`.
@@ -0,0 +1,68 @@
1
+ ############################################################
2
+ #
3
+ # Init file for BSCAMPP, using the __init__.py from
4
+ # SEPP as the original template. Current adaption comes
5
+ # from https://github.com/c5shen/TIPP3.git
6
+ #
7
+ ############################################################
8
+ from operator import itemgetter
9
+ import logging, os
10
+
11
+ # update system recursion limit to avoid issues
12
+ # not really needed for BSCAMPP but safe to update here
13
+ os.sys.setrecursionlimit(1000000)
14
+
15
+ __version__ = "1.0.1a"
16
+ _INSTALL_PATH = __path__[0]
17
+
18
+ # global variables to store all loggers
19
+ __set_loggers = set()
20
+
21
+ # obtain the current logging level, default to INFO
22
def get_logging_level(logging_level='info'):
    """Resolve the effective logging level for BSCAMPP loggers.

    The environment variable ``BSCAMPP_LOGGING_LEVEL`` takes precedence over
    the ``logging_level`` argument whenever it is set. Level names are matched
    case-insensitively and surrounding whitespace is ignored.

    Args:
        logging_level: level name such as ``'debug'`` or ``'INFO'``. An
            integer is returned unchanged; any other non-string value
            (e.g. ``None``) falls back to ``logging.INFO``.

    Returns:
        The numeric ``logging`` level; ``logging.INFO`` for unknown names.
    """
    logging_level_map = {
        'DEBUG': logging.DEBUG, 'INFO': logging.INFO,
        'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
        'CRITICAL': logging.CRITICAL,
    }
    # a level set by the user through the environment overrides the argument
    env_level = os.getenv('BSCAMPP_LOGGING_LEVEL')
    requested = env_level if env_level is not None else logging_level

    # accept numeric logging constants as-is (bool is excluded on purpose)
    if isinstance(requested, int) and not isinstance(requested, bool):
        return requested
    # anything that is not a name cannot be looked up: default to INFO
    if not isinstance(requested, str):
        return logging.INFO
    return logging_level_map.get(requested.strip().upper(), logging.INFO)
37
+
38
+ # obtain a logger for a given file
39
def get_logger(name='bscampp', log_path=None, logging_level='info'):
    """Return the logger registered under ``name``, configuring it once.

    The first request for a given name attaches a single handler (a stream
    handler when ``log_path`` is None, otherwise a file handler appending to
    ``log_path``) and records the name in ``__set_loggers``. Later requests
    for the same name return the logger without touching its handlers.
    """
    logger = logging.getLogger(name)

    # already configured: hand it back untouched
    if name in __set_loggers:
        return logger

    level = get_logging_level(logging_level)
    formatter = logging.Formatter(
            "[%(asctime)s] %(filename)s (line %(lineno)d):"
            " %(levelname) 8s: %(message)s",
            datefmt="%H:%M:%S")
    logger.setLevel(level)

    # no path given -> default stream handler, else append to the log file
    handler = (logging.StreamHandler() if log_path is None
               else logging.FileHandler(log_path, mode='a'))
    handler.setLevel(level)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    __set_loggers.add(name)
    return logger
61
+
62
+ # logging exception
63
def log_exception(logger):
    """Log the exception currently being handled and terminate the process.

    Intended to be called from inside an ``except`` block: the formatted
    traceback of the active exception is sent to ``logger.error`` and the
    interpreter then exits with status 1.

    Args:
        logger: object exposing an ``error(msg)`` method.

    Raises:
        SystemExit: always, with exit code 1.
    """
    import sys
    import traceback

    logger.error(traceback.format_exc())
    # sys.exit is always available, unlike the ``exit`` helper that is only
    # injected by the ``site`` module for interactive use
    sys.exit(1)
@@ -0,0 +1,169 @@
1
+ import os, time
2
try:
    import configparser
except ImportError:
    # Legacy interpreters name the module ``ConfigParser``; alias it so the
    # ``configparser.ConfigParser()`` calls below still resolve. (The old
    # module has no ``configparser`` attribute to import from.)
    import ConfigParser as configparser
6
+ from argparse import ArgumentParser, Namespace
7
+ from bscampp.init_configs import init_config_file
8
+ from bscampp import get_logger, log_exception
9
+
10
+ # detect home.path or create if missing
11
+ homepath = os.path.dirname(__file__) + '/home.path'
12
+ _root_dir, main_config_path = init_config_file(homepath)
13
+
14
+ # set valid configparse section names
15
+ valid_config_sections = []
16
+ logging_levels = set(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'])
17
+
18
+ _LOG = get_logger(__name__)
19
+
20
+ '''
21
+ Configuration defined by users and by default values
22
+ '''
23
class Configs:
    """Process-wide configuration holder for a BSCAMPP run.

    All settings are plain class attributes initialised to the defaults
    below; other functions in this module overwrite them with ``setattr``
    from parsed command-line arguments and config-file sections.
    """

    # basic input paths
    info_path = None        # info file for pplacer or EPA-ng
    tree_path = None        # placement tree path
    aln_path = None         # alignment for backbone. Optionally with queries
    qaln_path = None        # (optional) alignment for query.
    outdir = None           # output directory
    outname = None          # output name for the final jplace file
    keeptemp = False        # whether to keep all temporary files
    verbose = 'INFO'        # default verbose level to print
    num_cpus = 1            # number of cores to use for parallelization

    # binaries
    pplacer_path = None
    epang_path = None
    taxit_path = None
    hamming_distance_dir = None

    # placement settings
    placement_method = 'epa-ng'
    model = 'GTR'
    subtreesize = 2000
    votes = 5
    similarityflag = True

    # miscellaneous
    tmpfilenbr = 0
    fragmentflag = True
53
+
54
+ # check if the given configuration is valid to add
55
def set_valid_configuration(name, conf):
    """Copy the options of a config-file section onto ``Configs``.

    Only the section named ``'basic'`` is applied. Within it, an option is
    copied when its value is truthy and ``Configs`` already defines an
    attribute of the same name; everything else is silently ignored. A
    warning is logged and nothing is changed when ``conf`` is not a
    ``Namespace``.
    """
    if not isinstance(conf, Namespace):
        _LOG.warning(
            "Looking for Namespace object from \'{}\' but find {}".format(
                name, type(conf)))
        return

    # basic section defined in main.config is the only one handled here
    if name != 'basic':
        return

    for option, value in vars(conf).items():
        # falsy values (e.g. options left blank) never override a default
        if value and option in Configs.__dict__:
            setattr(Configs, option, value)
72
+
73
+ # valid attribute check for print out
74
def valid_attribute(k, v):
    """Tell whether attribute name ``k`` should be shown in the printout.

    A name qualifies when it is a string that does not start with an
    underscore. The value ``v`` is accepted but not inspected.
    """
    return isinstance(k, str) and not k.startswith('_')
80
+
81
+ # print out current configuration
82
def getConfigs():
    """Print the config file locations and every public ``Configs`` attribute.

    The report is written to stdout in one flushed ``print`` call.
    """
    header = ('\n************ Configurations ************\n'
              f'\thome.path: {homepath}\n'
              f'\tmain.config: {main_config_path}\n\n')
    # one line per attribute that passes the printable-name filter
    body = ''.join(
        f'\tConfigs.{key}: {value}\n'
        for key, value in Configs.__dict__.items()
        if valid_attribute(key, value))
    print(header + body, flush=True)
90
+
91
+ # read in config file if it exists
92
+ def _read_config_file(filename, cparser, opts,
93
+ child_process=False, expand=None):
94
+ config_defaults = []
95
+ with open(filename, 'r') as f:
96
+ cparser.read_file(f)
97
+ if cparser.has_section('commandline'):
98
+ for k, v in cparser.items('commandline'):
99
+ config_defaults.append(f'--{k}')
100
+ config_defaults.append(v)
101
+
102
+ for section in cparser.sections():
103
+ if section == 'commandline':
104
+ continue
105
+ if getattr(opts, section, None):
106
+ section_name_space = getattr(opts, section)
107
+ else:
108
+ section_name_space = Namespace()
109
+ for k, v in cparser.items(section):
110
+ if expand and k == 'path':
111
+ v = os.path.join(expand, v)
112
+ setattr(section_name_space, k, v)
113
+ setattr(opts, section, section_name_space)
114
+ return config_defaults
115
+
116
+ '''
117
+ Build Config class
118
+ '''
119
def buildConfigs(parser, cmdline_args, child_process=False, rerun=False):
    """Populate ``Configs`` from main.config and the command line.

    Options found in the ``commandline`` section of the main config file are
    placed before ``cmdline_args`` when parsing, so values given explicitly
    on the command line take effect last. Parsed arguments whose names match
    a ``Configs`` attribute are copied over directly; the remaining ones are
    passed to ``set_valid_configuration``.

    After loading, the output directory is created, ``Configs.outname`` is
    given a ``.jplace`` suffix if it lacks one, ``Configs.num_cpus`` is
    clamped to the number of available cores, and the binary of the selected
    placement method is checked for existence.

    Args:
        parser: argument parser exposing ``parse_args``.
        cmdline_args: list of command-line tokens.
        child_process: forwarded to ``_read_config_file``.
        rerun: accepted but not used by this function.

    Raises:
        AssertionError: if the binary for the selected placement method is
            not configured or does not exist.
    """
    cparser = configparser.ConfigParser()
    cparser.optionxform = str   # keep option names case-sensitive

    # Parse the raw command line once before the config file is read; the
    # result is discarded. NOTE(review): presumably kept so that -h/--version
    # and argument errors surface first -- confirm before removing.
    parser.parse_args(cmdline_args)

    # TODO: an 'update-configs' command that re-initializes
    # ~/.bscampp/main.config via init_config_file(homepath, rerun=True)
    # and exits was sketched here but is not enabled.

    # first load arguments from main.config
    main_args = Namespace()
    cmdline_main = _read_config_file(main_config_path,
            cparser, main_args, child_process=child_process)

    # merge arguments, in the correct order so things are overridden correctly
    args = parser.parse_args(cmdline_main + cmdline_args,
            namespace=main_args)

    for k, k_attr in vars(args).items():
        if k in Configs.__dict__:
            # valid argument that's defined in the Configs class
            setattr(Configs, k, k_attr)
        else:
            # config-file section (or unknown name): validate before applying
            set_valid_configuration(k, k_attr)

    # create outdir; exist_ok avoids the check-then-create race
    os.makedirs(Configs.outdir, exist_ok=True)

    # append the suffix unless the name already ends with ".jplace"
    if not Configs.outname.lower().endswith('.jplace'):
        Configs.outname += '.jplace'

    # os.cpu_count() may return None when the count cannot be determined
    available_cpus = os.cpu_count() or 1
    if Configs.num_cpus > 0:
        Configs.num_cpus = min(available_cpus, Configs.num_cpus)
    else:
        # non-positive value means "use everything available"
        Configs.num_cpus = available_cpus

    # sanity check for existence of base placement binary path
    binary_paths = {
        'epa-ng': Configs.epang_path,
        'pplacer': Configs.pplacer_path,
    }
    method = Configs.placement_method
    if method in binary_paths:
        binary = binary_paths[method]
        # raised explicitly (not via ``assert``) so the check survives -O,
        # and an unset path reports "not detected" instead of a TypeError
        if not (binary and os.path.exists(binary)):
            raise AssertionError(f'{method} not detected!')
@@ -0,0 +1,5 @@
1
+ [basic]
2
+ pplacer_path =
3
+ epang_path =
4
+ taxit_path =
5
+ hamming_distance_dir =