bscampp 1.0.1b0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bscampp/__init__.py CHANGED
@@ -12,7 +12,7 @@ import logging, os
12
12
  # not really needed for BSCAMPP but safe to update here
13
13
  os.sys.setrecursionlimit(1000000)
14
14
 
15
- __version__ = "1.0.1b"
15
+ __version__ = "1.0.2"
16
16
  _INSTALL_PATH = __path__[0]
17
17
 
18
18
  # global variables to store all loggers
bscampp/configs.py CHANGED
@@ -50,6 +50,7 @@ class Configs:
50
50
  # miscellaneous
51
51
  tmpfilenbr = 0
52
52
  fragmentflag = True
53
+ subtreetype = 'd'
53
54
 
54
55
  # check if the given configuration is valid to add
55
56
  def set_valid_configuration(name, conf):
bscampp/functions.py CHANGED
@@ -71,6 +71,9 @@ def getClosestLeaves(aln_path, qaln_path, aln, qaln, workdir, dry_run=False):
71
71
  query_votes_dict = dict()
72
72
  query_top_vote_dict = dict()
73
73
  tmp_output = os.path.join(workdir, 'closest.txt')
74
+
75
+ if Configs.subtreetype == "h":
76
+ Configs.votes = Configs.subtreesize
74
77
 
75
78
  cmd = []
76
79
  if Configs.similarityflag:
@@ -226,6 +229,51 @@ def assignQueriesToSubtrees(query_votes_dict, query_top_vote_dict,
226
229
  _LOG.info('Time to assign queries to subtrees: {} seconds'.format(t1 - t0))
227
230
  return new_subtree_dict, placed_query_list
228
231
 
232
+
233
+ '''
234
+ Function to assign queries to subtrees as used in SCAMPP
235
+ (subtrees are built using the nearest leaf as the seed sequence)
236
+ '''
237
+ def buildQuerySubtrees(query_votes_dict, query_top_vote_dict,
238
+ tree, leaf_dict, dry_run=False):
239
+ t0 = time.perf_counter()
240
+
241
+ if dry_run:
242
+ return dict(), []
243
+
244
+ # (1) go over the query seed sequences to see if any queries use
245
+ # the same seed sequence (i.e. subtree)
246
+ seed_queries = dict()
247
+ for query, closest_leaf in query_top_vote_dict.items():
248
+ if closest_leaf not in seed_queries:
249
+ seed_queries[closest_leaf] = [query]
250
+ else:
251
+ seed_queries[closest_leaf].append(query)
252
+
253
+ new_subtree_dict = dict()
254
+ # build one subtree per seed leaf and assign to it every query
255
+ # whose closest leaf is that seed
256
+ for seed_label, queries in seed_queries.items():
257
+ node_y = leaf_dict[seed_label]
258
+ # extract [subtreesize] leaves
259
+ if Configs.subtreetype == "h":
260
+ labels = query_votes_dict[queries[0]]
261
+ elif Configs.subtreetype == "n":
262
+ labels = utils.subtree_nodes(tree, node_y, Configs.subtreesize)
263
+ else:
264
+ labels = utils.subtree_nodes_with_edge_length(tree, node_y,
265
+ Configs.subtreesize)
266
+ subtree = tree.extract_tree_with(labels)
267
+ new_subtree_dict[subtree] = queries
268
+
269
+
270
+ placed_query_list = []
271
+
272
+ t1 = time.perf_counter()
273
+ _LOG.info('Time to assign queries to subtrees: {} seconds'.format(t1 - t0))
274
+ return new_subtree_dict, placed_query_list
275
+
276
+
229
277
  '''
230
278
  Helper function to run a single placement task. Designed to use with
231
279
  multiprocessing
bscampp/pipeline.py CHANGED
@@ -89,6 +89,79 @@ def bscampp_pipeline(*args, **kwargs):
89
89
  else:
90
90
  return False
91
91
 
92
+
93
+ # main pipeline for SCAMPP
94
+ def scampp_pipeline(*args, **kwargs):
95
+ t0 = time.perf_counter()
96
+ m = Manager(); lock = m.Lock()
97
+
98
+ # set up a dry run if specified
99
+ dry_run = False
100
+ if 'dry_run' in kwargs and isinstance(kwargs['dry_run'], bool):
101
+ dry_run = kwargs['dry_run']
102
+
103
+ # parse command line arguments and build configurations
104
+ parser, cmdline_args = parseArguments(dry_run=dry_run, method="SCAMPP")
105
+
106
+ # initialize multiprocessing (if needed)
107
+ _LOG.warning('Initializing ProcessPoolExecutor...')
108
+ pool = ProcessPoolExecutor(Configs.num_cpus, initializer=initial_pool,
109
+ initargs=(parser, cmdline_args,))
110
+
111
+ # (0) temporary files are written here
112
+ if not dry_run:
113
+ workdir = os.path.join(Configs.outdir, f'tmp{Configs.tmpfilenbr}')
114
+ try:
115
+ if not os.path.isdir(workdir):
116
+ os.makedirs(workdir)
117
+ except OSError:
118
+ log_exception(_LOG)
119
+ else:
120
+ workdir = os.getcwd()
121
+
122
+ # (1) read in tree, alignment, and separate reference sequences from
123
+ # query sequences
124
+ tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir,
125
+ dry_run=dry_run)
126
+
127
+ # (2) compute closest leaves for all query sequences
128
+ query_votes_dict, query_top_vote_dict = getClosestLeaves(
129
+ aln_path, qaln_path, aln, qaln, workdir, dry_run=dry_run)
130
+
131
+ # (3) first assign each query to the subtree built using the closest
132
+ # leaf as the seed sequence
133
+ new_subtree_dict, placed_query_list = buildQuerySubtrees(
134
+ query_votes_dict, query_top_vote_dict, tree, leaf_dict,
135
+ dry_run=dry_run)
136
+
137
+ # (4) perform placement for each subtree
138
+ output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
139
+ placed_query_list, aln, qaln, cmdline_args, workdir, pool, lock,
140
+ dry_run=dry_run)
141
+
142
+ # (5) write the output jplace to local
143
+ writeOutputJplace(output_jplace, dry_run=dry_run)
144
+
145
+ # shutdown pool
146
+ _LOG.warning('Shutting down ProcessPoolExecutor...')
147
+ pool.shutdown()
148
+ _LOG.warning('ProcessPoolExecutor shut down.')
149
+
150
+ # clean up temp files if not keeping
151
+ if not Configs.keeptemp:
152
+ _LOG.info('Removing temporary files...')
153
+ clean_temp_files()
154
+
155
+ # stop SCAMPP
156
+ send = time.perf_counter()
157
+ _LOG.info('SCAMPP completed in {} seconds...'.format(send - t0))
158
+
159
+ if dry_run:
160
+ return True
161
+ else:
162
+ return False
163
+
164
+
92
165
  def clean_temp_files():
93
166
  # all temporary files/directories to remove
94
167
  temp_items = [f'tmp{Configs.tmpfilenbr}']
@@ -102,10 +175,14 @@ def clean_temp_files():
102
175
  continue
103
176
  _LOG.info(f'- Removed {temp}')
104
177
 
105
- def parseArguments(dry_run=False):
178
+ def parseArguments(dry_run=False, method="BSCAMPP"):
106
179
  global _root_dir, main_config_path
107
180
 
108
- parser = _init_parser()
181
+ default_outdir = f"{method.lower()}_output"
182
+ default_outname = f"{method.lower()}_result"
183
+
184
+ parser = _init_parser(default_outdir=default_outdir,
185
+ default_outname=default_outname)
109
186
  cmdline_args = sys.argv[1:]
110
187
 
111
188
  if dry_run:
@@ -114,22 +191,27 @@ def parseArguments(dry_run=False):
114
191
 
115
192
  # build config
116
193
  buildConfigs(parser, cmdline_args)
117
- _LOG.info('BSCAMPP is running with: {}'.format(
194
+ _LOG.info('{} is running with: {}'.format(method,
118
195
  ' '.join(cmdline_args)))
119
196
  getConfigs()
120
197
 
121
198
  return parser, cmdline_args
122
199
 
123
- def _init_parser():
200
+ def _init_parser(default_outdir="bscampp_output",
201
+ default_outname="bscampp_result"):
124
202
  # example usage
125
203
  example_usages = '''Example usages:
126
- > default
127
- %(prog)s -i raxml.info
204
+ > (1) Default
205
+ %(prog)s -i raxml.bestModel -t reference.tre -a alignment.fa
206
+ > (2) Separate alignment file for query sequences
207
+ %(prog)s -i raxml.bestModel -t reference.tre -a reference.fa -q query.fa
208
+ > (3) Use pplacer instead of EPA-ng as base method (need RAxML-ng info or FastTree log file)
209
+ %(prog)s -i fasttree.log -t reference.tre -a alignment.fa --placement-method pplacer
128
210
  '''
129
211
 
130
212
  parser = ArgumentParser(
131
213
  description=(
132
- "This program runs BSCAMPP, a scalable phylogenetic "
214
+ "This program runs BSCAMPP/SCAMPP, a scalable phylogenetic "
133
215
  "placement framework that scales EPA-ng/pplacer "
134
216
  "to very large tree placement."
135
217
  ),
@@ -156,7 +238,7 @@ def _init_parser():
156
238
  # basic group
157
239
  basic_group = parser.add_argument_group(
158
240
  "Basic parameters".upper(),
159
- "These are the basic parameters for BSCAMPP.")
241
+ "These are the basic parameters for BSCAMPP/SCAMPP.")
160
242
  parser.groups['basic_group'] = basic_group
161
243
 
162
244
  basic_group.add_argument('--placement-method', type=str,
@@ -185,10 +267,10 @@ def _init_parser():
185
267
  required=False, default=None)
186
268
  basic_group.add_argument("-d", "--outdir", type=str,
187
269
  help="Directory path for output. Default: bscampp_output/",
188
- required=False, default="bscampp_output")
270
+ required=False, default=default_outdir)
189
271
  basic_group.add_argument("-o", "--output", type=str, dest="outname",
190
272
  help="Output file name. Default: bscampp_result.jplace",
191
- required=False, default="bscampp_result.jplace")
273
+ required=False, default=f"{default_outname}.jplace")
192
274
  basic_group.add_argument("--threads", "--num-cpus", type=int,
193
275
  dest="num_cpus",
194
276
  help="Number of cores for parallelization, default: -1 (all)",
@@ -209,7 +291,8 @@ def _init_parser():
209
291
  help="Integer size of the subtree. Default: 2000",
210
292
  required=False, default=2000)
211
293
  advance_group.add_argument("-V", "--votes", type=int,
212
- help="Number of votes per query sequence. Default: 5",
294
+ help="This is only used for BSCAMPP! Number of votes per "
295
+ "query sequence. Default: 5",
213
296
  required=False, default=5)
214
297
  advance_group.add_argument("--similarityflag", type=str2bool,
215
298
  help="Boolean, True if maximizing sequence similarity "
@@ -228,6 +311,12 @@ def _init_parser():
228
311
  misc_group.add_argument("--fragmentflag", type=str2bool,
229
312
  help="If queries contains fragments. Default: True",
230
313
  required=False, default=True)
314
+ misc_group.add_argument("--subtreetype", type=str,
315
+ help="(SCAMPP only) Options for collecting "
316
+ "nodes for the subtree - d for edge weighted "
317
+ "distances, n for node distances, h for Hamming "
318
+ "distances. Default: d",
319
+ required=False, default='d')
231
320
  misc_group.add_argument("--keeptemp", type=str2bool,
232
321
  help="Boolean, True to keep all temporary files. "
233
322
  "Default: False",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: bscampp
3
- Version: 1.0.1b0
3
+ Version: 1.0.2
4
4
  Summary: BSCAMPP - A Scalable Phylogenetic Placement Tool
5
5
  Author-email: Eleanor Wedell <ewedell2@illinois.edu>, Chengze Shen <chengze5@illinois.edu>
6
6
  License: MIT License
@@ -88,8 +88,12 @@ It is recommended that BSCAMPP be used with subtrees of size 2000 and with 5 vot
88
88
  are fragmentary. Defaults for the subtree size and number of votes are set to 2,000 and 5 respectively (see [Usage](#usage) for more details
89
89
  on customizing BSCAMPP).
90
90
 
91
+ #### SCAMPP
92
+ SCAMPP is also implemented in BSCAMPP. After installation, invoke SCAMPP by running
93
+ `run_scampp.py`, or `scampp` if BSCAMPP was installed from PyPI.
94
+
91
95
  # Installation
92
- BSCAMPP was tested on **Python 3.7 to 3.12**. There are two ways to install and use BSCAMPP: (1) with PyPI, or
96
+ BSCAMPP was tested on **Python 3.8 to 3.12**. There are two ways to install and use BSCAMPP: (1) with PyPI, or
93
97
  (2) from this GitHub repository. If you have any difficulties installing or running BSCAMPP, please contact Eleanor Wedell
94
98
  (ewedell@illinois.edu).
95
99
 
@@ -98,19 +102,25 @@ EPA-ng and/or pplacer are requirements to run BSCAMPP since BSCAMPP will use the
98
102
  By default, BSCAMPP will search for binary executables of `pplacer` and `epa-ng` in the user's environment when running for the first time.
99
103
  We also included a compiled version of `pplacer` for the Linux system under `bscampp/tools`.
100
104
 
101
- ### (1) Install with `pip` (Coming soon)
105
+ ### (1) Install with `pip`
102
106
  The easiest way to install BSCAMPP is to use `pip install`. This will also install all required Python packages.
103
107
 
104
108
  ```bash
105
109
  # 1. install with pip (--user if no root access)
106
110
  pip install bscampp [--user]
107
111
 
108
- # 2. Two binary executables will be installed. The first time
112
+ # 2. Four binary executables will be installed. The first time
109
113
  # running any will create a config file at
110
114
  # ~/.bscampp/main.config that resolves the links to all
111
115
  # external software (e.g., epa-ng, pplacer)
116
+
117
+ # ---- BSCAMPP functions
112
118
  bscampp [-h] # or
113
119
  run_bscampp.py [-h]
120
+
121
+ # ---- SCAMPP functions
122
+ scampp [-h] # or
123
+ run_scampp.py [-h]
114
124
  ```
115
125
 
116
126
  ### (2) Install from GitHub
@@ -160,7 +170,8 @@ run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment
160
170
  ### (3) Using `pplacer` as the base placement method
161
171
  ```bash
162
172
  run_bscampp.py -i [logfile from either RAxML/FastTree] -t [reference tree] \
163
- -a [reference alignment] -q [query sequence alignment]
173
+ -a [reference alignment] -q [query sequence alignment] \
174
+ --placement-method pplacer
164
175
  ```
165
176
 
166
177
  ### More comprehensive usage
@@ -221,14 +232,23 @@ run_bscampp.py -i [logfile from either RAxML/FastTree] -t [reference tree] \
221
232
  > Temporary file indexing. Default: 0
222
233
  > --fragmentflag FRAGMENTFLAG
223
234
  > If queries contains fragments. Default: True
235
+ > --subtreetype SUBTREETYPE
236
+ > (SCAMPP only) Options for collecting nodes for the
237
+ > subtree - d for edge weighted distances, n for node
238
+ > distances, h for Hamming distances. Default: d
224
239
  > --keeptemp KEEPTEMP Boolean, True to keep all temporary files. Default:
225
240
  False
226
241
  ```
227
242
 
228
243
 
229
244
  # Example Code and Data
230
- Example script and data are provided in this GitHub repository in `examples/`. The data is originally from the [RNAsim-VS datasets](https://doi.org/10.1093/sysbio/syz063).
231
- * `examples/run.sh`: contains a simple script to test BSCAMPP with `epa-ng` or `pplacer`, placing 200 query sequences to a 10000-leaf placement tree.
232
- The info file is from RAxML-ng when running `epa-ng`, and from FastTree-2 when running `pplacer`.
233
- - `run.sh` will invoke BSCAMPP with `epa-ng`.
234
- - `run.sh pplacer` will invoke BSCAMPP with `pplacer`.
245
+ Example script and data are provided in this GitHub repository in `examples/`.
246
+ The data is originally from the
247
+ [RNAsim-VS datasets](https://doi.org/10.1093/sysbio/syz063).
248
+ * `examples/run_bscampp.sh`: contains a simple script to test BSCAMPP with
249
+ `epa-ng` or `pplacer`, placing 200 query sequences to a 10000-leaf placement
250
+ tree. The info file is from RAxML-ng when running `epa-ng`, and from
251
+ FastTree-2 when running `pplacer`.
252
+ - `run_bscampp.sh` will invoke BSCAMPP with `epa-ng`.
253
+ - `run_bscampp.sh pplacer` will invoke BSCAMPP with `pplacer`.
254
+ * `examples/run_scampp.sh`: the same test script but running SCAMPP.
@@ -1,10 +1,10 @@
1
- bscampp/__init__.py,sha256=Wnn_Bm543hAgQCd9PmwdT_kFBZzGLDW4dcSeP0iLVTk,2290
2
- bscampp/configs.py,sha256=XuzRbtcUE5bExe-vEZGZ1CeXBmp4oP7LWFveQySx2xs,5745
1
+ bscampp/__init__.py,sha256=2QetcqvH27YCbxcb-pncQRiLppyt80cKZE6qBtoNTNI,2289
2
+ bscampp/configs.py,sha256=3HJHLN2fLV5Tv3TJL95NpOuSXUV6CvqxRqCOM6TpbJQ,5767
3
3
  bscampp/default.config,sha256=CEfsUHBy--vwJhEcUuJ0btfuGQWb_lKMVWUIP9f5YGw,112
4
- bscampp/functions.py,sha256=cPT5eSy_8CSNzDx-5ma43Hp9_AMmaWSTXM89bjdrkRs,15640
4
+ bscampp/functions.py,sha256=Ou-etis4Dw-vW8ZrHESm8zW_ll6CDkCQNQQSZPlAddU,17300
5
5
  bscampp/init_configs.py,sha256=EA9sMN5jWj6zj2b-7tN19LhX2Ef61ByQLxQRLHAqLDM,3600
6
6
  bscampp/jobs.py,sha256=de0Dr3ynORwACJqVbeWDfqTwJhWvMYG-7yfRYirGx8M,6703
7
- bscampp/pipeline.py,sha256=UT8y6ObFZ12q5Vw3731r50k8pLMioFNV4qCy0tz_wuk,9550
7
+ bscampp/pipeline.py,sha256=C6I1vWeA6Rq_spPHy_il1FJA_DomWHUHYHLUUk9SnLk,13024
8
8
  bscampp/utils.py,sha256=ragaI14Lqb2fVp_uYDkFQnV7a50G9-sUOWdVM-sNhUE,29005
9
9
  bscampp/tools/epa-ng,sha256=f3EVoZAAOXLN6l521qp-TrWDl5J2nqL3tGgjPaQE9WQ,3772096
10
10
  bscampp/tools/pplacer,sha256=p0H4eo9uuiYoWS_kJbPfauOV99i7BXJdZSiwXIuLxTw,7834576
@@ -17,9 +17,9 @@ bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp,sha256=xCmyAT-OZJOD
17
17
  bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp,sha256=eKxgODRlpf0hU84QjNhigvRhWCT9tiJZjA5oQFQ1bUk,7404
18
18
  bscampp/tools/hamming_distance/src/homology.cpp,sha256=ZE0uXZWQ-cN4U1Wk5kUr_KKHgzsgA6Sno-IViRa4tmI,6053
19
19
  bscampp/tools/hamming_distance/src/new_hamming.cpp,sha256=fBRm99RquBZgZjaLOn9xDI3cH9NchhrxKbL-11j8fmk,5342
20
- bscampp-1.0.1b0.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
21
- bscampp-1.0.1b0.dist-info/METADATA,sha256=hCpwS1vbd07cuwW7D5AkiO_I_GP-kqk21IH2yxiPUwM,11144
22
- bscampp-1.0.1b0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
23
- bscampp-1.0.1b0.dist-info/entry_points.txt,sha256=dZygBmg2OncVyeLeIjh_9e-GBIOesFvMemyW9BRRcXY,113
24
- bscampp-1.0.1b0.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
25
- bscampp-1.0.1b0.dist-info/RECORD,,
20
+ bscampp-1.0.2.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
21
+ bscampp-1.0.2.dist-info/METADATA,sha256=hVJek64HM-2Bcsou5A4Kl8b_g3-Zu1IaTyNdSMh-1hI,11765
22
+ bscampp-1.0.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
23
+ bscampp-1.0.2.dist-info/entry_points.txt,sha256=4Ft83qHc39tNNpMLgSgFXDHM-vuAB99JtmczCQj5pq8,204
24
+ bscampp-1.0.2.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
25
+ bscampp-1.0.2.dist-info/RECORD,,
@@ -1,3 +1,5 @@
1
1
  [console_scripts]
2
2
  bscampp = bscampp.pipeline:bscampp_pipeline
3
3
  run_bscampp.py = bscampp.pipeline:bscampp_pipeline
4
+ run_scampp.py = bscampp.pipeline:scampp_pipeline
5
+ scampp = bscampp.pipeline:scampp_pipeline