bscampp 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bscampp/__init__.py CHANGED
@@ -12,7 +12,7 @@ import logging, os
12
12
  # not really needed for BSCAMPP but safe to update here
13
13
  os.sys.setrecursionlimit(1000000)
14
14
 
15
- __version__ = "1.0.2"
15
+ __version__ = "1.0.3"
16
16
  _INSTALL_PATH = __path__[0]
17
17
 
18
18
  # global variables to store all loggers
bscampp/functions.py CHANGED
@@ -237,6 +237,7 @@ Function to assign queries to subtrees as used in SCAMPP
237
237
  def buildQuerySubtrees(query_votes_dict, query_top_vote_dict,
238
238
  tree, leaf_dict, dry_run=False):
239
239
  t0 = time.perf_counter()
240
+ _LOG.info('(SCAMPP) Building query subtree for placement...')
240
241
 
241
242
  if dry_run:
242
243
  return dict(), []
@@ -253,7 +254,13 @@ def buildQuerySubtrees(query_votes_dict, query_top_vote_dict,
253
254
  new_subtree_dict = dict()
254
255
  # assign queries to subtrees, and remove them from the pool
255
256
  # repeat until all queries are assigned
257
+ _total = 0
256
258
  for seed_label, queries in seed_queries.items():
259
+ ####### additional logging for tracking progress
260
+ _total += 1
261
+ if _total % 1000 == 0 or _total == len(seed_queries):
262
+ _LOG.info(f"- Built {_total}/{len(seed_queries)} subtrees")
263
+
257
264
  node_y = leaf_dict[seed_label]
258
265
  # extract [subtreesize] leaves
259
266
  if Configs.subtreetype == "h":
@@ -266,14 +273,12 @@ def buildQuerySubtrees(query_votes_dict, query_top_vote_dict,
266
273
  subtree = tree.extract_tree_with(labels)
267
274
  new_subtree_dict[subtree] = queries
268
275
 
269
-
270
276
  placed_query_list = []
271
-
277
+
272
278
  t1 = time.perf_counter()
273
279
  _LOG.info('Time to assign queries to subtrees: {} seconds'.format(t1 - t0))
274
280
  return new_subtree_dict, placed_query_list
275
281
 
276
-
277
282
  '''
278
283
  Helper function to run a single placement task. Designed to use with
279
284
  multiprocessing
@@ -311,12 +316,16 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
311
316
  if len(query_list) == 0:
312
317
  continue
313
318
  final_subtree_count += 1
319
+
320
+ subtree_dir = os.path.join(workdir, f'subtree_{final_subtree_count}')
321
+ if not os.path.isdir(subtree_dir):
322
+ os.makedirs(subtree_dir)
314
323
 
315
324
  # name all temporary output files
316
- tmp_tree = os.path.join(workdir, 'tree')
317
- tmp_aln = os.path.join(workdir, f'subtree_{final_subtree_count}_aln.fa')
318
- tmp_qaln = os.path.join(workdir, f'subtree_{final_subtree_count}_qaln.fa')
319
- tmp_output = os.path.join(workdir,
325
+ tmp_tree = os.path.join(subtree_dir, 'tree')
326
+ tmp_aln = os.path.join(subtree_dir, f'subtree_{final_subtree_count}_aln.fa')
327
+ tmp_qaln = os.path.join(subtree_dir, f'subtree_{final_subtree_count}_qaln.fa')
328
+ tmp_output = os.path.join(subtree_dir,
320
329
  'subtree_{}_{}.jplace'.format(
321
330
  final_subtree_count, Configs.placement_method))
322
331
 
@@ -340,13 +349,13 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
340
349
  job = EPAngJob(path=Configs.epang_path,
341
350
  info_path=Configs.info_path, tree_path=tmp_tree,
342
351
  aln_path=tmp_aln, qaln_path=tmp_qaln,
343
- outdir=workdir, num_cpus=Configs.num_cpus)
352
+ outdir=subtree_dir, num_cpus=Configs.num_cpus)
344
353
  # for EPA-ng, ensure that outpath name is changed to the one we want
345
- _outpath = job.run()
354
+ _outpath = job.run(logging=f'subtree_{final_subtree_count}')
346
355
  os.system('mv {} {}'.format(_outpath, tmp_output))
347
356
  elif Configs.placement_method == 'pplacer':
348
357
  # build ref_pkg with info and tmp_tree and tmp_aln
349
- refpkg_dir = os.path.join(workdir,
358
+ refpkg_dir = os.path.join(subtree_dir,
350
359
  f'subtree_{final_subtree_count}.refpkg')
351
360
  taxit_job = TaxtasticJob(path=Configs.taxit_path,
352
361
  outdir=refpkg_dir, name=f'subtree_{final_subtree_count}',
@@ -359,7 +368,7 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
359
368
  refpkg_dir=refpkg_dir, model=Configs.model,
360
369
  outpath=tmp_output, num_cpus=Configs.num_cpus,
361
370
  qaln_path=tmp_qaln)
362
- tmp_output = job.run()
371
+ tmp_output = job.run(logging=f'subtree_{final_subtree_count}')
363
372
  else:
364
373
  raise ValueError(
365
374
  f"Placement method {Configs.placement_method} not recognized")
bscampp/jobs.py CHANGED
@@ -3,7 +3,7 @@ from subprocess import Popen
3
3
  from abc import abstractmethod
4
4
 
5
5
  from bscampp import get_logger, log_exception
6
- from bscampp.configs import Configs
6
+ #from bscampp.configs import Configs
7
7
 
8
8
  _LOG = get_logger(__name__)
9
9
 
@@ -25,7 +25,7 @@ class Job(object):
25
25
  return self.pid
26
26
 
27
27
  # run the job with given invocation and raise errors when encountered
28
- def run(self, stdin="", lock=None, logging=False, shell=False):
28
+ def run(self, stdin="", lock=None, logging=None, shell=False):
29
29
  try:
30
30
  cmd, outpath = self.get_invocation()
31
31
  _LOG.debug(f'Running job_type: {self.job_type}, output: {outpath}')
@@ -57,18 +57,22 @@ class Job(object):
57
57
  # logging to local or to PIPE
58
58
  stderr, stdout = '', ''
59
59
  scmd = ' '.join(cmd)
60
- if logging:
60
+ if logging != None:
61
61
  logpath = os.path.join(
62
- os.path.dirname(outpath), 'f{self.job_type}.txt')
62
+ os.path.dirname(outpath),
63
+ f'{logging}_{self.job_type}.txt')
63
64
  outlogging = open(logpath, 'w', 1)
64
65
 
65
66
  # TODO: may need to deal with piping in the future, for now
66
67
  # it is not needed
67
68
  p = Popen(cmd, text=True, bufsize=1,
68
69
  stdin=subprocess.PIPE,
69
- stdout=outlogging, stderr=subprocess.PIPE)
70
+ stdout=outlogging, stderr=outlogging)
70
71
  self.pid = p.pid
71
72
  stdout, stderr = p.communicate(input=stdin)
73
+ # stdout and stderr are both written to outlogging
74
+ # hence, assign them to be empty strings
75
+ stdout, stderr = '', ''
72
76
  outlogging.close()
73
77
  else:
74
78
  p = Popen(cmd, text=True, bufsize=1,
@@ -92,16 +96,22 @@ class Job(object):
92
96
  else:
93
97
  error_msg = ' '.join([f'Error occurred running {self.job_type}.',
94
98
  f'returncode: {self.returncode}'])
99
+ if logging != None:
100
+ logpath = '\nLOGPATH: ' + os.path.join(
101
+ os.path.dirname(outpath),
102
+ f'{logging}_{self.job_type}.txt')
103
+ else:
104
+ logpath = ''
95
105
  if lock:
96
106
  try:
97
107
  lock.acquire()
98
108
  _LOG.error(error_msg + '\nSTDOUT: ' + stdout +
99
- '\nSTDERR: ' + stderr)
109
+ '\nSTDERR: ' + stderr + logpath)
100
110
  finally:
101
111
  lock.release()
102
112
  else:
103
113
  _LOG.error(error_msg + '\nSTDOUT: ' + stdout +
104
- '\nSTDERR: ' + stderr)
114
+ '\nSTDERR: ' + stderr + logpath)
105
115
  exit(1)
106
116
  except Exception:
107
117
  log_exception(_LOG)
@@ -177,7 +187,7 @@ A pplacer job that uses taxtastic refpkg to place sequences
177
187
  class PplacerTaxtasticJob(Job):
178
188
  def __init__(self, **kwargs):
179
189
  Job.__init__(self)
180
- self.job_type = 'pplacer-taxtastic'
190
+ self.job_type = 'pplacer'
181
191
 
182
192
  self.path = ''
183
193
  self.refpkg_dir = ''
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: bscampp
3
- Version: 1.0.2
3
+ Version: 1.0.3
4
4
  Summary: BSCAMPP - A Scalable Phylogenetic Placement Tool
5
5
  Author-email: Eleanor Wedell <ewedell2@illinois.edu>, Chengze Shen <chengze5@illinois.edu>
6
6
  License: MIT License
@@ -50,7 +50,7 @@ Requires-Dist: numpy>=1.21.6
50
50
  Requires-Dist: treeswift>=1.1.45
51
51
  Requires-Dist: taxtastic>=0.9.3
52
52
 
53
- # BSCAMPP - A Scalable Phylogenetic Placement Method and Framework
53
+ # BSCAMPP and SCAMPP - Two Scalable Phylogenetic Placement Methods and Frameworks
54
54
  [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/bscampp)](https://pypi.org/project/bscampp/)
55
55
  [![PyPI - Version](https://img.shields.io/pypi/v/bscampp?color=blue)](https://pypi.org/project/bscampp/#history)
56
56
  [![Build Status](https://img.shields.io/github/actions/workflow/status/ewedell/BSCAMPP/python-package.yml?branch=main&label=build)](https://github.com/ewedell/BSCAMPP/)
@@ -70,40 +70,46 @@ Requires-Dist: taxtastic>=0.9.3
70
70
  3. Alignment of query sequences (can be combined with ii.).
71
71
  4. Tree info file.
72
72
  - (EPA-ng as base method), RAxML-ng info file, typically with suffix `.bestModel`.
73
- - (pplacer as base method), RAxML-ng or FastTree log file.
73
+ - (pplacer as base method), RAxML-ng or FastTree log file containing model parameters.
74
74
  * **Output**
75
75
  1. Placement results of query sequences in the reference tree in `.jplace` format.
76
76
 
77
77
 
78
- BSCAMPP is an extension and scalable solution to its previous method [SCAMPP](https://github.com/chry04/PLUSplacer) for phylogenetic placement.
79
- BSCAMPP achieves some magnitudes of speedup compared to the SCAMPP framework.
78
+ SCAMPP and BSCAMPP are two scalable solutions for phylogenetic placement. SCAMPP is designed more for accuracy
79
+ and BSCAMPP is designed more for speed. BSCAMPP achieves a speedup of some orders of magnitude compared to SCAMPP.
80
80
  The core algorithm is described in detail at <https://doi.org/10.1101/2022.10.26.513936>.
81
- In short, BSCAMPP in default uses EPA-ng as the base placement method, allowing it to scale to placement trees of up to ~200,000 leaves.
82
- BSCAMPP achieves this by extracting appropriate subtrees and assigning each query to its most fitting subtree.
81
+ In short, both frameworks use EPA-ng as the base placement method by default, allowing them to scale to placement trees
82
+ of at least ~200,000 leaves. Our two methods achieve this by extracting appropriate subtrees and assigning each query
83
+ to its most fitting subtree.
83
84
 
84
- BSCAMPP essentially is a divide-and-conquer framework and can be used with any base placement methods (e.g., `pplacer` as well).
85
- Currently, BSCAMPP is implemented with `epa-ng` and `pplacer`.
85
+ They are divide-and-conquer frameworks and can be used with any base placement method (e.g., `pplacer` in addition to EPA-ng).
86
+ Currently, BSCAMPP and SCAMPP are implemented with `epa-ng` and `pplacer`.
86
87
 
87
- It is recommended that BSCAMPP be used with subtrees of size 2000 and with 5 votes based on current best results, especially if sequences
88
- are fragmentary. Defaults for the subtree size and number of votes are set to 2,000 and 5 respectively (see [Usage](#usage) for more details
89
- on customizing BSCAMPP).
88
+ #### BSCAMPP
89
+ It is recommended that BSCAMPP be used with subtrees of size 2000 and with 5 votes based on current best results,
90
+ especially if sequences are fragmentary. Defaults for the subtree size and number of votes are set to 2,000 and
91
+ 5 respectively (see [Usage](#usage) for more details on customizing BSCAMPP).
90
92
 
91
93
  #### SCAMPP
92
- SCAMPP is also implemented in BSCAMPP. The user can invoke SCAMPP by running
93
- `run_scampp.py` or `scampp` (if installed with PyPI) after installation.
94
+ SCAMPP is also implemented in BSCAMPP, originally from <https://github.com/chry04/PLUSplacer>.
95
+ Its default also uses EPA-ng and a subtree size of 2,000.
96
+ The user can invoke SCAMPP by running `run_scampp.py` or `scampp` (if installed with PyPI) after installation.
94
97
 
95
98
  # Installation
96
- BSCAMPP was tested on **Python 3.8 to 3.12**. There are two ways to install and use BSCAMPP: (1) with PyPI, or
97
- (2) from this GitHub repository. If you have any difficulties installing or running BSCAMPP, please contact Eleanor Wedell
98
- (ewedell@illinois.edu).
99
+ BSCAMPP and SCAMPP were tested on **Python 3.8 to 3.12**. There are two ways to install:
100
+ (1) with PyPI, or (2) from this GitHub repository. If you have any difficulties installing or running BSCAMPP or SCAMPP,
101
+ please contact Eleanor Wedell (ewedell2@illinois.edu).
99
102
 
100
103
  ### External requirements
101
- EPA-ng and/or pplacer are requirements to run BSCAMPP since BSCAMPP will use them as the base phylogenetic placement methods.
102
- By default, BSCAMPP will search for binary executables of `pplacer` and `epa-ng` in the user's environment when running for the first time.
103
- We also included a compiled version of `pplacer` for the Linux system under `bscampp/tools`.
104
+ * **Base placement method**:
105
+ EPA-ng and/or pplacer are requirements since BSCAMPP and SCAMPP will use them as the base phylogenetic placement methods.
106
+ By default, the software will search for binary executables of `pplacer` and `epa-ng` in the user's environment when running for the first time.
107
+ We also included a compiled version of `pplacer` for the Linux system under `bscampp/tools`.
108
+ * **C++ OpenMP**:
109
+ We also use C++ with OpenMP to speed up the similarity comparison between sequences; OpenMP is required to run the pre-compiled binaries.
104
110
 
105
111
  ### (1) Install with `pip`
106
- The easiest way to install BSCAMPP is to use `pip install`. This will also install all required Python packages.
112
+ The easiest way to install BSCAMPP and SCAMPP is to use `pip install`. This will also install all required Python packages.
107
113
 
108
114
  ```bash
109
115
  # 1. install with pip (--user if no root access)
@@ -142,22 +148,29 @@ git clone https://github.com/ewedell/BSCAMPP.git
142
148
  # 2. Install all requirements
143
149
  pip install -r requirements.txt
144
150
 
145
- # 3. Execute BSCAMPP executable `run_bscampp.py`
151
+ # 3. Execute BSCAMPP/SCAMPP executables
146
152
  python run_bscampp.py [-h]
153
+ python run_scampp.py [-h]
147
154
  ```
148
155
 
149
156
  # Usage
150
157
  All parameter settings can be found by running
151
158
  ```bash
152
- run_bscampp.py -h
159
+ run_bscampp.py -h #OR
160
+ run_scampp.py -h
153
161
  ```
154
162
 
155
163
  ### (1) Default case (`epa-ng`)
156
164
  ```bash
165
+ # for BSCAMPP
157
166
  run_bscampp.py -i [raxml best model] -t [reference tree] -a [alignment file]
167
+
168
+ # for SCAMPP
169
+ run_scampp.py -i [raxml best model] -t [reference tree] -a [alignment file]
158
170
  ```
159
- To run BSCAMPP in its default mode with EPA-ng. `[alignment file]` should contain both sequences from the placement tree and
160
- the query sequences to be placed. This will create an output directory `bscampp_output` and write the placement results to
171
+ BSCAMPP and SCAMPP in default mode run EPA-ng as the base method. `[alignment file]` should
172
+ contain both sequences from the placement tree and the query sequences to be placed.
173
+ This will create an output directory `bscampp_output` and write the placement results to
161
174
  `bscampp_output/bscampp_result.jplace`.
162
175
 
163
176
  ### (2) Separately giving query alignment and finer control of outputs
@@ -173,6 +186,11 @@ run_bscampp.py -i [logfile from either RAxML/FastTree] -t [reference tree] \
173
186
  -a [reference alignment] -q [query sequence alignment] \
174
187
  --placement-method pplacer
175
188
  ```
189
+ ### (4) Changing the number of votes to 15 for BSCAMPP
190
+ ```bash
191
+ run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment] \
192
+ -q [query sequence alignment] -V 15
193
+ ```
176
194
 
177
195
  ### More comprehensive usage
178
196
  ```bash
@@ -1,9 +1,9 @@
1
- bscampp/__init__.py,sha256=2QetcqvH27YCbxcb-pncQRiLppyt80cKZE6qBtoNTNI,2289
1
+ bscampp/__init__.py,sha256=toGV8EzvMKviV7xHahhXs0K6fAmHw2cnWb6EDscpIOY,2289
2
2
  bscampp/configs.py,sha256=3HJHLN2fLV5Tv3TJL95NpOuSXUV6CvqxRqCOM6TpbJQ,5767
3
3
  bscampp/default.config,sha256=CEfsUHBy--vwJhEcUuJ0btfuGQWb_lKMVWUIP9f5YGw,112
4
- bscampp/functions.py,sha256=Ou-etis4Dw-vW8ZrHESm8zW_ll6CDkCQNQQSZPlAddU,17300
4
+ bscampp/functions.py,sha256=QYI5RsUEMGc6jLPzFdInpmxA8wiYyN7785P3WxWYiTo,17839
5
5
  bscampp/init_configs.py,sha256=EA9sMN5jWj6zj2b-7tN19LhX2Ef61ByQLxQRLHAqLDM,3600
6
- bscampp/jobs.py,sha256=de0Dr3ynORwACJqVbeWDfqTwJhWvMYG-7yfRYirGx8M,6703
6
+ bscampp/jobs.py,sha256=PrVMJBabi4cYlrxVLo37XPOY82fY0zZ8Iyp9CWCNWhU,7181
7
7
  bscampp/pipeline.py,sha256=C6I1vWeA6Rq_spPHy_il1FJA_DomWHUHYHLUUk9SnLk,13024
8
8
  bscampp/utils.py,sha256=ragaI14Lqb2fVp_uYDkFQnV7a50G9-sUOWdVM-sNhUE,29005
9
9
  bscampp/tools/epa-ng,sha256=f3EVoZAAOXLN6l521qp-TrWDl5J2nqL3tGgjPaQE9WQ,3772096
@@ -17,9 +17,9 @@ bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp,sha256=xCmyAT-OZJOD
17
17
  bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp,sha256=eKxgODRlpf0hU84QjNhigvRhWCT9tiJZjA5oQFQ1bUk,7404
18
18
  bscampp/tools/hamming_distance/src/homology.cpp,sha256=ZE0uXZWQ-cN4U1Wk5kUr_KKHgzsgA6Sno-IViRa4tmI,6053
19
19
  bscampp/tools/hamming_distance/src/new_hamming.cpp,sha256=fBRm99RquBZgZjaLOn9xDI3cH9NchhrxKbL-11j8fmk,5342
20
- bscampp-1.0.2.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
21
- bscampp-1.0.2.dist-info/METADATA,sha256=hVJek64HM-2Bcsou5A4Kl8b_g3-Zu1IaTyNdSMh-1hI,11765
22
- bscampp-1.0.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
23
- bscampp-1.0.2.dist-info/entry_points.txt,sha256=4Ft83qHc39tNNpMLgSgFXDHM-vuAB99JtmczCQj5pq8,204
24
- bscampp-1.0.2.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
25
- bscampp-1.0.2.dist-info/RECORD,,
20
+ bscampp-1.0.3.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
21
+ bscampp-1.0.3.dist-info/METADATA,sha256=01Vl-oCadCIiWFBLA564CLNErXILqEzdRrQNPpGy_mc,12507
22
+ bscampp-1.0.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
23
+ bscampp-1.0.3.dist-info/entry_points.txt,sha256=4Ft83qHc39tNNpMLgSgFXDHM-vuAB99JtmczCQj5pq8,204
24
+ bscampp-1.0.3.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
25
+ bscampp-1.0.3.dist-info/RECORD,,