bscampp 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,93 @@
1
+ import os, sys, shutil
2
+ try:
3
+ import configparser
4
+ except ImportError:
5
+ import ConfigParser as configparser
6
+ from argparse import ArgumentParser, Namespace
7
+ from platform import platform
8
+
9
def find_main_config(homepath):
    """Locate main.config through the directory recorded in ``homepath``.

    The file at ``homepath`` is read in full and its stripped content is
    taken as the root directory. ``main.config`` is then looked up directly
    inside that directory.

    Args:
        homepath: path to the text file that stores the root directory.

    Returns:
        ``(root_dir, main_config_path)`` when main.config exists there,
        otherwise ``(None, None)``.
    """
    with open(homepath, 'r') as handle:
        root = handle.read().strip()

    candidate = os.path.join(root, 'main.config')
    if not os.path.exists(candidate):
        return None, None
    return root, candidate
17
+
18
+ '''
19
+ Initialize the config file at ~/.bscampp/main.config
20
+ By default will prioritize existing software from the user environment
21
+ '''
22
def init_config_file(homepath, rerun=False, prioritize_user_software=True):
    """Create (or reuse) the main configuration file under ~/.bscampp.

    When ``rerun`` is False and ``homepath`` points at a directory that
    already holds a main.config, that existing pair is returned untouched.
    Otherwise main.config is (re)generated from the ``default.config`` that
    sits next to this module, and ``homepath`` is rewritten to point at the
    root directory.

    Args:
        homepath: path of the text file recording the root directory.
        rerun: when True, skip the lookup and always regenerate.
        prioritize_user_software: when True, binaries found on the user's
            PATH (pplacer, epa-ng, taxit) override the bundled tool paths.

    Returns:
        ``(root_dir, main_config_path)``.

    Raises:
        AssertionError: if the packaged default.config is missing.
    """
    if not rerun:
        # make sure home.path exists
        if not os.path.exists(homepath):
            print(f'Cannot find home.path: {homepath}, regenerating...')
        else:
            _root_dir, main_config_path = find_main_config(homepath)
            if _root_dir is not None:
                return _root_dir, main_config_path
            print('Cannot find main.config, regenerating...')

    _root_dir = os.path.expanduser('~/.bscampp')
    main_config_path = os.path.join(_root_dir, 'main.config')
    print(f'Initializing the config file at: {main_config_path}')

    # write to local home.path and _root_dir
    os.makedirs(_root_dir, exist_ok=True)
    with open(homepath, 'w') as f:
        f.write(_root_dir)

    # create main.config based on the default.config at this file's location
    _config_path = os.path.join(os.path.dirname(__file__), 'default.config')
    cparser = configparser.ConfigParser()
    # keep option names case-sensitive (the default lower-cases them)
    cparser.optionxform = str
    # Explicit raise instead of `assert`, so the check survives `python -O`;
    # the exception type is kept for callers that already handle it.
    if not os.path.exists(_config_path):
        raise AssertionError(
            'default config file missing! Please redownload from GitHub\n')

    if os.path.exists(main_config_path):
        print(f'Main configuration file {main_config_path} exists...')
        print('Overwriting the existing config file...')
    print('\n')

    with open(_config_path, 'r') as f:
        cparser.read_file(f)

    # check platform, e.g., macOS or linux, etc.
    platform_name = platform()
    print(f'System is: {platform_name}')

    tools_dir = os.path.join(os.path.dirname(__file__), 'tools')

    # default path to all potential binaries
    cparser.set('basic', 'pplacer_path',
                os.path.join(tools_dir, 'pplacer'))
    cparser.set('basic', 'epang_path',
                os.path.join(tools_dir, 'epa-ng'))
    cparser.set('basic', 'hamming_distance_dir',
                os.path.join(tools_dir, 'hamming_distance'))

    # macOS TODO: need to recompile the binaries
    if 'macos' in platform_name.lower():
        cparser.set('basic', 'hamming_distance_dir',
                    os.path.join(tools_dir, 'macOS', 'hamming_distance'))

    # prioritize user's software
    if prioritize_user_software:
        print('Detecting existing software from user\'s environment...')
        for software in ('pplacer', 'epa-ng', 'taxit'):
            # option names carry no dash, e.g. 'epa-ng' -> 'epang_path'
            sname = software.replace('-', '')
            software_path = shutil.which(software)
            if software_path:
                print('\t{}: {}'.format(software, software_path))
                cparser.set('basic', f'{sname}_path', software_path)

    with open(main_config_path, 'w') as f:
        cparser.write(f)
    print(f'\n(Done) main.config was written to: {main_config_path}')
    print(f'If you want to make changes, please directly edit {main_config_path}')
    return _root_dir, main_config_path
bscampp/jobs.py ADDED
@@ -0,0 +1,198 @@
1
+ import os, shutil, subprocess, stat, re, traceback, shlex
2
+ from subprocess import Popen
3
+ from abc import abstractmethod
4
+
5
+ from bscampp import get_logger, log_exception
6
+ from bscampp.configs import Configs
7
+
8
+ _LOG = get_logger(__name__)
9
+
10
+ '''
11
+ Template class Job for running external software/jobs
12
+ '''
13
class Job(object):
    """Template for running one external program as a subprocess.

    Subclasses implement ``get_invocation()`` to supply the command line and
    the output path; ``run()`` launches it, records the pid and return code,
    and reports the outcome through the module logger.
    """

    def __init__(self):
        self.job_type = ""            # short label used in log messages
        self.errors = []
        self.b_ignore_error = False
        self.pid = -1                 # pid of the launched process, -1 before run
        self.returncode = 0           # return code of the last run

    def __call__(self):
        return self.run()

    def get_pid(self):
        """Return the pid of the most recently launched process (-1 if none)."""
        return self.pid

    @staticmethod
    def _resolve_binary(cmd):
        """Return the element of ``cmd`` that names the program to check.

        For interpreter-style invocations the first field is the interpreter,
        so the checked path is the third field for ``java`` and the second
        field for ``python``/``python3``.
        """
        binpath = cmd[0]
        if binpath == 'java':
            return cmd[2]
        if binpath in ('python', 'python3'):
            return cmd[1]
        return binpath

    def _log_path(self, outpath):
        """Return the stdout log file path, placed next to ``outpath``."""
        return os.path.join(os.path.dirname(outpath), f'{self.job_type}.txt')

    @staticmethod
    def _emit(lock, log_fn, message):
        """Call ``log_fn(message)``, holding ``lock`` when one is given."""
        if lock:
            # acquire outside the try so a failed acquire is not followed by
            # a release of a lock that was never held
            lock.acquire()
            try:
                log_fn(message)
            finally:
                lock.release()
        else:
            log_fn(message)

    def run(self, stdin="", lock=None, logging=False, shell=False):
        """Run the job's command and report the outcome.

        Args:
            stdin: text sent to the process's standard input.
            lock: optional lock held while writing log messages.
            logging: when True, the process's stdout is written to
                ``<job_type>.txt`` next to the output path instead of being
                captured.
            shell: accepted for interface compatibility; currently unused.

        Returns:
            The output path reported by ``get_invocation()`` on success.

        Raises:
            SystemExit: with code 1 when the process returns non-zero.

        Any other exception raised here is handed to ``log_exception``.
        """
        try:
            cmd, outpath = self.get_invocation()
            _LOG.debug(f'Running job_type: {self.job_type}, output: {outpath}')

            # failsafe for NotImplemented jobs
            if len(cmd) == 0:
                raise ValueError(
                    f'{self.job_type} does not have a valid run command. '
                    'It might be due to (invalid input type, etc.).')

            # Explicit raise instead of `assert` so the check is not stripped
            # under `python -O`. 'gzip' is exempt and resolved through PATH.
            binpath = self._resolve_binary(cmd)
            if not (os.path.exists(binpath) or binpath == 'gzip'):
                raise FileNotFoundError(
                    'executable for %s does not exist: %s' %
                    (self.job_type, binpath))

            _LOG.debug('Arguments: %s', ' '.join(
                (str(x) if x is not None else '?NoneType?' for x in cmd)))

            # logging to local or to PIPE
            # TODO: may need to deal with piping in the future, for now
            # it is not needed
            if logging:
                # `with` closes the log file even when Popen itself fails
                with open(self._log_path(outpath), 'w', 1) as outlogging:
                    p = Popen(cmd, text=True, bufsize=1,
                              stdin=subprocess.PIPE,
                              stdout=outlogging, stderr=subprocess.PIPE)
                    self.pid = p.pid
                    stdout, stderr = p.communicate(input=stdin)
            else:
                p = Popen(cmd, text=True, bufsize=1,
                          stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                self.pid = p.pid
                stdout, stderr = p.communicate(input=stdin)
            self.returncode = p.returncode

            # communicate() yields None for a stream that was not piped
            # (stdout when logging to a file); normalize before concatenating
            stdout = stdout or ''
            stderr = stderr or ''

            # successful run and write to log
            if self.returncode == 0:
                self._emit(lock, _LOG.debug,
                           f'{self.job_type} completed, output: {outpath}')
                return outpath

            error_msg = ' '.join([f'Error occurred running {self.job_type}.',
                                  f'returncode: {self.returncode}'])
            self._emit(lock, _LOG.error,
                       error_msg + '\nSTDOUT: ' + stdout +
                       '\nSTDERR: ' + stderr)
            # SystemExit is not an Exception subclass, so it is not caught
            # by the handler below
            raise SystemExit(1)
        except Exception:
            log_exception(_LOG)

    # implemented in subclass
    # return: (cmd, outpath)
    @abstractmethod
    def get_invocation(self):
        raise NotImplementedError(
            'get_invocation() should be implemented by subclasses.')
115
+
116
+ '''
117
+ An EPA-ng job that runs EPA-ng with given parameters
118
+ '''
119
class EPAngJob(Job):
    """Job that builds the EPA-ng command line from its attributes.

    Every attribute can be overridden through keyword arguments to the
    constructor.
    """

    def __init__(self, **kwargs):
        Job.__init__(self)
        self.job_type = 'epa-ng'

        # defaults first, then caller-supplied overrides on top
        settings = {
            'path': '',
            'info_path': '',
            'tree_path': '',
            'aln_path': '',
            'qaln_path': '',
            'outdir': '',
            'num_cpus': 1,
        }
        settings.update(kwargs)
        for attr, value in settings.items():
            setattr(self, attr, value)

    def get_invocation(self):
        """Return ``(cmd, outpath)``; outpath is epa_result.jplace in outdir."""
        self.outpath = os.path.join(self.outdir, 'epa_result.jplace')
        cmd = [self.path]
        cmd += ['-m', self.info_path]
        cmd += ['-t', self.tree_path]
        cmd += ['-s', self.aln_path]
        cmd += ['-q', self.qaln_path]
        cmd += ['-w', self.outdir]
        cmd += ['-T', str(self.num_cpus)]
        cmd.append('--redo')
        return cmd, self.outpath
143
+
144
+ '''
145
+ A taxtastic job that creates a refpkg based on given parameters
146
+ '''
147
class TaxtasticJob(Job):
    """Job that builds a ``taxit create`` command line from its attributes.

    Every attribute can be overridden through keyword arguments to the
    constructor.
    """

    def __init__(self, **kwargs):
        Job.__init__(self)
        self.job_type = 'taxit'

        # defaults first, then caller-supplied overrides on top
        settings = {
            'path': '',
            'outdir': '',
            'name': '',
            'tree_path': '',
            'aln_path': '',
            'info_path': '',
        }
        settings.update(kwargs)
        for attr, value in settings.items():
            setattr(self, attr, value)

    def get_invocation(self):
        """Return ``(cmd, outpath)``; outpath is the output directory."""
        self.outpath = os.path.join(self.outdir)
        # check which model file is provided: a path containing 'bestModel'
        # is passed as --model-file, anything else as --tree-stats
        if 'bestModel' in self.info_path:
            model_flag = '--model-file'
        else:
            model_flag = '--tree-stats'
        cmd = [
            self.path, 'create',
            '-P', self.outdir,
            '-l', self.name,
            '--aln-fasta', self.aln_path,
            '--tree-file', self.tree_path,
            model_flag, self.info_path,
        ]
        return cmd, self.outpath
173
+
174
+ '''
175
+ A pplacer job that uses taxtastic refpkg to place sequences
176
+ '''
177
class PplacerTaxtasticJob(Job):
    """Job that builds a pplacer command line using a refpkg directory.

    Every attribute can be overridden through keyword arguments to the
    constructor.
    """

    def __init__(self, **kwargs):
        Job.__init__(self)
        self.job_type = 'pplacer-taxtastic'

        # defaults first, then caller-supplied overrides on top
        settings = {
            'path': '',
            'refpkg_dir': '',
            'qaln_path': '',
            'outdir': '',
            'outpath': '',
            'model': 'GTR',
            'num_cpus': 1,
        }
        settings.update(kwargs)
        for attr, value in settings.items():
            setattr(self, attr, value)

    def get_invocation(self):
        """Return ``(cmd, outpath)``; outpath is taken as-is from the attribute."""
        cmd = [self.path]
        cmd += ['-m', self.model]
        cmd += ['-c', self.refpkg_dir]
        cmd += ['-o', self.outpath]
        cmd += ['-j', str(self.num_cpus)]
        cmd.append(self.qaln_path)
        return cmd, self.outpath
bscampp/pipeline.py ADDED
@@ -0,0 +1,249 @@
1
+ import json, time, sys, os, shutil
2
+ from argparse import ArgumentParser, Namespace, RawDescriptionHelpFormatter
3
+ import argparse
4
+
5
+ from bscampp import get_logger, log_exception, __version__
6
+ from bscampp.configs import *
7
+ from bscampp.functions import *
8
+ import bscampp.utils as utils
9
+
10
+ from multiprocessing import Manager
11
+ from concurrent.futures import ProcessPoolExecutor
12
+
13
+ _LOG = get_logger(__name__)
14
+
15
+ # process pool initializer
16
def initial_pool(parser, cmdline_args):
    """Initializer run in each worker of the process pool.

    Rebuilds the configuration from the same parser and arguments as the
    parent, flagged as a child process to avoid redundant logging.
    """
    buildConfigs(parser, cmdline_args, child_process=True)
19
+
20
+ # main pipeline for BSCAMPP
21
def bscampp_pipeline(*args, **kwargs):
    """Run the BSCAMPP pipeline from argument parsing to the output jplace.

    Keyword Args:
        dry_run: honoured only when it is a ``bool``; any other value (or
            its absence) means a normal run.

    Returns:
        True for a dry run, False otherwise.
    """
    start_time = time.perf_counter()
    manager = Manager()
    lock = manager.Lock()

    # set up a dry run if specified (non-bool values are ignored)
    requested = kwargs.get('dry_run')
    dry_run = requested if isinstance(requested, bool) else False

    # parse command line arguments and build configurations
    parser, cmdline_args = parseArguments(dry_run=dry_run)

    # initialize multiprocessing (if needed)
    _LOG.warning('Initializing ProcessPoolExecutor...')
    pool = ProcessPoolExecutor(
        Configs.num_cpus,
        initializer=initial_pool,
        initargs=(parser, cmdline_args,))

    # (0) temporary files are written here
    if dry_run:
        workdir = os.getcwd()
    else:
        workdir = os.path.join(Configs.outdir, f'tmp{Configs.tmpfilenbr}')
        try:
            if not os.path.isdir(workdir):
                os.makedirs(workdir)
        except OSError:
            log_exception(_LOG)

    # (1) read in tree, alignment, and separate reference sequences from
    #     query sequences
    data = readData(workdir, dry_run=dry_run)
    tree, leaf_dict, aln_path, aln, qaln_path, qaln = data

    # (2) compute closest leaves for all query sequences
    query_votes_dict, query_top_vote_dict = getClosestLeaves(
        aln_path, qaln_path, aln, qaln, workdir, dry_run=dry_run)

    # (3) first assign all queries to their closest-leaf subtrees,
    #     then do reassignment to minimize distance between each's top vote
    #     and the subtree's seed leaf
    new_subtree_dict, placed_query_list = assignQueriesToSubtrees(
        query_votes_dict, query_top_vote_dict, tree, leaf_dict,
        dry_run=dry_run)

    # (4) perform placement for each subtree
    output_jplace = placeQueriesToSubtrees(
        tree, leaf_dict, new_subtree_dict, placed_query_list,
        aln, qaln, cmdline_args, workdir, pool, lock,
        dry_run=dry_run)

    # (5) write the output jplace to local
    writeOutputJplace(output_jplace, dry_run=dry_run)

    # shutdown pool
    _LOG.warning('Shutting down ProcessPoolExecutor...')
    pool.shutdown()
    _LOG.warning('ProcessPoolExecutor shut down.')

    # clean up temp files if not keeping
    if not Configs.keeptemp:
        _LOG.info('Removing temporary files...')
        clean_temp_files()

    # stop BSCAMPP
    elapsed = time.perf_counter() - start_time
    _LOG.info('BSCAMPP completed in {} seconds...'.format(elapsed))

    return dry_run
91
+
92
def clean_temp_files():
    """Delete the run's temporary items from the configured output directory.

    Each item is removed as a file or as a directory tree, whichever it is;
    items that do not exist are skipped without logging.
    """
    # all temporary files/directories to remove
    for name in (f'tmp{Configs.tmpfilenbr}',):
        target = os.path.join(Configs.outdir, name)
        if os.path.isfile(target):
            remover = os.remove
        elif os.path.isdir(target):
            remover = shutil.rmtree
        else:
            continue
        remover(target)
        _LOG.info(f'- Removed {name}')
104
+
105
def parseArguments(dry_run=False):
    """Build the parser, load the configuration, and log the arguments used.

    Args:
        dry_run: when True, a fixed set of dummy input arguments is used
            instead of the process's command line.

    Returns:
        ``(parser, cmdline_args)``.
    """
    global _root_dir, main_config_path

    parser = _init_parser()
    if dry_run:
        cmdline_args = ['-i', 'dummy.info', '-t', 'dummy.tre',
                        '-a', 'dummy.fa']
    else:
        cmdline_args = sys.argv[1:]

    # build config
    buildConfigs(parser, cmdline_args)
    joined_args = ' '.join(cmdline_args)
    _LOG.info('BSCAMPP is running with: {}'.format(joined_args))
    getConfigs()

    return parser, cmdline_args
122
+
123
def _init_parser():
    """Build the command-line parser for BSCAMPP.

    Options are declared in three argument groups (basic, advance,
    miscellaneous). The groups are also stored on the parser as
    ``parser.groups`` keyed by 'basic_group', 'advance_group' and
    'misc_group'.

    Returns:
        The configured ``ArgumentParser``.
    """
    # example usage
    # NOTE(review): whitespace inside this literal is reproduced as it
    # appears in the reviewed text -- confirm against the packaged file.
    example_usages = '''Example usages:
> (1) Default
%(prog)s -i raxml.bestModel -t reference.tre -a alignment.fa
> (2) Separate alignment file for query sequences
%(prog)s -i raxml.bestModel -t reference.tre -a reference.fa -q query.fa
> (3) Use pplacer instead of EPA-ng as base method (need RAxML-ng info or FastTree log file)
%(prog)s -i fasttree.log -t reference.tre -a alignment.fa --placement-method pplacer
'''

    parser = ArgumentParser(
        description=(
            "This program runs BSCAMPP, a scalable phylogenetic "
            "placement framework that scales EPA-ng/pplacer "
            "to very large tree placement."
        ),
        conflict_handler='resolve',
        epilog=example_usages,
        formatter_class=utils.SmartHelpFormatter,
    )
    parser.add_argument('-v', '--version', action='version',
                        version="%(prog)s " + __version__)
    parser.groups = dict()

    # Single switch for the mandatory inputs; kept as a variable so a future
    # 'update-configs' subcommand could relax it without touching each option.
    required = True

    # (group key, add_argument_group() positionals, [(flags, kwargs), ...])
    group_specs = [
        ('basic_group',
         ("Basic parameters".upper(),
          "These are the basic parameters for BSCAMPP."),
         [
             (('--placement-method',),
              dict(type=str,
                   help='The base placement method to use. Default: epa-ng',
                   choices=['epa-ng', 'pplacer'], default='epa-ng',
                   required=False)),
             (("-i", "--info", "--info-path"),
              dict(type=str, dest="info_path",
                   help=("Path to model parameters. E.g., .bestModel "
                         "from RAxML/RAxML-ng"),
                   required=required, default=None)),
             (("-t", "--tree", "--tree-path"),
              dict(type=str, dest="tree_path",
                   help="Path to reference tree with estimated branch lengths",
                   required=required, default=None)),
             (("-a", "--alignment", "--aln-path"),
              dict(type=str, dest="aln_path",
                   help=("Path for reference sequence alignment in "
                         "FASTA format. Optionally with query sequences. "
                         "Query alignment can be specified with --qaln-path"),
                   required=required, default=None)),
             (("-q", "--qalignment", "--qaln-path"),
              dict(type=str, dest="qaln_path",
                   help=("Optionally provide path to query sequence alignment "
                         "in FASTA format. Default: None"),
                   required=False, default=None)),
             (("-d", "--outdir"),
              dict(type=str,
                   help="Directory path for output. Default: bscampp_output/",
                   required=False, default="bscampp_output")),
             (("-o", "--output"),
              dict(type=str, dest="outname",
                   help="Output file name. Default: bscampp_result.jplace",
                   required=False, default="bscampp_result.jplace")),
             (("--threads", "--num-cpus"),
              dict(type=int, dest="num_cpus",
                   help="Number of cores for parallelization, default: -1 (all)",
                   required=False, default=-1)),
         ]),
        # advanced parameter settings
        ('advance_group',
         ("Advance parameters".upper(),
          ("These parameters control how BSCAMPP is run. "
           "The default values are set based on experiments.")),
         [
             (("-m", "--model"),
              dict(type=str,
                   help="Model used for edge distances. Default: GTR",
                   required=False, default="GTR")),
             (("-b", "--subtreesize"),
              dict(type=int,
                   help="Integer size of the subtree. Default: 2000",
                   required=False, default=2000)),
             (("-V", "--votes"),
              dict(type=int,
                   help="Number of votes per query sequence. Default: 5",
                   required=False, default=5)),
             (("--similarityflag",),
              dict(type=str2bool,
                   help="Boolean, True if maximizing sequence similarity "
                        "instead of simple Hamming distance (ignoring gap "
                        "sites in the query). Default: True",
                   required=False, default=True)),
         ]),
        # miscellaneous group
        ('misc_group',
         ("Miscellaneous parameters".upper(),),
         [
             (("-n", "--tmpfilenbr"),
              dict(type=int,
                   help="Temporary file indexing. Default: 0",
                   required=False, default=0)),
             (("--fragmentflag",),
              dict(type=str2bool,
                   help="If queries contains fragments. Default: True",
                   required=False, default=True)),
             (("--keeptemp",),
              dict(type=str2bool,
                   help="Boolean, True to keep all temporary files. "
                        "Default: False",
                   required=False, default=False)),
         ]),
    ]

    for group_key, group_args, options in group_specs:
        group = parser.add_argument_group(*group_args)
        parser.groups[group_key] = group
        for flags, settings in options:
            group.add_argument(*flags, **settings)

    return parser
240
+
241
def str2bool(b):
    """Convert a command-line token to a bool.

    Args:
        b: a ``bool`` (returned unchanged) or a string compared
            case-insensitively against the accepted spellings.

    Returns:
        True for 'yes', 'true', 't', 'y', '1'; False for 'no', 'false',
        'f', 'n', '0'.

    Raises:
        argparse.ArgumentTypeError: for any other string.
    """
    if isinstance(b, bool):
        return b

    token = b.lower()
    if token in ('yes', 'true', 't', 'y', '1'):
        return True
    if token in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
bscampp/tools/epa-ng ADDED
Binary file
@@ -0,0 +1,13 @@
1
# The imported target OpenMP::OpenMP_CXX used below is only provided by
# FindOpenMP from CMake 3.9 onwards, so that is the real minimum version.
cmake_minimum_required(VERSION 3.9)

project(hamming CXX)

# Every executable links against OpenMP unconditionally, so fail at
# configure time with a clear message when it cannot be found.
find_package(OpenMP REQUIRED)

add_executable(hamming src/new_hamming.cpp)
add_executable(fragment_hamming src/fragment_hamming.cpp)
add_executable(homology src/homology.cpp)

target_link_libraries(fragment_hamming PRIVATE OpenMP::OpenMP_CXX)
target_link_libraries(hamming PRIVATE OpenMP::OpenMP_CXX)
target_link_libraries(homology PRIVATE OpenMP::OpenMP_CXX)
Binary file
Binary file