bscampp 1.0.1a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,93 @@
1
+ import os, sys, shutil
2
+ try:
3
+ import configparser
4
+ except ImportError:
5
+ import ConfigParser as configparser
6
+ from argparse import ArgumentParser, Namespace
7
+ from platform import platform
8
+
9
def find_main_config(homepath):
    """Locate main.config through the root directory recorded in home.path.

    Args:
        homepath: path to the ``home.path`` file. Its content (with
            surrounding whitespace stripped) is taken as the root directory.

    Returns:
        ``(_root_dir, main_config_path)`` when ``main.config`` exists under
        the recorded root directory, otherwise ``(None, None)``.

    Raises:
        OSError: if ``homepath`` cannot be opened for reading.
    """
    with open(homepath, 'r') as f:
        _root_dir = f.read().strip()

    # An empty home.path would make os.path.join() return the relative path
    # 'main.config', i.e., silently probe the current working directory.
    if not _root_dir:
        return None, None

    main_config_path = os.path.join(_root_dir, 'main.config')
    if os.path.exists(main_config_path):
        return _root_dir, main_config_path
    return None, None
17
+
18
def init_config_file(homepath, rerun=False, prioritize_user_software=True):
    '''
    Initialize the config file at ~/.bscampp/main.config
    By default will prioritize existing software from the user environment

    Args:
        homepath: path to the home.path file that records the root directory.
        rerun: when True, regenerate main.config even if a usable one is
            already recorded in home.path.
        prioritize_user_software: when True, executables found on PATH
            (pplacer, epa-ng, taxit) override the default tool paths.

    Returns:
        (_root_dir, main_config_path)

    Raises:
        AssertionError: default.config is missing next to this file.
    '''
    if not rerun:
        # make sure home.path exists
        if not os.path.exists(homepath):
            print(f'Cannot find home.path: {homepath}, regenerating...')
        else:
            _root_dir, main_config_path = find_main_config(homepath)
            if _root_dir is not None:
                return _root_dir, main_config_path
            print('Cannot find main.config, regenerating...')

    _root_dir = os.path.expanduser('~/.bscampp')
    main_config_path = os.path.join(_root_dir, 'main.config')
    print(f'Initializing the config file at: {main_config_path}')

    # write to local home.path and _root_dir
    # (exist_ok avoids failing when the directory appears between the
    # check and the creation)
    os.makedirs(_root_dir, exist_ok=True)
    with open(homepath, 'w') as f:
        f.write(_root_dir)

    # create main.config based on the default.config at this file's location
    _config_path = os.path.join(os.path.dirname(__file__), 'default.config')
    cparser = configparser.ConfigParser()
    # keep option names case-sensitive
    cparser.optionxform = str
    # raised explicitly so the check survives `python -O`; the exception type
    # is kept as AssertionError for backward compatibility
    if not os.path.exists(_config_path):
        raise AssertionError(
            'default config file missing! Please redownload from GitHub\n')

    if os.path.exists(main_config_path):
        print(f'Main configuration file {main_config_path} exists...')
        print('Overwriting the existing config file...')
    print('\n')

    with open(_config_path, 'r') as f:
        cparser.read_file(f)

    # check platform, e.g., macOS or linux, etc.
    platform_name = platform()
    print(f'System is: {platform_name}')

    tools_dir = os.path.join(os.path.dirname(__file__), 'tools')

    # default path to all potential binaries
    cparser.set('basic', 'pplacer_path',
            os.path.join(tools_dir, 'pplacer'))
    cparser.set('basic', 'epang_path',
            os.path.join(tools_dir, 'epa-ng'))
    cparser.set('basic', 'hamming_distance_dir',
            os.path.join(tools_dir, 'hamming_distance'))

    # macOS TODO: need to recompile the binaries
    # platform() reports 'macOS-...' on Python >= 3.8 but 'Darwin-...' on
    # older interpreters, so both spellings are accepted
    lowered_platform = platform_name.lower()
    if 'macos' in lowered_platform or 'darwin' in lowered_platform:
        cparser.set('basic', 'hamming_distance_dir',
                os.path.join(tools_dir, 'macOS', 'hamming_distance'))

    # prioritize user's software
    if prioritize_user_software:
        print('Detecting existing software from user\'s environment...')
        softwares = ['pplacer', 'epa-ng', 'taxit']
        for software in softwares:
            # option names drop the dash, e.g., 'epa-ng' -> 'epang_path'
            sname = software.replace('-', '')
            software_path = shutil.which(software)
            if software_path:
                print('\t{}: {}'.format(software, software_path))
                cparser.set('basic', f'{sname}_path', software_path)

    with open(main_config_path, 'w') as f:
        cparser.write(f)
    print(f'\n(Done) main.config was written to: {main_config_path}')
    print(f'If you want to make changes, please directly edit {main_config_path}')
    return _root_dir, main_config_path
bscampp/jobs.py ADDED
@@ -0,0 +1,198 @@
1
+ import os, shutil, subprocess, stat, re, traceback, shlex
2
+ from subprocess import Popen
3
+ from abc import abstractmethod
4
+
5
+ from bscampp import get_logger, log_exception
6
+ from bscampp.configs import Configs
7
+
8
+ _LOG = get_logger(__name__)
9
+
10
class Job(object):
    '''
    Template class Job for running external software/jobs

    Subclasses implement get_invocation(), which returns (cmd, outpath):
    the argument list to execute and the path of the expected output.
    '''
    def __init__(self):
        self.job_type = ""
        self.errors = []
        self.b_ignore_error = False
        self.pid = -1
        self.returncode = 0

    def __call__(self):
        return self.run()

    def get_pid(self):
        return self.pid

    # log a message, holding the given lock (if any) while logging
    @staticmethod
    def _log_locked(lock, log_fn, message):
        if not lock:
            log_fn(message)
            return
        # acquire outside the try block so that release() is only called
        # when the lock is actually held
        lock.acquire()
        try:
            log_fn(message)
        finally:
            lock.release()

    # run the job with given invocation and raise errors when encountered
    def run(self, stdin="", lock=None, logging=False, shell=False):
        '''
        Execute the command returned by get_invocation().

        stdin:   text passed to the process's standard input
        lock:    optional lock (acquire/release) held while logging
        logging: if True, the process's stdout is written to
                 <job_type>.txt in the directory of the output path
        shell:   currently unused; kept for interface compatibility

        Returns the output path when the process exits with code 0.
        Raises SystemExit(1) when the process exits with a non-zero code.
        Any other exception is handed to log_exception(_LOG).
        '''
        try:
            cmd, outpath = self.get_invocation()
            _LOG.debug(f'Running job_type: {self.job_type}, output: {outpath}')

            # failsafe for NotImplemented jobs
            if len(cmd) == 0:
                raise ValueError(
                        f'{self.job_type} does not have a valid run command. '
                        'It might be due to (invalid input type, etc.).')

            # identify binaries as the first field
            binpath = cmd[0]
            # deal with special cases, e.g., python, java
            if binpath == 'java':
                binpath = cmd[2]
            elif binpath == 'python' or binpath == 'python3':
                binpath = cmd[1]
            # explicit raise (same exception type as the former assert) so
            # the check is not stripped under `python -O`; this single check
            # also covers the former, redundant path-existence assertion
            if not (os.path.exists(binpath) or binpath == 'gzip'):
                raise AssertionError(
                        'executable for %s does not exist: %s' %
                        (self.job_type, binpath))

            _LOG.debug('Arguments: %s', ' '.join(
                (str(x) if x is not None else '?NoneType?' for x in cmd)))

            # logging to local or to PIPE
            if logging:
                logpath = os.path.join(
                        os.path.dirname(outpath), f'{self.job_type}.txt')
                # the context manager closes the log even if Popen fails

                # TODO: may need to deal with piping in the future, for now
                # it is not needed
                with open(logpath, 'w', 1) as outlogging:
                    p = Popen(cmd, text=True, bufsize=1,
                            stdin=subprocess.PIPE,
                            stdout=outlogging, stderr=subprocess.PIPE)
                    self.pid = p.pid
                    stdout, stderr = p.communicate(input=stdin)
            else:
                p = Popen(cmd, text=True, bufsize=1,
                        stdin=subprocess.PIPE,
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                self.pid = p.pid
                stdout, stderr = p.communicate(input=stdin)
            self.returncode = p.returncode

            # communicate() returns None for streams that are not pipes
            # (stdout when logging to a file); normalize for the messages
            stdout = stdout or ''
            stderr = stderr or ''

            # successful run and write to log
            if self.returncode == 0:
                self._log_locked(lock, _LOG.debug,
                        f'{self.job_type} completed, output: {outpath}')
                return outpath

            error_msg = ' '.join([f'Error occurred running {self.job_type}.',
                f'returncode: {self.returncode}'])
            self._log_locked(lock, _LOG.error,
                    error_msg + '\nSTDOUT: ' + stdout +
                    '\nSTDERR: ' + stderr)
            # SystemExit is not an Exception subclass, so it is not
            # intercepted by the handler below
            raise SystemExit(1)
        except Exception:
            log_exception(_LOG)

    # implemented in subclass
    # return: (cmd, outpath)
    @abstractmethod
    def get_invocation(self):
        raise NotImplementedError(
            'get_invocation() should be implemented by subclasses.')
115
+
116
class EPAngJob(Job):
    '''
    A EPA-ng job that runs EPA-ng with given parameters

    Any keyword argument given to the constructor is set as an attribute
    and overrides the default of the same name.
    '''
    def __init__(self, **kwargs):
        Job.__init__(self)
        self.job_type = 'epa-ng'

        # defaults: all path-like fields start empty, one CPU
        for attr in ('path', 'info_path', 'tree_path', 'aln_path',
                'qaln_path', 'outdir'):
            setattr(self, attr, '')
        self.num_cpus = 1

        # caller-supplied overrides
        for key, value in kwargs.items():
            setattr(self, key, value)

    def get_invocation(self):
        # the result file is always written under outdir with a fixed name
        self.outpath = os.path.join(self.outdir, 'epa_result.jplace')
        inputs = ['-m', self.info_path,
                '-t', self.tree_path,
                '-s', self.aln_path,
                '-q', self.qaln_path]
        run_opts = ['-w', self.outdir, '-T', str(self.num_cpus), '--redo']
        return [self.path] + inputs + run_opts, self.outpath
143
+
144
class TaxtasticJob(Job):
    '''
    A taxtastic job that create a refpkg based on given parameters

    Any keyword argument given to the constructor is set as an attribute
    and overrides the default of the same name.
    '''
    def __init__(self, **kwargs):
        Job.__init__(self)
        self.job_type = 'taxit'

        # defaults: every field starts as an empty string
        for attr in ('path', 'outdir', 'name', 'tree_path', 'aln_path',
                'info_path'):
            setattr(self, attr, '')

        # caller-supplied overrides
        for key, value in kwargs.items():
            setattr(self, key, value)

    def get_invocation(self):
        # the output of this job is the outdir itself
        self.outpath = os.path.join(self.outdir)

        # check which model file is provided: a path containing 'bestModel'
        # is passed as --model-file, anything else as --tree-stats
        if 'bestModel' in self.info_path:
            model_flag = '--model-file'
        else:
            model_flag = '--tree-stats'

        cmd = [self.path, 'create',
                '-P', self.outdir,
                '-l', self.name,
                '--aln-fasta', self.aln_path,
                '--tree-file', self.tree_path,
                model_flag, self.info_path]
        return cmd, self.outpath
173
+
174
class PplacerTaxtasticJob(Job):
    '''
    A pplacer job that uses taxtastic refpkg to place sequences

    Any keyword argument given to the constructor is set as an attribute
    and overrides the default of the same name.
    '''
    def __init__(self, **kwargs):
        Job.__init__(self)
        self.job_type = 'pplacer-taxtastic'

        # defaults: empty paths, GTR model, one CPU
        for attr in ('path', 'refpkg_dir', 'qaln_path', 'outdir', 'outpath'):
            setattr(self, attr, '')
        self.model = 'GTR'
        self.num_cpus = 1

        # caller-supplied overrides
        for key, value in kwargs.items():
            setattr(self, key, value)

    def get_invocation(self):
        # outpath is taken as-is from the attribute (not derived from outdir)
        options = ['-m', self.model,
                '-c', self.refpkg_dir,
                '-o', self.outpath,
                '-j', str(self.num_cpus)]
        return [self.path] + options + [self.qaln_path], self.outpath
bscampp/pipeline.py ADDED
@@ -0,0 +1,224 @@
1
+ import json, time, sys, os, shutil
2
+ from argparse import ArgumentParser, Namespace, RawDescriptionHelpFormatter
3
+ import argparse
4
+
5
+ from bscampp import get_logger, log_exception, __version__
6
+ from bscampp.configs import *
7
+ from bscampp.functions import *
8
+ import bscampp.utils as utils
9
+
10
+ from multiprocessing import Manager
11
+ from concurrent.futures import ProcessPoolExecutor
12
+
13
+ _LOG = get_logger(__name__)
14
+
15
def initial_pool(parser, cmdline_args):
    """Process pool initializer: rebuild the configs in a child process.

    Passing child_process=True avoids redundant logging from the child.
    """
    buildConfigs(parser, cmdline_args, child_process=True)
19
+
20
# main pipeline for BSCAMPP
def bscampp_pipeline(*args, **kwargs):
    """Run the BSCAMPP pipeline using the arguments in sys.argv.

    Positional and keyword arguments are accepted but not used. The process
    pool is shut down even when one of the pipeline steps raises.
    """
    t0 = time.perf_counter()
    m = Manager(); lock = m.Lock()

    # parse command line arguments and build configurations
    parser, cmdline_args = parseArguments()

    # initialize multiprocessing (if needed)
    _LOG.warning('Initializing ProcessPoolExecutor...')
    pool = ProcessPoolExecutor(Configs.num_cpus, initializer=initial_pool,
            initargs=(parser, cmdline_args,))

    try:
        # (0) temporary files wrote to here
        workdir = os.path.join(Configs.outdir, f'tmp{Configs.tmpfilenbr}')
        try:
            os.makedirs(workdir, exist_ok=True)
        except OSError:
            log_exception(_LOG)

        # (1) read in tree, alignment, and separate reference sequences from
        # query sequences
        tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir)

        # (2) compute closest leaves for all query sequences
        query_votes_dict, query_top_vote_dict = getClosestLeaves(
                aln_path, qaln_path, aln, qaln, workdir)

        # (3) first assign all queries to their closest-leaf subtrees,
        # then do reassignment to minimize distance between each's top vote
        # and the subtree's seed leaf
        new_subtree_dict, placed_query_list = assignQueriesToSubtrees(
                query_votes_dict, query_top_vote_dict, tree, leaf_dict)

        # (4) perform placement for each subtree
        output_jplace = placeQueriesToSubtrees(tree, leaf_dict,
                new_subtree_dict, placed_query_list, aln, qaln,
                cmdline_args, workdir, pool, lock)

        # (5) write the output jplace to local
        writeOutputJplace(output_jplace)
    finally:
        # shutdown pool (also on failure, so worker processes are not left
        # running while the exception propagates)
        _LOG.warning('Shutting down ProcessPoolExecutor...')
        pool.shutdown()
        _LOG.warning('ProcessPoolExecutor shut down.')

    # clean up temp files if not keeping
    if not Configs.keeptemp:
        _LOG.info('Removing temporary files...')
        clean_temp_files()

    # stop BSCAMPP
    send = time.perf_counter()
    _LOG.info('BSCAMPP completed in {} seconds...'.format(send - t0))
75
+
76
def clean_temp_files():
    """Remove the temporary files/directories created under Configs.outdir.

    Entries that are neither a file nor a directory (e.g., already gone)
    are skipped without logging.
    """
    # all temporary files/directories to remove
    for item in [f'tmp{Configs.tmpfilenbr}']:
        target = os.path.join(Configs.outdir, item)
        if os.path.isfile(target):
            remove = os.remove
        elif os.path.isdir(target):
            remove = shutil.rmtree
        else:
            continue
        remove(target)
        _LOG.info(f'- Removed {item}')
88
+
89
def parseArguments():
    """Create the argument parser and build the configs from sys.argv.

    Returns:
        (parser, cmdline_args): the parser from _init_parser() and the raw
        command-line arguments (sys.argv without the program name).
    """
    parser = _init_parser()
    cmdline_args = sys.argv[1:]

    # build config
    buildConfigs(parser, cmdline_args)
    _LOG.info('BSCAMPP is running with: {}'.format(
        ' '.join(cmdline_args)))
    # NOTE(review): presumably reports the resulting configuration; confirm
    # against bscampp.configs
    getConfigs()

    return parser, cmdline_args
101
+
102
def _init_parser():
    """Build the command-line parser for BSCAMPP.

    The argument groups are also stored in ``parser.groups`` under the keys
    'basic_group', 'advance_group' and 'misc_group'.

    Returns:
        The configured ArgumentParser.
    """
    # example usage (shown as the epilog of --help)
    example_usages = '''Example usages:
> default
%(prog)s -i raxml.info
'''

    description = ("This program runs BSCAMPP, a scalable phylogenetic "
                   "placement framework that scales EPA-ng/pplacer "
                   "to very large tree placement.")
    parser = ArgumentParser(description=description,
                            conflict_handler='resolve',
                            epilog=example_usages,
                            formatter_class=utils.SmartHelpFormatter)
    parser.add_argument('-v', '--version', action='version',
                        version="%(prog)s " + __version__)
    parser.groups = dict()

    # Whether -i/-t/-a must be given. A disabled 'update-configs'
    # subcommand was meant to relax this (setting it to False when
    # 'update-configs' appears in sys.argv); until that is enabled the
    # inputs are always required.
    must_provide = True

    # basic group
    basic = parser.add_argument_group(
        "Basic parameters".upper(),
        "These are the basic parameters for BSCAMPP.")
    parser.groups['basic_group'] = basic

    basic.add_argument(
        '--placement-method', type=str,
        choices=['epa-ng', 'pplacer'], default='epa-ng', required=False,
        help='The base placement method to use. Default: epa-ng')
    basic.add_argument(
        "-i", "--info", "--info-path", type=str, dest="info_path",
        required=must_provide, default=None,
        help=("Path to model parameters. E.g., .bestModel "
              "from RAxML/RAxML-ng"))
    basic.add_argument(
        "-t", "--tree", "--tree-path", type=str, dest="tree_path",
        required=must_provide, default=None,
        help="Path to reference tree with estimated branch lengths")
    basic.add_argument(
        "-a", "--alignment", "--aln-path", type=str, dest="aln_path",
        required=must_provide, default=None,
        help=("Path for reference sequence alignment in "
              "FASTA format. Optionally with query sequences. "
              "Query alignment can be specified with --qaln-path"))
    basic.add_argument(
        "-q", "--qalignment", "--qaln-path", type=str, dest="qaln_path",
        required=False, default=None,
        help=("Optionally provide path to query sequence alignment "
              "in FASTA format. Default: None"))
    basic.add_argument(
        "-d", "--outdir", type=str,
        required=False, default="bscampp_output",
        help="Directory path for output. Default: bscampp_output/")
    basic.add_argument(
        "-o", "--output", type=str, dest="outname",
        required=False, default="bscampp_result.jplace",
        help="Output file name. Default: bscampp_result.jplace")
    basic.add_argument(
        "--threads", "--num-cpus", type=int, dest="num_cpus",
        required=False, default=-1,
        help="Number of cores for parallelization, default: -1 (all)")

    # advanced parameter settings
    advanced = parser.add_argument_group(
        "Advance parameters".upper(),
        ("These parameters control how BSCAMPP is run. "
         "The default values are set based on experiments."
         ))
    parser.groups['advance_group'] = advanced

    advanced.add_argument(
        "-m", "--model", type=str,
        required=False, default="GTR",
        help="Model used for edge distances. Default: GTR")
    advanced.add_argument(
        "-b", "--subtreesize", type=int,
        required=False, default=2000,
        help="Integer size of the subtree. Default: 2000")
    advanced.add_argument(
        "-V", "--votes", type=int,
        required=False, default=5,
        help="Number of votes per query sequence. Default: 5")
    advanced.add_argument(
        "--similarityflag", type=str2bool,
        required=False, default=True,
        help="Boolean, True if maximizing sequence similarity "
             "instead of simple Hamming distance (ignoring gap "
             "sites in the query). Default: True")

    # miscellaneous group
    misc = parser.add_argument_group(
        "Miscellaneous parameters".upper(),)
    parser.groups['misc_group'] = misc

    misc.add_argument(
        "-n", "--tmpfilenbr", type=int,
        required=False, default=0,
        help="Temporary file indexing. Default: 0")
    misc.add_argument(
        "--fragmentflag", type=str2bool,
        required=False, default=True,
        help="If queries contains fragments. Default: True")
    misc.add_argument(
        "--keeptemp", type=str2bool,
        required=False, default=False,
        help="Boolean, True to keep all temporary files. "
             "Default: False")
    return parser
215
+
216
def str2bool(b):
    """Convert a command-line token to a bool (usable as argparse ``type=``).

    Matching is case-insensitive and ignores surrounding whitespace.
    Accepted true values: yes, true, t, y, 1; false values: no, false, f,
    n, 0. A bool is returned unchanged; other non-string values are
    converted with str() first.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognized token.
    """
    if isinstance(b, bool):
        return b
    token = str(b).strip().lower()
    if token in ('yes', 'true', 't', 'y', '1'):
        return True
    if token in ('no', 'false', 'f', 'n', '0'):
        return False
    # name the rejected value so the argparse error is actionable
    raise argparse.ArgumentTypeError(
        f'Boolean value expected, got {b!r}.')
bscampp/tools/epa-ng ADDED
Binary file
@@ -0,0 +1,13 @@
1
# The OpenMP::OpenMP_CXX imported target used below is only provided by
# FindOpenMP starting with CMake 3.9.
cmake_minimum_required(VERSION 3.9)

project(hamming CXX)

# Every target links against OpenMP unconditionally, so fail at configure
# time with a clear message instead of a missing-target error later.
find_package(OpenMP REQUIRED)

add_executable(hamming src/new_hamming.cpp)
add_executable(fragment_hamming src/fragment_hamming.cpp)
add_executable(homology src/homology.cpp)

target_link_libraries(fragment_hamming PRIVATE OpenMP::OpenMP_CXX)
target_link_libraries(hamming PRIVATE OpenMP::OpenMP_CXX)
target_link_libraries(homology PRIVATE OpenMP::OpenMP_CXX)
Binary file
Binary file