bscampp 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bscampp/__init__.py +68 -0
- bscampp/configs.py +169 -0
- bscampp/default.config +5 -0
- bscampp/functions.py +409 -0
- bscampp/init_configs.py +93 -0
- bscampp/jobs.py +198 -0
- bscampp/pipeline.py +249 -0
- bscampp/tools/epa-ng +0 -0
- bscampp/tools/hamming_distance/CMakeLists.txt +13 -0
- bscampp/tools/hamming_distance/fragment_hamming +0 -0
- bscampp/tools/hamming_distance/hamming +0 -0
- bscampp/tools/hamming_distance/homology +0 -0
- bscampp/tools/hamming_distance/src/fragment_hamming.cpp +180 -0
- bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp +183 -0
- bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp +214 -0
- bscampp/tools/hamming_distance/src/homology.cpp +179 -0
- bscampp/tools/hamming_distance/src/new_hamming.cpp +161 -0
- bscampp/tools/pplacer +0 -0
- bscampp/utils.py +914 -0
- bscampp-1.0.1.dist-info/LICENSE +21 -0
- bscampp-1.0.1.dist-info/METADATA +234 -0
- bscampp-1.0.1.dist-info/RECORD +25 -0
- bscampp-1.0.1.dist-info/WHEEL +5 -0
- bscampp-1.0.1.dist-info/entry_points.txt +3 -0
- bscampp-1.0.1.dist-info/top_level.txt +1 -0
bscampp/__init__.py
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
############################################################
|
2
|
+
#
|
3
|
+
# Init file for BSCAMPP, using the __init__.py from
|
4
|
+
# SEPP as the original template. Current adaption comes
|
5
|
+
# from https://github.com/c5shen/TIPP3.git
|
6
|
+
#
|
7
|
+
############################################################
|
8
|
+
from operator import itemgetter
|
9
|
+
import logging, os
|
10
|
+
|
11
|
+
# update system recursion limit to avoid issues
# not really needed for BSCAMPP but safe to update here
os.sys.setrecursionlimit(1000000)

# current released version of the package
__version__ = "1.0.1"
# absolute path of the installed bscampp package directory
_INSTALL_PATH = __path__[0]

# global variables to store all loggers
# names of loggers already configured with a handler, so repeated
# get_logger() calls do not attach duplicate handlers
__set_loggers = set()
|
20
|
+
|
21
|
+
# obtain the current logging level, default to INFO
def get_logging_level(logging_level='info'):
    """Map a level name to its ``logging`` constant.

    The environment variable ``BSCAMPP_LOGGING_LEVEL``, when set, takes
    precedence over the ``logging_level`` argument. Any unrecognized
    name falls back to ``logging.INFO``.
    """
    name_to_level = {
        'DEBUG': logging.DEBUG,
        'INFO': logging.INFO,
        'WARNING': logging.WARNING,
        'ERROR': logging.ERROR,
        'CRITICAL': logging.CRITICAL,
    }
    # a user-set environment variable overrides the function argument
    env_level = os.getenv('BSCAMPP_LOGGING_LEVEL')
    chosen = env_level if env_level is not None else logging_level
    # default to INFO when the chosen name is not a known level
    return name_to_level.get(chosen.upper(), logging.INFO)
|
37
|
+
|
38
|
+
# obtain a logger for a given file
def get_logger(name='bscampp', log_path=None, logging_level='info'):
    """Return the logger for ``name``, configuring it on first request.

    Messages go to stdout unless ``log_path`` is given, in which case
    they are appended to that file. Each logger name is configured only
    once; later calls return the already-configured logger.
    """
    logger = logging.getLogger(name)
    if name in __set_loggers:
        return logger

    # first request for this name: set level, formatter, and handler
    level = get_logging_level(logging_level)
    formatter = logging.Formatter(
            ("[%(asctime)s] %(filename)s (line %(lineno)d):"
             " %(levelname) 8s: %(message)s"))
    formatter.datefmt = "%H:%M:%S"
    logger.setLevel(level)

    if log_path is None:
        # logging to stdout
        handler = logging.StreamHandler()
    else:
        # use FileHandler for logging, appending to the given file
        handler = logging.FileHandler(log_path, mode='a')
    handler.setLevel(level)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    __set_loggers.add(name)
    return logger
|
61
|
+
|
62
|
+
# logging exception
def log_exception(logger):
    """Log the active exception's traceback through ``logger`` and exit.

    Must be called from inside an ``except`` block (relies on the
    currently-handled exception). Terminates the process with status 1
    after logging.
    """
    import traceback, io, sys
    s = io.StringIO()
    # keyword form instead of the legacy positional (limit, file) call
    traceback.print_exc(file=s)
    logger.error(s.getvalue())
    # sys.exit instead of the `exit` builtin: `exit` is injected by the
    # site module and is absent under `python -S` / some embedders
    sys.exit(1)
|
bscampp/configs.py
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
import os, time
|
2
|
+
try:
|
3
|
+
import configparser
|
4
|
+
except ImportError:
|
5
|
+
from ConfigParser import configparser
|
6
|
+
from argparse import ArgumentParser, Namespace
|
7
|
+
from bscampp.init_configs import init_config_file
|
8
|
+
from bscampp import get_logger, log_exception
|
9
|
+
|
10
|
+
# detect home.path or create if missing; home.path records where the
# user-level configuration lives
homepath = os.path.dirname(__file__) + '/home.path'
_root_dir, main_config_path = init_config_file(homepath)

# set valid configparse section names
valid_config_sections = []
# recognized logging level names (mirrors get_logging_level in __init__)
logging_levels = set(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'])

# module-level logger for this file
_LOG = get_logger(__name__)
|
19
|
+
|
20
|
+
'''
Configuration defined by users and by default values
'''
class Configs:
    # make the module-level install root visible in this scope
    global _root_dir

    # basic input paths
    info_path = None    # info file for pplacer or EPA-ng
    tree_path = None    # placement tree path
    aln_path = None     # alignment for backbone. Optionally with queries
    qaln_path = None    # (optional) alignment for query.
    outdir = None       # output directory
    outname = None      # output name for the final jplace file
    keeptemp = False    # whether to keep all temporary files
    verbose = 'INFO'    # default verbose level to print
    num_cpus = 1        # number of cores to use for parallelization

    # binaries
    pplacer_path = None           # path to the pplacer executable
    epang_path = None             # path to the EPA-ng executable
    taxit_path = None             # path to taxit (builds pplacer refpkgs)
    hamming_distance_dir = None   # dir holding the hamming-distance tools

    # placement settings
    placement_method = 'epa-ng'   # base method: 'epa-ng' or 'pplacer'
    model = 'GTR'                 # substitution model (used by pplacer)
    subtreesize = 2000            # number of leaves extracted per subtree
    votes = 5                     # closest-leaf votes kept per query
    similarityflag = True         # use the 'homology' tool instead of hamming

    # miscellaneous
    tmpfilenbr = 0                # temp-file counter (usage not shown here)
    fragmentflag = True           # use 'fragment_hamming' for fragmentary queries
|
53
|
+
|
54
|
+
# check if the given configuration is valid to add
def set_valid_configuration(name, conf):
    """Merge recognized fields from a config Namespace onto Configs.

    Only the 'basic' section is applied; a field is copied only when it
    already exists on the Configs class and carries a truthy value.
    Anything else is silently ignored.
    """
    if not isinstance(conf, Namespace):
        _LOG.warning(
                "Looking for Namespace object from \'{}\' but find {}".format(
                    name, type(conf)))
        return

    # basic section defined in main.config; other sections are ignored
    if name != 'basic':
        return
    for key, value in vars(conf).items():
        # skip unset/falsy entries and unknown keys
        if value and key in Configs.__dict__:
            setattr(Configs, key, value)
|
72
|
+
|
73
|
+
# valid attribute check for print out
def valid_attribute(k, v):
    """Return True when ``k`` names a printable, public config key."""
    # only string keys without a leading underscore are shown
    return isinstance(k, str) and not k.startswith('_')
|
80
|
+
|
81
|
+
# print out current configuration
def getConfigs():
    """Pretty-print every public Configs field to stdout."""
    parts = ['\n************ Configurations ************\n',
             f'\thome.path: {homepath}\n',
             f'\tmain.config: {main_config_path}\n\n']
    for key, value in Configs.__dict__.items():
        if valid_attribute(key, value):
            parts.append(f'\tConfigs.{key}: {value}\n')
    print(''.join(parts), flush=True)
|
90
|
+
|
91
|
+
# read in config file if it exists
|
92
|
+
def _read_config_file(filename, cparser, opts,
|
93
|
+
child_process=False, expand=None):
|
94
|
+
config_defaults = []
|
95
|
+
with open(filename, 'r') as f:
|
96
|
+
cparser.read_file(f)
|
97
|
+
if cparser.has_section('commandline'):
|
98
|
+
for k, v in cparser.items('commandline'):
|
99
|
+
config_defaults.append(f'--{k}')
|
100
|
+
config_defaults.append(v)
|
101
|
+
|
102
|
+
for section in cparser.sections():
|
103
|
+
if section == 'commandline':
|
104
|
+
continue
|
105
|
+
if getattr(opts, section, None):
|
106
|
+
section_name_space = getattr(opts, section)
|
107
|
+
else:
|
108
|
+
section_name_space = Namespace()
|
109
|
+
for k, v in cparser.items(section):
|
110
|
+
if expand and k == 'path':
|
111
|
+
v = os.path.join(expand, v)
|
112
|
+
setattr(section_name_space, k, v)
|
113
|
+
setattr(opts, section, section_name_space)
|
114
|
+
return config_defaults
|
115
|
+
|
116
|
+
'''
Build Config class
'''
def buildConfigs(parser, cmdline_args, child_process=False, rerun=False):
    """Populate the global Configs class from main.config and argv.

    Precedence: values from main.config are parsed first, then the
    actual command-line arguments are parsed on top of them, so the
    command line wins. Also creates Configs.outdir, normalizes the
    output name to a .jplace suffix, clamps num_cpus, and asserts the
    chosen placement binary exists.

    NOTE(review): `rerun` is currently unused (see the commented-out
    update-configs block below).
    '''"""
    cparser = configparser.ConfigParser()
    # keep option-name case as written in the config file
    cparser.optionxform = str
    args = parser.parse_args(cmdline_args)

    # Check if only updating config files, if so, re-initialize the
    # configuration file at ~/.bscampp/main.config and exit
    #if args.command == 'update-configs':
    #    _ = init_config_file(homepath, rerun=True)
    #    _LOG.warning('Finished re-initializing the configuration file '
    #            f'at {main_config_path}, exiting...')
    #    exit(0)

    # first load arguments from main.configs
    main_args = Namespace()
    cmdline_main = _read_config_file(main_config_path,
            cparser, main_args, child_process=child_process)

    # merge arguments, in the correct order so things are overridden correctly
    args = parser.parse_args(cmdline_main + cmdline_args,
            namespace=main_args)

    # directly add all arguments that's defined in the Configs class
    for k in args.__dict__.keys():
        k_attr = getattr(args, k)
        if k in Configs.__dict__:
            # valid argument that's defined in the Configs class
            setattr(Configs, k, k_attr)
        else:
            # check if the argument is valid (section Namespace merge)
            set_valid_configuration(k, k_attr)

    # create outdir
    if not os.path.isdir(Configs.outdir):
        os.makedirs(Configs.outdir)

    # modify outname if it does not have a .jplace suffix
    if Configs.outname.split('.')[-1].lower() != 'jplace':
        Configs.outname += '.jplace'

    # modify num_cpus: positive values are capped at the machine's core
    # count, non-positive values mean "use all cores"
    if Configs.num_cpus > 0:
        Configs.num_cpus = min(os.cpu_count(), Configs.num_cpus)
    else:
        Configs.num_cpus = os.cpu_count()

    # sanity check for existence of base placement binary path
    if Configs.placement_method == 'epa-ng':
        assert os.path.exists(Configs.epang_path), 'epa-ng not detected!'
    elif Configs.placement_method == 'pplacer':
        assert os.path.exists(Configs.pplacer_path), 'pplacer not detected!'
|
bscampp/default.config
ADDED
bscampp/functions.py
ADDED
@@ -0,0 +1,409 @@
|
|
1
|
+
import json, time, os, sys
|
2
|
+
import treeswift
|
3
|
+
from collections import defaultdict, Counter
|
4
|
+
|
5
|
+
from bscampp import get_logger, log_exception
|
6
|
+
from bscampp.configs import Configs
|
7
|
+
from bscampp.jobs import EPAngJob, TaxtasticJob, PplacerTaxtasticJob
|
8
|
+
from bscampp.utils import write_fasta
|
9
|
+
import bscampp.utils as utils
|
10
|
+
|
11
|
+
_LOG = get_logger(__name__)
|
12
|
+
|
13
|
+
'''
Function to read in the placement tree and alignment.
If query alignment is provided, will use the provided query instead of
the ones (potentially) included in the reference alignment
'''
def readData(workdir, dry_run=False):
    start = time.perf_counter()
    _LOG.info('Reading in input data...')

    if dry_run:
        return None, dict(), '', dict(), '', dict()

    # (1) load reference tree
    tree = treeswift.read_tree_newick(Configs.tree_path)
    tree.resolve_polytomies()

    leaf_dict = tree.label_to_node(selection='leaves')
    # clean the leaf keys so that ' or " are not present
    for key in list(leaf_dict.keys()):
        node = leaf_dict.pop(key)
        cleaned = key.replace('\'', '').replace('\"', '')
        leaf_dict[cleaned] = node

    # (2) load reference alignment and query alignment (if provided)
    if Configs.qaln_path is not None:
        ref_dict = utils.read_data(Configs.aln_path)
        q_dict = utils.read_data(Configs.qaln_path)
        aln_path, qaln_path = Configs.aln_path, Configs.qaln_path
    else:
        aln_dict = utils.read_data(Configs.aln_path)
        ref_dict, q_dict = utils.seperate(aln_dict, leaf_dict)

        # after separating queries from the reference alignment, write
        # them to TEMP/
        qaln_path = os.path.join(workdir, 'qaln.fa')
        write_fasta(qaln_path, q_dict)

        aln_path = os.path.join(workdir, 'aln.fa')
        write_fasta(aln_path, ref_dict)

    end = time.perf_counter()
    _LOG.info('Time to read in input data: {} seconds'.format(end - start))
    return tree, leaf_dict, aln_path, ref_dict, qaln_path, q_dict
|
59
|
+
|
60
|
+
'''
Function to get the closest leaf for each query sequence based on Hamming
distance
'''
def getClosestLeaves(aln_path, qaln_path, aln, qaln, workdir, dry_run=False):
    """Run the external hamming-distance tool and collect, per query, its
    Configs.votes closest reference leaves.

    Returns (query_votes_dict, query_top_vote_dict): the former maps a
    query name to its candidate leaf labels sorted by increasing
    distance, the latter to the single closest leaf. Queries reported by
    the 'homology' tool as having no homologous sites are dropped.
    """
    import subprocess
    t0 = time.perf_counter()
    _LOG.info('Computing closest leaves for query sequences...')

    if dry_run:
        return dict(), dict()

    query_votes_dict = dict()
    query_top_vote_dict = dict()
    tmp_output = os.path.join(workdir, 'closest.txt')

    # pick the right binary: 'homology' when using similarity, otherwise
    # 'hamming' or 'fragment_hamming' for fragmentary queries
    if Configs.similarityflag:
        binary = 'homology'
    elif Configs.fragmentflag == False:
        binary = 'hamming'
    else:
        binary = 'fragment_hamming'
    cmd = [os.path.join(Configs.hamming_distance_dir, binary),
           aln_path, str(len(aln)), qaln_path, str(len(qaln)),
           tmp_output, str(Configs.votes)]
    # subprocess.run with an argument list (no shell) instead of
    # os.system(' '.join(cmd)): robust to spaces in paths and lets us
    # inspect the exit status
    proc = subprocess.run(cmd)
    if proc.returncode != 0:
        # keep the original best-effort behavior: warn, then try to read
        # whatever output was produced
        _LOG.warning('Hamming distance tool exited with code '
                '{}'.format(proc.returncode))

    # process closest leaves
    unusable_queries = set()
    with open(tmp_output) as f:
        for line in f:
            y = line.strip().split(',')
            name = y.pop(0)
            for idx, taxon in enumerate(y):
                leaf, hamming = taxon.split(':')
                y[idx] = (leaf, int(hamming))

            # sort candidates by increasing distance, then keep labels only
            y = sorted(y, key=lambda x: x[1])
            y = [taxon[0] for taxon in y]

            if name.find(':') >= 0:
                # 'homology' encodes "<name>:<ungapped length>"; a first
                # candidate equal to the ungapped length marks a query
                # with no homologous sites
                name_list = name.split(":")
                name = name_list[0]
                ungapped_length = name_list[1]
                if y[0] == ungapped_length:
                    _LOG.warning(f'Sequence {name}: no homologous sites found, '
                            'removed before placement.')
                    unusable_queries.add(name)
            if name not in unusable_queries:
                query_votes_dict[name] = y
                query_top_vote_dict[name] = y[0]

    t1 = time.perf_counter()
    _LOG.info('Time to compute closest leaves: {} seconds'.format(t1 - t0))
    return query_votes_dict, query_top_vote_dict
|
119
|
+
|
120
|
+
'''
Function to assign queries to subtrees based on their votes
'''
def assignQueriesToSubtrees(query_votes_dict, query_top_vote_dict,
        tree, leaf_dict, dry_run=False):
    """Partition queries into subtrees of the reference tree.

    Greedily extracts subtrees of Configs.subtreesize leaves around the
    most-voted seed leaf until every query is assigned, then reassigns
    each query to the candidate subtree minimizing the path length from
    the query's top-voted leaf to that subtree's seed leaf.

    Returns (new_subtree_dict, placed_query_list); the list is returned
    empty here and filled later by placeQueriesToSubtrees.

    NOTE: mutates (drains) query_votes_dict.
    """
    t0 = time.perf_counter()
    _LOG.info('Adding query votes to the placement tree...')

    if dry_run:
        return dict(), []

    # (1) go over the query votes and add them to corresponding leaves
    lf_votes = Counter()
    leaf_queries = dict()
    for name, y in query_votes_dict.items():
        lf_votes.update(y)
        for ind, leaf in enumerate(y):
            # the first (closest) candidate is the query's top vote
            top_vote = False
            if ind == 0:
                top_vote = True
            if leaf not in leaf_queries:
                leaf_queries[leaf] = {(name,top_vote)}
            else:
                leaf_queries[leaf].add((name,top_vote))

    subtree_dict = dict()
    subtree_leaf_label_dict = dict()
    # offset into most_common(); bumped when a round assigns nothing
    most_common_index = 0

    # assign queries to subtrees, and remove them from the pool
    # repeat until all queries are assigned
    while len(query_votes_dict) > 0:
        _LOG.info("queries left to assign: {}".format(len(query_votes_dict)))
        # seed leaf: the (most_common_index)-th most voted leaf
        (seed_label, node_votes) = lf_votes.most_common(
                most_common_index+1)[most_common_index]

        node_y = leaf_dict[seed_label]
        # extract [subtreesize] leaves
        labels = utils.subtree_nodes_with_edge_length(tree, node_y,
                Configs.subtreesize)
        subtree = tree.extract_tree_with(labels)
        label_set = set(labels)

        queries_by_subtree = set()
        subtree_query_set = set()

        # gather any other queries that can be used with this subtree
        for label in labels:
            leaf_queries_remove_set = set()
            if label in leaf_queries:

                for leaf_query, top_vote in leaf_queries[label]:

                    # already assigned elsewhere: schedule for cleanup
                    if leaf_query not in query_votes_dict:
                        leaf_queries_remove_set.add((leaf_query, top_vote))
                        continue

                    # only a query's top-voted leaf pulls it in
                    if top_vote:
                        subtree_query_set.add(leaf_query)
                        leaf_queries_remove_set.add((leaf_query, top_vote))

                leaf_queries[label].difference_update(leaf_queries_remove_set)
        queries_by_subtree.update(subtree_query_set)

        if len(queries_by_subtree) > 0:
            subtree_dict[subtree] = (seed_label, queries_by_subtree)
            subtree_leaf_label_dict[subtree] = subtree.label_to_node(
                    selection='leaves')

        # NOTE(review): votes_b4 is unused (debug leftover?)
        votes_b4 = len(list(lf_votes.elements()))
        # retire the assigned queries and their outstanding votes
        for query in queries_by_subtree:
            if query in query_votes_dict:
                lf_votes.subtract(query_votes_dict[query])
                query_votes_dict.pop(query)

        if len(queries_by_subtree) == 0:
            # 10.27.2023 - Chengze Shen
            # >>> prevent going over the total number of votes
            most_common_index += 1
        else:
            most_common_index = 0

    placed_query_list = []

    # reassign queries to the subtree minimizing total edge length
    # from the query's top vote to the subtree's seedleaf
    new_subtree_dict = dict()
    for query, closest_label in query_top_vote_dict.items():
        best_subtree = None
        best_distance = 99999999999999999
        for subtree, value in subtree_dict.items():
            leaf_label_dict = subtree_leaf_label_dict[subtree]
            seed_label, _ = value
            if closest_label in leaf_label_dict:
                distance = subtree.distance_between(
                        leaf_label_dict[closest_label],
                        leaf_label_dict[seed_label])
                if distance < best_distance:
                    best_subtree = subtree
                    best_distance = distance
        # best_subtree may remain None if no subtree contains the label
        if best_subtree in new_subtree_dict:
            new_subtree_dict[best_subtree].append(query)
        else:
            new_subtree_dict[best_subtree] = [query]

    t1 = time.perf_counter()
    _LOG.info('Time to assign queries to subtrees: {} seconds'.format(t1 - t0))
    return new_subtree_dict, placed_query_list
|
228
|
+
|
229
|
+
'''
Helper function to run a single placement task. Designed to use with
multiprocessing
'''
def placeOneSubtree():
    # TODO: not implemented yet — placeQueriesToSubtrees currently runs
    # all subtree placements sequentially in its own loop
    pass
|
236
|
+
|
237
|
+
'''
Function to perform placement of queries for each subtree
'''
def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
        aln, qaln, cmdline_args, workdir, pool, lock, dry_run=False):
    """Place each subtree's queries with EPA-ng or pplacer, then remap
    every placement edge from the subtree back onto the full tree.

    Appends placed query names to placed_query_list (mutated in place)
    and returns a jplace-style dict (tree, placements, fields, metadata,
    version). `pool` and `lock` are currently unused — placements run
    sequentially (see placeOneSubtree TODO).
    """
    t0 = time.perf_counter()
    _LOG.info('Performing placement on each subtree...')

    if dry_run:
        return dict()

    # prepare to write an aggregated results to local
    jplace = dict()
    # number the full tree's edges so subtree placements can be mapped back
    utils.add_edge_nbrs(tree)
    jplace["tree"] = utils.newick_edge_tokens(tree)
    placements = []

    # go over the dictionary of subtrees and their assigned queries
    # perform placement using either EPA-ng or pplacer
    final_subtree_count, total_subtrees_examined = 0, 0
    for subtree, query_list in new_subtree_dict.items():
        total_subtrees_examined += 1
        _LOG.info('- Subtree {}/{} with {} queries'.format(
            total_subtrees_examined, len(new_subtree_dict), len(query_list)))

        # empty subtree, continue
        if len(query_list) == 0:
            continue
        final_subtree_count += 1

        # name all temporary output files
        tmp_tree = os.path.join(workdir, 'tree')
        tmp_aln = os.path.join(workdir, f'subtree_{final_subtree_count}_aln.fa')
        tmp_qaln = os.path.join(workdir, f'subtree_{final_subtree_count}_qaln.fa')
        tmp_output = os.path.join(workdir,
                'subtree_{}_{}.jplace'.format(
                    final_subtree_count, Configs.placement_method))

        # extract corresponding ref sequences and queries
        tmp_leaf_dict = subtree.label_to_node(selection='leaves')
        if '' in tmp_leaf_dict:
            del tmp_leaf_dict['']
        tmp_ref_dict = {label : aln[label] for label in tmp_leaf_dict.keys()}
        tmp_q_dict = {name : qaln[name] for name in query_list}
        write_fasta(tmp_aln, tmp_ref_dict)
        write_fasta(tmp_qaln, tmp_q_dict)

        # process the subtree before placement
        subtree.resolve_polytomies()
        subtree.suppress_unifurcations()
        subtree.write_tree_newick(tmp_tree, hide_rooted_prefix=True)

        # 1.27.2025 - Chengze Shen
        # choose the placement method to run
        if Configs.placement_method == 'epa-ng':
            job = EPAngJob(path=Configs.epang_path,
                    info_path=Configs.info_path, tree_path=tmp_tree,
                    aln_path=tmp_aln, qaln_path=tmp_qaln,
                    outdir=workdir, num_cpus=Configs.num_cpus)
            # for EPA-ng, ensure that outpath name is changed to the one we want
            _outpath = job.run()
            os.system('mv {} {}'.format(_outpath, tmp_output))
        elif Configs.placement_method == 'pplacer':
            # build ref_pkg with info and tmp_tree and tmp_aln
            refpkg_dir = os.path.join(workdir,
                    f'subtree_{final_subtree_count}.refpkg')
            taxit_job = TaxtasticJob(path=Configs.taxit_path,
                    outdir=refpkg_dir, name=f'subtree_{final_subtree_count}',
                    aln_path=tmp_aln, tree_path=tmp_tree,
                    info_path=Configs.info_path)
            _ = taxit_job.run()

            # run pplacer-taxtastic
            job = PplacerTaxtasticJob(path=Configs.pplacer_path,
                    refpkg_dir=refpkg_dir, model=Configs.model,
                    outpath=tmp_output, num_cpus=Configs.num_cpus,
                    qaln_path=tmp_qaln)
            tmp_output = job.run()
        else:
            raise ValueError(
                    f"Placement method {Configs.placement_method} not recognized")

        # read in each placement result
        place_file = open(tmp_output, 'r')
        place_json = json.load(place_file)
        # jplace key holding the query name: "n" (EPA-ng) vs "nm" (pplacer)
        tgt = "n"
        if Configs.placement_method == 'pplacer':
            tgt = "nm"
        if len(place_json["placements"]) > 0:
            added_tree, edge_dict = utils.read_tree_newick_edge_tokens(
                    place_json["tree"])

            # obtain the fields for "p"
            fields = place_json["fields"]
            # set the fields in jplace accordingly (first subtree wins)
            if "fields" not in jplace:
                jplace["fields"] = fields
            field_to_idx = {field: i for i, field in enumerate(fields)}

            for tmp_place in place_json["placements"]:
                #print(tmp_place)
                placed_query_list.append(tmp_place[tgt][0])
                for i in range(len(tmp_place["p"])):
                    edge_num = tmp_place["p"][i][
                            field_to_idx['edge_num']]
                    edge_distal = tmp_place["p"][i][
                            field_to_idx['distal_length']]

                    # endpoints of the placement edge in the subtree
                    right_n = edge_dict[str(edge_num)]
                    left_n = right_n.get_parent()

                    # left and right path_l and path_r are in added_tree
                    left, path_l = utils.find_closest(left_n, {left_n, right_n})
                    right, path_r = utils.find_closest(right_n, {left_n, right_n})

                    # translate the closest leaves into the full tree
                    left = leaf_dict[left.get_label()]
                    right = leaf_dict[right.get_label()]
                    _, path = utils.find_closest(left, {left}, y=right)
                    # now left right and path are in tree

                    length = sum([x.get_edge_length() for x in path_l])+edge_distal
                    # left path length through subtree before placement node

                    # walk the full-tree path until the distance is used up;
                    # that edge receives the placement
                    target_edge = path[-1]

                    for j in range(len(path)):
                        length -= path[j].get_edge_length()
                        if length < 0:
                            target_edge = path[j]
                            break

                    #tmp_place["p"][i][field_to_idx['edge_num']] = 0

                    label = target_edge.get_label()

                    # labels carry "<taxon>%%<edge number>" from add_edge_nbrs
                    [taxon, target_edge_nbr] = label.split('%%',1)
                    tmp_place["p"][i][field_to_idx['distal_length']] = \
                            target_edge.get_edge_length()+length
                    tmp_place["p"][i][field_to_idx['edge_num']] = \
                            int(target_edge_nbr)

                placements.append(tmp_place.copy())
        place_file.close()
    _LOG.info(f'Final number of subtrees used: {final_subtree_count}')

    # prepare the output jplace to write
    jplace["placements"] = placements
    jplace["metadata"] = {"invocation": " ".join(cmdline_args)}
    jplace["version"] = 3
    #jplace["fields"] = ["distal_length", "edge_num", "like_weight_ratio", \
    #        "likelihood", "pendant_length"]

    t1 = time.perf_counter()
    _LOG.info('Time to place queries to subtrees: {} seconds'.format(t1 - t0))
    return jplace
|
392
|
+
|
393
|
+
'''
Function to write a given jplace object to local output
'''
def writeOutputJplace(output_jplace, dry_run=False):
    """Serialize the aggregated jplace dict to Configs.outdir/Configs.outname.

    Writes pretty-printed JSON with sorted keys. No-op when dry_run.
    """
    t0 = time.perf_counter()
    _LOG.info('Writing aggregated placements to local...')

    if dry_run:
        return

    outpath = os.path.join(Configs.outdir, Configs.outname)
    # context manager guarantees the handle is closed even if dump fails
    with open(outpath, 'w') as outf:
        json.dump(output_jplace, outf, sort_keys=True, indent=4)

    t1 = time.perf_counter()
    _LOG.info('Time to build final jplace file: {} seconds'.format(t1 - t0))
|