bscampp 1.0.1a0__py3-none-any.whl → 1.0.1b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bscampp/__init__.py +1 -1
- bscampp/functions.py +24 -9
- bscampp/pipeline.py +34 -13
- bscampp/utils.py +22 -21
- {bscampp-1.0.1a0.dist-info → bscampp-1.0.1b0.dist-info}/METADATA +6 -1
- {bscampp-1.0.1a0.dist-info → bscampp-1.0.1b0.dist-info}/RECORD +10 -10
- {bscampp-1.0.1a0.dist-info → bscampp-1.0.1b0.dist-info}/LICENSE +0 -0
- {bscampp-1.0.1a0.dist-info → bscampp-1.0.1b0.dist-info}/WHEEL +0 -0
- {bscampp-1.0.1a0.dist-info → bscampp-1.0.1b0.dist-info}/entry_points.txt +0 -0
- {bscampp-1.0.1a0.dist-info → bscampp-1.0.1b0.dist-info}/top_level.txt +0 -0
bscampp/__init__.py
CHANGED
bscampp/functions.py
CHANGED
@@ -15,10 +15,13 @@ Function to read in the placement tree and alignment.
|
|
15
15
|
If query alignment is provided, will use the provided query instead of
|
16
16
|
the ones (potentially) included in the reference alignment
|
17
17
|
'''
|
18
|
-
def readData(workdir):
|
18
|
+
def readData(workdir, dry_run=False):
|
19
19
|
t0 = time.perf_counter()
|
20
20
|
_LOG.info('Reading in input data...')
|
21
21
|
|
22
|
+
if dry_run:
|
23
|
+
return None, dict(), '', dict(), '', dict()
|
24
|
+
|
22
25
|
# (1) load reference tree
|
23
26
|
tree = treeswift.read_tree_newick(Configs.tree_path)
|
24
27
|
tree.resolve_polytomies()
|
@@ -45,10 +48,10 @@ def readData(workdir):
|
|
45
48
|
# after separating queries from the reference alignment, write
|
46
49
|
# them to to TEMP/
|
47
50
|
qaln_path = os.path.join(workdir, 'qaln.fa')
|
48
|
-
write_fasta(
|
51
|
+
write_fasta(qaln_path, q_dict)
|
49
52
|
|
50
53
|
aln_path = os.path.join(workdir, 'aln.fa')
|
51
|
-
write_fasta(
|
54
|
+
write_fasta(aln_path, ref_dict)
|
52
55
|
|
53
56
|
t1 = time.perf_counter()
|
54
57
|
_LOG.info('Time to read in input data: {} seconds'.format(t1 - t0))
|
@@ -58,19 +61,22 @@ def readData(workdir):
|
|
58
61
|
Function to get the closest leaf for each query sequence based on Hamming
|
59
62
|
distance
|
60
63
|
'''
|
61
|
-
def getClosestLeaves(aln_path, qaln_path, aln, qaln, workdir):
|
64
|
+
def getClosestLeaves(aln_path, qaln_path, aln, qaln, workdir, dry_run=False):
|
62
65
|
t0 = time.perf_counter()
|
63
66
|
_LOG.info('Computing closest leaves for query sequences...')
|
67
|
+
|
68
|
+
if dry_run:
|
69
|
+
return dict(), dict()
|
70
|
+
|
64
71
|
query_votes_dict = dict()
|
65
72
|
query_top_vote_dict = dict()
|
66
|
-
|
67
73
|
tmp_output = os.path.join(workdir, 'closest.txt')
|
68
74
|
|
69
75
|
cmd = []
|
70
76
|
if Configs.similarityflag:
|
71
77
|
cmd.append(os.path.join(Configs.hamming_distance_dir, 'homology'))
|
72
78
|
else:
|
73
|
-
if
|
79
|
+
if Configs.fragmentflag == False:
|
74
80
|
cmd.append(os.path.join(Configs.hamming_distance_dir, 'hamming'))
|
75
81
|
else:
|
76
82
|
cmd.append(os.path.join(
|
@@ -115,10 +121,13 @@ def getClosestLeaves(aln_path, qaln_path, aln, qaln, workdir):
|
|
115
121
|
Function to assign queries to subtrees based on their votes
|
116
122
|
'''
|
117
123
|
def assignQueriesToSubtrees(query_votes_dict, query_top_vote_dict,
|
118
|
-
tree, leaf_dict):
|
124
|
+
tree, leaf_dict, dry_run=False):
|
119
125
|
t0 = time.perf_counter()
|
120
126
|
_LOG.info('Adding query votes to the placement tree...')
|
121
127
|
|
128
|
+
if dry_run:
|
129
|
+
return dict(), []
|
130
|
+
|
122
131
|
# (1) go over the query votes and add them to corresponding leaves
|
123
132
|
lf_votes = Counter()
|
124
133
|
leaf_queries = dict()
|
@@ -229,10 +238,13 @@ def placeOneSubtree():
|
|
229
238
|
Function to perform placement of queries for each subtree
|
230
239
|
'''
|
231
240
|
def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
232
|
-
aln, qaln, cmdline_args, workdir, pool, lock):
|
241
|
+
aln, qaln, cmdline_args, workdir, pool, lock, dry_run=False):
|
233
242
|
t0 = time.perf_counter()
|
234
243
|
_LOG.info('Performing placement on each subtree...')
|
235
244
|
|
245
|
+
if dry_run:
|
246
|
+
return dict()
|
247
|
+
|
236
248
|
# prepare to write an aggregated results to local
|
237
249
|
jplace = dict()
|
238
250
|
utils.add_edge_nbrs(tree)
|
@@ -381,9 +393,12 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
|
|
381
393
|
'''
|
382
394
|
Function to write a given jplace object to local output
|
383
395
|
'''
|
384
|
-
def writeOutputJplace(output_jplace):
|
396
|
+
def writeOutputJplace(output_jplace, dry_run=False):
|
385
397
|
t0 = time.perf_counter()
|
386
398
|
_LOG.info('Writing aggregated placements to local...')
|
399
|
+
|
400
|
+
if dry_run:
|
401
|
+
return
|
387
402
|
|
388
403
|
outpath = os.path.join(Configs.outdir, Configs.outname)
|
389
404
|
outf = open(outpath, 'w')
|
bscampp/pipeline.py
CHANGED
@@ -22,8 +22,13 @@ def bscampp_pipeline(*args, **kwargs):
|
|
22
22
|
t0 = time.perf_counter()
|
23
23
|
m = Manager(); lock = m.Lock()
|
24
24
|
|
25
|
+
# set up a dry run if specified
|
26
|
+
dry_run = False
|
27
|
+
if 'dry_run' in kwargs and isinstance(kwargs['dry_run'], bool):
|
28
|
+
dry_run = kwargs['dry_run']
|
29
|
+
|
25
30
|
# parse command line arguments and build configurations
|
26
|
-
parser, cmdline_args = parseArguments()
|
31
|
+
parser, cmdline_args = parseArguments(dry_run=dry_run)
|
27
32
|
|
28
33
|
# initialize multiprocessing (if needed)
|
29
34
|
_LOG.warning('Initializing ProcessPoolExecutor...')
|
@@ -31,33 +36,39 @@ def bscampp_pipeline(*args, **kwargs):
|
|
31
36
|
initargs=(parser, cmdline_args,))
|
32
37
|
|
33
38
|
# (0) temporary files wrote to here
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
os.
|
38
|
-
|
39
|
-
|
39
|
+
if not dry_run:
|
40
|
+
workdir = os.path.join(Configs.outdir, f'tmp{Configs.tmpfilenbr}')
|
41
|
+
try:
|
42
|
+
if not os.path.isdir(workdir):
|
43
|
+
os.makedirs(workdir)
|
44
|
+
except OSError:
|
45
|
+
log_exception(_LOG)
|
46
|
+
else:
|
47
|
+
workdir = os.getcwd()
|
40
48
|
|
41
49
|
# (1) read in tree, alignment, and separate reference sequences from
|
42
50
|
# query sequences
|
43
|
-
tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir
|
51
|
+
tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir,
|
52
|
+
dry_run=dry_run)
|
44
53
|
|
45
54
|
# (2) compute closest leaves for all query sequences
|
46
55
|
query_votes_dict, query_top_vote_dict = getClosestLeaves(
|
47
|
-
aln_path, qaln_path, aln, qaln, workdir)
|
56
|
+
aln_path, qaln_path, aln, qaln, workdir, dry_run=dry_run)
|
48
57
|
|
49
58
|
# (3) first assign all queries to their closest-leaf subtrees,
|
50
59
|
# then do reassignment to minimize distance between each's top vote
|
51
60
|
# and the subtree's seed leaf
|
52
61
|
new_subtree_dict, placed_query_list = assignQueriesToSubtrees(
|
53
|
-
query_votes_dict, query_top_vote_dict, tree, leaf_dict
|
62
|
+
query_votes_dict, query_top_vote_dict, tree, leaf_dict,
|
63
|
+
dry_run=dry_run)
|
54
64
|
|
55
65
|
# (4) perform placement for each subtree
|
56
66
|
output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
|
57
|
-
placed_query_list, aln, qaln, cmdline_args, workdir, pool, lock
|
67
|
+
placed_query_list, aln, qaln, cmdline_args, workdir, pool, lock,
|
68
|
+
dry_run=dry_run)
|
58
69
|
|
59
70
|
# (5) write the output jplace to local
|
60
|
-
writeOutputJplace(output_jplace)
|
71
|
+
writeOutputJplace(output_jplace, dry_run=dry_run)
|
61
72
|
|
62
73
|
# shutdown pool
|
63
74
|
_LOG.warning('Shutting down ProcessPoolExecutor...')
|
@@ -73,6 +84,11 @@ def bscampp_pipeline(*args, **kwargs):
|
|
73
84
|
send = time.perf_counter()
|
74
85
|
_LOG.info('BSCAMPP completed in {} seconds...'.format(send - t0))
|
75
86
|
|
87
|
+
if dry_run:
|
88
|
+
return True
|
89
|
+
else:
|
90
|
+
return False
|
91
|
+
|
76
92
|
def clean_temp_files():
|
77
93
|
# all temporary files/directories to remove
|
78
94
|
temp_items = [f'tmp{Configs.tmpfilenbr}']
|
@@ -86,10 +102,15 @@ def clean_temp_files():
|
|
86
102
|
continue
|
87
103
|
_LOG.info(f'- Removed {temp}')
|
88
104
|
|
89
|
-
def parseArguments():
|
105
|
+
def parseArguments(dry_run=False):
|
90
106
|
global _root_dir, main_config_path
|
107
|
+
|
91
108
|
parser = _init_parser()
|
92
109
|
cmdline_args = sys.argv[1:]
|
110
|
+
|
111
|
+
if dry_run:
|
112
|
+
cmdline_args = ['-i', 'dummy.info', '-t', 'dummy.tre',
|
113
|
+
'-a', 'dummy.fa']
|
93
114
|
|
94
115
|
# build config
|
95
116
|
buildConfigs(parser, cmdline_args)
|
bscampp/utils.py
CHANGED
@@ -8,6 +8,7 @@ from os.path import expanduser,isfile
|
|
8
8
|
import random
|
9
9
|
import statistics
|
10
10
|
import copy
|
11
|
+
import gzip
|
11
12
|
|
12
13
|
import argparse
|
13
14
|
# reformat argparse help text formatting
|
@@ -813,25 +814,25 @@ def newick_edge_tokens_node(node):
|
|
813
814
|
node_to_str[node] = ''.join(out)
|
814
815
|
return node_to_str[node]
|
815
816
|
|
816
|
-
def write_tree_newick_edge_tokens(tree, filename, hide_rooted_prefix=False):
|
817
|
-
'''
|
818
|
-
Modified from treeswift tree.write_tree_newick()
|
819
|
-
Write this ``Tree`` to a Newick file
|
820
|
-
Args:
|
821
|
-
``filename`` (``str``): Path to desired output file (plain-text or gzipped)
|
822
|
-
'''
|
823
|
-
if not isinstance(filename, str):
|
824
|
-
raise TypeError("filename must be a str")
|
825
|
-
treestr = newick_edge_nbr_string(tree)
|
826
|
-
if hide_rooted_prefix:
|
827
|
-
if treestr.startswith('[&R]'):
|
828
|
-
treestr = treestr[4:].strip()
|
829
|
-
else:
|
830
|
-
warn("Specified hide_rooted_prefix, but tree was not rooted")
|
831
|
-
if filename.lower().endswith('.gz'): # gzipped file
|
832
|
-
f = gopen(expanduser(filename),'wb',9); f.write(treestr.encode()); f.close()
|
833
|
-
else: # plain-text file
|
834
|
-
f = open(expanduser(filename),'w'); f.write(treestr); f.close()
|
817
|
+
#def write_tree_newick_edge_tokens(tree, filename, hide_rooted_prefix=False):
|
818
|
+
# '''
|
819
|
+
# Modified from treeswift tree.write_tree_newick()
|
820
|
+
# Write this ``Tree`` to a Newick file
|
821
|
+
# Args:
|
822
|
+
# ``filename`` (``str``): Path to desired output file (plain-text or gzipped)
|
823
|
+
# '''
|
824
|
+
# if not isinstance(filename, str):
|
825
|
+
# raise TypeError("filename must be a str")
|
826
|
+
# treestr = newick_edge_nbr_string(tree)
|
827
|
+
# if hide_rooted_prefix:
|
828
|
+
# if treestr.startswith('[&R]'):
|
829
|
+
# treestr = treestr[4:].strip()
|
830
|
+
# else:
|
831
|
+
# warn("Specified hide_rooted_prefix, but tree was not rooted")
|
832
|
+
# if filename.lower().endswith('.gz'): # gzipped file
|
833
|
+
# f = gopen(expanduser(filename),'wb',9); f.write(treestr.encode()); f.close()
|
834
|
+
# else: # plain-text file
|
835
|
+
# f = open(expanduser(filename),'w'); f.write(treestr); f.close()
|
835
836
|
|
836
837
|
def read_tree_newick_edge_tokens(newick):
|
837
838
|
'''
|
@@ -850,7 +851,7 @@ def read_tree_newick_edge_tokens(newick):
|
|
850
851
|
except:
|
851
852
|
raise TypeError("newick must be a str")
|
852
853
|
if newick.lower().endswith('.gz'): # gzipped file
|
853
|
-
f =
|
854
|
+
f = gzip.open(expanduser(newick)); ts = f.read().decode().strip(); f.close()
|
854
855
|
elif isfile(expanduser(newick)): # plain-text file
|
855
856
|
f = open(expanduser(newick)); ts = f.read().strip(); f.close()
|
856
857
|
else:
|
@@ -867,7 +868,7 @@ def read_tree_newick_edge_tokens(newick):
|
|
867
868
|
# end of Newick string
|
868
869
|
if ts[i] == ';':
|
869
870
|
if i != len(ts)-1 or n != t.root:
|
870
|
-
raise RuntimeError(
|
871
|
+
raise RuntimeError("INVALID NEWICK")
|
871
872
|
|
872
873
|
# go to new child
|
873
874
|
elif ts[i] == '(':
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: bscampp
|
3
|
-
Version: 1.0.1a0
|
3
|
+
Version: 1.0.1b0
|
4
4
|
Summary: BSCAMPP - A Scalable Phylogenetic Placement Tool
|
5
5
|
Author-email: Eleanor Wedell <ewedell2@illinois.edu>, Chengze Shen <chengze5@illinois.edu>
|
6
6
|
License: MIT License
|
@@ -51,6 +51,11 @@ Requires-Dist: treeswift>=1.1.45
|
|
51
51
|
Requires-Dist: taxtastic>=0.9.3
|
52
52
|
|
53
53
|
# BSCAMPP - A Scalable Phylogenetic Placement Method and Framework
|
54
|
+
[PyPI project page](https://pypi.org/project/bscampp/)
|
55
|
+
[PyPI release history](https://pypi.org/project/bscampp/#history)
|
56
|
+
[GitHub repository](https://github.com/ewedell/BSCAMPP/)
|
57
|
+
[License](https://github.com/ewedell/BSCAMPP/blob/main/LICENSE)
|
58
|
+
[Changelog](https://github.com/ewedell/BSCAMPP/blob/main/CHANGELOG.md)
|
54
59
|
|
55
60
|
**Table of Contents**
|
56
61
|
1. [Overview](#overview)
|
@@ -1,11 +1,11 @@
|
|
1
|
-
bscampp/__init__.py,sha256=
|
1
|
+
bscampp/__init__.py,sha256=Wnn_Bm543hAgQCd9PmwdT_kFBZzGLDW4dcSeP0iLVTk,2290
|
2
2
|
bscampp/configs.py,sha256=XuzRbtcUE5bExe-vEZGZ1CeXBmp4oP7LWFveQySx2xs,5745
|
3
3
|
bscampp/default.config,sha256=CEfsUHBy--vwJhEcUuJ0btfuGQWb_lKMVWUIP9f5YGw,112
|
4
|
-
bscampp/functions.py,sha256=
|
4
|
+
bscampp/functions.py,sha256=cPT5eSy_8CSNzDx-5ma43Hp9_AMmaWSTXM89bjdrkRs,15640
|
5
5
|
bscampp/init_configs.py,sha256=EA9sMN5jWj6zj2b-7tN19LhX2Ef61ByQLxQRLHAqLDM,3600
|
6
6
|
bscampp/jobs.py,sha256=de0Dr3ynORwACJqVbeWDfqTwJhWvMYG-7yfRYirGx8M,6703
|
7
|
-
bscampp/pipeline.py,sha256=
|
8
|
-
bscampp/utils.py,sha256=
|
7
|
+
bscampp/pipeline.py,sha256=UT8y6ObFZ12q5Vw3731r50k8pLMioFNV4qCy0tz_wuk,9550
|
8
|
+
bscampp/utils.py,sha256=ragaI14Lqb2fVp_uYDkFQnV7a50G9-sUOWdVM-sNhUE,29005
|
9
9
|
bscampp/tools/epa-ng,sha256=f3EVoZAAOXLN6l521qp-TrWDl5J2nqL3tGgjPaQE9WQ,3772096
|
10
10
|
bscampp/tools/pplacer,sha256=p0H4eo9uuiYoWS_kJbPfauOV99i7BXJdZSiwXIuLxTw,7834576
|
11
11
|
bscampp/tools/hamming_distance/CMakeLists.txt,sha256=yf9iq7Y61t3WObJHoR4RoGDEvUw_Q8JW3UnI4uh0cfU,389
|
@@ -17,9 +17,9 @@ bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp,sha256=xCmyAT-OZJOD
|
|
17
17
|
bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp,sha256=eKxgODRlpf0hU84QjNhigvRhWCT9tiJZjA5oQFQ1bUk,7404
|
18
18
|
bscampp/tools/hamming_distance/src/homology.cpp,sha256=ZE0uXZWQ-cN4U1Wk5kUr_KKHgzsgA6Sno-IViRa4tmI,6053
|
19
19
|
bscampp/tools/hamming_distance/src/new_hamming.cpp,sha256=fBRm99RquBZgZjaLOn9xDI3cH9NchhrxKbL-11j8fmk,5342
|
20
|
-
bscampp-1.0.
|
21
|
-
bscampp-1.0.
|
22
|
-
bscampp-1.0.
|
23
|
-
bscampp-1.0.
|
24
|
-
bscampp-1.0.
|
25
|
-
bscampp-1.0.
|
20
|
+
bscampp-1.0.1b0.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
|
21
|
+
bscampp-1.0.1b0.dist-info/METADATA,sha256=hCpwS1vbd07cuwW7D5AkiO_I_GP-kqk21IH2yxiPUwM,11144
|
22
|
+
bscampp-1.0.1b0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
23
|
+
bscampp-1.0.1b0.dist-info/entry_points.txt,sha256=dZygBmg2OncVyeLeIjh_9e-GBIOesFvMemyW9BRRcXY,113
|
24
|
+
bscampp-1.0.1b0.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
|
25
|
+
bscampp-1.0.1b0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|