partis-bcr 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bin/FastTree +0 -0
- bin/add-chimeras.py +59 -0
- bin/add-seqs-to-outputs.py +81 -0
- bin/bcr-phylo-run.py +799 -0
- bin/build.sh +24 -0
- bin/cf-alleles.py +97 -0
- bin/cf-germlines.py +57 -0
- bin/cf-linearham.py +199 -0
- bin/chimera-plot.py +76 -0
- bin/choose-partially-paired.py +143 -0
- bin/circle-plots.py +30 -0
- bin/compare-plotdirs.py +298 -0
- bin/diff-parameters.py +133 -0
- bin/docker-hub-push.sh +6 -0
- bin/extract-pairing-info.py +55 -0
- bin/gcdyn-simu-run.py +223 -0
- bin/gctree-run.py +244 -0
- bin/get-naive-probabilities.py +126 -0
- bin/iqtree-1.6.12 +0 -0
- bin/lonr.r +1020 -0
- bin/makeHtml +52 -0
- bin/mds-run.py +46 -0
- bin/parse-output.py +277 -0
- bin/partis +1869 -0
- bin/partis-pip +116 -0
- bin/partis.py +1869 -0
- bin/plot-gl-set-trees.py +519 -0
- bin/plot-hmms.py +151 -0
- bin/plot-lb-tree.py +427 -0
- bin/raxml-ng +0 -0
- bin/read-bcr-phylo-trees.py +38 -0
- bin/read-gctree-output.py +166 -0
- bin/run-chimeras.sh +64 -0
- bin/run-dtr-scan.sh +25 -0
- bin/run-paired-loci.sh +100 -0
- bin/run-tree-metrics.sh +88 -0
- bin/smetric-run.py +62 -0
- bin/split-loci.py +317 -0
- bin/swarm-2.1.13-linux-x86_64 +0 -0
- bin/test-germline-inference.py +425 -0
- bin/tree-perf-run.py +194 -0
- bin/vsearch-2.4.3-linux-x86_64 +0 -0
- bin/vsearch-2.4.3-macos-x86_64 +0 -0
- bin/xvfb-run +194 -0
- partis_bcr-1.0.1.data/scripts/cf-alleles.py +97 -0
- partis_bcr-1.0.1.data/scripts/cf-germlines.py +57 -0
- partis_bcr-1.0.1.data/scripts/extract-pairing-info.py +55 -0
- partis_bcr-1.0.1.data/scripts/gctree-run.py +244 -0
- partis_bcr-1.0.1.data/scripts/parse-output.py +277 -0
- partis_bcr-1.0.1.data/scripts/split-loci.py +317 -0
- partis_bcr-1.0.1.data/scripts/test.py +1005 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/METADATA +1 -1
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/RECORD +101 -50
- partis_bcr-1.0.1.dist-info/top_level.txt +1 -0
- {partis → python}/glutils.py +1 -1
- python/main.py +30 -0
- {partis → python}/plotting.py +10 -1
- {partis → python}/treeutils.py +18 -16
- {partis → python}/utils.py +14 -7
- partis/main.py +0 -59
- partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/WHEEL +0 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/entry_points.txt +0 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/licenses/COPYING +0 -0
- {partis → python}/__init__.py +0 -0
- {partis → python}/alleleclusterer.py +0 -0
- {partis → python}/allelefinder.py +0 -0
- {partis → python}/alleleremover.py +0 -0
- {partis → python}/annotationclustering.py +0 -0
- {partis → python}/baseutils.py +0 -0
- {partis → python}/cache/__init__.py +0 -0
- {partis → python}/cache/cached_uncertainties.py +0 -0
- {partis → python}/clusterpath.py +0 -0
- {partis → python}/coar.py +0 -0
- {partis → python}/corrcounter.py +0 -0
- {partis → python}/datautils.py +0 -0
- {partis → python}/event.py +0 -0
- {partis → python}/fraction_uncertainty.py +0 -0
- {partis → python}/gex.py +0 -0
- {partis → python}/glomerator.py +0 -0
- {partis → python}/hist.py +0 -0
- {partis → python}/hmmwriter.py +0 -0
- {partis → python}/hutils.py +0 -0
- {partis → python}/indelutils.py +0 -0
- {partis → python}/lbplotting.py +0 -0
- {partis → python}/mds.py +0 -0
- {partis → python}/mutefreqer.py +0 -0
- {partis → python}/paircluster.py +0 -0
- {partis → python}/parametercounter.py +0 -0
- {partis → python}/paramutils.py +0 -0
- {partis → python}/partitiondriver.py +0 -0
- {partis → python}/partitionplotter.py +0 -0
- {partis → python}/performanceplotter.py +0 -0
- {partis → python}/plotconfig.py +0 -0
- {partis → python}/processargs.py +0 -0
- {partis → python}/prutils.py +0 -0
- {partis → python}/recombinator.py +0 -0
- {partis → python}/scanplot.py +0 -0
- {partis → python}/seqfileopener.py +0 -0
- {partis → python}/treegenerator.py +0 -0
- {partis → python}/viterbicluster.py +0 -0
- {partis → python}/vrc01.py +0 -0
- {partis → python}/waterer.py +0 -0
bin/gcdyn-simu-run.py
ADDED
@@ -0,0 +1,223 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
from __future__ import absolute_import, division, unicode_literals
|
3
|
+
from __future__ import print_function
|
4
|
+
import glob
|
5
|
+
import sys
|
6
|
+
import csv
|
7
|
+
from io import open
|
8
|
+
csv.field_size_limit(sys.maxsize) # make sure we can write very large csv fields
|
9
|
+
import os
|
10
|
+
import copy
|
11
|
+
import argparse
|
12
|
+
import colored_traceback.always
|
13
|
+
import json
|
14
|
+
|
15
|
+
# if you move this script, you'll need to change this method of getting the imports
|
16
|
+
from pathlib import Path
|
17
|
+
partis_dir = str(Path(__file__).parent.parent)
|
18
|
+
sys.path.insert(1, partis_dir) # + '/python')
|
19
|
+
|
20
|
+
import python.utils as utils
|
21
|
+
import python.paircluster as paircluster
|
22
|
+
import python.glutils as glutils
|
23
|
+
from python.clusterpath import ClusterPath
|
24
|
+
import python.treeutils as treeutils
|
25
|
+
import python.indelutils as indelutils
|
26
|
+
from python.event import RecombinationEvent
|
27
|
+
|
28
|
+
# ----------------------------------------------------------------------------------------
|
29
|
+
def get_replay_naive_antn(glfos, ltmp, add_empty_mature_keys=False, debug=False):
    """Build a partis-style naive (unmutated) annotation for locus <ltmp> from the replay germline set.

    glfos: dict of germline info keyed by locus; each locus's glfo must have exactly one gene per region
        (enforced by utils.get_single_entry() below).
    add_empty_mature_keys: if True, add empty-list placeholders ('unique_ids', 'seqs', etc.) that the
        caller will fill with mature sequences; otherwise fill them with the single naive sequence.
    Returns the annotation dict.
    """
    antn, naive_seq = {}, []
    for region in utils.getregions(ltmp):
        # replay glfo has exactly one gene per region, so grab that gene and its sequence
        gene, seq = utils.get_single_entry(list(glfos[ltmp]['seqs'][region].items()))
        antn['%s_gene'%region] = gene
        if ltmp == 'igh' and region == 'j' and args.rm_last_ighj_base:  # some versions of the replay ighj gene have an extra trailing base (see --rm-last-ighj-base help)
            if debug:
                print('    removing last ighj base: %s --> %s' % (seq, seq[:-1]))
            seq = seq[:-1]
        naive_seq.append(seq)
    for dstr in utils.all_erosions:  # replay naive sequence has no erosions
        antn['%s_del'%dstr] = 0
    if not utils.has_d_gene(ltmp):  # light chain: use the dummy d gene
        antn['d_gene'] = glutils.dummy_d_genes[ltmp]
        antn['d_5p_del'] = 1
    for bstr in utils.all_boundaries:  # and no insertions
        antn['%s_insertion'%bstr] = ''
    antn['naive_seq'] = ''.join(naive_seq)
    antn['invalid'] = False
    if add_empty_mature_keys:  # keys that need to be replaced in mature annotation
        antn['unique_ids'] = []
        antn['seqs'] = []
        antn['input_seqs'] = antn['seqs']  # NOTE deliberately the *same* list object, so appending to 'seqs' also updates 'input_seqs' (process_tree() relies on this)
        antn['indelfos'] = []
        antn['paired-uids'] = []
    else:
        antn['unique_ids'] = ['replay_naive']
        antn['seqs'] = [''.join(naive_seq)]
        antn['input_seqs'] = antn['seqs']  # same aliasing as above
        antn['indelfos'] = [indelutils.get_empty_indel()]
        antn['paired-uids'] = [[]]
        if debug:  # NOTE(review): debug print placed in the non-empty branch, since the empty-keys annotation can't be printed yet — confirm against upstream source
            utils.add_implicit_info(glfos[ltmp], antn)
            utils.print_reco_event(antn)
    return antn
|
64
|
+
|
65
|
+
# ----------------------------------------------------------------------------------------
|
66
|
+
def get_uid(name, ltmp):
    """Return the per-locus uid formed by joining sequence <name> and locus <ltmp> with a dash."""
    return '{}-{}'.format(name, ltmp)
|
68
|
+
|
69
|
+
# ----------------------------------------------------------------------------------------
|
70
|
+
def process_tree(glfos, treelist, tree_sfos, leaf_meta, itree, lp_infos, lpair):
    """Convert gcdyn sequences + tree for tree index <itree> into paired heavy/light partis annotations, appending them to <lp_infos>.

    Each gcdyn seq is a heavy+light smooshed-together sequence; it gets split at the heavy naive
    seq length and added to a per-locus annotation (one annotation per locus per tree).
    """
    antns = {}
    for ltmp in lpair:
        antns[ltmp] = get_replay_naive_antn(glfos, ltmp, add_empty_mature_keys=True, debug=args.debug)
        for tkey in args.meta_columns:
            antns[ltmp][utils.input_metafile_keys[tkey]] = []  # per-seq meta values (e.g. affinity), filled below

    hloc = utils.heavy_locus(args.ig_or_tr)
    lloc = utils.get_single_entry([l for l in glfos if l!=hloc])  # assumes exactly one light locus in glfos
    h_seq_len = len(antns[hloc]['naive_seq'])
    for sfo in tree_sfos[itree]:
        # the smooshed gcdyn seq must be exactly as long as the joint (heavy+light) naive seq
        joint_naive_seq = ''.join(antns[l]['naive_seq'] for l in lpair)
        if len(sfo['seq']) == len(joint_naive_seq) - 1:
            raise Exception('seq read from gcdyn file has len %d, one less than replay naive seq %d, you probably need to set --rm-last-ighj-base' % (len(sfo['seq']), len(joint_naive_seq)))
        assert len(sfo['seq']) == len(joint_naive_seq)
        if 'naive' in sfo['name']:  # naive seq is already in the annotation, just check that it matches
            assert sfo['seq'] == joint_naive_seq
            continue
        dumb_offset = 0
        if args.rm_last_ighj_base:  # NOTE(review): with this set, the heavy slice includes one extra base past h_seq_len while the light slice still starts at h_seq_len — presumably matching the gcdyn seq layout; confirm
            dumb_offset = 1
        antns[hloc]['seqs'].append(sfo['seq'][:h_seq_len + dumb_offset])  # appending to 'seqs' also updates 'input_seqs' (same list object, see get_replay_naive_antn())
        antns[lloc]['seqs'].append(sfo['seq'][h_seq_len:])

        for ltmp in lpair:
            assert antns[ltmp]['input_seqs'][-1] == antns[ltmp]['seqs'][-1]  # sanity check on the aliasing just mentioned
            antns[ltmp]['unique_ids'].append(get_uid(sfo['name'], ltmp))
            antns[ltmp]['indelfos'].append(indelutils.get_empty_indel())
            other_locus = utils.get_single_entry([l for l in lpair if l!=ltmp])
            antns[ltmp]['paired-uids'].append([get_uid(sfo['name'], other_locus)])
            for tkey in args.meta_columns:
                antns[ltmp][utils.input_metafile_keys[tkey]].append(leaf_meta[sfo['name']][tkey])

    for ltmp in lpair:
        # make a per-locus copy of the tree with labels translated to this locus's uids
        dtree = treeutils.get_dendro_tree(treestr=treelist[itree])
        treeutils.translate_labels(dtree, [(s['name'], get_uid(s['name'], ltmp)) for s in tree_sfos[itree] if 'naive' not in s['name']], expect_missing=True)
        antns[ltmp]['tree'] = dtree.as_string(schema='newick').strip()
        tmp_event = RecombinationEvent(glfos[ltmp])  # I don't want to move the function out of event.py right now
        tmp_event.set_reco_id(antns[ltmp], irandom=itree)  # not sure that setting <irandom> here actually does anything
        utils.add_implicit_info(glfos[ltmp], antns[ltmp])  # easiest way to add codon_positions, which we want to write to file

    for ltmp in lpair:
        lp_infos[lpair]['antn_lists'][ltmp].append(antns[ltmp])

    if args.debug:
        for ltmp in sorted(glfos):
            utils.print_reco_event(antns[ltmp], extra_str='    ')
|
117
|
+
|
118
|
+
# ----------------------------------------------------------------------------------------
|
119
|
+
def mfname():
    """Return path of the gcdyn meta csv, preferring the new file name over the old one (kept for backwards compatibility)."""
    new_name = '%s/meta.csv' % gcd_dir
    old_name = '%s/leaf-meta.csv' % gcd_dir
    return new_name if os.path.exists(new_name) else old_name
|
124
|
+
|
125
|
+
# ----------------------------------------------------------------------------------------
|
126
|
+
def run_gcdyn():
    """Run the gcdyn simulation via a bash script (inside its mamba env), skipping if output already exists."""
    if os.path.exists('%s/encoded-trees.npy'%gcd_dir):
        print('    gcdyn output exists in %s' % gcd_dir)
        return
    gcmd = 'gcd-simulate --sample-internal-nodes --label-leaf-internal-nodes --outdir %s --xshift-values 2.5 --xscale-values 5 --yscale-values 1000000' % gcd_dir
    # forward whichever of these command line args were actually set
    optional_opts = [
        ('--n-sub-procs %d', args.n_sub_procs),
        ('--seed %d', args.seed),
        ('--n-trials %d', args.n_sim_events),
        ('--time-to-sampling-values %d', args.obs_times),
    ]
    for fmtstr, val in optional_opts:
        if val is not None:
            gcmd += ' ' + fmtstr % val
    script_lines = ['#!/bin/bash'] + utils.mamba_cmds('gcdyn') + [gcmd]
    utils.simplerun('\n'.join(script_lines) + '\n', cmdfname='%s/run.sh' % gcd_dir)
|
144
|
+
|
145
|
+
# ----------------------------------------------------------------------------------------
|
146
|
+
def process_output():
    """Read gcdyn output (seqs.fasta, trees.nwk, leaf meta csv) from gcd_dir and write partis-format paired annotations to args.outdir."""
    # read germline info for whichever loci have a subdir in the replay germline dir
    glfos = {}
    for ltmp in utils.loci:
        if os.path.exists('%s/%s' % (args.replay_germline_dir, ltmp)):
            glfos[ltmp] = glutils.read_glfo(args.replay_germline_dir, ltmp)

    seqfos = utils.read_fastx('%s/seqs.fasta'%gcd_dir)
    treelist = treeutils.get_treestrs_from_file('%s/trees.nwk'%gcd_dir)
    lmetalines = utils.csvlines(mfname())
    leaf_meta = {l['name'] : {'affinity' : float(l['affinity'])} for l in lmetalines}  # per-seq meta info, keyed by gcdyn seq name
    print('    read %d trees, %d seqs (plus leaf metafo) from %s' % (len(treelist), len(seqfos), gcd_dir))
    tree_sfos, all_uids = {}, set()  # collect up the seqfos for each tree
    for sfo in seqfos:
        # seq names encode the tree index: '<itree>-<name>' (old style / naive) or '<itree>-<leaf|mrca>-<name>' (new style)
        if sfo['name'].count('-') == 1:  # old-style, and naive seq
            itree, sname = sfo['name'].split('-')
        elif sfo['name'].count('-') == 2:  # new-style
            itree, listr, sname = sfo['name'].split('-')  # listr is either 'leaf' or 'mrca'
        else:
            assert False
        itree = int(itree)
        if itree not in tree_sfos:
            tree_sfos[itree] = []
        tree_sfos[itree].append(sfo)
        if sfo['name'] in all_uids:
            raise Exception('found uid %s twice' % sfo['name'])
        all_uids.add(sfo['name'])
    print('    %d tree seqfos with lengths: %s' % (len(tree_sfos), ' '.join(str(len(slist)) for slist in sorted(tree_sfos.values(), key=len, reverse=True))))
    if sorted(tree_sfos) != list(range(len(treelist))):  # every tree index must have at least one seq, and vice versa
        raise Exception('tree indices from sequence names didn\'t match number of trees')

    lpairs = [tuple(sorted(glfos))]  # could use utils.locus_pairs() if i update to more than one lpair
    lp_infos = {lp : {'antn_lists' : {l : [] for l in glfos}, 'glfos' : {l : g for l, g in glfos.items()}, 'cpaths' : {}} for lp in lpairs}
    for itree in range(len(treelist)):
        process_tree(glfos, treelist, tree_sfos, leaf_meta, itree, lp_infos, utils.get_single_entry(lpairs))

    print('    writing annotations to %s' % args.outdir)
    headers = utils.simulation_headers + args.meta_columns
    def ofn_fn(locus, lpair=None, joint=None):  # output file name fcn passed to the paircluster writers (<joint> is accepted but unused here)
        return paircluster.paired_fn(args.outdir, locus, lpair=lpair, suffix='.yaml')
    paircluster.write_lpair_output_files(lpairs, lp_infos, ofn_fn, headers=headers)
    glfos, antn_lists, _ = paircluster.concat_heavy_chain(lpairs, lp_infos)  # per-locus glfos with concat'd heavy chain
    paircluster.write_concatd_output_files(glfos, antn_lists, ofn_fn, headers)
    outfos, metafos = paircluster.get_combined_outmetafos(antn_lists, extra_meta_headers=[utils.input_metafile_keys[k] for k in args.meta_columns])
    paircluster.write_combined_fasta_and_meta(args.outdir+'/all-seqs.fa', args.outdir+'/meta.yaml', outfos, metafos)
|
190
|
+
|
191
|
+
# ----------------------------------------------------------------------------------------
|
192
|
+
helpstr = """
run gcdyn simulation, then process results into partis-format paired output dir, for example:
    ./bin/gcdyn-simu-run.py --gcd-dir <gcd> --n-sub-procs 10 --seed 0 --n-trials 10 --rm-last-ighj-base
"""
class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    # combine raw-text help (preserves the example formatting above) with showing default values
    pass
formatter_class = MultiplyInheritedFormatter
parser = argparse.ArgumentParser(formatter_class=MultiplyInheritedFormatter, description=helpstr)
parser.add_argument('--outdir', required=True)
parser.add_argument('--actions', default='run:process')
parser.add_argument('--input-simu-dir', help='if set, only run \'process\' action, reading gcdyn simu from this dir and writing partis files to --outdir')
parser.add_argument('--replay-germline-dir', default='datascripts/meta/taraki-gctree-2021-10/germlines', help='dir with gcreplay germline sequences')
parser.add_argument('--rm-last-ighj-base', action='store_true', help='sometimes the ighj gene has an extra G at the end, sometimes not, this says to remove it from the seqs read from --replay-germline-dir')
parser.add_argument('--n-sub-procs', type=int)
parser.add_argument('--seed', type=int)
parser.add_argument('--n-sim-events', type=int)
parser.add_argument('--obs-times', type=int)
parser.add_argument('--meta-columns', default='affinity')
parser.add_argument('--ig-or-tr', default='ig')
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()
args.meta_columns = utils.get_arg_list(args.meta_columns, choices=utils.input_metafile_keys.keys())  # colon-separated str -> list
if args.input_simu_dir is None:
    gcd_dir = '%s/gcdyn' % args.outdir  # gcdyn output goes in a subdir of --outdir
    args.actions = utils.get_arg_list(args.actions)
else:  # reading pre-existing simulation: only the 'process' action makes sense
    gcd_dir = args.input_simu_dir
    args.actions = ['process']
if 'run' in args.actions:
    run_gcdyn()
if 'process' in args.actions:
    process_output()
|
bin/gctree-run.py
ADDED
@@ -0,0 +1,244 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
from __future__ import absolute_import, division, unicode_literals
|
3
|
+
from __future__ import print_function
|
4
|
+
import numpy
|
5
|
+
import csv
|
6
|
+
import yaml
|
7
|
+
import time
|
8
|
+
import colored_traceback.always
|
9
|
+
import argparse
|
10
|
+
import subprocess
|
11
|
+
import sys
|
12
|
+
import os
|
13
|
+
import dendropy
|
14
|
+
import json
|
15
|
+
from io import open
|
16
|
+
import random
|
17
|
+
from pathlib import Path
|
18
|
+
|
19
|
+
partis_dir = str(Path(__file__).parent.parent)
|
20
|
+
sys.path.insert(1, partis_dir) #'./python')
|
21
|
+
import python.utils as utils
|
22
|
+
import python.glutils as glutils
|
23
|
+
import python.treeutils as treeutils
|
24
|
+
|
25
|
+
# ----------------------------------------------------------------------------------------
|
26
|
+
def get_inf_int_name(gname):
    """Prefix <gname> with args.inf_int_label, since gctree's bare integer names aren't unique and will break things."""
    return '{}-{}'.format(args.inf_int_label, gname)
|
28
|
+
|
29
|
+
# ----------------------------------------------------------------------------------------
|
30
|
+
def gctofn(ft):
    """Return path of the raw gctree/dnapars output file of type <ft> ('tree', 'seqs', or 'dnapars')."""
    basenames = {
        'tree' : 'gctree.out.inference.1.nk',
        'seqs' : 'gctree.out.inference.1.fasta',
        'dnapars' : 'outfile',
    }
    return '%s/%s' % (args.outdir, basenames[ft])
|
37
|
+
|
38
|
+
# ----------------------------------------------------------------------------------------
|
39
|
+
def fofn(ft):
    """Return final (post-processed) output path: tree.nwk for 'tree', inferred-seqs.fa for 'seqs'."""
    assert ft in ['tree', 'seqs']
    if ft == 'tree':
        return '%s/tree.nwk' % args.outdir
    return '%s/inferred-seqs.fa' % args.outdir
|
42
|
+
|
43
|
+
# ----------------------------------------------------------------------------------------
|
44
|
+
def idfn():
    """Name of gctree's id-mapping file (gctree name -> original ':'-joined names)."""
    return 'idmap.txt'
|
46
|
+
|
47
|
+
# ----------------------------------------------------------------------------------------
|
48
|
+
def install():
    """Create the micromamba env for gctree (python 3.9 + phylip + gctree + click); see also update()."""
    script = ['#!/bin/bash']
    script += utils.mamba_cmds(args.env_label, only_prep=True)
    script += ['micromamba create -y -n %s -c conda-forge python=3.9' % args.env_label]  # 3.10 currently has problems with ete
    script += ['micromamba activate %s' % args.env_label]
    script += ['micromamba install -y -c bioconda -c conda-forge phylip']
    script += ['micromamba install -y -c conda-forge%s click' % ('' if args.no_dag else ' gctree')]
    if args.no_dag:  # the old non-DAG gctree version comes from pip instead of conda
        script += ['pip install gctree==3.3.0']  # I think having --user makes it install in ~/.local (outside mamba env)
    # micromamba remove -n gctree --all  # to nuke it and start over
    utils.simplerun('\n'.join(script) + '\n', cmdfname='/tmp/tmprun.sh', debug=True)
|
59
|
+
|
60
|
+
# ----------------------------------------------------------------------------------------
|
61
|
+
def update():
    """Update the packages in the gctree micromamba env (see install())."""
    script = ['#!/bin/bash']
    script += utils.mamba_cmds(args.env_label)
    script += ['micromamba update phylip gctree click']
    utils.simplerun('\n'.join(script) + '\n', cmdfname='/tmp/tmprun.sh', debug=True)
|
66
|
+
|
67
|
+
# ----------------------------------------------------------------------------------------
|
68
|
+
def add_mfo(tcmd, mfn):
    """Append frame/chain-split options read from json meta file <mfn> to gctree command <tcmd>.

    tcmd: gctree command string to extend.
    mfn: path to a json file that may contain 'frame', 'h_frame', 'l_frame', and/or 'l_offset' keys.
    Returns the extended command string.
    """
    kdict = {'frame' : 'frame', 'h_frame' : 'frame', 'l_frame' : 'frame2', 'l_offset' : 'chain_split'}  # translates from metafo dict to gctree command line args
    with open(mfn) as mfile:  # BUG FIX: previously opened args.metafname, silently ignoring the <mfn> parameter (callers passed args.metafname, so behavior is unchanged)
        metafo = json.load(mfile)
    for tk, tc in kdict.items():
        if tk in metafo:
            tcmd += ' --%s %d' % (tc, metafo[tk])
    return tcmd
|
76
|
+
|
77
|
+
# ----------------------------------------------------------------------------------------
|
78
|
+
def run_gctree():
    """Run dnapars + gctree inference (inside the mamba env) on args.infname, writing results to args.outdir."""
    # ----------------------------------------------------------------------------------------
    def get_gctree_cmd():
        # build the actual 'gctree infer' command (run under xvfb so rendering works without a display)
        tcmd = '%s/bin/xvfb-run -a gctree infer outfile abundances.csv --root %s --verbose --idlabel' % (utils.get_partis_dir(), args.root_label)  # --idlabel writes the output fasta file
        if not args.base_model and not args.no_dag:  # default: pass the s5f mutation model files
            tcmd += ' --mutability %s/HS5F_Mutability.csv --substitution %s/HS5F_Substitution.csv' % (args.data_dir, args.data_dir)
        if args.ranking_coeffs is not None:
            tcmd += ' --ranking_coeffs %s' % (' '.join(c for c in args.ranking_coeffs))
        if args.branching_process_ranking_coeff is not None:
            tcmd += ' --branching_process_ranking_coeff %d' % args.branching_process_ranking_coeff
        if os.path.exists(args.metafname):  # optional frame/chain-split info
            tcmd = add_mfo(tcmd, args.metafname)
        return tcmd
    # ----------------------------------------------------------------------------------------
    def get_cmds():
        # assemble the bash script: env activation, then (unless --run-help) dnapars prep + gctree
        cmds = ['#!/bin/bash']
        cmds += utils.mamba_cmds(args.env_label)
        if args.run_help:
            cmds += ['gctree infer -h']
            return cmds
        if not os.path.exists(args.infname):
            raise Exception('--infname %s doesn\'t exist' % args.infname)
        cmds += ['cd %s' % args.outdir]
        if args.input_forest_dir is None:
            ofn = '%s/outfile' % args.outdir  # dnapars output file (this is what takes the longest to make)
            if os.path.exists(ofn) and os.stat(ofn).st_size > 0:
                print('    dnapars output already exists, not rerunning: %s' % ofn)
            else:
                if os.path.exists(ofn) and os.stat(ofn).st_size == 0:  # zero-length output means a previous run died partway
                    print('    removing zero length dnapars output %s' % ofn)
                utils.prep_dir(args.outdir, wildlings=['outfile', 'outtree'], allow_other_files=True)  # phylip barfs like a mfer if its outputs exist (probably you'll get a KeyError 'naive')
                cmds += ['deduplicate %s --root %s --abundance_file abundances.csv --idmapfile %s > deduplicated.phylip' % (args.infname, args.root_label, idfn())]
                cmds += ['mkconfig deduplicated.phylip dnapars > dnapars.cfg']
                cmds += ['dnapars < dnapars.cfg > dnapars.log']  # NOTE if things fail, look in dnaparse.log (but it's super verbose so we can't print it to std out by default)
        else:  # skip the prep steps and reuse an existing parsimony forest
            print('    --input-forest-dir: copying abundance, idmap, and forest files from %s' % args.input_forest_dir)
            cmds += ['cp %s/{abundances.csv,%s,outfile} %s/' % (args.input_forest_dir, idfn(), args.outdir)]
        if not args.only_write_forest:
            cmds.append(get_gctree_cmd())
        return cmds
    # ----------------------------------------------------------------------------------------
    if not args.run_help and utils.output_exists(args, gctofn('dnapars' if args.only_write_forest else 'tree')):
        return

    cmds = get_cmds()  # also preps dir + other stuff

    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=args.outdir + '/run.sh', print_time='gctree', debug=True, dryrun=args.dry_run)
    if args.run_help:
        sys.exit()
|
127
|
+
|
128
|
+
# ----------------------------------------------------------------------------------------
|
129
|
+
def parse_output():
    """Post-process gctree output into final tree (tree.nwk) and fasta (inferred-seqs.fa) in args.outdir.

    Restores original sequence names (gctree collapses duplicate observed seqs under new names),
    renames inferred intermediate nodes with args.inf_int_label, and optionally resolves
    multifurcations (--fix-multifurcations).
    """
    if utils.output_exists(args, fofn('seqs')):
        return

    # read translations (this only includes input sequences, not inferred intermediates)
    idm_trns = {}
    with open('%s/idmap.txt' % args.outdir) as idfile:
        reader = csv.DictReader(idfile, fieldnames=('name', 'orig_names'))
        for line in reader:
            if line['orig_names'] == '':
                continue
            idm_trns[line['name']] = line['orig_names'].split(':')

    # read fasta (mostly for inferred intermediate seqs)
    seqfos = utils.read_fastx(gctofn('seqs'), look_for_tuples=True)
    print('    read %d seqs from gctree output fasta' % len(seqfos))
    if any(s['name']=='' for s in seqfos):
        n_removed = len([s for s in seqfos if s['name']==''])
        seqfos = [s for s in seqfos if s['name']!='']
        print('    %s removed %d seqs with zero-length names \'\' (I\'m *not* sure this is the right thing to do, but it just kicked this error when I was doing the python 3 conversion)' % (utils.wrnstr(), n_removed))
    nfos = [s for s in seqfos if s['name']==args.root_label]
    if len(nfos) != 1:
        print('    %s expected 1 naive seq with label \'%s\' but found %d: %s (in %s)' % (utils.wrnstr(), args.root_label, len(nfos), ' '.join(n['name'] for n in nfos), gctofn('seqs')))
    seqfos = [s for s in seqfos if s['name'] != args.root_label]  # don't want naive seq in final fasta
    seq_len = numpy.mean([len(s['seq']) for s in seqfos])
    if not args.expand_all_nodes:  # also remove input seqs (well, gctree's new names for input seqs), unless we're expanding all nodes, in which case we need the gctree-named-nodes as fake new internal nodes
        seqfos = [s for s in seqfos if s['name'] not in idm_trns]
    if len(seqfos) == 0:
        print('    %s no inferred sequences (all seqs read from gctree output were input seqs' % utils.wrnstr())
    inf_int_trns = []  # (old, new) name pairs for inferred intermediates
    for sfo in seqfos:
        inf_int_trns.append((sfo['name'], get_inf_int_name(sfo['name'])))
        sfo['name'] = get_inf_int_name(sfo['name'])

    # read tree
    dtree = treeutils.get_dendro_tree(treefname=gctofn('tree'), debug=args.debug)
    dtree.scale_edges(1. / seq_len)  # convert branch lengths from N mutations to per-site rates
    dtree.seed_node.taxon.label = args.root_label
    ndict = {n.taxon.label : n for n in dtree.preorder_node_iter()}
    for gname, onames in idm_trns.items():
        node = ndict.get(gname)  # BUG FIX: was ndict[gname], which raised a bare KeyError so the informative exception below could never fire
        if node is None:
            raise Exception('couldn\'t find node with name \'%s\' in tree from gctree in %s' % (gname, gctofn('tree')))
        if args.debug and len(onames) > 1:
            print('    abundance > 1 for %s: %d (%s)' % (gname, len(onames), ' '.join(onames)))
        for onm in onames:
            if node.taxon.label == gname and not args.expand_all_nodes:  # first original name: rename the node itself
                node.taxon.label = onm
                if args.debug and len(onames) > 1:
                    print('      setting node to %s' % onm)
                continue
            treeutils.add_zero_length_child(node, dtree, child_name=onm)  # add duplicates as children with zero-length edges
            if args.debug and len(onames) > 1:
                print('      adding child node %s' % onm)
    treeutils.translate_labels(dtree, inf_int_trns, expect_missing=True, debug=args.debug)

    if args.fix_multifurcations:
        input_seqfos = utils.read_fastx(args.infname)
        dtree, new_seqfos = treeutils.get_binary_tree(dtree, nfos + input_seqfos + seqfos, debug=args.debug)
        seqfos += new_seqfos
    if args.debug:
        print('    final tree:')
        print(treeutils.get_ascii_tree(dendro_tree=dtree, extra_str='      ', width=350))
    with open(fofn('tree'), 'w') as ofile:
        ofile.write('%s\n' % treeutils.as_str(dtree))
    utils.write_fasta(fofn('seqs'), nfos + seqfos)
|
195
|
+
|
196
|
+
# ----------------------------------------------------------------------------------------
|
197
|
+
ustr = """
Run gctree tree inference on sequences from fasta input file <--infname>.
Output trees and sequences are written to <--outdir> as inferred-seqs.fa and tree.nwk (gctree output files are also there, but they don't have any postprocessing e.g. fixing names and/or multifurcations.
gctree-run.py --infname <fasta> --outdir <outdir>
"""
parser = argparse.ArgumentParser(usage=ustr)
parser.add_argument('--actions', default='run:parse')
parser.add_argument('--infname')
parser.add_argument('--metafname', help='if you need --frame (v region doesn\'t start at first position) or --chain_split and --frame2 (heavy/light chain smooshed together), pass the info in json format with this arg (see code above for format).')
parser.add_argument('--outdir')
parser.add_argument('--only-write-forest', action='store_true', help='only run preparatory steps for gctree, i.e. up through dnapars, to write parsimony forest')
parser.add_argument('--input-forest-dir', help='If set, skips preparatory steps (see --only-write-forest), and looks for \'abundance.csv\' and parsimony forest file (\'outfile\') in the specified dir')
parser.add_argument('--overwrite', action='store_true')
parser.add_argument('--base-model', action='store_true', help='By default, we pass gctree info for the s5f mutation model; if this is set, we don\'t, and it instead use the base model.')
parser.add_argument('--no-dag', action='store_true', help='If set, use old v1 non-DAG gctree version (v3.3.0). Note that this uses a different env (see --env-label)')
parser.add_argument('--ranking-coeffs', nargs='+', help='see gctree help')
parser.add_argument('--branching-process-ranking-coeff', type=int, help='see gctree help')
parser.add_argument('--env-label', default='gctree')
parser.add_argument('--root-label', default='naive')
parser.add_argument('--data-dir', default='%s/data/s5f'%utils.get_partis_dir())
parser.add_argument('--inf-int-label', default='inf', help='base name for inferred intermediate seqs (numerical name is appended with -')
parser.add_argument('--expand-all-nodes', action='store_true', help='Gctree collapses duplicate observed seqs into nodes with new names and abundance N > 1. By default, we expand these such that the node is named for one of the observed seqs, and add N-1 (zero-length) children. If this arg is set, however, we leave the node and add N (zero-length) children.')
parser.add_argument('--run-help', action='store_true', help='run gctree help')
parser.add_argument('--debug', action='store_true')
parser.add_argument('--dry-run', action='store_true')
parser.add_argument('--random-seed', type=int, default=0)
parser.add_argument('--fix-multifurcations', action='store_true', help='resolves multifurcations (by adding zero length intermediates) and move input seqs that have been extend unifurcations onto zero length branches')

args = parser.parse_args()
# seed both RNGs so repeated runs are reproducible
random.seed(args.random_seed)
numpy.random.seed(args.random_seed)
if args.only_write_forest and args.input_forest_dir:  # one writes the forest, the other reads a pre-existing one
    raise Exception('doesn\'t make sense to specify both')
args.actions = utils.get_arg_list(args.actions, choices=['install', 'update', 'run', 'parse'])
args.infname = utils.fpath(args.infname)
args.outdir = utils.fpath(args.outdir)
if args.no_dag:  # old non-DAG gctree lives in its own env and doesn't support these options
    assert not args.base_model and args.branching_process_ranking_coeff is None and args.ranking_coeffs is None
    args.env_label = 'gctree-no-dag'

if 'install' in args.actions:
    install()
if 'update' in args.actions:
    update()
if 'run' in args.actions:
    run_gctree()
if 'parse' in args.actions:
    parse_output()
|
@@ -0,0 +1,126 @@
|
|
1
|
+
#!/usr/bin/env python3
from __future__ import absolute_import, division, unicode_literals
from __future__ import print_function
import os
import sys
import csv
import argparse
import operator
import argparse  # NOTE(review): duplicate import -- argparse is already imported two lines above
import yaml
import colored_traceback.always
from io import open

# if you move this script, you'll need to change this method of getting the imports
from pathlib import Path
partis_dir = str(Path(__file__).parent.parent)
sys.path.insert(1, partis_dir) # + '/python')

import python.utils as utils
|
20
|
+
|
21
|
+
# ----------------------------------------------------------------------------------------
|
22
|
+
def is_acceptable(scol, acceptable_values, lval):
    """Return True if <lval> (value of column <scol> from a line of the input csv) should be kept.

    A value is kept if it appears verbatim in <acceptable_values>, or -- when --any-allele
    is set and <scol> is a gene column -- if it is another allele of any listed gene.
    Reads the module-level <args> namespace.
    """
    if lval in acceptable_values:  # exact match
        return True
    if not args.any_allele or '_gene' not in scol:  # allele matching only applies to gene columns
        return False
    return any(utils.are_alleles(gene, lval) for gene in acceptable_values)
|
28
|
+
|
29
|
+
class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter combining raw-text help strings with automatic '(default: ...)' display."""
    pass
|
31
|
+
formatter_class = MultiplyInheritedFormatter  # NOTE(review): this assignment appears unused -- the class is passed directly below
parser = argparse.ArgumentParser(formatter_class=MultiplyInheritedFormatter)
parser.add_argument('--infname', default='test/ref-results/test/parameters/data/hmm/all-probs.csv', help='input all-probs.csv file from a previously-inferred partis parameter directory, for instance: test/reference-results/test/parameters/data/hmm/all-probs.csv')
parser.add_argument('--config-fname', help='yaml file with info on columns for which we want to specify particular values (and skip others). See default/example set below. To create a yaml config file to start from, uncomment the yaml.dump() line below and rerun with no arguments.')
parser.add_argument('--outfname')  # optional yaml output file for summary counts (see bottom of script)
parser.add_argument('--any-allele', action='store_true', help='if set, also include any other alleles of any of the genes specified in \'skip_column_vals\' (note: can also set it in the cfg file).')
parser.add_argument('--debug', action='store_true', default=True) # it's kind of confusing without the debug printout
args = parser.parse_args()
|
39
|
+
|
40
|
+
# column whose individual values get separate counts in the output (others are only summed)
non_summed_column = None
if args.config_fname is None:
    # default/example restrictions (almost certainly not what you actually want -- use --config-fname)
    non_summed_column = 'v_gene'
    skip_column_vals = { # to input your own dict on the command line, just convert with str() and quote it
        # 'cdr3_length' : ['33', '36', '39', '42', '45', '48'], # <value> is list of acceptable values NOTE need to all be strings, otherwise you have to worry about converting the values in the csv file
        'v_gene' : ['IGHV1-2*02+G35A', 'IGHV1-2*02+T147C', 'IGHV1-2*02'],
        # 'd_gene' : ['IGHD3-22*01'],
        'j_gene' : ['IGHJ4*02'],
        'cdr3_length' : ['66',],
    }
    print('%s using default skip column/non-summed column values (which probably don\'t correspond to what you\'re actually interested in)' % utils.color('red', 'note'))
    # # uncomment to create a yaml file to start from:
    # with open('tmp.yaml', 'w') as tfile:
    #     yaml.dump({'non_summed_column' : non_summed_column, 'skip_column_vals' : skip_column_vals}, tfile)
else:
    # read restrictions from the yaml config file
    with open(args.config_fname) as yamlfile:
        yamlfo = yaml.load(yamlfile, Loader=yaml.Loader)  # NOTE(review): yaml.Loader can construct arbitrary python objects; prefer yaml.safe_load if configs may come from untrusted sources
    if 'non_summed_column' in yamlfo:
        non_summed_column = yamlfo['non_summed_column']
    skip_column_vals = yamlfo['skip_column_vals']
    for scol in skip_column_vals:
        skip_column_vals[scol] = [str(v) for v in skip_column_vals[scol]] # yaml.load() converts to integers, which is usually nice, but here we don't want it to since we're not converting when reading all-probs.csv (I think there's options to yaml.load to change this, I just don't want to figure it out now)
    if 'any_allele' in yamlfo:
        # the cfg-file value takes precedence over the command line
        if args.any_allele and not yamlfo['any_allele']: # if it's set to true on the command line, but false in the file
            print(' %s overwriting --any-allele with value from cfg file %s' % (utils.color('red', 'warning'), args.config_fname))
        args.any_allele = yamlfo['any_allele']
|
66
|
+
|
67
|
+
# read the all-probs csv, tallying 'count' totals for lines that pass the column-value restrictions
info = {}  # per-value counts for <non_summed_column>
lines_skipped, lines_used = 0, 0
counts_skipped, counts_used = 0, 0
print(' reading probs from %s' % args.infname)
with open(args.infname) as csvfile:
    reader = csv.DictReader(csvfile)
    # if args.debug:
    #     print ' all columns in file: %s' % ' '.join(reader.fieldnames)
    # every restricted column must exist in the csv header
    if len(set(skip_column_vals) - set(reader.fieldnames)) > 0:
        raise Exception('keys in --skip-column-fname not in file: %s' % ' '.join(set(skip_column_vals) - set(reader.fieldnames)))
    for line in reader:
        skip_this_line = False
        for scol, acceptable_values in skip_column_vals.items():
            if not is_acceptable(scol, acceptable_values, line[scol]):
                skip_this_line = True
                lines_skipped += 1
                counts_skipped += int(line['count'])
                break  # one failing column is enough to skip the line
        if skip_this_line:
            continue

        # line passed all restrictions: accumulate its count, broken down by the non-summed column
        if non_summed_column is not None:
            if line[non_summed_column] not in info:
                info[line[non_summed_column]] = 0
            info[line[non_summed_column]] += int(line['count'])

        lines_used += 1
        counts_used += int(line['count'])
|
95
|
+
|
96
|
+
# ----------------------------------------------------------------------------------------
|
97
|
+
import python.fraction_uncertainty as fraction_uncertainty
|
98
|
+
def frac_err(obs, total):
    """Return a symmetric uncertainty on the fraction obs/total: half the width of the interval from fraction_uncertainty.err()."""
    err_lo, err_hi = fraction_uncertainty.err(obs, total)
    return (err_hi - err_lo) / 2.
|
101
|
+
# fraction of all counts that survived the restrictions
count_fraction = counts_used / float(counts_used + counts_skipped)

if args.debug:
    # summarize which restrictions were applied and how many lines/counts passed
    print(' applied restrictions:%s' % (' (including all alleles of these genes)' if args.any_allele else ''))
    for scol, acceptable_values in skip_column_vals.items():
        print(' %15s in %s' % (scol, acceptable_values))
    print(' used:')
    print(' %6d / %-6d = %.3f lines' % (lines_used, lines_used + lines_skipped, lines_used / float(lines_used + lines_skipped)))
    print(' %6d / %-6d = %.3f +/- %.3f counts' % (counts_used, counts_used + counts_skipped, count_fraction, frac_err(counts_used, counts_used + counts_skipped)))

if non_summed_column is not None:
    # per-value breakdown for the non-summed column, sorted by decreasing count
    print(' %18s count / %d = fraction' % (non_summed_column, counts_used))
    for val, count in sorted(list(info.items()), key=operator.itemgetter(1), reverse=True): # sort by counts
    # for val, count in sorted(info.items()): # sort by column value (e.g. cdr3 length)
        print(' %18s %6d %.3f +/- %.3f' % (val, count, count / float(counts_used), frac_err(count, counts_used)))

if args.outfname is not None:
    # write summary totals (plus the per-value breakdown) as yaml
    if args.debug:
        print(' writing total counts (plus %d info entries) to %s' % (len(info), args.outfname))
    with open(args.outfname, 'w') as outfile:
        yamlfo = {'counts' : counts_used,
                  'total' : counts_used + counts_skipped,
                  'fraction' : count_fraction,
                  'frac_err' : frac_err(counts_used, counts_used + counts_skipped),
                  'info' : info}
        yaml.dump(yamlfo, outfile, width=150)
|
bin/iqtree-1.6.12
ADDED
Binary file
|