partis-bcr 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. bin/FastTree +0 -0
  2. bin/add-chimeras.py +59 -0
  3. bin/add-seqs-to-outputs.py +81 -0
  4. bin/bcr-phylo-run.py +799 -0
  5. bin/build.sh +24 -0
  6. bin/cf-alleles.py +97 -0
  7. bin/cf-germlines.py +57 -0
  8. bin/cf-linearham.py +199 -0
  9. bin/chimera-plot.py +76 -0
  10. bin/choose-partially-paired.py +143 -0
  11. bin/circle-plots.py +30 -0
  12. bin/compare-plotdirs.py +298 -0
  13. bin/diff-parameters.py +133 -0
  14. bin/docker-hub-push.sh +6 -0
  15. bin/extract-pairing-info.py +55 -0
  16. bin/gcdyn-simu-run.py +223 -0
  17. bin/gctree-run.py +244 -0
  18. bin/get-naive-probabilities.py +126 -0
  19. bin/iqtree-1.6.12 +0 -0
  20. bin/lonr.r +1020 -0
  21. bin/makeHtml +52 -0
  22. bin/mds-run.py +46 -0
  23. bin/parse-output.py +277 -0
  24. bin/partis +1869 -0
  25. bin/partis-pip +116 -0
  26. bin/partis.py +1869 -0
  27. bin/plot-gl-set-trees.py +519 -0
  28. bin/plot-hmms.py +151 -0
  29. bin/plot-lb-tree.py +427 -0
  30. bin/raxml-ng +0 -0
  31. bin/read-bcr-phylo-trees.py +38 -0
  32. bin/read-gctree-output.py +166 -0
  33. bin/run-chimeras.sh +64 -0
  34. bin/run-dtr-scan.sh +25 -0
  35. bin/run-paired-loci.sh +100 -0
  36. bin/run-tree-metrics.sh +88 -0
  37. bin/smetric-run.py +62 -0
  38. bin/split-loci.py +317 -0
  39. bin/swarm-2.1.13-linux-x86_64 +0 -0
  40. bin/test-germline-inference.py +425 -0
  41. bin/tree-perf-run.py +194 -0
  42. bin/vsearch-2.4.3-linux-x86_64 +0 -0
  43. bin/vsearch-2.4.3-macos-x86_64 +0 -0
  44. bin/xvfb-run +194 -0
  45. partis_bcr-1.0.2.data/scripts/cf-alleles.py +97 -0
  46. partis_bcr-1.0.2.data/scripts/cf-germlines.py +57 -0
  47. partis_bcr-1.0.2.data/scripts/extract-pairing-info.py +55 -0
  48. partis_bcr-1.0.2.data/scripts/gctree-run.py +244 -0
  49. partis_bcr-1.0.2.data/scripts/parse-output.py +277 -0
  50. partis_bcr-1.0.2.data/scripts/split-loci.py +317 -0
  51. partis_bcr-1.0.2.data/scripts/test.py +1005 -0
  52. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/METADATA +1 -1
  53. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/RECORD +101 -51
  54. partis_bcr-1.0.2.dist-info/top_level.txt +1 -0
  55. {partis → python}/glutils.py +1 -1
  56. python/main.py +30 -0
  57. {partis → python}/plotting.py +10 -1
  58. {partis → python}/treeutils.py +18 -16
  59. {partis → python}/utils.py +14 -7
  60. packages/ham/bcrham +0 -0
  61. partis/main.py +0 -59
  62. partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
  63. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/WHEEL +0 -0
  64. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/entry_points.txt +0 -0
  65. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/licenses/COPYING +0 -0
  66. {partis → python}/__init__.py +0 -0
  67. {partis → python}/alleleclusterer.py +0 -0
  68. {partis → python}/allelefinder.py +0 -0
  69. {partis → python}/alleleremover.py +0 -0
  70. {partis → python}/annotationclustering.py +0 -0
  71. {partis → python}/baseutils.py +0 -0
  72. {partis → python}/cache/__init__.py +0 -0
  73. {partis → python}/cache/cached_uncertainties.py +0 -0
  74. {partis → python}/clusterpath.py +0 -0
  75. {partis → python}/coar.py +0 -0
  76. {partis → python}/corrcounter.py +0 -0
  77. {partis → python}/datautils.py +0 -0
  78. {partis → python}/event.py +0 -0
  79. {partis → python}/fraction_uncertainty.py +0 -0
  80. {partis → python}/gex.py +0 -0
  81. {partis → python}/glomerator.py +0 -0
  82. {partis → python}/hist.py +0 -0
  83. {partis → python}/hmmwriter.py +0 -0
  84. {partis → python}/hutils.py +0 -0
  85. {partis → python}/indelutils.py +0 -0
  86. {partis → python}/lbplotting.py +0 -0
  87. {partis → python}/mds.py +0 -0
  88. {partis → python}/mutefreqer.py +0 -0
  89. {partis → python}/paircluster.py +0 -0
  90. {partis → python}/parametercounter.py +0 -0
  91. {partis → python}/paramutils.py +0 -0
  92. {partis → python}/partitiondriver.py +0 -0
  93. {partis → python}/partitionplotter.py +0 -0
  94. {partis → python}/performanceplotter.py +0 -0
  95. {partis → python}/plotconfig.py +0 -0
  96. {partis → python}/processargs.py +0 -0
  97. {partis → python}/prutils.py +0 -0
  98. {partis → python}/recombinator.py +0 -0
  99. {partis → python}/scanplot.py +0 -0
  100. {partis → python}/seqfileopener.py +0 -0
  101. {partis → python}/treegenerator.py +0 -0
  102. {partis → python}/viterbicluster.py +0 -0
  103. {partis → python}/vrc01.py +0 -0
  104. {partis → python}/waterer.py +0 -0
bin/gcdyn-simu-run.py ADDED
@@ -0,0 +1,223 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import glob
5
+ import sys
6
+ import csv
7
+ from io import open
8
+ csv.field_size_limit(sys.maxsize) # make sure we can write very large csv fields
9
+ import os
10
+ import copy
11
+ import argparse
12
+ import colored_traceback.always
13
+ import json
14
+
15
+ # if you move this script, you'll need to change this method of getting the imports
16
+ from pathlib import Path
17
+ partis_dir = str(Path(__file__).parent.parent)
18
+ sys.path.insert(1, partis_dir) # + '/python')
19
+
20
+ import python.utils as utils
21
+ import python.paircluster as paircluster
22
+ import python.glutils as glutils
23
+ from python.clusterpath import ClusterPath
24
+ import python.treeutils as treeutils
25
+ import python.indelutils as indelutils
26
+ from python.event import RecombinationEvent
27
+
28
# ----------------------------------------------------------------------------------------
def get_replay_naive_antn(glfos, ltmp, add_empty_mature_keys=False, debug=False):
    """Build a partis annotation dict for the (single-gene-per-region) replay naive sequence of locus <ltmp>.

    glfos: per-locus germline info dicts (each region must contain exactly one gene)
    ltmp: locus string, e.g. 'igh'
    add_empty_mature_keys: if True, set per-sequence keys ('unique_ids', 'seqs', etc.) to empty
        lists so the caller can fill them with mature/observed seqs; otherwise fill them with
        the single naive sequence.
    Returns the annotation dict.
    """
    antn, naive_seq = {}, []
    for region in utils.getregions(ltmp):
        # each region's glfo must hold exactly one gene (get_single_entry asserts this)
        gene, seq = utils.get_single_entry(list(glfos[ltmp]['seqs'][region].items()))
        antn['%s_gene'%region] = gene
        if ltmp == 'igh' and region == 'j' and args.rm_last_ighj_base:
            if debug:
                print('    removing last ighj base: %s --> %s' % (seq, seq[:-1]))
            seq = seq[:-1]
        naive_seq.append(seq)
    # replay naive rearrangement has no erosions or insertions
    for dstr in utils.all_erosions:
        antn['%s_del'%dstr] = 0
    if not utils.has_d_gene(ltmp):  # light chain: use dummy d gene with a 1-base 5p deletion
        antn['d_gene'] = glutils.dummy_d_genes[ltmp]
        antn['d_5p_del'] = 1
    for bstr in utils.all_boundaries:
        antn['%s_insertion'%bstr] = ''
    antn['naive_seq'] = ''.join(naive_seq)
    antn['invalid'] = False
    if add_empty_mature_keys:  # keys that need to be replaced in mature annotation
        antn['unique_ids'] = []
        antn['seqs'] = []
        antn['input_seqs'] = antn['seqs']  # NOTE deliberately aliases the same list object as 'seqs'
        antn['indelfos'] = []
        antn['paired-uids'] = []
    else:
        antn['unique_ids'] = ['replay_naive']
        antn['seqs'] = [''.join(naive_seq)]
        antn['input_seqs'] = antn['seqs']  # same aliasing as above
        antn['indelfos'] = [indelutils.get_empty_indel()]
        antn['paired-uids'] = [[]]
    if debug:
        utils.add_implicit_info(glfos[ltmp], antn)
        utils.print_reco_event(antn)
    return antn
64
+
65
# ----------------------------------------------------------------------------------------
def get_uid(name, ltmp):
    """Return the per-locus unique id: the gcdyn node name suffixed with its locus."""
    return '{}-{}'.format(name, ltmp)
68
+
69
# ----------------------------------------------------------------------------------------
def process_tree(glfos, treelist, tree_sfos, leaf_meta, itree, lp_infos, lpair):
    """Convert the <itree>th gcdyn tree and its sequences into paired heavy/light partis annotations, appending them to <lp_infos>.

    glfos: per-locus germline info
    treelist: newick strings, one per tree
    tree_sfos: dict mapping tree index to the list of seqfos for that tree
    leaf_meta: dict mapping seq name to its meta info (e.g. affinity)
    lp_infos: output structure keyed by locus pair, with per-locus 'antn_lists'
    lpair: the (heavy, light) locus pair being processed
    """
    antns = {}  # per-locus annotation, starting from replay naive with empty per-seq keys
    for ltmp in lpair:
        antns[ltmp] = get_replay_naive_antn(glfos, ltmp, add_empty_mature_keys=True, debug=args.debug)
        for tkey in args.meta_columns:
            antns[ltmp][utils.input_metafile_keys[tkey]] = []

    hloc = utils.heavy_locus(args.ig_or_tr)
    lloc = utils.get_single_entry([l for l in glfos if l!=hloc])  # the single light locus
    h_seq_len = len(antns[hloc]['naive_seq'])
    for sfo in tree_sfos[itree]:
        # gcdyn seqs are heavy+light smooshed together, so each must match the joint naive length
        joint_naive_seq = ''.join(antns[l]['naive_seq'] for l in lpair)
        if len(sfo['seq']) == len(joint_naive_seq) - 1:
            raise Exception('seq read from gcdyn file has len %d, one less than replay naive seq %d, you probably need to set --rm-last-ighj-base' % (len(sfo['seq']), len(joint_naive_seq)))
        assert len(sfo['seq']) == len(joint_naive_seq)
        if 'naive' in sfo['name']:  # naive seq is only a sanity check, don't add it as an observed seq
            assert sfo['seq'] == joint_naive_seq
            continue
        dumb_offset = 0
        if args.rm_last_ighj_base:
            # NOTE(review): with this offset the heavy seq keeps the base at which the light seq also starts -- looks intentional (re-adding the removed ighj base), but confirm
            dumb_offset = 1
        antns[hloc]['seqs'].append(sfo['seq'][:h_seq_len + dumb_offset])  # split smooshed seq into heavy...
        antns[lloc]['seqs'].append(sfo['seq'][h_seq_len:])  # ...and light

        for ltmp in lpair:
            # holds because 'input_seqs' aliases the same list object as 'seqs' (see get_replay_naive_antn)
            assert antns[ltmp]['input_seqs'][-1] == antns[ltmp]['seqs'][-1]
            antns[ltmp]['unique_ids'].append(get_uid(sfo['name'], ltmp))
            antns[ltmp]['indelfos'].append(indelutils.get_empty_indel())
            other_locus = utils.get_single_entry([l for l in lpair if l!=ltmp])
            antns[ltmp]['paired-uids'].append([get_uid(sfo['name'], other_locus)])  # each seq pairs with its counterpart at the other locus
            for tkey in args.meta_columns:
                antns[ltmp][utils.input_metafile_keys[tkey]].append(leaf_meta[sfo['name']][tkey])

    for ltmp in lpair:
        # relabel tree tips with per-locus uids, then attach the tree to each annotation
        dtree = treeutils.get_dendro_tree(treestr=treelist[itree])
        treeutils.translate_labels(dtree, [(s['name'], get_uid(s['name'], ltmp)) for s in tree_sfos[itree] if 'naive' not in s['name']], expect_missing=True)
        antns[ltmp]['tree'] = dtree.as_string(schema='newick').strip()
        tmp_event = RecombinationEvent(glfos[ltmp])  # I don't want to move the function out of event.py right now
        tmp_event.set_reco_id(antns[ltmp], irandom=itree)  # not sure that setting <irandom> here actually does anything
        utils.add_implicit_info(glfos[ltmp], antns[ltmp])  # easiest way to add codon_positions, which we want to write to file

    for ltmp in lpair:
        lp_infos[lpair]['antn_lists'][ltmp].append(antns[ltmp])

    if args.debug:
        for ltmp in sorted(glfos):
            utils.print_reco_event(antns[ltmp], extra_str='    ')
117
+
118
# ----------------------------------------------------------------------------------------
def mfname():
    """Return the gcdyn meta file path, preferring the new name ('meta.csv') and falling
    back to the old one ('leaf-meta.csv') for backwards compatibility."""
    new_fn = '%s/meta.csv' % gcd_dir
    old_fn = '%s/leaf-meta.csv' % gcd_dir
    return new_fn if os.path.exists(new_fn) else old_fn
124
+
125
# ----------------------------------------------------------------------------------------
def run_gcdyn():
    """Write and run a bash script that runs gcdyn simulation ('gcd-simulate') into <gcd_dir>,
    skipping the run if its output already exists."""
    if os.path.exists('%s/encoded-trees.npy'%gcd_dir):  # presence of this file marks a finished run
        print('    gcdyn output exists in %s' % gcd_dir)
        return
    gcmd = 'gcd-simulate --sample-internal-nodes --label-leaf-internal-nodes --outdir %s --xshift-values 2.5 --xscale-values 5 --yscale-values 1000000' % gcd_dir
    # forward whichever of our optional args were set on the command line
    optional_flags = [
        ('--n-sub-procs', args.n_sub_procs),
        ('--seed', args.seed),
        ('--n-trials', args.n_sim_events),
        ('--time-to-sampling-values', args.obs_times),
    ]
    for flag, fval in optional_flags:
        if fval is not None:
            gcmd += ' %s %d' % (flag, fval)
    script_lines = ['#!/bin/bash'] + utils.mamba_cmds('gcdyn') + [gcmd]
    utils.simplerun('\n'.join(script_lines) + '\n', cmdfname='%s/run.sh' % gcd_dir)
144
+
145
# ----------------------------------------------------------------------------------------
def process_output():
    """Read gcdyn output (seqs.fasta, trees.nwk, meta csv) from <gcd_dir> and write
    partis-format paired annotation files (plus combined fasta + meta yaml) to --outdir."""
    glfos = {}
    for ltmp in utils.loci:
        # only read germline info for loci that have a subdir in --replay-germline-dir
        if os.path.exists('%s/%s' % (args.replay_germline_dir, ltmp)):
            glfos[ltmp] = glutils.read_glfo(args.replay_germline_dir, ltmp)

    seqfos = utils.read_fastx('%s/seqs.fasta'%gcd_dir)
    treelist = treeutils.get_treestrs_from_file('%s/trees.nwk'%gcd_dir)
    lmetalines = utils.csvlines(mfname())
    leaf_meta = {l['name'] : {'affinity' : float(l['affinity'])} for l in lmetalines}
    print('    read %d trees, %d seqs (plus leaf metafo) from %s' % (len(treelist), len(seqfos), gcd_dir))
    tree_sfos, all_uids = {}, set()  # collect up the seqfos for each tree
    for sfo in seqfos:
        # seq names encode the tree index: either '<itree>-<name>' (old style / naive) or '<itree>-<leaf|mrca>-<name>' (new style)
        if sfo['name'].count('-') == 1:  # old-style, and naive seq
            itree, sname = sfo['name'].split('-')
        elif sfo['name'].count('-') == 2:  # new-style
            itree, listr, sname = sfo['name'].split('-')  # listr is either 'leaf' or 'mrca'
        else:
            assert False
        itree = int(itree)
        if itree not in tree_sfos:
            tree_sfos[itree] = []
        tree_sfos[itree].append(sfo)
        if sfo['name'] in all_uids:  # uids must be globally unique across all trees
            raise Exception('found uid %s twice' % sfo['name'])
        all_uids.add(sfo['name'])
    print('    %d tree seqfos with lengths: %s' % (len(tree_sfos), ' '.join(str(len(slist)) for slist in sorted(tree_sfos.values(), key=len, reverse=True))))
    if sorted(tree_sfos) != list(range(len(treelist))):  # every tree index 0..N-1 must appear
        raise Exception('tree indices from sequence names didn\'t match number of trees')

    lpairs = [tuple(sorted(glfos))]  # could use utils.locus_pairs() if i update to more than one lpair
    lp_infos = {lp : {'antn_lists' : {l : [] for l in glfos}, 'glfos' : {l : g for l, g in glfos.items()}, 'cpaths' : {}} for lp in lpairs}
    for itree in range(len(treelist)):
        process_tree(glfos, treelist, tree_sfos, leaf_meta, itree, lp_infos, utils.get_single_entry(lpairs))

    print('    writing annotations to %s' % args.outdir)
    headers = utils.simulation_headers + args.meta_columns
    def ofn_fn(locus, lpair=None, joint=None):
        # output file name fn passed to the paircluster writers
        return paircluster.paired_fn(args.outdir, locus, lpair=lpair, suffix='.yaml')
    paircluster.write_lpair_output_files(lpairs, lp_infos, ofn_fn, headers=headers)
    glfos, antn_lists, _ = paircluster.concat_heavy_chain(lpairs, lp_infos)  # per-locus glfos with concat'd heavy chain
    paircluster.write_concatd_output_files(glfos, antn_lists, ofn_fn, headers)
    outfos, metafos = paircluster.get_combined_outmetafos(antn_lists, extra_meta_headers=[utils.input_metafile_keys[k] for k in args.meta_columns])
    paircluster.write_combined_fasta_and_meta(args.outdir+'/all-seqs.fa', args.outdir+'/meta.yaml', outfos, metafos)
190
+
191
# ----------------------------------------------------------------------------------------
helpstr = """
run gcdyn simulation, then process results into partis-format paired output dir, for example:
./bin/gcdyn-simu-run.py --gcd-dir <gcd> --n-sub-procs 10 --seed 0 --n-trials 10 --rm-last-ighj-base
"""
class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter that keeps raw text layout while also showing argument defaults."""
    pass
formatter_class = MultiplyInheritedFormatter
parser = argparse.ArgumentParser(formatter_class=MultiplyInheritedFormatter, description=helpstr)
parser.add_argument('--outdir', required=True)
parser.add_argument('--actions', default='run:process')
parser.add_argument('--input-simu-dir', help='if set, only run \'process\' action, reading gcdyn simu from this dir and writing partis files to --outdir')
parser.add_argument('--replay-germline-dir', default='datascripts/meta/taraki-gctree-2021-10/germlines', help='dir with gcreplay germline sequences')
parser.add_argument('--rm-last-ighj-base', action='store_true', help='sometimes the ighj gene has an extra G at the end, sometimes not, this says to remove it from the seqs read from --replay-germline-dir')
parser.add_argument('--n-sub-procs', type=int)
parser.add_argument('--seed', type=int)
parser.add_argument('--n-sim-events', type=int)
parser.add_argument('--obs-times', type=int)
parser.add_argument('--meta-columns', default='affinity')
parser.add_argument('--ig-or-tr', default='ig')
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()
# convert colon-separated string to list, restricted to known input-meta keys
args.meta_columns = utils.get_arg_list(args.meta_columns, choices=utils.input_metafile_keys.keys())
if args.input_simu_dir is None:  # normal case: run gcdyn into a subdir of --outdir
    gcd_dir = '%s/gcdyn' % args.outdir
    args.actions = utils.get_arg_list(args.actions)
else:  # gcdyn output already exists elsewhere, so only the 'process' step makes sense
    gcd_dir = args.input_simu_dir
    args.actions = ['process']
if 'run' in args.actions:
    run_gcdyn()
if 'process' in args.actions:
    process_output()
bin/gctree-run.py ADDED
@@ -0,0 +1,244 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import numpy
5
+ import csv
6
+ import yaml
7
+ import time
8
+ import colored_traceback.always
9
+ import argparse
10
+ import subprocess
11
+ import sys
12
+ import os
13
+ import dendropy
14
+ import json
15
+ from io import open
16
+ import random
17
+ from pathlib import Path
18
+
19
+ partis_dir = str(Path(__file__).parent.parent)
20
+ sys.path.insert(1, partis_dir) #'./python')
21
+ import python.utils as utils
22
+ import python.glutils as glutils
23
+ import python.treeutils as treeutils
24
+
25
# ----------------------------------------------------------------------------------------
def get_inf_int_name(gname):  # <gname> is just an integer, which won't be unique and will break things
    """Prefix gctree's bare integer node name with --inf-int-label so it's unique."""
    return '-'.join([args.inf_int_label, gname])
28
+
29
# ----------------------------------------------------------------------------------------
def gctofn(ft):
    """Return the path (in --outdir) of the raw gctree output file of type <ft>
    ('tree', 'seqs', or 'dnapars')."""
    fname = {
        'tree' : 'gctree.out.inference.1.nk',
        'seqs' : 'gctree.out.inference.1.fasta',
        'dnapars' : 'outfile',
    }[ft]
    return '%s/%s' % (args.outdir, fname)
37
+
38
# ----------------------------------------------------------------------------------------
def fofn(ft):
    """Return the path (in --outdir) of our final postprocessed output file of type <ft>:
    'tree' --> tree.nwk, 'seqs' --> inferred-seqs.fa."""
    assert ft in ['tree', 'seqs']
    if ft == 'tree':
        return '%s/tree.nwk' % args.outdir
    return '%s/inferred-seqs.fa' % args.outdir
42
+
43
# ----------------------------------------------------------------------------------------
def idfn():
    """Name of the id-map file written by gctree's deduplicate step."""
    return 'idmap.txt'
46
+
47
# ----------------------------------------------------------------------------------------
def install():
    """Create the gctree micromamba env (--env-label) and install phylip + gctree (+ click).

    With --no-dag, the old pre-DAG gctree (v3.3.0) is pip-installed instead of taken
    from conda-forge.
    """
    cmds = ['#!/bin/bash']
    cmds += utils.mamba_cmds(args.env_label, only_prep=True)  # only set up micromamba here -- the env doesn't exist yet, so can't activate
    cmds += ['micromamba create -y -n %s -c conda-forge python=3.9' % args.env_label]  # 3.10 currently has problems with ete
    cmds += ['micromamba activate %s' % args.env_label]
    cmds += ['micromamba install -y -c bioconda -c conda-forge phylip']
    cmds += ['micromamba install -y -c conda-forge%s click' % ('' if args.no_dag else ' gctree')]
    if args.no_dag:
        cmds += ['pip install gctree==3.3.0']  # I think having --user makes it install in ~/.local (outside mamba env)
    # micromamba remove -n gctree --all  # to nuke it and start over
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname='/tmp/tmprun.sh', debug=True)
59
+
60
# ----------------------------------------------------------------------------------------
def update():
    """Update phylip, gctree, and click in the existing micromamba env (--env-label)."""
    script_lines = ['#!/bin/bash']
    script_lines.extend(utils.mamba_cmds(args.env_label))
    script_lines.append('micromamba update phylip gctree click')
    utils.simplerun('\n'.join(script_lines) + '\n', cmdfname='/tmp/tmprun.sh', debug=True)
66
+
67
# ----------------------------------------------------------------------------------------
def add_mfo(tcmd, mfn):
    """Append frame/chain-split options from the json meta file <mfn> to the gctree command <tcmd>.

    tcmd: gctree command line string so far
    mfn: path to json meta file (see --metafname help)
    Returns the command string with any applicable options appended.
    """
    kdict = {'frame' : 'frame', 'h_frame' : 'frame', 'l_frame' : 'frame2', 'l_offset' : 'chain_split'}  # translates from metafo dict to gctree command line args
    with open(mfn) as mfile:  # bug fix: previously read the global args.metafname, silently ignoring the <mfn> parameter
        metafo = json.load(mfile)
    for tk, tc in kdict.items():
        if tk in metafo:
            tcmd += ' --%s %d' % (tc, metafo[tk])
    return tcmd
76
+
77
# ----------------------------------------------------------------------------------------
def run_gctree():
    """Write and run a bash script that runs dnapars (unless --input-forest-dir provides its
    output) and then gctree inference in --outdir, inside the gctree micromamba env."""
    # ----------------------------------------------------------------------------------------
    def get_gctree_cmd():
        # build the 'gctree infer' command line from our args
        tcmd = '%s/bin/xvfb-run -a gctree infer outfile abundances.csv --root %s --verbose --idlabel' % (utils.get_partis_dir(), args.root_label)  # --idlabel writes the output fasta file
        if not args.base_model and not args.no_dag:  # default: pass the s5f mutation model files
            tcmd += ' --mutability %s/HS5F_Mutability.csv --substitution %s/HS5F_Substitution.csv' % (args.data_dir, args.data_dir)
        if args.ranking_coeffs is not None:
            tcmd += ' --ranking_coeffs %s' % (' '.join(c for c in args.ranking_coeffs))
        if args.branching_process_ranking_coeff is not None:
            tcmd += ' --branching_process_ranking_coeff %d' % args.branching_process_ranking_coeff
        if os.path.exists(args.metafname):  # frame/chain-split info for non-standard input seqs
            tcmd = add_mfo(tcmd, args.metafname)
        return tcmd
    # ----------------------------------------------------------------------------------------
    def get_cmds():
        # assemble the full list of shell commands (also preps --outdir as a side effect)
        cmds = ['#!/bin/bash']
        cmds += utils.mamba_cmds(args.env_label)
        if args.run_help:
            cmds += ['gctree infer -h']
            return cmds
        if not os.path.exists(args.infname):
            raise Exception('--infname %s doesn\'t exist' % args.infname)
        cmds += ['cd %s' % args.outdir]
        if args.input_forest_dir is None:
            ofn = '%s/outfile' % args.outdir  # dnapars output file (this is what takes the longest to make)
            if os.path.exists(ofn) and os.stat(ofn).st_size > 0:
                print('    dnapars output already exists, not rerunning: %s' % ofn)
            else:
                if os.path.exists(ofn) and os.stat(ofn).st_size == 0:  # zero-length output means a previous run died
                    print('    removing zero length dnapars output %s' % ofn)
                utils.prep_dir(args.outdir, wildlings=['outfile', 'outtree'], allow_other_files=True)  # phylip barfs like a mfer if its outputs exist (probably you'll get a KeyError 'naive')
                cmds += ['deduplicate %s --root %s --abundance_file abundances.csv --idmapfile %s > deduplicated.phylip' % (args.infname, args.root_label, idfn())]
                cmds += ['mkconfig deduplicated.phylip dnapars > dnapars.cfg']
                cmds += ['dnapars < dnapars.cfg > dnapars.log']  # NOTE if things fail, look in dnaparse.log (but it's super verbose so we can't print it to std out by default)
        else:
            print('    --input-forest-dir: copying abundance, idmap, and forest files from %s' % args.input_forest_dir)
            cmds += ['cp %s/{abundances.csv,%s,outfile} %s/' % (args.input_forest_dir, idfn(), args.outdir)]
        if not args.only_write_forest:
            cmds.append(get_gctree_cmd())
        return cmds
    # ----------------------------------------------------------------------------------------
    if not args.run_help and utils.output_exists(args, gctofn('dnapars' if args.only_write_forest else 'tree')):
        return

    cmds = get_cmds()  # also preps dir + other stuff

    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=args.outdir + '/run.sh', print_time='gctree', debug=True, dryrun=args.dry_run)
    if args.run_help:
        sys.exit()
127
+
128
# ----------------------------------------------------------------------------------------
def parse_output():
    """Postprocess raw gctree output into our final files: relabel nodes with original input
    seq names, expand collapsed duplicate nodes, optionally resolve multifurcations, then
    write tree.nwk and inferred-seqs.fa to --outdir."""
    if utils.output_exists(args, fofn('seqs')):
        return

    # read translations (this only includes input sequences, not inferred intermediates)
    idm_trns = {}
    with open('%s/idmap.txt' % args.outdir) as idfile:
        reader = csv.DictReader(idfile, fieldnames=('name', 'orig_names'))
        for line in reader:
            if line['orig_names'] == '':  # nodes with no original names (inferred only)
                continue
            idm_trns[line['name']] = line['orig_names'].split(':')

    # read fasta (mostly for inferred intermediate seqs)
    seqfos = utils.read_fastx(gctofn('seqs'), look_for_tuples=True)
    print('    read %d seqs from gctree output fasta' % len(seqfos))
    if any(s['name']=='' for s in seqfos):
        n_removed = len([s for s in seqfos if s['name']==''])
        seqfos = [s for s in seqfos if s['name']!='']
        print('    %s removed %d seqs with zero-length names \'\' (I\'m *not* sure this is the right thing to do, but it just kicked this error when I was doing the python 3 conversion)' % (utils.wrnstr(), n_removed))
    nfos = [s for s in seqfos if s['name']==args.root_label]
    if len(nfos) != 1:
        print('    %s expected 1 naive seq with label \'%s\' but found %d: %s (in %s)' % (utils.wrnstr(), args.root_label, len(nfos), ' '.join(n['name'] for n in nfos), gctofn('seqs')))
    seqfos = [s for s in seqfos if s['name'] != args.root_label]  # don't want naive seq in final fasta
    seq_len = numpy.mean([len(s['seq']) for s in seqfos])
    if not args.expand_all_nodes:  # also remove input seqs (well, gctree's new names for input seqs), unless we're expanding all nodes, in which case we need the gctree-named-nodes as fake new internal nodes
        seqfos = [s for s in seqfos if s['name'] not in idm_trns]
    if len(seqfos) == 0:
        print('    %s no inferred sequences (all seqs read from gctree output were input seqs)' % utils.wrnstr())  # bug fix: message was missing its closing paren
    inf_int_trns = []  # (old name, new name) pairs for inferred intermediates
    for sfo in seqfos:
        inf_int_trns.append((sfo['name'], get_inf_int_name(sfo['name'])))
        sfo['name'] = get_inf_int_name(sfo['name'])

    # read tree
    dtree = treeutils.get_dendro_tree(treefname=gctofn('tree'), debug=args.debug)
    dtree.scale_edges(1. / seq_len)  # convert branch lengths from N mutations to per-site rates
    dtree.seed_node.taxon.label = args.root_label
    ndict = {n.taxon.label : n for n in dtree.preorder_node_iter()}
    for gname, onames in idm_trns.items():
        node = ndict.get(gname)  # bug fix: was ndict[gname], which raised KeyError before the intended error below could fire
        if node is None:
            raise Exception('couldn\'t find node with name \'%s\' in tree from gctree in %s' % (gname, gctofn('tree')))
        if args.debug and len(onames) > 1:
            print('    abundance > 1 for %s: %d (%s)' % (gname, len(onames), ' '.join(onames)))
        for onm in onames:
            if node.taxon.label == gname and not args.expand_all_nodes:
                node.taxon.label = onm  # rename the collapsed node after the first original seq
                if args.debug and len(onames) > 1:
                    print('      setting node to %s' % onm)
                continue
            treeutils.add_zero_length_child(node, dtree, child_name=onm)  # add duplicates as children with zero-length edges
            if args.debug and len(onames) > 1:
                print('      adding child node %s' % onm)
    treeutils.translate_labels(dtree, inf_int_trns, expect_missing=True, debug=args.debug)

    if args.fix_multifurcations:
        input_seqfos = utils.read_fastx(args.infname)
        dtree, new_seqfos = treeutils.get_binary_tree(dtree, nfos + input_seqfos + seqfos, debug=args.debug)
        seqfos += new_seqfos
    if args.debug:
        print('    final tree:')
        print(treeutils.get_ascii_tree(dendro_tree=dtree, extra_str='    ', width=350))
    with open(fofn('tree'), 'w') as ofile:
        ofile.write('%s\n' % treeutils.as_str(dtree))
    utils.write_fasta(fofn('seqs'), nfos + seqfos)
195
+
196
# ----------------------------------------------------------------------------------------
ustr = """
Run gctree tree inference on sequences from fasta input file <--infname>.
Output trees and sequences are written to <--outdir> as inferred-seqs.fa and tree.nwk (gctree output files are also there, but they don't have any postprocessing e.g. fixing names and/or multifurcations.
gctree-run.py --infname <fasta> --outdir <outdir>
"""
parser = argparse.ArgumentParser(usage=ustr)
parser.add_argument('--actions', default='run:parse')
parser.add_argument('--infname')
parser.add_argument('--metafname', help='if you need --frame (v region doesn\'t start at first position) or --chain_split and --frame2 (heavy/light chain smooshed together), pass the info in json format with this arg (see code above for format).')
parser.add_argument('--outdir')
parser.add_argument('--only-write-forest', action='store_true', help='only run preparatory steps for gctree, i.e. up through dnapars, to write parsimony forest')
parser.add_argument('--input-forest-dir', help='If set, skips preparatory steps (see --only-write-forest), and looks for \'abundance.csv\' and parsimony forest file (\'outfile\') in the specified dir')
parser.add_argument('--overwrite', action='store_true')
parser.add_argument('--base-model', action='store_true', help='By default, we pass gctree info for the s5f mutation model; if this is set, we don\'t, and it instead use the base model.')
parser.add_argument('--no-dag', action='store_true', help='If set, use old v1 non-DAG gctree version (v3.3.0). Note that this uses a different env (see --env-label)')
parser.add_argument('--ranking-coeffs', nargs='+', help='see gctree help')
parser.add_argument('--branching-process-ranking-coeff', type=int, help='see gctree help')
parser.add_argument('--env-label', default='gctree')
parser.add_argument('--root-label', default='naive')
parser.add_argument('--data-dir', default='%s/data/s5f'%utils.get_partis_dir())
parser.add_argument('--inf-int-label', default='inf', help='base name for inferred intermediate seqs (numerical name is appended with -')
parser.add_argument('--expand-all-nodes', action='store_true', help='Gctree collapses duplicate observed seqs into nodes with new names and abundance N > 1. By default, we expand these such that the node is named for one of the observed seqs, and add N-1 (zero-length) children. If this arg is set, however, we leave the node and add N (zero-length) children.')
parser.add_argument('--run-help', action='store_true', help='run gctree help')
parser.add_argument('--debug', action='store_true')
parser.add_argument('--dry-run', action='store_true')
parser.add_argument('--random-seed', type=int, default=0)
parser.add_argument('--fix-multifurcations', action='store_true', help='resolves multifurcations (by adding zero length intermediates) and move input seqs that have been extend unifurcations onto zero length branches')

args = parser.parse_args()
# seed both stdlib and numpy rngs for reproducibility
random.seed(args.random_seed)
numpy.random.seed(args.random_seed)
if args.only_write_forest and args.input_forest_dir:  # one writes the forest, the other reads it
    raise Exception('doesn\'t make sense to specify both')
args.actions = utils.get_arg_list(args.actions, choices=['install', 'update', 'run', 'parse'])
args.infname = utils.fpath(args.infname)
args.outdir = utils.fpath(args.outdir)
if args.no_dag:  # the old non-DAG gctree lives in its own env with a pinned version, and doesn't support these ranking options
    assert not args.base_model and args.branching_process_ranking_coeff is None and args.ranking_coeffs is None
    args.env_label = 'gctree-no-dag'

if 'install' in args.actions:
    install()
if 'update' in args.actions:
    update()
if 'run' in args.actions:
    run_gctree()
if 'parse' in args.actions:
    parse_output()
@@ -0,0 +1,126 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import os
5
+ import sys
6
+ import csv
7
+ import argparse
8
+ import operator
9
+ import argparse
10
+ import yaml
11
+ import colored_traceback.always
12
+ from io import open
13
+
14
+ # if you move this script, you'll need to change this method of getting the imports
15
+ from pathlib import Path
16
+ partis_dir = str(Path(__file__).parent.parent)
17
+ sys.path.insert(1, partis_dir) # + '/python')
18
+
19
+ import python.utils as utils
20
+
21
# ----------------------------------------------------------------------------------------
def is_acceptable(scol, acceptable_values, lval, any_allele=None):
    """Return True if <lval> (the value of column <scol> from a csv line) should be kept.

    A value passes if it's in <acceptable_values>, or -- when allele matching is on and
    <scol> is a gene column (name contains '_gene') -- if it's an allele of any gene in
    <acceptable_values> (per utils.are_alleles()).

    any_allele: if None (the default, which preserves the old behavior) fall back to the
        global args.any_allele; pass an explicit bool to decouple from the global.
    """
    if lval in acceptable_values:  # checked first, before touching the global, same as the original evaluation order
        return True
    if any_allele is None:  # backward compatible: existing callers pass three positional args and rely on the global
        any_allele = args.any_allele
    if any_allele and '_gene' in scol and any(utils.are_alleles(g, lval) for g in acceptable_values):
        return True
    return False
28
+
29
class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """Argparse help formatter combining raw-text help strings with automatic '(default: ...)' annotations."""
    pass
31
# command line arguments
formatter_class = MultiplyInheritedFormatter  # NOTE(review): this module-level name is never used below (the parser is given the class directly); looks like a copy/paste leftover
parser = argparse.ArgumentParser(formatter_class=MultiplyInheritedFormatter)
parser.add_argument('--infname', default='test/ref-results/test/parameters/data/hmm/all-probs.csv', help='input all-probs.csv file from a previously-inferred partis parameter directory, for instance: test/reference-results/test/parameters/data/hmm/all-probs.csv')
parser.add_argument('--config-fname', help='yaml file with info on columns for which we want to specify particular values (and skip others). See default/example set below. To create a yaml config file to start from, uncomment the yaml.dump() line below and rerun with no arguments.')
parser.add_argument('--outfname')  # optional yaml output file for the summed counts (written at the bottom of the script)
parser.add_argument('--any-allele', action='store_true', help='if set, also include any other alleles of any of the genes specified in \'skip_column_vals\' (note: can also set it in the cfg file).')
parser.add_argument('--debug', action='store_true', default=True) # it's kind of confusing without the debug printout  NOTE(review): default=True with store_true means the flag can never actually turn debug off -- deliberate per the preceding comment, but worth knowing
args = parser.parse_args()
39
+
40
# decide which column values to keep: either the hard-coded default set below, or one read from --config-fname
non_summed_column = None  # column whose per-value counts get reported individually (rather than only summed)
if args.config_fname is None:
    non_summed_column = 'v_gene'
    skip_column_vals = { # to input your own dict on the command line, just convert with str() and quote it
        # 'cdr3_length' : ['33', '36', '39', '42', '45', '48'], # <value> is list of acceptable values NOTE need to all be strings, otherwise you have to worry about converting the values in the csv file
        'v_gene' : ['IGHV1-2*02+G35A', 'IGHV1-2*02+T147C', 'IGHV1-2*02'],
        # 'd_gene' : ['IGHD3-22*01'],
        'j_gene' : ['IGHJ4*02'],
        'cdr3_length' : ['66',],
    }
    print('%s using default skip column/non-summed column values (which probably don\'t correspond to what you\'re actually interested in)' % utils.color('red', 'note'))
    # # uncomment to create a yaml file to start from:
    # with open('tmp.yaml', 'w') as tfile:
    #     yaml.dump({'non_summed_column' : non_summed_column, 'skip_column_vals' : skip_column_vals}, tfile)
else:
    with open(args.config_fname) as yamlfile:
        # NOTE(review): yaml.Loader can construct arbitrary python objects; if config files could come from untrusted sources, yaml.safe_load() would be safer
        yamlfo = yaml.load(yamlfile, Loader=yaml.Loader)
    if 'non_summed_column' in yamlfo:  # optional key
        non_summed_column = yamlfo['non_summed_column']
    skip_column_vals = yamlfo['skip_column_vals']
    for scol in skip_column_vals:
        skip_column_vals[scol] = [str(v) for v in skip_column_vals[scol]] # yaml.load() converts to integers, which is usually nice, but here we don't want it to since we're not converting when reading all-probs.csv (I think there's options to yaml.load to change this, I just don't want to figure it out now)
    if 'any_allele' in yamlfo:  # cfg file value takes precedence over the command line flag
        if args.any_allele and not yamlfo['any_allele']: # if it's set to true on the command line, but false in the file
            print(' %s overwriting --any-allele with value from cfg file %s' % (utils.color('red', 'warning'), args.config_fname))
        args.any_allele = yamlfo['any_allele']
66
+
67
# read all-probs.csv, keeping only lines whose column values pass the restrictions in skip_column_vals,
# and tallying both line counts and the 'count' column totals for kept vs skipped lines
info = {}  # per-value count totals for <non_summed_column> (only filled if that's set)
lines_skipped, lines_used = 0, 0
counts_skipped, counts_used = 0, 0
print(' reading probs from %s' % args.infname)
with open(args.infname) as csvfile:
    reader = csv.DictReader(csvfile)
    # if args.debug:
    #     print ' all columns in file: %s' % ' '.join(reader.fieldnames)
    if len(set(skip_column_vals) - set(reader.fieldnames)) > 0:  # make sure every restricted column actually exists in the csv
        raise Exception('keys in --skip-column-fname not in file: %s' % ' '.join(set(skip_column_vals) - set(reader.fieldnames)))
    for line in reader:
        skip_this_line = False
        for scol, acceptable_values in skip_column_vals.items():
            if not is_acceptable(scol, acceptable_values, line[scol]):  # first failing column decides: skip the whole line
                skip_this_line = True
                lines_skipped += 1
                counts_skipped += int(line['count'])
                break
        if skip_this_line:
            continue

        if non_summed_column is not None:  # also accumulate a per-value total for the non-summed column
            if line[non_summed_column] not in info:
                info[line[non_summed_column]] = 0
            info[line[non_summed_column]] += int(line['count'])

        lines_used += 1
        counts_used += int(line['count'])
95
+
96
# ----------------------------------------------------------------------------------------
import python.fraction_uncertainty as fraction_uncertainty
def frac_err(obs, total):
    """Return half the width of the uncertainty interval on the fraction obs/total."""
    err_lo, err_hi = fraction_uncertainty.err(obs, total)
    return (err_hi - err_lo) / 2.
101
# overall fraction of 'count' totals that survived the restrictions
count_fraction = counts_used / float(counts_used + counts_skipped)  # NOTE(review): raises ZeroDivisionError if the input file has no data lines

if args.debug:
    # summarize the restrictions that were applied, then the kept/total line and count fractions
    print(' applied restrictions:%s' % (' (including all alleles of these genes)' if args.any_allele else ''))
    for scol, acceptable_values in skip_column_vals.items():
        print(' %15s in %s' % (scol, acceptable_values))
    print(' used:')
    print(' %6d / %-6d = %.3f lines' % (lines_used, lines_used + lines_skipped, lines_used / float(lines_used + lines_skipped)))
    print(' %6d / %-6d = %.3f +/- %.3f counts' % (counts_used, counts_used + counts_skipped, count_fraction, frac_err(counts_used, counts_used + counts_skipped)))
110
+
111
# print the per-value breakdown for the non-summed column, largest counts first
if non_summed_column is not None:
    print(' %18s count / %d = fraction' % (non_summed_column, counts_used))
    # sorting by count, descending; to instead sort by column value (e.g. cdr3 length), iterate plain sorted(info.items())
    for val, count in sorted(info.items(), key=lambda vcount: vcount[1], reverse=True):
        print(' %18s %6d %.3f +/- %.3f' % (val, count, count / float(counts_used), frac_err(count, counts_used)))
116
+
117
# optionally dump the summed counts (plus the per-value info dict) as yaml
if args.outfname is not None:
    if args.debug:
        print(' writing total counts (plus %d info entries) to %s' % (len(info), args.outfname))
    total_count = counts_used + counts_skipped
    outfo = {
        'counts' : counts_used,
        'total' : total_count,
        'fraction' : count_fraction,
        'frac_err' : frac_err(counts_used, total_count),
        'info' : info,
    }
    with open(args.outfname, 'w') as outfile:
        yaml.dump(outfo, outfile, width=150)
bin/iqtree-1.6.12 ADDED
Binary file