partis-bcr 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. bin/FastTree +0 -0
  2. bin/add-chimeras.py +59 -0
  3. bin/add-seqs-to-outputs.py +81 -0
  4. bin/bcr-phylo-run.py +799 -0
  5. bin/build.sh +24 -0
  6. bin/cf-alleles.py +97 -0
  7. bin/cf-germlines.py +57 -0
  8. bin/cf-linearham.py +199 -0
  9. bin/chimera-plot.py +76 -0
  10. bin/choose-partially-paired.py +143 -0
  11. bin/circle-plots.py +30 -0
  12. bin/compare-plotdirs.py +298 -0
  13. bin/diff-parameters.py +133 -0
  14. bin/docker-hub-push.sh +6 -0
  15. bin/extract-pairing-info.py +55 -0
  16. bin/gcdyn-simu-run.py +223 -0
  17. bin/gctree-run.py +244 -0
  18. bin/get-naive-probabilities.py +126 -0
  19. bin/iqtree-1.6.12 +0 -0
  20. bin/lonr.r +1020 -0
  21. bin/makeHtml +52 -0
  22. bin/mds-run.py +46 -0
  23. bin/parse-output.py +277 -0
  24. bin/partis +1869 -0
  25. bin/partis-pip +116 -0
  26. bin/partis.py +1869 -0
  27. bin/plot-gl-set-trees.py +519 -0
  28. bin/plot-hmms.py +151 -0
  29. bin/plot-lb-tree.py +427 -0
  30. bin/raxml-ng +0 -0
  31. bin/read-bcr-phylo-trees.py +38 -0
  32. bin/read-gctree-output.py +166 -0
  33. bin/run-chimeras.sh +64 -0
  34. bin/run-dtr-scan.sh +25 -0
  35. bin/run-paired-loci.sh +100 -0
  36. bin/run-tree-metrics.sh +88 -0
  37. bin/smetric-run.py +62 -0
  38. bin/split-loci.py +317 -0
  39. bin/swarm-2.1.13-linux-x86_64 +0 -0
  40. bin/test-germline-inference.py +425 -0
  41. bin/tree-perf-run.py +194 -0
  42. bin/vsearch-2.4.3-linux-x86_64 +0 -0
  43. bin/vsearch-2.4.3-macos-x86_64 +0 -0
  44. bin/xvfb-run +194 -0
  45. partis_bcr-1.0.1.data/scripts/cf-alleles.py +97 -0
  46. partis_bcr-1.0.1.data/scripts/cf-germlines.py +57 -0
  47. partis_bcr-1.0.1.data/scripts/extract-pairing-info.py +55 -0
  48. partis_bcr-1.0.1.data/scripts/gctree-run.py +244 -0
  49. partis_bcr-1.0.1.data/scripts/parse-output.py +277 -0
  50. partis_bcr-1.0.1.data/scripts/split-loci.py +317 -0
  51. partis_bcr-1.0.1.data/scripts/test.py +1005 -0
  52. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/METADATA +1 -1
  53. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/RECORD +101 -50
  54. partis_bcr-1.0.1.dist-info/top_level.txt +1 -0
  55. {partis → python}/glutils.py +1 -1
  56. python/main.py +30 -0
  57. {partis → python}/plotting.py +10 -1
  58. {partis → python}/treeutils.py +18 -16
  59. {partis → python}/utils.py +14 -7
  60. partis/main.py +0 -59
  61. partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
  62. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/WHEEL +0 -0
  63. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/entry_points.txt +0 -0
  64. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/licenses/COPYING +0 -0
  65. {partis → python}/__init__.py +0 -0
  66. {partis → python}/alleleclusterer.py +0 -0
  67. {partis → python}/allelefinder.py +0 -0
  68. {partis → python}/alleleremover.py +0 -0
  69. {partis → python}/annotationclustering.py +0 -0
  70. {partis → python}/baseutils.py +0 -0
  71. {partis → python}/cache/__init__.py +0 -0
  72. {partis → python}/cache/cached_uncertainties.py +0 -0
  73. {partis → python}/clusterpath.py +0 -0
  74. {partis → python}/coar.py +0 -0
  75. {partis → python}/corrcounter.py +0 -0
  76. {partis → python}/datautils.py +0 -0
  77. {partis → python}/event.py +0 -0
  78. {partis → python}/fraction_uncertainty.py +0 -0
  79. {partis → python}/gex.py +0 -0
  80. {partis → python}/glomerator.py +0 -0
  81. {partis → python}/hist.py +0 -0
  82. {partis → python}/hmmwriter.py +0 -0
  83. {partis → python}/hutils.py +0 -0
  84. {partis → python}/indelutils.py +0 -0
  85. {partis → python}/lbplotting.py +0 -0
  86. {partis → python}/mds.py +0 -0
  87. {partis → python}/mutefreqer.py +0 -0
  88. {partis → python}/paircluster.py +0 -0
  89. {partis → python}/parametercounter.py +0 -0
  90. {partis → python}/paramutils.py +0 -0
  91. {partis → python}/partitiondriver.py +0 -0
  92. {partis → python}/partitionplotter.py +0 -0
  93. {partis → python}/performanceplotter.py +0 -0
  94. {partis → python}/plotconfig.py +0 -0
  95. {partis → python}/processargs.py +0 -0
  96. {partis → python}/prutils.py +0 -0
  97. {partis → python}/recombinator.py +0 -0
  98. {partis → python}/scanplot.py +0 -0
  99. {partis → python}/seqfileopener.py +0 -0
  100. {partis → python}/treegenerator.py +0 -0
  101. {partis → python}/viterbicluster.py +0 -0
  102. {partis → python}/vrc01.py +0 -0
  103. {partis → python}/waterer.py +0 -0
@@ -0,0 +1,425 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import numpy
5
+ import copy
6
+ import random
7
+ import argparse
8
+ import time
9
+ import sys
10
+ import os
11
+ import glob
12
+ import colored_traceback.always
13
+
14
+ from subprocess import check_call
15
+ sys.path.insert(1, '.') #'./python')
16
+ from pathlib import Path
17
+ partis_dir = str(Path(__file__).parent.parent)
18
+
19
+ import python.utils as utils
20
+ import python.glutils as glutils
21
+ import python.processargs as processargs
22
+
23
+ # ----------------------------------------------------------------------------------------
24
def cov_cmd():
    """Return the command prefix that runs a script under coverage3, appending to existing coverage data."""
    coverage_prefix = 'coverage3 run --append'
    return coverage_prefix
26
+
27
+ # ----------------------------------------------------------------------------------------
28
def get_outfname(args, method, annotation_performance_plots=False, return_parent_gl_dir=False):
    """Return the output location for <method> under args.outdir.

    With <annotation_performance_plots> set, this is the method's annotation performance plot dir.
    Otherwise it is either the germline set dir itself (if <return_parent_gl_dir>) or the v gene
    file that glutils.get_fname() gives for that dir and args.locus.
    """
    method_dir = args.outdir + '/' + method
    if annotation_performance_plots:  # product of running partis annotation with --plot-annotation-performance
        return method_dir + '/annotation-performance-plots'

    # default: output is igh/ighv.fasta
    if method in ('partis', 'full'):  # parameter directory, not regular file (although, could change it to the gls .fa in sw/)
        method_dir += '/sw/germline-sets'
    if return_parent_gl_dir:
        return method_dir
    return glutils.get_fname(method_dir, args.locus, 'v')
39
+
40
+ # ----------------------------------------------------------------------------------------
41
def simulate(args):
    """Build and run the partis 'simulate' command that writes args.simfname.

    Returns immediately if utils.output_exists() says the simulation file is already there.
    Sets args.allele_prevalence_fname as a side effect, writes the simulation germline set to
    <args.outdir>/germlines/simulation, and (when allele prevalence frequencies are specified)
    writes them to args.allele_prevalence_fname before running the command.

    Raises:
        Exception: if --allele-prevalence-freqs isn't normalized, or has a different length
            to the number of v genes in the simulation germline set.
    """
    if utils.output_exists(args, args.simfname):
        return
    cmd_str = args.partis_path + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --outfname ' + args.simfname + ' --n-leaves ' + str(args.n_leaves) + ' --rearrange-from-scratch --force-dont-generate-germline-set --shm-parameter-dir ' + partis_dir + '/data/recombinator/scratch-parameters'
    cmd_str += ' --allow-nonfunctional-scratch-seqs'
    if args.n_leaf_distribution is None:
        cmd_str += ' --constant-number-of-leaves'
    else:
        cmd_str += ' --n-leaf-distribution ' + args.n_leaf_distribution
    if args.mut_mult is not None:
        cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)
    if args.root_mrca_weibull_parameter is not None:
        cmd_str += ' --root-mrca-weibull-parameter ' + str(args.root_mrca_weibull_parameter)

    if args.n_procs is not None:
        cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.slurm:
        cmd_str += ' --batch-system slurm'

    # set before generating the germline set, since <args> is passed to glutils.generate_germline_set() below
    args.allele_prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'

    # figure what genes we're using
    if args.gls_gen:
        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus)
        glutils.remove_v_genes_with_bad_cysteines(sglfo)
        glutils.generate_germline_set(sglfo, args, new_allele_info=args.new_allele_info, dont_remove_template_genes=args.dont_remove_template_genes, debug=True)
        cmd_str += ' --allele-prevalence-fname ' + args.allele_prevalence_fname
    else:
        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus, only_genes=(args.sim_v_genes + args.dj_genes))
        added_snp_names = glutils.generate_new_alleles(sglfo, args.new_allele_info, debug=True, remove_template_genes=(not args.dont_remove_template_genes))  # NOTE template gene removal is the default for glutils.generate_germline_set

        if args.allele_prevalence_freqs is not None:
            if not utils.is_normed(args.allele_prevalence_freqs):
                raise Exception('--allele-prevalence-freqs %s not normalized' % args.allele_prevalence_freqs)
            if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']):  # already checked when parsing args, but, you know...
                raise Exception('--allele-prevalence-freqs %d not the same length as sglfo %d' % (len(args.allele_prevalence_freqs), len(sglfo['seqs']['v'])))
            # The frequencies are matched to genes by position, so the gene order has to be reproducible:
            # sort the de-duplicated sim genes rather than relying on set iteration order (which for
            # strings can differ from one interpreter run to the next).
            if len(added_snp_names) == 0:
                gene_list = sorted(sglfo['seqs']['v'])
            else:
                gene_list = sorted(set(args.sim_v_genes)) + added_snp_names
            prevalence_freqs = {'v': dict(zip(gene_list, args.allele_prevalence_freqs)), 'd': {}, 'j': {}}
            glutils.write_allele_prevalence_freqs(prevalence_freqs, args.allele_prevalence_fname)
            cmd_str += ' --allele-prevalence-fname ' + args.allele_prevalence_fname

    simu_gldir = args.outdir + '/germlines/simulation'
    glutils.write_glfo(simu_gldir, sglfo)
    cmd_str += ' --initial-germline-dir ' + simu_gldir
    # glutils.print_glfo(sglfo)

    # run simulation
    if args.seed is not None:
        cmd_str += ' --random-seed ' + str(args.seed)
    utils.simplerun(cmd_str, dryrun=args.dryrun)
90
+
91
+ # ----------------------------------------------------------------------------------------
92
def run_other_method(args, method):
    """Run one of the non-partis germline inference methods (tigger or igdiscover) on the simulation file.

    Converts args.simfname to fasta, assembles the command line for ./test/<method>-run.py, and runs it
    with utils.simplerun(). Returns immediately if the method's output already exists.
    """
    known_methods = ['tigger-default', 'tigger-tuned', 'igdiscover']
    if method not in known_methods:  # really just to make it easier to search for this fcn
        assert False
    assert args.n_max_queries is None
    if utils.output_exists(args, get_outfname(args, method)):
        return

    simfasta = utils.getprefix(args.simfname) + '.fa'
    utils.csv_to_fasta(args.simfname, outfname=simfasta, overwrite=False, remove_duplicates=True)

    # each piece after the first carries its own leading space, so they can be joined with nothing in between
    pieces = ['./test/%s-run.py' % method.split('-')[0]]
    if method == 'tigger-tuned':
        pieces.append(' --tuned-tigger-params')
    pieces.append(' --infname ' + simfasta)
    pieces.append(' --outfname ' + get_outfname(args, method))
    if args.species != 'human':
        pieces.append(' --species %s' % args.species)
    if args.overwrite:
        pieces.append(' --overwrite')
    if args.gls_gen:
        pieces.append(' --gls-gen')
        pieces.append(' --glfo-dir ' + partis_dir + '/' + args.default_germline_dir)  # the partis methods have this as the default internally, but we want/have to set it explicitly here
    else:
        pieces.append(' --glfo-dir ' + args.inf_glfo_dir)
    pieces.append(' --simulation-germline-dir ' + args.outdir + '/germlines/simulation')  # alleleclusterer is the only one that really uses this, but for now I want its dbg output to have the sim info
    if method != 'igdiscover':  # for now we're saving all the igdiscover output/intermediate files, so we write them to an output dir
        pieces.append(' --workdir ' + args.workdir + '/' + method)
    if args.n_procs is not None:
        pieces.append(' --n-procs ' + str(args.n_procs))
    if args.slurm:
        pieces.append(' --slurm')

    utils.simplerun(''.join(pieces), dryrun=args.dryrun)
123
+
124
+ # ----------------------------------------------------------------------------------------
125
def run_performance_plot(args, method):
    """Run partis smith-waterman annotation on the simulation using <method>'s inferred germline set, writing annotation performance plots.

    Returns immediately if the performance plot dir already exists (according to utils.output_exists()).
    """
    perf_outdir = get_outfname(args, method, annotation_performance_plots=True)
    if utils.output_exists(args, perf_outdir):
        return

    pieces = [
        args.partis_path + ' cache-parameters --infname ' + args.simfname + ' --plot-annotation-performance',
        ' --is-simu --simulation-germline-dir ' + args.outdir + '/germlines/simulation',
        ' --initial-germline-dir ' + get_outfname(args, method, return_parent_gl_dir=True),  # i.e. use the inferred glfo from <method>
        ' --parameter-dir ' + perf_outdir + '/dummy-parameter-dir',
        ' --plotdir ' + perf_outdir,
        ' --only-smith-waterman --leave-default-germline --dont-write-parameters',  # i.e. we really want to annotate, not cache parameters, but then it'd look for a parameter dir
    ]
    if args.n_procs is not None:
        pieces.append(' --n-procs ' + str(args.n_procs))
    if args.n_max_queries is not None:
        pieces.append(' --n-max-queries ' + str(args.n_max_queries))  # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
    if args.slurm:
        pieces.append(' --batch-system slurm')
    if args.seed is not None:
        pieces.append(' --random-seed ' + str(args.seed))
    utils.simplerun(''.join(pieces), dryrun=args.dryrun)
145
+
146
+ # ----------------------------------------------------------------------------------------
147
def run_partis_parameter_cache(args, method):
    """Run partis cache-parameters (smith-waterman only) on the simulation file for <method> ('partis' or 'full').

    Removes any old sw cache files (and their glfo dirs) from the parameter dir first, then assembles
    and runs the command. Returns immediately if the output already exists.
    """
    if utils.output_exists(args, get_outfname(args, method)):
        return

    paramdir = args.outdir + '/' + method
    plotdir = paramdir + '/plots'

    # remove any old sw cache files
    for cachefname in glob.glob(paramdir + '/sw-cache-*.csv'):
        check_call(['rm', '-v', cachefname])
        sw_cache_gldir = cachefname.replace('.csv', '-glfo')
        if os.path.exists(sw_cache_gldir):  # if stuff fails halfway through, you can get one but not the other
            glutils.remove_glfo_files(sw_cache_gldir, args.locus)
            # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    initial_gldir = args.default_germline_dir if args.gls_gen else args.inf_glfo_dir
    pieces = [
        args.partis_path + ' cache-parameters --infname ' + args.simfname + ' --only-smith-waterman',
        ' --initial-germline-dir %s' % initial_gldir,
    ]
    if method == 'partis':
        pieces.append(' --debug-allele-finding')  # --always-find-new-alleles'
        pieces.append(' --is-simu --simulation-germline-dir ' + args.outdir + '/germlines/simulation')  # alleleclusterer is the only one that really uses this, but for now I want its dbg output to have the sim info
        if args.allele_cluster:
            pieces.append(' --allele-cluster')
            # NOTE(review): nesting of the kmeans option under --allele-cluster is reconstructed from a listing with no indentation -- confirm
            if args.kmeans_allele_cluster:
                pieces.append(' --kmeans-allele-cluster')
    elif method == 'full':
        pieces.append(' --leave-default-germline')
    else:
        assert False

    if args.species != 'human':
        pieces.append(' --species %s' % args.species)

    if args.n_procs is not None:
        pieces.append(' --n-procs ' + str(args.n_procs))
    if args.n_max_queries is not None:
        pieces.append(' --n-max-queries ' + str(args.n_max_queries))  # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
    if args.slurm:
        pieces.append(' --batch-system slurm')

    pieces.append(' --parameter-dir ' + paramdir)
    pieces.append(' --plotdir ' + plotdir)
    if args.seed is not None:
        pieces.append(' --random-seed ' + str(args.seed))
    if args.plot_and_fit_absolutely_everything is not None:
        pieces.append(' --plot-and-fit-absolutely-everything ' + str(args.plot_and_fit_absolutely_everything))
    utils.simplerun(''.join(pieces), dryrun=args.dryrun)
196
+
197
+ # ----------------------------------------------------------------------------------------
198
def write_inf_glfo(args):
    """Read the default glfo restricted to the --inf-v-genes and --dj-genes alleles, and write it to args.inf_glfo_dir where all the methods can read it."""
    # NOTE this dir should *not* be modified by any of the methods
    wanted_genes = args.inf_v_genes + args.dj_genes
    inf_glfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus, only_genes=wanted_genes)
    v_genes = inf_glfo['seqs']['v']
    n_v_genes = len(v_genes)
    colored_names = ' '.join([utils.color_gene(g) for g in v_genes])
    print(' writing initial inference glfo with %d v: %s' % (n_v_genes, colored_names))
    glutils.write_glfo(args.inf_glfo_dir, inf_glfo)
203
+
204
+ # ----------------------------------------------------------------------------------------
205
def run_tests(args):
    """Run a single test: simulation first (if 'simu' is in args.methods), then each remaining method in turn.

    NOTE 'simu' is removed from args.methods in place once the simulation has been run.
    """
    print('seed %d' % args.seed)
    # all fcns return immediately if output already exists

    if 'simu' in args.methods:
        simulate(args)
        args.methods.remove('simu')

    if not args.gls_gen:
        write_inf_glfo(args)

    for method in args.methods:
        # pick the function to run for this method, then call it
        if args.plot_annotation_performance:
            runner = run_performance_plot
        elif method in ('partis', 'full'):
            runner = run_partis_parameter_cache
        else:
            runner = run_other_method
        runner(args, method)
222
+
223
+ # ----------------------------------------------------------------------------------------
224
def multiple_tests(args):
    """Run tests with indices from args.iteststart up to (not including) args.n_tests, each as a separate command.

    Each command is this script's own sys.argv with --n-tests and --iteststart removed, --outdir
    replaced by a per-test subdir of args.outdir, and --seed replaced by args.seed plus the test index.
    With args.dryrun the commands are only passed to utils.simplerun() with dryrun=True; otherwise any
    existing log file is first moved out of the way, and then everything is handed to utils.run_cmds().
    """
    test_indices = range(args.iteststart, args.n_tests)

    def getlogdir(iproc):
        # log dir for test <iproc>, with one subdir per combination of methods
        logdir = args.outdir + '/' + str(iproc) + '/logs'
        if args.plot_annotation_performance:
            logdir += '/annotation-performance-plots'
        return logdir + '/' + '-'.join(args.methods)

    def cmd_str(iproc):
        # command line for test <iproc>, derived from the one this process was run with
        clist = copy.deepcopy(sys.argv)
        utils.remove_from_arglist(clist, '--n-tests', has_arg=True)
        utils.remove_from_arglist(clist, '--iteststart', has_arg=True)
        utils.replace_in_arglist(clist, '--outdir', args.outdir + '/' + str(iproc))
        utils.replace_in_arglist(clist, '--seed', str(args.seed + iproc))
        # clist.append('--slurm')
        return ' '.join(clist)

    cmdfos = [{'cmd_str': cmd_str(iproc),
               'workdir': args.workdir + '/' + str(iproc),
               'logdir': getlogdir(iproc),
               'outfname': args.outdir + '/' + str(iproc)}
              for iproc in test_indices]

    if args.dryrun:
        for cmdfo in cmdfos:
            utils.simplerun(cmdfo['cmd_str'], dryrun=True)
        return

    # don't overwrite old log files... need to eventually fix this so it isn't necessary
    for iproc in test_indices:
        logd = getlogdir(iproc)
        if os.path.exists(logd + '/log'):
            # move the existing log to the first unused numbered name
            ilog = 0
            while os.path.exists(logd + '/log.' + str(ilog)):
                ilog += 1
            check_call(['mv', '-v', logd + '/log', logd + '/log.' + str(ilog)])
    print(' look for logs in %s' % args.outdir)
    utils.run_cmds(cmdfos, debug='write')
264
+
265
+ # ----------------------------------------------------------------------------------------
266
+
267
+ # # ----------------------------------------------------------------------------------------
268
+ # from hist import Hist
269
+ # import plotting
270
+ # fig, ax = plotting.mpl_init()
271
+
272
+ # ntrees = 1000
273
+ # distrs = [
274
+ # # (1.5, 'geo'),
275
+ # # (3, 'geo'),
276
+ # (10, 'geo'),
277
+ # # (25, 'geo'),
278
+ # # (2.3, 'zipf'),
279
+ # # (1.8, 'zipf'),
280
+ # # (1.3, 'zipf'),
281
+ # ]
282
+
283
+ # # ----------------------------------------------------------------------------------------
284
+ # def getsubsample(vals):
285
+ # print vals
286
+ # iclust = 0
287
+ # seqs = []
288
+ # for v in vals:
289
+ # seqs += [iclust for _ in range(v)]
290
+ # iclust += 1
291
+ # print seqs
292
+ # subseqs = numpy.random.choice(seqs, size=ntrees, replace=False)
293
+ # # subseqs = seqs[:ntrees]
294
+ # print subseqs
295
+ # import itertools
296
+ # subvals = []
297
+ # for _, group in itertools.groupby(sorted(subseqs)):
298
+ # for what in set(group):
299
+ # subg = [s for s in subseqs if s == what]
300
+ # print what, len(subg)
301
+ # subvals.append(len(subg))
302
+ # print subvals
303
+ # return subvals
304
+
305
+ # ih = 0
306
+ # for n_leaves, fcn in distrs:
307
+ # if fcn == 'zipf':
308
+ # vals = numpy.random.zipf(n_leaves, size=ntrees) # NOTE <n_leaves> is not the mean here
309
+ # elif fcn == 'geo':
310
+ # vals = numpy.random.geometric(1. / n_leaves, size=ntrees)
311
+ # else:
312
+ # assert False
313
+ # nbins = 100
314
+ # htmp = Hist(nbins, -0.5, nbins - 0.5)
315
+ # for v in vals:
316
+ # htmp.fill(v)
317
+ # htmp.mpl_plot(ax, color=plotting.default_colors[ih], errors=False, label='%s %.1f' % (fcn, numpy.mean(vals)))
318
+ # # ----------------------------------------------------------------------------------------
319
+ # hsub = Hist(nbins, -0.5, nbins - 0.5)
320
+ # subvals = getsubsample(vals)
321
+ # for v in subvals:
322
+ # hsub.fill(v)
323
+ # hsub.mpl_plot(ax, color=plotting.default_colors[ih], errors=False, label='%s %.1f' % (fcn, numpy.mean(subvals)), linestyle='--')
324
+ # # ----------------------------------------------------------------------------------------
325
+ # ih += 1
326
+
327
+ # plotting.mpl_finish(ax, utils.fsdir() + '/partis/tmp/tmp', 'baz', xbounds=(0.9, nbins), log='y')
328
+ # sys.exit()
329
+ # # ----------------------------------------------------------------------------------------
330
+
331
# usage examples, shown as the epilog of the --help output
example_lines = [
    'example usage:',
    'one new allele separated by 3 snps from existing allele:',
    ' ./bin/test-germline-inference.py --n-sim-events 2000 --n-procs 10 --sim-v-genes=IGHV1-18*01 --inf-v-genes=IGHV1-18*01 --snp-positions 27,55,88',
    "one new allele [i.e. that the inference doesn't know about, but that in this case is in IMGT] separated by 1 snp from existing allele:",
    ' ./bin/test-germline-inference.py --n-sim-events 2000 --n-procs 10 --sim-v-genes=IGHV4-39*01:IGHV4-39*02 --inf-v-genes=IGHV4-39*01',
    'generate a full germline set for simulation, and then try to infer it:',
    ' ./bin/test-germline-inference.py --n-sim-events 2000 --n-procs 10 --gls-gen',
]
example_str = '\n '.join(example_lines)


class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter combining raw-text help strings with display of argument defaults."""


formatter_class = MultiplyInheritedFormatter
parser = argparse.ArgumentParser(formatter_class=MultiplyInheritedFormatter, epilog=example_str)

# simulation size and tree shape
parser.add_argument('--n-sim-events', type=int, default=20, help='number of simulated rearrangement events')
parser.add_argument('--n-max-queries', type=int, help='number of queries to use for inference from the simulation sample')
parser.add_argument('--n-leaves', type=float, default=1., help='see bin/partis --help')
parser.add_argument('--n-leaf-distribution', help='see bin/partis --help')
parser.add_argument('--root-mrca-weibull-parameter', type=float, help='see bin/partis --help')
parser.add_argument('--n-procs', type=int)
parser.add_argument('--seed', type=int, default=int(time.time()), help='random seed')

# germline set specification
parser.add_argument(
    '--gls-gen', action='store_true',
    help='generate a random germline set from scratch (parameters specified above), and infer a germline set from scratch, instead of using --sim-v-genes, --dj-genes, --inf-v-genes.')
parser.add_argument('--sim-v-genes', default='IGHV4-39*01:IGHV4-39*08', help='V genes to use for simulation')
parser.add_argument('--inf-v-genes', default='IGHV4-39*01', help='V genes to use for inference')
parser.add_argument('--dj-genes', default='IGHD6-19*01:IGHJ4*02', help='D and J genes to use for both simulation and inference')
parser.add_argument(
    '--snp-positions',
    help="colon-separated list (length must equal length of <--sim-v-genes>) of comma-separated snp positions for each gene, e.g. for two genes you might have '3,71:45'")
parser.add_argument(
    '--nsnp-list',
    help="colon-separated list (length must equal length of <--sim-v-genes> unless --gls-gen) of the number of snps to generate for each gene (each snp at a random position). If --gls-gen, then this still gives the number of snpd genes, but it isn't assumed to be the same length as anything [i.e. we don't yet know how many v genes there'll be]")
parser.add_argument('--indel-positions', help='see --snp-positions (a.t.m. the indel length distributions are hardcoded)')
parser.add_argument('--nindel-list', help='see --nsnp-list')
parser.add_argument('--n-genes-per-region', default='::', help='see bin/partis --help')
parser.add_argument('--n-sim-alleles-per-gene', default='::', help='see bin/partis --help')
parser.add_argument('--min-sim-allele-prevalence-freq', default=glutils.default_min_allele_prevalence_freq, type=float, help='see bin/partis --help')
parser.add_argument(
    '--allele-prevalence-freqs',
    help='colon-separated list of allele prevalence frequencies, including newly-generated snpd genes (ordered alphabetically)')
parser.add_argument(
    '--dont-remove-template-genes', action='store_true',
    help="when generating snps, *don't* remove the original gene before simulation")  # NOTE template gene removal is the default for glutils.generate_germline_set
parser.add_argument('--mut-mult', type=float, help='DO NOT USE use --mutation-multiplier (see below)')
parser.add_argument('--mutation-multiplier', type=float, help='see bin/partis --help')  # see note below

# run control
parser.add_argument('--slurm', action='store_true')
parser.add_argument('--overwrite', action='store_true')
parser.add_argument('--dryrun', action='store_true')
parser.add_argument('--allele-cluster', action='store_true', help='see bin/partis --help')
parser.add_argument('--kmeans-allele-cluster', action='store_true', help='see bin/partis --help')
parser.add_argument('--plot-annotation-performance', action='store_true', help='see bin/partis --help')
parser.add_argument(
    '--methods', default='simu:partis',
    help='colon-separated list of methods to run. By default runs simulation, and then partis inference (igdiscover and tigger, if installed, are the other options)')

# input/output locations
parser.add_argument('--outdir', default=utils.fsdir() + '/partis/allele-finder')
parser.add_argument('--inf-glfo-dir', help='default set below')
parser.add_argument('--simfname', help='default set below')
parser.add_argument('--workdir', default=utils.fsdir() + '/_tmp/hmms/' + str(random.randint(0, 999999)))
parser.add_argument('--n-tests', type=int, help='instead of just running once, run <N> independent tests simultaneously')
parser.add_argument('--iteststart', type=int, default=0, help='for use with --n-tests, if you want to add more tests on')
parser.add_argument(
    '--plot-and-fit-absolutely-everything', type=int,
    help='fit every single position for this <istart> and write every single corresponding plot (slow as hell, and only for debugging/making plots for paper)')
parser.add_argument('--partis-path', default='./bin/partis')
parser.add_argument('--prepend-coverage-command', action='store_true', help='see bin/partis --help')
parser.add_argument('--species', default='human', choices=('human', 'macaque'))
parser.add_argument('--locus', default='igh')
parser.add_argument('--allele-prevalence-fname', help='for internal use only (set above)')
383
+
384
args = parser.parse_args()
assert args.locus == 'igh'  # would just need to update some things, e.g. propagate through to the various methods

# validate --methods against the methods this script knows how to run
args.methods = utils.get_arg_list(args.methods)
available_methods = set(['simu', 'partis', 'full', 'tigger-default', 'tigger-tuned', 'igdiscover'])
if len(set(args.methods) - available_methods) > 0:
    raise Exception('unexpected --methods: %s' % ' '.join(set(args.methods) - available_methods))

# args.default_germline_dir = 'old-glfo/%s' % args.species  # 'data/germlines/%s' % args.species # NOTE gad damnit, I just deleted old-glfo, had no idea what it was for
print(' %s hopefully old-glfo/ isn\'t needed to recreate old results (see comment)' % utils.color('yellow', 'note:'))
args.default_germline_dir = 'data/germlines/%s' % args.species  # 'data/germlines/%s' % args.species

# for compatibility with bin/partis (i.e. so they can both use the fcn in processargs, but I don't have to rewrite either)
args.generate_germline_set = args.gls_gen
args.mut_mult = args.mutation_multiplier

if args.generate_germline_set:  # if we're generating/inferring a whole germline set these are either set automatically or not used
    for unused_attr in ('sim_v_genes', 'inf_v_genes', 'dj_genes'):
        delattr(args, unused_attr)
    args.allele_prevalence_freqs = None
    args.inf_glfo_dir = None
else:  # otherwise turn the colon-separated strings into lists
    args.dj_genes = utils.get_arg_list(args.dj_genes)
    args.sim_v_genes = utils.get_arg_list(args.sim_v_genes)
    args.inf_v_genes = utils.get_arg_list(args.inf_v_genes)
    args.allele_prevalence_freqs = utils.get_arg_list(args.allele_prevalence_freqs, floatify=True)

processargs.process_gls_gen_args(args)  # well, also does stuff with non-gls-gen new allele args

# defaults that depend on --outdir
if args.inf_glfo_dir is None:
    args.inf_glfo_dir = args.outdir + '/germlines/inference'
if args.simfname is None:
    args.simfname = args.outdir + '/simu.yaml'

if args.prepend_coverage_command:
    args.partis_path = '%s %s' % (cov_cmd(), args.partis_path)

if args.seed is not None:
    random.seed(args.seed)
    numpy.random.seed(args.seed)

if args.n_tests is None:
    run_tests(args)
else:
    multiple_tests(args)
bin/tree-perf-run.py ADDED
@@ -0,0 +1,194 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import sys
5
+ import csv
6
+ from io import open
7
+ csv.field_size_limit(sys.maxsize) # make sure we can write very large csv fields
8
+ import os
9
+ import argparse
10
+ import colored_traceback.always
11
+ import json
12
+ import dendropy
13
+
14
+ # if you move this script, you'll need to change this method of getting the imports
15
+ from pathlib import Path
16
+ partis_dir = str(Path(__file__).parent.parent)
17
+ sys.path.insert(1, partis_dir) # + '/python')
18
+
19
+ import python.utils as utils
20
+ import python.glutils as glutils
21
+ import python.treeutils as treeutils
22
+ import python.lbplotting as lbplotting
23
+ import python.coar as coar
24
+
25
+ # ----------------------------------------------------------------------------------------
26
helpstr = """
"""


class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter that inherits from both the raw-text and the show-defaults argparse formatters."""


formatter_class = MultiplyInheritedFormatter
parser = argparse.ArgumentParser(description=helpstr, formatter_class=MultiplyInheritedFormatter)

# (option string, keyword args for add_argument()), in the order in which they get added to the parser
arg_specs = [
    ('--true-tree-file', {'required' : True, 'help' : 'partis yaml file with true annotations from which to extract true trees'}),
    ('--inferred-tree-file', {'required' : True, 'help' : 'partis yaml file with inferred annotations and inferred trees'}),
    ('--outdir', {}),
    ('--metrics', {'default' : 'coar:rf:mrca'}),
    ('--n-procs', {'type' : int, 'help' : 'NOTE not used, just putting here for consistency with other scripts'}),
    ('--overwrite', {'action' : 'store_true', 'help' : 'NOTE just for compatibility, not used atm'}),
    ('--itree', {'type' : int, 'help' : 'only run on tree/annotation with this index'}),
    ('--debug', {'type' : int, 'default' : 0}),
]
for opt_str, opt_kwargs in arg_specs:
    parser.add_argument(opt_str, **opt_kwargs)
args = parser.parse_args()
args.metrics = utils.get_arg_list(args.metrics, choices=['coar', 'rf', 'mrca'])  # convert to a list, restricted to the metrics that are handled below

# only the annotation lists (middle return value) are needed from each output file
_, tru_atn_list, _ = utils.read_output(args.true_tree_file)
_, inf_atn_list, _ = utils.read_output(args.inferred_tree_file)

naive_name = 'naive'  # key under which the naive sequence is stored in the per-uid seq dicts
47
+
48
+ # ----------------------------------------------------------------------------------------
49
def add_seqs_to_nodes(ttr, seqdict, tfn):
    """Set the attribute node.seq on every node of <ttr> (modifies the tree in place).

    ttr: tree object providing preorder_node_iter() and seed_node, whose non-root nodes have taxon.label set
    seqdict: dict from node label to sequence; the root (seed) node's sequence is looked up under naive_name
    tfn: name of the file that the tree came from, only used to make error messages more useful
    Raises ValueError if a non-root node has no taxon, and KeyError if a node's label isn't in <seqdict>.
    """
    for node in ttr.preorder_node_iter():
        if node is ttr.seed_node:
            label = naive_name  # the root always gets the naive sequence, whatever its own label is
        elif node.taxon is None:
            raise ValueError('non-root node without a taxon in tree from %s, so can\'t look up its sequence' % tfn)
        else:
            label = node.taxon.label
        try:
            node.seq = seqdict[label]
        except KeyError:
            raise KeyError('no sequence for node \'%s\' in tree from %s (have sequences for %d labels)' % (label, tfn, len(seqdict)))
52
+
53
+ # ----------------------------------------------------------------------------------------
54
+ def fix_seqs(atn_t, atn_i, tr_t, tr_i, seq_key='input_seqs', debug=False): # Build {uid : seq} dicts (outer Ns stripped, plus the naive seq under naive_name) for the true and inferred annotations, re-derive inferred seqs from their per-chain pieces where they differ from the true ones, and return (seqs_t, seqs_i). Needed because the inferred annotation has padded seqs, which means when the h and l seqs get smashed together (in paircluster.sumv() called from paircluster.make_fake_hl_pair_antns()) sometimes there's extra Ns that aren't in the true annotation. NOTE(review): tr_i is never referenced in this function
55
+ # ----------------------------------------------------------------------------------------
56
+ def combine_chain_seqs(uid, seq_i): # basic idea is that we need to remove any N padding from waterer.py, then repad but just for translation; returns the repadded h and l pieces joined together, with outer Ns stripped
57
+ new_seq_i = [] # processed per-chain seqs, in the order h then l
58
+ for tch in 'hl':
59
+ cseq_i = utils.per_seq_val(atn_i, '%s_seqs'%tch, uid) # this uid's value for the key 'h_seqs' or 'l_seqs' in the inferred annotation
60
+ if cseq_i is None: # inferred ancestral seqs won't have h/l seqs set (maybe also naive?)
61
+ if tch == 'h':
62
+ cseq_i = seq_i[ : cs_lens[tch]] # no per-chain seq available, so take the first cs_lens['h'] characters of the combined seq (cs_lens must already have been filled by an earlier call)
63
+ else:
64
+ cseq_i = seq_i[cs_lens['h'] : ] # everything after the first cs_lens['h'] characters
65
+ cseq_i = cseq_i.strip('N')
66
+ else:
67
+ cseq_i = cseq_i.strip('N')
68
+ # n_lstrip, n_rstrip = len(cseq_i) - len(cseq_i.lstrip('N')), len(cseq_i) - len(cseq_i.rstrip('N')) # if this starts causing problems again, it might be worth doing something like this to keep track of n bases removed from each side, and making sure it's the same for all seqs
69
+ if tch not in cs_lens:
70
+ cs_lens[tch] = len(cseq_i) # keep track of the h/l seq lengths, so for inferred nodes where we don't know it, we can remove the same bases
71
+ assert cs_lens[tch] == len(cseq_i) # they should all be the same
72
+ cseq_i = utils.pad_seq_for_translation(atn_i, cseq_i)
73
+ new_seq_i.append(cseq_i)
74
+ return ''.join(new_seq_i).strip('N')
75
+ # ----------------------------------------------------------------------------------------
76
+ def check_seqs(uid, seq_i, seq_t, fix_counts, force=False, dont_fix=False): # check/fix that any nodes that are in both trees have the same sequence
77
+ fix_counts['total'] += 1 # counts every call, whether or not anything ends up being fixed
78
+ if seq_t == seq_i:
79
+ return False # return whether we fixed it or not
80
+ # utils.color_mutants(seq_t, seq_i, align_if_necessary=True, print_result=True)
81
+ seq_i = combine_chain_seqs(uid, seq_i) # replace the inferred seq with the version rebuilt from its per-chain pieces
82
+ if seq_t is None:
83
+ assert force # for seqs that are in inferred but not true, we already know we need to fix them (and how)
84
+ else:
85
+ if seq_t != seq_i and not dont_fix:
86
+ print('%s tried to fix %s but seqs still different:' % (utils.wrnstr(), uid))
87
+ utils.color_mutants(seq_t, seq_i, print_result=True, align_if_necessary=True, ref_label='true ', seq_label='inf ')
88
+ assert False # NOTE if you stop crashing here, you probably need to increment something in fix_counts
89
+ seqs_t[uid] = seq_t
90
+ seqs_i[uid] = seq_i # seqs_t and seqs_i are the dicts defined in the enclosing fcn (which also returns them)
91
+ fix_counts['fixed'].append(uid) # keep track of which uids were modified
92
+ return True
93
+ # ----------------------------------------------------------------------------------------
94
+ def check_all_lengths(seqs_t, seqs_i): # check/fix that all seqs in both trees have the same length
95
+ lens_t, lens_i = [list(set(len(s) for s in slist.values())) for slist in [seqs_t, seqs_i]] # distinct seq lengths in each dict
96
+ true_len = utils.get_single_entry(lens_t) # presumably fails unless all true seqs have the same length -- TODO confirm against utils.get_single_entry()
97
+ if len(lens_i) == 1 and lens_i[0] == true_len:
98
+ return # all inferred seqs already have the true length, so nothing to do
99
+ tseq = list(seqs_t.values())[0] # arbitrary true seq, used as the reference in the printed comparisons below
100
+ for uid in [u for u, s in seqs_i.items() if len(s) != true_len]:
101
+ utils.color_mutants(tseq, seqs_i[uid], align_if_necessary=True, print_result=True, extra_str=' ', ref_label='arb. true ', seq_label=uid+' ')
102
+ _, new_seq = utils.align_seqs(tseq, seqs_i[uid]) # i added this to fix a case that i ended up fixing a different (much better) way, but it might be useful in future, so leaving here
103
+ seqs_i[uid] = new_seq.replace('-', utils.ambig_base) # UGH
104
+ utils.color_mutants(tseq, seqs_i[uid], align_if_necessary=True, print_result=True, extra_str=' ', ref_label='arb. true ', seq_label=uid+' ')
105
+ raise Exception('different sequence lengths (probably from inferred internal nodes), see previous lines') # NOTE(review): looks like this is reached whenever the early return above isn't taken, i.e. the alignment above only affects what gets printed -- confirm
106
+ # ----------------------------------------------------------------------------------------
107
+ leaf_ids_t = [l.taxon.label for l in tr_t.leaf_node_iter() if l.taxon.label in atn_t['unique_ids']] # labels of true tree leaves that are also in the true annotation
108
+ leaf_ids_i = [u for u in leaf_ids_t if u in atn_i['unique_ids']] # inferred tree may swap internal/leaf nodes; NOTE(review): this is by construction a subset of leaf_ids_t, so only_inf below is always empty -- confirm that's intended
109
+ if set(leaf_ids_i) != set(leaf_ids_t):
110
+ only_true, only_inf = set(leaf_ids_t) - set(leaf_ids_i), set(leaf_ids_i) - set(leaf_ids_t)
111
+ print(' %s inferred leaf ids not the same as true leaf ids when trying to fix seqs (this is probably ok, since the coar calculation will probably skip them).\n %d extra true: %s\n %d extra inf: %s' % (utils.wrnstr(), len(only_true), ' '.join(only_true), len(only_inf), ' '.join(only_inf)))
112
+ common_leaf_ids = set(leaf_ids_t) & set(leaf_ids_i) # maybe missing ones would be ok? but don't want to mess with it, and for now we assume below that they're the same
113
+ seqs_t, seqs_i = [{u : utils.per_seq_val(atn, seq_key, u).strip('N') for u in atn['unique_ids']} for atn in (atn_t, atn_i)] # uid : <seq_key> value with outer Ns stripped, for every uid in each annotation
114
+ seqs_t[naive_name], seqs_i[naive_name] = [a['naive_seq'].strip('N') for a in (atn_t, atn_i)] # also add each annotation's naive seq, under the key naive_name
115
+ fixed, cs_lens, fix_counts = None, {}, {'fixed' : [], 'total' : 0} # cs_lens is shared with combine_chain_seqs() via closure, fix_counts is passed to check_seqs()
116
+ for uid in common_leaf_ids:
117
+ tfx = check_seqs(uid, seqs_i[uid], seqs_t[uid], fix_counts)
118
+ if fixed is None:
119
+ fixed = tfx
120
+ assert tfx == fixed # if we fix one, we should fix all of them
121
+ if fixed:
122
+ for uid in [u for u in atn_i['unique_ids'] if u not in leaf_ids_i] + [naive_name]: # need to also fix any internal/inferred nodes
123
+ check_seqs(uid, seqs_i[uid], seqs_t.get(uid), fix_counts, force=True, dont_fix=uid==naive_name) # seqs_t.get() gives None for uids that are only in the inferred annotation
124
+ print(' no nodes needed fixing (all seqs already the same for common true/inferred nodes)' if len(fix_counts['fixed'])==0 else ' fixed %d / %d nodes' % (len(fix_counts['fixed']), fix_counts['total']))
125
+ check_all_lengths(seqs_t, seqs_i) # raises if any inferred seq has a different length to the true seqs
126
+ if debug and len(fix_counts['fixed']) > 0:
127
+ print(' fixed seqs: %s' % ' '.join(sorted(fix_counts['fixed'])))
128
+
129
+ return seqs_t, seqs_i
130
+
131
+ # ----------------------------------------------------------------------------------------
132
def get_n_parsimony_trees(n_clusters):
    """Return a list with the number of parsimony trees for each cluster index in range(n_clusters).

    For cluster <i> the number is the last whitespace-separated token on the last line that contains
    'number of trees with integer branch lengths:' in the log file <dir>/<base>/iclust-<i>/log, where <dir> is
    the directory of args.inferred_tree_file and <base> is its basename with '-annotations.yaml' removed.
    Raises Exception if a log file has no such line (a missing log file or a non-integer value raise
    whatever open() or int() raise).
    """
    # other way to get this number:
    #   with open('gctree_base.inference.parsimony_forest.p', 'rb') as fh:
    #       forest = pickle.load(fh)
    #   n_parsimony_trees = forest._forest.count_histories()
    search_str = 'number of trees with integer branch lengths:'
    # use os.path.join() rather than '%s/%s' formatting, so an inferred tree file with no directory part doesn't turn into an absolute path
    workdir = os.path.join(os.path.dirname(args.inferred_tree_file), os.path.basename(args.inferred_tree_file).replace('-annotations.yaml', ''))
    n_ptree_list = []
    for iclust in range(n_clusters):
        logfn = os.path.join(workdir, 'iclust-%d' % iclust, 'log')
        with open(logfn) as logfile:  # read the log directly, rather than building a shell grep command from the path (which breaks on spaces/shell metacharacters)
            matched_lines = [line for line in logfile if search_str in line]
        if len(matched_lines) == 0:
            raise Exception('couldn\'t find line with \'%s\' in log file %s' % (search_str, logfn))
        n_ptree_list.append(int(matched_lines[-1].split()[-1]))
    return n_ptree_list
143
+
144
+ # ----------------------------------------------------------------------------------------
145
+ # don't need this now that i'm using --simultaneous-true-clonal-seqs (yes, ick)
146
def trnfn(u):
    """Return uid <u> with the suffix '_contig_igh+igk' appended."""
    contig_suffix = '_contig_igh+igk'
    return u + contig_suffix
147
utils.translate_uids(tru_atn_list, trfcn=trnfn, expect_missing=True)  # presumably renames the true uids with trnfn(), so they can be matched to the inferred uids below -- confirm against utils.translate_uids()

# ----------------------------------------------------------------------------------------
# for each true annotation: find an inferred annotation that shares uids with it, make a tree for each of them, attach seqs to the tree nodes, then calculate the requested metrics
jvals = {'coar' : [], 'rf' : [], 'mrca' : []}  # metric name : list of values, one for each tree that we run on
for itree, atn_t in enumerate(tru_atn_list):
    if args.itree is not None and itree != args.itree:
        continue
    true_ids = set(atn_t['unique_ids'])
    print(' %d: starting true annotation with size %d' % (itree, len(atn_t['unique_ids'])))

    # use the first inferred annotation that has any uids in common with the true one
    atn_i = None
    for tatn in inf_atn_list:
        common_ids = true_ids & set(tatn['unique_ids'])
        if len(common_ids) == 0:
            continue
        estr = '' if not args.debug else ' (missing %d: %s)' % (len(atn_t['unique_ids']) - len(common_ids), ' '.join(sorted(true_ids - common_ids)))
        print(' found inferred annotation with %d / %d uids in common%s' % (len(common_ids), len(atn_t['unique_ids']), estr))
        atn_i = tatn
        break
    if atn_i is None:
        # NOTE the uid translation above is active, so the hint is to check it (rather than to uncomment it, as an older version of this message said)
        raise Exception('couldn\'t find inferred annotation for true annotation (looked in %d inferred annotations, maybe check the uid translation above): %s' % (len(inf_atn_list), ' '.join(atn_t['unique_ids'])))

    dtree_t, dtree_i = [treeutils.get_dendro_tree(treestr=lbplotting.get_tree_in_line(l, is_true)) for is_true, l in [[True, atn_t], [False, atn_i]]]
    if args.debug:
        for tstr, ttr in zip(['true', 'inf'], [dtree_t, dtree_i]):
            print(' %4s:' % tstr)
            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ttr, width=250)))  # , label_fcn=lambda l: l.replace('_contig_igh+igk', '')

    # make the true and inferred seqs consistent, then attach them to the nodes of each tree
    seqs_t, seqs_i = fix_seqs(atn_t, atn_i, dtree_t, dtree_i, debug=args.debug)
    for ttr, seqdict, tfn in zip([dtree_t, dtree_i], [seqs_t, seqs_i], [args.true_tree_file, args.inferred_tree_file]):
        add_seqs_to_nodes(ttr, seqdict, tfn)

    if 'coar' in args.metrics:
        jvals['coar'].append(coar.COAR(dtree_t, dtree_i, known_root=False, debug=args.debug))
    if 'mrca' in args.metrics:
        jvals['mrca'].append(treeutils.mrca_dist(dtree_t, dtree_i, debug=args.debug))
    if 'rf' in args.metrics:
        dts_t, dts_i = treeutils.sync_taxon_namespaces(dtree_t, dtree_i, only_leaves=True)  # , debug=True)
        # this is weighted (i.e. depends on edge length), could also use unweighted (fcn symmetric_difference()) [both in dendropy/calculate/treecompare.py]
        jvals['rf'].append(dendropy.calculate.treecompare.weighted_robinson_foulds_distance(dts_t, dts_i))
        # print(treeutils.get_ete_rf(dtree_t, dtree_i)

# if os.path.basename(args.inferred_tree_file).split('-')[0] == 'gctree':
#     jvals['n-pars-trees'] = get_n_parsimony_trees(len(tru_atn_list))

if args.outdir is None:
    print(' %s no --outdir specified, so not writing anything' % utils.wrnstr())
    sys.exit(0)

ofn = '%s/tree-perf-vals.yaml' % args.outdir
print(' writing tree perf values to %s' % ofn)
os.makedirs(args.outdir, exist_ok=True)  # exist_ok avoids the race between checking for the dir and creating it
utils.jsdump(ofn, jvals)
Binary file
Binary file